diff options
author | Tom Caputi <tcaputi@datto.com> | 2019-06-25 19:39:35 +0000 |
---|---|---|
committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2019-06-25 19:40:06 +0000 |
commit | eb633035c80613ec93d62f90482837adaaf21a0a (patch) | |
tree | 67f2e3e15231d06a3525ce3958bbce24aa3de7e8 /usr/src/uts/common/fs/zfs/dnode.c | |
parent | 07eb1aef88b873c5c1036d9cf69820c1ef6a32fb (diff) | |
download | illumos-gate-eb633035c80613ec93d62f90482837adaaf21a0a.tar.gz |
8727 Native data and metadata encryption for zfs
Portions contributed by: Jorgen Lundman <lundman@lundman.net>
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Portions contributed by: Paul Zuchowski <pzuchowski@datto.com>
Portions contributed by: Tim Chase <tim@chase2k.com>
Portions contributed by: Matthew Ahrens <mahrens@delphix.com>
Portions contributed by: ab-oe <arkadiusz.bubala@open-e.com>
Portions contributed by: Brian Behlendorf <behlendorf1@llnl.gov>
Portions contributed by: loli10K <ezomori.nozomu@gmail.com>
Portions contributed by: Igor K <igor@dilos.org>
Portions contributed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Jason Cohen <jwittlincohen@gmail.com>
Reviewed by: Allan Jude <allanjude@freebsd.org>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: RageLtMan <rageltman@sempervictus>
Reviewed by: Matthew Thode <prometheanfire@gentoo.org>
Reviewed by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed by: Kash Pande <kash@tripleback.net>
Reviewed by: Alek Pinchuk <apinchuk@datto.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: David Quigley <david.quigley@intel.com>
Reviewed by: Jorgen Lundman <lundman@lundman.net>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: C Fraire <cfraire@me.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Andy Stormont <astormont@racktopsystems.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Diffstat (limited to 'usr/src/uts/common/fs/zfs/dnode.c')
-rw-r--r-- | usr/src/uts/common/fs/zfs/dnode.c | 139 |
1 files changed, 98 insertions, 41 deletions
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c index f360eb997e..5a86650d28 100644 --- a/usr/src/uts/common/fs/zfs/dnode.c +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -136,6 +136,7 @@ dnode_cons(void *arg, void *unused, int kmflag) bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); + bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid)); for (i = 0; i < TXG_SIZE; i++) { multilist_link_init(&dn->dn_dirty_link[i]); @@ -196,6 +197,7 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_blksz[i]); + ASSERT0(dn->dn_next_maxblkid[i]); } ASSERT0(dn->dn_allocated_txg); @@ -617,6 +619,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_blksz[i]); + ASSERT0(dn->dn_next_maxblkid[i]); ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); @@ -659,7 +662,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) + dmu_object_type_t bonustype, int bonuslen, int dn_slots, + boolean_t keep_spill, dmu_tx_t *tx) { int nblkptr; @@ -708,7 +712,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; if (dn->dn_nblkptr != nblkptr) dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; - if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) { dbuf_rm_spill(dn, tx); dnode_rm_spill(dn, tx); } @@ -785,6 +789,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) sizeof (odn->dn_next_bonuslen)); bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], sizeof (odn->dn_next_blksz)); + bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0], + sizeof (odn->dn_next_maxblkid)); for (i = 0; i < TXG_SIZE; i++) { list_move_tail(&ndn->dn_dirty_records[i], &odn->dn_dirty_records[i]); @@ -1321,7 +1327,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, DNODE_STAT_BUMP(dnode_hold_dbuf_hold); return (SET_ERROR(EIO)); } - err = dbuf_read(db, NULL, DB_RF_CANFAIL); + /* + * We do not need to decrypt to read the dnode so it doesn't matter + * if we get the encrypted or decrypted version. + */ + err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT); if (err) { DNODE_STAT_BUMP(dnode_hold_dbuf_read); dbuf_rele(db, FTAG); @@ -1749,11 +1759,74 @@ fail: return (SET_ERROR(ENOTSUP)); } +static void +dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) +{ + uint64_t txgoff = tx->tx_txg & TXG_MASK; + int old_nlevels = dn->dn_nlevels; + dmu_buf_impl_t *db; + list_t *list; + dbuf_dirty_record_t *new, *dr, *dr_next; + + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + dn->dn_nlevels = new_nlevels; + + ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); + dn->dn_next_nlevels[txgoff] = new_nlevels; + + /* dirty the left indirects */ + db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); + ASSERT(db != NULL); + new = dbuf_dirty(db, tx); + dbuf_rele(db, FTAG); + + /* transfer the dirty records to the new indirect */ + mutex_enter(&dn->dn_mtx); + mutex_enter(&new->dt.di.dr_mtx); + list = &dn->dn_dirty_records[txgoff]; + for (dr = list_head(list); dr; dr = dr_next) { + dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); + if (dr->dr_dbuf->db_level != new_nlevels-1 && + dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { + ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); + list_remove(&dn->dn_dirty_records[txgoff], dr); + list_insert_tail(&new->dt.di.dr_children, dr); + dr->dr_parent = new; + } + } + mutex_exit(&new->dt.di.dr_mtx); + mutex_exit(&dn->dn_mtx); +} + +int +dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx) +{ + int ret = 0; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + if (dn->dn_nlevels == nlevels) { + ret = 0; + goto out; + } else if (nlevels < dn->dn_nlevels) { + ret = SET_ERROR(EINVAL); + goto out; + } + + dnode_set_nlevels_impl(dn, nlevels, tx); + +out: + rw_exit(&dn->dn_struct_rwlock); + return (ret); +} + /* read-holding callers must not rely on the lock being continuously held */ void -dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) +dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read, + boolean_t force) { - uint64_t txgoff = tx->tx_txg & TXG_MASK; int epbs, new_nlevels; uint64_t sz; @@ -1777,13 +1850,25 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) } } - if (blkid <= dn->dn_maxblkid) + /* + * Raw sends (indicated by the force flag) require that we take the + * given blkid even if the value is lower than the current value. + */ + if (!force && blkid <= dn->dn_maxblkid) goto out; + /* + * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff] + * to indicate that this field is set. This allows us to set the + * maxblkid to 0 on an existing object in dnode_sync(). + */ dn->dn_maxblkid = blkid; + dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] = + blkid | DMU_NEXT_MAXBLKID_SET; /* * Compute the number of levels necessary to support the new maxblkid. + * Raw sends will ensure nlevels is set correctly for us. */ new_nlevels = 1; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; @@ -1791,40 +1876,11 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) new_nlevels++; - if (new_nlevels > dn->dn_nlevels) { - int old_nlevels = dn->dn_nlevels; - dmu_buf_impl_t *db; - list_t *list; - dbuf_dirty_record_t *new, *dr, *dr_next; - - dn->dn_nlevels = new_nlevels; - - ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); - dn->dn_next_nlevels[txgoff] = new_nlevels; - - /* dirty the left indirects */ - db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); - ASSERT(db != NULL); - new = dbuf_dirty(db, tx); - dbuf_rele(db, FTAG); - - /* transfer the dirty records to the new indirect */ - mutex_enter(&dn->dn_mtx); - mutex_enter(&new->dt.di.dr_mtx); - list = &dn->dn_dirty_records[txgoff]; - for (dr = list_head(list); dr; dr = dr_next) { - dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); - if (dr->dr_dbuf->db_level != new_nlevels-1 && - dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); - list_remove(&dn->dn_dirty_records[txgoff], dr); - list_insert_tail(&new->dt.di.dr_children, dr); - dr->dr_parent = new; - } - } - mutex_exit(&new->dt.di.dr_mtx); - mutex_exit(&dn->dn_mtx); + if (!force) { + if (new_nlevels > dn->dn_nlevels) + dnode_set_nlevels_impl(dn, new_nlevels, tx); + } else { + ASSERT3U(dn->dn_nlevels, >=, new_nlevels); } out: @@ -2249,7 +2305,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, */ return (SET_ERROR(ESRCH)); } - error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); + error = dbuf_read(db, NULL, + DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT); if (error) { dbuf_rele(db, FTAG); return (error); |