summaryrefslogtreecommitdiff
path: root/usr/src/uts/common/fs/zfs/dnode.c
diff options
context:
space:
mode:
authorTom Caputi <tcaputi@datto.com>2019-06-25 19:39:35 +0000
committerJerry Jelinek <jerry.jelinek@joyent.com>2019-06-25 19:40:06 +0000
commiteb633035c80613ec93d62f90482837adaaf21a0a (patch)
tree67f2e3e15231d06a3525ce3958bbce24aa3de7e8 /usr/src/uts/common/fs/zfs/dnode.c
parent07eb1aef88b873c5c1036d9cf69820c1ef6a32fb (diff)
downloadillumos-gate-eb633035c80613ec93d62f90482837adaaf21a0a.tar.gz
8727 Native data and metadata encryption for zfs
Portions contributed by: Jorgen Lundman <lundman@lundman.net> Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com> Portions contributed by: Paul Zuchowski <pzuchowski@datto.com> Portions contributed by: Tim Chase <tim@chase2k.com> Portions contributed by: Matthew Ahrens <mahrens@delphix.com> Portions contributed by: ab-oe <arkadiusz.bubala@open-e.com> Portions contributed by: Brian Behlendorf <behlendorf1@llnl.gov> Portions contributed by: loli10K <ezomori.nozomu@gmail.com> Portions contributed by: Igor K <igor@dilos.org> Portions contributed by: Richard Laager <rlaager@wiktel.com> Reviewed by: Jason Cohen <jwittlincohen@gmail.com> Reviewed by: Allan Jude <allanjude@freebsd.org> Reviewed by: George Melikov <mail@gmelikov.ru> Reviewed by: Paul Dagnelie <pcd@delphix.com> Reviewed by: RageLtMan <rageltman@sempervictus> Reviewed by: Matthew Thode <prometheanfire@gentoo.org> Reviewed by: Giuseppe Di Natale <dinatale2@llnl.gov> Reviewed by: Kash Pande <kash@tripleback.net> Reviewed by: Alek Pinchuk <apinchuk@datto.com> Reviewed by: Dan Kimmel <dan.kimmel@delphix.com> Reviewed by: David Quigley <david.quigley@intel.com> Reviewed by: Jorgen Lundman <lundman@lundman.net> Reviewed by: Matthew Ahrens <mahrens@delphix.com> Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov> Reviewed by: Toomas Soome <tsoome@me.com> Reviewed by: C Fraire <cfraire@me.com> Reviewed by: Jason King <jason.king@joyent.com> Reviewed by: Andy Stormont <astormont@racktopsystems.com> Approved by: Garrett D'Amore <garrett@damore.org>
Diffstat (limited to 'usr/src/uts/common/fs/zfs/dnode.c')
-rw-r--r--usr/src/uts/common/fs/zfs/dnode.c139
1 files changed, 98 insertions, 41 deletions
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
index f360eb997e..5a86650d28 100644
--- a/usr/src/uts/common/fs/zfs/dnode.c
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -136,6 +136,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
+ bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid));
for (i = 0; i < TXG_SIZE; i++) {
multilist_link_init(&dn->dn_dirty_link[i]);
@@ -196,6 +197,7 @@ dnode_dest(void *arg, void *unused)
ASSERT0(dn->dn_rm_spillblk[i]);
ASSERT0(dn->dn_next_bonuslen[i]);
ASSERT0(dn->dn_next_blksz[i]);
+ ASSERT0(dn->dn_next_maxblkid[i]);
}
ASSERT0(dn->dn_allocated_txg);
@@ -617,6 +619,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ASSERT0(dn->dn_next_bonustype[i]);
ASSERT0(dn->dn_rm_spillblk[i]);
ASSERT0(dn->dn_next_blksz[i]);
+ ASSERT0(dn->dn_next_maxblkid[i]);
ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
@@ -659,7 +662,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots,
+ boolean_t keep_spill, dmu_tx_t *tx)
{
int nblkptr;
@@ -708,7 +712,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
if (dn->dn_nblkptr != nblkptr)
dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
- if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+ if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
dbuf_rm_spill(dn, tx);
dnode_rm_spill(dn, tx);
}
@@ -785,6 +789,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
sizeof (odn->dn_next_bonuslen));
bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
sizeof (odn->dn_next_blksz));
+ bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0],
+ sizeof (odn->dn_next_maxblkid));
for (i = 0; i < TXG_SIZE; i++) {
list_move_tail(&ndn->dn_dirty_records[i],
&odn->dn_dirty_records[i]);
@@ -1321,7 +1327,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
return (SET_ERROR(EIO));
}
- err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+ /*
+ * We do not need to decrypt to read the dnode so it doesn't matter
+ * if we get the encrypted or decrypted version.
+ */
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT);
if (err) {
DNODE_STAT_BUMP(dnode_hold_dbuf_read);
dbuf_rele(db, FTAG);
@@ -1749,11 +1759,74 @@ fail:
return (SET_ERROR(ENOTSUP));
}
+static void
+dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx)
+{
+ uint64_t txgoff = tx->tx_txg & TXG_MASK;
+ int old_nlevels = dn->dn_nlevels;
+ dmu_buf_impl_t *db;
+ list_t *list;
+ dbuf_dirty_record_t *new, *dr, *dr_next;
+
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ dn->dn_nlevels = new_nlevels;
+
+ ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
+ dn->dn_next_nlevels[txgoff] = new_nlevels;
+
+ /* dirty the left indirects */
+ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
+ new = dbuf_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+
+ /* transfer the dirty records to the new indirect */
+ mutex_enter(&dn->dn_mtx);
+ mutex_enter(&new->dt.di.dr_mtx);
+ list = &dn->dn_dirty_records[txgoff];
+ for (dr = list_head(list); dr; dr = dr_next) {
+ dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
+ if (dr->dr_dbuf->db_level != new_nlevels-1 &&
+ dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+ ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
+ list_remove(&dn->dn_dirty_records[txgoff], dr);
+ list_insert_tail(&new->dt.di.dr_children, dr);
+ dr->dr_parent = new;
+ }
+ }
+ mutex_exit(&new->dt.di.dr_mtx);
+ mutex_exit(&dn->dn_mtx);
+}
+
+int
+dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx)
+{
+ int ret = 0;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+
+ if (dn->dn_nlevels == nlevels) {
+ ret = 0;
+ goto out;
+ } else if (nlevels < dn->dn_nlevels) {
+ ret = SET_ERROR(EINVAL);
+ goto out;
+ }
+
+ dnode_set_nlevels_impl(dn, nlevels, tx);
+
+out:
+ rw_exit(&dn->dn_struct_rwlock);
+ return (ret);
+}
+
/* read-holding callers must not rely on the lock being continuously held */
void
-dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
+ boolean_t force)
{
- uint64_t txgoff = tx->tx_txg & TXG_MASK;
int epbs, new_nlevels;
uint64_t sz;
@@ -1777,13 +1850,25 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
}
}
- if (blkid <= dn->dn_maxblkid)
+ /*
+ * Raw sends (indicated by the force flag) require that we take the
+ * given blkid even if the value is lower than the current value.
+ */
+ if (!force && blkid <= dn->dn_maxblkid)
goto out;
+ /*
+ * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
+ * to indicate that this field is set. This allows us to set the
+ * maxblkid to 0 on an existing object in dnode_sync().
+ */
dn->dn_maxblkid = blkid;
+ dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
+ blkid | DMU_NEXT_MAXBLKID_SET;
/*
* Compute the number of levels necessary to support the new maxblkid.
+ * Raw sends will ensure nlevels is set correctly for us.
*/
new_nlevels = 1;
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
@@ -1791,40 +1876,11 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
new_nlevels++;
- if (new_nlevels > dn->dn_nlevels) {
- int old_nlevels = dn->dn_nlevels;
- dmu_buf_impl_t *db;
- list_t *list;
- dbuf_dirty_record_t *new, *dr, *dr_next;
-
- dn->dn_nlevels = new_nlevels;
-
- ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
- dn->dn_next_nlevels[txgoff] = new_nlevels;
-
- /* dirty the left indirects */
- db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
- ASSERT(db != NULL);
- new = dbuf_dirty(db, tx);
- dbuf_rele(db, FTAG);
-
- /* transfer the dirty records to the new indirect */
- mutex_enter(&dn->dn_mtx);
- mutex_enter(&new->dt.di.dr_mtx);
- list = &dn->dn_dirty_records[txgoff];
- for (dr = list_head(list); dr; dr = dr_next) {
- dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
- if (dr->dr_dbuf->db_level != new_nlevels-1 &&
- dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
- dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
- ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
- list_remove(&dn->dn_dirty_records[txgoff], dr);
- list_insert_tail(&new->dt.di.dr_children, dr);
- dr->dr_parent = new;
- }
- }
- mutex_exit(&new->dt.di.dr_mtx);
- mutex_exit(&dn->dn_mtx);
+ if (!force) {
+ if (new_nlevels > dn->dn_nlevels)
+ dnode_set_nlevels_impl(dn, new_nlevels, tx);
+ } else {
+ ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
}
out:
@@ -2249,7 +2305,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
*/
return (SET_ERROR(ESRCH));
}
- error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
+ error = dbuf_read(db, NULL,
+ DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT);
if (error) {
dbuf_rele(db, FTAG);
return (error);