diff options
| author | Tom Caputi <tcaputi@datto.com> | 2019-06-25 19:39:35 +0000 |
|---|---|---|
| committer | Jerry Jelinek <jerry.jelinek@joyent.com> | 2019-06-25 19:40:06 +0000 |
| commit | eb633035c80613ec93d62f90482837adaaf21a0a (patch) | |
| tree | 67f2e3e15231d06a3525ce3958bbce24aa3de7e8 /usr/src/uts/common/fs | |
| parent | 07eb1aef88b873c5c1036d9cf69820c1ef6a32fb (diff) | |
| download | illumos-joyent-eb633035c80613ec93d62f90482837adaaf21a0a.tar.gz | |
8727 Native data and metadata encryption for zfs
Portions contributed by: Jorgen Lundman <lundman@lundman.net>
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Portions contributed by: Paul Zuchowski <pzuchowski@datto.com>
Portions contributed by: Tim Chase <tim@chase2k.com>
Portions contributed by: Matthew Ahrens <mahrens@delphix.com>
Portions contributed by: ab-oe <arkadiusz.bubala@open-e.com>
Portions contributed by: Brian Behlendorf <behlendorf1@llnl.gov>
Portions contributed by: loli10K <ezomori.nozomu@gmail.com>
Portions contributed by: Igor K <igor@dilos.org>
Portions contributed by: Richard Laager <rlaager@wiktel.com>
Reviewed by: Jason Cohen <jwittlincohen@gmail.com>
Reviewed by: Allan Jude <allanjude@freebsd.org>
Reviewed by: George Melikov <mail@gmelikov.ru>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Reviewed by: RageLtMan <rageltman@sempervictus>
Reviewed by: Matthew Thode <prometheanfire@gentoo.org>
Reviewed by: Giuseppe Di Natale <dinatale2@llnl.gov>
Reviewed by: Kash Pande <kash@tripleback.net>
Reviewed by: Alek Pinchuk <apinchuk@datto.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: David Quigley <david.quigley@intel.com>
Reviewed by: Jorgen Lundman <lundman@lundman.net>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: C Fraire <cfraire@me.com>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Andy Stormont <astormont@racktopsystems.com>
Approved by: Garrett D'Amore <garrett@damore.org>
Diffstat (limited to 'usr/src/uts/common/fs')
74 files changed, 11514 insertions, 1205 deletions
diff --git a/usr/src/uts/common/fs/zfs/abd.c b/usr/src/uts/common/fs/zfs/abd.c index 0ab3513718..5417514e41 100644 --- a/usr/src/uts/common/fs/zfs/abd.c +++ b/usr/src/uts/common/fs/zfs/abd.c @@ -427,8 +427,9 @@ abd_alloc_for_io(size_t size, boolean_t is_metadata) * buffer data with sabd. Use abd_put() to free. sabd must not be freed while * any derived ABDs exist. */ -abd_t * -abd_get_offset(abd_t *sabd, size_t off) +/* ARGSUSED */ +static inline abd_t * +abd_get_offset_impl(abd_t *sabd, size_t off, size_t size) { abd_t *abd; @@ -480,6 +481,25 @@ abd_get_offset(abd_t *sabd, size_t off) return (abd); } +abd_t * +abd_get_offset(abd_t *sabd, size_t off) +{ + size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0; + + VERIFY3U(size, >, 0); + + return (abd_get_offset_impl(sabd, off, size)); +} + +abd_t * +abd_get_offset_size(abd_t *sabd, size_t off, size_t size) +{ + ASSERT3U(off + size, <=, sabd->abd_size); + + return (abd_get_offset_impl(sabd, off, size)); +} + + /* * Allocate a linear ABD structure for buf. You must free this with abd_put() * since the resulting ABD doesn't own its own buffer. diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 3a07d72d93..90f5314d81 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -250,6 +250,21 @@ * ARC is disabled, then the L2ARC's block must be transformed to look * like the physical block in the main data pool before comparing the * checksum and determining its validity. + * + * The L1ARC has a slightly different system for storing encrypted data. + * Raw (encrypted + possibly compressed) data has a few subtle differences from + * data that is just compressed. The biggest difference is that it is not + * possible to decrypt encrypted data (or visa versa) if the keys aren't loaded. + * The other difference is that encryption cannot be treated as a suggestion. + * If a caller would prefer compressed data, but they actually wind up with + * uncompressed data the worst thing that could happen is there might be a + * performance hit. If the caller requests encrypted data, however, we must be + * sure they actually get it or else secret information could be leaked. Raw + * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore, + * may have both an encrypted version and a decrypted version of its data at + * once. When a caller needs a raw arc_buf_t, it is allocated and the data is + * copied out of this header. To avoid complications with b_pabd, raw buffers + * cannot be shared. */ #include <sys/spa.h> @@ -266,6 +281,8 @@ #include <sys/zio_checksum.h> #include <sys/multilist.h> #include <sys/abd.h> +#include <sys/zil.h> +#include <sys/fm/fs/zfs.h> #ifdef _KERNEL #include <sys/vmsystm.h> #include <vm/anon.h> @@ -481,7 +498,7 @@ typedef struct arc_stats { kstat_named_t arcstat_evict_skip; /* * Number of times arc_evict_state() was unable to evict enough - * buffers to reach it's target amount. + * buffers to reach its target amount. */ kstat_named_t arcstat_evict_not_enough; kstat_named_t arcstat_evict_l2_cached; @@ -883,7 +900,10 @@ struct arc_callback { void *acb_private; arc_read_done_func_t *acb_done; arc_buf_t *acb_buf; + boolean_t acb_encrypted; boolean_t acb_compressed; + boolean_t acb_noauth; + zbookmark_phys_t acb_zb; zio_t *acb_zio_dummy; zio_t *acb_zio_head; arc_callback_t *acb_next; @@ -963,6 +983,36 @@ typedef struct l1arc_buf_hdr { abd_t *b_pabd; } l1arc_buf_hdr_t; +/* + * Encrypted blocks will need to be stored encrypted on the L2ARC + * disk as they appear in the main pool. In order for this to work we + * need to pass around the encryption parameters so they can be used + * to write data to the L2ARC. This struct is only defined in the + * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_ENCRYPTED + * flag set. + */ +typedef struct arc_buf_hdr_crypt { + abd_t *b_rabd; /* raw encrypted data */ + dmu_object_type_t b_ot; /* object type */ + uint32_t b_ebufcnt; /* number or encryped buffers */ + + /* dsobj for looking up encryption key for l2arc encryption */ + uint64_t b_dsobj; /* for looking up key */ + + /* encryption parameters */ + uint8_t b_salt[ZIO_DATA_SALT_LEN]; + uint8_t b_iv[ZIO_DATA_IV_LEN]; + + /* + * Technically this could be removed since we will always be able to + * get the mac from the bp when we need it. However, it is inconvenient + * for callers of arc code to have to pass a bp in all the time. This + * also allows us to assert that L2ARC data is properly encrypted to + * match the data in the main storage pool. + */ + uint8_t b_mac[ZIO_DATA_MAC_LEN]; +} arc_buf_hdr_crypt_t; + typedef struct l2arc_dev l2arc_dev_t; typedef struct l2arc_buf_hdr { @@ -1013,6 +1063,11 @@ struct arc_buf_hdr { l2arc_buf_hdr_t b_l2hdr; /* L1ARC fields. Undefined when in l2arc_only state */ l1arc_buf_hdr_t b_l1hdr; + /* + * Encryption parameters. Defined only when ARC_FLAG_ENCRYPTED + * is set and the L1 header exists. + */ + arc_buf_hdr_crypt_t b_crypt_hdr; }; #define GHOST_STATE(state) \ @@ -1035,6 +1090,8 @@ struct arc_buf_hdr { #define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) #define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) #define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED) +#define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH) #define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) #define HDR_ISTYPE_METADATA(hdr) \ @@ -1043,6 +1100,13 @@ struct arc_buf_hdr { #define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) #define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) +#define HDR_HAS_RABD(hdr) \ + (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \ + (hdr)->b_crypt_hdr.b_rabd != NULL) +#define HDR_ENCRYPTED(hdr) \ + (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) +#define HDR_AUTHENTICATED(hdr) \ + (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) /* For storing compression mode in b_flags */ #define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) @@ -1055,12 +1119,14 @@ struct arc_buf_hdr { #define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) #define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) #define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) +#define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED) /* * Other sizes */ -#define HDR_FULL_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) #define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) /* @@ -1174,13 +1240,21 @@ static kcondvar_t l2arc_feed_thr_cv; static uint8_t l2arc_thread_exit; static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *); +typedef enum arc_fill_flags { + ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ + ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ + ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */ + ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */ + ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ +} arc_fill_flags_t; + static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *); static void arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); -static void arc_hdr_free_pabd(arc_buf_hdr_t *); -static void arc_hdr_alloc_pabd(arc_buf_hdr_t *); +static void arc_hdr_free_pabd(arc_buf_hdr_t *, boolean_t); +static void arc_hdr_alloc_pabd(arc_buf_hdr_t *, boolean_t); static void arc_access(arc_buf_hdr_t *, kmutex_t *); static boolean_t arc_is_overflowing(); static void arc_buf_watch(arc_buf_t *); @@ -1323,7 +1397,9 @@ buf_hash_remove(arc_buf_hdr_t *hdr) /* * Global data structures and functions for the buf kmem cache. */ + static kmem_cache_t *hdr_full_cache; +static kmem_cache_t *hdr_full_crypt_cache; static kmem_cache_t *hdr_l2only_cache; static kmem_cache_t *buf_cache; @@ -1337,6 +1413,7 @@ buf_fini(void) for (i = 0; i < BUF_LOCKS; i++) mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); kmem_cache_destroy(hdr_full_cache); + kmem_cache_destroy(hdr_full_crypt_cache); kmem_cache_destroy(hdr_l2only_cache); kmem_cache_destroy(buf_cache); } @@ -1352,6 +1429,7 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag) arc_buf_hdr_t *hdr = vbuf; bzero(hdr, HDR_FULL_SIZE); + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); @@ -1363,6 +1441,19 @@ hdr_full_cons(void *vbuf, void *unused, int kmflag) /* ARGSUSED */ static int +hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + + (void) hdr_full_cons(vbuf, unused, kmflag); + bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr)); + arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int hdr_l2only_cons(void *vbuf, void *unused, int kmflag) { arc_buf_hdr_t *hdr = vbuf; @@ -1406,6 +1497,16 @@ hdr_full_dest(void *vbuf, void *unused) /* ARGSUSED */ static void +hdr_full_crypt_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *hdr = vbuf; + + hdr_full_dest(hdr, unused); + arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void hdr_l2only_dest(void *vbuf, void *unused) { arc_buf_hdr_t *hdr = vbuf; @@ -1467,6 +1568,9 @@ retry: hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, 0, hdr_full_cons, hdr_full_dest, hdr_recl, NULL, NULL, 0); + hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", + HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, + hdr_recl, NULL, NULL, 0); hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, hdr_recl, NULL, NULL, 0); @@ -1501,6 +1605,47 @@ arc_buf_lsize(arc_buf_t *buf) return (HDR_GET_LSIZE(buf->b_hdr)); } +/* + * This function will return B_TRUE if the buffer is encrypted in memory. + * This buffer can be decrypted by calling arc_untransform(). + */ +boolean_t +arc_is_encrypted(arc_buf_t *buf) +{ + return (ARC_BUF_ENCRYPTED(buf) != 0); +} + +/* + * Returns B_TRUE if the buffer represents data that has not had its MAC + * verified yet. + */ +boolean_t +arc_is_unauthenticated(arc_buf_t *buf) +{ + return (HDR_NOAUTH(buf->b_hdr) != 0); +} + +void +arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, + uint8_t *iv, uint8_t *mac) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(HDR_PROTECTED(hdr)); + + bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); + bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); + bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); + *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? + /* CONSTCOND */ + ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; +} + +/* + * Indicates how this buffer is compressed in memory. If it is not compressed + * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with + * arc_untransform() as long as it is also unencrypted. + */ enum zio_compress arc_get_compression(arc_buf_t *buf) { @@ -1510,6 +1655,18 @@ arc_get_compression(arc_buf_t *buf) #define ARC_MINTIME (hz>>4) /* 62 ms */ +/* + * Return the compression algorithm used to store this data in the ARC. If ARC + * compression is enabled or this is an encrypted block, this will be the same + * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF. + */ +static inline enum zio_compress +arc_hdr_get_compress(arc_buf_hdr_t *hdr) +{ + return (HDR_COMPRESSION_ENABLED(hdr) ? + HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); +} + static inline boolean_t arc_buf_is_shared(arc_buf_t *buf) { @@ -1537,6 +1694,7 @@ static inline void arc_cksum_free(arc_buf_hdr_t *hdr) { ASSERT(HDR_HAS_L1HDR(hdr)); + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); if (hdr->b_l1hdr.b_freeze_cksum != NULL) { kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); @@ -1547,6 +1705,7 @@ arc_cksum_free(arc_buf_hdr_t *hdr) /* * Return true iff at least one of the bufs on hdr is not compressed. + * Encrypted buffers count as compressed. */ static boolean_t arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) @@ -1593,6 +1752,11 @@ arc_cksum_verify(arc_buf_t *buf) mutex_exit(&hdr->b_l1hdr.b_freeze_lock); } +/* + * This function makes the assumption that data stored in the L2ARC + * will be transformed exactly as it is in the main pool. Because of + * this we can verify the checksum against the reading process's bp. + */ static boolean_t arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) { @@ -1689,6 +1853,7 @@ arc_cksum_compute(arc_buf_t *buf) return; } + ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(!ARC_BUF_COMPRESSED(buf)); hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), KM_SLEEP); @@ -1881,15 +2046,14 @@ arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) */ if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); - HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); } else { arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); - HDR_SET_COMPRESS(hdr, cmp); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); ASSERT(HDR_COMPRESSION_ENABLED(hdr)); } + + HDR_SET_COMPRESS(hdr, cmp); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); } /* @@ -1921,15 +2085,250 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf) } /* + * Note: With encryption support, the following assertion is no longer + * necessarily valid. If we receive two back to back raw snapshots + * (send -w), the second receive can use a hdr with a cksum already + * calculated. This happens via: + * dmu_recv_stream() -> receive_read_record() -> arc_loan_raw_buf() + * The rsend/send_mixed_raw test case exercises this code path. + * * There were no decompressed bufs, so there should not be a * checksum on the hdr either. + * EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); */ - EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); return (copied); } /* + * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. + */ +static uint64_t +arc_hdr_size(arc_buf_hdr_t *hdr) +{ + uint64_t size; + + if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && + HDR_GET_PSIZE(hdr) > 0) { + size = HDR_GET_PSIZE(hdr); + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); + size = HDR_GET_LSIZE(hdr); + } + return (size); +} + +static int +arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) +{ + int ret; + uint64_t csize; + uint64_t lsize = HDR_GET_LSIZE(hdr); + uint64_t psize = HDR_GET_PSIZE(hdr); + void *tmpbuf = NULL; + abd_t *abd = hdr->b_l1hdr.b_pabd; + + ASSERT(HDR_LOCK(hdr) == NULL || MUTEX_HELD(HDR_LOCK(hdr))); + ASSERT(HDR_AUTHENTICATED(hdr)); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + + /* + * The MAC is calculated on the compressed data that is stored on disk. + * However, if compressed arc is disabled we will only have the + * decompressed data available to us now. Compress it into a temporary + * abd so we can verify the MAC. The performance overhead of this will + * be relatively low, since most objects in an encrypted objset will + * be encrypted (instead of authenticated) anyway. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr)) { + tmpbuf = zio_buf_alloc(lsize); + abd = abd_get_from_buf(tmpbuf, lsize); + abd_take_ownership_of_buf(abd, B_TRUE); + + csize = zio_compress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pabd, tmpbuf, lsize); + ASSERT3U(csize, <=, psize); + abd_zero_off(abd, csize, psize - csize); + } + + /* + * Authentication is best effort. We authenticate whenever the key is + * available. If we succeed we clear ARC_FLAG_NOAUTH. + */ + if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + ASSERT3U(lsize, ==, psize); + ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd, + psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); + } else { + ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize, + hdr->b_crypt_hdr.b_mac); + } + + if (ret == 0) + arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); + else if (ret != ENOENT) + goto error; + + if (tmpbuf != NULL) + abd_free(abd); + + return (0); + +error: + if (tmpbuf != NULL) + abd_free(abd); + + return (ret); +} + +/* + * This function will take a header that only has raw encrypted data in + * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in + * b_l1hdr.b_pabd. If designated in the header flags, this function will + * also decompress the data. + */ +static int +arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) +{ + int ret; + abd_t *cabd = NULL; + void *tmp = NULL; + boolean_t no_crypt = B_FALSE; + boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); + + ASSERT(HDR_LOCK(hdr) == NULL || MUTEX_HELD(HDR_LOCK(hdr))); + ASSERT(HDR_ENCRYPTED(hdr)); + + arc_hdr_alloc_pabd(hdr, B_FALSE); + + ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, + B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, + hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, + hdr->b_crypt_hdr.b_rabd, &no_crypt); + if (ret != 0) + goto error; + + if (no_crypt) { + abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, + HDR_GET_PSIZE(hdr)); + } + + /* + * If this header has disabled arc compression but the b_pabd is + * compressed after decrypting it, we need to decompress the newly + * decrypted data. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr)) { + /* + * We want to make sure that we are correctly honoring the + * zfs_abd_scatter_enabled setting, so we allocate an abd here + * and then loan a buffer from it, rather than allocating a + * linear buffer and wrapping it in an abd later. + */ + cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); + tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); + + ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + if (ret != 0) { + abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); + goto error; + } + + abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, + arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_pabd = cabd; + } + + return (0); + +error: + arc_hdr_free_pabd(hdr, B_FALSE); + if (cabd != NULL) + arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr); + + return (ret); +} + +/* + * This function is called during arc_buf_fill() to prepare the header's + * abd plaintext pointer for use. This involves authenticated protected + * data and decrypting encrypted data into the plaintext abd. + */ +static int +arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa, + const zbookmark_phys_t *zb, boolean_t noauth) +{ + int ret; + + ASSERT(HDR_PROTECTED(hdr)); + + if (hash_lock != NULL) + mutex_enter(hash_lock); + + if (HDR_NOAUTH(hdr) && !noauth) { + /* + * The caller requested authenticated data but our data has + * not been authenticated yet. Verify the MAC now if we can. + */ + ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset); + if (ret != 0) + goto error; + } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) { + /* + * If we only have the encrypted version of the data, but the + * unencrypted version was requested we take this opportunity + * to store the decrypted version in the header for future use. + */ + ret = arc_hdr_decrypt(hdr, spa, zb); + if (ret != 0) + goto error; + } + + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + + if (hash_lock != NULL) + mutex_exit(hash_lock); + + return (0); + +error: + if (hash_lock != NULL) + mutex_exit(hash_lock); + + return (ret); +} + +/* + * This function is used by the dbuf code to decrypt bonus buffers in place. + * The dbuf code itself doesn't have any locking for decrypting a shared dnode + * block, so we use the hash lock here to protect against concurrent calls to + * arc_buf_fill(). + */ +/* ARGSUSED */ +static void +arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(HDR_ENCRYPTED(hdr)); + ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); + ASSERT(HDR_LOCK(hdr) == NULL || MUTEX_HELD(HDR_LOCK(hdr))); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + + zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, + arc_buf_size(buf)); + buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; + buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; + hdr->b_crypt_hdr.b_ebufcnt -= 1; +} + +/* * Given a buf that has a data buffer attached to it, this function will * efficiently fill the buf with data of the specified compression setting from * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr @@ -1943,15 +2342,90 @@ arc_buf_try_copy_decompressed_data(arc_buf_t *buf) * the correct-sized data buffer. */ static int -arc_buf_fill(arc_buf_t *buf, boolean_t compressed) +arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, + arc_fill_flags_t flags) { + int error = 0; arc_buf_hdr_t *hdr = buf->b_hdr; - boolean_t hdr_compressed = (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); + boolean_t hdr_compressed = + (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); + boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0; + boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0; dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; + kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr); ASSERT3P(buf->b_data, !=, NULL); - IMPLY(compressed, hdr_compressed); + IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf)); IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); + IMPLY(encrypted, HDR_ENCRYPTED(hdr)); + IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); + IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); + IMPLY(encrypted, !ARC_BUF_SHARED(buf)); + + /* + * If the caller wanted encrypted data we just need to copy it from + * b_rabd and potentially byteswap it. We won't be able to do any + * further transforms on it. + */ + if (encrypted) { + ASSERT(HDR_HAS_RABD(hdr)); + abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd, + HDR_GET_PSIZE(hdr)); + goto byteswap; + } + + /* + * Adjust encrypted and authenticated headers to accomodate + * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are + * allowed to fail decryption due to keys not being loaded + * without being marked as an IO error. + */ + if (HDR_PROTECTED(hdr)) { + error = arc_fill_hdr_crypt(hdr, hash_lock, spa, + zb, !!(flags & ARC_FILL_NOAUTH)); + if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) { + return (error); + } else if (error != 0) { + if (hash_lock != NULL) + mutex_enter(hash_lock); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); + if (hash_lock != NULL) + mutex_exit(hash_lock); + return (error); + } + } + + /* + * There is a special case here for dnode blocks which are + * decrypting their bonus buffers. These blocks may request to + * be decrypted in-place. This is necessary because there may + * be many dnodes pointing into this buffer and there is + * currently no method to synchronize replacing the backing + * b_data buffer and updating all of the pointers. Here we use + * the hash lock to ensure there are no races. If the need + * arises for other types to be decrypted in-place, they must + * add handling here as well. + */ + if ((flags & ARC_FILL_IN_PLACE) != 0) { + ASSERT(!hdr_compressed); + ASSERT(!compressed); + ASSERT(!encrypted); + + if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) { + ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); + + if (hash_lock != NULL) + mutex_enter(hash_lock); + arc_buf_untransform_in_place(buf, hash_lock); + if (hash_lock != NULL) + mutex_exit(hash_lock); + + /* Compute the hdr's checksum if necessary */ + arc_cksum_compute(buf); + } + + return (0); + } if (hdr_compressed == compressed) { if (!arc_buf_is_shared(buf)) { @@ -1970,7 +2444,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed) if (arc_buf_is_shared(buf)) { ASSERT(ARC_BUF_COMPRESSED(buf)); - /* We need to give the buf it's own b_data */ + /* We need to give the buf its own b_data */ buf->b_flags &= ~ARC_BUF_FLAG_SHARED; buf->b_data = arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); @@ -2006,7 +2480,7 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed) ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, !=, NULL); return (0); } else { - int error = zio_decompress_data(HDR_GET_COMPRESS(hdr), + error = zio_decompress_data(HDR_GET_COMPRESS(hdr), hdr->b_l1hdr.b_pabd, buf->b_data, HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); @@ -2017,13 +2491,19 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed) if (error != 0) { zfs_dbgmsg( "hdr %p, compress %d, psize %d, lsize %d", - hdr, HDR_GET_COMPRESS(hdr), + hdr, arc_hdr_get_compress(hdr), HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); + if (hash_lock != NULL) + mutex_enter(hash_lock); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); + if (hash_lock != NULL) + mutex_exit(hash_lock); return (SET_ERROR(EIO)); } } } +byteswap: /* Byteswap the buf's data if necessary */ if (bswap != DMU_BSWAP_NUMFUNCS) { ASSERT(!HDR_SHARED_DATA(hdr)); @@ -2037,28 +2517,35 @@ arc_buf_fill(arc_buf_t *buf, boolean_t compressed) return (0); } -int -arc_decompress(arc_buf_t *buf) -{ - return (arc_buf_fill(buf, B_FALSE)); -} - /* - * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. + * If this function is being called to decrypt an encrypted buffer or verify an + * authenticated one, the key must be loaded and a mapping must be made + * available in the keystore via spa_keystore_create_mapping() or one of its + * callers. */ -static uint64_t -arc_hdr_size(arc_buf_hdr_t *hdr) +int +arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, + boolean_t in_place) { - uint64_t size; + int ret; + arc_fill_flags_t flags = 0; - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && - HDR_GET_PSIZE(hdr) > 0) { - size = HDR_GET_PSIZE(hdr); - } else { - ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); - size = HDR_GET_LSIZE(hdr); + if (in_place) + flags |= ARC_FILL_IN_PLACE; + + ret = arc_buf_fill(buf, spa, zb, flags); + if (ret == ECKSUM) { + /* + * Convert authentication and decryption errors to EIO + * (and generate an ereport) before leaving the ARC. + */ + ret = SET_ERROR(EIO); + spa_log_error(spa, zb); + zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + spa, NULL, zb, NULL, 0, 0); } - return (size); + + return (ret); } /* @@ -2077,6 +2564,7 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_add_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; @@ -2087,6 +2575,10 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) (void) zfs_refcount_add_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } + if (HDR_HAS_RABD(hdr)) { + (void) zfs_refcount_add_many(&state->arcs_esize[type], + HDR_GET_PSIZE(hdr), hdr); + } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) @@ -2112,6 +2604,7 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) ASSERT0(hdr->b_l1hdr.b_bufcnt); ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); (void) zfs_refcount_remove_many(&state->arcs_esize[type], HDR_GET_LSIZE(hdr), hdr); return; @@ -2122,6 +2615,10 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) (void) zfs_refcount_remove_many(&state->arcs_esize[type], arc_hdr_size(hdr), hdr); } + if (HDR_HAS_RABD(hdr)) { + (void) zfs_refcount_remove_many(&state->arcs_esize[type], + HDR_GET_PSIZE(hdr), hdr); + } for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { if (arc_buf_is_shared(buf)) @@ -2215,7 +2712,9 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, old_state = hdr->b_l1hdr.b_state; refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); bufcnt = hdr->b_l1hdr.b_bufcnt; - update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL); + + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || + HDR_HAS_RABD(hdr)); } else { old_state = arc_l2c_only; refcnt = 0; @@ -2286,6 +2785,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, (void) zfs_refcount_add_many(&new_state->arcs_size, HDR_GET_LSIZE(hdr), hdr); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); } else { uint32_t buffers = 0; @@ -2319,8 +2819,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, (void) zfs_refcount_add_many( &new_state->arcs_size, arc_hdr_size(hdr), hdr); - } else { - ASSERT(GHOST_STATE(old_state)); + } + + if (HDR_HAS_RABD(hdr)) { + (void) zfs_refcount_add_many( + &new_state->arcs_size, + HDR_GET_PSIZE(hdr), hdr); } } } @@ -2330,6 +2834,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, if (GHOST_STATE(old_state)) { ASSERT0(bufcnt); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); /* * When moving a header off of a ghost state, @@ -2369,9 +2874,20 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, buf); } ASSERT3U(bufcnt, ==, buffers); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - (void) zfs_refcount_remove_many( - &old_state->arcs_size, arc_hdr_size(hdr), hdr); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || + HDR_HAS_RABD(hdr)); + + if (hdr->b_l1hdr.b_pabd != NULL) { + (void) zfs_refcount_remove_many( + &old_state->arcs_size, arc_hdr_size(hdr), + hdr); + } + + if (HDR_HAS_RABD(hdr)) { + (void) zfs_refcount_remove_many( + &old_state->arcs_size, HDR_GET_PSIZE(hdr), + hdr); + } } } @@ -2463,12 +2979,13 @@ arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) { /* * The criteria for sharing a hdr's data are: - * 1. the hdr's compression matches the buf's compression - * 2. the hdr doesn't need to be byteswapped - * 3. the hdr isn't already being shared - * 4. the buf is either compressed or it is the last buf in the hdr list + * 1. the buffer is not encrypted + * 2. the hdr's compression matches the buf's compression + * 3. the hdr doesn't need to be byteswapped + * 4. the hdr isn't already being shared + * 5. the buf is either compressed or it is the last buf in the hdr list * - * Criterion #4 maintains the invariant that shared uncompressed + * Criterion #5 maintains the invariant that shared uncompressed * bufs must be the final buf in the hdr's b_buf list. Reading this, you * might ask, "if a compressed buf is allocated first, won't that be the * last thing in the list?", but in that case it's impossible to create @@ -2483,9 +3000,11 @@ arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) * sharing if the new buf isn't the first to be added. */ ASSERT3P(buf->b_hdr, ==, hdr); - boolean_t hdr_compressed = HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF; + boolean_t hdr_compressed = arc_hdr_get_compress(hdr) != + ZIO_COMPRESS_OFF; boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; - return (buf_compressed == hdr_compressed && + return (!ARC_BUF_ENCRYPTED(buf) && + buf_compressed == hdr_compressed && hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && !HDR_SHARED_DATA(hdr) && (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); @@ -2497,10 +3016,12 @@ arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) * copy was made successfully, or an error code otherwise. */ static int -arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, + void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, boolean_t fill, arc_buf_t **ret) { arc_buf_t *buf; + arc_fill_flags_t flags = ARC_FILL_LOCKED; ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); @@ -2508,6 +3029,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, hdr->b_type == ARC_BUFC_METADATA); ASSERT3P(ret, !=, NULL); ASSERT3P(*ret, ==, NULL); + IMPLY(encrypted, compressed); buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; @@ -2525,16 +3047,28 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, /* * Only honor requests for compressed bufs if the hdr is actually - * compressed. + * compressed. This must be overriden if the buffer is encrypted since + * encrypted buffers cannot be decompressed. */ - if (compressed && HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) + if (encrypted) { buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; + buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED; + flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED; + } else if (compressed && + arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { + buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; + flags |= ARC_FILL_COMPRESSED; + } + + if (noauth) { + ASSERT0(encrypted); + flags |= ARC_FILL_NOAUTH; + } /* * If the hdr's data can be shared then we share the data buffer and * set the appropriate bit in the hdr's b_flags to indicate the hdr is - * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new - * buffer to store the buf's data. + * allocate a new buffer to store the buf's data. * * There are two additional restrictions here because we're sharing * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be @@ -2545,7 +3079,7 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, * need to be ABD-aware. */ boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) && - abd_is_linear(hdr->b_l1hdr.b_pabd); + hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd); /* Set up b_data and sharing */ if (can_share) { @@ -2561,13 +3095,16 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, void *tag, boolean_t compressed, hdr->b_l1hdr.b_buf = buf; hdr->b_l1hdr.b_bufcnt += 1; + if (encrypted) + hdr->b_crypt_hdr.b_ebufcnt += 1; /* * If the user wants the data from the hdr, we need to either copy or * decompress the data. */ if (fill) { - return (arc_buf_fill(buf, ARC_BUF_COMPRESSED(buf) != 0)); + ASSERT3P(zb, !=, NULL); + return (arc_buf_fill(buf, spa, zb, flags)); } return (0); @@ -2613,6 +3150,19 @@ arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, return (buf); } +arc_buf_t * +arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, + const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, + dmu_object_type_t ot, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type) +{ + arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj, + byteorder, salt, iv, mac, ot, psize, lsize, compression_type); + + atomic_add_64(&arc_loaned_bytes, psize); + return (buf); +} + /* * Return a loaned arc buffer to the arc. @@ -2658,11 +3208,11 @@ l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type) } static void -arc_hdr_free_on_write(arc_buf_hdr_t *hdr) +arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata) { arc_state_t *state = hdr->b_l1hdr.b_state; arc_buf_contents_t type = arc_buf_type(hdr); - uint64_t size = arc_hdr_size(hdr); + uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); /* protected by hash lock, if in the hash table */ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) { @@ -2680,7 +3230,11 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr) arc_space_return(size, ARC_SPACE_DATA); } - l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); + if (free_rdata) { + l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type); + } else { + l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type); + } } /* @@ -2691,10 +3245,12 @@ arc_hdr_free_on_write(arc_buf_hdr_t *hdr) static void arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { + /* LINTED */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(arc_can_share(hdr, buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!ARC_BUF_ENCRYPTED(buf)); ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); /* @@ -2702,7 +3258,8 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * refcount ownership to the hdr since it always owns * the refcount whenever an arc_buf_t is shared. */ - zfs_refcount_transfer_ownership(&state->arcs_size, buf, hdr); + zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, + arc_hdr_size(hdr), buf, hdr); hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf)); abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd, HDR_ISTYPE_METADATA(hdr)); @@ -2722,6 +3279,7 @@ arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) static void arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) { + /* LINTED */ arc_state_t *state = hdr->b_l1hdr.b_state; ASSERT(arc_buf_is_shared(buf)); @@ -2732,7 +3290,8 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) * We are no longer sharing this buffer so we need * to transfer its ownership to the rightful owner. */ - zfs_refcount_transfer_ownership(&state->arcs_size, hdr, buf); + zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, + arc_hdr_size(hdr), hdr, buf); arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); abd_put(hdr->b_l1hdr.b_pabd); @@ -2756,12 +3315,12 @@ arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) static arc_buf_t * arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) { - ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); - arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; arc_buf_t *lastbuf = NULL; + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr)) || HDR_EMPTY(hdr)); + /* * Remove the buf from the hdr list and locate the last * remaining buffer on the list. @@ -2824,6 +3383,21 @@ arc_buf_destroy_impl(arc_buf_t *buf) ASSERT(hdr->b_l1hdr.b_bufcnt > 0); hdr->b_l1hdr.b_bufcnt -= 1; + + if (ARC_BUF_ENCRYPTED(buf)) { + hdr->b_crypt_hdr.b_ebufcnt -= 1; + + /* + * If we have no more encrypted buffers and we've + * already gotten a copy of the decrypted data we can + * free b_rabd to save some space. + */ + if (hdr->b_crypt_hdr.b_ebufcnt == 0 && + HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL && + !HDR_IO_IN_PROGRESS(hdr)) { + arc_hdr_free_pabd(hdr, B_TRUE); + } + } } arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); @@ -2838,16 +3412,17 @@ arc_buf_destroy_impl(arc_buf_t *buf) * There is an equivalent case for compressed bufs, but since * they aren't guaranteed to be the last buf in the list and * that is an exceedingly rare case, we just allow that space be - * wasted temporarily. + * wasted temporarily. We must also be careful not to share + * encrypted buffers, since they cannot be shared. */ - if (lastbuf != NULL) { + if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) { /* Only one buf can be shared at once */ VERIFY(!arc_buf_is_shared(lastbuf)); /* hdr is uncompressed so can't have compressed buf */ VERIFY(!ARC_BUF_COMPRESSED(lastbuf)); ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - arc_hdr_free_pabd(hdr); + arc_hdr_free_pabd(hdr, B_FALSE); /* * We must setup a new shared block between the @@ -2868,7 +3443,7 @@ arc_buf_destroy_impl(arc_buf_t *buf) */ ASSERT3P(lastbuf, !=, NULL); ASSERT(arc_buf_is_shared(lastbuf) || - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); + arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); } /* @@ -2885,26 +3460,40 @@ arc_buf_destroy_impl(arc_buf_t *buf) } static void -arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr) +arc_hdr_alloc_pabd(arc_buf_hdr_t *hdr, boolean_t alloc_rdata) { + uint64_t size; + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata); + IMPLY(alloc_rdata, HDR_PROTECTED(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + if (alloc_rdata) { + size = HDR_GET_PSIZE(hdr); + ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL); + hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr); + ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL); + } else { + size = arc_hdr_size(hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + } - ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_compressed_size, size); ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); } static void -arc_hdr_free_pabd(arc_buf_hdr_t *hdr) +arc_hdr_free_pabd(arc_buf_hdr_t *hdr, boolean_t free_rdata) { + uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr); + ASSERT(HDR_HAS_L1HDR(hdr)); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); + IMPLY(free_rdata, HDR_HAS_RABD(hdr)); + /* * If the hdr is currently being written to the l2arc then @@ -2913,28 +3502,41 @@ arc_hdr_free_pabd(arc_buf_hdr_t *hdr) * writing it to the l2arc device. */ if (HDR_L2_WRITING(hdr)) { - arc_hdr_free_on_write(hdr); + arc_hdr_free_on_write(hdr, free_rdata); ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else if (free_rdata) { + arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr); } else { arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, - arc_hdr_size(hdr), hdr); + size, hdr); } - hdr->b_l1hdr.b_pabd = NULL; - hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; - ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + if (free_rdata) { + hdr->b_crypt_hdr.b_rabd = NULL; + } else { + hdr->b_l1hdr.b_pabd = NULL; + } + + if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + + ARCSTAT_INCR(arcstat_compressed_size, -size); ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); } static arc_buf_hdr_t * arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, - enum zio_compress compression_type, arc_buf_contents_t type) + boolean_t protected, enum zio_compress compression_type, + arc_buf_contents_t type, boolean_t alloc_rdata) { arc_buf_hdr_t *hdr; VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); - - hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); + if (protected) { + hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); + } else { + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); + } ASSERT(HDR_EMPTY(hdr)); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); ASSERT3P(hdr->b_l1hdr.b_thawed, ==, NULL); @@ -2945,6 +3547,8 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, hdr->b_flags = 0; arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); arc_hdr_set_compress(hdr, compression_type); + if (protected) + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); hdr->b_l1hdr.b_state = arc_anon; hdr->b_l1hdr.b_arc_access = 0; @@ -2956,7 +3560,7 @@ arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, * the compressed or uncompressed data depending on the block * it references and compressed arc enablement. */ - arc_hdr_alloc_pabd(hdr); + arc_hdr_alloc_pabd(hdr, alloc_rdata); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); return (hdr); @@ -2980,6 +3584,16 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || (old == hdr_l2only_cache && new == hdr_full_cache)); + /* + * if the caller wanted a new full header and the header is to be + * encrypted we will actually allocate the header from the full crypt + * cache instead. The same applies to freeing from the old cache. + */ + if (HDR_PROTECTED(hdr) && new == hdr_full_cache) + new = hdr_full_crypt_cache; + if (HDR_PROTECTED(hdr) && old == hdr_full_cache) + old = hdr_full_crypt_cache; + nhdr = kmem_cache_alloc(new, KM_PUSHPAGE); ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); @@ -2987,7 +3601,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) bcopy(hdr, nhdr, HDR_L2ONLY_SIZE); - if (new == hdr_full_cache) { + if (new == hdr_full_cache || new == hdr_full_crypt_cache) { arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR); /* * arc_access and arc_change_state need to be aware that a @@ -2998,6 +3612,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) /* Verify previous threads set to NULL before freeing */ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); } else { ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(hdr->b_l1hdr.b_bufcnt); @@ -3020,6 +3635,7 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) */ VERIFY(!HDR_L2_WRITING(hdr)); VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); #ifdef ZFS_DEBUG if (hdr->b_l1hdr.b_thawed != NULL) { @@ -3071,6 +3687,156 @@ arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) } /* + * This function allows an L1 header to be reallocated as a crypt + * header and vice versa. If we are going to a crypt header, the + * new fields will be zeroed out. + */ +static arc_buf_hdr_t * +arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt) +{ + arc_buf_hdr_t *nhdr; + arc_buf_t *buf; + kmem_cache_t *ncache, *ocache; + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node)); + ASSERT3P(hdr->b_hash_next, ==, NULL); + + if (need_crypt) { + ncache = hdr_full_crypt_cache; + ocache = hdr_full_cache; + } else { + ncache = hdr_full_cache; + ocache = hdr_full_crypt_cache; + } + + nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); + + /* + * Copy all members that aren't locks or condvars to the new header. + * No lists are pointing to us (as we asserted above), so we don't + * need to worry about the list nodes. + */ + nhdr->b_dva = hdr->b_dva; + nhdr->b_birth = hdr->b_birth; + nhdr->b_type = hdr->b_type; + nhdr->b_flags = hdr->b_flags; + nhdr->b_psize = hdr->b_psize; + nhdr->b_lsize = hdr->b_lsize; + nhdr->b_spa = hdr->b_spa; + nhdr->b_l2hdr.b_dev = hdr->b_l2hdr.b_dev; + nhdr->b_l2hdr.b_daddr = hdr->b_l2hdr.b_daddr; + nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; + nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; + nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; + nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; + nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; + nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; + nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; +#ifdef ZFS_DEBUG + if (hdr->b_l1hdr.b_thawed != NULL) { + nhdr->b_l1hdr.b_thawed = hdr->b_l1hdr.b_thawed; + hdr->b_l1hdr.b_thawed = NULL; + } +#endif + + /* + * This refcount_add() exists only to ensure that the individual + * arc buffers always point to a header that is referenced, avoiding + * a small race condition that could trigger ASSERTs. + */ + (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); + nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; + for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + mutex_enter(&buf->b_evict_lock); + buf->b_hdr = nhdr; + mutex_exit(&buf->b_evict_lock); + } + zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); + (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); + ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); + + if (need_crypt) { + arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); + } else { + arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); + } + + /* unset all members of the original hdr */ + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_type = ARC_BUFC_INVALID; + hdr->b_flags = 0; + hdr->b_psize = 0; + hdr->b_lsize = 0; + hdr->b_spa = 0; + hdr->b_l2hdr.b_dev = NULL; + hdr->b_l2hdr.b_daddr = 0; + hdr->b_l1hdr.b_freeze_cksum = NULL; + hdr->b_l1hdr.b_buf = NULL; + hdr->b_l1hdr.b_bufcnt = 0; + hdr->b_l1hdr.b_byteswap = 0; + hdr->b_l1hdr.b_state = NULL; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_acb = NULL; + hdr->b_l1hdr.b_pabd = NULL; + + if (ocache == hdr_full_crypt_cache) { + ASSERT(!HDR_HAS_RABD(hdr)); + hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; + hdr->b_crypt_hdr.b_ebufcnt = 0; + hdr->b_crypt_hdr.b_dsobj = 0; + bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); + bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); + bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); + } + + buf_discard_identity(hdr); + kmem_cache_free(ocache, hdr); + + return (nhdr); +} + +/* + * This function is used by the send / receive code to convert a newly + * allocated arc_buf_t to one that is suitable for a raw encrypted write. It + * is also used to allow the root objset block to be uupdated without altering + * its embedded MACs. Both block types will always be uncompressed so we do not + * have to worry about compression type or psize. + */ +void +arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, + dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, + const uint8_t *mac) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + + buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); + if (!HDR_PROTECTED(hdr)) + hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); + hdr->b_crypt_hdr.b_dsobj = dsobj; + hdr->b_crypt_hdr.b_ot = ot; + hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? + DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); + if (!arc_hdr_has_uncompressed_buf(hdr)) + arc_cksum_free(hdr); + + if (salt != NULL) + bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); + if (iv != NULL) + bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); + if (mac != NULL) + bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); +} + +/* * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. * The buf is returned thawed since we expect the consumer to modify it. */ @@ -3078,11 +3844,12 @@ arc_buf_t * arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) { arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, - ZIO_COMPRESS_OFF, type); + B_FALSE, ZIO_COMPRESS_OFF, type, B_FALSE); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); arc_buf_t *buf = NULL; - VERIFY0(arc_buf_alloc_impl(hdr, tag, B_FALSE, B_FALSE, &buf)); + VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, + B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); return (buf); @@ -3098,33 +3865,76 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, { ASSERT3U(lsize, >, 0); ASSERT3U(lsize, >=, psize); - ASSERT(compression_type > ZIO_COMPRESS_OFF); - ASSERT(compression_type < ZIO_COMPRESS_FUNCTIONS); + ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF); + ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - compression_type, ARC_BUFC_DATA); + B_FALSE, compression_type, ARC_BUFC_DATA, B_FALSE); ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); arc_buf_t *buf = NULL; - VERIFY0(arc_buf_alloc_impl(hdr, tag, B_TRUE, B_FALSE, &buf)); + VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, + B_TRUE, B_FALSE, B_FALSE, &buf)); arc_buf_thaw(buf); ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); if (!arc_buf_is_shared(buf)) { /* * To ensure that the hdr has the correct data in it if we call - * arc_decompress() on this buf before it's been written to + * arc_untransform() on this buf before it's been written to * disk, it's easiest if we just set up sharing between the * buf and the hdr. */ ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd)); - arc_hdr_free_pabd(hdr); + arc_hdr_free_pabd(hdr, B_FALSE); arc_share_buf(hdr, buf); } return (buf); } +arc_buf_t * +arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, + const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, + dmu_object_type_t ot, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ? + ARC_BUFC_METADATA : ARC_BUFC_DATA; + + ASSERT3U(lsize, >, 0); + ASSERT3U(lsize, >=, psize); + ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF); + ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); + + hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, + compression_type, type, B_TRUE); + ASSERT(!MUTEX_HELD(HDR_LOCK(hdr))); + + hdr->b_crypt_hdr.b_dsobj = dsobj; + hdr->b_crypt_hdr.b_ot = ot; + hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? + DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); + bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); + bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); + bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); + + /* + * This buffer will be considered encrypted even if the ot is not an + * encrypted type. It will become authenticated instead in + * arc_write_ready(). + */ + buf = NULL; + VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE, + B_FALSE, B_FALSE, &buf)); + arc_buf_thaw(buf); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + + return (buf); +} + static void arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr) { @@ -3200,15 +4010,23 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) #endif if (hdr->b_l1hdr.b_pabd != NULL) { - arc_hdr_free_pabd(hdr); + arc_hdr_free_pabd(hdr, B_FALSE); } + + if (HDR_HAS_RABD(hdr)) + arc_hdr_free_pabd(hdr, B_TRUE); } ASSERT3P(hdr->b_hash_next, ==, NULL); if (HDR_HAS_L1HDR(hdr)) { ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); - kmem_cache_free(hdr_full_cache, hdr); + + if (!HDR_PROTECTED(hdr)) { + kmem_cache_free(hdr_full_cache, hdr); + } else { + kmem_cache_free(hdr_full_crypt_cache, hdr); + } } else { kmem_cache_free(hdr_l2only_cache, hdr); } @@ -3242,7 +4060,7 @@ arc_buf_destroy(arc_buf_t *buf, void* tag) /* * Evict the arc_buf_hdr that is provided as a parameter. The resultant - * state of the header is dependent on it's state prior to entering this + * state of the header is dependent on its state prior to entering this * function. The following transitions are possible: * * - arc_mru -> arc_mru_ghost @@ -3270,9 +4088,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) /* * l2arc_write_buffers() relies on a header's L1 portion - * (i.e. its b_pabd field) during it's write phase. + * (i.e. its b_pabd field) during its write phase. * Thus, we cannot push a header onto the arc_l2c_only - * state (removing it's L1 piece) until the header is + * state (removing its L1 piece) until the header is * done being written to the l2arc. */ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) { @@ -3285,8 +4103,9 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr); - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); if (HDR_HAS_L2HDR(hdr)) { + ASSERT(hdr->b_l1hdr.b_pabd == NULL); + ASSERT(!HDR_HAS_RABD(hdr)); /* * This buffer is cached on the 2nd Level ARC; * don't destroy the header. @@ -3352,7 +4171,11 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * This ensures that the accounting is updated correctly * in arc_free_data_impl(). */ - arc_hdr_free_pabd(hdr); + if (hdr->b_l1hdr.b_pabd != NULL) + arc_hdr_free_pabd(hdr, B_FALSE); + + if (HDR_HAS_RABD(hdr)) + arc_hdr_free_pabd(hdr, B_TRUE); arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); @@ -4323,7 +5146,7 @@ arc_reap_cb(void *arg, zthr_t *zthr) /* * Adapt arc info given the number of bytes we are trying to add and - * the state that we are comming from. This function is only called + * the state that we are coming from. This function is only called * when we are adding new content to the cache. */ static void @@ -4464,7 +5287,7 @@ arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag) * upper limit, we must be adding data faster than the evict * thread can evict. Thus, to ensure we don't compound the * problem by adding more data and forcing arc_size to grow even - * further past it's target size, we halt and wait for the + * further past its target size, we halt and wait for the * eviction thread to catch up. * * It's also possible that the reclaim thread is unable to evict @@ -4799,24 +5622,69 @@ arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, } static void -arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) +arc_hdr_verify(arc_buf_hdr_t *hdr, const blkptr_t *bp) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); } else { if (HDR_COMPRESSION_ENABLED(hdr)) { - ASSERT3U(HDR_GET_COMPRESS(hdr), ==, + ASSERT3U(arc_hdr_get_compress(hdr), ==, BP_GET_COMPRESS(bp)); } ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); + ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp)); + } +} + +/* + * XXX this should be changed to return an error, and callers + * re-read from disk on failure (on nondebug bits). + */ +static void +arc_hdr_verify_checksum(spa_t *spa, arc_buf_hdr_t *hdr, const blkptr_t *bp) +{ + arc_hdr_verify(hdr, bp); + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + return; + int err = 0; + abd_t *abd = NULL; + if (BP_IS_ENCRYPTED(bp)) { + if (HDR_HAS_RABD(hdr)) { + abd = hdr->b_crypt_hdr.b_rabd; + } + } else if (HDR_COMPRESSION_ENABLED(hdr)) { + abd = hdr->b_l1hdr.b_pabd; + } + if (abd != NULL) { + /* + * The offset is only used for labels, which are not + * cached in the ARC, so it doesn't matter what we + * pass for the offset parameter. + */ + int psize = HDR_GET_PSIZE(hdr); + err = zio_checksum_error_impl(spa, bp, + BP_GET_CHECKSUM(bp), abd, psize, 0, NULL); + if (err != 0) { + /* + * Use abd_copy_to_buf() rather than + * abd_borrow_buf_copy() so that we are sure to + * include the buf in crash dumps. + */ + void *buf = kmem_alloc(psize, KM_SLEEP); + abd_copy_to_buf(buf, abd, psize); + panic("checksum of cached data doesn't match BP " + "err=%u hdr=%p bp=%p abd=%p buf=%p", + err, (void *)hdr, (void *)bp, (void *)abd, buf); + } } } static void arc_read_done(zio_t *zio) { + blkptr_t *bp = zio->io_bp; arc_buf_hdr_t *hdr = zio->io_private; kmutex_t *hash_lock = NULL; arc_callback_t *callback_list; @@ -4847,6 +5715,26 @@ arc_read_done(zio_t *zio) ASSERT3P(hash_lock, !=, NULL); } + if (BP_IS_PROTECTED(bp)) { + hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); + hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; + zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, + hdr->b_crypt_hdr.b_iv); + + if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { + void *tmpbuf; + + tmpbuf = abd_borrow_buf_copy(zio->io_abd, + sizeof (zil_chain_t)); + zio_crypt_decode_mac_zil(tmpbuf, + hdr->b_crypt_hdr.b_mac); + abd_return_buf(zio->io_abd, tmpbuf, + sizeof (zil_chain_t)); + } else { + zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); + } + } + if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { @@ -4895,8 +5783,32 @@ arc_read_done(zio_t *zio) if (zio->io_error != 0) continue; - int error = arc_buf_alloc_impl(hdr, acb->acb_private, - acb->acb_compressed, B_TRUE, &acb->acb_buf); + int error = arc_buf_alloc_impl(hdr, zio->io_spa, + &acb->acb_zb, acb->acb_private, acb->acb_encrypted, + acb->acb_compressed, acb->acb_noauth, B_TRUE, + &acb->acb_buf); + + /* + * Assert non-speculative zios didn't fail because an + * encryption key wasn't loaded + */ + ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) || + error != EACCES); + + /* + * If we failed to decrypt, report an error now (as the zio + * layer would have done if it had done the transforms). + */ + if (error == ECKSUM) { + ASSERT(BP_IS_PROTECTED(bp)); + error = SET_ERROR(EIO); + if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { + spa_log_error(zio->io_spa, &acb->acb_zb); + zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0); + } + } + if (error != 0) { /* * Decompression failed. Set io_error @@ -4915,6 +5827,7 @@ arc_read_done(zio_t *zio) zio->io_error = error; } } + /* * If there are multiple callbacks, we must have the hash lock, * because the only way for multiple threads to find this hdr is @@ -4926,11 +5839,8 @@ arc_read_done(zio_t *zio) hdr->b_l1hdr.b_acb = NULL; arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); - if (callback_cnt == 0) { - ASSERT(HDR_PREFETCH(hdr)); - ASSERT0(hdr->b_l1hdr.b_bufcnt); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - } + if (callback_cnt == 0) + ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); @@ -4968,6 +5878,7 @@ arc_read_done(zio_t *zio) /* execute each callback and free its structure */ while ((acb = callback_list) != NULL) { + if (acb->acb_done != NULL) { if (zio->io_error != 0 && acb->acb_buf != NULL) { /* @@ -5022,7 +5933,11 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, kmutex_t *hash_lock = NULL; zio_t *rzio; uint64_t guid = spa_load_guid(spa); - boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; + boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0; + boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) && + (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; + boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) && + (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0; int rc = 0; ASSERT(!BP_IS_EMBEDDED(bp) || @@ -5037,7 +5952,15 @@ top: hdr = buf_hash_find(guid, bp, &hash_lock); } - if (hdr != NULL && HDR_HAS_L1HDR(hdr) && hdr->b_l1hdr.b_pabd != NULL) { + /* + * Determine if we have an L1 cache hit or a cache miss. For simplicity + * we maintain encrypted data seperately from compressed / uncompressed + * data. If the user is requesting raw encrypted data and we don't have + * that in the header we will read from disk to guarantee that we can + * get it even if the encryption keys aren't loaded. + */ + if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) || + (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) { arc_buf_t *buf = NULL; *arc_flags |= ARC_FLAG_CACHED; @@ -5077,6 +6000,9 @@ top: acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; + acb->acb_encrypted = encrypted_read; + acb->acb_noauth = noauth_read; + acb->acb_zb = *zb; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, spa, NULL, NULL, NULL, zio_flags); @@ -5120,15 +6046,35 @@ top: ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); + arc_hdr_verify_checksum(spa, hdr, bp); + /* Get a buf with the desired data in it. */ - rc = arc_buf_alloc_impl(hdr, private, - compressed_read, B_TRUE, &buf); + rc = arc_buf_alloc_impl(hdr, spa, zb, private, + encrypted_read, compressed_read, noauth_read, + B_TRUE, &buf); + if (rc == ECKSUM) { + /* + * Convert authentication and decryption errors + * to EIO (and generate an ereport if needed) + * before leaving the ARC. + */ + rc = SET_ERROR(EIO); + if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { + spa_log_error(spa, zb); + zfs_ereport_post( + FM_EREPORT_ZFS_AUTHENTICATION, + spa, NULL, zb, NULL, 0, 0); + } + } if (rc != 0) { - arc_buf_destroy(buf, private); + (void) remove_reference(hdr, hash_lock, + private); + arc_buf_destroy_impl(buf); buf = NULL; } + /* assert any errors weren't due to unloaded keys */ ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || - rc == 0 || rc != ENOENT); + rc != EACCES); } else if (*arc_flags & ARC_FLAG_PREFETCH && zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); @@ -5155,13 +6101,15 @@ top: uint64_t addr = 0; boolean_t devw = B_FALSE; uint64_t size; + abd_t *hdr_abd; if (hdr == NULL) { /* this block is not in the cache */ arc_buf_hdr_t *exists = NULL; arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, - BP_GET_COMPRESS(bp), type); + BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), type, + encrypted_read); if (!BP_IS_EMBEDDED(bp)) { hdr->b_dva = *BP_IDENTITY(bp); @@ -5177,25 +6125,43 @@ top: } } else { /* - * This block is in the ghost cache. If it was L2-only - * (and thus didn't have an L1 hdr), we realloc the - * header to add an L1 hdr. + * This block is in the ghost cache or encrypted data + * was requested and we didn't have it. If it was + * L2-only (and thus didn't have an L1 hdr), + * we realloc the header to add an L1 hdr. */ if (!HDR_HAS_L1HDR(hdr)) { hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, hdr_full_cache); } - ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); - ASSERT(GHOST_STATE(hdr->b_l1hdr.b_state)); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); - ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + + if (GHOST_STATE(hdr->b_l1hdr.b_state)) { + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT0(zfs_refcount_count( + &hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + } else if (HDR_IO_IN_PROGRESS(hdr)) { + /* + * If this header already had an IO in progress + * and we are performing another IO to fetch + * encrypted data we must wait until the first + * IO completes so as not to confuse + * arc_read_done(). This should be very rare + * and so the performance impact shouldn't + * matter. + */ + cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); + mutex_exit(hash_lock); + goto top; + } /* * This is a delicate dance that we play here. - * This hdr is in the ghost list so we access it - * to move it out of the ghost list before we + * This hdr might be in the ghost list so we access + * it to move it out of the ghost list before we * initiate the read. If it's a prefetch then * it won't have a callback so we'll remove the * reference that arc_buf_alloc_impl() created. We @@ -5203,28 +6169,44 @@ top: * avoid hitting an assert in remove_reference(). */ arc_access(hdr, hash_lock); - arc_hdr_alloc_pabd(hdr); + arc_hdr_alloc_pabd(hdr, encrypted_read); } - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); - size = arc_hdr_size(hdr); - /* - * If compression is enabled on the hdr, then will do - * RAW I/O and will store the compressed data in the hdr's - * data block. Otherwise, the hdr's data block will contain - * the uncompressed data. - */ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { + if (encrypted_read) { + ASSERT(HDR_HAS_RABD(hdr)); + size = HDR_GET_PSIZE(hdr); + hdr_abd = hdr->b_crypt_hdr.b_rabd; zio_flags |= ZIO_FLAG_RAW; + } else { + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + size = arc_hdr_size(hdr); + hdr_abd = hdr->b_l1hdr.b_pabd; + + if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { + zio_flags |= ZIO_FLAG_RAW_COMPRESS; + } + + /* + * For authenticated bp's, we do not ask the ZIO layer + * to authenticate them since this will cause the entire + * IO to fail if the key isn't loaded. Instead, we + * defer authentication until arc_buf_fill(), which will + * verify the data when the key is available. + */ + if (BP_IS_AUTHENTICATED(bp)) + zio_flags |= ZIO_FLAG_RAW_ENCRYPT; } - if (*arc_flags & ARC_FLAG_PREFETCH) + if (*arc_flags & ARC_FLAG_PREFETCH && + zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + if (BP_IS_AUTHENTICATED(bp)) + arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); if (BP_GET_LEVEL(bp) > 0) arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT); if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH) @@ -5235,6 +6217,9 @@ top: acb->acb_done = done; acb->acb_private = private; acb->acb_compressed = compressed_read; + acb->acb_encrypted = encrypted_read; + acb->acb_noauth = noauth_read; + acb->acb_zb = *zb; ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); hdr->b_l1hdr.b_acb = acb; @@ -5309,7 +6294,7 @@ top: HDR_ISTYPE_METADATA(hdr)); cb->l2rcb_abd = abd; } else { - abd = hdr->b_l1hdr.b_pabd; + abd = hdr_abd; } ASSERT(addr >= VDEV_LABEL_START_SIZE && @@ -5322,7 +6307,7 @@ top: * Issue a null zio if the underlying buffer * was squashed to zero size by compression. */ - ASSERT3U(HDR_GET_COMPRESS(hdr), !=, + ASSERT3U(arc_hdr_get_compress(hdr), !=, ZIO_COMPRESS_EMPTY); rzio = zio_read_phys(pio, vd, addr, asize, abd, @@ -5339,7 +6324,8 @@ top: DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); - ARCSTAT_INCR(arcstat_l2_read_bytes, size); + ARCSTAT_INCR(arcstat_l2_read_bytes, + HDR_GET_PSIZE(hdr)); if (*arc_flags & ARC_FLAG_NOWAIT) { zio_nowait(rzio); @@ -5371,7 +6357,7 @@ top: } } - rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, + rzio = zio_read(pio, spa, bp, hdr_abd, size, arc_read_done, hdr, priority, zio_flags, zb); acb->acb_zio_head = rzio; @@ -5384,7 +6370,7 @@ top: ASSERT(*arc_flags & ARC_FLAG_NOWAIT); zio_nowait(rzio); } - return (0); + return (rc); } /* @@ -5448,7 +6434,7 @@ arc_release(arc_buf_t *buf, void *tag) arc_buf_hdr_t *hdr = buf->b_hdr; /* - * It would be nice to assert that if it's DMU metadata (level > + * It would be nice to assert that if its DMU metadata (level > * 0 || it's the dnode file), then it must be syncing context. * But we don't know that information at this level. */ @@ -5464,7 +6450,13 @@ arc_release(arc_buf_t *buf, void *tag) */ if (hdr->b_l1hdr.b_state == arc_anon) { mutex_exit(&buf->b_evict_lock); - ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + /* + * If we are called from dmu_convert_mdn_block_to_raw(), + * a write might be in progress. This is OK because + * the caller won't change the content of this buffer, + * only the flags (via arc_convert_to_raw()). + */ + /* ASSERT(!HDR_IO_IN_PROGRESS(hdr)); */ ASSERT(!HDR_IN_HASH_TABLE(hdr)); ASSERT(!HDR_HAS_L2HDR(hdr)); ASSERT(HDR_EMPTY(hdr)); @@ -5525,7 +6517,8 @@ arc_release(arc_buf_t *buf, void *tag) uint64_t spa = hdr->b_spa; uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t lsize = HDR_GET_LSIZE(hdr); - enum zio_compress compress = HDR_GET_COMPRESS(hdr); + boolean_t protected = HDR_PROTECTED(hdr); + enum zio_compress compress = arc_hdr_get_compress(hdr); arc_buf_contents_t type = arc_buf_type(hdr); VERIFY3U(hdr->b_type, ==, type); @@ -5550,6 +6543,7 @@ arc_release(arc_buf_t *buf, void *tag) * buffer, then we must stop sharing that block. */ if (arc_buf_is_shared(buf)) { + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); VERIFY(!arc_buf_is_shared(lastbuf)); /* @@ -5567,7 +6561,7 @@ arc_release(arc_buf_t *buf, void *tag) if (arc_can_share(hdr, lastbuf)) { arc_share_buf(hdr, lastbuf); } else { - arc_hdr_alloc_pabd(hdr); + arc_hdr_alloc_pabd(hdr, B_FALSE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, psize); } @@ -5582,10 +6576,10 @@ arc_release(arc_buf_t *buf, void *tag) * if we have a compressed, shared buffer. */ ASSERT(arc_buf_is_shared(lastbuf) || - HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF); + arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); ASSERT(!ARC_BUF_SHARED(buf)); } - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); ASSERT3P(state, !=, arc_l2c_only); (void) zfs_refcount_remove_many(&state->arcs_size, @@ -5599,16 +6593,24 @@ arc_release(arc_buf_t *buf, void *tag) } hdr->b_l1hdr.b_bufcnt -= 1; + if (ARC_BUF_ENCRYPTED(buf)) + hdr->b_crypt_hdr.b_ebufcnt -= 1; + arc_cksum_verify(buf); arc_buf_unwatch(buf); + /* if this is the last uncompressed buf free the checksum */ + if (!arc_hdr_has_uncompressed_buf(hdr)) + arc_cksum_free(hdr); + mutex_exit(hash_lock); /* * Allocate a new hdr. The new hdr will contain a b_pabd * buffer which will be freed in arc_write(). */ - nhdr = arc_hdr_alloc(spa, psize, lsize, compress, type); + nhdr = arc_hdr_alloc(spa, psize, lsize, protected, + compress, type, HDR_HAS_RABD(hdr)); ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL); ASSERT0(nhdr->b_l1hdr.b_bufcnt); ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt)); @@ -5617,6 +6619,8 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_l1hdr.b_buf = buf; nhdr->b_l1hdr.b_bufcnt = 1; + if (ARC_BUF_ENCRYPTED(buf)) + nhdr->b_crypt_hdr.b_ebufcnt = 1; (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag); buf->b_hdr = nhdr; @@ -5631,8 +6635,8 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_change_state(arc_anon, hdr, hash_lock); hdr->b_l1hdr.b_arc_access = 0; - mutex_exit(hash_lock); + mutex_exit(hash_lock); buf_discard_identity(hdr); arc_buf_thaw(buf); } @@ -5669,7 +6673,8 @@ arc_write_ready(zio_t *zio) arc_write_callback_t *callback = zio->io_private; arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - uint64_t psize = BP_IS_HOLE(zio->io_bp) ? 0 : BP_GET_PSIZE(zio->io_bp); + blkptr_t *bp = zio->io_bp; + uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp); ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt)); @@ -5687,11 +6692,15 @@ arc_write_ready(zio_t *zio) if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { - arc_hdr_free_pabd(hdr); + arc_hdr_free_pabd(hdr, B_FALSE); } } + + if (HDR_HAS_RABD(hdr)) + arc_hdr_free_pabd(hdr, B_TRUE); } ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); ASSERT(!HDR_SHARED_DATA(hdr)); ASSERT(!arc_buf_is_shared(buf)); @@ -5700,23 +6709,69 @@ arc_write_ready(zio_t *zio) if (HDR_IO_IN_PROGRESS(hdr)) ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED); - arc_cksum_compute(buf); arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr)) + hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp)); + + if (BP_IS_PROTECTED(bp)) { + /* ZIL blocks are written through zio_rewrite */ + ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); + ASSERT(HDR_PROTECTED(hdr)); + + if (BP_SHOULD_BYTESWAP(bp)) { + if (BP_GET_LEVEL(bp) > 0) { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; + } else { + hdr->b_l1hdr.b_byteswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); + } + } else { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + } + + hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); + hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; + zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, + hdr->b_crypt_hdr.b_iv); + zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); + } + + /* + * If this block was written for raw encryption but the zio layer + * ended up only authenticating it, adjust the buffer flags now. + */ + if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) { + arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH); + buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; + if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF) + buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; + } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) { + buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; + buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; + } + + /* this must be done after the buffer flags are adjusted */ + arc_cksum_compute(buf); + enum zio_compress compress; - if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { compress = ZIO_COMPRESS_OFF; } else { - ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(zio->io_bp)); - compress = BP_GET_COMPRESS(zio->io_bp); + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); + compress = BP_GET_COMPRESS(bp); } HDR_SET_PSIZE(hdr, psize); arc_hdr_set_compress(hdr, compress); + if (zio->io_error != 0 || psize == 0) + goto out; /* - * Fill the hdr with data. If the hdr is compressed, the data we want - * is available from the zio, otherwise we can take it from the buf. + * Fill the hdr with data. If the buffer is encrypted we have no choice + * but to copy the data into b_rabd. If the hdr is compressed, the data + * we want is available from the zio, otherwise we can take it from + * the buf. * * We might be able to share the buf's data with the hdr here. However, * doing so would cause the ARC to be full of linear ABDs if we write a @@ -5726,23 +6781,29 @@ arc_write_ready(zio_t *zio) * written. Therefore, if they're allowed then we allocate one and copy * the data into it; otherwise, we share the data directly if we can. */ - if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { - arc_hdr_alloc_pabd(hdr); - + if (ARC_BUF_ENCRYPTED(buf)) { + ASSERT3U(psize, >, 0); + ASSERT(ARC_BUF_COMPRESSED(buf)); + arc_hdr_alloc_pabd(hdr, B_TRUE); + abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); + } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { /* * Ideally, we would always copy the io_abd into b_pabd, but the * user may have disabled compressed ARC, thus we must check the * hdr's compression setting rather than the io_bp's. */ - if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF) { - ASSERT3U(BP_GET_COMPRESS(zio->io_bp), !=, - ZIO_COMPRESS_OFF); + if (BP_IS_ENCRYPTED(bp)) { ASSERT3U(psize, >, 0); - + arc_hdr_alloc_pabd(hdr, B_TRUE); + abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); + } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && + !ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(psize, >, 0); + arc_hdr_alloc_pabd(hdr, B_FALSE); abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); } else { ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); - + arc_hdr_alloc_pabd(hdr, B_FALSE); abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, arc_buf_size(buf)); } @@ -5750,11 +6811,11 @@ arc_write_ready(zio_t *zio) ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); - arc_share_buf(hdr, buf); } - arc_hdr_verify(hdr, zio->io_bp); +out: + arc_hdr_verify(hdr, bp); } static void @@ -5882,17 +6943,33 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); if (l2arc) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); - if (ARC_BUF_COMPRESSED(buf)) { - /* - * We're writing a pre-compressed buffer. Make the - * compression algorithm requested by the zio_prop_t match - * the pre-compressed buffer's compression algorithm. - */ - localprop.zp_compress = HDR_GET_COMPRESS(hdr); - ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); + if (ARC_BUF_ENCRYPTED(buf)) { + ASSERT(ARC_BUF_COMPRESSED(buf)); + localprop.zp_encrypt = B_TRUE; + localprop.zp_compress = HDR_GET_COMPRESS(hdr); + /* CONSTCOND */ + localprop.zp_byteorder = + (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? + ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; + bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt, + ZIO_DATA_SALT_LEN); + bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv, + ZIO_DATA_IV_LEN); + bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac, + ZIO_DATA_MAC_LEN); + if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { + localprop.zp_nopwrite = B_FALSE; + localprop.zp_copies = + MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); + } zio_flags |= ZIO_FLAG_RAW; + } else if (ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); + localprop.zp_compress = HDR_GET_COMPRESS(hdr); + zio_flags |= ZIO_FLAG_RAW_COMPRESS; } + callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); callback->awcb_ready = ready; callback->awcb_children_ready = children_ready; @@ -5915,11 +6992,17 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, if (arc_buf_is_shared(buf)) { arc_unshare_buf(hdr, buf); } else { - arc_hdr_free_pabd(hdr); + arc_hdr_free_pabd(hdr, B_FALSE); } VERIFY3P(buf->b_data, !=, NULL); - arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); } + + if (HDR_HAS_RABD(hdr)) + arc_hdr_free_pabd(hdr, B_TRUE); + + if (!(zio_flags & ZIO_FLAG_RAW)) + arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF); + ASSERT(!arc_buf_is_shared(buf)); ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); @@ -6123,8 +7206,8 @@ arc_state_multilist_index_func(multilist_t *ml, void *obj) /* * The assumption here, is the hash value for a given - * arc_buf_hdr_t will remain constant throughout it's lifetime - * (i.e. it's b_spa, b_dva, and b_birth fields don't change). + * arc_buf_hdr_t will remain constant throughout its lifetime + * (i.e. its b_spa, b_dva, and b_birth fields don't change). * Thus, we don't need to store the header's sublist index * on insertion, as this index can be recalculated on removal. * @@ -6248,6 +7331,8 @@ arc_state_fini(void) multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]); aggsum_fini(&arc_meta_used); aggsum_fini(&arc_size); @@ -6256,6 +7341,7 @@ arc_state_fini(void) aggsum_fini(&astat_hdr_size); aggsum_fini(&astat_other_size); aggsum_fini(&astat_l2_hdr_size); + } uint64_t @@ -6843,6 +7929,96 @@ top: kmem_free(cb, sizeof (l2arc_write_callback_t)); } +static int +l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb) +{ + int ret; + spa_t *spa = zio->io_spa; + arc_buf_hdr_t *hdr = cb->l2rcb_hdr; + blkptr_t *bp = zio->io_bp; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + boolean_t no_crypt = B_FALSE; + + /* + * ZIL data is never be written to the L2ARC, so we don't need + * special handling for its unique MAC storage. + */ + ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG); + ASSERT(MUTEX_HELD(HDR_LOCK(hdr))); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + + /* + * If the data was encrypted, decrypt it now. Note that + * we must check the bp here and not the hdr, since the + * hdr does not have its encryption parameters updated + * until arc_read_done(). + */ + if (BP_IS_ENCRYPTED(bp)) { + abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); + + zio_crypt_decode_params_bp(bp, salt, iv); + zio_crypt_decode_mac_bp(bp, mac); + + ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb, + BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), + salt, iv, mac, HDR_GET_PSIZE(hdr), eabd, + hdr->b_l1hdr.b_pabd, &no_crypt); + if (ret != 0) { + arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); + goto error; + } + + /* + * If we actually performed decryption, replace b_pabd + * with the decrypted data. Otherwise we can just throw + * our decryption buffer away. + */ + if (!no_crypt) { + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, + arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_pabd = eabd; + zio->io_abd = eabd; + } else { + arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); + } + } + + /* + * If the L2ARC block was compressed, but ARC compression + * is disabled we decompress the data into a new buffer and + * replace the existing data. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr)) { + abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr); + void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); + + ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr)); + if (ret != 0) { + abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); + arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); + goto error; + } + + abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, + arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_pabd = cabd; + zio->io_abd = cabd; + zio->io_size = HDR_GET_LSIZE(hdr); + } + + return (0); + +error: + return (ret); +} + + /* * A read to a cache device completed. Validate buffer contents before * handing over to the regular ARC routines. @@ -6850,17 +8026,19 @@ top: static void l2arc_read_done(zio_t *zio) { - l2arc_read_callback_t *cb; + int tfm_error = 0; + l2arc_read_callback_t *cb = zio->io_private; arc_buf_hdr_t *hdr; kmutex_t *hash_lock; boolean_t valid_cksum; + boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) && + (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT)); ASSERT3P(zio->io_vd, !=, NULL); ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); - cb = zio->io_private; ASSERT3P(cb, !=, NULL); hdr = cb->l2rcb_hdr; ASSERT3P(hdr, !=, NULL); @@ -6876,8 +8054,13 @@ l2arc_read_done(zio_t *zio) if (cb->l2rcb_abd != NULL) { ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); if (zio->io_error == 0) { - abd_copy(hdr->b_l1hdr.b_pabd, cb->l2rcb_abd, - arc_hdr_size(hdr)); + if (using_rdata) { + abd_copy(hdr->b_crypt_hdr.b_rabd, + cb->l2rcb_abd, arc_hdr_size(hdr)); + } else { + abd_copy(hdr->b_l1hdr.b_pabd, + cb->l2rcb_abd, arc_hdr_size(hdr)); + } } /* @@ -6893,7 +8076,15 @@ l2arc_read_done(zio_t *zio) */ abd_free(cb->l2rcb_abd); zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); - zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; + + if (using_rdata) { + ASSERT(HDR_HAS_RABD(hdr)); + zio->io_abd = zio->io_orig_abd = + hdr->b_crypt_hdr.b_rabd; + } else { + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; + } } ASSERT3P(zio->io_abd, !=, NULL); @@ -6901,12 +8092,23 @@ l2arc_read_done(zio_t *zio) /* * Check this survived the L2ARC journey. */ - ASSERT3P(zio->io_abd, ==, hdr->b_l1hdr.b_pabd); + ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || + (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ valid_cksum = arc_cksum_is_equal(hdr, zio); - if (valid_cksum && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + + /* + * b_rabd will always match the data as it exists on disk if it is + * being used. Therefore if we are reading into b_rabd we do not + * attempt to untransform the data. + */ + if (valid_cksum && !using_rdata) + tfm_error = l2arc_untransform(zio, cb); + + if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && + !HDR_L2_EVICTED(hdr)) { mutex_exit(hash_lock); zio->io_private = hdr; arc_read_done(zio); @@ -6921,7 +8123,7 @@ l2arc_read_done(zio_t *zio) } else { zio->io_error = SET_ERROR(EIO); } - if (!valid_cksum) + if (!valid_cksum || tfm_error != 0) ARCSTAT_BUMP(arcstat_l2_cksum_bad); /* @@ -6931,11 +8133,13 @@ l2arc_read_done(zio_t *zio) */ if (zio->io_waiter == NULL) { zio_t *pio = zio_unique_parent(zio); + void *abd = (using_rdata) ? + hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); zio_nowait(zio_read(pio, zio->io_spa, zio->io_bp, - hdr->b_l1hdr.b_pabd, zio->io_size, arc_read_done, + abd, zio->io_size, arc_read_done, hdr, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); } @@ -7095,6 +8299,123 @@ top: } /* + * Handle any abd transforms that might be required for writing to the L2ARC. + * If successful, this function will always return an abd with the data + * transformed as it is on disk in a new abd of asize bytes. + */ +static int +l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, + abd_t **abd_out) +{ + int ret; + void *tmp = NULL; + abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; + enum zio_compress compress = HDR_GET_COMPRESS(hdr); + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t size = arc_hdr_size(hdr); + boolean_t ismd = HDR_ISTYPE_METADATA(hdr); + boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); + dsl_crypto_key_t *dck = NULL; + uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 }; + boolean_t no_crypt = B_FALSE; + + ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr)) || + HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize); + ASSERT3U(psize, <=, asize); + + /* + * If this data simply needs its own buffer, we simply allocate it + * and copy the data. This may be done to eliminate a dependency on a + * shared buffer or to reallocate the buffer to match asize. + */ + if (HDR_HAS_RABD(hdr) && asize != psize) { + ASSERT3U(asize, >=, psize); + to_write = abd_alloc_for_io(asize, ismd); + abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); + if (psize != asize) + abd_zero_off(to_write, psize, asize - psize); + goto out; + } + + if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) && + !HDR_ENCRYPTED(hdr)) { + ASSERT3U(size, ==, psize); + to_write = abd_alloc_for_io(asize, ismd); + abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); + if (size != asize) + abd_zero_off(to_write, size, asize - size); + goto out; + } + + if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { + cabd = abd_alloc_for_io(asize, ismd); + tmp = abd_borrow_buf(cabd, asize); + + psize = zio_compress_data(compress, to_write, tmp, size); + ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); + if (psize < asize) + bzero((char *)tmp + psize, asize - psize); + psize = HDR_GET_PSIZE(hdr); + abd_return_buf_copy(cabd, tmp, asize); + to_write = cabd; + } + + if (HDR_ENCRYPTED(hdr)) { + eabd = abd_alloc_for_io(asize, ismd); + + /* + * If the dataset was disowned before the buffer + * made it to this point, the key to re-encrypt + * it won't be available. In this case we simply + * won't write the buffer to the L2ARC. + */ + ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj, + FTAG, &dck); + if (ret != 0) + goto error; + + ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key, + hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt, + hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd, + &no_crypt); + if (ret != 0) + goto error; + + if (no_crypt) + abd_copy(eabd, to_write, psize); + + if (psize != asize) + abd_zero_off(eabd, psize, asize - psize); + + /* assert that the MAC we got here matches the one we saved */ + ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); + spa_keystore_dsl_key_rele(spa, dck, FTAG); + + if (to_write == cabd) + abd_free(cabd); + + to_write = eabd; + } + +out: + ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd); + *abd_out = to_write; + return (0); + +error: + if (dck != NULL) + spa_keystore_dsl_key_rele(spa, dck, FTAG); + if (cabd != NULL) + abd_free(cabd); + if (eabd != NULL) + abd_free(eabd); + + *abd_out = NULL; + return (ret); +} + +/* * Find and write ARC buffers to the L2ARC device. * * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid @@ -7130,6 +8451,8 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) multilist_sublist_t *mls = l2arc_sublist_lock(try); uint64_t passed_sz = 0; + VERIFY3P(mls, !=, NULL); + /* * L2ARC fast warmup. * @@ -7147,6 +8470,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) for (; hdr; hdr = hdr_prev) { kmutex_t *hash_lock; + abd_t *to_write = NULL; if (arc_warm == B_FALSE) hdr_prev = multilist_sublist_next(mls, hdr); @@ -7184,9 +8508,10 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) ASSERT(HDR_HAS_L1HDR(hdr)); ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); - ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); ASSERT3U(arc_hdr_size(hdr), >, 0); - uint64_t psize = arc_hdr_size(hdr); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || + HDR_HAS_RABD(hdr)); + uint64_t psize = HDR_GET_PSIZE(hdr); uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); @@ -7196,6 +8521,57 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } + /* + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. + */ + arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); + ASSERT(HDR_HAS_L1HDR(hdr)); + + ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || + HDR_HAS_RABD(hdr)); + ASSERT3U(arc_hdr_size(hdr), >, 0); + + /* + * If this header has b_rabd, we can use this since it + * must always match the data exactly as it exists on + * disk. Otherwise, the L2ARC can normally use the + * hdr's data, but if we're sharing data between the + * hdr and one of its bufs, L2ARC needs its own copy of + * the data so that the ZIO below can't race with the + * buf consumer. To ensure that this copy will be + * available for the lifetime of the ZIO and be cleaned + * up afterwards, we add it to the l2arc_free_on_write + * queue. If we need to apply any transforms to the + * data (compression, encryption) we will also need the + * extra buffer. + */ + if (HDR_HAS_RABD(hdr) && psize == asize) { + to_write = hdr->b_crypt_hdr.b_rabd; + } else if ((HDR_COMPRESSION_ENABLED(hdr) || + HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && + !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && + psize == asize) { + to_write = hdr->b_l1hdr.b_pabd; + } else { + int ret; + arc_buf_contents_t type = arc_buf_type(hdr); + + ret = l2arc_apply_transforms(spa, hdr, asize, + &to_write); + if (ret != 0) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_L2_WRITING); + mutex_exit(hash_lock); + continue; + } + + l2arc_free_abd_on_write(to_write, asize, type); + } + if (pio == NULL) { /* * Insert a dummy header on the buflist so @@ -7223,37 +8599,9 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) list_insert_head(&dev->l2ad_buflist, hdr); mutex_exit(&dev->l2ad_mtx); - (void) zfs_refcount_add_many(&dev->l2ad_alloc, psize, - hdr); + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); - /* - * Normally the L2ARC can use the hdr's data, but if - * we're sharing data between the hdr and one of its - * bufs, L2ARC needs its own copy of the data so that - * the ZIO below can't race with the buf consumer. - * Another case where we need to create a copy of the - * data is when the buffer size is not device-aligned - * and we need to pad the block to make it such. - * That also keeps the clock hand suitably aligned. - * - * To ensure that the copy will be available for the - * lifetime of the ZIO and be cleaned up afterwards, we - * add it to the l2arc_free_on_write queue. - */ - abd_t *to_write; - if (!HDR_SHARED_DATA(hdr) && psize == asize) { - to_write = hdr->b_l1hdr.b_pabd; - } else { - to_write = abd_alloc_for_io(asize, - HDR_ISTYPE_METADATA(hdr)); - abd_copy(to_write, hdr->b_l1hdr.b_pabd, psize); - if (asize != psize) { - abd_zero_off(to_write, psize, - asize - psize); - } - l2arc_free_abd_on_write(to_write, asize, - arc_buf_type(hdr)); - } wzio = zio_write_phys(pio, dev->l2ad_vdev, hdr->b_l2hdr.b_daddr, asize, to_write, ZIO_CHECKSUM_OFF, NULL, hdr, diff --git a/usr/src/uts/common/fs/zfs/bpobj.c b/usr/src/uts/common/fs/zfs/bpobj.c index bbdd765214..ec0d115cfc 100644 --- a/usr/src/uts/common/fs/zfs/bpobj.c +++ b/usr/src/uts/common/fs/zfs/bpobj.c @@ -266,7 +266,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, } if (free) { VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object, - (i + 1) * sizeof (blkptr_t), -1ULL, tx)); + (i + 1) * sizeof (blkptr_t), DMU_OBJECT_END, tx)); } if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0) goto out; @@ -344,7 +344,7 @@ bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx, if (free) { VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, - (i + 1) * sizeof (uint64_t), -1ULL, tx)); + (i + 1) * sizeof (uint64_t), DMU_OBJECT_END, tx)); } out: diff --git a/usr/src/uts/common/fs/zfs/bptree.c b/usr/src/uts/common/fs/zfs/bptree.c index c74d07236c..1a432507f7 100644 --- a/usr/src/uts/common/fs/zfs/bptree.c +++ b/usr/src/uts/common/fs/zfs/bptree.c @@ -211,7 +211,8 @@ bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, err = 0; for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { bptree_entry_phys_t bte; - int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; + int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST + | TRAVERSE_NO_DECRYPT; err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), &bte, DMU_READ_NO_PREFETCH); diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index b9d6ca26fe..9c7205bd0d 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -157,7 +157,7 @@ uint64_t dbuf_metadata_cache_overflow; * cache size). Once the eviction thread is woken up and eviction is required, * it will continue evicting buffers until it's able to reduce the cache size * to the low water mark. If the cache size continues to grow and hits the high - * water mark, then callers adding elments to the cache will begin to evict + * water mark, then callers adding elements to the cache will begin to evict * directly from the cache until the cache is no longer above the high water * mark. */ @@ -310,7 +310,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db) dmu_buf_impl_t *dbf, **dbp; /* - * We musn't hold db_mtx to maintain lock ordering: + * We mustn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(zfs_refcount_is_zero(&db->db_holds)); @@ -413,7 +413,7 @@ dbuf_evict_user(dmu_buf_impl_t *db) boolean_t dbuf_is_metadata(dmu_buf_impl_t *db) { - if (db->db_level > 0) { + if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) { return (B_TRUE); } else { boolean_t is_metadata; @@ -941,6 +941,7 @@ dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) } } +/* ARGSUSED */ static void dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, arc_buf_t *buf, void *vdb) @@ -984,12 +985,71 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, dbuf_rele_and_unlock(db, NULL, B_FALSE); } -static void + +/* + * This function ensures that, when doing a decrypting read of a block, + * we make sure we have decrypted the dnode associated with it. We must do + * this so that we ensure we are fully authenticating the checksum-of-MACs + * tree from the root of the objset down to this block. Indirect blocks are + * always verified against their secure checksum-of-MACs assuming that the + * dnode containing them is correct. Now that we are doing a decrypting read, + * we can be sure that the key is loaded and verify that assumption. This is + * especially important considering that we always read encrypted dnode + * blocks as raw data (without verifying their MACs) to start, and + * decrypt / authenticate them when we need to read an encrypted bonus buffer. + */ +static int +dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) +{ + int err = 0; + objset_t *os = db->db_objset; + arc_buf_t *dnode_abuf; + dnode_t *dn; + zbookmark_phys_t zb; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (!os->os_encrypted || os->os_raw_receive || + (flags & DB_RF_NO_DECRYPT) != 0) + return (0); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL; + + if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) { + DB_DNODE_EXIT(db); + return (0); + } + + SET_BOOKMARK(&zb, dmu_objset_id(os), + DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid); + err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE); + + /* + * An error code of EACCES tells us that the key is still not + * available. This is ok if we are only reading authenticated + * (and therefore non-encrypted) blocks. + */ + if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID && + !DMU_OT_IS_ENCRYPTED(dn->dn_type)) || + (db->db_blkid == DMU_BONUS_BLKID && + !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)))) + err = 0; + + + DB_DNODE_EXIT(db); + + return (err); +} + +static int dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { dnode_t *dn; zbookmark_phys_t zb; arc_flags_t aflags = ARC_FLAG_NOWAIT; + int err, zio_flags = 0; DB_DNODE_ENTER(db); dn = DB_DNODE(db); @@ -1008,6 +1068,14 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + /* if the underlying dnode block is encrypted, decrypt it */ + err = dbuf_read_verify_dnode_crypt(db, flags); + if (err != 0) { + DB_DNODE_EXIT(db); + mutex_exit(&db->db_mtx); + return (err); + } + ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(max_bonuslen); arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); @@ -1018,7 +1086,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); - return; + return (0); } /* @@ -1058,7 +1126,30 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); - return; + return (0); + } + + SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), + db->db.db_object, db->db_level, db->db_blkid); + + /* + * All bps of an encrypted os should have the encryption bit set. + * If this is not true it indicates tampering and we report an error. + */ + if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) { + spa_log_error(db->db_objset->os_spa, &zb); + zfs_panic_recover("unencrypted block in encrypted " + "object set %llu", dmu_objset_id(db->db_objset)); + DB_DNODE_EXIT(db); + mutex_exit(&db->db_mtx); + return (SET_ERROR(EIO)); + } + + err = dbuf_read_verify_dnode_crypt(db, flags); + if (err != 0) { + DB_DNODE_EXIT(db); + mutex_exit(&db->db_mtx); + return (err); } DB_DNODE_EXIT(db); @@ -1069,16 +1160,19 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; - SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? - db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, - db->db.db_object, db->db_level, db->db_blkid); - dbuf_add_ref(db, NULL); - (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, - dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, - (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, + zio_flags = (flags & DB_RF_CANFAIL) ? + ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; + + if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) + zio_flags |= ZIO_FLAG_RAW; + + err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr, + dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + + return (err); } /* @@ -1116,7 +1210,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) * or (if there a no active holders) * just null out the current db_data pointer. */ - ASSERT(dr->dr_txg >= txg - 2); + ASSERT3U(dr->dr_txg, >=, txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dnode_t *dn = DB_DNODE(db); @@ -1125,18 +1219,31 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) arc_space_consume(bonuslen, ARC_SPACE_BONUS); bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) { + dnode_t *dn = DB_DNODE(db); int size = arc_buf_size(db->db_buf); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; enum zio_compress compress_type = arc_get_compression(db->db_buf); - if (compress_type == ZIO_COMPRESS_OFF) { - dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); - } else { + if (arc_is_encrypted(db->db_buf)) { + boolean_t byteorder; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + + arc_get_raw_params(db->db_buf, &byteorder, salt, + iv, mac); + dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db, + dmu_objset_id(dn->dn_objset), byteorder, salt, iv, + mac, dn->dn_type, size, arc_buf_lsize(db->db_buf), + compress_type); + } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db, size, arc_buf_lsize(db->db_buf), compress_type); + } else { + dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size); } bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { @@ -1172,20 +1279,36 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { + spa_t *spa = dn->dn_objset->os_spa; + /* - * If the arc buf is compressed, we need to decompress it to - * read the data. This could happen during the "zfs receive" of - * a stream which is compressed and deduplicated. + * Ensure that this block's dnode has been decrypted if + * the caller has requested decrypted data. */ - if (db->db_buf != NULL && - arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) { - dbuf_fix_old_data(db, - spa_syncing_txg(dmu_objset_spa(db->db_objset))); - err = arc_decompress(db->db_buf); + err = dbuf_read_verify_dnode_crypt(db, flags); + + /* + * If the arc buf is compressed or encrypted and the caller + * requested uncompressed data, we need to untransform it + * before returning. We also call arc_untransform() on any + * unauthenticated blocks, which will verify their MAC if + * the key is now available. + */ + if (err == 0 && db->db_buf != NULL && + (flags & DB_RF_NO_DECRYPT) == 0 && + (arc_is_encrypted(db->db_buf) || + arc_is_unauthenticated(db->db_buf) || + arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), + db->db.db_object, db->db_level, db->db_blkid); + dbuf_fix_old_data(db, spa_syncing_txg(spa)); + err = arc_untransform(db->db_buf, spa, &zb, B_FALSE); dbuf_set_data(db, db->db_buf); } mutex_exit(&db->db_mtx); - if (prefetch) + if (err == 0 && prefetch) dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); @@ -1199,18 +1322,18 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); need_wait = B_TRUE; } - dbuf_read_impl(db, zio, flags); + err = dbuf_read_impl(db, zio, flags); /* dbuf_read_impl has dropped db_mtx for us */ - if (prefetch) + if (!err && prefetch) dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); - if (need_wait) + if (!err && need_wait) err = zio_wait(zio); } else { /* @@ -1300,6 +1423,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; + dr->dt.dl.dr_has_raw_params = B_FALSE; /* * Release the already-written buffer, so we leave it in @@ -1744,7 +1868,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ddt_prefetch(os->os_spa, db->db_blkptr); if (db->db_level == 0) { - dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); + ASSERT(!db->db_objset->os_raw_receive || + dn->dn_maxblkid >= db->db_blkid); + dnode_new_blkid(dn, db->db_blkid, tx, + drop_struct_lock, B_FALSE); ASSERT(dn->dn_maxblkid >= db->db_blkid); } @@ -1891,11 +2018,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (B_FALSE); } -void -dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) +static void +dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); ASSERT(!zfs_refcount_is_zero(&db->db_holds)); @@ -1926,13 +2052,20 @@ dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) - rf |= DB_RF_HAVESTRUCT; + flags |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); - (void) dbuf_read(db, NULL, rf); + (void) dbuf_read(db, NULL, flags); (void) dbuf_dirty(db, tx); } void +dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_will_dirty_impl(db_fake, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx); +} + +void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; @@ -1959,6 +2092,44 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) (void) dbuf_dirty(db, tx); } +/* + * This function is effectively the same as dmu_buf_will_dirty(), but + * indicates the caller expects raw encrypted data in the db, and provides + * the crypt params (byteorder, salt, iv, mac) which should be stored in the + * blkptr_t when this dbuf is written. This is only used for blocks of + * dnodes during a raw receive. + */ +void +dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, + const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dbuf_dirty_record_t *dr; + + /* + * dr_has_raw_params is only processed for blocks of dnodes + * (see dbuf_sync_dnode_leaf_crypt()). + */ + ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); + ASSERT3U(db->db_level, ==, 0); + + dmu_buf_will_dirty_impl(db_fake, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx); + + dr = db->db_last_dirty; + while (dr != NULL && dr->dr_txg > tx->tx_txg) + dr = dr->dr_next; + + ASSERT3P(dr, !=, NULL); + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + + dr->dt.dl.dr_has_raw_params = B_TRUE; + dr->dt.dl.dr_byteorder = byteorder; + bcopy(salt, dr->dt.dl.dr_salt, ZIO_DATA_SALT_LEN); + bcopy(iv, dr->dt.dl.dr_iv, ZIO_DATA_IV_LEN); + bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN); +} + #pragma weak dmu_buf_fill_done = dbuf_fill_done /* ARGSUSED */ void @@ -2045,6 +2216,13 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) if (db->db_state == DB_CACHED && zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { + /* + * In practice, we will never have a case where we have an + * encrypted arc buffer while additional holds exist on the + * dbuf. We don't handle this here so we simply assert that + * fact instead. + */ + ASSERT(!arc_is_encrypted(buf)); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); @@ -2060,6 +2238,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { ASSERT(dr->dt.dl.dr_data == db->db_buf); + if (!arc_released(db->db_buf)) { ASSERT(dr->dt.dl.dr_override_state == DR_OVERRIDDEN); @@ -2383,15 +2562,20 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; arc_flags_t aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + /* dnodes are always read as raw and then converted later */ + if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) && + dpa->dpa_curlevel == 0) + zio_flags |= ZIO_FLAG_RAW; + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); ASSERT(dpa->dpa_zio != NULL); (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, - dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, - &aflags, &dpa->dpa_zb); + dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb); } /* @@ -2399,6 +2583,7 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) * will either read in the next indirect block down the tree or issue the actual * prefetch if the next block down is our target. */ +/* ARGSUSED */ static void dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *iobp, arc_buf_t *abuf, void *private) @@ -2428,7 +2613,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, */ if (zio != NULL) { ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); - if (zio->io_flags & ZIO_FLAG_RAW) { + if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) { ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); } else { ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); @@ -2485,7 +2670,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, * Issue prefetch reads for the given block on the given level. If the indirect * blocks above that block are not in memory, we will read them in * asynchronously. As a result, this call never blocks waiting for a read to - * complete. + * complete. Note that the prefetch might fail if the dataset is encrypted and + * the encryption key is unmapped before the IO completes. */ void dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, @@ -2614,6 +2800,43 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, } /* + * Helper function for __dbuf_hold_impl() to copy a buffer. Handles + * the case of encrypted, compressed and uncompressed buffers by + * allocating the new buffer, respectively, with arc_alloc_raw_buf(), + * arc_alloc_compressed_buf() or arc_alloc_buf().* + * + * NOTE: Declared noinline to avoid stack bloat in __dbuf_hold_impl(). + */ +static void +dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db, dbuf_dirty_record_t *dr) +{ + arc_buf_t *data = dr->dt.dl.dr_data; + enum zio_compress compress_type = arc_get_compression(data); + + if (arc_is_encrypted(data)) { + boolean_t byteorder; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + + arc_get_raw_params(data, &byteorder, salt, iv, mac); + dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db, + dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac, + dn->dn_type, arc_buf_size(data), arc_buf_lsize(data), + compress_type)); + } else if (compress_type != ZIO_COMPRESS_OFF) { + dbuf_set_data(db, arc_alloc_compressed_buf( + dn->dn_objset->os_spa, db, arc_buf_size(data), + arc_buf_lsize(data), compress_type)); + } else { + dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db, + DBUF_GET_BUFC_TYPE(db), db->db.db_size)); + } + + bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); +} + +/* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. */ @@ -2677,16 +2900,8 @@ top: dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; - - if (dr->dt.dl.dr_data == db->db_buf) { - arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - - dbuf_set_data(db, - arc_alloc_buf(dn->dn_objset->os_spa, db, type, - db->db.db_size)); - bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, - db->db.db_size); - } + if (dr->dt.dl.dr_data == db->db_buf) + dbuf_hold_copy(dn, db, dr); } if (multilist_link_active(&db->db_cache_link)) { @@ -2960,6 +3175,20 @@ dbuf_refcount(dmu_buf_impl_t *db) return (zfs_refcount_count(&db->db_holds)); } +uint64_t +dmu_buf_user_refcount(dmu_buf_t *db_fake) +{ + uint64_t holds; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + mutex_enter(&db->db_mtx); + ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt); + holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt; + mutex_exit(&db->db_mtx); + + return (holds); +} + void * dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user) @@ -3088,6 +3317,50 @@ dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) } } +/* + * When syncing out blocks of dnodes, adjust the block to deal with + * encryption. Normally, we make sure the block is decrypted before writing + * it. If we have crypt params, then we are writing a raw (encrypted) block, + * from a raw receive. In this case, set the ARC buf's crypt params so + * that the BP will be filled with the correct byteorder, salt, iv, and mac. + * + * XXX we should handle decrypting the dnode block in dbuf_dirty(). + */ +static void +dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr) +{ + int err; + dmu_buf_impl_t *db = dr->dr_dbuf; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); + ASSERT3U(db->db_level, ==, 0); + + if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) { + zbookmark_phys_t zb; + + /* + * Unfortunately, there is currently no mechanism for + * syncing context to handle decryption errors. An error + * here is only possible if an attacker maliciously + * changed a dnode block and updated the associated + * checksums going up the block tree. + */ + SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), + db->db.db_object, db->db_level, db->db_blkid); + err = arc_untransform(db->db_buf, db->db_objset->os_spa, + &zb, B_TRUE); + if (err) + panic("Invalid dnode block MAC"); + } else if (dr->dt.dl.dr_has_raw_params) { + (void) arc_release(dr->dt.dl.dr_data, db); + arc_convert_to_raw(dr->dt.dl.dr_data, + dmu_objset_id(db->db_objset), + dr->dt.dl.dr_byteorder, DMU_OT_DNODE, + dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac); + } +} + static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { @@ -3230,6 +3503,13 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } + /* + * If this is a dnode block, ensure it is appropriately encrypted + * or decrypted, depending on what we are writing to it this txg. + */ + if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT) + dbuf_prepare_encrypted_dnode_leaf(dr); + if (db->db_state != DB_NOFILL && dn->dn_object != DMU_META_DNODE_OBJECT && zfs_refcount_count(&db->db_holds) > 1 && @@ -3247,16 +3527,26 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) * DNONE_DNODE blocks). */ int psize = arc_buf_size(*datap); + int lsize = arc_buf_lsize(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); enum zio_compress compress_type = arc_get_compression(*datap); - if (compress_type == ZIO_COMPRESS_OFF) { - *datap = arc_alloc_buf(os->os_spa, db, type, psize); - } else { + if (arc_is_encrypted(*datap)) { + boolean_t byteorder; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + + arc_get_raw_params(*datap, &byteorder, salt, iv, mac); + *datap = arc_alloc_raw_buf(os->os_spa, db, + dmu_objset_id(os), byteorder, salt, iv, mac, + dn->dn_type, psize, lsize, compress_type); + } else if (compress_type != ZIO_COMPRESS_OFF) { ASSERT3U(type, ==, ARC_BUFC_DATA); - int lsize = arc_buf_lsize(*datap); *datap = arc_alloc_compressed_buf(os->os_spa, db, psize, lsize, compress_type); + } else { + *datap = arc_alloc_buf(os->os_spa, db, type, psize); } bcopy(db->db.db_data, (*datap)->b_data, psize); } @@ -3357,8 +3647,10 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); if (db->db_blkid > dn->dn_phys->dn_maxblkid && - db->db_blkid != DMU_SPILL_BLKID) + db->db_blkid != DMU_SPILL_BLKID) { + ASSERT0(db->db_objset->os_raw_receive); dn->dn_phys->dn_maxblkid = db->db_blkid; + } mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { @@ -3393,7 +3685,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) DB_DNODE_EXIT(db); if (!BP_IS_EMBEDDED(bp)) - bp->blk_fill = fill; + BP_SET_FILL(bp, fill); mutex_exit(&db->db_mtx); @@ -3814,6 +4106,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); + DB_DNODE_EXIT(db); /* diff --git a/usr/src/uts/common/fs/zfs/ddt.c b/usr/src/uts/common/fs/zfs/ddt.c index 1d51329511..8bcf6af8ba 100644 --- a/usr/src/uts/common/fs/zfs/ddt.c +++ b/usr/src/uts/common/fs/zfs/ddt.c @@ -253,6 +253,10 @@ ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); } +/* + * The bp created via this function may be used for repairs and scrub, but it + * will be missing the salt / IV required to do a full decrypting read. + */ void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) @@ -263,11 +267,12 @@ ddt_bp_create(enum zio_checksum checksum, ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); bp->blk_cksum = ddk->ddk_cksum; - bp->blk_fill = 1; BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); + BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk)); + BP_SET_FILL(bp, 1); BP_SET_CHECKSUM(bp, checksum); BP_SET_TYPE(bp, DMU_OT_DEDUP); BP_SET_LEVEL(bp, 0); @@ -281,9 +286,12 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) ddk->ddk_cksum = bp->blk_cksum; ddk->ddk_prop = 0; + ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp)); + DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); + DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp)); } void @@ -367,7 +375,7 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) if (ddp->ddp_phys_birth == 0) continue; - for (int d = 0; d < SPA_DVAS_PER_BP; d++) + for (int d = 0; d < DDE_GET_NDVAS(dde); d++) dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); dds->dds_blocks += 1; @@ -521,6 +529,7 @@ ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) uint64_t ditto = spa->spa_dedup_ditto; int total_copies = 0; int desired_copies = 0; + int copies_needed = 0; for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; @@ -546,7 +555,13 @@ ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) if (total_refcnt >= ditto * ditto) desired_copies++; - return (MAX(desired_copies, total_copies) - total_copies); + copies_needed = MAX(desired_copies, total_copies) - total_copies; + + /* encrypted blocks store their IV in DVA[2] */ + if (DDK_GET_CRYPT(&dde->dde_key)) + copies_needed = MIN(copies_needed, SPA_DVAS_PER_BP - 1); + + return (copies_needed); } int @@ -556,7 +571,7 @@ ddt_ditto_copies_present(ddt_entry_t *dde) dva_t *dva = ddp->ddp_dva; int copies = 0 - DVA_GET_GANG(dva); - for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) + for (int d = 0; d < DDE_GET_NDVAS(dde); d++, dva++) if (DVA_IS_VALID(dva)) copies++; diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 95ca9f76aa..02bdfdfa12 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -96,60 +96,60 @@ int zfs_object_remap_one_indirect_delay_ticks = 0; uint64_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { - { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" }, - { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" }, - { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" }, - { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" }, - { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" }, - { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" }, - { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" }, - { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" }, - { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" }, - { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" }, - { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" }, - { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" }, - { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" }, - { DMU_BSWAP_UINT8, TRUE, FALSE, "System attributes" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" }, - { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" }, - { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" }, - { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" }, - { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" }, - { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" } + { DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" }, + { DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" }, + { DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" }, + { DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" }, + { DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" }, + { DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history offsets" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" }, + { DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group used" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group quota" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" }, + { DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" }, + { DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" }, + { DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" }, + { DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" }, + { DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" }, + { DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { @@ -221,6 +221,8 @@ dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; + if (flags & DMU_READ_NO_DECRYPT) + db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); if (err == 0) { @@ -244,6 +246,8 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; + if (flags & DMU_READ_NO_DECRYPT) + db_flags |= DB_RF_NO_DECRYPT; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { @@ -341,14 +345,72 @@ dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) } /* + * Lookup and hold the bonus buffer for the provided dnode. If the dnode + * has not yet been allocated a new bonus dbuf a will be allocated. + * Returns ENOENT, EIO, or 0. + */ +int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, + uint32_t flags) +{ + dmu_buf_impl_t *db; + int error; + uint32_t db_flags = DB_RF_MUST_SUCCEED; + + if (flags & DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; + if (flags & DMU_READ_NO_DECRYPT) + db_flags |= DB_RF_NO_DECRYPT; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_bonus == NULL) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus == NULL) + dbuf_create_bonus(dn); + } + db = dn->dn_bonus; + + /* as long as the bonus buf is held, the dnode will be held */ + if (zfs_refcount_add(&db->db_holds, tag) == 1) { + VERIFY(dnode_add_ref(dn, db)); + atomic_inc_32(&dn->dn_dbufs_count); + } + + /* + * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's + * hold and incrementing the dbuf count to ensure that dnode_move() sees + * a dnode hold for every dbuf. + */ + rw_exit(&dn->dn_struct_rwlock); + + error = dbuf_read(db, NULL, db_flags); + if (error) { + dnode_evict_bonus(dn); + dbuf_rele(db, tag); + *dbp = NULL; + return (error); + } + + *dbp = &db->db; + return (0); +} + +/* * returns ENOENT, EIO, or 0. */ int -dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) +dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag, uint32_t flags, + dmu_buf_t **dbp) { dnode_t *dn; dmu_buf_impl_t *db; int error; + uint32_t db_flags = DB_RF_MUST_SUCCEED; + + if (flags & DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; + if (flags & DMU_READ_NO_DECRYPT) + db_flags |= DB_RF_NO_DECRYPT; error = dnode_hold(os, object, FTAG, &dn); if (error) @@ -378,12 +440,24 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) dnode_rele(dn, FTAG); - VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); + error = dbuf_read(db, NULL, db_flags); + if (error) { + dnode_evict_bonus(dn); + dbuf_rele(db, tag); + *dbp = NULL; + return (error); + } *dbp = &db->db; return (0); } +int +dmu_bonus_hold(objset_t *os, uint64_t obj, void *tag, dmu_buf_t **dbp) +{ + return (dmu_bonus_hold_impl(os, obj, tag, DMU_READ_NO_PREFETCH, dbp)); +} + /* * returns ENOENT, EIO, or 0. * @@ -446,15 +520,20 @@ dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) } int -dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) +dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, + dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; + uint32_t db_flags = DB_RF_CANFAIL; + + if (flags & DMU_READ_NO_DECRYPT) + db_flags |= DB_RF_NO_DECRYPT; DB_DNODE_ENTER(db); dn = DB_DNODE(db); - err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); + err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp); DB_DNODE_EXIT(db); return (err); @@ -619,8 +698,8 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) * indirect blocks prefeteched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. * - * Note that if the indirect blocks above the blocks being prefetched are not in - * cache, they will be asychronously read in. + * Note that if the indirect blocks above the blocks being prefetched are not + * in cache, they will be asychronously read in. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, @@ -835,6 +914,7 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len, uint64_t, dmu_tx_get_txg(tx)); dnode_free_range(dn, chunk_begin, chunk_len, tx); + dmu_tx_commit(tx); length -= chunk_len; @@ -883,7 +963,9 @@ dmu_free_long_object(objset_t *os, uint64_t object) dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { - err = dmu_object_free(os, object, tx); + if (err == 0) + err = dmu_object_free(os, object, tx); + dmu_tx_commit(tx); } else { dmu_tx_abort(tx); @@ -901,7 +983,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, if (err) return (err); ASSERT(offset < UINT64_MAX); - ASSERT(size == -1ULL || size <= UINT64_MAX - offset); + ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); @@ -1622,22 +1704,71 @@ dmu_return_arcbuf(arc_buf_t *buf) arc_buf_destroy(buf, FTAG); } +void +dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset, + dmu_buf_t *handle, dmu_tx_t *tx) +{ + dmu_buf_t *dst_handle; + dmu_buf_impl_t *dstdb; + dmu_buf_impl_t *srcdb = (dmu_buf_impl_t *)handle; + arc_buf_t *abuf; + uint64_t datalen; + boolean_t byteorder; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + + ASSERT3P(srcdb->db_buf, !=, NULL); + + /* hold the db that we want to write to */ + VERIFY0(dmu_buf_hold(os, object, offset, FTAG, &dst_handle, + DMU_READ_NO_DECRYPT)); + dstdb = (dmu_buf_impl_t *)dst_handle; + datalen = arc_buf_size(srcdb->db_buf); + + /* allocated an arc buffer that matches the type of srcdb->db_buf */ + if (arc_is_encrypted(srcdb->db_buf)) { + arc_get_raw_params(srcdb->db_buf, &byteorder, salt, iv, mac); + abuf = arc_loan_raw_buf(os->os_spa, dmu_objset_id(os), + byteorder, salt, iv, mac, DB_DNODE(dstdb)->dn_type, + datalen, arc_buf_lsize(srcdb->db_buf), + arc_get_compression(srcdb->db_buf)); + } else { + /* we won't get a compressed db back from dmu_buf_hold() */ + ASSERT3U(arc_get_compression(srcdb->db_buf), + ==, ZIO_COMPRESS_OFF); + abuf = arc_loan_buf(os->os_spa, + DMU_OT_IS_METADATA(DB_DNODE(dstdb)->dn_type), datalen); + } + + ASSERT3U(datalen, ==, arc_buf_size(abuf)); + + /* copy the data to the new buffer and assign it to the dstdb */ + bcopy(srcdb->db_buf->b_data, abuf->b_data, datalen); + dbuf_assign_arcbuf(dstdb, abuf, tx); + dmu_buf_rele(dst_handle, FTAG); +} + /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ -void -dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, +int +dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *db; + objset_t *os = dn->dn_objset; + uint64_t object = dn->dn_object; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); - VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); + db = dbuf_hold(dn, blkid, FTAG); + if (db == NULL) + return (SET_ERROR(EIO)); rw_exit(&dn->dn_struct_rwlock); /* @@ -1648,32 +1779,33 @@ dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { - objset_t *os; - uint64_t object; - /* compressed bufs must always be assignable to their dbuf */ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); os = dn->dn_objset; object = dn->dn_object; - dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } + + return (0); } -void -dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, +int +dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { + int err; dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; DB_DNODE_ENTER(dbuf); - dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx); + err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx); DB_DNODE_EXIT(dbuf); + + return (err); } typedef struct { @@ -1700,7 +1832,7 @@ dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); - bp->blk_fill = 1; + BP_SET_FILL(bp, 1); } } } @@ -2031,6 +2163,20 @@ dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) } int +dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + err = dnode_set_nlevels(dn, nlevels, tx); + dnode_rele(dn, FTAG); + return (err); +} + +int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { @@ -2045,6 +2191,23 @@ dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, return (err); } +int +dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, + dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (0); +} + void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) @@ -2084,8 +2247,6 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dnode_rele(dn, FTAG); } -int zfs_mdcomp_disable = 0; - /* * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. @@ -2104,6 +2265,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; + boolean_t encrypt = B_FALSE; int copies = os->os_copies; /* @@ -2114,16 +2276,12 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) * 3. all other level 0 blocks */ if (ismd) { - if (zfs_mdcomp_disable) { - compress = ZIO_COMPRESS_EMPTY; - } else { - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - compress = zio_compress_select(os->os_spa, - ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); - } + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + compress = zio_compress_select(os->os_spa, + ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); /* * Metadata always gets checksummed. If the data @@ -2191,10 +2349,33 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } - zp->zp_checksum = checksum; - zp->zp_compress = compress; - ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); + /* + * All objects in an encrypted objset are protected from modification + * via a MAC. Encrypted objects store their IV and salt in the last DVA + * in the bp, so we cannot use all copies. Encrypted objects are also + * not subject to nopwrite since writing the same data will still + * result in a new ciphertext. Only encrypted blocks can be dedup'd + * to avoid ambiguity in the dedup code since the DDT does not store + * object types. + */ + if (os->os_encrypted && (wp & WP_NOFILL) == 0) { + encrypt = B_TRUE; + if (DMU_OT_IS_ENCRYPTED(type)) { + copies = MIN(copies, SPA_DVAS_PER_BP - 1); + nopwrite = B_FALSE; + } else { + dedup = B_FALSE; + } + + if (level <= 0 && + (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) { + compress = ZIO_COMPRESS_EMPTY; + } + } + + zp->zp_compress = compress; + zp->zp_checksum = checksum; zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); @@ -2203,6 +2384,11 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) zp->zp_nopwrite = nopwrite; zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? os->os_zpl_special_smallblock : 0; + zp->zp_encrypt = encrypt; + zp->zp_byteorder = ZFS_HOST_BYTEORDER; + bzero(zp->zp_salt, ZIO_DATA_SALT_LEN); + bzero(zp->zp_iv, ZIO_DATA_IV_LEN); + bzero(zp->zp_mac, ZIO_DATA_MAC_LEN); } int diff --git a/usr/src/uts/common/fs/zfs/dmu_diff.c b/usr/src/uts/common/fs/zfs/dmu_diff.c index 982b96132c..76c32b1264 100644 --- a/usr/src/uts/common/fs/zfs/dmu_diff.c +++ b/usr/src/uts/common/fs/zfs/dmu_diff.c @@ -131,11 +131,14 @@ diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *abuf; arc_flags_t aflags = ARC_FLAG_WAIT; int blksz = BP_GET_LSIZE(bp); + int zio_flags = ZIO_FLAG_CANFAIL; int i; + if (BP_IS_PROTECTED(bp)) + zio_flags |= ZIO_FLAG_RAW; + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &aflags, zb) != 0) + ZIO_PRIORITY_ASYNC_READ, zio_flags, &aflags, zb) != 0) return (SET_ERROR(EIO)); blk = abuf->b_data; @@ -206,8 +209,17 @@ dmu_diff(const char *tosnap_name, const char *fromsnap_name, da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; da.da_err = 0; + /* + * Since zfs diff only looks at dnodes which are stored in plaintext + * (other than bonus buffers), we don't technically need to decrypt + * the dataset to perform this operation. However, the command line + * utility will still fail if the keys are not loaded because the + * dataset isn't mounted and because it will fail when it attempts to + * call the ZFS_IOC_OBJ_TO_STATS ioctl. + */ error = traverse_dataset(tosnap, fromtxg, - TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, diff_cb, &da); + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, + diff_cb, &da); if (error != 0) { da.da_err = error; diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c index f835987e7d..1a91fefe88 100644 --- a/usr/src/uts/common/fs/zfs/dmu_object.c +++ b/usr/src/uts/common/fs/zfs/dmu_object.c @@ -24,6 +24,7 @@ * Copyright 2014 HybridCluster. All rights reserved. */ +#include <sys/dbuf.h> #include <sys/dmu.h> #include <sys/dmu_objset.h> #include <sys/dmu_tx.h> @@ -263,13 +264,13 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, - bonuslen, DNODE_MIN_SIZE, tx)); + bonuslen, DNODE_MIN_SIZE, B_FALSE, tx)); } int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, - dmu_tx_t *tx) + boolean_t keep_spill, dmu_tx_t *tx) { dnode_t *dn; int dn_slots = dnodesize >> DNODE_SHIFT; @@ -286,7 +287,30 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, if (err) return (err); - dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx); + dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, + keep_spill, tx); + + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, + FTAG, &dn); + if (err) + return (err); + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + dbuf_rm_spill(dn, tx); + dnode_rm_spill(dn, tx); + } + rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (err); diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 771b803973..4d0a5d2fd5 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -54,6 +54,7 @@ #include <sys/dsl_destroy.h> #include <sys/vdev.h> #include <sys/zfeature.h> +#include <sys/dmu_recv.h> #include "zfs_namecheck.h" /* @@ -418,16 +419,23 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (!BP_IS_HOLE(os->os_rootbp)) { arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; + enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_FLAG_L2CACHE; + if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) { + ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); + ASSERT(BP_IS_AUTHENTICATED(bp)); + zio_flags |= ZIO_FLAG_RAW; + } + dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); + ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (err != 0) { kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ @@ -468,6 +476,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, if (ds != NULL) { boolean_t needlock = B_FALSE; + os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0); + /* * Note: it's valid to open the objset if the dataset is * long-held, in which case the pool_config lock will not @@ -477,6 +487,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, needlock = B_TRUE; dsl_pool_config_enter(dmu_objset_pool(os), FTAG); } + err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); @@ -550,6 +561,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, /* It's the meta-objset. */ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; os->os_compress = ZIO_COMPRESS_ON; + os->os_encrypted = B_FALSE; os->os_copies = spa_max_replication(spa); os->os_dedup_checksum = ZIO_CHECKSUM_OFF; os->os_dedup_verify = B_FALSE; @@ -640,16 +652,18 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) * can be held at a time. */ int -dmu_objset_hold(const char *name, void *tag, objset_t **osp) +dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, + objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; + ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; err = dsl_pool_hold(name, tag, &dp); if (err != 0) return (err); - err = dsl_dataset_hold(dp, name, tag, &ds); + err = dsl_dataset_hold_flags(dp, name, flags, tag, &ds); if (err != 0) { dsl_pool_rele(dp, tag); return (err); @@ -664,23 +678,46 @@ dmu_objset_hold(const char *name, void *tag, objset_t **osp) return (err); } +int +dmu_objset_hold(const char *name, void *tag, objset_t **osp) +{ + return (dmu_objset_hold_flags(name, B_FALSE, tag, osp)); +} + +/* ARGSUSED */ static int dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp) + boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) { int err; err = dmu_objset_from_ds(ds, osp); if (err != 0) { - dsl_dataset_disown(ds, tag); + return (err); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { - dsl_dataset_disown(ds, tag); return (SET_ERROR(EINVAL)); } else if (!readonly && dsl_dataset_is_snapshot(ds)) { - dsl_dataset_disown(ds, tag); + return (SET_ERROR(EROFS)); + } else if (!readonly && decrypt && + dsl_dir_incompatible_encryption_version(ds->ds_dir)) { return (SET_ERROR(EROFS)); } - return (err); + + /* if we are decrypting, we can now check MACs in os->os_phys_buf */ + if (decrypt && arc_is_unauthenticated((*osp)->os_phys_buf)) { + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, + ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + err = arc_untransform((*osp)->os_phys_buf, (*osp)->os_spa, + &zb, B_FALSE); + if (err != 0) + return (err); + + ASSERT0(arc_is_unauthenticated((*osp)->os_phys_buf)); + } + + return (0); } /* @@ -690,48 +727,70 @@ dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, */ int dmu_objset_own(const char *name, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp) + boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; + ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); - err = dsl_dataset_own(dp, name, tag, &ds); + err = dsl_dataset_own(dp, name, flags, tag, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } - err = dmu_objset_own_impl(ds, type, readonly, tag, osp); + err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp); + if (err != 0) { + dsl_dataset_disown(ds, flags, tag); + dsl_pool_rele(dp, FTAG); + return (err); + } + dsl_pool_rele(dp, FTAG); - return (err); + return (0); } int dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp) + boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) { dsl_dataset_t *ds; int err; + ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; - err = dsl_dataset_own_obj(dp, obj, tag, &ds); + err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds); if (err != 0) return (err); - return (dmu_objset_own_impl(ds, type, readonly, tag, osp)); + err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp); + if (err != 0) { + dsl_dataset_disown(ds, flags, tag); + return (err); + } + + return (0); } void -dmu_objset_rele(objset_t *os, void *tag) +dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag) { + ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + dsl_pool_t *dp = dmu_objset_pool(os); - dsl_dataset_rele(os->os_dsl_dataset, tag); + dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag); dsl_pool_rele(dp, tag); } +void +dmu_objset_rele(objset_t *os, void *tag) +{ + dmu_objset_rele_flags(os, B_FALSE, tag); +} + /* * When we are called, os MUST refer to an objset associated with a dataset * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner @@ -745,7 +804,7 @@ dmu_objset_rele(objset_t *os, void *tag) */ void dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, - void *tag) + boolean_t decrypt, void *tag) { dsl_pool_t *dp; char name[ZFS_MAX_DATASET_NAME_LEN]; @@ -757,15 +816,18 @@ dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, dsl_dataset_name(ds, name); dp = ds->ds_dir->dd_pool; dsl_pool_config_enter(dp, FTAG); - dsl_dataset_disown(ds, tag); - VERIFY0(dsl_dataset_own(dp, name, tag, newds)); + + dsl_dataset_disown(ds, 0, tag); + VERIFY0(dsl_dataset_own(dp, name, + (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag, newds)); dsl_pool_config_exit(dp, FTAG); } void -dmu_objset_disown(objset_t *os, void *tag) +dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag) { - dsl_dataset_disown(os->os_dsl_dataset, tag); + dsl_dataset_disown(os->os_dsl_dataset, + (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag); } void @@ -842,6 +904,8 @@ dmu_objset_evict(objset_t *os) } else { mutex_exit(&os->os_lock); } + + } void @@ -887,16 +951,21 @@ dmu_objset_snap_cmtime(objset_t *os) return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); } -/* called from dsl for meta-objset */ +/* ARGSUSED */ objset_t * -dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - dmu_objset_type_t type, dmu_tx_t *tx) +dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + dmu_objset_type_t type, int levels, int blksz, int ibs, dmu_tx_t *tx) { objset_t *os; dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); + if (blksz == 0) + blksz = 1 << DNODE_BLOCK_SHIFT; + if (ibs == 0) + ibs = DN_MAX_INDBLKSHIFT; + if (ds != NULL) VERIFY0(dmu_objset_from_ds(ds, &os)); else @@ -919,22 +988,25 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, * to convergence, so minimizing its dn_nlevels matters. */ if (ds != NULL) { - int levels = 1; - - /* - * Determine the number of levels necessary for the meta-dnode - * to contain DN_MAX_OBJECT dnodes. Note that in order to - * ensure that we do not overflow 64 bits, there has to be - * a nlevels that gives us a number of blocks > DN_MAX_OBJECT - * but < 2^64. Therefore, - * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be - * less than (64 - log2(DN_MAX_OBJECT)) (16). - */ - while ((uint64_t)mdn->dn_nblkptr << - (mdn->dn_datablkshift - DNODE_SHIFT + - (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < - DN_MAX_OBJECT) - levels++; + if (levels == 0) { + levels = 1; + + /* + * Determine the number of levels necessary for the + * meta-dnode to contain DN_MAX_OBJECT dnodes. Note + * that in order to ensure that we do not overflow + * 64 bits, there has to be a nlevels that gives us a + * number of blocks > DN_MAX_OBJECT but < 2^64. + * Therefore, (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) + * (10) must be less than (64 - log2(DN_MAX_OBJECT)) + * (16). + */ + while ((uint64_t)mdn->dn_nblkptr << + (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) * + (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < + DN_MAX_OBJECT) + levels++; + } mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = mdn->dn_nlevels = levels; @@ -944,7 +1016,13 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); os->os_phys->os_type = type; - if (dmu_objset_userused_enabled(os)) { + + /* + * Enable user accounting if it is enabled and this is not an + * encrypted receive. + */ + if (dmu_objset_userused_enabled(os) && + (!os->os_encrypted || !dmu_objset_is_receiving(os))) { os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; os->os_flags = os->os_phys->os_flags; } @@ -954,6 +1032,14 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, return (os); } +/* called from dsl for meta-objset */ +objset_t * +dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + dmu_objset_type_t type, dmu_tx_t *tx) +{ + return (dmu_objset_create_impl_dnstats(spa, ds, bp, type, 0, 0, 0, tx)); +} + typedef struct dmu_objset_create_arg { const char *doca_name; cred_t *doca_cred; @@ -962,6 +1048,7 @@ typedef struct dmu_objset_create_arg { void *doca_userarg; dmu_objset_type_t doca_type; uint64_t doca_flags; + dsl_crypto_params_t *doca_dcp; } dmu_objset_create_arg_t; /*ARGSUSED*/ @@ -990,8 +1077,16 @@ dmu_objset_create_check(void *arg, dmu_tx_t *tx) dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } + + error = dmu_objset_create_crypt_check(pdd, doca->doca_dcp, NULL); + if (error != 0) { + dsl_dir_rele(pdd, FTAG); + return (error); + } + error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred); + dsl_dir_rele(pdd, FTAG); return (error); @@ -1002,23 +1097,25 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx) { dmu_objset_create_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; dsl_dir_t *pdd; const char *tail; dsl_dataset_t *ds; uint64_t obj; blkptr_t *bp; objset_t *os; + zio_t *rzio; VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, - doca->doca_cred, tx); + doca->doca_cred, doca->doca_dcp, tx); - VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); + VERIFY0(dsl_dataset_hold_obj_flags(pdd->dd_pool, obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); bp = dsl_dataset_get_blkptr(ds); - os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, - ds, bp, doca->doca_type, tx); + os = dmu_objset_create_impl(spa, ds, bp, doca->doca_type, tx); rrw_exit(&ds->ds_bp_rwlock, FTAG); if (doca->doca_userfunc != NULL) { @@ -1026,16 +1123,68 @@ dmu_objset_create_sync(void *arg, dmu_tx_t *tx) doca->doca_cred, tx); } + /* + * The doca_userfunc() may write out some data that needs to be + * encrypted if the dataset is encrypted (specifically the root + * directory). This data must be written out before the encryption + * key mapping is removed by dsl_dataset_rele_flags(). Force the + * I/O to occur immediately by invoking the relevant sections of + * dsl_pool_sync(). + */ + if (os->os_encrypted) { + dsl_dataset_t *tmpds = NULL; + boolean_t need_sync_done = B_FALSE; + + mutex_enter(&ds->ds_lock); + ds->ds_owner = FTAG; + mutex_exit(&ds->ds_lock); + + rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds, + tx->tx_txg); + if (tmpds != NULL) { + dsl_dataset_sync(ds, rzio, tx); + need_sync_done = B_TRUE; + } + VERIFY0(zio_wait(rzio)); + dmu_objset_do_userquota_updates(os, tx); + taskq_wait(dp->dp_sync_taskq); + if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { + ASSERT3P(ds->ds_key_mapping, !=, NULL); + key_mapping_rele(spa, ds->ds_key_mapping, ds); + } + + rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds, + tx->tx_txg); + if (tmpds != NULL) { + dmu_buf_rele(ds->ds_dbuf, ds); + dsl_dataset_sync(ds, rzio, tx); + } + VERIFY0(zio_wait(rzio)); + + if (need_sync_done) { + ASSERT3P(ds->ds_key_mapping, !=, NULL); + key_mapping_rele(spa, ds->ds_key_mapping, ds); + dsl_dataset_sync_done(ds, tx); + } + + mutex_enter(&ds->ds_lock); + ds->ds_owner = NULL; + mutex_exit(&ds->ds_lock); + } + spa_history_log_internal_ds(ds, "create", tx, ""); - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dsl_dir_rele(pdd, FTAG); } int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, - void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) + dsl_crypto_params_t *dcp, dmu_objset_create_sync_func_t func, void *arg) { dmu_objset_create_arg_t doca; + dsl_crypto_params_t tmp_dcp = { 0 }; doca.doca_name = name; doca.doca_cred = CRED(); @@ -1044,9 +1193,19 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, doca.doca_userarg = arg; doca.doca_type = type; + /* + * Some callers (mostly for testing) do not provide a dcp on their + * own but various code inside the sync task will require it to be + * allocated. Rather than adding NULL checks throughout this code + * or adding dummy dcp's to all of the callers we simply create a + * dummy one here and use that. This zero dcp will have the same + * effect as asking for inheritence of all encryption params. + */ + doca.doca_dcp = (dcp != NULL) ? dcp : &tmp_dcp; + return (dsl_sync_task(name, dmu_objset_create_check, dmu_objset_create_sync, &doca, - 5, ZFS_SPACE_CHECK_NORMAL)); + 6, ZFS_SPACE_CHECK_NORMAL)); } typedef struct dmu_objset_clone_arg { @@ -1086,18 +1245,29 @@ dmu_objset_clone_check(void *arg, dmu_tx_t *tx) dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EDQUOT)); } - dsl_dir_rele(pdd, FTAG); error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); - if (error != 0) + if (error != 0) { + dsl_dir_rele(pdd, FTAG); return (error); + } /* You can only clone snapshots, not the head datasets. */ if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EINVAL)); } + + error = dmu_objset_clone_crypt_check(pdd, origin->ds_dir); + if (error != 0) { + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(pdd, FTAG); + return (error); + } + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(pdd, FTAG); return (0); } @@ -1117,7 +1287,7 @@ dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); obj = dsl_dataset_create_sync(pdd, tail, origin, 0, - doca->doca_cred, tx); + doca->doca_cred, NULL, tx); VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); dsl_dataset_name(origin, namebuf); @@ -1139,7 +1309,7 @@ dmu_objset_clone(const char *clone, const char *origin) return (dsl_sync_task(clone, dmu_objset_clone_check, dmu_objset_clone_sync, &doca, - 5, ZFS_SPACE_CHECK_NORMAL)); + 6, ZFS_SPACE_CHECK_NORMAL)); } static int @@ -1299,10 +1469,10 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) blkptr_t *bp = zio->io_bp; objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; + uint64_t fill = 0; ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); - ASSERT0(BP_GET_LEVEL(bp)); /* * Update rootbp fill count: it should be the number of objects @@ -1310,9 +1480,11 @@ dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) * objects that are stored in the objset_phys_t -- the meta * dnode and user/group accounting objects). */ - bp->blk_fill = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) - bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]); + fill += BP_GET_FILL(&dnp->dn_blkptr[i]); + + BP_SET_FILL(bp, fill); + if (os->os_dsl_dataset != NULL) rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG); *os->os_rootbp = *bp; @@ -1401,6 +1573,19 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) dmu_write_policy(os, NULL, 0, 0, &zp); + /* + * If we are either claiming the ZIL or doing a raw receive, write + * out the os_phys_buf raw. Neither of these actions will effect the + * MAC at this point. + */ + if (os->os_raw_receive || + os->os_next_write_raw[tx->tx_txg & TXG_MASK]) { + ASSERT(os->os_encrypted); + arc_convert_to_raw(os->os_phys_buf, + os->os_dsl_dataset->ds_object, ZFS_HOST_BYTEORDER, + DMU_OT_OBJSET, NULL, NULL, NULL); + } + zio = arc_write(pio, os->os_spa, tx->tx_txg, blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, @@ -1424,7 +1609,8 @@ dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) txgoff = tx->tx_txg & TXG_MASK; - if (dmu_objset_userused_enabled(os)) { + if (dmu_objset_userused_enabled(os) && + (!os->os_encrypted || !dmu_objset_is_receiving(os))) { /* * We must create the list here because it uses the * dn_dirty_link[] of this txg. But it may already @@ -1663,6 +1849,10 @@ dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) if (!dmu_objset_userused_enabled(os)) return; + /* if this is a raw receive just return and handle accounting later */ + if (os->os_encrypted && dmu_objset_is_receiving(os)) + return; + /* Allocate the user/groupused objects if necessary. */ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { VERIFY0(zap_create_claim(os, @@ -1742,6 +1932,18 @@ dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) if (!dmu_objset_userused_enabled(dn->dn_objset)) return; + /* + * Raw receives introduce a problem with user accounting. Raw + * receives cannot update the user accounting info because the + * user ids and the sizes are encrypted. To guarantee that we + * never end up with bad user accounting, we simply disable it + * during raw receives. We also disable this for normal receives + * so that an incremental raw receive may be done on top of an + * existing non-raw receive. + */ + if (os->os_encrypted && dmu_objset_is_receiving(os)) + return; + if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| DN_ID_CHKED_SPILL))) return; @@ -2394,6 +2596,13 @@ dmu_objset_find(char *name, int func(const char *, void *), void *arg, return (error); } +boolean_t +dmu_objset_incompatible_encryption_version(objset_t *os) +{ + return (dsl_dir_incompatible_encryption_version( + os->os_dsl_dataset->ds_dir)); +} + void dmu_objset_set_user(objset_t *os, void *user_ptr) { diff --git a/usr/src/uts/common/fs/zfs/dmu_recv.c b/usr/src/uts/common/fs/zfs/dmu_recv.c index 542bb42f3f..b6f63e7e22 100644 --- a/usr/src/uts/common/fs/zfs/dmu_recv.c +++ b/usr/src/uts/common/fs/zfs/dmu_recv.c @@ -67,7 +67,7 @@ typedef struct dmu_recv_begin_arg { const char *drba_origin; dmu_recv_cookie_t *drba_cookie; cred_t *drba_cred; - uint64_t drba_snapobj; + dsl_crypto_params_t *drba_dcp; } dmu_recv_begin_arg_t; static int @@ -77,6 +77,11 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t val; int error; dsl_pool_t *dp = ds->ds_dir->dd_pool; + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0; + boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0; + boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0; /* temporary clone name must not exist */ error = zap_lookup(dp->dp_meta_objset, @@ -110,6 +115,14 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, dsl_dataset_t *snap; uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + /* Can't raw receive on top of an unencrypted dataset */ + if (!encrypted && raw) + return (SET_ERROR(EINVAL)); + + /* Encryption is incompatible with embedded data */ + if (encrypted && embed) + return (SET_ERROR(EINVAL)); + /* Find snapshot in this dir that matches fromguid. */ while (obj != 0) { error = dsl_dataset_hold_obj(dp, obj, FTAG, @@ -129,7 +142,7 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, return (SET_ERROR(ENODEV)); if (drba->drba_cookie->drc_force) { - drba->drba_snapobj = obj; + drba->drba_cookie->drc_fromsnapobj = obj; } else { /* * If we are not forcing, there must be no @@ -139,7 +152,8 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ETXTBSY)); } - drba->drba_snapobj = ds->ds_prev->ds_object; + drba->drba_cookie->drc_fromsnapobj = + ds->ds_prev->ds_object; } dsl_dataset_rele(snap, FTAG); @@ -147,9 +161,34 @@ recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, /* if full, then must be forced */ if (!drba->drba_cookie->drc_force) return (SET_ERROR(EEXIST)); - /* start from $ORIGIN@$ORIGIN, if supported */ - drba->drba_snapobj = dp->dp_origin_snap != NULL ? - dp->dp_origin_snap->ds_object : 0; + + /* + * We don't support using zfs recv -F to blow away + * encrypted filesystems. This would require the + * dsl dir to point to the old encryption key and + * the new one at the same time during the receive. + */ + if ((!encrypted && raw) || encrypted) + return (SET_ERROR(EINVAL)); + + /* + * Perform the same encryption checks we would if + * we were creating a new dataset from scratch. + */ + if (!raw) { + boolean_t will_encrypt; + + error = dmu_objset_create_crypt_check( + ds->ds_dir->dd_parent, drba->drba_dcp, + &will_encrypt); + if (error != 0) + return (error); + + if (will_encrypt && embed) + return (SET_ERROR(EINVAL)); + } + + drba->drba_cookie->drc_fromsnapobj = 0; } return (0); @@ -164,6 +203,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) struct drr_begin *drrb = drba->drba_cookie->drc_drrb; uint64_t fromguid = drrb->drr_fromguid; int flags = drrb->drr_flags; + ds_hold_flags_t dsflags = 0; int error; uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; @@ -214,18 +254,34 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE)) return (SET_ERROR(ENOTSUP)); - error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if ((featureflags & DMU_BACKUP_FEATURE_RAW)) { + /* raw receives require the encryption feature */ + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) + return (SET_ERROR(ENOTSUP)); + + /* embedded data is incompatible with encryption and raw recv */ + if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) + return (SET_ERROR(EINVAL)); + + /* raw receives require spill block allocation flag */ + if (!(flags & DRR_FLAG_SPILL_BLOCK)) + return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING)); + } else { + dsflags |= DS_HOLD_FLAG_DECRYPT; + } + + error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error == 0) { /* target fs already exists; recv into temp clone */ /* Can't recv a clone into an existing fs */ if (flags & DRR_FLAG_CLONE || drba->drba_origin) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } error = recv_begin_check_existing_impl(drba, ds, fromguid); - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); } else if (error == ENOENT) { /* target fs does not exist; must be a full backup or clone */ char buf[ZFS_MAX_DATASET_NAME_LEN]; @@ -250,10 +306,35 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) /* Open the parent of tofs */ ASSERT3U(strlen(tofs), <, sizeof (buf)); (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); - error = dsl_dataset_hold(dp, buf, FTAG, &ds); + error = dsl_dataset_hold_flags(dp, buf, dsflags, FTAG, &ds); if (error != 0) return (error); + if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 && + drba->drba_origin == NULL) { + boolean_t will_encrypt; + + /* + * Check that we aren't breaking any encryption rules + * and that we have all the parameters we need to + * create an encrypted dataset if necessary. If we are + * making an encrypted dataset the stream can't have + * embedded data. + */ + error = dmu_objset_create_crypt_check(ds->ds_dir, + drba->drba_dcp, &will_encrypt); + if (error != 0) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + + if (will_encrypt && + (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + } + /* * Check filesystem and snapshot limits before receiving. We'll * recheck snapshot limits again at the end (we create the @@ -262,39 +343,46 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx) error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); if (error != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (error); } error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); if (error != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (error); } if (drba->drba_origin != NULL) { dsl_dataset_t *origin; - error = dsl_dataset_hold(dp, drba->drba_origin, - FTAG, &origin); + + error = dsl_dataset_hold_flags(dp, drba->drba_origin, + dsflags, FTAG, &origin); if (error != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (error); } if (!origin->ds_is_snapshot) { - dsl_dataset_rele(origin, FTAG); - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } if (dsl_dataset_phys(origin)->ds_guid != fromguid && fromguid != 0) { - dsl_dataset_rele(origin, FTAG); - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(ENODEV)); } - dsl_dataset_rele(origin, FTAG); + if (origin->ds_dir->dd_crypto_obj != 0 && + (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + dsl_dataset_rele_flags(origin, dsflags, FTAG); } - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); error = 0; } return (error); @@ -308,27 +396,51 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) objset_t *mos = dp->dp_meta_objset; struct drr_begin *drrb = drba->drba_cookie->drc_drrb; const char *tofs = drba->drba_cookie->drc_tofs; + uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds, *newds; + objset_t *os; uint64_t dsobj; + ds_hold_flags_t dsflags = 0; int error; uint64_t crflags = 0; + dsl_crypto_params_t dummy_dcp = { 0 }; + dsl_crypto_params_t *dcp = drba->drba_dcp; if (drrb->drr_flags & DRR_FLAG_CI_DATA) crflags |= DS_FLAG_CI_DATASET; + if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0) + dsflags |= DS_HOLD_FLAG_DECRYPT; + + /* + * Raw, non-incremental recvs always use a dummy dcp with + * the raw cmd set. Raw incremental recvs do not use a dcp + * since the encryption parameters are already set in stone. + */ + if (dcp == NULL && drba->drba_cookie->drc_fromsnapobj == 0 && + drba->drba_origin == NULL) { + ASSERT3P(dcp, ==, NULL); + dcp = &dummy_dcp; - error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + if (featureflags & DMU_BACKUP_FEATURE_RAW) + dcp->cp_cmd = DCP_CMD_RAW_RECV; + } + + error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error == 0) { /* create temporary clone */ dsl_dataset_t *snap = NULL; - if (drba->drba_snapobj != 0) { + + if (drba->drba_cookie->drc_fromsnapobj != 0) { VERIFY0(dsl_dataset_hold_obj(dp, - drba->drba_snapobj, FTAG, &snap)); + drba->drba_cookie->drc_fromsnapobj, FTAG, &snap)); + ASSERT3P(dcp, ==, NULL); } + dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, - snap, crflags, drba->drba_cred, tx); - if (drba->drba_snapobj != 0) + snap, crflags, drba->drba_cred, dcp, tx); + if (drba->drba_cookie->drc_fromsnapobj != 0) dsl_dataset_rele(snap, FTAG); - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); } else { dsl_dir_t *dd; const char *tail; @@ -339,18 +451,20 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) if (drba->drba_origin != NULL) { VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, FTAG, &origin)); + ASSERT3P(dcp, ==, NULL); } /* Create new dataset. */ - dsobj = dsl_dataset_create_sync(dd, - strrchr(tofs, '/') + 1, - origin, crflags, drba->drba_cred, tx); + dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1, + origin, crflags, drba->drba_cred, dcp, tx); if (origin != NULL) dsl_dataset_rele(origin, FTAG); dsl_dir_rele(dd, FTAG); drba->drba_cookie->drc_newfs = B_TRUE; } - VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); + + VERIFY0(dsl_dataset_own_obj(dp, dsobj, dsflags, dmu_recv_tag, &newds)); + VERIFY0(dmu_objset_from_ds(newds, &os)); if (drba->drba_cookie->drc_resumable) { dsl_dataset_zapify(newds, tx); @@ -370,32 +484,46 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) 8, 1, &zero, tx)); VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, 8, 1, &zero, tx)); - if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_LARGE_BLOCKS) { + if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, 8, 1, &one, tx)); } - if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_EMBED_DATA) { + if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, 8, 1, &one, tx)); } - if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & - DMU_BACKUP_FEATURE_COMPRESSED) { + if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) { VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, 8, 1, &one, tx)); } + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK, + 8, 1, &one, tx)); + } + } + + /* + * Usually the os->os_encrypted value is tied to the presence of a + * DSL Crypto Key object in the dd. However, that will not be received + * until dmu_recv_stream(), so we set the value manually for now. + */ + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + os->os_encrypted = B_TRUE; + drba->drba_cookie->drc_raw = B_TRUE; } dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; /* - * If we actually created a non-clone, we need to create the - * objset in our new dataset. + * If we actually created a non-clone, we need to create the objset + * in our new dataset. If this is a raw send we postpone this until + * dmu_recv_stream() so that we can allocate the metadnode with the + * properties from the DRR_BEGIN payload. */ rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); - if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { + if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) && + (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) { (void) dmu_objset_create_impl(dp->dp_spa, newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } @@ -413,6 +541,7 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drba->drba_cookie->drc_drrb; int error; + ds_hold_flags_t dsflags = 0; uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; @@ -463,29 +592,37 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); - if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + /* raw receives require spill block allocation flag */ + if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)) + return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING)); + } else { + dsflags |= DS_HOLD_FLAG_DECRYPT; + } + + if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) { /* %recv does not exist; continue in tofs */ - error = dsl_dataset_hold(dp, tofs, FTAG, &ds); + error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); if (error != 0) return (error); } /* check that ds is marked inconsistent */ if (!DS_IS_INCONSISTENT(ds)) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } /* check that there is resuming data, and that the toguid matches */ if (!dsl_dataset_is_zapified(ds)) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } uint64_t val; error = zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); if (error != 0 || drrb->drr_toguid != val) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } @@ -495,13 +632,13 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) * fails) because it will be marked inconsistent. */ if (dsl_dataset_has_owner(ds)) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EBUSY)); } /* There should not be any snapshots of this fs yet. */ if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } @@ -515,11 +652,11 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); if (drrb->drr_fromguid != val) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (SET_ERROR(EINVAL)); } - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (0); } @@ -529,7 +666,11 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); const char *tofs = drba->drba_cookie->drc_tofs; + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; + objset_t *os; + ds_hold_flags_t dsflags = 0; uint64_t dsobj; /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; @@ -537,9 +678,15 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, recv_clone_name); - if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) { + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + drba->drba_cookie->drc_raw = B_TRUE; + } else { + dsflags |= DS_HOLD_FLAG_DECRYPT; + } + + if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) { /* %recv does not exist; continue in tofs */ - VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds)); + VERIFY0(dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds)); drba->drba_cookie->drc_newfs = B_TRUE; } @@ -548,15 +695,17 @@ dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; dsobj = ds->ds_object; - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); - VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds)); + VERIFY0(dsl_dataset_own_obj(dp, dsobj, dsflags, dmu_recv_tag, &ds)); + VERIFY0(dmu_objset_from_ds(ds, &os)); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); - ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds))); + ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) || + drba->drba_cookie->drc_raw); rrw_exit(&ds->ds_bp_rwlock, FTAG); drba->drba_cookie->drc_ds = ds; @@ -596,6 +745,9 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, return (SET_ERROR(EINVAL)); } + if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK) + drc->drc_spill = B_TRUE; + drba.drba_origin = origin; drba.drba_cookie = drc; drba.drba_cred = CRED(); @@ -619,7 +771,7 @@ struct receive_record_arg { * If the record is a write, pointer to the arc_buf_t containing the * payload. */ - arc_buf_t *write_buf; + arc_buf_t *arc_buf; int payload_size; uint64_t bytes_read; /* bytes read from stream when record created */ boolean_t eos_marker; /* Marks the end of the stream */ @@ -643,10 +795,21 @@ struct receive_writer_arg { /* A map from guid to dataset to help handle dedup'd streams. */ avl_tree_t *guid_to_ds_map; boolean_t resumable; + boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ + boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ uint64_t last_object; uint64_t last_offset; uint64_t max_object; /* highest object ID referenced in stream */ uint64_t bytes_read; /* bytes read when current record created */ + + /* Encryption parameters for the last received DRR_OBJECT_RANGE */ + boolean_t or_crypt_params_present; + uint64_t or_firstobj; + uint64_t or_numslots; + uint8_t or_salt[ZIO_DATA_SALT_LEN]; + uint8_t or_iv[ZIO_DATA_IV_LEN]; + uint8_t or_mac[ZIO_DATA_MAC_LEN]; + boolean_t or_byteorder; }; struct objlist { @@ -679,12 +842,15 @@ struct receive_arg { zio_cksum_t prev_cksum; int err; boolean_t byteswap; + boolean_t raw; + uint64_t featureflags; /* Sorted list of objects not to issue prefetches for. */ struct objlist ignore_objlist; }; typedef struct guid_map_entry { uint64_t guid; + boolean_t raw; dsl_dataset_t *gme_ds; avl_node_t avlnode; } guid_map_entry_t; @@ -710,8 +876,14 @@ free_guid_map_onexit(void *arg) guid_map_entry_t *gmep; while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { - dsl_dataset_long_rele(gmep->gme_ds, gmep); - dsl_dataset_rele(gmep->gme_ds, gmep); + ds_hold_flags_t dsflags = DS_HOLD_FLAG_DECRYPT; + + if (gmep->raw) { + gmep->gme_ds->ds_objset->os_raw_receive = B_FALSE; + dsflags &= ~DS_HOLD_FLAG_DECRYPT; + } + + dsl_dataset_disown(gmep->gme_ds, dsflags, gmep); kmem_free(gmep, sizeof (guid_map_entry_t)); } avl_destroy(ca); @@ -727,7 +899,8 @@ receive_read(struct receive_arg *ra, int len, void *buf) * The code doesn't rely on this (lengths being multiples of 8). See * comment in dump_bytes. */ - ASSERT0(len % 8); + ASSERT(len % 8 == 0 || + (ra->featureflags & DMU_BACKUP_FEATURE_RAW) != 0); while (done < len) { ssize_t resid; @@ -780,7 +953,9 @@ byteswap_record(dmu_replay_record_t *drr) DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); DO32(drr_object.drr_bonuslen); + DO32(drr_object.drr_raw_bonuslen); DO64(drr_object.drr_toguid); + DO64(drr_object.drr_maxblkid); break; case DRR_FREEOBJECTS: DO64(drr_freeobjects.drr_firstobj); @@ -827,6 +1002,13 @@ byteswap_record(dmu_replay_record_t *drr) DO64(drr_spill.drr_object); DO64(drr_spill.drr_length); DO64(drr_spill.drr_toguid); + DO64(drr_spill.drr_compressed_size); + DO32(drr_spill.drr_type); + break; + case DRR_OBJECT_RANGE: + DO64(drr_object_range.drr_firstobj); + DO64(drr_object_range.drr_numslots); + DO64(drr_object_range.drr_toguid); break; case DRR_END: DO64(drr_end.drr_toguid); @@ -891,6 +1073,8 @@ save_resume_state(struct receive_writer_arg *rwa, rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; } +int receive_object_delay_frac = 0; + static int receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, void *data) @@ -902,6 +1086,10 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, uint8_t dn_slots = drro->drr_dn_slots != 0 ? drro->drr_dn_slots : DNODE_MIN_SLOTS; + if (receive_object_delay_frac != 0 && + spa_get_random(receive_object_delay_frac) == 0) + delay(1); + if (drro->drr_type == DMU_OT_NONE || !DMU_OT_IS_VALID(drro->drr_type) || !DMU_OT_IS_VALID(drro->drr_bonustype) || @@ -917,6 +1105,37 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, return (SET_ERROR(EINVAL)); } + if (rwa->raw) { + /* + * We should have received a DRR_OBJECT_RANGE record + * containing this block and stored it in rwa. + */ + if (drro->drr_object < rwa->or_firstobj || + drro->drr_object >= rwa->or_firstobj + rwa->or_numslots || + drro->drr_raw_bonuslen < drro->drr_bonuslen || + drro->drr_indblkshift > SPA_MAXBLOCKSHIFT || + drro->drr_nlevels > DN_MAX_LEVELS || + drro->drr_nblkptr > DN_MAX_NBLKPTR || + DN_SLOTS_TO_BONUSLEN(drro->drr_dn_slots) < + drro->drr_raw_bonuslen) + return (SET_ERROR(EINVAL)); + } else { + + /* + * The DRR_OBJECT_SPILL flag is valid when the DRR_BEGIN + * record indicates this by setting DRR_FLAG_SPILL_BLOCK. + */ + if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) || + (!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) { + return (SET_ERROR(EINVAL)); + } + + if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 || + drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) { + return (SET_ERROR(EINVAL)); + } + } + err = dmu_object_info(rwa->os, drro->drr_object, &doi); if (err != 0 && err != ENOENT && err != EEXIST) @@ -929,20 +1148,86 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, * If we are losing blkptrs or changing the block size this must * be a new file instance. We must clear out the previous file * contents before we can change this type of metadata in the dnode. + * Raw receives will also check that the indirect structure of the + * dnode hasn't changed. */ if (err == 0) { - int nblkptr; + uint32_t indblksz = drro->drr_indblkshift ? + 1ULL << drro->drr_indblkshift : 0; + int nblkptr = deduce_nblkptr(drro->drr_bonustype, + drro->drr_bonuslen); + boolean_t did_free = B_FALSE; object = drro->drr_object; - nblkptr = deduce_nblkptr(drro->drr_bonustype, - drro->drr_bonuslen); + /* nblkptr should be bounded by the bonus size and type */ + if (rwa->raw && nblkptr != drro->drr_nblkptr) + return (SET_ERROR(EINVAL)); + /* + * Check for indicators that the object was freed and + * reallocated. For all sends, these indicators are: + * - A changed block size + * - A smaller nblkptr + * - A changed dnode size + * For raw sends we also check a few other fields to + * ensure we are preserving the objset structure exactly + * as it was on the receive side: + * - A changed indirect block size + * - A smaller nlevels + */ if (drro->drr_blksz != doi.doi_data_block_size || nblkptr < doi.doi_nblkptr || + dn_slots != doi.doi_dnodesize >> DNODE_SHIFT || + (rwa->raw && + (indblksz != doi.doi_metadata_block_size || + drro->drr_nlevels < doi.doi_indirection))) { + err = dmu_free_long_range(rwa->os, + drro->drr_object, 0, DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + else + did_free = B_TRUE; + } + + /* + * The dmu does not currently support decreasing nlevels + * or changing the number of dnode slots on an object. For + * non-raw sends, this does not matter and the new object + * can just use the previous one's nlevels. For raw sends, + * however, the structure of the received dnode (including + * nlevels and dnode slots) must match that of the send + * side. Therefore, instead of using dmu_object_reclaim(), + * we must free the object completely and call + * dmu_object_claim_dnsize() instead. + */ + if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) || dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) { + err = dmu_free_long_object(rwa->os, drro->drr_object); + if (err != 0) + return (SET_ERROR(EINVAL)); + + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + object = DMU_NEW_OBJECT; + } + + /* + * For raw receives, free everything beyond the new incoming + * maxblkid. Normally this would be done with a DRR_FREE + * record that would come after this DRR_OBJECT record is + * processed. However, for raw receives we manually set the + * maxblkid from the drr_maxblkid and so we must first free + * everything above that blkid to ensure the DMU is always + * consistent with itself. We will never free the first block + * of the object here because a maxblkid of 0 could indicate + * an object with a single block or one with no blocks. This + * free may be skipped when dmu_free_long_range() was called + * above since it covers the entire object's contents. + */ + if (rwa->raw && object != DMU_NEW_OBJECT && !did_free) { err = dmu_free_long_range(rwa->os, drro->drr_object, - 0, DMU_OBJECT_END); + (drro->drr_maxblkid + 1) * doi.doi_data_block_size, + DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } @@ -955,7 +1240,11 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, * earlier in the stream. */ txg_wait_synced(dmu_objset_pool(rwa->os), 0); - object = drro->drr_object; + if (dmu_object_info(rwa->os, drro->drr_object, NULL) != ENOENT) + return (SET_ERROR(EINVAL)); + + /* object was freed and we are about to allocate a new one */ + object = DMU_NEW_OBJECT; } else { /* object is free and we are about to allocate a new one */ object = DMU_NEW_OBJECT; @@ -995,6 +1284,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, tx = dmu_tx_create(rwa->os); dmu_tx_hold_bonus(tx, object); + dmu_tx_hold_write(tx, object, 0, 0); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); @@ -1002,7 +1292,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, } if (object == DMU_NEW_OBJECT) { - /* currently free, want to be allocated */ + /* Currently free, wants to be allocated */ err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, @@ -1010,39 +1300,116 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, } else if (drro->drr_type != doi.doi_type || drro->drr_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || - drro->drr_bonuslen != doi.doi_bonus_size || - drro->drr_dn_slots != (doi.doi_dnodesize >> DNODE_SHIFT)) { - /* currently allocated, but with different properties */ + drro->drr_bonuslen != doi.doi_bonus_size) { + /* Currently allocated, but with different properties */ err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, - drro->drr_dn_slots << DNODE_SHIFT, tx); + dn_slots << DNODE_SHIFT, rwa->spill ? + DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx); + } else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) { + /* + * Currently allocated, the existing version of this object + * may reference a spill block that is no longer allocated + * at the source and needs to be freed. + */ + err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx); } + if (err != 0) { dmu_tx_commit(tx); return (SET_ERROR(EINVAL)); } + if (rwa->or_crypt_params_present) { + /* + * Set the crypt params for the buffer associated with this + * range of dnodes. This causes the blkptr_t to have the + * same crypt params (byteorder, salt, iv, mac) as on the + * sending side. + * + * Since we are committing this tx now, it is possible for + * the dnode block to end up on-disk with the incorrect MAC, + * if subsequent objects in this block are received in a + * different txg. However, since the dataset is marked as + * inconsistent, no code paths will do a non-raw read (or + * decrypt the block / verify the MAC). The receive code and + * scrub code can safely do raw reads and verify the + * checksum. They don't need to verify the MAC. + */ + dmu_buf_t *db = NULL; + uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE; + + err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os), + offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT); + if (err != 0) { + dmu_tx_commit(tx); + return (SET_ERROR(EINVAL)); + } + + dmu_buf_set_crypt_params(db, rwa->or_byteorder, + rwa->or_salt, rwa->or_iv, rwa->or_mac, tx); + + dmu_buf_rele(db, FTAG); + + rwa->or_crypt_params_present = B_FALSE; + } + dmu_object_set_checksum(rwa->os, drro->drr_object, drro->drr_checksumtype, tx); dmu_object_set_compress(rwa->os, drro->drr_object, drro->drr_compress, tx); + /* handle more restrictive dnode structuring for raw recvs */ + if (rwa->raw) { + /* + * Set the indirect block size, block shift, nlevels. + * This will not fail because we ensured all of the + * blocks were freed earlier if this is a new object. + * For non-new objects block size and indirect block + * shift cannot change and nlevels can only increase. + */ + VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object, + drro->drr_blksz, drro->drr_indblkshift, tx)); + VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object, + drro->drr_nlevels, tx)); + + /* + * Set the maxblkid. This will always succeed because + * we freed all blocks beyond the new maxblkid above. + */ + VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object, + drro->drr_maxblkid, tx)); + } + if (data != NULL) { dmu_buf_t *db; + dnode_t *dn; + uint32_t flags = DMU_READ_NO_PREFETCH; + + if (rwa->raw) + flags |= DMU_READ_NO_DECRYPT; + + VERIFY0(dnode_hold(rwa->os, drro->drr_object, FTAG, &dn)); + VERIFY0(dmu_bonus_hold_by_dnode(dn, FTAG, &db, flags)); - VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); - bcopy(data, db->db_data, drro->drr_bonuslen); - if (rwa->byteswap) { + bcopy(data, db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro)); + + /* + * Raw bonus buffers have their byteorder determined by the + * DRR_OBJECT_RANGE record. + */ + if (rwa->byteswap && !rwa->raw) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); dmu_ot_byteswap[byteswap].ob_func(db->db_data, - drro->drr_bonuslen); + DRR_OBJECT_PAYLOAD_SIZE(drro)); } dmu_buf_rele(db, FTAG); + dnode_rele(dn, FTAG); } dmu_tx_commit(tx); @@ -1063,15 +1430,17 @@ receive_freeobjects(struct receive_writer_arg *rwa, for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0; next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { + dmu_object_info_t doi; int err; - err = dmu_object_info(rwa->os, obj, NULL); + err = dmu_object_info(rwa->os, obj, &doi); if (err == ENOENT) continue; else if (err != 0) return (err); err = dmu_free_long_object(rwa->os, obj); + if (err != 0) return (err); @@ -1087,8 +1456,9 @@ static int receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, arc_buf_t *abuf) { - dmu_tx_t *tx; int err; + dmu_tx_t *tx; + dnode_t *dn; if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || !DMU_OT_IS_VALID(drrw->drr_type)) @@ -1113,7 +1483,6 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, return (SET_ERROR(EINVAL)); tx = dmu_tx_create(rwa->os); - dmu_tx_hold_write(tx, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); err = dmu_tx_assign(tx, TXG_WAIT); @@ -1121,18 +1490,23 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, dmu_tx_abort(tx); return (err); } - if (rwa->byteswap) { + + if (rwa->byteswap && !arc_is_encrypted(abuf) && + arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, DRR_WRITE_PAYLOAD_SIZE(drrw)); } - /* use the bonus buf to look up the dnode in dmu_assign_arcbuf */ - dmu_buf_t *bonus; - if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) - return (SET_ERROR(EINVAL)); - dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); + VERIFY0(dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn)); + err = dmu_assign_arcbuf_by_dnode(dn, drrw->drr_offset, abuf, tx); + if (err != 0) { + dnode_rele(dn, FTAG); + dmu_tx_commit(tx); + return (err); + } + dnode_rele(dn, FTAG); /* * Note: If the receive fails, we want the resume stream to start @@ -1142,7 +1516,6 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw, */ save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); dmu_tx_commit(tx); - dmu_buf_rele(bonus, FTAG); return (0); } @@ -1164,6 +1537,7 @@ receive_write_byref(struct receive_writer_arg *rwa, guid_map_entry_t *gmep; avl_index_t where; objset_t *ref_os = NULL; + int flags = DMU_READ_PREFETCH; dmu_buf_t *dbp; if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) @@ -1188,8 +1562,12 @@ receive_write_byref(struct receive_writer_arg *rwa, if (drrwbr->drr_object > rwa->max_object) rwa->max_object = drrwbr->drr_object; + if (rwa->raw) + flags |= DMU_READ_NO_DECRYPT; + + /* may return either a regular db or an encrypted one */ err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, - drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); + drrwbr->drr_refoffset, FTAG, &dbp, flags); if (err != 0) return (err); @@ -1202,8 +1580,14 @@ receive_write_byref(struct receive_writer_arg *rwa, dmu_tx_abort(tx); return (err); } - dmu_write(rwa->os, drrwbr->drr_object, - drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); + + if (rwa->raw) { + dmu_copy_from_buf(rwa->os, drrwbr->drr_object, + drrwbr->drr_offset, dbp, tx); + } else { + dmu_write(rwa->os, drrwbr->drr_object, + drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); + } dmu_buf_rele(dbp, FTAG); /* See comment in restore_write. */ @@ -1229,6 +1613,8 @@ receive_write_embedded(struct receive_writer_arg *rwa, return (EINVAL); if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); + if (rwa->raw) + return (SET_ERROR(EINVAL)); if (drrwe->drr_object > rwa->max_object) rwa->max_object = drrwe->drr_object; @@ -1256,16 +1642,37 @@ receive_write_embedded(struct receive_writer_arg *rwa, static int receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, - void *data) + arc_buf_t *abuf) { dmu_tx_t *tx; dmu_buf_t *db, *db_spill; int err; + uint32_t flags = 0; if (drrs->drr_length < SPA_MINBLOCKSIZE || drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) return (SET_ERROR(EINVAL)); + /* + * This is an unmodified spill block which was added to the stream + * to resolve an issue with incorrectly removing spill blocks. It + * should be ignored by current versions of the code which support + * the DRR_FLAG_SPILL_BLOCK flag. + */ + if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) { + dmu_return_arcbuf(abuf); + return (0); + } + + if (rwa->raw) { + if (!DMU_OT_IS_VALID(drrs->drr_type) || + drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS || + drrs->drr_compressed_size == 0) + return (SET_ERROR(EINVAL)); + + flags |= DMU_READ_NO_DECRYPT; + } + if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); @@ -1273,7 +1680,8 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, rwa->max_object = drrs->drr_object; VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); - if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { + if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG, + &db_spill)) != 0) { dmu_buf_rele(db, FTAG); return (err); } @@ -1289,12 +1697,27 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, dmu_tx_abort(tx); return (err); } - dmu_buf_will_dirty(db_spill, tx); - if (db_spill->db_size < drrs->drr_length) + /* + * Spill blocks may both grow and shrink. When a change in size + * occurs any existing dbuf must be updated to match the logical + * size of the provided arc_buf_t. + */ + if (db_spill->db_size != drrs->drr_length) { + dmu_buf_will_fill(db_spill, tx); VERIFY(0 == dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); - bcopy(data, db_spill->db_data, drrs->drr_length); + } + + if (rwa->byteswap && !arc_is_encrypted(abuf) && + arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrs->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, + DRR_SPILL_PAYLOAD_SIZE(drrs)); + } + + dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); @@ -1309,7 +1732,7 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) { int err; - if (drrf->drr_length != -1ULL && + if (drrf->drr_length != DMU_OBJECT_END && drrf->drr_offset + drrf->drr_length < drrf->drr_offset) return (SET_ERROR(EINVAL)); @@ -1325,18 +1748,81 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) return (err); } +static int +receive_object_range(struct receive_writer_arg *rwa, + struct drr_object_range *drror) +{ + /* + * By default, we assume this block is in our native format + * (ZFS_HOST_BYTEORDER). We then take into account whether + * the send stream is byteswapped (rwa->byteswap). Finally, + * we need to byteswap again if this particular block was + * in non-native format on the send side. + */ + boolean_t byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^ + !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags); + + /* + * Since dnode block sizes are constant, we should not need to worry + * about making sure that the dnode block size is the same on the + * sending and receiving sides for the time being. For non-raw sends, + * this does not matter (and in fact we do not send a DRR_OBJECT_RANGE + * record at all). Raw sends require this record type because the + * encryption parameters are used to protect an entire block of bonus + * buffers. If the size of dnode blocks ever becomes variable, + * handling will need to be added to ensure that dnode block sizes + * match on the sending and receiving side. + */ + if (drror->drr_numslots != DNODES_PER_BLOCK || + P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0 || + !rwa->raw) + return (SET_ERROR(EINVAL)); + + if (drror->drr_firstobj > rwa->max_object) + rwa->max_object = drror->drr_firstobj; + + /* + * The DRR_OBJECT_RANGE handling must be deferred to receive_object() + * so that the block of dnodes is not written out when it's empty, + * and converted to a HOLE BP. + */ + rwa->or_crypt_params_present = B_TRUE; + rwa->or_firstobj = drror->drr_firstobj; + rwa->or_numslots = drror->drr_numslots; + bcopy(drror->drr_salt, rwa->or_salt, ZIO_DATA_SALT_LEN); + bcopy(drror->drr_iv, rwa->or_iv, ZIO_DATA_IV_LEN); + bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN); + rwa->or_byteorder = byteorder; + + return (0); +} + /* used to destroy the drc_ds on error */ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { - if (drc->drc_resumable) { - /* wait for our resume state to be written to disk */ - txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0); - dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + dsl_dataset_t *ds = drc->drc_ds; + ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT; + + /* + * Wait for the txg sync before cleaning up the receive. For + * resumable receives, this ensures that our resume state has + * been written out to disk. For raw receives, this ensures + * that the user accounting code will not attempt to do anything + * after we stopped receiving the dataset. + */ + txg_wait_synced(ds->ds_dir->dd_pool, 0); + ds->ds_objset->os_raw_receive = B_FALSE; + + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + if (drc->drc_resumable && !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) { + rrw_exit(&ds->ds_bp_rwlock, FTAG); + dsl_dataset_disown(ds, dsflags, dmu_recv_tag); } else { char name[ZFS_MAX_DATASET_NAME_LEN]; - dsl_dataset_name(drc->drc_ds, name); - dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + dsl_dataset_name(ds, name); + dsl_dataset_disown(ds, dsflags, dmu_recv_tag); (void) dsl_destroy_head(name); } } @@ -1384,6 +1870,7 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf) err = receive_read(ra, sizeof (ra->next_rrd->header), &ra->next_rrd->header); ra->next_rrd->bytes_read = ra->bytes_read; + if (err != 0) { kmem_free(ra->next_rrd, sizeof (*ra->next_rrd)); ra->next_rrd = NULL; @@ -1525,9 +2012,13 @@ receive_read_record(struct receive_arg *ra) case DRR_OBJECT: { struct drr_object *drro = &ra->rrd->header.drr_u.drr_object; - uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8); - void *buf = kmem_zalloc(size, KM_SLEEP); + uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro); + void *buf = NULL; dmu_object_info_t doi; + + if (size != 0) + buf = kmem_zalloc(size, KM_SLEEP); + err = receive_read_payload_and_next_header(ra, size, buf); if (err != 0) { kmem_free(buf, size); @@ -1538,7 +2029,7 @@ receive_read_record(struct receive_arg *ra) * See receive_read_prefetch for an explanation why we're * storing this object in the ignore_obj_list. */ - if (err == ENOENT || + if (err == ENOENT || err == EEXIST || (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { objlist_insert(&ra->ignore_objlist, drro->drr_object); err = 0; @@ -1555,7 +2046,18 @@ receive_read_record(struct receive_arg *ra) struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write; arc_buf_t *abuf; boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); - if (DRR_WRITE_COMPRESSED(drrw)) { + + if (ra->raw) { + boolean_t byteorder = ZFS_HOST_BYTEORDER ^ + !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ + ra->byteswap; + + abuf = arc_loan_raw_buf(dmu_objset_spa(ra->os), + drrw->drr_object, byteorder, drrw->drr_salt, + drrw->drr_iv, drrw->drr_mac, drrw->drr_type, + drrw->drr_compressed_size, drrw->drr_logical_size, + drrw->drr_compressiontype); + } else if (DRR_WRITE_COMPRESSED(drrw)) { ASSERT3U(drrw->drr_compressed_size, >, 0); ASSERT3U(drrw->drr_logical_size, >=, drrw->drr_compressed_size); @@ -1575,7 +2077,7 @@ receive_read_record(struct receive_arg *ra) dmu_return_arcbuf(abuf); return (err); } - ra->rrd->write_buf = abuf; + ra->rrd->arc_buf = abuf; receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset, drrw->drr_logical_size); return (err); @@ -1625,11 +2127,38 @@ receive_read_record(struct receive_arg *ra) case DRR_SPILL: { struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill; - void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP); - err = receive_read_payload_and_next_header(ra, drrs->drr_length, - buf); - if (err != 0) - kmem_free(buf, drrs->drr_length); + arc_buf_t *abuf; + int len = DRR_SPILL_PAYLOAD_SIZE(drrs); + + /* DRR_SPILL records are either raw or uncompressed */ + if (ra->raw) { + boolean_t byteorder = ZFS_HOST_BYTEORDER ^ + !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^ + ra->byteswap; + + abuf = arc_loan_raw_buf(dmu_objset_spa(ra->os), + dmu_objset_id(ra->os), byteorder, drrs->drr_salt, + drrs->drr_iv, drrs->drr_mac, drrs->drr_type, + drrs->drr_compressed_size, drrs->drr_length, + drrs->drr_compressiontype); + } else { + abuf = arc_loan_buf(dmu_objset_spa(ra->os), + DMU_OT_IS_METADATA(drrs->drr_type), + drrs->drr_length); + } + + err = receive_read_payload_and_next_header(ra, len, + abuf->b_data); + if (err != 0) { + dmu_return_arcbuf(abuf); + return (err); + } + ra->rrd->arc_buf = abuf; + return (err); + } + case DRR_OBJECT_RANGE: + { + err = receive_read_payload_and_next_header(ra, 0, NULL); return (err); } default: @@ -1668,11 +2197,11 @@ receive_process_record(struct receive_writer_arg *rwa, case DRR_WRITE: { struct drr_write *drrw = &rrd->header.drr_u.drr_write; - err = receive_write(rwa, drrw, rrd->write_buf); + err = receive_write(rwa, drrw, rrd->arc_buf); /* if receive_write() is successful, it consumes the arc_buf */ if (err != 0) - dmu_return_arcbuf(rrd->write_buf); - rrd->write_buf = NULL; + dmu_return_arcbuf(rrd->arc_buf); + rrd->arc_buf = NULL; rrd->payload = NULL; return (err); } @@ -1699,11 +2228,20 @@ receive_process_record(struct receive_writer_arg *rwa, case DRR_SPILL: { struct drr_spill *drrs = &rrd->header.drr_u.drr_spill; - err = receive_spill(rwa, drrs, rrd->payload); - kmem_free(rrd->payload, rrd->payload_size); + err = receive_spill(rwa, drrs, rrd->arc_buf); + /* if receive_spill() is successful, it consumes the arc_buf */ + if (err != 0) + dmu_return_arcbuf(rrd->arc_buf); + rrd->arc_buf = NULL; rrd->payload = NULL; return (err); } + case DRR_OBJECT_RANGE: + { + struct drr_object_range *drror = + &rrd->header.drr_u.drr_object_range; + return (receive_object_range(rwa, drror)); + } default: return (SET_ERROR(EINVAL)); } @@ -1727,9 +2265,9 @@ receive_writer_thread(void *arg) */ if (rwa->err == 0) { rwa->err = receive_process_record(rwa, rrd); - } else if (rrd->write_buf != NULL) { - dmu_return_arcbuf(rrd->write_buf); - rrd->write_buf = NULL; + } else if (rrd->arc_buf != NULL) { + dmu_return_arcbuf(rrd->arc_buf); + rrd->arc_buf = NULL; rrd->payload = NULL; } else if (rrd->payload != NULL) { kmem_free(rrd->payload, rrd->payload_size); @@ -1794,6 +2332,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, nvlist_t *begin_nvl = NULL; ra.byteswap = drc->drc_byteswap; + ra.raw = drc->drc_raw; ra.cksum = drc->drc_cksum; ra.vp = vp; ra.voff = *voffp; @@ -1819,17 +2358,21 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); + ra.featureflags = featureflags; + + ASSERT0(ra.os->os_encrypted && + (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)); /* if this stream is dedup'ed, set up the avl tree for guid mapping */ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { minor_t minor; if (cleanup_fd == -1) { - ra.err = SET_ERROR(EBADF); + err = SET_ERROR(EBADF); goto out; } - ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); - if (ra.err != 0) { + err = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (err != 0) { cleanup_fd = -1; goto out; } @@ -1843,12 +2386,12 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, err = zfs_onexit_add_cb(minor, free_guid_map_onexit, rwa.guid_to_ds_map, action_handlep); - if (ra.err != 0) + if (err != 0) goto out; } else { err = zfs_onexit_cb_data(minor, *action_handlep, (void **)&rwa.guid_to_ds_map); - if (ra.err != 0) + if (err != 0) goto out; } @@ -1873,6 +2416,38 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, goto out; } + /* handle DSL encryption key payload */ + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + nvlist_t *keynvl = NULL; + + ASSERT(ra.os->os_encrypted); + ASSERT(drc->drc_raw); + + err = nvlist_lookup_nvlist(begin_nvl, "crypt_keydata", &keynvl); + if (err != 0) + goto out; + + /* + * If this is a new dataset we set the key immediately. + * Otherwise we don't want to change the key until we + * are sure the rest of the receive succeeded so we stash + * the keynvl away until then. + */ + err = dsl_crypto_recv_raw(spa_name(ra.os->os_spa), + drc->drc_ds->ds_object, drc->drc_fromsnapobj, + drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); + if (err != 0) + goto out; + + /* see comment in dmu_recv_end_sync() */ + drc->drc_ivset_guid = 0; + (void) nvlist_lookup_uint64(keynvl, "to_ivset_guid", + &drc->drc_ivset_guid); + + if (!drc->drc_newfs) + drc->drc_keynvl = fnvlist_dup(keynvl); + } + if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { err = resume_check(&ra, begin_nvl); if (err != 0) @@ -1886,6 +2461,9 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, rwa.os = ra.os; rwa.byteswap = drc->drc_byteswap; rwa.resumable = drc->drc_resumable; + rwa.raw = drc->drc_raw; + rwa.spill = drc->drc_spill; + rwa.os->os_raw_receive = drc->drc_raw; (void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, curproc, TS_RUN, minclsyspri); @@ -1926,10 +2504,10 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, sizeof (struct receive_record_arg) + ra.rrd->payload_size); ra.rrd = NULL; } - if (ra.next_rrd == NULL) - ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP); - ra.next_rrd->eos_marker = B_TRUE; - bqueue_enqueue(&rwa.q, ra.next_rrd, 1); + ASSERT3P(ra.rrd, ==, NULL); + ra.rrd = kmem_zalloc(sizeof (*ra.rrd), KM_SLEEP); + ra.rrd->eos_marker = B_TRUE; + bqueue_enqueue(&rwa.q, ra.rrd, 1); mutex_enter(&rwa.mutex); while (!rwa.done) { @@ -1979,6 +2557,14 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp, err = rwa.err; out: + /* + * If we hit an error before we started the receive_writer_thread + * we need to clean up the next_rrd we create by processing the + * DRR_BEGIN record. + */ + if (ra.next_rrd != NULL) + kmem_free(ra.next_rrd, sizeof (*ra.next_rrd)); + nvlist_free(begin_nvl); if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) zfs_onexit_fd_rele(cleanup_fd); @@ -1990,6 +2576,7 @@ out: * the inconsistent state. */ dmu_recv_cleanup_ds(drc); + nvlist_free(drc->drc_keynvl); } *voffp = ra.voff; @@ -2045,6 +2632,15 @@ dmu_recv_end_check(void *arg, dmu_tx_t *tx) return (error); } } + if (drc->drc_keynvl != NULL) { + error = dsl_crypto_recv_raw_key_check(drc->drc_ds, + drc->drc_keynvl, tx); + if (error != 0) { + dsl_dataset_rele(origin_head, FTAG); + return (error); + } + } + error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, origin_head, drc->drc_force, drc->drc_owner, tx); if (error != 0) { @@ -2070,9 +2666,11 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) { dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); + boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0; spa_history_log_internal_ds(drc->drc_ds, "finish receiving", tx, "snap=%s", drc->drc_tosnap); + drc->drc_ds->ds_objset->os_raw_receive = B_FALSE; if (!drc->drc_newfs) { dsl_dataset_t *origin_head; @@ -2100,8 +2698,14 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) dsl_dataset_rele(snap, FTAG); } } - VERIFY3P(drc->drc_ds->ds_prev, ==, - origin_head->ds_prev); + if (drc->drc_keynvl != NULL) { + dsl_crypto_recv_raw_key_sync(drc->drc_ds, + drc->drc_keynvl, tx); + nvlist_free(drc->drc_keynvl); + drc->drc_keynvl = NULL; + } + + VERIFY3P(drc->drc_ds->ds_prev, ==, origin_head->ds_prev); dsl_dataset_clone_swap_sync_impl(drc->drc_ds, origin_head, tx); @@ -2162,21 +2766,50 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx) drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; } + + /* + * If this is a raw receive, the crypt_keydata nvlist will include + * a to_ivset_guid for us to set on the new snapshot. This value + * will override the value generated by the snapshot code. However, + * this value may not be present, because older implementations of + * the raw send code did not include this value, and we are still + * allowed to receive them if the zfs_disable_ivset_guid_check + * tunable is set, in which case we will leave the newly-generated + * value. + */ + if (drc->drc_raw && drc->drc_ivset_guid != 0) { + dmu_object_zapify(dp->dp_meta_objset, drc->drc_newsnapobj, + DMU_OT_DSL_DATASET, tx); + VERIFY0(zap_update(dp->dp_meta_objset, drc->drc_newsnapobj, + DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1, + &drc->drc_ivset_guid, tx)); + } + /* * Release the hold from dmu_recv_begin. This must be done before - * we return to open context, so that when we free the dataset's dnode, - * we can evict its bonus buffer. + * we return to open context, so that when we free the dataset's dnode + * we can evict its bonus buffer. Since the dataset may be destroyed + * at this point (and therefore won't have a valid pointer to the spa) + * we release the key mapping manually here while we do have a valid + * pointer, if it exists. */ - dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); + if (!drc->drc_raw && encrypted) { + (void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa, + drc->drc_ds->ds_object, drc->drc_ds); + } + dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag); drc->drc_ds = NULL; } static int -add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) +add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj, + boolean_t raw) { dsl_pool_t *dp; dsl_dataset_t *snapds; guid_map_entry_t *gmep; + objset_t *os; + ds_hold_flags_t dsflags = (raw) ? 0 : DS_HOLD_FLAG_DECRYPT; int err; ASSERT(guid_map != NULL); @@ -2185,12 +2818,29 @@ add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) if (err != 0) return (err); gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); - err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); + err = dsl_dataset_own_obj(dp, snapobj, dsflags, gmep, &snapds); if (err == 0) { + /* + * If this is a deduplicated raw send stream, we need + * to make sure that we can still read raw blocks from + * earlier datasets in the stream, so we set the + * os_raw_receive flag now. + */ + if (raw) { + err = dmu_objset_from_ds(snapds, &os); + if (err != 0) { + dsl_dataset_disown(snapds, dsflags, FTAG); + dsl_pool_rele(dp, FTAG); + kmem_free(gmep, sizeof (*gmep)); + return (err); + } + os->os_raw_receive = B_TRUE; + } + + gmep->raw = raw; gmep->guid = dsl_dataset_phys(snapds)->ds_guid; gmep->gme_ds = snapds; avl_add(guid_map, gmep); - dsl_dataset_long_hold(snapds, gmep); } else { kmem_free(gmep, sizeof (*gmep)); } @@ -2241,10 +2891,10 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) if (error != 0) { dmu_recv_cleanup_ds(drc); + nvlist_free(drc->drc_keynvl); } else if (drc->drc_guid_to_ds_map != NULL) { - (void) add_ds_to_guidmap(drc->drc_tofs, - drc->drc_guid_to_ds_map, - drc->drc_newsnapobj); + (void) add_ds_to_guidmap(drc->drc_tofs, drc->drc_guid_to_ds_map, + drc->drc_newsnapobj, drc->drc_raw); } return (error); } diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c index 6d65086079..bfc0a6f585 100644 --- a/usr/src/uts/common/fs/zfs/dmu_send.c +++ b/usr/src/uts/common/fs/zfs/dmu_send.c @@ -61,6 +61,8 @@ int zfs_send_corrupt_data = B_FALSE; int zfs_send_queue_length = 16 * 1024 * 1024; /* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */ int zfs_send_set_freerecords_bit = B_TRUE; +/* Set this tunable to FALSE is disable sending unmodified spill blocks. */ +int zfs_send_unmodified_spill_blocks = B_TRUE; /* * Use this to override the recordsize calculation for fast zfs send estimates. @@ -90,6 +92,8 @@ struct send_block_record { bqueue_node_t ln; }; +static int do_dump(dmu_sendarg_t *dsa, struct send_block_record *data); + static int dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) { @@ -97,18 +101,17 @@ dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) ssize_t resid; /* have to get resid to get detailed errno */ /* - * The code does not rely on this (len being a multiple of 8). We keep + * The code does not rely on len being a multiple of 8. We keep * this assertion because of the corresponding assertion in * receive_read(). Keeping this assertion ensures that we do not * inadvertently break backwards compatibility (causing the assertion - * in receive_read() to trigger on old software). - * - * Removing the assertions could be rolled into a new feature that uses - * data that isn't 8-byte aligned; if the assertions were removed, a - * feature flag would have to be added. + * in receive_read() to trigger on old software). Newer feature flags + * (such as raw send) may break this assertion since they were + * introduced after the requirement was made obsolete. */ - ASSERT0(len % 8); + ASSERT(len % 8 == 0 || + (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0); dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp, (caddr_t)buf, len, @@ -189,9 +192,6 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); - if (length != -1ULL && offset + length < offset) - length = -1ULL; - /* * If there is a pending op, but it's not PENDING_FREE, push it out, * since free block aggregation can only be done for blocks of the @@ -208,19 +208,22 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, if (dsp->dsa_pending_op == PENDING_FREE) { /* - * There should never be a PENDING_FREE if length is -1 - * (because dump_dnode is the only place where this - * function is called with a -1, and only after flushing - * any pending record). + * There should never be a PENDING_FREE if length is + * DMU_OBJECT_END (because dump_dnode is the only place where + * this function is called with a DMU_OBJECT_END, and only after + * flushing any pending record). */ - ASSERT(length != -1ULL); + ASSERT(length != DMU_OBJECT_END); /* * Check to see whether this free block can be aggregated * with pending one. */ if (drrf->drr_object == object && drrf->drr_offset + drrf->drr_length == offset) { - drrf->drr_length += length; + if (offset + length < offset) + drrf->drr_length = DMU_OBJECT_END; + else + drrf->drr_length += length; return (0); } else { /* not a continuation. Push out pending record */ @@ -234,9 +237,12 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, dsp->dsa_drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; - drrf->drr_length = length; + if (offset + length < offset) + drrf->drr_length = DMU_OBJECT_END; + else + drrf->drr_length = length; drrf->drr_toguid = dsp->dsa_toguid; - if (length == -1ULL) { + if (length == DMU_OBJECT_END) { if (dump_record(dsp, NULL, 0) != 0) return (SET_ERROR(EINTR)); } else { @@ -247,11 +253,11 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, } static int -dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, - uint64_t object, uint64_t offset, int lsize, int psize, const blkptr_t *bp, - void *data) +dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, + uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data) { uint64_t payload_size; + boolean_t raw = (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW); struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); /* @@ -284,16 +290,36 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, drrw->drr_toguid = dsp->dsa_toguid; drrw->drr_logical_size = lsize; - /* only set the compression fields if the buf is compressed */ - if (lsize != psize) { - ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED); + /* only set the compression fields if the buf is compressed or raw */ + if (raw || lsize != psize) { ASSERT(!BP_IS_EMBEDDED(bp)); - ASSERT(!BP_SHOULD_BYTESWAP(bp)); - ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); - ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF); ASSERT3S(psize, >, 0); - ASSERT3S(lsize, >=, psize); + if (raw) { + ASSERT(BP_IS_PROTECTED(bp)); + + /* + * This is a raw protected block so we need to pass + * along everything the receiving side will need to + * interpret this block, including the byteswap, salt, + * IV, and MAC. + */ + if (BP_SHOULD_BYTESWAP(bp)) + drrw->drr_flags |= DRR_RAW_BYTESWAP; + zio_crypt_decode_params_bp(bp, drrw->drr_salt, + drrw->drr_iv); + zio_crypt_decode_mac_bp(bp, drrw->drr_mac); + } else { + /* this is a compressed block */ + ASSERT(dsp->dsa_featureflags & + DMU_BACKUP_FEATURE_COMPRESSED); + ASSERT(!BP_SHOULD_BYTESWAP(bp)); + ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); + ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF); + ASSERT3S(lsize, >=, psize); + } + + /* set fields common to compressed and raw sends */ drrw->drr_compressiontype = BP_GET_COMPRESS(bp); drrw->drr_compressed_size = psize; payload_size = drrw->drr_compressed_size; @@ -301,22 +327,23 @@ dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, payload_size = drrw->drr_logical_size; } - if (bp == NULL || BP_IS_EMBEDDED(bp)) { + if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) { /* - * There's no pre-computed checksum for partial-block - * writes or embedded BP's, so (like - * fletcher4-checkummed blocks) userland will have to - * compute a dedup-capable checksum itself. + * There's no pre-computed checksum for partial-block writes, + * embedded BP's, or encrypted BP's that are being sent as + * plaintext, so (like fletcher4-checkummed blocks) userland + * will have to compute a dedup-capable checksum itself. */ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; } else { drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); if (zio_checksum_table[drrw->drr_checksumtype].ci_flags & ZCHECKSUM_FLAG_DEDUP) - drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; + drrw->drr_flags |= DRR_CHECKSUM_DEDUP; DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); + DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp)); drrw->drr_key.ddk_cksum = bp->blk_cksum; } @@ -360,9 +387,11 @@ dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, } static int -dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) +dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data) { struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); + uint64_t blksz = BP_GET_LSIZE(bp); + uint64_t payload_size = blksz; if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_record(dsp, NULL, 0) != 0) @@ -377,7 +406,26 @@ dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) drrs->drr_length = blksz; drrs->drr_toguid = dsp->dsa_toguid; - if (dump_record(dsp, data, blksz) != 0) + /* See comment in dump_dnode() for full details */ + if (zfs_send_unmodified_spill_blocks && + (bp->blk_birth <= dsp->dsa_fromtxg)) { + drrs->drr_flags |= DRR_SPILL_UNMODIFIED; + } + + /* handle raw send fields */ + if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { + ASSERT(BP_IS_PROTECTED(bp)); + + if (BP_SHOULD_BYTESWAP(bp)) + drrs->drr_flags |= DRR_RAW_BYTESWAP; + drrs->drr_compressiontype = BP_GET_COMPRESS(bp); + drrs->drr_compressed_size = BP_GET_PSIZE(bp); + zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv); + zio_crypt_decode_mac_bp(bp, drrs->drr_mac); + payload_size = drrs->drr_compressed_size; + } + + if (dump_record(dsp, data, payload_size) != 0) return (SET_ERROR(EINTR)); return (0); } @@ -429,9 +477,11 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) } static int -dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) +dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, + dnode_phys_t *dnp) { struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); + int bonuslen; if (object < dsp->dsa_resume_object) { /* @@ -472,20 +522,111 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; - if (dump_record(dsp, DN_BONUS(dnp), - P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) { - return (SET_ERROR(EINTR)); + bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8); + + if ((dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW)) { + ASSERT(BP_IS_ENCRYPTED(bp)); + + if (BP_SHOULD_BYTESWAP(bp)) + drro->drr_flags |= DRR_RAW_BYTESWAP; + + /* needed for reconstructing dnp on recv side */ + drro->drr_maxblkid = dnp->dn_maxblkid; + drro->drr_indblkshift = dnp->dn_indblkshift; + drro->drr_nlevels = dnp->dn_nlevels; + drro->drr_nblkptr = dnp->dn_nblkptr; + + /* + * Since we encrypt the entire bonus area, the (raw) part + * beyond the bonuslen is actually nonzero, so we need + * to send it. + */ + if (bonuslen != 0) { + drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp); + bonuslen = drro->drr_raw_bonuslen; + } } + /* + * DRR_OBJECT_SPILL is set for every dnode which references a + * spill block. This allows the receiving pool to definitively + * determine when a spill block should be kept or freed. + */ + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) + drro->drr_flags |= DRR_OBJECT_SPILL; + + if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0) + return (SET_ERROR(EINTR)); + /* Free anything past the end of the file. */ if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * - (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0) return (SET_ERROR(EINTR)); + + /* + * Send DRR_SPILL records for unmodified spill blocks. This is useful + * because changing certain attributes of the object (e.g. blocksize) + * can cause old versions of ZFS to incorrectly remove a spill block. + * Including these records in the stream forces an up to date version + * to always be written ensuring they're never lost. Current versions + * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can + * ignore these unmodified spill blocks. + */ + if (zfs_send_unmodified_spill_blocks && + (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && + (DN_SPILL_BLKPTR(dnp)->blk_birth <= dsp->dsa_fromtxg)) { + struct send_block_record record; + + bzero(&record, sizeof (struct send_block_record)); + record.eos_marker = B_FALSE; + record.bp = *DN_SPILL_BLKPTR(dnp); + SET_BOOKMARK(&(record.zb), dmu_objset_id(dsp->dsa_os), + object, 0, DMU_SPILL_BLKID); + + if (do_dump(dsp, &record) != 0) + return (SET_ERROR(EINTR)); + } + if (dsp->dsa_err != 0) return (SET_ERROR(EINTR)); return (0); } +static int +dump_object_range(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t firstobj, + uint64_t numslots) +{ + struct drr_object_range *drror = + &(dsp->dsa_drr->drr_u.drr_object_range); + + /* we only use this record type for raw sends */ + ASSERT(BP_IS_PROTECTED(bp)); + ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW); + ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); + ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE); + ASSERT0(BP_GET_LEVEL(bp)); + + if (dsp->dsa_pending_op != PENDING_NONE) { + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dsp->dsa_pending_op = PENDING_NONE; + } + + bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); + dsp->dsa_drr->drr_type = DRR_OBJECT_RANGE; + drror->drr_firstobj = firstobj; + drror->drr_numslots = numslots; + drror->drr_toguid = dsp->dsa_toguid; + if (BP_SHOULD_BYTESWAP(bp)) + drror->drr_flags |= DRR_RAW_BYTESWAP; + zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv); + zio_crypt_decode_mac_bp(bp, drror->drr_mac); + + if (dump_record(dsp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + return (0); +} + static boolean_t backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) { @@ -529,6 +670,7 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= sta->resume.zb_object); + ASSERT3P(sta->ds, !=, NULL); if (sta->cancel) return (SET_ERROR(EINTR)); @@ -601,6 +743,18 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || zb->zb_object >= dsa->dsa_resume_object); + /* + * All bps of an encrypted os should have the encryption bit set. + * If this is not true it indicates tampering and we report an error. + */ + if (dsa->dsa_os->os_encrypted && + !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { + spa_log_error(spa, zb); + zfs_panic_recover("unencrypted block in encrypted " + "object set %llu", ds->ds_object); + return (SET_ERROR(EIO)); + } + if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); @@ -612,40 +766,66 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) } else if (BP_IS_HOLE(bp)) { uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level); uint64_t offset = zb->zb_blkid * span; - err = dump_free(dsa, zb->zb_object, offset, span); + /* Don't dump free records for offsets > DMU_OBJECT_END */ + if (zb->zb_blkid == 0 || span <= DMU_OBJECT_END / zb->zb_blkid) + err = dump_free(dsa, zb->zb_object, offset, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; + enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + + if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { + ASSERT(BP_IS_ENCRYPTED(bp)); + ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); + zioflags |= ZIO_FLAG_RAW; + } ASSERT0(zb->zb_level); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &aflags, zb) != 0) + ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) return (SET_ERROR(EIO)); dnode_phys_t *blk = abuf->b_data; uint64_t dnobj = zb->zb_blkid * epb; - for (int i = 0; i < epb; i += blk[i].dn_extra_slots + 1) { - err = dump_dnode(dsa, dnobj + i, blk + i); - if (err != 0) - break; + + /* + * Raw sends require sending encryption parameters for the + * block of dnodes. Regular sends do not need to send this + * info. + */ + if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { + ASSERT(arc_is_encrypted(abuf)); + err = dump_object_range(dsa, bp, dnobj, epb); + } + + if (err == 0) { + for (int i = 0; i < epb; + i += blk[i].dn_extra_slots + 1) { + err = dump_dnode(dsa, bp, dnobj + i, blk + i); + if (err != 0) + break; + } } arc_buf_destroy(abuf, &abuf); } else if (type == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; - int blksz = BP_GET_LSIZE(bp); + enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + + if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) { + ASSERT(BP_IS_PROTECTED(bp)); + zioflags |= ZIO_FLAG_RAW; + } if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, - &aflags, zb) != 0) + ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) return (SET_ERROR(EIO)); - err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data); + err = dump_spill(dsa, bp, zb->zb_object, abuf->b_data); arc_buf_destroy(abuf, &abuf); } else if (backup_do_embed(dsa, bp)) { /* it's an embedded level-0 block of a regular object */ @@ -667,6 +847,14 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) */ boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE && !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); + + /* + * Raw sends require that we always get raw data as it exists + * on disk, so we assert that we are not splitting blocks here. + */ + boolean_t request_raw = + (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0; + /* * We should only request compressed data from the ARC if all * the following are true: @@ -682,6 +870,8 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); + IMPLY(request_raw, !split_large_blocks); + IMPLY(request_raw, BP_IS_PROTECTED(bp)); ASSERT0(zb->zb_level); ASSERT(zb->zb_object > dsa->dsa_resume_object || (zb->zb_object == dsa->dsa_resume_object && @@ -695,8 +885,11 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) ASSERT3U(blksz, ==, BP_GET_LSIZE(bp)); enum zio_flag zioflags = ZIO_FLAG_CANFAIL; - if (request_compressed) + if (request_raw) zioflags |= ZIO_FLAG_RAW; + else if (request_compressed) + zioflags |= ZIO_FLAG_RAW_COMPRESS; + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) { if (zfs_send_corrupt_data) { @@ -716,6 +909,7 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data) offset = zb->zb_blkid * blksz; if (split_large_blocks) { + ASSERT0(arc_is_encrypted(abuf)); ASSERT3U(arc_get_compression(abuf), ==, ZIO_COMPRESS_OFF); char *buf = abuf->b_data; @@ -758,7 +952,7 @@ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, - int outfd, uint64_t resumeobj, uint64_t resumeoff, + boolean_t rawok, int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp, offset_t *off) { objset_t *os; @@ -775,6 +969,28 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, return (err); } + /* + * If this is a non-raw send of an encrypted ds, we can ensure that + * the objset_phys_t is authenticated. This is safe because this is + * either a snapshot or we have owned the dataset, ensuring that + * it can't be modified. + */ + if (!rawok && os->os_encrypted && + arc_is_unauthenticated(os->os_phys_buf)) { + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT, + ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + err = arc_untransform(os->os_phys_buf, os->os_spa, + &zb, B_FALSE); + if (err != 0) { + dsl_pool_rele(dp, tag); + return (err); + } + + ASSERT0(arc_is_unauthenticated(os->os_phys_buf)); + } + drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; @@ -795,22 +1011,29 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, } #endif - if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) + /* raw sends imply large_block_ok */ + if ((large_block_ok || rawok) && + to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS]) featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; if (to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; - if (embedok && + + /* encrypted datasets will not have embedded blocks */ + if ((embedok || rawok) && !os->os_encrypted && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; - if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) - featureflags |= DMU_BACKUP_FEATURE_LZ4; } - if (compressok) { + + /* raw send implies compressok */ + if (compressok || rawok) featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; - } + if (rawok && os->os_encrypted) + featureflags |= DMU_BACKUP_FEATURE_RAW; + if ((featureflags & - (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED)) != - 0 && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { + (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED | + DMU_BACKUP_FEATURE_RAW)) != 0 && + spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { featureflags |= DMU_BACKUP_FEATURE_LZ4; } @@ -832,6 +1055,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, if (zfs_send_set_freerecords_bit) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS; + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK; + if (ancestor_zb != NULL) { drr->drr_u.drr_begin.drr_fromguid = ancestor_zb->zbm_guid; @@ -852,6 +1077,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, dsp->dsa_os = os; dsp->dsa_off = off; dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid; + dsp->dsa_fromtxg = fromtxg; dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_featureflags = featureflags; dsp->dsa_resume_object = resumeobj; @@ -866,19 +1092,47 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, void *payload = NULL; size_t payload_len = 0; - if (resumeobj != 0 || resumeoff != 0) { - dmu_object_info_t to_doi; - err = dmu_object_info(os, resumeobj, &to_doi); - if (err != 0) - goto out; - SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0, - resumeoff / to_doi.doi_data_block_size); - + /* handle features that require a DRR_BEGIN payload */ + if (featureflags & + (DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_RAW)) { + nvlist_t *keynvl = NULL; nvlist_t *nvl = fnvlist_alloc(); - fnvlist_add_uint64(nvl, "resume_object", resumeobj); - fnvlist_add_uint64(nvl, "resume_offset", resumeoff); + + if (featureflags & DMU_BACKUP_FEATURE_RESUMING) { + dmu_object_info_t to_doi; + err = dmu_object_info(os, resumeobj, &to_doi); + if (err != 0) { + fnvlist_free(nvl); + goto out; + } + + SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, + resumeobj, 0, + resumeoff / to_doi.doi_data_block_size); + + fnvlist_add_uint64(nvl, "resume_object", resumeobj); + fnvlist_add_uint64(nvl, "resume_offset", resumeoff); + } + + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + uint64_t ivset_guid = (ancestor_zb != NULL) ? + ancestor_zb->zbm_ivset_guid : 0; + + ASSERT(os->os_encrypted); + + err = dsl_crypto_populate_key_nvlist(to_ds, + ivset_guid, &keynvl); + if (err != 0) { + fnvlist_free(nvl); + goto out; + } + + fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl); + } + payload = fnvlist_pack(nvl, &payload_len); drr->drr_payloadlen = payload_len; + fnvlist_free(keynvl); fnvlist_free(nvl); } @@ -896,6 +1150,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, to_arg.ds = to_ds; to_arg.fromtxg = fromtxg; to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH; + if (rawok) + to_arg.flags |= TRAVERSE_NO_DECRYPT; (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc, TS_RUN, minclsyspri); @@ -942,7 +1198,6 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds, if (dump_record(dsp, NULL, 0) != 0) err = dsp->dsa_err; - out: mutex_enter(&to_ds->ds_sendstream_lock); list_remove(&to_ds->ds_sendstreams, dsp); @@ -961,60 +1216,77 @@ out: int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, - int outfd, vnode_t *vp, offset_t *off) + boolean_t rawok, int outfd, vnode_t *vp, offset_t *off) { dsl_pool_t *dp; dsl_dataset_t *ds; dsl_dataset_t *fromds = NULL; + ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; int err; err = dsl_pool_hold(pool, FTAG, &dp); if (err != 0) return (err); - err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); + err = dsl_dataset_hold_obj_flags(dp, tosnap, dsflags, FTAG, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } if (fromsnap != 0) { - zfs_bookmark_phys_t zb; + zfs_bookmark_phys_t zb = { 0 }; boolean_t is_clone; err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); if (err != 0) { - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); dsl_pool_rele(dp, FTAG); return (err); } - if (!dsl_dataset_is_before(ds, fromds, 0)) + if (!dsl_dataset_is_before(ds, fromds, 0)) { err = SET_ERROR(EXDEV); + dsl_dataset_rele(fromds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); + } + zb.zbm_creation_time = dsl_dataset_phys(fromds)->ds_creation_time; zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; + + if (dsl_dataset_is_zapified(fromds)) { + (void) zap_lookup(dp->dp_meta_objset, + fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1, + &zb.zbm_ivset_guid); + } + is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, compressok, outfd, 0, 0, vp, off); + embedok, large_block_ok, compressok, rawok, outfd, + 0, 0, vp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, compressok, outfd, 0, 0, vp, off); + embedok, large_block_ok, compressok, rawok, outfd, + 0, 0, vp, off); } - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); return (err); } int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, - boolean_t large_block_ok, boolean_t compressok, int outfd, - uint64_t resumeobj, uint64_t resumeoff, - vnode_t *vp, offset_t *off) + boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, + int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp, + offset_t *off) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; + ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; boolean_t owned = B_FALSE; if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) @@ -1029,10 +1301,10 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, * We are sending a filesystem or volume. Ensure * that it doesn't change by owning the dataset. */ - err = dsl_dataset_own(dp, tosnap, FTAG, &ds); + err = dsl_dataset_own(dp, tosnap, dsflags, FTAG, &ds); owned = B_TRUE; } else { - err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); + err = dsl_dataset_hold_flags(dp, tosnap, dsflags, FTAG, &ds); } if (err != 0) { dsl_pool_rele(dp, FTAG); @@ -1040,7 +1312,7 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, } if (fromsnap != NULL) { - zfs_bookmark_phys_t zb; + zfs_bookmark_phys_t zb = { 0 }; boolean_t is_clone = B_FALSE; int fsnamelen = strchr(tosnap, '@') - tosnap; @@ -1066,28 +1338,40 @@ dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, dsl_dataset_phys(fromds)->ds_creation_txg; zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; is_clone = (ds->ds_dir != fromds->ds_dir); + + if (dsl_dataset_is_zapified(fromds)) { + (void) zap_lookup(dp->dp_meta_objset, + fromds->ds_object, + DS_FIELD_IVSET_GUID, 8, 1, + &zb.zbm_ivset_guid); + } dsl_dataset_rele(fromds, FTAG); } } else { err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); } if (err != 0) { - dsl_dataset_rele(ds, FTAG); + if (owned) + dsl_dataset_disown(ds, dsflags, FTAG); + else + dsl_dataset_rele_flags(ds, dsflags, FTAG); + dsl_pool_rele(dp, FTAG); return (err); } err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, - embedok, large_block_ok, compressok, + embedok, large_block_ok, compressok, rawok, outfd, resumeobj, resumeoff, vp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, - embedok, large_block_ok, compressok, + embedok, large_block_ok, compressok, rawok, outfd, resumeobj, resumeoff, vp, off); } if (owned) - dsl_dataset_disown(ds, FTAG); + dsl_dataset_disown(ds, dsflags, FTAG); else - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (err); } @@ -1242,7 +1526,8 @@ dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg, * traverse the blocks of the snapshot with birth times after * from_txg, summing their uncompressed size */ - err = traverse_dataset(ds, from_txg, TRAVERSE_POST, + err = traverse_dataset(ds, from_txg, + TRAVERSE_POST | TRAVERSE_NO_DECRYPT, dmu_calculate_send_traversal, &size); if (err) return (err); diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c index f57e510530..0547a09498 100644 --- a/usr/src/uts/common/fs/zfs/dmu_traverse.c +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c @@ -131,7 +131,7 @@ traverse_zil(traverse_data_t *td, zil_header_t *zh) zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, - claim_txg); + claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT)); zil_free(zilog); } @@ -179,6 +179,7 @@ traverse_prefetch_metadata(traverse_data_t *td, const blkptr_t *bp, const zbookmark_phys_t *zb) { arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) return; @@ -194,8 +195,11 @@ traverse_prefetch_metadata(traverse_data_t *td, if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) return; + if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) + zio_flags |= ZIO_FLAG_RAW; + (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); } static boolean_t @@ -294,6 +298,8 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + ASSERT(!BP_IS_PROTECTED(bp)); + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) @@ -318,11 +324,18 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; + uint32_t zio_flags = ZIO_FLAG_CANFAIL; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + /* + * dnode blocks might have their bonus buffers encrypted, so + * we must be careful to honor TRAVERSE_NO_DECRYPT + */ + if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) + zio_flags |= ZIO_FLAG_RAW; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err != 0) goto post; dnode_phys_t *child_dnp = buf->b_data; @@ -340,10 +353,14 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + uint32_t zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t flags = ARC_FLAG_WAIT; + if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) + zio_flags |= ZIO_FLAG_RAW; + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err != 0) goto post; @@ -492,6 +509,7 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { prefetch_data_t *pfd = arg; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | ARC_FLAG_PRESCIENT_PREFETCH; @@ -511,8 +529,11 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, cv_broadcast(&pfd->pd_cv); mutex_exit(&pfd->pd_mtx); + if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) + zio_flags |= ZIO_FLAG_RAW; + (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); + zio_flags, &aflags, zb); return (0); } @@ -581,15 +602,22 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); + SET_BOOKMARK(&czb, td.td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + /* See comment on ZIL traversal in dsl_scan_visitds. */ if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { + enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; - err = arc_read(NULL, td.td_spa, rootbp, - arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL); + if ((td.td_flags & TRAVERSE_NO_DECRYPT) && + BP_IS_PROTECTED(rootbp)) + zio_flags |= ZIO_FLAG_RAW; + + err = arc_read(NULL, td.td_spa, rootbp, arc_getbuf_func, + &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &czb); if (err != 0) return (err); @@ -603,8 +631,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, &td, TQ_NOQUEUE) == TASKQID_INVALID) pd.pd_exited = B_TRUE; - SET_BOOKMARK(&czb, td.td_objset, - ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = traverse_visitbp(&td, NULL, rootbp, &czb); mutex_enter(&pd.pd_mtx); diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c index f360eb997e..5a86650d28 100644 --- a/usr/src/uts/common/fs/zfs/dnode.c +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -136,6 +136,7 @@ dnode_cons(void *arg, void *unused, int kmflag) bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); + bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid)); for (i = 0; i < TXG_SIZE; i++) { multilist_link_init(&dn->dn_dirty_link[i]); @@ -196,6 +197,7 @@ dnode_dest(void *arg, void *unused) ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_blksz[i]); + ASSERT0(dn->dn_next_maxblkid[i]); } ASSERT0(dn->dn_allocated_txg); @@ -617,6 +619,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_blksz[i]); + ASSERT0(dn->dn_next_maxblkid[i]); ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); @@ -659,7 +662,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) + dmu_object_type_t bonustype, int bonuslen, int dn_slots, + boolean_t keep_spill, dmu_tx_t *tx) { int nblkptr; @@ -708,7 +712,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; if (dn->dn_nblkptr != nblkptr) dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; - if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) { dbuf_rm_spill(dn, tx); dnode_rm_spill(dn, tx); } @@ -785,6 +789,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn) sizeof (odn->dn_next_bonuslen)); bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], sizeof (odn->dn_next_blksz)); + bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0], + sizeof (odn->dn_next_maxblkid)); for (i = 0; i < TXG_SIZE; i++) { list_move_tail(&ndn->dn_dirty_records[i], &odn->dn_dirty_records[i]); @@ -1321,7 +1327,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, DNODE_STAT_BUMP(dnode_hold_dbuf_hold); return (SET_ERROR(EIO)); } - err = dbuf_read(db, NULL, DB_RF_CANFAIL); + /* + * We do not need to decrypt to read the dnode so it doesn't matter + * if we get the encrypted or decrypted version. + */ + err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT); if (err) { DNODE_STAT_BUMP(dnode_hold_dbuf_read); dbuf_rele(db, FTAG); @@ -1749,11 +1759,74 @@ fail: return (SET_ERROR(ENOTSUP)); } +static void +dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) +{ + uint64_t txgoff = tx->tx_txg & TXG_MASK; + int old_nlevels = dn->dn_nlevels; + dmu_buf_impl_t *db; + list_t *list; + dbuf_dirty_record_t *new, *dr, *dr_next; + + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + dn->dn_nlevels = new_nlevels; + + ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); + dn->dn_next_nlevels[txgoff] = new_nlevels; + + /* dirty the left indirects */ + db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); + ASSERT(db != NULL); + new = dbuf_dirty(db, tx); + dbuf_rele(db, FTAG); + + /* transfer the dirty records to the new indirect */ + mutex_enter(&dn->dn_mtx); + mutex_enter(&new->dt.di.dr_mtx); + list = &dn->dn_dirty_records[txgoff]; + for (dr = list_head(list); dr; dr = dr_next) { + dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); + if (dr->dr_dbuf->db_level != new_nlevels-1 && + dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { + ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); + list_remove(&dn->dn_dirty_records[txgoff], dr); + list_insert_tail(&new->dt.di.dr_children, dr); + dr->dr_parent = new; + } + } + mutex_exit(&new->dt.di.dr_mtx); + mutex_exit(&dn->dn_mtx); +} + +int +dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx) +{ + int ret = 0; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + if (dn->dn_nlevels == nlevels) { + ret = 0; + goto out; + } else if (nlevels < dn->dn_nlevels) { + ret = SET_ERROR(EINVAL); + goto out; + } + + dnode_set_nlevels_impl(dn, nlevels, tx); + +out: + rw_exit(&dn->dn_struct_rwlock); + return (ret); +} + /* read-holding callers must not rely on the lock being continuously held */ void -dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) +dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read, + boolean_t force) { - uint64_t txgoff = tx->tx_txg & TXG_MASK; int epbs, new_nlevels; uint64_t sz; @@ -1777,13 +1850,25 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) } } - if (blkid <= dn->dn_maxblkid) + /* + * Raw sends (indicated by the force flag) require that we take the + * given blkid even if the value is lower than the current value. + */ + if (!force && blkid <= dn->dn_maxblkid) goto out; + /* + * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff] + * to indicate that this field is set. This allows us to set the + * maxblkid to 0 on an existing object in dnode_sync(). + */ dn->dn_maxblkid = blkid; + dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] = + blkid | DMU_NEXT_MAXBLKID_SET; /* * Compute the number of levels necessary to support the new maxblkid. + * Raw sends will ensure nlevels is set correctly for us. */ new_nlevels = 1; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; @@ -1791,40 +1876,11 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) new_nlevels++; - if (new_nlevels > dn->dn_nlevels) { - int old_nlevels = dn->dn_nlevels; - dmu_buf_impl_t *db; - list_t *list; - dbuf_dirty_record_t *new, *dr, *dr_next; - - dn->dn_nlevels = new_nlevels; - - ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); - dn->dn_next_nlevels[txgoff] = new_nlevels; - - /* dirty the left indirects */ - db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); - ASSERT(db != NULL); - new = dbuf_dirty(db, tx); - dbuf_rele(db, FTAG); - - /* transfer the dirty records to the new indirect */ - mutex_enter(&dn->dn_mtx); - mutex_enter(&new->dt.di.dr_mtx); - list = &dn->dn_dirty_records[txgoff]; - for (dr = list_head(list); dr; dr = dr_next) { - dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); - if (dr->dr_dbuf->db_level != new_nlevels-1 && - dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && - dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { - ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); - list_remove(&dn->dn_dirty_records[txgoff], dr); - list_insert_tail(&new->dt.di.dr_children, dr); - dr->dr_parent = new; - } - } - mutex_exit(&new->dt.di.dr_mtx); - mutex_exit(&dn->dn_mtx); + if (!force) { + if (new_nlevels > dn->dn_nlevels) + dnode_set_nlevels_impl(dn, new_nlevels, tx); + } else { + ASSERT3U(dn->dn_nlevels, >=, new_nlevels); } out: @@ -2249,7 +2305,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, */ return (SET_ERROR(ESRCH)); } - error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); + error = dbuf_read(db, NULL, + DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT); if (error) { dbuf_rele(db, FTAG); return (error); diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c index 9283356608..f5ee8a290d 100644 --- a/usr/src/uts/common/fs/zfs/dnode_sync.c +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c @@ -31,6 +31,7 @@ #include <sys/dmu.h> #include <sys/dmu_tx.h> #include <sys/dmu_objset.h> +#include <sys/dmu_recv.h> #include <sys/dsl_dataset.h> #include <sys/spa.h> #include <sys/range_tree.h> @@ -383,7 +384,21 @@ dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, } } - if (trunc) { + /* + * Do not truncate the maxblkid if we are performing a raw + * receive. The raw receive sets the maxblkid manually and + * must not be overridden. Usually, the last DRR_FREE record + * will be at the maxblkid, because the source system sets + * the maxblkid when truncating. However, if the last block + * was freed by overwriting with zeros and being compressed + * away to a hole, the source system will generate a DRR_FREE + * record while leaving the maxblkid after the end of that + * record. In this case we need to leave the maxblkid as + * indicated in the DRR_OBJECT record, so that it matches the + * source system, ensuring that the cryptographic hashes will + * match. + */ + if (trunc && !dn->dn_objset->os_raw_receive) { dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1; uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * @@ -545,6 +560,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dn->dn_next_nlevels[txgoff] = 0; dn->dn_next_indblkshift[txgoff] = 0; dn->dn_next_blksz[txgoff] = 0; + dn->dn_next_maxblkid[txgoff] = 0; /* ASSERT(blkptrs are zero); */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); @@ -570,7 +586,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); /* * Now that we've released our hold, the dnode may - * be evicted, so we musn't access it. + * be evicted, so we mustn't access it. */ } @@ -580,6 +596,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) void dnode_sync(dnode_t *dn, dmu_tx_t *tx) { + objset_t *os = dn->dn_objset; dnode_phys_t *dnp = dn->dn_phys; int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; @@ -594,8 +611,13 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); - if (dmu_objset_userused_enabled(dn->dn_objset) && - !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + /* + * Do user accounting if it is enabled and this is not + * an encrypted receive. + */ + if (dmu_objset_userused_enabled(os) && + !DMU_OBJECT_IS_SPECIAL(dn->dn_object) && + (!os->os_encrypted || !dmu_objset_is_receiving(os))) { mutex_enter(&dn->dn_mtx); dn->dn_oldused = DN_USED_BYTES(dn->dn_phys); dn->dn_oldflags = dn->dn_phys->dn_flags; @@ -603,7 +625,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); dmu_objset_userquota_get_ids(dn, B_FALSE, tx); } else { - /* Once we account for it, we should always account for it. */ + /* Once we account for it, we should always account for it */ ASSERT(!(dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED)); } @@ -740,6 +762,19 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dn->dn_next_nlevels[txgoff] = 0; } + /* + * This must be done after dnode_sync_free_range() + * and dnode_increase_indirection(). See dnode_new_blkid() + * for an explanation of the high bit being set. + */ + if (dn->dn_next_maxblkid[txgoff]) { + mutex_enter(&dn->dn_mtx); + dnp->dn_maxblkid = + dn->dn_next_maxblkid[txgoff] & ~DMU_NEXT_MAXBLKID_SET; + dn->dn_next_maxblkid[txgoff] = 0; + mutex_exit(&dn->dn_mtx); + } + if (dn->dn_next_nblkptr[txgoff]) { /* this should only happen on a realloc */ ASSERT(dn->dn_allocated_txg == tx->tx_txg); diff --git a/usr/src/uts/common/fs/zfs/dsl_bookmark.c b/usr/src/uts/common/fs/zfs/dsl_bookmark.c index 0a58115341..a32198402f 100644 --- a/usr/src/uts/common/fs/zfs/dsl_bookmark.c +++ b/usr/src/uts/common/fs/zfs/dsl_bookmark.c @@ -70,6 +70,12 @@ dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname, if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_NORMALIZE; + /* + * Zero out the bookmark in case the one stored on disk + * is in an older, shorter format. + */ + bzero(bmark_phys, sizeof (*bmark_phys)); + err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t), sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, NULL, 0, NULL); @@ -188,8 +194,9 @@ dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { dsl_dataset_t *snapds, *bmark_fs; - zfs_bookmark_phys_t bmark_phys; + zfs_bookmark_phys_t bmark_phys = { 0 }; char *shortname; + uint32_t bmark_len = BOOKMARK_PHYS_SIZE_V1; VERIFY0(dsl_dataset_hold(dp, fnvpair_value_string(pair), FTAG, &snapds)); @@ -214,10 +221,29 @@ dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) bmark_phys.zbm_creation_time = dsl_dataset_phys(snapds)->ds_creation_time; + /* + * If the dataset is encrypted create a larger bookmark to + * accommodate the IVset guid. The IVset guid was added + * after the encryption feature to prevent a problem with + * raw sends. If we encounter an encrypted dataset without + * an IVset guid we fall back to a normal bookmark. + */ + if (snapds->ds_dir->dd_crypto_obj != 0 && + spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_BOOKMARK_V2)) { + int err = zap_lookup(mos, snapds->ds_object, + DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1, + &bmark_phys.zbm_ivset_guid); + if (err == 0) { + bmark_len = BOOKMARK_PHYS_SIZE_V2; + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_BOOKMARK_V2, tx); + } + } + VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks, shortname, sizeof (uint64_t), - sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), - &bmark_phys, tx)); + bmark_len / sizeof (uint64_t), &bmark_phys, tx)); spa_history_log_internal_ds(bmark_fs, "bookmark", tx, "name=%s creation_txg=%llu target_snap=%llu", @@ -267,7 +293,7 @@ dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { char *bmark_name = attr.za_name; - zfs_bookmark_phys_t bmark_phys; + zfs_bookmark_phys_t bmark_phys = { 0 }; err = dsl_dataset_bmark_lookup(ds, bmark_name, &bmark_phys); ASSERT3U(err, !=, ENOENT); @@ -290,6 +316,11 @@ dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) dsl_prop_nvlist_add_uint64(out_props, ZFS_PROP_CREATION, bmark_phys.zbm_creation_time); } + if (nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_IVSET_GUID))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_IVSET_GUID, bmark_phys.zbm_ivset_guid); + } fnvlist_add_nvlist(outnvl, bmark_name, out_props); fnvlist_free(out_props); @@ -337,13 +368,26 @@ typedef struct dsl_bookmark_destroy_arg { static int dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) { + int err; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t bmark_zapobj = ds->ds_bookmarks; matchtype_t mt = 0; + uint64_t int_size, num_ints; if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_NORMALIZE; + err = zap_length(mos, bmark_zapobj, name, &int_size, &num_ints); + if (err != 0) + return (err); + + ASSERT3U(int_size, ==, sizeof (uint64_t)); + + if (num_ints * int_size > BOOKMARK_PHYS_SIZE_V1) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_BOOKMARK_V2, tx); + } + return (zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); } diff --git a/usr/src/uts/common/fs/zfs/dsl_crypt.c b/usr/src/uts/common/fs/zfs/dsl_crypt.c new file mode 100644 index 0000000000..3937d3cb51 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/dsl_crypt.c @@ -0,0 +1,2898 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#include <sys/dsl_crypt.h> +#include <sys/dsl_pool.h> +#include <sys/zap.h> +#include <sys/zil.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/spa_impl.h> +#include <sys/dmu_objset.h> +#include <sys/zvol.h> + +/* + * This file's primary purpose is for managing master encryption keys in + * memory and on disk. For more info on how these keys are used, see the + * block comment in zio_crypt.c. + * + * All master keys are stored encrypted on disk in the form of the DSL + * Crypto Key ZAP object. The binary key data in this object is always + * randomly generated and is encrypted with the user's wrapping key. This + * layer of indirection allows the user to change their key without + * needing to re-encrypt the entire dataset. The ZAP also holds on to the + * (non-encrypted) encryption algorithm identifier, IV, and MAC needed to + * safely decrypt the master key. For more info on the user's key see the + * block comment in libzfs_crypto.c + * + * In-memory encryption keys are managed through the spa_keystore. The + * keystore consists of 3 AVL trees, which are as follows: + * + * The Wrapping Key Tree: + * The wrapping key (wkey) tree stores the user's keys that are fed into the + * kernel through 'zfs load-key' and related commands. Datasets inherit their + * parent's wkey by default, so these structures are refcounted. The wrapping + * keys remain in memory until they are explicitly unloaded (with + * "zfs unload-key"). Unloading is only possible when no datasets are using + * them (refcount=0). + * + * The DSL Crypto Key Tree: + * The DSL Crypto Keys (DCK) are the in-memory representation of decrypted + * master keys. They are used by the functions in zio_crypt.c to perform + * encryption, decryption, and authentication. Snapshots and clones of a given + * dataset will share a DSL Crypto Key, so they are also refcounted. Once the + * refcount on a key hits zero, it is immediately zeroed out and freed. + * + * The Crypto Key Mapping Tree: + * The zio layer needs to lookup master keys by their dataset object id. Since + * the DSL Crypto Keys can belong to multiple datasets, we maintain a tree of + * dsl_key_mapping_t's which essentially just map the dataset object id to its + * appropriate DSL Crypto Key. The management for creating and destroying these + * mappings hooks into the code for owning and disowning datasets. Usually, + * there will only be one active dataset owner, but there are times + * (particularly during dataset creation and destruction) when this may not be + * true or the dataset may not be initialized enough to own. As a result, this + * object is also refcounted. + */ + +/* + * This tunable allows datasets to be raw received even if the stream does + * not include IVset guids or if the guids don't match. This is used as part + * of the resolution for ZPOOL_ERRATA_ZOL_8308_ENCRYPTION. + */ +int zfs_disable_ivset_guid_check = 0; + +static void +dsl_wrapping_key_hold(dsl_wrapping_key_t *wkey, void *tag) +{ + (void) zfs_refcount_add(&wkey->wk_refcnt, tag); +} + +static void +dsl_wrapping_key_rele(dsl_wrapping_key_t *wkey, void *tag) +{ + (void) zfs_refcount_remove(&wkey->wk_refcnt, tag); +} + +static void +dsl_wrapping_key_free(dsl_wrapping_key_t *wkey) +{ + ASSERT0(zfs_refcount_count(&wkey->wk_refcnt)); + + if (wkey->wk_key.ck_data) { + bzero(wkey->wk_key.ck_data, + CRYPTO_BITS2BYTES(wkey->wk_key.ck_length)); + kmem_free(wkey->wk_key.ck_data, + CRYPTO_BITS2BYTES(wkey->wk_key.ck_length)); + } + + zfs_refcount_destroy(&wkey->wk_refcnt); + kmem_free(wkey, sizeof (dsl_wrapping_key_t)); +} + +static int +dsl_wrapping_key_create(uint8_t *wkeydata, zfs_keyformat_t keyformat, + uint64_t salt, uint64_t iters, dsl_wrapping_key_t **wkey_out) +{ + int ret; + dsl_wrapping_key_t *wkey; + + /* allocate the wrapping key */ + wkey = kmem_alloc(sizeof (dsl_wrapping_key_t), KM_SLEEP); + if (!wkey) + return (SET_ERROR(ENOMEM)); + + /* allocate and initialize the underlying crypto key */ + wkey->wk_key.ck_data = kmem_alloc(WRAPPING_KEY_LEN, KM_SLEEP); + if (!wkey->wk_key.ck_data) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + wkey->wk_key.ck_format = CRYPTO_KEY_RAW; + wkey->wk_key.ck_length = CRYPTO_BYTES2BITS(WRAPPING_KEY_LEN); + bcopy(wkeydata, wkey->wk_key.ck_data, WRAPPING_KEY_LEN); + + /* initialize the rest of the struct */ + zfs_refcount_create(&wkey->wk_refcnt); + wkey->wk_keyformat = keyformat; + wkey->wk_salt = salt; + wkey->wk_iters = iters; + + *wkey_out = wkey; + return (0); + +error: + dsl_wrapping_key_free(wkey); + + *wkey_out = NULL; + return (ret); +} + +int +dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props, + nvlist_t *crypto_args, dsl_crypto_params_t **dcp_out) +{ + int ret; + uint64_t crypt = ZIO_CRYPT_INHERIT; + uint64_t keyformat = ZFS_KEYFORMAT_NONE; + uint64_t salt = 0, iters = 0; + dsl_crypto_params_t *dcp = NULL; + dsl_wrapping_key_t *wkey = NULL; + uint8_t *wkeydata = NULL; + uint_t wkeydata_len = 0; + char *keylocation = NULL; + + dcp = kmem_zalloc(sizeof (dsl_crypto_params_t), KM_SLEEP); + if (!dcp) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + /* get relevant properties from the nvlist */ + dcp->cp_cmd = cmd; + + /* get relevant arguments from the nvlists */ + if (props != NULL) { + (void) nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_ENCRYPTION), &crypt); + (void) nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), &keyformat); + (void) nvlist_lookup_string(props, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation); + (void) nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), &salt); + (void) nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), &iters); + dcp->cp_crypt = crypt; + } + + if (crypto_args != NULL) { + (void) nvlist_lookup_uint8_array(crypto_args, "wkeydata", + &wkeydata, &wkeydata_len); + } + + /* check for valid command */ + if (dcp->cp_cmd >= DCP_CMD_MAX) { + ret = SET_ERROR(EINVAL); + goto error; + } else { + dcp->cp_cmd = cmd; + } + + /* check for valid crypt */ + if (dcp->cp_crypt >= ZIO_CRYPT_FUNCTIONS) { + ret = SET_ERROR(EINVAL); + goto error; + } else { + dcp->cp_crypt = crypt; + } + + /* check for valid keyformat */ + if (keyformat >= ZFS_KEYFORMAT_FORMATS) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* check for a valid keylocation (of any kind) and copy it in */ + if (keylocation != NULL) { + if (!zfs_prop_valid_keylocation(keylocation, B_FALSE)) { + ret = SET_ERROR(EINVAL); + goto error; + } + + dcp->cp_keylocation = spa_strdup(keylocation); + } + + /* check wrapping key length, if given */ + if (wkeydata != NULL && wkeydata_len != WRAPPING_KEY_LEN) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* if the user asked for the deault crypt, determine that now */ + if (dcp->cp_crypt == ZIO_CRYPT_ON) + dcp->cp_crypt = ZIO_CRYPT_ON_VALUE; + + /* create the wrapping key from the raw data */ + if (wkeydata != NULL) { + /* create the wrapping key with the verified parameters */ + ret = dsl_wrapping_key_create(wkeydata, keyformat, salt, + iters, &wkey); + if (ret != 0) + goto error; + + dcp->cp_wkey = wkey; + } + + /* + * Remove the encryption properties from the nvlist since they are not + * maintained through the DSL. + */ + (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION)); + (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT)); + (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT)); + (void) nvlist_remove_all(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS)); + + *dcp_out = dcp; + + return (0); + +error: + if (wkey != NULL) + dsl_wrapping_key_free(wkey); + if (dcp != NULL) + kmem_free(dcp, sizeof (dsl_crypto_params_t)); + + *dcp_out = NULL; + return (ret); +} + +void +dsl_crypto_params_free(dsl_crypto_params_t *dcp, boolean_t unload) +{ + if (dcp == NULL) + return; + + if (dcp->cp_keylocation != NULL) + spa_strfree(dcp->cp_keylocation); + if (unload && dcp->cp_wkey != NULL) + dsl_wrapping_key_free(dcp->cp_wkey); + + kmem_free(dcp, sizeof (dsl_crypto_params_t)); +} + +static int +spa_crypto_key_compare(const void *a, const void *b) +{ + const dsl_crypto_key_t *dcka = a; + const dsl_crypto_key_t *dckb = b; + + if (dcka->dck_obj < dckb->dck_obj) + return (-1); + if (dcka->dck_obj > dckb->dck_obj) + return (1); + return (0); +} + +static int +spa_key_mapping_compare(const void *a, const void *b) +{ + const dsl_key_mapping_t *kma = a; + const dsl_key_mapping_t *kmb = b; + + if (kma->km_dsobj < kmb->km_dsobj) + return (-1); + if (kma->km_dsobj > kmb->km_dsobj) + return (1); + return (0); +} + +static int +spa_wkey_compare(const void *a, const void *b) +{ + const dsl_wrapping_key_t *wka = a; + const dsl_wrapping_key_t *wkb = b; + + if (wka->wk_ddobj < wkb->wk_ddobj) + return (-1); + if (wka->wk_ddobj > wkb->wk_ddobj) + return (1); + return (0); +} + +void +spa_keystore_init(spa_keystore_t *sk) +{ + rw_init(&sk->sk_dk_lock, NULL, RW_DEFAULT, NULL); + rw_init(&sk->sk_km_lock, NULL, RW_DEFAULT, NULL); + rw_init(&sk->sk_wkeys_lock, NULL, RW_DEFAULT, NULL); + avl_create(&sk->sk_dsl_keys, spa_crypto_key_compare, + sizeof (dsl_crypto_key_t), + offsetof(dsl_crypto_key_t, dck_avl_link)); + avl_create(&sk->sk_key_mappings, spa_key_mapping_compare, + sizeof (dsl_key_mapping_t), + offsetof(dsl_key_mapping_t, km_avl_link)); + avl_create(&sk->sk_wkeys, spa_wkey_compare, sizeof (dsl_wrapping_key_t), + offsetof(dsl_wrapping_key_t, wk_avl_link)); +} + +void +spa_keystore_fini(spa_keystore_t *sk) +{ + dsl_wrapping_key_t *wkey; + void *cookie = NULL; + + ASSERT(avl_is_empty(&sk->sk_dsl_keys)); + ASSERT(avl_is_empty(&sk->sk_key_mappings)); + + while ((wkey = avl_destroy_nodes(&sk->sk_wkeys, &cookie)) != NULL) + dsl_wrapping_key_free(wkey); + + avl_destroy(&sk->sk_wkeys); + avl_destroy(&sk->sk_key_mappings); + avl_destroy(&sk->sk_dsl_keys); + rw_destroy(&sk->sk_wkeys_lock); + rw_destroy(&sk->sk_km_lock); + rw_destroy(&sk->sk_dk_lock); +} + +static int +dsl_dir_get_encryption_root_ddobj(dsl_dir_t *dd, uint64_t *rddobj) +{ + if (dd->dd_crypto_obj == 0) + return (SET_ERROR(ENOENT)); + + return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, rddobj)); +} + +int +dsl_dir_get_encryption_version(dsl_dir_t *dd, uint64_t *version) +{ + *version = 0; + + if (dd->dd_crypto_obj == 0) + return (SET_ERROR(ENOENT)); + + /* version 0 is implied by ENOENT */ + (void) zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + DSL_CRYPTO_KEY_VERSION, 8, 1, version); + + return (0); +} + +boolean_t +dsl_dir_incompatible_encryption_version(dsl_dir_t *dd) +{ + int ret; + uint64_t version = 0; + + ret = dsl_dir_get_encryption_version(dd, &version); + if (ret != 0) + return (B_FALSE); + + return (version != ZIO_CRYPT_KEY_CURRENT_VERSION); +} + +static int +spa_keystore_wkey_hold_ddobj_impl(spa_t *spa, uint64_t ddobj, + void *tag, dsl_wrapping_key_t **wkey_out) +{ + int ret; + dsl_wrapping_key_t search_wkey; + dsl_wrapping_key_t *found_wkey; + + ASSERT(RW_LOCK_HELD(&spa->spa_keystore.sk_wkeys_lock)); + + /* init the search wrapping key */ + search_wkey.wk_ddobj = ddobj; + + /* lookup the wrapping key */ + found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, &search_wkey, NULL); + if (!found_wkey) { + ret = SET_ERROR(ENOENT); + goto error; + } + + /* increment the refcount */ + dsl_wrapping_key_hold(found_wkey, tag); + + *wkey_out = found_wkey; + return (0); + +error: + *wkey_out = NULL; + return (ret); +} + +static int +spa_keystore_wkey_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag, + dsl_wrapping_key_t **wkey_out) +{ + int ret; + dsl_wrapping_key_t *wkey; + uint64_t rddobj; + boolean_t locked = B_FALSE; + + if (!RW_WRITE_HELD(&spa->spa_keystore.sk_wkeys_lock)) { + rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_READER); + locked = B_TRUE; + } + + /* get the ddobj that the keylocation property was inherited from */ + ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj); + if (ret != 0) + goto error; + + /* lookup the wkey in the avl tree */ + ret = spa_keystore_wkey_hold_ddobj_impl(spa, rddobj, tag, &wkey); + if (ret != 0) + goto error; + + /* unlock the wkey tree if we locked it */ + if (locked) + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + + *wkey_out = wkey; + return (0); + +error: + if (locked) + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + + *wkey_out = NULL; + return (ret); +} + +int +dsl_crypto_can_set_keylocation(const char *dsname, const char *keylocation) +{ + int ret = 0; + dsl_dir_t *dd = NULL; + dsl_pool_t *dp = NULL; + uint64_t rddobj; + + /* hold the dsl dir */ + ret = dsl_pool_hold(dsname, FTAG, &dp); + if (ret != 0) + goto out; + + ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL); + if (ret != 0) + goto out; + + /* if dd is not encrypted, the value may only be "none" */ + if (dd->dd_crypto_obj == 0) { + if (strcmp(keylocation, "none") != 0) { + ret = SET_ERROR(EACCES); + goto out; + } + + ret = 0; + goto out; + } + + /* check for a valid keylocation for encrypted datasets */ + if (!zfs_prop_valid_keylocation(keylocation, B_TRUE)) { + ret = SET_ERROR(EINVAL); + goto out; + } + + /* check that this is an encryption root */ + ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj); + if (ret != 0) + goto out; + + if (rddobj != dd->dd_object) { + ret = SET_ERROR(EACCES); + goto out; + } + + dsl_dir_rele(dd, FTAG); + dsl_pool_rele(dp, FTAG); + + return (0); + +out: + if (dd != NULL) + dsl_dir_rele(dd, FTAG); + if (dp != NULL) + dsl_pool_rele(dp, FTAG); + + return (ret); +} + +static void +dsl_crypto_key_free(dsl_crypto_key_t *dck) +{ + ASSERT(zfs_refcount_count(&dck->dck_holds) == 0); + + /* destroy the zio_crypt_key_t */ + zio_crypt_key_destroy(&dck->dck_key); + + /* free the refcount, wrapping key, and lock */ + zfs_refcount_destroy(&dck->dck_holds); + if (dck->dck_wkey) + dsl_wrapping_key_rele(dck->dck_wkey, dck); + + /* free the key */ + kmem_free(dck, sizeof (dsl_crypto_key_t)); +} + +static void +dsl_crypto_key_rele(dsl_crypto_key_t *dck, void *tag) +{ + if (zfs_refcount_remove(&dck->dck_holds, tag) == 0) + dsl_crypto_key_free(dck); +} + +static int +dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey, + uint64_t dckobj, void *tag, dsl_crypto_key_t **dck_out) +{ + int ret; + uint64_t crypt = 0, guid = 0, version = 0; + uint8_t raw_keydata[MASTER_KEY_MAX_LEN]; + uint8_t raw_hmac_keydata[SHA512_HMAC_KEYLEN]; + uint8_t iv[WRAPPING_IV_LEN]; + uint8_t mac[WRAPPING_MAC_LEN]; + dsl_crypto_key_t *dck; + + /* allocate and initialize the key */ + dck = kmem_zalloc(sizeof (dsl_crypto_key_t), KM_SLEEP); + if (!dck) + return (SET_ERROR(ENOMEM)); + + /* fetch all of the values we need from the ZAP */ + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, + &crypt); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &guid); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1, + MASTER_KEY_MAX_LEN, raw_keydata); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1, + SHA512_HMAC_KEYLEN, raw_hmac_keydata); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN, + iv); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN, + mac); + if (ret != 0) + goto error; + + /* the initial on-disk format for encryption did not have a version */ + (void) zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_VERSION, 8, 1, &version); + + /* + * Unwrap the keys. If there is an error return EACCES to indicate + * an authentication failure. + */ + ret = zio_crypt_key_unwrap(&wkey->wk_key, crypt, version, guid, + raw_keydata, raw_hmac_keydata, iv, mac, &dck->dck_key); + if (ret != 0) { + ret = SET_ERROR(EACCES); + goto error; + } + + /* finish initializing the dsl_crypto_key_t */ + zfs_refcount_create(&dck->dck_holds); + dsl_wrapping_key_hold(wkey, dck); + dck->dck_wkey = wkey; + dck->dck_obj = dckobj; + (void) zfs_refcount_add(&dck->dck_holds, tag); + + *dck_out = dck; + return (0); + +error: + if (dck != NULL) { + bzero(dck, sizeof (dsl_crypto_key_t)); + kmem_free(dck, sizeof (dsl_crypto_key_t)); + } + + *dck_out = NULL; + return (ret); +} + +static int +spa_keystore_dsl_key_hold_impl(spa_t *spa, uint64_t dckobj, void *tag, + dsl_crypto_key_t **dck_out) +{ + int ret; + dsl_crypto_key_t search_dck; + dsl_crypto_key_t *found_dck; + + ASSERT(RW_LOCK_HELD(&spa->spa_keystore.sk_dk_lock)); + + /* init the search key */ + search_dck.dck_obj = dckobj; + + /* find the matching key in the keystore */ + found_dck = avl_find(&spa->spa_keystore.sk_dsl_keys, &search_dck, NULL); + if (!found_dck) { + ret = SET_ERROR(ENOENT); + goto error; + } + + /* increment the refcount */ + (void) zfs_refcount_add(&found_dck->dck_holds, tag); + + *dck_out = found_dck; + return (0); + +error: + *dck_out = NULL; + return (ret); +} + +static int +spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag, + dsl_crypto_key_t **dck_out) +{ + int ret; + avl_index_t where; + dsl_crypto_key_t *dck_io = NULL, *dck_ks = NULL; + dsl_wrapping_key_t *wkey = NULL; + uint64_t dckobj = dd->dd_crypto_obj; + + /* Lookup the key in the tree of currently loaded keys */ + rw_enter(&spa->spa_keystore.sk_dk_lock, RW_READER); + ret = spa_keystore_dsl_key_hold_impl(spa, dckobj, tag, &dck_ks); + rw_exit(&spa->spa_keystore.sk_dk_lock); + if (ret == 0) { + *dck_out = dck_ks; + return (0); + } + + /* Lookup the wrapping key from the keystore */ + ret = spa_keystore_wkey_hold_dd(spa, dd, FTAG, &wkey); + if (ret != 0) { + *dck_out = NULL; + return (SET_ERROR(EACCES)); + } + + /* Read the key from disk */ + ret = dsl_crypto_key_open(spa->spa_meta_objset, wkey, dckobj, + tag, &dck_io); + if (ret != 0) { + dsl_wrapping_key_rele(wkey, FTAG); + *dck_out = NULL; + return (ret); + } + + /* + * Add the key to the keystore. It may already exist if it was + * added while performing the read from disk. In this case discard + * it and return the key from the keystore. + */ + rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER); + ret = spa_keystore_dsl_key_hold_impl(spa, dckobj, tag, &dck_ks); + if (ret != 0) { + (void) avl_find(&spa->spa_keystore.sk_dsl_keys, dck_io, &where); + avl_insert(&spa->spa_keystore.sk_dsl_keys, dck_io, where); + *dck_out = dck_io; + } else { + dsl_crypto_key_free(dck_io); + *dck_out = dck_ks; + } + + /* Release the wrapping key (the dsl key now has a reference to it) */ + dsl_wrapping_key_rele(wkey, FTAG); + rw_exit(&spa->spa_keystore.sk_dk_lock); + + return (0); +} + +void +spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, void *tag) +{ + rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER); + + if (zfs_refcount_remove(&dck->dck_holds, tag) == 0) { + avl_remove(&spa->spa_keystore.sk_dsl_keys, dck); + dsl_crypto_key_free(dck); + } + + rw_exit(&spa->spa_keystore.sk_dk_lock); +} + +int +spa_keystore_load_wkey_impl(spa_t *spa, dsl_wrapping_key_t *wkey) +{ + int ret; + avl_index_t where; + dsl_wrapping_key_t *found_wkey; + + rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER); + + /* insert the wrapping key into the keystore */ + found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, wkey, &where); + if (found_wkey != NULL) { + ret = SET_ERROR(EEXIST); + goto error_unlock; + } + avl_insert(&spa->spa_keystore.sk_wkeys, wkey, where); + + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + + return (0); + +error_unlock: + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + return (ret); +} + +int +spa_keystore_load_wkey(const char *dsname, dsl_crypto_params_t *dcp, + boolean_t noop) +{ + int ret; + dsl_dir_t *dd = NULL; + dsl_crypto_key_t *dck = NULL; + dsl_wrapping_key_t *wkey = dcp->cp_wkey; + dsl_pool_t *dp = NULL; + uint64_t keyformat, salt, iters; + + /* + * We don't validate the wrapping key's keyformat, salt, or iters + * since they will never be needed after the DCK has been wrapped. + */ + if (dcp->cp_wkey == NULL || + dcp->cp_cmd != DCP_CMD_NONE || + dcp->cp_crypt != ZIO_CRYPT_INHERIT || + dcp->cp_keylocation != NULL) + return (SET_ERROR(EINVAL)); + + ret = dsl_pool_hold(dsname, FTAG, &dp); + if (ret != 0) + goto error; + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) { + ret = (SET_ERROR(ENOTSUP)); + goto error; + } + + /* hold the dsl dir */ + ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL); + if (ret != 0) + goto error; + + /* initialize the wkey's ddobj */ + wkey->wk_ddobj = dd->dd_object; + + /* verify that the wkey is correct by opening its dsl key */ + ret = dsl_crypto_key_open(dp->dp_meta_objset, wkey, + dd->dd_crypto_obj, FTAG, &dck); + if (ret != 0) + goto error; + + /* initialize the wkey encryption parameters from the DSL Crypto Key */ + ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &keyformat); + if (ret != 0) + goto error; + + ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &salt); + if (ret != 0) + goto error; + + ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &iters); + if (ret != 0) + goto error; + + ASSERT3U(keyformat, <, ZFS_KEYFORMAT_FORMATS); + ASSERT3U(keyformat, !=, ZFS_KEYFORMAT_NONE); + IMPLY(keyformat == ZFS_KEYFORMAT_PASSPHRASE, iters != 0); + IMPLY(keyformat == ZFS_KEYFORMAT_PASSPHRASE, salt != 0); + IMPLY(keyformat != ZFS_KEYFORMAT_PASSPHRASE, iters == 0); + IMPLY(keyformat != ZFS_KEYFORMAT_PASSPHRASE, salt == 0); + + wkey->wk_keyformat = keyformat; + wkey->wk_salt = salt; + wkey->wk_iters = iters; + + /* + * At this point we have verified the wkey and confirmed that it can + * be used to decrypt a DSL Crypto Key. We can simply cleanup and + * return if this is all the user wanted to do. + */ + if (noop) + goto error; + + /* insert the wrapping key into the keystore */ + ret = spa_keystore_load_wkey_impl(dp->dp_spa, wkey); + if (ret != 0) + goto error; + + dsl_crypto_key_rele(dck, FTAG); + dsl_dir_rele(dd, FTAG); + dsl_pool_rele(dp, FTAG); + + return (0); + +error: + if (dck != NULL) + dsl_crypto_key_rele(dck, FTAG); + if (dd != NULL) + dsl_dir_rele(dd, FTAG); + if (dp != NULL) + dsl_pool_rele(dp, FTAG); + + return (ret); +} + +int +spa_keystore_unload_wkey_impl(spa_t *spa, uint64_t ddobj) +{ + int ret; + dsl_wrapping_key_t search_wkey; + dsl_wrapping_key_t *found_wkey; + + /* init the search wrapping key */ + search_wkey.wk_ddobj = ddobj; + + rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER); + + /* remove the wrapping key from the keystore */ + found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, + &search_wkey, NULL); + if (!found_wkey) { + ret = SET_ERROR(EACCES); + goto error_unlock; + } else if (zfs_refcount_count(&found_wkey->wk_refcnt) != 0) { + ret = SET_ERROR(EBUSY); + goto error_unlock; + } + avl_remove(&spa->spa_keystore.sk_wkeys, found_wkey); + + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + + /* free the wrapping key */ + dsl_wrapping_key_free(found_wkey); + + return (0); + +error_unlock: + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + return (ret); +} + +int +spa_keystore_unload_wkey(const char *dsname) +{ + int ret = 0; + dsl_dir_t *dd = NULL; + dsl_pool_t *dp = NULL; + spa_t *spa = NULL; + + ret = spa_open(dsname, &spa, FTAG); + if (ret != 0) + return (ret); + + /* + * Wait for any outstanding txg IO to complete, releasing any + * remaining references on the wkey. + */ + if (spa_mode(spa) != FREAD) + txg_wait_synced(spa->spa_dsl_pool, 0); + + spa_close(spa, FTAG); + + /* hold the dsl dir */ + ret = dsl_pool_hold(dsname, FTAG, &dp); + if (ret != 0) + goto error; + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) { + ret = (SET_ERROR(ENOTSUP)); + goto error; + } + + ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL); + if (ret != 0) + goto error; + + /* unload the wkey */ + ret = spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object); + if (ret != 0) + goto error; + + dsl_dir_rele(dd, FTAG); + dsl_pool_rele(dp, FTAG); + + return (0); + +error: + if (dd != NULL) + dsl_dir_rele(dd, FTAG); + if (dp != NULL) + dsl_pool_rele(dp, FTAG); + + return (ret); +} + +void +key_mapping_add_ref(dsl_key_mapping_t *km, void *tag) +{ + ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1); + (void) zfs_refcount_add(&km->km_refcnt, tag); +} + +/* + * The locking here is a little tricky to ensure we don't cause unnecessary + * performance problems. We want to release a key mapping whenever someone + * decrements the refcount to 0, but freeing the mapping requires removing + * it from the spa_keystore, which requires holding sk_km_lock as a writer. + * Most of the time we don't want to hold this lock as a writer, since the + * same lock is held as a reader for each IO that needs to encrypt / decrypt + * data for any dataset and in practice we will only actually free the + * mapping after unmounting a dataset. + */ +void +key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag) +{ + ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1); + + if (zfs_refcount_remove(&km->km_refcnt, tag) != 0) + return; + + /* + * We think we are going to need to free the mapping. Add a + * reference to prevent most other releasers from thinking + * this might be their responsibility. This is inherently + * racy, so we will confirm that we are legitimately the + * last holder once we have the sk_km_lock as a writer. + */ + (void) zfs_refcount_add(&km->km_refcnt, FTAG); + + rw_enter(&spa->spa_keystore.sk_km_lock, RW_WRITER); + if (zfs_refcount_remove(&km->km_refcnt, FTAG) != 0) { + rw_exit(&spa->spa_keystore.sk_km_lock); + return; + } + + avl_remove(&spa->spa_keystore.sk_key_mappings, km); + rw_exit(&spa->spa_keystore.sk_km_lock); + + spa_keystore_dsl_key_rele(spa, km->km_key, km); + kmem_free(km, sizeof (dsl_key_mapping_t)); +} + +int +spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, void *tag, + dsl_key_mapping_t **km_out) +{ + int ret; + avl_index_t where; + dsl_key_mapping_t *km, *found_km; + boolean_t should_free = B_FALSE; + + /* Allocate and initialize the mapping */ + km = kmem_zalloc(sizeof (dsl_key_mapping_t), KM_SLEEP); + zfs_refcount_create(&km->km_refcnt); + + ret = spa_keystore_dsl_key_hold_dd(spa, ds->ds_dir, km, &km->km_key); + if (ret != 0) { + zfs_refcount_destroy(&km->km_refcnt); + kmem_free(km, sizeof (dsl_key_mapping_t)); + + if (km_out != NULL) + *km_out = NULL; + return (ret); + } + + km->km_dsobj = ds->ds_object; + + rw_enter(&spa->spa_keystore.sk_km_lock, RW_WRITER); + + /* + * If a mapping already exists, simply increment its refcount and + * cleanup the one we made. We want to allocate / free outside of + * the lock because this lock is also used by the zio layer to lookup + * key mappings. Otherwise, use the one we created. Normally, there will + * only be one active reference at a time (the objset owner), but there + * are times when there could be multiple async users. + */ + found_km = avl_find(&spa->spa_keystore.sk_key_mappings, km, &where); + if (found_km != NULL) { + should_free = B_TRUE; + (void) zfs_refcount_add(&found_km->km_refcnt, tag); + if (km_out != NULL) + *km_out = found_km; + } else { + (void) zfs_refcount_add(&km->km_refcnt, tag); + avl_insert(&spa->spa_keystore.sk_key_mappings, km, where); + if (km_out != NULL) + *km_out = km; + } + + rw_exit(&spa->spa_keystore.sk_km_lock); + + if (should_free) { + spa_keystore_dsl_key_rele(spa, km->km_key, km); + zfs_refcount_destroy(&km->km_refcnt); + kmem_free(km, sizeof (dsl_key_mapping_t)); + } + + return (0); +} + +int +spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, void *tag) +{ + int ret; + dsl_key_mapping_t search_km; + dsl_key_mapping_t *found_km; + + /* init the search key mapping */ + search_km.km_dsobj = dsobj; + + rw_enter(&spa->spa_keystore.sk_km_lock, RW_READER); + + /* find the matching mapping */ + found_km = avl_find(&spa->spa_keystore.sk_key_mappings, + &search_km, NULL); + if (found_km == NULL) { + ret = SET_ERROR(ENOENT); + goto error_unlock; + } + + rw_exit(&spa->spa_keystore.sk_km_lock); + + key_mapping_rele(spa, found_km, tag); + + return (0); + +error_unlock: + rw_exit(&spa->spa_keystore.sk_km_lock); + return (ret); +} + +/* + * This function is primarily used by the zio and arc layer to lookup + * DSL Crypto Keys for encryption. Callers must release the key with + * spa_keystore_dsl_key_rele(). The function may also be called with + * dck_out == NULL and tag == NULL to simply check that a key exists + * without getting a reference to it. + */ +int +spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag, + dsl_crypto_key_t **dck_out) +{ + int ret; + dsl_key_mapping_t search_km; + dsl_key_mapping_t *found_km; + + ASSERT((tag != NULL && dck_out != NULL) || + (tag == NULL && dck_out == NULL)); + + /* init the search key mapping */ + search_km.km_dsobj = dsobj; + + rw_enter(&spa->spa_keystore.sk_km_lock, RW_READER); + + /* remove the mapping from the tree */ + found_km = avl_find(&spa->spa_keystore.sk_key_mappings, &search_km, + NULL); + if (found_km == NULL) { + ret = SET_ERROR(ENOENT); + goto error_unlock; + } + + if (found_km && tag) + (void) zfs_refcount_add(&found_km->km_key->dck_holds, tag); + + rw_exit(&spa->spa_keystore.sk_km_lock); + + if (dck_out != NULL) + *dck_out = found_km->km_key; + return (0); + +error_unlock: + rw_exit(&spa->spa_keystore.sk_km_lock); + + if (dck_out != NULL) + *dck_out = NULL; + return (ret); +} + +static int +dmu_objset_check_wkey_loaded(dsl_dir_t *dd) +{ + int ret; + dsl_wrapping_key_t *wkey = NULL; + + ret = spa_keystore_wkey_hold_dd(dd->dd_pool->dp_spa, dd, FTAG, + &wkey); + if (ret != 0) + return (SET_ERROR(EACCES)); + + dsl_wrapping_key_rele(wkey, FTAG); + + return (0); +} + +static zfs_keystatus_t +dsl_dataset_get_keystatus(dsl_dir_t *dd) +{ + /* check if this dd has a has a dsl key */ + if (dd->dd_crypto_obj == 0) + return (ZFS_KEYSTATUS_NONE); + + return (dmu_objset_check_wkey_loaded(dd) == 0 ? + ZFS_KEYSTATUS_AVAILABLE : ZFS_KEYSTATUS_UNAVAILABLE); +} + +static int +dsl_dir_get_crypt(dsl_dir_t *dd, uint64_t *crypt) +{ + if (dd->dd_crypto_obj == 0) { + *crypt = ZIO_CRYPT_OFF; + return (0); + } + + return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, crypt)); +} + +static void +dsl_crypto_key_sync_impl(objset_t *mos, uint64_t dckobj, uint64_t crypt, + uint64_t root_ddobj, uint64_t guid, uint8_t *iv, uint8_t *mac, + uint8_t *keydata, uint8_t *hmac_keydata, uint64_t keyformat, + uint64_t salt, uint64_t iters, dmu_tx_t *tx) +{ + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, + &crypt, tx)); + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, + &root_ddobj, tx)); + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, + &guid, tx)); + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN, + iv, tx)); + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN, + mac, tx)); + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1, + MASTER_KEY_MAX_LEN, keydata, tx)); + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1, + SHA512_HMAC_KEYLEN, hmac_keydata, tx)); + VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), + 8, 1, &keyformat, tx)); + VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), + 8, 1, &salt, tx)); + VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), + 8, 1, &iters, tx)); +} + +static void +dsl_crypto_key_sync(dsl_crypto_key_t *dck, dmu_tx_t *tx) +{ + zio_crypt_key_t *key = &dck->dck_key; + dsl_wrapping_key_t *wkey = dck->dck_wkey; + uint8_t keydata[MASTER_KEY_MAX_LEN]; + uint8_t hmac_keydata[SHA512_HMAC_KEYLEN]; + uint8_t iv[WRAPPING_IV_LEN]; + uint8_t mac[WRAPPING_MAC_LEN]; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3U(key->zk_crypt, <, ZIO_CRYPT_FUNCTIONS); + + /* encrypt and store the keys along with the IV and MAC */ + VERIFY0(zio_crypt_key_wrap(&dck->dck_wkey->wk_key, key, iv, mac, + keydata, hmac_keydata)); + + /* update the ZAP with the obtained values */ + dsl_crypto_key_sync_impl(tx->tx_pool->dp_meta_objset, dck->dck_obj, + key->zk_crypt, wkey->wk_ddobj, key->zk_guid, iv, mac, keydata, + hmac_keydata, wkey->wk_keyformat, wkey->wk_salt, wkey->wk_iters, + tx); +} + +typedef struct spa_keystore_change_key_args { + const char *skcka_dsname; + dsl_crypto_params_t *skcka_cp; +} spa_keystore_change_key_args_t; + +static int +spa_keystore_change_key_check(void *arg, dmu_tx_t *tx) +{ + int ret; + dsl_dir_t *dd = NULL; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_keystore_change_key_args_t *skcka = arg; + dsl_crypto_params_t *dcp = skcka->skcka_cp; + uint64_t rddobj; + + /* check for the encryption feature */ + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) { + ret = SET_ERROR(ENOTSUP); + goto error; + } + + /* check for valid key change command */ + if (dcp->cp_cmd != DCP_CMD_NEW_KEY && + dcp->cp_cmd != DCP_CMD_INHERIT && + dcp->cp_cmd != DCP_CMD_FORCE_NEW_KEY && + dcp->cp_cmd != DCP_CMD_FORCE_INHERIT) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* hold the dd */ + ret = dsl_dir_hold(dp, skcka->skcka_dsname, FTAG, &dd, NULL); + if (ret != 0) + goto error; + + /* verify that the dataset is encrypted */ + if (dd->dd_crypto_obj == 0) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* clones must always use their origin's key */ + if (dsl_dir_is_clone(dd)) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* lookup the ddobj we are inheriting the keylocation from */ + ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj); + if (ret != 0) + goto error; + + /* Handle inheritance */ + if (dcp->cp_cmd == DCP_CMD_INHERIT || + dcp->cp_cmd == DCP_CMD_FORCE_INHERIT) { + /* no other encryption params should be given */ + if (dcp->cp_crypt != ZIO_CRYPT_INHERIT || + dcp->cp_keylocation != NULL || + dcp->cp_wkey != NULL) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* check that this is an encryption root */ + if (dd->dd_object != rddobj) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* check that the parent is encrypted */ + if (dd->dd_parent->dd_crypto_obj == 0) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* if we are rewrapping check that both keys are loaded */ + if (dcp->cp_cmd == DCP_CMD_INHERIT) { + ret = dmu_objset_check_wkey_loaded(dd); + if (ret != 0) + goto error; + + ret = dmu_objset_check_wkey_loaded(dd->dd_parent); + if (ret != 0) + goto error; + } + + dsl_dir_rele(dd, FTAG); + return (0); + } + + /* handle forcing an encryption root without rewrapping */ + if (dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY) { + /* no other encryption params should be given */ + if (dcp->cp_crypt != ZIO_CRYPT_INHERIT || + dcp->cp_keylocation != NULL || + dcp->cp_wkey != NULL) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* check that this is not an encryption root */ + if (dd->dd_object == rddobj) { + ret = SET_ERROR(EINVAL); + goto error; + } + + dsl_dir_rele(dd, FTAG); + return (0); + } + + /* crypt cannot be changed after creation */ + if (dcp->cp_crypt != ZIO_CRYPT_INHERIT) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* we are not inheritting our parent's wkey so we need one ourselves */ + if (dcp->cp_wkey == NULL) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* check for a valid keyformat for the new wrapping key */ + if (dcp->cp_wkey->wk_keyformat >= ZFS_KEYFORMAT_FORMATS || + dcp->cp_wkey->wk_keyformat == ZFS_KEYFORMAT_NONE) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* + * If this dataset is not currently an encryption root we need a new + * keylocation for this dataset's new wrapping key. Otherwise we can + * just keep the one we already had. + */ + if (dd->dd_object != rddobj && dcp->cp_keylocation == NULL) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* check that the keylocation is valid if it is not NULL */ + if (dcp->cp_keylocation != NULL && + !zfs_prop_valid_keylocation(dcp->cp_keylocation, B_TRUE)) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* passphrases require pbkdf2 salt and iters */ + if (dcp->cp_wkey->wk_keyformat == ZFS_KEYFORMAT_PASSPHRASE) { + if (dcp->cp_wkey->wk_salt == 0 || + dcp->cp_wkey->wk_iters < MIN_PBKDF2_ITERATIONS) { + ret = SET_ERROR(EINVAL); + goto error; + } + } else { + if (dcp->cp_wkey->wk_salt != 0 || dcp->cp_wkey->wk_iters != 0) { + ret = SET_ERROR(EINVAL); + goto error; + } + } + + /* make sure the dd's wkey is loaded */ + ret = dmu_objset_check_wkey_loaded(dd); + if (ret != 0) + goto error; + + dsl_dir_rele(dd, FTAG); + + return (0); + +error: + if (dd != NULL) + dsl_dir_rele(dd, FTAG); + + return (ret); +} + + +static void +spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj, + uint64_t new_rddobj, dsl_wrapping_key_t *wkey, dmu_tx_t *tx) +{ + zap_cursor_t *zc; + zap_attribute_t *za; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd = NULL; + dsl_crypto_key_t *dck = NULL; + uint64_t curr_rddobj; + + ASSERT(RW_WRITE_HELD(&dp->dp_spa->spa_keystore.sk_wkeys_lock)); + + /* hold the dd */ + VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); + + /* ignore hidden dsl dirs */ + if (dd->dd_myname[0] == '$' || dd->dd_myname[0] == '%') { + dsl_dir_rele(dd, FTAG); + return; + } + + /* + * Stop recursing if this dsl dir didn't inherit from the root + * or if this dd is a clone. + */ + VERIFY0(dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj)); + if (curr_rddobj != rddobj || dsl_dir_is_clone(dd)) { + dsl_dir_rele(dd, FTAG); + return; + } + + /* + * If we don't have a wrapping key just update the dck to reflect the + * new encryption root. Otherwise rewrap the entire dck and re-sync it + * to disk. + */ + if (wkey == NULL) { + VERIFY0(zap_update(dp->dp_meta_objset, dd->dd_crypto_obj, + DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, &new_rddobj, tx)); + } else { + VERIFY0(spa_keystore_dsl_key_hold_dd(dp->dp_spa, dd, + FTAG, &dck)); + dsl_wrapping_key_hold(wkey, dck); + dsl_wrapping_key_rele(dck->dck_wkey, dck); + dck->dck_wkey = wkey; + dsl_crypto_key_sync(dck, tx); + spa_keystore_dsl_key_rele(dp->dp_spa, dck, FTAG); + } + + zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + /* Recurse into all child dsl dirs. */ + for (zap_cursor_init(zc, dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_child_dir_zapobj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + spa_keystore_change_key_sync_impl(rddobj, + za->za_first_integer, new_rddobj, wkey, tx); + } + zap_cursor_fini(zc); + + kmem_free(za, sizeof (zap_attribute_t)); + kmem_free(zc, sizeof (zap_cursor_t)); + + dsl_dir_rele(dd, FTAG); +} + +static void +spa_keystore_change_key_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_t *ds; + avl_index_t where; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + spa_keystore_change_key_args_t *skcka = arg; + dsl_crypto_params_t *dcp = skcka->skcka_cp; + dsl_wrapping_key_t *wkey = NULL, *found_wkey; + dsl_wrapping_key_t wkey_search; + char *keylocation = dcp->cp_keylocation; + uint64_t rddobj, new_rddobj; + + /* create and initialize the wrapping key */ + VERIFY0(dsl_dataset_hold(dp, skcka->skcka_dsname, FTAG, &ds)); + ASSERT(!ds->ds_is_snapshot); + + if (dcp->cp_cmd == DCP_CMD_NEW_KEY || + dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY) { + /* + * We are changing to a new wkey. Set additional properties + * which can be sent along with this ioctl. Note that this + * command can set keylocation even if it can't normally be + * set via 'zfs set' due to a non-local keylocation. + */ + if (dcp->cp_cmd == DCP_CMD_NEW_KEY) { + wkey = dcp->cp_wkey; + wkey->wk_ddobj = ds->ds_dir->dd_object; + } else { + keylocation = "prompt"; + } + + if (keylocation != NULL) { + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1, + keylocation, tx); + } + + VERIFY0(dsl_dir_get_encryption_root_ddobj(ds->ds_dir, &rddobj)); + new_rddobj = ds->ds_dir->dd_object; + } else { + /* + * We are inheriting the parent's wkey. Unset any local + * keylocation and grab a reference to the wkey. + */ + if (dcp->cp_cmd == DCP_CMD_INHERIT) { + VERIFY0(spa_keystore_wkey_hold_dd(spa, + ds->ds_dir->dd_parent, FTAG, &wkey)); + } + + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), ZPROP_SRC_NONE, + 0, 0, NULL, tx); + + rddobj = ds->ds_dir->dd_object; + VERIFY0(dsl_dir_get_encryption_root_ddobj(ds->ds_dir->dd_parent, + &new_rddobj)); + } + + if (wkey == NULL) { + ASSERT(dcp->cp_cmd == DCP_CMD_FORCE_INHERIT || + dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY); + } + + rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER); + + /* recurse through all children and rewrap their keys */ + spa_keystore_change_key_sync_impl(rddobj, ds->ds_dir->dd_object, + new_rddobj, wkey, tx); + + /* + * All references to the old wkey should be released now (if it + * existed). Replace the wrapping key. + */ + wkey_search.wk_ddobj = ds->ds_dir->dd_object; + found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, &wkey_search, NULL); + if (found_wkey != NULL) { + ASSERT0(zfs_refcount_count(&found_wkey->wk_refcnt)); + avl_remove(&spa->spa_keystore.sk_wkeys, found_wkey); + dsl_wrapping_key_free(found_wkey); + } + + if (dcp->cp_cmd == DCP_CMD_NEW_KEY) { + (void) avl_find(&spa->spa_keystore.sk_wkeys, wkey, &where); + avl_insert(&spa->spa_keystore.sk_wkeys, wkey, where); + } else if (wkey != NULL) { + dsl_wrapping_key_rele(wkey, FTAG); + } + + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + + dsl_dataset_rele(ds, FTAG); +} + +int +spa_keystore_change_key(const char *dsname, dsl_crypto_params_t *dcp) +{ + spa_keystore_change_key_args_t skcka; + + /* initialize the args struct */ + skcka.skcka_dsname = dsname; + skcka.skcka_cp = dcp; + + /* + * Perform the actual work in syncing context. The blocks modified + * here could be calculated but it would require holding the pool + * lock and traversing all of the datasets that will have their keys + * changed. + */ + return (dsl_sync_task(dsname, spa_keystore_change_key_check, + spa_keystore_change_key_sync, &skcka, 15, + ZFS_SPACE_CHECK_RESERVED)); +} + +int +dsl_dir_rename_crypt_check(dsl_dir_t *dd, dsl_dir_t *newparent) +{ + int ret; + uint64_t curr_rddobj, parent_rddobj; + + if (dd->dd_crypto_obj == 0) { + /* children of encrypted parents must be encrypted */ + if (newparent->dd_crypto_obj != 0) { + ret = SET_ERROR(EACCES); + goto error; + } + + return (0); + } + + ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj); + if (ret != 0) + goto error; + + /* + * if this is not an encryption root, we must make sure we are not + * moving dd to a new encryption root + */ + if (dd->dd_object != curr_rddobj) { + ret = dsl_dir_get_encryption_root_ddobj(newparent, + &parent_rddobj); + if (ret != 0) + goto error; + + if (parent_rddobj != curr_rddobj) { + ret = SET_ERROR(EACCES); + goto error; + } + } + + return (0); + +error: + return (ret); +} + +/* + * Check to make sure that a promote from targetdd to origindd will not require + * any key rewraps. + */ +int +dsl_dataset_promote_crypt_check(dsl_dir_t *target, dsl_dir_t *origin) +{ + int ret; + uint64_t rddobj, op_rddobj, tp_rddobj; + + /* If the dataset is not encrypted we don't need to check anything */ + if (origin->dd_crypto_obj == 0) + return (0); + + /* + * If we are not changing the first origin snapshot in a chain + * the encryption root won't change either. + */ + if (dsl_dir_is_clone(origin)) + return (0); + + /* + * If the origin is the encryption root we will update + * the DSL Crypto Key to point to the target instead. + */ + ret = dsl_dir_get_encryption_root_ddobj(origin, &rddobj); + if (ret != 0) + return (ret); + + if (rddobj == origin->dd_object) + return (0); + + /* + * The origin is inheriting its encryption root from its parent. + * Check that the parent of the target has the same encryption root. + */ + ret = dsl_dir_get_encryption_root_ddobj(origin->dd_parent, &op_rddobj); + if (ret != 0) + return (ret); + + ret = dsl_dir_get_encryption_root_ddobj(target->dd_parent, &tp_rddobj); + if (ret != 0) + return (ret); + + if (op_rddobj != tp_rddobj) + return (SET_ERROR(EACCES)); + + return (0); +} + +void +dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, + dmu_tx_t *tx) +{ + uint64_t rddobj; + dsl_pool_t *dp = target->dd_pool; + dsl_dataset_t *targetds; + dsl_dataset_t *originds; + char *keylocation; + + if (origin->dd_crypto_obj == 0) + return; + if (dsl_dir_is_clone(origin)) + return; + + VERIFY0(dsl_dir_get_encryption_root_ddobj(origin, &rddobj)); + + if (rddobj != origin->dd_object) + return; + + /* + * If the target is being promoted to the encryption root update the + * DSL Crypto Key and keylocation to reflect that. We also need to + * update the DSL Crypto Keys of all children inheriting their + * encryption root to point to the new target. Otherwise, the check + * function ensured that the encryption root will not change. + */ + keylocation = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); + + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(target)->dd_head_dataset_obj, FTAG, &targetds)); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(origin)->dd_head_dataset_obj, FTAG, &originds)); + + VERIFY0(dsl_prop_get_dd(origin, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + 1, ZAP_MAXVALUELEN, keylocation, NULL, B_FALSE)); + dsl_prop_set_sync_impl(targetds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1, keylocation, tx); + dsl_prop_set_sync_impl(originds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + ZPROP_SRC_NONE, 0, 0, NULL, tx); + + rw_enter(&dp->dp_spa->spa_keystore.sk_wkeys_lock, RW_WRITER); + spa_keystore_change_key_sync_impl(rddobj, origin->dd_object, + target->dd_object, NULL, tx); + rw_exit(&dp->dp_spa->spa_keystore.sk_wkeys_lock); + + dsl_dataset_rele(targetds, FTAG); + dsl_dataset_rele(originds, FTAG); + kmem_free(keylocation, ZAP_MAXVALUELEN); +} + +int +dmu_objset_clone_crypt_check(dsl_dir_t *parentdd, dsl_dir_t *origindd) +{ + int ret; + uint64_t pcrypt, crypt; + + /* + * Check that we are not making an unencrypted child of an + * encrypted parent. + */ + ret = dsl_dir_get_crypt(parentdd, &pcrypt); + if (ret != 0) + return (ret); + + ret = dsl_dir_get_crypt(origindd, &crypt); + if (ret != 0) + return (ret); + + ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT); + ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT); + + if (crypt == ZIO_CRYPT_OFF && pcrypt != ZIO_CRYPT_OFF) + return (SET_ERROR(EINVAL)); + + return (0); +} + + +int +dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp, + boolean_t *will_encrypt) +{ + int ret; + uint64_t pcrypt, crypt; + dsl_crypto_params_t dummy_dcp = { 0 }; + + if (will_encrypt != NULL) + *will_encrypt = B_FALSE; + + if (dcp == NULL) + dcp = &dummy_dcp; + + if (dcp->cp_cmd != DCP_CMD_NONE) + return (SET_ERROR(EINVAL)); + + if (parentdd != NULL) { + ret = dsl_dir_get_crypt(parentdd, &pcrypt); + if (ret != 0) + return (ret); + } else { + pcrypt = ZIO_CRYPT_OFF; + } + + crypt = (dcp->cp_crypt == ZIO_CRYPT_INHERIT) ? pcrypt : dcp->cp_crypt; + + ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT); + ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT); + + /* + * We can't create an unencrypted child of an encrypted parent + * under any circumstances. + */ + if (crypt == ZIO_CRYPT_OFF && pcrypt != ZIO_CRYPT_OFF) + return (SET_ERROR(EINVAL)); + + /* check for valid dcp with no encryption (inherited or local) */ + if (crypt == ZIO_CRYPT_OFF) { + /* Must not specify encryption params */ + if (dcp->cp_wkey != NULL || + (dcp->cp_keylocation != NULL && + strcmp(dcp->cp_keylocation, "none") != 0)) + return (SET_ERROR(EINVAL)); + + return (0); + } + + if (will_encrypt != NULL) + *will_encrypt = B_TRUE; + + /* + * We will now definitely be encrypting. Check the feature flag. When + * creating the pool the caller will check this for us since we won't + * technically have the feature activated yet. + */ + if (parentdd != NULL && + !spa_feature_is_enabled(parentdd->dd_pool->dp_spa, + SPA_FEATURE_ENCRYPTION)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + /* check for errata #4 (encryption enabled, bookmark_v2 disabled) */ + if (parentdd != NULL && + !spa_feature_is_enabled(parentdd->dd_pool->dp_spa, + SPA_FEATURE_BOOKMARK_V2)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + /* handle inheritance */ + if (dcp->cp_wkey == NULL) { + ASSERT3P(parentdd, !=, NULL); + + /* key must be fully unspecified */ + if (dcp->cp_keylocation != NULL) + return (SET_ERROR(EINVAL)); + + /* parent must have a key to inherit */ + if (pcrypt == ZIO_CRYPT_OFF) + return (SET_ERROR(EINVAL)); + + /* check for parent key */ + ret = dmu_objset_check_wkey_loaded(parentdd); + if (ret != 0) + return (ret); + + return (0); + } + + /* At this point we should have a fully specified key. Check location */ + if (dcp->cp_keylocation == NULL || + !zfs_prop_valid_keylocation(dcp->cp_keylocation, B_TRUE)) + return (SET_ERROR(EINVAL)); + + /* Must have fully specified keyformat */ + switch (dcp->cp_wkey->wk_keyformat) { + case ZFS_KEYFORMAT_HEX: + case ZFS_KEYFORMAT_RAW: + /* requires no pbkdf2 iters and salt */ + if (dcp->cp_wkey->wk_salt != 0 || + dcp->cp_wkey->wk_iters != 0) + return (SET_ERROR(EINVAL)); + break; + case ZFS_KEYFORMAT_PASSPHRASE: + /* requires pbkdf2 iters and salt */ + if (dcp->cp_wkey->wk_salt == 0 || + dcp->cp_wkey->wk_iters < MIN_PBKDF2_ITERATIONS) + return (SET_ERROR(EINVAL)); + break; + case ZFS_KEYFORMAT_NONE: + default: + /* keyformat must be specified and valid */ + return (SET_ERROR(EINVAL)); + } + + return (0); +} + +void +dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd, + dsl_dataset_t *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dd->dd_pool; + uint64_t crypt; + dsl_wrapping_key_t *wkey; + + /* clones always use their origin's wrapping key */ + if (dsl_dir_is_clone(dd)) { + ASSERT3P(dcp, ==, NULL); + + /* + * If this is an encrypted clone we just need to clone the + * dck into dd. Zapify the dd so we can do that. + */ + if (origin->ds_dir->dd_crypto_obj != 0) { + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dsl_dir_zapify(dd, tx); + + dd->dd_crypto_obj = + dsl_crypto_key_clone_sync(origin->ds_dir, tx); + VERIFY0(zap_add(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, + &dd->dd_crypto_obj, tx)); + } + + return; + } + + /* + * A NULL dcp at this point indicates this is the origin dataset + * which does not have an objset to encrypt. Raw receives will handle + * encryption separately later. In both cases we can simply return. + */ + if (dcp == NULL || dcp->cp_cmd == DCP_CMD_RAW_RECV) + return; + + crypt = dcp->cp_crypt; + wkey = dcp->cp_wkey; + + /* figure out the effective crypt */ + if (crypt == ZIO_CRYPT_INHERIT && dd->dd_parent != NULL) + VERIFY0(dsl_dir_get_crypt(dd->dd_parent, &crypt)); + + /* if we aren't doing encryption just return */ + if (crypt == ZIO_CRYPT_OFF || crypt == ZIO_CRYPT_INHERIT) + return; + + /* zapify the dd so that we can add the crypto key obj to it */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dsl_dir_zapify(dd, tx); + + /* use the new key if given or inherit from the parent */ + if (wkey == NULL) { + VERIFY0(spa_keystore_wkey_hold_dd(dp->dp_spa, + dd->dd_parent, FTAG, &wkey)); + } else { + wkey->wk_ddobj = dd->dd_object; + } + + ASSERT3P(wkey, !=, NULL); + + /* Create or clone the DSL crypto key and activate the feature */ + dd->dd_crypto_obj = dsl_crypto_key_create_sync(crypt, wkey, tx); + VERIFY0(zap_add(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, &dd->dd_crypto_obj, + tx)); + dsl_dataset_activate_feature(dsobj, SPA_FEATURE_ENCRYPTION, tx); + + /* + * If we inherited the wrapping key we release our reference now. + * Otherwise, this is a new key and we need to load it into the + * keystore. + */ + if (dcp->cp_wkey == NULL) { + dsl_wrapping_key_rele(wkey, FTAG); + } else { + VERIFY0(spa_keystore_load_wkey_impl(dp->dp_spa, wkey)); + } +} + +typedef struct dsl_crypto_recv_key_arg { + uint64_t dcrka_dsobj; + uint64_t dcrka_fromobj; + dmu_objset_type_t dcrka_ostype; + nvlist_t *dcrka_nvl; + boolean_t dcrka_do_key; +} dsl_crypto_recv_key_arg_t; + +static int +dsl_crypto_recv_raw_objset_check(dsl_dataset_t *ds, dsl_dataset_t *fromds, + dmu_objset_type_t ostype, nvlist_t *nvl, dmu_tx_t *tx) +{ + int ret; + objset_t *os; + dnode_t *mdn; + uint8_t *buf = NULL; + uint_t len; + uint64_t intval, nlevels, blksz, ibs; + uint64_t nblkptr, maxblkid; + + if (ostype != DMU_OST_ZFS && ostype != DMU_OST_ZVOL) + return (SET_ERROR(EINVAL)); + + /* raw receives also need info about the structure of the metadnode */ + ret = nvlist_lookup_uint64(nvl, "mdn_compress", &intval); + if (ret != 0 || intval >= ZIO_COMPRESS_LEGACY_FUNCTIONS) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint64(nvl, "mdn_checksum", &intval); + if (ret != 0 || intval >= ZIO_CHECKSUM_LEGACY_FUNCTIONS) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint64(nvl, "mdn_nlevels", &nlevels); + if (ret != 0 || nlevels > DN_MAX_LEVELS) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint64(nvl, "mdn_blksz", &blksz); + if (ret != 0 || blksz < SPA_MINBLOCKSIZE) + return (SET_ERROR(EINVAL)); + else if (blksz > spa_maxblocksize(tx->tx_pool->dp_spa)) + return (SET_ERROR(ENOTSUP)); + + ret = nvlist_lookup_uint64(nvl, "mdn_indblkshift", &ibs); + if (ret != 0 || ibs < DN_MIN_INDBLKSHIFT || ibs > DN_MAX_INDBLKSHIFT) + return (SET_ERROR(ENOTSUP)); + + ret = nvlist_lookup_uint64(nvl, "mdn_nblkptr", &nblkptr); + if (ret != 0 || nblkptr != DN_MAX_NBLKPTR) + return (SET_ERROR(ENOTSUP)); + + ret = nvlist_lookup_uint64(nvl, "mdn_maxblkid", &maxblkid); + if (ret != 0) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint8_array(nvl, "portable_mac", &buf, &len); + if (ret != 0 || len != ZIO_OBJSET_MAC_LEN) + return (SET_ERROR(EINVAL)); + + ret = dmu_objset_from_ds(ds, &os); + if (ret != 0) + return (ret); + + /* + * Useraccounting is not portable and must be done with the keys loaded. + * Therefore, whenever we do any kind of receive the useraccounting + * must not be present. + */ + ASSERT0(os->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE); + + mdn = DMU_META_DNODE(os); + + /* + * If we already created the objset, make sure its unchangeable + * properties match the ones received in the nvlist. + */ + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + if (!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) && + (mdn->dn_nlevels != nlevels || mdn->dn_datablksz != blksz || + mdn->dn_indblkshift != ibs || mdn->dn_nblkptr != nblkptr)) { + rrw_exit(&ds->ds_bp_rwlock, FTAG); + return (SET_ERROR(EINVAL)); + } + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + /* + * Check that the ivset guid of the fromds matches the one from the + * send stream. Older versions of the encryption code did not have + * an ivset guid on the from dataset and did not send one in the + * stream. For these streams we provide the + * zfs_disable_ivset_guid_check tunable to allow these datasets to + * be received with a generated ivset guid. + */ + if (fromds != NULL && !zfs_disable_ivset_guid_check) { + uint64_t from_ivset_guid = 0; + intval = 0; + + (void) nvlist_lookup_uint64(nvl, "from_ivset_guid", &intval); + (void) zap_lookup(tx->tx_pool->dp_meta_objset, + fromds->ds_object, DS_FIELD_IVSET_GUID, + sizeof (from_ivset_guid), 1, &from_ivset_guid); + + if (intval == 0 || from_ivset_guid == 0) + return (SET_ERROR(ZFS_ERR_FROM_IVSET_GUID_MISSING)); + + if (intval != from_ivset_guid) + return (SET_ERROR(ZFS_ERR_FROM_IVSET_GUID_MISMATCH)); + } + + /* + * Check that the ivset guid of the fromds matches the one from the + * send stream. Older versions of the encryption code did not have + * an ivset guid on the from dataset and did not send one in the + * stream. For these streams we provide the + * zfs_disable_ivset_guid_check tunable to allow these datasets to + * be received with a generated ivset guid. + */ + if (fromds != NULL && !zfs_disable_ivset_guid_check) { + uint64_t from_ivset_guid = 0; + intval = 0; + + (void) nvlist_lookup_uint64(nvl, "from_ivset_guid", &intval); + (void) zap_lookup(tx->tx_pool->dp_meta_objset, + fromds->ds_object, DS_FIELD_IVSET_GUID, + sizeof (from_ivset_guid), 1, &from_ivset_guid); + + if (intval == 0 || from_ivset_guid == 0) + return (SET_ERROR(ZFS_ERR_FROM_IVSET_GUID_MISSING)); + + if (intval != from_ivset_guid) + return (SET_ERROR(ZFS_ERR_FROM_IVSET_GUID_MISMATCH)); + } + + return (0); +} + +static void +dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype, + nvlist_t *nvl, dmu_tx_t *tx) +{ + dsl_pool_t *dp = tx->tx_pool; + objset_t *os; + dnode_t *mdn; + zio_t *zio; + uint8_t *portable_mac; + uint_t len; + uint64_t compress, checksum, nlevels, blksz, ibs, maxblkid; + boolean_t newds = B_FALSE; + + VERIFY0(dmu_objset_from_ds(ds, &os)); + mdn = DMU_META_DNODE(os); + + /* + * Fetch the values we need from the nvlist. "to_ivset_guid" must + * be set on the snapshot, which doesn't exist yet. The receive + * code will take care of this for us later. + */ + compress = fnvlist_lookup_uint64(nvl, "mdn_compress"); + checksum = fnvlist_lookup_uint64(nvl, "mdn_checksum"); + nlevels = fnvlist_lookup_uint64(nvl, "mdn_nlevels"); + blksz = fnvlist_lookup_uint64(nvl, "mdn_blksz"); + ibs = fnvlist_lookup_uint64(nvl, "mdn_indblkshift"); + maxblkid = fnvlist_lookup_uint64(nvl, "mdn_maxblkid"); + VERIFY0(nvlist_lookup_uint8_array(nvl, "portable_mac", &portable_mac, + &len)); + + /* if we haven't created an objset for the ds yet, do that now */ + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + if (BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) { + (void) dmu_objset_create_impl_dnstats(dp->dp_spa, ds, + dsl_dataset_get_blkptr(ds), ostype, nlevels, blksz, + ibs, tx); + newds = B_TRUE; + } + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + /* + * Set the portable MAC. The local MAC will always be zero since the + * incoming data will all be portable and user accounting will be + * deferred until the next mount. Afterwards, flag the os to be + * written out raw next time. + */ + arc_release(os->os_phys_buf, &os->os_phys_buf); + bcopy(portable_mac, os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN); + bzero(os->os_phys->os_local_mac, ZIO_OBJSET_MAC_LEN); + os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; + + /* set metadnode compression and checksum */ + mdn->dn_compress = compress; + mdn->dn_checksum = checksum; + + rw_enter(&mdn->dn_struct_rwlock, RW_WRITER); + dnode_new_blkid(mdn, maxblkid, tx, B_FALSE, B_TRUE); + rw_exit(&mdn->dn_struct_rwlock); + + /* + * We can't normally dirty the dataset in syncing context unless + * we are creating a new dataset. In this case, we perform a + * pseudo txg sync here instead. + */ + if (newds) { + dsl_dataset_dirty(ds, tx); + } else { + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + dsl_dataset_sync(ds, zio, tx); + VERIFY0(zio_wait(zio)); + + /* dsl_dataset_sync_done will drop this reference. */ + dmu_buf_add_ref(ds->ds_dbuf, ds); + dsl_dataset_sync_done(ds, tx); + } +} + +int +dsl_crypto_recv_raw_key_check(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx) +{ + int ret; + objset_t *mos = tx->tx_pool->dp_meta_objset; + uint8_t *buf = NULL; + uint_t len; + uint64_t intval, key_guid, version; + boolean_t is_passphrase = B_FALSE; + + ASSERT(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT); + + /* + * Read and check all the encryption values from the nvlist. We need + * all of the fields of a DSL Crypto Key, as well as a fully specified + * wrapping key. + */ + ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, &intval); + if (ret != 0 || intval >= ZIO_CRYPT_FUNCTIONS || + intval <= ZIO_CRYPT_OFF) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID, &intval); + if (ret != 0) + return (SET_ERROR(EINVAL)); + + /* + * If this is an incremental receive make sure the given key guid + * matches the one we already have. + */ + if (ds->ds_dir->dd_crypto_obj != 0) { + ret = zap_lookup(mos, ds->ds_dir->dd_crypto_obj, + DSL_CRYPTO_KEY_GUID, 8, 1, &key_guid); + if (ret != 0) + return (ret); + if (intval != key_guid) + return (SET_ERROR(EACCES)); + } + + ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY, + &buf, &len); + if (ret != 0 || len != MASTER_KEY_MAX_LEN) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY, + &buf, &len); + if (ret != 0 || len != SHA512_HMAC_KEYLEN) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_IV, &buf, &len); + if (ret != 0 || len != WRAPPING_IV_LEN) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, &buf, &len); + if (ret != 0 || len != WRAPPING_MAC_LEN) + return (SET_ERROR(EINVAL)); + + /* + * We don't support receiving old on-disk formats. The version 0 + * implementation protected several fields in an objset that were + * not always portable during a raw receive. As a result, we call + * the old version an on-disk errata #3. + */ + ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_VERSION, &version); + if (ret != 0 || version != ZIO_CRYPT_KEY_CURRENT_VERSION) + return (SET_ERROR(ENOTSUP)); + + ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), + &intval); + if (ret != 0 || intval >= ZFS_KEYFORMAT_FORMATS || + intval == ZFS_KEYFORMAT_NONE) + return (SET_ERROR(EINVAL)); + + is_passphrase = (intval == ZFS_KEYFORMAT_PASSPHRASE); + + /* + * for raw receives we allow any number of pbkdf2iters since there + * won't be a chance for the user to change it. + */ + ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), + &intval); + if (ret != 0 || (is_passphrase == (intval == 0))) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), + &intval); + if (ret != 0 || (is_passphrase == (intval == 0))) + return (SET_ERROR(EINVAL)); + + return (0); +} + +void +dsl_crypto_recv_raw_key_sync(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx) +{ + dsl_pool_t *dp = tx->tx_pool; + objset_t *mos = dp->dp_meta_objset; + dsl_dir_t *dd = ds->ds_dir; + uint_t len; + uint64_t rddobj, one = 1; + uint8_t *keydata, *hmac_keydata, *iv, *mac; + uint64_t crypt, key_guid, keyformat, iters, salt; + uint64_t version = ZIO_CRYPT_KEY_CURRENT_VERSION; + char *keylocation = "prompt"; + + /* lookup the values we need to create the DSL Crypto Key */ + crypt = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE); + key_guid = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID); + keyformat = fnvlist_lookup_uint64(nvl, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT)); + iters = fnvlist_lookup_uint64(nvl, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS)); + salt = fnvlist_lookup_uint64(nvl, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT)); + VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY, + &keydata, &len)); + VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY, + &hmac_keydata, &len)); + VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_IV, &iv, &len)); + VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, &mac, &len)); + + /* if this is a new dataset setup the DSL Crypto Key. */ + if (dd->dd_crypto_obj == 0) { + /* zapify the dsl dir so we can add the key object to it */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dsl_dir_zapify(dd, tx); + + /* create the DSL Crypto Key on disk and activate the feature */ + dd->dd_crypto_obj = zap_create(mos, + DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + dd->dd_crypto_obj, DSL_CRYPTO_KEY_REFCOUNT, + sizeof (uint64_t), 1, &one, tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + dd->dd_crypto_obj, DSL_CRYPTO_KEY_VERSION, + sizeof (uint64_t), 1, &version, tx)); + + dsl_dataset_activate_feature(ds->ds_object, + SPA_FEATURE_ENCRYPTION, tx); + ds->ds_feature_inuse[SPA_FEATURE_ENCRYPTION] = B_TRUE; + + /* save the dd_crypto_obj on disk */ + VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_CRYPTO_KEY_OBJ, + sizeof (uint64_t), 1, &dd->dd_crypto_obj, tx)); + + /* + * Set the keylocation to prompt by default. If keylocation + * has been provided via the properties, this will be overridden + * later. + */ + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1, + keylocation, tx); + + rddobj = dd->dd_object; + } else { + VERIFY0(dsl_dir_get_encryption_root_ddobj(dd, &rddobj)); + } + + /* sync the key data to the ZAP object on disk */ + dsl_crypto_key_sync_impl(mos, dd->dd_crypto_obj, crypt, + rddobj, key_guid, iv, mac, keydata, hmac_keydata, keyformat, salt, + iters, tx); +} + +int +dsl_crypto_recv_key_check(void *arg, dmu_tx_t *tx) +{ + int ret; + dsl_crypto_recv_key_arg_t *dcrka = arg; + dsl_dataset_t *ds = NULL, *fromds = NULL; + + ret = dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_dsobj, + FTAG, &ds); + if (ret != 0) + goto out; + + if (dcrka->dcrka_fromobj != 0) { + ret = dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_fromobj, + FTAG, &fromds); + if (ret != 0) + goto out; + } + + ret = dsl_crypto_recv_raw_objset_check(ds, fromds, + dcrka->dcrka_ostype, dcrka->dcrka_nvl, tx); + if (ret != 0) + goto out; + + /* + * We run this check even if we won't be doing this part of + * the receive now so that we don't make the user wait until + * the receive finishes to fail. + */ + ret = dsl_crypto_recv_raw_key_check(ds, dcrka->dcrka_nvl, tx); + if (ret != 0) + goto out; + +out: + if (ds != NULL) + dsl_dataset_rele(ds, FTAG); + if (fromds != NULL) + dsl_dataset_rele(fromds, FTAG); + return (ret); +} + +void +dsl_crypto_recv_key_sync(void *arg, dmu_tx_t *tx) +{ + dsl_crypto_recv_key_arg_t *dcrka = arg; + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_dsobj, + FTAG, &ds)); + dsl_crypto_recv_raw_objset_sync(ds, dcrka->dcrka_ostype, + dcrka->dcrka_nvl, tx); + if (dcrka->dcrka_do_key) + dsl_crypto_recv_raw_key_sync(ds, dcrka->dcrka_nvl, tx); + dsl_dataset_rele(ds, FTAG); +} + +/* + * This function is used to sync an nvlist representing a DSL Crypto Key and + * the associated encryption parameters. The key will be written exactly as is + * without wrapping it. + */ +int +dsl_crypto_recv_raw(const char *poolname, uint64_t dsobj, uint64_t fromobj, + dmu_objset_type_t ostype, nvlist_t *nvl, boolean_t do_key) +{ + dsl_crypto_recv_key_arg_t dcrka; + + dcrka.dcrka_dsobj = dsobj; + dcrka.dcrka_fromobj = fromobj; + dcrka.dcrka_ostype = ostype; + dcrka.dcrka_nvl = nvl; + dcrka.dcrka_do_key = do_key; + + return (dsl_sync_task(poolname, dsl_crypto_recv_key_check, + dsl_crypto_recv_key_sync, &dcrka, 1, ZFS_SPACE_CHECK_NORMAL)); +} + +int +dsl_crypto_populate_key_nvlist(dsl_dataset_t *ds, uint64_t from_ivset_guid, + nvlist_t **nvl_out) +{ + int ret; + objset_t *os; + dnode_t *mdn; + uint64_t rddobj; + nvlist_t *nvl = NULL; + uint64_t dckobj = ds->ds_dir->dd_crypto_obj; + dsl_dir_t *rdd = NULL; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t crypt = 0, key_guid = 0, format = 0; + uint64_t iters = 0, salt = 0, version = 0; + uint64_t to_ivset_guid = 0; + uint8_t raw_keydata[MASTER_KEY_MAX_LEN]; + uint8_t raw_hmac_keydata[SHA512_HMAC_KEYLEN]; + uint8_t iv[WRAPPING_IV_LEN]; + uint8_t mac[WRAPPING_MAC_LEN]; + + ASSERT(dckobj != 0); + + VERIFY0(dmu_objset_from_ds(ds, &os)); + mdn = DMU_META_DNODE(os); + + ret = nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP); + if (ret != 0) + goto error; + + /* lookup values from the DSL Crypto Key */ + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, + &crypt); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &key_guid); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1, + MASTER_KEY_MAX_LEN, raw_keydata); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1, + SHA512_HMAC_KEYLEN, raw_hmac_keydata); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN, + iv); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN, + mac); + if (ret != 0) + goto error; + + /* see zfs_disable_ivset_guid_check tunable for errata info */ + ret = zap_lookup(mos, ds->ds_object, DS_FIELD_IVSET_GUID, 8, 1, + &to_ivset_guid); + if (ret != 0) + ASSERT3U(dp->dp_spa->spa_errata, !=, 0); + + /* + * We don't support raw sends of legacy on-disk formats. See the + * comment in dsl_crypto_recv_key_check() for details. + */ + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_VERSION, 8, 1, &version); + if (ret != 0 || version != ZIO_CRYPT_KEY_CURRENT_VERSION) { + dp->dp_spa->spa_errata = ZPOOL_ERRATA_ZOL_6845_ENCRYPTION; + ret = SET_ERROR(ENOTSUP); + goto error; + } + + /* + * Lookup wrapping key properties. An early version of the code did + * not correctly add these values to the wrapping key or the DSL + * Crypto Key on disk for non encryption roots, so to be safe we + * always take the slightly circuitous route of looking it up from + * the encryption root's key. + */ + ret = dsl_dir_get_encryption_root_ddobj(ds->ds_dir, &rddobj); + if (ret != 0) + goto error; + + dsl_pool_config_enter(dp, FTAG); + + ret = dsl_dir_hold_obj(dp, rddobj, NULL, FTAG, &rdd); + if (ret != 0) + goto error_unlock; + + ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &format); + if (ret != 0) + goto error_unlock; + + if (format == ZFS_KEYFORMAT_PASSPHRASE) { + ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &iters); + if (ret != 0) + goto error_unlock; + + ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &salt); + if (ret != 0) + goto error_unlock; + } + + dsl_dir_rele(rdd, FTAG); + dsl_pool_config_exit(dp, FTAG); + + fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, crypt); + fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_GUID, key_guid); + fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_VERSION, version); + VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY, + raw_keydata, MASTER_KEY_MAX_LEN)); + VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY, + raw_hmac_keydata, SHA512_HMAC_KEYLEN)); + VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_IV, iv, + WRAPPING_IV_LEN)); + VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, mac, + WRAPPING_MAC_LEN)); + VERIFY0(nvlist_add_uint8_array(nvl, "portable_mac", + os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN)); + fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), format); + fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), iters); + fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), salt); + fnvlist_add_uint64(nvl, "mdn_checksum", mdn->dn_checksum); + fnvlist_add_uint64(nvl, "mdn_compress", mdn->dn_compress); + fnvlist_add_uint64(nvl, "mdn_nlevels", mdn->dn_nlevels); + fnvlist_add_uint64(nvl, "mdn_blksz", mdn->dn_datablksz); + fnvlist_add_uint64(nvl, "mdn_indblkshift", mdn->dn_indblkshift); + fnvlist_add_uint64(nvl, "mdn_nblkptr", mdn->dn_nblkptr); + fnvlist_add_uint64(nvl, "mdn_maxblkid", mdn->dn_maxblkid); + fnvlist_add_uint64(nvl, "to_ivset_guid", to_ivset_guid); + fnvlist_add_uint64(nvl, "from_ivset_guid", from_ivset_guid); + + *nvl_out = nvl; + return (0); + +error_unlock: + dsl_pool_config_exit(dp, FTAG); +error: + if (rdd != NULL) + dsl_dir_rele(rdd, FTAG); + nvlist_free(nvl); + + *nvl_out = NULL; + return (ret); +} + +uint64_t +dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey, + dmu_tx_t *tx) +{ + dsl_crypto_key_t dck; + uint64_t version = ZIO_CRYPT_KEY_CURRENT_VERSION; + uint64_t one = 1ULL; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(crypt, >, ZIO_CRYPT_OFF); + + /* create the DSL Crypto Key ZAP object */ + dck.dck_obj = zap_create(tx->tx_pool->dp_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); + + /* fill in the key (on the stack) and sync it to disk */ + dck.dck_wkey = wkey; + VERIFY0(zio_crypt_key_init(crypt, &dck.dck_key)); + + dsl_crypto_key_sync(&dck, tx); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, dck.dck_obj, + DSL_CRYPTO_KEY_REFCOUNT, sizeof (uint64_t), 1, &one, tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, dck.dck_obj, + DSL_CRYPTO_KEY_VERSION, sizeof (uint64_t), 1, &version, tx)); + + zio_crypt_key_destroy(&dck.dck_key); + bzero(&dck.dck_key, sizeof (zio_crypt_key_t)); + + return (dck.dck_obj); +} + +uint64_t +dsl_crypto_key_clone_sync(dsl_dir_t *origindd, dmu_tx_t *tx) +{ + objset_t *mos = tx->tx_pool->dp_meta_objset; + + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY0(zap_increment(mos, origindd->dd_crypto_obj, + DSL_CRYPTO_KEY_REFCOUNT, 1, tx)); + + return (origindd->dd_crypto_obj); +} + +void +dsl_crypto_key_destroy_sync(uint64_t dckobj, dmu_tx_t *tx) +{ + objset_t *mos = tx->tx_pool->dp_meta_objset; + uint64_t refcnt; + + /* Decrement the refcount, destroy if this is the last reference */ + VERIFY0(zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_REFCOUNT, + sizeof (uint64_t), 1, &refcnt)); + + if (refcnt != 1) { + VERIFY0(zap_increment(mos, dckobj, DSL_CRYPTO_KEY_REFCOUNT, + -1, tx)); + } else { + VERIFY0(zap_destroy(mos, dckobj, tx)); + } +} + +void +dsl_dataset_crypt_stats(dsl_dataset_t *ds, nvlist_t *nv) +{ + uint64_t intval; + dsl_dir_t *dd = ds->ds_dir; + dsl_dir_t *enc_root; + char buf[ZFS_MAX_DATASET_NAME_LEN]; + + if (dd->dd_crypto_obj == 0) + return; + + intval = dsl_dataset_get_keystatus(dd); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEYSTATUS, intval); + + if (dsl_dir_get_crypt(dd, &intval) == 0) + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_ENCRYPTION, intval); + if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + DSL_CRYPTO_KEY_GUID, 8, 1, &intval) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEY_GUID, intval); + } + if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &intval) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEYFORMAT, intval); + } + if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &intval) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_PBKDF2_SALT, intval); + } + if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &intval) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_PBKDF2_ITERS, intval); + } + if (zap_lookup(dd->dd_pool->dp_meta_objset, ds->ds_object, + DS_FIELD_IVSET_GUID, 8, 1, &intval) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_IVSET_GUID, intval); + } + + if (dsl_dir_get_encryption_root_ddobj(dd, &intval) == 0) { + VERIFY0(dsl_dir_hold_obj(dd->dd_pool, intval, NULL, FTAG, + &enc_root)); + dsl_dir_name(enc_root, buf); + dsl_dir_rele(enc_root, FTAG); + dsl_prop_nvlist_add_string(nv, ZFS_PROP_ENCRYPTION_ROOT, buf); + } +} + +int +spa_crypt_get_salt(spa_t *spa, uint64_t dsobj, uint8_t *salt) +{ + int ret; + dsl_crypto_key_t *dck = NULL; + + /* look up the key from the spa's keystore */ + ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck); + if (ret != 0) + goto error; + + ret = zio_crypt_key_get_salt(&dck->dck_key, salt); + if (ret != 0) + goto error; + + spa_keystore_dsl_key_rele(spa, dck, FTAG); + return (0); + +error: + if (dck != NULL) + spa_keystore_dsl_key_rele(spa, dck, FTAG); + return (ret); +} + +/* + * Objset blocks are a special case for MAC generation. These blocks have 2 + * 256-bit MACs which are embedded within the block itself, rather than a + * single 128 bit MAC. As a result, this function handles encoding and decoding + * the MACs on its own, unlike other functions in this file. + */ +int +spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, + abd_t *abd, uint_t datalen, boolean_t byteswap) +{ + int ret; + dsl_crypto_key_t *dck = NULL; + void *buf = abd_borrow_buf_copy(abd, datalen); + objset_phys_t *osp = buf; + uint8_t portable_mac[ZIO_OBJSET_MAC_LEN]; + uint8_t local_mac[ZIO_OBJSET_MAC_LEN]; + + /* look up the key from the spa's keystore */ + ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck); + if (ret != 0) + goto error; + + /* calculate both HMACs */ + ret = zio_crypt_do_objset_hmacs(&dck->dck_key, buf, datalen, + byteswap, portable_mac, local_mac); + if (ret != 0) + goto error; + + spa_keystore_dsl_key_rele(spa, dck, FTAG); + + /* if we are generating encode the HMACs in the objset_phys_t */ + if (generate) { + bcopy(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN); + bcopy(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN); + abd_return_buf_copy(abd, buf, datalen); + return (0); + } + + if (bcmp(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN) != 0 || + bcmp(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN) != 0) { + abd_return_buf(abd, buf, datalen); + return (SET_ERROR(ECKSUM)); + } + + abd_return_buf(abd, buf, datalen); + + return (0); + +error: + if (dck != NULL) + spa_keystore_dsl_key_rele(spa, dck, FTAG); + abd_return_buf(abd, buf, datalen); + return (ret); +} + +int +spa_do_crypt_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, abd_t *abd, + uint_t datalen, uint8_t *mac) +{ + int ret; + dsl_crypto_key_t *dck = NULL; + uint8_t *buf = abd_borrow_buf_copy(abd, datalen); + uint8_t digestbuf[ZIO_DATA_MAC_LEN]; + + /* look up the key from the spa's keystore */ + ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck); + if (ret != 0) + goto error; + + /* perform the hmac */ + ret = zio_crypt_do_hmac(&dck->dck_key, buf, datalen, + digestbuf, ZIO_DATA_MAC_LEN); + if (ret != 0) + goto error; + + abd_return_buf(abd, buf, datalen); + spa_keystore_dsl_key_rele(spa, dck, FTAG); + + /* + * Truncate and fill in mac buffer if we were asked to generate a MAC. + * Otherwise verify that the MAC matched what we expected. + */ + if (generate) { + bcopy(digestbuf, mac, ZIO_DATA_MAC_LEN); + return (0); + } + + if (bcmp(digestbuf, mac, ZIO_DATA_MAC_LEN) != 0) + return (SET_ERROR(ECKSUM)); + + return (0); + +error: + if (dck != NULL) + spa_keystore_dsl_key_rele(spa, dck, FTAG); + abd_return_buf(abd, buf, datalen); + return (ret); +} + +/* + * This function serves as a multiplexer for encryption and decryption of + * all blocks (except the L2ARC). For encryption, it will populate the IV, + * salt, MAC, and cabd (the ciphertext). On decryption it will simply use + * these fields to populate pabd (the plaintext). + */ +/* ARGSUSED */ +int +spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, const zbookmark_phys_t *zb, + dmu_object_type_t ot, boolean_t dedup, boolean_t bswap, uint8_t *salt, + uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, + boolean_t *no_crypt) +{ + int ret; + dsl_crypto_key_t *dck = NULL; + uint8_t *plainbuf = NULL, *cipherbuf = NULL; + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); + + /* look up the key from the spa's keystore */ + ret = spa_keystore_lookup_key(spa, zb->zb_objset, FTAG, &dck); + if (ret != 0) { + ret = SET_ERROR(EACCES); + return (ret); + } + + if (encrypt) { + plainbuf = abd_borrow_buf_copy(pabd, datalen); + cipherbuf = abd_borrow_buf(cabd, datalen); + } else { + plainbuf = abd_borrow_buf(pabd, datalen); + cipherbuf = abd_borrow_buf_copy(cabd, datalen); + } + + /* + * Both encryption and decryption functions need a salt for key + * generation and an IV. When encrypting a non-dedup block, we + * generate the salt and IV randomly to be stored by the caller. Dedup + * blocks perform a (more expensive) HMAC of the plaintext to obtain + * the salt and the IV. ZIL blocks have their salt and IV generated + * at allocation time in zio_alloc_zil(). On decryption, we simply use + * the provided values. + */ + if (encrypt && ot != DMU_OT_INTENT_LOG && !dedup) { + ret = zio_crypt_key_get_salt(&dck->dck_key, salt); + if (ret != 0) + goto error; + + ret = zio_crypt_generate_iv(iv); + if (ret != 0) + goto error; + } else if (encrypt && dedup) { + ret = zio_crypt_generate_iv_salt_dedup(&dck->dck_key, + plainbuf, datalen, iv, salt); + if (ret != 0) + goto error; + } + + /* call lower level function to perform encryption / decryption */ + ret = zio_do_crypt_data(encrypt, &dck->dck_key, ot, bswap, salt, iv, + mac, datalen, plainbuf, cipherbuf, no_crypt); + + /* + * Handle injected decryption faults. Unfortunately, we cannot inject + * faults for dnode blocks because we might trigger the panic in + * dbuf_prepare_encrypted_dnode_leaf(), which exists because syncing + * context is not prepared to handle malicious decryption failures. + */ + if (zio_injection_enabled && !encrypt && ot != DMU_OT_DNODE && ret == 0) + ret = zio_handle_decrypt_injection(spa, zb, ot, ECKSUM); + if (ret != 0) + goto error; + + if (encrypt) { + abd_return_buf(pabd, plainbuf, datalen); + abd_return_buf_copy(cabd, cipherbuf, datalen); + } else { + abd_return_buf_copy(pabd, plainbuf, datalen); + abd_return_buf(cabd, cipherbuf, datalen); + } + + spa_keystore_dsl_key_rele(spa, dck, FTAG); + + return (0); + +error: + if (encrypt) { + /* zero out any state we might have changed while encrypting */ + bzero(salt, ZIO_DATA_SALT_LEN); + bzero(iv, ZIO_DATA_IV_LEN); + bzero(mac, ZIO_DATA_MAC_LEN); + abd_return_buf(pabd, plainbuf, datalen); + abd_return_buf_copy(cabd, cipherbuf, datalen); + } else { + abd_return_buf_copy(pabd, plainbuf, datalen); + abd_return_buf(cabd, cipherbuf, datalen); + } + + spa_keystore_dsl_key_rele(spa, dck, FTAG); + + return (ret); +} diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c index fd4c35e000..a6061078f7 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dataset.c +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c @@ -540,6 +540,13 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_reserved = ds->ds_quota = 0; } + if (err == 0 && ds->ds_dir->dd_crypto_obj != 0 && + ds->ds_is_snapshot && + zap_contains(mos, dsobj, DS_FIELD_IVSET_GUID) != 0) { + dp->dp_spa->spa_errata = + ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; + } + dsl_deadlist_open(&ds->ds_deadlist, mos, dsl_dataset_phys(ds)->ds_deadlist_obj); uint64_t remap_deadlist_obj = @@ -591,17 +598,52 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, } } } + ASSERT3P(ds->ds_dbuf, ==, dbuf); ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data); ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 || spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); *dsp = ds; + return (0); } int -dsl_dataset_hold(dsl_pool_t *dp, const char *name, +dsl_dataset_create_key_mapping(dsl_dataset_t *ds) +{ + dsl_dir_t *dd = ds->ds_dir; + + if (dd->dd_crypto_obj == 0) + return (0); + + return (spa_keystore_create_mapping(dd->dd_pool->dp_spa, + ds, ds, &ds->ds_key_mapping)); +} + +int +dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) +{ + int err; + + err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); + if (err != 0) + return (err); + + ASSERT3P(*dsp, !=, NULL); + + if (flags & DS_HOLD_FLAG_DECRYPT) { + err = dsl_dataset_create_key_mapping(*dsp); + if (err != 0) + dsl_dataset_rele(*dsp, tag); + } + + return (err); +} + +int +dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) { dsl_dir_t *dd; @@ -617,7 +659,7 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, ASSERT(dsl_pool_config_held(dp)); obj = dsl_dir_phys(dd)->dd_head_dataset_obj; if (obj != 0) - err = dsl_dataset_hold_obj(dp, obj, tag, &ds); + err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, &ds); else err = SET_ERROR(ENOENT); @@ -626,16 +668,18 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, dsl_dataset_t *snap_ds; if (*snapname++ != '@') { - dsl_dataset_rele(ds, tag); + dsl_dataset_rele_flags(ds, flags, tag); dsl_dir_rele(dd, FTAG); return (SET_ERROR(ENOENT)); } dprintf("looking for snapshot '%s'\n", snapname); err = dsl_dataset_snap_lookup(ds, snapname, &obj); - if (err == 0) - err = dsl_dataset_hold_obj(dp, obj, tag, &snap_ds); - dsl_dataset_rele(ds, tag); + if (err == 0) { + err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, + &snap_ds); + } + dsl_dataset_rele_flags(ds, flags, tag); if (err == 0) { mutex_enter(&snap_ds->ds_lock); @@ -653,14 +697,21 @@ dsl_dataset_hold(dsl_pool_t *dp, const char *name, } int -dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, +dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag, + dsl_dataset_t **dsp) +{ + return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp)); +} + +int +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); + int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp); if (err != 0) return (err); if (!dsl_dataset_tryown(*dsp, tag)) { - dsl_dataset_rele(*dsp, tag); + dsl_dataset_rele_flags(*dsp, flags, tag); *dsp = NULL; return (SET_ERROR(EBUSY)); } @@ -668,14 +719,14 @@ dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, } int -dsl_dataset_own(dsl_pool_t *dp, const char *name, +dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold(dp, name, tag, dsp); + int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp); if (err != 0) return (err); if (!dsl_dataset_tryown(*dsp, tag)) { - dsl_dataset_rele(*dsp, tag); + dsl_dataset_rele_flags(*dsp, flags, tag); return (SET_ERROR(EBUSY)); } return (0); @@ -757,7 +808,28 @@ dsl_dataset_rele(dsl_dataset_t *ds, void *tag) } void -dsl_dataset_disown(dsl_dataset_t *ds, void *tag) +dsl_dataset_remove_key_mapping(dsl_dataset_t *ds) +{ + dsl_dir_t *dd = ds->ds_dir; + + if (dd == NULL || dd->dd_crypto_obj == 0) + return; + + (void) spa_keystore_remove_mapping(dd->dd_pool->dp_spa, + ds->ds_object, ds); +} + +void +dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) +{ + if (flags & DS_HOLD_FLAG_DECRYPT) + dsl_dataset_remove_key_mapping(ds); + + dsl_dataset_rele(ds, tag); +} + +void +dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) { ASSERT3P(ds->ds_owner, ==, tag); ASSERT(ds->ds_dbuf != NULL); @@ -766,7 +838,7 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *tag) ds->ds_owner = NULL; mutex_exit(&ds->ds_lock); dsl_dataset_long_rele(ds, tag); - dsl_dataset_rele(ds, tag); + dsl_dataset_rele_flags(ds, flags, tag); } boolean_t @@ -795,7 +867,7 @@ dsl_dataset_has_owner(dsl_dataset_t *ds) return (rv); } -static void +void dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; @@ -825,7 +897,7 @@ dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx) uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - uint64_t flags, dmu_tx_t *tx) + dsl_crypto_params_t *dcp, uint64_t flags, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; dmu_buf_t *dbuf; @@ -924,6 +996,9 @@ dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, } } + /* handle encryption */ + dsl_dataset_create_crypt_sync(dsobj, dd, origin, dcp, tx); + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; @@ -946,6 +1021,8 @@ dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) zio_t *zio; bzero(&os->os_zil_header, sizeof (os->os_zil_header)); + if (os->os_encrypted) + os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dsl_dataset_sync(ds, zio, tx); @@ -959,7 +1036,8 @@ dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) uint64_t dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, - dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) + dsl_dataset_t *origin, uint64_t flags, cred_t *cr, + dsl_crypto_params_t *dcp, dmu_tx_t *tx) { dsl_pool_t *dp = pdd->dd_pool; uint64_t dsobj, ddobj; @@ -971,7 +1049,7 @@ dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); - dsobj = dsl_dataset_create_sync_dd(dd, origin, + dsobj = dsl_dataset_create_sync_dd(dd, origin, dcp, flags & ~DS_CREATE_FLAG_NODIRTY, tx); dsl_deleg_set_create_perms(dd, tx, cr); @@ -1099,8 +1177,18 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) dp = ds->ds_dir->dd_pool; if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { + objset_t *os = ds->ds_objset; + /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, ds); + + /* if this dataset is encrypted, grab a reference to the DCK */ + if (ds->ds_dir->dd_crypto_obj != 0 && + !os->os_raw_receive && + !os->os_next_write_raw[tx->tx_txg & TXG_MASK]) { + ASSERT3P(ds->ds_key_mapping, !=, NULL); + key_mapping_add_ref(ds->ds_key_mapping, ds); + } } } @@ -1471,6 +1559,30 @@ dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx)); } + /* + * Create a ivset guid for this snapshot if the dataset is + * encrypted. This may be overridden by a raw receive. A + * previous implementation of this code did not have this + * field as part of the on-disk format for ZFS encryption + * (see errata #4). As part of the remediation for this + * issue, we ask the user to enable the bookmark_v2 feature + * which is now a dependency of the encryption feature. We + * use this as a heuristic to determine when the user has + * elected to correct any datasets created with the old code. + * As a result, we only do this step if the bookmark_v2 + * feature is enabled, which limits the number of states a + * given pool / dataset can be in with regards to terms of + * correcting the issue. + */ + if (ds->ds_dir->dd_crypto_obj != 0 && + spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2)) { + uint64_t ivset_guid = unique_create(); + + dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_IVSET_GUID, + sizeof (ivset_guid), 1, &ivset_guid, tx)); + } + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg); dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj; dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg; @@ -1750,6 +1862,11 @@ dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) os->os_synced_dnodes = NULL; } + if (os->os_encrypted) + os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_FALSE; + else + ASSERT0(os->os_next_write_raw[tx->tx_txg & TXG_MASK]); + ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx))); dmu_buf_rele(ds->ds_dbuf, ds); @@ -1874,6 +1991,10 @@ get_receive_resume_stats_impl(dsl_dataset_t *ds) DS_FIELD_RESUME_COMPRESSOK) == 0) { fnvlist_add_boolean(token_nv, "compressok"); } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_RAWOK) == 0) { + fnvlist_add_boolean(token_nv, "rawok"); + } packed = fnvlist_pack(token_nv, &packed_size); fnvlist_free(token_nv); compressed = kmem_alloc(packed_size, KM_SLEEP); @@ -2196,6 +2317,7 @@ dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { + int err; dsl_pool_t *dp = ds->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); @@ -2240,13 +2362,24 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, dsl_get_userrefs(ds)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, - dsl_get_defer_destroy(ds)); + DS_IS_DEFER_DESTROY(ds) ? 1 : 0); + dsl_dataset_crypt_stats(ds, nv); if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { - uint64_t written; - if (dsl_get_written(ds, &written) == 0) { - dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, - written); + uint64_t written, comp, uncomp; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_dataset_t *prev; + + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); + if (err == 0) { + err = dsl_dataset_space_written(prev, ds, &written, + &comp, &uncomp); + dsl_dataset_rele(prev, FTAG); + if (err == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, + written); + } } } @@ -2685,7 +2818,7 @@ dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) fnvlist_add_string(ddra->ddra_result, "target", namebuf); cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback", - ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx); + ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, NULL, tx); VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone)); @@ -2767,6 +2900,23 @@ dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) return (SET_ERROR(EXDEV)); } + snap = list_head(&ddpa->shared_snaps); + if (snap == NULL) { + err = SET_ERROR(ENOENT); + goto out; + } + origin_ds = snap->ds; + + /* + * Encrypted clones share a DSL Crypto Key with their origin's dsl dir. + * When doing a promote we must make sure the encryption root for + * both the target and the target's origin does not change to avoid + * needing to rewrap encryption keys + */ + err = dsl_dataset_promote_crypt_check(hds->ds_dir, origin_ds->ds_dir); + if (err != 0) + goto out; + /* * Compute and check the amount of space to transfer. Since this is * so expensive, don't do the preliminary check. @@ -2950,6 +3100,8 @@ dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object, NULL, FTAG, &odd)); + dsl_dataset_promote_crypt_sync(hds->ds_dir, odd, tx); + /* change origin's next snap */ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj; diff --git a/usr/src/uts/common/fs/zfs/dsl_destroy.c b/usr/src/uts/common/fs/zfs/dsl_destroy.c index 59a946cca0..40ea657095 100644 --- a/usr/src/uts/common/fs/zfs/dsl_destroy.c +++ b/usr/src/uts/common/fs/zfs/dsl_destroy.c @@ -680,8 +680,8 @@ old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) ka.ds = ds; ka.tx = tx; VERIFY0(traverse_dataset(ds, - dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST, - kill_blkptr, &ka)); + dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST | + TRAVERSE_NO_DECRYPT, kill_blkptr, &ka)); ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || dsl_dataset_phys(ds)->ds_unique_bytes == 0); } @@ -784,6 +784,11 @@ dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) for (t = 0; t < DD_USED_NUM; t++) ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]); + if (dd->dd_crypto_obj != 0) { + dsl_crypto_key_destroy_sync(dd->dd_crypto_obj, tx); + (void) spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object); + } + VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx)); VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx)); if (dsl_dir_phys(dd)->dd_clones != 0) @@ -1033,7 +1038,8 @@ dsl_destroy_head(const char *name) * remove the objects from open context so that the txg sync * is not too long. */ - error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os); + error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_FALSE, + FTAG, &os); if (error == 0) { uint64_t prev_snap_txg = dsl_dataset_phys(dmu_objset_ds(os))-> @@ -1044,7 +1050,7 @@ dsl_destroy_head(const char *name) (void) dmu_free_long_object(os, obj); /* sync out all frees */ txg_wait_synced(dmu_objset_pool(os), 0); - dmu_objset_disown(os, FTAG); + dmu_objset_disown(os, B_FALSE, FTAG); } } diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 298516f8a4..02cad5f98e 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -37,6 +37,7 @@ #include <sys/dsl_deleg.h> #include <sys/dmu_impl.h> #include <sys/spa.h> +#include <sys/spa_impl.h> #include <sys/metaslab.h> #include <sys/zap.h> #include <sys/zio.h> @@ -163,6 +164,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, { dmu_buf_t *dbuf; dsl_dir_t *dd; + dmu_object_info_t doi; int err; ASSERT(dsl_pool_config_held(dp)); @@ -171,14 +173,11 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, if (err != 0) return (err); dd = dmu_buf_get_user(dbuf); -#ifdef ZFS_DEBUG - { - dmu_object_info_t doi; - dmu_object_info_from_db(dbuf, &doi); - ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); - ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); - } -#endif + + dmu_object_info_from_db(dbuf, &doi); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); + if (dd == NULL) { dsl_dir_t *winner; @@ -186,6 +185,21 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; + + if (dsl_dir_is_zapified(dd) && + zap_contains(dp->dp_meta_objset, ddobj, + DD_FIELD_CRYPTO_KEY_OBJ) == 0) { + VERIFY0(zap_lookup(dp->dp_meta_objset, + ddobj, DD_FIELD_CRYPTO_KEY_OBJ, + sizeof (uint64_t), 1, &dd->dd_crypto_obj)); + + /* check for on-disk format errata */ + if (dsl_dir_incompatible_encryption_version(dd)) { + dp->dp_spa->spa_errata = + ZPOOL_ERRATA_ZOL_6845_ENCRYPTION; + } + } + mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); dsl_prop_init(dd); @@ -945,6 +959,7 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; + dmu_buf_rele(dbuf, FTAG); return (ddobj); @@ -1945,6 +1960,14 @@ dsl_dir_rename_check(void *arg, dmu_tx_t *tx) } } + /* check for encryption errors */ + error = dsl_dir_rename_crypt_check(dd, newparent); + if (error != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(EACCES)); + } + /* no rename into our descendant */ if (closest_common_ancestor(dd, newparent) == dd) { dsl_dir_rele(newparent, FTAG); diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 54c88b1e3c..76bae90e68 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -438,7 +438,8 @@ dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) } dsl_pool_t * -dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) +dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp, + uint64_t txg) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); @@ -451,6 +452,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) /* create and open the MOS (meta-objset) */ dp->dp_meta_objset = dmu_objset_create_impl(spa, NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); + spa->spa_meta_objset = dp->dp_meta_objset; /* create the pool directory */ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, @@ -488,11 +490,23 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) dsl_pool_create_origin(dp, tx); + /* + * Some features may be needed when creating the root dataset, so we + * create the feature objects here. + */ + if (spa_version(spa) >= SPA_VERSION_FEATURES) + spa_feature_create_zap_objects(spa, tx); + + if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF && + dcp->cp_crypt != ZIO_CRYPT_INHERIT) + spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx); + /* create the root dataset */ - obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); + obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx); /* create the root objset */ - VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); + VERIFY0(dsl_dataset_hold_obj_flags(dp, obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds)); #ifdef _KERNEL { objset_t *os; @@ -503,7 +517,7 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) zfs_create_fs(os, kcred, zplprops, tx); } #endif - dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dmu_tx_commit(tx); @@ -664,9 +678,22 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { + objset_t *os = ds->ds_objset; + ASSERT(list_link_active(&ds->ds_synced_link)); dmu_buf_rele(ds->ds_dbuf, ds); dsl_dataset_sync(ds, zio, tx); + + /* + * Release any key mappings created by calls to + * dsl_dataset_dirty() from the userquota accounting + * code paths. + */ + if (os->os_encrypted && !os->os_raw_receive && + !os->os_next_write_raw[txg & TXG_MASK]) { + ASSERT3P(ds->ds_key_mapping, !=, NULL); + key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); + } } VERIFY0(zio_wait(zio)); @@ -676,8 +703,17 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * * - move dead blocks from the pending deadlist to the on-disk deadlist * - release hold from dsl_dataset_dirty() + * - release key mapping hold from dsl_dataset_dirty() */ while ((ds = list_remove_head(&synced_datasets)) != NULL) { + objset_t *os = ds->ds_objset; + + if (os->os_encrypted && !os->os_raw_receive && + !os->os_next_write_raw[txg & TXG_MASK]) { + ASSERT3P(ds->ds_key_mapping, !=, NULL); + key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); + } + dsl_dataset_sync_done(ds, tx); } while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { @@ -1027,7 +1063,7 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) /* create the origin dir, ds, & snap-ds */ dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, - NULL, 0, kcred, tx); + NULL, 0, kcred, NULL, tx); VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c index ce0cd9b0fe..8197f0685a 100644 --- a/usr/src/uts/common/fs/zfs/dsl_prop.c +++ b/usr/src/uts/common/fs/zfs/dsl_prop.c @@ -926,7 +926,7 @@ typedef enum dsl_prop_getflags { DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */ DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */ DSL_PROP_GET_LOCAL = 0x4, /* local properties */ - DSL_PROP_GET_RECEIVED = 0x8 /* received properties */ + DSL_PROP_GET_RECEIVED = 0x8, /* received properties */ } dsl_prop_getflags_t; static int @@ -1093,6 +1093,7 @@ dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, if (err) break; } + out: return (err); } diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c index b5ef5a89e9..73634e33e2 100644 --- a/usr/src/uts/common/fs/zfs/dsl_scan.c +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c @@ -487,6 +487,43 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); + + /* + * Detect if the pool contains the signature of #2094. If it + * does properly update the scn->scn_phys structure and notify + * the administrator by setting an errata for the pool. + */ + if (err == EOVERFLOW) { + uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1]; + VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24); + VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==, + (23 * sizeof (uint64_t))); + + err = zap_lookup(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, + sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp); + if (err == 0) { + uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS]; + + if (overflow & ~DSF_VISIT_DS_AGAIN || + scn->scn_async_destroying) { + spa->spa_errata = + ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY; + return (EOVERFLOW); + } + + bcopy(zaptmp, &scn->scn_phys, + SCAN_PHYS_NUMINTS * sizeof (uint64_t)); + scn->scn_phys.scn_flags = overflow; + + /* Required scrub already in progress. */ + if (scn->scn_phys.scn_state == DSS_FINISHED || + scn->scn_phys.scn_state == DSS_CANCELED) + spa->spa_errata = + ZPOOL_ERRATA_ZOL_2094_SCRUB; + } + } + if (err == ENOENT) return (0); else if (err) @@ -1379,7 +1416,7 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) zilog = zil_alloc(dp->dp_meta_objset, zh); (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, - claim_txg); + claim_txg, B_FALSE); zil_free(zilog); } @@ -1637,6 +1674,13 @@ dsl_scan_prefetch_thread(void *arg) mutex_exit(&spa->spa_scrub_lock); + if (BP_IS_PROTECTED(&spic->spic_bp)) { + ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE || + BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET); + ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0); + zio_flags |= ZIO_FLAG_RAW; + } + /* issue the prefetch asynchronously */ (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, @@ -1744,6 +1788,11 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_buf_t *buf; + if (BP_IS_PROTECTED(bp)) { + ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); + zio_flags |= ZIO_FLAG_RAW; + } + err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { diff --git a/usr/src/uts/common/fs/zfs/hkdf.c b/usr/src/uts/common/fs/zfs/hkdf.c new file mode 100644 index 0000000000..1d6cc898e4 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/hkdf.c @@ -0,0 +1,173 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#include <sys/dmu.h> +#include <sys/hkdf.h> +#include <sys/crypto/api.h> +#include <sys/sha2.h> +#include <sys/hkdf.h> + +static int +hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material, + uint_t km_len, uint8_t *out_buf) +{ + int ret; + crypto_mechanism_t mech; + crypto_key_t key; + crypto_data_t input_cd, output_cd; + + /* initialize HMAC mechanism */ + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + mech.cm_param = NULL; + mech.cm_param_len = 0; + + /* initialize the salt as a crypto key */ + key.ck_format = CRYPTO_KEY_RAW; + key.ck_length = CRYPTO_BYTES2BITS(salt_len); + key.ck_data = salt; + + /* initialize crypto data for the input and output data */ + input_cd.cd_format = CRYPTO_DATA_RAW; + input_cd.cd_offset = 0; + input_cd.cd_length = km_len; + input_cd.cd_raw.iov_base = (char *)key_material; + input_cd.cd_raw.iov_len = input_cd.cd_length; + + output_cd.cd_format = CRYPTO_DATA_RAW; + output_cd.cd_offset = 0; + output_cd.cd_length = SHA512_DIGEST_LENGTH; + output_cd.cd_raw.iov_base = (char *)out_buf; + output_cd.cd_raw.iov_len = output_cd.cd_length; + + ret = crypto_mac(&mech, &input_cd, &key, NULL, &output_cd, NULL); + if (ret != CRYPTO_SUCCESS) + return (SET_ERROR(EIO)); + + return (0); +} + +static int +hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len, + uint8_t *out_buf, uint_t out_len) +{ + int ret; + crypto_mechanism_t mech; + crypto_context_t ctx; + crypto_key_t key; + crypto_data_t T_cd, info_cd, c_cd; + uint_t i, T_len = 0, pos = 0; + uint8_t c; + uint_t N = (out_len + SHA512_DIGEST_LENGTH) / SHA512_DIGEST_LENGTH; + uint8_t T[SHA512_DIGEST_LENGTH]; + + if (N > 255) + return (SET_ERROR(EINVAL)); + + /* initialize HMAC mechanism */ + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + mech.cm_param = NULL; + mech.cm_param_len = 0; + + /* initialize the salt as a crypto key */ + key.ck_format = CRYPTO_KEY_RAW; + key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH); + key.ck_data = extract_key; + + /* initialize crypto data for the input and output data */ + T_cd.cd_format = CRYPTO_DATA_RAW; + T_cd.cd_offset = 0; + T_cd.cd_raw.iov_base = (char *)T; + + c_cd.cd_format = CRYPTO_DATA_RAW; + c_cd.cd_offset = 0; + c_cd.cd_length = 1; + c_cd.cd_raw.iov_base = (char *)&c; + c_cd.cd_raw.iov_len = c_cd.cd_length; + + info_cd.cd_format = CRYPTO_DATA_RAW; + info_cd.cd_offset = 0; + info_cd.cd_length = info_len; + info_cd.cd_raw.iov_base = (char *)info; + info_cd.cd_raw.iov_len = info_cd.cd_length; + + for (i = 1; i <= N; i++) { + c = i; + + T_cd.cd_length = T_len; + T_cd.cd_raw.iov_len = T_cd.cd_length; + + ret = crypto_mac_init(&mech, &key, NULL, &ctx, NULL); + if (ret != CRYPTO_SUCCESS) + return (SET_ERROR(EIO)); + + ret = crypto_mac_update(ctx, &T_cd, NULL); + if (ret != CRYPTO_SUCCESS) + return (SET_ERROR(EIO)); + + ret = crypto_mac_update(ctx, &info_cd, NULL); + if (ret != CRYPTO_SUCCESS) + return (SET_ERROR(EIO)); + + ret = crypto_mac_update(ctx, &c_cd, NULL); + if (ret != CRYPTO_SUCCESS) + return (SET_ERROR(EIO)); + + T_len = SHA512_DIGEST_LENGTH; + T_cd.cd_length = T_len; + T_cd.cd_raw.iov_len = T_cd.cd_length; + + ret = crypto_mac_final(ctx, &T_cd, NULL); + if (ret != CRYPTO_SUCCESS) + return (SET_ERROR(EIO)); + + bcopy(T, out_buf + pos, + (i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos)); + pos += SHA512_DIGEST_LENGTH; + } + + return (0); +} + +/* + * HKDF is designed to be a relatively fast function for deriving keys from a + * master key + a salt. We use this function to generate new encryption keys + * so as to avoid hitting the cryptographic limits of the underlying + * encryption modes. Note that, for the sake of deriving encryption keys, the + * info parameter is called the "salt" everywhere else in the code. + */ +int +hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt, + uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key, + uint_t out_len) +{ + int ret; + uint8_t extract_key[SHA512_DIGEST_LENGTH]; + + ret = hkdf_sha512_extract(salt, salt_len, key_material, km_len, + extract_key); + if (ret != 0) + return (ret); + + ret = hkdf_sha512_expand(extract_key, info, info_len, output_key, + out_len); + if (ret != 0) + return (ret); + + return (0); +} diff --git a/usr/src/uts/common/fs/zfs/refcount.c b/usr/src/uts/common/fs/zfs/refcount.c index cac716e469..657a46717c 100644 --- a/usr/src/uts/common/fs/zfs/refcount.c +++ b/usr/src/uts/common/fs/zfs/refcount.c @@ -234,9 +234,10 @@ zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) list_destroy(&removed); } +/* ARGSUSED */ void -zfs_refcount_transfer_ownership(zfs_refcount_t *rc, void *current_holder, - void *new_holder) +zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number, + void *current_holder, void *new_holder) { reference_t *ref; boolean_t found = B_FALSE; @@ -249,7 +250,8 @@ zfs_refcount_transfer_ownership(zfs_refcount_t *rc, void *current_holder, for (ref = list_head(&rc->rc_list); ref; ref = list_next(&rc->rc_list, ref)) { - if (ref->ref_holder == current_holder) { + if (ref->ref_holder == current_holder && + ref->ref_number == number) { ref->ref_holder = new_holder; found = B_TRUE; break; @@ -259,6 +261,14 @@ zfs_refcount_transfer_ownership(zfs_refcount_t *rc, void *current_holder, mutex_exit(&rc->rc_mtx); } +void +zfs_refcount_transfer_ownership(zfs_refcount_t *rc, void *current_holder, + void *new_holder) +{ + zfs_refcount_transfer_ownership_many(rc, 1, current_holder, + new_holder); +} + /* * If tracking is enabled, return true if a reference exists that matches * the "holder" tag. If tracking is disabled, then return true if a reference diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c index d3c0a3e8ef..5a4bc705aa 100644 --- a/usr/src/uts/common/fs/zfs/sa.c +++ b/usr/src/uts/common/fs/zfs/sa.c @@ -680,7 +680,7 @@ sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, boolean_t dummy; if (hdl->sa_spill == NULL) { - VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL, + VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, 0, NULL, &hdl->sa_spill) == 0); } dmu_buf_will_dirty(hdl->sa_spill, tx); diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 718b2868de..c72e462b4f 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -1201,6 +1201,8 @@ spa_activate(spa_t *spa, int mode) avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); + + spa_keystore_init(&spa->spa_keystore); } /* @@ -1252,10 +1254,11 @@ spa_deactivate(spa_t *spa) * still have errors left in the queues. Empty them just in case. */ spa_errlog_drain(spa); - avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); + spa_keystore_fini(&spa->spa_keystore); + spa->spa_state = POOL_STATE_UNINITIALIZED; mutex_enter(&spa->spa_proc_lock); @@ -2089,8 +2092,8 @@ spa_load_verify(spa_t *spa) spa_load_verify_data, spa_load_verify_metadata); } error = traverse_pool(spa, spa->spa_verify_min_txg, - TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, - spa_load_verify_cb, rio); + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | + TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); } (void) zio_wait(rio); @@ -2290,7 +2293,7 @@ spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { - zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; @@ -3260,6 +3263,16 @@ spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } + /* + * Encryption was added before bookmark_v2, even though bookmark_v2 + * is now a dependency. If this pool has encryption enabled without + * bookmark_v2, trigger an errata message. + */ + if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && + !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { + spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; + } + return (0); } @@ -4829,11 +4842,27 @@ spa_l2cache_drop(spa_t *spa) } /* + * Verify encryption parameters for spa creation. If we are encrypting, we must + * have the encryption feature flag enabled. + */ +static int +spa_create_check_encryption_params(dsl_crypto_params_t *dcp, + boolean_t has_encryption) +{ + if (dcp->cp_crypt != ZIO_CRYPT_OFF && + dcp->cp_crypt != ZIO_CRYPT_INHERIT && + !has_encryption) + return (SET_ERROR(ENOTSUP)); + + return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); +} + +/* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, - nvlist_t *zplprops) + nvlist_t *zplprops, dsl_crypto_params_t *dcp) { spa_t *spa; char *altroot = NULL; @@ -4848,6 +4877,9 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, boolean_t has_features; char *poolname; nvlist_t *nvl; + boolean_t has_encryption; + spa_feature_t feat; + char *feat_name; if (props == NULL || nvlist_lookup_string(props, @@ -4888,10 +4920,27 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; has_features = B_FALSE; + has_encryption = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { - if (zpool_prop_feature(nvpair_name(elem))) + if (zpool_prop_feature(nvpair_name(elem))) { has_features = B_TRUE; + feat_name = strchr(nvpair_name(elem), '@') + 1; + VERIFY0(zfeature_lookup_name(feat_name, &feat)); + if (feat == SPA_FEATURE_ENCRYPTION) + has_encryption = B_TRUE; + } + } + + /* verify encryption params, if they were provided */ + if (dcp != NULL) { + error = spa_create_check_encryption_params(dcp, has_encryption); + if (error != 0) { + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } } if (has_features || nvlist_lookup_uint64(props, @@ -4991,8 +5040,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, } spa->spa_is_initializing = B_TRUE; - spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); - spa->spa_meta_objset = dp->dp_meta_objset; + spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); spa->spa_is_initializing = B_FALSE; /* @@ -5017,9 +5065,6 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, cmn_err(CE_PANIC, "failed to add pool config"); } - if (spa_version(spa) >= SPA_VERSION_FEATURES) - spa_feature_create_zap_objects(spa, tx); - if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c index e01260f312..4719696ca4 100644 --- a/usr/src/uts/common/fs/zfs/spa_config.c +++ b/usr/src/uts/common/fs/zfs/spa_config.c @@ -281,7 +281,7 @@ spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent) */ if (target->spa_ccw_fail_time == 0) { zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, - target, NULL, NULL, 0, 0); + target, NULL, NULL, NULL, 0, 0); } target->spa_ccw_fail_time = gethrtime(); spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); @@ -408,6 +408,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); + fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); if (spa->spa_comment != NULL) { fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, spa->spa_comment); diff --git a/usr/src/uts/common/fs/zfs/spa_errlog.c b/usr/src/uts/common/fs/zfs/spa_errlog.c index 8ce780537a..f717ebb8c0 100644 --- a/usr/src/uts/common/fs/zfs/spa_errlog.c +++ b/usr/src/uts/common/fs/zfs/spa_errlog.c @@ -90,9 +90,8 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb) * during spa_errlog_sync(). */ void -spa_log_error(spa_t *spa, zio_t *zio) +spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) { - zbookmark_phys_t *zb = &zio->io_logical->io_bookmark; spa_error_entry_t search; spa_error_entry_t *new; avl_tree_t *tree; diff --git a/usr/src/uts/common/fs/zfs/spa_history.c b/usr/src/uts/common/fs/zfs/spa_history.c index 2ad0dcfc5c..897d3c6e9a 100644 --- a/usr/src/uts/common/fs/zfs/spa_history.c +++ b/usr/src/uts/common/fs/zfs/spa_history.c @@ -384,11 +384,16 @@ spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) { int err = 0; dmu_tx_t *tx; - nvlist_t *nvarg; + nvlist_t *nvarg, *in_nvl = NULL; if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) return (SET_ERROR(EINVAL)); + err = nvlist_lookup_nvlist(nvl, ZPOOL_HIST_INPUT_NVL, &in_nvl); + if (err == 0) { + (void) nvlist_remove_all(in_nvl, ZPOOL_HIDDEN_ARGS); + } + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { diff --git a/usr/src/uts/common/fs/zfs/sys/abd.h b/usr/src/uts/common/fs/zfs/sys/abd.h index 952a0b68ba..621635933e 100644 --- a/usr/src/uts/common/fs/zfs/sys/abd.h +++ b/usr/src/uts/common/fs/zfs/sys/abd.h @@ -73,6 +73,7 @@ abd_t *abd_alloc_for_io(size_t, boolean_t); abd_t *abd_alloc_sametype(abd_t *, size_t); void abd_free(abd_t *); abd_t *abd_get_offset(abd_t *, size_t); +abd_t *abd_get_offset_size(abd_t *, size_t, size_t); abd_t *abd_get_from_buf(void *, size_t); void abd_put(abd_t *); diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index 1ce4740fcf..f636d3dcf2 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -58,8 +58,17 @@ _NOTE(CONSTCOND) } while (0) typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; + +/* + * Because the ARC can store encrypted data, errors (not due to bugs) may arise + * while transforming data into its desired format - specifically, when + * decrypting, the key may not be present, or the HMAC may not be correct + * which signifies deliberate tampering with the on-disk state + * (assuming that the checksum was correct). The "error" parameter will be + * nonzero in this case, even if there is no associated zio. + */ typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, - const blkptr_t *bp, arc_buf_t *buf, void *private); + const blkptr_t *bp, arc_buf_t *buf, void *priv); typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); /* generic arc_done_func_t's which you can use */ @@ -93,20 +102,29 @@ typedef enum arc_flags ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */ ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */ ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */ + /* + * Encrypted or authenticated on disk (may be plaintext in memory). + * This header has b_crypt_hdr allocated. Does not include indirect + * blocks with checksums of MACs which will also have their X + * (encrypted) bit set in the bp. + */ + ARC_FLAG_PROTECTED = 1 << 15, + /* data has not been authenticated yet */ + ARC_FLAG_NOAUTH = 1 << 16, /* indicates that the buffer contains metadata (otherwise, data) */ - ARC_FLAG_BUFC_METADATA = 1 << 15, + ARC_FLAG_BUFC_METADATA = 1 << 17, /* Flags specifying whether optional hdr struct fields are defined */ - ARC_FLAG_HAS_L1HDR = 1 << 16, - ARC_FLAG_HAS_L2HDR = 1 << 17, + ARC_FLAG_HAS_L1HDR = 1 << 18, + ARC_FLAG_HAS_L2HDR = 1 << 19, /* * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. * This allows the l2arc to use the blkptr's checksum to verify * the data without having to store the checksum in the hdr. */ - ARC_FLAG_COMPRESSED_ARC = 1 << 18, - ARC_FLAG_SHARED_DATA = 1 << 19, + ARC_FLAG_COMPRESSED_ARC = 1 << 20, + ARC_FLAG_SHARED_DATA = 1 << 21, /* * The arc buffer's compression mode is stored in the top 7 bits of the @@ -125,7 +143,12 @@ typedef enum arc_flags typedef enum arc_buf_flags { ARC_BUF_FLAG_SHARED = 1 << 0, - ARC_BUF_FLAG_COMPRESSED = 1 << 1 + ARC_BUF_FLAG_COMPRESSED = 1 << 1, + /* + * indicates whether this arc_buf_t is encrypted, regardless of + * state on-disk + */ + ARC_BUF_FLAG_ENCRYPTED = 1 << 2 } arc_buf_flags_t; struct arc_buf { @@ -159,15 +182,31 @@ typedef enum arc_space_type { void arc_space_consume(uint64_t space, arc_space_type_t type); void arc_space_return(uint64_t space, arc_space_type_t type); boolean_t arc_is_metadata(arc_buf_t *buf); +boolean_t arc_is_encrypted(arc_buf_t *buf); +boolean_t arc_is_unauthenticated(arc_buf_t *buf); enum zio_compress arc_get_compression(arc_buf_t *buf); -int arc_decompress(arc_buf_t *buf); +void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, + uint8_t *iv, uint8_t *mac); +int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, + boolean_t in_place); +void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, + dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, + const uint8_t *mac); arc_buf_t *arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size); arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, enum zio_compress compression_type); +arc_buf_t *arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, + boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, + const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type); arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size); arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, enum zio_compress compression_type); +arc_buf_t *arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, + const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, + dmu_object_type_t ot, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type); void arc_return_buf(arc_buf_t *buf, void *tag); void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void arc_buf_destroy(arc_buf_t *buf, void *tag); diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h index eedefa3615..271232c61c 100644 --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h @@ -54,6 +54,7 @@ extern "C" { #define DB_RF_NOPREFETCH (1 << 3) #define DB_RF_NEVERWAIT (1 << 4) #define DB_RF_CACHED (1 << 5) +#define DB_RF_NO_DECRYPT (1 << 6) /* * The simplified state transition diagram for dbufs looks like: @@ -153,6 +154,16 @@ typedef struct dbuf_dirty_record { override_states_t dr_override_state; uint8_t dr_copies; boolean_t dr_nopwrite; + boolean_t dr_has_raw_params; + + /* + * If dr_has_raw_params is set, the following crypt + * params will be set on the BP that's written. + */ + boolean_t dr_byteorder; + uint8_t dr_salt[ZIO_DATA_SALT_LEN]; + uint8_t dr_iv[ZIO_DATA_IV_LEN]; + uint8_t dr_mac[ZIO_DATA_MAC_LEN]; } dl; } dt; } dbuf_dirty_record_t; diff --git a/usr/src/uts/common/fs/zfs/sys/ddt.h b/usr/src/uts/common/fs/zfs/sys/ddt.h index 15d2a9a7ad..9181e59fff 100644 --- a/usr/src/uts/common/fs/zfs/sys/ddt.h +++ b/usr/src/uts/common/fs/zfs/sys/ddt.h @@ -67,9 +67,10 @@ enum ddt_class { typedef struct ddt_key { zio_cksum_t ddk_cksum; /* 256-bit block checksum */ /* - * Encoded with logical & physical size, and compression, as follows: + * Encoded with logical & physical size, encryption, and compression, + * as follows: * +-------+-------+-------+-------+-------+-------+-------+-------+ - * | 0 | 0 | 0 | comp | PSIZE | LSIZE | + * | 0 | 0 | 0 |X| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ */ uint64_t ddk_prop; @@ -85,11 +86,17 @@ typedef struct ddt_key { #define DDK_SET_PSIZE(ddk, x) \ BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) -#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8) -#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x) +#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 7) +#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 7, x) + +#define DDK_GET_CRYPT(ddk) BF64_GET((ddk)->ddk_prop, 39, 1) +#define DDK_SET_CRYPT(ddk, x) BF64_SET((ddk)->ddk_prop, 39, 1, x) #define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) +#define DDE_GET_NDVAS(dde) (DDK_GET_CRYPT(&dde->dde_key) \ + ? SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP) + typedef struct ddt_phys { dva_t ddp_dva[SPA_DVAS_PER_BP]; uint64_t ddp_refcnt; diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 28462ff4d5..ffce616cbc 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -77,6 +77,7 @@ struct arc_buf; struct zio_prop; struct sa_handle; struct locked_range; +struct dsl_crypto_params; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; @@ -106,7 +107,8 @@ typedef enum dmu_object_byteswap { #define DMU_OT_NEWTYPE 0x80 #define DMU_OT_METADATA 0x40 -#define DMU_OT_BYTESWAP_MASK 0x3f +#define DMU_OT_ENCRYPTED 0x20 +#define DMU_OT_BYTESWAP_MASK 0x1f /* * Defines a uint8_t object type. Object types specify if the data @@ -114,18 +116,28 @@ typedef enum dmu_object_byteswap { * (dmu_object_byteswap_t). All of the types created by this method * are cached in the dbuf metadata cache. */ -#define DMU_OT(byteswap, metadata) \ +#define DMU_OT(byteswap, metadata, encrypted) \ (DMU_OT_NEWTYPE | \ ((metadata) ? DMU_OT_METADATA : 0) | \ + ((encrypted) ? DMU_OT_ENCRYPTED : 0) | \ ((byteswap) & DMU_OT_BYTESWAP_MASK)) #define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ (ot) < DMU_OT_NUMTYPES) +/* + * MDB doesn't have dmu_ot; it defines these macros itself. + */ +#ifndef ZFS_MDB +#define DMU_OT_IS_METADATA_IMPL(ot) (dmu_ot[ot].ot_metadata) +#define DMU_OT_IS_ENCRYPTED_IMPL(ot) (dmu_ot[ot].ot_encrypt) +#define DMU_OT_BYTESWAP_IMPL(ot) (dmu_ot[ot].ot_byteswap) +#endif + #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_METADATA) : \ - dmu_ot[(ot)].ot_metadata) + DMU_OT_IS_METADATA_IMPL(ot)) #define DMU_OT_IS_DDT(ot) \ ((ot) == DMU_OT_DDT_ZAP) @@ -140,6 +152,10 @@ typedef enum dmu_object_byteswap { #define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) +#define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_ENCRYPTED) : \ + DMU_OT_IS_ENCRYPTED_IMPL(ot)) + /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill @@ -150,7 +166,7 @@ typedef enum dmu_object_byteswap { #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) : \ - dmu_ot[(ot)].ot_byteswap) + DMU_OT_BYTESWAP_IMPL(ot)) typedef enum dmu_object_type { DMU_OT_NONE, @@ -236,16 +252,27 @@ typedef enum dmu_object_type { /* * Names for valid types declared with DMU_OT(). */ - DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE), - DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE), - DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE), - DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE), - DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE), - DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE), - DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE), - DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE), - DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE), - DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), + DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_FALSE), + DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_FALSE), + DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_FALSE), + DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_FALSE), + DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_FALSE), + DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_FALSE), + DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_FALSE), + DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE), + DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_FALSE), + DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_FALSE), + + DMU_OTN_UINT8_ENC_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_TRUE), + DMU_OTN_UINT8_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_TRUE), + DMU_OTN_UINT16_ENC_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_TRUE), + DMU_OTN_UINT16_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_TRUE), + DMU_OTN_UINT32_ENC_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_TRUE), + DMU_OTN_UINT32_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_TRUE), + DMU_OTN_UINT64_ENC_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_TRUE), + DMU_OTN_UINT64_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_TRUE), + DMU_OTN_ZAP_ENC_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_TRUE), + DMU_OTN_ZAP_ENC_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_TRUE), } dmu_object_type_t; /* @@ -285,19 +312,24 @@ void zfs_znode_byteswap(void *buf, size_t size); */ #define DMU_BONUS_BLKID (-1ULL) #define DMU_SPILL_BLKID (-2ULL) + /* * Public routines to create, destroy, open, and close objsets. */ +typedef void dmu_objset_create_sync_func_t(objset_t *os, void *arg, + cred_t *cr, dmu_tx_t *tx); + int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp); + boolean_t readonly, boolean_t key_required, void *tag, objset_t **osp); void dmu_objset_rele(objset_t *os, void *tag); -void dmu_objset_disown(objset_t *os, void *tag); +void dmu_objset_disown(objset_t *os, boolean_t key_required, void *tag); int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, - void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); + struct dsl_crypto_params *dcp, dmu_objset_create_sync_func_t func, + void *arg); int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); @@ -378,11 +410,12 @@ int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int dnodesize, dmu_tx_t *tx); int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, - int bonuslen, int dnodesize, dmu_tx_t *txp); + int bonuslen, int dnodesize, boolean_t keep_spill, dmu_tx_t *txp); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp); +int dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Free an object from this objset. @@ -417,6 +450,13 @@ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg); /* + * Set the number of levels on a dnode. nlevels must be greater than the + * current number of levels or an EINVAL will be returned. + */ +int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, + dmu_tx_t *tx); + +/* * Set the data blocksize for an object. * * The object cannot have any blocks allcated beyond the first. If @@ -431,6 +471,14 @@ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx); /* + * Manually set the maxblkid on a dnode. This will adjust nlevels accordingly + * to accommodate the change. When calling this function, the caller must + * ensure that the object's nlevels can sufficiently support the new maxblkid. + */ +int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, + dmu_tx_t *tx); + +/* * Set the checksum property on a dnode. The new checksum algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ @@ -460,6 +508,11 @@ dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, struct zio_prop *zp); +void dmu_write_policy_override_compress(struct zio_prop *zp, + enum zio_compress compress); +void dmu_write_policy_override_encrypt(struct zio_prop *zp, + dmu_object_type_t ot, boolean_t byteorder, enum zio_compress compress, + const uint8_t *salt, const uint8_t *iv, const uint8_t *mac); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a @@ -472,7 +525,11 @@ void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, * * Returns ENOENT, EIO, or 0. */ +int dmu_bonus_hold_impl(objset_t *os, uint64_t object, void *tag, + uint32_t flags, dmu_buf_t **dbp); int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); +int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, + uint32_t flags); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); @@ -483,7 +540,8 @@ int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); * Special spill buffer support used by "SA" framework */ -int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); +int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, + dmu_buf_t **dbp); int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); @@ -525,6 +583,7 @@ boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); +uint64_t dmu_buf_user_refcount(dmu_buf_t *db); /* * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a @@ -685,6 +744,8 @@ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, + const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx); /* * You must create a transaction, then hold the objects which you will @@ -755,9 +816,9 @@ void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size, dmu_tx_t *tx); + uint64_t size, dmu_tx_t *tx); int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, - uint64_t size); + uint64_t size); int dmu_free_long_object(objset_t *os, uint64_t object); /* @@ -768,6 +829,7 @@ int dmu_free_long_object(objset_t *os, uint64_t object); */ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ +#define DMU_READ_NO_DECRYPT 2 /* don't decrypt */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, @@ -791,10 +853,15 @@ int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); -void dmu_assign_arcbuf_dnode(dnode_t *handle, uint64_t offset, +int dmu_assign_arcbuf_by_dnode(dnode_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); -void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, - dmu_tx_t *tx); +int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, + struct arc_buf *buf, dmu_tx_t *tx); +void dmu_convert_to_raw(dmu_buf_t *handle, boolean_t byteorder, + const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx); +#define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf +void dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset, + dmu_buf_t *handle, dmu_tx_t *tx); int dmu_xuio_init(struct xuio *uio, int niov); void dmu_xuio_fini(struct xuio *uio); int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, @@ -838,6 +905,7 @@ typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; boolean_t ot_dbuf_metadata_cache; + boolean_t ot_encrypt; char *ot_name; } dmu_object_type_info_t; @@ -1008,8 +1076,6 @@ int dmu_diff(const char *tosnap_name, const char *fromsnap_name, #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; -extern int zfs_mdcomp_disable; - #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h index e820fe57ec..ccb5d7ac51 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h @@ -163,6 +163,7 @@ extern "C" { * dn_free_txg * dn_assigned_txg * dn_dirty_txg + * dd_assigned_tx * dn_notxholds * dn_dirtyctx * dn_dirtyctx_firstset @@ -277,6 +278,7 @@ typedef struct dmu_sendarg { objset_t *dsa_os; zio_cksum_t dsa_zc; uint64_t dsa_toguid; + uint64_t dsa_fromtxg; int dsa_err; dmu_pendop_t dsa_pending_op; uint64_t dsa_featureflags; diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h index cae1c7719a..41ae18a8b9 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h @@ -59,13 +59,19 @@ struct dmu_tx; #define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) +/* all flags are currently non-portable */ +#define OBJSET_CRYPT_PORTABLE_FLAGS_MASK (0) + typedef struct objset_phys { dnode_phys_t os_meta_dnode; zil_header_t os_zil_header; uint64_t os_type; uint64_t os_flags; + uint8_t os_portable_mac[ZIO_OBJSET_MAC_LEN]; + uint8_t os_local_mac[ZIO_OBJSET_MAC_LEN]; char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 - - sizeof (zil_header_t) - sizeof (uint64_t)*2]; + sizeof (zil_header_t) - sizeof (uint64_t)*2 - + 2*ZIO_OBJSET_MAC_LEN]; dnode_phys_t os_userused_dnode; dnode_phys_t os_groupused_dnode; } objset_phys_t; @@ -77,6 +83,8 @@ struct objset { spa_t *os_spa; arc_buf_t *os_phys_buf; objset_phys_t *os_phys; + boolean_t os_encrypted; + /* * The following "special" dnodes have no parent, are exempt * from dnode_move(), and are not recorded in os_dnodes, but they @@ -132,6 +140,10 @@ struct objset { uint64_t os_flags; uint64_t os_freed_dnodes; boolean_t os_rescan_dnodes; + boolean_t os_raw_receive; + + /* os_phys_buf should be written raw next txg */ + boolean_t os_next_write_raw[TXG_SIZE]; /* Protected by os_obj_lock */ kmutex_t os_obj_lock; @@ -171,14 +183,18 @@ struct objset { /* called from zpl */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); +int dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, + objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, - boolean_t readonly, void *tag, objset_t **osp); + boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp); int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj, - dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); + dmu_objset_type_t type, boolean_t readonly, boolean_t decrypt, + void *tag, objset_t **osp); void dmu_objset_refresh_ownership(struct dsl_dataset *ds, - struct dsl_dataset **newds, void *tag); + struct dsl_dataset **newds, boolean_t key_needed, void *tag); void dmu_objset_rele(objset_t *os, void *tag); -void dmu_objset_disown(objset_t *os, void *tag); +void dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag); +void dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag); int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_stats(objset_t *os, nvlist_t *nv); @@ -196,6 +212,9 @@ timestruc_t dmu_objset_snap_cmtime(objset_t *os); /* called from dsl */ void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); +objset_t *dmu_objset_create_impl_dnstats(spa_t *spa, struct dsl_dataset *ds, + blkptr_t *bp, dmu_objset_type_t type, int levels, int blksz, int ibs, + dmu_tx_t *tx); objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, @@ -206,6 +225,7 @@ void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); boolean_t dmu_objset_userused_enabled(objset_t *os); int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); +boolean_t dmu_objset_incompatible_encryption_version(objset_t *os); int dmu_fsname(const char *snapname, char *buf); void dmu_objset_evict_done(objset_t *os); diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_recv.h b/usr/src/uts/common/fs/zfs/sys/dmu_recv.h index 56b69e61b1..e2b595e77b 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_recv.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_recv.h @@ -45,10 +45,15 @@ typedef struct dmu_recv_cookie { boolean_t drc_byteswap; boolean_t drc_force; boolean_t drc_resumable; + boolean_t drc_raw; boolean_t drc_clone; + boolean_t drc_spill; struct avl_tree *drc_guid_to_ds_map; + nvlist_t *drc_keynvl; zio_cksum_t drc_cksum; + uint64_t drc_fromsnapobj; uint64_t drc_newsnapobj; + uint64_t drc_ivset_guid; void *drc_owner; cred_t *drc_cred; } dmu_recv_cookie_t; diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_send.h b/usr/src/uts/common/fs/zfs/sys/dmu_send.h index 65d8e99db6..382f86622d 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_send.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_send.h @@ -40,15 +40,14 @@ struct avl_tree; struct dmu_replay_record; int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, - boolean_t large_block_ok, boolean_t compressok, int outfd, - uint64_t resumeobj, uint64_t resumeoff, - struct vnode *vp, offset_t *off); + boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, int outfd, + uint64_t resumeobj, uint64_t resumeoff, struct vnode *vp, offset_t *off); int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds, boolean_t stream_compressed, uint64_t *sizep); int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg, boolean_t stream_compressed, uint64_t *sizep); int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, - int outfd, struct vnode *vp, offset_t *off); + boolean_t rawok, int outfd, struct vnode *vp, offset_t *off); #endif /* _DMU_SEND_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h index c010edd440..8ceef5cf13 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h @@ -49,6 +49,15 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, #define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) #define TRAVERSE_HARD (1<<4) +/* + * Encrypted dnode blocks have encrypted bonus buffers while the rest + * of the dnode is left unencrypted. Callers can specify the + * TRAVERSE_NO_DECRYPT flag to indicate to the traversal code that + * they wish to receive the raw encrypted dnodes instead of attempting + * to read the logical data. + */ +#define TRAVERSE_NO_DECRYPT (1<<5) + /* Special traverse error return value to indicate skipping of children */ #define TRAVERSE_VISIT_NO_CHILDREN -1 diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h index 3b7d619172..da72903113 100644 --- a/usr/src/uts/common/fs/zfs/sys/dnode.h +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h @@ -74,9 +74,7 @@ extern "C" { /* * dnode id flags * - * Note: a file will never ever have its - * ids moved from bonus->spill - * and only in a crypto environment would it be on spill + * Note: a file will never ever have its ids moved from bonus->spill */ #define DN_ID_CHKED_BONUS 0x1 #define DN_ID_CHKED_SPILL 0x2 @@ -201,6 +199,8 @@ enum dnode_dirtycontext { * dataset and even within the same dnode block. */ +#define DNODE_CRYPT_PORTABLE_FLAGS_MASK (DNODE_FLAG_SPILL_BLKPTR) + typedef struct dnode_phys { uint8_t dn_type; /* dmu_object_type_t */ uint8_t dn_indblkshift; /* ln2(indirect block size) */ @@ -219,6 +219,13 @@ typedef struct dnode_phys { uint64_t dn_maxblkid; /* largest allocated block ID */ uint64_t dn_used; /* bytes (or sectors) of disk space */ + /* + * Both dn_pad2 and dn_pad3 are protected by the block's MAC. This + * allows us to protect any fields that might be added here in the + * future. In either case, developers will want to check + * zio_crypt_init_uios_dnode() to ensure the new field is being + * protected properly. + */ uint64_t dn_pad3[4]; union { blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)]; @@ -235,8 +242,8 @@ typedef struct dnode_phys { }; } dnode_phys_t; -#define DN_SPILL_BLKPTR(dnp) (blkptr_t *)((char *)(dnp) + \ - (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT)) +#define DN_SPILL_BLKPTR(dnp) ((blkptr_t *)((char *)(dnp) + \ + (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT))) struct dnode { /* @@ -282,6 +289,7 @@ struct dnode { uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */ uint16_t dn_next_bonuslen[TXG_SIZE]; uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ + uint64_t dn_next_maxblkid[TXG_SIZE]; /* next maxblkid in bytes */ /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */ uint32_t dn_dbufs_count; /* count of dn_dbufs */ @@ -339,6 +347,12 @@ struct dnode { }; /* + * We use this (otherwise unused) bit to indicate if the value of + * dn_next_maxblkid[txgoff] is valid to use in dnode_sync(). + */ +#define DMU_NEXT_MAXBLKID_SET (1ULL << 63) + +/* * Adds a level of indirection between the dbuf and the dnode to avoid * iterating descendent dbufs in dnode_move(). Handles are not allocated * individually, but as an array of child dnodes in dnode_hold_impl(). @@ -381,15 +395,18 @@ void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, - dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); + dmu_object_type_t bonustype, int bonuslen, int dn_slots, + boolean_t keep_spill, dmu_tx_t *tx); void dnode_free(dnode_t *dn, dmu_tx_t *tx); void dnode_byteswap(dnode_phys_t *dnp); void dnode_buf_byteswap(void *buf, size_t size); void dnode_verify(dnode_t *dn); +int dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx); int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); void dnode_diduse_space(dnode_t *dn, int64_t space); -void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); +void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, + boolean_t have_read, boolean_t force); uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); void dnode_init(void); void dnode_fini(void); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_bookmark.h b/usr/src/uts/common/fs/zfs/sys/dsl_bookmark.h index e477bb231c..3cdad74414 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_bookmark.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_bookmark.h @@ -36,8 +36,25 @@ typedef struct zfs_bookmark_phys { uint64_t zbm_guid; /* guid of bookmarked dataset */ uint64_t zbm_creation_txg; /* birth transaction group */ uint64_t zbm_creation_time; /* bookmark creation time */ + + /* the following fields are reserved for redacted send / recv */ + uint64_t zbm_redaction_obj; /* redaction list object */ + uint64_t zbm_flags; /* ZBM_FLAG_* */ + uint64_t zbm_referenced_bytes_refd; + uint64_t zbm_compressed_bytes_refd; + uint64_t zbm_uncompressed_bytes_refd; + uint64_t zbm_referenced_freed_before_next_snap; + uint64_t zbm_compressed_freed_before_next_snap; + uint64_t zbm_uncompressed_freed_before_next_snap; + + /* fields used for raw sends */ + uint64_t zbm_ivset_guid; } zfs_bookmark_phys_t; + +#define BOOKMARK_PHYS_SIZE_V1 (3 * sizeof (uint64_t)) +#define BOOKMARK_PHYS_SIZE_V2 (12 * sizeof (uint64_t)) + int dsl_bookmark_create(nvlist_t *, nvlist_t *); int dsl_get_bookmarks(const char *, nvlist_t *, nvlist_t *); int dsl_get_bookmarks_impl(dsl_dataset_t *, nvlist_t *, nvlist_t *); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_crypt.h b/usr/src/uts/common/fs/zfs/sys/dsl_crypt.h new file mode 100644 index 0000000000..360a69b329 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/dsl_crypt.h @@ -0,0 +1,226 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#ifndef _SYS_DSL_CRYPT_H +#define _SYS_DSL_CRYPT_H + +#include <sys/dmu_tx.h> +#include <sys/dmu.h> +#include <sys/zio_crypt.h> +#include <sys/spa.h> +#include <sys/dsl_dataset.h> + +/* + * ZAP entry keys for DSL Crypto Keys stored on disk. In addition, + * ZFS_PROP_KEYFORMAT, ZFS_PROP_PBKDF2_SALT, and ZFS_PROP_PBKDF2_ITERS are + * also maintained here using their respective property names. + */ +#define DSL_CRYPTO_KEY_CRYPTO_SUITE "DSL_CRYPTO_SUITE" +#define DSL_CRYPTO_KEY_GUID "DSL_CRYPTO_GUID" +#define DSL_CRYPTO_KEY_IV "DSL_CRYPTO_IV" +#define DSL_CRYPTO_KEY_MAC "DSL_CRYPTO_MAC" +#define DSL_CRYPTO_KEY_MASTER_KEY "DSL_CRYPTO_MASTER_KEY_1" +#define DSL_CRYPTO_KEY_HMAC_KEY "DSL_CRYPTO_HMAC_KEY_1" +#define DSL_CRYPTO_KEY_ROOT_DDOBJ "DSL_CRYPTO_ROOT_DDOBJ" +#define DSL_CRYPTO_KEY_REFCOUNT "DSL_CRYPTO_REFCOUNT" +#define DSL_CRYPTO_KEY_VERSION "DSL_CRYPTO_VERSION" + +/* + * In-memory representation of a wrapping key. One of these structs will exist + * for each encryption root with its key loaded. + */ +typedef struct dsl_wrapping_key { + /* link on spa_keystore_t:sk_wkeys */ + avl_node_t wk_avl_link; + + /* keyformat property enum */ + zfs_keyformat_t wk_keyformat; + + /* the pbkdf2 salt, if the keyformat is of type passphrase */ + uint64_t wk_salt; + + /* the pbkdf2 iterations, if the keyformat is of type passphrase */ + uint64_t wk_iters; + + /* actual wrapping key */ + crypto_key_t wk_key; + + /* refcount of holders of this key */ + zfs_refcount_t wk_refcnt; + + /* dsl directory object that owns this wrapping key */ + uint64_t wk_ddobj; +} dsl_wrapping_key_t; + +/* enum of commands indicating special actions that should be run */ +typedef enum dcp_cmd { + /* key creation commands */ + DCP_CMD_NONE = 0, /* no specific command */ + DCP_CMD_RAW_RECV, /* raw receive */ + + /* key changing commands */ + DCP_CMD_NEW_KEY, /* rewrap key as an encryption root */ + DCP_CMD_INHERIT, /* rewrap key with parent's wrapping key */ + DCP_CMD_FORCE_NEW_KEY, /* change to encryption root without rewrap */ + DCP_CMD_FORCE_INHERIT, /* inherit parent's key without rewrap */ + + DCP_CMD_MAX +} dcp_cmd_t; + +/* + * This struct is a simple wrapper around all the parameters that are usually + * required to setup encryption. It exists so that all of the params can be + * passed around the kernel together for convenience. + */ +typedef struct dsl_crypto_params { + /* command indicating intended action */ + dcp_cmd_t cp_cmd; + + /* the encryption algorithm */ + enum zio_encrypt cp_crypt; + + /* keylocation property string */ + char *cp_keylocation; + + /* the wrapping key */ + dsl_wrapping_key_t *cp_wkey; +} dsl_crypto_params_t; + +/* + * In-memory representation of a DSL Crypto Key object. One of these structs + * (and corresponding on-disk ZAP object) will exist for each encrypted + * clone family that is mounted or otherwise reading protected data. + */ +typedef struct dsl_crypto_key { + /* link on spa_keystore_t:sk_dsl_keys */ + avl_node_t dck_avl_link; + + /* refcount of dsl_key_mapping_t's holding this key */ + zfs_refcount_t dck_holds; + + /* master key used to derive encryption keys */ + zio_crypt_key_t dck_key; + + /* wrapping key for syncing this structure to disk */ + dsl_wrapping_key_t *dck_wkey; + + /* on-disk object id */ + uint64_t dck_obj; +} dsl_crypto_key_t; + +/* + * In-memory mapping of a dataset object id to a DSL Crypto Key. This is used + * to look up the corresponding dsl_crypto_key_t from the zio layer for + * performing data encryption and decryption. + */ +typedef struct dsl_key_mapping { + /* link on spa_keystore_t:sk_key_mappings */ + avl_node_t km_avl_link; + + /* refcount of how many users are depending on this mapping */ + zfs_refcount_t km_refcnt; + + /* dataset this crypto key belongs to (index) */ + uint64_t km_dsobj; + + /* crypto key (value) of this record */ + dsl_crypto_key_t *km_key; +} dsl_key_mapping_t; + +/* in memory structure for holding all wrapping and dsl keys */ +typedef struct spa_keystore { + /* lock for protecting sk_dsl_keys */ + krwlock_t sk_dk_lock; + + /* tree of all dsl_crypto_key_t's */ + avl_tree_t sk_dsl_keys; + + /* lock for protecting sk_key_mappings */ + krwlock_t sk_km_lock; + + /* tree of all dsl_key_mapping_t's, indexed by dsobj */ + avl_tree_t sk_key_mappings; + + /* lock for protecting the wrapping keys tree */ + krwlock_t sk_wkeys_lock; + + /* tree of all dsl_wrapping_key_t's, indexed by ddobj */ + avl_tree_t sk_wkeys; +} spa_keystore_t; + +int dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props, + nvlist_t *crypto_args, dsl_crypto_params_t **dcp_out); +void dsl_crypto_params_free(dsl_crypto_params_t *dcp, boolean_t unload); +void dsl_dataset_crypt_stats(struct dsl_dataset *ds, nvlist_t *nv); +int dsl_crypto_can_set_keylocation(const char *dsname, const char *keylocation); +boolean_t dsl_dir_incompatible_encryption_version(dsl_dir_t *dd); + +void spa_keystore_init(spa_keystore_t *sk); +void spa_keystore_fini(spa_keystore_t *sk); + +void spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, void *tag); +int spa_keystore_load_wkey_impl(spa_t *spa, dsl_wrapping_key_t *wkey); +int spa_keystore_load_wkey(const char *dsname, dsl_crypto_params_t *dcp, + boolean_t noop); +int spa_keystore_unload_wkey_impl(spa_t *spa, uint64_t ddobj); +int spa_keystore_unload_wkey(const char *dsname); + +int spa_keystore_create_mapping(spa_t *spa, struct dsl_dataset *ds, void *tag, + dsl_key_mapping_t **km_out); +int spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, void *tag); +void key_mapping_add_ref(dsl_key_mapping_t *km, void *tag); +void key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag); +int spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag, + dsl_crypto_key_t **dck_out); + +int dsl_crypto_populate_key_nvlist(struct dsl_dataset *ds, + uint64_t from_ivset_guid, nvlist_t **nvl_out); +int dsl_crypto_recv_raw_key_check(struct dsl_dataset *ds, + nvlist_t *nvl, dmu_tx_t *tx); +void dsl_crypto_recv_raw_key_sync(struct dsl_dataset *ds, + nvlist_t *nvl, dmu_tx_t *tx); +int dsl_crypto_recv_raw(const char *poolname, uint64_t dsobj, uint64_t fromobj, + dmu_objset_type_t ostype, nvlist_t *nvl, boolean_t do_key); + +int spa_keystore_change_key(const char *dsname, dsl_crypto_params_t *dcp); +int dsl_dir_rename_crypt_check(dsl_dir_t *dd, dsl_dir_t *newparent); +int dsl_dataset_promote_crypt_check(dsl_dir_t *target, dsl_dir_t *origin); +void dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, + dmu_tx_t *tx); +int dmu_objset_create_crypt_check(dsl_dir_t *parentdd, + dsl_crypto_params_t *dcp, boolean_t *will_encrypt); +void dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd, + struct dsl_dataset *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx); +uint64_t dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey, + dmu_tx_t *tx); +int dmu_objset_clone_crypt_check(dsl_dir_t *parentdd, dsl_dir_t *origindd); +uint64_t dsl_crypto_key_clone_sync(dsl_dir_t *origindd, dmu_tx_t *tx); +void dsl_crypto_key_destroy_sync(uint64_t dckobj, dmu_tx_t *tx); + +int spa_crypt_get_salt(spa_t *spa, uint64_t dsobj, uint8_t *salt); +int spa_do_crypt_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, + abd_t *abd, uint_t datalen, uint8_t *mac); +int spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, + abd_t *abd, uint_t datalen, boolean_t byteswap); +int spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, const zbookmark_phys_t *zb, + dmu_object_type_t ot, boolean_t dedup, boolean_t bswap, uint8_t *salt, + uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, + boolean_t *no_crypt); + +#endif /* _SYS_DSL_CRYPT_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h index 064ff617fd..189376eefc 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h @@ -40,6 +40,7 @@ #include <sys/dsl_deadlist.h> #include <sys/refcount.h> #include <sys/rrwlock.h> +#include <sys/dsl_crypt.h> #include <zfeature_common.h> #ifdef __cplusplus @@ -49,6 +50,8 @@ extern "C" { struct dsl_dataset; struct dsl_dir; struct dsl_pool; +struct dsl_crypto_params; +struct dsl_key_mapping; #define DS_FLAG_INCONSISTENT (1ULL<<0) #define DS_IS_INCONSISTENT(ds) \ @@ -106,6 +109,7 @@ struct dsl_pool; #define DS_FIELD_RESUME_LARGEBLOCK "com.delphix:resume_largeblockok" #define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok" #define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok" +#define DS_FIELD_RESUME_RAWOK "com.datto:resume_rawok" /* * This field is set to the object number of the remap deadlist if one exists. @@ -113,6 +117,12 @@ struct dsl_pool; #define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist" /* + * This field is set to the ivset guid for encrypted snapshots. This is used + * for validating raw receives. + */ +#define DS_FIELD_IVSET_GUID "com.datto:ivset_guid" + +/* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose * name lookups should be performed case-insensitively. */ @@ -164,6 +174,7 @@ typedef struct dsl_dataset { uint64_t ds_object; uint64_t ds_fsid_guid; boolean_t ds_is_snapshot; + struct dsl_key_mapping *ds_key_mapping; /* only used in syncing context, only valid for non-snapshots: */ struct dsl_dataset *ds_prev; @@ -293,26 +304,40 @@ typedef struct dsl_dataset_snapshot_arg { #define DS_UNIQUE_IS_ACCURATE(ds) \ ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) +/* flags for holding the dataset */ +typedef enum ds_hold_flags { + DS_HOLD_FLAG_DECRYPT = 1 << 0 /* needs access to encrypted data */ +} ds_hold_flags_t; + int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag, dsl_dataset_t **dsp); +int dsl_dataset_hold_flags(struct dsl_pool *dp, const char *name, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds, void *tag); +int dsl_dataset_create_key_mapping(dsl_dataset_t *ds); int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, dsl_dataset_t **); +int dsl_dataset_hold_obj_flags(struct dsl_pool *dp, uint64_t dsobj, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **); +void dsl_dataset_remove_key_mapping(dsl_dataset_t *ds); void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); +void dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, + void *tag); int dsl_dataset_own(struct dsl_pool *dp, const char *name, - void *tag, dsl_dataset_t **dsp); + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, - void *tag, dsl_dataset_t **dsp); -void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); +void dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag); void dsl_dataset_name(dsl_dataset_t *ds, char *name); boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); int dsl_dataset_namelen(dsl_dataset_t *ds); boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, - dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); + dsl_dataset_t *origin, uint64_t flags, cred_t *, + struct dsl_crypto_params *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, - uint64_t flags, dmu_tx_t *tx); + struct dsl_crypto_params *dcp, uint64_t flags, dmu_tx_t *tx); void dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx); int dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx); int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors); @@ -434,6 +459,8 @@ void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds); void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); +void dsl_dataset_activate_feature(uint64_t dsobj, + spa_feature_t f, dmu_tx_t *tx); void dsl_dataset_deactivate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_deleg.h b/usr/src/uts/common/fs/zfs/sys/dsl_deleg.h index 6fb6a121ad..dadbda324e 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_deleg.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_deleg.h @@ -58,6 +58,8 @@ extern "C" { #define ZFS_DELEG_PERM_DIFF "diff" #define ZFS_DELEG_PERM_BOOKMARK "bookmark" #define ZFS_DELEG_PERM_REMAP "remap" +#define ZFS_DELEG_PERM_LOAD_KEY "load-key" +#define ZFS_DELEG_PERM_CHANGE_KEY "change-key" /* * Note: the names of properties that are marked delegatable are also diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h index 21d953cb60..a9336f5321 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h @@ -33,6 +33,7 @@ #include <sys/dsl_synctask.h> #include <sys/refcount.h> #include <sys/zfs_context.h> +#include <sys/dsl_crypt.h> #ifdef __cplusplus extern "C" { @@ -48,6 +49,7 @@ struct dsl_dataset; #define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count" #define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" #define DD_FIELD_LAST_REMAP_TXG "com.delphix:last_remap_txg" +#define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj" typedef enum dd_used { DD_USED_HEAD, @@ -90,6 +92,7 @@ struct dsl_dir { /* These are immutable; no lock needed: */ uint64_t dd_object; + uint64_t dd_crypto_obj; dsl_pool_t *dd_pool; /* Stable until user eviction; no lock needed: */ diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h index 66098900db..de13fa8bfa 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h @@ -51,6 +51,7 @@ struct dsl_dataset; struct dsl_pool; struct dmu_tx; struct dsl_scan; +struct dsl_crypto_params; extern uint64_t zfs_dirty_data_max; extern uint64_t zfs_dirty_data_max_max; @@ -144,7 +145,8 @@ typedef struct dsl_pool { int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); int dsl_pool_open(dsl_pool_t *dp); void dsl_pool_close(dsl_pool_t *dp); -dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); +dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, + struct dsl_crypto_params *dcp, uint64_t txg); void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg); int dsl_pool_sync_context(dsl_pool_t *dp); diff --git a/usr/src/uts/common/fs/zfs/sys/hkdf.h b/usr/src/uts/common/fs/zfs/sys/hkdf.h new file mode 100644 index 0000000000..e0f7678c03 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/hkdf.h @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#ifndef _SYS_HKDF_H_ +#define _SYS_HKDF_H_ + +#include <sys/types.h> + +int hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt, + uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key, + uint_t out_len); + +#endif /* _SYS_HKDF_H_ */ diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h index 0059a245ee..ec36727065 100644 --- a/usr/src/uts/common/fs/zfs/sys/refcount.h +++ b/usr/src/uts/common/fs/zfs/sys/refcount.h @@ -76,6 +76,8 @@ int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, void *); int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, void *); void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *); void zfs_refcount_transfer_ownership(zfs_refcount_t *, void *, void *); +void zfs_refcount_transfer_ownership_many(zfs_refcount_t *, uint64_t, + void *, void *); boolean_t zfs_refcount_held(zfs_refcount_t *, void *); boolean_t zfs_refcount_not_held(zfs_refcount_t *, void *); @@ -107,6 +109,7 @@ typedef struct refcount { atomic_add_64(&(dst)->rc_count, __tmp); \ } #define zfs_refcount_transfer_ownership(rc, current_holder, new_holder) (void)0 +#define zfs_refcount_transfer_ownership_many(rc, nr, ch, nh) ((void)0) #define zfs_refcount_held(rc, holder) ((rc)->rc_count > 0) #define zfs_refcount_not_held(rc, holder) (B_TRUE) diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 53b9e4ef5d..2d998d624d 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -61,6 +61,7 @@ typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; struct dsl_pool; struct dsl_dataset; +struct dsl_crypto_params; /* * General-purpose 32-bit and 64-bit bitfield encodings. @@ -216,7 +217,7 @@ typedef struct zio_cksum_salt { * G gang block indicator * B byteorder (endianness) * D dedup - * X encryption (on version 30, which is not supported) + * X encryption * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type @@ -230,6 +231,83 @@ typedef struct zio_cksum_salt { */ /* + * The blkptr_t's of encrypted blocks also need to store the encryption + * parameters so that the block can be decrypted. This layout is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | vdev1 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 1 |G| offset1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 2 | vdev2 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 3 |G| offset2 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 4 | salt | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 5 | IV1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 8 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 9 | physical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | logical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | IV2 | fill count | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * c | checksum[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * d | checksum[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * e | MAC[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * f | MAC[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * salt Salt for generating encryption keys + * IV1 First 64 bits of encryption IV + * X Block requires encryption handling (set to 1) + * E blkptr_t contains embedded data (set to 0, see below) + * fill count number of non-zero blocks under this bp (truncated to 32 bits) + * IV2 Last 32 bits of encryption IV + * checksum[2] 128-bit checksum of the data this bp describes + * MAC[2] 128-bit message authentication code for this data + * + * The X bit being set indicates that this block is one of 3 types. If this is + * a level 0 block with an encrypted object type, the block is encrypted + * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted + * object type, this block is authenticated with an HMAC (see + * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC + * words to store a checksum-of-MACs from the level below (see + * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED() + * refers to both encrypted and authenticated blocks and BP_USES_CRYPT() + * refers to any of these 3 kinds of blocks. + * + * The additional encryption parameters are the salt, IV, and MAC which are + * explained in greater detail in the block comment at the top of zio_crypt.c. + * The MAC occupies half of the checksum space since it serves a very similar + * purpose: to prevent data corruption on disk. The only functional difference + * is that the checksum is used to detect on-disk corruption whether or not the + * encryption key is loaded and the MAC provides additional protection against + * malicious disk tampering. We use the 3rd DVA to store the salt and first + * 64 bits of the IV. As a result encrypted blocks can only have 2 copies + * maximum instead of the normal 3. The last 32 bits of the IV are stored in + * the upper bits of what is usually the fill count. Note that only blocks at + * level 0 or -2 are ever encrypted, which allows us to guarantee that these + * 32 bits are not trampled over by other code (see zio_crypt.c for details). + * The salt and IV are not used for authenticated bps or bps with an indirect + * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits + * for the fill count. + */ + +/* * "Embedded" blkptr_t's don't actually point to a block, instead they * have a data payload embedded in the blkptr_t itself. See the comment * in blkptr.c for more details. @@ -284,7 +362,9 @@ typedef struct zio_cksum_salt { * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before * other macros, as they assert that they are only used on BP's of the correct - * "embedded-ness". + * "embedded-ness". Encrypted blkptr_t's cannot be embedded because they use + * the payload space for encryption parameters (see the comment above on + * how encryption parameters are stored). */ #define BPE_GET_ETYPE(bp) \ @@ -308,7 +388,7 @@ _NOTE(CONSTCOND) } while (0) BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) #define BPE_SET_PSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ - BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ + BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ _NOTE(CONSTCOND) } while (0) typedef enum bp_embedded_type { @@ -410,6 +490,26 @@ _NOTE(CONSTCOND) } while (0) #define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) #define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) +/* encrypted, authenticated, and MAC cksum bps use the same bit */ +#define BP_USES_CRYPT(bp) BF64_GET((bp)->blk_prop, 61, 1) +#define BP_SET_CRYPT(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) + +#define BP_IS_ENCRYPTED(bp) \ + (BP_USES_CRYPT(bp) && \ + BP_GET_LEVEL(bp) == 0 && \ + DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) + +#define BP_IS_AUTHENTICATED(bp) \ + (BP_USES_CRYPT(bp) && \ + BP_GET_LEVEL(bp) == 0 && \ + !DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) + +#define BP_HAS_INDIRECT_MAC_CKSUM(bp) \ + (BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0) + +#define BP_IS_PROTECTED(bp) \ + (BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp)) + #define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) #define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) @@ -427,7 +527,26 @@ _NOTE(CONSTCOND) } while (0) (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ } -#define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill) +#define BP_GET_FILL(bp) \ + ((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \ + ((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill)) + +#define BP_SET_FILL(bp, fill) \ +{ \ + if (BP_IS_ENCRYPTED(bp)) \ + BF64_SET((bp)->blk_fill, 0, 32, fill); \ + else \ + (bp)->blk_fill = fill; \ +} + +#define BP_GET_IV2(bp) \ + (ASSERT(BP_IS_ENCRYPTED(bp)), \ + BF64_GET((bp)->blk_fill, 32, 32)) +#define BP_SET_IV2(bp, iv2) \ +{ \ + ASSERT(BP_IS_ENCRYPTED(bp)); \ + BF64_SET((bp)->blk_fill, 32, 32, iv2); \ +} #define BP_IS_METADATA(bp) \ (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) @@ -436,7 +555,7 @@ _NOTE(CONSTCOND) } while (0) (BP_IS_EMBEDDED(bp) ? 0 : \ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ - DVA_GET_ASIZE(&(bp)->blk_dva[2])) + (DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) #define BP_GET_UCSIZE(bp) \ (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) @@ -445,13 +564,13 @@ _NOTE(CONSTCOND) } while (0) (BP_IS_EMBEDDED(bp) ? 0 : \ !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ - !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) + (!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) #define BP_COUNT_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ DVA_GET_GANG(&(bp)->blk_dva[1]) + \ - DVA_GET_GANG(&(bp)->blk_dva[2]))) + (DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))) #define DVA_EQUAL(dva1, dva2) \ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ @@ -470,6 +589,10 @@ _NOTE(CONSTCOND) } while (0) ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ ((zc1).zc_word[3] - (zc2).zc_word[3]))) +#define ZIO_CHECKSUM_MAC_EQUAL(zc1, zc2) \ + (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ + ((zc1).zc_word[1] - (zc2).zc_word[1]))) + #define ZIO_CHECKSUM_IS_ZERO(zc) \ (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \ (zc)->zc_word[2] | (zc)->zc_word[3])) @@ -530,7 +653,7 @@ _NOTE(CONSTCOND) } while (0) #define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) -#define BP_SPRINTF_LEN 320 +#define BP_SPRINTF_LEN 400 /* * This macro allows code sharing between zfs, libzpool, and mdb. @@ -543,7 +666,18 @@ _NOTE(CONSTCOND) } while (0) { "zero", "single", "double", "triple" }; \ int len = 0; \ int copies = 0; \ - \ + const char *crypt_type; \ + if (bp != NULL) { \ + if (BP_IS_ENCRYPTED(bp)) { \ + crypt_type = "encrypted"; \ + } else if (BP_IS_AUTHENTICATED(bp)) { \ + crypt_type = "authenticated"; \ + } else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { \ + crypt_type = "indirect-MAC"; \ + } else { \ + crypt_type = "unencrypted"; \ + } \ + } \ if (bp == NULL) { \ len += func(buf + len, size - len, "<NULL>"); \ } else if (BP_IS_HOLE(bp)) { \ @@ -577,18 +711,27 @@ _NOTE(CONSTCOND) } while (0) (u_longlong_t)DVA_GET_ASIZE(dva), \ ws); \ } \ + if (BP_IS_ENCRYPTED(bp)) { \ + len += func(buf + len, size - len, \ + "salt=%llx iv=%llx:%llx%c", \ + (u_longlong_t)bp->blk_dva[2].dva_word[0], \ + (u_longlong_t)bp->blk_dva[2].dva_word[1], \ + (u_longlong_t)BP_GET_IV2(bp), \ + ws); \ + } \ if (BP_IS_GANG(bp) && \ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ copies--; \ len += func(buf + len, size - len, \ - "[L%llu %s] %s %s %s %s %s %s%c" \ + "[L%llu %s] %s %s %s %s %s %s %s%c" \ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ "cksum=%llx:%llx:%llx:%llx", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ checksum, \ compress, \ + crypt_type, \ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ BP_IS_GANG(bp) ? "gang" : "contiguous", \ BP_GET_DEDUP(bp) ? "dedup" : "unique", \ @@ -622,8 +765,8 @@ extern int spa_open_rewind(const char *pool, spa_t **, void *tag, nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); -extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, - nvlist_t *zplprops); +extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, + nvlist_t *zplprops, struct dsl_crypto_params *dcp); extern int spa_import_rootpool(char *devpath, char *devid); extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); @@ -890,9 +1033,10 @@ extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, /* error handling */ struct zbookmark_phys; -extern void spa_log_error(spa_t *spa, zio_t *zio); +extern void spa_log_error(spa_t *spa, const struct zbookmark_phys *zb); extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, - zio_t *zio, uint64_t stateoroffset, uint64_t length); + const struct zbookmark_phys *zb, struct zio *zio, uint64_t stateoroffset, + uint64_t length); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 19516a1a1b..d63013ce0d 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -44,6 +44,7 @@ #include <sys/refcount.h> #include <sys/bplist.h> #include <sys/bpobj.h> +#include <sys/dsl_crypt.h> #include <sys/zfeature.h> #include <sys/zthr.h> #include <zfeature_common.h> @@ -372,6 +373,8 @@ struct spa { uint64_t spa_deadman_synctime; /* deadman expiration timer */ uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */ spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */ + spa_keystore_t spa_keystore; /* loaded crypto keys */ + uint64_t spa_errata; /* errata issues detected */ /* * spa_iokstat_lock protects spa_iokstat and diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h index 70916c45b7..1457200dd8 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -94,7 +94,7 @@ typedef enum drr_headertype { /* flag #21 is reserved for a Delphix feature */ #define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) #define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23) -/* flag #24 is reserved for the raw send feature */ +#define DMU_BACKUP_FEATURE_RAW (1 << 24) /* flag #25 is reserved for the ZSTD compression feature */ /* @@ -105,7 +105,8 @@ typedef enum drr_headertype { DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ DMU_BACKUP_FEATURE_RESUMING | \ DMU_BACKUP_FEATURE_LARGE_BLOCKS | DMU_BACKUP_FEATURE_LARGE_DNODE | \ - DMU_BACKUP_FEATURE_COMPRESSED) + DMU_BACKUP_FEATURE_COMPRESSED | \ + DMU_BACKUP_FEATURE_RAW) /* Are all features in the given flag word currently supported? */ #define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) @@ -149,20 +150,50 @@ typedef enum dmu_send_resume_token_version { * cannot necessarily be received as a clone correctly. */ #define DRR_FLAG_FREERECORDS (1<<2) +/* + * When DRR_FLAG_SPILL_BLOCK is set it indicates the DRR_OBJECT_SPILL + * and DRR_SPILL_UNMODIFIED flags are meaningful in the send stream. + * + * When DRR_FLAG_SPILL_BLOCK is set, DRR_OBJECT records will have + * DRR_OBJECT_SPILL set if and only if they should have a spill block + * (either an existing one, or a new one in the send stream). When clear + * the object does not have a spill block and any existing spill block + * should be freed. + * + * Similarly, when DRR_FLAG_SPILL_BLOCK is set, DRR_SPILL records will + * have DRR_SPILL_UNMODIFIED set if and only if they were included for + * backward compatibility purposes, and can be safely ignored by new versions + * of zfs receive. Previous versions of ZFS which do not understand the + * DRR_FLAG_SPILL_BLOCK will process this record and recreate any missing + * spill blocks. + */ +#define DRR_FLAG_SPILL_BLOCK (1<<3) /* - * flags in the drr_checksumflags field in the DRR_WRITE and - * DRR_WRITE_BYREF blocks + * flags in the drr_flags field in the DRR_WRITE, DRR_SPILL, DRR_OBJECT, + * DRR_WRITE_BYREF, and DRR_OBJECT_RANGE blocks */ -#define DRR_CHECKSUM_DEDUP (1<<0) +#define DRR_CHECKSUM_DEDUP (1<<0) /* not used for DRR_SPILL blocks */ +#define DRR_RAW_BYTESWAP (1<<1) +#define DRR_OBJECT_SPILL (1<<2) /* OBJECT record has a spill block */ +#define DRR_SPILL_UNMODIFIED (1<<2) /* SPILL record for unmodified block */ #define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) +#define DRR_IS_RAW_BYTESWAPPED(flags) ((flags) & DRR_RAW_BYTESWAP) +#define DRR_OBJECT_HAS_SPILL(flags) ((flags) & DRR_OBJECT_SPILL) +#define DRR_SPILL_IS_UNMODIFIED(flags) ((flags) & DRR_SPILL_UNMODIFIED) /* deal with compressed drr_write replay records */ #define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0) #define DRR_WRITE_PAYLOAD_SIZE(drrw) \ (DRR_WRITE_COMPRESSED(drrw) ? (drrw)->drr_compressed_size : \ (drrw)->drr_logical_size) +#define DRR_SPILL_PAYLOAD_SIZE(drrs) \ + ((drrs)->drr_compressed_size ? \ + (drrs)->drr_compressed_size : (drrs)->drr_length) +#define DRR_OBJECT_PAYLOAD_SIZE(drro) \ + ((drro)->drr_raw_bonuslen != 0 ? \ + (drro)->drr_raw_bonuslen : P2ROUNDUP((drro)->drr_bonuslen, 8)) /* * zfs ioctl command structure @@ -171,7 +202,8 @@ typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, - DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES + DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_OBJECT_RANGE, + DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { @@ -198,8 +230,15 @@ typedef struct dmu_replay_record { uint8_t drr_checksumtype; uint8_t drr_compress; uint8_t drr_dn_slots; - uint8_t drr_pad[5]; + uint8_t drr_flags; + uint32_t drr_raw_bonuslen; uint64_t drr_toguid; + /* only (possibly) nonzero for raw streams */ + uint8_t drr_indblkshift; + uint8_t drr_nlevels; + uint8_t drr_nblkptr; + uint8_t drr_pad[5]; + uint64_t drr_maxblkid; /* bonus content follows */ } drr_object; struct drr_freeobjects { @@ -215,13 +254,17 @@ typedef struct dmu_replay_record { uint64_t drr_logical_size; uint64_t drr_toguid; uint8_t drr_checksumtype; - uint8_t drr_checksumflags; + uint8_t drr_flags; uint8_t drr_compressiontype; uint8_t drr_pad2[5]; /* deduplication key */ ddt_key_t drr_key; /* only nonzero if drr_compressiontype is not 0 */ uint64_t drr_compressed_size; + /* only nonzero for raw streams */ + uint8_t drr_salt[ZIO_DATA_SALT_LEN]; + uint8_t drr_iv[ZIO_DATA_IV_LEN]; + uint8_t drr_mac[ZIO_DATA_MAC_LEN]; /* content follows */ } drr_write; struct drr_free { @@ -242,7 +285,7 @@ typedef struct dmu_replay_record { uint64_t drr_refoffset; /* properties of the data */ uint8_t drr_checksumtype; - uint8_t drr_checksumflags; + uint8_t drr_flags; uint8_t drr_pad2[6]; ddt_key_t drr_key; /* deduplication key */ } drr_write_byref; @@ -250,7 +293,15 @@ typedef struct dmu_replay_record { uint64_t drr_object; uint64_t drr_length; uint64_t drr_toguid; - uint64_t drr_pad[4]; /* needed for crypto */ + uint8_t drr_flags; + uint8_t drr_compressiontype; + uint8_t drr_pad[6]; + /* only nonzero for raw streams */ + uint64_t drr_compressed_size; + uint8_t drr_salt[ZIO_DATA_SALT_LEN]; + uint8_t drr_iv[ZIO_DATA_IV_LEN]; + uint8_t drr_mac[ZIO_DATA_MAC_LEN]; + dmu_object_type_t drr_type; /* spill data follows */ } drr_spill; struct drr_write_embedded { @@ -266,6 +317,16 @@ typedef struct dmu_replay_record { uint32_t drr_psize; /* compr. (real) size of payload */ /* (possibly compressed) content follows */ } drr_write_embedded; + struct drr_object_range { + uint64_t drr_firstobj; + uint64_t drr_numslots; + uint64_t drr_toguid; + uint8_t drr_salt[ZIO_DATA_SALT_LEN]; + uint8_t drr_iv[ZIO_DATA_IV_LEN]; + uint8_t drr_mac[ZIO_DATA_MAC_LEN]; + uint8_t drr_flags; + uint8_t drr_pad[3]; + } drr_object_range; /* * Nore: drr_checksum is overlaid with all record types @@ -335,6 +396,7 @@ typedef enum zinject_type { ZINJECT_IGNORED_WRITES, ZINJECT_PANIC, ZINJECT_DELAY_IO, + ZINJECT_DECRYPT_FAULT, } zinject_type_t; typedef struct zfs_share { diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h index e6b18da95b..2e44ff2b14 100644 --- a/usr/src/uts/common/fs/zfs/sys/zil.h +++ b/usr/src/uts/common/fs/zfs/sys/zil.h @@ -33,6 +33,7 @@ #include <sys/spa.h> #include <sys/zio.h> #include <sys/dmu.h> +#include <sys/zio_crypt.h> #ifdef __cplusplus extern "C" { @@ -407,7 +408,8 @@ typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, struct lwb *lwb, zio_t *zio); extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, - zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); + zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg, + boolean_t decrypt); extern void zil_init(void); extern void zil_fini(void); diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 00d2ebbebb..ec4ec29d5a 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -104,6 +104,15 @@ enum zio_checksum { #define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 #define ZIO_DEDUPDITTO_MIN 100 +#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_CCM +#define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF + +/* macros defining encryption lengths */ +#define ZIO_OBJSET_MAC_LEN 32 +#define ZIO_DATA_IV_LEN 12 +#define ZIO_DATA_SALT_LEN 8 +#define ZIO_DATA_MAC_LEN 16 + /* * The number of "legacy" compression functions which can be set on individual * objects. @@ -191,16 +200,18 @@ enum zio_flag { ZIO_FLAG_DONT_PROPAGATE = 1 << 20, ZIO_FLAG_IO_BYPASS = 1 << 21, ZIO_FLAG_IO_REWRITE = 1 << 22, - ZIO_FLAG_RAW = 1 << 23, - ZIO_FLAG_GANG_CHILD = 1 << 24, - ZIO_FLAG_DDT_CHILD = 1 << 25, - ZIO_FLAG_GODFATHER = 1 << 26, - ZIO_FLAG_NOPWRITE = 1 << 27, - ZIO_FLAG_REEXECUTED = 1 << 28, - ZIO_FLAG_DELEGATED = 1 << 29, + ZIO_FLAG_RAW_COMPRESS = 1 << 23, + ZIO_FLAG_RAW_ENCRYPT = 1 << 24, + ZIO_FLAG_GANG_CHILD = 1 << 25, + ZIO_FLAG_DDT_CHILD = 1 << 26, + ZIO_FLAG_GODFATHER = 1 << 27, + ZIO_FLAG_NOPWRITE = 1 << 28, + ZIO_FLAG_REEXECUTED = 1 << 29, + ZIO_FLAG_DELEGATED = 1 << 30, }; #define ZIO_FLAG_MUSTSUCCEED 0 +#define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT) #define ZIO_DDT_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ @@ -314,12 +325,17 @@ typedef struct zio_prop { boolean_t zp_dedup_verify; boolean_t zp_nopwrite; uint32_t zp_zpl_smallblk; + boolean_t zp_encrypt; + boolean_t zp_byteorder; + uint8_t zp_salt[ZIO_DATA_SALT_LEN]; + uint8_t zp_iv[ZIO_DATA_IV_LEN]; + uint8_t zp_mac[ZIO_DATA_MAC_LEN]; } zio_prop_t; typedef struct zio_cksum_report zio_cksum_report_t; typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, - const void *good_data); + const abd_t *good_data); typedef void zio_cksum_free_f(void *cbdata, size_t size); struct zio_bad_cksum; /* defined in zio_checksum.h */ @@ -524,8 +540,9 @@ extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, enum zio_flag flags); -extern int zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, +extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t *slog); +extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); extern void zio_flush(zio_t *zio, vdev_t *vd); extern void zio_shrink(zio_t *zio, uint64_t size); @@ -598,6 +615,8 @@ extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); +extern int zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, + uint64_t type, int error); extern int zio_handle_fault_injection(zio_t *zio, int error); extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); @@ -607,18 +626,20 @@ extern hrtime_t zio_handle_io_delay(zio_t *zio); /* * Checksum ereport functions */ -extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio, - uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info); +extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, + const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, + uint64_t length, void *arg, struct zio_bad_cksum *info); extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, - const void *good_data, const void *bad_data, boolean_t drop_if_identical); + const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical); extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report); extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); /* If we have the good data in hand, this function can be used */ extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, - struct zio *zio, uint64_t offset, uint64_t length, - const void *good_data, const void *bad_data, struct zio_bad_cksum *info); + const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, + uint64_t length, const abd_t *good_data, const abd_t *bad_data, + struct zio_bad_cksum *info); /* Called from spa_sync(), but primarily an injection handler */ extern void spa_handle_ignored_writes(spa_t *spa); diff --git a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h index 3eda057eae..6119163af8 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h @@ -54,7 +54,7 @@ typedef enum zio_checksum_flags { /* Uses salt value */ ZCHECKSUM_FLAG_SALTED = (1 << 4), /* Strong enough for nopwrite? */ - ZCHECKSUM_FLAG_NOPWRITE = (1 << 5) + ZCHECKSUM_FLAG_NOPWRITE = (1 << 5), } zio_checksum_flags_t; /* @@ -103,7 +103,7 @@ extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, void *, uint64_t, uint64_t, zio_bad_cksum_t *); extern void zio_checksum_compute(zio_t *, enum zio_checksum, struct abd *, uint64_t); -extern int zio_checksum_error_impl(spa_t *, blkptr_t *, enum zio_checksum, +extern int zio_checksum_error_impl(spa_t *, const blkptr_t *, enum zio_checksum, struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *); extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); extern enum zio_checksum spa_dedup_checksum(spa_t *spa); diff --git a/usr/src/uts/common/fs/zfs/sys/zio_crypt.h b/usr/src/uts/common/fs/zfs/sys/zio_crypt.h new file mode 100644 index 0000000000..6163f97458 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/sys/zio_crypt.h @@ -0,0 +1,152 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#ifndef _SYS_ZIO_CRYPT_H +#define _SYS_ZIO_CRYPT_H + +#include <sys/dmu.h> +#include <sys/refcount.h> +#include <sys/crypto/api.h> +#include <sys/nvpair.h> +#include <sys/avl.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* forward declarations */ +struct zbookmark_phys; + +#define WRAPPING_KEY_LEN 32 +#define WRAPPING_IV_LEN ZIO_DATA_IV_LEN +#define WRAPPING_MAC_LEN ZIO_DATA_MAC_LEN +#define MASTER_KEY_MAX_LEN 32 +#define SHA512_HMAC_KEYLEN 64 + +#define ZIO_CRYPT_KEY_CURRENT_VERSION 1ULL + +typedef enum zio_crypt_type { + ZC_TYPE_NONE = 0, + ZC_TYPE_CCM, + ZC_TYPE_GCM +} zio_crypt_type_t; + +/* table of supported crypto algorithms, modes and keylengths. */ +typedef struct zio_crypt_info { + /* mechanism name, needed by ICP */ + crypto_mech_name_t ci_mechname; + + /* cipher mode type (GCM, CCM) */ + zio_crypt_type_t ci_crypt_type; + + /* length of the encryption key */ + size_t ci_keylen; + + /* human-readable name of the encryption alforithm */ + char *ci_name; +} zio_crypt_info_t; + +extern zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS]; + +/* in memory representation of an unwrapped key that is loaded into memory */ +typedef struct zio_crypt_key { + /* encryption algorithm */ + uint64_t zk_crypt; + + /* on-disk format version */ + uint64_t zk_version; + + /* GUID for uniquely identifying this key. Not encrypted on disk. */ + uint64_t zk_guid; + + /* buffer for master key */ + uint8_t zk_master_keydata[MASTER_KEY_MAX_LEN]; + + /* buffer for hmac key */ + uint8_t zk_hmac_keydata[SHA512_HMAC_KEYLEN]; + + /* buffer for currrent encryption key derived from master key */ + uint8_t zk_current_keydata[MASTER_KEY_MAX_LEN]; + + /* current 64 bit salt for deriving an encryption key */ + uint8_t zk_salt[ZIO_DATA_SALT_LEN]; + + /* count of how many times the current salt has been used */ + uint64_t zk_salt_count; + + /* illumos crypto api current encryption key */ + crypto_key_t zk_current_key; + + /* template of current encryption key for illumos crypto api */ + crypto_ctx_template_t zk_current_tmpl; + + /* illumos crypto api current hmac key */ + crypto_key_t zk_hmac_key; + + /* template of hmac key for illumos crypto api */ + crypto_ctx_template_t zk_hmac_tmpl; + + /* lock for changing the salt and dependant values */ + krwlock_t zk_salt_lock; +} zio_crypt_key_t; + +void zio_crypt_key_destroy(zio_crypt_key_t *key); +int zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key); +int zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt_out); + +int zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, + uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out); +int zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, + uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, + uint8_t *mac, zio_crypt_key_t *key); +int zio_crypt_generate_iv(uint8_t *ivbuf); +int zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, + uint_t datalen, uint8_t *ivbuf, uint8_t *salt); + +void zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv); +void zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv); +void zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac); +void zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac); +void zio_crypt_encode_mac_zil(void *data, uint8_t *mac); +void zio_crypt_decode_mac_zil(const void *data, uint8_t *mac); +void zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen); + +int zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, + uint_t datalen, boolean_t byteswap, uint8_t *cksum); +int zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, + uint_t datalen, boolean_t byteswap, uint8_t *cksum); +int zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, + uint8_t *digestbuf, uint_t digestlen); +int zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, + boolean_t byteswap, uint8_t *portable_mac, uint8_t *local_mac); +int zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, + dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, + uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, + boolean_t *no_crypt); +int zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, + dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, + uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, + boolean_t *no_crypt); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIO_CRYPT_H */ diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h index a36749a308..703522b67d 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h @@ -99,6 +99,18 @@ extern "C" { * physical I/O. The nop write feature can handle writes in either * syncing or open context (i.e. zil writes) and as a result is mutually * exclusive with dedup. + * + * Encryption: + * Encryption and authentication is handled by the ZIO_STAGE_ENCRYPT stage. + * This stage determines how the encryption metadata is stored in the bp. + * Decryption and MAC verification is performed during zio_decrypt() as a + * transform callback. Encryption is mutually exclusive with nopwrite, because + * blocks with the same plaintext will be encrypted with different salts and + * IV's (if dedup is off), and therefore have different ciphertexts. For dedup + * blocks we deterministically generate the IV and salt by performing an HMAC + * of the plaintext, which is computationally expensive, but allows us to keep + * support for encrypted dedup. See the block comment in zio_crypt.c for + * details. */ /* @@ -113,32 +125,33 @@ enum zio_stage { ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */ ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */ - ZIO_STAGE_CHECKSUM_GENERATE = 1 << 6, /* -W--- */ + ZIO_STAGE_ENCRYPT = 1 << 6, /* -W--- */ + ZIO_STAGE_CHECKSUM_GENERATE = 1 << 7, /* -W--- */ - ZIO_STAGE_NOP_WRITE = 1 << 7, /* -W--- */ + ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W--- */ - ZIO_STAGE_DDT_READ_START = 1 << 8, /* R---- */ - ZIO_STAGE_DDT_READ_DONE = 1 << 9, /* R---- */ - ZIO_STAGE_DDT_WRITE = 1 << 10, /* -W--- */ - ZIO_STAGE_DDT_FREE = 1 << 11, /* --F-- */ + ZIO_STAGE_DDT_READ_START = 1 << 9, /* R---- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 10, /* R---- */ + ZIO_STAGE_DDT_WRITE = 1 << 11, /* -W--- */ + ZIO_STAGE_DDT_FREE = 1 << 12, /* --F-- */ - ZIO_STAGE_GANG_ASSEMBLE = 1 << 12, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE = 1 << 13, /* RWFC- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 13, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE = 1 << 14, /* RWFC- */ - ZIO_STAGE_DVA_THROTTLE = 1 << 14, /* -W--- */ - ZIO_STAGE_DVA_ALLOCATE = 1 << 15, /* -W--- */ - ZIO_STAGE_DVA_FREE = 1 << 16, /* --F-- */ - ZIO_STAGE_DVA_CLAIM = 1 << 17, /* ---C- */ + ZIO_STAGE_DVA_THROTTLE = 1 << 15, /* -W--- */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 16, /* -W--- */ + ZIO_STAGE_DVA_FREE = 1 << 17, /* --F-- */ + ZIO_STAGE_DVA_CLAIM = 1 << 18, /* ---C- */ - ZIO_STAGE_READY = 1 << 18, /* RWFCI */ + ZIO_STAGE_READY = 1 << 19, /* RWFCI */ - ZIO_STAGE_VDEV_IO_START = 1 << 19, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE = 1 << 20, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS = 1 << 21, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 20, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 21, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 22, /* RW--I */ - ZIO_STAGE_CHECKSUM_VERIFY = 1 << 22, /* R---- */ + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 23, /* R---- */ - ZIO_STAGE_DONE = 1 << 23 /* RWFCI */ + ZIO_STAGE_DONE = 1 << 24 /* RWFCI */ }; #define ZIO_INTERLOCK_STAGES \ @@ -190,12 +203,14 @@ enum zio_stage { #define ZIO_REWRITE_PIPELINE \ (ZIO_WRITE_COMMON_STAGES | \ ZIO_STAGE_WRITE_COMPRESS | \ + ZIO_STAGE_ENCRYPT | \ ZIO_STAGE_WRITE_BP_INIT) #define ZIO_WRITE_PIPELINE \ (ZIO_WRITE_COMMON_STAGES | \ ZIO_STAGE_WRITE_BP_INIT | \ ZIO_STAGE_WRITE_COMPRESS | \ + ZIO_STAGE_ENCRYPT | \ ZIO_STAGE_DVA_THROTTLE | \ ZIO_STAGE_DVA_ALLOCATE) @@ -210,6 +225,7 @@ enum zio_stage { ZIO_STAGE_WRITE_BP_INIT | \ ZIO_STAGE_ISSUE_ASYNC | \ ZIO_STAGE_WRITE_COMPRESS | \ + ZIO_STAGE_ENCRYPT | \ ZIO_STAGE_CHECKSUM_GENERATE | \ ZIO_STAGE_DDT_WRITE) diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index c9f1212168..73b7c8e2fc 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -1331,7 +1331,7 @@ vdev_probe_done(zio_t *zio) ASSERT(zio->io_error != 0); vdev_dbgmsg(vd, "failed probe"); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, - spa, vd, NULL, 0, 0); + spa, vd, NULL, NULL, 0, 0); zio->io_error = SET_ERROR(ENXIO); } @@ -4189,7 +4189,8 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; } - zfs_ereport_post(class, spa, vd, NULL, save_state, 0); + zfs_ereport_post(class, spa, vd, NULL, NULL, + save_state, 0); } /* Erase any notion of persistent removed state */ diff --git a/usr/src/uts/common/fs/zfs/vdev_indirect.c b/usr/src/uts/common/fs/zfs/vdev_indirect.c index 062c4073a8..9626589444 100644 --- a/usr/src/uts/common/fs/zfs/vdev_indirect.c +++ b/usr/src/uts/common/fs/zfs/vdev_indirect.c @@ -1381,7 +1381,7 @@ vdev_indirect_checksum_error(zio_t *zio, void *bad_buf = abd_borrow_buf_copy(ic->ic_data, is->is_size); abd_t *good_abd = is->is_good_child->ic_data; void *good_buf = abd_borrow_buf_copy(good_abd, is->is_size); - zfs_ereport_post_checksum(zio->io_spa, vd, zio, + zfs_ereport_post_checksum(zio->io_spa, vd, &zio->io_bookmark, zio, is->is_target_offset, is->is_size, good_buf, bad_buf, &zbc); abd_return_buf(ic->ic_data, bad_buf, is->is_size); abd_return_buf(good_abd, good_buf, is->is_size); @@ -1458,9 +1458,9 @@ vdev_indirect_all_checksum_errors(zio_t *zio) vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); - zfs_ereport_post_checksum(zio->io_spa, vd, zio, - is->is_target_offset, is->is_size, - NULL, NULL, NULL); + zfs_ereport_post_checksum(zio->io_spa, vd, + &zio->io_bookmark, zio, is->is_target_offset, + is->is_size, NULL, NULL, NULL); } } } diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 6502353542..0dcbb863e3 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -267,21 +267,17 @@ static void vdev_raidz_map_free(raidz_map_t *rm) { int c; - size_t size; for (c = 0; c < rm->rm_firstdatacol; c++) { abd_free(rm->rm_col[c].rc_abd); if (rm->rm_col[c].rc_gdata != NULL) - zio_buf_free(rm->rm_col[c].rc_gdata, - rm->rm_col[c].rc_size); + abd_free(rm->rm_col[c].rc_gdata); + } - size = 0; - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) abd_put(rm->rm_col[c].rc_abd); - size += rm->rm_col[c].rc_size; - } if (rm->rm_abd_copy != NULL) abd_free(rm->rm_abd_copy); @@ -314,14 +310,14 @@ vdev_raidz_cksum_free(void *arg, size_t ignored) } static void -vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) +vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) { raidz_map_t *rm = zcr->zcr_cbdata; size_t c = zcr->zcr_cbinfo; - size_t x; + size_t x, offset; - const char *good = NULL; - char *bad; + const abd_t *good = NULL; + const abd_t *bad = rm->rm_col[c].rc_abd; if (good_data == NULL) { zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); @@ -336,8 +332,6 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) */ if (rm->rm_col[0].rc_gdata == NULL) { abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; - char *buf; - int offset; /* * Set up the rm_col[]s to generate the parity for @@ -346,20 +340,21 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) */ for (x = 0; x < rm->rm_firstdatacol; x++) { bad_parity[x] = rm->rm_col[x].rc_abd; - rm->rm_col[x].rc_gdata = - zio_buf_alloc(rm->rm_col[x].rc_size); rm->rm_col[x].rc_abd = - abd_get_from_buf(rm->rm_col[x].rc_gdata, + rm->rm_col[x].rc_gdata = + abd_alloc_sametype(rm->rm_col[x].rc_abd, rm->rm_col[x].rc_size); } /* fill in the data columns from good_data */ - buf = (char *)good_data; + offset = 0; for (; x < rm->rm_cols; x++) { abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_from_buf(buf, - rm->rm_col[x].rc_size); - buf += rm->rm_col[x].rc_size; + + rm->rm_col[x].rc_abd = + abd_get_offset_size((abd_t *)good_data, + offset, rm->rm_col[x].rc_size); + offset += rm->rm_col[x].rc_size; } /* @@ -368,34 +363,35 @@ vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) vdev_raidz_generate_parity(rm); /* restore everything back to its original state */ - for (x = 0; x < rm->rm_firstdatacol; x++) { - abd_put(rm->rm_col[x].rc_abd); + for (x = 0; x < rm->rm_firstdatacol; x++) rm->rm_col[x].rc_abd = bad_parity[x]; - } offset = 0; for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { abd_put(rm->rm_col[x].rc_abd); - rm->rm_col[x].rc_abd = abd_get_offset( - rm->rm_abd_copy, offset); + rm->rm_col[x].rc_abd = abd_get_offset_size( + rm->rm_abd_copy, offset, + rm->rm_col[x].rc_size); offset += rm->rm_col[x].rc_size; } } ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); - good = rm->rm_col[c].rc_gdata; + good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0, + rm->rm_col[c].rc_size); } else { /* adjust good_data to point at the start of our column */ - good = good_data; - + offset = 0; for (x = rm->rm_firstdatacol; x < c; x++) - good += rm->rm_col[x].rc_size; + offset += rm->rm_col[x].rc_size; + + good = abd_get_offset_size((abd_t *)good_data, offset, + rm->rm_col[c].rc_size); } - bad = abd_borrow_buf_copy(rm->rm_col[c].rc_abd, rm->rm_col[c].rc_size); /* we drop the ereport if it ends up that the data was good */ zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); - abd_return_buf(rm->rm_col[c].rc_abd, bad, rm->rm_col[c].rc_size); + abd_put((abd_t *)good); } /* @@ -438,14 +434,16 @@ vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) size += rm->rm_col[c].rc_size; - rm->rm_abd_copy = - abd_alloc_sametype(rm->rm_col[rm->rm_firstdatacol].rc_abd, size); + rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE); for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; - abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset); + abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, + col->rc_size); - abd_copy(tmp, col->rc_abd, col->rc_size); + ASSERT3S(tmp->abd_size, >=, col->rc_size); + ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size); + abd_copy_off(tmp, col->rc_abd, 0, 0, col->rc_size); abd_put(col->rc_abd); col->rc_abd = tmp; @@ -562,13 +560,15 @@ vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset, for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_abd = - abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE); + abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); - rm->rm_col[c].rc_abd = abd_get_offset(abd, 0); + rm->rm_col[c].rc_abd = abd_get_offset_size(abd, 0, + rm->rm_col[c].rc_size); off = rm->rm_col[c].rc_size; for (c = c + 1; c < acols; c++) { - rm->rm_col[c].rc_abd = abd_get_offset(abd, off); + rm->rm_col[c].rc_abd = abd_get_offset_size(abd, off, + rm->rm_col[c].rc_size); off += rm->rm_col[c].rc_size; } @@ -683,7 +683,8 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + ASSERT3U(src->abd_size, >=, rm->rm_col[c].rc_size); + abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size); } else { struct pqr_struct pqr = { p, NULL, NULL }; (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, @@ -711,20 +712,22 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); - (void) memcpy(q, p, rm->rm_col[c].rc_size); - } else { - struct pqr_struct pqr = { p, q, NULL }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, - vdev_raidz_pq_func, &pqr); - } + ASSERT(ccnt == pcnt || ccnt == 0); - if (c == rm->rm_firstdatacol) { + abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size); + (void) memcpy(q, p, rm->rm_col[c].rc_size); for (i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; } } else { + struct pqr_struct pqr = { p, q, NULL }; + + ASSERT(ccnt <= pcnt); + + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pq_func, &pqr); + /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. @@ -758,22 +761,24 @@ vdev_raidz_generate_parity_pqr(raidz_map_t *rm) ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); if (c == rm->rm_firstdatacol) { - abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + ASSERT3S(src->abd_size, >=, rm->rm_col[c].rc_size); + ASSERT(ccnt == pcnt || ccnt == 0); + abd_copy_to_buf_off(p, src, 0, rm->rm_col[c].rc_size); (void) memcpy(q, p, rm->rm_col[c].rc_size); (void) memcpy(r, p, rm->rm_col[c].rc_size); - } else { - struct pqr_struct pqr = { p, q, r }; - (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, - vdev_raidz_pqr_func, &pqr); - } - if (c == rm->rm_firstdatacol) { for (i = ccnt; i < pcnt; i++) { p[i] = 0; q[i] = 0; r[i] = 0; } } else { + struct pqr_struct pqr = { p, q, r }; + + ASSERT(ccnt <= pcnt); + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pqr_func, &pqr); + /* * Treat short columns as though they are full of 0s. * Note that there's therefore nothing needed for P. @@ -941,7 +946,9 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; dst = rm->rm_col[x].rc_abd; - abd_copy(dst, src, rm->rm_col[x].rc_size); + ASSERT3S(dst->abd_size, >=, rm->rm_col[x].rc_size); + ASSERT3S(src->abd_size, >=, rm->rm_col[x].rc_size); + abd_copy_off(dst, src, 0, 0, rm->rm_col[x].rc_size); for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { uint64_t size = MIN(rm->rm_col[x].rc_size, @@ -979,14 +986,19 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) dst = rm->rm_col[x].rc_abd; if (c == rm->rm_firstdatacol) { - abd_copy(dst, src, size); + if (dst != src) { + ASSERT3S(dst->abd_size, >=, size); + ASSERT3S(src->abd_size, >=, size); + abd_copy_off(dst, src, 0, 0, size); + } if (rm->rm_col[x].rc_size > size) abd_zero_off(dst, size, rm->rm_col[x].rc_size - size); } else { ASSERT3U(size, <=, rm->rm_col[x].rc_size); - (void) abd_iterate_func2(dst, src, 0, 0, size, - vdev_raidz_reconst_q_pre_func, NULL); + if (src != dst) + (void) abd_iterate_func2(dst, src, 0, 0, size, + vdev_raidz_reconst_q_pre_func, NULL); (void) abd_iterate_func(dst, size, rm->rm_col[x].rc_size - size, vdev_raidz_reconst_q_pre_tail_func, NULL); @@ -1475,7 +1487,9 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) bufs[c] = col->rc_abd; col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); - abd_copy(col->rc_abd, bufs[c], col->rc_size); + ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size); + ASSERT3S(bufs[c]->abd_size, >=, col->rc_size); + abd_copy_off(col->rc_abd, bufs[c], 0, 0, col->rc_size); } } @@ -1571,7 +1585,9 @@ vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { raidz_col_t *col = &rm->rm_col[c]; - abd_copy(bufs[c], col->rc_abd, col->rc_size); + ASSERT3S(bufs[c]->abd_size, >=, col->rc_size); + ASSERT3S(col->rc_abd->abd_size, >=, col->rc_size); + abd_copy_off(bufs[c], col->rc_abd, 0, 0, col->rc_size); abd_free(col->rc_abd); col->rc_abd = bufs[c]; } @@ -2041,9 +2057,8 @@ vdev_raidz_io_start(zio_t *zio) * Report a checksum error for a child of a RAID-Z device. */ static void -raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) +raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) { - void *buf; vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -2057,11 +2072,9 @@ raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) zbc.zbc_has_cksum = 0; zbc.zbc_injected = rm->rm_ecksuminjected; - buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size); - zfs_ereport_post_checksum(zio->io_spa, vd, zio, - rc->rc_offset, rc->rc_size, buf, bad_data, - &zbc); - abd_return_buf(rc->rc_abd, buf, rc->rc_size); + zfs_ereport_post_checksum(zio->io_spa, vd, + &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, + rc->rc_abd, bad_data, &zbc); } } @@ -2091,7 +2104,7 @@ raidz_checksum_verify(zio_t *zio) static int raidz_parity_verify(zio_t *zio, raidz_map_t *rm) { - void *orig[VDEV_RAIDZ_MAXPARITY]; + abd_t *orig[VDEV_RAIDZ_MAXPARITY]; int c, ret = 0; raidz_col_t *rc; @@ -2106,8 +2119,8 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; - orig[c] = zio_buf_alloc(rc->rc_size); - abd_copy_to_buf(orig[c], rc->rc_abd, rc->rc_size); + orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size); + abd_copy(orig[c], rc->rc_abd, rc->rc_size); } vdev_raidz_generate_parity(rm); @@ -2116,12 +2129,12 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; - if (abd_cmp_buf(rc->rc_abd, orig[c], rc->rc_size) != 0) { + if (abd_cmp(orig[c], rc->rc_abd, rc->rc_abd->abd_size) != 0) { raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = SET_ERROR(ECKSUM); ret++; } - zio_buf_free(orig[c], rc->rc_size); + abd_free(orig[c]); } return (ret); @@ -2156,7 +2169,7 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) { raidz_map_t *rm = zio->io_vsd; raidz_col_t *rc; - void *orig[VDEV_RAIDZ_MAXPARITY]; + abd_t *orig[VDEV_RAIDZ_MAXPARITY]; int tstore[VDEV_RAIDZ_MAXPARITY + 2]; int *tgts = &tstore[1]; int current, next, i, c, n; @@ -2205,7 +2218,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) ASSERT(orig[i] != NULL); } - orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); + orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd, + rm->rm_col[0].rc_size); current = 0; next = tgts[current]; @@ -2224,7 +2238,9 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) ASSERT3S(c, >=, 0); ASSERT3S(c, <, rm->rm_cols); rc = &rm->rm_col[c]; - abd_copy_to_buf(orig[i], rc->rc_abd, + ASSERT3S(orig[i]->abd_size, >=, rc->rc_size); + ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size); + abd_copy_off(orig[i], rc->rc_abd, 0, 0, rc->rc_size); } @@ -2256,7 +2272,9 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) for (i = 0; i < n; i++) { c = tgts[i]; rc = &rm->rm_col[c]; - abd_copy_from_buf(rc->rc_abd, orig[i], + ASSERT3S(rc->rc_abd->abd_size, >=, rc->rc_size); + ASSERT3S(orig[i]->abd_size, >=, rc->rc_size); + abd_copy_off(rc->rc_abd, orig[i], 0, 0, rc->rc_size); } @@ -2294,9 +2312,8 @@ vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) } n--; done: - for (i = 0; i < n; i++) { - zio_buf_free(orig[i], rm->rm_col[0].rc_size); - } + for (i = 0; i < n; i++) + abd_free(orig[i]); return (ret); } @@ -2555,7 +2572,8 @@ vdev_raidz_io_done(zio_t *zio) zfs_ereport_start_checksum( zio->io_spa, vd->vdev_child[rc->rc_devidx], - zio, rc->rc_offset, rc->rc_size, + &zio->io_bookmark, zio, + rc->rc_offset, rc->rc_size, (void *)(uintptr_t)c, &zbc); } } diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c index 48b0be6665..dad227306b 100644 --- a/usr/src/uts/common/fs/zfs/zap_micro.c +++ b/usr/src/uts/common/fs/zfs/zap_micro.c @@ -670,7 +670,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, dmu_buf_will_dirty(db, tx); mzap_phys_t *zp = db->db_data; zp->mz_block_type = ZBT_MICRO; - zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; + (void) random_get_pseudo_bytes((void *)&zp->mz_salt, sizeof (uint64_t)); zp->mz_normflags = normflags; if (flags != 0) { diff --git a/usr/src/uts/common/fs/zfs/zcp_get.c b/usr/src/uts/common/fs/zfs/zcp_get.c index 1478c288d8..80814aeae4 100644 --- a/usr/src/uts/common/fs/zfs/zcp_get.c +++ b/usr/src/uts/common/fs/zfs/zcp_get.c @@ -421,6 +421,15 @@ get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, case ZFS_PROP_INCONSISTENT: numval = dsl_get_inconsistent(ds); break; + case ZFS_PROP_IVSET_GUID: + if (dsl_dataset_is_zapified(ds)) { + error = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_IVSET_GUID, + sizeof (numval), 1, &numval); + } else { + error = ENOENT; + } + break; case ZFS_PROP_RECEIVE_RESUME_TOKEN: { char *token = get_receive_resume_stats_impl(ds); VERIFY3U(strlcpy(strval, token, ZAP_MAXVALUELEN), <, diff --git a/usr/src/uts/common/fs/zfs/zfeature.c b/usr/src/uts/common/fs/zfs/zfeature.c index 35ce827979..da9077ee73 100644 --- a/usr/src/uts/common/fs/zfs/zfeature.c +++ b/usr/src/uts/common/fs/zfs/zfeature.c @@ -369,6 +369,19 @@ feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) spa->spa_feat_enabled_txg_obj, feature->fi_guid, sizeof (uint64_t), 1, &enabling_txg, tx)); } + + /* + * Errata #4 is mostly a problem with encrypted datasets, but it + * is also a problem where the old encryption feature did not + * depend on the bookmark_v2 feature. If the pool does not have + * any encrypted datasets we can resolve this issue simply by + * enabling this dependency. + */ + if (spa->spa_errata == ZPOOL_ERRATA_ZOL_8308_ENCRYPTION && + spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && + !spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION) && + feature->fi_feature == SPA_FEATURE_BOOKMARK_V2) + spa->spa_errata = 0; } static void @@ -413,8 +426,8 @@ spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx) * We create feature flags ZAP objects in two instances: during pool * creation and during pool upgrade. */ - ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)) || (!spa->spa_sync_on && - tx->tx_txg == TXG_INITIAL)); + ASSERT((!spa->spa_sync_on && tx->tx_txg == TXG_INITIAL) || + dsl_pool_sync_context(spa_get_dsl(spa))); spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, diff --git a/usr/src/uts/common/fs/zfs/zfs_fm.c b/usr/src/uts/common/fs/zfs/zfs_fm.c index 398a3d04aa..07a7a9f70b 100644 --- a/usr/src/uts/common/fs/zfs/zfs_fm.c +++ b/usr/src/uts/common/fs/zfs/zfs_fm.c @@ -104,8 +104,8 @@ #ifdef _KERNEL static void zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, - const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, - uint64_t stateoroffset, uint64_t size) + const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, + zio_t *zio, uint64_t stateoroffset, uint64_t size) { nvlist_t *ereport, *detector; @@ -318,24 +318,6 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, DATA_TYPE_UINT64, zio->io_size, NULL); } - - /* - * Payload for I/Os with corresponding logical information. - */ - if (zio->io_logical != NULL) - fm_payload_set(ereport, - FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, - DATA_TYPE_UINT64, - zio->io_logical->io_bookmark.zb_objset, - FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, - DATA_TYPE_UINT64, - zio->io_logical->io_bookmark.zb_object, - FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, - DATA_TYPE_INT64, - zio->io_logical->io_bookmark.zb_level, - FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, - DATA_TYPE_UINT64, - zio->io_logical->io_bookmark.zb_blkid, NULL); } else if (vd != NULL) { /* * If we have a vdev but no zio, this is a device fault, and the @@ -347,6 +329,20 @@ zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, DATA_TYPE_UINT64, stateoroffset, NULL); } + /* + * Payload for I/Os with corresponding logical information. + */ + if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, + DATA_TYPE_UINT64, zb->zb_objset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, + DATA_TYPE_UINT64, zb->zb_object, + FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, + DATA_TYPE_INT64, zb->zb_level, + FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, + DATA_TYPE_UINT64, zb->zb_blkid, NULL); + mutex_exit(&spa->spa_errlist_lock); *ereport_out = ereport; @@ -501,11 +497,11 @@ range_total_size(zfs_ecksum_info_t *eip) static zfs_ecksum_info_t * annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, - const uint8_t *goodbuf, const uint8_t *badbuf, size_t size, + const abd_t *goodabd, const abd_t *badabd, size_t size, boolean_t drop_if_identical) { - const uint64_t *good = (const uint64_t *)goodbuf; - const uint64_t *bad = (const uint64_t *)badbuf; + const uint64_t *good; + const uint64_t *bad; uint64_t allset = 0; uint64_t allcleared = 0; @@ -549,7 +545,7 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, } } - if (badbuf == NULL || goodbuf == NULL) + if (badabd == NULL || goodabd == NULL) return (eip); ASSERT3U(nui64s, <=, UINT32_MAX); @@ -557,6 +553,9 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(size, <=, UINT32_MAX); + good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size); + bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size); + /* build up the range list by comparing the two buffers. */ for (idx = 0; idx < nui64s; idx++) { if (good[idx] == bad[idx]) { @@ -586,6 +585,8 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, */ if (inline_size == 0 && drop_if_identical) { kmem_free(eip, sizeof (*eip)); + abd_return_buf((abd_t *)goodabd, (void *)good, size); + abd_return_buf((abd_t *)badabd, (void *)bad, size); return (NULL); } @@ -626,6 +627,10 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, eip->zei_ranges[range].zr_start *= sizeof (uint64_t); eip->zei_ranges[range].zr_end *= sizeof (uint64_t); } + + abd_return_buf((abd_t *)goodabd, (void *)good, size); + abd_return_buf((abd_t *)badabd, (void *)bad, size); + eip->zei_allowed_mingap *= sizeof (uint64_t); inline_size *= sizeof (uint64_t); @@ -666,15 +671,16 @@ annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, #endif void -zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, - uint64_t stateoroffset, uint64_t size) +zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, + const struct zbookmark_phys *zb, zio_t *zio, uint64_t stateoroffset, + uint64_t size) { #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; - zfs_ereport_start(&ereport, &detector, - subclass, spa, vd, zio, stateoroffset, size); + zfs_ereport_start(&ereport, &detector, subclass, spa, vd, + zb, zio, stateoroffset, size); if (ereport == NULL) return; @@ -687,7 +693,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, } void -zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, +zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, void *arg, zio_bad_cksum_t *info) { @@ -709,7 +715,7 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, #ifdef _KERNEL zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, - FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); + FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length); if (report->zcr_ereport == NULL) { report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo); @@ -729,8 +735,8 @@ zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, } void -zfs_ereport_finish_checksum(zio_cksum_report_t *report, - const void *good_data, const void *bad_data, boolean_t drop_if_identical) +zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, + const abd_t *bad_data, boolean_t drop_if_identical) { #ifdef _KERNEL zfs_ecksum_info_t *info = NULL; @@ -777,17 +783,17 @@ zfs_ereport_send_interim_checksum(zio_cksum_report_t *report) } void -zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, +zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, uint64_t length, - const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc) + const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) { #ifdef _KERNEL nvlist_t *ereport = NULL; nvlist_t *detector = NULL; zfs_ecksum_info_t *info; - zfs_ereport_start(&ereport, &detector, - FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); + zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, + spa, vd, zb, zio, offset, length); if (ereport == NULL) return; diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 712abee22f..71018c3836 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -32,7 +32,7 @@ * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome <tsoome@me.com> * Copyright 2017 RackTop Systems. - * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Datto, Inc. All rights reserved. */ /* @@ -192,6 +192,7 @@ #include <sys/vdev_removal.h> #include <sys/vdev_impl.h> #include <sys/vdev_initialize.h> +#include <sys/dsl_crypt.h> #include "zfs_namecheck.h" #include "zfs_prop.h" @@ -593,12 +594,12 @@ zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). */ - error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, + error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); - dmu_objset_disown(os, setsl_tag); + dmu_objset_disown(os, B_TRUE, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; @@ -1285,6 +1286,22 @@ zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) return (0); } +/* ARGSUSED */ +static int +zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_LOAD_KEY, cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_CHANGE_KEY, cr)); +} + /* * Policy for allowing temporary snapshots to be taken or released */ @@ -1481,7 +1498,7 @@ zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) if (zfsvfs->z_vfs) { VFS_RELE(zfsvfs->z_vfs); } else { - dmu_objset_disown(zfsvfs->z_os, zfsvfs); + dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } } @@ -1494,6 +1511,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; char *spa_name = zc->zc_name; + dsl_crypto_params_t *dcp = NULL; if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) @@ -1508,6 +1526,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) if (props) { nvlist_t *nvl = NULL; + nvlist_t *hidden_args = NULL; uint64_t version = SPA_VERSION; char *tname; @@ -1527,6 +1546,18 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) } (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } + + (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS, + &hidden_args); + error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, + rootprops, hidden_args, &dcp); + if (error != 0) { + nvlist_free(config); + nvlist_free(props); + return (error); + } + (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS); + VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); @@ -1538,7 +1569,7 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) spa_name = tname; } - error = spa_create(zc->zc_name, config, props, zplprops); + error = spa_create(zc->zc_name, config, props, zplprops, dcp); /* * Set the remaining root properties @@ -1552,6 +1583,7 @@ pool_props_bad: nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); + dsl_crypto_params_free(dcp, !!error); return (error); } @@ -1830,15 +1862,16 @@ zfs_ioc_obj_to_path(zfs_cmd_t *zc) int error; /* XXX reading from objset not owned */ - if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) + if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, + FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { - dmu_objset_rele(os, FTAG); + dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); - dmu_objset_rele(os, FTAG); + dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } @@ -1859,15 +1892,16 @@ zfs_ioc_obj_to_stats(zfs_cmd_t *zc) int error; /* XXX reading from objset not owned */ - if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) + if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, + FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { - dmu_objset_rele(os, FTAG); + dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); - dmu_objset_rele(os, FTAG); + dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } @@ -2437,7 +2471,8 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); - uint64_t intval; + uint64_t intval = 0; + char *strval = NULL; int err = -1; if (prop == ZPROP_INVAL) { @@ -2453,10 +2488,12 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, &pair) == 0); } - if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) - return (-1); - - VERIFY(0 == nvpair_value_uint64(pair, &intval)); + /* all special properties are numeric except for keylocation */ + if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { + strval = fnvpair_value_string(pair); + } else { + intval = fnvpair_value_uint64(pair); + } switch (prop) { case ZFS_PROP_QUOTA: @@ -2480,6 +2517,16 @@ zfs_prop_set_special(const char *dsname, zprop_source_t source, if (err == 0) err = -1; break; + case ZFS_PROP_KEYLOCATION: + err = dsl_crypto_can_set_keylocation(dsname, strval); + + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. + */ + if (err == 0) + err = -1; + break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; @@ -3183,6 +3230,8 @@ zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } + * (optional) "hidden_args" -> { "wkeydata" -> value } + * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outnvl: propname -> error code (int32) @@ -3193,15 +3242,18 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; + nvlist_t *hidden_args = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); int32_t type32; dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; + dsl_crypto_params_t *dcp = NULL; if (nvlist_lookup_int32(innvl, "type", &type32) != 0) return (SET_ERROR(EINVAL)); type = type32; (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); + (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); switch (type) { case DMU_OST_ZFS: @@ -3267,9 +3319,18 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) } } + error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops, + hidden_args, &dcp); + if (error != 0) { + nvlist_free(zct.zct_zplprops); + return (error); + } + error = dmu_objset_create(fsname, type, - is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); + is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct); + nvlist_free(zct.zct_zplprops); + dsl_crypto_params_free(dcp, !!error); /* * It would be nice to do this atomically. @@ -3287,6 +3348,8 @@ zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } + * (optional) "hidden_args" -> { "wkeydata" -> value } + * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outnvl: propname -> error code (int32) @@ -3308,9 +3371,8 @@ zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); + error = dmu_objset_clone(fsname, origin_name); - if (error != 0) - return (error); /* * It would be nice to do this atomically. @@ -4328,7 +4390,11 @@ extract_delay_props(nvlist_t *props) { nvlist_t *delayprops; nvpair_t *nvp, *tmp; - static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, 0 }; + static const zfs_prop_t delayable[] = { + ZFS_PROP_REFQUOTA, + ZFS_PROP_KEYLOCATION, + 0 + }; int i; VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); @@ -4517,7 +4583,7 @@ zfs_ioc_recv(zfs_cmd_t *zc) } } - if (delayprops != NULL) { + if (delayprops != NULL && props != NULL) { /* * Merge delayed props back in with initial props, in case * we're DEBUG and zfs_ioc_recv_inject_err is set (which means @@ -4627,6 +4693,7 @@ zfs_ioc_send(zfs_cmd_t *zc) boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); boolean_t compressok = (zc->zc_flags & 0x4); + boolean_t rawok = (zc->zc_flags & 0x8); if (zc->zc_obj != 0) { dsl_pool_t *dp; @@ -4658,7 +4725,8 @@ zfs_ioc_send(zfs_cmd_t *zc) if (error != 0) return (error); - error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, + FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); @@ -4674,7 +4742,7 @@ zfs_ioc_send(zfs_cmd_t *zc) } } - error = dmu_send_estimate(tosnap, fromsnap, compressok, + error = dmu_send_estimate(tosnap, fromsnap, compressok || rawok, &zc->zc_objset_type); if (fromsnap != NULL) @@ -4688,7 +4756,7 @@ zfs_ioc_send(zfs_cmd_t *zc) off = fp->f_offset; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, - zc->zc_fromobj, embedok, large_block_ok, compressok, + zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, zc->zc_cookie, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) @@ -5078,7 +5146,7 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(ds, &newds, - zfsvfs); + B_TRUE, zfsvfs); error = zfs_resume_fs(zfsvfs, newds); } } @@ -5087,12 +5155,12 @@ zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) VFS_RELE(zfsvfs->z_vfs); } else { /* XXX kind of reading contents without owning */ - error = dmu_objset_hold(zc->zc_name, FTAG, &os); + error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); error = dmu_objset_userspace_upgrade(os); - dmu_objset_rele(os, FTAG); + dmu_objset_rele_flags(os, B_TRUE, FTAG); } return (error); @@ -5250,7 +5318,7 @@ zfs_ioc_next_obj(zfs_cmd_t *zc) objset_t *os = NULL; int error; - error = dmu_objset_hold(zc->zc_name, FTAG, &os); + error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); @@ -5668,6 +5736,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted + * (optional) "rawok" -> (value ignored) + * presence indicates raw encrypted records should be used. * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * } @@ -5685,6 +5755,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) boolean_t largeblockok; boolean_t embedok; boolean_t compressok; + boolean_t rawok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; @@ -5697,6 +5768,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); + rawok = nvlist_exists(innvl, "rawok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); @@ -5707,7 +5779,7 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) off = fp->f_offset; error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, - fd, resumeobj, resumeoff, fp->f_vnode, &off); + rawok, fd, resumeobj, resumeoff, fp->f_vnode, &off); if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; @@ -5742,6 +5814,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) int error; char *fromname; boolean_t compressok; + boolean_t rawok; uint64_t space; error = dsl_pool_hold(snapname, FTAG, &dp); @@ -5755,6 +5828,7 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) } compressok = nvlist_exists(innvl, "compressok"); + rawok = nvlist_exists(innvl, "rawok"); error = nvlist_lookup_string(innvl, "from", &fromname); if (error == 0) { @@ -5768,8 +5842,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) goto out; - error = dmu_send_estimate(tosnap, fromsnap, compressok, - &space); + error = dmu_send_estimate(tosnap, fromsnap, + compressok || rawok, &space); dsl_dataset_rele(fromsnap, FTAG); } else if (strchr(fromname, '#') != NULL) { /* @@ -5784,7 +5858,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) if (error != 0) goto out; error = dmu_send_estimate_from_txg(tosnap, - frombm.zbm_creation_txg, compressok, &space); + frombm.zbm_creation_txg, compressok || rawok, + &space); } else { /* * from is not properly formatted as a snapshot or @@ -5797,7 +5872,8 @@ zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) /* * If estimating the size of a full send, use dmu_send_estimate. */ - error = dmu_send_estimate(tosnap, NULL, compressok, &space); + error = dmu_send_estimate(tosnap, NULL, compressok || rawok, + &space); } fnvlist_add_uint64(outnvl, "space", space); @@ -5846,6 +5922,124 @@ zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) return (err); } +/* + * Load a user's wrapping key into the kernel. + * innvl: { + * "hidden_args" -> { "wkeydata" -> value } + * raw uint8_t array of encryption wrapping key data (32 bytes) + * (optional) "noop" -> (value ignored) + * presence indicated key should only be verified, not loaded + * } + */ +/* ARGSUSED */ +static int +zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int ret = 0; + dsl_crypto_params_t *dcp = NULL; + nvlist_t *hidden_args; + boolean_t noop = nvlist_exists(innvl, "noop"); + + if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { + ret = SET_ERROR(EINVAL); + goto error; + } + + ret = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); + if (ret != 0) { + ret = SET_ERROR(EINVAL); + goto error; + } + + ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, + hidden_args, &dcp); + if (ret != 0) + goto error; + + ret = spa_keystore_load_wkey(dsname, dcp, noop); + if (ret != 0) + goto error; + + dsl_crypto_params_free(dcp, noop); + + return (0); + +error: + dsl_crypto_params_free(dcp, B_TRUE); + return (ret); +} + +/* + * Unload a user's wrapping key from the kernel. + * Both innvl and outnvl are unused. + */ +/* ARGSUSED */ +static int +zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int ret = 0; + + if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { + ret = (SET_ERROR(EINVAL)); + goto out; + } + + ret = spa_keystore_unload_wkey(dsname); + if (ret != 0) + goto out; + +out: + return (ret); +} + +/* + * Changes a user's wrapping key used to decrypt a dataset. The keyformat, + * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified + * here to change how the key is derived in userspace. + * + * innvl: { + * "hidden_args" (optional) -> { "wkeydata" -> value } + * raw uint8_t array of new encryption wrapping key data (32 bytes) + * "props" (optional) -> { prop -> value } + * } + * + * outnvl is unused + */ +/* ARGSUSED */ +static int +zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int ret; + uint64_t cmd = DCP_CMD_NONE; + dsl_crypto_params_t *dcp = NULL; + nvlist_t *args = NULL, *hidden_args = NULL; + + if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { + ret = (SET_ERROR(EINVAL)); + goto error; + } + + (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd); + (void) nvlist_lookup_nvlist(innvl, "props", &args); + (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); + + ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp); + if (ret != 0) + goto error; + + ret = spa_keystore_change_key(dsname, dcp); + if (ret != 0) + goto error; + + dsl_crypto_params_free(dcp, B_FALSE); + + return (0); + +error: + dsl_crypto_params_free(dcp, B_TRUE); + return (ret); +} + static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void @@ -6040,6 +6234,17 @@ zfs_ioctl_init(void) zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE); + zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, + zfs_ioc_load_key, zfs_secpolicy_load_key, + DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE); + zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY, + zfs_ioc_unload_key, zfs_secpolicy_load_key, + DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE); + zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY, + zfs_ioc_change_key, zfs_secpolicy_change_key, + DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, + B_TRUE, B_TRUE); + /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index f7beea4cc9..dfd13539cd 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -974,8 +974,8 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp) * We claim to always be readonly so we can open snapshots; * other ZPL code will prevent us from writing to snapshots. */ - - error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); + error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, B_TRUE, zfsvfs, + &os); if (error != 0) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); @@ -983,7 +983,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp) error = zfsvfs_create_impl(zfvp, zfsvfs, os); if (error != 0) { - dmu_objset_disown(os, zfsvfs); + dmu_objset_disown(os, B_TRUE, zfsvfs); } return (error); } @@ -1084,7 +1084,10 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) zfsvfs->z_replay = B_FALSE; } } - zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ + + /* restore readonly bit */ + if (readonly != 0) + zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; } /* @@ -1235,7 +1238,7 @@ zfs_domount(vfs_t *vfsp, char *osname) zfsctl_create(zfsvfs); out: if (error) { - dmu_objset_disown(zfsvfs->z_os, zfsvfs); + dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } else { atomic_inc_32(&zfs_active_fs_count); @@ -1903,7 +1906,7 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) /* * Finally release the objset */ - dmu_objset_disown(os, zfsvfs); + dmu_objset_disown(os, B_TRUE, zfsvfs); } /* diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index a68fc3dd34..c8cb5b3935 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -915,8 +915,8 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); - dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), - woff, abuf, tx); + dmu_assign_arcbuf_by_dbuf( + sa_get_db(zp->z_sa_hdl), woff, abuf, tx); } ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 7e88c51a0b..e56104f979 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -194,8 +194,8 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) * Read a log block and make sure it's valid. */ static int -zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, - char **end) +zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, + blkptr_t *nbp, void *dst, char **end) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; @@ -209,11 +209,14 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) zio_flags |= ZIO_FLAG_SPECULATIVE; + if (!decrypt) + zio_flags |= ZIO_FLAG_RAW; + SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, - ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, + &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; @@ -288,6 +291,14 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; + /* + * If we are not using the resulting data, we are just checking that + * it hasn't been corrupted so we don't need to waste CPU time + * decompressing and decrypting it. + */ + if (wbuf == NULL) + zio_flags |= ZIO_FLAG_RAW; + SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); @@ -308,7 +319,8 @@ zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) */ int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, - zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) + zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg, + boolean_t decrypt) { const zil_header_t *zh = zilog->zl_header; boolean_t claimed = !!zh->zh_claim_txg; @@ -347,7 +359,9 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, if (blk_seq > claim_blk_seq) break; - if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) + + error = parse_blk_func(zilog, &blk, arg, txg); + if (error != 0) break; ASSERT3U(max_blk_seq, <, blk_seq); max_blk_seq = blk_seq; @@ -356,7 +370,8 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) break; - error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); + error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, + lrbuf, &end); if (error != 0) break; @@ -366,7 +381,9 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, ASSERT3U(reclen, >=, sizeof (lr_t)); if (lr->lrc_seq > claim_lr_seq) goto done; - if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) + + error = parse_lr_func(zilog, lr, arg, txg); + if (error != 0) goto done; ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; @@ -381,7 +398,8 @@ done: zilog->zl_parse_lr_count = lr_count; ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || - (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); + (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq) || + (decrypt && error == EIO)); zil_bp_tree_fini(zilog); zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); @@ -451,9 +469,12 @@ zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. */ - if (lr->lr_blkptr.blk_birth >= first_txg && - (error = zil_read_log_data(zilog, lr, NULL)) != 0) - return (error); + if (lr->lr_blkptr.blk_birth >= first_txg) { + error = zil_read_log_data(zilog, lr, NULL); + if (error != 0) + return (error); + } + return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } @@ -646,9 +667,8 @@ zil_create(zilog_t *zilog) BP_ZERO(&blk); } - error = zio_alloc_zil(zilog->zl_spa, - zilog->zl_os->os_dsl_dataset->ds_object, txg, &blk, NULL, - ZIL_MIN_BLKSZ, &slog); + error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, + NULL, ZIL_MIN_BLKSZ, &slog); if (error == 0) zil_init_log_chain(zilog, &blk); @@ -736,7 +756,7 @@ zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); (void) zil_parse(zilog, zil_free_log_block, - zil_free_log_record, tx, zilog->zl_header->zh_claim_txg); + zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE); } int @@ -750,7 +770,7 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) int error; error = dmu_objset_own_obj(dp, ds->ds_object, - DMU_OST_ANY, B_FALSE, FTAG, &os); + DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os); if (error != 0) { /* * EBUSY indicates that the objset is inconsistent, in which @@ -800,11 +820,13 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) zh->zh_claim_txg == 0)) { if (!BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_clear_log_block, - zil_noop_log_record, tx, first_txg); + zil_noop_log_record, tx, first_txg, B_FALSE); } BP_ZERO(&zh->zh_log); + if (os->os_encrypted) + os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); - dmu_objset_disown(os, FTAG); + dmu_objset_disown(os, B_FALSE, FTAG); return (0); } @@ -824,18 +846,20 @@ zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) ASSERT3U(zh->zh_claim_txg, <=, first_txg); if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_claim_log_block, - zil_claim_log_record, tx, first_txg); + zil_claim_log_record, tx, first_txg, B_FALSE); zh->zh_claim_txg = first_txg; zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) zh->zh_flags |= ZIL_REPLAY_NEEDED; zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; + if (os->os_encrypted) + os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; dsl_dataset_dirty(dmu_objset_ds(os), tx); } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); - dmu_objset_disown(os, FTAG); + dmu_objset_disown(os, B_FALSE, FTAG); return (0); } @@ -907,7 +931,7 @@ zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) */ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, zilog->zl_header->zh_claim_txg ? -1ULL : - spa_min_claim_txg(os->os_spa)); + spa_min_claim_txg(os->os_spa), B_FALSE); return ((error == ECKSUM || error == ENOENT) ? 0 : error); } @@ -1435,8 +1459,9 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) BP_ZERO(bp); /* pass the old blkptr in order to spread log blocks across devs */ - error = zio_alloc_zil(spa, zilog->zl_os->os_dsl_dataset->ds_object, - txg, bp, &lwb->lwb_blk, zil_blksz, &slog); + error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, &lwb->lwb_blk, + zil_blksz, &slog); + if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); bp->blk_cksum = lwb->lwb_blk.blk_cksum; @@ -3188,6 +3213,21 @@ zil_suspend(const char *osname, void **cookiep) return (0); } + /* + * The ZIL has work to do. Ensure that the associated encryption + * key will remain mapped while we are committing the log by + * grabbing a reference to it. If the key isn't loaded we have no + * choice but to return an error until the wrapping key is loaded. + */ + if (os->os_encrypted && + dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) { + zilog->zl_suspend--; + mutex_exit(&zilog->zl_lock); + dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); + dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); + return (SET_ERROR(EBUSY)); + } + zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); @@ -3202,9 +3242,10 @@ zil_suspend(const char *osname, void **cookiep) zil_commit_impl(zilog, 0); /* - * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we - * use txg_wait_synced() to ensure the data from the zilog has - * migrated to the main pool before calling zil_destroy(). + * Now that we've ensured all lwb's are LWB_STATE_DONE, + * txg_wait_synced() will be called from within zil_destroy(), + * which will ensure the data from the zilog has migrated to the + * main pool before it returns. */ txg_wait_synced(zilog->zl_dmu_pool, 0); @@ -3215,6 +3256,9 @@ zil_suspend(const char *osname, void **cookiep) cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); + if (os->os_encrypted) + dsl_dataset_remove_key_mapping(dmu_objset_ds(os)); + if (cookiep == NULL) zil_resume(os); else @@ -3381,7 +3425,7 @@ zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) zilog->zl_replay_time = ddi_get_lbolt(); ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, - zh->zh_claim_txg); + zh->zh_claim_txg, B_TRUE); kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 76312e6a74..2e44d59daf 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -45,6 +45,7 @@ #include <sys/metaslab_impl.h> #include <sys/abd.h> #include <sys/cityhash.h> +#include <sys/dsl_crypt.h> /* * ========================================================================== @@ -270,6 +271,13 @@ zio_data_buf_free(void *buf, size_t size) kmem_cache_free(zio_data_buf_cache[c], buf); } +/* ARGSUSED */ +static void +zio_abd_free(void *abd, size_t size) +{ + abd_free((abd_t *)abd); +} + /* * ========================================================================== * Push and pop I/O transform buffers @@ -322,7 +330,7 @@ zio_pop_transforms(zio_t *zio) /* * ========================================================================== - * I/O transform callbacks for subblocks and decompression + * I/O transform callbacks for subblocks, decompression, and decryption * ========================================================================== */ static void @@ -348,6 +356,132 @@ zio_decompress(zio_t *zio, abd_t *data, uint64_t size) } } +static void +zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) +{ + int ret; + void *tmp; + blkptr_t *bp = zio->io_bp; + spa_t *spa = zio->io_spa; + uint64_t dsobj = zio->io_bookmark.zb_objset; + uint64_t lsize = BP_GET_LSIZE(bp); + dmu_object_type_t ot = BP_GET_TYPE(bp); + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + boolean_t no_crypt = B_FALSE; + + ASSERT(BP_USES_CRYPT(bp)); + ASSERT3U(size, !=, 0); + + if (zio->io_error != 0) + return; + + /* + * Verify the cksum of MACs stored in an indirect bp. It will always + * be possible to verify this since it does not require an encryption + * key. + */ + if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { + zio_crypt_decode_mac_bp(bp, mac); + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { + /* + * We haven't decompressed the data yet, but + * zio_crypt_do_indirect_mac_checksum() requires + * decompressed data to be able to parse out the MACs + * from the indirect block. We decompress it now and + * throw away the result after we are finished. + */ + tmp = zio_buf_alloc(lsize); + ret = zio_decompress_data(BP_GET_COMPRESS(bp), + zio->io_abd, tmp, zio->io_size, lsize); + if (ret != 0) { + ret = SET_ERROR(EIO); + goto error; + } + ret = zio_crypt_do_indirect_mac_checksum(B_FALSE, + tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac); + zio_buf_free(tmp, lsize); + } else { + ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, + zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); + } + abd_copy(data, zio->io_abd, size); + + if (ret != 0) + goto error; + + return; + } + + /* + * If this is an authenticated block, just check the MAC. It would be + * nice to separate this out into its own flag, but for the moment + * enum zio_flag is out of bits. + */ + if (BP_IS_AUTHENTICATED(bp)) { + if (ot == DMU_OT_OBJSET) { + ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, + dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp)); + } else { + zio_crypt_decode_mac_bp(bp, mac); + ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, + zio->io_abd, size, mac); + } + abd_copy(data, zio->io_abd, size); + + if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) { + ret = zio_handle_decrypt_injection(spa, + &zio->io_bookmark, ot, ECKSUM); + } + if (ret != 0) + goto error; + + return; + } + + zio_crypt_decode_params_bp(bp, salt, iv); + + if (ot == DMU_OT_INTENT_LOG) { + tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); + zio_crypt_decode_mac_zil(tmp, mac); + abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t)); + } else { + zio_crypt_decode_mac_bp(bp, mac); + } + + ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), + BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data, + zio->io_abd, &no_crypt); + if (no_crypt) + abd_copy(data, zio->io_abd, size); + + if (ret != 0) + goto error; + + return; + +error: + /* assert that the key was found unless this was speculative */ + ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE)); + + /* + * If there was a decryption / authentication error return EIO as + * the io_error. If this was not a speculative zio, create an ereport. + */ + if (ret == ECKSUM) { + zio->io_error = SET_ERROR(EIO); + if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { + spa_log_error(spa, &zio->io_bookmark); + zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + spa, NULL, &zio->io_bookmark, zio, 0, 0); + } + } else { + zio->io_error = ret; + } +} + /* * ========================================================================== * I/O parent/child relationships and pipeline interlocks @@ -565,7 +699,7 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); ASSERT(vd || stage == ZIO_STAGE_OPEN); - IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW) != 0); + IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); zio = kmem_cache_alloc(zio_cache, KM_SLEEP); bzero(zio, sizeof (zio_t)); @@ -836,9 +970,12 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, * Data can be NULL if we are going to call zio_write_override() to * provide the already-allocated BP. But we may need the data to * verify a dedup hit (if requested). In this case, don't try to - * dedup (just take the already-allocated BP verbatim). + * dedup (just take the already-allocated BP verbatim). Encrypted + * dedup blocks need data as well so we also disable dedup in this + * case. */ - if (data == NULL && zio->io_prop.zp_dedup_verify) { + if (data == NULL && + (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) { zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; } @@ -1189,23 +1326,30 @@ static int zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; + uint64_t psize = + BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_child_type == ZIO_CHILD_LOGICAL && - !(zio->io_flags & ZIO_FLAG_RAW)) { - uint64_t psize = - BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); + !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), psize, psize, zio_decompress); } - if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { - zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || + BP_HAS_INDIRECT_MAC_CKSUM(bp)) && + zio->io_child_type == ZIO_CHILD_LOGICAL) { + zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), + psize, psize, zio_decrypt); + } + if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { int psize = BPE_GET_PSIZE(bp); void *data = abd_borrow_buf(zio->io_abd, psize); + + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; decode_embedded_bp_compressed(bp, data); abd_return_buf_copy(zio->io_abd, data, psize); } else { @@ -1266,7 +1410,8 @@ zio_write_bp_init(zio_t *zio) ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); - if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { + if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && + !zp->zp_encrypt) { BP_SET_DEDUP(bp, 1); zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; return (ZIO_PIPELINE_CONTINUE); @@ -1295,8 +1440,6 @@ zio_write_compress(zio_t *zio) uint64_t psize = zio->io_size; int pass = 1; - EQUIV(lsize != psize, (zio->io_flags & ZIO_FLAG_RAW) != 0); - /* * If our children haven't all reached the ready stage, * wait for them and then repeat this pipeline stage. @@ -1347,13 +1490,15 @@ zio_write_compress(zio_t *zio) } /* If it's a compressed write that is not raw, compress the buffer. */ - if (compress != ZIO_COMPRESS_OFF && psize == lsize) { + if (compress != ZIO_COMPRESS_OFF && + !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { void *cbuf = zio_buf_alloc(lsize); psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); - } else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE && + } else if (!zp->zp_dedup && !zp->zp_encrypt && + psize <= BPE_PAYLOAD_SIZE && zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { encode_embedded_bp_compressed(bp, @@ -1400,6 +1545,20 @@ zio_write_compress(zio_t *zio) zio->io_bp_override = NULL; *bp = zio->io_bp_orig; zio->io_pipeline = zio->io_orig_pipeline; + + } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 && + zp->zp_type == DMU_OT_DNODE) { + /* + * The DMU actually relies on the zio layer's compression + * to free metadnode blocks that have had all contained + * dnodes freed. As a result, even when doing a raw + * receive, we must check whether the block can be compressed + * to a hole. + */ + psize = zio_compress_data(ZIO_COMPRESS_EMPTY, + zio->io_abd, NULL, lsize); + if (psize == 0) + compress = ZIO_COMPRESS_OFF; } else { ASSERT3U(psize, !=, 0); } @@ -1417,7 +1576,6 @@ zio_write_compress(zio_t *zio) pass >= zfs_sync_pass_rewrite) { VERIFY3U(psize, !=, 0); enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; - zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { @@ -1447,6 +1605,8 @@ zio_write_compress(zio_t *zio) if (zp->zp_dedup) { ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + ASSERT(!zp->zp_encrypt || + DMU_OT_IS_ENCRYPTED(zp->zp_type)); zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; } if (zp->zp_nopwrite) { @@ -1794,7 +1954,8 @@ zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason) "failure and the failure mode property for this pool " "is set to panic.", spa_name(spa)); - zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0); + zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, + NULL, NULL, 0, 0); mutex_enter(&spa->spa_suspend_lock); @@ -2231,6 +2392,13 @@ zio_write_gang_block(zio_t *pio) int error; boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); + /* + * encrypted blocks need DVA[2] free so encrypted gang headers can't + * have a third copy. + */ + if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP) + gbh_copies = SPA_DVAS_PER_BP - 1; + int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); @@ -2309,6 +2477,11 @@ zio_write_gang_block(zio_t *pio) zp.zp_dedup = B_FALSE; zp.zp_dedup_verify = B_FALSE; zp.zp_nopwrite = B_FALSE; + zp.zp_encrypt = gio->io_prop.zp_encrypt; + zp.zp_byteorder = gio->io_prop.zp_byteorder; + bzero(zp.zp_salt, ZIO_DATA_SALT_LEN); + bzero(zp.zp_iv, ZIO_DATA_IV_LEN); + bzero(zp.zp_mac, ZIO_DATA_MAC_LEN); zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], has_data ? abd_get_offset(pio->io_abd, pio->io_size - @@ -2383,6 +2556,7 @@ zio_nop_write(zio_t *zio) if (BP_IS_HOLE(bp_orig) || !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) || + BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || @@ -2521,7 +2695,7 @@ static boolean_t zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) { spa_t *spa = zio->io_spa; - boolean_t do_raw = (zio->io_flags & ZIO_FLAG_RAW); + boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); /* We should never get a raw, override zio */ ASSERT(!(zio->io_bp_override && do_raw)); @@ -2531,11 +2705,21 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) * because when zio->io_bp is an override bp, we will not have * pushed the I/O transforms. That's an important optimization * because otherwise we'd compress/encrypt all dmu_sync() data twice. + * However, we should never get a raw, override zio so in these + * cases we can compare the io_data directly. This is useful because + * it allows us to do dedup verification even if we don't have access + * to the original data (for instance, if the encryption keys aren't + * loaded). */ + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { zio_t *lio = dde->dde_lead_zio[p]; - if (lio != NULL) { + if (lio != NULL && do_raw) { + return (lio->io_size != zio->io_size || + abd_cmp(zio->io_abd, lio->io_abd, + zio->io_size) != 0); + } else if (lio != NULL) { return (lio->io_orig_size != zio->io_orig_size || abd_cmp(zio->io_orig_abd, lio->io_orig_abd, zio->io_orig_size) != 0); @@ -2545,7 +2729,36 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { ddt_phys_t *ddp = &dde->dde_phys[p]; - if (ddp->ddp_phys_birth != 0) { + if (ddp->ddp_phys_birth != 0 && do_raw) { + blkptr_t blk = *zio->io_bp; + uint64_t psize; + abd_t *tmpabd; + int error; + + ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + psize = BP_GET_PSIZE(&blk); + + if (psize != zio->io_size) + return (B_TRUE); + + ddt_exit(ddt); + + tmpabd = abd_alloc_for_io(psize, B_TRUE); + + error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, + psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_RAW, &zio->io_bookmark)); + + if (error == 0) { + if (abd_cmp(tmpabd, zio->io_abd, psize) != 0) + error = SET_ERROR(ENOENT); + } + + abd_free(tmpabd); + ddt_enter(ddt); + return (error != 0); + } else if (ddp->ddp_phys_birth != 0) { arc_buf_t *abuf = NULL; arc_flags_t aflags = ARC_FLAG_WAIT; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; @@ -2554,6 +2767,9 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + if (BP_GET_LSIZE(&blk) != zio->io_orig_size) + return (B_TRUE); + ddt_exit(ddt); /* @@ -2578,10 +2794,9 @@ zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) zio_flags, &aflags, &zio->io_bookmark); if (error == 0) { - if (arc_buf_size(abuf) != zio->io_orig_size || - abd_cmp_buf(zio->io_orig_abd, abuf->b_data, + if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, zio->io_orig_size) != 0) - error = SET_ERROR(EEXIST); + error = SET_ERROR(ENOENT); arc_buf_destroy(abuf, &abuf); } @@ -3048,7 +3263,7 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int -zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp, +zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t size, boolean_t *slog) { int error = 1; @@ -3074,14 +3289,15 @@ zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp, */ error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, &io_alloc_list, NULL, - cityhash4(0, 0, 0, objset) % spa->spa_alloc_count); + cityhash4(0, 0, 0, + os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); if (error == 0) { *slog = TRUE; } else { error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID, - &io_alloc_list, NULL, cityhash4(0, 0, 0, objset) % - spa->spa_alloc_count); + &io_alloc_list, NULL, cityhash4(0, 0, 0, + os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); if (error == 0) *slog = FALSE; } @@ -3098,6 +3314,23 @@ zio_alloc_zil(spa_t *spa, uint64_t objset, uint64_t txg, blkptr_t *new_bp, BP_SET_LEVEL(new_bp, 0); BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); + + /* + * encrypted blocks will require an IV and salt. We generate + * these now since we will not be rewriting the bp at + * rewrite time. + */ + if (os->os_encrypted) { + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t salt[ZIO_DATA_SALT_LEN]; + + BP_SET_CRYPT(new_bp, B_TRUE); + VERIFY0(spa_crypt_get_salt(spa, + dmu_objset_id(os), salt)); + VERIFY0(zio_crypt_generate_iv(iv)); + + zio_crypt_encode_params_bp(new_bp, salt, iv); + } } else { zfs_dbgmsg("%s: zil block allocation failure: " "size %llu, error %d", spa_name(spa), size, error); @@ -3332,7 +3565,7 @@ zio_change_priority(zio_t *pio, zio_priority_t priority) */ static void zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, - const void *good_buf) + const abd_t *good_buf) { /* no processing needed */ zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); @@ -3342,14 +3575,14 @@ zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, void zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) { - void *buf = zio_buf_alloc(zio->io_size); + void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); - abd_copy_to_buf(buf, zio->io_abd, zio->io_size); + abd_copy(abd, zio->io_abd, zio->io_size); zcr->zcr_cbinfo = zio->io_size; - zcr->zcr_cbdata = buf; + zcr->zcr_cbdata = abd; zcr->zcr_finish = zio_vsd_default_cksum_finish; - zcr->zcr_free = zio_buf_free; + zcr->zcr_free = zio_abd_free; } static int @@ -3460,6 +3693,164 @@ zio_vdev_io_bypass(zio_t *zio) /* * ========================================================================== + * Encrypt and store encryption parameters + * ========================================================================== + */ + + +/* + * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for + * managing the storage of encryption parameters and passing them to the + * lower-level encryption functions. + */ +static int +zio_encrypt(zio_t *zio) +{ + zio_prop_t *zp = &zio->io_prop; + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + uint64_t psize = BP_GET_PSIZE(bp); + uint64_t dsobj = zio->io_bookmark.zb_objset; + dmu_object_type_t ot = BP_GET_TYPE(bp); + void *enc_buf = NULL; + abd_t *eabd = NULL; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + boolean_t no_crypt = B_FALSE; + + /* the root zio already encrypted the data */ + if (zio->io_child_type == ZIO_CHILD_GANG) + return (ZIO_PIPELINE_CONTINUE); + + /* only ZIL blocks are re-encrypted on rewrite */ + if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) + return (ZIO_PIPELINE_CONTINUE); + + if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { + BP_SET_CRYPT(bp, B_FALSE); + return (ZIO_PIPELINE_CONTINUE); + } + + /* if we are doing raw encryption set the provided encryption params */ + if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { + ASSERT0(BP_GET_LEVEL(bp)); + BP_SET_CRYPT(bp, B_TRUE); + BP_SET_BYTEORDER(bp, zp->zp_byteorder); + if (ot != DMU_OT_OBJSET) + zio_crypt_encode_mac_bp(bp, zp->zp_mac); + + /* dnode blocks must be written out in the provided byteorder */ + if (zp->zp_byteorder != ZFS_HOST_BYTEORDER && + ot == DMU_OT_DNODE) { + void *bswap_buf = zio_buf_alloc(psize); + abd_t *babd = abd_get_from_buf(bswap_buf, psize); + + ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); + abd_copy_to_buf(bswap_buf, zio->io_abd, psize); + dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf, + psize); + + abd_take_ownership_of_buf(babd, B_TRUE); + zio_push_transform(zio, babd, psize, psize, NULL); + } + + if (DMU_OT_IS_ENCRYPTED(ot)) + zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); + return (ZIO_PIPELINE_CONTINUE); + } + + /* indirect blocks only maintain a cksum of the lower level MACs */ + if (BP_GET_LEVEL(bp) > 0) { + BP_SET_CRYPT(bp, B_TRUE); + VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, + zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), + mac)); + zio_crypt_encode_mac_bp(bp, mac); + return (ZIO_PIPELINE_CONTINUE); + } + + /* + * Objset blocks are a special case since they have 2 256-bit MACs + * embedded within them. + */ + if (ot == DMU_OT_OBJSET) { + ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); + ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); + BP_SET_CRYPT(bp, B_TRUE); + VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj, + zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp))); + return (ZIO_PIPELINE_CONTINUE); + } + + /* unencrypted object types are only authenticated with a MAC */ + if (!DMU_OT_IS_ENCRYPTED(ot)) { + BP_SET_CRYPT(bp, B_TRUE); + VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj, + zio->io_abd, psize, mac)); + zio_crypt_encode_mac_bp(bp, mac); + return (ZIO_PIPELINE_CONTINUE); + } + + /* + * Later passes of sync-to-convergence may decide to rewrite data + * in place to avoid more disk reallocations. This presents a problem + * for encryption because this consitutes rewriting the new data with + * the same encryption key and IV. However, this only applies to blocks + * in the MOS (particularly the spacemaps) and we do not encrypt the + * MOS. We assert that the zio is allocating or an intent log write + * to enforce this. + */ + ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); + ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); + ASSERT3U(psize, !=, 0); + + enc_buf = zio_buf_alloc(psize); + eabd = abd_get_from_buf(enc_buf, psize); + abd_take_ownership_of_buf(eabd, B_TRUE); + + /* + * For an explanation of what encryption parameters are stored + * where, see the block comment in zio_crypt.c. + */ + if (ot == DMU_OT_INTENT_LOG) { + zio_crypt_decode_params_bp(bp, salt, iv); + } else { + BP_SET_CRYPT(bp, B_TRUE); + } + + /* Perform the encryption. This should not fail */ + VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, + BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), + salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt)); + + /* encode encryption metadata into the bp */ + if (ot == DMU_OT_INTENT_LOG) { + /* + * ZIL blocks store the MAC in the embedded checksum, so the + * transform must always be applied. + */ + zio_crypt_encode_mac_zil(enc_buf, mac); + zio_push_transform(zio, eabd, psize, psize, NULL); + } else { + BP_SET_CRYPT(bp, B_TRUE); + zio_crypt_encode_params_bp(bp, salt, iv); + zio_crypt_encode_mac_bp(bp, mac); + + if (no_crypt) { + ASSERT3U(ot, ==, DMU_OT_DNODE); + abd_free(eabd); + } else { + zio_push_transform(zio, eabd, psize, psize, NULL); + } + } + + return (ZIO_PIPELINE_CONTINUE); +} + +/* + * ========================================================================== * Generate and verify checksums * ========================================================================== */ @@ -3519,8 +3910,8 @@ zio_checksum_verify(zio_t *zio) if (error == ECKSUM && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { zfs_ereport_start_checksum(zio->io_spa, - zio->io_vd, zio, zio->io_offset, - zio->io_size, NULL, &info); + zio->io_vd, &zio->io_bookmark, zio, + zio->io_offset, zio->io_size, NULL, &info); } } @@ -3765,7 +4156,6 @@ zio_done(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { - ASSERT(!BP_SHOULD_BYTESWAP(bp)); ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); ASSERT(BP_COUNT_GANG(bp) == 0 || (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); @@ -3790,26 +4180,19 @@ zio_done(zio_t *zio) zio_cksum_report_t *zcr = zio->io_cksum_report; uint64_t align = zcr->zcr_align; uint64_t asize = P2ROUNDUP(psize, align); - char *abuf = NULL; abd_t *adata = zio->io_abd; if (asize != psize) { - adata = abd_alloc_linear(asize, B_TRUE); + adata = abd_alloc(asize, B_TRUE); abd_copy(adata, zio->io_abd, psize); abd_zero_off(adata, psize, asize - psize); } - if (adata != NULL) - abuf = abd_borrow_buf_copy(adata, asize); - zio->io_cksum_report = zcr->zcr_next; zcr->zcr_next = NULL; - zcr->zcr_finish(zcr, abuf); + zcr->zcr_finish(zcr, adata); zfs_ereport_free_checksum(zcr); - if (adata != NULL) - abd_return_buf(adata, abuf, asize); - if (asize != psize) abd_free(adata); } @@ -3827,7 +4210,8 @@ zio_done(zio_t *zio) * device is currently unavailable. */ if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) - zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); + zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, + &zio->io_bookmark, zio, 0, 0); if ((zio->io_error == EIO || !(zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && @@ -3836,9 +4220,9 @@ zio_done(zio_t *zio) * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. */ - spa_log_error(spa, zio); - zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio, - 0, 0); + spa_log_error(spa, &zio->io_bookmark); + zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, + &zio->io_bookmark, zio, 0, 0); } } @@ -4029,6 +4413,7 @@ static zio_pipe_stage_t *zio_pipeline[] = { zio_free_bp_init, zio_issue_async, zio_write_compress, + zio_encrypt, zio_checksum_generate, zio_nop_write, zio_ddt_read_start, diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c index e1c98b0b99..d5aa9303b8 100644 --- a/usr/src/uts/common/fs/zfs/zio_checksum.c +++ b/usr/src/uts/common/fs/zfs/zio_checksum.c @@ -242,9 +242,9 @@ zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, * a tuple which is guaranteed to be unique for the life of the pool. */ static void -zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) +zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) { - dva_t *dva = BP_IDENTITY(bp); + const dva_t *dva = BP_IDENTITY(bp); uint64_t txg = BP_PHYSICAL_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); @@ -287,6 +287,25 @@ zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) mutex_exit(&spa->spa_cksum_tmpls_lock); } +/* convenience function to update a checksum to accomodate an encryption MAC */ +static void +zio_checksum_handle_crypt(zio_cksum_t *cksum, zio_cksum_t *saved, boolean_t xor) +{ + /* + * Weak checksums do not have their entropy spread evenly + * across the bits of the checksum. Therefore, when truncating + * a weak checksum we XOR the first 2 words with the last 2 so + * that we don't "lose" any entropy unnecessarily. + */ + if (xor) { + cksum->zc_word[0] ^= cksum->zc_word[2]; + cksum->zc_word[1] ^= cksum->zc_word[3]; + } + + cksum->zc_word[2] = saved->zc_word[2]; + cksum->zc_word[3] = saved->zc_word[3]; +} + /* * Generate the checksum. */ @@ -294,11 +313,13 @@ void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, abd_t *abd, uint64_t size) { + static const uint64_t zec_magic = ZEC_MAGIC; blkptr_t *bp = zio->io_bp; uint64_t offset = zio->io_offset; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t cksum; + zio_cksum_t cksum, saved; spa_t *spa = zio->io_spa; + boolean_t insecure = (ci->ci_flags & ZCHECKSUM_FLAG_DEDUP) == 0; ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); ASSERT(ci->ci_func[0] != NULL); @@ -306,40 +327,68 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, zio_checksum_template_init(checksum, spa); if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { - zio_eck_t *eck; - void *data = abd_to_buf(abd); + zio_eck_t eck; + size_t eck_offset; + + bzero(&saved, sizeof (zio_cksum_t)); if (checksum == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = data; + zil_chain_t zilc; + abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); - size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ, + size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ, uint64_t); - eck = &zilc->zc_eck; + eck = zilc.zc_eck; + eck_offset = offsetof(zil_chain_t, zc_eck); } else { - eck = (zio_eck_t *)((char *)data + size) - 1; + eck_offset = size - sizeof (zio_eck_t); + abd_copy_to_buf_off(&eck, abd, eck_offset, + sizeof (zio_eck_t)); } - if (checksum == ZIO_CHECKSUM_GANG_HEADER) - zio_checksum_gang_verifier(&eck->zec_cksum, bp); - else if (checksum == ZIO_CHECKSUM_LABEL) - zio_checksum_label_verifier(&eck->zec_cksum, offset); - else - bp->blk_cksum = eck->zec_cksum; - eck->zec_magic = ZEC_MAGIC; + + if (checksum == ZIO_CHECKSUM_GANG_HEADER) { + zio_checksum_gang_verifier(&eck.zec_cksum, bp); + } else if (checksum == ZIO_CHECKSUM_LABEL) { + zio_checksum_label_verifier(&eck.zec_cksum, offset); + } else { + saved = eck.zec_cksum; + eck.zec_cksum = bp->blk_cksum; + } + + abd_copy_from_buf_off(abd, &zec_magic, + eck_offset + offsetof(zio_eck_t, zec_magic), + sizeof (zec_magic)); + abd_copy_from_buf_off(abd, &eck.zec_cksum, + eck_offset + offsetof(zio_eck_t, zec_cksum), + sizeof (zio_cksum_t)); + ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], &cksum); - eck->zec_cksum = cksum; + if (bp != NULL && BP_USES_CRYPT(bp) && + BP_GET_TYPE(bp) != DMU_OT_OBJSET) + zio_checksum_handle_crypt(&cksum, &saved, insecure); + + abd_copy_from_buf_off(abd, &cksum, + eck_offset + offsetof(zio_eck_t, zec_cksum), + sizeof (zio_cksum_t)); } else { + saved = bp->blk_cksum; ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], - &bp->blk_cksum); + &cksum); + if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET) + zio_checksum_handle_crypt(&cksum, &saved, insecure); + bp->blk_cksum = cksum; } } int -zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, - abd_t *abd, uint64_t size, uint64_t offset, zio_bad_cksum_t *info) +zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, + enum zio_checksum checksum, abd_t *abd, uint64_t size, + uint64_t offset, zio_bad_cksum_t *info) { zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum; + zio_eck_t eck; int byteswap; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) @@ -348,33 +397,37 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, zio_checksum_template_init(checksum, spa); if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { - zio_eck_t *eck; zio_cksum_t verifier; - uint64_t data_size = size; - void *data = abd_borrow_buf_copy(abd, data_size); + size_t eck_offset; if (checksum == ZIO_CHECKSUM_ZILOG2) { - zil_chain_t *zilc = data; + zil_chain_t zilc; uint64_t nused; - eck = &zilc->zc_eck; - if (eck->zec_magic == ZEC_MAGIC) { - nused = zilc->zc_nused; - } else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) { - nused = BSWAP_64(zilc->zc_nused); + abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); + + eck = zilc.zc_eck; + eck_offset = offsetof(zil_chain_t, zc_eck) + + offsetof(zio_eck_t, zec_cksum); + + if (eck.zec_magic == ZEC_MAGIC) { + nused = zilc.zc_nused; + } else if (eck.zec_magic == BSWAP_64(ZEC_MAGIC)) { + nused = BSWAP_64(zilc.zc_nused); } else { - abd_return_buf(abd, data, data_size); return (SET_ERROR(ECKSUM)); } - if (nused > data_size) { - abd_return_buf(abd, data, data_size); + if (nused > size) { return (SET_ERROR(ECKSUM)); } size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); } else { - eck = (zio_eck_t *)((char *)data + data_size) - 1; + eck_offset = size - sizeof (zio_eck_t); + abd_copy_to_buf_off(&eck, abd, eck_offset, + sizeof (zio_eck_t)); + eck_offset += offsetof(zio_eck_t, zec_cksum); } if (checksum == ZIO_CHECKSUM_GANG_HEADER) @@ -384,20 +437,21 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, else verifier = bp->blk_cksum; - byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); + byteswap = (eck.zec_magic == BSWAP_64(ZEC_MAGIC)); if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); - size_t eck_offset = (size_t)(&eck->zec_cksum) - (size_t)data; - expected_cksum = eck->zec_cksum; - eck->zec_cksum = verifier; - abd_return_buf_copy(abd, data, data_size); + expected_cksum = eck.zec_cksum; + + abd_copy_from_buf_off(abd, &verifier, eck_offset, + sizeof (zio_cksum_t)); ci->ci_func[byteswap](abd, size, spa->spa_cksum_tmpls[checksum], &actual_cksum); - abd_copy_from_buf_off(abd, &expected_cksum, - eck_offset, sizeof (zio_cksum_t)); + + abd_copy_from_buf_off(abd, &expected_cksum, eck_offset, + sizeof (zio_cksum_t)); if (byteswap) { byteswap_uint64_array(&expected_cksum, @@ -410,6 +464,26 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, spa->spa_cksum_tmpls[checksum], &actual_cksum); } + /* + * MAC checksums are a special case since half of this checksum will + * actually be the encryption MAC. This will be verified by the + * decryption process, so we just check the truncated checksum now. + * Objset blocks use embedded MACs so we don't truncate the checksum + * for them. + */ + if (bp != NULL && BP_USES_CRYPT(bp) && + BP_GET_TYPE(bp) != DMU_OT_OBJSET) { + if (!(ci->ci_flags & ZCHECKSUM_FLAG_DEDUP)) { + actual_cksum.zc_word[0] ^= actual_cksum.zc_word[2]; + actual_cksum.zc_word[1] ^= actual_cksum.zc_word[3]; + } + + actual_cksum.zc_word[2] = 0; + actual_cksum.zc_word[3] = 0; + expected_cksum.zc_word[2] = 0; + expected_cksum.zc_word[3] = 0; + } + if (info != NULL) { info->zbc_expected = expected_cksum; info->zbc_actual = actual_cksum; @@ -418,7 +492,6 @@ zio_checksum_error_impl(spa_t *spa, blkptr_t *bp, enum zio_checksum checksum, info->zbc_injected = 0; info->zbc_has_cksum = 1; } - if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (SET_ERROR(ECKSUM)); diff --git a/usr/src/uts/common/fs/zfs/zio_compress.c b/usr/src/uts/common/fs/zfs/zio_compress.c index 9882806a7d..4ae2581e3b 100644 --- a/usr/src/uts/common/fs/zfs/zio_compress.c +++ b/usr/src/uts/common/fs/zfs/zio_compress.c @@ -144,20 +144,31 @@ zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); } +void *zio_decompress_failed_buf; + int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len, size_t d_len) { void *tmp = abd_borrow_buf_copy(src, s_len); int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len); - abd_return_buf(src, tmp, s_len); /* - * Decompression shouldn't fail, because we've already verifyied + * Decompression shouldn't fail, because we've already verified * the checksum. However, for extra protection (e.g. against bitflips * in non-ECC RAM), we handle this error (and test it). */ - ASSERT0(ret); + if (ret != 0) { + zio_decompress_failed_buf = kmem_alloc(s_len, KM_SLEEP); + bcopy(tmp, zio_decompress_failed_buf, s_len); + panic("decompression failed " + "err=%u c=%u buf=%p s_len=%u d_len=%u", + ret, (int)c, zio_decompress_failed_buf, + (int)s_len, (int)d_len); + } + + abd_return_buf(src, tmp, s_len); + if (zio_decompress_fail_fraction != 0 && spa_get_random(zio_decompress_fail_fraction) == 0) ret = SET_ERROR(EINVAL); diff --git a/usr/src/uts/common/fs/zfs/zio_crypt.c b/usr/src/uts/common/fs/zfs/zio_crypt.c new file mode 100644 index 0000000000..1d6b8286e3 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zio_crypt.c @@ -0,0 +1,2009 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#include <sys/zio_crypt.h> +#include <sys/dmu.h> +#include <sys/dmu_objset.h> +#include <sys/dnode.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/zil.h> +#include <sys/sha2.h> +#include <sys/hkdf.h> + +/* + * This file is responsible for handling all of the details of generating + * encryption parameters and performing encryption and authentication. + * + * BLOCK ENCRYPTION PARAMETERS: + * Encryption /Authentication Algorithm Suite (crypt): + * The encryption algorithm, mode, and key length we are going to use. We + * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit + * keys. All authentication is currently done with SHA512-HMAC. + * + * Plaintext: + * The unencrypted data that we want to encrypt. + * + * Initialization Vector (IV): + * An initialization vector for the encryption algorithms. This is used to + * "tweak" the encryption algorithms so that two blocks of the same data are + * encrypted into different ciphertext outputs, thus obfuscating block patterns. + * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is + * never reused with the same encryption key. This value is stored unencrypted + * and must simply be provided to the decryption function. We use a 96 bit IV + * (as recommended by NIST) for all block encryption. For non-dedup blocks we + * derive the IV randomly. The first 64 bits of the IV are stored in the second + * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of + * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits + * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count + * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of + * level 0 blocks is the number of allocated dnodes in that block. The on-disk + * format supports at most 2^15 slots per L0 dnode block, because the maximum + * block size is 16MB (2^24). In either case, for level 0 blocks this number + * will still be smaller than UINT32_MAX so it is safe to store the IV in the + * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count + * for the dnode code. + * + * Master key: + * This is the most important secret data of an encrypted dataset. It is used + * along with the salt to generate that actual encryption keys via HKDF. We + * do not use the master key to directly encrypt any data because there are + * theoretical limits on how much data can actually be safely encrypted with + * any encryption mode. The master key is stored encrypted on disk with the + * user's wrapping key. Its length is determined by the encryption algorithm. + * For details on how this is stored see the block comment in dsl_crypt.c + * + * Salt: + * Used as an input to the HKDF function, along with the master key. We use a + * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt + * can be used for encrypting many blocks, so we cache the current salt and the + * associated derived key in zio_crypt_t so we do not need to derive it again + * needlessly. + * + * Encryption Key: + * A secret binary key, generated from an HKDF function used to encrypt and + * decrypt data. + * + * Message Authenication Code (MAC) + * The MAC is an output of authenticated encryption modes such as AES-GCM and + * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted + * data on disk and return garbage to the application. Effectively, it is a + * checksum that can not be reproduced by an attacker. We store the MAC in the + * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated + * regular checksum of the ciphertext which can be used for scrubbing. + * + * OBJECT AUTHENTICATION: + * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because + * they contain some info that always needs to be readable. To prevent this + * data from being altered, we authenticate this data using SHA512-HMAC. This + * will produce a MAC (similar to the one produced via encryption) which can + * be used to verify the object was not modified. HMACs do not require key + * rotation or IVs, so we can keep up to the full 3 copies of authenticated + * data. + * + * ZIL ENCRYPTION: + * ZIL blocks have their bp written to disk ahead of the associated data, so we + * cannot store the MAC there as we normally do. For these blocks the MAC is + * stored in the embedded checksum within the zil_chain_t header. The salt and + * IV are generated for the block on bp allocation instead of at encryption + * time. In addition, ZIL blocks have some pieces that must be left in plaintext + * for claiming even though all of the sensitive user data still needs to be + * encrypted. The function zio_crypt_init_uios_zil() handles parsing which + * pieces of the block need to be encrypted. All data that is not encrypted is + * authenticated using the AAD mechanisms that the supported encryption modes + * provide for. In order to preserve the semantics of the ZIL for encrypted + * datasets, the ZIL is not protected at the objset level as described below. + * + * DNODE ENCRYPTION: + * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left + * in plaintext for scrubbing and claiming, but the bonus buffers might contain + * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing + * which pieces of the block need to be encrypted. For more details about + * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). + * + * OBJECT SET AUTHENTICATION: + * Up to this point, everything we have encrypted and authenticated has been + * at level 0 (or -2 for the ZIL). If we did not do any further work the + * on-disk format would be susceptible to attacks that deleted or rearrannged + * the order of level 0 blocks. Ideally, the cleanest solution would be to + * maintain a tree of authentication MACs going up the bp tree. However, this + * presents a problem for raw sends. Send files do not send information about + * indirect blocks so there would be no convenient way to transfer the MACs and + * they cannot be recalculated on the receive side without the master key which + * would defeat one of the purposes of raw sends in the first place. Instead, + * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs + * from the level below. We also include some portable fields from blk_prop such + * as the lsize and compression algorithm to prevent the data from being + * misinterpretted. + * + * At the objset level, we maintain 2 seperate 256 bit MACs in the + * objset_phys_t. The first one is "portable" and is the logical root of the + * MAC tree maintianed in the metadnode's bps. The second, is "local" and is + * used as the root MAC for the user accounting objects, which are also not + * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload + * of the send file. The useraccounting code ensures that the useraccounting + * info is not present upon a receive, so the local MAC can simply be cleared + * out at that time. For more info about objset_phys_t authentication, see + * zio_crypt_do_objset_hmacs(). + * + * CONSIDERATIONS FOR DEDUP: + * In order for dedup to work, blocks that we want to dedup with one another + * need to use the same IV and encryption key, so that they will have the same + * ciphertext. Normally, one should never reuse an IV with the same encryption + * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both + * blocks. In this case, however, since we are using the same plaindata as + * well all that we end up with is a duplicate of the original ciphertext we + * already had. As a result, an attacker with read access to the raw disk will + * be able to tell which blocks are the same but this information is given away + * by dedup anyway. In order to get the same IVs and encryption keys for + * equivalent blocks of data we use an HMAC of the plaindata. We use an HMAC + * here so that a reproducible checksum of the plaindata is never available to + * the attacker. The HMAC key is kept alongside the master key, encrypted on + * disk. The first 64 bits of the HMAC are used in place of the random salt, and + * the next 96 bits are used as the IV. As a result of this mechanism, dedup + * will only work within a clone family since encrypted dedup requires use of + * the same master and HMAC keys. + */ + +/* + * After encrypting many blocks with the same key we may start to run up + * against the theoretical limits of how much data can securely be encrypted + * with a single key using the supported encryption modes. The most obvious + * limitation is that our risk of generating 2 equivalent 96 bit IVs increases + * the more IVs we generate (which both GCM and CCM modes strictly forbid). + * This risk actually grows surprisingly quickly over time according to the + * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have + * generated n IVs with a cryptographically secure RNG, the approximate + * probability p(n) of a collision is given as: + * + * p(n) ~= e^(-n*(n-1)/(2*(2^96))) + * + * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] + * + * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion + * we must not write more than 398,065,730 blocks with the same encryption key. + * Therefore, we rotate our keys after 400,000,000 blocks have been written by + * generating a new random 64 bit salt for our HKDF encryption key generation + * function. + */ +#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 +#define ZFS_CURRENT_MAX_SALT_USES \ + (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) +unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; + +/* + * Set to a nonzero value to cause zio_do_crypt_uio() to fail 1/this many + * calls, to test decryption error handling code paths. + */ +uint64_t zio_decrypt_fail_fraction = 0; + +typedef struct blkptr_auth_buf { + uint64_t bab_prop; /* blk_prop - portable mask */ + uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ + uint64_t bab_pad; /* reserved for future use */ +} blkptr_auth_buf_t; + +zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { + {"", ZC_TYPE_NONE, 0, "inherit"}, + {"", ZC_TYPE_NONE, 0, "on"}, + {"", ZC_TYPE_NONE, 0, "off"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} +}; + +void +zio_crypt_key_destroy(zio_crypt_key_t *key) +{ + rw_destroy(&key->zk_salt_lock); + + /* free crypto templates */ + crypto_destroy_ctx_template(key->zk_current_tmpl); + crypto_destroy_ctx_template(key->zk_hmac_tmpl); + + /* zero out sensitive data */ + bzero(key, sizeof (zio_crypt_key_t)); +} + +int +zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) +{ + int ret; + crypto_mechanism_t mech; + uint_t keydata_len; + + ASSERT(key != NULL); + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + bzero(key, sizeof (zio_crypt_key_t)); + + /* fill keydata buffers and salt with random data */ + ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_master_keydata, keydata_len); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for the ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = &key->zk_hmac_key; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + /* + * Initialize the crypto templates. It's ok if this fails because + * this is just an optimization. + */ + mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); + ret = crypto_create_ctx_template(&mech, &key->zk_current_key, + &key->zk_current_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_current_tmpl = NULL; + + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, + &key->zk_hmac_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_hmac_tmpl = NULL; + + key->zk_crypt = crypt; + key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; + key->zk_salt_count = 0; + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + return (0); + +error: + zio_crypt_key_destroy(key); + return (ret); +} + +static int +zio_crypt_key_change_salt(zio_crypt_key_t *key) +{ + int ret = 0; + uint8_t salt[ZIO_DATA_SALT_LEN]; + crypto_mechanism_t mech; + uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; + + /* generate a new salt */ + ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + rw_enter(&key->zk_salt_lock, RW_WRITER); + + /* someone beat us to the salt rotation, just unlock and return */ + if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) + goto out_unlock; + + /* derive the current key from the master key and the new salt */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); + if (ret != 0) + goto out_unlock; + + /* assign the salt and reset the usage count */ + bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); + key->zk_salt_count = 0; + + /* destroy the old context template and create the new one */ + crypto_destroy_ctx_template(key->zk_current_tmpl); + ret = crypto_create_ctx_template(&mech, &key->zk_current_key, + &key->zk_current_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_current_tmpl = NULL; + + rw_exit(&key->zk_salt_lock); + + return (0); + +out_unlock: + rw_exit(&key->zk_salt_lock); +error: + return (ret); +} + +/* See comment above zfs_key_max_salt_uses definition for details */ +int +zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) +{ + int ret; + boolean_t salt_change; + + rw_enter(&key->zk_salt_lock, RW_READER); + + bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); + salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= + ZFS_CURRENT_MAX_SALT_USES); + + rw_exit(&key->zk_salt_lock); + + if (salt_change) { + ret = zio_crypt_key_change_salt(key); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +void *failed_decrypt_buf; +int failed_decrypt_size; + +/* + * This function handles all encryption and decryption in zfs. When + * encrypting it expects puio to reference the plaintext and cuio to + * reference the cphertext. cuio must have enough space for the + * ciphertext + room for a MAC. datalen should be the length of the + * plaintext / ciphertext alone. + */ +/* ARGSUSED */ +static int +zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key, + crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen, + uio_t *puio, uio_t *cuio, uint8_t *authbuf, uint_t auth_len) +{ + int ret; + crypto_data_t plaindata, cipherdata; + CK_AES_CCM_PARAMS ccmp; + CK_AES_GCM_PARAMS gcmp; + crypto_mechanism_t mech; + zio_crypt_info_t crypt_info; + uint_t plain_full_len, maclen; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(key->ck_format, ==, CRYPTO_KEY_RAW); + + /* lookup the encryption info */ + crypt_info = zio_crypt_table[crypt]; + + /* the mac will always be the last iovec_t in the cipher uio */ + maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len; + + ASSERT(maclen <= ZIO_DATA_MAC_LEN); + + /* setup encryption mechanism (same as crypt) */ + mech.cm_type = crypto_mech2id(crypt_info.ci_mechname); + + /* + * Strangely, the ICP requires that plain_full_len must include + * the MAC length when decrypting, even though the UIO does not + * need to have the extra space allocated. + */ + if (encrypt) { + plain_full_len = datalen; + } else { + plain_full_len = datalen + maclen; + } + + /* + * setup encryption params (currently only AES CCM and AES GCM + * are supported) + */ + if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) { + ccmp.ulNonceSize = ZIO_DATA_IV_LEN; + ccmp.ulAuthDataSize = auth_len; + ccmp.authData = authbuf; + ccmp.ulMACSize = maclen; + ccmp.nonce = ivbuf; + ccmp.ulDataSize = plain_full_len; + + mech.cm_param = (char *)(&ccmp); + mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS); + } else { + gcmp.ulIvLen = ZIO_DATA_IV_LEN; + gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN); + gcmp.ulAADLen = auth_len; + gcmp.pAAD = authbuf; + gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen); + gcmp.pIv = ivbuf; + + mech.cm_param = (char *)(&gcmp); + mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); + } + + /* populate the cipher and plain data structs. */ + plaindata.cd_format = CRYPTO_DATA_UIO; + plaindata.cd_offset = 0; + plaindata.cd_uio = puio; + plaindata.cd_miscdata = NULL; + plaindata.cd_length = plain_full_len; + + cipherdata.cd_format = CRYPTO_DATA_UIO; + cipherdata.cd_offset = 0; + cipherdata.cd_uio = cuio; + cipherdata.cd_miscdata = NULL; + cipherdata.cd_length = datalen + maclen; + + /* perform the actual encryption */ + if (encrypt) { + ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata, + NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + } else { + if (zio_decrypt_fail_fraction != 0 && + spa_get_random(zio_decrypt_fail_fraction) == 0) { + ret = CRYPTO_INVALID_MAC; + } else { + ret = crypto_decrypt(&mech, &cipherdata, + key, tmpl, &plaindata, NULL); + } + if (ret != CRYPTO_SUCCESS) { + ASSERT3U(ret, ==, CRYPTO_INVALID_MAC); + ret = SET_ERROR(ECKSUM); + goto error; + } + } + + return (0); + +error: + return (ret); +} + +int +zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, + uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) +{ + int ret; + uio_t puio, cuio; + uint64_t aad[3]; + iovec_t plain_iovecs[2], cipher_iovecs[3]; + uint64_t crypt = key->zk_crypt; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + + /* generate iv for wrapping the master and hmac key */ + ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); + if (ret != 0) + goto error; + + /* initialize uio_ts */ + plain_iovecs[0].iov_base = (char *)key->zk_master_keydata; + plain_iovecs[0].iov_len = keydata_len; + plain_iovecs[1].iov_base = (char *)key->zk_hmac_keydata; + plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; + + cipher_iovecs[0].iov_base = (char *)keydata_out; + cipher_iovecs[0].iov_len = keydata_len; + cipher_iovecs[1].iov_base = (char *)hmac_keydata_out; + cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; + cipher_iovecs[2].iov_base = (char *)mac; + cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; + + /* + * Although we don't support writing to the old format, we do + * support rewrapping the key so that the user can move and + * quarantine datasets on the old format. + */ + if (key->zk_version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(key->zk_guid); + } else { + ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(key->zk_guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(key->zk_version); + } + + enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; + puio.uio_iov = plain_iovecs; + puio.uio_iovcnt = 2; + puio.uio_segflg = UIO_SYSSPACE; + cuio.uio_iov = cipher_iovecs; + cuio.uio_iovcnt = 3; + cuio.uio_segflg = UIO_SYSSPACE; + + /* encrypt the keys and store the resulting ciphertext and mac */ + ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len, + &puio, &cuio, (uint8_t *)aad, aad_len); + if (ret != 0) + goto error; + + return (0); + +error: + return (ret); +} + +int +zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, + uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, + uint8_t *mac, zio_crypt_key_t *key) +{ + int ret; + crypto_mechanism_t mech; + uio_t puio, cuio; + uint64_t aad[3]; + iovec_t plain_iovecs[2], cipher_iovecs[3]; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + keydata_len = zio_crypt_table[crypt].ci_keylen; + + /* initialize uio_ts */ + plain_iovecs[0].iov_base = (char *)key->zk_master_keydata; + plain_iovecs[0].iov_len = keydata_len; + plain_iovecs[1].iov_base = (char *)key->zk_hmac_keydata; + plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; + + cipher_iovecs[0].iov_base = (char *)keydata; + cipher_iovecs[0].iov_len = keydata_len; + cipher_iovecs[1].iov_base = (char *)hmac_keydata; + cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; + cipher_iovecs[2].iov_base = (char *)mac; + cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; + + if (version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(guid); + } else { + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(version); + } + + enc_len = keydata_len + SHA512_HMAC_KEYLEN; + puio.uio_iov = plain_iovecs; + puio.uio_segflg = UIO_SYSSPACE; + puio.uio_iovcnt = 2; + cuio.uio_iov = cipher_iovecs; + cuio.uio_iovcnt = 3; + cuio.uio_segflg = UIO_SYSSPACE; + + /* decrypt the keys and store the result in the output buffers */ + ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len, + &puio, &cuio, (uint8_t *)aad, aad_len); + if (ret != 0) + goto error; + + /* generate a fresh salt */ + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = key->zk_hmac_keydata; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + /* + * Initialize the crypto templates. It's ok if this fails because + * this is just an optimization. + */ + mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); + ret = crypto_create_ctx_template(&mech, &key->zk_current_key, + &key->zk_current_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_current_tmpl = NULL; + + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, + &key->zk_hmac_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_hmac_tmpl = NULL; + + key->zk_crypt = crypt; + key->zk_version = version; + key->zk_guid = guid; + key->zk_salt_count = 0; + + return (0); + +error: + zio_crypt_key_destroy(key); + return (ret); +} + +int +zio_crypt_generate_iv(uint8_t *ivbuf) +{ + int ret; + + /* randomly generate the IV */ + ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); + if (ret != 0) + goto error; + + return (0); + +error: + bzero(ivbuf, ZIO_DATA_IV_LEN); + return (ret); +} + +int +zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, + uint8_t *digestbuf, uint_t digestlen) +{ + int ret; + crypto_mechanism_t mech; + crypto_data_t in_data, digest_data; + uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; + + ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); + + /* initialize sha512-hmac mechanism and crypto data */ + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + mech.cm_param = NULL; + mech.cm_param_len = 0; + + /* initialize the crypto data */ + in_data.cd_format = CRYPTO_DATA_RAW; + in_data.cd_offset = 0; + in_data.cd_length = datalen; + in_data.cd_raw.iov_base = (char *)data; + in_data.cd_raw.iov_len = in_data.cd_length; + + digest_data.cd_format = CRYPTO_DATA_RAW; + digest_data.cd_offset = 0; + digest_data.cd_length = SHA512_DIGEST_LENGTH; + digest_data.cd_raw.iov_base = (char *)raw_digestbuf; + digest_data.cd_raw.iov_len = digest_data.cd_length; + + /* generate the hmac */ + ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl, + &digest_data, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_digestbuf, digestbuf, digestlen); + + return (0); + +error: + bzero(digestbuf, digestlen); + return (ret); +} + +int +zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, + uint_t datalen, uint8_t *ivbuf, uint8_t *salt) +{ + int ret; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + ret = zio_crypt_do_hmac(key, data, datalen, + digestbuf, SHA512_DIGEST_LENGTH); + if (ret != 0) + return (ret); + + bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN); + bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN); + + return (0); +} + +/* + * The following functions are used to encode and decode encryption parameters + * into blkptr_t and zil_header_t. The ICP wants to use these parameters as + * byte strings, which normally means that these strings would not need to deal + * with byteswapping at all. However, both blkptr_t and zil_header_t may be + * byteswapped by lower layers and so we must "undo" that byteswap here upon + * decoding and encoding in a non-native byteorder. These functions require + * that the byteorder bit is correct before being called. + */ +void +zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_ENCRYPTED(bp)); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); + bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, val32); + } else { + bcopy(salt, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); + + bcopy(iv, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); + + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, BSWAP_32(val32)); + } +} + +void +zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_PROTECTED(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_IS_AUTHENTICATED(bp)) { + bzero(salt, ZIO_DATA_SALT_LEN); + bzero(iv, ZIO_DATA_IV_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); + bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); + + val32 = (uint32_t)BP_GET_IV2(bp); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } else { + val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); + bcopy(&val64, salt, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); + bcopy(&val64, iv, sizeof (uint64_t)); + + val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } +} + +void +zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp)); + ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], + sizeof (uint64_t)); + } else { + bcopy(mac, &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[2] = BSWAP_64(val64); + + bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[3] = BSWAP_64(val64); + } +} + +void +zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + bzero(mac, ZIO_DATA_MAC_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); + } else { + val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); + bcopy(&val64, mac, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); + bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); + } +} + +void +zio_crypt_encode_mac_zil(void *data, uint8_t *mac) +{ + zil_chain_t *zilc = data; + + bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], + sizeof (uint64_t)); +} + +void +zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) +{ + /* + * The ZIL MAC is embedded in the block it protects, which will + * not have been byteswapped by the time this function has been called. + * As a result, we don't need to worry about byteswapping the MAC. + */ + const zil_chain_t *zilc = data; + + bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); +} + +/* + * This routine takes a block of dnodes (src_abd) and copies only the bonus + * buffers to the same offsets in the dst buffer. datalen should be the size + * of both the src_abd and the dst buffer (not just the length of the bonus + * buffers). + */ +void +zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) +{ + uint_t i, max_dnp = datalen >> DNODE_SHIFT; + uint8_t *src; + dnode_phys_t *dnp, *sdnp, *ddnp; + + src = abd_borrow_buf_copy(src_abd, datalen); + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), + DN_MAX_BONUS_LEN(dnp)); + } + } + + abd_return_buf(src_abd, src, datalen); +} + +/* + * This function decides what fields from blk_prop are included in + * the on-disk various MAC algorithms. + */ +static void +zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) +{ + /* + * Version 0 did not properly zero out all non-portable fields + * as it should have done. We maintain this code so that we can + * do read-only imports of pools on this version. + */ + if (version == 0) { + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); + BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); + return; + } + + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + + /* + * The hole_birth feature might set these fields even if this bp + * is a hole. We zero them out here to guarantee that raw sends + * will function with or without the feature. + */ + if (BP_IS_HOLE(bp)) { + bp->blk_prop = 0ULL; + return; + } + + /* + * At L0 we want to verify these fields to ensure that data blocks + * can not be reinterpretted. For instance, we do not want an attacker + * to trick us into returning raw lz4 compressed data to the user + * by modifying the compression bits. At higher levels, we cannot + * enforce this policy since raw sends do not convey any information + * about indirect blocks, so these values might be different on the + * receive side. Fortunately, this does not open any new attack + * vectors, since any alterations that can be made to a higher level + * bp must still verify the correct order of the layer below it. + */ + if (BP_GET_LEVEL(bp) != 0) { + BP_SET_BYTEORDER(bp, 0); + BP_SET_COMPRESS(bp, 0); + + /* + * psize cannot be set to zero or it will trigger + * asserts, but the value doesn't really matter as + * long as it is constant. + */ + BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); + } + + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); +} + +static void +zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, + blkptr_auth_buf_t *bab, uint_t *bab_len) +{ + blkptr_t tmpbp = *bp; + + if (should_bswap) + byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); + + ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); + ASSERT0(BP_IS_EMBEDDED(&tmpbp)); + + zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); + + /* + * We always MAC blk_prop in LE to ensure portability. This + * must be done after decoding the mac, since the endianness + * will get zero'd out here. + */ + zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); + bab->bab_prop = LE_64(tmpbp.blk_prop); + bab->bab_pad = 0ULL; + + /* version 0 did not include the padding */ + *bab_len = sizeof (blkptr_auth_buf_t); + if (version == 0) + *bab_len -= sizeof (uint64_t); +} + +static int +zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + int ret; + uint_t bab_len; + blkptr_auth_buf_t bab; + crypto_data_t cd; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + cd.cd_length = bab_len; + cd.cd_raw.iov_base = (char *)&bab; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + return (0); + +error: + return (ret); +} + +static void +zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + SHA2Update(ctx, &bab, bab_len); +} + +static void +zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + bcopy(&bab, *aadp, bab_len); + *aadp += bab_len; + *aad_len += bab_len; +} + +static int +zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, dnode_phys_t *dnp) +{ + int ret, i; + dnode_phys_t *adnp; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + crypto_data_t cd; + uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; + + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + + /* authenticate the core dnode (masking out non-portable bits) */ + bcopy(dnp, tmp_dncore, sizeof (tmp_dncore)); + adnp = (dnode_phys_t *)tmp_dncore; + if (le_bswap) { + adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); + adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); + adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); + adnp->dn_used = BSWAP_64(adnp->dn_used); + } + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + + cd.cd_length = sizeof (tmp_dncore); + cd.cd_raw.iov_base = (char *)adnp; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + for (i = 0; i < dnp->dn_nblkptr; i++) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, &dnp->dn_blkptr[i]); + if (ret != 0) + goto error; + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, DN_SPILL_BLKPTR(dnp)); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +/* + * objset_phys_t blocks introduce a number of exceptions to the normal + * authentication process. objset_phys_t's contain 2 seperate HMACS for + * protecting the integrity of their data. The portable_mac protects the + * the metadnode. This MAC can be sent with a raw send and protects against + * reordering of data within the metadnode. The local_mac protects the user + * accounting objects which are not sent from one system to another. + * + * In addition, objset blocks are the only blocks that can be modified and + * written to disk without the key loaded under certain circumstances. During + * zil_claim() we need to be able to update the zil_header_t to complete + * claiming log blocks and during raw receives we need to write out the + * portable_mac from the send file. Both of these actions are possible + * because these fields are not protected by either MAC so neither one will + * need to modify the MACs without the key. However, when the modified blocks + * are written out they will be byteswapped into the host machine's native + * endianness which will modify fields protected by the MAC. As a result, MAC + * calculation for objset blocks works slightly differently from other block + * types. Where other block types MAC the data in whatever endianness is + * written to disk, objset blocks always MAC little endian version of their + * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() + * and le_bswap indicates whether a byteswap is needed to get this block + * into little endian format. + */ +/* ARGSUSED */ +int +zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, + boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) +{ + int ret; + crypto_mechanism_t mech; + crypto_context_t ctx; + crypto_data_t cd; + objset_phys_t *osp = data; + uint64_t intval; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; + uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; + + /* initialize HMAC mechanism */ + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + mech.cm_param = NULL; + mech.cm_param_len = 0; + + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + + /* calculate the portable MAC from the portable fields and metadnode */ + ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the os_type */ + intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + /* CONSTCOND */ + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in fields from the metadnode */ + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_meta_dnode); + if (ret) + goto error; + + /* store the final digest in a temporary buffer and copy what we need */ + cd.cd_length = SHA512_DIGEST_LENGTH; + cd.cd_raw.iov_base = (char *)raw_portable_mac; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_final(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + + /* + * The local MAC protects the user and group accounting. If these + * objects are not present, the local MAC is zeroed out. + */ + if ((osp->os_userused_dnode.dn_type == DMU_OT_NONE && + osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || + (datalen <= OBJSET_OLD_PHYS_SIZE)) { + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (0); + } + + /* calculate the local MAC from the userused and groupused dnodes */ + ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the non-portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + /* CONSTCOND */ + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in fields from the user accounting dnodes */ + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_userused_dnode); + if (ret) + goto error; + + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_groupused_dnode); + if (ret) + goto error; + + /* store the final digest in a temporary buffer and copy what we need */ + cd.cd_length = SHA512_DIGEST_LENGTH; + cd.cd_raw.iov_base = (char *)raw_local_mac; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_final(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN); + + return (0); + +error: + bzero(portable_mac, ZIO_OBJSET_MAC_LEN); + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (ret); +} + +static void +zio_crypt_destroy_uio(uio_t *uio) +{ + if (uio->uio_iov) + kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); +} + +/* + * This function parses an uncompressed indirect block and returns a checksum + * of all the portable fields from all of the contained bps. The portable + * fields are the MAC and all of the fields from blk_prop except for the dedup, + * checksum, and psize bits. For an explanation of the purpose of this, see + * the comment block on object set authentication. + */ +static int +zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, + uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) +{ + blkptr_t *bp; + int i, epb = datalen >> SPA_BLKPTRSHIFT; + SHA2_CTX ctx; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + /* checksum all of the MACs from the layer below */ + SHA2Init(SHA512, &ctx); + for (i = 0, bp = buf; i < epb; i++, bp++) { + zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, + byteswap, bp); + } + SHA2Final(digestbuf, &ctx); + + if (generate) { + bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN); + return (0); + } + + if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) + return (SET_ERROR(ECKSUM)); + + return (0); +} + +int +zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + + /* + * Unfortunately, callers of this function will not always have + * easy access to the on-disk format version. This info is + * normally found in the DSL Crypto Key, but the checksum-of-MACs + * is expected to be verifiable even when the key isn't loaded. + * Here, instead of doing a ZAP lookup for the version for each + * zio, we simply try both existing formats. + */ + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, + datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); + if (ret == ECKSUM) { + ASSERT(!generate); + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, + buf, datalen, 0, byteswap, cksum); + } + + return (ret); +} + +int +zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + void *buf; + + buf = abd_borrow_buf_copy(abd, datalen); + ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, + byteswap, cksum); + abd_return_buf(abd, buf, datalen); + + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting ZIL blocks. + * We do not check for the older ZIL chain because the encryption feature + * was not available before the newer ZIL chain was introduced. The goal + * here is to encrypt everything except the blkptr_t of a lr_write_t and + * the zil_chain_t header. Everything that is not encrypted is authenticated. + */ + +/* ARGSUSED */ +static int +zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio, + uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, + boolean_t *no_crypt) +{ + int ret; + uint64_t txtype, lr_len; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; + uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; + zil_chain_t *zilc; + lr_t *lr; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + /* cipherbuf always needs an extra iovec for the MAC */ + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + /* find the start and end record of the log block */ + zilc = (zil_chain_t *)src; + slrp = src + sizeof (zil_chain_t); + aadp = aadbuf; + blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + + /* calculate the number of encrypted iovecs we will need */ + for (; slrp < blkend; slrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + nr_iovecs++; + if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) + nr_iovecs++; + } + + nr_src += nr_iovecs; + nr_dst += nr_iovecs; + + /* allocate the iovec arrays */ + if (nr_src != 0) { + src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); + if (src_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + if (nr_dst != 0) { + dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); + if (dst_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + /* + * Copy the plain zil header over and authenticate everything except + * the checksum that will store our MAC. If we are writing the data + * the embedded checksum will not have been calculated yet, so we don't + * authenticate that. + */ + bcopy(src, dst, sizeof (zil_chain_t)); + bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t)); + aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); + aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); + + /* loop over records again, filling in iovecs */ + nr_iovecs = 0; + slrp = src + sizeof (zil_chain_t); + dlrp = dst + sizeof (zil_chain_t); + + for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + /* copy the common lr_t */ + bcopy(slrp, dlrp, sizeof (lr_t)); + bcopy(slrp, aadp, sizeof (lr_t)); + aadp += sizeof (lr_t); + aad_len += sizeof (lr_t); + + ASSERT3P(src_iovecs, !=, NULL); + ASSERT3P(dst_iovecs, !=, NULL); + + /* + * If this is a TX_WRITE record we want to encrypt everything + * except the bp if exists. If the bp does exist we want to + * authenticate it. + */ + if (txtype == TX_WRITE) { + crypt_len = sizeof (lr_write_t) - + sizeof (lr_t) - sizeof (blkptr_t); + src_iovecs[nr_iovecs].iov_base = (char *)slrp + + sizeof (lr_t); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = (char *)dlrp + + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + /* copy the bp now since it will not be encrypted */ + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), + sizeof (blkptr_t)); + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + aadp, sizeof (blkptr_t)); + aadp += sizeof (blkptr_t); + aad_len += sizeof (blkptr_t); + nr_iovecs++; + total_len += crypt_len; + + if (lr_len != sizeof (lr_write_t)) { + crypt_len = lr_len - sizeof (lr_write_t); + src_iovecs[nr_iovecs].iov_base = (char *) + slrp + sizeof (lr_write_t); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = (char *) + dlrp + sizeof (lr_write_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + nr_iovecs++; + total_len += crypt_len; + } + } else { + crypt_len = lr_len - sizeof (lr_t); + src_iovecs[nr_iovecs].iov_base = (char *)slrp + + sizeof (lr_t); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = (char *)dlrp + + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + nr_iovecs++; + total_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + + if (encrypt) { + puio->uio_iov = src_iovecs; + puio->uio_iovcnt = nr_src; + cuio->uio_iov = dst_iovecs; + cuio->uio_iovcnt = nr_dst; + } else { + puio->uio_iov = dst_iovecs; + puio->uio_iovcnt = nr_dst; + cuio->uio_iov = src_iovecs; + cuio->uio_iovcnt = nr_src; + } + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + if (src_iovecs != NULL) + kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); + if (dst_iovecs != NULL) + kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + puio->uio_iov = NULL; + puio->uio_iovcnt = 0; + cuio->uio_iov = NULL; + cuio->uio_iovcnt = 0; + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting dnode blocks. + */ +static int +zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, + uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; + iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; + uint8_t *src, *dst, *aadp; + dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + aadp = aadbuf; + + /* + * Count the number of iovecs we will need to do the encryption by + * counting the number of bonus buffers that need to be encrypted. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + /* + * This block may still be byteswapped. However, all of the + * values we use are either uint8_t's (for which byteswapping + * is a noop) or a * != 0 check, which will work regardless + * of whether or not we byteswap. + */ + if (sdnp[i].dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && + sdnp[i].dn_bonuslen != 0) { + nr_iovecs++; + } + } + + nr_src += nr_iovecs; + nr_dst += nr_iovecs; + + if (nr_src != 0) { + src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); + if (src_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + if (nr_dst != 0) { + dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); + if (dst_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + nr_iovecs = 0; + + /* + * Iterate through the dnodes again, this time filling in the uios + * we allocated earlier. We also concatenate any data we want to + * authenticate onto aadbuf. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + /* copy over the core fields and blkptrs (kept as plaintext) */ + bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]), + sizeof (blkptr_t)); + } + + /* + * Handle authenticated data. We authenticate everything in + * the dnode that can be brought over when we do a raw send. + * This includes all of the core fields as well as the MACs + * stored in the bp checksums and all of the portable bits + * from blk_prop. We include the dnode padding here in case it + * ever gets used in the future. Some dn_flags and dn_used are + * not portable so we mask those out values out of the + * authenticated data. + */ + crypt_len = offsetof(dnode_phys_t, dn_blkptr); + bcopy(dnp, aadp, crypt_len); + adnp = (dnode_phys_t *)aadp; + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + aadp += crypt_len; + aad_len += crypt_len; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, &dnp->dn_blkptr[j]); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, DN_SPILL_BLKPTR(dnp)); + } + + /* + * If this bonus buffer needs to be encrypted, we prepare an + * iovec_t. The encryption / decryption functions will fill + * this in for us with the encrypted or decrypted data. + * Otherwise we add the bonus buffer to the authenticated + * data buffer and copy it over to the destination. The + * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that + * we can guarantee alignment with the AES block size + * (128 bits). + */ + crypt_len = DN_MAX_BONUS_LEN(dnp); + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + ASSERT3U(nr_iovecs, <, nr_src); + ASSERT3U(nr_iovecs, <, nr_dst); + ASSERT3P(src_iovecs, !=, NULL); + ASSERT3P(dst_iovecs, !=, NULL); + src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + nr_iovecs++; + total_len += crypt_len; + } else { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len); + bcopy(DN_BONUS(dnp), aadp, crypt_len); + aadp += crypt_len; + aad_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + + if (encrypt) { + puio->uio_iov = src_iovecs; + puio->uio_iovcnt = nr_src; + cuio->uio_iov = dst_iovecs; + cuio->uio_iovcnt = nr_dst; + } else { + puio->uio_iov = dst_iovecs; + puio->uio_iovcnt = nr_dst; + cuio->uio_iov = src_iovecs; + cuio->uio_iovcnt = nr_src; + } + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + if (src_iovecs != NULL) + kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); + if (dst_iovecs != NULL) + kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + puio->uio_iov = NULL; + puio->uio_iovcnt = 0; + cuio->uio_iov = NULL; + cuio->uio_iovcnt = 0; + return (ret); +} + +/* ARGSUSED */ +static int +zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *cuio, + uint_t *enc_len) +{ + int ret; + uint_t nr_plain = 1, nr_cipher = 2; + iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; + + /* allocate the iovecs for the plain and cipher data */ + plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t), + KM_SLEEP); + if (!plain_iovecs) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t), + KM_SLEEP); + if (!cipher_iovecs) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + plain_iovecs[0].iov_base = (void *)plainbuf; + plain_iovecs[0].iov_len = datalen; + cipher_iovecs[0].iov_base = (void *)cipherbuf; + cipher_iovecs[0].iov_len = datalen; + + *enc_len = datalen; + puio->uio_iov = plain_iovecs; + puio->uio_iovcnt = nr_plain; + cuio->uio_iov = cipher_iovecs; + cuio->uio_iovcnt = nr_cipher; + + return (0); + +error: + if (plain_iovecs != NULL) + kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); + if (cipher_iovecs != NULL) + kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); + + *enc_len = 0; + puio->uio_iov = NULL; + puio->uio_iovcnt = 0; + cuio->uio_iov = NULL; + cuio->uio_iovcnt = 0; + return (ret); +} + +/* + * This function builds up the plaintext (puio) and ciphertext (cuio) uios so + * that they can be used for encryption and decryption by zio_do_crypt_uio(). + * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks + * requiring special handling to parse out pieces that are to be encrypted. The + * authbuf is used by these special cases to store additional authenticated + * data (AAD) for the encryption modes. + */ +/* ARGSUSED */ +static int +zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, + uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + iovec_t *mac_iov; + + ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); + + /* route to handler */ + switch (ot) { + case DMU_OT_INTENT_LOG: + ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, + datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, + no_crypt); + break; + case DMU_OT_DNODE: + ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, + cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, + auth_len, no_crypt); + break; + default: + ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, + datalen, puio, cuio, enc_len); + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + break; + } + + if (ret != 0) + goto error; + + /* populate the uios */ + puio->uio_segflg = UIO_SYSSPACE; + cuio->uio_segflg = UIO_SYSSPACE; + + mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); + mac_iov->iov_base = (void *)mac; + mac_iov->iov_len = ZIO_DATA_MAC_LEN; + + return (0); + +error: + return (ret); +} + +/* + * Primary encryption / decryption entrypoint for zio data. + */ +int +zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, + dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, + uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, + boolean_t *no_crypt) +{ + int ret; + boolean_t locked = B_FALSE; + uint64_t crypt = key->zk_crypt; + uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; + uint_t enc_len, auth_len; + uio_t puio, cuio; + uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; + crypto_key_t tmp_ckey, *ckey = NULL; + crypto_ctx_template_t tmpl; + uint8_t *authbuf = NULL; + + bzero(&puio, sizeof (uio_t)); + bzero(&cuio, sizeof (uio_t)); + + /* create uios for encryption */ + ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, + cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, + &authbuf, &auth_len, no_crypt); + if (ret != 0) + return (ret); + + /* + * If the needed key is the current one, just use it. Otherwise we + * need to generate a temporary one from the given salt + master key. + * If we are encrypting, we must return a copy of the current salt + * so that it can be stored in the blkptr_t. + */ + rw_enter(&key->zk_salt_lock, RW_READER); + locked = B_TRUE; + + if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { + ckey = &key->zk_current_key; + tmpl = key->zk_current_tmpl; + } else { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); + if (ret != 0) + goto error; + + tmp_ckey.ck_format = CRYPTO_KEY_RAW; + tmp_ckey.ck_data = enc_keydata; + tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + ckey = &tmp_ckey; + tmpl = NULL; + } + + /* perform the encryption / decryption */ + ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len, + &puio, &cuio, authbuf, auth_len); + if (ret != 0) + goto error; + + if (locked) { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + } + + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + zio_crypt_destroy_uio(&puio); + zio_crypt_destroy_uio(&cuio); + + return (0); + +error: + if (!encrypt) { + if (failed_decrypt_buf != NULL) + kmem_free(failed_decrypt_buf, failed_decrypt_size); + failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP); + failed_decrypt_size = datalen; + bcopy(cipherbuf, failed_decrypt_buf, datalen); + } + if (locked) + rw_exit(&key->zk_salt_lock); + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + zio_crypt_destroy_uio(&puio); + zio_crypt_destroy_uio(&cuio); + + return (ret); +} + +/* + * Simple wrapper around zio_do_crypt_data() to work with abd's instead of + * linear buffers. + */ +int +zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, + boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, + uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) +{ + int ret; + void *ptmp, *ctmp; + + if (encrypt) { + ptmp = abd_borrow_buf_copy(pabd, datalen); + ctmp = abd_borrow_buf(cabd, datalen); + } else { + ptmp = abd_borrow_buf(pabd, datalen); + ctmp = abd_borrow_buf_copy(cabd, datalen); + } + + ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, + datalen, ptmp, ctmp, no_crypt); + if (ret != 0) + goto error; + + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (0); + +error: + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (ret); +} diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c index 71b859bc3d..f13fb18c16 100644 --- a/usr/src/uts/common/fs/zfs/zio_inject.c +++ b/usr/src/uts/common/fs/zfs/zio_inject.c @@ -194,6 +194,37 @@ zio_match_dva(zio_t *zio) /* + * Inject a decryption failure. Decryption failures can occur in + * both the ARC and the ZIO layers. + */ +int +zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, + uint64_t type, int error) +{ + int ret = 0; + inject_handler_t *handler; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT) + continue; + + if (zio_match_handler((zbookmark_phys_t *)zb, type, ZI_NO_DVA, + &handler->zi_record, error)) { + ret = error; + break; + } + } + + rw_exit(&inject_lock); + return (ret); +} + +/* * Determine if the I/O in question should return failure. Returns the errno * to be returned to the caller. */ diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index c6d9378649..2ac660e9f7 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -396,6 +396,7 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) * Replay a TX_WRITE ZIL transaction that didn't get committed * after a system failure */ +/* ARGSUSED */ static int zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) { @@ -504,7 +505,7 @@ zvol_create_minor(const char *name) } /* lie and say we're read-only */ - error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, FTAG, &os); + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) { mutex_exit(&zfsdev_state_lock); @@ -512,13 +513,13 @@ zvol_create_minor(const char *name) } if ((minor = zfsdev_minor_alloc()) == 0) { - dmu_objset_disown(os, FTAG); + dmu_objset_disown(os, 1, FTAG); mutex_exit(&zfsdev_state_lock); return (SET_ERROR(ENXIO)); } if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) { - dmu_objset_disown(os, FTAG); + dmu_objset_disown(os, 1, FTAG); mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EAGAIN)); } @@ -530,7 +531,7 @@ zvol_create_minor(const char *name) if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) { ddi_soft_state_free(zfsdev_state, minor); - dmu_objset_disown(os, FTAG); + dmu_objset_disown(os, 1, FTAG); mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EAGAIN)); } @@ -541,7 +542,7 @@ zvol_create_minor(const char *name) minor, DDI_PSEUDO, 0) == DDI_FAILURE) { ddi_remove_minor_node(zfs_dip, chrbuf); ddi_soft_state_free(zfsdev_state, minor); - dmu_objset_disown(os, FTAG); + dmu_objset_disown(os, 1, FTAG); mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EAGAIN)); } @@ -569,7 +570,7 @@ zvol_create_minor(const char *name) else zil_replay(os, zv, zvol_replay_vector); } - dmu_objset_disown(os, FTAG); + dmu_objset_disown(os, 1, FTAG); zv->zv_objset = NULL; zvol_minors++; @@ -633,7 +634,7 @@ zvol_first_open(zvol_state_t *zv) uint64_t readonly; /* lie and say we're read-only */ - error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE, + error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE, B_TRUE, zvol_tag, &os); if (error) return (error); @@ -642,13 +643,13 @@ zvol_first_open(zvol_state_t *zv) error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) { ASSERT(error == 0); - dmu_objset_disown(os, zvol_tag); + dmu_objset_disown(os, 1, zvol_tag); return (error); } error = dnode_hold(os, ZVOL_OBJ, zvol_tag, &zv->zv_dn); if (error) { - dmu_objset_disown(os, zvol_tag); + dmu_objset_disown(os, 1, zvol_tag); return (error); } @@ -682,7 +683,7 @@ zvol_last_close(zvol_state_t *zv) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); dmu_objset_evict_dbufs(zv->zv_objset); - dmu_objset_disown(zv->zv_objset, zvol_tag); + dmu_objset_disown(zv->zv_objset, 1, zvol_tag); zv->zv_objset = NULL; } @@ -730,6 +731,7 @@ zvol_update_volsize(objset_t *os, uint64_t volsize) { dmu_tx_t *tx; int error; + uint64_t txg; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); @@ -741,11 +743,14 @@ zvol_update_volsize(objset_t *os, uint64_t volsize) dmu_tx_abort(tx); return (error); } + txg = dmu_tx_get_txg(tx); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); dmu_tx_commit(tx); + txg_wait_synced(dmu_objset_pool(os), txg); + if (error == 0) error = dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END); @@ -850,7 +855,7 @@ zvol_set_volsize(const char *name, uint64_t volsize) zv = zvol_minor_lookup(name); if (zv == NULL || zv->zv_objset == NULL) { - if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, + if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE, FTAG, &os)) != 0) { mutex_exit(&zfsdev_state_lock); return (error); @@ -872,7 +877,7 @@ zvol_set_volsize(const char *name, uint64_t volsize) error = zvol_update_live_volsize(zv, volsize); out: if (owned) { - dmu_objset_disown(os, FTAG); + dmu_objset_disown(os, B_TRUE, FTAG); if (zv != NULL) zv->zv_objset = NULL; } @@ -901,7 +906,12 @@ zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) mutex_exit(&zfsdev_state_lock); return (err); } - if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + /* + * Check for a bad on-disk format version now since we + * lied about owning the dataset readonly before. + */ + if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) || + dmu_objset_incompatible_encryption_version(zv->zv_objset))) { err = SET_ERROR(EROFS); goto out; } @@ -2099,6 +2109,9 @@ zvol_dumpify(zvol_state_t *zv) if (zv->zv_flags & ZVOL_RDONLY) return (SET_ERROR(EROFS)); + if (os->os_encrypted) + return (SET_ERROR(ENOTSUP)); + if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) { boolean_t resize = (dumpsize > 0); |
