Diffstat (limited to 'usr/src/uts/common/fs/zfs/dmu_send.c')
-rw-r--r-- | usr/src/uts/common/fs/zfs/dmu_send.c | 809
1 file changed, 610 insertions, 199 deletions
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index 7d728c6877..ca09d10bd8 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -52,13 +52,38 @@
 #include <sys/blkptr.h>
 #include <sys/dsl_bookmark.h>
 #include <sys/zfeature.h>
+#include <sys/bqueue.h>
 
 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
 int zfs_send_corrupt_data = B_FALSE;
+int zfs_send_queue_length = 16 * 1024 * 1024;
+int zfs_recv_queue_length = 16 * 1024 * 1024;
 
 static char *dmu_recv_tag = "dmu_recv_tag";
 static const char *recv_clone_name = "%recv";
 
+#define	BP_SPAN(datablkszsec, indblkshift, level) \
+	(((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
+	(level) * (indblkshift - SPA_BLKPTRSHIFT)))
+
+struct send_thread_arg {
+	bqueue_t	q;
+	dsl_dataset_t	*ds;		/* Dataset to traverse */
+	uint64_t	fromtxg;	/* Traverse from this txg */
+	int		flags;		/* flags to pass to traverse_dataset */
+	int		error_code;
+	boolean_t	cancel;
+};
+
+struct send_block_record {
+	boolean_t		eos_marker; /* Marks the end of the stream */
+	blkptr_t		bp;
+	zbookmark_phys_t	zb;
+	uint8_t			indblkshift;
+	uint16_t		datablkszsec;
+	bqueue_node_t		ln;
+};
+
 static int
 dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
 {
@@ -434,58 +459,115 @@ backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
 	return (B_FALSE);
 }
 
-#define	BP_SPAN(dnp, level) \
-	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
-	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
+/*
+ * This is the callback function to traverse_dataset that acts as the worker
+ * thread for dmu_send_impl.
+ */
+/*ARGSUSED*/
+static int
+send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
+{
+	struct send_thread_arg *sta = arg;
+	struct send_block_record *record;
+	uint64_t record_size;
+	int err = 0;
 
-/* ARGSUSED */
+	if (sta->cancel)
+		return (SET_ERROR(EINTR));
+
+	if (bp == NULL) {
+		ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
+		return (0);
+	} else if (zb->zb_level < 0) {
+		return (0);
+	}
+
+	record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
+	record->eos_marker = B_FALSE;
+	record->bp = *bp;
+	record->zb = *zb;
+	record->indblkshift = dnp->dn_indblkshift;
+	record->datablkszsec = dnp->dn_datablkszsec;
+	record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
+	bqueue_enqueue(&sta->q, record, record_size);
+
+	return (err);
+}
+
+/*
+ * This function kicks off the traverse_dataset.  It also handles setting the
+ * error code of the thread in case something goes wrong, and pushes the End of
+ * Stream record when the traverse_dataset call has finished.  If there is no
+ * dataset to traverse, the thread immediately pushes an End of Stream marker.
+ */
+static void
+send_traverse_thread(void *arg)
+{
+	struct send_thread_arg *st_arg = arg;
+	int err;
+	struct send_block_record *data;
+
+	if (st_arg->ds != NULL) {
+		err = traverse_dataset(st_arg->ds, st_arg->fromtxg,
+		    st_arg->flags, send_cb, arg);
+		if (err != EINTR)
+			st_arg->error_code = err;
+	}
+	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
+	data->eos_marker = B_TRUE;
+	bqueue_enqueue(&st_arg->q, data, 1);
+}
+
+/*
+ * This function actually handles figuring out what kind of record needs to be
+ * dumped, reading the data (which has hopefully been prefetched), and calling
+ * the appropriate helper function.
+ */
 static int
-backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
-    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
 {
-	dmu_sendarg_t *dsp = arg;
+	dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
+	const blkptr_t *bp = &data->bp;
+	const zbookmark_phys_t *zb = &data->zb;
+	uint8_t indblkshift = data->indblkshift;
+	uint16_t dblkszsec = data->datablkszsec;
+	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
 	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
 	int err = 0;
 
-	if (issig(JUSTLOOKING) && issig(FORREAL))
-		return (SET_ERROR(EINTR));
+	ASSERT3U(zb->zb_level, >=, 0);
 
 	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
 	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
 		return (0);
-	} else if (zb->zb_level == ZB_ZIL_LEVEL) {
-		/*
-		 * If we are sending a non-snapshot (which is allowed on
-		 * read-only pools), it may have a ZIL, which must be ignored.
-		 */
-		return (0);
 	} else if (BP_IS_HOLE(bp) &&
 	    zb->zb_object == DMU_META_DNODE_OBJECT) {
-		uint64_t span = BP_SPAN(dnp, zb->zb_level);
+		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
 		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
-		err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
+		err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
 	} else if (BP_IS_HOLE(bp)) {
-		uint64_t span = BP_SPAN(dnp, zb->zb_level);
-		err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
+		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
+		uint64_t offset = zb->zb_blkid * span;
+		err = dump_free(dsa, zb->zb_object, offset, span);
 	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
 		return (0);
 	} else if (type == DMU_OT_DNODE) {
-		dnode_phys_t *blk;
-		int i;
 		int blksz = BP_GET_LSIZE(bp);
 		arc_flags_t aflags = ARC_FLAG_WAIT;
 		arc_buf_t *abuf;
 
+		ASSERT0(zb->zb_level);
+
 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
 		    &aflags, zb) != 0)
 			return (SET_ERROR(EIO));
 
-		blk = abuf->b_data;
-		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
-			uint64_t dnobj = (zb->zb_blkid <<
-			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
-			err = dump_dnode(dsp, dnobj, blk+i);
+		dnode_phys_t *blk = abuf->b_data;
+		uint64_t dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT);
+		for (int i = 0; i < blksz >> DNODE_SHIFT; i++) {
+			err = dump_dnode(dsa, dnobj + i, blk + i);
 			if (err != 0)
 				break;
 		}
@@ -500,20 +582,21 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 		    &aflags, zb) != 0)
 			return (SET_ERROR(EIO));
 
-		err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
+		err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
 		(void) arc_buf_remove_ref(abuf, &abuf);
-	} else if (backup_do_embed(dsp, bp)) {
+	} else if (backup_do_embed(dsa, bp)) {
 		/* it's an embedded level-0 block of a regular object */
-		int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
-		err = dump_write_embedded(dsp, zb->zb_object,
+		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
+		ASSERT0(zb->zb_level);
+		err = dump_write_embedded(dsa, zb->zb_object,
 		    zb->zb_blkid * blksz, blksz, bp);
-	} else { /* it's a level-0 block of a regular object */
+	} else {
+		/* it's a level-0 block of a regular object */
 		arc_flags_t aflags = ARC_FLAG_WAIT;
 		arc_buf_t *abuf;
-		int blksz = BP_GET_LSIZE(bp);
+		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
 		uint64_t offset;
 
-		ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
 		ASSERT0(zb->zb_level);
 		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
 		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
@@ -534,20 +617,20 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 		offset = zb->zb_blkid * blksz;
 
-		if (!(dsp->dsa_featureflags &
+		if (!(dsa->dsa_featureflags &
 		    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
 		    blksz > SPA_OLD_MAXBLOCKSIZE) {
 			char *buf = abuf->b_data;
 			while (blksz > 0 && err == 0) {
 				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
-				err = dump_write(dsp, type, zb->zb_object,
+				err = dump_write(dsa, type, zb->zb_object,
 				    offset, n, NULL, buf);
 				offset += n;
 				buf += n;
 				blksz -= n;
 			}
 		} else {
-			err = dump_write(dsp, type, zb->zb_object,
+			err = dump_write(dsa, type, zb->zb_object,
 			    offset, blksz, bp, abuf->b_data);
 		}
 		(void) arc_buf_remove_ref(abuf, &abuf);
@@ -558,11 +641,24 @@ backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 }
 
 /*
- * Releases dp using the specified tag.
+ * Pop the new data off the queue, and free the old data.
+ */
+static struct send_block_record *
+get_next_record(bqueue_t *bq, struct send_block_record *data)
+{
+	struct send_block_record *tmp = bqueue_dequeue(bq);
+	kmem_free(data, sizeof (*data));
+	return (tmp);
+}
+
+/*
+ * Actually do the bulk of the work in a zfs send.
+ *
+ * Note: Releases dp using the specified tag.
  */
 static int
-dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
-    zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
+dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
+    zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok,
     boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
 {
 	objset_t *os;
@@ -571,8 +667,9 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
 	int err;
 	uint64_t fromtxg = 0;
 	uint64_t featureflags = 0;
+	struct send_thread_arg to_arg;
 
-	err = dmu_objset_from_ds(ds, &os);
+	err = dmu_objset_from_ds(to_ds, &os);
 	if (err != 0) {
 		dsl_pool_rele(dp, tag);
 		return (err);
@@ -598,35 +695,34 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
 	}
 #endif
 
-	if (large_block_ok && ds->ds_large_blocks)
+	if (large_block_ok && to_ds->ds_large_blocks)
 		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
 	if (embedok &&
 	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
 		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
 		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
 			featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
-	} else {
-		embedok = B_FALSE;
 	}
 
 	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
 	    featureflags);
 
 	drr->drr_u.drr_begin.drr_creation_time =
-	    dsl_dataset_phys(ds)->ds_creation_time;
+	    dsl_dataset_phys(to_ds)->ds_creation_time;
 	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
 	if (is_clone)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
-	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid;
-	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
+	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
 		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
 
-	if (fromzb != NULL) {
-		drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid;
-		fromtxg = fromzb->zbm_creation_txg;
+	if (ancestor_zb != NULL) {
+		drr->drr_u.drr_begin.drr_fromguid =
+		    ancestor_zb->zbm_guid;
+		fromtxg = ancestor_zb->zbm_creation_txg;
 	}
-	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
-	if (!ds->ds_is_snapshot) {
+	dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
+	if (!to_ds->ds_is_snapshot) {
 		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
 		    sizeof (drr->drr_u.drr_begin.drr_toname));
 	}
@@ -639,16 +735,16 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
 	dsp->dsa_proc = curproc;
 	dsp->dsa_os = os;
 	dsp->dsa_off = off;
-	dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid;
+	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
 	dsp->dsa_pending_op = PENDING_NONE;
-	dsp->dsa_incremental = (fromzb != NULL);
+	dsp->dsa_incremental = (ancestor_zb != NULL);
 	dsp->dsa_featureflags = featureflags;
 
-	mutex_enter(&ds->ds_sendstream_lock);
-	list_insert_head(&ds->ds_sendstreams, dsp);
-	mutex_exit(&ds->ds_sendstream_lock);
+	mutex_enter(&to_ds->ds_sendstream_lock);
+	list_insert_head(&to_ds->ds_sendstreams, dsp);
+	mutex_exit(&to_ds->ds_sendstream_lock);
 
-	dsl_dataset_long_hold(ds, FTAG);
+	dsl_dataset_long_hold(to_ds, FTAG);
 	dsl_pool_rele(dp, tag);
 
 	if (dump_record(dsp, NULL, 0) != 0) {
@@ -656,8 +752,41 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
 		goto out;
 	}
 
-	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
-	    backup_cb, dsp);
+	err = bqueue_init(&to_arg.q, zfs_send_queue_length,
+	    offsetof(struct send_block_record, ln));
+	to_arg.error_code = 0;
+	to_arg.cancel = B_FALSE;
+	to_arg.ds = to_ds;
+	to_arg.fromtxg = fromtxg;
+	to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
+	(void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc,
+	    TS_RUN, minclsyspri);
+
+	struct send_block_record *to_data;
+	to_data = bqueue_dequeue(&to_arg.q);
+
+	while (!to_data->eos_marker && err == 0) {
+		err = do_dump(dsp, to_data);
+		to_data = get_next_record(&to_arg.q, to_data);
+		if (issig(JUSTLOOKING) && issig(FORREAL))
+			err = EINTR;
+	}
+
+	if (err != 0) {
+		to_arg.cancel = B_TRUE;
+		while (!to_data->eos_marker) {
+			to_data = get_next_record(&to_arg.q, to_data);
+		}
+	}
+	kmem_free(to_data, sizeof (*to_data));
+
+	bqueue_destroy(&to_arg.q);
+
+	if (err == 0 && to_arg.error_code != 0)
+		err = to_arg.error_code;
+
+	if (err != 0)
+		goto out;
 
 	if (dsp->dsa_pending_op != PENDING_NONE)
 		if (dump_record(dsp, NULL, 0) != 0)
@@ -674,20 +803,18 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
 	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
 	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
 
-	if (dump_record(dsp, NULL, 0) != 0) {
+	if (dump_record(dsp, NULL, 0) != 0)
 		err = dsp->dsa_err;
-		goto out;
-	}
 
 out:
-	mutex_enter(&ds->ds_sendstream_lock);
-	list_remove(&ds->ds_sendstreams, dsp);
-	mutex_exit(&ds->ds_sendstream_lock);
+	mutex_enter(&to_ds->ds_sendstream_lock);
+	list_remove(&to_ds->ds_sendstreams, dsp);
+	mutex_exit(&to_ds->ds_sendstream_lock);
 
 	kmem_free(drr, sizeof (dmu_replay_record_t));
 	kmem_free(dsp, sizeof (dmu_sendarg_t));
 
-	dsl_dataset_long_rele(ds, FTAG);
+	dsl_dataset_long_rele(to_ds, FTAG);
 
 	return (err);
 }
@@ -1110,7 +1237,8 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
 		 * If it's a non-clone incremental, we are missing the
 		 * target fs, so fail the recv.
 		 */
-		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
+		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE ||
+		    drba->drba_origin))
 			return (SET_ERROR(ENOENT));
 
 		/* Open the parent of tofs */
@@ -1292,21 +1420,57 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
 	    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
 }
 
-struct restorearg {
+struct receive_record_arg {
+	dmu_replay_record_t header;
+	void *payload; /* Pointer to a buffer containing the payload */
+	/*
+	 * If the record is a write, pointer to the arc_buf_t containing the
+	 * payload.
+	 */
+	arc_buf_t *write_buf;
+	int payload_size;
+	boolean_t eos_marker; /* Marks the end of the stream */
+	bqueue_node_t node;
+};
+
+struct receive_writer_arg {
 	objset_t *os;
-	int err;
 	boolean_t byteswap;
-	vnode_t *vp;
-	uint64_t voff;
-	int bufsize; /* amount of memory allocated for buf */
+	bqueue_t q;
+	/*
+	 * These three args are used to signal to the main thread that we're
+	 * done.
+	 */
+	kmutex_t mutex;
+	kcondvar_t cv;
+	boolean_t done;
+	int err;
+	/* A map from guid to dataset to help handle dedup'd streams. */
+	avl_tree_t *guid_to_ds_map;
+};
 
-	dmu_replay_record_t *drr;
-	dmu_replay_record_t *next_drr;
-	char *buf;
+struct receive_arg {
+	objset_t *os;
+	vnode_t *vp; /* The vnode to read the stream from */
+	uint64_t voff; /* The current offset in the stream */
+	/*
+	 * A record that has had its payload read in, but hasn't yet been handed
+	 * off to the worker thread.
+	 */
+	struct receive_record_arg *rrd;
+	/* A record that has had its header read in, but not its payload. */
+	struct receive_record_arg *next_rrd;
 	zio_cksum_t cksum;
 	zio_cksum_t prev_cksum;
+	int err;
+	boolean_t byteswap;
+	/* Sorted list of objects not to issue prefetches for. */
+	list_t ignore_obj_list;
+};
 
-	avl_tree_t *guid_to_ds_map;
+struct receive_ign_obj_node {
+	list_node_t node;
+	uint64_t object;
 };
 
 typedef struct guid_map_entry {
@@ -1345,13 +1509,12 @@ free_guid_map_onexit(void *arg)
 }
 
 static int
-restore_read(struct restorearg *ra, int len, void *buf)
+receive_read(struct receive_arg *ra, int len, void *buf)
 {
 	int done = 0;
 
 	/* some things will require 8-byte alignment, so everything must */
 	ASSERT0(len % 8);
-	ASSERT3U(len, <=, ra->bufsize);
 
 	while (done < len) {
 		ssize_t resid;
@@ -1470,7 +1633,8 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
 }
 
 static int
-restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
+receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
+    void *data)
 {
 	dmu_object_info_t doi;
 	dmu_tx_t *tx;
@@ -1484,12 +1648,12 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
 	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
 	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
 	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
-	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(ra->os)) ||
+	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
 	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
 		return (SET_ERROR(EINVAL));
 	}
 
-	err = dmu_object_info(ra->os, drro->drr_object, &doi);
+	err = dmu_object_info(rwa->os, drro->drr_object, &doi);
 
 	if (err != 0 && err != ENOENT)
 		return (SET_ERROR(EINVAL));
@@ -1508,14 +1672,14 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
 
 		if (drro->drr_blksz != doi.doi_data_block_size ||
 		    nblkptr < doi.doi_nblkptr) {
-			err = dmu_free_long_range(ra->os, drro->drr_object,
+			err = dmu_free_long_range(rwa->os, drro->drr_object,
 			    0, DMU_OBJECT_END);
 			if (err != 0)
 				return (SET_ERROR(EINVAL));
 		}
 	}
 
-	tx = dmu_tx_create(ra->os);
+	tx = dmu_tx_create(rwa->os);
 	dmu_tx_hold_bonus(tx, object);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err != 0) {
@@ -1525,7 +1689,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
 
 	if (object == DMU_NEW_OBJECT) {
 		/* currently free, want to be allocated */
-		err = dmu_object_claim(ra->os, drro->drr_object,
+		err = dmu_object_claim(rwa->os, drro->drr_object,
 		    drro->drr_type, drro->drr_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
 	} else if (drro->drr_type != doi.doi_type ||
@@ -1533,7 +1697,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
 	    drro->drr_bonustype != doi.doi_bonus_type ||
 	    drro->drr_bonuslen != doi.doi_bonus_size) {
 		/* currently allocated, but with different properties */
-		err = dmu_object_reclaim(ra->os, drro->drr_object,
+		err = dmu_object_reclaim(rwa->os, drro->drr_object,
 		    drro->drr_type, drro->drr_blksz,
 		    drro->drr_bonustype, drro->drr_bonuslen, tx);
 	}
@@ -1542,20 +1706,20 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
 		return (SET_ERROR(EINVAL));
 	}
 
-	dmu_object_set_checksum(ra->os, drro->drr_object,
+	dmu_object_set_checksum(rwa->os, drro->drr_object,
 	    drro->drr_checksumtype, tx);
-	dmu_object_set_compress(ra->os, drro->drr_object,
+	dmu_object_set_compress(rwa->os, drro->drr_object,
 	    drro->drr_compress, tx);
 
 	if (data != NULL) {
 		dmu_buf_t *db;
 
-		VERIFY0(dmu_bonus_hold(ra->os, drro->drr_object, FTAG, &db));
+		VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db));
 		dmu_buf_will_dirty(db, tx);
 
 		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
 		bcopy(data, db->db_data, drro->drr_bonuslen);
-		if (ra->byteswap) {
+		if (rwa->byteswap) {
 			dmu_object_byteswap_t byteswap =
 			    DMU_OT_BYTESWAP(drro->drr_bonustype);
 			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
@@ -1569,7 +1733,7 @@ restore_object(struct restorearg *ra, struct drr_object *drro, void *data)
 
 /* ARGSUSED */
 static int
-restore_freeobjects(struct restorearg *ra,
+receive_freeobjects(struct receive_writer_arg *rwa,
     struct drr_freeobjects *drrfo)
 {
 	uint64_t obj;
@@ -1579,13 +1743,13 @@ restore_freeobjects(struct restorearg *ra,
 
 	for (obj = drrfo->drr_firstobj;
 	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
-	    (void) dmu_object_next(ra->os, &obj, FALSE, 0)) {
+	    (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
 		int err;
 
-		if (dmu_object_info(ra->os, obj, NULL) != 0)
+		if (dmu_object_info(rwa->os, obj, NULL) != 0)
 			continue;
 
-		err = dmu_free_long_object(ra->os, obj);
+		err = dmu_free_long_object(rwa->os, obj);
 		if (err != 0)
 			return (err);
 	}
@@ -1593,7 +1757,8 @@ restore_freeobjects(struct restorearg *ra,
 }
 
 static int
-restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
+receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
+    arc_buf_t *abuf)
 {
 	dmu_tx_t *tx;
 	int err;
@@ -1602,10 +1767,10 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
 	    !DMU_OT_IS_VALID(drrw->drr_type))
 		return (SET_ERROR(EINVAL));
 
-	if (dmu_object_info(ra->os, drrw->drr_object, NULL) != 0)
+	if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
-	tx = dmu_tx_create(ra->os);
+	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_write(tx, drrw->drr_object,
 	    drrw->drr_offset, drrw->drr_length);
@@ -1614,7 +1779,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
 		dmu_tx_abort(tx);
 		return (err);
 	}
-	if (ra->byteswap) {
+	if (rwa->byteswap) {
 		dmu_object_byteswap_t byteswap =
 		    DMU_OT_BYTESWAP(drrw->drr_type);
 		dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
@@ -1622,7 +1787,7 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
 	}
 
 	dmu_buf_t *bonus;
-	if (dmu_bonus_hold(ra->os, drrw->drr_object, FTAG, &bonus) != 0)
+	if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0)
 		return (SET_ERROR(EINVAL));
 	dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
 	dmu_tx_commit(tx);
@@ -1638,7 +1803,8 @@ restore_write(struct restorearg *ra, struct drr_write *drrw, arc_buf_t *abuf)
  * data from the stream to fulfill this write.
  */
 static int
-restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
+receive_write_byref(struct receive_writer_arg *rwa,
+    struct drr_write_byref *drrwbr)
 {
 	dmu_tx_t *tx;
 	int err;
@@ -1657,14 +1823,14 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
 	 */
 	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
 		gmesrch.guid = drrwbr->drr_refguid;
-		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
+		if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
 		    &where)) == NULL) {
 			return (SET_ERROR(EINVAL));
 		}
 		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
 			return (SET_ERROR(EINVAL));
 	} else {
-		ref_os = ra->os;
+		ref_os = rwa->os;
 	}
 
 	err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
@@ -1672,7 +1838,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
 	if (err != 0)
 		return (err);
 
-	tx = dmu_tx_create(ra->os);
+	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_write(tx, drrwbr->drr_object,
 	    drrwbr->drr_offset, drrwbr->drr_length);
@@ -1681,7 +1847,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
 		dmu_tx_abort(tx);
 		return (err);
 	}
-	dmu_write(ra->os, drrwbr->drr_object,
+	dmu_write(rwa->os, drrwbr->drr_object,
 	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
 	dmu_buf_rele(dbp, FTAG);
 	dmu_tx_commit(tx);
@@ -1689,7 +1855,7 @@ restore_write_byref(struct restorearg *ra, struct drr_write_byref *drrwbr)
 }
 
 static int
-restore_write_embedded(struct restorearg *ra,
+receive_write_embedded(struct receive_writer_arg *rwa,
     struct drr_write_embedded *drrwnp, void *data)
 {
 	dmu_tx_t *tx;
@@ -1706,7 +1872,7 @@ restore_write_embedded(struct restorearg *ra,
 	if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
 		return (EINVAL);
 
-	tx = dmu_tx_create(ra->os);
+	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_write(tx, drrwnp->drr_object,
 	    drrwnp->drr_offset, drrwnp->drr_length);
@@ -1716,36 +1882,37 @@ restore_write_embedded(struct restorearg *ra,
 		return (err);
 	}
 
-	dmu_write_embedded(ra->os, drrwnp->drr_object,
+	dmu_write_embedded(rwa->os, drrwnp->drr_object,
 	    drrwnp->drr_offset, data, drrwnp->drr_etype,
 	    drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
-	    ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);
+	    rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
 
 	dmu_tx_commit(tx);
 	return (0);
 }
 
 static int
-restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data)
+receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
+    void *data)
 {
 	dmu_tx_t *tx;
 	dmu_buf_t *db, *db_spill;
 	int err;
 
 	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
-	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(ra->os)))
+	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
 		return (SET_ERROR(EINVAL));
 
-	if (dmu_object_info(ra->os, drrs->drr_object, NULL) != 0)
+	if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
-	VERIFY0(dmu_bonus_hold(ra->os, drrs->drr_object, FTAG, &db));
+	VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
 	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
 		dmu_buf_rele(db, FTAG);
 		return (err);
 	}
 
-	tx = dmu_tx_create(ra->os);
+	tx = dmu_tx_create(rwa->os);
 
 	dmu_tx_hold_spill(tx, db->db_object);
 
@@ -1772,7 +1939,7 @@ restore_spill(struct restorearg *ra, struct drr_spill *drrs, void *data)
 
 /* ARGSUSED */
 static int
-restore_free(struct restorearg *ra, struct drr_free *drrf)
+receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
 {
 	int err;
 
@@ -1780,11 +1947,12 @@ restore_free(struct restorearg *ra, struct drr_free *drrf)
 	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
 		return (SET_ERROR(EINVAL));
 
-	if (dmu_object_info(ra->os, drrf->drr_object, NULL) != 0)
+	if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
 		return (SET_ERROR(EINVAL));
 
-	err = dmu_free_long_range(ra->os, drrf->drr_object,
+	err = dmu_free_long_range(rwa->os, drrf->drr_object,
 	    drrf->drr_offset, drrf->drr_length);
+
 	return (err);
 }
 
@@ -1799,7 +1967,7 @@ dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
 }
 
 static void
-restore_cksum(struct restorearg *ra, int len, void *buf)
+receive_cksum(struct receive_arg *ra, int len, void *buf)
 {
 	if (ra->byteswap) {
 		fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
@@ -1809,30 +1977,42 @@ restore_cksum(struct restorearg *ra, int len, void *buf)
 }
 
 /*
- * If len != 0, read payload into buf.
- * Read next record's header into ra->next_drr.
+ * Read the payload into a buffer of size len, and update the current record's
+ * payload field.
+ * Allocate ra->next_rrd and read the next record's header into
+ * ra->next_rrd->header.
  * Verify checksum of payload and next record.
  */
 static int
-restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf)
+receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
 {
 	int err;
 
 	if (len != 0) {
-		ASSERT3U(len, <=, ra->bufsize);
-		err = restore_read(ra, len, buf);
+		ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
+		ra->rrd->payload = buf;
+		ra->rrd->payload_size = len;
+		err = receive_read(ra, len, ra->rrd->payload);
 		if (err != 0)
 			return (err);
-		restore_cksum(ra, len, buf);
+		receive_cksum(ra, len, ra->rrd->payload);
 	}
 
 	ra->prev_cksum = ra->cksum;
 
-	err = restore_read(ra, sizeof (*ra->next_drr), ra->next_drr);
-	if (err != 0)
+	ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
+	err = receive_read(ra, sizeof (ra->next_rrd->header),
+	    &ra->next_rrd->header);
+	if (err != 0) {
+		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+		ra->next_rrd = NULL;
 		return (err);
-	if (ra->next_drr->drr_type == DRR_BEGIN)
+	}
+	if (ra->next_rrd->header.drr_type == DRR_BEGIN) {
+		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+		ra->next_rrd = NULL;
 		return (SET_ERROR(EINVAL));
+	}
 
 	/*
 	 * Note: checksum is of everything up to but not including the
@@ -1840,107 +2020,180 @@ restore_read_payload_and_next_header(struct restorearg *ra, int len, void *buf)
 	 */
 	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
 	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
-	restore_cksum(ra,
+	receive_cksum(ra,
 	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
-	    ra->next_drr);
+	    &ra->next_rrd->header);
 
-	zio_cksum_t cksum_orig = ra->next_drr->drr_u.drr_checksum.drr_checksum;
-	zio_cksum_t *cksump = &ra->next_drr->drr_u.drr_checksum.drr_checksum;
+	zio_cksum_t cksum_orig =
+	    ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
+	zio_cksum_t *cksump =
+	    &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
 
 	if (ra->byteswap)
-		byteswap_record(ra->next_drr);
+		byteswap_record(&ra->next_rrd->header);
 
 	if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
-	    !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump))
+	    !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) {
+		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
+		ra->next_rrd = NULL;
 		return (SET_ERROR(ECKSUM));
+	}
 
-	restore_cksum(ra, sizeof (cksum_orig), &cksum_orig);
+	receive_cksum(ra, sizeof (cksum_orig), &cksum_orig);
 
 	return (0);
 }
 
+/*
+ * Issue the prefetch reads for any necessary indirect blocks.
+ *
+ * We use the object ignore list to tell us whether or not to issue prefetches
+ * for a given object.
+ * We do this for both correctness (in case the blocksize
+ * of an object has changed) and performance (if the object doesn't exist,
+ * don't needlessly try to issue prefetches).  We also trim the list as we go
+ * through the stream to prevent it from growing to an unbounded size.
+ *
+ * The object numbers within will always be in sorted order, and any write
+ * records we see will also be in sorted order, but they're not sorted with
+ * respect to each other (i.e. we can get several object records before
+ * receiving each object's write records).  As a result, once we've reached a
+ * given object number, we can safely remove any reference to lower object
+ * numbers in the ignore list.  In practice, we receive up to 32 object records
+ * before receiving write records, so the list can have up to 32 nodes in it.
+ */
+/* ARGSUSED */
+static void
+receive_read_prefetch(struct receive_arg *ra,
+    uint64_t object, uint64_t offset, uint64_t length)
+{
+	struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
+	while (node != NULL && node->object < object) {
+		VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
+		kmem_free(node, sizeof (*node));
+		node = list_head(&ra->ignore_obj_list);
+	}
+	if (node == NULL || node->object > object) {
+		dmu_prefetch(ra->os, object, 1, offset, length,
+		    ZIO_PRIORITY_SYNC_READ);
+	}
+}
+
+/*
+ * Read records off the stream, issuing any necessary prefetches.
+ */
 static int
-restore_process_record(struct restorearg *ra)
+receive_read_record(struct receive_arg *ra)
 {
 	int err;
 
-	switch (ra->drr->drr_type) {
+	switch (ra->rrd->header.drr_type) {
 	case DRR_OBJECT:
 	{
-		struct drr_object *drro = &ra->drr->drr_u.drr_object;
-		err = restore_read_payload_and_next_header(ra,
-		    P2ROUNDUP(drro->drr_bonuslen, 8), ra->buf);
-		if (err != 0)
+		struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
+		uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
+		void *buf = kmem_zalloc(size, KM_SLEEP);
+		dmu_object_info_t doi;
+		err = receive_read_payload_and_next_header(ra, size, buf);
+		if (err != 0) {
+			kmem_free(buf, size);
 			return (err);
-		return (restore_object(ra, drro, ra->buf));
+		}
+		err = dmu_object_info(ra->os, drro->drr_object, &doi);
+		/*
+		 * See receive_read_prefetch for an explanation why we're
+		 * storing this object in the ignore_obj_list.
+		 */
+		if (err == ENOENT ||
+		    (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
+			struct receive_ign_obj_node *node =
+			    kmem_zalloc(sizeof (*node),
+			    KM_SLEEP);
+			node->object = drro->drr_object;
+#ifdef ZFS_DEBUG
+			struct receive_ign_obj_node *last_object =
+			    list_tail(&ra->ignore_obj_list);
+			uint64_t last_objnum = (last_object != NULL ?
+			    last_object->object : 0);
+			ASSERT3U(node->object, >, last_objnum);
+#endif
+			list_insert_tail(&ra->ignore_obj_list, node);
+			err = 0;
+		}
+		return (err);
 	}
 	case DRR_FREEOBJECTS:
 	{
-		struct drr_freeobjects *drrfo =
-		    &ra->drr->drr_u.drr_freeobjects;
-		err = restore_read_payload_and_next_header(ra, 0, NULL);
-		if (err != 0)
-			return (err);
-		return (restore_freeobjects(ra, drrfo));
+		err = receive_read_payload_and_next_header(ra, 0, NULL);
+		return (err);
 	}
 	case DRR_WRITE:
 	{
-		struct drr_write *drrw = &ra->drr->drr_u.drr_write;
+		struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
 		arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
 		    drrw->drr_length);
 
-		err = restore_read_payload_and_next_header(ra,
+		err = receive_read_payload_and_next_header(ra,
 		    drrw->drr_length, abuf->b_data);
-		if (err != 0)
-			return (err);
-		err = restore_write(ra, drrw, abuf);
-		/* if restore_write() is successful, it consumes the arc_buf */
-		if (err != 0)
+		if (err != 0) {
 			dmu_return_arcbuf(abuf);
+			return (err);
+		}
+		ra->rrd->write_buf = abuf;
+		receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
+		    drrw->drr_length);
 		return (err);
 	}
 	case DRR_WRITE_BYREF:
 	{
-		struct drr_write_byref *drrwbr =
-		    &ra->drr->drr_u.drr_write_byref;
-		err = restore_read_payload_and_next_header(ra, 0, NULL);
-		if (err != 0)
-			return (err);
-		return (restore_write_byref(ra, drrwbr));
+		struct drr_write_byref *drrwb =
+		    &ra->rrd->header.drr_u.drr_write_byref;
+		err = receive_read_payload_and_next_header(ra, 0, NULL);
+		receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset,
+		    drrwb->drr_length);
+		return (err);
 	}
 	case DRR_WRITE_EMBEDDED:
 	{
 		struct drr_write_embedded *drrwe =
-		    &ra->drr->drr_u.drr_write_embedded;
-		err = restore_read_payload_and_next_header(ra,
-		    P2ROUNDUP(drrwe->drr_psize, 8), ra->buf);
-		if (err != 0)
+		    &ra->rrd->header.drr_u.drr_write_embedded;
+		uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
+		void *buf = kmem_zalloc(size, KM_SLEEP);
+
+		err = receive_read_payload_and_next_header(ra, size, buf);
+		if (err != 0) {
+			kmem_free(buf, size);
 			return (err);
-		return (restore_write_embedded(ra, drrwe, ra->buf));
+		}
+
+		receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset,
+		    drrwe->drr_length);
+		return (err);
 	}
 	case DRR_FREE:
 	{
-		struct drr_free *drrf = &ra->drr->drr_u.drr_free;
-		err = restore_read_payload_and_next_header(ra, 0, NULL);
-		if (err != 0)
-			return (err);
-		return (restore_free(ra, drrf));
+		/*
+		 * It might be beneficial to prefetch indirect blocks here, but
+		 * we don't really have the data to decide for sure.
+		 */
+		err = receive_read_payload_and_next_header(ra, 0, NULL);
+		return (err);
 	}
 	case DRR_END:
 	{
-		struct drr_end *drre = &ra->drr->drr_u.drr_end;
+		struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
 		if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
 			return (SET_ERROR(EINVAL));
 		return (0);
 	}
 	case DRR_SPILL:
 	{
-		struct drr_spill *drrs = &ra->drr->drr_u.drr_spill;
-		err = restore_read_payload_and_next_header(ra,
-		    drrs->drr_length, ra->buf);
+		struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
+		void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP);
+		err = receive_read_payload_and_next_header(ra, drrs->drr_length,
+		    buf);
 		if (err != 0)
-			return (err);
-		return (restore_spill(ra, drrs, ra->buf));
+			kmem_free(buf, drrs->drr_length);
+		return (err);
 	}
 	default:
 		return (SET_ERROR(EINVAL));
@@ -1948,6 +2201,118 @@ restore_process_record(struct restorearg *ra)
 }
 
 /*
+ * Commit the records to the pool.
+ */
+static int
+receive_process_record(struct receive_writer_arg *rwa,
+    struct receive_record_arg *rrd)
+{
+	int err;
+
+	switch (rrd->header.drr_type) {
+	case DRR_OBJECT:
+	{
+		struct drr_object *drro = &rrd->header.drr_u.drr_object;
+		err = receive_object(rwa, drro, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
+		return (err);
+	}
+	case DRR_FREEOBJECTS:
+	{
+		struct drr_freeobjects *drrfo =
+		    &rrd->header.drr_u.drr_freeobjects;
+		return (receive_freeobjects(rwa, drrfo));
+	}
+	case DRR_WRITE:
+	{
+		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
+		err = receive_write(rwa, drrw, rrd->write_buf);
+		/* if receive_write() is successful, it consumes the arc_buf */
+		if (err != 0)
+			dmu_return_arcbuf(rrd->write_buf);
+		rrd->write_buf = NULL;
+		rrd->payload = NULL;
+		return (err);
+	}
+	case DRR_WRITE_BYREF:
+	{
+		struct drr_write_byref *drrwbr =
+		    &rrd->header.drr_u.drr_write_byref;
+		return (receive_write_byref(rwa, drrwbr));
+	}
+	case DRR_WRITE_EMBEDDED:
+	{
+		struct drr_write_embedded *drrwe =
+		    &rrd->header.drr_u.drr_write_embedded;
+		err = receive_write_embedded(rwa, drrwe, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
+		return (err);
+	}
+	case DRR_FREE:
+	{
+		struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+		return (receive_free(rwa, drrf));
+	}
+	case DRR_SPILL:
+	{
+		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+		err = receive_spill(rwa, drrs, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
+		return (err);
+	}
+	default:
+		return (SET_ERROR(EINVAL));
+	}
+}
+
+/*
+ * dmu_recv_stream's worker thread; pull records off the queue, and then call
+ * receive_process_record.  When we're done, signal the main thread and exit.
+ */
+static void
+receive_writer_thread(void *arg)
+{
+	struct receive_writer_arg *rwa = arg;
+	struct receive_record_arg *rrd;
+	for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
+	    rrd = bqueue_dequeue(&rwa->q)) {
+		/*
+		 * If there's an error, the main thread will stop putting things
+		 * on the queue, but we need to clear everything in it before we
+		 * can exit.
+		 */
+		if (rwa->err == 0) {
+			rwa->err = receive_process_record(rwa, rrd);
+		} else if (rrd->write_buf != NULL) {
+			dmu_return_arcbuf(rrd->write_buf);
+			rrd->write_buf = NULL;
+			rrd->payload = NULL;
+		} else if (rrd->payload != NULL) {
+			kmem_free(rrd->payload, rrd->payload_size);
+			rrd->payload = NULL;
+		}
+		kmem_free(rrd, sizeof (*rrd));
+	}
+	kmem_free(rrd, sizeof (*rrd));
+	mutex_enter(&rwa->mutex);
+	rwa->done = B_TRUE;
+	cv_signal(&rwa->cv);
+	mutex_exit(&rwa->mutex);
+}
+
+/*
+ * Read in the stream's records, one by one, and apply them to the pool.  There
+ * are two threads involved; the thread that calls this function will spin up a
+ * worker thread, read the records off the stream one by one, and issue
+ * prefetches for any necessary indirect blocks.  It will then push the records
+ * onto an internal blocking queue.  The worker thread will pull the records off
+ * the queue, and actually write the data into the DMU.  This way, the worker
+ * thread doesn't have to wait for reads to complete, since everything it needs
+ * (the indirect blocks) will be prefetched.
+ *
  * NB: callers *must* call dmu_recv_end() if this succeeds.
  */
 int
@@ -1955,17 +2320,16 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
     int cleanup_fd, uint64_t *action_handlep)
 {
 	int err = 0;
-	struct restorearg ra = { 0 };
+	struct receive_arg ra = { 0 };
+	struct receive_writer_arg rwa = { 0 };
 	int featureflags;
 
 	ra.byteswap = drc->drc_byteswap;
 	ra.cksum = drc->drc_cksum;
 	ra.vp = vp;
 	ra.voff = *voffp;
-	ra.bufsize = SPA_MAXBLOCKSIZE;
-	ra.drr = kmem_alloc(sizeof (*ra.drr), KM_SLEEP);
-	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
-	ra.next_drr = kmem_alloc(sizeof (*ra.next_drr), KM_SLEEP);
+	list_create(&ra.ignore_obj_list, sizeof (struct receive_ign_obj_node),
+	    offsetof(struct receive_ign_obj_node, node));
 
 	/* these were verified in dmu_recv_begin */
 	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
@@ -1996,48 +2360,92 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
 		}
 
 		if (*action_handlep == 0) {
-			ra.guid_to_ds_map =
+			rwa.guid_to_ds_map =
 			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
-			avl_create(ra.guid_to_ds_map, guid_compare,
+			avl_create(rwa.guid_to_ds_map, guid_compare,
 			    sizeof (guid_map_entry_t),
 			    offsetof(guid_map_entry_t, avlnode));
 			err = zfs_onexit_add_cb(minor,
-			    free_guid_map_onexit, ra.guid_to_ds_map,
+			    free_guid_map_onexit, rwa.guid_to_ds_map,
 			    action_handlep);
 			if (ra.err != 0)
 				goto out;
 		} else {
 			err = zfs_onexit_cb_data(minor, *action_handlep,
-			    (void **)&ra.guid_to_ds_map);
+			    (void **)&rwa.guid_to_ds_map);
 			if (ra.err != 0)
 				goto out;
 		}
 
-		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
+		drc->drc_guid_to_ds_map = rwa.guid_to_ds_map;
 	}
 
-	err = restore_read_payload_and_next_header(&ra, 0, NULL);
-	if (err != 0)
+	err = receive_read_payload_and_next_header(&ra, 0, NULL);
+	if (err)
 		goto out;
 
-	for (;;) {
-		void *tmp;
+	(void) bqueue_init(&rwa.q, zfs_recv_queue_length,
+	    offsetof(struct receive_record_arg, node));
+	cv_init(&rwa.cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&rwa.mutex, NULL, MUTEX_DEFAULT, NULL);
+	rwa.os = ra.os;
+	rwa.byteswap = drc->drc_byteswap;
 
+	(void) thread_create(NULL, 0, receive_writer_thread, &rwa, 0, curproc,
+	    TS_RUN, minclsyspri);
+	/*
+	 * We're reading rwa.err without locks, which is safe since we are the
+	 * only reader, and the worker thread is the only writer.  It's ok if we
+	 * miss a write for an iteration or two of the loop, since the writer
+	 * thread will keep freeing records we send it until we send it an eos
+	 * marker.
+	 *
+	 * We can leave this loop in 3 ways:  First, if rwa.err is
+	 * non-zero.  In that case, the writer thread will free the rrd we just
+	 * pushed.  Second, if we're interrupted; in that case, either it's the
+	 * first loop and ra.rrd was never allocated, or it's later, and ra.rrd
+	 * has been handed off to the writer thread who will free it.  Finally,
+	 * if receive_read_record fails or we're at the end of the stream, then
+	 * we free ra.rrd and exit.
+	 */
+	while (rwa.err == 0) {
 		if (issig(JUSTLOOKING) && issig(FORREAL)) {
 			err = SET_ERROR(EINTR);
 			break;
 		}
 
-		tmp = ra.next_drr;
-		ra.next_drr = ra.drr;
-		ra.drr = tmp;
+		ASSERT3P(ra.rrd, ==, NULL);
+		ra.rrd = ra.next_rrd;
+		ra.next_rrd = NULL;
+		/* Allocates and loads header into ra.next_rrd */
+		err = receive_read_record(&ra);
 
-		/* process ra.drr, read in ra.next_drr */
-		err = restore_process_record(&ra);
-		if (err != 0)
-			break;
-		if (ra.drr->drr_type == DRR_END)
+		if (ra.rrd->header.drr_type == DRR_END || err != 0) {
+			kmem_free(ra.rrd, sizeof (*ra.rrd));
+			ra.rrd = NULL;
 			break;
+		}
+
+		bqueue_enqueue(&rwa.q, ra.rrd,
+		    sizeof (struct receive_record_arg) + ra.rrd->payload_size);
+		ra.rrd = NULL;
+	}
+	if (ra.next_rrd == NULL)
+		ra.next_rrd = kmem_zalloc(sizeof (*ra.next_rrd), KM_SLEEP);
+	ra.next_rrd->eos_marker = B_TRUE;
+	bqueue_enqueue(&rwa.q, ra.next_rrd, 1);
+
+	mutex_enter(&rwa.mutex);
+	while (!rwa.done) {
+		cv_wait(&rwa.cv, &rwa.mutex);
 	}
+	mutex_exit(&rwa.mutex);
+
+	cv_destroy(&rwa.cv);
+	mutex_destroy(&rwa.mutex);
+	bqueue_destroy(&rwa.q);
+	if (err == 0)
+		err = rwa.err;
 
 out:
 	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
@@ -2051,10 +2459,13 @@ out:
 		dmu_recv_cleanup_ds(drc);
 	}
 
-	kmem_free(ra.drr, sizeof (*ra.drr));
-	kmem_free(ra.buf, ra.bufsize);
-	kmem_free(ra.next_drr, sizeof (*ra.next_drr));
 	*voffp = ra.voff;
+	for (struct receive_ign_obj_node *n =
+	    list_remove_head(&ra.ignore_obj_list); n != NULL;
+	    n = list_remove_head(&ra.ignore_obj_list)) {
+		kmem_free(n, sizeof (*n));
+	}
+	list_destroy(&ra.ignore_obj_list);
 	return (err);
 }
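The core pattern this change builds both dmu_send_impl and dmu_recv_stream around is a single producer thread feeding a single consumer through a blocking queue, terminated by an explicit end-of-stream record rather than by killing the thread. The kernel code uses the illumos bqueue_t; the following is a minimal userland sketch of the same flow using POSIX threads, with hypothetical names (record_t, queue_t, queue_enqueue, queue_dequeue) that are not part of the patch.

#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-ins for send_block_record and bqueue_t. */
typedef struct record {
	int eos_marker;			/* marks the end of the stream */
	uint64_t object;		/* stand-in for a block's bookmark */
	struct record *next;
} record_t;

typedef struct queue {
	pthread_mutex_t lock;
	pthread_cond_t nonempty;
	record_t *head, *tail;
} queue_t;

/*
 * The kernel bqueue also blocks the producer once the queue holds
 * zfs_send_queue_length bytes; that backpressure is omitted here.
 */
static void
queue_enqueue(queue_t *q, record_t *r)
{
	pthread_mutex_lock(&q->lock);
	r->next = NULL;
	if (q->tail == NULL)
		q->head = r;
	else
		q->tail->next = r;
	q->tail = r;
	pthread_cond_signal(&q->nonempty);
	pthread_mutex_unlock(&q->lock);
}

static record_t *
queue_dequeue(queue_t *q)
{
	pthread_mutex_lock(&q->lock);
	while (q->head == NULL)
		pthread_cond_wait(&q->nonempty, &q->lock);
	record_t *r = q->head;
	if ((q->head = r->next) == NULL)
		q->tail = NULL;
	pthread_mutex_unlock(&q->lock);
	return (r);
}

/* Producer: like send_traverse_thread, always finish with an EOS record. */
static void *
producer(void *arg)
{
	queue_t *q = arg;
	for (uint64_t obj = 1; obj <= 5; obj++) {
		record_t *r = calloc(1, sizeof (*r));
		r->object = obj;
		queue_enqueue(q, r);
	}
	record_t *eos = calloc(1, sizeof (*eos));
	eos->eos_marker = 1;
	queue_enqueue(q, eos);
	return (NULL);
}

int
main(void)
{
	queue_t q = { PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER,
	    NULL, NULL };
	pthread_t tid;
	pthread_create(&tid, NULL, producer, &q);

	/* Consumer: like the do_dump loop in dmu_send_impl. */
	record_t *r = queue_dequeue(&q);
	while (!r->eos_marker) {
		printf("processing record for object %llu\n",
		    (unsigned long long)r->object);
		free(r);	/* get_next_record frees the consumed record */
		r = queue_dequeue(&q);
	}
	free(r);		/* free the EOS marker, as dmu_send_impl does */
	pthread_join(tid, NULL);
	return (0);
}

Note how the patch handles errors with the same shape: the consumer sets a cancel flag (or lets the writer thread see rwa.err) and then keeps dequeueing and freeing records until it reaches the EOS marker, so the queue is drained before bqueue_destroy is called.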