summary refs log tree commit diff
diff options
context:
space:
mode:
author Matthew Ahrens <mahrens@delphix.com> 2013-07-29 10:58:53 -0800
committer Christopher Siden <chris.siden@delphix.com> 2013-07-29 11:58:53 -0700
commit 2f3d878000c3b33cde13e16c4f0a1ab15d883a18 (patch)
tree 0e7b71ece8798be486f3080d2bdd2aa1d6a6ba74
parent 34f2f8cf94052481c81be2e134b94a00b501bf21 (diff)
download illumos-joyent-2f3d878000c3b33cde13e16c4f0a1ab15d883a18.tar.gz
3834 incremental replication of 'holey' file systems is slow
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
-rw-r--r-- usr/src/uts/common/fs/zfs/dbuf.c 20
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu_send.c 54
-rw-r--r-- usr/src/uts/common/fs/zfs/dmu_tx.c 49
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dmu_impl.h 6
-rw-r--r-- usr/src/uts/common/fs/zfs/sys/dmu_send.h 3
5 files changed, 110 insertions, 22 deletions
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 6993365c65..4ac8b1bdb6 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -27,6 +27,7 @@
#include <sys/zfs_context.h>
#include <sys/dmu.h>
+#include <sys/dmu_send.h>
#include <sys/dmu_impl.h>
#include <sys/dbuf.h>
#include <sys/dmu_objset.h>
@@ -796,9 +797,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
/*
* Evict (if its unreferenced) or clear (if its referenced) any level-0
* data blocks in the free range, so that any future readers will find
- * empty blocks. Also, if we happen accross any level-1 dbufs in the
+ * empty blocks. Also, if we happen across any level-1 dbufs in the
* range that have not already been marked dirty, mark them dirty so
* they stay in memory.
+ *
+ * This is a no-op if the dataset is in the middle of an incremental
+ * receive; see comment below for details.
*/
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
@@ -814,6 +818,20 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
last_l1 = end >> epbs;
}
dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
+
+ if (dmu_objset_is_receiving(dn->dn_objset)) {
+ /*
+ * When processing a free record from a zfs receive,
+ * there should have been no previous modifications to the
+ * data in this range. Therefore there should be no dbufs
+ * in the range. Searching dn_dbufs for these non-existent
+ * dbufs can be very expensive, so simply ignore this.
+ */
+ VERIFY3P(dbuf_find(dn, 0, start), ==, NULL);
+ VERIFY3P(dbuf_find(dn, 0, end), ==, NULL);
+ return;
+ }
+
mutex_enter(&dn->dn_dbufs_mtx);
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index d485d3ba01..ba1b9c753b 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -80,6 +80,32 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
{
struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
+ /*
+ * When we receive a free record, dbuf_free_range() assumes
+ * that the receiving system doesn't have any dbufs in the range
+ * being freed. This is always true because there is a one-record
+ * constraint: we only send one WRITE record for any given
+ * object+offset. We know that the one-record constraint is
+ * true because we always send data in increasing order by
+ * object,offset.
+ *
+ * If the increasing-order constraint ever changes, we should find
+ * another way to assert that the one-record constraint is still
+ * satisfied.
+ */
+ ASSERT(object > dsp->dsa_last_data_object ||
+ (object == dsp->dsa_last_data_object &&
+ offset > dsp->dsa_last_data_offset));
+
+ /*
+ * If we are doing a non-incremental send, then there can't
+ * be any data in the dataset we're receiving into. Therefore
+ * a free record would simply be a no-op. Save space by not
+ * sending it to begin with.
+ */
+ if (!dsp->dsa_incremental)
+ return (0);
+
if (length != -1ULL && offset + length < offset)
length = -1ULL;
@@ -146,6 +172,15 @@ dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
{
struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
+ /*
+ * We send data in increasing object, offset order.
+ * See comment in dump_free() for details.
+ */
+ ASSERT(object > dsp->dsa_last_data_object ||
+ (object == dsp->dsa_last_data_object &&
+ offset > dsp->dsa_last_data_offset));
+ dsp->dsa_last_data_object = object;
+ dsp->dsa_last_data_offset = offset + blksz - 1;
/*
* If there is any kind of pending aggregation (currently either
@@ -213,6 +248,10 @@ dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
+ /* See comment in dump_free(). */
+ if (!dsp->dsa_incremental)
+ return (0);
+
/*
* If there is a pending op, but it's not PENDING_FREEOBJECTS,
* push it out, since free block aggregation can only be done for
@@ -289,9 +328,9 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
return (SET_ERROR(EINTR));
- /* free anything past the end of the file */
+ /* Free anything past the end of the file. */
if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
- (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
+ (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
return (SET_ERROR(EINTR));
if (dsp->dsa_err != 0)
return (SET_ERROR(EINTR));
@@ -474,6 +513,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
dsp->dsa_toguid = ds->ds_phys->ds_guid;
ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
dsp->dsa_pending_op = PENDING_NONE;
+ dsp->dsa_incremental = (fromtxg != 0);
mutex_enter(&ds->ds_sendstream_lock);
list_insert_head(&ds->ds_sendstreams, dsp);
@@ -1765,3 +1805,13 @@ dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
else
return (dmu_recv_existing_end(drc));
}
+
+/* B_TRUE iff a receive (dmu_recv_tag) currently owns os's dataset. */
+boolean_t
+dmu_objset_is_receiving(objset_t *os)
+{
+	if (os->os_dsl_dataset == NULL)
+		return (B_FALSE);
+
+	return (os->os_dsl_dataset->ds_owner == dmu_recv_tag);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index 5338425326..929b0c4d97 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -587,8 +587,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
{
dmu_tx_hold_t *txh;
dnode_t *dn;
- uint64_t start, end, i;
- int err, shift;
+ int err;
zio_t *zio;
ASSERT(tx->tx_txg == 0);
@@ -599,34 +598,48 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
return;
dn = txh->txh_dnode;
- /* first block */
- if (off != 0)
- dmu_tx_count_write(txh, off, 1);
- /* last block */
- if (len != DMU_OBJECT_END)
- dmu_tx_count_write(txh, off+len, 1);
-
- dmu_tx_count_dnode(txh);
-
if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
return;
if (len == DMU_OBJECT_END)
len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
+ dmu_tx_count_dnode(txh);
+
/*
- * For i/o error checking, read the first and last level-0
- * blocks, and all the level-1 blocks. The above count_write's
- * have already taken care of the level-0 blocks.
+ * For i/o error checking, we read the first and last level-0
+ * blocks if they are not aligned, and all the level-1 blocks.
+ *
+ * Note: dbuf_free_range() assumes that we have not instantiated
+ * any level-0 dbufs that will be completely freed. Therefore we must
+ * exercise care to not read or count the first and last blocks
+ * if they are blocksize-aligned.
+ */
+ if (dn->dn_datablkshift == 0) {
+ dmu_tx_count_write(txh, off, len);
+ } else {
+ /* first block will be modified if it is not aligned */
+ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
+ dmu_tx_count_write(txh, off, 1);
+ /* last block will be modified if it is not aligned */
+ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
+ dmu_tx_count_write(txh, off+len, 1);
+ }
+
+ /*
+ * Check level-1 blocks.
*/
if (dn->dn_nlevels > 1) {
- shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ int shift = dn->dn_datablkshift + dn->dn_indblkshift -
SPA_BLKPTRSHIFT;
- start = off >> shift;
- end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
+ uint64_t start = off >> shift;
+ uint64_t end = (off + len) >> shift;
+
+ ASSERT(dn->dn_datablkshift != 0);
+ ASSERT(dn->dn_indblkshift != 0);
zio = zio_root(tx->tx_pool->dp_spa,
NULL, NULL, ZIO_FLAG_CANFAIL);
- for (i = start; i <= end; i++) {
+ for (uint64_t i = start; i <= end; i++) {
uint64_t ibyte = i << shift;
err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
i = ibyte >> shift;
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
index defcdb29ca..98057ee36f 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_impl.h
@@ -21,7 +21,10 @@
/*
* Copyright 2010 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ */
+/*
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_DMU_IMPL_H
@@ -290,6 +293,9 @@ typedef struct dmu_sendarg {
uint64_t dsa_toguid;
int dsa_err;
dmu_pendop_t dsa_pending_op;
+ boolean_t dsa_incremental;
+ uint64_t dsa_last_data_object;
+ uint64_t dsa_last_data_offset;
} dmu_sendarg_t;
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_send.h b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
index 6442b20f7b..65514b7620 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_send.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_send.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
@@ -63,5 +63,6 @@ int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
int cleanup_fd, uint64_t *action_handlep);
int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);
+boolean_t dmu_objset_is_receiving(objset_t *os);
#endif /* _DMU_SEND_H */