summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMatthew Ahrens <mahrens@delphix.com>2013-08-20 20:11:52 -0800
committerChristopher Siden <chris.siden@delphix.com>2013-08-20 21:11:52 -0700
commit713d6c208802cfbb806329ec0d154b641b80c355 (patch)
treed8661b4f2495c862cb92c82a928dbcc22008a000
parent8a16d130bdea488abf40f6692135d3c5b462fda4 (diff)
downloadillumos-joyent-713d6c208802cfbb806329ec0d154b641b80c355.tar.gz
4047 panic from dbuf_free_range() from dmu_free_object() while doing zfs receive
Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Approved by: Dan McDonald <danmcd@nexenta.com>
-rw-r--r--usr/src/uts/common/fs/zfs/dbuf.c31
-rw-r--r--usr/src/uts/common/fs/zfs/dmu.c133
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_send.c2
-rw-r--r--usr/src/uts/common/fs/zfs/dmu_tx.c3
-rw-r--r--usr/src/uts/common/fs/zfs/dnode.c7
-rw-r--r--usr/src/uts/common/fs/zfs/dsl_destroy.c2
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dmu.h4
-rw-r--r--usr/src/uts/common/fs/zfs/sys/dnode.h4
8 files changed, 97 insertions, 89 deletions
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 4ac8b1bdb6..4056775ea7 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -40,6 +40,12 @@
#include <sys/sa.h>
#include <sys/sa_impl.h>
+/*
+ * Number of times that zfs_free_range() took the slow path while doing
+ * a zfs receive. A nonzero value indicates a potential performance problem.
+ */
+uint64_t zfs_free_range_recv_miss;
+
static void dbuf_destroy(dmu_buf_impl_t *db);
static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
@@ -819,20 +825,22 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
}
dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
- if (dmu_objset_is_receiving(dn->dn_objset)) {
+ mutex_enter(&dn->dn_dbufs_mtx);
+ if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
+ /* There can't be any dbufs in this range; no need to search. */
+ mutex_exit(&dn->dn_dbufs_mtx);
+ return;
+ } else if (dmu_objset_is_receiving(dn->dn_objset)) {
/*
- * When processing a free record from a zfs receive,
- * there should have been no previous modifications to the
- * data in this range. Therefore there should be no dbufs
- * in the range. Searching dn_dbufs for these non-existent
- * dbufs can be very expensive, so simply ignore this.
+ * If we are receiving, we expect there to be no dbufs in
+ * the range to be freed, because receive modifies each
+ * block at most once, and in offset order. If this is
+ * not the case, it can lead to performance problems,
+ * so note that we unexpectedly took the slow path.
*/
- VERIFY3P(dbuf_find(dn, 0, start), ==, NULL);
- VERIFY3P(dbuf_find(dn, 0, end), ==, NULL);
- return;
+ atomic_inc_64(&zfs_free_range_recv_miss);
}
- mutex_enter(&dn->dn_dbufs_mtx);
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
@@ -1720,6 +1728,9 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
return (odb);
}
list_insert_head(&dn->dn_dbufs, db);
+ if (db->db_level == 0 && db->db_blkid >=
+ dn->dn_unlisted_l0_blkid)
+ dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx);
arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 7fd54dfb1f..f81c4ebafc 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -569,98 +569,93 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
* the end so that the file gets shorter over time (if we crashes in the
* middle, this will leave us in a better state). We find allocated file
* data by simply searching the allocated level 1 indirects.
+ *
+ * On input, *start should be the first offset that does not need to be
+ * freed (e.g. "offset + length"). On return, *start will be the first
+ * offset that should be freed.
*/
static int
-get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit)
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
{
- uint64_t len = *start - limit;
- uint64_t blkcnt = 0;
- uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
+ uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
+ /* bytes of data covered by a level-1 indirect block */
uint64_t iblkrange =
dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
- ASSERT(limit <= *start);
+ ASSERT3U(minimum, <=, *start);
- if (len <= iblkrange * maxblks) {
- *start = limit;
+ if (*start - minimum <= iblkrange * maxblks) {
+ *start = minimum;
return (0);
}
ASSERT(ISP2(iblkrange));
- while (*start > limit && blkcnt < maxblks) {
+ for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
int err;
- /* find next allocated L1 indirect */
+ /*
+ * dnode_next_offset(BACKWARDS) will find an allocated L1
+ * indirect block at or before the input offset. We must
+ * decrement *start so that it is at the end of the region
+ * to search.
+ */
+ (*start)--;
err = dnode_next_offset(dn,
DNODE_FIND_BACKWARDS, start, 2, 1, 0);
- /* if there are no more, then we are done */
+ /* if there are no indirect blocks before start, we are done */
if (err == ESRCH) {
- *start = limit;
- return (0);
- } else if (err) {
+ *start = minimum;
+ break;
+ } else if (err != 0) {
return (err);
}
- blkcnt += 1;
- /* reset offset to end of "next" block back */
+ /* set start to the beginning of this L1 indirect */
*start = P2ALIGN(*start, iblkrange);
- if (*start <= limit)
- *start = limit;
- else
- *start -= 1;
}
+ if (*start < minimum)
+ *start = minimum;
return (0);
}
static int
dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
- uint64_t length, boolean_t free_dnode)
+ uint64_t length)
{
- dmu_tx_t *tx;
- uint64_t object_size, start, end, len;
- boolean_t trunc = (length == DMU_OBJECT_END);
- int align, err;
-
- align = 1 << dn->dn_datablkshift;
- ASSERT(align > 0);
- object_size = align == 1 ? dn->dn_datablksz :
- (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
-
- end = offset + length;
- if (trunc || end > object_size)
- end = object_size;
- if (end <= offset)
+ uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
+ int err;
+
+ if (offset >= object_size)
return (0);
- length = end - offset;
- while (length) {
- start = end;
- /* assert(offset <= start) */
- err = get_next_chunk(dn, &start, offset);
+ if (length == DMU_OBJECT_END || offset + length > object_size)
+ length = object_size - offset;
+
+ while (length != 0) {
+ uint64_t chunk_end, chunk_begin;
+
+ chunk_end = chunk_begin = offset + length;
+
+ /* move chunk_begin backwards to the beginning of this chunk */
+ err = get_next_chunk(dn, &chunk_begin, offset);
if (err)
return (err);
- len = trunc ? DMU_OBJECT_END : end - start;
+ ASSERT3U(chunk_begin, >=, offset);
+ ASSERT3U(chunk_begin, <=, chunk_end);
- tx = dmu_tx_create(os);
- dmu_tx_hold_free(tx, dn->dn_object, start, len);
+ dmu_tx_t *tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, dn->dn_object,
+ chunk_begin, chunk_end - chunk_begin);
err = dmu_tx_assign(tx, TXG_WAIT);
if (err) {
dmu_tx_abort(tx);
return (err);
}
-
- dnode_free_range(dn, start, trunc ? -1 : len, tx);
-
- if (start == 0 && free_dnode) {
- ASSERT(trunc);
- dnode_free(dn, tx);
- }
-
- length -= end - start;
-
+ dnode_free_range(dn, chunk_begin, chunk_end - chunk_begin, tx);
dmu_tx_commit(tx);
- end = start;
+
+ length -= chunk_end - chunk_begin;
}
return (0);
}
@@ -675,38 +670,32 @@ dmu_free_long_range(objset_t *os, uint64_t object,
err = dnode_hold(os, object, FTAG, &dn);
if (err != 0)
return (err);
- err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
+ err = dmu_free_long_range_impl(os, dn, offset, length);
dnode_rele(dn, FTAG);
return (err);
}
int
-dmu_free_object(objset_t *os, uint64_t object)
+dmu_free_long_object(objset_t *os, uint64_t object)
{
- dnode_t *dn;
dmu_tx_t *tx;
int err;
- err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
- FTAG, &dn);
+ err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
if (err != 0)
return (err);
- if (dn->dn_nlevels == 1) {
- tx = dmu_tx_create(os);
- dmu_tx_hold_bonus(tx, object);
- dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
- err = dmu_tx_assign(tx, TXG_WAIT);
- if (err == 0) {
- dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
- dnode_free(dn, tx);
- dmu_tx_commit(tx);
- } else {
- dmu_tx_abort(tx);
- }
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, object);
+ dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+ err = dmu_object_free(os, object, tx);
+ dmu_tx_commit(tx);
} else {
- err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
+ dmu_tx_abort(tx);
}
- dnode_rele(dn, FTAG);
+
return (err);
}
diff --git a/usr/src/uts/common/fs/zfs/dmu_send.c b/usr/src/uts/common/fs/zfs/dmu_send.c
index ba1b9c753b..ca58e84a0f 100644
--- a/usr/src/uts/common/fs/zfs/dmu_send.c
+++ b/usr/src/uts/common/fs/zfs/dmu_send.c
@@ -1232,7 +1232,7 @@ restore_freeobjects(struct restorearg *ra, objset_t *os,
if (dmu_object_info(os, obj, NULL) != 0)
continue;
- err = dmu_free_object(os, obj);
+ err = dmu_free_long_object(os, obj);
if (err != 0)
return (err);
}
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index 8d23aa374c..dddaa0a16e 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -615,7 +615,8 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
* if they are blocksize-aligned.
*/
if (dn->dn_datablkshift == 0) {
- dmu_tx_count_write(txh, off, len);
+ if (off != 0 || len < dn->dn_datablksz)
+ dmu_tx_count_write(txh, off, len);
} else {
/* first block will be modified if it is not aligned */
if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
index 92996a6f12..f9f5497023 100644
--- a/usr/src/uts/common/fs/zfs/dnode.c
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -115,6 +115,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
dn->dn_id_flags = 0;
dn->dn_dbufs_count = 0;
+ dn->dn_unlisted_l0_blkid = 0;
list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
@@ -167,6 +168,7 @@ dnode_dest(void *arg, void *unused)
ASSERT0(dn->dn_id_flags);
ASSERT0(dn->dn_dbufs_count);
+ ASSERT0(dn->dn_unlisted_l0_blkid);
list_destroy(&dn->dn_dbufs);
}
@@ -472,6 +474,7 @@ dnode_destroy(dnode_t *dn)
dn->dn_newuid = 0;
dn->dn_newgid = 0;
dn->dn_id_flags = 0;
+ dn->dn_unlisted_l0_blkid = 0;
dmu_zfetch_rele(&dn->dn_zfetch);
kmem_cache_free(dnode_cache, dn);
@@ -702,6 +705,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
ASSERT(list_is_empty(&ndn->dn_dbufs));
list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
ndn->dn_dbufs_count = odn->dn_dbufs_count;
+ ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
ndn->dn_bonus = odn->dn_bonus;
ndn->dn_have_spill = odn->dn_have_spill;
ndn->dn_zio = odn->dn_zio;
@@ -736,6 +740,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
odn->dn_dbufs_count = 0;
+ odn->dn_unlisted_l0_blkid = 0;
odn->dn_bonus = NULL;
odn->dn_zfetch.zf_dnode = NULL;
@@ -1523,7 +1528,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
blkshift = dn->dn_datablkshift;
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- if (len == -1ULL) {
+ if (len == DMU_OBJECT_END) {
len = UINT64_MAX - off;
trunc = TRUE;
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_destroy.c b/usr/src/uts/common/fs/zfs/dsl_destroy.c
index ef46df8e17..b002df7561 100644
--- a/usr/src/uts/common/fs/zfs/dsl_destroy.c
+++ b/usr/src/uts/common/fs/zfs/dsl_destroy.c
@@ -896,7 +896,7 @@ dsl_destroy_head(const char *name)
for (uint64_t obj = 0; error == 0;
error = dmu_object_next(os, &obj, FALSE,
prev_snap_txg))
- (void) dmu_free_object(os, obj);
+ (void) dmu_free_long_object(os, obj);
/* sync out all frees */
txg_wait_synced(dmu_objset_pool(os), 0);
dmu_objset_disown(os, FTAG);
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 6e07a156dc..ab9a1bf4f1 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -21,7 +21,7 @@
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
@@ -581,7 +581,7 @@ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx);
int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size);
-int dmu_free_object(objset_t *os, uint64_t object);
+int dmu_free_long_object(objset_t *os, uint64_t object);
/*
* Convenience functions.
diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h
index c3de03d369..55b87bc394 100644
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Delphix. All rights reserved.
*/
#ifndef _SYS_DNODE_H
@@ -188,6 +188,8 @@ typedef struct dnode {
/* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */
uint32_t dn_dbufs_count; /* count of dn_dbufs */
+ /* There are no level-0 blocks of this blkid or higher in dn_dbufs */
+ uint64_t dn_unlisted_l0_blkid;
/* protected by os_lock: */
list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */