author     eschrock <none@none>  2006-03-03 20:08:16 -0800
committer  eschrock <none@none>  2006-03-03 20:08:16 -0800
commit     ea8dc4b6d2251b437950c0056bc626b311c73c27 (patch)
tree       69cc1808568f2ef8fd1e21c61e186ba452ea64da /usr/src/uts/common
parent     5c18afbc96a46bc3a9e6f3667512daa374d6cd79 (diff)
PSARC 2006/077 zpool clear
PSARC 2006/139 FMA for ZFS
6284889 arc should replace the znode cache
6333006 DMU & DSL should not panic upon I/O error
6333092 concurrent reads to a file not scaling with number of readers
6338081 ZFS/FMA phase 1
6338386 need persistent error log
6341326 i/o error causes arc buf hash table corruption
6341639 zfs backup/restore should compute/verify checksum of backup stream
6348002 out of space due to changing properties
6354724 inaccurate error message from zfs restore
6354872 dmu_sync() blows predictive accounting
6355416 zpool scrubbing consumes all memory, system hung
6363995 df should only load libzfs when it encounters a ZFS filesystem
6366320 zfs backup/restore doesn't like signals
6368892 mount -m support needed for legacy mounts
6368902 boot archive fstat support needed for ZFS Mountroot
6369424 BFU complains when bfu'ing a ZFS root filesystem
6374062 mountroot support needed for ZFS
6376356 dirtying dbuf obj=43 lvl=0 blkid=0 but not tx_held
6378391 unused members of dmu_objset_stats_t
6378392 clean up zfs_cmd_t structure
6378685 buf_init should allocate its hash table more carefully
6378976 ziltest should be a first class citizen
6381086 zdb segfaults if there is a spa deferred-free bplist
6381203 deadlock due to i/o while assigning (tc_lock held)
6381209 freed space is not immediately available
6381344 'zpool clear'
6381345 FAULTED devices should really be UNAVAIL
6381346 import should mark devices as persistently unavailable
6383272 recursive mutex_enter() during log replay with zfs root
6386326 origin property is not displayed
6386354 libzfs does too much in its _init section, calls exit(1)
6386624 zpool should not complain about non-existent devices from libdiskmgt
6386910 spa needs to be i/o error hardened
6387735 need a mechanism to inject faults into ZFS
6387736 internal ZFS utilities should be placed in an ON-private package
6389928 libzfs should ship a lint library
6390609 malformed vdev config panics on zpool_create()
6390677 version number checking makes upgrades challenging
6390713 ztest hangs in zil_suspend()
6391873 metadata compression should be turned back on
6392113 ztest sometimes reports leaked blocks because ZIL isn't resilvered
6393004 minor memory leak in unique_insert()
Diffstat (limited to 'usr/src/uts/common')
-rw-r--r--  usr/src/uts/common/Makefile.files | 5
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c | 1130
-rw-r--r--  usr/src/uts/common/fs/zfs/bplist.c | 106
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c | 897
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c | 619
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_object.c | 47
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_objset.c | 154
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_traverse.c | 73
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_tx.c | 251
-rw-r--r--  usr/src/uts/common/fs/zfs/dnode.c | 122
-rw-r--r--  usr/src/uts/common/fs/zfs/dnode_sync.c | 91
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dataset.c | 308
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dir.c | 164
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_pool.c | 69
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_prop.c | 37
-rw-r--r--  usr/src/uts/common/fs/zfs/fletcher.c | 53
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c | 11
-rw-r--r--  usr/src/uts/common/fs/zfs/spa.c | 815
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_config.c | 58
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_errlog.c | 436
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_misc.c | 116
-rw-r--r--  usr/src/uts/common/fs/zfs/space_map.c | 13
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/arc.h | 24
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/bplist.h | 11
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dbuf.h | 37
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu.h | 75
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_objset.h | 19
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_traverse.h | 18
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu_tx.h | 15
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dnode.h | 34
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_dataset.h | 15
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_dir.h | 15
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_pool.h | 9
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/refcount.h | 9
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa.h | 48
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa_impl.h | 40
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev.h | 18
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 13
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zap_impl.h | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_acl.h | 8
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h | 31
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_znode.h | 17
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio.h | 94
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio_checksum.h | 9
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio_impl.h | 10
-rw-r--r--  usr/src/uts/common/fs/zfs/uberblock.c | 13
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c | 392
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_cache.c | 10
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_disk.c | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_file.c | 10
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_label.c | 37
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_mirror.c | 17
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_queue.c | 45
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_raidz.c | 54
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_root.c | 14
-rw-r--r--  usr/src/uts/common/fs/zfs/zap.c | 324
-rw-r--r--  usr/src/uts/common/fs/zfs/zap_micro.c | 19
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_acl.c | 52
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_dir.c | 25
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_fm.c | 316
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c | 173
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vfsops.c | 669
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c | 140
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_znode.c | 366
-rw-r--r--  usr/src/uts/common/fs/zfs/zil.c | 39
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c | 233
-rw-r--r--  usr/src/uts/common/fs/zfs/zio_checksum.c | 10
-rw-r--r--  usr/src/uts/common/fs/zfs/zio_inject.c | 315
-rw-r--r--  usr/src/uts/common/fs/zfs/zvol.c | 15
-rw-r--r--  usr/src/uts/common/krtld/kobj.c | 63
-rw-r--r--  usr/src/uts/common/krtld/kobj_stubs.c | 12
-rw-r--r--  usr/src/uts/common/krtld/mapfile | 12
-rw-r--r--  usr/src/uts/common/os/fm.c | 36
-rw-r--r--  usr/src/uts/common/os/modsysfile.c | 16
-rw-r--r--  usr/src/uts/common/os/policy.c | 39
-rw-r--r--  usr/src/uts/common/sys/Makefile | 8
-rw-r--r--  usr/src/uts/common/sys/Makefile.syshdrs | 11
-rw-r--r--  usr/src/uts/common/sys/fm/fs/zfs.h | 75
-rw-r--r--  usr/src/uts/common/sys/fm/protocol.h | 13
-rw-r--r--  usr/src/uts/common/sys/fs/zfs.h | 20
-rw-r--r--  usr/src/uts/common/sys/kobj.h | 7
-rw-r--r--  usr/src/uts/common/sys/policy.h | 6
-rw-r--r--  usr/src/uts/common/sys/sysconf.h | 8
83 files changed, 6432 insertions, 3331 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index f2d155fd25..587e9e1535 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -864,6 +864,7 @@ ZFS_COMMON_OBJS += \
sha256.o \
spa.o \
spa_config.o \
+ spa_errlog.o \
spa_misc.o \
space_map.o \
txg.o \
@@ -882,10 +883,12 @@ ZFS_COMMON_OBJS += \
zap_leaf.o \
zap_micro.o \
zfs_byteswap.o \
+ zfs_fm.o \
zil.o \
zio.o \
zio_checksum.o \
- zio_compress.o
+ zio_compress.o \
+ zio_inject.o
ZFS_SHARED_OBJS += \
zfs_namecheck.o \
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index bd8a110990..904e746721 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -28,8 +28,8 @@
/*
* DVA-based Adjustable Replacement Cache
*
- * While much of the theory of operation and algorithms used here
- * are based on the self-tuning, low overhead replacement cache
+ * While much of the theory of operation used here is
+ * based on the self-tuning, low overhead replacement cache
* presented by Megiddo and Modha at FAST 2003, there are some
* significant differences:
*
@@ -98,6 +98,15 @@
* must use: mutex_tryenter() to avoid deadlock. Also note that
* the "top" state mutex must be held before the "bot" state mutex.
*
+ * Arc buffers may have an associated eviction callback function.
+ * This function will be invoked prior to removing the buffer (e.g.
+ * in arc_do_user_evicts()). Note however that the data associated
+ * with the buffer may be evicted prior to the callback. The callback
+ * must be made with *no locks held* (to prevent deadlock). Additionally,
+ * the users of callbacks must ensure that their private data is
+ * protected from simultaneous callbacks from arc_buf_evict()
+ * and arc_do_user_evicts().
+ *
* Note that the majority of the performance stats are manipulated
* with atomic operations.
*/
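
The eviction-callback contract above is easiest to see from the consumer side. Below is a minimal sketch of a hypothetical consumer; my_user_t and my_evict_cb are invented for illustration and are not part of this change, but arc_set_callback() is the registration routine the patch adds:

/*
 * Illustrative consumer of the eviction callback (not in this patch).
 * The efunc is invoked with no ARC locks held, so the consumer uses
 * its own lock to guard against a concurrent eviction.
 */
typedef struct my_user {
	kmutex_t	mu_lock;	/* guards mu_buf */
	arc_buf_t	*mu_buf;	/* may be cleared by eviction */
} my_user_t;

static int
my_evict_cb(arc_buf_t *buf)
{
	my_user_t *mu = buf->b_private;

	mutex_enter(&mu->mu_lock);
	if (mu->mu_buf == buf)
		mu->mu_buf = NULL;	/* forget the evicted buffer */
	mutex_exit(&mu->mu_lock);
	return (0);			/* 0 == eviction handled */
}

/* after a successful read that set mu->mu_buf: */
arc_set_callback(mu->mu_buf, my_evict_cb, mu);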
@@ -136,10 +145,10 @@ static int arc_dead;
/*
* Note that buffers can be in one of 5 states:
* ARC_anon - anonymous (discussed below)
- * ARC_mru_top - recently used, currently cached
- * ARC_mru_bot - recently used, no longer in cache
- * ARC_mfu_top - frequently used, currently cached
- * ARC_mfu_bot - frequently used, no longer in cache
+ * ARC_mru - recently used, currently cached
+ * ARC_mru_ghost - recently used, no longer in cache
+ * ARC_mfu - frequently used, currently cached
+ * ARC_mfu_ghost - frequently used, no longer in cache
* When there are no active references to a buffer, it is
* linked onto one of the lists in arc. These are the
* only buffers that can be evicted or deleted.
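
Taken together, the renamed states trace the following buffer lifecycle (an illustrative summary, not text from the patch):

/*
 * Illustrative lifecycle through the renamed states:
 *
 *	arc_buf_alloc()       -> ARC_anon       dirty, ref'd, no DVA yet
 *	DVA assigned on write -> ARC_mru        cached, recently used
 *	evicted once          -> ARC_mru_ghost  header only, data freed
 *	hit while a ghost     -> ARC_mfu        re-read, now "frequent"
 *	evicted again         -> ARC_mfu_ghost  header only, data freed
 */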
@@ -147,9 +156,9 @@ static int arc_dead;
* Anonymous buffers are buffers that are not associated with
* a DVA. These are buffers that hold dirty block copies
* before they are written to stable storage. By definition,
- * they are "ref'd" and are considered part of arc_mru_top
+ * they are "ref'd" and are considered part of arc_mru
* that cannot be freed. Generally, they will acquire a DVA
- * as they are written and migrate onto the arc_mru_top list.
+ * as they are written and migrate onto the arc_mru list.
*/
typedef struct arc_state {
@@ -162,24 +171,22 @@ typedef struct arc_state {
/* The 5 states: */
static arc_state_t ARC_anon;
-static arc_state_t ARC_mru_top;
-static arc_state_t ARC_mru_bot;
-static arc_state_t ARC_mfu_top;
-static arc_state_t ARC_mfu_bot;
+static arc_state_t ARC_mru;
+static arc_state_t ARC_mru_ghost;
+static arc_state_t ARC_mfu;
+static arc_state_t ARC_mfu_ghost;
static struct arc {
arc_state_t *anon;
- arc_state_t *mru_top;
- arc_state_t *mru_bot;
- arc_state_t *mfu_top;
- arc_state_t *mfu_bot;
+ arc_state_t *mru;
+ arc_state_t *mru_ghost;
+ arc_state_t *mfu;
+ arc_state_t *mfu_ghost;
uint64_t size; /* Actual total arc size */
- uint64_t p; /* Target size (in bytes) of mru_top */
+ uint64_t p; /* Target size (in bytes) of mru */
uint64_t c; /* Target size of cache (in bytes) */
uint64_t c_min; /* Minimum target cache size */
uint64_t c_max; /* Maximum target cache size */
- uint64_t incr; /* Size by which to increment arc.c */
- int64_t size_check;
/* performance stats */
uint64_t hits;
@@ -195,12 +202,6 @@ static struct arc {
int no_grow; /* Don't try to grow cache size */
} arc;
-/* Default amount to grow arc.incr */
-static int64_t arc_incr_size = 1024;
-
-/* > 0 ==> time to increment arc.c */
-static int64_t arc_size_check_default = -1000;
-
static uint64_t arc_tempreserve;
typedef struct arc_callback arc_callback_t;
@@ -227,6 +228,7 @@ struct arc_buf_hdr {
arc_buf_hdr_t *b_hash_next;
arc_buf_t *b_buf;
uint32_t b_flags;
+ uint32_t b_datacnt;
kcondvar_t b_cv;
arc_callback_t *b_acb;
@@ -242,6 +244,13 @@ struct arc_buf_hdr {
refcount_t b_refcnt;
};
+static arc_buf_t *arc_eviction_list;
+static kmutex_t arc_eviction_mtx;
+static void arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
+
+#define GHOST_STATE(state) \
+ ((state) == arc.mru_ghost || (state) == arc.mfu_ghost)
+
/*
* Private ARC flags. These flags are private ARC only flags that will show up
* in b_flags in the arc_buf_hdr_t. Some flags are publicly declared, and can
@@ -250,13 +259,17 @@ struct arc_buf_hdr {
* public flags, make sure not to smash the private ones.
*/
+#define ARC_IN_HASH_TABLE (1 << 9) /* this buffer is hashed */
#define ARC_IO_IN_PROGRESS (1 << 10) /* I/O in progress for buf */
#define ARC_IO_ERROR (1 << 11) /* I/O failed for buf */
#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
+#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
+#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
+#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
/*
* Hash table routines
@@ -353,6 +366,7 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
arc_buf_hdr_t *fbuf;
uint32_t max, i;
+ ASSERT(!HDR_IN_HASH_TABLE(buf));
fbufs_lastthread = curthread;
*lockp = hash_lock;
mutex_enter(hash_lock);
@@ -366,6 +380,7 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
buf->b_hash_next = buf_hash_table.ht_table[idx];
buf_hash_table.ht_table[idx] = buf;
+ buf->b_flags |= ARC_IN_HASH_TABLE;
/* collect some hash table performance data */
if (i > 0) {
@@ -391,6 +406,7 @@ buf_hash_remove(arc_buf_hdr_t *buf)
uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
+ ASSERT(HDR_IN_HASH_TABLE(buf));
bufp = &buf_hash_table.ht_table[idx];
while ((fbuf = *bufp) != buf) {
@@ -399,6 +415,7 @@ buf_hash_remove(arc_buf_hdr_t *buf)
}
*bufp = buf->b_hash_next;
buf->b_hash_next = NULL;
+ buf->b_flags &= ~ARC_IN_HASH_TABLE;
/* collect some hash table performance data */
atomic_add_64(&arc.hash_elements, -1);
@@ -456,6 +473,7 @@ hdr_dest(void *vbuf, void *unused)
cv_destroy(&buf->b_cv);
}
+static int arc_reclaim_needed(void);
void arc_kmem_reclaim(void);
/*
@@ -466,27 +484,33 @@ static void
hdr_recl(void *unused)
{
dprintf("hdr_recl called\n");
- arc_kmem_reclaim();
+ if (arc_reclaim_needed())
+ arc_kmem_reclaim();
}
static void
buf_init(void)
{
uint64_t *ct;
- uint64_t hsize = 1ULL << 10;
+ uint64_t hsize = 1ULL << 12;
int i, j;
/*
* The hash table is big enough to fill all of physical memory
- * with an average 4k block size. The table will take up
- * totalmem*sizeof(void*)/4k bytes (eg. 2MB/GB with 8-byte
- * pointers).
+ * with an average 64K block size. The table will take up
+ * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
*/
- while (hsize * 4096 < physmem * PAGESIZE)
+ while (hsize * 65536 < physmem * PAGESIZE)
hsize <<= 1;
-
+retry:
buf_hash_table.ht_mask = hsize - 1;
- buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+ buf_hash_table.ht_table =
+ kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+ if (buf_hash_table.ht_table == NULL) {
+ ASSERT(hsize > (1ULL << 8));
+ hsize >>= 1;
+ goto retry;
+ }
hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
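
The sizing comment in buf_init() works out as follows; this is a standalone userland sketch of the same growth loop, assuming a 4 GB machine (the 4 GB figure is an assumption for illustration):

#include <stdio.h>

int
main(void)
{
	/* Assume 4 GB of physical memory (physmem * PAGESIZE). */
	unsigned long long physbytes = 4ULL << 30;
	unsigned long long hsize = 1ULL << 12;

	/* Same growth rule as buf_init(): one bucket per 64K of memory. */
	while (hsize * 65536 < physbytes)
		hsize <<= 1;

	/* hsize == 1 << 16: 64K 8-byte pointers == 512KB, i.e. 128KB/GB. */
	printf("hsize = %llu, table = %llu bytes\n", hsize, hsize * 8);
	return (0);
}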
@@ -505,8 +529,6 @@ buf_init(void)
#define ARC_MINTIME (hz>>4) /* 62 ms */
-#define ARC_TAG (void *)0x05201962
-
static void
add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
@@ -514,14 +536,21 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
(ab->b_state != arc.anon)) {
+ int delta = ab->b_size * ab->b_datacnt;
ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
mutex_enter(&ab->b_state->mtx);
- ASSERT(!refcount_is_zero(&ab->b_refcnt));
+ ASSERT(refcount_count(&ab->b_refcnt) > 0);
ASSERT(list_link_active(&ab->b_arc_node));
list_remove(&ab->b_state->list, ab);
- ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
- ab->b_state->lsize -= ab->b_size;
+ if (GHOST_STATE(ab->b_state)) {
+ ASSERT3U(ab->b_datacnt, ==, 0);
+ ASSERT3P(ab->b_buf, ==, NULL);
+ delta = ab->b_size;
+ }
+ ASSERT(delta > 0);
+ ASSERT3U(ab->b_state->lsize, >=, delta);
+ atomic_add_64(&ab->b_state->lsize, -delta);
mutex_exit(&ab->b_state->mtx);
}
}
@@ -531,7 +560,8 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
{
int cnt;
- ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(ab->b_state == arc.anon || MUTEX_HELD(hash_lock));
+ ASSERT(!GHOST_STATE(ab->b_state));
if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
(ab->b_state != arc.anon)) {
@@ -540,8 +570,9 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
mutex_enter(&ab->b_state->mtx);
ASSERT(!list_link_active(&ab->b_arc_node));
list_insert_head(&ab->b_state->list, ab);
- ASSERT(ab->b_buf != NULL);
- ab->b_state->lsize += ab->b_size;
+ ASSERT(ab->b_datacnt > 0);
+ atomic_add_64(&ab->b_state->lsize, ab->b_size * ab->b_datacnt);
+ ASSERT3U(ab->b_state->size, >=, ab->b_state->lsize);
mutex_exit(&ab->b_state->mtx);
}
return (cnt);
@@ -552,49 +583,70 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
* for the buffer must be held by the caller.
*/
static void
-arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
- kmutex_t *hash_lock)
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
{
- arc_buf_t *buf;
+ arc_state_t *old_state = ab->b_state;
+ int refcnt = refcount_count(&ab->b_refcnt);
+ int from_delta, to_delta;
ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(new_state != old_state);
+ ASSERT(refcnt == 0 || ab->b_datacnt > 0);
+ ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+
+ from_delta = to_delta = ab->b_datacnt * ab->b_size;
/*
* If this buffer is evictable, transfer it from the
* old state list to the new state list.
*/
- if (refcount_is_zero(&ab->b_refcnt)) {
- if (ab->b_state != arc.anon) {
- int drop_mutex = FALSE;
+ if (refcnt == 0) {
+ if (old_state != arc.anon) {
+ int use_mutex = !MUTEX_HELD(&old_state->mtx);
+
+ if (use_mutex)
+ mutex_enter(&old_state->mtx);
- if (!MUTEX_HELD(&ab->b_state->mtx)) {
- mutex_enter(&ab->b_state->mtx);
- drop_mutex = TRUE;
- }
ASSERT(list_link_active(&ab->b_arc_node));
- list_remove(&ab->b_state->list, ab);
- ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
- ab->b_state->lsize -= ab->b_size;
- if (drop_mutex)
- mutex_exit(&ab->b_state->mtx);
+ list_remove(&old_state->list, ab);
+
+ /* ghost elements have a ghost size */
+ if (GHOST_STATE(old_state)) {
+ ASSERT(ab->b_datacnt == 0);
+ ASSERT(ab->b_buf == NULL);
+ from_delta = ab->b_size;
+ }
+ ASSERT3U(old_state->lsize, >=, from_delta);
+ atomic_add_64(&old_state->lsize, -from_delta);
+
+ if (use_mutex)
+ mutex_exit(&old_state->mtx);
}
if (new_state != arc.anon) {
- int drop_mutex = FALSE;
+ int use_mutex = !MUTEX_HELD(&new_state->mtx);
- if (!MUTEX_HELD(&new_state->mtx)) {
+ if (use_mutex)
mutex_enter(&new_state->mtx);
- drop_mutex = TRUE;
- }
+
list_insert_head(&new_state->list, ab);
- ASSERT(ab->b_buf != NULL);
- new_state->lsize += ab->b_size;
- if (drop_mutex)
+
+ /* ghost elements have a ghost size */
+ if (GHOST_STATE(new_state)) {
+ ASSERT(ab->b_datacnt == 0);
+ ASSERT(ab->b_buf == NULL);
+ to_delta = ab->b_size;
+ }
+ atomic_add_64(&new_state->lsize, to_delta);
+ ASSERT3U(new_state->size + to_delta, >=,
+ new_state->lsize);
+
+ if (use_mutex)
mutex_exit(&new_state->mtx);
}
}
ASSERT(!BUF_EMPTY(ab));
- if (new_state == arc.anon && ab->b_state != arc.anon) {
+ if (new_state == arc.anon && old_state != arc.anon) {
buf_hash_remove(ab);
}
@@ -602,22 +654,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
* If this buffer isn't being transferred to the MRU-top
* state, it's safe to clear its prefetch flag
*/
- if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) {
+ if ((new_state != arc.mru) && (new_state != arc.mru_ghost)) {
ab->b_flags &= ~ARC_PREFETCH;
}
- buf = ab->b_buf;
- if (buf == NULL) {
- ASSERT3U(ab->b_state->size, >=, ab->b_size);
- atomic_add_64(&ab->b_state->size, -ab->b_size);
- /* we should only be here if we are deleting state */
- ASSERT(new_state == arc.anon &&
- (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot));
- } else while (buf) {
- ASSERT3U(ab->b_state->size, >=, ab->b_size);
- atomic_add_64(&ab->b_state->size, -ab->b_size);
- atomic_add_64(&new_state->size, ab->b_size);
- buf = buf->b_next;
+ /* adjust state sizes */
+ if (to_delta)
+ atomic_add_64(&new_state->size, to_delta);
+ if (from_delta) {
+ ASSERT3U(old_state->size, >=, from_delta);
+ atomic_add_64(&old_state->size, -from_delta);
}
ab->b_state = new_state;
}
@@ -637,9 +683,12 @@ arc_buf_alloc(spa_t *spa, int size, void *tag)
hdr->b_arc_access = 0;
buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
buf->b_hdr = hdr;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
buf->b_next = NULL;
buf->b_data = zio_buf_alloc(size);
hdr->b_buf = buf;
+ hdr->b_datacnt = 1;
hdr->b_flags = 0;
ASSERT(refcount_is_zero(&hdr->b_refcnt));
(void) refcount_add(&hdr->b_refcnt, tag);
@@ -650,35 +699,124 @@ arc_buf_alloc(spa_t *spa, int size, void *tag)
return (buf);
}
+static void *
+arc_data_copy(arc_buf_hdr_t *hdr, void *old_data)
+{
+ void *new_data = zio_buf_alloc(hdr->b_size);
+
+ atomic_add_64(&arc.size, hdr->b_size);
+ bcopy(old_data, new_data, hdr->b_size);
+ atomic_add_64(&hdr->b_state->size, hdr->b_size);
+ if (list_link_active(&hdr->b_arc_node)) {
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ atomic_add_64(&hdr->b_state->lsize, hdr->b_size);
+ }
+ return (new_data);
+}
+
+void
+arc_buf_add_ref(arc_buf_t *buf, void* tag)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+
+ mutex_enter(&arc_eviction_mtx);
+ hdr = buf->b_hdr;
+ if (buf->b_data == NULL) {
+ /*
+ * This buffer is evicted.
+ */
+ mutex_exit(&arc_eviction_mtx);
+ return;
+ } else {
+ /*
+ * Prevent this buffer from being evicted
+ * while we add a reference.
+ */
+ buf->b_hdr = NULL;
+ }
+ mutex_exit(&arc_eviction_mtx);
+
+ ASSERT(hdr->b_state != arc.anon);
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+ ASSERT(!GHOST_STATE(hdr->b_state));
+ buf->b_hdr = hdr;
+ add_reference(hdr, hash_lock, tag);
+ arc_access_and_exit(hdr, hash_lock);
+ atomic_add_64(&arc.hits, 1);
+}
+
+static void
+arc_buf_destroy(arc_buf_t *buf, boolean_t all)
+{
+ arc_buf_t **bufp;
+
+ /* free up data associated with the buf */
+ if (buf->b_data) {
+ arc_state_t *state = buf->b_hdr->b_state;
+ uint64_t size = buf->b_hdr->b_size;
+
+ zio_buf_free(buf->b_data, size);
+ atomic_add_64(&arc.size, -size);
+ if (list_link_active(&buf->b_hdr->b_arc_node)) {
+ ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
+ ASSERT(state != arc.anon);
+ ASSERT3U(state->lsize, >=, size);
+ atomic_add_64(&state->lsize, -size);
+ }
+ ASSERT3U(state->size, >=, size);
+ atomic_add_64(&state->size, -size);
+ buf->b_data = NULL;
+ ASSERT(buf->b_hdr->b_datacnt > 0);
+ buf->b_hdr->b_datacnt -= 1;
+ }
+
+ /* only remove the buf if requested */
+ if (!all)
+ return;
+
+ /* remove the buf from the hdr list */
+ for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
+ continue;
+ *bufp = buf->b_next;
+
+ ASSERT(buf->b_efunc == NULL);
+
+ /* clean up the buf */
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+}
+
static void
-arc_hdr_free(arc_buf_hdr_t *hdr)
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
{
ASSERT(refcount_is_zero(&hdr->b_refcnt));
ASSERT3P(hdr->b_state, ==, arc.anon);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
if (!BUF_EMPTY(hdr)) {
- /*
- * We can be called with an arc state lock held,
- * so we can't hold a hash lock here.
- * ASSERT(not in hash table)
- */
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
bzero(&hdr->b_dva, sizeof (dva_t));
hdr->b_birth = 0;
hdr->b_cksum0 = 0;
}
- if (hdr->b_buf) {
+ while (hdr->b_buf) {
arc_buf_t *buf = hdr->b_buf;
- ASSERT3U(hdr->b_size, >, 0);
- zio_buf_free(buf->b_data, hdr->b_size);
- atomic_add_64(&arc.size, -hdr->b_size);
- ASSERT3U(arc.anon->size, >=, hdr->b_size);
- atomic_add_64(&arc.anon->size, -hdr->b_size);
- ASSERT3P(buf->b_next, ==, NULL);
- kmem_cache_free(buf_cache, buf);
- hdr->b_buf = NULL;
+ if (buf->b_efunc) {
+ mutex_enter(&arc_eviction_mtx);
+ ASSERT(buf->b_hdr != NULL);
+ arc_buf_destroy(hdr->b_buf, FALSE);
+ hdr->b_buf = buf->b_next;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ mutex_exit(&arc_eviction_mtx);
+ } else {
+ arc_buf_destroy(hdr->b_buf, TRUE);
+ }
}
+
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT3P(hdr->b_hash_next, ==, NULL);
ASSERT3P(hdr->b_acb, ==, NULL);
@@ -689,36 +827,73 @@ void
arc_buf_free(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
- kmutex_t *hash_lock = HDR_LOCK(hdr);
- int freeable;
+ int hashed = hdr->b_state != arc.anon;
- mutex_enter(hash_lock);
- if (remove_reference(hdr, hash_lock, tag) > 0) {
- arc_buf_t **bufp = &hdr->b_buf;
- arc_state_t *state = hdr->b_state;
- uint64_t size = hdr->b_size;
-
- ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr));
- while (*bufp != buf) {
- ASSERT(*bufp);
- bufp = &(*bufp)->b_next;
- }
- *bufp = buf->b_next;
+ ASSERT(buf->b_efunc == NULL);
+ ASSERT(buf->b_data != NULL);
+
+ if (hashed) {
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+ mutex_enter(hash_lock);
+ (void) remove_reference(hdr, hash_lock, tag);
+ if (hdr->b_datacnt > 1)
+ arc_buf_destroy(buf, TRUE);
+ else
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
mutex_exit(hash_lock);
- zio_buf_free(buf->b_data, size);
- atomic_add_64(&arc.size, -size);
- kmem_cache_free(buf_cache, buf);
- ASSERT3U(state->size, >=, size);
- atomic_add_64(&state->size, -size);
- return;
+ } else if (HDR_IO_IN_PROGRESS(hdr)) {
+ int destroy_hdr;
+ /*
+ * We are in the middle of an async write. Don't destroy
+ * this buffer unless the write completes before we finish
+ * decrementing the reference count.
+ */
+ mutex_enter(&arc_eviction_mtx);
+ (void) remove_reference(hdr, NULL, tag);
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+ destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
+ mutex_exit(&arc_eviction_mtx);
+ if (destroy_hdr)
+ arc_hdr_destroy(hdr);
+ } else {
+ if (remove_reference(hdr, NULL, tag) > 0) {
+ ASSERT(HDR_IO_ERROR(hdr));
+ arc_buf_destroy(buf, TRUE);
+ } else {
+ arc_hdr_destroy(hdr);
+ }
}
+}
- /* don't free buffers that are in the middle of an async write */
- freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL);
- mutex_exit(hash_lock);
+int
+arc_buf_remove_ref(arc_buf_t *buf, void* tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ int no_callback = (buf->b_efunc == NULL);
- if (freeable)
- arc_hdr_free(hdr);
+ if (hdr->b_state == arc.anon) {
+ arc_buf_free(buf, tag);
+ return (no_callback);
+ }
+
+ mutex_enter(hash_lock);
+ ASSERT(hdr->b_state != arc.anon);
+ ASSERT(buf->b_data != NULL);
+
+ (void) remove_reference(hdr, hash_lock, tag);
+ if (hdr->b_datacnt > 1) {
+ if (no_callback)
+ arc_buf_destroy(buf, TRUE);
+ } else if (no_callback) {
+ ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
+ }
+ ASSERT(no_callback || hdr->b_datacnt > 1 ||
+ refcount_is_zero(&hdr->b_refcnt));
+ mutex_exit(hash_lock);
+ return (no_callback);
}
int
@@ -732,19 +907,16 @@ arc_buf_size(arc_buf_t *buf)
* bytes. Move the removed buffers to the appropriate evict state.
*/
static uint64_t
-arc_evict_state(arc_state_t *state, int64_t bytes)
+arc_evict(arc_state_t *state, int64_t bytes)
{
arc_state_t *evicted_state;
- uint64_t bytes_evicted = 0;
+ uint64_t bytes_evicted = 0, skipped = 0;
arc_buf_hdr_t *ab, *ab_prev;
kmutex_t *hash_lock;
- ASSERT(state == arc.mru_top || state == arc.mfu_top);
+ ASSERT(state == arc.mru || state == arc.mfu);
- if (state == arc.mru_top)
- evicted_state = arc.mru_bot;
- else
- evicted_state = arc.mfu_bot;
+ evicted_state = (state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;
mutex_enter(&state->mtx);
mutex_enter(&evicted_state->mtx);
@@ -754,19 +926,42 @@ arc_evict_state(arc_state_t *state, int64_t bytes)
hash_lock = HDR_LOCK(ab);
if (mutex_tryenter(hash_lock)) {
ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+ ASSERT(ab->b_datacnt > 0);
+ while (ab->b_buf) {
+ arc_buf_t *buf = ab->b_buf;
+ if (buf->b_data)
+ bytes_evicted += ab->b_size;
+ if (buf->b_efunc) {
+ mutex_enter(&arc_eviction_mtx);
+ /*
+ * arc_buf_add_ref() could derail
+ * this eviction.
+ */
+ if (buf->b_hdr == NULL) {
+ mutex_exit(&arc_eviction_mtx);
+ mutex_exit(hash_lock);
+ goto skip;
+ }
+ arc_buf_destroy(buf, FALSE);
+ ab->b_buf = buf->b_next;
+ buf->b_next = arc_eviction_list;
+ arc_eviction_list = buf;
+ mutex_exit(&arc_eviction_mtx);
+ } else {
+ arc_buf_destroy(buf, TRUE);
+ }
+ }
+ ASSERT(ab->b_datacnt == 0);
arc_change_state(evicted_state, ab, hash_lock);
- zio_buf_free(ab->b_buf->b_data, ab->b_size);
- atomic_add_64(&arc.size, -ab->b_size);
- ASSERT3P(ab->b_buf->b_next, ==, NULL);
- kmem_cache_free(buf_cache, ab->b_buf);
- ab->b_buf = NULL;
+ ASSERT(HDR_IN_HASH_TABLE(ab));
+ ab->b_flags = ARC_IN_HASH_TABLE;
DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
- bytes_evicted += ab->b_size;
mutex_exit(hash_lock);
- if (bytes_evicted >= bytes)
+ if (bytes >= 0 && bytes_evicted >= bytes)
break;
} else {
- atomic_add_64(&arc.skipped, 1);
+skip:
+ skipped += 1;
}
}
mutex_exit(&evicted_state->mtx);
@@ -776,6 +971,9 @@ arc_evict_state(arc_state_t *state, int64_t bytes)
dprintf("only evicted %lld bytes from %x",
(longlong_t)bytes_evicted, state);
+ atomic_add_64(&arc.skipped, skipped);
+ if (bytes < 0)
+ return (skipped);
return (bytes_evicted);
}
@@ -784,25 +982,27 @@ arc_evict_state(arc_state_t *state, int64_t bytes)
* bytes. Destroy the buffers that are removed.
*/
static void
-arc_delete_state(arc_state_t *state, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, int64_t bytes)
{
- uint_t bufs_skipped = 0;
- uint64_t bytes_deleted = 0;
arc_buf_hdr_t *ab, *ab_prev;
kmutex_t *hash_lock;
+ uint64_t bytes_deleted = 0;
+ uint_t bufs_skipped = 0;
+ ASSERT(GHOST_STATE(state));
top:
mutex_enter(&state->mtx);
for (ab = list_tail(&state->list); ab; ab = ab_prev) {
ab_prev = list_prev(&state->list, ab);
hash_lock = HDR_LOCK(ab);
if (mutex_tryenter(hash_lock)) {
+ ASSERT(ab->b_buf == NULL);
arc_change_state(arc.anon, ab, hash_lock);
mutex_exit(hash_lock);
atomic_add_64(&arc.deleted, 1);
- DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
bytes_deleted += ab->b_size;
- arc_hdr_free(ab);
+ arc_hdr_destroy(ab);
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
if (bytes >= 0 && bytes_deleted >= bytes)
break;
} else {
@@ -832,41 +1032,62 @@ arc_adjust(void)
{
int64_t top_sz, mru_over, arc_over;
- top_sz = arc.anon->size + arc.mru_top->size;
+ top_sz = arc.anon->size + arc.mru->size;
- if (top_sz > arc.p && arc.mru_top->lsize > 0) {
- int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p);
- (void) arc_evict_state(arc.mru_top, toevict);
- top_sz = arc.anon->size + arc.mru_top->size;
+ if (top_sz > arc.p && arc.mru->lsize > 0) {
+ int64_t toevict = MIN(arc.mru->lsize, top_sz-arc.p);
+ (void) arc_evict(arc.mru, toevict);
+ top_sz = arc.anon->size + arc.mru->size;
}
- mru_over = top_sz + arc.mru_bot->size - arc.c;
+ mru_over = top_sz + arc.mru_ghost->size - arc.c;
if (mru_over > 0) {
- if (arc.mru_bot->lsize > 0) {
- int64_t todelete = MIN(arc.mru_bot->lsize, mru_over);
- arc_delete_state(arc.mru_bot, todelete);
+ if (arc.mru_ghost->lsize > 0) {
+ int64_t todelete = MIN(arc.mru_ghost->lsize, mru_over);
+ arc_evict_ghost(arc.mru_ghost, todelete);
}
}
if ((arc_over = arc.size - arc.c) > 0) {
- int64_t table_over;
+ int64_t tbl_over;
- if (arc.mfu_top->lsize > 0) {
- int64_t toevict = MIN(arc.mfu_top->lsize, arc_over);
- (void) arc_evict_state(arc.mfu_top, toevict);
+ if (arc.mfu->lsize > 0) {
+ int64_t toevict = MIN(arc.mfu->lsize, arc_over);
+ (void) arc_evict(arc.mfu, toevict);
}
- table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize
- - arc.c*2;
+ tbl_over = arc.size + arc.mru_ghost->lsize +
+ arc.mfu_ghost->lsize - arc.c*2;
- if (table_over > 0 && arc.mfu_bot->lsize > 0) {
- int64_t todelete = MIN(arc.mfu_bot->lsize, table_over);
- arc_delete_state(arc.mfu_bot, todelete);
+ if (tbl_over > 0 && arc.mfu_ghost->lsize > 0) {
+ int64_t todelete = MIN(arc.mfu_ghost->lsize, tbl_over);
+ arc_evict_ghost(arc.mfu_ghost, todelete);
}
}
}
+static void
+arc_do_user_evicts(void)
+{
+ mutex_enter(&arc_eviction_mtx);
+ while (arc_eviction_list != NULL) {
+ arc_buf_t *buf = arc_eviction_list;
+ arc_eviction_list = buf->b_next;
+ buf->b_hdr = NULL;
+ mutex_exit(&arc_eviction_mtx);
+
+ ASSERT(buf->b_efunc != NULL);
+ VERIFY(buf->b_efunc(buf) == 0);
+
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ kmem_cache_free(buf_cache, buf);
+ mutex_enter(&arc_eviction_mtx);
+ }
+ mutex_exit(&arc_eviction_mtx);
+}
+
/*
* Flush all *evictable* data from the cache.
* NOTE: this will not touch "active" (i.e. referenced) data.
@@ -874,17 +1095,22 @@ arc_adjust(void)
void
arc_flush(void)
{
- arc_delete_state(arc.mru_top, -1);
- arc_delete_state(arc.mfu_top, -1);
+ while (arc_evict(arc.mru, -1));
+ while (arc_evict(arc.mfu, -1));
- arc_delete_state(arc.mru_bot, -1);
- arc_delete_state(arc.mfu_bot, -1);
+ arc_evict_ghost(arc.mru_ghost, -1);
+ arc_evict_ghost(arc.mfu_ghost, -1);
+
+ mutex_enter(&arc_reclaim_thr_lock);
+ arc_do_user_evicts();
+ mutex_exit(&arc_reclaim_thr_lock);
+ ASSERT(arc_eviction_list == NULL);
}
void
arc_kmem_reclaim(void)
{
- /* Remove 6.25% */
+ /* Remove 12.5% */
/*
* We need arc_reclaim_lock because we don't want multiple
* threads trying to reclaim concurrently.
@@ -898,19 +1124,23 @@ arc_kmem_reclaim(void)
if (arc_dead)
return;
+ if (arc.c <= arc.c_min)
+ return;
+
mutex_enter(&arc_reclaim_lock);
- atomic_add_64(&arc.c, -(arc.c >> 4));
+ atomic_add_64(&arc.c, -(arc.c >> 3));
+ atomic_add_64(&arc.p, -(arc.p >> 3));
+ if (arc.c > arc.size)
+ arc.c = arc.size;
if (arc.c < arc.c_min)
arc.c = arc.c_min;
- atomic_add_64(&arc.p, -(arc.p >> 4));
+ if (arc.p > arc.c)
+ arc.p = (arc.c >> 1);
+ ASSERT((int64_t)arc.p >= 0);
arc_adjust();
- /* Cool it for a while */
- arc.incr = 0;
- arc.size_check = arc_size_check_default << 3;
-
mutex_exit(&arc_reclaim_lock);
}
@@ -985,16 +1215,11 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
#endif
/*
- * an aggressive reclamation will shrink the cache size as well as reap
- * free kmem buffers. The arc_kmem_reclaim function is called when the
- * header-cache is reaped, so we only reap the header cache if we're
- * performing an aggressive reclaim. If we're not, just clean the kmem
- * buffer caches.
+ * An aggressive reclamation will shrink the cache size as well as
+ * reap free buffers from the arc kmem caches.
*/
if (strat == ARC_RECLAIM_AGGR)
- kmem_cache_reap_now(hdr_cache);
-
- kmem_cache_reap_now(buf_cache);
+ arc_kmem_reclaim();
for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
if (zio_buf_cache[i] != prev_cache) {
@@ -1002,6 +1227,8 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
kmem_cache_reap_now(zio_buf_cache[i]);
}
}
+ kmem_cache_reap_now(buf_cache);
+ kmem_cache_reap_now(hdr_cache);
}
static void
@@ -1038,6 +1265,9 @@ arc_reclaim_thread(void)
arc.no_grow = FALSE;
}
+ if (arc_eviction_list != NULL)
+ arc_do_user_evicts();
+
/* block until needed, or one second, whichever is shorter */
CALLB_CPR_SAFE_BEGIN(&cpr);
(void) cv_timedwait(&arc_reclaim_thr_cv,
@@ -1051,14 +1281,37 @@ arc_reclaim_thread(void)
thread_exit();
}
+/*
+ * Adapt arc info given the number of bytes we are trying to add and
+ * the state that we are coming from. This function is only called
+ * when we are adding new content to the cache.
+ */
static void
-arc_try_grow(int64_t bytes)
+arc_adapt(int bytes, arc_state_t *state)
{
+ int mult;
+
+ ASSERT(bytes > 0);
/*
- * If we're within (2 * maxblocksize) bytes of the target
- * cache size, increment the target cache size
+ * Adapt the target size of the MRU list:
+ * - if we just hit in the MRU ghost list, then increase
+ * the target size of the MRU list.
+ * - if we just hit in the MFU ghost list, then increase
+ * the target size of the MFU list by decreasing the
+ * target size of the MRU list.
*/
- atomic_add_64((uint64_t *)&arc.size_check, 1);
+ if (state == arc.mru_ghost) {
+ mult = ((arc.mru_ghost->size >= arc.mfu_ghost->size) ?
+ 1 : (arc.mfu_ghost->size/arc.mru_ghost->size));
+
+ arc.p = MIN(arc.c, arc.p + bytes * mult);
+ } else if (state == arc.mfu_ghost) {
+ mult = ((arc.mfu_ghost->size >= arc.mru_ghost->size) ?
+ 1 : (arc.mru_ghost->size/arc.mfu_ghost->size));
+
+ arc.p = MAX(0, (int64_t)arc.p - bytes * mult);
+ }
+ ASSERT((int64_t)arc.p >= 0);
if (arc_reclaim_needed()) {
cv_signal(&arc_reclaim_thr_cv);
@@ -1068,52 +1321,36 @@ arc_try_grow(int64_t bytes)
if (arc.no_grow)
return;
+ if (arc.c >= arc.c_max)
+ return;
+
/*
- * return true if we successfully grow, or if there's enough space that
- * we don't have to grow. Above, we return false if we can't grow, or
- * if we shouldn't because a reclaim is in progress.
+ * If we're within (2 * maxblocksize) bytes of the target
+ * cache size, increment the target cache size
*/
- if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT)) {
- if (arc.size_check > 0) {
- arc.size_check = arc_size_check_default;
- atomic_add_64(&arc.incr, arc_incr_size);
- }
- atomic_add_64(&arc.c, MIN(bytes, arc.incr));
+ if (arc.size > arc.c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+ atomic_add_64(&arc.c, (int64_t)bytes);
if (arc.c > arc.c_max)
arc.c = arc.c_max;
- else
- atomic_add_64(&arc.p, MIN(bytes, arc.incr));
- } else if (arc.size > arc.c) {
- if (arc.size_check > 0) {
- arc.size_check = arc_size_check_default;
- atomic_add_64(&arc.incr, arc_incr_size);
- }
- atomic_add_64(&arc.c, MIN(bytes, arc.incr));
- if (arc.c > arc.c_max)
- arc.c = arc.c_max;
- else
- atomic_add_64(&arc.p, MIN(bytes, arc.incr));
+ else if (state == arc.anon)
+ atomic_add_64(&arc.p, (int64_t)bytes);
+ if (arc.p > arc.c)
+ arc.p = arc.c;
}
+ ASSERT((int64_t)arc.p >= 0);
}
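
To make the ghost-hit adaptation concrete, here is a worked example with assumed list sizes (illustrative only; the 100 MB / 300 MB figures are not from the patch):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t mru_ghost_size = 100ULL << 20;	/* assume 100 MB */
	uint64_t mfu_ghost_size = 300ULL << 20;	/* assume 300 MB */
	int bytes = 128 << 10;			/* one 128K block */
	int mult;

	/* The computation arc_adapt() performs for an MRU-ghost hit. */
	mult = (mru_ghost_size >= mfu_ghost_size) ?
	    1 : (int)(mfu_ghost_size / mru_ghost_size);

	/* mult == 3: a hit on the smaller ghost list pushes arc.p harder. */
	printf("arc.p grows by %d bytes\n", bytes * mult);
	return (0);
}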
/*
- * check if the cache has reached its limits and eviction is required prior to
- * insert. In this situation, we want to evict if no_grow is set Otherwise, the
- * cache is either big enough that we can insert, or a arc_try_grow will result
- * in more space being made available.
+ * Check if the cache has reached its limits and eviction is required
+ * prior to insert.
*/
-
static int
arc_evict_needed()
{
-
if (arc_reclaim_needed())
return (1);
- if (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c))
- return (1);
-
- return (0);
+ return (arc.size > arc.c);
}
/*
@@ -1121,21 +1358,21 @@ arc_evict_needed()
* inserted on its behalf. So, determine which cache must be victimized to
* satisfy an insertion for this state. We have the following cases:
*
- * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) ->
+ * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru) ->
* In this situation if we're out of space, but the resident size of the MFU is
* under the limit, victimize the MFU cache to satisfy this insertion request.
*
- * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) ->
+ * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru) ->
* Here, we've used up all of the available space for the MRU, so we need to
* evict from our own cache instead. Evict from the set of resident MRU
* entries.
*
- * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) ->
+ * 3. Insert for MFU (c - p) > sizeof(arc.mfu) ->
* c minus p represents the MFU space in the cache, since p is the size of the
* cache that is dedicated to the MRU. In this situation there's still space on
* the MFU side, so the MRU side needs to be victimized.
*
- * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) ->
+ * 4. Insert for MFU (c - p) < sizeof(arc.mfu) ->
* MFU's resident set is consuming more space than it has been allotted. In
* this situation, we must victimize our own cache, the MFU, for this insertion.
*/
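
The four cases collapse to two tests; the helper below is an illustrative restatement (arc_victim_for() is not in the patch) of the decision arc_evict_for_state() makes:

/*
 * Illustrative only: which list gets victimized for an insert.
 */
static arc_state_t *
arc_victim_for(arc_state_t *state)
{
	if (state == arc.mru) {
		/* cases 1 and 2: evict MFU while p still covers anon+MRU */
		return ((arc.p > arc.anon->size + arc.mru->size) ?
		    arc.mfu : arc.mru);
	}
	/* cases 3 and 4: evict MRU while (c - p) still covers MFU */
	return ((arc.c - arc.p > arc.mfu->size) ? arc.mru : arc.mfu);
}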
@@ -1146,35 +1383,35 @@ arc_evict_for_state(arc_state_t *state, uint64_t bytes)
uint64_t mfu_space;
uint64_t evicted;
- ASSERT(state == arc.mru_top || state == arc.mfu_top);
+ ASSERT(state == arc.mru || state == arc.mfu);
- if (state == arc.mru_top) {
- mru_used = arc.anon->size + arc.mru_top->size;
+ if (state == arc.mru) {
+ mru_used = arc.anon->size + arc.mru->size;
if (arc.p > mru_used) {
/* case 1 */
- evicted = arc_evict_state(arc.mfu_top, bytes);
+ evicted = arc_evict(arc.mfu, bytes);
if (evicted < bytes) {
arc_adjust();
}
} else {
/* case 2 */
- evicted = arc_evict_state(arc.mru_top, bytes);
+ evicted = arc_evict(arc.mru, bytes);
if (evicted < bytes) {
arc_adjust();
}
}
} else {
- /* MFU_top case */
+ /* MFU case */
mfu_space = arc.c - arc.p;
- if (mfu_space > arc.mfu_top->size) {
+ if (mfu_space > arc.mfu->size) {
/* case 3 */
- evicted = arc_evict_state(arc.mru_top, bytes);
+ evicted = arc_evict(arc.mru, bytes);
if (evicted < bytes) {
arc_adjust();
}
} else {
/* case 4 */
- evicted = arc_evict_state(arc.mfu_top, bytes);
+ evicted = arc_evict(arc.mfu, bytes);
if (evicted < bytes) {
arc_adjust();
}
@@ -1184,11 +1421,13 @@ arc_evict_for_state(arc_state_t *state, uint64_t bytes)
/*
* This routine is called whenever a buffer is accessed.
+ * NOTE: the hash lock is dropped in this function.
*/
static void
-arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
{
- int blksz, mult;
+ arc_state_t *evict_state = NULL;
+ int blksz;
ASSERT(MUTEX_HELD(hash_lock));
@@ -1201,27 +1440,16 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
* to the MRU state.
*/
- arc_try_grow(blksz);
- if (arc_evict_needed()) {
- arc_evict_for_state(arc.mru_top, blksz);
- }
+ arc_adapt(blksz, arc.anon);
+ if (arc_evict_needed())
+ evict_state = arc.mru;
ASSERT(buf->b_arc_access == 0);
buf->b_arc_access = lbolt;
- DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *,
- buf);
- arc_change_state(arc.mru_top, buf, hash_lock);
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+ arc_change_state(arc.mru, buf, hash_lock);
- /*
- * If we are using less than 2/3 of our total target
- * cache size, bump up the target size for the MRU
- * list.
- */
- if (arc.size < arc.c*2/3) {
- arc.p = arc.anon->size + arc.mru_top->size + arc.c/6;
- }
-
- } else if (buf->b_state == arc.mru_top) {
+ } else if (buf->b_state == arc.mru) {
/*
* If this buffer is in the MRU-top state and has the prefetch
* flag, the first read was actually part of a prefetch. In
@@ -1230,7 +1458,8 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
*/
if ((buf->b_flags & ARC_PREFETCH) != 0) {
buf->b_flags &= ~ARC_PREFETCH;
- atomic_add_64(&arc.mru_top->hits, 1);
+ atomic_add_64(&arc.mru->hits, 1);
+ mutex_exit(hash_lock);
return;
}
@@ -1246,12 +1475,11 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
* most frequently used state.
*/
buf->b_arc_access = lbolt;
- DTRACE_PROBE1(new_state__mfu_top,
- arc_buf_hdr_t *, buf);
- arc_change_state(arc.mfu_top, buf, hash_lock);
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc.mfu, buf, hash_lock);
}
- atomic_add_64(&arc.mru_top->hits, 1);
- } else if (buf->b_state == arc.mru_bot) {
+ atomic_add_64(&arc.mru->hits, 1);
+ } else if (buf->b_state == arc.mru_ghost) {
arc_state_t *new_state;
/*
* This buffer has been "accessed" recently, but
@@ -1260,30 +1488,23 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
*/
if (buf->b_flags & ARC_PREFETCH) {
- new_state = arc.mru_top;
- DTRACE_PROBE1(new_state__mru_top,
- arc_buf_hdr_t *, buf);
+ new_state = arc.mru;
+ buf->b_flags &= ~ARC_PREFETCH;
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
} else {
- new_state = arc.mfu_top;
- DTRACE_PROBE1(new_state__mfu_top,
- arc_buf_hdr_t *, buf);
- }
-
- arc_try_grow(blksz);
- if (arc_evict_needed()) {
- arc_evict_for_state(new_state, blksz);
+ new_state = arc.mfu;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
}
- /* Bump up the target size of the MRU list */
- mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ?
- 1 : (arc.mfu_bot->size/arc.mru_bot->size));
- arc.p = MIN(arc.c, arc.p + blksz * mult);
+ arc_adapt(blksz, arc.mru_ghost);
+ if (arc_evict_needed())
+ evict_state = new_state;
buf->b_arc_access = lbolt;
arc_change_state(new_state, buf, hash_lock);
- atomic_add_64(&arc.mru_bot->hits, 1);
- } else if (buf->b_state == arc.mfu_top) {
+ atomic_add_64(&arc.mru_ghost->hits, 1);
+ } else if (buf->b_state == arc.mfu) {
/*
* This buffer has been accessed more than once and is
* still in the cache. Keep it in the MFU state.
@@ -1293,34 +1514,30 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
* so even if it was a prefetch, it will be put back at
* the head of the list when we remove_reference().
*/
- atomic_add_64(&arc.mfu_top->hits, 1);
- } else if (buf->b_state == arc.mfu_bot) {
+ atomic_add_64(&arc.mfu->hits, 1);
+ } else if (buf->b_state == arc.mfu_ghost) {
/*
* This buffer has been accessed more than once but has
* been evicted from the cache. Move it back to the
* MFU state.
*/
- arc_try_grow(blksz);
- if (arc_evict_needed()) {
- arc_evict_for_state(arc.mfu_top, blksz);
- }
-
- /* Bump up the target size for the MFU list */
- mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ?
- 1 : (arc.mru_bot->size/arc.mfu_bot->size));
- arc.p = MAX(0, (int64_t)arc.p - blksz * mult);
+ arc_adapt(blksz, arc.mfu_ghost);
+ if (arc_evict_needed())
+ evict_state = arc.mfu;
buf->b_arc_access = lbolt;
- DTRACE_PROBE1(new_state__mfu_top,
- arc_buf_hdr_t *, buf);
- arc_change_state(arc.mfu_top, buf, hash_lock);
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc.mfu, buf, hash_lock);
- atomic_add_64(&arc.mfu_bot->hits, 1);
+ atomic_add_64(&arc.mfu_ghost->hits, 1);
} else {
ASSERT(!"invalid arc state");
}
+ mutex_exit(hash_lock);
+ if (evict_state)
+ arc_evict_for_state(evict_state, blksz);
}
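
The evict_state handling above follows a general decide-then-act pattern: choose the victim under the hash lock, but evict only after dropping it, because arc_evict_for_state() takes the state mutexes and must not nest inside a hash lock. A generic sketch of the shape (decide_victim() is hypothetical):

mutex_enter(hash_lock);
evict_state = decide_victim(buf);	/* cheap decision, under the lock */
mutex_exit(hash_lock);
if (evict_state != NULL)
	arc_evict_for_state(evict_state, blksz);	/* takes state mtx */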
/* a generic arc_done_func_t which you can use */
@@ -1329,7 +1546,7 @@ void
arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
bcopy(buf->b_data, arg, buf->b_hdr->b_size);
- arc_buf_free(buf, arg);
+ VERIFY(arc_buf_remove_ref(buf, arg) == 1);
}
/* a generic arc_done_func_t which you can use */
@@ -1338,7 +1555,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
{
arc_buf_t **bufp = arg;
if (zio && zio->io_error) {
- arc_buf_free(buf, arg);
+ VERIFY(arc_buf_remove_ref(buf, arg) == 1);
*bufp = NULL;
} else {
*bufp = buf;
@@ -1387,13 +1604,13 @@ arc_read_done(zio_t *zio)
if (acb->acb_done) {
if (abuf == NULL) {
abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
- abuf->b_data = zio_buf_alloc(hdr->b_size);
- atomic_add_64(&arc.size, hdr->b_size);
- bcopy(buf->b_data, abuf->b_data, hdr->b_size);
+ abuf->b_data = arc_data_copy(hdr, buf->b_data);
abuf->b_hdr = hdr;
+ abuf->b_efunc = NULL;
+ abuf->b_private = NULL;
abuf->b_next = hdr->b_buf;
hdr->b_buf = abuf;
- atomic_add_64(&hdr->b_state->size, hdr->b_size);
+ hdr->b_datacnt += 1;
}
acb->acb_buf = abuf;
abuf = NULL;
@@ -1414,6 +1631,9 @@ arc_read_done(zio_t *zio)
}
hdr->b_acb = NULL;
hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ ASSERT(!HDR_BUF_AVAILABLE(hdr));
+ if (abuf == buf)
+ hdr->b_flags |= ARC_BUF_AVAILABLE;
ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
@@ -1421,9 +1641,21 @@ arc_read_done(zio_t *zio)
hdr->b_flags |= ARC_IO_ERROR;
if (hdr->b_state != arc.anon)
arc_change_state(arc.anon, hdr, hash_lock);
+ if (HDR_IN_HASH_TABLE(hdr))
+ buf_hash_remove(hdr);
freeable = refcount_is_zero(&hdr->b_refcnt);
+ /* translate checksum errors into IO errors */
+ if (zio->io_error == ECKSUM)
+ zio->io_error = EIO;
}
+ /*
+ * Broadcast before we drop the hash_lock. This is less efficient,
+ * but avoids the possibility that the hdr (and hence the cv) might
+ * be freed before we get to the cv_broadcast().
+ */
+ cv_broadcast(&hdr->b_cv);
+
if (!HDR_FREED_IN_READ(hdr)) {
/*
* Only call arc_access on anonymous buffers. This is because
@@ -1432,8 +1664,9 @@ arc_read_done(zio_t *zio)
* getting confused).
*/
if (zio->io_error == 0 && hdr->b_state == arc.anon)
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
+ arc_access_and_exit(hdr, hash_lock);
+ else
+ mutex_exit(hash_lock);
} else {
/*
* This block was freed while we waited for the read to
@@ -1445,8 +1678,6 @@ arc_read_done(zio_t *zio)
freeable = refcount_is_zero(&hdr->b_refcnt);
}
- cv_broadcast(&hdr->b_cv);
-
/* execute each callback and free its structure */
while ((acb = callback_list) != NULL) {
if (acb->acb_done)
@@ -1462,7 +1693,7 @@ arc_read_done(zio_t *zio)
}
if (freeable)
- arc_hdr_free(hdr);
+ arc_hdr_destroy(hdr);
}
/*
@@ -1486,7 +1717,7 @@ arc_read_done(zio_t *zio)
int
arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t arc_flags)
+ uint32_t arc_flags, zbookmark_t *zb)
{
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
@@ -1495,15 +1726,9 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
top:
hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
- if (hdr && hdr->b_buf) {
-
- ASSERT((hdr->b_state == arc.mru_top) ||
- (hdr->b_state == arc.mfu_top) ||
- ((hdr->b_state == arc.anon) &&
- (HDR_IO_IN_PROGRESS(hdr))));
+ if (hdr && hdr->b_datacnt > 0) {
if (HDR_IO_IN_PROGRESS(hdr)) {
-
if ((arc_flags & ARC_NOWAIT) && done) {
arc_callback_t *acb = NULL;
@@ -1527,35 +1752,39 @@ top:
mutex_exit(hash_lock);
goto top;
}
-
mutex_exit(hash_lock);
return (0);
}
- /*
- * If there is already a reference on this block, create
- * a new copy of the data so that we will be guaranteed
- * that arc_release() will always succeed.
- */
+ ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);
- if (done)
- add_reference(hdr, hash_lock, private);
- if (done && refcount_count(&hdr->b_refcnt) > 1) {
- buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
- buf->b_data = zio_buf_alloc(hdr->b_size);
- ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1);
- atomic_add_64(&arc.size, hdr->b_size);
- bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size);
- buf->b_hdr = hdr;
- buf->b_next = hdr->b_buf;
- hdr->b_buf = buf;
- atomic_add_64(&hdr->b_state->size, hdr->b_size);
- } else {
+ if (done) {
+ /*
+ * If this block is already in use, create a new
+ * copy of the data so that we will be guaranteed
+ * that arc_release() will always succeed.
+ */
buf = hdr->b_buf;
+ ASSERT(buf);
+ ASSERT(buf->b_data);
+ if (!HDR_BUF_AVAILABLE(hdr)) {
+ void *data = arc_data_copy(hdr, buf->b_data);
+ buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+ buf->b_hdr = hdr;
+ buf->b_data = data;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_next = hdr->b_buf;
+ hdr->b_buf = buf;
+ hdr->b_datacnt += 1;
+ } else {
+ ASSERT(buf->b_efunc == NULL);
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+ }
+ add_reference(hdr, hash_lock, private);
}
DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
+ arc_access_and_exit(hdr, hash_lock);
atomic_add_64(&arc.hits, 1);
if (done)
done(NULL, buf, private);
@@ -1579,24 +1808,28 @@ top:
bzero(&hdr->b_dva, sizeof (dva_t));
hdr->b_birth = 0;
hdr->b_cksum0 = 0;
- arc_buf_free(buf, private);
+ (void) arc_buf_remove_ref(buf, private);
goto top; /* restart the IO request */
}
} else {
/* this block is in the ghost cache */
- ASSERT((hdr->b_state == arc.mru_bot) ||
- (hdr->b_state == arc.mfu_bot));
+ ASSERT(GHOST_STATE(hdr->b_state));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
add_reference(hdr, hash_lock, private);
+ ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
+ ASSERT(hdr->b_buf == NULL);
buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
- buf->b_data = zio_buf_alloc(hdr->b_size);
- atomic_add_64(&arc.size, hdr->b_size);
- ASSERT(!HDR_IO_IN_PROGRESS(hdr));
- ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
buf->b_hdr = hdr;
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
buf->b_next = NULL;
hdr->b_buf = buf;
+ buf->b_data = zio_buf_alloc(hdr->b_size);
+ atomic_add_64(&arc.size, hdr->b_size);
+ ASSERT(hdr->b_datacnt == 0);
+ hdr->b_datacnt = 1;
}
acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -1623,18 +1856,17 @@ top:
* buffer ought to notice that it's legit but has a pending I/O.
*/
- if ((hdr->b_state == arc.mru_bot) ||
- (hdr->b_state == arc.mfu_bot))
- arc_access(hdr, hash_lock);
-
- mutex_exit(hash_lock);
+ if (GHOST_STATE(hdr->b_state))
+ arc_access_and_exit(hdr, hash_lock);
+ else
+ mutex_exit(hash_lock);
ASSERT3U(hdr->b_size, ==, size);
- DTRACE_PROBE2(arc__miss, blkptr_t *, bp,
- uint64_t, size);
+ DTRACE_PROBE2(arc__miss, blkptr_t *, bp, uint64_t, size);
atomic_add_64(&arc.misses, 1);
+
rzio = zio_read(pio, spa, bp, buf->b_data, size,
- arc_read_done, buf, priority, flags);
+ arc_read_done, buf, priority, flags, zb);
if (arc_flags & ARC_WAIT)
return (zio_wait(rzio));
@@ -1660,10 +1892,18 @@ arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
- if (hdr && hdr->b_buf && !HDR_IO_IN_PROGRESS(hdr))
- bcopy(hdr->b_buf->b_data, data, hdr->b_size);
- else
+ if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
+ arc_buf_t *buf = hdr->b_buf;
+
+ ASSERT(buf);
+ while (buf->b_data == NULL) {
+ buf = buf->b_next;
+ ASSERT(buf);
+ }
+ bcopy(buf->b_data, data, hdr->b_size);
+ } else {
rc = ENOENT;
+ }
if (hash_mtx)
mutex_exit(hash_mtx);
@@ -1671,6 +1911,104 @@ arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
return (rc);
}
+void
+arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
+{
+ ASSERT(buf->b_hdr != NULL);
+ ASSERT(buf->b_hdr->b_state != arc.anon);
+ ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+ buf->b_efunc = func;
+ buf->b_private = private;
+}
+
+/*
+ * This is used by the DMU to let the ARC know that a buffer is
+ * being evicted, so the ARC should clean up. If this arc buf
+ * is not yet in the evicted state, it will be put there.
+ */
+int
+arc_buf_evict(arc_buf_t *buf)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ arc_buf_t **bufp;
+
+ mutex_enter(&arc_eviction_mtx);
+ hdr = buf->b_hdr;
+ if (hdr == NULL) {
+ /*
+ * We are in arc_do_user_evicts().
+ * NOTE: We can't be in arc_buf_add_ref() because
+ * that would violate the interface rules.
+ */
+ ASSERT(buf->b_data == NULL);
+ mutex_exit(&arc_eviction_mtx);
+ return (0);
+ } else if (buf->b_data == NULL) {
+ /*
+ * We are on the eviction list, pull us off.
+ */
+ bufp = &arc_eviction_list;
+ while (*bufp != buf)
+ bufp = &(*bufp)->b_next;
+ *bufp = buf->b_next;
+ mutex_exit(&arc_eviction_mtx);
+ goto out;
+ } else {
+ /*
+ * Prevent a race with arc_evict()
+ */
+ ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
+ buf->b_hdr = NULL;
+ }
+ mutex_exit(&arc_eviction_mtx);
+
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);
+
+ /*
+ * Pull this buffer off of the hdr
+ */
+ bufp = &hdr->b_buf;
+ while (*bufp != buf)
+ bufp = &(*bufp)->b_next;
+ *bufp = buf->b_next;
+
+ ASSERT(buf->b_data != NULL);
+ buf->b_hdr = hdr;
+ arc_buf_destroy(buf, FALSE);
+
+ if (hdr->b_datacnt == 0) {
+ arc_state_t *old_state = hdr->b_state;
+ arc_state_t *evicted_state;
+
+ ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+ evicted_state =
+ (old_state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;
+
+ mutex_enter(&old_state->mtx);
+ mutex_enter(&evicted_state->mtx);
+
+ arc_change_state(evicted_state, hdr, hash_lock);
+ ASSERT(HDR_IN_HASH_TABLE(hdr));
+ hdr->b_flags = ARC_IN_HASH_TABLE;
+
+ mutex_exit(&evicted_state->mtx);
+ mutex_exit(&old_state->mtx);
+ }
+ mutex_exit(hash_lock);
+out:
+ VERIFY(buf->b_efunc(buf) == 0);
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+ return (1);
+}
+
/*
* Release this buffer from the cache. This must be done
* after a read and prior to modifying the buffer contents.
@@ -1690,30 +2028,40 @@ arc_release(arc_buf_t *buf, void *tag)
/* this buffer is already released */
ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
ASSERT(BUF_EMPTY(hdr));
+ ASSERT(buf->b_efunc == NULL);
return;
}
mutex_enter(hash_lock);
- if (refcount_count(&hdr->b_refcnt) > 1) {
+ /*
+ * Do we have more than one buf?
+ */
+ if (hdr->b_buf != buf || buf->b_next != NULL) {
arc_buf_hdr_t *nhdr;
arc_buf_t **bufp;
uint64_t blksz = hdr->b_size;
spa_t *spa = hdr->b_spa;
+ ASSERT(hdr->b_datacnt > 1);
/*
* Pull the data off of this buf and attach it to
* a new anonymous buf.
*/
+ (void) remove_reference(hdr, hash_lock, tag);
bufp = &hdr->b_buf;
- while (*bufp != buf) {
- ASSERT(*bufp);
+ while (*bufp != buf)
bufp = &(*bufp)->b_next;
- }
*bufp = (*bufp)->b_next;
- (void) refcount_remove(&hdr->b_refcnt, tag);
+
ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
atomic_add_64(&hdr->b_state->size, -hdr->b_size);
+ if (refcount_is_zero(&hdr->b_refcnt)) {
+ ASSERT3U(hdr->b_state->lsize, >=, hdr->b_size);
+ atomic_add_64(&hdr->b_state->lsize, -hdr->b_size);
+ }
+ hdr->b_datacnt -= 1;
+
mutex_exit(hash_lock);
nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
@@ -1723,6 +2071,7 @@ arc_release(arc_buf_t *buf, void *tag)
nhdr->b_state = arc.anon;
nhdr->b_arc_access = 0;
nhdr->b_flags = 0;
+ nhdr->b_datacnt = 1;
buf->b_hdr = nhdr;
buf->b_next = NULL;
(void) refcount_add(&nhdr->b_refcnt, tag);
@@ -1730,6 +2079,7 @@ arc_release(arc_buf_t *buf, void *tag)
hdr = nhdr;
} else {
+ ASSERT(refcount_count(&hdr->b_refcnt) == 1);
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
arc_change_state(arc.anon, hdr, hash_lock);
@@ -1739,14 +2089,30 @@ arc_release(arc_buf_t *buf, void *tag)
hdr->b_birth = 0;
hdr->b_cksum0 = 0;
}
+ buf->b_efunc = NULL;
+ buf->b_private = NULL;
}
int
arc_released(arc_buf_t *buf)
{
- return (buf->b_hdr->b_state == arc.anon);
+ return (buf->b_data != NULL && buf->b_hdr->b_state == arc.anon);
+}
+
+int
+arc_has_callback(arc_buf_t *buf)
+{
+ return (buf->b_efunc != NULL);
}
+#ifdef ZFS_DEBUG
+int
+arc_referenced(arc_buf_t *buf)
+{
+ return (refcount_count(&buf->b_hdr->b_refcnt));
+}
+#endif
+
static void
arc_write_done(zio_t *zio)
{
@@ -1758,6 +2124,7 @@ arc_write_done(zio_t *zio)
hdr = buf->b_hdr;
acb = hdr->b_acb;
hdr->b_acb = NULL;
+ ASSERT(acb != NULL);
/* this buffer is on no lists and is not in the hash table */
ASSERT3P(hdr->b_state, ==, arc.anon);
@@ -1765,9 +2132,12 @@ arc_write_done(zio_t *zio)
hdr->b_dva = *BP_IDENTITY(zio->io_bp);
hdr->b_birth = zio->io_bp->blk_birth;
hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
- /* clear the "in-write" flag */
- hdr->b_hash_next = NULL;
- /* This write may be all-zero */
+ /*
+ * If the block to be written was all-zero, we may have
+ * compressed it away. In this case no write was performed
+ * so there will be no dva/birth-date/checksum. The buffer
+ * must therefore remain anonymous (and uncached).
+ */
if (!BUF_EMPTY(hdr)) {
arc_buf_hdr_t *exists;
kmutex_t *hash_lock;
@@ -1787,27 +2157,41 @@ arc_write_done(zio_t *zio)
ASSERT(refcount_is_zero(&exists->b_refcnt));
arc_change_state(arc.anon, exists, hash_lock);
mutex_exit(hash_lock);
- arc_hdr_free(exists);
+ arc_hdr_destroy(exists);
exists = buf_hash_insert(hdr, &hash_lock);
ASSERT3P(exists, ==, NULL);
}
- arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ arc_access_and_exit(hdr, hash_lock);
+ } else if (acb->acb_done == NULL) {
+ int destroy_hdr;
+ /*
+ * This is an anonymous buffer with no user callback,
+ * destroy it if there are no active references.
+ */
+ mutex_enter(&arc_eviction_mtx);
+ destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+ mutex_exit(&arc_eviction_mtx);
+ if (destroy_hdr)
+ arc_hdr_destroy(hdr);
+ } else {
+ hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
}
- if (acb && acb->acb_done) {
+
+ if (acb->acb_done) {
ASSERT(!refcount_is_zero(&hdr->b_refcnt));
acb->acb_done(zio, buf, acb->acb_private);
}
- if (acb)
- kmem_free(acb, sizeof (arc_callback_t));
+ kmem_free(acb, sizeof (arc_callback_t));
}
int
arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t arc_flags)
+ uint32_t arc_flags, zbookmark_t *zb)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
arc_callback_t *acb;
@@ -1822,8 +2206,9 @@ arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
acb->acb_private = private;
acb->acb_byteswap = (arc_byteswap_func_t *)-1;
hdr->b_acb = acb;
+ hdr->b_flags |= ARC_IO_IN_PROGRESS;
rzio = zio_write(pio, spa, checksum, compress, txg, bp,
- buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags);
+ buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);
if (arc_flags & ARC_WAIT)
return (zio_wait(rzio));
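
/*
 * Illustrative sketch: a synchronous arc_write() caller under the new
 * signature. The only interface change is the trailing zbookmark_t,
 * which names the block for FMA ereports. All lowercase locals (os,
 * spa, checksum, compress, txg, bp, buf, object, blkid, done_func,
 * private) are assumed context, not part of this diff.
 */
	zbookmark_t zb;
	int err;

	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
	zb.zb_object = object;
	zb.zb_level = 0;
	zb.zb_blkid = blkid;
	err = arc_write(NULL, spa, checksum, compress, txg, bp, buf,
	    done_func, private, ZIO_PRIORITY_ASYNC_WRITE,
	    ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);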
@@ -1858,16 +2243,21 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
arc_change_state(arc.anon, ab, hash_lock);
if (refcount_is_zero(&ab->b_refcnt)) {
mutex_exit(hash_lock);
- arc_hdr_free(ab);
+ arc_hdr_destroy(ab);
atomic_add_64(&arc.deleted, 1);
} else {
ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1);
+ ASSERT3U(ab->b_datacnt, ==, 1);
if (HDR_IO_IN_PROGRESS(ab))
ab->b_flags |= ARC_FREED_IN_READ;
+ if (HDR_IN_HASH_TABLE(ab))
+ buf_hash_remove(ab);
ab->b_arc_access = 0;
bzero(&ab->b_dva, sizeof (dva_t));
ab->b_birth = 0;
ab->b_cksum0 = 0;
+ ab->b_buf->b_efunc = NULL;
+ ab->b_buf->b_private = NULL;
mutex_exit(hash_lock);
}
}
@@ -1967,23 +2357,26 @@ arc_init(void)
arc.c = arc.c_min;
arc.anon = &ARC_anon;
- arc.mru_top = &ARC_mru_top;
- arc.mru_bot = &ARC_mru_bot;
- arc.mfu_top = &ARC_mfu_top;
- arc.mfu_bot = &ARC_mfu_bot;
+ arc.mru = &ARC_mru;
+ arc.mru_ghost = &ARC_mru_ghost;
+ arc.mfu = &ARC_mfu;
+ arc.mfu_ghost = &ARC_mfu_ghost;
+ arc.size = 0;
- list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t),
+ list_create(&arc.mru->list, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t),
+ list_create(&arc.mru_ghost->list, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t),
+ list_create(&arc.mfu->list, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_arc_node));
- list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t),
+ list_create(&arc.mfu_ghost->list, sizeof (arc_buf_hdr_t),
offsetof(arc_buf_hdr_t, b_arc_node));
buf_init();
arc_thread_exit = 0;
+ arc_eviction_list = NULL;
+ mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
(void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
TS_RUN, minclsyspri);
@@ -2002,14 +2395,15 @@ arc_fini(void)
arc_dead = TRUE;
+ mutex_destroy(&arc_eviction_mtx);
mutex_destroy(&arc_reclaim_lock);
mutex_destroy(&arc_reclaim_thr_lock);
cv_destroy(&arc_reclaim_thr_cv);
- list_destroy(&arc.mru_top->list);
- list_destroy(&arc.mru_bot->list);
- list_destroy(&arc.mfu_top->list);
- list_destroy(&arc.mfu_bot->list);
+ list_destroy(&arc.mru->list);
+ list_destroy(&arc.mru_ghost->list);
+ list_destroy(&arc.mfu->list);
+ list_destroy(&arc.mfu_ghost->list);
buf_fini();
}
diff --git a/usr/src/uts/common/fs/zfs/bplist.c b/usr/src/uts/common/fs/zfs/bplist.c
index 68f79ac5a2..db0d3534d6 100644
--- a/usr/src/uts/common/fs/zfs/bplist.c
+++ b/usr/src/uts/common/fs/zfs/bplist.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -29,16 +28,18 @@
#include <sys/bplist.h>
#include <sys/zfs_context.h>
-static void
+static int
bplist_hold(bplist_t *bpl)
{
ASSERT(MUTEX_HELD(&bpl->bpl_lock));
if (bpl->bpl_dbuf == NULL) {
- bpl->bpl_dbuf = dmu_bonus_hold_tag(bpl->bpl_mos,
- bpl->bpl_object, bpl);
- dmu_buf_read(bpl->bpl_dbuf);
+ int err = dmu_bonus_hold(bpl->bpl_mos,
+ bpl->bpl_object, bpl, &bpl->bpl_dbuf);
+ if (err)
+ return (err);
bpl->bpl_phys = bpl->bpl_dbuf->db_data;
}
+ return (0);
}
uint64_t
@@ -58,12 +59,15 @@ bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
VERIFY(dmu_object_free(mos, object, tx) == 0);
}
-void
+int
bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
{
dmu_object_info_t doi;
+ int err;
- VERIFY(dmu_object_info(mos, object, &doi) == 0);
+ err = dmu_object_info(mos, object, &doi);
+ if (err)
+ return (err);
mutex_enter(&bpl->bpl_lock);
@@ -79,6 +83,7 @@ bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
mutex_exit(&bpl->bpl_lock);
+ return (0);
}
void
@@ -89,11 +94,11 @@ bplist_close(bplist_t *bpl)
ASSERT(bpl->bpl_queue == NULL);
if (bpl->bpl_cached_dbuf) {
- dmu_buf_rele(bpl->bpl_cached_dbuf);
+ dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
bpl->bpl_cached_dbuf = NULL;
}
if (bpl->bpl_dbuf) {
- dmu_buf_rele_tag(bpl->bpl_dbuf, bpl);
+ dmu_buf_rele(bpl->bpl_dbuf, bpl);
bpl->bpl_dbuf = NULL;
bpl->bpl_phys = NULL;
}
@@ -110,22 +115,45 @@ bplist_empty(bplist_t *bpl)
return (B_TRUE);
mutex_enter(&bpl->bpl_lock);
- bplist_hold(bpl);
+ VERIFY(0 == bplist_hold(bpl)); /* XXX */
rv = (bpl->bpl_phys->bpl_entries == 0);
mutex_exit(&bpl->bpl_lock);
return (rv);
}
+static int
+bplist_cache(bplist_t *bpl, uint64_t blkid)
+{
+ int err = 0;
+
+ if (bpl->bpl_cached_dbuf == NULL ||
+ bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
+ if (bpl->bpl_cached_dbuf != NULL)
+ dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
+ err = dmu_buf_hold(bpl->bpl_mos,
+ bpl->bpl_object, blkid << bpl->bpl_blockshift,
+ bpl, &bpl->bpl_cached_dbuf);
+ ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
+ 1ULL << bpl->bpl_blockshift);
+ }
+ return (err);
+}
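
/*
 * Worked example of the entry math used with bplist_cache() (values
 * illustrative): with bpl_blockshift = 12 (4K blocks) and
 * SPA_BLKPTRSHIFT = 7 (128-byte blkptrs), bpl_bpshift = 5, so each
 * block holds 32 blkptrs. Entry 100 then maps to blk = 100 >> 5 = 3,
 * off = P2PHASE(100, 32) = 4, and the cached dbuf sits at byte
 * offset 3 << 12.
 */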
+
int
bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
{
uint64_t blk, off;
blkptr_t *bparray;
- dmu_buf_t *db;
+ int err;
mutex_enter(&bpl->bpl_lock);
- bplist_hold(bpl);
+
+ err = bplist_hold(bpl);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
+ }
if (*itorp >= bpl->bpl_phys->bpl_entries) {
mutex_exit(&bpl->bpl_lock);
@@ -134,51 +162,44 @@ bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
blk = *itorp >> bpl->bpl_bpshift;
off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
- db = bpl->bpl_cached_dbuf;
- if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
- if (db != NULL)
- dmu_buf_rele(db);
- bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
- bpl->bpl_object, blk << bpl->bpl_blockshift);
+ err = bplist_cache(bpl, blk);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
}
- ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
-
- dmu_buf_read(db);
- bparray = db->db_data;
+ bparray = bpl->bpl_cached_dbuf->db_data;
*bp = bparray[off];
(*itorp)++;
mutex_exit(&bpl->bpl_lock);
return (0);
}
-void
+int
bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
{
uint64_t blk, off;
blkptr_t *bparray;
- dmu_buf_t *db;
+ int err;
ASSERT(!BP_IS_HOLE(bp));
mutex_enter(&bpl->bpl_lock);
- bplist_hold(bpl);
+ err = bplist_hold(bpl);
+	if (err) {
+		mutex_exit(&bpl->bpl_lock);
+		return (err);
+	}
blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
- db = bpl->bpl_cached_dbuf;
- if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
- if (db != NULL)
- dmu_buf_rele(db);
- bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
- bpl->bpl_object, blk << bpl->bpl_blockshift);
+ err = bplist_cache(bpl, blk);
+ if (err) {
+ mutex_exit(&bpl->bpl_lock);
+ return (err);
}
- ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
-
- dmu_buf_will_dirty(db, tx);
- bparray = db->db_data;
+ dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
+ bparray = bpl->bpl_cached_dbuf->db_data;
bparray[off] = *bp;
/* We never need the fill count. */
@@ -191,6 +212,8 @@ bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
bpl->bpl_phys->bpl_entries++;
bpl->bpl_phys->bpl_bytes += BP_GET_ASIZE(bp);
mutex_exit(&bpl->bpl_lock);
+
+ return (0);
}
/*
@@ -218,7 +241,7 @@ bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
while ((bpq = bpl->bpl_queue) != NULL) {
bpl->bpl_queue = bpq->bpq_next;
mutex_exit(&bpl->bpl_lock);
- bplist_enqueue(bpl, &bpq->bpq_blk, tx);
+ VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
kmem_free(bpq, sizeof (*bpq));
mutex_enter(&bpl->bpl_lock);
}
@@ -230,9 +253,10 @@ bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
{
mutex_enter(&bpl->bpl_lock);
ASSERT3P(bpl->bpl_queue, ==, NULL);
- bplist_hold(bpl);
+ VERIFY(0 == bplist_hold(bpl));
dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
- dmu_free_range(bpl->bpl_mos, bpl->bpl_object, 0, -1ULL, tx);
+ VERIFY(0 == dmu_free_range(bpl->bpl_mos,
+ bpl->bpl_object, 0, -1ULL, tx));
bpl->bpl_phys->bpl_entries = 0;
bpl->bpl_phys->bpl_bytes = 0;
mutex_exit(&bpl->bpl_lock);
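
/*
 * Illustrative sketch of the errorized bplist interface after this
 * change: bplist_open() and bplist_enqueue() now return int rather
 * than VERIFYing internally, so fallible callers can propagate I/O
 * errors. append_one is a hypothetical caller; bpl is assumed to be
 * an initialized bplist_t.
 */
static int
append_one(bplist_t *bpl, objset_t *mos, uint64_t obj, blkptr_t *bp,
    dmu_tx_t *tx)
{
	int err;

	if ((err = bplist_open(bpl, mos, obj)) != 0)
		return (err);
	err = bplist_enqueue(bpl, bp, tx);	/* may now fail */
	bplist_close(bpl);
	return (err);
}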
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 6f93e86078..13f4fdb202 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -118,7 +118,7 @@ dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
if (DBUF_EQUAL(db, os, obj, level, blkid)) {
mutex_enter(&db->db_mtx);
- if (!refcount_is_zero(&db->db_holds)) {
+ if (db->db_state != DB_EVICTING) {
mutex_exit(DBUF_HASH_MUTEX(h, idx));
return (db);
}
@@ -151,7 +151,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
mutex_enter(&dbf->db_mtx);
- if (!refcount_is_zero(&dbf->db_holds)) {
+ if (dbf->db_state != DB_EVICTING) {
mutex_exit(DBUF_HASH_MUTEX(h, idx));
return (dbf);
}
@@ -186,7 +186,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
* DBUF_HASH_MUTEX > db_mtx.
*/
ASSERT(refcount_is_zero(&db->db_holds));
- ASSERT(db->db_dnode != NULL);
+ ASSERT(db->db_state == DB_EVICTING);
ASSERT(!MUTEX_HELD(&db->db_mtx));
mutex_enter(DBUF_HASH_MUTEX(h, idx));
@@ -201,20 +201,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
atomic_add_64(&dbuf_hash_count, -1);
}
-static int dbuf_evictable(dmu_buf_impl_t *db);
-static void dbuf_clear(dmu_buf_impl_t *db);
-
-void
-dbuf_evict(dmu_buf_impl_t *db)
-{
- int err;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- err = dbuf_evictable(db);
- ASSERT(err == TRUE);
- dbuf_clear(db);
- dbuf_destroy(db);
-}
+static arc_evict_func_t dbuf_do_evict;
static void
dbuf_evict_user(dmu_buf_impl_t *db)
@@ -233,23 +220,47 @@ dbuf_evict_user(dmu_buf_impl_t *db)
}
void
+dbuf_evict(dmu_buf_impl_t *db)
+{
+ int i;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_buf == NULL);
+
+#ifdef ZFS_DEBUG
+ for (i = 0; i < TXG_SIZE; i++) {
+ ASSERT(!list_link_active(&db->db_dirty_node[i]));
+ ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
+ }
+#endif
+ dbuf_clear(db);
+ dbuf_destroy(db);
+}
+
+void
dbuf_init(void)
{
- uint64_t hsize = 1;
+ uint64_t hsize = 1ULL << 16;
dbuf_hash_table_t *h = &dbuf_hash_table;
int i;
/*
* The hash table is big enough to fill all of physical memory
- * with an average 64k block size. The table will take up
- * totalmem*sizeof(void*)/64k bytes (i.e. 128KB/GB with 8-byte
- * pointers).
+ * with an average 4K block size. The table will take up
+ * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
*/
- while (hsize * 65536 < physmem * PAGESIZE)
+ while (hsize * 4096 < physmem * PAGESIZE)
hsize <<= 1;
+retry:
h->hash_table_mask = hsize - 1;
- h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+ h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+ if (h->hash_table == NULL) {
+ /* XXX - we should really return an error instead of assert */
+ ASSERT(hsize > (1ULL << 10));
+ hsize >>= 1;
+ goto retry;
+ }
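+	/*
+	 * Worked example of the sizing loop above: on a machine with
+	 * 4GB of physical memory the loop stops at hsize = 2^20, the
+	 * smallest power of two with hsize * 4096 >= 2^32, so the
+	 * table is 2^20 * 8 bytes = 8MB -- the 2MB/GB cited in the
+	 * block comment.
+	 */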
dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
sizeof (dmu_buf_impl_t),
@@ -299,8 +310,9 @@ dbuf_verify(dmu_buf_impl_t *db)
} else {
ASSERT3U(db->db.db_object, ==, dn->dn_object);
ASSERT3P(db->db_objset, ==, dn->dn_objset);
- ASSERT(list_head(&dn->dn_dbufs));
ASSERT3U(db->db_level, <, dn->dn_nlevels);
+ ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ list_head(&dn->dn_dbufs));
}
if (db->db_blkid == DB_BONUS_BLKID) {
ASSERT(dn != NULL);
@@ -311,19 +323,11 @@ dbuf_verify(dmu_buf_impl_t *db)
}
if (db->db_level == 0) {
- void **udpp = db->db_d.db_user_data_ptr_ptr;
/* we can be momentarily larger in dnode_set_blksz() */
if (db->db_blkid != DB_BONUS_BLKID && dn) {
ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
}
- if (udpp) {
- ASSERT((refcount_is_zero(&db->db_holds) &&
- *udpp == NULL) ||
- (!refcount_is_zero(&db->db_holds) &&
- *udpp == db->db.db_data));
- }
-
- if (IS_DNODE_DNODE(db->db.db_object)) {
+ if (db->db.db_object == DMU_META_DNODE_OBJECT) {
for (i = 0; i < TXG_SIZE; i++) {
/*
* it should only be modified in syncing
@@ -341,7 +345,7 @@ dbuf_verify(dmu_buf_impl_t *db)
if (db->db_parent == dn->dn_dbuf) {
/* db is pointed to by the dnode */
/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
- if (IS_DNODE_DNODE(db->db.db_object))
+ if (db->db.db_object == DMU_META_DNODE_OBJECT)
ASSERT(db->db_parent == NULL);
else
ASSERT(db->db_parent != NULL);
@@ -399,10 +403,19 @@ static void
dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(buf->b_data != NULL);
+ ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
db->db_buf = buf;
- db->db.db_data = buf->b_data;
- dbuf_update_data(db);
+ if (buf != NULL) {
+ ASSERT(buf->b_data != NULL);
+ db->db.db_data = buf->b_data;
+ if (!arc_released(buf))
+ arc_set_callback(buf, dbuf_do_evict, db);
+ dbuf_update_data(db);
+ } else {
+ dbuf_evict_user(db);
+ db->db.db_data = NULL;
+ db->db_state = DB_UNCACHED;
+ }
}
uint64_t
@@ -427,6 +440,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
* All reads are synchronous, so we must have a hold on the dbuf
*/
ASSERT(refcount_count(&db->db_holds) > 0);
+ ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
/* we were freed in flight; disregard any error */
@@ -440,60 +454,36 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db_state = DB_CACHED;
} else {
ASSERT(db->db_blkid != DB_BONUS_BLKID);
- arc_buf_free(buf, db);
- db->db_state = DB_UNCACHED;
ASSERT3P(db->db_buf, ==, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ db->db_state = DB_UNCACHED;
}
cv_broadcast(&db->db_changed);
mutex_exit(&db->db_mtx);
+ dbuf_rele(db, NULL);
}
-void
+static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
- arc_buf_t *buf;
blkptr_t *bp;
+ zbookmark_t zb;
ASSERT(!refcount_is_zero(&db->db_holds));
/* We need the struct_rwlock to prevent db_blkptr from changing. */
ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
-
- /*
- * prefetch only data blocks (level 0) -- don't prefetch indirect
- * blocks
- */
- if ((db->db_level > 0) || (db->db_blkid == DB_BONUS_BLKID)) {
- flags |= DB_RF_NOPREFETCH;
- }
-
- if (((flags & DB_RF_NOPREFETCH) == 0) && (db->db_dnode != NULL)) {
- dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
- db->db.db_size);
- }
-
- if (db->db_state == DB_CACHED) {
- ASSERT(db->db.db_data != NULL);
- return;
- }
-
- mutex_enter(&db->db_mtx);
-
- if (db->db_state != DB_UNCACHED) {
- mutex_exit(&db->db_mtx);
- return;
- }
-
- ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db_state == DB_UNCACHED);
+ ASSERT(db->db_buf == NULL);
if (db->db_blkid == DB_BONUS_BLKID) {
ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
- buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- DN_MAX_BONUSLEN, db);
+ db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
if (db->db.db_size < DN_MAX_BONUSLEN)
- bzero(buf->b_data, DN_MAX_BONUSLEN);
- bcopy(DN_BONUS(db->db_dnode->dn_phys), buf->b_data,
+ bzero(db->db.db_data, DN_MAX_BONUSLEN);
+ bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
db->db.db_size);
- dbuf_set_data(db, buf);
+ dbuf_update_data(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
return;
@@ -522,20 +512,27 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
db->db_state = DB_READ;
mutex_exit(&db->db_mtx);
+ zb.zb_objset = db->db_objset->os_dsl_dataset ?
+ db->db_objset->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+
+ dbuf_add_ref(db, NULL);
/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
db->db_level > 0 ? byteswap_uint64_array :
dmu_ot[db->db_dnode->dn_type].ot_byteswap,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
- ARC_NOWAIT);
+ ARC_NOWAIT, &zb);
}
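
/*
 * Note (illustrative): the zbookmark filled in above is the FMA
 * plumbing this change threads through the I/O path -- every
 * arc_read() and arc_write() now carries the (objset, object, level,
 * blkid) coordinates of the block, so an I/O error can be reported
 * against a specific dataset and offset rather than an anonymous
 * buffer.
 */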
-static int
-dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags)
+int
+dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
- zio_t *zio;
- int err;
+ int err = 0;
+ int havepzio = (zio != NULL);
/*
* We don't have to hold the mutex to check db_state because it
@@ -545,71 +542,67 @@ dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags)
if (db->db_state == DB_CACHED)
return (0);
- if (db->db_state == DB_UNCACHED) {
- zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL, NULL,
- ZIO_FLAG_CANFAIL);
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_CACHED) {
+ mutex_exit(&db->db_mtx);
if ((flags & DB_RF_HAVESTRUCT) == 0)
- rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+ } else if (db->db_state == DB_UNCACHED) {
+ if (zio == NULL) {
+ zio = zio_root(db->db_dnode->dn_objset->os_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+ }
dbuf_read_impl(db, zio, flags);
+ /* dbuf_read_impl has dropped db_mtx for us */
+
+ if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+ (flags & DB_RF_NOPREFETCH) == 0 &&
+ db->db_dnode != NULL) {
+ dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+ db->db.db_size);
+ }
+
if ((flags & DB_RF_HAVESTRUCT) == 0)
rw_exit(&db->db_dnode->dn_struct_rwlock);
- err = zio_wait(zio);
- if (err)
- return (err);
- }
- mutex_enter(&db->db_mtx);
- while (db->db_state == DB_READ || db->db_state == DB_FILL) {
- ASSERT(db->db_state == DB_READ ||
- (flags & DB_RF_HAVESTRUCT) == 0);
- cv_wait(&db->db_changed, &db->db_mtx);
+ if (!havepzio)
+ err = zio_wait(zio);
+ } else {
+ if ((flags & DB_RF_HAVESTRUCT) == 0)
+ rw_exit(&db->db_dnode->dn_struct_rwlock);
+ if ((flags & DB_RF_NEVERWAIT) == 0) {
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL) {
+ ASSERT(db->db_state == DB_READ ||
+ (flags & DB_RF_HAVESTRUCT) == 0);
+ cv_wait(&db->db_changed, &db->db_mtx);
+ }
+ if (db->db_state == DB_UNCACHED)
+ err = EIO;
+ }
+ mutex_exit(&db->db_mtx);
}
- ASSERT3U(db->db_state, ==, DB_CACHED);
- mutex_exit(&db->db_mtx);
-
- return (0);
-}
-
-#pragma weak dmu_buf_read = dbuf_read
-void
-dbuf_read(dmu_buf_impl_t *db)
-{
- int err;
-
- err = dbuf_read_generic(db, DB_RF_MUST_SUCCEED);
- ASSERT(err == 0);
-}
-
-#pragma weak dmu_buf_read_canfail = dbuf_read_canfail
-int
-dbuf_read_canfail(dmu_buf_impl_t *db)
-{
- return (dbuf_read_generic(db, DB_RF_CANFAIL));
-}
-
-void
-dbuf_read_havestruct(dmu_buf_impl_t *db)
-{
- int err;
- ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
- err = dbuf_read_generic(db, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH));
- ASSERT(err == 0);
+ ASSERT(err || havepzio || db->db_state == DB_CACHED);
+ return (err);
}
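
/*
 * Illustrative sketch: a fallible caller under the consolidated
 * dbuf_read() interface, which replaces the old dbuf_read()/
 * dbuf_read_canfail()/dbuf_read_havestruct() trio. Assumes db is
 * already held.
 */
	int err = dbuf_read(db, NULL, DB_RF_CANFAIL);
	if (err != 0)
		return (err);	/* db remains DB_UNCACHED on failure */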
static void
dbuf_noread(dmu_buf_impl_t *db)
{
ASSERT(!refcount_is_zero(&db->db_holds));
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
mutex_enter(&db->db_mtx);
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
if (db->db_state == DB_UNCACHED) {
- int blksz = (db->db_blkid == DB_BONUS_BLKID) ?
- DN_MAX_BONUSLEN : db->db.db_size;
+ ASSERT(db->db_buf == NULL);
ASSERT(db->db.db_data == NULL);
dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
- blksz, db));
+ db->db.db_size, db));
db->db_state = DB_FILL;
} else {
ASSERT3U(db->db_state, ==, DB_CACHED);
@@ -634,14 +627,13 @@ static void
dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
{
arc_buf_t **quiescing, **syncing;
- int size = (db->db_blkid == DB_BONUS_BLKID) ?
- DN_MAX_BONUSLEN : db->db.db_size;
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
- quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
- syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
+ quiescing = (arc_buf_t **)&db->db_d.db_data_old[(txg-1)&TXG_MASK];
+ syncing = (arc_buf_t **)&db->db_d.db_data_old[(txg-2)&TXG_MASK];
/*
* If this buffer is referenced from the current quiescing
@@ -656,13 +648,12 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
*/
ASSERT(*syncing != db->db_buf);
if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ int size = db->db.db_size;
*quiescing = arc_buf_alloc(
db->db_dnode->dn_objset->os_spa, size, db);
bcopy(db->db.db_data, (*quiescing)->b_data, size);
} else {
- db->db.db_data = NULL;
- db->db_buf = NULL;
- db->db_state = DB_UNCACHED;
+ dbuf_set_data(db, NULL);
}
return;
}
@@ -677,22 +668,49 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
ASSERT3P(*quiescing, ==, NULL);
ASSERT3U(db->db_dirtycnt, ==, 1);
if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ int size = db->db.db_size;
/* we can't copy if we have already started a write */
ASSERT(*syncing != db->db_data_pending);
*syncing = arc_buf_alloc(
db->db_dnode->dn_objset->os_spa, size, db);
bcopy(db->db.db_data, (*syncing)->b_data, size);
} else {
- db->db.db_data = NULL;
- db->db_buf = NULL;
- db->db_state = DB_UNCACHED;
+ dbuf_set_data(db, NULL);
}
}
}
+/*
+ * This is the "bonus buffer" version of the above routine
+ */
+static void
+dbuf_fix_old_bonus_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+ void **quiescing, **syncing;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_blkid == DB_BONUS_BLKID);
+
+ quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
+ syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
+
+ if (*quiescing == db->db.db_data) {
+ ASSERT(*syncing != db->db.db_data);
+ *quiescing = zio_buf_alloc(DN_MAX_BONUSLEN);
+ bcopy(db->db.db_data, *quiescing, DN_MAX_BONUSLEN);
+ } else if (*syncing == db->db.db_data) {
+ ASSERT3P(*quiescing, ==, NULL);
+ ASSERT3U(db->db_dirtycnt, ==, 1);
+ *syncing = zio_buf_alloc(DN_MAX_BONUSLEN);
+ bcopy(db->db.db_data, *syncing, DN_MAX_BONUSLEN);
+ }
+}
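+
+/*
+ * Worked example of the slot arithmetic above, assuming TXG_SIZE = 4
+ * (TXG_MASK = 3): while txg 7 is open, txg 6 is quiescing and txg 5
+ * is syncing, so db_data_old[(7-1) & 3] = slot 2 holds the quiescing
+ * copy and db_data_old[(7-2) & 3] = slot 1 the syncing copy.
+ */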
+
void
dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg)
{
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
ASSERT(MUTEX_HELD(&db->db_mtx));
if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
@@ -724,7 +742,8 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
mutex_enter(&dn->dn_dbufs_mtx);
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
- if ((db->db_level != 0) || (db->db_blkid == DB_BONUS_BLKID))
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+ if (db->db_level != 0)
continue;
dprintf_dbuf(db, "found buf %s\n", "");
if (db->db_blkid < blkid ||
@@ -736,7 +755,8 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
continue;
mutex_enter(&db->db_mtx);
- if (db->db_state == DB_UNCACHED) {
+ if (db->db_state == DB_UNCACHED ||
+ db->db_state == DB_EVICTING) {
ASSERT(db->db.db_data == NULL);
mutex_exit(&db->db_mtx);
continue;
@@ -753,22 +773,40 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
mutex_exit(&db->db_mtx);
continue;
}
+ if (refcount_count(&db->db_holds) == 0) {
+ ASSERT(db->db_buf);
+ dbuf_clear(db);
+ continue;
+ }
+ /* The dbuf is CACHED and referenced */
- /* make a copy of the data if necessary */
- dbuf_fix_old_data(db, txg);
-
- if (db->db.db_data) {
- /* fill in with appropriate data */
+ if (!list_link_active(&db->db_dirty_node[txg & TXG_MASK])) {
+ /*
+ * This dbuf is not currently dirty. We will either
+ * uncache it (if its not referenced in the open
+ * context) or reset its contents to empty.
+ */
+ dbuf_fix_old_data(db, txg);
+ } else if (db->db_d.db_overridden_by[txg & TXG_MASK] != NULL) {
+ /*
+ * This dbuf is overridden. Clear that state.
+ */
+ dbuf_unoverride(db, txg);
+ }
+ /* fill in with appropriate data */
+ if (db->db_state == DB_CACHED) {
+ ASSERT(db->db.db_data != NULL);
arc_release(db->db_buf, db);
bzero(db->db.db_data, db->db.db_size);
}
+
mutex_exit(&db->db_mtx);
}
mutex_exit(&dn->dn_dbufs_mtx);
}
static int
-dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx)
+dbuf_new_block(dmu_buf_impl_t *db)
{
dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
uint64_t birth_txg = 0;
@@ -790,7 +828,7 @@ dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx)
birth_txg = db->db_blkptr->blk_birth;
if (birth_txg)
- return (!dsl_dataset_block_freeable(ds, birth_txg, tx));
+ return (!dsl_dataset_block_freeable(ds, birth_txg));
else
return (TRUE);
}
@@ -801,6 +839,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
arc_buf_t *buf, *obuf;
int osize = db->db.db_size;
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
/* XXX does *this* func really need the lock? */
ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
@@ -814,6 +854,10 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
* be happening.
*/
/* Make a copy of the data if necessary */
+ /*
+ * XXX we should be doing a dbuf_read, checking the return
+ * value and returning that up to our callers
+ */
dbuf_will_dirty(db, tx);
/* create the data buffer for the new block */
@@ -829,7 +873,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
/* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */
dbuf_set_data(db, buf);
- arc_buf_free(obuf, db);
+ VERIFY(arc_buf_remove_ref(obuf, db) == 1);
db->db.db_size = size;
/* fix up the dirty info */
@@ -861,7 +905,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
*/
ASSERT(!(dmu_tx_is_syncing(tx) &&
!BP_IS_HOLE(&dn->dn_objset->os_rootbp) &&
- !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
dn->dn_objset->os_dsl_dataset != NULL &&
!dsl_dir_is_private(
dn->dn_objset->os_dsl_dataset->ds_dir)));
@@ -871,7 +915,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
* check if we're already dirty. They are allowed to re-dirty
* in syncing context.
*/
- ASSERT(dn->dn_object & DMU_PRIVATE_OBJECT ||
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
dn->dn_dirtyctx == DN_UNDIRTIED ||
dn->dn_dirtyctx ==
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
@@ -940,22 +984,27 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
- if (db->db_level == 0) {
+ /*
+ * If this buffer is dirty in an old transaction group we need
+ * to make a copy of it so that the changes we make in this
+ * transaction group won't leak out when we sync the older txg.
+ */
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ ASSERT(db->db.db_data != NULL);
+ ASSERT(db->db_d.db_data_old[txgoff] == NULL);
+ dbuf_fix_old_bonus_data(db, tx->tx_txg);
+ db->db_d.db_data_old[txgoff] = db->db.db_data;
+ } else if (db->db_level == 0) {
/*
* Release the data buffer from the cache so that we
* can modify it without impacting possible other users
* of this cached data block. Note that indirect blocks
* and private objects are not released until the syncing
* state (since they are only modified then).
- *
- * If this buffer is dirty in an old transaction group we need
- * to make a copy of it so that the changes we make in this
- * transaction group won't leak out when we sync the older txg.
*/
ASSERT(db->db_buf != NULL);
- ASSERT(db->db.db_data != NULL);
ASSERT(db->db_d.db_data_old[txgoff] == NULL);
- if (!(db->db.db_object & DMU_PRIVATE_OBJECT)) {
+ if (db->db.db_object != DMU_META_DNODE_OBJECT) {
arc_release(db->db_buf, db);
dbuf_fix_old_data(db, tx->tx_txg);
ASSERT(db->db_buf != NULL);
@@ -978,12 +1027,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db);
mutex_exit(&dn->dn_mtx);
- /*
- * If writting this buffer will consume a new block on disk,
- * then update the accounting.
- */
if (db->db_blkid != DB_BONUS_BLKID) {
- if (!dbuf_new_block(db, tx) && db->db_blkptr) {
+ /*
+ * Update the accounting.
+ */
+ if (!dbuf_new_block(db) && db->db_blkptr) {
/*
* This is only a guess -- if the dbuf is dirty
* in a previous txg, we don't know how much
@@ -1028,7 +1076,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (drop_struct_lock)
rw_exit(&dn->dn_struct_rwlock);
dbuf_dirty(parent, tx);
- dbuf_remove_ref(parent, FTAG);
+ dbuf_rele(parent, FTAG);
} else {
if (drop_struct_lock)
rw_exit(&dn->dn_struct_rwlock);
@@ -1042,8 +1090,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
dnode_t *dn = db->db_dnode;
int txgoff = tx->tx_txg & TXG_MASK;
+ int64_t holds;
ASSERT(tx->tx_txg != 0);
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
mutex_enter(&db->db_mtx);
@@ -1080,7 +1130,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(db->db_buf != NULL);
ASSERT(db->db_d.db_data_old[txgoff] != NULL);
if (db->db_d.db_data_old[txgoff] != db->db_buf)
- arc_buf_free(db->db_d.db_data_old[txgoff], db);
+ VERIFY(arc_buf_remove_ref(
+ db->db_d.db_data_old[txgoff], db) == 1);
db->db_d.db_data_old[txgoff] = NULL;
}
@@ -1095,15 +1146,17 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
- if (refcount_remove(&db->db_holds,
- (void *)(uintptr_t)tx->tx_txg) == 0) {
- /* make duf_verify() happy */
- if (db->db.db_data)
- bzero(db->db.db_data, db->db.db_size);
+ if ((holds = refcount_remove(&db->db_holds,
+ (void *)(uintptr_t)tx->tx_txg)) == 0) {
+ arc_buf_t *buf = db->db_buf;
+ ASSERT(arc_released(buf));
+ dbuf_set_data(db, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
dbuf_evict(db);
return (1);
}
+ ASSERT(holds > 0);
mutex_exit(&db->db_mtx);
return (0);
@@ -1120,19 +1173,21 @@ dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
rf |= DB_RF_HAVESTRUCT;
- (void) dbuf_read_generic(db, rf);
+ (void) dbuf_read(db, NULL, rf);
dbuf_dirty(db, tx);
}
-#pragma weak dmu_buf_will_fill = dbuf_will_fill
void
-dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx)
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
ASSERT(tx->tx_txg != 0);
ASSERT(db->db_level == 0);
ASSERT(!refcount_is_zero(&db->db_holds));
- ASSERT(!(db->db.db_object & DMU_PRIVATE_OBJECT) ||
+ ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
dmu_tx_private_ok(tx));
dbuf_noread(db);
@@ -1149,6 +1204,7 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
if (db->db_state == DB_FILL) {
if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+ ASSERT(db->db_blkid != DB_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
bzero(db->db.db_data, db->db.db_size);
@@ -1160,47 +1216,62 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&db->db_mtx);
}
-
-static void
+/*
+ * "Clear" the contents of this dbuf. This will mark the dbuf
+ * EVICTING and clear *most* of its references. Unfortunately,
+ * when we are not holding the dn_dbufs_mtx, we can't clear the
+ * entry in the dn_dbufs list. We have to wait until dbuf_destroy()
+ * in this case. For callers from the DMU we will usually see:
+ * dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
+ * For the arc callback, we will usually see:
+ * dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ * Sometimes, though, we will get a mix of these two:
+ * DMU: dbuf_clear()->arc_buf_evict()
+ * ARC: dbuf_do_evict()->dbuf_destroy()
+ */
+void
dbuf_clear(dmu_buf_impl_t *db)
{
dnode_t *dn = db->db_dnode;
+ dmu_buf_impl_t *parent = db->db_parent;
+ int dbuf_gone = FALSE;
- ASSERT(MUTEX_HELD(&dn->dn_dbufs_mtx));
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(refcount_is_zero(&db->db_holds));
+ dbuf_evict_user(db);
+
if (db->db_state == DB_CACHED) {
- ASSERT(db->db_buf != NULL);
- arc_buf_free(db->db_buf, db);
+ ASSERT(db->db.db_data != NULL);
+ if (db->db_blkid == DB_BONUS_BLKID)
+ zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
db->db.db_data = NULL;
- db->db_buf = NULL;
db->db_state = DB_UNCACHED;
}
ASSERT3U(db->db_state, ==, DB_UNCACHED);
- ASSERT(db->db_buf == NULL);
ASSERT(db->db_data_pending == NULL);
- mutex_exit(&db->db_mtx);
+ db->db_state = DB_EVICTING;
+ db->db_blkptr = NULL;
+
+ if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+ list_remove(&dn->dn_dbufs, db);
+ dnode_rele(dn, db);
+ }
+
+ if (db->db_buf)
+ dbuf_gone = arc_buf_evict(db->db_buf);
+
+ if (!dbuf_gone)
+ mutex_exit(&db->db_mtx);
/*
 * If this dbuf is referenced from an indirect dbuf,
* decrement the ref count on the indirect dbuf.
*/
- if (db->db_parent && db->db_parent != dn->dn_dbuf)
- dbuf_remove_ref(db->db_parent, db);
-
- /* remove from dn_dbufs */
- list_remove(&dn->dn_dbufs, db);
-
- dnode_rele(dn, db);
-
- dbuf_hash_remove(db);
-
- db->db_dnode = NULL;
- db->db_parent = NULL;
- db->db_blkptr = NULL;
+ if (parent && parent != dn->dn_dbuf)
+ dbuf_rele(parent, db);
}
static int
@@ -1209,6 +1280,8 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
{
int nlevels, epbs;
+ ASSERT(blkid != DB_BONUS_BLKID);
+
if (dn->dn_phys->dn_nlevels == 0)
nlevels = 1;
else
@@ -1218,12 +1291,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
ASSERT3U(level * epbs, <, 64);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
- if (blkid == DB_BONUS_BLKID) {
- /* this is the bonus buffer */
- *parentp = NULL;
- *bpp = NULL;
- return (0);
- } else if (level >= nlevels ||
+ if (level >= nlevels ||
(blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
/* the buffer has no parent yet */
*parentp = NULL;
@@ -1235,10 +1303,13 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
blkid >> epbs, fail_sparse, NULL, parentp);
if (err)
return (err);
- dbuf_read_havestruct(*parentp);
- *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
- (blkid & ((1ULL << epbs) - 1));
- return (0);
+ err = dbuf_read(*parentp, NULL,
+ (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+ if (err == 0) {
+ *bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+ (blkid & ((1ULL << epbs) - 1));
+ }
+ return (err);
} else {
/* the block is referenced from the dnode */
ASSERT3U(level, ==, nlevels-1);
@@ -1266,11 +1337,21 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db.db_object = dn->dn_object;
db->db_level = level;
db->db_blkid = blkid;
- db->db_state = DB_UNCACHED;
+ db->db_dirtied = 0;
+ db->db_dirtycnt = 0;
+ db->db_dnode = dn;
+ db->db_parent = parent;
+ db->db_blkptr = blkptr;
- if (db->db_blkid == DB_BONUS_BLKID) {
+ bzero(&db->db_d, sizeof (db->db_d));
+
+ if (blkid == DB_BONUS_BLKID) {
+ ASSERT3P(parent, ==, dn->dn_dbuf);
db->db.db_size = dn->dn_bonuslen;
db->db.db_offset = DB_BONUS_BLKID;
+ db->db_state = DB_UNCACHED;
+ /* the bonus dbuf is not placed in the hash table */
+ return (db);
} else {
int blocksize =
db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
@@ -1278,11 +1359,6 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
db->db.db_offset = db->db_blkid * blocksize;
}
- db->db_dirtied = 0;
- db->db_dirtycnt = 0;
-
- bzero(&db->db_d, sizeof (db->db_d));
-
/*
* Hold the dn_dbufs_mtx while we get the new dbuf
* in the hash table *and* added to the dbufs list.
@@ -1291,6 +1367,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
* dn_dbufs list.
*/
mutex_enter(&dn->dn_dbufs_mtx);
+ db->db_state = DB_EVICTING;
if ((odb = dbuf_hash_insert(db)) != NULL) {
/* someone else inserted it first */
kmem_cache_free(dbuf_cache, db);
@@ -1298,50 +1375,43 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
return (odb);
}
list_insert_head(&dn->dn_dbufs, db);
+ db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx);
if (parent && parent != dn->dn_dbuf)
dbuf_add_ref(parent, db);
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+ refcount_count(&dn->dn_holds) > 0);
(void) refcount_add(&dn->dn_holds, db);
- db->db_dnode = dn;
- db->db_parent = parent;
- db->db_blkptr = blkptr;
-
dprintf_dbuf(db, "db=%p\n", db);
return (db);
}
static int
-dbuf_evictable(dmu_buf_impl_t *db)
+dbuf_do_evict(void *private)
{
- int i;
-
- ASSERT(MUTEX_HELD(&db->db_mtx));
- DBUF_VERIFY(db);
+ arc_buf_t *buf = private;
+ dmu_buf_impl_t *db = buf->b_private;
- if (db->db_state != DB_UNCACHED && db->db_state != DB_CACHED)
- return (FALSE);
+ if (!MUTEX_HELD(&db->db_mtx))
+ mutex_enter(&db->db_mtx);
- if (!refcount_is_zero(&db->db_holds))
- return (FALSE);
+ ASSERT(db->db_buf == buf);
+ ASSERT(refcount_is_zero(&db->db_holds));
-#ifdef ZFS_DEBUG
- for (i = 0; i < TXG_SIZE; i++) {
- ASSERT(!list_link_active(&db->db_dirty_node[i]));
- ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
+ if (db->db_state != DB_EVICTING) {
+ ASSERT(db->db_state == DB_CACHED);
+ DBUF_VERIFY(db);
+ db->db_buf = NULL;
+ dbuf_evict(db);
+ } else {
+ mutex_exit(&db->db_mtx);
+ dbuf_destroy(db);
}
-#endif
-
- /*
- * Now we know we want to free it.
- * This call must be done last, since it has side effects -
- * calling the db_evict_func().
- */
- dbuf_evict_user(db);
- return (TRUE);
+ return (0);
}
static void
@@ -1349,9 +1419,36 @@ dbuf_destroy(dmu_buf_impl_t *db)
{
ASSERT(refcount_is_zero(&db->db_holds));
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ dnode_t *dn = db->db_dnode;
+
+ /*
+ * If this dbuf is still on the dn_dbufs list,
+ * remove it from that list.
+ */
+ if (list_link_active(&db->db_link)) {
+ int need_mutex;
+
+ ASSERT(!MUTEX_HELD(&dn->dn_dbufs_mtx));
+ need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+ if (need_mutex)
+ mutex_enter(&dn->dn_dbufs_mtx);
+
+ /* remove from dn_dbufs */
+ list_remove(&dn->dn_dbufs, db);
+
+ if (need_mutex)
+ mutex_exit(&dn->dn_dbufs_mtx);
+
+ dnode_rele(dn, db);
+ }
+ dbuf_hash_remove(db);
+ }
+ db->db_parent = NULL;
+ db->db_dnode = NULL;
+ db->db_buf = NULL;
+
ASSERT(db->db.db_data == NULL);
- ASSERT(db->db_dnode == NULL);
- ASSERT(db->db_parent == NULL);
ASSERT(db->db_hash_next == NULL);
ASSERT(db->db_blkptr == NULL);
ASSERT(db->db_data_pending == NULL);
@@ -1384,14 +1481,21 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) {
if (bp && !BP_IS_HOLE(bp)) {
+ zbookmark_t zb;
+ zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
+ dn->dn_objset->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = dn->dn_object;
+ zb.zb_level = 0;
+ zb.zb_blkid = blkid;
+
(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
dmu_ot[dn->dn_type].ot_byteswap,
NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
- (ARC_NOWAIT | ARC_PREFETCH));
+ (ARC_NOWAIT | ARC_PREFETCH), &zb);
}
if (parent && parent != dn->dn_dbuf)
- dbuf_rele(parent);
+ dbuf_rele(parent, NULL);
}
}
@@ -1405,11 +1509,12 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
{
dmu_buf_impl_t *db, *parent = NULL;
+ ASSERT(blkid != DB_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT3U(dn->dn_nlevels, >, level);
*dbp = NULL;
-
+top:
/* dbuf_find() returns with db_mtx held */
db = dbuf_find(dn, level, blkid);
@@ -1423,13 +1528,26 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
err = ENOENT;
if (err) {
if (parent && parent != dn->dn_dbuf)
- dbuf_rele(parent);
+ dbuf_rele(parent, NULL);
return (err);
}
}
+ if (err && err != ENOENT)
+ return (err);
db = dbuf_create(dn, level, blkid, parent, bp);
}
+ if (db->db_buf && refcount_is_zero(&db->db_holds)) {
+ arc_buf_add_ref(db->db_buf, db);
+ if (db->db_buf->b_data == NULL) {
+ dbuf_clear(db);
+ goto top;
+ }
+ ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+ }
+
+ ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
+
/*
 * If this buffer is currently syncing out, and we
* are still referencing it from db_data, we need to make
@@ -1437,7 +1555,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
* again in this txg.
*/
if (db->db_level == 0 && db->db_state == DB_CACHED &&
- !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+ dn->dn_object != DMU_META_DNODE_OBJECT &&
db->db_data_pending == db->db_buf) {
int size = (db->db_blkid == DB_BONUS_BLKID) ?
DN_MAX_BONUSLEN : db->db.db_size;
@@ -1448,14 +1566,14 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
db->db.db_size);
}
- dbuf_add_ref(db, tag);
+ (void) refcount_add(&db->db_holds, tag);
dbuf_update_data(db);
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);
/* NOTE: we can't rele the parent until after we drop the db_mtx */
if (parent && parent != dn->dn_dbuf)
- dbuf_rele(parent);
+ dbuf_rele(parent, NULL);
ASSERT3P(db->db_dnode, ==, dn);
ASSERT3U(db->db_blkid, ==, blkid);
@@ -1466,81 +1584,83 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
}
dmu_buf_impl_t *
-dbuf_hold(dnode_t *dn, uint64_t blkid)
+dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
{
dmu_buf_impl_t *db;
- (void) dbuf_hold_impl(dn, 0, blkid, FALSE, NULL, &db);
- return (db);
+ int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
+ return (err ? NULL : db);
}
dmu_buf_impl_t *
dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
{
dmu_buf_impl_t *db;
- (void) dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
- return (db);
+ int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+ return (err ? NULL : db);
}
dmu_buf_impl_t *
-dbuf_hold_bonus(dnode_t *dn, void *tag)
+dbuf_create_bonus(dnode_t *dn)
{
- dmu_buf_impl_t *db;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- (void) dbuf_hold_impl(dn, 0, DB_BONUS_BLKID, FALSE, tag, &db);
- rw_exit(&dn->dn_struct_rwlock);
+ dmu_buf_impl_t *db = dn->dn_bonus;
+
+ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+ ASSERT(dn->dn_bonus == NULL);
+ db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
return (db);
}
+#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
{
- (void) refcount_add(&db->db_holds, tag);
- /* dprintf_dbuf(db, "adding ref %p; holds up to %lld\n", tag, holds); */
+ int64_t holds = refcount_add(&db->db_holds, tag);
+ ASSERT(holds > 1);
}
+#pragma weak dmu_buf_rele = dbuf_rele
void
-dbuf_remove_ref(dmu_buf_impl_t *db, void *tag)
+dbuf_rele(dmu_buf_impl_t *db, void *tag)
{
int64_t holds;
- dnode_t *dn = db->db_dnode;
- int need_mutex;
-
- ASSERT(dn != NULL);
- need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
-
- if (need_mutex) {
- dnode_add_ref(dn, FTAG);
- mutex_enter(&dn->dn_dbufs_mtx);
- }
mutex_enter(&db->db_mtx);
DBUF_VERIFY(db);
holds = refcount_remove(&db->db_holds, tag);
+ ASSERT(holds >= 0);
+
+ if (holds == db->db_dirtycnt &&
+ db->db_level == 0 && db->db_d.db_immediate_evict)
+ dbuf_evict_user(db);
if (holds == 0) {
- ASSERT3U(db->db_state, !=, DB_FILL);
- if (db->db_level == 0 &&
- db->db_d.db_user_data_ptr_ptr != NULL)
- *db->db_d.db_user_data_ptr_ptr = NULL;
- dbuf_evict(db);
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ mutex_exit(&db->db_mtx);
+ dnode_rele(db->db_dnode, db);
+ } else if (db->db_buf == NULL) {
+ /*
+ * This is a special case: we never associated this
+ * dbuf with any data allocated from the ARC.
+ */
+ ASSERT3U(db->db_state, ==, DB_UNCACHED);
+ dbuf_evict(db);
+ } else if (arc_released(db->db_buf)) {
+ arc_buf_t *buf = db->db_buf;
+ /*
+ * This dbuf has anonymous data associated with it.
+ */
+ dbuf_set_data(db, NULL);
+ VERIFY(arc_buf_remove_ref(buf, db) == 1);
+ dbuf_evict(db);
+ } else {
+ VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
+ mutex_exit(&db->db_mtx);
+ }
} else {
- if (holds == db->db_dirtycnt &&
- db->db_level == 0 && db->db_d.db_immediate_evict)
- dbuf_evict_user(db);
mutex_exit(&db->db_mtx);
}
-
- if (need_mutex) {
- mutex_exit(&dn->dn_dbufs_mtx);
- dnode_rele(dn, FTAG);
- }
-}
-
-void
-dbuf_rele(dmu_buf_impl_t *db)
-{
- dbuf_remove_ref(db, NULL);
}
#pragma weak dmu_buf_refcount = dbuf_refcount
@@ -1611,6 +1731,8 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
dnode_t *dn = db->db_dnode;
objset_impl_t *os = dn->dn_objset;
int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int checksum, compress;
+ zbookmark_t zb;
int blksz;
ASSERT(dmu_tx_is_syncing(tx));
@@ -1638,8 +1760,38 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
* be modified yet.
*/
+ if (db->db_blkid == DB_BONUS_BLKID) {
+ void **datap = &db->db_d.db_data_old[txg&TXG_MASK];
+ /*
+ * Simply copy the bonus data into the dnode. It will
+ * be written out when the dnode is synced (and it will
+ * be synced, since it must have been dirty for dbuf_sync
+ * to be called).
+ */
+ /*
+ * Use dn_phys->dn_bonuslen since db.db_size is the length
+ * of the bonus buffer in the open transaction rather than
+ * the syncing transaction.
+ */
+ ASSERT(*datap != NULL);
+ ASSERT3U(db->db_level, ==, 0);
+ ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ if (*datap != db->db.db_data)
+ zio_buf_free(*datap, DN_MAX_BONUSLEN);
+ db->db_d.db_data_old[txg&TXG_MASK] = NULL;
+ db->db_data_pending = NULL;
+ if (db->db_dirtied == txg)
+ db->db_dirtied = 0;
+ ASSERT(db->db_dirtycnt > 0);
+ db->db_dirtycnt -= 1;
+ mutex_exit(&db->db_mtx);
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+ return;
+ }
+
if (db->db_level == 0) {
- data = &db->db_d.db_data_old[txg&TXG_MASK];
+ data = (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
blksz = arc_buf_size(*data);
/*
* If this buffer is currently "in use" (i.e., there are
@@ -1651,17 +1803,15 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
* modified in the syncing context (e.g. DNONE_DNODE blocks)
* or if there is no actual write involved (bonus blocks).
*/
- if (!(dn->dn_object & DMU_PRIVATE_OBJECT) &&
- db->db_d.db_overridden_by[txg&TXG_MASK] == NULL &&
- db->db_blkid != DB_BONUS_BLKID) {
+ if (dn->dn_object != DMU_META_DNODE_OBJECT &&
+ db->db_d.db_overridden_by[txg&TXG_MASK] == NULL) {
if (refcount_count(&db->db_holds) > 1 &&
*data == db->db_buf) {
- *data = arc_buf_alloc(
- db->db_dnode->dn_objset->os_spa, blksz, db);
+ *data = arc_buf_alloc(os->os_spa, blksz, db);
bcopy(db->db.db_data, (*data)->b_data, blksz);
}
db->db_data_pending = *data;
- } else if (dn->dn_object & DMU_PRIVATE_OBJECT) {
+ } else if (dn->dn_object == DMU_META_DNODE_OBJECT) {
/*
* Private object buffers are released here rather
* than in dbuf_dirty() since they are only modified
@@ -1683,7 +1833,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
mutex_exit(&db->db_mtx);
- dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+ dbuf_rele(db, (void *)(uintptr_t)txg);
return;
}
blksz = db->db.db_size;
@@ -1692,35 +1842,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
ASSERT(*data != NULL);
- if (db->db_blkid == DB_BONUS_BLKID) {
- /*
- * Simply copy the bonus data into the dnode. It will
- * be written out when the dnode is synced (and it will
- * be synced, since it must have been dirty for dbuf_sync
- * to be called). The bonus data will be byte swapped
- * in dnode_byteswap.
- */
- /*
- * Use dn_phys->dn_bonuslen since db.db_size is the length
- * of the bonus buffer in the open transaction rather than
- * the syncing transaction.
- */
- ASSERT3U(db->db_level, ==, 0);
- ASSERT3U(dn->dn_phys->dn_bonuslen, <=, blksz);
- bcopy((*data)->b_data, DN_BONUS(dn->dn_phys),
- dn->dn_phys->dn_bonuslen);
- if (*data != db->db_buf)
- arc_buf_free(*data, db);
- db->db_d.db_data_old[txg&TXG_MASK] = NULL;
- db->db_data_pending = NULL;
- if (db->db_dirtied == txg)
- db->db_dirtied = 0;
- ASSERT(db->db_dirtycnt > 0);
- db->db_dirtycnt -= 1;
- mutex_exit(&db->db_mtx);
- dbuf_remove_ref(db, (void *)(uintptr_t)txg);
- return;
- } else if (db->db_level > 0 && !arc_released(db->db_buf)) {
+ if (db->db_level > 0 && !arc_released(db->db_buf)) {
/*
* This indirect buffer was marked dirty, but
* never modified (if it had been modified, then
@@ -1733,7 +1855,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
mutex_exit(&db->db_mtx);
- dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+ dbuf_rele(db, (void *)(uintptr_t)txg);
return;
} else if (db->db_blkptr == NULL &&
db->db_level == dn->dn_phys->dn_nlevels-1 &&
@@ -1757,18 +1879,18 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
if (parent == NULL) {
rw_enter(&dn->dn_struct_rwlock, RW_READER);
(void) dbuf_hold_impl(dn, db->db_level+1,
- db->db_blkid >> epbs, FALSE, NULL, &parent);
+ db->db_blkid >> epbs, FALSE, FTAG, &parent);
rw_exit(&dn->dn_struct_rwlock);
dbuf_add_ref(parent, db);
db->db_parent = parent;
- dbuf_rele(parent);
+ dbuf_rele(parent, FTAG);
}
- dbuf_read(parent);
+ (void) dbuf_read(parent, NULL, DB_RF_MUST_SUCCEED);
} else {
mutex_exit(&db->db_mtx);
}
- ASSERT(IS_DNODE_DNODE(dn->dn_object) || db->db_parent != NULL);
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || db->db_parent != NULL);
if (db->db_level > 0 &&
db->db_blkid > dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) {
@@ -1801,7 +1923,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
mutex_enter(&db->db_mtx);
db->db_dirtycnt -= 1;
mutex_exit(&db->db_mtx);
- dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+ dbuf_rele(db, (void *)(uintptr_t)txg);
return;
}
@@ -1812,20 +1934,17 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
ASSERT(db->db_level == parent->db_level-1);
ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
/*
- * We may have read this block after we dirtied it,
+ * We may have read this indirect block after we dirtied it,
 * so we never released it from the cache.
*/
- arc_release(parent->db_buf, parent);
+ arc_release(parent->db_buf, db->db_parent);
db->db_blkptr = (blkptr_t *)parent->db.db_data +
(db->db_blkid & ((1ULL << epbs) - 1));
DBUF_VERIFY(db);
mutex_exit(&db->db_mtx);
- }
- ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
-
#ifdef ZFS_DEBUG
- if (db->db_parent == dn->dn_dbuf) {
+ } else {
/*
* We don't need to dnode_setdirty(dn) because if we got
* here then the parent is already dirty.
@@ -1833,11 +1952,14 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
ASSERT3P(db->db_blkptr, ==,
&dn->dn_phys->dn_blkptr[db->db_blkid]);
- }
#endif
+ }
+ ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
if (db->db_level == 0 &&
db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
- arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+ arc_buf_t **old =
+ (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
int old_size = BP_GET_ASIZE(db->db_blkptr);
int new_size = BP_GET_ASIZE(*bpp);
@@ -1861,7 +1983,11 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
*bpp = NULL;
if (*old != db->db_buf)
- arc_buf_free(*old, db);
+ VERIFY(arc_buf_remove_ref(*old, db) == 1);
+ else if (!BP_IS_HOLE(db->db_blkptr))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ else
+ ASSERT(arc_released(db->db_buf));
*old = NULL;
db->db_data_pending = NULL;
@@ -1870,54 +1996,55 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
ASSERT(db->db_dirtycnt > 0);
db->db_dirtycnt -= 1;
mutex_exit(&db->db_mtx);
- dbuf_remove_ref(db, (void *)(uintptr_t)txg);
- } else {
- int checksum, compress;
+ dbuf_rele(db, (void *)(uintptr_t)txg);
+ return;
+ }
- if (db->db_level > 0) {
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- checksum = ZIO_CHECKSUM_FLETCHER_4;
- /* XXX - disable compresssion for now */
- compress = ZIO_COMPRESS_OFF;
+ if (db->db_level > 0) {
+ /*
+ * XXX -- we should design a compression algorithm
+ * that specializes in arrays of bps.
+ */
+ checksum = ZIO_CHECKSUM_FLETCHER_4;
+ compress = ZIO_COMPRESS_LZJB;
+ } else {
+ /*
+ * Allow dnode settings to override objset settings,
+ * except for metadata checksums.
+ */
+ if (dmu_ot[dn->dn_type].ot_metadata) {
+ checksum = os->os_md_checksum;
+ compress = zio_compress_select(dn->dn_compress,
+ os->os_md_compress);
} else {
- /*
- * Allow dnode settings to override objset settings,
- * except for metadata checksums.
- */
- if (dmu_ot[dn->dn_type].ot_metadata) {
- checksum = os->os_md_checksum;
- compress = zio_compress_select(dn->dn_compress,
- os->os_md_compress);
- } else {
- checksum = zio_checksum_select(dn->dn_checksum,
- os->os_checksum);
- compress = zio_compress_select(dn->dn_compress,
- os->os_compress);
- }
+ checksum = zio_checksum_select(dn->dn_checksum,
+ os->os_checksum);
+ compress = zio_compress_select(dn->dn_compress,
+ os->os_compress);
}
+ }
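+	/*
+	 * Example of the selection above (values illustrative): a
+	 * plain-file dnode with dn_checksum = ZIO_CHECKSUM_INHERIT
+	 * takes the objset default os_checksum, while a dnode-level
+	 * ZIO_CHECKSUM_SHA256 would override it; metadata always takes
+	 * os_md_checksum, with only its compression negotiable per
+	 * dnode.
+	 */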
#ifdef ZFS_DEBUG
- if (db->db_parent) {
- ASSERT(list_link_active(
- &db->db_parent->db_dirty_node[txg&TXG_MASK]));
- ASSERT(db->db_parent == dn->dn_dbuf ||
- db->db_parent->db_level > 0);
- if (dn->dn_object & DMU_PRIVATE_OBJECT ||
- db->db_level > 0)
- ASSERT(*data == db->db_buf);
- }
-#endif
- ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
- (void) arc_write(zio, os->os_spa, checksum, compress, txg,
- db->db_blkptr, *data, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT);
- /*
- * We can't access db after arc_write, since it could finish
- * and be freed, and we have no locks on it.
- */
+ if (db->db_parent) {
+ ASSERT(list_link_active(
+ &db->db_parent->db_dirty_node[txg&TXG_MASK]));
+ ASSERT(db->db_parent == dn->dn_dbuf ||
+ db->db_parent->db_level > 0);
+ if (dn->dn_object == DMU_META_DNODE_OBJECT || db->db_level > 0)
+ ASSERT(*data == db->db_buf);
}
+#endif
+ ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
+ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
+ (void) arc_write(zio, os->os_spa, checksum, compress, txg,
+ db->db_blkptr, *data, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb);
+ /*
+ * We can't access db after arc_write, since it could finish
+ * and be freed, and we have no locks on it.
+ */
}
struct dbuf_arg {
@@ -1970,12 +2097,17 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db_dirtied = 0;
if (db->db_level == 0) {
- arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+ arc_buf_t **old =
+ (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
ASSERT(db->db_blkid != DB_BONUS_BLKID);
if (*old != db->db_buf)
- arc_buf_free(*old, db);
+ VERIFY(arc_buf_remove_ref(*old, db) == 1);
+ else if (!BP_IS_HOLE(db->db_blkptr))
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
+ else
+ ASSERT(arc_released(db->db_buf));
*old = NULL;
db->db_data_pending = NULL;
@@ -2007,6 +2139,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
db->db.db_size);
ASSERT3U(dn->dn_phys->dn_maxblkid
>> (db->db_level * epbs), >=, db->db_blkid);
+ arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
if (BP_IS_HOLE(bp))
@@ -2053,5 +2186,5 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
}
}
- dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+ dbuf_rele(db, (void *)(uintptr_t)txg);
}
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 14fab6d420..f883842dad 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -40,6 +39,7 @@
#include <sys/dmu_zfetch.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
+#include <sys/zio_checksum.h>
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint8_array, TRUE, "unallocated" },
@@ -70,101 +70,40 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint8_array, FALSE, "other uint8[]" },
{ byteswap_uint64_array, FALSE, "other uint64[]" },
{ zap_byteswap, TRUE, "other ZAP" },
+ { zap_byteswap, TRUE, "persistent error log" },
};
-static int
-dmu_buf_read_array_impl(dmu_buf_impl_t **dbp, int numbufs, uint32_t flags)
-{
- int i, err = 0;
- dnode_t *dn;
- zio_t *zio;
- int canfail;
- uint64_t rd_sz;
-
- if (numbufs == 0)
- return (0);
-
- rd_sz = numbufs * dbp[0]->db.db_size;
- ASSERT(rd_sz <= DMU_MAX_ACCESS);
-
- dn = dbp[0]->db_dnode;
- if (flags & DB_RF_CANFAIL) {
- canfail = 1;
- } else {
- canfail = 0;
- }
- zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, canfail);
-
-	/* don't prefetch if the read is large */
- if (rd_sz >= zfetch_array_rd_sz) {
- flags |= DB_RF_NOPREFETCH;
- }
-
- /* initiate async reads */
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- for (i = 0; i < numbufs; i++) {
- if (dbp[i]->db_state == DB_UNCACHED)
- dbuf_read_impl(dbp[i], zio, flags);
- }
- rw_exit(&dn->dn_struct_rwlock);
- err = zio_wait(zio);
-
- if (err)
- return (err);
-
- /* wait for other io to complete */
- for (i = 0; i < numbufs; i++) {
- mutex_enter(&dbp[i]->db_mtx);
- while (dbp[i]->db_state == DB_READ ||
- dbp[i]->db_state == DB_FILL)
- cv_wait(&dbp[i]->db_changed, &dbp[i]->db_mtx);
- ASSERT(dbp[i]->db_state == DB_CACHED);
- mutex_exit(&dbp[i]->db_mtx);
- }
-
- return (0);
-}
-
-void
-dmu_buf_read_array(dmu_buf_t **dbp_fake, int numbufs)
-{
- dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
- int err;
-
- err = dmu_buf_read_array_impl(dbp, numbufs, DB_RF_MUST_SUCCEED);
- ASSERT(err == 0);
-}
-
int
-dmu_buf_read_array_canfail(dmu_buf_t **dbp_fake, int numbufs)
-{
- dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
-
- return (dmu_buf_read_array_impl(dbp, numbufs, DB_RF_CANFAIL));
-}
-
-dmu_buf_t *
-dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset)
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
uint64_t blkid;
dmu_buf_impl_t *db;
+ int err;
/* dataset_verify(dd); */
- dn = dnode_hold(os->os, object, FTAG);
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
blkid = dbuf_whichblock(dn, offset);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
- db = dbuf_hold(dn, blkid);
+ db = dbuf_hold(dn, blkid, tag);
rw_exit(&dn->dn_struct_rwlock);
- dnode_rele(dn, FTAG);
- return (&db->db);
-}
+ if (db == NULL) {
+ err = EIO;
+ } else {
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+ if (err) {
+ dbuf_rele(db, tag);
+ db = NULL;
+ }
+ }
-dmu_buf_t *
-dmu_bonus_hold(objset_t *os, uint64_t object)
-{
- return (dmu_bonus_hold_tag(os, object, NULL));
+ dnode_rele(dn, FTAG);
+ *dbp = &db->db;
+ return (err);
}
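
A hedged caller-side sketch of the reworked interface: dmu_buf_hold() now takes a tag and returns an errno rather than a possibly-invalid buffer, and the release names the same tag. The helper below is hypothetical and assumes the kernel context:

/*
 * Illustrative only: hold, read, release, with the same tag on
 * both ends of the hold.
 */
static int
read_first_word(objset_t *os, uint64_t object, uint64_t *valp)
{
	dmu_buf_t *db;
	int err;

	err = dmu_buf_hold(os, object, 0, FTAG, &db);
	if (err)
		return (err);		/* ENOENT, EIO, ... */
	*valp = *(uint64_t *)db->db_data;
	dmu_buf_rele(db, FTAG);
	return (0);
}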
int
@@ -174,41 +113,69 @@ dmu_bonus_max(void)
}
/*
- * Returns held bonus buffer if the object exists, NULL if it doesn't.
+ * returns ENOENT, EIO, or 0.
*/
-dmu_buf_t *
-dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag)
+int
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ dnode_t *dn;
+ int err, count;
dmu_buf_impl_t *db;
- if (dn == NULL)
- return (NULL);
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
- db = dbuf_hold_bonus(dn, tag);
- /* XXX - hack: hold the first block if this is a ZAP object */
- if (dmu_ot[dn->dn_type].ot_byteswap == zap_byteswap) {
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- dn->dn_db0 = dbuf_hold(dn, 0);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ if (dn->dn_bonus == NULL) {
rw_exit(&dn->dn_struct_rwlock);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus == NULL)
+ dn->dn_bonus = dbuf_create_bonus(dn);
}
+ db = dn->dn_bonus;
+ rw_exit(&dn->dn_struct_rwlock);
+ mutex_enter(&db->db_mtx);
+ count = refcount_add(&db->db_holds, tag);
+ mutex_exit(&db->db_mtx);
+ if (count == 1)
+ dnode_add_ref(dn, db);
dnode_rele(dn, FTAG);
- return (&db->db);
+
+ VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
+
+ *dbp = &db->db;
+ return (0);
}
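
The lazy creation of dn_bonus above uses the classic drop-and-recheck idiom: a reader lock cannot be upgraded atomically, so it is dropped, the writer lock is taken, and the condition is tested again. A standalone pthread model of the same idiom (userland, hypothetical names):

#include <pthread.h>
#include <stdlib.h>

static pthread_rwlock_t lock = PTHREAD_RWLOCK_INITIALIZER;
static void *bonus;		/* lazily created, like dn_bonus above */

static void *
bonus_get(void)
{
	void *b;

	pthread_rwlock_rdlock(&lock);
	if (bonus == NULL) {
		/*
		 * Can't upgrade in place: drop the read lock, take
		 * the write lock, and re-check, since another thread
		 * may have created the buffer in the window.
		 */
		pthread_rwlock_unlock(&lock);
		pthread_rwlock_wrlock(&lock);
		if (bonus == NULL)
			bonus = malloc(64);
	}
	b = bonus;
	pthread_rwlock_unlock(&lock);
	return (b);
}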
-static dmu_buf_t **
-dbuf_hold_array(dnode_t *dn,
- uint64_t offset, uint64_t length, int *numbufsp)
+int
+dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
+ dnode_t *dn;
dmu_buf_t **dbp;
uint64_t blkid, nblks, i;
+ uint32_t flags;
+ int err;
+ zio_t *zio;
+
+ ASSERT(length <= DMU_MAX_ACCESS);
if (length == 0) {
if (numbufsp)
*numbufsp = 0;
- return (NULL);
+ *dbpp = NULL;
+ return (0);
}
+ flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT;
+ if (length >= zfetch_array_rd_sz)
+ flags |= DB_RF_NOPREFETCH;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_datablkshift) {
int blkshift = dn->dn_datablkshift;
@@ -218,83 +185,62 @@ dbuf_hold_array(dnode_t *dn,
ASSERT3U(offset + length, <=, dn->dn_datablksz);
nblks = 1;
}
- dbp = kmem_alloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+ dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
+ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
blkid = dbuf_whichblock(dn, offset);
for (i = 0; i < nblks; i++) {
- dmu_buf_impl_t *dbuf;
- dbuf = dbuf_hold(dn, blkid+i);
- dbp[i] = &dbuf->db;
+ dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
+ if (db == NULL) {
+ rw_exit(&dn->dn_struct_rwlock);
+ dmu_buf_rele_array(dbp, nblks, tag);
+ dnode_rele(dn, FTAG);
+ zio_nowait(zio);
+ return (EIO);
+ }
+ /* initiate async i/o */
+ if (read && db->db_state == DB_UNCACHED) {
+ rw_exit(&dn->dn_struct_rwlock);
+ (void) dbuf_read(db, zio, flags);
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ }
+ dbp[i] = &db->db;
}
rw_exit(&dn->dn_struct_rwlock);
-
- if (numbufsp)
- *numbufsp = nblks;
- return (dbp);
-}
-
-dmu_buf_t **
-dmu_buf_hold_array(objset_t *os, uint64_t object,
- uint64_t offset, uint64_t length, int *numbufsp)
-{
- dnode_t *dn;
- dmu_buf_t **dbp;
-
- ASSERT(length <= DMU_MAX_ACCESS);
-
- if (length == 0) {
- if (numbufsp)
- *numbufsp = 0;
- return (NULL);
- }
-
- dn = dnode_hold(os->os, object, FTAG);
- dbp = dbuf_hold_array(dn, offset, length, numbufsp);
dnode_rele(dn, FTAG);
- return (dbp);
-}
-
-void
-dmu_buf_add_ref(dmu_buf_t *dbuf, void *tag)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
- dbuf_add_ref(db, tag);
-}
-
-void
-dmu_buf_remove_ref(dmu_buf_t *dbuf, void *tag)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
- dbuf_remove_ref(db, tag);
-}
-
-void
-dmu_buf_rele(dmu_buf_t *dbuf_fake)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;
-
- /* XXX - hack: hold the first block if this is a ZAP object */
- if (db->db_blkid == DB_BONUS_BLKID &&
- dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap)
- dbuf_rele(db->db_dnode->dn_db0);
- dbuf_rele(db);
-}
+ /* wait for async i/o */
+ err = zio_wait(zio);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
-void
-dmu_buf_rele_tag(dmu_buf_t *dbuf_fake, void *tag)
-{
- dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake;
+ /* wait for other io to complete */
+ if (read) {
+ for (i = 0; i < nblks; i++) {
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
+ mutex_enter(&db->db_mtx);
+ while (db->db_state == DB_READ ||
+ db->db_state == DB_FILL)
+ cv_wait(&db->db_changed, &db->db_mtx);
+ if (db->db_state == DB_UNCACHED)
+ err = EIO;
+ mutex_exit(&db->db_mtx);
+ if (err) {
+ dmu_buf_rele_array(dbp, nblks, tag);
+ return (err);
+ }
+ }
+ }
- /* XXX - hack: hold the first block if this is a ZAP object */
- if (db->db_blkid == DB_BONUS_BLKID &&
- dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap)
- dbuf_rele(db->db_dnode->dn_db0);
- dbuf_remove_ref(db, tag);
+ *numbufsp = nblks;
+ *dbpp = dbp;
+ return (0);
}
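
The array variant follows the same tag convention: on success the caller owns numbufs held buffers and releases them all at once, and on failure nothing remains held. A hypothetical caller sketch (kernel context assumed):

/* Illustrative only: hold a range, walk the buffers, release all. */
static int
sum_sizes(objset_t *os, uint64_t object, uint64_t off, uint64_t len,
    uint64_t *sump)
{
	dmu_buf_t **dbp;
	int numbufs, i, err;

	err = dmu_buf_hold_array(os, object, off, len, TRUE /* read */,
	    FTAG, &numbufs, &dbp);
	if (err)
		return (err);
	*sump = 0;
	for (i = 0; i < numbufs; i++)
		*sump += dbp[i]->db_size;
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (0);
}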
void
-dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs)
+dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
{
int i;
dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
@@ -302,10 +248,10 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs)
if (numbufs == 0)
return;
- ASSERT((numbufs * dbp[0]->db.db_size) <= DMU_MAX_ACCESS);
-
- for (i = 0; i < numbufs; i++)
- dbuf_rele(dbp[i]);
+ for (i = 0; i < numbufs; i++) {
+ if (dbp[i])
+ dbuf_rele(dbp[i], tag);
+ }
kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
}
@@ -315,7 +261,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
{
dnode_t *dn;
uint64_t blkid;
- int nblks, i;
+ int nblks, i, err;
if (len == 0) { /* they're interested in the bonus buffer */
dn = os->os->os_meta_dnode;
@@ -335,8 +281,8 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
* already cached, we will do a *synchronous* read in the
* dnode_hold() call. The same is true for any indirects.
*/
- dn = dnode_hold(os->os, object, FTAG);
- if (dn == NULL)
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err != 0)
return;
rw_enter(&dn->dn_struct_rwlock, RW_READER);
@@ -359,39 +305,44 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
dnode_rele(dn, FTAG);
}
-void
+int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ dnode_t *dn;
+ int err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
ASSERT(offset < UINT64_MAX);
ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
dnode_free_range(dn, offset, size, tx);
dnode_rele(dn, FTAG);
+ return (0);
}
-static int
-dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf, uint32_t flags)
+int
+dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+ void *buf)
{
dnode_t *dn;
dmu_buf_t **dbp;
- int numbufs, i;
-
- dn = dnode_hold(os->os, object, FTAG);
+ int numbufs, i, err;
+	err = dnode_hold(os->os, object, FTAG, &dn);
+	if (err)
+		return (err);
+
+	/*
+	 * Deal with odd block sizes, where there can't be data past the
+	 * first block.
+	 */
if (dn->dn_datablkshift == 0) {
int newsz = offset > dn->dn_datablksz ? 0 :
MIN(size, dn->dn_datablksz - offset);
bzero((char *)buf + newsz, size - newsz);
size = newsz;
}
-
dnode_rele(dn, FTAG);
- if (size == 0)
- return (0);
-
while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
int err;
@@ -400,13 +351,10 @@ dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
* NB: we could do this block-at-a-time, but it's nice
* to be reading in parallel.
*/
- dbp = dmu_buf_hold_array(os, object, offset, mylen, &numbufs);
- err = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp, numbufs,
- flags);
- if (err) {
- dmu_buf_rele_array(dbp, numbufs);
+ err = dmu_buf_hold_array(os, object, offset, mylen,
+ TRUE, FTAG, &numbufs, &dbp);
+ if (err)
return (err);
- }
for (i = 0; i < numbufs; i++) {
int tocpy;
@@ -424,36 +372,20 @@ dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
size -= tocpy;
buf = (char *)buf + tocpy;
}
- dmu_buf_rele_array(dbp, numbufs);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
}
return (0);
}
void
-dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf)
-{
- int err;
-
- err = dmu_read_impl(os, object, offset, size, buf, DB_RF_MUST_SUCCEED);
- ASSERT3U(err, ==, 0);
-}
-
-int
-dmu_read_canfail(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
- void *buf)
-{
- return (dmu_read_impl(os, object, offset, size, buf, DB_RF_CANFAIL));
-}
-
-void
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx)
{
dmu_buf_t **dbp;
int numbufs, i;
- dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);
+ VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp));
for (i = 0; i < numbufs; i++) {
int tocpy;
@@ -481,7 +413,7 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
size -= tocpy;
buf = (char *)buf + tocpy;
}
- dmu_buf_rele_array(dbp, numbufs);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
}
#ifdef _KERNEL
@@ -493,7 +425,10 @@ dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
int numbufs, i;
int err = 0;
- dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs);
+ err = dmu_buf_hold_array(os, object, offset, size,
+ FALSE, FTAG, &numbufs, &dbp);
+ if (err)
+ return (err);
for (i = 0; i < numbufs; i++) {
int tocpy;
@@ -530,7 +465,7 @@ dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
offset += tocpy;
size -= tocpy;
}
- dmu_buf_rele_array(dbp, numbufs);
+ dmu_buf_rele_array(dbp, numbufs, FTAG);
return (err);
}
#endif
@@ -539,6 +474,7 @@ struct backuparg {
dmu_replay_record_t *drr;
vnode_t *vp;
objset_t *os;
+ zio_cksum_t zc;
int err;
};
@@ -546,8 +482,9 @@ static int
dump_bytes(struct backuparg *ba, void *buf, int len)
{
ssize_t resid; /* have to get resid to get detailed errno */
- /* Need to compute checksum here */
ASSERT3U(len % 8, ==, 0);
+
+ fletcher_4_incremental_native(buf, len, &ba->zc);
ba->err = vn_rdwr(UIO_WRITE, ba->vp,
(caddr_t)buf, len,
0, UIO_SYSSPACE, FAPPEND, RLIM_INFINITY, CRED(), &resid);
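
dump_bytes() now folds every outgoing buffer into ba->zc, so the DRR_END record can carry a checksum of the whole stream. This works because fletcher-4 is incremental: folding the stream chunk by chunk yields the same four 64-bit words as a single pass. A self-contained userland model of that property (simplified; the real routines live in fletcher.c):

#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>

typedef struct { uint64_t zc_word[4]; } cksum_t;

/* Fold `size' bytes (a multiple of 4) into a running fletcher-4. */
static void
fletcher4_incr(const void *buf, size_t size, cksum_t *zcp)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a = zcp->zc_word[0], b = zcp->zc_word[1];
	uint64_t c = zcp->zc_word[2], d = zcp->zc_word[3];

	for (; ip < ipend; ip++) {
		a += *ip;
		b += a;
		c += b;
		d += c;
	}
	zcp->zc_word[0] = a; zcp->zc_word[1] = b;
	zcp->zc_word[2] = c; zcp->zc_word[3] = d;
}

int
main(void)
{
	uint32_t stream[16];
	cksum_t whole = { { 0 } }, chunked = { { 0 } };

	memset(stream, 0xab, sizeof (stream));
	fletcher4_incr(stream, sizeof (stream), &whole);	/* one pass */
	fletcher4_incr(stream, 24, &chunked);			/* two chunks */
	fletcher4_incr((const uint8_t *)stream + 24, 40, &chunked);
	assert(memcmp(&whole, &chunked, sizeof (whole)) == 0);
	(void) printf("incremental == one-shot\n");
	return (0);
}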
@@ -652,7 +589,7 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
void *data = bc->bc_data;
int err = 0;
- if (issig(JUSTLOOKING))
+ if (issig(JUSTLOOKING) && issig(FORREAL))
return (EINTR);
ASSERT(data || bp == NULL);
@@ -681,16 +618,21 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg)
int blksz = BP_GET_LSIZE(bp);
if (data == NULL) {
arc_buf_t *abuf;
+ zbookmark_t zb;
+ zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object;
+ zb.zb_object = object;
+ zb.zb_level = level;
+ zb.zb_blkid = blkid;
(void) arc_read(NULL, spa, bp,
dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf,
ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED,
- ARC_WAIT);
+ ARC_WAIT, &zb);
if (abuf) {
err = dump_data(ba, type, object, blkid * blksz,
blksz, abuf->b_data);
- arc_buf_free(abuf, &abuf);
+ (void) arc_buf_remove_ref(abuf, &abuf);
}
} else {
err = dump_data(ba, type, object, blkid * blksz,
@@ -736,6 +678,7 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp)
ba.drr = drr;
ba.vp = vp;
ba.os = tosnap;
+ ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) {
kmem_free(drr, sizeof (dmu_replay_record_t));
@@ -755,6 +698,7 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp)
bzero(drr, sizeof (dmu_replay_record_t));
drr->drr_type = DRR_END;
+ drr->drr_u.drr_end.drr_checksum = ba.zc;
if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)))
return (ba.err);
@@ -773,6 +717,7 @@ struct restorearg {
int buflen; /* number of valid bytes in buf */
int bufoff; /* next offset to read */
int bufsize; /* amount of memory allocated for buf */
+ zio_cksum_t zc;
};
static int
@@ -789,8 +734,11 @@ replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
if (dd->dd_phys->dd_head_dataset_obj == 0)
goto die;
- ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
- NULL, DS_MODE_EXCLUSIVE, FTAG);
+ err = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj,
+ NULL, DS_MODE_EXCLUSIVE, FTAG, &ds);
+ if (err)
+ goto die;
if (ds == NULL) {
err = EBUSY;
@@ -804,9 +752,11 @@ replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
}
/* most recent snapshot must match fromguid */
- ds_prev = dsl_dataset_open_obj(dd->dd_pool,
+ err = dsl_dataset_open_obj(dd->dd_pool,
ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_STANDARD | DS_MODE_READONLY, FTAG);
+ DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds_prev);
+ if (err)
+ goto die;
if (ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) {
err = ENODEV;
goto die;
@@ -885,9 +835,8 @@ replay_full_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
/* the point of no (unsuccessful) return */
- err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname,
- DS_MODE_EXCLUSIVE, FTAG, &ds);
- ASSERT3U(err, ==, 0);
+ VERIFY(0 == dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname,
+ DS_MODE_EXCLUSIVE, FTAG, &ds));
kmem_free(fsfullname, MAXNAMELEN);
(void) dmu_objset_create_impl(dsl_dataset_get_spa(ds),
@@ -921,9 +870,8 @@ replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
return (err);
/* set snapshot's creation time and guid */
- err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname,
- DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds);
- ASSERT3U(err, ==, 0);
+ VERIFY(0 == dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname,
+ DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds));
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_creation_time = drrb->drr_creation_time;
@@ -932,8 +880,9 @@ replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG);
- ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj,
- NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG);
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj,
+ NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG, &ds));
dmu_buf_will_dirty(ds->ds_dbuf, tx);
ds->ds_phys->ds_restoring = FALSE;
dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG);
@@ -959,8 +908,6 @@ restore_read(struct restorearg *ra, int len)
ra->voff, UIO_SYSSPACE, FAPPEND,
RLIM_INFINITY, CRED(), &resid);
- /* Need to compute checksum */
-
ra->voff += ra->bufsize - leftover - resid;
ra->buflen = ra->bufsize - resid;
ra->bufoff = 0;
@@ -968,12 +915,17 @@ restore_read(struct restorearg *ra, int len)
ra->err = EINVAL;
if (ra->err)
return (NULL);
+		/* the checksum is folded in below, over each returned chunk */
}
ASSERT3U(ra->bufoff % 8, ==, 0);
ASSERT3U(ra->buflen - ra->bufoff, >=, len);
rv = ra->buf + ra->bufoff;
ra->bufoff += len;
+ if (ra->byteswap)
+ fletcher_4_incremental_byteswap(rv, len, &ra->zc);
+ else
+ fletcher_4_incremental_native(rv, len, &ra->zc);
return (rv);
}
@@ -1016,7 +968,10 @@ backup_byteswap(dmu_replay_record_t *drr)
DO64(drr_free.drr_length);
break;
case DRR_END:
- DO64(drr_end.drr_checksum);
+ DO64(drr_end.drr_checksum.zc_word[0]);
+ DO64(drr_end.drr_checksum.zc_word[1]);
+ DO64(drr_end.drr_checksum.zc_word[2]);
+ DO64(drr_end.drr_checksum.zc_word[3]);
break;
}
#undef DO64
@@ -1089,7 +1044,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
if (drro->drr_bonuslen) {
dmu_buf_t *db;
void *data;
- db = dmu_bonus_hold(os, drro->drr_object);
+ VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
ASSERT3U(db->db_size, ==, drro->drr_bonuslen);
@@ -1103,7 +1058,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
drro->drr_bonuslen);
}
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
}
dmu_tx_commit(tx);
return (0);
@@ -1202,21 +1157,22 @@ restore_free(struct restorearg *ra, objset_t *os,
dmu_tx_abort(tx);
return (err);
}
- dmu_free_range(os, drrf->drr_object,
+ err = dmu_free_range(os, drrf->drr_object,
drrf->drr_offset, drrf->drr_length, tx);
dmu_tx_commit(tx);
- return (0);
+ return (err);
}
int
-dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
+dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
vnode_t *vp, uint64_t voffset)
{
struct restorearg ra;
dmu_replay_record_t *drr;
- char *cp, *tosnap;
+ char *cp;
dsl_dir_t *dd = NULL;
objset_t *os = NULL;
+ zio_cksum_t pzc;
bzero(&ra, sizeof (ra));
ra.vp = vp;
@@ -1233,6 +1189,23 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
goto out;
}
+ /*
+ * NB: this assumes that struct drr_begin will be the largest in
+ * dmu_replay_record_t's drr_u, and thus we don't need to pad it
+ * with zeros to make it the same length as we wrote out.
+ */
+ ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN;
+ ((dmu_replay_record_t *)ra.buf)->drr_pad = 0;
+ ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb;
+ if (ra.byteswap) {
+ fletcher_4_incremental_byteswap(ra.buf,
+ sizeof (dmu_replay_record_t), &ra.zc);
+ } else {
+ fletcher_4_incremental_native(ra.buf,
+ sizeof (dmu_replay_record_t), &ra.zc);
+ }
+ (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */
+
if (ra.byteswap) {
drrb->drr_magic = BSWAP_64(drrb->drr_magic);
drrb->drr_version = BSWAP_64(drrb->drr_version);
@@ -1244,7 +1217,6 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
- tosnap = drrb->drr_toname;
if (drrb->drr_version != DMU_BACKUP_VERSION ||
drrb->drr_type >= DMU_OST_NUMTYPES ||
strchr(drrb->drr_toname, '@') == NULL) {
@@ -1260,12 +1232,10 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
cp = strchr(tosnap, '@');
*cp = '\0';
- dd = dsl_dir_open(tosnap, FTAG, NULL);
+ ra.err = dsl_dir_open(tosnap, FTAG, &dd, NULL);
*cp = '@';
- if (dd == NULL) {
- ra.err = ENOENT;
+ if (ra.err)
goto out;
- }
ra.err = dsl_dir_sync_task(dd, replay_incremental_sync,
drrb, 1<<20);
@@ -1275,12 +1245,10 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
cp = strchr(tosnap, '@');
*cp = '\0';
- dd = dsl_dir_open(tosnap, FTAG, &tail);
+ ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail);
*cp = '@';
- if (dd == NULL) {
- ra.err = ENOENT;
+ if (ra.err)
goto out;
- }
if (tail == NULL) {
ra.err = EEXIST;
goto out;
@@ -1306,9 +1274,10 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
/*
* Read records and process them.
*/
+ pzc = ra.zc;
while (ra.err == 0 &&
NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
- if (issig(JUSTLOOKING)) {
+ if (issig(JUSTLOOKING) && issig(FORREAL)) {
ra.err = EINTR;
goto out;
}
@@ -1348,7 +1317,22 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
break;
}
case DRR_END:
- /* Need to verify checksum. */
+ {
+ struct drr_end drre = drr->drr_u.drr_end;
+ /*
+ * We compare against the *previous* checksum
+ * value, because the stored checksum is of
+ * everything before the DRR_END record.
+ */
+ if (drre.drr_checksum.zc_word[0] != 0 &&
+ ((drre.drr_checksum.zc_word[0] - pzc.zc_word[0]) |
+ (drre.drr_checksum.zc_word[1] - pzc.zc_word[1]) |
+ (drre.drr_checksum.zc_word[2] - pzc.zc_word[2]) |
+ (drre.drr_checksum.zc_word[3] - pzc.zc_word[3]))) {
+ ra.err = ECKSUM;
+ goto out;
+ }
+
/*
* dd may be the parent of the dd we are
* restoring into (eg. if it's a full backup).
@@ -1356,10 +1340,12 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
ra.err = dsl_dir_sync_task(dmu_objset_ds(os)->
ds_dir, replay_end_sync, drrb, 1<<20);
goto out;
+ }
default:
ra.err = EINVAL;
goto out;
}
+ pzc = ra.zc;
}
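
Comparing against pzc rather than ra.zc is the subtle part: by the time the DRR_END record is recognized, it has already been folded into the running checksum, so the verifier must use the value saved before that record was consumed. A self-contained toy model of the loop (a plain 64-bit sum stands in for fletcher-4; the record layout is hypothetical):

#include <stdint.h>
#include <stddef.h>

typedef struct { int is_end; uint64_t payload; } rec_t;

/*
 * The running checksum is updated as each record is consumed, so
 * the value to compare against an END record is the one saved
 * before that record was folded in.
 */
static int
verify_stream(const rec_t *recs, size_t n)
{
	uint64_t zc = 0, pzc = 0;
	size_t i;

	for (i = 0; i < n; i++) {
		pzc = zc;		/* checksum before this record */
		zc += recs[i].payload;	/* stand-in for fletcher-4 */
		if (recs[i].is_end)
			return (recs[i].payload == pzc ? 0 : -1 /* ECKSUM */);
	}
	return (-1);			/* no END record */
}

int
main(void)
{
	/* END carries the checksum of everything before it: 5 + 7. */
	rec_t recs[] = { { 0, 5 }, { 0, 7 }, { 1, 12 } };

	return (verify_stream(recs, 3) == 0 ? 0 : 1);
}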
out:
@@ -1443,6 +1429,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
dmu_buf_impl_t *db;
blkptr_t *blk;
int err;
+ zbookmark_t zb;
ASSERT(RW_LOCK_HELD(&tx->tx_suspend));
ASSERT(BP_IS_HOLE(bp));
@@ -1452,6 +1439,11 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
/*
+ * XXX why is this routine using dmu_buf_*() and casting between
+ * dmu_buf_impl_t and dmu_buf_t?
+ */
+
+ /*
* If this txg already synced, there's nothing to do.
*/
if (txg <= tx->tx_synced_txg) {
@@ -1459,7 +1451,10 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
* If we're running ziltest, we need the blkptr regardless.
*/
if (txg > spa_freeze_txg(dp->dp_spa)) {
- db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);
+ err = dmu_buf_hold(os, object, offset,
+ FTAG, (dmu_buf_t **)&db);
+ if (err)
+ return (err);
/* if db_blkptr == NULL, this was an empty write */
if (db->db_blkptr)
*bp = *db->db_blkptr; /* structure assignment */
@@ -1467,7 +1462,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
bzero(bp, sizeof (blkptr_t));
*blkoff = offset - db->db.db_offset;
ASSERT3U(*blkoff, <, db->db.db_size);
- dmu_buf_rele((dmu_buf_t *)db);
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
return (0);
}
return (EALREADY);
@@ -1481,7 +1476,9 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
return (EINPROGRESS);
}
- db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset);
+ err = dmu_buf_hold(os, object, offset, FTAG, (dmu_buf_t **)&db);
+ if (err)
+ return (err);
mutex_enter(&db->db_mtx);
@@ -1491,7 +1488,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
*/
if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) {
mutex_exit(&db->db_mtx);
- dmu_buf_rele((dmu_buf_t *)db);
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
return (ENOENT);
}
@@ -1505,7 +1502,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
ASSERT(blk != IN_DMU_SYNC);
if (blk == IN_DMU_SYNC) {
mutex_exit(&db->db_mtx);
- dmu_buf_rele((dmu_buf_t *)db);
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
return (EBUSY);
}
arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
@@ -1522,11 +1519,15 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
blk->blk_birth = 0; /* mark as invalid */
+ zb.zb_objset = os->os->os_dsl_dataset->ds_object;
+ zb.zb_object = db->db.db_object;
+ zb.zb_level = db->db_level;
+ zb.zb_blkid = db->db_blkid;
err = arc_write(NULL, os->os->os_spa,
zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum),
zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress),
txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
ASSERT(err == 0);
if (!BP_IS_HOLE(blk)) {
@@ -1546,7 +1547,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL);
arc_release(db->db_d.db_data_old[txg&TXG_MASK], db);
mutex_exit(&db->db_mtx);
- dmu_buf_rele((dmu_buf_t *)db);
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
/* Note that this block does not free on disk until txg syncs */
/*
@@ -1563,7 +1564,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
db->db_d.db_overridden_by[txg&TXG_MASK] = blk;
mutex_exit(&db->db_mtx);
- dmu_buf_rele((dmu_buf_t *)db);
+ dmu_buf_rele((dmu_buf_t *)db, FTAG);
ASSERT3U(txg, >, tx->tx_syncing_txg);
return (0);
}
@@ -1571,7 +1572,10 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff,
uint64_t
dmu_object_max_nonzero_offset(objset_t *os, uint64_t object)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
+	dnode_t *dn;
+	uint64_t rv;
+
+	/* XXX assumes dnode_hold will not get an i/o error */
+	(void) dnode_hold(os->os, object, FTAG, &dn);
-	uint64_t rv = dnode_max_nonzero_offset(dn);
+	rv = dnode_max_nonzero_offset(dn);
dnode_rele(dn, FTAG);
return (rv);
@@ -1581,8 +1585,13 @@ int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
dmu_tx_t *tx)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
- int err = dnode_set_blksz(dn, size, ibs, tx);
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
+ err = dnode_set_blksz(dn, size, ibs, tx);
dnode_rele(dn, FTAG);
return (err);
}
@@ -1591,7 +1600,10 @@ void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
dmu_tx_t *tx)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ dnode_t *dn;
+
+ /* XXX assumes dnode_hold will not get an i/o error */
+ (void) dnode_hold(os->os, object, FTAG, &dn);
ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
dn->dn_checksum = checksum;
dnode_setdirty(dn, tx);
@@ -1602,7 +1614,10 @@ void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
dmu_tx_t *tx)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ dnode_t *dn;
+
+ /* XXX assumes dnode_hold will not get an i/o error */
+ (void) dnode_hold(os->os, object, FTAG, &dn);
ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
dn->dn_compress = compress;
dnode_setdirty(dn, tx);
@@ -1615,7 +1630,9 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
dnode_t *dn;
int i, err;
- dn = dnode_hold(os->os, object, FTAG);
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
/*
* Sync any current changes before
* we go trundling through the block pointers.
@@ -1627,7 +1644,9 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
if (i != TXG_SIZE) {
dnode_rele(dn, FTAG);
txg_wait_synced(dmu_objset_pool(os), 0);
- dn = dnode_hold(os->os, object, FTAG);
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err)
+ return (err);
}
err = dnode_next_offset(dn, hole, off, 1, 1);
@@ -1665,10 +1684,11 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
int
dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
{
- dnode_t *dn = dnode_hold(os->os, object, FTAG);
+ dnode_t *dn;
+ int err = dnode_hold(os->os, object, FTAG, &dn);
- if (dn == NULL)
- return (ENOENT);
+ if (err)
+ return (err);
if (doi != NULL)
dmu_object_info_from_dnode(dn, doi);
@@ -1699,6 +1719,71 @@ dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512)
*nblk512 = dn->dn_phys->dn_secphys + 1; /* add 1 for dnode space */
}
+/*
+ * Given a bookmark, return the name of the dataset, object, and range in
+ * human-readable format.
+ */
+int
+spa_bookmark_name(spa_t *spa, zbookmark_t *zb, char *dsname, size_t dslen,
+ char *objname, size_t objlen, char *range, size_t rangelen)
+{
+ dsl_pool_t *dp;
+ dsl_dataset_t *ds = NULL;
+ objset_t *os = NULL;
+ dnode_t *dn = NULL;
+ int err, shift;
+
+ if (dslen < MAXNAMELEN || objlen < 32 || rangelen < 64)
+ return (ENOSPC);
+
+ dp = spa_get_dsl(spa);
+ if (zb->zb_objset != 0) {
+ rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dsl_dataset_open_obj(dp, zb->zb_objset,
+ NULL, DS_MODE_NONE, FTAG, &ds);
+ if (err) {
+ rw_exit(&dp->dp_config_rwlock);
+ return (err);
+ }
+ dsl_dataset_name(ds, dsname);
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ rw_exit(&dp->dp_config_rwlock);
+
+ err = dmu_objset_open(dsname, DMU_OST_ANY, DS_MODE_NONE, &os);
+ if (err)
+ goto out;
+
+ } else {
+ dsl_dataset_name(NULL, dsname);
+ os = dp->dp_meta_objset;
+ }
+
+ if (zb->zb_object == DMU_META_DNODE_OBJECT) {
+ (void) strncpy(objname, "mdn", objlen);
+ } else {
+ (void) snprintf(objname, objlen, "%lld",
+ (longlong_t)zb->zb_object);
+ }
+
+ err = dnode_hold(os->os, zb->zb_object, FTAG, &dn);
+ if (err)
+ goto out;
+
+ shift = (dn->dn_datablkshift?dn->dn_datablkshift:SPA_MAXBLOCKSHIFT) +
+ zb->zb_level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT);
+ (void) snprintf(range, rangelen, "%llu-%llu",
+ (u_longlong_t)(zb->zb_blkid << shift),
+ (u_longlong_t)((zb->zb_blkid+1) << shift));
+
+out:
+ if (dn)
+ dnode_rele(dn, FTAG);
+ if (os && os != dp->dp_meta_objset)
+ dmu_objset_close(os);
+ return (err);
+}
+
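The shift arithmetic above turns a (level, blkid) pair into a byte range: a block at level L covers datablksz times (block pointers per indirect)^L bytes, so both endpoints fall out of a single shift. A runnable model of the same computation (the shift values in main() are illustrative stand-ins):

#include <stdint.h>
#include <stdio.h>

#define	SPA_BLKPTRSHIFT	7	/* log2 of a 128-byte block pointer */

/*
 * Byte range covered by block `blkid' at indirection `level', given
 * log2 of the data and indirect block sizes; mirrors the arithmetic
 * in spa_bookmark_name() above.
 */
static void
blk_range(int datablkshift, int indblkshift, int level, uint64_t blkid,
    uint64_t *startp, uint64_t *endp)
{
	int shift = datablkshift +
	    level * (indblkshift - SPA_BLKPTRSHIFT);

	*startp = blkid << shift;
	*endp = (blkid + 1) << shift;
}

int
main(void)
{
	uint64_t s, e;

	/* 128K data blocks, 16K indirects, level-1 block 3 */
	blk_range(17, 14, 1, 3, &s, &e);
	(void) printf("%llu-%llu\n",
	    (unsigned long long)s, (unsigned long long)e);
	return (0);
}
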
void
byteswap_uint64_array(void *vbuf, size_t size)
{
diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c
index d150d6c400..99d40c5ec5 100644
--- a/usr/src/uts/common/fs/zfs/dmu_object.c
+++ b/usr/src/uts/common/fs/zfs/dmu_object.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -39,7 +38,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
uint64_t object;
uint64_t L2_dnode_count = DNODES_PER_BLOCK <<
(osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT);
- dnode_t *dn;
+ dnode_t *dn = NULL;
int restarted = B_FALSE;
mutex_enter(&osi->os_obj_lock);
@@ -62,7 +61,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
}
osi->os_obj_next = ++object;
- dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG);
+ /*
+ * XXX We should check for an i/o error here and return
+ * up to our caller. Actually we should pre-read it in
+ * dmu_tx_assign(), but there is currently no mechanism
+ * to do so.
+ */
+ (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE,
+ FTAG, &dn);
if (dn)
break;
@@ -84,13 +90,14 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
dnode_t *dn;
+ int err;
- if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx))
+ if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
return (EBADF);
- dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG);
- if (dn == NULL)
- return (EEXIST);
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
+ if (err)
+ return (err);
dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
dnode_rele(dn, FTAG);
@@ -103,13 +110,15 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
dnode_t *dn;
+ int err;
- if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx))
+ if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
return (EBADF);
- dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG);
- if (dn == NULL)
- return (EBADF);
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ FTAG, &dn);
+ if (err)
+ return (err);
dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);
dnode_rele(dn, FTAG);
@@ -120,12 +129,14 @@ int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
dnode_t *dn;
+ int err;
- ASSERT(!(object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx));
+ ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
- dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG);
- if (dn == NULL)
- return (ENOENT);
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ FTAG, &dn);
+ if (err)
+ return (err);
ASSERT(dn->dn_type != DMU_OT_NONE);
dnode_free(dn, tx);
diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c
index 8d77ff70c0..6625fdb98d 100644
--- a/usr/src/uts/common/fs/zfs/dmu_objset.c
+++ b/usr/src/uts/common/fs/zfs/dmu_objset.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -127,8 +126,9 @@ dmu_objset_byteswap(void *buf, size_t size)
osp->os_type = BSWAP_64(osp->os_type);
}
-objset_impl_t *
-dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
+int
+dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
+ objset_impl_t **osip)
{
objset_impl_t *winner, *osi;
int i, err, checksum;
@@ -141,15 +141,25 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
osi->os_rootbp = *bp;
osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t));
if (!BP_IS_HOLE(&osi->os_rootbp)) {
+ zbookmark_t zb;
+ zb.zb_objset = ds ? ds->ds_object : 0;
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = 0;
+
dprintf_bp(&osi->os_rootbp, "reading %s", "");
- (void) arc_read(NULL, spa, &osi->os_rootbp,
+ err = arc_read(NULL, spa, &osi->os_rootbp,
dmu_ot[DMU_OT_OBJSET].ot_byteswap,
arc_bcopy_func, osi->os_phys,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+ ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, ARC_WAIT, &zb);
+ if (err) {
+ zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
+ kmem_free(osi, sizeof (objset_impl_t));
+ return (err);
+ }
} else {
bzero(osi->os_phys, sizeof (objset_phys_t));
}
- osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
/*
* Note: the changed_cb will be called once before the register
@@ -159,18 +169,22 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
if (ds) {
err = dsl_prop_register(ds, "checksum",
checksum_changed_cb, osi);
- ASSERT(err == 0);
-
- err = dsl_prop_register(ds, "compression",
- compression_changed_cb, osi);
- ASSERT(err == 0);
+ if (err == 0)
+ err = dsl_prop_register(ds, "compression",
+ compression_changed_cb, osi);
+ if (err) {
+ zio_buf_free(osi->os_phys, sizeof (objset_phys_t));
+ kmem_free(osi, sizeof (objset_impl_t));
+ return (err);
+ }
} else {
/* It's the meta-objset. */
- /* XXX - turn off metadata compression temporarily */
osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
- osi->os_compress = ZIO_COMPRESS_OFF;
+ osi->os_compress = ZIO_COMPRESS_LZJB;
}
+ osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header);
+
/*
* Metadata always gets compressed and checksummed.
* If the data checksum is multi-bit correctable, and it's not
@@ -184,9 +198,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
osi->os_md_checksum = checksum;
else
osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4;
-
- /* XXX - turn off metadata compression temporarily */
- osi->os_md_compress = ZIO_COMPRESS_OFF;
+ osi->os_md_compress = ZIO_COMPRESS_LZJB;
for (i = 0; i < TXG_SIZE; i++) {
list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t),
@@ -210,7 +222,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp)
}
}
- return (osi);
+ *osip = osi;
+ return (0);
}
/* called from zpl */
@@ -235,7 +248,13 @@ dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
blkptr_t bp;
dsl_dataset_get_blkptr(ds, &bp);
- osi = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, &bp);
+ err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
+ ds, &bp, &osi);
+ if (err) {
+ dsl_dataset_close(ds, mode, os);
+ kmem_free(os, sizeof (objset_t));
+ return (err);
+ }
}
os->os = osi;
@@ -257,9 +276,51 @@ dmu_objset_close(objset_t *os)
}
void
+dmu_objset_evict_dbufs(objset_t *os)
+{
+ objset_impl_t *osi = os->os;
+ dnode_t *mdn = osi->os_meta_dnode;
+ dnode_t *dn;
+ int allzero = B_TRUE;
+
+ /*
+ * Each time we process an entry on the list, we first move it
+ * to the tail so that we don't process it over and over again.
+ * We use the meta-dnode as a marker: if we make a complete pass
+ * over the list without finding any work to do, we're done.
+ * This ensures that we complete in linear time rather than
+ * quadratic time, as described in detail in bug 1182169.
+ */
+ mutex_enter(&osi->os_lock);
+ list_remove(&osi->os_dnodes, mdn);
+ list_insert_tail(&osi->os_dnodes, mdn);
+ while ((dn = list_head(&osi->os_dnodes)) != NULL) {
+ list_remove(&osi->os_dnodes, dn);
+ list_insert_tail(&osi->os_dnodes, dn);
+ if (dn == mdn) {
+ if (allzero)
+ break;
+ allzero = B_TRUE;
+ continue;
+ }
+ if (!refcount_is_zero(&dn->dn_holds)) {
+ allzero = B_FALSE;
+ dnode_add_ref(dn, FTAG);
+ mutex_exit(&osi->os_lock);
+ dnode_evict_dbufs(dn);
+ dnode_rele(dn, FTAG);
+ mutex_enter(&osi->os_lock);
+ }
+ }
+ mutex_exit(&osi->os_lock);
+ dnode_evict_dbufs(mdn);
+}
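+
+The move-to-tail scan above is worth seeing in isolation: every visited entry is rotated to the tail, and the marker node bounds a full pass, so the walk stays linear even when some entries cannot be processed yet. A self-contained toy model (hypothetical list type, not the kernel list_t API):
+
+#include <stdio.h>
+
+typedef struct node {
+	struct node *next;
+	int busy;		/* stand-in for a nonzero hold count */
+	const char *name;
+} node_t;
+
+typedef struct list {
+	node_t *head;
+	node_t *tail;
+} list_t;
+
+/* Move the head node to the tail (list always has >= 2 nodes here). */
+static node_t *
+list_rotate(list_t *l)
+{
+	node_t *n = l->head;
+
+	l->head = n->next;
+	n->next = NULL;
+	l->tail->next = n;
+	l->tail = n;
+	return (n);
+}
+
+/*
+ * One marker-delimited sweep in the style of dmu_objset_evict_dbufs():
+ * two consecutive visits to the marker with no work found in between
+ * terminate the loop.
+ */
+static void
+evict_all(list_t *l, node_t *marker)
+{
+	int allzero = 1;
+	node_t *n;
+
+	for (;;) {
+		n = list_rotate(l);
+		if (n == marker) {
+			if (allzero)
+				break;	/* a full pass found no work */
+			allzero = 1;
+			continue;
+		}
+		if (n->busy) {
+			allzero = 0;
+			n->busy = 0;	/* stand-in for dnode_evict_dbufs() */
+			(void) printf("evicted %s\n", n->name);
+		}
+	}
+}
+
+int
+main(void)
+{
+	node_t a = { 0, 1, "a" }, b = { 0, 0, "b" };
+	node_t c = { 0, 1, "c" }, marker = { 0, 0, "marker" };
+	list_t l;
+
+	a.next = &b; b.next = &c; c.next = &marker;
+	l.head = &a; l.tail = &marker;	/* marker starts at the tail */
+	evict_all(&l, &marker);
+	return (0);
+}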
+
+void
dmu_objset_evict(dsl_dataset_t *ds, void *arg)
{
objset_impl_t *osi = arg;
+ objset_t os;
int err, i;
for (i = 0; i < TXG_SIZE; i++) {
@@ -277,6 +338,13 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg)
ASSERT(err == 0);
}
+ /*
+ * We should need only a single pass over the dnode list, since
+ * nothing can be added to the list at this point.
+ */
+ os.os = osi;
+ dmu_objset_evict_dbufs(&os);
+
ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode);
ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode);
ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL);
@@ -297,7 +365,7 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type,
dnode_t *mdn;
ASSERT(dmu_tx_is_syncing(tx));
- osi = dmu_objset_open_impl(spa, ds, NULL);
+ VERIFY(0 == dmu_objset_open_impl(spa, ds, NULL, &osi));
mdn = osi->os_meta_dnode;
dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
@@ -314,9 +382,21 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type,
* needs to be synced multiple times as spa_sync() iterates
* to convergence, so minimizing its dn_nlevels matters.
*/
- if (ds != NULL)
+ if (ds != NULL) {
+ int levels = 1;
+
+ /*
+ * Determine the number of levels necessary for the meta-dnode
+ * to contain DN_MAX_OBJECT dnodes.
+ */
+ while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
+ (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
+ DN_MAX_OBJECT * sizeof (dnode_phys_t))
+ levels++;
+
mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
- mdn->dn_nlevels = DN_META_DNODE_LEVELS;
+ mdn->dn_nlevels = levels;
+ }
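
The loop grows `levels' until the meta-dnode's block-pointer tree can address DN_MAX_OBJECT dnodes. A runnable model of the same capacity computation (the shift values and the 2^48-object bound below are illustrative assumptions):

#include <stdint.h>
#include <stdio.h>

#define	SPA_BLKPTRSHIFT	7	/* 128-byte block pointers */

/*
 * Smallest number of levels at which nblkptr root pointers can
 * address `bytes_needed' bytes, mirroring the loop in
 * dmu_objset_create_impl() above.
 */
static int
levels_needed(int nblkptr, int datablkshift, int indblkshift,
    uint64_t bytes_needed)
{
	int levels = 1;

	while (((uint64_t)nblkptr << (datablkshift +
	    (levels - 1) * (indblkshift - SPA_BLKPTRSHIFT))) < bytes_needed)
		levels++;
	return (levels);
}

int
main(void)
{
	/* e.g. 3 root bps, 16K data and indirect blocks, 2^48 objects */
	uint64_t need = (1ULL << 48) * 512;	/* 512-byte dnode_phys_t */

	(void) printf("levels = %d\n", levels_needed(3, 14, 14, need));
	return (0);
}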
ASSERT(type != DMU_OST_NONE);
ASSERT(type != DMU_OST_ANY);
@@ -354,9 +434,8 @@ dmu_objset_create_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
if (err)
return (err);
- err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname,
- DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds);
- ASSERT3U(err, ==, 0);
+ VERIFY(0 == dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname,
+ DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds));
dsl_dataset_get_blkptr(ds, &bp);
if (BP_IS_HOLE(&bp)) {
objset_impl_t *osi;
@@ -382,9 +461,9 @@ dmu_objset_create(const char *name, dmu_objset_type_t type,
const char *tail;
int err = 0;
- pds = dsl_dir_open(name, FTAG, &tail);
- if (pds == NULL)
- return (ENOENT);
+ err = dsl_dir_open(name, FTAG, &pds, &tail);
+ if (err)
+ return (err);
if (tail == NULL) {
dsl_dir_close(pds, FTAG);
return (EEXIST);
@@ -554,6 +633,7 @@ dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx)
int txgoff;
list_t *dirty_list;
int err;
+ zbookmark_t zb;
arc_buf_t *abuf =
arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG);
@@ -586,11 +666,15 @@ dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx)
* Sync the root block.
*/
bcopy(os->os_phys, abuf->b_data, sizeof (objset_phys_t));
+ zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = 0;
err = arc_write(NULL, os->os_spa, os->os_md_checksum,
os->os_md_compress, tx->tx_txg, &os->os_rootbp, abuf, killer, os,
- ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT);
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb);
ASSERT(err == 0);
- arc_buf_free(abuf, FTAG);
+ VERIFY(arc_buf_remove_ref(abuf, FTAG) == 1);
dsl_dataset_set_blkptr(os->os_dsl_dataset, &os->os_rootbp, tx);
@@ -707,10 +791,10 @@ dmu_objset_find(char *name, void func(char *, void *), void *arg, int flags)
zap_cursor_t zc;
zap_attribute_t attr;
char *child;
- int do_self;
+ int do_self, err;
- dd = dsl_dir_open(name, FTAG, NULL);
- if (dd == NULL)
+ err = dsl_dir_open(name, FTAG, &dd, NULL);
+ if (err)
return;
do_self = (dd->dd_phys->dd_head_dataset_obj != 0);
diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c
index fedeba015d..fbc55fec86 100644
--- a/usr/src/uts/common/fs/zfs/dmu_traverse.c
+++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -339,7 +338,7 @@ traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp,
} else {
error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data,
BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ,
- th->th_zio_flags | ZIO_FLAG_DONT_CACHE));
+ th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb));
if (BP_SHOULD_BYTESWAP(bp) && error == 0)
(zb->zb_level > 0 ? byteswap_uint64_array :
@@ -469,13 +468,70 @@ get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn,
return (rc);
}
+/* ARGSUSED */
+static void
+traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t maxtxg)
+{
+ traverse_handle_t *th = arg;
+ traverse_blk_cache_t *bc = &th->th_zil_cache;
+ zbookmark_t *zb = &bc->bc_bookmark;
+
+ if (bp->blk_birth < maxtxg) {
+ zb->zb_object = 0;
+ zb->zb_blkid = bp->blk_cksum.zc_word[3];
+ bc->bc_blkptr = *bp;
+ (void) th->th_func(bc, th->th_spa, th->th_arg);
+ }
+}
+
+/* ARGSUSED */
+static void
+traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t maxtxg)
+{
+ traverse_handle_t *th = arg;
+ traverse_blk_cache_t *bc = &th->th_zil_cache;
+ zbookmark_t *zb = &bc->bc_bookmark;
+
+ if (lrc->lrc_txtype == TX_WRITE) {
+ lr_write_t *lr = (lr_write_t *)lrc;
+ blkptr_t *bp = &lr->lr_blkptr;
+
+ if (bp->blk_birth != 0 && bp->blk_birth < maxtxg) {
+ zb->zb_object = lr->lr_foid;
+ zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp);
+ bc->bc_blkptr = *bp;
+ (void) th->th_func(bc, th->th_spa, th->th_arg);
+ }
+ }
+}
+
+static void
+traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc, uint64_t maxtxg)
+{
+ spa_t *spa = th->th_spa;
+ objset_phys_t *osphys = bc->bc_data;
+ dsl_pool_t *dp = spa_get_dsl(spa);
+ zilog_t *zilog;
+
+ ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]);
+ ASSERT(bc->bc_bookmark.zb_level == -1);
+
+ th->th_zil_cache.bc_bookmark = bc->bc_bookmark;
+
+ zilog = zil_alloc(dp->dp_meta_objset, &osphys->os_zil_header);
+
+ zil_parse(zilog, traverse_zil_block, traverse_zil_record, th, maxtxg);
+
+ zil_free(zilog);
+}
+
static int
traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
{
zbookmark_t *zb = &zseg->seg_start;
traverse_blk_cache_t *bc;
dnode_phys_t *dn, *dn_tmp;
- int worklimit = 1000;
+ int worklimit = 100;
int rc;
dprintf("<%llu, %llu, %d, %llx>\n",
@@ -529,6 +585,8 @@ traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
if (zb->zb_level == -1) {
ASSERT(zb->zb_object == 0);
+ ASSERT(zb->zb_blkid == 0);
+ ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET);
if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) {
rc = traverse_callback(th, zseg, bc);
@@ -536,6 +594,9 @@ traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp)
ASSERT(rc == EINTR);
return (rc);
}
+ if ((th->th_advance & ADVANCE_ZIL) &&
+ zb->zb_objset != 0)
+ traverse_zil(th, bc, zseg->seg_maxtxg);
}
return (advance_from_osphys(zseg, th->th_advance));
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index 6576107ae2..894bd63f36 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -37,6 +37,9 @@
#include <sys/spa.h>
#include <sys/zfs_context.h>
+typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
+ uint64_t arg1, uint64_t arg2);
+
#ifdef ZFS_DEBUG
int dmu_use_tx_debug_bufs = 1;
#endif
@@ -60,6 +63,7 @@ dmu_tx_create(objset_t *os)
{
dmu_tx_t *tx = dmu_tx_create_ds(os->os->os_dsl_dataset->ds_dir);
tx->tx_objset = os;
+ tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset);
return (tx);
}
@@ -85,7 +89,7 @@ dmu_tx_is_syncing(dmu_tx_t *tx)
int
dmu_tx_private_ok(dmu_tx_t *tx)
{
- return (tx->tx_anyobj || tx->tx_privateobj);
+ return (tx->tx_anyobj);
}
static void
@@ -95,11 +99,16 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
{
dmu_tx_hold_t *dth;
dnode_t *dn = NULL;
+ int err;
if (object != DMU_NEW_OBJECT) {
- dn = dnode_hold(os->os, object, tx);
+ err = dnode_hold(os->os, object, tx, &dn);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
- if (tx->tx_txg != 0) {
+ if (err == 0 && tx->tx_txg != 0) {
mutex_enter(&dn->dn_mtx);
/*
* dn->dn_assigned_txg == tx->tx_txg doesn't pose a
@@ -118,15 +127,12 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
dth = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
dth->dth_dnode = dn;
dth->dth_type = type;
- dth->dth_func = func;
dth->dth_arg1 = arg1;
dth->dth_arg2 = arg2;
- /*
- * XXX Investigate using a different data structure to keep
- * track of dnodes in a tx. Maybe array, since there will
- * generally not be many entries?
- */
list_insert_tail(&tx->tx_holds, dth);
+
+ if (func)
+ func(tx, dn, arg1, arg2);
}
void
@@ -142,11 +148,27 @@ dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object)
}
}
+static int
+dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
+{
+ int err;
+ dmu_buf_impl_t *db;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ db = dbuf_hold_level(dn, level, blkid, FTAG);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (db == NULL)
+ return (EIO);
+ err = dbuf_read(db, zio, DB_RF_CANFAIL);
+ dbuf_rele(db, FTAG);
+ return (err);
+}
+
/* ARGSUSED */
static void
dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
- uint64_t start, end, space;
+ uint64_t start, end, i, space;
int min_bs, max_bs, min_ibs, max_ibs, epbs, bits;
if (len == 0)
@@ -158,6 +180,64 @@ dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
max_ibs = DN_MAX_INDBLKSHIFT;
/*
+ * For i/o error checking, read the first and last level-0
+ * blocks, and all the level-1 blocks. We needn't do this on
+ * the meta-dnode, because we've already read it in.
+ */
+
+ if (dn && dn->dn_object != DMU_META_DNODE_OBJECT) {
+ int err;
+
+ if (dn->dn_maxblkid == 0) {
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ } else {
+ zio_t *zio = zio_root(tx->tx_pool->dp_spa,
+ NULL, NULL, ZIO_FLAG_CANFAIL);
+
+ /* first level-0 block */
+ start = off/dn->dn_datablksz;
+ err = dmu_tx_check_ioerr(zio, dn, 0, start);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+
+ /* last level-0 block */
+ end = (off+len)/dn->dn_datablksz;
+ if (end != start) {
+ err = dmu_tx_check_ioerr(zio, dn, 0, end);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+
+ /* level-1 blocks */
+ if (dn->dn_nlevels > 1) {
+ start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ for (i = start+1; i < end; i++) {
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+ }
+
+ err = zio_wait(zio);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+ }
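
Only a handful of blocks are probed per hold: the level-0 blocks at either end of the write plus the interior level-1 blocks, all issued under one root zio. A small userland model of just the block-id arithmetic (hypothetical helper):

#include <stdint.h>
#include <stdio.h>

#define	SPA_BLKPTRSHIFT	7

/*
 * Print the block ids the i/o-error probing above would read for a
 * write of [off, off+len): first and last level-0 blocks, and the
 * interior level-1 blocks.  Model only.
 */
static void
probe_plan(uint64_t off, uint64_t len, uint64_t datablksz, int indblkshift)
{
	uint64_t start = off / datablksz;
	uint64_t end = (off + len) / datablksz;
	int epbs = indblkshift - SPA_BLKPTRSHIFT;	/* log2(bps/indirect) */
	uint64_t i;

	(void) printf("L0 probe: %llu\n", (unsigned long long)start);
	if (end != start)
		(void) printf("L0 probe: %llu\n", (unsigned long long)end);
	for (i = (start >> epbs) + 1; i < (end >> epbs); i++)
		(void) printf("L1 probe: %llu\n", (unsigned long long)i);
}

int
main(void)
{
	/* 8K data blocks, 16K indirects: a 10MB write starting at 1MB */
	probe_plan(1ULL << 20, 10ULL << 20, 8192, 14);
	return (0);
}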
+
+ /*
* If there's more than one block, the blocksize can't change,
* so we can make a more precise estimate. Alternatively,
* if the dnode's ibs is larger than max_ibs, always use that.
@@ -218,7 +298,7 @@ dmu_tx_count_dnode(dmu_tx_t *tx, dnode_t *dn)
dmu_tx_count_write(tx, mdn, object << DNODE_SHIFT, 1 << DNODE_SHIFT);
if (dn && dn->dn_dbuf->db_blkptr &&
dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- dn->dn_dbuf->db_blkptr->blk_birth, tx)) {
+ dn->dn_dbuf->db_blkptr->blk_birth)) {
tx->tx_space_tooverwrite +=
tx->tx_space_towrite - pre_write_space;
tx->tx_space_towrite = pre_write_space;
@@ -237,7 +317,7 @@ void
dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
{
ASSERT(tx->tx_txg == 0);
- ASSERT(len > 0 && len < DMU_MAX_ACCESS);
+ ASSERT(len < DMU_MAX_ACCESS);
ASSERT(UINT64_MAX - off >= len - 1);
dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE,
@@ -251,8 +331,6 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
uint64_t space = 0;
dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
- ASSERT(dn->dn_assigned_tx == tx || dn->dn_assigned_tx == NULL);
-
if (dn->dn_datablkshift == 0)
return;
/*
@@ -264,8 +342,10 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
blkid = off >> dn->dn_datablkshift;
nblks = (off + len) >> dn->dn_datablkshift;
- if (blkid >= dn->dn_maxblkid)
- goto out;
+ if (blkid >= dn->dn_maxblkid) {
+ rw_exit(&dn->dn_struct_rwlock);
+ return;
+ }
if (blkid + nblks > dn->dn_maxblkid)
nblks = dn->dn_maxblkid - blkid;
@@ -278,12 +358,12 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
blkptr_t *bp = dn->dn_phys->dn_blkptr;
ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr);
bp += blkid + i;
- if (dsl_dataset_block_freeable(ds, bp->blk_birth, tx)) {
+ if (dsl_dataset_block_freeable(ds, bp->blk_birth)) {
dprintf_bp(bp, "can free old%s", "");
space += BP_GET_ASIZE(bp);
}
}
- goto out;
+ nblks = 0;
}
while (nblks) {
@@ -299,20 +379,26 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
int i;
blkptr_t *bp;
- dbuf_read_havestruct(dbuf);
+ err = dbuf_read(dbuf, NULL,
+ DB_RF_HAVESTRUCT | DB_RF_CANFAIL);
+ if (err != 0) {
+ tx->tx_err = err;
+ dbuf_rele(dbuf, FTAG);
+ break;
+ }
bp = dbuf->db.db_data;
bp += blkoff;
for (i = 0; i < tochk; i++) {
if (dsl_dataset_block_freeable(ds,
- bp[i].blk_birth, tx)) {
+ bp[i].blk_birth)) {
dprintf_bp(&bp[i],
"can free old%s", "");
space += BP_GET_ASIZE(&bp[i]);
}
}
- dbuf_remove_ref(dbuf, FTAG);
+ dbuf_rele(dbuf, FTAG);
} else {
/* the indirect block is sparse */
ASSERT(err == ENOENT);
@@ -321,7 +407,6 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
blkid += tochk;
nblks -= tochk;
}
-out:
rw_exit(&dn->dn_struct_rwlock);
tx->tx_space_tofree += space;
@@ -330,7 +415,9 @@ out:
static void
dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
{
- int dirty;
+ uint64_t start, end, i;
+ int dirty, err, shift;
+ zio_t *zio;
/* first block */
if (off != 0 /* || dn->dn_maxblkid == 0 */)
@@ -339,13 +426,46 @@ dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
if (len != DMU_OBJECT_END)
dmu_tx_count_write(tx, dn, off+len, 1);
- dmu_tx_count_dnode(tx, dn);
-
if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz)
return;
if (len == DMU_OBJECT_END)
len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off;
+ /*
+ * For i/o error checking, read the first and last level-0
+ * blocks, and all the level-1 blocks. The above count_write's
+ * will take care of the level-0 blocks.
+ */
+ shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ start = off >> shift;
+ end = dn->dn_datablkshift ? ((off+len) >> shift) : 0;
+
+ zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
+ for (i = start+1; i < end; i++) {
+ uint64_t ibyte = i << shift;
+ err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1);
+ i = ibyte >> shift;
+ if (err == ESRCH)
+ break;
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+
+ err = dmu_tx_check_ioerr(zio, dn, 1, i);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+ err = zio_wait(zio);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+
+ dmu_tx_count_dnode(tx, dn);
+
/* XXX locking */
dirty = dn->dn_dirtyblksz[0] | dn->dn_dirtyblksz[1] |
dn->dn_dirtyblksz[2] | dn->dn_dirtyblksz[3];
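
The shift arithmetic above determines how much file data one level-1 indirect block maps: datablkshift plus indblkshift minus log2(sizeof (blkptr_t)). A stand-alone sketch with assumed block sizes (128K data blocks, 16K indirects):

    #include <stdio.h>
    #include <stdint.h>

    #define SPA_BLKPTRSHIFT 7   /* log2(sizeof (blkptr_t)) in ZFS */

    int main(void)
    {
        int datablkshift = 17, indblkshift = 14;        /* assumed sizes */
        int shift = datablkshift + indblkshift - SPA_BLKPTRSHIFT;
        uint64_t off = 3ULL << 30, len = 100ULL << 20;  /* 3G offset, 100M */

        /* each level-1 indirect maps 2^shift bytes of file data */
        uint64_t start = off >> shift;
        uint64_t end = (off + len) >> shift;

        /* the patch probes start+1 .. end-1; count_write covers the edges */
        printf("one level-1 block spans 2^%d bytes; range hits %llu..%llu\n",
            shift, (unsigned long long)start, (unsigned long long)end);
        return (0);
    }
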
@@ -364,17 +484,17 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
/* ARGSUSED */
static void
-dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops)
+dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t add, uint64_t iname)
{
uint64_t nblocks;
- int epbs;
+ int epbs, err;
+ char *name = (char *)(uintptr_t)iname;
dmu_tx_count_dnode(tx, dn);
if (dn == NULL) {
/*
- * Assuming that nops+cops is not super huge, we will be
- * able to fit a new object's entries into one leaf
+ * We will be able to fit a new object's entries into one leaf
* block. So there will be at most 2 blocks total,
* including the header block.
*/
@@ -384,25 +504,44 @@ dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops)
ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap);
- if (dn->dn_maxblkid == 0 && nops == 0) {
+ if (dn->dn_maxblkid == 0 && !add) {
/*
* If there is only one block (i.e. this is a micro-zap)
- * and we are only doing updates, the accounting is simple.
+ * and we are not adding anything, the accounting is simple.
*/
+ err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
+ if (err) {
+ tx->tx_err = err;
+ return;
+ }
+
if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset,
- dn->dn_phys->dn_blkptr[0].blk_birth, tx))
+ dn->dn_phys->dn_blkptr[0].blk_birth))
tx->tx_space_tooverwrite += dn->dn_datablksz;
else
tx->tx_space_towrite += dn->dn_datablksz;
return;
}
+ if (dn->dn_maxblkid > 0 && name) {
+ /*
+ * access the name in this fat-zap so that we'll check
+ * for i/o errors to the leaf blocks, etc.
+ */
+ err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name,
+ 8, 0, NULL);
+ if (err == EIO) {
+ tx->tx_err = err;
+ return;
+ }
+ }
+
/*
- * 3 blocks overwritten per op: target leaf, ptrtbl block, header block
- * 3 new blocks written per op: new split leaf, 2 grown ptrtbl blocks
+ * 3 blocks overwritten: target leaf, ptrtbl block, header block
+ * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks
*/
dmu_tx_count_write(tx, dn, dn->dn_maxblkid * dn->dn_datablksz,
- (nops * 6ULL + cops * 3ULL) << dn->dn_datablkshift);
+ (3 + (add ? 3 : 0)) << dn->dn_datablkshift);
/*
* If the modified blocks are scattered to the four winds,
@@ -410,17 +549,16 @@ dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops)
*/
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs)
- tx->tx_space_towrite +=
- ((nops + cops) * 3ULL) << dn->dn_indblkshift;
+ tx->tx_space_towrite += 3 << dn->dn_indblkshift;
}
void
-dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops)
+dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name)
{
ASSERT(tx->tx_txg == 0);
dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP,
- dmu_tx_hold_zap_impl, (ops > 0?ops:0), (ops < 0?-ops:0));
+ dmu_tx_hold_zap_impl, add, (uintptr_t)name);
}
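
One subtlety in the accounting above: the conditional operator binds more loosely than +, so the count_write expression needs the inner parentheses around add ? 3 : 0; without them the whole thing parses as (3 + add) ? 3 : 0 and collapses to a constant 3 blocks whether or not an entry is being added. A two-line check:

    #include <stdio.h>

    int main(void)
    {
        int add = 1;
        printf("%d\n", 3 + add ? 3 : 0);    /* 3: parsed as (3 + add) ? 3 : 0 */
        printf("%d\n", 3 + (add ? 3 : 0));  /* 6: the intended block count */
        return (0);
    }
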
void
@@ -492,7 +630,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
return;
/* XXX No checking on the meta dnode for now */
- if (db->db.db_object & DMU_PRIVATE_OBJECT)
+ if (db->db.db_object == DMU_META_DNODE_OBJECT)
return;
for (dth = list_head(&tx->tx_holds); dth;
@@ -572,20 +710,19 @@ static int
dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth)
{
dmu_tx_hold_t *dth;
- uint64_t lsize, asize, fsize;
+ uint64_t lsize, asize, fsize, towrite;
*last_dth = NULL;
- tx->tx_space_towrite = 0;
- tx->tx_space_tofree = 0;
- tx->tx_space_tooverwrite = 0;
tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg)
return (ERESTART);
+ if (tx->tx_err)
+ return (tx->tx_err);
for (dth = list_head(&tx->tx_holds); dth;
- *last_dth = dth, dth = list_next(&tx->tx_holds, dth)) {
+ dth = list_next(&tx->tx_holds, dth)) {
dnode_t *dn = dth->dth_dnode;
if (dn != NULL) {
mutex_enter(&dn->dn_mtx);
@@ -608,8 +745,21 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth)
(void) refcount_add(&dn->dn_tx_holds, tx);
mutex_exit(&dn->dn_mtx);
}
- if (dth->dth_func)
- dth->dth_func(tx, dn, dth->dth_arg1, dth->dth_arg2);
+ *last_dth = dth;
+ if (tx->tx_err)
+ return (tx->tx_err);
+ }
+
+ /*
+ * If a snapshot has been taken since we made our estimates,
+ * assume that we won't be able to free or overwrite anything.
+ */
+ if (tx->tx_objset &&
+ dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) >
+ tx->tx_lastsnap_txg) {
+ tx->tx_space_towrite += tx->tx_space_tooverwrite;
+ tx->tx_space_tooverwrite = 0;
+ tx->tx_space_tofree = 0;
}
/*
@@ -619,13 +769,16 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth)
tx->tx_space_tofree;
lsize = tx->tx_space_towrite + tx->tx_space_tooverwrite;
asize = spa_get_asize(tx->tx_pool->dp_spa, lsize);
+ towrite = tx->tx_space_towrite;
tx->tx_space_towrite = asize;
if (tx->tx_dir && asize != 0) {
int err = dsl_dir_tempreserve_space(tx->tx_dir,
lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx);
- if (err)
+ if (err) {
+ tx->tx_space_towrite = towrite;
return (err);
+ }
}
return (0);
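
The snapshot check above guards against estimates going stale between hold time and assign time: if a snapshot was created in the interim, blocks the holds counted as freeable or overwritable are now referenced by the snapshot, so their cost is folded into plain writes. Toy arithmetic with invented values:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        uint64_t towrite = 100, tooverwrite = 40, tofree = 30;
        uint64_t lastsnap_txg = 50, prev_snap_txg = 57;  /* snapshot landed */

        if (prev_snap_txg > lastsnap_txg) {
            towrite += tooverwrite;   /* overwrites become fresh writes */
            tooverwrite = 0;
            tofree = 0;               /* the snapshot still references it */
        }
        printf("towrite=%llu tooverwrite=%llu tofree=%llu\n",
            (unsigned long long)towrite, (unsigned long long)tooverwrite,
            (unsigned long long)tofree);
        return (0);
    }
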
@@ -688,8 +841,6 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
ASSERT(tx->tx_txg == 0);
ASSERT(txg_how != 0);
ASSERT(!dsl_pool_sync_context(tx->tx_pool));
- ASSERT3U(tx->tx_space_towrite, ==, 0);
- ASSERT3U(tx->tx_space_tofree, ==, 0);
while ((err = dmu_tx_try_assign(tx, txg_how, &last_dth)) != 0) {
uint64_t txg = dmu_tx_unassign(tx, last_dth);
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
index 03ce2a0398..8adb692ec8 100644
--- a/usr/src/uts/common/fs/zfs/dnode.c
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -155,7 +155,7 @@ dnode_verify(dnode_t *dn)
}
if (dn->dn_phys->dn_type != DMU_OT_NONE)
ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
- ASSERT(IS_DNODE_DNODE(dn->dn_object) || dn->dn_dbuf);
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL);
if (dn->dn_dbuf != NULL) {
ASSERT3P(dn->dn_phys, ==,
(dnode_phys_t *)dn->dn_dbuf->db.db_data +
@@ -307,6 +307,11 @@ dnode_destroy(dnode_t *dn)
dn->dn_dirtyctx_firstset = NULL;
}
dmu_zfetch_rele(&dn->dn_zfetch);
+ if (dn->dn_bonus) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_evict(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ }
kmem_cache_free(dnode_cache, dn);
}
@@ -381,13 +386,10 @@ void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
- dmu_buf_impl_t *db = NULL;
-
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
- ASSERT(!(dn->dn_object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx));
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
ASSERT(tx->tx_txg != 0);
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0));
@@ -398,6 +400,10 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
ASSERT(dn->dn_dirtyblksz[2] == 0);
ASSERT(dn->dn_dirtyblksz[3] == 0);
+ /* clean up any unreferenced dbufs */
+ dnode_evict_dbufs(dn);
+ ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
/*
* XXX I should really have a generation number to tell if we
* need to do this...
@@ -421,17 +427,25 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dn->dn_type = ot;
if (dn->dn_bonuslen != bonuslen) {
+ dmu_buf_impl_t *db = NULL;
+
/* change bonus size */
if (bonuslen == 0)
bonuslen = 1; /* XXX */
- db = dbuf_hold_bonus(dn, FTAG);
- dbuf_read(db);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus == NULL)
+ dn->dn_bonus = dbuf_create_bonus(dn);
+ db = dn->dn_bonus;
+ rw_exit(&dn->dn_struct_rwlock);
+ if (refcount_add(&db->db_holds, FTAG) == 1)
+ dnode_add_ref(dn, db);
mutex_enter(&db->db_mtx);
ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
ASSERT(db->db.db_data != NULL);
db->db.db_size = bonuslen;
mutex_exit(&db->db_mtx);
dbuf_dirty(db, tx);
+ dbuf_rele(db, FTAG);
}
/* change bonus size and type */
@@ -445,14 +459,19 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
dn->dn_allocated_txg = tx->tx_txg;
mutex_exit(&dn->dn_mtx);
-
- if (db)
- dbuf_remove_ref(db, FTAG);
}
void
dnode_special_close(dnode_t *dn)
{
+ /*
+ * Wait for final references to the dnode to clear. This can
+ * only happen if the arc is asynchronously evicting state that
+ * has a hold on this dnode while we are trying to evict this
+ * dnode.
+ */
+ while (refcount_count(&dn->dn_holds) > 0)
+ delay(1);
dnode_destroy(dn);
}
@@ -498,21 +517,25 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg)
}
/*
- * Returns held dnode if the object number is valid, NULL if not.
- * Note that this will succeed even for free dnodes.
+ * errors:
+ * EINVAL - invalid object number.
+ * EIO - i/o error.
+ * Note that this succeeds even for free dnodes.
*/
-dnode_t *
-dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref)
+int
+dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag,
+ void *tag, dnode_t **dnp)
{
- int epb, idx;
+ int epb, idx, err;
int drop_struct_lock = FALSE;
+ int type;
uint64_t blk;
dnode_t *mdn, *dn;
dmu_buf_impl_t *db;
dnode_t **children_dnodes;
if (object == 0 || object >= DN_MAX_OBJECT)
- return (NULL);
+ return (EINVAL);
mdn = os->os_meta_dnode;
@@ -525,10 +548,16 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref)
blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
- db = dbuf_hold(mdn, blk);
+ db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock)
rw_exit(&mdn->dn_struct_rwlock);
- dbuf_read(db);
+ if (db == NULL)
+ return (EIO);
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+ if (err) {
+ dbuf_rele(db, FTAG);
+ return (err);
+ }
ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
epb = db->db.db_size >> DNODE_SHIFT;
@@ -559,51 +588,53 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref)
}
mutex_enter(&dn->dn_mtx);
+ type = dn->dn_type;
if (dn->dn_free_txg ||
- ((flag & DNODE_MUST_BE_ALLOCATED) && dn->dn_type == DMU_OT_NONE) ||
- ((flag & DNODE_MUST_BE_FREE) && dn->dn_type != DMU_OT_NONE)) {
+ ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
+ ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) {
mutex_exit(&dn->dn_mtx);
- dbuf_rele(db);
- return (NULL);
+ dbuf_rele(db, FTAG);
+ return (type == DMU_OT_NONE ? ENOENT : EEXIST);
}
mutex_exit(&dn->dn_mtx);
- if (refcount_add(&dn->dn_holds, ref) == 1)
+ if (refcount_add(&dn->dn_holds, tag) == 1)
dbuf_add_ref(db, dn);
DNODE_VERIFY(dn);
ASSERT3P(dn->dn_dbuf, ==, db);
ASSERT3U(dn->dn_object, ==, object);
- dbuf_rele(db);
+ dbuf_rele(db, FTAG);
- return (dn);
+ *dnp = dn;
+ return (0);
}
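
dnode_hold_impl() is converted from the return-NULL-on-failure style to the errno-plus-out-parameter style used throughout this patch, which lets callers distinguish a missing object (ENOENT or EEXIST, depending on the flag) from an I/O failure (EIO). A minimal sketch of the convention, with a hypothetical lookup table standing in for the meta-dnode:

    #include <stdio.h>
    #include <errno.h>

    static const char *table[] = { NULL, "alpha", NULL, "gamma" };

    static int obj_hold(unsigned obj, const char **out)
    {
        if (obj >= sizeof (table) / sizeof (table[0]))
            return (EINVAL);        /* invalid object number */
        if (table[obj] == NULL)
            return (ENOENT);        /* not allocated */
        *out = table[obj];
        return (0);
    }

    int main(void)
    {
        const char *p;
        printf("err=%d\n", obj_hold(2, &p));    /* ENOENT, not just NULL */
        if (obj_hold(3, &p) == 0)
            printf("held %s\n", p);
        return (0);
    }
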
/*
* Return held dnode if the object is allocated, NULL if not.
*/
-dnode_t *
-dnode_hold(objset_impl_t *os, uint64_t object, void *ref)
+int
+dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp)
{
- return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, ref));
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
}
void
-dnode_add_ref(dnode_t *dn, void *ref)
+dnode_add_ref(dnode_t *dn, void *tag)
{
ASSERT(refcount_count(&dn->dn_holds) > 0);
- (void) refcount_add(&dn->dn_holds, ref);
+ (void) refcount_add(&dn->dn_holds, tag);
}
void
-dnode_rele(dnode_t *dn, void *ref)
+dnode_rele(dnode_t *dn, void *tag)
{
uint64_t refs;
- refs = refcount_remove(&dn->dn_holds, ref);
+ refs = refcount_remove(&dn->dn_holds, tag);
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
if (refs == 0 && dn->dn_dbuf)
- dbuf_remove_ref(dn->dn_dbuf, dn);
+ dbuf_rele(dn->dn_dbuf, dn);
}
void
@@ -612,7 +643,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
objset_impl_t *os = dn->dn_objset;
uint64_t txg = tx->tx_txg;
- if (IS_DNODE_DNODE(dn->dn_object))
+ if (dn->dn_object == DMU_META_DNODE_OBJECT)
return;
DNODE_VERIFY(dn);
@@ -658,7 +689,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
* dnode will hang around after we finish processing its
* children.
*/
- (void) refcount_add(&dn->dn_holds, (void *)(uintptr_t)tx->tx_txg);
+ dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg);
dbuf_dirty(dn->dn_dbuf, tx);
@@ -764,7 +795,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
}
/* obtain the old block */
- db = dbuf_hold(dn, 0);
+ db = dbuf_hold(dn, 0, FTAG);
dbuf_new_size(db, size, tx);
@@ -773,7 +804,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
/* don't need dd_dirty_mtx, dnode is already dirty */
dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size;
dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
- dbuf_rele(db);
+ dbuf_rele(db, FTAG);
err = 0;
end:
@@ -844,7 +875,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
dmu_buf_impl_t *db = dbuf_hold_level(dn, old_nlevels, 0, FTAG);
dprintf("dn %p dirtying left indirects\n", dn);
dbuf_dirty(db, tx);
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
#ifdef ZFS_DEBUG
else if (old_nlevels > 1 && new_nlevels > old_nlevels) {
@@ -855,7 +886,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
db = dbuf_hold_level(dn, old_nlevels-1, i, FTAG);
ASSERT(!
list_link_active(&db->db_dirty_node[txgoff]));
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
}
#endif
@@ -976,7 +1007,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
data = db->db.db_data;
bzero(data + start, head);
}
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
off += head;
len -= head;
@@ -1009,7 +1040,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
bzero(db->db.db_data, tail);
}
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
len -= tail;
}
@@ -1022,7 +1053,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
db = dbuf_hold_level(dn, 1,
(off - head) >> (blkshift + epbs), FTAG);
dbuf_will_dirty(db, tx);
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
/* dirty the right indirects */
@@ -1030,7 +1061,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
db = dbuf_hold_level(dn, 1,
(off + len + tail - 1) >> (blkshift + epbs), FTAG);
dbuf_will_dirty(db, tx);
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
/*
@@ -1189,7 +1220,8 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
return (hole ? 0 : ESRCH);
return (error);
}
- dbuf_read_havestruct(db);
+ (void) dbuf_read(db, NULL,
+ DB_RF_MUST_SUCCEED | DB_RF_HAVESTRUCT);
data = db->db.db_data;
}
@@ -1228,7 +1260,7 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset,
}
if (db)
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
return (error);
}
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
index 597cafb44e..dcfb9ee7d2 100644
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -48,13 +47,15 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
/* this dnode can't be paged out because it's dirty */
db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG);
+ ASSERT(db != NULL);
for (i = 0; i < dn->dn_phys->dn_nblkptr; i++)
if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i]))
break;
if (i != dn->dn_phys->dn_nblkptr) {
ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK]));
- dbuf_read_havestruct(db);
+ (void) dbuf_read(db, NULL,
+ DB_RF_HAVESTRUCT | DB_RF_MUST_SUCCEED);
arc_release(db->db_buf, db);
/* copy dnode's block pointers to new indirect block */
ASSERT3U(sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr, <=,
@@ -102,7 +103,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx)
bzero(dn->dn_phys->dn_blkptr,
sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr);
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
static void
@@ -163,7 +164,8 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
/* db_data_old better be zeroed */
if (child->db_d.db_data_old[txg & TXG_MASK]) {
- buf = (child->db_d.db_data_old[txg & TXG_MASK])->b_data;
+ buf = ((arc_buf_t *)child->db_d.db_data_old
+ [txg & TXG_MASK])->b_data;
for (j = 0; j < child->db.db_size >> 3; j++) {
if (buf[j] != 0) {
panic("freed data not zero: "
@@ -194,7 +196,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx)
}
mutex_exit(&child->db_mtx);
- dbuf_remove_ref(child, FTAG);
+ dbuf_rele(child, FTAG);
}
}
#endif
@@ -211,7 +213,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
int txgoff = tx->tx_txg & TXG_MASK;
int all = TRUE;
- dbuf_read(db);
+ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
arc_release(db->db_buf, db);
bp = (blkptr_t *)db->db.db_data;
@@ -254,7 +256,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
} else {
all = FALSE;
}
- dbuf_remove_ref(subdb, FTAG);
+ dbuf_rele(subdb, FTAG);
}
#ifdef ZFS_DEBUG
bp -= (end-start)+1;
@@ -326,7 +328,7 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
ASSERT3P(db->db_blkptr, ==, bp);
free_blocks(dn, bp, 1, tx);
}
- dbuf_remove_ref(db, FTAG);
+ dbuf_rele(db, FTAG);
}
if (trunc) {
uint64_t off = (dn->dn_phys->dn_maxblkid + 1) *
@@ -338,6 +340,48 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
}
}
+/*
+ * Try to kick all the dnode's dbufs out of the cache...
+ */
+void
+dnode_evict_dbufs(dnode_t *dn)
+{
+ dmu_buf_impl_t *db;
+
+ mutex_enter(&dn->dn_dbufs_mtx);
+ while ((db = list_head(&dn->dn_dbufs)) != NULL) {
+ int progress = 0;
+ for (; db; db = list_next(&dn->dn_dbufs, db)) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_state != DB_EVICTING &&
+ refcount_is_zero(&db->db_holds))
+ break;
+ else if (db->db_state == DB_EVICTING)
+ progress = 1;
+ mutex_exit(&db->db_mtx);
+ }
+ if (db) {
+ ASSERT(!arc_released(db->db_buf));
+ dbuf_clear(db);
+ mutex_exit(&dn->dn_dbufs_mtx);
+ progress = 1;
+ } else {
+ if (progress == 0)
+ break;
+ mutex_exit(&dn->dn_dbufs_mtx);
+ }
+ mutex_enter(&dn->dn_dbufs_mtx);
+ }
+ mutex_exit(&dn->dn_dbufs_mtx);
+ rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+ if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) {
+ mutex_enter(&dn->dn_bonus->db_mtx);
+ dbuf_evict(dn->dn_bonus);
+ dn->dn_bonus = NULL;
+ }
+ rw_exit(&dn->dn_struct_rwlock);
+}
+
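
The loop above has a deliberate shape: scan for a dbuf with no holds and clear it, restarting from the head each time; if a pass finds nothing evictable but did see a dbuf in DB_EVICTING (another thread mid-eviction), rescan; only a pass that finds neither terminates. A lock-free toy of the same control flow (states are invented; the stand-in for the concurrent evictor just flips its entry):

    #include <stdio.h>

    #define GONE -1
    enum { CACHED, IN_USE, EVICTING };

    int main(void)
    {
        int state[] = { CACHED, IN_USE, EVICTING, CACHED };
        int n = 4;

        for (;;) {
            int progress = 0, i;
            for (i = 0; i < n; i++) {
                if (state[i] == CACHED)
                    break;              /* evictable: no holds */
                if (state[i] == EVICTING)
                    progress = 1;       /* someone else is working */
            }
            if (i < n) {
                state[i] = GONE;        /* dbuf_clear() */
                continue;               /* rescan from the head */
            }
            if (!progress)
                break;                  /* nothing left that we can do */
            state[2] = GONE;            /* toy: concurrent evictor finishes */
        }
        printf("done: %d %d %d %d\n", state[0], state[1], state[2], state[3]);
        return (0);
    }
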
static int
dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
{
@@ -352,32 +396,35 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
/* XXX - use dbuf_undirty()? */
list_remove(&dn->dn_dirty_dbufs[txgoff], db);
if (db->db_level == 0) {
- ASSERT3P(db->db_d.db_data_old[txgoff], ==, db->db_buf);
+ ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+ db->db_d.db_data_old[txgoff] == db->db_buf);
if (db->db_d.db_overridden_by[txgoff])
dbuf_unoverride(db, tx->tx_txg);
db->db_d.db_data_old[txgoff] = NULL;
}
db->db_dirtycnt -= 1;
mutex_exit(&db->db_mtx);
- dbuf_remove_ref(db, (void *)(uintptr_t)tx->tx_txg);
+ dbuf_rele(db, (void *)(uintptr_t)tx->tx_txg);
}
- ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
+ dnode_evict_dbufs(dn);
+ ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+
+ /*
+ * XXX - It would be nice to assert this, but we may still
+ * have residual holds from async evictions from the arc...
+ *
+ * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1);
+ */
/* Undirty next bits */
dn->dn_next_nlevels[txgoff] = 0;
dn->dn_next_indblkshift[txgoff] = 0;
/* free up all the blocks in the file. */
- dbuf_free_range(dn, 0, -1, tx);
dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx);
ASSERT3U(dn->dn_phys->dn_secphys, ==, 0);
- /*
- * All dbufs should be gone, since all holds are gone...
- */
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
-
/* ASSERT(blkptrs are zero); */
ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE);
ASSERT(dn->dn_type != DMU_OT_NONE);
@@ -394,7 +441,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
dn->dn_allocated_txg = 0;
mutex_exit(&dn->dn_mtx);
- ASSERT(!IS_DNODE_DNODE(dn->dn_object));
+ ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
/*
@@ -420,7 +467,7 @@ dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx)
/* ASSERT(dn->dn_objset->dd_snapshot == NULL); */
ASSERT(dmu_tx_is_syncing(tx));
- ASSERT(IS_DNODE_DNODE(dn->dn_object) ||
+ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
dn->dn_dirtyblksz[txgoff] > 0);
ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg);
@@ -533,7 +580,7 @@ dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx)
dn->dn_dirtyblksz[txgoff] = 0;
- if (!IS_DNODE_DNODE(dn->dn_object)) {
+ if (dn->dn_object != DMU_META_DNODE_OBJECT) {
dbuf_will_dirty(dn->dn_dbuf, tx);
dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg);
}
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index e77b772922..7db7745270 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -146,7 +145,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
-used, -compressed, -uncompressed, tx);
} else {
dprintf_bp(bp, "putting on dead list: %s", "");
- bplist_enqueue(&ds->ds_deadlist, bp, tx);
+ VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx));
/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
if (ds->ds_phys->ds_prev_snap_obj != 0) {
ASSERT3U(ds->ds_prev->ds_object, ==,
@@ -175,14 +174,14 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx)
mutex_exit(&ds->ds_lock);
}
-int
-dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, dmu_tx_t *tx)
+uint64_t
+dsl_dataset_prev_snap_txg(dsl_dataset_t *ds)
{
- uint64_t prev_snap_txg;
+ uint64_t txg;
dsl_dir_t *dd;
- /* ASSERT that it is not a snapshot */
+
if (ds == NULL)
- return (TRUE);
+ return (0);
/*
* The snapshot creation could fail, but that would cause an
* incorrect FALSE return, which would only result in an
@@ -195,13 +194,19 @@ dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, dmu_tx_t *tx)
*/
dd = ds->ds_dir;
mutex_enter(&dd->dd_lock);
- if (dd->dd_sync_func == dsl_dataset_snapshot_sync &&
- dd->dd_sync_txg < tx->tx_txg)
- prev_snap_txg = dd->dd_sync_txg;
+ if (dd->dd_sync_func == dsl_dataset_snapshot_sync)
+ txg = dd->dd_sync_txg;
else
- prev_snap_txg = ds->ds_phys->ds_prev_snap_txg;
+ txg = ds->ds_phys->ds_prev_snap_txg;
mutex_exit(&dd->dd_lock);
- return (blk_birth > prev_snap_txg);
+
+ return (txg);
+}
+
+int
+dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth)
+{
+ return (blk_birth > dsl_dataset_prev_snap_txg(ds));
}
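
After the refactor, the freeable test is a single comparison against the most recent snapshot's txg, and the factored-out dsl_dataset_prev_snap_txg() is what dmu_tx_try_assign() uses to notice a snapshot taken after the holds made their estimates. The whole predicate, with toy numbers:

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t prev_snap_txg = 120;    /* latest snapshot's txg (toy) */

    static int block_freeable(uint64_t blk_birth)
    {
        /* born after the snapshot: only the live fs references it */
        return (blk_birth > prev_snap_txg);
    }

    int main(void)
    {
        printf("born 100: %d\n", block_freeable(100)); /* 0: snapshot holds it */
        printf("born 150: %d\n", block_freeable(150)); /* 1: safe to free */
        return (0);
    }
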
/* ARGSUSED */
@@ -236,7 +241,7 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv)
kmem_free(ds, sizeof (dsl_dataset_t));
}
-static void
+static int
dsl_dataset_get_snapname(dsl_dataset_t *ds)
{
dsl_dataset_phys_t *headphys;
@@ -246,34 +251,37 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds)
objset_t *mos = dp->dp_meta_objset;
if (ds->ds_snapname[0])
- return;
+ return (0);
if (ds->ds_phys->ds_next_snap_obj == 0)
- return;
+ return (0);
- headdbuf = dmu_bonus_hold_tag(mos,
- ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG);
- dmu_buf_read(headdbuf);
+ err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ FTAG, &headdbuf);
+ if (err)
+ return (err);
headphys = headdbuf->db_data;
err = zap_value_search(dp->dp_meta_objset,
headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname);
- ASSERT(err == 0);
- dmu_buf_rele_tag(headdbuf, FTAG);
+ dmu_buf_rele(headdbuf, FTAG);
+ return (err);
}
-dsl_dataset_t *
+int
dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
- int mode, void *tag)
+ int mode, void *tag, dsl_dataset_t **dsp)
{
uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)];
objset_t *mos = dp->dp_meta_objset;
dmu_buf_t *dbuf;
dsl_dataset_t *ds;
+ int err;
ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
dsl_pool_sync_context(dp));
- dbuf = dmu_bonus_hold_tag(mos, dsobj, tag);
- dmu_buf_read(dbuf);
+ err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
+ if (err)
+ return (err);
ds = dmu_buf_get_user(dbuf);
if (ds == NULL) {
dsl_dataset_t *winner;
@@ -282,47 +290,60 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
ds->ds_dbuf = dbuf;
ds->ds_object = dsobj;
ds->ds_phys = dbuf->db_data;
- ds->ds_dir = dsl_dir_open_obj(dp,
- ds->ds_phys->ds_dir_obj, NULL, ds);
- bplist_open(&ds->ds_deadlist,
+ err = bplist_open(&ds->ds_deadlist,
mos, ds->ds_phys->ds_deadlist_obj);
+ if (err == 0) {
+ err = dsl_dir_open_obj(dp,
+ ds->ds_phys->ds_dir_obj, NULL, ds, &ds->ds_dir);
+ }
+ if (err) {
+ /*
+ * we don't really need to close the bplist if we
+ * just opened it.
+ */
+ kmem_free(ds, sizeof (dsl_dataset_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) {
ds->ds_snapname[0] = '\0';
if (ds->ds_phys->ds_prev_snap_obj) {
- ds->ds_prev =
- dsl_dataset_open_obj(dp,
+ err = dsl_dataset_open_obj(dp,
ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, ds);
+ DS_MODE_NONE, ds, &ds->ds_prev);
}
} else {
if (snapname) {
#ifdef ZFS_DEBUG
dsl_dataset_phys_t *headphys;
- int err;
- dmu_buf_t *headdbuf = dmu_bonus_hold_tag(mos,
- ds->ds_dir->dd_phys->
- dd_head_dataset_obj, FTAG);
- dmu_buf_read(headdbuf);
- headphys = headdbuf->db_data;
- uint64_t foundobj;
- err = zap_lookup(dp->dp_meta_objset,
- headphys->ds_snapnames_zapobj,
- snapname, sizeof (foundobj), 1, &foundobj);
- ASSERT3U(err, ==, 0);
- ASSERT3U(foundobj, ==, dsobj);
- dmu_buf_rele_tag(headdbuf, FTAG);
+ dmu_buf_t *headdbuf;
+ err = dmu_bonus_hold(mos,
+ ds->ds_dir->dd_phys->dd_head_dataset_obj,
+ FTAG, &headdbuf);
+ if (err == 0) {
+ headphys = headdbuf->db_data;
+ uint64_t foundobj;
+ err = zap_lookup(dp->dp_meta_objset,
+ headphys->ds_snapnames_zapobj,
+ snapname, sizeof (foundobj), 1,
+ &foundobj);
+ ASSERT3U(foundobj, ==, dsobj);
+ dmu_buf_rele(headdbuf, FTAG);
+ }
#endif
(void) strcat(ds->ds_snapname, snapname);
} else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) {
- dsl_dataset_get_snapname(ds);
+ err = dsl_dataset_get_snapname(ds);
}
}
- winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
- dsl_dataset_evict);
- if (winner) {
+ if (err == 0) {
+ winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys,
+ dsl_dataset_evict);
+ }
+ if (err || winner) {
bplist_close(&ds->ds_deadlist);
if (ds->ds_prev) {
dsl_dataset_close(ds->ds_prev,
@@ -330,6 +351,10 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
}
dsl_dir_close(ds->ds_dir, ds);
kmem_free(ds, sizeof (dsl_dataset_t));
+ if (err) {
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
ds = winner;
} else {
uint64_t new =
@@ -349,12 +374,13 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname,
(ds->ds_open_refcount + weight > DOS_REF_MAX)) {
mutex_exit(&ds->ds_lock);
dsl_dataset_close(ds, DS_MODE_NONE, tag);
- return (NULL);
+ return (EBUSY);
}
ds->ds_open_refcount += weight;
mutex_exit(&ds->ds_lock);
- return (ds);
+ *dsp = ds;
+ return (0);
}
int
@@ -368,9 +394,9 @@ dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
dsl_dataset_t *ds = NULL;
int err = 0;
- dd = dsl_dir_open_spa(spa, name, FTAG, &tail);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail);
+ if (err)
+ return (err);
dp = dd->dd_pool;
obj = dd->dd_phys->dd_head_dataset_obj;
@@ -384,7 +410,10 @@ dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
if (tail != NULL) {
objset_t *mos = dp->dp_meta_objset;
- ds = dsl_dataset_open_obj(dp, obj, NULL, DS_MODE_NONE, tag);
+ err = dsl_dataset_open_obj(dp, obj, NULL,
+ DS_MODE_NONE, tag, &ds);
+ if (err)
+ goto out;
obj = ds->ds_phys->ds_snapnames_zapobj;
dsl_dataset_close(ds, DS_MODE_NONE, tag);
ds = NULL;
@@ -405,9 +434,7 @@ dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
if (err)
goto out;
}
- ds = dsl_dataset_open_obj(dp, obj, tail, mode, tag);
- if (ds == NULL)
- err = EBUSY;
+ err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds);
out:
rw_exit(&dp->dp_config_rwlock);
@@ -433,7 +460,7 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name)
(void) strcpy(name, "mos");
} else {
dsl_dir_name(ds->ds_dir, name);
- dsl_dataset_get_snapname(ds);
+ VERIFY(0 == dsl_dataset_get_snapname(ds));
if (ds->ds_snapname[0]) {
(void) strcat(name, "@");
if (!MUTEX_HELD(&ds->ds_lock)) {
@@ -462,7 +489,7 @@ dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag)
mode, ds->ds_open_refcount);
mutex_exit(&ds->ds_lock);
- dmu_buf_rele_tag(ds->ds_dbuf, tag);
+ dmu_buf_rele(ds->ds_dbuf, tag);
}
void
@@ -476,16 +503,16 @@ dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
dsl_dir_t *dd;
dsl_dir_create_root(mos, ddobjp, tx);
- dd = dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG);
- ASSERT(dd != NULL);
+ VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd));
dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- dbuf = dmu_bonus_hold(mos, dsobj);
+ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
dsphys->ds_dir_obj = dd->dd_object;
dsphys->ds_fsid_guid = unique_create();
+ unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */
(void) random_get_pseudo_bytes((void*)&dsphys->ds_guid,
sizeof (dsphys->ds_guid));
dsphys->ds_snapnames_zapobj =
@@ -494,13 +521,14 @@ dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx)
dsphys->ds_creation_txg = tx->tx_txg;
dsphys->ds_deadlist_obj =
bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- dmu_buf_rele(dbuf);
+ dmu_buf_rele(dbuf, FTAG);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
dd->dd_phys->dd_head_dataset_obj = dsobj;
dsl_dir_close(dd, FTAG);
- ds = dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG);
+ VERIFY(0 ==
+ dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds));
(void) dmu_objset_create_impl(dp->dp_spa, ds, DMU_OST_ZFS, tx);
dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
}
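
dsl_dataset_create_root() shows the other half of the new error discipline: in sync context, where these calls operate on objects just created in the same transaction and cannot legitimately fail, the fallible interfaces are wrapped in VERIFY(0 == ...) rather than having their return values ignored. A toy of the convention; note the real kernel VERIFY, unlike assert(), is never compiled out:

    #include <assert.h>
    #include <stdio.h>

    #define VERIFY(x) assert(x)     /* toy; the kernel macro survives NDEBUG */

    static int fallible_open(int obj, int *out)
    {
        if (obj == 0)
            return (22 /* EINVAL */);
        *out = obj;
        return (0);
    }

    int main(void)
    {
        int h;
        VERIFY(0 == fallible_open(7, &h));  /* dies loudly if it ever fails */
        printf("held %d\n", h);
        return (0);
    }
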
@@ -537,14 +565,13 @@ dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
err = dsl_dir_create_sync(pds, lastname, tx);
if (err)
return (err);
- dd = dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, NULL);
- ASSERT(dd != NULL);
+ VERIFY(0 == dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, &dd, NULL));
/* This is the point of no (unsuccessful) return */
dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- dbuf = dmu_bonus_hold(mos, dsobj);
+ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
dsphys->ds_dir_obj = dd->dd_object;
@@ -576,7 +603,7 @@ dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
dmu_buf_will_dirty(dd->dd_dbuf, tx);
dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object;
}
- dmu_buf_rele(dbuf);
+ dmu_buf_rele(dbuf, FTAG);
dmu_buf_will_dirty(dd->dd_dbuf, tx);
dd->dd_phys->dd_head_dataset_obj = dsobj;
@@ -594,9 +621,9 @@ dsl_dataset_destroy(const char *name)
dsl_dir_t *dd;
const char *tail;
- dd = dsl_dir_open(name, FTAG, &tail);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(name, FTAG, &dd, &tail);
+ if (err)
+ return (err);
dp = dd->dd_pool;
if (tail != NULL) {
@@ -631,10 +658,12 @@ dsl_dataset_destroy(const char *name)
* dsl_dataset_destroy_sync() to destroy the head dataset.
*/
rw_enter(&dp->dp_config_rwlock, RW_READER);
- pds = dsl_dir_open_obj(dd->dd_pool,
- dd->dd_phys->dd_parent_obj, NULL, FTAG);
+ err = dsl_dir_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_parent_obj, NULL, FTAG, &pds);
dsl_dir_close(dd, FTAG);
rw_exit(&dp->dp_config_rwlock);
+ if (err)
+ return (err);
(void) strcpy(buf, name);
cp = strrchr(buf, '/') + 1;
@@ -657,9 +686,9 @@ dsl_dataset_rollback(const char *name)
dsl_dir_t *dd;
const char *tail;
- dd = dsl_dir_open(name, FTAG, &tail);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(name, FTAG, &dd, &tail);
+ if (err)
+ return (err);
if (tail != NULL) {
dsl_dir_close(dd, FTAG);
@@ -777,11 +806,14 @@ dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
{
objset_t *mos = dd->dd_pool->dp_meta_objset;
dsl_dataset_t *ds;
+ int err;
if (dd->dd_phys->dd_head_dataset_obj == 0)
return (EINVAL);
- ds = dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
+ err = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &ds);
+ if (err)
+ return (err);
if (ds->ds_phys->ds_prev_snap_txg == 0) {
/*
@@ -823,7 +855,8 @@ dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx);
ds->ds_phys->ds_deadlist_obj =
bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+ VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
+ ds->ds_phys->ds_deadlist_obj));
dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj);
{
@@ -891,27 +924,23 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
drop_lock = TRUE;
}
- ds = dsl_dataset_open_obj(dd->dd_pool,
+ err = dsl_dataset_open_obj(dd->dd_pool,
dd->dd_phys->dd_head_dataset_obj, NULL,
- snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG);
+ snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG, &ds);
- if (snapname) {
+ if (err == 0 && snapname) {
err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
snapname, 8, 1, &obj);
dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
- if (err) {
- if (drop_lock)
- rw_exit(&dp->dp_config_rwlock);
- return (err);
+ if (err == 0) {
+ err = dsl_dataset_open_obj(dd->dd_pool, obj, NULL,
+ DS_MODE_EXCLUSIVE, FTAG, &ds);
}
-
- ds = dsl_dataset_open_obj(dd->dd_pool, obj, NULL,
- DS_MODE_EXCLUSIVE, FTAG);
}
- if (ds == NULL) {
+ if (err) {
if (drop_lock)
rw_exit(&dp->dp_config_rwlock);
- return (EBUSY);
+ return (err);
}
obj = ds->ds_object;
@@ -942,22 +971,25 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
* them. Try again.
*/
if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) {
- mutex_exit(&ds->ds_lock);
dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
if (drop_lock)
rw_exit(&dp->dp_config_rwlock);
return (EAGAIN);
}
- /* THE POINT OF NO (unsuccessful) RETURN */
-
if (ds->ds_phys->ds_prev_snap_obj != 0) {
if (ds->ds_prev) {
ds_prev = ds->ds_prev;
} else {
- ds_prev = dsl_dataset_open_obj(dd->dd_pool,
+ err = dsl_dataset_open_obj(dd->dd_pool,
ds->ds_phys->ds_prev_snap_obj, NULL,
- DS_MODE_NONE, FTAG);
+ DS_MODE_NONE, FTAG, &ds_prev);
+ if (err) {
+ dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
+ if (drop_lock)
+ rw_exit(&dp->dp_config_rwlock);
+ return (err);
+ }
}
after_branch_point =
(ds_prev->ds_phys->ds_next_snap_obj != obj);
@@ -974,6 +1006,8 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
}
}
+ /* THE POINT OF NO (unsuccessful) RETURN */
+
ASSERT3P(tx->tx_pool, ==, dd->dd_pool);
zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
@@ -983,8 +1017,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
spa_scrub_restart(dp->dp_spa, tx->tx_txg);
- ds_next = dsl_dataset_open_obj(dd->dd_pool,
- ds->ds_phys->ds_next_snap_obj, NULL, DS_MODE_NONE, FTAG);
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ ds->ds_phys->ds_next_snap_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_next));
ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj);
dmu_buf_will_dirty(ds_next->ds_dbuf, tx);
@@ -1006,7 +1041,8 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
while (bplist_iterate(&ds_next->ds_deadlist, &itor,
&bp) == 0) {
if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) {
- bplist_enqueue(&ds->ds_deadlist, &bp, tx);
+ VERIFY(0 == bplist_enqueue(&ds->ds_deadlist,
+ &bp, tx));
if (ds_prev && !after_branch_point &&
bp.blk_birth >
ds_prev->ds_phys->ds_prev_snap_txg) {
@@ -1030,8 +1066,8 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
/* set next's deadlist to our deadlist */
ds_next->ds_phys->ds_deadlist_obj =
ds->ds_phys->ds_deadlist_obj;
- bplist_open(&ds_next->ds_deadlist, mos,
- ds_next->ds_phys->ds_deadlist_obj);
+ VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos,
+ ds_next->ds_phys->ds_deadlist_obj));
ds->ds_phys->ds_deadlist_obj = 0;
if (ds_next->ds_phys->ds_next_snap_obj != 0) {
@@ -1049,9 +1085,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
*/
dsl_dataset_t *ds_after_next;
- ds_after_next = dsl_dataset_open_obj(dd->dd_pool,
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
ds_next->ds_phys->ds_next_snap_obj, NULL,
- DS_MODE_NONE, FTAG);
+ DS_MODE_NONE, FTAG, &ds_after_next));
itor = 0;
while (bplist_iterate(&ds_after_next->ds_deadlist,
&itor, &bp) == 0) {
@@ -1078,9 +1114,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE,
ds_next);
if (ds_prev) {
- ds_next->ds_prev = dsl_dataset_open_obj(
- dd->dd_pool, ds->ds_phys->ds_prev_snap_obj,
- NULL, DS_MODE_NONE, ds_next);
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, ds_next, &ds_next->ds_prev));
} else {
ds_next->ds_prev = NULL;
}
@@ -1144,8 +1180,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
} else {
/* remove from snapshot namespace */
dsl_dataset_t *ds_head;
- ds_head = dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_head));
#ifdef ZFS_DEBUG
{
uint64_t val;
@@ -1195,8 +1232,10 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
if (dd->dd_phys->dd_head_dataset_obj == 0)
return (EINVAL);
- ds = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL,
- DS_MODE_NONE, FTAG);
+ err = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds);
+ if (err)
+ return (err);
err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj,
snapname, 8, 1, &value);
@@ -1217,7 +1256,7 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0,
DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx);
- dbuf = dmu_bonus_hold(mos, dsobj);
+ VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
dsphys->ds_dir_obj = dd->dd_object;
@@ -1237,13 +1276,14 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes;
dsphys->ds_restoring = ds->ds_phys->ds_restoring;
dsphys->ds_bp = ds->ds_phys->ds_bp;
- dmu_buf_rele(dbuf);
+ dmu_buf_rele(dbuf, FTAG);
if (ds->ds_phys->ds_prev_snap_obj != 0) {
dsl_dataset_t *ds_prev;
- ds_prev = dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_NONE, FTAG);
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, NULL,
+ DS_MODE_NONE, FTAG, &ds_prev));
ASSERT(ds_prev->ds_phys->ds_next_snap_obj ==
ds->ds_object ||
ds_prev->ds_phys->ds_num_children > 1);
@@ -1266,7 +1306,8 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
ds->ds_phys->ds_unique_bytes = 0;
ds->ds_phys->ds_deadlist_obj =
bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx);
- bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj);
+ VERIFY(0 == bplist_open(&ds->ds_deadlist, mos,
+ ds->ds_phys->ds_deadlist_obj));
dprintf("snap '%s' -> obj %llu\n", snapname, dsobj);
err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj,
@@ -1275,8 +1316,9 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
if (ds->ds_prev)
dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds);
- ds->ds_prev = dsl_dataset_open_obj(dp,
- ds->ds_phys->ds_prev_snap_obj, snapname, DS_MODE_NONE, ds);
+ VERIFY(0 == dsl_dataset_open_obj(dp,
+ ds->ds_phys->ds_prev_snap_obj, snapname,
+ DS_MODE_NONE, ds, &ds->ds_prev));
rw_exit(&dp->dp_config_rwlock);
dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
@@ -1295,7 +1337,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, dmu_tx_t *tx)
dsl_dir_dirty(ds->ds_dir, tx);
bplist_close(&ds->ds_deadlist);
- dmu_buf_remove_ref(ds->ds_dbuf, ds);
+ dmu_buf_rele(ds->ds_dbuf, ds);
}
void
@@ -1319,7 +1361,6 @@ dsl_dataset_stats(dsl_dataset_t *ds, dmu_objset_stats_t *dds)
dds->dds_creation_txg = ds->ds_phys->ds_creation_txg;
dds->dds_space_refd = ds->ds_phys->ds_used_bytes;
dds->dds_fsid_guid = ds->ds_phys->ds_fsid_guid;
- dds->dds_guid = ds->ds_phys->ds_guid;
if (ds->ds_phys->ds_next_snap_obj) {
/*
@@ -1332,8 +1373,6 @@ dsl_dataset_stats(dsl_dataset_t *ds, dmu_objset_stats_t *dds)
dds->dds_uncompressed_bytes =
ds->ds_phys->ds_uncompressed_bytes;
}
-
- dds->dds_objset_obj = ds->ds_object;
}
dsl_pool_t *
@@ -1375,10 +1414,11 @@ dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
}
/* new fs better exist */
- nds = dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname, FTAG, &tail);
- if (nds == NULL) {
+ err = dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname,
+ FTAG, &nds, &tail);
+ if (err) {
dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
- return (ENOENT);
+ return (err);
}
dsl_dir_close(nds, FTAG);
@@ -1397,8 +1437,12 @@ dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
tail++;
- fsds = dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG);
+ err = dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &fsds);
+ if (err) {
+ dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG);
+ return (err);
+ }
/* new name better not be in use */
err = zap_lookup(mos, fsds->ds_phys->ds_snapnames_zapobj,
@@ -1414,7 +1458,7 @@ dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
/* The point of no (unsuccessful) return */
rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER);
- dsl_dataset_get_snapname(snds);
+ VERIFY(0 == dsl_dataset_get_snapname(snds));
err = zap_remove(mos, fsds->ds_phys->ds_snapnames_zapobj,
snds->ds_snapname, tx);
ASSERT3U(err, ==, 0);
@@ -1440,9 +1484,9 @@ dsl_dataset_rename(const char *osname, const char *newname)
struct osrenamearg ora;
int err;
- dd = dsl_dir_open(osname, FTAG, &tail);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(osname, FTAG, &dd, &tail);
+ if (err)
+ return (err);
if (tail == NULL) {
err = dsl_dir_sync_task(dd,
dsl_dir_rename_sync, (void*)newname, 1<<12);
diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c
index 4ea1d62de5..8ffa145477 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dir.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dir.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -76,18 +75,20 @@ dsl_dir_evict(dmu_buf_t *db, void *arg)
kmem_free(dd, sizeof (dsl_dir_t));
}
-dsl_dir_t *
+int
dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
- const char *tail, void *tag)
+ const char *tail, void *tag, dsl_dir_t **ddp)
{
dmu_buf_t *dbuf;
dsl_dir_t *dd;
+ int err;
ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) ||
dsl_pool_sync_context(dp));
- dbuf = dmu_bonus_hold_tag(dp->dp_meta_objset, ddobj, tag);
- dmu_buf_read(dbuf);
+ err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf);
+ if (err)
+ return (err);
dd = dmu_buf_get_user(dbuf);
#ifdef ZFS_DEBUG
{
@@ -112,8 +113,13 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
offsetof(dsl_prop_cb_record_t, cbr_node));
if (dd->dd_phys->dd_parent_obj) {
- dd->dd_parent = dsl_dir_open_obj(dp,
- dd->dd_phys->dd_parent_obj, NULL, dd);
+ err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj,
+ NULL, dd, &dd->dd_parent);
+ if (err) {
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
+ }
if (tail) {
#ifdef ZFS_DEBUG
uint64_t foundobj;
@@ -122,8 +128,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
dd->dd_parent->dd_phys->
dd_child_dir_zapobj,
tail, sizeof (foundobj), 1, &foundobj);
- ASSERT3U(err, ==, 0);
- ASSERT3U(foundobj, ==, ddobj);
+ ASSERT(err || foundobj == ddobj);
#endif
(void) strcpy(dd->dd_myname, tail);
} else {
@@ -131,11 +136,12 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
dd->dd_parent->dd_phys->
dd_child_dir_zapobj,
ddobj, dd->dd_myname);
- /*
- * The caller should be protecting this ddobj
- * from being deleted concurrently
- */
- ASSERT(err == 0);
+ }
+ if (err) {
+ dsl_dir_close(dd->dd_parent, dd);
+ kmem_free(dd, sizeof (dsl_dir_t));
+ dmu_buf_rele(dbuf, tag);
+ return (err);
}
} else {
(void) strcpy(dd->dd_myname, spa_name(dp->dp_spa));
@@ -166,7 +172,8 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
ASSERT3P(dd->dd_pool, ==, dp);
ASSERT3U(dd->dd_object, ==, ddobj);
ASSERT3P(dd->dd_dbuf, ==, dbuf);
- return (dd);
+ *ddp = dd;
+ return (0);
}
void
@@ -174,7 +181,7 @@ dsl_dir_close(dsl_dir_t *dd, void *tag)
{
dprintf_dd(dd, "%s\n", "");
spa_close(dd->dd_pool->dp_spa, tag);
- dmu_buf_rele_tag(dd->dd_dbuf, tag);
+ dmu_buf_rele(dd->dd_dbuf, tag);
}
/* buf must be long enough (MAXNAMELEN should do) */
@@ -266,8 +273,9 @@ getcomponent(const char *path, char *component, const char **nextp)
* same as dsl_open_dir, ignore the first component of name and use the
* spa instead
*/
-dsl_dir_t *
-dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
+int
+dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
+ dsl_dir_t **ddp, const char **tailp)
{
char buf[MAXNAMELEN];
const char *next, *nextnext = NULL;
@@ -280,15 +288,15 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
dprintf("%s\n", name);
if (name == NULL)
- return (NULL);
+ return (ENOENT);
err = getcomponent(name, buf, &next);
if (err)
- return (NULL);
+ return (err);
if (spa == NULL) {
err = spa_open(buf, &spa, FTAG);
if (err) {
dprintf("spa_open(%s) failed\n", buf);
- return (NULL);
+ return (err);
}
openedspa = TRUE;
@@ -299,17 +307,19 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
dp = spa_get_dsl(spa);
rw_enter(&dp->dp_config_rwlock, RW_READER);
- dd = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag);
+ err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
+ if (err) {
+ rw_exit(&dp->dp_config_rwlock);
+ if (openedspa)
+ spa_close(spa, FTAG);
+ return (err);
+ }
+
while (next != NULL) {
dsl_dir_t *child_ds;
err = getcomponent(next, buf, &nextnext);
- if (err) {
- dsl_dir_close(dd, tag);
- rw_exit(&dp->dp_config_rwlock);
- if (openedspa)
- spa_close(spa, FTAG);
- return (NULL);
- }
+ if (err)
+ break;
ASSERT(next[0] != '\0');
if (next[0] == '@')
break;
@@ -321,18 +331,28 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
err = zap_lookup(dp->dp_meta_objset,
dd->dd_phys->dd_child_dir_zapobj,
buf, sizeof (ddobj), 1, &ddobj);
- if (err == ENOENT) {
+ if (err) {
+ if (err == ENOENT)
+ err = 0;
break;
}
- ASSERT(err == 0);
- child_ds = dsl_dir_open_obj(dp, ddobj, buf, tag);
+ err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds);
+ if (err)
+ break;
dsl_dir_close(dd, tag);
dd = child_ds;
next = nextnext;
}
rw_exit(&dp->dp_config_rwlock);
+ if (err) {
+ dsl_dir_close(dd, tag);
+ if (openedspa)
+ spa_close(spa, FTAG);
+ return (err);
+ }
+
/*
* It's an error if there's more than one component left, or
* tailp==NULL and there's any component left.
@@ -342,14 +362,14 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
/* bad path name */
dsl_dir_close(dd, tag);
dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
- next = NULL;
- dd = NULL;
+ err = ENOENT;
}
if (tailp)
*tailp = next;
if (openedspa)
spa_close(spa, FTAG);
- return (dd);
+ *ddp = dd;
+ return (err);
}
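
The rewritten loop concentrates the unwinding: a failed component lookup breaks out rather than duplicating the close/exit/spa_close sequence at every error site, and one block after the loop releases whatever is held. The same control flow in a stand-alone toy, with strtok_r standing in for getcomponent():

    #include <stdio.h>
    #include <string.h>
    #include <errno.h>

    static int lookup(const char *name)     /* toy child lookup */
    {
        return (strcmp(name, "missing") == 0 ? ENOENT : 0);
    }

    int main(void)
    {
        char path[] = "tank/home/missing/docs";
        char *comp, *last;
        int err = 0;

        for (comp = strtok_r(path, "/", &last); comp != NULL;
            comp = strtok_r(NULL, "/", &last)) {
            err = lookup(comp);
            if (err)
                break;                      /* unwind once, below */
            printf("descended into %s\n", comp);
        }
        if (err) {
            printf("failed, releasing holds, err=%d\n", err);
            return (err);
        }
        return (0);
    }
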
/*
@@ -358,10 +378,10 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp)
* tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@'
* means that the last component is a snapshot.
*/
-dsl_dir_t *
-dsl_dir_open(const char *name, void *tag, const char **tailp)
+int
+dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp)
{
- return (dsl_dir_open_spa(NULL, name, tag, tailp));
+ return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp));
}
int
@@ -397,7 +417,7 @@ dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
dprintf("dataset_create: zap_add %s->%lld to %lld returned %d\n",
name, ddobj, pds->dd_phys->dd_child_dir_zapobj, err);
- dbuf = dmu_bonus_hold(mos, ddobj);
+ VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsphys = dbuf->db_data;
@@ -407,7 +427,7 @@ dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx)
DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx);
dsphys->dd_child_dir_zapobj = zap_create(mos,
DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
- dmu_buf_rele(dbuf);
+ dmu_buf_rele(dbuf, FTAG);
rw_exit(&pds->dd_pool->dp_config_rwlock);
@@ -431,7 +451,9 @@ dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx)
if (err)
goto out;
- dd = dsl_dir_open_obj(dp, obj, name, FTAG);
+ err = dsl_dir_open_obj(dp, obj, name, FTAG, &dd);
+ if (err)
+ goto out;
ASSERT3U(dd->dd_phys->dd_parent_obj, ==, pds->dd_object);
if (dmu_buf_refcount(dd->dd_dbuf) > 1) {
@@ -512,7 +534,7 @@ dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
sizeof (uint64_t), 1, ddobjp, tx);
ASSERT3U(error, ==, 0);
- dbuf = dmu_bonus_hold(mos, *ddobjp);
+ VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf));
dmu_buf_will_dirty(dbuf, tx);
dsp = dbuf->db_data;
@@ -522,7 +544,7 @@ dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx)
dsp->dd_child_dir_zapobj = zap_create(mos,
DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx);
- dmu_buf_rele(dbuf);
+ dmu_buf_rele(dbuf, FTAG);
}
void
@@ -530,7 +552,6 @@ dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds)
{
bzero(dds, sizeof (dmu_objset_stats_t));
- dds->dds_dir_obj = dd->dd_object;
dds->dds_available = dsl_dir_space_available(dd, NULL, 0, TRUE);
mutex_enter(&dd->dd_lock);
@@ -543,22 +564,17 @@ dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds)
dds->dds_creation_time = dd->dd_phys->dd_creation_time;
- dds->dds_is_placeholder = (dd->dd_phys->dd_head_dataset_obj == 0);
-
if (dd->dd_phys->dd_clone_parent_obj) {
dsl_dataset_t *ds;
rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER);
- ds = dsl_dataset_open_obj(dd->dd_pool,
- dd->dd_phys->dd_clone_parent_obj, NULL, DS_MODE_NONE, FTAG);
+ VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool,
+ dd->dd_phys->dd_clone_parent_obj,
+ NULL, DS_MODE_NONE, FTAG, &ds));
dsl_dataset_name(ds, dds->dds_clone_of);
- dds->dds_clone_of_obj = dd->dd_phys->dd_clone_parent_obj;
dsl_dataset_close(ds, DS_MODE_NONE, FTAG);
rw_exit(&dd->dd_pool->dp_config_rwlock);
}
-
- spa_altroot(dd->dd_pool->dp_spa, dds->dds_altroot,
- sizeof (dds->dds_altroot));
}
int
@@ -668,7 +684,7 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx)
mutex_exit(&dd->dd_lock);
/* release the hold from dsl_dir_dirty */
- dmu_buf_remove_ref(dd->dd_dbuf, dd);
+ dmu_buf_rele(dd->dd_dbuf, dd);
}
static uint64_t
@@ -679,7 +695,7 @@ dsl_dir_estimated_space(dsl_dir_t *dd)
ASSERT(MUTEX_HELD(&dd->dd_lock));
- space = dd->dd_used_bytes;
+ space = dd->dd_phys->dd_used_bytes;
ASSERT(space >= 0);
for (i = 0; i < TXG_SIZE; i++) {
space += dd->dd_space_towrite[i&TXG_MASK];
@@ -788,6 +804,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd,
struct tempreserve *tr;
ASSERT3U(txg, !=, 0);
+ ASSERT3S(asize, >=, 0);
mutex_enter(&dd->dd_lock);
/*
@@ -827,10 +844,14 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd,
/*
* If they are requesting more space, and our current estimate
* is over quota, they get to try again unless the actual
- * on-disk is over quota.
+ * on-disk is over quota and there are no pending changes (which
+ * may free up space for us).
*/
if (asize > 0 && est_used > quota) {
- if (dd->dd_used_bytes < quota)
+ if (dd->dd_space_towrite[txg & TXG_MASK] != 0 ||
+ dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 ||
+ dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 ||
+ dd->dd_used_bytes < quota)
edquot = ERESTART;
dprintf_dd(dd, "failing: used=%lluK est_used = %lluK "
"quota=%lluK tr=%lluK err=%d\n",
@@ -876,6 +897,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize,
tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP);
list_create(tr_list, sizeof (struct tempreserve),
offsetof(struct tempreserve, tr_node));
+ ASSERT3S(asize, >=, 0);
+ ASSERT3S(fsize, >=, 0);
err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize,
tr_list, tx);
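
The ERESTART logic above softens an over-quota failure: as long as recent txgs still have pending writes (whose completion may free space) or the on-disk figure itself is still under quota, the caller gets a retryable ERESTART instead of a hard EDQUOT. A toy of just that decision, with illumos errno values assumed:

    #include <stdio.h>

    #define ERESTART 91     /* illumos value; an assumption for this toy */
    #define EDQUOT   49

    int main(void)
    {
        long quota = 1000, used = 1005, est_used = 1100;
        long towrite[3] = { 0, 8, 0 };  /* this txg and the two prior */
        int err = EDQUOT;

        /* models only the over-quota branch of tempreserve_impl */
        if (est_used > quota) {
            if (towrite[0] || towrite[1] || towrite[2] || used < quota)
                err = ERESTART;         /* changes pending: try again */
        }
        printf("err=%d\n", err);        /* 91: retry */
        return (0);
    }
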
@@ -975,8 +998,6 @@ dsl_dir_diduse_space(dsl_dir_t *dd,
ASSERT(uncompressed >= 0 ||
dd->dd_phys->dd_uncompressed_bytes >= -uncompressed);
dd->dd_used_bytes += used;
- if (used > 0)
- dd->dd_space_towrite[tx->tx_txg & TXG_MASK] -= used;
dd->dd_phys->dd_uncompressed_bytes += uncompressed;
dd->dd_phys->dd_compressed_bytes += compressed;
mutex_exit(&dd->dd_lock);
@@ -1013,9 +1034,9 @@ dsl_dir_set_quota(const char *ddname, uint64_t quota)
dsl_dir_t *dd;
int err;
- dd = dsl_dir_open(ddname, FTAG, NULL);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err)
+ return (err);
/*
* If someone removes a file, then tries to set the quota, we
* want to make sure the file freeing takes effect.
@@ -1073,9 +1094,9 @@ dsl_dir_set_reservation(const char *ddname, uint64_t reservation)
dsl_dir_t *dd;
int err;
- dd = dsl_dir_open(ddname, FTAG, NULL);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err)
+ return (err);
err = dsl_dir_sync_task(dd,
dsl_dir_set_reservation_sync, &reservation, 0);
dsl_dir_close(dd, FTAG);
@@ -1128,11 +1149,10 @@ dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
return (ENXIO);
}
- newpds = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &tail);
-
/* new parent should exist */
- if (newpds == NULL)
- return (ENOENT);
+ err = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &newpds, &tail);
+ if (err)
+ return (err);
/* new name should not already exist */
if (tail == NULL) {
@@ -1195,8 +1215,8 @@ dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx)
(void) strcpy(dd->dd_myname, tail);
dsl_dir_close(dd->dd_parent, dd);
dd->dd_phys->dd_parent_obj = newpds->dd_object;
- dd->dd_parent = dsl_dir_open_obj(dd->dd_pool,
- newpds->dd_object, NULL, dd);
+ VERIFY(0 == dsl_dir_open_obj(dd->dd_pool,
+ newpds->dd_object, NULL, dd, &dd->dd_parent));
/* add to new parent zapobj */
err = zap_add(mos, newpds->dd_phys->dd_child_dir_zapobj,
diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c
index 5b71ccfaa9..b8e54be6f6 100644
--- a/usr/src/uts/common/fs/zfs/dsl_pool.c
+++ b/usr/src/uts/common/fs/zfs/dsl_pool.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -39,8 +38,8 @@
/* internal reserved dir name */
#define MOS_DIR_NAME "$MOS"
-static dsl_dir_t *
-dsl_pool_open_mos_dir(dsl_pool_t *dp)
+static int
+dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp)
{
uint64_t obj;
int err;
@@ -48,9 +47,10 @@ dsl_pool_open_mos_dir(dsl_pool_t *dp)
err = zap_lookup(dp->dp_meta_objset,
dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
MOS_DIR_NAME, sizeof (obj), 1, &obj);
- ASSERT3U(err, ==, 0);
+ if (err)
+ return (err);
- return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp));
+ return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp));
}
static dsl_pool_t *
@@ -74,38 +74,56 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg)
return (dp);
}
-dsl_pool_t *
-dsl_pool_open(spa_t *spa, uint64_t txg)
+int
+dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
int err;
dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
-
- dp->dp_meta_objset =
- &dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp)->os;
+ objset_impl_t *osi;
rw_enter(&dp->dp_config_rwlock, RW_READER);
+ err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi);
+ if (err)
+ goto out;
+ dp->dp_meta_objset = &osi->os;
+
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
&dp->dp_root_dir_obj);
- ASSERT3U(err, ==, 0);
+ if (err)
+ goto out;
+
+ err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir);
+ if (err)
+ goto out;
- dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
- NULL, dp);
- dp->dp_mos_dir = dsl_pool_open_mos_dir(dp);
+ err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir);
+ if (err)
+ goto out;
+
+out:
rw_exit(&dp->dp_config_rwlock);
+ if (err)
+ dsl_pool_close(dp);
+ else
+ *dpp = dp;
- return (dp);
+ return (err);
}
void
dsl_pool_close(dsl_pool_t *dp)
{
/* drop our reference from dsl_pool_open() */
- dsl_dir_close(dp->dp_mos_dir, dp);
- dsl_dir_close(dp->dp_root_dir, dp);
+ if (dp->dp_mos_dir)
+ dsl_dir_close(dp->dp_mos_dir, dp);
+ if (dp->dp_root_dir)
+ dsl_dir_close(dp->dp_root_dir, dp);
/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
- dmu_objset_evict(NULL, dp->dp_meta_objset->os);
+ if (dp->dp_meta_objset)
+ dmu_objset_evict(NULL, dp->dp_meta_objset->os);
txg_list_destroy(&dp->dp_dirty_datasets);
txg_list_destroy(&dp->dp_dirty_dirs);
@@ -132,14 +150,12 @@ dsl_pool_create(spa_t *spa, uint64_t txg)
/* create and open the root dir */
dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx);
- dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
- NULL, dp);
+ VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
+ NULL, dp, &dp->dp_root_dir));
/* create and open the meta-objset dir */
- err = dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME,
- tx);
+ VERIFY(0 == dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx));
-	ASSERT3U(err, ==, 0);
- dp->dp_mos_dir = dsl_pool_open_mos_dir(dp);
+ VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir));
dmu_tx_commit(tx);
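
With this change dsl_pool_open() funnels every failure through a single out: label, and dsl_pool_close() checks each member for NULL so one teardown routine serves both complete and partially constructed pools. A sketch of the idiom, assuming invented structure members:

    #include <errno.h>
    #include <stdlib.h>

    struct pool { void *a; void *b; };

    static void
    pool_close(struct pool *p)
    {
            /* Tolerate partial construction: release only what exists. */
            if (p->b != NULL)
                    free(p->b);
            if (p->a != NULL)
                    free(p->a);
            free(p);
    }

    static int
    pool_open(struct pool **pp)
    {
            struct pool *p;
            int err = 0;

            if ((p = calloc(1, sizeof (*p))) == NULL)
                    return (ENOMEM);
            if ((p->a = malloc(32)) == NULL) {
                    err = ENOMEM;
                    goto out;
            }
            if ((p->b = malloc(32)) == NULL) {
                    err = ENOMEM;
                    goto out;
            }
    out:
            if (err)
                    pool_close(p);  /* one teardown path for every failure */
            else
                    *pp = p;
            return (err);
    }
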
diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c
index 3feb93e468..fc33b1c591 100644
--- a/usr/src/uts/common/fs/zfs/dsl_prop.c
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -75,7 +74,10 @@ dsl_prop_get_impl(dsl_pool_t *dp, uint64_t ddobj, const char *propname,
ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock));
while (ddobj != 0) {
- dsl_dir_t *dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG);
+ dsl_dir_t *dd;
+ err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
+ if (err)
+ break;
err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj,
propname, intsz, numint, buf);
if (err != ENOENT) {
@@ -136,7 +138,8 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
cbr->cbr_func(cbr->cbr_arg, value);
- (void) dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, cbr);
+ VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object,
+ NULL, cbr, &dd));
rw_exit(&dd->dd_pool->dp_config_rwlock);
/* Leave dataset open until this callback is unregistered */
return (0);
@@ -164,9 +167,9 @@ dsl_prop_get(const char *ddname, const char *propname,
const char *tail;
int err;
- dd = dsl_dir_open(ddname, FTAG, &tail);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(ddname, FTAG, &dd, &tail);
+ if (err)
+ return (err);
if (tail && tail[0] != '@') {
dsl_dir_close(dd, FTAG);
return (ENOENT);
@@ -258,7 +261,9 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
int err;
ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock));
- dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG);
+ err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd);
+ if (err)
+ return;
if (!first) {
/*
@@ -353,15 +358,15 @@ dsl_prop_set(const char *ddname, const char *propname,
int err;
struct prop_set_arg psa;
- dd = dsl_dir_open(ddname, FTAG, NULL);
- if (dd == NULL)
- return (ENOENT);
+ err = dsl_dir_open(ddname, FTAG, &dd, NULL);
+ if (err)
+ return (err);
psa.name = propname;
psa.intsz = intsz;
psa.numints = numints;
psa.buf = buf;
- err = dsl_dir_sync_task(dd, dsl_prop_set_sync, &psa, 0);
+ err = dsl_dir_sync_task(dd, dsl_prop_set_sync, &psa, 1<<20);
dsl_dir_close(dd, FTAG);
@@ -457,10 +462,12 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp)
if (dd->dd_phys->dd_parent_obj == 0)
parent = NULL;
else
- parent = dsl_dir_open_obj(dp,
- dd->dd_phys->dd_parent_obj, NULL, FTAG);
+ err = dsl_dir_open_obj(dp,
+ dd->dd_phys->dd_parent_obj, NULL, FTAG, &parent);
if (dd != ds->ds_dir)
dsl_dir_close(dd, FTAG);
+ if (err)
+ break;
dd = parent;
}
rw_exit(&dp->dp_config_rwlock);
diff --git a/usr/src/uts/common/fs/zfs/fletcher.c b/usr/src/uts/common/fs/zfs/fletcher.c
index 03186d1387..edda3c9a9d 100644
--- a/usr/src/uts/common/fs/zfs/fletcher.c
+++ b/usr/src/uts/common/fs/zfs/fletcher.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -98,3 +97,49 @@ fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
ZIO_SET_CHECKSUM(zcp, a, b, c, d);
}
+
+void
+fletcher_4_incremental_native(const void *buf, uint64_t size,
+ zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = zcp->zc_word[0];
+ b = zcp->zc_word[1];
+ c = zcp->zc_word[2];
+ d = zcp->zc_word[3];
+
+ for (; ip < ipend; ip++) {
+ a += ip[0];
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
+
+void
+fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
+ zio_cksum_t *zcp)
+{
+ const uint32_t *ip = buf;
+ const uint32_t *ipend = ip + (size / sizeof (uint32_t));
+ uint64_t a, b, c, d;
+
+ a = zcp->zc_word[0];
+ b = zcp->zc_word[1];
+ c = zcp->zc_word[2];
+ d = zcp->zc_word[3];
+
+ for (; ip < ipend; ip++) {
+ a += BSWAP_32(ip[0]);
+ b += a;
+ c += b;
+ d += c;
+ }
+
+ ZIO_SET_CHECKSUM(zcp, a, b, c, d);
+}
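
The new incremental variants differ from the one-shot routines only in seeding the four running sums from *zcp instead of zero, so a stream fed through in 4-byte-aligned chunks produces the same checksum as a single pass over the whole buffer; this is what lets a backup stream be checksummed as it is produced (6341639). A self-contained userland check of that property, with zio_cksum_t re-declared locally for illustration:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    typedef struct zio_cksum { uint64_t zc_word[4]; } zio_cksum_t;

    static void
    fletcher_4_incremental(const void *buf, uint64_t size, zio_cksum_t *zcp)
    {
            const uint32_t *ip = buf;
            const uint32_t *ipend = ip + (size / sizeof (uint32_t));
            uint64_t a = zcp->zc_word[0], b = zcp->zc_word[1];
            uint64_t c = zcp->zc_word[2], d = zcp->zc_word[3];

            for (; ip < ipend; ip++) {
                    a += ip[0];
                    b += a;
                    c += b;
                    d += c;
            }
            zcp->zc_word[0] = a; zcp->zc_word[1] = b;
            zcp->zc_word[2] = c; zcp->zc_word[3] = d;
    }

    int
    main(void)
    {
            uint32_t data[1024];
            zio_cksum_t whole = { { 0 } }, chunked = { { 0 } };
            int i;

            for (i = 0; i < 1024; i++)
                    data[i] = (uint32_t)i * 2654435761u;

            /* One pass over the whole 4K buffer... */
            fletcher_4_incremental(data, sizeof (data), &whole);

            /* ...equals four passes over 1K chunks with carried state. */
            for (i = 0; i < 4; i++)
                    fletcher_4_incremental((const uint8_t *)data + i * 1024,
                        1024, &chunked);

            assert(memcmp(&whole, &chunked, sizeof (whole)) == 0);
            return (0);
    }
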
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index 9d682e4990..d31e6edda3 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -379,11 +378,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg)
os, tx);
}
- db = dmu_bonus_hold(os, smo->smo_object);
+ VERIFY(0 == dmu_bonus_hold(os, smo->smo_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
ASSERT3U(db->db_size, ==, sizeof (*smo));
bcopy(smo, db->db_data, db->db_size);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
}
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 9b9bcab217..02be864b36 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -33,6 +32,7 @@
*/
#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
@@ -62,6 +62,44 @@ static uint32_t spa_active_count;
* ==========================================================================
*/
+static int
+spa_error_entry_compare(const void *a, const void *b)
+{
+ spa_error_entry_t *sa = (spa_error_entry_t *)a;
+ spa_error_entry_t *sb = (spa_error_entry_t *)b;
+ int ret;
+
+ ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
+ sizeof (zbookmark_t));
+
+ if (ret < 0)
+ return (-1);
+ else if (ret > 0)
+ return (1);
+ else
+ return (0);
+}
+
+/*
+ * Utility function which retrieves copies of the current logs and
+ * re-initializes them in the process.
+ */
+void
+spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
+{
+ ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
+
+ bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
+ bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+}
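
spa_get_errlists() hands the caller the populated trees by structure copy and immediately creates fresh empty trees in their place, so new errors can be logged while the caller walks its snapshot. The same detach-and-reinitialize move, sketched in userland with a plain linked list standing in for the AVL trees:

    #include <pthread.h>
    #include <stddef.h>

    struct node { struct node *next; };
    struct errlist { pthread_mutex_t lock; struct node *head; };

    /* Detach the current contents; leave an empty list behind. */
    static struct node *
    errlist_drain(struct errlist *el)
    {
            struct node *snapshot;

            pthread_mutex_lock(&el->lock);
            snapshot = el->head;    /* caller now owns the old entries */
            el->head = NULL;        /* logging continues into a fresh list */
            pthread_mutex_unlock(&el->lock);

            return (snapshot);
    }
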
+
/*
* Activate an uninitialized pool.
*/
@@ -76,9 +114,6 @@ spa_activate(spa_t *spa)
spa->spa_normal_class = metaslab_class_create();
- spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry",
- 4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE);
-
for (t = 0; t < ZIO_TYPES; t++) {
spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue",
8, maxclsyspri, 50, INT_MAX,
@@ -95,6 +130,13 @@ spa_activate(spa_t *spa)
txg_list_create(&spa->spa_vdev_txg_list,
offsetof(struct vdev, vdev_txg_node));
+
+ avl_create(&spa->spa_errlist_scrub,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
+ avl_create(&spa->spa_errlist_last,
+ spa_error_entry_compare, sizeof (spa_error_entry_t),
+ offsetof(spa_error_entry_t, se_avl));
}
/*
@@ -124,12 +166,18 @@ spa_deactivate(spa_t *spa)
spa->spa_zio_intr_taskq[t] = NULL;
}
- taskq_destroy(spa->spa_vdev_retry_taskq);
- spa->spa_vdev_retry_taskq = NULL;
-
metaslab_class_destroy(spa->spa_normal_class);
spa->spa_normal_class = NULL;
+ /*
+ * If this was part of an import or the open otherwise failed, we may
+ * still have errors left in the queues. Empty them just in case.
+ */
+ spa_errlog_drain(spa);
+
+ avl_destroy(&spa->spa_errlist_scrub);
+ avl_destroy(&spa->spa_errlist_last);
+
spa->spa_state = POOL_STATE_UNINITIALIZED;
}
@@ -175,6 +223,11 @@ static void
spa_unload(spa_t *spa)
{
/*
+ * Stop async tasks.
+ */
+ spa_async_suspend(spa);
+
+ /*
* Stop syncing.
*/
if (spa->spa_sync_on) {
@@ -185,8 +238,8 @@ spa_unload(spa_t *spa)
/*
* Wait for any outstanding prefetch I/O to complete.
*/
- spa_config_enter(spa, RW_WRITER);
- spa_config_exit(spa);
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_config_exit(spa, FTAG);
/*
* Close the dsl pool.
@@ -203,16 +256,16 @@ spa_unload(spa_t *spa)
vdev_free(spa->spa_root_vdev);
spa->spa_root_vdev = NULL;
}
+
+ spa->spa_async_suspended = 0;
}
/*
* Load an existing storage pool, using the pool's builtin spa_config as a
- * source of configuration information. The 'readonly' flag will prevent us
- * from writing any updated state to disk, and can be use when testing a pool
- * for import.
+ * source of configuration information.
*/
static int
-spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
+spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
{
int error = 0;
nvlist_t *nvroot = NULL;
@@ -221,25 +274,34 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
uint64_t pool_guid;
zio_t *zio;
+ spa->spa_load_state = state;
if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) ||
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
- return (EINVAL);
+ nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+ error = EINVAL;
+ goto out;
+ }
(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
&spa->spa_config_txg);
- if (import && spa_guid_exists(pool_guid, 0))
- return (EEXIST);
+ if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
+ spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
+ spa_guid_exists(pool_guid, 0)) {
+ error = EEXIST;
+ goto out;
+ }
/*
* Parse the configuration into a vdev tree.
*/
- spa_config_enter(spa, RW_WRITER);
+ spa_config_enter(spa, RW_WRITER, FTAG);
rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
- if (rvd == NULL)
- return (EINVAL);
+ if (rvd == NULL) {
+ error = EINVAL;
+ goto out;
+ }
spa->spa_root_vdev = rvd;
ASSERT(spa_guid(spa) == pool_guid);
@@ -247,8 +309,10 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
/*
* Try to open all vdevs, loading each label in the process.
*/
- if (vdev_open(rvd) != 0)
- return (ENXIO);
+ if (vdev_open(rvd) != 0) {
+ error = ENXIO;
+ goto out;
+ }
/*
* Find the best uberblock.
@@ -264,8 +328,16 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
* If we weren't able to find a single valid uberblock, return failure.
*/
if (ub->ub_txg == 0) {
- dprintf("ub_txg is zero\n");
- return (ENXIO);
+ error = ENXIO;
+ goto out;
+ }
+
+ /*
+ * If the pool is newer than the code, we can't open it.
+ */
+ if (ub->ub_version > UBERBLOCK_VERSION) {
+ error = ENOTSUP;
+ goto out;
}
/*
@@ -273,11 +345,10 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
* incomplete configuration.
*/
if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) {
- rvd->vdev_state = VDEV_STATE_CANT_OPEN;
- rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM;
- dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n",
- rvd->vdev_guid_sum, ub->ub_guid_sum);
- return (ENXIO);
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_GUID_SUM);
+ error = ENXIO;
+ goto out;
}
/*
@@ -286,12 +357,22 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
spa->spa_state = POOL_STATE_ACTIVE;
spa->spa_ubsync = spa->spa_uberblock;
spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
- spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg);
+ error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
+ if (error) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ goto out;
+ }
spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
- VERIFY(zap_lookup(spa->spa_meta_objset,
+ if (zap_lookup(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
- sizeof (uint64_t), 1, &spa->spa_config_object) == 0);
+ sizeof (uint64_t), 1, &spa->spa_config_object) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
if (!mosconfig) {
dmu_buf_t *db;
@@ -299,21 +380,24 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
size_t nvsize = 0;
nvlist_t *newconfig = NULL;
- db = dmu_bonus_hold(spa->spa_meta_objset,
- spa->spa_config_object);
- dmu_buf_read(db);
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
+ spa->spa_config_object, FTAG, &db));
nvsize = *(uint64_t *)db->db_data;
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
packed = kmem_alloc(nvsize, KM_SLEEP);
- error = dmu_read_canfail(spa->spa_meta_objset,
+ error = dmu_read(spa->spa_meta_objset,
spa->spa_config_object, 0, nvsize, packed);
if (error == 0)
error = nvlist_unpack(packed, nvsize, &newconfig, 0);
kmem_free(packed, nvsize);
- if (error)
- return (ENXIO);
+ if (error) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
spa_config_set(spa, newconfig);
@@ -321,39 +405,76 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
spa_deactivate(spa);
spa_activate(spa);
- return (spa_load(spa, newconfig, readonly, import, B_TRUE));
+ return (spa_load(spa, newconfig, state, B_TRUE));
}
- VERIFY(zap_lookup(spa->spa_meta_objset,
+ if (zap_lookup(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0);
+ sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
/*
- * Load the vdev state for all top level vdevs.
+ * Load the persistent error log. If we have an older pool, this will
+ * not be present.
*/
- if ((error = vdev_load(rvd, import)) != 0)
- return (error);
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
+ sizeof (uint64_t), 1, &spa->spa_errlog_last);
+	if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ error = zap_lookup(spa->spa_meta_objset,
+ DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
+ sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ /*
+ * Load the vdev state for all top level vdevs. We need to grab the
+ * config lock because all label I/O is done with the
+ * ZIO_FLAG_CONFIG_HELD flag.
+ */
+ spa_config_enter(spa, RW_READER, FTAG);
+ if ((error = vdev_load(rvd)) != 0) {
+ spa_config_exit(spa, FTAG);
+ goto out;
+ }
+ spa_config_exit(spa, FTAG);
/*
* Propagate the leaf DTLs we just loaded all the way up the tree.
*/
- spa_config_enter(spa, RW_WRITER);
+ spa_config_enter(spa, RW_WRITER, FTAG);
vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
/*
* Check the state of the root vdev. If it can't be opened, it
* indicates one or more toplevel vdevs are faulted.
*/
- if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
- return (ENXIO);
+ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+ error = ENXIO;
+ goto out;
+ }
/*
* Claim log blocks that haven't been committed yet, and update all
* top-level vdevs to sync any config changes found in vdev_load().
* This must all happen in a single txg.
*/
- if ((spa_mode & FWRITE) && !readonly) {
+ if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
spa_first_txg(spa));
dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
@@ -369,7 +490,14 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
txg_wait_synced(spa->spa_dsl_pool, 0);
}
- return (0);
+ error = 0;
+out:
+ if (error)
+ zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
+ spa->spa_load_state = SPA_LOAD_NONE;
+ spa->spa_ena = 0;
+
+ return (error);
}
/*
@@ -415,7 +543,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
spa_activate(spa);
error = spa_load(spa, spa->spa_config,
- B_FALSE, B_FALSE, B_FALSE);
+ SPA_LOAD_OPEN, B_FALSE);
if (error == EBADF) {
/*
@@ -432,7 +560,9 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
if (locked)
mutex_exit(&spa_namespace_lock);
return (ENOENT);
- } if (error) {
+ }
+
+ if (error) {
/*
* We can't open the pool, but we still have useful
* information: the state of each vdev after the
@@ -443,10 +573,14 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
B_TRUE);
spa_unload(spa);
spa_deactivate(spa);
+ spa->spa_last_open_failed = B_TRUE;
if (locked)
mutex_exit(&spa_namespace_lock);
*spapp = NULL;
return (error);
+ } else {
+ zfs_post_ok(spa, NULL);
+ spa->spa_last_open_failed = B_FALSE;
}
loaded = B_TRUE;
@@ -459,9 +593,9 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
*spapp = spa;
if (config != NULL) {
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
}
/*
@@ -479,8 +613,36 @@ spa_open(const char *name, spa_t **spapp, void *tag)
return (spa_open_common(name, spapp, tag, NULL));
}
+/*
+ * Lookup the given spa_t, incrementing the inject count in the process,
+ * preventing it from being exported or destroyed.
+ */
+spa_t *
+spa_inject_addref(char *name)
+{
+ spa_t *spa;
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(name)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (NULL);
+ }
+ spa->spa_inject_ref++;
+ mutex_exit(&spa_namespace_lock);
+
+ return (spa);
+}
+
+void
+spa_inject_delref(spa_t *spa)
+{
+ mutex_enter(&spa_namespace_lock);
+ spa->spa_inject_ref--;
+ mutex_exit(&spa_namespace_lock);
+}
+
int
-spa_get_stats(const char *name, nvlist_t **config)
+spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
int error;
spa_t *spa;
@@ -488,6 +650,29 @@ spa_get_stats(const char *name, nvlist_t **config)
*config = NULL;
error = spa_open_common(name, &spa, FTAG, config);
+ if (spa && *config != NULL)
+ VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
+ spa_get_errlog_size(spa)) == 0);
+
+ /*
+ * We want to get the alternate root even for faulted pools, so we cheat
+ * and call spa_lookup() directly.
+ */
+ if (altroot) {
+ if (spa == NULL) {
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(name);
+ if (spa)
+ spa_altroot(spa, altroot, buflen);
+ else
+ altroot[0] = '\0';
+ spa = NULL;
+ mutex_exit(&spa_namespace_lock);
+ } else {
+ spa_altroot(spa, altroot, buflen);
+ }
+ }
+
if (spa != NULL)
spa_close(spa, FTAG);
@@ -551,9 +736,11 @@ spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
DMU_OT_PACKED_NVLIST, 1 << 14,
DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
- VERIFY(zap_add(spa->spa_meta_objset,
+ if (zap_add(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
- sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0);
+ sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add pool config");
+ }
/*
* Create the deferred-free bplist object. Turn off compression
@@ -565,9 +752,11 @@ spa_create(const char *pool, nvlist_t *nvroot, char *altroot)
dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj,
ZIO_COMPRESS_OFF, tx);
- VERIFY(zap_add(spa->spa_meta_objset,
+ if (zap_add(spa->spa_meta_objset,
DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST,
- sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0);
+ sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) {
+ cmn_err(CE_PANIC, "failed to add bplist");
+ }
dmu_tx_commit(tx);
@@ -619,7 +808,7 @@ spa_import(const char *pool, nvlist_t *config, char *altroot)
* Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig
* so that we don't try to open the pool if the config is damaged.
*/
- error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE);
+ error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE);
if (error) {
spa_unload(spa);
@@ -694,7 +883,7 @@ spa_tryimport(nvlist_t *tryconfig)
* Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig
* so we don't try to open the pool if the config is damaged.
*/
- (void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE);
+ (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE);
/*
* If 'tryconfig' was at least parsable, return the current config.
@@ -738,6 +927,16 @@ spa_export_common(char *pool, int new_state)
}
/*
+ * Put a hold on the pool, drop the namespace lock, stop async tasks,
+ * reacquire the namespace lock, and see if we can export.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ /*
* The pool will be in core if it's openable,
* in which case we can modify its state.
*/
@@ -749,17 +948,20 @@ spa_export_common(char *pool, int new_state)
spa_scrub_suspend(spa);
txg_wait_synced(spa->spa_dsl_pool, 0);
- if (!spa_refcount_zero(spa)) {
+ /*
+ * A pool cannot be exported or destroyed if there are active
+ * references. If we are resetting a pool, allow references by
+ * fault injection handlers.
+ */
+ if (!spa_refcount_zero(spa) ||
+ (spa->spa_inject_ref != 0 &&
+ new_state != POOL_STATE_UNINITIALIZED)) {
spa_scrub_resume(spa);
+ spa_async_resume(spa);
mutex_exit(&spa_namespace_lock);
return (EBUSY);
}
- /*
- * Update the pool state.
- */
- spa->spa_state = new_state;
-
spa_scrub_resume(spa);
VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
@@ -771,7 +973,10 @@ spa_export_common(char *pool, int new_state)
* so mark them all dirty. spa_unload() will do the
* final sync that pushes these changes out.
*/
- vdev_config_dirty(spa->spa_root_vdev);
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ spa->spa_state = new_state;
+ vdev_config_dirty(spa->spa_root_vdev);
+ }
}
if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
@@ -779,8 +984,10 @@ spa_export_common(char *pool, int new_state)
spa_deactivate(spa);
}
- spa_remove(spa);
- spa_config_sync();
+ if (new_state != POOL_STATE_UNINITIALIZED) {
+ spa_remove(spa);
+ spa_config_sync();
+ }
mutex_exit(&spa_namespace_lock);
return (0);
@@ -805,6 +1012,17 @@ spa_export(char *pool)
}
/*
+ * Similar to spa_export(), this unloads the spa_t without actually removing it
+ * from the namespace in any way.
+ */
+int
+spa_reset(char *pool)
+{
+ return (spa_export_common(pool, POOL_STATE_UNINITIALIZED));
+}
+
+
+/*
* ==========================================================================
* Device manipulation
* ==========================================================================
@@ -845,7 +1063,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
tvd->vdev_id = rvd->vdev_children;
vdev_add_child(rvd, tvd);
}
- vdev_init(tvd, txg);
+ if ((error = vdev_init(tvd, txg)) != 0)
+ return (spa_vdev_exit(spa, vd, txg, error));
vdev_config_dirty(tvd);
}
@@ -871,7 +1090,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
* is automatically detached.
*/
int
-spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
+spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
{
uint64_t txg, open_txg;
int error;
@@ -881,7 +1100,7 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
txg = spa_vdev_enter(spa);
- oldvd = vdev_lookup_by_path(rvd, path);
+ oldvd = vdev_lookup_by_guid(rvd, guid);
if (oldvd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
@@ -954,6 +1173,12 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
newvd->vdev_id = pvd->vdev_children;
vdev_add_child(pvd, newvd);
+ /*
+ * If newvd is smaller than oldvd, but larger than its rsize,
+ * the addition of newvd may have decreased our parent's asize.
+ */
+ pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize);
+
tvd = newvd->vdev_top;
ASSERT(pvd->vdev_top == tvd);
ASSERT(tvd->vdev_parent == rvd);
@@ -962,7 +1187,6 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
* Update the config based on the new in-core state.
*/
spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
-
vdev_config_dirty(tvd);
/*
@@ -976,14 +1200,14 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
open_txg - TXG_INITIAL + 1);
mutex_exit(&newvd->vdev_dtl_lock);
+ dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg);
+
/*
* Mark newvd's DTL dirty in this txg.
*/
vdev_dirty(tvd, VDD_DTL, txg);
(void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg);
- dprintf("attached %s, replacing=%d\n", path, replacing);
-
(void) spa_vdev_exit(spa, newrootvd, open_txg, 0);
/*
@@ -1000,7 +1224,7 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing)
* is a replacing vdev.
*/
int
-spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
+spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
{
uint64_t txg;
int c, t, error;
@@ -1009,14 +1233,11 @@ spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
txg = spa_vdev_enter(spa);
- vd = vdev_lookup_by_path(rvd, path);
+ vd = vdev_lookup_by_guid(rvd, guid);
if (vd == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
- if (guid != 0 && vd->vdev_guid != guid)
- return (spa_vdev_exit(spa, NULL, txg, ENODEV));
-
pvd = vd->vdev_parent;
/*
@@ -1105,13 +1326,16 @@ spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
/*
* Reopen this top-level vdev to reassess health after detach.
*/
- vdev_reopen(tvd, NULL);
+ vdev_reopen(tvd);
/*
* If the device we just detached was smaller than the others,
- * it may be possible to add metaslabs (i.e. grow the pool).
+ * it may be possible to add metaslabs (i.e. grow the pool). We ignore
+ * the error here because the detach still succeeded - we just weren't
+ * able to reinitialize the metaslabs. This pool is in for a world of
+ * hurt, in any case.
*/
- vdev_metaslab_init(tvd, txg);
+ (void) vdev_metaslab_init(tvd, txg);
/*
* Update the config based on the new in-core state.
@@ -1133,72 +1357,59 @@ spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done)
(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
(void) txg_list_add(&tvd->vdev_dtl_list, vd, txg);
- dprintf("detached %s\n", path);
+ dprintf("detached %s in txg %llu\n", vd->vdev_path, txg);
return (spa_vdev_exit(spa, vd, txg, 0));
}
/*
- * If there are any replacing vdevs that have finished replacing, detach them.
- * We can't hold the config lock across detaches, so we lock the config,
- * build a list of candidates, unlock the config, and try each candidate.
+ * Find any device that's done replacing, so we can detach it.
*/
-typedef struct vdev_detach_link {
- char *vdl_path;
- uint64_t vdl_guid;
- list_node_t vdl_node;
-} vdev_detach_link_t;
-
-static void
-spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd)
+static vdev_t *
+spa_vdev_replace_done_hunt(vdev_t *vd)
{
+ vdev_t *newvd, *oldvd;
int c;
- for (c = 0; c < vd->vdev_children; c++)
- spa_vdev_replace_done_make_list(l, vd->vdev_child[c]);
+ for (c = 0; c < vd->vdev_children; c++) {
+ oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]);
+ if (oldvd != NULL)
+ return (oldvd);
+ }
if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) {
- vdev_t *cvd0 = vd->vdev_child[0];
- vdev_t *cvd1 = vd->vdev_child[1];
- vdev_detach_link_t *vdl;
- int dirty1;
-
- mutex_enter(&cvd1->vdev_dtl_lock);
- dirty1 = cvd1->vdev_dtl_map.sm_space |
- cvd1->vdev_dtl_scrub.sm_space;
- mutex_exit(&cvd1->vdev_dtl_lock);
-
- if (!dirty1) {
- vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP);
- vdl->vdl_path = spa_strdup(cvd0->vdev_path);
- vdl->vdl_guid = cvd0->vdev_guid;
- list_insert_tail(l, vdl);
+ oldvd = vd->vdev_child[0];
+ newvd = vd->vdev_child[1];
+
+ mutex_enter(&newvd->vdev_dtl_lock);
+ if (newvd->vdev_dtl_map.sm_space == 0 &&
+ newvd->vdev_dtl_scrub.sm_space == 0) {
+ mutex_exit(&newvd->vdev_dtl_lock);
+ return (oldvd);
}
+ mutex_exit(&newvd->vdev_dtl_lock);
}
+
+ return (NULL);
}
-void
+static void
spa_vdev_replace_done(spa_t *spa)
{
- vdev_detach_link_t *vdl;
- list_t vdlist;
-
- list_create(&vdlist, sizeof (vdev_detach_link_t),
- offsetof(vdev_detach_link_t, vdl_node));
-
- spa_config_enter(spa, RW_READER);
- spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev);
- spa_config_exit(spa);
-
- while ((vdl = list_head(&vdlist)) != NULL) {
- list_remove(&vdlist, vdl);
- (void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid,
- B_TRUE);
- spa_strfree(vdl->vdl_path);
- kmem_free(vdl, sizeof (*vdl));
+ vdev_t *vd;
+ uint64_t guid;
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) {
+ guid = vd->vdev_guid;
+ spa_config_exit(spa, FTAG);
+ if (spa_vdev_detach(spa, guid, B_TRUE) != 0)
+ return;
+ spa_config_enter(spa, RW_READER, FTAG);
}
- list_destroy(&vdlist);
+ spa_config_exit(spa, FTAG);
}
/*
@@ -1234,7 +1445,16 @@ spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
* ==========================================================================
*/
-static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t);
+void
+spa_scrub_throttle(spa_t *spa, int direction)
+{
+ mutex_enter(&spa->spa_scrub_lock);
+ spa->spa_scrub_throttled += direction;
+ ASSERT(spa->spa_scrub_throttled >= 0);
+ if (spa->spa_scrub_throttled == 0)
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ mutex_exit(&spa->spa_scrub_lock);
+}
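
spa_scrub_throttle() is a counting gate: callers pass +1 to ask the scrub to back off and -1 to release it, and the condition variable is broadcast only when the count returns to zero; the scrub thread (see the cv_wait loop added below) parks until then. A sketch of the same pattern in portable pthreads:

    #include <assert.h>
    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
    static int throttled;

    /* direction is +1 to throttle, -1 to release, as in the kernel code. */
    void
    throttle(int direction)
    {
            pthread_mutex_lock(&lock);
            throttled += direction;
            assert(throttled >= 0);
            if (throttled == 0)
                    pthread_cond_broadcast(&cv);    /* wake the scrubber */
            pthread_mutex_unlock(&lock);
    }

    /* The scrub loop parks here until every throttler has released. */
    void
    throttle_wait(void)
    {
            pthread_mutex_lock(&lock);
            while (throttled > 0)
                    pthread_cond_wait(&cv, &lock);
            pthread_mutex_unlock(&lock);
    }
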
static void
spa_scrub_io_done(zio_t *zio)
@@ -1244,22 +1464,23 @@ spa_scrub_io_done(zio_t *zio)
zio_buf_free(zio->io_data, zio->io_size);
mutex_enter(&spa->spa_scrub_lock);
- if (zio->io_error)
- spa->spa_scrub_errors++;
- if (--spa->spa_scrub_inflight == 0)
- cv_broadcast(&spa->spa_scrub_io_cv);
- mutex_exit(&spa->spa_scrub_lock);
-
- if (zio->io_error) {
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
vdev_t *vd = zio->io_vd;
+ spa->spa_scrub_errors++;
mutex_enter(&vd->vdev_stat_lock);
vd->vdev_stat.vs_scrub_errors++;
mutex_exit(&vd->vdev_stat_lock);
}
+ if (--spa->spa_scrub_inflight == 0) {
+ cv_broadcast(&spa->spa_scrub_io_cv);
+ ASSERT(spa->spa_scrub_throttled == 0);
+ }
+ mutex_exit(&spa->spa_scrub_lock);
}
static void
-spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags)
+spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags,
+ zbookmark_t *zb)
{
size_t size = BP_GET_LSIZE(bp);
void *data = zio_buf_alloc(size);
@@ -1268,8 +1489,13 @@ spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags)
spa->spa_scrub_inflight++;
mutex_exit(&spa->spa_scrub_lock);
+ if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+ flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */
+
+ flags |= ZIO_FLAG_CANFAIL;
+
zio_nowait(zio_read(NULL, spa, bp, data, size,
- spa_scrub_io_done, NULL, priority, flags));
+ spa_scrub_io_done, NULL, priority, flags, zb));
}
/* ARGSUSED */
@@ -1319,12 +1545,11 @@ spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a)
}
if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) {
spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY |
- ZIO_FLAG_RESILVER);
+ ZIO_FLAG_RESILVER, &bc->bc_bookmark);
}
} else {
spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB);
+ ZIO_FLAG_SCRUB, &bc->bc_bookmark);
}
return (0);
@@ -1348,19 +1573,25 @@ spa_scrub_thread(spa_t *spa)
*/
txg_wait_synced(spa_get_dsl(spa), 0);
- spa_config_enter(spa, RW_WRITER);
- vdev_reopen(rvd, NULL); /* purge all vdev caches */
+ dprintf("start %s mintxg=%llu maxtxg=%llu\n",
+ scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
+ spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ vdev_reopen(rvd); /* purge all vdev caches */
vdev_config_dirty(rvd); /* rewrite all disk labels */
vdev_scrub_stat_update(rvd, scrub_type, B_FALSE);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
mutex_enter(&spa->spa_scrub_lock);
spa->spa_scrub_errors = 0;
spa->spa_scrub_active = 1;
+ ASSERT(spa->spa_scrub_inflight == 0);
+ ASSERT(spa->spa_scrub_throttled == 0);
while (!spa->spa_scrub_stop) {
CALLB_CPR_SAFE_BEGIN(&cprinfo);
- while (spa->spa_scrub_suspend) {
+ while (spa->spa_scrub_suspended) {
spa->spa_scrub_active = 0;
cv_broadcast(&spa->spa_scrub_cv);
cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
@@ -1376,6 +1607,9 @@ spa_scrub_thread(spa_t *spa)
mutex_enter(&spa->spa_scrub_lock);
if (error != EAGAIN)
break;
+
+ while (spa->spa_scrub_throttled > 0)
+ cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
}
while (spa->spa_scrub_inflight)
@@ -1384,16 +1618,25 @@ spa_scrub_thread(spa_t *spa)
if (spa->spa_scrub_restart_txg != 0)
error = ERESTART;
+ if (spa->spa_scrub_stop)
+ error = EINTR;
+
spa->spa_scrub_active = 0;
cv_broadcast(&spa->spa_scrub_cv);
/*
- * If the traverse completed, and there were no errors,
- * then the scrub was completely successful.
+ * Even if there were uncorrectable errors, we consider the scrub
+ * completed. The downside is that if there is a transient error during
+ * a resilver, we won't resilver the data properly to the target. But
+ * if the damage is permanent (more likely) we will resilver forever,
+ * which isn't really acceptable. Since there is enough information for
+ * the user to know what has failed and why, this seems like a more
+ * tractable approach.
*/
- complete = (error == 0 && spa->spa_scrub_errors == 0);
+ complete = (error == 0);
- dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
+ dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n",
+ scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub",
spa->spa_scrub_maxtxg, complete ? "done" : "FAILED",
error, spa->spa_scrub_errors, spa->spa_scrub_stop);
@@ -1403,31 +1646,32 @@ spa_scrub_thread(spa_t *spa)
* If the scrub/resilver completed, update all DTLs to reflect this.
* Whether it succeeded or not, vacate all temporary scrub DTLs.
*/
- spa_config_enter(spa, RW_WRITER);
+ spa_config_enter(spa, RW_WRITER, FTAG);
vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1,
complete ? spa->spa_scrub_maxtxg : 0, B_TRUE);
- spa_config_exit(spa);
-
- spa_vdev_replace_done(spa);
-
- spa_config_enter(spa, RW_READER);
vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete);
- spa_config_exit(spa);
+ spa_errlog_rotate(spa);
+ spa_config_exit(spa, FTAG);
mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_type = POOL_SCRUB_NONE;
- spa->spa_scrub_active = 0;
- spa->spa_scrub_thread = NULL;
-
- cv_broadcast(&spa->spa_scrub_cv);
+ /*
+ * We may have finished replacing a device.
+ * Let the async thread assess this and handle the detach.
+ */
+ spa_async_request(spa, SPA_ASYNC_REPLACE_DONE);
/*
* If we were told to restart, our final act is to start a new scrub.
*/
if (error == ERESTART)
- VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0);
+ spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ?
+ SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB);
+ spa->spa_scrub_type = POOL_SCRUB_NONE;
+ spa->spa_scrub_active = 0;
+ spa->spa_scrub_thread = NULL;
+ cv_broadcast(&spa->spa_scrub_cv);
CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */
thread_exit();
}
@@ -1436,7 +1680,7 @@ void
spa_scrub_suspend(spa_t *spa)
{
mutex_enter(&spa->spa_scrub_lock);
- spa->spa_scrub_suspend++;
+ spa->spa_scrub_suspended++;
while (spa->spa_scrub_active) {
cv_broadcast(&spa->spa_scrub_cv);
cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
@@ -1450,8 +1694,8 @@ void
spa_scrub_resume(spa_t *spa)
{
mutex_enter(&spa->spa_scrub_lock);
- ASSERT(spa->spa_scrub_suspend != 0);
- if (--spa->spa_scrub_suspend == 0)
+ ASSERT(spa->spa_scrub_suspended != 0);
+ if (--spa->spa_scrub_suspended == 0)
cv_broadcast(&spa->spa_scrub_cv);
mutex_exit(&spa->spa_scrub_lock);
}
@@ -1469,17 +1713,19 @@ spa_scrub_restart(spa_t *spa, uint64_t txg)
mutex_exit(&spa->spa_scrub_lock);
}
-static int
-spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+int
+spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
{
space_seg_t *ss;
uint64_t mintxg, maxtxg;
vdev_t *rvd = spa->spa_root_vdev;
- int advance = 0;
+ int advance = ADVANCE_PRE | ADVANCE_ZIL;
if ((uint_t)type >= POOL_SCRUB_TYPES)
return (ENOTSUP);
+ mutex_enter(&spa->spa_scrub_lock);
+
/*
* If there's a scrub or resilver already in progress, stop it.
*/
@@ -1487,9 +1733,10 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
/*
* Don't stop a resilver unless forced.
*/
- if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force)
+ if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) {
+ mutex_exit(&spa->spa_scrub_lock);
return (EBUSY);
-
+ }
spa->spa_scrub_stop = 1;
cv_broadcast(&spa->spa_scrub_cv);
cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock);
@@ -1503,19 +1750,36 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
spa->spa_scrub_th = NULL;
}
- spa->spa_scrub_stop = 0;
- spa->spa_scrub_type = type;
- spa->spa_scrub_restart_txg = 0;
+ if (rvd == NULL) {
+ ASSERT(spa->spa_scrub_stop == 0);
+ ASSERT(spa->spa_scrub_type == type);
+ ASSERT(spa->spa_scrub_restart_txg == 0);
+ mutex_exit(&spa->spa_scrub_lock);
+ return (0);
+ }
mintxg = TXG_INITIAL - 1;
maxtxg = spa_last_synced_txg(spa) + 1;
- switch (type) {
+ mutex_enter(&rvd->vdev_dtl_lock);
- case POOL_SCRUB_NONE:
- break;
+ if (rvd->vdev_dtl_map.sm_space == 0) {
+ /*
+ * The pool-wide DTL is empty.
+ * If this is a resilver, there's nothing to do.
+ */
+ if (type == POOL_SCRUB_RESILVER)
+ type = POOL_SCRUB_NONE;
+ } else {
+ /*
+ * The pool-wide DTL is non-empty.
+ * If this is a normal scrub, upgrade to a resilver instead.
+ */
+ if (type == POOL_SCRUB_EVERYTHING)
+ type = POOL_SCRUB_RESILVER;
+ }
- case POOL_SCRUB_RESILVER:
+ if (type == POOL_SCRUB_RESILVER) {
/*
* Determine the resilvering boundaries.
*
@@ -1525,26 +1789,22 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
* Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1
* so we don't claim to resilver a txg that's still changing.
*/
- mutex_enter(&rvd->vdev_dtl_lock);
ss = avl_first(&rvd->vdev_dtl_map.sm_root);
- mintxg = ss ? ss->ss_start - 1 : 0;
+ mintxg = ss->ss_start - 1;
ss = avl_last(&rvd->vdev_dtl_map.sm_root);
- maxtxg = ss ? ss->ss_end : 0;
- maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1);
- mutex_exit(&rvd->vdev_dtl_lock);
+ maxtxg = MIN(ss->ss_end, maxtxg);
- advance = ADVANCE_PRE | ADVANCE_PRUNE;
- break;
-
- case POOL_SCRUB_EVERYTHING:
- /*
- * A scrub is like a resilver, but not pruned by DTL.
- */
- advance = ADVANCE_PRE;
- break;
+ advance |= ADVANCE_PRUNE;
}
- if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) {
+ mutex_exit(&rvd->vdev_dtl_lock);
+
+ spa->spa_scrub_stop = 0;
+ spa->spa_scrub_type = type;
+ spa->spa_scrub_restart_txg = 0;
+
+ if (type != POOL_SCRUB_NONE) {
+ spa->spa_scrub_mintxg = mintxg;
spa->spa_scrub_maxtxg = maxtxg;
spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL,
advance, ZIO_FLAG_CANFAIL);
@@ -1553,24 +1813,119 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force)
spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri);
}
+ mutex_exit(&spa->spa_scrub_lock);
+
return (0);
}
-int
-spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force)
+/*
+ * ==========================================================================
+ * SPA async task processing
+ * ==========================================================================
+ */
+
+static void
+spa_async_reopen(spa_t *spa)
{
- int error;
- traverse_handle_t *th;
+ vdev_t *rvd = spa->spa_root_vdev;
+ vdev_t *tvd;
+ int c;
- mutex_enter(&spa->spa_scrub_lock);
- error = spa_scrub_locked(spa, type, force);
- th = spa->spa_scrub_th;
- mutex_exit(&spa->spa_scrub_lock);
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ for (c = 0; c < rvd->vdev_children; c++) {
+ tvd = rvd->vdev_child[c];
+ if (tvd->vdev_reopen_wanted) {
+ tvd->vdev_reopen_wanted = 0;
+ vdev_reopen(tvd);
+ }
+ }
+
+ spa_config_exit(spa, FTAG);
+}
- if (th == NULL && type != POOL_SCRUB_NONE)
+static void
+spa_async_thread(spa_t *spa)
+{
+ int tasks;
+
+ ASSERT(spa->spa_sync_on);
+
+ mutex_enter(&spa->spa_async_lock);
+ tasks = spa->spa_async_tasks;
+ spa->spa_async_tasks = 0;
+ mutex_exit(&spa->spa_async_lock);
+
+ /*
+ * See if any devices need to be reopened.
+ */
+ if (tasks & SPA_ASYNC_REOPEN)
+ spa_async_reopen(spa);
+
+ /*
+ * If any devices are done replacing, detach them.
+ */
+ if (tasks & SPA_ASYNC_REPLACE_DONE)
spa_vdev_replace_done(spa);
- return (error);
+ /*
+ * Kick off a scrub.
+ */
+ if (tasks & SPA_ASYNC_SCRUB)
+ VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0);
+
+ /*
+ * Kick off a resilver.
+ */
+ if (tasks & SPA_ASYNC_RESILVER)
+ VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0);
+
+ /*
+ * Let the world know that we're done.
+ */
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_thread = NULL;
+ cv_broadcast(&spa->spa_async_cv);
+ mutex_exit(&spa->spa_async_lock);
+ thread_exit();
+}
+
+void
+spa_async_suspend(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_suspended++;
+ while (spa->spa_async_thread != NULL)
+ cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+void
+spa_async_resume(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ ASSERT(spa->spa_async_suspended != 0);
+ spa->spa_async_suspended--;
+ mutex_exit(&spa->spa_async_lock);
+}
+
+static void
+spa_async_dispatch(spa_t *spa)
+{
+ mutex_enter(&spa->spa_async_lock);
+ if (spa->spa_async_tasks && !spa->spa_async_suspended &&
+ spa->spa_async_thread == NULL)
+ spa->spa_async_thread = thread_create(NULL, 0,
+ spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
+ mutex_exit(&spa->spa_async_lock);
+}
+
+void
+spa_async_request(spa_t *spa, int task)
+{
+ mutex_enter(&spa->spa_async_lock);
+ spa->spa_async_tasks |= task;
+ mutex_exit(&spa->spa_async_lock);
}
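
Taken together, the async routines implement a task bitmask with a lazily created worker: spa_async_request() just ORs in a bit, spa_async_dispatch() (called from the end of spa_sync(), below) creates the thread only when there is work, no suspension, and no worker already running, and spa_async_suspend() waits out any in-flight worker. A compact pthread sketch of the request/dispatch half, with invented task bits:

    #include <pthread.h>

    #define TASK_REOPEN     0x1
    #define TASK_SCRUB      0x2

    static pthread_mutex_t alock = PTHREAD_MUTEX_INITIALIZER;
    static int tasks;               /* pending work, as a bitmask */
    static int suspended;           /* nonzero blocks dispatch */
    static int worker_running;

    static void *
    worker(void *arg)
    {
            int t;

            pthread_mutex_lock(&alock);
            t = tasks;              /* claim everything queued so far */
            tasks = 0;
            pthread_mutex_unlock(&alock);

            /* ... act on the bits in t (reopen vdevs, kick a scrub) ... */
            (void) t;

            pthread_mutex_lock(&alock);
            worker_running = 0;     /* bits queued meanwhile wait for the */
            pthread_mutex_unlock(&alock);   /* next dispatch, as in spa_sync */
            return (arg);
    }

    void
    async_request(int task)
    {
            pthread_mutex_lock(&alock);
            tasks |= task;          /* cheap: just remember the work */
            pthread_mutex_unlock(&alock);
    }

    void
    async_dispatch(void)
    {
            pthread_t tid;

            pthread_mutex_lock(&alock);
            if (tasks != 0 && suspended == 0 && !worker_running) {
                    worker_running = 1;
                    (void) pthread_create(&tid, NULL, worker, NULL);
                    (void) pthread_detach(tid);
            }
            pthread_mutex_unlock(&alock);
    }
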
/*
@@ -1628,17 +1983,19 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
packed = kmem_alloc(nvsize, KM_SLEEP);
- VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0);
+ VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR,
+ KM_SLEEP) == 0);
dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize,
packed, tx);
kmem_free(packed, nvsize);
- db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object);
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset,
+ spa->spa_config_object, FTAG, &db));
dmu_buf_will_dirty(db, tx);
*(uint64_t *)db->db_data = nvsize;
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
}
/*
@@ -1651,7 +2008,6 @@ spa_sync(spa_t *spa, uint64_t txg)
dsl_pool_t *dp = spa->spa_dsl_pool;
objset_t *mos = spa->spa_meta_objset;
bplist_t *bpl = &spa->spa_sync_bplist;
- vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd;
dmu_tx_t *tx;
int dirty_vdevs;
@@ -1659,12 +2015,12 @@ spa_sync(spa_t *spa, uint64_t txg)
/*
* Lock out configuration changes.
*/
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
spa->spa_syncing_txg = txg;
spa->spa_sync_pass = 0;
- bplist_open(bpl, mos, spa->spa_sync_bplist_obj);
+ VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj));
/*
* If anything has changed in this txg, push the deferred frees
@@ -1685,6 +2041,8 @@ spa_sync(spa_t *spa, uint64_t txg)
spa_sync_config_object(spa, tx);
dmu_tx_commit(tx);
+ spa_errlog_sync(spa, txg);
+
dsl_pool_sync(dp, txg);
dirty_vdevs = 0;
@@ -1707,11 +2065,7 @@ spa_sync(spa_t *spa, uint64_t txg)
* Rewrite the vdev configuration (which includes the uberblock)
* to commit the transaction group.
*/
- while (spa_sync_labels(spa, txg)) {
- dprintf("waiting for devices to heal\n");
- delay(hz);
- vdev_reopen(rvd, NULL);
- }
+ VERIFY(0 == spa_sync_labels(spa, txg));
/*
* Make a stable copy of the fully synced uberblock.
@@ -1748,7 +2102,12 @@ spa_sync(spa_t *spa, uint64_t txg)
ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
ASSERT(bpl->bpl_queue == NULL);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
+
+ /*
+ * If any async tasks have been requested, kick them off.
+ */
+ spa_async_dispatch(spa);
}
/*
@@ -1800,13 +2159,13 @@ spa_evict_all(void)
mutex_enter(&spa_namespace_lock);
while ((spa = spa_next(NULL)) != NULL) {
/*
- * Stop all scrub and resilver activity. spa_scrub() needs to
- * wait for the scrub thread, which may do a detach and sync the
- * configs, which needs spa_namespace_lock. Drop the lock while
- * maintaining a hold on the spa_t.
+ * Stop async tasks. The async thread may need to detach
+ * a device that's been replaced, which requires grabbing
+ * spa_namespace_lock, so we must drop it here.
*/
spa_open_ref(spa, FTAG);
mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0);
mutex_enter(&spa_namespace_lock);
spa_close(spa, FTAG);
@@ -1819,3 +2178,9 @@ spa_evict_all(void)
}
mutex_exit(&spa_namespace_lock);
}
+
+vdev_t *
+spa_lookup_by_guid(spa_t *spa, uint64_t guid)
+{
+ return (vdev_lookup_by_guid(spa->spa_root_vdev, guid));
+}
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
index abcd67ddb9..addf3af885 100644
--- a/usr/src/uts/common/fs/zfs/spa_config.c
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -33,6 +32,11 @@
#include <sys/fs/zfs.h>
#include <sys/vdev_impl.h>
#include <sys/zfs_ioctl.h>
+#ifdef _KERNEL
+#include <sys/kobj.h>
+#endif
+
+extern int modrootloaded;
/*
* Pool configuration repository.
@@ -65,43 +69,39 @@ const char *spa_config_dir = ZPOOL_CACHE_DIR;
void
spa_config_load(void)
{
- vnode_t *vp;
void *buf = NULL;
- vattr_t vattr;
- ssize_t resid;
nvlist_t *nvlist, *child;
nvpair_t *nvpair;
spa_t *spa;
char pathname[128];
+ struct _buf *file;
+ struct bootstat bst;
/*
* Open the configuration file.
*/
- (void) snprintf(pathname, sizeof (pathname), "./%s/%s", spa_config_dir,
- ZPOOL_CACHE_FILE);
- if (vn_openat(pathname, UIO_SYSSPACE, FREAD | FOFFMAX, 0, &vp, 0, 0,
- rootdir) != 0)
+ (void) snprintf(pathname, sizeof (pathname), "%s%s/%s",
+ (modrootloaded) ? "./" : "", spa_config_dir, ZPOOL_CACHE_FILE);
+
+ file = kobj_open_file(pathname);
+ if (file == (struct _buf *)-1)
return;
- /*
- * Read the nvlist from the file.
- */
- if (VOP_GETATTR(vp, &vattr, 0, kcred) != 0)
+ if (kobj_fstat(file->_fd, &bst) != 0)
goto out;
- buf = kmem_alloc(vattr.va_size, KM_SLEEP);
+ buf = kmem_alloc(bst.st_size, KM_SLEEP);
- if (vn_rdwr(UIO_READ, vp, buf, vattr.va_size, 0, UIO_SYSSPACE,
- 0, RLIM64_INFINITY, kcred, &resid) != 0)
- goto out;
-
- if (resid != 0)
+ /*
+ * Read the nvlist from the file.
+ */
+ if (kobj_read_file(file, buf, bst.st_size, 0) < 0)
goto out;
/*
* Unpack the nvlist.
*/
- if (nvlist_unpack(buf, vattr.va_size, &nvlist, KM_SLEEP) != 0)
+ if (nvlist_unpack(buf, bst.st_size, &nvlist, KM_SLEEP) != 0)
goto out;
/*
@@ -133,10 +133,9 @@ spa_config_load(void)
out:
if (buf != NULL)
- kmem_free(buf, vattr.va_size);
+ kmem_free(buf, bst.st_size);
- (void) VOP_CLOSE(vp, FREAD | FOFFMAX, 1, 0, kcred);
- VN_RELE(vp);
+ kobj_close_file(file);
}
/*
@@ -157,7 +156,7 @@ spa_config_sync(void)
ASSERT(MUTEX_HELD(&spa_namespace_lock));
- VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
/*
* Add all known pools to the configuration list, ignoring those with
@@ -179,7 +178,8 @@ spa_config_sync(void)
buf = kmem_alloc(buflen, KM_SLEEP);
- VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR, 0) == 0);
+ VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR,
+ KM_SLEEP) == 0);
/*
* Write the configuration to disk. We need to do the traditional
@@ -226,7 +226,7 @@ spa_all_configs(uint64_t *generation)
if (*generation == spa_config_generation)
return (NULL);
- VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0);
spa = NULL;
mutex_enter(&spa_namespace_lock);
@@ -279,7 +279,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
else if (txg != 0 && vd == rvd)
spa->spa_config_txg = txg;
- VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
UBERBLOCK_VERSION) == 0);
diff --git a/usr/src/uts/common/fs/zfs/spa_errlog.c b/usr/src/uts/common/fs/zfs/spa_errlog.c
new file mode 100644
index 0000000000..b52c3236d2
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/spa_errlog.c
@@ -0,0 +1,436 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * Routines to manage the on-disk persistent error log.
+ *
+ * Each pool stores a log of all logical data errors seen during normal
+ * operation. This is actually the union of two distinct logs: the last log,
+ * and the current log. All errors seen are logged to the current log. When a
+ * scrub completes, the current log becomes the last log, the last log is thrown
+ * out, and the current log is reinitialized. This way, if an error is somehow
+ * corrected, a new scrub will show that it no longer exists, and will be
+ * deleted from the log when the scrub completes.
+ *
+ * The log is stored using a ZAP object whose key is a string form of the
+ * zbookmark tuple (objset, object, level, blkid), and whose contents is an
+ * optional 'objset:object' human-readable string describing the data. When an
+ * error is first logged, this string will be empty, indicating that no name is
+ * known. This prevents us from having to issue a potentially large amount of
+ * I/O to discover the object name during an error path. Instead, we do the
+ * calculation when the data is requested, storing the result so future queries
+ * will be faster.
+ *
+ * This log is then shipped into an nvlist where the key is the dataset name and
+ * the value is the object name. Userland is then responsible for uniquifying
+ * this list and displaying it to the user.
+ */
+
+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+
+/*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexadecimal numbers that don't overflow.
+ */
+static uint64_t
+strtonum(char *str, char **nptr)
+{
+ uint64_t val = 0;
+ char c;
+ int digit;
+
+ while ((c = *str) != '\0') {
+ if (c >= '0' && c <= '9')
+ digit = c - '0';
+ else if (c >= 'a' && c <= 'f')
+ digit = 10 + c - 'a';
+ else
+ break;
+
+ val *= 16;
+ val += digit;
+
+ str++;
+ }
+
+ *nptr = str;
+
+ return (val);
+}
+
+/*
+ * Convert a bookmark to a string.
+ */
+static void
+bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
+{
+ (void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+ (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
+ (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
+}
+
+/*
+ * Convert a string to a bookmark
+ */
+static void
+name_to_bookmark(char *buf, zbookmark_t *zb)
+{
+ zb->zb_objset = strtonum(buf, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_object = strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_level = (int)strtonum(buf + 1, &buf);
+ ASSERT(*buf == ':');
+ zb->zb_blkid = strtonum(buf + 1, &buf);
+ ASSERT(*buf == '\0');
+}
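+
+/*
+ * A round-trip sketch (hypothetical values): the bookmark
+ * { zb_objset = 0x15, zb_object = 0x1a4, zb_level = 0, zb_blkid = 0x3e }
+ * becomes the ZAP key "15:1a4:0:3e" via bookmark_to_name(), and
+ * name_to_bookmark() recovers the same tuple from that string.
+ */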
+
+/*
+ * Log an uncorrectable error to the persistent error log. We add it to the
+ * spa's list of pending errors. The changes are actually synced out to disk
+ * during spa_errlog_sync().
+ */
+void
+spa_log_error(spa_t *spa, zio_t *zio)
+{
+ zbookmark_t *zb = &zio->io_logical->io_bookmark;
+ spa_error_entry_t search;
+ spa_error_entry_t *new;
+ avl_tree_t *tree;
+ avl_index_t where;
+
+ /*
+ * If we are trying to import a pool, ignore any errors, as we won't be
+ * writing to the pool any time soon.
+ */
+ if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ return;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * If we have had a request to rotate the log, log it to the next list
+ * instead of the current one.
+ */
+ if (spa->spa_scrub_active || spa->spa_scrub_finished)
+ tree = &spa->spa_errlist_scrub;
+ else
+ tree = &spa->spa_errlist_last;
+
+ search.se_bookmark = *zb;
+ if (avl_find(tree, &search, &where) != NULL) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
+ new->se_bookmark = *zb;
+ avl_insert(tree, new, where);
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
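+
+/*
+ * Note that the avl_find() above coalesces duplicates: if the same block
+ * fails repeatedly before the next spa_errlog_sync(), only one
+ * spa_error_entry_t is queued for that bookmark, and the zap_update() used
+ * at sync time makes the on-disk insert idempotent as well.
+ */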
+
+/*
+ * Return the number of errors currently in the error log. This is actually the
+ * sum of both the last log and the current log, since we don't know the union
+ * of these logs until we reach userland.
+ */
+uint64_t
+spa_get_errlog_size(spa_t *spa)
+{
+ uint64_t total = 0, count;
+
+ mutex_enter(&spa->spa_errlog_lock);
+ if (spa->spa_errlog_scrub != 0 &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+ &count) == 0)
+ total += count;
+
+ if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+ zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+ &count) == 0)
+ total += count;
+ mutex_exit(&spa->spa_errlog_lock);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ total += avl_numnodes(&spa->spa_errlist_last);
+ total += avl_numnodes(&spa->spa_errlist_scrub);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ return (total);
+}
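+
+/*
+ * For example, a bookmark present in both the last log and the scrub log is
+ * counted twice here, so the result is an upper bound on the number of
+ * unique errors; userland performs the actual uniquification.
+ */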
+
+#ifdef _KERNEL
+static int
+process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+{
+ zap_cursor_t zc;
+ zap_attribute_t za;
+ zbookmark_t zb;
+
+ if (obj == 0)
+ return (0);
+
+ for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+
+ if (*count == 0) {
+ zap_cursor_fini(&zc);
+ return (ENOMEM);
+ }
+
+ name_to_bookmark(za.za_name, &zb);
+
+ if (copyout(&zb, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_t),
+ sizeof (zbookmark_t)) != 0) {
+ zap_cursor_fini(&zc);
+ return (EFAULT);
+ }
+
+ *count -= 1;
+ }
+
+ zap_cursor_fini(&zc);
+
+ return (0);
+}
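+
+/*
+ * This routine fills the user buffer from the end: with *count == 4 and
+ * three log entries, the bookmarks land at indices 3, 2 and 1, and *count
+ * returns as 1. spa_get_errlog() relies on this so that successive calls
+ * sharing one count never overwrite each other's entries.
+ */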
+
+static int
+process_error_list(avl_tree_t *list, void *addr, size_t *count)
+{
+ spa_error_entry_t *se;
+
+ for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+
+ if (*count == 0)
+ return (ENOMEM);
+
+ if (copyout(&se->se_bookmark, (char *)addr +
+ (*count - 1) * sizeof (zbookmark_t),
+ sizeof (zbookmark_t)) != 0)
+ return (EFAULT);
+
+ *count -= 1;
+ }
+
+ return (0);
+}
+#endif
+
+/*
+ * Copy all known errors to userland as an array of bookmarks. This is
+ * actually a union of the on-disk last log and current log, as well as any
+ * pending error requests.
+ *
+ * Because the act of reading the on-disk log could cause errors to be
+ * generated, we have two separate locks: one for the error log and one for the
+ * in-core error lists. We only need the error list lock to log an error, so
+ * we grab the error log lock while we read the on-disk logs, and only pick up
+ * the error list lock when we are finished.
+ */
+int
+spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+{
+ int ret = 0;
+
+#ifdef _KERNEL
+ mutex_enter(&spa->spa_errlog_lock);
+
+ ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
+
+ if (!ret && !spa->spa_scrub_finished)
+ ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
+ count);
+
+ mutex_enter(&spa->spa_errlist_lock);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+ count);
+ if (!ret)
+ ret = process_error_list(&spa->spa_errlist_last, uaddr,
+ count);
+ mutex_exit(&spa->spa_errlist_lock);
+
+ mutex_exit(&spa->spa_errlog_lock);
+#endif
+
+ return (ret);
+}
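+
+/*
+ * Illustrative calling pattern (a sketch, not part of this changeset):
+ * size the buffer first, then fetch the bookmarks. Since the copy is done
+ * with copyout(), uaddr must be a user address, normally supplied through
+ * the ZFS ioctl path.
+ *
+ * count = spa_get_errlog_size(spa);
+ * buf = <user buffer with room for count zbookmark_t's>;
+ * if (spa_get_errlog(spa, buf, &count) == ENOMEM)
+ * <retry with a larger buffer; new errors may have arrived>
+ */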
+
+/*
+ * Called when a scrub completes. This simply sets a bit that tells us which
+ * AVL tree to add new errors to. spa_errlog_sync() is responsible for actually
+ * syncing the changes to the underlying objects.
+ */
+void
+spa_errlog_rotate(spa_t *spa)
+{
+ mutex_enter(&spa->spa_errlist_lock);
+
+ ASSERT(!spa->spa_scrub_finished);
+ spa->spa_scrub_finished = B_TRUE;
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
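+
+/*
+ * The actual rotation is deferred to syncing context because freeing the old
+ * last log object and updating the MOS pointers must happen inside a
+ * transaction; see the scrub_finished handling in spa_errlog_sync() below.
+ */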
+
+/*
+ * Discard any pending errors from the spa_t. Called when unloading a faulted
+ * pool, as the errors encountered during the open cannot be synced to disk.
+ */
+void
+spa_errlog_drain(spa_t *spa)
+{
+ spa_error_entry_t *se;
+ void *cookie;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_last,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub,
+ &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+
+ mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Process a list of errors into the current on-disk log.
+ */
+static void
+sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx)
+{
+ spa_error_entry_t *se;
+ char buf[64];
+ void *cookie;
+
+ if (avl_numnodes(t) != 0) {
+ /* create log if necessary */
+ if (*obj == 0)
+ *obj = zap_create(spa->spa_meta_objset,
+ DMU_OT_ERROR_LOG, DMU_OT_NONE,
+ 0, tx);
+
+ /* add errors to the current log */
+ for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) {
+ char *name = se->se_name ? se->se_name : "";
+
+ bookmark_to_name(&se->se_bookmark, buf, sizeof (buf));
+
+ (void) zap_update(spa->spa_meta_objset,
+ *obj, buf, 1, strlen(name) + 1, name, tx);
+ }
+
+ /* purge the error list */
+ cookie = NULL;
+ while ((se = avl_destroy_nodes(t, &cookie)) != NULL)
+ kmem_free(se, sizeof (spa_error_entry_t));
+ }
+}
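+
+/*
+ * The resulting ZAP entries look like (hypothetical values):
+ *
+ * "15:1a4:0:3e" -> ""
+ *
+ * where the empty value means the object name is not yet known; it is
+ * computed on demand (presumably via spa_bookmark_name(), declared in
+ * spa.h), per the comment at the top of this file.
+ */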
+
+/*
+ * Sync the error log out to disk. This is a little tricky because the act of
+ * writing the error log requires the spa_errlist_lock. So, we need to lock the
+ * error lists, take a copy of the lists, and then reinitialize them. Then, we
+ * drop the error list lock and take the error log lock, at which point we
+ * do the errlog processing. Then, if we encounter an I/O error during this
+ * process, we can successfully add the error to the list. Note that this will
+ * result in the perpetual recycling of errors, but it is an unlikely situation
+ * and not a performance critical operation.
+ */
+void
+spa_errlog_sync(spa_t *spa, uint64_t txg)
+{
+ dmu_tx_t *tx;
+ avl_tree_t scrub, last;
+ int scrub_finished;
+
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Bail out early under normal circumstances.
+ */
+ if (avl_numnodes(&spa->spa_errlist_scrub) == 0 &&
+ avl_numnodes(&spa->spa_errlist_last) == 0 &&
+ !spa->spa_scrub_finished) {
+ mutex_exit(&spa->spa_errlist_lock);
+ return;
+ }
+
+ spa_get_errlists(spa, &last, &scrub);
+ scrub_finished = spa->spa_scrub_finished;
+ spa->spa_scrub_finished = B_FALSE;
+
+ mutex_exit(&spa->spa_errlist_lock);
+ mutex_enter(&spa->spa_errlog_lock);
+
+ tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+ /*
+ * Sync out the current list of errors.
+ */
+ sync_error_list(spa, &last, &spa->spa_errlog_last, tx);
+
+ /*
+ * Rotate the log if necessary.
+ */
+ if (scrub_finished) {
+ if (spa->spa_errlog_last != 0)
+ VERIFY(dmu_object_free(spa->spa_meta_objset,
+ spa->spa_errlog_last, tx) == 0);
+ spa->spa_errlog_last = spa->spa_errlog_scrub;
+ spa->spa_errlog_scrub = 0;
+
+ sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx);
+ }
+
+ /*
+ * Sync out any pending scrub errors.
+ */
+ sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx);
+
+ /*
+ * Update the MOS to reflect the new values.
+ */
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
+ &spa->spa_errlog_last, tx);
+ (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
+ &spa->spa_errlog_scrub, tx);
+
+ dmu_tx_commit(tx);
+
+ mutex_exit(&spa->spa_errlog_lock);
+}
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index 1ea7edfb77..8e0f6ce722 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -60,6 +59,7 @@
* - Increase spa_refcount from non-zero
* - Check if spa_refcount is zero
* - Rename a spa_t
+ * - add/remove/attach/detach devices
* - Held for the duration of create/destroy/import/export
*
* It does not need to handle recursion. A create or destroy may
@@ -91,14 +91,6 @@
* must have the namespace lock or non-zero refcount to have any kind
* of spa_t pointer at all.
*
- * spa_vdev_lock (global mutex)
- *
- * This special lock is a global mutex used to serialize attempts to
- * access devices through ZFS. It makes sure that we do not try to add
- * a single vdev to multiple pools at the same time. It must be held
- * when adding or removing a device from the pool.
- *
- *
* The locking order is fairly straightforward:
*
* spa_namespace_lock -> spa_refcount
@@ -111,10 +103,9 @@
* There must be at least one valid reference on the spa_t to acquire
* the config lock.
*
- * spa_vdev_lock -> spa_config_lock
+ * spa_namespace_lock -> spa_config_lock
*
- * There are no locks required for spa_vdev_lock, but it must be
- * acquired before spa_config_lock.
+ * The namespace lock must always be taken before the config lock.
*
*
* The spa_namespace_lock and spa_config_cache_lock can be acquired directly and
@@ -136,6 +127,7 @@
* spa_evict_all() Shutdown and remove all spa_t structures in
* the system.
*
+ * spa_guid_exists() Determine whether a pool/device guid exists.
*
* The spa_refcount is manipulated using the following functions:
*
@@ -162,15 +154,14 @@
* spa_config_held() Returns true if the config lock is currently
* held in the given state.
*
- * The spa_vdev_lock, while acquired directly, is hidden by the following
- * functions, which imply additional semantics that must be followed:
+ * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
*
- * spa_vdev_enter() Acquire the vdev lock and the config lock for
- * writing.
+ * spa_vdev_enter() Acquire the namespace lock and the config lock
+ * for writing.
*
* spa_vdev_exit() Release the config lock, wait for all I/O
- * to complete, release the vdev lock, and sync
- * the updated configs to the cache.
+ * to complete, sync the updated configs to the
+ * cache, and release the namespace lock.
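+ *
+ * A typical caller (illustrative sketch only) brackets a vdev change as:
+ *
+ * uint64_t txg = spa_vdev_enter(spa);
+ * ... modify the vdev configuration ...
+ * return (spa_vdev_exit(spa, newvd, txg, error));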
*
* The spa_name() function also requires either the spa_namespace_lock
* or the spa_config_lock, as both are needed to do a rename. spa_rename() is
@@ -191,8 +182,6 @@ int zfs_flags = ~0;
int zfs_flags = 0;
#endif
-static kmutex_t spa_vdev_lock;
-
#define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */
/*
@@ -238,6 +227,7 @@ spa_add(const char *name)
spa->spa_freeze_txg = UINT64_MAX;
refcount_create(&spa->spa_refcount);
+ refcount_create(&spa->spa_config_lock.scl_count);
avl_add(&spa_namespace_avl, spa);
@@ -268,6 +258,7 @@ spa_remove(spa_t *spa)
spa_config_set(spa, NULL);
refcount_destroy(&spa->spa_refcount);
+ refcount_destroy(&spa->spa_config_lock.scl_count);
kmem_free(spa, sizeof (spa_t));
}
@@ -351,7 +342,7 @@ spa_refcount_zero(spa_t *spa)
* valid use during create.
*/
void
-spa_config_enter(spa_t *spa, krw_t rw)
+spa_config_enter(spa_t *spa, krw_t rw, void *tag)
{
spa_config_lock_t *scl = &spa->spa_config_lock;
@@ -362,13 +353,14 @@ spa_config_enter(spa_t *spa, krw_t rw)
while (scl->scl_writer != NULL)
cv_wait(&scl->scl_cv, &scl->scl_lock);
} else {
- while (scl->scl_writer != NULL || scl->scl_count > 0)
+ while (scl->scl_writer != NULL ||
+ !refcount_is_zero(&scl->scl_count))
cv_wait(&scl->scl_cv, &scl->scl_lock);
scl->scl_writer = curthread;
}
}
- scl->scl_count++;
+ (void) refcount_add(&scl->scl_count, tag);
mutex_exit(&scl->scl_lock);
}
@@ -377,14 +369,14 @@ spa_config_enter(spa_t *spa, krw_t rw)
* Release the spa config lock, notifying any waiters in the process.
*/
void
-spa_config_exit(spa_t *spa)
+spa_config_exit(spa_t *spa, void *tag)
{
spa_config_lock_t *scl = &spa->spa_config_lock;
mutex_enter(&scl->scl_lock);
- ASSERT(scl->scl_count > 0);
- if (--scl->scl_count == 0) {
+ ASSERT(!refcount_is_zero(&scl->scl_count));
+ if (refcount_remove(&scl->scl_count, tag) == 0) {
cv_broadcast(&scl->scl_cv);
scl->scl_writer = NULL; /* OK in either case */
}
@@ -405,7 +397,7 @@ spa_config_held(spa_t *spa, krw_t rw)
if (rw == RW_WRITER)
held = (scl->scl_writer == curthread);
else
- held = (scl->scl_count != 0);
+ held = !refcount_is_zero(&scl->scl_count);
mutex_exit(&scl->scl_lock);
return (held);
@@ -418,16 +410,22 @@ spa_config_held(spa_t *spa, krw_t rw)
*/
/*
- * Lock the given spa_t for the purpose of adding or removing a vdev. This
- * grabs the global spa_vdev_lock as well as the spa config lock for writing.
+ * Lock the given spa_t for the purpose of adding or removing a vdev.
+ * Grabs the global spa_namespace_lock plus the spa config lock for writing.
* It returns the next transaction group for the spa_t.
*/
uint64_t
spa_vdev_enter(spa_t *spa)
{
- mutex_enter(&spa_vdev_lock);
+ /*
+ * Suspend scrub activity while we mess with the config.
+ */
+ spa_scrub_suspend(spa);
- spa_config_enter(spa, RW_WRITER);
+ if (spa->spa_root_vdev != NULL) /* not spa_create() */
+ mutex_enter(&spa_namespace_lock);
+
+ spa_config_enter(spa, RW_WRITER, spa);
return (spa_last_synced_txg(spa) + 1);
}
@@ -441,14 +439,26 @@ spa_vdev_enter(spa_t *spa)
int
spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
{
- vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+ ASSERT(txg != 0);
+
+ /*
+ * Reassess the DTLs. spa_scrub() looks at the DTLs without
+ * taking the config lock at all, so keep it safe.
+ */
+ if (spa->spa_root_vdev)
+ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+
+ spa_config_exit(spa, spa);
- spa_config_exit(spa);
+ /*
+ * If there was a scrub or resilver in progress, indicate that
+ * it must restart, and then allow it to resume.
+ */
+ spa_scrub_restart(spa, txg);
+ spa_scrub_resume(spa);
- if (vd == spa->spa_root_vdev) { /* spa_create() */
- mutex_exit(&spa_vdev_lock);
+ if (vd == spa->spa_root_vdev) /* spa_create() */
return (error);
- }
/*
* Note: this txg_wait_synced() is important because it ensures
@@ -458,8 +468,6 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
if (error == 0)
txg_wait_synced(spa->spa_dsl_pool, txg);
- mutex_exit(&spa_vdev_lock);
-
if (vd != NULL) {
ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0);
vdev_free(vd);
@@ -469,11 +477,10 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
* If we're in the middle of export or destroy, don't sync the
* config -- it will do that anyway, and we deadlock if we try.
*/
- if (error == 0 && spa->spa_state == POOL_STATE_ACTIVE) {
- mutex_enter(&spa_namespace_lock);
+ if (error == 0 && spa->spa_state == POOL_STATE_ACTIVE)
spa_config_sync();
- mutex_exit(&spa_namespace_lock);
- }
+
+ mutex_exit(&spa_namespace_lock);
return (error);
}
@@ -497,7 +504,7 @@ spa_rename(const char *name, const char *newname)
* Lookup the spa_t and grab the config lock for writing. We need to
* actually open the pool so that we can sync out the necessary labels.
* It's OK to call spa_open() with the namespace lock held because we
- * alllow recursive calls for other reasons.
+ * allow recursive calls for other reasons.
*/
mutex_enter(&spa_namespace_lock);
if ((err = spa_open(name, &spa, FTAG)) != 0) {
@@ -505,7 +512,7 @@ spa_rename(const char *name, const char *newname)
return (err);
}
- spa_config_enter(spa, RW_WRITER);
+ spa_config_enter(spa, RW_WRITER, FTAG);
avl_remove(&spa_namespace_avl, spa);
spa_strfree(spa->spa_name);
@@ -519,7 +526,7 @@ spa_rename(const char *name, const char *newname)
*/
vdev_config_dirty(spa->spa_root_vdev);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
txg_wait_synced(spa->spa_dsl_pool, 0);
@@ -548,12 +555,8 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
{
spa_t *spa;
avl_tree_t *t = &spa_namespace_avl;
- boolean_t locked = B_FALSE;
- if (mutex_owner(&spa_namespace_lock) != curthread) {
- mutex_enter(&spa_namespace_lock);
- locked = B_TRUE;
- }
+ ASSERT(MUTEX_HELD(&spa_namespace_lock));
for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) {
if (spa->spa_state == POOL_STATE_UNINITIALIZED)
@@ -565,9 +568,6 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid)
break;
}
- if (locked)
- mutex_exit(&spa_namespace_lock);
-
return (spa != NULL);
}
@@ -646,12 +646,12 @@ spa_freeze(spa_t *spa)
{
uint64_t freeze_txg = 0;
- spa_config_enter(spa, RW_WRITER);
+ spa_config_enter(spa, RW_WRITER, FTAG);
if (spa->spa_freeze_txg == UINT64_MAX) {
freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE;
spa->spa_freeze_txg = freeze_txg;
}
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
if (freeze_txg != 0)
txg_wait_synced(spa_get_dsl(spa), freeze_txg);
}
diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c
index 25f66bf94b..a99ec3f360 100644
--- a/usr/src/uts/common/fs/zfs/space_map.c
+++ b/usr/src/uts/common/fs/zfs/space_map.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -293,7 +292,8 @@ space_map_load(space_map_t *sm, space_map_obj_t *smo, uint8_t maptype,
dprintf("object=%llu offset=%llx size=%llx\n",
smo->smo_object, offset, size);
- dmu_read(os, smo->smo_object, offset, size, entry_map);
+ VERIFY(0 == dmu_read(os, smo->smo_object, offset, size,
+ entry_map));
entry_map_end = entry_map + (size / sizeof (uint64_t));
for (entry = entry_map; entry < entry_map_end; entry++) {
@@ -394,7 +394,8 @@ space_map_write(space_map_t *sm, space_map_obj_t *smo, objset_t *os,
{
uint64_t oldsize = smo->smo_objsize;
- dmu_free_range(os, smo->smo_object, 0, smo->smo_objsize, tx);
+ VERIFY(0 == dmu_free_range(os, smo->smo_object, 0,
+ smo->smo_objsize, tx));
smo->smo_objsize = 0;
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index b11cd42b6d..1a93d4e4ca 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -41,6 +40,7 @@ typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
typedef void arc_byteswap_func_t(void *buf, size_t size);
+typedef int arc_evict_func_t(void *private);
/* generic arc_done_func_t's which you can use */
arc_done_func_t arc_bcopy_func;
@@ -50,6 +50,8 @@ struct arc_buf {
arc_buf_hdr_t *b_hdr;
arc_buf_t *b_next;
void *b_data;
+ arc_evict_func_t *b_efunc;
+ void *b_private;
};
/*
@@ -60,22 +62,30 @@ struct arc_buf {
#define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */
arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag);
-void arc_buf_free(arc_buf_t *buf, void *tag);
+void arc_buf_add_ref(arc_buf_t *buf, void *tag);
+int arc_buf_remove_ref(arc_buf_t *buf, void *tag);
int arc_buf_size(arc_buf_t *buf);
void arc_release(arc_buf_t *buf, void *tag);
int arc_released(arc_buf_t *buf);
+int arc_has_callback(arc_buf_t *buf);
+#ifdef ZFS_DEBUG
+int arc_referenced(arc_buf_t *buf);
+#endif
int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t arc_flags);
+ uint32_t arc_flags, zbookmark_t *zb);
int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
arc_done_func_t *done, void *private, int priority, int flags,
- uint32_t arc_flags);
+ uint32_t arc_flags, zbookmark_t *zb);
int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_done_func_t *done, void *private, uint32_t arc_flags);
int arc_tryread(spa_t *spa, blkptr_t *bp, void *data);
+void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private);
+int arc_buf_evict(arc_buf_t *buf);
+
void arc_flush(void);
void arc_tempreserve_clear(uint64_t tempreserve);
int arc_tempreserve_space(uint64_t tempreserve);
diff --git a/usr/src/uts/common/fs/zfs/sys/bplist.h b/usr/src/uts/common/fs/zfs/sys/bplist.h
index 0933cb977b..c716fe7aa6 100644
--- a/usr/src/uts/common/fs/zfs/sys/bplist.h
+++ b/usr/src/uts/common/fs/zfs/sys/bplist.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -67,11 +66,11 @@ typedef struct bplist {
extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx);
extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx);
-extern void bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
+extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object);
extern void bplist_close(bplist_t *bpl);
extern boolean_t bplist_empty(bplist_t *bpl);
extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp);
-extern void bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx);
+extern int bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx);
extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp);
extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx);
extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx);
diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h
index d67901b31a..5724f7a324 100644
--- a/usr/src/uts/common/fs/zfs/sys/dbuf.h
+++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -45,13 +44,14 @@ extern "C" {
#define IN_DMU_SYNC ((blkptr_t *)-1)
/*
- * define flags for dbuf_read and friends
+ * define flags for dbuf_read
*/
#define DB_RF_MUST_SUCCEED 0
#define DB_RF_CANFAIL (1 << 1)
#define DB_RF_HAVESTRUCT (1 << 2)
#define DB_RF_NOPREFETCH (1 << 3)
+#define DB_RF_NEVERWAIT (1 << 4)
/*
* The state transition diagram for dbufs looks like:
@@ -59,7 +59,7 @@ extern "C" {
* +----> READ ----+
* | |
* | V
- * (alloc)-->UNCACHED CACHED-->(free)
+ * (alloc)-->UNCACHED CACHED-->EVICTING-->(free)
* | ^
* | |
* +----> FILL ----+
@@ -68,7 +68,8 @@ typedef enum dbuf_states {
DB_UNCACHED,
DB_FILL,
DB_READ,
- DB_CACHED
+ DB_CACHED,
+ DB_EVICTING
} dbuf_states_t;
struct objset_impl;
@@ -158,8 +159,8 @@ typedef struct dmu_buf_impl {
uint64_t db_dirtied;
/*
- * If dd_dnode != NULL, our link on the owner dnodes's dn_dbufs list.
- * Protected by its dn_mtx.
+ * If db_dnode != NULL, our link on the owner dnode's dn_dbufs list.
+ * Protected by its dn_dbufs_mtx.
*/
list_node_t db_link;
@@ -194,7 +195,7 @@ typedef struct dmu_buf_impl {
* modify (dirty or clean). db_mtx must be held
* before dn_dirty_mtx.
*/
- arc_buf_t *db_data_old[TXG_SIZE];
+ void *db_data_old[TXG_SIZE];
blkptr_t *db_overridden_by[TXG_SIZE];
} db_d;
} dmu_buf_impl_t;
@@ -212,35 +213,32 @@ typedef struct dbuf_hash_table {
uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset);
dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
+dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn);
-dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid);
+dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
void *tag);
-dmu_buf_impl_t *dbuf_hold_bonus(struct dnode *dn, void *tag);
int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create,
void *tag, dmu_buf_impl_t **dbp);
void dbuf_prefetch(struct dnode *dn, uint64_t blkid);
void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
-void dbuf_remove_ref(dmu_buf_impl_t *db, void *tag);
uint64_t dbuf_refcount(dmu_buf_impl_t *db);
-void dbuf_rele(dmu_buf_impl_t *db);
+void dbuf_rele(dmu_buf_impl_t *db, void *tag);
dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid);
-void dbuf_read(dmu_buf_impl_t *db);
-int dbuf_read_canfail(dmu_buf_impl_t *db);
-void dbuf_read_havestruct(dmu_buf_impl_t *db);
-void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
+int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-void dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx);
void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
void dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
+void dbuf_clear(dmu_buf_impl_t *db);
void dbuf_evict(dmu_buf_impl_t *db);
void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
@@ -250,7 +248,6 @@ void dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg);
void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks,
struct dmu_tx *);
-void dbuf_downgrade(dmu_buf_impl_t *db, int evicting);
void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
void dbuf_init(void);
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 62cc46c4de..f0ba816a7c 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -99,6 +98,8 @@ typedef enum dmu_object_type {
DMU_OT_PLAIN_OTHER, /* UINT8 */
DMU_OT_UINT64_OTHER, /* UINT64 */
DMU_OT_ZAP_OTHER, /* ZAP */
+ /* new object types: */
+ DMU_OT_ERROR_LOG, /* ZAP */
DMU_OT_NUMTYPES
} dmu_object_type_t;
@@ -146,6 +147,7 @@ void zfs_znode_byteswap(void *buf, size_t size);
int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
objset_t **osp);
void dmu_objset_close(objset_t *os);
+void dmu_objset_evict_dbufs(objset_t *os);
int dmu_objset_create(const char *name, dmu_objset_type_t type,
objset_t *clone_parent,
void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg);
@@ -177,6 +179,8 @@ typedef void dmu_byteswap_func_t(void *buf, size_t size);
#define DMU_POOL_CONFIG "config"
#define DMU_POOL_ROOT_DATASET "root_dataset"
#define DMU_POOL_SYNC_BPLIST "sync_bplist"
+#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub"
+#define DMU_POOL_ERRLOG_LAST "errlog_last"
/*
* Allocate an object from this objset. The range of object numbers
@@ -268,8 +272,7 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
* dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
* buffer as well. You must release your hold with dmu_buf_rele().
*/
-dmu_buf_t *dmu_bonus_hold(objset_t *os, uint64_t object);
-dmu_buf_t *dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag);
+int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
int dmu_bonus_max(void);
/*
@@ -286,11 +289,10 @@ int dmu_bonus_max(void);
*
* The object number must be a valid, allocated object number.
*/
-dmu_buf_t *dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset);
+int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+ void *tag, dmu_buf_t **);
void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
-void dmu_buf_remove_ref(dmu_buf_t *db, void* tag);
-void dmu_buf_rele(dmu_buf_t *db);
-void dmu_buf_rele_tag(dmu_buf_t *db, void *tag);
+void dmu_buf_rele(dmu_buf_t *db, void *tag);
uint64_t dmu_buf_refcount(dmu_buf_t *db);
/*
@@ -303,9 +305,9 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db);
* with dmu_buf_rele_array. You can NOT release the hold on each buffer
* individually with dmu_buf_rele.
*/
-dmu_buf_t **dmu_buf_hold_array(objset_t *os, uint64_t object,
- uint64_t offset, uint64_t length, int *numbufs);
-void dmu_buf_rele_array(dmu_buf_t **, int numbufs);
+int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
+ uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp);
+void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
/*
* Returns NULL on success, or the existing user ptr if it's already
@@ -348,19 +350,6 @@ void dmu_buf_rele_data(dmu_buf_t *db);
void *dmu_buf_get_user(dmu_buf_t *db);
/*
- * Indicate that you are going to read the buffer's data (db_data).
- *
- * This routine will read the data from disk if necessary.
- *
- * These routines will return 0 on success, or an errno if there is a
- * nonrecoverable I/O error.
- */
-void dmu_buf_read(dmu_buf_t *db);
-int dmu_buf_read_canfail(dmu_buf_t *db);
-void dmu_buf_read_array(dmu_buf_t **dbp, int numbufs);
-int dmu_buf_read_array_canfail(dmu_buf_t **dbp, int numbufs);
-
-/*
* Indicate that you are going to modify the buffer's data (db_data).
*
* The transaction (tx) must be assigned to a txg (ie. you've called
@@ -370,20 +359,6 @@ int dmu_buf_read_array_canfail(dmu_buf_t **dbp, int numbufs);
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
/*
- * Indicate that you are going to modify the entire contents of the
- * buffer's data ("fill" it).
- *
- * This routine is the same as dmu_buf_will_dirty, except that it won't
- * read the contents off the disk, so the contents may be uninitialized
- * and you must overwrite it.
- *
- * The transaction (tx) must be assigned to a txg (ie. you've called
- * dmu_tx_assign()). The buffer's object must be held in the tx (ie.
- * you've called dmu_tx_hold_object(tx, db->db_object)).
- */
-/* void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); */
-
-/*
* You must create a transaction, then hold the objects which you will
* (or might) modify as part of this transaction. Then you must assign
* the transaction to a transaction group. Once the transaction has
@@ -408,7 +383,7 @@ dmu_tx_t *dmu_tx_create(objset_t *os);
void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
uint64_t len);
-void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops);
+void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name);
void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
void dmu_tx_abort(dmu_tx_t *tx);
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
@@ -418,7 +393,7 @@ void dmu_tx_commit(dmu_tx_t *tx);
* Free up the data blocks for a defined range of a file. If size is
* zero, the range from offset to end-of-file is freed.
*/
-void dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx);
/*
@@ -427,10 +402,8 @@ void dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
* Canfail routines will return 0 on success, or an errno if there is a
* nonrecoverable I/O error.
*/
-void dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
+int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
void *buf);
-int dmu_read_canfail(objset_t *dd, uint64_t object, uint64_t offset,
- uint64_t size, void *buf);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
const void *buf, dmu_tx_t *tx);
int dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
@@ -491,8 +464,7 @@ uint64_t dmu_object_max_nonzero_offset(objset_t *os, uint64_t object);
typedef struct dmu_objset_stats {
dmu_objset_type_t dds_type;
uint8_t dds_is_snapshot;
- uint8_t dds_is_placeholder;
- uint8_t dds_pad[2];
+ uint8_t dds_pad[3];
uint64_t dds_creation_time;
uint64_t dds_creation_txg;
@@ -532,7 +504,6 @@ typedef struct dmu_objset_stats {
* change, so there is a small probability that it will collide.
*/
uint64_t dds_fsid_guid;
- uint64_t dds_guid;
uint64_t dds_objects_used; /* number of objects used */
uint64_t dds_objects_avail; /* number of objects available */
@@ -553,15 +524,9 @@ typedef struct dmu_objset_stats {
uint64_t dds_available;
/*
- * Miscellaneous
+ * Used for debugging purposes
*/
- char dds_altroot[MAXPATHLEN];
-
- /* The following are for debugging purposes only */
uint64_t dds_last_txg;
- uint64_t dds_dir_obj;
- uint64_t dds_objset_obj;
- uint64_t dds_clone_of_obj;
} dmu_objset_stats_t;
/*
@@ -617,7 +582,7 @@ void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
dmu_traverse_cb_t cb, void *arg);
int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp);
-int dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep,
+int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep,
struct vnode *vp, uint64_t voffset);
/* CRC64 table */
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
index d0a77fcfb9..ee14bfab85 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -86,12 +85,7 @@ typedef struct objset_impl {
list_t os_downgraded_dbufs;
} objset_impl_t;
-#define DMU_PRIVATE_OBJECT (1ULL << 63)
-
-#define DMU_META_DNODE_OBJECT (1ULL << 63)
-
-/* XXX rename this to DMU_IS_DNODE_OBJECT? */
-#define IS_DNODE_DNODE(object) ((object) == DMU_META_DNODE_OBJECT)
+#define DMU_META_DNODE_OBJECT 0
/* called from zpl */
int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode,
@@ -106,13 +100,14 @@ void dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds);
void dmu_objset_find(char *name, void func(char *, void *), void *arg,
int flags);
void dmu_objset_byteswap(void *buf, size_t size);
+void dmu_objset_evict_dbufs(objset_t *os);
/* called from dsl */
void dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx);
objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
dmu_objset_type_t type, dmu_tx_t *tx);
-objset_impl_t *dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds,
- blkptr_t *bp);
+int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
+ objset_impl_t **osip);
void dmu_objset_evict(struct dsl_dataset *ds, void *arg);
#ifdef __cplusplus
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
index 7087912e00..a80345afd0 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -45,7 +44,8 @@ extern "C" {
#define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */
#define ADVANCE_DATA 0x04 /* read user data blocks */
#define ADVANCE_HOLES 0x08 /* visit holes */
-#define ADVANCE_NOLOCK 0x10 /* Don't grab SPA sync lock */
+#define ADVANCE_ZIL 0x10 /* visit intent log blocks */
+#define ADVANCE_NOLOCK 0x20 /* Don't grab SPA sync lock */
#define ZB_NO_LEVEL -2
#define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */
@@ -58,13 +58,6 @@ extern "C" {
#define ZB_DN_CACHE 2
#define ZB_DEPTH 3
-typedef struct zbookmark {
- uint64_t zb_objset;
- uint64_t zb_object;
- int zb_level;
- uint64_t zb_blkid;
-} zbookmark_t;
-
typedef struct zseg {
uint64_t seg_mintxg;
uint64_t seg_maxtxg;
@@ -93,6 +86,7 @@ struct traverse_handle {
int th_zio_flags;
list_t th_seglist;
traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL];
+ traverse_blk_cache_t th_zil_cache;
uint64_t th_hits;
uint64_t th_arc_hits;
uint64_t th_reads;
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h
index d04c7c8d6b..9b55c56bc9 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -54,6 +53,7 @@ struct dmu_tx {
struct dsl_dir *tx_dir;
struct dsl_pool *tx_pool;
uint64_t tx_txg;
+ uint64_t tx_lastsnap_txg;
txg_handle_t tx_txgh;
uint64_t tx_space_towrite;
refcount_t tx_space_written;
@@ -62,7 +62,7 @@ struct dmu_tx {
uint64_t tx_space_tooverwrite;
void *tx_tempreserve_cookie;
uint8_t tx_anyobj;
- uint8_t tx_privateobj;
+ int tx_err;
#ifdef ZFS_DEBUG
char *tx_debug_buf;
int tx_debug_len;
@@ -79,15 +79,10 @@ enum dmu_tx_hold_type {
THT_NUMTYPES
};
-typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
- uint64_t arg1, uint64_t arg2);
-
-
typedef struct dmu_tx_hold {
list_node_t dth_node;
struct dnode *dth_dnode;
enum dmu_tx_hold_type dth_type;
- dmu_tx_hold_func_t dth_func;
uint64_t dth_arg1;
uint64_t dth_arg2;
/* XXX track what the actual estimates were for this hold */
diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h
index 1b43805e93..31b148f295 100644
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -63,23 +62,16 @@ extern "C" {
#define DNODE_SIZE (1 << DNODE_SHIFT)
#define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT)
#define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
+#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT)
#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT)
#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT)
-#define DN_META_DNODE_LEVELS \
- (1 + (DN_MAX_OBJECT_SHIFT - DNODE_SHIFT + SPA_BLKPTRSHIFT - \
- DNODES_PER_BLOCK_SHIFT) / DNODES_PER_LEVEL_SHIFT)
-
/* The +2 here is a cheesy way to round up */
#define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \
(DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT)))
-#define DN_MAX_OBJECT \
- ((uint64_t)DN_MAX_NBLKPTR << (DNODES_PER_BLOCK_SHIFT + \
- (DN_META_DNODE_LEVELS - 1) * DNODES_PER_LEVEL_SHIFT))
-
#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \
(((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t))))
@@ -213,15 +205,7 @@ typedef struct dnode {
kmutex_t dn_dbufs_mtx;
list_t dn_dbufs; /* linked list of descendent dbuf_t's */
- kcondvar_t dn_evicted; /* a child dbuf has been evicted */
-
- /*
- * Performance hack: whenever we have a hold on the bonus buffer of a
- * ZAP object, we will also have a hold on db0. This will keep the
- * meta-data for a micro-zap object cached as long as the znode for the
- * object is in the znode cache.
- */
- struct dmu_buf_impl *dn_db0;
+ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */
/* holds prefetch structure */
struct zfetch dn_zfetch;
@@ -237,9 +221,10 @@ dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp,
uint64_t object);
void dnode_special_close(dnode_t *dn);
-dnode_t *dnode_hold(struct objset_impl *dd, uint64_t object, void *ref);
-dnode_t *dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
- void *ref);
+int dnode_hold(struct objset_impl *dd, uint64_t object,
+ void *ref, dnode_t **dnp);
+int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag,
+ void *ref, dnode_t **dnp);
void dnode_add_ref(dnode_t *dn, void *ref);
void dnode_rele(dnode_t *dn, void *ref);
void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx);
@@ -266,6 +251,7 @@ void dnode_init(void);
void dnode_fini(void);
int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl,
uint64_t blkfill);
+void dnode_evict_dbufs(dnode_t *dn);
#ifdef ZFS_DEBUG
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
index e56c8a67d9..3411eba68b 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -108,8 +107,8 @@ int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode,
void *tag, dsl_dataset_t **dsp);
int dsl_dataset_open(const char *name, int mode, void *tag,
dsl_dataset_t **dsp);
-dsl_dataset_t *dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj,
- const char *tail, int mode, void *tag);
+int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj,
+ const char *tail, int mode, void *tag, dsl_dataset_t **);
void dsl_dataset_name(dsl_dataset_t *ds, char *name);
void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag);
int dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname,
@@ -134,8 +133,8 @@ void dsl_dataset_sync(dsl_dataset_t *os, dmu_tx_t *tx);
void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx);
-int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth,
- dmu_tx_t *tx);
+int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth);
+uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds);
void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx);
void dsl_dataset_stats(dsl_dataset_t *os, dmu_objset_stats_t *dds);
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
index 0499d731e6..5c23fdc497 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -98,11 +97,11 @@ struct dsl_dir {
};
void dsl_dir_close(dsl_dir_t *dd, void *tag);
-dsl_dir_t *dsl_dir_open(const char *name, void *tag, const char **tail);
-dsl_dir_t *dsl_dir_open_spa(spa_t *spa, const char *name, void *tag,
+int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail);
+int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **,
const char **tailp);
-dsl_dir_t *dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
- const char *tail, void *tag);
+int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj,
+ const char *tail, void *tag, dsl_dir_t **);
void dsl_dir_name(dsl_dir_t *dd, char *buf);
int dsl_dir_is_private(dsl_dir_t *dd);
int dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx);
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
index 4fca4548ad..2eab6ae945 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -67,7 +66,7 @@ typedef struct dsl_pool {
krwlock_t dp_config_rwlock;
} dsl_pool_t;
-dsl_pool_t *dsl_pool_open(spa_t *spa, uint64_t txg);
+int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp);
void dsl_pool_close(dsl_pool_t *dp);
dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg);
void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg);
diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h
index f9fffd2443..0b7e12f2cb 100644
--- a/usr/src/uts/common/fs/zfs/sys/refcount.h
+++ b/usr/src/uts/common/fs/zfs/sys/refcount.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -42,7 +41,7 @@ extern "C" {
* particular object, use FTAG (which is a string) for the holder_tag.
* Otherwise, use the object that holds the reference.
*/
-#define FTAG ((void*)__func__)
+#define FTAG ((char *)__func__)
#if defined(DEBUG) || !defined(_KERNEL)
typedef struct reference {
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index fbe2822a13..2c8a43bb37 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -292,21 +291,30 @@ typedef struct blkptr {
/* state manipulation functions */
extern int spa_open(const char *pool, spa_t **, void *tag);
-extern int spa_get_stats(const char *pool, nvlist_t **config);
+extern int spa_get_stats(const char *pool, nvlist_t **config,
+ char *altroot, size_t buflen);
extern int spa_create(const char *pool, nvlist_t *config, char *altroot);
extern int spa_import(const char *pool, nvlist_t *config, char *altroot);
extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
extern int spa_destroy(char *pool);
extern int spa_export(char *pool);
+extern int spa_reset(char *pool);
+extern void spa_async_request(spa_t *spa, int flag);
+extern void spa_async_suspend(spa_t *spa);
+extern void spa_async_resume(spa_t *spa);
+extern spa_t *spa_inject_addref(char *pool);
+extern void spa_inject_delref(spa_t *spa);
+
+#define SPA_ASYNC_REOPEN 0x01
+#define SPA_ASYNC_REPLACE_DONE 0x02
+#define SPA_ASYNC_SCRUB 0x04
+#define SPA_ASYNC_RESILVER 0x08
/* device manipulation */
extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
-extern int spa_vdev_add_unlocked(spa_t *spa, nvlist_t *nvroot);
-extern int spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot,
+extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
int replacing);
-extern int spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid,
- int replace_done);
-extern void spa_vdev_replace_done(spa_t *spa);
+extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done);
extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
/* scrubbing */
@@ -314,6 +322,7 @@ extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force);
extern void spa_scrub_suspend(spa_t *spa);
extern void spa_scrub_resume(spa_t *spa);
extern void spa_scrub_restart(spa_t *spa, uint64_t txg);
+extern void spa_scrub_throttle(spa_t *spa, int direction);
/* spa syncing */
extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
@@ -345,8 +354,8 @@ extern void spa_close(spa_t *spa, void *tag);
extern boolean_t spa_refcount_zero(spa_t *spa);
/* Pool configuration lock */
-extern void spa_config_enter(spa_t *spa, krw_t rw);
-extern void spa_config_exit(spa_t *spa);
+extern void spa_config_enter(spa_t *spa, krw_t rw, void *tag);
+extern void spa_config_exit(spa_t *spa, void *tag);
extern boolean_t spa_config_held(spa_t *spa, krw_t rw);
/* Pool vdev add/remove lock */
@@ -383,6 +392,23 @@ extern uint64_t spa_get_random(uint64_t range);
extern void sprintf_blkptr(char *buf, int len, blkptr_t *bp);
extern void spa_freeze(spa_t *spa);
extern void spa_evict_all(void);
+extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid);
+
+/* error handling */
+struct zbookmark;
+struct zio;
+extern void spa_log_error(spa_t *spa, struct zio *zio);
+extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd,
+ struct zio *zio, uint64_t stateoroffset, uint64_t length);
+extern void zfs_post_ok(spa_t *spa, vdev_t *vd);
+extern uint64_t spa_get_errlog_size(spa_t *spa);
+extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
+extern void spa_errlog_rotate(spa_t *spa);
+extern void spa_errlog_drain(spa_t *spa);
+extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
+extern int spa_bookmark_name(spa_t *spa, struct zbookmark *zb, char *ds,
+ size_t dsname, char *obj, size_t objname, char *range, size_t rangelen);
+extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
/* Initialization and termination */
extern void spa_init(int flags);
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index 0fcef6c48b..e9192956c3 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -46,27 +45,33 @@ extern "C" {
typedef struct spa_config_lock {
kmutex_t scl_lock;
- uint64_t scl_count;
+ refcount_t scl_count;
kthread_t *scl_writer;
kcondvar_t scl_cv;
} spa_config_lock_t;
+typedef struct spa_error_entry {
+ zbookmark_t se_bookmark;
+ char *se_name;
+ avl_node_t se_avl;
+} spa_error_entry_t;
+
struct spa {
/*
* Fields protected by spa_namespace_lock.
*/
char *spa_name;
avl_node_t spa_avl;
- int spa_anon;
nvlist_t *spa_config;
uint64_t spa_config_txg; /* txg of last config change */
spa_config_lock_t spa_config_lock; /* configuration changes */
kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */
int spa_sync_pass; /* iterate-to-convergence */
int spa_state; /* pool state */
- uint8_t spa_minref; /* min refcnt of open pool */
+ int spa_inject_ref; /* injection references */
uint8_t spa_traverse_wanted; /* traverse lock wanted */
- taskq_t *spa_vdev_retry_taskq;
+ uint8_t spa_sync_on; /* sync threads are running */
+ spa_load_state_t spa_load_state; /* current load operation */
taskq_t *spa_zio_issue_taskq[ZIO_TYPES];
taskq_t *spa_zio_intr_taskq[ZIO_TYPES];
dsl_pool_t *spa_dsl_pool;
@@ -88,18 +93,33 @@ struct spa {
kthread_t *spa_scrub_thread; /* scrub/resilver thread */
traverse_handle_t *spa_scrub_th; /* scrub traverse handle */
uint64_t spa_scrub_restart_txg; /* need to restart */
+ uint64_t spa_scrub_mintxg; /* min txg we'll scrub */
uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */
uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */
+ int64_t spa_scrub_throttled; /* over-throttle scrub I/Os */
uint64_t spa_scrub_errors; /* scrub I/O error count */
+ int spa_scrub_suspended; /* tell scrubber to suspend */
kcondvar_t spa_scrub_cv; /* scrub thread state change */
kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */
uint8_t spa_scrub_stop; /* tell scrubber to stop */
- uint8_t spa_scrub_suspend; /* tell scrubber to suspend */
uint8_t spa_scrub_active; /* active or suspended? */
uint8_t spa_scrub_type; /* type of scrub we're doing */
- int spa_sync_on; /* sync threads are running */
+ kmutex_t spa_async_lock; /* protect async state */
+ kthread_t *spa_async_thread; /* thread doing async task */
+ int spa_async_suspended; /* async tasks suspended */
+ kcondvar_t spa_async_cv; /* wait for thread_exit() */
+ uint16_t spa_async_tasks; /* async task mask */
char *spa_root; /* alternate root directory */
kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */
+ uint64_t spa_ena; /* spa-wide ereport ENA */
+ boolean_t spa_last_open_failed; /* true if last open faled */
+ kmutex_t spa_errlog_lock; /* error log lock */
+ uint64_t spa_errlog_last; /* last error log object */
+ uint64_t spa_errlog_scrub; /* scrub error log object */
+ kmutex_t spa_errlist_lock; /* error list/ereport lock */
+ avl_tree_t spa_errlist_last; /* last error list */
+ avl_tree_t spa_errlist_scrub; /* scrub error list */
+ int spa_scrub_finished; /* indicator to rotate logs */
/*
* spa_refcnt must be the last element because it changes size based on
* compilation options. In order for the MDB module to function
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index 86d2f1b1ab..f3d7379049 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -60,11 +60,10 @@ typedef struct vdev_knob {
extern int vdev_open(vdev_t *);
extern void vdev_close(vdev_t *);
extern int vdev_create(vdev_t *, uint64_t txg);
-extern void vdev_init(vdev_t *, uint64_t txg);
-extern void vdev_reopen(vdev_t *, zio_t **zq);
+extern int vdev_init(vdev_t *, uint64_t txg);
+extern void vdev_reopen(vdev_t *);
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
-extern vdev_t *vdev_lookup_by_path(vdev_t *vd, const char *path);
extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid);
extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size);
extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size);
@@ -73,16 +72,16 @@ extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
extern const char *vdev_description(vdev_t *vd);
-extern void vdev_metaslab_init(vdev_t *vd, uint64_t txg);
+extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
extern void vdev_stat_update(zio_t *zio);
extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
boolean_t complete);
-extern void vdev_checksum_error(zio_t *zio, vdev_t *vd);
extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec);
-extern void vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux);
+extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
+ vdev_aux_t aux);
extern void vdev_space_update(vdev_t *vd, uint64_t space_delta,
uint64_t alloc_delta);
@@ -92,11 +91,10 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
extern void vdev_io_start(zio_t *zio);
extern void vdev_io_done(zio_t *zio);
-extern int vdev_online(spa_t *spa, const char *path);
-extern int vdev_offline(spa_t *spa, const char *path, int istmp);
+extern int vdev_online(spa_t *spa, uint64_t guid);
+extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp);
+extern void vdev_clear(spa_t *spa, vdev_t *vd);
-extern int vdev_error_setup(spa_t *spa, const char *path, int mode, int mask,
- uint64_t arg);
extern int vdev_error_inject(vdev_t *vd, zio_t *zio);
extern int vdev_is_dead(vdev_t *vd);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 53a202a906..2dfc45edff 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -103,9 +103,11 @@ struct vdev_cache {
struct vdev_queue {
uint64_t vq_min_pending;
uint64_t vq_max_pending;
+ uint64_t vq_scrub_limit;
uint64_t vq_agg_limit;
uint64_t vq_time_shift;
uint64_t vq_ramp_rate;
+ uint64_t vq_scrub_count;
avl_tree_t vq_deadline_tree;
avl_tree_t vq_read_tree;
avl_tree_t vq_write_tree;
@@ -150,10 +152,9 @@ struct vdev {
txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */
txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */
uint8_t vdev_dirty[TXG_SIZE]; /* per-txg dirty flags */
- int vdev_is_dirty; /* on config dirty list? */
+ uint8_t vdev_is_dirty; /* on config dirty list? */
+ uint8_t vdev_reopen_wanted; /* async reopen wanted? */
list_node_t vdev_dirty_node; /* config dirty list */
- zio_t *vdev_io_retry; /* I/O retry list */
- list_t vdev_io_pending; /* I/O pending list */
/*
* Leaf vdev state.
@@ -173,6 +174,8 @@ struct vdev {
uint8_t vdev_detached; /* device detached? */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
vdev_cache_t vdev_cache; /* physical block cache */
+ uint64_t vdev_not_present; /* not present during import */
+ hrtime_t vdev_last_try; /* last reopen time */
/*
* For DTrace to work in userland (libzpool) context, these fields must
@@ -183,8 +186,6 @@ struct vdev {
*/
kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */
kmutex_t vdev_dirty_lock; /* vdev_dirty[] */
- kmutex_t vdev_io_lock; /* vdev_io_pending list */
- kcondvar_t vdev_io_cv; /* vdev_io_pending list empty? */
kmutex_t vdev_stat_lock; /* vdev_stat */
};
@@ -260,7 +261,7 @@ extern void vdev_remove_parent(vdev_t *cvd);
/*
* vdev sync load and sync
*/
-extern int vdev_load(vdev_t *vd, int import);
+extern int vdev_load(vdev_t *vd);
extern void vdev_sync(vdev_t *vd, uint64_t txg);
extern void vdev_sync_done(vdev_t *vd, uint64_t txg);
extern void vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg);
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
index 9fb6a6c5a4..e77a2efa61 100644
--- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h
@@ -199,7 +199,7 @@ void zap_put_leaf(struct zap_leaf *l);
int fzap_add_cd(zap_t *zap, const char *name,
uint64_t integer_size, uint64_t num_integers,
- const void *val, uint32_t cd, dmu_tx_t *tx, struct zap_leaf **lp);
+ const void *val, uint32_t cd, dmu_tx_t *tx);
void fzap_upgrade(zap_t *zap, dmu_tx_t *tx);
#ifdef __cplusplus
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
index 2ea27493f9..34057e83c9 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -103,7 +102,6 @@ int zfs_zaccess_rename(struct znode *, struct znode *,
struct znode *, struct znode *, cred_t *cr);
int zfs_zaccess_v4_perm(struct znode *, int, cred_t *);
void zfs_acl_free(zfs_acl_t *);
-zfs_acl_t *zfs_acl_node_read(struct znode *);
#endif
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index c914b23570..14ad31e629 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -31,6 +30,7 @@
#include <sys/cred.h>
#include <sys/dmu.h>
+#include <sys/zio.h>
#ifdef __cplusplus
extern "C" {
@@ -66,7 +66,7 @@ typedef struct dmu_replay_record {
char drr_toname[MAXNAMELEN];
} drr_begin;
struct drr_end {
- uint64_t drr_checksum;
+ zio_cksum_t drr_checksum;
} drr_end;
struct drr_object {
uint64_t drr_object;
@@ -97,15 +97,31 @@ typedef struct dmu_replay_record {
} drr_u;
} dmu_replay_record_t;
+typedef struct zinject_record {
+ uint64_t zi_objset;
+ uint64_t zi_object;
+ uint64_t zi_start;
+ uint64_t zi_end;
+ uint64_t zi_guid;
+ uint32_t zi_level;
+ uint32_t zi_error;
+ uint64_t zi_type;
+ uint32_t zi_freq;
+} zinject_record_t;
+
+#define ZINJECT_NULL 0x1
+#define ZINJECT_FLUSH_ARC 0x2
+#define ZINJECT_UNLOAD_SPA 0x4
+
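A zinject_record_t describes which I/Os a registered handler should fail: a bookmark-style <objset, object, range, level> match (or a leaf vdev guid in zi_guid) plus the errno to force. The plumbing that delivers the record to zio_inject_fault() in zio.h further down is not shown in this hunk, so the sketch below only fills in a record in userland; the reading of zi_freq as a percentage is an assumption.

#include <errno.h>
#include <stdint.h>
#include <string.h>

typedef struct zinject_record {
	uint64_t zi_objset;
	uint64_t zi_object;
	uint64_t zi_start;
	uint64_t zi_end;
	uint64_t zi_guid;
	uint32_t zi_level;
	uint32_t zi_error;
	uint64_t zi_type;
	uint32_t zi_freq;
} zinject_record_t;

/*
 * Fail reads of every block of one object with EIO.  zi_guid stays zero
 * because we match by object, not by leaf vdev; zi_freq is assumed here
 * to mean "percentage of matching I/Os to fail".
 */
static void
fill_read_fault(zinject_record_t *zr, uint64_t objset, uint64_t object)
{
	memset(zr, 0, sizeof (*zr));
	zr->zi_objset = objset;
	zr->zi_object = object;
	zr->zi_start = 0;
	zr->zi_end = UINT64_MAX;	/* whole object */
	zr->zi_error = EIO;
	zr->zi_freq = 100;
}

int
main(void)
{
	zinject_record_t zr;

	fill_read_fault(&zr, 5, 37);
	return (zr.zi_error == EIO ? 0 : 1);
}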
typedef struct zfs_cmd {
char zc_name[MAXNAMELEN];
char zc_prop_name[MAXNAMELEN];
char zc_prop_value[MAXPATHLEN];
char zc_root[MAXPATHLEN];
- char zc_filename[MAXPATHLEN];
+ char zc_filename[MAXNAMELEN];
uint32_t zc_intsz;
uint32_t zc_numints;
- uint64_t zc_pool_guid;
+ uint64_t zc_guid;
uint64_t zc_config_src; /* really (char *) */
uint64_t zc_config_src_size;
uint64_t zc_config_dst; /* really (char *) */
@@ -116,9 +132,10 @@ typedef struct zfs_cmd {
uint64_t zc_volsize;
uint64_t zc_volblocksize;
uint64_t zc_objset_type;
- dmu_object_info_t zc_object_info;
dmu_objset_stats_t zc_objset_stats;
struct drr_begin zc_begin_record;
+ zinject_record_t zc_inject_record;
+ zbookmark_t zc_bookmark;
} zfs_cmd_t;
#define ZVOL_MAX_MINOR (1 << 16)
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
index f9331be00a..02f4b3b247 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -133,8 +132,6 @@ typedef struct zfs_dirlock {
struct zfs_dirlock *dl_next; /* next in z_dirlocks list */
} zfs_dirlock_t;
-struct zcache_state;
-
typedef struct znode {
struct zfsvfs *z_zfsvfs;
vnode_t *z_vnode;
@@ -150,16 +147,12 @@ typedef struct znode {
uint8_t z_atime_dirty; /* atime needs to be synced */
uint8_t z_dbuf_held; /* Is z_dbuf already held? */
uint8_t z_zn_prefetch; /* Prefetch znodes? */
- uint_t z_mapcnt; /* number of memory maps to file */
uint_t z_blksz; /* block size in bytes */
uint_t z_seq; /* modification sequence number */
+ uint64_t z_mapcnt; /* number of pages mapped to file */
uint64_t z_last_itx; /* last ZIL itx on this znode */
kmutex_t z_acl_lock; /* acl data lock */
list_node_t z_link_node; /* all znodes in fs link */
- list_node_t z_zcache_node;
- struct zcache_state *z_zcache_state;
- uint64_t z_zcache_access;
-
/*
* These are dmu managed fields.
*/
@@ -241,14 +234,12 @@ extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, dmu_tx_t *,
cred_t *cr);
extern void zfs_znode_init(void);
extern void zfs_znode_fini(void);
-extern znode_t *zfs_znode_alloc(zfsvfs_t *, dmu_buf_t *, uint64_t, int);
extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **);
extern void zfs_zinactive(znode_t *);
extern void zfs_znode_delete(znode_t *, dmu_tx_t *);
extern void zfs_znode_free(znode_t *);
extern int zfs_delete_thread_target(zfsvfs_t *zfsvfs, int nthreads);
extern void zfs_delete_wait_empty(zfsvfs_t *zfsvfs);
-extern void zfs_zcache_flush(zfsvfs_t *zfsvf);
extern void zfs_remove_op_tables();
extern int zfs_create_op_tables();
extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr);
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index 5d3227e546..d80310f2fa 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -109,23 +108,25 @@ enum zio_compress {
#define ZIO_PRIORITY_SCRUB (zio_priority_table[9])
#define ZIO_PRIORITY_TABLE_SIZE 10
-#define ZIO_FLAG_MUSTSUCCEED 0x0000
-#define ZIO_FLAG_CANFAIL 0x0001
-#define ZIO_FLAG_FAILFAST 0x0002
-#define ZIO_FLAG_CONFIG_HELD 0x0004
+#define ZIO_FLAG_MUSTSUCCEED 0x00000
+#define ZIO_FLAG_CANFAIL 0x00001
+#define ZIO_FLAG_FAILFAST 0x00002
+#define ZIO_FLAG_CONFIG_HELD 0x00004
-#define ZIO_FLAG_DONT_CACHE 0x0010
-#define ZIO_FLAG_DONT_QUEUE 0x0020
-#define ZIO_FLAG_DONT_PROPAGATE 0x0040
-#define ZIO_FLAG_DONT_RETRY 0x0080
+#define ZIO_FLAG_DONT_CACHE 0x00010
+#define ZIO_FLAG_DONT_QUEUE 0x00020
+#define ZIO_FLAG_DONT_PROPAGATE 0x00040
+#define ZIO_FLAG_DONT_RETRY 0x00080
-#define ZIO_FLAG_PHYSICAL 0x0100
-#define ZIO_FLAG_IO_BYPASS 0x0200
-#define ZIO_FLAG_IO_REPAIR 0x0400
-#define ZIO_FLAG_SPECULATIVE 0x0800
+#define ZIO_FLAG_PHYSICAL 0x00100
+#define ZIO_FLAG_IO_BYPASS 0x00200
+#define ZIO_FLAG_IO_REPAIR 0x00400
+#define ZIO_FLAG_SPECULATIVE 0x00800
-#define ZIO_FLAG_RESILVER 0x1000
-#define ZIO_FLAG_SCRUB 0x2000
+#define ZIO_FLAG_RESILVER 0x01000
+#define ZIO_FLAG_SCRUB 0x02000
+
+#define ZIO_FLAG_NOBOOKMARK 0x10000
#define ZIO_FLAG_GANG_INHERIT \
(ZIO_FLAG_CANFAIL | \
@@ -155,11 +156,39 @@ typedef struct zio_transform zio_transform_t;
extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE];
extern char *zio_type_name[ZIO_TYPES];
+/*
+ * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
+ * identifies any block in the pool. By convention, the meta-objset (MOS)
+ * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is
+ * level -1 of the meta-dnode, and intent log blocks (which are chained
+ * off the root block) have blkid == sequence number. In summary:
+ *
+ * mos is objset 0
+ * meta-dnode is object 0
+ * root block is <objset, 0, -1, 0>
+ * intent log is <objset, 0, -1, ZIL sequence number>
+ *
+ * Note: this structure is called a bookmark because its first purpose was
+ * to remember where to resume a pool-wide traverse. The absolute ordering
+ * for block visitation during traversal is defined in compare_bookmark().
+ *
+ * Note: this structure is passed between userland and the kernel.
+ * Therefore it must not change size or alignment between 32/64 bit
+ * compilation options.
+ */
+typedef struct zbookmark {
+ uint64_t zb_objset;
+ uint64_t zb_object;
+ int64_t zb_level;
+ uint64_t zb_blkid;
+} zbookmark_t;
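The four-tuple makes every block in the pool totally orderable, which is what lets a traverse resume from a saved bookmark. The actual visitation order lives in compare_bookmark(), whose body is not part of this diff; the sketch below uses plain lexicographic ordering of the tuple, so take it as an illustration of the idea rather than the real comparator (the struct is suffixed _sketch to avoid implying it is the kernel type).

#include <stdint.h>
#include <stdio.h>

typedef struct zbookmark_sketch {
	uint64_t zb_objset;
	uint64_t zb_object;
	int64_t	 zb_level;
	uint64_t zb_blkid;
} zbookmark_sketch_t;

static int
zbookmark_cmp(const zbookmark_sketch_t *a, const zbookmark_sketch_t *b)
{
	if (a->zb_objset != b->zb_objset)
		return (a->zb_objset < b->zb_objset ? -1 : 1);
	if (a->zb_object != b->zb_object)
		return (a->zb_object < b->zb_object ? -1 : 1);
	if (a->zb_level != b->zb_level)
		return (a->zb_level < b->zb_level ? -1 : 1);
	if (a->zb_blkid != b->zb_blkid)
		return (a->zb_blkid < b->zb_blkid ? -1 : 1);
	return (0);
}

int
main(void)
{
	/* Root block of objset 5 vs. a level-0 block of object 37 in it. */
	zbookmark_sketch_t root = { 5, 0, -1, 0 };
	zbookmark_sketch_t data = { 5, 37, 0, 12 };

	printf("%d\n", zbookmark_cmp(&root, &data));	/* prints -1 */
	return (0);
}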
+
struct zio {
/* Core information about this I/O */
zio_t *io_parent;
zio_t *io_root;
spa_t *io_spa;
+ zbookmark_t io_bookmark;
int io_checksum;
int io_compress;
int io_dva_index;
@@ -170,6 +199,7 @@ struct zio {
zio_t *io_sibling_prev;
zio_t *io_sibling_next;
zio_transform_t *io_transform_stack;
+ zio_t *io_logical;
/* Callback info */
zio_done_func_t *io_done;
@@ -191,8 +221,6 @@ struct zio {
avl_tree_t *io_vdev_tree;
zio_t *io_delegate_list;
zio_t *io_delegate_next;
- zio_t *io_retry_next;
- list_node_t io_pending;
/* Internal pipeline state */
int io_flags;
@@ -212,6 +240,9 @@ struct zio {
void *io_waiter;
kmutex_t io_lock;
kcondvar_t io_cv;
+
+ /* FMA state */
+ uint64_t io_ena;
};
extern zio_t *zio_null(zio_t *pio, spa_t *spa,
@@ -222,15 +253,17 @@ extern zio_t *zio_root(spa_t *spa,
extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags);
+ int priority, int flags, zbookmark_t *zb);
extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags);
+ zio_done_func_t *done, void *private, int priority, int flags,
+ zbookmark_t *zb);
extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags);
+ zio_done_func_t *done, void *private, int priority, int flags,
+ zbookmark_t *zb);
extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
zio_done_func_t *done, void *private);
@@ -285,12 +318,27 @@ extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp);
extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent);
extern uint8_t zio_compress_select(uint8_t child, uint8_t parent);
+boolean_t zio_should_retry(zio_t *zio);
+
/*
* Initial setup and teardown.
*/
extern void zio_init(void);
extern void zio_fini(void);
+/*
+ * Fault injection
+ */
+struct zinject_record;
+extern uint32_t zio_injection_enabled;
+extern int zio_inject_fault(char *name, int flags, int *id,
+ struct zinject_record *record);
+extern int zio_inject_list_next(int *id, char *name, size_t buflen,
+ struct zinject_record *record);
+extern int zio_clear_fault(int id);
+extern int zio_handle_fault_injection(zio_t *zio, int error);
+extern int zio_handle_device_injection(vdev_t *vd, int error);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
index ba3dc48d28..bb7bd41e0b 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -57,9 +56,11 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS];
*/
extern zio_checksum_t fletcher_2_native;
extern zio_checksum_t fletcher_4_native;
+extern zio_checksum_t fletcher_4_incremental_native;
extern zio_checksum_t fletcher_2_byteswap;
extern zio_checksum_t fletcher_4_byteswap;
+extern zio_checksum_t fletcher_4_incremental_byteswap;
extern zio_checksum_t zio_checksum_SHA256;
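These incremental variants pair with the drr_end change above, where the stream's trailing checksum widens from a single uint64_t to a full zio_cksum_t: a backup stream can be checksummed one record at a time by carrying the four running sums across calls. A self-contained sketch of that incremental shape; the word-at-a-time update is the standard Fletcher-4 recurrence, while the kernel versions differ in byteswapping and signature.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct cksum { uint64_t a, b, c, d; } cksum_t;

static void
fletcher4_incremental(const void *buf, size_t size, cksum_t *zc)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));

	for (; ip < ipend; ip++) {
		zc->a += *ip;
		zc->b += zc->a;
		zc->c += zc->b;
		zc->d += zc->c;
	}
}

int
main(void)
{
	uint32_t stream[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
	cksum_t whole = { 0 }, split = { 0 };

	/* One shot vs. two increments over the same data must agree. */
	fletcher4_incremental(stream, sizeof (stream), &whole);
	fletcher4_incremental(stream, 4 * sizeof (uint32_t), &split);
	fletcher4_incremental(stream + 4, 4 * sizeof (uint32_t), &split);
	printf("%s\n", memcmp(&whole, &split, sizeof (whole)) == 0 ?
	    "match" : "mismatch");
	return (0);
}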
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
index 0b2b07de29..e1abf0e49d 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -201,6 +200,9 @@ struct zio_transform {
zio_transform_t *zt_next;
};
+extern void zio_inject_init(void);
+extern void zio_inject_fini(void);
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/fs/zfs/uberblock.c b/usr/src/uts/common/fs/zfs/uberblock.c
index 63bff0ae4b..b6d3fe9595 100644
--- a/usr/src/uts/common/fs/zfs/uberblock.c
+++ b/usr/src/uts/common/fs/zfs/uberblock.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -30,9 +29,6 @@
#include <sys/uberblock_impl.h>
#include <sys/vdev_impl.h>
-/* Keep the uberblock version in a variable so we can get at it with mdb */
-static uint64_t uberblock_version = UBERBLOCK_VERSION;
-
int
uberblock_verify(uberblock_t *ub)
{
@@ -42,9 +38,6 @@ uberblock_verify(uberblock_t *ub)
if (ub->ub_magic != UBERBLOCK_MAGIC)
return (EINVAL);
- if (ub->ub_version != UBERBLOCK_VERSION)
- return (ENOTSUP);
-
return (0);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index 838e1bfc88..363be462ab 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -26,6 +26,7 @@
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
@@ -137,34 +138,6 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev)
}
vdev_t *
-vdev_lookup_by_path(vdev_t *vd, const char *path)
-{
- int c;
- vdev_t *mvd;
-
- if (vd->vdev_path != NULL) {
- if (vd->vdev_wholedisk == 1) {
- /*
- * For whole disks, the internal path has 's0', but the
- * path passed in by the user doesn't.
- */
- if (strlen(path) == strlen(vd->vdev_path) - 2 &&
- strncmp(path, vd->vdev_path, strlen(path)) == 0)
- return (vd);
- } else if (strcmp(path, vd->vdev_path) == 0) {
- return (vd);
- }
- }
-
- for (c = 0; c < vd->vdev_children; c++)
- if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) !=
- NULL)
- return (mvd);
-
- return (NULL);
-}
-
-vdev_t *
vdev_lookup_by_guid(vdev_t *vd, uint64_t guid)
{
int c;
@@ -305,10 +278,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
vd->vdev_ops = ops;
vd->vdev_state = VDEV_STATE_CLOSED;
- mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL);
- cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL);
- list_create(&vd->vdev_io_pending, sizeof (zio_t),
- offsetof(zio_t, io_pending));
mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL);
space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock);
@@ -343,9 +312,6 @@ vdev_free_common(vdev_t *vd)
mutex_exit(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_dtl_lock);
mutex_destroy(&vd->vdev_dirty_lock);
- list_destroy(&vd->vdev_io_pending);
- mutex_destroy(&vd->vdev_io_lock);
- cv_destroy(&vd->vdev_io_cv);
kmem_free(vd, sizeof (vdev_t));
}
@@ -402,6 +368,13 @@ vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype)
vd->vdev_wholedisk = -1ULL;
/*
+ * Look for the 'not present' flag. This will only be set if the device
+ * was not present at the time of import.
+ */
+ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+ &vd->vdev_not_present);
+
+ /*
* If we're a top-level vdev, try to load the allocation parameters.
*/
if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) {
@@ -536,8 +509,8 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
vdev_config_dirty(tvd);
}
- ASSERT(svd->vdev_io_retry == NULL);
- ASSERT(list_is_empty(&svd->vdev_io_pending));
+ tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted;
+ svd->vdev_reopen_wanted = 0;
}
static void
@@ -611,7 +584,7 @@ vdev_remove_parent(vdev_t *cvd)
vdev_free(mvd);
}
-void
+int
vdev_metaslab_init(vdev_t *vd, uint64_t txg)
{
spa_t *spa = vd->vdev_spa;
@@ -621,6 +594,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift;
space_map_obj_t *smo = vd->vdev_smo;
metaslab_t **mspp = vd->vdev_ms;
+ int ret;
dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc);
@@ -638,21 +612,29 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
ms_array = kmem_zalloc(newc * sizeof (uint64_t),
KM_SLEEP);
- dmu_read(spa->spa_meta_objset, vd->vdev_ms_array,
- 0, newc * sizeof (uint64_t), ms_array);
+ if ((ret = dmu_read(spa->spa_meta_objset,
+ vd->vdev_ms_array, 0,
+ newc * sizeof (uint64_t), ms_array)) != 0) {
+ kmem_free(ms_array, newc * sizeof (uint64_t));
+ goto error;
+ }
for (c = 0; c < newc; c++) {
if (ms_array[c] == 0)
continue;
- db = dmu_bonus_hold(spa->spa_meta_objset,
- ms_array[c]);
- dmu_buf_read(db);
+ if ((ret = dmu_bonus_hold(
+ spa->spa_meta_objset, ms_array[c],
+ FTAG, &db)) != 0) {
+ kmem_free(ms_array,
+ newc * sizeof (uint64_t));
+ goto error;
+ }
ASSERT3U(db->db_size, ==, sizeof (*smo));
bcopy(db->db_data, &vd->vdev_smo[c],
db->db_size);
ASSERT3U(vd->vdev_smo[c].smo_object, ==,
ms_array[c]);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
}
kmem_free(ms_array, newc * sizeof (uint64_t));
}
@@ -674,6 +656,21 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg)
kmem_free(mspp, oldc * sizeof (*mspp));
}
+ return (0);
+
+error:
+ /*
+ * On error, undo any partial progress we may have made, and restore the
+ * old metaslab values.
+ */
+ kmem_free(vd->vdev_smo, newc * sizeof (*smo));
+ kmem_free(vd->vdev_ms, newc * sizeof (*mspp));
+
+ vd->vdev_smo = smo;
+ vd->vdev_ms = mspp;
+ vd->vdev_ms_count = oldc;
+
+ return (ret);
}
void
@@ -735,39 +732,39 @@ vdev_open(vdev_t *vd)
if (vd->vdev_offline) {
ASSERT(vd->vdev_children == 0);
- dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd));
- vd->vdev_state = VDEV_STATE_OFFLINE;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE);
return (ENXIO);
}
error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift);
+ if (zio_injection_enabled && error == 0)
+ error = zio_handle_device_injection(vd, ENXIO);
+
dprintf("%s = %d, osize %llu, state = %d\n",
vdev_description(vd), error, osize, vd->vdev_state);
if (error) {
- dprintf("%s in %s failed to open, error %d, aux %d\n",
- vdev_description(vd),
- vdev_description(vd->vdev_parent),
- error,
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
vd->vdev_stat.vs_aux);
-
- vd->vdev_state = VDEV_STATE_CANT_OPEN;
return (error);
}
vd->vdev_state = VDEV_STATE_HEALTHY;
for (c = 0; c < vd->vdev_children; c++)
- if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY)
- vd->vdev_state = VDEV_STATE_DEGRADED;
+ if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) {
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED,
+ VDEV_AUX_NONE);
+ break;
+ }
osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t));
if (vd->vdev_children == 0) {
if (osize < SPA_MINDEVSIZE) {
- vd->vdev_state = VDEV_STATE_CANT_OPEN;
- vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
return (EOVERFLOW);
}
psize = osize;
@@ -775,8 +772,8 @@ vdev_open(vdev_t *vd)
} else {
if (osize < SPA_MINDEVSIZE -
(VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) {
- vd->vdev_state = VDEV_STATE_CANT_OPEN;
- vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_TOO_SMALL);
return (EOVERFLOW);
}
psize = 0;
@@ -796,9 +793,8 @@ vdev_open(vdev_t *vd)
* Make sure the alignment requirement hasn't increased.
*/
if (ashift > vd->vdev_ashift) {
- dprintf("%s: ashift grew\n", vdev_description(vd));
- vd->vdev_state = VDEV_STATE_CANT_OPEN;
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
return (EINVAL);
}
@@ -806,9 +802,8 @@ vdev_open(vdev_t *vd)
* Make sure the device hasn't shrunk.
*/
if (asize < vd->vdev_asize) {
- dprintf("%s: device shrank\n", vdev_description(vd));
- vd->vdev_state = VDEV_STATE_CANT_OPEN;
- vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
+ vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_BAD_LABEL);
return (EINVAL);
}
@@ -818,11 +813,29 @@ vdev_open(vdev_t *vd)
*/
if (vd->vdev_state == VDEV_STATE_HEALTHY &&
asize > vd->vdev_asize) {
- dprintf("%s: device grew\n", vdev_description(vd));
vd->vdev_asize = asize;
}
}
+ /*
+ * If we were able to open a vdev that was marked permanently
+ * unavailable, clear that state now.
+ */
+ if (vd->vdev_not_present)
+ vd->vdev_not_present = 0;
+
+ /*
+ * This allows the ZFS DE to close cases appropriately. If a device
+ * goes away and later returns, we want to close the associated case.
+ * But it's not enough to simply post this only when a device goes from
+ * CANT_OPEN -> HEALTHY. If we reboot the system and the device is
+ * back, we also need to close the case (otherwise we will try to replay
+ * it). So we have to post this notifier every time. Since this only
+ * occurs during pool open or error recovery, this should not be an
+ * issue.
+ */
+ zfs_post_ok(vd->vdev_spa, vd);
+
return (0);
}
@@ -832,8 +845,6 @@ vdev_open(vdev_t *vd)
void
vdev_close(vdev_t *vd)
{
- ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL);
-
vd->vdev_ops->vdev_op_close(vd);
if (vd->vdev_cache_active) {
@@ -846,43 +857,29 @@ vdev_close(vdev_t *vd)
vd->vdev_state = VDEV_STATE_OFFLINE;
else
vd->vdev_state = VDEV_STATE_CLOSED;
+ vd->vdev_stat.vs_aux = VDEV_AUX_NONE;
}
void
-vdev_reopen(vdev_t *vd, zio_t **rq)
+vdev_reopen(vdev_t *vd)
{
- vdev_t *rvd = vd->vdev_spa->spa_root_vdev;
+ spa_t *spa = vd->vdev_spa;
+ vdev_t *rvd = spa->spa_root_vdev;
int c;
+ ASSERT(spa_config_held(spa, RW_WRITER));
+
if (vd == rvd) {
- ASSERT(rq == NULL);
for (c = 0; c < rvd->vdev_children; c++)
- vdev_reopen(rvd->vdev_child[c], NULL);
+ vdev_reopen(rvd->vdev_child[c]);
return;
}
/* only valid for top-level vdevs */
ASSERT3P(vd, ==, vd->vdev_top);
- /*
- * vdev_state can change when spa_config_lock is held as writer,
- * or when it's held as reader and we're doing a vdev_reopen().
- * To handle the latter case, we grab rvd's io_lock to serialize
- * reopens. This ensures that there's never more than one vdev
- * state changer active at a time.
- */
- mutex_enter(&rvd->vdev_io_lock);
-
- mutex_enter(&vd->vdev_io_lock);
- while (list_head(&vd->vdev_io_pending) != NULL)
- cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock);
vdev_close(vd);
(void) vdev_open(vd);
- if (rq != NULL) {
- *rq = vd->vdev_io_retry;
- vd->vdev_io_retry = NULL;
- }
- mutex_exit(&vd->vdev_io_lock);
/*
* Reassess root vdev's health.
@@ -892,8 +889,6 @@ vdev_reopen(vdev_t *vd, zio_t **rq)
uint64_t state = rvd->vdev_child[c]->vdev_state;
rvd->vdev_state = MIN(rvd->vdev_state, state);
}
-
- mutex_exit(&rvd->vdev_io_lock);
}
int
@@ -930,7 +925,7 @@ vdev_create(vdev_t *vd, uint64_t txg)
* For creation, we want to try to create all vdevs at once and then undo it
* if anything fails; this is much harder if we have pending transactions.
*/
-void
+int
vdev_init(vdev_t *vd, uint64_t txg)
{
/*
@@ -942,7 +937,7 @@ vdev_init(vdev_t *vd, uint64_t txg)
/*
* Initialize the vdev's metaslabs.
*/
- vdev_metaslab_init(vd, txg);
+ return (vdev_metaslab_init(vd, txg));
}
void
@@ -993,9 +988,10 @@ vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size)
void
vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
{
+ spa_t *spa = vd->vdev_spa;
int c;
- ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER));
+ ASSERT(spa_config_held(spa, RW_WRITER));
if (vd->vdev_children == 0) {
mutex_enter(&vd->vdev_dtl_lock);
@@ -1019,6 +1015,12 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
return;
}
+ /*
+ * Make sure the DTLs are always correct under the scrub lock.
+ */
+ if (vd == spa->spa_root_vdev)
+ mutex_enter(&spa->spa_scrub_lock);
+
mutex_enter(&vd->vdev_dtl_lock);
space_map_vacate(&vd->vdev_dtl_map, NULL, NULL);
space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL);
@@ -1032,6 +1034,9 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub);
mutex_exit(&vd->vdev_dtl_lock);
}
+
+ if (vd == spa->spa_root_vdev)
+ mutex_exit(&spa->spa_scrub_lock);
}
static int
@@ -1047,11 +1052,12 @@ vdev_dtl_load(vdev_t *vd)
if (smo->smo_object == 0)
return (0);
- db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
- dmu_buf_read(db);
+ if ((error = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object,
+ FTAG, &db)) != 0)
+ return (error);
ASSERT3U(db->db_size, ==, sizeof (*smo));
bcopy(db->db_data, smo, db->db_size);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
mutex_enter(&vd->vdev_dtl_lock);
error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC,
@@ -1100,8 +1106,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
vdev_config_dirty(vd->vdev_top);
}
- dmu_free_range(spa->spa_meta_objset, smo->smo_object,
- 0, smo->smo_objsize, tx);
+ VERIFY(0 == dmu_free_range(spa->spa_meta_objset, smo->smo_object,
+ 0, smo->smo_objsize, tx));
mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL);
@@ -1124,17 +1130,18 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg)
mutex_exit(&smlock);
mutex_destroy(&smlock);
- db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object);
+ VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object,
+ FTAG, &db));
dmu_buf_will_dirty(db, tx);
ASSERT3U(db->db_size, ==, sizeof (*smo));
bcopy(smo, db->db_data, db->db_size);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
dmu_tx_commit(tx);
}
int
-vdev_load(vdev_t *vd, int import)
+vdev_load(vdev_t *vd)
{
spa_t *spa = vd->vdev_spa;
int c, error;
@@ -1147,7 +1154,7 @@ vdev_load(vdev_t *vd, int import)
* Recursively load all children.
*/
for (c = 0; c < vd->vdev_children; c++)
- if ((error = vdev_load(vd->vdev_child[c], import)) != 0)
+ if ((error = vdev_load(vd->vdev_child[c])) != 0)
return (error);
/*
@@ -1166,7 +1173,7 @@ vdev_load(vdev_t *vd, int import)
*/
if ((label = vdev_label_read_config(vd)) == NULL) {
dprintf("can't load label config\n");
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
return (0);
}
@@ -1174,7 +1181,7 @@ vdev_load(vdev_t *vd, int import)
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
&guid) != 0 || guid != spa_guid(spa)) {
dprintf("bad or missing pool GUID (%llu)\n", guid);
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (0);
@@ -1184,7 +1191,7 @@ vdev_load(vdev_t *vd, int import)
guid != vd->vdev_guid) {
dprintf("bad or missing vdev guid (%llu != %llu)\n",
guid, vd->vdev_guid);
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (0);
@@ -1201,14 +1208,15 @@ vdev_load(vdev_t *vd, int import)
if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
&state)) {
dprintf("missing pool state\n");
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
nvlist_free(label);
return (0);
}
if (state != POOL_STATE_ACTIVE &&
- (!import || state != POOL_STATE_EXPORTED)) {
+ (spa->spa_load_state == SPA_LOAD_OPEN ||
+ state != POOL_STATE_EXPORTED)) {
dprintf("pool state not active (%llu)\n", state);
nvlist_free(label);
return (EBADF);
@@ -1227,12 +1235,16 @@ vdev_load(vdev_t *vd, int import)
vd->vdev_ms_shift == 0 ||
vd->vdev_ashift == 0 ||
vd->vdev_asize == 0) {
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
return (0);
}
- vdev_metaslab_init(vd, 0);
+ if ((error = vdev_metaslab_init(vd, 0)) != 0) {
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ return (0);
+ }
}
/*
@@ -1243,7 +1255,7 @@ vdev_load(vdev_t *vd, int import)
if (error) {
dprintf("can't load DTL for %s, error %d\n",
vdev_description(vd), error);
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN,
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
return (0);
}
@@ -1344,7 +1356,7 @@ vdev_description(vdev_t *vd)
}
int
-vdev_online(spa_t *spa, const char *path)
+vdev_online(spa_t *spa, uint64_t guid)
{
vdev_t *rvd, *vd;
uint64_t txg;
@@ -1352,24 +1364,14 @@ vdev_online(spa_t *spa, const char *path)
txg = spa_vdev_enter(spa);
rvd = spa->spa_root_vdev;
- if ((vd = vdev_lookup_by_path(rvd, path)) == NULL)
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
dprintf("ONLINE: %s\n", vdev_description(vd));
vd->vdev_offline = B_FALSE;
vd->vdev_tmpoffline = B_FALSE;
-
- /*
- * Clear the error counts. The idea is that you expect to see all
- * zeroes when everything is working, so if you've just onlined a
- * device, you don't want to keep hearing about errors from before.
- */
- vd->vdev_stat.vs_read_errors = 0;
- vd->vdev_stat.vs_write_errors = 0;
- vd->vdev_stat.vs_checksum_errors = 0;
-
- vdev_reopen(vd->vdev_top, NULL);
+ vdev_reopen(vd->vdev_top);
spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0));
@@ -1383,7 +1385,7 @@ vdev_online(spa_t *spa, const char *path)
}
int
-vdev_offline(spa_t *spa, const char *path, int istmp)
+vdev_offline(spa_t *spa, uint64_t guid, int istmp)
{
vdev_t *rvd, *vd;
uint64_t txg;
@@ -1391,7 +1393,7 @@ vdev_offline(spa_t *spa, const char *path, int istmp)
txg = spa_vdev_enter(spa);
rvd = spa->spa_root_vdev;
- if ((vd = vdev_lookup_by_path(rvd, path)) == NULL)
+ if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL)
return (spa_vdev_exit(spa, NULL, txg, ENODEV));
dprintf("OFFLINE: %s\n", vdev_description(vd));
@@ -1416,10 +1418,10 @@ vdev_offline(spa_t *spa, const char *path, int istmp)
* undo it and fail the request.
*/
vd->vdev_offline = B_TRUE;
- vdev_reopen(vd->vdev_top, NULL);
+ vdev_reopen(vd->vdev_top);
if (vdev_is_dead(vd->vdev_top)) {
vd->vdev_offline = B_FALSE;
- vdev_reopen(vd->vdev_top, NULL);
+ vdev_reopen(vd->vdev_top);
return (spa_vdev_exit(spa, NULL, txg, EBUSY));
}
@@ -1434,25 +1436,25 @@ vdev_offline(spa_t *spa, const char *path, int istmp)
return (spa_vdev_exit(spa, NULL, txg, 0));
}
-int
-vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg)
+/*
+ * Clear the error counts associated with this vdev. Unlike vdev_online() and
+ * vdev_offline(), we assume the spa config is locked. We also clear all
+ * children. If 'vd' is NULL, then the user wants to clear all vdevs.
+ */
+void
+vdev_clear(spa_t *spa, vdev_t *vd)
{
- vdev_t *vd;
-
- spa_config_enter(spa, RW_WRITER);
-
- if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) {
- spa_config_exit(spa);
- return (ENODEV);
- }
+ int c;
- vd->vdev_fault_mode = mode;
- vd->vdev_fault_mask = mask;
- vd->vdev_fault_arg = arg;
+ if (vd == NULL)
+ vd = spa->spa_root_vdev;
- spa_config_exit(spa);
+ vd->vdev_stat.vs_read_errors = 0;
+ vd->vdev_stat.vs_write_errors = 0;
+ vd->vdev_stat.vs_checksum_errors = 0;
- return (0);
+ for (c = 0; c < vd->vdev_children; c++)
+ vdev_clear(spa, vd->vdev_child[c]);
}
int
@@ -1631,24 +1633,6 @@ vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
}
/*
- * Report checksum errors that a vdev didn't realize it made.
- * This can happen, for example, when RAID-Z combinatorial reconstruction
- * infers that one of its components returned bad data.
- */
-void
-vdev_checksum_error(zio_t *zio, vdev_t *vd)
-{
- dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
- vdev_description(vd));
-
- if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
- mutex_enter(&vd->vdev_stat_lock);
- vd->vdev_stat.vs_checksum_errors++;
- mutex_exit(&vd->vdev_stat_lock);
- }
-}
-
-/*
* Update the in-core space usage stats for this vdev and the root vdev.
*/
void
@@ -1709,6 +1693,14 @@ static vdev_knob_t vdev_knob[] = {
offsetof(struct vdev, vdev_queue.vq_max_pending)
},
{
+ "scrub_limit",
+ "maximum scrub/resilver I/O queue",
+ 0,
+ 10000,
+ 70,
+ offsetof(struct vdev, vdev_queue.vq_scrub_limit)
+ },
+ {
"agg_limit",
"maximum size of aggregated I/Os",
0,
@@ -1781,20 +1773,78 @@ vdev_config_clean(vdev_t *vd)
}
/*
- * Set a vdev's state, updating any parent's state as well.
+ * Set a vdev's state. If this is during an open, we don't update the parent
+ * state, because we're in the process of opening children depth-first.
+ * Otherwise, we propagate the change to the parent.
+ *
+ * If this routine places a device in a faulted state, an appropriate ereport is
+ * generated.
*/
void
-vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
+vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux)
{
- if (state == vd->vdev_state)
+ uint64_t prev_state;
+
+ if (state == vd->vdev_state) {
+ vd->vdev_stat.vs_aux = aux;
return;
+ }
+
+ prev_state = vd->vdev_state;
vd->vdev_state = state;
vd->vdev_stat.vs_aux = aux;
+ if (state == VDEV_STATE_CANT_OPEN) {
+ /*
+ * If we fail to open a vdev during an import, we mark it as
+ * "not available", which signifies that it was never there to
+ * begin with. Failure to open such a device is not considered
+ * an error.
+ */
+ if (!vd->vdev_not_present &&
+ vd != vd->vdev_spa->spa_root_vdev) {
+ const char *class;
+
+ switch (aux) {
+ case VDEV_AUX_OPEN_FAILED:
+ class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED;
+ break;
+ case VDEV_AUX_CORRUPT_DATA:
+ class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA;
+ break;
+ case VDEV_AUX_NO_REPLICAS:
+ class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS;
+ break;
+ case VDEV_AUX_BAD_GUID_SUM:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM;
+ break;
+ case VDEV_AUX_TOO_SMALL:
+ class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL;
+ break;
+ case VDEV_AUX_BAD_LABEL:
+ class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL;
+ break;
+ default:
+ class = FM_EREPORT_ZFS_DEVICE_UNKNOWN;
+ }
+
+ zfs_ereport_post(class, vd->vdev_spa,
+ vd, NULL, prev_state, 0);
+ }
+
+ if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT &&
+ vd->vdev_ops->vdev_op_leaf)
+ vd->vdev_not_present = 1;
+ }
+
+ if (isopen)
+ return;
+
if (vd->vdev_parent != NULL) {
int c;
int degraded = 0, faulted = 0;
+ int corrupted = 0;
vdev_t *parent, *child;
parent = vd->vdev_parent;
@@ -1804,9 +1854,23 @@ vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux)
faulted++;
else if (child->vdev_state == VDEV_STATE_DEGRADED)
degraded++;
+
+ if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA)
+ corrupted++;
}
vd->vdev_parent->vdev_ops->vdev_op_state_change(
vd->vdev_parent, faulted, degraded);
- }
+
+ /*
+ * Root special: if this is a toplevel vdev that cannot be
+ * opened due to corrupted metadata, then propagate the root
+ * vdev's aux state as 'corrupt' rather than 'insufficient
+ * replicas'.
+ */
+ if (corrupted && vd == vd->vdev_top)
+ vdev_set_state(vd->vdev_spa->spa_root_vdev,
+ B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ }
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c
index e1e7c1a36f..67a8924b52 100644
--- a/usr/src/uts/common/fs/zfs/vdev_cache.c
+++ b/usr/src/uts/common/fs/zfs/vdev_cache.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -286,7 +285,8 @@ vdev_cache_read(zio_t *zio)
fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset,
ve->ve_data, vc->vc_blocksize, ZIO_TYPE_READ,
ZIO_PRIORITY_CACHE_FILL,
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK,
vdev_cache_fill, ve);
ve->ve_fill_io = fio;
diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c
index 1556c387b2..b4d7d7a0d2 100644
--- a/usr/src/uts/common/fs/zfs/vdev_disk.c
+++ b/usr/src/uts/common/fs/zfs/vdev_disk.c
@@ -323,6 +323,9 @@ vdev_disk_io_done(zio_t *zio)
if (zio->io_type == ZIO_TYPE_WRITE)
vdev_cache_write(zio);
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+
zio_next_stage(zio);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c
index a789008e17..a82abf80b7 100644
--- a/usr/src/uts/common/fs/zfs/vdev_file.c
+++ b/usr/src/uts/common/fs/zfs/vdev_file.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -190,6 +189,9 @@ vdev_file_io_done(zio_t *zio)
if (zio->io_type == ZIO_TYPE_WRITE)
vdev_cache_write(zio);
+ if (zio_injection_enabled && zio->io_error == 0)
+ zio->io_error = zio_handle_device_injection(zio->io_vd, EIO);
+
zio_next_stage(zio);
}
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index 1282df0d9a..3571be9064 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -165,8 +165,8 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
zio_nowait(zio_read_phys(zio, vd,
vdev_label_offset(vd->vdev_psize, l, offset),
size, buf, ZIO_CHECKSUM_LABEL, done, private,
- ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_SPECULATIVE |
- ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY));
+ ZIO_PRIORITY_SYNC_READ,
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
}
static void
@@ -178,8 +178,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
zio_nowait(zio_write_phys(zio, vd,
vdev_label_offset(vd->vdev_psize, l, offset),
size, buf, ZIO_CHECKSUM_LABEL, done, private,
- ZIO_PRIORITY_SYNC_WRITE,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY));
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL));
}
/*
@@ -190,7 +189,7 @@ vdev_config_generate(vdev_t *vd, int getstats)
{
nvlist_t *nv = NULL;
- VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0);
+ VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
vd->vdev_ops->vdev_op_type) == 0);
@@ -209,6 +208,9 @@ vdev_config_generate(vdev_t *vd, int getstats)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
vd->vdev_wholedisk) == 0);
+ if (vd->vdev_not_present)
+ VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0);
+
if (vd == vd->vdev_top) {
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
vd->vdev_ms_array) == 0);
@@ -269,7 +271,6 @@ vdev_label_read_config(vdev_t *vd)
{
nvlist_t *config = NULL;
vdev_phys_t *vp;
- uint64_t version;
zio_t *zio;
int l;
@@ -280,8 +281,8 @@ vdev_label_read_config(vdev_t *vd)
for (l = 0; l < VDEV_LABELS; l++) {
- zio = zio_root(vd->vdev_spa, NULL, NULL,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD);
+ zio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_CANFAIL |
+ ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD);
vdev_label_read(zio, vd, l, vp,
offsetof(vdev_label_t, vl_vdev_phys),
@@ -289,10 +290,7 @@ vdev_label_read_config(vdev_t *vd)
if (zio_wait(zio) == 0 &&
nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
- &config, 0) == 0 &&
- nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
- &version) == 0 &&
- version == UBERBLOCK_VERSION)
+ &config, 0) == 0)
break;
if (config != NULL) {
@@ -341,16 +339,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg)
* Check whether this device is already in use.
* Ignore the check if crtxg == 0, which we use for device removal.
*/
- if (crtxg != 0 && (label = vdev_label_read_config(vd)) != NULL) {
- uint64_t version, state, pool_guid, device_guid, txg;
+ if (crtxg != 0 &&
+ (label = vdev_label_read_config(vd)) != NULL) {
+ uint64_t state, pool_guid, device_guid, txg;
uint64_t mycrtxg = 0;
(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
&mycrtxg);
- if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION,
- &version) == 0 && version == UBERBLOCK_VERSION &&
- nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
&state) == 0 && state == POOL_STATE_ACTIVE &&
nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
&pool_guid) == 0 &&
@@ -390,7 +387,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg)
buf = vp->vp_nvlist;
buflen = sizeof (vp->vp_nvlist);
- if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) {
+ if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) != 0) {
nvlist_free(label);
zio_buf_free(vp, sizeof (vdev_phys_t));
return (EINVAL);
@@ -491,7 +488,7 @@ vdev_uberblock_load_done(zio_t *zio)
ASSERT3U(zio->io_size, ==, sizeof (uberblock_phys_t));
- if (uberblock_verify(ub) == 0) {
+ if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
mutex_enter(&spa->spa_uberblock_lock);
if (vdev_uberblock_compare(ub, ubbest) > 0)
*ubbest = *ub;
@@ -645,7 +642,7 @@ vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg)
buf = vp->vp_nvlist;
buflen = sizeof (vp->vp_nvlist);
- if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) == 0)
+ if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0)
vdev_label_write(zio, vd, l, vp,
offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t),
vdev_sync_label_done, NULL);
diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c
index 45eb7ce78b..b88b999c6f 100644
--- a/usr/src/uts/common/fs/zfs/vdev_mirror.c
+++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -209,7 +208,8 @@ vdev_mirror_io_start(zio_t *zio)
mm = vdev_mirror_map_alloc(zio);
if (zio->io_type == ZIO_TYPE_READ) {
- if (zio->io_flags & ZIO_FLAG_SCRUB) {
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
+ vd->vdev_ops != &vdev_replacing_ops) {
/*
* For scrubbing reads we need to allocate a read
* buffer for each child and issue reads to all
@@ -384,11 +384,12 @@ static void
vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
{
if (faulted == vd->vdev_children)
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
else if (degraded + faulted != 0)
- vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
else
- vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
vdev_ops_t vdev_mirror_ops = {
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index 09831e1504..bb838fedd1 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -103,6 +102,8 @@ vdev_queue_fini(vdev_t *vd)
{
vdev_queue_t *vq = &vd->vdev_queue;
+ ASSERT(vq->vq_scrub_count == 0);
+
avl_destroy(&vq->vq_deadline_tree);
avl_destroy(&vq->vq_read_tree);
avl_destroy(&vq->vq_write_tree);
@@ -112,6 +113,28 @@ vdev_queue_fini(vdev_t *vd)
}
static void
+vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
+{
+ avl_add(&vq->vq_deadline_tree, zio);
+ avl_add(zio->io_vdev_tree, zio);
+
+ if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) &&
+ ++vq->vq_scrub_count >= vq->vq_scrub_limit)
+ spa_scrub_throttle(zio->io_spa, 1);
+}
+
+static void
+vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
+{
+ if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) &&
+ vq->vq_scrub_count-- >= vq->vq_scrub_limit)
+ spa_scrub_throttle(zio->io_spa, -1);
+
+ avl_remove(&vq->vq_deadline_tree, zio);
+ avl_remove(zio->io_vdev_tree, zio);
+}
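Note the deliberate asymmetry above: the add path throttles when the pre-incremented count reaches vq_scrub_limit (new value >= limit), while the remove path unthrottles when the post-decremented count was still at or above it (old value >= limit). That pairing guarantees every throttle(+1) is matched by exactly one throttle(-1). A small user-space check of the invariant, with plain ints standing in for the queue and the spa:

#include <assert.h>

static int scrub_count, throttled;

static void
queue_add(int limit)
{
	if (++scrub_count >= limit)
		throttled++;		/* spa_scrub_throttle(spa, 1) */
}

static void
queue_remove(int limit)
{
	if (scrub_count-- >= limit)
		throttled--;		/* spa_scrub_throttle(spa, -1) */
}

int
main(void)
{
	const int limit = 3;

	for (int n = 0; n < 10; n++) {
		for (int i = 0; i < n; i++)
			queue_add(limit);
		for (int i = 0; i < n; i++)
			queue_remove(limit);
		assert(scrub_count == 0 && throttled == 0);
	}
	return (0);
}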
+
+static void
vdev_queue_agg_io_done(zio_t *aio)
{
zio_t *dio;
@@ -182,18 +205,19 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
aio = zio_vdev_child_io(fio, NULL, fio->io_vd,
fio->io_offset, buf, size, fio->io_type,
ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE |
- ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE,
+ ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE |
+ ZIO_FLAG_NOBOOKMARK,
vdev_queue_agg_io_done, NULL);
aio->io_delegate_list = fio;
for (dio = fio; dio != NULL; dio = dio->io_delegate_next) {
ASSERT(dio->io_type == aio->io_type);
+ ASSERT(dio->io_vdev_tree == tree);
if (dio->io_type == ZIO_TYPE_WRITE)
bcopy(dio->io_data, buf + offset, dio->io_size);
offset += dio->io_size;
- avl_remove(&vq->vq_deadline_tree, dio);
- avl_remove(tree, dio);
+ vdev_queue_io_remove(vq, dio);
zio_vdev_io_bypass(dio);
nagg++;
}
@@ -211,8 +235,8 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit,
return (aio);
}
- avl_remove(&vq->vq_deadline_tree, fio);
- avl_remove(tree, fio);
+ ASSERT(fio->io_vdev_tree == tree);
+ vdev_queue_io_remove(vq, fio);
avl_add(&vq->vq_pending_tree, fio);
@@ -245,8 +269,7 @@ vdev_queue_io(zio_t *zio)
zio->io_deadline = (zio->io_timestamp >> vq->vq_time_shift) +
zio->io_priority;
- avl_add(&vq->vq_deadline_tree, zio);
- avl_add(zio->io_vdev_tree, zio);
+ vdev_queue_io_add(vq, zio);
nio = vdev_queue_io_to_issue(vq, vq->vq_min_pending, &func);
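The vdev_queue_io_add()/vdev_queue_io_remove() pair above keeps spa_scrub_throttle() calls balanced by how each side compares against the limit: +1 fires when an incoming scrub/resilver i/o pushes the count up to the limit, and -1 fires when an i/o whose pre-decrement count was at or above the limit departs. A user-space model of just that bookkeeping (the limit value is arbitrary):

    #include <assert.h>

    static int scrub_count, scrub_limit = 2, throttle;

    static void
    io_add(void)
    {
            if (++scrub_count >= scrub_limit)       /* compare incremented count */
                    throttle++;
    }

    static void
    io_remove(void)
    {
            if (scrub_count-- >= scrub_limit)       /* compare count before decrement */
                    throttle--;
    }

    int
    main(void)
    {
            io_add(); io_add(); io_add();   /* counts 1,2,3: +1 fires at 2 and 3 */
            io_remove(); io_remove(); io_remove();
            assert(scrub_count == 0 && throttle == 0);
            return (0);
    }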
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index c2c4985856..157ae5001c 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -32,6 +31,7 @@
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
+#include <sys/fm/fs/zfs.h>
/*
* Virtual device vector for RAID-Z.
@@ -327,6 +327,28 @@ vdev_raidz_io_start(zio_t *zio)
zio_wait_children_done(zio);
}
+/*
+ * Report a checksum error for a child of a RAID-Z device.
+ */
+static void
+raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
+{
+ vdev_t *vd = zio->io_vd->vdev_child[rc->rc_col];
+ dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
+ vdev_description(vd));
+
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_checksum_errors++;
+ mutex_exit(&vd->vdev_stat_lock);
+
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
+ }
+}
+
static void
vdev_raidz_io_done(zio_t *zio)
{
@@ -398,8 +420,7 @@ vdev_raidz_io_done(zio_t *zio)
bcopy(rc->rc_data, orig, rc->rc_size);
vdev_raidz_reconstruct(rm, c);
if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) {
- vdev_checksum_error(zio,
- vd->vdev_child[rc->rc_col]);
+ raidz_checksum_error(zio, rc);
rc->rc_error = ECKSUM;
unexpected_errors++;
}
@@ -500,8 +521,7 @@ vdev_raidz_io_done(zio_t *zio)
* inform it.
*/
if (rc->rc_tried && rc->rc_error == 0)
- vdev_checksum_error(zio,
- vd->vdev_child[rc->rc_col]);
+ raidz_checksum_error(zio, rc);
rc->rc_error = ECKSUM;
goto done;
}
@@ -511,9 +531,18 @@ vdev_raidz_io_done(zio_t *zio)
}
/*
- * All combinations failed to checksum.
+ * All combinations failed to checksum. Generate checksum ereports for
+ * every one.
*/
zio->io_error = ECKSUM;
+ if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
+ for (c = 0; c < rm->rm_cols; c++) {
+ rc = &rm->rm_col[c];
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, vd->vdev_child[rc->rc_col], zio,
+ rc->rc_offset, rc->rc_size);
+ }
+ }
done:
zio_checksum_verified(zio);
@@ -558,11 +587,12 @@ static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
if (faulted > 1)
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
else if (degraded + faulted != 0)
- vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
else
- vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
vdev_ops_t vdev_raidz_ops = {
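When every reconstruction combination fails, the code above cannot pinpoint which column returned bad bytes, so it posts a checksum ereport against each one and lets the diagnosis engine correlate them by ENA. A reduced model of that loop, with printf standing in for zfs_ereport_post() and an invented column struct:

    #include <stdio.h>

    struct col { int id; long offset, size; };

    static void
    report_all_columns(const struct col *cols, int ncols)
    {
            int c;

            /* no winner: blame every column and let the DE sort it out */
            for (c = 0; c < ncols; c++)
                    printf("checksum ereport: col %d off %ld size %ld\n",
                        cols[c].id, cols[c].offset, cols[c].size);
    }

    int
    main(void)
    {
            struct col cols[2] = { { 0, 0, 512 }, { 1, 512, 512 } };

            report_all_columns(cols, 2);
            return (0);
    }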
diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c
index 4e44b5bb05..85671d00b1 100644
--- a/usr/src/uts/common/fs/zfs/vdev_root.c
+++ b/usr/src/uts/common/fs/zfs/vdev_root.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -79,11 +78,12 @@ static void
vdev_root_state_change(vdev_t *vd, int faulted, int degraded)
{
if (faulted > 0)
- vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_NO_REPLICAS);
else if (degraded != 0)
- vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
else
- vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
+ vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
vdev_ops_t vdev_root_ops = {
diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c
index 2866b7f729..8dc17ed4b1 100644
--- a/usr/src/uts/common/fs/zfs/zap.c
+++ b/usr/src/uts/common/fs/zfs/zap.c
@@ -45,6 +45,7 @@
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/zap.h>
+#include <sys/refcount.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
@@ -54,8 +55,8 @@ int fzap_default_block_shift = 14; /* 16k blocksize */
static void zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx);
static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx);
-static zap_leaf_t *zap_get_leaf_byblk(zap_t *zap, uint64_t blkid,
- dmu_tx_t *tx, krw_t lt);
+static int zap_get_leaf_byblk(zap_t *zap, uint64_t blkid,
+ dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp);
static void zap_leaf_pageout(dmu_buf_t *db, void *vl);
@@ -120,8 +121,8 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
/*
* set up block 1 - the first leaf
*/
- db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- 1<<FZAP_BLOCK_SHIFT(zap));
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db));
dmu_buf_will_dirty(db, tx);
l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
@@ -131,7 +132,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx)
zap_leaf_init(l);
kmem_free(l, sizeof (zap_leaf_t));
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
}
static int
@@ -157,6 +158,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
{
uint64_t b, newblk;
dmu_buf_t *db_old, *db_new;
+ int err;
int bs = FZAP_BLOCK_SHIFT(zap);
int hepb = 1<<(bs-4);
/* hepb = half the number of entries in a block */
@@ -181,26 +183,27 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
*/
b = tbl->zt_blks_copied;
- db_old = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + b) << bs);
- dmu_buf_read(db_old);
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + b) << bs, FTAG, &db_old);
+ if (err)
+ return;
/* first half of entries in old[b] go to new[2*b+0] */
- db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+0) << bs);
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+0) << bs, FTAG, &db_new));
dmu_buf_will_dirty(db_new, tx);
transfer_func(db_old->db_data, db_new->db_data, hepb);
- dmu_buf_rele(db_new);
+ dmu_buf_rele(db_new, FTAG);
/* second half of entries in old[b] go to new[2*b+1] */
- db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (newblk + 2*b+1) << bs);
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (newblk + 2*b+1) << bs, FTAG, &db_new));
dmu_buf_will_dirty(db_new, tx);
transfer_func((uint64_t *)db_old->db_data + hepb,
db_new->db_data, hepb);
- dmu_buf_rele(db_new);
+ dmu_buf_rele(db_new, FTAG);
- dmu_buf_rele(db_old);
+ dmu_buf_rele(db_old, FTAG);
tbl->zt_blks_copied++;
@@ -208,7 +211,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
tbl->zt_blks_copied, tbl->zt_numblks);
if (tbl->zt_blks_copied == tbl->zt_numblks) {
- dmu_free_range(zap->zap_objset, zap->zap_object,
+ (void) dmu_free_range(zap->zap_objset, zap->zap_object,
tbl->zt_blk << bs, tbl->zt_numblks << bs, tx);
tbl->zt_blk = newblk;
@@ -222,13 +225,14 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
}
}
-static uint64_t
+static int
zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
dmu_tx_t *tx)
{
- uint64_t blk, off, oldval;
- dmu_buf_t *db;
+ int err;
+ uint64_t blk, off;
int bs = FZAP_BLOCK_SHIFT(zap);
+ dmu_buf_t *db;
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
ASSERT(tbl->zt_blk != 0);
@@ -238,33 +242,41 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val,
blk = idx >> (bs-3);
off = idx & ((1<<(bs-3))-1);
- db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + blk) << bs);
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + blk) << bs, FTAG, &db);
+ if (err)
+ return (err);
dmu_buf_will_dirty(db, tx);
- oldval = ((uint64_t *)db->db_data)[off];
- ((uint64_t *)db->db_data)[off] = val;
- dmu_buf_rele(db);
if (tbl->zt_nextblk != 0) {
- idx *= 2;
- blk = idx >> (bs-3);
- off = idx & ((1<<(bs-3))-1);
-
- db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_nextblk + blk) << bs);
- dmu_buf_will_dirty(db, tx);
- ((uint64_t *)db->db_data)[off] = val;
- ((uint64_t *)db->db_data)[off+1] = val;
- dmu_buf_rele(db);
+ uint64_t idx2 = idx * 2;
+ uint64_t blk2 = idx2 >> (bs-3);
+ uint64_t off2 = idx2 & ((1<<(bs-3))-1);
+ dmu_buf_t *db2;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_nextblk + blk2) << bs, FTAG, &db2);
+ if (err) {
+ dmu_buf_rele(db, FTAG);
+ return (err);
+ }
+ dmu_buf_will_dirty(db2, tx);
+ ((uint64_t *)db2->db_data)[off2] = val;
+ ((uint64_t *)db2->db_data)[off2+1] = val;
+ dmu_buf_rele(db2, FTAG);
}
- return (oldval);
+ ((uint64_t *)db->db_data)[off] = val;
+ dmu_buf_rele(db, FTAG);
+
+ return (0);
}
-static uint64_t
-zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx)
+static int
+zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
{
- uint64_t blk, off, val;
+ uint64_t blk, off;
+ int err;
dmu_buf_t *db;
int bs = FZAP_BLOCK_SHIFT(zap);
@@ -273,12 +285,26 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx)
blk = idx >> (bs-3);
off = idx & ((1<<(bs-3))-1);
- db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (tbl->zt_blk + blk) << bs);
- dmu_buf_read(db);
- val = ((uint64_t *)db->db_data)[off];
- dmu_buf_rele(db);
- return (val);
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_blk + blk) << bs, FTAG, &db);
+ if (err)
+ return (err);
+ *valp = ((uint64_t *)db->db_data)[off];
+ dmu_buf_rele(db, FTAG);
+
+ if (tbl->zt_nextblk != 0) {
+ /*
+ * read the nextblk for the sake of i/o error checking,
+ * so that zap_table_load() will catch errors for
+ * zap_table_store.
+ */
+ blk = (idx*2) >> (bs-3);
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (tbl->zt_nextblk + blk) << bs, FTAG, &db);
+ dmu_buf_rele(db, FTAG);
+ }
+ return (err);
}
/*
@@ -310,19 +336,21 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx)
*/
uint64_t newblk;
dmu_buf_t *db_new;
+ int err;
ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==,
ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0);
newblk = zap_allocate_blocks(zap, 1, tx);
- db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- newblk << FZAP_BLOCK_SHIFT(zap));
-
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new);
+ if (err)
+ return;
dmu_buf_will_dirty(db_new, tx);
zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0),
db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap));
- dmu_buf_rele(db_new);
+ dmu_buf_rele(db_new, FTAG);
zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk;
zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1;
@@ -386,8 +414,8 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx)
l->l_dbuf = NULL;
l->l_phys = NULL;
- l->l_dbuf = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- l->l_blkid << FZAP_BLOCK_SHIFT(zap));
+ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf));
winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout);
ASSERT(winner == NULL);
dmu_buf_will_dirty(l->l_dbuf, tx);
@@ -403,7 +431,7 @@ zap_destroy_leaf(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx)
{
/* uint64_t offset = l->l_blkid << ZAP_BLOCK_SHIFT; */
rw_exit(&l->l_rwlock);
- dmu_buf_rele(l->l_dbuf);
+ dmu_buf_rele(l->l_dbuf, NULL);
/* XXX there are still holds on this block, so we can't free it? */
/* dmu_free_range(zap->zap_objset, zap->zap_object, */
/* offset, 1<<ZAP_BLOCK_SHIFT, tx); */
@@ -430,11 +458,11 @@ zap_put_leaf(zap_leaf_t *l)
while (nl) {
zap_leaf_t *nnl = nl->l_next;
rw_exit(&nl->l_rwlock);
- dmu_buf_rele(nl->l_dbuf);
+ dmu_buf_rele(nl->l_dbuf, NULL);
nl = nnl;
}
rw_exit(&l->l_rwlock);
- dmu_buf_rele(l->l_dbuf);
+ dmu_buf_rele(l->l_dbuf, NULL);
}
_NOTE(ARGSUSED(0))
@@ -489,23 +517,27 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db)
return (l);
}
-static zap_leaf_t *
-zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
+static int
+zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
+ zap_leaf_t **lp)
{
dmu_buf_t *db;
zap_leaf_t *l;
int bs = FZAP_BLOCK_SHIFT(zap);
+ int err;
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
- db = dmu_buf_hold(zap->zap_objset, zap->zap_object, blkid << bs);
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ blkid << bs, NULL, &db);
+ if (err)
+ return (err);
ASSERT3U(db->db_object, ==, zap->zap_object);
ASSERT3U(db->db_offset, ==, blkid << bs);
ASSERT3U(db->db_size, ==, 1 << bs);
ASSERT(blkid != 0);
- dmu_buf_read(db);
l = dmu_buf_get_user(db);
if (l == NULL)
@@ -524,43 +556,53 @@ zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
ASSERT3U(l->lh_block_type, ==, ZBT_LEAF);
ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC);
- return (l);
+ *lp = l;
+ return (0);
}
-static zap_leaf_t *
-zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt)
+static int
+zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt,
+ zap_leaf_t **lp)
{
- zap_leaf_t *l, *nl;
+ int err;
+ zap_leaf_t *nl;
- l = zap_get_leaf_byblk_impl(zap, blkid, tx, lt);
+ err = zap_get_leaf_byblk_impl(zap, blkid, tx, lt, lp);
+ if (err)
+ return (err);
- nl = l;
+ nl = *lp;
while (nl->lh_next != 0) {
zap_leaf_t *nnl;
- nnl = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt);
+ err = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt, &nnl);
+ if (err) {
+ zap_put_leaf(*lp);
+ return (err);
+ }
nl->l_next = nnl;
nl = nnl;
}
- return (l);
+ return (err);
}
-static uint64_t
-zap_idx_to_blk(zap_t *zap, uint64_t idx)
+static int
+zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp)
{
ASSERT(RW_LOCK_HELD(&zap->zap_rwlock));
if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) {
ASSERT3U(idx, <,
(1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift));
- return (ZAP_EMBEDDED_PTRTBL_ENT(zap, idx));
+ *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx);
+ return (0);
} else {
return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
- idx));
+ idx, valp));
}
}
-static void
+static int
zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
{
ASSERT(tx != NULL);
@@ -568,32 +610,37 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx)
if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) {
ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk;
+ return (0);
} else {
- (void) zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
- idx, blk, tx);
+ return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl,
+ idx, blk, tx));
}
}
-static zap_leaf_t *
-zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt)
+static int
+zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp)
{
- uint64_t idx;
- zap_leaf_t *l;
+ uint64_t idx, blk;
+ int err;
ASSERT(zap->zap_dbuf == NULL ||
zap->zap_f.zap_phys == zap->zap_dbuf->db_data);
ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC);
idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift);
- l = zap_get_leaf_byblk(zap, zap_idx_to_blk(zap, idx), tx, lt);
-
- ASSERT3U(ZAP_HASH_IDX(h, l->lh_prefix_len), ==, l->lh_prefix);
+ err = zap_idx_to_blk(zap, idx, &blk);
+ if (err != 0)
+ return (err);
+ err = zap_get_leaf_byblk(zap, blk, tx, lt, lp);
- return (l);
+ ASSERT(err ||
+ ZAP_HASH_IDX(h, (*lp)->lh_prefix_len) == (*lp)->lh_prefix);
+ return (err);
}
-static zap_leaf_t *
-zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
+static int
+zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx,
+ zap_leaf_t **lp)
{
zap_leaf_t *nl;
int prefix_diff, i, err;
@@ -616,11 +663,13 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap);
ASSERT3U(err, ==, 0);
ASSERT(!zap->zap_ismicro);
- l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+ (void) zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
- if (l->lh_prefix_len != old_prefix_len)
+ if (l->lh_prefix_len != old_prefix_len) {
/* it split while our locks were down */
- return (l);
+ *lp = l;
+ return (0);
+ }
}
ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
@@ -629,21 +678,33 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
(void) zap_leaf_chainmore(l, zap_create_leaf(zap, tx));
dprintf("chaining leaf %x/%d\n", l->lh_prefix,
l->lh_prefix_len);
- return (l);
+ *lp = l;
+ return (0);
}
ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix);
/* There's more than one pointer to us. Split this leaf. */
- nl = zap_leaf_split(zap, l, tx);
/* set sibling pointers */
prefix_diff =
- zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len;
- sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len) | 1) << prefix_diff;
+ zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - (l->lh_prefix_len + 1);
+ sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len + 1) | 1) << prefix_diff;
+
+ /* check for i/o errors before doing zap_leaf_split */
for (i = 0; i < (1ULL<<prefix_diff); i++) {
- ASSERT3U(zap_idx_to_blk(zap, sibling+i), ==, l->l_blkid);
- zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
+ uint64_t blk;
+ err = zap_idx_to_blk(zap, sibling+i, &blk);
+ if (err)
+ return (err);
+ ASSERT3U(blk, ==, l->l_blkid);
+ }
+
+ nl = zap_leaf_split(zap, l, tx);
+
+ for (i = 0; i < (1ULL<<prefix_diff); i++) {
+ err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx);
+ ASSERT3U(err, ==, 0); /* we checked for i/o errors above */
/* dprintf("set %d to %u %x\n", sibling+i, nl->l_blkid, nl); */
}
@@ -657,7 +718,8 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx)
zap_put_leaf(nl);
}
- return (l);
+ *lp = l;
+ return (0);
}
static void
@@ -682,7 +744,8 @@ again:
err = zap_lockdir(os, zapobj, tx,
RW_WRITER, FALSE, &zap);
ASSERT3U(err, ==, 0);
- l = zap_get_leaf_byblk(zap, blkid, tx, RW_READER);
+ (void) zap_get_leaf_byblk(zap, blkid, tx,
+ RW_READER, &l);
goto again;
}
@@ -734,7 +797,9 @@ fzap_lookup(zap_t *zap, const char *name,
return (err);
hash = zap_hash(zap, name);
- l = zap_deref_leaf(zap, hash, NULL, RW_READER);
+ err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
err = zap_leaf_lookup(l, name, hash, &zeh);
if (err != 0)
goto out;
@@ -747,7 +812,7 @@ out:
int
fzap_add_cd(zap_t *zap, const char *name,
uint64_t integer_size, uint64_t num_integers,
- const void *val, uint32_t cd, dmu_tx_t *tx, zap_leaf_t **lp)
+ const void *val, uint32_t cd, dmu_tx_t *tx)
{
zap_leaf_t *l;
uint64_t hash;
@@ -759,14 +824,17 @@ fzap_add_cd(zap_t *zap, const char *name,
ASSERT(fzap_checksize(integer_size, num_integers) == 0);
hash = zap_hash(zap, name);
- l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
retry:
err = zap_leaf_lookup(l, name, hash, &zeh);
if (err == 0) {
err = EEXIST;
goto out;
}
- ASSERT(err == ENOENT);
+ if (err != ENOENT)
+ goto out;
/* XXX If this leaf is chained, split it if we can. */
err = zap_entry_create(l, name, hash, cd,
@@ -775,15 +843,14 @@ retry:
if (err == 0) {
zap_increment_num_entries(zap, 1, tx);
} else if (err == EAGAIN) {
- l = zap_expand_leaf(zap, l, hash, tx);
+ err = zap_expand_leaf(zap, l, hash, tx, &l);
+ if (err != 0)
+ goto out;
goto retry;
}
out:
- if (lp)
- *lp = l;
- else
- zap_put_leaf(l);
+ zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
return (err);
}
@@ -793,16 +860,14 @@ fzap_add(zap_t *zap, const char *name,
const void *val, dmu_tx_t *tx)
{
int err;
- zap_leaf_t *l;
err = fzap_checksize(integer_size, num_integers);
if (err != 0)
return (err);
err = fzap_add_cd(zap, name, integer_size, num_integers,
- val, ZAP_MAXCD, tx, &l);
+ val, ZAP_MAXCD, tx);
- zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
return (err);
}
@@ -821,7 +886,9 @@ fzap_update(zap_t *zap, const char *name,
return (err);
hash = zap_hash(zap, name);
- l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
retry:
err = zap_leaf_lookup(l, name, hash, &zeh);
create = (err == ENOENT);
@@ -839,10 +906,13 @@ retry:
}
if (err == EAGAIN) {
- l = zap_expand_leaf(zap, l, hash, tx);
+ err = zap_expand_leaf(zap, l, hash, tx, &l);
+ if (err != 0)
+ goto out;
goto retry;
}
+out:
zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx);
return (err);
}
@@ -857,7 +927,9 @@ fzap_length(zap_t *zap, const char *name,
zap_entry_handle_t zeh;
hash = zap_hash(zap, name);
- l = zap_deref_leaf(zap, hash, NULL, RW_READER);
+ err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l);
+ if (err != 0)
+ return (err);
err = zap_leaf_lookup(l, name, hash, &zeh);
if (err != 0)
goto out;
@@ -880,7 +952,9 @@ fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx)
zap_entry_handle_t zeh;
hash = zap_hash(zap, name);
- l = zap_deref_leaf(zap, hash, tx, RW_WRITER);
+ err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l);
+ if (err != 0)
+ return (err);
err = zap_leaf_lookup(l, name, hash, &zeh);
if (err == 0) {
zap_entry_remove(&zeh);
@@ -938,7 +1012,10 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za)
again:
if (zc->zc_leaf == NULL) {
- zc->zc_leaf = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER);
+ err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER,
+ &zc->zc_leaf);
+ if (err != 0)
+ return (err);
} else {
rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
}
@@ -982,7 +1059,7 @@ again:
static void
zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
{
- int i;
+ int i, err;
uint64_t lastblk = 0;
/*
@@ -997,10 +1074,11 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs)
continue;
lastblk = tbl[i];
- l = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER);
-
- zap_stats_leaf(zap, l, zs);
- zap_put_leaf(l);
+ err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l);
+ if (err == 0) {
+ zap_stats_leaf(zap, l, zs);
+ zap_put_leaf(l);
+ }
}
}
@@ -1028,12 +1106,16 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs)
for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks;
b++) {
dmu_buf_t *db;
-
- db = dmu_buf_hold(zap->zap_objset, zap->zap_object,
- (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs);
- dmu_buf_read(db);
- zap_stats_ptrtbl(zap, db->db_data, 1<<(bs-3), zs);
- dmu_buf_rele(db);
+ int err;
+
+ err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
+ (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs,
+ FTAG, &db);
+ if (err == 0) {
+ zap_stats_ptrtbl(zap, db->db_data,
+ 1<<(bs-3), zs);
+ dmu_buf_rele(db, FTAG);
+ }
}
}
}
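Nearly every zap.c hunk above is the same mechanical transformation: a function that returned its result directly (and so had no way to report an i/o error) now returns an int and hands the result back through an out parameter, and each caller propagates the error instead of asserting. A stand-alone illustration of the pattern, with invented names and a fake failure condition in place of dmu_buf_hold():

    #include <errno.h>
    #include <stdio.h>
    #include <stdint.h>

    /* before: uint64_t table_load(uint64_t idx); -- could not fail */
    static int
    table_load(uint64_t idx, uint64_t *valp)
    {
            if (idx > 100)          /* stand-in for a failed buffer hold */
                    return (EIO);
            *valp = idx * 2;        /* stand-in for the stored block pointer */
            return (0);
    }

    static int
    deref_leaf(uint64_t idx, uint64_t *blkp)
    {
            int err = table_load(idx, blkp);

            return (err);           /* propagate instead of panicking */
    }

    int
    main(void)
    {
            uint64_t blk;

            printf("ok:  err=%d\n", deref_leaf(7, &blk));
            printf("eio: err=%d\n", deref_leaf(101, &blk));
            return (0);
    }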
diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c b/usr/src/uts/common/fs/zfs/zap_micro.c
index 3e150b9b1d..2d3180e37f 100644
--- a/usr/src/uts/common/fs/zfs/zap_micro.c
+++ b/usr/src/uts/common/fs/zfs/zap_micro.c
@@ -29,6 +29,7 @@
#include <sys/dmu.h>
#include <sys/zfs_context.h>
#include <sys/zap.h>
+#include <sys/refcount.h>
#include <sys/zap_impl.h>
#include <sys/zap_leaf.h>
#include <sys/avl.h>
@@ -269,7 +270,9 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
*zapp = NULL;
- db = dmu_buf_hold(os, obj, 0);
+ err = dmu_buf_hold(os, obj, 0, NULL, &db);
+ if (err)
+ return (err);
#ifdef ZFS_DEBUG
{
@@ -279,12 +282,6 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx,
}
#endif
- /*
- * The zap can deal with EIO here, but its callers don't yet, so
- * spare them by doing a mustsucceed read.
- */
- dmu_buf_read(db);
-
zap = dmu_buf_get_user(db);
if (zap == NULL)
zap = mzap_open(os, obj, db);
@@ -340,7 +337,7 @@ void
zap_unlockdir(zap_t *zap)
{
rw_exit(&zap->zap_rwlock);
- dmu_buf_rele(zap->zap_dbuf);
+ dmu_buf_rele(zap->zap_dbuf, NULL);
}
static void
@@ -375,7 +372,7 @@ mzap_upgrade(zap_t *zap, dmu_tx_t *tx)
mze->mze_name, mze->mze_value);
err = fzap_add_cd(zap,
mze->mze_name, 8, 1, &mze->mze_value,
- mze->mze_cd, tx, NULL);
+ mze->mze_cd, tx);
ASSERT3U(err, ==, 0);
}
kmem_free(mzp, sz);
@@ -411,7 +408,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx)
dmu_buf_t *db;
mzap_phys_t *zp;
- db = dmu_buf_hold(os, obj, 0);
+ VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db));
#ifdef ZFS_DEBUG
{
@@ -426,7 +423,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx)
zp->mz_block_type = ZBT_MICRO;
zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
ASSERT(zp->mz_salt != 0);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
}
int
diff --git a/usr/src/uts/common/fs/zfs/zfs_acl.c b/usr/src/uts/common/fs/zfs/zfs_acl.c
index 69acccf493..c70986b853 100644
--- a/usr/src/uts/common/fs/zfs/zfs_acl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_acl.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -288,25 +287,33 @@ zfs_acl_node_read_internal(znode_t *zp)
/*
* Read an external acl object.
*/
-zfs_acl_t *
-zfs_acl_node_read(znode_t *zp)
+static int
+zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp)
{
uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj;
zfs_acl_t *aclp;
+ int error;
ASSERT(MUTEX_HELD(&zp->z_acl_lock));
- if (zp->z_phys->zp_acl.z_acl_extern_obj == 0)
- return (zfs_acl_node_read_internal(zp));
+ if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) {
+ *aclpp = zfs_acl_node_read_internal(zp);
+ return (0);
+ }
aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count);
- dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
+ error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0,
ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl);
+ if (error != 0) {
+ zfs_acl_free(aclp);
+ return (error);
+ }
aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count;
- return (aclp);
+ *aclpp = aclp;
+ return (0);
}
static boolean_t
@@ -868,15 +875,17 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp,
int
zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx)
{
- zfs_acl_t *aclp;
+ zfs_acl_t *aclp = NULL;
int error;
ASSERT(MUTEX_HELD(&zp->z_lock));
mutex_enter(&zp->z_acl_lock);
- aclp = zfs_acl_node_read(zp);
- error = zfs_acl_chmod(zp, mode, aclp, tx);
+ error = zfs_acl_node_read(zp, &aclp);
+ if (error == 0)
+ error = zfs_acl_chmod(zp, mode, aclp, tx);
mutex_exit(&zp->z_acl_lock);
- zfs_acl_free(aclp);
+ if (aclp)
+ zfs_acl_free(aclp);
return (error);
}
@@ -1047,7 +1056,7 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag,
pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE);
if (pull_down) {
mutex_enter(&parent->z_acl_lock);
- paclp = zfs_acl_node_read(parent);
+ VERIFY(0 == zfs_acl_node_read(parent, &paclp));
mutex_exit(&parent->z_acl_lock);
aclp = zfs_acl_inherit(zp, paclp);
zfs_acl_free(paclp);
@@ -1106,7 +1115,12 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr)
mutex_enter(&zp->z_acl_lock);
- aclp = zfs_acl_node_read(zp);
+ error = zfs_acl_node_read(zp, &aclp);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
if (mask & VSA_ACECNT) {
vsecp->vsa_aclcnt = aclp->z_acl_count;
@@ -1240,6 +1254,7 @@ zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr)
int mode_wanted = v4_mode;
int cnt;
int i;
+ int error;
int access_deny = ACCESS_UNDETERMINED;
uint_t entry_type;
uid_t uid = crgetuid(cr);
@@ -1257,7 +1272,12 @@ zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr)
mutex_enter(&zp->z_acl_lock);
- aclp = zfs_acl_node_read(zp);
+ error = zfs_acl_node_read(zp, &aclp);
+ if (error != 0) {
+ mutex_exit(&zp->z_acl_lock);
+ return (error);
+ }
+
zacep = aclp->z_acl;
cnt = aclp->z_acl_count;
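All three call sites above settle on the same shape once zfs_acl_node_read() can fail: take z_acl_lock, attempt the read, and make sure the lock is dropped on the error path before returning. A minimal pthreads model of the idiom; acl_node_read() here merely simulates a failed dmu_read(), and all names are invented:

    #include <errno.h>
    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t acl_lock = PTHREAD_MUTEX_INITIALIZER;

    static int
    acl_node_read(void **aclpp)
    {
            *aclpp = NULL;
            return (EIO);           /* simulate an i/o error */
    }

    static int
    get_acl(void **aclpp)
    {
            int error;

            pthread_mutex_lock(&acl_lock);
            error = acl_node_read(aclpp);
            pthread_mutex_unlock(&acl_lock);        /* both paths unlock */
            return (error);
    }

    int
    main(void)
    {
            void *acl;

            printf("error = %d\n", get_acl(&acl));
            return (0);
    }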
diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c
index ebdce10c33..d73315b47d 100644
--- a/usr/src/uts/common/fs/zfs/zfs_dir.c
+++ b/usr/src/uts/common/fs/zfs/zfs_dir.c
@@ -289,6 +289,21 @@ zfs_dq_hexname(char namebuf[17], uint64_t x)
return (name);
}
+/*
+ * Delete Queue Error Handling
+ *
+ * When dealing with the delete queue, we dmu_tx_hold_zap(), but we
+ * don't specify the name of the entry that we will be manipulating. We
+ * also fib and say that we won't be adding any new entries to the
+ * delete queue, even though we might (this is to lower the minimum file
+ * size that can be deleted in a full filesystem). So on the small
+ * chance that the delete queue is using a fat zap (ie. has more than
+ * 2000 entries), we *may* not pre-read a block that's needed.
+ * Therefore it is remotely possible for some of the assertions
+ * regarding the delete queue below to fail due to i/o error. On a
+ * nondebug system, this will result in the space being leaked.
+ */
+
void
zfs_dq_add(znode_t *zp, dmu_tx_t *tx)
{
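As promised in the comment above, here is a sketch of how the reworked dmu_tx_hold_zap() gets invoked for the delete queue. The stub below only mirrors the call shape visible in this diff (tx, zap object, add flag, optional entry name); its body, the object number, and the stub name are invented:

    #include <stdio.h>
    #include <stdbool.h>

    static void
    dmu_tx_hold_zap_stub(void *tx, unsigned long obj, bool add,
        const char *name)
    {
            printf("hold zap obj %lu add=%d name=%s\n",
                obj, (int)add, name ? name : "<none>");
    }

    int
    main(void)
    {
            void *tx = NULL;
            unsigned long dqueue_obj = 42;  /* invented object number */

            /* the fib: claim no new entries to keep the hold small */
            dmu_tx_hold_zap_stub(tx, dqueue_obj, false, NULL);
            return (0);
    }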
@@ -338,9 +353,9 @@ zfs_purgedir(znode_t *dzp)
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, -1);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name);
dmu_tx_hold_bonus(tx, xzp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
dmu_tx_abort(tx);
@@ -579,10 +594,10 @@ zfs_rmnode(znode_t *zp)
*/
tx = dmu_tx_create(os);
dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
- dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL);
if (xzp) {
dmu_tx_hold_bonus(tx, xzp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, TRUE, NULL);
}
if (acl_obj)
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
@@ -764,7 +779,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr)
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, zp->z_id);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
error = dmu_tx_assign(tx, zfsvfs->z_assign);
if (error) {
dmu_tx_abort(tx);
diff --git a/usr/src/uts/common/fs/zfs/zfs_fm.c b/usr/src/uts/common/fs/zfs/zfs_fm.c
new file mode 100644
index 0000000000..007445c713
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zfs_fm.c
@@ -0,0 +1,316 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+#include <sys/fm/fs/zfs.h>
+#include <sys/fm/protocol.h>
+#include <sys/fm/util.h>
+#include <sys/sysevent.h>
+
+/*
+ * This general routine is responsible for generating all the different ZFS
+ * ereports. The payload is dependent on the class, and which arguments are
+ * supplied to the function:
+ *
+ * EREPORT POOL VDEV IO
+ * block X X X
+ * data X X
+ * device X X
+ * pool X
+ *
+ * If we are in a loading state, all errors are chained together by the same
+ * SPA-wide ENA.
+ *
+ * For isolated I/O requests, we get the ENA from the zio_t. The propagation
+ * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want
+ * to chain together all ereports associated with a logical piece of data. For
+ * read I/Os, there are basically three 'types' of I/O, which form a roughly
+ * layered diagram:
+ *
+ * +---------------+
+ * | Aggregate I/O | No associated logical data or device
+ * +---------------+
+ * |
+ * V
+ * +---------------+ Reads associated with a piece of logical data.
+ * | Read I/O | This includes reads on behalf of RAID-Z,
+ * +---------------+ mirrors, gang blocks, retries, etc.
+ * |
+ * V
+ * +---------------+ Reads associated with a particular device, but
+ * | Physical I/O | no logical data. Issued as part of vdev caching
+ * +---------------+ and I/O aggregation.
+ *
+ * Note that 'physical I/O' here is not the same terminology as used in the rest
+ * of ZIO. Typically, 'physical I/O' simply means that there is no attached
+ * blockpointer. But I/O with no associated block pointer can still be related
+ * to a logical piece of data (i.e. RAID-Z requests).
+ *
+ * Purely physical I/Os always have unique ENAs. They are not related to a
+ * particular piece of logical data, and therefore cannot be chained together.
+ * We still generate an ereport, but the DE doesn't correlate it with any
+ * logical piece of data. When such an I/O fails, the delegated I/O requests
+ * will issue a retry, which will trigger the 'real' ereport with the correct
+ * ENA.
+ *
+ * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
+ * When a new logical I/O is issued, we set this to point to itself. Child I/Os
+ * then inherit this pointer, so that when it is first set subsequent failures
+ * will use the same ENA. If a physical I/O is issued (by passing the
+ * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a
+ * unique ENA will be generated. For an aggregate I/O, this pointer is set to
+ * NULL, and no ereport will be generated (since it doesn't actually correspond
+ * to any particular device or piece of data).
+ */
+void
+zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio,
+ uint64_t stateoroffset, uint64_t size)
+{
+#ifdef _KERNEL
+ nvlist_t *ereport, *detector;
+ uint64_t ena;
+ char class[64];
+
+ /*
+ * If we are doing a spa_tryimport(), ignore errors.
+ */
+ if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+ return;
+
+ /*
+ * If we are in the middle of opening a pool, and the previous attempt
+ * failed, don't bother logging any new ereports - we're just going to
+ * get the same diagnosis anyway.
+ */
+ if (spa->spa_load_state != SPA_LOAD_NONE &&
+ spa->spa_last_open_failed)
+ return;
+
+ /*
+ * Ignore any errors from I/Os that we are going to retry anyway - we
+ * only generate errors from the final failure.
+ */
+ if (zio && zio_should_retry(zio))
+ return;
+
+ if ((ereport = fm_nvlist_create(NULL)) == NULL)
+ return;
+
+ if ((detector = fm_nvlist_create(NULL)) == NULL) {
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ return;
+ }
+
+ /*
+ * Serialize ereport generation
+ */
+ mutex_enter(&spa->spa_errlist_lock);
+
+ /*
+ * Determine the ENA to use for this event. If we are in a loading
+ * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use
+ * a root zio-wide ENA. Otherwise, simply use a unique ENA.
+ */
+ if (spa->spa_load_state != SPA_LOAD_NONE) {
+ if (spa->spa_ena == 0)
+ spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
+ ena = spa->spa_ena;
+ } else if (zio != NULL && zio->io_logical != NULL) {
+ if (zio->io_logical->io_ena == 0)
+ zio->io_logical->io_ena =
+ fm_ena_generate(0, FM_ENA_FMT1);
+ ena = zio->io_logical->io_ena;
+ } else {
+ ena = fm_ena_generate(0, FM_ENA_FMT1);
+ }
+
+ /*
+ * Construct the full class, detector, and other standard FMA fields.
+ */
+ (void) snprintf(class, sizeof (class), "%s.%s",
+ ZFS_ERROR_CLASS, subclass);
+
+ fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
+ vd != NULL ? vd->vdev_guid : 0);
+
+ fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
+
+ /*
+ * Construct the per-ereport payload, depending on which parameters are
+ * passed in.
+ */
+
+ /*
+ * Generic payload members common to all ereports.
+ *
+ * The direct reference to spa_name is used rather than spa_name()
+ * because of the asynchronous nature of the zio pipeline. spa_name()
+ * asserts that the config lock is held in some form. This is always
+ * the case in I/O context, but because the check for RW_WRITER compares
+ * against 'curthread', we may be in an asynchronous context and blow
+ * this assert. Rather than loosen this assert, we acknowledge that all
+ * contexts in which this function is called (pool open, I/O) are safe,
+ * and dereference the name directly.
+ */
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL,
+ DATA_TYPE_STRING, spa->spa_name, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID,
+ DATA_TYPE_UINT64, spa_guid(spa),
+ FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
+ spa->spa_load_state, NULL);
+
+ if (vd != NULL) {
+ vdev_t *pvd = vd->vdev_parent;
+
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
+ DATA_TYPE_UINT64, vd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
+ DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
+ if (vd->vdev_path)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
+ DATA_TYPE_STRING, vd->vdev_path, NULL);
+ if (vd->vdev_devid)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
+ DATA_TYPE_STRING, vd->vdev_devid, NULL);
+
+ if (pvd != NULL) {
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
+ DATA_TYPE_UINT64, pvd->vdev_guid,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
+ DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
+ NULL);
+ if (pvd->vdev_path)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
+ DATA_TYPE_STRING, pvd->vdev_path, NULL);
+ if (pvd->vdev_devid)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
+ DATA_TYPE_STRING, pvd->vdev_devid, NULL);
+ }
+ }
+
+ if (zio != NULL) {
+ /*
+ * Payload common to all I/Os.
+ */
+ fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
+ DATA_TYPE_INT32, zio->io_error, NULL);
+
+ /*
+ * If the 'size' parameter is non-zero, it indicates this is a
+ * RAID-Z or other I/O where the physical offset and length are
+ * provided for us, instead of within the zio_t.
+ */
+ if (vd != NULL) {
+ if (size)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ DATA_TYPE_UINT64, stateoroffset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, size, NULL);
+ else
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
+ DATA_TYPE_UINT64, zio->io_offset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
+ DATA_TYPE_UINT64, zio->io_size, NULL);
+ }
+
+ /*
+ * Payload for I/Os with corresponding logical information.
+ */
+ if (zio->io_logical != NULL)
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_objset,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_object,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
+ DATA_TYPE_INT32,
+ zio->io_logical->io_bookmark.zb_level,
+ FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
+ DATA_TYPE_UINT64,
+ zio->io_logical->io_bookmark.zb_blkid, NULL);
+ } else if (vd != NULL) {
+ /*
+ * If we have a vdev but no zio, this is a device fault, and the
+ * 'stateoroffset' parameter indicates the previous state of the
+ * vdev.
+ */
+ fm_payload_set(ereport,
+ FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
+ DATA_TYPE_UINT64, stateoroffset, NULL);
+ }
+ mutex_exit(&spa->spa_errlist_lock);
+
+ fm_ereport_post(ereport, EVCH_SLEEP);
+
+ fm_nvlist_destroy(ereport, FM_NVA_FREE);
+ fm_nvlist_destroy(detector, FM_NVA_FREE);
+#endif
+}
+
+/*
+ * The 'resource.fs.zfs.ok' event is an internal signal that the associated
+ * resource (pool or disk) has been identified by ZFS as healthy. This will
+ * then trigger the DE to close the associated case, if any.
+ */
+void
+zfs_post_ok(spa_t *spa, vdev_t *vd)
+{
+#ifdef _KERNEL
+ nvlist_t *resource;
+ char class[64];
+
+ if ((resource = fm_nvlist_create(NULL)) == NULL)
+ return;
+
+ (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE,
+ ZFS_ERROR_CLASS, FM_RESOURCE_OK);
+ VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0);
+ VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0);
+ VERIFY(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0);
+ if (vd)
+ VERIFY(nvlist_add_uint64(resource,
+ FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0);
+
+ fm_ereport_post(resource, EVCH_SLEEP);
+
+ fm_nvlist_destroy(resource, FM_NVA_FREE);
+#endif
+}
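A compact model of the ENA selection that zfs_ereport_post() performs above: errors during pool load all share one spa-wide ENA, errors under the same logical i/o share that i/o's ENA, and anything else gets a fresh one. fresh_ena() is a stand-in for fm_ena_generate(), and the structs are skeletal sketches of spa_t and zio_t, not the real definitions:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t next_ena = 1;
    static uint64_t fresh_ena(void) { return (next_ena++); }

    struct spa { int loading; uint64_t ena; };
    struct zio { struct zio *logical; uint64_t ena; };

    static uint64_t
    pick_ena(struct spa *spa, struct zio *zio)
    {
            if (spa->loading) {
                    if (spa->ena == 0)
                            spa->ena = fresh_ena();
                    return (spa->ena);              /* chain load errors */
            }
            if (zio != NULL && zio->logical != NULL) {
                    if (zio->logical->ena == 0)
                            zio->logical->ena = fresh_ena();
                    return (zio->logical->ena);     /* chain the zio tree */
            }
            return (fresh_ena());                   /* isolated/physical i/o */
    }

    int
    main(void)
    {
            struct spa spa = { 0, 0 };
            struct zio logical = { NULL, 0 };
            struct zio child1 = { &logical, 0 }, child2 = { &logical, 0 };

            /* both children report under the same ENA: prints "1 1" */
            printf("%llu %llu\n",
                (unsigned long long)pick_ena(&spa, &child1),
                (unsigned long long)pick_ena(&spa, &child2));
            return (0);
    }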
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 29b01e4331..422b24a993 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -297,6 +297,16 @@ zfs_secpolicy_config(const char *unused, const char *unused2, cred_t *cr)
}
/*
+ * Policy for fault injection. Requires all privileges.
+ */
+/* ARGSUSED */
+static int
+zfs_secpolicy_inject(const char *unused, const char *unused2, cred_t *cr)
+{
+ return (secpolicy_zinject(cr));
+}
+
+/*
* Returns the nvlist as specified by the user in the zfs_cmd_t.
*/
static int
@@ -368,7 +378,7 @@ zfs_ioc_pool_import(zfs_cmd_t *zc)
return (error);
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 ||
- guid != zc->zc_pool_guid)
+ guid != zc->zc_guid)
error = EINVAL;
else
error = spa_import(zc->zc_name, config,
@@ -396,7 +406,8 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc)
if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL)
return (EEXIST);
- VERIFY(nvlist_pack(configs, &packed, &size, NV_ENCODE_NATIVE, 0) == 0);
+ VERIFY(nvlist_pack(configs, &packed, &size, NV_ENCODE_NATIVE,
+ KM_SLEEP) == 0);
if (size > zc->zc_config_dst_size)
error = ENOMEM;
@@ -420,7 +431,7 @@ zfs_ioc_pool_guid(zfs_cmd_t *zc)
error = spa_open(zc->zc_name, &spa, FTAG);
if (error == 0) {
- zc->zc_pool_guid = spa_guid(spa);
+ zc->zc_guid = spa_guid(spa);
spa_close(spa, FTAG);
}
return (error);
@@ -433,28 +444,37 @@ zfs_ioc_pool_stats(zfs_cmd_t *zc)
char *packed = NULL;
size_t size = 0;
int error;
+ int ret = 0;
- error = spa_get_stats(zc->zc_name, &config);
+ error = spa_get_stats(zc->zc_name, &config, zc->zc_root,
+ sizeof (zc->zc_root));
if (config != NULL) {
VERIFY(nvlist_pack(config, &packed, &size,
- NV_ENCODE_NATIVE, 0) == 0);
+ NV_ENCODE_NATIVE, KM_SLEEP) == 0);
if (size > zc->zc_config_dst_size)
- error = ENOMEM;
+ ret = ENOMEM;
else if (xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst,
size))
- error = EFAULT;
+ ret = EFAULT;
zc->zc_config_dst_size = size;
kmem_free(packed, size);
nvlist_free(config);
+
+ /*
+ * The config may be present even if 'error' is non-zero.
+ * In this case we return success, and preserve the real errno
+ * in 'zc_cookie'.
+ */
+ zc->zc_cookie = error;
} else {
- ASSERT(error != 0);
+ ret = error;
}
- return (error);
+ return (ret);
}
/*
@@ -479,7 +499,8 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc)
if (config == NULL)
return (EINVAL);
- VERIFY(nvlist_pack(config, &packed, &size, NV_ENCODE_NATIVE, 0) == 0);
+ VERIFY(nvlist_pack(config, &packed, &size, NV_ENCODE_NATIVE,
+ KM_SLEEP) == 0);
if (size > zc->zc_config_dst_size)
error = ENOMEM;
@@ -554,13 +575,12 @@ static int
zfs_ioc_vdev_online(zfs_cmd_t *zc)
{
spa_t *spa;
- char *path = zc->zc_prop_value;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
- error = vdev_online(spa, path);
+ error = vdev_online(spa, zc->zc_guid);
spa_close(spa, FTAG);
return (error);
}
@@ -569,14 +589,13 @@ static int
zfs_ioc_vdev_offline(zfs_cmd_t *zc)
{
spa_t *spa;
- char *path = zc->zc_prop_value;
int istmp = zc->zc_cookie;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
- error = vdev_offline(spa, path, istmp);
+ error = vdev_offline(spa, zc->zc_guid, istmp);
spa_close(spa, FTAG);
return (error);
}
@@ -585,7 +604,6 @@ static int
zfs_ioc_vdev_attach(zfs_cmd_t *zc)
{
spa_t *spa;
- char *path = zc->zc_prop_value;
int replacing = zc->zc_cookie;
nvlist_t *config;
int error;
@@ -595,7 +613,7 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc)
return (error);
if ((error = get_config(zc, &config)) == 0) {
- error = spa_vdev_attach(spa, path, config, replacing);
+ error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
nvlist_free(config);
}
@@ -607,14 +625,13 @@ static int
zfs_ioc_vdev_detach(zfs_cmd_t *zc)
{
spa_t *spa;
- char *path = zc->zc_prop_value;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
- error = spa_vdev_detach(spa, path, 0, B_FALSE);
+ error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE);
spa_close(spa, FTAG);
return (error);
@@ -625,7 +642,7 @@ zfs_ioc_vdev_setpath(zfs_cmd_t *zc)
{
spa_t *spa;
char *path = zc->zc_prop_value;
- uint64_t guid = zc->zc_pool_guid;
+ uint64_t guid = zc->zc_guid;
int error;
error = spa_open(zc->zc_name, &spa, FTAG);
@@ -688,6 +705,8 @@ retry:
if (!error && zc->zc_objset_stats.dds_type == DMU_OST_ZVOL)
error = zvol_get_stats(zc, os);
+ spa_altroot(dmu_objset_spa(os), zc->zc_root, sizeof (zc->zc_root));
+
dmu_objset_close(os);
return (error);
}
@@ -1008,8 +1027,8 @@ zfs_ioc_recvbackup(zfs_cmd_t *zc)
fp = getf(fd);
if (fp == NULL)
return (EBADF);
- error = dmu_recvbackup(&zc->zc_begin_record, &zc->zc_cookie,
- fp->f_vnode, fp->f_offset);
+ error = dmu_recvbackup(zc->zc_filename, &zc->zc_begin_record,
+ &zc->zc_cookie, fp->f_vnode, fp->f_offset);
releasef(fd);
return (error);
}
@@ -1053,6 +1072,110 @@ zfs_ioc_sendbackup(zfs_cmd_t *zc)
return (error);
}
+static int
+zfs_ioc_inject_fault(zfs_cmd_t *zc)
+{
+ int id, error;
+
+ error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id,
+ &zc->zc_inject_record);
+
+ if (error == 0)
+ zc->zc_guid = (uint64_t)id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear_fault(zfs_cmd_t *zc)
+{
+ return (zio_clear_fault((int)zc->zc_guid));
+}
+
+static int
+zfs_ioc_inject_list_next(zfs_cmd_t *zc)
+{
+ int id = (int)zc->zc_guid;
+ int error;
+
+ error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name),
+ &zc->zc_inject_record);
+
+ zc->zc_guid = id;
+
+ return (error);
+}
+
+static int
+zfs_ioc_error_log(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+ size_t count = (size_t)zc->zc_config_dst_size;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_config_dst,
+ &count);
+ if (error == 0)
+ zc->zc_config_dst_size = count;
+ else
+ zc->zc_config_dst_size = spa_get_errlog_size(spa);
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
+static int
+zfs_ioc_clear(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ vdev_t *vd;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ if (zc->zc_prop_value[0] == '\0')
+ vd = NULL;
+ else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) {
+ spa_config_exit(spa, FTAG);
+ spa_close(spa, FTAG);
+ return (ENODEV);
+ }
+
+ vdev_clear(spa, vd);
+
+ spa_config_exit(spa, FTAG);
+
+ spa_close(spa, FTAG);
+
+ return (0);
+}
+
+static int
+zfs_ioc_bookmark_name(zfs_cmd_t *zc)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
+ return (error);
+
+ error = spa_bookmark_name(spa, &zc->zc_bookmark,
+ zc->zc_prop_name, sizeof (zc->zc_prop_name), zc->zc_prop_value,
+ sizeof (zc->zc_prop_value), zc->zc_filename,
+ sizeof (zc->zc_filename));
+
+ spa_close(spa, FTAG);
+
+ return (error);
+}
+
static zfs_ioc_vec_t zfs_ioc_vec[] = {
{ zfs_ioc_pool_create, zfs_secpolicy_config, pool_name },
{ zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name },
@@ -1087,6 +1210,12 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = {
{ zfs_ioc_rename, zfs_secpolicy_write, dataset_name },
{ zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name },
{ zfs_ioc_sendbackup, zfs_secpolicy_write, dataset_name },
+ { zfs_ioc_inject_fault, zfs_secpolicy_inject, no_name },
+ { zfs_ioc_clear_fault, zfs_secpolicy_inject, no_name },
+ { zfs_ioc_inject_list_next, zfs_secpolicy_inject, no_name },
+ { zfs_ioc_error_log, zfs_secpolicy_inject, pool_name },
+ { zfs_ioc_clear, zfs_secpolicy_config, pool_name },
+ { zfs_ioc_bookmark_name, zfs_secpolicy_inject, pool_name }
};
static int
@@ -1279,7 +1408,7 @@ _fini(void)
{
int error;
- if (spa_busy() || zfs_busy() || zvol_busy())
+ if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled)
return (EBUSY);
if ((error = mod_remove(&modlinkage)) != 0)
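The zfs_ioc_pool_stats() rework above is worth calling out: it now returns success whenever a config nvlist could be produced, and stashes the real open errno in zc_cookie, so userland can display a faulted pool's config together with the reason it failed to open. A reduced model of that contract; the struct and function names are loosely borrowed for illustration, not the real ioctl ABI:

    #include <errno.h>
    #include <stdio.h>

    struct cmd { int cookie; };

    static int
    pool_stats(struct cmd *zc, int open_errno, int have_config)
    {
            if (!have_config)
                    return (open_errno);    /* no config: hard failure */
            zc->cookie = open_errno;        /* preserve the real errno */
            return (0);                     /* config still copies out */
    }

    int
    main(void)
    {
            struct cmd zc = { 0 };
            int ret = pool_stats(&zc, EIO, 1);

            printf("ret=%d cookie=%d\n", ret, zc.cookie);
            return (0);
    }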
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index 17771b2e26..68a3e414eb 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -52,6 +52,7 @@
#include <sys/modctl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zfs_ctldir.h>
+#include <sys/bootconf.h>
#include <sys/sunddi.h>
#include <sys/dnlc.h>
@@ -61,8 +62,11 @@ static major_t zfs_major;
static minor_t zfs_minor;
static kmutex_t zfs_dev_mtx;
+extern char zfs_bootpath[BO_MAXOBJNAME];
+
static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
+static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
@@ -71,6 +75,7 @@ static void zfs_objset_close(zfsvfs_t *zfsvfs);
static const fs_operation_def_t zfs_vfsops_template[] = {
VFSNAME_MOUNT, zfs_mount,
+ VFSNAME_MOUNTROOT, zfs_mountroot,
VFSNAME_UNMOUNT, zfs_umount,
VFSNAME_ROOT, zfs_root,
VFSNAME_STATVFS, zfs_statvfs,
@@ -150,6 +155,58 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
return (0);
}
+static int
+zfs_create_unique_device(dev_t *dev)
+{
+ major_t new_major;
+
+ do {
+ ASSERT3U(zfs_minor, <=, MAXMIN32);
+ minor_t start = zfs_minor;
+ do {
+ mutex_enter(&zfs_dev_mtx);
+ if (zfs_minor >= MAXMIN32) {
+ /*
+ * If we're still using the real major
+ * keep out of /dev/zfs and /dev/zvol minor
+ * number space. If we're using a getudev()'ed
+ * major number, we can use all of its minors.
+ */
+ if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
+ zfs_minor = ZFS_MIN_MINOR;
+ else
+ zfs_minor = 0;
+ } else {
+ zfs_minor++;
+ }
+ *dev = makedevice(zfs_major, zfs_minor);
+ mutex_exit(&zfs_dev_mtx);
+ } while (vfs_devismounted(*dev) && zfs_minor != start);
+ if (zfs_minor == start) {
+ /*
+ * We are using all ~262,000 minor numbers for the
+ * current major number. Create a new major number.
+ */
+ if ((new_major = getudev()) == (major_t)-1) {
+ cmn_err(CE_WARN,
+ "zfs_mount: Can't get unique major "
+ "device number.");
+ return (-1);
+ }
+ mutex_enter(&zfs_dev_mtx);
+ zfs_major = new_major;
+ zfs_minor = 0;
+
+ mutex_exit(&zfs_dev_mtx);
+ } else {
+ break;
+ }
+ /* CONSTANTCONDITION */
+ } while (1);
+
+ return (0);
+}
+
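zfs_create_unique_device() above searches the minor-number space from wherever the last search stopped, wraps at the maximum, and falls back to a brand-new major only after coming all the way back around. A tiny model of that wrap-around search; the space size and in_use() predicate (standing in for vfs_devismounted()) are invented so the wrap is visible:

    #include <stdio.h>

    #define MAXMINOR 8              /* tiny space so the wrap is visible */

    static int in_use(int m) { return (m != 5); }   /* only 5 is free */

    static int
    find_minor(int *cursor)
    {
            int start = *cursor;

            do {
                    *cursor = (*cursor >= MAXMINOR) ? 1 : *cursor + 1;
                    if (!in_use(*cursor))
                            return (*cursor);
            } while (*cursor != start);
            return (-1);            /* exhausted: caller gets a new major */
    }

    int
    main(void)
    {
            int cursor = 6;

            printf("free minor: %d\n", find_minor(&cursor));
            return (0);
    }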
static void
atime_changed_cb(void *arg, uint64_t newval)
{
@@ -271,110 +328,182 @@ acl_inherit_changed_cb(void *arg, uint64_t newval)
zfsvfs->z_acl_inherit = newval;
}
-/*ARGSUSED*/
static int
-zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+zfs_refresh_properties(vfs_t *vfsp)
{
- zfsvfs_t *zfsvfs = NULL;
- znode_t *zp = NULL;
- vnode_t *vp = NULL;
- objset_t *os = NULL;
- struct dsl_dataset *ds;
- char *osname;
- uint64_t readonly, recordsize;
- pathname_t spn;
- dev_t mount_dev;
- major_t new_major;
- int mode;
- int error = 0;
- uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ?
- UIO_SYSSPACE : UIO_USERSPACE;
- int canwrite;
+ zfsvfs_t *zfsvfs = vfsp->vfs_data;
- if (mvp->v_type != VDIR)
- return (ENOTDIR);
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+ readonly_changed_cb(zfsvfs, B_TRUE);
+ } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+ if (dmu_objset_is_snapshot(zfsvfs->z_os))
+ return (EROFS);
+ readonly_changed_cb(zfsvfs, B_FALSE);
+ }
- mutex_enter(&mvp->v_lock);
- if ((uap->flags & MS_REMOUNT) == 0 &&
- (uap->flags & MS_OVERLAY) == 0 &&
- (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
- mutex_exit(&mvp->v_lock);
- return (EBUSY);
+ if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+ devices_changed_cb(zfsvfs, B_FALSE);
+ setuid_changed_cb(zfsvfs, B_FALSE);
+ } else {
+ if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
+ devices_changed_cb(zfsvfs, B_FALSE);
+ else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
+ devices_changed_cb(zfsvfs, B_TRUE);
+
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
+ setuid_changed_cb(zfsvfs, B_FALSE);
+ else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
+ setuid_changed_cb(zfsvfs, B_TRUE);
}
- mutex_exit(&mvp->v_lock);
- /*
- * ZFS does not support passing unparsed data in via MS_DATA.
- * Users should use the MS_OPTIONSTR interface; this means
- * that all option parsing is already done and the options struct
- * can be interrogated.
- */
- if ((uap->flags & MS_DATA) && uap->datalen > 0)
- return (EINVAL);
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
+ exec_changed_cb(zfsvfs, B_FALSE);
+ else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
+ exec_changed_cb(zfsvfs, B_TRUE);
+
+ return (0);
+}
+
+static int
+zfs_register_callbacks(vfs_t *vfsp)
+{
+ struct dsl_dataset *ds = NULL;
+ objset_t *os = NULL;
+ zfsvfs_t *zfsvfs = NULL;
+ int do_readonly = FALSE, readonly;
+ int do_setuid = FALSE, setuid;
+ int do_exec = FALSE, exec;
+ int do_devices = FALSE, devices;
+ int error = 0;
+
+ ASSERT(vfsp);
+ zfsvfs = vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ os = zfsvfs->z_os;
/*
- * When doing a remount, we simply refresh our temporary properties
- * according to those options set in the current VFS options.
+ * The act of registering our callbacks will destroy any mount
+ * options we may have. In order to enable temporary overrides
+ * of mount options, we stash away the current values and
+ * restore them after we register the callbacks.
*/
- if (uap->flags & MS_REMOUNT) {
- zfsvfs = vfsp->vfs_data;
-
- if (vfs_optionisset(vfsp, MNTOPT_RO, NULL))
- readonly_changed_cb(zfsvfs, B_TRUE);
- else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
- if (dmu_objset_is_snapshot(zfsvfs->z_os))
- return (EROFS);
- readonly_changed_cb(zfsvfs, B_FALSE);
+ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
+ readonly = B_TRUE;
+ do_readonly = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
+ readonly = B_FALSE;
+ do_readonly = B_TRUE;
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
+ devices = B_FALSE;
+ setuid = B_FALSE;
+ do_devices = B_TRUE;
+ do_setuid = B_TRUE;
+ } else {
+ if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
+ devices = B_FALSE;
+ do_devices = B_TRUE;
+ } else if (vfs_optionisset(vfsp,
+ MNTOPT_DEVICES, NULL)) {
+ devices = B_TRUE;
+ do_devices = B_TRUE;
}
- if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
- devices_changed_cb(zfsvfs, B_FALSE);
- setuid_changed_cb(zfsvfs, B_FALSE);
- } else {
- if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL))
- devices_changed_cb(zfsvfs, B_FALSE);
- else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL))
- devices_changed_cb(zfsvfs, B_TRUE);
-
- if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL))
- setuid_changed_cb(zfsvfs, B_FALSE);
- else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL))
- setuid_changed_cb(zfsvfs, B_TRUE);
+ if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
+ setuid = B_FALSE;
+ do_setuid = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
+ setuid = B_TRUE;
+ do_setuid = B_TRUE;
}
-
- if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL))
- exec_changed_cb(zfsvfs, B_FALSE);
- else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL))
- exec_changed_cb(zfsvfs, B_TRUE);
-
- return (0);
+ }
+ if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
+ exec = B_FALSE;
+ do_exec = B_TRUE;
+ } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
+ exec = B_TRUE;
+ do_exec = B_TRUE;
}
/*
- * Get the objset name (the "special" mount argument).
+ * Register property callbacks.
+ *
+ * It would probably be fine to just check for i/o error from
+ * the first prop_register(), but I guess I like to go
+ * overboard...
*/
- if (error = pn_get(uap->spec, fromspace, &spn))
- return (error);
+ ds = dmu_objset_ds(os);
+ error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "recordsize", blksz_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "readonly", readonly_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "devices", devices_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "setuid", setuid_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "exec", exec_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "snapdir", snapdir_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "aclmode", acl_mode_changed_cb, zfsvfs);
+ error = error ? error : dsl_prop_register(ds,
+ "aclinherit", acl_inherit_changed_cb, zfsvfs);
+ if (error)
+ goto unregister;
- osname = spn.pn_path;
+ /*
+ * Invoke our callbacks to restore temporary mount options.
+ */
+ if (do_readonly)
+ readonly_changed_cb(zfsvfs, readonly);
+ if (do_setuid)
+ setuid_changed_cb(zfsvfs, setuid);
+ if (do_exec)
+ exec_changed_cb(zfsvfs, exec);
+ if (do_devices)
+ devices_changed_cb(zfsvfs, devices);
- if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
- goto out;
+ return (0);
+unregister:
/*
- * Refuse to mount a filesystem if we are in a local zone and the
- * dataset is not visible.
+ * We may attempt to unregister some callbacks that are not
+ * registered, but this is OK; it will simply return ENOMSG,
+ * which we will ignore.
*/
- if (!INGLOBALZONE(curproc) &&
- (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
- error = EPERM;
- goto out;
- }
+ (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs);
+ (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb,
+ zfsvfs);
+ return (error);
+}
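
The function above leans on a register-or-roll-back idiom: each dsl_prop_register() is chained through the previous call's error, and any failure unwinds by unregistering everything, relying on dsl_prop_unregister() returning a harmless ENOMSG for callbacks that were never registered. A minimal sketch of the idiom, using hypothetical properties "foo" and "bar" with stand-in callbacks foo_cb and bar_cb:

	error = dsl_prop_register(ds, "foo", foo_cb, zfsvfs);
	error = error ? error : dsl_prop_register(ds, "bar", bar_cb, zfsvfs);
	if (error) {
		/* ENOMSG from any never-registered callback is ignored */
		(void) dsl_prop_unregister(ds, "foo", foo_cb, zfsvfs);
		(void) dsl_prop_unregister(ds, "bar", bar_cb, zfsvfs);
		return (error);
	}
	return (0);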
+
+static int
+zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr)
+{
+ dev_t mount_dev;
+ uint64_t recordsize, readonly;
+ int error = 0;
+ int mode;
+ zfsvfs_t *zfsvfs;
+ znode_t *zp = NULL;
+
+ ASSERT(vfsp);
+ ASSERT(osname);
/*
* Initialize the zfs-specific filesystem structure.
* Should probably make this a kmem cache, shuffle fields,
- * and just bzero upto z_hold_mtx[].
+ * and just bzero up to z_hold_mtx[].
*/
zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
zfsvfs->z_vfs = vfsp;
@@ -388,63 +517,19 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
offsetof(znode_t, z_link_node));
rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL);
- /*
- * Initialize the generic filesystem structure.
- */
+ /* Initialize the generic filesystem structure. */
vfsp->vfs_bcount = 0;
vfsp->vfs_data = NULL;
- /*
- * Create a unique device for the mount.
- */
- do {
- ASSERT3U(zfs_minor, <=, MAXMIN32);
- minor_t start = zfs_minor;
- do {
- mutex_enter(&zfs_dev_mtx);
- if (zfs_minor >= MAXMIN32) {
- /*
- * If we're still using the real major number,
- * keep out of /dev/zfs and /dev/zvol minor
- * number space. If we're using a getudev()'ed
- * major number, we can use all of its minors.
- */
- if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
- zfs_minor = ZFS_MIN_MINOR;
- else
- zfs_minor = 0;
- } else {
- zfs_minor++;
- }
- mount_dev = makedevice(zfs_major, zfs_minor);
- mutex_exit(&zfs_dev_mtx);
- } while (vfs_devismounted(mount_dev) && zfs_minor != start);
- if (zfs_minor == start) {
- /*
- * We are using all ~262,000 minor numbers
- * for the current major number. Create a
- * new major number.
- */
- if ((new_major = getudev()) == (major_t)-1) {
- cmn_err(CE_WARN,
- "zfs_mount: Can't get unique"
- " major device number.");
- goto out;
- }
- mutex_enter(&zfs_dev_mtx);
- zfs_major = new_major;
- zfs_minor = 0;
- mutex_exit(&zfs_dev_mtx);
- } else {
- break;
- }
- /* CONSTANTCONDITION */
- } while (1);
-
+ if (zfs_create_unique_device(&mount_dev) == -1) {
+ error = ENODEV;
+ goto out;
+ }
ASSERT(vfs_devismounted(mount_dev) == 0);
- if (dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL) != 0)
- recordsize = SPA_MAXBLOCKSIZE;
+ if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
+ NULL))
+ goto out;
vfsp->vfs_dev = mount_dev;
vfsp->vfs_fstype = zfsfstype;
@@ -452,8 +537,7 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
vfsp->vfs_flag |= VFS_NOTRUNC;
vfsp->vfs_data = zfsvfs;
- error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL);
- if (error)
+ if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL))
goto out;
if (readonly)
@@ -467,7 +551,6 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
error = dmu_objset_open(osname, DMU_OST_ZFS, mode,
&zfsvfs->z_os);
}
- os = zfsvfs->z_os;
if (error)
goto out;
@@ -475,16 +558,18 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
if (error = zfs_init_fs(zfsvfs, &zp, cr))
goto out;
- if (dmu_objset_is_snapshot(os)) {
+	/* The call to zfs_init_fs leaves the vnode held; release it here. */
+ VN_RELE(ZTOV(zp));
+
+ if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
ASSERT(mode & DS_MODE_READONLY);
atime_changed_cb(zfsvfs, B_FALSE);
readonly_changed_cb(zfsvfs, B_TRUE);
zfsvfs->z_issnap = B_TRUE;
} else {
- int do_readonly = FALSE, readonly;
- int do_setuid = FALSE, setuid;
- int do_exec = FALSE, exec;
- int do_devices = FALSE, devices;
+ error = zfs_register_callbacks(vfsp);
+ if (error)
+ goto out;
/*
* Start a delete thread running.
@@ -494,119 +579,216 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
/*
* Parse and replay the intent log.
*/
- zil_replay(os, zfsvfs, &zfsvfs->z_assign, zfs_replay_vector,
- (void (*)(void *))zfs_delete_wait_empty);
+ zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign,
+ zfs_replay_vector, (void (*)(void *))zfs_delete_wait_empty);
if (!zil_disable)
- zfsvfs->z_log = zil_open(os, zfs_get_data);
+ zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
+ }
- /*
- * The act of registering our callbacks will destroy any mount
- * options we may have. In order to enable temporary overrides
- * of mount options, we stash away the current values and
- * restore them after we register the callbacks.
- */
- if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) {
- readonly = B_TRUE;
- do_readonly = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
- readonly = B_FALSE;
- do_readonly = B_TRUE;
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
- devices = B_FALSE;
- setuid = B_FALSE;
- do_devices = B_TRUE;
- do_setuid = B_TRUE;
- } else {
- if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
- devices = B_FALSE;
- do_devices = B_TRUE;
- } else if (vfs_optionisset(vfsp,
- MNTOPT_DEVICES, NULL)) {
- devices = B_TRUE;
- do_devices = B_TRUE;
- }
+ if (!zfsvfs->z_issnap)
+ zfsctl_create(zfsvfs);
+out:
+ if (error) {
+ if (zfsvfs->z_os)
+ dmu_objset_close(zfsvfs->z_os);
+ kmem_free(zfsvfs, sizeof (zfsvfs_t));
+ } else {
+ atomic_add_32(&zfs_active_fs_count, 1);
+ }
- if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
- setuid = B_FALSE;
- do_setuid = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
- setuid = B_TRUE;
- do_setuid = B_TRUE;
- }
- }
- if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
- exec = B_FALSE;
- do_exec = B_TRUE;
- } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
- exec = B_TRUE;
- do_exec = B_TRUE;
- }
+ return (error);
- /*
- * Register property callbacks.
- */
+}
+
+void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+ objset_t *os = zfsvfs->z_os;
+ struct dsl_dataset *ds;
+
+ /*
+ * Unregister properties.
+ */
+ if (!dmu_objset_is_snapshot(os)) {
ds = dmu_objset_ds(os);
- VERIFY(dsl_prop_register(ds, "atime", atime_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "recordsize", blksz_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "readonly", readonly_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "devices", devices_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "setuid", setuid_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "exec", exec_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "snapdir", snapdir_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "aclmode", acl_mode_changed_cb,
+ VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
zfsvfs) == 0);
- VERIFY(dsl_prop_register(ds, "aclinherit",
+ VERIFY(dsl_prop_unregister(ds, "aclinherit",
acl_inherit_changed_cb, zfsvfs) == 0);
+ }
+}
+
+static int
+zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
+{
+ int error = 0;
+ int ret = 0;
+ static int zfsrootdone = 0;
+ zfsvfs_t *zfsvfs = NULL;
+ znode_t *zp = NULL;
+ vnode_t *vp = NULL;
+
+ ASSERT(vfsp);
+
+ /*
+ * The filesystem that we mount as root is defined in
+ * /etc/system using the zfsroot variable. The value defined
+ * there is copied early in startup code to zfs_bootpath
+ * (defined in modsysfile.c).
+ */
+ if (why == ROOT_INIT) {
+ if (zfsrootdone++)
+ return (EBUSY);
/*
- * Invoke our callbacks to restore temporary mount options.
+ * This needs to be done here, so that when we return from
+ * mountroot, the vfs resource name will be set correctly.
*/
- if (do_readonly)
- readonly_changed_cb(zfsvfs, readonly);
- if (do_setuid)
- setuid_changed_cb(zfsvfs, setuid);
- if (do_exec)
- exec_changed_cb(zfsvfs, exec);
- if (do_devices)
- devices_changed_cb(zfsvfs, devices);
- }
+ if (snprintf(rootfs.bo_name, BO_MAXOBJNAME, "%s", zfs_bootpath)
+ >= BO_MAXOBJNAME)
+ return (ENAMETOOLONG);
- vp = ZTOV(zp);
- if (!zfsvfs->z_issnap)
- zfsctl_create(zfsvfs);
-out:
- if (error) {
- if (zp)
- VN_RELE(vp);
+ if (error = vfs_lock(vfsp))
+ return (error);
- if (zfsvfs) {
- if (os)
- dmu_objset_close(os);
- kmem_free(zfsvfs, sizeof (zfsvfs_t));
- }
- } else {
- atomic_add_32(&zfs_active_fs_count, 1);
+ if (error = zfs_domount(vfsp, zfs_bootpath, CRED()))
+ goto out;
+
+ zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
+ ASSERT(zfsvfs);
+ if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp))
+ goto out;
+
+ vp = ZTOV(zp);
+ mutex_enter(&vp->v_lock);
+ vp->v_flag |= VROOT;
+ mutex_exit(&vp->v_lock);
+ rootvp = vp;
+
+ /*
+	 * The zfs_zget call above returns with a hold on vp; we release
+ * it here.
+ */
VN_RELE(vp);
+
+ /*
+	 * Mount root as read-only initially; it will be remounted
+	 * read/write by /lib/svc/method/fs-usr.
+ */
+ readonly_changed_cb(vfsp->vfs_data, B_TRUE);
+ vfs_add((struct vnode *)0, vfsp,
+ (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
+out:
+ vfs_unlock(vfsp);
+ ret = (error) ? error : 0;
+ return (ret);
+
+ } else if (why == ROOT_REMOUNT) {
+
+ readonly_changed_cb(vfsp->vfs_data, B_FALSE);
+ vfsp->vfs_flag |= VFS_REMOUNT;
+ return (zfs_refresh_properties(vfsp));
+
+ } else if (why == ROOT_UNMOUNT) {
+ zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
+ (void) zfs_sync(vfsp, 0, 0);
+ return (0);
+ }
+
+ /*
+	 * If "why" is anything other than ROOT_INIT, ROOT_REMOUNT,
+	 * or ROOT_UNMOUNT, we do not support it.
+ */
+ return (ENOTSUP);
+}
+
+/*ARGSUSED*/
+static int
+zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
+{
+ char *osname;
+ pathname_t spn;
+ int error = 0;
+ uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ?
+ UIO_SYSSPACE : UIO_USERSPACE;
+ int canwrite;
+
+ if (mvp->v_type != VDIR)
+ return (ENOTDIR);
+
+ mutex_enter(&mvp->v_lock);
+ if ((uap->flags & MS_REMOUNT) == 0 &&
+ (uap->flags & MS_OVERLAY) == 0 &&
+ (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
+ mutex_exit(&mvp->v_lock);
+ return (EBUSY);
+ }
+ mutex_exit(&mvp->v_lock);
+
+ /*
+ * ZFS does not support passing unparsed data in via MS_DATA.
+ * Users should use the MS_OPTIONSTR interface; this means
+ * that all option parsing is already done and the options struct
+ * can be interrogated.
+ */
+ if ((uap->flags & MS_DATA) && uap->datalen > 0)
+ return (EINVAL);
+
+ /*
+ * When doing a remount, we simply refresh our temporary properties
+ * according to those options set in the current VFS options.
+ */
+ if (uap->flags & MS_REMOUNT) {
+ return (zfs_refresh_properties(vfsp));
}
+ /*
+ * Get the objset name (the "special" mount argument).
+ */
+ if (error = pn_get(uap->spec, fromspace, &spn))
+ return (error);
+
+ osname = spn.pn_path;
+
+ if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0)
+ goto out;
+
+ /*
+ * Refuse to mount a filesystem if we are in a local zone and the
+ * dataset is not visible.
+ */
+ if (!INGLOBALZONE(curproc) &&
+ (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
+ error = EPERM;
+ goto out;
+ }
+
+ error = zfs_domount(vfsp, osname, cr);
+
+out:
pn_free(&spn);
return (error);
}
@@ -739,9 +921,6 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
return (0);
}
-
- zfs_zcache_flush(zfsvfs);
-
/*
* Stop all delete threads.
*/
@@ -866,7 +1045,6 @@ zfs_objset_close(zfsvfs_t *zfsvfs)
zfs_delete_t *zd = &zfsvfs->z_delete_head;
znode_t *zp, *nextzp;
objset_t *os = zfsvfs->z_os;
- struct dsl_dataset *ds;
/*
* Stop all delete threads.
@@ -881,8 +1059,6 @@ zfs_objset_close(zfsvfs_t *zfsvfs)
*/
rw_enter(&zfsvfs->z_um_lock, RW_WRITER);
- zfs_zcache_flush(zfsvfs);
-
/*
* Release all delete in progress znodes
* They will be processed when the file system remounts.
@@ -891,7 +1067,7 @@ zfs_objset_close(zfsvfs_t *zfsvfs)
while (zp = list_head(&zd->z_znodes)) {
list_remove(&zd->z_znodes, zp);
zp->z_dbuf_held = 0;
- dmu_buf_rele(zp->z_dbuf);
+ dmu_buf_rele(zp->z_dbuf, NULL);
}
mutex_exit(&zd->z_mutex);
@@ -911,7 +1087,7 @@ zfs_objset_close(zfsvfs_t *zfsvfs)
/* dbufs should only be held when force unmounting */
zp->z_dbuf_held = 0;
mutex_exit(&zfsvfs->z_znodes_lock);
- dmu_buf_rele(zp->z_dbuf);
+ dmu_buf_rele(zp->z_dbuf, NULL);
/* Start again */
mutex_enter(&zfsvfs->z_znodes_lock);
nextzp = list_head(&zfsvfs->z_all_znodes);
@@ -922,36 +1098,8 @@ zfs_objset_close(zfsvfs_t *zfsvfs)
/*
* Unregister properties.
*/
- if (!dmu_objset_is_snapshot(os)) {
- ds = dmu_objset_ds(os);
-
- VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
- zfsvfs) == 0);
-
- VERIFY(dsl_prop_unregister(ds, "aclinherit",
- acl_inherit_changed_cb, zfsvfs) == 0);
- }
+ if (!dmu_objset_is_snapshot(os))
+ zfs_unregister_callbacks(zfsvfs);
/*
	 * Make the dmu drop all its dbuf holds so that zfs_inactive
@@ -977,6 +1125,11 @@ zfs_objset_close(zfsvfs_t *zfsvfs)
}
/*
+ * Evict all dbufs so that cached znodes will be freed
+ */
+ dmu_objset_evict_dbufs(os);
+
+ /*
* Finally close the objset
*/
dmu_objset_close(os);
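
The teardown ordering in zfs_objset_close() is now: stop the delete threads and release znode holds, unregister the property callbacks, evict all dbufs (which presumably fires the znode pageout callbacks and frees cached znodes, replacing the old zcache flush), and only then close the objset. In outline:

	zfs_unregister_callbacks(zfsvfs);	/* no further property updates */
	dmu_objset_evict_dbufs(os);		/* frees cached znodes via pageout */
	dmu_objset_close(os);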
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index da5b41101a..2b9da086cc 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -229,6 +229,14 @@ zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
case _FIOFFS:
return (zfs_sync(vp->v_vfsp, 0, cred));
+ /*
+	 * The following two ioctls are used by bfu.  Faking them out
+	 * is necessary to avoid bfu errors.
+ */
+ case _FIOGDIO:
+ case _FIOSDIO:
+ return (0);
+
case _FIO_SEEK_DATA:
case _FIO_SEEK_HOLE:
if (ddi_copyin((void *)data, &off, sizeof (off), flag))
@@ -436,12 +444,10 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
n = MIN(zfs_read_chunk_size,
zp->z_phys->zp_size - uio->uio_loffset);
n = MIN(n, cnt);
- dbpp = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id,
- uio->uio_loffset, n, &numbufs);
- if (error = dmu_buf_read_array_canfail(dbpp, numbufs)) {
- dmu_buf_rele_array(dbpp, numbufs);
+ error = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id,
+ uio->uio_loffset, n, TRUE, FTAG, &numbufs, &dbpp);
+ if (error)
goto out;
- }
/*
* Compute the adjustment to align the dmu buffers
* with the uio buffer.
@@ -467,7 +473,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
(n < size ? n : size), UIO_READ, uio);
}
if (error) {
- dmu_buf_rele_array(dbpp, numbufs);
+ dmu_buf_rele_array(dbpp, numbufs, FTAG);
goto out;
}
n -= dbp->db_size;
@@ -476,7 +482,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
delta = 0;
}
}
- dmu_buf_rele_array(dbpp, numbufs);
+ dmu_buf_rele_array(dbpp, numbufs, FTAG);
}
out:
rw_exit(&zp->z_grow_lock);
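
zfs_read() now uses the reworked dbuf interfaces: holds return an error code instead of a buffer pointer, take a tag identifying the holder (FTAG expands to the calling function's name), and the separate *_canfail read step is gone; the TRUE argument appears to ask for the buffers to be read in as part of the hold. A sketch of the calling convention, assuming only what these call sites show:

	dmu_buf_t **dbpp;
	int numbufs, error;

	error = dmu_buf_hold_array(os, object, offset, length,
	    TRUE, FTAG, &numbufs, &dbpp);
	if (error)
		return (error);
	/* ... consume dbpp[0 .. numbufs - 1] ... */
	dmu_buf_rele_array(dbpp, numbufs, FTAG);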
@@ -850,10 +856,10 @@ zfs_get_data(void *arg, lr_write_t *lr)
*/
if (sizeof (lr_write_t) + dlen <= reclen) { /* immediate write */
rw_enter(&zp->z_grow_lock, RW_READER);
- dmu_buf_t *db = dmu_buf_hold(os, lr->lr_foid, off);
- dmu_buf_read(db);
+ dmu_buf_t *db;
+ VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, off, FTAG, &db));
bcopy((char *)db->db_data + off - db->db_offset, lr + 1, dlen);
- dmu_buf_rele(db);
+ dmu_buf_rele(db, FTAG);
rw_exit(&zp->z_grow_lock);
} else {
/*
@@ -1071,7 +1077,7 @@ top:
tx = dmu_tx_create(os);
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, SPA_MAXBLOCKSIZE);
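
dmu_tx_hold_zap() changes signature throughout this patch: the old numeric entry-count delta becomes a boolean plus an entry name. Judging from the call sites, TRUE means an entry with that name will be added, FALSE covers removal or update, and a NULL name stands in when no single entry is known (as with the delete queue). A sketch of a typical transaction setup under that reading:

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);	    /* adding "name" */
	dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL); /* name unknown */
	error = dmu_tx_assign(tx, zfsvfs->z_assign);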
@@ -1266,7 +1272,7 @@ top:
* allow for either case.
*/
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, -1);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_bonus(tx, zp->z_id);
if (may_delete_now)
dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
@@ -1289,7 +1295,7 @@ top:
dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
/* charge as an update -- would be nice not to charge at all */
- dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL);
error = dmu_tx_assign(tx, zfsvfs->z_assign);
if (error) {
@@ -1427,8 +1433,8 @@ top:
* Add a new entry to the directory.
*/
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, 1);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
0, SPA_MAXBLOCKSIZE);
@@ -1534,9 +1540,9 @@ top:
rw_enter(&zp->z_parent_lock, RW_WRITER);
tx = dmu_tx_create(zfsvfs->z_os);
- dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
dmu_tx_hold_bonus(tx, zp->z_id);
- dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL);
error = dmu_tx_assign(tx, zfsvfs->z_assign);
if (error) {
dmu_tx_abort(tx);
@@ -2059,8 +2065,7 @@ top:
have_grow_lock = TRUE;
if (off < zp->z_phys->zp_size)
dmu_tx_hold_free(tx, zp->z_id, off, DMU_OBJECT_END);
- else if (zp->z_phys->zp_size &&
- zp->z_blksz < zfsvfs->z_max_blksz && off > zp->z_blksz)
+ else if (zp->z_blksz < zfsvfs->z_max_blksz && off > zp->z_blksz)
/* we will rewrite this block if we grow */
dmu_tx_hold_write(tx, zp->z_id, 0, zp->z_phys->zp_size);
}
@@ -2419,17 +2424,13 @@ top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */
dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */
- if (sdzp != tdzp) {
- dmu_tx_hold_zap(tx, sdzp->z_id, 1);
- dmu_tx_hold_zap(tx, tdzp->z_id, 1);
+ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+ if (sdzp != tdzp)
dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */
- } else {
- dmu_tx_hold_zap(tx, sdzp->z_id, 2);
- }
- if (tzp) {
- dmu_tx_hold_bonus(tx, tzp->z_id); /* nlink changes */
- }
- dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1);
+ if (tzp)
+ dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */
+ dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL);
error = dmu_tx_assign(tx, zfsvfs->z_assign);
if (error) {
dmu_tx_abort(tx);
@@ -2532,7 +2533,7 @@ top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
dmu_tx_hold_bonus(tx, dzp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)
dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE);
error = dmu_tx_assign(tx, zfsvfs->z_assign);
@@ -2569,12 +2570,12 @@ top:
if (error)
goto out;
- dbp = dmu_buf_hold(zfsvfs->z_os, zoid, 0);
+ VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp));
dmu_buf_will_dirty(dbp, tx);
ASSERT3U(len, <=, dbp->db_size);
bcopy(link, dbp->db_data, len);
- dmu_buf_rele(dbp);
+ dmu_buf_rele(dbp, FTAG);
}
zp->z_phys->zp_size = len;
@@ -2631,15 +2632,15 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr)
error = uiomove(zp->z_phys + 1,
MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
} else {
- dmu_buf_t *dbp = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0);
- if ((error = dmu_buf_read_canfail(dbp)) != 0) {
- dmu_buf_rele(dbp);
+ dmu_buf_t *dbp;
+ error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp);
+ if (error) {
ZFS_EXIT(zfsvfs);
return (error);
}
error = uiomove(dbp->db_data,
MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio);
- dmu_buf_rele(dbp);
+ dmu_buf_rele(dbp, FTAG);
}
ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
@@ -2732,7 +2733,7 @@ top:
tx = dmu_tx_create(zfsvfs->z_os);
dmu_tx_hold_bonus(tx, szp->z_id);
- dmu_tx_hold_zap(tx, dzp->z_id, 1);
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
error = dmu_tx_assign(tx, zfsvfs->z_assign);
if (error) {
dmu_tx_abort(tx);
@@ -2921,8 +2922,14 @@ zfs_inactive(vnode_t *vp, cred_t *cr)
B_INVAL, cr);
}
+ mutex_enter(&zp->z_lock);
vp->v_count = 0; /* count arrives as 1 */
- zfs_znode_free(zp);
+ if (zp->z_dbuf == NULL) {
+ mutex_exit(&zp->z_lock);
+ zfs_znode_free(zp);
+ } else {
+ mutex_exit(&zp->z_lock);
+ }
rw_exit(&zfsvfs->z_um_lock);
VFS_RELE(zfsvfs->z_vfs);
return;
@@ -2986,27 +2993,21 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
- uint_t cnt = 1;
int error;
ZFS_ENTER(zfsvfs);
/*
- * If file is being mapped, disallow frlock. We set the mapcnt to
- * -1 here to signal that we are in the process of setting a lock.
- * This prevents a race with zfs_map().
- * XXX - well, sort of; since zfs_map() does not change z_mapcnt,
- * we could be in the middle of zfs_map() and still call fs_frlock().
- * Also, we are doing no checking in zfs_addmap() (where z_mapcnt
- * *is* manipulated).
+ * We are following the UFS semantics with respect to mapcnt
+ * here: If we see that the file is mapped already, then we will
+ * return an error, but we don't worry about races between this
+ * function and zfs_map().
*/
- if (MANDMODE((mode_t)zp->z_phys->zp_mode) &&
- (int)(cnt = atomic_cas_32(&zp->z_mapcnt, 0, -1)) > 0) {
+ if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) {
ZFS_EXIT(zfsvfs);
return (EAGAIN);
}
error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr);
- ASSERT((cnt != 0) || ((int)atomic_cas_32(&zp->z_mapcnt, -1, 0) == -1));
ZFS_EXIT(zfsvfs);
return (error);
}
@@ -3074,7 +3075,7 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
ASSERT(io_off == cur_pp->p_offset);
va = ppmapin(cur_pp, PROT_READ | PROT_WRITE, (caddr_t)-1);
- err = dmu_read_canfail(os, oid, io_off, PAGESIZE, va);
+ err = dmu_read(os, oid, io_off, PAGESIZE, va);
ppmapout(va);
if (err) {
/* On error, toss the entire kluster */
@@ -3241,6 +3242,20 @@ out:
return (err);
}
+/*
+ * Request a memory map for a section of a file. This code interacts
+ * with common code and the VM system as follows:
+ *
+ * common code calls mmap(), which ends up in smmap_common()
+ *
+ * this calls VOP_MAP(), which takes you into (say) zfs
+ *
+ * zfs_map() calls as_map(), passing segvn_create() as the callback
+ *
+ * segvn_create() creates the new segment and calls VOP_ADDMAP()
+ *
+ * zfs_addmap() updates z_mapcnt
+ */
static int
zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
@@ -3269,15 +3284,10 @@ zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
/*
* If file is locked, disallow mapping.
- * XXX - since we don't modify z_mapcnt here, there is nothing
- * to stop a file lock being placed immediately after we complete
- * this check.
*/
- if (MANDMODE((mode_t)zp->z_phys->zp_mode)) {
- if (vn_has_flocks(vp) || zp->z_mapcnt == -1) {
- ZFS_EXIT(zfsvfs);
- return (EAGAIN);
- }
+ if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) {
+ ZFS_EXIT(zfsvfs);
+ return (EAGAIN);
}
as_rangelock(as);
@@ -3318,11 +3328,9 @@ static int
zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr)
{
- /*
- * XXX - shouldn't we be checking for file locks here?
- */
- ASSERT3U(VTOZ(vp)->z_mapcnt, >=, 0);
- atomic_add_32(&VTOZ(vp)->z_mapcnt, btopr(len));
+ uint64_t pages = btopr(len);
+
+ atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
return (0);
}
@@ -3331,8 +3339,10 @@ static int
zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr)
{
- atomic_add_32(&VTOZ(vp)->z_mapcnt, -btopr(len));
- ASSERT3U(VTOZ(vp)->z_mapcnt, >=, 0);
+ uint64_t pages = btopr(len);
+
+ ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
+ atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
return (0);
}
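
Taken together, the mapping changes above turn z_mapcnt from a 32-bit counter that doubled as a -1 "locking in progress" flag into a plain 64-bit count of mapped pages: zfs_addmap() and zfs_delmap() are the only writers, and zfs_frlock() merely reads it, accepting the same check-then-act race UFS does. The whole protocol in sketch form:

	/* VOP_ADDMAP: count the pages in */
	atomic_add_64(&zp->z_mapcnt, btopr(len));

	/* VOP_DELMAP: count them out; the count never goes negative */
	ASSERT3U(zp->z_mapcnt, >=, btopr(len));
	atomic_add_64(&zp->z_mapcnt, -btopr(len));

	/* zfs_frlock(): no mandatory locking while pages are mapped */
	if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode))
		return (EAGAIN);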
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
index 7eb3a2410d..3fd338940e 100644
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -55,251 +54,6 @@
struct kmem_cache *znode_cache = NULL;
-/*
- * Note that znodes can be on one of 2 states:
- * ZCACHE_mru - recently used, currently cached
- * ZCACHE_mfu - frequently used, currently cached
- * When there are no active references to the znode, they
- * are linked onto one of the lists in zcache. These are the
- * only znodes that can be evicted.
- */
-
-typedef struct zcache_state {
- list_t list; /* linked list of evictable znodes in state */
- uint64_t lcnt; /* total number of znodes in the linked list */
- uint64_t cnt; /* total number of all znodes in this state */
- uint64_t hits;
- kmutex_t mtx;
-} zcache_state_t;
-
-/* The 2 states: */
-static zcache_state_t ZCACHE_mru;
-static zcache_state_t ZCACHE_mfu;
-
-static struct zcache {
- zcache_state_t *mru;
- zcache_state_t *mfu;
- uint64_t p; /* Target size of mru */
- uint64_t c; /* Target size of cache */
- uint64_t c_max; /* Maximum target cache size */
-
- /* performance stats */
- uint64_t missed;
- uint64_t evicted;
- uint64_t skipped;
-} zcache;
-
-void zcache_kmem_reclaim(void);
-
-#define ZCACHE_MINTIME (hz>>4) /* 62 ms */
-
-/*
- * Move the supplied znode to the indicated state. The mutex
- * for the znode must be held by the caller.
- */
-static void
-zcache_change_state(zcache_state_t *new_state, znode_t *zp)
-{
- /* ASSERT(MUTEX_HELD(hash_mtx)); */
- ASSERT(zp->z_active);
-
- if (zp->z_zcache_state) {
- ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
- atomic_add_64(&zp->z_zcache_state->cnt, -1);
- }
- atomic_add_64(&new_state->cnt, 1);
- zp->z_zcache_state = new_state;
-}
-
-static void
-zfs_zcache_evict(znode_t *zp, kmutex_t *hash_mtx)
-{
- zfsvfs_t *zfsvfs = zp->z_zfsvfs;
-
- ASSERT(zp->z_phys);
- ASSERT(zp->z_dbuf_held);
-
- zp->z_dbuf_held = 0;
- mutex_exit(&zp->z_lock);
- dmu_buf_rele(zp->z_dbuf);
- mutex_exit(hash_mtx);
- VFS_RELE(zfsvfs->z_vfs);
-}
-
-/*
- * Evict znodes from list until we've removed the specified number
- */
-static void
-zcache_evict_state(zcache_state_t *state, int64_t cnt, zfsvfs_t *zfsvfs)
-{
- int znodes_evicted = 0;
- znode_t *zp, *zp_prev;
- kmutex_t *hash_mtx;
-
- ASSERT(state == zcache.mru || state == zcache.mfu);
-
- mutex_enter(&state->mtx);
-
- for (zp = list_tail(&state->list); zp; zp = zp_prev) {
- zp_prev = list_prev(&state->list, zp);
- if (zfsvfs && zp->z_zfsvfs != zfsvfs)
- continue;
- hash_mtx = ZFS_OBJ_MUTEX(zp);
- if (mutex_tryenter(hash_mtx)) {
- mutex_enter(&zp->z_lock);
- list_remove(&zp->z_zcache_state->list, zp);
- zp->z_zcache_state->lcnt -= 1;
- ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
- atomic_add_64(&zp->z_zcache_state->cnt, -1);
- zp->z_zcache_state = NULL;
- zp->z_zcache_access = 0;
- /* drops z_lock and hash_mtx */
- zfs_zcache_evict(zp, hash_mtx);
- znodes_evicted += 1;
- atomic_add_64(&zcache.evicted, 1);
- if (znodes_evicted >= cnt)
- break;
- } else {
- atomic_add_64(&zcache.skipped, 1);
- }
- }
- mutex_exit(&state->mtx);
-
- if (znodes_evicted < cnt)
- dprintf("only evicted %lld znodes from %x",
- (longlong_t)znodes_evicted, state);
-}
-
-static void
-zcache_adjust(void)
-{
- uint64_t mrucnt = zcache.mru->lcnt;
- uint64_t mfucnt = zcache.mfu->lcnt;
- uint64_t p = zcache.p;
- uint64_t c = zcache.c;
-
- if (mrucnt > p)
- zcache_evict_state(zcache.mru, mrucnt - p, NULL);
-
- if (mfucnt > 0 && mrucnt + mfucnt > c) {
- int64_t toevict = MIN(mfucnt, mrucnt + mfucnt - c);
- zcache_evict_state(zcache.mfu, toevict, NULL);
- }
-}
-
-/*
- * Flush all *evictable* data from the cache.
- * NOTE: this will not touch "active" (i.e. referenced) data.
- */
-void
-zfs_zcache_flush(zfsvfs_t *zfsvfs)
-{
- zcache_evict_state(zcache.mru, zcache.mru->lcnt, zfsvfs);
- zcache_evict_state(zcache.mfu, zcache.mfu->lcnt, zfsvfs);
-}
-
-static void
-zcache_try_grow(int64_t cnt)
-{
- int64_t size;
- /*
- * If we're almost to the current target cache size,
- * increment the target cache size
- */
- size = zcache.mru->lcnt + zcache.mfu->lcnt;
- if ((zcache.c - size) <= 1) {
- atomic_add_64(&zcache.c, cnt);
- if (zcache.c > zcache.c_max)
- zcache.c = zcache.c_max;
- else if (zcache.p + cnt < zcache.c)
- atomic_add_64(&zcache.p, cnt);
- }
-}
-
-/*
- * This routine is called whenever a znode is accessed.
- */
-static void
-zcache_access(znode_t *zp, kmutex_t *hash_mtx)
-{
- ASSERT(MUTEX_HELD(hash_mtx));
-
- if (zp->z_zcache_state == NULL) {
- /*
- * This znode is not in the cache.
- * Add the new znode to the MRU state.
- */
-
- zcache_try_grow(1);
-
- ASSERT(zp->z_zcache_access == 0);
- zp->z_zcache_access = lbolt;
- zcache_change_state(zcache.mru, zp);
- mutex_exit(hash_mtx);
-
- /*
- * If we are using less than 2/3 of our total target
- * cache size, bump up the target size for the MRU
- * list.
- */
- if (zcache.mru->lcnt + zcache.mfu->lcnt < zcache.c*2/3) {
- zcache.p = zcache.mru->lcnt + zcache.c/6;
- }
-
- zcache_adjust();
-
- atomic_add_64(&zcache.missed, 1);
- } else if (zp->z_zcache_state == zcache.mru) {
- /*
- * This znode has been "accessed" only once so far,
- * Move it to the MFU state.
- */
- if (lbolt > zp->z_zcache_access + ZCACHE_MINTIME) {
- /*
- * More than 125ms have passed since we
- * instantiated this buffer. Move it to the
- * most frequently used state.
- */
- zp->z_zcache_access = lbolt;
- zcache_change_state(zcache.mfu, zp);
- }
- atomic_add_64(&zcache.mru->hits, 1);
- mutex_exit(hash_mtx);
- } else {
- ASSERT(zp->z_zcache_state == zcache.mfu);
- /*
- * This buffer has been accessed more than once.
- * Keep it in the MFU state.
- */
- atomic_add_64(&zcache.mfu->hits, 1);
- mutex_exit(hash_mtx);
- }
-}
-
-static void
-zcache_init(void)
-{
- zcache.c = 20;
- zcache.c_max = 50;
-
- zcache.mru = &ZCACHE_mru;
- zcache.mfu = &ZCACHE_mfu;
-
- list_create(&zcache.mru->list, sizeof (znode_t),
- offsetof(znode_t, z_zcache_node));
- list_create(&zcache.mfu->list, sizeof (znode_t),
- offsetof(znode_t, z_zcache_node));
-}
-
-static void
-zcache_fini(void)
-{
- zfs_zcache_flush(NULL);
-
- list_destroy(&zcache.mru->list);
- list_destroy(&zcache.mfu->list);
-}
-
/*ARGSUSED*/
static void
znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
@@ -307,9 +61,15 @@ znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr)
znode_t *zp = user_ptr;
vnode_t *vp = ZTOV(zp);
+ mutex_enter(&zp->z_lock);
if (vp->v_count == 0) {
+ mutex_exit(&zp->z_lock);
vn_invalid(vp);
zfs_znode_free(zp);
+ } else {
+ /* signal force unmount that this znode can be freed */
+ zp->z_dbuf = NULL;
+ mutex_exit(&zp->z_lock);
}
}
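
This callback and the zfs_inactive() change in zfs_vnops.c above form a two-sided handshake that replaces the old znode cache: whichever of "last vnode reference dropped" and "dbuf evicted" happens second frees the znode, with z_dbuf == NULL as the signal that the dbuf side has already run. A simplified sketch of the two halves (only z_lock shown):

	/* zfs_inactive(): last vnode reference went away */
	mutex_enter(&zp->z_lock);
	vp->v_count = 0;
	if (zp->z_dbuf == NULL) {
		mutex_exit(&zp->z_lock);
		zfs_znode_free(zp);		/* pageout already ran */
	} else {
		mutex_exit(&zp->z_lock);	/* pageout will free it */
	}

	/* znode_pageout_func(): the backing dbuf was evicted */
	mutex_enter(&zp->z_lock);
	if (vp->v_count == 0) {
		mutex_exit(&zp->z_lock);
		vn_invalid(vp);
		zfs_znode_free(zp);		/* vnode already inactive */
	} else {
		zp->z_dbuf = NULL;		/* signal zfs_inactive() */
		mutex_exit(&zp->z_lock);
	}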
@@ -359,15 +119,11 @@ zfs_znode_init(void)
znode_cache = kmem_cache_create("zfs_znode_cache",
sizeof (znode_t), 0, zfs_znode_cache_constructor,
zfs_znode_cache_destructor, NULL, NULL, NULL, 0);
-
- zcache_init();
}
void
zfs_znode_fini(void)
{
- zcache_fini();
-
/*
* Cleanup vfs & vnode ops
*/
@@ -488,8 +244,8 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) {
dmu_tx_t *tx = dmu_tx_create(os);
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 3); /* master node */
- dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1); /* delete queue */
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */
+ dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */
dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */
error = dmu_tx_assign(tx, TXG_WAIT);
ASSERT3U(error, ==, 0);
@@ -497,8 +253,10 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
dmu_tx_commit(tx);
}
- if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1, &version)) {
- return (EINVAL);
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1,
+ &version);
+ if (error) {
+ return (error);
} else if (version != ZFS_VERSION) {
(void) printf("Mismatched versions: File system "
"is version %lld on-disk format, which is "
@@ -524,9 +282,9 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
kmem_free(stats, sizeof (dmu_objset_stats_t));
stats = NULL;
- if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid)) {
- return (EINVAL);
- }
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid);
+ if (error)
+ return (error);
ASSERT(zoid != 0);
zfsvfs->z_root = zoid;
@@ -545,9 +303,9 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
return (error);
ASSERT3U((*zpp)->z_id, ==, zoid);
- if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid)) {
- return (EINVAL);
- }
+ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid);
+ if (error)
+ return (error);
zfsvfs->z_dqueue = zoid;
@@ -570,7 +328,7 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr)
* up to the caller to do, in case you don't want to
* return the znode
*/
-znode_t *
+static znode_t *
zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
{
znode_t *zp;
@@ -593,8 +351,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz)
zp->z_blksz = blksz;
zp->z_seq = 0x7A4653;
- bzero(&zp->z_zcache_node, sizeof (list_node_t));
-
mutex_enter(&zfsvfs->z_znodes_lock);
list_insert_tail(&zfsvfs->z_all_znodes, zp);
mutex_exit(&zfsvfs->z_znodes_lock);
@@ -662,9 +418,6 @@ zfs_znode_dmu_init(znode_t *zp)
ZTOV(zp)->v_flag |= VROOT;
}
- zp->z_zcache_state = NULL;
- zp->z_zcache_access = 0;
-
ASSERT(zp->z_dbuf_held == 0);
zp->z_dbuf_held = 1;
VFS_HOLD(zfsvfs->z_vfs);
@@ -715,6 +468,12 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
/*
* Create a new DMU object.
*/
+ /*
+ * There's currently no mechanism for pre-reading the blocks that will
+	 * be needed to allocate a new object, so we accept the small chance
+ * that there will be an i/o error and we will fail one of the
+ * assertions below.
+ */
if (vap->va_type == VDIR) {
if (flag & IS_REPLAY) {
err = zap_create_claim(zfsvfs->z_os, *oid,
@@ -738,7 +497,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx);
}
}
- dbp = dmu_bonus_hold(zfsvfs->z_os, *oid);
+ VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp));
dmu_buf_will_dirty(dbp, tx);
/*
@@ -803,11 +562,12 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr,
mutex_enter(hash_mtx);
zfs_znode_dmu_init(zp);
- zcache_access(zp, hash_mtx);
+ mutex_exit(hash_mtx);
+
*zpp = zp;
} else {
ZTOV(zp)->v_count = 0;
- dmu_buf_rele(dbp);
+ dmu_buf_rele(dbp, NULL);
zfs_znode_free(zp);
}
}
@@ -818,25 +578,25 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
dmu_object_info_t doi;
dmu_buf_t *db;
znode_t *zp;
+ int err;
*zpp = NULL;
ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
- db = dmu_bonus_hold(zfsvfs->z_os, obj_num);
- if (db == NULL) {
+ err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db);
+ if (err) {
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
- return (ENOENT);
+ return (err);
}
dmu_object_info_from_db(db, &doi);
if (doi.doi_bonus_type != DMU_OT_ZNODE ||
doi.doi_bonus_size < sizeof (znode_phys_t)) {
- dmu_buf_rele(db);
+ dmu_buf_rele(db, NULL);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (EINVAL);
}
- dmu_buf_read(db);
ASSERT(db->db_object == obj_num);
ASSERT(db->db_offset == -1);
@@ -849,29 +609,23 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
ASSERT3U(zp->z_id, ==, obj_num);
if (zp->z_reap) {
- dmu_buf_rele(db);
+ dmu_buf_rele(db, NULL);
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
return (ENOENT);
} else if (zp->z_dbuf_held) {
- dmu_buf_rele(db);
+ dmu_buf_rele(db, NULL);
} else {
zp->z_dbuf_held = 1;
VFS_HOLD(zfsvfs->z_vfs);
}
- if (zp->z_active == 0) {
+ if (zp->z_active == 0)
zp->z_active = 1;
- if (list_link_active(&zp->z_zcache_node)) {
- mutex_enter(&zp->z_zcache_state->mtx);
- list_remove(&zp->z_zcache_state->list, zp);
- zp->z_zcache_state->lcnt -= 1;
- mutex_exit(&zp->z_zcache_state->mtx);
- }
- }
+
VN_HOLD(ZTOV(zp));
mutex_exit(&zp->z_lock);
- zcache_access(zp, ZFS_OBJ_MUTEX(zp));
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
*zpp = zp;
return (0);
}
@@ -882,7 +636,7 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size);
ASSERT3U(zp->z_id, ==, obj_num);
zfs_znode_dmu_init(zp);
- zcache_access(zp, ZFS_OBJ_MUTEX(zp));
+ ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
*zpp = zp;
return (0);
}
@@ -899,15 +653,11 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
zp->z_phys->zp_acl.z_acl_extern_obj, tx);
ASSERT3U(error, ==, 0);
}
- if (zp->z_zcache_state) {
- ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
- atomic_add_64(&zp->z_zcache_state->cnt, -1);
- }
error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx);
ASSERT3U(error, ==, 0);
zp->z_dbuf_held = 0;
ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id);
- dmu_buf_rele(zp->z_dbuf);
+ dmu_buf_rele(zp->z_dbuf, NULL);
}
void
@@ -954,9 +704,6 @@ zfs_zinactive(znode_t *zp)
if (zp->z_reap) {
mutex_exit(&zp->z_lock);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
- ASSERT3U(zp->z_zcache_state->cnt, >=, 1);
- atomic_add_64(&zp->z_zcache_state->cnt, -1);
- zp->z_zcache_state = NULL;
/* XATTR files are not put on the delete queue */
if (zp->z_phys->zp_flags & ZFS_XATTR) {
zfs_rmnode(zp);
@@ -970,23 +717,14 @@ zfs_zinactive(znode_t *zp)
VFS_RELE(zfsvfs->z_vfs);
return;
}
+ ASSERT(zp->z_phys);
+ ASSERT(zp->z_dbuf_held);
- /*
- * If the file system for this znode is no longer mounted,
- * evict the znode now, don't put it in the cache.
- */
- if (zfsvfs->z_unmounted1) {
- zfs_zcache_evict(zp, ZFS_OBJ_MUTEX(zp));
- return;
- }
-
- /* put znode on evictable list */
- mutex_enter(&zp->z_zcache_state->mtx);
- list_insert_head(&zp->z_zcache_state->list, zp);
- zp->z_zcache_state->lcnt += 1;
- mutex_exit(&zp->z_zcache_state->mtx);
+ zp->z_dbuf_held = 0;
mutex_exit(&zp->z_lock);
+ dmu_buf_rele(zp->z_dbuf, NULL);
ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+ VFS_RELE(zfsvfs->z_vfs);
}
void
@@ -1206,7 +944,8 @@ zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx,
len = -1;
else if (end > size)
len = size - from;
- dmu_free_range(zp->z_zfsvfs->z_os, zp->z_id, from, len, tx);
+ VERIFY(0 == dmu_free_range(zp->z_zfsvfs->z_os,
+ zp->z_id, from, len, tx));
if (!have_grow_lock)
rw_exit(&zp->z_grow_lock);
@@ -1214,7 +953,6 @@ zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx,
return (0);
}
-
void
zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
{
@@ -1229,6 +967,10 @@ zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx)
/*
* First attempt to create master node.
*/
+ /*
+ * In an empty objset, there are no blocks to read and thus
+ * there can be no i/o errors (which we assert below).
+ */
moid = MASTER_NODE_OBJ;
error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE,
DMU_OT_NONE, 0, tx);
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 14b989fbd3..55040166b4 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -136,11 +136,17 @@ zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf)
uint64_t blksz = BP_GET_LSIZE(bp);
zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1;
zio_cksum_t cksum;
+ zbookmark_t zb;
int error;
+ zb.zb_objset = bp->blk_cksum.zc_word[2];
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = bp->blk_cksum.zc_word[3];
+
error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz,
NULL, NULL, ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
if (error) {
dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ",
zilog, bp, error);
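
The zbookmark_t introduced by this patch names the logical block an I/O belongs to (objset, object, level, blkid) so FMA ereports and the persistent error log can identify it. ZIL blocks live outside any object, so the bookmark is reconstructed from the ZIL's habit of stashing identity in the block checksum words; reading these call sites, zc_word[2] and zc_word[3] appear to carry the objset id and the block sequence number, though that is an inference rather than a documented contract. A hypothetical helper capturing the pattern used at both ZIL read sites:

	static void
	zil_bp_bookmark(const blkptr_t *bp, zbookmark_t *zb)
	{
		zb->zb_objset = bp->blk_cksum.zc_word[2];	/* objset id */
		zb->zb_object = 0;		/* not inside an object */
		zb->zb_level = -1;		/* not an indirect level */
		zb->zb_blkid = bp->blk_cksum.zc_word[3];	/* ZIL seq */
	}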
@@ -551,6 +557,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1;
uint64_t txg;
uint64_t zil_blksz;
+ zbookmark_t zb;
int error;
ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb));
@@ -579,11 +586,21 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG,
zil_blksz, &ztp->zit_next_blk, txg);
if (error) {
+ /*
+			 * Reinitialise the lwb.
+			 * By returning NULL the caller will call txg_wait_synced().
+ */
+ mutex_enter(&zilog->zl_lock);
+ ASSERT(lwb->lwb_state == UNWRITTEN);
+ lwb->lwb_nused = 0;
+ lwb->lwb_seq = 0;
+ mutex_exit(&zilog->zl_lock);
txg_rele_to_sync(&lwb->lwb_txgh);
return (NULL);
}
ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg);
+ ztp->zit_pad = 0;
ztp->zit_nused = lwb->lwb_nused;
ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum;
ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum;
@@ -617,9 +634,15 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb)
* write the old log block
*/
dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg);
+
+ zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[2];
+ zb.zb_object = 0;
+ zb.zb_level = -1;
+ zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[3];
+
zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0,
&lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb,
- ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED));
+ ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb));
return (nlwb);
}
@@ -674,7 +697,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb)
lwb = zil_lwb_write_start(zilog, lwb);
if (lwb == NULL)
return (NULL);
- if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) {
+ ASSERT(lwb->lwb_nused == 0);
+ if (reclen > ZIL_BLK_DATA_SZ(lwb)) {
txg_wait_synced(zilog->zl_dmu_pool, txg);
mutex_enter(&zilog->zl_lock);
zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq);
@@ -1157,10 +1181,17 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg)
* checksum error. We can safely ignore this because
* the later write will provide the correct data.
*/
+ zbookmark_t zb;
+
+ zb.zb_objset = dmu_objset_id(zilog->zl_os);
+ zb.zb_object = lrw->lr_foid;
+ zb.zb_level = -1;
+ zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp);
+
(void) zio_wait(zio_read(NULL, zilog->zl_spa,
wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL,
ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
+ ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb));
(void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen);
}
}
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 1554504a93..b9741ee5c2 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,13 +19,14 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
#pragma ident "%Z%%M% %I% %E% SMI"
#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
#include <sys/spa.h>
#include <sys/txg.h>
#include <sys/spa_impl.h>
@@ -35,9 +35,6 @@
#include <sys/zio_compress.h>
#include <sys/zio_checksum.h>
-static void zio_vdev_io_enter(zio_t *zio);
-static void zio_vdev_io_exit(zio_t *zio);
-
/*
* ==========================================================================
* I/O priority table
@@ -128,6 +125,8 @@ zio_init(void)
if (zio_buf_cache[c - 1] == NULL)
zio_buf_cache[c - 1] = zio_buf_cache[c];
}
+
+ zio_inject_init();
}
void
@@ -143,6 +142,8 @@ zio_fini(void)
}
zio_buf_cache[c] = NULL;
}
+
+ zio_inject_fini();
}
/*
@@ -263,11 +264,12 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
if (pio == NULL) {
if (!(flags & ZIO_FLAG_CONFIG_HELD))
- spa_config_enter(zio->io_spa, RW_READER);
+ spa_config_enter(zio->io_spa, RW_READER, zio);
zio->io_root = zio;
} else {
zio->io_root = pio->io_root;
-
+ if (!(flags & ZIO_FLAG_NOBOOKMARK))
+ zio->io_logical = pio->io_logical;
mutex_enter(&pio->io_lock);
if (stage < ZIO_STAGE_READY)
pio->io_children_notready++;
@@ -305,7 +307,7 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags)
zio_t *
zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
uint64_t size, zio_done_func_t *done, void *private,
- int priority, int flags)
+ int priority, int flags, zbookmark_t *zb)
{
zio_t *zio;
dva_t *dva;
@@ -314,6 +316,9 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private,
ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE);
+ zio->io_bookmark = *zb;
+
+ zio->io_logical = zio;
/*
* Work off our copy of the bp so the caller can free it.
@@ -345,7 +350,8 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data,
zio_t *
zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags)
+ zio_done_func_t *done, void *private, int priority, int flags,
+ zbookmark_t *zb)
{
zio_t *zio;
@@ -359,6 +365,10 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
ZIO_TYPE_WRITE, priority, flags,
ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE);
+ zio->io_bookmark = *zb;
+
+ zio->io_logical = zio;
+
zio->io_checksum = checksum;
zio->io_compress = compress;
@@ -378,7 +388,8 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress,
zio_t *
zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
uint64_t txg, blkptr_t *bp, void *data, uint64_t size,
- zio_done_func_t *done, void *private, int priority, int flags)
+ zio_done_func_t *done, void *private, int priority, int flags,
+ zbookmark_t *zb)
{
zio_t *zio;
@@ -387,6 +398,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum,
ZIO_TYPE_WRITE, priority, flags,
ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
+ zio->io_bookmark = *zb;
zio->io_checksum = checksum;
zio->io_compress = ZIO_COMPRESS_OFF;
@@ -667,8 +679,6 @@ zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp)
mutex_exit(&zio->io_lock);
zio_next_stage(zio);
} else {
- if (zio->io_stage == ZIO_STAGE_VDEV_IO_START)
- zio_vdev_io_exit(zio);
zio->io_stalled = stage;
mutex_exit(&zio->io_lock);
}
@@ -683,8 +693,6 @@ zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp)
if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
pio->io_error = zio->io_error;
if (--*countp == 0 && pio->io_stalled == stage) {
- if (pio->io_stage == ZIO_STAGE_VDEV_IO_START)
- zio_vdev_io_enter(pio);
pio->io_stalled = 0;
mutex_exit(&pio->io_lock);
zio_next_stage_async(pio);
@@ -748,36 +756,45 @@ zio_done(zio_t *zio)
vdev_stat_update(zio);
if (zio->io_error) {
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
- bp ? bp : &zio->io_bp_copy);
- dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): error %d\n",
- zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
- zio_type_name[zio->io_type],
- vdev_description(vd),
- (u_longlong_t)zio->io_offset,
- zio, blkbuf, zio->io_error);
- }
-
- if (zio->io_numerrors != 0 && zio->io_type == ZIO_TYPE_WRITE) {
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
- bp ? bp : &zio->io_bp_copy);
- dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): %d errors\n",
- "partial write",
- zio_type_name[zio->io_type],
- vdev_description(vd),
- (u_longlong_t)zio->io_offset,
- zio, blkbuf, zio->io_numerrors);
- }
+ /*
+ * If this I/O is attached to a particular vdev,
+ * generate an error message describing the I/O failure
+ * at the block level. We ignore these errors if the
+ * device is currently unavailable.
+ */
+ if (zio->io_error != ECKSUM && zio->io_vd &&
+ !vdev_is_dead(zio->io_vd))
+ zfs_ereport_post(FM_EREPORT_ZFS_IO,
+ zio->io_spa, zio->io_vd, zio, 0, 0);
+
+ if ((zio->io_error == EIO ||
+ !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) &&
+ zio->io_logical == zio) {
+ /*
+ * For root I/O requests, tell the SPA to log the error
+ * appropriately. Also, generate a logical data
+ * ereport.
+ */
+ spa_log_error(zio->io_spa, zio);
+
+ zfs_ereport_post(FM_EREPORT_ZFS_DATA,
+ zio->io_spa, NULL, zio, 0, 0);
+ }
- if (zio->io_error && !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
- sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
- bp ? bp : &zio->io_bp_copy);
- panic("ZFS: %s (%s on %s off %llx: zio %p %s): error %d",
- zio->io_error == ECKSUM ? "bad checksum" : "I/O failure",
- zio_type_name[zio->io_type],
- vdev_description(vd),
- (u_longlong_t)zio->io_offset,
- zio, blkbuf, zio->io_error);
+ /*
+ * For I/O requests that cannot fail, panic appropriately.
+ */
+ if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) {
+ sprintf_blkptr(blkbuf, BP_SPRINTF_LEN,
+ bp ? bp : &zio->io_bp_copy);
+ panic("ZFS: %s (%s on %s off %llx: zio %p %s): error "
+ "%d", zio->io_error == ECKSUM ?
+ "bad checksum" : "I/O failure",
+ zio_type_name[zio->io_type],
+ vdev_description(vd),
+ (u_longlong_t)zio->io_offset,
+ zio, blkbuf, zio->io_error);
+ }
}
zio_clear_transform_stack(zio);
@@ -807,7 +824,7 @@ zio_done(zio_t *zio)
}
if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD))
- spa_config_exit(spa);
+ spa_config_exit(spa, zio);
if (zio->io_waiter != NULL) {
mutex_enter(&zio->io_lock);
@@ -988,7 +1005,8 @@ zio_read_gang_members(zio_t *zio)
zio_nowait(zio_read(zio, zio->io_spa, gbp,
(char *)zio->io_data + loff, lsize, NULL, NULL,
- zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT));
+ zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT,
+ &zio->io_bookmark));
}
zio_buf_free(gbh, gbufsize);
@@ -1022,7 +1040,8 @@ zio_rewrite_gang_members(zio_t *zio)
zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum,
zio->io_txg, gbp, (char *)zio->io_data + loff, lsize,
- NULL, NULL, zio->io_priority, zio->io_flags));
+ NULL, NULL, zio->io_priority, zio->io_flags,
+ &zio->io_bookmark));
}
zio_push_transform(zio, gbh, gsize, gbufsize);
@@ -1153,7 +1172,8 @@ zio_write_allocate_gang_members(zio_t *zio)
zio->io_checksum, zio->io_txg, gbp,
(char *)zio->io_data + loff, lsize,
zio_write_allocate_gang_member_done, NULL,
- zio->io_priority, zio->io_flags));
+ zio->io_priority, zio->io_flags,
+ &zio->io_bookmark));
} else {
lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE);
ASSERT(lsize != SPA_MINBLOCKSIZE);
@@ -1263,51 +1283,6 @@ zio_dva_translate(zio_t *zio)
* Read and write to physical devices
* ==========================================================================
*/
-static void
-zio_vdev_io_enter(zio_t *zio)
-{
- vdev_t *tvd = zio->io_vd->vdev_top;
-
- mutex_enter(&tvd->vdev_io_lock);
- ASSERT(zio->io_pending.list_next == NULL);
- list_insert_tail(&tvd->vdev_io_pending, zio);
- mutex_exit(&tvd->vdev_io_lock);
-}
-
-static void
-zio_vdev_io_exit(zio_t *zio)
-{
- vdev_t *tvd = zio->io_vd->vdev_top;
-
- mutex_enter(&tvd->vdev_io_lock);
- ASSERT(zio->io_pending.list_next != NULL);
- list_remove(&tvd->vdev_io_pending, zio);
- if (list_head(&tvd->vdev_io_pending) == NULL)
- cv_broadcast(&tvd->vdev_io_cv);
- mutex_exit(&tvd->vdev_io_lock);
-}
-
-static void
-zio_vdev_io_retry(void *vdarg)
-{
- vdev_t *vd = vdarg;
- zio_t *zio, *zq;
-
- ASSERT(vd == vd->vdev_top);
-
- /* XXPOLICY */
- delay(hz);
-
- vdev_reopen(vd, &zq);
-
- while ((zio = zq) != NULL) {
- zq = zio->io_retry_next;
- zio->io_retry_next = NULL;
- dprintf("async retry #%d for I/O to %s offset %llx\n",
- zio->io_retries, vdev_description(vd), zio->io_offset);
- zio_next_stage_async(zio);
- }
-}
static void
zio_vdev_io_setup(zio_t *zio)
@@ -1323,8 +1298,6 @@ zio_vdev_io_setup(zio_t *zio)
zio->io_offset += VDEV_LABEL_START_SIZE;
}
- zio_vdev_io_enter(zio);
-
zio_next_stage(zio);
}
@@ -1350,7 +1323,7 @@ zio_vdev_io_done(zio_t *zio)
}
/* XXPOLICY */
-static boolean_t
+boolean_t
zio_should_retry(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -1363,11 +1336,7 @@ zio_should_retry(zio_t *zio)
return (B_FALSE);
if (zio->io_flags & ZIO_FLAG_DONT_RETRY)
return (B_FALSE);
- if (zio->io_retries > 300 &&
- (zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL)))
- return (B_FALSE);
- if (zio->io_retries > 1 &&
- (zio->io_error == ECKSUM || zio->io_error == ENXIO))
+ if (zio->io_retries > 0)
return (B_FALSE);
return (B_TRUE);
@@ -1379,17 +1348,16 @@ zio_vdev_io_assess(zio_t *zio)
vdev_t *vd = zio->io_vd;
vdev_t *tvd = vd->vdev_top;
- zio_vdev_io_exit(zio);
-
ASSERT(zio->io_vsd == NULL);
+ if (zio_injection_enabled && !zio->io_error)
+ zio->io_error = zio_handle_fault_injection(zio, EIO);
+
/*
* If the I/O failed, determine whether we should attempt to retry it.
*/
/* XXPOLICY */
if (zio_should_retry(zio)) {
- zio_t *zq;
-
ASSERT(tvd == vd);
ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE));
@@ -1405,29 +1373,27 @@ zio_vdev_io_assess(zio_t *zio)
zio->io_retries, zio_type_name[zio->io_type],
vdev_description(vd), zio->io_offset);
- /*
- * If this is the first retry, do it immediately.
- */
- /* XXPOLICY */
- if (zio->io_retries == 1) {
- zio_next_stage_async(zio);
- return;
- }
+ zio_next_stage_async(zio);
+ return;
+ }
+ if (zio->io_error != 0 && !(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
+ zio->io_error != ECKSUM) {
/*
- * This was not the first retry, so go through the
- * longer enqueue/delay/vdev_reopen() process.
+ * Poor man's hotplug support. Even if we're done retrying this
+ * I/O, try to reopen the vdev to see if it's still attached.
+ * To avoid excessive thrashing, we only try it once a minute,
+ * which also serves to detect when a missing device has come
+ * back, since the reopen effectively polls it at that interval.
+ *
+ * We need to do this asynchronously because we can't grab
+ * all the necessary locks way down here.
*/
- mutex_enter(&tvd->vdev_io_lock);
- ASSERT(zio->io_retry_next == NULL);
- zio->io_retry_next = zq = tvd->vdev_io_retry;
- tvd->vdev_io_retry = zio;
- mutex_exit(&tvd->vdev_io_lock);
- if (zq == NULL)
- (void) taskq_dispatch(
- tvd->vdev_spa->spa_vdev_retry_taskq,
- zio_vdev_io_retry, tvd, TQ_SLEEP);
- return;
+ if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) {
+ vd->vdev_last_try = gethrtime();
+ tvd->vdev_reopen_wanted = 1;
+ spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN);
+ }
}
zio_next_stage(zio);
@@ -1502,10 +1468,9 @@ zio_checksum_verify(zio_t *zio)
{
if (zio->io_bp != NULL) {
zio->io_error = zio_checksum_error(zio);
- if (zio->io_error) {
- dprintf("bad checksum on vdev %s\n",
- vdev_description(zio->io_vd));
- }
+ if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE))
+ zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
+ zio->io_spa, zio->io_vd, zio, 0, 0);
}
zio_next_stage(zio);
@@ -1660,7 +1625,7 @@ zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
{
int error;
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
BP_ZERO(bp);
@@ -1677,7 +1642,7 @@ zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp,
bp->blk_birth = txg;
}
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
return (error);
}
@@ -1693,9 +1658,9 @@ zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg)
dprintf_bp(bp, "txg %llu: ", txg);
- spa_config_enter(spa, RW_READER);
+ spa_config_enter(spa, RW_READER, FTAG);
metaslab_free(spa, BP_IDENTITY(bp), txg);
- spa_config_exit(spa);
+ spa_config_exit(spa, FTAG);
}
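
The reworked zio_vdev_io_assess() above replaces the old enqueue/delay/vdev_reopen() retry queue with a single immediate retry plus a rate-limited, asynchronous reopen request. A minimal userland sketch of that once-a-minute gate, using a monotonic clock in place of the kernel's gethrtime(); all names below are illustrative stand-ins, not part of the ZFS source:

#include <stdint.h>
#include <time.h>

#define NANOSEC	1000000000ULL

static uint64_t last_try;	/* stands in for vd->vdev_last_try */

static uint64_t
now_ns(void)
{
	struct timespec ts;

	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
	return ((uint64_t)ts.tv_sec * NANOSEC + (uint64_t)ts.tv_nsec);
}

/*
 * Return 1 if a reopen should be requested now: at most one request
 * per 60 seconds, mirroring the gate in zio_vdev_io_assess().
 */
static int
reopen_wanted(void)
{
	uint64_t now = now_ns();

	if (now - last_try > 60ULL * NANOSEC) {
		last_try = now;
		return (1);
	}
	return (0);
}

int
main(void)
{
	return (reopen_wanted() ? 0 : 1);	/* first call always fires */
}

The request itself is handed off via spa_async_request() because, as the comment in the hunk notes, the locks needed for a reopen cannot be taken that deep in the I/O path.
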
diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c
index dc31527ce8..d57ab6d525 100644
--- a/usr/src/uts/common/fs/zfs/zio_checksum.c
+++ b/usr/src/uts/common/fs/zfs/zio_checksum.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -170,5 +169,8 @@ zio_checksum_error(zio_t *zio)
(actual_cksum.zc_word[3] - zc.zc_word[3]))
return (ECKSUM);
+ if (zio_injection_enabled && !zio->io_error)
+ return (zio_handle_fault_injection(zio, ECKSUM));
+
return (0);
}
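
The comparison whose tail is visible above ORs together the word-wise differences of the stored and computed 256-bit checksums, so one nonzero test detects a mismatch in any word. A self-contained sketch of that idiom; the struct here is a local stand-in for the real zio_cksum_t, and the full expression is inferred from the visible last term:

#include <stdint.h>
#include <stdio.h>

typedef struct zio_cksum {
	uint64_t zc_word[4];
} zio_cksum_t;

/*
 * Nonzero iff the two 256-bit checksums differ: OR the word-wise
 * differences so a single test covers all four words.
 */
static uint64_t
cksum_mismatch(const zio_cksum_t *a, const zio_cksum_t *b)
{
	return ((a->zc_word[0] - b->zc_word[0]) |
	    (a->zc_word[1] - b->zc_word[1]) |
	    (a->zc_word[2] - b->zc_word[2]) |
	    (a->zc_word[3] - b->zc_word[3]));
}

int
main(void)
{
	zio_cksum_t x = { { 1, 2, 3, 4 } };
	zio_cksum_t y = { { 1, 2, 3, 5 } };

	(void) printf("%d\n", cksum_mismatch(&x, &y) != 0);	/* prints 1 */
	return (0);
}
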
diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c
new file mode 100644
index 0000000000..4cada09d83
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/zio_inject.c
@@ -0,0 +1,315 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * ZFS fault injection
+ *
+ * To handle fault injection, we keep track of a series of zinject_record_t
+ * structures which describe which logical block(s) should be injected with a
+ * fault. These are kept in a global list. Each record corresponds to a given
+ * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
+ * or exported while the injection record exists.
+ *
+ * Device-level injection is done using the 'zi_guid' field. If this is set, it
+ * means that the error is destined for a particular device, not a piece of
+ * data.
+ *
+ * This is a rather poor data structure and algorithm, but we don't expect more
+ * than a few faults at any one time, so it should be sufficient for our needs.
+ */
+
+#include <sys/arc.h>
+#include <sys/zio_impl.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+
+uint32_t zio_injection_enabled;
+
+typedef struct inject_handler {
+ int zi_id;
+ spa_t *zi_spa;
+ zinject_record_t zi_record;
+ list_node_t zi_link;
+} inject_handler_t;
+
+static list_t inject_handlers;
+static krwlock_t inject_lock;
+static int inject_next_id = 1;
+
+/*
+ * Returns true if the given record matches the I/O in progress.
+ */
+static boolean_t
+zio_match_handler(zbookmark_t *zb, uint64_t type,
+ zinject_record_t *record, int error)
+{
+ /*
+ * Check for a match against the MOS, which is based on type.
+ */
+ if (zb->zb_objset == 0 && record->zi_objset == 0 &&
+ record->zi_object == 0) {
+ if (record->zi_type == DMU_OT_NONE ||
+ type == record->zi_type)
+ return (record->zi_freq == 0 ||
+ spa_get_random(100) < record->zi_freq);
+ else
+ return (B_FALSE);
+ }
+
+ /*
+ * Check for an exact match.
+ */
+ if (zb->zb_objset == record->zi_objset &&
+ zb->zb_object == record->zi_object &&
+ zb->zb_level == record->zi_level &&
+ zb->zb_blkid >= record->zi_start &&
+ zb->zb_blkid <= record->zi_end &&
+ error == record->zi_error)
+ return (record->zi_freq == 0 ||
+ spa_get_random(100) < record->zi_freq);
+
+ return (B_FALSE);
+}
+
+/*
+ * Determine whether the I/O in question should return failure. Returns the
+ * errno to hand back to the caller, or 0 if the I/O should proceed normally.
+ */
+int
+zio_handle_fault_injection(zio_t *zio, int error)
+{
+ int ret = 0;
+ inject_handler_t *handler;
+
+ /*
+ * Ignore I/O not associated with any logical data.
+ */
+ if (zio->io_logical == NULL)
+ return (0);
+
+ /*
+ * Currently, we only support fault injection on reads.
+ */
+ if (zio->io_type != ZIO_TYPE_READ)
+ return (0);
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ /* Ignore errors not destined for this pool */
+ if (zio->io_spa != handler->zi_spa)
+ continue;
+
+ /* Skip device-level injection handlers */
+ if (handler->zi_record.zi_guid != 0)
+ continue;
+
+ /* If this handler matches, return the requested error */
+ if (zio_match_handler(&zio->io_logical->io_bookmark,
+ zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+ &handler->zi_record, error)) {
+ ret = error;
+ break;
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+int
+zio_handle_device_injection(vdev_t *vd, int error)
+{
+ inject_handler_t *handler;
+ int ret = 0;
+
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler)) {
+
+ if (vd->vdev_guid == handler->zi_record.zi_guid) {
+ if (handler->zi_record.zi_error == error) {
+ /*
+ * For a failed open, pretend like the device
+ * has gone away.
+ */
+ if (error == ENXIO)
+ vd->vdev_stat.vs_aux =
+ VDEV_AUX_OPEN_FAILED;
+ ret = error;
+ break;
+ }
+ if (handler->zi_record.zi_error == ENXIO) {
+ ret = EIO;
+ break;
+ }
+ }
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+/*
+ * Create a new handler for the given record. We add it to the list, taking
+ * a reference on the spa_t in the process. We also increment
+ * zio_injection_enabled, the global switch that enables fault injection.
+ */
+int
+zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int error;
+ spa_t *spa;
+
+ /*
+ * If this is pool-wide metadata, make sure we unload the corresponding
+ * spa_t, so that the next attempt to load it will trigger the fault.
+ * We call spa_reset() to unload the pool appropriately.
+ */
+ if (flags & ZINJECT_UNLOAD_SPA)
+ if ((error = spa_reset(name)) != 0)
+ return (error);
+
+ if (!(flags & ZINJECT_NULL)) {
+ /*
+ * spa_inject_addref() will add an injection reference, which will
+ * prevent the pool from being removed from the namespace while
+ * still allowing it to be unloaded.
+ */
+ if ((spa = spa_inject_addref(name)) == NULL)
+ return (ENOENT);
+
+ handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ *id = handler->zi_id = inject_next_id++;
+ handler->zi_spa = spa;
+ handler->zi_record = *record;
+ list_insert_tail(&inject_handlers, handler);
+ atomic_add_32(&zio_injection_enabled, 1);
+
+ rw_exit(&inject_lock);
+ }
+
+ /*
+ * Flush the ARC, so that any attempts to read this data will end up
+ * going to the ZIO layer. Note that this is a little overkill, but
+ * we don't have the necessary ARC interfaces to do anything else, and
+ * fault injection isn't a performance-critical path.
+ */
+ if (flags & ZINJECT_FLUSH_ARC)
+ arc_flush();
+
+ return (0);
+}
+
+/*
+ * Returns the next record with an ID greater than that supplied to the
+ * function. Used to iterate over all handlers in the system.
+ */
+int
+zio_inject_list_next(int *id, char *name, size_t buflen,
+ zinject_record_t *record)
+{
+ inject_handler_t *handler;
+ int ret;
+
+ mutex_enter(&spa_namespace_lock);
+ rw_enter(&inject_lock, RW_READER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id > *id)
+ break;
+
+ if (handler) {
+ *record = handler->zi_record;
+ *id = handler->zi_id;
+ (void) strncpy(name, spa_name(handler->zi_spa), buflen);
+ ret = 0;
+ } else {
+ ret = ENOENT;
+ }
+
+ rw_exit(&inject_lock);
+ mutex_exit(&spa_namespace_lock);
+
+ return (ret);
+}
+
+/*
+ * Clear the fault handler with the given identifier, or return ENOENT if none
+ * exists.
+ */
+int
+zio_clear_fault(int id)
+{
+ inject_handler_t *handler;
+ int ret;
+
+ rw_enter(&inject_lock, RW_WRITER);
+
+ for (handler = list_head(&inject_handlers); handler != NULL;
+ handler = list_next(&inject_handlers, handler))
+ if (handler->zi_id == id)
+ break;
+
+ if (handler == NULL) {
+ ret = ENOENT;
+ } else {
+ list_remove(&inject_handlers, handler);
+ spa_inject_delref(handler->zi_spa);
+ kmem_free(handler, sizeof (inject_handler_t));
+ atomic_add_32(&zio_injection_enabled, -1);
+ ret = 0;
+ }
+
+ rw_exit(&inject_lock);
+
+ return (ret);
+}
+
+void
+zio_inject_init(void)
+{
+ list_create(&inject_handlers, sizeof (inject_handler_t),
+ offsetof(inject_handler_t, zi_link));
+}
+
+void
+zio_inject_fini(void)
+{
+ list_destroy(&inject_handlers);
+}
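
zio_inject_list_next() above iterates with a stateless cursor: the caller passes in the largest ID it has seen and gets back the first handler with a greater ID, so handlers can come and go between calls without invalidating the walk. A self-contained model of the idiom, with toy types and no locking:

#include <stdio.h>

typedef struct handler {
	int zi_id;
	struct handler *next;
} handler_t;

/* A toy list standing in for inject_handlers; IDs ascend. */
static handler_t h3 = { 7, NULL };
static handler_t h2 = { 4, &h3 };
static handler_t h1 = { 1, &h2 };
static handler_t *handlers = &h1;

/* Return 0 and advance *id on success, -1 when iteration is done. */
static int
list_next(int *id)
{
	handler_t *h;

	for (h = handlers; h != NULL; h = h->next)
		if (h->zi_id > *id)
			break;
	if (h == NULL)
		return (-1);
	*id = h->zi_id;
	return (0);
}

int
main(void)
{
	int id = 0;	/* cursor starts below any valid ID */

	while (list_next(&id) == 0)
		(void) printf("handler %d\n", id);
	return (0);
}

Starting the cursor at 0 works because inject_next_id starts at 1 and handlers are appended in ID order, so every live handler sorts above the initial cursor.
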
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index a570d4d971..69fb50c2c3 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -418,6 +417,7 @@ zvol_create_minor(zfs_cmd_t *zc)
zvol_size_changed(zv, dev);
+ /* XXX this should handle the possible i/o error */
VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset),
"readonly", zvol_readonly_changed_cb, zv) == 0);
@@ -500,7 +500,7 @@ zvol_set_volsize(zfs_cmd_t *zc)
}
tx = dmu_tx_create(zv->zv_objset);
- dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, 1);
+ dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL);
dmu_tx_hold_free(tx, ZVOL_OBJ, zc->zc_volsize, DMU_OBJECT_END);
error = dmu_tx_assign(tx, TXG_WAIT);
if (error) {
@@ -511,9 +511,10 @@ zvol_set_volsize(zfs_cmd_t *zc)
error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1,
&zc->zc_volsize, tx);
- if (error == 0)
- dmu_free_range(zv->zv_objset, ZVOL_OBJ, zc->zc_volsize,
+ if (error == 0) {
+ error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, zc->zc_volsize,
DMU_OBJECT_END, tx);
+ }
dmu_tx_commit(tx);
@@ -744,7 +745,7 @@ zvol_strategy(buf_t *bp)
size = volsize - off;
if (bp->b_flags & B_READ) {
- error = dmu_read_canfail(os, ZVOL_OBJ,
+ error = dmu_read(os, ZVOL_OBJ,
off, size, addr);
} else {
dmu_tx_t *tx = dmu_tx_create(os);
diff --git a/usr/src/uts/common/krtld/kobj.c b/usr/src/uts/common/krtld/kobj.c
index 003022d104..1cdf93e98f 100644
--- a/usr/src/uts/common/krtld/kobj.c
+++ b/usr/src/uts/common/krtld/kobj.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -108,6 +107,7 @@ static int kobj_boot_open(char *, int);
static int kobj_boot_close(int);
static int kobj_boot_seek(int, off_t, off_t);
static int kobj_boot_read(int, caddr_t, size_t);
+static int kobj_boot_fstat(int, struct bootstat *);
static Sym *lookup_one(struct module *, const char *);
static void sym_insert(struct module *, char *, symid_t);
@@ -3324,8 +3324,8 @@ kobj_open(char *filename)
*/
cred_t *saved_cred = curthread->t_cred;
curthread->t_cred = kcred;
- Errno = vn_open(filename, UIO_SYSSPACE, FREAD, 0, &vp,
- 0, 0);
+ Errno = vn_openat(filename, UIO_SYSSPACE, FREAD, 0, &vp,
+ 0, 0, rootdir);
curthread->t_cred = saved_cred;
}
kobjopen_free(ltp);
@@ -3458,6 +3458,47 @@ kobj_close(intptr_t descr)
(void) kobj_boot_close((int)descr);
}
+int
+kobj_fstat(intptr_t descr, struct bootstat *buf)
+{
+ if (buf == NULL)
+ return (-1);
+
+ if (_modrootloaded) {
+ vattr_t vattr;
+ struct vnode *vp = (struct vnode *)descr;
+ if (VOP_GETATTR(vp, &vattr, 0, kcred) != 0)
+ return (-1);
+
+ /*
+ * The vattr and bootstat structures are similar, but not
+ * identical. We do our best to fill in the bootstat structure
+ * from the contents of vattr (transferring only the fields
+ * that are obvious).
+ */
+
+ buf->st_mode = (uint32_t)vattr.va_mode;
+ buf->st_nlink = (uint32_t)vattr.va_nlink;
+ buf->st_uid = (int32_t)vattr.va_uid;
+ buf->st_gid = (int32_t)vattr.va_gid;
+ buf->st_rdev = (uint64_t)vattr.va_rdev;
+ buf->st_size = (uint64_t)vattr.va_size;
+ buf->st_atim.tv_sec = (int64_t)vattr.va_atime.tv_sec;
+ buf->st_atim.tv_nsec = (int64_t)vattr.va_atime.tv_nsec;
+ buf->st_mtim.tv_sec = (int64_t)vattr.va_mtime.tv_sec;
+ buf->st_mtim.tv_nsec = (int64_t)vattr.va_mtime.tv_nsec;
+ buf->st_ctim.tv_sec = (int64_t)vattr.va_ctime.tv_sec;
+ buf->st_ctim.tv_nsec = (int64_t)vattr.va_ctime.tv_nsec;
+ buf->st_blksize = (int32_t)vattr.va_blksize;
+ buf->st_blocks = (int64_t)vattr.va_nblocks;
+
+ return (0);
+ }
+
+ return (kobj_boot_fstat((int)descr, buf));
+}
+
+
struct _buf *
kobj_open_file(char *name)
{
@@ -4097,6 +4138,18 @@ kobj_record_file(char *filename)
}
#endif /* __x86 */
+static int
+kobj_boot_fstat(int fd, struct bootstat *stp)
+{
+#if defined(__sparc)
+ if (!standalone && _ioquiesced)
+ return (-1);
+ return (BOP_FSTAT(ops, fd, stp));
+#else
+ return (BRD_FSTAT(bfs_ops, fd, stp));
+#endif
+}
+
/*
* XXX these wrappers should go away when sparc is converted
* to boot from ramdisk
diff --git a/usr/src/uts/common/krtld/kobj_stubs.c b/usr/src/uts/common/krtld/kobj_stubs.c
index 3d972194bb..c592fb5317 100644
--- a/usr/src/uts/common/krtld/kobj_stubs.c
+++ b/usr/src/uts/common/krtld/kobj_stubs.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -108,6 +107,13 @@ kobj_close(intptr_t descr)
/*ARGSUSED*/
int
+kobj_fstat(intptr_t descr, struct bootstat *buf)
+{
+ return (-1);
+}
+
+/*ARGSUSED*/
+int
kobj_filbuf(struct _buf *f)
{
return (-1);
diff --git a/usr/src/uts/common/krtld/mapfile b/usr/src/uts/common/krtld/mapfile
index 398c6dcf32..cb1f85b04a 100644
--- a/usr/src/uts/common/krtld/mapfile
+++ b/usr/src/uts/common/krtld/mapfile
@@ -1,13 +1,9 @@
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
-# Use is subject to license terms.
-#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
-# Common Development and Distribution License, Version 1.0 only
-# (the "License"). You may not use this file except in compliance
-# with the License.
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or http://www.opensolaris.org/os/licensing.
@@ -22,6 +18,9 @@
#
# CDDL HEADER END
#
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+# Use is subject to license terms.
+#
#pragma ident "%Z%%M% %I% %E% SMI"
#
@@ -36,6 +35,7 @@
kobj_export_module;
kobj_filbuf;
kobj_free;
+ kobj_fstat;
kobj_getelfsym;
kobj_getmodinfo;
kobj_getpagesize;
diff --git a/usr/src/uts/common/os/fm.c b/usr/src/uts/common/os/fm.c
index 6ff4626405..43c3acbef0 100644
--- a/usr/src/uts/common/os/fm.c
+++ b/usr/src/uts/common/os/fm.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -1070,6 +1069,37 @@ fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
}
}
+void
+fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid,
+ uint64_t vdev_guid)
+{
+ if (version != ZFS_SCHEME_VERSION0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ return;
+ }
+
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
+ atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+
+ if (vdev_guid != 0) {
+ if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
+ atomic_add_64(
+ &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+ }
+ }
+}
+
uint64_t
fm_ena_increment(uint64_t ena)
{
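
fm_fmri_zfs_set() above populates a zfs-scheme FMRI one member at a time, bumping the fmri_set_failed kstat on any nvlist failure. A userland sketch that builds the same payload with libnvpair (link with -lnvpair on illumos/Solaris); the string keys mirror FM_VERSION, FM_FMRI_SCHEME, and the new FM_FMRI_ZFS_* members from the protocol.h changes below, and the guid values are made-up placeholders:

#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *fmri;

	if (nvlist_alloc(&fmri, NV_UNIQUE_NAME, 0) != 0)
		return (1);

	/* Keys mirror FM_VERSION, FM_FMRI_SCHEME, FM_FMRI_ZFS_{POOL,VDEV}. */
	(void) nvlist_add_uint8(fmri, "version", 0);
	(void) nvlist_add_string(fmri, "scheme", "zfs");
	(void) nvlist_add_uint64(fmri, "pool", 0xdeadbeefULL);	/* made up */
	(void) nvlist_add_uint64(fmri, "vdev", 0xcafef00dULL);	/* made up */

	nvlist_print(stdout, fmri);
	nvlist_free(fmri);
	return (0);
}

The vdev member is optional, which is why the kernel version only adds it when vdev_guid is nonzero.
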
diff --git a/usr/src/uts/common/os/modsysfile.c b/usr/src/uts/common/os/modsysfile.c
index 7ffcf66d10..0e36f3e2cc 100644
--- a/usr/src/uts/common/os/modsysfile.c
+++ b/usr/src/uts/common/os/modsysfile.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -73,6 +72,7 @@ static vmem_t *mod_sysfile_arena; /* parser memory */
char obp_bootpath[BO_MAXOBJNAME]; /* bootpath from obp */
char svm_bootpath[BO_MAXOBJNAME]; /* bootpath redirected via rootdev */
+char zfs_bootpath[BO_MAXOBJNAME]; /* zfs bootpath, set via zfsroot */
#if defined(_PSM_MODULES)
@@ -489,6 +489,8 @@ static struct modcmd modcmd[] = {
{ "set32", MOD_SET32 },
{ "SET64", MOD_SET64 },
{ "set64", MOD_SET64 },
+ { "ZFSROOT", MOD_ZFSROOT },
+ { "zfsroot", MOD_ZFSROOT },
{ NULL, MOD_UNKNOWN }
};
@@ -528,6 +530,7 @@ do_sysfile_cmd(struct _buf *file, const char *cmd)
*/
case MOD_ROOTFS:
case MOD_SWAPFS:
+ case MOD_ZFSROOT:
if ((token = kobj_lex(file, tok1, sizeof (tok1))) == COLON) {
token = kobj_lex(file, tok1, sizeof (tok1));
} else {
@@ -1520,7 +1523,10 @@ setparams()
(void) copystr(sysp->sys_ptr, bootobjp->bo_fstype,
BO_MAXOBJNAME, NULL);
break;
-
+ case MOD_ZFSROOT:
+ (void) copystr(sysp->sys_ptr, zfs_bootpath,
+ BO_MAXOBJNAME, NULL);
+ break;
default:
break;
}
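
With the MOD_ZFSROOT plumbing above, the /etc/system parser accepts a zfsroot directive in the same `command: value` form as rootfs and swapfs, copying the value into zfs_bootpath via setparams(). A hypothetical entry; the dataset name is purely illustrative:

* /etc/system fragment: name the ZFS root dataset for mountroot
zfsroot: tank/ROOT/root
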
diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c
index fe4a5c82df..2e027b7ba5 100644
--- a/usr/src/uts/common/os/policy.c
+++ b/usr/src/uts/common/os/policy.c
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -1741,13 +1740,10 @@ secpolicy_contract_event_choice(const cred_t *cr)
}
/*
- * Name: secpolicy_gart_access
- *
- * Normal: Verify if the subject has sufficient priveleges to make ioctls
- * to agpgart device
- *
- * Output: EPERM - if not privileged
+ * secpolicy_gart_access
*
+ * Determine if the subject has sufficient privileges to make ioctls to the
+ * agpgart device.
*/
int
secpolicy_gart_access(const cred_t *cr)
@@ -1756,13 +1752,10 @@ secpolicy_gart_access(const cred_t *cr)
}
/*
- * Name: secpolicy_gart_map
- *
- * Normal: Verify if the subject has sufficient privelegs to map aperture
- * range through agpgart driver
- *
- * Output: EPERM - if not privileged
+ * secpolicy_gart_map
*
+ * Determine if the subject has sufficient privileges to map the aperture
+ * range through the agpgart driver.
*/
int
secpolicy_gart_map(const cred_t *cr)
@@ -1774,10 +1767,22 @@ secpolicy_gart_map(const cred_t *cr)
}
/*
+ * secpolicy_zinject
+ *
+ * Determine if the subject can inject faults in the ZFS fault injection
+ * framework. Requires all privileges.
+ */
+int
+secpolicy_zinject(const cred_t *cr)
+{
+ return (secpolicy_require_set(cr, PRIV_FULLSET, NULL));
+}
+
+/*
* secpolicy_zfs
*
- * Determine if the user has permission to manipulate ZFS datasets (not pools).
- * Equivalent to the SYS_MOUNT privilege.
+ * Determine if the subject has permission to manipulate ZFS datasets
+ * (not pools). Equivalent to the SYS_MOUNT privilege.
*/
int
secpolicy_zfs(const cred_t *cr)
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index f82a933903..516ecc0a5a 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -657,6 +657,9 @@ FMHDRS= \
protocol.h \
util.h
+FMFSHDRS= \
+ zfs.h
+
FMIOHDRS= \
ddi.h \
pci.h \
@@ -914,6 +917,7 @@ CHECKHDRS= \
$(TAVORHDRS:%.h=ib/adapters/tavor/%.check) \
$(ISOHDRS:%.h=iso/%.check) \
$(FMHDRS:%.h=fm/%.check) \
+ $(FMFSHDRS:%.h=fm/fs/%.check) \
$(FMIOHDRS:%.h=fm/io/%.check) \
$(FSHDRS:%.h=fs/%.check) \
$(LVMHDRS:%.h=lvm/%.check) \
@@ -949,6 +953,7 @@ CHECKHDRS= \
$(ROOTISOHDRS) \
$(ROOTFMHDRS) \
$(ROOTFMIOHDRS) \
+ $(ROOTFMFSHDRS) \
$(ROOTFSHDRS) \
$(ROOTIBDHDRS) \
$(ROOTIBHDRS) \
@@ -992,7 +997,8 @@ install_h: \
$(ROOTDCAMHDRS) \
$(ROOTISOHDRS) \
$(ROOTFMHDRS) \
- $(ROOTFMIOHDRS) \
+ $(ROOTFMFSHDRS) \
+ $(ROOTFMIOHDRS) \
$(ROOTFSHDRS) \
$(ROOTIBDHDRS) \
$(ROOTIBHDRS) \
diff --git a/usr/src/uts/common/sys/Makefile.syshdrs b/usr/src/uts/common/sys/Makefile.syshdrs
index cdc3436049..d9c363b48b 100644
--- a/usr/src/uts/common/sys/Makefile.syshdrs
+++ b/usr/src/uts/common/sys/Makefile.syshdrs
@@ -1,5 +1,5 @@
#
-# Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+# Copyright 2006 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
#
# ident "%Z%%M% %I% %E% SMI"
@@ -18,10 +18,13 @@ av/%.check: av/%.h
fm/%.check: fm/%.h
$(DOT_H_CHECK)
-fm/cpu/%.check: fm/cpu/%.h
+fm/cpu/%.check: fm/cpu/%.h
$(DOT_H_CHECK)
-fm/io/%.check: fm/io/%.h
+fm/fs/%.check: fm/fs/%.h
+ $(DOT_H_CHECK)
+
+fm/io/%.check: fm/io/%.h
$(DOT_H_CHECK)
fs/%.check: fs/%.h
@@ -129,6 +132,7 @@ ROOTDIRS= \
$(ROOTDIR)/iso \
$(ROOTDIR)/fm \
$(ROOTDIR)/fm/cpu \
+ $(ROOTDIR)/fm/fs \
$(ROOTDIR)/fm/io \
$(ROOTDIR)/fs \
$(ROOTDIR)/ib \
@@ -187,6 +191,7 @@ ROOTISOHDRS= $(ISOHDRS:%=$(ROOTDIR)/iso/%)
ROOTFMHDRS= $(FMHDRS:%=$(ROOTDIR)/fm/%)
ROOTFMCPUHDRS= $(FMCPUHDRS:%=$(ROOTDIR)/fm/cpu/%)
ROOTFMIOHDRS= $(FMIOHDRS:%=$(ROOTDIR)/fm/io/%)
+ROOTFMFSHDRS= $(FMFSHDRS:%=$(ROOTDIR)/fm/fs/%)
ROOTFSHDRS= $(FSHDRS:%=$(ROOTDIR)/fs/%)
diff --git a/usr/src/uts/common/sys/fm/fs/zfs.h b/usr/src/uts/common/sys/fm/fs/zfs.h
new file mode 100644
index 0000000000..aa5c7ee0d7
--- /dev/null
+++ b/usr/src/uts/common/sys/fm/fs/zfs.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_FM_FS_ZFS_H
+#define _SYS_FM_FS_ZFS_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ZFS_ERROR_CLASS "fs.zfs"
+
+#define FM_EREPORT_ZFS_CHECKSUM "checksum"
+#define FM_EREPORT_ZFS_IO "io"
+#define FM_EREPORT_ZFS_DATA "data"
+#define FM_EREPORT_ZFS_POOL "zpool"
+#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown"
+#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed"
+#define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data"
+#define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas"
+#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum"
+#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small"
+#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label"
+
+#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool"
+#define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid"
+#define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path"
+#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path"
+#define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset"
+#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size"
+#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state"
+
+#define FM_RESOURCE_OK "ok"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FM_FS_ZFS_H */
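
The new header above defines only the class suffixes; following the usual FMA convention, the full event class joins the "ereport" prefix, ZFS_ERROR_CLASS, and the per-event name with dots. The actual composition happens in zfs_ereport_post(), which is not shown in this hunk, so treat the following as a sketch of the assumed convention:

#include <stdio.h>

#define ZFS_ERROR_CLASS		"fs.zfs"
#define FM_EREPORT_ZFS_CHECKSUM	"checksum"

int
main(void)
{
	char class[64];

	/* Assumed convention: ereport.<error-class>.<event> */
	(void) snprintf(class, sizeof (class), "ereport.%s.%s",
	    ZFS_ERROR_CLASS, FM_EREPORT_ZFS_CHECKSUM);
	(void) puts(class);	/* prints: ereport.fs.zfs.checksum */
	return (0);
}
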
diff --git a/usr/src/uts/common/sys/fm/protocol.h b/usr/src/uts/common/sys/fm/protocol.h
index 89b761ef6c..1afa67f66b 100644
--- a/usr/src/uts/common/sys/fm/protocol.h
+++ b/usr/src/uts/common/sys/fm/protocol.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -168,6 +167,7 @@ extern "C" {
#define FM_FMRI_SCHEME_MOD "mod"
#define FM_FMRI_SCHEME_PKG "pkg"
#define FM_FMRI_SCHEME_LEGACY "legacy-hc"
+#define FM_FMRI_SCHEME_ZFS "zfs"
/* Scheme versions */
#define FMD_SCHEME_VERSION0 0
@@ -187,6 +187,8 @@ extern "C" {
#define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0
#define LEGACY_SCHEME_VERSION0 0
#define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0
+#define ZFS_SCHEME_VERSION0 0
+#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0
/* hc scheme member names */
#define FM_FMRI_HC_SERIAL_ID "serial"
@@ -253,6 +255,10 @@ extern "C" {
#define FM_FMRI_MOD_ID "mod-id"
#define FM_FMRI_MOD_DESC "mod-desc"
+/* zfs scheme member names */
+#define FM_FMRI_ZFS_POOL "pool"
+#define FM_FMRI_ZFS_VDEV "vdev"
+
extern nv_alloc_t *fm_nva_xcreate(char *, size_t);
extern void fm_nva_xdestroy(nv_alloc_t *);
@@ -277,6 +283,7 @@ extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *,
const char *, uint64_t);
extern void fm_authority_set(nvlist_t *, int, const char *, const char *,
const char *, const char *);
+extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t);
extern uint64_t fm_ena_increment(uint64_t);
extern uint64_t fm_ena_generate(uint64_t, uchar_t);
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index 65425c829c..0fa884dcaa 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -133,6 +133,8 @@ uint64_t zfs_prop_default_numeric(zfs_prop_t);
#define ZPOOL_CONFIG_STATS "stats"
#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk"
#define ZPOOL_CONFIG_OFFLINE "offline"
+#define ZPOOL_CONFIG_ERRCOUNT "error_count"
+#define ZPOOL_CONFIG_NOT_PRESENT "not_present"
#define VDEV_TYPE_ROOT "root"
#define VDEV_TYPE_MIRROR "mirror"
@@ -304,9 +306,25 @@ typedef enum zfs_ioc {
ZFS_IOC_ROLLBACK,
ZFS_IOC_RENAME,
ZFS_IOC_RECVBACKUP,
- ZFS_IOC_SENDBACKUP
+ ZFS_IOC_SENDBACKUP,
+ ZFS_IOC_INJECT_FAULT,
+ ZFS_IOC_CLEAR_FAULT,
+ ZFS_IOC_INJECT_LIST_NEXT,
+ ZFS_IOC_ERROR_LOG,
+ ZFS_IOC_CLEAR,
+ ZFS_IOC_BOOKMARK_NAME
} zfs_ioc_t;
+/*
+ * Internal SPA load state. Used by the FMA diagnosis engine.
+ */
+typedef enum {
+ SPA_LOAD_NONE, /* no load in progress */
+ SPA_LOAD_OPEN, /* normal open */
+ SPA_LOAD_IMPORT, /* import in progress */
+ SPA_LOAD_TRYIMPORT /* tryimport in progress */
+} spa_load_state_t;
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/kobj.h b/usr/src/uts/common/sys/kobj.h
index 7d2bd0922e..9276aa370f 100644
--- a/usr/src/uts/common/sys/kobj.h
+++ b/usr/src/uts/common/sys/kobj.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -34,6 +33,7 @@
#include <sys/machelf.h>
#include <sys/vmem.h>
#include <sys/sdt.h>
+#include <sys/bootstat.h>
#ifdef __cplusplus
extern "C" {
@@ -162,6 +162,7 @@ extern uintptr_t kobj_getsymvalue(char *, int);
extern char *kobj_getsymname(uintptr_t, ulong_t *);
extern char *kobj_searchsym(struct module *, uintptr_t, ulong_t *);
+extern int kobj_fstat(intptr_t, struct bootstat *);
extern intptr_t kobj_open(char *);
extern int kobj_path_exists(char *, int);
extern struct _buf *kobj_open_path(char *, int, int);
diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h
index 9653a58b0e..beabb63818 100644
--- a/usr/src/uts/common/sys/policy.h
+++ b/usr/src/uts/common/sys/policy.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -141,6 +140,7 @@ int secpolicy_vnode_setdac(const cred_t *, uid_t);
int secpolicy_vnode_setid_retain(const cred_t *, boolean_t);
int secpolicy_vnode_setids_setgids(const cred_t *, gid_t);
int secpolicy_vnode_stky_modify(const cred_t *);
+int secpolicy_zinject(const cred_t *);
int secpolicy_zfs(const cred_t *);
void secpolicy_setid_clear(vattr_t *, cred_t *);
diff --git a/usr/src/uts/common/sys/sysconf.h b/usr/src/uts/common/sys/sysconf.h
index 4594d91287..654436a115 100644
--- a/usr/src/uts/common/sys/sysconf.h
+++ b/usr/src/uts/common/sys/sysconf.h
@@ -2,9 +2,8 @@
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 1990-2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -72,6 +71,7 @@ struct modcmd {
#define MOD_UNKNOWN 9 /* unknown command */
#define MOD_SET32 10 /* like MOD_SET but -only- on 32-bit kernel */
#define MOD_SET64 11 /* like MOD_SET but -only- on 64-bit kernel */
+#define MOD_ZFSROOT 12 /* use zfs as the root filesystem */
/*
* Commands for mod_sysctl()