| field | value | date |
|---|---|---|
| author | eschrock <none@none> | 2006-03-03 20:08:16 -0800 |
| committer | eschrock <none@none> | 2006-03-03 20:08:16 -0800 |
| commit | ea8dc4b6d2251b437950c0056bc626b311c73c27 (patch) | |
| tree | 69cc1808568f2ef8fd1e21c61e186ba452ea64da /usr/src/uts/common | |
| parent | 5c18afbc96a46bc3a9e6f3667512daa374d6cd79 (diff) | |
| download | illumos-joyent-ea8dc4b6d2251b437950c0056bc626b311c73c27.tar.gz | |
PSARC 2006/077 zpool clear
PSARC 2006/139 FMA for ZFS
6284889 arc should replace the znode cache
6333006 DMU & DSL should not panic upon I/O error
6333092 concurrent reads to a file not scaling with number of readers
6338081 ZFS/FMA phase 1
6338386 need persistent error log
6341326 i/o error causes arc buf hash table corruption
6341639 zfs backup/restore should compute/verify checksum of backup stream
6348002 out of space due to changing properties
6354724 inaccurate error message from zfs restore
6354872 dmu_sync() blows predictive accounting
6355416 zpool scrubbing consumes all memory, system hung
6363995 df should only load libzfs when it encounters a ZFS filesystem
6366320 zfs backup/restore doesn't like signals
6368892 mount -m support needed for legacy mounts
6368902 boot archive fstat support needed for ZFS Mountroot
6369424 BFU complains when bfu'ing a ZFS root filesystem
6374062 mountroot support needed for ZFS
6376356 dirtying dbuf obj=43 lvl=0 blkid=0 but not tx_held
6378391 unused members of dmu_objset_stats_t
6378392 clean up zfs_cmd_t structure
6378685 buf_init should allocate its hash table more carefully
6378976 ziltest should be a first class citizen
6381086 zdb segfaults if there is a spa deferred-free bplist
6381203 deadlock due to i/o while assigning (tc_lock held)
6381209 freed space is not immediately available
6381344 'zpool clear'
6381345 FAULTED devices should really be UNAVAIL
6381346 import should mark devices as persistently unavailable
6383272 recursive mutex_enter() during log replay with zfs root
6386326 origin property is not displayed
6386354 libzfs does too much in its _init section, calls exit(1)
6386624 zpool should not complain about non-existent devices from libdiskmgt
6386910 spa needs to be i/o error hardened
6387735 need a mechanism to inject faults into ZFS
6387736 internal ZFS utilities should be placed in an ON-private package
6389928 libzfs should ship a lint library
6390609 malformed vdev config panics on zpool_create()
6390677 version number checking makes upgrades challenging
6390713 ztest hangs in zil_suspend()
6391873 metadata compression should be turned back on
6392113 ztest sometimes reports leaked blocks because ZIL isn't resilvered
6393004 minor memory leak in unique_insert()
Diffstat (limited to 'usr/src/uts/common')
83 files changed, 6432 insertions, 3331 deletions
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index f2d155fd25..587e9e1535 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -864,6 +864,7 @@ ZFS_COMMON_OBJS += \
     sha256.o \
     spa.o \
     spa_config.o \
+    spa_errlog.o \
     spa_misc.o \
     space_map.o \
     txg.o \
@@ -882,10 +883,12 @@ ZFS_COMMON_OBJS += \
     zap_leaf.o \
     zap_micro.o \
     zfs_byteswap.o \
+    zfs_fm.o \
     zil.o \
     zio.o \
     zio_checksum.o \
-    zio_compress.o
+    zio_compress.o \
+    zio_inject.o
 
 ZFS_SHARED_OBJS += \
     zfs_namecheck.o \
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index bd8a110990..904e746721 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -28,8 +28,8 @@
 /*
  * DVA-based Adjustable Relpacement Cache
  *
- * While much of the theory of operation and algorithms used here
- * are based on the self-tuning, low overhead replacement cache
+ * While much of the theory of operation used here is
+ * based on the self-tuning, low overhead replacement cache
  * presented by Megiddo and Modha at FAST 2003, there are some
  * significant differences:
  *
@@ -98,6 +98,15 @@
  * must use: mutex_tryenter() to avoid deadlock.  Also note that
  * the "top" state mutex must be held before the "bot" state mutex.
  *
+ * Arc buffers may have an associated eviction callback function.
+ * This function will be invoked prior to removing the buffer (e.g.
+ * in arc_do_user_evicts()).  Note however that the data associated
+ * with the buffer may be evicted prior to the callback.  The callback
+ * must be made with *no locks held* (to prevent deadlock).  Additionally,
+ * the users of callbacks must ensure that their private data is
+ * protected from simultaneous callbacks from arc_buf_evict()
+ * and arc_do_user_evicts().
+ *
  * Note that the majority of the performance stats are manipulated
  * with atomic operations.
  */
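The comment block added above spells out a locking contract for the new eviction callbacks (b_efunc): the data may vanish before the callback fires, the callback must run with no ARC locks held, and users must guard their own private state. Below is a minimal userland sketch of that contract; every name here (user_buf_t, cache_evict, cache_do_user_evicts) is invented for illustration and is not code from this patch:

```c
/*
 * Hedged sketch of the eviction-callback contract described above.
 * The names mirror b_efunc/b_private and arc_do_user_evicts() only
 * in spirit; this is not the kernel implementation.
 */
#include <pthread.h>
#include <stdlib.h>

typedef struct user_buf {
    void *data;                       /* may be reclaimed before callback */
    int (*efunc)(struct user_buf *);  /* eviction callback, like b_efunc */
    void *private;                    /* like b_private */
    struct user_buf *next;            /* eviction list linkage */
} user_buf_t;

static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static user_buf_t *eviction_list;

/* Cache side: free the data and queue the buffer under the lock... */
static void
cache_evict(user_buf_t *b)
{
    pthread_mutex_lock(&cache_lock);
    free(b->data);            /* the data can go away before the callback */
    b->data = NULL;
    b->next = eviction_list;
    eviction_list = b;
    pthread_mutex_unlock(&cache_lock);
}

/* ...but invoke the callbacks with *no* cache locks held. */
static void
cache_do_user_evicts(void)
{
    pthread_mutex_lock(&cache_lock);
    while (eviction_list != NULL) {
        user_buf_t *b = eviction_list;
        eviction_list = b->next;
        pthread_mutex_unlock(&cache_lock);  /* drop lock across callback */
        (void) b->efunc(b);                 /* callee locks its own state */
        pthread_mutex_lock(&cache_lock);
    }
    pthread_mutex_unlock(&cache_lock);
}
```

Dropping the lock across the callback is what forces the second half of the contract: the user's efunc can race with another eviction path, so it must serialize on its own private data.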
@@ -136,10 +145,10 @@ static int arc_dead;
 /*
  * Note that buffers can be on one of 5 states:
  *  ARC_anon      - anonymous (discussed below)
- *  ARC_mru_top   - recently used, currently cached
- *  ARC_mru_bot   - recentely used, no longer in cache
- *  ARC_mfu_top   - frequently used, currently cached
- *  ARC_mfu_bot   - frequently used, no longer in cache
+ *  ARC_mru       - recently used, currently cached
+ *  ARC_mru_ghost - recentely used, no longer in cache
+ *  ARC_mfu       - frequently used, currently cached
+ *  ARC_mfu_ghost - frequently used, no longer in cache
  * When there are no active references to the buffer, they
  * are linked onto one of the lists in arc.  These are the
  * only buffers that can be evicted or deleted.
@@ -147,9 +156,9 @@ static int arc_dead;
  * Anonymous buffers are buffers that are not associated with
  * a DVA.  These are buffers that hold dirty block copies
  * before they are written to stable storage.  By definition,
- * they are "ref'd" and are considered part of arc_mru_top
+ * they are "ref'd" and are considered part of arc_mru
  * that cannot be freed.  Generally, they will aquire a DVA
- * as they are written and migrate onto the arc_mru_top list.
+ * as they are written and migrate onto the arc_mru list.
  */
 
 typedef struct arc_state {
@@ -162,24 +171,22 @@ typedef struct arc_state {
 
 /* The 5 states: */
 static arc_state_t ARC_anon;
-static arc_state_t ARC_mru_top;
-static arc_state_t ARC_mru_bot;
-static arc_state_t ARC_mfu_top;
-static arc_state_t ARC_mfu_bot;
+static arc_state_t ARC_mru;
+static arc_state_t ARC_mru_ghost;
+static arc_state_t ARC_mfu;
+static arc_state_t ARC_mfu_ghost;
 
 static struct arc {
     arc_state_t *anon;
-    arc_state_t *mru_top;
-    arc_state_t *mru_bot;
-    arc_state_t *mfu_top;
-    arc_state_t *mfu_bot;
+    arc_state_t *mru;
+    arc_state_t *mru_ghost;
+    arc_state_t *mfu;
+    arc_state_t *mfu_ghost;
 
     uint64_t size;       /* Actual total arc size */
-    uint64_t p;          /* Target size (in bytes) of mru_top */
+    uint64_t p;          /* Target size (in bytes) of mru */
     uint64_t c;          /* Target size of cache (in bytes) */
     uint64_t c_min;      /* Minimum target cache size */
     uint64_t c_max;      /* Maximum target cache size */
-    uint64_t incr;       /* Size by which to increment arc.c */
-    int64_t  size_check;
 
     /* performance stats */
     uint64_t hits;
@@ -195,12 +202,6 @@ static struct arc {
     int no_grow;         /* Don't try to grow cache size */
 } arc;
 
-/* Default amount to grow arc.incr */
-static int64_t arc_incr_size = 1024;
-
-/* > 0 ==> time to increment arc.c */
-static int64_t arc_size_check_default = -1000;
-
 static uint64_t arc_tempreserve;
 
 typedef struct arc_callback arc_callback_t;
@@ -227,6 +228,7 @@ struct arc_buf_hdr {
     arc_buf_hdr_t *b_hash_next;
     arc_buf_t *b_buf;
     uint32_t b_flags;
+    uint32_t b_datacnt;
 
     kcondvar_t b_cv;
     arc_callback_t *b_acb;
@@ -242,6 +244,13 @@ struct arc_buf_hdr {
     refcount_t b_refcnt;
 };
 
+static arc_buf_t *arc_eviction_list;
+static kmutex_t arc_eviction_mtx;
+static void arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock);
+
+#define GHOST_STATE(state) \
+    ((state) == arc.mru_ghost || (state) == arc.mfu_ghost)
+
 /*
  * Private ARC flags.  These flags are private ARC only flags that will show up
  * in b_flags in the arc_hdr_buf_t.  Some flags are publicly declared, and can
@@ -250,13 +259,17 @@ struct arc_buf_hdr {
  * public flags, make sure not to smash the private ones.
  */
 
+#define ARC_IN_HASH_TABLE   (1 << 9)    /* this buffer is hashed */
 #define ARC_IO_IN_PROGRESS  (1 << 10)   /* I/O in progress for buf */
 #define ARC_IO_ERROR        (1 << 11)   /* I/O failed for buf */
 #define ARC_FREED_IN_READ   (1 << 12)   /* buf freed while in read */
+#define ARC_BUF_AVAILABLE   (1 << 13)   /* block not in active use */
 
+#define HDR_IN_HASH_TABLE(hdr)  ((hdr)->b_flags & ARC_IN_HASH_TABLE)
 #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
 #define HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_IO_ERROR)
 #define HDR_FREED_IN_READ(hdr)  ((hdr)->b_flags & ARC_FREED_IN_READ)
+#define HDR_BUF_AVAILABLE(hdr)  ((hdr)->b_flags & ARC_BUF_AVAILABLE)
 
 /*
  * Hash table routines
@@ -353,6 +366,7 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
     arc_buf_hdr_t *fbuf;
     uint32_t max, i;
 
+    ASSERT(!HDR_IN_HASH_TABLE(buf));
     fbufs_lastthread = curthread;
     *lockp = hash_lock;
     mutex_enter(hash_lock);
@@ -366,6 +380,7 @@ buf_hash_insert(arc_buf_hdr_t *buf, kmutex_t **lockp)
 
     buf->b_hash_next = buf_hash_table.ht_table[idx];
     buf_hash_table.ht_table[idx] = buf;
+    buf->b_flags |= ARC_IN_HASH_TABLE;
 
     /* collect some hash table performance data */
     if (i > 0) {
@@ -391,6 +406,7 @@ buf_hash_remove(arc_buf_hdr_t *buf)
     uint64_t idx = BUF_HASH_INDEX(buf->b_spa, &buf->b_dva, buf->b_birth);
 
     ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx)));
+    ASSERT(HDR_IN_HASH_TABLE(buf));
 
     bufp = &buf_hash_table.ht_table[idx];
     while ((fbuf = *bufp) != buf) {
@@ -399,6 +415,7 @@ buf_hash_remove(arc_buf_hdr_t *buf)
     }
     *bufp = buf->b_hash_next;
     buf->b_hash_next = NULL;
+    buf->b_flags &= ~ARC_IN_HASH_TABLE;
 
     /* collect some hash table performance data */
     atomic_add_64(&arc.hash_elements, -1);
@@ -456,6 +473,7 @@ hdr_dest(void *vbuf, void *unused)
     cv_destroy(&buf->b_cv);
 }
 
+static int arc_reclaim_needed(void);
 void arc_kmem_reclaim(void);
 
 /*
@@ -466,27 +484,33 @@ static void
 hdr_recl(void *unused)
 {
     dprintf("hdr_recl called\n");
-    arc_kmem_reclaim();
+    if (arc_reclaim_needed())
+        arc_kmem_reclaim();
 }
 
 static void
 buf_init(void)
 {
     uint64_t *ct;
-    uint64_t hsize = 1ULL << 10;
+    uint64_t hsize = 1ULL << 12;
     int i, j;
 
     /*
      * The hash table is big enough to fill all of physical memory
-     * with an average 4k block size.  The table will take up
-     * totalmem*sizeof(void*)/4k bytes (eg. 2MB/GB with 8-byte
-     * pointers).
+     * with an average 64K block size.  The table will take up
+     * totalmem*sizeof(void*)/64K (eg. 128KB/GB with 8-byte pointers).
      */
-    while (hsize * 4096 < physmem * PAGESIZE)
+    while (hsize * 65536 < physmem * PAGESIZE)
         hsize <<= 1;
-
+retry:
     buf_hash_table.ht_mask = hsize - 1;
-    buf_hash_table.ht_table = kmem_zalloc(hsize * sizeof (void*), KM_SLEEP);
+    buf_hash_table.ht_table =
+        kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP);
+    if (buf_hash_table.ht_table == NULL) {
+        ASSERT(hsize > (1ULL << 8));
+        hsize >>= 1;
+        goto retry;
+    }
 
     hdr_cache = kmem_cache_create("arc_buf_hdr_t", sizeof (arc_buf_hdr_t),
         0, hdr_cons, hdr_dest, hdr_recl, NULL, NULL, 0);
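The reworked buf_init() sizes the hash table for an average 64K block instead of 4K, and, now that it allocates with KM_NOSLEEP, halves the table until the allocation succeeds. A standalone arithmetic sketch of that policy follows; the 2GB physmem figure and all names are made up for illustration:

```c
/*
 * Sketch of the buf_init() sizing policy above: pick the smallest
 * power of two such that one hash slot covers an average 64K block,
 * and halve it on allocation failure (mimicking KM_NOSLEEP).
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
    uint64_t physmem = 2ULL * 1024 * 1024 * 1024;   /* assume 2GB */
    uint64_t hsize = 1ULL << 12;
    void **table;

    while (hsize * 65536 < physmem)
        hsize <<= 1;
retry:
    table = calloc(hsize, sizeof (void *));
    if (table == NULL) {        /* fall back like the KM_NOSLEEP path */
        hsize >>= 1;
        goto retry;
    }
    /* 2GB / 64K = 32768 slots -> 256KB of pointers on LP64 */
    printf("hsize=%llu (%llu KB of table)\n",
        (unsigned long long)hsize,
        (unsigned long long)(hsize * sizeof (void *) / 1024));
    free(table);
    return (0);
}
```

This matches the updated comment's figure of roughly 128KB of table per GB of memory with 8-byte pointers.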
@@ -505,8 +529,6 @@ buf_init(void)
 
 #define ARC_MINTIME (hz>>4) /* 62 ms */
 
-#define ARC_TAG (void *)0x05201962
-
 static void
 add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 {
@@ -514,14 +536,21 @@ add_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 
     if ((refcount_add(&ab->b_refcnt, tag) == 1) &&
         (ab->b_state != arc.anon)) {
+        int delta = ab->b_size * ab->b_datacnt;
 
         ASSERT(!MUTEX_HELD(&ab->b_state->mtx));
         mutex_enter(&ab->b_state->mtx);
-        ASSERT(!refcount_is_zero(&ab->b_refcnt));
+        ASSERT(refcount_count(&ab->b_refcnt) > 0);
         ASSERT(list_link_active(&ab->b_arc_node));
        list_remove(&ab->b_state->list, ab);
-        ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
-        ab->b_state->lsize -= ab->b_size;
+        if (GHOST_STATE(ab->b_state)) {
+            ASSERT3U(ab->b_datacnt, ==, 0);
+            ASSERT3P(ab->b_buf, ==, NULL);
+            delta = ab->b_size;
+        }
+        ASSERT(delta > 0);
+        ASSERT3U(ab->b_state->lsize, >=, delta);
+        atomic_add_64(&ab->b_state->lsize, -delta);
        mutex_exit(&ab->b_state->mtx);
     }
 }
@@ -531,7 +560,8 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
 {
     int cnt;
 
-    ASSERT(MUTEX_HELD(hash_lock));
+    ASSERT(ab->b_state == arc.anon || MUTEX_HELD(hash_lock));
+    ASSERT(!GHOST_STATE(ab->b_state));
 
     if (((cnt = refcount_remove(&ab->b_refcnt, tag)) == 0) &&
         (ab->b_state != arc.anon)) {
@@ -540,8 +570,9 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
        mutex_enter(&ab->b_state->mtx);
        ASSERT(!list_link_active(&ab->b_arc_node));
        list_insert_head(&ab->b_state->list, ab);
-        ASSERT(ab->b_buf != NULL);
-        ab->b_state->lsize += ab->b_size;
+        ASSERT(ab->b_datacnt > 0);
+        atomic_add_64(&ab->b_state->lsize, ab->b_size * ab->b_datacnt);
+        ASSERT3U(ab->b_state->size, >=, ab->b_state->lsize);
        mutex_exit(&ab->b_state->mtx);
     }
     return (cnt);
@@ -552,49 +583,70 @@ remove_reference(arc_buf_hdr_t *ab, kmutex_t *hash_lock, void *tag)
  * for the buffer must be held by the caller.
  */
 static void
-arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
-    kmutex_t *hash_lock)
+arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
 {
-    arc_buf_t *buf;
+    arc_state_t *old_state = ab->b_state;
+    int refcnt = refcount_count(&ab->b_refcnt);
+    int from_delta, to_delta;
 
     ASSERT(MUTEX_HELD(hash_lock));
+    ASSERT(new_state != old_state);
+    ASSERT(refcnt == 0 || ab->b_datacnt > 0);
+    ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state));
+
+    from_delta = to_delta = ab->b_datacnt * ab->b_size;
 
     /*
      * If this buffer is evictable, transfer it from the
      * old state list to the new state list.
     */
-    if (refcount_is_zero(&ab->b_refcnt)) {
-        if (ab->b_state != arc.anon) {
-            int drop_mutex = FALSE;
+    if (refcnt == 0) {
+        if (old_state != arc.anon) {
+            int use_mutex = !MUTEX_HELD(&old_state->mtx);
+
+            if (use_mutex)
+                mutex_enter(&old_state->mtx);
 
-            if (!MUTEX_HELD(&ab->b_state->mtx)) {
-                mutex_enter(&ab->b_state->mtx);
-                drop_mutex = TRUE;
-            }
            ASSERT(list_link_active(&ab->b_arc_node));
-            list_remove(&ab->b_state->list, ab);
-            ASSERT3U(ab->b_state->lsize, >=, ab->b_size);
-            ab->b_state->lsize -= ab->b_size;
-            if (drop_mutex)
-                mutex_exit(&ab->b_state->mtx);
+            list_remove(&old_state->list, ab);
+
+            /* ghost elements have a ghost size */
+            if (GHOST_STATE(old_state)) {
+                ASSERT(ab->b_datacnt == 0);
+                ASSERT(ab->b_buf == NULL);
+                from_delta = ab->b_size;
+            }
+            ASSERT3U(old_state->lsize, >=, from_delta);
+            atomic_add_64(&old_state->lsize, -from_delta);
+
+            if (use_mutex)
+                mutex_exit(&old_state->mtx);
        }
        if (new_state != arc.anon) {
-            int drop_mutex = FALSE;
+            int use_mutex = !MUTEX_HELD(&new_state->mtx);
 
-            if (!MUTEX_HELD(&new_state->mtx)) {
+            if (use_mutex)
                mutex_enter(&new_state->mtx);
-                drop_mutex = TRUE;
-            }
+
            list_insert_head(&new_state->list, ab);
-            ASSERT(ab->b_buf != NULL);
-            new_state->lsize += ab->b_size;
-            if (drop_mutex)
+
+            /* ghost elements have a ghost size */
+            if (GHOST_STATE(new_state)) {
+                ASSERT(ab->b_datacnt == 0);
+                ASSERT(ab->b_buf == NULL);
+                to_delta = ab->b_size;
+            }
+            atomic_add_64(&new_state->lsize, to_delta);
+            ASSERT3U(new_state->size + to_delta, >=,
+                new_state->lsize);
+
+            if (use_mutex)
                mutex_exit(&new_state->mtx);
        }
     }
 
     ASSERT(!BUF_EMPTY(ab));
-    if (new_state == arc.anon && ab->b_state != arc.anon) {
+    if (new_state == arc.anon && old_state != arc.anon) {
        buf_hash_remove(ab);
     }
 
@@ -602,22 +654,16 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab,
     /*
      * If this buffer isn't being transferred to the MRU-top
      * state, it's safe to clear its prefetch flag
     */
-    if ((new_state != arc.mru_top) && (new_state != arc.mru_bot)) {
+    if ((new_state != arc.mru) && (new_state != arc.mru_ghost)) {
        ab->b_flags &= ~ARC_PREFETCH;
     }
 
-    buf = ab->b_buf;
-    if (buf == NULL) {
-        ASSERT3U(ab->b_state->size, >=, ab->b_size);
-        atomic_add_64(&ab->b_state->size, -ab->b_size);
-        /* we should only be here if we are deleting state */
-        ASSERT(new_state == arc.anon &&
-            (ab->b_state == arc.mru_bot || ab->b_state == arc.mfu_bot));
-    } else while (buf) {
-        ASSERT3U(ab->b_state->size, >=, ab->b_size);
-        atomic_add_64(&ab->b_state->size, -ab->b_size);
-        atomic_add_64(&new_state->size, ab->b_size);
-        buf = buf->b_next;
+    /* adjust state sizes */
+    if (to_delta)
+        atomic_add_64(&new_state->size, to_delta);
+    if (from_delta) {
+        ASSERT3U(old_state->size, >=, from_delta);
+        atomic_add_64(&old_state->size, -from_delta);
     }
     ab->b_state = new_state;
 }
@@ -637,9 +683,12 @@ arc_buf_alloc(spa_t *spa, int size, void *tag)
     hdr->b_arc_access = 0;
     buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
     buf->b_hdr = hdr;
+    buf->b_efunc = NULL;
+    buf->b_private = NULL;
     buf->b_next = NULL;
     buf->b_data = zio_buf_alloc(size);
     hdr->b_buf = buf;
+    hdr->b_datacnt = 1;
     hdr->b_flags = 0;
     ASSERT(refcount_is_zero(&hdr->b_refcnt));
     (void) refcount_add(&hdr->b_refcnt, tag);
@@ -650,35 +699,124 @@ arc_buf_alloc(spa_t *spa, int size, void *tag)
     return (buf);
 }
 
+static void *
+arc_data_copy(arc_buf_hdr_t *hdr, void *old_data)
+{
+    void *new_data = zio_buf_alloc(hdr->b_size);
+
+    atomic_add_64(&arc.size, hdr->b_size);
+    bcopy(old_data, new_data, hdr->b_size);
+    atomic_add_64(&hdr->b_state->size, hdr->b_size);
+    if (list_link_active(&hdr->b_arc_node)) {
+        ASSERT(refcount_is_zero(&hdr->b_refcnt));
+        atomic_add_64(&hdr->b_state->lsize, hdr->b_size);
+    }
+    return (new_data);
+}
+
+void
+arc_buf_add_ref(arc_buf_t *buf, void* tag)
+{
+    arc_buf_hdr_t *hdr;
+    kmutex_t *hash_lock;
+
+    mutex_enter(&arc_eviction_mtx);
+    hdr = buf->b_hdr;
+    if (buf->b_data == NULL) {
+        /*
+         * This buffer is evicted.
         */
+        mutex_exit(&arc_eviction_mtx);
+        return;
+    } else {
+        /*
+         * Prevent this buffer from being evicted
+         * while we add a reference.
         */
+        buf->b_hdr = NULL;
+    }
+    mutex_exit(&arc_eviction_mtx);
+
+    ASSERT(hdr->b_state != arc.anon);
+    hash_lock = HDR_LOCK(hdr);
+    mutex_enter(hash_lock);
+    ASSERT(!GHOST_STATE(hdr->b_state));
+    buf->b_hdr = hdr;
+    add_reference(hdr, hash_lock, tag);
+    arc_access_and_exit(hdr, hash_lock);
+    atomic_add_64(&arc.hits, 1);
+}
+
+static void
+arc_buf_destroy(arc_buf_t *buf, boolean_t all)
+{
+    arc_buf_t **bufp;
+
+    /* free up data associated with the buf */
+    if (buf->b_data) {
+        arc_state_t *state = buf->b_hdr->b_state;
+        uint64_t size = buf->b_hdr->b_size;
+
+        zio_buf_free(buf->b_data, size);
+        atomic_add_64(&arc.size, -size);
+        if (list_link_active(&buf->b_hdr->b_arc_node)) {
+            ASSERT(refcount_is_zero(&buf->b_hdr->b_refcnt));
+            ASSERT(state != arc.anon);
+            ASSERT3U(state->lsize, >=, size);
+            atomic_add_64(&state->lsize, -size);
+        }
+        ASSERT3U(state->size, >=, size);
+        atomic_add_64(&state->size, -size);
+        buf->b_data = NULL;
+        ASSERT(buf->b_hdr->b_datacnt > 0);
+        buf->b_hdr->b_datacnt -= 1;
+    }
+
+    /* only remove the buf if requested */
+    if (!all)
+        return;
+
+    /* remove the buf from the hdr list */
+    for (bufp = &buf->b_hdr->b_buf; *bufp != buf; bufp = &(*bufp)->b_next)
+        continue;
+    *bufp = buf->b_next;
+
+    ASSERT(buf->b_efunc == NULL);
+
+    /* clean up the buf */
+    buf->b_hdr = NULL;
+    kmem_cache_free(buf_cache, buf);
+}
+
 static void
-arc_hdr_free(arc_buf_hdr_t *hdr)
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
 {
     ASSERT(refcount_is_zero(&hdr->b_refcnt));
     ASSERT3P(hdr->b_state, ==, arc.anon);
+    ASSERT(!HDR_IO_IN_PROGRESS(hdr));
 
     if (!BUF_EMPTY(hdr)) {
-        /*
-         * We can be called with an arc state lock held,
-         * so we can't hold a hash lock here.
-         * ASSERT(not in hash table)
-         */
-        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+        ASSERT(!HDR_IN_HASH_TABLE(hdr));
        bzero(&hdr->b_dva, sizeof (dva_t));
        hdr->b_birth = 0;
        hdr->b_cksum0 = 0;
     }
-    if (hdr->b_buf) {
+    while (hdr->b_buf) {
        arc_buf_t *buf = hdr->b_buf;
 
-        ASSERT3U(hdr->b_size, >, 0);
-        zio_buf_free(buf->b_data, hdr->b_size);
-        atomic_add_64(&arc.size, -hdr->b_size);
-        ASSERT3U(arc.anon->size, >=, hdr->b_size);
-        atomic_add_64(&arc.anon->size, -hdr->b_size);
-        ASSERT3P(buf->b_next, ==, NULL);
-        kmem_cache_free(buf_cache, buf);
-        hdr->b_buf = NULL;
+        if (buf->b_efunc) {
+            mutex_enter(&arc_eviction_mtx);
+            ASSERT(buf->b_hdr != NULL);
+            arc_buf_destroy(hdr->b_buf, FALSE);
+            hdr->b_buf = buf->b_next;
+            buf->b_next = arc_eviction_list;
+            arc_eviction_list = buf;
+            mutex_exit(&arc_eviction_mtx);
+        } else {
+            arc_buf_destroy(hdr->b_buf, TRUE);
+        }
     }
+
     ASSERT(!list_link_active(&hdr->b_arc_node));
     ASSERT3P(hdr->b_hash_next, ==, NULL);
     ASSERT3P(hdr->b_acb, ==, NULL);
@@ -689,36 +827,73 @@ void
 arc_buf_free(arc_buf_t *buf, void *tag)
 {
     arc_buf_hdr_t *hdr = buf->b_hdr;
-    kmutex_t *hash_lock = HDR_LOCK(hdr);
-    int freeable;
+    int hashed = hdr->b_state != arc.anon;
 
-    mutex_enter(hash_lock);
-    if (remove_reference(hdr, hash_lock, tag) > 0) {
-        arc_buf_t **bufp = &hdr->b_buf;
-        arc_state_t *state = hdr->b_state;
-        uint64_t size = hdr->b_size;
-
-        ASSERT(hdr->b_state != arc.anon || HDR_IO_ERROR(hdr));
-        while (*bufp != buf) {
-            ASSERT(*bufp);
-            bufp = &(*bufp)->b_next;
-        }
-        *bufp = buf->b_next;
+    ASSERT(buf->b_efunc == NULL);
+    ASSERT(buf->b_data != NULL);
+
+    if (hashed) {
+        kmutex_t *hash_lock = HDR_LOCK(hdr);
+
+        mutex_enter(hash_lock);
+        (void) remove_reference(hdr, hash_lock, tag);
+        if (hdr->b_datacnt > 1)
+            arc_buf_destroy(buf, TRUE);
+        else
+            hdr->b_flags |= ARC_BUF_AVAILABLE;
        mutex_exit(hash_lock);
-        zio_buf_free(buf->b_data, size);
-        atomic_add_64(&arc.size, -size);
-        kmem_cache_free(buf_cache, buf);
-        ASSERT3U(state->size, >=, size);
-        atomic_add_64(&state->size, -size);
-        return;
+    } else if (HDR_IO_IN_PROGRESS(hdr)) {
+        int destroy_hdr;
+        /*
+         * We are in the middle of an async write.  Don't destroy
+         * this buffer unless the write completes before we finish
+         * decrementing the reference count.
         */
+        mutex_enter(&arc_eviction_mtx);
+        (void) remove_reference(hdr, NULL, tag);
+        ASSERT(refcount_is_zero(&hdr->b_refcnt));
+        destroy_hdr = !HDR_IO_IN_PROGRESS(hdr);
+        mutex_exit(&arc_eviction_mtx);
+        if (destroy_hdr)
+            arc_hdr_destroy(hdr);
+    } else {
+        if (remove_reference(hdr, NULL, tag) > 0) {
+            ASSERT(HDR_IO_ERROR(hdr));
+            arc_buf_destroy(buf, TRUE);
+        } else {
+            arc_hdr_destroy(hdr);
+        }
     }
+}
 
-    /* don't free buffers that are in the middle of an async write */
-    freeable = (hdr->b_state == arc.anon && hdr->b_acb == NULL);
-    mutex_exit(hash_lock);
+int
+arc_buf_remove_ref(arc_buf_t *buf, void* tag)
+{
+    arc_buf_hdr_t *hdr = buf->b_hdr;
+    kmutex_t *hash_lock = HDR_LOCK(hdr);
+    int no_callback = (buf->b_efunc == NULL);
 
-    if (freeable)
-        arc_hdr_free(hdr);
+    if (hdr->b_state == arc.anon) {
+        arc_buf_free(buf, tag);
+        return (no_callback);
+    }
+
+    mutex_enter(hash_lock);
+    ASSERT(hdr->b_state != arc.anon);
+    ASSERT(buf->b_data != NULL);
+
+    (void) remove_reference(hdr, hash_lock, tag);
+    if (hdr->b_datacnt > 1) {
+        if (no_callback)
+            arc_buf_destroy(buf, TRUE);
+    } else if (no_callback) {
+        ASSERT(hdr->b_buf == buf && buf->b_next == NULL);
+        hdr->b_flags |= ARC_BUF_AVAILABLE;
+    }
+    ASSERT(no_callback || hdr->b_datacnt > 1 ||
+        refcount_is_zero(&hdr->b_refcnt));
+    mutex_exit(hash_lock);
+    return (no_callback);
 }
 
 int
@@ -732,19 +907,16 @@ arc_buf_size(arc_buf_t *buf)
  * bytes.  Move the removed buffers to the appropriate evict state.
 */
 static uint64_t
-arc_evict_state(arc_state_t *state, int64_t bytes)
+arc_evict(arc_state_t *state, int64_t bytes)
 {
     arc_state_t *evicted_state;
-    uint64_t bytes_evicted = 0;
+    uint64_t bytes_evicted = 0, skipped = 0;
     arc_buf_hdr_t *ab, *ab_prev;
     kmutex_t *hash_lock;
 
-    ASSERT(state == arc.mru_top || state == arc.mfu_top);
+    ASSERT(state == arc.mru || state == arc.mfu);
 
-    if (state == arc.mru_top)
-        evicted_state = arc.mru_bot;
-    else
-        evicted_state = arc.mfu_bot;
+    evicted_state = (state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;
 
     mutex_enter(&state->mtx);
     mutex_enter(&evicted_state->mtx);
@@ -754,19 +926,42 @@ arc_evict_state(arc_state_t *state, int64_t bytes)
        hash_lock = HDR_LOCK(ab);
        if (mutex_tryenter(hash_lock)) {
            ASSERT3U(refcount_count(&ab->b_refcnt), ==, 0);
+            ASSERT(ab->b_datacnt > 0);
+            while (ab->b_buf) {
+                arc_buf_t *buf = ab->b_buf;
+                if (buf->b_data)
+                    bytes_evicted += ab->b_size;
+                if (buf->b_efunc) {
+                    mutex_enter(&arc_eviction_mtx);
+                    /*
+                     * arc_buf_add_ref() could derail
+                     * this eviction.
                     */
+                    if (buf->b_hdr == NULL) {
+                        mutex_exit(&arc_eviction_mtx);
+                        mutex_exit(hash_lock);
+                        goto skip;
+                    }
+                    arc_buf_destroy(buf, FALSE);
+                    ab->b_buf = buf->b_next;
+                    buf->b_next = arc_eviction_list;
+                    arc_eviction_list = buf;
+                    mutex_exit(&arc_eviction_mtx);
+                } else {
+                    arc_buf_destroy(buf, TRUE);
+                }
+            }
+            ASSERT(ab->b_datacnt == 0);
            arc_change_state(evicted_state, ab, hash_lock);
-            zio_buf_free(ab->b_buf->b_data, ab->b_size);
-            atomic_add_64(&arc.size, -ab->b_size);
-            ASSERT3P(ab->b_buf->b_next, ==, NULL);
-            kmem_cache_free(buf_cache, ab->b_buf);
-            ab->b_buf = NULL;
+            ASSERT(HDR_IN_HASH_TABLE(ab));
+            ab->b_flags = ARC_IN_HASH_TABLE;
            DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
-            bytes_evicted += ab->b_size;
            mutex_exit(hash_lock);
-            if (bytes_evicted >= bytes)
+            if (bytes >= 0 && bytes_evicted >= bytes)
                break;
        } else {
-            atomic_add_64(&arc.skipped, 1);
+skip:
+            skipped += 1;
        }
     }
     mutex_exit(&evicted_state->mtx);
@@ -776,6 +971,9 @@ arc_evict_state(arc_state_t *state, int64_t bytes)
        dprintf("only evicted %lld bytes from %x",
            (longlong_t)bytes_evicted, state);
 
+    atomic_add_64(&arc.skipped, skipped);
+    if (bytes < 0)
+        return (skipped);
 
     return (bytes_evicted);
 }
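Note what arc_evict() (formerly arc_evict_state) now leaves behind: the data buffers are destroyed, but the header moves to a ghost list (mru_ghost/mfu_ghost) that remembers only the block size. A toy illustration of that bookkeeping — every name here is invented, and this is a model of the idea, not the kernel code:

```c
/*
 * Mock of ghost-list bookkeeping: evicting a block frees its data but
 * keeps a small header recording its size, so a later "ghost hit" can
 * tell the cache it guessed wrong about what to keep.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct ghost_hdr {
    uint64_t size;              /* like b_size: the "ghost size" */
    struct ghost_hdr *next;
} ghost_hdr_t;

static ghost_hdr_t *ghost_list;
static uint64_t cached_bytes, ghost_bytes;

static void
evict_to_ghost(void *data, uint64_t size)
{
    ghost_hdr_t *h = malloc(sizeof (*h));

    free(data);                 /* the data itself is gone... */
    cached_bytes -= size;
    h->size = size;             /* ...but its size is remembered */
    h->next = ghost_list;
    ghost_list = h;
    ghost_bytes += size;        /* the ghost state's "size" accounting */
}

int
main(void)
{
    void *blk = malloc(65536);

    cached_bytes = 65536;
    evict_to_ghost(blk, 65536);
    /* a later hit on this entry is a ghost hit: see arc_adapt() below */
    printf("cached=%llu ghost=%llu\n",
        (unsigned long long)cached_bytes,
        (unsigned long long)ghost_bytes);
    return (0);
}
```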
@@ -784,25 +982,27 @@ arc_evict_state(arc_state_t *state, int64_t bytes)
  * bytes.  Destroy the buffers that are removed.
 */
 static void
-arc_delete_state(arc_state_t *state, int64_t bytes)
+arc_evict_ghost(arc_state_t *state, int64_t bytes)
 {
-    uint_t bufs_skipped = 0;
-    uint64_t bytes_deleted = 0;
     arc_buf_hdr_t *ab, *ab_prev;
     kmutex_t *hash_lock;
+    uint64_t bytes_deleted = 0;
+    uint_t bufs_skipped = 0;
 
+    ASSERT(GHOST_STATE(state));
 top:
     mutex_enter(&state->mtx);
     for (ab = list_tail(&state->list); ab; ab = ab_prev) {
        ab_prev = list_prev(&state->list, ab);
        hash_lock = HDR_LOCK(ab);
        if (mutex_tryenter(hash_lock)) {
+            ASSERT(ab->b_buf == NULL);
            arc_change_state(arc.anon, ab, hash_lock);
            mutex_exit(hash_lock);
            atomic_add_64(&arc.deleted, 1);
-            DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
            bytes_deleted += ab->b_size;
-            arc_hdr_free(ab);
+            arc_hdr_destroy(ab);
+            DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
            if (bytes >= 0 && bytes_deleted >= bytes)
                break;
        } else {
@@ -832,41 +1032,62 @@
 arc_adjust(void)
 {
     int64_t top_sz, mru_over, arc_over;
 
-    top_sz = arc.anon->size + arc.mru_top->size;
+    top_sz = arc.anon->size + arc.mru->size;
 
-    if (top_sz > arc.p && arc.mru_top->lsize > 0) {
-        int64_t toevict = MIN(arc.mru_top->lsize, top_sz-arc.p);
-        (void) arc_evict_state(arc.mru_top, toevict);
-        top_sz = arc.anon->size + arc.mru_top->size;
+    if (top_sz > arc.p && arc.mru->lsize > 0) {
+        int64_t toevict = MIN(arc.mru->lsize, top_sz-arc.p);
+        (void) arc_evict(arc.mru, toevict);
+        top_sz = arc.anon->size + arc.mru->size;
     }
 
-    mru_over = top_sz + arc.mru_bot->size - arc.c;
+    mru_over = top_sz + arc.mru_ghost->size - arc.c;
 
     if (mru_over > 0) {
-        if (arc.mru_bot->lsize > 0) {
-            int64_t todelete = MIN(arc.mru_bot->lsize, mru_over);
-            arc_delete_state(arc.mru_bot, todelete);
+        if (arc.mru_ghost->lsize > 0) {
+            int64_t todelete = MIN(arc.mru_ghost->lsize, mru_over);
+            arc_evict_ghost(arc.mru_ghost, todelete);
        }
     }
 
     if ((arc_over = arc.size - arc.c) > 0) {
-        int64_t table_over;
+        int64_t tbl_over;
 
-        if (arc.mfu_top->lsize > 0) {
-            int64_t toevict = MIN(arc.mfu_top->lsize, arc_over);
-            (void) arc_evict_state(arc.mfu_top, toevict);
+        if (arc.mfu->lsize > 0) {
+            int64_t toevict = MIN(arc.mfu->lsize, arc_over);
+            (void) arc_evict(arc.mfu, toevict);
        }
 
-        table_over = arc.size + arc.mru_bot->lsize + arc.mfu_bot->lsize
-            - arc.c*2;
+        tbl_over = arc.size + arc.mru_ghost->lsize +
+            arc.mfu_ghost->lsize - arc.c*2;
 
-        if (table_over > 0 && arc.mfu_bot->lsize > 0) {
-            int64_t todelete = MIN(arc.mfu_bot->lsize, table_over);
-            arc_delete_state(arc.mfu_bot, todelete);
+        if (tbl_over > 0 && arc.mfu_ghost->lsize > 0) {
+            int64_t todelete = MIN(arc.mfu_ghost->lsize, tbl_over);
+            arc_evict_ghost(arc.mfu_ghost, todelete);
        }
     }
 }
 
+static void
+arc_do_user_evicts(void)
+{
+    mutex_enter(&arc_eviction_mtx);
+    while (arc_eviction_list != NULL) {
+        arc_buf_t *buf = arc_eviction_list;
+        arc_eviction_list = buf->b_next;
+        buf->b_hdr = NULL;
+        mutex_exit(&arc_eviction_mtx);
+
+        ASSERT(buf->b_efunc != NULL);
+        VERIFY(buf->b_efunc(buf) == 0);
+
+        buf->b_efunc = NULL;
+        buf->b_private = NULL;
+        kmem_cache_free(buf_cache, buf);
+        mutex_enter(&arc_eviction_mtx);
+    }
+    mutex_exit(&arc_eviction_mtx);
+}
+
 /*
  * Flush all *evictable* data from the cache.
  * NOTE: this will not touch "active" (i.e. referenced) data.
@@ -874,17 +1095,22 @@ arc_adjust(void)
 void
 arc_flush(void)
 {
-    arc_delete_state(arc.mru_top, -1);
-    arc_delete_state(arc.mfu_top, -1);
+    while (arc_evict(arc.mru, -1));
+    while (arc_evict(arc.mfu, -1));
 
-    arc_delete_state(arc.mru_bot, -1);
-    arc_delete_state(arc.mfu_bot, -1);
+    arc_evict_ghost(arc.mru_ghost, -1);
+    arc_evict_ghost(arc.mfu_ghost, -1);
+
+    mutex_enter(&arc_reclaim_thr_lock);
+    arc_do_user_evicts();
+    mutex_exit(&arc_reclaim_thr_lock);
+    ASSERT(arc_eviction_list == NULL);
 }
 
 void
 arc_kmem_reclaim(void)
 {
-    /* Remove 6.25% */
+    /* Remove 12.5% */
     /*
      * We need arc_reclaim_lock because we don't want multiple
      * threads trying to reclaim concurrently.
@@ -898,19 +1124,23 @@ arc_kmem_reclaim(void)
     if (arc_dead)
        return;
 
+    if (arc.c <= arc.c_min)
+        return;
+
     mutex_enter(&arc_reclaim_lock);
 
-    atomic_add_64(&arc.c, -(arc.c >> 4));
+    atomic_add_64(&arc.c, -(arc.c >> 3));
+    atomic_add_64(&arc.p, -(arc.p >> 3));
+    if (arc.c > arc.size)
+        arc.c = arc.size;
     if (arc.c < arc.c_min)
        arc.c = arc.c_min;
-    atomic_add_64(&arc.p, -(arc.p >> 4));
+    if (arc.p > arc.c)
+        arc.p = (arc.c >> 1);
+    ASSERT((int64_t)arc.p >= 0);
 
     arc_adjust();
 
-    /* Cool it for a while */
-    arc.incr = 0;
-    arc.size_check = arc_size_check_default << 3;
-
     mutex_exit(&arc_reclaim_lock);
 }
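arc_kmem_reclaim() now sheds 12.5% (c >> 3) of both the cache target c and the MRU target p instead of 6.25%, and clamps the results. A small worked example of that arithmetic, with made-up starting values:

```c
/*
 * Arithmetic sketch of the new arc_kmem_reclaim() policy above.
 * Starting values are illustrative only.
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
    uint64_t c = 1024ULL * 1024 * 1024;     /* 1GB cache target */
    uint64_t p = c / 2;                     /* MRU target */
    uint64_t size = 900ULL * 1024 * 1024;   /* actual arc size */
    uint64_t c_min = 64ULL * 1024 * 1024;

    c -= c >> 3;        /* shed 12.5%; the old code used c >> 4 (6.25%) */
    p -= p >> 3;
    if (c > size)       /* don't target more than we actually hold */
        c = size;
    if (c < c_min)
        c = c_min;
    if (p > c)          /* keep the MRU target inside the cache target */
        p = c >> 1;
    printf("c=%lluMB p=%lluMB\n",
        (unsigned long long)(c >> 20), (unsigned long long)(p >> 20));
    return (0);
}
```

With these inputs the targets land at c=896MB and p=448MB: a single reclaim pass is noticeably more aggressive than before, which is why the new `arc.c <= arc.c_min` early return matters.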
@@ -985,16 +1215,11 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
 #endif
 
     /*
-     * an agressive reclamation will shrink the cache size as well as reap
-     * free kmem buffers.  The arc_kmem_reclaim function is called when the
-     * header-cache is reaped, so we only reap the header cache if we're
-     * performing an agressive reclaim.  If we're not, just clean the kmem
-     * buffer caches.
+     * An agressive reclamation will shrink the cache size as well as
+     * reap free buffers from the arc kmem caches.
     */
     if (strat == ARC_RECLAIM_AGGR)
-        kmem_cache_reap_now(hdr_cache);
-
-    kmem_cache_reap_now(buf_cache);
+        arc_kmem_reclaim();
 
     for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
        if (zio_buf_cache[i] != prev_cache) {
@@ -1002,6 +1227,8 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
            prev_cache = zio_buf_cache[i];
            kmem_cache_reap_now(zio_buf_cache[i]);
        }
     }
+    kmem_cache_reap_now(buf_cache);
+    kmem_cache_reap_now(hdr_cache);
 }
 
 static void
@@ -1038,6 +1265,9 @@ arc_reclaim_thread(void)
            arc.no_grow = FALSE;
        }
 
+        if (arc_eviction_list != NULL)
+            arc_do_user_evicts();
+
        /* block until needed, or one second, whichever is shorter */
        CALLB_CPR_SAFE_BEGIN(&cpr);
        (void) cv_timedwait(&arc_reclaim_thr_cv,
@@ -1051,14 +1281,37 @@ arc_reclaim_thread(void)
     thread_exit();
 }
 
+/*
+ * Adapt arc info given the number of bytes we are trying to add and
+ * the state that we are comming from.  This function is only called
+ * when we are adding new content to the cache.
+ */
 static void
-arc_try_grow(int64_t bytes)
+arc_adapt(int bytes, arc_state_t *state)
 {
+    int mult;
+
+    ASSERT(bytes > 0);
     /*
-     * If we're within (2 * maxblocksize) bytes of the target
-     * cache size, increment the target cache size
+     * Adapt the target size of the MRU list:
+     *  - if we just hit in the MRU ghost list, then increase
+     *    the target size of the MRU list.
+     *  - if we just hit in the MFU ghost list, then increase
+     *    the target size of the MFU list by decreasing the
+     *    target size of the MRU list.
     */
-    atomic_add_64((uint64_t *)&arc.size_check, 1);
+    if (state == arc.mru_ghost) {
+        mult = ((arc.mru_ghost->size >= arc.mfu_ghost->size) ?
+            1 : (arc.mfu_ghost->size/arc.mru_ghost->size));
+
+        arc.p = MIN(arc.c, arc.p + bytes * mult);
+    } else if (state == arc.mfu_ghost) {
+        mult = ((arc.mfu_ghost->size >= arc.mru_ghost->size) ?
+            1 : (arc.mru_ghost->size/arc.mfu_ghost->size));
+
+        arc.p = MAX(0, (int64_t)arc.p - bytes * mult);
+    }
+    ASSERT((int64_t)arc.p >= 0);
 
     if (arc_reclaim_needed()) {
        cv_signal(&arc_reclaim_thr_cv);
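arc_adapt() is the heart of the new sizing logic: a hit in a ghost list nudges the MRU target p toward whichever side was wrongly evicted, scaled by the relative ghost-list sizes. A self-contained mock of just that rule — all values and names are arbitrary, and this is a paraphrase rather than the kernel code:

```c
/*
 * Sketch of the arc_adapt() balancing rule: MRU-ghost hits grow p
 * (favor recency), MFU-ghost hits shrink p (favor frequency), each
 * scaled by how lopsided the two ghost lists are.
 */
#include <stdint.h>
#include <stdio.h>

#define MIN(a, b)   ((a) < (b) ? (a) : (b))
#define MAX(a, b)   ((a) > (b) ? (a) : (b))

static uint64_t c = 1000, p = 500;          /* arbitrary units */
static uint64_t mru_ghost = 100, mfu_ghost = 300;

static void
adapt(int bytes, int mru_ghost_hit)
{
    int64_t mult;

    if (mru_ghost_hit) {    /* recency was being punished: grow p */
        mult = (mru_ghost >= mfu_ghost) ? 1 : (mfu_ghost / mru_ghost);
        p = MIN(c, p + (uint64_t)(bytes * mult));
    } else {                /* frequency was being punished: shrink p */
        mult = (mfu_ghost >= mru_ghost) ? 1 : (mru_ghost / mfu_ghost);
        p = (uint64_t)MAX(0, (int64_t)p - bytes * mult);
    }
}

int
main(void)
{
    adapt(10, 1);   /* MRU ghost hit: mult = 300/100 = 3, so p += 30 */
    printf("p after mru-ghost hit: %llu\n", (unsigned long long)p);
    adapt(10, 0);   /* MFU ghost hit: mult = 1, so p -= 10 */
    printf("p after mfu-ghost hit: %llu\n", (unsigned long long)p);
    return (0);
}
```

The asymmetry in `mult` means the smaller ghost list pushes harder: if MRU ghosts are rare but one gets hit anyway, that is strong evidence p is too small.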
@@ -1068,52 +1321,36 @@ arc_adapt(int bytes, arc_state_t *state)
     if (arc.no_grow)
        return;
 
+    if (arc.c >= arc.c_max)
+        return;
+
     /*
-     * return true if we successfully grow, or if there's enough space that
-     * we don't have to grow.  Above, we return false if we can't grow, or
-     * if we shouldn't because a reclaim is in progress.
+     * If we're within (2 * maxblocksize) bytes of the target
+     * cache size, increment the target cache size
     */
-    if ((arc.c - arc.size) <= (2ULL << SPA_MAXBLOCKSHIFT)) {
-        if (arc.size_check > 0) {
-            arc.size_check = arc_size_check_default;
-            atomic_add_64(&arc.incr, arc_incr_size);
-        }
-        atomic_add_64(&arc.c, MIN(bytes, arc.incr));
+    if (arc.size > arc.c - (2ULL << SPA_MAXBLOCKSHIFT)) {
+        atomic_add_64(&arc.c, (int64_t)bytes);
        if (arc.c > arc.c_max)
            arc.c = arc.c_max;
-        else
-            atomic_add_64(&arc.p, MIN(bytes, arc.incr));
-    } else if (arc.size > arc.c) {
-        if (arc.size_check > 0) {
-            arc.size_check = arc_size_check_default;
-            atomic_add_64(&arc.incr, arc_incr_size);
-        }
-        atomic_add_64(&arc.c, MIN(bytes, arc.incr));
-        if (arc.c > arc.c_max)
-            arc.c = arc.c_max;
-        else
-            atomic_add_64(&arc.p, MIN(bytes, arc.incr));
+        else if (state == arc.anon)
+            atomic_add_64(&arc.p, (int64_t)bytes);
+        if (arc.p > arc.c)
+            arc.p = arc.c;
     }
+    ASSERT((int64_t)arc.p >= 0);
 }
 
 /*
- * check if the cache has reached its limits and eviction is required prior to
- * insert.  In this situation, we want to evict if no_grow is set Otherwise, the
- * cache is either big enough that we can insert, or a arc_try_grow will result
- * in more space being made available.
+ * Check if the cache has reached its limits and eviction is required
+ * prior to insert.
 */
-
 static int
 arc_evict_needed()
 {
-
     if (arc_reclaim_needed())
        return (1);
 
-    if (arc.no_grow || (arc.c > arc.c_max) || (arc.size > arc.c))
-        return (1);
-
-    return (0);
+    return (arc.size > arc.c);
 }
 
@@ -1121,21 +1358,21 @@ arc_evict_needed()
  * inserted on its behalf. So, determine which cache must be victimized to
  * satisfy an insertion for this state.  We have the following cases:
 *
- * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru_top) ->
+ * 1. Insert for MRU, p > sizeof(arc.anon + arc.mru) ->
  * In this situation if we're out of space, but the resident size of the MFU is
  * under the limit, victimize the MFU cache to satisfy this insertion request.
 *
- * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru_top) ->
+ * 2. Insert for MRU, p <= sizeof(arc.anon + arc.mru) ->
  * Here, we've used up all of the available space for the MRU, so we need to
  * evict from our own cache instead.  Evict from the set of resident MRU
  * entries.
 *
- * 3. Insert for MFU (c - p) > sizeof(arc.mfu_top) ->
+ * 3. Insert for MFU (c - p) > sizeof(arc.mfu) ->
  * c minus p represents the MFU space in the cache, since p is the size of the
  * cache that is dedicated to the MRU.  In this situation there's still space on
  * the MFU side, so the MRU side needs to be victimized.
 *
- * 4. Insert for MFU (c - p) < sizeof(arc.mfu_top) ->
+ * 4. Insert for MFU (c - p) < sizeof(arc.mfu) ->
  * MFU's resident set is consuming more space than it has been allotted.  In
  * this situation, we must victimize our own cache, the MFU, for this insertion.
 */
@@ -1146,35 +1383,35 @@ arc_evict_for_state(arc_state_t *state, uint64_t bytes)
     uint64_t mfu_space;
     uint64_t evicted;
 
-    ASSERT(state == arc.mru_top || state == arc.mfu_top);
+    ASSERT(state == arc.mru || state == arc.mfu);
 
-    if (state == arc.mru_top) {
-        mru_used = arc.anon->size + arc.mru_top->size;
+    if (state == arc.mru) {
+        mru_used = arc.anon->size + arc.mru->size;
        if (arc.p > mru_used) {
            /* case 1 */
-            evicted = arc_evict_state(arc.mfu_top, bytes);
+            evicted = arc_evict(arc.mfu, bytes);
            if (evicted < bytes) {
                arc_adjust();
            }
        } else {
            /* case 2 */
-            evicted = arc_evict_state(arc.mru_top, bytes);
+            evicted = arc_evict(arc.mru, bytes);
            if (evicted < bytes) {
                arc_adjust();
            }
        }
     } else {
-        /* MFU_top case */
+        /* MFU case */
        mfu_space = arc.c - arc.p;
-        if (mfu_space > arc.mfu_top->size) {
+        if (mfu_space > arc.mfu->size) {
            /* case 3 */
-            evicted = arc_evict_state(arc.mru_top, bytes);
+            evicted = arc_evict(arc.mru, bytes);
            if (evicted < bytes) {
                arc_adjust();
            }
        } else {
            /* case 4 */
-            evicted = arc_evict_state(arc.mfu_top, bytes);
+            evicted = arc_evict(arc.mfu, bytes);
            if (evicted < bytes) {
                arc_adjust();
            }
        }
     }
 }
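The four cases in the comment above reduce to a small decision rule. Here it is as a pure function — a paraphrase of arc_evict_for_state()'s logic with invented names, not the kernel code itself:

```c
/*
 * Decision-rule sketch of arc_evict_for_state()'s four cases: given an
 * insert into MRU or MFU, which resident list should give up space?
 */
#include <stdint.h>

typedef enum { LIST_MRU, LIST_MFU } arc_list_t;

static arc_list_t
choose_victim(arc_list_t state, uint64_t p, uint64_t c,
    uint64_t anon_size, uint64_t mru_size, uint64_t mfu_size)
{
    if (state == LIST_MRU) {
        uint64_t mru_used = anon_size + mru_size;

        /* case 1: MRU still under its target p -> victimize MFU */
        if (p > mru_used)
            return (LIST_MFU);
        /* case 2: MRU over its target -> evict from MRU itself */
        return (LIST_MRU);
    } else {
        uint64_t mfu_space = c - p;     /* MFU's share of the cache */

        /* case 3: MFU under its share -> victimize MRU */
        if (mfu_space > mfu_size)
            return (LIST_MRU);
        /* case 4: MFU over its share -> evict from MFU itself */
        return (LIST_MFU);
    }
    (void) c;
}
```

In every case the rule is the same: the list that is currently over its allotment (p for MRU, c - p for MFU) pays for the insert.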
@@ -1184,11 +1421,13 @@ arc_evict_for_state(arc_state_t *state, uint64_t bytes)
 
 /*
  * This routine is called whenever a buffer is accessed.
+ * NOTE: the hash lock is dropped in this function.
 */
 static void
-arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
+arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
 {
-    int blksz, mult;
+    arc_state_t *evict_state = NULL;
+    int blksz;
 
     ASSERT(MUTEX_HELD(hash_lock));
 
@@ -1201,27 +1440,16 @@ arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
         * to the MRU state.
        */
 
-        arc_try_grow(blksz);
-        if (arc_evict_needed()) {
-            arc_evict_for_state(arc.mru_top, blksz);
-        }
+        arc_adapt(blksz, arc.anon);
+        if (arc_evict_needed())
+            evict_state = arc.mru;
 
        ASSERT(buf->b_arc_access == 0);
        buf->b_arc_access = lbolt;
-        DTRACE_PROBE1(new_state__mru_top, arc_buf_hdr_t *,
-            buf);
-        arc_change_state(arc.mru_top, buf, hash_lock);
-
-        /*
-         * If we are using less than 2/3 of our total target
-         * cache size, bump up the target size for the MRU
-         * list.
-         */
-        if (arc.size < arc.c*2/3) {
-            arc.p = arc.anon->size + arc.mru_top->size + arc.c/6;
-        }
-
-    } else if (buf->b_state == arc.mru_top) {
+        DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
+        arc_change_state(arc.mru, buf, hash_lock);
+
+    } else if (buf->b_state == arc.mru) {
        /*
         * If this buffer is in the MRU-top state and has the prefetch
         * flag, the first read was actually part of a prefetch.  In
@@ -1230,7 +1458,8 @@ arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
        */
        if ((buf->b_flags & ARC_PREFETCH) != 0) {
            buf->b_flags &= ~ARC_PREFETCH;
-            atomic_add_64(&arc.mru_top->hits, 1);
+            atomic_add_64(&arc.mru->hits, 1);
+            mutex_exit(hash_lock);
            return;
        }
 
@@ -1246,12 +1475,11 @@ arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
             * most frequently used state.
            */
            buf->b_arc_access = lbolt;
-            DTRACE_PROBE1(new_state__mfu_top,
-                arc_buf_hdr_t *, buf);
-            arc_change_state(arc.mfu_top, buf, hash_lock);
+            DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+            arc_change_state(arc.mfu, buf, hash_lock);
        }
-        atomic_add_64(&arc.mru_top->hits, 1);
-    } else if (buf->b_state == arc.mru_bot) {
+        atomic_add_64(&arc.mru->hits, 1);
+    } else if (buf->b_state == arc.mru_ghost) {
        arc_state_t *new_state;
        /*
         * This buffer has been "accessed" recently, but
@@ -1260,30 +1488,23 @@ arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
        */
 
        if (buf->b_flags & ARC_PREFETCH) {
-            new_state = arc.mru_top;
-            DTRACE_PROBE1(new_state__mru_top,
-                arc_buf_hdr_t *, buf);
+            new_state = arc.mru;
+            buf->b_flags &= ~ARC_PREFETCH;
+            DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf);
        } else {
-            new_state = arc.mfu_top;
-            DTRACE_PROBE1(new_state__mfu_top,
-                arc_buf_hdr_t *, buf);
-        }
-
-        arc_try_grow(blksz);
-        if (arc_evict_needed()) {
-            arc_evict_for_state(new_state, blksz);
+            new_state = arc.mfu;
+            DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
        }
 
-        /* Bump up the target size of the MRU list */
-        mult = ((arc.mru_bot->size >= arc.mfu_bot->size) ?
-            1 : (arc.mfu_bot->size/arc.mru_bot->size));
-        arc.p = MIN(arc.c, arc.p + blksz * mult);
+        arc_adapt(blksz, arc.mru_ghost);
+        if (arc_evict_needed())
+            evict_state = new_state;
 
        buf->b_arc_access = lbolt;
        arc_change_state(new_state, buf, hash_lock);
 
-        atomic_add_64(&arc.mru_bot->hits, 1);
-    } else if (buf->b_state == arc.mfu_top) {
+        atomic_add_64(&arc.mru_ghost->hits, 1);
+    } else if (buf->b_state == arc.mfu) {
        /*
         * This buffer has been accessed more than once and is
         * still in the cache.  Keep it in the MFU state.
@@ -1293,34 +1514,30 @@ arc_access_and_exit(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
         * so even if it was a prefetch, it will be put back at
         * the head of the list when we remove_reference().
        */
-        atomic_add_64(&arc.mfu_top->hits, 1);
-    } else if (buf->b_state == arc.mfu_bot) {
+        atomic_add_64(&arc.mfu->hits, 1);
+    } else if (buf->b_state == arc.mfu_ghost) {
        /*
         * This buffer has been accessed more than once but has
         * been evicted from the cache.  Move it back to the
         * MFU state.
        */
 
-        arc_try_grow(blksz);
-        if (arc_evict_needed()) {
-            arc_evict_for_state(arc.mfu_top, blksz);
-        }
-
-        /* Bump up the target size for the MFU list */
-        mult = ((arc.mfu_bot->size >= arc.mru_bot->size) ?
-            1 : (arc.mru_bot->size/arc.mfu_bot->size));
-        arc.p = MAX(0, (int64_t)arc.p - blksz * mult);
+        arc_adapt(blksz, arc.mfu_ghost);
+        if (arc_evict_needed())
+            evict_state = arc.mfu;
 
        buf->b_arc_access = lbolt;
-        DTRACE_PROBE1(new_state__mfu_top,
-            arc_buf_hdr_t *, buf);
-        arc_change_state(arc.mfu_top, buf, hash_lock);
+        DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+        arc_change_state(arc.mfu, buf, hash_lock);
 
-        atomic_add_64(&arc.mfu_bot->hits, 1);
+        atomic_add_64(&arc.mfu_ghost->hits, 1);
     } else {
        ASSERT(!"invalid arc state");
     }
+
+    mutex_exit(hash_lock);
+    if (evict_state)
+        arc_evict_for_state(evict_state, blksz);
 }
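Taken together, arc_access_and_exit() implements a small state machine over the five buffer states. A hypothetical, much-simplified model of those transitions follows; the ARC_MINTIME value and all names are illustrative, and details like prefetch handling are reduced to a comment:

```c
/*
 * Toy model of the access-time transitions above: anonymous buffers
 * enter MRU; a second hit more than ARC_MINTIME after the first
 * promotes MRU to MFU; ghost hits re-enter the resident lists.
 */
#include <stdio.h>

typedef enum { ANON, MRU, MRU_GHOST, MFU, MFU_GHOST } state_t;

#define ARC_MINTIME 62  /* ticks; stands in for (hz>>4), ~62ms */

static state_t
on_access(state_t s, long now, long *last_access)
{
    switch (s) {
    case ANON:                  /* first insert: join MRU */
        *last_access = now;
        return (MRU);
    case MRU:                   /* re-reference: maybe promote */
        if (now > *last_access + ARC_MINTIME) {
            *last_access = now;
            return (MFU);
        }
        return (MRU);
    case MRU_GHOST:             /* ghost hit: back into the cache */
        *last_access = now;
        return (MFU);           /* (MRU again if it was a prefetch) */
    case MFU:
        return (MFU);           /* stays frequently-used */
    case MFU_GHOST:
        *last_access = now;
        return (MFU);
    }
    return (s);
}

int
main(void)
{
    long last = 0;
    state_t s = ANON;

    s = on_access(s, 0, &last);     /* ANON -> MRU */
    s = on_access(s, 100, &last);   /* MRU -> MFU (past ARC_MINTIME) */
    printf("final state: %d\n", (int)s);
    return (0);
}
```

The one structural change the patch makes to this machine is in the locking: eviction for the chosen state now happens after the hash lock is dropped, which is why the function gained its `_and_exit` suffix.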
 
 /* a generic arc_done_func_t which you can use */
@@ -1329,7 +1546,7 @@ void
 arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
     bcopy(buf->b_data, arg, buf->b_hdr->b_size);
-    arc_buf_free(buf, arg);
+    VERIFY(arc_buf_remove_ref(buf, arg) == 1);
 }
 
 /* a generic arc_done_func_t which you can use */
@@ -1338,7 +1555,7 @@ arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg)
 {
     arc_buf_t **bufp = arg;
     if (zio && zio->io_error) {
-        arc_buf_free(buf, arg);
+        VERIFY(arc_buf_remove_ref(buf, arg) == 1);
        *bufp = NULL;
     } else {
        *bufp = buf;
@@ -1387,13 +1604,13 @@ arc_read_done(zio_t *zio)
     if (acb->acb_done) {
        if (abuf == NULL) {
            abuf = kmem_cache_alloc(buf_cache, KM_SLEEP);
-            abuf->b_data = zio_buf_alloc(hdr->b_size);
-            atomic_add_64(&arc.size, hdr->b_size);
-            bcopy(buf->b_data, abuf->b_data, hdr->b_size);
+            abuf->b_data = arc_data_copy(hdr, buf->b_data);
            abuf->b_hdr = hdr;
+            abuf->b_efunc = NULL;
+            abuf->b_private = NULL;
            abuf->b_next = hdr->b_buf;
            hdr->b_buf = abuf;
-            atomic_add_64(&hdr->b_state->size, hdr->b_size);
+            hdr->b_datacnt += 1;
        }
        acb->acb_buf = abuf;
        abuf = NULL;
@@ -1414,6 +1631,9 @@ arc_read_done(zio_t *zio)
     }
     hdr->b_acb = NULL;
     hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+    ASSERT(!HDR_BUF_AVAILABLE(hdr));
+    if (abuf == buf)
+        hdr->b_flags |= ARC_BUF_AVAILABLE;
 
     ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL);
 
@@ -1421,9 +1641,21 @@ arc_read_done(zio_t *zio)
        hdr->b_flags |= ARC_IO_ERROR;
        if (hdr->b_state != arc.anon)
            arc_change_state(arc.anon, hdr, hash_lock);
+        if (HDR_IN_HASH_TABLE(hdr))
+            buf_hash_remove(hdr);
        freeable = refcount_is_zero(&hdr->b_refcnt);
+        /* translate checksum errors into IO errors */
+        if (zio->io_error == ECKSUM)
+            zio->io_error = EIO;
     }
 
+    /*
+     * Broadcast before we drop the hash_lock.  This is less efficient,
+     * but avoids the possibility that the hdr (and hence the cv) might
+     * be freed before we get to the cv_broadcast().
+     */
+    cv_broadcast(&hdr->b_cv);
+
     if (!HDR_FREED_IN_READ(hdr)) {
        /*
         * Only call arc_access on anonymous buffers.  This is because
@@ -1432,8 +1664,9 @@ arc_read_done(zio_t *zio)
         * getting confused).
        */
        if (zio->io_error == 0 && hdr->b_state == arc.anon)
-            arc_access(hdr, hash_lock);
-        mutex_exit(hash_lock);
+            arc_access_and_exit(hdr, hash_lock);
+        else
+            mutex_exit(hash_lock);
     } else {
        /*
         * This block was freed while we waited for the read to
@@ -1445,8 +1678,6 @@ arc_read_done(zio_t *zio)
        freeable = refcount_is_zero(&hdr->b_refcnt);
     }
 
-    cv_broadcast(&hdr->b_cv);
-
     /* execute each callback and free its structure */
     while ((acb = callback_list) != NULL) {
        if (acb->acb_done)
@@ -1462,7 +1693,7 @@ arc_read_done(zio_t *zio)
     }
 
     if (freeable)
-        arc_hdr_free(hdr);
+        arc_hdr_destroy(hdr);
 }
 
 /*
@@ -1486,7 +1717,7 @@ arc_read_done(zio_t *zio)
 int
 arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
     arc_done_func_t *done, void *private, int priority, int flags,
-    uint32_t arc_flags)
+    uint32_t arc_flags, zbookmark_t *zb)
 {
     arc_buf_hdr_t *hdr;
     arc_buf_t *buf;
@@ -1495,15 +1726,9 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
 
 top:
     hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
-    if (hdr && hdr->b_buf) {
-
-        ASSERT((hdr->b_state == arc.mru_top) ||
-            (hdr->b_state == arc.mfu_top) ||
-            ((hdr->b_state == arc.anon) &&
-            (HDR_IO_IN_PROGRESS(hdr))));
+    if (hdr && hdr->b_datacnt > 0) {
 
        if (HDR_IO_IN_PROGRESS(hdr)) {
-
            if ((arc_flags & ARC_NOWAIT) && done) {
                arc_callback_t *acb = NULL;
 
@@ -1527,35 +1752,39 @@ top:
                mutex_exit(hash_lock);
                goto top;
            }
-
            mutex_exit(hash_lock);
            return (0);
        }
 
-        /*
-         * If there is already a reference on this block, create
-         * a new copy of the data so that we will be guaranteed
-         * that arc_release() will always succeed.
-         */
+        ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);
 
-        if (done)
-            add_reference(hdr, hash_lock, private);
-        if (done && refcount_count(&hdr->b_refcnt) > 1) {
-            buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
-            buf->b_data = zio_buf_alloc(hdr->b_size);
-            ASSERT3U(refcount_count(&hdr->b_refcnt), >, 1);
-            atomic_add_64(&arc.size, hdr->b_size);
-            bcopy(hdr->b_buf->b_data, buf->b_data, hdr->b_size);
-            buf->b_hdr = hdr;
-            buf->b_next = hdr->b_buf;
-            hdr->b_buf = buf;
-            atomic_add_64(&hdr->b_state->size, hdr->b_size);
-        } else {
+        if (done) {
+            /*
+             * If this block is already in use, create a new
+             * copy of the data so that we will be guaranteed
+             * that arc_release() will always succeed.
             */
            buf = hdr->b_buf;
+            ASSERT(buf);
+            ASSERT(buf->b_data);
+            if (!HDR_BUF_AVAILABLE(hdr)) {
+                void *data = arc_data_copy(hdr, buf->b_data);
+                buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
+                buf->b_hdr = hdr;
+                buf->b_data = data;
+                buf->b_efunc = NULL;
+                buf->b_private = NULL;
+                buf->b_next = hdr->b_buf;
+                hdr->b_buf = buf;
+                hdr->b_datacnt += 1;
+            } else {
+                ASSERT(buf->b_efunc == NULL);
+                hdr->b_flags &= ~ARC_BUF_AVAILABLE;
+            }
+            add_reference(hdr, hash_lock, private);
        }
        DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
-        arc_access(hdr, hash_lock);
-        mutex_exit(hash_lock);
+        arc_access_and_exit(hdr, hash_lock);
        atomic_add_64(&arc.hits, 1);
        if (done)
            done(NULL, buf, private);
@@ -1579,24 +1808,28 @@ top:
                bzero(&hdr->b_dva, sizeof (dva_t));
                hdr->b_birth = 0;
                hdr->b_cksum0 = 0;
-                arc_buf_free(buf, private);
+                (void) arc_buf_remove_ref(buf, private);
                goto top; /* restart the IO request */
            }
        } else {
            /* this block is in the ghost cache */
-            ASSERT((hdr->b_state == arc.mru_bot) ||
-                (hdr->b_state == arc.mfu_bot));
+            ASSERT(GHOST_STATE(hdr->b_state));
+            ASSERT(!HDR_IO_IN_PROGRESS(hdr));
            add_reference(hdr, hash_lock, private);
+            ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
+            ASSERT(hdr->b_buf == NULL);
 
            buf = kmem_cache_alloc(buf_cache, KM_SLEEP);
-            buf->b_data = zio_buf_alloc(hdr->b_size);
-            atomic_add_64(&arc.size, hdr->b_size);
-            ASSERT(!HDR_IO_IN_PROGRESS(hdr));
-            ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
            buf->b_hdr = hdr;
+            buf->b_efunc = NULL;
+            buf->b_private = NULL;
            buf->b_next = NULL;
            hdr->b_buf = buf;
+            buf->b_data = zio_buf_alloc(hdr->b_size);
+            atomic_add_64(&arc.size, hdr->b_size);
+            ASSERT(hdr->b_datacnt == 0);
+            hdr->b_datacnt = 1;
        }
 
        acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
@@ -1623,18 +1856,17 @@ top:
         * buffer ought to notice that it's legit but has a pending I/O.
        */
 
-        if ((hdr->b_state == arc.mru_bot) ||
-            (hdr->b_state == arc.mfu_bot))
-            arc_access(hdr, hash_lock);
-
-        mutex_exit(hash_lock);
+        if (GHOST_STATE(hdr->b_state))
+            arc_access_and_exit(hdr, hash_lock);
+        else
+            mutex_exit(hash_lock);
 
        ASSERT3U(hdr->b_size, ==, size);
-        DTRACE_PROBE2(arc__miss, blkptr_t *, bp,
-            uint64_t, size);
+        DTRACE_PROBE2(arc__miss, blkptr_t *, bp, uint64_t, size);
        atomic_add_64(&arc.misses, 1);
+
        rzio = zio_read(pio, spa, bp, buf->b_data, size,
-            arc_read_done, buf, priority, flags);
+            arc_read_done, buf, priority, flags, zb);
 
        if (arc_flags & ARC_WAIT)
            return (zio_wait(rzio));
@@ -1660,10 +1892,18 @@ arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
 
     hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx);
 
-    if (hdr && hdr->b_buf && !HDR_IO_IN_PROGRESS(hdr))
-        bcopy(hdr->b_buf->b_data, data, hdr->b_size);
-    else
+    if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) {
+        arc_buf_t *buf = hdr->b_buf;
+
+        ASSERT(buf);
+        while (buf->b_data == NULL) {
+            buf = buf->b_next;
+            ASSERT(buf);
+        }
+        bcopy(buf->b_data, data, hdr->b_size);
+    } else {
        rc = ENOENT;
+    }
 
     if (hash_mtx)
        mutex_exit(hash_mtx);
@@ -1671,6 +1911,104 @@ arc_tryread(spa_t *spa, blkptr_t *bp, void *data)
     return (rc);
 }
 
+void
+arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private)
+{
+    ASSERT(buf->b_hdr != NULL);
+    ASSERT(buf->b_hdr->b_state != arc.anon);
+    ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL);
+    buf->b_efunc = func;
+    buf->b_private = private;
+}
+
+/*
+ * This is used by the DMU to let the ARC know that a buffer is
+ * being evicted, so the ARC should clean up.  If this arc buf
+ * is not yet in the evicted state, it will be put there.
+ */
+int
+arc_buf_evict(arc_buf_t *buf)
+{
+    arc_buf_hdr_t *hdr;
+    kmutex_t *hash_lock;
+    arc_buf_t **bufp;
+
+    mutex_enter(&arc_eviction_mtx);
+    hdr = buf->b_hdr;
+    if (hdr == NULL) {
+        /*
+         * We are in arc_do_user_evicts().
+         * NOTE: We can't be in arc_buf_add_ref() because
+         * that would violate the interface rules.
         */
+        ASSERT(buf->b_data == NULL);
+        mutex_exit(&arc_eviction_mtx);
+        return (0);
+    } else if (buf->b_data == NULL) {
+        /*
+         * We are on the eviction list, pull us off.
         */
+        bufp = &arc_eviction_list;
+        while (*bufp != buf)
+            bufp = &(*bufp)->b_next;
+        *bufp = buf->b_next;
+        mutex_exit(&arc_eviction_mtx);
+        goto out;
+    } else {
+        /*
+         * Prevent a race with arc_evict()
         */
+        ASSERT3U(refcount_count(&hdr->b_refcnt), <, hdr->b_datacnt);
+        buf->b_hdr = NULL;
+    }
+    mutex_exit(&arc_eviction_mtx);
+
+    hash_lock = HDR_LOCK(hdr);
+    mutex_enter(hash_lock);
+
+    ASSERT(hdr->b_state == arc.mru || hdr->b_state == arc.mfu);
+
+    /*
+     * Pull this buffer off of the hdr
     */
+    bufp = &hdr->b_buf;
+    while (*bufp != buf)
+        bufp = &(*bufp)->b_next;
+    *bufp = buf->b_next;
+
+    ASSERT(buf->b_data != NULL);
+    buf->b_hdr = hdr;
+    arc_buf_destroy(buf, FALSE);
+
+    if (hdr->b_datacnt == 0) {
+        arc_state_t *old_state = hdr->b_state;
+        arc_state_t *evicted_state;
+
+        ASSERT(refcount_is_zero(&hdr->b_refcnt));
+
+        evicted_state =
+            (old_state == arc.mru) ? arc.mru_ghost : arc.mfu_ghost;
+
+        mutex_enter(&old_state->mtx);
+        mutex_enter(&evicted_state->mtx);
+
+        arc_change_state(evicted_state, hdr, hash_lock);
+        ASSERT(HDR_IN_HASH_TABLE(hdr));
+        hdr->b_flags = ARC_IN_HASH_TABLE;
+
+        mutex_exit(&evicted_state->mtx);
+        mutex_exit(&old_state->mtx);
+    }
+    mutex_exit(hash_lock);
+out:
+    VERIFY(buf->b_efunc(buf) == 0);
+    buf->b_efunc = NULL;
+    buf->b_private = NULL;
+    buf->b_hdr = NULL;
+    kmem_cache_free(buf_cache, buf);
+    return (1);
+}
+
 /*
  * Release this buffer from the cache.  This must be done
  * after a read and prior to modifying the buffer contents.
@@ -1690,30 +2028,40 @@ arc_release(arc_buf_t *buf, void *tag)
        /* this buffer is already released */
        ASSERT3U(refcount_count(&hdr->b_refcnt), ==, 1);
        ASSERT(BUF_EMPTY(hdr));
+        ASSERT(buf->b_efunc == NULL);
        return;
     }
 
     mutex_enter(hash_lock);
 
-    if (refcount_count(&hdr->b_refcnt) > 1) {
+    /*
+     * Do we have more than one buf?
     */
+    if (hdr->b_buf != buf || buf->b_next != NULL) {
        arc_buf_hdr_t *nhdr;
        arc_buf_t **bufp;
        uint64_t blksz = hdr->b_size;
        spa_t *spa = hdr->b_spa;
 
+        ASSERT(hdr->b_datacnt > 1);
        /*
         * Pull the data off of this buf and attach it to
         * a new anonymous buf.
        */
+        (void) remove_reference(hdr, hash_lock, tag);
        bufp = &hdr->b_buf;
-        while (*bufp != buf) {
-            ASSERT(*bufp);
+        while (*bufp != buf)
            bufp = &(*bufp)->b_next;
-        }
        *bufp = (*bufp)->b_next;
-        (void) refcount_remove(&hdr->b_refcnt, tag);
+
        ASSERT3U(hdr->b_state->size, >=, hdr->b_size);
        atomic_add_64(&hdr->b_state->size, -hdr->b_size);
+        if (refcount_is_zero(&hdr->b_refcnt)) {
+            ASSERT3U(hdr->b_state->lsize, >=, hdr->b_size);
+            atomic_add_64(&hdr->b_state->lsize, -hdr->b_size);
+        }
+        hdr->b_datacnt -= 1;
+
        mutex_exit(hash_lock);
 
        nhdr = kmem_cache_alloc(hdr_cache, KM_SLEEP);
@@ -1723,6 +2071,7 @@ arc_release(arc_buf_t *buf, void *tag)
        nhdr->b_state = arc.anon;
        nhdr->b_arc_access = 0;
        nhdr->b_flags = 0;
+        nhdr->b_datacnt = 1;
        buf->b_hdr = nhdr;
        buf->b_next = NULL;
        (void) refcount_add(&nhdr->b_refcnt, tag);
@@ -1730,6 +2079,7 @@ arc_release(arc_buf_t *buf, void *tag)
 
        hdr = nhdr;
     } else {
+        ASSERT(refcount_count(&hdr->b_refcnt) == 1);
        ASSERT(!list_link_active(&hdr->b_arc_node));
        ASSERT(!HDR_IO_IN_PROGRESS(hdr));
        arc_change_state(arc.anon, hdr, hash_lock);
@@ -1739,14 +2089,30 @@ arc_release(arc_buf_t *buf, void *tag)
        hdr->b_birth = 0;
        hdr->b_cksum0 = 0;
     }
+    buf->b_efunc = NULL;
+    buf->b_private = NULL;
 }
 
 int
 arc_released(arc_buf_t *buf)
 {
-    return (buf->b_hdr->b_state == arc.anon);
+    return (buf->b_data != NULL && buf->b_hdr->b_state == arc.anon);
+}
+
+int
+arc_has_callback(arc_buf_t *buf)
+{
+    return (buf->b_efunc != NULL);
 }
 
+#ifdef ZFS_DEBUG
+int
+arc_referenced(arc_buf_t *buf)
+{
+    return (refcount_count(&buf->b_hdr->b_refcnt));
+}
+#endif
+
 static void
 arc_write_done(zio_t *zio)
 {
@@ -1758,6 +2124,7 @@ arc_write_done(zio_t *zio)
     hdr = buf->b_hdr;
     acb = hdr->b_acb;
     hdr->b_acb = NULL;
+    ASSERT(acb != NULL);
 
     /* this buffer is on no lists and is not in the hash table */
     ASSERT3P(hdr->b_state, ==, arc.anon);
@@ -1765,9 +2132,12 @@ arc_write_done(zio_t *zio)
     hdr->b_dva = *BP_IDENTITY(zio->io_bp);
     hdr->b_birth = zio->io_bp->blk_birth;
     hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0];
-    /* clear the "in-write" flag */
-    hdr->b_hash_next = NULL;
-    /* This write may be all-zero */
+    /*
+     * If the block to be written was all-zero, we may have
+     * compressed it away.  In this case no write was performed
+     * so there will be no dva/birth-date/checksum.  The buffer
+     * must therefor remain anonymous (and uncached).
     */
     if (!BUF_EMPTY(hdr)) {
        arc_buf_hdr_t *exists;
        kmutex_t *hash_lock;
@@ -1787,27 +2157,41 @@ arc_write_done(zio_t *zio)
            ASSERT(refcount_is_zero(&exists->b_refcnt));
            arc_change_state(arc.anon, exists, hash_lock);
            mutex_exit(hash_lock);
-            arc_hdr_free(exists);
+            arc_hdr_destroy(exists);
            exists = buf_hash_insert(hdr, &hash_lock);
            ASSERT3P(exists, ==, NULL);
        }
-        arc_access(hdr, hash_lock);
-        mutex_exit(hash_lock);
+        hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+        arc_access_and_exit(hdr, hash_lock);
+    } else if (acb->acb_done == NULL) {
+        int destroy_hdr;
+        /*
+         * This is an anonymous buffer with no user callback,
+         * destroy it if there are no active references.
         */
+        mutex_enter(&arc_eviction_mtx);
+        destroy_hdr = refcount_is_zero(&hdr->b_refcnt);
+        hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
+        mutex_exit(&arc_eviction_mtx);
+        if (destroy_hdr)
+            arc_hdr_destroy(hdr);
+    } else {
+        hdr->b_flags &= ~ARC_IO_IN_PROGRESS;
     }
-    if (acb && acb->acb_done) {
+
+    if (acb->acb_done) {
        ASSERT(!refcount_is_zero(&hdr->b_refcnt));
        acb->acb_done(zio, buf, acb->acb_private);
     }
 
-    if (acb)
-        kmem_free(acb, sizeof (arc_callback_t));
+    kmem_free(acb, sizeof (arc_callback_t));
 }
 
 int
 arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
     uint64_t txg, blkptr_t *bp, arc_buf_t *buf,
     arc_done_func_t *done, void *private, int priority, int flags,
-    uint32_t arc_flags)
+    uint32_t arc_flags, zbookmark_t *zb)
 {
     arc_buf_hdr_t *hdr = buf->b_hdr;
     arc_callback_t *acb;
@@ -1822,8 +2206,9 @@ arc_write(zio_t *pio, spa_t *spa, int checksum, int compress,
     acb->acb_private = private;
     acb->acb_byteswap = (arc_byteswap_func_t *)-1;
     hdr->b_acb = acb;
+    hdr->b_flags |= ARC_IO_IN_PROGRESS;
     rzio = zio_write(pio, spa, checksum, compress, txg, bp,
-        buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags);
+        buf->b_data, hdr->b_size, arc_write_done, buf, priority, flags, zb);
 
     if (arc_flags & ARC_WAIT)
        return (zio_wait(rzio));
@@ -1858,16 +2243,21 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
        arc_change_state(arc.anon, ab, hash_lock);
        if (refcount_is_zero(&ab->b_refcnt)) {
            mutex_exit(hash_lock);
-            arc_hdr_free(ab);
+            arc_hdr_destroy(ab);
            atomic_add_64(&arc.deleted, 1);
        } else {
            ASSERT3U(refcount_count(&ab->b_refcnt), ==, 1);
+            ASSERT3U(ab->b_datacnt, ==, 1);
            if (HDR_IO_IN_PROGRESS(ab))
                ab->b_flags |= ARC_FREED_IN_READ;
+            if (HDR_IN_HASH_TABLE(ab))
+                buf_hash_remove(ab);
            ab->b_arc_access = 0;
            bzero(&ab->b_dva, sizeof (dva_t));
            ab->b_birth = 0;
            ab->b_cksum0 = 0;
+            ab->b_buf->b_efunc = NULL;
+            ab->b_buf->b_private = NULL;
            mutex_exit(hash_lock);
        }
     }
@@ -1967,23 +2357,26 @@ arc_init(void)
     arc.c = arc.c_min;
 
     arc.anon = &ARC_anon;
-    arc.mru_top = &ARC_mru_top;
-    arc.mru_bot = &ARC_mru_bot;
-    arc.mfu_top = &ARC_mfu_top;
-    arc.mfu_bot = &ARC_mfu_bot;
+    arc.mru = &ARC_mru;
+    arc.mru_ghost = &ARC_mru_ghost;
+    arc.mfu = &ARC_mfu;
+    arc.mfu_ghost = &ARC_mfu_ghost;
+    arc.size = 0;
 
-    list_create(&arc.mru_top->list, sizeof (arc_buf_hdr_t),
+    list_create(&arc.mru->list, sizeof (arc_buf_hdr_t),
        offsetof(arc_buf_hdr_t, b_arc_node));
-    list_create(&arc.mru_bot->list, sizeof (arc_buf_hdr_t),
+    list_create(&arc.mru_ghost->list, sizeof (arc_buf_hdr_t),
        offsetof(arc_buf_hdr_t, b_arc_node));
-    list_create(&arc.mfu_top->list, sizeof (arc_buf_hdr_t),
+    list_create(&arc.mfu->list, sizeof (arc_buf_hdr_t),
        offsetof(arc_buf_hdr_t, b_arc_node));
-    list_create(&arc.mfu_bot->list, sizeof (arc_buf_hdr_t),
+    list_create(&arc.mfu_ghost->list, sizeof (arc_buf_hdr_t),
        offsetof(arc_buf_hdr_t, b_arc_node));
 
     buf_init();
 
     arc_thread_exit = 0;
+    arc_eviction_list = NULL;
+    mutex_init(&arc_eviction_mtx, NULL, MUTEX_DEFAULT, NULL);
 
     (void) thread_create(NULL, 0, arc_reclaim_thread, NULL, 0, &p0,
        TS_RUN, minclsyspri);
@@ -2002,14 +2395,15 @@ arc_fini(void)
 
     arc_dead = TRUE;
 
+    mutex_destroy(&arc_eviction_mtx);
     mutex_destroy(&arc_reclaim_lock);
     mutex_destroy(&arc_reclaim_thr_lock);
     cv_destroy(&arc_reclaim_thr_cv);
 
-    list_destroy(&arc.mru_top->list);
-    list_destroy(&arc.mru_bot->list);
-    list_destroy(&arc.mfu_top->list);
-    list_destroy(&arc.mfu_bot->list);
+    list_destroy(&arc.mru->list);
+    list_destroy(&arc.mru_ghost->list);
+    list_destroy(&arc.mfu->list);
+    list_destroy(&arc.mfu_ghost->list);
 
     buf_fini();
 }
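The bplist.c diff that follows belongs to the same error-hardening theme as 6333006 ("DMU & DSL should not panic upon I/O error") and 6386910: routines such as bplist_hold() and bplist_open() stop VERIFY()ing success and instead return an errno for the caller to propagate. A generic sketch of that conversion pattern, with invented names (object_hold, object_use):

```c
/*
 * Sketch of the void -> int error-return conversion used throughout
 * this patch.  Not ZFS code; the names are hypothetical.
 */
#include <errno.h>

struct object { int valid; };

/*
 * Before the change, a routine like this would assert or panic on
 * failure.  After it, the failure is reported to the caller.
 */
static int
object_hold(struct object *obj)
{
    if (!obj->valid)
        return (EIO);       /* propagate instead of panicking */
    return (0);
}

static int
object_use(struct object *obj)
{
    int err = object_hold(obj);

    if (err != 0)
        return (err);       /* every caller now handles the error */
    /* ... use the object ... */
    return (0);
}
```

Note the one place below where the old behavior deliberately survives: bplist_empty() still wraps bplist_hold() in a VERIFY, flagged with an "XXX" as a known remaining panic path.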
diff --git a/usr/src/uts/common/fs/zfs/bplist.c b/usr/src/uts/common/fs/zfs/bplist.c
index 68f79ac5a2..db0d3534d6 100644
--- a/usr/src/uts/common/fs/zfs/bplist.c
+++ b/usr/src/uts/common/fs/zfs/bplist.c
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -29,16 +28,18 @@
 #include <sys/bplist.h>
 #include <sys/zfs_context.h>
 
-static void
+static int
 bplist_hold(bplist_t *bpl)
 {
 	ASSERT(MUTEX_HELD(&bpl->bpl_lock));
 	if (bpl->bpl_dbuf == NULL) {
-		bpl->bpl_dbuf = dmu_bonus_hold_tag(bpl->bpl_mos,
-		    bpl->bpl_object, bpl);
-		dmu_buf_read(bpl->bpl_dbuf);
+		int err = dmu_bonus_hold(bpl->bpl_mos,
+		    bpl->bpl_object, bpl, &bpl->bpl_dbuf);
+		if (err)
+			return (err);
 		bpl->bpl_phys = bpl->bpl_dbuf->db_data;
 	}
+	return (0);
 }
 
 uint64_t
@@ -58,12 +59,15 @@ bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx)
 	VERIFY(dmu_object_free(mos, object, tx) == 0);
 }
 
-void
+int
 bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
 {
 	dmu_object_info_t doi;
+	int err;
 
-	VERIFY(dmu_object_info(mos, object, &doi) == 0);
+	err = dmu_object_info(mos, object, &doi);
+	if (err)
+		return (err);
 
 	mutex_enter(&bpl->bpl_lock);
 
@@ -79,6 +83,7 @@ bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object)
 	bpl->bpl_bpshift = bpl->bpl_blockshift - SPA_BLKPTRSHIFT;
 
 	mutex_exit(&bpl->bpl_lock);
+	return (0);
 }
 
 void
@@ -89,11 +94,11 @@ bplist_close(bplist_t *bpl)
 	ASSERT(bpl->bpl_queue == NULL);
 
 	if (bpl->bpl_cached_dbuf) {
-		dmu_buf_rele(bpl->bpl_cached_dbuf);
+		dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
 		bpl->bpl_cached_dbuf = NULL;
 	}
 	if (bpl->bpl_dbuf) {
-		dmu_buf_rele_tag(bpl->bpl_dbuf, bpl);
+		dmu_buf_rele(bpl->bpl_dbuf, bpl);
 		bpl->bpl_dbuf = NULL;
 		bpl->bpl_phys = NULL;
 	}
@@ -110,22 +115,45 @@ bplist_empty(bplist_t *bpl)
 		return (B_TRUE);
 
 	mutex_enter(&bpl->bpl_lock);
-	bplist_hold(bpl);
+	VERIFY(0 == bplist_hold(bpl));	/* XXX */
 	rv = (bpl->bpl_phys->bpl_entries == 0);
 	mutex_exit(&bpl->bpl_lock);
 
 	return (rv);
 }
 
+static int
+bplist_cache(bplist_t *bpl, uint64_t blkid)
+{
+	int err = 0;
+
+	if (bpl->bpl_cached_dbuf == NULL ||
+	    bpl->bpl_cached_dbuf->db_offset != (blkid << bpl->bpl_blockshift)) {
+		if (bpl->bpl_cached_dbuf != NULL)
+			dmu_buf_rele(bpl->bpl_cached_dbuf, bpl);
+		err = dmu_buf_hold(bpl->bpl_mos,
+		    bpl->bpl_object, blkid << bpl->bpl_blockshift,
+		    bpl, &bpl->bpl_cached_dbuf);
+		ASSERT(err || bpl->bpl_cached_dbuf->db_size ==
+		    1ULL << bpl->bpl_blockshift);
+	}
+	return (err);
+}
+
 int
 bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
 {
 	uint64_t blk, off;
 	blkptr_t *bparray;
-	dmu_buf_t *db;
+	int err;
 
 	mutex_enter(&bpl->bpl_lock);
-	bplist_hold(bpl);
+
+	err = bplist_hold(bpl);
+	if (err) {
+		mutex_exit(&bpl->bpl_lock);
+		return (err);
+	}
 
 	if (*itorp >= bpl->bpl_phys->bpl_entries) {
 		mutex_exit(&bpl->bpl_lock);
@@ -134,51 +162,44 @@ bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp)
 
 	blk = *itorp >> bpl->bpl_bpshift;
 	off = P2PHASE(*itorp, 1ULL << bpl->bpl_bpshift);
-	db = bpl->bpl_cached_dbuf;
-	if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
-		if (db != NULL)
-			dmu_buf_rele(db);
-		bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
-		    bpl->bpl_object, blk << bpl->bpl_blockshift);
+	err = bplist_cache(bpl, blk);
+	if (err) {
+		mutex_exit(&bpl->bpl_lock);
+		return (err);
 	}
-	ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
-
-	dmu_buf_read(db);
-	bparray = db->db_data;
+	bparray = bpl->bpl_cached_dbuf->db_data;
 	*bp = bparray[off];
 	(*itorp)++;
 	mutex_exit(&bpl->bpl_lock);
 	return (0);
 }
 
-void
+int
 bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
 {
 	uint64_t blk, off;
 	blkptr_t *bparray;
-	dmu_buf_t *db;
+	int err;
 
 	ASSERT(!BP_IS_HOLE(bp));
 	mutex_enter(&bpl->bpl_lock);
-	bplist_hold(bpl);
+	err = bplist_hold(bpl);
+	if (err)
+		return (err);
 
 	blk = bpl->bpl_phys->bpl_entries >> bpl->bpl_bpshift;
 	off = P2PHASE(bpl->bpl_phys->bpl_entries, 1ULL << bpl->bpl_bpshift);
 
-	db = bpl->bpl_cached_dbuf;
-	if (db == NULL || db->db_offset != (blk << bpl->bpl_blockshift)) {
-		if (db != NULL)
-			dmu_buf_rele(db);
-		bpl->bpl_cached_dbuf = db = dmu_buf_hold(bpl->bpl_mos,
-		    bpl->bpl_object, blk << bpl->bpl_blockshift);
+	err = bplist_cache(bpl, blk);
+	if (err) {
+		mutex_exit(&bpl->bpl_lock);
+		return (err);
 	}
-	ASSERT3U(db->db_size, ==, 1ULL << bpl->bpl_blockshift);
-
-	dmu_buf_will_dirty(db, tx);
-	bparray = db->db_data;
+	dmu_buf_will_dirty(bpl->bpl_cached_dbuf, tx);
+	bparray = bpl->bpl_cached_dbuf->db_data;
 	bparray[off] = *bp;
 
 	/* We never need the fill count. */
@@ -191,6 +212,8 @@ bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx)
 	bpl->bpl_phys->bpl_entries++;
 	bpl->bpl_phys->bpl_bytes += BP_GET_ASIZE(bp);
 	mutex_exit(&bpl->bpl_lock);
+
+	return (0);
 }
 
 /*
@@ -218,7 +241,7 @@ bplist_sync(bplist_t *bpl, dmu_tx_t *tx)
 	while ((bpq = bpl->bpl_queue) != NULL) {
 		bpl->bpl_queue = bpq->bpq_next;
 		mutex_exit(&bpl->bpl_lock);
-		bplist_enqueue(bpl, &bpq->bpq_blk, tx);
+		VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx));
 		kmem_free(bpq, sizeof (*bpq));
 		mutex_enter(&bpl->bpl_lock);
 	}
@@ -230,9 +253,10 @@ bplist_vacate(bplist_t *bpl, dmu_tx_t *tx)
 {
 	mutex_enter(&bpl->bpl_lock);
 	ASSERT3P(bpl->bpl_queue, ==, NULL);
-	bplist_hold(bpl);
+	VERIFY(0 == bplist_hold(bpl));
 	dmu_buf_will_dirty(bpl->bpl_dbuf, tx);
-	dmu_free_range(bpl->bpl_mos, bpl->bpl_object, 0, -1ULL, tx);
+	VERIFY(0 == dmu_free_range(bpl->bpl_mos,
+	    bpl->bpl_object, 0, -1ULL, tx));
 	bpl->bpl_phys->bpl_entries = 0;
 	bpl->bpl_phys->bpl_bytes = 0;
 	mutex_exit(&bpl->bpl_lock);
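
With bplist_hold(), bplist_open(), bplist_iterate() and bplist_enqueue() now returning error codes instead of VERIFYing (part of hardening the SPA against I/O errors, per 6386910), callers are expected to propagate failures rather than panic. A minimal caller-side sketch follows; walk_bplist is an illustrative name, and it assumes the exhausted-iterator case returns ENOENT, which is not shown in the hunks above.

	/* Illustrative only: drain a bplist, propagating I/O errors. */
	static int
	walk_bplist(bplist_t *bpl)
	{
		uint64_t itor = 0;
		blkptr_t bp;
		int err;

		while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) {
			/* ... process &bp here ... */
		}
		/* assumed: ENOENT simply marks the end of the list */
		return (err == ENOENT ? 0 : err);
	}
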
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index 6f93e86078..13f4fdb202 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -118,7 +118,7 @@ dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
 	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
 			mutex_enter(&db->db_mtx);
-			if (!refcount_is_zero(&db->db_holds)) {
+			if (db->db_state != DB_EVICTING) {
 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
 				return (db);
 			}
@@ -151,7 +151,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
 	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
 		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
 			mutex_enter(&dbf->db_mtx);
-			if (!refcount_is_zero(&dbf->db_holds)) {
+			if (dbf->db_state != DB_EVICTING) {
 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
 				return (dbf);
 			}
@@ -186,7 +186,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
 	 * DBUF_HASH_MUTEX > db_mtx.
 	 */
 	ASSERT(refcount_is_zero(&db->db_holds));
-	ASSERT(db->db_dnode != NULL);
+	ASSERT(db->db_state == DB_EVICTING);
 	ASSERT(!MUTEX_HELD(&db->db_mtx));
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
@@ -201,20 +201,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
 	atomic_add_64(&dbuf_hash_count, -1);
 }
 
-static int dbuf_evictable(dmu_buf_impl_t *db);
-static void dbuf_clear(dmu_buf_impl_t *db);
-
-void
-dbuf_evict(dmu_buf_impl_t *db)
-{
-	int err;
-
-	ASSERT(MUTEX_HELD(&db->db_mtx));
-	err = dbuf_evictable(db);
-	ASSERT(err == TRUE);
-	dbuf_clear(db);
-	dbuf_destroy(db);
-}
+static arc_evict_func_t dbuf_do_evict;
 
 static void
 dbuf_evict_user(dmu_buf_impl_t *db)
@@ -233,23 +220,47 @@ dbuf_evict_user(dmu_buf_impl_t *db)
 }
 
 void
+dbuf_evict(dmu_buf_impl_t *db)
+{
+	int i;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(db->db_buf == NULL);
+
+#ifdef ZFS_DEBUG
+	for (i = 0; i < TXG_SIZE; i++) {
+		ASSERT(!list_link_active(&db->db_dirty_node[i]));
+		ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
+	}
+#endif
+	dbuf_clear(db);
+	dbuf_destroy(db);
+}
+
+void
 dbuf_init(void)
 {
-	uint64_t hsize = 1;
+	uint64_t hsize = 1ULL << 16;
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	int i;
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
-	 * with an average 64k block size.  The table will take up
-	 * totalmem*sizeof(void*)/64k bytes (i.e. 128KB/GB with 8-byte
-	 * pointers).
+	 * with an average 4K block size.  The table will take up
+	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
 	 */
-	while (hsize * 65536 < physmem * PAGESIZE)
+	while (hsize * 4096 < physmem * PAGESIZE)
 		hsize <<= 1;
 
+retry:
 	h->hash_table_mask = hsize - 1;
-	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
+	if (h->hash_table == NULL) {
+		/* XXX - we should really return an error instead of assert */
+		ASSERT(hsize > (1ULL << 10));
+		hsize >>= 1;
+		goto retry;
+	}
 
 	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t),
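
The sizing comment in dbuf_init() above works out as follows. This standalone arithmetic sketch (not part of the patch; the 4GB figure is a hypothetical example) reproduces the loop to show where "2MB/GB with 8-byte pointers" comes from:

	#include <stdint.h>
	#include <stdio.h>

	int
	main(void)
	{
		uint64_t physmem_bytes = 4ULL << 30;	/* hypothetical 4GB */
		uint64_t hsize = 1ULL << 16;

		/* same loop as dbuf_init(): one slot per average 4K block */
		while (hsize * 4096 < physmem_bytes)
			hsize <<= 1;

		/* 2^20 slots * 8-byte pointers = 8MB, i.e. 2MB per GB */
		(void) printf("%llu slots, %llu bytes\n",
		    (unsigned long long)hsize,
		    (unsigned long long)(hsize * sizeof (void *)));
		return (0);
	}
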
@@ -299,8 +310,9 @@ dbuf_verify(dmu_buf_impl_t *db)
 	} else {
 		ASSERT3U(db->db.db_object, ==, dn->dn_object);
 		ASSERT3P(db->db_objset, ==, dn->dn_objset);
-		ASSERT(list_head(&dn->dn_dbufs));
 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
+		ASSERT(db->db_blkid == DB_BONUS_BLKID ||
+		    list_head(&dn->dn_dbufs));
 	}
 	if (db->db_blkid == DB_BONUS_BLKID) {
 		ASSERT(dn != NULL);
@@ -311,19 +323,11 @@ dbuf_verify(dmu_buf_impl_t *db)
 	}
 	if (db->db_level == 0) {
-		void **udpp = db->db_d.db_user_data_ptr_ptr;
 		/* we can be momentarily larger in dnode_set_blksz() */
 		if (db->db_blkid != DB_BONUS_BLKID && dn) {
 			ASSERT3U(db->db.db_size, >=, dn->dn_datablksz);
 		}
-		if (udpp) {
-			ASSERT((refcount_is_zero(&db->db_holds) &&
-			    *udpp == NULL) ||
-			    (!refcount_is_zero(&db->db_holds) &&
-			    *udpp == db->db.db_data));
-		}
-
-		if (IS_DNODE_DNODE(db->db.db_object)) {
+		if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 			for (i = 0; i < TXG_SIZE; i++) {
 				/*
 				 * it should only be modified in syncing
@@ -341,7 +345,7 @@ dbuf_verify(dmu_buf_impl_t *db)
 		if (db->db_parent == dn->dn_dbuf) {
 			/* db is pointed to by the dnode */
 			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
-			if (IS_DNODE_DNODE(db->db.db_object))
+			if (db->db.db_object == DMU_META_DNODE_OBJECT)
 				ASSERT(db->db_parent == NULL);
 			else
 				ASSERT(db->db_parent != NULL);
@@ -399,10 +403,19 @@ static void
 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
-	ASSERT(buf->b_data != NULL);
+	ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
 	db->db_buf = buf;
-	db->db.db_data = buf->b_data;
-	dbuf_update_data(db);
+	if (buf != NULL) {
+		ASSERT(buf->b_data != NULL);
+		db->db.db_data = buf->b_data;
+		if (!arc_released(buf))
+			arc_set_callback(buf, dbuf_do_evict, db);
+		dbuf_update_data(db);
+	} else {
+		dbuf_evict_user(db);
+		db->db.db_data = NULL;
+		db->db_state = DB_UNCACHED;
+	}
 }
 
 uint64_t
@@ -427,6 +440,7 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 	 * All reads are synchronous, so we must have a hold on the dbuf
 	 */
 	ASSERT(refcount_count(&db->db_holds) > 0);
+	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db.db_data == NULL);
 	if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
 		/* we were freed in flight; disregard any error */
@@ -440,60 +454,36 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 		db->db_state = DB_CACHED;
 	} else {
 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
-		arc_buf_free(buf, db);
-		db->db_state = DB_UNCACHED;
 		ASSERT3P(db->db_buf, ==, NULL);
+		VERIFY(arc_buf_remove_ref(buf, db) == 1);
+		db->db_state = DB_UNCACHED;
 	}
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
+	dbuf_rele(db, NULL);
 }
 
-void
+static void
 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
-	arc_buf_t *buf;
 	blkptr_t *bp;
+	zbookmark_t zb;
 
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
 	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
-
-	/*
-	 * prefetch only data blocks (level 0) -- don't prefetch indirect
-	 * blocks
-	 */
-	if ((db->db_level > 0) || (db->db_blkid == DB_BONUS_BLKID)) {
-		flags |= DB_RF_NOPREFETCH;
-	}
-
-	if (((flags & DB_RF_NOPREFETCH) == 0) && (db->db_dnode != NULL)) {
-		dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
-		    db->db.db_size);
-	}
-
-	if (db->db_state == DB_CACHED) {
-		ASSERT(db->db.db_data != NULL);
-		return;
-	}
-
-	mutex_enter(&db->db_mtx);
-
-	if (db->db_state != DB_UNCACHED) {
-		mutex_exit(&db->db_mtx);
-		return;
-	}
-
-	ASSERT3U(db->db_state, ==, DB_UNCACHED);
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(db->db_state == DB_UNCACHED);
+	ASSERT(db->db_buf == NULL);
 
 	if (db->db_blkid == DB_BONUS_BLKID) {
 		ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
-		buf = arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
-		    DN_MAX_BONUSLEN, db);
+		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 		if (db->db.db_size < DN_MAX_BONUSLEN)
-			bzero(buf->b_data, DN_MAX_BONUSLEN);
-		bcopy(DN_BONUS(db->db_dnode->dn_phys), buf->b_data,
+			bzero(db->db.db_data, DN_MAX_BONUSLEN);
+		bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
 		    db->db.db_size);
-		dbuf_set_data(db, buf);
+		dbuf_update_data(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
@@ -522,20 +512,27 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	db->db_state = DB_READ;
 	mutex_exit(&db->db_mtx);
 
+	zb.zb_objset = db->db_objset->os_dsl_dataset ?
+	    db->db_objset->os_dsl_dataset->ds_object : 0;
+	zb.zb_object = db->db.db_object;
+	zb.zb_level = db->db_level;
+	zb.zb_blkid = db->db_blkid;
+
+	dbuf_add_ref(db, NULL);
 	/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
 	(void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
 	    db->db_level > 0 ? byteswap_uint64_array :
 	    dmu_ot[db->db_dnode->dn_type].ot_byteswap,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
 	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
-	    ARC_NOWAIT);
+	    ARC_NOWAIT, &zb);
 }
 
-static int
-dbuf_read_generic(dmu_buf_impl_t *db, uint32_t flags)
+int
+dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
-	zio_t *zio;
-	int err;
+	int err = 0;
+	int havepzio = (zio != NULL);
 
 	/*
 	 * We don't have to hold the mutex to check db_state because it
@@ -545,71 +542,67 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 	if (db->db_state == DB_CACHED)
 		return (0);
 
-	if (db->db_state == DB_UNCACHED) {
-		zio = zio_root(db->db_dnode->dn_objset->os_spa, NULL, NULL,
-		    ZIO_FLAG_CANFAIL);
+	if ((flags & DB_RF_HAVESTRUCT) == 0)
+		rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+
+	mutex_enter(&db->db_mtx);
+	if (db->db_state == DB_CACHED) {
+		mutex_exit(&db->db_mtx);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
-			rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
+			rw_exit(&db->db_dnode->dn_struct_rwlock);
+	} else if (db->db_state == DB_UNCACHED) {
+		if (zio == NULL) {
+			zio = zio_root(db->db_dnode->dn_objset->os_spa,
+			    NULL, NULL, ZIO_FLAG_CANFAIL);
+		}
 		dbuf_read_impl(db, zio, flags);
+		/* dbuf_read_impl has dropped db_mtx for us */
+
+		if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
+		    (flags & DB_RF_NOPREFETCH) == 0 &&
+		    db->db_dnode != NULL) {
+			dmu_zfetch(&db->db_dnode->dn_zfetch, db->db.db_offset,
+			    db->db.db_size);
+		}
+
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&db->db_dnode->dn_struct_rwlock);
-		err = zio_wait(zio);
-		if (err)
-			return (err);
-	}
 
-	mutex_enter(&db->db_mtx);
-	while (db->db_state == DB_READ || db->db_state == DB_FILL) {
-		ASSERT(db->db_state == DB_READ ||
-		    (flags & DB_RF_HAVESTRUCT) == 0);
-		cv_wait(&db->db_changed, &db->db_mtx);
+		if (!havepzio)
+			err = zio_wait(zio);
+	} else {
+		if ((flags & DB_RF_HAVESTRUCT) == 0)
+			rw_exit(&db->db_dnode->dn_struct_rwlock);
+		if ((flags & DB_RF_NEVERWAIT) == 0) {
+			while (db->db_state == DB_READ ||
+			    db->db_state == DB_FILL) {
+				ASSERT(db->db_state == DB_READ ||
+				    (flags & DB_RF_HAVESTRUCT) == 0);
+				cv_wait(&db->db_changed, &db->db_mtx);
+			}
+			if (db->db_state == DB_UNCACHED)
+				err = EIO;
+		}
+		mutex_exit(&db->db_mtx);
 	}
-	ASSERT3U(db->db_state, ==, DB_CACHED);
-	mutex_exit(&db->db_mtx);
-
-	return (0);
-}
-
-#pragma weak dmu_buf_read = dbuf_read
-void
-dbuf_read(dmu_buf_impl_t *db)
-{
-	int err;
-
-	err = dbuf_read_generic(db, DB_RF_MUST_SUCCEED);
-	ASSERT(err == 0);
-}
-
-#pragma weak dmu_buf_read_canfail = dbuf_read_canfail
-int
-dbuf_read_canfail(dmu_buf_impl_t *db)
-{
-	return (dbuf_read_generic(db, DB_RF_CANFAIL));
-}
-
-void
-dbuf_read_havestruct(dmu_buf_impl_t *db)
-{
-	int err;
-
-	ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
-	err = dbuf_read_generic(db, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH));
-	ASSERT(err == 0);
+	ASSERT(err || havepzio || db->db_state == DB_CACHED);
+	return (err);
 }
 
 static void
 dbuf_noread(dmu_buf_impl_t *db)
 {
 	ASSERT(!refcount_is_zero(&db->db_holds));
+	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 	mutex_enter(&db->db_mtx);
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 	if (db->db_state == DB_UNCACHED) {
-		int blksz = (db->db_blkid == DB_BONUS_BLKID) ?
-		    DN_MAX_BONUSLEN : db->db.db_size;
+		ASSERT(db->db_buf == NULL);
 		ASSERT(db->db.db_data == NULL);
 		dbuf_set_data(db,
 		    arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
-		    blksz, db));
+		    db->db.db_size, db));
 		db->db_state = DB_FILL;
 	} else {
 		ASSERT3U(db->db_state, ==, DB_CACHED);
@@ -634,14 +627,13 @@ static void
 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 {
 	arc_buf_t **quiescing, **syncing;
-	int size = (db->db_blkid == DB_BONUS_BLKID) ?
-	    DN_MAX_BONUSLEN : db->db.db_size;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db.db_data != NULL);
+	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 
-	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
-	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
+	quiescing = (arc_buf_t **)&db->db_d.db_data_old[(txg-1)&TXG_MASK];
+	syncing = (arc_buf_t **)&db->db_d.db_data_old[(txg-2)&TXG_MASK];
 
 	/*
 	 * If this buffer is referenced from the current quiescing
@@ -656,13 +648,12 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 		 */
 		ASSERT(*syncing != db->db_buf);
 		if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+			int size = db->db.db_size;
 			*quiescing = arc_buf_alloc(
 			    db->db_dnode->dn_objset->os_spa, size, db);
 			bcopy(db->db.db_data, (*quiescing)->b_data, size);
 		} else {
-			db->db.db_data = NULL;
-			db->db_buf = NULL;
-			db->db_state = DB_UNCACHED;
+			dbuf_set_data(db, NULL);
 		}
 		return;
 	}
@@ -677,22 +668,49 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 	ASSERT3P(*quiescing, ==, NULL);
 	ASSERT3U(db->db_dirtycnt, ==, 1);
 	if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+		int size = db->db.db_size;
 		/* we can't copy if we have already started a write */
 		ASSERT(*syncing != db->db_data_pending);
 		*syncing = arc_buf_alloc(
 		    db->db_dnode->dn_objset->os_spa, size, db);
 		bcopy(db->db.db_data, (*syncing)->b_data, size);
 	} else {
-		db->db.db_data = NULL;
-		db->db_buf = NULL;
-		db->db_state = DB_UNCACHED;
+		dbuf_set_data(db, NULL);
 	}
 }
 
+/*
+ * This is the "bonus buffer" version of the above routine
+ */
+static void
+dbuf_fix_old_bonus_data(dmu_buf_impl_t *db, uint64_t txg)
+{
+	void **quiescing, **syncing;
+
+	ASSERT(MUTEX_HELD(&db->db_mtx));
+	ASSERT(db->db.db_data != NULL);
+	ASSERT(db->db_blkid == DB_BONUS_BLKID);
+
+	quiescing = &db->db_d.db_data_old[(txg-1)&TXG_MASK];
+	syncing = &db->db_d.db_data_old[(txg-2)&TXG_MASK];
+
+	if (*quiescing == db->db.db_data) {
+		ASSERT(*syncing != db->db.db_data);
+		*quiescing = zio_buf_alloc(DN_MAX_BONUSLEN);
+		bcopy(db->db.db_data, *quiescing, DN_MAX_BONUSLEN);
+	} else if (*syncing == db->db.db_data) {
+		ASSERT3P(*quiescing, ==, NULL);
+		ASSERT3U(db->db_dirtycnt, ==, 1);
+		*syncing = zio_buf_alloc(DN_MAX_BONUSLEN);
+		bcopy(db->db.db_data, *syncing, DN_MAX_BONUSLEN);
+	}
+}
+
 void
 dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg)
 {
+	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	if (db->db_d.db_overridden_by[txg&TXG_MASK] == IN_DMU_SYNC) {
 		db->db_d.db_overridden_by[txg&TXG_MASK] = NULL;
@@ -724,7 +742,8 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
 	mutex_enter(&dn->dn_dbufs_mtx);
 	for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
 		db_next = list_next(&dn->dn_dbufs, db);
-		if ((db->db_level != 0) || (db->db_blkid == DB_BONUS_BLKID))
+		ASSERT(db->db_blkid != DB_BONUS_BLKID);
+		if (db->db_level != 0)
 			continue;
 		dprintf_dbuf(db, "found buf %s\n", "");
 		if (db->db_blkid < blkid ||
@@ -736,7 +755,8 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
 			continue;
 
 		mutex_enter(&db->db_mtx);
-		if (db->db_state == DB_UNCACHED) {
+		if (db->db_state == DB_UNCACHED ||
+		    db->db_state == DB_EVICTING) {
 			ASSERT(db->db.db_data == NULL);
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
@@ -753,22 +773,40 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
+		if (refcount_count(&db->db_holds) == 0) {
+			ASSERT(db->db_buf);
+			dbuf_clear(db);
+			continue;
+		}
+		/* The dbuf is CACHED and referenced */
 
-		/* make a copy of the data if necessary */
-		dbuf_fix_old_data(db, txg);
-
-		if (db->db.db_data) {
-			/* fill in with appropriate data */
+		if (!list_link_active(&db->db_dirty_node[txg & TXG_MASK])) {
+			/*
+			 * This dbuf is not currently dirty.  We will either
+			 * uncache it (if it's not referenced in the open
+			 * context) or reset its contents to empty.
+			 */
+			dbuf_fix_old_data(db, txg);
+		} else if (db->db_d.db_overridden_by[txg & TXG_MASK] != NULL) {
+			/*
+			 * This dbuf is overridden.  Clear that state.
+			 */
+			dbuf_unoverride(db, txg);
+		}
+		/* fill in with appropriate data */
+		if (db->db_state == DB_CACHED) {
+			ASSERT(db->db.db_data != NULL);
 			arc_release(db->db_buf, db);
 			bzero(db->db.db_data, db->db.db_size);
 		}
+
 		mutex_exit(&db->db_mtx);
 	}
 	mutex_exit(&dn->dn_dbufs_mtx);
 }
 
 static int
-dbuf_new_block(dmu_buf_impl_t *db, dmu_tx_t *tx)
+dbuf_new_block(dmu_buf_impl_t *db)
 {
 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
 	uint64_t birth_txg = 0;
@@ -790,7 +828,7 @@ dbuf_new_block(dmu_buf_impl_t *db)
 		birth_txg = db->db_blkptr->blk_birth;
 
 	if (birth_txg)
-		return (!dsl_dataset_block_freeable(ds, birth_txg, tx));
+		return (!dsl_dataset_block_freeable(ds, birth_txg));
 	else
 		return (TRUE);
 }
@@ -801,6 +839,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 	arc_buf_t *buf, *obuf;
 	int osize = db->db.db_size;
 
+	ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
 	/* XXX does *this* func really need the lock? */
 	ASSERT(RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock));
 
@@ -814,6 +854,10 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 	 * be happening.
 	 */
 	/* Make a copy of the data if necessary */
+	/*
+	 * XXX we should be doing a dbuf_read, checking the return
+	 * value and returning that up to our callers
+	 */
 	dbuf_will_dirty(db, tx);
 
 	/* create the data buffer for the new block */
@@ -829,7 +873,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 	mutex_enter(&db->db_mtx);
 	/* ASSERT3U(refcount_count(&db->db_holds), ==, 1); */
 	dbuf_set_data(db, buf);
-	arc_buf_free(obuf, db);
+	VERIFY(arc_buf_remove_ref(obuf, db) == 1);
 	db->db.db_size = size;
 
 	/* fix up the dirty info */
@@ -861,7 +905,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	 */
 	ASSERT(!(dmu_tx_is_syncing(tx) &&
 	    !BP_IS_HOLE(&dn->dn_objset->os_rootbp) &&
-	    !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    dn->dn_objset->os_dsl_dataset != NULL &&
 	    !dsl_dir_is_private(
 	    dn->dn_objset->os_dsl_dataset->ds_dir)));
@@ -871,7 +915,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	 * check if we're already dirty.  They are allowed to re-dirty
 	 * in syncing context.
 	 */
-	ASSERT(dn->dn_object & DMU_PRIVATE_OBJECT ||
+	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED ||
 	    dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ?
 	    DN_DIRTY_SYNC : DN_DIRTY_OPEN));
@@ -940,22 +984,27 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
-	if (db->db_level == 0) {
+	/*
+	 * If this buffer is dirty in an old transaction group we need
+	 * to make a copy of it so that the changes we make in this
+	 * transaction group won't leak out when we sync the older txg.
+	 */
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		ASSERT(db->db.db_data != NULL);
+		ASSERT(db->db_d.db_data_old[txgoff] == NULL);
+		dbuf_fix_old_bonus_data(db, tx->tx_txg);
+		db->db_d.db_data_old[txgoff] = db->db.db_data;
+	} else if (db->db_level == 0) {
 		/*
 		 * Release the data buffer from the cache so that we
 		 * can modify it without impacting possible other users
 		 * of this cached data block.  Note that indirect blocks
 		 * and private objects are not released until the syncing
 		 * state (since they are only modified then).
-		 *
-		 * If this buffer is dirty in an old transaction group we need
-		 * to make a copy of it so that the changes we make in this
-		 * transaction group won't leak out when we sync the older txg.
 		 */
 		ASSERT(db->db_buf != NULL);
-		ASSERT(db->db.db_data != NULL);
 		ASSERT(db->db_d.db_data_old[txgoff] == NULL);
-		if (!(db->db.db_object & DMU_PRIVATE_OBJECT)) {
+		if (db->db.db_object != DMU_META_DNODE_OBJECT) {
 			arc_release(db->db_buf, db);
 			dbuf_fix_old_data(db, tx->tx_txg);
 			ASSERT(db->db_buf != NULL);
@@ -978,12 +1027,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	list_insert_tail(&dn->dn_dirty_dbufs[txgoff], db);
 	mutex_exit(&dn->dn_mtx);
 
-	/*
-	 * If writting this buffer will consume a new block on disk,
-	 * then update the accounting.
-	 */
 	if (db->db_blkid != DB_BONUS_BLKID) {
-		if (!dbuf_new_block(db, tx) && db->db_blkptr) {
+		/*
+		 * Update the accounting.
+		 */
+		if (!dbuf_new_block(db) && db->db_blkptr) {
 			/*
 			 * This is only a guess -- if the dbuf is dirty
 			 * in a previous txg, we don't know how much
@@ -1028,7 +1076,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
 		dbuf_dirty(parent, tx);
-		dbuf_remove_ref(parent, FTAG);
+		dbuf_rele(parent, FTAG);
 	} else {
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
@@ -1042,8 +1090,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn = db->db_dnode;
 	int txgoff = tx->tx_txg & TXG_MASK;
+	int64_t holds;
 
 	ASSERT(tx->tx_txg != 0);
+	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 
 	mutex_enter(&db->db_mtx);
 
@@ -1080,7 +1130,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 		ASSERT(db->db_buf != NULL);
 		ASSERT(db->db_d.db_data_old[txgoff] != NULL);
 		if (db->db_d.db_data_old[txgoff] != db->db_buf)
-			arc_buf_free(db->db_d.db_data_old[txgoff], db);
+			VERIFY(arc_buf_remove_ref(
+			    db->db_d.db_data_old[txgoff], db) == 1);
 		db->db_d.db_data_old[txgoff] = NULL;
 	}
 
@@ -1095,15 +1146,17 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 
-	if (refcount_remove(&db->db_holds,
-	    (void *)(uintptr_t)tx->tx_txg) == 0) {
-		/* make duf_verify() happy */
-		if (db->db.db_data)
-			bzero(db->db.db_data, db->db.db_size);
+	if ((holds = refcount_remove(&db->db_holds,
+	    (void *)(uintptr_t)tx->tx_txg)) == 0) {
+		arc_buf_t *buf = db->db_buf;
+
+		ASSERT(arc_released(buf));
+		dbuf_set_data(db, NULL);
+		VERIFY(arc_buf_remove_ref(buf, db) == 1);
 		dbuf_evict(db);
 		return (1);
 	}
+	ASSERT(holds > 0);
 
 	mutex_exit(&db->db_mtx);
 	return (0);
@@ -1120,19 +1173,21 @@ dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	if (RW_WRITE_HELD(&db->db_dnode->dn_struct_rwlock))
 		rf |= DB_RF_HAVESTRUCT;
 
-	(void) dbuf_read_generic(db, rf);
+	(void) dbuf_read(db, NULL, rf);
 	dbuf_dirty(db, tx);
 }
 
-#pragma weak dmu_buf_will_fill = dbuf_will_fill
 void
-dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx)
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+	ASSERT(db->db_blkid != DB_BONUS_BLKID);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(db->db_level == 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
-	ASSERT(!(db->db.db_object & DMU_PRIVATE_OBJECT) ||
+	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
 	    dmu_tx_private_ok(tx));
 
 	dbuf_noread(db);
@@ -1149,6 +1204,7 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
 	if (db->db_state == DB_FILL) {
 		if (db->db_level == 0 && db->db_d.db_freed_in_flight) {
+			ASSERT(db->db_blkid != DB_BONUS_BLKID);
 			/* we were freed while filling */
 			/* XXX dbuf_undirty? */
 			bzero(db->db.db_data, db->db.db_size);
@@ -1160,47 +1216,62 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
 	mutex_exit(&db->db_mtx);
 }
 
-
-static void
+/*
+ * "Clear" the contents of this dbuf.  This will mark the dbuf
+ * EVICTING and clear *most* of its references.  Unfortunately,
+ * when we are not holding the dn_dbufs_mtx, we can't clear the
+ * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
+ * in this case.  For callers from the DMU we will usually see:
+ *	dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
+ * For the arc callback, we will usually see:
+ *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
+ * Sometimes, though, we will get a mix of these two:
+ *	DMU: dbuf_clear()->arc_buf_evict()
+ *	ARC: dbuf_do_evict()->dbuf_destroy()
+ */
+void
 dbuf_clear(dmu_buf_impl_t *db)
 {
 	dnode_t *dn = db->db_dnode;
+	dmu_buf_impl_t *parent = db->db_parent;
+	int dbuf_gone = FALSE;
 
-	ASSERT(MUTEX_HELD(&dn->dn_dbufs_mtx));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(refcount_is_zero(&db->db_holds));
 
+	dbuf_evict_user(db);
+
 	if (db->db_state == DB_CACHED) {
-		ASSERT(db->db_buf != NULL);
-		arc_buf_free(db->db_buf, db);
+		ASSERT(db->db.db_data != NULL);
+		if (db->db_blkid == DB_BONUS_BLKID)
+			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
 		db->db.db_data = NULL;
-		db->db_buf = NULL;
 		db->db_state = DB_UNCACHED;
 	}
 
 	ASSERT3U(db->db_state, ==, DB_UNCACHED);
-	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db_data_pending == NULL);
 
-	mutex_exit(&db->db_mtx);
+	db->db_state = DB_EVICTING;
+	db->db_blkptr = NULL;
+
+	if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
+		list_remove(&dn->dn_dbufs, db);
+		dnode_rele(dn, db);
+	}
+
+	if (db->db_buf)
+		dbuf_gone = arc_buf_evict(db->db_buf);
+
+	if (!dbuf_gone)
+		mutex_exit(&db->db_mtx);
 
 	/*
 	 * If this dbuf is referened from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
 	 */
-	if (db->db_parent && db->db_parent != dn->dn_dbuf)
-		dbuf_remove_ref(db->db_parent, db);
-
-	/* remove from dn_dbufs */
-	list_remove(&dn->dn_dbufs, db);
-
-	dnode_rele(dn, db);
-
-	dbuf_hash_remove(db);
-
-	db->db_dnode = NULL;
-	db->db_parent = NULL;
-	db->db_blkptr = NULL;
+	if (parent && parent != dn->dn_dbuf)
+		dbuf_rele(parent, db);
 }
 
 static int
@@ -1209,6 +1280,8 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 {
 	int nlevels, epbs;
 
+	ASSERT(blkid != DB_BONUS_BLKID);
+
 	if (dn->dn_phys->dn_nlevels == 0)
 		nlevels = 1;
 	else
@@ -1218,12 +1291,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 	ASSERT3U(level * epbs, <, 64);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
-	if (blkid == DB_BONUS_BLKID) {
-		/* this is the bonus buffer */
-		*parentp = NULL;
-		*bpp = NULL;
-		return (0);
-	} else if (level >= nlevels ||
+	if (level >= nlevels ||
 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
 		/* the buffer has no parent yet */
 		*parentp = NULL;
@@ -1235,10 +1303,13 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 		    blkid >> epbs, fail_sparse, NULL, parentp);
 		if (err)
 			return (err);
-		dbuf_read_havestruct(*parentp);
-		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
-		    (blkid & ((1ULL << epbs) - 1));
-		return (0);
+		err = dbuf_read(*parentp, NULL,
+		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+		if (err == 0) {
+			*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
+			    (blkid & ((1ULL << epbs) - 1));
+		}
+		return (err);
 	} else {
 		/* the block is referenced from the dnode */
 		ASSERT3U(level, ==, nlevels-1);
@@ -1266,11 +1337,21 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
 	db->db.db_object = dn->dn_object;
 	db->db_level = level;
 	db->db_blkid = blkid;
-	db->db_state = DB_UNCACHED;
+	db->db_dirtied = 0;
+	db->db_dirtycnt = 0;
+	db->db_dnode = dn;
+	db->db_parent = parent;
+	db->db_blkptr = blkptr;
 
-	if (db->db_blkid == DB_BONUS_BLKID) {
+	bzero(&db->db_d, sizeof (db->db_d));
+
+	if (blkid == DB_BONUS_BLKID) {
+		ASSERT3P(parent, ==, dn->dn_dbuf);
 		db->db.db_size = dn->dn_bonuslen;
 		db->db.db_offset = DB_BONUS_BLKID;
+		db->db_state = DB_UNCACHED;
+		/* the bonus dbuf is not placed in the hash table */
+		return (db);
 	} else {
 		int blocksize =
 		    db->db_level ? 1<<dn->dn_indblkshift : dn->dn_datablksz;
@@ -1278,11 +1359,6 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
 		db->db.db_offset = db->db_blkid * blocksize;
 	}
 
-	db->db_dirtied = 0;
-	db->db_dirtycnt = 0;
-
-	bzero(&db->db_d, sizeof (db->db_d));
-
 	/*
 	 * Hold the dn_dbufs_mtx while we get the new dbuf
 	 * in the hash table *and* added to the dbufs list.
	 * This prevents a possible deadlock with someone
	 * trying to look up this dbuf before its added to the
 	 * dn_dbufs list.
 	 */
 	mutex_enter(&dn->dn_dbufs_mtx);
+	db->db_state = DB_EVICTING;
 	if ((odb = dbuf_hash_insert(db)) != NULL) {
 		/* someone else inserted it first */
 		kmem_cache_free(dbuf_cache, db);
 		mutex_exit(&dn->dn_dbufs_mtx);
 		return (odb);
 	}
 	list_insert_head(&dn->dn_dbufs, db);
+	db->db_state = DB_UNCACHED;
 	mutex_exit(&dn->dn_dbufs_mtx);
 
 	if (parent && parent != dn->dn_dbuf)
 		dbuf_add_ref(parent, db);
 
+	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
+	    refcount_count(&dn->dn_holds) > 0);
 	(void) refcount_add(&dn->dn_holds, db);
 
-	db->db_dnode = dn;
-	db->db_parent = parent;
-	db->db_blkptr = blkptr;
-
 	dprintf_dbuf(db, "db=%p\n", db);
 
 	return (db);
 }
 
 static int
-dbuf_evictable(dmu_buf_impl_t *db)
+dbuf_do_evict(void *private)
 {
-	int i;
-
-	ASSERT(MUTEX_HELD(&db->db_mtx));
-	DBUF_VERIFY(db);
+	arc_buf_t *buf = private;
+	dmu_buf_impl_t *db = buf->b_private;
 
-	if (db->db_state != DB_UNCACHED && db->db_state != DB_CACHED)
-		return (FALSE);
+	if (!MUTEX_HELD(&db->db_mtx))
+		mutex_enter(&db->db_mtx);
 
-	if (!refcount_is_zero(&db->db_holds))
-		return (FALSE);
+	ASSERT(db->db_buf == buf);
+	ASSERT(refcount_is_zero(&db->db_holds));
 
-#ifdef ZFS_DEBUG
-	for (i = 0; i < TXG_SIZE; i++) {
-		ASSERT(!list_link_active(&db->db_dirty_node[i]));
-		ASSERT(db->db_level != 0 || db->db_d.db_data_old[i] == NULL);
+	if (db->db_state != DB_EVICTING) {
+		ASSERT(db->db_state == DB_CACHED);
+		DBUF_VERIFY(db);
+		db->db_buf = NULL;
+		dbuf_evict(db);
+	} else {
+		mutex_exit(&db->db_mtx);
+		dbuf_destroy(db);
 	}
-#endif
-
-	/*
-	 * Now we know we want to free it.
-	 * This call must be done last, since it has side effects -
-	 * calling the db_evict_func().
-	 */
-	dbuf_evict_user(db);
-	return (TRUE);
+	return (0);
 }
 
 static void
@@ -1349,9 +1419,36 @@ dbuf_destroy(dmu_buf_impl_t *db)
 {
 	ASSERT(refcount_is_zero(&db->db_holds));
 
+	if (db->db_blkid != DB_BONUS_BLKID) {
+		dnode_t *dn = db->db_dnode;
+
+		/*
+		 * If this dbuf is still on the dn_dbufs list,
+		 * remove it from that list.
+		 */
+		if (list_link_active(&db->db_link)) {
+			int need_mutex;
+
+			ASSERT(!MUTEX_HELD(&dn->dn_dbufs_mtx));
+			need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
+			if (need_mutex)
+				mutex_enter(&dn->dn_dbufs_mtx);
+
+			/* remove from dn_dbufs */
+			list_remove(&dn->dn_dbufs, db);
+
+			if (need_mutex)
+				mutex_exit(&dn->dn_dbufs_mtx);
+
+			dnode_rele(dn, db);
+		}
+		dbuf_hash_remove(db);
+	}
+	db->db_parent = NULL;
+	db->db_dnode = NULL;
+	db->db_buf = NULL;
+
 	ASSERT(db->db.db_data == NULL);
-	ASSERT(db->db_dnode == NULL);
-	ASSERT(db->db_parent == NULL);
 	ASSERT(db->db_hash_next == NULL);
 	ASSERT(db->db_blkptr == NULL);
 	ASSERT(db->db_data_pending == NULL);
@@ -1384,14 +1481,21 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
 	if (dbuf_findbp(dn, 0, blkid, TRUE, &parent, &bp) == 0) {
 		if (bp && !BP_IS_HOLE(bp)) {
+			zbookmark_t zb;
+
+			zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
+			    dn->dn_objset->os_dsl_dataset->ds_object : 0;
+			zb.zb_object = dn->dn_object;
+			zb.zb_level = 0;
+			zb.zb_blkid = blkid;
+
 			(void) arc_read(NULL, dn->dn_objset->os_spa, bp,
 			    dmu_ot[dn->dn_type].ot_byteswap,
 			    NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
-			    (ARC_NOWAIT | ARC_PREFETCH));
+			    (ARC_NOWAIT | ARC_PREFETCH), &zb);
 		}
 		if (parent && parent != dn->dn_dbuf)
-			dbuf_rele(parent);
+			dbuf_rele(parent, NULL);
 	}
 }
 
@@ -1405,11 +1509,12 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 {
 	dmu_buf_impl_t *db, *parent = NULL;
 
+	ASSERT(blkid != DB_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT3U(dn->dn_nlevels, >, level);
 
 	*dbp = NULL;
-
+top:
 	/* dbuf_find() returns with db_mtx held */
 	db = dbuf_find(dn, level, blkid);
 
@@ -1423,13 +1528,26 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 				err = ENOENT;
 			if (err) {
 				if (parent && parent != dn->dn_dbuf)
-					dbuf_rele(parent);
+					dbuf_rele(parent, NULL);
 				return (err);
 			}
 		}
+		if (err && err != ENOENT)
+			return (err);
 		db = dbuf_create(dn, level, blkid, parent, bp);
 	}
 
+	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
+		arc_buf_add_ref(db->db_buf, db);
+		if (db->db_buf->b_data == NULL) {
+			dbuf_clear(db);
+			goto top;
+		}
+		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
+	}
+
+	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
+
 	/*
 	 * If this buffer is currently syncing out, and we are
	 * are still referencing it from db_data, we need to make
@@ -1437,7 +1555,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 	 * again in this txg.
 	 */
 	if (db->db_level == 0 && db->db_state == DB_CACHED &&
-	    !(dn->dn_object & DMU_PRIVATE_OBJECT) &&
+	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    db->db_data_pending == db->db_buf) {
 		int size = (db->db_blkid == DB_BONUS_BLKID) ?
 		    DN_MAX_BONUSLEN : db->db.db_size;
@@ -1448,14 +1566,14 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 		    db->db.db_size);
 	}
 
-	dbuf_add_ref(db, tag);
+	(void) refcount_add(&db->db_holds, tag);
 	dbuf_update_data(db);
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);
 
 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
 	if (parent && parent != dn->dn_dbuf)
-		dbuf_rele(parent);
+		dbuf_rele(parent, NULL);
 
 	ASSERT3P(db->db_dnode, ==, dn);
 	ASSERT3U(db->db_blkid, ==, blkid);
@@ -1466,81 +1584,83 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
 }
 
 dmu_buf_impl_t *
-dbuf_hold(dnode_t *dn, uint64_t blkid)
+dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *db;
-	(void) dbuf_hold_impl(dn, 0, blkid, FALSE, NULL, &db);
-	return (db);
+	int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db);
+	return (err ? NULL : db);
 }
 
 dmu_buf_impl_t *
 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *db;
-	(void) dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
-	return (db);
+	int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db);
+	return (err ? NULL : db);
 }
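
A theme of the hunks above is that every dbuf hold is now tagged with its owner and can fail, and the matching release must use the same tag. A minimal sketch of the pairing (my_func is illustrative; the FTAG idiom is the one used throughout this patch, and the EIO mapping follows dmu_buf_hold() below):

	/* Illustrative only: tagged hold/release pairing. */
	static int
	my_func(dnode_t *dn, uint64_t blkid)
	{
		dmu_buf_impl_t *db;

		rw_enter(&dn->dn_struct_rwlock, RW_READER);
		db = dbuf_hold(dn, blkid, FTAG);	/* hold named by FTAG */
		rw_exit(&dn->dn_struct_rwlock);
		if (db == NULL)
			return (EIO);		/* holds can now fail */

		/* ... read and use db->db.db_data ... */

		dbuf_rele(db, FTAG);		/* must match the hold tag */
		return (0);
	}
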
 
 dmu_buf_impl_t *
-dbuf_hold_bonus(dnode_t *dn, void *tag)
+dbuf_create_bonus(dnode_t *dn)
 {
-	dmu_buf_impl_t *db;
-	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	(void) dbuf_hold_impl(dn, 0, DB_BONUS_BLKID, FALSE, tag, &db);
-	rw_exit(&dn->dn_struct_rwlock);
+	dmu_buf_impl_t *db = dn->dn_bonus;
+
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+
+	ASSERT(dn->dn_bonus == NULL);
+	db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
 	return (db);
 }
 
+#pragma weak dmu_buf_add_ref = dbuf_add_ref
 void
 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
 {
-	(void) refcount_add(&db->db_holds, tag);
-	/* dprintf_dbuf(db, "adding ref %p; holds up to %lld\n", tag, holds); */
+	int64_t holds = refcount_add(&db->db_holds, tag);
+	ASSERT(holds > 1);
 }
 
+#pragma weak dmu_buf_rele = dbuf_rele
 void
-dbuf_remove_ref(dmu_buf_impl_t *db, void *tag)
+dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
 	int64_t holds;
-	dnode_t *dn = db->db_dnode;
-	int need_mutex;
-
-	ASSERT(dn != NULL);
-	need_mutex = !MUTEX_HELD(&dn->dn_dbufs_mtx);
-
-	if (need_mutex) {
-		dnode_add_ref(dn, FTAG);
-		mutex_enter(&dn->dn_dbufs_mtx);
-	}
 
 	mutex_enter(&db->db_mtx);
 	DBUF_VERIFY(db);
 
 	holds = refcount_remove(&db->db_holds, tag);
+	ASSERT(holds >= 0);
+
+	if (holds == db->db_dirtycnt &&
+	    db->db_level == 0 && db->db_d.db_immediate_evict)
+		dbuf_evict_user(db);
 
 	if (holds == 0) {
-		ASSERT3U(db->db_state, !=, DB_FILL);
-		if (db->db_level == 0 &&
-		    db->db_d.db_user_data_ptr_ptr != NULL)
-			*db->db_d.db_user_data_ptr_ptr = NULL;
-		dbuf_evict(db);
+		if (db->db_blkid == DB_BONUS_BLKID) {
+			mutex_exit(&db->db_mtx);
+			dnode_rele(db->db_dnode, db);
+		} else if (db->db_buf == NULL) {
+			/*
+			 * This is a special case: we never associated this
+			 * dbuf with any data allocated from the ARC.
+			 */
+			ASSERT3U(db->db_state, ==, DB_UNCACHED);
+			dbuf_evict(db);
+		} else if (arc_released(db->db_buf)) {
+			arc_buf_t *buf = db->db_buf;
+			/*
+			 * This dbuf has anonymous data associated with it.
+			 */
+			dbuf_set_data(db, NULL);
+			VERIFY(arc_buf_remove_ref(buf, db) == 1);
+			dbuf_evict(db);
+		} else {
+			VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
+			mutex_exit(&db->db_mtx);
+		}
 	} else {
-		if (holds == db->db_dirtycnt &&
-		    db->db_level == 0 && db->db_d.db_immediate_evict)
-			dbuf_evict_user(db);
 		mutex_exit(&db->db_mtx);
 	}
-
-	if (need_mutex) {
-		mutex_exit(&dn->dn_dbufs_mtx);
-		dnode_rele(dn, FTAG);
-	}
-}
-
-void
-dbuf_rele(dmu_buf_impl_t *db)
-{
-	dbuf_remove_ref(db, NULL);
 }
 
 #pragma weak dmu_buf_refcount = dbuf_refcount
@@ -1611,6 +1731,8 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 	dnode_t *dn = db->db_dnode;
 	objset_impl_t *os = dn->dn_objset;
 	int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	int checksum, compress;
+	zbookmark_t zb;
 	int blksz;
 
 	ASSERT(dmu_tx_is_syncing(tx));
@@ -1638,8 +1760,38 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 	 * be modified yet.
 	 */
 
+	if (db->db_blkid == DB_BONUS_BLKID) {
+		void **datap = &db->db_d.db_data_old[txg&TXG_MASK];
+		/*
+		 * Simply copy the bonus data into the dnode.  It will
+		 * be written out when the dnode is synced (and it will
+		 * be synced, since it must have been dirty for dbuf_sync
+		 * to be called).
+		 */
+		/*
+		 * Use dn_phys->dn_bonuslen since db.db_size is the length
+		 * of the bonus buffer in the open transaction rather than
+		 * the syncing transaction.
+		 */
+		ASSERT(*datap != NULL);
+		ASSERT3U(db->db_level, ==, 0);
+		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+		if (*datap != db->db.db_data)
+			zio_buf_free(*datap, DN_MAX_BONUSLEN);
+		db->db_d.db_data_old[txg&TXG_MASK] = NULL;
+		db->db_data_pending = NULL;
+		if (db->db_dirtied == txg)
+			db->db_dirtied = 0;
+		ASSERT(db->db_dirtycnt > 0);
+		db->db_dirtycnt -= 1;
+		mutex_exit(&db->db_mtx);
+		dbuf_rele(db, (void *)(uintptr_t)txg);
+		return;
+	}
+
 	if (db->db_level == 0) {
-		data = &db->db_d.db_data_old[txg&TXG_MASK];
+		data = (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
 		blksz = arc_buf_size(*data);
 		/*
 		 * If this buffer is currently "in use" (i.e., there are
@@ -1651,17 +1803,15 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		 * modified in the syncing context (e.g. DNONE_DNODE blocks)
 		 * or if there is no actual write involved (bonus blocks).
 		 */
-		if (!(dn->dn_object & DMU_PRIVATE_OBJECT) &&
-		    db->db_d.db_overridden_by[txg&TXG_MASK] == NULL &&
-		    db->db_blkid != DB_BONUS_BLKID) {
+		if (dn->dn_object != DMU_META_DNODE_OBJECT &&
+		    db->db_d.db_overridden_by[txg&TXG_MASK] == NULL) {
 			if (refcount_count(&db->db_holds) > 1 &&
 			    *data == db->db_buf) {
-				*data = arc_buf_alloc(
-				    db->db_dnode->dn_objset->os_spa, blksz, db);
+				*data = arc_buf_alloc(os->os_spa, blksz, db);
 				bcopy(db->db.db_data, (*data)->b_data, blksz);
 			}
 			db->db_data_pending = *data;
-		} else if (dn->dn_object & DMU_PRIVATE_OBJECT) {
+		} else if (dn->dn_object == DMU_META_DNODE_OBJECT) {
 			/*
 			 * Private object buffers are released here rather
 			 * than in dbuf_dirty() since they are only modified
@@ -1683,7 +1833,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
 		mutex_exit(&db->db_mtx);
-		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+		dbuf_rele(db, (void *)(uintptr_t)txg);
 		return;
 	}
 	blksz = db->db.db_size;
@@ -1692,35 +1842,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 
 	ASSERT(*data != NULL);
 
-	if (db->db_blkid == DB_BONUS_BLKID) {
-		/*
-		 * Simply copy the bonus data into the dnode.  It will
-		 * be written out when the dnode is synced (and it will
-		 * be synced, since it must have been dirty for dbuf_sync
-		 * to be called).  The bonus data will be byte swapped
-		 * in dnode_byteswap.
-		 */
-		/*
-		 * Use dn_phys->dn_bonuslen since db.db_size is the length
-		 * of the bonus buffer in the open transaction rather than
-		 * the syncing transaction.
-		 */
-		ASSERT3U(db->db_level, ==, 0);
-		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, blksz);
-		bcopy((*data)->b_data, DN_BONUS(dn->dn_phys),
-		    dn->dn_phys->dn_bonuslen);
-		if (*data != db->db_buf)
-			arc_buf_free(*data, db);
-		db->db_d.db_data_old[txg&TXG_MASK] = NULL;
-		db->db_data_pending = NULL;
-		if (db->db_dirtied == txg)
-			db->db_dirtied = 0;
-		ASSERT(db->db_dirtycnt > 0);
-		db->db_dirtycnt -= 1;
-		mutex_exit(&db->db_mtx);
-		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
-		return;
-	} else if (db->db_level > 0 && !arc_released(db->db_buf)) {
+	if (db->db_level > 0 && !arc_released(db->db_buf)) {
 		/*
 		 * This indirect buffer was marked dirty, but
 		 * never modified (if it had been modified, then
@@ -1733,7 +1855,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
 		mutex_exit(&db->db_mtx);
-		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+		dbuf_rele(db, (void *)(uintptr_t)txg);
 		return;
 	} else if (db->db_blkptr == NULL &&
 	    db->db_level == dn->dn_phys->dn_nlevels-1 &&
@@ -1757,18 +1879,18 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		if (parent == NULL) {
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			(void) dbuf_hold_impl(dn, db->db_level+1,
-			    db->db_blkid >> epbs, FALSE, NULL, &parent);
+			    db->db_blkid >> epbs, FALSE, FTAG, &parent);
 			rw_exit(&dn->dn_struct_rwlock);
 			dbuf_add_ref(parent, db);
 			db->db_parent = parent;
-			dbuf_rele(parent);
+			dbuf_rele(parent, FTAG);
 		}
-		dbuf_read(parent);
+		(void) dbuf_read(parent, NULL, DB_RF_MUST_SUCCEED);
 	} else {
 		mutex_exit(&db->db_mtx);
 	}
 
-	ASSERT(IS_DNODE_DNODE(dn->dn_object) || db->db_parent != NULL);
+	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || db->db_parent != NULL);
 
 	if (db->db_level > 0 &&
 	    db->db_blkid > dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)) {
@@ -1801,7 +1923,7 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		mutex_enter(&db->db_mtx);
 		db->db_dirtycnt -= 1;
 		mutex_exit(&db->db_mtx);
-		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+		dbuf_rele(db, (void *)(uintptr_t)txg);
 		return;
 	}
 
@@ -1812,20 +1934,17 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		ASSERT(db->db_level == parent->db_level-1);
 		ASSERT(list_link_active(&parent->db_dirty_node[txg&TXG_MASK]));
 		/*
-		 * We may have read this block after we dirtied it,
+		 * We may have read this indirect block after we dirtied it,
 		 * so never released it from the cache.
 		 */
-		arc_release(parent->db_buf, parent);
+		arc_release(parent->db_buf, db->db_parent);
 
 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
 		    (db->db_blkid & ((1ULL << epbs) - 1));
 		DBUF_VERIFY(db);
 		mutex_exit(&db->db_mtx);
-	}
-
-	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
-
 #ifdef ZFS_DEBUG
-	if (db->db_parent == dn->dn_dbuf) {
+	} else {
 		/*
 		 * We don't need to dnode_setdirty(dn) because if we got
 		 * here then the parent is already dirty.
@@ -1833,11 +1952,14 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1);
 		ASSERT3P(db->db_blkptr, ==,
 		    &dn->dn_phys->dn_blkptr[db->db_blkid]);
-	}
 #endif
+	}
+	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
+
 	if (db->db_level == 0 &&
 	    db->db_d.db_overridden_by[txg&TXG_MASK] != NULL) {
-		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+		arc_buf_t **old =
+		    (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
 		blkptr_t **bpp = &db->db_d.db_overridden_by[txg&TXG_MASK];
 		int old_size = BP_GET_ASIZE(db->db_blkptr);
 		int new_size = BP_GET_ASIZE(*bpp);
@@ -1861,7 +1983,11 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		*bpp = NULL;
 
 		if (*old != db->db_buf)
-			arc_buf_free(*old, db);
+			VERIFY(arc_buf_remove_ref(*old, db) == 1);
+		else if (!BP_IS_HOLE(db->db_blkptr))
+			arc_set_callback(db->db_buf, dbuf_do_evict, db);
+		else
+			ASSERT(arc_released(db->db_buf));
 		*old = NULL;
 		db->db_data_pending = NULL;
 
@@ -1870,54 +1996,55 @@ dbuf_sync(dmu_buf_impl_t *db, zio_t *zio, dmu_tx_t *tx)
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
 		mutex_exit(&db->db_mtx);
-		dbuf_remove_ref(db, (void *)(uintptr_t)txg);
-	} else {
-		int checksum, compress;
+		dbuf_rele(db, (void *)(uintptr_t)txg);
+		return;
+	}
 
-		if (db->db_level > 0) {
-			/*
-			 * XXX -- we should design a compression algorithm
-			 * that specializes in arrays of bps.
-			 */
-			checksum = ZIO_CHECKSUM_FLETCHER_4;
-			/* XXX - disable compresssion for now */
-			compress = ZIO_COMPRESS_OFF;
+	if (db->db_level > 0) {
+		/*
+		 * XXX -- we should design a compression algorithm
+		 * that specializes in arrays of bps.
+		 */
+		checksum = ZIO_CHECKSUM_FLETCHER_4;
+		compress = ZIO_COMPRESS_LZJB;
+	} else {
+		/*
+		 * Allow dnode settings to override objset settings,
+		 * except for metadata checksums.
+		 */
+		if (dmu_ot[dn->dn_type].ot_metadata) {
+			checksum = os->os_md_checksum;
+			compress = zio_compress_select(dn->dn_compress,
+			    os->os_md_compress);
 		} else {
-			/*
-			 * Allow dnode settings to override objset settings,
-			 * except for metadata checksums.
-			 */
-			if (dmu_ot[dn->dn_type].ot_metadata) {
-				checksum = os->os_md_checksum;
-				compress = zio_compress_select(dn->dn_compress,
-				    os->os_md_compress);
-			} else {
-				checksum = zio_checksum_select(dn->dn_checksum,
-				    os->os_checksum);
-				compress = zio_compress_select(dn->dn_compress,
-				    os->os_compress);
-			}
+			checksum = zio_checksum_select(dn->dn_checksum,
+			    os->os_checksum);
+			compress = zio_compress_select(dn->dn_compress,
+			    os->os_compress);
 		}
+	}
 #ifdef ZFS_DEBUG
-		if (db->db_parent) {
-			ASSERT(list_link_active(
-			    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
-			ASSERT(db->db_parent == dn->dn_dbuf ||
-			    db->db_parent->db_level > 0);
-			if (dn->dn_object & DMU_PRIVATE_OBJECT ||
-			    db->db_level > 0)
-				ASSERT(*data == db->db_buf);
-		}
-#endif
-		ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
-		(void) arc_write(zio, os->os_spa, checksum, compress, txg,
-		    db->db_blkptr, *data, dbuf_write_done, db,
-		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT);
-		/*
-		 * We can't access db after arc_write, since it could finish
-		 * and be freed, and we have no locks on it.
-		 */
+	if (db->db_parent) {
+		ASSERT(list_link_active(
+		    &db->db_parent->db_dirty_node[txg&TXG_MASK]));
+		ASSERT(db->db_parent == dn->dn_dbuf ||
+		    db->db_parent->db_level > 0);
+		if (dn->dn_object == DMU_META_DNODE_OBJECT || db->db_level > 0)
+			ASSERT(*data == db->db_buf);
 	}
+#endif
+	ASSERT3U(db->db_blkptr->blk_birth, <=, tx->tx_txg);
+	zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0;
+	zb.zb_object = db->db.db_object;
+	zb.zb_level = db->db_level;
+	zb.zb_blkid = db->db_blkid;
+
+	(void) arc_write(zio, os->os_spa, checksum, compress, txg,
+	    db->db_blkptr, *data, dbuf_write_done, db,
+	    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_NOWAIT, &zb);
+	/*
+	 * We can't access db after arc_write, since it could finish
+	 * and be freed, and we have no locks on it.
+	 */
 }
 
 struct dbuf_arg {
@@ -1970,12 +2097,17 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 	db->db_dirtied = 0;
 
 	if (db->db_level == 0) {
-		arc_buf_t **old = &db->db_d.db_data_old[txg&TXG_MASK];
+		arc_buf_t **old =
+		    (arc_buf_t **)&db->db_d.db_data_old[txg&TXG_MASK];
 
 		ASSERT(db->db_blkid != DB_BONUS_BLKID);
 
 		if (*old != db->db_buf)
-			arc_buf_free(*old, db);
+			VERIFY(arc_buf_remove_ref(*old, db) == 1);
+		else if (!BP_IS_HOLE(db->db_blkptr))
+			arc_set_callback(db->db_buf, dbuf_do_evict, db);
+		else
+			ASSERT(arc_released(db->db_buf));
 		*old = NULL;
 		db->db_data_pending = NULL;
 
@@ -2007,6 +2139,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 		    db->db.db_size);
 		ASSERT3U(dn->dn_phys->dn_maxblkid
 		    >> (db->db_level * epbs), >=, db->db_blkid);
+		arc_set_callback(db->db_buf, dbuf_do_evict, db);
 	}
 	for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
 		if (BP_IS_HOLE(bp))
@@ -2053,5 +2186,5 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 		}
 	}
 
-	dbuf_remove_ref(db, (void *)(uintptr_t)txg);
+	dbuf_rele(db, (void *)(uintptr_t)txg);
 }
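
Throughout dbuf.c every read and write now carries a zbookmark_t identifying the block as <objset, object, level, blkid>, which is what lets the FMA machinery name the exact block when an I/O fails. The four assignments recur verbatim at each call site; a hypothetical helper capturing the idiom would look like the sketch below (the patch itself repeats the assignments inline rather than adding such a helper):

	/* Illustrative only -- not a function this patch introduces. */
	static void
	my_fill_zbookmark(zbookmark_t *zb, objset_impl_t *os,
	    dmu_buf_impl_t *db)
	{
		/* objset 0 is used when there is no owning dataset */
		zb->zb_objset = os->os_dsl_dataset ?
		    os->os_dsl_dataset->ds_object : 0;
		zb->zb_object = db->db.db_object;
		zb->zb_level = db->db_level;
		zb->zb_blkid = db->db_blkid;
	}
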
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 14fab6d420..f883842dad 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License").  You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
 */
@@ -40,6 +39,7 @@
 #include <sys/dmu_zfetch.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
+#include <sys/zio_checksum.h>
 
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{	byteswap_uint8_array,	TRUE,	"unallocated"		},
@@ -70,101 +70,40 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{	byteswap_uint8_array,	FALSE,	"other uint8[]"		},
 	{	byteswap_uint64_array,	FALSE,	"other uint64[]"	},
 	{	zap_byteswap,		TRUE,	"other ZAP"		},
+	{	zap_byteswap,		TRUE,	"persistent error log"	},
 };
 
-static int
-dmu_buf_read_array_impl(dmu_buf_impl_t **dbp, int numbufs, uint32_t flags)
-{
-	int i, err = 0;
-	dnode_t *dn;
-	zio_t *zio;
-	int canfail;
-	uint64_t rd_sz;
-
-	if (numbufs == 0)
-		return (0);
-
-	rd_sz = numbufs * dbp[0]->db.db_size;
-	ASSERT(rd_sz <= DMU_MAX_ACCESS);
-
-	dn = dbp[0]->db_dnode;
-	if (flags & DB_RF_CANFAIL) {
-		canfail = 1;
-	} else {
-		canfail = 0;
-	}
-	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, canfail);
-
-	/* don't prefetch if read the read is large */
-	if (rd_sz >= zfetch_array_rd_sz) {
-		flags |= DB_RF_NOPREFETCH;
-	}
-
-	/* initiate async reads */
-	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	for (i = 0; i < numbufs; i++) {
-		if (dbp[i]->db_state == DB_UNCACHED)
-			dbuf_read_impl(dbp[i], zio, flags);
-	}
-	rw_exit(&dn->dn_struct_rwlock);
-	err = zio_wait(zio);
-
-	if (err)
-		return (err);
-
-	/* wait for other io to complete */
-	for (i = 0; i < numbufs; i++) {
-		mutex_enter(&dbp[i]->db_mtx);
-		while (dbp[i]->db_state == DB_READ ||
-		    dbp[i]->db_state == DB_FILL)
-			cv_wait(&dbp[i]->db_changed, &dbp[i]->db_mtx);
-		ASSERT(dbp[i]->db_state == DB_CACHED);
-		mutex_exit(&dbp[i]->db_mtx);
-	}
-
-	return (0);
-}
-
-void
-dmu_buf_read_array(dmu_buf_t **dbp_fake, int numbufs)
-{
-	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
-	int err;
-
-	err = dmu_buf_read_array_impl(dbp, numbufs, DB_RF_MUST_SUCCEED);
-	ASSERT(err == 0);
-}
-
 int
-dmu_buf_read_array_canfail(dmu_buf_t **dbp_fake, int numbufs)
-{
-	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
-
-	return (dmu_buf_read_array_impl(dbp, numbufs, DB_RF_CANFAIL));
-}
-
-dmu_buf_t *
-dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset)
+dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
+    void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	uint64_t blkid;
 	dmu_buf_impl_t *db;
+	int err;
 
 	/* dataset_verify(dd); */
 
-	dn = dnode_hold(os->os, object, FTAG);
+	err = dnode_hold(os->os, object, FTAG, &dn);
+	if (err)
+		return (err);
 	blkid = dbuf_whichblock(dn, offset);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
-	db = dbuf_hold(dn, blkid);
+	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
-	dnode_rele(dn, FTAG);
-	return (&db->db);
-}
+	if (db == NULL) {
+		err = EIO;
+	} else {
+		err = dbuf_read(db, NULL, DB_RF_CANFAIL);
+		if (err) {
+			dbuf_rele(db, tag);
+			db = NULL;
+		}
+	}
 
-dmu_buf_t *
-dmu_bonus_hold(objset_t *os, uint64_t object)
-{
-	return (dmu_bonus_hold_tag(os, object, NULL));
+	dnode_rele(dn, FTAG);
+	*dbp = &db->db;
+	return (err);
 }
 
 int
@@ -174,41 +113,69 @@ dmu_bonus_max(void)
 }
 
 /*
- * Returns held bonus buffer if the object exists, NULL if it doesn't.
+ * returns ENOENT, EIO, or 0.
 */
-dmu_buf_t *
-dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag)
+int
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
 {
-	dnode_t *dn = dnode_hold(os->os, object, FTAG);
+	dnode_t *dn;
+	int err, count;
 	dmu_buf_impl_t *db;
 
-	if (dn == NULL)
-		return (NULL);
+	err = dnode_hold(os->os, object, FTAG, &dn);
+	if (err)
+		return (err);
 
-	db = dbuf_hold_bonus(dn, tag);
-	/* XXX - hack: hold the first block if this is a ZAP object */
-	if (dmu_ot[dn->dn_type].ot_byteswap == zap_byteswap) {
-		rw_enter(&dn->dn_struct_rwlock, RW_READER);
-		dn->dn_db0 = dbuf_hold(dn, 0);
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_bonus == NULL) {
 		rw_exit(&dn->dn_struct_rwlock);
+		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		if (dn->dn_bonus == NULL)
+			dn->dn_bonus = dbuf_create_bonus(dn);
 	}
+	db = dn->dn_bonus;
+	rw_exit(&dn->dn_struct_rwlock);
+	mutex_enter(&db->db_mtx);
+	count = refcount_add(&db->db_holds, tag);
+	mutex_exit(&db->db_mtx);
+	if (count == 1)
+		dnode_add_ref(dn, db);
 	dnode_rele(dn, FTAG);
-	return (&db->db);
+
+	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
+
+	*dbp = &db->db;
+	return (0);
 }
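
Since dmu_bonus_hold() (and dmu_buf_hold() above it) now return an error code and hand back the held buffer through an out parameter, callers follow a check-then-rele pattern. A hedged sketch of the new calling convention; read_bonus is an illustrative name, not an interface in this patch:

	/* Illustrative only: the new hold/read/rele convention. */
	static int
	read_bonus(objset_t *os, uint64_t object)
	{
		dmu_buf_t *db;
		int err;

		/* per the comment above: returns ENOENT, EIO, or 0 */
		err = dmu_bonus_hold(os, object, FTAG, &db);
		if (err)
			return (err);

		/* ... examine db->db_data for db->db_size bytes ... */

		dmu_buf_rele(db, FTAG);		/* release with the same tag */
		return (0);
	}
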
dbuf_remove_ref(db, tag); -} - -void -dmu_buf_rele(dmu_buf_t *dbuf_fake) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake; - - /* XXX - hack: hold the first block if this is a ZAP object */ - if (db->db_blkid == DB_BONUS_BLKID && - dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap) - dbuf_rele(db->db_dnode->dn_db0); - dbuf_rele(db); -} + /* wait for async i/o */ + err = zio_wait(zio); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } -void -dmu_buf_rele_tag(dmu_buf_t *dbuf_fake, void *tag) -{ - dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf_fake; + /* wait for other io to complete */ + if (read) { + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || + db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) + err = EIO; + mutex_exit(&db->db_mtx); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + } + } - /* XXX - hack: hold the first block if this is a ZAP object */ - if (db->db_blkid == DB_BONUS_BLKID && - dmu_ot[db->db_dnode->dn_type].ot_byteswap == zap_byteswap) - dbuf_rele(db->db_dnode->dn_db0); - dbuf_remove_ref(db, tag); + *numbufsp = nblks; + *dbpp = dbp; + return (0); } void -dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs) +dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; @@ -302,10 +248,10 @@ dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs) if (numbufs == 0) return; - ASSERT((numbufs * dbp[0]->db.db_size) <= DMU_MAX_ACCESS); - - for (i = 0; i < numbufs; i++) - dbuf_rele(dbp[i]); + for (i = 0; i < numbufs; i++) { + if (dbp[i]) + dbuf_rele(dbp[i], tag); + } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } @@ -315,7 +261,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) { dnode_t *dn; uint64_t blkid; - int nblks, i; + int nblks, i, err; if (len == 0) { /* they're interested in the bonus buffer */ dn = os->os->os_meta_dnode; @@ -335,8 +281,8 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. */ - dn = dnode_hold(os->os, object, FTAG); - if (dn == NULL) + err = dnode_hold(os->os, object, FTAG, &dn); + if (err != 0) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); @@ -359,39 +305,44 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) dnode_rele(dn, FTAG); } -void +int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { - dnode_t *dn = dnode_hold(os->os, object, FTAG); + dnode_t *dn; + int err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == -1ULL || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); + return (0); } -static int -dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf, uint32_t flags) +int +dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf) { dnode_t *dn; dmu_buf_t **dbp; - int numbufs, i; - - dn = dnode_hold(os->os, object, FTAG); + int numbufs, i, err; + /* + * Deal with odd block sizes, where there can't be data past the + * first block. + */ + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); if (dn->dn_datablkshift == 0) { int newsz = offset > dn->dn_datablksz ? 
0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); size = newsz; } - dnode_rele(dn, FTAG); - if (size == 0) - return (0); - while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int err; @@ -400,13 +351,10 @@ dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ - dbp = dmu_buf_hold_array(os, object, offset, mylen, &numbufs); - err = dmu_buf_read_array_impl((dmu_buf_impl_t **)dbp, numbufs, - flags); - if (err) { - dmu_buf_rele_array(dbp, numbufs); + err = dmu_buf_hold_array(os, object, offset, mylen, + TRUE, FTAG, &numbufs, &dbp); + if (err) return (err); - } for (i = 0; i < numbufs; i++) { int tocpy; @@ -424,36 +372,20 @@ dmu_read_impl(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, size -= tocpy; buf = (char *)buf + tocpy; } - dmu_buf_rele_array(dbp, numbufs); + dmu_buf_rele_array(dbp, numbufs, FTAG); } return (0); } void -dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf) -{ - int err; - - err = dmu_read_impl(os, object, offset, size, buf, DB_RF_MUST_SUCCEED); - ASSERT3U(err, ==, 0); -} - -int -dmu_read_canfail(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf) -{ - return (dmu_read_impl(os, object, offset, size, buf, DB_RF_CANFAIL)); -} - -void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; - dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs); + VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { int tocpy; @@ -481,7 +413,7 @@ dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, size -= tocpy; buf = (char *)buf + tocpy; } - dmu_buf_rele_array(dbp, numbufs); + dmu_buf_rele_array(dbp, numbufs, FTAG); } #ifdef _KERNEL @@ -493,7 +425,10 @@ dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, int numbufs, i; int err = 0; - dbp = dmu_buf_hold_array(os, object, offset, size, &numbufs); + err = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); for (i = 0; i < numbufs; i++) { int tocpy; @@ -530,7 +465,7 @@ dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, offset += tocpy; size -= tocpy; } - dmu_buf_rele_array(dbp, numbufs); + dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } #endif @@ -539,6 +474,7 @@ struct backuparg { dmu_replay_record_t *drr; vnode_t *vp; objset_t *os; + zio_cksum_t zc; int err; }; @@ -546,8 +482,9 @@ static int dump_bytes(struct backuparg *ba, void *buf, int len) { ssize_t resid; /* have to get resid to get detailed errno */ - /* Need to compute checksum here */ ASSERT3U(len % 8, ==, 0); + + fletcher_4_incremental_native(buf, len, &ba->zc); ba->err = vn_rdwr(UIO_WRITE, ba->vp, (caddr_t)buf, len, 0, UIO_SYSSPACE, FAPPEND, RLIM_INFINITY, CRED(), &resid); @@ -652,7 +589,7 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) void *data = bc->bc_data; int err = 0; - if (issig(JUSTLOOKING)) + if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); ASSERT(data || bp == NULL); @@ -681,16 +618,21 @@ backup_cb(traverse_blk_cache_t *bc, spa_t *spa, void *arg) int blksz = BP_GET_LSIZE(bp); if (data == NULL) { arc_buf_t *abuf; + zbookmark_t zb; + zb.zb_objset = ba->os->os->os_dsl_dataset->ds_object; + zb.zb_object = object; + zb.zb_level = level; + 
zb.zb_blkid = blkid; (void) arc_read(NULL, spa, bp, dmu_ot[type].ot_byteswap, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_MUSTSUCCEED, - ARC_WAIT); + ARC_WAIT, &zb); if (abuf) { err = dump_data(ba, type, object, blkid * blksz, blksz, abuf->b_data); - arc_buf_free(abuf, &abuf); + (void) arc_buf_remove_ref(abuf, &abuf); } } else { err = dump_data(ba, type, object, blkid * blksz, @@ -736,6 +678,7 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp) ba.drr = drr; ba.vp = vp; ba.os = tosnap; + ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { kmem_free(drr, sizeof (dmu_replay_record_t)); @@ -755,6 +698,7 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, vnode_t *vp) bzero(drr, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; + drr->drr_u.drr_end.drr_checksum = ba.zc; if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) return (ba.err); @@ -773,6 +717,7 @@ struct restorearg { int buflen; /* number of valid bytes in buf */ int bufoff; /* next offset to read */ int bufsize; /* amount of memory allocated for buf */ + zio_cksum_t zc; }; static int @@ -789,8 +734,11 @@ replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) if (dd->dd_phys->dd_head_dataset_obj == 0) goto die; - ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj, - NULL, DS_MODE_EXCLUSIVE, FTAG); + err = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, + NULL, DS_MODE_EXCLUSIVE, FTAG, &ds); + if (err) + goto die; if (ds == NULL) { err = EBUSY; @@ -804,9 +752,11 @@ replay_incremental_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) } /* most recent snapshot must match fromguid */ - ds_prev = dsl_dataset_open_obj(dd->dd_pool, + err = dsl_dataset_open_obj(dd->dd_pool, ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_STANDARD | DS_MODE_READONLY, FTAG); + DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds_prev); + if (err) + goto die; if (ds_prev->ds_phys->ds_guid != drrb->drr_fromguid) { err = ENODEV; goto die; @@ -885,9 +835,8 @@ replay_full_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) /* the point of no (unsuccessful) return */ - err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname, - DS_MODE_EXCLUSIVE, FTAG, &ds); - ASSERT3U(err, ==, 0); + VERIFY(0 == dsl_dataset_open_spa(dd->dd_pool->dp_spa, fsfullname, + DS_MODE_EXCLUSIVE, FTAG, &ds)); kmem_free(fsfullname, MAXNAMELEN); (void) dmu_objset_create_impl(dsl_dataset_get_spa(ds), @@ -921,9 +870,8 @@ replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) return (err); /* set snapshot's creation time and guid */ - err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname, - DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds); - ASSERT3U(err, ==, 0); + VERIFY(0 == dsl_dataset_open_spa(dd->dd_pool->dp_spa, drrb->drr_toname, + DS_MODE_PRIMARY | DS_MODE_READONLY | DS_MODE_RESTORE, FTAG, &ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_creation_time = drrb->drr_creation_time; @@ -932,8 +880,9 @@ replay_end_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) dsl_dataset_close(ds, DS_MODE_PRIMARY, FTAG); - ds = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj, - NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG); + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, + NULL, DS_MODE_STANDARD | DS_MODE_RESTORE, FTAG, &ds)); dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_restoring = FALSE; dsl_dataset_close(ds, DS_MODE_STANDARD, FTAG); @@ -959,8 +908,6 @@ restore_read(struct restorearg *ra, 
int len) ra->voff, UIO_SYSSPACE, FAPPEND, RLIM_INFINITY, CRED(), &resid); - /* Need to compute checksum */ - ra->voff += ra->bufsize - leftover - resid; ra->buflen = ra->bufsize - resid; ra->bufoff = 0; @@ -968,12 +915,17 @@ restore_read(struct restorearg *ra, int len) ra->err = EINVAL; if (ra->err) return (NULL); + /* Could compute checksum here? */ } ASSERT3U(ra->bufoff % 8, ==, 0); ASSERT3U(ra->buflen - ra->bufoff, >=, len); rv = ra->buf + ra->bufoff; ra->bufoff += len; + if (ra->byteswap) + fletcher_4_incremental_byteswap(rv, len, &ra->zc); + else + fletcher_4_incremental_native(rv, len, &ra->zc); return (rv); } @@ -1016,7 +968,10 @@ backup_byteswap(dmu_replay_record_t *drr) DO64(drr_free.drr_length); break; case DRR_END: - DO64(drr_end.drr_checksum); + DO64(drr_end.drr_checksum.zc_word[0]); + DO64(drr_end.drr_checksum.zc_word[1]); + DO64(drr_end.drr_checksum.zc_word[2]); + DO64(drr_end.drr_checksum.zc_word[3]); break; } #undef DO64 @@ -1089,7 +1044,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) if (drro->drr_bonuslen) { dmu_buf_t *db; void *data; - db = dmu_bonus_hold(os, drro->drr_object); + VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, ==, drro->drr_bonuslen); @@ -1103,7 +1058,7 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data, drro->drr_bonuslen); } - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); } dmu_tx_commit(tx); return (0); @@ -1202,21 +1157,22 @@ restore_free(struct restorearg *ra, objset_t *os, dmu_tx_abort(tx); return (err); } - dmu_free_range(os, drrf->drr_object, + err = dmu_free_range(os, drrf->drr_object, drrf->drr_offset, drrf->drr_length, tx); dmu_tx_commit(tx); - return (0); + return (err); } int -dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, +dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, vnode_t *vp, uint64_t voffset) { struct restorearg ra; dmu_replay_record_t *drr; - char *cp, *tosnap; + char *cp; dsl_dir_t *dd = NULL; objset_t *os = NULL; + zio_cksum_t pzc; bzero(&ra, sizeof (ra)); ra.vp = vp; @@ -1233,6 +1189,23 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, goto out; } + /* + * NB: this assumes that struct drr_begin will be the largest in + * dmu_replay_record_t's drr_u, and thus we don't need to pad it + * with zeros to make it the same length as we wrote out. 
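+	 * If drr_u ever grew a larger member, the BEGIN record rebuilt
+	 * here would checksum differently from the one dmu_sendbackup()
+	 * wrote, and the DRR_END comparison below would always fail.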
+ */ + ((dmu_replay_record_t *)ra.buf)->drr_type = DRR_BEGIN; + ((dmu_replay_record_t *)ra.buf)->drr_pad = 0; + ((dmu_replay_record_t *)ra.buf)->drr_u.drr_begin = *drrb; + if (ra.byteswap) { + fletcher_4_incremental_byteswap(ra.buf, + sizeof (dmu_replay_record_t), &ra.zc); + } else { + fletcher_4_incremental_native(ra.buf, + sizeof (dmu_replay_record_t), &ra.zc); + } + (void) strcpy(drrb->drr_toname, tosnap); /* for the sync funcs */ + if (ra.byteswap) { drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_version = BSWAP_64(drrb->drr_version); @@ -1244,7 +1217,6 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); - tosnap = drrb->drr_toname; if (drrb->drr_version != DMU_BACKUP_VERSION || drrb->drr_type >= DMU_OST_NUMTYPES || strchr(drrb->drr_toname, '@') == NULL) { @@ -1260,12 +1232,10 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, cp = strchr(tosnap, '@'); *cp = '\0'; - dd = dsl_dir_open(tosnap, FTAG, NULL); + ra.err = dsl_dir_open(tosnap, FTAG, &dd, NULL); *cp = '@'; - if (dd == NULL) { - ra.err = ENOENT; + if (ra.err) goto out; - } ra.err = dsl_dir_sync_task(dd, replay_incremental_sync, drrb, 1<<20); @@ -1275,12 +1245,10 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, cp = strchr(tosnap, '@'); *cp = '\0'; - dd = dsl_dir_open(tosnap, FTAG, &tail); + ra.err = dsl_dir_open(tosnap, FTAG, &dd, &tail); *cp = '@'; - if (dd == NULL) { - ra.err = ENOENT; + if (ra.err) goto out; - } if (tail == NULL) { ra.err = EEXIST; goto out; @@ -1306,9 +1274,10 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, /* * Read records and process them. */ + pzc = ra.zc; while (ra.err == 0 && NULL != (drr = restore_read(&ra, sizeof (*drr)))) { - if (issig(JUSTLOOKING)) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { ra.err = EINTR; goto out; } @@ -1348,7 +1317,22 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, break; } case DRR_END: - /* Need to verify checksum. */ + { + struct drr_end drre = drr->drr_u.drr_end; + /* + * We compare against the *previous* checksum + * value, because the stored checksum is of + * everything before the DRR_END record. + */ + if (drre.drr_checksum.zc_word[0] != 0 && + ((drre.drr_checksum.zc_word[0] - pzc.zc_word[0]) | + (drre.drr_checksum.zc_word[1] - pzc.zc_word[1]) | + (drre.drr_checksum.zc_word[2] - pzc.zc_word[2]) | + (drre.drr_checksum.zc_word[3] - pzc.zc_word[3]))) { + ra.err = ECKSUM; + goto out; + } + /* * dd may be the parent of the dd we are * restoring into (eg. if it's a full backup). @@ -1356,10 +1340,12 @@ dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, ra.err = dsl_dir_sync_task(dmu_objset_ds(os)-> ds_dir, replay_end_sync, drrb, 1<<20); goto out; + } default: ra.err = EINVAL; goto out; } + pzc = ra.zc; } out: @@ -1443,6 +1429,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, dmu_buf_impl_t *db; blkptr_t *blk; int err; + zbookmark_t zb; ASSERT(RW_LOCK_HELD(&tx->tx_suspend)); ASSERT(BP_IS_HOLE(bp)); @@ -1452,6 +1439,11 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); /* + * XXX why is this routine using dmu_buf_*() and casting between + * dmu_buf_impl_t and dmu_buf_t? + */ + + /* * If this txg already synced, there's nothing to do. */ if (txg <= tx->tx_synced_txg) { @@ -1459,7 +1451,10 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, * If we're running ziltest, we need the blkptr regardless. 
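	 * (spa_freeze_txg() is UINT64_MAX unless the pool has been
	 * frozen for ZIL testing, so this path is never taken in
	 * normal operation.)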
*/ if (txg > spa_freeze_txg(dp->dp_spa)) { - db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset); + err = dmu_buf_hold(os, object, offset, + FTAG, (dmu_buf_t **)&db); + if (err) + return (err); /* if db_blkptr == NULL, this was an empty write */ if (db->db_blkptr) *bp = *db->db_blkptr; /* structure assignment */ @@ -1467,7 +1462,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, bzero(bp, sizeof (blkptr_t)); *blkoff = offset - db->db.db_offset; ASSERT3U(*blkoff, <, db->db.db_size); - dmu_buf_rele((dmu_buf_t *)db); + dmu_buf_rele((dmu_buf_t *)db, FTAG); return (0); } return (EALREADY); @@ -1481,7 +1476,9 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, return (EINPROGRESS); } - db = (dmu_buf_impl_t *)dmu_buf_hold(os, object, offset); + err = dmu_buf_hold(os, object, offset, FTAG, (dmu_buf_t **)&db); + if (err) + return (err); mutex_enter(&db->db_mtx); @@ -1491,7 +1488,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, */ if (!list_link_active(&db->db_dirty_node[txg&TXG_MASK])) { mutex_exit(&db->db_mtx); - dmu_buf_rele((dmu_buf_t *)db); + dmu_buf_rele((dmu_buf_t *)db, FTAG); return (ENOENT); } @@ -1505,7 +1502,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, ASSERT(blk != IN_DMU_SYNC); if (blk == IN_DMU_SYNC) { mutex_exit(&db->db_mtx); - dmu_buf_rele((dmu_buf_t *)db); + dmu_buf_rele((dmu_buf_t *)db, FTAG); return (EBUSY); } arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); @@ -1522,11 +1519,15 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, blk = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); blk->blk_birth = 0; /* mark as invalid */ + zb.zb_objset = os->os->os_dsl_dataset->ds_object; + zb.zb_object = db->db.db_object; + zb.zb_level = db->db_level; + zb.zb_blkid = db->db_blkid; err = arc_write(NULL, os->os->os_spa, zio_checksum_select(db->db_dnode->dn_checksum, os->os->os_checksum), zio_compress_select(db->db_dnode->dn_compress, os->os->os_compress), txg, blk, db->db_d.db_data_old[txg&TXG_MASK], NULL, NULL, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT); + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb); ASSERT(err == 0); if (!BP_IS_HOLE(blk)) { @@ -1546,7 +1547,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, ASSERT3P(db->db_d.db_overridden_by[txg&TXG_MASK], ==, NULL); arc_release(db->db_d.db_data_old[txg&TXG_MASK], db); mutex_exit(&db->db_mtx); - dmu_buf_rele((dmu_buf_t *)db); + dmu_buf_rele((dmu_buf_t *)db, FTAG); /* Note that this block does not free on disk until txg syncs */ /* @@ -1563,7 +1564,7 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, db->db_d.db_overridden_by[txg&TXG_MASK] = blk; mutex_exit(&db->db_mtx); - dmu_buf_rele((dmu_buf_t *)db); + dmu_buf_rele((dmu_buf_t *)db, FTAG); ASSERT3U(txg, >, tx->tx_syncing_txg); return (0); } @@ -1571,7 +1572,10 @@ dmu_sync(objset_t *os, uint64_t object, uint64_t offset, uint64_t *blkoff, uint64_t dmu_object_max_nonzero_offset(objset_t *os, uint64_t object) { - dnode_t *dn = dnode_hold(os->os, object, FTAG); + dnode_t *dn; + + /* XXX assumes dnode_hold will not get an i/o error */ + (void) dnode_hold(os->os, object, FTAG, &dn); uint64_t rv = dnode_max_nonzero_offset(dn); dnode_rele(dn, FTAG); return (rv); @@ -1581,8 +1585,13 @@ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { - dnode_t *dn = dnode_hold(os->os, object, FTAG); - int err = dnode_set_blksz(dn, 
size, ibs, tx); + dnode_t *dn; + int err; + + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); + err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } @@ -1591,7 +1600,10 @@ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { - dnode_t *dn = dnode_hold(os->os, object, FTAG); + dnode_t *dn; + + /* XXX assumes dnode_hold will not get an i/o error */ + (void) dnode_hold(os->os, object, FTAG, &dn); ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); @@ -1602,7 +1614,10 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { - dnode_t *dn = dnode_hold(os->os, object, FTAG); + dnode_t *dn; + + /* XXX assumes dnode_hold will not get an i/o error */ + (void) dnode_hold(os->os, object, FTAG, &dn); ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); dn->dn_compress = compress; dnode_setdirty(dn, tx); @@ -1615,7 +1630,9 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) dnode_t *dn; int i, err; - dn = dnode_hold(os->os, object, FTAG); + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); /* * Sync any current changes before * we go trundling through the block pointers. @@ -1627,7 +1644,9 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) if (i != TXG_SIZE) { dnode_rele(dn, FTAG); txg_wait_synced(dmu_objset_pool(os), 0); - dn = dnode_hold(os->os, object, FTAG); + err = dnode_hold(os->os, object, FTAG, &dn); + if (err) + return (err); } err = dnode_next_offset(dn, hole, off, 1, 1); @@ -1665,10 +1684,11 @@ dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { - dnode_t *dn = dnode_hold(os->os, object, FTAG); + dnode_t *dn; + int err = dnode_hold(os->os, object, FTAG, &dn); - if (dn == NULL) - return (ENOENT); + if (err) + return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); @@ -1699,6 +1719,71 @@ dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512) *nblk512 = dn->dn_phys->dn_secphys + 1; /* add 1 for dnode space */ } +/* + * Given a bookmark, return the name of the dataset, object, and range in + * human-readable format. 
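+ * This exists for the error-reporting paths added with this change
+ * (the persistent error log and FMA ereports), where the raw
+ * zbookmark_t fields would be meaningless to an administrator.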
+ */ +int +spa_bookmark_name(spa_t *spa, zbookmark_t *zb, char *dsname, size_t dslen, + char *objname, size_t objlen, char *range, size_t rangelen) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds = NULL; + objset_t *os = NULL; + dnode_t *dn = NULL; + int err, shift; + + if (dslen < MAXNAMELEN || objlen < 32 || rangelen < 64) + return (ENOSPC); + + dp = spa_get_dsl(spa); + if (zb->zb_objset != 0) { + rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dsl_dataset_open_obj(dp, zb->zb_objset, + NULL, DS_MODE_NONE, FTAG, &ds); + if (err) { + rw_exit(&dp->dp_config_rwlock); + return (err); + } + dsl_dataset_name(ds, dsname); + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + rw_exit(&dp->dp_config_rwlock); + + err = dmu_objset_open(dsname, DMU_OST_ANY, DS_MODE_NONE, &os); + if (err) + goto out; + + } else { + dsl_dataset_name(NULL, dsname); + os = dp->dp_meta_objset; + } + + + if (zb->zb_object == DMU_META_DNODE_OBJECT) { + (void) strncpy(objname, "mdn", objlen); + } else { + (void) snprintf(objname, objlen, "%lld", + (longlong_t)zb->zb_object); + } + + err = dnode_hold(os->os, zb->zb_object, FTAG, &dn); + if (err) + goto out; + + shift = (dn->dn_datablkshift?dn->dn_datablkshift:SPA_MAXBLOCKSHIFT) + + zb->zb_level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT); + (void) snprintf(range, rangelen, "%llu-%llu", + (u_longlong_t)(zb->zb_blkid << shift), + (u_longlong_t)((zb->zb_blkid+1) << shift)); + +out: + if (dn) + dnode_rele(dn, FTAG); + if (os && os != dp->dp_meta_objset) + dmu_objset_close(os); + return (err); +} + void byteswap_uint64_array(void *vbuf, size_t size) { diff --git a/usr/src/uts/common/fs/zfs/dmu_object.c b/usr/src/uts/common/fs/zfs/dmu_object.c index d150d6c400..99d40c5ec5 100644 --- a/usr/src/uts/common/fs/zfs/dmu_object.c +++ b/usr/src/uts/common/fs/zfs/dmu_object.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,7 +38,7 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, uint64_t object; uint64_t L2_dnode_count = DNODES_PER_BLOCK << (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); - dnode_t *dn; + dnode_t *dn = NULL; int restarted = B_FALSE; mutex_enter(&osi->os_obj_lock); @@ -62,7 +61,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, } osi->os_obj_next = ++object; - dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG); + /* + * XXX We should check for an i/o error here and return + * up to our caller. Actually we should pre-read it in + * dmu_tx_assign(), but there is currently no mechanism + * to do so. 
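+	 * Until then, an i/o error here is indistinguishable from an
+	 * in-use slot: dn stays NULL and we keep searching for a free
+	 * object number.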
+ */ + (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, + FTAG, &dn); if (dn) break; @@ -84,13 +90,14 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { dnode_t *dn; + int err; - if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx)) + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) return (EBADF); - dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG); - if (dn == NULL) - return (EEXIST); + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn); + if (err) + return (err); dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); dnode_rele(dn, FTAG); @@ -103,13 +110,15 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { dnode_t *dn; + int err; - if ((object & DMU_PRIVATE_OBJECT) && !dmu_tx_private_ok(tx)) + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) return (EBADF); - dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG); - if (dn == NULL) - return (EBADF); + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err) + return (err); dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); dnode_rele(dn, FTAG); @@ -120,12 +129,14 @@ int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; + int err; - ASSERT(!(object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx)); + ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - dn = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, FTAG); - if (dn == NULL) - return (ENOENT); + err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + FTAG, &dn); + if (err) + return (err); ASSERT(dn->dn_type != DMU_OT_NONE); dnode_free(dn, tx); diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index 8d77ff70c0..6625fdb98d 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -127,8 +126,9 @@ dmu_objset_byteswap(void *buf, size_t size) osp->os_type = BSWAP_64(osp->os_type); } -objset_impl_t * -dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp) +int +dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + objset_impl_t **osip) { objset_impl_t *winner, *osi; int i, err, checksum; @@ -141,15 +141,25 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp) osi->os_rootbp = *bp; osi->os_phys = zio_buf_alloc(sizeof (objset_phys_t)); if (!BP_IS_HOLE(&osi->os_rootbp)) { + zbookmark_t zb; + zb.zb_objset = ds ? 
ds->ds_object : 0; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = 0; + dprintf_bp(&osi->os_rootbp, "reading %s", ""); - (void) arc_read(NULL, spa, &osi->os_rootbp, + err = arc_read(NULL, spa, &osi->os_rootbp, dmu_ot[DMU_OT_OBJSET].ot_byteswap, arc_bcopy_func, osi->os_phys, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT); + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, ARC_WAIT, &zb); + if (err) { + zio_buf_free(osi->os_phys, sizeof (objset_phys_t)); + kmem_free(osi, sizeof (objset_impl_t)); + return (err); + } } else { bzero(osi->os_phys, sizeof (objset_phys_t)); } - osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header); /* * Note: the changed_cb will be called once before the register @@ -159,18 +169,22 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp) if (ds) { err = dsl_prop_register(ds, "checksum", checksum_changed_cb, osi); - ASSERT(err == 0); - - err = dsl_prop_register(ds, "compression", - compression_changed_cb, osi); - ASSERT(err == 0); + if (err == 0) + err = dsl_prop_register(ds, "compression", + compression_changed_cb, osi); + if (err) { + zio_buf_free(osi->os_phys, sizeof (objset_phys_t)); + kmem_free(osi, sizeof (objset_impl_t)); + return (err); + } } else { /* It's the meta-objset. */ - /* XXX - turn off metadata compression temporarily */ osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4; - osi->os_compress = ZIO_COMPRESS_OFF; + osi->os_compress = ZIO_COMPRESS_LZJB; } + osi->os_zil = zil_alloc(&osi->os, &osi->os_phys->os_zil_header); + /* * Metadata always gets compressed and checksummed. * If the data checksum is multi-bit correctable, and it's not @@ -184,9 +198,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp) osi->os_md_checksum = checksum; else osi->os_md_checksum = ZIO_CHECKSUM_FLETCHER_4; - - /* XXX - turn off metadata compression temporarily */ - osi->os_md_compress = ZIO_COMPRESS_OFF; + osi->os_md_compress = ZIO_COMPRESS_LZJB; for (i = 0; i < TXG_SIZE; i++) { list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t), @@ -210,7 +222,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp) } } - return (osi); + *osip = osi; + return (0); } /* called from zpl */ @@ -235,7 +248,13 @@ dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, blkptr_t bp; dsl_dataset_get_blkptr(ds, &bp); - osi = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, &bp); + err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), + ds, &bp, &osi); + if (err) { + dsl_dataset_close(ds, mode, os); + kmem_free(os, sizeof (objset_t)); + return (err); + } } os->os = osi; @@ -257,9 +276,51 @@ dmu_objset_close(objset_t *os) } void +dmu_objset_evict_dbufs(objset_t *os) +{ + objset_impl_t *osi = os->os; + dnode_t *mdn = osi->os_meta_dnode; + dnode_t *dn; + int allzero = B_TRUE; + + /* + * Each time we process an entry on the list, we first move it + * to the tail so that we don't process it over and over again. + * We use the meta-dnode as a marker: if we make a complete pass + * over the list without finding any work to do, we're done. + * This ensures that we complete in linear time rather than + * quadratic time, as described in detail in bug 1182169. 
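+	 * Concretely: every dnode we visit is rotated to the tail
+	 * behind the meta-dnode marker; reaching the marker again with
+	 * allzero still set means a whole pass saw only hold-free
+	 * dnodes, so there is no more eviction work to queue.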
+ */ + mutex_enter(&osi->os_lock); + list_remove(&osi->os_dnodes, mdn); + list_insert_tail(&osi->os_dnodes, mdn); + while ((dn = list_head(&osi->os_dnodes)) != NULL) { + list_remove(&osi->os_dnodes, dn); + list_insert_tail(&osi->os_dnodes, dn); + if (dn == mdn) { + if (allzero) + break; + allzero = B_TRUE; + continue; + } + if (!refcount_is_zero(&dn->dn_holds)) { + allzero = B_FALSE; + dnode_add_ref(dn, FTAG); + mutex_exit(&osi->os_lock); + dnode_evict_dbufs(dn); + dnode_rele(dn, FTAG); + mutex_enter(&osi->os_lock); + } + } + mutex_exit(&osi->os_lock); + dnode_evict_dbufs(mdn); +} + +void dmu_objset_evict(dsl_dataset_t *ds, void *arg) { objset_impl_t *osi = arg; + objset_t os; int err, i; for (i = 0; i < TXG_SIZE; i++) { @@ -277,6 +338,13 @@ dmu_objset_evict(dsl_dataset_t *ds, void *arg) ASSERT(err == 0); } + /* + * We should need only a single pass over the dnode list, since + * nothing can be added to the list at this point. + */ + os.os = osi; + dmu_objset_evict_dbufs(&os); + ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL); @@ -297,7 +365,7 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type, dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); - osi = dmu_objset_open_impl(spa, ds, NULL); + VERIFY(0 == dmu_objset_open_impl(spa, ds, NULL, &osi)); mdn = osi->os_meta_dnode; dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, @@ -314,9 +382,21 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, dmu_objset_type_t type, * needs to be synced multiple times as spa_sync() iterates * to convergence, so minimizing its dn_nlevels matters. */ - if (ds != NULL) + if (ds != NULL) { + int levels = 1; + + /* + * Determine the number of levels necessary for the meta-dnode + * to contain DN_MAX_OBJECT dnodes. + */ + while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < + DN_MAX_OBJECT * sizeof (dnode_phys_t)) + levels++; + mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = - mdn->dn_nlevels = DN_META_DNODE_LEVELS; + mdn->dn_nlevels = levels; + } ASSERT(type != DMU_OST_NONE); ASSERT(type != DMU_OST_ANY); @@ -354,9 +434,8 @@ dmu_objset_create_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) if (err) return (err); - err = dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname, - DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds); - ASSERT3U(err, ==, 0); + VERIFY(0 == dsl_dataset_open_spa(dd->dd_pool->dp_spa, oa->fullname, + DS_MODE_STANDARD | DS_MODE_READONLY, FTAG, &ds)); dsl_dataset_get_blkptr(ds, &bp); if (BP_IS_HOLE(&bp)) { objset_impl_t *osi; @@ -382,9 +461,9 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, const char *tail; int err = 0; - pds = dsl_dir_open(name, FTAG, &tail); - if (pds == NULL) - return (ENOENT); + err = dsl_dir_open(name, FTAG, &pds, &tail); + if (err) + return (err); if (tail == NULL) { dsl_dir_close(pds, FTAG); return (EEXIST); @@ -554,6 +633,7 @@ dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx) int txgoff; list_t *dirty_list; int err; + zbookmark_t zb; arc_buf_t *abuf = arc_buf_alloc(os->os_spa, sizeof (objset_phys_t), FTAG); @@ -586,11 +666,15 @@ dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx) * Sync the root block. */ bcopy(os->os_phys, abuf->b_data, sizeof (objset_phys_t)); + zb.zb_objset = os->os_dsl_dataset ? 
os->os_dsl_dataset->ds_object : 0; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = 0; err = arc_write(NULL, os->os_spa, os->os_md_checksum, os->os_md_compress, tx->tx_txg, &os->os_rootbp, abuf, killer, os, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT); + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, ARC_WAIT, &zb); ASSERT(err == 0); - arc_buf_free(abuf, FTAG); + VERIFY(arc_buf_remove_ref(abuf, FTAG) == 1); dsl_dataset_set_blkptr(os->os_dsl_dataset, &os->os_rootbp, tx); @@ -707,10 +791,10 @@ dmu_objset_find(char *name, void func(char *, void *), void *arg, int flags) zap_cursor_t zc; zap_attribute_t attr; char *child; - int do_self; + int do_self, err; - dd = dsl_dir_open(name, FTAG, NULL); - if (dd == NULL) + err = dsl_dir_open(name, FTAG, &dd, NULL); + if (err) return; do_self = (dd->dd_phys->dd_head_dataset_obj != 0); diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c index fedeba015d..fbc55fec86 100644 --- a/usr/src/uts/common/fs/zfs/dmu_traverse.c +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -339,7 +338,7 @@ traverse_read(traverse_handle_t *th, traverse_blk_cache_t *bc, blkptr_t *bp, } else { error = zio_wait(zio_read(NULL, th->th_spa, bp, bc->bc_data, BP_GET_LSIZE(bp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, - th->th_zio_flags | ZIO_FLAG_DONT_CACHE)); + th->th_zio_flags | ZIO_FLAG_DONT_CACHE, zb)); if (BP_SHOULD_BYTESWAP(bp) && error == 0) (zb->zb_level > 0 ? 
byteswap_uint64_array : @@ -469,13 +468,70 @@ get_dnode(traverse_handle_t *th, uint64_t objset, dnode_phys_t *mdn, return (rc); } +/* ARGSUSED */ +static void +traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t maxtxg) +{ + traverse_handle_t *th = arg; + traverse_blk_cache_t *bc = &th->th_zil_cache; + zbookmark_t *zb = &bc->bc_bookmark; + + if (bp->blk_birth < maxtxg) { + zb->zb_object = 0; + zb->zb_blkid = bp->blk_cksum.zc_word[3]; + bc->bc_blkptr = *bp; + (void) th->th_func(bc, th->th_spa, th->th_arg); + } +} + +/* ARGSUSED */ +static void +traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t maxtxg) +{ + traverse_handle_t *th = arg; + traverse_blk_cache_t *bc = &th->th_zil_cache; + zbookmark_t *zb = &bc->bc_bookmark; + + if (lrc->lrc_txtype == TX_WRITE) { + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + + if (bp->blk_birth != 0 && bp->blk_birth < maxtxg) { + zb->zb_object = lr->lr_foid; + zb->zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); + bc->bc_blkptr = *bp; + (void) th->th_func(bc, th->th_spa, th->th_arg); + } + } +} + +static void +traverse_zil(traverse_handle_t *th, traverse_blk_cache_t *bc, uint64_t maxtxg) +{ + spa_t *spa = th->th_spa; + objset_phys_t *osphys = bc->bc_data; + dsl_pool_t *dp = spa_get_dsl(spa); + zilog_t *zilog; + + ASSERT(bc == &th->th_cache[ZB_MDN_CACHE][ZB_MAXLEVEL - 1]); + ASSERT(bc->bc_bookmark.zb_level == -1); + + th->th_zil_cache.bc_bookmark = bc->bc_bookmark; + + zilog = zil_alloc(dp->dp_meta_objset, &osphys->os_zil_header); + + zil_parse(zilog, traverse_zil_block, traverse_zil_record, th, maxtxg); + + zil_free(zilog); +} + static int traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) { zbookmark_t *zb = &zseg->seg_start; traverse_blk_cache_t *bc; dnode_phys_t *dn, *dn_tmp; - int worklimit = 1000; + int worklimit = 100; int rc; dprintf("<%llu, %llu, %d, %llx>\n", @@ -529,6 +585,8 @@ traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) if (zb->zb_level == -1) { ASSERT(zb->zb_object == 0); + ASSERT(zb->zb_blkid == 0); + ASSERT(BP_GET_TYPE(&bc->bc_blkptr) == DMU_OT_OBJSET); if (bc->bc_blkptr.blk_birth > zseg->seg_mintxg) { rc = traverse_callback(th, zseg, bc); @@ -536,6 +594,9 @@ traverse_segment(traverse_handle_t *th, zseg_t *zseg, blkptr_t *mosbp) ASSERT(rc == EINTR); return (rc); } + if ((th->th_advance & ADVANCE_ZIL) && + zb->zb_objset != 0) + traverse_zil(th, bc, zseg->seg_maxtxg); } return (advance_from_osphys(zseg, th->th_advance)); diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index 6576107ae2..894bd63f36 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -37,6 +37,9 @@ #include <sys/spa.h> #include <sys/zfs_context.h> +typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, + uint64_t arg1, uint64_t arg2); + #ifdef ZFS_DEBUG int dmu_use_tx_debug_bufs = 1; #endif @@ -60,6 +63,7 @@ dmu_tx_create(objset_t *os) { dmu_tx_t *tx = dmu_tx_create_ds(os->os->os_dsl_dataset->ds_dir); tx->tx_objset = os; + tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset); return (tx); } @@ -85,7 +89,7 @@ dmu_tx_is_syncing(dmu_tx_t *tx) int dmu_tx_private_ok(dmu_tx_t *tx) { - return (tx->tx_anyobj || tx->tx_privateobj); + return (tx->tx_anyobj); } static void @@ -95,11 +99,16 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, { dmu_tx_hold_t *dth; dnode_t *dn = NULL; + int err; if (object != DMU_NEW_OBJECT) { - dn = dnode_hold(os->os, object, tx); + err = 
dnode_hold(os->os, object, tx, &dn); + if (err) { + tx->tx_err = err; + return; + } - if (tx->tx_txg != 0) { + if (err == 0 && tx->tx_txg != 0) { mutex_enter(&dn->dn_mtx); /* * dn->dn_assigned_txg == tx->tx_txg doesn't pose a @@ -118,15 +127,12 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, dth = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); dth->dth_dnode = dn; dth->dth_type = type; - dth->dth_func = func; dth->dth_arg1 = arg1; dth->dth_arg2 = arg2; - /* - * XXX Investigate using a different data structure to keep - * track of dnodes in a tx. Maybe array, since there will - * generally not be many entries? - */ list_insert_tail(&tx->tx_holds, dth); + + if (func) + func(tx, dn, arg1, arg2); } void @@ -142,11 +148,27 @@ dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) } } +static int +dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) +{ + int err; + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold_level(dn, level, blkid, FTAG); + rw_exit(&dn->dn_struct_rwlock); + if (db == NULL) + return (EIO); + err = dbuf_read(db, zio, DB_RF_CANFAIL); + dbuf_rele(db, FTAG); + return (err); +} + /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) { - uint64_t start, end, space; + uint64_t start, end, i, space; int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; if (len == 0) @@ -158,6 +180,64 @@ dmu_tx_count_write(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) max_ibs = DN_MAX_INDBLKSHIFT; /* + * For i/o error checking, read the first and last level-0 + * blocks, and all the level-1 blocks. We needn't do this on + * the meta-dnode, because we've already read it in. + */ + + if (dn && dn->dn_object != DMU_META_DNODE_OBJECT) { + int err; + + if (dn->dn_maxblkid == 0) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err) { + tx->tx_err = err; + return; + } + } else { + zio_t *zio = zio_root(tx->tx_pool->dp_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + + /* first level-0 block */ + start = off/dn->dn_datablksz; + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err) { + tx->tx_err = err; + return; + } + + /* last level-0 block */ + end = (off+len)/dn->dn_datablksz; + if (end != start) { + err = dmu_tx_check_ioerr(zio, dn, 0, end); + if (err) { + tx->tx_err = err; + return; + } + } + + /* level-1 blocks */ + if (dn->dn_nlevels > 1) { + start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; + end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (i = start+1; i < end; i++) { + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err) { + tx->tx_err = err; + return; + } + } + } + + err = zio_wait(zio); + if (err) { + tx->tx_err = err; + return; + } + } + } + + /* * If there's more than one block, the blocksize can't change, * so we can make a more precise estimate. Alternatively, * if the dnode's ibs is larger than max_ibs, always use that. 
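
The hold-time checks above all follow one pattern worth naming: issue every speculative read as a child of a single CANFAIL root zio so the reads run in parallel, then let zio_wait() fold the children into a single errno. A minimal sketch using only the dmu_tx_check_ioerr() helper and the zio_root()/zio_wait() calls shown in this hunk — the wrapper name is illustrative, and reaping the root zio on the early-error path is a choice this sketch makes, not something the hunk itself does:

static int
dmu_tx_preread_l1(dmu_tx_t *tx, dnode_t *dn, uint64_t start, uint64_t end)
{
	zio_t *zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL,
	    ZIO_FLAG_CANFAIL);
	uint64_t i;
	int err;

	/* one async level-1 read per indirect block in [start, end] */
	for (i = start; i <= end; i++) {
		err = dmu_tx_check_ioerr(zio, dn, 1, i);
		if (err) {
			(void) zio_wait(zio);	/* reap issued children */
			return (err);
		}
	}

	/* zio_wait() returns the first error, if any, among the reads */
	return (zio_wait(zio));
}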
@@ -218,7 +298,7 @@ dmu_tx_count_dnode(dmu_tx_t *tx, dnode_t *dn) dmu_tx_count_write(tx, mdn, object << DNODE_SHIFT, 1 << DNODE_SHIFT); if (dn && dn->dn_dbuf->db_blkptr && dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_dbuf->db_blkptr->blk_birth, tx)) { + dn->dn_dbuf->db_blkptr->blk_birth)) { tx->tx_space_tooverwrite += tx->tx_space_towrite - pre_write_space; tx->tx_space_towrite = pre_write_space; @@ -237,7 +317,7 @@ void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) { ASSERT(tx->tx_txg == 0); - ASSERT(len > 0 && len < DMU_MAX_ACCESS); + ASSERT(len < DMU_MAX_ACCESS); ASSERT(UINT64_MAX - off >= len - 1); dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE, @@ -251,8 +331,6 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) uint64_t space = 0; dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; - ASSERT(dn->dn_assigned_tx == tx || dn->dn_assigned_tx == NULL); - if (dn->dn_datablkshift == 0) return; /* @@ -264,8 +342,10 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) blkid = off >> dn->dn_datablkshift; nblks = (off + len) >> dn->dn_datablkshift; - if (blkid >= dn->dn_maxblkid) - goto out; + if (blkid >= dn->dn_maxblkid) { + rw_exit(&dn->dn_struct_rwlock); + return; + } if (blkid + nblks > dn->dn_maxblkid) nblks = dn->dn_maxblkid - blkid; @@ -278,12 +358,12 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) blkptr_t *bp = dn->dn_phys->dn_blkptr; ASSERT3U(blkid + i, <, dn->dn_phys->dn_nblkptr); bp += blkid + i; - if (dsl_dataset_block_freeable(ds, bp->blk_birth, tx)) { + if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { dprintf_bp(bp, "can free old%s", ""); space += BP_GET_ASIZE(bp); } } - goto out; + nblks = 0; } while (nblks) { @@ -299,20 +379,26 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) int i; blkptr_t *bp; - dbuf_read_havestruct(dbuf); + err = dbuf_read(dbuf, NULL, + DB_RF_HAVESTRUCT | DB_RF_CANFAIL); + if (err != 0) { + tx->tx_err = err; + dbuf_rele(dbuf, FTAG); + break; + } bp = dbuf->db.db_data; bp += blkoff; for (i = 0; i < tochk; i++) { if (dsl_dataset_block_freeable(ds, - bp[i].blk_birth, tx)) { + bp[i].blk_birth)) { dprintf_bp(&bp[i], "can free old%s", ""); space += BP_GET_ASIZE(&bp[i]); } } - dbuf_remove_ref(dbuf, FTAG); + dbuf_rele(dbuf, FTAG); } else { /* the indirect block is sparse */ ASSERT(err == ENOENT); @@ -321,7 +407,6 @@ dmu_tx_count_free(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) blkid += tochk; nblks -= tochk; } -out: rw_exit(&dn->dn_struct_rwlock); tx->tx_space_tofree += space; @@ -330,7 +415,9 @@ out: static void dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) { - int dirty; + uint64_t start, end, i; + int dirty, err, shift; + zio_t *zio; /* first block */ if (off != 0 /* || dn->dn_maxblkid == 0 */) @@ -339,13 +426,46 @@ dmu_tx_hold_free_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) if (len != DMU_OBJECT_END) dmu_tx_count_write(tx, dn, off+len, 1); - dmu_tx_count_dnode(tx, dn); - if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; + /* + * For i/o error checking, read the first and last level-0 + * blocks, and all the level-1 blocks. The above count_write's + * will take care of the level-0 blocks. + */ + shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; + start = off >> shift; + end = dn->dn_datablkshift ? 
((off+len) >> shift) : 0; + + zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + for (i = start+1; i < end; i++) { + uint64_t ibyte = i << shift; + err = dnode_next_offset(dn, FALSE, &ibyte, 2, 1); + i = ibyte >> shift; + if (err == ESRCH) + break; + if (err) { + tx->tx_err = err; + return; + } + + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err) { + tx->tx_err = err; + return; + } + } + err = zio_wait(zio); + if (err) { + tx->tx_err = err; + return; + } + + dmu_tx_count_dnode(tx, dn); + /* XXX locking */ dirty = dn->dn_dirtyblksz[0] | dn->dn_dirtyblksz[1] | dn->dn_dirtyblksz[2] | dn->dn_dirtyblksz[3]; @@ -364,17 +484,17 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) /* ARGSUSED */ static void -dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops) +dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t add, uint64_t iname) { uint64_t nblocks; - int epbs; + int epbs, err; + char *name = (char *)(uintptr_t)iname; dmu_tx_count_dnode(tx, dn); if (dn == NULL) { /* - * Assuming that nops+cops is not super huge, we will be - * able to fit a new object's entries into one leaf + * We will be able to fit a new object's entries into one leaf * block. So there will be at most 2 blocks total, * including the header block. */ @@ -384,25 +504,44 @@ dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops) ASSERT3P(dmu_ot[dn->dn_type].ot_byteswap, ==, zap_byteswap); - if (dn->dn_maxblkid == 0 && nops == 0) { + if (dn->dn_maxblkid == 0 && !add) { /* * If there is only one block (i.e. this is a micro-zap) - * and we are only doing updates, the accounting is simple. + * and we are not adding anything, the accounting is simple. */ + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err) { + tx->tx_err = err; + return; + } + if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, - dn->dn_phys->dn_blkptr[0].blk_birth, tx)) + dn->dn_phys->dn_blkptr[0].blk_birth)) tx->tx_space_tooverwrite += dn->dn_datablksz; else tx->tx_space_towrite += dn->dn_datablksz; return; } + if (dn->dn_maxblkid > 0 && name) { + /* + * access the name in this fat-zap so that we'll check + * for i/o errors to the leaf blocks, etc. + */ + err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name, + 8, 0, NULL); + if (err == EIO) { + tx->tx_err = err; + return; + } + } + /* - * 3 blocks overwritten per op: target leaf, ptrtbl block, header block - * 3 new blocks written per op: new split leaf, 2 grown ptrtbl blocks + * 3 blocks overwritten: target leaf, ptrtbl block, header block + * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks */ dmu_tx_count_write(tx, dn, dn->dn_maxblkid * dn->dn_datablksz, - (nops * 6ULL + cops * 3ULL) << dn->dn_datablkshift); + (3 + add ? 
3 : 0) << dn->dn_datablkshift); /* * If the modified blocks are scattered to the four winds, @@ -410,17 +549,16 @@ dmu_tx_hold_zap_impl(dmu_tx_t *tx, dnode_t *dn, uint64_t nops, uint64_t cops) */ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) - tx->tx_space_towrite += - ((nops + cops) * 3ULL) << dn->dn_indblkshift; + tx->tx_space_towrite += 3 << dn->dn_indblkshift; } void -dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops) +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP, - dmu_tx_hold_zap_impl, (ops > 0?ops:0), (ops < 0?-ops:0)); + dmu_tx_hold_zap_impl, add, (uintptr_t)name); } void @@ -492,7 +630,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) return; /* XXX No checking on the meta dnode for now */ - if (db->db.db_object & DMU_PRIVATE_OBJECT) + if (db->db.db_object == DMU_META_DNODE_OBJECT) return; for (dth = list_head(&tx->tx_holds); dth; @@ -572,20 +710,19 @@ static int dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth) { dmu_tx_hold_t *dth; - uint64_t lsize, asize, fsize; + uint64_t lsize, asize, fsize, towrite; *last_dth = NULL; - tx->tx_space_towrite = 0; - tx->tx_space_tofree = 0; - tx->tx_space_tooverwrite = 0; tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); if (txg_how >= TXG_INITIAL && txg_how != tx->tx_txg) return (ERESTART); + if (tx->tx_err) + return (tx->tx_err); for (dth = list_head(&tx->tx_holds); dth; - *last_dth = dth, dth = list_next(&tx->tx_holds, dth)) { + dth = list_next(&tx->tx_holds, dth)) { dnode_t *dn = dth->dth_dnode; if (dn != NULL) { mutex_enter(&dn->dn_mtx); @@ -608,8 +745,21 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth) (void) refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } - if (dth->dth_func) - dth->dth_func(tx, dn, dth->dth_arg1, dth->dth_arg2); + *last_dth = dth; + if (tx->tx_err) + return (tx->tx_err); + } + + /* + * If a snapshot has been taken since we made our estimates, + * assume that we won't be able to free or overwrite anything. 
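+	 * (A snapshot pins every block born at or before its txg, so
+	 * space we counted as freeable, or overwritable in place, must
+	 * now be charged as brand-new writes.)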
+ */ + if (tx->tx_objset && + dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) > + tx->tx_lastsnap_txg) { + tx->tx_space_towrite += tx->tx_space_tooverwrite; + tx->tx_space_tooverwrite = 0; + tx->tx_space_tofree = 0; } /* @@ -619,13 +769,16 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how, dmu_tx_hold_t **last_dth) tx->tx_space_tofree; lsize = tx->tx_space_towrite + tx->tx_space_tooverwrite; asize = spa_get_asize(tx->tx_pool->dp_spa, lsize); + towrite = tx->tx_space_towrite; tx->tx_space_towrite = asize; if (tx->tx_dir && asize != 0) { int err = dsl_dir_tempreserve_space(tx->tx_dir, lsize, asize, fsize, &tx->tx_tempreserve_cookie, tx); - if (err) + if (err) { + tx->tx_space_towrite = towrite; return (err); + } } return (0); @@ -688,8 +841,6 @@ dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) ASSERT(tx->tx_txg == 0); ASSERT(txg_how != 0); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); - ASSERT3U(tx->tx_space_towrite, ==, 0); - ASSERT3U(tx->tx_space_tofree, ==, 0); while ((err = dmu_tx_try_assign(tx, txg_how, &last_dth)) != 0) { uint64_t txg = dmu_tx_unassign(tx, last_dth); diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c index 03ce2a0398..8adb692ec8 100644 --- a/usr/src/uts/common/fs/zfs/dnode.c +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -155,7 +155,7 @@ dnode_verify(dnode_t *dn) } if (dn->dn_phys->dn_type != DMU_OT_NONE) ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); - ASSERT(IS_DNODE_DNODE(dn->dn_object) || dn->dn_dbuf); + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, (dnode_phys_t *)dn->dn_dbuf->db.db_data + @@ -307,6 +307,11 @@ dnode_destroy(dnode_t *dn) dn->dn_dirtyctx_firstset = NULL; } dmu_zfetch_rele(&dn->dn_zfetch); + if (dn->dn_bonus) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_evict(dn->dn_bonus); + dn->dn_bonus = NULL; + } kmem_cache_free(dnode_cache, dn); } @@ -381,13 +386,10 @@ void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - dmu_buf_impl_t *db = NULL; - ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE); ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); - ASSERT(!(dn->dn_object & DMU_PRIVATE_OBJECT) || dmu_tx_private_ok(tx)); + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); ASSERT(tx->tx_txg != 0); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); @@ -398,6 +400,10 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ASSERT(dn->dn_dirtyblksz[2] == 0); ASSERT(dn->dn_dirtyblksz[3] == 0); + /* clean up any unreferenced dbufs */ + dnode_evict_dbufs(dn); + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + /* * XXX I should really have a generation number to tell if we * need to do this... 
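
Looping back to the dmu_tx_try_assign() hunk above: the snapshot check there is the reason dmu_tx_create() now records tx_lastsnap_txg. Pulled out as a stand-alone helper it would read as below — the name is hypothetical and the tx->tx_objset NULL check from the hunk is assumed to have passed; the body is otherwise lifted from the hunk:

static void
dmu_tx_downgrade_estimates(dmu_tx_t *tx)
{
	dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset;

	/*
	 * A snapshot created after dmu_tx_create() pins the blocks we
	 * planned to free or overwrite in place, so recount that space
	 * as new writes and drop the free credit entirely.
	 */
	if (dsl_dataset_prev_snap_txg(ds) > tx->tx_lastsnap_txg) {
		tx->tx_space_towrite += tx->tx_space_tooverwrite;
		tx->tx_space_tooverwrite = 0;
		tx->tx_space_tofree = 0;
	}
}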
@@ -421,17 +427,25 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dn->dn_type = ot; if (dn->dn_bonuslen != bonuslen) { + dmu_buf_impl_t *db = NULL; + /* change bonus size */ if (bonuslen == 0) bonuslen = 1; /* XXX */ - db = dbuf_hold_bonus(dn, FTAG); - dbuf_read(db); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus == NULL) + dn->dn_bonus = dbuf_create_bonus(dn); + db = dn->dn_bonus; + rw_exit(&dn->dn_struct_rwlock); + if (refcount_add(&db->db_holds, FTAG) == 1) + dnode_add_ref(dn, db); mutex_enter(&db->db_mtx); ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); ASSERT(db->db.db_data != NULL); db->db.db_size = bonuslen; mutex_exit(&db->db_mtx); dbuf_dirty(db, tx); + dbuf_rele(db, FTAG); } /* change bonus size and type */ @@ -445,14 +459,19 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dn->dn_allocated_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); - - if (db) - dbuf_remove_ref(db, FTAG); } void dnode_special_close(dnode_t *dn) { + /* + * Wait for final references to the dnode to clear. This can + * only happen if the arc is asyncronously evicting state that + * has a hold on this dnode while we are trying to evict this + * dnode. + */ + while (refcount_count(&dn->dn_holds) > 0) + delay(1); dnode_destroy(dn); } @@ -498,21 +517,25 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg) } /* - * Returns held dnode if the object number is valid, NULL if not. - * Note that this will succeed even for free dnodes. + * errors: + * EINVAL - invalid object number. + * EIO - i/o error. + * succeeds even for free dnodes. */ -dnode_t * -dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref) +int +dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, + void *tag, dnode_t **dnp) { - int epb, idx; + int epb, idx, err; int drop_struct_lock = FALSE; + int type; uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; dnode_t **children_dnodes; if (object == 0 || object >= DN_MAX_OBJECT) - return (NULL); + return (EINVAL); mdn = os->os_meta_dnode; @@ -525,10 +548,16 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref) blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); - db = dbuf_hold(mdn, blk); + db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); - dbuf_read(db); + if (db == NULL) + return (EIO); + err = dbuf_read(db, NULL, DB_RF_CANFAIL); + if (err) { + dbuf_rele(db, FTAG); + return (err); + } ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT); epb = db->db.db_size >> DNODE_SHIFT; @@ -559,51 +588,53 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, void *ref) } mutex_enter(&dn->dn_mtx); + type = dn->dn_type; if (dn->dn_free_txg || - ((flag & DNODE_MUST_BE_ALLOCATED) && dn->dn_type == DMU_OT_NONE) || - ((flag & DNODE_MUST_BE_FREE) && dn->dn_type != DMU_OT_NONE)) { + ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || + ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) { mutex_exit(&dn->dn_mtx); - dbuf_rele(db); - return (NULL); + dbuf_rele(db, FTAG); + return (type == DMU_OT_NONE ? ENOENT : EEXIST); } mutex_exit(&dn->dn_mtx); - if (refcount_add(&dn->dn_holds, ref) == 1) + if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dn); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); ASSERT3U(dn->dn_object, ==, object); - dbuf_rele(db); + dbuf_rele(db, FTAG); - return (dn); + *dnp = dn; + return (0); } /* * Return held dnode if the object is allocated, NULL if not. 
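 * With the errno-based interface this now means, for a typical
 * caller (there is no longer a NULL return to check):
 *
 *	dnode_t *dn;
 *	int err = dnode_hold(os->os, object, FTAG, &dn);
 *	if (err)
 *		return (err);	... EINVAL, EIO, or ENOENT
 *	... use dn ...
 *	dnode_rele(dn, FTAG);	... tag must match the hold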
*/ -dnode_t * -dnode_hold(objset_impl_t *os, uint64_t object, void *ref) +int +dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp) { - return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, ref)); + return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); } void -dnode_add_ref(dnode_t *dn, void *ref) +dnode_add_ref(dnode_t *dn, void *tag) { ASSERT(refcount_count(&dn->dn_holds) > 0); - (void) refcount_add(&dn->dn_holds, ref); + (void) refcount_add(&dn->dn_holds, tag); } void -dnode_rele(dnode_t *dn, void *ref) +dnode_rele(dnode_t *dn, void *tag) { uint64_t refs; - refs = refcount_remove(&dn->dn_holds, ref); + refs = refcount_remove(&dn->dn_holds, tag); /* NOTE: the meta-dnode does not have a dn_dbuf */ if (refs == 0 && dn->dn_dbuf) - dbuf_remove_ref(dn->dn_dbuf, dn); + dbuf_rele(dn->dn_dbuf, dn); } void @@ -612,7 +643,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) objset_impl_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - if (IS_DNODE_DNODE(dn->dn_object)) + if (dn->dn_object == DMU_META_DNODE_OBJECT) return; DNODE_VERIFY(dn); @@ -658,7 +689,7 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) * dnode will hang around after we finish processing its * children. */ - (void) refcount_add(&dn->dn_holds, (void *)(uintptr_t)tx->tx_txg); + dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg); dbuf_dirty(dn->dn_dbuf, tx); @@ -764,7 +795,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) } /* obtain the old block */ - db = dbuf_hold(dn, 0); + db = dbuf_hold(dn, 0, FTAG); dbuf_new_size(db, size, tx); @@ -773,7 +804,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) /* don't need dd_dirty_mtx, dnode is already dirty */ dn->dn_dirtyblksz[tx->tx_txg&TXG_MASK] = size; dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; - dbuf_rele(db); + dbuf_rele(db, FTAG); err = 0; end: @@ -844,7 +875,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) dmu_buf_impl_t *db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); dprintf("dn %p dirtying left indirects\n", dn); dbuf_dirty(db, tx); - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } #ifdef ZFS_DEBUG else if (old_nlevels > 1 && new_nlevels > old_nlevels) { @@ -855,7 +886,7 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx) db = dbuf_hold_level(dn, old_nlevels-1, i, FTAG); ASSERT(!
list_link_active(&db->db_dirty_node[txgoff])); - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } } #endif @@ -976,7 +1007,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) data = db->db.db_data; bzero(data + start, head); } - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } off += head; len -= head; @@ -1009,7 +1040,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) rw_enter(&dn->dn_struct_rwlock, RW_WRITER); bzero(db->db.db_data, tail); } - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } len -= tail; } @@ -1022,7 +1053,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) db = dbuf_hold_level(dn, 1, (off - head) >> (blkshift + epbs), FTAG); dbuf_will_dirty(db, tx); - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } /* dirty the right indirects */ @@ -1030,7 +1061,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) db = dbuf_hold_level(dn, 1, (off + len + tail - 1) >> (blkshift + epbs), FTAG); dbuf_will_dirty(db, tx); - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } /* @@ -1189,7 +1220,8 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, return (hole ? 0 : ESRCH); return (error); } - dbuf_read_havestruct(db); + (void) dbuf_read(db, NULL, + DB_RF_MUST_SUCCEED | DB_RF_HAVESTRUCT); data = db->db.db_data; } @@ -1228,7 +1260,7 @@ dnode_next_offset_level(dnode_t *dn, boolean_t hole, uint64_t *offset, } if (db) - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); return (error); } diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c index 597cafb44e..dcfb9ee7d2 100644 --- a/usr/src/uts/common/fs/zfs/dnode_sync.c +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
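From here on, both dnode.c above and dnode_sync.c below funnel every read through the single flag-driven dbuf_read() entry point, replacing the old dbuf_read_havestruct() and unchecked dbuf_read() calls. A hedged sketch of the two call shapes the change standardizes on, using only the flags visible in these hunks:

	/* fallible read: the caller propagates the error */
	err = dbuf_read(db, NULL, DB_RF_CANFAIL);
	if (err) {
		dbuf_rele(db, FTAG);
		return (err);
	}

	/* infallible read; the caller already holds dn_struct_rwlock */
	(void) dbuf_read(db, NULL, DB_RF_HAVESTRUCT | DB_RF_MUST_SUCCEED);

The (void) cast marks the call sites that assert success rather than handle failure, which keeps the remaining unchecked reads easy to audit for the I/O-error hardening this change is after.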
@@ -48,13 +47,15 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) /* this dnode can't be paged out because it's dirty */ db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); + ASSERT(db != NULL); for (i = 0; i < dn->dn_phys->dn_nblkptr; i++) if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i])) break; if (i != dn->dn_phys->dn_nblkptr) { ASSERT(list_link_active(&db->db_dirty_node[txg&TXG_MASK])); - dbuf_read_havestruct(db); + (void) dbuf_read(db, NULL, + DB_RF_HAVESTRUCT | DB_RF_MUST_SUCCEED); arc_release(db->db_buf, db); /* copy dnode's block pointers to new indirect block */ ASSERT3U(sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr, <=, @@ -102,7 +103,7 @@ dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * dn->dn_phys->dn_nblkptr); - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } static void @@ -163,7 +164,8 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) /* db_data_old better be zeroed */ if (child->db_d.db_data_old[txg & TXG_MASK]) { - buf = (child->db_d.db_data_old[txg & TXG_MASK])->b_data; + buf = ((arc_buf_t *)child->db_d.db_data_old + [txg & TXG_MASK])->b_data; for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " @@ -194,7 +196,7 @@ free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) } mutex_exit(&child->db_mtx); - dbuf_remove_ref(child, FTAG); + dbuf_rele(child, FTAG); } } #endif @@ -211,7 +213,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, int txgoff = tx->tx_txg & TXG_MASK; int all = TRUE; - dbuf_read(db); + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); arc_release(db->db_buf, db); bp = (blkptr_t *)db->db.db_data; @@ -254,7 +256,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, } else { all = FALSE; } - dbuf_remove_ref(subdb, FTAG); + dbuf_rele(subdb, FTAG); } #ifdef ZFS_DEBUG bp -= (end-start)+1; @@ -326,7 +328,7 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) ASSERT3P(db->db_blkptr, ==, bp); free_blocks(dn, bp, 1, tx); } - dbuf_remove_ref(db, FTAG); + dbuf_rele(db, FTAG); } if (trunc) { uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * @@ -338,6 +340,48 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) } } +/* + * Try to kick all the dnode's dbufs out of the cache... + */ +void +dnode_evict_dbufs(dnode_t *dn) +{ + dmu_buf_impl_t *db; + + mutex_enter(&dn->dn_dbufs_mtx); + while (db = list_head(&dn->dn_dbufs)) { + int progress = 0; + for (; db; db = list_next(&dn->dn_dbufs, db)) { + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING && + refcount_is_zero(&db->db_holds)) + break; + else if (db->db_state == DB_EVICTING) + progress = 1; + mutex_exit(&db->db_mtx); + } + if (db) { + ASSERT(!arc_released(db->db_buf)); + dbuf_clear(db); + mutex_exit(&dn->dn_dbufs_mtx); + progress = 1; + } else { + if (progress == 0) + break; + mutex_exit(&dn->dn_dbufs_mtx); + } + mutex_enter(&dn->dn_dbufs_mtx); + } + mutex_exit(&dn->dn_dbufs_mtx); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_evict(dn->dn_bonus); + dn->dn_bonus = NULL; + } + rw_exit(&dn->dn_struct_rwlock); +} + static int dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) { @@ -352,32 +396,35 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) /* XXX - use dbuf_undirty()?
*/ list_remove(&dn->dn_dirty_dbufs[txgoff], db); if (db->db_level == 0) { - ASSERT3P(db->db_d.db_data_old[txgoff], ==, db->db_buf); + ASSERT(db->db_blkid == DB_BONUS_BLKID || + db->db_d.db_data_old[txgoff] == db->db_buf); if (db->db_d.db_overridden_by[txgoff]) dbuf_unoverride(db, tx->tx_txg); db->db_d.db_data_old[txgoff] = NULL; } db->db_dirtycnt -= 1; mutex_exit(&db->db_mtx); - dbuf_remove_ref(db, (void *)(uintptr_t)tx->tx_txg); + dbuf_rele(db, (void *)(uintptr_t)tx->tx_txg); } - ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); + dnode_evict_dbufs(dn); + ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); + + /* + * XXX - It would be nice to assert this, but we may still + * have residual holds from async evictions from the arc... + * + * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); + */ /* Undirty next bits */ dn->dn_next_nlevels[txgoff] = 0; dn->dn_next_indblkshift[txgoff] = 0; /* free up all the blocks in the file. */ - dbuf_free_range(dn, 0, -1, tx); dnode_sync_free_range(dn, 0, dn->dn_phys->dn_maxblkid+1, tx); ASSERT3U(dn->dn_phys->dn_secphys, ==, 0); - /* - * All dbufs should be gone, since all holds are gone... - */ - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); - /* ASSERT(blkptrs are zero); */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ASSERT(dn->dn_type != DMU_OT_NONE); @@ -394,7 +441,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) dn->dn_allocated_txg = 0; mutex_exit(&dn->dn_mtx); - ASSERT(!IS_DNODE_DNODE(dn->dn_object)); + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); /* @@ -420,7 +467,7 @@ dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx) /* ASSERT(dn->dn_objset->dd_snapshot == NULL); */ ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(IS_DNODE_DNODE(dn->dn_object) || + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dirtyblksz[txgoff] > 0); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); @@ -533,7 +580,7 @@ dnode_sync(dnode_t *dn, int level, zio_t *zio, dmu_tx_t *tx) dn->dn_dirtyblksz[txgoff] = 0; - if (!IS_DNODE_DNODE(dn->dn_object)) { + if (dn->dn_object != DMU_META_DNODE_OBJECT) { dbuf_will_dirty(dn->dn_dbuf, tx); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); } diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c index e77b772922..7db7745270 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dataset.c +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -146,7 +145,7 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) -used, -compressed, -uncompressed, tx); } else { dprintf_bp(bp, "putting on dead list: %s", ""); - bplist_enqueue(&ds->ds_deadlist, bp, tx); + VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ if (ds->ds_phys->ds_prev_snap_obj != 0) { ASSERT3U(ds->ds_prev->ds_object, ==, @@ -175,14 +174,14 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) mutex_exit(&ds->ds_lock); } -int -dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, dmu_tx_t *tx) +uint64_t +dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) { - uint64_t prev_snap_txg; + uint64_t txg; dsl_dir_t *dd; - /* ASSERT that it is not a snapshot */ + if (ds == NULL) - return (TRUE); + return (0); /* * The snapshot creation could fail, but that would cause an * incorrect FALSE return, which would only result in an @@ -195,13 +194,19 @@ dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, dmu_tx_t *tx) */ dd = ds->ds_dir; mutex_enter(&dd->dd_lock); - if (dd->dd_sync_func == dsl_dataset_snapshot_sync && - dd->dd_sync_txg < tx->tx_txg) - prev_snap_txg = dd->dd_sync_txg; + if (dd->dd_sync_func == dsl_dataset_snapshot_sync) + txg = dd->dd_sync_txg; else - prev_snap_txg = ds->ds_phys->ds_prev_snap_txg; + txg = ds->ds_phys->ds_prev_snap_txg; mutex_exit(&dd->dd_lock); - return (blk_birth > prev_snap_txg); + + return (txg); +} + +int +dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth) +{ + return (blk_birth > dsl_dataset_prev_snap_txg(ds)); } /* ARGSUSED */ @@ -236,7 +241,7 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) kmem_free(ds, sizeof (dsl_dataset_t)); } -static void +static int dsl_dataset_get_snapname(dsl_dataset_t *ds) { dsl_dataset_phys_t *headphys; @@ -246,34 +251,37 @@ dsl_dataset_get_snapname(dsl_dataset_t *ds) objset_t *mos = dp->dp_meta_objset; if (ds->ds_snapname[0]) - return; + return (0); if (ds->ds_phys->ds_next_snap_obj == 0) - return; + return (0); - headdbuf = dmu_bonus_hold_tag(mos, - ds->ds_dir->dd_phys->dd_head_dataset_obj, FTAG); - dmu_buf_read(headdbuf); + err = dmu_bonus_hold(mos, ds->ds_dir->dd_phys->dd_head_dataset_obj, + FTAG, &headdbuf); + if (err) + return (err); headphys = headdbuf->db_data; err = zap_value_search(dp->dp_meta_objset, headphys->ds_snapnames_zapobj, ds->ds_object, ds->ds_snapname); - ASSERT(err == 0); - dmu_buf_rele_tag(headdbuf, FTAG); + dmu_buf_rele(headdbuf, FTAG); + return (err); } -dsl_dataset_t * +int dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, - int mode, void *tag) + int mode, void *tag, dsl_dataset_t **dsp) { uint64_t weight = ds_refcnt_weight[DS_MODE_LEVEL(mode)]; objset_t *mos = dp->dp_meta_objset; dmu_buf_t *dbuf; dsl_dataset_t *ds; + int err; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || dsl_pool_sync_context(dp)); - dbuf = dmu_bonus_hold_tag(mos, dsobj, tag); - dmu_buf_read(dbuf); + err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); + if (err) + return (err); ds = dmu_buf_get_user(dbuf); if (ds == NULL) { dsl_dataset_t *winner; @@ -282,47 +290,60 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, ds->ds_dbuf = dbuf; ds->ds_object = dsobj; ds->ds_phys = dbuf->db_data; - ds->ds_dir = dsl_dir_open_obj(dp, - ds->ds_phys->ds_dir_obj, NULL, ds); - bplist_open(&ds->ds_deadlist, + err = bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + if (err == 0) { + err = dsl_dir_open_obj(dp, + ds->ds_phys->ds_dir_obj, NULL, 
ds, &ds->ds_dir); + } + if (err) { + /* + * we don't really need to close the blist if we + * just opened it. + */ + kmem_free(ds, sizeof (dsl_dataset_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } if (ds->ds_dir->dd_phys->dd_head_dataset_obj == dsobj) { ds->ds_snapname[0] = '\0'; if (ds->ds_phys->ds_prev_snap_obj) { - ds->ds_prev = - dsl_dataset_open_obj(dp, + err = dsl_dataset_open_obj(dp, ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_NONE, ds); + DS_MODE_NONE, ds, &ds->ds_prev); } } else { if (snapname) { #ifdef ZFS_DEBUG dsl_dataset_phys_t *headphys; - int err; - dmu_buf_t *headdbuf = dmu_bonus_hold_tag(mos, - ds->ds_dir->dd_phys-> - dd_head_dataset_obj, FTAG); - dmu_buf_read(headdbuf); - headphys = headdbuf->db_data; - uint64_t foundobj; - err = zap_lookup(dp->dp_meta_objset, - headphys->ds_snapnames_zapobj, - snapname, sizeof (foundobj), 1, &foundobj); - ASSERT3U(err, ==, 0); - ASSERT3U(foundobj, ==, dsobj); - dmu_buf_rele_tag(headdbuf, FTAG); + dmu_buf_t *headdbuf; + err = dmu_bonus_hold(mos, + ds->ds_dir->dd_phys->dd_head_dataset_obj, + FTAG, &headdbuf); + if (err == 0) { + headphys = headdbuf->db_data; + uint64_t foundobj; + err = zap_lookup(dp->dp_meta_objset, + headphys->ds_snapnames_zapobj, + snapname, sizeof (foundobj), 1, + &foundobj); + ASSERT3U(foundobj, ==, dsobj); + dmu_buf_rele(headdbuf, FTAG); + } #endif (void) strcat(ds->ds_snapname, snapname); } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { - dsl_dataset_get_snapname(ds); + err = dsl_dataset_get_snapname(ds); } } - winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, - dsl_dataset_evict); - if (winner) { + if (err == 0) { + winner = dmu_buf_set_user_ie(dbuf, ds, &ds->ds_phys, + dsl_dataset_evict); + } + if (err || winner) { bplist_close(&ds->ds_deadlist); if (ds->ds_prev) { dsl_dataset_close(ds->ds_prev, @@ -330,6 +351,10 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, } dsl_dir_close(ds->ds_dir, ds); kmem_free(ds, sizeof (dsl_dataset_t)); + if (err) { + dmu_buf_rele(dbuf, tag); + return (err); + } ds = winner; } else { uint64_t new = @@ -349,12 +374,13 @@ dsl_dataset_open_obj(dsl_pool_t *dp, uint64_t dsobj, const char *snapname, (ds->ds_open_refcount + weight > DOS_REF_MAX)) { mutex_exit(&ds->ds_lock); dsl_dataset_close(ds, DS_MODE_NONE, tag); - return (NULL); + return (EBUSY); } ds->ds_open_refcount += weight; mutex_exit(&ds->ds_lock); - return (ds); + *dsp = ds; + return (0); } int @@ -368,9 +394,9 @@ dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, dsl_dataset_t *ds = NULL; int err = 0; - dd = dsl_dir_open_spa(spa, name, FTAG, &tail); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open_spa(spa, name, FTAG, &dd, &tail); + if (err) + return (err); dp = dd->dd_pool; obj = dd->dd_phys->dd_head_dataset_obj; @@ -384,7 +410,10 @@ dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, if (tail != NULL) { objset_t *mos = dp->dp_meta_objset; - ds = dsl_dataset_open_obj(dp, obj, NULL, DS_MODE_NONE, tag); + err = dsl_dataset_open_obj(dp, obj, NULL, + DS_MODE_NONE, tag, &ds); + if (err) + goto out; obj = ds->ds_phys->ds_snapnames_zapobj; dsl_dataset_close(ds, DS_MODE_NONE, tag); ds = NULL; @@ -405,9 +434,7 @@ dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, if (err) goto out; } - ds = dsl_dataset_open_obj(dp, obj, tail, mode, tag); - if (ds == NULL) - err = EBUSY; + err = dsl_dataset_open_obj(dp, obj, tail, mode, tag, &ds); out: rw_exit(&dp->dp_config_rwlock); @@ -433,7 +460,7 @@ dsl_dataset_name(dsl_dataset_t *ds, char *name) (void) strcpy(name, "mos"); } 
else { dsl_dir_name(ds->ds_dir, name); - dsl_dataset_get_snapname(ds); + VERIFY(0 == dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { (void) strcat(name, "@"); if (!MUTEX_HELD(&ds->ds_lock)) { @@ -462,7 +489,7 @@ dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag) mode, ds->ds_open_refcount); mutex_exit(&ds->ds_lock); - dmu_buf_rele_tag(ds->ds_dbuf, tag); + dmu_buf_rele(ds->ds_dbuf, tag); } void @@ -476,16 +503,16 @@ dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx) dsl_dir_t *dd; dsl_dir_create_root(mos, ddobjp, tx); - dd = dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG); - ASSERT(dd != NULL); + VERIFY(0 == dsl_dir_open_obj(dp, *ddobjp, NULL, FTAG, &dd)); dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - dbuf = dmu_bonus_hold(mos, dsobj); + VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; dsphys->ds_dir_obj = dd->dd_object; dsphys->ds_fsid_guid = unique_create(); + unique_remove(dsphys->ds_fsid_guid); /* it isn't open yet */ (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_snapnames_zapobj = @@ -494,13 +521,14 @@ dsl_dataset_create_root(dsl_pool_t *dp, uint64_t *ddobjp, dmu_tx_t *tx) dsphys->ds_creation_txg = tx->tx_txg; dsphys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - dmu_buf_rele(dbuf); + dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_head_dataset_obj = dsobj; dsl_dir_close(dd, FTAG); - ds = dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG); + VERIFY(0 == + dsl_dataset_open_obj(dp, dsobj, NULL, DS_MODE_NONE, FTAG, &ds)); (void) dmu_objset_create_impl(dp->dp_spa, ds, DMU_OST_ZFS, tx); dsl_dataset_close(ds, DS_MODE_NONE, FTAG); } @@ -537,14 +565,13 @@ dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname, err = dsl_dir_create_sync(pds, lastname, tx); if (err) return (err); - dd = dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, NULL); - ASSERT(dd != NULL); + VERIFY(0 == dsl_dir_open_spa(dp->dp_spa, fullname, FTAG, &dd, NULL)); /* This is the point of no (unsuccessful) return */ dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - dbuf = dmu_bonus_hold(mos, dsobj); + VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; dsphys->ds_dir_obj = dd->dd_object; @@ -576,7 +603,7 @@ dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname, dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_clone_parent_obj = clone_parent->ds_object; } - dmu_buf_rele(dbuf); + dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); dd->dd_phys->dd_head_dataset_obj = dsobj; @@ -594,9 +621,9 @@ dsl_dataset_destroy(const char *name) dsl_dir_t *dd; const char *tail; - dd = dsl_dir_open(name, FTAG, &tail); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(name, FTAG, &dd, &tail); + if (err) + return (err); dp = dd->dd_pool; if (tail != NULL) { @@ -631,10 +658,12 @@ dsl_dataset_destroy(const char *name) * dsl_dataset_destroy_sync() to destroy the head dataset. 
*/ rw_enter(&dp->dp_config_rwlock, RW_READER); - pds = dsl_dir_open_obj(dd->dd_pool, - dd->dd_phys->dd_parent_obj, NULL, FTAG); + err = dsl_dir_open_obj(dd->dd_pool, + dd->dd_phys->dd_parent_obj, NULL, FTAG, &pds); dsl_dir_close(dd, FTAG); rw_exit(&dp->dp_config_rwlock); + if (err) + return (err); (void) strcpy(buf, name); cp = strrchr(buf, '/') + 1; @@ -657,9 +686,9 @@ dsl_dataset_rollback(const char *name) dsl_dir_t *dd; const char *tail; - dd = dsl_dir_open(name, FTAG, &tail); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(name, FTAG, &dd, &tail); + if (err) + return (err); if (tail != NULL) { dsl_dir_close(dd, FTAG); @@ -777,11 +806,14 @@ dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; dsl_dataset_t *ds; + int err; if (dd->dd_phys->dd_head_dataset_obj == 0) return (EINVAL); - ds = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG); + err = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &ds); + if (err) + return (err); if (ds->ds_phys->ds_prev_snap_txg == 0) { /* @@ -823,7 +855,8 @@ dsl_dataset_rollback_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); ds->ds_phys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, + ds->ds_phys->ds_deadlist_obj)); dprintf("new deadlist obj = %llx\n", ds->ds_phys->ds_deadlist_obj); { @@ -891,27 +924,23 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) drop_lock = TRUE; } - ds = dsl_dataset_open_obj(dd->dd_pool, + err = dsl_dataset_open_obj(dd->dd_pool, dd->dd_phys->dd_head_dataset_obj, NULL, - snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG); + snapname ? DS_MODE_NONE : DS_MODE_EXCLUSIVE, FTAG, &ds); - if (snapname) { + if (err == 0 && snapname) { err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &obj); dsl_dataset_close(ds, DS_MODE_NONE, FTAG); - if (err) { - if (drop_lock) - rw_exit(&dp->dp_config_rwlock); - return (err); + if (err == 0) { + err = dsl_dataset_open_obj(dd->dd_pool, obj, NULL, + DS_MODE_EXCLUSIVE, FTAG, &ds); } - - ds = dsl_dataset_open_obj(dd->dd_pool, obj, NULL, - DS_MODE_EXCLUSIVE, FTAG); } - if (ds == NULL) { + if (err) { if (drop_lock) rw_exit(&dp->dp_config_rwlock); - return (EBUSY); + return (err); } obj = ds->ds_object; @@ -942,22 +971,25 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) * them. Try again. 
*/ if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) { - mutex_exit(&ds->ds_lock); dsl_dataset_close(ds, DS_MODE_NONE, FTAG); if (drop_lock) rw_exit(&dp->dp_config_rwlock); return (EAGAIN); } - /* THE POINT OF NO (unsuccessful) RETURN */ - if (ds->ds_phys->ds_prev_snap_obj != 0) { if (ds->ds_prev) { ds_prev = ds->ds_prev; } else { - ds_prev = dsl_dataset_open_obj(dd->dd_pool, + err = dsl_dataset_open_obj(dd->dd_pool, ds->ds_phys->ds_prev_snap_obj, NULL, - DS_MODE_NONE, FTAG); + DS_MODE_NONE, FTAG, &ds_prev); + if (err) { + dsl_dataset_close(ds, DS_MODE_NONE, FTAG); + if (drop_lock) + rw_exit(&dp->dp_config_rwlock); + return (err); + } } after_branch_point = (ds_prev->ds_phys->ds_next_snap_obj != obj); @@ -974,6 +1006,8 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) } } + /* THE POINT OF NO (unsuccessful) RETURN */ + ASSERT3P(tx->tx_pool, ==, dd->dd_pool); zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); @@ -983,8 +1017,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) spa_scrub_restart(dp->dp_spa, tx->tx_txg); - ds_next = dsl_dataset_open_obj(dd->dd_pool, - ds->ds_phys->ds_next_snap_obj, NULL, DS_MODE_NONE, FTAG); + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, + ds->ds_phys->ds_next_snap_obj, NULL, + DS_MODE_NONE, FTAG, &ds_next)); ASSERT3U(ds_next->ds_phys->ds_prev_snap_obj, ==, obj); dmu_buf_will_dirty(ds_next->ds_dbuf, tx); @@ -1006,7 +1041,8 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) while (bplist_iterate(&ds_next->ds_deadlist, &itor, &bp) == 0) { if (bp.blk_birth <= ds->ds_phys->ds_prev_snap_txg) { - bplist_enqueue(&ds->ds_deadlist, &bp, tx); + VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, + &bp, tx)); if (ds_prev && !after_branch_point && bp.blk_birth > ds_prev->ds_phys->ds_prev_snap_txg) { @@ -1030,8 +1066,8 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) /* set next's deadlist to our deadlist */ ds_next->ds_phys->ds_deadlist_obj = ds->ds_phys->ds_deadlist_obj; - bplist_open(&ds_next->ds_deadlist, mos, - ds_next->ds_phys->ds_deadlist_obj); + VERIFY(0 == bplist_open(&ds_next->ds_deadlist, mos, + ds_next->ds_phys->ds_deadlist_obj)); ds->ds_phys->ds_deadlist_obj = 0; if (ds_next->ds_phys->ds_next_snap_obj != 0) { @@ -1049,9 +1085,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) */ dsl_dataset_t *ds_after_next; - ds_after_next = dsl_dataset_open_obj(dd->dd_pool, + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, ds_next->ds_phys->ds_next_snap_obj, NULL, - DS_MODE_NONE, FTAG); + DS_MODE_NONE, FTAG, &ds_after_next)); itor = 0; while (bplist_iterate(&ds_after_next->ds_deadlist, &itor, &bp) == 0) { @@ -1078,9 +1114,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) dsl_dataset_close(ds_next->ds_prev, DS_MODE_NONE, ds_next); if (ds_prev) { - ds_next->ds_prev = dsl_dataset_open_obj( - dd->dd_pool, ds->ds_phys->ds_prev_snap_obj, - NULL, DS_MODE_NONE, ds_next); + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, + ds->ds_phys->ds_prev_snap_obj, NULL, + DS_MODE_NONE, ds_next, &ds_next->ds_prev)); } else { ds_next->ds_prev = NULL; } @@ -1144,8 +1180,9 @@ dsl_dataset_destroy_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) } else { /* remove from snapshot namespace */ dsl_dataset_t *ds_head; - ds_head = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG); + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, NULL, + DS_MODE_NONE, FTAG, &ds_head)); #ifdef ZFS_DEBUG { uint64_t val; @@ -1195,8 +1232,10 
@@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) if (dd->dd_phys->dd_head_dataset_obj == 0) return (EINVAL); - ds = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL, - DS_MODE_NONE, FTAG); + err = dsl_dataset_open_obj(dp, dd->dd_phys->dd_head_dataset_obj, NULL, + DS_MODE_NONE, FTAG, &ds); + if (err) + return (err); err = zap_lookup(mos, ds->ds_phys->ds_snapnames_zapobj, snapname, 8, 1, &value); @@ -1217,7 +1256,7 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); - dbuf = dmu_bonus_hold(mos, dsobj); + VERIFY(0 == dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; dsphys->ds_dir_obj = dd->dd_object; @@ -1237,13 +1276,14 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) dsphys->ds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; dsphys->ds_restoring = ds->ds_phys->ds_restoring; dsphys->ds_bp = ds->ds_phys->ds_bp; - dmu_buf_rele(dbuf); + dmu_buf_rele(dbuf, FTAG); if (ds->ds_phys->ds_prev_snap_obj != 0) { dsl_dataset_t *ds_prev; - ds_prev = dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, NULL, DS_MODE_NONE, FTAG); + VERIFY(0 == dsl_dataset_open_obj(dp, + ds->ds_phys->ds_prev_snap_obj, NULL, + DS_MODE_NONE, FTAG, &ds_prev)); ASSERT(ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object || ds_prev->ds_phys->ds_num_children > 1); @@ -1266,7 +1306,8 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) ds->ds_phys->ds_unique_bytes = 0; ds->ds_phys->ds_deadlist_obj = bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); + VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, + ds->ds_phys->ds_deadlist_obj)); dprintf("snap '%s' -> obj %llu\n", snapname, dsobj); err = zap_add(mos, ds->ds_phys->ds_snapnames_zapobj, @@ -1275,8 +1316,9 @@ dsl_dataset_snapshot_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) if (ds->ds_prev) dsl_dataset_close(ds->ds_prev, DS_MODE_NONE, ds); - ds->ds_prev = dsl_dataset_open_obj(dp, - ds->ds_phys->ds_prev_snap_obj, snapname, DS_MODE_NONE, ds); + VERIFY(0 == dsl_dataset_open_obj(dp, + ds->ds_phys->ds_prev_snap_obj, snapname, + DS_MODE_NONE, ds, &ds->ds_prev)); rw_exit(&dp->dp_config_rwlock); dsl_dataset_close(ds, DS_MODE_NONE, FTAG); @@ -1295,7 +1337,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_dir_dirty(ds->ds_dir, tx); bplist_close(&ds->ds_deadlist); - dmu_buf_remove_ref(ds->ds_dbuf, ds); + dmu_buf_rele(ds->ds_dbuf, ds); } void @@ -1319,7 +1361,6 @@ dsl_dataset_stats(dsl_dataset_t *ds, dmu_objset_stats_t *dds) dds->dds_creation_txg = ds->ds_phys->ds_creation_txg; dds->dds_space_refd = ds->ds_phys->ds_used_bytes; dds->dds_fsid_guid = ds->ds_phys->ds_fsid_guid; - dds->dds_guid = ds->ds_phys->ds_guid; if (ds->ds_phys->ds_next_snap_obj) { /* @@ -1332,8 +1373,6 @@ dsl_dataset_stats(dsl_dataset_t *ds, dmu_objset_stats_t *dds) dds->dds_uncompressed_bytes = ds->ds_phys->ds_uncompressed_bytes; } - - dds->dds_objset_obj = ds->ds_object; } dsl_pool_t * @@ -1375,10 +1414,11 @@ dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) } /* new fs better exist */ - nds = dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname, FTAG, &tail); - if (nds == NULL) { + err = dsl_dir_open_spa(dd->dd_pool->dp_spa, ora->newname, + FTAG, &nds, &tail); + if (err) { dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG); - return (ENOENT); + return (err); } dsl_dir_close(nds, 
FTAG); @@ -1397,8 +1437,12 @@ dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) tail++; - fsds = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG); + err = dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_head_dataset_obj, NULL, DS_MODE_NONE, FTAG, &fsds); + if (err) { + dsl_dataset_close(snds, DS_MODE_STANDARD, FTAG); + return (err); + } /* new name better not be in use */ err = zap_lookup(mos, fsds->ds_phys->ds_snapnames_zapobj, @@ -1414,7 +1458,7 @@ dsl_dataset_snapshot_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) /* The point of no (unsuccessful) return */ rw_enter(&dd->dd_pool->dp_config_rwlock, RW_WRITER); - dsl_dataset_get_snapname(snds); + VERIFY(0 == dsl_dataset_get_snapname(snds)); err = zap_remove(mos, fsds->ds_phys->ds_snapnames_zapobj, snds->ds_snapname, tx); ASSERT3U(err, ==, 0); @@ -1440,9 +1484,9 @@ dsl_dataset_rename(const char *osname, const char *newname) struct osrenamearg ora; int err; - dd = dsl_dir_open(osname, FTAG, &tail); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(osname, FTAG, &dd, &tail); + if (err) + return (err); if (tail == NULL) { err = dsl_dir_sync_task(dd, dsl_dir_rename_sync, (void*)newname, 1<<12); diff --git a/usr/src/uts/common/fs/zfs/dsl_dir.c b/usr/src/uts/common/fs/zfs/dsl_dir.c index 4ea1d62de5..8ffa145477 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dir.c +++ b/usr/src/uts/common/fs/zfs/dsl_dir.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
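The dsl_dir open interfaces below get the same conversion as the dataset interfaces above: they return an errno and hand the directory back through an out parameter, instead of returning a pointer whose NULL conflates ENOENT, EIO, and EBUSY. A sketch of the resulting caller shape, assembled from the call sites in this diff (variable names illustrative):

	dsl_dir_t *dd;
	const char *tail;
	int err;

	err = dsl_dir_open(name, FTAG, &dd, &tail);
	if (err)
		return (err);
	/* tail, if non-NULL, names a trailing snapshot or new child */
	...
	dsl_dir_close(dd, FTAG);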
@@ -76,18 +75,20 @@ dsl_dir_evict(dmu_buf_t *db, void *arg) kmem_free(dd, sizeof (dsl_dir_t)); } -dsl_dir_t * +int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag) + const char *tail, void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; + int err; ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock) || dsl_pool_sync_context(dp)); - dbuf = dmu_bonus_hold_tag(dp->dp_meta_objset, ddobj, tag); - dmu_buf_read(dbuf); + err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); + if (err) + return (err); dd = dmu_buf_get_user(dbuf); #ifdef ZFS_DEBUG { @@ -112,8 +113,13 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, offsetof(dsl_prop_cb_record_t, cbr_node)); if (dd->dd_phys->dd_parent_obj) { - dd->dd_parent = dsl_dir_open_obj(dp, - dd->dd_phys->dd_parent_obj, NULL, dd); + err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, + NULL, dd, &dd->dd_parent); + if (err) { + kmem_free(dd, sizeof (dsl_dir_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; @@ -122,8 +128,7 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_parent->dd_phys-> dd_child_dir_zapobj, tail, sizeof (foundobj), 1, &foundobj); - ASSERT3U(err, ==, 0); - ASSERT3U(foundobj, ==, ddobj); + ASSERT(err || foundobj == ddobj); #endif (void) strcpy(dd->dd_myname, tail); } else { @@ -131,11 +136,12 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, dd->dd_parent->dd_phys-> dd_child_dir_zapobj, ddobj, dd->dd_myname); - /* - * The caller should be protecting this ddobj - * from being deleted concurrently - */ - ASSERT(err == 0); + } + if (err) { + dsl_dir_close(dd->dd_parent, dd); + kmem_free(dd, sizeof (dsl_dir_t)); + dmu_buf_rele(dbuf, tag); + return (err); } } else { (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); @@ -166,7 +172,8 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, ASSERT3P(dd->dd_pool, ==, dp); ASSERT3U(dd->dd_object, ==, ddobj); ASSERT3P(dd->dd_dbuf, ==, dbuf); - return (dd); + *ddp = dd; + return (0); } void @@ -174,7 +181,7 @@ dsl_dir_close(dsl_dir_t *dd, void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); - dmu_buf_rele_tag(dd->dd_dbuf, tag); + dmu_buf_rele(dd->dd_dbuf, tag); } /* buf must be long enough (MAXNAMELEN should do) */ @@ -266,8 +273,9 @@ getcomponent(const char *path, char *component, const char **nextp) * same as dsl_dir_open, ignore the first component of name and use the * spa instead */ -dsl_dir_t * -dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) +int +dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, + dsl_dir_t **ddp, const char **tailp) { char buf[MAXNAMELEN]; const char *next, *nextnext = NULL; @@ -280,15 +288,15 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) dprintf("%s\n", name); if (name == NULL) - return (NULL); + return (ENOENT); err = getcomponent(name, buf, &next); if (err) - return (NULL); + return (err); if (spa == NULL) { err = spa_open(buf, &spa, FTAG); if (err) { dprintf("spa_open(%s) failed\n", buf); - return (NULL); + return (err); } openedspa = TRUE; @@ -299,17 +307,19 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) dp = spa_get_dsl(spa); rw_enter(&dp->dp_config_rwlock, RW_READER); - dd = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag); + err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); + if (err) { + rw_exit(&dp->dp_config_rwlock); + if (openedspa) + spa_close(spa, FTAG); + return (err); + } + while (next != NULL) { dsl_dir_t *child_ds;
err = getcomponent(next, buf, &nextnext); - if (err) { - dsl_dir_close(dd, tag); - rw_exit(&dp->dp_config_rwlock); - if (openedspa) - spa_close(spa, FTAG); - return (NULL); - } + if (err) + break; ASSERT(next[0] != '\0'); if (next[0] == '@') break; @@ -321,18 +331,28 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) err = zap_lookup(dp->dp_meta_objset, dd->dd_phys->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); - if (err == ENOENT) { + if (err) { + if (err == ENOENT) + err = 0; break; } - ASSERT(err == 0); - child_ds = dsl_dir_open_obj(dp, ddobj, buf, tag); + err = dsl_dir_open_obj(dp, ddobj, buf, tag, &child_ds); + if (err) + break; dsl_dir_close(dd, tag); dd = child_ds; next = nextnext; } rw_exit(&dp->dp_config_rwlock); + if (err) { + dsl_dir_close(dd, tag); + if (openedspa) + spa_close(spa, FTAG); + return (err); + } + /* * It's an error if there's more than one component left, or * tailp==NULL and there's any component left. @@ -342,14 +362,14 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) /* bad path name */ dsl_dir_close(dd, tag); dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); - next = NULL; - dd = NULL; + err = ENOENT; } if (tailp) *tailp = next; if (openedspa) spa_close(spa, FTAG); - return (dd); + *ddp = dd; + return (err); } /* @@ -358,10 +378,10 @@ dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, const char **tailp) * tail==NULL and we couldn't parse the whole name. (*tail)[0] == '@' * means that the last component is a snapshot. */ -dsl_dir_t * -dsl_dir_open(const char *name, void *tag, const char **tailp) +int +dsl_dir_open(const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) { - return (dsl_dir_open_spa(NULL, name, tag, tailp)); + return (dsl_dir_open_spa(NULL, name, tag, ddp, tailp)); } int @@ -397,7 +417,7 @@ dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx) dprintf("dataset_create: zap_add %s->%lld to %lld returned %d\n", name, ddobj, pds->dd_phys->dd_child_dir_zapobj, err); - dbuf = dmu_bonus_hold(mos, ddobj); + VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; @@ -407,7 +427,7 @@ dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx) DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); dsphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); - dmu_buf_rele(dbuf); + dmu_buf_rele(dbuf, FTAG); rw_exit(&pds->dd_pool->dp_config_rwlock); @@ -431,7 +451,9 @@ dsl_dir_destroy_sync(dsl_dir_t *pds, void *arg, dmu_tx_t *tx) if (err) goto out; - dd = dsl_dir_open_obj(dp, obj, name, FTAG); + err = dsl_dir_open_obj(dp, obj, name, FTAG, &dd); + if (err) + goto out; ASSERT3U(dd->dd_phys->dd_parent_obj, ==, pds->dd_object); if (dmu_buf_refcount(dd->dd_dbuf) > 1) { @@ -512,7 +534,7 @@ dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx) sizeof (uint64_t), 1, ddobjp, tx); ASSERT3U(error, ==, 0); - dbuf = dmu_bonus_hold(mos, *ddobjp); + VERIFY(0 == dmu_bonus_hold(mos, *ddobjp, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsp = dbuf->db_data; @@ -522,7 +544,7 @@ dsl_dir_create_root(objset_t *mos, uint64_t *ddobjp, dmu_tx_t *tx) dsp->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); - dmu_buf_rele(dbuf); + dmu_buf_rele(dbuf, FTAG); } void @@ -530,7 +552,6 @@ dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds) { bzero(dds, sizeof (dmu_objset_stats_t)); - dds->dds_dir_obj = dd->dd_object; dds->dds_available = 
dsl_dir_space_available(dd, NULL, 0, TRUE); mutex_enter(&dd->dd_lock); @@ -543,22 +564,17 @@ dsl_dir_stats(dsl_dir_t *dd, dmu_objset_stats_t *dds) dds->dds_creation_time = dd->dd_phys->dd_creation_time; - dds->dds_is_placeholder = (dd->dd_phys->dd_head_dataset_obj == 0); - if (dd->dd_phys->dd_clone_parent_obj) { dsl_dataset_t *ds; rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - ds = dsl_dataset_open_obj(dd->dd_pool, - dd->dd_phys->dd_clone_parent_obj, NULL, DS_MODE_NONE, FTAG); + VERIFY(0 == dsl_dataset_open_obj(dd->dd_pool, + dd->dd_phys->dd_clone_parent_obj, + NULL, DS_MODE_NONE, FTAG, &ds)); dsl_dataset_name(ds, dds->dds_clone_of); - dds->dds_clone_of_obj = dd->dd_phys->dd_clone_parent_obj; dsl_dataset_close(ds, DS_MODE_NONE, FTAG); rw_exit(&dd->dd_pool->dp_config_rwlock); } - - spa_altroot(dd->dd_pool->dp_spa, dds->dds_altroot, - sizeof (dds->dds_altroot)); } int @@ -668,7 +684,7 @@ dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ - dmu_buf_remove_ref(dd->dd_dbuf, dd); + dmu_buf_rele(dd->dd_dbuf, dd); } static uint64_t @@ -679,7 +695,7 @@ dsl_dir_estimated_space(dsl_dir_t *dd) ASSERT(MUTEX_HELD(&dd->dd_lock)); - space = dd->dd_used_bytes; + space = dd->dd_phys->dd_used_bytes; ASSERT(space >= 0); for (i = 0; i < TXG_SIZE; i++) { space += dd->dd_space_towrite[i&TXG_MASK]; @@ -788,6 +804,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, struct tempreserve *tr; ASSERT3U(txg, !=, 0); + ASSERT3S(asize, >=, 0); mutex_enter(&dd->dd_lock); /* @@ -827,10 +844,14 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual - * on-disk is over quota. + * on-disk is over quota and there are no pending changes (which + * may free up space for us). */ if (asize > 0 && est_used > quota) { - if (dd->dd_used_bytes < quota) + if (dd->dd_space_towrite[txg & TXG_MASK] != 0 || + dd->dd_space_towrite[(txg-1) & TXG_MASK] != 0 || + dd->dd_space_towrite[(txg-2) & TXG_MASK] != 0 || + dd->dd_used_bytes < quota) edquot = ERESTART; dprintf_dd(dd, "failing: used=%lluK est_used = %lluK " "quota=%lluK tr=%lluK err=%d\n", @@ -876,6 +897,8 @@ dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); + ASSERT3S(asize, >=, 0); + ASSERT3S(fsize, >=, 0); err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, tr_list, tx); @@ -975,8 +998,6 @@ dsl_dir_diduse_space(dsl_dir_t *dd, ASSERT(uncompressed >= 0 || dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); dd->dd_used_bytes += used; - if (used > 0) - dd->dd_space_towrite[tx->tx_txg & TXG_MASK] -= used; dd->dd_phys->dd_uncompressed_bytes += uncompressed; dd->dd_phys->dd_compressed_bytes += compressed; mutex_exit(&dd->dd_lock); @@ -1013,9 +1034,9 @@ dsl_dir_set_quota(const char *ddname, uint64_t quota) dsl_dir_t *dd; int err; - dd = dsl_dir_open(ddname, FTAG, NULL); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) + return (err); /* * If someone removes a file, then tries to set the quota, we * want to make sure the file freeing takes effect.
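The widened ERESTART condition above matters to callers: pending writes in the current or two previous txgs may carry frees that make the reservation fit, so the writer should back off and retry rather than fail outright. The retry loop in the dmu_tx_assign() hunk near the top of this section already has that shape; a sketch (the txg_wait_open() backoff step is an assumption, it does not appear in this diff):

	while ((err = dmu_tx_try_assign(tx, txg_how, &last_dth)) != 0) {
		uint64_t txg = dmu_tx_unassign(tx, last_dth);

		if (err != ERESTART || txg_how != TXG_WAIT)
			return (err);
		/* assumed: wait for the pending txg so frees land */
		txg_wait_open(tx->tx_pool, txg + 1);
	}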
@@ -1073,9 +1094,9 @@ dsl_dir_set_reservation(const char *ddname, uint64_t reservation) dsl_dir_t *dd; int err; - dd = dsl_dir_open(ddname, FTAG, NULL); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) + return (err); err = dsl_dir_sync_task(dd, dsl_dir_set_reservation_sync, &reservation, 0); dsl_dir_close(dd, FTAG); @@ -1128,11 +1149,10 @@ dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) return (ENXIO); } - newpds = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &tail); - /* new parent should exist */ - if (newpds == NULL) - return (ENOENT); + err = dsl_dir_open_spa(dp->dp_spa, newname, FTAG, &newpds, &tail); + if (err) + return (err); /* new name should not already exist */ if (tail == NULL) { @@ -1195,8 +1215,8 @@ dsl_dir_rename_sync(dsl_dir_t *dd, void *arg, dmu_tx_t *tx) (void) strcpy(dd->dd_myname, tail); dsl_dir_close(dd->dd_parent, dd); dd->dd_phys->dd_parent_obj = newpds->dd_object; - dd->dd_parent = dsl_dir_open_obj(dd->dd_pool, - newpds->dd_object, NULL, dd); + VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, + newpds->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ err = zap_add(mos, newpds->dd_phys->dd_child_dir_zapobj, diff --git a/usr/src/uts/common/fs/zfs/dsl_pool.c b/usr/src/uts/common/fs/zfs/dsl_pool.c index 5b71ccfaa9..b8e54be6f6 100644 --- a/usr/src/uts/common/fs/zfs/dsl_pool.c +++ b/usr/src/uts/common/fs/zfs/dsl_pool.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -39,8 +38,8 @@ /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" -static dsl_dir_t * -dsl_pool_open_mos_dir(dsl_pool_t *dp) +static int +dsl_pool_open_mos_dir(dsl_pool_t *dp, dsl_dir_t **ddp) { uint64_t obj; int err; @@ -48,9 +47,10 @@ dsl_pool_open_mos_dir(dsl_pool_t *dp) err = zap_lookup(dp->dp_meta_objset, dp->dp_root_dir->dd_phys->dd_child_dir_zapobj, MOS_DIR_NAME, sizeof (obj), 1, &obj); - ASSERT3U(err, ==, 0); + if (err) + return (err); - return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp)); + return (dsl_dir_open_obj(dp, obj, MOS_DIR_NAME, dp, ddp)); } static dsl_pool_t * @@ -74,38 +74,56 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) return (dp); } -dsl_pool_t * -dsl_pool_open(spa_t *spa, uint64_t txg) +int +dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); - - dp->dp_meta_objset = - &dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp)->os; + objset_impl_t *osi; rw_enter(&dp->dp_config_rwlock, RW_READER); + err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi); + if (err) + goto out; + dp->dp_meta_objset = &osi->os; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &dp->dp_root_dir_obj); - ASSERT3U(err, ==, 0); + if (err) + goto out; + + err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + NULL, dp, &dp->dp_root_dir); + if (err) + goto out; - dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, - NULL, dp); - dp->dp_mos_dir = dsl_pool_open_mos_dir(dp); + err = dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir); + if (err) + goto out; + +out: rw_exit(&dp->dp_config_rwlock); + if (err) + dsl_pool_close(dp); + else + *dpp = dp; - return (dp); + return (err); } void dsl_pool_close(dsl_pool_t *dp) { /* drop our reference from dsl_pool_open() */ - dsl_dir_close(dp->dp_mos_dir, dp); - dsl_dir_close(dp->dp_root_dir, dp); + if (dp->dp_mos_dir) + dsl_dir_close(dp->dp_mos_dir, dp); + if (dp->dp_root_dir) + dsl_dir_close(dp->dp_root_dir, dp); /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ - dmu_objset_evict(NULL, dp->dp_meta_objset->os); + if (dp->dp_meta_objset) + dmu_objset_evict(NULL, dp->dp_meta_objset->os); txg_list_destroy(&dp->dp_dirty_datasets); txg_list_destroy(&dp->dp_dirty_dirs); @@ -132,14 +150,13 @@ dsl_pool_create(spa_t *spa, uint64_t txg) /* create and open the root dir */ dsl_dataset_create_root(dp, &dp->dp_root_dir_obj, tx); - dp->dp_root_dir = dsl_dir_open_obj(dp, dp->dp_root_dir_obj, - NULL, dp); + VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj, + NULL, dp, &dp->dp_root_dir)); /* create and open the meta-objset dir */ - err = dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, - tx); + VERIFY(0 == dsl_dir_create_sync(dp->dp_root_dir, MOS_DIR_NAME, tx)); - ASSERT3U(err, ==, 0); - dp->dp_mos_dir = dsl_pool_open_mos_dir(dp); + VERIFY(0 == dsl_pool_open_mos_dir(dp, &dp->dp_mos_dir)); dmu_tx_commit(tx); diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c index 3feb93e468..fc33b1c591 100644 --- a/usr/src/uts/common/fs/zfs/dsl_prop.c +++ b/usr/src/uts/common/fs/zfs/dsl_prop.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License.
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -75,7 +74,10 @@ dsl_prop_get_impl(dsl_pool_t *dp, uint64_t ddobj, const char *propname, ASSERT(RW_LOCK_HELD(&dp->dp_config_rwlock)); while (ddobj != 0) { - dsl_dir_t *dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG); + dsl_dir_t *dd; + err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); + if (err) + break; err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, intsz, numint, buf); if (err != ENOENT) { @@ -136,7 +138,8 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname, cbr->cbr_func(cbr->cbr_arg, value); - (void) dsl_dir_open_obj(dd->dd_pool, dd->dd_object, NULL, cbr); + VERIFY(0 == dsl_dir_open_obj(dd->dd_pool, dd->dd_object, + NULL, cbr, &dd)); rw_exit(&dd->dd_pool->dp_config_rwlock); /* Leave dataset open until this callback is unregistered */ return (0); @@ -164,9 +167,9 @@ dsl_prop_get(const char *ddname, const char *propname, const char *tail; int err; - dd = dsl_dir_open(ddname, FTAG, &tail); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(ddname, FTAG, &dd, &tail); + if (err) + return (err); if (tail && tail[0] != '@') { dsl_dir_close(dd, FTAG); return (ENOENT); @@ -258,7 +261,9 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, int err; ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); - dd = dsl_dir_open_obj(dp, ddobj, NULL, FTAG); + err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); + if (err) + return; if (!first) { /* @@ -353,15 +358,15 @@ dsl_prop_set(const char *ddname, const char *propname, int err; struct prop_set_arg psa; - dd = dsl_dir_open(ddname, FTAG, NULL); - if (dd == NULL) - return (ENOENT); + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) + return (err); psa.name = propname; psa.intsz = intsz; psa.numints = numints; psa.buf = buf; - err = dsl_dir_sync_task(dd, dsl_prop_set_sync, &psa, 0); + err = dsl_dir_sync_task(dd, dsl_prop_set_sync, &psa, 1<<20); dsl_dir_close(dd, FTAG); @@ -457,10 +462,12 @@ dsl_prop_get_all(objset_t *os, nvlist_t **nvp) if (dd->dd_phys->dd_parent_obj == 0) parent = NULL; else - parent = dsl_dir_open_obj(dp, - dd->dd_phys->dd_parent_obj, NULL, FTAG); + err = dsl_dir_open_obj(dp, + dd->dd_phys->dd_parent_obj, NULL, FTAG, &parent); if (dd != ds->ds_dir) dsl_dir_close(dd, FTAG); + if (err) + break; dd = parent; } rw_exit(&dp->dp_config_rwlock); diff --git a/usr/src/uts/common/fs/zfs/fletcher.c b/usr/src/uts/common/fs/zfs/fletcher.c index 03186d1387..edda3c9a9d 100644 --- a/usr/src/uts/common/fs/zfs/fletcher.c +++ b/usr/src/uts/common/fs/zfs/fletcher.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -98,3 +97,49 @@ fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) ZIO_SET_CHECKSUM(zcp, a, b, c, d); } + +void +fletcher_4_incremental_native(const void *buf, uint64_t size, + zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = zcp->zc_word[0]; + b = zcp->zc_word[1]; + c = zcp->zc_word[2]; + d = zcp->zc_word[3]; + + for (; ip < ipend; ip++) { + a += ip[0]; + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_incremental_byteswap(const void *buf, uint64_t size, + zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = zcp->zc_word[0]; + b = zcp->zc_word[1]; + c = zcp->zc_word[2]; + d = zcp->zc_word[3]; + + for (; ip < ipend; ip++) { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 9d682e4990..d31e6edda3 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -379,11 +378,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) os, tx); } - db = dmu_bonus_hold(os, smo->smo_object); + VERIFY(0 == dmu_bonus_hold(os, smo->smo_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, ==, sizeof (*smo)); bcopy(smo, db->db_data, db->db_size); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); } diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 9b9bcab217..02be864b36 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
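The incremental fletcher-4 routines added to fletcher.c above exist so a backup stream can be checksummed piecewise as it is produced or consumed (bug 6341639 in the summary): feeding consecutive chunks through the incremental function gives the same result as a one-shot fletcher_4_native() over the concatenation, provided each chunk is a multiple of four bytes, since the loop consumes whole 32-bit words. A hypothetical userland consumer:

	zio_cksum_t zc;
	char buf[4096];
	ssize_t n;

	ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);	/* running state */
	while ((n = read(fd, buf, sizeof (buf))) > 0)
		fletcher_4_incremental_native(buf, n, &zc);
	/* zc now equals the one-shot checksum of the whole stream */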
@@ -33,6 +32,7 @@ */ #include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> #include <sys/spa_impl.h> #include <sys/zio.h> #include <sys/zio_checksum.h> @@ -62,6 +62,44 @@ static uint32_t spa_active_count; * ========================================================================== */ +static int +spa_error_entry_compare(const void *a, const void *b) +{ + spa_error_entry_t *sa = (spa_error_entry_t *)a; + spa_error_entry_t *sb = (spa_error_entry_t *)b; + int ret; + + ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, + sizeof (zbookmark_t)); + + if (ret < 0) + return (-1); + else if (ret > 0) + return (1); + else + return (0); +} + +/* + * Utility function which retrieves copies of the current logs and + * re-initializes them in the process. + */ +void +spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) +{ + ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); + + bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); + bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); + + avl_create(&spa->spa_errlist_scrub, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_last, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); +} + /* * Activate an uninitialized pool. */ @@ -76,9 +114,6 @@ spa_activate(spa_t *spa) spa->spa_normal_class = metaslab_class_create(); - spa->spa_vdev_retry_taskq = taskq_create("spa_vdev_retry", - 4, maxclsyspri, 50, INT_MAX, TASKQ_PREPOPULATE); - for (t = 0; t < ZIO_TYPES; t++) { spa->spa_zio_issue_taskq[t] = taskq_create("spa_zio_issue", 8, maxclsyspri, 50, INT_MAX, @@ -95,6 +130,13 @@ spa_activate(spa_t *spa) txg_list_create(&spa->spa_vdev_txg_list, offsetof(struct vdev, vdev_txg_node)); + + avl_create(&spa->spa_errlist_scrub, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_last, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); } /* @@ -124,12 +166,18 @@ spa_deactivate(spa_t *spa) spa->spa_zio_intr_taskq[t] = NULL; } - taskq_destroy(spa->spa_vdev_retry_taskq); - spa->spa_vdev_retry_taskq = NULL; - metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; + /* + * If this was part of an import or the open otherwise failed, we may + * still have errors left in the queues. Empty them just in case. + */ + spa_errlog_drain(spa); + + avl_destroy(&spa->spa_errlist_scrub); + avl_destroy(&spa->spa_errlist_last); + spa->spa_state = POOL_STATE_UNINITIALIZED; } @@ -175,6 +223,11 @@ static void spa_unload(spa_t *spa) { /* + * Stop async tasks. + */ + spa_async_suspend(spa); + + /* * Stop syncing. */ if (spa->spa_sync_on) { @@ -185,8 +238,8 @@ spa_unload(spa_t *spa) /* * Wait for any outstanding prefetch I/O to complete. */ - spa_config_enter(spa, RW_WRITER); - spa_config_exit(spa); + spa_config_enter(spa, RW_WRITER, FTAG); + spa_config_exit(spa, FTAG); /* * Close the dsl pool. @@ -203,16 +256,16 @@ spa_unload(spa_t *spa) vdev_free(spa->spa_root_vdev); spa->spa_root_vdev = NULL; } + + spa->spa_async_suspended = 0; } /* * Load an existing storage pool, using the pool's builtin spa_config as a - * source of configuration information. The 'readonly' flag will prevent us - * from writing any updated state to disk, and can be use when testing a pool - * for import. + * source of configuration information.
*/ static int -spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) +spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) { int error = 0; nvlist_t *nvroot = NULL; @@ -221,25 +274,34 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) uint64_t pool_guid; zio_t *zio; + spa->spa_load_state = state; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) - return (EINVAL); + nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { + error = EINVAL; + goto out; + } (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); - if (import && spa_guid_exists(pool_guid, 0)) - return (EEXIST); + if ((spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { + error = EEXIST; + goto out; + } /* * Parse the configuration into a vdev tree. */ - spa_config_enter(spa, RW_WRITER); + spa_config_enter(spa, RW_WRITER, FTAG); rvd = spa_config_parse(spa, nvroot, NULL, 0, VDEV_ALLOC_LOAD); - spa_config_exit(spa); + spa_config_exit(spa, FTAG); - if (rvd == NULL) - return (EINVAL); + if (rvd == NULL) { + error = EINVAL; + goto out; + } spa->spa_root_vdev = rvd; ASSERT(spa_guid(spa) == pool_guid); @@ -247,8 +309,10 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) /* * Try to open all vdevs, loading each label in the process. */ - if (vdev_open(rvd) != 0) - return (ENXIO); + if (vdev_open(rvd) != 0) { + error = ENXIO; + goto out; + } /* * Find the best uberblock. @@ -264,8 +328,16 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) * If we weren't able to find a single valid uberblock, return failure. */ if (ub->ub_txg == 0) { - dprintf("ub_txg is zero\n"); - return (ENXIO); + error = ENXIO; + goto out; + } + + /* + * If the pool is newer than the code, we can't open it. + */ + if (ub->ub_version > UBERBLOCK_VERSION) { + error = ENOTSUP; + goto out; } /* @@ -273,11 +345,10 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) * incomplete configuration. 
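 * (vdev_guid_sum is the sum of the guids of all vdevs in the tree, and
 * ub_guid_sum is the sum recorded in the uberblock when the config was
 * last written; a mismatch means a device is missing or foreign.)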
*/ if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { - rvd->vdev_state = VDEV_STATE_CANT_OPEN; - rvd->vdev_stat.vs_aux = VDEV_AUX_BAD_GUID_SUM; - dprintf("vdev_guid_sum %llx != ub_guid_sum %llx\n", - rvd->vdev_guid_sum, ub->ub_guid_sum); - return (ENXIO); + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_GUID_SUM); + error = ENXIO; + goto out; } /* @@ -286,12 +357,22 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_first_txg = spa_last_synced_txg(spa) + 1; - spa->spa_dsl_pool = dsl_pool_open(spa, spa->spa_first_txg); + error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); + if (error) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + goto out; + } spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; - VERIFY(zap_lookup(spa->spa_meta_objset, + if (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, - sizeof (uint64_t), 1, &spa->spa_config_object) == 0); + sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } if (!mosconfig) { dmu_buf_t *db; @@ -299,21 +380,24 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) size_t nvsize = 0; nvlist_t *newconfig = NULL; - db = dmu_bonus_hold(spa->spa_meta_objset, - spa->spa_config_object); - dmu_buf_read(db); + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, + spa->spa_config_object, FTAG, &db)); nvsize = *(uint64_t *)db->db_data; - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); packed = kmem_alloc(nvsize, KM_SLEEP); - error = dmu_read_canfail(spa->spa_meta_objset, + error = dmu_read(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, packed); if (error == 0) error = nvlist_unpack(packed, nvsize, &newconfig, 0); kmem_free(packed, nvsize); - if (error) - return (ENXIO); + if (error) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } spa_config_set(spa, newconfig); @@ -321,39 +405,76 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig) spa_deactivate(spa); spa_activate(spa); - return (spa_load(spa, newconfig, readonly, import, B_TRUE)); + return (spa_load(spa, newconfig, state, B_TRUE)); } - VERIFY(zap_lookup(spa->spa_meta_objset, + if (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) == 0); + sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } /* - * Load the vdev state for all top level vdevs. + * Load the persistent error log. If we have an older pool, this will + * not be present. 
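+ * A return of ENOENT from the zap_lookup() calls below simply means the
+ * log object has not been created yet, so only other errors are treated
+ * as evidence of a damaged MOS.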
 */
-	if ((error = vdev_load(rvd, import)) != 0)
-		return (error);
+	error = zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST,
+	    sizeof (uint64_t), 1, &spa->spa_errlog_last);
+	if (error != 0 && error != ENOENT) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+
+	error = zap_lookup(spa->spa_meta_objset,
+	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB,
+	    sizeof (uint64_t), 1, &spa->spa_errlog_scrub);
+	if (error != 0 && error != ENOENT) {
+		vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		error = EIO;
+		goto out;
+	}
+
+	/*
+	 * Load the vdev state for all top level vdevs. We need to grab the
+	 * config lock because all label I/O is done with the
+	 * ZIO_FLAG_CONFIG_HELD flag.
+	 */
+	spa_config_enter(spa, RW_READER, FTAG);
+	if ((error = vdev_load(rvd)) != 0) {
+		spa_config_exit(spa, FTAG);
+		goto out;
+	}
+	spa_config_exit(spa, FTAG);

 	/*
 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
 	 */
-	spa_config_enter(spa, RW_WRITER);
+	spa_config_enter(spa, RW_WRITER, FTAG);
 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
-	spa_config_exit(spa);
+	spa_config_exit(spa, FTAG);

 	/*
 	 * Check the state of the root vdev. If it can't be opened, it
 	 * indicates one or more toplevel vdevs are faulted.
 	 */
-	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
-		return (ENXIO);
+	if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
+		error = ENXIO;
+		goto out;
+	}

 	/*
 	 * Claim log blocks that haven't been committed yet, and update all
 	 * top-level vdevs to sync any config changes found in vdev_load().
 	 * This must all happen in a single txg.
 	 */
-	if ((spa_mode & FWRITE) && !readonly) {
+	if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) {
 		dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa),
 		    spa_first_txg(spa));
 		dmu_objset_find(spa->spa_name, zil_claim, tx, 0);
@@ -369,7 +490,14 @@ spa_load(spa_t *spa, nvlist_t *config, int readonly, int import, int mosconfig)
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 	}

-	return (0);
+	error = 0;
+out:
+	if (error)
+		zfs_ereport_post(FM_EREPORT_ZFS_POOL, spa, NULL, NULL, 0, 0);
+	spa->spa_load_state = SPA_LOAD_NONE;
+	spa->spa_ena = 0;
+
+	return (error);
 }

 /*
@@ -415,7 +543,7 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
 		spa_activate(spa);

 		error = spa_load(spa, spa->spa_config,
-		    B_FALSE, B_FALSE, B_FALSE);
+		    SPA_LOAD_OPEN, B_FALSE);

 		if (error == EBADF) {
 			/*
@@ -432,7 +560,9 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			return (ENOENT);
-		} if (error) {
+		}
+
+		if (error) {
 			/*
 			 * We can't open the pool, but we still have useful
 			 * information: the state of each vdev after the
@@ -443,10 +573,14 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config)
 			    B_TRUE);
 			spa_unload(spa);
 			spa_deactivate(spa);
+			spa->spa_last_open_failed = B_TRUE;
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			*spapp = NULL;
 			return (error);
+		} else {
+			zfs_post_ok(spa, NULL);
+			spa->spa_last_open_failed = B_FALSE;
 		}

 		loaded = B_TRUE;
@@ -459,9 +593,9 @@
 	*spapp = spa;

 	if (config != NULL) {
-		spa_config_enter(spa, RW_READER);
+		spa_config_enter(spa, RW_READER, FTAG);
 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
-		spa_config_exit(spa);
+		spa_config_exit(spa, FTAG);
 	}

 	/*
@@ -479,8 +613,36 @@ spa_open(const char *name, spa_t **spapp, void *tag)
 	return
(spa_open_common(name, spapp, tag, NULL)); } +/* + * Lookup the given spa_t, incrementing the inject count in the process, + * preventing it from being exported or destroyed. + */ +spa_t * +spa_inject_addref(char *name) +{ + spa_t *spa; + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(name)) == NULL) { + mutex_exit(&spa_namespace_lock); + return (NULL); + } + spa->spa_inject_ref++; + mutex_exit(&spa_namespace_lock); + + return (spa); +} + +void +spa_inject_delref(spa_t *spa) +{ + mutex_enter(&spa_namespace_lock); + spa->spa_inject_ref--; + mutex_exit(&spa_namespace_lock); +} + int -spa_get_stats(const char *name, nvlist_t **config) +spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; @@ -488,6 +650,29 @@ spa_get_stats(const char *name, nvlist_t **config) *config = NULL; error = spa_open_common(name, &spa, FTAG, config); + if (spa && *config != NULL) + VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, + spa_get_errlog_size(spa)) == 0); + + /* + * We want to get the alternate root even for faulted pools, so we cheat + * and call spa_lookup() directly. + */ + if (altroot) { + if (spa == NULL) { + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(name); + if (spa) + spa_altroot(spa, altroot, buflen); + else + altroot[0] = '\0'; + spa = NULL; + mutex_exit(&spa_namespace_lock); + } else { + spa_altroot(spa, altroot, buflen); + } + } + if (spa != NULL) spa_close(spa, FTAG); @@ -551,9 +736,11 @@ spa_create(const char *pool, nvlist_t *nvroot, char *altroot) DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); - VERIFY(zap_add(spa->spa_meta_objset, + if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, - sizeof (uint64_t), 1, &spa->spa_config_object, tx) == 0); + sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { + cmn_err(CE_PANIC, "failed to add pool config"); + } /* * Create the deferred-free bplist object. Turn off compression @@ -565,9 +752,11 @@ spa_create(const char *pool, nvlist_t *nvroot, char *altroot) dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, ZIO_COMPRESS_OFF, tx); - VERIFY(zap_add(spa->spa_meta_objset, + if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) == 0); + sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { + cmn_err(CE_PANIC, "failed to add bplist"); + } dmu_tx_commit(tx); @@ -619,7 +808,7 @@ spa_import(const char *pool, nvlist_t *config, char *altroot) * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig * so that we don't try to open the pool if the config is damaged. */ - error = spa_load(spa, config, B_FALSE, B_TRUE, B_TRUE); + error = spa_load(spa, config, SPA_LOAD_IMPORT, B_TRUE); if (error) { spa_unload(spa); @@ -694,7 +883,7 @@ spa_tryimport(nvlist_t *tryconfig) * Pass off the heavy lifting to spa_load(). We pass TRUE for mosconfig * so we don't try to open the pool if the config is damaged. */ - (void) spa_load(spa, tryconfig, B_TRUE, B_TRUE, B_TRUE); + (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); /* * If 'tryconfig' was at least parsable, return the current config. @@ -738,6 +927,16 @@ spa_export_common(char *pool, int new_state) } /* + * Put a hold on the pool, drop the namespace lock, stop async tasks, + * reacquire the namespace lock, and see if we can export. 
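+ * The temporary hold keeps the spa_t from disappearing while the
+ * namespace lock is dropped. We cannot hold the lock across
+ * spa_async_suspend(), because the async thread may itself need the
+ * namespace lock to detach a completed replacement.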
+ */ + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + spa_async_suspend(spa); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + + /* * The pool will be in core if it's openable, * in which case we can modify its state. */ @@ -749,17 +948,20 @@ spa_export_common(char *pool, int new_state) spa_scrub_suspend(spa); txg_wait_synced(spa->spa_dsl_pool, 0); - if (!spa_refcount_zero(spa)) { + /* + * A pool cannot be exported or destroyed if there are active + * references. If we are resetting a pool, allow references by + * fault injection handlers. + */ + if (!spa_refcount_zero(spa) || + (spa->spa_inject_ref != 0 && + new_state != POOL_STATE_UNINITIALIZED)) { spa_scrub_resume(spa); + spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (EBUSY); } - /* - * Update the pool state. - */ - spa->spa_state = new_state; - spa_scrub_resume(spa); VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); @@ -771,7 +973,10 @@ spa_export_common(char *pool, int new_state) * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. */ - vdev_config_dirty(spa->spa_root_vdev); + if (new_state != POOL_STATE_UNINITIALIZED) { + spa->spa_state = new_state; + vdev_config_dirty(spa->spa_root_vdev); + } } if (spa->spa_state != POOL_STATE_UNINITIALIZED) { @@ -779,8 +984,10 @@ spa_export_common(char *pool, int new_state) spa_deactivate(spa); } - spa_remove(spa); - spa_config_sync(); + if (new_state != POOL_STATE_UNINITIALIZED) { + spa_remove(spa); + spa_config_sync(); + } mutex_exit(&spa_namespace_lock); return (0); @@ -805,6 +1012,17 @@ spa_export(char *pool) } /* + * Similar to spa_export(), this unloads the spa_t without actually removing it + * from the namespace in any way. + */ +int +spa_reset(char *pool) +{ + return (spa_export_common(pool, POOL_STATE_UNINITIALIZED)); +} + + +/* * ========================================================================== * Device manipulation * ========================================================================== @@ -845,7 +1063,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) tvd->vdev_id = rvd->vdev_children; vdev_add_child(rvd, tvd); } - vdev_init(tvd, txg); + if ((error = vdev_init(tvd, txg)) != 0) + return (spa_vdev_exit(spa, vd, txg, error)); vdev_config_dirty(tvd); } @@ -871,7 +1090,7 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) * is automatically detached. */ int -spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) +spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { uint64_t txg, open_txg; int error; @@ -881,7 +1100,7 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) txg = spa_vdev_enter(spa); - oldvd = vdev_lookup_by_path(rvd, path); + oldvd = vdev_lookup_by_guid(rvd, guid); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); @@ -954,6 +1173,12 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) newvd->vdev_id = pvd->vdev_children; vdev_add_child(pvd, newvd); + /* + * If newvd is smaller than oldvd, but larger than its rsize, + * the addition of newvd may have decreased our parent's asize. + */ + pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); + tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); @@ -962,7 +1187,6 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) * Update the config based on the new in-core state. 
*/ spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); - vdev_config_dirty(tvd); /* @@ -976,14 +1200,14 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) open_txg - TXG_INITIAL + 1); mutex_exit(&newvd->vdev_dtl_lock); + dprintf("attached %s in txg %llu\n", newvd->vdev_path, txg); + /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, txg); (void) txg_list_add(&tvd->vdev_dtl_list, newvd, txg); - dprintf("attached %s, replacing=%d\n", path, replacing); - (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); /* @@ -1000,7 +1224,7 @@ spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, int replacing) * is a replacing vdev. */ int -spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done) +spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) { uint64_t txg; int c, t, error; @@ -1009,14 +1233,11 @@ spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done) txg = spa_vdev_enter(spa); - vd = vdev_lookup_by_path(rvd, path); + vd = vdev_lookup_by_guid(rvd, guid); if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - if (guid != 0 && vd->vdev_guid != guid) - return (spa_vdev_exit(spa, NULL, txg, ENODEV)); - pvd = vd->vdev_parent; /* @@ -1105,13 +1326,16 @@ spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done) /* * Reopen this top-level vdev to reassess health after detach. */ - vdev_reopen(tvd, NULL); + vdev_reopen(tvd); /* * If the device we just detached was smaller than the others, - * it may be possible to add metaslabs (i.e. grow the pool). + * it may be possible to add metaslabs (i.e. grow the pool). We ignore + * the error here because the detach still succeeded - we just weren't + * able to reinitialize the metaslabs. This pool is in for a world of + * hurt, in any case. */ - vdev_metaslab_init(tvd, txg); + (void) vdev_metaslab_init(tvd, txg); /* * Update the config based on the new in-core state. @@ -1133,72 +1357,59 @@ spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, int replace_done) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); (void) txg_list_add(&tvd->vdev_dtl_list, vd, txg); - dprintf("detached %s\n", path); + dprintf("detached %s in txg %llu\n", vd->vdev_path, txg); return (spa_vdev_exit(spa, vd, txg, 0)); } /* - * If there are any replacing vdevs that have finished replacing, detach them. - * We can't hold the config lock across detaches, so we lock the config, - * build a list of candidates, unlock the config, and try each candidate. + * Find any device that's done replacing, so we can detach it. 
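+ * A replacement is complete once the new child's DTL and scrub DTL are
+ * both empty, i.e. no txg range remains to be resilvered.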
*/ -typedef struct vdev_detach_link { - char *vdl_path; - uint64_t vdl_guid; - list_node_t vdl_node; -} vdev_detach_link_t; - -static void -spa_vdev_replace_done_make_list(list_t *l, vdev_t *vd) +static vdev_t * +spa_vdev_replace_done_hunt(vdev_t *vd) { + vdev_t *newvd, *oldvd; int c; - for (c = 0; c < vd->vdev_children; c++) - spa_vdev_replace_done_make_list(l, vd->vdev_child[c]); + for (c = 0; c < vd->vdev_children; c++) { + oldvd = spa_vdev_replace_done_hunt(vd->vdev_child[c]); + if (oldvd != NULL) + return (oldvd); + } if (vd->vdev_ops == &vdev_replacing_ops && vd->vdev_children == 2) { - vdev_t *cvd0 = vd->vdev_child[0]; - vdev_t *cvd1 = vd->vdev_child[1]; - vdev_detach_link_t *vdl; - int dirty1; - - mutex_enter(&cvd1->vdev_dtl_lock); - dirty1 = cvd1->vdev_dtl_map.sm_space | - cvd1->vdev_dtl_scrub.sm_space; - mutex_exit(&cvd1->vdev_dtl_lock); - - if (!dirty1) { - vdl = kmem_zalloc(sizeof (*vdl), KM_SLEEP); - vdl->vdl_path = spa_strdup(cvd0->vdev_path); - vdl->vdl_guid = cvd0->vdev_guid; - list_insert_tail(l, vdl); + oldvd = vd->vdev_child[0]; + newvd = vd->vdev_child[1]; + + mutex_enter(&newvd->vdev_dtl_lock); + if (newvd->vdev_dtl_map.sm_space == 0 && + newvd->vdev_dtl_scrub.sm_space == 0) { + mutex_exit(&newvd->vdev_dtl_lock); + return (oldvd); } + mutex_exit(&newvd->vdev_dtl_lock); } + + return (NULL); } -void +static void spa_vdev_replace_done(spa_t *spa) { - vdev_detach_link_t *vdl; - list_t vdlist; - - list_create(&vdlist, sizeof (vdev_detach_link_t), - offsetof(vdev_detach_link_t, vdl_node)); - - spa_config_enter(spa, RW_READER); - spa_vdev_replace_done_make_list(&vdlist, spa->spa_root_vdev); - spa_config_exit(spa); - - while ((vdl = list_head(&vdlist)) != NULL) { - list_remove(&vdlist, vdl); - (void) spa_vdev_detach(spa, vdl->vdl_path, vdl->vdl_guid, - B_TRUE); - spa_strfree(vdl->vdl_path); - kmem_free(vdl, sizeof (*vdl)); + vdev_t *vd; + uint64_t guid; + + spa_config_enter(spa, RW_READER, FTAG); + + while ((vd = spa_vdev_replace_done_hunt(spa->spa_root_vdev)) != NULL) { + guid = vd->vdev_guid; + spa_config_exit(spa, FTAG); + if (spa_vdev_detach(spa, guid, B_TRUE) != 0) + return; + spa_config_enter(spa, RW_READER, FTAG); } - list_destroy(&vdlist); + spa_config_exit(spa, FTAG); } /* @@ -1234,7 +1445,16 @@ spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) * ========================================================================== */ -static int spa_scrub_locked(spa_t *, pool_scrub_type_t, boolean_t); +void +spa_scrub_throttle(spa_t *spa, int direction) +{ + mutex_enter(&spa->spa_scrub_lock); + spa->spa_scrub_throttled += direction; + ASSERT(spa->spa_scrub_throttled >= 0); + if (spa->spa_scrub_throttled == 0) + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); +} static void spa_scrub_io_done(zio_t *zio) @@ -1244,22 +1464,23 @@ spa_scrub_io_done(zio_t *zio) zio_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); - if (zio->io_error) - spa->spa_scrub_errors++; - if (--spa->spa_scrub_inflight == 0) - cv_broadcast(&spa->spa_scrub_io_cv); - mutex_exit(&spa->spa_scrub_lock); - - if (zio->io_error) { + if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { vdev_t *vd = zio->io_vd; + spa->spa_scrub_errors++; mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_scrub_errors++; mutex_exit(&vd->vdev_stat_lock); } + if (--spa->spa_scrub_inflight == 0) { + cv_broadcast(&spa->spa_scrub_io_cv); + ASSERT(spa->spa_scrub_throttled == 0); + } + mutex_exit(&spa->spa_scrub_lock); } static void -spa_scrub_io_start(spa_t *spa, blkptr_t 
*bp, int priority, int flags) +spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags, + zbookmark_t *zb) { size_t size = BP_GET_LSIZE(bp); void *data = zio_buf_alloc(size); @@ -1268,8 +1489,13 @@ spa_scrub_io_start(spa_t *spa, blkptr_t *bp, int priority, int flags) spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); + if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) + flags |= ZIO_FLAG_SPECULATIVE; /* intent log block */ + + flags |= ZIO_FLAG_CANFAIL; + zio_nowait(zio_read(NULL, spa, bp, data, size, - spa_scrub_io_done, NULL, priority, flags)); + spa_scrub_io_done, NULL, priority, flags, zb)); } /* ARGSUSED */ @@ -1319,12 +1545,11 @@ spa_scrub_cb(traverse_blk_cache_t *bc, spa_t *spa, void *a) } if (vdev_dtl_contains(&vd->vdev_dtl_map, bp->blk_birth, 1)) { spa_scrub_io_start(spa, bp, ZIO_PRIORITY_RESILVER, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | - ZIO_FLAG_RESILVER); + ZIO_FLAG_RESILVER, &bc->bc_bookmark); } } else { spa_scrub_io_start(spa, bp, ZIO_PRIORITY_SCRUB, - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_SCRUB); + ZIO_FLAG_SCRUB, &bc->bc_bookmark); } return (0); @@ -1348,19 +1573,25 @@ spa_scrub_thread(spa_t *spa) */ txg_wait_synced(spa_get_dsl(spa), 0); - spa_config_enter(spa, RW_WRITER); - vdev_reopen(rvd, NULL); /* purge all vdev caches */ + dprintf("start %s mintxg=%llu maxtxg=%llu\n", + scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", + spa->spa_scrub_mintxg, spa->spa_scrub_maxtxg); + + spa_config_enter(spa, RW_WRITER, FTAG); + vdev_reopen(rvd); /* purge all vdev caches */ vdev_config_dirty(rvd); /* rewrite all disk labels */ vdev_scrub_stat_update(rvd, scrub_type, B_FALSE); - spa_config_exit(spa); + spa_config_exit(spa, FTAG); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_errors = 0; spa->spa_scrub_active = 1; + ASSERT(spa->spa_scrub_inflight == 0); + ASSERT(spa->spa_scrub_throttled == 0); while (!spa->spa_scrub_stop) { CALLB_CPR_SAFE_BEGIN(&cprinfo); - while (spa->spa_scrub_suspend) { + while (spa->spa_scrub_suspended) { spa->spa_scrub_active = 0; cv_broadcast(&spa->spa_scrub_cv); cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); @@ -1376,6 +1607,9 @@ spa_scrub_thread(spa_t *spa) mutex_enter(&spa->spa_scrub_lock); if (error != EAGAIN) break; + + while (spa->spa_scrub_throttled > 0) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } while (spa->spa_scrub_inflight) @@ -1384,16 +1618,25 @@ spa_scrub_thread(spa_t *spa) if (spa->spa_scrub_restart_txg != 0) error = ERESTART; + if (spa->spa_scrub_stop) + error = EINTR; + spa->spa_scrub_active = 0; cv_broadcast(&spa->spa_scrub_cv); /* - * If the traverse completed, and there were no errors, - * then the scrub was completely successful. + * Even if there were uncorrectable errors, we consider the scrub + * completed. The downside is that if there is a transient error during + * a resilver, we won't resilver the data properly to the target. But + * if the damage is permanent (more likely) we will resilver forever, + * which isn't really acceptable. Since there is enough information for + * the user to know what has failed and why, this seems like a more + * tractable approach. */ - complete = (error == 0 && spa->spa_scrub_errors == 0); + complete = (error == 0); - dprintf("scrub to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", + dprintf("end %s to maxtxg=%llu %s, traverse=%d, %llu errors, stop=%u\n", + scrub_type == POOL_SCRUB_RESILVER ? "resilver" : "scrub", spa->spa_scrub_maxtxg, complete ? 
"done" : "FAILED", error, spa->spa_scrub_errors, spa->spa_scrub_stop); @@ -1403,31 +1646,32 @@ spa_scrub_thread(spa_t *spa) * If the scrub/resilver completed, update all DTLs to reflect this. * Whether it succeeded or not, vacate all temporary scrub DTLs. */ - spa_config_enter(spa, RW_WRITER); + spa_config_enter(spa, RW_WRITER, FTAG); vdev_dtl_reassess(rvd, spa_last_synced_txg(spa) + 1, complete ? spa->spa_scrub_maxtxg : 0, B_TRUE); - spa_config_exit(spa); - - spa_vdev_replace_done(spa); - - spa_config_enter(spa, RW_READER); vdev_scrub_stat_update(rvd, POOL_SCRUB_NONE, complete); - spa_config_exit(spa); + spa_errlog_rotate(spa); + spa_config_exit(spa, FTAG); mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_type = POOL_SCRUB_NONE; - spa->spa_scrub_active = 0; - spa->spa_scrub_thread = NULL; - - cv_broadcast(&spa->spa_scrub_cv); + /* + * We may have finished replacing a device. + * Let the async thread assess this and handle the detach. + */ + spa_async_request(spa, SPA_ASYNC_REPLACE_DONE); /* * If we were told to restart, our final act is to start a new scrub. */ if (error == ERESTART) - VERIFY(spa_scrub_locked(spa, scrub_type, B_TRUE) == 0); + spa_async_request(spa, scrub_type == POOL_SCRUB_RESILVER ? + SPA_ASYNC_RESILVER : SPA_ASYNC_SCRUB); + spa->spa_scrub_type = POOL_SCRUB_NONE; + spa->spa_scrub_active = 0; + spa->spa_scrub_thread = NULL; + cv_broadcast(&spa->spa_scrub_cv); CALLB_CPR_EXIT(&cprinfo); /* drops &spa->spa_scrub_lock */ thread_exit(); } @@ -1436,7 +1680,7 @@ void spa_scrub_suspend(spa_t *spa) { mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_suspend++; + spa->spa_scrub_suspended++; while (spa->spa_scrub_active) { cv_broadcast(&spa->spa_scrub_cv); cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); @@ -1450,8 +1694,8 @@ void spa_scrub_resume(spa_t *spa) { mutex_enter(&spa->spa_scrub_lock); - ASSERT(spa->spa_scrub_suspend != 0); - if (--spa->spa_scrub_suspend == 0) + ASSERT(spa->spa_scrub_suspended != 0); + if (--spa->spa_scrub_suspended == 0) cv_broadcast(&spa->spa_scrub_cv); mutex_exit(&spa->spa_scrub_lock); } @@ -1469,17 +1713,19 @@ spa_scrub_restart(spa_t *spa, uint64_t txg) mutex_exit(&spa->spa_scrub_lock); } -static int -spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) +int +spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) { space_seg_t *ss; uint64_t mintxg, maxtxg; vdev_t *rvd = spa->spa_root_vdev; - int advance = 0; + int advance = ADVANCE_PRE | ADVANCE_ZIL; if ((uint_t)type >= POOL_SCRUB_TYPES) return (ENOTSUP); + mutex_enter(&spa->spa_scrub_lock); + /* * If there's a scrub or resilver already in progress, stop it. */ @@ -1487,9 +1733,10 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) /* * Don't stop a resilver unless forced. 
*/ - if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) + if (spa->spa_scrub_type == POOL_SCRUB_RESILVER && !force) { + mutex_exit(&spa->spa_scrub_lock); return (EBUSY); - + } spa->spa_scrub_stop = 1; cv_broadcast(&spa->spa_scrub_cv); cv_wait(&spa->spa_scrub_cv, &spa->spa_scrub_lock); @@ -1503,19 +1750,36 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) spa->spa_scrub_th = NULL; } - spa->spa_scrub_stop = 0; - spa->spa_scrub_type = type; - spa->spa_scrub_restart_txg = 0; + if (rvd == NULL) { + ASSERT(spa->spa_scrub_stop == 0); + ASSERT(spa->spa_scrub_type == type); + ASSERT(spa->spa_scrub_restart_txg == 0); + mutex_exit(&spa->spa_scrub_lock); + return (0); + } mintxg = TXG_INITIAL - 1; maxtxg = spa_last_synced_txg(spa) + 1; - switch (type) { + mutex_enter(&rvd->vdev_dtl_lock); - case POOL_SCRUB_NONE: - break; + if (rvd->vdev_dtl_map.sm_space == 0) { + /* + * The pool-wide DTL is empty. + * If this is a resilver, there's nothing to do. + */ + if (type == POOL_SCRUB_RESILVER) + type = POOL_SCRUB_NONE; + } else { + /* + * The pool-wide DTL is non-empty. + * If this is a normal scrub, upgrade to a resilver instead. + */ + if (type == POOL_SCRUB_EVERYTHING) + type = POOL_SCRUB_RESILVER; + } - case POOL_SCRUB_RESILVER: + if (type == POOL_SCRUB_RESILVER) { /* * Determine the resilvering boundaries. * @@ -1525,26 +1789,22 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) * Note: for maxtxg, we MIN with spa_last_synced_txg(spa) + 1 * so we don't claim to resilver a txg that's still changing. */ - mutex_enter(&rvd->vdev_dtl_lock); ss = avl_first(&rvd->vdev_dtl_map.sm_root); - mintxg = ss ? ss->ss_start - 1 : 0; + mintxg = ss->ss_start - 1; ss = avl_last(&rvd->vdev_dtl_map.sm_root); - maxtxg = ss ? ss->ss_end : 0; - maxtxg = MIN(maxtxg, spa_last_synced_txg(spa) + 1); - mutex_exit(&rvd->vdev_dtl_lock); + maxtxg = MIN(ss->ss_end, maxtxg); - advance = ADVANCE_PRE | ADVANCE_PRUNE; - break; - - case POOL_SCRUB_EVERYTHING: - /* - * A scrub is like a resilver, but not pruned by DTL. 
- */ - advance = ADVANCE_PRE; - break; + advance |= ADVANCE_PRUNE; } - if (mintxg != 0 && maxtxg != 0 && type != POOL_SCRUB_NONE) { + mutex_exit(&rvd->vdev_dtl_lock); + + spa->spa_scrub_stop = 0; + spa->spa_scrub_type = type; + spa->spa_scrub_restart_txg = 0; + + if (type != POOL_SCRUB_NONE) { + spa->spa_scrub_mintxg = mintxg; spa->spa_scrub_maxtxg = maxtxg; spa->spa_scrub_th = traverse_init(spa, spa_scrub_cb, NULL, advance, ZIO_FLAG_CANFAIL); @@ -1553,24 +1813,119 @@ spa_scrub_locked(spa_t *spa, pool_scrub_type_t type, boolean_t force) spa_scrub_thread, spa, 0, &p0, TS_RUN, minclsyspri); } + mutex_exit(&spa->spa_scrub_lock); + return (0); } -int -spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force) +/* + * ========================================================================== + * SPA async task processing + * ========================================================================== + */ + +static void +spa_async_reopen(spa_t *spa) { - int error; - traverse_handle_t *th; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *tvd; + int c; - mutex_enter(&spa->spa_scrub_lock); - error = spa_scrub_locked(spa, type, force); - th = spa->spa_scrub_th; - mutex_exit(&spa->spa_scrub_lock); + spa_config_enter(spa, RW_WRITER, FTAG); + + for (c = 0; c < rvd->vdev_children; c++) { + tvd = rvd->vdev_child[c]; + if (tvd->vdev_reopen_wanted) { + tvd->vdev_reopen_wanted = 0; + vdev_reopen(tvd); + } + } + + spa_config_exit(spa, FTAG); +} - if (th == NULL && type != POOL_SCRUB_NONE) +static void +spa_async_thread(spa_t *spa) +{ + int tasks; + + ASSERT(spa->spa_sync_on); + + mutex_enter(&spa->spa_async_lock); + tasks = spa->spa_async_tasks; + spa->spa_async_tasks = 0; + mutex_exit(&spa->spa_async_lock); + + /* + * See if any devices need to be reopened. + */ + if (tasks & SPA_ASYNC_REOPEN) + spa_async_reopen(spa); + + /* + * If any devices are done replacing, detach them. + */ + if (tasks & SPA_ASYNC_REPLACE_DONE) spa_vdev_replace_done(spa); - return (error); + /* + * Kick off a scrub. + */ + if (tasks & SPA_ASYNC_SCRUB) + VERIFY(spa_scrub(spa, POOL_SCRUB_EVERYTHING, B_TRUE) == 0); + + /* + * Kick off a resilver. + */ + if (tasks & SPA_ASYNC_RESILVER) + VERIFY(spa_scrub(spa, POOL_SCRUB_RESILVER, B_TRUE) == 0); + + /* + * Let the world know that we're done. 
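+ * (Clearing spa_async_thread under spa_async_lock is the handshake
+ * that spa_async_suspend() waits on.)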
+ */ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_thread = NULL; + cv_broadcast(&spa->spa_async_cv); + mutex_exit(&spa->spa_async_lock); + thread_exit(); +} + +void +spa_async_suspend(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_suspended++; + while (spa->spa_async_thread != NULL) + cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); + mutex_exit(&spa->spa_async_lock); +} + +void +spa_async_resume(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + ASSERT(spa->spa_async_suspended != 0); + spa->spa_async_suspended--; + mutex_exit(&spa->spa_async_lock); +} + +static void +spa_async_dispatch(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + if (spa->spa_async_tasks && !spa->spa_async_suspended && + spa->spa_async_thread == NULL) + spa->spa_async_thread = thread_create(NULL, 0, + spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); + mutex_exit(&spa->spa_async_lock); +} + +void +spa_async_request(spa_t *spa, int task) +{ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_tasks |= task; + mutex_exit(&spa->spa_async_lock); } /* @@ -1628,17 +1983,19 @@ spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) packed = kmem_alloc(nvsize, KM_SLEEP); - VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, 0) == 0); + VERIFY(nvlist_pack(config, &packed, &nvsize, NV_ENCODE_XDR, + KM_SLEEP) == 0); dmu_write(spa->spa_meta_objset, spa->spa_config_object, 0, nvsize, packed, tx); kmem_free(packed, nvsize); - db = dmu_bonus_hold(spa->spa_meta_objset, spa->spa_config_object); + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, + spa->spa_config_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); } /* @@ -1651,7 +2008,6 @@ spa_sync(spa_t *spa, uint64_t txg) dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; bplist_t *bpl = &spa->spa_sync_bplist; - vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; int dirty_vdevs; @@ -1659,12 +2015,12 @@ spa_sync(spa_t *spa, uint64_t txg) /* * Lock out configuration changes. */ - spa_config_enter(spa, RW_READER); + spa_config_enter(spa, RW_READER, FTAG); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; - bplist_open(bpl, mos, spa->spa_sync_bplist_obj); + VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); /* * If anything has changed in this txg, push the deferred frees @@ -1685,6 +2041,8 @@ spa_sync(spa_t *spa, uint64_t txg) spa_sync_config_object(spa, tx); dmu_tx_commit(tx); + spa_errlog_sync(spa, txg); + dsl_pool_sync(dp, txg); dirty_vdevs = 0; @@ -1707,11 +2065,7 @@ spa_sync(spa_t *spa, uint64_t txg) * Rewrite the vdev configuration (which includes the uberblock) * to commit the transaction group. */ - while (spa_sync_labels(spa, txg)) { - dprintf("waiting for devices to heal\n"); - delay(hz); - vdev_reopen(rvd, NULL); - } + VERIFY(0 == spa_sync_labels(spa, txg)); /* * Make a stable copy of the fully synced uberblock. @@ -1748,7 +2102,12 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); ASSERT(bpl->bpl_queue == NULL); - spa_config_exit(spa); + spa_config_exit(spa, FTAG); + + /* + * If any async tasks have been requested, kick them off. + */ + spa_async_dispatch(spa); } /* @@ -1800,13 +2159,13 @@ spa_evict_all(void) mutex_enter(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* - * Stop all scrub and resilver activity. spa_scrub() needs to - * wait for the scrub thread, which may do a detach and sync the - * configs, which needs spa_namespace_lock. 
Drop the lock while - * maintaining a hold on the spa_t. + * Stop async tasks. The async thread may need to detach + * a device that's been replaced, which requires grabbing + * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); + spa_async_suspend(spa); VERIFY(spa_scrub(spa, POOL_SCRUB_NONE, B_TRUE) == 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); @@ -1819,3 +2178,9 @@ spa_evict_all(void) } mutex_exit(&spa_namespace_lock); } + +vdev_t * +spa_lookup_by_guid(spa_t *spa, uint64_t guid) +{ + return (vdev_lookup_by_guid(spa->spa_root_vdev, guid)); +} diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c index abcd67ddb9..addf3af885 100644 --- a/usr/src/uts/common/fs/zfs/spa_config.c +++ b/usr/src/uts/common/fs/zfs/spa_config.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,6 +32,11 @@ #include <sys/fs/zfs.h> #include <sys/vdev_impl.h> #include <sys/zfs_ioctl.h> +#ifdef _KERNEL +#include <sys/kobj.h> +#endif + +extern int modrootloaded; /* * Pool configuration repository. @@ -65,43 +69,39 @@ const char *spa_config_dir = ZPOOL_CACHE_DIR; void spa_config_load(void) { - vnode_t *vp; void *buf = NULL; - vattr_t vattr; - ssize_t resid; nvlist_t *nvlist, *child; nvpair_t *nvpair; spa_t *spa; char pathname[128]; + struct _buf *file; + struct bootstat bst; /* * Open the configuration file. */ - (void) snprintf(pathname, sizeof (pathname), "./%s/%s", spa_config_dir, - ZPOOL_CACHE_FILE); - if (vn_openat(pathname, UIO_SYSSPACE, FREAD | FOFFMAX, 0, &vp, 0, 0, - rootdir) != 0) + (void) snprintf(pathname, sizeof (pathname), "%s%s/%s", + (modrootloaded) ? "./" : "", spa_config_dir, ZPOOL_CACHE_FILE); + + file = kobj_open_file(pathname); + if (file == (struct _buf *)-1) return; - /* - * Read the nvlist from the file. - */ - if (VOP_GETATTR(vp, &vattr, 0, kcred) != 0) + if (kobj_fstat(file->_fd, &bst) != 0) goto out; - buf = kmem_alloc(vattr.va_size, KM_SLEEP); + buf = kmem_alloc(bst.st_size, KM_SLEEP); - if (vn_rdwr(UIO_READ, vp, buf, vattr.va_size, 0, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, &resid) != 0) - goto out; - - if (resid != 0) + /* + * Read the nvlist from the file. + */ + if (kobj_read_file(file, buf, bst.st_size, 0) < 0) goto out; /* * Unpack the nvlist. 
 */
-	if (nvlist_unpack(buf, vattr.va_size, &nvlist, KM_SLEEP) != 0)
+	if (nvlist_unpack(buf, bst.st_size, &nvlist, KM_SLEEP) != 0)
 		goto out;

 	/*
@@ -133,10 +133,9 @@ spa_config_load(void)

 out:
 	if (buf != NULL)
-		kmem_free(buf, vattr.va_size);
+		kmem_free(buf, bst.st_size);

-	(void) VOP_CLOSE(vp, FREAD | FOFFMAX, 1, 0, kcred);
-	VN_RELE(vp);
+	kobj_close_file(file);
 }

 /*
@@ -157,7 +156,7 @@ spa_config_sync(void)

 	ASSERT(MUTEX_HELD(&spa_namespace_lock));

-	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);

 	/*
 	 * Add all known pools to the configuration list, ignoring those with
@@ -179,7 +178,8 @@ spa_config_sync(void)

 	buf = kmem_alloc(buflen, KM_SLEEP);

-	VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR, 0) == 0);
+	VERIFY(nvlist_pack(config, &buf, &buflen, NV_ENCODE_XDR,
+	    KM_SLEEP) == 0);

 	/*
 	 * Write the configuration to disk. We need to do the traditional
@@ -226,7 +226,7 @@ spa_all_configs(uint64_t *generation)
 	if (*generation == spa_config_generation)
 		return (NULL);

-	VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_alloc(&pools, NV_UNIQUE_NAME, KM_SLEEP) == 0);

 	spa = NULL;
 	mutex_enter(&spa_namespace_lock);
@@ -279,7 +279,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
 	else if (txg != 0 && vd == rvd)
 		spa->spa_config_txg = txg;

-	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0);
+	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);

 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
 	    UBERBLOCK_VERSION) == 0);
diff --git a/usr/src/uts/common/fs/zfs/spa_errlog.c b/usr/src/uts/common/fs/zfs/spa_errlog.c
new file mode 100644
index 0000000000..b52c3236d2
--- /dev/null
+++ b/usr/src/uts/common/fs/zfs/spa_errlog.c
@@ -0,0 +1,436 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident	"%Z%%M%	%I%	%E% SMI"
+
+/*
+ * Routines to manage the on-disk persistent error log.
+ *
+ * Each pool stores a log of all logical data errors seen during normal
+ * operation. This is actually the union of two distinct logs: the last log,
+ * and the current log. All errors seen are logged to the current log. When a
+ * scrub completes, the current log becomes the last log, the last log is thrown
+ * out, and the current log is reinitialized. This way, if an error is somehow
+ * corrected, a new scrub will show that it no longer exists, and it will be
+ * deleted from the log when the scrub completes.
+ *
+ * The log is stored using a ZAP object whose key is a string form of the
+ * zbookmark tuple (objset, object, level, blkid), and whose contents is an
+ * optional 'objset:object' human-readable string describing the data. When an
+ * error is first logged, this string will be empty, indicating that no name is
+ * known. This prevents us from having to issue a potentially large amount of
+ * I/O to discover the object name during an error path. Instead, we do the
+ * calculation when the data is requested, storing the result so future queries
+ * will be faster.
+ *
+ * This log is then shipped into an nvlist where the key is the dataset name and
+ * the value is the object name. Userland is then responsible for uniquifying
+ * this list and displaying it to the user.
+ */

+#include <sys/dmu_tx.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zap.h>
+#include <sys/zio.h>
+
+/*
+ * This is a stripped-down version of strtoull, suitable only for converting
+ * lowercase hexadecimal numbers that don't overflow.
+ */
+static uint64_t
+strtonum(char *str, char **nptr)
+{
+	uint64_t val = 0;
+	char c;
+	int digit;
+
+	while ((c = *str) != '\0') {
+		if (c >= '0' && c <= '9')
+			digit = c - '0';
+		else if (c >= 'a' && c <= 'f')
+			digit = 10 + c - 'a';
+		else
+			break;
+
+		val *= 16;
+		val += digit;
+
+		str++;
+	}
+
+	*nptr = str;
+
+	return (val);
+}
+
+/*
+ * Convert a bookmark to a string.
+ */
+static void
+bookmark_to_name(zbookmark_t *zb, char *buf, size_t len)
+{
+	(void) snprintf(buf, len, "%llx:%llx:%llx:%llx",
+	    (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object,
+	    (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid);
+}
+
+/*
+ * Convert a string to a bookmark
+ */
+static void
+name_to_bookmark(char *buf, zbookmark_t *zb)
+{
+	zb->zb_objset = strtonum(buf, &buf);
+	ASSERT(*buf == ':');
+	zb->zb_object = strtonum(buf + 1, &buf);
+	ASSERT(*buf == ':');
+	zb->zb_level = (int)strtonum(buf + 1, &buf);
+	ASSERT(*buf == ':');
+	zb->zb_blkid = strtonum(buf + 1, &buf);
+	ASSERT(*buf == '\0');
+}
+
+/*
+ * Log an uncorrectable error to the persistent error log. We add it to the
+ * spa's list of pending errors. The changes are actually synced out to disk
+ * during spa_errlog_sync().
+ */
+void
+spa_log_error(spa_t *spa, zio_t *zio)
+{
+	zbookmark_t *zb = &zio->io_logical->io_bookmark;
+	spa_error_entry_t search;
+	spa_error_entry_t *new;
+	avl_tree_t *tree;
+	avl_index_t where;
+
+	/*
+	 * If we are trying to import a pool, ignore any errors, as we won't be
+	 * writing to the pool any time soon.
+	 */
+	if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
+		return;
+
+	mutex_enter(&spa->spa_errlist_lock);
+
+	/*
+	 * If we have had a request to rotate the log, log it to the next list
+	 * instead of the current one.
+	 */
+	if (spa->spa_scrub_active || spa->spa_scrub_finished)
+		tree = &spa->spa_errlist_scrub;
+	else
+		tree = &spa->spa_errlist_last;
+
+	search.se_bookmark = *zb;
+	if (avl_find(tree, &search, &where) != NULL) {
+		mutex_exit(&spa->spa_errlist_lock);
+		return;
+	}
+
+	new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP);
+	new->se_bookmark = *zb;
+	avl_insert(tree, new, where);
+
+	mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Return the number of errors currently in the error log. This is actually the
+ * sum of both the last log and the current log, since we don't know the union
+ * of these logs until we reach userland.
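+ * The total can therefore overcount: a bookmark present in both logs is
+ * counted twice here and only collapsed once userland uniquifies the
+ * list.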
+ */
+uint64_t
+spa_get_errlog_size(spa_t *spa)
+{
+	uint64_t total = 0, count;
+
+	mutex_enter(&spa->spa_errlog_lock);
+	if (spa->spa_errlog_scrub != 0 &&
+	    zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+	    &count) == 0)
+		total += count;
+
+	if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+	    zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+	    &count) == 0)
+		total += count;
+	mutex_exit(&spa->spa_errlog_lock);
+
+	mutex_enter(&spa->spa_errlist_lock);
+	total += avl_numnodes(&spa->spa_errlist_last);
+	total += avl_numnodes(&spa->spa_errlist_scrub);
+	mutex_exit(&spa->spa_errlist_lock);
+
+	return (total);
+}
+
+#ifdef _KERNEL
+static int
+process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	zbookmark_t zb;
+
+	if (obj == 0)
+		return (0);
+
+	for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+
+		if (*count == 0) {
+			zap_cursor_fini(&zc);
+			return (ENOMEM);
+		}
+
+		name_to_bookmark(za.za_name, &zb);
+
+		if (copyout(&zb, (char *)addr +
+		    (*count - 1) * sizeof (zbookmark_t),
+		    sizeof (zbookmark_t)) != 0)
+			return (EFAULT);
+
+		*count -= 1;
+	}
+
+	zap_cursor_fini(&zc);
+
+	return (0);
+}
+
+static int
+process_error_list(avl_tree_t *list, void *addr, size_t *count)
+{
+	spa_error_entry_t *se;
+
+	for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+
+		if (*count == 0)
+			return (ENOMEM);
+
+		if (copyout(&se->se_bookmark, (char *)addr +
+		    (*count - 1) * sizeof (zbookmark_t),
+		    sizeof (zbookmark_t)) != 0)
+			return (EFAULT);
+
+		*count -= 1;
+	}
+
+	return (0);
+}
+#endif
+
+/*
+ * Copy all known errors to userland as an array of bookmarks. This is
+ * actually a union of the on-disk last log and current log, as well as any
+ * pending error requests.
+ *
+ * Because the act of reading the on-disk log could cause errors to be
+ * generated, we have two separate locks: one for the error log and one for the
+ * in-core error lists. We only need the error list lock to log an error, so
+ * we grab the error log lock while we read the on-disk logs, and only pick up
+ * the error list lock when we are finished.
+ */
+int
+spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+{
+	int ret = 0;
+
+#ifdef _KERNEL
+	mutex_enter(&spa->spa_errlog_lock);
+
+	ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
+
+	if (!ret && !spa->spa_scrub_finished)
+		ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
+		    count);
+
+	mutex_enter(&spa->spa_errlist_lock);
+	if (!ret)
+		ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+		    count);
+	if (!ret)
+		ret = process_error_list(&spa->spa_errlist_last, uaddr,
+		    count);
+	mutex_exit(&spa->spa_errlist_lock);
+
+	mutex_exit(&spa->spa_errlog_lock);
+#endif
+
+	return (ret);
+}
+
+/*
+ * Called when a scrub completes. This simply sets a bit telling
+ * spa_log_error() which AVL tree to add new errors to. spa_errlog_sync() is
+ * responsible for actually syncing the changes to the underlying objects.
+ */
+void
+spa_errlog_rotate(spa_t *spa)
+{
+	mutex_enter(&spa->spa_errlist_lock);
+
+	ASSERT(!spa->spa_scrub_finished);
+	spa->spa_scrub_finished = B_TRUE;
+
+	mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Discard any pending errors from the spa_t. Called when unloading a faulted
+ * pool, as the errors encountered during the open cannot be synced to disk.
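+ * (Only the in-core AVL lists are emptied; any on-disk log objects are
+ * left alone.)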
+ */ +void +spa_errlog_drain(spa_t *spa) +{ + spa_error_entry_t *se; + void *cookie; + + mutex_enter(&spa->spa_errlist_lock); + + cookie = NULL; + while ((se = avl_destroy_nodes(&spa->spa_errlist_last, + &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + cookie = NULL; + while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, + &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + + mutex_exit(&spa->spa_errlist_lock); +} + +/* + * Process a list of errors into the current on-disk log. + */ +static void +sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) +{ + spa_error_entry_t *se; + char buf[64]; + void *cookie; + + if (avl_numnodes(t) != 0) { + /* create log if necessary */ + if (*obj == 0) + *obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, + 0, tx); + + /* add errors to the current log */ + for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { + char *name = se->se_name ? se->se_name : ""; + + bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); + + (void) zap_update(spa->spa_meta_objset, + *obj, buf, 1, strlen(name) + 1, name, tx); + } + + /* purge the error list */ + cookie = NULL; + while ((se = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + } +} + +/* + * Sync the error log out to disk. This is a little tricky because the act of + * writing the error log requires the spa_errlist_lock. So, we need to lock the + * error lists, take a copy of the lists, and then reinitialize them. Then, we + * drop the error list lock and take the error log lock, at which point we + * do the errlog processing. Then, if we encounter an I/O error during this + * process, we can successfully add the error to the list. Note that this will + * result in the perpetual recycling of errors, but it is an unlikely situation + * and not a performance critical operation. + */ +void +spa_errlog_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + avl_tree_t scrub, last; + int scrub_finished; + + mutex_enter(&spa->spa_errlist_lock); + + /* + * Bail out early under normal circumstances. + */ + if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && + avl_numnodes(&spa->spa_errlist_last) == 0 && + !spa->spa_scrub_finished) { + mutex_exit(&spa->spa_errlist_lock); + return; + } + + spa_get_errlists(spa, &last, &scrub); + scrub_finished = spa->spa_scrub_finished; + spa->spa_scrub_finished = B_FALSE; + + mutex_exit(&spa->spa_errlist_lock); + mutex_enter(&spa->spa_errlog_lock); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + /* + * Sync out the current list of errors. + */ + sync_error_list(spa, &last, &spa->spa_errlog_last, tx); + + /* + * Rotate the log if necessary. + */ + if (scrub_finished) { + if (spa->spa_errlog_last != 0) + VERIFY(dmu_object_free(spa->spa_meta_objset, + spa->spa_errlog_last, tx) == 0); + spa->spa_errlog_last = spa->spa_errlog_scrub; + spa->spa_errlog_scrub = 0; + + sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); + } + + /* + * Sync out any pending scrub errors. + */ + sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); + + /* + * Update the MOS to reflect the new values. 
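+ * (The object numbers live in the MOS directory under the
+ * DMU_POOL_ERRLOG_LAST and DMU_POOL_ERRLOG_SCRUB keys; a value of zero
+ * means the corresponding log does not exist yet.)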
+ */ + (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1, + &spa->spa_errlog_last, tx); + (void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1, + &spa->spa_errlog_scrub, tx); + + dmu_tx_commit(tx); + + mutex_exit(&spa->spa_errlog_lock); +} diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 1ea7edfb77..8e0f6ce722 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -60,6 +59,7 @@ * - Increase spa_refcount from non-zero * - Check if spa_refcount is zero * - Rename a spa_t + * - add/remove/attach/detach devices * - Held for the duration of create/destroy/import/export * * It does not need to handle recursion. A create or destroy may @@ -91,14 +91,6 @@ * must have the namespace lock or non-zero refcount to have any kind * of spa_t pointer at all. * - * spa_vdev_lock (global mutex) - * - * This special lock is a global mutex used to serialize attempts to - * access devices through ZFS. It makes sure that we do not try to add - * a single vdev to multiple pools at the same time. It must be held - * when adding or removing a device from the pool. - * - * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount @@ -111,10 +103,9 @@ * There must be at least one valid reference on the spa_t to acquire * the config lock. * - * spa_vdev_lock -> spa_config_lock + * spa_namespace_lock -> spa_config_lock * - * There are no locks required for spa_vdev_lock, but it must be - * acquired before spa_config_lock. + * The namespace lock must always be taken before the config lock. * * * The spa_namespace_lock and spa_config_cache_lock can be acquired directly and @@ -136,6 +127,7 @@ * spa_evict_all() Shutdown and remove all spa_t structures in * the system. * + * spa_guid_exists() Determine whether a pool/device guid exists. * * The spa_refcount is manipulated using the following functions: * @@ -162,15 +154,14 @@ * spa_config_held() Returns true if the config lock is currently * held in the given state. * - * The spa_vdev_lock, while acquired directly, is hidden by the following - * functions, which imply additional semantics that must be followed: + * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * - * spa_vdev_enter() Acquire the vdev lock and the config lock for - * writing. + * spa_vdev_enter() Acquire the namespace lock and the config lock + * for writing. * * spa_vdev_exit() Release the config lock, wait for all I/O - * to complete, release the vdev lock, and sync - * the updated configs to the cache. + * to complete, sync the updated configs to the + * cache, and release the namespace lock. 
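+ *
+ * A typical device operation is therefore bracketed as follows (sketch
+ * only; 'vd' is whatever vdev, if any, should be freed on exit):
+ *
+ *	txg = spa_vdev_enter(spa);
+ *
+ *	... modify the vdev tree ...
+ *
+ *	return (spa_vdev_exit(spa, vd, txg, error));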
* * The spa_name() function also requires either the spa_namespace_lock * or the spa_config_lock, as both are needed to do a rename. spa_rename() is @@ -191,8 +182,6 @@ int zfs_flags = ~0; int zfs_flags = 0; #endif -static kmutex_t spa_vdev_lock; - #define SPA_MINREF 5 /* spa_refcnt for an open-but-idle pool */ /* @@ -238,6 +227,7 @@ spa_add(const char *name) spa->spa_freeze_txg = UINT64_MAX; refcount_create(&spa->spa_refcount); + refcount_create(&spa->spa_config_lock.scl_count); avl_add(&spa_namespace_avl, spa); @@ -268,6 +258,7 @@ spa_remove(spa_t *spa) spa_config_set(spa, NULL); refcount_destroy(&spa->spa_refcount); + refcount_destroy(&spa->spa_config_lock.scl_count); kmem_free(spa, sizeof (spa_t)); } @@ -351,7 +342,7 @@ spa_refcount_zero(spa_t *spa) * valid use during create. */ void -spa_config_enter(spa_t *spa, krw_t rw) +spa_config_enter(spa_t *spa, krw_t rw, void *tag) { spa_config_lock_t *scl = &spa->spa_config_lock; @@ -362,13 +353,14 @@ spa_config_enter(spa_t *spa, krw_t rw) while (scl->scl_writer != NULL) cv_wait(&scl->scl_cv, &scl->scl_lock); } else { - while (scl->scl_writer != NULL || scl->scl_count > 0) + while (scl->scl_writer != NULL || + !refcount_is_zero(&scl->scl_count)) cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_writer = curthread; } } - scl->scl_count++; + (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } @@ -377,14 +369,14 @@ spa_config_enter(spa_t *spa, krw_t rw) * Release the spa config lock, notifying any waiters in the process. */ void -spa_config_exit(spa_t *spa) +spa_config_exit(spa_t *spa, void *tag) { spa_config_lock_t *scl = &spa->spa_config_lock; mutex_enter(&scl->scl_lock); - ASSERT(scl->scl_count > 0); - if (--scl->scl_count == 0) { + ASSERT(!refcount_is_zero(&scl->scl_count)); + if (refcount_remove(&scl->scl_count, tag) == 0) { cv_broadcast(&scl->scl_cv); scl->scl_writer = NULL; /* OK in either case */ } @@ -405,7 +397,7 @@ spa_config_held(spa_t *spa, krw_t rw) if (rw == RW_WRITER) held = (scl->scl_writer == curthread); else - held = (scl->scl_count != 0); + held = !refcount_is_zero(&scl->scl_count); mutex_exit(&scl->scl_lock); return (held); @@ -418,16 +410,22 @@ spa_config_held(spa_t *spa, krw_t rw) */ /* - * Lock the given spa_t for the purpose of adding or removing a vdev. This - * grabs the global spa_vdev_lock as well as the spa config lock for writing. + * Lock the given spa_t for the purpose of adding or removing a vdev. + * Grabs the global spa_namespace_lock plus the spa config lock for writing. * It returns the next transaction group for the spa_t. */ uint64_t spa_vdev_enter(spa_t *spa) { - mutex_enter(&spa_vdev_lock); + /* + * Suspend scrub activity while we mess with the config. + */ + spa_scrub_suspend(spa); - spa_config_enter(spa, RW_WRITER); + if (spa->spa_root_vdev != NULL) /* not spa_create() */ + mutex_enter(&spa_namespace_lock); + + spa_config_enter(spa, RW_WRITER, spa); return (spa_last_synced_txg(spa) + 1); } @@ -441,14 +439,26 @@ spa_vdev_enter(spa_t *spa) int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { - vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); + ASSERT(txg != 0); + + /* + * Reassess the DTLs. spa_scrub() looks at the DTLs without + * taking the config lock at all, so keep it safe. + */ + if (spa->spa_root_vdev) + vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); + + spa_config_exit(spa, spa); - spa_config_exit(spa); + /* + * If there was a scrub or resilver in progress, indicate that + * it must restart, and then allow it to resume. 
+ */ + spa_scrub_restart(spa, txg); + spa_scrub_resume(spa); - if (vd == spa->spa_root_vdev) { /* spa_create() */ - mutex_exit(&spa_vdev_lock); + if (vd == spa->spa_root_vdev) /* spa_create() */ return (error); - } /* * Note: this txg_wait_synced() is important because it ensures @@ -458,8 +468,6 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) if (error == 0) txg_wait_synced(spa->spa_dsl_pool, txg); - mutex_exit(&spa_vdev_lock); - if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0); vdev_free(vd); @@ -469,11 +477,10 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) * If we're in the middle of export or destroy, don't sync the * config -- it will do that anyway, and we deadlock if we try. */ - if (error == 0 && spa->spa_state == POOL_STATE_ACTIVE) { - mutex_enter(&spa_namespace_lock); + if (error == 0 && spa->spa_state == POOL_STATE_ACTIVE) spa_config_sync(); - mutex_exit(&spa_namespace_lock); - } + + mutex_exit(&spa_namespace_lock); return (error); } @@ -497,7 +504,7 @@ spa_rename(const char *name, const char *newname) * Lookup the spa_t and grab the config lock for writing. We need to * actually open the pool so that we can sync out the necessary labels. * It's OK to call spa_open() with the namespace lock held because we - * alllow recursive calls for other reasons. + * allow recursive calls for other reasons. */ mutex_enter(&spa_namespace_lock); if ((err = spa_open(name, &spa, FTAG)) != 0) { @@ -505,7 +512,7 @@ spa_rename(const char *name, const char *newname) return (err); } - spa_config_enter(spa, RW_WRITER); + spa_config_enter(spa, RW_WRITER, FTAG); avl_remove(&spa_namespace_avl, spa); spa_strfree(spa->spa_name); @@ -519,7 +526,7 @@ spa_rename(const char *name, const char *newname) */ vdev_config_dirty(spa->spa_root_vdev); - spa_config_exit(spa); + spa_config_exit(spa, FTAG); txg_wait_synced(spa->spa_dsl_pool, 0); @@ -548,12 +555,8 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; - boolean_t locked = B_FALSE; - if (mutex_owner(&spa_namespace_lock) != curthread) { - mutex_enter(&spa_namespace_lock); - locked = B_TRUE; - } + ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa->spa_state == POOL_STATE_UNINITIALIZED) @@ -565,9 +568,6 @@ spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) break; } - if (locked) - mutex_exit(&spa_namespace_lock); - return (spa != NULL); } @@ -646,12 +646,12 @@ spa_freeze(spa_t *spa) { uint64_t freeze_txg = 0; - spa_config_enter(spa, RW_WRITER); + spa_config_enter(spa, RW_WRITER, FTAG); if (spa->spa_freeze_txg == UINT64_MAX) { freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; spa->spa_freeze_txg = freeze_txg; } - spa_config_exit(spa); + spa_config_exit(spa, FTAG); if (freeze_txg != 0) txg_wait_synced(spa_get_dsl(spa), freeze_txg); } diff --git a/usr/src/uts/common/fs/zfs/space_map.c b/usr/src/uts/common/fs/zfs/space_map.c index 25f66bf94b..a99ec3f360 100644 --- a/usr/src/uts/common/fs/zfs/space_map.c +++ b/usr/src/uts/common/fs/zfs/space_map.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -293,7 +292,8 @@ space_map_load(space_map_t *sm, space_map_obj_t *smo, uint8_t maptype, dprintf("object=%llu offset=%llx size=%llx\n", smo->smo_object, offset, size); - dmu_read(os, smo->smo_object, offset, size, entry_map); + VERIFY(0 == dmu_read(os, smo->smo_object, offset, size, + entry_map)); entry_map_end = entry_map + (size / sizeof (uint64_t)); for (entry = entry_map; entry < entry_map_end; entry++) { @@ -394,7 +394,8 @@ space_map_write(space_map_t *sm, space_map_obj_t *smo, objset_t *os, { uint64_t oldsize = smo->smo_objsize; - dmu_free_range(os, smo->smo_object, 0, smo->smo_objsize, tx); + VERIFY(0 == dmu_free_range(os, smo->smo_object, 0, + smo->smo_objsize, tx)); smo->smo_objsize = 0; diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index b11cd42b6d..1a93d4e4ca 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
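The space_map.c hunks above show the error-hardening idiom used throughout this change: dmu_read() and dmu_free_range() now return an errno instead of panicking internally, and call sites that genuinely cannot proceed on failure say so explicitly with VERIFY(0 == ...). A self-contained sketch of the idiom (the VERIFY macro below is a simplified stand-in for the kernel's, and demo_read() is a hypothetical helper):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>

/* Simplified stand-in for the kernel's VERIFY(): abort on failure. */
#define	VERIFY(cond)	\
	((void)((cond) || \
	(fprintf(stderr, "VERIFY(%s) failed\n", #cond), abort(), 0)))

/* A hypothetical read routine converted from void to int-returning. */
static int
demo_read(const char *src, size_t size, void *buf)
{
	if (src == NULL)
		return (EIO);	/* report the error; don't panic here */
	(void) memcpy(buf, src, size);
	return (0);
}

int
main(void)
{
	char buf[8];

	/* A caller that cannot tolerate failure asserts success. */
	VERIFY(0 == demo_read("metadata", sizeof (buf), buf));
	(void) printf("read: %.8s\n", buf);
	return (0);
}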
*/ @@ -41,6 +40,7 @@ typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); typedef void arc_byteswap_func_t(void *buf, size_t size); +typedef int arc_evict_func_t(void *private); /* generic arc_done_func_t's which you can use */ arc_done_func_t arc_bcopy_func; @@ -50,6 +50,8 @@ struct arc_buf { arc_buf_hdr_t *b_hdr; arc_buf_t *b_next; void *b_data; + arc_evict_func_t *b_efunc; + void *b_private; }; /* @@ -60,22 +62,30 @@ struct arc_buf { #define ARC_PREFETCH (1 << 3) /* I/O is a prefetch */ arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag); -void arc_buf_free(arc_buf_t *buf, void *tag); +void arc_buf_add_ref(arc_buf_t *buf, void *tag); +int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); void arc_release(arc_buf_t *buf, void *tag); int arc_released(arc_buf_t *buf); +int arc_has_callback(arc_buf_t *buf); +#ifdef ZFS_DEBUG +int arc_referenced(arc_buf_t *buf); +#endif int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, arc_done_func_t *done, void *private, int priority, int flags, - uint32_t arc_flags); + uint32_t arc_flags, zbookmark_t *zb); int arc_write(zio_t *pio, spa_t *spa, int checksum, int compress, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, arc_done_func_t *done, void *private, int priority, int flags, - uint32_t arc_flags); + uint32_t arc_flags, zbookmark_t *zb); int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_done_func_t *done, void *private, uint32_t arc_flags); int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); +void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); +int arc_buf_evict(arc_buf_t *buf); + void arc_flush(void); void arc_tempreserve_clear(uint64_t tempreserve); int arc_tempreserve_space(uint64_t tempreserve); diff --git a/usr/src/uts/common/fs/zfs/sys/bplist.h b/usr/src/uts/common/fs/zfs/sys/bplist.h index 0933cb977b..c716fe7aa6 100644 --- a/usr/src/uts/common/fs/zfs/sys/bplist.h +++ b/usr/src/uts/common/fs/zfs/sys/bplist.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
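The arc.h changes earlier in this hunk add an eviction callback to arc_buf_t: a consumer registers b_efunc/b_private via arc_set_callback(), and the callback fires when the buffer is evicted, with no locks held and possibly after b_data is already gone. A toy userland model of that contract (only the arc-style field names come from the header; everything else is illustrative):

#include <stdio.h>
#include <stdlib.h>

typedef int arc_evict_func_t(void *private);

/* Reduced arc_buf_t: only the fields this example needs. */
typedef struct buf {
	void *b_data;
	arc_evict_func_t *b_efunc;
	void *b_private;
} buf_t;

/* Model of arc_set_callback(): remember who to notify at eviction. */
static void
set_callback(buf_t *buf, arc_evict_func_t *func, void *private)
{
	buf->b_efunc = func;
	buf->b_private = private;
}

/*
 * Model of eviction: the data may be freed before the callback runs,
 * and the callback is invoked with no locks held.
 */
static void
evict(buf_t *buf)
{
	free(buf->b_data);
	buf->b_data = NULL;
	if (buf->b_efunc != NULL)
		(void) buf->b_efunc(buf->b_private);
}

static int
my_evict_cb(void *private)
{
	(void) printf("consumer dropped state for %s\n", (char *)private);
	return (0);
}

int
main(void)
{
	buf_t buf = { NULL, NULL, NULL };

	buf.b_data = malloc(16);
	set_callback(&buf, my_evict_cb, "znode-42");
	evict(&buf);
	return (0);
}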
*/ @@ -67,11 +66,11 @@ typedef struct bplist { extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx); extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx); -extern void bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); +extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); extern void bplist_close(bplist_t *bpl); extern boolean_t bplist_empty(bplist_t *bpl); extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); -extern void bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx); +extern int bplist_enqueue(bplist_t *bpl, blkptr_t *bp, dmu_tx_t *tx); extern void bplist_enqueue_deferred(bplist_t *bpl, blkptr_t *bp); extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); diff --git a/usr/src/uts/common/fs/zfs/sys/dbuf.h b/usr/src/uts/common/fs/zfs/sys/dbuf.h index d67901b31a..5724f7a324 100644 --- a/usr/src/uts/common/fs/zfs/sys/dbuf.h +++ b/usr/src/uts/common/fs/zfs/sys/dbuf.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -45,13 +44,14 @@ extern "C" { #define IN_DMU_SYNC ((blkptr_t *)-1) /* - * define flags for dbuf_read and friends + * define flags for dbuf_read */ #define DB_RF_MUST_SUCCEED 0 #define DB_RF_CANFAIL (1 << 1) #define DB_RF_HAVESTRUCT (1 << 2) #define DB_RF_NOPREFETCH (1 << 3) +#define DB_RF_NEVERWAIT (1 << 4) /* * The state transition diagram for dbufs looks like: @@ -59,7 +59,7 @@ extern "C" { * +----> READ ----+ * | | * | V - * (alloc)-->UNCACHED CACHED-->(free) + * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) * | ^ * | | * +----> FILL ----+ @@ -68,7 +68,8 @@ typedef enum dbuf_states { DB_UNCACHED, DB_FILL, DB_READ, - DB_CACHED + DB_CACHED, + DB_EVICTING } dbuf_states_t; struct objset_impl; @@ -158,8 +159,8 @@ typedef struct dmu_buf_impl { uint64_t db_dirtied; /* - * If dd_dnode != NULL, our link on the owner dnodes's dn_dbufs list. - * Protected by its dn_mtx. + * If db_dnode != NULL, our link on the owner dnode's dn_dbufs list. + * Protected by its dn_dbufs_mtx. */ list_node_t db_link; @@ -194,7 +195,7 @@ typedef struct dmu_buf_impl { * modify (dirty or clean). db_mtx must be held * before dn_dirty_mtx.
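The dbuf.h hunk above extends the state diagram with DB_EVICTING between DB_CACHED and the final free. A standalone checker for the transitions, read directly off the diagram (the real dbuf code enforces these with locks and refcounts; this is only the shape of the state machine):

#include <stdio.h>
#include <assert.h>

typedef enum dbuf_states {
	DB_UNCACHED,
	DB_FILL,
	DB_READ,
	DB_CACHED,
	DB_EVICTING
} dbuf_states_t;

/*
 * Legal transitions per the diagram:
 * UNCACHED -> READ -> CACHED, UNCACHED -> FILL -> CACHED,
 * CACHED -> EVICTING -> (free).
 */
static int
transition_ok(dbuf_states_t from, dbuf_states_t to)
{
	switch (from) {
	case DB_UNCACHED:
		return (to == DB_READ || to == DB_FILL);
	case DB_READ:
	case DB_FILL:
		return (to == DB_CACHED);
	case DB_CACHED:
		return (to == DB_EVICTING);
	default:
		return (0);
	}
}

int
main(void)
{
	assert(transition_ok(DB_UNCACHED, DB_READ));
	assert(transition_ok(DB_READ, DB_CACHED));
	assert(transition_ok(DB_CACHED, DB_EVICTING));
	assert(!transition_ok(DB_EVICTING, DB_CACHED));
	(void) printf("dbuf state transitions check out\n");
	return (0);
}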
*/ - arc_buf_t *db_data_old[TXG_SIZE]; + void *db_data_old[TXG_SIZE]; blkptr_t *db_overridden_by[TXG_SIZE]; } db_d; } dmu_buf_impl_t; @@ -212,35 +213,32 @@ typedef struct dbuf_hash_table { uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); +dmu_buf_impl_t *dbuf_create_bonus(struct dnode *dn); -dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid); +dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, void *tag); -dmu_buf_impl_t *dbuf_hold_bonus(struct dnode *dn, void *tag); int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, void *tag, dmu_buf_impl_t **dbp); void dbuf_prefetch(struct dnode *dn, uint64_t blkid); void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); -void dbuf_remove_ref(dmu_buf_impl_t *db, void *tag); uint64_t dbuf_refcount(dmu_buf_impl_t *db); -void dbuf_rele(dmu_buf_impl_t *db); +void dbuf_rele(dmu_buf_impl_t *db, void *tag); dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid); -void dbuf_read(dmu_buf_impl_t *db); -int dbuf_read_canfail(dmu_buf_impl_t *db); -void dbuf_read_havestruct(dmu_buf_impl_t *db); -void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); +int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); void dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -void dbuf_will_fill(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +void dbuf_clear(dmu_buf_impl_t *db); void dbuf_evict(dmu_buf_impl_t *db); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); @@ -250,7 +248,6 @@ void dbuf_unoverride(dmu_buf_impl_t *db, uint64_t txg); void dbuf_free_range(struct dnode *dn, uint64_t blkid, uint64_t nblks, struct dmu_tx *); -void dbuf_downgrade(dmu_buf_impl_t *db, int evicting); void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); void dbuf_init(void); diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 62cc46c4de..f0ba816a7c 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -99,6 +98,8 @@ typedef enum dmu_object_type { DMU_OT_PLAIN_OTHER, /* UINT8 */ DMU_OT_UINT64_OTHER, /* UINT64 */ DMU_OT_ZAP_OTHER, /* ZAP */ + /* new object types: */ + DMU_OT_ERROR_LOG, /* ZAP */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -146,6 +147,7 @@ void zfs_znode_byteswap(void *buf, size_t size); int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, objset_t **osp); void dmu_objset_close(objset_t *os); +void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, objset_t *clone_parent, void (*func)(objset_t *os, void *arg, dmu_tx_t *tx), void *arg); @@ -177,6 +179,8 @@ typedef void dmu_byteswap_func_t(void *buf, size_t size); #define DMU_POOL_CONFIG "config" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPLIST "sync_bplist" +#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" +#define DMU_POOL_ERRLOG_LAST "errlog_last" /* * Allocate an object from this objset. The range of object numbers @@ -268,8 +272,7 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release your hold with dmu_buf_rele(). */ -dmu_buf_t *dmu_bonus_hold(objset_t *os, uint64_t object); -dmu_buf_t *dmu_bonus_hold_tag(objset_t *os, uint64_t object, void *tag); +int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); /* @@ -286,11 +289,10 @@ int dmu_bonus_max(void); * * The object number must be a valid, allocated object number. */ -dmu_buf_t *dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset); +int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **); void dmu_buf_add_ref(dmu_buf_t *db, void* tag); -void dmu_buf_remove_ref(dmu_buf_t *db, void* tag); -void dmu_buf_rele(dmu_buf_t *db); -void dmu_buf_rele_tag(dmu_buf_t *db, void *tag); +void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); /* @@ -303,9 +305,9 @@ uint64_t dmu_buf_refcount(dmu_buf_t *db); * with dmu_buf_rele_array. You can NOT release the hold on each buffer * individually with dmu_buf_rele. */ -dmu_buf_t **dmu_buf_hold_array(objset_t *os, uint64_t object, - uint64_t offset, uint64_t length, int *numbufs); -void dmu_buf_rele_array(dmu_buf_t **, int numbufs); +int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); +void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); /* * Returns NULL on success, or the existing user ptr if it's already @@ -348,19 +350,6 @@ void dmu_buf_rele_data(dmu_buf_t *db); void *dmu_buf_get_user(dmu_buf_t *db); /* - * Indicate that you are going to read the buffer's data (db_data). - * - * This routine will read the data from disk if necessary. - * - * These routines will return 0 on success, or an errno if there is a - * nonrecoverable I/O error. - */ -void dmu_buf_read(dmu_buf_t *db); -int dmu_buf_read_canfail(dmu_buf_t *db); -void dmu_buf_read_array(dmu_buf_t **dbp, int numbufs); -int dmu_buf_read_array_canfail(dmu_buf_t **dbp, int numbufs); - -/* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. 
you've called @@ -370,20 +359,6 @@ int dmu_buf_read_array_canfail(dmu_buf_t **dbp, int numbufs); void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); /* - * Indicate that you are going to modify the entire contents of the - * buffer's data ("fill" it). - * - * This routine is the same as dmu_buf_will_dirty, except that it won't - * read the contents off the disk, so the contents may be uninitialized - * and you must overwrite it. - * - * The transaction (tx) must be assigned to a txg (ie. you've called - * dmu_tx_assign()). The buffer's object must be held in the tx (ie. - * you've called dmu_tx_hold_object(tx, db->db_object)). - */ -/* void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); */ - -/* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has @@ -408,7 +383,7 @@ dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); -void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int ops); +void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); @@ -418,7 +393,7 @@ void dmu_tx_commit(dmu_tx_t *tx); * Free up the data blocks for a defined range of a file. If size is * zero, the range from offset to end-of-file is freed. */ -void dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, +int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); /* @@ -427,10 +402,8 @@ void dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ -void dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, +int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf); -int dmu_read_canfail(objset_t *dd, uint64_t object, uint64_t offset, - uint64_t size, void *buf); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); int dmu_write_uio(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, @@ -491,8 +464,7 @@ uint64_t dmu_object_max_nonzero_offset(objset_t *os, uint64_t object); typedef struct dmu_objset_stats { dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; - uint8_t dds_is_placeholder; - uint8_t dds_pad[2]; + uint8_t dds_pad[3]; uint64_t dds_creation_time; uint64_t dds_creation_txg; @@ -532,7 +504,6 @@ typedef struct dmu_objset_stats { * change, so there is a small probability that it will collide. 
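The transaction lifecycle spelled out in the dmu.h comment above (create, hold, assign, then modify and commit) has one subtlety worth showing: assignment can fail transiently when the current open txg cannot take more work, and callers loop. A toy model of that retry shape (tx_assign() and its forced first failure are fabricated for illustration; ERESTART is used here as the conventional transient error):

#include <stdio.h>
#include <errno.h>

#ifndef ERESTART
#define	ERESTART	85	/* illustrative value if the host lacks it */
#endif

/* Toy tx: a real dmu_tx carries holds and a txg handle. */
typedef struct tx {
	int tx_assigned;
} tx_t;

/*
 * Stand-in for assignment: fail once with ERESTART to show the retry
 * shape a caller uses when the open txg cannot take more work.
 */
static int
tx_assign(tx_t *tx)
{
	static int busy = 1;

	if (busy-- > 0)
		return (ERESTART);
	tx->tx_assigned = 1;
	return (0);
}

int
main(void)
{
	tx_t tx = { 0 };
	int err;

	/* create the tx and declare (hold) what we might modify ... */
	while ((err = tx_assign(&tx)) == ERESTART) {
		/* ... abort, wait for the next open txg, retry ... */
		(void) printf("txg full, retrying\n");
	}
	if (err == 0)
		(void) printf("assigned; dirty buffers, then commit\n");
	return (err);
}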
*/ uint64_t dds_fsid_guid; - uint64_t dds_guid; uint64_t dds_objects_used; /* number of objects used */ uint64_t dds_objects_avail; /* number of objects available */ @@ -553,15 +524,9 @@ typedef struct dmu_objset_stats { uint64_t dds_available; /* - * Miscellaneous + * Used for debugging purposes */ - char dds_altroot[MAXPATHLEN]; - - /* The following are for debugging purposes only */ uint64_t dds_last_txg; - uint64_t dds_dir_obj; - uint64_t dds_objset_obj; - uint64_t dds_clone_of_obj; } dmu_objset_stats_t; /* @@ -617,7 +582,7 @@ void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, struct vnode *vp); -int dmu_recvbackup(struct drr_begin *drrb, uint64_t *sizep, +int dmu_recvbackup(char *tosnap, struct drr_begin *drrb, uint64_t *sizep, struct vnode *vp, uint64_t voffset); /* CRC64 table */ diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h index d0a77fcfb9..ee14bfab85 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_objset.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_objset.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -86,12 +85,7 @@ typedef struct objset_impl { list_t os_downgraded_dbufs; } objset_impl_t; -#define DMU_PRIVATE_OBJECT (1ULL << 63) - -#define DMU_META_DNODE_OBJECT (1ULL << 63) - -/* XXX rename this to DMU_IS_DNODE_OBJECT? */ -#define IS_DNODE_DNODE(object) ((object) == DMU_META_DNODE_OBJECT) +#define DMU_META_DNODE_OBJECT 0 /* called from zpl */ int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, @@ -106,13 +100,14 @@ void dmu_objset_stats(objset_t *os, dmu_objset_stats_t *dds); void dmu_objset_find(char *name, void func(char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); +void dmu_objset_evict_dbufs(objset_t *os); /* called from dsl */ void dmu_objset_sync(objset_impl_t *os, dmu_tx_t *tx); objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, dmu_objset_type_t type, dmu_tx_t *tx); -objset_impl_t *dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, - blkptr_t *bp); +int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, + objset_impl_t **osip); void dmu_objset_evict(struct dsl_dataset *ds, void *arg); #ifdef __cplusplus diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h index 7087912e00..a80345afd0 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_traverse.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -45,7 +44,8 @@ extern "C" { #define ADVANCE_PRUNE 0x02 /* prune by prev snapshot birth time */ #define ADVANCE_DATA 0x04 /* read user data blocks */ #define ADVANCE_HOLES 0x08 /* visit holes */ -#define ADVANCE_NOLOCK 0x10 /* Don't grab SPA sync lock */ +#define ADVANCE_ZIL 0x10 /* visit intent log blocks */ +#define ADVANCE_NOLOCK 0x20 /* Don't grab SPA sync lock */ #define ZB_NO_LEVEL -2 #define ZB_MAXLEVEL 32 /* Next power of 2 >= DN_MAX_LEVELS */ @@ -58,13 +58,6 @@ extern "C" { #define ZB_DN_CACHE 2 #define ZB_DEPTH 3 -typedef struct zbookmark { - uint64_t zb_objset; - uint64_t zb_object; - int zb_level; - uint64_t zb_blkid; -} zbookmark_t; - typedef struct zseg { uint64_t seg_mintxg; uint64_t seg_maxtxg; @@ -93,6 +86,7 @@ struct traverse_handle { int th_zio_flags; list_t th_seglist; traverse_blk_cache_t th_cache[ZB_DEPTH][ZB_MAXLEVEL]; + traverse_blk_cache_t th_zil_cache; uint64_t th_hits; uint64_t th_arc_hits; uint64_t th_reads; diff --git a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h index d04c7c8d6b..9b55c56bc9 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu_tx.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu_tx.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -54,6 +53,7 @@ struct dmu_tx { struct dsl_dir *tx_dir; struct dsl_pool *tx_pool; uint64_t tx_txg; + uint64_t tx_lastsnap_txg; txg_handle_t tx_txgh; uint64_t tx_space_towrite; refcount_t tx_space_written; @@ -62,7 +62,7 @@ struct dmu_tx { uint64_t tx_space_tooverwrite; void *tx_tempreserve_cookie; uint8_t tx_anyobj; - uint8_t tx_privateobj; + int tx_err; #ifdef ZFS_DEBUG char *tx_debug_buf; int tx_debug_len; @@ -79,15 +79,10 @@ enum dmu_tx_hold_type { THT_NUMTYPES }; -typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, - uint64_t arg1, uint64_t arg2); - - typedef struct dmu_tx_hold { list_node_t dth_node; struct dnode *dth_dnode; enum dmu_tx_hold_type dth_type; - dmu_tx_hold_func_t dth_func; uint64_t dth_arg1; uint64_t dth_arg2; /* XXX track what the actual estimates were for this hold */ diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h index 1b43805e93..31b148f295 100644 --- a/usr/src/uts/common/fs/zfs/sys/dnode.h +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). 
You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -63,23 +62,16 @@ extern "C" { #define DNODE_SIZE (1 << DNODE_SHIFT) #define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) #define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) +#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) #define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) -#define DN_META_DNODE_LEVELS \ - (1 + (DN_MAX_OBJECT_SHIFT - DNODE_SHIFT + SPA_BLKPTRSHIFT - \ - DNODES_PER_BLOCK_SHIFT) / DNODES_PER_LEVEL_SHIFT) - /* The +2 here is a cheesy way to round up */ #define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) -#define DN_MAX_OBJECT \ - ((uint64_t)DN_MAX_NBLKPTR << (DNODES_PER_BLOCK_SHIFT + \ - (DN_META_DNODE_LEVELS - 1) * DNODES_PER_LEVEL_SHIFT)) - #define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) @@ -213,15 +205,7 @@ typedef struct dnode { kmutex_t dn_dbufs_mtx; list_t dn_dbufs; /* linked list of descendent dbuf_t's */ - kcondvar_t dn_evicted; /* a child dbuf has been evicted */ - - /* - * Performance hack: whenever we have a hold on the bonus buffer of a - * ZAP object, we will also have a hold on db0. This will keep the - * meta-data for a micro-zap object cached as long as the znode for the - * object is in the znode cache. - */ - struct dmu_buf_impl *dn_db0; + struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ /* holds prefetch structure */ struct zfetch dn_zfetch; @@ -237,9 +221,10 @@ dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp, uint64_t object); void dnode_special_close(dnode_t *dn); -dnode_t *dnode_hold(struct objset_impl *dd, uint64_t object, void *ref); -dnode_t *dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, - void *ref); +int dnode_hold(struct objset_impl *dd, uint64_t object, + void *ref, dnode_t **dnp); +int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, + void *ref, dnode_t **dnp); void dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); @@ -266,6 +251,7 @@ void dnode_init(void); void dnode_fini(void); int dnode_next_offset(dnode_t *dn, boolean_t hole, uint64_t *off, int minlvl, uint64_t blkfill); +void dnode_evict_dbufs(dnode_t *dn); #ifdef ZFS_DEBUG diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h index e56c8a67d9..3411eba68b 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dataset.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -108,8 +107,8 @@ int dsl_dataset_open_spa(spa_t *spa, const char *name, int mode, void *tag, dsl_dataset_t **dsp); int dsl_dataset_open(const char *name, int mode, void *tag, dsl_dataset_t **dsp); -dsl_dataset_t *dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj, - const char *tail, int mode, void *tag); +int dsl_dataset_open_obj(struct dsl_pool *dp, uint64_t dsobj, + const char *tail, int mode, void *tag, dsl_dataset_t **); void dsl_dataset_name(dsl_dataset_t *ds, char *name); void dsl_dataset_close(dsl_dataset_t *ds, int mode, void *tag); int dsl_dataset_create_sync(dsl_dir_t *pds, const char *fullname, @@ -134,8 +133,8 @@ void dsl_dataset_sync(dsl_dataset_t *os, dmu_tx_t *tx); void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); void dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); -int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth, - dmu_tx_t *tx); +int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); +uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); void dsl_dataset_stats(dsl_dataset_t *os, dmu_objset_stats_t *dds); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h index 0499d731e6..5c23fdc497 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_dir.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_dir.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
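dsl_dataset_open_obj() and its neighbors above follow the same conversion as the DMU routines: instead of returning a pointer and failing by panic, they return 0 or an errno and hand the object back through an out parameter. The shape of that conversion in a self-contained sketch (dataset_open_obj() is a made-up example, not the DSL function):

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

typedef struct dataset {
	int ds_object;
} dataset_t;

/*
 * Old style, for contrast: return the object, no way to report errors.
 *	dataset_t *dataset_open_obj(int obj);
 *
 * New style: return 0 or an errno; hand the object back via *dspp.
 */
static int
dataset_open_obj(int obj, dataset_t **dspp)
{
	dataset_t *ds;

	if (obj < 0)
		return (ENOENT);	/* caller decides how to handle it */
	if ((ds = malloc(sizeof (*ds))) == NULL)
		return (ENOMEM);
	ds->ds_object = obj;
	*dspp = ds;
	return (0);
}

int
main(void)
{
	dataset_t *ds;
	int err;

	if ((err = dataset_open_obj(7, &ds)) != 0) {
		(void) fprintf(stderr, "open failed: %d\n", err);
		return (1);
	}
	(void) printf("opened object %d\n", ds->ds_object);
	free(ds);
	return (0);
}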
*/ @@ -98,11 +97,11 @@ struct dsl_dir { }; void dsl_dir_close(dsl_dir_t *dd, void *tag); -dsl_dir_t *dsl_dir_open(const char *name, void *tag, const char **tail); -dsl_dir_t *dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, +int dsl_dir_open(const char *name, void *tag, dsl_dir_t **, const char **tail); +int dsl_dir_open_spa(spa_t *spa, const char *name, void *tag, dsl_dir_t **, const char **tailp); -dsl_dir_t *dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, - const char *tail, void *tag); +int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, + const char *tail, void *tag, dsl_dir_t **); void dsl_dir_name(dsl_dir_t *dd, char *buf); int dsl_dir_is_private(dsl_dir_t *dd); int dsl_dir_create_sync(dsl_dir_t *pds, const char *name, dmu_tx_t *tx); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h index 4fca4548ad..2eab6ae945 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,7 +66,7 @@ typedef struct dsl_pool { krwlock_t dp_config_rwlock; } dsl_pool_t; -dsl_pool_t *dsl_pool_open(spa_t *spa, uint64_t txg); +int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); void dsl_pool_close(dsl_pool_t *dp); dsl_pool_t *dsl_pool_create(spa_t *spa, uint64_t txg); void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); diff --git a/usr/src/uts/common/fs/zfs/sys/refcount.h b/usr/src/uts/common/fs/zfs/sys/refcount.h index f9fffd2443..0b7e12f2cb 100644 --- a/usr/src/uts/common/fs/zfs/sys/refcount.h +++ b/usr/src/uts/common/fs/zfs/sys/refcount.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -42,7 +41,7 @@ extern "C" { * particular object, use FTAG (which is a string) for the holder_tag. * Otherwise, use the object that holds the reference. 
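The refcount.h comment later in this hunk gives the tagging rule: use FTAG when a function takes at most one hold, otherwise tag with the object that holds the reference. A toy tagged refcount showing why tags help, since a release must name the same holder as the hold (refcount_add/refcount_remove here are simplified models of the ZFS functions of the same name):

#include <stdio.h>
#include <assert.h>

#define	FTAG		((char *)__func__)
#define	MAXHOLDS	16

typedef struct refcount {
	const void *rc_tags[MAXHOLDS];
	int rc_count;
} refcount_t;

/* Record the hold together with who took it. */
static void
refcount_add(refcount_t *rc, const void *tag)
{
	assert(rc->rc_count < MAXHOLDS);
	rc->rc_tags[rc->rc_count++] = tag;
}

/* A release must match an outstanding hold; return remaining holds. */
static int
refcount_remove(refcount_t *rc, const void *tag)
{
	int i;

	for (i = 0; i < rc->rc_count; i++) {
		if (rc->rc_tags[i] == tag) {
			rc->rc_tags[i] = rc->rc_tags[--rc->rc_count];
			return (rc->rc_count);
		}
	}
	assert(!"release without matching hold");
	return (-1);
}

int
main(void)
{
	refcount_t rc = { { 0 }, 0 };

	refcount_add(&rc, FTAG);	/* FTAG is "main" here */
	(void) printf("remaining holds: %d\n", refcount_remove(&rc, FTAG));
	return (0);
}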
*/ -#define FTAG ((void*)__func__) +#define FTAG ((char *)__func__) #if defined(DEBUG) || !defined(_KERNEL) typedef struct reference { diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index fbe2822a13..2c8a43bb37 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -292,21 +291,30 @@ typedef struct blkptr { /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, void *tag); -extern int spa_get_stats(const char *pool, nvlist_t **config); +extern int spa_get_stats(const char *pool, nvlist_t **config, + char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, char *altroot); extern int spa_import(const char *pool, nvlist_t *config, char *altroot); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); extern int spa_export(char *pool); +extern int spa_reset(char *pool); +extern void spa_async_request(spa_t *spa, int flag); +extern void spa_async_suspend(spa_t *spa); +extern void spa_async_resume(spa_t *spa); +extern spa_t *spa_inject_addref(char *pool); +extern void spa_inject_delref(spa_t *spa); + +#define SPA_ASYNC_REOPEN 0x01 +#define SPA_ASYNC_REPLACE_DONE 0x02 +#define SPA_ASYNC_SCRUB 0x04 +#define SPA_ASYNC_RESILVER 0x08 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); -extern int spa_vdev_add_unlocked(spa_t *spa, nvlist_t *nvroot); -extern int spa_vdev_attach(spa_t *spa, const char *path, nvlist_t *nvroot, +extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing); -extern int spa_vdev_detach(spa_t *spa, const char *path, uint64_t guid, - int replace_done); -extern void spa_vdev_replace_done(spa_t *spa); +extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); /* scrubbing */ @@ -314,6 +322,7 @@ extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force); extern void spa_scrub_suspend(spa_t *spa); extern void spa_scrub_resume(spa_t *spa); extern void spa_scrub_restart(spa_t *spa, uint64_t txg); +extern void spa_scrub_throttle(spa_t *spa, int direction); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ @@ -345,8 +354,8 @@ extern void spa_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); /* Pool configuration lock */ -extern void spa_config_enter(spa_t *spa, krw_t rw); -extern void spa_config_exit(spa_t *spa); +extern void spa_config_enter(spa_t *spa, krw_t rw, void *tag); +extern void spa_config_exit(spa_t *spa, void *tag); extern boolean_t spa_config_held(spa_t *spa, krw_t rw); /* Pool vdev add/remove lock */ @@ -383,6 +392,23 @@ extern uint64_t spa_get_random(uint64_t range); extern void sprintf_blkptr(char *buf, int len, blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern void spa_evict_all(void); +extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid); + +/* error handling 
*/ +struct zbookmark; +struct zio; +extern void spa_log_error(spa_t *spa, struct zio *zio); +extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t stateoroffset, uint64_t length); +extern void zfs_post_ok(spa_t *spa, vdev_t *vd); +extern uint64_t spa_get_errlog_size(spa_t *spa); +extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); +extern void spa_errlog_rotate(spa_t *spa); +extern void spa_errlog_drain(spa_t *spa); +extern void spa_errlog_sync(spa_t *spa, uint64_t txg); +extern int spa_bookmark_name(spa_t *spa, struct zbookmark *zb, char *ds, + size_t dsname, char *obj, size_t objname, char *range, size_t rangelen); +extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); /* Initialization and termination */ extern void spa_init(int flags); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 0fcef6c48b..e9192956c3 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -46,27 +45,33 @@ extern "C" { typedef struct spa_config_lock { kmutex_t scl_lock; - uint64_t scl_count; + refcount_t scl_count; kthread_t *scl_writer; kcondvar_t scl_cv; } spa_config_lock_t; +typedef struct spa_error_entry { + zbookmark_t se_bookmark; + char *se_name; + avl_node_t se_avl; +} spa_error_entry_t; + struct spa { /* * Fields protected by spa_namespace_lock. 
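The error-handling additions above (spa_log_error(), the spa_errlist_* trees keyed by spa_error_entry_t, and the persistent errlog objects) amount to: remember each failing bookmark once in memory, then sync the list out to a ZAP object. A toy model of the dedup step (a flat array stands in for the AVL tree, and the comparison is a plain field-wise memcmp):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Same shape as the zbookmark_t defined in zio.h below. */
typedef struct zbookmark {
	uint64_t zb_objset;
	uint64_t zb_object;
	int64_t zb_level;
	uint64_t zb_blkid;
} zbookmark_t;

typedef struct error_entry {
	zbookmark_t se_bookmark;
	int se_valid;
} error_entry_t;

#define	ERRLIST_SIZE	64
static error_entry_t errlist[ERRLIST_SIZE];

/* Model of spa_log_error(): record each failing bookmark once. */
static void
log_error(const zbookmark_t *zb)
{
	int i, slot = -1;

	for (i = 0; i < ERRLIST_SIZE; i++) {
		if (errlist[i].se_valid &&
		    memcmp(&errlist[i].se_bookmark, zb, sizeof (*zb)) == 0)
			return;		/* already logged */
		if (!errlist[i].se_valid && slot == -1)
			slot = i;
	}
	if (slot != -1) {
		errlist[slot].se_bookmark = *zb;
		errlist[slot].se_valid = 1;
	}
}

int
main(void)
{
	zbookmark_t zb = { 21, 5, 0, 17 };
	int i, n = 0;

	log_error(&zb);
	log_error(&zb);		/* duplicate: not logged twice */
	for (i = 0; i < ERRLIST_SIZE; i++)
		n += errlist[i].se_valid;
	(void) printf("%d error(s) logged\n", n);
	return (0);
}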
*/ char *spa_name; avl_node_t spa_avl; - int spa_anon; nvlist_t *spa_config; uint64_t spa_config_txg; /* txg of last config change */ spa_config_lock_t spa_config_lock; /* configuration changes */ kmutex_t spa_config_cache_lock; /* for spa_config RW_READER */ int spa_sync_pass; /* iterate-to-convergence */ int spa_state; /* pool state */ - uint8_t spa_minref; /* min refcnt of open pool */ + int spa_inject_ref; /* injection references */ uint8_t spa_traverse_wanted; /* traverse lock wanted */ - taskq_t *spa_vdev_retry_taskq; + uint8_t spa_sync_on; /* sync threads are running */ + spa_load_state_t spa_load_state; /* current load operation */ taskq_t *spa_zio_issue_taskq[ZIO_TYPES]; taskq_t *spa_zio_intr_taskq[ZIO_TYPES]; dsl_pool_t *spa_dsl_pool; @@ -88,18 +93,33 @@ struct spa { kthread_t *spa_scrub_thread; /* scrub/resilver thread */ traverse_handle_t *spa_scrub_th; /* scrub traverse handle */ uint64_t spa_scrub_restart_txg; /* need to restart */ + uint64_t spa_scrub_mintxg; /* min txg we'll scrub */ uint64_t spa_scrub_maxtxg; /* max txg we'll scrub */ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ + int64_t spa_scrub_throttled; /* over-throttle scrub I/Os */ uint64_t spa_scrub_errors; /* scrub I/O error count */ + int spa_scrub_suspended; /* tell scrubber to suspend */ kcondvar_t spa_scrub_cv; /* scrub thread state change */ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_stop; /* tell scrubber to stop */ - uint8_t spa_scrub_suspend; /* tell scrubber to suspend */ uint8_t spa_scrub_active; /* active or suspended? */ uint8_t spa_scrub_type; /* type of scrub we're doing */ - int spa_sync_on; /* sync threads are running */ + kmutex_t spa_async_lock; /* protect async state */ + kthread_t *spa_async_thread; /* thread doing async task */ + int spa_async_suspended; /* async tasks suspended */ + kcondvar_t spa_async_cv; /* wait for thread_exit() */ + uint16_t spa_async_tasks; /* async task mask */ char *spa_root; /* alternate root directory */ kmutex_t spa_uberblock_lock; /* vdev_uberblock_load_done() */ + uint64_t spa_ena; /* spa-wide ereport ENA */ + boolean_t spa_last_open_failed; /* true if last open failed */ + kmutex_t spa_errlog_lock; /* error log lock */ + uint64_t spa_errlog_last; /* last error log object */ + uint64_t spa_errlog_scrub; /* scrub error log object */ + kmutex_t spa_errlist_lock; /* error list/ereport lock */ + avl_tree_t spa_errlist_last; /* last error list */ + avl_tree_t spa_errlist_scrub; /* scrub error list */ + int spa_scrub_finished; /* indicator to rotate logs */ /* * spa_refcnt must be the last element because it changes size based on * compilation options.
In order for the MDB module to function diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index 86d2f1b1ab..f3d7379049 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -60,11 +60,10 @@ typedef struct vdev_knob { extern int vdev_open(vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg); -extern void vdev_init(vdev_t *, uint64_t txg); -extern void vdev_reopen(vdev_t *, zio_t **zq); +extern int vdev_init(vdev_t *, uint64_t txg); +extern void vdev_reopen(vdev_t *); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); -extern vdev_t *vdev_lookup_by_path(vdev_t *vd, const char *path); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); @@ -73,16 +72,16 @@ extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, extern const char *vdev_description(vdev_t *vd); -extern void vdev_metaslab_init(vdev_t *vd, uint64_t txg); +extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_stat_update(zio_t *zio); extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete); -extern void vdev_checksum_error(zio_t *zio, vdev_t *vd); extern int vdev_getspec(spa_t *spa, uint64_t vdev, char **vdev_spec); -extern void vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux); +extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, + vdev_aux_t aux); extern void vdev_space_update(vdev_t *vd, uint64_t space_delta, uint64_t alloc_delta); @@ -92,11 +91,10 @@ extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); extern void vdev_io_start(zio_t *zio); extern void vdev_io_done(zio_t *zio); -extern int vdev_online(spa_t *spa, const char *path); -extern int vdev_offline(spa_t *spa, const char *path, int istmp); +extern int vdev_online(spa_t *spa, uint64_t guid); +extern int vdev_offline(spa_t *spa, uint64_t guid, int istmp); +extern void vdev_clear(spa_t *spa, vdev_t *vd); -extern int vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, - uint64_t arg); extern int vdev_error_inject(vdev_t *vd, zio_t *zio); extern int vdev_is_dead(vdev_t *vd); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 53a202a906..2dfc45edff 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -103,9 +103,11 @@ struct vdev_cache { struct vdev_queue { uint64_t vq_min_pending; uint64_t vq_max_pending; + uint64_t vq_scrub_limit; uint64_t vq_agg_limit; uint64_t vq_time_shift; uint64_t vq_ramp_rate; + uint64_t vq_scrub_count; avl_tree_t vq_deadline_tree; avl_tree_t vq_read_tree; avl_tree_t vq_write_tree; @@ -150,10 +152,9 @@ struct vdev { txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ uint8_t vdev_dirty[TXG_SIZE]; /* per-txg dirty flags */ - int vdev_is_dirty; /* on config dirty list? */ + uint8_t vdev_is_dirty; /* on config dirty list? */ + uint8_t vdev_reopen_wanted; /* async reopen wanted? */ list_node_t vdev_dirty_node; /* config dirty list */ - zio_t *vdev_io_retry; /* I/O retry list */ - list_t vdev_io_pending; /* I/O pending list */ /* * Leaf vdev state. 
@@ -173,6 +174,8 @@ struct vdev { uint8_t vdev_detached; /* device detached? */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ vdev_cache_t vdev_cache; /* physical block cache */ + uint64_t vdev_not_present; /* not present during import */ + hrtime_t vdev_last_try; /* last reopen time */ /* * For DTrace to work in userland (libzpool) context, these fields must @@ -183,8 +186,6 @@ struct vdev { */ kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ kmutex_t vdev_dirty_lock; /* vdev_dirty[] */ - kmutex_t vdev_io_lock; /* vdev_io_pending list */ - kcondvar_t vdev_io_cv; /* vdev_io_pending list empty? */ kmutex_t vdev_stat_lock; /* vdev_stat */ }; @@ -260,7 +261,7 @@ extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ -extern int vdev_load(vdev_t *vd, int import); +extern int vdev_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); extern void vdev_dirty(vdev_t *vd, uint8_t flags, uint64_t txg); diff --git a/usr/src/uts/common/fs/zfs/sys/zap_impl.h b/usr/src/uts/common/fs/zfs/sys/zap_impl.h index 9fb6a6c5a4..e77a2efa61 100644 --- a/usr/src/uts/common/fs/zfs/sys/zap_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zap_impl.h @@ -199,7 +199,7 @@ void zap_put_leaf(struct zap_leaf *l); int fzap_add_cd(zap_t *zap, const char *name, uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, dmu_tx_t *tx, struct zap_leaf **lp); + const void *val, uint32_t cd, dmu_tx_t *tx); void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); #ifdef __cplusplus diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h index 2ea27493f9..34057e83c9 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -103,7 +102,6 @@ int zfs_zaccess_rename(struct znode *, struct znode *, struct znode *, struct znode *, cred_t *cr); int zfs_zaccess_v4_perm(struct znode *, int, cred_t *); void zfs_acl_free(zfs_acl_t *); -zfs_acl_t *zfs_acl_node_read(struct znode *); #endif diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h index c914b23570..14ad31e629 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
@@ -31,6 +30,7 @@ #include <sys/cred.h> #include <sys/dmu.h> +#include <sys/zio.h> #ifdef __cplusplus extern "C" { @@ -66,7 +66,7 @@ typedef struct dmu_replay_record { char drr_toname[MAXNAMELEN]; } drr_begin; struct drr_end { - uint64_t drr_checksum; + zio_cksum_t drr_checksum; } drr_end; struct drr_object { uint64_t drr_object; @@ -97,15 +97,31 @@ typedef struct dmu_replay_record { } drr_u; } dmu_replay_record_t; +typedef struct zinject_record { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; +} zinject_record_t; + +#define ZINJECT_NULL 0x1 +#define ZINJECT_FLUSH_ARC 0x2 +#define ZINJECT_UNLOAD_SPA 0x4 + typedef struct zfs_cmd { char zc_name[MAXNAMELEN]; char zc_prop_name[MAXNAMELEN]; char zc_prop_value[MAXPATHLEN]; char zc_root[MAXPATHLEN]; - char zc_filename[MAXPATHLEN]; + char zc_filename[MAXNAMELEN]; uint32_t zc_intsz; uint32_t zc_numints; - uint64_t zc_pool_guid; + uint64_t zc_guid; uint64_t zc_config_src; /* really (char *) */ uint64_t zc_config_src_size; uint64_t zc_config_dst; /* really (char *) */ @@ -116,9 +132,10 @@ typedef struct zfs_cmd { uint64_t zc_volsize; uint64_t zc_volblocksize; uint64_t zc_objset_type; - dmu_object_info_t zc_object_info; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; + zinject_record_t zc_inject_record; + zbookmark_t zc_bookmark; } zfs_cmd_t; #define ZVOL_MAX_MINOR (1 << 16) diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h index f9331be00a..02f4b3b247 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -133,8 +132,6 @@ typedef struct zfs_dirlock { struct zfs_dirlock *dl_next; /* next in z_dirlocks list */ } zfs_dirlock_t; -struct zcache_state; - typedef struct znode { struct zfsvfs *z_zfsvfs; vnode_t *z_vnode; @@ -150,16 +147,12 @@ typedef struct znode { uint8_t z_atime_dirty; /* atime needs to be synced */ uint8_t z_dbuf_held; /* Is z_dbuf already held? */ uint8_t z_zn_prefetch; /* Prefetch znodes? */ - uint_t z_mapcnt; /* number of memory maps to file */ uint_t z_blksz; /* block size in bytes */ uint_t z_seq; /* modification sequence number */ + uint64_t z_mapcnt; /* number of pages mapped to file */ uint64_t z_last_itx; /* last ZIL itx on this znode */ kmutex_t z_acl_lock; /* acl data lock */ list_node_t z_link_node; /* all znodes in fs link */ - list_node_t z_zcache_node; - struct zcache_state *z_zcache_state; - uint64_t z_zcache_access; - /* * These are dmu managed fields. 
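zinject_record_t above is the wire format for fault injection: a handler compares each I/O against the record's objset/object/level and offset range and, if it matches (subject to zi_freq), fails the I/O with zi_error. A guess at the general shape of that matching, as a standalone program (the struct is trimmed and the logic is illustrative, not zio_inject.c):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

/* Trimmed zinject_record_t: just the fields this sketch matches on. */
typedef struct zinject_record {
	uint64_t zi_objset;
	uint64_t zi_object;
	uint64_t zi_start;
	uint64_t zi_end;
	uint32_t zi_level;
	uint32_t zi_error;
	uint32_t zi_freq;	/* inject on this %% of matching I/Os */
} zinject_record_t;

/* Return the injected errno if the I/O matches the record, else 0. */
static int
handle_injection(const zinject_record_t *zi, uint64_t objset,
    uint64_t object, uint32_t level, uint64_t offset)
{
	if (objset != zi->zi_objset || object != zi->zi_object ||
	    level != zi->zi_level)
		return (0);
	if (offset < zi->zi_start || offset > zi->zi_end)
		return (0);
	if (zi->zi_freq != 0 && (uint32_t)(rand() % 100) >= zi->zi_freq)
		return (0);
	return ((int)zi->zi_error);
}

int
main(void)
{
	zinject_record_t zi = { 21, 5, 0, UINT64_MAX, 0, 5 /* EIO */, 0 };

	(void) printf("injected error = %d\n",
	    handle_injection(&zi, 21, 5, 0, 8192));
	return (0);
}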
*/ @@ -241,14 +234,12 @@ extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, dmu_tx_t *, cred_t *cr); extern void zfs_znode_init(void); extern void zfs_znode_fini(void); -extern znode_t *zfs_znode_alloc(zfsvfs_t *, dmu_buf_t *, uint64_t, int); extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); extern void zfs_zinactive(znode_t *); extern void zfs_znode_delete(znode_t *, dmu_tx_t *); extern void zfs_znode_free(znode_t *); extern int zfs_delete_thread_target(zfsvfs_t *zfsvfs, int nthreads); extern void zfs_delete_wait_empty(zfsvfs_t *zfsvfs); -extern void zfs_zcache_flush(zfsvfs_t *zfsvf); extern void zfs_remove_op_tables(); extern int zfs_create_op_tables(); extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr); diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 5d3227e546..d80310f2fa 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -109,23 +108,25 @@ enum zio_compress { #define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) #define ZIO_PRIORITY_TABLE_SIZE 10 -#define ZIO_FLAG_MUSTSUCCEED 0x0000 -#define ZIO_FLAG_CANFAIL 0x0001 -#define ZIO_FLAG_FAILFAST 0x0002 -#define ZIO_FLAG_CONFIG_HELD 0x0004 +#define ZIO_FLAG_MUSTSUCCEED 0x00000 +#define ZIO_FLAG_CANFAIL 0x00001 +#define ZIO_FLAG_FAILFAST 0x00002 +#define ZIO_FLAG_CONFIG_HELD 0x00004 -#define ZIO_FLAG_DONT_CACHE 0x0010 -#define ZIO_FLAG_DONT_QUEUE 0x0020 -#define ZIO_FLAG_DONT_PROPAGATE 0x0040 -#define ZIO_FLAG_DONT_RETRY 0x0080 +#define ZIO_FLAG_DONT_CACHE 0x00010 +#define ZIO_FLAG_DONT_QUEUE 0x00020 +#define ZIO_FLAG_DONT_PROPAGATE 0x00040 +#define ZIO_FLAG_DONT_RETRY 0x00080 -#define ZIO_FLAG_PHYSICAL 0x0100 -#define ZIO_FLAG_IO_BYPASS 0x0200 -#define ZIO_FLAG_IO_REPAIR 0x0400 -#define ZIO_FLAG_SPECULATIVE 0x0800 +#define ZIO_FLAG_PHYSICAL 0x00100 +#define ZIO_FLAG_IO_BYPASS 0x00200 +#define ZIO_FLAG_IO_REPAIR 0x00400 +#define ZIO_FLAG_SPECULATIVE 0x00800 -#define ZIO_FLAG_RESILVER 0x1000 -#define ZIO_FLAG_SCRUB 0x2000 +#define ZIO_FLAG_RESILVER 0x01000 +#define ZIO_FLAG_SCRUB 0x02000 + +#define ZIO_FLAG_NOBOOKMARK 0x10000 #define ZIO_FLAG_GANG_INHERIT \ (ZIO_FLAG_CANFAIL | \ @@ -155,11 +156,39 @@ typedef struct zio_transform zio_transform_t; extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; extern char *zio_type_name[ZIO_TYPES]; +/* + * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely + * identifies any block in the pool. By convention, the meta-objset (MOS) + * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is + * level -1 of the meta-dnode, and intent log blocks (which are chained + * off the root block) have blkid == sequence number. 
In summary: + * + * mos is objset 0 + * meta-dnode is object 0 + * root block is <objset, 0, -1, 0> + * intent log is <objset, 0, -1, ZIL sequence number> + * + * Note: this structure is called a bookmark because its first purpose was + * to remember where to resume a pool-wide traverse. The absolute ordering + * for block visitation during traversal is defined in compare_bookmark(). + * + * Note: this structure is passed between userland and the kernel. + * Therefore it must not change size or alignment between 32/64 bit + * compilation options. + */ +typedef struct zbookmark { + uint64_t zb_objset; + uint64_t zb_object; + int64_t zb_level; + uint64_t zb_blkid; +} zbookmark_t; + struct zio { /* Core information about this I/O */ zio_t *io_parent; zio_t *io_root; spa_t *io_spa; + zbookmark_t io_bookmark; int io_checksum; int io_compress; int io_dva_index; @@ -170,6 +199,7 @@ struct zio { zio_t *io_sibling_prev; zio_t *io_sibling_next; zio_transform_t *io_transform_stack; + zio_t *io_logical; /* Callback info */ zio_done_func_t *io_done; @@ -191,8 +221,6 @@ struct zio { avl_tree_t *io_vdev_tree; zio_t *io_delegate_list; zio_t *io_delegate_next; - zio_t *io_retry_next; - list_node_t io_pending; /* Internal pipeline state */ int io_flags; @@ -212,6 +240,9 @@ struct zio { void *io_waiter; kmutex_t io_lock; kcondvar_t io_cv; + + /* FMA state */ + uint64_t io_ena; }; extern zio_t *zio_null(zio_t *pio, spa_t *spa, @@ -222,15 +253,17 @@ extern zio_t *zio_root(spa_t *spa, extern zio_t *zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags); + int priority, int flags, zbookmark_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, int flags, + zbookmark_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, int checksum, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, int flags, + zbookmark_t *zb); extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_done_func_t *done, void *private); @@ -285,12 +318,27 @@ extern void zio_set_gang_verifier(zio_t *zio, zio_cksum_t *zcp); extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); +boolean_t zio_should_retry(zio_t *zio); + /* * Initial setup and teardown. 
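Because every member of `zbookmark_t` is a fixed-width integer, the struct is 32 bytes with 8-byte alignment under both 32- and 64-bit compilation, which is what lets it cross the user/kernel boundary safely. The comment defers the traversal ordering to `compare_bookmark()`; as an illustration only (not the actual ordering), a plain lexicographic comparator over the four-tuple might look like this:

```c
#include <stdint.h>
#include <stdio.h>

typedef struct zbookmark {
	uint64_t zb_objset;
	uint64_t zb_object;
	int64_t	 zb_level;
	uint64_t zb_blkid;
} zbookmark_t;

/* illustrative lexicographic compare; the real ordering is compare_bookmark() */
static int
zbookmark_cmp(const zbookmark_t *a, const zbookmark_t *b)
{
	if (a->zb_objset != b->zb_objset)
		return (a->zb_objset < b->zb_objset ? -1 : 1);
	if (a->zb_object != b->zb_object)
		return (a->zb_object < b->zb_object ? -1 : 1);
	if (a->zb_level != b->zb_level)
		return (a->zb_level < b->zb_level ? -1 : 1);
	if (a->zb_blkid != b->zb_blkid)
		return (a->zb_blkid < b->zb_blkid ? -1 : 1);
	return (0);
}

int
main(void)
{
	zbookmark_t root = { 21, 0, -1, 0 };	/* root block of objset 21 */
	zbookmark_t zil  = { 21, 0, -1, 7 };	/* intent log block, sequence 7 */

	printf("sizeof (zbookmark_t) = %zu\n", sizeof (zbookmark_t));
	printf("root %s zil\n", zbookmark_cmp(&root, &zil) < 0 ? "<" : ">=");
	return (0);
}
```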
*/ extern void zio_init(void); extern void zio_fini(void); +/* + * Fault injection + */ +struct zinject_record; +extern uint32_t zio_injection_enabled; +extern int zio_inject_fault(char *name, int flags, int *id, + struct zinject_record *record); +extern int zio_inject_list_next(int *id, char *name, size_t buflen, + struct zinject_record *record); +extern int zio_clear_fault(int id); +extern int zio_handle_fault_injection(zio_t *zio, int error); +extern int zio_handle_device_injection(vdev_t *vd, int error); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h index ba3dc48d28..bb7bd41e0b 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_checksum.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_checksum.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -57,9 +56,11 @@ extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; */ extern zio_checksum_t fletcher_2_native; extern zio_checksum_t fletcher_4_native; +extern zio_checksum_t fletcher_4_incremental_native; extern zio_checksum_t fletcher_2_byteswap; extern zio_checksum_t fletcher_4_byteswap; +extern zio_checksum_t fletcher_4_incremental_byteswap; extern zio_checksum_t zio_checksum_SHA256; diff --git a/usr/src/uts/common/fs/zfs/sys/zio_impl.h b/usr/src/uts/common/fs/zfs/sys/zio_impl.h index 0b2b07de29..e1abf0e49d 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -201,6 +200,9 @@ struct zio_transform { zio_transform_t *zt_next; }; +extern void zio_inject_init(void); +extern void zio_inject_fini(void); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/uberblock.c b/usr/src/uts/common/fs/zfs/uberblock.c index 63bff0ae4b..b6d3fe9595 100644 --- a/usr/src/uts/common/fs/zfs/uberblock.c +++ b/usr/src/uts/common/fs/zfs/uberblock.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. 
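The incremental Fletcher-4 variants declared in the zio_checksum.h hunk above exist so a consumer can checksum a stream piecewise: because the four running sums carry over in the `zio_cksum_t`, feeding the data in fragments yields the same result as one pass. A self-contained sketch of that property, with a local stand-in for `zio_cksum_t` and the Fletcher-4 recurrence (a minimal model, not the shipped implementation):

```c
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct zio_cksum { uint64_t zc_word[4]; } zio_cksum_t;

/* continue a Fletcher-4 sum from whatever state *zcp already holds */
static void
fletcher_4_incremental(const void *buf, uint64_t size, zio_cksum_t *zcp)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = ip + (size / sizeof (uint32_t));
	uint64_t a = zcp->zc_word[0], b = zcp->zc_word[1];
	uint64_t c = zcp->zc_word[2], d = zcp->zc_word[3];

	for (; ip < ipend; ip++) {
		a += ip[0];
		b += a;
		c += b;
		d += c;
	}
	zcp->zc_word[0] = a; zcp->zc_word[1] = b;
	zcp->zc_word[2] = c; zcp->zc_word[3] = d;
}

int
main(void)
{
	uint32_t data[1024];
	zio_cksum_t one = { { 0 } }, two = { { 0 } };
	int i;

	for (i = 0; i < 1024; i++)
		data[i] = i * 2654435761u;	/* arbitrary test pattern */

	/* whole buffer in one call ... */
	fletcher_4_incremental(data, sizeof (data), &one);

	/* ... versus the same stream fed in two pieces */
	fletcher_4_incremental(data, 1000 * sizeof (uint32_t), &two);
	fletcher_4_incremental(data + 1000, 24 * sizeof (uint32_t), &two);

	printf("%s\n", memcmp(&one, &two, sizeof (one)) == 0 ?
	    "checksums match" : "mismatch");
	return (0);
}
```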
+ * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -30,9 +29,6 @@ #include <sys/uberblock_impl.h> #include <sys/vdev_impl.h> -/* Keep the uberblock version in a varialbe so we can get at it with mdb */ -static uint64_t uberblock_version = UBERBLOCK_VERSION; - int uberblock_verify(uberblock_t *ub) { @@ -42,9 +38,6 @@ uberblock_verify(uberblock_t *ub) if (ub->ub_magic != UBERBLOCK_MAGIC) return (EINVAL); - if (ub->ub_version != UBERBLOCK_VERSION) - return (ENOTSUP); - return (0); } diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index 838e1bfc88..363be462ab 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -26,6 +26,7 @@ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> #include <sys/spa.h> #include <sys/spa_impl.h> #include <sys/dmu.h> @@ -137,34 +138,6 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev) } vdev_t * -vdev_lookup_by_path(vdev_t *vd, const char *path) -{ - int c; - vdev_t *mvd; - - if (vd->vdev_path != NULL) { - if (vd->vdev_wholedisk == 1) { - /* - * For whole disks, the internal path has 's0', but the - * path passed in by the user doesn't. - */ - if (strlen(path) == strlen(vd->vdev_path) - 2 && - strncmp(path, vd->vdev_path, strlen(path)) == 0) - return (vd); - } else if (strcmp(path, vd->vdev_path) == 0) { - return (vd); - } - } - - for (c = 0; c < vd->vdev_children; c++) - if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != - NULL) - return (mvd); - - return (NULL); -} - -vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { int c; @@ -305,10 +278,6 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; - mutex_init(&vd->vdev_io_lock, NULL, MUTEX_DEFAULT, NULL); - cv_init(&vd->vdev_io_cv, NULL, CV_DEFAULT, NULL); - list_create(&vd->vdev_io_pending, sizeof (zio_t), - offsetof(zio_t, io_pending)); mutex_init(&vd->vdev_dirty_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); @@ -343,9 +312,6 @@ vdev_free_common(vdev_t *vd) mutex_exit(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_dirty_lock); - list_destroy(&vd->vdev_io_pending); - mutex_destroy(&vd->vdev_io_lock); - cv_destroy(&vd->vdev_io_cv); kmem_free(vd, sizeof (vdev_t)); } @@ -402,6 +368,13 @@ vdev_alloc(spa_t *spa, nvlist_t *nv, vdev_t *parent, uint_t id, int alloctype) vd->vdev_wholedisk = -1ULL; /* + * Look for the 'not present' flag. This will only be set if the device + * was not present at the time of import. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &vd->vdev_not_present); + + /* * If we're a top-level vdev, try to load the allocation parameters. 
*/ if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { @@ -536,8 +509,8 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) vdev_config_dirty(tvd); } - ASSERT(svd->vdev_io_retry == NULL); - ASSERT(list_is_empty(&svd->vdev_io_pending)); + tvd->vdev_reopen_wanted = svd->vdev_reopen_wanted; + svd->vdev_reopen_wanted = 0; } static void @@ -611,7 +584,7 @@ vdev_remove_parent(vdev_t *cvd) vdev_free(mvd); } -void +int vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; @@ -621,6 +594,7 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; space_map_obj_t *smo = vd->vdev_smo; metaslab_t **mspp = vd->vdev_ms; + int ret; dprintf("%s oldc %llu newc %llu\n", vdev_description(vd), oldc, newc); @@ -638,21 +612,29 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) ms_array = kmem_zalloc(newc * sizeof (uint64_t), KM_SLEEP); - dmu_read(spa->spa_meta_objset, vd->vdev_ms_array, - 0, newc * sizeof (uint64_t), ms_array); + if ((ret = dmu_read(spa->spa_meta_objset, + vd->vdev_ms_array, 0, + newc * sizeof (uint64_t), ms_array)) != 0) { + kmem_free(ms_array, newc * sizeof (uint64_t)); + goto error; + } for (c = 0; c < newc; c++) { if (ms_array[c] == 0) continue; - db = dmu_bonus_hold(spa->spa_meta_objset, - ms_array[c]); - dmu_buf_read(db); + if ((ret = dmu_bonus_hold( + spa->spa_meta_objset, ms_array[c], + FTAG, &db)) != 0) { + kmem_free(ms_array, + newc * sizeof (uint64_t)); + goto error; + } ASSERT3U(db->db_size, ==, sizeof (*smo)); bcopy(db->db_data, &vd->vdev_smo[c], db->db_size); ASSERT3U(vd->vdev_smo[c].smo_object, ==, ms_array[c]); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); } kmem_free(ms_array, newc * sizeof (uint64_t)); } @@ -674,6 +656,21 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) kmem_free(mspp, oldc * sizeof (*mspp)); } + return (0); + +error: + /* + * On error, undo any partial progress we may have made, and restore the + * old metaslab values. 
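`vdev_metaslab_init()` now returns an error, and the new `error:` label below frees the partially built arrays and restores the old `vdev_smo`/`vdev_ms` values before returning. That grow-then-unwind pattern, distilled into a standalone sketch with a hypothetical `fill()` callback standing in for the dmu reads:

```c
#include <errno.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* hypothetical fill callback; fails at slot 5 to exercise the unwind path */
static int
fill(uint64_t *slot, size_t c)
{
	if (c == 5)
		return (EIO);
	*slot = c;
	return (0);
}

static int
grow_array(uint64_t **arrp, size_t *countp, size_t newc)
{
	uint64_t *oldarr = *arrp;
	size_t oldc = *countp;
	uint64_t *newarr;
	size_t c;
	int err;

	newarr = calloc(newc, sizeof (uint64_t));
	if (newarr == NULL)
		return (ENOMEM);
	if (oldc != 0)
		memcpy(newarr, oldarr, oldc * sizeof (uint64_t));

	*arrp = newarr;
	*countp = newc;

	for (c = oldc; c < newc; c++)
		if ((err = fill(&newarr[c], c)) != 0)
			goto error;

	free(oldarr);
	return (0);

error:
	/* undo partial progress and restore the old values */
	free(newarr);
	*arrp = oldarr;
	*countp = oldc;
	return (err);
}

int
main(void)
{
	uint64_t *arr = NULL;
	size_t count = 0;
	int err;

	err = grow_array(&arr, &count, 4);
	printf("grow to 4: %d, count %zu\n", err, count);
	err = grow_array(&arr, &count, 8);	/* EIO; count stays 4 */
	printf("grow to 8: %d, count %zu\n", err, count);
	free(arr);
	return (0);
}
```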
+ */ + kmem_free(vd->vdev_smo, newc * sizeof (*smo)); + kmem_free(vd->vdev_ms, newc * sizeof (*mspp)); + + vd->vdev_smo = smo; + vd->vdev_ms = mspp; + vd->vdev_ms_count = oldc; + + return (ret); } void @@ -735,39 +732,39 @@ vdev_open(vdev_t *vd) if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); - dprintf("OFFLINE: %s = ENXIO\n", vdev_description(vd)); - vd->vdev_state = VDEV_STATE_OFFLINE; + vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); return (ENXIO); } error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); + if (zio_injection_enabled && error == 0) + error = zio_handle_device_injection(vd, ENXIO); + dprintf("%s = %d, osize %llu, state = %d\n", vdev_description(vd), error, osize, vd->vdev_state); if (error) { - dprintf("%s in %s failed to open, error %d, aux %d\n", - vdev_description(vd), - vdev_description(vd->vdev_parent), - error, + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, vd->vdev_stat.vs_aux); - - vd->vdev_state = VDEV_STATE_CANT_OPEN; return (error); } vd->vdev_state = VDEV_STATE_HEALTHY; for (c = 0; c < vd->vdev_children; c++) - if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) - vd->vdev_state = VDEV_STATE_DEGRADED; + if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_NONE); + break; + } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); if (vd->vdev_children == 0) { if (osize < SPA_MINDEVSIZE) { - vd->vdev_state = VDEV_STATE_CANT_OPEN; - vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_TOO_SMALL); return (EOVERFLOW); } psize = osize; @@ -775,8 +772,8 @@ vdev_open(vdev_t *vd) } else { if (osize < SPA_MINDEVSIZE - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { - vd->vdev_state = VDEV_STATE_CANT_OPEN; - vd->vdev_stat.vs_aux = VDEV_AUX_TOO_SMALL; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_TOO_SMALL); return (EOVERFLOW); } psize = 0; @@ -796,9 +793,8 @@ vdev_open(vdev_t *vd) * Make sure the alignment requirement hasn't increased. */ if (ashift > vd->vdev_ashift) { - dprintf("%s: ashift grew\n", vdev_description(vd)); - vd->vdev_state = VDEV_STATE_CANT_OPEN; - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); return (EINVAL); } @@ -806,9 +802,8 @@ vdev_open(vdev_t *vd) * Make sure the device hasn't shrunk. */ if (asize < vd->vdev_asize) { - dprintf("%s: device shrank\n", vdev_description(vd)); - vd->vdev_state = VDEV_STATE_CANT_OPEN; - vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); return (EINVAL); } @@ -818,11 +813,29 @@ vdev_open(vdev_t *vd) */ if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize) { - dprintf("%s: device grew\n", vdev_description(vd)); vd->vdev_asize = asize; } } + /* + * If we were able to open a vdev that was marked permanently + * unavailable, clear that state now. + */ + if (vd->vdev_not_present) + vd->vdev_not_present = 0; + + /* + * This allows the ZFS DE to close cases appropriately. If a device + * goes away and later returns, we want to close the associated case. + * But it's not enough to simply post this only when a device goes from + * CANT_OPEN -> HEALTHY. If we reboot the system and the device is + * back, we also need to close the case (otherwise we will try to replay + * it). So we have to post this notifier every time. Since this only + * occurs during pool open or error recovery, this should not be an + * issue. 
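The `zio_injection_enabled` check added to `vdev_open()` (and to the disk/file I/O-done paths later in this diff) gates the injection framework behind a global flag, so when no faults are registered the common case pays only a load and a branch. The gate pattern in isolation, with a stub standing in for `zio_handle_device_injection()`:

```c
#include <errno.h>
#include <stdio.h>

static unsigned int injection_enabled;	/* bumped when a handler registers */

/* stand-in for zio_handle_device_injection(): match registered faults */
static int
handle_injection(int err_to_inject)
{
	return (err_to_inject);
}

static int
device_open(void)
{
	int error = 0;	/* pretend the real open succeeded */

	/* only consult the framework when it can possibly matter */
	if (injection_enabled && error == 0)
		error = handle_injection(ENXIO);
	return (error);
}

int
main(void)
{
	printf("open: %d\n", device_open());	/* 0: no faults registered */
	injection_enabled = 1;
	printf("open: %d\n", device_open());	/* ENXIO: fault injected */
	return (0);
}
```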
+ */ + zfs_post_ok(vd->vdev_spa, vd); + return (0); } @@ -832,8 +845,6 @@ vdev_open(vdev_t *vd) void vdev_close(vdev_t *vd) { - ASSERT3P(list_head(&vd->vdev_io_pending), ==, NULL); - vd->vdev_ops->vdev_op_close(vd); if (vd->vdev_cache_active) { @@ -846,43 +857,29 @@ vdev_close(vdev_t *vd) vd->vdev_state = VDEV_STATE_OFFLINE; else vd->vdev_state = VDEV_STATE_CLOSED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } void -vdev_reopen(vdev_t *vd, zio_t **rq) +vdev_reopen(vdev_t *vd) { - vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; int c; + ASSERT(spa_config_held(spa, RW_WRITER)); + if (vd == rvd) { - ASSERT(rq == NULL); for (c = 0; c < rvd->vdev_children; c++) - vdev_reopen(rvd->vdev_child[c], NULL); + vdev_reopen(rvd->vdev_child[c]); return; } /* only valid for top-level vdevs */ ASSERT3P(vd, ==, vd->vdev_top); - /* - * vdev_state can change when spa_config_lock is held as writer, - * or when it's held as reader and we're doing a vdev_reopen(). - * To handle the latter case, we grab rvd's io_lock to serialize - * reopens. This ensures that there's never more than one vdev - * state changer active at a time. - */ - mutex_enter(&rvd->vdev_io_lock); - - mutex_enter(&vd->vdev_io_lock); - while (list_head(&vd->vdev_io_pending) != NULL) - cv_wait(&vd->vdev_io_cv, &vd->vdev_io_lock); vdev_close(vd); (void) vdev_open(vd); - if (rq != NULL) { - *rq = vd->vdev_io_retry; - vd->vdev_io_retry = NULL; - } - mutex_exit(&vd->vdev_io_lock); /* * Reassess root vdev's health. @@ -892,8 +889,6 @@ vdev_reopen(vdev_t *vd, zio_t **rq) uint64_t state = rvd->vdev_child[c]->vdev_state; rvd->vdev_state = MIN(rvd->vdev_state, state); } - - mutex_exit(&rvd->vdev_io_lock); } int @@ -930,7 +925,7 @@ vdev_create(vdev_t *vd, uint64_t txg) * For creation, we want to try to create all vdevs at once and then undo it * if anything fails; this is much harder if we have pending transactions. */ -void +int vdev_init(vdev_t *vd, uint64_t txg) { /* @@ -942,7 +937,7 @@ vdev_init(vdev_t *vd, uint64_t txg) /* * Initialize the vdev's metaslabs. */ - vdev_metaslab_init(vd, txg); + return (vdev_metaslab_init(vd, txg)); } void @@ -993,9 +988,10 @@ vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) { + spa_t *spa = vd->vdev_spa; int c; - ASSERT(spa_config_held(vd->vdev_spa, RW_WRITER)); + ASSERT(spa_config_held(spa, RW_WRITER)); if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); @@ -1019,6 +1015,12 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) return; } + /* + * Make sure the DTLs are always correct under the scrub lock. 
+ */ + if (vd == spa->spa_root_vdev) + mutex_enter(&spa->spa_scrub_lock); + mutex_enter(&vd->vdev_dtl_lock); space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); @@ -1032,6 +1034,9 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); mutex_exit(&vd->vdev_dtl_lock); } + + if (vd == spa->spa_root_vdev) + mutex_exit(&spa->spa_scrub_lock); } static int @@ -1047,11 +1052,12 @@ vdev_dtl_load(vdev_t *vd) if (smo->smo_object == 0) return (0); - db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object); - dmu_buf_read(db); + if ((error = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object, + FTAG, &db)) != 0) + return (error); ASSERT3U(db->db_size, ==, sizeof (*smo)); bcopy(db->db_data, smo, db->db_size); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); mutex_enter(&vd->vdev_dtl_lock); error = space_map_load(&vd->vdev_dtl_map, smo, SM_ALLOC, @@ -1100,8 +1106,8 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) vdev_config_dirty(vd->vdev_top); } - dmu_free_range(spa->spa_meta_objset, smo->smo_object, - 0, smo->smo_objsize, tx); + VERIFY(0 == dmu_free_range(spa->spa_meta_objset, smo->smo_object, + 0, smo->smo_objsize, tx)); mutex_init(&smlock, NULL, MUTEX_DEFAULT, NULL); @@ -1124,17 +1130,18 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) mutex_exit(&smlock); mutex_destroy(&smlock); - db = dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object); + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, smo->smo_object, + FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, ==, sizeof (*smo)); bcopy(smo, db->db_data, db->db_size); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); } int -vdev_load(vdev_t *vd, int import) +vdev_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; int c, error; @@ -1147,7 +1154,7 @@ vdev_load(vdev_t *vd, int import) * Recursively load all children. 
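Throughout this change, `dmu_bonus_hold()` and `dmu_buf_hold()` grow a tag argument plus an error return, and every hold is paired with a `dmu_buf_rele()` carrying the same tag (`FTAG` names the calling function). A toy version of the tagged-hold discipline, simplified to a single remembered tag where the real dbuf layer tracks holders more thoroughly:

```c
#include <assert.h>
#include <stdio.h>

typedef struct buf {
	int b_holds;
	const void *b_tag;	/* toy: remembers only the last hold's tag */
} buf_t;

static int
buf_hold(buf_t *b, const void *tag)
{
	b->b_holds++;
	b->b_tag = tag;
	return (0);		/* the real dmu_buf_hold() can return EIO */
}

static void
buf_rele(buf_t *b, const void *tag)
{
	assert(b->b_tag == tag);	/* mismatched hold/rele pairs trip here */
	b->b_holds--;
}

#define	FTAG	((const void *)__func__)	/* identify holds by caller */

int
main(void)
{
	buf_t b = { 0, NULL };

	if (buf_hold(&b, FTAG) == 0) {
		/* ... read the buffer ... */
		buf_rele(&b, FTAG);
	}
	printf("outstanding holds: %d\n", b.b_holds);
	return (0);
}
```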
*/ for (c = 0; c < vd->vdev_children; c++) - if ((error = vdev_load(vd->vdev_child[c], import)) != 0) + if ((error = vdev_load(vd->vdev_child[c])) != 0) return (error); /* @@ -1166,7 +1173,7 @@ vdev_load(vdev_t *vd, int import) */ if ((label = vdev_label_read_config(vd)) == NULL) { dprintf("can't load label config\n"); - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (0); } @@ -1174,7 +1181,7 @@ vdev_load(vdev_t *vd, int import) if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != spa_guid(spa)) { dprintf("bad or missing pool GUID (%llu)\n", guid); - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); @@ -1184,7 +1191,7 @@ vdev_load(vdev_t *vd, int import) guid != vd->vdev_guid) { dprintf("bad or missing vdev guid (%llu != %llu)\n", guid, vd->vdev_guid); - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); @@ -1201,14 +1208,15 @@ vdev_load(vdev_t *vd, int import) if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state)) { dprintf("missing pool state\n"); - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); nvlist_free(label); return (0); } if (state != POOL_STATE_ACTIVE && - (!import || state != POOL_STATE_EXPORTED)) { + (spa->spa_load_state == SPA_LOAD_OPEN || + state != POOL_STATE_EXPORTED)) { dprintf("pool state not active (%llu)\n", state); nvlist_free(label); return (EBADF); @@ -1227,12 +1235,16 @@ vdev_load(vdev_t *vd, int import) vd->vdev_ms_shift == 0 || vd->vdev_ashift == 0 || vd->vdev_asize == 0) { - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (0); } - vdev_metaslab_init(vd, 0); + if ((error = vdev_metaslab_init(vd, 0)) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + return (0); + } } /* @@ -1243,7 +1255,7 @@ vdev_load(vdev_t *vd, int import) if (error) { dprintf("can't load DTL for %s, error %d\n", vdev_description(vd), error); - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); return (0); } @@ -1344,7 +1356,7 @@ vdev_description(vdev_t *vd) } int -vdev_online(spa_t *spa, const char *path) +vdev_online(spa_t *spa, uint64_t guid) { vdev_t *rvd, *vd; uint64_t txg; @@ -1352,24 +1364,14 @@ vdev_online(spa_t *spa, const char *path) txg = spa_vdev_enter(spa); rvd = spa->spa_root_vdev; - if ((vd = vdev_lookup_by_path(rvd, path)) == NULL) + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); dprintf("ONLINE: %s\n", vdev_description(vd)); vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; - - /* - * Clear the error counts. The idea is that you expect to see all - * zeroes when everything is working, so if you've just onlined a - * device, you don't want to keep hearing about errors from before. 
- */ - vd->vdev_stat.vs_read_errors = 0; - vd->vdev_stat.vs_write_errors = 0; - vd->vdev_stat.vs_checksum_errors = 0; - - vdev_reopen(vd->vdev_top, NULL); + vdev_reopen(vd->vdev_top); spa_config_set(spa, spa_config_generate(spa, rvd, txg, 0)); @@ -1383,7 +1385,7 @@ vdev_online(spa_t *spa, const char *path) } int -vdev_offline(spa_t *spa, const char *path, int istmp) +vdev_offline(spa_t *spa, uint64_t guid, int istmp) { vdev_t *rvd, *vd; uint64_t txg; @@ -1391,7 +1393,7 @@ vdev_offline(spa_t *spa, const char *path, int istmp) txg = spa_vdev_enter(spa); rvd = spa->spa_root_vdev; - if ((vd = vdev_lookup_by_path(rvd, path)) == NULL) + if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); dprintf("OFFLINE: %s\n", vdev_description(vd)); @@ -1416,10 +1418,10 @@ vdev_offline(spa_t *spa, const char *path, int istmp) * undo it and fail the request. */ vd->vdev_offline = B_TRUE; - vdev_reopen(vd->vdev_top, NULL); + vdev_reopen(vd->vdev_top); if (vdev_is_dead(vd->vdev_top)) { vd->vdev_offline = B_FALSE; - vdev_reopen(vd->vdev_top, NULL); + vdev_reopen(vd->vdev_top); return (spa_vdev_exit(spa, NULL, txg, EBUSY)); } @@ -1434,25 +1436,25 @@ vdev_offline(spa_t *spa, const char *path, int istmp) return (spa_vdev_exit(spa, NULL, txg, 0)); } -int -vdev_error_setup(spa_t *spa, const char *path, int mode, int mask, uint64_t arg) +/* + * Clear the error counts associated with this vdev. Unlike vdev_online() and + * vdev_offline(), we assume the spa config is locked. We also clear all + * children. If 'vd' is NULL, then the user wants to clear all vdevs. + */ +void +vdev_clear(spa_t *spa, vdev_t *vd) { - vdev_t *vd; - - spa_config_enter(spa, RW_WRITER); - - if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, path)) == NULL) { - spa_config_exit(spa); - return (ENODEV); - } + int c; - vd->vdev_fault_mode = mode; - vd->vdev_fault_mask = mask; - vd->vdev_fault_arg = arg; + if (vd == NULL) + vd = spa->spa_root_vdev; - spa_config_exit(spa); + vd->vdev_stat.vs_read_errors = 0; + vd->vdev_stat.vs_write_errors = 0; + vd->vdev_stat.vs_checksum_errors = 0; - return (0); + for (c = 0; c < vd->vdev_children; c++) + vdev_clear(spa, vd->vdev_child[c]); } int @@ -1631,24 +1633,6 @@ vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) } /* - * Report checksum errors that a vdev that didn't realize it made. - * This can happen, for example, when RAID-Z combinatorial reconstruction - * infers that one of its components returned bad data. - */ -void -vdev_checksum_error(zio_t *zio, vdev_t *vd) -{ - dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", - vdev_description(vd)); - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_checksum_errors++; - mutex_exit(&vd->vdev_stat_lock); - } -} - -/* * Update the in-core space usage stats for this vdev and the root vdev. */ void @@ -1709,6 +1693,14 @@ static vdev_knob_t vdev_knob[] = { offsetof(struct vdev, vdev_queue.vq_max_pending) }, { + "scrub_limit", + "maximum scrub/resilver I/O queue", + 0, + 10000, + 70, + offsetof(struct vdev, vdev_queue.vq_scrub_limit) + }, + { "agg_limit", "maximum size of aggregated I/Os", 0, @@ -1781,20 +1773,78 @@ vdev_config_clean(vdev_t *vd) } /* - * Set a vdev's state, updating any parent's state as well. + * Set a vdev's state. If this is during an open, we don't update the parent + * state, because we're in the process of opening children depth-first. + * Otherwise, we propagate the change to the parent. 
+ * + * If this routine places a device in a faulted state, an appropriate ereport is + * generated. */ void -vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux) +vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) { - if (state == vd->vdev_state) + uint64_t prev_state; + + if (state == vd->vdev_state) { + vd->vdev_stat.vs_aux = aux; return; + } + + prev_state = vd->vdev_state; vd->vdev_state = state; vd->vdev_stat.vs_aux = aux; + if (state == VDEV_STATE_CANT_OPEN) { + /* + * If we fail to open a vdev during an import, we mark it as + * "not available", which signifies that it was never there to + * begin with. Failure to open such a device is not considered + * an error. + */ + if (!vd->vdev_not_present && + vd != vd->vdev_spa->spa_root_vdev) { + const char *class; + + switch (aux) { + case VDEV_AUX_OPEN_FAILED: + class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; + break; + case VDEV_AUX_CORRUPT_DATA: + class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; + break; + case VDEV_AUX_NO_REPLICAS: + class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; + break; + case VDEV_AUX_BAD_GUID_SUM: + class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; + break; + case VDEV_AUX_TOO_SMALL: + class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; + break; + case VDEV_AUX_BAD_LABEL: + class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; + break; + default: + class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; + } + + zfs_ereport_post(class, vd->vdev_spa, + vd, NULL, prev_state, 0); + } + + if (vd->vdev_spa->spa_load_state == SPA_LOAD_IMPORT && + vd->vdev_ops->vdev_op_leaf) + vd->vdev_not_present = 1; + } + + if (isopen) + return; + if (vd->vdev_parent != NULL) { int c; int degraded = 0, faulted = 0; + int corrupted = 0; vdev_t *parent, *child; parent = vd->vdev_parent; @@ -1804,9 +1854,23 @@ vdev_set_state(vdev_t *vd, vdev_state_t state, vdev_aux_t aux) faulted++; else if (child->vdev_state == VDEV_STATE_DEGRADED) degraded++; + + if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) + corrupted++; } vd->vdev_parent->vdev_ops->vdev_op_state_change( vd->vdev_parent, faulted, degraded); - } + + /* + * Root special: if this is a toplevel vdev that cannot be + * opened due to corrupted metadata, then propagate the root + * vdev's aux state as 'corrupt' rather than 'insufficient + * replicas'. + */ + if (corrupted && vd == vd->vdev_top) + vdev_set_state(vd->vdev_spa->spa_root_vdev, + B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + } } diff --git a/usr/src/uts/common/fs/zfs/vdev_cache.c b/usr/src/uts/common/fs/zfs/vdev_cache.c index e1e7c1a36f..67a8924b52 100644 --- a/usr/src/uts/common/fs/zfs/vdev_cache.c +++ b/usr/src/uts/common/fs/zfs/vdev_cache.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
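The switch added to `vdev_set_state()` maps each `VDEV_AUX_*` reason to an FMA ereport class before posting. Since the mapping is a pure aux-to-string lookup, a table-driven equivalent is a natural alternative shape; the sketch below uses placeholder class strings and a trimmed enum, as the real values live in sys/fm/fs/zfs.h:

```c
#include <stdio.h>

/* trimmed stand-ins for vdev_aux_t and the FM_EREPORT_ZFS_* classes */
enum aux { AUX_OPEN_FAILED, AUX_CORRUPT_DATA, AUX_NO_REPLICAS, AUX_UNKNOWN };

static const char *const aux_class[] = {
	"ereport.fs.zfs.device.open_failed",	/* assumed class strings */
	"ereport.fs.zfs.device.corrupt_data",
	"ereport.fs.zfs.device.no_replicas",
	"ereport.fs.zfs.device.unknown",
};

static const char *
class_for(enum aux a)
{
	if (a < 0 || a > AUX_UNKNOWN)
		a = AUX_UNKNOWN;	/* the switch's default case */
	return (aux_class[a]);
}

int
main(void)
{
	printf("%s\n", class_for(AUX_CORRUPT_DATA));
	return (0);
}
```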
*/ @@ -286,7 +285,8 @@ vdev_cache_read(zio_t *zio) fio = zio_vdev_child_io(zio, NULL, zio->io_vd, cache_offset, ve->ve_data, vc->vc_blocksize, ZIO_TYPE_READ, ZIO_PRIORITY_CACHE_FILL, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY | ZIO_FLAG_NOBOOKMARK, vdev_cache_fill, ve); ve->ve_fill_io = fio; diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index 1556c387b2..b4d7d7a0d2 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -323,6 +323,9 @@ vdev_disk_io_done(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE) vdev_cache_write(zio); + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + zio_next_stage(zio); } diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index a789008e17..a82abf80b7 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -190,6 +189,9 @@ vdev_file_io_done(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE) vdev_cache_write(zio); + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injection(zio->io_vd, EIO); + zio_next_stage(zio); } diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index 1282df0d9a..3571be9064 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -165,8 +165,8 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, zio_nowait(zio_read_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_SPECULATIVE | - ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY)); + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE)); } static void @@ -178,8 +178,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, zio_nowait(zio_write_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_WRITE, - ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_DONT_RETRY)); + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL)); } /* @@ -190,7 +189,7 @@ vdev_config_generate(vdev_t *vd, int getstats) { nvlist_t *nv = NULL; - VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type) == 0); @@ -209,6 +208,9 @@ vdev_config_generate(vdev_t *vd, int getstats) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, vd->vdev_wholedisk) == 0); + if (vd->vdev_not_present) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1) == 0); + if (vd == vd->vdev_top) { VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, vd->vdev_ms_array) == 0); @@ -269,7 +271,6 @@ vdev_label_read_config(vdev_t *vd) { nvlist_t *config = NULL; vdev_phys_t *vp; - uint64_t version; zio_t *zio; int l; @@ -280,8 +281,8 @@ vdev_label_read_config(vdev_t *vd) for (l = 0; l < VDEV_LABELS; l++) { - zio = zio_root(vd->vdev_spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_CONFIG_HELD); + zio = zio_root(vd->vdev_spa, NULL, NULL, ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CONFIG_HELD); vdev_label_read(zio, vd, l, vp, offsetof(vdev_label_t, vl_vdev_phys), @@ -289,10 +290,7 @@ vdev_label_read_config(vdev_t *vd) if (zio_wait(zio) == 0 && nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), - &config, 0) == 0 && - nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, - &version) == 0 && - version == UBERBLOCK_VERSION) + &config, 0) == 0) break; if (config != NULL) { @@ -341,16 +339,15 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg) * Check whether this device is already in use. * Ignore the check if crtxg == 0, which we use for device removal. 
*/ - if (crtxg != 0 && (label = vdev_label_read_config(vd)) != NULL) { - uint64_t version, state, pool_guid, device_guid, txg; + if (crtxg != 0 && + (label = vdev_label_read_config(vd)) != NULL) { + uint64_t state, pool_guid, device_guid, txg; uint64_t mycrtxg = 0; (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, &mycrtxg); - if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, - &version) == 0 && version == UBERBLOCK_VERSION && - nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) == 0 && state == POOL_STATE_ACTIVE && nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0 && @@ -390,7 +387,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg) buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); - if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) != 0) { + if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) != 0) { nvlist_free(label); zio_buf_free(vp, sizeof (vdev_phys_t)); return (EINVAL); @@ -491,7 +488,7 @@ vdev_uberblock_load_done(zio_t *zio) ASSERT3U(zio->io_size, ==, sizeof (uberblock_phys_t)); - if (uberblock_verify(ub) == 0) { + if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&spa->spa_uberblock_lock); if (vdev_uberblock_compare(ub, ubbest) > 0) *ubbest = *ub; @@ -645,7 +642,7 @@ vdev_sync_label(zio_t *zio, vdev_t *vd, int l, uint64_t txg) buf = vp->vp_nvlist; buflen = sizeof (vp->vp_nvlist); - if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, 0) == 0) + if (nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP) == 0) vdev_label_write(zio, vd, l, vp, offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), vdev_sync_label_done, NULL); diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c index 45eb7ce78b..b88b999c6f 100644 --- a/usr/src/uts/common/fs/zfs/vdev_mirror.c +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -209,7 +208,8 @@ vdev_mirror_io_start(zio_t *zio) mm = vdev_mirror_map_alloc(zio); if (zio->io_type == ZIO_TYPE_READ) { - if (zio->io_flags & ZIO_FLAG_SCRUB) { + if ((zio->io_flags & ZIO_FLAG_SCRUB) && + vd->vdev_ops != &vdev_replacing_ops) { /* * For scrubbing reads we need to allocate a read * buffer for each child and issue reads to all @@ -384,11 +384,12 @@ static void vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) { if (faulted == vd->vdev_children) - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) - vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); else - vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } vdev_ops_t vdev_mirror_ops = { diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index 09831e1504..bb838fedd1 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -103,6 +102,8 @@ vdev_queue_fini(vdev_t *vd) { vdev_queue_t *vq = &vd->vdev_queue; + ASSERT(vq->vq_scrub_count == 0); + avl_destroy(&vq->vq_deadline_tree); avl_destroy(&vq->vq_read_tree); avl_destroy(&vq->vq_write_tree); @@ -112,6 +113,28 @@ vdev_queue_fini(vdev_t *vd) } static void +vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) +{ + avl_add(&vq->vq_deadline_tree, zio); + avl_add(zio->io_vdev_tree, zio); + + if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) && + ++vq->vq_scrub_count >= vq->vq_scrub_limit) + spa_scrub_throttle(zio->io_spa, 1); +} + +static void +vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) +{ + if ((zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) && + vq->vq_scrub_count-- >= vq->vq_scrub_limit) + spa_scrub_throttle(zio->io_spa, -1); + + avl_remove(&vq->vq_deadline_tree, zio); + avl_remove(zio->io_vdev_tree, zio); +} + +static void vdev_queue_agg_io_done(zio_t *aio) { zio_t *dio; @@ -182,18 +205,19 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, aio = zio_vdev_child_io(fio, NULL, fio->io_vd, fio->io_offset, buf, size, fio->io_type, ZIO_PRIORITY_NOW, ZIO_FLAG_DONT_QUEUE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_NOBOOKMARK, vdev_queue_agg_io_done, NULL); aio->io_delegate_list = fio; for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { ASSERT(dio->io_type == aio->io_type); + ASSERT(dio->io_vdev_tree == tree); if (dio->io_type == ZIO_TYPE_WRITE) bcopy(dio->io_data, buf + offset, dio->io_size); offset += dio->io_size; - avl_remove(&vq->vq_deadline_tree, dio); - avl_remove(tree, dio); + vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); nagg++; } @@ -211,8 +235,8 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit, return (aio); } - avl_remove(&vq->vq_deadline_tree, fio); - avl_remove(tree, fio); + ASSERT(fio->io_vdev_tree == tree); + vdev_queue_io_remove(vq, fio); avl_add(&vq->vq_pending_tree, fio); @@ -245,8 +269,7 @@ vdev_queue_io(zio_t *zio) zio->io_deadline = (zio->io_timestamp >> vq->vq_time_shift) + zio->io_priority; - avl_add(&vq->vq_deadline_tree, zio); - avl_add(zio->io_vdev_tree, zio); + vdev_queue_io_add(vq, zio); nio = vdev_queue_io_to_issue(vq, vq->vq_min_pending, &func); diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index c2c4985856..157ae5001c 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -32,6 +31,7 @@ #include <sys/zio.h> #include <sys/zio_checksum.h> #include <sys/fs/zfs.h> +#include <sys/fm/fs/zfs.h> /* * Virtual device vector for RAID-Z. @@ -327,6 +327,28 @@ vdev_raidz_io_start(zio_t *zio) zio_wait_children_done(zio); } +/* + * Report a checksum error for a child of a RAID-Z device. 
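The new `vdev_queue_io_add()`/`vdev_queue_io_remove()` helpers keep a running count of queued scrub/resilver I/Os and nudge `spa_scrub_throttle()` by +1 or -1 whenever the count crosses `vq_scrub_limit`. The pre-increment on add and post-decrement on remove are deliberately paired so the throttle calls balance out; a toy model demonstrating that pairing:

```c
#include <stdio.h>

typedef struct queue {
	int q_scrub_count;
	int q_scrub_limit;
	int q_throttled;	/* stands in for spa_scrub_throttle() state */
} queue_t;

static void
queue_add(queue_t *q, int is_scrub)
{
	/* fires for every add that lands at or above the limit */
	if (is_scrub && ++q->q_scrub_count >= q->q_scrub_limit)
		q->q_throttled++;
}

static void
queue_remove(queue_t *q, int is_scrub)
{
	/* fires for every remove that starts at or above the limit */
	if (is_scrub && q->q_scrub_count-- >= q->q_scrub_limit)
		q->q_throttled--;
}

int
main(void)
{
	queue_t q = { 0, 3, 0 };
	int i;

	for (i = 0; i < 5; i++)
		queue_add(&q, 1);
	printf("count %d, throttled %d\n", q.q_scrub_count, q.q_throttled);
	for (i = 0; i < 5; i++)
		queue_remove(&q, 1);
	printf("count %d, throttled %d\n", q.q_scrub_count, q.q_throttled);
	return (0);
}
```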
+ */ +static void +raidz_checksum_error(zio_t *zio, raidz_col_t *rc) +{ + vdev_t *vd = zio->io_vd->vdev_child[rc->rc_col]; + dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", + vdev_description(vd)); + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + } + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); +} + + static void vdev_raidz_io_done(zio_t *zio) { @@ -398,8 +420,7 @@ vdev_raidz_io_done(zio_t *zio) bcopy(rc->rc_data, orig, rc->rc_size); vdev_raidz_reconstruct(rm, c); if (bcmp(orig, rc->rc_data, rc->rc_size) != 0) { - vdev_checksum_error(zio, - vd->vdev_child[rc->rc_col]); + raidz_checksum_error(zio, rc); rc->rc_error = ECKSUM; unexpected_errors++; } @@ -500,8 +521,7 @@ vdev_raidz_io_done(zio_t *zio) * inform it. */ if (rc->rc_tried && rc->rc_error == 0) - vdev_checksum_error(zio, - vd->vdev_child[rc->rc_col]); + raidz_checksum_error(zio, rc); rc->rc_error = ECKSUM; goto done; } @@ -511,9 +531,18 @@ vdev_raidz_io_done(zio_t *zio) } /* - * All combinations failed to checksum. + * All combinations failed to checksum. Generate checksum ereports for + * every one. */ zio->io_error = ECKSUM; + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, vd->vdev_child[rc->rc_col], zio, + rc->rc_offset, rc->rc_size); + } + } done: zio_checksum_verified(zio); @@ -558,11 +587,12 @@ static void vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) { if (faulted > 1) - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); else if (degraded + faulted != 0) - vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); else - vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } vdev_ops_t vdev_raidz_ops = { diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c index 4e44b5bb05..85671d00b1 100644 --- a/usr/src/uts/common/fs/zfs/vdev_root.c +++ b/usr/src/uts/common/fs/zfs/vdev_root.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -79,11 +78,12 @@ static void vdev_root_state_change(vdev_t *vd, int faulted, int degraded) { if (faulted > 0) - vdev_set_state(vd, VDEV_STATE_CANT_OPEN, VDEV_AUX_NO_REPLICAS); + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); else if (degraded != 0) - vdev_set_state(vd, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); else - vdev_set_state(vd, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } vdev_ops_t vdev_root_ops = { diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c index 2866b7f729..8dc17ed4b1 100644 --- a/usr/src/uts/common/fs/zfs/zap.c +++ b/usr/src/uts/common/fs/zfs/zap.c @@ -45,6 +45,7 @@ #include <sys/dmu.h> #include <sys/zfs_context.h> #include <sys/zap.h> +#include <sys/refcount.h> #include <sys/zap_impl.h> #include <sys/zap_leaf.h> @@ -54,8 +55,8 @@ int fzap_default_block_shift = 14; /* 16k blocksize */ static void zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx); static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx); -static zap_leaf_t *zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, - dmu_tx_t *tx, krw_t lt); +static int zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, + dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp); static void zap_leaf_pageout(dmu_buf_t *db, void *vl); @@ -120,8 +121,8 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx) /* * set up block 1 - the first leaf */ - db = dmu_buf_hold(zap->zap_objset, zap->zap_object, - 1<<FZAP_BLOCK_SHIFT(zap)); + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + 1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db)); dmu_buf_will_dirty(db, tx); l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); @@ -131,7 +132,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx) zap_leaf_init(l); kmem_free(l, sizeof (zap_leaf_t)); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); } static int @@ -157,6 +158,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, { uint64_t b, newblk; dmu_buf_t *db_old, *db_new; + int err; int bs = FZAP_BLOCK_SHIFT(zap); int hepb = 1<<(bs-4); /* hepb = half the number of entries in a block */ @@ -181,26 +183,27 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, */ b = tbl->zt_blks_copied; - db_old = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + b) << bs); - dmu_buf_read(db_old); + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + b) << bs, FTAG, &db_old); + if (err) + return; /* first half of entries in old[b] go to new[2*b+0] */ - db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+0) << bs); + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+0) << bs, FTAG, &db_new)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); - dmu_buf_rele(db_new); + dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ - db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (newblk + 2*b+1) << bs); + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+1) << bs, FTAG, &db_new)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, db_new->db_data, hepb); - dmu_buf_rele(db_new); + dmu_buf_rele(db_new, FTAG); - dmu_buf_rele(db_old); + dmu_buf_rele(db_old, FTAG); tbl->zt_blks_copied++; @@ -208,7 +211,7 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, tbl->zt_blks_copied, tbl->zt_numblks); if (tbl->zt_blks_copied == tbl->zt_numblks) { - dmu_free_range(zap->zap_objset, zap->zap_object, + (void) 
dmu_free_range(zap->zap_objset, zap->zap_object, tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); tbl->zt_blk = newblk; @@ -222,13 +225,14 @@ zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, } } -static uint64_t +static int zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, dmu_tx_t *tx) { - uint64_t blk, off, oldval; - dmu_buf_t *db; + int err; + uint64_t blk, off; int bs = FZAP_BLOCK_SHIFT(zap); + dmu_buf_t *db; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(tbl->zt_blk != 0); @@ -238,33 +242,41 @@ zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, blk = idx >> (bs-3); off = idx & ((1<<(bs-3))-1); - db = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + blk) << bs); + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + blk) << bs, FTAG, &db); + if (err) + return (err); dmu_buf_will_dirty(db, tx); - oldval = ((uint64_t *)db->db_data)[off]; - ((uint64_t *)db->db_data)[off] = val; - dmu_buf_rele(db); if (tbl->zt_nextblk != 0) { - idx *= 2; - blk = idx >> (bs-3); - off = idx & ((1<<(bs-3))-1); - - db = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_nextblk + blk) << bs); - dmu_buf_will_dirty(db, tx); - ((uint64_t *)db->db_data)[off] = val; - ((uint64_t *)db->db_data)[off+1] = val; - dmu_buf_rele(db); + uint64_t idx2 = idx * 2; + uint64_t blk2 = idx2 >> (bs-3); + uint64_t off2 = idx2 & ((1<<(bs-3))-1); + dmu_buf_t *db2; + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_nextblk + blk2) << bs, FTAG, &db2); + if (err) { + dmu_buf_rele(db, FTAG); + return (err); + } + dmu_buf_will_dirty(db2, tx); + ((uint64_t *)db2->db_data)[off2] = val; + ((uint64_t *)db2->db_data)[off2+1] = val; + dmu_buf_rele(db2, FTAG); } - return (oldval); + ((uint64_t *)db->db_data)[off] = val; + dmu_buf_rele(db, FTAG); + + return (0); } -static uint64_t -zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx) +static int +zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) { - uint64_t blk, off, val; + uint64_t blk, off; + int err; dmu_buf_t *db; int bs = FZAP_BLOCK_SHIFT(zap); @@ -273,12 +285,26 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx) blk = idx >> (bs-3); off = idx & ((1<<(bs-3))-1); - db = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (tbl->zt_blk + blk) << bs); - dmu_buf_read(db); - val = ((uint64_t *)db->db_data)[off]; - dmu_buf_rele(db); - return (val); + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + blk) << bs, FTAG, &db); + if (err) + return (err); + *valp = ((uint64_t *)db->db_data)[off]; + dmu_buf_rele(db, FTAG); + + if (tbl->zt_nextblk != 0) { + /* + * read the nextblk for the sake of i/o error checking, + * so that zap_table_load() will catch errors for + * zap_table_store. 
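While a pointer table is doubling, old entry `idx` fans out to new entries `2*idx` and `2*idx+1`, which is why `zap_table_store()` above mirrors the value into the half-grown next table and why the hunk below makes `zap_table_load()` touch the nextblk too. The index arithmetic, worked through for the default 16K block (entries are 8 bytes, so each block holds 2^(bs-3) of them):

```c
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int bs = 14;			/* FZAP block shift: 16K blocks */
	uint64_t idx = 12345;
	uint64_t blk  = idx >> (bs - 3);		/* block within old table */
	uint64_t off  = idx & ((1ULL << (bs - 3)) - 1);	/* entry within block */
	uint64_t idx2 = idx * 2;			/* first new-table entry */

	printf("old: block %llu, offset %llu\n",
	    (unsigned long long)blk, (unsigned long long)off);
	printf("new: entries %llu and %llu, block %llu\n",
	    (unsigned long long)idx2, (unsigned long long)(idx2 + 1),
	    (unsigned long long)(idx2 >> (bs - 3)));
	return (0);
}
```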
+ */ + blk = (idx*2) >> (bs-3); + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_nextblk + blk) << bs, FTAG, &db); + dmu_buf_rele(db, FTAG); + } + return (err); } /* @@ -310,19 +336,21 @@ zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) */ uint64_t newblk; dmu_buf_t *db_new; + int err; ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_shift, ==, ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); ASSERT3U(zap->zap_f.zap_phys->zap_ptrtbl.zt_blk, ==, 0); newblk = zap_allocate_blocks(zap, 1, tx); - db_new = dmu_buf_hold(zap->zap_objset, zap->zap_object, - newblk << FZAP_BLOCK_SHIFT(zap)); - + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new); + if (err) + return; dmu_buf_will_dirty(db_new, tx); zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); - dmu_buf_rele(db_new); + dmu_buf_rele(db_new, FTAG); zap->zap_f.zap_phys->zap_ptrtbl.zt_blk = newblk; zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks = 1; @@ -386,8 +414,8 @@ zap_create_leaf(zap_t *zap, dmu_tx_t *tx) l->l_dbuf = NULL; l->l_phys = NULL; - l->l_dbuf = dmu_buf_hold(zap->zap_objset, zap->zap_object, - l->l_blkid << FZAP_BLOCK_SHIFT(zap)); + VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, + l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf)); winner = dmu_buf_set_user(l->l_dbuf, l, &l->l_phys, zap_leaf_pageout); ASSERT(winner == NULL); dmu_buf_will_dirty(l->l_dbuf, tx); @@ -403,7 +431,7 @@ zap_destroy_leaf(zap_t *zap, zap_leaf_t *l, dmu_tx_t *tx) { /* uint64_t offset = l->l_blkid << ZAP_BLOCK_SHIFT; */ rw_exit(&l->l_rwlock); - dmu_buf_rele(l->l_dbuf); + dmu_buf_rele(l->l_dbuf, NULL); /* XXX there are still holds on this block, so we can't free it? */ /* dmu_free_range(zap->zap_objset, zap->zap_object, */ /* offset, 1<<ZAP_BLOCK_SHIFT, tx); */ @@ -430,11 +458,11 @@ zap_put_leaf(zap_leaf_t *l) while (nl) { zap_leaf_t *nnl = nl->l_next; rw_exit(&nl->l_rwlock); - dmu_buf_rele(nl->l_dbuf); + dmu_buf_rele(nl->l_dbuf, NULL); nl = nnl; } rw_exit(&l->l_rwlock); - dmu_buf_rele(l->l_dbuf); + dmu_buf_rele(l->l_dbuf, NULL); } _NOTE(ARGSUSED(0)) @@ -489,23 +517,27 @@ zap_open_leaf(uint64_t blkid, dmu_buf_t *db) return (l); } -static zap_leaf_t * -zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt) +static int +zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, + zap_leaf_t **lp) { dmu_buf_t *db; zap_leaf_t *l; int bs = FZAP_BLOCK_SHIFT(zap); + int err; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - db = dmu_buf_hold(zap->zap_objset, zap->zap_object, blkid << bs); + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + blkid << bs, NULL, &db); + if (err) + return (err); ASSERT3U(db->db_object, ==, zap->zap_object); ASSERT3U(db->db_offset, ==, blkid << bs); ASSERT3U(db->db_size, ==, 1 << bs); ASSERT(blkid != 0); - dmu_buf_read(db); l = dmu_buf_get_user(db); if (l == NULL) @@ -524,43 +556,53 @@ zap_get_leaf_byblk_impl(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt) ASSERT3U(l->lh_block_type, ==, ZBT_LEAF); ASSERT3U(l->lh_magic, ==, ZAP_LEAF_MAGIC); - return (l); + *lp = l; + return (0); } -static zap_leaf_t * -zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt) +static int +zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, + zap_leaf_t **lp) { - zap_leaf_t *l, *nl; + int err; + zap_leaf_t *nl; - l = zap_get_leaf_byblk_impl(zap, blkid, tx, lt); + err = zap_get_leaf_byblk_impl(zap, blkid, tx, lt, lp); + if (err) + return (err); - nl = l; + nl = *lp; while 
(nl->lh_next != 0) { zap_leaf_t *nnl; - nnl = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt); + err = zap_get_leaf_byblk_impl(zap, nl->lh_next, tx, lt, &nnl); + if (err) { + zap_put_leaf(*lp); + return (err); + } nl->l_next = nnl; nl = nnl; } - return (l); + return (err); } -static uint64_t -zap_idx_to_blk(zap_t *zap, uint64_t idx) +static int +zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) { ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { ASSERT3U(idx, <, (1ULL << zap->zap_f.zap_phys->zap_ptrtbl.zt_shift)); - return (ZAP_EMBEDDED_PTRTBL_ENT(zap, idx)); + *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); + return (0); } else { return (zap_table_load(zap, &zap->zap_f.zap_phys->zap_ptrtbl, - idx)); + idx, valp)); } } -static void +static int zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) { ASSERT(tx != NULL); @@ -568,32 +610,37 @@ zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) { ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; + return (0); } else { - (void) zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl, - idx, blk, tx); + return (zap_table_store(zap, &zap->zap_f.zap_phys->zap_ptrtbl, + idx, blk, tx)); } } -static zap_leaf_t * -zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt) +static int +zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { - uint64_t idx; - zap_leaf_t *l; + uint64_t idx, blk; + int err; ASSERT(zap->zap_dbuf == NULL || zap->zap_f.zap_phys == zap->zap_dbuf->db_data); ASSERT3U(zap->zap_f.zap_phys->zap_magic, ==, ZAP_MAGIC); idx = ZAP_HASH_IDX(h, zap->zap_f.zap_phys->zap_ptrtbl.zt_shift); - l = zap_get_leaf_byblk(zap, zap_idx_to_blk(zap, idx), tx, lt); - - ASSERT3U(ZAP_HASH_IDX(h, l->lh_prefix_len), ==, l->lh_prefix); + err = zap_idx_to_blk(zap, idx, &blk); + if (err != 0) + return (err); + err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); - return (l); + ASSERT(err || + ZAP_HASH_IDX(h, (*lp)->lh_prefix_len) == (*lp)->lh_prefix); + return (err); } -static zap_leaf_t * -zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx) +static int +zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx, + zap_leaf_t **lp) { zap_leaf_t *nl; int prefix_diff, i, err; @@ -616,11 +663,13 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx) err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, &zap); ASSERT3U(err, ==, 0); ASSERT(!zap->zap_ismicro); - l = zap_deref_leaf(zap, hash, tx, RW_WRITER); + (void) zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); - if (l->lh_prefix_len != old_prefix_len) + if (l->lh_prefix_len != old_prefix_len) { /* it split while our locks were down */ - return (l); + *lp = l; + return (0); + } } ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -629,21 +678,33 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx) (void) zap_leaf_chainmore(l, zap_create_leaf(zap, tx)); dprintf("chaining leaf %x/%d\n", l->lh_prefix, l->lh_prefix_len); - return (l); + *lp = l; + return (0); } ASSERT3U(ZAP_HASH_IDX(hash, l->lh_prefix_len), ==, l->lh_prefix); /* There's more than one pointer to us. Split this leaf. 
*/ - nl = zap_leaf_split(zap, l, tx); /* set sibling pointers */ prefix_diff = - zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - l->lh_prefix_len; - sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len) | 1) << prefix_diff; + zap->zap_f.zap_phys->zap_ptrtbl.zt_shift - (l->lh_prefix_len + 1); + sibling = (ZAP_HASH_IDX(hash, l->lh_prefix_len + 1) | 1) << prefix_diff; + + /* check for i/o errors before doing zap_leaf_split */ for (i = 0; i < (1ULL<<prefix_diff); i++) { - ASSERT3U(zap_idx_to_blk(zap, sibling+i), ==, l->l_blkid); - zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx); + uint64_t blk; + err = zap_idx_to_blk(zap, sibling+i, &blk); + if (err) + return (err); + ASSERT3U(blk, ==, l->l_blkid); + } + + nl = zap_leaf_split(zap, l, tx); + + for (i = 0; i < (1ULL<<prefix_diff); i++) { + err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx); + ASSERT3U(err, ==, 0); /* we checked for i/o errors above */ /* dprintf("set %d to %u %x\n", sibling+i, nl->l_blkid, nl); */ } @@ -657,7 +718,8 @@ zap_expand_leaf(zap_t *zap, zap_leaf_t *l, uint64_t hash, dmu_tx_t *tx) zap_put_leaf(nl); } - return (l); + *lp = l; + return (0); } static void @@ -682,7 +744,8 @@ again: err = zap_lockdir(os, zapobj, tx, RW_WRITER, FALSE, &zap); ASSERT3U(err, ==, 0); - l = zap_get_leaf_byblk(zap, blkid, tx, RW_READER); + (void) zap_get_leaf_byblk(zap, blkid, tx, + RW_READER, &l); goto again; } @@ -734,7 +797,9 @@ fzap_lookup(zap_t *zap, const char *name, return (err); hash = zap_hash(zap, name); - l = zap_deref_leaf(zap, hash, NULL, RW_READER); + err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); + if (err != 0) + return (err); err = zap_leaf_lookup(l, name, hash, &zeh); if (err != 0) goto out; @@ -747,7 +812,7 @@ out: int fzap_add_cd(zap_t *zap, const char *name, uint64_t integer_size, uint64_t num_integers, - const void *val, uint32_t cd, dmu_tx_t *tx, zap_leaf_t **lp) + const void *val, uint32_t cd, dmu_tx_t *tx) { zap_leaf_t *l; uint64_t hash; @@ -759,14 +824,17 @@ fzap_add_cd(zap_t *zap, const char *name, ASSERT(fzap_checksize(integer_size, num_integers) == 0); hash = zap_hash(zap, name); - l = zap_deref_leaf(zap, hash, tx, RW_WRITER); + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); retry: err = zap_leaf_lookup(l, name, hash, &zeh); if (err == 0) { err = EEXIST; goto out; } - ASSERT(err == ENOENT); + if (err != ENOENT) + goto out; /* XXX If this leaf is chained, split it if we can. 
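+ * (zap_entry_create() below returns EAGAIN when the leaf is full;
+ * the leaf is then expanded and the lookup retried.  An i/o error
+ * from the expansion is now returned through 'out' rather than
+ * causing a panic in the lower layers.)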
*/ err = zap_entry_create(l, name, hash, cd, @@ -775,15 +843,14 @@ retry: if (err == 0) { zap_increment_num_entries(zap, 1, tx); } else if (err == EAGAIN) { - l = zap_expand_leaf(zap, l, hash, tx); + err = zap_expand_leaf(zap, l, hash, tx, &l); + if (err != 0) + goto out; goto retry; } out: - if (lp) - *lp = l; - else - zap_put_leaf(l); + zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); return (err); } @@ -793,16 +860,14 @@ fzap_add(zap_t *zap, const char *name, const void *val, dmu_tx_t *tx) { int err; - zap_leaf_t *l; err = fzap_checksize(integer_size, num_integers); if (err != 0) return (err); err = fzap_add_cd(zap, name, integer_size, num_integers, - val, ZAP_MAXCD, tx, &l); + val, ZAP_MAXCD, tx); - zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); return (err); } @@ -821,7 +886,9 @@ fzap_update(zap_t *zap, const char *name, return (err); hash = zap_hash(zap, name); - l = zap_deref_leaf(zap, hash, tx, RW_WRITER); + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); retry: err = zap_leaf_lookup(l, name, hash, &zeh); create = (err == ENOENT); @@ -839,10 +906,13 @@ retry: } if (err == EAGAIN) { - l = zap_expand_leaf(zap, l, hash, tx); + err = zap_expand_leaf(zap, l, hash, tx, &l); + if (err != 0) + goto out; goto retry; } +out: zap_put_leaf_maybe_grow_ptrtbl(zap, l, tx); return (err); } @@ -857,7 +927,9 @@ fzap_length(zap_t *zap, const char *name, zap_entry_handle_t zeh; hash = zap_hash(zap, name); - l = zap_deref_leaf(zap, hash, NULL, RW_READER); + err = zap_deref_leaf(zap, hash, NULL, RW_READER, &l); + if (err != 0) + return (err); err = zap_leaf_lookup(l, name, hash, &zeh); if (err != 0) goto out; @@ -880,7 +952,9 @@ fzap_remove(zap_t *zap, const char *name, dmu_tx_t *tx) zap_entry_handle_t zeh; hash = zap_hash(zap, name); - l = zap_deref_leaf(zap, hash, tx, RW_WRITER); + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); err = zap_leaf_lookup(l, name, hash, &zeh); if (err == 0) { zap_entry_remove(&zeh); @@ -938,7 +1012,10 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) again: if (zc->zc_leaf == NULL) { - zc->zc_leaf = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER); + err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, + &zc->zc_leaf); + if (err != 0) + return (err); } else { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); } @@ -982,7 +1059,7 @@ again: static void zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) { - int i; + int i, err; uint64_t lastblk = 0; /* @@ -997,10 +1074,11 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) continue; lastblk = tbl[i]; - l = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER); - - zap_stats_leaf(zap, l, zs); - zap_put_leaf(l); + err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); + if (err == 0) { + zap_stats_leaf(zap, l, zs); + zap_put_leaf(l); + } } } @@ -1028,12 +1106,16 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) for (b = 0; b < zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks; b++) { dmu_buf_t *db; - - db = dmu_buf_hold(zap->zap_objset, zap->zap_object, - (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs); - dmu_buf_read(db); - zap_stats_ptrtbl(zap, db->db_data, 1<<(bs-3), zs); - dmu_buf_rele(db); + int err; + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk + b) << bs, + FTAG, &db); + if (err == 0) { + zap_stats_ptrtbl(zap, db->db_data, + 1<<(bs-3), zs); + dmu_buf_rele(db, FTAG); + } } } } diff --git a/usr/src/uts/common/fs/zfs/zap_micro.c 
b/usr/src/uts/common/fs/zfs/zap_micro.c index 3e150b9b1d..2d3180e37f 100644 --- a/usr/src/uts/common/fs/zfs/zap_micro.c +++ b/usr/src/uts/common/fs/zfs/zap_micro.c @@ -29,6 +29,7 @@ #include <sys/dmu.h> #include <sys/zfs_context.h> #include <sys/zap.h> +#include <sys/refcount.h> #include <sys/zap_impl.h> #include <sys/zap_leaf.h> #include <sys/avl.h> @@ -269,7 +270,9 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, *zapp = NULL; - db = dmu_buf_hold(os, obj, 0); + err = dmu_buf_hold(os, obj, 0, NULL, &db); + if (err) + return (err); #ifdef ZFS_DEBUG { @@ -279,12 +282,6 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, } #endif - /* - * The zap can deal with EIO here, but its callers don't yet, so - * spare them by doing a mustsucceed read. - */ - dmu_buf_read(db); - zap = dmu_buf_get_user(db); if (zap == NULL) zap = mzap_open(os, obj, db); @@ -340,7 +337,7 @@ void zap_unlockdir(zap_t *zap) { rw_exit(&zap->zap_rwlock); - dmu_buf_rele(zap->zap_dbuf); + dmu_buf_rele(zap->zap_dbuf, NULL); } static void @@ -375,7 +372,7 @@ mzap_upgrade(zap_t *zap, dmu_tx_t *tx) mze->mze_name, mze->mze_value); err = fzap_add_cd(zap, mze->mze_name, 8, 1, &mze->mze_value, - mze->mze_cd, tx, NULL); + mze->mze_cd, tx); ASSERT3U(err, ==, 0); } kmem_free(mzp, sz); @@ -411,7 +408,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx) dmu_buf_t *db; mzap_phys_t *zp; - db = dmu_buf_hold(os, obj, 0); + VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db)); #ifdef ZFS_DEBUG { @@ -426,7 +423,7 @@ mzap_create_impl(objset_t *os, uint64_t obj, dmu_tx_t *tx) zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; ASSERT(zp->mz_salt != 0); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); } int diff --git a/usr/src/uts/common/fs/zfs/zfs_acl.c b/usr/src/uts/common/fs/zfs/zfs_acl.c index 69acccf493..c70986b853 100644 --- a/usr/src/uts/common/fs/zfs/zfs_acl.c +++ b/usr/src/uts/common/fs/zfs/zfs_acl.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -288,25 +287,33 @@ zfs_acl_node_read_internal(znode_t *zp) /* * Read an external acl object. 
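+ * Returns 0 and fills in *aclpp on success, or an errno if reading
+ * the external acl fails.  The caller must hold z_acl_lock and, on
+ * success, eventually free the acl; roughly:
+ *
+ *	error = zfs_acl_node_read(zp, &aclp);
+ *	if (error == 0) {
+ *		... use aclp ...
+ *		zfs_acl_free(aclp);
+ *	}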
*/ -zfs_acl_t * -zfs_acl_node_read(znode_t *zp) +static int +zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp) { uint64_t extacl = zp->z_phys->zp_acl.z_acl_extern_obj; zfs_acl_t *aclp; + int error; ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) - return (zfs_acl_node_read_internal(zp)); + if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { + *aclpp = zfs_acl_node_read_internal(zp); + return (0); + } aclp = zfs_acl_alloc(zp->z_phys->zp_acl.z_acl_count); - dmu_read(zp->z_zfsvfs->z_os, extacl, 0, + error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, ZFS_ACL_SIZE(zp->z_phys->zp_acl.z_acl_count), aclp->z_acl); + if (error != 0) { + zfs_acl_free(aclp); + return (error); + } aclp->z_acl_count = zp->z_phys->zp_acl.z_acl_count; - return (aclp); + *aclpp = aclp; + return (0); } static boolean_t @@ -868,15 +875,17 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp, int zfs_acl_chmod_setattr(znode_t *zp, uint64_t mode, dmu_tx_t *tx) { - zfs_acl_t *aclp; + zfs_acl_t *aclp = NULL; int error; ASSERT(MUTEX_HELD(&zp->z_lock)); mutex_enter(&zp->z_acl_lock); - aclp = zfs_acl_node_read(zp); - error = zfs_acl_chmod(zp, mode, aclp, tx); + error = zfs_acl_node_read(zp, &aclp); + if (error == 0) + error = zfs_acl_chmod(zp, mode, aclp, tx); mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); + if (aclp) + zfs_acl_free(aclp); return (error); } @@ -1047,7 +1056,7 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag, pull_down = (parent->z_phys->zp_flags & ZFS_INHERIT_ACE); if (pull_down) { mutex_enter(&parent->z_acl_lock); - paclp = zfs_acl_node_read(parent); + VERIFY(0 == zfs_acl_node_read(parent, &paclp)); mutex_exit(&parent->z_acl_lock); aclp = zfs_acl_inherit(zp, paclp); zfs_acl_free(paclp); @@ -1106,7 +1115,12 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, cred_t *cr) mutex_enter(&zp->z_acl_lock); - aclp = zfs_acl_node_read(zp); + error = zfs_acl_node_read(zp, &aclp); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + if (mask & VSA_ACECNT) { vsecp->vsa_aclcnt = aclp->z_acl_count; @@ -1240,6 +1254,7 @@ zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr) int mode_wanted = v4_mode; int cnt; int i; + int error; int access_deny = ACCESS_UNDETERMINED; uint_t entry_type; uid_t uid = crgetuid(cr); @@ -1257,7 +1272,12 @@ zfs_zaccess_common(znode_t *zp, int v4_mode, int *working_mode, cred_t *cr) mutex_enter(&zp->z_acl_lock); - aclp = zfs_acl_node_read(zp); + error = zfs_acl_node_read(zp, &aclp); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + zacep = aclp->z_acl; cnt = aclp->z_acl_count; diff --git a/usr/src/uts/common/fs/zfs/zfs_dir.c b/usr/src/uts/common/fs/zfs/zfs_dir.c index ebdce10c33..d73315b47d 100644 --- a/usr/src/uts/common/fs/zfs/zfs_dir.c +++ b/usr/src/uts/common/fs/zfs/zfs_dir.c @@ -289,6 +289,21 @@ zfs_dq_hexname(char namebuf[17], uint64_t x) return (name); } +/* + * Delete Queue Error Handling + * + * When dealing with the delete queue, we dmu_tx_hold_zap(), but we + * don't specify the name of the entry that we will be manipulating. We + * also fib and say that we won't be adding any new entries to the + * delete queue, even though we might (this is to lower the minimum file + * size that can be deleted in a full filesystem). So on the small + * chance that the delete queue is using a fat zap (ie. has more than + * 2000 entries), we *may* not pre-read a block that's needed. 
+ * Therefore it is remotely possible for some of the assertions + * regarding the delete queue below to fail due to i/o error. On a + * nondebug system, this will result in the space being leaked. + */ + void zfs_dq_add(znode_t *zp, dmu_tx_t *tx) { @@ -338,9 +353,9 @@ zfs_purgedir(znode_t *dzp) tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, dzp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, -1); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); dmu_tx_hold_bonus(tx, xzp->z_id); - dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); @@ -579,10 +594,10 @@ zfs_rmnode(znode_t *zp) */ tx = dmu_tx_create(os); dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); - dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL); if (xzp) { dmu_tx_hold_bonus(tx, xzp->z_id); - dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, TRUE, NULL); } if (acl_obj) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); @@ -764,7 +779,7 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { dmu_tx_abort(tx); diff --git a/usr/src/uts/common/fs/zfs/zfs_fm.c b/usr/src/uts/common/fs/zfs/zfs_fm.c new file mode 100644 index 0000000000..007445c713 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zfs_fm.c @@ -0,0 +1,316 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> + +#include <sys/fm/fs/zfs.h> +#include <sys/fm/protocol.h> +#include <sys/fm/util.h> +#include <sys/sysevent.h> + +/* + * This general routine is responsible for generating all the different ZFS + * ereports. The payload is dependent on the class, and which arguments are + * supplied to the function: + * + * EREPORT POOL VDEV IO + * block X X X + * data X X + * device X X + * pool X + * + * If we are in a loading state, all errors are chained together by the same + * SPA-wide ENA. + * + * For isolated I/O requests, we get the ENA from the zio_t. The propagation + * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want + * to chain together all ereports associated with a logical piece of data. 
For + * read I/Os, there are basically three 'types' of I/O, which form a roughly + * layered diagram: + * + * +---------------+ + * | Aggregate I/O | No associated logical data or device + * +---------------+ + * | + * V + * +---------------+ Reads associated with a piece of logical data. + * | Read I/O | This includes reads on behalf of RAID-Z, + * +---------------+ mirrors, gang blocks, retries, etc. + * | + * V + * +---------------+ Reads associated with a particular device, but + * | Physical I/O | no logical data. Issued as part of vdev caching + * +---------------+ and I/O aggregation. + * + * Note that 'physical I/O' here is not the same terminology as used in the rest + * of ZIO. Typically, 'physical I/O' simply means that there is no attached + * blockpointer. But I/O with no associated block pointer can still be related + * to a logical piece of data (i.e. RAID-Z requests). + * + * Purely physical I/Os always have unique ENAs. They are not related to a + * particular piece of logical data, and therefore cannot be chained together. + * We still generate an ereport, but the DE doesn't correlate it with any + * logical piece of data. When such an I/O fails, the delegated I/O requests + * will issue a retry, which will trigger the 'real' ereport with the correct + * ENA. + * + * We keep track of the ENA for a ZIO chain through the 'io_logical' member. + * When a new logical I/O is issued, we set this to point to itself. Child I/Os + * then inherit this pointer, so that, once it is first set, subsequent failures + * will use the same ENA. If a physical I/O is issued (by passing the + * ZIO_FLAG_NOBOOKMARK flag), then this pointer is reset, guaranteeing that a + * unique ENA will be generated. For an aggregate I/O, this pointer is set to + * NULL, and no ereport will be generated (since it doesn't actually correspond + * to any particular device or piece of data). + */ +void +zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, + uint64_t stateoroffset, uint64_t size) +{ +#ifdef _KERNEL + nvlist_t *ereport, *detector; + uint64_t ena; + char class[64]; + + /* + * If we are doing a spa_tryimport(), ignore errors. + */ + if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) + return; + + /* + * If we are in the middle of opening a pool, and the previous attempt + * failed, don't bother logging any new ereports - we're just going to + * get the same diagnosis anyway. + */ + if (spa->spa_load_state != SPA_LOAD_NONE && + spa->spa_last_open_failed) + return; + + /* + * Ignore any errors from I/Os that we are going to retry anyway - we + * only generate errors from the final failure. + */ + if (zio && zio_should_retry(zio)) + return; + + if ((ereport = fm_nvlist_create(NULL)) == NULL) + return; + + if ((detector = fm_nvlist_create(NULL)) == NULL) { + fm_nvlist_destroy(ereport, FM_NVA_FREE); + return; + } + + /* + * Serialize ereport generation + */ + mutex_enter(&spa->spa_errlist_lock); + + /* + * Determine the ENA to use for this event. If we are in a loading + * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use + * a root zio-wide ENA. Otherwise, simply use a unique ENA. 
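+ * In outline:
+ *
+ *	pool load in progress      ->  SPA-wide ENA (spa_ena)
+ *	i/o with a logical parent  ->  ENA of the logical chain
+ *	                               (io_logical->io_ena)
+ *	anything else              ->  a fresh ENA for this event alone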
+ */ + if (spa->spa_load_state != SPA_LOAD_NONE) { + if (spa->spa_ena == 0) + spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); + ena = spa->spa_ena; + } else if (zio != NULL && zio->io_logical != NULL) { + if (zio->io_logical->io_ena == 0) + zio->io_logical->io_ena = + fm_ena_generate(0, FM_ENA_FMT1); + ena = zio->io_logical->io_ena; + } else { + ena = fm_ena_generate(0, FM_ENA_FMT1); + } + + /* + * Construct the full class, detector, and other standard FMA fields. + */ + (void) snprintf(class, sizeof (class), "%s.%s", + ZFS_ERROR_CLASS, subclass); + + fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), + vd != NULL ? vd->vdev_guid : 0); + + fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); + + /* + * Construct the per-ereport payload, depending on which parameters are + * passed in. + */ + + /* + * Generic payload members common to all ereports. + * + * The direct reference to spa_name is used rather than spa_name() + * because of the asynchronous nature of the zio pipeline. spa_name() + * asserts that the config lock is held in some form. This is always + * the case in I/O context, but because the check for RW_WRITER compares + * against 'curthread', we may be in an asynchronous context and blow + * this assert. Rather than loosen this assert, we acknowledge that all + * contexts in which this function is called (pool open, I/O) are safe, + * and dereference the name directly. + */ + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL, + DATA_TYPE_STRING, spa->spa_name, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + DATA_TYPE_UINT64, spa_guid(spa), + FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, + spa->spa_load_state, NULL); + + if (vd != NULL) { + vdev_t *pvd = vd->vdev_parent; + + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + DATA_TYPE_UINT64, vd->vdev_guid, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); + if (vd->vdev_path) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, + DATA_TYPE_STRING, vd->vdev_path, NULL); + if (vd->vdev_devid) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, + DATA_TYPE_STRING, vd->vdev_devid, NULL); + + if (pvd != NULL) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, + DATA_TYPE_UINT64, pvd->vdev_guid, + FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, + DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, + NULL); + if (pvd->vdev_path) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, + DATA_TYPE_STRING, pvd->vdev_path, NULL); + if (pvd->vdev_devid) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, + DATA_TYPE_STRING, pvd->vdev_devid, NULL); + } + } + + if (zio != NULL) { + /* + * Payload common to all I/Os. + */ + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, + DATA_TYPE_INT32, zio->io_error, NULL); + + /* + * If the 'size' parameter is non-zero, it indicates this is a + * RAID-Z or other I/O where the physical offset and length are + * provided for us, instead of within the zio_t. + */ + if (vd != NULL) { + if (size) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + DATA_TYPE_UINT64, stateoroffset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, + DATA_TYPE_UINT64, size, NULL); + else + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + DATA_TYPE_UINT64, zio->io_offset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, + DATA_TYPE_UINT64, zio->io_size, NULL); + } + + /* + * Payload for I/Os with corresponding logical information. 
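+ * (the objset/object/level/blkid bookmark of the logical parent).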
+ */ + if (zio->io_logical != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_objset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_object, + FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, + DATA_TYPE_INT32, + zio->io_logical->io_bookmark.zb_level, + FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, + DATA_TYPE_UINT64, + zio->io_logical->io_bookmark.zb_blkid); + } else if (vd != NULL) { + /* + * If we have a vdev but no zio, this is a device fault, and the + * 'stateoroffset' parameter indicates the previous state of the + * vdev. + */ + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, + DATA_TYPE_UINT64, stateoroffset, NULL); + } + mutex_exit(&spa->spa_errlist_lock); + + fm_ereport_post(ereport, EVCH_SLEEP); + + fm_nvlist_destroy(ereport, FM_NVA_FREE); + fm_nvlist_destroy(detector, FM_NVA_FREE); +#endif +} + +/* + * The 'resource.fs.zfs.ok' event is an internal signal that the associated + * resource (pool or disk) has been identified by ZFS as healthy. This will + * then trigger the DE to close the associated case, if any. + */ +void +zfs_post_ok(spa_t *spa, vdev_t *vd) +{ +#ifdef _KERNEL + nvlist_t *resource; + char class[64]; + + if ((resource = fm_nvlist_create(NULL)) == NULL) + return; + + (void) snprintf(class, sizeof (class), "%s.%s.%s", FM_RSRC_RESOURCE, + ZFS_ERROR_CLASS, FM_RESOURCE_OK); + VERIFY(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION) == 0); + VERIFY(nvlist_add_string(resource, FM_CLASS, class) == 0); + VERIFY(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)) == 0); + if (vd) + VERIFY(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid) == 0); + + fm_ereport_post(resource, EVCH_SLEEP); + + fm_nvlist_destroy(resource, FM_NVA_FREE); +#endif +} diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 29b01e4331..422b24a993 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -297,6 +297,16 @@ zfs_secpolicy_config(const char *unused, const char *unused2, cred_t *cr) } /* + * Policy for fault injection. Requires all privileges. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_inject(const char *unused, const char *unused2, cred_t *cr) +{ + return (secpolicy_zinject(cr)); +} + +/* * Returns the nvlist as specified by the user in the zfs_cmd_t. 
*/ static int @@ -368,7 +378,7 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) return (error); if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || - guid != zc->zc_pool_guid) + guid != zc->zc_guid) error = EINVAL; else error = spa_import(zc->zc_name, config, @@ -396,7 +406,8 @@ zfs_ioc_pool_configs(zfs_cmd_t *zc) if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) return (EEXIST); - VERIFY(nvlist_pack(configs, &packed, &size, NV_ENCODE_NATIVE, 0) == 0); + VERIFY(nvlist_pack(configs, &packed, &size, NV_ENCODE_NATIVE, + KM_SLEEP) == 0); if (size > zc->zc_config_dst_size) error = ENOMEM; @@ -420,7 +431,7 @@ zfs_ioc_pool_guid(zfs_cmd_t *zc) error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { - zc->zc_pool_guid = spa_guid(spa); + zc->zc_guid = spa_guid(spa); spa_close(spa, FTAG); } return (error); @@ -433,28 +444,37 @@ zfs_ioc_pool_stats(zfs_cmd_t *zc) char *packed = NULL; size_t size = 0; int error; + int ret = 0; - error = spa_get_stats(zc->zc_name, &config); + error = spa_get_stats(zc->zc_name, &config, zc->zc_root, + sizeof (zc->zc_root)); if (config != NULL) { VERIFY(nvlist_pack(config, &packed, &size, - NV_ENCODE_NATIVE, 0) == 0); + NV_ENCODE_NATIVE, KM_SLEEP) == 0); if (size > zc->zc_config_dst_size) - error = ENOMEM; + ret = ENOMEM; else if (xcopyout(packed, (void *)(uintptr_t)zc->zc_config_dst, size)) - error = EFAULT; + ret = EFAULT; zc->zc_config_dst_size = size; kmem_free(packed, size); nvlist_free(config); + + /* + * The config may be present even if 'error' is non-zero. + * In this case we return success, and preserve the real errno + * in 'zc_cookie'. + */ + zc->zc_cookie = error; } else { - ASSERT(error != 0); + ret = error; } - return (error); + return (ret); } /* @@ -479,7 +499,8 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) if (config == NULL) return (EINVAL); - VERIFY(nvlist_pack(config, &packed, &size, NV_ENCODE_NATIVE, 0) == 0); + VERIFY(nvlist_pack(config, &packed, &size, NV_ENCODE_NATIVE, + KM_SLEEP) == 0); if (size > zc->zc_config_dst_size) error = ENOMEM; @@ -554,13 +575,12 @@ static int zfs_ioc_vdev_online(zfs_cmd_t *zc) { spa_t *spa; - char *path = zc->zc_prop_value; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); - error = vdev_online(spa, path); + error = vdev_online(spa, zc->zc_guid); spa_close(spa, FTAG); return (error); } @@ -569,14 +589,13 @@ static int zfs_ioc_vdev_offline(zfs_cmd_t *zc) { spa_t *spa; - char *path = zc->zc_prop_value; int istmp = zc->zc_cookie; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); - error = vdev_offline(spa, path, istmp); + error = vdev_offline(spa, zc->zc_guid, istmp); spa_close(spa, FTAG); return (error); } @@ -585,7 +604,6 @@ static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; - char *path = zc->zc_prop_value; int replacing = zc->zc_cookie; nvlist_t *config; int error; @@ -595,7 +613,7 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc) return (error); if ((error = get_config(zc, &config)) == 0) { - error = spa_vdev_attach(spa, path, config, replacing); + error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); nvlist_free(config); } @@ -607,14 +625,13 @@ static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; - char *path = zc->zc_prop_value; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); - error = spa_vdev_detach(spa, path, 0, B_FALSE); + error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE); spa_close(spa, FTAG); return (error); @@ -625,7 +642,7 @@ zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t 
*spa; char *path = zc->zc_prop_value; - uint64_t guid = zc->zc_pool_guid; + uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); @@ -688,6 +705,8 @@ retry: if (!error && zc->zc_objset_stats.dds_type == DMU_OST_ZVOL) error = zvol_get_stats(zc, os); + spa_altroot(dmu_objset_spa(os), zc->zc_root, sizeof (zc->zc_root)); + dmu_objset_close(os); return (error); } @@ -1008,8 +1027,8 @@ zfs_ioc_recvbackup(zfs_cmd_t *zc) fp = getf(fd); if (fp == NULL) return (EBADF); - error = dmu_recvbackup(&zc->zc_begin_record, &zc->zc_cookie, - fp->f_vnode, fp->f_offset); + error = dmu_recvbackup(zc->zc_filename, &zc->zc_begin_record, + &zc->zc_cookie, fp->f_vnode, fp->f_offset); releasef(fd); return (error); } @@ -1053,6 +1072,110 @@ zfs_ioc_sendbackup(zfs_cmd_t *zc) return (error); } +static int +zfs_ioc_inject_fault(zfs_cmd_t *zc) +{ + int id, error; + + error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, + &zc->zc_inject_record); + + if (error == 0) + zc->zc_guid = (uint64_t)id; + + return (error); +} + +static int +zfs_ioc_clear_fault(zfs_cmd_t *zc) +{ + return (zio_clear_fault((int)zc->zc_guid)); +} + +static int +zfs_ioc_inject_list_next(zfs_cmd_t *zc) +{ + int id = (int)zc->zc_guid; + int error; + + error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), + &zc->zc_inject_record); + + zc->zc_guid = id; + + return (error); +} + +static int +zfs_ioc_error_log(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + size_t count = (size_t)zc->zc_config_dst_size; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_config_dst, + &count); + if (error == 0) + zc->zc_config_dst_size = count; + else + zc->zc_config_dst_size = spa_get_errlog_size(spa); + + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_clear(zfs_cmd_t *zc) +{ + spa_t *spa; + vdev_t *vd; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + spa_config_enter(spa, RW_WRITER, FTAG); + + if (zc->zc_prop_value[0] == '\0') + vd = NULL; + else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) { + spa_config_exit(spa, FTAG); + spa_close(spa, FTAG); + return (ENODEV); + } + + vdev_clear(spa, vd); + + spa_config_exit(spa, FTAG); + + spa_close(spa, FTAG); + + return (0); +} + +static int +zfs_ioc_bookmark_name(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_bookmark_name(spa, &zc->zc_bookmark, + zc->zc_prop_name, sizeof (zc->zc_prop_name), zc->zc_prop_value, + sizeof (zc->zc_prop_value), zc->zc_filename, + sizeof (zc->zc_filename)); + + spa_close(spa, FTAG); + + return (error); +} + static zfs_ioc_vec_t zfs_ioc_vec[] = { { zfs_ioc_pool_create, zfs_secpolicy_config, pool_name }, { zfs_ioc_pool_destroy, zfs_secpolicy_config, pool_name }, @@ -1087,6 +1210,12 @@ static zfs_ioc_vec_t zfs_ioc_vec[] = { { zfs_ioc_rename, zfs_secpolicy_write, dataset_name }, { zfs_ioc_recvbackup, zfs_secpolicy_write, dataset_name }, { zfs_ioc_sendbackup, zfs_secpolicy_write, dataset_name }, + { zfs_ioc_inject_fault, zfs_secpolicy_inject, no_name }, + { zfs_ioc_clear_fault, zfs_secpolicy_inject, no_name }, + { zfs_ioc_inject_list_next, zfs_secpolicy_inject, no_name }, + { zfs_ioc_error_log, zfs_secpolicy_inject, pool_name }, + { zfs_ioc_clear, zfs_secpolicy_config, pool_name }, + { zfs_ioc_bookmark_name, zfs_secpolicy_inject, pool_name } }; static int @@ -1279,7 +1408,7 @@ _fini(void) { int error; - if 
(spa_busy() || zfs_busy() || zvol_busy()) + if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) return (EBUSY); if ((error = mod_remove(&modlinkage)) != 0) diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index 17771b2e26..68a3e414eb 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -52,6 +52,7 @@ #include <sys/modctl.h> #include <sys/zfs_ioctl.h> #include <sys/zfs_ctldir.h> +#include <sys/bootconf.h> #include <sys/sunddi.h> #include <sys/dnlc.h> @@ -61,8 +62,11 @@ static major_t zfs_major; static minor_t zfs_minor; static kmutex_t zfs_dev_mtx; +extern char zfs_bootpath[BO_MAXOBJNAME]; + static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); +static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); static int zfs_root(vfs_t *vfsp, vnode_t **vpp); static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp); static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp); @@ -71,6 +75,7 @@ static void zfs_objset_close(zfsvfs_t *zfsvfs); static const fs_operation_def_t zfs_vfsops_template[] = { VFSNAME_MOUNT, zfs_mount, + VFSNAME_MOUNTROOT, zfs_mountroot, VFSNAME_UNMOUNT, zfs_umount, VFSNAME_ROOT, zfs_root, VFSNAME_STATVFS, zfs_statvfs, @@ -150,6 +155,58 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) return (0); } +static int +zfs_create_unique_device(dev_t *dev) +{ + major_t new_major; + + do { + ASSERT3U(zfs_minor, <=, MAXMIN32); + minor_t start = zfs_minor; + do { + mutex_enter(&zfs_dev_mtx); + if (zfs_minor >= MAXMIN32) { + /* + * If we're still using the real major + * keep out of /dev/zfs and /dev/zvol minor + * number space. If we're using a getudev()'ed + * major number, we can use all of its minors. + */ + if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) + zfs_minor = ZFS_MIN_MINOR; + else + zfs_minor = 0; + } else { + zfs_minor++; + } + *dev = makedevice(zfs_major, zfs_minor); + mutex_exit(&zfs_dev_mtx); + } while (vfs_devismounted(*dev) && zfs_minor != start); + if (zfs_minor == start) { + /* + * We are using all ~262,000 minor numbers for the + * current major number. Create a new major number. + */ + if ((new_major = getudev()) == (major_t)-1) { + cmn_err(CE_WARN, + "zfs_mount: Can't get unique major " + "device number."); + return (-1); + } + mutex_enter(&zfs_dev_mtx); + zfs_major = new_major; + zfs_minor = 0; + + mutex_exit(&zfs_dev_mtx); + } else { + break; + } + /* CONSTANTCONDITION */ + } while (1); + + return (0); +} + static void atime_changed_cb(void *arg, uint64_t newval) { @@ -271,110 +328,182 @@ acl_inherit_changed_cb(void *arg, uint64_t newval) zfsvfs->z_acl_inherit = newval; } -/*ARGSUSED*/ static int -zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +zfs_refresh_properties(vfs_t *vfsp) { - zfsvfs_t *zfsvfs = NULL; - znode_t *zp = NULL; - vnode_t *vp = NULL; - objset_t *os = NULL; - struct dsl_dataset *ds; - char *osname; - uint64_t readonly, recordsize; - pathname_t spn; - dev_t mount_dev; - major_t new_major; - int mode; - int error = 0; - uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? 
- UIO_SYSSPACE : UIO_USERSPACE; - int canwrite; + zfsvfs_t *zfsvfs = vfsp->vfs_data; - if (mvp->v_type != VDIR) - return (ENOTDIR); + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + readonly_changed_cb(zfsvfs, B_TRUE); + } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + if (dmu_objset_is_snapshot(zfsvfs->z_os)) + return (EROFS); + readonly_changed_cb(zfsvfs, B_FALSE); + } - mutex_enter(&mvp->v_lock); - if ((uap->flags & MS_REMOUNT) == 0 && - (uap->flags & MS_OVERLAY) == 0 && - (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { - mutex_exit(&mvp->v_lock); - return (EBUSY); + if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { + devices_changed_cb(zfsvfs, B_FALSE); + setuid_changed_cb(zfsvfs, B_FALSE); + } else { + if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) + devices_changed_cb(zfsvfs, B_FALSE); + else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) + devices_changed_cb(zfsvfs, B_TRUE); + + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) + setuid_changed_cb(zfsvfs, B_FALSE); + else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) + setuid_changed_cb(zfsvfs, B_TRUE); } - mutex_exit(&mvp->v_lock); - /* - * ZFS does not support passing unparsed data in via MS_DATA. - * Users should use the MS_OPTIONSTR interface; this means - * that all option parsing is already done and the options struct - * can be interrogated. - */ - if ((uap->flags & MS_DATA) && uap->datalen > 0) - return (EINVAL); + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) + exec_changed_cb(zfsvfs, B_FALSE); + else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) + exec_changed_cb(zfsvfs, B_TRUE); + + return (0); +} + +static int +zfs_register_callbacks(vfs_t *vfsp) +{ + struct dsl_dataset *ds = NULL; + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + int do_readonly = FALSE, readonly; + int do_setuid = FALSE, setuid; + int do_exec = FALSE, exec; + int do_devices = FALSE, devices; + int error = 0; + + ASSERT(vfsp); + zfsvfs = vfsp->vfs_data; + ASSERT(zfsvfs); + os = zfsvfs->z_os; /* - * When doing a remount, we simply refresh our temporary properties - * according to those options set in the current VFS options. + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. 
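+ * (Registering a property callback with dsl_prop_register() fires
+ * the callback immediately with the property's stored value, which
+ * is what overwrites any temporary mount option.)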
*/ - if (uap->flags & MS_REMOUNT) { - zfsvfs = vfsp->vfs_data; - - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) - readonly_changed_cb(zfsvfs, B_TRUE); - else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { - if (dmu_objset_is_snapshot(zfsvfs->z_os)) - return (EROFS); - readonly_changed_cb(zfsvfs, B_FALSE); + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { + readonly = B_TRUE; + do_readonly = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + readonly = B_FALSE; + do_readonly = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { + devices = B_FALSE; + setuid = B_FALSE; + do_devices = B_TRUE; + do_setuid = B_TRUE; + } else { + if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { + devices = B_FALSE; + do_devices = B_TRUE; + } else if (vfs_optionisset(vfsp, + MNTOPT_DEVICES, NULL)) { + devices = B_TRUE; + do_devices = B_TRUE; } - if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { - devices_changed_cb(zfsvfs, B_FALSE); - setuid_changed_cb(zfsvfs, B_FALSE); - } else { - if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) - devices_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) - devices_changed_cb(zfsvfs, B_TRUE); - - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) - setuid_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) - setuid_changed_cb(zfsvfs, B_TRUE); + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { + setuid = B_TRUE; + do_setuid = B_TRUE; } - - if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) - exec_changed_cb(zfsvfs, B_FALSE); - else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) - exec_changed_cb(zfsvfs, B_TRUE); - - return (0); + } + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { + exec = B_FALSE; + do_exec = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { + exec = B_TRUE; + do_exec = B_TRUE; } /* - * Get the objset name (the "special" mount argument). + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... */ - if (error = pn_get(uap->spec, fromspace, &spn)) - return (error); + ds = dmu_objset_ds(os); + error = dsl_prop_register(ds, "atime", atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "recordsize", blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "readonly", readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "devices", devices_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "setuid", setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "exec", exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "snapdir", snapdir_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "aclmode", acl_mode_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + "aclinherit", acl_inherit_changed_cb, zfsvfs); + if (error) + goto unregister; - osname = spn.pn_path; + /* + * Invoke our callbacks to restore temporary mount options. 
+ */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_devices) + devices_changed_cb(zfsvfs, devices); - if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) - goto out; + return (0); +unregister: /* - * Refuse to mount a filesystem if we are in a local zone and the - * dataset is not visible. + * We may attempt to unregister some callbacks that are not + * registered, but this is OK; it will simply return ENOMSG, + * which we will ignore. */ - if (!INGLOBALZONE(curproc) && - (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { - error = EPERM; - goto out; - } + (void) dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs); + (void) dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, + zfsvfs); + return (error); + +} + +static int +zfs_domount(vfs_t *vfsp, char *osname, cred_t *cr) +{ + dev_t mount_dev; + uint64_t recordsize, readonly; + int error = 0; + int mode; + zfsvfs_t *zfsvfs; + znode_t *zp = NULL; + + ASSERT(vfsp); + ASSERT(osname); /* * Initialize the zfs-specific filesystem structure. * Should probably make this a kmem cache, shuffle fields, - * and just bzero upto z_hold_mtx[]. + * and just bzero up to z_hold_mtx[]. */ zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs->z_vfs = vfsp; @@ -388,63 +517,19 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) offsetof(znode_t, z_link_node)); rw_init(&zfsvfs->z_um_lock, NULL, RW_DEFAULT, NULL); - /* - * Initialize the generic filesystem structure. - */ + /* Initialize the generic filesystem structure. */ vfsp->vfs_bcount = 0; vfsp->vfs_data = NULL; - /* - * Create a unique device for the mount. - */ - do { - ASSERT3U(zfs_minor, <=, MAXMIN32); - minor_t start = zfs_minor; - do { - mutex_enter(&zfs_dev_mtx); - if (zfs_minor >= MAXMIN32) { - /* - * If we're still using the real major number, - * keep out of /dev/zfs and /dev/zvol minor - * number space. If we're using a getudev()'ed - * major number, we can use all of its minors. - */ - if (zfs_major == ddi_name_to_major(ZFS_DRIVER)) - zfs_minor = ZFS_MIN_MINOR; - else - zfs_minor = 0; - } else { - zfs_minor++; - } - mount_dev = makedevice(zfs_major, zfs_minor); - mutex_exit(&zfs_dev_mtx); - } while (vfs_devismounted(mount_dev) && zfs_minor != start); - if (zfs_minor == start) { - /* - * We are using all ~262,000 minor numbers - * for the current major number. Create a - * new major number. 
- */ - if ((new_major = getudev()) == (major_t)-1) { - cmn_err(CE_WARN, - "zfs_mount: Can't get unique" - " major device number."); - goto out; - } - mutex_enter(&zfs_dev_mtx); - zfs_major = new_major; - zfs_minor = 0; - mutex_exit(&zfs_dev_mtx); - } else { - break; - } - /* CONSTANTCONDITION */ - } while (1); - + if (zfs_create_unique_device(&mount_dev) == -1) { + error = ENODEV; + goto out; + } ASSERT(vfs_devismounted(mount_dev) == 0); - if (dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL) != 0) - recordsize = SPA_MAXBLOCKSIZE; + if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize, + NULL)) + goto out; vfsp->vfs_dev = mount_dev; vfsp->vfs_fstype = zfsfstype; @@ -452,8 +537,7 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) vfsp->vfs_flag |= VFS_NOTRUNC; vfsp->vfs_data = zfsvfs; - error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL); - if (error) + if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) goto out; if (readonly) @@ -467,7 +551,6 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); } - os = zfsvfs->z_os; if (error) goto out; @@ -475,16 +558,18 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) if (error = zfs_init_fs(zfsvfs, &zp, cr)) goto out; - if (dmu_objset_is_snapshot(os)) { + /* The call to zfs_init_fs leaves the vnode held; release it here. */ + VN_RELE(ZTOV(zp)); + + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { ASSERT(mode & DS_MODE_READONLY); atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); zfsvfs->z_issnap = B_TRUE; } else { - int do_readonly = FALSE, readonly; - int do_setuid = FALSE, setuid; - int do_exec = FALSE, exec; - int do_devices = FALSE, devices; + error = zfs_register_callbacks(vfsp); + if (error) + goto out; /* * Start a delete thread running. @@ -494,119 +579,216 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) /* * Parse and replay the intent log. */ - zil_replay(os, zfsvfs, &zfsvfs->z_assign, zfs_replay_vector, - (void (*)(void *))zfs_delete_wait_empty); + zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, + zfs_replay_vector, (void (*)(void *))zfs_delete_wait_empty); if (!zil_disable) - zfsvfs->z_log = zil_open(os, zfs_get_data); + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + } - /* - * The act of registering our callbacks will destroy any mount - * options we may have. In order to enable temporary overrides - * of mount options, we stash away the current values and - * restore them after we register the callbacks. 
- */ - if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) { - readonly = B_TRUE; - do_readonly = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { - readonly = B_FALSE; - do_readonly = B_TRUE; - } - if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) { - devices = B_FALSE; - setuid = B_FALSE; - do_devices = B_TRUE; - do_setuid = B_TRUE; - } else { - if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) { - devices = B_FALSE; - do_devices = B_TRUE; - } else if (vfs_optionisset(vfsp, - MNTOPT_DEVICES, NULL)) { - devices = B_TRUE; - do_devices = B_TRUE; - } + if (!zfsvfs->z_issnap) + zfsctl_create(zfsvfs); +out: + if (error) { + if (zfsvfs->z_os) + dmu_objset_close(zfsvfs->z_os); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + } else { + atomic_add_32(&zfs_active_fs_count, 1); + } - if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { - setuid = B_FALSE; - do_setuid = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { - setuid = B_TRUE; - do_setuid = B_TRUE; - } - } - if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { - exec = B_FALSE; - do_exec = B_TRUE; - } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { - exec = B_TRUE; - do_exec = B_TRUE; - } + return (error); - /* - * Register property callbacks. - */ +} + +void +zfs_unregister_callbacks(zfsvfs_t *zfsvfs) +{ + objset_t *os = zfsvfs->z_os; + struct dsl_dataset *ds; + + /* + * Unregister properties. + */ + if (!dmu_objset_is_snapshot(os)) { ds = dmu_objset_ds(os); - VERIFY(dsl_prop_register(ds, "atime", atime_changed_cb, + VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "recordsize", blksz_changed_cb, + VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "readonly", readonly_changed_cb, + VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "devices", devices_changed_cb, + VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "setuid", setuid_changed_cb, + VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "exec", exec_changed_cb, + VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "snapdir", snapdir_changed_cb, + VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "aclmode", acl_mode_changed_cb, + VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, zfsvfs) == 0); - VERIFY(dsl_prop_register(ds, "aclinherit", + VERIFY(dsl_prop_unregister(ds, "aclinherit", acl_inherit_changed_cb, zfsvfs) == 0); + } +} +static int +zfs_mountroot(vfs_t *vfsp, enum whymountroot why) +{ + int error = 0; + int ret = 0; + static int zfsrootdone = 0; + zfsvfs_t *zfsvfs = NULL; + znode_t *zp = NULL; + vnode_t *vp = NULL; + + ASSERT(vfsp); + + /* + * The filesystem that we mount as root is defined in + * /etc/system using the zfsroot variable. The value defined + * there is copied early in startup code to zfs_bootpath + * (defined in modsysfile.c). + */ + if (why == ROOT_INIT) { + if (zfsrootdone++) + return (EBUSY); /* - * Invoke our callbacks to restore temporary mount options. + * This needs to be done here, so that when we return from + * mountroot, the vfs resource name will be set correctly. 
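+ * (The snprintf below copies zfs_bootpath into rootfs.bo_name,
+ * which the vfs layer then reports as the mounted resource for
+ * the root filesystem.)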
*/ - if (do_readonly) - readonly_changed_cb(zfsvfs, readonly); - if (do_setuid) - setuid_changed_cb(zfsvfs, setuid); - if (do_exec) - exec_changed_cb(zfsvfs, exec); - if (do_devices) - devices_changed_cb(zfsvfs, devices); - } + if (snprintf(rootfs.bo_name, BO_MAXOBJNAME, "%s", zfs_bootpath) + >= BO_MAXOBJNAME) + return (ENAMETOOLONG); - vp = ZTOV(zp); - if (!zfsvfs->z_issnap) - zfsctl_create(zfsvfs); -out: - if (error) { - if (zp) - VN_RELE(vp); + if (error = vfs_lock(vfsp)) + return (error); - if (zfsvfs) { - if (os) - dmu_objset_close(os); - kmem_free(zfsvfs, sizeof (zfsvfs_t)); - } - } else { - atomic_add_32(&zfs_active_fs_count, 1); + if (error = zfs_domount(vfsp, zfs_bootpath, CRED())) + goto out; + + zfsvfs = (zfsvfs_t *)vfsp->vfs_data; + ASSERT(zfsvfs); + if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) + goto out; + + vp = ZTOV(zp); + mutex_enter(&vp->v_lock); + vp->v_flag |= VROOT; + mutex_exit(&vp->v_lock); + rootvp = vp; + + /* + * The zfs_zget call above returns with a hold on vp; we release + * it here. + */ VN_RELE(vp); + + /* + * Mount root as read-only initially; it will be remounted + * read/write by /lib/svc/method/fs-usr. + */ + readonly_changed_cb(vfsp->vfs_data, B_TRUE); + vfs_add((struct vnode *)0, vfsp, + (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0); +out: + vfs_unlock(vfsp); + ret = (error) ? error : 0; + return (ret); + + } else if (why == ROOT_REMOUNT) { + + readonly_changed_cb(vfsp->vfs_data, B_FALSE); + vfsp->vfs_flag |= VFS_REMOUNT; + return (zfs_refresh_properties(vfsp)); + + } else if (why == ROOT_UNMOUNT) { + zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data); + (void) zfs_sync(vfsp, 0, 0); + return (0); + } + + /* + * if "why" is anything other than ROOT_INIT, + * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it. + */ + return (ENOTSUP); +} + +/*ARGSUSED*/ +static int +zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) +{ + char *osname; + pathname_t spn; + int error = 0; + uio_seg_t fromspace = (uap->flags & MS_SYSSPACE) ? + UIO_SYSSPACE : UIO_USERSPACE; + int canwrite; + + if (mvp->v_type != VDIR) + return (ENOTDIR); + + mutex_enter(&mvp->v_lock); + if ((uap->flags & MS_REMOUNT) == 0 && + (uap->flags & MS_OVERLAY) == 0 && + (mvp->v_count != 1 || (mvp->v_flag & VROOT))) { + mutex_exit(&mvp->v_lock); + return (EBUSY); + } + mutex_exit(&mvp->v_lock); + + /* + * ZFS does not support passing unparsed data in via MS_DATA. + * Users should use the MS_OPTIONSTR interface; this means + * that all option parsing is already done and the options struct + * can be interrogated. + */ + if ((uap->flags & MS_DATA) && uap->datalen > 0) + return (EINVAL); + + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. + */ + if (uap->flags & MS_REMOUNT) { + return (zfs_refresh_properties(vfsp)); } + /* + * Get the objset name (the "special" mount argument). + */ + if (error = pn_get(uap->spec, fromspace, &spn)) + return (error); + + osname = spn.pn_path; + + if ((error = secpolicy_fs_mount(cr, mvp, vfsp)) != 0) + goto out; + + /* + * Refuse to mount a filesystem if we are in a local zone and the + * dataset is not visible. 
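+ * (zone_dataset_visible() also reports, via 'canwrite', whether the
+ * zone may write to the dataset; a dataset that is visible but not
+ * writable from this zone is refused as well.)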
+ */ + if (!INGLOBALZONE(curproc) && + (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { + error = EPERM; + goto out; + } + + error = zfs_domount(vfsp, osname, cr); + +out: pn_free(&spn); return (error); } @@ -739,9 +921,6 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) return (0); } - - zfs_zcache_flush(zfsvfs); - /* * Stop all delete threads. */ @@ -866,7 +1045,6 @@ zfs_objset_close(zfsvfs_t *zfsvfs) zfs_delete_t *zd = &zfsvfs->z_delete_head; znode_t *zp, *nextzp; objset_t *os = zfsvfs->z_os; - struct dsl_dataset *ds; /* * Stop all delete threads. @@ -881,8 +1059,6 @@ zfs_objset_close(zfsvfs_t *zfsvfs) */ rw_enter(&zfsvfs->z_um_lock, RW_WRITER); - zfs_zcache_flush(zfsvfs); - /* * Release all delete in progress znodes. * They will be processed when the file system remounts. @@ -891,7 +1067,7 @@ zfs_objset_close(zfsvfs_t *zfsvfs) while (zp = list_head(&zd->z_znodes)) { list_remove(&zd->z_znodes, zp); zp->z_dbuf_held = 0; - dmu_buf_rele(zp->z_dbuf); + dmu_buf_rele(zp->z_dbuf, NULL); } mutex_exit(&zd->z_mutex); @@ -911,7 +1087,7 @@ zfs_objset_close(zfsvfs_t *zfsvfs) /* dbufs should only be held when force unmounting */ zp->z_dbuf_held = 0; mutex_exit(&zfsvfs->z_znodes_lock); - dmu_buf_rele(zp->z_dbuf); + dmu_buf_rele(zp->z_dbuf, NULL); /* Start again */ mutex_enter(&zfsvfs->z_znodes_lock); nextzp = list_head(&zfsvfs->z_all_znodes); @@ -922,36 +1098,8 @@ zfs_objset_close(zfsvfs_t *zfsvfs) /* * Unregister properties. */ - if (!dmu_objset_is_snapshot(os)) { - ds = dmu_objset_ds(os); - - VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb, - zfsvfs) == 0); - - VERIFY(dsl_prop_unregister(ds, "aclinherit", - acl_inherit_changed_cb, zfsvfs) == 0); - } + if (!dmu_objset_is_snapshot(os)) + zfs_unregister_callbacks(zfsvfs); /* * Make the dmu drop all its dbuf holds so that zfs_inactive @@ -977,6 +1125,11 @@ zfs_objset_close(zfsvfs_t *zfsvfs) } /* + * Evict all dbufs so that cached znodes will be freed + */ + dmu_objset_evict_dbufs(os); + + /* * Finally close the objset */ dmu_objset_close(os); diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index da5b41101a..2b9da086cc 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -229,6 +229,14 @@ zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred, case _FIOFFS: return (zfs_sync(vp->v_vfsp, 0, cred)); + /* + * The following two ioctls are used by bfu. Faking them out is + * necessary to avoid bfu errors. 
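+ * (_FIOGDIO and _FIOSDIO get and set UFS-style directio; ZFS has
+ * no equivalent state to report or change, so returning success
+ * is harmless.)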
+ */ + case _FIOGDIO: + case _FIOSDIO: + return (0); + case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: if (ddi_copyin((void *)data, &off, sizeof (off), flag)) @@ -436,12 +444,10 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) n = MIN(zfs_read_chunk_size, zp->z_phys->zp_size - uio->uio_loffset); n = MIN(n, cnt); - dbpp = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id, - uio->uio_loffset, n, &numbufs); - if (error = dmu_buf_read_array_canfail(dbpp, numbufs)) { - dmu_buf_rele_array(dbpp, numbufs); + error = dmu_buf_hold_array(zfsvfs->z_os, zp->z_id, + uio->uio_loffset, n, TRUE, FTAG, &numbufs, &dbpp); + if (error) goto out; - } /* * Compute the adjustment to align the dmu buffers * with the uio buffer. @@ -467,7 +473,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) (n < size ? n : size), UIO_READ, uio); } if (error) { - dmu_buf_rele_array(dbpp, numbufs); + dmu_buf_rele_array(dbpp, numbufs, FTAG); goto out; } n -= dbp->db_size; @@ -476,7 +482,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) delta = 0; } } - dmu_buf_rele_array(dbpp, numbufs); + dmu_buf_rele_array(dbpp, numbufs, FTAG); } out: rw_exit(&zp->z_grow_lock); @@ -850,10 +856,10 @@ zfs_get_data(void *arg, lr_write_t *lr) */ if (sizeof (lr_write_t) + dlen <= reclen) { /* immediate write */ rw_enter(&zp->z_grow_lock, RW_READER); - dmu_buf_t *db = dmu_buf_hold(os, lr->lr_foid, off); - dmu_buf_read(db); + dmu_buf_t *db; + VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, off, FTAG, &db)); bcopy((char *)db->db_data + off - db->db_offset, lr + 1, dlen); - dmu_buf_rele(db); + dmu_buf_rele(db, FTAG); rw_exit(&zp->z_grow_lock); } else { /* @@ -1071,7 +1077,7 @@ top: tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); dmu_tx_hold_bonus(tx, dzp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, 1); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); @@ -1266,7 +1272,7 @@ top: * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, -1); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); if (may_delete_now) dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); @@ -1289,7 +1295,7 @@ top: dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); /* charge as an update -- would be nice not to charge at all */ - dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, -1); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { @@ -1427,8 +1433,8 @@ top: * Add a new entry to the directory. 
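+ * dmu_tx_hold_zap() now takes the add/remove flag and, when known,
+ * the name of the entry being manipulated, e.g.
+ *
+ *	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
+ *
+ * so the tx code can pre-read the zap blocks the operation will
+ * touch and charge the transaction accordingly.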
*/ tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, 1); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 0); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); @@ -1534,9 +1540,9 @@ top: rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); - dmu_tx_hold_zap(tx, dzp->z_id, 1); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); - dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1); + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { dmu_tx_abort(tx); @@ -2059,8 +2065,7 @@ top: have_grow_lock = TRUE; if (off < zp->z_phys->zp_size) dmu_tx_hold_free(tx, zp->z_id, off, DMU_OBJECT_END); - else if (zp->z_phys->zp_size && - zp->z_blksz < zfsvfs->z_max_blksz && off > zp->z_blksz) + else if (zp->z_blksz < zfsvfs->z_max_blksz && off > zp->z_blksz) /* we will rewrite this block if we grow */ dmu_tx_hold_write(tx, zp->z_id, 0, zp->z_phys->zp_size); } @@ -2419,17 +2424,13 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); /* nlink changes */ dmu_tx_hold_bonus(tx, sdzp->z_id); /* nlink changes */ - if (sdzp != tdzp) { - dmu_tx_hold_zap(tx, sdzp->z_id, 1); - dmu_tx_hold_zap(tx, tdzp->z_id, 1); + dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); + if (sdzp != tdzp) dmu_tx_hold_bonus(tx, tdzp->z_id); /* nlink changes */ - } else { - dmu_tx_hold_zap(tx, sdzp->z_id, 2); - } - if (tzp) { - dmu_tx_hold_bonus(tx, tzp->z_id); /* nlink changes */ - } - dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, 1); + if (tzp) + dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ + dmu_tx_hold_zap(tx, zfsvfs->z_dqueue, FALSE, NULL); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { dmu_tx_abort(tx); @@ -2532,7 +2533,7 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_bonus(tx, dzp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, 1); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); error = dmu_tx_assign(tx, zfsvfs->z_assign); @@ -2569,12 +2570,12 @@ top: if (error) goto out; - dbp = dmu_buf_hold(zfsvfs->z_os, zoid, 0); + VERIFY(0 == dmu_buf_hold(zfsvfs->z_os, zoid, 0, FTAG, &dbp)); dmu_buf_will_dirty(dbp, tx); ASSERT3U(len, <=, dbp->db_size); bcopy(link, dbp->db_data, len); - dmu_buf_rele(dbp); + dmu_buf_rele(dbp, FTAG); } zp->z_phys->zp_size = len; @@ -2631,15 +2632,15 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr) error = uiomove(zp->z_phys + 1, MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); } else { - dmu_buf_t *dbp = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0); - if ((error = dmu_buf_read_canfail(dbp)) != 0) { - dmu_buf_rele(dbp); + dmu_buf_t *dbp; + error = dmu_buf_hold(zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp); + if (error) { ZFS_EXIT(zfsvfs); return (error); } error = uiomove(dbp->db_data, MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); - dmu_buf_rele(dbp); + dmu_buf_rele(dbp, FTAG); } ZFS_ACCESSTIME_STAMP(zfsvfs, zp); @@ -2732,7 +2733,7 @@ top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); - dmu_tx_hold_zap(tx, dzp->z_id, 1); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); error = dmu_tx_assign(tx, zfsvfs->z_assign); if (error) { dmu_tx_abort(tx); @@ -2921,8 +2922,14 @@ zfs_inactive(vnode_t *vp, cred_t *cr) B_INVAL, cr); 
} + mutex_enter(&zp->z_lock); vp->v_count = 0; /* count arrives as 1 */ - zfs_znode_free(zp); + if (zp->z_dbuf == NULL) { + mutex_exit(&zp->z_lock); + zfs_znode_free(zp); + } else { + mutex_exit(&zp->z_lock); + } rw_exit(&zfsvfs->z_um_lock); VFS_RELE(zfsvfs->z_vfs); return; @@ -2986,27 +2993,21 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - uint_t cnt = 1; int error; ZFS_ENTER(zfsvfs); /* - * If file is being mapped, disallow frlock. We set the mapcnt to - * -1 here to signal that we are in the process of setting a lock. - * This prevents a race with zfs_map(). - * XXX - well, sort of; since zfs_map() does not change z_mapcnt, - * we could be in the middle of zfs_map() and still call fs_frlock(). - * Also, we are doing no checking in zfs_addmap() (where z_mapcnt - * *is* manipulated). + * We are following the UFS semantics with respect to mapcnt + * here: If we see that the file is mapped already, then we will + * return an error, but we don't worry about races between this + * function and zfs_map(). */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode) && - (int)(cnt = atomic_cas_32(&zp->z_mapcnt, 0, -1)) > 0) { + if (zp->z_mapcnt > 0 && MANDMODE((mode_t)zp->z_phys->zp_mode)) { ZFS_EXIT(zfsvfs); return (EAGAIN); } error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr); - ASSERT((cnt != 0) || ((int)atomic_cas_32(&zp->z_mapcnt, -1, 0) == -1)); ZFS_EXIT(zfsvfs); return (error); } @@ -3074,7 +3075,7 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { ASSERT(io_off == cur_pp->p_offset); va = ppmapin(cur_pp, PROT_READ | PROT_WRITE, (caddr_t)-1); - err = dmu_read_canfail(os, oid, io_off, PAGESIZE, va); + err = dmu_read(os, oid, io_off, PAGESIZE, va); ppmapout(va); if (err) { /* On error, toss the entire kluster */ @@ -3241,6 +3242,20 @@ out: return (err); } +/* + * Request a memory map for a section of a file. This code interacts + * with common code and the VM system as follows: + * + * common code calls mmap(), which ends up in smmap_common() + * + * this calls VOP_MAP(), which takes you into (say) zfs + * + * zfs_map() calls as_map(), passing segvn_create() as the callback + * + * segvn_create() creates the new segment and calls VOP_ADDMAP() + * + * zfs_addmap() updates z_mapcnt + */ static int zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) @@ -3269,15 +3284,10 @@ zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp, /* * If file is locked, disallow mapping. - * XXX - since we don't modify z_mapcnt here, there is nothing - * to stop a file lock being placed immediately after we complete - * this check. */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode)) { - if (vn_has_flocks(vp) || zp->z_mapcnt == -1) { - ZFS_EXIT(zfsvfs); - return (EAGAIN); - } + if (MANDMODE((mode_t)zp->z_phys->zp_mode) && vn_has_flocks(vp)) { + ZFS_EXIT(zfsvfs); + return (EAGAIN); } as_rangelock(as); @@ -3318,11 +3328,9 @@ static int zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr) { - /* - * XXX - shouldn't we be checking for file locks here? 
- */ - ASSERT3U(VTOZ(vp)->z_mapcnt, >=, 0); - atomic_add_32(&VTOZ(vp)->z_mapcnt, btopr(len)); + uint64_t pages = btopr(len); + + atomic_add_64(&VTOZ(vp)->z_mapcnt, pages); return (0); } @@ -3331,8 +3339,10 @@ static int zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr) { - atomic_add_32(&VTOZ(vp)->z_mapcnt, -btopr(len)); - ASSERT3U(VTOZ(vp)->z_mapcnt, >=, 0); + uint64_t pages = btopr(len); + + ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages); + atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages); return (0); } diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c index 7eb3a2410d..3fd338940e 100644 --- a/usr/src/uts/common/fs/zfs/zfs_znode.c +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -55,251 +54,6 @@ struct kmem_cache *znode_cache = NULL; -/* - * Note that znodes can be on one of 2 states: - * ZCACHE_mru - recently used, currently cached - * ZCACHE_mfu - frequently used, currently cached - * When there are no active references to the znode, they - * are linked onto one of the lists in zcache. These are the - * only znodes that can be evicted. - */ - -typedef struct zcache_state { - list_t list; /* linked list of evictable znodes in state */ - uint64_t lcnt; /* total number of znodes in the linked list */ - uint64_t cnt; /* total number of all znodes in this state */ - uint64_t hits; - kmutex_t mtx; -} zcache_state_t; - -/* The 2 states: */ -static zcache_state_t ZCACHE_mru; -static zcache_state_t ZCACHE_mfu; - -static struct zcache { - zcache_state_t *mru; - zcache_state_t *mfu; - uint64_t p; /* Target size of mru */ - uint64_t c; /* Target size of cache */ - uint64_t c_max; /* Maximum target cache size */ - - /* performance stats */ - uint64_t missed; - uint64_t evicted; - uint64_t skipped; -} zcache; - -void zcache_kmem_reclaim(void); - -#define ZCACHE_MINTIME (hz>>4) /* 62 ms */ - -/* - * Move the supplied znode to the indicated state. The mutex - * for the znode must be held by the caller. 
- */ -static void -zcache_change_state(zcache_state_t *new_state, znode_t *zp) -{ - /* ASSERT(MUTEX_HELD(hash_mtx)); */ - ASSERT(zp->z_active); - - if (zp->z_zcache_state) { - ASSERT3U(zp->z_zcache_state->cnt, >=, 1); - atomic_add_64(&zp->z_zcache_state->cnt, -1); - } - atomic_add_64(&new_state->cnt, 1); - zp->z_zcache_state = new_state; -} - -static void -zfs_zcache_evict(znode_t *zp, kmutex_t *hash_mtx) -{ - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - - ASSERT(zp->z_phys); - ASSERT(zp->z_dbuf_held); - - zp->z_dbuf_held = 0; - mutex_exit(&zp->z_lock); - dmu_buf_rele(zp->z_dbuf); - mutex_exit(hash_mtx); - VFS_RELE(zfsvfs->z_vfs); -} - -/* - * Evict znodes from list until we've removed the specified number - */ -static void -zcache_evict_state(zcache_state_t *state, int64_t cnt, zfsvfs_t *zfsvfs) -{ - int znodes_evicted = 0; - znode_t *zp, *zp_prev; - kmutex_t *hash_mtx; - - ASSERT(state == zcache.mru || state == zcache.mfu); - - mutex_enter(&state->mtx); - - for (zp = list_tail(&state->list); zp; zp = zp_prev) { - zp_prev = list_prev(&state->list, zp); - if (zfsvfs && zp->z_zfsvfs != zfsvfs) - continue; - hash_mtx = ZFS_OBJ_MUTEX(zp); - if (mutex_tryenter(hash_mtx)) { - mutex_enter(&zp->z_lock); - list_remove(&zp->z_zcache_state->list, zp); - zp->z_zcache_state->lcnt -= 1; - ASSERT3U(zp->z_zcache_state->cnt, >=, 1); - atomic_add_64(&zp->z_zcache_state->cnt, -1); - zp->z_zcache_state = NULL; - zp->z_zcache_access = 0; - /* drops z_lock and hash_mtx */ - zfs_zcache_evict(zp, hash_mtx); - znodes_evicted += 1; - atomic_add_64(&zcache.evicted, 1); - if (znodes_evicted >= cnt) - break; - } else { - atomic_add_64(&zcache.skipped, 1); - } - } - mutex_exit(&state->mtx); - - if (znodes_evicted < cnt) - dprintf("only evicted %lld znodes from %x", - (longlong_t)znodes_evicted, state); -} - -static void -zcache_adjust(void) -{ - uint64_t mrucnt = zcache.mru->lcnt; - uint64_t mfucnt = zcache.mfu->lcnt; - uint64_t p = zcache.p; - uint64_t c = zcache.c; - - if (mrucnt > p) - zcache_evict_state(zcache.mru, mrucnt - p, NULL); - - if (mfucnt > 0 && mrucnt + mfucnt > c) { - int64_t toevict = MIN(mfucnt, mrucnt + mfucnt - c); - zcache_evict_state(zcache.mfu, toevict, NULL); - } -} - -/* - * Flush all *evictable* data from the cache. - * NOTE: this will not touch "active" (i.e. referenced) data. - */ -void -zfs_zcache_flush(zfsvfs_t *zfsvfs) -{ - zcache_evict_state(zcache.mru, zcache.mru->lcnt, zfsvfs); - zcache_evict_state(zcache.mfu, zcache.mfu->lcnt, zfsvfs); -} - -static void -zcache_try_grow(int64_t cnt) -{ - int64_t size; - /* - * If we're almost to the current target cache size, - * increment the target cache size - */ - size = zcache.mru->lcnt + zcache.mfu->lcnt; - if ((zcache.c - size) <= 1) { - atomic_add_64(&zcache.c, cnt); - if (zcache.c > zcache.c_max) - zcache.c = zcache.c_max; - else if (zcache.p + cnt < zcache.c) - atomic_add_64(&zcache.p, cnt); - } -} - -/* - * This routine is called whenever a znode is accessed. - */ -static void -zcache_access(znode_t *zp, kmutex_t *hash_mtx) -{ - ASSERT(MUTEX_HELD(hash_mtx)); - - if (zp->z_zcache_state == NULL) { - /* - * This znode is not in the cache. - * Add the new znode to the MRU state. - */ - - zcache_try_grow(1); - - ASSERT(zp->z_zcache_access == 0); - zp->z_zcache_access = lbolt; - zcache_change_state(zcache.mru, zp); - mutex_exit(hash_mtx); - - /* - * If we are using less than 2/3 of our total target - * cache size, bump up the target size for the MRU - * list. 
- */ - if (zcache.mru->lcnt + zcache.mfu->lcnt < zcache.c*2/3) { - zcache.p = zcache.mru->lcnt + zcache.c/6; - } - - zcache_adjust(); - - atomic_add_64(&zcache.missed, 1); - } else if (zp->z_zcache_state == zcache.mru) { - /* - * This znode has been "accessed" only once so far, - * Move it to the MFU state. - */ - if (lbolt > zp->z_zcache_access + ZCACHE_MINTIME) { - /* - * More than 125ms have passed since we - * instantiated this buffer. Move it to the - * most frequently used state. - */ - zp->z_zcache_access = lbolt; - zcache_change_state(zcache.mfu, zp); - } - atomic_add_64(&zcache.mru->hits, 1); - mutex_exit(hash_mtx); - } else { - ASSERT(zp->z_zcache_state == zcache.mfu); - /* - * This buffer has been accessed more than once. - * Keep it in the MFU state. - */ - atomic_add_64(&zcache.mfu->hits, 1); - mutex_exit(hash_mtx); - } -} - -static void -zcache_init(void) -{ - zcache.c = 20; - zcache.c_max = 50; - - zcache.mru = &ZCACHE_mru; - zcache.mfu = &ZCACHE_mfu; - - list_create(&zcache.mru->list, sizeof (znode_t), - offsetof(znode_t, z_zcache_node)); - list_create(&zcache.mfu->list, sizeof (znode_t), - offsetof(znode_t, z_zcache_node)); -} - -static void -zcache_fini(void) -{ - zfs_zcache_flush(NULL); - - list_destroy(&zcache.mru->list); - list_destroy(&zcache.mfu->list); -} - /*ARGSUSED*/ static void znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr) @@ -307,9 +61,15 @@ znode_pageout_func(dmu_buf_t *dbuf, void *user_ptr) znode_t *zp = user_ptr; vnode_t *vp = ZTOV(zp); + mutex_enter(&zp->z_lock); if (vp->v_count == 0) { + mutex_exit(&zp->z_lock); vn_invalid(vp); zfs_znode_free(zp); + } else { + /* signal force unmount that this znode can be freed */ + zp->z_dbuf = NULL; + mutex_exit(&zp->z_lock); } } @@ -359,15 +119,11 @@ zfs_znode_init(void) znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, 0); - - zcache_init(); } void zfs_znode_fini(void) { - zcache_fini(); - /* * Cleanup vfs & vnode ops */ @@ -488,8 +244,8 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) if (dmu_object_info(os, MASTER_NODE_OBJ, &doi) == ENOENT) { dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 3); /* master node */ - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, 1); /* delete queue */ + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* master */ + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); /* del queue */ dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); /* root node */ error = dmu_tx_assign(tx, TXG_WAIT); ASSERT3U(error, ==, 0); @@ -497,8 +253,10 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) dmu_tx_commit(tx); } - if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1, &version)) { - return (EINVAL); + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_VERSION_OBJ, 8, 1, + &version); + if (error) { + return (error); } else if (version != ZFS_VERSION) { (void) printf("Mismatched versions: File system " "is version %lld on-disk format, which is " @@ -524,9 +282,9 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) kmem_free(stats, sizeof (dmu_objset_stats_t)); stats = NULL; - if (zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid)) { - return (EINVAL); - } + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zoid); + if (error) + return (error); ASSERT(zoid != 0); zfsvfs->z_root = zoid; @@ -545,9 +303,9 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) return (error); ASSERT3U((*zpp)->z_id, ==, zoid); - if (zap_lookup(os, MASTER_NODE_OBJ, 
ZFS_DELETE_QUEUE, 8, 1, &zoid)) { - return (EINVAL); - } + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_DELETE_QUEUE, 8, 1, &zoid); + if (error) + return (error); zfsvfs->z_dqueue = zoid; @@ -570,7 +328,7 @@ zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp, cred_t *cr) * up to the caller to do, in case you don't want to * return the znode */ -znode_t * +static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz) { znode_t *zp; @@ -593,8 +351,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, uint64_t obj_num, int blksz) zp->z_blksz = blksz; zp->z_seq = 0x7A4653; - bzero(&zp->z_zcache_node, sizeof (list_node_t)); - mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); @@ -662,9 +418,6 @@ zfs_znode_dmu_init(znode_t *zp) ZTOV(zp)->v_flag |= VROOT; } - zp->z_zcache_state = NULL; - zp->z_zcache_access = 0; - ASSERT(zp->z_dbuf_held == 0); zp->z_dbuf_held = 1; VFS_HOLD(zfsvfs->z_vfs); @@ -715,6 +468,12 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, /* * Create a new DMU object. */ + /* + * There's currently no mechanism for pre-reading the blocks that will + * be needed to allocate a new object, so we accept the small chance + * that there will be an i/o error and we will fail one of the + * assertions below. + */ if (vap->va_type == VDIR) { if (flag & IS_REPLAY) { err = zap_create_claim(zfsvfs->z_os, *oid, @@ -738,7 +497,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } - dbp = dmu_bonus_hold(zfsvfs->z_os, *oid); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, *oid, NULL, &dbp)); dmu_buf_will_dirty(dbp, tx); /* @@ -803,11 +562,12 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, uint64_t *oid, dmu_tx_t *tx, cred_t *cr, mutex_enter(hash_mtx); zfs_znode_dmu_init(zp); - zcache_access(zp, hash_mtx); + mutex_exit(hash_mtx); + *zpp = zp; } else { ZTOV(zp)->v_count = 0; - dmu_buf_rele(dbp); + dmu_buf_rele(dbp, NULL); zfs_znode_free(zp); } } @@ -818,25 +578,25 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; + int err; *zpp = NULL; ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); - db = dmu_bonus_hold(zfsvfs->z_os, obj_num); - if (db == NULL) { + err = dmu_bonus_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - return (ENOENT); + return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_ZNODE || doi.doi_bonus_size < sizeof (znode_phys_t)) { - dmu_buf_rele(db); + dmu_buf_rele(db, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (EINVAL); } - dmu_buf_read(db); ASSERT(db->db_object == obj_num); ASSERT(db->db_offset == -1); @@ -849,29 +609,23 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) ASSERT3U(zp->z_id, ==, obj_num); if (zp->z_reap) { - dmu_buf_rele(db); + dmu_buf_rele(db, NULL); mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); return (ENOENT); } else if (zp->z_dbuf_held) { - dmu_buf_rele(db); + dmu_buf_rele(db, NULL); } else { zp->z_dbuf_held = 1; VFS_HOLD(zfsvfs->z_vfs); } - if (zp->z_active == 0) { + if (zp->z_active == 0) zp->z_active = 1; - if (list_link_active(&zp->z_zcache_node)) { - mutex_enter(&zp->z_zcache_state->mtx); - list_remove(&zp->z_zcache_state->list, zp); - zp->z_zcache_state->lcnt -= 1; - mutex_exit(&zp->z_zcache_state->mtx); - } - } + VN_HOLD(ZTOV(zp)); mutex_exit(&zp->z_lock); - zcache_access(zp, ZFS_OBJ_MUTEX(zp)); +
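+		/*
+		 * With the znode cache gone, the VN_HOLD() just above is
+		 * what keeps this znode alive for the caller; a typical
+		 * consumer now looks roughly like (obj being the object
+		 * number of interest):
+		 *
+		 *	if (zfs_zget(zfsvfs, obj, &zp) == 0) {
+		 *		... use zp ...
+		 *		VN_RELE(ZTOV(zp));
+		 *	}
+		 */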
ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); *zpp = zp; return (0); } @@ -882,7 +636,7 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) zp = zfs_znode_alloc(zfsvfs, db, obj_num, doi.doi_data_block_size); ASSERT3U(zp->z_id, ==, obj_num); zfs_znode_dmu_init(zp); - zcache_access(zp, ZFS_OBJ_MUTEX(zp)); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); *zpp = zp; return (0); } @@ -899,15 +653,11 @@ zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) zp->z_phys->zp_acl.z_acl_extern_obj, tx); ASSERT3U(error, ==, 0); } - if (zp->z_zcache_state) { - ASSERT3U(zp->z_zcache_state->cnt, >=, 1); - atomic_add_64(&zp->z_zcache_state->cnt, -1); - } error = dmu_object_free(zfsvfs->z_os, zp->z_id, tx); ASSERT3U(error, ==, 0); zp->z_dbuf_held = 0; ZFS_OBJ_HOLD_EXIT(zfsvfs, zp->z_id); - dmu_buf_rele(zp->z_dbuf); + dmu_buf_rele(zp->z_dbuf, NULL); } void @@ -954,9 +704,6 @@ zfs_zinactive(znode_t *zp) if (zp->z_reap) { mutex_exit(&zp->z_lock); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); - ASSERT3U(zp->z_zcache_state->cnt, >=, 1); - atomic_add_64(&zp->z_zcache_state->cnt, -1); - zp->z_zcache_state = NULL; /* XATTR files are not put on the delete queue */ if (zp->z_phys->zp_flags & ZFS_XATTR) { zfs_rmnode(zp); @@ -970,23 +717,14 @@ zfs_zinactive(znode_t *zp) VFS_RELE(zfsvfs->z_vfs); return; } + ASSERT(zp->z_phys); + ASSERT(zp->z_dbuf_held); - /* - * If the file system for this znode is no longer mounted, - * evict the znode now, don't put it in the cache. - */ - if (zfsvfs->z_unmounted1) { - zfs_zcache_evict(zp, ZFS_OBJ_MUTEX(zp)); - return; - } - - /* put znode on evictable list */ - mutex_enter(&zp->z_zcache_state->mtx); - list_insert_head(&zp->z_zcache_state->list, zp); - zp->z_zcache_state->lcnt += 1; - mutex_exit(&zp->z_zcache_state->mtx); + zp->z_dbuf_held = 0; mutex_exit(&zp->z_lock); + dmu_buf_rele(zp->z_dbuf, NULL); ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id); + VFS_RELE(zfsvfs->z_vfs); } void @@ -1206,7 +944,8 @@ zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx, len = -1; else if (end > size) len = size - from; - dmu_free_range(zp->z_zfsvfs->z_os, zp->z_id, from, len, tx); + VERIFY(0 == dmu_free_range(zp->z_zfsvfs->z_os, + zp->z_id, from, len, tx)); if (!have_grow_lock) rw_exit(&zp->z_grow_lock); @@ -1214,7 +953,6 @@ zfs_freesp(znode_t *zp, uint64_t from, uint64_t len, int flag, dmu_tx_t *tx, return (0); } - void zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) { @@ -1229,6 +967,10 @@ zfs_create_fs(objset_t *os, cred_t *cr, dmu_tx_t *tx) /* * First attempt to create master node. */ + /* + * In an empty objset, there are no blocks to read and thus + * there can be no i/o errors (which we assert below).
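+	 * That is why the zap calls that set up the master node below
+	 * can have their return values checked with simple assertions
+	 * rather than real error paths.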
+ */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 14b989fbd3..55040166b4 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -136,11 +136,17 @@ zil_read_log_block(zilog_t *zilog, blkptr_t *bp, char *buf) uint64_t blksz = BP_GET_LSIZE(bp); zil_trailer_t *ztp = (zil_trailer_t *)(buf + blksz) - 1; zio_cksum_t cksum; + zbookmark_t zb; int error; + zb.zb_objset = bp->blk_cksum.zc_word[2]; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = bp->blk_cksum.zc_word[3]; + error = zio_wait(zio_read(NULL, zilog->zl_spa, bp, buf, blksz, NULL, NULL, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE)); + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); if (error) { dprintf_bp(bp, "zilog %p bp %p read failed, error %d: ", zilog, bp, error); @@ -551,6 +557,7 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1; uint64_t txg; uint64_t zil_blksz; + zbookmark_t zb; int error; ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb)); @@ -579,11 +586,21 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) error = zio_alloc_blk(zilog->zl_spa, ZIO_CHECKSUM_ZILOG, zil_blksz, &ztp->zit_next_blk, txg); if (error) { + /* + * Reinitialise the lwb. + * By returning NULL the caller will call txg_wait_synced(). + */ + mutex_enter(&zilog->zl_lock); + ASSERT(lwb->lwb_state == UNWRITTEN); + lwb->lwb_nused = 0; + lwb->lwb_seq = 0; + mutex_exit(&zilog->zl_lock); txg_rele_to_sync(&lwb->lwb_txgh); return (NULL); } ASSERT3U(ztp->zit_next_blk.blk_birth, ==, txg); + ztp->zit_pad = 0; ztp->zit_nused = lwb->lwb_nused; ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; ztp->zit_next_blk.blk_cksum = lwb->lwb_blk.blk_cksum; @@ -617,9 +634,15 @@ zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) * write the old log block */ dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg); + + zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[2]; + zb.zb_object = 0; + zb.zb_level = -1; + zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[3]; + zio_nowait(zio_rewrite(NULL, zilog->zl_spa, ZIO_CHECKSUM_ZILOG, 0, &lwb->lwb_blk, lwb->lwb_buf, lwb->lwb_sz, zil_lwb_write_done, lwb, - ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED)); + ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb)); return (nlwb); } @@ -674,7 +697,8 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) lwb = zil_lwb_write_start(zilog, lwb); if (lwb == NULL) return (NULL); - if (lwb->lwb_nused + reclen > ZIL_BLK_DATA_SZ(lwb)) { + ASSERT(lwb->lwb_nused == 0); + if (reclen > ZIL_BLK_DATA_SZ(lwb)) { txg_wait_synced(zilog->zl_dmu_pool, txg); mutex_enter(&zilog->zl_lock); zilog->zl_ss_seq = MAX(seq, zilog->zl_ss_seq); @@ -1157,10 +1181,17 @@ zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) * checksum error. We can safely ignore this because * the later write will provide the correct data.
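+		 * The bookmark filled in below names the block being
+		 * replayed (the objset under replay plus the object and
+		 * offset taken from the log record), so a failure here
+		 * is at least attributed to the right logical data.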
*/ + zbookmark_t zb; + + zb.zb_objset = dmu_objset_id(zilog->zl_os); + zb.zb_object = lrw->lr_foid; + zb.zb_level = -1; + zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp); + (void) zio_wait(zio_read(NULL, zilog->zl_spa, wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE)); + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen); } } diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 1554504a93..b9741ee5c2 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,13 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include <sys/zfs_context.h> +#include <sys/fm/fs/zfs.h> #include <sys/spa.h> #include <sys/txg.h> #include <sys/spa_impl.h> @@ -35,9 +35,6 @@ #include <sys/zio_compress.h> #include <sys/zio_checksum.h> -static void zio_vdev_io_enter(zio_t *zio); -static void zio_vdev_io_exit(zio_t *zio); - /* * ========================================================================== * I/O priority table @@ -128,6 +125,8 @@ zio_init(void) if (zio_buf_cache[c - 1] == NULL) zio_buf_cache[c - 1] = zio_buf_cache[c]; } + + zio_inject_init(); } void @@ -143,6 +142,8 @@ zio_fini(void) } zio_buf_cache[c] = NULL; } + + zio_inject_fini(); } /* @@ -263,11 +264,12 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, if (pio == NULL) { if (!(flags & ZIO_FLAG_CONFIG_HELD)) - spa_config_enter(zio->io_spa, RW_READER); + spa_config_enter(zio->io_spa, RW_READER, zio); zio->io_root = zio; } else { zio->io_root = pio->io_root; - + if (!(flags & ZIO_FLAG_NOBOOKMARK)) + zio->io_logical = pio->io_logical; mutex_enter(&pio->io_lock); if (stage < ZIO_STAGE_READY) pio->io_children_notready++; @@ -305,7 +307,7 @@ zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) zio_t * zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags) + int priority, int flags, zbookmark_t *zb) { zio_t *zio; dva_t *dva; @@ -314,6 +316,9 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, zio = zio_create(pio, spa, bp->blk_birth, bp, data, size, done, private, ZIO_TYPE_READ, priority, flags, ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); + zio->io_bookmark = *zb; + + zio->io_logical = zio; /* * Work off our copy of the bp so the caller can free it. 
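These hunks add the zbookmark_t plumbing: every logical read, write and rewrite now carries a bookmark naming the data it touches as an (objset, object, level, blkid) tuple and records itself as io_logical, so errors deep in the pipeline can be traced back to user-visible data. A rough sketch of a caller, where os, obj, blkid and bp are illustrative placeholders rather than names from this change:

	zbookmark_t zb;

	zb.zb_objset = dmu_objset_id(os);	/* objset owning the data */
	zb.zb_object = obj;			/* object being read */
	zb.zb_level = 0;			/* indirection level (0 = data) */
	zb.zb_blkid = blkid;			/* block id within the object */

	error = zio_wait(zio_read(NULL, spa, bp, buf, size, NULL, NULL,
	    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb));

ZIL blocks, which live outside any object, use zb_level = -1 with the chain checksum words as identity, as in the zil.c hunks above.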
@@ -345,7 +350,8 @@ zio_read(zio_t *pio, spa_t *spa, blkptr_t *bp, void *data, zio_t * zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags) + zio_done_func_t *done, void *private, int priority, int flags, + zbookmark_t *zb) { zio_t *zio; @@ -359,6 +365,10 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, ZIO_TYPE_WRITE, priority, flags, ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); + zio->io_bookmark = *zb; + + zio->io_logical = zio; + zio->io_checksum = checksum; zio->io_compress = compress; @@ -378,7 +388,8 @@ zio_write(zio_t *pio, spa_t *spa, int checksum, int compress, zio_t * zio_rewrite(zio_t *pio, spa_t *spa, int checksum, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, - zio_done_func_t *done, void *private, int priority, int flags) + zio_done_func_t *done, void *private, int priority, int flags, + zbookmark_t *zb) { zio_t *zio; @@ -387,6 +398,7 @@ zio_rewrite(zio_t *pio, spa_t *spa, int checksum, ZIO_TYPE_WRITE, priority, flags, ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); + zio->io_bookmark = *zb; zio->io_checksum = checksum; zio->io_compress = ZIO_COMPRESS_OFF; @@ -667,8 +679,6 @@ zio_wait_for_children(zio_t *zio, uint32_t stage, uint64_t *countp) mutex_exit(&zio->io_lock); zio_next_stage(zio); } else { - if (zio->io_stage == ZIO_STAGE_VDEV_IO_START) - zio_vdev_io_exit(zio); zio->io_stalled = stage; mutex_exit(&zio->io_lock); } @@ -683,8 +693,6 @@ zio_notify_parent(zio_t *zio, uint32_t stage, uint64_t *countp) if (pio->io_error == 0 && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) pio->io_error = zio->io_error; if (--*countp == 0 && pio->io_stalled == stage) { - if (pio->io_stage == ZIO_STAGE_VDEV_IO_START) - zio_vdev_io_enter(pio); pio->io_stalled = 0; mutex_exit(&pio->io_lock); zio_next_stage_async(pio); @@ -748,36 +756,45 @@ zio_done(zio_t *zio) vdev_stat_update(zio); if (zio->io_error) { - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, - bp ? bp : &zio->io_bp_copy); - dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): error %d\n", - zio->io_error == ECKSUM ? "bad checksum" : "I/O failure", - zio_type_name[zio->io_type], - vdev_description(vd), - (u_longlong_t)zio->io_offset, - zio, blkbuf, zio->io_error); - } - - if (zio->io_numerrors != 0 && zio->io_type == ZIO_TYPE_WRITE) { - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, - bp ? bp : &zio->io_bp_copy); - dprintf("ZFS: %s (%s on %s off %llx: zio %p %s): %d errors\n", - "partial write", - zio_type_name[zio->io_type], - vdev_description(vd), - (u_longlong_t)zio->io_offset, - zio, blkbuf, zio->io_numerrors); - } + /* + * If this I/O is attached to a particular vdev, + * generate an error message describing the I/O failure + * at the block level. We ignore these errors if the + * device is currently unavailable. + */ + if (zio->io_error != ECKSUM && zio->io_vd && + !vdev_is_dead(zio->io_vd)) + zfs_ereport_post(FM_EREPORT_ZFS_IO, + zio->io_spa, zio->io_vd, zio, 0, 0); + + if ((zio->io_error == EIO || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && + zio->io_logical == zio) { + /* + * For root I/O requests, tell the SPA to log the error + * appropriately. Also, generate a logical data + * ereport. + */ + spa_log_error(zio->io_spa, zio); + + zfs_ereport_post(FM_EREPORT_ZFS_DATA, + zio->io_spa, NULL, zio, 0, 0); + } - if (zio->io_error && !(zio->io_flags & ZIO_FLAG_CANFAIL)) { - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, - bp ? 
bp : &zio->io_bp_copy); - panic("ZFS: %s (%s on %s off %llx: zio %p %s): error %d", - zio->io_error == ECKSUM ? "bad checksum" : "I/O failure", - zio_type_name[zio->io_type], - vdev_description(vd), - (u_longlong_t)zio->io_offset, - zio, blkbuf, zio->io_error); + /* + * For I/O requests that cannot fail, panic appropriately. + */ + if (!(zio->io_flags & ZIO_FLAG_CANFAIL)) { + sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, + bp ? bp : &zio->io_bp_copy); + panic("ZFS: %s (%s on %s off %llx: zio %p %s): error " + "%d", zio->io_error == ECKSUM ? + "bad checksum" : "I/O failure", + zio_type_name[zio->io_type], + vdev_description(vd), + (u_longlong_t)zio->io_offset, + zio, blkbuf, zio->io_error); + } } zio_clear_transform_stack(zio); @@ -807,7 +824,7 @@ zio_done(zio_t *zio) } if (pio == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_HELD)) - spa_config_exit(spa); + spa_config_exit(spa, zio); if (zio->io_waiter != NULL) { mutex_enter(&zio->io_lock); @@ -988,7 +1005,8 @@ zio_read_gang_members(zio_t *zio) zio_nowait(zio_read(zio, zio->io_spa, gbp, (char *)zio->io_data + loff, lsize, NULL, NULL, - zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT)); + zio->io_priority, zio->io_flags & ZIO_FLAG_GANG_INHERIT, + &zio->io_bookmark)); } zio_buf_free(gbh, gbufsize); @@ -1022,7 +1040,8 @@ zio_rewrite_gang_members(zio_t *zio) zio_nowait(zio_rewrite(zio, zio->io_spa, zio->io_checksum, zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, - NULL, NULL, zio->io_priority, zio->io_flags)); + NULL, NULL, zio->io_priority, zio->io_flags, + &zio->io_bookmark)); } zio_push_transform(zio, gbh, gsize, gbufsize); @@ -1153,7 +1172,8 @@ zio_write_allocate_gang_members(zio_t *zio) zio->io_checksum, zio->io_txg, gbp, (char *)zio->io_data + loff, lsize, zio_write_allocate_gang_member_done, NULL, - zio->io_priority, zio->io_flags)); + zio->io_priority, zio->io_flags, + &zio->io_bookmark)); } else { lsize = P2ROUNDUP(resid / gbps_left, SPA_MINBLOCKSIZE); ASSERT(lsize != SPA_MINBLOCKSIZE); @@ -1263,51 +1283,6 @@ zio_dva_translate(zio_t *zio) * Read and write to physical devices * ========================================================================== */ -static void -zio_vdev_io_enter(zio_t *zio) -{ - vdev_t *tvd = zio->io_vd->vdev_top; - - mutex_enter(&tvd->vdev_io_lock); - ASSERT(zio->io_pending.list_next == NULL); - list_insert_tail(&tvd->vdev_io_pending, zio); - mutex_exit(&tvd->vdev_io_lock); -} - -static void -zio_vdev_io_exit(zio_t *zio) -{ - vdev_t *tvd = zio->io_vd->vdev_top; - - mutex_enter(&tvd->vdev_io_lock); - ASSERT(zio->io_pending.list_next != NULL); - list_remove(&tvd->vdev_io_pending, zio); - if (list_head(&tvd->vdev_io_pending) == NULL) - cv_broadcast(&tvd->vdev_io_cv); - mutex_exit(&tvd->vdev_io_lock); -} - -static void -zio_vdev_io_retry(void *vdarg) -{ - vdev_t *vd = vdarg; - zio_t *zio, *zq; - - ASSERT(vd == vd->vdev_top); - - /* XXPOLICY */ - delay(hz); - - vdev_reopen(vd, &zq); - - while ((zio = zq) != NULL) { - zq = zio->io_retry_next; - zio->io_retry_next = NULL; - dprintf("async retry #%d for I/O to %s offset %llx\n", - zio->io_retries, vdev_description(vd), zio->io_offset); - zio_next_stage_async(zio); - } -} static void zio_vdev_io_setup(zio_t *zio) @@ -1323,8 +1298,6 @@ zio_vdev_io_setup(zio_t *zio) zio->io_offset += VDEV_LABEL_START_SIZE; } - zio_vdev_io_enter(zio); - zio_next_stage(zio); } @@ -1350,7 +1323,7 @@ zio_vdev_io_done(zio_t *zio) } /* XXPOLICY */ -static boolean_t +boolean_t zio_should_retry(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -1363,11 +1336,7 @@ zio_should_retry(zio_t *zio) return 
(B_FALSE); if (zio->io_flags & ZIO_FLAG_DONT_RETRY) return (B_FALSE); - if (zio->io_retries > 300 && - (zio->io_flags & (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL))) - return (B_FALSE); - if (zio->io_retries > 1 && - (zio->io_error == ECKSUM || zio->io_error == ENXIO)) + if (zio->io_retries > 0) return (B_FALSE); return (B_TRUE); @@ -1379,17 +1348,16 @@ zio_vdev_io_assess(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; - zio_vdev_io_exit(zio); - ASSERT(zio->io_vsd == NULL); + if (zio_injection_enabled && !zio->io_error) + zio->io_error = zio_handle_fault_injection(zio, EIO); + /* * If the I/O failed, determine whether we should attempt to retry it. */ /* XXPOLICY */ if (zio_should_retry(zio)) { - zio_t *zq; - ASSERT(tvd == vd); ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)); @@ -1405,29 +1373,27 @@ zio_vdev_io_assess(zio_t *zio) zio->io_retries, zio_type_name[zio->io_type], vdev_description(vd), zio->io_offset); - /* - * If this is the first retry, do it immediately. - */ - /* XXPOLICY */ - if (zio->io_retries == 1) { - zio_next_stage_async(zio); - return; - } + zio_next_stage_async(zio); + return; + } + if (zio->io_error != 0 && !(zio->io_flags & ZIO_FLAG_SPECULATIVE) && + zio->io_error != ECKSUM) { /* - * This was not the first retry, so go through the - * longer enqueue/delay/vdev_reopen() process. + * Poor man's hotplug support. Even if we're done retrying this + * I/O, try to reopen the vdev to see if it's still attached. + * To avoid excessive thrashing, we only try it once a minute. + * This also has the effect of detecting when missing devices + * have come back, by polling the device once a minute. + * + * We need to do this asynchronously because we can't grab + * all the necessary locks way down here. */ - mutex_enter(&tvd->vdev_io_lock); - ASSERT(zio->io_retry_next == NULL); - zio->io_retry_next = zq = tvd->vdev_io_retry; - tvd->vdev_io_retry = zio; - mutex_exit(&tvd->vdev_io_lock); - if (zq == NULL) - (void) taskq_dispatch( - tvd->vdev_spa->spa_vdev_retry_taskq, - zio_vdev_io_retry, tvd, TQ_SLEEP); - return; + if (gethrtime() - vd->vdev_last_try > 60ULL * NANOSEC) { + vd->vdev_last_try = gethrtime(); + tvd->vdev_reopen_wanted = 1; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REOPEN); + } } zio_next_stage(zio); @@ -1502,10 +1468,9 @@ zio_checksum_verify(zio_t *zio) { if (zio->io_bp != NULL) { zio->io_error = zio_checksum_error(zio); - if (zio->io_error) { - dprintf("bad checksum on vdev %s\n", - vdev_description(zio->io_vd)); - } + if (zio->io_error && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) + zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, + zio->io_spa, zio->io_vd, zio, 0, 0); } zio_next_stage(zio); @@ -1660,7 +1625,7 @@ zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp, { int error; - spa_config_enter(spa, RW_READER); + spa_config_enter(spa, RW_READER, FTAG); BP_ZERO(bp); @@ -1677,7 +1642,7 @@ zio_alloc_blk(spa_t *spa, int checksum, uint64_t size, blkptr_t *bp, bp->blk_birth = txg; } - spa_config_exit(spa); + spa_config_exit(spa, FTAG); return (error); } @@ -1693,9 +1658,9 @@ zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) dprintf_bp(bp, "txg %llu: ", txg); - spa_config_enter(spa, RW_READER); + spa_config_enter(spa, RW_READER, FTAG); metaslab_free(spa, BP_IDENTITY(bp), txg); - spa_config_exit(spa); + spa_config_exit(spa, FTAG); } diff --git a/usr/src/uts/common/fs/zfs/zio_checksum.c b/usr/src/uts/common/fs/zfs/zio_checksum.c index dc31527ce8..d57ab6d525 100644 --- a/usr/src/uts/common/fs/zfs/zio_checksum.c +++ 
b/usr/src/uts/common/fs/zfs/zio_checksum.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -170,5 +169,8 @@ zio_checksum_error(zio_t *zio) (actual_cksum.zc_word[3] - zc.zc_word[3])) return (ECKSUM); + if (zio_injection_enabled && !zio->io_error) + return (zio_handle_fault_injection(zio, ECKSUM)); + return (0); } diff --git a/usr/src/uts/common/fs/zfs/zio_inject.c b/usr/src/uts/common/fs/zfs/zio_inject.c new file mode 100644 index 0000000000..4cada09d83 --- /dev/null +++ b/usr/src/uts/common/fs/zfs/zio_inject.c @@ -0,0 +1,315 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * ZFS fault injection + * + * To handle fault injection, we keep track of a series of zinject_record_t + * structures which describe which logical block(s) should be injected with a + * fault. These are kept in a global list. Each record corresponds to a given + * spa_t and maintains a special hold on the spa_t so that it cannot be deleted + * or exported while the injection record exists. + * + * Device level injection is done using the 'zi_guid' field. If this is set, it + * means that the error is destined for a particular device, not a piece of + * data. + * + * This is a rather poor data structure and algorithm, but we don't expect more + * than a few faults at any one time, so it should be sufficient for our needs. + */ + +#include <sys/arc.h> +#include <sys/zio_impl.h> +#include <sys/zfs_ioctl.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> + +uint32_t zio_injection_enabled; + +typedef struct inject_handler { + int zi_id; + spa_t *zi_spa; + zinject_record_t zi_record; + list_node_t zi_link; +} inject_handler_t; + +static list_t inject_handlers; +static krwlock_t inject_lock; +static int inject_next_id = 1; + +/* + * Returns true if the given record matches the I/O in progress. 
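+ * A record with zi_objset and zi_object both zero matches I/O to
+ * pool-wide metadata (objset 0, the MOS) by object type; any other
+ * record must match the bookmark's objset, object and level exactly,
+ * with zb_blkid falling inside [zi_start, zi_end] and the error code
+ * matching zi_error.  In both cases a nonzero zi_freq makes the fault
+ * fire on only roughly that percentage of matching I/Os.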
+ */ +static boolean_t +zio_match_handler(zbookmark_t *zb, uint64_t type, + zinject_record_t *record, int error) +{ + /* + * Check for a match against the MOS, which is based on type + */ + if (zb->zb_objset == 0 && record->zi_objset == 0 && + record->zi_object == 0) { + if (record->zi_type == DMU_OT_NONE || + type == record->zi_type) + return (record->zi_freq == 0 || + spa_get_random(100) < record->zi_freq); + else + return (B_FALSE); + } + + /* + * Check for an exact match. + */ + if (zb->zb_objset == record->zi_objset && + zb->zb_object == record->zi_object && + zb->zb_level == record->zi_level && + zb->zb_blkid >= record->zi_start && + zb->zb_blkid <= record->zi_end && + error == record->zi_error) + return (record->zi_freq == 0 || + spa_get_random(100) < record->zi_freq); + + return (B_FALSE); +} + +/* + * Determine if the I/O in question should return failure. Returns the errno + * to be returned to the caller. + */ +int +zio_handle_fault_injection(zio_t *zio, int error) +{ + int ret = 0; + inject_handler_t *handler; + + /* + * Ignore I/O not associated with any logical data. + */ + if (zio->io_logical == NULL) + return (0); + + /* + * Currently, we only support fault injection on reads. + */ + if (zio->io_type != ZIO_TYPE_READ) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + /* Ignore errors not destined for this pool */ + if (zio->io_spa != handler->zi_spa) + continue; + + /* Ignore device errors */ + if (handler->zi_record.zi_guid != 0) + continue; + + /* If this handler matches, return EIO */ + if (zio_match_handler(&zio->io_logical->io_bookmark, + zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE, + &handler->zi_record, error)) { + ret = error; + break; + } + } + + rw_exit(&inject_lock); + + return (ret); +} + +int +zio_handle_device_injection(vdev_t *vd, int error) +{ + inject_handler_t *handler; + int ret = 0; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (vd->vdev_guid == handler->zi_record.zi_guid) { + if (handler->zi_record.zi_error == error) { + /* + * For a failed open, pretend like the device + * has gone away. + */ + if (error == ENXIO) + vd->vdev_stat.vs_aux = + VDEV_AUX_OPEN_FAILED; + ret = error; + break; + } + if (handler->zi_record.zi_error == ENXIO) { + ret = EIO; + break; + } + } + } + + rw_exit(&inject_lock); + + return (ret); +} + +/* + * Create a new handler for the given record. We add it to the list, adding + * a reference to the spa_t in the process. We increment zio_injection_enabled, + * which is the switch to trigger all fault injection. + */ +int +zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) +{ + inject_handler_t *handler; + int error; + spa_t *spa; + + /* + * If this is pool-wide metadata, make sure we unload the corresponding + * spa_t, so that the next attempt to load it will trigger the fault. + * We call spa_reset() to unload the pool appropriately. + */ + if (flags & ZINJECT_UNLOAD_SPA) + if ((error = spa_reset(name)) != 0) + return (error); + + if (!(flags & ZINJECT_NULL)) { + /* + * spa_inject_ref() will add an injection reference, which will + * prevent the pool from being removed from the namespace while + * still allowing it to be unloaded. 
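+		 * (A regular pool open would pin the pool and defeat
+		 * the ZINJECT_UNLOAD_SPA handling above.)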
+ */ + if ((spa = spa_inject_addref(name)) == NULL) + return (ENOENT); + + handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); + + rw_enter(&inject_lock, RW_WRITER); + + *id = handler->zi_id = inject_next_id++; + handler->zi_spa = spa; + handler->zi_record = *record; + list_insert_tail(&inject_handlers, handler); + atomic_add_32(&zio_injection_enabled, 1); + + rw_exit(&inject_lock); + } + + /* + * Flush the ARC, so that any attempts to read this data will end up + * going to the ZIO layer. Note that this is a little overkill, but + * we don't have the necessary ARC interfaces to do anything else, and + * fault injection isn't a performance critical path. + */ + if (flags & ZINJECT_FLUSH_ARC) + arc_flush(); + + return (0); +} + +/* + * Returns the next record with an ID greater than that supplied to the + * function. Used to iterate over all handlers in the system. + */ +int +zio_inject_list_next(int *id, char *name, size_t buflen, + zinject_record_t *record) +{ + inject_handler_t *handler; + int ret; + + mutex_enter(&spa_namespace_lock); + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) + if (handler->zi_id > *id) + break; + + if (handler) { + *record = handler->zi_record; + *id = handler->zi_id; + (void) strncpy(name, spa_name(handler->zi_spa), buflen); + ret = 0; + } else { + ret = ENOENT; + } + + rw_exit(&inject_lock); + mutex_exit(&spa_namespace_lock); + + return (ret); +} + +/* + * Clear the fault handler with the given identifier, or return ENOENT if none + * exists. + */ +int +zio_clear_fault(int id) +{ + inject_handler_t *handler; + int ret; + + rw_enter(&inject_lock, RW_WRITER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) + if (handler->zi_id == id) + break; + + if (handler == NULL) { + ret = ENOENT; + } else { + list_remove(&inject_handlers, handler); + spa_inject_delref(handler->zi_spa); + kmem_free(handler, sizeof (inject_handler_t)); + atomic_add_32(&zio_injection_enabled, -1); + ret = 0; + } + + rw_exit(&inject_lock); + + return (ret); +} + +void +zio_inject_init(void) +{ + list_create(&inject_handlers, sizeof (inject_handler_t), + offsetof(inject_handler_t, zi_link)); +} + +void +zio_inject_fini(void) +{ + list_destroy(&inject_handlers); +} diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c index a570d4d971..69fb50c2c3 100644 --- a/usr/src/uts/common/fs/zfs/zvol.c +++ b/usr/src/uts/common/fs/zfs/zvol.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
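Taken together, zio_inject_fault(), zio_inject_list_next() and zio_clear_fault() are the kernel half of the injection framework; userland drives them through the ZFS ioctl path, which is not part of this hunk. A hypothetical in-kernel caller, with the pool name and object numbers chosen purely for illustration:

	zinject_record_t record;
	int id;

	bzero(&record, sizeof (record));
	record.zi_objset = 5;		/* objset to target */
	record.zi_object = 4;		/* object within that objset */
	record.zi_start = 0;		/* first block ... */
	record.zi_end = -1ULL;		/* ... through the last block */
	record.zi_error = EIO;		/* error to inject on reads */
	record.zi_freq = 0;		/* 0 => every matching read */

	if (zio_inject_fault("tank", ZINJECT_FLUSH_ARC, &id, &record) == 0) {
		/* matching reads now fail with EIO ... */
		(void) zio_clear_fault(id);	/* ... until cleared */
	}

Flushing the ARC at setup time matters because, as the comment above notes, a cached read would otherwise never reach the ZIO layer where the handlers are consulted.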
@@ -418,6 +417,7 @@ zvol_create_minor(zfs_cmd_t *zc) zvol_size_changed(zv, dev); + /* XXX this should handle the possible i/o error */ VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset), "readonly", zvol_readonly_changed_cb, zv) == 0); @@ -500,7 +500,7 @@ zvol_set_volsize(zfs_cmd_t *zc) } tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, 1); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_hold_free(tx, ZVOL_OBJ, zc->zc_volsize, DMU_OBJECT_END); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -511,9 +511,10 @@ zvol_set_volsize(zfs_cmd_t *zc) error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1, &zc->zc_volsize, tx); - if (error == 0) - dmu_free_range(zv->zv_objset, ZVOL_OBJ, zc->zc_volsize, + if (error == 0) { + error = dmu_free_range(zv->zv_objset, ZVOL_OBJ, zc->zc_volsize, DMU_OBJECT_END, tx); + } dmu_tx_commit(tx); @@ -744,7 +745,7 @@ zvol_strategy(buf_t *bp) size = volsize - off; if (bp->b_flags & B_READ) { - error = dmu_read_canfail(os, ZVOL_OBJ, + error = dmu_read(os, ZVOL_OBJ, off, size, addr); } else { dmu_tx_t *tx = dmu_tx_create(os); diff --git a/usr/src/uts/common/krtld/kobj.c b/usr/src/uts/common/krtld/kobj.c index 003022d104..1cdf93e98f 100644 --- a/usr/src/uts/common/krtld/kobj.c +++ b/usr/src/uts/common/krtld/kobj.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -108,6 +107,7 @@ static int kobj_boot_open(char *, int); static int kobj_boot_close(int); static int kobj_boot_seek(int, off_t, off_t); static int kobj_boot_read(int, caddr_t, size_t); +static int kobj_boot_fstat(int, struct bootstat *); static Sym *lookup_one(struct module *, const char *); static void sym_insert(struct module *, char *, symid_t); @@ -3324,8 +3324,8 @@ kobj_open(char *filename) */ cred_t *saved_cred = curthread->t_cred; curthread->t_cred = kcred; - Errno = vn_open(filename, UIO_SYSSPACE, FREAD, 0, &vp, - 0, 0); + Errno = vn_openat(filename, UIO_SYSSPACE, FREAD, 0, &vp, + 0, 0, rootdir); curthread->t_cred = saved_cred; } kobjopen_free(ltp); @@ -3458,6 +3458,47 @@ kobj_close(intptr_t descr) (void) kobj_boot_close((int)descr); } +int +kobj_fstat(intptr_t descr, struct bootstat *buf) +{ + if (buf == NULL) + return (-1); + + if (_modrootloaded) { + vattr_t vattr; + struct vnode *vp = (struct vnode *)descr; + if (VOP_GETATTR(vp, &vattr, 0, kcred) != 0) + return (-1); + + /* + * The vattr and bootstat structures are similar, but not + * identical. We do our best to fill in the bootstat structure + * from the contents of vattr (transferring only the ones that + * are obvious).
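+	 * Fields with no obvious vattr counterpart are left exactly as
+	 * the caller provided them, so only the members filled in below
+	 * may be relied upon.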
+ */ + + buf->st_mode = (uint32_t)vattr.va_mode; + buf->st_nlink = (uint32_t)vattr.va_nlink; + buf->st_uid = (int32_t)vattr.va_uid; + buf->st_gid = (int32_t)vattr.va_gid; + buf->st_rdev = (uint64_t)vattr.va_rdev; + buf->st_size = (uint64_t)vattr.va_size; + buf->st_atim.tv_sec = (int64_t)vattr.va_atime.tv_sec; + buf->st_atim.tv_nsec = (int64_t)vattr.va_atime.tv_nsec; + buf->st_mtim.tv_sec = (int64_t)vattr.va_mtime.tv_sec; + buf->st_mtim.tv_nsec = (int64_t)vattr.va_mtime.tv_nsec; + buf->st_ctim.tv_sec = (int64_t)vattr.va_ctime.tv_sec; + buf->st_ctim.tv_nsec = (int64_t)vattr.va_ctime.tv_nsec; + buf->st_blksize = (int32_t)vattr.va_blksize; + buf->st_blocks = (int64_t)vattr.va_nblocks; + + return (0); + } + + return (kobj_boot_fstat((int)descr, buf)); +} + + struct _buf * kobj_open_file(char *name) { @@ -4097,6 +4138,18 @@ kobj_record_file(char *filename) } #endif /* __x86 */ +static int +kobj_boot_fstat(int fd, struct bootstat *stp) +{ +#if defined(__sparc) + if (!standalone && _ioquiesced) + return (-1); + return (BOP_FSTAT(ops, fd, stp)); +#else + return (BRD_FSTAT(bfs_ops, fd, stp)); +#endif +} + /* * XXX these wrappers should go away when sparc is converted * boot from ramdisk diff --git a/usr/src/uts/common/krtld/kobj_stubs.c b/usr/src/uts/common/krtld/kobj_stubs.c index 3d972194bb..c592fb5317 100644 --- a/usr/src/uts/common/krtld/kobj_stubs.c +++ b/usr/src/uts/common/krtld/kobj_stubs.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -108,6 +107,13 @@ kobj_close(intptr_t descr) /*ARGSUSED*/ int +kobj_fstat(intptr_t descr, struct bootstat *buf) +{ + return (-1); +} + +/*ARGSUSED*/ +int kobj_filbuf(struct _buf *f) { return (-1); diff --git a/usr/src/uts/common/krtld/mapfile b/usr/src/uts/common/krtld/mapfile index 398c6dcf32..cb1f85b04a 100644 --- a/usr/src/uts/common/krtld/mapfile +++ b/usr/src/uts/common/krtld/mapfile @@ -1,13 +1,9 @@ # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. -# Use is subject to license terms. -# # CDDL HEADER START # # The contents of this file are subject to the terms of the -# Common Development and Distribution License, Version 1.0 only -# (the "License"). You may not use this file except in compliance -# with the License. +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. # # You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE # or http://www.opensolaris.org/os/licensing. @@ -22,6 +18,9 @@ # # CDDL HEADER END # +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. 
+# #pragma ident "%Z%%M% %I% %E% SMI" # @@ -36,6 +35,7 @@ kobj_export_module; kobj_filbuf; kobj_free; + kobj_fstat; kobj_getelfsym; kobj_getmodinfo; kobj_getpagesize; diff --git a/usr/src/uts/common/os/fm.c b/usr/src/uts/common/os/fm.c index 6ff4626405..43c3acbef0 100644 --- a/usr/src/uts/common/os/fm.c +++ b/usr/src/uts/common/os/fm.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -1070,6 +1069,37 @@ fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth, } } +void +fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid, + uint64_t vdev_guid) +{ + if (version != ZFS_SCHEME_VERSION0) { + atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); + return; + } + + if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { + atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); + return; + } + + if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) { + atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); + return; + } + + if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) { + atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1); + } + + if (vdev_guid != 0) { + if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) { + atomic_add_64( + &erpt_kstat_data.fmri_set_failed.value.ui64, 1); + } + } +} + uint64_t fm_ena_increment(uint64_t ena) { diff --git a/usr/src/uts/common/os/modsysfile.c b/usr/src/uts/common/os/modsysfile.c index 7ffcf66d10..0e36f3e2cc 100644 --- a/usr/src/uts/common/os/modsysfile.c +++ b/usr/src/uts/common/os/modsysfile.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
 */

@@ -73,6 +72,7 @@ static vmem_t *mod_sysfile_arena;	/* parser memory */

 char obp_bootpath[BO_MAXOBJNAME]; /* bootpath from obp */
 char svm_bootpath[BO_MAXOBJNAME]; /* bootpath redirected via rootdev */
+char zfs_bootpath[BO_MAXOBJNAME]; /* zfs bootpath, set via zfsroot */

 #if defined(_PSM_MODULES)
@@ -489,6 +489,8 @@ static struct modcmd modcmd[] = {
 	{ "set32",	MOD_SET32 },
 	{ "SET64",	MOD_SET64 },
 	{ "set64",	MOD_SET64 },
+	{ "ZFSROOT",	MOD_ZFSROOT },
+	{ "zfsroot",	MOD_ZFSROOT },
 	{ NULL,		MOD_UNKNOWN }
 };
@@ -528,6 +530,7 @@ do_sysfile_cmd(struct _buf *file, const char *cmd)
 	 */
 	case MOD_ROOTFS:
 	case MOD_SWAPFS:
+	case MOD_ZFSROOT:
 		if ((token = kobj_lex(file, tok1, sizeof (tok1))) == COLON) {
 			token = kobj_lex(file, tok1, sizeof (tok1));
 		} else {
@@ -1520,7 +1523,10 @@ setparams()
 		(void) copystr(sysp->sys_ptr, bootobjp->bo_fstype,
 		    BO_MAXOBJNAME, NULL);
 		break;
-
+	case MOD_ZFSROOT:
+		(void) copystr(sysp->sys_ptr, zfs_bootpath,
+		    BO_MAXOBJNAME, NULL);
+		break;
 	default:
 		break;
 	}
diff --git a/usr/src/uts/common/os/policy.c b/usr/src/uts/common/os/policy.c
index fe4a5c82df..2e027b7ba5 100644
--- a/usr/src/uts/common/os/policy.c
+++ b/usr/src/uts/common/os/policy.c
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -1741,13 +1740,10 @@ secpolicy_contract_event_choice(const cred_t *cr)
 }

 /*
- * Name:	secpolicy_gart_access
- *
- * Normal:	Verify if the subject has sufficient priveleges to make ioctls
- *		to agpgart device
- *
- * Output:	EPERM - if not privileged
+ * secpolicy_gart_access
  *
+ * Determine if the subject has sufficient privileges to make ioctls to the
+ * agpgart device.
  */
 int
 secpolicy_gart_access(const cred_t *cr)
@@ -1756,13 +1752,10 @@ secpolicy_gart_access(const cred_t *cr)
 }

 /*
- * Name:	secpolicy_gart_map
- *
- * Normal:	Verify if the subject has sufficient privelegs to map aperture
- *		range through agpgart driver
- *
- * Output:	EPERM - if not privileged
+ * secpolicy_gart_map
  *
+ * Determine if the subject has sufficient privileges to map an aperture
+ * range through the agpgart driver.
  */
 int
 secpolicy_gart_map(const cred_t *cr)
@@ -1774,10 +1767,22 @@ secpolicy_gart_map(const cred_t *cr)
 }

 /*
+ * secpolicy_zinject
+ *
+ * Determine if the subject can inject faults in the ZFS fault injection
+ * framework. Requires all privileges.
+ */
+int
+secpolicy_zinject(const cred_t *cr)
+{
+	return (secpolicy_require_set(cr, PRIV_FULLSET, NULL));
+}
+
+/*
  * secpolicy_zfs
  *
- * Determine if the user has permission to manipulate ZFS datasets (not pools).
- * Equivalent to the SYS_MOUNT privilege.
+ * Determine if the subject has permission to manipulate ZFS datasets
+ * (not pools). Equivalent to the SYS_MOUNT privilege.
*/ int secpolicy_zfs(const cred_t *cr) diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index f82a933903..516ecc0a5a 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -657,6 +657,9 @@ FMHDRS= \ protocol.h \ util.h +FMFSHDRS= \ + zfs.h + FMIOHDRS= \ ddi.h \ pci.h \ @@ -914,6 +917,7 @@ CHECKHDRS= \ $(TAVORHDRS:%.h=ib/adapters/tavor/%.check) \ $(ISOHDRS:%.h=iso/%.check) \ $(FMHDRS:%.h=fm/%.check) \ + $(FMFSHDRS:%.h=fm/fs/%.check) \ $(FMIOHDRS:%.h=fm/io/%.check) \ $(FSHDRS:%.h=fs/%.check) \ $(LVMHDRS:%.h=lvm/%.check) \ @@ -949,6 +953,7 @@ CHECKHDRS= \ $(ROOTISOHDRS) \ $(ROOTFMHDRS) \ $(ROOTFMIOHDRS) \ + $(ROOTFMFSHDRS) \ $(ROOTFSHDRS) \ $(ROOTIBDHDRS) \ $(ROOTIBHDRS) \ @@ -992,7 +997,8 @@ install_h: \ $(ROOTDCAMHDRS) \ $(ROOTISOHDRS) \ $(ROOTFMHDRS) \ - $(ROOTFMIOHDRS) \ + $(ROOTFMFSHDRS) \ + $(ROOTFMIOHDRS) \ $(ROOTFSHDRS) \ $(ROOTIBDHDRS) \ $(ROOTIBHDRS) \ diff --git a/usr/src/uts/common/sys/Makefile.syshdrs b/usr/src/uts/common/sys/Makefile.syshdrs index cdc3436049..d9c363b48b 100644 --- a/usr/src/uts/common/sys/Makefile.syshdrs +++ b/usr/src/uts/common/sys/Makefile.syshdrs @@ -1,5 +1,5 @@ # -# Copyright 2005 Sun Microsystems, Inc. All rights reserved. +# Copyright 2006 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. # # ident "%Z%%M% %I% %E% SMI" @@ -18,10 +18,13 @@ av/%.check: av/%.h fm/%.check: fm/%.h $(DOT_H_CHECK) -fm/cpu/%.check: fm/cpu/%.h +fm/cpu/%.check: fm/cpu/%.h $(DOT_H_CHECK) -fm/io/%.check: fm/io/%.h +fm/fs/%.check: fm/fs/%.h + $(DOT_H_CHECK) + +fm/io/%.check: fm/io/%.h $(DOT_H_CHECK) fs/%.check: fs/%.h @@ -129,6 +132,7 @@ ROOTDIRS= \ $(ROOTDIR)/iso \ $(ROOTDIR)/fm \ $(ROOTDIR)/fm/cpu \ + $(ROOTDIR)/fm/fs \ $(ROOTDIR)/fm/io \ $(ROOTDIR)/fs \ $(ROOTDIR)/ib \ @@ -187,6 +191,7 @@ ROOTISOHDRS= $(ISOHDRS:%=$(ROOTDIR)/iso/%) ROOTFMHDRS= $(FMHDRS:%=$(ROOTDIR)/fm/%) ROOTFMCPUHDRS= $(FMCPUHDRS:%=$(ROOTDIR)/fm/cpu/%) ROOTFMIOHDRS= $(FMIOHDRS:%=$(ROOTDIR)/fm/io/%) +ROOTFMFSHDRS= $(FMFSHDRS:%=$(ROOTDIR)/fm/fs/%) ROOTFSHDRS= $(FSHDRS:%=$(ROOTDIR)/fs/%) diff --git a/usr/src/uts/common/sys/fm/fs/zfs.h b/usr/src/uts/common/sys/fm/fs/zfs.h new file mode 100644 index 0000000000..aa5c7ee0d7 --- /dev/null +++ b/usr/src/uts/common/sys/fm/fs/zfs.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_FM_FS_ZFS_H +#define _SYS_FM_FS_ZFS_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_ERROR_CLASS "fs.zfs" + +#define FM_EREPORT_ZFS_CHECKSUM "checksum" +#define FM_EREPORT_ZFS_IO "io" +#define FM_EREPORT_ZFS_DATA "data" +#define FM_EREPORT_ZFS_POOL "zpool" +#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" +#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" +#define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data" +#define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas" +#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum" +#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small" +#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label" + +#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" +#define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid" +#define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" +#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" + +#define FM_RESOURCE_OK "ok" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FM_FS_ZFS_H */ diff --git a/usr/src/uts/common/sys/fm/protocol.h b/usr/src/uts/common/sys/fm/protocol.h index 89b761ef6c..1afa67f66b 100644 --- a/usr/src/uts/common/sys/fm/protocol.h +++ b/usr/src/uts/common/sys/fm/protocol.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
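The FM_EREPORT_ZFS_* strings above are subclasses only; a full ereport class joins the generic "ereport" prefix, ZFS_ERROR_CLASS, and the subclass with dots. A minimal sketch of that assembly follows; the helper name and buffer handling are illustrative assumptions, not code from this change (FM_EREPORT_CLASS comes from <sys/fm/protocol.h>):

#include <sys/systm.h>
#include <sys/fm/protocol.h>
#include <sys/fm/fs/zfs.h>

/*
 * Hypothetical helper: joins the "ereport" prefix, the "fs.zfs" error
 * class, and a subclass such as FM_EREPORT_ZFS_CHECKSUM, yielding
 * "ereport.fs.zfs.checksum".
 */
static void
zfs_ereport_class(char *buf, size_t len, const char *subclass)
{
	(void) snprintf(buf, len, "%s.%s.%s",
	    FM_EREPORT_CLASS, ZFS_ERROR_CLASS, subclass);
}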
@@ -168,6 +167,7 @@ extern "C" { #define FM_FMRI_SCHEME_MOD "mod" #define FM_FMRI_SCHEME_PKG "pkg" #define FM_FMRI_SCHEME_LEGACY "legacy-hc" +#define FM_FMRI_SCHEME_ZFS "zfs" /* Scheme versions */ #define FMD_SCHEME_VERSION0 0 @@ -187,6 +187,8 @@ extern "C" { #define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0 #define LEGACY_SCHEME_VERSION0 0 #define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0 +#define ZFS_SCHEME_VERSION0 0 +#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0 /* hc scheme member names */ #define FM_FMRI_HC_SERIAL_ID "serial" @@ -253,6 +255,10 @@ extern "C" { #define FM_FMRI_MOD_ID "mod-id" #define FM_FMRI_MOD_DESC "mod-desc" +/* zfs scheme member names */ +#define FM_FMRI_ZFS_POOL "pool" +#define FM_FMRI_ZFS_VDEV "vdev" + extern nv_alloc_t *fm_nva_xcreate(char *, size_t); extern void fm_nva_xdestroy(nv_alloc_t *); @@ -277,6 +283,7 @@ extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *, const char *, uint64_t); extern void fm_authority_set(nvlist_t *, int, const char *, const char *, const char *, const char *); +extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t); extern uint64_t fm_ena_increment(uint64_t); extern uint64_t fm_ena_generate(uint64_t, uchar_t); diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index 65425c829c..0fa884dcaa 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -133,6 +133,8 @@ uint64_t zfs_prop_default_numeric(zfs_prop_t); #define ZPOOL_CONFIG_STATS "stats" #define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" #define ZPOOL_CONFIG_OFFLINE "offline" +#define ZPOOL_CONFIG_ERRCOUNT "error_count" +#define ZPOOL_CONFIG_NOT_PRESENT "not_present" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" @@ -304,9 +306,25 @@ typedef enum zfs_ioc { ZFS_IOC_ROLLBACK, ZFS_IOC_RENAME, ZFS_IOC_RECVBACKUP, - ZFS_IOC_SENDBACKUP + ZFS_IOC_SENDBACKUP, + ZFS_IOC_INJECT_FAULT, + ZFS_IOC_CLEAR_FAULT, + ZFS_IOC_INJECT_LIST_NEXT, + ZFS_IOC_ERROR_LOG, + ZFS_IOC_CLEAR, + ZFS_IOC_BOOKMARK_NAME } zfs_ioc_t; +/* + * Internal SPA load state. Used by FMA diagnosis engine. + */ +typedef enum { + SPA_LOAD_NONE, /* no load in progress */ + SPA_LOAD_OPEN, /* normal open */ + SPA_LOAD_IMPORT, /* import in progress */ + SPA_LOAD_TRYIMPORT /* tryimport in progress */ +} spa_load_state_t; + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/sys/kobj.h b/usr/src/uts/common/sys/kobj.h index 7d2bd0922e..9276aa370f 100644 --- a/usr/src/uts/common/sys/kobj.h +++ b/usr/src/uts/common/sys/kobj.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. 
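The zfs scheme members and the fm_fmri_zfs_set() prototype above pair with the os/fm.c implementation earlier in this change. A minimal usage sketch, where the wrapper function and GUID arguments are placeholders rather than commit code:

#include <sys/nvpair.h>
#include <sys/fm/protocol.h>

/*
 * Hypothetical wrapper: builds a zfs-scheme FMRI.  Per the os/fm.c
 * implementation above, the result carries version=0, scheme="zfs",
 * pool=<pool_guid>, and, when vdev_guid is nonzero, vdev=<vdev_guid>.
 * The caller frees it via fm_nvlist_destroy(fmri, FM_NVA_FREE).
 */
static nvlist_t *
zfs_fmri_create(uint64_t pool_guid, uint64_t vdev_guid)
{
	nvlist_t *fmri = fm_nvlist_create(NULL);

	fm_fmri_zfs_set(fmri, FM_ZFS_SCHEME_VERSION, pool_guid, vdev_guid);
	return (fmri);
}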
@@ -34,6 +33,7 @@

 #include <sys/machelf.h>
 #include <sys/vmem.h>
 #include <sys/sdt.h>
+#include <sys/bootstat.h>

 #ifdef __cplusplus
 extern "C" {
@@ -162,6 +162,7 @@ extern uintptr_t kobj_getsymvalue(char *, int);
 extern char *kobj_getsymname(uintptr_t, ulong_t *);
 extern char *kobj_searchsym(struct module *, uintptr_t, ulong_t *);

+extern int kobj_fstat(intptr_t, struct bootstat *);
 extern intptr_t kobj_open(char *);
 extern int kobj_path_exists(char *, int);
 extern struct _buf *kobj_open_path(char *, int, int);
diff --git a/usr/src/uts/common/sys/policy.h b/usr/src/uts/common/sys/policy.h
index 9653a58b0e..beabb63818 100644
--- a/usr/src/uts/common/sys/policy.h
+++ b/usr/src/uts/common/sys/policy.h
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -141,6 +140,7 @@ int secpolicy_vnode_setdac(const cred_t *, uid_t);
 int secpolicy_vnode_setid_retain(const cred_t *, boolean_t);
 int secpolicy_vnode_setids_setgids(const cred_t *, gid_t);
 int secpolicy_vnode_stky_modify(const cred_t *);
+int secpolicy_zinject(const cred_t *);
 int secpolicy_zfs(const cred_t *);
 void secpolicy_setid_clear(vattr_t *, cred_t *);
diff --git a/usr/src/uts/common/sys/sysconf.h b/usr/src/uts/common/sys/sysconf.h
index 4594d91287..654436a115 100644
--- a/usr/src/uts/common/sys/sysconf.h
+++ b/usr/src/uts/common/sys/sysconf.h
@@ -2,9 +2,8 @@
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
- * Common Development and Distribution License, Version 1.0 only
- * (the "License"). You may not use this file except in compliance
- * with the License.
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
@@ -20,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 1990-2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */

@@ -72,6 +71,7 @@ struct modcmd {
 #define	MOD_UNKNOWN	9	/* unknown command */
 #define	MOD_SET32	10	/* like MOD_SET but -only- on 32-bit kernel */
 #define	MOD_SET64	11	/* like MOD_SET but -only- on 64-bit kernel */
+#define	MOD_ZFSROOT	12	/* use zfs as the root filesystem */

 /*
  * Commands for mod_sysctl()
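secpolicy_zinject() deliberately demands the full privilege set, a higher bar than the SYS_MOUNT check used for dataset operations. A sketch of how a fault-injection ioctl might be gated; the handler shape is an assumption, and only the policy call comes from this change:

#include <sys/cred.h>
#include <sys/policy.h>

/*
 * Hypothetical gate for the new fault-injection ioctls: returns 0 for
 * fully privileged callers, EPERM otherwise (secpolicy_zinject() maps
 * the failure to EPERM itself via secpolicy_require_set()).
 */
static int
zinject_policy_check(cred_t *cr)
{
	return (secpolicy_zinject(cr));
}

The MOD_ZFSROOT command above is the sysconf.h side of the modsysfile.c parsing earlier in this change: an /etc/system line of the form "zfsroot: <bootpath>" is copied into zfs_bootpath for mountroot (the exact bootpath value format is not spelled out here).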

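Relating back to the spa_load_state_t enum added to sys/fs/zfs.h above: the enum is what the FMA diagnosis engine consumes, presumably via the pool_context ereport payload member defined in sys/fm/fs/zfs.h. A hypothetical pretty-printer for such a consumer:

#include <sys/fs/zfs.h>

/* Hypothetical helper: printable names for spa_load_state_t values. */
static const char *
spa_load_state_name(spa_load_state_t state)
{
	switch (state) {
	case SPA_LOAD_NONE:
		return ("none");
	case SPA_LOAD_OPEN:
		return ("open");
	case SPA_LOAD_IMPORT:
		return ("import");
	case SPA_LOAD_TRYIMPORT:
		return ("tryimport");
	}
	return ("unknown");
}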