author     Keith M Wesolowski <wesolows@foobazco.org>    2013-06-11 22:47:43 +0000
committer  Keith M Wesolowski <wesolows@foobazco.org>    2013-06-11 22:47:43 +0000
commit     80702ccb4f267d5bfae86b07731a8f89e724d055
tree       4bcba7532a1633ba6d708dbaf41c21af8876e7c0
parent     2ffdaec9c70166169b32efb691ef26af80d99263
parent     3cb69f734bc60bbb4d56a28c83706db862bec082
[illumos-gate merge]
commit 3cb69f734bc60bbb4d56a28c83706db862bec082
3749 zfs event processing should work on R/O root filesystems
commit b3d9f2e26021d3f55a281af30720589d303b9806
3747 txg commit callbacks don't work
commit 8b713775314bbbf24edd503b4869342d8711ce95
3745 zpool create should treat -O mountpoint and -m the same
3811 zpool create -o altroot=/xyz -O mountpoint=/mnt ignores the mountpoint option
commit fc7a6e3fefc649cb65c8e2a35d194781445008b0
3744 zfs shouldn't ignore errors unmounting snapshots
commit b287be1ba86043996f49b1cc34c80cc620f9b841
3743 zfs needs a refcount audit
commit f7170741490edba9d1d9c697c177c887172bc741
3742 zfs comments need cleaner, more consistent style
commit 3e30c24aeefdee1631958ecf17f18da671781956
3741 zfs needs better comments
commit 2ac302890e472bf0c11db192dd18f12ded6043f6
3797 AHCI: Support for ASMedia ASM106x
60 files changed, 806 insertions, 547 deletions
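Issues 3745 and 3811 above change how zpool create handles mountpoints: -m and -O mountpoint= are now treated identically, and the mountpoint property is applied as part of pool creation rather than set afterwards. A minimal sketch of the intended command-line behavior (the pool name "tank" and device "c0t0d0" are hypothetical):

    # With this merge, these two invocations should be equivalent:
    zpool create -m /mnt tank c0t0d0
    zpool create -O mountpoint=/mnt tank c0t0d0

    # And -O mountpoint is no longer ignored when an altroot is also given (3811):
    zpool create -o altroot=/xyz -O mountpoint=/mnt tank c0t0d0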
diff --git a/usr/src/cmd/mdb/intel/modules/sata/sata.c b/usr/src/cmd/mdb/intel/modules/sata/sata.c index 5a43653d30..b1618cd8d8 100644 --- a/usr/src/cmd/mdb/intel/modules/sata/sata.c +++ b/usr/src/cmd/mdb/intel/modules/sata/sata.c @@ -22,7 +22,9 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - +/* + * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + */ #include <sys/mdb_modapi.h> #include <mdb/mdb_ks.h> @@ -168,7 +170,7 @@ sata_dmsg_dump(sata_trace_dmsg_t *addr, int print_pathname, uint_t *printed) (void) mdb_ddi_pathname( (uintptr_t)dmsg.dip, pathname, sizeof (pathname)); - mdb_printf("\n[%s]", pathname); + mdb_printf("[%s]", pathname); } } } else { @@ -181,7 +183,7 @@ sata_dmsg_dump(sata_trace_dmsg_t *addr, int print_pathname, uint_t *printed) dmsg.buf); } - mdb_printf("%s", merge); + mdb_printf("%s\n", merge); if (printed != NULL) { (*printed)++; diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c index 310c33462a..7a133bf27e 100644 --- a/usr/src/cmd/zpool/zpool_main.c +++ b/usr/src/cmd/zpool/zpool_main.c @@ -678,6 +678,7 @@ zpool_do_create(int argc, char **argv) goto errout; break; case 'm': + /* Equivalent to -O mountpoint=optarg */ mountpoint = optarg; break; case 'o': @@ -716,8 +717,18 @@ zpool_do_create(int argc, char **argv) *propval = '\0'; propval++; - if (add_prop_list(optarg, propval, &fsprops, B_FALSE)) + /* + * Mountpoints are checked and then added later. + * Uniquely among properties, they can be specified + * more than once, to avoid conflict with -m. + */ + if (0 == strcmp(optarg, + zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { + mountpoint = propval; + } else if (add_prop_list(optarg, propval, &fsprops, + B_FALSE)) { goto errout; + } break; case ':': (void) fprintf(stderr, gettext("missing argument for " @@ -834,6 +845,18 @@ zpool_do_create(int argc, char **argv) } } + /* + * Now that the mountpoint's validity has been checked, ensure that + * the property is set appropriately prior to creating the pool. 
+ */ + if (mountpoint != NULL) { + ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), + mountpoint, &fsprops, B_FALSE); + if (ret != 0) + goto errout; + } + + ret = 1; if (dryrun) { /* * For a dry run invocation, print out a basic message and run @@ -868,21 +891,19 @@ zpool_do_create(int argc, char **argv) if (nvlist_exists(props, propname)) continue; - if (add_prop_list(propname, ZFS_FEATURE_ENABLED, - &props, B_TRUE) != 0) + ret = add_prop_list(propname, + ZFS_FEATURE_ENABLED, &props, B_TRUE); + if (ret != 0) goto errout; } } + + ret = 1; if (zpool_create(g_zfs, poolname, nvroot, props, fsprops) == 0) { zfs_handle_t *pool = zfs_open(g_zfs, poolname, ZFS_TYPE_FILESYSTEM); if (pool != NULL) { - if (mountpoint != NULL) - verify(zfs_prop_set(pool, - zfs_prop_to_name( - ZFS_PROP_MOUNTPOINT), - mountpoint) == 0); if (zfs_mount(pool, NULL, 0) == 0) ret = zfs_shareall(pool); zfs_close(pool); diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index ed460551c6..980615eae0 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -4507,7 +4507,7 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) */ tmp_cb = list_head(&zcl.zcl_callbacks); if (tmp_cb != NULL && - tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) { + (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) { fatal(0, "Commit callback threshold exceeded, oldest txg: %" PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); } diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c index 36ab907508..27656b7526 100644 --- a/usr/src/lib/libzfs/common/libzfs_dataset.c +++ b/usr/src/lib/libzfs/common/libzfs_dataset.c @@ -4473,6 +4473,11 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) return (err); } +/* + * Convert the zvol's volume size to an appropriate reservation. + * Note: If this routine is updated, it is necessary to update the ZFS test + * suite's shell version in reservation.kshlib. + */ uint64_t zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) { diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c index 8315e1404b..0fd5f5738c 100644 --- a/usr/src/lib/libzfs/common/libzfs_pool.c +++ b/usr/src/lib/libzfs/common/libzfs_pool.c @@ -1088,7 +1088,6 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, nvlist_t *zc_fsprops = NULL; nvlist_t *zc_props = NULL; char msg[1024]; - char *altroot; int ret = -1; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, @@ -1187,21 +1186,6 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, } } - /* - * If this is an alternate root pool, then we automatically set the - * mountpoint of the root dataset to be '/'. 
- */ - if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), - &altroot) == 0) { - zfs_handle_t *zhp; - - verify((zhp = zfs_open(hdl, pool, ZFS_TYPE_DATASET)) != NULL); - verify(zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), - "/") == 0); - - zfs_close(zhp); - } - create_failed: zcmd_free_nvlists(&zc); nvlist_free(zc_props); diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh index 2458cf350c..c2f3789891 100644..100755 --- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh @@ -103,12 +103,16 @@ do [[ "$mpt" != "$mpt_val" ]] && \ log_fail "The value of mountpoint property is different\ from the output of zfs mount" - if [[ "$opt" == "-R $TESTDIR1" ]] || [[ "$opt" == "-m $TESTDIR1" ]]; - then + if [[ "$opt" == "-m $TESTDIR1" ]]; then [[ ! -d $TESTDIR1 ]] && \ log_fail "$TESTDIR1 is not created auotmatically." [[ "$mpt" != "$TESTDIR1" ]] && \ log_fail "$TESTPOOL is not mounted on $TESTDIR1." + elif [[ "$opt" == "-R $TESTDIR1" ]]; then + [[ ! -d $TESTDIR1/$TESTPOOL ]] && \ + log_fail "$TESTDIR1/$TESTPOOL is not created auotmatically." + [[ "$mpt" != "$TESTDIR1/$TESTPOOL" ]] && \ + log_fail "$TESTPOOL is not mounted on $TESTDIR1/$TESTPOOL." else [[ ! -d ${TESTDIR1}$TESTDIR1 ]] && \ log_fail "${TESTDIR1}$TESTDIR1 is not created automatically." diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 81ee2e3ff3..67847d2d99 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -59,11 +59,11 @@ * tight. * * 3. The Megiddo and Modha model assumes a fixed page size. All - * elements of the cache are therefor exactly the same size. So + * elements of the cache are therefore exactly the same size. So * when adjusting the cache size following a cache miss, its simply * a matter of choosing a single page to evict. In our model, we * have variable sized cache blocks (rangeing from 512 bytes to - * 128K bytes). We therefor choose a set of blocks to evict to make + * 128K bytes). We therefore choose a set of blocks to evict to make * space for a cache miss that approximates as closely as possible * the space used by the new block. * @@ -78,7 +78,7 @@ * ways: 1) via a hash table lookup using the DVA as a key, * or 2) via one of the ARC lists. The arc_read() interface * uses method 1, while the internal arc algorithms for - * adjusting the cache use method 2. We therefor provide two + * adjusting the cache use method 2. We therefore provide two * types of locks: 1) the hash table lock array, and 2) the * arc list locks. * @@ -258,7 +258,18 @@ typedef struct arc_stats { kstat_named_t arcstat_mfu_ghost_hits; kstat_named_t arcstat_deleted; kstat_named_t arcstat_recycle_miss; + /* + * Number of buffers that could not be evicted because the hash lock + * was held by another thread. The lock may not necessarily be held + * by something using the same buffer, since hash locks are shared + * by multiple buffers. + */ kstat_named_t arcstat_mutex_miss; + /* + * Number of buffers skipped because they have I/O in progress, are + * indrect prefetch buffers that have not lived long enough, or are + * not from the spa we're trying to evict from. 
+ */ kstat_named_t arcstat_evict_skip; kstat_named_t arcstat_evict_l2_cached; kstat_named_t arcstat_evict_l2_eligible; @@ -376,7 +387,7 @@ static arc_stats_t arc_stats = { #define ARCSTAT(stat) (arc_stats.stat.value.ui64) #define ARCSTAT_INCR(stat, val) \ - atomic_add_64(&arc_stats.stat.value.ui64, (val)); + atomic_add_64(&arc_stats.stat.value.ui64, (val)) #define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) @@ -604,9 +615,7 @@ uint64_t zfs_crc64_table[256]; #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) -/* - * L2ARC Performance Tunables - */ +/* L2ARC Performance Tunables */ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ @@ -3001,6 +3010,10 @@ top: mutex_exit(hash_lock); + /* + * At this point, we have a level 1 cache miss. Try again in + * L2ARC if possible. + */ ASSERT3U(hdr->b_size, ==, size); DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, uint64_t, size, zbookmark_t *, zb); @@ -3243,8 +3256,8 @@ arc_buf_evict(arc_buf_t *buf) } /* - * Release this buffer from the cache. This must be done - * after a read and prior to modifying the buffer contents. + * Release this buffer from the cache, making it an anonymous buffer. This + * must be done after a read and prior to modifying the buffer contents. * If the buffer has more than one reference, we must make * a new hdr for the buffer. */ @@ -3633,7 +3646,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) /* * Writes will, almost always, require additional memory allocations - * in order to compress/encrypt/etc the data. We therefor need to + * in order to compress/encrypt/etc the data. We therefore need to * make sure that there is sufficient available memory for this. */ if (error = arc_memory_throttle(reserve, anon_size, txg)) diff --git a/usr/src/uts/common/fs/zfs/bptree.c b/usr/src/uts/common/fs/zfs/bptree.c index 73922db88b..a0c90cc4d9 100644 --- a/usr/src/uts/common/fs/zfs/bptree.c +++ b/usr/src/uts/common/fs/zfs/bptree.c @@ -43,7 +43,7 @@ * dsl_scan_sync. This allows the delete operation to finish without traversing * all the dataset's blocks. * - * Note that while bt_begin and bt_end are only ever incremented in this code + * Note that while bt_begin and bt_end are only ever incremented in this code, * they are effectively reset to 0 every time the entire bptree is freed because * the bptree's object is destroyed and re-created. */ diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index f4c1904543..9f4c8a8e35 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -653,6 +653,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) if (!havepzio) err = zio_wait(zio); } else { + /* + * Another reader came in while the dbuf was in flight + * between UNCACHED and CACHED. Either a writer will finish + * writing the buffer (sending the dbuf to CACHED) or the + * first reader's request will reach the read_done callback + * and send the dbuf to CACHED. Otherwise, a failure + * occurred and the dbuf went to UNCACHED. 
+ */ mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, @@ -661,6 +669,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); + /* Skip the wait per the caller's request. */ mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { while (db->db_state == DB_READ || @@ -1276,7 +1285,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } /* - * Return TRUE if this evicted the dbuf. + * Undirty a buffer in the transaction group referenced by the given + * transaction. Return whether this evicted the dbuf. */ static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) @@ -2237,6 +2247,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(db->db_level > 0); DBUF_VERIFY(db); + /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); @@ -2247,10 +2258,12 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) DB_DNODE_ENTER(db); dn = DB_DNODE(db); + /* Indirect block size must match what the dnode thinks it is. */ ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); DB_DNODE_EXIT(db); + /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); @@ -2637,6 +2650,7 @@ dbuf_write_override_done(zio_t *zio) dbuf_write_done(zio, NULL, db); } +/* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { @@ -2671,11 +2685,19 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) } if (parent != dn->dn_dbuf) { + /* Our parent is an indirect block. */ + /* We have a dirty parent that has been scheduled for write. */ ASSERT(parent && parent->db_data_pending); + /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); + /* + * We're about to modify our parent's db_data by modifying + * our block pointer, so the parent must be released. + */ ASSERT(arc_released(parent->db_buf)); zio = parent->db_data_pending->dr_zio; } else { + /* Our parent is the dnode itself. */ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index a616fd37cf..a3640fd593 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -1827,7 +1827,7 @@ dmu_init(void) void dmu_fini(void) { - arc_fini(); + arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); zfetch_fini(); dbuf_fini(); diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c index ad4084021d..e30c6d345e 100644 --- a/usr/src/uts/common/fs/zfs/dmu_tx.c +++ b/usr/src/uts/common/fs/zfs/dmu_tx.c @@ -1016,6 +1016,10 @@ dmu_tx_unassign(dmu_tx_t *tx) txg_rele_to_quiesce(&tx->tx_txgh); + /* + * Walk the transaction's hold list, removing the hold on the + * associated dnode, and notifying waiters if the refcount drops to 0. + */ for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; @@ -1128,6 +1132,10 @@ dmu_tx_commit(dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); + /* + * Go through the transaction's hold list and remove holds on + * associated dnodes, notifying waiters if no holds remain. 
+ */ while (txh = list_head(&tx->tx_holds)) { dnode_t *dn = txh->txh_dnode; diff --git a/usr/src/uts/common/fs/zfs/dmu_zfetch.c b/usr/src/uts/common/fs/zfs/dmu_zfetch.c index 37037c30f6..2ebfa183aa 100644 --- a/usr/src/uts/common/fs/zfs/dmu_zfetch.c +++ b/usr/src/uts/common/fs/zfs/dmu_zfetch.c @@ -48,11 +48,11 @@ uint32_t zfetch_block_cap = 256; uint64_t zfetch_array_rd_sz = 1024 * 1024; /* forward decls for static routines */ -static int dmu_zfetch_colinear(zfetch_t *, zstream_t *); +static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *); static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *); static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t); static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t); -static int dmu_zfetch_find(zfetch_t *, zstream_t *, int); +static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int); static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *); static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); @@ -104,9 +104,9 @@ kstat_t *zfetch_ksp; * last stream, then we are probably in a strided access pattern. So * combine the two sequential streams into a single strided stream. * - * If no co-linear streams are found, return NULL. + * Returns whether co-linear streams were found. */ -static int +static boolean_t dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh) { zstream_t *z_walk; @@ -326,7 +326,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) * for this block read. If so, it starts a prefetch for the stream it * located and returns true, otherwise it returns false */ -static int +static boolean_t dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) { zstream_t *zs; @@ -639,7 +639,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) { zstream_t zst; zstream_t *newstream; - int fetched; + boolean_t fetched; int inserted; unsigned int blkshft; uint64_t blksz; @@ -665,7 +665,8 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) ZFETCHSTAT_BUMP(zfetchstat_hits); } else { ZFETCHSTAT_BUMP(zfetchstat_misses); - if (fetched = dmu_zfetch_colinear(zf, &zst)) { + fetched = dmu_zfetch_colinear(zf, &zst); + if (fetched) { ZFETCHSTAT_BUMP(zfetchstat_colinear_hits); } else { ZFETCHSTAT_BUMP(zfetchstat_colinear_misses); diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c index 417e219b88..92996a6f12 100644 --- a/usr/src/uts/common/fs/zfs/dnode.c +++ b/usr/src/uts/common/fs/zfs/dnode.c @@ -1803,14 +1803,16 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) } /* - * This function scans a block at the indicated "level" looking for - * a hole or data (depending on 'flags'). If level > 0, then we are - * scanning an indirect block looking at its pointers. If level == 0, - * then we are looking at a block of dnodes. If we don't find what we - * are looking for in the block, we return ESRCH. Otherwise, return - * with *offset pointing to the beginning (if searching forwards) or - * end (if searching backwards) of the range covered by the block - * pointer we matched on (or dnode). + * Scans a block at the indicated "level" looking for a hole or data, + * depending on 'flags'. + * + * If level > 0, then we are scanning an indirect block looking at its + * pointers. If level == 0, then we are looking at a block of dnodes. + * + * If we don't find what we are looking for in the block, we return ESRCH. 
+ * Otherwise, return with *offset pointing to the beginning (if searching + * forwards) or end (if searching backwards) of the range covered by the + * block pointer we matched on (or dnode). * * The basic search algorithm used below by dnode_next_offset() is to * use this function to search up the block tree (widen the search) until diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c index 7d47ce02b4..2a1094be24 100644 --- a/usr/src/uts/common/fs/zfs/dnode_sync.c +++ b/usr/src/uts/common/fs/zfs/dnode_sync.c @@ -302,7 +302,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc, } /* - * free_range: Traverse the indicated range of the provided file + * Traverse the indicated range of the provided file * and "free" all the blocks contained there. */ static void @@ -370,7 +370,7 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) } /* - * Try to kick all the dnodes dbufs out of the cache... + * Try to kick all the dnode's dbufs out of the cache... */ void dnode_evict_dbufs(dnode_t *dn) diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c index bfc8b06d03..d59b6fa052 100644 --- a/usr/src/uts/common/fs/zfs/dsl_dataset.c +++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c @@ -356,8 +356,10 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, /* Make sure dsobj has the correct object type. */ dmu_object_info_from_db(dbuf, &doi); - if (doi.doi_type != DMU_OT_DSL_DATASET) + if (doi.doi_type != DMU_OT_DSL_DATASET) { + dmu_buf_rele(dbuf, tag); return (SET_ERROR(EINVAL)); + } ds = dmu_buf_get_user(dbuf); if (ds == NULL) { diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c index 8e0e089448..2eada5cd16 100644 --- a/usr/src/uts/common/fs/zfs/dsl_prop.c +++ b/usr/src/uts/common/fs/zfs/dsl_prop.c @@ -380,7 +380,7 @@ dsl_prop_predict(dsl_dir_t *dd, const char *propname, /* * Unregister this callback. Return 0 on success, ENOENT if ddname is - * invalid, ENOMSG if no matching callback registered. + * invalid, or ENOMSG if no matching callback registered. */ int dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, diff --git a/usr/src/uts/common/fs/zfs/dsl_userhold.c b/usr/src/uts/common/fs/zfs/dsl_userhold.c index fa9d937085..568bba33b5 100644 --- a/usr/src/uts/common/fs/zfs/dsl_userhold.c +++ b/usr/src/uts/common/fs/zfs/dsl_userhold.c @@ -433,7 +433,7 @@ dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, const char *htag) dsl_dataset_name(ds, name); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); - zfs_unmount_snap(name); + (void) zfs_unmount_snap(name); } else { dsl_pool_config_exit(dp, FTAG); } diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c index 996f6e1443..6a87231c7b 100644 --- a/usr/src/uts/common/fs/zfs/sa.c +++ b/usr/src/uts/common/fs/zfs/sa.c @@ -111,6 +111,7 @@ * location. * * Byteswap implications: + * * Since the SA attributes are not entirely self describing we can't do * the normal byteswap processing. The special ZAP layout attribute and * attribute registration attributes define the byteswap function and the @@ -189,7 +190,6 @@ sa_attr_reg_t sa_legacy_attrs[] = { }; /* - * ZPL legacy layout * This is only used for objects of type DMU_OT_ZNODE */ sa_attr_type_t sa_legacy_zpl_layout[] = { @@ -199,7 +199,6 @@ sa_attr_type_t sa_legacy_zpl_layout[] = { /* * Special dummy layout used for buffers with no attributes. 
*/ - sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; static int sa_legacy_attr_count = 16; diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 0fc3e66904..7334d39516 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -26,6 +26,8 @@ */ /* + * SPA: Storage Pool Allocator + * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. @@ -77,6 +79,12 @@ #include "zfs_prop.h" #include "zfs_comutil.h" +/* + * The interval, in seconds, at which failed configuration cache file writes + * should be retried. + */ +static int zfs_ccw_retry_interval = 300; + typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_ONLINE_PERCENT, /* value is % of online CPUs */ @@ -4514,6 +4522,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) /* * Detach a device from a mirror or replacing vdev. + * * If 'replace_done' is specified, only detach if the parent * is a replacing vdev. */ @@ -5168,11 +5177,9 @@ spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) * the spa_vdev_config_[enter/exit] functions which allow us to * grab and release the spa_config_lock while still holding the namespace * lock. During each step the configuration is synced out. - */ - -/* - * Remove a device from the pool. Currently, this supports removing only hot - * spares, slogs, and level 2 ARC devices. + * + * Currently, this supports removing only hot spares, slogs, and level 2 ARC + * devices. */ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) @@ -5282,7 +5289,7 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) /* * Find any device that's done replacing, or a vdev marked 'unspare' that's - * current spared, so we can detach it. + * currently spared, so we can detach it. 
*/ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) @@ -5661,13 +5668,34 @@ spa_async_resume(spa_t *spa) mutex_exit(&spa->spa_async_lock); } +static boolean_t +spa_async_tasks_pending(spa_t *spa) +{ + uint_t non_config_tasks; + uint_t config_task; + boolean_t config_task_suspended; + + non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; + config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; + if (spa->spa_ccw_fail_time == 0) { + config_task_suspended = B_FALSE; + } else { + config_task_suspended = + (gethrtime() - spa->spa_ccw_fail_time) < + (zfs_ccw_retry_interval * NANOSEC); + } + + return (non_config_tasks || (config_task && !config_task_suspended)); +} + static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); - if (spa->spa_async_tasks && !spa->spa_async_suspended && + if (spa_async_tasks_pending(spa) && + !spa->spa_async_suspended && spa->spa_async_thread == NULL && - rootdir != NULL && !vn_is_readonly(rootdir)) + rootdir != NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c index b113ce9e0c..d97fc32fbf 100644 --- a/usr/src/uts/common/fs/zfs/spa_config.c +++ b/usr/src/uts/common/fs/zfs/spa_config.c @@ -26,6 +26,7 @@ */ #include <sys/spa.h> +#include <sys/fm/fs/zfs.h> #include <sys/spa_impl.h> #include <sys/nvpair.h> #include <sys/uio.h> @@ -140,7 +141,7 @@ out: kobj_close_file(file); } -static void +static int spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) { size_t buflen; @@ -148,13 +149,14 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) vnode_t *vp; int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX; char *temp; + int err; /* * If the nvlist is empty (NULL), then remove the old cachefile. */ if (nvl == NULL) { - (void) vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); - return; + err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE); + return (err); } /* @@ -175,12 +177,14 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) */ (void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path); - if (vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) == 0) { - if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, - 0, RLIM64_INFINITY, kcred, NULL) == 0 && - VOP_FSYNC(vp, FSYNC, kcred, NULL) == 0) { - (void) vn_rename(temp, dp->scd_path, UIO_SYSSPACE); - } + err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0); + if (err == 0) { + err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE, + 0, RLIM64_INFINITY, kcred, NULL); + if (err == 0) + err = VOP_FSYNC(vp, FSYNC, kcred, NULL); + if (err == 0) + err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE); (void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL); VN_RELE(vp); } @@ -189,6 +193,7 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl) kmem_free(buf, buflen); kmem_free(temp, MAXPATHLEN); + return (err); } /* @@ -200,6 +205,8 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) { spa_config_dirent_t *dp, *tdp; nvlist_t *nvl; + boolean_t ccw_failure; + int error; ASSERT(MUTEX_HELD(&spa_namespace_lock)); @@ -211,6 +218,7 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) * cachefile is changed, the new one is pushed onto this list, allowing * us to update previous cachefiles that no longer contain this pool. 
*/ + ccw_failure = B_FALSE; for (dp = list_head(&target->spa_config_list); dp != NULL; dp = list_next(&target->spa_config_list, dp)) { spa_t *spa = NULL; @@ -251,10 +259,32 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) mutex_exit(&spa->spa_props_lock); } - spa_config_write(dp, nvl); + error = spa_config_write(dp, nvl); + if (error != 0) + ccw_failure = B_TRUE; nvlist_free(nvl); } + if (ccw_failure) { + /* + * Keep trying so that configuration data is + * written if/when any temporary filesystem + * resource issues are resolved. + */ + if (target->spa_ccw_fail_time == 0) { + zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE, + target, NULL, NULL, 0, 0); + } + target->spa_ccw_fail_time = gethrtime(); + spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE); + } else { + /* + * Do not rate limit future attempts to update + * the config cache. + */ + target->spa_ccw_fail_time = 0; + } + /* * Remove any config entries older than the current one. */ @@ -317,6 +347,7 @@ spa_config_set(spa_t *spa, nvlist_t *config) /* * Generate the pool's configuration based on the current in-core state. + * * We infer whether to generate a complete config or just one top-level config * based on whether vd is the root vdev. */ diff --git a/usr/src/uts/common/fs/zfs/spa_errlog.c b/usr/src/uts/common/fs/zfs/spa_errlog.c index 9152846d6e..0dd6c7a489 100644 --- a/usr/src/uts/common/fs/zfs/spa_errlog.c +++ b/usr/src/uts/common/fs/zfs/spa_errlog.c @@ -183,8 +183,10 @@ process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count) if (copyout(&zb, (char *)addr + (*count - 1) * sizeof (zbookmark_t), - sizeof (zbookmark_t)) != 0) + sizeof (zbookmark_t)) != 0) { + zap_cursor_fini(&zc); return (SET_ERROR(EFAULT)); + } *count -= 1; } diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index e57d8ab143..2b8a071cb0 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -1334,7 +1334,7 @@ zfs_panic_recover(const char *fmt, ...) /* * This is a stripped-down version of strtoull, suitable only for converting - * lowercase hexidecimal numbers that don't overflow. + * lowercase hexadecimal numbers that don't overflow. 
*/ uint64_t strtonum(const char *str, char **nptr) diff --git a/usr/src/uts/common/fs/zfs/sys/ddt.h b/usr/src/uts/common/fs/zfs/sys/ddt.h index 9724d6eceb..771610677e 100644 --- a/usr/src/uts/common/fs/zfs/sys/ddt.h +++ b/usr/src/uts/common/fs/zfs/sys/ddt.h @@ -63,16 +63,15 @@ enum ddt_class { */ typedef struct ddt_key { zio_cksum_t ddk_cksum; /* 256-bit block checksum */ - uint64_t ddk_prop; /* LSIZE, PSIZE, compression */ + /* + * Encoded with logical & physical size, and compression, as follows: + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | 0 | 0 | 0 | comp | PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + */ + uint64_t ddk_prop; } ddt_key_t; -/* - * ddk_prop layout: - * - * +-------+-------+-------+-------+-------+-------+-------+-------+ - * | 0 | 0 | 0 | comp | PSIZE | LSIZE | - * +-------+-------+-------+-------+-------+-------+-------+-------+ - */ #define DDK_GET_LSIZE(ddk) \ BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) #define DDK_SET_LSIZE(ddk, x) \ diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 1366a998fd..6e07a156dc 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -409,6 +409,8 @@ void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release your hold with dmu_buf_rele(). + * + * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); @@ -664,8 +666,14 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); +/* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); +/* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); +/* + * Like dmu_object_info_from_db, but faster still when you only care about + * the size. This is specifically optimized for zfs_getattr(). + */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h index 9f9134d8cd..c3de03d369 100644 --- a/usr/src/uts/common/fs/zfs/sys/dnode.h +++ b/usr/src/uts/common/fs/zfs/sys/dnode.h @@ -145,9 +145,8 @@ typedef struct dnode_phys { typedef struct dnode { /* - * dn_struct_rwlock protects the structure of the dnode, - * including the number of levels of indirection (dn_nlevels), - * dn_maxblkid, and dn_next_* + * Protects the structure of the dnode, including the number of levels + * of indirection (dn_nlevels), dn_maxblkid, and dn_next_* */ krwlock_t dn_struct_rwlock; diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h index b0160edfb1..d3b411ba57 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h @@ -110,6 +110,7 @@ typedef struct dsl_pool { /* * Protects administrative changes (properties, namespace) + * * It is only held for write in syncing context. 
Therefore * syncing context does not need to ever have it for read, since * nobody else could possibly have it for write. diff --git a/usr/src/uts/common/fs/zfs/sys/sa_impl.h b/usr/src/uts/common/fs/zfs/sys/sa_impl.h index 8ae05ce364..582bd76f01 100644 --- a/usr/src/uts/common/fs/zfs/sys/sa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/sa_impl.h @@ -150,6 +150,7 @@ struct sa_os { /* * header for all bonus and spill buffers. + * * The header has a fixed portion with a variable number * of "lengths" depending on the number of variable sized * attribues which are determined by the "layout number" @@ -158,29 +159,27 @@ struct sa_os { #define SA_MAGIC 0x2F505A /* ZFS SA */ typedef struct sa_hdr_phys { uint32_t sa_magic; - uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */ + /* + * Encoded with hdrsize and layout number as follows: + * 16 10 0 + * +--------+-------+ + * | hdrsz |layout | + * +--------+-------+ + * + * Bits 0-10 are the layout number + * Bits 11-16 are the size of the header. + * The hdrsize is the number * 8 + * + * For example. + * hdrsz of 1 ==> 8 byte header + * 2 ==> 16 byte header + * + */ + uint16_t sa_layout_info; uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */ /* ... Data follows the lengths. */ } sa_hdr_phys_t; -/* - * sa_hdr_phys -> sa_layout_info - * - * 16 10 0 - * +--------+-------+ - * | hdrsz |layout | - * +--------+-------+ - * - * Bits 0-10 are the layout number - * Bits 11-16 are the size of the header. - * The hdrsize is the number * 8 - * - * For example. - * hdrsz of 1 ==> 8 byte header - * 2 ==> 16 byte header - * - */ - #define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) #define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0) #define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 983103e386..66ea159475 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -238,8 +238,9 @@ struct spa { uint64_t spa_deadman_synctime; /* deadman expiration timer */ kmutex_t spa_iokstat_lock; /* protects spa_iokstat_* */ struct kstat *spa_iokstat; /* kstat of io to this pool */ + hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ /* - * spa_refcnt & spa_config_lock must be the last elements + * spa_refcount & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. * In order for the MDB module to function correctly, the other * fields must remain in the same location. diff --git a/usr/src/uts/common/fs/zfs/sys/space_map.h b/usr/src/uts/common/fs/zfs/sys/space_map.h index 64223daf62..c0070da670 100644 --- a/usr/src/uts/common/fs/zfs/sys/space_map.h +++ b/usr/src/uts/common/fs/zfs/sys/space_map.h @@ -94,7 +94,6 @@ struct space_map_ops { * 63 62 60 59 50 49 0 * * - * * non-debug entry * * 1 47 1 15 diff --git a/usr/src/uts/common/fs/zfs/sys/unique.h b/usr/src/uts/common/fs/zfs/sys/unique.h index 2ef3093edf..d4ba32e5c6 100644 --- a/usr/src/uts/common/fs/zfs/sys/unique.h +++ b/usr/src/uts/common/fs/zfs/sys/unique.h @@ -26,8 +26,6 @@ #ifndef _SYS_UNIQUE_H #define _SYS_UNIQUE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #ifdef __cplusplus @@ -42,7 +40,7 @@ void unique_fini(void); /* * Return a new unique value (which will not be uniquified against until - * it is unique_insert()-ed. + * it is unique_insert()-ed). 
*/ uint64_t unique_create(void); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index c599c549ac..02e3e838c3 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -246,12 +246,13 @@ typedef struct vdev_label { #define VDD_METASLAB 0x01 #define VDD_DTL 0x02 +/* Offset of embedded boot loader region on each label */ +#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) /* - * Size and offset of embedded boot loader region on each label. + * Size of embedded boot loader region on each label. * The total size of the first two labels plus the boot area is 4MB. */ -#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t)) -#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ +#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */ /* * Size of label regions at the start and end of each leaf device. @@ -318,8 +319,9 @@ extern uint64_t vdev_get_min_asize(vdev_t *vd); extern void vdev_set_min_asize(vdev_t *vd); /* - * zdb uses this tunable, so it must be declared here to make lint happy. + * Global variables */ +/* zdb uses this tunable, so it must be declared here to make lint happy. */ extern int zfs_vdev_cache_size; /* diff --git a/usr/src/uts/common/fs/zfs/sys/zap.h b/usr/src/uts/common/fs/zfs/sys/zap.h index 1e975e99e0..20a66edf85 100644 --- a/usr/src/uts/common/fs/zfs/sys/zap.h +++ b/usr/src/uts/common/fs/zfs/sys/zap.h @@ -86,18 +86,22 @@ extern "C" { #endif /* - * The matchtype specifies which entry will be accessed. - * MT_EXACT: only find an exact match (non-normalized) - * MT_FIRST: find the "first" normalized (case and Unicode - * form) match; the designated "first" match will not change as long - * as the set of entries with this normalization doesn't change - * MT_BEST: if there is an exact match, find that, otherwise find the - * first normalized match + * Specifies matching criteria for ZAP lookups. */ typedef enum matchtype { + /* Only find an exact match (non-normalized) */ MT_EXACT, + /* + * If there is an exact match, find that, otherwise find the + * first normalized match. + */ MT_BEST, + /* + * Find the "first" normalized (case and Unicode form) match; + * the designated "first" match will not change as long as the + * set of entries with this normalization doesn't change. + */ MT_FIRST } matchtype_t; @@ -174,16 +178,21 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); * call will fail and return EINVAL. * * If 'integer_size' is equal to or larger than the attribute's integer - * size, the call will succeed and return 0. * When converting to a - * larger integer size, the integers will be treated as unsigned (ie. no - * sign-extension will be performed). + * size, the call will succeed and return 0. + * + * When converting to a larger integer size, the integers will be treated as + * unsigned (ie. no sign-extension will be performed). * * 'num_integers' is the length (in integers) of 'buf'. * * If the attribute is longer than the buffer, as many integers as will * fit will be transferred to 'buf'. If the entire attribute was not * transferred, the call will return EOVERFLOW. - * + */ +int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf); + +/* * If rn_len is nonzero, realname will be set to the name of the found * entry (which may be different from the requested name if matchtype is * not MT_EXACT). 
@@ -191,8 +200,6 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); * If normalization_conflictp is not NULL, it will be set if there is * another name with the same case/unicode normalized form. */ -int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, - uint64_t integer_size, uint64_t num_integers, void *buf); int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, diff --git a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h index 3a33636741..f6947a72d7 100644 --- a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h +++ b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h @@ -101,6 +101,7 @@ typedef enum zap_chunk_type { */ typedef struct zap_leaf_phys { struct zap_leaf_header { + /* Public to ZAP */ uint64_t lh_block_type; /* ZBT_LEAF */ uint64_t lh_pad1; uint64_t lh_prefix; /* hash prefix of this leaf */ @@ -109,8 +110,7 @@ typedef struct zap_leaf_phys { uint16_t lh_nentries; /* number of entries */ uint16_t lh_prefix_len; /* num bits used to id this */ -/* above is accessable to zap, below is zap_leaf private */ - + /* Private to zap_leaf */ uint16_t lh_freelist; /* chunk head of free list */ uint8_t lh_flags; /* ZLF_* flags */ uint8_t lh_pad2[11]; @@ -161,13 +161,13 @@ typedef struct zap_leaf { typedef struct zap_entry_handle { - /* below is set by zap_leaf.c and is public to zap.c */ + /* Set by zap_leaf and public to ZAP */ uint64_t zeh_num_integers; uint64_t zeh_hash; uint32_t zeh_cd; uint8_t zeh_integer_size; - /* below is private to zap_leaf.c */ + /* Private to zap_leaf */ uint16_t zeh_fakechunk; uint16_t *zeh_chunkp; zap_leaf_t *zeh_leaf; @@ -202,7 +202,7 @@ extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh, /* * Replace the value of an existing entry. * - * zap_entry_update may fail if it runs out of space (ENOSPC). + * May fail if it runs out of space (ENOSPC). */ extern int zap_entry_update(zap_entry_handle_t *zeh, uint8_t integer_size, uint64_t num_integers, const void *buf); @@ -221,10 +221,7 @@ extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd, uint8_t integer_size, uint64_t num_integers, const void *buf, zap_entry_handle_t *zeh); -/* - * Return true if there are additional entries with the same normalized - * form. - */ +/* Determine whether there is another entry with the same normalized form. */ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, struct zap_name *zn, const char *name, struct zap *zap); diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h index d1a64180d5..4eefdc563f 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h @@ -46,7 +46,8 @@ struct znode_phys; #define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID /* - * ZFS ACLs are store in various forms. + * ZFS ACLs (Access Control Lists) are stored in various forms. + * * Files created with ACL version ZFS_ACL_VERSION_INITIAL * will all be created with fixed length ACEs of type * zfs_oldace_t. 
@@ -136,8 +137,8 @@ typedef struct acl_ops { size_t (*ace_size)(void *acep); /* how big is this ace */ size_t (*ace_abstract_size)(void); /* sizeof abstract entry */ int (*ace_mask_off)(void); /* off of access mask in ace */ + /* ptr to data if any */ int (*ace_data)(void *acep, void **datap); - /* ptr to data if any */ } acl_ops_t; /* diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h index 874d422568..9422177023 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -344,7 +344,7 @@ extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int zfs_busy(void); -extern void zfs_unmount_snap(const char *); +extern int zfs_unmount_snap(const char *); extern void zfs_destroy_unmount_origin(const char *); /* diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h b/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h index f302b663e2..93733ba8a2 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h @@ -26,8 +26,6 @@ #ifndef _SYS_FS_ZFS_RLOCK_H #define _SYS_FS_ZFS_RLOCK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -57,16 +55,14 @@ typedef struct rl { } rl_t; /* - * Lock a range (offset, length) as either shared (READER) - * or exclusive (WRITER or APPEND). APPEND is a special type that - * is converted to WRITER that specified to lock from the start of the - * end of file. zfs_range_lock() returns the range lock structure. + * Lock a range (offset, length) as either shared (RL_READER) + * or exclusive (RL_WRITER or RL_APPEND). RL_APPEND is a special type that + * is converted to RL_WRITER that specified to lock from the start of the + * end of file. Returns the range lock structure. */ rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type); -/* - * Unlock range and destroy range lock structure. - */ +/* Unlock range and destroy range lock structure. */ void zfs_range_unlock(rl_t *rl); /* @@ -76,7 +72,8 @@ void zfs_range_unlock(rl_t *rl); void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len); /* - * AVL comparison function used to compare range locks + * AVL comparison function used to order range locks + * Locks are ordered on the start offset of the range. */ int zfs_range_compare(const void *arg1, const void *arg2); diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h index cf0bbee2ca..43986afda2 100644 --- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h +++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h @@ -138,8 +138,9 @@ extern "C" { #define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) -/* Path component length */ /* + * Path component length + * * The generic fs code uses MAXNAMELEN to represent * what the largest component length is. Unfortunately, * this length includes the terminating NULL. ZFS needs @@ -234,11 +235,7 @@ typedef struct znode { #define ZTOV(ZP) ((ZP)->z_vnode) #define VTOZ(VP) ((znode_t *)(VP)->v_data) -/* - * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation. - * ZFS_EXIT() must be called before exitting the vop. - * ZFS_VERIFY_ZP() verifies the znode is valid. 
- */ +/* Called on entry to each ZFS vnode and vfs operation */ #define ZFS_ENTER(zfsvfs) \ { \ rrw_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ @@ -248,8 +245,10 @@ typedef struct znode { } \ } +/* Must be called before exiting the vop */ #define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG) +/* Verifies the znode is valid */ #define ZFS_VERIFY_ZP(zp) \ if ((zp)->z_sa_hdl == NULL) { \ ZFS_EXIT((zp)->z_zfsvfs); \ @@ -269,15 +268,14 @@ typedef struct znode { #define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) -/* - * Macros to encode/decode ZFS stored time values from/to struct timespec - */ +/* Encode ZFS stored time values from a struct timespec */ #define ZFS_TIME_ENCODE(tp, stmp) \ { \ (stmp)[0] = (uint64_t)(tp)->tv_sec; \ (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ } +/* Decode ZFS stored time values to a struct timespec */ #define ZFS_TIME_DECODE(tp, stmp) \ { \ (tp)->tv_sec = (time_t)(stmp)[0]; \ diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h index a212e4f0e1..15ef2aa8bf 100644 --- a/usr/src/uts/common/fs/zfs/sys/zil.h +++ b/usr/src/uts/common/fs/zfs/sys/zil.h @@ -242,6 +242,12 @@ typedef struct { * information needed for replaying the create. If the * file doesn't have any actual ACEs then the lr_aclcnt * would be zero. + * + * After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's. + * If create is also setting xvattr's, then acl data follows xvattr. + * If ACE FUIDs are needed then they will follow the xvattr_t. Following + * the FUIDs will be the domain table information. The FUIDs for the owner + * and group will be in lr_create. Name follows ACL data. */ typedef struct { lr_create_t lr_create; /* common create portion */ @@ -250,13 +256,6 @@ typedef struct { uint64_t lr_fuidcnt; /* number of real fuids */ uint64_t lr_acl_bytes; /* number of bytes in ACL */ uint64_t lr_acl_flags; /* ACL flags */ - /* lr_acl_bytes number of variable sized ace's follows */ - /* if create is also setting xvattr's, then acl data follows xvattr */ - /* if ACE FUIDs are needed then they will follow the xvattr_t */ - /* Following the FUIDs will be the domain table information. */ - /* The FUIDs for the owner and group will be in the lr_create */ - /* portion of the record. */ - /* name follows ACL data */ } lr_acl_create_t; typedef struct { diff --git a/usr/src/uts/common/fs/zfs/sys/zio_compress.h b/usr/src/uts/common/fs/zfs/sys/zio_compress.h index 34a82a8b81..f4cb84511a 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio_compress.h +++ b/usr/src/uts/common/fs/zfs/sys/zio_compress.h @@ -36,11 +36,10 @@ extern "C" { #endif -/* - * Common signature for all zio compress/decompress functions. - */ +/* Common signature for all zio compress functions. */ typedef size_t zio_compress_func_t(void *src, void *dst, size_t s_len, size_t d_len, int); +/* Common signature for all zio decompress functions. */ typedef int zio_decompress_func_t(void *src, void *dst, size_t s_len, size_t d_len, int); diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c index 62a0c605d7..8cdb284832 100644 --- a/usr/src/uts/common/fs/zfs/txg.c +++ b/usr/src/uts/common/fs/zfs/txg.c @@ -344,6 +344,12 @@ txg_rele_to_sync(txg_handle_t *th) th->th_cpu = NULL; /* defensive */ } +/* + * Blocks until all transactions in the group are committed. + * + * On return, the transaction group has reached a stable state in which it can + * then be passed off to the syncing context. 
+ */ static void txg_quiesce(dsl_pool_t *dp, uint64_t txg) { @@ -394,6 +400,9 @@ txg_do_callbacks(list_t *cb_list) /* * Dispatch the commit callbacks registered on this txg to worker threads. + * + * If no callbacks are registered for a given TXG, nothing happens. + * This function creates a taskq for the associated pool, if needed. */ static void txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) @@ -404,7 +413,10 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; - /* No need to lock tx_cpu_t at this point */ + /* + * No need to lock tx_cpu_t at this point, since this can + * only be called once a txg has been synced. + */ int g = txg & TXG_MASK; @@ -424,7 +436,7 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) list_create(cb_list, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); - list_move_tail(&tc->tc_callbacks[g], cb_list); + list_move_tail(cb_list, &tc->tc_callbacks[g]); (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) txg_do_callbacks, cb_list, TQ_SLEEP); @@ -558,8 +570,8 @@ txg_quiesce_thread(dsl_pool_t *dp) /* * Delay this thread by delay nanoseconds if we are still in the open - * transaction group and there is already a waiting txg quiesing or quiesced. - * Abort the delay if this txg stalls or enters the quiesing state. + * transaction group and there is already a waiting txg quiescing or quiesced. + * Abort the delay if this txg stalls or enters the quiescing state. */ void txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) @@ -567,7 +579,7 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) tx_state_t *tx = &dp->dp_tx; hrtime_t start = gethrtime(); - /* don't delay if this txg could transition to quiesing immediately */ + /* don't delay if this txg could transition to quiescing immediately */ if (tx->tx_open_txg > txg || tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) return; diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index cc3594ad1a..7a409bd7ed 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -956,9 +956,11 @@ vdev_probe_done(zio_t *zio) } /* - * Determine whether this device is accessible by reading and writing - * to several known locations: the pad regions of each vdev label - * but the first (which we leave alone in case it contains a VTOC). + * Determine whether this device is accessible. + * + * Read and write to several known locations: the pad regions of each + * vdev label but the first, which we leave alone in case it contains + * a VTOC. */ zio_t * vdev_probe(vdev_t *vd, zio_t *zio) @@ -2179,10 +2181,12 @@ vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) } /* - * Online the given vdev. If 'unspare' is set, it implies two things. First, - * any attached spare device should be detached when the device finishes - * resilvering. Second, the online should be treated like a 'test' online case, - * so no FMA events are generated if the device fails to open. + * Online the given vdev. + * + * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached + * spare device should be detached when the device finishes resilvering. + * Second, the online should be treated like a 'test' online case, so no FMA + * events are generated if the device fails to open. 
*/ int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index 5ee7c7d15b..904918c3a4 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -1028,6 +1028,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags) zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd)); } +/* Sync the uberblocks to all vdevs in svd[] */ int vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) { diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index fccbbb1d75..8de4b324a2 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -38,13 +38,14 @@ /* * These tunables are for performance analysis. */ + +/* The maximum number of I/Os concurrently pending to each device. */ +int zfs_vdev_max_pending = 10; + /* - * zfs_vdev_max_pending is the maximum number of i/os concurrently - * pending to each device. zfs_vdev_min_pending is the initial number - * of i/os pending to each device (before it starts ramping up to - * max_pending). + * The initial number of I/Os pending to each device, before it starts ramping + * up to zfs_vdev_max_pending. */ -int zfs_vdev_max_pending = 10; int zfs_vdev_min_pending = 4; /* diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index d4b3d5b5a8..b22bcd2b2f 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -64,6 +64,7 @@ * o addition (+) is represented by a bitwise XOR * o subtraction (-) is therefore identical to addition: A + B = A - B * o multiplication of A by 2 is defined by the following bitwise expression: + * * (A * 2)_7 = A_6 * (A * 2)_6 = A_5 * (A * 2)_5 = A_4 @@ -122,7 +123,7 @@ typedef struct raidz_map { uint64_t rm_missingparity; /* Count of missing parity devices */ uint64_t rm_firstdatacol; /* First data column/parity count */ uint64_t rm_nskip; /* Skipped sectors for padding */ - uint64_t rm_skipstart; /* Column index of padding start */ + uint64_t rm_skipstart; /* Column index of padding start */ void *rm_datacopy; /* rm_asize-buffer of copied data */ uintptr_t rm_reports; /* # of referencing checksum reports */ uint8_t rm_freed; /* map no longer has referencing ZIO */ @@ -164,10 +165,7 @@ typedef struct raidz_map { */ int vdev_raidz_default_to_general; -/* - * These two tables represent powers and logs of 2 in the Galois field defined - * above. These values were computed by repeatedly multiplying by 2 as above. - */ +/* Powers of 2 in the Galois field defined above. */ static const uint8_t vdev_raidz_pow2[256] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, @@ -202,6 +200,7 @@ static const uint8_t vdev_raidz_pow2[256] = { 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 }; +/* Logs of 2 in the Galois field defined above. */ static const uint8_t vdev_raidz_log2[256] = { 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, @@ -437,23 +436,50 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = { vdev_raidz_cksum_report }; +/* + * Divides the IO evenly across all child vdevs; usually, dcols is + * the number of children in the target vdev. 
+ */ static raidz_map_t * vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) { raidz_map_t *rm; + /* The starting RAIDZ (parent) vdev sector of the block. */ uint64_t b = offset >> unit_shift; + /* The zio's size in units of the vdev's minimum sector size. */ uint64_t s = size >> unit_shift; + /* The first column for this stripe. */ uint64_t f = b % dcols; + /* The starting byte offset on each child vdev. */ uint64_t o = (b / dcols) << unit_shift; uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ q = s / (dcols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ r = s - q * (dcols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ tot = s + nparity * (q + (r == 0 ? 0 : 1)); + /* acols: The columns that will be accessed. */ + /* scols: The columns that will be accessed or skipped. */ if (q == 0) { + /* Our I/O request doesn't span all child vdevs. */ acols = bc; scols = MIN(dcols, roundup(bc, nparity + 1)); } else { @@ -1668,6 +1694,23 @@ vdev_raidz_child_done(zio_t *zio) rc->rc_skipped = 0; } +/* + * Start an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. + */ static int vdev_raidz_io_start(zio_t *zio) { @@ -2019,6 +2062,27 @@ done: return (ret); } +/* + * Complete an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Check for errors on the child IOs. + * 2. Return, setting an error code if too few child VDevs were written + * to reconstruct the data later. Note that partial writes are + * considered successful if they can be reconstructed at all. + * - For read operations: + * 1. Check for errors on the child IOs. + * 2. If data errors occurred: + * a. Try to reassemble the data from the parity available. + * b. If we haven't yet read the parity drives, read them now. + * c. If all parity drives have been read but the data still doesn't + * reassemble with a correct checksum, then try combinatorial + * reconstruction. + * d. If that doesn't work, return an error. + * 3. If there were unexpected errors or this is a resilver operation, + * rewrite the vdevs that had errors. 
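To make the vdev_raidz_map_alloc() geometry above concrete, the same formulas can be evaluated for a hypothetical 6-wide raidz2 vdev with 512-byte sectors and a 5 KB write (numbers chosen only for illustration):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint64_t dcols = 6, nparity = 2;        /* hypothetical raidz2 layout */
        uint64_t s = 10;                        /* 5 KB / 512-byte sectors */

        uint64_t q = s / (dcols - nparity);             /* full data rows */
        uint64_t r = s - q * (dcols - nparity);         /* leftover data sectors */
        uint64_t bc = (r == 0 ? 0 : r + nparity);       /* "big columns" */
        uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

        printf("q=%llu r=%llu bc=%llu tot=%llu\n",
            (unsigned long long)q, (unsigned long long)r,
            (unsigned long long)bc, (unsigned long long)tot);
        return (0);
}

Here q=2, r=2, bc=4 and tot=16: ten data sectors plus two parity sectors for each of the three rows touched.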
+ */ static void vdev_raidz_io_done(zio_t *zio) { diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c index 2f4ccfb6ea..f69c1b0312 100644 --- a/usr/src/uts/common/fs/zfs/zap.c +++ b/usr/src/uts/common/fs/zfs/zap.c @@ -295,7 +295,8 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); - dmu_buf_rele(db, FTAG); + if (err == 0) + dmu_buf_rele(db, FTAG); } return (err); } @@ -992,18 +993,21 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) zap_attribute_t za; int err; + err = 0; for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, &za) == 0; (void) zap_cursor_advance(&zc)) { - if (za.za_integer_length != 8 || za.za_num_integers != 1) - return (SET_ERROR(EINVAL)); + if (za.za_integer_length != 8 || za.za_num_integers != 1) { + err = SET_ERROR(EINVAL); + break; + } err = zap_add(os, intoobj, za.za_name, 8, 1, &za.za_first_integer, tx); if (err) - return (err); + break; } zap_cursor_fini(&zc); - return (0); + return (err); } int @@ -1014,18 +1018,21 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, zap_attribute_t za; int err; + err = 0; for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, &za) == 0; (void) zap_cursor_advance(&zc)) { - if (za.za_integer_length != 8 || za.za_num_integers != 1) - return (SET_ERROR(EINVAL)); + if (za.za_integer_length != 8 || za.za_num_integers != 1) { + err = SET_ERROR(EINVAL); + break; + } err = zap_add(os, intoobj, za.za_name, 8, 1, &value, tx); if (err) - return (err); + break; } zap_cursor_fini(&zc); - return (0); + return (err); } int @@ -1036,24 +1043,27 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, zap_attribute_t za; int err; + err = 0; for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, &za) == 0; (void) zap_cursor_advance(&zc)) { uint64_t delta = 0; - if (za.za_integer_length != 8 || za.za_num_integers != 1) - return (SET_ERROR(EINVAL)); + if (za.za_integer_length != 8 || za.za_num_integers != 1) { + err = SET_ERROR(EINVAL); + break; + } err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta); if (err != 0 && err != ENOENT) - return (err); + break; delta += za.za_first_integer; err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx); if (err) - return (err); + break; } zap_cursor_fini(&zc); - return (0); + return (err); } int diff --git a/usr/src/uts/common/fs/zfs/zfs_acl.c b/usr/src/uts/common/fs/zfs/zfs_acl.c index 1b296b2897..2eecefd8cf 100644 --- a/usr/src/uts/common/fs/zfs/zfs_acl.c +++ b/usr/src/uts/common/fs/zfs/zfs_acl.c @@ -1362,7 +1362,8 @@ zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t trim, zfs_acl_t *aclp) zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; new_bytes += abstract_size; - } if (masks.deny1) { + } + if (masks.deny1) { zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); zacep = (void *)((uintptr_t)zacep + abstract_size); new_count++; @@ -1766,7 +1767,7 @@ zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids) } /* - * Retrieve a files ACL + * Retrieve a file's ACL */ int zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) @@ -1921,7 +1922,7 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, } /* - * Set a files ACL + * Set a file's ACL */ int zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) @@ -2342,6 +2343,7 @@ slow: /* * Determine whether Access should be 
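The zap_join*() rewrites above all follow one shape: record the error, break out of the cursor loop, and let the single zap_cursor_fini() at the bottom run, instead of returning mid-iteration and leaking the cursor (the zap_table_load() hunk similarly only releases the dbuf when the hold actually succeeded). A generic user-level sketch of that shape, using stdio rather than the ZAP cursor API (sum_file() is a made-up example, not ZFS code):

#include <stdio.h>
#include <errno.h>

/* Sum one integer per line; on a bad line, stop but still fclose() once. */
static int
sum_file(const char *path, long *sump)
{
        FILE *fp = fopen(path, "r");
        char line[128];
        long sum = 0;
        int err = 0;

        if (fp == NULL)
                return (errno);

        while (fgets(line, sizeof (line), fp) != NULL) {
                long v;

                if (sscanf(line, "%ld", &v) != 1) {
                        err = EINVAL;
                        break;          /* don't return; cleanup below */
                }
                sum += v;
        }
        (void) fclose(fp);      /* single cleanup point, like zap_cursor_fini() */

        if (err == 0)
                *sump = sum;
        return (err);
}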
granted/denied. + * * The least priv subsytem is always consulted as a basic privilege * can define any form of access. */ @@ -2537,7 +2539,6 @@ zfs_delete_final_check(znode_t *zp, znode_t *dzp, * Determine whether Access should be granted/deny, without * consulting least priv subsystem. * - * * The following chart is the recommended NFSv4 enforcement for * ability to delete an object. * diff --git a/usr/src/uts/common/fs/zfs/zfs_ctldir.c b/usr/src/uts/common/fs/zfs/zfs_ctldir.c index f915d21db2..5928fe75e9 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ctldir.c +++ b/usr/src/uts/common/fs/zfs/zfs_ctldir.c @@ -505,6 +505,11 @@ static const fs_operation_def_t zfsctl_tops_root[] = { { NULL } }; +/* + * Gets the full dataset name that corresponds to the given snapshot name + * Example: + * zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1" + */ static int zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) { @@ -1046,6 +1051,7 @@ zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, /* * pvp is the '.zfs' directory (zfsctl_node_t). + * * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). * * This function is the callback to create a GFS vnode for '.zfs/snapshot' diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index 15b19b0d06..06ea64181d 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -323,9 +323,7 @@ zfs_is_bootfs(const char *name) } /* - * zfs_earlier_version - * - * Return non-zero if the spa version is less than requested version. + * Return non-zero if the spa version is less than requested version. */ static int zfs_earlier_version(const char *name, int version) @@ -343,8 +341,6 @@ zfs_earlier_version(const char *name, int version) } /* - * zpl_earlier_version - * * Return TRUE if the ZPL version is less than requested version. */ static boolean_t @@ -2986,10 +2982,10 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) /* * inputs: - * createprops list of properties requested by creator - * default_zplver zpl version to use if unspecified in createprops - * fuids_ok fuids allowed in this version of the spa? * os parent objset pointer (NULL if root fs) + * fuids_ok fuids allowed in this version of the spa? + * sa_ok SAs allowed in this version of the spa? + * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object @@ -3395,41 +3391,44 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). + * + * Returns 0 if the argument is not a snapshot, or it is not currently a + * filesystem, or we were able to unmount it. Returns error code otherwise. */ -void +int zfs_unmount_snap(const char *snapname) { vfs_t *vfsp; zfsvfs_t *zfsvfs; + int err; if (strchr(snapname, '@') == NULL) - return; + return (0); vfsp = zfs_get_vfs(snapname); if (vfsp == NULL) - return; + return (0); zfsvfs = vfsp->vfs_data; ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); - if (vn_vfswlock(vfsp->vfs_vnodecovered) != 0) { - VFS_RELE(vfsp); - return; - } + err = vn_vfswlock(vfsp->vfs_vnodecovered); VFS_RELE(vfsp); + if (err != 0) + return (SET_ERROR(err)); /* * Always force the unmount for snapshots. 
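The zfsctl_snapshot_zname() comment above relies on the '<filesystem>@<snapshot>' naming convention; recursive_unmount() further down builds the same form with snprintf(). A trivial stand-alone sketch of that composition (the dataset names are made up):

#include <stdio.h>

#define MAXNAMELEN      256

int
main(void)
{
        char fullname[MAXNAMELEN];
        const char *fsname = "mypool/myfs";     /* hypothetical dataset */
        const char *snapname = "snap1";

        (void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname);
        printf("%s\n", fullname);               /* mypool/myfs@snap1 */
        return (0);
}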
*/ (void) dounmount(vfsp, MS_FORCE, kcred); + return (0); } /* ARGSUSED */ static int zfs_unmount_snap_cb(const char *snapname, void *arg) { - zfs_unmount_snap(snapname); - return (0); + return (zfs_unmount_snap(snapname)); } /* @@ -3452,7 +3451,7 @@ zfs_destroy_unmount_origin(const char *fsname) char originname[MAXNAMELEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); - zfs_unmount_snap(originname); + (void) zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } @@ -3470,7 +3469,7 @@ zfs_destroy_unmount_origin(const char *fsname) static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { - int poollen; + int error, poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; @@ -3491,7 +3490,9 @@ zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); - zfs_unmount_snap(name); + error = zfs_unmount_snap(name); + if (error != 0) + return (error); } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); @@ -3509,8 +3510,12 @@ static int zfs_ioc_destroy(zfs_cmd_t *zc) { int err; - if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) - zfs_unmount_snap(zc->zc_name); + + if (zc->zc_objset_type == DMU_OST_ZFS) { + err = zfs_unmount_snap(zc->zc_name); + if (err != 0) + return (err); + } if (strchr(zc->zc_name, '@')) err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); @@ -3556,8 +3561,7 @@ recursive_unmount(const char *fsname, void *arg) char fullname[MAXNAMELEN]; (void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname); - zfs_unmount_snap(fullname); - return (0); + return (zfs_unmount_snap(fullname)); } /* @@ -5016,14 +5020,18 @@ static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { nvpair_t *pair; + int err; /* * The release may cause the snapshot to be destroyed; make sure it * is not mounted. */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; - pair = nvlist_next_nvpair(holds, pair)) - zfs_unmount_snap(nvpair_name(pair)); + pair = nvlist_next_nvpair(holds, pair)) { + err = zfs_unmount_snap(nvpair_name(pair)); + if (err != 0) + return (err); + } return (dsl_dataset_user_release(holds, errlist)); } diff --git a/usr/src/uts/common/fs/zfs/zfs_log.c b/usr/src/uts/common/fs/zfs/zfs_log.c index de786bf7f4..aeaba2233a 100644 --- a/usr/src/uts/common/fs/zfs/zfs_log.c +++ b/usr/src/uts/common/fs/zfs/zfs_log.c @@ -211,9 +211,8 @@ zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) } /* - * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, - * TX_MKDIR_ATTR and TX_MKXATTR - * transactions. + * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and + * TK_MKXATTR transactions. * * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID * domain information appended prior to the name. In this case the @@ -340,7 +339,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, } /* - * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions. + * Handles both TX_REMOVE and TX_RMDIR transactions. */ void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, @@ -364,7 +363,7 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, } /* - * zfs_log_link() handles TX_LINK transactions. + * Handles TX_LINK transactions. 
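The EXDEV check in zfs_ioc_destroy_snaps() above enforces that every snapshot handed in belongs to the named pool: the name must start with the pool name and the next character must be '/' or '@'. A rough user-level version of that prefix test (name_in_pool() is my own illustration, not the kernel's code):

#include <stdio.h>
#include <string.h>

static int
name_in_pool(const char *pool, const char *name)
{
        size_t poollen = strlen(pool);

        return (strncmp(pool, name, poollen) == 0 &&
            (name[poollen] == '/' || name[poollen] == '@'));
}

int
main(void)
{
        printf("%d\n", name_in_pool("tank", "tank/fs@snap1"));  /* 1 */
        printf("%d\n", name_in_pool("tank", "tankers/fs@s"));   /* 0: prefix alone is not enough */
        return (0);
}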
*/ void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, @@ -387,7 +386,7 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, } /* - * zfs_log_symlink() handles TX_SYMLINK transactions. + * Handles TX_SYMLINK transactions. */ void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, @@ -419,7 +418,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, } /* - * zfs_log_rename() handles TX_RENAME transactions. + * Handles TX_RENAME transactions. */ void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, @@ -445,7 +444,7 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, } /* - * zfs_log_write() handles TX_WRITE transactions. + * Handles TX_WRITE transactions. */ ssize_t zfs_immediate_write_sz = 32768; @@ -524,7 +523,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, } /* - * zfs_log_truncate() handles TX_TRUNCATE transactions. + * Handles TX_TRUNCATE transactions. */ void zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, @@ -547,7 +546,7 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, } /* - * zfs_log_setattr() handles TX_SETATTR transactions. + * Handles TX_SETATTR transactions. */ void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, @@ -609,7 +608,7 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, } /* - * zfs_log_acl() handles TX_ACL transactions. + * Handles TX_ACL transactions. */ void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, diff --git a/usr/src/uts/common/fs/zfs/zfs_rlock.c b/usr/src/uts/common/fs/zfs/zfs_rlock.c index be562496b0..b40bdbea12 100644 --- a/usr/src/uts/common/fs/zfs/zfs_rlock.c +++ b/usr/src/uts/common/fs/zfs/zfs_rlock.c @@ -28,7 +28,7 @@ /* * This file contains the code to implement file range locking in - * ZFS, although there isn't much specific to ZFS (all that comes to mind + * ZFS, although there isn't much specific to ZFS (all that comes to mind is * support for growing the blocksize). * * Interface diff --git a/usr/src/uts/common/fs/zfs/zfs_sa.c b/usr/src/uts/common/fs/zfs/zfs_sa.c index d141e43d72..ed5f276475 100644 --- a/usr/src/uts/common/fs/zfs/zfs_sa.c +++ b/usr/src/uts/common/fs/zfs/zfs_sa.c @@ -187,7 +187,7 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) /* * I'm not convinced we should do any of this upgrade. * since the SA code can read both old/new znode formats - * with probably little to know performance difference. + * with probably little to no performance difference. * * All new files will be created with the new format. */ diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c index e337861cd4..c7d4444722 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c @@ -1349,13 +1349,12 @@ zfs_parse_bootfs(char *bpath, char *outpath) } /* - * zfs_check_global_label: - * Check that the hex label string is appropriate for the dataset - * being mounted into the global_zone proper. + * Check that the hex label string is appropriate for the dataset being + * mounted into the global_zone proper. * - * Return an error if the hex label string is not default or - * admin_low/admin_high. For admin_low labels, the corresponding - * dataset must be readonly. + * Return an error if the hex label string is not default or + * admin_low/admin_high. For admin_low labels, the corresponding + * dataset must be readonly. 
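zfs_immediate_write_sz above is the cutoff zfs_log_write() uses when deciding how file data reaches the intent log: roughly, small writes can be copied into the log record itself, while writes at or above the cutoff (absent a separate log device) are logged by reference to the data block; the real decision also looks at log bias and the sync flags. A deliberately simplified sketch of that kind of size-based choice (the enum and function names are made up, and this is not the actual ZFS policy code):

#include <stdio.h>
#include <stdint.h>

typedef enum { LOG_COPIED, LOG_INDIRECT } log_how_t;    /* made-up names */

static log_how_t
choose_log_method(uint64_t resid, uint64_t immediate_write_sz)
{
        /* Simplification: copy small writes, reference large ones. */
        return (resid < immediate_write_sz ? LOG_COPIED : LOG_INDIRECT);
}

int
main(void)
{
        uint64_t cutoff = 32768;        /* default zfs_immediate_write_sz */

        printf("%d %d\n",
            choose_log_method(4096, cutoff) == LOG_COPIED,
            choose_log_method(131072, cutoff) == LOG_INDIRECT);
        return (0);
}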
*/ int zfs_check_global_label(const char *dsname, const char *hexsl) @@ -1377,15 +1376,12 @@ zfs_check_global_label(const char *dsname, const char *hexsl) } /* - * zfs_mount_label_policy: - * Determine whether the mount is allowed according to MAC check. - * by comparing (where appropriate) label of the dataset against - * the label of the zone being mounted into. If the dataset has - * no label, create one. + * Determine whether the mount is allowed according to MAC check. + * by comparing (where appropriate) label of the dataset against + * the label of the zone being mounted into. If the dataset has + * no label, create one. * - * Returns: - * 0 : access allowed - * >0 : error code, such as EACCES + * Returns 0 if access allowed, error otherwise (e.g. EACCES) */ static int zfs_mount_label_policy(vfs_t *vfsp, char *osname) diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c index 161e573175..b0901bea0c 100644 --- a/usr/src/uts/common/fs/zfs/zfs_vnops.c +++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c @@ -95,11 +95,11 @@ * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * - * (1) A check must be made in each zfs thread for a mounted file system. + * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). - * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes - * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros - * can return EIO from the calling function. + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: @@ -131,7 +131,7 @@ * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. - * During ZIL replay the zfs_log_* functions will update the sequence + * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, @@ -388,7 +388,7 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid) * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. + * the file is memory mapped. */ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) @@ -437,8 +437,7 @@ offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ * * OUT: uio - updated offset and range, buffer filled. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Side Effects: * vp - atime updated if byte count > 0 @@ -574,14 +573,14 @@ out: * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. - * ioflag - FAPPEND flag set if in append mode. + * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is + * set if in append mode. * cr - credentials of caller. * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. 
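zfs_read_chunk_size above caps how much of a read zfs_read() asks the DMU for in one go; a large request is consumed in chunks (the kernel additionally aligns chunk boundaries, which is omitted here). A stand-alone sketch of that chunking loop with illustrative sizes:

#include <stdio.h>
#include <stdint.h>

#define MIN(a, b)       ((a) < (b) ? (a) : (b))

int
main(void)
{
        uint64_t chunk = 1024 * 1024;                   /* zfs_read_chunk_size default */
        uint64_t resid = 3ULL * 1024 * 1024 + 4096;     /* bytes left in the uio */
        uint64_t off = 0;

        while (resid > 0) {
                uint64_t nbytes = MIN(resid, chunk);

                printf("read %llu bytes at offset %llu\n",
                    (unsigned long long)nbytes, (unsigned long long)off);
                off += nbytes;
                resid -= nbytes;
        }
        return (0);
}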
* * Timestamps: * vp - ctime|mtime updated if byte count > 0 @@ -1149,8 +1148,7 @@ specvp_check(vnode_t **vpp, cred_t *cr) * * OUT: vpp - vnode of located entry, NULL if not found. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * NA @@ -1291,8 +1289,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, * * OUT: vpp - vnode of created or trunc'd entry. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated if new entry created @@ -1542,8 +1539,7 @@ out: * ct - caller context * flags - case flags * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime @@ -1773,12 +1769,12 @@ out: * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context + * flags - case flags * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated @@ -1958,8 +1954,7 @@ top: * ct - caller context * flags - case flags * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated @@ -2077,7 +2072,7 @@ out: /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in - * the uio structure. + * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, @@ -2089,8 +2084,7 @@ out: * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated @@ -2409,7 +2403,7 @@ zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) * * OUT: vap - attribute values. * - * RETURN: 0 (always succeeds) + * RETURN: 0 (always succeeds). */ /* ARGSUSED */ static int @@ -2611,8 +2605,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, * cr - credentials of caller. * ct - caller context * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. @@ -2620,7 +2613,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, /* ARGSUSED */ static int zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, - caller_context_t *ct) + caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; @@ -3213,6 +3206,7 @@ out: if (attrzp) VN_RELE(ZTOV(attrzp)); + if (aclp) zfs_acl_free(aclp); @@ -3347,8 +3341,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) * ct - caller context * flags - case flags * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * sdvp,tdvp - ctime|mtime updated @@ -3695,13 +3688,11 @@ out: * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. - * target - Target path of new symlink. * cr - credentials of caller. * ct - caller context * flags - case flags * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. 
* * Timestamps: * dvp - ctime|mtime updated @@ -3847,14 +3838,13 @@ top: * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. - * uoip - structure to contain the link path. + * uio - structure to contain the link path. * cr - credentials of caller. * ct - caller context * - * OUT: uio - structure to contain the link path. + * OUT: uio - structure containing the link path. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated @@ -3893,8 +3883,7 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) * cr - credentials of caller. * ct - caller context * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * tdvp - ctime|mtime updated @@ -4062,8 +4051,7 @@ zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, * OUT: offp - start of range pushed. * lenp - len of range pushed. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * NOTE: callers must have locked the page to be pushed. On * exit, the page (and all other pages in the kluster) must be @@ -4187,8 +4175,7 @@ out: * cr - credentials of caller. * ct - caller context. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime|mtime updated @@ -4353,8 +4340,7 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) * noffp - pointer to new file offset * ct - caller context * - * RETURN: 0 if success - * EINVAL if new offset invalid + * RETURN: 0 on success, EINVAL if new offset invalid. */ /* ARGSUSED */ static int @@ -4490,8 +4476,7 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, * OUT: protp - protection mode of created pages. * pl - list of pages created. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated @@ -4499,8 +4484,8 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, /* ARGSUSED */ static int zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, - page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, - enum seg_rw rw, cred_t *cr, caller_context_t *ct) + page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, + enum seg_rw rw, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; @@ -4575,15 +4560,11 @@ out: * Request a memory map for a section of a file. This code interacts * with common code and the VM system as follows: * - * common code calls mmap(), which ends up in smmap_common() - * - * this calls VOP_MAP(), which takes you into (say) zfs - * - * zfs_map() calls as_map(), passing segvn_create() as the callback - * - * segvn_create() creates the new segment and calls VOP_ADDMAP() - * - * zfs_addmap() updates z_mapcnt + * - common code calls mmap(), which ends up in smmap_common() + * - this calls VOP_MAP(), which takes you into (say) zfs + * - zfs_map() calls as_map(), passing segvn_create() as the callback + * - segvn_create() creates the new segment and calls VOP_ADDMAP() + * - zfs_addmap() updates z_mapcnt */ /*ARGSUSED*/ static int @@ -4700,8 +4681,7 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr, * cr - credentials of caller [UNUSED]. * ct - caller context. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure. 
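Both zcr_blksz_min and zcr_blksz_max above are documented as powers of two, and their defaults ((1 << 10) and (1 << 17)) satisfy that. The usual constant-time test for such a constraint looks like this (is_pow2() is my own helper, not from the source):

#include <stdio.h>
#include <stdint.h>

/* True iff x is a nonzero power of two. */
static int
is_pow2(uint64_t x)
{
        return (x != 0 && (x & (x - 1)) == 0);
}

int
main(void)
{
        printf("%d %d %d\n",
            is_pow2(1 << 10), is_pow2(1 << 17), is_pow2(3 << 10));      /* 1 1 0 */
        return (0);
}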
* * Timestamps: * vp - ctime|mtime updated @@ -4916,13 +4896,14 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, } /* - * Tunable, both must be a power of 2. - * - * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf - * zcr_blksz_max: if set to less than the file block size, allow loaning out of - * an arcbuf for a partial block read + * The smallest read we may consider to loan out an arcbuf. + * This must be a power of 2. */ int zcr_blksz_min = (1 << 10); /* 1K */ +/* + * If set to less than the file block size, allow loaning out of an + * arcbuf for a partial block read. This must be a power of 2. + */ int zcr_blksz_max = (1 << 17); /* 128K */ /*ARGSUSED*/ @@ -5196,10 +5177,12 @@ const fs_operation_def_t zfs_sharevnodeops_template[] = { /* * Extended attribute directory vnode operations template - * This template is identical to the directory vnodes - * operation template except for restricted operations: - * VOP_MKDIR() - * VOP_SYMLINK() + * + * This template is identical to the directory vnodes + * operation template except for restricted operations: + * VOP_MKDIR() + * VOP_SYMLINK() + * * Note that there are other restrictions embedded in: * zfs_create() - restrict type to VREG * zfs_link() - no links into/out of attribute space diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c index 7d4f3084ed..f9433b6b44 100644 --- a/usr/src/uts/common/fs/zfs/zfs_znode.c +++ b/usr/src/uts/common/fs/zfs/zfs_znode.c @@ -1006,9 +1006,8 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, } /* - * zfs_xvattr_set only updates the in-core attributes - * it is assumed the caller will be doing an sa_bulk_update - * to push the changes out + * Update in-core attributes. It is assumed the caller will be doing an + * sa_bulk_update to push the changes out. */ void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) @@ -1447,8 +1446,7 @@ zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp, * IN: zp - znode of file to free data in. * end - new end-of-file * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure */ static int zfs_extend(znode_t *zp, uint64_t end) @@ -1525,8 +1523,7 @@ top: * off - start of section to free. * len - length of section to free. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure */ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) @@ -1564,8 +1561,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) * IN: zp - znode of file to free data in. * end - new end-of-file. * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure */ static int zfs_trunc(znode_t *zp, uint64_t end) @@ -1663,8 +1659,7 @@ top: * flag - current file open mode flags. * log - TRUE if this action should be logged * - * RETURN: 0 if success - * error code if failure + * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c index 8142e24210..da809916a4 100644 --- a/usr/src/uts/common/fs/zfs/zil.c +++ b/usr/src/uts/common/fs/zfs/zil.c @@ -66,9 +66,9 @@ */ /* - * This global ZIL switch affects all pools + * Disable intent logging replay. This global ZIL switch affects all pools. 
*/ -int zil_replay_disable = 0; /* disable intent logging replay */ +int zil_replay_disable = 0; /* * Tunable parameter for debugging or performance analysis. Setting @@ -879,6 +879,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) /* * Define a limited set of intent log block sizes. + * * These must be a multiple of 4KB. Note only the amount used (again * aligned to 4KB) actually gets written. However, we can't always just * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted. diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 425dcb3100..30738dce53 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -1193,13 +1193,16 @@ zio_interrupt(zio_t *zio) /* * Execute the I/O pipeline until one of the following occurs: - * (1) the I/O completes; (2) the pipeline stalls waiting for - * dependent child I/Os; (3) the I/O issues, so we're waiting - * for an I/O completion interrupt; (4) the I/O is delegated by - * vdev-level caching or aggregation; (5) the I/O is deferred - * due to vdev-level queueing; (6) the I/O is handed off to - * another thread. In all cases, the pipeline stops whenever - * there's no CPU work; it never burns a thread in cv_wait(). + * + * (1) the I/O completes + * (2) the pipeline stalls waiting for dependent child I/Os + * (3) the I/O issues, so we're waiting for an I/O completion interrupt + * (4) the I/O is delegated by vdev-level caching or aggregation + * (5) the I/O is deferred due to vdev-level queueing + * (6) the I/O is handed off to another thread. + * + * In all cases, the pipeline stops whenever there's no CPU work; it never + * burns a thread in cv_wait(). * * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. diff --git a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c index 73cef1199b..bea112d166 100644 --- a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c +++ b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2013 Nexenta Systems, Inc. All rights reserved. 
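The zil.c comment above says intent log block sizes, and the amount actually written, are aligned to 4 KB. The kernel normally uses the P2ROUNDUP() macro for that; the open-coded power-of-two round-up looks like this (roundup_p2() is my own name for it):

#include <stdio.h>
#include <stdint.h>

/* Round sz up to the next multiple of align; align must be a power of two. */
static uint64_t
roundup_p2(uint64_t sz, uint64_t align)
{
        return ((sz + align - 1) & ~(align - 1));
}

int
main(void)
{
        printf("%llu %llu\n",
            (unsigned long long)roundup_p2(5000, 4096),         /* 8192 */
            (unsigned long long)roundup_p2(8192, 4096));        /* 8192 */
        return (0);
}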
*/ /* @@ -105,13 +106,14 @@ static int ahci_check_slot_handle(ahci_port_t *, int); /* * Local function prototypes */ +static int ahci_setup_port_base_addresses(ahci_ctl_t *, ahci_port_t *); static int ahci_alloc_ports_state(ahci_ctl_t *); static void ahci_dealloc_ports_state(ahci_ctl_t *); static int ahci_alloc_port_state(ahci_ctl_t *, uint8_t); static void ahci_dealloc_port_state(ahci_ctl_t *, uint8_t); -static int ahci_alloc_rcvd_fis(ahci_ctl_t *, ahci_port_t *, uint8_t); +static int ahci_alloc_rcvd_fis(ahci_ctl_t *, ahci_port_t *); static void ahci_dealloc_rcvd_fis(ahci_port_t *); -static int ahci_alloc_cmd_list(ahci_ctl_t *, ahci_port_t *, uint8_t); +static int ahci_alloc_cmd_list(ahci_ctl_t *, ahci_port_t *); static void ahci_dealloc_cmd_list(ahci_ctl_t *, ahci_port_t *); static int ahci_alloc_cmd_tables(ahci_ctl_t *, ahci_port_t *); static void ahci_dealloc_cmd_tables(ahci_ctl_t *, ahci_port_t *); @@ -122,6 +124,7 @@ static int ahci_initialize_controller(ahci_ctl_t *); static void ahci_uninitialize_controller(ahci_ctl_t *); static int ahci_initialize_port(ahci_ctl_t *, ahci_port_t *, ahci_addr_t *); static int ahci_config_space_init(ahci_ctl_t *); +static void ahci_staggered_spin_up(ahci_ctl_t *, uint8_t); static void ahci_drain_ports_taskq(ahci_ctl_t *); static int ahci_rdwr_pmult(ahci_ctl_t *, ahci_addr_t *, uint8_t, uint32_t *, @@ -454,6 +457,10 @@ _init(void) goto err_out; } + /* watchdog tick */ + ahci_watchdog_tick = drv_usectohz( + (clock_t)ahci_watchdog_timeout * 1000000); + ret = mod_install(&modlinkage); if (ret != 0) { sata_hba_fini(&modlinkage); @@ -464,9 +471,6 @@ _init(void) goto err_out; } - /* watchdog tick */ - ahci_watchdog_tick = drv_usectohz( - (clock_t)ahci_watchdog_timeout * 1000000); return (ret); err_out: @@ -517,6 +521,7 @@ ahci_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) int status; int attach_state; uint32_t cap_status, ahci_version; + uint32_t ghc_control; int intr_types; int i; pci_regspec_t *regs; @@ -544,6 +549,16 @@ ahci_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) ahci_ctlp = ddi_get_soft_state(ahci_statep, instance); mutex_enter(&ahci_ctlp->ahcictl_mutex); + /* + * GHC.AE must be set to 1 before any other AHCI register + * is accessed + */ + ghc_control = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_GLOBAL_GHC(ahci_ctlp)); + ghc_control |= AHCI_HBA_GHC_AE; + ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_GLOBAL_GHC(ahci_ctlp), ghc_control); + /* Restart watch thread */ if (ahci_ctlp->ahcictl_timeout_id == 0) ahci_ctlp->ahcictl_timeout_id = timeout( @@ -655,6 +670,16 @@ ahci_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) attach_state |= AHCI_ATTACH_STATE_REG_MAP; + /* + * GHC.AE must be set to 1 before any other AHCI register + * is accessed + */ + ghc_control = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_GLOBAL_GHC(ahci_ctlp)); + ghc_control |= AHCI_HBA_GHC_AE; + ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_GLOBAL_GHC(ahci_ctlp), ghc_control); + /* Get the AHCI version information */ ahci_version = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, (uint32_t *)AHCI_GLOBAL_VS(ahci_ctlp)); @@ -678,6 +703,18 @@ ahci_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "hba capabilities = 0x%x", cap_status); + /* CAP2 (HBA Capabilities Extended) is available since AHCI spec 1.2 */ + if (ahci_version >= 0x00010200) { + uint32_t cap2_status; + + /* Get the HBA capabilities extended information */ + cap2_status = 
ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_GLOBAL_CAP2(ahci_ctlp)); + + AHCIDBG(AHCIDBG_INIT, ahci_ctlp, + "hba capabilities extended = 0x%x", cap2_status); + } + #if AHCI_DEBUG /* Get the interface speed supported by the HBA */ speed = (cap_status & AHCI_HBA_CAP_ISS) >> AHCI_HBA_CAP_ISS_SHIFT; @@ -709,20 +746,12 @@ ahci_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "hba implementation of ports: 0x%x", ahci_ctlp->ahcictl_ports_implemented); - /* - * According to the AHCI spec, CAP.NP should indicate the maximum - * number of ports supported by the HBA silicon, but we found - * this value of ICH8 chipset only indicates the number of ports - * implemented (exposed) by it. Therefore, the driver should calculate - * the potential maximum value by checking PI register, and use - * the maximum of this value and CAP.NP. - */ - ahci_ctlp->ahcictl_num_ports = max( - (cap_status & AHCI_HBA_CAP_NP) + 1, - ddi_fls(ahci_ctlp->ahcictl_ports_implemented)); + /* Max port number implemented */ + ahci_ctlp->ahcictl_num_ports = + ddi_fls(ahci_ctlp->ahcictl_ports_implemented); AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "hba number of ports: %d", - ahci_ctlp->ahcictl_num_ports); + (cap_status & AHCI_HBA_CAP_NP) + 1); /* Get the number of implemented ports by the HBA */ ahci_ctlp->ahcictl_num_implemented_ports = @@ -3537,6 +3566,7 @@ ahci_check_slot_handle(ahci_port_t *ahci_portp, int slot) } return (DDI_SUCCESS); } + /* * Allocate the ports structure, only called by ahci_attach */ @@ -3637,28 +3667,11 @@ ahci_initialize_controller(ahci_ctl_t *ahci_ctlp) { ahci_port_t *ahci_portp; ahci_addr_t addr; - uint32_t ghc_control; int port; AHCIDBG(AHCIDBG_INIT|AHCIDBG_ENTRY, ahci_ctlp, "ahci_initialize_controller enter", NULL); - mutex_enter(&ahci_ctlp->ahcictl_mutex); - - /* - * Indicate that system software is AHCI aware by setting - * GHC.AE to 1 - */ - ghc_control = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_GLOBAL_GHC(ahci_ctlp)); - - ghc_control |= AHCI_HBA_GHC_AE; - ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_GLOBAL_GHC(ahci_ctlp), - ghc_control); - - mutex_exit(&ahci_ctlp->ahcictl_mutex); - /* Initialize the implemented ports and structures */ for (port = 0; port < ahci_ctlp->ahcictl_num_ports; port++) { if (!AHCI_PORT_IMPLEMENTED(ahci_ctlp, port)) { @@ -3821,6 +3834,41 @@ ahci_dealloc_pmult(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp) } /* + * Staggered Spin-up. + * + * WARNING!!! ahciport_mutex should be acquired before the function + * is called. + */ +static void +ahci_staggered_spin_up(ahci_ctl_t *ahci_ctlp, uint8_t port) +{ + uint32_t cap_status; + uint32_t port_cmd_status; + + cap_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_GLOBAL_CAP(ahci_ctlp)); + + /* Check for staggered spin-up support */ + if (!(cap_status & AHCI_HBA_CAP_SSS)) + return; + + port_cmd_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port)); + + /* If PxCMD.SUD == 1, no staggered spin-up is needed */ + if (port_cmd_status & AHCI_CMD_STATUS_SUD) + return; + + AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "Spin-up at port %d", port); + + /* Set PxCMD.SUD */ + port_cmd_status |= AHCI_CMD_STATUS_SUD; + ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port), + port_cmd_status); +} + +/* * The routine is to initialize a port. First put the port in NOTRunning * state, then enable port interrupt and clear Serror register. 
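The ahci_attach() change above now sizes ahcictl_num_ports from ddi_fls() of the ports-implemented bitmap; as I understand ddi_fls(), it returns the 1-based index of the highest bit set (0 if none), so a PI of 0x33 (ports 0, 1, 4 and 5 implemented) yields 6, enough slots to cover the highest implemented port. A stand-alone sketch of that calculation (my_fls() is my own routine, not the DDI one):

#include <stdio.h>
#include <stdint.h>

/* 1-based index of the highest set bit; 0 if no bits are set. */
static int
my_fls(uint32_t mask)
{
        int pos = 0;

        while (mask != 0) {
                pos++;
                mask >>= 1;
        }
        return (pos);
}

int
main(void)
{
        uint32_t pi = 0x33;     /* hypothetical: ports 0, 1, 4 and 5 implemented */

        printf("num_ports = %d\n", my_fls(pi));         /* 6 */
        return (0);
}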
And under * AHCI_ATTACH case, find device signature and then try to start the port. @@ -3874,15 +3922,9 @@ ahci_initialize_port(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, "set PxCLB, PxCLBU, PxFB and PxFBU " "during resume", port); - /* Config Port Received FIS Base Address */ - ddi_put64(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint64_t *)AHCI_PORT_PxFB(ahci_ctlp, port), - ahci_portp->ahciport_rcvd_fis_dma_cookie.dmac_laddress); - - /* Config Port Command List Base Address */ - ddi_put64(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint64_t *)AHCI_PORT_PxCLB(ahci_ctlp, port), - ahci_portp->ahciport_cmd_list_dma_cookie.dmac_laddress); + if (ahci_setup_port_base_addresses(ahci_ctlp, ahci_portp) != + AHCI_SUCCESS) + return (AHCI_FAILURE); } port_cmd_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, @@ -3904,6 +3946,9 @@ ahci_initialize_port(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, ahci_portp, port); } + /* Make sure the drive is spun-up */ + ahci_staggered_spin_up(ahci_ctlp, port); + /* Disable interrupt */ ahci_disable_port_intrs(ahci_ctlp, port); @@ -3939,7 +3984,7 @@ ahci_initialize_port(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, if (ret != AHCI_SUCCESS) { AHCIDBG(AHCIDBG_INIT|AHCIDBG_ERRS, ahci_ctlp, "ahci_initialize_port:" - "port reset faild at port %d", port); + "port reset failed at port %d", port); return (AHCI_FAILURE); } @@ -3952,24 +3997,26 @@ ahci_initialize_port(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, return (AHCI_FAILURE); } } + AHCIPORT_SET_STATE(ahci_portp, addrp, SATA_STATE_READY); AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "port %d is ready now.", port); /* * Try to get the device signature if the port is not empty. */ - if (!resuming && ahci_portp->ahciport_device_type != SATA_DTYPE_NONE) + if (!resuming && AHCIPORT_DEV_TYPE(ahci_portp, addrp) != + SATA_DTYPE_NONE) ahci_find_dev_signature(ahci_ctlp, ahci_portp, addrp); /* Return directly if no device connected */ - if (ahci_portp->ahciport_device_type == SATA_DTYPE_NONE) { + if (AHCIPORT_DEV_TYPE(ahci_portp, addrp) == SATA_DTYPE_NONE) { AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "No device connected to port %d", port); goto out; } /* If this is a port multiplier, we need do some initialization */ - if (ahci_portp->ahciport_device_type == SATA_DTYPE_PMULT) { + if (AHCIPORT_DEV_TYPE(ahci_portp, addrp) == SATA_DTYPE_PMULT) { AHCIDBG(AHCIDBG_INFO|AHCIDBG_PMULT, ahci_ctlp, "Port multiplier found at port %d", port); ahci_alloc_pmult(ahci_ctlp, ahci_portp); @@ -4884,8 +4931,8 @@ ahci_probe_pmport(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, * the port must be idle and PxTFD.STS.BSY and PxTFD.STS.DRQ must be * cleared unless command list override (PxCMD.CLO) is supported. * - * WARNING!!! ahciport_mutex should be acquired and PxCMD.FRE should be - * set before the function is called. + * WARNING!!! ahciport_mutex should be acquired before the function + * is called. */ static int ahci_software_reset(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, @@ -5130,17 +5177,12 @@ out: * * When an HBA or port reset occurs, Phy communication is going to * be re-established with the device through a COMRESET followed by the - * normal out-of-band communication sequence defined in Serial ATA. AT + * normal out-of-band communication sequence defined in Serial ATA. At * the end of reset, the device, if working properly, will send a D2H * Register FIS, which contains the device signature. When the HBA receives * this FIS, it updates PxTFD.STS and PxTFD.ERR register fields, and updates * the PxSIG register with the signature. 
* - * Staggered spin-up is an optional feature in SATA II, and it enables an HBA - * to individually spin-up attached devices. Please refer to chapter 10.9 of - * AHCI 1.0 spec. - */ -/* * WARNING!!! ahciport_mutex should be acquired, and PxCMD.ST should be also * cleared before the function is called. */ @@ -5149,7 +5191,7 @@ ahci_port_reset(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, ahci_addr_t *addrp) { ahci_addr_t pmult_addr; - uint32_t cap_status, port_cmd_status; + uint32_t port_cmd_status; uint32_t port_scontrol, port_sstatus, port_serror; uint32_t port_intr_status, port_task_file; uint32_t port_state; @@ -5169,117 +5211,45 @@ ahci_port_reset(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, "Port %d port resetting...", port); ahci_portp->ahciport_port_state = 0; - cap_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_GLOBAL_CAP(ahci_ctlp)); - port_cmd_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port)); - if (cap_status & AHCI_HBA_CAP_SSS) { - /* - * HBA support staggered spin-up, if the port has - * not spin up yet, then force it to do spin-up - */ - if (!(port_cmd_status & AHCI_CMD_STATUS_SUD)) { - if (!(ahci_portp->ahciport_flags - & AHCI_PORT_FLAG_SPINUP)) { - AHCIDBG(AHCIDBG_INIT, ahci_ctlp, - "Port %d PxCMD.SUD is zero, force " - "it to do spin-up", port); - ahci_portp->ahciport_flags |= - AHCI_PORT_FLAG_SPINUP; - } - } - } else { - /* - * HBA doesn't support stagger spin-up, force it - * to do normal COMRESET - */ - if (ahci_portp->ahciport_flags & - AHCI_PORT_FLAG_SPINUP) { - AHCIDBG(AHCIDBG_INIT, ahci_ctlp, - "HBA does not support staggered spin-up " - "force it to do normal COMRESET", NULL); - ahci_portp->ahciport_flags &= - ~AHCI_PORT_FLAG_SPINUP; - } - } - - if (!(ahci_portp->ahciport_flags & AHCI_PORT_FLAG_SPINUP)) { - /* Do normal COMRESET */ - AHCIDBG(AHCIDBG_INFO, ahci_ctlp, - "ahci_port_reset: do normal COMRESET", port); - - /* - * According to the spec, SUD bit should be set here, - * but JMicron JMB363 doesn't follow it, so remove - * the assertion, and just print a debug message. 
- */ -#if AHCI_DEBUG - if (!(port_cmd_status & AHCI_CMD_STATUS_SUD)) - AHCIDBG(AHCIDBG_ERRS, ahci_ctlp, - "port %d SUD bit not set", port) -#endif - - port_scontrol = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port)); - SCONTROL_SET_DET(port_scontrol, SCONTROL_DET_COMRESET); - - ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port), - port_scontrol); - - /* Enable PxCMD.FRE to read device */ - ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port), - port_cmd_status|AHCI_CMD_STATUS_FRE); - - /* - * Give time for COMRESET to percolate, according to the AHCI - * spec, software shall wait at least 1 millisecond before - * clearing PxSCTL.DET - */ - drv_usecwait(AHCI_1MS_USECS*2); - - /* Fetch the SCONTROL again and rewrite the DET part with 0 */ - port_scontrol = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port)); - SCONTROL_SET_DET(port_scontrol, SCONTROL_DET_NOACTION); - ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port), - port_scontrol); - } else { - /* Do staggered spin-up */ - port_scontrol = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port)); - SCONTROL_SET_DET(port_scontrol, SCONTROL_DET_NOACTION); + /* + * According to the spec, SUD bit should be set here, + * but JMicron JMB363 doesn't follow it, so print + * a debug message. + */ + if (!(port_cmd_status & AHCI_CMD_STATUS_SUD)) + AHCIDBG(AHCIDBG_ERRS, ahci_ctlp, + "ahci_port_reset: port %d SUD bit not set", port); - /* PxSCTL.DET must be 0 */ - ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port), - port_scontrol); + port_scontrol = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port)); + SCONTROL_SET_DET(port_scontrol, SCONTROL_DET_COMRESET); - port_cmd_status &= ~AHCI_CMD_STATUS_SUD; - ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port), - port_cmd_status); + ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port), + port_scontrol); - /* 0 -> 1 edge */ - drv_usecwait(AHCI_1MS_USECS*2); + /* Enable PxCMD.FRE to read device */ + ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port), + port_cmd_status|AHCI_CMD_STATUS_FRE); - /* Set PxCMD.SUD to 1 */ - port_cmd_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port)); - port_cmd_status |= AHCI_CMD_STATUS_SUD; - ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port), - port_cmd_status); + /* + * Give time for COMRESET to percolate, according to the AHCI + * spec, software shall wait at least 1 millisecond before + * clearing PxSCTL.DET + */ + drv_usecwait(AHCI_1MS_USECS * 2); - /* Enable PxCMD.FRE to read device */ - ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port), - port_cmd_status|AHCI_CMD_STATUS_FRE); - } + /* Fetch the SCONTROL again and rewrite the DET part with 0 */ + port_scontrol = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port)); + SCONTROL_SET_DET(port_scontrol, SCONTROL_DET_NOACTION); + ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port), + port_scontrol); /* * The port enters P:StartComm state, and HBA tells link 
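The unified COMRESET path above rewrites only the DET field of PxSCTL: set it to 1 to request the reset, wait at least a millisecond (the code uses drv_usecwait(AHCI_1MS_USECS * 2)), then write it back to 0. Per the SATA spec as I recall, DET occupies bits 3:0 of SControl, with 1 meaning COMRESET and 0 meaning no action. A small sketch of that read-modify-write (macro and function names here are placeholders, not the driver's SCONTROL_SET_DET()):

#include <stdio.h>
#include <stdint.h>

#define DET_MASK        0xfu    /* PxSCTL.DET, bits 3:0 */
#define DET_NOACTION    0x0u
#define DET_COMRESET    0x1u

static uint32_t
set_det(uint32_t sctl, uint32_t det)
{
        return ((sctl & ~DET_MASK) | (det & DET_MASK));
}

int
main(void)
{
        uint32_t sctl = 0x00000300;     /* hypothetical starting PxSCTL value */

        sctl = set_det(sctl, DET_COMRESET);     /* start COMRESET */
        printf("0x%08x\n", sctl);               /* 0x00000301 */
        /* ... wait >= 1 ms before releasing the reset ... */
        sctl = set_det(sctl, DET_NOACTION);
        printf("0x%08x\n", sctl);               /* 0x00000300 */
        return (0);
}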
layer to @@ -5645,7 +5615,7 @@ err: /* R/W PMULT error */ * * When an HBA reset occurs, Phy communication will be re-established with * the device through a COMRESET followed by the normal out-of-band - * communication sequence defined in Serial ATA. AT the end of reset, the + * communication sequence defined in Serial ATA. At the end of reset, the * device, if working properly, will send a D2H Register FIS, which contains * the device signature. When the HBA receives this FIS, it updates PxTFD.STS * and PxTFD.ERR register fields, and updates the PxSIG register with the @@ -5657,7 +5627,6 @@ static int ahci_hba_reset(ahci_ctl_t *ahci_ctlp) { ahci_port_t *ahci_portp; - ahci_addr_t addr; uint32_t ghc_control; uint8_t port; int loop_count; @@ -5728,7 +5697,8 @@ ahci_hba_reset(ahci_ctl_t *ahci_ctlp) ahci_portp = ahci_ctlp->ahcictl_ports[port]; mutex_enter(&ahci_portp->ahciport_mutex); - AHCI_ADDR_SET_PORT(&addr, port); + /* Make sure the drive is spun-up */ + ahci_staggered_spin_up(ahci_ctlp, port); if (ahci_restart_port_wait_till_ready(ahci_ctlp, ahci_portp, port, AHCI_PORT_RESET|AHCI_RESET_NO_EVENTS_UP, NULL) != @@ -5854,7 +5824,6 @@ ahci_find_dev_signature(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, signature = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, (uint32_t *)AHCI_PORT_PxSIG(ahci_ctlp, port)); -#ifdef AHCI_DEBUG if (AHCI_ADDR_IS_PMPORT(addrp)) { AHCIDBG(AHCIDBG_INIT|AHCIDBG_INFO|AHCIDBG_PMULT, ahci_ctlp, "ahci_find_dev_signature: signature = 0x%x at port %d:%d", @@ -5864,7 +5833,6 @@ ahci_find_dev_signature(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, "ahci_find_dev_signature: signature = 0x%x at port %d", signature, port); } -#endif /* NOTE: Only support ATAPI device at controller port. */ if (signature == AHCI_SIGNATURE_ATAPI && !AHCI_ADDR_IS_PORT(addrp)) @@ -5995,6 +5963,87 @@ ahci_start_port(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, uint8_t port) } /* + * Setup PxCLB, PxCLBU, PxFB, and PxFBU for particular port. First, we need + * to make sure PxCMD.ST, PxCMD.CR, PxCMD.FRE, and PxCMD.FR are all cleared. + * Then set PxCLB, PxCLBU, PxFB, and PxFBU. + * + * WARNING!!! ahciport_mutex should be acquired before the function is called. + */ +static int +ahci_setup_port_base_addresses(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp) +{ + uint8_t port = ahci_portp->ahciport_port_num; + uint32_t port_cmd_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port)); + + /* Step 1: Make sure both PxCMD.ST and PxCMD.CR are cleared. */ + if (port_cmd_status & (AHCI_CMD_STATUS_ST | AHCI_CMD_STATUS_CR)) { + if (ahci_put_port_into_notrunning_state(ahci_ctlp, ahci_portp, + port) != AHCI_SUCCESS) + return (AHCI_FAILURE); + + port_cmd_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port)); + } + + /* Step 2: Make sure both PxCMD.FRE and PxCMD.FR are cleared. 
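ahci_find_dev_signature() above classifies the device from the PxSIG value, which is the D2H signature FIS packed into one 32-bit word. The conventional SATA signature values are 0x00000101 for an ATA disk, 0xEB140101 for ATAPI, and 0x96690101 for a port multiplier (values from the SATA spec as I recall; the driver's AHCI_SIGNATURE_* macros are the authoritative definitions). A tiny decoding sketch (sig_name() is my own helper):

#include <stdio.h>
#include <stdint.h>

static const char *
sig_name(uint32_t sig)
{
        switch (sig) {
        case 0x00000101:        /* ATA disk */
                return ("disk");
        case 0xeb140101:        /* ATAPI device */
                return ("atapi");
        case 0x96690101:        /* port multiplier */
                return ("pmult");
        default:
                return ("unknown");
        }
}

int
main(void)
{
        printf("%s\n", sig_name(0xeb140101));   /* atapi */
        return (0);
}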
*/ + if (port_cmd_status & (AHCI_CMD_STATUS_FRE | AHCI_CMD_STATUS_FR)) { + int loop_count = 0; + + /* Clear PxCMD.FRE */ + port_cmd_status &= ~AHCI_CMD_STATUS_FRE; + ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port), + port_cmd_status); + + /* Wait until PxCMD.FR is cleared */ + for (;;) { + port_cmd_status = + ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port)); + + if (!(port_cmd_status & AHCI_CMD_STATUS_FR)) + break; + + if (loop_count++ >= AHCI_POLLRATE_PORT_IDLE_FR) { + AHCIDBG(AHCIDBG_INIT | AHCIDBG_ERRS, ahci_ctlp, + "ahci_setup_port_base_addresses: cannot " + "clear PxCMD.FR for port %d.", port); + + /* + * We are effectively timing out after 0.5 sec. + * This value is specified in AHCI spec. + */ + return (AHCI_FAILURE); + } + + /* Wait for 1 millisec */ + drv_usecwait(AHCI_1MS_USECS); + } + } + + /* Step 3: Config Port Command List Base Address */ + ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxCLB(ahci_ctlp, port), + ahci_portp->ahciport_cmd_list_dma_cookie.dmac_address); + + ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxCLBU(ahci_ctlp, port), + ahci_portp->ahciport_cmd_list_dma_cookie.dmac_notused); + + /* Step 4: Config Port Received FIS Base Address */ + ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxFB(ahci_ctlp, port), + ahci_portp->ahciport_rcvd_fis_dma_cookie.dmac_address); + + ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle, + (uint32_t *)AHCI_PORT_PxFBU(ahci_ctlp, port), + ahci_portp->ahciport_rcvd_fis_dma_cookie.dmac_notused); + + return (AHCI_SUCCESS); +} + +/* * Allocate the ahci_port_t including Received FIS and Command List. * The argument - port is the physical port number, and not logical * port number seen by the SATA framework. @@ -6028,14 +6077,20 @@ ahci_alloc_port_state(ahci_ctl_t *ahci_ctlp, uint8_t port) * Allocate memory for received FIS structure and * command list for this port */ - if (ahci_alloc_rcvd_fis(ahci_ctlp, ahci_portp, port) != AHCI_SUCCESS) { + if (ahci_alloc_rcvd_fis(ahci_ctlp, ahci_portp) != AHCI_SUCCESS) { goto err_case1; } - if (ahci_alloc_cmd_list(ahci_ctlp, ahci_portp, port) != AHCI_SUCCESS) { + if (ahci_alloc_cmd_list(ahci_ctlp, ahci_portp) != AHCI_SUCCESS) { goto err_case2; } + /* Setup PxCMD.CLB, PxCMD.CLBU, PxCMD.FB, and PxCMD.FBU */ + if (ahci_setup_port_base_addresses(ahci_ctlp, ahci_portp) != + AHCI_SUCCESS) { + goto err_case3; + } + (void) snprintf(taskq_name + strlen(taskq_name), sizeof (taskq_name) - strlen(taskq_name), "_port%d", port); @@ -6087,7 +6142,7 @@ err_case1: } /* - * Reverse of ahci_dealloc_port_state(). + * Reverse of ahci_alloc_port_state(). * * WARNING!!! ahcictl_mutex should be acquired before the function * is called. @@ -6126,8 +6181,7 @@ ahci_dealloc_port_state(ahci_ctl_t *ahci_ctlp, uint8_t port) * is called. 
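Steps 3 and 4 above write the command-list and received-FIS base addresses as two 32-bit halves: the cookie's dmac_address (low half) goes to PxCLB/PxFB and dmac_notused (high half) to PxCLBU/PxFBU, replacing the earlier single ddi_put64(). A stand-alone sketch of the split itself (the 64-bit value is made up; the register names are the real AHCI ones):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint64_t laddress = 0x0000000123456000ULL;      /* hypothetical DMA cookie address */
        uint32_t lo = (uint32_t)(laddress & 0xffffffffULL);     /* -> PxCLB / PxFB */
        uint32_t hi = (uint32_t)(laddress >> 32);               /* -> PxCLBU / PxFBU */

        printf("low = 0x%08x, high = 0x%08x\n", lo, hi);
        return (0);
}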
*/ static int -ahci_alloc_rcvd_fis(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, - uint8_t port) +ahci_alloc_rcvd_fis(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp) { size_t rcvd_fis_size; size_t ret_len; @@ -6185,11 +6239,6 @@ ahci_alloc_rcvd_fis(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, bzero((void *)ahci_portp->ahciport_rcvd_fis, rcvd_fis_size); - /* Config Port Received FIS Base Address */ - ddi_put64(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint64_t *)AHCI_PORT_PxFB(ahci_ctlp, port), - ahci_portp->ahciport_rcvd_fis_dma_cookie.dmac_laddress); - AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "64-bit, dma address: 0x%llx", ahci_portp->ahciport_rcvd_fis_dma_cookie.dmac_laddress); AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "32-bit, dma address: 0x%x", @@ -6226,8 +6275,7 @@ ahci_dealloc_rcvd_fis(ahci_port_t *ahci_portp) * is called. */ static int -ahci_alloc_cmd_list(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, - uint8_t port) +ahci_alloc_cmd_list(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp) { size_t cmd_list_size; size_t ret_len; @@ -6285,11 +6333,6 @@ ahci_alloc_cmd_list(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, bzero((void *)ahci_portp->ahciport_cmd_list, cmd_list_size); - /* Config Port Command List Base Address */ - ddi_put64(ahci_ctlp->ahcictl_ahci_acc_handle, - (uint64_t *)AHCI_PORT_PxCLB(ahci_ctlp, port), - ahci_portp->ahciport_cmd_list_dma_cookie.dmac_laddress); - AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "64-bit, dma address: 0x%llx", ahci_portp->ahciport_cmd_list_dma_cookie.dmac_laddress); diff --git a/usr/src/uts/common/sys/fm/fs/zfs.h b/usr/src/uts/common/sys/fm/fs/zfs.h index c752edc99b..029af540b3 100644 --- a/usr/src/uts/common/sys/fm/fs/zfs.h +++ b/usr/src/uts/common/sys/fm/fs/zfs.h @@ -46,6 +46,7 @@ extern "C" { #define FM_EREPORT_ZFS_IO_FAILURE "io_failure" #define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure" #define FM_EREPORT_ZFS_LOG_REPLAY "log_replay" +#define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write" #define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" #define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode" diff --git a/usr/src/uts/common/sys/sata/adapters/ahci/ahcireg.h b/usr/src/uts/common/sys/sata/adapters/ahci/ahcireg.h index 5614c929d5..e738783dfe 100644 --- a/usr/src/uts/common/sys/sata/adapters/ahci/ahcireg.h +++ b/usr/src/uts/common/sys/sata/adapters/ahci/ahcireg.h @@ -23,7 +23,9 @@ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - +/* + * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + */ #ifndef _AHCIREG_H #define _AHCIREG_H @@ -134,6 +136,10 @@ extern "C" { #define AHCI_GLOBAL_EM_LOC(ahci_ctlp) (AHCI_GLOBAL_OFFSET(ahci_ctlp) + 0x1c) /* Enclosure Management Control */ #define AHCI_GLOBAL_EM_CTL(ahci_ctlp) (AHCI_GLOBAL_OFFSET(ahci_ctlp) + 0x20) + /* HBA Capabilities Extended (AHCI spec 1.2) */ +#define AHCI_GLOBAL_CAP2(ahci_ctlp) (AHCI_GLOBAL_OFFSET(ahci_ctlp) + 0x24) + /* BIOS/OS Handoff Control and Status (AHCI spec 1.2) */ +#define AHCI_GLOBAL_BOHC(ahci_ctlp) (AHCI_GLOBAL_OFFSET(ahci_ctlp) + 0x28) #define AHCI_PORT_IMPLEMENTED(ahci_ctlp, port) \ ((0x1 << port) & ahci_ctlp->ahcictl_ports_implemented) diff --git a/usr/src/uts/common/sys/sata/adapters/ahci/ahcivar.h b/usr/src/uts/common/sys/sata/adapters/ahci/ahcivar.h index bf8671425c..b28d0b6464 100644 --- a/usr/src/uts/common/sys/sata/adapters/ahci/ahcivar.h +++ b/usr/src/uts/common/sys/sata/adapters/ahci/ahcivar.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. */ @@ -133,9 +134,6 @@ typedef struct ahci_pmult_info ahci_pmult_info_t; /* * flags for ahciport_flags * - * AHCI_PORT_FLAG_SPINUP: this flag will be set when a HBA which supports - * staggered spin-up needs to do a spin-up. - * * AHCI_PORT_FLAG_MOPPING: this flag will be set when the HBA is stopped, * and all the outstanding commands need to be aborted and sent to upper * layers. @@ -173,7 +171,6 @@ typedef struct ahci_pmult_info ahci_pmult_info_t; * will be printed. Note that, for INDENTIFY DEVICE command sent to ATAPI * device or ATAPI PACKET command, this flag won't be set. */ -#define AHCI_PORT_FLAG_SPINUP 0x01 #define AHCI_PORT_FLAG_MOPPING 0x02 #define AHCI_PORT_FLAG_POLLING 0x04 #define AHCI_PORT_FLAG_RQSENSE 0x08 @@ -199,7 +196,6 @@ typedef struct ahci_port { ahci_pmult_info_t *ahciport_pmult_info; /* - * AHCI_PORT_FLAG_SPINUP * AHCI_PORT_FLAG_MOPPING * AHCI_PORT_FLAG_POLLING * AHCI_PORT_FLAG_RQSENSE @@ -552,6 +548,7 @@ _NOTE(MUTEX_PROTECTS_DATA(ahci_ctl_t::ahcictl_mutex, #define AHCI_POLLRATE_PORT_IDLE 50 #define AHCI_POLLRATE_PORT_SOFTRESET 100 #define AHCI_POLLRATE_GET_SPKT 100 +#define AHCI_POLLRATE_PORT_IDLE_FR 500 /* Clearing & setting the n'th bit in a given tag */ |
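The ahcivar.h comment the page ends on introduces the driver's macros for clearing and setting the n'th bit of a tag word (the definitions themselves are cut off here). The generic forms of those two operations look like this (illustrative macros, not the driver's own definitions):

#include <stdio.h>
#include <stdint.h>

#define SET_BIT(tags, n)        ((tags) |= (1U << (n)))
#define CLEAR_BIT(tags, n)      ((tags) &= ~(1U << (n)))

int
main(void)
{
        uint32_t tags = 0;

        SET_BIT(tags, 3);
        SET_BIT(tags, 0);
        CLEAR_BIT(tags, 3);
        printf("0x%08x\n", tags);       /* 0x00000001 */
        return (0);
}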