author     Keith M Wesolowski <wesolows@foobazco.org>   2013-06-11 22:47:43 +0000
committer  Keith M Wesolowski <wesolows@foobazco.org>   2013-06-11 22:47:43 +0000
commit     80702ccb4f267d5bfae86b07731a8f89e724d055 (patch)
tree       4bcba7532a1633ba6d708dbaf41c21af8876e7c0
parent     2ffdaec9c70166169b32efb691ef26af80d99263 (diff)
parent     3cb69f734bc60bbb4d56a28c83706db862bec082 (diff)
download   illumos-joyent-80702ccb4f267d5bfae86b07731a8f89e724d055.tar.gz
[illumos-gate merge]
commit 3cb69f734bc60bbb4d56a28c83706db862bec082
    3749 zfs event processing should work on R/O root filesystems
commit b3d9f2e26021d3f55a281af30720589d303b9806
    3747 txg commit callbacks don't work
commit 8b713775314bbbf24edd503b4869342d8711ce95
    3745 zpool create should treat -O mountpoint and -m the same
    3811 zpool create -o altroot=/xyz -O mountpoint=/mnt ignores the mountpoint option
commit fc7a6e3fefc649cb65c8e2a35d194781445008b0
    3744 zfs shouldn't ignore errors unmounting snapshots
commit b287be1ba86043996f49b1cc34c80cc620f9b841
    3743 zfs needs a refcount audit
commit f7170741490edba9d1d9c697c177c887172bc741
    3742 zfs comments need cleaner, more consistent style
commit 3e30c24aeefdee1631958ecf17f18da671781956
    3741 zfs needs better comments
commit 2ac302890e472bf0c11db192dd18f12ded6043f6
    3797 AHCI: Support for ASMedia ASM106x
-rw-r--r--  usr/src/cmd/mdb/intel/modules/sata/sata.c | 8
-rw-r--r--  usr/src/cmd/zpool/zpool_main.c | 37
-rw-r--r--  usr/src/cmd/ztest/ztest.c | 2
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_dataset.c | 5
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_pool.c | 16
-rwxr-xr-x [-rw-r--r--]  usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh | 8
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c | 33
-rw-r--r--  usr/src/uts/common/fs/zfs/bptree.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/dbuf.c | 24
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_tx.c | 8
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu_zfetch.c | 15
-rw-r--r--  usr/src/uts/common/fs/zfs/dnode.c | 18
-rw-r--r--  usr/src/uts/common/fs/zfs/dnode_sync.c | 4
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_dataset.c | 4
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_prop.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/dsl_userhold.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sa.c | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/spa.c | 44
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_config.c | 51
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_errlog.c | 4
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_misc.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/ddt.h | 15
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu.h | 8
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dnode.h | 5
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dsl_pool.h | 1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/sa_impl.h | 37
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa_impl.h | 3
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/space_map.h | 1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/unique.h | 4
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 10
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zap.h | 33
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zap_leaf.h | 15
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_acl.h | 5
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_rlock.h | 17
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zfs_znode.h | 16
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zil.h | 13
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio_compress.h | 5
-rw-r--r--  usr/src/uts/common/fs/zfs/txg.c | 22
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c | 18
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_label.c | 1
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_queue.c | 11
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_raidz.c | 74
-rw-r--r--  usr/src/uts/common/fs/zfs/zap.c | 38
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_acl.c | 9
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ctldir.c | 6
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c | 60
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_log.c | 21
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_rlock.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_sa.c | 2
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vfsops.c | 24
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_vnops.c | 121
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_znode.c | 17
-rw-r--r--  usr/src/uts/common/fs/zfs/zil.c | 5
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c | 17
-rw-r--r--  usr/src/uts/common/io/sata/adapters/ahci/ahci.c | 405
-rw-r--r--  usr/src/uts/common/sys/fm/fs/zfs.h | 1
-rw-r--r--  usr/src/uts/common/sys/sata/adapters/ahci/ahcireg.h | 8
-rw-r--r--  usr/src/uts/common/sys/sata/adapters/ahci/ahcivar.h | 7
60 files changed, 806 insertions, 547 deletions
diff --git a/usr/src/cmd/mdb/intel/modules/sata/sata.c b/usr/src/cmd/mdb/intel/modules/sata/sata.c
index 5a43653d30..b1618cd8d8 100644
--- a/usr/src/cmd/mdb/intel/modules/sata/sata.c
+++ b/usr/src/cmd/mdb/intel/modules/sata/sata.c
@@ -22,7 +22,9 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-
+/*
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ */
#include <sys/mdb_modapi.h>
#include <mdb/mdb_ks.h>
@@ -168,7 +170,7 @@ sata_dmsg_dump(sata_trace_dmsg_t *addr, int print_pathname, uint_t *printed)
(void) mdb_ddi_pathname(
(uintptr_t)dmsg.dip, pathname,
sizeof (pathname));
- mdb_printf("\n[%s]", pathname);
+ mdb_printf("[%s]", pathname);
}
}
} else {
@@ -181,7 +183,7 @@ sata_dmsg_dump(sata_trace_dmsg_t *addr, int print_pathname, uint_t *printed)
dmsg.buf);
}
- mdb_printf("%s", merge);
+ mdb_printf("%s\n", merge);
if (printed != NULL) {
(*printed)++;
diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c
index 310c33462a..7a133bf27e 100644
--- a/usr/src/cmd/zpool/zpool_main.c
+++ b/usr/src/cmd/zpool/zpool_main.c
@@ -678,6 +678,7 @@ zpool_do_create(int argc, char **argv)
goto errout;
break;
case 'm':
+ /* Equivalent to -O mountpoint=optarg */
mountpoint = optarg;
break;
case 'o':
@@ -716,8 +717,18 @@ zpool_do_create(int argc, char **argv)
*propval = '\0';
propval++;
- if (add_prop_list(optarg, propval, &fsprops, B_FALSE))
+ /*
+ * Mountpoints are checked and then added later.
+ * Uniquely among properties, they can be specified
+ * more than once, to avoid conflict with -m.
+ */
+ if (0 == strcmp(optarg,
+ zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) {
+ mountpoint = propval;
+ } else if (add_prop_list(optarg, propval, &fsprops,
+ B_FALSE)) {
goto errout;
+ }
break;
case ':':
(void) fprintf(stderr, gettext("missing argument for "
@@ -834,6 +845,18 @@ zpool_do_create(int argc, char **argv)
}
}
+ /*
+ * Now that the mountpoint's validity has been checked, ensure that
+ * the property is set appropriately prior to creating the pool.
+ */
+ if (mountpoint != NULL) {
+ ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT),
+ mountpoint, &fsprops, B_FALSE);
+ if (ret != 0)
+ goto errout;
+ }
+
+ ret = 1;
if (dryrun) {
/*
* For a dry run invocation, print out a basic message and run
@@ -868,21 +891,19 @@ zpool_do_create(int argc, char **argv)
if (nvlist_exists(props, propname))
continue;
- if (add_prop_list(propname, ZFS_FEATURE_ENABLED,
- &props, B_TRUE) != 0)
+ ret = add_prop_list(propname,
+ ZFS_FEATURE_ENABLED, &props, B_TRUE);
+ if (ret != 0)
goto errout;
}
}
+
+ ret = 1;
if (zpool_create(g_zfs, poolname,
nvroot, props, fsprops) == 0) {
zfs_handle_t *pool = zfs_open(g_zfs, poolname,
ZFS_TYPE_FILESYSTEM);
if (pool != NULL) {
- if (mountpoint != NULL)
- verify(zfs_prop_set(pool,
- zfs_prop_to_name(
- ZFS_PROP_MOUNTPOINT),
- mountpoint) == 0);
if (zfs_mount(pool, NULL, 0) == 0)
ret = zfs_shareall(pool);
zfs_close(pool);
diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c
index ed460551c6..980615eae0 100644
--- a/usr/src/cmd/ztest/ztest.c
+++ b/usr/src/cmd/ztest/ztest.c
@@ -4507,7 +4507,7 @@ ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id)
*/
tmp_cb = list_head(&zcl.zcl_callbacks);
if (tmp_cb != NULL &&
- tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) {
+ (txg - ZTEST_COMMIT_CALLBACK_THRESH) > tmp_cb->zcd_txg) {
fatal(0, "Commit callback threshold exceeded, oldest txg: %"
PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg);
}
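
The reversed comparison above is the substance of illumos 3747's ztest change: the old test fired only when the oldest registered callback was newer than the threshold, so a backlog of callbacks that never ran went undetected. A minimal sketch of the intended predicate, assuming an illustrative threshold value and adding an underflow guard the real code does not show:

    #include <stdint.h>
    #include <assert.h>

    #define ZTEST_COMMIT_CALLBACK_THRESH    4       /* illustrative value */

    /*
     * Fatal condition: the oldest outstanding commit callback lags the
     * currently open txg by more than the threshold.
     */
    static int
    commit_cb_stuck(uint64_t oldest_cb_txg, uint64_t open_txg)
    {
            if (open_txg < ZTEST_COMMIT_CALLBACK_THRESH)
                    return (0);     /* avoid unsigned underflow early on */
            return ((open_txg - ZTEST_COMMIT_CALLBACK_THRESH) > oldest_cb_txg);
    }

    int
    main(void)
    {
            assert(!commit_cb_stuck(10, 12));       /* within threshold */
            assert(commit_cb_stuck(3, 12));         /* 12 - 4 > 3: stuck */
            return (0);
    }
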
diff --git a/usr/src/lib/libzfs/common/libzfs_dataset.c b/usr/src/lib/libzfs/common/libzfs_dataset.c
index 36ab907508..27656b7526 100644
--- a/usr/src/lib/libzfs/common/libzfs_dataset.c
+++ b/usr/src/lib/libzfs/common/libzfs_dataset.c
@@ -4473,6 +4473,11 @@ zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl)
return (err);
}
+/*
+ * Convert the zvol's volume size to an appropriate reservation.
+ * Note: If this routine is updated, it is necessary to update the ZFS test
+ * suite's shell version in reservation.kshlib.
+ */
uint64_t
zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props)
{
diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c
index 8315e1404b..0fd5f5738c 100644
--- a/usr/src/lib/libzfs/common/libzfs_pool.c
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c
@@ -1088,7 +1088,6 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
nvlist_t *zc_fsprops = NULL;
nvlist_t *zc_props = NULL;
char msg[1024];
- char *altroot;
int ret = -1;
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
@@ -1187,21 +1186,6 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
}
}
- /*
- * If this is an alternate root pool, then we automatically set the
- * mountpoint of the root dataset to be '/'.
- */
- if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT),
- &altroot) == 0) {
- zfs_handle_t *zhp;
-
- verify((zhp = zfs_open(hdl, pool, ZFS_TYPE_DATASET)) != NULL);
- verify(zfs_prop_set(zhp, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT),
- "/") == 0);
-
- zfs_close(zhp);
- }
-
create_failed:
zcmd_free_nvlists(&zc);
nvlist_free(zc_props);
diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh
index 2458cf350c..c2f3789891 100644..100755
--- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh
+++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh
@@ -103,12 +103,16 @@ do
[[ "$mpt" != "$mpt_val" ]] && \
log_fail "The value of mountpoint property is different\
from the output of zfs mount"
- if [[ "$opt" == "-R $TESTDIR1" ]] || [[ "$opt" == "-m $TESTDIR1" ]];
- then
+ if [[ "$opt" == "-m $TESTDIR1" ]]; then
[[ ! -d $TESTDIR1 ]] && \
log_fail "$TESTDIR1 is not created auotmatically."
[[ "$mpt" != "$TESTDIR1" ]] && \
log_fail "$TESTPOOL is not mounted on $TESTDIR1."
+ elif [[ "$opt" == "-R $TESTDIR1" ]]; then
+ [[ ! -d $TESTDIR1/$TESTPOOL ]] && \
+ log_fail "$TESTDIR1/$TESTPOOL is not created auotmatically."
+ [[ "$mpt" != "$TESTDIR1/$TESTPOOL" ]] && \
+ log_fail "$TESTPOOL is not mounted on $TESTDIR1/$TESTPOOL."
else
[[ ! -d ${TESTDIR1}$TESTDIR1 ]] && \
log_fail "${TESTDIR1}$TESTDIR1 is not created automatically."
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index 81ee2e3ff3..67847d2d99 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -59,11 +59,11 @@
* tight.
*
* 3. The Megiddo and Modha model assumes a fixed page size. All
- * elements of the cache are therefor exactly the same size. So
+ * elements of the cache are therefore exactly the same size. So
* when adjusting the cache size following a cache miss, it's simply
* a matter of choosing a single page to evict. In our model, we
* have variable sized cache blocks (ranging from 512 bytes to
- * 128K bytes). We therefor choose a set of blocks to evict to make
+ * 128K bytes). We therefore choose a set of blocks to evict to make
* space for a cache miss that approximates as closely as possible
* the space used by the new block.
*
@@ -78,7 +78,7 @@
* ways: 1) via a hash table lookup using the DVA as a key,
* or 2) via one of the ARC lists. The arc_read() interface
* uses method 1, while the internal arc algorithms for
- * adjusting the cache use method 2. We therefor provide two
+ * adjusting the cache use method 2. We therefore provide two
* types of locks: 1) the hash table lock array, and 2) the
* arc list locks.
*
@@ -258,7 +258,18 @@ typedef struct arc_stats {
kstat_named_t arcstat_mfu_ghost_hits;
kstat_named_t arcstat_deleted;
kstat_named_t arcstat_recycle_miss;
+ /*
+ * Number of buffers that could not be evicted because the hash lock
+ * was held by another thread. The lock may not necessarily be held
+ * by something using the same buffer, since hash locks are shared
+ * by multiple buffers.
+ */
kstat_named_t arcstat_mutex_miss;
+ /*
+ * Number of buffers skipped because they have I/O in progress, are
+ * indirect prefetch buffers that have not lived long enough, or are
+ * not from the spa we're trying to evict from.
+ */
kstat_named_t arcstat_evict_skip;
kstat_named_t arcstat_evict_l2_cached;
kstat_named_t arcstat_evict_l2_eligible;
@@ -376,7 +387,7 @@ static arc_stats_t arc_stats = {
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
#define ARCSTAT_INCR(stat, val) \
- atomic_add_64(&arc_stats.stat.value.ui64, (val));
+ atomic_add_64(&arc_stats.stat.value.ui64, (val))
#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1)
#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1)
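
The only functional edit in the hunk above is dropping ARCSTAT_INCR's trailing semicolon. With the semicolon baked into the macro, an invocation followed by its own ';' expands to two statements, which breaks brace-less if/else. A standalone illustration with a toy macro and counter (nothing here is ARC code):

    #include <stdio.h>

    static long stat_counter;

    /* BAD: trailing semicolon means INCR_BAD(x); expands to two statements. */
    #define INCR_BAD(val)   (stat_counter += (val));
    /* GOOD: the caller supplies the semicolon, so if/else composes normally. */
    #define INCR_GOOD(val)  (stat_counter += (val))

    int
    main(void)
    {
            int hit = 0;

            /*
             * With INCR_BAD the 'else' below would attach to nothing and the
             * compiler rejects it ("else without a previous if"):
             *
             *      if (hit)
             *              INCR_BAD(1);
             *      else
             *              INCR_BAD(-1);
             */
            if (hit)
                    INCR_GOOD(1);
            else
                    INCR_GOOD(-1);

            (void) printf("counter = %ld\n", stat_counter);
            return (0);
    }
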
@@ -604,9 +615,7 @@ uint64_t zfs_crc64_table[256];
#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
-/*
- * L2ARC Performance Tunables
- */
+/* L2ARC Performance Tunables */
uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */
uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
@@ -3001,6 +3010,10 @@ top:
mutex_exit(hash_lock);
+ /*
+ * At this point, we have a level 1 cache miss. Try again in
+ * L2ARC if possible.
+ */
ASSERT3U(hdr->b_size, ==, size);
DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
uint64_t, size, zbookmark_t *, zb);
@@ -3243,8 +3256,8 @@ arc_buf_evict(arc_buf_t *buf)
}
/*
- * Release this buffer from the cache. This must be done
- * after a read and prior to modifying the buffer contents.
+ * Release this buffer from the cache, making it an anonymous buffer. This
+ * must be done after a read and prior to modifying the buffer contents.
* If the buffer has more than one reference, we must make
* a new hdr for the buffer.
*/
@@ -3633,7 +3646,7 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg)
/*
* Writes will, almost always, require additional memory allocations
- * in order to compress/encrypt/etc the data. We therefor need to
+ * in order to compress/encrypt/etc the data. We therefore need to
* make sure that there is sufficient available memory for this.
*/
if (error = arc_memory_throttle(reserve, anon_size, txg))
diff --git a/usr/src/uts/common/fs/zfs/bptree.c b/usr/src/uts/common/fs/zfs/bptree.c
index 73922db88b..a0c90cc4d9 100644
--- a/usr/src/uts/common/fs/zfs/bptree.c
+++ b/usr/src/uts/common/fs/zfs/bptree.c
@@ -43,7 +43,7 @@
* dsl_scan_sync. This allows the delete operation to finish without traversing
* all the dataset's blocks.
*
- * Note that while bt_begin and bt_end are only ever incremented in this code
+ * Note that while bt_begin and bt_end are only ever incremented in this code,
* they are effectively reset to 0 every time the entire bptree is freed because
* the bptree's object is destroyed and re-created.
*/
diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c
index f4c1904543..9f4c8a8e35 100644
--- a/usr/src/uts/common/fs/zfs/dbuf.c
+++ b/usr/src/uts/common/fs/zfs/dbuf.c
@@ -653,6 +653,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
if (!havepzio)
err = zio_wait(zio);
} else {
+ /*
+ * Another reader came in while the dbuf was in flight
+ * between UNCACHED and CACHED. Either a writer will finish
+ * writing the buffer (sending the dbuf to CACHED) or the
+ * first reader's request will reach the read_done callback
+ * and send the dbuf to CACHED. Otherwise, a failure
+ * occurred and the dbuf went to UNCACHED.
+ */
mutex_exit(&db->db_mtx);
if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
@@ -661,6 +669,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db);
+ /* Skip the wait per the caller's request. */
mutex_enter(&db->db_mtx);
if ((flags & DB_RF_NEVERWAIT) == 0) {
while (db->db_state == DB_READ ||
@@ -1276,7 +1285,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
}
/*
- * Return TRUE if this evicted the dbuf.
+ * Undirty a buffer in the transaction group referenced by the given
+ * transaction. Return whether this evicted the dbuf.
*/
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
@@ -2237,6 +2247,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(db->db_level > 0);
DBUF_VERIFY(db);
+ /* Read the block if it hasn't been read yet. */
if (db->db_buf == NULL) {
mutex_exit(&db->db_mtx);
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
@@ -2247,10 +2258,12 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
+ /* Indirect block size must match what the dnode thinks it is. */
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
dbuf_check_blkptr(dn, db);
DB_DNODE_EXIT(db);
+ /* Provide the pending dirty record to child dbufs */
db->db_data_pending = dr;
mutex_exit(&db->db_mtx);
@@ -2637,6 +2650,7 @@ dbuf_write_override_done(zio_t *zio)
dbuf_write_done(zio, NULL, db);
}
+/* Issue I/O to commit a dirty buffer to disk. */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
@@ -2671,11 +2685,19 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
}
if (parent != dn->dn_dbuf) {
+ /* Our parent is an indirect block. */
+ /* We have a dirty parent that has been scheduled for write. */
ASSERT(parent && parent->db_data_pending);
+ /* Our parent's buffer is one level closer to the dnode. */
ASSERT(db->db_level == parent->db_level-1);
+ /*
+ * We're about to modify our parent's db_data by modifying
+ * our block pointer, so the parent must be released.
+ */
ASSERT(arc_released(parent->db_buf));
zio = parent->db_data_pending->dr_zio;
} else {
+ /* Our parent is the dnode itself. */
ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
db->db_blkid != DMU_SPILL_BLKID) ||
(db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index a616fd37cf..a3640fd593 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -1827,7 +1827,7 @@ dmu_init(void)
void
dmu_fini(void)
{
- arc_fini();
+ arc_fini(); /* arc depends on l2arc, so arc must go first */
l2arc_fini();
zfetch_fini();
dbuf_fini();
diff --git a/usr/src/uts/common/fs/zfs/dmu_tx.c b/usr/src/uts/common/fs/zfs/dmu_tx.c
index ad4084021d..e30c6d345e 100644
--- a/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ b/usr/src/uts/common/fs/zfs/dmu_tx.c
@@ -1016,6 +1016,10 @@ dmu_tx_unassign(dmu_tx_t *tx)
txg_rele_to_quiesce(&tx->tx_txgh);
+ /*
+ * Walk the transaction's hold list, removing the hold on the
+ * associated dnode, and notifying waiters if the refcount drops to 0.
+ */
for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
txh = list_next(&tx->tx_holds, txh)) {
dnode_t *dn = txh->txh_dnode;
@@ -1128,6 +1132,10 @@ dmu_tx_commit(dmu_tx_t *tx)
ASSERT(tx->tx_txg != 0);
+ /*
+ * Go through the transaction's hold list and remove holds on
+ * associated dnodes, notifying waiters if no holds remain.
+ */
while (txh = list_head(&tx->tx_holds)) {
dnode_t *dn = txh->txh_dnode;
diff --git a/usr/src/uts/common/fs/zfs/dmu_zfetch.c b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
index 37037c30f6..2ebfa183aa 100644
--- a/usr/src/uts/common/fs/zfs/dmu_zfetch.c
+++ b/usr/src/uts/common/fs/zfs/dmu_zfetch.c
@@ -48,11 +48,11 @@ uint32_t zfetch_block_cap = 256;
uint64_t zfetch_array_rd_sz = 1024 * 1024;
/* forward decls for static routines */
-static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
+static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *);
static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
-static int dmu_zfetch_find(zfetch_t *, zstream_t *, int);
+static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int);
static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
@@ -104,9 +104,9 @@ kstat_t *zfetch_ksp;
* last stream, then we are probably in a strided access pattern. So
* combine the two sequential streams into a single strided stream.
*
- * If no co-linear streams are found, return NULL.
+ * Returns whether co-linear streams were found.
*/
-static int
+static boolean_t
dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
{
zstream_t *z_walk;
@@ -326,7 +326,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
* for this block read. If so, it starts a prefetch for the stream it
* located and returns true, otherwise it returns false
*/
-static int
+static boolean_t
dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
{
zstream_t *zs;
@@ -639,7 +639,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
{
zstream_t zst;
zstream_t *newstream;
- int fetched;
+ boolean_t fetched;
int inserted;
unsigned int blkshft;
uint64_t blksz;
@@ -665,7 +665,8 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
ZFETCHSTAT_BUMP(zfetchstat_hits);
} else {
ZFETCHSTAT_BUMP(zfetchstat_misses);
- if (fetched = dmu_zfetch_colinear(zf, &zst)) {
+ fetched = dmu_zfetch_colinear(zf, &zst);
+ if (fetched) {
ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
} else {
ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
diff --git a/usr/src/uts/common/fs/zfs/dnode.c b/usr/src/uts/common/fs/zfs/dnode.c
index 417e219b88..92996a6f12 100644
--- a/usr/src/uts/common/fs/zfs/dnode.c
+++ b/usr/src/uts/common/fs/zfs/dnode.c
@@ -1803,14 +1803,16 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
}
/*
- * This function scans a block at the indicated "level" looking for
- * a hole or data (depending on 'flags'). If level > 0, then we are
- * scanning an indirect block looking at its pointers. If level == 0,
- * then we are looking at a block of dnodes. If we don't find what we
- * are looking for in the block, we return ESRCH. Otherwise, return
- * with *offset pointing to the beginning (if searching forwards) or
- * end (if searching backwards) of the range covered by the block
- * pointer we matched on (or dnode).
+ * Scans a block at the indicated "level" looking for a hole or data,
+ * depending on 'flags'.
+ *
+ * If level > 0, then we are scanning an indirect block looking at its
+ * pointers. If level == 0, then we are looking at a block of dnodes.
+ *
+ * If we don't find what we are looking for in the block, we return ESRCH.
+ * Otherwise, return with *offset pointing to the beginning (if searching
+ * forwards) or end (if searching backwards) of the range covered by the
+ * block pointer we matched on (or dnode).
*
* The basic search algorithm used below by dnode_next_offset() is to
* use this function to search up the block tree (widen the search) until
diff --git a/usr/src/uts/common/fs/zfs/dnode_sync.c b/usr/src/uts/common/fs/zfs/dnode_sync.c
index 7d47ce02b4..2a1094be24 100644
--- a/usr/src/uts/common/fs/zfs/dnode_sync.c
+++ b/usr/src/uts/common/fs/zfs/dnode_sync.c
@@ -302,7 +302,7 @@ free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, int trunc,
}
/*
- * free_range: Traverse the indicated range of the provided file
+ * Traverse the indicated range of the provided file
* and "free" all the blocks contained there.
*/
static void
@@ -370,7 +370,7 @@ dnode_sync_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
}
/*
- * Try to kick all the dnodes dbufs out of the cache...
+ * Try to kick all the dnode's dbufs out of the cache...
*/
void
dnode_evict_dbufs(dnode_t *dn)
diff --git a/usr/src/uts/common/fs/zfs/dsl_dataset.c b/usr/src/uts/common/fs/zfs/dsl_dataset.c
index bfc8b06d03..d59b6fa052 100644
--- a/usr/src/uts/common/fs/zfs/dsl_dataset.c
+++ b/usr/src/uts/common/fs/zfs/dsl_dataset.c
@@ -356,8 +356,10 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
/* Make sure dsobj has the correct object type. */
dmu_object_info_from_db(dbuf, &doi);
- if (doi.doi_type != DMU_OT_DSL_DATASET)
+ if (doi.doi_type != DMU_OT_DSL_DATASET) {
+ dmu_buf_rele(dbuf, tag);
return (SET_ERROR(EINVAL));
+ }
ds = dmu_buf_get_user(dbuf);
if (ds == NULL) {
diff --git a/usr/src/uts/common/fs/zfs/dsl_prop.c b/usr/src/uts/common/fs/zfs/dsl_prop.c
index 8e0e089448..2eada5cd16 100644
--- a/usr/src/uts/common/fs/zfs/dsl_prop.c
+++ b/usr/src/uts/common/fs/zfs/dsl_prop.c
@@ -380,7 +380,7 @@ dsl_prop_predict(dsl_dir_t *dd, const char *propname,
/*
* Unregister this callback. Return 0 on success, ENOENT if ddname is
- * invalid, ENOMSG if no matching callback registered.
+ * invalid, or ENOMSG if no matching callback registered.
*/
int
dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
diff --git a/usr/src/uts/common/fs/zfs/dsl_userhold.c b/usr/src/uts/common/fs/zfs/dsl_userhold.c
index fa9d937085..568bba33b5 100644
--- a/usr/src/uts/common/fs/zfs/dsl_userhold.c
+++ b/usr/src/uts/common/fs/zfs/dsl_userhold.c
@@ -433,7 +433,7 @@ dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, const char *htag)
dsl_dataset_name(ds, name);
dsl_dataset_rele(ds, FTAG);
dsl_pool_config_exit(dp, FTAG);
- zfs_unmount_snap(name);
+ (void) zfs_unmount_snap(name);
} else {
dsl_pool_config_exit(dp, FTAG);
}
diff --git a/usr/src/uts/common/fs/zfs/sa.c b/usr/src/uts/common/fs/zfs/sa.c
index 996f6e1443..6a87231c7b 100644
--- a/usr/src/uts/common/fs/zfs/sa.c
+++ b/usr/src/uts/common/fs/zfs/sa.c
@@ -111,6 +111,7 @@
* location.
*
* Byteswap implications:
+ *
* Since the SA attributes are not entirely self describing we can't do
* the normal byteswap processing. The special ZAP layout attribute and
* attribute registration attributes define the byteswap function and the
@@ -189,7 +190,6 @@ sa_attr_reg_t sa_legacy_attrs[] = {
};
/*
- * ZPL legacy layout
* This is only used for objects of type DMU_OT_ZNODE
*/
sa_attr_type_t sa_legacy_zpl_layout[] = {
@@ -199,7 +199,6 @@ sa_attr_type_t sa_legacy_zpl_layout[] = {
/*
* Special dummy layout used for buffers with no attributes.
*/
-
sa_attr_type_t sa_dummy_zpl_layout[] = { 0 };
static int sa_legacy_attr_count = 16;
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 0fc3e66904..7334d39516 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -26,6 +26,8 @@
*/
/*
+ * SPA: Storage Pool Allocator
+ *
* This file contains all the routines used when modifying on-disk SPA state.
* This includes opening, importing, destroying, exporting a pool, and syncing a
* pool.
@@ -77,6 +79,12 @@
#include "zfs_prop.h"
#include "zfs_comutil.h"
+/*
+ * The interval, in seconds, at which failed configuration cache file writes
+ * should be retried.
+ */
+static int zfs_ccw_retry_interval = 300;
+
typedef enum zti_modes {
ZTI_MODE_FIXED, /* value is # of threads (min 1) */
ZTI_MODE_ONLINE_PERCENT, /* value is % of online CPUs */
@@ -4514,6 +4522,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
/*
* Detach a device from a mirror or replacing vdev.
+ *
* If 'replace_done' is specified, only detach if the parent
* is a replacing vdev.
*/
@@ -5168,11 +5177,9 @@ spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
* the spa_vdev_config_[enter/exit] functions which allow us to
* grab and release the spa_config_lock while still holding the namespace
* lock. During each step the configuration is synced out.
- */
-
-/*
- * Remove a device from the pool. Currently, this supports removing only hot
- * spares, slogs, and level 2 ARC devices.
+ *
+ * Currently, this supports removing only hot spares, slogs, and level 2 ARC
+ * devices.
*/
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
@@ -5282,7 +5289,7 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
/*
* Find any device that's done replacing, or a vdev marked 'unspare' that's
- * current spared, so we can detach it.
+ * currently spared, so we can detach it.
*/
static vdev_t *
spa_vdev_resilver_done_hunt(vdev_t *vd)
@@ -5661,13 +5668,34 @@ spa_async_resume(spa_t *spa)
mutex_exit(&spa->spa_async_lock);
}
+static boolean_t
+spa_async_tasks_pending(spa_t *spa)
+{
+ uint_t non_config_tasks;
+ uint_t config_task;
+ boolean_t config_task_suspended;
+
+ non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
+ config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
+ if (spa->spa_ccw_fail_time == 0) {
+ config_task_suspended = B_FALSE;
+ } else {
+ config_task_suspended =
+ (gethrtime() - spa->spa_ccw_fail_time) <
+ (zfs_ccw_retry_interval * NANOSEC);
+ }
+
+ return (non_config_tasks || (config_task && !config_task_suspended));
+}
+
static void
spa_async_dispatch(spa_t *spa)
{
mutex_enter(&spa->spa_async_lock);
- if (spa->spa_async_tasks && !spa->spa_async_suspended &&
+ if (spa_async_tasks_pending(spa) &&
+ !spa->spa_async_suspended &&
spa->spa_async_thread == NULL &&
- rootdir != NULL && !vn_is_readonly(rootdir))
+ rootdir != NULL)
spa->spa_async_thread = thread_create(NULL, 0,
spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
mutex_exit(&spa->spa_async_lock);
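
spa_async_tasks_pending() is what turns the new spa_ccw_fail_time stamp into a rate limit: a failed configuration-cache write requeues SPA_ASYNC_CONFIG_UPDATE, but the dispatcher ignores that one task until zfs_ccw_retry_interval seconds have elapsed, while other async work still wakes the thread. A self-contained sketch of the same gating logic, with illustrative names and a caller-supplied clock instead of gethrtime():

    #include <stdint.h>
    #include <stdio.h>

    #define NANOSEC                 1000000000LL
    #define RETRY_INTERVAL_SEC      300     /* mirrors zfs_ccw_retry_interval */

    #define TASK_CONFIG_UPDATE      0x1
    #define TASK_OTHER              0x2

    typedef struct {
            uint64_t        tasks;          /* pending async task bits */
            int64_t         ccw_fail_time;  /* 0 == last write succeeded */
    } pool_state_t;

    /* Decide whether the async thread has anything worth waking up for. */
    static int
    tasks_pending(const pool_state_t *ps, int64_t now)
    {
            uint64_t non_config = ps->tasks & ~TASK_CONFIG_UPDATE;
            uint64_t config = ps->tasks & TASK_CONFIG_UPDATE;
            int suspended;

            if (ps->ccw_fail_time == 0)
                    suspended = 0;
            else
                    suspended = (now - ps->ccw_fail_time) <
                        (RETRY_INTERVAL_SEC * NANOSEC);

            return (non_config != 0 || (config && !suspended));
    }

    int
    main(void)
    {
            pool_state_t ps = { TASK_CONFIG_UPDATE, 100 * NANOSEC };

            /* 10s after a failure: the config update is still rate-limited. */
            (void) printf("%d\n", tasks_pending(&ps, 110 * NANOSEC));  /* 0 */
            /* 400s after a failure: the retry is allowed again. */
            (void) printf("%d\n", tasks_pending(&ps, 500 * NANOSEC));  /* 1 */
            return (0);
    }
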
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
index b113ce9e0c..d97fc32fbf 100644
--- a/usr/src/uts/common/fs/zfs/spa_config.c
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -26,6 +26,7 @@
*/
#include <sys/spa.h>
+#include <sys/fm/fs/zfs.h>
#include <sys/spa_impl.h>
#include <sys/nvpair.h>
#include <sys/uio.h>
@@ -140,7 +141,7 @@ out:
kobj_close_file(file);
}
-static void
+static int
spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
{
size_t buflen;
@@ -148,13 +149,14 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
vnode_t *vp;
int oflags = FWRITE | FTRUNC | FCREAT | FOFFMAX;
char *temp;
+ int err;
/*
* If the nvlist is empty (NULL), then remove the old cachefile.
*/
if (nvl == NULL) {
- (void) vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE);
- return;
+ err = vn_remove(dp->scd_path, UIO_SYSSPACE, RMFILE);
+ return (err);
}
/*
@@ -175,12 +177,14 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
*/
(void) snprintf(temp, MAXPATHLEN, "%s.tmp", dp->scd_path);
- if (vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0) == 0) {
- if (vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
- 0, RLIM64_INFINITY, kcred, NULL) == 0 &&
- VOP_FSYNC(vp, FSYNC, kcred, NULL) == 0) {
- (void) vn_rename(temp, dp->scd_path, UIO_SYSSPACE);
- }
+ err = vn_open(temp, UIO_SYSSPACE, oflags, 0644, &vp, CRCREAT, 0);
+ if (err == 0) {
+ err = vn_rdwr(UIO_WRITE, vp, buf, buflen, 0, UIO_SYSSPACE,
+ 0, RLIM64_INFINITY, kcred, NULL);
+ if (err == 0)
+ err = VOP_FSYNC(vp, FSYNC, kcred, NULL);
+ if (err == 0)
+ err = vn_rename(temp, dp->scd_path, UIO_SYSSPACE);
(void) VOP_CLOSE(vp, oflags, 1, 0, kcred, NULL);
VN_RELE(vp);
}
@@ -189,6 +193,7 @@ spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
kmem_free(buf, buflen);
kmem_free(temp, MAXPATHLEN);
+ return (err);
}
/*
@@ -200,6 +205,8 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
{
spa_config_dirent_t *dp, *tdp;
nvlist_t *nvl;
+ boolean_t ccw_failure;
+ int error;
ASSERT(MUTEX_HELD(&spa_namespace_lock));
@@ -211,6 +218,7 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
* cachefile is changed, the new one is pushed onto this list, allowing
* us to update previous cachefiles that no longer contain this pool.
*/
+ ccw_failure = B_FALSE;
for (dp = list_head(&target->spa_config_list); dp != NULL;
dp = list_next(&target->spa_config_list, dp)) {
spa_t *spa = NULL;
@@ -251,10 +259,32 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent)
mutex_exit(&spa->spa_props_lock);
}
- spa_config_write(dp, nvl);
+ error = spa_config_write(dp, nvl);
+ if (error != 0)
+ ccw_failure = B_TRUE;
nvlist_free(nvl);
}
+ if (ccw_failure) {
+ /*
+ * Keep trying so that configuration data is
+ * written if/when any temporary filesystem
+ * resource issues are resolved.
+ */
+ if (target->spa_ccw_fail_time == 0) {
+ zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
+ target, NULL, NULL, 0, 0);
+ }
+ target->spa_ccw_fail_time = gethrtime();
+ spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
+ } else {
+ /*
+ * Do not rate limit future attempts to update
+ * the config cache.
+ */
+ target->spa_ccw_fail_time = 0;
+ }
+
/*
* Remove any config entries older than the current one.
*/
@@ -317,6 +347,7 @@ spa_config_set(spa_t *spa, nvlist_t *config)
/*
* Generate the pool's configuration based on the current in-core state.
+ *
* We infer whether to generate a complete config or just one top-level config
* based on whether vd is the root vdev.
*/
diff --git a/usr/src/uts/common/fs/zfs/spa_errlog.c b/usr/src/uts/common/fs/zfs/spa_errlog.c
index 9152846d6e..0dd6c7a489 100644
--- a/usr/src/uts/common/fs/zfs/spa_errlog.c
+++ b/usr/src/uts/common/fs/zfs/spa_errlog.c
@@ -183,8 +183,10 @@ process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
if (copyout(&zb, (char *)addr +
(*count - 1) * sizeof (zbookmark_t),
- sizeof (zbookmark_t)) != 0)
+ sizeof (zbookmark_t)) != 0) {
+ zap_cursor_fini(&zc);
return (SET_ERROR(EFAULT));
+ }
*count -= 1;
}
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index e57d8ab143..2b8a071cb0 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -1334,7 +1334,7 @@ zfs_panic_recover(const char *fmt, ...)
/*
* This is a stripped-down version of strtoull, suitable only for converting
- * lowercase hexidecimal numbers that don't overflow.
+ * lowercase hexadecimal numbers that don't overflow.
*/
uint64_t
strtonum(const char *str, char **nptr)
diff --git a/usr/src/uts/common/fs/zfs/sys/ddt.h b/usr/src/uts/common/fs/zfs/sys/ddt.h
index 9724d6eceb..771610677e 100644
--- a/usr/src/uts/common/fs/zfs/sys/ddt.h
+++ b/usr/src/uts/common/fs/zfs/sys/ddt.h
@@ -63,16 +63,15 @@ enum ddt_class {
*/
typedef struct ddt_key {
zio_cksum_t ddk_cksum; /* 256-bit block checksum */
- uint64_t ddk_prop; /* LSIZE, PSIZE, compression */
+ /*
+ * Encoded with logical & physical size, and compression, as follows:
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ * | 0 | 0 | 0 | comp | PSIZE | LSIZE |
+ * +-------+-------+-------+-------+-------+-------+-------+-------+
+ */
+ uint64_t ddk_prop;
} ddt_key_t;
-/*
- * ddk_prop layout:
- *
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- * | 0 | 0 | 0 | comp | PSIZE | LSIZE |
- * +-------+-------+-------+-------+-------+-------+-------+-------+
- */
#define DDK_GET_LSIZE(ddk) \
BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)
#define DDK_SET_LSIZE(ddk, x) \
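
With the layout comment now attached to ddk_prop itself, the accessors read directly off the diagram: LSIZE occupies the low 16 bits and PSIZE the next 16, both stored as 512-byte sector counts biased by one (the DDK_GET_LSIZE macro above passes a bias of 1 and shifts by SPA_MINBLOCKSHIFT), with the compression function in the byte above them. A standalone decoder following that documented layout; the helper names are mine, the real code uses the BF64_GET_SB macros:

    #include <stdint.h>
    #include <stdio.h>

    #define SPA_MINBLOCKSHIFT       9       /* 512-byte sectors, as in ZFS */

    /* ddk_prop layout per the comment above: |0|0|0|comp|PSIZE|LSIZE| */
    static uint64_t
    ddk_get_lsize(uint64_t prop)
    {
            return (((prop & 0xffff) + 1) << SPA_MINBLOCKSHIFT);
    }

    static uint64_t
    ddk_get_psize(uint64_t prop)
    {
            return ((((prop >> 16) & 0xffff) + 1) << SPA_MINBLOCKSHIFT);
    }

    static uint64_t
    ddk_get_compress(uint64_t prop)
    {
            return ((prop >> 32) & 0xff);
    }

    int
    main(void)
    {
            /* comp=2, PSIZE=16 sectors (8K), LSIZE=256 sectors (128K) */
            uint64_t prop = ((uint64_t)2 << 32) | ((16 - 1) << 16) | (256 - 1);

            (void) printf("lsize=%llu psize=%llu comp=%llu\n",
                (unsigned long long)ddk_get_lsize(prop),
                (unsigned long long)ddk_get_psize(prop),
                (unsigned long long)ddk_get_compress(prop));
            return (0);
    }
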
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 1366a998fd..6e07a156dc 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -409,6 +409,8 @@ void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
* object must be held in an assigned transaction before calling
* dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
* buffer as well. You must release your hold with dmu_buf_rele().
+ *
+ * Returns ENOENT, EIO, or 0.
*/
int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
int dmu_bonus_max(void);
@@ -664,8 +666,14 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
* If doi is NULL, just indicates whether the object exists.
*/
int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dnode in hand. */
void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+/*
+ * Like dmu_object_info_from_db, but faster still when you only care about
+ * the size. This is specifically optimized for zfs_getattr().
+ */
void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
u_longlong_t *nblk512);
diff --git a/usr/src/uts/common/fs/zfs/sys/dnode.h b/usr/src/uts/common/fs/zfs/sys/dnode.h
index 9f9134d8cd..c3de03d369 100644
--- a/usr/src/uts/common/fs/zfs/sys/dnode.h
+++ b/usr/src/uts/common/fs/zfs/sys/dnode.h
@@ -145,9 +145,8 @@ typedef struct dnode_phys {
typedef struct dnode {
/*
- * dn_struct_rwlock protects the structure of the dnode,
- * including the number of levels of indirection (dn_nlevels),
- * dn_maxblkid, and dn_next_*
+ * Protects the structure of the dnode, including the number of levels
+ * of indirection (dn_nlevels), dn_maxblkid, and dn_next_*
*/
krwlock_t dn_struct_rwlock;
diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
index b0160edfb1..d3b411ba57 100644
--- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
+++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h
@@ -110,6 +110,7 @@ typedef struct dsl_pool {
/*
* Protects administrative changes (properties, namespace)
+ *
* It is only held for write in syncing context. Therefore
* syncing context does not need to ever have it for read, since
* nobody else could possibly have it for write.
diff --git a/usr/src/uts/common/fs/zfs/sys/sa_impl.h b/usr/src/uts/common/fs/zfs/sys/sa_impl.h
index 8ae05ce364..582bd76f01 100644
--- a/usr/src/uts/common/fs/zfs/sys/sa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/sa_impl.h
@@ -150,6 +150,7 @@ struct sa_os {
/*
* header for all bonus and spill buffers.
+ *
* The header has a fixed portion with a variable number
* of "lengths" depending on the number of variable sized
* attributes which are determined by the "layout number"
@@ -158,29 +159,27 @@ struct sa_os {
#define SA_MAGIC 0x2F505A /* ZFS SA */
typedef struct sa_hdr_phys {
uint32_t sa_magic;
- uint16_t sa_layout_info; /* Encoded with hdrsize and layout number */
+ /*
+ * Encoded with hdrsize and layout number as follows:
+ * 16 10 0
+ * +--------+-------+
+ * | hdrsz |layout |
+ * +--------+-------+
+ *
+ * Bits 0-10 are the layout number
+ * Bits 11-16 are the size of the header.
+ * The hdrsize is the number * 8
+ *
+ * For example.
+ * hdrsz of 1 ==> 8 byte header
+ * 2 ==> 16 byte header
+ *
+ */
+ uint16_t sa_layout_info;
uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */
/* ... Data follows the lengths. */
} sa_hdr_phys_t;
-/*
- * sa_hdr_phys -> sa_layout_info
- *
- * 16 10 0
- * +--------+-------+
- * | hdrsz |layout |
- * +--------+-------+
- *
- * Bits 0-10 are the layout number
- * Bits 11-16 are the size of the header.
- * The hdrsize is the number * 8
- *
- * For example.
- * hdrsz of 1 ==> 8 byte header
- * 2 ==> 16 byte header
- *
- */
-
#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10)
#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0)
#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \
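
Folding the sa_layout_info description into the structure makes the packing easy to check in isolation. A minimal userland decoder that mirrors the SA_HDR_LAYOUT_NUM and SA_HDR_SIZE macros shown above (low 10 bits = layout number, next 6 bits = header size in 8-byte units); the function is illustrative, not part of the SA code:

    #include <stdint.h>
    #include <stdio.h>

    /*
     * Decode sa_layout_info the way SA_HDR_LAYOUT_NUM / SA_HDR_SIZE do:
     * the layout number is in the low 10 bits, and the next 6 bits hold
     * the header size in units of 8 bytes.
     */
    static void
    sa_hdr_decode(uint16_t layout_info, unsigned *layout, unsigned *hdrsize)
    {
            *layout = layout_info & 0x3ff;
            *hdrsize = ((layout_info >> 10) & 0x3f) << 3;
    }

    int
    main(void)
    {
            unsigned layout, hdrsize;

            /* hdrsz field of 2 (16-byte header), layout number 3 */
            sa_hdr_decode((uint16_t)((2 << 10) | 3), &layout, &hdrsize);
            (void) printf("layout=%u hdrsize=%u bytes\n", layout, hdrsize);
            return (0);
    }
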
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index 983103e386..66ea159475 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -238,8 +238,9 @@ struct spa {
uint64_t spa_deadman_synctime; /* deadman expiration timer */
kmutex_t spa_iokstat_lock; /* protects spa_iokstat_* */
struct kstat *spa_iokstat; /* kstat of io to this pool */
+ hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */
/*
- * spa_refcnt & spa_config_lock must be the last elements
+ * spa_refcount & spa_config_lock must be the last elements
* because refcount_t changes size based on compilation options.
* In order for the MDB module to function correctly, the other
* fields must remain in the same location.
diff --git a/usr/src/uts/common/fs/zfs/sys/space_map.h b/usr/src/uts/common/fs/zfs/sys/space_map.h
index 64223daf62..c0070da670 100644
--- a/usr/src/uts/common/fs/zfs/sys/space_map.h
+++ b/usr/src/uts/common/fs/zfs/sys/space_map.h
@@ -94,7 +94,6 @@ struct space_map_ops {
* 63 62 60 59 50 49 0
*
*
- *
* non-debug entry
*
* 1 47 1 15
diff --git a/usr/src/uts/common/fs/zfs/sys/unique.h b/usr/src/uts/common/fs/zfs/sys/unique.h
index 2ef3093edf..d4ba32e5c6 100644
--- a/usr/src/uts/common/fs/zfs/sys/unique.h
+++ b/usr/src/uts/common/fs/zfs/sys/unique.h
@@ -26,8 +26,6 @@
#ifndef _SYS_UNIQUE_H
#define _SYS_UNIQUE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#ifdef __cplusplus
@@ -42,7 +40,7 @@ void unique_fini(void);
/*
* Return a new unique value (which will not be uniquified against until
- * it is unique_insert()-ed.
+ * it is unique_insert()-ed).
*/
uint64_t unique_create(void);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index c599c549ac..02e3e838c3 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -246,12 +246,13 @@ typedef struct vdev_label {
#define VDD_METASLAB 0x01
#define VDD_DTL 0x02
+/* Offset of embedded boot loader region on each label */
+#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
/*
- * Size and offset of embedded boot loader region on each label.
+ * Size of embedded boot loader region on each label.
* The total size of the first two labels plus the boot area is 4MB.
*/
-#define VDEV_BOOT_OFFSET (2 * sizeof (vdev_label_t))
-#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
+#define VDEV_BOOT_SIZE (7ULL << 19) /* 3.5M */
/*
* Size of label regions at the start and end of each leaf device.
@@ -318,8 +319,9 @@ extern uint64_t vdev_get_min_asize(vdev_t *vd);
extern void vdev_set_min_asize(vdev_t *vd);
/*
- * zdb uses this tunable, so it must be declared here to make lint happy.
+ * Global variables
*/
+/* zdb uses this tunable, so it must be declared here to make lint happy. */
extern int zfs_vdev_cache_size;
/*
diff --git a/usr/src/uts/common/fs/zfs/sys/zap.h b/usr/src/uts/common/fs/zfs/sys/zap.h
index 1e975e99e0..20a66edf85 100644
--- a/usr/src/uts/common/fs/zfs/sys/zap.h
+++ b/usr/src/uts/common/fs/zfs/sys/zap.h
@@ -86,18 +86,22 @@ extern "C" {
#endif
/*
- * The matchtype specifies which entry will be accessed.
- * MT_EXACT: only find an exact match (non-normalized)
- * MT_FIRST: find the "first" normalized (case and Unicode
- * form) match; the designated "first" match will not change as long
- * as the set of entries with this normalization doesn't change
- * MT_BEST: if there is an exact match, find that, otherwise find the
- * first normalized match
+ * Specifies matching criteria for ZAP lookups.
*/
typedef enum matchtype
{
+ /* Only find an exact match (non-normalized) */
MT_EXACT,
+ /*
+ * If there is an exact match, find that, otherwise find the
+ * first normalized match.
+ */
MT_BEST,
+ /*
+ * Find the "first" normalized (case and Unicode form) match;
+ * the designated "first" match will not change as long as the
+ * set of entries with this normalization doesn't change.
+ */
MT_FIRST
} matchtype_t;
@@ -174,16 +178,21 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
* call will fail and return EINVAL.
*
* If 'integer_size' is equal to or larger than the attribute's integer
- * size, the call will succeed and return 0. * When converting to a
- * larger integer size, the integers will be treated as unsigned (ie. no
- * sign-extension will be performed).
+ * size, the call will succeed and return 0.
+ *
+ * When converting to a larger integer size, the integers will be treated as
+ * unsigned (ie. no sign-extension will be performed).
*
* 'num_integers' is the length (in integers) of 'buf'.
*
* If the attribute is longer than the buffer, as many integers as will
* fit will be transferred to 'buf'. If the entire attribute was not
* transferred, the call will return EOVERFLOW.
- *
+ */
+int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
+ uint64_t integer_size, uint64_t num_integers, void *buf);
+
+/*
* If rn_len is nonzero, realname will be set to the name of the found
* entry (which may be different from the requested name if matchtype is
* not MT_EXACT).
@@ -191,8 +200,6 @@ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx);
* If normalization_conflictp is not NULL, it will be set if there is
* another name with the same case/unicode normalized form.
*/
-int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name,
- uint64_t integer_size, uint64_t num_integers, void *buf);
int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name,
uint64_t integer_size, uint64_t num_integers, void *buf,
matchtype_t mt, char *realname, int rn_len,
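
The re-split zap_lookup() comment calls out one subtle rule worth pinning down: when the caller asks for a larger integer_size than the attribute was stored with, the stored integers are widened as unsigned values, with no sign extension. A standalone illustration of that widening rule (the helper is mine, not ZAP code):

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    /*
     * Widen 16-bit stored integers into a 64-bit caller buffer the way the
     * zap_lookup() comment describes: as unsigned values, no sign extension.
     */
    static void
    widen_u16_to_u64(const uint16_t *src, uint64_t *dst, size_t n)
    {
            for (size_t i = 0; i < n; i++)
                    dst[i] = (uint64_t)src[i];      /* 0xffff -> 0xffff, not ~0 */
    }

    int
    main(void)
    {
            uint16_t stored[2] = { 0xffff, 0x0001 };
            uint64_t buf[2];

            widen_u16_to_u64(stored, buf, 2);
            (void) printf("%llx %llx\n",
                (unsigned long long)buf[0], (unsigned long long)buf[1]);
            return (0);
    }
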
diff --git a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
index 3a33636741..f6947a72d7 100644
--- a/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
+++ b/usr/src/uts/common/fs/zfs/sys/zap_leaf.h
@@ -101,6 +101,7 @@ typedef enum zap_chunk_type {
*/
typedef struct zap_leaf_phys {
struct zap_leaf_header {
+ /* Public to ZAP */
uint64_t lh_block_type; /* ZBT_LEAF */
uint64_t lh_pad1;
uint64_t lh_prefix; /* hash prefix of this leaf */
@@ -109,8 +110,7 @@ typedef struct zap_leaf_phys {
uint16_t lh_nentries; /* number of entries */
uint16_t lh_prefix_len; /* num bits used to id this */
-/* above is accessable to zap, below is zap_leaf private */
-
+ /* Private to zap_leaf */
uint16_t lh_freelist; /* chunk head of free list */
uint8_t lh_flags; /* ZLF_* flags */
uint8_t lh_pad2[11];
@@ -161,13 +161,13 @@ typedef struct zap_leaf {
typedef struct zap_entry_handle {
- /* below is set by zap_leaf.c and is public to zap.c */
+ /* Set by zap_leaf and public to ZAP */
uint64_t zeh_num_integers;
uint64_t zeh_hash;
uint32_t zeh_cd;
uint8_t zeh_integer_size;
- /* below is private to zap_leaf.c */
+ /* Private to zap_leaf */
uint16_t zeh_fakechunk;
uint16_t *zeh_chunkp;
zap_leaf_t *zeh_leaf;
@@ -202,7 +202,7 @@ extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh,
/*
* Replace the value of an existing entry.
*
- * zap_entry_update may fail if it runs out of space (ENOSPC).
+ * May fail if it runs out of space (ENOSPC).
*/
extern int zap_entry_update(zap_entry_handle_t *zeh,
uint8_t integer_size, uint64_t num_integers, const void *buf);
@@ -221,10 +221,7 @@ extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd,
uint8_t integer_size, uint64_t num_integers, const void *buf,
zap_entry_handle_t *zeh);
-/*
- * Return true if there are additional entries with the same normalized
- * form.
- */
+/* Determine whether there is another entry with the same normalized form. */
extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh,
struct zap_name *zn, const char *name, struct zap *zap);
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
index d1a64180d5..4eefdc563f 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_acl.h
@@ -46,7 +46,8 @@ struct znode_phys;
#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID
/*
- * ZFS ACLs are store in various forms.
+ * ZFS ACLs (Access Control Lists) are stored in various forms.
+ *
* Files created with ACL version ZFS_ACL_VERSION_INITIAL
* will all be created with fixed length ACEs of type
* zfs_oldace_t.
@@ -136,8 +137,8 @@ typedef struct acl_ops {
size_t (*ace_size)(void *acep); /* how big is this ace */
size_t (*ace_abstract_size)(void); /* sizeof abstract entry */
int (*ace_mask_off)(void); /* off of access mask in ace */
+ /* ptr to data if any */
int (*ace_data)(void *acep, void **datap);
- /* ptr to data if any */
} acl_ops_t;
/*
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
index 874d422568..9422177023 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_ioctl.h
@@ -344,7 +344,7 @@ extern int zfs_secpolicy_rename_perms(const char *from,
const char *to, cred_t *cr);
extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
extern int zfs_busy(void);
-extern void zfs_unmount_snap(const char *);
+extern int zfs_unmount_snap(const char *);
extern void zfs_destroy_unmount_origin(const char *);
/*
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h b/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h
index f302b663e2..93733ba8a2 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_rlock.h
@@ -26,8 +26,6 @@
#ifndef _SYS_FS_ZFS_RLOCK_H
#define _SYS_FS_ZFS_RLOCK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#ifdef __cplusplus
extern "C" {
#endif
@@ -57,16 +55,14 @@ typedef struct rl {
} rl_t;
/*
- * Lock a range (offset, length) as either shared (READER)
- * or exclusive (WRITER or APPEND). APPEND is a special type that
- * is converted to WRITER that specified to lock from the start of the
- * end of file. zfs_range_lock() returns the range lock structure.
+ * Lock a range (offset, length) as either shared (RL_READER)
+ * or exclusive (RL_WRITER or RL_APPEND). RL_APPEND is a special type that
+ * is converted to RL_WRITER that specifies a lock starting at the current
+ * end of file. Returns the range lock structure.
*/
rl_t *zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type);
-/*
- * Unlock range and destroy range lock structure.
- */
+/* Unlock range and destroy range lock structure. */
void zfs_range_unlock(rl_t *rl);
/*
@@ -76,7 +72,8 @@ void zfs_range_unlock(rl_t *rl);
void zfs_range_reduce(rl_t *rl, uint64_t off, uint64_t len);
/*
- * AVL comparison function used to compare range locks
+ * AVL comparison function used to order range locks
+ * Locks are ordered on the start offset of the range.
*/
int zfs_range_compare(const void *arg1, const void *arg2);
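
The reworded comment on zfs_range_compare() states the ordering contract: range locks in the AVL tree are ordered by the start offset of the range. A standalone comparator written to that contract, returning the negative/zero/positive result AVL comparison functions are expected to produce (the struct below is a stand-in for rl_t, not the real definition):

    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
            uint64_t        r_off;  /* start offset of the locked range */
            uint64_t        r_len;
    } range_lock_t;

    /* Order range locks by start offset, as the comment above describes. */
    static int
    range_compare(const void *arg1, const void *arg2)
    {
            const range_lock_t *rl1 = arg1;
            const range_lock_t *rl2 = arg2;

            if (rl1->r_off < rl2->r_off)
                    return (-1);
            if (rl1->r_off > rl2->r_off)
                    return (1);
            return (0);
    }

    int
    main(void)
    {
            range_lock_t a = { 0, 4096 }, b = { 8192, 4096 };

            (void) printf("%d %d %d\n", range_compare(&a, &b),
                range_compare(&b, &a), range_compare(&a, &a));
            return (0);
    }
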
diff --git a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
index cf0bbee2ca..43986afda2 100644
--- a/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
+++ b/usr/src/uts/common/fs/zfs/sys/zfs_znode.h
@@ -138,8 +138,9 @@ extern "C" {
#define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE)
-/* Path component length */
/*
+ * Path component length
+ *
* The generic fs code uses MAXNAMELEN to represent
* what the largest component length is. Unfortunately,
* this length includes the terminating NULL. ZFS needs
@@ -234,11 +235,7 @@ typedef struct znode {
#define ZTOV(ZP) ((ZP)->z_vnode)
#define VTOZ(VP) ((znode_t *)(VP)->v_data)
-/*
- * ZFS_ENTER() is called on entry to each ZFS vnode and vfs operation.
- * ZFS_EXIT() must be called before exitting the vop.
- * ZFS_VERIFY_ZP() verifies the znode is valid.
- */
+/* Called on entry to each ZFS vnode and vfs operation */
#define ZFS_ENTER(zfsvfs) \
{ \
rrw_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
@@ -248,8 +245,10 @@ typedef struct znode {
} \
}
+/* Must be called before exiting the vop */
#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
+/* Verifies the znode is valid */
#define ZFS_VERIFY_ZP(zp) \
if ((zp)->z_sa_hdl == NULL) { \
ZFS_EXIT((zp)->z_zfsvfs); \
@@ -269,15 +268,14 @@ typedef struct znode {
#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \
mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num)))
-/*
- * Macros to encode/decode ZFS stored time values from/to struct timespec
- */
+/* Encode ZFS stored time values from a struct timespec */
#define ZFS_TIME_ENCODE(tp, stmp) \
{ \
(stmp)[0] = (uint64_t)(tp)->tv_sec; \
(stmp)[1] = (uint64_t)(tp)->tv_nsec; \
}
+/* Decode ZFS stored time values to a struct timespec */
#define ZFS_TIME_DECODE(tp, stmp) \
{ \
(tp)->tv_sec = (time_t)(stmp)[0]; \
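
Splitting the encode/decode comments makes the round-trip property of the two macros explicit; here is a userland copy of both, exercising that round trip (this duplicates the macros above rather than pulling in the ZFS headers, and the tv_nsec decode follows the same pattern as tv_sec):

    #include <stdio.h>
    #include <stdint.h>
    #include <time.h>
    #include <assert.h>

    /* Userland copies of the macros shown above. */
    #define ZFS_TIME_ENCODE(tp, stmp)                       \
    {                                                       \
            (stmp)[0] = (uint64_t)(tp)->tv_sec;             \
            (stmp)[1] = (uint64_t)(tp)->tv_nsec;            \
    }

    #define ZFS_TIME_DECODE(tp, stmp)                       \
    {                                                       \
            (tp)->tv_sec = (time_t)(stmp)[0];               \
            (tp)->tv_nsec = (long)(stmp)[1];                \
    }

    int
    main(void)
    {
            struct timespec in = { 1370992063, 123456789 }, out;
            uint64_t stmp[2];

            ZFS_TIME_ENCODE(&in, stmp);
            ZFS_TIME_DECODE(&out, stmp);
            assert(out.tv_sec == in.tv_sec && out.tv_nsec == in.tv_nsec);
            (void) printf("encoded: %llu %llu\n",
                (unsigned long long)stmp[0], (unsigned long long)stmp[1]);
            return (0);
    }
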
diff --git a/usr/src/uts/common/fs/zfs/sys/zil.h b/usr/src/uts/common/fs/zfs/sys/zil.h
index a212e4f0e1..15ef2aa8bf 100644
--- a/usr/src/uts/common/fs/zfs/sys/zil.h
+++ b/usr/src/uts/common/fs/zfs/sys/zil.h
@@ -242,6 +242,12 @@ typedef struct {
* information needed for replaying the create. If the
* file doesn't have any actual ACEs then the lr_aclcnt
* would be zero.
+ *
+ * After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's.
+ * If create is also setting xvattr's, then acl data follows xvattr.
+ * If ACE FUIDs are needed then they will follow the xvattr_t. Following
+ * the FUIDs will be the domain table information. The FUIDs for the owner
+ * and group will be in lr_create. Name follows ACL data.
*/
typedef struct {
lr_create_t lr_create; /* common create portion */
@@ -250,13 +256,6 @@ typedef struct {
uint64_t lr_fuidcnt; /* number of real fuids */
uint64_t lr_acl_bytes; /* number of bytes in ACL */
uint64_t lr_acl_flags; /* ACL flags */
- /* lr_acl_bytes number of variable sized ace's follows */
- /* if create is also setting xvattr's, then acl data follows xvattr */
- /* if ACE FUIDs are needed then they will follow the xvattr_t */
- /* Following the FUIDs will be the domain table information. */
- /* The FUIDs for the owner and group will be in the lr_create */
- /* portion of the record. */
- /* name follows ACL data */
} lr_acl_create_t;
typedef struct {
diff --git a/usr/src/uts/common/fs/zfs/sys/zio_compress.h b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
index 34a82a8b81..f4cb84511a 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio_compress.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio_compress.h
@@ -36,11 +36,10 @@
extern "C" {
#endif
-/*
- * Common signature for all zio compress/decompress functions.
- */
+/* Common signature for all zio compress functions. */
typedef size_t zio_compress_func_t(void *src, void *dst,
size_t s_len, size_t d_len, int);
+/* Common signature for all zio decompress functions. */
typedef int zio_decompress_func_t(void *src, void *dst,
size_t s_len, size_t d_len, int);
diff --git a/usr/src/uts/common/fs/zfs/txg.c b/usr/src/uts/common/fs/zfs/txg.c
index 62a0c605d7..8cdb284832 100644
--- a/usr/src/uts/common/fs/zfs/txg.c
+++ b/usr/src/uts/common/fs/zfs/txg.c
@@ -344,6 +344,12 @@ txg_rele_to_sync(txg_handle_t *th)
th->th_cpu = NULL; /* defensive */
}
+/*
+ * Blocks until all transactions in the group are committed.
+ *
+ * On return, the transaction group has reached a stable state in which it can
+ * then be passed off to the syncing context.
+ */
static void
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
@@ -394,6 +400,9 @@ txg_do_callbacks(list_t *cb_list)
/*
* Dispatch the commit callbacks registered on this txg to worker threads.
+ *
+ * If no callbacks are registered for a given TXG, nothing happens.
+ * This function creates a taskq for the associated pool, if needed.
*/
static void
txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
@@ -404,7 +413,10 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
for (c = 0; c < max_ncpus; c++) {
tx_cpu_t *tc = &tx->tx_cpu[c];
- /* No need to lock tx_cpu_t at this point */
+ /*
+ * No need to lock tx_cpu_t at this point, since this can
+ * only be called once a txg has been synced.
+ */
int g = txg & TXG_MASK;
@@ -424,7 +436,7 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
list_create(cb_list, sizeof (dmu_tx_callback_t),
offsetof(dmu_tx_callback_t, dcb_node));
- list_move_tail(&tc->tc_callbacks[g], cb_list);
+ list_move_tail(cb_list, &tc->tc_callbacks[g]);
(void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *)
txg_do_callbacks, cb_list, TQ_SLEEP);
@@ -558,8 +570,8 @@ txg_quiesce_thread(dsl_pool_t *dp)
/*
* Delay this thread by delay nanoseconds if we are still in the open
- * transaction group and there is already a waiting txg quiesing or quiesced.
- * Abort the delay if this txg stalls or enters the quiesing state.
+ * transaction group and there is already a waiting txg quiescing or quiesced.
+ * Abort the delay if this txg stalls or enters the quiescing state.
*/
void
txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
@@ -567,7 +579,7 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution)
tx_state_t *tx = &dp->dp_tx;
hrtime_t start = gethrtime();
- /* don't delay if this txg could transition to quiesing immediately */
+ /* don't delay if this txg could transition to quiescing immediately */
if (tx->tx_open_txg > txg ||
tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
return;
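
Apart from the comment fixes, the functional change in txg_dispatch_callbacks() above is the list_move_tail() argument order: the registered callbacks must be drained from tc_callbacks[g] into the private cb_list handed to the taskq, not the reverse, which left them in place and was one half of illumos 3747. A self-contained sketch of that drain-then-dispatch pattern over a trivial singly linked list (the list type and mover are illustrative, not the illumos list(9F) API):

    #include <stdio.h>

    typedef struct cb {
            struct cb       *next;
            int             id;
    } cb_t;

    /* Move every node from *src onto the tail of *dst, leaving *src empty. */
    static void
    list_move_tail_demo(cb_t **dst, cb_t **src)
    {
            cb_t **tail = dst;

            while (*tail != NULL)
                    tail = &(*tail)->next;
            *tail = *src;
            *src = NULL;
    }

    int
    main(void)
    {
            cb_t c2 = { NULL, 2 }, c1 = { &c2, 1 };
            cb_t *shared = &c1;     /* stands in for tc_callbacks[g] */
            cb_t *local = NULL;     /* stands in for cb_list */

            /* Drain the shared list; the caller would then dispatch 'local'. */
            list_move_tail_demo(&local, &shared);
            for (cb_t *cb = local; cb != NULL; cb = cb->next)
                    (void) printf("callback %d\n", cb->id);
            (void) printf("shared empty: %s\n", shared == NULL ? "yes" : "no");
            return (0);
    }
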
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index cc3594ad1a..7a409bd7ed 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -956,9 +956,11 @@ vdev_probe_done(zio_t *zio)
}
/*
- * Determine whether this device is accessible by reading and writing
- * to several known locations: the pad regions of each vdev label
- * but the first (which we leave alone in case it contains a VTOC).
+ * Determine whether this device is accessible.
+ *
+ * Read and write to several known locations: the pad regions of each
+ * vdev label but the first, which we leave alone in case it contains
+ * a VTOC.
*/
zio_t *
vdev_probe(vdev_t *vd, zio_t *zio)
@@ -2179,10 +2181,12 @@ vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux)
}
/*
- * Online the given vdev. If 'unspare' is set, it implies two things. First,
- * any attached spare device should be detached when the device finishes
- * resilvering. Second, the online should be treated like a 'test' online case,
- * so no FMA events are generated if the device fails to open.
+ * Online the given vdev.
+ *
+ * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached
+ * spare device should be detached when the device finishes resilvering.
+ * Second, the online should be treated like a 'test' online case, so no FMA
+ * events are generated if the device fails to open.
*/
int
vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index 5ee7c7d15b..904918c3a4 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -1028,6 +1028,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
}
+/* Sync the uberblocks to all vdevs in svd[] */
int
vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
{
diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c
index fccbbb1d75..8de4b324a2 100644
--- a/usr/src/uts/common/fs/zfs/vdev_queue.c
+++ b/usr/src/uts/common/fs/zfs/vdev_queue.c
@@ -38,13 +38,14 @@
/*
* These tunables are for performance analysis.
*/
+
+/* The maximum number of I/Os concurrently pending to each device. */
+int zfs_vdev_max_pending = 10;
+
/*
- * zfs_vdev_max_pending is the maximum number of i/os concurrently
- * pending to each device. zfs_vdev_min_pending is the initial number
- * of i/os pending to each device (before it starts ramping up to
- * max_pending).
+ * The initial number of I/Os pending to each device, before it starts ramping
+ * up to zfs_vdev_max_pending.
*/
-int zfs_vdev_max_pending = 10;
int zfs_vdev_min_pending = 4;
/*
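Both knobs are plain module globals, so on illumos they are normally set from /etc/system and picked up at the next boot; a running kernel can be inspected with, for example, echo zfs_vdev_max_pending/D | mdb -k. A minimal /etc/system fragment, shown here with the shipped defaults rather than any recommended values:

* example only -- these are the shipped defaults
set zfs:zfs_vdev_max_pending = 10
set zfs:zfs_vdev_min_pending = 4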
diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c
index d4b3d5b5a8..b22bcd2b2f 100644
--- a/usr/src/uts/common/fs/zfs/vdev_raidz.c
+++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c
@@ -64,6 +64,7 @@
* o addition (+) is represented by a bitwise XOR
* o subtraction (-) is therefore identical to addition: A + B = A - B
* o multiplication of A by 2 is defined by the following bitwise expression:
+ *
* (A * 2)_7 = A_6
* (A * 2)_6 = A_5
* (A * 2)_5 = A_4
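The multiply-by-2 rule is truncated by the diff context here, but it is the usual GF(2^8) construction behind the pow2/log2 tables later in this file: shift left and, whenever bit 7 falls off, fold it back in by XORing with 0x1d (the low byte of the reduction polynomial x^8 + x^4 + x^3 + x^2 + 1). A small userland sketch that regenerates the tables from that rule; the names and the spot-check are illustrative only:

/*
 * Regenerate the powers-of-2 and logs-of-2 tables from the multiply-by-2
 * rule in the comment above (GF(2^8), reduction constant 0x1d).
 * Sketch only; the kernel uses the precomputed tables.
 */
#include <stdio.h>
#include <stdint.h>

static uint8_t
gf_mul2(uint8_t a)
{
	/* (A * 2): shift left, and fold bit 7 back in as 0x1d. */
	return ((uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

int
main(void)
{
	uint8_t pow2[256], log2[256] = { 0 };
	uint8_t x = 1;
	int i;

	for (i = 0; i < 256; i++) {
		pow2[i] = x;
		if (i < 255)		/* x cycles with period 255 */
			log2[x] = (uint8_t)i;
		x = gf_mul2(x);
	}

	/* Spot-check: pow2[8] should be 0x1d, as in vdev_raidz_pow2[]. */
	printf("pow2[8] = 0x%02x, log2[0x1d] = %d\n", pow2[8], log2[0x1d]);
	return (0);
}

The spot-check matches the ninth entry of the vdev_raidz_pow2[] table shown in the next hunk.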
@@ -122,7 +123,7 @@ typedef struct raidz_map {
uint64_t rm_missingparity; /* Count of missing parity devices */
uint64_t rm_firstdatacol; /* First data column/parity count */
uint64_t rm_nskip; /* Skipped sectors for padding */
- uint64_t rm_skipstart; /* Column index of padding start */
+ uint64_t rm_skipstart; /* Column index of padding start */
void *rm_datacopy; /* rm_asize-buffer of copied data */
uintptr_t rm_reports; /* # of referencing checksum reports */
uint8_t rm_freed; /* map no longer has referencing ZIO */
@@ -164,10 +165,7 @@ typedef struct raidz_map {
*/
int vdev_raidz_default_to_general;
-/*
- * These two tables represent powers and logs of 2 in the Galois field defined
- * above. These values were computed by repeatedly multiplying by 2 as above.
- */
+/* Powers of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_pow2[256] = {
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
@@ -202,6 +200,7 @@ static const uint8_t vdev_raidz_pow2[256] = {
0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
+/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
@@ -437,23 +436,50 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
vdev_raidz_cksum_report
};
+/*
+ * Divides the IO evenly across all child vdevs; usually, dcols is
+ * the number of children in the target vdev.
+ */
static raidz_map_t *
vdev_raidz_map_alloc(caddr_t data, uint64_t size, uint64_t offset,
uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
{
raidz_map_t *rm;
+ /* The starting RAIDZ (parent) vdev sector of the block. */
uint64_t b = offset >> unit_shift;
+ /* The zio's size in units of the vdev's minimum sector size. */
uint64_t s = size >> unit_shift;
+ /* The first column for this stripe. */
uint64_t f = b % dcols;
+ /* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << unit_shift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ */
q = s / (dcols - nparity);
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
r = s - q * (dcols - nparity);
+
+ /* The number of "big columns" - those which contain remainder data. */
bc = (r == 0 ? 0 : r + nparity);
+
+ /*
+ * The total number of data and parity sectors associated with
+ * this I/O.
+ */
tot = s + nparity * (q + (r == 0 ? 0 : 1));
+ /* acols: The columns that will be accessed. */
+ /* scols: The columns that will be accessed or skipped. */
if (q == 0) {
+ /* Our I/O request doesn't span all child vdevs. */
acols = bc;
scols = MIN(dcols, roundup(bc, nparity + 1));
} else {
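The per-stripe quantities annotated above can be reproduced in isolation. The sketch below is a hedged userland rendering of just that arithmetic; the else branch falls outside this hunk and is assumed here to use every column (acols = scols = dcols), and the MIN/ROUNDUP macros stand in for the sysmacros.h versions.

/*
 * Userland sketch of the geometry computed at the top of
 * vdev_raidz_map_alloc(), using the quantities described in the comments
 * above.  Illustrative only, not the kernel implementation.
 */
#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))
#define	ROUNDUP(x, y)	((((x) + (y) - 1) / (y)) * (y))

static void
raidz_geometry(uint64_t size, uint64_t offset, uint64_t unit_shift,
    uint64_t dcols, uint64_t nparity)
{
	uint64_t b = offset >> unit_shift;	/* starting parent sector */
	uint64_t s = size >> unit_shift;	/* I/O size in sectors */
	uint64_t f = b % dcols;			/* first column of stripe */
	uint64_t o = (b / dcols) << unit_shift;	/* child byte offset */
	uint64_t q, r, bc, tot, acols, scols;

	q = s / (dcols - nparity);		/* sectors on regular columns */
	r = s - q * (dcols - nparity);		/* leftover sectors */
	bc = (r == 0 ? 0 : r + nparity);	/* "big" columns */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, ROUNDUP(bc, nparity + 1));
	} else {
		acols = scols = dcols;		/* assumed: full-width stripe */
	}

	printf("f=%llu o=%llu q=%llu r=%llu bc=%llu tot=%llu "
	    "acols=%llu scols=%llu\n",
	    (unsigned long long)f, (unsigned long long)o,
	    (unsigned long long)q, (unsigned long long)r,
	    (unsigned long long)bc, (unsigned long long)tot,
	    (unsigned long long)acols, (unsigned long long)scols);
}

int
main(void)
{
	/* e.g. a 128K block at offset 1M on a 6-wide raidz2 of 512B sectors */
	raidz_geometry(128 << 10, 1 << 20, 9, 6, 2);
	return (0);
}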
@@ -1668,6 +1694,23 @@ vdev_raidz_child_done(zio_t *zio)
rc->rc_skipped = 0;
}
+/*
+ * Start an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ * 1. Generate the parity data
+ * 2. Create child zio write operations to each column's vdev, for both
+ * data and parity.
+ * 3. If the column skips any sectors for padding, create optional dummy
+ * write zio children for those areas to improve aggregation continuity.
+ * - For read operations:
+ * 1. Create child zio read operations to each data column's vdev to read
+ * the range of data required for zio.
+ * 2. If this is a scrub or resilver operation, or if any of the data
+ * vdevs have had errors, then create zio read operations to the parity
+ * columns' VDevs as well.
+ */
static int
vdev_raidz_io_start(zio_t *zio)
{
@@ -2019,6 +2062,27 @@ done:
return (ret);
}
+/*
+ * Complete an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ * 1. Check for errors on the child IOs.
+ * 2. Return, setting an error code if too few child VDevs were written
+ * to reconstruct the data later. Note that partial writes are
+ * considered successful if they can be reconstructed at all.
+ * - For read operations:
+ * 1. Check for errors on the child IOs.
+ * 2. If data errors occurred:
+ * a. Try to reassemble the data from the parity available.
+ * b. If we haven't yet read the parity drives, read them now.
+ * c. If all parity drives have been read but the data still doesn't
+ * reassemble with a correct checksum, then try combinatorial
+ * reconstruction.
+ * d. If that doesn't work, return an error.
+ * 3. If there were unexpected errors or this is a resilver operation,
+ * rewrite the vdevs that had errors.
+ */
static void
vdev_raidz_io_done(zio_t *zio)
{
diff --git a/usr/src/uts/common/fs/zfs/zap.c b/usr/src/uts/common/fs/zfs/zap.c
index 2f4ccfb6ea..f69c1b0312 100644
--- a/usr/src/uts/common/fs/zfs/zap.c
+++ b/usr/src/uts/common/fs/zfs/zap.c
@@ -295,7 +295,8 @@ zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp)
err = dmu_buf_hold(zap->zap_objset, zap->zap_object,
(tbl->zt_nextblk + blk) << bs, FTAG, &db,
DMU_READ_NO_PREFETCH);
- dmu_buf_rele(db, FTAG);
+ if (err == 0)
+ dmu_buf_rele(db, FTAG);
}
return (err);
}
@@ -992,18 +993,21 @@ zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx)
zap_attribute_t za;
int err;
+ err = 0;
for (zap_cursor_init(&zc, os, fromobj);
zap_cursor_retrieve(&zc, &za) == 0;
(void) zap_cursor_advance(&zc)) {
- if (za.za_integer_length != 8 || za.za_num_integers != 1)
- return (SET_ERROR(EINVAL));
+ if (za.za_integer_length != 8 || za.za_num_integers != 1) {
+ err = SET_ERROR(EINVAL);
+ break;
+ }
err = zap_add(os, intoobj, za.za_name,
8, 1, &za.za_first_integer, tx);
if (err)
- return (err);
+ break;
}
zap_cursor_fini(&zc);
- return (0);
+ return (err);
}
int
@@ -1014,18 +1018,21 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
zap_attribute_t za;
int err;
+ err = 0;
for (zap_cursor_init(&zc, os, fromobj);
zap_cursor_retrieve(&zc, &za) == 0;
(void) zap_cursor_advance(&zc)) {
- if (za.za_integer_length != 8 || za.za_num_integers != 1)
- return (SET_ERROR(EINVAL));
+ if (za.za_integer_length != 8 || za.za_num_integers != 1) {
+ err = SET_ERROR(EINVAL);
+ break;
+ }
err = zap_add(os, intoobj, za.za_name,
8, 1, &value, tx);
if (err)
- return (err);
+ break;
}
zap_cursor_fini(&zc);
- return (0);
+ return (err);
}
int
@@ -1036,24 +1043,27 @@ zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj,
zap_attribute_t za;
int err;
+ err = 0;
for (zap_cursor_init(&zc, os, fromobj);
zap_cursor_retrieve(&zc, &za) == 0;
(void) zap_cursor_advance(&zc)) {
uint64_t delta = 0;
- if (za.za_integer_length != 8 || za.za_num_integers != 1)
- return (SET_ERROR(EINVAL));
+ if (za.za_integer_length != 8 || za.za_num_integers != 1) {
+ err = SET_ERROR(EINVAL);
+ break;
+ }
err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta);
if (err != 0 && err != ENOENT)
- return (err);
+ break;
delta += za.za_first_integer;
err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx);
if (err)
- return (err);
+ break;
}
zap_cursor_fini(&zc);
- return (0);
+ return (err);
}
int
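The three zap_join*() hunks above apply the same fix: an early return from inside the cursor loop used to skip zap_cursor_fini(), so the error is now recorded and the loop is exited with break, letting the single cleanup call run on every path. The generic shape of that pattern is sketched below in plain C; iter_open/iter_next/iter_close are hypothetical stand-ins, not ZAP interfaces.

/*
 * Single-exit iteration: record the error, break, and let the one
 * cleanup call run on every path.
 */
#include <stdio.h>
#include <errno.h>

struct iter { int pos; };

static void iter_open(struct iter *it)  { it->pos = 0; }
static int
iter_next(struct iter *it, int *val)
{
	if (it->pos >= 5)
		return (0);		/* exhausted */
	*val = it->pos++;
	return (1);
}
static void iter_close(struct iter *it) { printf("closed at %d\n", it->pos); }

static int
process_all(void)
{
	struct iter it;
	int val, err = 0;

	iter_open(&it);
	while (iter_next(&it, &val)) {
		if (val == 3) {		/* simulated validation failure */
			err = EINVAL;
			break;		/* not "return": cleanup must run */
		}
	}
	iter_close(&it);		/* always reached */
	return (err);
}

int
main(void)
{
	printf("process_all() = %d\n", process_all());
	return (0);
}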
diff --git a/usr/src/uts/common/fs/zfs/zfs_acl.c b/usr/src/uts/common/fs/zfs/zfs_acl.c
index 1b296b2897..2eecefd8cf 100644
--- a/usr/src/uts/common/fs/zfs/zfs_acl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_acl.c
@@ -1362,7 +1362,8 @@ zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t trim, zfs_acl_t *aclp)
zacep = (void *)((uintptr_t)zacep + abstract_size);
new_count++;
new_bytes += abstract_size;
- } if (masks.deny1) {
+ }
+ if (masks.deny1) {
zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER);
zacep = (void *)((uintptr_t)zacep + abstract_size);
new_count++;
@@ -1766,7 +1767,7 @@ zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids)
}
/*
- * Retrieve a files ACL
+ * Retrieve a file's ACL
*/
int
zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
@@ -1921,7 +1922,7 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type,
}
/*
- * Set a files ACL
+ * Set a file's ACL
*/
int
zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr)
@@ -2342,6 +2343,7 @@ slow:
/*
* Determine whether Access should be granted/denied.
+ *
* The least priv subsytem is always consulted as a basic privilege
* can define any form of access.
*/
@@ -2537,7 +2539,6 @@ zfs_delete_final_check(znode_t *zp, znode_t *dzp,
* Determine whether Access should be granted/deny, without
* consulting least priv subsystem.
*
- *
* The following chart is the recommended NFSv4 enforcement for
* ability to delete an object.
*
diff --git a/usr/src/uts/common/fs/zfs/zfs_ctldir.c b/usr/src/uts/common/fs/zfs/zfs_ctldir.c
index f915d21db2..5928fe75e9 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ctldir.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ctldir.c
@@ -505,6 +505,11 @@ static const fs_operation_def_t zfsctl_tops_root[] = {
{ NULL }
};
+/*
+ * Gets the full dataset name that corresponds to the given snapshot name
+ * Example:
+ * zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1"
+ */
static int
zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
{
@@ -1046,6 +1051,7 @@ zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp,
/*
* pvp is the '.zfs' directory (zfsctl_node_t).
+ *
* Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t).
*
* This function is the callback to create a GFS vnode for '.zfs/snapshot'
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index 15b19b0d06..06ea64181d 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -323,9 +323,7 @@ zfs_is_bootfs(const char *name)
}
/*
- * zfs_earlier_version
- *
- * Return non-zero if the spa version is less than requested version.
+ * Return non-zero if the spa version is less than requested version.
*/
static int
zfs_earlier_version(const char *name, int version)
@@ -343,8 +341,6 @@ zfs_earlier_version(const char *name, int version)
}
/*
- * zpl_earlier_version
- *
* Return TRUE if the ZPL version is less than requested version.
*/
static boolean_t
@@ -2986,10 +2982,10 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx)
/*
* inputs:
- * createprops list of properties requested by creator
- * default_zplver zpl version to use if unspecified in createprops
- * fuids_ok fuids allowed in this version of the spa?
* os parent objset pointer (NULL if root fs)
+ * fuids_ok fuids allowed in this version of the spa?
+ * sa_ok SAs allowed in this version of the spa?
+ * createprops list of properties requested by creator
*
* outputs:
* zplprops values for the zplprops we attach to the master node object
@@ -3395,41 +3391,44 @@ zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl)
*
* This function is best-effort. Callers must deal gracefully if it
* remains mounted (or is remounted after this call).
+ *
+ * Returns 0 if the argument is not a snapshot, or it is not currently a
+ * filesystem, or we were able to unmount it. Returns error code otherwise.
*/
-void
+int
zfs_unmount_snap(const char *snapname)
{
vfs_t *vfsp;
zfsvfs_t *zfsvfs;
+ int err;
if (strchr(snapname, '@') == NULL)
- return;
+ return (0);
vfsp = zfs_get_vfs(snapname);
if (vfsp == NULL)
- return;
+ return (0);
zfsvfs = vfsp->vfs_data;
ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os)));
- if (vn_vfswlock(vfsp->vfs_vnodecovered) != 0) {
- VFS_RELE(vfsp);
- return;
- }
+ err = vn_vfswlock(vfsp->vfs_vnodecovered);
VFS_RELE(vfsp);
+ if (err != 0)
+ return (SET_ERROR(err));
/*
* Always force the unmount for snapshots.
*/
(void) dounmount(vfsp, MS_FORCE, kcred);
+ return (0);
}
/* ARGSUSED */
static int
zfs_unmount_snap_cb(const char *snapname, void *arg)
{
- zfs_unmount_snap(snapname);
- return (0);
+ return (zfs_unmount_snap(snapname));
}
/*
@@ -3452,7 +3451,7 @@ zfs_destroy_unmount_origin(const char *fsname)
char originname[MAXNAMELEN];
dsl_dataset_name(ds->ds_prev, originname);
dmu_objset_rele(os, FTAG);
- zfs_unmount_snap(originname);
+ (void) zfs_unmount_snap(originname);
} else {
dmu_objset_rele(os, FTAG);
}
@@ -3470,7 +3469,7 @@ zfs_destroy_unmount_origin(const char *fsname)
static int
zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
{
- int poollen;
+ int error, poollen;
nvlist_t *snaps;
nvpair_t *pair;
boolean_t defer;
@@ -3491,7 +3490,9 @@ zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
(name[poollen] != '/' && name[poollen] != '@'))
return (SET_ERROR(EXDEV));
- zfs_unmount_snap(name);
+ error = zfs_unmount_snap(name);
+ if (error != 0)
+ return (error);
}
return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
@@ -3509,8 +3510,12 @@ static int
zfs_ioc_destroy(zfs_cmd_t *zc)
{
int err;
- if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS)
- zfs_unmount_snap(zc->zc_name);
+
+ if (zc->zc_objset_type == DMU_OST_ZFS) {
+ err = zfs_unmount_snap(zc->zc_name);
+ if (err != 0)
+ return (err);
+ }
if (strchr(zc->zc_name, '@'))
err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy);
@@ -3556,8 +3561,7 @@ recursive_unmount(const char *fsname, void *arg)
char fullname[MAXNAMELEN];
(void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname);
- zfs_unmount_snap(fullname);
- return (0);
+ return (zfs_unmount_snap(fullname));
}
/*
@@ -5016,14 +5020,18 @@ static int
zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
{
nvpair_t *pair;
+ int err;
/*
* The release may cause the snapshot to be destroyed; make sure it
* is not mounted.
*/
for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
- pair = nvlist_next_nvpair(holds, pair))
- zfs_unmount_snap(nvpair_name(pair));
+ pair = nvlist_next_nvpair(holds, pair)) {
+ err = zfs_unmount_snap(nvpair_name(pair));
+ if (err != 0)
+ return (err);
+ }
return (dsl_dataset_user_release(holds, errlist));
}
diff --git a/usr/src/uts/common/fs/zfs/zfs_log.c b/usr/src/uts/common/fs/zfs/zfs_log.c
index de786bf7f4..aeaba2233a 100644
--- a/usr/src/uts/common/fs/zfs/zfs_log.c
+++ b/usr/src/uts/common/fs/zfs/zfs_log.c
@@ -211,9 +211,8 @@ zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
}
/*
- * zfs_log_create() is used to handle TX_CREATE, TX_CREATE_ATTR, TX_MKDIR,
- * TX_MKDIR_ATTR and TX_MKXATTR
- * transactions.
+ * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and
+ * TX_MKXATTR transactions.
*
* TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
* domain information appended prior to the name. In this case the
@@ -340,7 +339,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
}
/*
- * zfs_log_remove() handles both TX_REMOVE and TX_RMDIR transactions.
+ * Handles both TX_REMOVE and TX_RMDIR transactions.
*/
void
zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
@@ -364,7 +363,7 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
}
/*
- * zfs_log_link() handles TX_LINK transactions.
+ * Handles TX_LINK transactions.
*/
void
zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
@@ -387,7 +386,7 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
}
/*
- * zfs_log_symlink() handles TX_SYMLINK transactions.
+ * Handles TX_SYMLINK transactions.
*/
void
zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
@@ -419,7 +418,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
}
/*
- * zfs_log_rename() handles TX_RENAME transactions.
+ * Handles TX_RENAME transactions.
*/
void
zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
@@ -445,7 +444,7 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
}
/*
- * zfs_log_write() handles TX_WRITE transactions.
+ * Handles TX_WRITE transactions.
*/
ssize_t zfs_immediate_write_sz = 32768;
@@ -524,7 +523,7 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
}
/*
- * zfs_log_truncate() handles TX_TRUNCATE transactions.
+ * Handles TX_TRUNCATE transactions.
*/
void
zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
@@ -547,7 +546,7 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
}
/*
- * zfs_log_setattr() handles TX_SETATTR transactions.
+ * Handles TX_SETATTR transactions.
*/
void
zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
@@ -609,7 +608,7 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
}
/*
- * zfs_log_acl() handles TX_ACL transactions.
+ * Handles TX_ACL transactions.
*/
void
zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
diff --git a/usr/src/uts/common/fs/zfs/zfs_rlock.c b/usr/src/uts/common/fs/zfs/zfs_rlock.c
index be562496b0..b40bdbea12 100644
--- a/usr/src/uts/common/fs/zfs/zfs_rlock.c
+++ b/usr/src/uts/common/fs/zfs/zfs_rlock.c
@@ -28,7 +28,7 @@
/*
* This file contains the code to implement file range locking in
- * ZFS, although there isn't much specific to ZFS (all that comes to mind
+ * ZFS, although there isn't much specific to ZFS (all that comes to mind is
* support for growing the blocksize).
*
* Interface
diff --git a/usr/src/uts/common/fs/zfs/zfs_sa.c b/usr/src/uts/common/fs/zfs/zfs_sa.c
index d141e43d72..ed5f276475 100644
--- a/usr/src/uts/common/fs/zfs/zfs_sa.c
+++ b/usr/src/uts/common/fs/zfs/zfs_sa.c
@@ -187,7 +187,7 @@ zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
/*
* I'm not convinced we should do any of this upgrade.
* since the SA code can read both old/new znode formats
- * with probably little to know performance difference.
+ * with probably little to no performance difference.
*
* All new files will be created with the new format.
*/
diff --git a/usr/src/uts/common/fs/zfs/zfs_vfsops.c b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
index e337861cd4..c7d4444722 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vfsops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vfsops.c
@@ -1349,13 +1349,12 @@ zfs_parse_bootfs(char *bpath, char *outpath)
}
/*
- * zfs_check_global_label:
- * Check that the hex label string is appropriate for the dataset
- * being mounted into the global_zone proper.
+ * Check that the hex label string is appropriate for the dataset being
+ * mounted into the global_zone proper.
*
- * Return an error if the hex label string is not default or
- * admin_low/admin_high. For admin_low labels, the corresponding
- * dataset must be readonly.
+ * Return an error if the hex label string is not default or
+ * admin_low/admin_high. For admin_low labels, the corresponding
+ * dataset must be readonly.
*/
int
zfs_check_global_label(const char *dsname, const char *hexsl)
@@ -1377,15 +1376,12 @@ zfs_check_global_label(const char *dsname, const char *hexsl)
}
/*
- * zfs_mount_label_policy:
- * Determine whether the mount is allowed according to MAC check.
- * by comparing (where appropriate) label of the dataset against
- * the label of the zone being mounted into. If the dataset has
- * no label, create one.
+ * Determine whether the mount is allowed according to the MAC check,
+ * by comparing (where appropriate) the label of the dataset against
+ * the label of the zone being mounted into. If the dataset has
+ * no label, create one.
*
- * Returns:
- * 0 : access allowed
- * >0 : error code, such as EACCES
+ * Returns 0 if access allowed, error otherwise (e.g. EACCES)
*/
static int
zfs_mount_label_policy(vfs_t *vfsp, char *osname)
diff --git a/usr/src/uts/common/fs/zfs/zfs_vnops.c b/usr/src/uts/common/fs/zfs/zfs_vnops.c
index 161e573175..b0901bea0c 100644
--- a/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ b/usr/src/uts/common/fs/zfs/zfs_vnops.c
@@ -95,11 +95,11 @@
* The ordering of events is important to avoid deadlocks and references
* to freed memory. The example below illustrates the following Big Rules:
*
- * (1) A check must be made in each zfs thread for a mounted file system.
+ * (1) A check must be made in each zfs thread for a mounted file system.
* This is done avoiding races using ZFS_ENTER(zfsvfs).
- * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
- * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
- * can return EIO from the calling function.
+ * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
+ * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
+ * can return EIO from the calling function.
*
* (2) VN_RELE() should always be the last thing except for zil_commit()
* (if necessary) and ZFS_EXIT(). This is for 3 reasons:
@@ -131,7 +131,7 @@
* (5) If the operation succeeded, generate the intent log entry for it
* before dropping locks. This ensures that the ordering of events
* in the intent log matches the order in which they actually occurred.
- * During ZIL replay the zfs_log_* functions will update the sequence
+ * During ZIL replay the zfs_log_* functions will update the sequence
* number to indicate the zil transaction has replayed.
*
* (6) At the end of each vnode op, the DMU tx must always commit,
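The Big Rules list continues past this hunk, but the ordering described by rules (1), (2), and (6) can be shown in a schematic vnode operation. The sketch below compiles on its own only because ZFS_ENTER, ZFS_VERIFY_ZP, and ZFS_EXIT are replaced by trivial userland stand-ins; zfs_example_op() and zil_commit_model() are hypothetical and do not correspond to real ZFS functions.

/*
 * Schematic vnode op: ZFS_ENTER first, ZFS_VERIFY_ZP on every znode,
 * do the work, zil_commit if synchronous semantics require it, and
 * ZFS_EXIT before every return.  The macros and types below are
 * userland stand-ins; the real definitions live in sys/zfs_znode.h.
 */
#include <stdio.h>
#include <errno.h>

typedef struct { int z_unmounted; } zfsvfs_t;			/* stand-in */
typedef struct { zfsvfs_t *z_zfsvfs; int z_valid; } znode_t;	/* stand-in */

#define	ZFS_ENTER(zfsvfs) \
	do { \
		if ((zfsvfs)->z_unmounted) \
			return (EIO); \
	} while (0)

#define	ZFS_EXIT(zfsvfs)	((void)(zfsvfs))

#define	ZFS_VERIFY_ZP(zp) \
	do { \
		if (!(zp)->z_valid) { \
			ZFS_EXIT((zp)->z_zfsvfs); \
			return (EIO); \
		} \
	} while (0)

static void
zil_commit_model(zfsvfs_t *zfsvfs)
{
	(void) zfsvfs;
	printf("zil_commit (only if synchronous semantics require it)\n");
}

/* Hypothetical op: not a real ZFS function. */
static int
zfs_example_op(znode_t *zp, int sync)
{
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);	/* rule (1): check for a mounted file system */
	ZFS_VERIFY_ZP(zp);	/* rule (1): validate every znode used */

	/* ... grab locks, do the work, log the intent, drop locks ... */

	if (sync)
		zil_commit_model(zfsvfs);	/* rule (2): just before exit */
	ZFS_EXIT(zfsvfs);	/* rule (1): paired with ZFS_ENTER on all returns */
	return (0);
}

int
main(void)
{
	zfsvfs_t fs = { 0 };
	znode_t zn = { &fs, 1 };

	printf("zfs_example_op -> %d\n", zfs_example_op(&zn, 1));
	return (0);
}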
@@ -388,7 +388,7 @@ update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
* else we default from the dmu buffer.
*
* NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
- * the file is memory mapped.
+ * the file is memory mapped.
*/
static int
mappedread(vnode_t *vp, int nbytes, uio_t *uio)
@@ -437,8 +437,7 @@ offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
*
* OUT: uio - updated offset and range, buffer filled.
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Side Effects:
* vp - atime updated if byte count > 0
@@ -574,14 +573,14 @@ out:
* IN: vp - vnode of file to be written to.
* uio - structure supplying write location, range info,
* and data buffer.
- * ioflag - FAPPEND flag set if in append mode.
+ * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
+ * set if in append mode.
* cr - credentials of caller.
* ct - caller context (NFS/CIFS fem monitor only)
*
* OUT: uio - updated offset and range.
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - ctime|mtime updated if byte count > 0
@@ -1149,8 +1148,7 @@ specvp_check(vnode_t **vpp, cred_t *cr)
*
* OUT: vpp - vnode of located entry, NULL if not found.
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* NA
@@ -1291,8 +1289,7 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
*
* OUT: vpp - vnode of created or trunc'd entry.
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime updated if new entry created
@@ -1542,8 +1539,7 @@ out:
* ct - caller context
* flags - case flags
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime
@@ -1773,12 +1769,12 @@ out:
* vap - attributes of new directory.
* cr - credentials of caller.
* ct - caller context
+ * flags - case flags
* vsecp - ACL to be set
*
* OUT: vpp - vnode of created directory.
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime updated
@@ -1958,8 +1954,7 @@ top:
* ct - caller context
* flags - case flags
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime updated
@@ -2077,7 +2072,7 @@ out:
/*
* Read as many directory entries as will fit into the provided
* buffer from the given directory cursor position (specified in
- * the uio structure.
+ * the uio structure).
*
* IN: vp - vnode of directory to read.
* uio - structure supplying read location, range info,
@@ -2089,8 +2084,7 @@ out:
* OUT: uio - updated offset and range, buffer filled.
* eofp - set to true if end-of-file detected.
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - atime updated
@@ -2409,7 +2403,7 @@ zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
*
* OUT: vap - attribute values.
*
- * RETURN: 0 (always succeeds)
+ * RETURN: 0 (always succeeds).
*/
/* ARGSUSED */
static int
@@ -2611,8 +2605,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
* cr - credentials of caller.
* ct - caller context
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - ctime updated, mtime updated if size changed.
@@ -2620,7 +2613,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
/* ARGSUSED */
static int
zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
- caller_context_t *ct)
+ caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
@@ -3213,6 +3206,7 @@ out:
if (attrzp)
VN_RELE(ZTOV(attrzp));
+
if (aclp)
zfs_acl_free(aclp);
@@ -3347,8 +3341,7 @@ zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
* ct - caller context
* flags - case flags
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* sdvp,tdvp - ctime|mtime updated
@@ -3695,13 +3688,11 @@ out:
* IN: dvp - Directory to contain new symbolic link.
* link - Name for new symlink entry.
* vap - Attributes of new entry.
- * target - Target path of new symlink.
* cr - credentials of caller.
* ct - caller context
* flags - case flags
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* dvp - ctime|mtime updated
@@ -3847,14 +3838,13 @@ top:
* the symbolic path referred to by vp.
*
* IN: vp - vnode of symbolic link.
- * uoip - structure to contain the link path.
+ * uio - structure to contain the link path.
* cr - credentials of caller.
* ct - caller context
*
- * OUT: uio - structure to contain the link path.
+ * OUT: uio - structure containing the link path.
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - atime updated
@@ -3893,8 +3883,7 @@ zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
* cr - credentials of caller.
* ct - caller context
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* tdvp - ctime|mtime updated
@@ -4062,8 +4051,7 @@ zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
* OUT: offp - start of range pushed.
* lenp - len of range pushed.
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* NOTE: callers must have locked the page to be pushed. On
* exit, the page (and all other pages in the kluster) must be
@@ -4187,8 +4175,7 @@ out:
* cr - credentials of caller.
* ct - caller context.
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - ctime|mtime updated
@@ -4353,8 +4340,7 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
* noffp - pointer to new file offset
* ct - caller context
*
- * RETURN: 0 if success
- * EINVAL if new offset invalid
+ * RETURN: 0 on success, EINVAL if new offset invalid.
*/
/* ARGSUSED */
static int
@@ -4490,8 +4476,7 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
* OUT: protp - protection mode of created pages.
* pl - list of pages created.
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - atime updated
@@ -4499,8 +4484,8 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
/* ARGSUSED */
static int
zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
- page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
- enum seg_rw rw, cred_t *cr, caller_context_t *ct)
+ page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
+ enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
znode_t *zp = VTOZ(vp);
zfsvfs_t *zfsvfs = zp->z_zfsvfs;
@@ -4575,15 +4560,11 @@ out:
* Request a memory map for a section of a file. This code interacts
* with common code and the VM system as follows:
*
- * common code calls mmap(), which ends up in smmap_common()
- *
- * this calls VOP_MAP(), which takes you into (say) zfs
- *
- * zfs_map() calls as_map(), passing segvn_create() as the callback
- *
- * segvn_create() creates the new segment and calls VOP_ADDMAP()
- *
- * zfs_addmap() updates z_mapcnt
+ * - common code calls mmap(), which ends up in smmap_common()
+ * - this calls VOP_MAP(), which takes you into (say) zfs
+ * - zfs_map() calls as_map(), passing segvn_create() as the callback
+ * - segvn_create() creates the new segment and calls VOP_ADDMAP()
+ * - zfs_addmap() updates z_mapcnt
*/
/*ARGSUSED*/
static int
@@ -4700,8 +4681,7 @@ zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
* cr - credentials of caller [UNUSED].
* ct - caller context.
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure.
*
* Timestamps:
* vp - ctime|mtime updated
@@ -4916,13 +4896,14 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
}
/*
- * Tunable, both must be a power of 2.
- *
- * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf
- * zcr_blksz_max: if set to less than the file block size, allow loaning out of
- * an arcbuf for a partial block read
+ * The smallest read we may consider to loan out an arcbuf.
+ * This must be a power of 2.
*/
int zcr_blksz_min = (1 << 10); /* 1K */
+/*
+ * If set to less than the file block size, allow loaning out of an
+ * arcbuf for a partial block read. This must be a power of 2.
+ */
int zcr_blksz_max = (1 << 17); /* 128K */
/*ARGSUSED*/
@@ -5196,10 +5177,12 @@ const fs_operation_def_t zfs_sharevnodeops_template[] = {
/*
* Extended attribute directory vnode operations template
- * This template is identical to the directory vnodes
- * operation template except for restricted operations:
- * VOP_MKDIR()
- * VOP_SYMLINK()
+ *
+ * This template is identical to the directory vnodes
+ * operation template except for restricted operations:
+ * VOP_MKDIR()
+ * VOP_SYMLINK()
+ *
* Note that there are other restrictions embedded in:
* zfs_create() - restrict type to VREG
* zfs_link() - no links into/out of attribute space
diff --git a/usr/src/uts/common/fs/zfs/zfs_znode.c b/usr/src/uts/common/fs/zfs/zfs_znode.c
index 7d4f3084ed..f9433b6b44 100644
--- a/usr/src/uts/common/fs/zfs/zfs_znode.c
+++ b/usr/src/uts/common/fs/zfs/zfs_znode.c
@@ -1006,9 +1006,8 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr,
}
/*
- * zfs_xvattr_set only updates the in-core attributes
- * it is assumed the caller will be doing an sa_bulk_update
- * to push the changes out
+ * Update in-core attributes. It is assumed the caller will be doing an
+ * sa_bulk_update to push the changes out.
*/
void
zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
@@ -1447,8 +1446,7 @@ zfs_no_putpage(vnode_t *vp, page_t *pp, u_offset_t *offp, size_t *lenp,
* IN: zp - znode of file to free data in.
* end - new end-of-file
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure
*/
static int
zfs_extend(znode_t *zp, uint64_t end)
@@ -1525,8 +1523,7 @@ top:
* off - start of section to free.
* len - length of section to free.
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure
*/
static int
zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
@@ -1564,8 +1561,7 @@ zfs_free_range(znode_t *zp, uint64_t off, uint64_t len)
* IN: zp - znode of file to free data in.
* end - new end-of-file.
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure
*/
static int
zfs_trunc(znode_t *zp, uint64_t end)
@@ -1663,8 +1659,7 @@ top:
* flag - current file open mode flags.
* log - TRUE if this action should be logged
*
- * RETURN: 0 if success
- * error code if failure
+ * RETURN: 0 on success, error code on failure
*/
int
zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log)
diff --git a/usr/src/uts/common/fs/zfs/zil.c b/usr/src/uts/common/fs/zfs/zil.c
index 8142e24210..da809916a4 100644
--- a/usr/src/uts/common/fs/zfs/zil.c
+++ b/usr/src/uts/common/fs/zfs/zil.c
@@ -66,9 +66,9 @@
*/
/*
- * This global ZIL switch affects all pools
+ * Disable intent logging replay. This global ZIL switch affects all pools.
*/
-int zil_replay_disable = 0; /* disable intent logging replay */
+int zil_replay_disable = 0;
/*
* Tunable parameter for debugging or performance analysis. Setting
@@ -879,6 +879,7 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb)
/*
* Define a limited set of intent log block sizes.
+ *
* These must be a multiple of 4KB. Note only the amount used (again
* aligned to 4KB) actually gets written. However, we can't always just
* allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted.
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index 425dcb3100..30738dce53 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -1193,13 +1193,16 @@ zio_interrupt(zio_t *zio)
/*
* Execute the I/O pipeline until one of the following occurs:
- * (1) the I/O completes; (2) the pipeline stalls waiting for
- * dependent child I/Os; (3) the I/O issues, so we're waiting
- * for an I/O completion interrupt; (4) the I/O is delegated by
- * vdev-level caching or aggregation; (5) the I/O is deferred
- * due to vdev-level queueing; (6) the I/O is handed off to
- * another thread. In all cases, the pipeline stops whenever
- * there's no CPU work; it never burns a thread in cv_wait().
+ *
+ * (1) the I/O completes
+ * (2) the pipeline stalls waiting for dependent child I/Os
+ * (3) the I/O issues, so we're waiting for an I/O completion interrupt
+ * (4) the I/O is delegated by vdev-level caching or aggregation
+ * (5) the I/O is deferred due to vdev-level queueing
+ * (6) the I/O is handed off to another thread.
+ *
+ * In all cases, the pipeline stops whenever there's no CPU work; it never
+ * burns a thread in cv_wait().
*
* There's no locking on io_stage because there's no legitimate way
* for multiple threads to be attempting to process the same I/O.
diff --git a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c
index 73cef1199b..bea112d166 100644
--- a/usr/src/uts/common/io/sata/adapters/ahci/ahci.c
+++ b/usr/src/uts/common/io/sata/adapters/ahci/ahci.c
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
*/
/*
@@ -105,13 +106,14 @@ static int ahci_check_slot_handle(ahci_port_t *, int);
/*
* Local function prototypes
*/
+static int ahci_setup_port_base_addresses(ahci_ctl_t *, ahci_port_t *);
static int ahci_alloc_ports_state(ahci_ctl_t *);
static void ahci_dealloc_ports_state(ahci_ctl_t *);
static int ahci_alloc_port_state(ahci_ctl_t *, uint8_t);
static void ahci_dealloc_port_state(ahci_ctl_t *, uint8_t);
-static int ahci_alloc_rcvd_fis(ahci_ctl_t *, ahci_port_t *, uint8_t);
+static int ahci_alloc_rcvd_fis(ahci_ctl_t *, ahci_port_t *);
static void ahci_dealloc_rcvd_fis(ahci_port_t *);
-static int ahci_alloc_cmd_list(ahci_ctl_t *, ahci_port_t *, uint8_t);
+static int ahci_alloc_cmd_list(ahci_ctl_t *, ahci_port_t *);
static void ahci_dealloc_cmd_list(ahci_ctl_t *, ahci_port_t *);
static int ahci_alloc_cmd_tables(ahci_ctl_t *, ahci_port_t *);
static void ahci_dealloc_cmd_tables(ahci_ctl_t *, ahci_port_t *);
@@ -122,6 +124,7 @@ static int ahci_initialize_controller(ahci_ctl_t *);
static void ahci_uninitialize_controller(ahci_ctl_t *);
static int ahci_initialize_port(ahci_ctl_t *, ahci_port_t *, ahci_addr_t *);
static int ahci_config_space_init(ahci_ctl_t *);
+static void ahci_staggered_spin_up(ahci_ctl_t *, uint8_t);
static void ahci_drain_ports_taskq(ahci_ctl_t *);
static int ahci_rdwr_pmult(ahci_ctl_t *, ahci_addr_t *, uint8_t, uint32_t *,
@@ -454,6 +457,10 @@ _init(void)
goto err_out;
}
+ /* watchdog tick */
+ ahci_watchdog_tick = drv_usectohz(
+ (clock_t)ahci_watchdog_timeout * 1000000);
+
ret = mod_install(&modlinkage);
if (ret != 0) {
sata_hba_fini(&modlinkage);
@@ -464,9 +471,6 @@ _init(void)
goto err_out;
}
- /* watchdog tick */
- ahci_watchdog_tick = drv_usectohz(
- (clock_t)ahci_watchdog_timeout * 1000000);
return (ret);
err_out:
@@ -517,6 +521,7 @@ ahci_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
int status;
int attach_state;
uint32_t cap_status, ahci_version;
+ uint32_t ghc_control;
int intr_types;
int i;
pci_regspec_t *regs;
@@ -544,6 +549,16 @@ ahci_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
ahci_ctlp = ddi_get_soft_state(ahci_statep, instance);
mutex_enter(&ahci_ctlp->ahcictl_mutex);
+ /*
+ * GHC.AE must be set to 1 before any other AHCI register
+ * is accessed
+ */
+ ghc_control = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_GLOBAL_GHC(ahci_ctlp));
+ ghc_control |= AHCI_HBA_GHC_AE;
+ ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_GLOBAL_GHC(ahci_ctlp), ghc_control);
+
/* Restart watch thread */
if (ahci_ctlp->ahcictl_timeout_id == 0)
ahci_ctlp->ahcictl_timeout_id = timeout(
@@ -655,6 +670,16 @@ ahci_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
attach_state |= AHCI_ATTACH_STATE_REG_MAP;
+ /*
+ * GHC.AE must be set to 1 before any other AHCI register
+ * is accessed
+ */
+ ghc_control = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_GLOBAL_GHC(ahci_ctlp));
+ ghc_control |= AHCI_HBA_GHC_AE;
+ ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_GLOBAL_GHC(ahci_ctlp), ghc_control);
+
/* Get the AHCI version information */
ahci_version = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
(uint32_t *)AHCI_GLOBAL_VS(ahci_ctlp));
@@ -678,6 +703,18 @@ ahci_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "hba capabilities = 0x%x",
cap_status);
+ /* CAP2 (HBA Capabilities Extended) is available since AHCI spec 1.2 */
+ if (ahci_version >= 0x00010200) {
+ uint32_t cap2_status;
+
+ /* Get the HBA capabilities extended information */
+ cap2_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_GLOBAL_CAP2(ahci_ctlp));
+
+ AHCIDBG(AHCIDBG_INIT, ahci_ctlp,
+ "hba capabilities extended = 0x%x", cap2_status);
+ }
+
#if AHCI_DEBUG
/* Get the interface speed supported by the HBA */
speed = (cap_status & AHCI_HBA_CAP_ISS) >> AHCI_HBA_CAP_ISS_SHIFT;
@@ -709,20 +746,12 @@ ahci_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "hba implementation of ports: 0x%x",
ahci_ctlp->ahcictl_ports_implemented);
- /*
- * According to the AHCI spec, CAP.NP should indicate the maximum
- * number of ports supported by the HBA silicon, but we found
- * this value of ICH8 chipset only indicates the number of ports
- * implemented (exposed) by it. Therefore, the driver should calculate
- * the potential maximum value by checking PI register, and use
- * the maximum of this value and CAP.NP.
- */
- ahci_ctlp->ahcictl_num_ports = max(
- (cap_status & AHCI_HBA_CAP_NP) + 1,
- ddi_fls(ahci_ctlp->ahcictl_ports_implemented));
+ /* Max port number implemented */
+ ahci_ctlp->ahcictl_num_ports =
+ ddi_fls(ahci_ctlp->ahcictl_ports_implemented);
AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "hba number of ports: %d",
- ahci_ctlp->ahcictl_num_ports);
+ (cap_status & AHCI_HBA_CAP_NP) + 1);
/* Get the number of implemented ports by the HBA */
ahci_ctlp->ahcictl_num_implemented_ports =
@@ -3537,6 +3566,7 @@ ahci_check_slot_handle(ahci_port_t *ahci_portp, int slot)
}
return (DDI_SUCCESS);
}
+
/*
* Allocate the ports structure, only called by ahci_attach
*/
@@ -3637,28 +3667,11 @@ ahci_initialize_controller(ahci_ctl_t *ahci_ctlp)
{
ahci_port_t *ahci_portp;
ahci_addr_t addr;
- uint32_t ghc_control;
int port;
AHCIDBG(AHCIDBG_INIT|AHCIDBG_ENTRY, ahci_ctlp,
"ahci_initialize_controller enter", NULL);
- mutex_enter(&ahci_ctlp->ahcictl_mutex);
-
- /*
- * Indicate that system software is AHCI aware by setting
- * GHC.AE to 1
- */
- ghc_control = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_GLOBAL_GHC(ahci_ctlp));
-
- ghc_control |= AHCI_HBA_GHC_AE;
- ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_GLOBAL_GHC(ahci_ctlp),
- ghc_control);
-
- mutex_exit(&ahci_ctlp->ahcictl_mutex);
-
/* Initialize the implemented ports and structures */
for (port = 0; port < ahci_ctlp->ahcictl_num_ports; port++) {
if (!AHCI_PORT_IMPLEMENTED(ahci_ctlp, port)) {
@@ -3821,6 +3834,41 @@ ahci_dealloc_pmult(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp)
}
/*
+ * Staggered Spin-up.
+ *
+ * WARNING!!! ahciport_mutex should be acquired before the function
+ * is called.
+ */
+static void
+ahci_staggered_spin_up(ahci_ctl_t *ahci_ctlp, uint8_t port)
+{
+ uint32_t cap_status;
+ uint32_t port_cmd_status;
+
+ cap_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_GLOBAL_CAP(ahci_ctlp));
+
+ /* Check for staggered spin-up support */
+ if (!(cap_status & AHCI_HBA_CAP_SSS))
+ return;
+
+ port_cmd_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port));
+
+ /* If PxCMD.SUD == 1, no staggered spin-up is needed */
+ if (port_cmd_status & AHCI_CMD_STATUS_SUD)
+ return;
+
+ AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "Spin-up at port %d", port);
+
+ /* Set PxCMD.SUD */
+ port_cmd_status |= AHCI_CMD_STATUS_SUD;
+ ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port),
+ port_cmd_status);
+}
+
+/*
* The routine is to initialize a port. First put the port in NOTRunning
* state, then enable port interrupt and clear Serror register. And under
* AHCI_ATTACH case, find device signature and then try to start the port.
@@ -3874,15 +3922,9 @@ ahci_initialize_port(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
"set PxCLB, PxCLBU, PxFB and PxFBU "
"during resume", port);
- /* Config Port Received FIS Base Address */
- ddi_put64(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint64_t *)AHCI_PORT_PxFB(ahci_ctlp, port),
- ahci_portp->ahciport_rcvd_fis_dma_cookie.dmac_laddress);
-
- /* Config Port Command List Base Address */
- ddi_put64(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint64_t *)AHCI_PORT_PxCLB(ahci_ctlp, port),
- ahci_portp->ahciport_cmd_list_dma_cookie.dmac_laddress);
+ if (ahci_setup_port_base_addresses(ahci_ctlp, ahci_portp) !=
+ AHCI_SUCCESS)
+ return (AHCI_FAILURE);
}
port_cmd_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
@@ -3904,6 +3946,9 @@ ahci_initialize_port(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
ahci_portp, port);
}
+ /* Make sure the drive is spun-up */
+ ahci_staggered_spin_up(ahci_ctlp, port);
+
/* Disable interrupt */
ahci_disable_port_intrs(ahci_ctlp, port);
@@ -3939,7 +3984,7 @@ ahci_initialize_port(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
if (ret != AHCI_SUCCESS) {
AHCIDBG(AHCIDBG_INIT|AHCIDBG_ERRS, ahci_ctlp,
"ahci_initialize_port:"
- "port reset faild at port %d", port);
+ "port reset failed at port %d", port);
return (AHCI_FAILURE);
}
@@ -3952,24 +3997,26 @@ ahci_initialize_port(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
return (AHCI_FAILURE);
}
}
+
AHCIPORT_SET_STATE(ahci_portp, addrp, SATA_STATE_READY);
AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "port %d is ready now.", port);
/*
* Try to get the device signature if the port is not empty.
*/
- if (!resuming && ahci_portp->ahciport_device_type != SATA_DTYPE_NONE)
+ if (!resuming && AHCIPORT_DEV_TYPE(ahci_portp, addrp) !=
+ SATA_DTYPE_NONE)
ahci_find_dev_signature(ahci_ctlp, ahci_portp, addrp);
/* Return directly if no device connected */
- if (ahci_portp->ahciport_device_type == SATA_DTYPE_NONE) {
+ if (AHCIPORT_DEV_TYPE(ahci_portp, addrp) == SATA_DTYPE_NONE) {
AHCIDBG(AHCIDBG_INIT, ahci_ctlp,
"No device connected to port %d", port);
goto out;
}
/* If this is a port multiplier, we need do some initialization */
- if (ahci_portp->ahciport_device_type == SATA_DTYPE_PMULT) {
+ if (AHCIPORT_DEV_TYPE(ahci_portp, addrp) == SATA_DTYPE_PMULT) {
AHCIDBG(AHCIDBG_INFO|AHCIDBG_PMULT, ahci_ctlp,
"Port multiplier found at port %d", port);
ahci_alloc_pmult(ahci_ctlp, ahci_portp);
@@ -4884,8 +4931,8 @@ ahci_probe_pmport(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
* the port must be idle and PxTFD.STS.BSY and PxTFD.STS.DRQ must be
* cleared unless command list override (PxCMD.CLO) is supported.
*
- * WARNING!!! ahciport_mutex should be acquired and PxCMD.FRE should be
- * set before the function is called.
+ * WARNING!!! ahciport_mutex should be acquired before the function
+ * is called.
*/
static int
ahci_software_reset(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
@@ -5130,17 +5177,12 @@ out:
*
* When an HBA or port reset occurs, Phy communication is going to
* be re-established with the device through a COMRESET followed by the
- * normal out-of-band communication sequence defined in Serial ATA. AT
+ * normal out-of-band communication sequence defined in Serial ATA. At
* the end of reset, the device, if working properly, will send a D2H
* Register FIS, which contains the device signature. When the HBA receives
* this FIS, it updates PxTFD.STS and PxTFD.ERR register fields, and updates
* the PxSIG register with the signature.
*
- * Staggered spin-up is an optional feature in SATA II, and it enables an HBA
- * to individually spin-up attached devices. Please refer to chapter 10.9 of
- * AHCI 1.0 spec.
- */
-/*
* WARNING!!! ahciport_mutex should be acquired, and PxCMD.ST should be also
* cleared before the function is called.
*/
@@ -5149,7 +5191,7 @@ ahci_port_reset(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
ahci_addr_t *addrp)
{
ahci_addr_t pmult_addr;
- uint32_t cap_status, port_cmd_status;
+ uint32_t port_cmd_status;
uint32_t port_scontrol, port_sstatus, port_serror;
uint32_t port_intr_status, port_task_file;
uint32_t port_state;
@@ -5169,117 +5211,45 @@ ahci_port_reset(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
"Port %d port resetting...", port);
ahci_portp->ahciport_port_state = 0;
- cap_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_GLOBAL_CAP(ahci_ctlp));
-
port_cmd_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
(uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port));
- if (cap_status & AHCI_HBA_CAP_SSS) {
- /*
- * HBA support staggered spin-up, if the port has
- * not spin up yet, then force it to do spin-up
- */
- if (!(port_cmd_status & AHCI_CMD_STATUS_SUD)) {
- if (!(ahci_portp->ahciport_flags
- & AHCI_PORT_FLAG_SPINUP)) {
- AHCIDBG(AHCIDBG_INIT, ahci_ctlp,
- "Port %d PxCMD.SUD is zero, force "
- "it to do spin-up", port);
- ahci_portp->ahciport_flags |=
- AHCI_PORT_FLAG_SPINUP;
- }
- }
- } else {
- /*
- * HBA doesn't support stagger spin-up, force it
- * to do normal COMRESET
- */
- if (ahci_portp->ahciport_flags &
- AHCI_PORT_FLAG_SPINUP) {
- AHCIDBG(AHCIDBG_INIT, ahci_ctlp,
- "HBA does not support staggered spin-up "
- "force it to do normal COMRESET", NULL);
- ahci_portp->ahciport_flags &=
- ~AHCI_PORT_FLAG_SPINUP;
- }
- }
-
- if (!(ahci_portp->ahciport_flags & AHCI_PORT_FLAG_SPINUP)) {
- /* Do normal COMRESET */
- AHCIDBG(AHCIDBG_INFO, ahci_ctlp,
- "ahci_port_reset: do normal COMRESET", port);
-
- /*
- * According to the spec, SUD bit should be set here,
- * but JMicron JMB363 doesn't follow it, so remove
- * the assertion, and just print a debug message.
- */
-#if AHCI_DEBUG
- if (!(port_cmd_status & AHCI_CMD_STATUS_SUD))
- AHCIDBG(AHCIDBG_ERRS, ahci_ctlp,
- "port %d SUD bit not set", port)
-#endif
-
- port_scontrol = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port));
- SCONTROL_SET_DET(port_scontrol, SCONTROL_DET_COMRESET);
-
- ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port),
- port_scontrol);
-
- /* Enable PxCMD.FRE to read device */
- ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port),
- port_cmd_status|AHCI_CMD_STATUS_FRE);
-
- /*
- * Give time for COMRESET to percolate, according to the AHCI
- * spec, software shall wait at least 1 millisecond before
- * clearing PxSCTL.DET
- */
- drv_usecwait(AHCI_1MS_USECS*2);
-
- /* Fetch the SCONTROL again and rewrite the DET part with 0 */
- port_scontrol = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port));
- SCONTROL_SET_DET(port_scontrol, SCONTROL_DET_NOACTION);
- ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port),
- port_scontrol);
- } else {
- /* Do staggered spin-up */
- port_scontrol = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port));
- SCONTROL_SET_DET(port_scontrol, SCONTROL_DET_NOACTION);
+ /*
+ * According to the spec, SUD bit should be set here,
+ * but JMicron JMB363 doesn't follow it, so print
+ * a debug message.
+ */
+ if (!(port_cmd_status & AHCI_CMD_STATUS_SUD))
+ AHCIDBG(AHCIDBG_ERRS, ahci_ctlp,
+ "ahci_port_reset: port %d SUD bit not set", port);
- /* PxSCTL.DET must be 0 */
- ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port),
- port_scontrol);
+ port_scontrol = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port));
+ SCONTROL_SET_DET(port_scontrol, SCONTROL_DET_COMRESET);
- port_cmd_status &= ~AHCI_CMD_STATUS_SUD;
- ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port),
- port_cmd_status);
+ ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port),
+ port_scontrol);
- /* 0 -> 1 edge */
- drv_usecwait(AHCI_1MS_USECS*2);
+ /* Enable PxCMD.FRE to read device */
+ ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port),
+ port_cmd_status|AHCI_CMD_STATUS_FRE);
- /* Set PxCMD.SUD to 1 */
- port_cmd_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port));
- port_cmd_status |= AHCI_CMD_STATUS_SUD;
- ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port),
- port_cmd_status);
+ /*
+	 * Give time for COMRESET to percolate.  According to the AHCI
+	 * spec, software shall wait at least 1 millisecond before
+	 * clearing PxSCTL.DET.
+ */
+ drv_usecwait(AHCI_1MS_USECS * 2);
- /* Enable PxCMD.FRE to read device */
- ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port),
- port_cmd_status|AHCI_CMD_STATUS_FRE);
- }
+ /* Fetch the SCONTROL again and rewrite the DET part with 0 */
+ port_scontrol = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port));
+ SCONTROL_SET_DET(port_scontrol, SCONTROL_DET_NOACTION);
+ ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxSCTL(ahci_ctlp, port),
+ port_scontrol);
/*
* The port enters P:StartComm state, and HBA tells link layer to
@@ -5645,7 +5615,7 @@ err: /* R/W PMULT error */
*
* When an HBA reset occurs, Phy communication will be re-established with
* the device through a COMRESET followed by the normal out-of-band
- * communication sequence defined in Serial ATA. AT the end of reset, the
+ * communication sequence defined in Serial ATA. At the end of reset, the
* device, if working properly, will send a D2H Register FIS, which contains
* the device signature. When the HBA receives this FIS, it updates PxTFD.STS
* and PxTFD.ERR register fields, and updates the PxSIG register with the
@@ -5657,7 +5627,6 @@ static int
ahci_hba_reset(ahci_ctl_t *ahci_ctlp)
{
ahci_port_t *ahci_portp;
- ahci_addr_t addr;
uint32_t ghc_control;
uint8_t port;
int loop_count;
@@ -5728,7 +5697,8 @@ ahci_hba_reset(ahci_ctl_t *ahci_ctlp)
ahci_portp = ahci_ctlp->ahcictl_ports[port];
mutex_enter(&ahci_portp->ahciport_mutex);
- AHCI_ADDR_SET_PORT(&addr, port);
+ /* Make sure the drive is spun-up */
+ ahci_staggered_spin_up(ahci_ctlp, port);
if (ahci_restart_port_wait_till_ready(ahci_ctlp, ahci_portp,
port, AHCI_PORT_RESET|AHCI_RESET_NO_EVENTS_UP, NULL) !=
@@ -5854,7 +5824,6 @@ ahci_find_dev_signature(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
signature = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
(uint32_t *)AHCI_PORT_PxSIG(ahci_ctlp, port));
-#ifdef AHCI_DEBUG
if (AHCI_ADDR_IS_PMPORT(addrp)) {
AHCIDBG(AHCIDBG_INIT|AHCIDBG_INFO|AHCIDBG_PMULT, ahci_ctlp,
"ahci_find_dev_signature: signature = 0x%x at port %d:%d",
@@ -5864,7 +5833,6 @@ ahci_find_dev_signature(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
"ahci_find_dev_signature: signature = 0x%x at port %d",
signature, port);
}
-#endif
/* NOTE: Only support ATAPI device at controller port. */
if (signature == AHCI_SIGNATURE_ATAPI && !AHCI_ADDR_IS_PORT(addrp))
@@ -5995,6 +5963,87 @@ ahci_start_port(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp, uint8_t port)
}
/*
+ * Set up PxCLB, PxCLBU, PxFB, and PxFBU for a particular port. First, we need
+ * to make sure PxCMD.ST, PxCMD.CR, PxCMD.FRE, and PxCMD.FR are all cleared.
+ * Then set PxCLB, PxCLBU, PxFB, and PxFBU.
+ *
+ * WARNING!!! ahciport_mutex should be acquired before the function is called.
+ */
+static int
+ahci_setup_port_base_addresses(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp)
+{
+ uint8_t port = ahci_portp->ahciport_port_num;
+ uint32_t port_cmd_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port));
+
+ /* Step 1: Make sure both PxCMD.ST and PxCMD.CR are cleared. */
+ if (port_cmd_status & (AHCI_CMD_STATUS_ST | AHCI_CMD_STATUS_CR)) {
+ if (ahci_put_port_into_notrunning_state(ahci_ctlp, ahci_portp,
+ port) != AHCI_SUCCESS)
+ return (AHCI_FAILURE);
+
+ port_cmd_status = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port));
+ }
+
+ /* Step 2: Make sure both PxCMD.FRE and PxCMD.FR are cleared. */
+ if (port_cmd_status & (AHCI_CMD_STATUS_FRE | AHCI_CMD_STATUS_FR)) {
+ int loop_count = 0;
+
+ /* Clear PxCMD.FRE */
+ port_cmd_status &= ~AHCI_CMD_STATUS_FRE;
+ ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port),
+ port_cmd_status);
+
+ /* Wait until PxCMD.FR is cleared */
+ for (;;) {
+ port_cmd_status =
+ ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxCMD(ahci_ctlp, port));
+
+ if (!(port_cmd_status & AHCI_CMD_STATUS_FR))
+ break;
+
+ if (loop_count++ >= AHCI_POLLRATE_PORT_IDLE_FR) {
+ AHCIDBG(AHCIDBG_INIT | AHCIDBG_ERRS, ahci_ctlp,
+ "ahci_setup_port_base_addresses: cannot "
+ "clear PxCMD.FR for port %d.", port);
+
+ /*
+ * We are effectively timing out after 0.5 sec.
+ * This value is specified in the AHCI spec.
+ */
+ return (AHCI_FAILURE);
+ }
+
+ /* Wait for 1 millisec */
+ drv_usecwait(AHCI_1MS_USECS);
+ }
+ }
+
+ /* Step 3: Config Port Command List Base Address */
+ ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxCLB(ahci_ctlp, port),
+ ahci_portp->ahciport_cmd_list_dma_cookie.dmac_address);
+
+ ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxCLBU(ahci_ctlp, port),
+ ahci_portp->ahciport_cmd_list_dma_cookie.dmac_notused);
+
+ /* Step 4: Config Port Received FIS Base Address */
+ ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxFB(ahci_ctlp, port),
+ ahci_portp->ahciport_rcvd_fis_dma_cookie.dmac_address);
+
+ ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
+ (uint32_t *)AHCI_PORT_PxFBU(ahci_ctlp, port),
+ ahci_portp->ahciport_rcvd_fis_dma_cookie.dmac_notused);
+
+ return (AHCI_SUCCESS);
+}
+
+/*
* Allocate the ahci_port_t including Received FIS and Command List.
* The argument - port is the physical port number, and not logical
* port number seen by the SATA framework.
@@ -6028,14 +6077,20 @@ ahci_alloc_port_state(ahci_ctl_t *ahci_ctlp, uint8_t port)
* Allocate memory for received FIS structure and
* command list for this port
*/
- if (ahci_alloc_rcvd_fis(ahci_ctlp, ahci_portp, port) != AHCI_SUCCESS) {
+ if (ahci_alloc_rcvd_fis(ahci_ctlp, ahci_portp) != AHCI_SUCCESS) {
goto err_case1;
}
- if (ahci_alloc_cmd_list(ahci_ctlp, ahci_portp, port) != AHCI_SUCCESS) {
+ if (ahci_alloc_cmd_list(ahci_ctlp, ahci_portp) != AHCI_SUCCESS) {
goto err_case2;
}
+ /* Set up PxCLB, PxCLBU, PxFB, and PxFBU */
+ if (ahci_setup_port_base_addresses(ahci_ctlp, ahci_portp) !=
+ AHCI_SUCCESS) {
+ goto err_case3;
+ }
+
(void) snprintf(taskq_name + strlen(taskq_name),
sizeof (taskq_name) - strlen(taskq_name),
"_port%d", port);
@@ -6087,7 +6142,7 @@ err_case1:
}
/*
- * Reverse of ahci_dealloc_port_state().
+ * Reverse of ahci_alloc_port_state().
*
* WARNING!!! ahcictl_mutex should be acquired before the function
* is called.
@@ -6126,8 +6181,7 @@ ahci_dealloc_port_state(ahci_ctl_t *ahci_ctlp, uint8_t port)
* is called.
*/
static int
-ahci_alloc_rcvd_fis(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
- uint8_t port)
+ahci_alloc_rcvd_fis(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp)
{
size_t rcvd_fis_size;
size_t ret_len;
@@ -6185,11 +6239,6 @@ ahci_alloc_rcvd_fis(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
bzero((void *)ahci_portp->ahciport_rcvd_fis, rcvd_fis_size);
- /* Config Port Received FIS Base Address */
- ddi_put64(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint64_t *)AHCI_PORT_PxFB(ahci_ctlp, port),
- ahci_portp->ahciport_rcvd_fis_dma_cookie.dmac_laddress);
-
AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "64-bit, dma address: 0x%llx",
ahci_portp->ahciport_rcvd_fis_dma_cookie.dmac_laddress);
AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "32-bit, dma address: 0x%x",
@@ -6226,8 +6275,7 @@ ahci_dealloc_rcvd_fis(ahci_port_t *ahci_portp)
* is called.
*/
static int
-ahci_alloc_cmd_list(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
- uint8_t port)
+ahci_alloc_cmd_list(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp)
{
size_t cmd_list_size;
size_t ret_len;
@@ -6285,11 +6333,6 @@ ahci_alloc_cmd_list(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp,
bzero((void *)ahci_portp->ahciport_cmd_list, cmd_list_size);
- /* Config Port Command List Base Address */
- ddi_put64(ahci_ctlp->ahcictl_ahci_acc_handle,
- (uint64_t *)AHCI_PORT_PxCLB(ahci_ctlp, port),
- ahci_portp->ahciport_cmd_list_dma_cookie.dmac_laddress);
-
AHCIDBG(AHCIDBG_INIT, ahci_ctlp, "64-bit, dma address: 0x%llx",
ahci_portp->ahciport_cmd_list_dma_cookie.dmac_laddress);
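
The ahci.c portion of this merge moves the PxCLB/PxCLBU and PxFB/PxFBU programming out of the allocation routines and into the new ahci_setup_port_base_addresses(), which first forces the port idle. The following condensed sketch is illustrative only: it reuses the driver types and helpers visible in the diff (ahcivar.h), elides the error handling and the PxCMD.FR polling loop, and is not code from the commit.

	static int
	port_base_setup_sketch(ahci_ctl_t *ahci_ctlp, ahci_port_t *ahci_portp)
	{
		uint8_t port = ahci_portp->ahciport_port_num;

		/* 1. Port must be idle: PxCMD.ST and PxCMD.CR clear. */
		if (ahci_put_port_into_notrunning_state(ahci_ctlp, ahci_portp,
		    port) != AHCI_SUCCESS)
			return (AHCI_FAILURE);

		/*
		 * 2. FIS receive must be off: clear PxCMD.FRE and poll until
		 *    PxCMD.FR reads back clear (the real code bounds this at
		 *    AHCI_POLLRATE_PORT_IDLE_FR milliseconds).
		 */

		/* 3. Only then write the base addresses, 32 bits at a time. */
		ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
		    (uint32_t *)AHCI_PORT_PxCLB(ahci_ctlp, port),
		    ahci_portp->ahciport_cmd_list_dma_cookie.dmac_address);
		ddi_put32(ahci_ctlp->ahcictl_ahci_acc_handle,
		    (uint32_t *)AHCI_PORT_PxFB(ahci_ctlp, port),
		    ahci_portp->ahciport_rcvd_fis_dma_cookie.dmac_address);

		return (AHCI_SUCCESS);
	}

The upper halves (PxCLBU/PxFBU) are written the same way from dmac_notused, which replaces the single 64-bit ddi_put64() of dmac_laddress that the old allocation paths performed.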
diff --git a/usr/src/uts/common/sys/fm/fs/zfs.h b/usr/src/uts/common/sys/fm/fs/zfs.h
index c752edc99b..029af540b3 100644
--- a/usr/src/uts/common/sys/fm/fs/zfs.h
+++ b/usr/src/uts/common/sys/fm/fs/zfs.h
@@ -46,6 +46,7 @@ extern "C" {
#define FM_EREPORT_ZFS_IO_FAILURE "io_failure"
#define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure"
#define FM_EREPORT_ZFS_LOG_REPLAY "log_replay"
+#define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write"
#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool"
#define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode"
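
The new FM_EREPORT_ZFS_CONFIG_CACHE_WRITE subclass added by this merge is combined with the standard FMA prefixes when an ereport is posted, so consumers would match on a class of the form ereport.fs.zfs.config_cache_write. The small stand-alone check below is only an illustration of that composition; the FM_EREPORT_CLASS and ZFS_ERROR_CLASS values are assumed to be the usual "ereport" and "fs.zfs" definitions from sys/fm/protocol.h and sys/fm/fs/zfs.h.

	#include <stdio.h>

	/* assumed values; only the last define is introduced by this merge */
	#define	FM_EREPORT_CLASS			"ereport"
	#define	ZFS_ERROR_CLASS				"fs.zfs"
	#define	FM_EREPORT_ZFS_CONFIG_CACHE_WRITE	"config_cache_write"

	int
	main(void)
	{
		char class[64];

		(void) snprintf(class, sizeof (class), "%s.%s.%s",
		    FM_EREPORT_CLASS, ZFS_ERROR_CLASS,
		    FM_EREPORT_ZFS_CONFIG_CACHE_WRITE);
		(void) printf("%s\n", class);
		return (0);
	}

This prints ereport.fs.zfs.config_cache_write.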
diff --git a/usr/src/uts/common/sys/sata/adapters/ahci/ahcireg.h b/usr/src/uts/common/sys/sata/adapters/ahci/ahcireg.h
index 5614c929d5..e738783dfe 100644
--- a/usr/src/uts/common/sys/sata/adapters/ahci/ahcireg.h
+++ b/usr/src/uts/common/sys/sata/adapters/ahci/ahcireg.h
@@ -23,7 +23,9 @@
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-
+/*
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
+ */
#ifndef _AHCIREG_H
#define _AHCIREG_H
@@ -134,6 +136,10 @@ extern "C" {
#define AHCI_GLOBAL_EM_LOC(ahci_ctlp) (AHCI_GLOBAL_OFFSET(ahci_ctlp) + 0x1c)
/* Enclosure Management Control */
#define AHCI_GLOBAL_EM_CTL(ahci_ctlp) (AHCI_GLOBAL_OFFSET(ahci_ctlp) + 0x20)
+ /* HBA Capabilities Extended (AHCI spec 1.2) */
+#define AHCI_GLOBAL_CAP2(ahci_ctlp) (AHCI_GLOBAL_OFFSET(ahci_ctlp) + 0x24)
+ /* BIOS/OS Handoff Control and Status (AHCI spec 1.2) */
+#define AHCI_GLOBAL_BOHC(ahci_ctlp) (AHCI_GLOBAL_OFFSET(ahci_ctlp) + 0x28)
#define AHCI_PORT_IMPLEMENTED(ahci_ctlp, port) \
((0x1 << port) & ahci_ctlp->ahcictl_ports_implemented)
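
The two new global register offsets follow the AHCI 1.2 layout (CAP2 at the global base + 0x24, BOHC at + 0x28). A hypothetical read, mirroring how ahci.c accesses the other global registers and assuming an initialized ahci_ctl_t *ahci_ctlp, would look like the sketch below; it is not part of this change.

	uint32_t cap2, bohc;

	cap2 = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
	    (uint32_t *)AHCI_GLOBAL_CAP2(ahci_ctlp));
	bohc = ddi_get32(ahci_ctlp->ahcictl_ahci_acc_handle,
	    (uint32_t *)AHCI_GLOBAL_BOHC(ahci_ctlp));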
diff --git a/usr/src/uts/common/sys/sata/adapters/ahci/ahcivar.h b/usr/src/uts/common/sys/sata/adapters/ahci/ahcivar.h
index bf8671425c..b28d0b6464 100644
--- a/usr/src/uts/common/sys/sata/adapters/ahci/ahcivar.h
+++ b/usr/src/uts/common/sys/sata/adapters/ahci/ahcivar.h
@@ -21,6 +21,7 @@
/*
* Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
*/
@@ -133,9 +134,6 @@ typedef struct ahci_pmult_info ahci_pmult_info_t;
/*
* flags for ahciport_flags
*
- * AHCI_PORT_FLAG_SPINUP: this flag will be set when a HBA which supports
- * staggered spin-up needs to do a spin-up.
- *
* AHCI_PORT_FLAG_MOPPING: this flag will be set when the HBA is stopped,
* and all the outstanding commands need to be aborted and sent to upper
* layers.
@@ -173,7 +171,6 @@ typedef struct ahci_pmult_info ahci_pmult_info_t;
 * will be printed. Note that, for IDENTIFY DEVICE command sent to ATAPI
* device or ATAPI PACKET command, this flag won't be set.
*/
-#define AHCI_PORT_FLAG_SPINUP 0x01
#define AHCI_PORT_FLAG_MOPPING 0x02
#define AHCI_PORT_FLAG_POLLING 0x04
#define AHCI_PORT_FLAG_RQSENSE 0x08
@@ -199,7 +196,6 @@ typedef struct ahci_port {
ahci_pmult_info_t *ahciport_pmult_info;
/*
- * AHCI_PORT_FLAG_SPINUP
* AHCI_PORT_FLAG_MOPPING
* AHCI_PORT_FLAG_POLLING
* AHCI_PORT_FLAG_RQSENSE
@@ -552,6 +548,7 @@ _NOTE(MUTEX_PROTECTS_DATA(ahci_ctl_t::ahcictl_mutex,
#define AHCI_POLLRATE_PORT_IDLE 50
#define AHCI_POLLRATE_PORT_SOFTRESET 100
#define AHCI_POLLRATE_GET_SPKT 100
+#define AHCI_POLLRATE_PORT_IDLE_FR 500
/* Clearing & setting the n'th bit in a given tag */
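
The new AHCI_POLLRATE_PORT_IDLE_FR constant bounds the PxCMD.FR wait loop in ahci_setup_port_base_addresses(), where each iteration sleeps for AHCI_1MS_USECS. Taking that macro at its face value of one millisecond (an assumption; its definition is not shown in this diff), the bound works out to the 0.5 second limit the code comment attributes to the AHCI spec:

	/* 500 iterations * 1000 us per drv_usecwait() = 500000 us = 0.5 s */
	#define	AHCI_POLLRATE_PORT_IDLE_FR	500	/* from this merge */
	#define	AHCI_1MS_USECS			1000	/* assumed: 1 ms */

	static const int fr_timeout_usecs =
	    AHCI_POLLRATE_PORT_IDLE_FR * AHCI_1MS_USECS;	/* 500000 */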