author     brendan <none@none>  2007-11-09 21:33:30 -0800
committer  brendan <none@none>  2007-11-09 21:33:30 -0800
commit     fa94a07fd0519b8abfd871ad8fe60e6bebe1e2bb
tree       1c594a9272fa03552a1cca3328b7820b409a2563 /usr/src
parent     380789fc80376bd1573770361cb177a08c7e3524
PSARC 2007/618 ZFS L2ARC
6536054 second tier ("external") ARC
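
This change introduces the "cache" vdev class (L2ARC devices). A minimal
sketch of the new command-line surface, assuming hypothetical device names:

    # zpool create tank mirror c0t0d0 c0t1d0 cache c0t2d0
    # zpool add tank cache c0t3d0
    # zpool remove tank c0t2d0 c0t3d0

zpool remove now accepts multiple devices; cache devices, like hot spares,
can be removed only while inactive.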
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/cmd/zinject/translate.c                  |    6
-rw-r--r--  usr/src/cmd/zpool/zpool_main.c                   |   99
-rw-r--r--  usr/src/cmd/zpool/zpool_vdev.c                   |   50
-rw-r--r--  usr/src/grub/grub-0.95/stage2/zfs-include/dmu.h  |    1
-rw-r--r--  usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h  |   11
-rw-r--r--  usr/src/lib/libdiskmgt/common/entry.c            |    4
-rw-r--r--  usr/src/lib/libdiskmgt/common/inuse_zpool.c      |    5
-rw-r--r--  usr/src/lib/libdiskmgt/common/libdiskmgt.h       |    1
-rw-r--r--  usr/src/lib/libzfs/common/libzfs.h               |    4
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_import.c        |   71
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_pool.c          |  138
-rw-r--r--  usr/src/lib/libzfs/common/libzfs_util.c          |    2
-rw-r--r--  usr/src/uts/common/fs/zfs/arc.c                  | 1278
-rw-r--r--  usr/src/uts/common/fs/zfs/dmu.c                  |    2
-rw-r--r--  usr/src/uts/common/fs/zfs/metaslab.c             |    6
-rw-r--r--  usr/src/uts/common/fs/zfs/spa.c                  |  910
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_config.c           |    2
-rw-r--r--  usr/src/uts/common/fs/zfs/spa_misc.c             |  247
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/arc.h              |   10
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/dmu.h              |    1
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa.h              |    9
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/spa_impl.h         |   19
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev.h             |   10
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/vdev_impl.h        |    2
-rw-r--r--  usr/src/uts/common/fs/zfs/sys/zio.h              |    6
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev.c                 |   61
-rw-r--r--  usr/src/uts/common/fs/zfs/vdev_label.c           |   96
-rw-r--r--  usr/src/uts/common/fs/zfs/zfs_ioctl.c            |   38
-rw-r--r--  usr/src/uts/common/fs/zfs/zio.c                  |   18
-rw-r--r--  usr/src/uts/common/sys/fs/zfs.h                  |   14
30 files changed, 2618 insertions, 503 deletions
diff --git a/usr/src/cmd/zinject/translate.c b/usr/src/cmd/zinject/translate.c
index b4f6693aa1..e778c1b6c0 100644
--- a/usr/src/cmd/zinject/translate.c
+++ b/usr/src/cmd/zinject/translate.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -437,7 +437,7 @@ translate_device(const char *pool, const char *device, zinject_record_t *record)
char *end;
zpool_handle_t *zhp;
nvlist_t *tgt;
- boolean_t isspare;
+ boolean_t isspare, iscache;
/*
* Given a device name or GUID, create an appropriate injection record
@@ -448,7 +448,7 @@ translate_device(const char *pool, const char *device, zinject_record_t *record)
record->zi_guid = strtoull(device, &end, 16);
if (record->zi_guid == 0 || *end != '\0') {
- tgt = zpool_find_vdev(zhp, device, &isspare);
+ tgt = zpool_find_vdev(zhp, device, &isspare, &iscache);
if (tgt == NULL) {
(void) fprintf(stderr, "cannot find device '%s' in "
diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c
index 45cf03abc2..a482625d75 100644
--- a/usr/src/cmd/zpool/zpool_main.c
+++ b/usr/src/cmd/zpool/zpool_main.c
@@ -213,7 +213,7 @@ get_usage(zpool_help_t idx) {
return (gettext("\treplace [-f] <pool> <device> "
"[new-device]\n"));
case HELP_REMOVE:
- return (gettext("\tremove <pool> <device>\n"));
+ return (gettext("\tremove <pool> <device> ...\n"));
case HELP_SCRUB:
return (gettext("\tscrub [-s] <pool> ...\n"));
case HELP_STATUS:
@@ -493,17 +493,17 @@ zpool_do_add(int argc, char **argv)
}
/*
- * zpool remove <pool> <vdev>
+ * zpool remove <pool> <vdev> ...
*
* Removes the given vdev from the pool. Currently, this only supports removing
- * spares from the pool. Eventually, we'll want to support removing leaf vdevs
- * (as an alias for 'detach') as well as toplevel vdevs.
+ * spares and cache devices from the pool. Eventually, we'll want to support
+ * removing leaf vdevs (as an alias for 'detach') as well as toplevel vdevs.
*/
int
zpool_do_remove(int argc, char **argv)
{
char *poolname;
- int ret;
+ int i, ret = 0;
zpool_handle_t *zhp;
argc--;
@@ -524,7 +524,10 @@ zpool_do_remove(int argc, char **argv)
if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
return (1);
- ret = (zpool_vdev_remove(zhp, argv[1]) != 0);
+ for (i = 1; i < argc; i++) {
+ if (zpool_vdev_remove(zhp, argv[i]) != 0)
+ ret = 1;
+ }
return (ret);
}
@@ -910,6 +913,14 @@ max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max)
max = ret;
}
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++)
+ if ((ret = max_width(zhp, child[c], depth + 2,
+ max)) > max)
+ max = ret;
+ }
+
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
&child, &children) == 0) {
for (c = 0; c < children; c++)
@@ -995,15 +1006,24 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth,
free(vname);
}
- if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
- &child, &children) != 0)
- return;
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0) {
+ (void) printf(gettext("\tcache\n"));
+ for (c = 0; c < children; c++) {
+ vname = zpool_vdev_name(g_zfs, NULL, child[c]);
+ (void) printf("\t %s\n", vname);
+ free(vname);
+ }
+ }
- (void) printf(gettext("\tspares\n"));
- for (c = 0; c < children; c++) {
- vname = zpool_vdev_name(g_zfs, NULL, child[c]);
- (void) printf("\t %s\n", vname);
- free(vname);
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+ &child, &children) == 0) {
+ (void) printf(gettext("\tspares\n"));
+ for (c = 0; c < children; c++) {
+ vname = zpool_vdev_name(g_zfs, NULL, child[c]);
+ (void) printf("\t %s\n", vname);
+ free(vname);
+ }
}
}
@@ -1655,6 +1675,28 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv,
newchild[c], cb, depth + 2);
free(vname);
}
+
+ /*
+ * Include level 2 ARC devices in iostat output
+ */
+ if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE,
+ &newchild, &children) != 0)
+ return;
+
+ if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE,
+ &oldchild, &c) != 0)
+ return;
+
+ if (children > 0) {
+ (void) printf("%-*s - - - - - "
+ "-\n", cb->cb_namewidth, "cache");
+ for (c = 0; c < children; c++) {
+ vname = zpool_vdev_name(g_zfs, zhp, newchild[c]);
+ print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL,
+ newchild[c], cb, depth + 2);
+ free(vname);
+ }
+ }
}
static int
@@ -2805,6 +2847,26 @@ print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares,
}
}
+static void
+print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache,
+ int namewidth)
+{
+ uint_t i;
+ char *name;
+
+ if (nl2cache == 0)
+ return;
+
+ (void) printf(gettext("\tcache\n"));
+
+ for (i = 0; i < nl2cache; i++) {
+ name = zpool_vdev_name(g_zfs, zhp, l2cache[i]);
+ print_status_config(zhp, name, l2cache[i],
+ namewidth, 2, B_FALSE, B_FALSE);
+ free(name);
+ }
+}
+
/*
* Display a summary of pool status. Displays a summary such as:
*
@@ -2996,8 +3058,8 @@ status_callback(zpool_handle_t *zhp, void *data)
if (config != NULL) {
int namewidth;
uint64_t nerr;
- nvlist_t **spares;
- uint_t nspares;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
(void) printf(gettext(" scrub: "));
@@ -3016,6 +3078,10 @@ status_callback(zpool_handle_t *zhp, void *data)
print_status_config(zhp, "logs", nvroot, namewidth, 0,
B_FALSE, B_TRUE);
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0)
+ print_l2cache(zhp, l2cache, nl2cache, namewidth);
+
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
&spares, &nspares) == 0)
print_spares(zhp, spares, nspares, namewidth);
@@ -3303,6 +3369,7 @@ zpool_do_upgrade(int argc, char **argv)
(void) printf(gettext(" 8 Delegated administration\n"));
(void) printf(gettext(" 9 refquota and refreservation "
"properties\n"));
+ (void) printf(gettext(" 10 Cache devices\n"));
(void) printf(gettext("For more information on a particular "
"version, including supported releases, see:\n\n"));
(void) printf("http://www.opensolaris.org/os/community/zfs/"
diff --git a/usr/src/cmd/zpool/zpool_vdev.c b/usr/src/cmd/zpool/zpool_vdev.c
index 48615d991e..8402521a82 100644
--- a/usr/src/cmd/zpool/zpool_vdev.c
+++ b/usr/src/cmd/zpool/zpool_vdev.c
@@ -968,6 +968,12 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv)
if ((ret = make_disks(zhp, child[c])) != 0)
return (ret);
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0)
+ for (c = 0; c < children; c++)
+ if ((ret = make_disks(zhp, child[c])) != 0)
+ return (ret);
+
return (0);
}
@@ -1077,6 +1083,14 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing,
if ((ret = check_in_use(config, child[c], force,
isreplacing, B_TRUE)) != 0)
return (ret);
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0)
+ for (c = 0; c < children; c++)
+ if ((ret = check_in_use(config, child[c], force,
+ isreplacing, B_FALSE)) != 0)
+ return (ret);
+
return (0);
}
@@ -1113,6 +1127,12 @@ is_grouping(const char *type, int *mindev)
return (VDEV_TYPE_LOG);
}
+ if (strcmp(type, "cache") == 0) {
+ if (mindev != NULL)
+ *mindev = 1;
+ return (VDEV_TYPE_L2CACHE);
+ }
+
return (NULL);
}
@@ -1125,8 +1145,8 @@ is_grouping(const char *type, int *mindev)
nvlist_t *
construct_spec(int argc, char **argv)
{
- nvlist_t *nvroot, *nv, **top, **spares;
- int t, toplevels, mindev, nspares, nlogs;
+ nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
+ int t, toplevels, mindev, nspares, nlogs, nl2cache;
const char *type;
uint64_t is_log;
boolean_t seen_logs;
@@ -1134,8 +1154,10 @@ construct_spec(int argc, char **argv)
top = NULL;
toplevels = 0;
spares = NULL;
+ l2cache = NULL;
nspares = 0;
nlogs = 0;
+ nl2cache = 0;
is_log = B_FALSE;
seen_logs = B_FALSE;
@@ -1180,6 +1202,17 @@ construct_spec(int argc, char **argv)
continue;
}
+ if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
+ if (l2cache != NULL) {
+ (void) fprintf(stderr,
+ gettext("invalid vdev "
+ "specification: 'cache' can be "
+ "specified only once\n"));
+ return (NULL);
+ }
+ is_log = B_FALSE;
+ }
+
if (is_log) {
if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
(void) fprintf(stderr,
@@ -1219,6 +1252,10 @@ construct_spec(int argc, char **argv)
spares = child;
nspares = children;
continue;
+ } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
+ l2cache = child;
+ nl2cache = children;
+ continue;
} else {
verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
0) == 0);
@@ -1259,7 +1296,7 @@ construct_spec(int argc, char **argv)
top[toplevels - 1] = nv;
}
- if (toplevels == 0 && nspares == 0) {
+ if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
(void) fprintf(stderr, gettext("invalid vdev "
"specification: at least one toplevel vdev must be "
"specified\n"));
@@ -1283,13 +1320,20 @@ construct_spec(int argc, char **argv)
if (nspares != 0)
verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
spares, nspares) == 0);
+ if (nl2cache != 0)
+ verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ l2cache, nl2cache) == 0);
for (t = 0; t < toplevels; t++)
nvlist_free(top[t]);
for (t = 0; t < nspares; t++)
nvlist_free(spares[t]);
+ for (t = 0; t < nl2cache; t++)
+ nvlist_free(l2cache[t]);
if (spares)
free(spares);
+ if (l2cache)
+ free(l2cache);
free(top);
return (nvroot);
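
Note that construct_spec() permits only one 'cache' grouping per command.
A hedged example of the rejection path (device names hypothetical):

    # zpool create tank c0t0d0 cache c0t1d0 cache c0t2d0
    invalid vdev specification: 'cache' can be specified only once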
diff --git a/usr/src/grub/grub-0.95/stage2/zfs-include/dmu.h b/usr/src/grub/grub-0.95/stage2/zfs-include/dmu.h
index e57e45682d..3b18d8ada5 100644
--- a/usr/src/grub/grub-0.95/stage2/zfs-include/dmu.h
+++ b/usr/src/grub/grub-0.95/stage2/zfs-include/dmu.h
@@ -102,5 +102,6 @@ typedef enum dmu_objset_type {
#define DMU_POOL_DEFLATE "deflate"
#define DMU_POOL_HISTORY "history"
#define DMU_POOL_PROPS "pool_props"
+#define DMU_POOL_L2CACHE "l2cache"
#endif /* _SYS_DMU_H */
diff --git a/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h b/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h
index 9619d5bfc2..82d767a010 100644
--- a/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h
+++ b/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h
@@ -38,7 +38,8 @@
#define SPA_VERSION_7 7ULL
#define SPA_VERSION_8 8ULL
#define SPA_VERSION_9 9ULL
-#define SPA_VERSION SPA_VERSION_9
+#define SPA_VERSION_10 10ULL
+#define SPA_VERSION SPA_VERSION_10
/*
* The following are configuration names used in the nvlist describing a pool's
@@ -71,6 +72,7 @@
#define ZPOOL_CONFIG_SPARES "spares"
#define ZPOOL_CONFIG_IS_SPARE "is_spare"
#define ZPOOL_CONFIG_NPARITY "nparity"
+#define ZPOOL_CONFIG_L2CACHE "l2cache"
#define VDEV_TYPE_ROOT "root"
#define VDEV_TYPE_MIRROR "mirror"
@@ -80,17 +82,20 @@
#define VDEV_TYPE_FILE "file"
#define VDEV_TYPE_MISSING "missing"
#define VDEV_TYPE_SPARE "spare"
+#define VDEV_TYPE_L2CACHE "l2cache"
/*
* pool state. The following states are written to disk as part of the normal
- * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE. The remaining states are
- * software abstractions used at various levels to communicate pool state.
+ * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining
+ * states are software abstractions used at various levels to communicate pool
+ * state.
*/
typedef enum pool_state {
POOL_STATE_ACTIVE = 0, /* In active use */
POOL_STATE_EXPORTED, /* Explicitly exported */
POOL_STATE_DESTROYED, /* Explicitly destroyed */
POOL_STATE_SPARE, /* Reserved for hot spare use */
+ POOL_STATE_L2CACHE, /* Level 2 ARC device */
POOL_STATE_UNINITIALIZED, /* Internal spa_t state */
POOL_STATE_UNAVAIL, /* Internal libzfs state */
POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */
diff --git a/usr/src/lib/libdiskmgt/common/entry.c b/usr/src/lib/libdiskmgt/common/entry.c
index a7756b02d4..1d3b2313e4 100644
--- a/usr/src/lib/libdiskmgt/common/entry.c
+++ b/usr/src/lib/libdiskmgt/common/entry.c
@@ -1118,6 +1118,10 @@ dm_get_usage_string(char *what, char *how, char **usage_string)
*usage_string = dgettext(TEXT_DOMAIN,
"%s is reserved as a hot spare for ZFS pool %s. Please "
"see zpool(1M).\n");
+ } else if (strcmp(what, DM_USE_L2CACHE_ZPOOL) == 0) {
+ *usage_string = dgettext(TEXT_DOMAIN,
+ "%s is in use as a cache device for ZFS pool %s. "
+ "Please see zpool(1M).\n");
}
}
void
diff --git a/usr/src/lib/libdiskmgt/common/inuse_zpool.c b/usr/src/lib/libdiskmgt/common/inuse_zpool.c
index a7cf203a2f..e4f9ab5a3b 100644
--- a/usr/src/lib/libdiskmgt/common/inuse_zpool.c
+++ b/usr/src/lib/libdiskmgt/common/inuse_zpool.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -102,6 +102,9 @@ inuse_zpool_common(char *slice, nvlist_t *attrs, int *errp, char *type)
} else if (state == POOL_STATE_SPARE) {
found = 1;
type = DM_USE_SPARE_ZPOOL;
+ } else if (state == POOL_STATE_L2CACHE) {
+ found = 1;
+ type = DM_USE_L2CACHE_ZPOOL;
}
} else {
found = 1;
diff --git a/usr/src/lib/libdiskmgt/common/libdiskmgt.h b/usr/src/lib/libdiskmgt/common/libdiskmgt.h
index 3a580cbfb4..303b03f171 100644
--- a/usr/src/lib/libdiskmgt/common/libdiskmgt.h
+++ b/usr/src/lib/libdiskmgt/common/libdiskmgt.h
@@ -216,6 +216,7 @@ typedef enum {
#define DM_USE_EXPORTED_ZPOOL "exported_zpool"
#define DM_USE_ACTIVE_ZPOOL "active_zpool"
#define DM_USE_SPARE_ZPOOL "spare_zpool"
+#define DM_USE_L2CACHE_ZPOOL "l2cache_zpool"
/* event */
#define DM_EV_NAME "name"
diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h
index e2ebae01f4..f4ef8adbe5 100644
--- a/usr/src/lib/libzfs/common/libzfs.h
+++ b/usr/src/lib/libzfs/common/libzfs.h
@@ -113,6 +113,7 @@ enum {
EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */
EZFS_SHARESMBFAILED, /* failed to share over smb */
EZFS_BADCACHE, /* bad cache file */
+ EZFS_ISL2CACHE, /* device is for the level 2 ARC */
EZFS_UNKNOWN
};
@@ -216,7 +217,8 @@ extern int zpool_vdev_fault(zpool_handle_t *, uint64_t);
extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t);
extern int zpool_vdev_clear(zpool_handle_t *, uint64_t);
-extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *);
+extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *,
+ boolean_t *);
extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *);
/*
diff --git a/usr/src/lib/libzfs/common/libzfs_import.c b/usr/src/lib/libzfs/common/libzfs_import.c
index 50e9ddf97e..a8ae241f88 100644
--- a/usr/src/lib/libzfs/common/libzfs_import.c
+++ b/usr/src/lib/libzfs/common/libzfs_import.c
@@ -213,11 +213,13 @@ add_config(libzfs_handle_t *hdl, pool_list_t *pl, const char *path,
name_entry_t *ne;
/*
- * If this is a hot spare not currently in use, add it to the list of
- * names to translate, but don't do anything else.
+ * If this is a hot spare not currently in use or level 2 cache
+ * device, add it to the list of names to translate, but don't do
+ * anything else.
*/
if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
- &state) == 0 && state == POOL_STATE_SPARE &&
+ &state) == 0 &&
+ (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL)
return (-1);
@@ -415,8 +417,8 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl)
vdev_entry_t *ve;
config_entry_t *ce;
nvlist_t *ret = NULL, *config = NULL, *tmp, *nvtop, *nvroot;
- nvlist_t **spares;
- uint_t i, nspares;
+ nvlist_t **spares, **l2cache;
+ uint_t i, nspares, nl2cache;
boolean_t config_seen;
uint64_t best_txg;
char *name, *hostname;
@@ -647,6 +649,17 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl)
}
/*
+ * Update the paths for l2cache devices.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0) {
+ for (i = 0; i < nl2cache; i++) {
+ if (fix_paths(l2cache[i], pl->names) != 0)
+ goto nomem;
+ }
+ }
+
+ /*
* Restore the original information read from the actual label.
*/
(void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID,
@@ -728,12 +741,12 @@ zpool_read_label(int fd, nvlist_t **config)
continue;
if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE,
- &state) != 0 || state > POOL_STATE_SPARE) {
+ &state) != 0 || state > POOL_STATE_L2CACHE) {
nvlist_free(*config);
continue;
}
- if (state != POOL_STATE_SPARE &&
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
(nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG,
&txg) != 0 || txg == 0)) {
nvlist_free(*config);
@@ -1001,27 +1014,28 @@ find_guid(nvlist_t *nv, uint64_t guid)
return (B_FALSE);
}
-typedef struct spare_cbdata {
+typedef struct aux_cbdata {
+ const char *cb_type;
uint64_t cb_guid;
zpool_handle_t *cb_zhp;
-} spare_cbdata_t;
+} aux_cbdata_t;
static int
-find_spare(zpool_handle_t *zhp, void *data)
+find_aux(zpool_handle_t *zhp, void *data)
{
- spare_cbdata_t *cbp = data;
- nvlist_t **spares;
- uint_t i, nspares;
+ aux_cbdata_t *cbp = data;
+ nvlist_t **list;
+ uint_t i, count;
uint64_t guid;
nvlist_t *nvroot;
verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- for (i = 0; i < nspares; i++) {
- verify(nvlist_lookup_uint64(spares[i],
+ if (nvlist_lookup_nvlist_array(nvroot, cbp->cb_type,
+ &list, &count) == 0) {
+ for (i = 0; i < count; i++) {
+ verify(nvlist_lookup_uint64(list[i],
ZPOOL_CONFIG_GUID, &guid) == 0);
if (guid == cbp->cb_guid) {
cbp->cb_zhp = zhp;
@@ -1050,7 +1064,7 @@ zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr,
zpool_handle_t *zhp;
nvlist_t *pool_config;
uint64_t stateval, isspare;
- spare_cbdata_t cb = { 0 };
+ aux_cbdata_t cb = { 0 };
boolean_t isactive;
*inuse = B_FALSE;
@@ -1068,7 +1082,7 @@ zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr,
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
&vdev_guid) == 0);
- if (stateval != POOL_STATE_SPARE) {
+ if (stateval != POOL_STATE_SPARE && stateval != POOL_STATE_L2CACHE) {
verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
&name) == 0);
verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
@@ -1147,7 +1161,24 @@ zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr,
*/
cb.cb_zhp = NULL;
cb.cb_guid = vdev_guid;
- if (zpool_iter(hdl, find_spare, &cb) == 1) {
+ cb.cb_type = ZPOOL_CONFIG_SPARES;
+ if (zpool_iter(hdl, find_aux, &cb) == 1) {
+ name = (char *)zpool_get_name(cb.cb_zhp);
+ ret = TRUE;
+ } else {
+ ret = FALSE;
+ }
+ break;
+
+ case POOL_STATE_L2CACHE:
+
+ /*
+ * Check if any pool is currently using this l2cache device.
+ */
+ cb.cb_zhp = NULL;
+ cb.cb_guid = vdev_guid;
+ cb.cb_type = ZPOOL_CONFIG_L2CACHE;
+ if (zpool_iter(hdl, find_aux, &cb) == 1) {
name = (char *)zpool_get_name(cb.cb_zhp);
ret = TRUE;
} else {
diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c
index 7320378e6f..75c1ce7492 100644
--- a/usr/src/lib/libzfs/common/libzfs_pool.c
+++ b/usr/src/lib/libzfs/common/libzfs_pool.c
@@ -815,6 +815,11 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot,
"one or more devices is out of space"));
return (zfs_error(hdl, EZFS_BADDEV, msg));
+ case ENOTBLK:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "cache device must be a disk or disk slice"));
+ return (zfs_error(hdl, EZFS_BADDEV, msg));
+
default:
return (zpool_standard_error(hdl, errno, msg));
}
@@ -898,14 +903,14 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
int ret;
libzfs_handle_t *hdl = zhp->zpool_hdl;
char msg[1024];
- nvlist_t **spares;
- uint_t nspares;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
(void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN,
"cannot add to '%s'"), zhp->zpool_name);
- if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL)
- < SPA_VERSION_SPARES &&
+ if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
+ SPA_VERSION_SPARES &&
nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
&spares, &nspares) == 0) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
@@ -913,6 +918,15 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
return (zfs_error(hdl, EZFS_BADVERSION, msg));
}
+ if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) <
+ SPA_VERSION_L2CACHE &&
+ nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0) {
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be "
+ "upgraded to add cache devices"));
+ return (zfs_error(hdl, EZFS_BADVERSION, msg));
+ }
+
if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0)
return (-1);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
@@ -963,6 +977,12 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot)
(void) zfs_error(hdl, EZFS_POOL_NOTSUP, msg);
break;
+ case ENOTBLK:
+ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+ "cache device must be a disk or disk slice"));
+ (void) zfs_error(hdl, EZFS_BADDEV, msg);
+ break;
+
default:
(void) zpool_standard_error(hdl, errno, msg);
}
@@ -1172,7 +1192,7 @@ zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type)
*/
static nvlist_t *
vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid,
- boolean_t *avail_spare)
+ boolean_t *avail_spare, boolean_t *l2cache)
{
uint_t c, children;
nvlist_t **child;
@@ -1214,25 +1234,37 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid,
for (c = 0; c < children; c++)
if ((ret = vdev_to_nvlist_iter(child[c], search, guid,
- avail_spare)) != NULL)
+ avail_spare, l2cache)) != NULL)
return (ret);
if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
&child, &children) == 0) {
for (c = 0; c < children; c++) {
if ((ret = vdev_to_nvlist_iter(child[c], search, guid,
- avail_spare)) != NULL) {
+ avail_spare, l2cache)) != NULL) {
*avail_spare = B_TRUE;
return (ret);
}
}
}
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++) {
+ if ((ret = vdev_to_nvlist_iter(child[c], search, guid,
+ avail_spare, l2cache)) != NULL) {
+ *l2cache = B_TRUE;
+ return (ret);
+ }
+ }
+ }
+
return (NULL);
}
nvlist_t *
-zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare)
+zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
+ boolean_t *l2cache)
{
char buf[MAXPATHLEN];
const char *search;
@@ -1254,29 +1286,32 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare)
&nvroot) == 0);
*avail_spare = B_FALSE;
- return (vdev_to_nvlist_iter(nvroot, search, guid, avail_spare));
+ *l2cache = B_FALSE;
+ return (vdev_to_nvlist_iter(nvroot, search, guid, avail_spare,
+ l2cache));
}
/*
- * Returns TRUE if the given guid corresponds to a spare (INUSE or not).
+ * Returns TRUE if the given guid corresponds to the given type.
+ * This is used to check for hot spares (INUSE or not), and level 2 cache
+ * devices.
*/
static boolean_t
-is_spare(zpool_handle_t *zhp, uint64_t guid)
+is_guid_type(zpool_handle_t *zhp, uint64_t guid, const char *type)
{
- uint64_t spare_guid;
+ uint64_t target_guid;
nvlist_t *nvroot;
- nvlist_t **spares;
- uint_t nspares;
+ nvlist_t **list;
+ uint_t count;
int i;
verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- for (i = 0; i < nspares; i++) {
- verify(nvlist_lookup_uint64(spares[i],
- ZPOOL_CONFIG_GUID, &spare_guid) == 0);
- if (guid == spare_guid)
+ if (nvlist_lookup_nvlist_array(nvroot, type, &list, &count) == 0) {
+ for (i = 0; i < count; i++) {
+ verify(nvlist_lookup_uint64(list[i], ZPOOL_CONFIG_GUID,
+ &target_guid) == 0);
+ if (guid == target_guid)
return (B_TRUE);
}
}
@@ -1295,21 +1330,26 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags,
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tgt;
- boolean_t avail_spare;
+ boolean_t avail_spare, l2cache;
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot online %s"), path);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if ((tgt = zpool_find_vdev(zhp, path, &avail_spare)) == NULL)
+ if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache)) == NULL)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
- if (avail_spare || is_spare(zhp, zc.zc_guid) == B_TRUE)
+ if (avail_spare ||
+ is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
+ if (l2cache ||
+ is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_L2CACHE) == B_TRUE)
+ return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
+
zc.zc_cookie = VDEV_STATE_ONLINE;
zc.zc_obj = flags;
@@ -1330,21 +1370,26 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp)
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tgt;
- boolean_t avail_spare;
+ boolean_t avail_spare, l2cache;
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot offline %s"), path);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if ((tgt = zpool_find_vdev(zhp, path, &avail_spare)) == NULL)
+ if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache)) == NULL)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
- if (avail_spare || is_spare(zhp, zc.zc_guid) == B_TRUE)
+ if (avail_spare ||
+ is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
+ if (l2cache ||
+ is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_L2CACHE) == B_TRUE)
+ return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
+
zc.zc_cookie = VDEV_STATE_OFFLINE;
zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0;
@@ -1461,7 +1506,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
char msg[1024];
int ret;
nvlist_t *tgt;
- boolean_t avail_spare;
+ boolean_t avail_spare, l2cache;
uint64_t val, is_log;
char *path;
nvlist_t **child;
@@ -1477,12 +1522,15 @@ zpool_vdev_attach(zpool_handle_t *zhp,
"cannot attach %s to %s"), new_disk, old_disk);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare)) == 0)
+ if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache)) == 0)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
if (avail_spare)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
+ if (l2cache)
+ return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
+
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
zc.zc_cookie = replacing;
@@ -1503,7 +1551,7 @@ zpool_vdev_attach(zpool_handle_t *zhp,
if (replacing &&
nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 &&
nvlist_lookup_string(child[0], ZPOOL_CONFIG_PATH, &path) == 0 &&
- (zpool_find_vdev(zhp, path, &avail_spare) == NULL ||
+ (zpool_find_vdev(zhp, path, &avail_spare, &l2cache) == NULL ||
!avail_spare) && is_replacing_spare(config_root, tgt, 1)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"can only be replaced by another hot spare"));
@@ -1516,8 +1564,8 @@ zpool_vdev_attach(zpool_handle_t *zhp,
*/
if (replacing &&
nvlist_lookup_string(child[0], ZPOOL_CONFIG_PATH, &path) == 0 &&
- zpool_find_vdev(zhp, path, &avail_spare) != NULL && avail_spare &&
- is_replacing_spare(config_root, tgt, 0)) {
+ zpool_find_vdev(zhp, path, &avail_spare, &l2cache) != NULL &&
+ avail_spare && is_replacing_spare(config_root, tgt, 0)) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
"device has already been replaced with a spare"));
return (zfs_error(hdl, EZFS_BADTARGET, msg));
@@ -1612,19 +1660,22 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tgt;
- boolean_t avail_spare;
+ boolean_t avail_spare, l2cache;
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot detach %s"), path);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if ((tgt = zpool_find_vdev(zhp, path, &avail_spare)) == 0)
+ if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache)) == 0)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
if (avail_spare)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
+ if (l2cache)
+ return (zfs_error(hdl, EZFS_ISL2CACHE, msg));
+
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0)
@@ -1656,7 +1707,8 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path)
}
/*
- * Remove the given device. Currently, this is supported only for hot spares.
+ * Remove the given device. Currently, this is supported only for hot spares
+ * and level 2 cache devices.
*/
int
zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
@@ -1664,19 +1716,20 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path)
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tgt;
- boolean_t avail_spare;
+ boolean_t avail_spare, l2cache;
libzfs_handle_t *hdl = zhp->zpool_hdl;
(void) snprintf(msg, sizeof (msg),
dgettext(TEXT_DOMAIN, "cannot remove %s"), path);
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
- if ((tgt = zpool_find_vdev(zhp, path, &avail_spare)) == 0)
+ if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache)) == 0)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
- if (!avail_spare) {
+ if (!avail_spare && !l2cache) {
zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
- "only inactive hot spares can be removed"));
+ "only inactive hot spares or cache devices "
+ "can be removed"));
return (zfs_error(hdl, EZFS_NODEVICE, msg));
}
@@ -1697,7 +1750,7 @@ zpool_clear(zpool_handle_t *zhp, const char *path)
zfs_cmd_t zc = { 0 };
char msg[1024];
nvlist_t *tgt;
- boolean_t avail_spare;
+ boolean_t avail_spare, l2cache;
libzfs_handle_t *hdl = zhp->zpool_hdl;
if (path)
@@ -1711,9 +1764,14 @@ zpool_clear(zpool_handle_t *zhp, const char *path)
(void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
if (path) {
- if ((tgt = zpool_find_vdev(zhp, path, &avail_spare)) == 0)
+ if ((tgt = zpool_find_vdev(zhp, path, &avail_spare,
+ &l2cache)) == 0)
return (zfs_error(hdl, EZFS_NODEVICE, msg));
+ /*
+ * Don't allow error clearing for hot spares. Do allow
+ * error clearing for l2cache devices.
+ */
if (avail_spare)
return (zfs_error(hdl, EZFS_ISSPARE, msg));
diff --git a/usr/src/lib/libzfs/common/libzfs_util.c b/usr/src/lib/libzfs/common/libzfs_util.c
index d2005867e3..b58da2c0bf 100644
--- a/usr/src/lib/libzfs/common/libzfs_util.c
+++ b/usr/src/lib/libzfs/common/libzfs_util.c
@@ -201,6 +201,8 @@ libzfs_error_description(libzfs_handle_t *hdl)
" modified"));
case EZFS_BADCACHE:
return (dgettext(TEXT_DOMAIN, "invalid or missing cache file"));
+ case EZFS_ISL2CACHE:
+ return (dgettext(TEXT_DOMAIN, "device is in use as a cache"));
case EZFS_UNKNOWN:
return (dgettext(TEXT_DOMAIN, "unknown error"));
default:
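
The new EZFS_ISL2CACHE error surfaces when an operation valid only on
normal vdevs is attempted on a cache device. A hedged example of the
resulting CLI failure (pool and device names hypothetical):

    # zpool offline tank c4t0d0
    cannot offline c4t0d0: device is in use as a cache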
diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c
index aafce2d68e..76f8c155ef 100644
--- a/usr/src/uts/common/fs/zfs/arc.c
+++ b/usr/src/uts/common/fs/zfs/arc.c
@@ -47,13 +47,13 @@
* There are times when it is not possible to evict the requested
* space. In these circumstances we are unable to adjust the cache
* size. To prevent the cache growing unbounded at these times we
- * implement a "cache throttle" that slowes the flow of new data
- * into the cache until we can make space avaiable.
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
*
* 2. The Megiddo and Modha model assumes a fixed cache size.
* Pages are evicted when the cache is full and there is a cache
* miss. Our model has a variable sized cache. It grows with
- * high use, but also tries to react to memory preasure from the
+ * high use, but also tries to react to memory pressure from the
* operating system: decreasing its size when system memory is
* tight.
*
@@ -75,7 +75,7 @@
*
* A new reference to a cache buffer can be obtained in two
* ways: 1) via a hash table lookup using the DVA as a key,
- * or 2) via one of the ARC lists. The arc_read() inerface
+ * or 2) via one of the ARC lists. The arc_read() interface
* uses method 1, while the internal arc algorithms for
* adjusting the cache use method 2. We therefore provide two
* types of locks: 1) the hash table lock array, and 2) the
@@ -109,6 +109,14 @@
*
* Note that the majority of the performance stats are manipulated
* with atomic operations.
+ *
+ * The L2ARC uses the l2arc_buflist_mtx global mutex for the following:
+ *
+ * - L2ARC buflist creation
+ * - L2ARC buflist eviction
+ * - L2ARC write completion, which walks L2ARC buflists
+ * - ARC header destruction, as it removes from L2ARC buflists
+ * - ARC header release, as it removes from L2ARC buflists
*/
#include <sys/spa.h>
@@ -157,19 +165,20 @@ uint64_t zfs_arc_min;
uint64_t zfs_arc_meta_limit = 0;
/*
- * Note that buffers can be in one of 5 states:
+ * Note that buffers can be in one of 6 states:
* ARC_anon - anonymous (discussed below)
* ARC_mru - recently used, currently cached
* ARC_mru_ghost - recently used, no longer in cache
* ARC_mfu - frequently used, currently cached
* ARC_mfu_ghost - frequently used, no longer in cache
+ * ARC_l2c_only - exists in L2ARC but not other states
* When there are no active references to the buffer, they are
* linked onto a list in one of these arc states. These are
* the only buffers that can be evicted or deleted. Within each
* state there are multiple lists, one for meta-data and one for
* non-meta-data. Meta-data (indirect blocks, blocks of dnodes,
* etc.) is tracked separately so that it can be managed more
- * explicitly: favored over data, limited explicitely.
+ * explicitly: favored over data, limited explicitly.
*
* Anonymous buffers are buffers that are not associated with
* a DVA. These are buffers that hold dirty block copies
@@ -177,6 +186,14 @@ uint64_t zfs_arc_meta_limit = 0;
* they are "ref'd" and are considered part of arc_mru
* that cannot be freed. Generally, they will acquire a DVA
* as they are written and migrate onto the arc_mru list.
+ *
+ * The ARC_l2c_only state is for buffers that are in the second
+ * level ARC but no longer in any of the ARC_m* lists. The second
+ * level ARC itself may also contain buffers that are in any of
+ * the ARC_m* states - meaning that a buffer can exist in two
+ * places. The reason for the ARC_l2c_only state is to keep the
+ * buffer header in the hash table, so that reads that hit the
+ * second level ARC benefit from these fast lookups.
*/
typedef struct arc_state {
@@ -186,12 +203,13 @@ typedef struct arc_state {
kmutex_t arcs_mtx;
} arc_state_t;
-/* The 5 states: */
+/* The 6 states: */
static arc_state_t ARC_anon;
static arc_state_t ARC_mru;
static arc_state_t ARC_mru_ghost;
static arc_state_t ARC_mfu;
static arc_state_t ARC_mfu_ghost;
+static arc_state_t ARC_l2c_only;
typedef struct arc_stats {
kstat_named_t arcstat_hits;
@@ -222,6 +240,23 @@ typedef struct arc_stats {
kstat_named_t arcstat_c_min;
kstat_named_t arcstat_c_max;
kstat_named_t arcstat_size;
+ kstat_named_t arcstat_hdr_size;
+ kstat_named_t arcstat_l2_hits;
+ kstat_named_t arcstat_l2_misses;
+ kstat_named_t arcstat_l2_feeds;
+ kstat_named_t arcstat_l2_rw_clash;
+ kstat_named_t arcstat_l2_writes_sent;
+ kstat_named_t arcstat_l2_writes_done;
+ kstat_named_t arcstat_l2_writes_error;
+ kstat_named_t arcstat_l2_writes_hdr_miss;
+ kstat_named_t arcstat_l2_evict_lock_retry;
+ kstat_named_t arcstat_l2_evict_reading;
+ kstat_named_t arcstat_l2_free_on_write;
+ kstat_named_t arcstat_l2_abort_lowmem;
+ kstat_named_t arcstat_l2_cksum_bad;
+ kstat_named_t arcstat_l2_io_error;
+ kstat_named_t arcstat_l2_size;
+ kstat_named_t arcstat_l2_hdr_size;
} arc_stats_t;
static arc_stats_t arc_stats = {
@@ -252,7 +287,24 @@ static arc_stats_t arc_stats = {
{ "c", KSTAT_DATA_UINT64 },
{ "c_min", KSTAT_DATA_UINT64 },
{ "c_max", KSTAT_DATA_UINT64 },
- { "size", KSTAT_DATA_UINT64 }
+ { "size", KSTAT_DATA_UINT64 },
+ { "hdr_size", KSTAT_DATA_UINT64 },
+ { "l2_hits", KSTAT_DATA_UINT64 },
+ { "l2_misses", KSTAT_DATA_UINT64 },
+ { "l2_feeds", KSTAT_DATA_UINT64 },
+ { "l2_rw_clash", KSTAT_DATA_UINT64 },
+ { "l2_writes_sent", KSTAT_DATA_UINT64 },
+ { "l2_writes_done", KSTAT_DATA_UINT64 },
+ { "l2_writes_error", KSTAT_DATA_UINT64 },
+ { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 },
+ { "l2_evict_lock_retry", KSTAT_DATA_UINT64 },
+ { "l2_evict_reading", KSTAT_DATA_UINT64 },
+ { "l2_free_on_write", KSTAT_DATA_UINT64 },
+ { "l2_abort_lowmem", KSTAT_DATA_UINT64 },
+ { "l2_cksum_bad", KSTAT_DATA_UINT64 },
+ { "l2_io_error", KSTAT_DATA_UINT64 },
+ { "l2_size", KSTAT_DATA_UINT64 },
+ { "l2_hdr_size", KSTAT_DATA_UINT64 }
};
#define ARCSTAT(stat) (arc_stats.stat.value.ui64)
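
The new l2_* counters are published through the existing arcstats kstat,
so they can be read with kstat(1M). A hedged example (values illustrative):

    # kstat -p zfs:0:arcstats | grep l2_
    zfs:0:arcstats:l2_hits      1023
    zfs:0:arcstats:l2_misses    4088
    zfs:0:arcstats:l2_size      536870912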
@@ -299,6 +351,7 @@ static arc_state_t *arc_mru;
static arc_state_t *arc_mru_ghost;
static arc_state_t *arc_mfu;
static arc_state_t *arc_mfu_ghost;
+static arc_state_t *arc_l2c_only;
/*
* There are several ARC variables that are critical to export as kstats --
@@ -320,6 +373,8 @@ static uint64_t arc_meta_used;
static uint64_t arc_meta_limit;
static uint64_t arc_meta_max = 0;
+typedef struct l2arc_buf_hdr l2arc_buf_hdr_t;
+
typedef struct arc_callback arc_callback_t;
struct arc_callback {
@@ -371,6 +426,9 @@ struct arc_buf_hdr {
/* self protecting */
refcount_t b_refcnt;
+
+ l2arc_buf_hdr_t *b_l2hdr;
+ list_node_t b_l2node;
};
static arc_buf_t *arc_eviction_list;
@@ -382,7 +440,8 @@ static int arc_evict_needed(arc_buf_contents_t type);
static void arc_evict_ghost(arc_state_t *state, int64_t bytes);
#define GHOST_STATE(state) \
- ((state) == arc_mru_ghost || (state) == arc_mfu_ghost)
+ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \
+ (state) == arc_l2c_only)
/*
* Private ARC flags. These flags are private ARC only flags that will show up
@@ -398,12 +457,24 @@ static void arc_evict_ghost(arc_state_t *state, int64_t bytes);
#define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */
#define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */
#define ARC_INDIRECT (1 << 14) /* this is an indirect block */
+#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */
+#define ARC_DONT_L2CACHE (1 << 16) /* originated by prefetch */
+#define ARC_L2_READING (1 << 17) /* L2ARC read in progress */
+#define ARC_L2_WRITING (1 << 18) /* L2ARC write in progress */
+#define ARC_L2_EVICTED (1 << 19) /* evicted during I/O */
+#define ARC_L2_WRITE_HEAD (1 << 20) /* head of write list */
#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE)
#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS)
#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR)
#define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ)
#define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE)
+#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS)
+#define HDR_DONT_L2CACHE(hdr) ((hdr)->b_flags & ARC_DONT_L2CACHE)
+#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_L2_READING)
+#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING)
+#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED)
+#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD)
/*
* Hash table routines
@@ -436,6 +507,87 @@ static buf_hash_table_t buf_hash_table;
uint64_t zfs_crc64_table[256];
+/*
+ * Level 2 ARC
+ */
+
+#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */
+#define L2ARC_HEADROOM 4 /* num of writes */
+#define L2ARC_FEED_DELAY 180 /* starting grace */
+#define L2ARC_FEED_SECS 1 /* caching interval */
+
+#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent)
+#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done)
+
+/*
+ * L2ARC Performance Tunables
+ */
+uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */
+uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */
+uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */
+boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */
+
+/*
+ * L2ARC Internals
+ */
+typedef struct l2arc_dev {
+ vdev_t *l2ad_vdev; /* vdev */
+ spa_t *l2ad_spa; /* spa */
+ uint64_t l2ad_hand; /* next write location */
+ uint64_t l2ad_write; /* desired write size, bytes */
+ uint64_t l2ad_start; /* first addr on device */
+ uint64_t l2ad_end; /* last addr on device */
+ uint64_t l2ad_evict; /* last addr eviction reached */
+ boolean_t l2ad_first; /* first sweep through */
+ list_t *l2ad_buflist; /* buffer list */
+ list_node_t l2ad_node; /* device list node */
+} l2arc_dev_t;
+
+static list_t L2ARC_dev_list; /* device list */
+static list_t *l2arc_dev_list; /* device list pointer */
+static kmutex_t l2arc_dev_mtx; /* device list mutex */
+static l2arc_dev_t *l2arc_dev_last; /* last device used */
+static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */
+static list_t L2ARC_free_on_write; /* free after write buf list */
+static list_t *l2arc_free_on_write; /* free after write list ptr */
+static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */
+static uint64_t l2arc_ndev; /* number of devices */
+
+typedef struct l2arc_read_callback {
+ arc_buf_t *l2rcb_buf; /* read buffer */
+ spa_t *l2rcb_spa; /* spa */
+ blkptr_t l2rcb_bp; /* original blkptr */
+ zbookmark_t l2rcb_zb; /* original bookmark */
+ int l2rcb_flags; /* original flags */
+} l2arc_read_callback_t;
+
+typedef struct l2arc_write_callback {
+ l2arc_dev_t *l2wcb_dev; /* device info */
+ arc_buf_hdr_t *l2wcb_head; /* head of write buflist */
+} l2arc_write_callback_t;
+
+struct l2arc_buf_hdr {
+ /* protected by arc_buf_hdr mutex */
+ l2arc_dev_t *b_dev; /* L2ARC device */
+ daddr_t b_daddr; /* disk address, offset byte */
+};
+
+typedef struct l2arc_data_free {
+ /* protected by l2arc_free_on_write_mtx */
+ void *l2df_data;
+ size_t l2df_size;
+ void (*l2df_func)(void *, size_t);
+ list_node_t l2df_list_node;
+} l2arc_data_free_t;
+
+static kmutex_t l2arc_feed_thr_lock;
+static kcondvar_t l2arc_feed_thr_cv;
+static uint8_t l2arc_thread_exit;
+
+static void l2arc_read_done(zio_t *zio);
+static void l2arc_hdr_stat_add(void);
+static void l2arc_hdr_stat_remove(void);
+
static uint64_t
buf_hash(spa_t *spa, dva_t *dva, uint64_t birth)
{
@@ -585,6 +737,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag)
refcount_create(&buf->b_refcnt);
cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL);
mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ ARCSTAT_INCR(arcstat_hdr_size, sizeof (arc_buf_hdr_t));
return (0);
}
@@ -601,6 +755,8 @@ hdr_dest(void *vbuf, void *unused)
refcount_destroy(&buf->b_refcnt);
cv_destroy(&buf->b_cv);
mutex_destroy(&buf->b_freeze_lock);
+
+ ARCSTAT_INCR(arcstat_hdr_size, -sizeof (arc_buf_hdr_t));
}
/*
@@ -680,10 +836,24 @@ arc_cksum_verify(arc_buf_t *buf)
mutex_exit(&buf->b_hdr->b_freeze_lock);
}
+static int
+arc_cksum_equal(arc_buf_t *buf)
+{
+ zio_cksum_t zc;
+ int equal;
+
+ mutex_enter(&buf->b_hdr->b_freeze_lock);
+ fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc);
+ equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc);
+ mutex_exit(&buf->b_hdr->b_freeze_lock);
+
+ return (equal);
+}
+
static void
-arc_cksum_compute(arc_buf_t *buf)
+arc_cksum_compute(arc_buf_t *buf, boolean_t force)
{
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
+ if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY))
return;
mutex_enter(&buf->b_hdr->b_freeze_lock);
@@ -700,14 +870,14 @@ arc_cksum_compute(arc_buf_t *buf)
void
arc_buf_thaw(arc_buf_t *buf)
{
- if (!(zfs_flags & ZFS_DEBUG_MODIFY))
- return;
+ if (zfs_flags & ZFS_DEBUG_MODIFY) {
+ if (buf->b_hdr->b_state != arc_anon)
+ panic("modifying non-anon buffer!");
+ if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
+ panic("modifying buffer while i/o in progress!");
+ arc_cksum_verify(buf);
+ }
- if (buf->b_hdr->b_state != arc_anon)
- panic("modifying non-anon buffer!");
- if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS)
- panic("modifying buffer while i/o in progress!");
- arc_cksum_verify(buf);
mutex_enter(&buf->b_hdr->b_freeze_lock);
if (buf->b_hdr->b_freeze_cksum != NULL) {
kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t));
@@ -724,7 +894,7 @@ arc_buf_freeze(arc_buf_t *buf)
ASSERT(buf->b_hdr->b_freeze_cksum != NULL ||
buf->b_hdr->b_state == arc_anon);
- arc_cksum_compute(buf);
+ arc_cksum_compute(buf, B_FALSE);
}
static void
@@ -852,7 +1022,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
}
ASSERT(!BUF_EMPTY(ab));
- if (new_state == arc_anon && old_state != arc_anon) {
+ if (new_state == arc_anon) {
buf_hash_remove(ab);
}
@@ -864,6 +1034,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock)
atomic_add_64(&old_state->arcs_size, -from_delta);
}
ab->b_state = new_state;
+
+ /* adjust l2arc hdr stats */
+ if (new_state == arc_l2c_only)
+ l2arc_hdr_stat_add();
+ else if (old_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
}
void
@@ -990,6 +1166,29 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag)
data, metadata, hits);
}
+/*
+ * Free the arc data buffer. If it is an l2arc write in progress,
+ * the buffer is placed on l2arc_free_on_write to be freed later.
+ */
+static void
+arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t),
+ void *data, size_t size)
+{
+ if (HDR_L2_WRITING(hdr)) {
+ l2arc_data_free_t *df;
+ df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP);
+ df->l2df_data = data;
+ df->l2df_size = size;
+ df->l2df_func = free_func;
+ mutex_enter(&l2arc_free_on_write_mtx);
+ list_insert_head(l2arc_free_on_write, df);
+ mutex_exit(&l2arc_free_on_write_mtx);
+ ARCSTAT_BUMP(arcstat_l2_free_on_write);
+ } else {
+ free_func(data, size);
+ }
+}
+
static void
arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
{
@@ -1004,11 +1203,13 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all)
arc_cksum_verify(buf);
if (!recycle) {
if (type == ARC_BUFC_METADATA) {
- zio_buf_free(buf->b_data, size);
+ arc_buf_data_free(buf->b_hdr, zio_buf_free,
+ buf->b_data, size);
arc_space_return(size);
} else {
ASSERT(type == ARC_BUFC_DATA);
- zio_data_buf_free(buf->b_data, size);
+ arc_buf_data_free(buf->b_hdr,
+ zio_data_buf_free, buf->b_data, size);
atomic_add_64(&arc_size, -size);
}
}
@@ -1051,6 +1252,30 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr)
ASSERT3P(hdr->b_state, ==, arc_anon);
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ if (hdr->b_l2hdr != NULL) {
+ if (!MUTEX_HELD(&l2arc_buflist_mtx)) {
+ /*
+ * To prevent arc_free() and l2arc_evict() from
+ * attempting to free the same buffer at the same time,
+ * a FREE_IN_PROGRESS flag is given to arc_free() to
+ * give it priority. l2arc_evict() can't destroy this
+ * header while we are waiting on l2arc_buflist_mtx.
+ */
+ mutex_enter(&l2arc_buflist_mtx);
+ ASSERT(hdr->b_l2hdr != NULL);
+
+ list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+ mutex_exit(&l2arc_buflist_mtx);
+ } else {
+ list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr);
+ }
+ ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size);
+ kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t));
+ if (hdr->b_state == arc_l2c_only)
+ l2arc_hdr_stat_remove();
+ hdr->b_l2hdr = NULL;
+ }
+
if (!BUF_EMPTY(hdr)) {
ASSERT(!HDR_IN_HASH_TABLE(hdr));
bzero(&hdr->b_dva, sizeof (dva_t));
@@ -1214,7 +1439,8 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
if (buf->b_data) {
bytes_evicted += ab->b_size;
if (recycle && ab->b_type == type &&
- ab->b_size == bytes) {
+ ab->b_size == bytes &&
+ !HDR_L2_WRITING(ab)) {
stolen = buf->b_data;
recycle = FALSE;
}
@@ -1236,7 +1462,8 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle,
ASSERT(ab->b_datacnt == 0);
arc_change_state(evicted_state, ab, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(ab));
- ab->b_flags = ARC_IN_HASH_TABLE;
+ ab->b_flags |= ARC_IN_HASH_TABLE;
+ ab->b_flags &= ~ARC_BUF_AVAILABLE;
DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab);
if (!have_lock)
mutex_exit(hash_lock);
@@ -1306,11 +1533,22 @@ top:
if (mutex_tryenter(hash_lock)) {
ASSERT(!HDR_IO_IN_PROGRESS(ab));
ASSERT(ab->b_buf == NULL);
- arc_change_state(arc_anon, ab, hash_lock);
- mutex_exit(hash_lock);
ARCSTAT_BUMP(arcstat_deleted);
bytes_deleted += ab->b_size;
- arc_hdr_destroy(ab);
+
+ if (ab->b_l2hdr != NULL) {
+ /*
+ * This buffer is cached on the 2nd Level ARC;
+ * don't destroy the header.
+ */
+ arc_change_state(arc_l2c_only, ab, hash_lock);
+ mutex_exit(hash_lock);
+ } else {
+ arc_change_state(arc_anon, ab, hash_lock);
+ mutex_exit(hash_lock);
+ arc_hdr_destroy(ab);
+ }
+
DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab);
if (bytes >= 0 && bytes_deleted >= bytes)
break;
@@ -1506,7 +1744,7 @@ arc_reclaim_needed(void)
/*
* check to make sure that swapfs has enough space so that anon
- * reservations can still succeeed. anon_resvmem() checks that the
+ * reservations can still succeed. anon_resvmem() checks that the
* availrmem is greater than swapfs_minfree, and the number of reserved
* swap pages. We also add a bit of extra here just to prevent
* circumstances from getting really dire.
@@ -1523,7 +1761,7 @@ arc_reclaim_needed(void)
* can have in the system. However, this is generally fixed at 25 pages
* which is so low that it's useless. In this comparison, we seek to
* calculate the total heap-size, and reclaim if more than 3/4ths of the
- * heap is allocated. (Or, in the caclulation, if less than 1/4th is
+ * heap is allocated. (Or, in the calculation, if less than 1/4th is
* free)
*/
if (btop(vmem_size(heap_arena, VMEM_FREE)) <
@@ -1564,7 +1802,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat)
#endif
/*
- * An agressive reclamation will shrink the cache size as well as
+ * An aggressive reclamation will shrink the cache size as well as
* reap free buffers from the arc kmem caches.
*/
if (strat == ARC_RECLAIM_AGGR)
@@ -1648,6 +1886,9 @@ arc_adapt(int bytes, arc_state_t *state)
{
int mult;
+ if (state == arc_l2c_only)
+ return;
+
ASSERT(bytes > 0);
/*
* Adapt the target size of the MRU list:
@@ -1944,6 +2185,14 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock)
arc_change_state(new_state, buf, hash_lock);
ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+ } else if (buf->b_state == arc_l2c_only) {
+ /*
+ * This buffer is on the 2nd Level ARC.
+ */
+
+ buf->b_arc_access = lbolt;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf);
+ arc_change_state(arc_mfu, buf, hash_lock);
} else {
ASSERT(!"invalid arc state");
}
@@ -1996,7 +2245,12 @@ arc_read_done(zio_t *zio)
&hash_lock);
ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) ||
- (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))));
+ (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) ||
+ (found == hdr && HDR_L2_READING(hdr)));
+
+ hdr->b_flags &= ~(ARC_L2_READING|ARC_L2_EVICTED);
+ if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH))
+ hdr->b_flags |= ARC_DONT_L2CACHE;
/* byteswap if necessary */
callback_list = hdr->b_acb;
@@ -2004,7 +2258,7 @@ arc_read_done(zio_t *zio)
if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap)
callback_list->acb_byteswap(buf->b_data, hdr->b_size);
- arc_cksum_compute(buf);
+ arc_cksum_compute(buf, B_FALSE);
/* create copies of the data buffer for the callers */
abuf = buf;
@@ -2108,7 +2362,7 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap,
arc_buf_hdr_t *hdr;
arc_buf_t *buf;
kmutex_t *hash_lock;
- zio_t *rzio;
+ zio_t *rzio;
top:
hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock);
@@ -2255,7 +2509,6 @@ top:
if (GHOST_STATE(hdr->b_state))
arc_access(hdr, hash_lock);
- mutex_exit(hash_lock);
ASSERT3U(hdr->b_size, ==, size);
DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size,
@@ -2265,6 +2518,57 @@ top:
demand, prefetch, hdr->b_type != ARC_BUFC_METADATA,
data, metadata, misses);
+ if (l2arc_ndev != 0) {
+ /*
+ * Read from the L2ARC if the following are true:
+ * 1. This buffer has L2ARC metadata.
+ * 2. This buffer isn't currently writing to the L2ARC.
+ */
+ if (hdr->b_l2hdr != NULL && !HDR_L2_WRITING(hdr)) {
+ vdev_t *vd = hdr->b_l2hdr->b_dev->l2ad_vdev;
+ daddr_t addr = hdr->b_l2hdr->b_daddr;
+ l2arc_read_callback_t *cb;
+
+ DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_hits);
+
+ hdr->b_flags |= ARC_L2_READING;
+ mutex_exit(hash_lock);
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
+ KM_SLEEP);
+ cb->l2rcb_buf = buf;
+ cb->l2rcb_spa = spa;
+ cb->l2rcb_bp = *bp;
+ cb->l2rcb_zb = *zb;
+ cb->l2rcb_flags = flags;
+
+ /*
+ * l2arc read.
+ */
+ rzio = zio_read_phys(pio, vd, addr, size,
+ buf->b_data, ZIO_CHECKSUM_OFF,
+ l2arc_read_done, cb, priority,
+ flags | ZIO_FLAG_DONT_CACHE, B_FALSE);
+ DTRACE_PROBE2(l2arc__read, vdev_t *, vd,
+ zio_t *, rzio);
+
+ if (*arc_flags & ARC_WAIT)
+ return (zio_wait(rzio));
+
+ ASSERT(*arc_flags & ARC_NOWAIT);
+ zio_nowait(rzio);
+ return (0);
+ } else {
+ DTRACE_PROBE1(l2arc__miss,
+ arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_misses);
+ if (HDR_L2_WRITING(hdr))
+ ARCSTAT_BUMP(arcstat_l2_rw_clash);
+ }
+ }
+ mutex_exit(hash_lock);
+
rzio = zio_read(pio, spa, bp, buf->b_data, size,
arc_read_done, buf, priority, flags, zb);
@@ -2402,7 +2706,8 @@ arc_buf_evict(arc_buf_t *buf)
arc_change_state(evicted_state, hdr, hash_lock);
ASSERT(HDR_IN_HASH_TABLE(hdr));
- hdr->b_flags = ARC_IN_HASH_TABLE;
+ hdr->b_flags |= ARC_IN_HASH_TABLE;
+ hdr->b_flags &= ~ARC_BUF_AVAILABLE;
mutex_exit(&evicted_state->arcs_mtx);
mutex_exit(&old_state->arcs_mtx);
@@ -2428,6 +2733,8 @@ arc_release(arc_buf_t *buf, void *tag)
{
arc_buf_hdr_t *hdr = buf->b_hdr;
kmutex_t *hash_lock = HDR_LOCK(hdr);
+ l2arc_buf_hdr_t *l2hdr = NULL;
+ uint64_t buf_size;
/* this buffer is not on any list */
ASSERT(refcount_count(&hdr->b_refcnt) > 0);
@@ -2452,6 +2759,7 @@ arc_release(arc_buf_t *buf, void *tag)
uint64_t blksz = hdr->b_size;
spa_t *spa = hdr->b_spa;
arc_buf_contents_t type = hdr->b_type;
+ uint32_t flags = hdr->b_flags;
ASSERT(hdr->b_datacnt > 1);
/*
@@ -2473,6 +2781,12 @@ arc_release(arc_buf_t *buf, void *tag)
atomic_add_64(size, -hdr->b_size);
}
hdr->b_datacnt -= 1;
+ if (hdr->b_l2hdr != NULL) {
+ mutex_enter(&l2arc_buflist_mtx);
+ l2hdr = hdr->b_l2hdr;
+ hdr->b_l2hdr = NULL;
+ buf_size = hdr->b_size;
+ }
arc_cksum_verify(buf);
mutex_exit(hash_lock);
@@ -2484,21 +2798,27 @@ arc_release(arc_buf_t *buf, void *tag)
nhdr->b_buf = buf;
nhdr->b_state = arc_anon;
nhdr->b_arc_access = 0;
- nhdr->b_flags = 0;
+ nhdr->b_flags = flags & ARC_L2_WRITING;
+ nhdr->b_l2hdr = NULL;
nhdr->b_datacnt = 1;
nhdr->b_freeze_cksum = NULL;
(void) refcount_add(&nhdr->b_refcnt, tag);
buf->b_hdr = nhdr;
atomic_add_64(&arc_anon->arcs_size, blksz);
-
- hdr = nhdr;
} else {
ASSERT(refcount_count(&hdr->b_refcnt) == 1);
ASSERT(!list_link_active(&hdr->b_arc_node));
ASSERT(!HDR_IO_IN_PROGRESS(hdr));
arc_change_state(arc_anon, hdr, hash_lock);
hdr->b_arc_access = 0;
+ if (hdr->b_l2hdr != NULL) {
+ mutex_enter(&l2arc_buflist_mtx);
+ l2hdr = hdr->b_l2hdr;
+ hdr->b_l2hdr = NULL;
+ buf_size = hdr->b_size;
+ }
mutex_exit(hash_lock);
+
bzero(&hdr->b_dva, sizeof (dva_t));
hdr->b_birth = 0;
hdr->b_cksum0 = 0;
@@ -2506,6 +2826,14 @@ arc_release(arc_buf_t *buf, void *tag)
}
buf->b_efunc = NULL;
buf->b_private = NULL;
+
+ if (l2hdr) {
+ list_remove(l2hdr->b_dev->l2ad_buflist, hdr);
+ kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -buf_size);
+ }
+ if (MUTEX_HELD(&l2arc_buflist_mtx))
+ mutex_exit(&l2arc_buflist_mtx);
}
int
@@ -2559,7 +2887,7 @@ arc_write_ready(zio_t *zio)
}
mutex_exit(&hdr->b_freeze_lock);
}
- arc_cksum_compute(buf);
+ arc_cksum_compute(buf, B_FALSE);
hdr->b_flags |= ARC_IO_IN_PROGRESS;
}
@@ -2704,6 +3032,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
ab->b_buf->b_private = NULL;
mutex_exit(hash_lock);
} else if (refcount_is_zero(&ab->b_refcnt)) {
+ ab->b_flags |= ARC_FREE_IN_PROGRESS;
mutex_exit(hash_lock);
arc_hdr_destroy(ab);
ARCSTAT_BUMP(arcstat_deleted);
@@ -2847,6 +3176,7 @@ arc_init(void)
arc_mru_ghost = &ARC_mru_ghost;
arc_mfu = &ARC_mfu;
arc_mfu_ghost = &ARC_mfu_ghost;
+ arc_l2c_only = &ARC_l2c_only;
arc_size = 0;
mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
@@ -2854,6 +3184,7 @@ arc_init(void)
mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL);
list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
@@ -2871,6 +3202,10 @@ arc_init(void)
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA],
sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
+ list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA],
+ sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node));
buf_init();
@@ -2932,3 +3267,868 @@ arc_fini(void)
buf_fini();
}
+
+/*
+ * Level 2 ARC
+ *
+ * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
+ * It uses dedicated storage devices to hold cached data, which are populated
+ * using large infrequent writes. The main role of this cache is to boost
+ * the performance of random read workloads. The intended L2ARC devices
+ * include short-stroked disks, solid state disks, and other media with
+ * substantially faster read latency than disk.
+ *
+ * +-----------------------+
+ * | ARC |
+ * +-----------------------+
+ * | ^ ^
+ * | | |
+ * l2arc_feed_thread() arc_read()
+ * | | |
+ * | l2arc read |
+ * V | |
+ * +---------------+ |
+ * | L2ARC | |
+ * +---------------+ |
+ * | ^ |
+ * l2arc_write() | |
+ * | | |
+ * V | |
+ * +-------+ +-------+
+ * | vdev | | vdev |
+ * | cache | | cache |
+ * +-------+ +-------+
+ * +=========+ .-----.
+ * : L2ARC : |-_____-|
+ * : devices : | Disks |
+ * +=========+ `-_____-'
+ *
+ * Read requests are satisfied from the following sources, in order:
+ *
+ * 1) ARC
+ * 2) vdev cache of L2ARC devices
+ * 3) L2ARC devices
+ * 4) vdev cache of disks
+ * 5) disks
+ *
+ * Some L2ARC device types exhibit extremely slow write performance.
+ * To accommodate this, there are some significant differences between
+ * the L2ARC and traditional cache design:
+ *
+ * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
+ * the ARC behave as usual, freeing buffers and placing headers on ghost
+ * lists. The ARC does not send buffers to the L2ARC during eviction as
+ * this would inflate write latencies during any ARC memory pressure.
+ *
+ * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
+ * It does this by periodically scanning buffers from the eviction-end of
+ * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
+ * not already there. It scans until a headroom of buffers is satisfied,
+ * which itself is a buffer for ARC eviction. The thread that does this is
+ * l2arc_feed_thread(), illustrated below; example sizes are included to
+ * give a better sense of ratio, as the diagram is not drawn to scale:
+ *
+ * head --> tail
+ * +---------------------+----------+
+ * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC
+ * +---------------------+----------+ | o L2ARC eligible
+ * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer
+ * +---------------------+----------+ |
+ * 15.9 Gbytes ^ 32 Mbytes |
+ * headroom |
+ * l2arc_feed_thread()
+ * |
+ * l2arc write hand <--[oooo]--'
+ * | 8 Mbyte
+ * | write max
+ * V
+ * +==============================+
+ * L2ARC dev |####|#|###|###| |####| ... |
+ * +==============================+
+ * 32 Gbytes
+ *
+ * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
+ * evicted, then the L2ARC has cached a buffer much sooner than it probably
+ * needed to, potentially wasting L2ARC device bandwidth and storage. It is
+ * safe to say that this is an uncommon case, since buffers at the end of
+ * the ARC lists have moved there due to inactivity.
+ *
+ * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
+ * then the L2ARC simply misses copying some buffers. This serves as a
+ * pressure valve to prevent heavy read workloads from both stalling the ARC
+ * with waits and clogging the L2ARC with writes. This also helps prevent
+ * the potential for the L2ARC to churn if it attempts to cache content too
+ * quickly, such as during backups of the entire pool.
+ *
+ * 5. Writes to the L2ARC devices are grouped and sent in-sequence, so that
+ * the vdev queue can aggregate them into larger and fewer writes. Each
+ * device is written to in a rotor fashion, sweeping writes through
+ * available space then repeating.
+ *
+ * 6. The L2ARC does not store dirty content. It never needs to flush
+ * write buffers back to disk-based storage.
+ *
+ * 7. If an ARC buffer is written (and dirtied) which also exists in the
+ * L2ARC, the now stale L2ARC buffer is immediately dropped.
+ *
+ * The performance of the L2ARC can be tweaked by a number of tunables, which
+ * may be necessary for different workloads:
+ *
+ * l2arc_write_max max write bytes per interval
+ * l2arc_noprefetch skip caching prefetched buffers
+ * l2arc_headroom number of max device writes to precache
+ * l2arc_feed_secs seconds between L2ARC writing
+ *
+ * Tunables may be removed or added as future performance improvements are
+ * integrated, and also may become zpool properties.
+ */
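+
+/*
+ * As a worked example using the sizes in the diagram above: with an
+ * 8 Mbyte write max and a 32 Mbyte headroom, each feed cycle scans up
+ * to 32 Mbytes from the tail of each ARC list and issues at most
+ * 8 Mbytes of writes. The headroom is l2arc_write_max * l2arc_headroom
+ * bytes, as computed in l2arc_write_buffers().
+ */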
+
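+/*
+ * Shift header-size accounting from the ARC kstats to the L2ARC kstats
+ * (and back, via l2arc_hdr_stat_remove()).
+ */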
+static void
+l2arc_hdr_stat_add(void)
+{
+ ARCSTAT_INCR(arcstat_l2_hdr_size, sizeof (arc_buf_hdr_t) +
+ sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_hdr_size, -sizeof (arc_buf_hdr_t));
+}
+
+static void
+l2arc_hdr_stat_remove(void)
+{
+ ARCSTAT_INCR(arcstat_l2_hdr_size, -sizeof (arc_buf_hdr_t) -
+ sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_hdr_size, sizeof (arc_buf_hdr_t));
+}
+
+/*
+ * Cycle through L2ARC devices. This is how the L2ARC load balances.
+ * This is called with l2arc_dev_mtx held, which also locks out spa removal.
+ */
+static l2arc_dev_t *
+l2arc_dev_get_next(void)
+{
+ l2arc_dev_t *next;
+
+ if (l2arc_dev_last == NULL) {
+ next = list_head(l2arc_dev_list);
+ } else {
+ next = list_next(l2arc_dev_list, l2arc_dev_last);
+ if (next == NULL)
+ next = list_head(l2arc_dev_list);
+ }
+
+ l2arc_dev_last = next;
+
+ return (next);
+}
+
+/*
+ * A write to a cache device has completed. Update all headers to allow
+ * reads from these buffers to begin.
+ */
+static void
+l2arc_write_done(zio_t *zio)
+{
+ l2arc_write_callback_t *cb;
+ l2arc_dev_t *dev;
+ list_t *buflist;
+ l2arc_data_free_t *df, *df_prev;
+ arc_buf_hdr_t *head, *ab, *ab_prev;
+ kmutex_t *hash_lock;
+
+ cb = zio->io_private;
+ ASSERT(cb != NULL);
+ dev = cb->l2wcb_dev;
+ ASSERT(dev != NULL);
+ head = cb->l2wcb_head;
+ ASSERT(head != NULL);
+ buflist = dev->l2ad_buflist;
+ ASSERT(buflist != NULL);
+ DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
+ l2arc_write_callback_t *, cb);
+
+ if (zio->io_error != 0)
+ ARCSTAT_BUMP(arcstat_l2_writes_error);
+
+ mutex_enter(&l2arc_buflist_mtx);
+
+ /*
+ * All writes completed, or an error was hit.
+ */
+ for (ab = list_prev(buflist, head); ab; ab = ab_prev) {
+ ab_prev = list_prev(buflist, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * This buffer misses out. It may be in a stage
+ * of eviction. Its ARC_L2_WRITING flag will be
+ * left set, denying reads to this buffer.
+ */
+ ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss);
+ continue;
+ }
+
+ if (zio->io_error != 0) {
+ /*
+ * Error - invalidate L2ARC entry.
+ */
+ ab->b_l2hdr = NULL;
+ }
+
+ /*
+ * Allow ARC to begin reads to this L2ARC entry.
+ */
+ ab->b_flags &= ~ARC_L2_WRITING;
+
+ mutex_exit(hash_lock);
+ }
+
+ atomic_inc_64(&l2arc_writes_done);
+ list_remove(buflist, head);
+ kmem_cache_free(hdr_cache, head);
+ mutex_exit(&l2arc_buflist_mtx);
+
+ /*
+ * Free buffers that were tagged for destruction.
+ */
+ mutex_enter(&l2arc_free_on_write_mtx);
+ buflist = l2arc_free_on_write;
+ for (df = list_tail(buflist); df; df = df_prev) {
+ df_prev = list_prev(buflist, df);
+ ASSERT(df->l2df_data != NULL);
+ ASSERT(df->l2df_func != NULL);
+ df->l2df_func(df->l2df_data, df->l2df_size);
+ list_remove(buflist, df);
+ kmem_free(df, sizeof (l2arc_data_free_t));
+ }
+ mutex_exit(&l2arc_free_on_write_mtx);
+
+ kmem_free(cb, sizeof (l2arc_write_callback_t));
+}
+
+/*
+ * A read to a cache device completed. Validate buffer contents before
+ * handing over to the regular ARC routines.
+ */
+static void
+l2arc_read_done(zio_t *zio)
+{
+ l2arc_read_callback_t *cb;
+ arc_buf_hdr_t *hdr;
+ arc_buf_t *buf;
+ zio_t *rzio;
+ kmutex_t *hash_lock;
+ int equal, err = 0;
+
+ cb = zio->io_private;
+ ASSERT(cb != NULL);
+ buf = cb->l2rcb_buf;
+ ASSERT(buf != NULL);
+ hdr = buf->b_hdr;
+ ASSERT(hdr != NULL);
+
+ hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ /*
+ * Check this survived the L2ARC journey.
+ */
+ equal = arc_cksum_equal(buf);
+ if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) {
+ mutex_exit(hash_lock);
+ zio->io_private = buf;
+ arc_read_done(zio);
+ } else {
+ mutex_exit(hash_lock);
+ /*
+ * Buffer didn't survive caching. Increment stats and
+ * reissue to the original storage device.
+ */
+ if (zio->io_error != 0)
+ ARCSTAT_BUMP(arcstat_l2_io_error);
+ if (!equal)
+ ARCSTAT_BUMP(arcstat_l2_cksum_bad);
+
+ zio->io_flags &= ~ZIO_FLAG_DONT_CACHE;
+ rzio = zio_read(NULL, cb->l2rcb_spa, &cb->l2rcb_bp,
+ buf->b_data, zio->io_size, arc_read_done, buf,
+ zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb);
+
+ /*
+ * Since this is a separate thread, we can wait on this
+ * I/O whether there is an io_waiter or not.
+ */
+ err = zio_wait(rzio);
+
+ /*
+ * Let the resent I/O call arc_read_done() instead.
+ * io_error is set to the reissued I/O error status.
+ */
+ zio->io_done = NULL;
+ zio->io_waiter = NULL;
+ zio->io_error = err;
+ }
+
+ kmem_free(cb, sizeof (l2arc_read_callback_t));
+}
+
+/*
+ * This is the list priority from which the L2ARC will search for buffers
+ * to cache. This is used within loops (0..3) to cycle through the lists
+ * in the desired order. This order can have a significant effect on cache
+ * performance.
+ *
+ * Currently the metadata lists are hit first, MFU then MRU, followed by
+ * the data lists. This function returns a locked list and passes back
+ * the lock pointer through 'lock'.
+ */
+static list_t *
+l2arc_list_locked(int list_num, kmutex_t **lock)
+{
+ list_t *list;
+
+ ASSERT(list_num >= 0 && list_num <= 3);
+
+ switch (list_num) {
+ case 0:
+ list = &arc_mfu->arcs_list[ARC_BUFC_METADATA];
+ *lock = &arc_mfu->arcs_mtx;
+ break;
+ case 1:
+ list = &arc_mru->arcs_list[ARC_BUFC_METADATA];
+ *lock = &arc_mru->arcs_mtx;
+ break;
+ case 2:
+ list = &arc_mfu->arcs_list[ARC_BUFC_DATA];
+ *lock = &arc_mfu->arcs_mtx;
+ break;
+ case 3:
+ list = &arc_mru->arcs_list[ARC_BUFC_DATA];
+ *lock = &arc_mru->arcs_mtx;
+ break;
+ }
+
+ ASSERT(!(MUTEX_HELD(*lock)));
+ mutex_enter(*lock);
+ return (list);
+}
+
+/*
+ * Evict buffers from the device write hand to the distance specified in
+ * bytes. This distance may span populated buffers, or it may span nothing.
+ * This clears a region on the L2ARC device, making it ready for writing.
+ * If the 'all' boolean is set, every buffer is evicted.
+ */
+static void
+l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
+{
+ list_t *buflist;
+ l2arc_buf_hdr_t *abl2;
+ arc_buf_hdr_t *ab, *ab_prev;
+ kmutex_t *hash_lock;
+ uint64_t taddr;
+
+ ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
+
+ buflist = dev->l2ad_buflist;
+
+ if (buflist == NULL)
+ return;
+
+ if (!all && dev->l2ad_first) {
+ /*
+ * This is the first sweep through the device. There is
+ * nothing to evict.
+ */
+ return;
+ }
+
+ if (dev->l2ad_hand >= (dev->l2ad_end - (2 * dev->l2ad_write))) {
+ /*
+ * When nearing the end of the device, evict to the end
+ * before the device write hand jumps to the start.
+ */
+ taddr = dev->l2ad_end;
+ } else {
+ taddr = dev->l2ad_hand + distance;
+ }
+ DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
+ uint64_t, taddr, boolean_t, all);
+
+top:
+ mutex_enter(&l2arc_buflist_mtx);
+ for (ab = list_tail(buflist); ab; ab = ab_prev) {
+ ab_prev = list_prev(buflist, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ if (!mutex_tryenter(hash_lock)) {
+ /*
+ * Missed the hash lock. Retry.
+ */
+ ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
+ mutex_exit(&l2arc_buflist_mtx);
+ mutex_enter(hash_lock);
+ mutex_exit(hash_lock);
+ goto top;
+ }
+
+ if (HDR_L2_WRITE_HEAD(ab)) {
+ /*
+ * We hit a write head node. Leave it for
+ * l2arc_write_done().
+ */
+ list_remove(buflist, ab);
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (!all && ab->b_l2hdr != NULL &&
+ (ab->b_l2hdr->b_daddr > taddr ||
+ ab->b_l2hdr->b_daddr < dev->l2ad_hand)) {
+ /*
+ * We've evicted to the target address,
+ * or the end of the device.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (HDR_FREE_IN_PROGRESS(ab)) {
+ /*
+ * Already on the path to destruction.
+ */
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (ab->b_state == arc_l2c_only) {
+ ASSERT(!HDR_L2_READING(ab));
+ /*
+ * This doesn't exist in the ARC. Destroy.
+ * arc_hdr_destroy() will call list_remove()
+ * and decrement arcstat_l2_size.
+ */
+ arc_change_state(arc_anon, ab, hash_lock);
+ arc_hdr_destroy(ab);
+ } else {
+ /*
+ * Tell ARC this no longer exists in L2ARC.
+ */
+ if (ab->b_l2hdr != NULL) {
+ abl2 = ab->b_l2hdr;
+ ab->b_l2hdr = NULL;
+ kmem_free(abl2, sizeof (l2arc_buf_hdr_t));
+ ARCSTAT_INCR(arcstat_l2_size, -ab->b_size);
+ }
+ list_remove(buflist, ab);
+
+ /*
+ * This may have been left over after a
+ * failed write.
+ */
+ ab->b_flags &= ~ARC_L2_WRITING;
+
+ /*
+ * Invalidate issued or about to be issued
+ * reads, since we may be about to write
+ * over this location.
+ */
+ if (HDR_L2_READING(ab)) {
+ ARCSTAT_BUMP(arcstat_l2_evict_reading);
+ ab->b_flags |= ARC_L2_EVICTED;
+ }
+ }
+ mutex_exit(hash_lock);
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+
+ spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict));
+ dev->l2ad_evict = taddr;
+}
+
+/*
+ * Find and write ARC buffers to the L2ARC device.
+ *
+ * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid
+ * for reading until they have completed writing.
+ */
+static void
+l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev)
+{
+ arc_buf_hdr_t *ab, *ab_prev, *head;
+ l2arc_buf_hdr_t *hdrl2;
+ list_t *list;
+ uint64_t passed_sz, write_sz, buf_sz;
+ uint64_t target_sz = dev->l2ad_write;
+ uint64_t headroom = dev->l2ad_write * l2arc_headroom;
+ void *buf_data;
+ kmutex_t *hash_lock, *list_lock;
+ boolean_t have_lock, full;
+ l2arc_write_callback_t *cb;
+ zio_t *pio, *wzio;
+
+ ASSERT(MUTEX_HELD(&l2arc_dev_mtx));
+ ASSERT(dev->l2ad_vdev != NULL);
+
+ pio = NULL;
+ write_sz = 0;
+ full = B_FALSE;
+ head = kmem_cache_alloc(hdr_cache, KM_SLEEP);
+ head->b_flags |= ARC_L2_WRITE_HEAD;
+
+ /*
+ * Copy buffers for L2ARC writing.
+ */
+ mutex_enter(&l2arc_buflist_mtx);
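+ /*
+ * Walk the four lists returned by l2arc_list_locked() in
+ * priority order, until the target write size is reached or
+ * every list has been scanned to its headroom.
+ */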
+ for (int try = 0; try <= 3; try++) {
+ list = l2arc_list_locked(try, &list_lock);
+ passed_sz = 0;
+
+ for (ab = list_tail(list); ab; ab = ab_prev) {
+ ab_prev = list_prev(list, ab);
+
+ hash_lock = HDR_LOCK(ab);
+ have_lock = MUTEX_HELD(hash_lock);
+ if (!have_lock && !mutex_tryenter(hash_lock)) {
+ /*
+ * Skip this buffer rather than waiting.
+ */
+ continue;
+ }
+
+ passed_sz += ab->b_size;
+ if (passed_sz > headroom) {
+ /*
+ * Searched too far.
+ */
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (ab->b_spa != spa) {
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (ab->b_l2hdr != NULL) {
+ /*
+ * Already in L2ARC.
+ */
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (HDR_IO_IN_PROGRESS(ab) || HDR_DONT_L2CACHE(ab)) {
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if ((write_sz + ab->b_size) > target_sz) {
+ full = B_TRUE;
+ mutex_exit(hash_lock);
+ break;
+ }
+
+ if (ab->b_buf == NULL) {
+ DTRACE_PROBE1(l2arc__buf__null, void *, ab);
+ mutex_exit(hash_lock);
+ continue;
+ }
+
+ if (pio == NULL) {
+ /*
+ * Insert a dummy header on the buflist so
+ * l2arc_write_done() can find where the
+ * write buffers begin without searching.
+ */
+ list_insert_head(dev->l2ad_buflist, head);
+
+ cb = kmem_alloc(
+ sizeof (l2arc_write_callback_t), KM_SLEEP);
+ cb->l2wcb_dev = dev;
+ cb->l2wcb_head = head;
+ pio = zio_root(spa, l2arc_write_done, cb,
+ ZIO_FLAG_CANFAIL);
+ }
+
+ /*
+ * Create and add a new L2ARC header.
+ */
+ hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP);
+ hdrl2->b_dev = dev;
+ hdrl2->b_daddr = dev->l2ad_hand;
+
+ ab->b_flags |= ARC_L2_WRITING;
+ ab->b_l2hdr = hdrl2;
+ list_insert_head(dev->l2ad_buflist, ab);
+ buf_data = ab->b_buf->b_data;
+ buf_sz = ab->b_size;
+
+ /*
+ * Compute and store the buffer cksum before
+ * writing. On debug builds, the cksum is verified first.
+ */
+ arc_cksum_verify(ab->b_buf);
+ arc_cksum_compute(ab->b_buf, B_TRUE);
+
+ mutex_exit(hash_lock);
+
+ wzio = zio_write_phys(pio, dev->l2ad_vdev,
+ dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF,
+ NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_CANFAIL, B_FALSE);
+
+ DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev,
+ zio_t *, wzio);
+ (void) zio_nowait(wzio);
+
+ write_sz += buf_sz;
+ dev->l2ad_hand += buf_sz;
+ }
+
+ mutex_exit(list_lock);
+
+ if (full == B_TRUE)
+ break;
+ }
+ mutex_exit(&l2arc_buflist_mtx);
+
+ if (pio == NULL) {
+ ASSERT3U(write_sz, ==, 0);
+ kmem_cache_free(hdr_cache, head);
+ return;
+ }
+
+ ASSERT3U(write_sz, <=, target_sz);
+ ARCSTAT_BUMP(arcstat_l2_writes_sent);
+ ARCSTAT_INCR(arcstat_l2_size, write_sz);
+ spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz);
+
+ /*
+ * Bump device hand to the device start if it is approaching the end.
+ * l2arc_evict() will already have evicted ahead for this case.
+ */
+ if (dev->l2ad_hand >= (dev->l2ad_end - dev->l2ad_write)) {
+ spa_l2cache_space_update(dev->l2ad_vdev, 0,
+ dev->l2ad_end - dev->l2ad_hand);
+ dev->l2ad_hand = dev->l2ad_start;
+ dev->l2ad_evict = dev->l2ad_start;
+ dev->l2ad_first = B_FALSE;
+ }
+
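+ /*
+ * Wait for the queued device writes to complete; the
+ * l2arc_write_done() callback will have run by the time
+ * this returns.
+ */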
+ (void) zio_wait(pio);
+}
+
+/*
+ * This thread feeds the L2ARC at regular intervals. This is the beating
+ * heart of the L2ARC.
+ */
+static void
+l2arc_feed_thread(void)
+{
+ callb_cpr_t cpr;
+ l2arc_dev_t *dev;
+ spa_t *spa;
+ int interval;
+ boolean_t startup = B_TRUE;
+
+ CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG);
+
+ mutex_enter(&l2arc_feed_thr_lock);
+
+ while (l2arc_thread_exit == 0) {
+ /*
+ * Initially pause for L2ARC_FEED_DELAY seconds as a grace
+ * interval during boot, followed by l2arc_feed_secs seconds
+ * thereafter.
+ */
+ CALLB_CPR_SAFE_BEGIN(&cpr);
+ if (startup) {
+ interval = L2ARC_FEED_DELAY;
+ startup = B_FALSE;
+ } else {
+ interval = l2arc_feed_secs;
+ }
+ (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock,
+ lbolt + (hz * interval));
+ CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock);
+
+ /*
+ * Do nothing until L2ARC devices exist.
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ if (l2arc_ndev == 0) {
+ mutex_exit(&l2arc_dev_mtx);
+ continue;
+ }
+
+ /*
+ * Avoid contributing to memory pressure.
+ */
+ if (arc_reclaim_needed()) {
+ ARCSTAT_BUMP(arcstat_l2_abort_lowmem);
+ mutex_exit(&l2arc_dev_mtx);
+ continue;
+ }
+
+ /*
+ * This selects the next l2arc device to write to, and in
+ * doing so the next spa to feed from: dev->l2ad_spa.
+ */
+ if ((dev = l2arc_dev_get_next()) == NULL) {
+ mutex_exit(&l2arc_dev_mtx);
+ continue;
+ }
+ spa = dev->l2ad_spa;
+ ASSERT(spa != NULL);
+ ARCSTAT_BUMP(arcstat_l2_feeds);
+
+ /*
+ * Evict L2ARC buffers that will be overwritten.
+ */
+ l2arc_evict(dev, dev->l2ad_write, B_FALSE);
+
+ /*
+ * Write ARC buffers.
+ */
+ l2arc_write_buffers(spa, dev);
+ mutex_exit(&l2arc_dev_mtx);
+ }
+
+ l2arc_thread_exit = 0;
+ cv_broadcast(&l2arc_feed_thr_cv);
+ CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */
+ thread_exit();
+}
+
+/*
+ * Add a vdev for use by the L2ARC. By this point the spa has already
+ * validated the vdev and opened it.
+ */
+void
+l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end)
+{
+ l2arc_dev_t *adddev;
+
+ /*
+ * Create a new l2arc device entry.
+ */
+ adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP);
+ adddev->l2ad_spa = spa;
+ adddev->l2ad_vdev = vd;
+ adddev->l2ad_write = l2arc_write_max;
+ adddev->l2ad_start = start;
+ adddev->l2ad_end = end;
+ adddev->l2ad_hand = adddev->l2ad_start;
+ adddev->l2ad_evict = adddev->l2ad_start;
+ adddev->l2ad_first = B_TRUE;
+ ASSERT3U(adddev->l2ad_write, >, 0);
+
+ /*
+ * This is a list of all ARC buffers that are still valid on the
+ * device.
+ */
+ adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP);
+ list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t),
+ offsetof(arc_buf_hdr_t, b_l2node));
+
+ spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0);
+
+ /*
+ * Add device to global list
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ list_insert_head(l2arc_dev_list, adddev);
+ atomic_inc_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+}
+
+/*
+ * Remove a vdev from the L2ARC.
+ */
+void
+l2arc_remove_vdev(vdev_t *vd)
+{
+ l2arc_dev_t *dev, *nextdev, *remdev = NULL;
+
+ /*
+ * We can only grab the spa config lock when cache device writes
+ * complete.
+ */
+ ASSERT3U(l2arc_writes_sent, ==, l2arc_writes_done);
+
+ /*
+ * Find the device by vdev
+ */
+ mutex_enter(&l2arc_dev_mtx);
+ for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) {
+ nextdev = list_next(l2arc_dev_list, dev);
+ if (vd == dev->l2ad_vdev) {
+ remdev = dev;
+ break;
+ }
+ }
+ ASSERT(remdev != NULL);
+
+ /*
+ * Remove device from global list
+ */
+ list_remove(l2arc_dev_list, remdev);
+ l2arc_dev_last = NULL; /* may have been invalidated */
+
+ /*
+ * Clear all buflists and ARC references. This flushes the L2ARC device.
+ */
+ l2arc_evict(remdev, 0, B_TRUE);
+ list_destroy(remdev->l2ad_buflist);
+ kmem_free(remdev->l2ad_buflist, sizeof (list_t));
+ kmem_free(remdev, sizeof (l2arc_dev_t));
+
+ atomic_dec_64(&l2arc_ndev);
+ mutex_exit(&l2arc_dev_mtx);
+}
+
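+/*
+ * Initialize global L2ARC state and start the feed thread.
+ * Called once from dmu_init().
+ */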
+void
+l2arc_init()
+{
+ l2arc_thread_exit = 0;
+ l2arc_ndev = 0;
+ l2arc_writes_sent = 0;
+ l2arc_writes_done = 0;
+
+ mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+ l2arc_dev_list = &L2ARC_dev_list;
+ l2arc_free_on_write = &L2ARC_free_on_write;
+ list_create(l2arc_dev_list, sizeof (l2arc_dev_t),
+ offsetof(l2arc_dev_t, l2ad_node));
+ list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t),
+ offsetof(l2arc_data_free_t, l2df_list_node));
+
+ (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0,
+ TS_RUN, minclsyspri);
+}
+
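+/*
+ * Tear down global L2ARC state: signal the feed thread to exit and
+ * wait for it, then destroy the locks and lists. Called once from
+ * dmu_fini().
+ */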
+void
+l2arc_fini()
+{
+ mutex_enter(&l2arc_feed_thr_lock);
+ cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */
+ l2arc_thread_exit = 1;
+ while (l2arc_thread_exit != 0)
+ cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock);
+ mutex_exit(&l2arc_feed_thr_lock);
+
+ mutex_destroy(&l2arc_feed_thr_lock);
+ cv_destroy(&l2arc_feed_thr_cv);
+ mutex_destroy(&l2arc_dev_mtx);
+ mutex_destroy(&l2arc_buflist_mtx);
+ mutex_destroy(&l2arc_free_on_write_mtx);
+
+ list_destroy(l2arc_dev_list);
+ list_destroy(l2arc_free_on_write);
+}
diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c
index 0c0f7cb2d9..ef6ed22f85 100644
--- a/usr/src/uts/common/fs/zfs/dmu.c
+++ b/usr/src/uts/common/fs/zfs/dmu.c
@@ -1036,6 +1036,7 @@ dmu_init(void)
dbuf_init();
dnode_init();
arc_init();
+ l2arc_init();
}
void
@@ -1044,4 +1045,5 @@ dmu_fini(void)
arc_fini();
dnode_fini();
dbuf_fini();
+ l2arc_fini();
}
diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c
index b2840e4e87..589dc7e3de 100644
--- a/usr/src/uts/common/fs/zfs/metaslab.c
+++ b/usr/src/uts/common/fs/zfs/metaslab.c
@@ -341,7 +341,7 @@ metaslab_fini(metaslab_t *msp)
int t;
vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size,
- -msp->ms_smo.smo_alloc);
+ -msp->ms_smo.smo_alloc, B_TRUE);
metaslab_group_remove(mg, msp);
@@ -569,10 +569,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg)
space_map_create(&msp->ms_freemap[t], sm->sm_start,
sm->sm_size, sm->sm_shift, sm->sm_lock);
}
- vdev_space_update(vd, sm->sm_size, 0);
+ vdev_space_update(vd, sm->sm_size, 0, B_TRUE);
}
- vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc);
+ vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE);
ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0);
ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0);
diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c
index 8b0a936e0b..983e2c3154 100644
--- a/usr/src/uts/common/fs/zfs/spa.c
+++ b/usr/src/uts/common/fs/zfs/spa.c
@@ -56,6 +56,7 @@
#include <sys/dsl_prop.h>
#include <sys/dsl_synctask.h>
#include <sys/fs/zfs.h>
+#include <sys/arc.h>
#include <sys/callb.h>
#include <sys/systeminfo.h>
#include <sys/sunddi.h>
@@ -662,6 +663,11 @@ spa_unload(spa_t *spa)
spa_config_exit(spa, FTAG);
/*
+ * Drop and purge level 2 cache
+ */
+ spa_l2cache_drop(spa);
+
+ /*
* Close the dsl pool.
*/
if (spa->spa_dsl_pool) {
@@ -676,15 +682,28 @@ spa_unload(spa_t *spa)
vdev_free(spa->spa_root_vdev);
ASSERT(spa->spa_root_vdev == NULL);
- for (i = 0; i < spa->spa_nspares; i++)
- vdev_free(spa->spa_spares[i]);
- if (spa->spa_spares) {
- kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
- spa->spa_spares = NULL;
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
+ vdev_free(spa->spa_spares.sav_vdevs[i]);
+ if (spa->spa_spares.sav_vdevs) {
+ kmem_free(spa->spa_spares.sav_vdevs,
+ spa->spa_spares.sav_count * sizeof (void *));
+ spa->spa_spares.sav_vdevs = NULL;
}
- if (spa->spa_sparelist) {
- nvlist_free(spa->spa_sparelist);
- spa->spa_sparelist = NULL;
+ if (spa->spa_spares.sav_config) {
+ nvlist_free(spa->spa_spares.sav_config);
+ spa->spa_spares.sav_config = NULL;
+ }
+
+ for (i = 0; i < spa->spa_l2cache.sav_count; i++)
+ vdev_free(spa->spa_l2cache.sav_vdevs[i]);
+ if (spa->spa_l2cache.sav_vdevs) {
+ kmem_free(spa->spa_l2cache.sav_vdevs,
+ spa->spa_l2cache.sav_count * sizeof (void *));
+ spa->spa_l2cache.sav_vdevs = NULL;
+ }
+ if (spa->spa_l2cache.sav_config) {
+ nvlist_free(spa->spa_l2cache.sav_config);
+ spa->spa_l2cache.sav_config = NULL;
}
spa->spa_async_suspended = 0;
@@ -693,8 +712,8 @@ spa_unload(spa_t *spa)
/*
* Load (or re-load) the current list of vdevs describing the active spares for
* this pool. When this is called, we have some form of basic information in
- * 'spa_sparelist'. We parse this into vdevs, try to open them, and then
- * re-generate a more complete list including status information.
+ * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and
+ * then re-generate a more complete list including status information.
*/
static void
spa_load_spares(spa_t *spa)
@@ -707,8 +726,8 @@ spa_load_spares(spa_t *spa)
/*
* First, close and free any existing spare vdevs.
*/
- for (i = 0; i < spa->spa_nspares; i++) {
- vd = spa->spa_spares[i];
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
+ vd = spa->spa_spares.sav_vdevs[i];
/* Undo the call to spa_activate() below */
if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL &&
@@ -718,17 +737,18 @@ spa_load_spares(spa_t *spa)
vdev_free(vd);
}
- if (spa->spa_spares)
- kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *));
+ if (spa->spa_spares.sav_vdevs)
+ kmem_free(spa->spa_spares.sav_vdevs,
+ spa->spa_spares.sav_count * sizeof (void *));
- if (spa->spa_sparelist == NULL)
+ if (spa->spa_spares.sav_config == NULL)
nspares = 0;
else
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
- spa->spa_nspares = (int)nspares;
- spa->spa_spares = NULL;
+ spa->spa_spares.sav_count = (int)nspares;
+ spa->spa_spares.sav_vdevs = NULL;
if (nspares == 0)
return;
@@ -742,13 +762,14 @@ spa_load_spares(spa_t *spa)
* validate each vdev on the spare list. If the vdev also exists in the
* active configuration, then we also mark this vdev as an active spare.
*/
- spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP);
- for (i = 0; i < spa->spa_nspares; i++) {
+ spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < spa->spa_spares.sav_count; i++) {
VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
VDEV_ALLOC_SPARE) == 0);
ASSERT(vd != NULL);
- spa->spa_spares[i] = vd;
+ spa->spa_spares.sav_vdevs[i] = vd;
if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) {
if (!tvd->vdev_isspare)
@@ -775,25 +796,160 @@ spa_load_spares(spa_t *spa)
continue;
vd->vdev_top = vd;
- (void) vdev_validate_spare(vd);
+ if (vdev_validate_aux(vd) == 0)
+ spa_spare_add(vd);
}
/*
* Recompute the stashed list of spares, with status information
* this time.
*/
- VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
+ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
DATA_TYPE_NVLIST_ARRAY) == 0);
- spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP);
- for (i = 0; i < spa->spa_nspares; i++)
- spares[i] = vdev_config_generate(spa, spa->spa_spares[i],
- B_TRUE, B_TRUE);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
- spares, spa->spa_nspares) == 0);
- for (i = 0; i < spa->spa_nspares; i++)
+ spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
+ spares[i] = vdev_config_generate(spa,
+ spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
+ for (i = 0; i < spa->spa_spares.sav_count; i++)
nvlist_free(spares[i]);
- kmem_free(spares, spa->spa_nspares * sizeof (void *));
+ kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
+}
+
+/*
+ * Load (or re-load) the current list of vdevs describing the active l2cache for
+ * this pool. When this is called, we have some form of basic information in
+ * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and
+ * then re-generate a more complete list including status information.
+ * Devices which are already active have their details maintained, and are
+ * not re-opened.
+ */
+static void
+spa_load_l2cache(spa_t *spa)
+{
+ nvlist_t **l2cache;
+ uint_t nl2cache;
+ int i, j, oldnvdevs;
+ uint64_t guid;
+ vdev_t *vd, **oldvdevs, **newvdevs;
+ spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+ if (sav->sav_config != NULL) {
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
+ newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
+ } else {
+ nl2cache = 0;
+ }
+
+ oldvdevs = sav->sav_vdevs;
+ oldnvdevs = sav->sav_count;
+ sav->sav_vdevs = NULL;
+ sav->sav_count = 0;
+
+ /*
+ * Process new nvlist of vdevs.
+ */
+ for (i = 0; i < nl2cache; i++) {
+ VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
+ &guid) == 0);
+
+ newvdevs[i] = NULL;
+ for (j = 0; j < oldnvdevs; j++) {
+ vd = oldvdevs[j];
+ if (vd != NULL && guid == vd->vdev_guid) {
+ /*
+ * Retain previous vdev for add/remove ops.
+ */
+ newvdevs[i] = vd;
+ oldvdevs[j] = NULL;
+ break;
+ }
+ }
+
+ if (newvdevs[i] == NULL) {
+ /*
+ * Create new vdev
+ */
+ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
+ VDEV_ALLOC_L2CACHE) == 0);
+ ASSERT(vd != NULL);
+ newvdevs[i] = vd;
+
+ /*
+ * Commit this vdev as an l2cache device,
+ * even if it fails to open.
+ */
+ spa_l2cache_add(vd);
+
+ if (vdev_open(vd) != 0)
+ continue;
+
+ vd->vdev_top = vd;
+ (void) vdev_validate_aux(vd);
+
+ if (!vdev_is_dead(vd)) {
+ uint64_t size;
+ size = vdev_get_rsize(vd);
+ ASSERT3U(size, >, 0);
+ if (spa_mode & FWRITE) {
+ l2arc_add_vdev(spa, vd,
+ VDEV_LABEL_START_SIZE,
+ size - VDEV_LABEL_START_SIZE);
+ }
+ spa_l2cache_activate(vd);
+ }
+ }
+ }
+
+ /*
+ * Purge vdevs that were dropped
+ */
+ for (i = 0; i < oldnvdevs; i++) {
+ uint64_t pool;
+
+ vd = oldvdevs[i];
+ if (vd != NULL) {
+ if (spa_mode & FWRITE &&
+ spa_l2cache_exists(vd->vdev_guid, &pool) &&
+ pool != 0ULL) {
+ l2arc_remove_vdev(vd);
+ }
+ (void) vdev_close(vd);
+ spa_l2cache_remove(vd);
+ }
+ }
+
+ if (oldvdevs)
+ kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
+
+ if (sav->sav_config == NULL)
+ goto out;
+
+ sav->sav_vdevs = newvdevs;
+ sav->sav_count = (int)nl2cache;
+
+ /*
+ * Recompute the stashed list of l2cache devices, with status
+ * information this time.
+ */
+ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < sav->sav_count; i++)
+ l2cache[i] = vdev_config_generate(spa,
+ sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
+out:
+ for (i = 0; i < sav->sav_count; i++)
+ nvlist_free(l2cache[i]);
+ if (sav->sav_count)
+ kmem_free(l2cache, sav->sav_count * sizeof (void *));
}
static int
@@ -1090,7 +1246,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
* Load any hot spares for this pool.
*/
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
- DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object);
+ DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object);
if (error != 0 && error != ENOENT) {
vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
@@ -1099,8 +1255,8 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
}
if (error == 0) {
ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
- if (load_nvlist(spa, spa->spa_spares_object,
- &spa->spa_sparelist) != 0) {
+ if (load_nvlist(spa, spa->spa_spares.sav_object,
+ &spa->spa_spares.sav_config) != 0) {
vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
VDEV_AUX_CORRUPT_DATA);
error = EIO;
@@ -1112,6 +1268,34 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig)
spa_config_exit(spa, FTAG);
}
+ /*
+ * Load any level 2 ARC devices for this pool.
+ */
+ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+ DMU_POOL_L2CACHE, sizeof (uint64_t), 1,
+ &spa->spa_l2cache.sav_object);
+ if (error != 0 && error != ENOENT) {
+ vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+ if (error == 0) {
+ ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
+ if (load_nvlist(spa, spa->spa_l2cache.sav_object,
+ &spa->spa_l2cache.sav_config) != 0) {
+ vdev_set_state(rvd, B_TRUE,
+ VDEV_STATE_CANT_OPEN,
+ VDEV_AUX_CORRUPT_DATA);
+ error = EIO;
+ goto out;
+ }
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_load_l2cache(spa);
+ spa_config_exit(spa, FTAG);
+ }
+
spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
@@ -1372,6 +1556,9 @@ spa_inject_delref(spa_t *spa)
mutex_exit(&spa_namespace_lock);
}
+/*
+ * Add hot spare device information to the nvlist.
+ */
static void
spa_add_spares(spa_t *spa, nvlist_t *config)
{
@@ -1383,12 +1570,12 @@ spa_add_spares(spa_t *spa, nvlist_t *config)
uint_t vsc;
uint64_t pool;
- if (spa->spa_nspares == 0)
+ if (spa->spa_spares.sav_count == 0)
return;
VERIFY(nvlist_lookup_nvlist(config,
ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
if (nspares != 0) {
VERIFY(nvlist_add_nvlist_array(nvroot,
@@ -1415,6 +1602,62 @@ spa_add_spares(spa_t *spa, nvlist_t *config)
}
}
+/*
+ * Add l2cache device information to the nvlist, including vdev stats.
+ */
+static void
+spa_add_l2cache(spa_t *spa, nvlist_t *config)
+{
+ nvlist_t **l2cache;
+ uint_t i, j, nl2cache;
+ nvlist_t *nvroot;
+ uint64_t guid;
+ vdev_t *vd;
+ vdev_stat_t *vs;
+ uint_t vsc;
+
+ if (spa->spa_l2cache.sav_count == 0)
+ return;
+
+ spa_config_enter(spa, RW_READER, FTAG);
+
+ VERIFY(nvlist_lookup_nvlist(config,
+ ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
+ if (nl2cache != 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+ VERIFY(nvlist_lookup_nvlist_array(nvroot,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
+
+ /*
+ * Update level 2 cache device stats.
+ */
+
+ for (i = 0; i < nl2cache; i++) {
+ VERIFY(nvlist_lookup_uint64(l2cache[i],
+ ZPOOL_CONFIG_GUID, &guid) == 0);
+
+ vd = NULL;
+ for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
+ if (guid ==
+ spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
+ vd = spa->spa_l2cache.sav_vdevs[j];
+ break;
+ }
+ }
+ ASSERT(vd != NULL);
+
+ VERIFY(nvlist_lookup_uint64_array(l2cache[i],
+ ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0);
+ vdev_get_stats(vd, vs);
+ }
+ }
+
+ spa_config_exit(spa, FTAG);
+}
+
int
spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
{
@@ -1429,6 +1672,7 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
spa_get_errlog_size(spa)) == 0);
spa_add_spares(spa, *config);
+ spa_add_l2cache(spa, *config);
}
/*
@@ -1457,45 +1701,46 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
}
/*
- * Validate that the 'spares' array is well formed. We must have an array of
- * nvlists, each which describes a valid leaf vdev. If this is an import (mode
- * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long
- * as they are well-formed.
+ * Validate that the auxiliary device array is well formed. We must have an
+ * array of nvlists, each of which describes a valid leaf vdev. If this is an
+ * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
+ * specified, as long as they are well-formed.
*/
static int
-spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
+spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
+ spa_aux_vdev_t *sav, const char *config, uint64_t version,
+ vdev_labeltype_t label)
{
- nvlist_t **spares;
- uint_t i, nspares;
+ nvlist_t **dev;
+ uint_t i, ndev;
vdev_t *vd;
int error;
/*
- * It's acceptable to have no spares specified.
+ * It's acceptable to have no devs specified.
*/
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) != 0)
+ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
return (0);
- if (nspares == 0)
+ if (ndev == 0)
return (EINVAL);
/*
- * Make sure the pool is formatted with a version that supports hot
- * spares.
+ * Make sure the pool is formatted with a version that supports this
+ * device type.
*/
- if (spa_version(spa) < SPA_VERSION_SPARES)
+ if (spa_version(spa) < version)
return (ENOTSUP);
/*
- * Set the pending spare list so we correctly handle device in-use
+ * Set the pending device list so we correctly handle device in-use
* checking.
*/
- spa->spa_pending_spares = spares;
- spa->spa_pending_nspares = nspares;
+ sav->sav_pending = dev;
+ sav->sav_npending = ndev;
- for (i = 0; i < nspares; i++) {
- if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0,
+ for (i = 0; i < ndev; i++) {
+ if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
mode)) != 0)
goto out;
@@ -1505,29 +1750,127 @@ spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
goto out;
}
+ /*
+ * The L2ARC currently only supports disk devices.
+ */
+ if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
+ strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
+ error = ENOTBLK;
+ goto out;
+ }
+
vd->vdev_top = vd;
if ((error = vdev_open(vd)) == 0 &&
- (error = vdev_label_init(vd, crtxg,
- VDEV_LABEL_SPARE)) == 0) {
- VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID,
+ (error = vdev_label_init(vd, crtxg, label)) == 0) {
+ VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
vd->vdev_guid) == 0);
}
vdev_free(vd);
- if (error && mode != VDEV_ALLOC_SPARE)
+ if (error &&
+ (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
goto out;
else
error = 0;
}
out:
- spa->spa_pending_spares = NULL;
- spa->spa_pending_nspares = 0;
+ sav->sav_pending = NULL;
+ sav->sav_npending = 0;
return (error);
}
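+/*
+ * Validate both auxiliary device arrays: the hot spares and the
+ * level 2 cache devices.
+ */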
+static int
+spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
+{
+ int error;
+
+ if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
+ &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
+ VDEV_LABEL_SPARE)) != 0) {
+ return (error);
+ }
+
+ return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
+ &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
+ VDEV_LABEL_L2CACHE));
+}
+
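+/*
+ * Merge the list of new auxiliary devices ('devs') into the stashed
+ * config for 'config' (spares or l2cache), creating the config nvlist
+ * if none exists yet.
+ */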
+static void
+spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
+ const char *config)
+{
+ int i;
+
+ if (sav->sav_config != NULL) {
+ nvlist_t **olddevs;
+ uint_t oldndevs;
+ nvlist_t **newdevs;
+
+ /*
+ * Generate new dev list by concatenating with the
+ * current dev list.
+ */
+ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
+ &olddevs, &oldndevs) == 0);
+
+ newdevs = kmem_alloc(sizeof (void *) *
+ (ndevs + oldndevs), KM_SLEEP);
+ for (i = 0; i < oldndevs; i++)
+ VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
+ KM_SLEEP) == 0);
+ for (i = 0; i < ndevs; i++)
+ VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
+ KM_SLEEP) == 0);
+
+ VERIFY(nvlist_remove(sav->sav_config, config,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ config, newdevs, ndevs + oldndevs) == 0);
+ for (i = 0; i < oldndevs + ndevs; i++)
+ nvlist_free(newdevs[i]);
+ kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
+ } else {
+ /*
+ * Generate a new dev list.
+ */
+ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
+ devs, ndevs) == 0);
+ }
+}
+
+/*
+ * Stop and drop level 2 ARC devices
+ */
+void
+spa_l2cache_drop(spa_t *spa)
+{
+ vdev_t *vd;
+ int i;
+ spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+ for (i = 0; i < sav->sav_count; i++) {
+ uint64_t pool;
+
+ vd = sav->sav_vdevs[i];
+ ASSERT(vd != NULL);
+
+ if (spa_mode & FWRITE &&
+ spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL) {
+ l2arc_remove_vdev(vd);
+ }
+ if (vd->vdev_isl2cache)
+ spa_l2cache_remove(vd);
+ vdev_clear_stats(vd);
+ (void) vdev_close(vd);
+ }
+}
+
/*
* Pool Creation
*/
@@ -1542,8 +1885,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
dmu_tx_t *tx;
int c, error = 0;
uint64_t txg = TXG_INITIAL;
- nvlist_t **spares;
- uint_t nspares;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
uint64_t version;
/*
@@ -1594,7 +1937,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
if (error == 0 &&
(error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
- (error = spa_validate_spares(spa, nvroot, txg,
+ (error = spa_validate_aux(spa, nvroot, txg,
VDEV_ALLOC_ADD)) == 0) {
for (c = 0; c < rvd->vdev_children; c++)
vdev_init(rvd->vdev_child[c], txg);
@@ -1616,14 +1959,29 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
*/
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
&spares, &nspares) == 0) {
- VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME,
+ VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
spa_config_enter(spa, RW_WRITER, FTAG);
spa_load_spares(spa);
spa_config_exit(spa, FTAG);
- spa->spa_sync_spares = B_TRUE;
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+
+ /*
+ * Get the list of level 2 cache devices, if specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0) {
+ VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_load_l2cache(spa);
+ spa_config_exit(spa, FTAG);
+ spa->spa_l2cache.sav_sync = B_TRUE;
}
spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg);
@@ -1717,8 +2075,8 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
char *altroot = NULL;
int error;
nvlist_t *nvroot;
- nvlist_t **spares;
- uint_t nspares;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
/*
* If a pool with this name exists, return failure.
@@ -1749,18 +2107,24 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
* Toss any existing sparelist, as it doesn't have any validity anymore,
* and conflicts with spa_has_spare().
*/
- if (spa->spa_sparelist) {
- nvlist_free(spa->spa_sparelist);
- spa->spa_sparelist = NULL;
+ if (spa->spa_spares.sav_config) {
+ nvlist_free(spa->spa_spares.sav_config);
+ spa->spa_spares.sav_config = NULL;
spa_load_spares(spa);
}
+ if (spa->spa_l2cache.sav_config) {
+ nvlist_free(spa->spa_l2cache.sav_config);
+ spa->spa_l2cache.sav_config = NULL;
+ spa_load_l2cache(spa);
+ }
VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
&nvroot) == 0);
- if (error == 0) {
- error = spa_validate_spares(spa, nvroot, -1ULL,
- VDEV_ALLOC_SPARE);
- }
+ if (error == 0)
+ error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE);
+ if (error == 0)
+ error = spa_validate_aux(spa, nvroot, -1ULL,
+ VDEV_ALLOC_L2CACHE);
spa_config_exit(spa, FTAG);
if (error != 0 || (props && (error = spa_prop_set(spa, props)))) {
@@ -1772,23 +2136,38 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props)
}
/*
- * Override any spares as specified by the user, as these may have
- * correct device names/devids, etc.
+ * Override any spares and level 2 cache devices as specified by
+ * the user, as these may have correct device names/devids, etc.
*/
if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
&spares, &nspares) == 0) {
- if (spa->spa_sparelist)
- VERIFY(nvlist_remove(spa->spa_sparelist,
+ if (spa->spa_spares.sav_config)
+ VERIFY(nvlist_remove(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
else
- VERIFY(nvlist_alloc(&spa->spa_sparelist,
+ VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
+ VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
spa_config_enter(spa, RW_WRITER, FTAG);
spa_load_spares(spa);
spa_config_exit(spa, FTAG);
- spa->spa_sync_spares = B_TRUE;
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0) {
+ if (spa->spa_l2cache.sav_config)
+ VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
+ else
+ VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
+ NV_UNIQUE_NAME, KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
+ spa_config_enter(spa, RW_WRITER, FTAG);
+ spa_load_l2cache(spa);
+ spa_config_exit(spa, FTAG);
+ spa->spa_l2cache.sav_sync = B_TRUE;
}
/*
@@ -1857,9 +2236,10 @@ spa_tryimport(nvlist_t *tryconfig)
spa->spa_uberblock.ub_timestamp) == 0);
/*
- * Add the list of hot spares.
+ * Add the list of hot spares and level 2 cache devices.
*/
spa_add_spares(spa, config);
+ spa_add_l2cache(spa, config);
}
spa_unload(spa);
@@ -2014,8 +2394,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
int c, error;
vdev_t *rvd = spa->spa_root_vdev;
vdev_t *vd, *tvd;
- nvlist_t **spares;
- uint_t i, nspares;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
txg = spa_vdev_enter(spa);
@@ -2025,11 +2405,15 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
spa->spa_pending_vdev = vd;
- if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) != 0)
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
+ &nspares) != 0)
nspares = 0;
- if (vd->vdev_children == 0 && nspares == 0) {
+ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
+ &nl2cache) != 0)
+ nl2cache = 0;
+
+ if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) {
spa->spa_pending_vdev = NULL;
return (spa_vdev_exit(spa, vd, txg, EINVAL));
}
@@ -2042,11 +2426,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
}
/*
- * We must validate the spares after checking the children. Otherwise,
- * vdev_inuse() will blindly overwrite the spare.
+ * We must validate the spares and l2cache devices after checking the
+ * children. Otherwise, vdev_inuse() will blindly overwrite the spare.
*/
- if ((error = spa_validate_spares(spa, nvroot, txg,
- VDEV_ALLOC_ADD)) != 0) {
+ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) {
spa->spa_pending_vdev = NULL;
return (spa_vdev_exit(spa, vd, txg, error));
}
@@ -2065,43 +2448,17 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
}
if (nspares != 0) {
- if (spa->spa_sparelist != NULL) {
- nvlist_t **oldspares;
- uint_t oldnspares;
- nvlist_t **newspares;
-
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0);
-
- newspares = kmem_alloc(sizeof (void *) *
- (nspares + oldnspares), KM_SLEEP);
- for (i = 0; i < oldnspares; i++)
- VERIFY(nvlist_dup(oldspares[i],
- &newspares[i], KM_SLEEP) == 0);
- for (i = 0; i < nspares; i++)
- VERIFY(nvlist_dup(spares[i],
- &newspares[i + oldnspares],
- KM_SLEEP) == 0);
-
- VERIFY(nvlist_remove(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
-
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, newspares,
- nspares + oldnspares) == 0);
- for (i = 0; i < oldnspares + nspares; i++)
- nvlist_free(newspares[i]);
- kmem_free(newspares, (oldnspares + nspares) *
- sizeof (void *));
- } else {
- VERIFY(nvlist_alloc(&spa->spa_sparelist,
- NV_UNIQUE_NAME, KM_SLEEP) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
- }
-
+ spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
+ ZPOOL_CONFIG_SPARES);
spa_load_spares(spa);
- spa->spa_sync_spares = B_TRUE;
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+
+ if (nl2cache != 0) {
+ spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
+ ZPOOL_CONFIG_L2CACHE);
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
}
/*
@@ -2511,55 +2868,38 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done)
}
/*
- * Remove a device from the pool. Currently, this supports removing only hot
- * spares.
+ * Remove a spare vdev from the nvlist config.
*/
-int
-spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
+static int
+spa_remove_spares(spa_aux_vdev_t *sav, uint64_t guid, boolean_t unspare,
+ nvlist_t **spares, int nspares, vdev_t *vd)
{
- vdev_t *vd;
- nvlist_t **spares, *nv, **newspares;
- uint_t i, j, nspares;
- int ret = 0;
-
- spa_config_enter(spa, RW_WRITER, FTAG);
-
- vd = spa_lookup_by_guid(spa, guid);
+ nvlist_t *nv, **newspares;
+ int i, j;
nv = NULL;
- if (spa->spa_spares != NULL &&
- nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
- &spares, &nspares) == 0) {
- for (i = 0; i < nspares; i++) {
- uint64_t theguid;
+ for (i = 0; i < nspares; i++) {
+ uint64_t theguid;
- VERIFY(nvlist_lookup_uint64(spares[i],
- ZPOOL_CONFIG_GUID, &theguid) == 0);
- if (theguid == guid) {
- nv = spares[i];
- break;
- }
+ VERIFY(nvlist_lookup_uint64(spares[i],
+ ZPOOL_CONFIG_GUID, &theguid) == 0);
+ if (theguid == guid) {
+ nv = spares[i];
+ break;
}
}
/*
- * We only support removing a hot spare, and only if it's not currently
- * in use in this pool.
+ * Only remove the hot spare if it's not currently in use in this pool.
*/
- if (nv == NULL && vd == NULL) {
- ret = ENOENT;
- goto out;
- }
+ if (nv == NULL && vd == NULL)
+ return (ENOENT);
- if (nv == NULL && vd != NULL) {
- ret = ENOTSUP;
- goto out;
- }
+ if (nv == NULL && vd != NULL)
+ return (ENOTSUP);
- if (!unspare && nv != NULL && vd != NULL) {
- ret = EBUSY;
- goto out;
- }
+ if (!unspare && nv != NULL && vd != NULL)
+ return (EBUSY);
if (nspares == 1) {
newspares = NULL;
@@ -2573,20 +2913,119 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
}
}
- VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
+ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_SPARES,
DATA_TYPE_NVLIST_ARRAY) == 0);
- VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES,
- newspares, nspares - 1) == 0);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_SPARES, newspares, nspares - 1) == 0);
for (i = 0; i < nspares - 1; i++)
nvlist_free(newspares[i]);
kmem_free(newspares, (nspares - 1) * sizeof (void *));
- spa_load_spares(spa);
- spa->spa_sync_spares = B_TRUE;
+
+ return (0);
+}
+
+/*
+ * Remove an l2cache vdev from the nvlist config.
+ */
+static int
+spa_remove_l2cache(spa_aux_vdev_t *sav, uint64_t guid, nvlist_t **l2cache,
+ int nl2cache, vdev_t *vd)
+{
+ nvlist_t *nv, **newl2cache;
+ int i, j;
+
+ nv = NULL;
+ for (i = 0; i < nl2cache; i++) {
+ uint64_t theguid;
+
+ VERIFY(nvlist_lookup_uint64(l2cache[i],
+ ZPOOL_CONFIG_GUID, &theguid) == 0);
+ if (theguid == guid) {
+ nv = l2cache[i];
+ break;
+ }
+ }
+
+ if (vd == NULL) {
+ for (i = 0; i < nl2cache; i++) {
+ if (sav->sav_vdevs[i]->vdev_guid == guid) {
+ vd = sav->sav_vdevs[i];
+ break;
+ }
+ }
+ }
+
+ if (nv == NULL && vd == NULL)
+ return (ENOENT);
+
+ if (nv == NULL && vd != NULL)
+ return (ENOTSUP);
+
+ if (nl2cache == 1) {
+ newl2cache = NULL;
+ } else {
+ newl2cache = kmem_alloc((nl2cache - 1) * sizeof (void *),
+ KM_SLEEP);
+ for (i = 0, j = 0; i < nl2cache; i++) {
+ if (l2cache[i] != nv)
+ VERIFY(nvlist_dup(l2cache[i],
+ &newl2cache[j++], KM_SLEEP) == 0);
+ }
+ }
+
+ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
+ DATA_TYPE_NVLIST_ARRAY) == 0);
+ VERIFY(nvlist_add_nvlist_array(sav->sav_config,
+ ZPOOL_CONFIG_L2CACHE, newl2cache, nl2cache - 1) == 0);
+ for (i = 0; i < nl2cache - 1; i++)
+ nvlist_free(newl2cache[i]);
+ kmem_free(newl2cache, (nl2cache - 1) * sizeof (void *));
+
+ return (0);
+}
+
+/*
+ * Remove a device from the pool. Currently, this supports removing only hot
+ * spares and level 2 ARC devices.
+ */
+int
+spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
+{
+ vdev_t *vd;
+ nvlist_t **spares, **l2cache;
+ uint_t nspares, nl2cache;
+ int error = 0;
+
+ spa_config_enter(spa, RW_WRITER, FTAG);
+
+ vd = spa_lookup_by_guid(spa, guid);
+
+ if (spa->spa_spares.sav_vdevs != NULL &&
+ spa_spare_exists(guid, NULL) &&
+ nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
+ ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) {
+ if ((error = spa_remove_spares(&spa->spa_spares, guid, unspare,
+ spares, nspares, vd)) != 0)
+ goto out;
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ goto out;
+ }
+
+ if (spa->spa_l2cache.sav_vdevs != NULL &&
+ spa_l2cache_exists(guid, NULL) &&
+ nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
+ ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) {
+ if ((error = spa_remove_l2cache(&spa->spa_l2cache, guid,
+ l2cache, nl2cache, vd)) != 0)
+ goto out;
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
out:
spa_config_exit(spa, FTAG);
-
- return (ret);
+ return (error);
}
/*
@@ -2693,33 +3132,52 @@ spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
/*
- * Determine if this is a reference to a hot spare. In that
- * case, update the path as stored in the spare list.
+ * Determine if this is a reference to a hot spare or l2cache
+ * device. If it is, update the path as stored in their
+ * device list.
*/
- nvlist_t **spares;
- uint_t i, nspares;
- if (spa->spa_sparelist != NULL) {
- VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist,
- ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
+ nvlist_t **spares, **l2cache;
+ uint_t i, nspares, nl2cache;
+
+ if (spa->spa_spares.sav_config != NULL) {
+ VERIFY(nvlist_lookup_nvlist_array(
+ spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
+ &spares, &nspares) == 0);
for (i = 0; i < nspares; i++) {
uint64_t theguid;
VERIFY(nvlist_lookup_uint64(spares[i],
ZPOOL_CONFIG_GUID, &theguid) == 0);
- if (theguid == guid)
- break;
+ if (theguid == guid) {
+ VERIFY(nvlist_add_string(spares[i],
+ ZPOOL_CONFIG_PATH, newpath) == 0);
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ return (spa_vdev_exit(spa, NULL, txg,
+ 0));
+ }
}
+ }
- if (i == nspares)
- return (spa_vdev_exit(spa, NULL, txg, ENOENT));
-
- VERIFY(nvlist_add_string(spares[i],
- ZPOOL_CONFIG_PATH, newpath) == 0);
- spa_load_spares(spa);
- spa->spa_sync_spares = B_TRUE;
- return (spa_vdev_exit(spa, NULL, txg, 0));
- } else {
- return (spa_vdev_exit(spa, NULL, txg, ENOENT));
+ if (spa->spa_l2cache.sav_config != NULL) {
+ VERIFY(nvlist_lookup_nvlist_array(
+ spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache) == 0);
+ for (i = 0; i < nl2cache; i++) {
+ uint64_t theguid;
+ VERIFY(nvlist_lookup_uint64(l2cache[i],
+ ZPOOL_CONFIG_GUID, &theguid) == 0);
+ if (theguid == guid) {
+ VERIFY(nvlist_add_string(l2cache[i],
+ ZPOOL_CONFIG_PATH, newpath) == 0);
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ return (spa_vdev_exit(spa, NULL, txg,
+ 0));
+ }
+ }
}
+
+ return (spa_vdev_exit(spa, NULL, txg, ENOENT));
}
if (!vd->vdev_ops->vdev_op_leaf)
@@ -3338,50 +3796,49 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
}
static void
-spa_sync_spares(spa_t *spa, dmu_tx_t *tx)
+spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
+ const char *config, const char *entry)
{
nvlist_t *nvroot;
- nvlist_t **spares;
+ nvlist_t **list;
int i;
- if (!spa->spa_sync_spares)
+ if (!sav->sav_sync)
return;
/*
- * Update the MOS nvlist describing the list of available spares.
- * spa_validate_spares() will have already made sure this nvlist is
+ * Update the MOS nvlist describing the list of available devices.
+ * spa_validate_aux() will have already made sure this nvlist is
* valid and the vdevs are labeled appropriately.
*/
- if (spa->spa_spares_object == 0) {
- spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset,
- DMU_OT_PACKED_NVLIST, 1 << 14,
- DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
+ if (sav->sav_object == 0) {
+ sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
+ DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
+ sizeof (uint64_t), tx);
VERIFY(zap_update(spa->spa_meta_objset,
- DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES,
- sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0);
+ DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
+ &sav->sav_object, tx) == 0);
}
VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
- if (spa->spa_nspares == 0) {
- VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- NULL, 0) == 0);
+ if (sav->sav_count == 0) {
+ VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
} else {
- spares = kmem_alloc(spa->spa_nspares * sizeof (void *),
- KM_SLEEP);
- for (i = 0; i < spa->spa_nspares; i++)
- spares[i] = vdev_config_generate(spa,
- spa->spa_spares[i], B_FALSE, B_TRUE);
- VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
- spares, spa->spa_nspares) == 0);
- for (i = 0; i < spa->spa_nspares; i++)
- nvlist_free(spares[i]);
- kmem_free(spares, spa->spa_nspares * sizeof (void *));
+ list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
+ for (i = 0; i < sav->sav_count; i++)
+ list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
+ B_FALSE, B_FALSE, B_TRUE);
+ VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
+ sav->sav_count) == 0);
+ for (i = 0; i < sav->sav_count; i++)
+ nvlist_free(list[i]);
+ kmem_free(list, sav->sav_count * sizeof (void *));
}
- spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx);
+ spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
nvlist_free(nvroot);
- spa->spa_sync_spares = B_FALSE;
+ sav->sav_sync = B_FALSE;
}
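
Generalizing spa_sync_spares() into spa_sync_aux_dev(), keyed by a (config, entry) pair, means each auxiliary class supplies only its ZPOOL_CONFIG_* array name and DMU_POOL_* directory entry; the two call sites appear in spa_sync() below.
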
static void
@@ -3606,7 +4063,10 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_sync_pass++;
spa_sync_config_object(spa, tx);
- spa_sync_spares(spa, tx);
+ spa_sync_aux_dev(spa, &spa->spa_spares, tx,
+ ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
+ spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
+ ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
spa_errlog_sync(spa, txg);
dsl_pool_sync(dp, txg);
@@ -3806,15 +4266,15 @@ spa_has_spare(spa_t *spa, uint64_t guid)
{
int i;
uint64_t spareguid;
+ spa_aux_vdev_t *sav = &spa->spa_spares;
- for (i = 0; i < spa->spa_nspares; i++)
- if (spa->spa_spares[i]->vdev_guid == guid)
+ for (i = 0; i < sav->sav_count; i++)
+ if (sav->sav_vdevs[i]->vdev_guid == guid)
return (B_TRUE);
- for (i = 0; i < spa->spa_pending_nspares; i++) {
- if (nvlist_lookup_uint64(spa->spa_pending_spares[i],
- ZPOOL_CONFIG_GUID, &spareguid) == 0 &&
- spareguid == guid)
+ for (i = 0; i < sav->sav_npending; i++) {
+ if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
+ &spareguid) == 0 && spareguid == guid)
return (B_TRUE);
}
diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c
index 437ef3b0e7..17978ccc25 100644
--- a/usr/src/uts/common/fs/zfs/spa_config.c
+++ b/usr/src/uts/common/fs/zfs/spa_config.c
@@ -422,7 +422,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats)
vd = vd->vdev_top; /* label contains top config */
}
- nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE);
+ nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE);
VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
nvlist_free(nvroot);
diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c
index fd72f815e2..6aefb025fc 100644
--- a/usr/src/uts/common/fs/zfs/spa_misc.c
+++ b/usr/src/uts/common/fs/zfs/spa_misc.c
@@ -178,6 +178,8 @@ int spa_max_replication_override = SPA_DVAS_PER_BP;
static kmutex_t spa_spare_lock;
static avl_tree_t spa_spare_avl;
+static kmutex_t spa_l2cache_lock;
+static avl_tree_t spa_l2cache_avl;
kmem_cache_t *spa_buffer_pool;
int spa_mode;
@@ -406,11 +408,108 @@ spa_refcount_zero(spa_t *spa)
/*
* ==========================================================================
- * SPA spare tracking
+ * SPA spare and l2cache tracking
* ==========================================================================
*/
/*
+ * Hot spares and cache devices are tracked using the same code below,
+ * which treats both as generic 'auxiliary' devices.
+ */
+
+typedef struct spa_aux {
+ uint64_t aux_guid;
+ uint64_t aux_pool;
+ avl_node_t aux_avl;
+ int aux_count;
+} spa_aux_t;
+
+static int
+spa_aux_compare(const void *a, const void *b)
+{
+ const spa_aux_t *sa = a;
+ const spa_aux_t *sb = b;
+
+ if (sa->aux_guid < sb->aux_guid)
+ return (-1);
+ else if (sa->aux_guid > sb->aux_guid)
+ return (1);
+ else
+ return (0);
+}
+
+void
+spa_aux_add(vdev_t *vd, avl_tree_t *avl)
+{
+ avl_index_t where;
+ spa_aux_t search;
+ spa_aux_t *aux;
+
+ search.aux_guid = vd->vdev_guid;
+ if ((aux = avl_find(avl, &search, &where)) != NULL) {
+ aux->aux_count++;
+ } else {
+ aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
+ aux->aux_guid = vd->vdev_guid;
+ aux->aux_count = 1;
+ avl_insert(avl, aux, where);
+ }
+}
+
+void
+spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
+{
+ spa_aux_t search;
+ spa_aux_t *aux;
+ avl_index_t where;
+
+ search.aux_guid = vd->vdev_guid;
+ aux = avl_find(avl, &search, &where);
+
+ ASSERT(aux != NULL);
+
+ if (--aux->aux_count == 0) {
+ avl_remove(avl, aux);
+ kmem_free(aux, sizeof (spa_aux_t));
+ } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
+ aux->aux_pool = 0ULL;
+ }
+}
+
+boolean_t
+spa_aux_exists(uint64_t guid, uint64_t *pool, avl_tree_t *avl)
+{
+ spa_aux_t search, *found;
+ avl_index_t where;
+
+ search.aux_guid = guid;
+ found = avl_find(avl, &search, &where);
+
+ if (pool) {
+ if (found)
+ *pool = found->aux_pool;
+ else
+ *pool = 0ULL;
+ }
+
+ return (found != NULL);
+}
+
+void
+spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
+{
+ spa_aux_t search, *found;
+ avl_index_t where;
+
+ search.aux_guid = vd->vdev_guid;
+ found = avl_find(avl, &search, &where);
+ ASSERT(found != NULL);
+ ASSERT(found->aux_pool == 0ULL);
+
+ found->aux_pool = spa_guid(vd->vdev_spa);
+}
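
Together these helpers implement a reference-counted registry: spa_aux_add() allocates a node on first use and bumps aux_count thereafter, spa_aux_remove() frees the node when the count reaches zero, and aux_pool records which pool (if any) currently has the device active. A self-contained sketch of the same find-then-insert AVL idiom using the sys/avl.h interfaces; demo_aux_t and demo() are illustrative names only.

    #include <sys/avl.h>
    #include <stddef.h>
    #include <stdint.h>

    typedef struct demo_aux {
    	uint64_t	da_guid;
    	avl_node_t	da_link;
    } demo_aux_t;

    static int
    demo_compare(const void *a, const void *b)
    {
    	const demo_aux_t *da = a;
    	const demo_aux_t *db = b;

    	if (da->da_guid < db->da_guid)
    		return (-1);
    	if (da->da_guid > db->da_guid)
    		return (1);
    	return (0);
    }

    static void
    demo(void)
    {
    	avl_tree_t tree;
    	demo_aux_t node = { .da_guid = 0x1234 };
    	demo_aux_t search = { .da_guid = 0x1234 };
    	avl_index_t where;

    	avl_create(&tree, demo_compare, sizeof (demo_aux_t),
    	    offsetof(demo_aux_t, da_link));

    	/* Find-then-insert, as spa_aux_add() does. */
    	if (avl_find(&tree, &node, &where) == NULL)
    		avl_insert(&tree, &node, where);

    	/* Lookup by guid, as spa_aux_exists() does. */
    	if (avl_find(&tree, &search, NULL) != NULL)
    		avl_remove(&tree, &node);

    	avl_destroy(&tree);
    }
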
+
+/*
* Spares are tracked globally due to the following constraints:
*
* - A spare may be part of multiple pools.
@@ -432,73 +531,28 @@ spa_refcount_zero(spa_t *spa)
* be completely consistent with respect to other vdev configuration changes.
*/
-typedef struct spa_spare {
- uint64_t spare_guid;
- uint64_t spare_pool;
- avl_node_t spare_avl;
- int spare_count;
-} spa_spare_t;
-
static int
spa_spare_compare(const void *a, const void *b)
{
- const spa_spare_t *sa = a;
- const spa_spare_t *sb = b;
-
- if (sa->spare_guid < sb->spare_guid)
- return (-1);
- else if (sa->spare_guid > sb->spare_guid)
- return (1);
- else
- return (0);
+ return (spa_aux_compare(a, b));
}
void
spa_spare_add(vdev_t *vd)
{
- avl_index_t where;
- spa_spare_t search;
- spa_spare_t *spare;
-
mutex_enter(&spa_spare_lock);
ASSERT(!vd->vdev_isspare);
-
- search.spare_guid = vd->vdev_guid;
- if ((spare = avl_find(&spa_spare_avl, &search, &where)) != NULL) {
- spare->spare_count++;
- } else {
- spare = kmem_zalloc(sizeof (spa_spare_t), KM_SLEEP);
- spare->spare_guid = vd->vdev_guid;
- spare->spare_count = 1;
- avl_insert(&spa_spare_avl, spare, where);
- }
+ spa_aux_add(vd, &spa_spare_avl);
vd->vdev_isspare = B_TRUE;
-
mutex_exit(&spa_spare_lock);
}
void
spa_spare_remove(vdev_t *vd)
{
- spa_spare_t search;
- spa_spare_t *spare;
- avl_index_t where;
-
mutex_enter(&spa_spare_lock);
-
- search.spare_guid = vd->vdev_guid;
- spare = avl_find(&spa_spare_avl, &search, &where);
-
ASSERT(vd->vdev_isspare);
- ASSERT(spare != NULL);
-
- if (--spare->spare_count == 0) {
- avl_remove(&spa_spare_avl, spare);
- kmem_free(spare, sizeof (spa_spare_t));
- } else if (spare->spare_pool == spa_guid(vd->vdev_spa)) {
- spare->spare_pool = 0ULL;
- }
-
+ spa_aux_remove(vd, &spa_spare_avl);
vd->vdev_isspare = B_FALSE;
mutex_exit(&spa_spare_lock);
}
@@ -506,42 +560,81 @@ spa_spare_remove(vdev_t *vd)
boolean_t
spa_spare_exists(uint64_t guid, uint64_t *pool)
{
- spa_spare_t search, *found;
- avl_index_t where;
+ boolean_t found;
mutex_enter(&spa_spare_lock);
-
- search.spare_guid = guid;
- found = avl_find(&spa_spare_avl, &search, &where);
-
- if (pool) {
- if (found)
- *pool = found->spare_pool;
- else
- *pool = 0ULL;
- }
-
+ found = spa_aux_exists(guid, pool, &spa_spare_avl);
mutex_exit(&spa_spare_lock);
- return (found != NULL);
+ return (found);
}
void
spa_spare_activate(vdev_t *vd)
{
- spa_spare_t search, *found;
- avl_index_t where;
-
mutex_enter(&spa_spare_lock);
ASSERT(vd->vdev_isspare);
+ spa_aux_activate(vd, &spa_spare_avl);
+ mutex_exit(&spa_spare_lock);
+}
- search.spare_guid = vd->vdev_guid;
- found = avl_find(&spa_spare_avl, &search, &where);
- ASSERT(found != NULL);
- ASSERT(found->spare_pool == 0ULL);
+/*
+ * Level 2 ARC devices are tracked globally for the same reasons as spares.
+ * Cache devices currently support only one pool per cache device, so for
+ * these devices the aux reference count never rises above 1.
+ */
- found->spare_pool = spa_guid(vd->vdev_spa);
- mutex_exit(&spa_spare_lock);
+static int
+spa_l2cache_compare(const void *a, const void *b)
+{
+ return (spa_aux_compare(a, b));
+}
+
+void
+spa_l2cache_add(vdev_t *vd)
+{
+ mutex_enter(&spa_l2cache_lock);
+ ASSERT(!vd->vdev_isl2cache);
+ spa_aux_add(vd, &spa_l2cache_avl);
+ vd->vdev_isl2cache = B_TRUE;
+ mutex_exit(&spa_l2cache_lock);
+}
+
+void
+spa_l2cache_remove(vdev_t *vd)
+{
+ mutex_enter(&spa_l2cache_lock);
+ ASSERT(vd->vdev_isl2cache);
+ spa_aux_remove(vd, &spa_l2cache_avl);
+ vd->vdev_isl2cache = B_FALSE;
+ mutex_exit(&spa_l2cache_lock);
+}
+
+boolean_t
+spa_l2cache_exists(uint64_t guid, uint64_t *pool)
+{
+ boolean_t found;
+
+ mutex_enter(&spa_l2cache_lock);
+ found = spa_aux_exists(guid, pool, &spa_l2cache_avl);
+ mutex_exit(&spa_l2cache_lock);
+
+ return (found);
+}
+
+void
+spa_l2cache_activate(vdev_t *vd)
+{
+ mutex_enter(&spa_l2cache_lock);
+ ASSERT(vd->vdev_isl2cache);
+ spa_aux_activate(vd, &spa_l2cache_avl);
+ mutex_exit(&spa_l2cache_lock);
+}
+
+void
+spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc)
+{
+ vdev_space_update(vd, space, alloc, B_FALSE);
}
/*
@@ -1078,13 +1171,17 @@ spa_init(int mode)
{
mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL);
avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t),
offsetof(spa_t, spa_avl));
- avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_spare_t),
- offsetof(spa_spare_t, spare_avl));
+ avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t),
+ offsetof(spa_aux_t, aux_avl));
+
+ avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t),
+ offsetof(spa_aux_t, aux_avl));
spa_mode = mode;
@@ -1111,10 +1208,12 @@ spa_fini(void)
avl_destroy(&spa_namespace_avl);
avl_destroy(&spa_spare_avl);
+ avl_destroy(&spa_l2cache_avl);
cv_destroy(&spa_namespace_cv);
mutex_destroy(&spa_namespace_lock);
mutex_destroy(&spa_spare_lock);
+ mutex_destroy(&spa_l2cache_lock);
}
/*
diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h
index f6607060b0..aa3932690f 100644
--- a/usr/src/uts/common/fs/zfs/sys/arc.h
+++ b/usr/src/uts/common/fs/zfs/sys/arc.h
@@ -36,6 +36,7 @@ extern "C" {
#include <sys/zio.h>
#include <sys/dmu.h>
+#include <sys/spa.h>
typedef struct arc_buf_hdr arc_buf_hdr_t;
typedef struct arc_buf arc_buf_t;
@@ -106,6 +107,15 @@ int arc_tempreserve_space(uint64_t tempreserve);
void arc_init(void);
void arc_fini(void);
+/*
+ * Level 2 ARC
+ */
+
+void l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end);
+void l2arc_remove_vdev(vdev_t *vd);
+void l2arc_init(void);
+void l2arc_fini(void);
+
#ifdef __cplusplus
}
#endif
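
A hypothetical call sequence for this interface (the commit's actual call sites are in spa.c and arc.c, not shown in this excerpt): register the usable region of a cache device with the L2ARC, then deregister it on removal. Skipping the front and back label areas is an assumption of this sketch.

    	l2arc_add_vdev(spa, vd, VDEV_LABEL_START_SIZE,
    	    vd->vdev_psize - VDEV_LABEL_END_SIZE);
    	/* ... vd absorbs evicted ARC buffers as second-tier cache ... */
    	l2arc_remove_vdev(vd);
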
diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h
index 0a24df2129..f59aadf50a 100644
--- a/usr/src/uts/common/fs/zfs/sys/dmu.h
+++ b/usr/src/uts/common/fs/zfs/sys/dmu.h
@@ -200,6 +200,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr);
#define DMU_POOL_DEFLATE "deflate"
#define DMU_POOL_HISTORY "history"
#define DMU_POOL_PROPS "pool_props"
+#define DMU_POOL_L2CACHE "l2cache"
/*
* Allocate an object from this objset. The range of object numbers
diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h
index 82798cf07f..dd723df0c9 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa.h
@@ -47,6 +47,7 @@ typedef struct vdev vdev_t;
typedef struct metaslab metaslab_t;
typedef struct zilog zilog_t;
typedef struct traverse_handle traverse_handle_t;
+typedef struct spa_aux_vdev spa_aux_vdev_t;
struct dsl_pool;
/*
@@ -356,6 +357,14 @@ extern void spa_spare_remove(vdev_t *vd);
extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool);
extern void spa_spare_activate(vdev_t *vd);
+/* L2ARC state (which is global across all pools) */
+extern void spa_l2cache_add(vdev_t *vd);
+extern void spa_l2cache_remove(vdev_t *vd);
+extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
+extern void spa_l2cache_activate(vdev_t *vd);
+extern void spa_l2cache_drop(spa_t *spa);
+extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc);
+
/* scrubbing */
extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force);
extern void spa_scrub_suspend(spa_t *spa);
diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
index 6c4559fce2..eb2b6d6289 100644
--- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h
@@ -58,6 +58,16 @@ typedef struct spa_history_phys {
uint64_t sh_records_lost; /* num of records overwritten */
} spa_history_phys_t;
+struct spa_aux_vdev {
+ uint64_t sav_object; /* MOS object for device list */
+ nvlist_t *sav_config; /* cached device config */
+ vdev_t **sav_vdevs; /* devices */
+ int sav_count; /* number of devices */
+ boolean_t sav_sync; /* sync the device list */
+ nvlist_t **sav_pending; /* pending device additions */
+ uint_t sav_npending; /* # pending devices */
+};
+
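
This aggregate collects the five per-spare fields deleted from struct spa below into one reusable bundle; struct spa now instantiates it twice, as spa_spares and spa_l2cache, so hot spares and cache devices share identical bookkeeping.
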
struct spa {
/*
* Fields protected by spa_namespace_lock.
@@ -87,11 +97,8 @@ struct spa {
vdev_t *spa_root_vdev; /* top-level vdev container */
uint64_t spa_load_guid; /* initial guid for spa_load */
list_t spa_dirty_list; /* vdevs with dirty labels */
- uint64_t spa_spares_object; /* MOS object for spare list */
- nvlist_t *spa_sparelist; /* cached spare config */
- vdev_t **spa_spares; /* available hot spares */
- int spa_nspares; /* number of hot spares */
- boolean_t spa_sync_spares; /* sync the spares list */
+ spa_aux_vdev_t spa_spares; /* hot spares */
+ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */
uint64_t spa_config_object; /* MOS object for pool config */
uint64_t spa_syncing_txg; /* txg currently syncing */
uint64_t spa_sync_bplist_obj; /* object for deferred frees */
@@ -134,8 +141,6 @@ struct spa {
uint64_t spa_history; /* history object */
kmutex_t spa_history_lock; /* history lock */
vdev_t *spa_pending_vdev; /* pending vdev additions */
- nvlist_t **spa_pending_spares; /* pending spare additions */
- uint_t spa_pending_nspares; /* # pending spares */
kmutex_t spa_props_lock; /* property lock */
uint64_t spa_pool_props_object; /* object for properties */
uint64_t spa_bootfs; /* default boot filesystem */
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h
index dced3da5ff..b1ec648056 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev.h
@@ -53,7 +53,7 @@ extern void vdev_close(vdev_t *);
extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace);
extern void vdev_init(vdev_t *, uint64_t txg);
extern void vdev_reopen(vdev_t *);
-extern int vdev_validate_spare(vdev_t *);
+extern int vdev_validate_aux(vdev_t *vd);
extern int vdev_probe(vdev_t *);
extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev);
@@ -69,6 +69,7 @@ extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg);
extern void vdev_metaslab_fini(vdev_t *vd);
extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs);
+extern void vdev_clear_stats(vdev_t *vd);
extern void vdev_stat_update(zio_t *zio);
extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type,
boolean_t complete);
@@ -78,7 +79,7 @@ extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state,
vdev_aux_t aux);
extern void vdev_space_update(vdev_t *vd, int64_t space_delta,
- int64_t alloc_delta);
+ int64_t alloc_delta, boolean_t update_root);
extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
@@ -113,7 +114,7 @@ extern void vdev_config_clean(vdev_t *vd);
extern int vdev_config_sync(vdev_t *vd, uint64_t txg);
extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
- boolean_t getstats, boolean_t isspare);
+ boolean_t getstats, boolean_t isspare, boolean_t isl2cache);
/*
* Label routines
@@ -127,7 +128,8 @@ typedef enum {
VDEV_LABEL_CREATE, /* create/add a new device */
VDEV_LABEL_REPLACE, /* replace an existing device */
VDEV_LABEL_SPARE, /* add a new hot spare */
- VDEV_LABEL_REMOVE /* remove an existing device */
+ VDEV_LABEL_REMOVE, /* remove an existing device */
+ VDEV_LABEL_L2CACHE /* add an L2ARC cache device */
} vdev_labeltype_t;
extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
index 6fa21e83b0..2eebbba566 100644
--- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
+++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h
@@ -168,6 +168,7 @@ struct vdev {
uint8_t vdev_tmpoffline; /* device taken offline temporarily? */
uint8_t vdev_detached; /* device detached? */
uint64_t vdev_isspare; /* was a hot spare */
+ uint64_t vdev_isl2cache; /* was an l2cache device */
vdev_queue_t vdev_queue; /* I/O deadline schedule queue */
vdev_cache_t vdev_cache; /* physical block cache */
uint64_t vdev_not_present; /* not present during import */
@@ -249,6 +250,7 @@ typedef struct vdev_label {
#define VDEV_ALLOC_LOAD 0
#define VDEV_ALLOC_ADD 1
#define VDEV_ALLOC_SPARE 2
+#define VDEV_ALLOC_L2CACHE 3
/*
* Allocate or free a vdev
diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h
index d2949884af..4591274518 100644
--- a/usr/src/uts/common/fs/zfs/sys/zio.h
+++ b/usr/src/uts/common/fs/zfs/sys/zio.h
@@ -304,11 +304,13 @@ extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *private, int priority, int flags);
+ zio_done_func_t *done, void *private, int priority, int flags,
+ boolean_t labels);
extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset,
uint64_t size, void *data, int checksum,
- zio_done_func_t *done, void *private, int priority, int flags);
+ zio_done_func_t *done, void *private, int priority, int flags,
+ boolean_t labels);
extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp,
blkptr_t *old_bp, uint64_t txg);
diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c
index e7810dd340..2a2dc1d625 100644
--- a/usr/src/uts/common/fs/zfs/vdev.c
+++ b/usr/src/uts/common/fs/zfs/vdev.c
@@ -363,6 +363,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
} else if (alloctype == VDEV_ALLOC_SPARE) {
if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
return (EINVAL);
+ } else if (alloctype == VDEV_ALLOC_L2CACHE) {
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0)
+ return (EINVAL);
}
/*
@@ -550,6 +553,8 @@ vdev_free(vdev_t *vd)
if (vd->vdev_isspare)
spa_spare_remove(vd);
+ if (vd->vdev_isl2cache)
+ spa_l2cache_remove(vd);
txg_list_destroy(&vd->vdev_ms_list);
txg_list_destroy(&vd->vdev_dtl_list);
@@ -1367,14 +1372,14 @@ vdev_load(vdev_t *vd)
}
/*
- * This special case of vdev_spare() is used for hot spares. It's sole purpose
- * it to set the vdev state for the associated vdev. To do this, we make sure
- * that we can open the underlying device, then try to read the label, and make
- * sure that the label is sane and that it hasn't been repurposed to another
- * pool.
+ * This special case is used for hot spares and l2cache devices. Its sole
+ * purpose is to set the vdev state for the associated vdev. To do this,
+ * we make sure that we can open the underlying device, then try to read the
+ * label, and make sure that the label is sane and that it hasn't been
+ * repurposed to another pool.
*/
int
-vdev_validate_spare(vdev_t *vd)
+vdev_validate_aux(vdev_t *vd)
{
nvlist_t *label;
uint64_t guid, version;
@@ -1397,8 +1402,6 @@ vdev_validate_spare(vdev_t *vd)
return (-1);
}
- spa_spare_add(vd);
-
/*
* We don't actually check the pool state here. If it's in fact in
* use by another pool, we update this fact on the fly when requested.
@@ -1855,6 +1858,16 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs)
}
void
+vdev_clear_stats(vdev_t *vd)
+{
+ mutex_enter(&vd->vdev_stat_lock);
+ vd->vdev_stat.vs_space = 0;
+ vd->vdev_stat.vs_dspace = 0;
+ vd->vdev_stat.vs_alloc = 0;
+ mutex_exit(&vd->vdev_stat_lock);
+}
+
+void
vdev_stat_update(zio_t *zio)
{
vdev_t *vd = zio->io_vd;
@@ -1952,15 +1965,14 @@ vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete)
* Update the in-core space usage stats for this vdev and the root vdev.
*/
void
-vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta)
+vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta,
+ boolean_t update_root)
{
int64_t dspace_delta = space_delta;
spa_t *spa = vd->vdev_spa;
vdev_t *rvd = spa->spa_root_vdev;
ASSERT(vd == vd->vdev_top);
- ASSERT(rvd == vd->vdev_parent);
- ASSERT(vd->vdev_ms_count != 0);
/*
* Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion
@@ -1978,18 +1990,23 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta)
vd->vdev_stat.vs_dspace += dspace_delta;
mutex_exit(&vd->vdev_stat_lock);
- /*
- * Don't count non-normal (e.g. intent log) space as part of
- * the pool's capacity.
- */
- if (vd->vdev_mg->mg_class != spa->spa_normal_class)
- return;
+ if (update_root) {
+ ASSERT(rvd == vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ /*
+ * Don't count non-normal (e.g. intent log) space as part of
+ * the pool's capacity.
+ */
+ if (vd->vdev_mg->mg_class != spa->spa_normal_class)
+ return;
- mutex_enter(&rvd->vdev_stat_lock);
- rvd->vdev_stat.vs_space += space_delta;
- rvd->vdev_stat.vs_alloc += alloc_delta;
- rvd->vdev_stat.vs_dspace += dspace_delta;
- mutex_exit(&rvd->vdev_stat_lock);
+ mutex_enter(&rvd->vdev_stat_lock);
+ rvd->vdev_stat.vs_space += space_delta;
+ rvd->vdev_stat.vs_alloc += alloc_delta;
+ rvd->vdev_stat.vs_dspace += dspace_delta;
+ mutex_exit(&rvd->vdev_stat_lock);
+ }
}
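
The new update_root argument lets auxiliary devices account space against themselves only: cache devices are not children of the root vdev (which is why the two ASSERTs moved under update_root), and spa_l2cache_space_update() above passes B_FALSE accordingly.
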
/*
diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c
index 070444a093..b6688ae69d 100644
--- a/usr/src/uts/common/fs/zfs/vdev_label.c
+++ b/usr/src/uts/common/fs/zfs/vdev_label.c
@@ -169,7 +169,8 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
vdev_label_offset(vd->vdev_psize, l, offset),
size, buf, ZIO_CHECKSUM_LABEL, done, private,
ZIO_PRIORITY_SYNC_READ,
- ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE));
+ ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
+ B_TRUE));
}
static void
@@ -181,7 +182,8 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
zio_nowait(zio_write_phys(zio, vd,
vdev_label_offset(vd->vdev_psize, l, offset),
size, buf, ZIO_CHECKSUM_LABEL, done, private,
- ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL));
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL,
+ B_TRUE));
}
/*
@@ -189,7 +191,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
*/
nvlist_t *
vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
- boolean_t isspare)
+ boolean_t isspare, boolean_t isl2cache)
{
nvlist_t *nv = NULL;
@@ -197,7 +199,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
vd->vdev_ops->vdev_op_type) == 0);
- if (!isspare)
+ if (!isspare && !isl2cache)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id)
== 0);
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0);
@@ -245,7 +247,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
if (vd->vdev_isspare)
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0);
- if (!isspare && vd == vd->vdev_top) {
+ if (!isspare && !isl2cache && vd == vd->vdev_top) {
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
vd->vdev_ms_array) == 0);
VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
@@ -278,7 +280,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
for (c = 0; c < vd->vdev_children; c++)
child[c] = vdev_config_generate(spa, vd->vdev_child[c],
- getstats, isspare);
+ getstats, isspare, isl2cache);
VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
child, vd->vdev_children) == 0);
@@ -357,7 +359,7 @@ vdev_label_read_config(vdev_t *vd)
*/
static boolean_t
vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
- uint64_t *spare_guid)
+ uint64_t *spare_guid, uint64_t *l2cache_guid)
{
spa_t *spa = vd->vdev_spa;
uint64_t state, pool_guid, device_guid, txg, spare_pool;
@@ -366,6 +368,8 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
if (spare_guid)
*spare_guid = 0ULL;
+ if (l2cache_guid)
+ *l2cache_guid = 0ULL;
/*
* Read the label, if any, and perform some basic sanity checks.
@@ -384,7 +388,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
return (B_FALSE);
}
- if (state != POOL_STATE_SPARE &&
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
&pool_guid) != 0 ||
nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
@@ -400,9 +404,10 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
* be a part of. The only way this is allowed is if the device is a hot
* spare (which we check for later on).
*/
- if (state != POOL_STATE_SPARE &&
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
!spa_guid_exists(pool_guid, device_guid) &&
- !spa_spare_exists(device_guid, NULL))
+ !spa_spare_exists(device_guid, NULL) &&
+ !spa_l2cache_exists(device_guid, NULL))
return (B_FALSE);
/*
@@ -412,13 +417,14 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
* user has attempted to add the same vdev multiple times in the same
* transaction.
*/
- if (state != POOL_STATE_SPARE && txg == 0 && vdtxg == crtxg)
+ if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+ txg == 0 && vdtxg == crtxg)
return (B_TRUE);
/*
* Check to see if this is a spare device. We do an explicit check for
* spa_has_spare() here because it may be on our pending list of spares
- * to add.
+ * to add. We also check if it is an l2cache device.
*/
if (spa_spare_exists(device_guid, &spare_pool) ||
spa_has_spare(spa, device_guid)) {
@@ -427,6 +433,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
switch (reason) {
case VDEV_LABEL_CREATE:
+ case VDEV_LABEL_L2CACHE:
return (B_TRUE);
case VDEV_LABEL_REPLACE:
@@ -439,6 +446,12 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
}
/*
+ * Check to see if this is an l2cache device.
+ */
+ if (spa_l2cache_exists(device_guid, NULL))
+ return (B_TRUE);
+
+ /*
* If the device is marked ACTIVE, then this device is in use by another
* pool on the system.
*/
@@ -466,7 +479,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
char *buf;
size_t buflen;
int error;
- uint64_t spare_guid;
+ uint64_t spare_guid, l2cache_guid;
ASSERT(spa_config_held(spa, RW_WRITER));
@@ -488,19 +501,20 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
* Determine if the vdev is in use.
*/
if (reason != VDEV_LABEL_REMOVE &&
- vdev_inuse(vd, crtxg, reason, &spare_guid))
+ vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
return (EBUSY);
ASSERT(reason != VDEV_LABEL_REMOVE ||
- vdev_inuse(vd, crtxg, reason, NULL));
+ vdev_inuse(vd, crtxg, reason, NULL, NULL));
/*
- * If this is a request to add or replace a spare that is in use
- * elsewhere on the system, then we must update the guid (which was
- * initialized to a random value) to reflect the actual GUID (which is
- * shared between multiple pools).
+ * If this is a request to add or replace a spare or l2cache device
+ * that is in use elsewhere on the system, then we must update the
+ * guid (which was initialized to a random value) to reflect the
+ * actual GUID (which is shared between multiple pools).
*/
- if (reason != VDEV_LABEL_REMOVE && spare_guid != 0ULL) {
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE &&
+ spare_guid != 0ULL) {
vdev_t *pvd = vd->vdev_parent;
for (; pvd != NULL; pvd = pvd->vdev_parent) {
@@ -520,6 +534,27 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
ASSERT(reason == VDEV_LABEL_REPLACE);
}
+ if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
+ l2cache_guid != 0ULL) {
+ vdev_t *pvd = vd->vdev_parent;
+
+ for (; pvd != NULL; pvd = pvd->vdev_parent) {
+ pvd->vdev_guid_sum -= vd->vdev_guid;
+ pvd->vdev_guid_sum += l2cache_guid;
+ }
+
+ vd->vdev_guid = vd->vdev_guid_sum = l2cache_guid;
+
+ /*
+ * If this is a replacement, then we want to fall through to the
+ * rest of the code. If we're adding an l2cache, then it's
+ * already labeled appropriately and we can just return.
+ */
+ if (reason == VDEV_LABEL_L2CACHE)
+ return (0);
+ ASSERT(reason == VDEV_LABEL_REPLACE);
+ }
+
/*
* Initialize its label.
*/
@@ -549,6 +584,19 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
POOL_STATE_SPARE) == 0);
VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
vd->vdev_guid) == 0);
+ } else if (reason == VDEV_LABEL_L2CACHE ||
+ (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) {
+ /*
+ * For level 2 ARC devices, add a special label.
+ */
+ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
+ spa_version(spa)) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+ POOL_STATE_L2CACHE) == 0);
+ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
+ vd->vdev_guid) == 0);
} else {
label = spa_config_generate(spa, vd, 0ULL, B_FALSE);
@@ -623,13 +671,19 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
/*
* If this vdev hasn't been previously identified as a spare, then we
* mark it as such only if a) we are labeling it as a spare, or b) it
- * exists as a spare elsewhere in the system.
+ * exists as a spare elsewhere in the system. Do the same for
+ * level 2 ARC devices.
*/
if (error == 0 && !vd->vdev_isspare &&
(reason == VDEV_LABEL_SPARE ||
spa_spare_exists(vd->vdev_guid, NULL)))
spa_spare_add(vd);
+ if (error == 0 && !vd->vdev_isl2cache &&
+ (reason == VDEV_LABEL_L2CACHE ||
+ spa_l2cache_exists(vd->vdev_guid, NULL)))
+ spa_l2cache_add(vd);
+
return (error);
}
diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
index fa2f71308f..e46af7a3e6 100644
--- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c
+++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c
@@ -963,23 +963,30 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc)
{
spa_t *spa;
int error;
- nvlist_t *config;
+ nvlist_t *config, **l2cache;
+ uint_t nl2cache;
error = spa_open(zc->zc_name, &spa, FTAG);
if (error != 0)
return (error);
+ error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
+ &config);
+ (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE,
+ &l2cache, &nl2cache);
+
/*
* A root pool with concatenated devices is not supported.
* Thus, can not add a device to a root pool with one device.
+ * Allow for l2cache devices to be added.
*/
- if (spa->spa_root_vdev->vdev_children == 1 && spa->spa_bootfs != 0) {
+ if (spa->spa_root_vdev->vdev_children == 1 && spa->spa_bootfs != 0 &&
+ nl2cache == 0) {
spa_close(spa, FTAG);
return (EDOM);
}
- if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
- &config)) == 0) {
+ if (error == 0) {
error = spa_vdev_add(spa, config);
nvlist_free(config);
}
@@ -2573,9 +2580,26 @@ zfs_ioc_clear(zfs_cmd_t *zc)
if (zc->zc_guid == 0) {
vd = NULL;
} else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) {
- (void) spa_vdev_exit(spa, NULL, txg, ENODEV);
- spa_close(spa, FTAG);
- return (ENODEV);
+ spa_aux_vdev_t *sav;
+ int i;
+
+ /*
+ * Check if this is an l2cache device.
+ */
+ ASSERT(spa != NULL);
+ sav = &spa->spa_l2cache;
+ for (i = 0; i < sav->sav_count; i++) {
+ if (sav->sav_vdevs[i]->vdev_guid == zc->zc_guid) {
+ vd = sav->sav_vdevs[i];
+ break;
+ }
+ }
+
+ if (vd == NULL) {
+ (void) spa_vdev_exit(spa, NULL, txg, ENODEV);
+ spa_close(spa, FTAG);
+ return (ENODEV);
+ }
}
vdev_clear(spa, vd, B_TRUE);
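
Cache devices hang off spa_l2cache rather than the pool's vdev tree, so spa_lookup_by_guid() does not find them; the explicit scan of sav_vdevs above gives 'zpool clear' a way to reach them as well.
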
diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c
index fcce8fa65e..112aaa6f25 100644
--- a/usr/src/uts/common/fs/zfs/zio.c
+++ b/usr/src/uts/common/fs/zfs/zio.c
@@ -675,7 +675,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
static void
zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
- int checksum)
+ int checksum, boolean_t labels)
{
ASSERT(vd->vdev_children == 0);
@@ -683,8 +683,12 @@ zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
- ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
- offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+#ifdef ZFS_DEBUG
+ if (labels) {
+ ASSERT(offset + size <= VDEV_LABEL_START_SIZE ||
+ offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
+ }
+#endif
ASSERT3U(offset + size, <=, vd->vdev_psize);
BP_ZERO(bp);
@@ -703,14 +707,14 @@ zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size,
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, int flags)
+ int priority, int flags, boolean_t labels)
{
zio_t *zio;
blkptr_t blk;
ZIO_ENTER(vd->vdev_spa);
- zio_phys_bp_init(vd, &blk, offset, size, checksum);
+ zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);
zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL,
@@ -730,7 +734,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
zio_t *
zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
void *data, int checksum, zio_done_func_t *done, void *private,
- int priority, int flags)
+ int priority, int flags, boolean_t labels)
{
zio_block_tail_t *zbt;
void *wbuf;
@@ -739,7 +743,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
ZIO_ENTER(vd->vdev_spa);
- zio_phys_bp_init(vd, &blk, offset, size, checksum);
+ zio_phys_bp_init(vd, &blk, offset, size, checksum, labels);
zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private,
ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL,
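
The new 'labels' argument relaxes an assertion that previously confined all physical I/O to the label regions at either end of a device: label reads and writes still pass B_TRUE and keep the bounds check (see vdev_label.c above), presumably so that L2ARC I/O to arbitrary offsets on a cache device can opt out with B_FALSE.
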
diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h
index 10074241c0..3fff205c47 100644
--- a/usr/src/uts/common/sys/fs/zfs.h
+++ b/usr/src/uts/common/sys/fs/zfs.h
@@ -219,14 +219,15 @@ typedef enum zfs_share_op {
#define SPA_VERSION_7 7ULL
#define SPA_VERSION_8 8ULL
#define SPA_VERSION_9 9ULL
+#define SPA_VERSION_10 10ULL
/*
* When bumping up SPA_VERSION, make sure GRUB ZFS understand the on-disk
* format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*},
* and do the appropriate changes.
*/
-#define SPA_VERSION SPA_VERSION_9
-#define SPA_VERSION_STRING "9"
+#define SPA_VERSION SPA_VERSION_10
+#define SPA_VERSION_STRING "10"
/*
* Symbolic names for the changes that caused a SPA_VERSION switch.
@@ -256,6 +257,7 @@ typedef enum zfs_share_op {
#define SPA_VERSION_REFRESERVATION SPA_VERSION_9
#define SPA_VERSION_REFQUOTA SPA_VERSION_9
#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9
+#define SPA_VERSION_L2CACHE SPA_VERSION_10
/*
* ZPL version - rev'd whenever an incompatible on-disk format change
@@ -312,6 +314,7 @@ typedef enum zfs_share_op {
#define ZPOOL_CONFIG_UNSPARE "unspare"
#define ZPOOL_CONFIG_PHYS_PATH "phys_path"
#define ZPOOL_CONFIG_IS_LOG "is_log"
+#define ZPOOL_CONFIG_L2CACHE "l2cache"
/*
* The persistent vdev state is stored as separate values rather than a single
* 'vdev_state' entry. This is because a device can be in multiple states, such
@@ -331,6 +334,7 @@ typedef enum zfs_share_op {
#define VDEV_TYPE_MISSING "missing"
#define VDEV_TYPE_SPARE "spare"
#define VDEV_TYPE_LOG "log"
+#define VDEV_TYPE_L2CACHE "l2cache"
/*
* This is needed in userland to report the minimum necessary device size.
@@ -384,14 +388,16 @@ typedef enum vdev_aux {
/*
* pool state. The following states are written to disk as part of the normal
- * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE. The remaining states are
- * software abstractions used at various levels to communicate pool state.
+ * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining
+ * states are software abstractions used at various levels to communicate
+ * pool state.
*/
typedef enum pool_state {
POOL_STATE_ACTIVE = 0, /* In active use */
POOL_STATE_EXPORTED, /* Explicitly exported */
POOL_STATE_DESTROYED, /* Explicitly destroyed */
POOL_STATE_SPARE, /* Reserved for hot spare use */
+ POOL_STATE_L2CACHE, /* Level 2 ARC device */
POOL_STATE_UNINITIALIZED, /* Internal spa_t state */
POOL_STATE_IO_FAILURE, /* Internal pool state */
POOL_STATE_UNAVAIL, /* Internal libzfs state */
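
A hedged kernel-context sketch tying the new constants together: gate cache-device support on the bumped pool version and classify a device by its on-disk label state. Here 'label' stands for a previously read label nvlist, and placing the version check at this point is an assumption of the sketch.

    	uint64_t state;

    	if (spa_version(spa) < SPA_VERSION_L2CACHE)
    		return (ENOTSUP);	/* pool predates cache devices */

    	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
    	    &state) == 0 && state == POOL_STATE_L2CACHE) {
    		/* device was last in use as a shared level 2 ARC device */
    	}
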