author:    brendan <none@none>    2007-11-09 21:33:30 -0800
committer: brendan <none@none>    2007-11-09 21:33:30 -0800
commit:    fa94a07fd0519b8abfd871ad8fe60e6bebe1e2bb
tree:      1c594a9272fa03552a1cca3328b7820b409a2563 /usr/src
parent:    380789fc80376bd1573770361cb177a08c7e3524
PSARC 2007/618 ZFS L2ARC
6536054 second tier ("external") ARC
Diffstat (limited to 'usr/src')
30 files changed, 2618 insertions, 503 deletions
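Before the raw diff, here is a minimal, hypothetical libzfs caller (not part of the commit) as orientation: it shows the new boolean_t *l2cache out-parameter added to zpool_find_vdev() and the use of zpool_vdev_remove() on a cache device, mirroring the zpool(1M) and zinject callers changed below. The pool name "tank" and device name "c0t2d0" are illustrative only.

/*
 * Hedged sketch of a libzfs consumer using the cache-device plumbing
 * introduced by this commit. Not part of the patch.
 */
#include <stdio.h>
#include <libzfs.h>

int
main(void)
{
	libzfs_handle_t *hdl = libzfs_init();
	zpool_handle_t *zhp;
	nvlist_t *tgt;
	boolean_t isspare, iscache;

	if (hdl == NULL || (zhp = zpool_open(hdl, "tank")) == NULL)
		return (1);

	/* New signature: the second boolean reports L2ARC cache membership. */
	if ((tgt = zpool_find_vdev(zhp, "c0t2d0", &isspare, &iscache)) == NULL) {
		(void) fprintf(stderr, "device not found in pool\n");
	} else if (iscache) {
		/* Cache devices can now be removed like inactive hot spares. */
		(void) zpool_vdev_remove(zhp, "c0t2d0");
	}

	zpool_close(zhp);
	libzfs_fini(hdl);
	return (0);
}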
diff --git a/usr/src/cmd/zinject/translate.c b/usr/src/cmd/zinject/translate.c index b4f6693aa1..e778c1b6c0 100644 --- a/usr/src/cmd/zinject/translate.c +++ b/usr/src/cmd/zinject/translate.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -437,7 +437,7 @@ translate_device(const char *pool, const char *device, zinject_record_t *record) char *end; zpool_handle_t *zhp; nvlist_t *tgt; - boolean_t isspare; + boolean_t isspare, iscache; /* * Given a device name or GUID, create an appropriate injection record @@ -448,7 +448,7 @@ translate_device(const char *pool, const char *device, zinject_record_t *record) record->zi_guid = strtoull(device, &end, 16); if (record->zi_guid == 0 || *end != '\0') { - tgt = zpool_find_vdev(zhp, device, &isspare); + tgt = zpool_find_vdev(zhp, device, &isspare, &iscache); if (tgt == NULL) { (void) fprintf(stderr, "cannot find device '%s' in " diff --git a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c index 45cf03abc2..a482625d75 100644 --- a/usr/src/cmd/zpool/zpool_main.c +++ b/usr/src/cmd/zpool/zpool_main.c @@ -213,7 +213,7 @@ get_usage(zpool_help_t idx) { return (gettext("\treplace [-f] <pool> <device> " "[new-device]\n")); case HELP_REMOVE: - return (gettext("\tremove <pool> <device>\n")); + return (gettext("\tremove <pool> <device> ...\n")); case HELP_SCRUB: return (gettext("\tscrub [-s] <pool> ...\n")); case HELP_STATUS: @@ -493,17 +493,17 @@ zpool_do_add(int argc, char **argv) } /* - * zpool remove <pool> <vdev> + * zpool remove <pool> <vdev> ... * * Removes the given vdev from the pool. Currently, this only supports removing - * spares from the pool. Eventually, we'll want to support removing leaf vdevs - * (as an alias for 'detach') as well as toplevel vdevs. + * spares and cache devices from the pool. Eventually, we'll want to support + * removing leaf vdevs (as an alias for 'detach') as well as toplevel vdevs. 
*/ int zpool_do_remove(int argc, char **argv) { char *poolname; - int ret; + int i, ret = 0; zpool_handle_t *zhp; argc--; @@ -524,7 +524,10 @@ zpool_do_remove(int argc, char **argv) if ((zhp = zpool_open(g_zfs, poolname)) == NULL) return (1); - ret = (zpool_vdev_remove(zhp, argv[1]) != 0); + for (i = 1; i < argc; i++) { + if (zpool_vdev_remove(zhp, argv[i]) != 0) + ret = 1; + } return (ret); } @@ -910,6 +913,14 @@ max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max) max = ret; } + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if ((ret = max_width(zhp, child[c], depth + 2, + max)) > max) + max = ret; + } + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) @@ -995,15 +1006,24 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, free(vname); } - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, - &child, &children) != 0) - return; + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + (void) printf(gettext("\tcache\n")); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, NULL, child[c]); + (void) printf("\t %s\n", vname); + free(vname); + } + } - (void) printf(gettext("\tspares\n")); - for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, NULL, child[c]); - (void) printf("\t %s\n", vname); - free(vname); + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + (void) printf(gettext("\tspares\n")); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, NULL, child[c]); + (void) printf("\t %s\n", vname); + free(vname); + } } } @@ -1655,6 +1675,28 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, newchild[c], cb, depth + 2); free(vname); } + + /* + * Include level 2 ARC devices in iostat output + */ + if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, + &newchild, &children) != 0) + return; + + if (oldnv && nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, + &oldchild, &c) != 0) + return; + + if (children > 0) { + (void) printf("%-*s - - - - - " + "-\n", cb->cb_namewidth, "cache"); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, zhp, newchild[c]); + print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, + newchild[c], cb, depth + 2); + free(vname); + } + } } static int @@ -2805,6 +2847,26 @@ print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares, } } +static void +print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache, + int namewidth) +{ + uint_t i; + char *name; + + if (nl2cache == 0) + return; + + (void) printf(gettext("\tcache\n")); + + for (i = 0; i < nl2cache; i++) { + name = zpool_vdev_name(g_zfs, zhp, l2cache[i]); + print_status_config(zhp, name, l2cache[i], + namewidth, 2, B_FALSE, B_FALSE); + free(name); + } +} + /* * Display a summary of pool status. 
Displays a summary such as: * @@ -2996,8 +3058,8 @@ status_callback(zpool_handle_t *zhp, void *data) if (config != NULL) { int namewidth; uint64_t nerr; - nvlist_t **spares; - uint_t nspares; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; (void) printf(gettext(" scrub: ")); @@ -3016,6 +3078,10 @@ status_callback(zpool_handle_t *zhp, void *data) print_status_config(zhp, "logs", nvroot, namewidth, 0, B_FALSE, B_TRUE); + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) + print_l2cache(zhp, l2cache, nl2cache, namewidth); + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) print_spares(zhp, spares, nspares, namewidth); @@ -3303,6 +3369,7 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext(" 8 Delegated administration\n")); (void) printf(gettext(" 9 refquota and refreservation " "properties\n")); + (void) printf(gettext(" 10 Cache devices\n")); (void) printf(gettext("For more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" diff --git a/usr/src/cmd/zpool/zpool_vdev.c b/usr/src/cmd/zpool/zpool_vdev.c index 48615d991e..8402521a82 100644 --- a/usr/src/cmd/zpool/zpool_vdev.c +++ b/usr/src/cmd/zpool/zpool_vdev.c @@ -968,6 +968,12 @@ make_disks(zpool_handle_t *zhp, nvlist_t *nv) if ((ret = make_disks(zhp, child[c])) != 0) return (ret); + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) + for (c = 0; c < children; c++) + if ((ret = make_disks(zhp, child[c])) != 0) + return (ret); + return (0); } @@ -1077,6 +1083,14 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, if ((ret = check_in_use(config, child[c], force, isreplacing, B_TRUE)) != 0) return (ret); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) + for (c = 0; c < children; c++) + if ((ret = check_in_use(config, child[c], force, + isreplacing, B_FALSE)) != 0) + return (ret); + return (0); } @@ -1113,6 +1127,12 @@ is_grouping(const char *type, int *mindev) return (VDEV_TYPE_LOG); } + if (strcmp(type, "cache") == 0) { + if (mindev != NULL) + *mindev = 1; + return (VDEV_TYPE_L2CACHE); + } + return (NULL); } @@ -1125,8 +1145,8 @@ is_grouping(const char *type, int *mindev) nvlist_t * construct_spec(int argc, char **argv) { - nvlist_t *nvroot, *nv, **top, **spares; - int t, toplevels, mindev, nspares, nlogs; + nvlist_t *nvroot, *nv, **top, **spares, **l2cache; + int t, toplevels, mindev, nspares, nlogs, nl2cache; const char *type; uint64_t is_log; boolean_t seen_logs; @@ -1134,8 +1154,10 @@ construct_spec(int argc, char **argv) top = NULL; toplevels = 0; spares = NULL; + l2cache = NULL; nspares = 0; nlogs = 0; + nl2cache = 0; is_log = B_FALSE; seen_logs = B_FALSE; @@ -1180,6 +1202,17 @@ construct_spec(int argc, char **argv) continue; } + if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { + if (l2cache != NULL) { + (void) fprintf(stderr, + gettext("invalid vdev " + "specification: 'cache' can be " + "specified only once\n")); + return (NULL); + } + is_log = B_FALSE; + } + if (is_log) { if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { (void) fprintf(stderr, @@ -1219,6 +1252,10 @@ construct_spec(int argc, char **argv) spares = child; nspares = children; continue; + } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { + l2cache = child; + nl2cache = children; + continue; } else { verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) == 0); @@ -1259,7 +1296,7 @@ construct_spec(int argc, char 
**argv) top[toplevels - 1] = nv; } - if (toplevels == 0 && nspares == 0) { + if (toplevels == 0 && nspares == 0 && nl2cache == 0) { (void) fprintf(stderr, gettext("invalid vdev " "specification: at least one toplevel vdev must be " "specified\n")); @@ -1283,13 +1320,20 @@ construct_spec(int argc, char **argv) if (nspares != 0) verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + if (nl2cache != 0) + verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + l2cache, nl2cache) == 0); for (t = 0; t < toplevels; t++) nvlist_free(top[t]); for (t = 0; t < nspares; t++) nvlist_free(spares[t]); + for (t = 0; t < nl2cache; t++) + nvlist_free(l2cache[t]); if (spares) free(spares); + if (l2cache) + free(l2cache); free(top); return (nvroot); diff --git a/usr/src/grub/grub-0.95/stage2/zfs-include/dmu.h b/usr/src/grub/grub-0.95/stage2/zfs-include/dmu.h index e57e45682d..3b18d8ada5 100644 --- a/usr/src/grub/grub-0.95/stage2/zfs-include/dmu.h +++ b/usr/src/grub/grub-0.95/stage2/zfs-include/dmu.h @@ -102,5 +102,6 @@ typedef enum dmu_objset_type { #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" +#define DMU_POOL_L2CACHE "l2cache" #endif /* _SYS_DMU_H */ diff --git a/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h b/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h index 9619d5bfc2..82d767a010 100644 --- a/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h +++ b/usr/src/grub/grub-0.95/stage2/zfs-include/zfs.h @@ -38,7 +38,8 @@ #define SPA_VERSION_7 7ULL #define SPA_VERSION_8 8ULL #define SPA_VERSION_9 9ULL -#define SPA_VERSION SPA_VERSION_9 +#define SPA_VERSION_10 10ULL +#define SPA_VERSION SPA_VERSION_10 /* * The following are configuration names used in the nvlist describing a pool's @@ -71,6 +72,7 @@ #define ZPOOL_CONFIG_SPARES "spares" #define ZPOOL_CONFIG_IS_SPARE "is_spare" #define ZPOOL_CONFIG_NPARITY "nparity" +#define ZPOOL_CONFIG_L2CACHE "l2cache" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" @@ -80,17 +82,20 @@ #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_SPARE "spare" +#define VDEV_TYPE_L2CACHE "l2cache" /* * pool state. The following states are written to disk as part of the normal - * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE. The remaining states are - * software abstractions used at various levels to communicate pool state. + * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining + * states are software abstractions used at various levels to communicate pool + * state. */ typedef enum pool_state { POOL_STATE_ACTIVE = 0, /* In active use */ POOL_STATE_EXPORTED, /* Explicitly exported */ POOL_STATE_DESTROYED, /* Explicitly destroyed */ POOL_STATE_SPARE, /* Reserved for hot spare use */ + POOL_STATE_L2CACHE, /* Level 2 ARC device */ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ POOL_STATE_UNAVAIL, /* Internal libzfs state */ POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ diff --git a/usr/src/lib/libdiskmgt/common/entry.c b/usr/src/lib/libdiskmgt/common/entry.c index a7756b02d4..1d3b2313e4 100644 --- a/usr/src/lib/libdiskmgt/common/entry.c +++ b/usr/src/lib/libdiskmgt/common/entry.c @@ -1118,6 +1118,10 @@ dm_get_usage_string(char *what, char *how, char **usage_string) *usage_string = dgettext(TEXT_DOMAIN, "%s is reserved as a hot spare for ZFS pool %s. 
Please " "see zpool(1M).\n"); + } else if (strcmp(what, DM_USE_L2CACHE_ZPOOL) == 0) { + *usage_string = dgettext(TEXT_DOMAIN, + "%s is in use as a cache device for ZFS pool %s. " + "Please see zpool(1M).\n"); } } void diff --git a/usr/src/lib/libdiskmgt/common/inuse_zpool.c b/usr/src/lib/libdiskmgt/common/inuse_zpool.c index a7cf203a2f..e4f9ab5a3b 100644 --- a/usr/src/lib/libdiskmgt/common/inuse_zpool.c +++ b/usr/src/lib/libdiskmgt/common/inuse_zpool.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -102,6 +102,9 @@ inuse_zpool_common(char *slice, nvlist_t *attrs, int *errp, char *type) } else if (state == POOL_STATE_SPARE) { found = 1; type = DM_USE_SPARE_ZPOOL; + } else if (state == POOL_STATE_L2CACHE) { + found = 1; + type = DM_USE_L2CACHE_ZPOOL; } } else { found = 1; diff --git a/usr/src/lib/libdiskmgt/common/libdiskmgt.h b/usr/src/lib/libdiskmgt/common/libdiskmgt.h index 3a580cbfb4..303b03f171 100644 --- a/usr/src/lib/libdiskmgt/common/libdiskmgt.h +++ b/usr/src/lib/libdiskmgt/common/libdiskmgt.h @@ -216,6 +216,7 @@ typedef enum { #define DM_USE_EXPORTED_ZPOOL "exported_zpool" #define DM_USE_ACTIVE_ZPOOL "active_zpool" #define DM_USE_SPARE_ZPOOL "spare_zpool" +#define DM_USE_L2CACHE_ZPOOL "l2cache_zpool" /* event */ #define DM_EV_NAME "name" diff --git a/usr/src/lib/libzfs/common/libzfs.h b/usr/src/lib/libzfs/common/libzfs.h index e2ebae01f4..f4ef8adbe5 100644 --- a/usr/src/lib/libzfs/common/libzfs.h +++ b/usr/src/lib/libzfs/common/libzfs.h @@ -113,6 +113,7 @@ enum { EZFS_UNSHARESMBFAILED, /* failed to unshare over smb */ EZFS_SHARESMBFAILED, /* failed to share over smb */ EZFS_BADCACHE, /* bad cache file */ + EZFS_ISL2CACHE, /* device is for the level 2 ARC */ EZFS_UNKNOWN }; @@ -216,7 +217,8 @@ extern int zpool_vdev_fault(zpool_handle_t *, uint64_t); extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t); extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); -extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *); +extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, + boolean_t *); extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *); /* diff --git a/usr/src/lib/libzfs/common/libzfs_import.c b/usr/src/lib/libzfs/common/libzfs_import.c index 50e9ddf97e..a8ae241f88 100644 --- a/usr/src/lib/libzfs/common/libzfs_import.c +++ b/usr/src/lib/libzfs/common/libzfs_import.c @@ -213,11 +213,13 @@ add_config(libzfs_handle_t *hdl, pool_list_t *pl, const char *path, name_entry_t *ne; /* - * If this is a hot spare not currently in use, add it to the list of - * names to translate, but don't do anything else. + * If this is a hot spare not currently in use or level 2 cache + * device, add it to the list of names to translate, but don't do + * anything else. 
*/ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, - &state) == 0 && state == POOL_STATE_SPARE && + &state) == 0 && + (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) && nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) { if ((ne = zfs_alloc(hdl, sizeof (name_entry_t))) == NULL) return (-1); @@ -415,8 +417,8 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl) vdev_entry_t *ve; config_entry_t *ce; nvlist_t *ret = NULL, *config = NULL, *tmp, *nvtop, *nvroot; - nvlist_t **spares; - uint_t i, nspares; + nvlist_t **spares, **l2cache; + uint_t i, nspares, nl2cache; boolean_t config_seen; uint64_t best_txg; char *name, *hostname; @@ -647,6 +649,17 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl) } /* + * Update the paths for l2cache devices. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + for (i = 0; i < nl2cache; i++) { + if (fix_paths(l2cache[i], pl->names) != 0) + goto nomem; + } + } + + /* * Restore the original information read from the actual label. */ (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID, @@ -728,12 +741,12 @@ zpool_read_label(int fd, nvlist_t **config) continue; if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, - &state) != 0 || state > POOL_STATE_SPARE) { + &state) != 0 || state > POOL_STATE_L2CACHE) { nvlist_free(*config); continue; } - if (state != POOL_STATE_SPARE && + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, &txg) != 0 || txg == 0)) { nvlist_free(*config); @@ -1001,27 +1014,28 @@ find_guid(nvlist_t *nv, uint64_t guid) return (B_FALSE); } -typedef struct spare_cbdata { +typedef struct aux_cbdata { + const char *cb_type; uint64_t cb_guid; zpool_handle_t *cb_zhp; -} spare_cbdata_t; +} aux_cbdata_t; static int -find_spare(zpool_handle_t *zhp, void *data) +find_aux(zpool_handle_t *zhp, void *data) { - spare_cbdata_t *cbp = data; - nvlist_t **spares; - uint_t i, nspares; + aux_cbdata_t *cbp = data; + nvlist_t **list; + uint_t i, count; uint64_t guid; nvlist_t *nvroot; verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - for (i = 0; i < nspares; i++) { - verify(nvlist_lookup_uint64(spares[i], + if (nvlist_lookup_nvlist_array(nvroot, cbp->cb_type, + &list, &count) == 0) { + for (i = 0; i < count; i++) { + verify(nvlist_lookup_uint64(list[i], ZPOOL_CONFIG_GUID, &guid) == 0); if (guid == cbp->cb_guid) { cbp->cb_zhp = zhp; @@ -1050,7 +1064,7 @@ zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr, zpool_handle_t *zhp; nvlist_t *pool_config; uint64_t stateval, isspare; - spare_cbdata_t cb = { 0 }; + aux_cbdata_t cb = { 0 }; boolean_t isactive; *inuse = B_FALSE; @@ -1068,7 +1082,7 @@ zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr, verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0); - if (stateval != POOL_STATE_SPARE) { + if (stateval != POOL_STATE_SPARE && stateval != POOL_STATE_L2CACHE) { verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, @@ -1147,7 +1161,24 @@ zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr, */ cb.cb_zhp = NULL; cb.cb_guid = vdev_guid; - if (zpool_iter(hdl, find_spare, &cb) == 1) { + cb.cb_type = ZPOOL_CONFIG_SPARES; + if (zpool_iter(hdl, find_aux, &cb) == 1) { + name = 
(char *)zpool_get_name(cb.cb_zhp); + ret = TRUE; + } else { + ret = FALSE; + } + break; + + case POOL_STATE_L2CACHE: + + /* + * Check if any pool is currently using this l2cache device. + */ + cb.cb_zhp = NULL; + cb.cb_guid = vdev_guid; + cb.cb_type = ZPOOL_CONFIG_L2CACHE; + if (zpool_iter(hdl, find_aux, &cb) == 1) { name = (char *)zpool_get_name(cb.cb_zhp); ret = TRUE; } else { diff --git a/usr/src/lib/libzfs/common/libzfs_pool.c b/usr/src/lib/libzfs/common/libzfs_pool.c index 7320378e6f..75c1ce7492 100644 --- a/usr/src/lib/libzfs/common/libzfs_pool.c +++ b/usr/src/lib/libzfs/common/libzfs_pool.c @@ -815,6 +815,11 @@ zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, "one or more devices is out of space")); return (zfs_error(hdl, EZFS_BADDEV, msg)); + case ENOTBLK: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cache device must be a disk or disk slice")); + return (zfs_error(hdl, EZFS_BADDEV, msg)); + default: return (zpool_standard_error(hdl, errno, msg)); } @@ -898,14 +903,14 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) int ret; libzfs_handle_t *hdl = zhp->zpool_hdl; char msg[1024]; - nvlist_t **spares; - uint_t nspares; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot add to '%s'"), zhp->zpool_name); - if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) - < SPA_VERSION_SPARES && + if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < + SPA_VERSION_SPARES && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " @@ -913,6 +918,15 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) return (zfs_error(hdl, EZFS_BADVERSION, msg)); } + if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < + SPA_VERSION_L2CACHE && + nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " + "upgraded to add cache devices")); + return (zfs_error(hdl, EZFS_BADVERSION, msg)); + } + if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) return (-1); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); @@ -963,6 +977,12 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) (void) zfs_error(hdl, EZFS_POOL_NOTSUP, msg); break; + case ENOTBLK: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cache device must be a disk or disk slice")); + (void) zfs_error(hdl, EZFS_BADDEV, msg); + break; + default: (void) zpool_standard_error(hdl, errno, msg); } @@ -1172,7 +1192,7 @@ zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type) */ static nvlist_t * vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, - boolean_t *avail_spare) + boolean_t *avail_spare, boolean_t *l2cache) { uint_t c, children; nvlist_t **child; @@ -1214,25 +1234,37 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, for (c = 0; c < children; c++) if ((ret = vdev_to_nvlist_iter(child[c], search, guid, - avail_spare)) != NULL) + avail_spare, l2cache)) != NULL) return (ret); if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) { if ((ret = vdev_to_nvlist_iter(child[c], search, guid, - avail_spare)) != NULL) { + avail_spare, l2cache)) != NULL) { *avail_spare = B_TRUE; return (ret); } } } + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if ((ret = vdev_to_nvlist_iter(child[c], search, guid, + avail_spare, 
l2cache)) != NULL) { + *l2cache = B_TRUE; + return (ret); + } + } + } + return (NULL); } nvlist_t * -zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare) +zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, + boolean_t *l2cache) { char buf[MAXPATHLEN]; const char *search; @@ -1254,29 +1286,32 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare) &nvroot) == 0); *avail_spare = B_FALSE; - return (vdev_to_nvlist_iter(nvroot, search, guid, avail_spare)); + *l2cache = B_FALSE; + return (vdev_to_nvlist_iter(nvroot, search, guid, avail_spare, + l2cache)); } /* - * Returns TRUE if the given guid corresponds to a spare (INUSE or not). + * Returns TRUE if the given guid corresponds to the given type. + * This is used to check for hot spares (INUSE or not), and level 2 cache + * devices. */ static boolean_t -is_spare(zpool_handle_t *zhp, uint64_t guid) +is_guid_type(zpool_handle_t *zhp, uint64_t guid, const char *type) { - uint64_t spare_guid; + uint64_t target_guid; nvlist_t *nvroot; - nvlist_t **spares; - uint_t nspares; + nvlist_t **list; + uint_t count; int i; verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - for (i = 0; i < nspares; i++) { - verify(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &spare_guid) == 0); - if (guid == spare_guid) + if (nvlist_lookup_nvlist_array(nvroot, type, &list, &count) == 0) { + for (i = 0; i < count; i++) { + verify(nvlist_lookup_uint64(list[i], ZPOOL_CONFIG_GUID, + &target_guid) == 0); + if (guid == target_guid) return (B_TRUE); } } @@ -1295,21 +1330,26 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; - boolean_t avail_spare; + boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot online %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if ((tgt = zpool_find_vdev(zhp, path, &avail_spare)) == NULL) + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); - if (avail_spare || is_spare(zhp, zc.zc_guid) == B_TRUE) + if (avail_spare || + is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE) return (zfs_error(hdl, EZFS_ISSPARE, msg)); + if (l2cache || + is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_L2CACHE) == B_TRUE) + return (zfs_error(hdl, EZFS_ISL2CACHE, msg)); + zc.zc_cookie = VDEV_STATE_ONLINE; zc.zc_obj = flags; @@ -1330,21 +1370,26 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; - boolean_t avail_spare; + boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot offline %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if ((tgt = zpool_find_vdev(zhp, path, &avail_spare)) == NULL) + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); - if (avail_spare || is_spare(zhp, zc.zc_guid) == B_TRUE) + if (avail_spare || + is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE) return (zfs_error(hdl, 
EZFS_ISSPARE, msg)); + if (l2cache || + is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_L2CACHE) == B_TRUE) + return (zfs_error(hdl, EZFS_ISL2CACHE, msg)); + zc.zc_cookie = VDEV_STATE_OFFLINE; zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0; @@ -1461,7 +1506,7 @@ zpool_vdev_attach(zpool_handle_t *zhp, char msg[1024]; int ret; nvlist_t *tgt; - boolean_t avail_spare; + boolean_t avail_spare, l2cache; uint64_t val, is_log; char *path; nvlist_t **child; @@ -1477,12 +1522,15 @@ zpool_vdev_attach(zpool_handle_t *zhp, "cannot attach %s to %s"), new_disk, old_disk); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare)) == 0) + if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache)) == 0) return (zfs_error(hdl, EZFS_NODEVICE, msg)); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); + if (l2cache) + return (zfs_error(hdl, EZFS_ISL2CACHE, msg)); + verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); zc.zc_cookie = replacing; @@ -1503,7 +1551,7 @@ zpool_vdev_attach(zpool_handle_t *zhp, if (replacing && nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 && nvlist_lookup_string(child[0], ZPOOL_CONFIG_PATH, &path) == 0 && - (zpool_find_vdev(zhp, path, &avail_spare) == NULL || + (zpool_find_vdev(zhp, path, &avail_spare, &l2cache) == NULL || !avail_spare) && is_replacing_spare(config_root, tgt, 1)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "can only be replaced by another hot spare")); @@ -1516,8 +1564,8 @@ zpool_vdev_attach(zpool_handle_t *zhp, */ if (replacing && nvlist_lookup_string(child[0], ZPOOL_CONFIG_PATH, &path) == 0 && - zpool_find_vdev(zhp, path, &avail_spare) != NULL && avail_spare && - is_replacing_spare(config_root, tgt, 0)) { + zpool_find_vdev(zhp, path, &avail_spare, &l2cache) != NULL && + avail_spare && is_replacing_spare(config_root, tgt, 0)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device has already been replaced with a spare")); return (zfs_error(hdl, EZFS_BADTARGET, msg)); @@ -1612,19 +1660,22 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path) zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; - boolean_t avail_spare; + boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot detach %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if ((tgt = zpool_find_vdev(zhp, path, &avail_spare)) == 0) + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache)) == 0) return (zfs_error(hdl, EZFS_NODEVICE, msg)); if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); + if (l2cache) + return (zfs_error(hdl, EZFS_ISL2CACHE, msg)); + verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0) @@ -1656,7 +1707,8 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path) } /* - * Remove the given device. Currently, this is supported only for hot spares. + * Remove the given device. Currently, this is supported only for hot spares + * and level 2 cache devices. 
*/ int zpool_vdev_remove(zpool_handle_t *zhp, const char *path) @@ -1664,19 +1716,20 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; - boolean_t avail_spare; + boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); - if ((tgt = zpool_find_vdev(zhp, path, &avail_spare)) == 0) + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache)) == 0) return (zfs_error(hdl, EZFS_NODEVICE, msg)); - if (!avail_spare) { + if (!avail_spare && !l2cache) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "only inactive hot spares can be removed")); + "only inactive hot spares or cache devices " + "can be removed")); return (zfs_error(hdl, EZFS_NODEVICE, msg)); } @@ -1697,7 +1750,7 @@ zpool_clear(zpool_handle_t *zhp, const char *path) zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; - boolean_t avail_spare; + boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; if (path) @@ -1711,9 +1764,14 @@ zpool_clear(zpool_handle_t *zhp, const char *path) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (path) { - if ((tgt = zpool_find_vdev(zhp, path, &avail_spare)) == 0) + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, + &l2cache)) == 0) return (zfs_error(hdl, EZFS_NODEVICE, msg)); + /* + * Don't allow error clearing for hot spares. Do allow + * error clearing for l2cache devices. + */ if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); diff --git a/usr/src/lib/libzfs/common/libzfs_util.c b/usr/src/lib/libzfs/common/libzfs_util.c index d2005867e3..b58da2c0bf 100644 --- a/usr/src/lib/libzfs/common/libzfs_util.c +++ b/usr/src/lib/libzfs/common/libzfs_util.c @@ -201,6 +201,8 @@ libzfs_error_description(libzfs_handle_t *hdl) " modified")); case EZFS_BADCACHE: return (dgettext(TEXT_DOMAIN, "invalid or missing cache file")); + case EZFS_ISL2CACHE: + return (dgettext(TEXT_DOMAIN, "device is in use as a cache")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index aafce2d68e..76f8c155ef 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -47,13 +47,13 @@ * There are times when it is not possible to evict the requested * space. In these circumstances we are unable to adjust the cache * size. To prevent the cache growing unbounded at these times we - * implement a "cache throttle" that slowes the flow of new data - * into the cache until we can make space avaiable. + * implement a "cache throttle" that slows the flow of new data + * into the cache until we can make space available. * * 2. The Megiddo and Modha model assumes a fixed cache size. * Pages are evicted when the cache is full and there is a cache * miss. Our model has a variable sized cache. It grows with - * high use, but also tries to react to memory preasure from the + * high use, but also tries to react to memory pressure from the * operating system: decreasing its size when system memory is * tight. * @@ -75,7 +75,7 @@ * * A new reference to a cache buffer can be obtained in two * ways: 1) via a hash table lookup using the DVA as a key, - * or 2) via one of the ARC lists. The arc_read() inerface + * or 2) via one of the ARC lists. 
The arc_read() interface * uses method 1, while the internal arc algorithms for * adjusting the cache use method 2. We therefor provide two * types of locks: 1) the hash table lock array, and 2) the @@ -109,6 +109,14 @@ * * Note that the majority of the performance stats are manipulated * with atomic operations. + * + * The L2ARC uses the l2arc_buflist_mtx global mutex for the following: + * + * - L2ARC buflist creation + * - L2ARC buflist eviction + * - L2ARC write completion, which walks L2ARC buflists + * - ARC header destruction, as it removes from L2ARC buflists + * - ARC header release, as it removes from L2ARC buflists */ #include <sys/spa.h> @@ -157,19 +165,20 @@ uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; /* - * Note that buffers can be in one of 5 states: + * Note that buffers can be in one of 6 states: * ARC_anon - anonymous (discussed below) * ARC_mru - recently used, currently cached * ARC_mru_ghost - recentely used, no longer in cache * ARC_mfu - frequently used, currently cached * ARC_mfu_ghost - frequently used, no longer in cache + * ARC_l2c_only - exists in L2ARC but not other states * When there are no active references to the buffer, they are * are linked onto a list in one of these arc states. These are * the only buffers that can be evicted or deleted. Within each * state there are multiple lists, one for meta-data and one for * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, * etc.) is tracked separately so that it can be managed more - * explicitly: favored over data, limited explicitely. + * explicitly: favored over data, limited explicitly. * * Anonymous buffers are buffers that are not associated with * a DVA. These are buffers that hold dirty block copies @@ -177,6 +186,14 @@ uint64_t zfs_arc_meta_limit = 0; * they are "ref'd" and are considered part of arc_mru * that cannot be freed. Generally, they will aquire a DVA * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. 
*/ typedef struct arc_state { @@ -186,12 +203,13 @@ typedef struct arc_state { kmutex_t arcs_mtx; } arc_state_t; -/* The 5 states: */ +/* The 6 states: */ static arc_state_t ARC_anon; static arc_state_t ARC_mru; static arc_state_t ARC_mru_ghost; static arc_state_t ARC_mfu; static arc_state_t ARC_mfu_ghost; +static arc_state_t ARC_l2c_only; typedef struct arc_stats { kstat_named_t arcstat_hits; @@ -222,6 +240,23 @@ typedef struct arc_stats { kstat_named_t arcstat_c_min; kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; + kstat_named_t arcstat_hdr_size; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_hdr_miss; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_size; + kstat_named_t arcstat_l2_hdr_size; } arc_stats_t; static arc_stats_t arc_stats = { @@ -252,7 +287,24 @@ static arc_stats_t arc_stats = { { "c", KSTAT_DATA_UINT64 }, { "c_min", KSTAT_DATA_UINT64 }, { "c_max", KSTAT_DATA_UINT64 }, - { "size", KSTAT_DATA_UINT64 } + { "size", KSTAT_DATA_UINT64 }, + { "hdr_size", KSTAT_DATA_UINT64 }, + { "l2_hits", KSTAT_DATA_UINT64 }, + { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_feeds", KSTAT_DATA_UINT64 }, + { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_writes_sent", KSTAT_DATA_UINT64 }, + { "l2_writes_done", KSTAT_DATA_UINT64 }, + { "l2_writes_error", KSTAT_DATA_UINT64 }, + { "l2_writes_hdr_miss", KSTAT_DATA_UINT64 }, + { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, + { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_free_on_write", KSTAT_DATA_UINT64 }, + { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, + { "l2_cksum_bad", KSTAT_DATA_UINT64 }, + { "l2_io_error", KSTAT_DATA_UINT64 }, + { "l2_size", KSTAT_DATA_UINT64 }, + { "l2_hdr_size", KSTAT_DATA_UINT64 } }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -299,6 +351,7 @@ static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; static arc_state_t *arc_mfu_ghost; +static arc_state_t *arc_l2c_only; /* * There are several ARC variables that are critical to export as kstats -- @@ -320,6 +373,8 @@ static uint64_t arc_meta_used; static uint64_t arc_meta_limit; static uint64_t arc_meta_max = 0; +typedef struct l2arc_buf_hdr l2arc_buf_hdr_t; + typedef struct arc_callback arc_callback_t; struct arc_callback { @@ -371,6 +426,9 @@ struct arc_buf_hdr { /* self protecting */ refcount_t b_refcnt; + + l2arc_buf_hdr_t *b_l2hdr; + list_node_t b_l2node; }; static arc_buf_t *arc_eviction_list; @@ -382,7 +440,8 @@ static int arc_evict_needed(arc_buf_contents_t type); static void arc_evict_ghost(arc_state_t *state, int64_t bytes); #define GHOST_STATE(state) \ - ((state) == arc_mru_ghost || (state) == arc_mfu_ghost) + ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ + (state) == arc_l2c_only) /* * Private ARC flags. 
These flags are private ARC only flags that will show up @@ -398,12 +457,24 @@ static void arc_evict_ghost(arc_state_t *state, int64_t bytes); #define ARC_FREED_IN_READ (1 << 12) /* buf freed while in read */ #define ARC_BUF_AVAILABLE (1 << 13) /* block not in active use */ #define ARC_INDIRECT (1 << 14) /* this is an indirect block */ +#define ARC_FREE_IN_PROGRESS (1 << 15) /* hdr about to be freed */ +#define ARC_DONT_L2CACHE (1 << 16) /* originated by prefetch */ +#define ARC_L2_READING (1 << 17) /* L2ARC read in progress */ +#define ARC_L2_WRITING (1 << 18) /* L2ARC write in progress */ +#define ARC_L2_EVICTED (1 << 19) /* evicted during I/O */ +#define ARC_L2_WRITE_HEAD (1 << 20) /* head of write list */ #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) +#define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) +#define HDR_DONT_L2CACHE(hdr) ((hdr)->b_flags & ARC_DONT_L2CACHE) +#define HDR_L2_READING(hdr) ((hdr)->b_flags & ARC_L2_READING) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_L2_WRITE_HEAD) /* * Hash table routines @@ -436,6 +507,87 @@ static buf_hash_table_t buf_hash_table; uint64_t zfs_crc64_table[256]; +/* + * Level 2 ARC + */ + +#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 4 /* num of writes */ +#define L2ARC_FEED_DELAY 180 /* starting grace */ +#define L2ARC_FEED_SECS 1 /* caching interval */ + +#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) +#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) + +/* + * L2ARC Performance Tunables + */ +uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ +uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ +uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ + +/* + * L2ARC Internals + */ +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_write; /* desired write size, bytes */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr on device */ + uint64_t l2ad_evict; /* last addr eviction reached */ + boolean_t l2ad_first; /* first sweep through */ + list_t *l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ +} l2arc_dev_t; + +static list_t L2ARC_dev_list; /* device list */ +static list_t *l2arc_dev_list; /* device list pointer */ +static kmutex_t l2arc_dev_mtx; /* device list mutex */ +static l2arc_dev_t *l2arc_dev_last; /* last device used */ +static kmutex_t l2arc_buflist_mtx; /* mutex for all buflists */ +static list_t L2ARC_free_on_write; /* free after write buf list */ +static list_t *l2arc_free_on_write; /* free after write list ptr */ +static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ +static uint64_t l2arc_ndev; /* number of devices */ + +typedef struct l2arc_read_callback { + arc_buf_t *l2rcb_buf; /* read buffer */ + spa_t *l2rcb_spa; /* spa */ + blkptr_t l2rcb_bp; /* original blkptr */ + zbookmark_t l2rcb_zb; /* original bookmark */ + int l2rcb_flags; /* 
original flags */ +} l2arc_read_callback_t; + +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ +} l2arc_write_callback_t; + +struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + daddr_t b_daddr; /* disk address, offset byte */ +}; + +typedef struct l2arc_data_free { + /* protected by l2arc_free_on_write_mtx */ + void *l2df_data; + size_t l2df_size; + void (*l2df_func)(void *, size_t); + list_node_t l2df_list_node; +} l2arc_data_free_t; + +static kmutex_t l2arc_feed_thr_lock; +static kcondvar_t l2arc_feed_thr_cv; +static uint8_t l2arc_thread_exit; + +static void l2arc_read_done(zio_t *zio); +static void l2arc_hdr_stat_add(void); +static void l2arc_hdr_stat_remove(void); + static uint64_t buf_hash(spa_t *spa, dva_t *dva, uint64_t birth) { @@ -585,6 +737,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag) refcount_create(&buf->b_refcnt); cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + + ARCSTAT_INCR(arcstat_hdr_size, sizeof (arc_buf_hdr_t)); return (0); } @@ -601,6 +755,8 @@ hdr_dest(void *vbuf, void *unused) refcount_destroy(&buf->b_refcnt); cv_destroy(&buf->b_cv); mutex_destroy(&buf->b_freeze_lock); + + ARCSTAT_INCR(arcstat_hdr_size, -sizeof (arc_buf_hdr_t)); } /* @@ -680,10 +836,24 @@ arc_cksum_verify(arc_buf_t *buf) mutex_exit(&buf->b_hdr->b_freeze_lock); } +static int +arc_cksum_equal(arc_buf_t *buf) +{ + zio_cksum_t zc; + int equal; + + mutex_enter(&buf->b_hdr->b_freeze_lock); + fletcher_2_native(buf->b_data, buf->b_hdr->b_size, &zc); + equal = ZIO_CHECKSUM_EQUAL(*buf->b_hdr->b_freeze_cksum, zc); + mutex_exit(&buf->b_hdr->b_freeze_lock); + + return (equal); +} + static void -arc_cksum_compute(arc_buf_t *buf) +arc_cksum_compute(arc_buf_t *buf, boolean_t force) { - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + if (!force && !(zfs_flags & ZFS_DEBUG_MODIFY)) return; mutex_enter(&buf->b_hdr->b_freeze_lock); @@ -700,14 +870,14 @@ arc_cksum_compute(arc_buf_t *buf) void arc_buf_thaw(arc_buf_t *buf) { - if (!(zfs_flags & ZFS_DEBUG_MODIFY)) - return; + if (zfs_flags & ZFS_DEBUG_MODIFY) { + if (buf->b_hdr->b_state != arc_anon) + panic("modifying non-anon buffer!"); + if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) + panic("modifying buffer while i/o in progress!"); + arc_cksum_verify(buf); + } - if (buf->b_hdr->b_state != arc_anon) - panic("modifying non-anon buffer!"); - if (buf->b_hdr->b_flags & ARC_IO_IN_PROGRESS) - panic("modifying buffer while i/o in progress!"); - arc_cksum_verify(buf); mutex_enter(&buf->b_hdr->b_freeze_lock); if (buf->b_hdr->b_freeze_cksum != NULL) { kmem_free(buf->b_hdr->b_freeze_cksum, sizeof (zio_cksum_t)); @@ -724,7 +894,7 @@ arc_buf_freeze(arc_buf_t *buf) ASSERT(buf->b_hdr->b_freeze_cksum != NULL || buf->b_hdr->b_state == arc_anon); - arc_cksum_compute(buf); + arc_cksum_compute(buf, B_FALSE); } static void @@ -852,7 +1022,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } ASSERT(!BUF_EMPTY(ab)); - if (new_state == arc_anon && old_state != arc_anon) { + if (new_state == arc_anon) { buf_hash_remove(ab); } @@ -864,6 +1034,12 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) atomic_add_64(&old_state->arcs_size, -from_delta); } ab->b_state = new_state; + + /* adjust l2arc hdr stats */ + if (new_state == arc_l2c_only) + l2arc_hdr_stat_add(); + else if (old_state == arc_l2c_only) + l2arc_hdr_stat_remove(); } void @@ 
-990,6 +1166,29 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) data, metadata, hits); } +/* + * Free the arc data buffer. If it is an l2arc write in progress, + * the buffer is placed on l2arc_free_on_write to be freed later. + */ +static void +arc_buf_data_free(arc_buf_hdr_t *hdr, void (*free_func)(void *, size_t), + void *data, size_t size) +{ + if (HDR_L2_WRITING(hdr)) { + l2arc_data_free_t *df; + df = kmem_alloc(sizeof (l2arc_data_free_t), KM_SLEEP); + df->l2df_data = data; + df->l2df_size = size; + df->l2df_func = free_func; + mutex_enter(&l2arc_free_on_write_mtx); + list_insert_head(l2arc_free_on_write, df); + mutex_exit(&l2arc_free_on_write_mtx); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else { + free_func(data, size); + } +} + static void arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) { @@ -1004,11 +1203,13 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) arc_cksum_verify(buf); if (!recycle) { if (type == ARC_BUFC_METADATA) { - zio_buf_free(buf->b_data, size); + arc_buf_data_free(buf->b_hdr, zio_buf_free, + buf->b_data, size); arc_space_return(size); } else { ASSERT(type == ARC_BUFC_DATA); - zio_data_buf_free(buf->b_data, size); + arc_buf_data_free(buf->b_hdr, + zio_data_buf_free, buf->b_data, size); atomic_add_64(&arc_size, -size); } } @@ -1051,6 +1252,30 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT3P(hdr->b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + if (hdr->b_l2hdr != NULL) { + if (!MUTEX_HELD(&l2arc_buflist_mtx)) { + /* + * To prevent arc_free() and l2arc_evict() from + * attempting to free the same buffer at the same time, + * a FREE_IN_PROGRESS flag is given to arc_free() to + * give it priority. l2arc_evict() can't destroy this + * header while we are waiting on l2arc_buflist_mtx. + */ + mutex_enter(&l2arc_buflist_mtx); + ASSERT(hdr->b_l2hdr != NULL); + + list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); + mutex_exit(&l2arc_buflist_mtx); + } else { + list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); + } + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t)); + if (hdr->b_state == arc_l2c_only) + l2arc_hdr_stat_remove(); + hdr->b_l2hdr = NULL; + } + if (!BUF_EMPTY(hdr)) { ASSERT(!HDR_IN_HASH_TABLE(hdr)); bzero(&hdr->b_dva, sizeof (dva_t)); @@ -1214,7 +1439,8 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, if (buf->b_data) { bytes_evicted += ab->b_size; if (recycle && ab->b_type == type && - ab->b_size == bytes) { + ab->b_size == bytes && + !HDR_L2_WRITING(ab)) { stolen = buf->b_data; recycle = FALSE; } @@ -1236,7 +1462,8 @@ arc_evict(arc_state_t *state, int64_t bytes, boolean_t recycle, ASSERT(ab->b_datacnt == 0); arc_change_state(evicted_state, ab, hash_lock); ASSERT(HDR_IN_HASH_TABLE(ab)); - ab->b_flags = ARC_IN_HASH_TABLE; + ab->b_flags |= ARC_IN_HASH_TABLE; + ab->b_flags &= ~ARC_BUF_AVAILABLE; DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, ab); if (!have_lock) mutex_exit(hash_lock); @@ -1306,11 +1533,22 @@ top: if (mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(ab)); ASSERT(ab->b_buf == NULL); - arc_change_state(arc_anon, ab, hash_lock); - mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_deleted); bytes_deleted += ab->b_size; - arc_hdr_destroy(ab); + + if (ab->b_l2hdr != NULL) { + /* + * This buffer is cached on the 2nd Level ARC; + * don't destroy the header. 
+ */ + arc_change_state(arc_l2c_only, ab, hash_lock); + mutex_exit(hash_lock); + } else { + arc_change_state(arc_anon, ab, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(ab); + } + DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, ab); if (bytes >= 0 && bytes_deleted >= bytes) break; @@ -1506,7 +1744,7 @@ arc_reclaim_needed(void) /* * check to make sure that swapfs has enough space so that anon - * reservations can still succeeed. anon_resvmem() checks that the + * reservations can still succeed. anon_resvmem() checks that the * availrmem is greater than swapfs_minfree, and the number of reserved * swap pages. We also add a bit of extra here just to prevent * circumstances from getting really dire. @@ -1523,7 +1761,7 @@ arc_reclaim_needed(void) * can have in the system. However, this is generally fixed at 25 pages * which is so low that it's useless. In this comparison, we seek to * calculate the total heap-size, and reclaim if more than 3/4ths of the - * heap is allocated. (Or, in the caclulation, if less than 1/4th is + * heap is allocated. (Or, in the calculation, if less than 1/4th is * free) */ if (btop(vmem_size(heap_arena, VMEM_FREE)) < @@ -1564,7 +1802,7 @@ arc_kmem_reap_now(arc_reclaim_strategy_t strat) #endif /* - * An agressive reclamation will shrink the cache size as well as + * An aggressive reclamation will shrink the cache size as well as * reap free buffers from the arc kmem caches. */ if (strat == ARC_RECLAIM_AGGR) @@ -1648,6 +1886,9 @@ arc_adapt(int bytes, arc_state_t *state) { int mult; + if (state == arc_l2c_only) + return; + ASSERT(bytes > 0); /* * Adapt the target size of the MRU list: @@ -1944,6 +2185,14 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) arc_change_state(new_state, buf, hash_lock); ARCSTAT_BUMP(arcstat_mfu_ghost_hits); + } else if (buf->b_state == arc_l2c_only) { + /* + * This buffer is on the 2nd Level ARC. + */ + + buf->b_arc_access = lbolt; + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); + arc_change_state(arc_mfu, buf, hash_lock); } else { ASSERT(!"invalid arc state"); } @@ -1996,7 +2245,12 @@ arc_read_done(zio_t *zio) &hash_lock); ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || - (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp)))); + (found == hdr && DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || + (found == hdr && HDR_L2_READING(hdr))); + + hdr->b_flags &= ~(ARC_L2_READING|ARC_L2_EVICTED); + if (l2arc_noprefetch && (hdr->b_flags & ARC_PREFETCH)) + hdr->b_flags |= ARC_DONT_L2CACHE; /* byteswap if necessary */ callback_list = hdr->b_acb; @@ -2004,7 +2258,7 @@ arc_read_done(zio_t *zio) if (BP_SHOULD_BYTESWAP(zio->io_bp) && callback_list->acb_byteswap) callback_list->acb_byteswap(buf->b_data, hdr->b_size); - arc_cksum_compute(buf); + arc_cksum_compute(buf, B_FALSE); /* create copies of the data buffer for the callers */ abuf = buf; @@ -2108,7 +2362,7 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_byteswap_func_t *swap, arc_buf_hdr_t *hdr; arc_buf_t *buf; kmutex_t *hash_lock; - zio_t *rzio; + zio_t *rzio; top: hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); @@ -2255,7 +2509,6 @@ top: if (GHOST_STATE(hdr->b_state)) arc_access(hdr, hash_lock); - mutex_exit(hash_lock); ASSERT3U(hdr->b_size, ==, size); DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, @@ -2265,6 +2518,57 @@ top: demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); + if (l2arc_ndev != 0) { + /* + * Read from the L2ARC if the following are true: + * 1. 
This buffer has L2ARC metadata. + * 2. This buffer isn't currently writing to the L2ARC. + */ + if (hdr->b_l2hdr != NULL && !HDR_L2_WRITING(hdr)) { + vdev_t *vd = hdr->b_l2hdr->b_dev->l2ad_vdev; + daddr_t addr = hdr->b_l2hdr->b_daddr; + l2arc_read_callback_t *cb; + + DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_hits); + + hdr->b_flags |= ARC_L2_READING; + mutex_exit(hash_lock); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), + KM_SLEEP); + cb->l2rcb_buf = buf; + cb->l2rcb_spa = spa; + cb->l2rcb_bp = *bp; + cb->l2rcb_zb = *zb; + cb->l2rcb_flags = flags; + + /* + * l2arc read. + */ + rzio = zio_read_phys(pio, vd, addr, size, + buf->b_data, ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, + flags | ZIO_FLAG_DONT_CACHE, B_FALSE); + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, + zio_t *, rzio); + + if (*arc_flags & ARC_WAIT) + return (zio_wait(rzio)); + + ASSERT(*arc_flags & ARC_NOWAIT); + zio_nowait(rzio); + return (0); + } else { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + if (HDR_L2_WRITING(hdr)) + ARCSTAT_BUMP(arcstat_l2_rw_clash); + } + } + mutex_exit(hash_lock); + rzio = zio_read(pio, spa, bp, buf->b_data, size, arc_read_done, buf, priority, flags, zb); @@ -2402,7 +2706,8 @@ arc_buf_evict(arc_buf_t *buf) arc_change_state(evicted_state, hdr, hash_lock); ASSERT(HDR_IN_HASH_TABLE(hdr)); - hdr->b_flags = ARC_IN_HASH_TABLE; + hdr->b_flags |= ARC_IN_HASH_TABLE; + hdr->b_flags &= ~ARC_BUF_AVAILABLE; mutex_exit(&evicted_state->arcs_mtx); mutex_exit(&old_state->arcs_mtx); @@ -2428,6 +2733,8 @@ arc_release(arc_buf_t *buf, void *tag) { arc_buf_hdr_t *hdr = buf->b_hdr; kmutex_t *hash_lock = HDR_LOCK(hdr); + l2arc_buf_hdr_t *l2hdr = NULL; + uint64_t buf_size; /* this buffer is not on any list */ ASSERT(refcount_count(&hdr->b_refcnt) > 0); @@ -2452,6 +2759,7 @@ arc_release(arc_buf_t *buf, void *tag) uint64_t blksz = hdr->b_size; spa_t *spa = hdr->b_spa; arc_buf_contents_t type = hdr->b_type; + uint32_t flags = hdr->b_flags; ASSERT(hdr->b_datacnt > 1); /* @@ -2473,6 +2781,12 @@ arc_release(arc_buf_t *buf, void *tag) atomic_add_64(size, -hdr->b_size); } hdr->b_datacnt -= 1; + if (hdr->b_l2hdr != NULL) { + mutex_enter(&l2arc_buflist_mtx); + l2hdr = hdr->b_l2hdr; + hdr->b_l2hdr = NULL; + buf_size = hdr->b_size; + } arc_cksum_verify(buf); mutex_exit(hash_lock); @@ -2484,21 +2798,27 @@ arc_release(arc_buf_t *buf, void *tag) nhdr->b_buf = buf; nhdr->b_state = arc_anon; nhdr->b_arc_access = 0; - nhdr->b_flags = 0; + nhdr->b_flags = flags & ARC_L2_WRITING; + nhdr->b_l2hdr = NULL; nhdr->b_datacnt = 1; nhdr->b_freeze_cksum = NULL; (void) refcount_add(&nhdr->b_refcnt, tag); buf->b_hdr = nhdr; atomic_add_64(&arc_anon->arcs_size, blksz); - - hdr = nhdr; } else { ASSERT(refcount_count(&hdr->b_refcnt) == 1); ASSERT(!list_link_active(&hdr->b_arc_node)); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); arc_change_state(arc_anon, hdr, hash_lock); hdr->b_arc_access = 0; + if (hdr->b_l2hdr != NULL) { + mutex_enter(&l2arc_buflist_mtx); + l2hdr = hdr->b_l2hdr; + hdr->b_l2hdr = NULL; + buf_size = hdr->b_size; + } mutex_exit(hash_lock); + bzero(&hdr->b_dva, sizeof (dva_t)); hdr->b_birth = 0; hdr->b_cksum0 = 0; @@ -2506,6 +2826,14 @@ arc_release(arc_buf_t *buf, void *tag) } buf->b_efunc = NULL; buf->b_private = NULL; + + if (l2hdr) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -buf_size); + } + if (MUTEX_HELD(&l2arc_buflist_mtx)) + mutex_exit(&l2arc_buflist_mtx); } int @@ -2559,7 
+2887,7 @@ arc_write_ready(zio_t *zio) } mutex_exit(&hdr->b_freeze_lock); } - arc_cksum_compute(buf); + arc_cksum_compute(buf, B_FALSE); hdr->b_flags |= ARC_IO_IN_PROGRESS; } @@ -2704,6 +3032,7 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ab->b_buf->b_private = NULL; mutex_exit(hash_lock); } else if (refcount_is_zero(&ab->b_refcnt)) { + ab->b_flags |= ARC_FREE_IN_PROGRESS; mutex_exit(hash_lock); arc_hdr_destroy(ab); ARCSTAT_BUMP(arcstat_deleted); @@ -2847,6 +3176,7 @@ arc_init(void) arc_mru_ghost = &ARC_mru_ghost; arc_mfu = &ARC_mfu; arc_mfu_ghost = &ARC_mfu_ghost; + arc_l2c_only = &ARC_l2c_only; arc_size = 0; mutex_init(&arc_anon->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); @@ -2854,6 +3184,7 @@ arc_init(void) mutex_init(&arc_mru_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&arc_mfu_ghost->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&arc_l2c_only->arcs_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&arc_mru->arcs_list[ARC_BUFC_METADATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); @@ -2871,6 +3202,10 @@ arc_init(void) sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); list_create(&arc_mfu_ghost->arcs_list[ARC_BUFC_DATA], sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_METADATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); + list_create(&arc_l2c_only->arcs_list[ARC_BUFC_DATA], + sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_arc_node)); buf_init(); @@ -2932,3 +3267,868 @@ arc_fini(void) buf_fini(); } + +/* + * Level 2 ARC + * + * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk. + * It uses dedicated storage devices to hold cached data, which are populated + * using large infrequent writes. The main role of this cache is to boost + * the performance of random read workloads. The intended L2ARC devices + * include short-stroked disks, solid state disks, and other media with + * substantially faster read latency than disk. + * + * +-----------------------+ + * | ARC | + * +-----------------------+ + * | ^ ^ + * | | | + * l2arc_feed_thread() arc_read() + * | | | + * | l2arc read | + * V | | + * +---------------+ | + * | L2ARC | | + * +---------------+ | + * | ^ | + * l2arc_write() | | + * | | | + * V | | + * +-------+ +-------+ + * | vdev | | vdev | + * | cache | | cache | + * +-------+ +-------+ + * +=========+ .-----. + * : L2ARC : |-_____-| + * : devices : | Disks | + * +=========+ `-_____-' + * + * Read requests are satisfied from the following sources, in order: + * + * 1) ARC + * 2) vdev cache of L2ARC devices + * 3) L2ARC devices + * 4) vdev cache of disks + * 5) disks + * + * Some L2ARC device types exhibit extremely slow write performance. + * To accommodate for this there are some significant differences between + * the L2ARC and traditional cache design: + * + * 1. There is no eviction path from the ARC to the L2ARC. Evictions from + * the ARC behave as usual, freeing buffers and placing headers on ghost + * lists. The ARC does not send buffers to the L2ARC during eviction as + * this would add inflated write latencies for all ARC memory pressure. + * + * 2. The L2ARC attempts to cache data from the ARC before it is evicted. + * It does this by periodically scanning buffers from the eviction-end of + * the MFU and MRU ARC lists, copying them to the L2ARC devices if they are + * not already there. 
It scans until a headroom of buffers is satisfied, + * which itself is a buffer for ARC eviction. The thread that does this is + * l2arc_feed_thread(), illustrated below; example sizes are included to + * provide a better sense of ratio than this diagram: + * + * head --> tail + * +---------------------+----------+ + * ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->. # already on L2ARC + * +---------------------+----------+ | o L2ARC eligible + * ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->| : ARC buffer + * +---------------------+----------+ | + * 15.9 Gbytes ^ 32 Mbytes | + * headroom | + * l2arc_feed_thread() + * | + * l2arc write hand <--[oooo]--' + * | 8 Mbyte + * | write max + * V + * +==============================+ + * L2ARC dev |####|#|###|###| |####| ... | + * +==============================+ + * 32 Gbytes + * + * 3. If an ARC buffer is copied to the L2ARC but then hit instead of + * evicted, then the L2ARC has cached a buffer much sooner than it probably + * needed to, potentially wasting L2ARC device bandwidth and storage. It is + * safe to say that this is an uncommon case, since buffers at the end of + * the ARC lists have moved there due to inactivity. + * + * 4. If the ARC evicts faster than the L2ARC can maintain a headroom, + * then the L2ARC simply misses copying some buffers. This serves as a + * pressure valve to prevent heavy read workloads from both stalling the ARC + * with waits and clogging the L2ARC with writes. This also helps prevent + * the potential for the L2ARC to churn if it attempts to cache content too + * quickly, such as during backups of the entire pool. + * + * 5. Writes to the L2ARC devices are grouped and sent in-sequence, so that + * the vdev queue can aggregate them into larger and fewer writes. Each + * device is written to in a rotor fashion, sweeping writes through + * available space then repeating. + * + * 6. The L2ARC does not store dirty content. It never needs to flush + * write buffers back to disk based storage. + * + * 7. If an ARC buffer is written (and dirtied) which also exists in the + * L2ARC, the now stale L2ARC buffer is immediately dropped. + * + * The performance of the L2ARC can be tweaked by a number of tunables, which + * may be necessary for different workloads: + * + * l2arc_write_max max write bytes per interval + * l2arc_noprefetch skip caching prefetched buffers + * l2arc_headroom number of max device writes to precache + * l2arc_feed_secs seconds between L2ARC writing + * + * Tunables may be removed or added as future performance improvements are + * integrated, and also may become zpool properties. + */ + +static void +l2arc_hdr_stat_add(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, sizeof (arc_buf_hdr_t) + + sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_hdr_size, -sizeof (arc_buf_hdr_t)); +} + +static void +l2arc_hdr_stat_remove(void) +{ + ARCSTAT_INCR(arcstat_l2_hdr_size, -sizeof (arc_buf_hdr_t) - + sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_hdr_size, sizeof (arc_buf_hdr_t)); +} + +/* + * Cycle through L2ARC devices. This is how L2ARC load balances. + * This is called with l2arc_dev_mtx held, which also locks out spa removal. + */ +static l2arc_dev_t * +l2arc_dev_get_next(void) +{ + l2arc_dev_t *next; + + if (l2arc_dev_last == NULL) { + next = list_head(l2arc_dev_list); + } else { + next = list_next(l2arc_dev_list, l2arc_dev_last); + if (next == NULL) + next = list_head(l2arc_dev_list); + } + + l2arc_dev_last = next; + + return (next); +} + +/* + * A write to a cache device has completed. 
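
The rotor behaviour described in point 5, and the wrap check applied when the write hand nears the end of the device, can be modelled in a few lines. The stand-alone user-space sketch below is only an illustration: the struct, field names and sizes are invented for the example, while the real logic lives in l2arc_write_buffers() and l2arc_evict() later in this patch.

/*
 * Stand-alone model of the L2ARC write-hand rotor.  Sizes are illustrative.
 */
#include <stdio.h>
#include <stdint.h>

typedef struct l2dev_model {
        uint64_t start;         /* first usable byte on the cache device */
        uint64_t end;           /* one past the last usable byte */
        uint64_t hand;          /* where the next write will land */
        uint64_t write;         /* max bytes written per feed interval */
} l2dev_model_t;

/*
 * Advance the write hand after one feed interval.  The wrap check mirrors
 * the one in l2arc_write_buffers(): once the hand is within one write of
 * the end of the device, it jumps back to the start and a new sweep begins.
 */
static void
advance_hand(l2dev_model_t *d, uint64_t written)
{
        d->hand += written;
        if (d->hand >= d->end - d->write)
                d->hand = d->start;
}

int
main(void)
{
        /* A deliberately tiny "device" so the wrap shows up quickly. */
        l2dev_model_t d = { 0, 32ULL << 20, 0, 8ULL << 20 };

        for (int i = 0; i < 6; i++) {
                printf("feed %d: hand at %llu MB\n", i,
                    (unsigned long long)(d.hand >> 20));
                advance_hand(&d, d.write);
        }
        return (0);
}
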
Update all headers to allow + * reads from these buffers to begin. + */ +static void +l2arc_write_done(zio_t *zio) +{ + l2arc_write_callback_t *cb; + l2arc_dev_t *dev; + list_t *buflist; + l2arc_data_free_t *df, *df_prev; + arc_buf_hdr_t *head, *ab, *ab_prev; + kmutex_t *hash_lock; + + cb = zio->io_private; + ASSERT(cb != NULL); + dev = cb->l2wcb_dev; + ASSERT(dev != NULL); + head = cb->l2wcb_head; + ASSERT(head != NULL); + buflist = dev->l2ad_buflist; + ASSERT(buflist != NULL); + DTRACE_PROBE2(l2arc__iodone, zio_t *, zio, + l2arc_write_callback_t *, cb); + + if (zio->io_error != 0) + ARCSTAT_BUMP(arcstat_l2_writes_error); + + mutex_enter(&l2arc_buflist_mtx); + + /* + * All writes completed, or an error was hit. + */ + for (ab = list_prev(buflist, head); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * This buffer misses out. It may be in a stage + * of eviction. Its ARC_L2_WRITING flag will be + * left set, denying reads to this buffer. + */ + ARCSTAT_BUMP(arcstat_l2_writes_hdr_miss); + continue; + } + + if (zio->io_error != 0) { + /* + * Error - invalidate L2ARC entry. + */ + ab->b_l2hdr = NULL; + } + + /* + * Allow ARC to begin reads to this L2ARC entry. + */ + ab->b_flags &= ~ARC_L2_WRITING; + + mutex_exit(hash_lock); + } + + atomic_inc_64(&l2arc_writes_done); + list_remove(buflist, head); + kmem_cache_free(hdr_cache, head); + mutex_exit(&l2arc_buflist_mtx); + + /* + * Free buffers that were tagged for destruction. + */ + mutex_enter(&l2arc_free_on_write_mtx); + buflist = l2arc_free_on_write; + for (df = list_tail(buflist); df; df = df_prev) { + df_prev = list_prev(buflist, df); + ASSERT(df->l2df_data != NULL); + ASSERT(df->l2df_func != NULL); + df->l2df_func(df->l2df_data, df->l2df_size); + list_remove(buflist, df); + kmem_free(df, sizeof (l2arc_data_free_t)); + } + mutex_exit(&l2arc_free_on_write_mtx); + + kmem_free(cb, sizeof (l2arc_write_callback_t)); +} + +/* + * A read to a cache device completed. Validate buffer contents before + * handing over to the regular ARC routines. + */ +static void +l2arc_read_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + zio_t *rzio; + kmutex_t *hash_lock; + int equal, err = 0; + + cb = zio->io_private; + ASSERT(cb != NULL); + buf = cb->l2rcb_buf; + ASSERT(buf != NULL); + hdr = buf->b_hdr; + ASSERT(hdr != NULL); + + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + /* + * Check this survived the L2ARC journey. + */ + equal = arc_cksum_equal(buf); + if (equal && zio->io_error == 0 && !HDR_L2_EVICTED(hdr)) { + mutex_exit(hash_lock); + zio->io_private = buf; + arc_read_done(zio); + } else { + mutex_exit(hash_lock); + /* + * Buffer didn't survive caching. Increment stats and + * reissue to the original storage device. + */ + if (zio->io_error != 0) + ARCSTAT_BUMP(arcstat_l2_io_error); + if (!equal) + ARCSTAT_BUMP(arcstat_l2_cksum_bad); + + zio->io_flags &= ~ZIO_FLAG_DONT_CACHE; + rzio = zio_read(NULL, cb->l2rcb_spa, &cb->l2rcb_bp, + buf->b_data, zio->io_size, arc_read_done, buf, + zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb); + + /* + * Since this is a seperate thread, we can wait on this + * I/O whether there is an io_waiter or not. + */ + err = zio_wait(rzio); + + /* + * Let the resent I/O call arc_read_done() instead. + * io_error is set to the reissued I/O error status. 
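
The recovery path in l2arc_read_done() reduces to "trust the cached copy only if both the I/O and the checksum verification succeed, otherwise reissue the read against the original device and return that status". A minimal sketch of the pattern is below; the function-pointer helpers stand in for the zio machinery and none of these names exist in the patch.

#include <stddef.h>

/* Stand-ins for the cache-device read, pool read and checksum check. */
typedef int (*read_fn_t)(void *buf, size_t len);
typedef int (*verify_fn_t)(const void *buf, size_t len);

/*
 * Try the cache device first; fall back to the original storage when the
 * cached copy fails either the read itself or checksum verification.  The
 * caller sees the status of whichever read it ends up depending on, just
 * as l2arc_read_done() propagates the reissued zio's error.
 */
static int
read_with_fallback(void *buf, size_t len, read_fn_t from_cache,
    read_fn_t from_disk, verify_fn_t checksum_ok)
{
        if (from_cache(buf, len) == 0 && checksum_ok(buf, len))
                return (0);

        return (from_disk(buf, len));
}
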
+ */ + zio->io_done = NULL; + zio->io_waiter = NULL; + zio->io_error = err; + } + + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + +/* + * This is the list priority from which the L2ARC will search for pages to + * cache. This is used within loops (0..3) to cycle through lists in the + * desired order. This order can have a significant effect on cache + * performance. + * + * Currently the metadata lists are hit first, MFU then MRU, followed by + * the data lists. This function returns a locked list, and also returns + * the lock pointer. + */ +static list_t * +l2arc_list_locked(int list_num, kmutex_t **lock) +{ + list_t *list; + + ASSERT(list_num >= 0 && list_num <= 3); + + switch (list_num) { + case 0: + list = &arc_mfu->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 1: + list = &arc_mru->arcs_list[ARC_BUFC_METADATA]; + *lock = &arc_mru->arcs_mtx; + break; + case 2: + list = &arc_mfu->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mfu->arcs_mtx; + break; + case 3: + list = &arc_mru->arcs_list[ARC_BUFC_DATA]; + *lock = &arc_mru->arcs_mtx; + break; + } + + ASSERT(!(MUTEX_HELD(*lock))); + mutex_enter(*lock); + return (list); +} + +/* + * Evict buffers from the device write hand to the distance specified in + * bytes. This distance may span populated buffers, it may span nothing. + * This is clearing a region on the L2ARC device ready for writing. + * If the 'all' boolean is set, every buffer is evicted. + */ +static void +l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) +{ + list_t *buflist; + l2arc_buf_hdr_t *abl2; + arc_buf_hdr_t *ab, *ab_prev; + kmutex_t *hash_lock; + uint64_t taddr; + + ASSERT(MUTEX_HELD(&l2arc_dev_mtx)); + + buflist = dev->l2ad_buflist; + + if (buflist == NULL) + return; + + if (!all && dev->l2ad_first) { + /* + * This is the first sweep through the device. There is + * nothing to evict. + */ + return; + } + + if (dev->l2ad_hand >= (dev->l2ad_end - (2 * dev->l2ad_write))) { + /* + * When nearing the end of the device, evict to the end + * before the device write hand jumps to the start. + */ + taddr = dev->l2ad_end; + } else { + taddr = dev->l2ad_hand + distance; + } + DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist, + uint64_t, taddr, boolean_t, all); + +top: + mutex_enter(&l2arc_buflist_mtx); + for (ab = list_tail(buflist); ab; ab = ab_prev) { + ab_prev = list_prev(buflist, ab); + + hash_lock = HDR_LOCK(ab); + if (!mutex_tryenter(hash_lock)) { + /* + * Missed the hash lock. Retry. + */ + ARCSTAT_BUMP(arcstat_l2_evict_lock_retry); + mutex_exit(&l2arc_buflist_mtx); + mutex_enter(hash_lock); + mutex_exit(hash_lock); + goto top; + } + + if (HDR_L2_WRITE_HEAD(ab)) { + /* + * We hit a write head node. Leave it for + * l2arc_write_done(). + */ + list_remove(buflist, ab); + mutex_exit(hash_lock); + continue; + } + + if (!all && ab->b_l2hdr != NULL && + (ab->b_l2hdr->b_daddr > taddr || + ab->b_l2hdr->b_daddr < dev->l2ad_hand)) { + /* + * We've evicted to the target address, + * or the end of the device. + */ + mutex_exit(hash_lock); + break; + } + + if (HDR_FREE_IN_PROGRESS(ab)) { + /* + * Already on the path to destruction. + */ + mutex_exit(hash_lock); + continue; + } + + if (ab->b_state == arc_l2c_only) { + ASSERT(!HDR_L2_READING(ab)); + /* + * This doesn't exist in the ARC. Destroy. + * arc_hdr_destroy() will call list_remove() + * and decrement arcstat_l2_size. + */ + arc_change_state(arc_anon, ab, hash_lock); + arc_hdr_destroy(ab); + } else { + /* + * Tell ARC this no longer exists in L2ARC. 
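
The list ordering is easiest to see from the caller's side. The fragment below sketches how l2arc_write_buffers(), further down in this patch, consumes l2arc_list_locked(): one pass per priority, always dropping the returned list lock before moving to the next priority. It is an excerpt-style sketch, not additional patch code.

        /* Sketch of the caller's loop; see l2arc_write_buffers(). */
        for (int try = 0; try <= 3; try++) {
                kmutex_t *list_lock;
                list_t *list = l2arc_list_locked(try, &list_lock);
                arc_buf_hdr_t *ab;

                /* Scan from the eviction end (tail) toward the head. */
                for (ab = list_tail(list); ab != NULL;
                    ab = list_prev(list, ab)) {
                        /* ... apply the write-eligibility checks ... */
                }

                mutex_exit(list_lock);
        }
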
+ */ + if (ab->b_l2hdr != NULL) { + abl2 = ab->b_l2hdr; + ab->b_l2hdr = NULL; + kmem_free(abl2, sizeof (l2arc_buf_hdr_t)); + ARCSTAT_INCR(arcstat_l2_size, -ab->b_size); + } + list_remove(buflist, ab); + + /* + * This may have been leftover after a + * failed write. + */ + ab->b_flags &= ~ARC_L2_WRITING; + + /* + * Invalidate issued or about to be issued + * reads, since we may be about to write + * over this location. + */ + if (HDR_L2_READING(ab)) { + ARCSTAT_BUMP(arcstat_l2_evict_reading); + ab->b_flags |= ARC_L2_EVICTED; + } + } + mutex_exit(hash_lock); + } + mutex_exit(&l2arc_buflist_mtx); + + spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict)); + dev->l2ad_evict = taddr; +} + +/* + * Find and write ARC buffers to the L2ARC device. + * + * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid + * for reading until they have completed writing. + */ +static void +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev) +{ + arc_buf_hdr_t *ab, *ab_prev, *head; + l2arc_buf_hdr_t *hdrl2; + list_t *list; + uint64_t passed_sz, write_sz, buf_sz; + uint64_t target_sz = dev->l2ad_write; + uint64_t headroom = dev->l2ad_write * l2arc_headroom; + void *buf_data; + kmutex_t *hash_lock, *list_lock; + boolean_t have_lock, full; + l2arc_write_callback_t *cb; + zio_t *pio, *wzio; + + ASSERT(MUTEX_HELD(&l2arc_dev_mtx)); + ASSERT(dev->l2ad_vdev != NULL); + + pio = NULL; + write_sz = 0; + full = B_FALSE; + head = kmem_cache_alloc(hdr_cache, KM_SLEEP); + head->b_flags |= ARC_L2_WRITE_HEAD; + + /* + * Copy buffers for L2ARC writing. + */ + mutex_enter(&l2arc_buflist_mtx); + for (int try = 0; try <= 3; try++) { + list = l2arc_list_locked(try, &list_lock); + passed_sz = 0; + + for (ab = list_tail(list); ab; ab = ab_prev) { + ab_prev = list_prev(list, ab); + + hash_lock = HDR_LOCK(ab); + have_lock = MUTEX_HELD(hash_lock); + if (!have_lock && !mutex_tryenter(hash_lock)) { + /* + * Skip this buffer rather than waiting. + */ + continue; + } + + passed_sz += ab->b_size; + if (passed_sz > headroom) { + /* + * Searched too far. + */ + mutex_exit(hash_lock); + break; + } + + if (ab->b_spa != spa) { + mutex_exit(hash_lock); + continue; + } + + if (ab->b_l2hdr != NULL) { + /* + * Already in L2ARC. + */ + mutex_exit(hash_lock); + continue; + } + + if (HDR_IO_IN_PROGRESS(ab) || HDR_DONT_L2CACHE(ab)) { + mutex_exit(hash_lock); + continue; + } + + if ((write_sz + ab->b_size) > target_sz) { + full = B_TRUE; + mutex_exit(hash_lock); + break; + } + + if (ab->b_buf == NULL) { + DTRACE_PROBE1(l2arc__buf__null, void *, ab); + mutex_exit(hash_lock); + continue; + } + + if (pio == NULL) { + /* + * Insert a dummy header on the buflist so + * l2arc_write_done() can find where the + * write buffers begin without searching. + */ + list_insert_head(dev->l2ad_buflist, head); + + cb = kmem_alloc( + sizeof (l2arc_write_callback_t), KM_SLEEP); + cb->l2wcb_dev = dev; + cb->l2wcb_head = head; + pio = zio_root(spa, l2arc_write_done, cb, + ZIO_FLAG_CANFAIL); + } + + /* + * Create and add a new L2ARC header. + */ + hdrl2 = kmem_zalloc(sizeof (l2arc_buf_hdr_t), KM_SLEEP); + hdrl2->b_dev = dev; + hdrl2->b_daddr = dev->l2ad_hand; + + ab->b_flags |= ARC_L2_WRITING; + ab->b_l2hdr = hdrl2; + list_insert_head(dev->l2ad_buflist, ab); + buf_data = ab->b_buf->b_data; + buf_sz = ab->b_size; + + /* + * Compute and store the buffer cksum before + * writing. On debug the cksum is verified first. 
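
l2arc_write_buffers() applies its per-buffer skip conditions inline; collected into a single predicate they read as follows. The helper name is hypothetical, while the flags and fields are the ones used in this patch. The size-based checks (headroom searched, write target reached) are omitted because they depend on running totals kept by the caller.

/*
 * Hypothetical helper gathering the inline checks from l2arc_write_buffers().
 */
static boolean_t
l2arc_write_eligible(spa_t *spa, arc_buf_hdr_t *ab)
{
        if (ab->b_spa != spa)           /* belongs to another pool */
                return (B_FALSE);
        if (ab->b_l2hdr != NULL)        /* already cached in the L2ARC */
                return (B_FALSE);
        if (HDR_IO_IN_PROGRESS(ab))     /* being read or written right now */
                return (B_FALSE);
        if (HDR_DONT_L2CACHE(ab))       /* e.g. prefetched, per l2arc_noprefetch */
                return (B_FALSE);
        if (ab->b_buf == NULL)          /* no data attached to the header */
                return (B_FALSE);
        return (B_TRUE);
}
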
+ */ + arc_cksum_verify(ab->b_buf); + arc_cksum_compute(ab->b_buf, B_TRUE); + + mutex_exit(hash_lock); + + wzio = zio_write_phys(pio, dev->l2ad_vdev, + dev->l2ad_hand, buf_sz, buf_data, ZIO_CHECKSUM_OFF, + NULL, NULL, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); + + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + (void) zio_nowait(wzio); + + write_sz += buf_sz; + dev->l2ad_hand += buf_sz; + } + + mutex_exit(list_lock); + + if (full == B_TRUE) + break; + } + mutex_exit(&l2arc_buflist_mtx); + + if (pio == NULL) { + ASSERT3U(write_sz, ==, 0); + kmem_cache_free(hdr_cache, head); + return; + } + + ASSERT3U(write_sz, <=, target_sz); + ARCSTAT_BUMP(arcstat_l2_writes_sent); + ARCSTAT_INCR(arcstat_l2_size, write_sz); + spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); + + /* + * Bump device hand to the device start if it is approaching the end. + * l2arc_evict() will already have evicted ahead for this case. + */ + if (dev->l2ad_hand >= (dev->l2ad_end - dev->l2ad_write)) { + spa_l2cache_space_update(dev->l2ad_vdev, 0, + dev->l2ad_end - dev->l2ad_hand); + dev->l2ad_hand = dev->l2ad_start; + dev->l2ad_evict = dev->l2ad_start; + dev->l2ad_first = B_FALSE; + } + + (void) zio_wait(pio); +} + +/* + * This thread feeds the L2ARC at regular intervals. This is the beating + * heart of the L2ARC. + */ +static void +l2arc_feed_thread(void) +{ + callb_cpr_t cpr; + l2arc_dev_t *dev; + spa_t *spa; + int interval; + boolean_t startup = B_TRUE; + + CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&l2arc_feed_thr_lock); + + while (l2arc_thread_exit == 0) { + /* + * Initially pause for L2ARC_FEED_DELAY seconds as a grace + * interval during boot, followed by l2arc_feed_secs seconds + * thereafter. + */ + CALLB_CPR_SAFE_BEGIN(&cpr); + if (startup) { + interval = L2ARC_FEED_DELAY; + startup = B_FALSE; + } else { + interval = l2arc_feed_secs; + } + (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, + lbolt + (hz * interval)); + CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + + /* + * Do nothing until L2ARC devices exist. + */ + mutex_enter(&l2arc_dev_mtx); + if (l2arc_ndev == 0) { + mutex_exit(&l2arc_dev_mtx); + continue; + } + + /* + * Avoid contributing to memory pressure. + */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_abort_lowmem); + mutex_exit(&l2arc_dev_mtx); + continue; + } + + /* + * This selects the next l2arc device to write to, and in + * doing so the next spa to feed from: dev->l2ad_spa. + */ + if ((dev = l2arc_dev_get_next()) == NULL) { + mutex_exit(&l2arc_dev_mtx); + continue; + } + spa = dev->l2ad_spa; + ASSERT(spa != NULL); + ARCSTAT_BUMP(arcstat_l2_feeds); + + /* + * Evict L2ARC buffers that will be overwritten. + */ + l2arc_evict(dev, dev->l2ad_write, B_FALSE); + + /* + * Write ARC buffers. + */ + l2arc_write_buffers(spa, dev); + mutex_exit(&l2arc_dev_mtx); + } + + l2arc_thread_exit = 0; + cv_broadcast(&l2arc_feed_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ + thread_exit(); +} + +/* + * Add a vdev for use by the L2ARC. By this point the spa has already + * validated the vdev and opened it. + */ +void +l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) +{ + l2arc_dev_t *adddev; + + /* + * Create a new l2arc device entry. 
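
Taken together, l2arc_write_max and l2arc_feed_secs bound the steady-state fill rate of a cache device, which is why the tunables list earlier calls them out first. The stand-alone calculation below uses the example sizes from the diagram (an 8 MB write maximum and a 32 GB device); the one-second feed interval is an assumption for the example, not a quoted default.

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        /* Example values only; the real tunables may differ. */
        uint64_t write_max = 8ULL << 20;        /* bytes per feed interval */
        uint64_t feed_secs = 1;                 /* seconds between feeds */
        uint64_t dev_size = 32ULL << 30;        /* cache device capacity */

        uint64_t fill_per_sec = write_max / feed_secs;
        uint64_t full_sweep_secs = dev_size / fill_per_sec;

        printf("max fill rate: %llu MB/s\n",
            (unsigned long long)(fill_per_sec >> 20));
        printf("one full sweep of the device: ~%llu s (~%llu min)\n",
            (unsigned long long)full_sweep_secs,
            (unsigned long long)(full_sweep_secs / 60));
        return (0);
}
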
+ */ + adddev = kmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); + adddev->l2ad_spa = spa; + adddev->l2ad_vdev = vd; + adddev->l2ad_write = l2arc_write_max; + adddev->l2ad_start = start; + adddev->l2ad_end = end; + adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_evict = adddev->l2ad_start; + adddev->l2ad_first = B_TRUE; + ASSERT3U(adddev->l2ad_write, >, 0); + + /* + * This is a list of all ARC buffers that are still valid on the + * device. + */ + adddev->l2ad_buflist = kmem_zalloc(sizeof (list_t), KM_SLEEP); + list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l2node)); + + spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0); + + /* + * Add device to global list + */ + mutex_enter(&l2arc_dev_mtx); + list_insert_head(l2arc_dev_list, adddev); + atomic_inc_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); +} + +/* + * Remove a vdev from the L2ARC. + */ +void +l2arc_remove_vdev(vdev_t *vd) +{ + l2arc_dev_t *dev, *nextdev, *remdev = NULL; + + /* + * We can only grab the spa config lock when cache device writes + * complete. + */ + ASSERT3U(l2arc_writes_sent, ==, l2arc_writes_done); + + /* + * Find the device by vdev + */ + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev; dev = nextdev) { + nextdev = list_next(l2arc_dev_list, dev); + if (vd == dev->l2ad_vdev) { + remdev = dev; + break; + } + } + ASSERT(remdev != NULL); + + /* + * Remove device from global list + */ + list_remove(l2arc_dev_list, remdev); + l2arc_dev_last = NULL; /* may have been invalidated */ + + /* + * Clear all buflists and ARC references. L2ARC device flush. + */ + l2arc_evict(remdev, 0, B_TRUE); + list_destroy(remdev->l2ad_buflist); + kmem_free(remdev->l2ad_buflist, sizeof (list_t)); + kmem_free(remdev, sizeof (l2arc_dev_t)); + + atomic_dec_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); +} + +void +l2arc_init() +{ + l2arc_thread_exit = 0; + l2arc_ndev = 0; + l2arc_writes_sent = 0; + l2arc_writes_done = 0; + + mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_buflist_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); + + l2arc_dev_list = &L2ARC_dev_list; + l2arc_free_on_write = &L2ARC_free_on_write; + list_create(l2arc_dev_list, sizeof (l2arc_dev_t), + offsetof(l2arc_dev_t, l2ad_node)); + list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), + offsetof(l2arc_data_free_t, l2df_list_node)); + + (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, + TS_RUN, minclsyspri); +} + +void +l2arc_fini() +{ + mutex_enter(&l2arc_feed_thr_lock); + cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ + l2arc_thread_exit = 1; + while (l2arc_thread_exit != 0) + cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); + mutex_exit(&l2arc_feed_thr_lock); + + mutex_destroy(&l2arc_feed_thr_lock); + cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_dev_mtx); + mutex_destroy(&l2arc_buflist_mtx); + mutex_destroy(&l2arc_free_on_write_mtx); + + list_destroy(l2arc_dev_list); + list_destroy(l2arc_free_on_write); +} diff --git a/usr/src/uts/common/fs/zfs/dmu.c b/usr/src/uts/common/fs/zfs/dmu.c index 0c0f7cb2d9..ef6ed22f85 100644 --- a/usr/src/uts/common/fs/zfs/dmu.c +++ b/usr/src/uts/common/fs/zfs/dmu.c @@ -1036,6 +1036,7 @@ dmu_init(void) dbuf_init(); dnode_init(); arc_init(); + l2arc_init(); } void @@ -1044,4 +1045,5 @@ dmu_fini(void) arc_fini(); dnode_fini(); 
dbuf_fini(); + l2arc_fini(); } diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index b2840e4e87..589dc7e3de 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -341,7 +341,7 @@ metaslab_fini(metaslab_t *msp) int t; vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, - -msp->ms_smo.smo_alloc); + -msp->ms_smo.smo_alloc, B_TRUE); metaslab_group_remove(mg, msp); @@ -569,10 +569,10 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) space_map_create(&msp->ms_freemap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); } - vdev_space_update(vd, sm->sm_size, 0); + vdev_space_update(vd, sm->sm_size, 0, B_TRUE); } - vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc); + vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE); ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 8b0a936e0b..983e2c3154 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -56,6 +56,7 @@ #include <sys/dsl_prop.h> #include <sys/dsl_synctask.h> #include <sys/fs/zfs.h> +#include <sys/arc.h> #include <sys/callb.h> #include <sys/systeminfo.h> #include <sys/sunddi.h> @@ -662,6 +663,11 @@ spa_unload(spa_t *spa) spa_config_exit(spa, FTAG); /* + * Drop and purge level 2 cache + */ + spa_l2cache_drop(spa); + + /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { @@ -676,15 +682,28 @@ spa_unload(spa_t *spa) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); - for (i = 0; i < spa->spa_nspares; i++) - vdev_free(spa->spa_spares[i]); - if (spa->spa_spares) { - kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); - spa->spa_spares = NULL; + for (i = 0; i < spa->spa_spares.sav_count; i++) + vdev_free(spa->spa_spares.sav_vdevs[i]); + if (spa->spa_spares.sav_vdevs) { + kmem_free(spa->spa_spares.sav_vdevs, + spa->spa_spares.sav_count * sizeof (void *)); + spa->spa_spares.sav_vdevs = NULL; } - if (spa->spa_sparelist) { - nvlist_free(spa->spa_sparelist); - spa->spa_sparelist = NULL; + if (spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; + } + + for (i = 0; i < spa->spa_l2cache.sav_count; i++) + vdev_free(spa->spa_l2cache.sav_vdevs[i]); + if (spa->spa_l2cache.sav_vdevs) { + kmem_free(spa->spa_l2cache.sav_vdevs, + spa->spa_l2cache.sav_count * sizeof (void *)); + spa->spa_l2cache.sav_vdevs = NULL; + } + if (spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; } spa->spa_async_suspended = 0; @@ -693,8 +712,8 @@ spa_unload(spa_t *spa) /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in - * 'spa_sparelist'. We parse this into vdevs, try to open them, and then - * re-generate a more complete list including status information. + * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and + * then re-generate a more complete list including status information. */ static void spa_load_spares(spa_t *spa) @@ -707,8 +726,8 @@ spa_load_spares(spa_t *spa) /* * First, close and free any existing spare vdevs. 
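
The spa_unload() and spa_load_spares() changes above replace the old spa_spares/spa_nspares/spa_sparelist trio with a per-class spa_aux_vdev_t that is reused for the l2cache devices. Its definition lives in a header outside this excerpt; the sketch below is reconstructed from how the sav_* fields are used in this patch, so treat the exact types as an approximation.

/* Reconstructed from field uses in this patch; not the authoritative definition. */
typedef struct spa_aux_vdev {
        uint64_t        sav_object;     /* MOS object holding the device nvlist */
        nvlist_t        *sav_config;    /* stashed nvlist of devices */
        vdev_t          **sav_vdevs;    /* opened vdevs */
        int             sav_count;      /* number of opened vdevs */
        boolean_t       sav_sync;       /* config changed, sync it to the MOS */
        nvlist_t        **sav_pending;  /* devices pending in-use validation */
        uint_t          sav_npending;   /* number of pending devices */
} spa_aux_vdev_t;
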
*/ - for (i = 0; i < spa->spa_nspares; i++) { - vd = spa->spa_spares[i]; + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL && @@ -718,17 +737,18 @@ spa_load_spares(spa_t *spa) vdev_free(vd); } - if (spa->spa_spares) - kmem_free(spa->spa_spares, spa->spa_nspares * sizeof (void *)); + if (spa->spa_spares.sav_vdevs) + kmem_free(spa->spa_spares.sav_vdevs, + spa->spa_spares.sav_count * sizeof (void *)); - if (spa->spa_sparelist == NULL) + if (spa->spa_spares.sav_config == NULL) nspares = 0; else - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); - spa->spa_nspares = (int)nspares; - spa->spa_spares = NULL; + spa->spa_spares.sav_count = (int)nspares; + spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; @@ -742,13 +762,14 @@ spa_load_spares(spa_t *spa) * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ - spa->spa_spares = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); - for (i = 0; i < spa->spa_nspares; i++) { + spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), + KM_SLEEP); + for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); - spa->spa_spares[i] = vd; + spa->spa_spares.sav_vdevs[i] = vd; if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid)) != NULL) { if (!tvd->vdev_isspare) @@ -775,25 +796,160 @@ spa_load_spares(spa_t *spa) continue; vd->vdev_top = vd; - (void) vdev_validate_spare(vd); + if (vdev_validate_aux(vd) == 0) + spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. */ - VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - spares = kmem_alloc(spa->spa_nspares * sizeof (void *), KM_SLEEP); - for (i = 0; i < spa->spa_nspares; i++) - spares[i] = vdev_config_generate(spa, spa->spa_spares[i], - B_TRUE, B_TRUE); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - spares, spa->spa_nspares) == 0); - for (i = 0; i < spa->spa_nspares; i++) + spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), + KM_SLEEP); + for (i = 0; i < spa->spa_spares.sav_count; i++) + spares[i] = vdev_config_generate(spa, + spa->spa_spares.sav_vdevs[i], B_TRUE, B_TRUE, B_FALSE); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); + for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); - kmem_free(spares, spa->spa_nspares * sizeof (void *)); + kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); +} + +/* + * Load (or re-load) the current list of vdevs describing the active l2cache for + * this pool. When this is called, we have some form of basic information in + * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and + * then re-generate a more complete list including status information. + * Devices which are already active have their details maintained, and are + * not re-opened. 
+ */ +static void +spa_load_l2cache(spa_t *spa) +{ + nvlist_t **l2cache; + uint_t nl2cache; + int i, j, oldnvdevs; + uint64_t guid; + vdev_t *vd, **oldvdevs, **newvdevs; + spa_aux_vdev_t *sav = &spa->spa_l2cache; + + if (sav->sav_config != NULL) { + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); + } else { + nl2cache = 0; + } + + oldvdevs = sav->sav_vdevs; + oldnvdevs = sav->sav_count; + sav->sav_vdevs = NULL; + sav->sav_count = 0; + + /* + * Process new nvlist of vdevs. + */ + for (i = 0; i < nl2cache; i++) { + VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, + &guid) == 0); + + newvdevs[i] = NULL; + for (j = 0; j < oldnvdevs; j++) { + vd = oldvdevs[j]; + if (vd != NULL && guid == vd->vdev_guid) { + /* + * Retain previous vdev for add/remove ops. + */ + newvdevs[i] = vd; + oldvdevs[j] = NULL; + break; + } + } + + if (newvdevs[i] == NULL) { + /* + * Create new vdev + */ + VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, + VDEV_ALLOC_L2CACHE) == 0); + ASSERT(vd != NULL); + newvdevs[i] = vd; + + /* + * Commit this vdev as an l2cache device, + * even if it fails to open. + */ + spa_l2cache_add(vd); + + if (vdev_open(vd) != 0) + continue; + + vd->vdev_top = vd; + (void) vdev_validate_aux(vd); + + if (!vdev_is_dead(vd)) { + uint64_t size; + size = vdev_get_rsize(vd); + ASSERT3U(size, >, 0); + if (spa_mode & FWRITE) { + l2arc_add_vdev(spa, vd, + VDEV_LABEL_START_SIZE, + size - VDEV_LABEL_START_SIZE); + } + spa_l2cache_activate(vd); + } + } + } + + /* + * Purge vdevs that were dropped + */ + for (i = 0; i < oldnvdevs; i++) { + uint64_t pool; + + vd = oldvdevs[i]; + if (vd != NULL) { + if (spa_mode & FWRITE && + spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL) { + l2arc_remove_vdev(vd); + } + (void) vdev_close(vd); + spa_l2cache_remove(vd); + } + } + + if (oldvdevs) + kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); + + if (sav->sav_config == NULL) + goto out; + + sav->sav_vdevs = newvdevs; + sav->sav_count = (int)nl2cache; + + /* + * Recompute the stashed list of l2cache devices, with status + * information this time. + */ + VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, + DATA_TYPE_NVLIST_ARRAY) == 0); + + l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); + for (i = 0; i < sav->sav_count; i++) + l2cache[i] = vdev_config_generate(spa, + sav->sav_vdevs[i], B_TRUE, B_FALSE, B_TRUE); + VERIFY(nvlist_add_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); +out: + for (i = 0; i < sav->sav_count; i++) + nvlist_free(l2cache[i]); + if (sav->sav_count) + kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int @@ -1090,7 +1246,7 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * Load any hot spares for this pool. 
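
The first pass of spa_load_l2cache() keeps already-open cache vdevs by matching GUIDs against the incoming nvlist, so that add/remove operations do not close and reopen devices needlessly. The simplified user-space sketch below reduces that pass to a GUID match over two plain arrays; the types are stand-ins for the nvlist and vdev structures used in the patch.

#include <stddef.h>
#include <stdint.h>

/* Stand-in for an open cache device. */
typedef struct cache_dev {
        uint64_t guid;
        int      open;
} cache_dev_t;

/*
 * For each requested GUID, reuse the matching already-open device if one
 * exists; otherwise the caller would create and open a new one.  Anything
 * still left in oldv[] afterwards is later closed and purged, exactly as
 * spa_load_l2cache() does with its oldvdevs array.
 */
static void
retain_existing(cache_dev_t **newv, const uint64_t *guids, int nnew,
    cache_dev_t **oldv, int nold)
{
        for (int i = 0; i < nnew; i++) {
                newv[i] = NULL;
                for (int j = 0; j < nold; j++) {
                        if (oldv[j] != NULL && oldv[j]->guid == guids[i]) {
                                newv[i] = oldv[j];      /* keep it open */
                                oldv[j] = NULL;         /* don't purge it */
                                break;
                        }
                }
        }
}
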
*/ error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares_object); + DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); if (error != 0 && error != ENOENT) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -1099,8 +1255,8 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) } if (error == 0) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); - if (load_nvlist(spa, spa->spa_spares_object, - &spa->spa_sparelist) != 0) { + if (load_nvlist(spa, spa->spa_spares.sav_object, + &spa->spa_spares.sav_config) != 0) { vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); error = EIO; @@ -1112,6 +1268,34 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) spa_config_exit(spa, FTAG); } + /* + * Load any level 2 ARC devices for this pool. + */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_L2CACHE, sizeof (uint64_t), 1, + &spa->spa_l2cache.sav_object); + if (error != 0 && error != ENOENT) { + vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + if (error == 0) { + ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); + if (load_nvlist(spa, spa->spa_l2cache.sav_object, + &spa->spa_l2cache.sav_config) != 0) { + vdev_set_state(rvd, B_TRUE, + VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + error = EIO; + goto out; + } + + spa_config_enter(spa, RW_WRITER, FTAG); + spa_load_l2cache(spa); + spa_config_exit(spa, FTAG); + } + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, @@ -1372,6 +1556,9 @@ spa_inject_delref(spa_t *spa) mutex_exit(&spa_namespace_lock); } +/* + * Add spares device information to the nvlist. + */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { @@ -1383,12 +1570,12 @@ spa_add_spares(spa_t *spa, nvlist_t *config) uint_t vsc; uint64_t pool; - if (spa->spa_nspares == 0) + if (spa->spa_spares.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, + VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); if (nspares != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, @@ -1415,6 +1602,62 @@ spa_add_spares(spa_t *spa, nvlist_t *config) } } +/* + * Add l2cache device information to the nvlist, including vdev stats. + */ +static void +spa_add_l2cache(spa_t *spa, nvlist_t *config) +{ + nvlist_t **l2cache; + uint_t i, j, nl2cache; + nvlist_t *nvroot; + uint64_t guid; + vdev_t *vd; + vdev_stat_t *vs; + uint_t vsc; + + if (spa->spa_l2cache.sav_count == 0) + return; + + spa_config_enter(spa, RW_READER, FTAG); + + VERIFY(nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + if (nl2cache != 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + + /* + * Update level 2 cache device stats. 
+ */ + + for (i = 0; i < nl2cache; i++) { + VERIFY(nvlist_lookup_uint64(l2cache[i], + ZPOOL_CONFIG_GUID, &guid) == 0); + + vd = NULL; + for (j = 0; j < spa->spa_l2cache.sav_count; j++) { + if (guid == + spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { + vd = spa->spa_l2cache.sav_vdevs[j]; + break; + } + } + ASSERT(vd != NULL); + + VERIFY(nvlist_lookup_uint64_array(l2cache[i], + ZPOOL_CONFIG_STATS, (uint64_t **)&vs, &vsc) == 0); + vdev_get_stats(vd, vs); + } + } + + spa_config_exit(spa, FTAG); +} + int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { @@ -1429,6 +1672,7 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) spa_get_errlog_size(spa)) == 0); spa_add_spares(spa, *config); + spa_add_l2cache(spa, *config); } /* @@ -1457,45 +1701,46 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) } /* - * Validate that the 'spares' array is well formed. We must have an array of - * nvlists, each which describes a valid leaf vdev. If this is an import (mode - * is VDEV_ALLOC_SPARE), then we allow corrupted spares to be specified, as long - * as they are well-formed. + * Validate that the auxiliary device array is well formed. We must have an + * array of nvlists, each which describes a valid leaf vdev. If this is an + * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be + * specified, as long as they are well-formed. */ static int -spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) +spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, + spa_aux_vdev_t *sav, const char *config, uint64_t version, + vdev_labeltype_t label) { - nvlist_t **spares; - uint_t i, nspares; + nvlist_t **dev; + uint_t i, ndev; vdev_t *vd; int error; /* - * It's acceptable to have no spares specified. + * It's acceptable to have no devs specified. */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) != 0) + if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); - if (nspares == 0) + if (ndev == 0) return (EINVAL); /* - * Make sure the pool is formatted with a version that supports hot - * spares. + * Make sure the pool is formatted with a version that supports this + * device type. */ - if (spa_version(spa) < SPA_VERSION_SPARES) + if (spa_version(spa) < version) return (ENOTSUP); /* - * Set the pending spare list so we correctly handle device in-use + * Set the pending device list so we correctly handle device in-use * checking. */ - spa->spa_pending_spares = spares; - spa->spa_pending_nspares = nspares; + sav->sav_pending = dev; + sav->sav_npending = ndev; - for (i = 0; i < nspares; i++) { - if ((error = spa_config_parse(spa, &vd, spares[i], NULL, 0, + for (i = 0; i < ndev; i++) { + if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; @@ -1505,29 +1750,127 @@ spa_validate_spares(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) goto out; } + /* + * The L2ARC currently only supports disk devices. 
+ */ + if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && + strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { + error = ENOTBLK; + goto out; + } + vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && - (error = vdev_label_init(vd, crtxg, - VDEV_LABEL_SPARE)) == 0) { - VERIFY(nvlist_add_uint64(spares[i], ZPOOL_CONFIG_GUID, + (error = vdev_label_init(vd, crtxg, label)) == 0) { + VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } vdev_free(vd); - if (error && mode != VDEV_ALLOC_SPARE) + if (error && + (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: - spa->spa_pending_spares = NULL; - spa->spa_pending_nspares = 0; + sav->sav_pending = NULL; + sav->sav_npending = 0; return (error); } +static int +spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) +{ + int error; + + if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, + &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, + VDEV_LABEL_SPARE)) != 0) { + return (error); + } + + return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, + &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, + VDEV_LABEL_L2CACHE)); +} + +static void +spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, + const char *config) +{ + int i; + + if (sav->sav_config != NULL) { + nvlist_t **olddevs; + uint_t oldndevs; + nvlist_t **newdevs; + + /* + * Generate new dev list by concatentating with the + * current dev list. + */ + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, + &olddevs, &oldndevs) == 0); + + newdevs = kmem_alloc(sizeof (void *) * + (ndevs + oldndevs), KM_SLEEP); + for (i = 0; i < oldndevs; i++) + VERIFY(nvlist_dup(olddevs[i], &newdevs[i], + KM_SLEEP) == 0); + for (i = 0; i < ndevs; i++) + VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], + KM_SLEEP) == 0); + + VERIFY(nvlist_remove(sav->sav_config, config, + DATA_TYPE_NVLIST_ARRAY) == 0); + + VERIFY(nvlist_add_nvlist_array(sav->sav_config, + config, newdevs, ndevs + oldndevs) == 0); + for (i = 0; i < oldndevs + ndevs; i++) + nvlist_free(newdevs[i]); + kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); + } else { + /* + * Generate a new dev list. 
+ */ + VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, + devs, ndevs) == 0); + } +} + +/* + * Stop and drop level 2 ARC devices + */ +void +spa_l2cache_drop(spa_t *spa) +{ + vdev_t *vd; + int i; + spa_aux_vdev_t *sav = &spa->spa_l2cache; + + for (i = 0; i < sav->sav_count; i++) { + uint64_t pool; + + vd = sav->sav_vdevs[i]; + ASSERT(vd != NULL); + + if (spa_mode & FWRITE && + spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL) { + l2arc_remove_vdev(vd); + } + if (vd->vdev_isl2cache) + spa_l2cache_remove(vd); + vdev_clear_stats(vd); + (void) vdev_close(vd); + } +} + /* * Pool Creation */ @@ -1542,8 +1885,8 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, dmu_tx_t *tx; int c, error = 0; uint64_t txg = TXG_INITIAL; - nvlist_t **spares; - uint_t nspares; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; uint64_t version; /* @@ -1594,7 +1937,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && - (error = spa_validate_spares(spa, nvroot, txg, + (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { for (c = 0; c < rvd->vdev_children; c++) vdev_init(rvd->vdev_child[c], txg); @@ -1616,14 +1959,29 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { - VERIFY(nvlist_alloc(&spa->spa_sparelist, NV_UNIQUE_NAME, + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, RW_WRITER, FTAG); spa_load_spares(spa); spa_config_exit(spa, FTAG); - spa->spa_sync_spares = B_TRUE; + spa->spa_spares.sav_sync = B_TRUE; + } + + /* + * Get the list of level 2 cache devices, if specified. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, RW_WRITER, FTAG); + spa_load_l2cache(spa); + spa_config_exit(spa, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_dsl_pool = dp = dsl_pool_create(spa, txg); @@ -1717,8 +2075,8 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) char *altroot = NULL; int error; nvlist_t *nvroot; - nvlist_t **spares; - uint_t nspares; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. @@ -1749,18 +2107,24 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) * Toss any existing sparelist, as it doesn't have any validity anymore, * and conflicts with spa_has_spare(). 
*/ - if (spa->spa_sparelist) { - nvlist_free(spa->spa_sparelist); - spa->spa_sparelist = NULL; + if (spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } + if (spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + spa_load_l2cache(spa); + } VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); - if (error == 0) { - error = spa_validate_spares(spa, nvroot, -1ULL, - VDEV_ALLOC_SPARE); - } + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, + VDEV_ALLOC_L2CACHE); spa_config_exit(spa, FTAG); if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { @@ -1772,23 +2136,38 @@ spa_import(const char *pool, nvlist_t *config, nvlist_t *props) } /* - * Override any spares as specified by the user, as these may have - * correct device names/devids, etc. + * Override any spares and level 2 cache devices as specified by + * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { - if (spa->spa_sparelist) - VERIFY(nvlist_remove(spa->spa_sparelist, + if (spa->spa_spares.sav_config) + VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); else - VERIFY(nvlist_alloc(&spa->spa_sparelist, + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, RW_WRITER, FTAG); spa_load_spares(spa); spa_config_exit(spa, FTAG); - spa->spa_sync_spares = B_TRUE; + spa->spa_spares.sav_sync = B_TRUE; + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + if (spa->spa_l2cache.sav_config) + VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, RW_WRITER, FTAG); + spa_load_l2cache(spa); + spa_config_exit(spa, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; } /* @@ -1857,9 +2236,10 @@ spa_tryimport(nvlist_t *tryconfig) spa->spa_uberblock.ub_timestamp) == 0); /* - * Add the list of hot spares. + * Add the list of hot spares and level 2 cache devices. 
*/ spa_add_spares(spa, config); + spa_add_l2cache(spa, config); } spa_unload(spa); @@ -2014,8 +2394,8 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) int c, error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; - nvlist_t **spares; - uint_t i, nspares; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; txg = spa_vdev_enter(spa); @@ -2025,11 +2405,15 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) spa->spa_pending_vdev = vd; - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) != 0) + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, + &nspares) != 0) nspares = 0; - if (vd->vdev_children == 0 && nspares == 0) { + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, + &nl2cache) != 0) + nl2cache = 0; + + if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) { spa->spa_pending_vdev = NULL; return (spa_vdev_exit(spa, vd, txg, EINVAL)); } @@ -2042,11 +2426,10 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } /* - * We must validate the spares after checking the children. Otherwise, - * vdev_inuse() will blindly overwrite the spare. + * We must validate the spares and l2cache devices after checking the + * children. Otherwise, vdev_inuse() will blindly overwrite the spare. */ - if ((error = spa_validate_spares(spa, nvroot, txg, - VDEV_ALLOC_ADD)) != 0) { + if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) { spa->spa_pending_vdev = NULL; return (spa_vdev_exit(spa, vd, txg, error)); } @@ -2065,43 +2448,17 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) } if (nspares != 0) { - if (spa->spa_sparelist != NULL) { - nvlist_t **oldspares; - uint_t oldnspares; - nvlist_t **newspares; - - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, &oldspares, &oldnspares) == 0); - - newspares = kmem_alloc(sizeof (void *) * - (nspares + oldnspares), KM_SLEEP); - for (i = 0; i < oldnspares; i++) - VERIFY(nvlist_dup(oldspares[i], - &newspares[i], KM_SLEEP) == 0); - for (i = 0; i < nspares; i++) - VERIFY(nvlist_dup(spares[i], - &newspares[i + oldnspares], - KM_SLEEP) == 0); - - VERIFY(nvlist_remove(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, newspares, - nspares + oldnspares) == 0); - for (i = 0; i < oldnspares + nspares; i++) - nvlist_free(newspares[i]); - kmem_free(newspares, (oldnspares + nspares) * - sizeof (void *)); - } else { - VERIFY(nvlist_alloc(&spa->spa_sparelist, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - } - + spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, + ZPOOL_CONFIG_SPARES); spa_load_spares(spa); - spa->spa_sync_spares = B_TRUE; + spa->spa_spares.sav_sync = B_TRUE; + } + + if (nl2cache != 0) { + spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, + ZPOOL_CONFIG_L2CACHE); + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; } /* @@ -2511,55 +2868,38 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) } /* - * Remove a device from the pool. Currently, this supports removing only hot - * spares. + * Remove a spares vdev from the nvlist config. 
*/ -int -spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) +static int +spa_remove_spares(spa_aux_vdev_t *sav, uint64_t guid, boolean_t unspare, + nvlist_t **spares, int nspares, vdev_t *vd) { - vdev_t *vd; - nvlist_t **spares, *nv, **newspares; - uint_t i, j, nspares; - int ret = 0; - - spa_config_enter(spa, RW_WRITER, FTAG); - - vd = spa_lookup_by_guid(spa, guid); + nvlist_t *nv, **newspares; + int i, j; nv = NULL; - if (spa->spa_spares != NULL && - nvlist_lookup_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - for (i = 0; i < nspares; i++) { - uint64_t theguid; + for (i = 0; i < nspares; i++) { + uint64_t theguid; - VERIFY(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &theguid) == 0); - if (theguid == guid) { - nv = spares[i]; - break; - } + VERIFY(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &theguid) == 0); + if (theguid == guid) { + nv = spares[i]; + break; } } /* - * We only support removing a hot spare, and only if it's not currently - * in use in this pool. + * Only remove the hot spare if it's not currently in use in this pool. */ - if (nv == NULL && vd == NULL) { - ret = ENOENT; - goto out; - } + if (nv == NULL && vd == NULL) + return (ENOENT); - if (nv == NULL && vd != NULL) { - ret = ENOTSUP; - goto out; - } + if (nv == NULL && vd != NULL) + return (ENOTSUP); - if (!unspare && nv != NULL && vd != NULL) { - ret = EBUSY; - goto out; - } + if (!unspare && nv != NULL && vd != NULL) + return (EBUSY); if (nspares == 1) { newspares = NULL; @@ -2573,20 +2913,119 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) } } - VERIFY(nvlist_remove(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, + VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_sparelist, ZPOOL_CONFIG_SPARES, - newspares, nspares - 1) == 0); + VERIFY(nvlist_add_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_SPARES, newspares, nspares - 1) == 0); for (i = 0; i < nspares - 1; i++) nvlist_free(newspares[i]); kmem_free(newspares, (nspares - 1) * sizeof (void *)); - spa_load_spares(spa); - spa->spa_sync_spares = B_TRUE; + + return (0); +} + +/* + * Remove an l2cache vdev from the nvlist config. + */ +static int +spa_remove_l2cache(spa_aux_vdev_t *sav, uint64_t guid, nvlist_t **l2cache, + int nl2cache, vdev_t *vd) +{ + nvlist_t *nv, **newl2cache; + int i, j; + + nv = NULL; + for (i = 0; i < nl2cache; i++) { + uint64_t theguid; + + VERIFY(nvlist_lookup_uint64(l2cache[i], + ZPOOL_CONFIG_GUID, &theguid) == 0); + if (theguid == guid) { + nv = l2cache[i]; + break; + } + } + + if (vd == NULL) { + for (i = 0; i < nl2cache; i++) { + if (sav->sav_vdevs[i]->vdev_guid == guid) { + vd = sav->sav_vdevs[i]; + break; + } + } + } + + if (nv == NULL && vd == NULL) + return (ENOENT); + + if (nv == NULL && vd != NULL) + return (ENOTSUP); + + if (nl2cache == 1) { + newl2cache = NULL; + } else { + newl2cache = kmem_alloc((nl2cache - 1) * sizeof (void *), + KM_SLEEP); + for (i = 0, j = 0; i < nl2cache; i++) { + if (l2cache[i] != nv) + VERIFY(nvlist_dup(l2cache[i], + &newl2cache[j++], KM_SLEEP) == 0); + } + } + + VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, + DATA_TYPE_NVLIST_ARRAY) == 0); + VERIFY(nvlist_add_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, newl2cache, nl2cache - 1) == 0); + for (i = 0; i < nl2cache - 1; i++) + nvlist_free(newl2cache[i]); + kmem_free(newl2cache, (nl2cache - 1) * sizeof (void *)); + + return (0); +} + +/* + * Remove a device from the pool. 
Currently, this supports removing only hot + * spares and level 2 ARC devices. + */ +int +spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) +{ + vdev_t *vd; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + int error = 0; + + spa_config_enter(spa, RW_WRITER, FTAG); + + vd = spa_lookup_by_guid(spa, guid); + + if (spa->spa_spares.sav_vdevs != NULL && + spa_spare_exists(guid, NULL) && + nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { + if ((error = spa_remove_spares(&spa->spa_spares, guid, unspare, + spares, nspares, vd)) != 0) + goto out; + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + goto out; + } + + if (spa->spa_l2cache.sav_vdevs != NULL && + spa_l2cache_exists(guid, NULL) && + nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { + if ((error = spa_remove_l2cache(&spa->spa_l2cache, guid, + l2cache, nl2cache, vd)) != 0) + goto out; + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; + } out: spa_config_exit(spa, FTAG); - - return (ret); + return (error); } /* @@ -2693,33 +3132,52 @@ spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) if ((vd = vdev_lookup_by_guid(rvd, guid)) == NULL) { /* - * Determine if this is a reference to a hot spare. In that - * case, update the path as stored in the spare list. + * Determine if this is a reference to a hot spare or l2cache + * device. If it is, update the path as stored in their + * device list. */ - nvlist_t **spares; - uint_t i, nspares; - if (spa->spa_sparelist != NULL) { - VERIFY(nvlist_lookup_nvlist_array(spa->spa_sparelist, - ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + nvlist_t **spares, **l2cache; + uint_t i, nspares, nl2cache; + + if (spa->spa_spares.sav_config != NULL) { + VERIFY(nvlist_lookup_nvlist_array( + spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0); for (i = 0; i < nspares; i++) { uint64_t theguid; VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &theguid) == 0); - if (theguid == guid) - break; + if (theguid == guid) { + VERIFY(nvlist_add_string(spares[i], + ZPOOL_CONFIG_PATH, newpath) == 0); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + return (spa_vdev_exit(spa, NULL, txg, + 0)); + } } + } - if (i == nspares) - return (spa_vdev_exit(spa, NULL, txg, ENOENT)); - - VERIFY(nvlist_add_string(spares[i], - ZPOOL_CONFIG_PATH, newpath) == 0); - spa_load_spares(spa); - spa->spa_sync_spares = B_TRUE; - return (spa_vdev_exit(spa, NULL, txg, 0)); - } else { - return (spa_vdev_exit(spa, NULL, txg, ENOENT)); + if (spa->spa_l2cache.sav_config != NULL) { + VERIFY(nvlist_lookup_nvlist_array( + spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0); + for (i = 0; i < nl2cache; i++) { + uint64_t theguid; + VERIFY(nvlist_lookup_uint64(l2cache[i], + ZPOOL_CONFIG_GUID, &theguid) == 0); + if (theguid == guid) { + VERIFY(nvlist_add_string(l2cache[i], + ZPOOL_CONFIG_PATH, newpath) == 0); + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; + return (spa_vdev_exit(spa, NULL, txg, + 0)); + } + } } + + return (spa_vdev_exit(spa, NULL, txg, ENOENT)); } if (!vd->vdev_ops->vdev_op_leaf) @@ -3338,50 +3796,49 @@ spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) } static void -spa_sync_spares(spa_t *spa, dmu_tx_t *tx) +spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, + const char *config, const char *entry) { nvlist_t *nvroot; - nvlist_t **spares; + nvlist_t 
**list; int i; - if (!spa->spa_sync_spares) + if (!sav->sav_sync) return; /* - * Update the MOS nvlist describing the list of available spares. - * spa_validate_spares() will have already made sure this nvlist is + * Update the MOS nvlist describing the list of available devices. + * spa_validate_aux() will have already made sure this nvlist is * valid and the vdevs are labeled appropriately. */ - if (spa->spa_spares_object == 0) { - spa->spa_spares_object = dmu_object_alloc(spa->spa_meta_objset, - DMU_OT_PACKED_NVLIST, 1 << 14, - DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); + if (sav->sav_object == 0) { + sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, + sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SPARES, - sizeof (uint64_t), 1, &spa->spa_spares_object, tx) == 0); + DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, + &sav->sav_object, tx) == 0); } VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (spa->spa_nspares == 0) { - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - NULL, 0) == 0); + if (sav->sav_count == 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); } else { - spares = kmem_alloc(spa->spa_nspares * sizeof (void *), - KM_SLEEP); - for (i = 0; i < spa->spa_nspares; i++) - spares[i] = vdev_config_generate(spa, - spa->spa_spares[i], B_FALSE, B_TRUE); - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - spares, spa->spa_nspares) == 0); - for (i = 0; i < spa->spa_nspares; i++) - nvlist_free(spares[i]); - kmem_free(spares, spa->spa_nspares * sizeof (void *)); + list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); + for (i = 0; i < sav->sav_count; i++) + list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], + B_FALSE, B_FALSE, B_TRUE); + VERIFY(nvlist_add_nvlist_array(nvroot, config, list, + sav->sav_count) == 0); + for (i = 0; i < sav->sav_count; i++) + nvlist_free(list[i]); + kmem_free(list, sav->sav_count * sizeof (void *)); } - spa_sync_nvlist(spa, spa->spa_spares_object, nvroot, tx); + spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); - spa->spa_sync_spares = B_FALSE; + sav->sav_sync = B_FALSE; } static void @@ -3606,7 +4063,10 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_sync_pass++; spa_sync_config_object(spa, tx); - spa_sync_spares(spa, tx); + spa_sync_aux_dev(spa, &spa->spa_spares, tx, + ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); + spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, + ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); @@ -3806,15 +4266,15 @@ spa_has_spare(spa_t *spa, uint64_t guid) { int i; uint64_t spareguid; + spa_aux_vdev_t *sav = &spa->spa_spares; - for (i = 0; i < spa->spa_nspares; i++) - if (spa->spa_spares[i]->vdev_guid == guid) + for (i = 0; i < sav->sav_count; i++) + if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); - for (i = 0; i < spa->spa_pending_nspares; i++) { - if (nvlist_lookup_uint64(spa->spa_pending_spares[i], - ZPOOL_CONFIG_GUID, &spareguid) == 0 && - spareguid == guid) + for (i = 0; i < sav->sav_npending; i++) { + if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, + &spareguid) == 0 && spareguid == guid) return (B_TRUE); } diff --git a/usr/src/uts/common/fs/zfs/spa_config.c b/usr/src/uts/common/fs/zfs/spa_config.c index 437ef3b0e7..17978ccc25 100644 --- a/usr/src/uts/common/fs/zfs/spa_config.c +++ b/usr/src/uts/common/fs/zfs/spa_config.c @@ -422,7 +422,7 @@ 
spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) vd = vd->vdev_top; /* label contains top config */ } - nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE); + nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index fd72f815e2..6aefb025fc 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -178,6 +178,8 @@ int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; +static kmutex_t spa_l2cache_lock; +static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; int spa_mode; @@ -406,11 +408,108 @@ spa_refcount_zero(spa_t *spa) /* * ========================================================================== - * SPA spare tracking + * SPA spare and l2cache tracking * ========================================================================== */ /* + * Hot spares and cache devices are tracked using the same code below, + * for 'auxiliary' devices. + */ + +typedef struct spa_aux { + uint64_t aux_guid; + uint64_t aux_pool; + avl_node_t aux_avl; + int aux_count; +} spa_aux_t; + +static int +spa_aux_compare(const void *a, const void *b) +{ + const spa_aux_t *sa = a; + const spa_aux_t *sb = b; + + if (sa->aux_guid < sb->aux_guid) + return (-1); + else if (sa->aux_guid > sb->aux_guid) + return (1); + else + return (0); +} + +void +spa_aux_add(vdev_t *vd, avl_tree_t *avl) +{ + avl_index_t where; + spa_aux_t search; + spa_aux_t *aux; + + search.aux_guid = vd->vdev_guid; + if ((aux = avl_find(avl, &search, &where)) != NULL) { + aux->aux_count++; + } else { + aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); + aux->aux_guid = vd->vdev_guid; + aux->aux_count = 1; + avl_insert(avl, aux, where); + } +} + +void +spa_aux_remove(vdev_t *vd, avl_tree_t *avl) +{ + spa_aux_t search; + spa_aux_t *aux; + avl_index_t where; + + search.aux_guid = vd->vdev_guid; + aux = avl_find(avl, &search, &where); + + ASSERT(aux != NULL); + + if (--aux->aux_count == 0) { + avl_remove(avl, aux); + kmem_free(aux, sizeof (spa_aux_t)); + } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { + aux->aux_pool = 0ULL; + } +} + +boolean_t +spa_aux_exists(uint64_t guid, uint64_t *pool, avl_tree_t *avl) +{ + spa_aux_t search, *found; + avl_index_t where; + + search.aux_guid = guid; + found = avl_find(avl, &search, &where); + + if (pool) { + if (found) + *pool = found->aux_pool; + else + *pool = 0ULL; + } + + return (found != NULL); +} + +void +spa_aux_activate(vdev_t *vd, avl_tree_t *avl) +{ + spa_aux_t search, *found; + avl_index_t where; + + search.aux_guid = vd->vdev_guid; + found = avl_find(avl, &search, &where); + ASSERT(found != NULL); + ASSERT(found->aux_pool == 0ULL); + + found->aux_pool = spa_guid(vd->vdev_spa); +} + +/* * Spares are tracked globally due to the following constraints: * * - A spare may be part of multiple pools. @@ -432,73 +531,28 @@ spa_refcount_zero(spa_t *spa) * be completely consistent with respect to other vdev configuration changes. 
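/*
 * The spa_aux_* helpers above keep one global registry per auxiliary device
 * class (spares, l2cache), keyed by vdev GUID and reference counted so a
 * single spare can belong to several pools at once.  The sketch below mirrors
 * the add/remove/lookup logic over a small fixed array instead of an AVL
 * tree; it is a simplified illustration, not the kernel implementation.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define	MAX_AUX	8

typedef struct aux_entry {
	uint64_t ae_guid;		/* device GUID */
	uint64_t ae_pool;		/* pool currently using it, or 0 */
	int ae_count;			/* how many pools reference it */
} aux_entry_t;

static aux_entry_t registry[MAX_AUX];

static aux_entry_t *
aux_find(uint64_t guid)
{
	for (int i = 0; i < MAX_AUX; i++) {
		if (registry[i].ae_count != 0 && registry[i].ae_guid == guid)
			return (&registry[i]);
	}
	return (NULL);
}

static void
aux_add(uint64_t guid)
{
	aux_entry_t *ae = aux_find(guid);

	if (ae != NULL) {
		ae->ae_count++;		/* known device: another pool refs it */
		return;
	}
	for (int i = 0; i < MAX_AUX; i++) {
		if (registry[i].ae_count == 0) {
			registry[i].ae_guid = guid;
			registry[i].ae_pool = 0;
			registry[i].ae_count = 1;
			return;
		}
	}
	assert(!"registry full");
}

static void
aux_remove(uint64_t guid)
{
	aux_entry_t *ae = aux_find(guid);

	assert(ae != NULL);
	if (--ae->ae_count == 0)	/* last reference: drop the entry */
		(void) memset(ae, 0, sizeof (*ae));
}

int
main(void)
{
	aux_add(0x1234);		/* first pool registers the device */
	aux_add(0x1234);		/* second pool shares the same spare */
	aux_remove(0x1234);
	(void) printf("still registered: %s\n",
	    aux_find(0x1234) != NULL ? "yes" : "no");
	return (0);
}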
*/ -typedef struct spa_spare { - uint64_t spare_guid; - uint64_t spare_pool; - avl_node_t spare_avl; - int spare_count; -} spa_spare_t; - static int spa_spare_compare(const void *a, const void *b) { - const spa_spare_t *sa = a; - const spa_spare_t *sb = b; - - if (sa->spare_guid < sb->spare_guid) - return (-1); - else if (sa->spare_guid > sb->spare_guid) - return (1); - else - return (0); + return (spa_aux_compare(a, b)); } void spa_spare_add(vdev_t *vd) { - avl_index_t where; - spa_spare_t search; - spa_spare_t *spare; - mutex_enter(&spa_spare_lock); ASSERT(!vd->vdev_isspare); - - search.spare_guid = vd->vdev_guid; - if ((spare = avl_find(&spa_spare_avl, &search, &where)) != NULL) { - spare->spare_count++; - } else { - spare = kmem_zalloc(sizeof (spa_spare_t), KM_SLEEP); - spare->spare_guid = vd->vdev_guid; - spare->spare_count = 1; - avl_insert(&spa_spare_avl, spare, where); - } + spa_aux_add(vd, &spa_spare_avl); vd->vdev_isspare = B_TRUE; - mutex_exit(&spa_spare_lock); } void spa_spare_remove(vdev_t *vd) { - spa_spare_t search; - spa_spare_t *spare; - avl_index_t where; - mutex_enter(&spa_spare_lock); - - search.spare_guid = vd->vdev_guid; - spare = avl_find(&spa_spare_avl, &search, &where); - ASSERT(vd->vdev_isspare); - ASSERT(spare != NULL); - - if (--spare->spare_count == 0) { - avl_remove(&spa_spare_avl, spare); - kmem_free(spare, sizeof (spa_spare_t)); - } else if (spare->spare_pool == spa_guid(vd->vdev_spa)) { - spare->spare_pool = 0ULL; - } - + spa_aux_remove(vd, &spa_spare_avl); vd->vdev_isspare = B_FALSE; mutex_exit(&spa_spare_lock); } @@ -506,42 +560,81 @@ spa_spare_remove(vdev_t *vd) boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool) { - spa_spare_t search, *found; - avl_index_t where; + boolean_t found; mutex_enter(&spa_spare_lock); - - search.spare_guid = guid; - found = avl_find(&spa_spare_avl, &search, &where); - - if (pool) { - if (found) - *pool = found->spare_pool; - else - *pool = 0ULL; - } - + found = spa_aux_exists(guid, pool, &spa_spare_avl); mutex_exit(&spa_spare_lock); - return (found != NULL); + return (found); } void spa_spare_activate(vdev_t *vd) { - spa_spare_t search, *found; - avl_index_t where; - mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); + spa_aux_activate(vd, &spa_spare_avl); + mutex_exit(&spa_spare_lock); +} - search.spare_guid = vd->vdev_guid; - found = avl_find(&spa_spare_avl, &search, &where); - ASSERT(found != NULL); - ASSERT(found->spare_pool == 0ULL); +/* + * Level 2 ARC devices are tracked globally for the same reasons as spares. + * Cache devices currently only support one pool per cache device, and so + * for these devices the aux reference count is currently unused beyond 1. 
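/*
 * After the refactor above, spa_spare_add/remove/exists/activate are thin
 * wrappers: take spa_spare_lock, assert or flip vdev_isspare, and delegate to
 * the shared spa_aux_* code; the spa_l2cache_* functions in the next hunk
 * repeat the same shape with their own lock and tree.  A minimal pthread
 * sketch of that wrapper pattern (fake_vdev_t, aux_add() and spare_add() are
 * hypothetical stand-ins):
 */
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

static pthread_mutex_t spare_lock = PTHREAD_MUTEX_INITIALIZER;

typedef struct fake_vdev {
	uint64_t guid;
	bool isspare;
} fake_vdev_t;

static void
aux_add(uint64_t guid)
{
	(void) guid;			/* stand-in for the shared registry */
}

static void
spare_add(fake_vdev_t *vd)
{
	(void) pthread_mutex_lock(&spare_lock);	/* class-specific lock */
	aux_add(vd->guid);			/* class-agnostic bookkeeping */
	vd->isspare = true;			/* class-specific flag */
	(void) pthread_mutex_unlock(&spare_lock);
}

int
main(void)
{
	fake_vdev_t vd = { 0x1234, false };

	spare_add(&vd);
	return (vd.isspare ? 0 : 1);
}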
+ */ - found->spare_pool = spa_guid(vd->vdev_spa); - mutex_exit(&spa_spare_lock); +static int +spa_l2cache_compare(const void *a, const void *b) +{ + return (spa_aux_compare(a, b)); +} + +void +spa_l2cache_add(vdev_t *vd) +{ + mutex_enter(&spa_l2cache_lock); + ASSERT(!vd->vdev_isl2cache); + spa_aux_add(vd, &spa_l2cache_avl); + vd->vdev_isl2cache = B_TRUE; + mutex_exit(&spa_l2cache_lock); +} + +void +spa_l2cache_remove(vdev_t *vd) +{ + mutex_enter(&spa_l2cache_lock); + ASSERT(vd->vdev_isl2cache); + spa_aux_remove(vd, &spa_l2cache_avl); + vd->vdev_isl2cache = B_FALSE; + mutex_exit(&spa_l2cache_lock); +} + +boolean_t +spa_l2cache_exists(uint64_t guid, uint64_t *pool) +{ + boolean_t found; + + mutex_enter(&spa_l2cache_lock); + found = spa_aux_exists(guid, pool, &spa_l2cache_avl); + mutex_exit(&spa_l2cache_lock); + + return (found); +} + +void +spa_l2cache_activate(vdev_t *vd) +{ + mutex_enter(&spa_l2cache_lock); + ASSERT(vd->vdev_isl2cache); + spa_aux_activate(vd, &spa_l2cache_avl); + mutex_exit(&spa_l2cache_lock); +} + +void +spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc) +{ + vdev_space_update(vd, space, alloc, B_FALSE); } /* @@ -1078,13 +1171,17 @@ spa_init(int mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), offsetof(spa_t, spa_avl)); - avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_spare_t), - offsetof(spa_spare_t, spare_avl)); + avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), + offsetof(spa_aux_t, aux_avl)); + + avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), + offsetof(spa_aux_t, aux_avl)); spa_mode = mode; @@ -1111,10 +1208,12 @@ spa_fini(void) avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); + avl_destroy(&spa_l2cache_avl); cv_destroy(&spa_namespace_cv); mutex_destroy(&spa_namespace_lock); mutex_destroy(&spa_spare_lock); + mutex_destroy(&spa_l2cache_lock); } /* diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index f6607060b0..aa3932690f 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -36,6 +36,7 @@ extern "C" { #include <sys/zio.h> #include <sys/dmu.h> +#include <sys/spa.h> typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; @@ -106,6 +107,15 @@ int arc_tempreserve_space(uint64_t tempreserve); void arc_init(void); void arc_fini(void); +/* + * Level 2 ARC + */ + +void l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end); +void l2arc_remove_vdev(vdev_t *vd); +void l2arc_init(void); +void l2arc_fini(void); + #ifdef __cplusplus } #endif diff --git a/usr/src/uts/common/fs/zfs/sys/dmu.h b/usr/src/uts/common/fs/zfs/sys/dmu.h index 0a24df2129..f59aadf50a 100644 --- a/usr/src/uts/common/fs/zfs/sys/dmu.h +++ b/usr/src/uts/common/fs/zfs/sys/dmu.h @@ -200,6 +200,7 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" +#define DMU_POOL_L2CACHE "l2cache" /* * Allocate an object from this objset. 
The range of object numbers diff --git a/usr/src/uts/common/fs/zfs/sys/spa.h b/usr/src/uts/common/fs/zfs/sys/spa.h index 82798cf07f..dd723df0c9 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa.h +++ b/usr/src/uts/common/fs/zfs/sys/spa.h @@ -47,6 +47,7 @@ typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; typedef struct zilog zilog_t; typedef struct traverse_handle traverse_handle_t; +typedef struct spa_aux_vdev spa_aux_vdev_t; struct dsl_pool; /* @@ -356,6 +357,14 @@ extern void spa_spare_remove(vdev_t *vd); extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool); extern void spa_spare_activate(vdev_t *vd); +/* L2ARC state (which is global across all pools) */ +extern void spa_l2cache_add(vdev_t *vd); +extern void spa_l2cache_remove(vdev_t *vd); +extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); +extern void spa_l2cache_activate(vdev_t *vd); +extern void spa_l2cache_drop(spa_t *spa); +extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc); + /* scrubbing */ extern int spa_scrub(spa_t *spa, pool_scrub_type_t type, boolean_t force); extern void spa_scrub_suspend(spa_t *spa); diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index 6c4559fce2..eb2b6d6289 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -58,6 +58,16 @@ typedef struct spa_history_phys { uint64_t sh_records_lost; /* num of records overwritten */ } spa_history_phys_t; +struct spa_aux_vdev { + uint64_t sav_object; /* MOS object for device list */ + nvlist_t *sav_config; /* cached device config */ + vdev_t **sav_vdevs; /* devices */ + int sav_count; /* number devices */ + boolean_t sav_sync; /* sync the device list */ + nvlist_t **sav_pending; /* pending device additions */ + uint_t sav_npending; /* # pending devices */ +}; + struct spa { /* * Fields protected by spa_namespace_lock. 
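/*
 * The spa_aux_vdev struct declared above collapses what used to be five
 * spare-specific spa_t fields (object, cached config, vdev array, count, sync
 * flag) into one reusable structure; the struct spa hunk that follows embeds
 * it twice, once for hot spares and once for L2ARC cache devices, so shared
 * routines just take a pointer to whichever instance applies.  Small
 * compile-and-run sketch of that layout (members abbreviated; has_vdev() is
 * hypothetical):
 */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef struct aux_vdev {
	uint64_t sav_object;		/* MOS object for device list */
	void **sav_vdevs;		/* devices */
	int sav_count;			/* number of devices */
	bool sav_sync;			/* sync the device list */
} aux_vdev_t;

typedef struct pool {
	aux_vdev_t spares;		/* hot spares */
	aux_vdev_t l2cache;		/* L2ARC cache devices */
} pool_t;

/* one helper serves both device classes */
static bool
has_vdev(const aux_vdev_t *sav, const void *vd)
{
	for (int i = 0; i < sav->sav_count; i++) {
		if (sav->sav_vdevs[i] == vd)
			return (true);
	}
	return (false);
}

int
main(void)
{
	pool_t p = { { 0, NULL, 0, false }, { 0, NULL, 0, false } };

	/* same code path, different instance */
	return (has_vdev(&p.spares, NULL) || has_vdev(&p.l2cache, NULL));
}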
@@ -87,11 +97,8 @@ struct spa { vdev_t *spa_root_vdev; /* top-level vdev container */ uint64_t spa_load_guid; /* initial guid for spa_load */ list_t spa_dirty_list; /* vdevs with dirty labels */ - uint64_t spa_spares_object; /* MOS object for spare list */ - nvlist_t *spa_sparelist; /* cached spare config */ - vdev_t **spa_spares; /* available hot spares */ - int spa_nspares; /* number of hot spares */ - boolean_t spa_sync_spares; /* sync the spares list */ + spa_aux_vdev_t spa_spares; /* hot spares */ + spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_syncing_txg; /* txg currently syncing */ uint64_t spa_sync_bplist_obj; /* object for deferred frees */ @@ -134,8 +141,6 @@ struct spa { uint64_t spa_history; /* history object */ kmutex_t spa_history_lock; /* history lock */ vdev_t *spa_pending_vdev; /* pending vdev additions */ - nvlist_t **spa_pending_spares; /* pending spare additions */ - uint_t spa_pending_nspares; /* # pending spares */ kmutex_t spa_props_lock; /* property lock */ uint64_t spa_pool_props_object; /* object for properties */ uint64_t spa_bootfs; /* default boot filesystem */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index dced3da5ff..b1ec648056 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -53,7 +53,7 @@ extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); extern void vdev_init(vdev_t *, uint64_t txg); extern void vdev_reopen(vdev_t *); -extern int vdev_validate_spare(vdev_t *); +extern int vdev_validate_aux(vdev_t *vd); extern int vdev_probe(vdev_t *); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); @@ -69,6 +69,7 @@ extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); +extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio); extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete); @@ -78,7 +79,7 @@ extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); extern void vdev_space_update(vdev_t *vd, int64_t space_delta, - int64_t alloc_delta); + int64_t alloc_delta, boolean_t update_root); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); @@ -113,7 +114,7 @@ extern void vdev_config_clean(vdev_t *vd); extern int vdev_config_sync(vdev_t *vd, uint64_t txg); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, - boolean_t getstats, boolean_t isspare); + boolean_t getstats, boolean_t isspare, boolean_t isl2cache); /* * Label routines @@ -127,7 +128,8 @@ typedef enum { VDEV_LABEL_CREATE, /* create/add a new device */ VDEV_LABEL_REPLACE, /* replace an existing device */ VDEV_LABEL_SPARE, /* add a new hot spare */ - VDEV_LABEL_REMOVE /* remove an existing device */ + VDEV_LABEL_REMOVE, /* remove an existing device */ + VDEV_LABEL_L2CACHE /* add an L2ARC cache device */ } vdev_labeltype_t; extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 6fa21e83b0..2eebbba566 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -168,6 +168,7 @@ struct vdev { uint8_t vdev_tmpoffline; /* device taken offline temporarily? 
*/ uint8_t vdev_detached; /* device detached? */ uint64_t vdev_isspare; /* was a hot spare */ + uint64_t vdev_isl2cache; /* was a l2cache device */ vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ vdev_cache_t vdev_cache; /* physical block cache */ uint64_t vdev_not_present; /* not present during import */ @@ -249,6 +250,7 @@ typedef struct vdev_label { #define VDEV_ALLOC_LOAD 0 #define VDEV_ALLOC_ADD 1 #define VDEV_ALLOC_SPARE 2 +#define VDEV_ALLOC_L2CACHE 3 /* * Allocate or free a vdev diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index d2949884af..4591274518 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -304,11 +304,13 @@ extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, int flags, + boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, int flags, + boolean_t labels); extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, uint64_t txg); diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index e7810dd340..2a2dc1d625 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -363,6 +363,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } else if (alloctype == VDEV_ALLOC_SPARE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (EINVAL); + } else if (alloctype == VDEV_ALLOC_L2CACHE) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (EINVAL); } /* @@ -550,6 +553,8 @@ vdev_free(vdev_t *vd) if (vd->vdev_isspare) spa_spare_remove(vd); + if (vd->vdev_isl2cache) + spa_l2cache_remove(vd); txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); @@ -1367,14 +1372,14 @@ vdev_load(vdev_t *vd) } /* - * This special case of vdev_spare() is used for hot spares. It's sole purpose - * it to set the vdev state for the associated vdev. To do this, we make sure - * that we can open the underlying device, then try to read the label, and make - * sure that the label is sane and that it hasn't been repurposed to another - * pool. + * The special vdev case is used for hot spares and l2cache devices. Its + * sole purpose it to set the vdev state for the associated vdev. To do this, + * we make sure that we can open the underlying device, then try to read the + * label, and make sure that the label is sane and that it hasn't been + * repurposed to another pool. */ int -vdev_validate_spare(vdev_t *vd) +vdev_validate_aux(vdev_t *vd) { nvlist_t *label; uint64_t guid, version; @@ -1397,8 +1402,6 @@ vdev_validate_spare(vdev_t *vd) return (-1); } - spa_spare_add(vd); - /* * We don't actually check the pool state here. If it's in fact in * use by another pool, we update this fact on the fly when requested. 
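/*
 * vdev_validate_spare() becomes vdev_validate_aux() in the hunk above: it
 * only verifies that the device opens and carries a sane label (version and
 * guid), and no longer calls spa_spare_add() itself -- the caller now decides
 * whether the device is a hot spare or an l2cache device and registers it
 * accordingly (see vdev_label_init() further below).  Schematic user-space
 * sketch of that split; label_t and validate_aux() here are illustrative
 * stand-ins:
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct label {
	uint64_t version;
	uint64_t guid;
} label_t;

/* validation is class-agnostic: it neither knows nor cares what the device is */
static bool
validate_aux(const label_t *label, uint64_t expected_guid, uint64_t max_version)
{
	return (label != NULL && label->version <= max_version &&
	    label->guid == expected_guid);
}

int
main(void)
{
	label_t l = { 10, 0xabcd };

	if (validate_aux(&l, 0xabcd, 10)) {
		/* caller-side decision: register as a spare or as l2cache */
		(void) printf("label ok: register with the appropriate class\n");
	}
	return (0);
}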
@@ -1855,6 +1858,16 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) } void +vdev_clear_stats(vdev_t *vd) +{ + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_space = 0; + vd->vdev_stat.vs_dspace = 0; + vd->vdev_stat.vs_alloc = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +void vdev_stat_update(zio_t *zio) { vdev_t *vd = zio->io_vd; @@ -1952,15 +1965,14 @@ vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) * Update the in-core space usage stats for this vdev and the root vdev. */ void -vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta) +vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, + boolean_t update_root) { int64_t dspace_delta = space_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; ASSERT(vd == vd->vdev_top); - ASSERT(rvd == vd->vdev_parent); - ASSERT(vd->vdev_ms_count != 0); /* * Apply the inverse of the psize-to-asize (ie. RAID-Z) space-expansion @@ -1978,18 +1990,23 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta) vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); - /* - * Don't count non-normal (e.g. intent log) space as part of - * the pool's capacity. - */ - if (vd->vdev_mg->mg_class != spa->spa_normal_class) - return; + if (update_root) { + ASSERT(rvd == vd->vdev_parent); + ASSERT(vd->vdev_ms_count != 0); + + /* + * Don't count non-normal (e.g. intent log) space as part of + * the pool's capacity. + */ + if (vd->vdev_mg->mg_class != spa->spa_normal_class) + return; - mutex_enter(&rvd->vdev_stat_lock); - rvd->vdev_stat.vs_space += space_delta; - rvd->vdev_stat.vs_alloc += alloc_delta; - rvd->vdev_stat.vs_dspace += dspace_delta; - mutex_exit(&rvd->vdev_stat_lock); + mutex_enter(&rvd->vdev_stat_lock); + rvd->vdev_stat.vs_space += space_delta; + rvd->vdev_stat.vs_alloc += alloc_delta; + rvd->vdev_stat.vs_dspace += dspace_delta; + mutex_exit(&rvd->vdev_stat_lock); + } } /* diff --git a/usr/src/uts/common/fs/zfs/vdev_label.c b/usr/src/uts/common/fs/zfs/vdev_label.c index 070444a093..b6688ae69d 100644 --- a/usr/src/uts/common/fs/zfs/vdev_label.c +++ b/usr/src/uts/common/fs/zfs/vdev_label.c @@ -169,7 +169,8 @@ vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE)); + ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + B_TRUE)); } static void @@ -181,7 +182,8 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, zio_nowait(zio_write_phys(zio, vd, vdev_label_offset(vd->vdev_psize, l, offset), size, buf, ZIO_CHECKSUM_LABEL, done, private, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL)); + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CONFIG_HELD | ZIO_FLAG_CANFAIL, + B_TRUE)); } /* @@ -189,7 +191,7 @@ vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset, */ nvlist_t * vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, - boolean_t isspare) + boolean_t isspare, boolean_t isl2cache) { nvlist_t *nv = NULL; @@ -197,7 +199,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type) == 0); - if (!isspare) + if (!isspare && !isl2cache) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); @@ -245,7 +247,7 @@ 
vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_isspare) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1) == 0); - if (!isspare && vd == vd->vdev_top) { + if (!isspare && !isl2cache && vd == vd->vdev_top) { VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, vd->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, @@ -278,7 +280,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, for (c = 0; c < vd->vdev_children; c++) child[c] = vdev_config_generate(spa, vd->vdev_child[c], - getstats, isspare); + getstats, isspare, isl2cache); VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, vd->vdev_children) == 0); @@ -357,7 +359,7 @@ vdev_label_read_config(vdev_t *vd) */ static boolean_t vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, - uint64_t *spare_guid) + uint64_t *spare_guid, uint64_t *l2cache_guid) { spa_t *spa = vd->vdev_spa; uint64_t state, pool_guid, device_guid, txg, spare_pool; @@ -366,6 +368,8 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, if (spare_guid) *spare_guid = 0ULL; + if (l2cache_guid) + *l2cache_guid = 0ULL; /* * Read the label, if any, and perform some basic sanity checks. @@ -384,7 +388,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, return (B_FALSE); } - if (state != POOL_STATE_SPARE && + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, @@ -400,9 +404,10 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, * be a part of. The only way this is allowed is if the device is a hot * spare (which we check for later on). */ - if (state != POOL_STATE_SPARE && + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && !spa_guid_exists(pool_guid, device_guid) && - !spa_spare_exists(device_guid, NULL)) + !spa_spare_exists(device_guid, NULL) && + !spa_l2cache_exists(device_guid, NULL)) return (B_FALSE); /* @@ -412,13 +417,14 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, * user has attempted to add the same vdev multiple times in the same * transaction. */ - if (state != POOL_STATE_SPARE && txg == 0 && vdtxg == crtxg) + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + txg == 0 && vdtxg == crtxg) return (B_TRUE); /* * Check to see if this is a spare device. We do an explicit check for * spa_has_spare() here because it may be on our pending list of spares - * to add. + * to add. We also check if it is an l2cache device. */ if (spa_spare_exists(device_guid, &spare_pool) || spa_has_spare(spa, device_guid)) { @@ -427,6 +433,7 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, switch (reason) { case VDEV_LABEL_CREATE: + case VDEV_LABEL_L2CACHE: return (B_TRUE); case VDEV_LABEL_REPLACE: @@ -439,6 +446,12 @@ vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, } /* + * Check to see if this is an l2cache device. + */ + if (spa_l2cache_exists(device_guid, NULL)) + return (B_TRUE); + + /* * If the device is marked ACTIVE, then this device is in use by another * pool on the system. 
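/*
 * Two hunks up, vdev_space_update() gains an update_root argument: deltas are
 * always charged to the device's own stats, but the roll-up into the root
 * vdev is now optional.  spa_l2cache_space_update() (in spa_misc.c above)
 * passes B_FALSE, so L2ARC capacity never inflates the pool's reported size.
 * Simplified sketch of that conditional roll-up; devstats_t and
 * space_update() are stand-ins, not the kernel code:
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct devstats {
	int64_t space;
	int64_t alloc;
} devstats_t;

static void
space_update(devstats_t *dev, devstats_t *root, int64_t space_delta,
    int64_t alloc_delta, bool update_root)
{
	dev->space += space_delta;	/* always charged to the device */
	dev->alloc += alloc_delta;

	if (!update_root)		/* cache devices stop here */
		return;

	root->space += space_delta;	/* normal vdevs also grow the pool */
	root->alloc += alloc_delta;
}

int
main(void)
{
	devstats_t root = { 0, 0 }, disk = { 0, 0 }, cache = { 0, 0 };

	space_update(&disk, &root, 1000, 0, true);	/* counted in the pool */
	space_update(&cache, &root, 500, 0, false);	/* cache device: not */
	(void) printf("pool space = %lld\n", (long long)root.space);
	return (0);
}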
*/ @@ -466,7 +479,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) char *buf; size_t buflen; int error; - uint64_t spare_guid; + uint64_t spare_guid, l2cache_guid; ASSERT(spa_config_held(spa, RW_WRITER)); @@ -488,19 +501,20 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * Determine if the vdev is in use. */ if (reason != VDEV_LABEL_REMOVE && - vdev_inuse(vd, crtxg, reason, &spare_guid)) + vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) return (EBUSY); ASSERT(reason != VDEV_LABEL_REMOVE || - vdev_inuse(vd, crtxg, reason, NULL)); + vdev_inuse(vd, crtxg, reason, NULL, NULL)); /* - * If this is a request to add or replace a spare that is in use - * elsewhere on the system, then we must update the guid (which was - * initialized to a random value) to reflect the actual GUID (which is - * shared between multiple pools). + * If this is a request to add or replace a spare or l2cache device + * that is in use elsewhere on the system, then we must update the + * guid (which was initialized to a random value) to reflect the + * actual GUID (which is shared between multiple pools). */ - if (reason != VDEV_LABEL_REMOVE && spare_guid != 0ULL) { + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE && + spare_guid != 0ULL) { vdev_t *pvd = vd->vdev_parent; for (; pvd != NULL; pvd = pvd->vdev_parent) { @@ -520,6 +534,27 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) ASSERT(reason == VDEV_LABEL_REPLACE); } + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && + l2cache_guid != 0ULL) { + vdev_t *pvd = vd->vdev_parent; + + for (; pvd != NULL; pvd = pvd->vdev_parent) { + pvd->vdev_guid_sum -= vd->vdev_guid; + pvd->vdev_guid_sum += l2cache_guid; + } + + vd->vdev_guid = vd->vdev_guid_sum = l2cache_guid; + + /* + * If this is a replacement, then we want to fallthrough to the + * rest of the code. If we're adding an l2cache, then it's + * already labeled appropriately and we can just return. + */ + if (reason == VDEV_LABEL_L2CACHE) + return (0); + ASSERT(reason == VDEV_LABEL_REPLACE); + } + /* * Initialize its label. */ @@ -549,6 +584,19 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) POOL_STATE_SPARE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); + } else if (reason == VDEV_LABEL_L2CACHE || + (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { + /* + * For level 2 ARC devices, add a special label. + */ + VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, + POOL_STATE_L2CACHE) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); } else { label = spa_config_generate(spa, vd, 0ULL, B_FALSE); @@ -623,13 +671,19 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * If this vdev hasn't been previously identified as a spare, then we * mark it as such only if a) we are labeling it as a spare, or b) it - * exists as a spare elsewhere in the system. + * exists as a spare elsewhere in the system. Do the same for + * level 2 ARC devices. 
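/*
 * Per the vdev_label_init() hunk above, an L2ARC device gets a deliberately
 * minimal label: just the pool version, POOL_STATE_L2CACHE and the device's
 * own guid, with no pool guid or vdev tree, which is what lets a cache device
 * be identified independently of any particular pool.  User-space libnvpair
 * sketch of building such an nvlist (link with -lnvpair); the macro values
 * mirror sys/fs/zfs.h and the guid here is purely illustrative:
 */
#include <libnvpair.h>
#include <stdio.h>

#define	ZPOOL_CONFIG_VERSION	"version"
#define	ZPOOL_CONFIG_POOL_STATE	"state"
#define	ZPOOL_CONFIG_GUID	"guid"
#define	POOL_STATE_L2CACHE	4ULL	/* matches the new pool_state_t entry */

int
main(void)
{
	nvlist_t *label;

	if (nvlist_alloc(&label, NV_UNIQUE_NAME, 0) != 0)
		return (1);

	/* only three pairs: version, state, and the device's own guid */
	(void) nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, 10ULL);
	(void) nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
	    POOL_STATE_L2CACHE);
	(void) nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, 0x1234ULL);

	nvlist_print(stdout, label);
	nvlist_free(label);
	return (0);
}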
*/ if (error == 0 && !vd->vdev_isspare && (reason == VDEV_LABEL_SPARE || spa_spare_exists(vd->vdev_guid, NULL))) spa_spare_add(vd); + if (error == 0 && !vd->vdev_isl2cache && + (reason == VDEV_LABEL_L2CACHE || + spa_l2cache_exists(vd->vdev_guid, NULL))) + spa_l2cache_add(vd); + return (error); } diff --git a/usr/src/uts/common/fs/zfs/zfs_ioctl.c b/usr/src/uts/common/fs/zfs/zfs_ioctl.c index fa2f71308f..e46af7a3e6 100644 --- a/usr/src/uts/common/fs/zfs/zfs_ioctl.c +++ b/usr/src/uts/common/fs/zfs/zfs_ioctl.c @@ -963,23 +963,30 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; - nvlist_t *config; + nvlist_t *config, **l2cache; + uint_t nl2cache; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); + error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + &config); + (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache); + /* * A root pool with concatenated devices is not supported. * Thus, can not add a device to a root pool with one device. + * Allow for l2cache devices to be added. */ - if (spa->spa_root_vdev->vdev_children == 1 && spa->spa_bootfs != 0) { + if (spa->spa_root_vdev->vdev_children == 1 && spa->spa_bootfs != 0 && + nl2cache == 0) { spa_close(spa, FTAG); return (EDOM); } - if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) == 0) { + if (error == 0) { error = spa_vdev_add(spa, config); nvlist_free(config); } @@ -2573,9 +2580,26 @@ zfs_ioc_clear(zfs_cmd_t *zc) if (zc->zc_guid == 0) { vd = NULL; } else if ((vd = spa_lookup_by_guid(spa, zc->zc_guid)) == NULL) { - (void) spa_vdev_exit(spa, NULL, txg, ENODEV); - spa_close(spa, FTAG); - return (ENODEV); + spa_aux_vdev_t *sav; + int i; + + /* + * Check if this is an l2cache device. + */ + ASSERT(spa != NULL); + sav = &spa->spa_l2cache; + for (i = 0; i < sav->sav_count; i++) { + if (sav->sav_vdevs[i]->vdev_guid == zc->zc_guid) { + vd = sav->sav_vdevs[i]; + break; + } + } + + if (vd == NULL) { + (void) spa_vdev_exit(spa, NULL, txg, ENODEV); + spa_close(spa, FTAG); + return (ENODEV); + } } vdev_clear(spa, vd, B_TRUE); diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index fcce8fa65e..112aaa6f25 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -675,7 +675,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, static void zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, - int checksum) + int checksum, boolean_t labels) { ASSERT(vd->vdev_children == 0); @@ -683,8 +683,12 @@ zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0); ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); - ASSERT(offset + size <= VDEV_LABEL_START_SIZE || - offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); +#ifdef ZFS_DEBUG + if (labels) { + ASSERT(offset + size <= VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + } +#endif ASSERT3U(offset + size, <=, vd->vdev_psize); BP_ZERO(bp); @@ -703,14 +707,14 @@ zio_phys_bp_init(vdev_t *vd, blkptr_t *bp, uint64_t offset, uint64_t size, zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, - int priority, int flags) + int priority, int flags, boolean_t labels) { zio_t *zio; blkptr_t blk; ZIO_ENTER(vd->vdev_spa); - zio_phys_bp_init(vd, &blk, offset, size, checksum); + zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); zio = zio_create(pio, 
vd->vdev_spa, 0, &blk, data, size, done, private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, @@ -730,7 +734,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, - int priority, int flags) + int priority, int flags, boolean_t labels) { zio_block_tail_t *zbt; void *wbuf; @@ -739,7 +743,7 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, ZIO_ENTER(vd->vdev_spa); - zio_phys_bp_init(vd, &blk, offset, size, checksum); + zio_phys_bp_init(vd, &blk, offset, size, checksum, labels); zio = zio_create(pio, vd->vdev_spa, 0, &blk, data, size, done, private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index 10074241c0..3fff205c47 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -219,14 +219,15 @@ typedef enum zfs_share_op { #define SPA_VERSION_7 7ULL #define SPA_VERSION_8 8ULL #define SPA_VERSION_9 9ULL +#define SPA_VERSION_10 10ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understand the on-disk * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*}, * and do the appropriate changes. */ -#define SPA_VERSION SPA_VERSION_9 -#define SPA_VERSION_STRING "9" +#define SPA_VERSION SPA_VERSION_10 +#define SPA_VERSION_STRING "10" /* * Symbolic names for the changes that caused a SPA_VERSION switch. @@ -256,6 +257,7 @@ typedef enum zfs_share_op { #define SPA_VERSION_REFRESERVATION SPA_VERSION_9 #define SPA_VERSION_REFQUOTA SPA_VERSION_9 #define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 +#define SPA_VERSION_L2CACHE SPA_VERSION_10 /* * ZPL version - rev'd whenever an incompatible on-disk format change @@ -312,6 +314,7 @@ typedef enum zfs_share_op { #define ZPOOL_CONFIG_UNSPARE "unspare" #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" +#define ZPOOL_CONFIG_L2CACHE "l2cache" /* * The persistent vdev state is stored as separate values rather than a single * 'vdev_state' entry. This is because a device can be in multiple states, such @@ -331,6 +334,7 @@ typedef enum zfs_share_op { #define VDEV_TYPE_MISSING "missing" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" +#define VDEV_TYPE_L2CACHE "l2cache" /* * This is needed in userland to report the minimum necessary device size. @@ -384,14 +388,16 @@ typedef enum vdev_aux { /* * pool state. The following states are written to disk as part of the normal - * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE. The remaining states are - * software abstractions used at various levels to communicate pool state. + * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining + * states are software abstractions used at various levels to communicate + * pool state. */ typedef enum pool_state { POOL_STATE_ACTIVE = 0, /* In active use */ POOL_STATE_EXPORTED, /* Explicitly exported */ POOL_STATE_DESTROYED, /* Explicitly destroyed */ POOL_STATE_SPARE, /* Reserved for hot spare use */ + POOL_STATE_L2CACHE, /* Level 2 ARC device */ POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ POOL_STATE_IO_FAILURE, /* Internal pool state */ POOL_STATE_UNAVAIL, /* Internal libzfs state */ |
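/*
 * The header hunk above bumps the on-disk format to SPA_VERSION_10 and names
 * the new capability SPA_VERSION_L2CACHE, so cache-device support can be
 * gated on a pool's version just like earlier features.  Minimal sketch of
 * such a version gate; pool_supports_l2cache() is hypothetical, the constant
 * mirrors the header change:
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	SPA_VERSION_L2CACHE	10ULL

static bool
pool_supports_l2cache(uint64_t pool_version)
{
	return (pool_version >= SPA_VERSION_L2CACHE);
}

int
main(void)
{
	(void) printf("v9 pool:  %s\n", pool_supports_l2cache(9) ? "yes" : "no");
	(void) printf("v10 pool: %s\n", pool_supports_l2cache(10) ? "yes" : "no");
	return (0);
}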