diff options
author | Andy Fiddaman <omnios@citrus-it.co.uk> | 2022-02-02 23:04:19 +0000 |
---|---|---|
committer | Andy Fiddaman <omnios@citrus-it.co.uk> | 2022-02-17 22:18:17 +0000 |
commit | 6f0e4dc91b854250fff5c24de2d27aed3375ac69 (patch) | |
tree | 8755f8b826b884eed81f346a0b3da8b16bd7c506 /usr/src | |
parent | cd0d4b4073e62fa22997078b1595f399434a1047 (diff) | |
download | illumos-joyent-6f0e4dc91b854250fff5c24de2d27aed3375ac69.tar.gz |
14469 nvme could raise dynamic lun expansion sysevents
Reviewed by: Robert Mustacchi <rm+illumos@fingolfin.org>
Reviewed by: Jason King <jason.brian.king+illumos@gmail.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c | 31 | ||||
-rw-r--r-- | usr/src/uts/common/io/blkdev/blkdev.c | 127 | ||||
-rw-r--r-- | usr/src/uts/common/io/nvme/nvme.c | 43 |
3 files changed, 176 insertions, 25 deletions
diff --git a/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c b/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c index 4697128c90..82c296a669 100644 --- a/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c +++ b/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c @@ -22,6 +22,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright 2022 OmniOS Community Edition (OmniOSce) Association. */ /* @@ -32,15 +33,15 @@ * * When a device is added to the system: * - * 1. Search for any vdevs whose devid matches that of the newly added + * 1. Search for any vdevs whose devid matches that of the newly added * device. * - * 2. If no vdevs are found, then search for any vdevs whose devfs path + * 2. If no vdevs are found, then search for any vdevs whose devfs path * matches that of the new device. * * 3. If no vdevs match by either method, then ignore the event. * - * 4. Attempt to online the device with a flag to indicate that it should + * 4. Attempt to online the device with a flag to indicate that it should * be unspared when resilvering completes. If this succeeds, then the * same device was inserted and we should continue normally. * @@ -319,11 +320,11 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) * string. However, we allow substring matches in the following * cases: * - * <path>: This is a devpath, and the target is one - * of its children. + * <path>: This is a devpath, and the target is one + * of its children. * - * <path/> This is a devid for a whole disk, and - * the target is one of its children. + * <path/> This is a devid for a whole disk, and + * the target is one of its children. */ if (path[len] != '\0' && path[len] != ':' && path[len - 1] != '/') @@ -555,7 +556,7 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data) vdev_state_t newstate; nvlist_t *tgt; - syseventd_print(9, "zfsdle_vdev_online: searching for %s in pool %s\n", + syseventd_print(9, "%s: searching for %s in pool %s\n", __func__, devname, zpool_get_name(zhp)); if ((tgt = zpool_find_vdev_by_physpath(zhp, devname, @@ -568,6 +569,11 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data) verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk) == 0); + syseventd_print(9, "%s: " + "found %s in pool %s (wholedisk: %s)\n", __func__, + path, zpool_get_name(zhp), + wholedisk != 0 ? "true" : "false"); + (void) strlcpy(fullpath, path, sizeof (fullpath)); if (wholedisk) { fullpath[strlen(fullpath) - 2] = '\0'; @@ -581,12 +587,13 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data) } if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { - syseventd_print(9, "zfsdle_vdev_online: setting device" - " device %s to ONLINE state in pool %s.\n", - fullpath, zpool_get_name(zhp)); - if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) + syseventd_print(9, "%s: " + "setting device %s to ONLINE state in pool %s.\n", + __func__, fullpath, zpool_get_name(zhp)); + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { (void) zpool_vdev_online(zhp, fullpath, 0, &newstate); + } } zpool_close(zhp); return (1); diff --git a/usr/src/uts/common/io/blkdev/blkdev.c b/usr/src/uts/common/io/blkdev/blkdev.c index c0bdb3dab2..611666b0a1 100644 --- a/usr/src/uts/common/io/blkdev/blkdev.c +++ b/usr/src/uts/common/io/blkdev/blkdev.c @@ -26,6 +26,7 @@ * Copyright 2017 The MathWorks, Inc. All rights reserved. * Copyright 2019 Western Digital Corporation. * Copyright 2020 Joyent, Inc. + * Copyright 2022 OmniOS Community Edition (OmniOSce) Association. */ #include <sys/types.h> @@ -55,6 +56,11 @@ #include <sys/note.h> #include <sys/blkdev.h> #include <sys/scsi/impl/inquiry.h> +#include <sys/taskq.h> +#include <sys/taskq_impl.h> +#include <sys/disp.h> +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> /* * blkdev is a driver which provides a lot of the common functionality @@ -122,8 +128,8 @@ * * Locks * ----- - * There are 4 instance global locks d_ocmutex, d_ksmutex, d_errmutex and - * d_statemutex. As well a q_iomutex per waitq/runq pair. + * There are 5 instance global locks d_ocmutex, d_ksmutex, d_errmutex, + * d_statemutex and d_dle_mutex. As well a q_iomutex per waitq/runq pair. * * Lock Hierarchy * -------------- @@ -139,11 +145,16 @@ typedef struct bd bd_t; typedef struct bd_xfer_impl bd_xfer_impl_t; typedef struct bd_queue bd_queue_t; +typedef enum { + BD_DLE_PENDING = 1 << 0, + BD_DLE_RUNNING = 1 << 1 +} bd_dle_state_t; + struct bd { void *d_private; dev_info_t *d_dip; - kmutex_t d_ocmutex; - kmutex_t d_ksmutex; + kmutex_t d_ocmutex; /* open/close */ + kmutex_t d_ksmutex; /* kstat */ kmutex_t d_errmutex; kmutex_t d_statemutex; kcondvar_t d_statecv; @@ -183,6 +194,10 @@ struct bd { ddi_dma_attr_t d_dma; bd_ops_t d_ops; bd_handle_t d_handle; + + kmutex_t d_dle_mutex; + taskq_ent_t d_dle_ent; + bd_dle_state_t d_dle_state; }; struct bd_handle { @@ -328,20 +343,34 @@ static struct modlinkage modlinkage = { static void *bd_state; static krwlock_t bd_lock; +static taskq_t *bd_taskq; int _init(void) { - int rv; + char taskq_name[TASKQ_NAMELEN]; + const char *name; + int rv; rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2); - if (rv != DDI_SUCCESS) { + if (rv != DDI_SUCCESS) return (rv); + + name = mod_modname(&modlinkage); + (void) snprintf(taskq_name, sizeof (taskq_name), "%s_taskq", name); + bd_taskq = taskq_create(taskq_name, 1, minclsyspri, 0, 0, 0); + if (bd_taskq == NULL) { + cmn_err(CE_WARN, "%s: unable to create %s", name, taskq_name); + ddi_soft_state_fini(&bd_state); + return (DDI_FAILURE); } + rw_init(&bd_lock, NULL, RW_DRIVER, NULL); + rv = mod_install(&modlinkage); if (rv != DDI_SUCCESS) { rw_destroy(&bd_lock); + taskq_destroy(bd_taskq); ddi_soft_state_fini(&bd_state); } return (rv); @@ -355,6 +384,7 @@ _fini(void) rv = mod_remove(&modlinkage); if (rv == DDI_SUCCESS) { rw_destroy(&bd_lock); + taskq_destroy(bd_taskq); ddi_soft_state_fini(&bd_state); } return (rv); @@ -696,6 +726,8 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL); mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL); cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL); + mutex_init(&bd->d_dle_mutex, NULL, MUTEX_DRIVER, NULL); + bd->d_dle_state = 0; bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8, bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0); @@ -853,6 +885,7 @@ fail_drive_info: mutex_destroy(&bd->d_statemutex); mutex_destroy(&bd->d_ocmutex); mutex_destroy(&bd->d_ksmutex); + mutex_destroy(&bd->d_dle_mutex); ddi_soft_state_free(bd_state, inst); return (DDI_FAILURE); } @@ -891,6 +924,7 @@ bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) mutex_destroy(&bd->d_ocmutex); mutex_destroy(&bd->d_statemutex); cv_destroy(&bd->d_statecv); + mutex_destroy(&bd->d_dle_mutex); bd_queues_free(bd); ddi_soft_state_free(bd_state, ddi_get_instance(dip)); return (DDI_SUCCESS); @@ -1890,6 +1924,69 @@ bd_runq_exit(bd_xfer_impl_t *xi, int err) } static void +bd_dle_sysevent_task(void *arg) +{ + nvlist_t *attr = NULL; + char *path = NULL; + bd_t *bd = arg; + dev_info_t *dip = bd->d_dip; + size_t n; + + mutex_enter(&bd->d_dle_mutex); + bd->d_dle_state &= ~BD_DLE_PENDING; + bd->d_dle_state |= BD_DLE_RUNNING; + mutex_exit(&bd->d_dle_mutex); + + dev_err(dip, CE_NOTE, "!dynamic LUN expansion"); + + if (nvlist_alloc(&attr, NV_UNIQUE_NAME_TYPE, KM_SLEEP) != 0) { + mutex_enter(&bd->d_dle_mutex); + bd->d_dle_state &= ~(BD_DLE_RUNNING|BD_DLE_PENDING); + mutex_exit(&bd->d_dle_mutex); + return; + } + + path = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + + n = snprintf(path, MAXPATHLEN, "/devices"); + (void) ddi_pathname(dip, path + n); + n = strlen(path); + n += snprintf(path + n, MAXPATHLEN - n, ":x"); + + for (;;) { + /* + * On receipt of this event, the ZFS sysevent module will scan + * active zpools for child vdevs matching this physical path. + * In order to catch both whole disk pools and those with an + * EFI boot partition, generate separate sysevents for minor + * node 'a' and 'b'. (By comparison, io/scsi/targets/sd.c sends + * events for just 'a') + */ + for (char c = 'a'; c < 'c'; c++) { + path[n - 1] = c; + + if (nvlist_add_string(attr, DEV_PHYS_PATH, path) != 0) + break; + + (void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW, + EC_DEV_STATUS, ESC_DEV_DLE, attr, NULL, DDI_SLEEP); + } + + mutex_enter(&bd->d_dle_mutex); + if ((bd->d_dle_state & BD_DLE_PENDING) == 0) { + bd->d_dle_state &= ~BD_DLE_RUNNING; + mutex_exit(&bd->d_dle_mutex); + break; + } + bd->d_dle_state &= ~BD_DLE_PENDING; + mutex_exit(&bd->d_dle_mutex); + } + + nvlist_free(attr); + kmem_free(path, MAXPATHLEN); +} + +static void bd_update_state(bd_t *bd) { enum dkio_state state = DKIO_INSERTED; @@ -1908,8 +2005,7 @@ bd_update_state(bd_t *bd) if ((media.m_blksize < 512) || (!ISP2(media.m_blksize)) || (P2PHASE(bd->d_maxxfer, media.m_blksize))) { - cmn_err(CE_WARN, "%s%d: Invalid media block size (%d)", - ddi_driver_name(bd->d_dip), ddi_get_instance(bd->d_dip), + dev_err(bd->d_dip, CE_WARN, "Invalid media block size (%d)", media.m_blksize); /* * We can't use the media, treat it as not present. @@ -1954,6 +2050,21 @@ done: if (docmlb) { if (state == DKIO_INSERTED) { (void) cmlb_validate(bd->d_cmlbh, 0, 0); + + mutex_enter(&bd->d_dle_mutex); + /* + * If there is already an event pending, there's + * nothing to do; we coalesce multiple events. + */ + if ((bd->d_dle_state & BD_DLE_PENDING) == 0) { + if ((bd->d_dle_state & BD_DLE_RUNNING) == 0) { + taskq_dispatch_ent(bd_taskq, + bd_dle_sysevent_task, bd, 0, + &bd->d_dle_ent); + } + bd->d_dle_state |= BD_DLE_PENDING; + } + mutex_exit(&bd->d_dle_mutex); } else { cmlb_invalidate(bd->d_cmlbh, 0); } diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c index ad076201e0..e9c779e323 100644 --- a/usr/src/uts/common/io/nvme/nvme.c +++ b/usr/src/uts/common/io/nvme/nvme.c @@ -445,6 +445,8 @@ static int nvme_open(dev_t *, int, int, cred_t *); static int nvme_close(dev_t, int, int, cred_t *); static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); +static void nvme_changed_ns(nvme_t *, int); + static ddi_ufm_ops_t nvme_ufm_ops = { NULL, nvme_ufm_fill_image, @@ -1955,11 +1957,7 @@ nvme_async_event_task(void *arg) if (nsid == 0) /* end of list */ break; - - dev_err(nvme->n_dip, CE_CONT, - "namespace %u (%s) has changed.\n", - nsid, nvme->n_ns[nsid - 1].ns_name); - /* TODO: handle namespace resize. */ + nvme_changed_ns(nvme, nsid); } break; @@ -2693,6 +2691,41 @@ nvme_prepare_devid(nvme_t *nvme, uint32_t nsid) nvme->n_idctl->id_vid, model, serial, nsid); } +static void +nvme_changed_ns(nvme_t *nvme, int nsid) +{ + nvme_namespace_t *ns = &nvme->n_ns[nsid - 1]; + nvme_identify_nsid_t *idns, *oidns; + + dev_err(nvme->n_dip, CE_NOTE, "!namespace %u (%s) has changed.", + nsid, ns->ns_name); + + if (ns->ns_ignore) + return; + + /* + * The namespace has changed in some way. At present, we only update + * the device capacity and trigger blkdev to check the device state. + */ + + if (nvme_identify(nvme, B_FALSE, nsid, (void **)&idns) != 0) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to identify namespace %d", nsid); + return; + } + + oidns = ns->ns_idns; + ns->ns_idns = idns; + kmem_free(oidns, sizeof (nvme_identify_nsid_t)); + + ns->ns_block_count = idns->id_nsize; + ns->ns_block_size = + 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; + ns->ns_best_block_size = ns->ns_block_size; + + bd_state_change(ns->ns_bd_hdl); +} + static int nvme_init_ns(nvme_t *nvme, int nsid) { |