summary | refs | log | tree | commit | diff
path: root/usr/src
diff options
context:
space:
mode:
author Andy Fiddaman <omnios@citrus-it.co.uk> 2022-02-02 23:04:19 +0000
committer Andy Fiddaman <omnios@citrus-it.co.uk> 2022-02-17 22:18:17 +0000
commit 6f0e4dc91b854250fff5c24de2d27aed3375ac69 (patch)
tree 8755f8b826b884eed81f346a0b3da8b16bd7c506 /usr/src
parent cd0d4b4073e62fa22997078b1595f399434a1047 (diff)
download illumos-joyent-6f0e4dc91b854250fff5c24de2d27aed3375ac69.tar.gz
14469 nvme could raise dynamic lun expansion sysevents
Reviewed by: Robert Mustacchi <rm+illumos@fingolfin.org>
Reviewed by: Jason King <jason.brian.king+illumos@gmail.com>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src')
-rw-r--r-- usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c | 31
-rw-r--r-- usr/src/uts/common/io/blkdev/blkdev.c | 127
-rw-r--r-- usr/src/uts/common/io/nvme/nvme.c | 43
3 files changed, 176 insertions, 25 deletions
diff --git a/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c b/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c
index 4697128c90..82c296a669 100644
--- a/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c
+++ b/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c
@@ -22,6 +22,7 @@
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+ * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
*/
/*
@@ -32,15 +33,15 @@
*
* When a device is added to the system:
*
- * 1. Search for any vdevs whose devid matches that of the newly added
+ * 1. Search for any vdevs whose devid matches that of the newly added
* device.
*
- * 2. If no vdevs are found, then search for any vdevs whose devfs path
+ * 2. If no vdevs are found, then search for any vdevs whose devfs path
* matches that of the new device.
*
* 3. If no vdevs match by either method, then ignore the event.
*
- * 4. Attempt to online the device with a flag to indicate that it should
+ * 4. Attempt to online the device with a flag to indicate that it should
* be unspared when resilvering completes. If this succeeds, then the
* same device was inserted and we should continue normally.
*
@@ -319,11 +320,11 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
* string. However, we allow substring matches in the following
* cases:
*
- * <path>: This is a devpath, and the target is one
- * of its children.
+ * <path>: This is a devpath, and the target is one
+ * of its children.
*
- * <path/> This is a devid for a whole disk, and
- * the target is one of its children.
+ * <path/> This is a devid for a whole disk, and
+ * the target is one of its children.
*/
if (path[len] != '\0' && path[len] != ':' &&
path[len - 1] != '/')
@@ -555,7 +556,7 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
vdev_state_t newstate;
nvlist_t *tgt;
- syseventd_print(9, "zfsdle_vdev_online: searching for %s in pool %s\n",
+ syseventd_print(9, "%s: searching for %s in pool %s\n", __func__,
devname, zpool_get_name(zhp));
if ((tgt = zpool_find_vdev_by_physpath(zhp, devname,
@@ -568,6 +569,11 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK,
&wholedisk) == 0);
+ syseventd_print(9, "%s: "
+ "found %s in pool %s (wholedisk: %s)\n", __func__,
+ path, zpool_get_name(zhp),
+ wholedisk != 0 ? "true" : "false");
+
(void) strlcpy(fullpath, path, sizeof (fullpath));
if (wholedisk) {
fullpath[strlen(fullpath) - 2] = '\0';
@@ -581,12 +587,13 @@ zfsdle_vdev_online(zpool_handle_t *zhp, void *data)
}
if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) {
- syseventd_print(9, "zfsdle_vdev_online: setting device"
- " device %s to ONLINE state in pool %s.\n",
- fullpath, zpool_get_name(zhp));
- if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL)
+ syseventd_print(9, "%s: "
+ "setting device %s to ONLINE state in pool %s.\n",
+ __func__, fullpath, zpool_get_name(zhp));
+ if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) {
(void) zpool_vdev_online(zhp, fullpath, 0,
&newstate);
+ }
}
zpool_close(zhp);
return (1);
diff --git a/usr/src/uts/common/io/blkdev/blkdev.c b/usr/src/uts/common/io/blkdev/blkdev.c
index c0bdb3dab2..611666b0a1 100644
--- a/usr/src/uts/common/io/blkdev/blkdev.c
+++ b/usr/src/uts/common/io/blkdev/blkdev.c
@@ -26,6 +26,7 @@
* Copyright 2017 The MathWorks, Inc. All rights reserved.
* Copyright 2019 Western Digital Corporation.
* Copyright 2020 Joyent, Inc.
+ * Copyright 2022 OmniOS Community Edition (OmniOSce) Association.
*/
#include <sys/types.h>
@@ -55,6 +56,11 @@
#include <sys/note.h>
#include <sys/blkdev.h>
#include <sys/scsi/impl/inquiry.h>
+#include <sys/taskq.h>
+#include <sys/taskq_impl.h>
+#include <sys/disp.h>
+#include <sys/sysevent/eventdefs.h>
+#include <sys/sysevent/dev.h>
/*
* blkdev is a driver which provides a lot of the common functionality
@@ -122,8 +128,8 @@
*
* Locks
* -----
- * There are 4 instance global locks d_ocmutex, d_ksmutex, d_errmutex and
- * d_statemutex. As well a q_iomutex per waitq/runq pair.
+ * There are 5 instance global locks: d_ocmutex, d_ksmutex, d_errmutex,
+ * d_statemutex and d_dle_mutex, as well as a q_iomutex per waitq/runq pair.
*
* Lock Hierarchy
* --------------
@@ -139,11 +145,16 @@ typedef struct bd bd_t;
typedef struct bd_xfer_impl bd_xfer_impl_t;
typedef struct bd_queue bd_queue_t;
+typedef enum {
+ BD_DLE_PENDING = 1 << 0, /* set by bd_update_state; cleared by the task when it picks the request up */
+ BD_DLE_RUNNING = 1 << 1 /* set while bd_dle_sysevent_task is executing */
+} bd_dle_state_t;
+
struct bd {
void *d_private;
dev_info_t *d_dip;
- kmutex_t d_ocmutex;
- kmutex_t d_ksmutex;
+ kmutex_t d_ocmutex; /* open/close */
+ kmutex_t d_ksmutex; /* kstat */
kmutex_t d_errmutex;
kmutex_t d_statemutex;
kcondvar_t d_statecv;
@@ -183,6 +194,10 @@ struct bd {
ddi_dma_attr_t d_dma;
bd_ops_t d_ops;
bd_handle_t d_handle;
+
+ kmutex_t d_dle_mutex;
+ taskq_ent_t d_dle_ent;
+ bd_dle_state_t d_dle_state;
};
struct bd_handle {
@@ -328,20 +343,34 @@ static struct modlinkage modlinkage = {
static void *bd_state;
static krwlock_t bd_lock;
+static taskq_t *bd_taskq;
int
_init(void)
{
- int rv;
+ char taskq_name[TASKQ_NAMELEN];
+ const char *name;
+ int rv;
rv = ddi_soft_state_init(&bd_state, sizeof (struct bd), 2);
- if (rv != DDI_SUCCESS) {
+ if (rv != DDI_SUCCESS)
return (rv);
+
+ name = mod_modname(&modlinkage);
+ (void) snprintf(taskq_name, sizeof (taskq_name), "%s_taskq", name);
+ bd_taskq = taskq_create(taskq_name, 1, minclsyspri, 0, 0, 0);
+ if (bd_taskq == NULL) {
+ cmn_err(CE_WARN, "%s: unable to create %s", name, taskq_name);
+ ddi_soft_state_fini(&bd_state);
+ return (DDI_FAILURE);
}
+
rw_init(&bd_lock, NULL, RW_DRIVER, NULL);
+
rv = mod_install(&modlinkage);
if (rv != DDI_SUCCESS) {
rw_destroy(&bd_lock);
+ taskq_destroy(bd_taskq);
ddi_soft_state_fini(&bd_state);
}
return (rv);
@@ -355,6 +384,7 @@ _fini(void)
rv = mod_remove(&modlinkage);
if (rv == DDI_SUCCESS) {
rw_destroy(&bd_lock);
+ taskq_destroy(bd_taskq);
ddi_soft_state_fini(&bd_state);
}
return (rv);
@@ -696,6 +726,8 @@ bd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
mutex_init(&bd->d_ocmutex, NULL, MUTEX_DRIVER, NULL);
mutex_init(&bd->d_statemutex, NULL, MUTEX_DRIVER, NULL);
cv_init(&bd->d_statecv, NULL, CV_DRIVER, NULL);
+ mutex_init(&bd->d_dle_mutex, NULL, MUTEX_DRIVER, NULL);
+ bd->d_dle_state = 0;
bd->d_cache = kmem_cache_create(kcache, sizeof (bd_xfer_impl_t), 8,
bd_xfer_ctor, bd_xfer_dtor, NULL, bd, NULL, 0);
@@ -853,6 +885,7 @@ fail_drive_info:
mutex_destroy(&bd->d_statemutex);
mutex_destroy(&bd->d_ocmutex);
mutex_destroy(&bd->d_ksmutex);
+ mutex_destroy(&bd->d_dle_mutex);
ddi_soft_state_free(bd_state, inst);
return (DDI_FAILURE);
}
@@ -891,6 +924,7 @@ bd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
mutex_destroy(&bd->d_ocmutex);
mutex_destroy(&bd->d_statemutex);
cv_destroy(&bd->d_statecv);
+ mutex_destroy(&bd->d_dle_mutex);
bd_queues_free(bd);
ddi_soft_state_free(bd_state, ddi_get_instance(dip));
return (DDI_SUCCESS);
@@ -1890,6 +1924,69 @@ bd_runq_exit(bd_xfer_impl_t *xi, int err)
}
static void
+bd_dle_sysevent_task(void *arg)
+{
+ nvlist_t *attr = NULL;
+ char *path = NULL;
+ bd_t *bd = arg;
+ dev_info_t *dip = bd->d_dip;
+ size_t n;
+ /*
+ * Task function, dispatched on bd_taskq by bd_update_state, that posts
+ * EC_DEV_STATUS/ESC_DEV_DLE sysevents for the bd_t passed in arg.
+ *
+ * Consume the request that caused this dispatch: clear PENDING and
+ * mark the task RUNNING. While RUNNING is set, bd_update_state only
+ * sets PENDING again instead of dispatching d_dle_ent a second time.
+ */
+ mutex_enter(&bd->d_dle_mutex);
+ bd->d_dle_state &= ~BD_DLE_PENDING;
+ bd->d_dle_state |= BD_DLE_RUNNING;
+ mutex_exit(&bd->d_dle_mutex);
+
+ dev_err(dip, CE_NOTE, "!dynamic LUN expansion");
+ /*
+ * Without an attribute list no event can be posted. Drop both state
+ * bits so that a later state change dispatches the task afresh.
+ */
+ if (nvlist_alloc(&attr, NV_UNIQUE_NAME_TYPE, KM_SLEEP) != 0) {
+ mutex_enter(&bd->d_dle_mutex);
+ bd->d_dle_state &= ~(BD_DLE_RUNNING|BD_DLE_PENDING);
+ mutex_exit(&bd->d_dle_mutex);
+ return;
+ }
+
+ path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+ /*
+ * Build "/devices" + the ddi_pathname() output for dip + ":x". The
+ * final 'x' is a placeholder; n ends up as the string length, so
+ * path[n - 1] is the minor node letter overwritten in the loop below.
+ */
+ n = snprintf(path, MAXPATHLEN, "/devices");
+ (void) ddi_pathname(dip, path + n);
+ n = strlen(path);
+ n += snprintf(path + n, MAXPATHLEN - n, ":x");
+
+ for (;;) {
+ /*
+ * On receipt of this event, the ZFS sysevent module will scan
+ * active zpools for child vdevs matching this physical path.
+ * In order to catch both whole disk pools and those with an
+ * EFI boot partition, generate separate sysevents for minor
+ * node 'a' and 'b'. (By comparison, io/scsi/targets/sd.c sends
+ * events for just 'a')
+ */
+ for (char c = 'a'; c < 'c'; c++) {
+ path[n - 1] = c; /* replace the placeholder minor letter */
+
+ if (nvlist_add_string(attr, DEV_PHYS_PATH, path) != 0)
+ break; /* leaves the inner loop only; the state check below still runs */
+
+ (void) ddi_log_sysevent(dip, DDI_VENDOR_SUNW,
+ EC_DEV_STATUS, ESC_DEV_DLE, attr, NULL, DDI_SLEEP);
+ }
+ /*
+ * If bd_update_state set PENDING while the events above were being
+ * posted, consume that request and go round again; otherwise clear
+ * RUNNING and finish.
+ */
+ mutex_enter(&bd->d_dle_mutex);
+ if ((bd->d_dle_state & BD_DLE_PENDING) == 0) {
+ bd->d_dle_state &= ~BD_DLE_RUNNING;
+ mutex_exit(&bd->d_dle_mutex);
+ break;
+ }
+ bd->d_dle_state &= ~BD_DLE_PENDING;
+ mutex_exit(&bd->d_dle_mutex);
+ }
+
+ nvlist_free(attr);
+ kmem_free(path, MAXPATHLEN);
+}
+
+static void
bd_update_state(bd_t *bd)
{
enum dkio_state state = DKIO_INSERTED;
@@ -1908,8 +2005,7 @@ bd_update_state(bd_t *bd)
if ((media.m_blksize < 512) ||
(!ISP2(media.m_blksize)) ||
(P2PHASE(bd->d_maxxfer, media.m_blksize))) {
- cmn_err(CE_WARN, "%s%d: Invalid media block size (%d)",
- ddi_driver_name(bd->d_dip), ddi_get_instance(bd->d_dip),
+ dev_err(bd->d_dip, CE_WARN, "Invalid media block size (%d)",
media.m_blksize);
/*
* We can't use the media, treat it as not present.
@@ -1954,6 +2050,21 @@ done:
if (docmlb) {
if (state == DKIO_INSERTED) {
(void) cmlb_validate(bd->d_cmlbh, 0, 0);
+
+ mutex_enter(&bd->d_dle_mutex);
+ /*
+ * If there is already an event pending, there's
+ * nothing to do; we coalesce multiple events.
+ */
+ if ((bd->d_dle_state & BD_DLE_PENDING) == 0) {
+ if ((bd->d_dle_state & BD_DLE_RUNNING) == 0) {
+ taskq_dispatch_ent(bd_taskq,
+ bd_dle_sysevent_task, bd, 0,
+ &bd->d_dle_ent);
+ }
+ bd->d_dle_state |= BD_DLE_PENDING;
+ }
+ mutex_exit(&bd->d_dle_mutex);
} else {
cmlb_invalidate(bd->d_cmlbh, 0);
}
diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c
index ad076201e0..e9c779e323 100644
--- a/usr/src/uts/common/io/nvme/nvme.c
+++ b/usr/src/uts/common/io/nvme/nvme.c
@@ -445,6 +445,8 @@ static int nvme_open(dev_t *, int, int, cred_t *);
static int nvme_close(dev_t, int, int, cred_t *);
static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *);
+static void nvme_changed_ns(nvme_t *, int);
+
static ddi_ufm_ops_t nvme_ufm_ops = {
NULL,
nvme_ufm_fill_image,
@@ -1955,11 +1957,7 @@ nvme_async_event_task(void *arg)
if (nsid == 0) /* end of list */
break;
-
- dev_err(nvme->n_dip, CE_CONT,
- "namespace %u (%s) has changed.\n",
- nsid, nvme->n_ns[nsid - 1].ns_name);
- /* TODO: handle namespace resize. */
+ nvme_changed_ns(nvme, nsid);
}
break;
@@ -2693,6 +2691,41 @@ nvme_prepare_devid(nvme_t *nvme, uint32_t nsid)
nvme->n_idctl->id_vid, model, serial, nsid);
}
+/*
+ * Handle a change notification for namespace nsid: re-identify the
+ * namespace, replace the cached identify data, refresh the block count and
+ * block size derived from it, and ask blkdev to re-check the device state.
+ *
+ * Namespaces flagged ns_ignore are logged but otherwise left alone. If the
+ * identify command fails, the previously cached data is kept unchanged.
+ */
+static void
+nvme_changed_ns(nvme_t *nvme, int nsid)
+{
+	nvme_namespace_t *ns;
+	nvme_identify_nsid_t *idns, *oidns;
+
+	/*
+	 * nsid indexes n_ns[] as nsid - 1, so reject anything below 1 before
+	 * touching the array.
+	 *
+	 * NOTE(review): the NVMe Changed Namespace List may carry 0xffffffff
+	 * as its first entry when too many namespaces changed; presumably
+	 * that arrives here as a negative int -- confirm against the caller's
+	 * nsid type. The upper bound (number of namespaces the controller
+	 * exposes) is not checked here and should be validated as well.
+	 */
+	if (nsid < 1) {
+		dev_err(nvme->n_dip, CE_WARN,
+		    "!ignoring change event for invalid namespace %d", nsid);
+		return;
+	}
+
+	ns = &nvme->n_ns[nsid - 1];
+
+	/* nsid is an int, so it is printed with %d (was %u). */
+	dev_err(nvme->n_dip, CE_NOTE, "!namespace %d (%s) has changed.",
+	    nsid, ns->ns_name);
+
+	if (ns->ns_ignore)
+		return;
+
+	/*
+	 * The namespace has changed in some way. At present, we only update
+	 * the device capacity and trigger blkdev to check the device state.
+	 */
+
+	if (nvme_identify(nvme, B_FALSE, nsid, (void **)&idns) != 0) {
+		dev_err(nvme->n_dip, CE_WARN,
+		    "!failed to identify namespace %d", nsid);
+		return;
+	}
+
+	/* Publish the new identify data before releasing the old copy. */
+	oidns = ns->ns_idns;
+	ns->ns_idns = idns;
+	kmem_free(oidns, sizeof (nvme_identify_nsid_t));
+
+	/* Block size is 2^lbads for the currently selected LBA format. */
+	ns->ns_block_count = idns->id_nsize;
+	ns->ns_block_size =
+	    1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads;
+	ns->ns_best_block_size = ns->ns_block_size;
+
+	bd_state_change(ns->ns_bd_hdl);
+}
+
static int
nvme_init_ns(nvme_t *nvme, int nsid)
{