diff options
author | Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> | 2021-10-11 20:49:25 +0200 |
---|---|---|
committer | Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> | 2022-04-20 16:41:47 +0200 |
commit | 353d89b0745ef752e824c1afc3f0474f66dfbd64 (patch) | |
tree | 12aba4e03777e644f3116deb01a19ba2047259b2 | |
parent | c542a624b7efda0b8123026500f05f430ff6c770 (diff) | |
download | illumos-joyent-353d89b0745ef752e824c1afc3f0474f66dfbd64.tar.gz |
14530 nvme should require exclusive open for attach, detach, and format ioctls
Reviewed by: Andrew Giles <agiles@tintri.com>
Reviewed by: Guy Morrogh <gmorrogh@tintri.com>
Reviewed by: Robert Mustacchi <rm+illumos@fingolfin.org>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>
-rw-r--r-- | usr/src/cmd/nvmeadm/nvmeadm.c | 96 | ||||
-rw-r--r-- | usr/src/cmd/nvmeadm/nvmeadm.h | 3 | ||||
-rw-r--r-- | usr/src/cmd/nvmeadm/nvmeadm_dev.c | 4 | ||||
-rw-r--r-- | usr/src/uts/common/io/nvme/nvme.c | 86 | ||||
-rw-r--r-- | usr/src/uts/common/io/nvme/nvme_var.h | 12 |
5 files changed, 134 insertions, 67 deletions
diff --git a/usr/src/cmd/nvmeadm/nvmeadm.c b/usr/src/cmd/nvmeadm/nvmeadm.c index ee9e26c997..3de4fa0b00 100644 --- a/usr/src/cmd/nvmeadm/nvmeadm.c +++ b/usr/src/cmd/nvmeadm/nvmeadm.c @@ -58,9 +58,12 @@ struct nvme_feature { nvme_version_t *); }; -#define NVMEADM_CTRL 1 -#define NVMEADM_NS 2 -#define NVMEADM_BOTH (NVMEADM_CTRL | NVMEADM_NS) +#define NVMEADM_F_CTRL 1 +#define NVMEADM_F_NS 2 +#define NVMEADM_F_BOTH (NVMEADM_F_CTRL | NVMEADM_F_NS) + +#define NVMEADM_C_MULTI 1 +#define NVMEADM_C_EXCL 2 struct nvmeadm_cmd { char *c_name; @@ -68,8 +71,8 @@ struct nvmeadm_cmd { const char *c_flagdesc; int (*c_func)(int, const nvme_process_arg_t *); void (*c_usage)(const char *); - boolean_t c_multi; void (*c_optparse)(nvme_process_arg_t *); + int c_flags; }; @@ -123,119 +126,131 @@ static const nvmeadm_cmd_t nvmeadm_cmds[] = { "list controllers and namespaces", " -p\t\tprint parsable output\n" " -o field\tselect a field for parsable output\n", - do_list, usage_list, B_TRUE, optparse_list + do_list, usage_list, optparse_list, + NVMEADM_C_MULTI }, { "identify", "identify controllers and/or namespaces", NULL, - do_identify, usage_identify, B_TRUE + do_identify, usage_identify, NULL, + NVMEADM_C_MULTI }, { "get-logpage", "get a log page from controllers and/or namespaces", NULL, - do_get_logpage, usage_get_logpage, B_TRUE + do_get_logpage, usage_get_logpage, NULL, + NVMEADM_C_MULTI }, { "get-features", "get features from controllers and/or namespaces", NULL, - do_get_features, usage_get_features, B_TRUE + do_get_features, usage_get_features, NULL, + NVMEADM_C_MULTI }, { "format", "format namespace(s) of a controller", NULL, - do_format, usage_format, B_FALSE + do_format, usage_format, NULL, + NVMEADM_C_EXCL }, { "secure-erase", "secure erase namespace(s) of a controller", " -c Do a cryptographic erase.", - do_secure_erase, usage_secure_erase, B_FALSE + do_secure_erase, usage_secure_erase, NULL, + NVMEADM_C_EXCL }, { "detach", "detach blkdev(4D) from namespace(s) of a controller", NULL, - do_attach_detach, usage_attach_detach, B_FALSE + do_attach_detach, usage_attach_detach, NULL, + NVMEADM_C_EXCL }, { "attach", "attach blkdev(4D) to namespace(s) of a controller", NULL, - do_attach_detach, usage_attach_detach, B_FALSE + do_attach_detach, usage_attach_detach, NULL, + NVMEADM_C_EXCL }, { "list-firmware", "list firmware on a controller", NULL, - do_get_logpage_fwslot, usage_firmware_list, B_FALSE + do_get_logpage_fwslot, usage_firmware_list, NULL, + 0 }, { "load-firmware", "load firmware to a controller", NULL, - do_firmware_load, usage_firmware_load, B_FALSE + do_firmware_load, usage_firmware_load, NULL, + 0 }, { "commit-firmware", "commit downloaded firmware to a slot of a controller", NULL, - do_firmware_commit, usage_firmware_commit, B_FALSE + do_firmware_commit, usage_firmware_commit, NULL, + 0 }, { "activate-firmware", "activate a firmware slot of a controller", NULL, - do_firmware_activate, usage_firmware_activate, B_FALSE + do_firmware_activate, usage_firmware_activate, NULL, + 0 }, { NULL, NULL, NULL, - NULL, NULL, B_FALSE + NULL, NULL, NULL, 0 } }; static const nvme_feature_t features[] = { { "Arbitration", "", - NVME_FEAT_ARBITRATION, 0, NVMEADM_CTRL, + NVME_FEAT_ARBITRATION, 0, NVMEADM_F_CTRL, do_get_feat_common, nvme_print_feat_arbitration }, { "Power Management", "", - NVME_FEAT_POWER_MGMT, 0, NVMEADM_CTRL, + NVME_FEAT_POWER_MGMT, 0, NVMEADM_F_CTRL, do_get_feat_common, nvme_print_feat_power_mgmt }, { "LBA Range Type", "range", - NVME_FEAT_LBA_RANGE, NVME_LBA_RANGE_BUFSIZE, NVMEADM_NS, + NVME_FEAT_LBA_RANGE, NVME_LBA_RANGE_BUFSIZE, NVMEADM_F_NS, do_get_feat_common, nvme_print_feat_lba_range }, { "Temperature Threshold", "", - NVME_FEAT_TEMPERATURE, 0, NVMEADM_CTRL, + NVME_FEAT_TEMPERATURE, 0, NVMEADM_F_CTRL, do_get_feat_temp_thresh, nvme_print_feat_temperature }, { "Error Recovery", "", - NVME_FEAT_ERROR, 0, NVMEADM_CTRL, + NVME_FEAT_ERROR, 0, NVMEADM_F_CTRL, do_get_feat_common, nvme_print_feat_error }, { "Volatile Write Cache", "cache", - NVME_FEAT_WRITE_CACHE, 0, NVMEADM_CTRL, + NVME_FEAT_WRITE_CACHE, 0, NVMEADM_F_CTRL, do_get_feat_common, nvme_print_feat_write_cache }, { "Number of Queues", "queues", - NVME_FEAT_NQUEUES, 0, NVMEADM_CTRL, + NVME_FEAT_NQUEUES, 0, NVMEADM_F_CTRL, do_get_feat_common, nvme_print_feat_nqueues }, { "Interrupt Coalescing", "coalescing", - NVME_FEAT_INTR_COAL, 0, NVMEADM_CTRL, + NVME_FEAT_INTR_COAL, 0, NVMEADM_F_CTRL, do_get_feat_common, nvme_print_feat_intr_coal }, { "Interrupt Vector Configuration", "vector", - NVME_FEAT_INTR_VECT, 0, NVMEADM_CTRL, + NVME_FEAT_INTR_VECT, 0, NVMEADM_F_CTRL, do_get_feat_intr_vect, nvme_print_feat_intr_vect }, { "Write Atomicity", "atomicity", - NVME_FEAT_WRITE_ATOM, 0, NVMEADM_CTRL, + NVME_FEAT_WRITE_ATOM, 0, NVMEADM_F_CTRL, do_get_feat_common, nvme_print_feat_write_atom }, { "Asynchronous Event Configuration", "event", - NVME_FEAT_ASYNC_EVENT, 0, NVMEADM_CTRL, + NVME_FEAT_ASYNC_EVENT, 0, NVMEADM_F_CTRL, do_get_feat_common, nvme_print_feat_async_event }, { "Autonomous Power State Transition", "", - NVME_FEAT_AUTO_PST, NVME_AUTO_PST_BUFSIZE, NVMEADM_CTRL, + NVME_FEAT_AUTO_PST, NVME_AUTO_PST_BUFSIZE, NVMEADM_F_CTRL, do_get_feat_common, nvme_print_feat_auto_pst }, { "Software Progress Marker", "progress", - NVME_FEAT_PROGRESS, 0, NVMEADM_CTRL, + NVME_FEAT_PROGRESS, 0, NVMEADM_F_CTRL, do_get_feat_common, nvme_print_feat_progress }, { NULL, NULL, 0, 0, B_FALSE, NULL } }; @@ -294,6 +309,7 @@ main(int argc, char **argv) npa.npa_cmd = cmd; npa.npa_interactive = B_TRUE; + npa.npa_excl = ((cmd->c_flags & NVMEADM_C_EXCL) != 0); optind++; @@ -334,7 +350,7 @@ main(int argc, char **argv) * aren't allowed to do that. */ if (ctrl != NULL && strchr(ctrl, ',') != NULL && - cmd->c_multi == B_FALSE) { + (cmd->c_flags & NVMEADM_C_MULTI) == 0) { warnx("%s not allowed on multiple controllers", cmd->c_name); usage(cmd); @@ -513,7 +529,7 @@ nvme_process(di_node_t node, di_minor_t minor, void *arg) if (!nvme_match(npa)) return (DI_WALK_CONTINUE); - if ((fd = nvme_open(minor)) < 0) + if ((fd = nvme_open(minor, npa->npa_excl)) < 0) return (DI_WALK_CONTINUE); npa->npa_found++; @@ -851,9 +867,9 @@ usage_get_features(const char *c_name) for (feat = &features[0]; feat->f_feature != 0; feat++) { char *type; - if ((feat->f_getflags & NVMEADM_BOTH) == NVMEADM_BOTH) + if ((feat->f_getflags & NVMEADM_F_BOTH) == NVMEADM_F_BOTH) type = "both"; - else if ((feat->f_getflags & NVMEADM_CTRL) != 0) + else if ((feat->f_getflags & NVMEADM_F_CTRL) != 0) type = "controller only"; else type = "namespace only"; @@ -1065,9 +1081,9 @@ do_get_features(int fd, const nvme_process_arg_t *npa) (void) printf("%s: Get Features\n", npa->npa_name); for (feat = &features[0]; feat->f_feature != 0; feat++) { if ((npa->npa_isns && - (feat->f_getflags & NVMEADM_NS) == 0) || + (feat->f_getflags & NVMEADM_F_NS) == 0) || (!npa->npa_isns && - (feat->f_getflags & NVMEADM_CTRL) == 0)) + (feat->f_getflags & NVMEADM_F_CTRL) == 0)) continue; (void) feat->f_get(fd, feat, npa); @@ -1101,11 +1117,12 @@ do_get_features(int fd, const nvme_process_arg_t *npa) } if ((npa->npa_isns && - (feat->f_getflags & NVMEADM_NS) == 0) || + (feat->f_getflags & NVMEADM_F_NS) == 0) || (!npa->npa_isns && - (feat->f_getflags & NVMEADM_CTRL) == 0)) { + (feat->f_getflags & NVMEADM_F_CTRL) == 0)) { warnx("feature %s %s supported for namespaces", - feat->f_name, (feat->f_getflags & NVMEADM_NS) != 0 ? + feat->f_name, + (feat->f_getflags & NVMEADM_F_NS) != 0 ? "only" : "not"); continue; } @@ -1248,6 +1265,7 @@ do_attach_detach(int fd, const nvme_process_arg_t *npa) ns_npa.npa_name = npa->npa_name; ns_npa.npa_isns = B_TRUE; ns_npa.npa_cmd = npa->npa_cmd; + ns_npa.npa_excl = npa->npa_excl; nvme_walk(&ns_npa, npa->npa_node); diff --git a/usr/src/cmd/nvmeadm/nvmeadm.h b/usr/src/cmd/nvmeadm/nvmeadm.h index e06cd93189..e6c16a8252 100644 --- a/usr/src/cmd/nvmeadm/nvmeadm.h +++ b/usr/src/cmd/nvmeadm/nvmeadm.h @@ -41,6 +41,7 @@ struct nvme_process_arg { char *npa_name; char *npa_nsid; int npa_found; + boolean_t npa_excl; boolean_t npa_isns; boolean_t npa_ignored; boolean_t npa_interactive; @@ -101,7 +102,7 @@ extern void nvme_print_feat_progress(uint64_t, void *, size_t, extern const char *nvme_fw_error(int, int); /* device node functions */ -extern int nvme_open(di_minor_t); +extern int nvme_open(di_minor_t, boolean_t); extern void nvme_close(int); extern nvme_version_t *nvme_version(int); extern nvme_capabilities_t *nvme_capabilities(int); diff --git a/usr/src/cmd/nvmeadm/nvmeadm_dev.c b/usr/src/cmd/nvmeadm/nvmeadm_dev.c index 6c837d0d8c..ce86a8d164 100644 --- a/usr/src/cmd/nvmeadm/nvmeadm_dev.c +++ b/usr/src/cmd/nvmeadm/nvmeadm_dev.c @@ -241,7 +241,7 @@ nvme_is_ignored_ns(int fd) } int -nvme_open(di_minor_t minor) +nvme_open(di_minor_t minor, boolean_t excl) { char *devpath, *path; int fd; @@ -256,7 +256,7 @@ nvme_open(di_minor_t minor) di_devfs_path_free(devpath); - fd = open(path, O_RDWR); + fd = open(path, O_RDWR | (excl ? O_EXCL: 0)); if (fd < 0) { if (debug) diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c index c1c808ed34..328dae2744 100644 --- a/usr/src/uts/common/io/nvme/nvme.c +++ b/usr/src/uts/common/io/nvme/nvme.c @@ -109,6 +109,17 @@ * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the * interface for the nvmeadm(8) utility. * + * Exclusive opens are required for certain ioctl(9E) operations that alter + * controller and/or namespace state. While different namespaces may be opened + * exclusively in parallel, an exclusive open of the controller minor node + * requires that no namespaces are currently open (exclusive or otherwise). + * Opening any namespace minor node (exclusive or otherwise) will fail while + * the controller minor node is opened exclusively by any other thread. Thus it + * is possible for one thread at a time to open the controller minor node + * exclusively, and keep it open while opening any namespace minor node of the + * same controller, exclusively or otherwise. + * + * * * Blkdev Interface: * @@ -194,8 +205,9 @@ * mutex is non-contentious but is required for implementation completeness * and safety. * - * Each minor node has its own nm_mutex, which protects the open count nm_ocnt - * and exclusive-open flag nm_oexcl. + * There is one mutex n_minor_mutex which protects all open flags nm_open and + * exclusive-open thread pointers nm_oexcl of each minor node associated with a + * controller and its namespaces. * * * Quiesce / Fast Reboot: @@ -3167,8 +3179,6 @@ nvme_init(nvme_t *nvme) nvme->n_namespace_count, KM_SLEEP); for (i = 0; i != nvme->n_namespace_count; i++) { - mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER, - NULL); nvme->n_ns[i].ns_ignore = B_TRUE; if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS) goto fail; @@ -3551,7 +3561,8 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) goto fail; } - mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL); + mutex_init(&nvme->n_minor_mutex, NULL, MUTEX_DRIVER, NULL); + nvme->n_progress |= NVME_MUTEX_INIT; nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE; @@ -3772,12 +3783,10 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) return (DDI_FAILURE); ddi_remove_minor_node(dip, "devctl"); - mutex_destroy(&nvme->n_minor.nm_mutex); if (nvme->n_ns) { for (i = 0; i != nvme->n_namespace_count; i++) { ddi_remove_minor_node(dip, nvme->n_ns[i].ns_name); - mutex_destroy(&nvme->n_ns[i].ns_minor.nm_mutex); if (nvme->n_ns[i].ns_bd_hdl) { (void) bd_detach_handle( @@ -3808,6 +3817,10 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) taskq_wait(nvme->n_cq[i]->ncq_cmd_taskq); } + if (nvme->n_progress & NVME_MUTEX_INIT) { + mutex_destroy(&nvme->n_minor_mutex); + } + if (nvme->n_ioq_count > 0) { for (i = 1; i != nvme->n_ioq_count + 1; i++) { if (nvme->n_ioq[i] != NULL) { @@ -4333,26 +4346,46 @@ nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) if (nvme->n_dead) return (EIO); - nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; + mutex_enter(&nvme->n_minor_mutex); - mutex_enter(&nm->nm_mutex); - if (nm->nm_oexcl) { + /* + * First check the devctl node and error out if it's been opened + * exclusively already by any other thread. + */ + if (nvme->n_minor.nm_oexcl != NULL && + nvme->n_minor.nm_oexcl != curthread) { rv = EBUSY; goto out; } + nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; + if (flag & FEXCL) { - if (nm->nm_ocnt != 0) { + if (nm->nm_oexcl != NULL || nm->nm_open) { rv = EBUSY; goto out; } - nm->nm_oexcl = B_TRUE; + + /* + * If at least one namespace is already open, fail the + * exclusive open of the devctl node. + */ + if (nsid == 0) { + for (int i = 0; i != nvme->n_namespace_count; i++) { + if (nvme->n_ns[i].ns_minor.nm_open) { + rv = EBUSY; + goto out; + } + } + } + + nm->nm_oexcl = curthread; } - nm->nm_ocnt++; + nm->nm_open = B_TRUE; out: - mutex_exit(&nm->nm_mutex); + mutex_exit(&nvme->n_minor_mutex); return (rv); } @@ -4380,13 +4413,15 @@ nvme_close(dev_t dev, int flag, int otyp, cred_t *cred_p) nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; - mutex_enter(&nm->nm_mutex); - if (nm->nm_oexcl) - nm->nm_oexcl = B_FALSE; + mutex_enter(&nvme->n_minor_mutex); + if (nm->nm_oexcl != NULL) { + ASSERT(nm->nm_oexcl == curthread); + nm->nm_oexcl = NULL; + } - ASSERT(nm->nm_ocnt > 0); - nm->nm_ocnt--; - mutex_exit(&nm->nm_mutex); + ASSERT(nm->nm_open); + nm->nm_open = B_FALSE; + mutex_exit(&nvme->n_minor_mutex); return (0); } @@ -4767,10 +4802,15 @@ nvme_ioctl_format(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, _NOTE(ARGUNUSED(mode)); nvme_format_nvm_t frmt = { 0 }; int c_nsid = nsid != 0 ? nsid - 1 : 0; + nvme_minor_state_t *nm; if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) return (EPERM); + nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[c_nsid].ns_minor; + if (nm->nm_oexcl != curthread) + return (EACCES); + frmt.r = nioc->n_arg & 0xffffffff; /* @@ -4832,6 +4872,9 @@ nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, if (nsid == 0) return (EINVAL); + if (nvme->n_ns[nsid - 1].ns_minor.nm_oexcl != curthread) + return (EACCES); + if (nvme->n_ns[nsid - 1].ns_ignore) return (0); @@ -4856,6 +4899,9 @@ nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, if (nsid == 0) return (EINVAL); + if (nvme->n_ns[nsid - 1].ns_minor.nm_oexcl != curthread) + return (EACCES); + /* * Identify namespace again, free old identify data. */ diff --git a/usr/src/uts/common/io/nvme/nvme_var.h b/usr/src/uts/common/io/nvme/nvme_var.h index 75039d4bc7..aa351d863f 100644 --- a/usr/src/uts/common/io/nvme/nvme_var.h +++ b/usr/src/uts/common/io/nvme/nvme_var.h @@ -10,12 +10,11 @@ */ /* - * Copyright 2018 Nexenta Systems, Inc. * Copyright 2016 The MathWorks, Inc. All rights reserved. * Copyright 2019 Joyent, Inc. - * Copyright 2019 Western Digital Corporation. * Copyright 2021 Oxide Computer Company. * Copyright 2022 OmniOS Community Edition (OmniOSce) Association. + * Copyright 2022 Tintri by DDN, Inc. All rights reserved. */ #ifndef _NVME_VAR_H @@ -41,6 +40,7 @@ extern "C" { #define NVME_CTRL_LIMITS 0x8 #define NVME_INTERRUPTS 0x10 #define NVME_UFM_INIT 0x20 +#define NVME_MUTEX_INIT 0x40 #define NVME_MIN_ADMIN_QUEUE_LEN 16 #define NVME_MIN_IO_QUEUE_LEN 16 @@ -61,9 +61,8 @@ typedef struct nvme_qpair nvme_qpair_t; typedef struct nvme_task_arg nvme_task_arg_t; struct nvme_minor_state { - kmutex_t nm_mutex; - boolean_t nm_oexcl; - uint_t nm_ocnt; + kthread_t *nm_oexcl; + boolean_t nm_open; }; struct nvme_dma { @@ -214,6 +213,9 @@ struct nvme { ksema_t n_abort_sema; + /* protects minor node operations */ + kmutex_t n_minor_mutex; + /* state for devctl minor node */ nvme_minor_state_t n_minor; |