diff options
Diffstat (limited to 'usr/src')
-rw-r--r-- | usr/src/cmd/Makefile | 1 | ||||
-rw-r--r-- | usr/src/cmd/nvmeadm/Makefile | 43 | ||||
-rw-r--r-- | usr/src/cmd/nvmeadm/nvmeadm.c | 1005 | ||||
-rw-r--r-- | usr/src/cmd/nvmeadm/nvmeadm.h | 87 | ||||
-rw-r--r-- | usr/src/cmd/nvmeadm/nvmeadm_dev.c | 201 | ||||
-rw-r--r-- | usr/src/cmd/nvmeadm/nvmeadm_print.c | 1138 | ||||
-rw-r--r-- | usr/src/man/man1m/Makefile | 3 | ||||
-rw-r--r-- | usr/src/man/man1m/nvmeadm.1m | 410 | ||||
-rw-r--r-- | usr/src/pkg/manifests/driver-storage-nvme.mf | 3 | ||||
-rw-r--r-- | usr/src/uts/common/io/blkdev/blkdev.c | 16 | ||||
-rw-r--r-- | usr/src/uts/common/io/nvme/nvme.c | 1059 | ||||
-rw-r--r-- | usr/src/uts/common/io/nvme/nvme_reg.h | 341 | ||||
-rw-r--r-- | usr/src/uts/common/io/nvme/nvme_var.h | 24 | ||||
-rw-r--r-- | usr/src/uts/common/sys/Makefile | 3 | ||||
-rw-r--r-- | usr/src/uts/common/sys/nvme.h | 574 | ||||
-rw-r--r-- | usr/src/uts/common/sys/sunddi.h | 5 |
16 files changed, 4451 insertions, 462 deletions
diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile index 08841eb06d..e33e3643f9 100644 --- a/usr/src/cmd/Makefile +++ b/usr/src/cmd/Makefile @@ -480,6 +480,7 @@ i386_SUBDIRS= \ addbadsec \ biosdev \ diskscan \ + nvmeadm \ rtc \ ucodeadm \ xvm diff --git a/usr/src/cmd/nvmeadm/Makefile b/usr/src/cmd/nvmeadm/Makefile new file mode 100644 index 0000000000..c042d4075f --- /dev/null +++ b/usr/src/cmd/nvmeadm/Makefile @@ -0,0 +1,43 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2015 Nexenta Systems, Inc. +# + + +PROG= nvmeadm + +OBJS= nvmeadm.o nvmeadm_dev.o nvmeadm_print.o +SRCS= $(OBJS:%.o=%.c) + +include ../Makefile.cmd + +.KEEP_STATE: + +CFLAGS += $(CCVERBOSE) +LDLIBS += -ldevinfo +C99MODE= $(C99_ENABLE) + +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTUSRSBINPROG) + +clean: + $(RM) $(OBJS) $(PROG) + +lint: lint_SRCS + +include ../Makefile.targ diff --git a/usr/src/cmd/nvmeadm/nvmeadm.c b/usr/src/cmd/nvmeadm/nvmeadm.c new file mode 100644 index 0000000000..13cace3ead --- /dev/null +++ b/usr/src/cmd/nvmeadm/nvmeadm.c @@ -0,0 +1,1005 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Nexenta Systems, Inc. 
+ */ + +/* + * nvmeadm -- NVMe administration utility + * + * nvmeadm [-v] [-d] [-h] <command> [<ctl>[/<ns>][,...]] [args] + * commands: list + * identify + * get-logpage <logpage name> + * get-features <feature>[,...] + * format ... + * secure-erase ... + * detach ... + * attach ... + * get-param ... + * set-param ... + * load-firmware ... + * activate-firmware ... + * write-uncorrectable ... + * compare ... + * compare-and-write ... + */ + +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <ctype.h> +#include <err.h> +#include <sys/sunddi.h> +#include <libdevinfo.h> + +#include <sys/nvme.h> + +#include "nvmeadm.h" + +typedef struct nvme_process_arg nvme_process_arg_t; +typedef struct nvme_feature nvme_feature_t; +typedef struct nvmeadm_cmd nvmeadm_cmd_t; + +struct nvme_process_arg { + int npa_argc; + char **npa_argv; + char *npa_name; + uint32_t npa_nsid; + boolean_t npa_isns; + const nvmeadm_cmd_t *npa_cmd; + di_node_t npa_node; + di_minor_t npa_minor; + char *npa_path; + char *npa_dsk; + nvme_identify_ctrl_t *npa_idctl; + nvme_identify_nsid_t *npa_idns; + nvme_version_t *npa_version; +}; + +struct nvme_feature { + char *f_name; + char *f_short; + uint8_t f_feature; + size_t f_bufsize; + uint_t f_getflags; + int (*f_get)(int, const nvme_feature_t *, nvme_identify_ctrl_t *); + void (*f_print)(uint64_t, void *, size_t, nvme_identify_ctrl_t *); +}; + +#define NVMEADM_CTRL 1 +#define NVMEADM_NS 2 +#define NVMEADM_BOTH (NVMEADM_CTRL | NVMEADM_NS) + +struct nvmeadm_cmd { + char *c_name; + char *c_desc; + char *c_flagdesc; + int (*c_func)(int, const nvme_process_arg_t *); + void (*c_usage)(const char *); + boolean_t c_multi; +}; + + +static void usage(const nvmeadm_cmd_t *); +static void nvme_walk(nvme_process_arg_t *, di_node_t); +static boolean_t nvme_match(nvme_process_arg_t *); + +static int nvme_process(di_node_t, di_minor_t, void *); + +static int do_list(int, const nvme_process_arg_t *); +static int do_identify(int, const nvme_process_arg_t 
*); +static int do_get_logpage_error(int, const nvme_process_arg_t *); +static int do_get_logpage_health(int, const nvme_process_arg_t *); +static int do_get_logpage_fwslot(int, const nvme_process_arg_t *); +static int do_get_logpage(int, const nvme_process_arg_t *); +static int do_get_feat_common(int, const nvme_feature_t *, + nvme_identify_ctrl_t *); +static int do_get_feat_intr_vect(int, const nvme_feature_t *, + nvme_identify_ctrl_t *); +static int do_get_features(int, const nvme_process_arg_t *); +static int do_format(int, const nvme_process_arg_t *); +static int do_secure_erase(int, const nvme_process_arg_t *); +static int do_attach_detach(int, const nvme_process_arg_t *); + +static void usage_list(const char *); +static void usage_identify(const char *); +static void usage_get_logpage(const char *); +static void usage_get_features(const char *); +static void usage_format(const char *); +static void usage_secure_erase(const char *); +static void usage_attach_detach(const char *); + +int verbose; +int debug; +int found; +static int exitcode; + +static const nvmeadm_cmd_t nvmeadm_cmds[] = { + { + "list", + "list controllers and namespaces", + NULL, + do_list, usage_list, B_TRUE + }, + { + "identify", + "identify controllers and/or namespaces", + NULL, + do_identify, usage_identify, B_TRUE + }, + { + "get-logpage", + "get a log page from controllers and/or namespaces", + NULL, + do_get_logpage, usage_get_logpage, B_TRUE + }, + { + "get-features", + "get features from controllers and/or namespaces", + NULL, + do_get_features, usage_get_features, B_TRUE + }, + { + "format", + "format namespace(s) of a controller", + NULL, + do_format, usage_format, B_FALSE + }, + { + "secure-erase", + "secure erase namespace(s) of a controller", + " -c Do a cryptographic erase.", + do_secure_erase, usage_secure_erase, B_FALSE + }, + { + "detach", + "detach blkdev(7d) from namespace(s) of a controller", + NULL, + do_attach_detach, usage_attach_detach, B_FALSE + }, + { + "attach", + 
"attach blkdev(7d) to namespace(s) of a controller", + NULL, + do_attach_detach, usage_attach_detach, B_FALSE + }, + { + NULL, NULL, NULL, + NULL, NULL, B_FALSE + } +}; + +static const nvme_feature_t features[] = { + { "Arbitration", "", + NVME_FEAT_ARBITRATION, 0, NVMEADM_CTRL, + do_get_feat_common, nvme_print_feat_arbitration }, + { "Power Management", "", + NVME_FEAT_POWER_MGMT, 0, NVMEADM_CTRL, + do_get_feat_common, nvme_print_feat_power_mgmt }, + { "LBA Range Type", "range", + NVME_FEAT_LBA_RANGE, NVME_LBA_RANGE_BUFSIZE, NVMEADM_NS, + do_get_feat_common, nvme_print_feat_lba_range }, + { "Temperature Threshold", "", + NVME_FEAT_TEMPERATURE, 0, NVMEADM_CTRL, + do_get_feat_common, nvme_print_feat_temperature }, + { "Error Recovery", "", + NVME_FEAT_ERROR, 0, NVMEADM_CTRL, + do_get_feat_common, nvme_print_feat_error }, + { "Volatile Write Cache", "cache", + NVME_FEAT_WRITE_CACHE, 0, NVMEADM_CTRL, + do_get_feat_common, nvme_print_feat_write_cache }, + { "Number of Queues", "queues", + NVME_FEAT_NQUEUES, 0, NVMEADM_CTRL, + do_get_feat_common, nvme_print_feat_nqueues }, + { "Interrupt Coalescing", "coalescing", + NVME_FEAT_INTR_COAL, 0, NVMEADM_CTRL, + do_get_feat_common, nvme_print_feat_intr_coal }, + { "Interrupt Vector Configuration", "vector", + NVME_FEAT_INTR_VECT, 0, NVMEADM_CTRL, + do_get_feat_intr_vect, nvme_print_feat_intr_vect }, + { "Write Atomicity", "atomicity", + NVME_FEAT_WRITE_ATOM, 0, NVMEADM_CTRL, + do_get_feat_common, nvme_print_feat_write_atom }, + { "Asynchronous Event Configuration", "event", + NVME_FEAT_ASYNC_EVENT, 0, NVMEADM_CTRL, + do_get_feat_common, nvme_print_feat_async_event }, + { "Autonomous Power State Transition", "", + NVME_FEAT_AUTO_PST, NVME_AUTO_PST_BUFSIZE, NVMEADM_CTRL, + do_get_feat_common, nvme_print_feat_auto_pst }, + { "Software Progress Marker", "progress", + NVME_FEAT_PROGRESS, 0, NVMEADM_CTRL, + do_get_feat_common, nvme_print_feat_progress }, + { NULL, NULL, 0, 0, B_FALSE, NULL } +}; + + +int +main(int argc, char **argv) 
+{ + int c; + extern int optind; + const nvmeadm_cmd_t *cmd; + di_node_t node; + nvme_process_arg_t npa = { 0 }; + int help = 0; + char *tmp, *lasts = NULL; + + while ((c = getopt(argc, argv, "dhv")) != -1) { + switch (c) { + case 'd': + debug++; + break; + case 'v': + verbose++; + break; + case 'h': + help++; + break; + case '?': + usage(NULL); + exit(-1); + } + } + + if (optind == argc) { + usage(NULL); + if (help) + exit(0); + else + exit(-1); + } + + /* Look up the specified command in the command table. */ + for (cmd = &nvmeadm_cmds[0]; cmd->c_name != NULL; cmd++) + if (strcmp(cmd->c_name, argv[optind]) == 0) + break; + + if (cmd->c_name == NULL) { + usage(NULL); + exit(-1); + } + + if (help) { + usage(cmd); + exit(0); + } + + npa.npa_cmd = cmd; + + optind++; + + /* + * All commands but "list" require a ctl/ns argument. + */ + if ((optind == argc || (strncmp(argv[optind], "nvme", 4) != 0)) && + cmd->c_func != do_list) { + warnx("missing controller/namespace name"); + usage(cmd); + exit(-1); + } + + + /* Store the remaining arguments for use by the command. */ + npa.npa_argc = argc - optind - 1; + npa.npa_argv = &argv[optind + 1]; + + /* + * Make sure we're not running commands on multiple controllers that + * aren't allowed to do that. + */ + if (argv[optind] != NULL && strchr(argv[optind], ',') != NULL && + cmd->c_multi == B_FALSE) { + warnx("%s not allowed on multiple controllers", + cmd->c_name); + usage(cmd); + exit(-1); + } + + /* + * Get controller/namespace arguments and run command. 
+ */ + npa.npa_name = strtok_r(argv[optind], ",", &lasts); + do { + if (npa.npa_name != NULL) { + tmp = strchr(npa.npa_name, '/'); + if (tmp != NULL) { + unsigned long nsid; + *tmp++ = '\0'; + errno = 0; + nsid = strtoul(tmp, NULL, 10); + if (nsid >= UINT32_MAX || errno != 0) { + warn("invalid namespace %s", tmp); + exitcode--; + continue; + } + if (nsid == 0) { + warnx("invalid namespace %s", tmp); + exitcode--; + continue; + } + npa.npa_nsid = nsid; + npa.npa_isns = B_TRUE; + } + } + + if ((node = di_init("/", DINFOSUBTREE | DINFOMINOR)) == NULL) + err(-1, "failed to initialize libdevinfo"); + nvme_walk(&npa, node); + di_fini(node); + + if (found == 0) { + if (npa.npa_name != NULL) { + warnx("%s%.*s%.*d: no such controller or " + "namespace", npa.npa_name, + npa.npa_nsid > 0 ? -1 : 0, "/", + npa.npa_nsid > 0 ? -1 : 0, npa.npa_nsid); + } else { + warnx("no controllers found"); + } + exitcode--; + } + found = 0; + npa.npa_name = strtok_r(NULL, ",", &lasts); + } while (npa.npa_name != NULL); + + exit(exitcode); +} + +static void +usage(const nvmeadm_cmd_t *cmd) +{ + (void) fprintf(stderr, "usage:\n"); + (void) fprintf(stderr, " %s -h %s\n", getprogname(), + cmd != NULL ? cmd->c_name : "[<command>]"); + (void) fprintf(stderr, " %s [-dv] ", getprogname()); + + if (cmd != NULL) { + cmd->c_usage(cmd->c_name); + } else { + (void) fprintf(stderr, + "<command> <ctl>[/<ns>][,...] 
[<args>]\n"); + (void) fprintf(stderr, + "\n Manage NVMe controllers and namespaces.\n"); + (void) fprintf(stderr, "\ncommands:\n"); + + for (cmd = &nvmeadm_cmds[0]; cmd->c_name != NULL; cmd++) + (void) fprintf(stderr, " %-15s - %s\n", + cmd->c_name, cmd->c_desc); + } + (void) fprintf(stderr, "\nflags:\n" + " -h print usage information\n" + " -d print information useful for debugging %s\n" + " -v print verbose information\n", getprogname()); + if (cmd != NULL && cmd->c_flagdesc != NULL) + (void) fprintf(stderr, "%s\n", cmd->c_flagdesc); +} + +static boolean_t +nvme_match(nvme_process_arg_t *npa) +{ + char *name; + uint32_t nsid = 0; + + if (npa->npa_name == NULL) + return (B_TRUE); + + if (asprintf(&name, "%s%d", di_driver_name(npa->npa_node), + di_instance(npa->npa_node)) < 0) + err(-1, "nvme_match()"); + + if (strcmp(name, npa->npa_name) != 0) { + free(name); + return (B_FALSE); + } + + free(name); + + if (npa->npa_isns) { + if (npa->npa_nsid == 0) + return (B_TRUE); + nsid = strtoul(di_minor_name(npa->npa_minor), NULL, 10); + } + + if (npa->npa_isns && npa->npa_nsid != nsid) + return (B_FALSE); + + return (B_TRUE); +} + +char * +nvme_dskname(const nvme_process_arg_t *npa) +{ + char *path = NULL; + di_node_t child; + di_dim_t dim; + char *addr; + + dim = di_dim_init(); + + for (child = di_child_node(npa->npa_node); + child != DI_NODE_NIL; + child = di_sibling_node(child)) { + addr = di_bus_addr(child); + if (addr == NULL) + continue; + + if (addr[0] == 'w') + addr++; + + if (strncasecmp(addr, di_minor_name(npa->npa_minor), + strchrnul(addr, ',') - addr) != 0) + continue; + + path = di_dim_path_dev(dim, di_driver_name(child), + di_instance(child), "c"); + + if (path != NULL) { + path[strlen(path) - 2] = '\0'; + path = strrchr(path, '/') + 1; + if (path != NULL) { + path = strdup(path); + if (path == NULL) + err(-1, "nvme_dskname"); + } + } + + break; + } + + di_dim_fini(dim); + return (path); +} + +static int +nvme_process(di_node_t node, di_minor_t minor, void 
*arg) +{ + nvme_process_arg_t *npa = arg; + int fd; + + npa->npa_node = node; + npa->npa_minor = minor; + + if (!nvme_match(npa)) + return (DI_WALK_CONTINUE); + + if ((fd = nvme_open(minor)) < 0) + return (DI_WALK_CONTINUE); + + found++; + + npa->npa_path = di_devfs_path(node); + if (npa->npa_path == NULL) + goto out; + + npa->npa_version = nvme_version(fd); + if (npa->npa_version == NULL) + goto out; + + npa->npa_idctl = nvme_identify_ctrl(fd); + if (npa->npa_idctl == NULL) + goto out; + + npa->npa_idns = nvme_identify_nsid(fd); + if (npa->npa_idns == NULL) + goto out; + + if (npa->npa_isns) + npa->npa_dsk = nvme_dskname(npa); + + exitcode += npa->npa_cmd->c_func(fd, npa); + +out: + di_devfs_path_free(npa->npa_path); + free(npa->npa_dsk); + free(npa->npa_version); + free(npa->npa_idctl); + free(npa->npa_idns); + + npa->npa_version = NULL; + npa->npa_idctl = NULL; + npa->npa_idns = NULL; + + nvme_close(fd); + + return (DI_WALK_CONTINUE); +} + +static void +nvme_walk(nvme_process_arg_t *npa, di_node_t node) +{ + char *minor_nodetype = DDI_NT_NVME_NEXUS; + + if (npa->npa_isns) + minor_nodetype = DDI_NT_NVME_ATTACHMENT_POINT; + + (void) di_walk_minor(node, minor_nodetype, 0, npa, nvme_process); +} + +static void +usage_list(const char *c_name) +{ + (void) fprintf(stderr, "%s [<ctl>[/<ns>][,...]\n\n" + " List NVMe controllers and their namespaces. If no " + "controllers and/or name-\n spaces are specified, all " + "controllers and namespaces in the system will be\n " + "listed.\n", c_name); +} + +static int +do_list_nsid(int fd, const nvme_process_arg_t *npa) +{ + _NOTE(ARGUNUSED(fd)); + + (void) printf(" %s/%s (%s): ", npa->npa_name, + di_minor_name(npa->npa_minor), + npa->npa_dsk != NULL ? 
npa->npa_dsk : "unattached"); + nvme_print_nsid_summary(npa->npa_idns); + + return (0); +} + +static int +do_list(int fd, const nvme_process_arg_t *npa) +{ + _NOTE(ARGUNUSED(fd)); + + nvme_process_arg_t ns_npa = { 0 }; + nvmeadm_cmd_t cmd = { 0 }; + char *name; + + if (asprintf(&name, "%s%d", di_driver_name(npa->npa_node), + di_instance(npa->npa_node)) < 0) + err(-1, "do_list()"); + + (void) printf("%s: ", name); + nvme_print_ctrl_summary(npa->npa_idctl, npa->npa_version); + + ns_npa.npa_name = name; + ns_npa.npa_isns = B_TRUE; + ns_npa.npa_nsid = npa->npa_nsid; + cmd = *(npa->npa_cmd); + cmd.c_func = do_list_nsid; + ns_npa.npa_cmd = &cmd; + + nvme_walk(&ns_npa, npa->npa_node); + + free(name); + + return (exitcode); +} + +static void +usage_identify(const char *c_name) +{ + (void) fprintf(stderr, "%s <ctl>[/<ns>][,...]\n\n" + " Print detailed information about the specified NVMe " + "controllers and/or name-\n spaces.\n", c_name); +} + +static int +do_identify(int fd, const nvme_process_arg_t *npa) +{ + if (npa->npa_nsid == 0) { + nvme_capabilities_t *cap; + + cap = nvme_capabilities(fd); + if (cap == NULL) + return (-1); + + (void) printf("%s: ", npa->npa_name); + nvme_print_identify_ctrl(npa->npa_idctl, cap, + npa->npa_version); + + free(cap); + } else { + (void) printf("%s/%s: ", npa->npa_name, + di_minor_name(npa->npa_minor)); + nvme_print_identify_nsid(npa->npa_idns, + npa->npa_version); + } + + return (0); +} + +static void +usage_get_logpage(const char *c_name) +{ + (void) fprintf(stderr, "%s <ctl>[/<ns>][,...] <logpage>\n\n" + " Print the specified log page of the specified NVMe " + "controllers and/or name-\n spaces. 
Supported log pages " + "are error, health, and firmware.\n", c_name); +} + +static int +do_get_logpage_error(int fd, const nvme_process_arg_t *npa) +{ + int nlog = npa->npa_idctl->id_elpe + 1; + size_t bufsize = sizeof (nvme_error_log_entry_t) * nlog; + nvme_error_log_entry_t *elog; + + if (npa->npa_nsid != 0) + errx(-1, "Error Log not available on a per-namespace basis"); + + elog = nvme_get_logpage(fd, NVME_LOGPAGE_ERROR, &bufsize); + + if (elog == NULL) + return (-1); + + nlog = bufsize / sizeof (nvme_error_log_entry_t); + + (void) printf("%s: ", npa->npa_name); + nvme_print_error_log(nlog, elog); + + free(elog); + + return (0); +} + +static int +do_get_logpage_health(int fd, const nvme_process_arg_t *npa) +{ + size_t bufsize = sizeof (nvme_health_log_t); + nvme_health_log_t *hlog; + + if (npa->npa_nsid != 0) { + if (npa->npa_idctl->id_lpa.lp_smart == 0) + errx(-1, "SMART/Health information not available " + "on a per-namespace basis on this controller"); + } + + hlog = nvme_get_logpage(fd, NVME_LOGPAGE_HEALTH, &bufsize); + + if (hlog == NULL) + return (-1); + + (void) printf("%s: ", npa->npa_name); + nvme_print_health_log(hlog, npa->npa_idctl); + + free(hlog); + + return (0); +} + +static int +do_get_logpage_fwslot(int fd, const nvme_process_arg_t *npa) +{ + size_t bufsize = sizeof (nvme_fwslot_log_t); + nvme_fwslot_log_t *fwlog; + + if (npa->npa_nsid != 0) + errx(-1, "Firmware Slot information not available on a " + "per-namespace basis"); + + fwlog = nvme_get_logpage(fd, NVME_LOGPAGE_FWSLOT, &bufsize); + + if (fwlog == NULL) + return (-1); + + (void) printf("%s: ", npa->npa_name); + nvme_print_fwslot_log(fwlog); + + free(fwlog); + + return (0); +} + +static int +do_get_logpage(int fd, const nvme_process_arg_t *npa) +{ + int ret = 0; + int (*func)(int, const nvme_process_arg_t *); + + if (npa->npa_argc < 1) { + warnx("missing logpage name"); + usage(npa->npa_cmd); + exit(-1); + } + + if (strcmp(npa->npa_argv[0], "error") == 0) + func = do_get_logpage_error; + 
else if (strcmp(npa->npa_argv[0], "health") == 0) + func = do_get_logpage_health; + else if (strcmp(npa->npa_argv[0], "firmware") == 0) + func = do_get_logpage_fwslot; + else + errx(-1, "invalid log page: %s", npa->npa_argv[0]); + + ret = func(fd, npa); + return (ret); +} + +static void +usage_get_features(const char *c_name) +{ + const nvme_feature_t *feat; + + (void) fprintf(stderr, "%s <ctl>[/<ns>][,...] [<feature>[,...]]\n\n" + " Print the specified features of the specified NVMe controllers " + "and/or\n namespaces. Supported features are:\n\n", c_name); + (void) fprintf(stderr, " %-35s %-14s %s\n", + "FEATURE NAME", "SHORT NAME", "CONTROLLER/NAMESPACE"); + for (feat = &features[0]; feat->f_feature != 0; feat++) { + char *type; + + if ((feat->f_getflags & NVMEADM_BOTH) == NVMEADM_BOTH) + type = "both"; + else if ((feat->f_getflags & NVMEADM_CTRL) != 0) + type = "controller only"; + else + type = "namespace only"; + + (void) fprintf(stderr, " %-35s %-14s %s\n", + feat->f_name, feat->f_short, type); + } + +} + +static int +do_get_feat_common(int fd, const nvme_feature_t *feat, + nvme_identify_ctrl_t *idctl) +{ + void *buf = NULL; + size_t bufsize = feat->f_bufsize; + uint64_t res; + + if (nvme_get_feature(fd, feat->f_feature, 0, &res, &bufsize, &buf) + == B_FALSE) + return (EINVAL); + + nvme_print(2, feat->f_name, -1, NULL); + feat->f_print(res, buf, bufsize, idctl); + free(buf); + + return (0); +} + +static int +do_get_feat_intr_vect(int fd, const nvme_feature_t *feat, + nvme_identify_ctrl_t *idctl) +{ + uint64_t res; + uint64_t arg; + int intr_cnt; + + intr_cnt = nvme_intr_cnt(fd); + + if (intr_cnt == -1) + return (EINVAL); + + nvme_print(2, feat->f_name, -1, NULL); + + for (arg = 0; arg < intr_cnt; arg++) { + if (nvme_get_feature(fd, feat->f_feature, arg, &res, NULL, NULL) + == B_FALSE) + return (EINVAL); + + feat->f_print(res, NULL, 0, idctl); + } + + return (0); +} + +static int +do_get_features(int fd, const nvme_process_arg_t *npa) +{ + const 
nvme_feature_t *feat; + char *f, *flist, *lasts; + boolean_t header_printed = B_FALSE; + + if (npa->npa_argc > 1) + errx(-1, "unexpected arguments"); + + /* + * No feature list given, print all supported features. + */ + if (npa->npa_argc == 0) { + (void) printf("%s: Get Features\n", npa->npa_name); + for (feat = &features[0]; feat->f_feature != 0; feat++) { + if ((npa->npa_nsid != 0 && + (feat->f_getflags & NVMEADM_NS) == 0) || + (npa->npa_nsid == 0 && + (feat->f_getflags & NVMEADM_CTRL) == 0)) + continue; + + (void) feat->f_get(fd, feat, npa->npa_idctl); + } + + return (0); + } + + /* + * Process feature list. + */ + flist = strdup(npa->npa_argv[0]); + if (flist == NULL) + err(-1, "do_get_features"); + + for (f = strtok_r(flist, ",", &lasts); + f != NULL; + f = strtok_r(NULL, ",", &lasts)) { + while (isspace(*f)) + f++; + + for (feat = &features[0]; feat->f_feature != 0; feat++) { + if (strncasecmp(feat->f_name, f, strlen(f)) == 0 || + strncasecmp(feat->f_short, f, strlen(f)) == 0) + break; + } + + if (feat->f_feature == 0) { + warnx("unknown feature %s", f); + continue; + } + + if ((npa->npa_nsid != 0 && + (feat->f_getflags & NVMEADM_NS) == 0) || + (npa->npa_nsid == 0 && + (feat->f_getflags & NVMEADM_CTRL) == 0)) { + warnx("feature %s %s supported for namespaces", + feat->f_name, (feat->f_getflags & NVMEADM_NS) != 0 ? 
+ "only" : "not"); + continue; + } + + if (!header_printed) { + (void) printf("%s: Get Features\n", npa->npa_name); + header_printed = B_TRUE; + } + + if (feat->f_get(fd, feat, npa->npa_idctl) != 0) { + warnx("unsupported feature: %s", feat->f_name); + continue; + } + } + + free(flist); + return (0); +} + +static int +do_format_common(int fd, const nvme_process_arg_t *npa, unsigned long lbaf, + unsigned long ses) +{ + nvme_process_arg_t ns_npa = { 0 }; + nvmeadm_cmd_t cmd = { 0 }; + + cmd = *(npa->npa_cmd); + cmd.c_func = do_attach_detach; + cmd.c_name = "detach"; + ns_npa = *npa; + ns_npa.npa_cmd = &cmd; + + if (do_attach_detach(fd, &ns_npa) != 0) + return (exitcode); + if (nvme_format_nvm(fd, lbaf, ses) == B_FALSE) { + warn("%s failed", npa->npa_cmd->c_name); + exitcode += -1; + } + cmd.c_name = "attach"; + exitcode += do_attach_detach(fd, &ns_npa); + + return (exitcode); +} + +static void +usage_format(const char *c_name) +{ + (void) fprintf(stderr, "%s <ctl>[/<ns>] [<lba-format>]\n\n" + " Format one or all namespaces of the specified NVMe " + "controller. 
Supported LBA\n formats can be queried with " + "the \"%s identify\" command on the namespace\n to be " + "formatted.\n", c_name, getprogname()); +} + +static int +do_format(int fd, const nvme_process_arg_t *npa) +{ + unsigned long lbaf; + + if (npa->npa_idctl->id_oacs.oa_format == 0) + errx(-1, "%s not supported", npa->npa_cmd->c_name); + + if (npa->npa_isns && npa->npa_idctl->id_fna.fn_format != 0) + errx(-1, "%s not supported on individual namespace", + npa->npa_cmd->c_name); + + + if (npa->npa_argc > 0) { + errno = 0; + lbaf = strtoul(npa->npa_argv[0], NULL, 10); + + if (errno != 0 || lbaf > NVME_FRMT_MAX_LBAF) + errx(-1, "invalid LBA format %d", lbaf + 1); + + if (npa->npa_idns->id_lbaf[lbaf].lbaf_ms != 0) + errx(-1, "LBA formats with metadata not supported"); + } else { + lbaf = npa->npa_idns->id_flbas.lba_format; + } + + return (do_format_common(fd, npa, lbaf, 0)); +} + +static void +usage_secure_erase(const char *c_name) +{ + (void) fprintf(stderr, "%s <ctl>[/<ns>] [-c]\n\n" + " Secure-Erase one or all namespaces of the specified " + "NVMe controller.\n", c_name); +} + +static int +do_secure_erase(int fd, const nvme_process_arg_t *npa) +{ + unsigned long lbaf; + uint8_t ses = NVME_FRMT_SES_USER; + + if (npa->npa_idctl->id_oacs.oa_format == 0) + errx(-1, "%s not supported", npa->npa_cmd->c_name); + + if (npa->npa_isns && npa->npa_idctl->id_fna.fn_sec_erase != 0) + errx(-1, "%s not supported on individual namespace", + npa->npa_cmd->c_name); + + if (npa->npa_argc > 0) { + if (strcmp(npa->npa_argv[0], "-c") == 0) + ses = NVME_FRMT_SES_CRYPTO; + else + usage(npa->npa_cmd); + } + + if (ses == NVME_FRMT_SES_CRYPTO && + npa->npa_idctl->id_fna.fn_crypt_erase == 0) + errx(-1, "cryptographic %s not supported", + npa->npa_cmd->c_name); + + lbaf = npa->npa_idns->id_flbas.lba_format; + + return (do_format_common(fd, npa, lbaf, ses)); +} + +static void +usage_attach_detach(const char *c_name) +{ + (void) fprintf(stderr, "%s <ctl>[/<ns>]\n\n" + " %c%s blkdev(7d) %s one or 
all namespaces of the " + "specified NVMe controller.\n", + c_name, toupper(c_name[0]), &c_name[1], + c_name[0] == 'd' ? "from" : "to"); +} + +static int +do_attach_detach(int fd, const nvme_process_arg_t *npa) +{ + char *c_name = npa->npa_cmd->c_name; + + if (!npa->npa_isns) { + nvme_process_arg_t ns_npa = { 0 }; + + ns_npa.npa_name = npa->npa_name; + ns_npa.npa_isns = B_TRUE; + ns_npa.npa_cmd = npa->npa_cmd; + + nvme_walk(&ns_npa, npa->npa_node); + + return (exitcode); + } else { + if ((c_name[0] == 'd' ? nvme_detach : nvme_attach)(fd) + == B_FALSE) { + warn("%s failed", c_name); + return (-1); + } + } + + return (0); +} diff --git a/usr/src/cmd/nvmeadm/nvmeadm.h b/usr/src/cmd/nvmeadm/nvmeadm.h new file mode 100644 index 0000000000..4464350ace --- /dev/null +++ b/usr/src/cmd/nvmeadm/nvmeadm.h @@ -0,0 +1,87 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Nexenta Systems, Inc. 
+ */ + +#ifndef _NVMEADM_H +#define _NVMEADM_H + +#include <stdio.h> +#include <libdevinfo.h> +#include <sys/nvme.h> + +#ifdef __cplusplus +extern "C" { +#endif + +extern int verbose; +extern int debug; + +/* printing functions */ +extern void nvme_print(int, char *, int, const char *, ...); +extern void nvme_print_ctrl_summary(nvme_identify_ctrl_t *, nvme_version_t *); +extern void nvme_print_nsid_summary(nvme_identify_nsid_t *); +extern void nvme_print_identify_ctrl(nvme_identify_ctrl_t *, + nvme_capabilities_t *, nvme_version_t *); +extern void nvme_print_identify_nsid(nvme_identify_nsid_t *, nvme_version_t *); +extern void nvme_print_error_log(int, nvme_error_log_entry_t *); +extern void nvme_print_health_log(nvme_health_log_t *, nvme_identify_ctrl_t *); +extern void nvme_print_fwslot_log(nvme_fwslot_log_t *); + +extern void nvme_print_feat_arbitration(uint64_t, void *, size_t, + nvme_identify_ctrl_t *); +extern void nvme_print_feat_power_mgmt(uint64_t, void *, size_t, + nvme_identify_ctrl_t *); +extern void nvme_print_feat_lba_range(uint64_t, void *, size_t, + nvme_identify_ctrl_t *); +extern void nvme_print_feat_temperature(uint64_t, void *, size_t, + nvme_identify_ctrl_t *); +extern void nvme_print_feat_error(uint64_t, void *, size_t, + nvme_identify_ctrl_t *); +extern void nvme_print_feat_write_cache(uint64_t, void *, size_t, + nvme_identify_ctrl_t *); +extern void nvme_print_feat_nqueues(uint64_t, void *, size_t, + nvme_identify_ctrl_t *); +extern void nvme_print_feat_intr_coal(uint64_t, void *, size_t, + nvme_identify_ctrl_t *); +extern void nvme_print_feat_intr_vect(uint64_t, void *, size_t, + nvme_identify_ctrl_t *); +extern void nvme_print_feat_write_atom(uint64_t, void *, size_t, + nvme_identify_ctrl_t *); +extern void nvme_print_feat_async_event(uint64_t, void *, size_t, + nvme_identify_ctrl_t *); +extern void nvme_print_feat_auto_pst(uint64_t, void *, size_t, + nvme_identify_ctrl_t *); +extern void nvme_print_feat_progress(uint64_t, void *, size_t, 
+ nvme_identify_ctrl_t *); + +/* device node functions */ +extern int nvme_open(di_minor_t); +extern void nvme_close(int); +extern nvme_version_t *nvme_version(int); +extern nvme_capabilities_t *nvme_capabilities(int); +extern nvme_identify_ctrl_t *nvme_identify_ctrl(int); +extern nvme_identify_nsid_t *nvme_identify_nsid(int); +extern void *nvme_get_logpage(int, uint8_t, size_t *); +extern boolean_t nvme_get_feature(int, uint8_t, uint32_t, uint64_t *, size_t *, + void **); +extern int nvme_intr_cnt(int); +extern boolean_t nvme_format_nvm(int, uint8_t, uint8_t); +extern boolean_t nvme_detach(int); +extern boolean_t nvme_attach(int); + +#ifdef __cplusplus +} +#endif + +#endif /* _NVMEADM_H */ diff --git a/usr/src/cmd/nvmeadm/nvmeadm_dev.c b/usr/src/cmd/nvmeadm/nvmeadm_dev.c new file mode 100644 index 0000000000..2ac3946a5d --- /dev/null +++ b/usr/src/cmd/nvmeadm/nvmeadm_dev.c @@ -0,0 +1,201 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Nexenta Systems, Inc. 
+ */ + +#include <sys/types.h> +#include <sys/stat.h> +#include <fcntl.h> +#include <unistd.h> +#include <stropts.h> +#include <err.h> +#include <libdevinfo.h> +#include <sys/nvme.h> +#include <assert.h> + +#include "nvmeadm.h" + + +static boolean_t +nvme_ioctl(int fd, int ioc, size_t *bufsize, void **buf, uint64_t arg, + uint64_t *res) +{ + nvme_ioctl_t nioc = { 0 }; + + if (buf != NULL) + *buf = NULL; + + if (res != NULL) + *res = ~0ULL; + + if (bufsize != NULL && *bufsize != 0) { + assert(buf != NULL); + + if ((nioc.n_buf = (uintptr_t)calloc(*bufsize, 1)) == NULL) + err(-1, "nvme_ioctl()"); + + nioc.n_len = *bufsize; + } + + nioc.n_arg = arg; + + if (ioctl(fd, ioc, &nioc) != 0) { + if (debug) + warn("nvme_ioctl()"); + if (nioc.n_buf != 0) + free((void *)nioc.n_buf); + + return (B_FALSE); + } + + if (res != NULL) + *res = nioc.n_arg; + + if (bufsize != NULL) + *bufsize = nioc.n_len; + + if (buf != NULL) + *buf = (void *)nioc.n_buf; + + return (B_TRUE); +} + +nvme_capabilities_t * +nvme_capabilities(int fd) +{ + void *cap = NULL; + size_t bufsize = sizeof (nvme_capabilities_t); + + (void) nvme_ioctl(fd, NVME_IOC_CAPABILITIES, &bufsize, &cap, 0, NULL); + + return (cap); +} + +nvme_version_t * +nvme_version(int fd) +{ + void *vs = NULL; + size_t bufsize = sizeof (nvme_version_t); + + (void) nvme_ioctl(fd, NVME_IOC_VERSION, &bufsize, &vs, 0, NULL); + + return (vs); +} + +nvme_identify_ctrl_t * +nvme_identify_ctrl(int fd) +{ + void *idctl = NULL; + size_t bufsize = NVME_IDENTIFY_BUFSIZE; + + (void) nvme_ioctl(fd, NVME_IOC_IDENTIFY_CTRL, &bufsize, &idctl, 0, + NULL); + + return (idctl); +} + +nvme_identify_nsid_t * +nvme_identify_nsid(int fd) +{ + void *idns = NULL; + size_t bufsize = NVME_IDENTIFY_BUFSIZE; + + (void) nvme_ioctl(fd, NVME_IOC_IDENTIFY_NSID, &bufsize, &idns, 0, NULL); + + return (idns); +} + +void * +nvme_get_logpage(int fd, uint8_t logpage, size_t *bufsize) +{ + void *buf = NULL; + + (void) nvme_ioctl(fd, NVME_IOC_GET_LOGPAGE, bufsize, &buf, logpage, + 
NULL); + + return (buf); +} + +boolean_t +nvme_get_feature(int fd, uint8_t feature, uint32_t arg, uint64_t *res, + size_t *bufsize, void **buf) +{ + return (nvme_ioctl(fd, NVME_IOC_GET_FEATURES, bufsize, buf, + (uint64_t)feature << 32 | arg, res)); +} + +int +nvme_intr_cnt(int fd) +{ + uint64_t res = 0; + + (void) nvme_ioctl(fd, NVME_IOC_INTR_CNT, NULL, NULL, 0, &res); + return ((int)res); +} + +boolean_t +nvme_format_nvm(int fd, uint8_t lbaf, uint8_t ses) +{ + nvme_format_nvm_t frmt = { 0 }; + + frmt.b.fm_lbaf = lbaf & 0xf; + frmt.b.fm_ses = ses & 0x7; + + return (nvme_ioctl(fd, NVME_IOC_FORMAT, NULL, NULL, frmt.r, NULL)); +} + +boolean_t +nvme_detach(int fd) +{ + return (nvme_ioctl(fd, NVME_IOC_DETACH, NULL, NULL, 0, NULL)); +} + +boolean_t +nvme_attach(int fd) +{ + return (nvme_ioctl(fd, NVME_IOC_ATTACH, NULL, NULL, 0, NULL)); +} + +/* + * nvme_open -- open the /devices node of the given minor for reading and + * writing + * + * Returns the file descriptor, or -1 if open() fails (a warning is printed + * only when debugging is enabled). Failing to build the path is fatal. + */ +int +nvme_open(di_minor_t minor) +{ + char *devpath, *path; + int fd; + + if ((devpath = di_devfs_minor_path(minor)) == NULL) + err(-1, "nvme_open()"); + + if (asprintf(&path, "/devices%s", devpath) < 0) { + di_devfs_path_free(devpath); + err(-1, "nvme_open()"); + } + + di_devfs_path_free(devpath); + + fd = open(path, O_RDWR); + + if (fd < 0) { + /* path is still needed by warn(); free it only afterwards */ + if (debug) + warn("nvme_open(%s)", path); + free(path); + return (-1); + } + + free(path); + return (fd); +} + +void +nvme_close(int fd) +{ + (void) close(fd); +} diff --git a/usr/src/cmd/nvmeadm/nvmeadm_print.c b/usr/src/cmd/nvmeadm/nvmeadm_print.c new file mode 100644 index 0000000000..582a849a3e --- /dev/null +++ b/usr/src/cmd/nvmeadm/nvmeadm_print.c @@ -0,0 +1,1138 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2016 Nexenta Systems, Inc. + */ + +/* + * functions for printing of NVMe data structures and their members + */ + +#include <sys/byteorder.h> +#include <sys/types.h> +#include <inttypes.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <stdarg.h> +#include <err.h> +#include <assert.h> + +#include "nvmeadm.h" + +static int nvme_strlen(const char *, int); + +static void nvme_print_str(int, char *, int, const char *, int); +static void nvme_print_double(int, char *, double, int, char *); +static void nvme_print_uint64(int, char *, uint64_t, const char *, char *); +static void nvme_print_uint128(int, char *, nvme_uint128_t, char *, int, int); +static void nvme_print_bit(int, char *, int, char *, char *); + +#define ARRAYSIZE(x) (sizeof (x) / sizeof (*(x))) + +static const char *generic_status_codes[] = { + "Successful Completion", + "Invalid Command Opcode", + "Invalid Field in Command", + "Command ID Conflict", + "Data Transfer Error", + "Commands Aborted due to Power Loss Notification", + "Internal Error", + "Command Abort Requested", + "Command Aborted due to SQ Deletion", + "Command Aborted due to Failed Fused Command", + "Command Aborted due to Missing Fused Command", + "Invalid Namespace or Format", + "Command Sequence Error", + /* NVMe 1.1 */ + "Invalid SGL Segment Descriptor", + "Invalid Number of SGL Descriptors", + "Data SGL Length Invalid", + "Metadata SGL Length Invalid", + "SGL Descriptor Type Invalid", + /* NVMe 1.2 */ + "Invalid Use of Controller Memory Buffer", + "PRP Offset Invalid", + "Atomic Write Unit Exceeded" +}; + +static const char *specific_status_codes[] = { + "Completion Queue Invalid", + "Invalid Queue Identifier", + "Invalid Queue Size", + "Abort Command Limit Exceeded", + "Reserved", + "Asynchronous Event Request Limit Exceeded", + "Invalid Firmware Slot", + "Invalid Firmware Image", + "Invalid Interrupt Vector", + "Invalid Log Page", + "Invalid Format", + "Firmware Activation Requires 
Conventional Reset", + "Invalid Queue Deletion", + /* NVMe 1.1 */ + "Feature Identifier Not Saveable", + "Feature Not Changeable", + "Feature Not Namespace Specific", + "Firmware Activation Requires NVM Subsystem Reset", + /* NVMe 1.2 */ + "Firmware Activation Requires Reset", + "Firmware Activation Requires Maximum Time Violation", + "Firmware Activation Prohibited", + "Overlapping Range", + "Namespace Insufficient Capacity", + "Namespace Identifier Unavailable", + "Reserved", + "Namespace Already Attached", + "Namespace Is Private", + "Namespace Not Attached", + "Thin Provisioning Not Supported", + "Controller List Invalid" +}; + +static const char *generic_nvm_status_codes[] = { + "LBA Out Of Range", + "Capacity Exceeded", + "Namespace Not Ready", + /* NVMe 1.1 */ + "Reservation Conflict", + /* NVMe 1.2 */ + "Format In Progress", +}; + +static const char *specific_nvm_status_codes[] = { + "Conflicting Attributes", + "Invalid Protection Information", + "Attempted Write to Read Only Range" +}; + +static const char *media_nvm_status_codes[] = { + "Write Fault", + "Unrecovered Read Error", + "End-to-End Guard Check Error", + "End-to-End Application Tag Check Error", + "End-to-End Reference Tag Check Error", + "Compare Failure", + "Access Denied", + /* NVMe 1.2 */ + "Deallocated or Unwritten Logical Block" +}; + +static const char *status_code_types[] = { + "Generic Command Status", + "Command Specific Status", + "Media Errors", + "Reserved", + "Reserved", + "Reserved", + "Reserved", + "Vendor Specific" +}; + +static const char *lbaf_relative_performance[] = { + "Best", "Better", "Good", "Degraded" +}; + +static const char *lba_range_types[] = { + "Reserved", "Filesystem", "RAID", "Cache", "Page/Swap File" +}; + +/* + * nvme_print + * + * This function prints a string indented by the specified number of spaces, + * optionally followed by the specified index if it is >= 0. 
If a format string + * is specified, a single colon and the required number of spaces for alignment + * are printed before the format string and any remaining arguments are passed + * to vprintf. + * + * NVME_PRINT_ALIGN was chosen so that all values will be lined up nicely even + * for the longest name at its default indentation. + */ + +#define NVME_PRINT_ALIGN 43 + +void +nvme_print(int indent, char *name, int index, const char *fmt, ...) +{ + int align = NVME_PRINT_ALIGN - (indent + strlen(name) + 1); + va_list ap; + + if (index >= 0) + align -= snprintf(NULL, 0, " %d", index); + + if (align < 0) + align = 0; + + va_start(ap, fmt); + + (void) printf("%*s%s", indent, "", name); + + if (index >= 0) + (void) printf(" %d", index); + + if (fmt != NULL) { + (void) printf(": %*s", align, ""); + (void) vprintf(fmt, ap); + } + + (void) printf("\n"); + va_end(ap); +} + +/* + * nvme_strlen -- return length of string without trailing spaces; never reads + * before str[0], so an empty or all-blank field yields 0 + */ +static int +nvme_strlen(const char *str, int len) +{ + if (len <= 0) + return (0); + + while (len > 0 && str[len - 1] == ' ') + len--; + + return (len); +} + +/* + * nvme_print_str -- print a string up to the specified length + */ +static void +nvme_print_str(int indent, char *name, int index, const char *value, int len) +{ + if (len == 0) + len = strlen(value); + + nvme_print(indent, name, index, "%.*s", nvme_strlen(value, len), value); +} + +/* + * nvme_print_double -- print a double up to a specified number of places with + * optional unit + */ +static void +nvme_print_double(int indent, char *name, double value, int places, char *unit) +{ + if (unit == NULL) + unit = ""; + + nvme_print(indent, name, -1, "%.*g%s", places, value, unit); +} + +/* + * nvme_print_uint64 -- print uint64_t with optional unit in decimal or another + * format specified + */ +static void +nvme_print_uint64(int indent, char *name, uint64_t value, const char *fmt, + char *unit) +{ + char *tmp_fmt; + + if (unit == NULL) + unit = ""; + + if (fmt == NULL) + fmt = "%"PRId64; 
+ + if (asprintf(&tmp_fmt, "%s%%s", fmt) < 0) + err(-1, "nvme_print_uint64()"); + + nvme_print(indent, name, -1, tmp_fmt, value, unit); + + free(tmp_fmt); +} + +/* + * nvme_print_uint128 -- print a 128bit uint with optional unit, after applying + * binary and/or decimal shifting + */ +static void +nvme_print_uint128(int indent, char *name, nvme_uint128_t value, char *unit, + int scale_bits, int scale_tens) +{ + const char hex[] = "0123456789abcdef"; + uint8_t o[(128 + scale_bits) / 3]; + char p[sizeof (o) * 2]; + char *pp = &p[0]; + int i, x; + uint64_t rem = 0; + + if (unit == NULL) + unit = ""; + + /* + * Don't allow binary shifting by more than 64 bits to keep the + * arithmetic simple. Also limit decimal shifting based on the size + * of any possible remainder from binary shifting. + */ + assert(scale_bits <= 64); + assert(scale_tens <= (64 - scale_bits) / 3); + + bzero(o, sizeof (o)); + bzero(p, sizeof (p)); + + /* + * Convert the two 64-bit numbers into a series of BCD digits using + * a double-dabble algorithm. By using more or less iterations than + * 128 we can do a binary shift in either direction. + */ + for (x = 0; x != 128 - scale_bits; x++) { + for (i = 0; i != sizeof (o); i++) { + if ((o[i] & 0xf0) > 0x40) + o[i] += 0x30; + + if ((o[i] & 0xf) > 4) + o[i] += 3; + } + + for (i = 0; i != sizeof (o) - 1; i++) + o[i] = (o[i] << 1) + (o[i+1] >> 7); + + o[i] = (o[i] << 1) + (value.hi >> 63); + + value.hi = (value.hi << 1) + (value.lo >> 63); + value.lo = (value.lo << 1); + } + + /* + * If we're supposed to do a decimal left shift (* 10^x), too, + * calculate the remainder of the previous binary shift operation. + */ + if (scale_tens > 0) { + rem = value.hi >> (64 - scale_bits); + + for (i = 0; i != scale_tens; i++) + rem *= 10; + + rem >>= scale_bits; + } + + /* + * Construct the decimal number for printing. Skip leading zeros. + */ + for (i = 0; i < sizeof (o); i++) + if (o[i] != 0) + break; + + if (i == sizeof (o)) { + /* + * The converted number is 0. 
Just print the calculated + * remainder and return. + */ + nvme_print(indent, name, -1, "%"PRId64"%s", rem, unit); + return; + } else { + if (o[i] > 0xf) + *pp++ = hex[o[i] >> 4]; + + *pp++ = hex[o[i] & 0xf]; + + for (i++; i < sizeof (o); i++) { + *pp++ = hex[o[i] >> 4]; + *pp++ = hex[o[i] & 0xf]; + } + } + + /* + * For negative decimal scaling, use the printf precision specifier to + * truncate the results according to the requested decimal scaling. For + * positive decimal scaling we print the remainder padded with 0. + */ + nvme_print(indent, name, -1, "%.*s%0.*"PRId64"%s", + strlen(p) + scale_tens, p, + scale_tens > 0 ? scale_tens : 0, rem, + unit); +} + +/* + * nvme_print_bit -- print a bit with optional names for both states + */ +static void +nvme_print_bit(int indent, char *name, int value, char *s_true, char *s_false) +{ + if (s_true == NULL) + s_true = "supported"; + if (s_false == NULL) + s_false = "unsupported"; + + nvme_print(indent, name, -1, "%s", value ? s_true : s_false); +} + +/* + * nvme_print_ctrl_summary -- print a 1-line summary of the IDENTIFY CONTROLLER + * data structure + */ +void +nvme_print_ctrl_summary(nvme_identify_ctrl_t *idctl, nvme_version_t *version) +{ + (void) printf("model: %.*s, serial: %.*s, FW rev: %.*s, NVMe v%d.%d\n", + nvme_strlen(idctl->id_model, sizeof (idctl->id_model)), + idctl->id_model, + nvme_strlen(idctl->id_serial, sizeof (idctl->id_serial)), + idctl->id_serial, + nvme_strlen(idctl->id_fwrev, sizeof (idctl->id_fwrev)), + idctl->id_fwrev, + version->v_major, version->v_minor); +} + +/* + * nvme_print_nsid_summary -- print a 1-line summary of the IDENTIFY NAMESPACE + * data structure + */ +void +nvme_print_nsid_summary(nvme_identify_nsid_t *idns) +{ + int bsize = 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; + + (void) printf("Size = %"PRId64" MB, " + "Capacity = %"PRId64" MB, " + "Used = %"PRId64" MB\n", + idns->id_nsize * bsize / 1024 / 1024, + idns->id_ncap * bsize / 1024 / 1024, + idns->id_nuse * 
bsize / 1024 / 1024); + +} + +/* + * nvme_print_identify_ctrl + * + * This function pretty-prints the structure returned by the IDENTIFY CONTROLLER + * command. + */ +void +nvme_print_identify_ctrl(nvme_identify_ctrl_t *idctl, + nvme_capabilities_t *cap, nvme_version_t *version) +{ + int i; + + nvme_print(0, "Identify Controller", -1, NULL); + nvme_print(2, "Controller Capabilities and Features", -1, NULL); + nvme_print_str(4, "Model", -1, + idctl->id_model, sizeof (idctl->id_model)); + nvme_print_str(4, "Serial", -1, + idctl->id_serial, sizeof (idctl->id_serial)); + nvme_print_str(4, "Firmware Revision", -1, + idctl->id_fwrev, sizeof (idctl->id_fwrev)); + if (verbose) { + nvme_print_uint64(4, "PCI vendor ID", + idctl->id_vid, "0x%0.4"PRIx64, NULL); + nvme_print_uint64(4, "subsystem vendor ID", + idctl->id_ssvid, "0x%0.4"PRIx64, NULL); + nvme_print_uint64(4, "Recommended Arbitration Burst", + idctl->id_rab, NULL, NULL); + nvme_print(4, "Vendor IEEE OUI", -1, "%0.2X-%0.2X-%0.2X", + idctl->id_oui[0], idctl->id_oui[1], idctl->id_oui[2]); + } + nvme_print(4, "Multi-Interface Capabilities", -1, NULL); + nvme_print_bit(6, "Multiple PCI Express ports", + idctl->id_mic.m_multi_pci, NULL, NULL); + + if (NVME_VERSION_ATLEAST(version, 1, 1)) { + nvme_print_bit(6, "Multiple Controllers", + idctl->id_mic.m_multi_ctrl, NULL, NULL); + nvme_print_bit(6, "Is SR-IOV virtual function", + idctl->id_mic.m_sr_iov, "yes", "no"); + } + if (idctl->id_mdts > 0) + nvme_print_uint64(4, "Maximum Data Transfer Size", + (1 << idctl->id_mdts) * cap->mpsmin / 1024, NULL, "kB"); + else + nvme_print_str(4, "Maximum Data Transfer Size", -1, + "unlimited", 0); + + if (NVME_VERSION_ATLEAST(version, 1, 1)) { + nvme_print_uint64(4, "Unique Controller Identifier", + idctl->id_cntlid, "0x%0.4"PRIx64, NULL); + } + + nvme_print(2, "Admin Command Set Attributes", -1, NULL); + nvme_print(4, "Optional Admin Command Support", -1, NULL); + nvme_print_bit(6, "Security Send & Receive", + idctl->id_oacs.oa_security, 
NULL, NULL); + nvme_print_bit(6, "Format NVM", + idctl->id_oacs.oa_format, NULL, NULL); + nvme_print_bit(6, "Firmware Activate & Download", + idctl->id_oacs.oa_firmware, NULL, NULL); + if (verbose) { + nvme_print_uint64(4, "Abort Command Limit", + (uint16_t)idctl->id_acl + 1, NULL, NULL); + nvme_print_uint64(4, "Asynchronous Event Request Limit", + (uint16_t)idctl->id_aerl + 1, NULL, NULL); + } + nvme_print(4, "Firmware Updates", -1, NULL); + nvme_print_bit(6, "Firmware Slot 1", + idctl->id_frmw.fw_readonly, "read-only", "writable"); + nvme_print_uint64(6, "No. of Firmware Slots", + idctl->id_frmw.fw_nslot, NULL, NULL); + nvme_print(2, "Log Page Attributes", -1, NULL); + nvme_print_bit(6, "per Namespace SMART/Health info", + idctl->id_lpa.lp_smart, NULL, NULL); + nvme_print_uint64(4, "Error Log Page Entries", + (uint16_t)idctl->id_elpe + 1, NULL, NULL); + nvme_print_uint64(4, "Number of Power States", + (uint16_t)idctl->id_npss + 1, NULL, NULL); + if (verbose) { + nvme_print_bit(4, "Admin Vendor-specific Command Format", + idctl->id_avscc.av_spec, "standard", "vendor-specific"); + } + + if (NVME_VERSION_ATLEAST(version, 1, 1)) { + nvme_print_bit(4, "Autonomous Power State Transitions", + idctl->id_apsta.ap_sup, NULL, NULL); + } + + nvme_print(2, "NVM Command Set Attributes", -1, NULL); + if (verbose) { + nvme_print(4, "Submission Queue Entry Size", -1, + "min %d, max %d", + 1 << idctl->id_sqes.qes_min, 1 << idctl->id_sqes.qes_max); + nvme_print(4, "Completion Queue Entry Size", -1, + "min %d, max %d", + 1 << idctl->id_cqes.qes_min, 1 << idctl->id_cqes.qes_max); + } + nvme_print_uint64(4, "Number of Namespaces", + idctl->id_nn, NULL, NULL); + nvme_print(4, "Optional NVM Command Support", -1, NULL); + nvme_print_bit(6, "Compare", + idctl->id_oncs.on_compare, NULL, NULL); + nvme_print_bit(6, "Write Uncorrectable", + idctl->id_oncs.on_wr_unc, NULL, NULL); + nvme_print_bit(6, "Dataset Management", + idctl->id_oncs.on_dset_mgmt, NULL, NULL); + + if 
(NVME_VERSION_ATLEAST(version, 1, 1)) { + nvme_print_bit(6, "Write Zeros", + idctl->id_oncs.on_wr_zero, NULL, NULL); + nvme_print_bit(6, "Save/Select in Get/Set Features", + idctl->id_oncs.on_save, NULL, NULL); + nvme_print_bit(6, "Reservations", + idctl->id_oncs.on_reserve, NULL, NULL); + } + + nvme_print(4, "Fused Operation Support", -1, NULL); + nvme_print_bit(6, "Compare and Write", + idctl->id_fuses.f_cmp_wr, NULL, NULL); + nvme_print(4, "Format NVM Attributes", -1, NULL); + nvme_print_bit(6, "per Namespace Format", + idctl->id_fna.fn_format == 0, NULL, NULL); + nvme_print_bit(6, "per Namespace Secure Erase", + idctl->id_fna.fn_sec_erase == 0, NULL, NULL); + nvme_print_bit(6, "Cryptographic Erase", + idctl->id_fna.fn_crypt_erase, NULL, NULL); + nvme_print_bit(4, "Volatile Write Cache", + idctl->id_vwc.vwc_present, "present", "not present"); + nvme_print_uint64(4, "Atomic Write Unit Normal", + (uint32_t)idctl->id_awun + 1, NULL, + idctl->id_awun == 0 ? " block" : " blocks"); + nvme_print_uint64(4, "Atomic Write Unit Power Fail", + (uint32_t)idctl->id_awupf + 1, NULL, + idctl->id_awupf == 0 ? " block" : " blocks"); + + if (verbose != 0) + nvme_print_bit(4, "NVM Vendor-specific Command Format", + idctl->id_nvscc.nv_spec, "standard", "vendor-specific"); + + if (NVME_VERSION_ATLEAST(version, 1, 1)) { + nvme_print_uint64(4, "Atomic Compare & Write Size", + (uint32_t)idctl->id_acwu + 1, NULL, + idctl->id_acwu == 0 ? 
" block" : " blocks"); + nvme_print(4, "SGL Support", -1, NULL); + nvme_print_bit(6, "SGLs in NVM commands", + idctl->id_sgls.sgl_sup, NULL, NULL); + nvme_print_bit(6, "SGL Bit Bucket Descriptor", + idctl->id_sgls.sgl_bucket, NULL, NULL); + } + + for (i = 0; i != idctl->id_npss + 1; i++) { + double scale = 0.01; + double power = 0; + int places = 2; + char *unit = "W"; + + if (NVME_VERSION_ATLEAST(version, 1, 1) && + idctl->id_psd[i].psd_mps == 1) { + scale = 0.0001; + places = 4; + } + + power = (double)idctl->id_psd[i].psd_mp * scale; + if (power < 1.0) { + power *= 1000.0; + unit = "mW"; + } + + nvme_print(4, "Power State Descriptor", i, NULL); + nvme_print_double(6, "Maximum Power", power, places, unit); + nvme_print_bit(6, "Non-Operational State", + idctl->id_psd[i].psd_nops, "yes", "no"); + nvme_print_uint64(6, "Entry Latency", + idctl->id_psd[i].psd_enlat, NULL, "us"); + nvme_print_uint64(6, "Exit Latency", + idctl->id_psd[i].psd_exlat, NULL, "us"); + nvme_print_uint64(6, "Relative Read Throughput (0 = best)", + idctl->id_psd[i].psd_rrt, NULL, NULL); + nvme_print_uint64(6, "Relative Read Latency (0 = best)", + idctl->id_psd[i].psd_rrl, NULL, NULL); + nvme_print_uint64(6, "Relative Write Throughput (0 = best)", + idctl->id_psd[i].psd_rwt, NULL, NULL); + nvme_print_uint64(6, "Relative Write Latency (0 = best)", + idctl->id_psd[i].psd_rwl, NULL, NULL); + } +} + +/* + * nvme_print_identify_nsid + * + * This function pretty-prints the structure returned by the IDENTIFY NAMESPACE + * command. 
+ */ +void +nvme_print_identify_nsid(nvme_identify_nsid_t *idns, nvme_version_t *version) +{ + int bsize = 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; + int i; + + nvme_print(0, "Identify Namespace", -1, NULL); + nvme_print(2, "Namespace Capabilities and Features", -1, NULL); + nvme_print_uint64(4, "Namespace Size", + idns->id_nsize * bsize / 1024 / 1024, NULL, "MB"); + nvme_print_uint64(4, "Namespace Capacity", + idns->id_ncap * bsize / 1024 / 1024, NULL, "MB"); + nvme_print_uint64(4, "Namespace Utilization", + idns->id_nuse * bsize / 1024 / 1024, NULL, "MB"); + nvme_print(4, "Namespace Features", -1, NULL); + nvme_print_bit(6, "Thin Provisioning", + idns->id_nsfeat.f_thin, NULL, NULL); + nvme_print_uint64(4, "Number of LBA Formats", + (uint16_t)idns->id_nlbaf + 1, NULL, NULL); + nvme_print(4, "Formatted LBA Size", -1, NULL); + nvme_print_uint64(6, "LBA Format", + (uint16_t)idns->id_flbas.lba_format, NULL, NULL); + nvme_print_bit(6, "Extended Data LBA", + idns->id_flbas.lba_extlba, "yes", "no"); + nvme_print(4, "Metadata Capabilities", -1, NULL); + nvme_print_bit(6, "Extended Data LBA", + idns->id_mc.mc_extlba, NULL, NULL); + nvme_print_bit(6, "Separate Metadata", + idns->id_mc.mc_separate, NULL, NULL); + nvme_print(4, "End-to-End Data Protection Capabilities", -1, NULL); + nvme_print_bit(6, "Protection Information Type 1", + idns->id_dpc.dp_type1, NULL, NULL); + nvme_print_bit(6, "Protection Information Type 2", + idns->id_dpc.dp_type2, NULL, NULL); + nvme_print_bit(6, "Protection Information Type 3", + idns->id_dpc.dp_type3, NULL, NULL); + nvme_print_bit(6, "Protection Information first", + idns->id_dpc.dp_first, NULL, NULL); + nvme_print_bit(6, "Protection Information last", + idns->id_dpc.dp_last, NULL, NULL); + nvme_print(4, "End-to-End Data Protection Settings", -1, NULL); + if (idns->id_dps.dp_pinfo == 0) + nvme_print_str(6, "Protection Information", -1, + "disabled", 0); + else + nvme_print_uint64(6, "Protection Information Type", + 
idns->id_dps.dp_pinfo, NULL, NULL); + nvme_print_bit(6, "Protection Information in Metadata", + idns->id_dps.dp_first, "first 8 bytes", "last 8 bytes"); + + if (NVME_VERSION_ATLEAST(version, 1, 1)) { + nvme_print(4, "Namespace Multi-Path I/O and Namespace Sharing " + "Capabilities", -1, NULL); + nvme_print_bit(6, "Namespace is shared", + idns->id_nmic.nm_shared, "yes", "no"); + nvme_print(2, "Reservation Capabilities", -1, NULL); + nvme_print_bit(6, "Persist Through Power Loss", + idns->id_rescap.rc_persist, NULL, NULL); + nvme_print_bit(6, "Write Exclusive", + idns->id_rescap.rc_wr_excl, NULL, NULL); + nvme_print_bit(6, "Exclusive Access", + idns->id_rescap.rc_excl, NULL, NULL); + nvme_print_bit(6, "Write Exclusive - Registrants Only", + idns->id_rescap.rc_wr_excl_r, NULL, NULL); + nvme_print_bit(6, "Exclusive Access - Registrants Only", + idns->id_rescap.rc_excl_r, NULL, NULL); + nvme_print_bit(6, "Write Exclusive - All Registrants", + idns->id_rescap.rc_wr_excl_a, NULL, NULL); + nvme_print_bit(6, "Exclusive Access - All Registrants", + idns->id_rescap.rc_excl_a, NULL, NULL); + + nvme_print(4, "IEEE Extended Unique Identifier", -1, + "%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X%0.2X", + idns->id_eui64[0], idns->id_eui64[1], + idns->id_eui64[2], idns->id_eui64[3], + idns->id_eui64[4], idns->id_eui64[5], + idns->id_eui64[6], idns->id_eui64[7]); + } + + for (i = 0; i <= idns->id_nlbaf; i++) { + if (verbose == 0 && idns->id_lbaf[i].lbaf_ms != 0) + continue; + + nvme_print(4, "LBA Format", i, NULL); + nvme_print_uint64(6, "Metadata Size", + idns->id_lbaf[i].lbaf_ms, NULL, " bytes"); + nvme_print_uint64(6, "LBA Data Size", + 1 << idns->id_lbaf[i].lbaf_lbads, NULL, " bytes"); + nvme_print_str(6, "Relative Performance", -1, + lbaf_relative_performance[idns->id_lbaf[i].lbaf_rp], 0); + } +} + +/* + * nvme_print_error_log + * + * This function pretty-prints all non-zero error log entries, or all entries + * if verbose is set. 
+ */ +void +nvme_print_error_log(int nlog, nvme_error_log_entry_t *elog) +{ + int i; + + nvme_print(0, "Error Log", -1, NULL); + for (i = 0; i != nlog; i++) + if (elog[i].el_count == 0) + break; + nvme_print_uint64(2, "Number of Error Log Entries", i, NULL, NULL); + + for (i = 0; i != nlog; i++) { + int sc = elog[i].el_sf.sf_sc; + const char *sc_str = ""; + + if (elog[i].el_count == 0 && verbose == 0) + break; + + switch (elog[i].el_sf.sf_sct) { + case 0: /* Generic Command Status */ + if (sc < ARRAYSIZE(generic_status_codes)) + sc_str = generic_status_codes[sc]; + else if (sc >= 0x80 && + sc - 0x80 < ARRAYSIZE(generic_nvm_status_codes)) + sc_str = generic_nvm_status_codes[sc - 0x80]; + break; + case 1: /* Specific Command Status */ + if (sc < ARRAYSIZE(specific_status_codes)) + sc_str = specific_status_codes[sc]; + else if (sc >= 0x80 && + sc - 0x80 < ARRAYSIZE(specific_nvm_status_codes)) + sc_str = specific_nvm_status_codes[sc - 0x80]; + break; + case 2: /* Media Errors */ + if (sc >= 0x80 && + sc - 0x80 < ARRAYSIZE(media_nvm_status_codes)) + sc_str = media_nvm_status_codes[sc - 0x80]; + break; + case 7: /* Vendor Specific */ + sc_str = "Unknown Vendor Specific"; + break; + default: + sc_str = "Reserved"; + break; + } + + nvme_print(2, "Entry", i, NULL); + nvme_print_uint64(4, "Error Count", + elog[i].el_count, NULL, NULL); + nvme_print_uint64(4, "Submission Queue ID", + elog[i].el_sqid, NULL, NULL); + nvme_print_uint64(4, "Command ID", + elog[i].el_cid, NULL, NULL); + nvme_print(4, "Status Field", -1, NULL); + nvme_print_uint64(6, "Phase Tag", + elog[i].el_sf.sf_p, NULL, NULL); + nvme_print(6, "Status Code", -1, "0x%0.2x (%s)", + sc, sc_str); + nvme_print(6, "Status Code Type", -1, "0x%x (%s)", + elog[i].el_sf.sf_sct, + status_code_types[elog[i].el_sf.sf_sct]); + nvme_print_bit(6, "More", + elog[i].el_sf.sf_m, "yes", "no"); + nvme_print_bit(6, "Do Not Retry", + elog[i].el_sf.sf_m, "yes", "no"); + nvme_print_uint64(4, "Parameter Error Location byte", + 
elog[i].el_byte, "0x%0.2"PRIx64, NULL); + nvme_print_uint64(4, "Parameter Error Location bit", + elog[i].el_bit, NULL, NULL); + nvme_print_uint64(4, "Logical Block Address", + elog[i].el_lba, NULL, NULL); + nvme_print(4, "Namespace ID", -1, "%d", + elog[i].el_nsid == 0xffffffff ? + 0 : elog[i].el_nsid); + nvme_print_uint64(4, + "Vendor Specifc Information Available", + elog[i].el_vendor, NULL, NULL); + } +} + +/* + * nvme_print_health_log + * + * This function pretty-prints a summary of the SMART/Health log, or all + * of the log if verbose is set. + */ +void +nvme_print_health_log(nvme_health_log_t *hlog, nvme_identify_ctrl_t *idctl) +{ + nvme_print(0, "SMART/Health Information", -1, NULL); + nvme_print(2, "Critical Warnings", -1, NULL); + nvme_print_bit(4, "Available Space", + hlog->hl_crit_warn.cw_avail, "low", "OK"); + nvme_print_bit(4, "Temperature", + hlog->hl_crit_warn.cw_temp, "too high", "OK"); + nvme_print_bit(4, "Device Reliability", + hlog->hl_crit_warn.cw_reliab, "degraded", "OK"); + nvme_print_bit(4, "Media", + hlog->hl_crit_warn.cw_readonly, "read-only", "OK"); + if (idctl->id_vwc.vwc_present != 0) + nvme_print_bit(4, "Volatile Memory Backup", + hlog->hl_crit_warn.cw_volatile, "failed", "OK"); + + nvme_print_uint64(2, "Temperature", + hlog->hl_temp - 273, NULL, "C"); + nvme_print_uint64(2, "Available Spare Capacity", + hlog->hl_avail_spare, NULL, "%"); + + if (verbose != 0) + nvme_print_uint64(2, "Available Spare Threshold", + hlog->hl_avail_spare_thr, NULL, "%"); + + nvme_print_uint64(2, "Device Life Used", + hlog->hl_used, NULL, "%"); + + if (verbose == 0) + return; + + /* + * The following two fields are in 1000 512 byte units. Convert that to + * GB by doing binary shifts (9 left and 30 right) and muliply by 10^3. 
+ */ + nvme_print_uint128(2, "Data Read", + hlog->hl_data_read, "GB", 30 - 9, 3); + nvme_print_uint128(2, "Data Written", + hlog->hl_data_write, "GB", 30 - 9, 3); + + nvme_print_uint128(2, "Read Commands", + hlog->hl_host_read, NULL, 0, 0); + nvme_print_uint128(2, "Write Commands", + hlog->hl_host_write, NULL, 0, 0); + nvme_print_uint128(2, "Controller Busy", + hlog->hl_ctrl_busy, "min", 0, 0); + nvme_print_uint128(2, "Power Cycles", + hlog->hl_power_cycles, NULL, 0, 0); + nvme_print_uint128(2, "Power On", + hlog->hl_power_on_hours, "h", 0, 0); + nvme_print_uint128(2, "Unsafe Shutdowns", + hlog->hl_unsafe_shutdn, NULL, 0, 0); + nvme_print_uint128(2, "Uncorrectable Media Errors", + hlog->hl_media_errors, NULL, 0, 0); + nvme_print_uint128(2, "Errors Logged", + hlog->hl_errors_logged, NULL, 0, 0); +} + +/* + * nvme_print_fwslot_log + * + * This function pretty-prints the firmware slot information. + */ +void +nvme_print_fwslot_log(nvme_fwslot_log_t *fwlog) +{ + int i; + + nvme_print(0, "Firmware Slot Information", -1, NULL); + nvme_print_uint64(2, "Active Firmware Slot", fwlog->fw_afi, NULL, NULL); + + for (i = 0; i != ARRAYSIZE(fwlog->fw_frs); i++) { + if (fwlog->fw_frs[i][0] == '\0') + break; + nvme_print_str(2, "Firmware Revision for Slot", i + 1, + fwlog->fw_frs[i], sizeof (fwlog->fw_frs[i])); + } +} + +/* + * nvme_print_feat_* + * + * These functions pretty-print the data structures returned by GET FEATURES. 
+ */ +void +nvme_print_feat_arbitration(uint64_t res, void *b, size_t s, + nvme_identify_ctrl_t *id) +{ + _NOTE(ARGUNUSED(b)); + _NOTE(ARGUNUSED(s)); + _NOTE(ARGUNUSED(id)); + nvme_arbitration_t arb; + + arb.r = (uint32_t)res; + /* an index of -1 keeps nvme_print() from appending a bogus " 0" */ + if (arb.b.arb_ab != 7) + nvme_print_uint64(4, "Arbitration Burst", + 1 << arb.b.arb_ab, NULL, NULL); + else + nvme_print_str(4, "Arbitration Burst", -1, + "no limit", 0); + nvme_print_uint64(4, "Low Priority Weight", + (uint16_t)arb.b.arb_lpw + 1, NULL, NULL); + nvme_print_uint64(4, "Medium Priority Weight", + (uint16_t)arb.b.arb_mpw + 1, NULL, NULL); + nvme_print_uint64(4, "High Priority Weight", + (uint16_t)arb.b.arb_hpw + 1, NULL, NULL); +} + +void +nvme_print_feat_power_mgmt(uint64_t res, void *b, size_t s, + nvme_identify_ctrl_t *id) +{ + _NOTE(ARGUNUSED(b)); + _NOTE(ARGUNUSED(s)); + _NOTE(ARGUNUSED(id)); + nvme_power_mgmt_t pm; + + pm.r = (uint32_t)res; + nvme_print_uint64(4, "Power State", (uint8_t)pm.b.pm_ps, + NULL, NULL); +} + +void +nvme_print_feat_lba_range(uint64_t res, void *buf, size_t bufsize, + nvme_identify_ctrl_t *id) +{ + _NOTE(ARGUNUSED(id)); + + nvme_lba_range_type_t lrt; + nvme_lba_range_t *lr; + size_t n_lr; + int i; + + if (buf == NULL) + return; + + lrt.r = res; + lr = buf; + + n_lr = bufsize / sizeof (nvme_lba_range_t); + if (n_lr > lrt.b.lr_num + 1) + n_lr = lrt.b.lr_num + 1; + + nvme_print_uint64(4, "Number of LBA Ranges", + (uint8_t)lrt.b.lr_num + 1, NULL, NULL); + + for (i = 0; i != n_lr; i++) { + if (verbose == 0 && lr[i].lr_nlb == 0) + continue; + + nvme_print(4, "LBA Range", i, NULL); + if (lr[i].lr_type < ARRAYSIZE(lba_range_types)) + nvme_print_str(6, "Type", -1, + lba_range_types[lr[i].lr_type], 0); + else + nvme_print_uint64(6, "Type", + lr[i].lr_type, NULL, NULL); + nvme_print(6, "Attributes", -1, NULL); + nvme_print_bit(8, "Writable", + lr[i].lr_attr.lr_write, "yes", "no"); + nvme_print_bit(8, "Hidden", + lr[i].lr_attr.lr_hidden, "yes", "no"); + nvme_print_uint64(6, "Starting LBA", + lr[i].lr_slba, 
NULL, NULL); + nvme_print_uint64(6, "Number of Logical Blocks", + lr[i].lr_nlb, NULL, NULL); + nvme_print(6, "Unique Identifier", -1, + "%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x" + "%.2x%.2x%.2x%.2x%.2x%.2x%.2x%.2x", + lr[i].lr_guid[0], lr[i].lr_guid[1], + lr[i].lr_guid[2], lr[i].lr_guid[3], + lr[i].lr_guid[4], lr[i].lr_guid[5], + lr[i].lr_guid[6], lr[i].lr_guid[7], + lr[i].lr_guid[8], lr[i].lr_guid[9], + lr[i].lr_guid[10], lr[i].lr_guid[11], + lr[i].lr_guid[12], lr[i].lr_guid[13], + lr[i].lr_guid[14], lr[i].lr_guid[15]); + } +} + +void +nvme_print_feat_temperature(uint64_t res, void *b, size_t s, + nvme_identify_ctrl_t *id) +{ + _NOTE(ARGUNUSED(b)); + _NOTE(ARGUNUSED(s)); + _NOTE(ARGUNUSED(id)); + nvme_temp_threshold_t tt; + + tt.r = (uint32_t)res; + nvme_print_uint64(4, "Temperature Threshold", tt.b.tt_tmpth - 273, + NULL, "C"); +} + +void +nvme_print_feat_error(uint64_t res, void *b, size_t s, + nvme_identify_ctrl_t *id) +{ + _NOTE(ARGUNUSED(b)); + _NOTE(ARGUNUSED(s)); + _NOTE(ARGUNUSED(id)); + nvme_error_recovery_t er; + + er.r = (uint32_t)res; + if (er.b.er_tler > 0) + nvme_print_uint64(4, "Time Limited Error Recovery", + (uint32_t)er.b.er_tler * 100, NULL, "ms"); + else + nvme_print_str(4, "Time Limited Error Recovery", -1, + "no time limit", 0); +} + +void +nvme_print_feat_write_cache(uint64_t res, void *b, size_t s, + nvme_identify_ctrl_t *id) +{ + _NOTE(ARGUNUSED(b)); + _NOTE(ARGUNUSED(s)); + _NOTE(ARGUNUSED(id)); + nvme_write_cache_t wc; + + wc.r = (uint32_t)res; + nvme_print_bit(4, "Volatile Write Cache", + wc.b.wc_wce, "enabled", "disabled"); +} + +void +nvme_print_feat_nqueues(uint64_t res, void *b, size_t s, + nvme_identify_ctrl_t *id) +{ + _NOTE(ARGUNUSED(b)); + _NOTE(ARGUNUSED(s)); + _NOTE(ARGUNUSED(id)); + nvme_nqueues_t nq; + + nq.r = (uint32_t)res; + nvme_print_uint64(4, "Number of Submission Queues", + nq.b.nq_nsq + 1, NULL, NULL); + nvme_print_uint64(4, "Number of Completion Queues", + nq.b.nq_ncq + 1, NULL, NULL); +} + +void 
+nvme_print_feat_intr_coal(uint64_t res, void *b, size_t s, + nvme_identify_ctrl_t *id) +{ + _NOTE(ARGUNUSED(b)); + _NOTE(ARGUNUSED(s)); + _NOTE(ARGUNUSED(id)); + nvme_intr_coal_t ic; + + ic.r = (uint32_t)res; + nvme_print_uint64(4, "Aggregation Threshold", + ic.b.ic_thr + 1, NULL, NULL); + nvme_print_uint64(4, "Aggregation Time", + (uint16_t)ic.b.ic_time * 100, NULL, "us"); +} + +void +nvme_print_feat_intr_vect(uint64_t res, void *b, size_t s, + nvme_identify_ctrl_t *id) +{ + _NOTE(ARGUNUSED(b)); + _NOTE(ARGUNUSED(s)); + _NOTE(ARGUNUSED(id)); + nvme_intr_vect_t iv; + char *tmp; + + iv.r = (uint32_t)res; + if (asprintf(&tmp, "Vector %d Coalescing Disable", iv.b.iv_iv) < 0) + err(-1, "nvme_print_feat_intr_vect()"); + + nvme_print_bit(4, tmp, iv.b.iv_cd, "yes", "no"); + + /* the label was allocated by asprintf() and is ours to release */ + free(tmp); +} + +void +nvme_print_feat_write_atom(uint64_t res, void *b, size_t s, + nvme_identify_ctrl_t *id) +{ + _NOTE(ARGUNUSED(b)); + _NOTE(ARGUNUSED(s)); + _NOTE(ARGUNUSED(id)); + nvme_write_atomicity_t wa; + + wa.r = (uint32_t)res; + nvme_print_bit(4, "Disable Normal", wa.b.wa_dn, "yes", "no"); +} + +void +nvme_print_feat_async_event(uint64_t res, void *b, size_t s, + nvme_identify_ctrl_t *idctl) +{ + _NOTE(ARGUNUSED(b)); + _NOTE(ARGUNUSED(s)); + nvme_async_event_conf_t aec; + + aec.r = (uint32_t)res; + nvme_print_bit(4, "Available Space below threshold", + aec.b.aec_avail, "enabled", "disabled"); + nvme_print_bit(4, "Temperature above threshold", + aec.b.aec_temp, "enabled", "disabled"); + nvme_print_bit(4, "Device Reliability compromised", + aec.b.aec_reliab, "enabled", "disabled"); + nvme_print_bit(4, "Media read-only", + aec.b.aec_readonly, "enabled", "disabled"); + if (idctl->id_vwc.vwc_present != 0) + nvme_print_bit(4, "Volatile Memory Backup failed", + aec.b.aec_volatile, "enabled", "disabled"); +} + +void +nvme_print_feat_auto_pst(uint64_t res, void *buf, size_t bufsize, + nvme_identify_ctrl_t *id) +{ + _NOTE(ARGUNUSED(id)); + + nvme_auto_power_state_trans_t apst; + nvme_auto_power_state_t *aps; + int i; 
+ int cnt = bufsize / sizeof (nvme_auto_power_state_t); + + if (buf == NULL) + return; + + apst.r = res; + aps = buf; + + nvme_print_bit(4, "Autonomous Power State Transition", + apst.b.apst_apste, "enabled", "disabled"); + for (i = 0; i != cnt; i++) { + if (aps[i].apst_itps == 0 && aps[i].apst_itpt == 0) + break; + + nvme_print(4, "Power State", i, NULL); + nvme_print_uint64(6, "Idle Transition Power State", + (uint16_t)aps[i].apst_itps, NULL, NULL); + nvme_print_uint64(6, "Idle Time Prior to Transition", + aps[i].apst_itpt, NULL, "ms"); + } +} + +void +nvme_print_feat_progress(uint64_t res, void *b, size_t s, + nvme_identify_ctrl_t *id) +{ + _NOTE(ARGUNUSED(b)); + _NOTE(ARGUNUSED(s)); + _NOTE(ARGUNUSED(id)); + nvme_software_progress_marker_t spm; + + spm.r = (uint32_t)res; + nvme_print_uint64(4, "Pre-Boot Software Load Count", + spm.b.spm_pbslc, NULL, NULL); +} diff --git a/usr/src/man/man1m/Makefile b/usr/src/man/man1m/Makefile index 543928c16b..19a8cd0f88 100644 --- a/usr/src/man/man1m/Makefile +++ b/usr/src/man/man1m/Makefile @@ -554,7 +554,8 @@ _MANFILES= 6to4relay.1m \ i386_MANFILES= \ acpidump.1m \ - acpixtract.1m + acpixtract.1m \ + nvmeadm.1m sparc_MANFILES= cvcd.1m \ dcs.1m \ diff --git a/usr/src/man/man1m/nvmeadm.1m b/usr/src/man/man1m/nvmeadm.1m new file mode 100644 index 0000000000..9e1cfc1014 --- /dev/null +++ b/usr/src/man/man1m/nvmeadm.1m @@ -0,0 +1,410 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
+.\" +.Dd May 04, 2016 +.Dt NVMEADM 1M +.Os +.Sh NAME +.Nm nvmeadm +.Nd NVMe administration utility +.Sh SYNOPSIS +.Nm +.Fl h +.Op Ar command +.Nm +.Op Fl dv +.Cm list +.Op Ar ctl[/ns][,...] +.Nm +.Op Fl dv +.Cm identify +.Ar ctl[/ns][,...] +.Nm +.Op Fl dv +.Cm get-logpage +.Ar ctl[/ns][,...] +.Ar logpage +.Nm +.Op Fl dv +.Cm get-features +.Ar ctl[/ns][,...] +.Op Ar feature-list +.Nm +.Op Fl dv +.Cm format +.Ar ctl[/ns] +.Op Ar lba-format +.Nm +.Op Fl dv +.Cm secure-erase +.Ar ctl[/ns] +.Op Fl c +.Nm +.Op Fl dv +.Cm detach +.Ar ctl[/ns] +.Nm +.Op Fl dv +.Cm attach +.Ar ctl[/ns] +.Sh DESCRIPTION +The +.Nm +utility can be used to enumerate the NVMe controllers and their +namespaces, query hardware information from an NVMe controller or +namespace, and to format or secure-erase an NVMe controller or +namespace. +.Pp +The information returned by the hardware is printed by +.Nm +in a human-readable form where applicable. +Generally all 0-based counts are normalized and values may be +converted to human-readable units such as MB (megabytes), W (watts), +or C (degrees Celsius). +.Sh OPTIONS +The following options are supported: +.Bl -tag -width Ds +.It Fl h +Print a short help text for +.Nm , +or for an optionally specified +.Nm +command. +.It Fl d +Enable debugging output. +.It Fl v +Enable verbose output. +.El +.Sh ARGUMENTS +.Nm +expects the following kinds of arguments: +.Bl -tag -width "ctl[/ns]" +.It Ar command +Any command +.Nm +understands. +See section +.Sx COMMANDS . +.It Ar ctl[/ns] +Specifies an NVMe controller and optionally a namespace within that +controller. +The controller name consists of the driver name +.Qq nvme +followed by an instance number. +A namespace is specified by appending a single +.Qq / +followed by the namespace ID to the controller name. +The namespace ID is the EUI64 of the namespace, or a positive non-zero +decimal number if the namespace doesn't have an EUI64.
+For commands that don't change the device state, multiple controllers +and namespaces can be specified as a comma-separated list. +.Pp +The list of controllers and namespaces present in the system can be +queried with the +.Cm list +command without any arguments. +.It Ar logpage +Specifies the log page name for the +.Cm get-logpage +command. +.It Ar feature-list +A comma-separated list of feature names for the +.Cm get-features +command. +Feature names can be specified in upper or lower case and can be +shortened to the shortest unique name. +Some features may also have an alternative short name. +.It Ar lba-format +A non-zero integer specifying the LBA format for the +.Cm format +command. +The list of supported LBA formats on a namespace can be retrieved +with the +.Nm +.Cm identify +command. +.El +.Sh COMMANDS +.Bl -tag -width "" +.It Xo +.Nm +.Cm list +.Op Ar ctl[/ns][,...] +.Xc +Lists the NVMe controllers and their namespaces in the system and +prints a 1-line summary of their basic properties for each. +If a list of controllers and/or namespaces is given then the listing +is limited to those devices. +.It Xo +.Nm +.Cm identify +.Ar ctl[/ns][,...] +.Xc +Print detailed information about the specified controllers and/or +namespaces. +The information returned differs depending on whether a controller or +a namespace is specified. +For an explanation of the data printed by this command refer to the +description of the +.Qq IDENTIFY +admin command in the NVMe specification. +.It Xo +.Nm +.Cm get-logpage +.Ar ctl[/ns][,...] +.Ar logpage +.Xc +Print the specified log page of the specified controllers and/or namespaces. +Most log pages are only available on a per-controller basis. +Known log pages are: +.Bl -tag -width "firmware" +.It error +Error Information +.It health +SMART/Health Information. +A controller may support this log page on a per-namespace basis.
+.It firmware +Firmware Slot Information +.El +.Pp +For an explanation of the contents of the log pages refer to the +description of the +.Qq GET LOGPAGE +admin command in the NVMe specification. +.It Xo +.Nm +.Cm get-features +.Ar ctl[/ns][,...] +.Op Ar feature-list +.Xc +Prints information about the specified features, or all features if +none are given, of the specified controllers and/or namespaces. +Feature names are case-insensitive, and they can be shortened as long +as they remain unique. +Some features also have alternative short names to which the same +rules apply. +The following features are supported: +.Pp +.TS +tab(:); +l l l. +FULL NAME:SHORT NAME:CONTROLLER/NAMESPACE +Arbitration::controller +Power Management::controller +LBA Range Type:range:namespace +Temperature Threshold::controller +Error Recovery::controller +Volatile Write Cache:cache:controller +Number of Queues:queues:controller +Interrupt Coalescing:coalescing:controller +Interrupt Vector Configuration:vector:controller +Write Atomicity:atomicity:controller +Asynchronous Event Configuration:event:controller +Autonomous Power State Transition::controller +Software Progress Marker:progress:controller +.TE +.Pp +For an explanation of the individual features refer to the description +of the +.Qq SET FEATURES +admin command in the NVMe specification. +.It Xo +.Nm +.Cm format +.Ar ctl[/ns] +.Op Ar lba-format +.Xc +Formats the specified namespace or all namespaces of the specified +controller. +This command implies a +.Nm +.Cm detach +and subsequent +.Nm +.Cm attach +of the specified namespace(s), which will cause a changed LBA format +to be detected. +If no LBA format is specified the LBA format currently used by the +namespace will be used. +When formatting all namespaces without specifying a LBA format the LBA +format of namespace 1 will be used. +A list of LBA formats supported by a namespace can be queried with the +.Nm +.Cm identify +command. 
+.Pp +Note that not all devices support formatting individual or all +namespaces, or support formatting at all. +.Pp +LBA formats using a non-zero metadata size are not supported by +.Nm +or +.Xr nvme 7D . +.Pp +The list of supported LBA formats on a namespace can be retrieved +with the +.Nm +.Cm identify +command. +.It Xo +.Nm +.Cm secure-erase +.Ar ctl[/ns] +.Op Fl c +.Xc +Erases the specified namespace or all namespaces of the controller. +The flag +.Fl c +will cause a cryptographic erase instead of a normal erase. +This command implies a +.Nm +.Cm detach +and +.Nm +.Cm attach +of the specified namespace(s). +.Pp +Note that not all devices support erasing individual or all +namespaces, or support erasing at all. +.It Xo +.Nm +.Cm detach +.Ar ctl[/ns] +.Xc +Temporarily detaches the +.Xr blkdev 7D +instance from the specified namespace or all namespaces of the controller. +This will prevent I/O access to the affected namespace(s). +Detach will only succeed if the affected namespace(s) are not +currently opened. +The detached state will not persist across reboots or reloads of the +.Xr nvme 7D +driver. +.It Xo +.Nm +.Cm attach +.Ar ctl[/ns] +.Xc +Attaches the +.Xr blkdev 7D +instance to the specified namespace or all namespaces of the controller. +This will make I/O accesses to the namespace(s) possible again after a +previous +.Nm +.Cm detach +command. 
+.El +.Sh EXIT STATUS +.Ex -std +.Sh EXAMPLES +.Bl -tag -width "" +.It Sy Example 1: List all NVMe controllers and namespaces +.Bd -literal +# nvmeadm list +nvme1: model: INTEL SSDPEDMD800G4, serial: CVFT4134001R800CGN, FW rev: 8DV10049, NVMe v1.0 + nvme1/1 (c1t1d0): Size = 763097 MB, Capacity = 763097 MB, Used = 763097 MB +nvme4: model: SAMSUNG MZVPV128HDGM-00000, serial: S1XVNYAGA00640, FW rev: BXW7300Q, NVMe v1.1 + nvme4/1 (c2t2d0): Size = 122104 MB, Capacity = 122104 MB, Used = 5127 MB +.Ed +.It Sy Example 2: Identify a namespace +.Bd -literal +# nvmeadm identify nvme4/1 +nvme4/1: Identify Namespace + Namespace Capabilities and Features + Namespace Size: 122104MB + Namespace Capacity: 122104MB + Namespace Utilization: 5127MB + Namespace Features + Thin Provisioning: unsupported + Number of LBA Formats: 1 + Formatted LBA Size + LBA Format: 1 + Extended Data LBA: no + Metadata Capabilities + Extended Data LBA: unsupported + Separate Metadata: unsupported + End-to-End Data Protection Capabilities + Protection Information Type 1: unsupported + Protection Information Type 2: unsupported + Protection Information Type 3: unsupported + Protection Information first: unsupported + Protection Information last: unsupported + End-to-End Data Protection Settings + Protection Information: disabled + Protection Information in Metadata: last 8 bytes + LBA Format 1 + Metadata Size: 0 bytes + LBA Data Size: 512 bytes + Relative Performance: Best +.Ed +.It Sy Example 3: Get SMART/Health information (verbose) +.Bd -literal +# nvmeadm -v get-logpage nvme4/1 health +nvme4/1: SMART/Health Information + Critical Warnings + Available Space: OK + Temperature: OK + Device Reliability: OK + Media: OK + Volatile Memory Backup: OK + Temperature: 37C + Available Spare Capacity: 100% + Available Spare Threshold: 10% + Device Life Used: 0% + Data Read: 0GB + Data Written: 64GB + Read Commands: 52907 + Write Commands: 567874 + Controller Busy: 1min + Power Cycles: 6 + Power On: 141h + Unsafe 
Shutdowns: 1 + Uncorrectable Media Errors: 0 + Errors Logged: 1 +.Ed +.It Sy Example 4: Get Asynchronous Event Configuration information +.Bd -literal +# nvmeadm get-features nvme0,nvme4 event,power +nvme0: Get Features + Asynchronous Event Configuration + Available Space below threshold: disabled + Temperature above threshold: disabled + Device Reliability compromised: disabled + Media read-only: disabled + Power Management + Power State: 0 +nvme4: Get Features + Asynchronous Event Configuration + Available Space below threshold: disabled + Temperature above threshold: disabled + Device Reliability compromised: disabled + Media read-only: disabled + Volatile Memory Backup failed: disabled + Power Management + Power State: 0 +.Ed +.El +.Sh INTERFACE STABILITY +The command line interface of +.Nm +is +.Sy Evolving . +The output of +.Nm +is +.Sy Not-an-Interface +and may change any time. +.Sh SEE ALSO +.Xr nvme 7D +.Pp +.Lk http://www.nvmexpress.org/specifications/ "NVMe specifications" diff --git a/usr/src/pkg/manifests/driver-storage-nvme.mf b/usr/src/pkg/manifests/driver-storage-nvme.mf index 87b0954b7f..45b5a333f7 100644 --- a/usr/src/pkg/manifests/driver-storage-nvme.mf +++ b/usr/src/pkg/manifests/driver-storage-nvme.mf @@ -34,6 +34,7 @@ dir path=kernel group=sys dir path=kernel/drv group=sys dir path=kernel/drv/$(ARCH64) group=sys dir path=usr group=sys +dir path=usr/sbin dir path=usr/share dir path=usr/share/man dir path=usr/share/man/man7d @@ -41,5 +42,7 @@ driver name=nvme alias=pciexclass,010802 class=disk perms="* 0600 root sys" file path=kernel/drv/$(ARCH64)/nvme group=sys file path=kernel/drv/nvme group=sys file path=kernel/drv/nvme.conf group=sys +file path=usr/sbin/nvmeadm mode=0555 +file path=usr/share/man/man1m/nvmeadm.1m file path=usr/share/man/man7d/nvme.7d license lic_CDDL license=lic_CDDL diff --git a/usr/src/uts/common/io/blkdev/blkdev.c b/usr/src/uts/common/io/blkdev/blkdev.c index 0c80d15cfe..d3b96c9f8a 100644 --- 
a/usr/src/uts/common/io/blkdev/blkdev.c +++ b/usr/src/uts/common/io/blkdev/blkdev.c @@ -1819,6 +1819,16 @@ bd_attach_handle(dev_info_t *dip, bd_handle_t hdl) dev_info_t *child; bd_drive_t drive = { 0 }; + /* + * It's not an error if bd_attach_handle() is called on a handle that + * already is attached. We just ignore the request to attach and return. + * This way drivers using blkdev don't have to keep track of blkdev + * state; they can just call this function to make sure it is attached. + */ + if (hdl->h_child != NULL) { + return (DDI_SUCCESS); + } + /* if drivers don't override this, make it assume none */ drive.d_lun = -1; hdl->h_ops.o_drive_info(hdl->h_private, &drive); @@ -1882,6 +1892,12 @@ bd_detach_handle(bd_handle_t hdl) int rv; char *devnm; + /* + * It's not an error if bd_detach_handle() is called on a handle that + * already is detached. We just ignore the request to detach and return. + * This way drivers using blkdev don't have to keep track of blkdev + * state; they can just call this function to make sure it is detached. + */ if (hdl->h_child == NULL) { return (DDI_SUCCESS); } diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c index cb2e9bdd22..c87be0d3f0 100644 --- a/usr/src/uts/common/io/nvme/nvme.c +++ b/usr/src/uts/common/io/nvme/nvme.c @@ -83,6 +83,19 @@ * passes it to blkdev to use it in the device node names. As this is currently * untested namespaces with EUI64 are ignored by default. * + * We currently support only (1 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a + * single controller. This is an artificial limit imposed by the driver to be + * able to address a reasonable number of controllers and namespaces using a + * 32bit minor node number. + * + * + * Minor nodes: + * + * For each NVMe device the driver exposes one minor node for the controller and + * one minor node for each namespace. The only operations supported by those + * minor nodes are open(9E), close(9E), and ioctl(9E).
This serves as the + * interface for the nvmeadm(1M) utility. + * * * Blkdev Interface: * @@ -164,7 +177,6 @@ * - polled I/O support to support kernel core dumping * - FMA handling of media errors * - support for devices supporting very large I/O requests using chained PRPs - * - support for querying log pages from user space * - support for configuring hardware parameters like interrupt coalescing * - support for media formatting and hard partitioning into namespaces * - support for big-endian systems @@ -186,6 +198,7 @@ #include <sys/devops.h> #include <sys/ddi.h> #include <sys/sunddi.h> +#include <sys/sunndi.h> #include <sys/bitmap.h> #include <sys/sysmacros.h> #include <sys/param.h> @@ -196,6 +209,10 @@ #include <sys/atomic.h> #include <sys/archsystm.h> #include <sys/sata/sata_hba.h> +#include <sys/stat.h> +#include <sys/policy.h> + +#include <sys/nvme.h> #ifdef __x86 #include <sys/x86_archext.h> @@ -210,7 +227,10 @@ static const int nvme_version_major = 1; static const int nvme_version_minor = 2; /* tunable for admin command timeout in seconds, default is 1s */ -static volatile int nvme_admin_cmd_timeout = 1; +int nvme_admin_cmd_timeout = 1; + +/* tunable for FORMAT NVM command timeout in seconds, default is 600s */ +int nvme_format_cmd_timeout = 600; static int nvme_attach(dev_info_t *, ddi_attach_cmd_t); static int nvme_detach(dev_info_t *, ddi_detach_cmd_t); @@ -243,10 +263,14 @@ static inline int nvme_check_cmd_status(nvme_cmd_t *); static void nvme_abort_cmd(nvme_cmd_t *); static int nvme_async_event(nvme_t *); -static void *nvme_get_logpage(nvme_t *, uint8_t, ...); +static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t, + boolean_t, uint8_t); +static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...); static void *nvme_identify(nvme_t *, uint32_t); static boolean_t nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t, uint32_t *); +static boolean_t nvme_get_features(nvme_t *, uint32_t, uint8_t, uint32_t *, + void **, 
size_t *); static boolean_t nvme_write_cache_set(nvme_t *, boolean_t); static int nvme_set_nqueues(nvme_t *, uint16_t); @@ -283,6 +307,16 @@ static void nvme_prp_dma_destructor(void *, void *); static void nvme_prepare_devid(nvme_t *, uint32_t); +static int nvme_open(dev_t *, int, int, cred_t *); +static int nvme_close(dev_t, int, int, cred_t *); +static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); + +#define NVME_MINOR_INST_SHIFT 14 +#define NVME_MINOR(inst, nsid) (((inst) << NVME_MINOR_INST_SHIFT) | (nsid)) +#define NVME_MINOR_INST(minor) ((minor) >> NVME_MINOR_INST_SHIFT) +#define NVME_MINOR_NSID(minor) ((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1)) +#define NVME_MINOR_MAX (NVME_MINOR(1, 0) - 2) + static void *nvme_state; static kmem_cache_t *nvme_cmd_cache; @@ -358,6 +392,27 @@ static ddi_device_acc_attr_t nvme_reg_acc_attr = { .devacc_attr_dataorder = DDI_STRICTORDER_ACC }; +static struct cb_ops nvme_cb_ops = { + .cb_open = nvme_open, + .cb_close = nvme_close, + .cb_strategy = nodev, + .cb_print = nodev, + .cb_dump = nodev, + .cb_read = nodev, + .cb_write = nodev, + .cb_ioctl = nvme_ioctl, + .cb_devmap = nodev, + .cb_mmap = nodev, + .cb_segmap = nodev, + .cb_chpoll = nochpoll, + .cb_prop_op = ddi_prop_op, + .cb_str = 0, + .cb_flag = D_NEW | D_MP, + .cb_rev = CB_REV, + .cb_aread = nodev, + .cb_awrite = nodev +}; + static struct dev_ops nvme_dev_ops = { .devo_rev = DEVO_REV, .devo_refcnt = 0, @@ -367,7 +422,7 @@ static struct dev_ops nvme_dev_ops = { .devo_attach = nvme_attach, .devo_detach = nvme_detach, .devo_reset = nodev, - .devo_cb_ops = NULL, + .devo_cb_ops = &nvme_cb_ops, .devo_bus_ops = NULL, .devo_power = NULL, .devo_quiesce = nvme_quiesce, @@ -844,7 +899,8 @@ nvme_check_unknown_cmd_status(nvme_cmd_t *cmd) cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct, cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m); - bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); if 
(cmd->nc_nvme->n_strict_version) { cmd->nc_nvme->n_dead = B_TRUE; @@ -881,13 +937,15 @@ nvme_check_integrity_cmd_status(nvme_cmd_t *cmd) case NVME_CQE_SC_INT_NVM_WRITE: /* write fail */ /* TODO: post ereport */ - bd_error(cmd->nc_xfer, BD_ERR_MEDIA); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_MEDIA); return (EIO); case NVME_CQE_SC_INT_NVM_READ: /* read fail */ /* TODO: post ereport */ - bd_error(cmd->nc_xfer, BD_ERR_MEDIA); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_MEDIA); return (EIO); default: @@ -915,9 +973,11 @@ nvme_check_generic_cmd_status(nvme_cmd_t *cmd) case NVME_CQE_SC_GEN_INV_FLD: /* Invalid Field in Command */ - dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " - "invalid field in cmd %p", (void *)cmd); - return (0); + if (!cmd->nc_dontpanic) + dev_err(cmd->nc_nvme->n_dip, CE_PANIC, + "programming error: invalid field in cmd %p", + (void *)cmd); + return (EIO); case NVME_CQE_SC_GEN_ID_CNFL: /* Command ID Conflict */ @@ -927,9 +987,11 @@ nvme_check_generic_cmd_status(nvme_cmd_t *cmd) case NVME_CQE_SC_GEN_INV_NS: /* Invalid Namespace or Format */ - dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " - "invalid NS/format in cmd %p", (void *)cmd); - return (0); + if (!cmd->nc_dontpanic) + dev_err(cmd->nc_nvme->n_dip, CE_PANIC, + "programming error: " "invalid NS/format in cmd %p", + (void *)cmd); + return (EINVAL); case NVME_CQE_SC_GEN_NVM_LBA_RANGE: /* LBA Out Of Range */ @@ -944,7 +1006,8 @@ nvme_check_generic_cmd_status(nvme_cmd_t *cmd) /* Data Transfer Error (DMA) */ /* TODO: post ereport */ atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err); - bd_error(cmd->nc_xfer, BD_ERR_NTRDY); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_NTRDY); return (EIO); case NVME_CQE_SC_GEN_INTERNAL_ERR: @@ -955,7 +1018,8 @@ nvme_check_generic_cmd_status(nvme_cmd_t *cmd) * in the async event handler. 
*/ atomic_inc_32(&cmd->nc_nvme->n_internal_err); - bd_error(cmd->nc_xfer, BD_ERR_NTRDY); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_NTRDY); return (EIO); case NVME_CQE_SC_GEN_ABORT_REQUEST: @@ -981,13 +1045,15 @@ nvme_check_generic_cmd_status(nvme_cmd_t *cmd) case NVME_CQE_SC_GEN_NVM_CAP_EXC: /* Capacity Exceeded */ atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc); - bd_error(cmd->nc_xfer, BD_ERR_MEDIA); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_MEDIA); return (EIO); case NVME_CQE_SC_GEN_NVM_NS_NOTRDY: /* Namespace Not Ready */ atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy); - bd_error(cmd->nc_xfer, BD_ERR_NTRDY); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_NTRDY); return (EIO); default: @@ -1048,14 +1114,14 @@ nvme_check_specific_cmd_status(nvme_cmd_t *cmd) /* Invalid Log Page */ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE); atomic_inc_32(&cmd->nc_nvme->n_inv_log_page); - bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); return (EINVAL); case NVME_CQE_SC_SPC_INV_FORMAT: /* Invalid Format */ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT); atomic_inc_32(&cmd->nc_nvme->n_inv_format); - bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); return (EINVAL); case NVME_CQE_SC_SPC_INV_Q_DEL: @@ -1070,7 +1136,8 @@ nvme_check_specific_cmd_status(nvme_cmd_t *cmd) cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr); - bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); return (EINVAL); case NVME_CQE_SC_SPC_NVM_INV_PROT: @@ -1079,14 +1146,16 @@ nvme_check_specific_cmd_status(nvme_cmd_t *cmd) cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); atomic_inc_32(&cmd->nc_nvme->n_inv_prot); - bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); return (EINVAL); case 
NVME_CQE_SC_SPC_NVM_READONLY: /* Write to Read Only Range */ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); atomic_inc_32(&cmd->nc_nvme->n_readonly); - bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); return (EROFS); default: @@ -1281,6 +1350,7 @@ nvme_async_event_task(void *arg) nvme_t *nvme = cmd->nc_nvme; nvme_error_log_entry_t *error_log = NULL; nvme_health_log_t *health_log = NULL; + size_t logsize = 0; nvme_async_event_t event; int ret; @@ -1328,8 +1398,8 @@ nvme_async_event_task(void *arg) switch (event.b.ae_type) { case NVME_ASYNC_TYPE_ERROR: if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) { - error_log = (nvme_error_log_entry_t *) - nvme_get_logpage(nvme, event.b.ae_logpage); + (void) nvme_get_logpage(nvme, (void **)&error_log, + &logsize, event.b.ae_logpage); } else { dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " "async event reply: %d", event.b.ae_logpage); @@ -1379,8 +1449,8 @@ nvme_async_event_task(void *arg) case NVME_ASYNC_TYPE_HEALTH: if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) { - health_log = (nvme_health_log_t *) - nvme_get_logpage(nvme, event.b.ae_logpage, -1); + (void) nvme_get_logpage(nvme, (void **)&health_log, + &logsize, event.b.ae_logpage, -1); } else { dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " "async event reply: %d", event.b.ae_logpage); @@ -1427,11 +1497,10 @@ nvme_async_event_task(void *arg) } if (error_log) - kmem_free(error_log, sizeof (nvme_error_log_entry_t) * - nvme->n_error_log_len); + kmem_free(error_log, logsize); if (health_log) - kmem_free(health_log, sizeof (nvme_health_log_t)); + kmem_free(health_log, logsize); } static int @@ -1485,14 +1554,58 @@ nvme_async_event(nvme_t *nvme) return (DDI_SUCCESS); } -static void * -nvme_get_logpage(nvme_t *nvme, uint8_t logpage, ...) 
+static int +nvme_format_nvm(nvme_t *nvme, uint32_t nsid, uint8_t lbaf, boolean_t ms, + uint8_t pi, boolean_t pil, uint8_t ses) +{ /* + * Issue a FORMAT NVM admin command for namespace nsid. The arguments are + * packed into command dword 10: lbaf is masked to 4 bits, pi and ses to + * 3 bits each, ms and pil are reduced to a single bit. Returns EIO if + * nvme_admin_cmd() fails, otherwise the value of nvme_check_cmd_status(). + */ + nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); + nvme_format_nvm_t format_nvm = { 0 }; + int ret; + + format_nvm.b.fm_lbaf = lbaf & 0xf; + format_nvm.b.fm_ms = ms ? 1 : 0; + format_nvm.b.fm_pi = pi & 0x7; + format_nvm.b.fm_pil = pil ? 1 : 0; + format_nvm.b.fm_ses = ses & 0x7; + + cmd->nc_sqid = 0; + cmd->nc_callback = nvme_wakeup_cmd; + cmd->nc_sqe.sqe_nsid = nsid; + cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT; + cmd->nc_sqe.sqe_cdw10 = format_nvm.r; + + /* + * Some devices like Samsung SM951 don't allow formatting of all + * namespaces (nsid of -1) in one command. Handle that gracefully: + * with nc_dontpanic set the status check does not panic on the + * resulting "invalid field"/"invalid namespace" completion. + */ + if (nsid == (uint32_t)-1) + cmd->nc_dontpanic = B_TRUE; + + if ((ret = nvme_admin_cmd(cmd, nvme_format_cmd_timeout)) + != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_admin_cmd failed for FORMAT NVM"); + return (EIO); /* NOTE(review): cmd is not freed on this path; presumably deliberate, confirm */ + } + + if ((ret = nvme_check_cmd_status(cmd)) != 0) { + dev_err(nvme->n_dip, CE_WARN, + "!FORMAT failed with sct = %x, sc = %x", + cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); + } + + nvme_free_cmd(cmd); + return (ret); +} + +static int +nvme_get_logpage(nvme_t *nvme, void **buf, size_t *bufsize, uint8_t logpage, + ...) { nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); - void *buf = NULL; nvme_getlogpage_t getlogpage = { 0 }; - size_t bufsize; va_list ap; + int ret = DDI_FAILURE; va_start(ap, logpage); @@ -1505,18 +1618,22 @@ nvme_get_logpage(nvme_t *nvme, uint8_t logpage, ...) switch (logpage) { case NVME_LOGPAGE_ERROR: cmd->nc_sqe.sqe_nsid = (uint32_t)-1; - bufsize = nvme->n_error_log_len * - sizeof (nvme_error_log_entry_t); + /* + * The GET LOG PAGE command can use at most 2 pages to return + * data, PRP lists are not supported.
+ */ + *bufsize = MIN(2 * nvme->n_pagesize, + nvme->n_error_log_len * sizeof (nvme_error_log_entry_t)); break; case NVME_LOGPAGE_HEALTH: cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t); - bufsize = sizeof (nvme_health_log_t); + *bufsize = sizeof (nvme_health_log_t); break; case NVME_LOGPAGE_FWSLOT: cmd->nc_sqe.sqe_nsid = (uint32_t)-1; - bufsize = sizeof (nvme_fwslot_log_t); + *bufsize = sizeof (nvme_fwslot_log_t); break; default: @@ -1528,7 +1645,7 @@ nvme_get_logpage(nvme_t *nvme, uint8_t logpage, ...) va_end(ap); - getlogpage.b.lp_numd = bufsize / sizeof (uint32_t) - 1; + getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1; cmd->nc_sqe.sqe_cdw10 = getlogpage.r; @@ -1557,7 +1674,7 @@ nvme_get_logpage(nvme_t *nvme, uint8_t logpage, ...) if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) { dev_err(nvme->n_dip, CE_WARN, "!nvme_admin_cmd failed for GET LOG PAGE"); - return (NULL); + return (ret); } if (nvme_check_cmd_status(cmd)) { @@ -1567,13 +1684,15 @@ nvme_get_logpage(nvme_t *nvme, uint8_t logpage, ...) 
goto fail; } - buf = kmem_alloc(bufsize, KM_SLEEP); - bcopy(cmd->nc_dma->nd_memp, buf, bufsize); + *buf = kmem_alloc(*bufsize, KM_SLEEP); + bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize); + + ret = DDI_SUCCESS; fail: nvme_free_cmd(cmd); - return (buf); + return (ret); } static void * @@ -1684,6 +1803,130 @@ fail: } static boolean_t +nvme_get_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t *res, + void **buf, size_t *bufsize) +{ /* + * Issue a GET FEATURES admin command for the given feature. *res supplies + * command dword 11 on entry and receives completion dword 0 on success. + * For NVME_FEAT_LBA_RANGE and NVME_FEAT_AUTO_PST the returned data is + * copied into a kmem_alloc()ed buffer of *bufsize bytes handed back + * through *buf. Returns B_TRUE on success and B_FALSE otherwise. + */ + nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); + boolean_t ret = B_FALSE; + + ASSERT(res != NULL); + + if (bufsize != NULL) + *bufsize = 0; + + cmd->nc_sqid = 0; + cmd->nc_callback = nvme_wakeup_cmd; + cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES; + cmd->nc_sqe.sqe_cdw10 = feature; + cmd->nc_sqe.sqe_cdw11 = *res; + + switch (feature) { + case NVME_FEAT_ARBITRATION: + case NVME_FEAT_POWER_MGMT: + case NVME_FEAT_TEMPERATURE: + case NVME_FEAT_ERROR: + case NVME_FEAT_NQUEUES: + case NVME_FEAT_INTR_COAL: + case NVME_FEAT_INTR_VECT: + case NVME_FEAT_WRITE_ATOM: + case NVME_FEAT_ASYNC_EVENT: + case NVME_FEAT_PROGRESS: + break; + + case NVME_FEAT_WRITE_CACHE: + if (!nvme->n_write_cache_present) + goto fail; + break; + + case NVME_FEAT_LBA_RANGE: + if (!nvme->n_lba_range_supported) + goto fail; + + /* + * The LBA Range Type feature is optional. There doesn't seem + * to be a method of detecting whether it is supported other than + * using it. This will cause an "invalid field in command" error, + * which is normally considered a programming error and causes + * panic in nvme_check_generic_cmd_status().
+ */ + cmd->nc_dontpanic = B_TRUE; + cmd->nc_sqe.sqe_nsid = nsid; + ASSERT(bufsize != NULL); + *bufsize = NVME_LBA_RANGE_BUFSIZE; + + break; + + case NVME_FEAT_AUTO_PST: + if (!nvme->n_auto_pst_supported) + goto fail; + + ASSERT(bufsize != NULL); + *bufsize = NVME_AUTO_PST_BUFSIZE; + break; + + default: + goto fail; + } + + if (bufsize != NULL && *bufsize != 0) { + if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ, + &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_zalloc_dma failed for GET FEATURES"); + goto fail; + } + + if (cmd->nc_dma->nd_ncookie > 2) { + dev_err(nvme->n_dip, CE_WARN, + "!too many DMA cookies for GET FEATURES"); + atomic_inc_32(&nvme->n_too_many_cookies); + goto fail; + } + + cmd->nc_sqe.sqe_dptr.d_prp[0] = + cmd->nc_dma->nd_cookie.dmac_laddress; + if (cmd->nc_dma->nd_ncookie > 1) { + ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, + &cmd->nc_dma->nd_cookie); + cmd->nc_sqe.sqe_dptr.d_prp[1] = + cmd->nc_dma->nd_cookie.dmac_laddress; + } + } + + if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_admin_cmd failed for GET FEATURES"); + return (ret); /* NOTE(review): cmd is not freed on this path; presumably deliberate, confirm */ + } + + if (nvme_check_cmd_status(cmd)) { + /* an "invalid field" reply to LBA Range Type marks the feature unsupported */ + if (feature == NVME_FEAT_LBA_RANGE && + cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && + cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) + nvme->n_lba_range_supported = B_FALSE; + else + dev_err(nvme->n_dip, CE_WARN, + "!GET FEATURES %d failed with sct = %x, sc = %x", + feature, cmd->nc_cqe.cqe_sf.sf_sct, + cmd->nc_cqe.cqe_sf.sf_sc); + goto fail; + } + + if (bufsize != NULL && *bufsize != 0) { + ASSERT(buf != NULL); + *buf = kmem_alloc(*bufsize, KM_SLEEP); + bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize); + } + + *res = cmd->nc_cqe.cqe_dw0; + ret = B_TRUE; + +fail: + nvme_free_cmd(cmd); + return (ret); +} + +static boolean_t nvme_write_cache_set(nvme_t *nvme, boolean_t enable) { nvme_write_cache_t nwc = { 0 }; @@ -1700,7 +1943,7 @@ nvme_write_cache_set(nvme_t
*nvme, boolean_t enable) static int nvme_set_nqueues(nvme_t *nvme, uint16_t nqueues) { - nvme_nqueue_t nq = { 0 }; + nvme_nqueues_t nq = { 0 }; nq.b.nq_nsq = nq.b.nq_ncq = nqueues - 1; @@ -1866,6 +2109,89 @@ nvme_prepare_devid(nvme_t *nvme, uint32_t nsid) } static int +nvme_init_ns(nvme_t *nvme, int nsid) +{ /* + * Identify namespace nsid and fill in its slot nvme->n_ns[nsid - 1]: + * identify data, id, block count, block size, name, best block size and + * the ns_ignore flag. Returns DDI_FAILURE if nvme_identify() returns + * NULL, DDI_SUCCESS otherwise. + */ + nvme_namespace_t *ns = &nvme->n_ns[nsid - 1]; + nvme_identify_nsid_t *idns; + int last_rp; + + ns->ns_nvme = nvme; + idns = nvme_identify(nvme, nsid); + + if (idns == NULL) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to identify namespace %d", nsid); + return (DDI_FAILURE); + } + + ns->ns_idns = idns; + ns->ns_id = nsid; + ns->ns_block_count = idns->id_nsize; + ns->ns_block_size = + 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; + ns->ns_best_block_size = ns->ns_block_size; + + /* + * Get the EUI64 if present. Use it for devid and device node names. + */ + if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) + bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64)); + + /*LINTED: E_BAD_PTR_CAST_ALIGN*/ + if (*(uint64_t *)ns->ns_eui64 != 0) { + uint8_t *eui64 = ns->ns_eui64; + + (void) snprintf(ns->ns_name, sizeof (ns->ns_name), + "%02x%02x%02x%02x%02x%02x%02x%02x", + eui64[0], eui64[1], eui64[2], eui64[3], + eui64[4], eui64[5], eui64[6], eui64[7]); + } else { + (void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%d", + ns->ns_id); + + nvme_prepare_devid(nvme, ns->ns_id); + } + + /* + * Find the LBA format with no metadata and the best relative + * performance. A value of 3 means "degraded", 0 is best.
+ */ + last_rp = 3; /* seeded at 3, so a format with lbaf_rp of 3 is never picked below */ + for (int j = 0; j <= idns->id_nlbaf; j++) { + if (idns->id_lbaf[j].lbaf_lbads == 0) + break; + if (idns->id_lbaf[j].lbaf_ms != 0) + continue; + if (idns->id_lbaf[j].lbaf_rp >= last_rp) + continue; + last_rp = idns->id_lbaf[j].lbaf_rp; + ns->ns_best_block_size = + 1 << idns->id_lbaf[j].lbaf_lbads; + } + + /* never report a best block size below the controller-wide minimum */ + if (ns->ns_best_block_size < nvme->n_min_block_size) + ns->ns_best_block_size = nvme->n_min_block_size; + + /* + * We currently don't support namespaces that use either: + * - thin provisioning + * - protection information + */ + if (idns->id_nsfeat.f_thin || + idns->id_dps.dp_pinfo) { + dev_err(nvme->n_dip, CE_WARN, + "!ignoring namespace %d, unsupported features: " + "thin = %d, pinfo = %d", nsid, + idns->id_nsfeat.f_thin, idns->id_dps.dp_pinfo); + ns->ns_ignore = B_TRUE; + } else { + ns->ns_ignore = B_FALSE; + } + + return (DDI_SUCCESS); +} + +static int nvme_init(nvme_t *nvme) { nvme_reg_cc_t cc = { 0 }; @@ -2150,90 +2476,37 @@ nvme_init(nvme_t *nvme) nvme->n_write_cache_enabled ? 1 : 0); /* - * Grab a copy of all mandatory log pages. - * - * TODO: should go away once user space tool exists to print logs + * Assume LBA Range Type feature is supported. If it isn't this + * will be set to B_FALSE by nvme_get_features(). */ - nvme->n_error_log = (nvme_error_log_entry_t *) - nvme_get_logpage(nvme, NVME_LOGPAGE_ERROR); - nvme->n_health_log = (nvme_health_log_t *) - nvme_get_logpage(nvme, NVME_LOGPAGE_HEALTH, -1); - nvme->n_fwslot_log = (nvme_fwslot_log_t *) - nvme_get_logpage(nvme, NVME_LOGPAGE_FWSLOT); + nvme->n_lba_range_supported = B_TRUE; + + /* + * Check support for Autonomous Power State Transition. + */ + if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) + nvme->n_auto_pst_supported = + nvme->n_idctl->id_apsta.ap_sup == 0 ?
B_FALSE : B_TRUE; /* * Identify Namespaces */ nvme->n_namespace_count = nvme->n_idctl->id_nn; + if (nvme->n_namespace_count > NVME_MINOR_MAX) { + dev_err(nvme->n_dip, CE_WARN, + "!too many namespaces: %d, limiting to %d\n", + nvme->n_namespace_count, NVME_MINOR_MAX); + nvme->n_namespace_count = NVME_MINOR_MAX; + } + nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) * nvme->n_namespace_count, KM_SLEEP); for (i = 0; i != nvme->n_namespace_count; i++) { - nvme_identify_nsid_t *idns; - int last_rp; - - nvme->n_ns[i].ns_nvme = nvme; - nvme->n_ns[i].ns_idns = idns = nvme_identify(nvme, i + 1); - - if (idns == NULL) { - dev_err(nvme->n_dip, CE_WARN, - "!failed to identify namespace %d", i + 1); + mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER, + NULL); + if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS) goto fail; - } - - nvme->n_ns[i].ns_id = i + 1; - nvme->n_ns[i].ns_block_count = idns->id_nsize; - nvme->n_ns[i].ns_block_size = - 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; - nvme->n_ns[i].ns_best_block_size = nvme->n_ns[i].ns_block_size; - - /* - * Get the EUI64 if present. If not present prepare the devid - * from other device data. - */ - if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) - bcopy(idns->id_eui64, nvme->n_ns[i].ns_eui64, - sizeof (nvme->n_ns[i].ns_eui64)); - - /*LINTED: E_BAD_PTR_CAST_ALIGN*/ - if (*(uint64_t *)nvme->n_ns[i].ns_eui64 == 0) { - nvme_prepare_devid(nvme, nvme->n_ns[i].ns_id); - } - - /* - * Find the LBA format with no metadata and the best relative - * performance. A value of 3 means "degraded", 0 is best. 
- */ - last_rp = 3; - for (int j = 0; j <= idns->id_nlbaf; j++) { - if (idns->id_lbaf[j].lbaf_lbads == 0) - break; - if (idns->id_lbaf[j].lbaf_ms != 0) - continue; - if (idns->id_lbaf[j].lbaf_rp >= last_rp) - continue; - last_rp = idns->id_lbaf[j].lbaf_rp; - nvme->n_ns[i].ns_best_block_size = - 1 << idns->id_lbaf[j].lbaf_lbads; - } - - if (nvme->n_ns[i].ns_best_block_size < nvme->n_min_block_size) - nvme->n_ns[i].ns_best_block_size = - nvme->n_min_block_size; - - /* - * We currently don't support namespaces that use either: - * - thin provisioning - * - protection information - */ - if (idns->id_nsfeat.f_thin || - idns->id_dps.dp_pinfo) { - dev_err(nvme->n_dip, CE_WARN, - "!ignoring namespace %d, unsupported features: " - "thin = %d, pinfo = %d", i + 1, - idns->id_nsfeat.f_thin, idns->id_dps.dp_pinfo); - nvme->n_ns[i].ns_ignore = B_TRUE; - } } /* @@ -2520,6 +2793,8 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) ddi_set_driver_private(dip, nvme); nvme->n_dip = dip; + mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL); + nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE; nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY, @@ -2640,6 +2915,14 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) * Attach the blkdev driver for each namespace. 
*/ for (i = 0; i != nvme->n_namespace_count; i++) { + if (ddi_create_minor_node(nvme->n_dip, nvme->n_ns[i].ns_name, + S_IFCHR, NVME_MINOR(ddi_get_instance(nvme->n_dip), i + 1), + DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, + "!failed to create minor node for namespace %d", i); + goto fail; + } + if (nvme->n_ns[i].ns_ignore) continue; @@ -2661,6 +2944,14 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) } } + if (ddi_create_minor_node(dip, "devctl", S_IFCHR, + NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0) + != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "nvme_attach: " + "cannot create devctl minor node"); + goto fail; + } + return (DDI_SUCCESS); fail: @@ -2689,8 +2980,14 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) if (nvme == NULL) return (DDI_FAILURE); + ddi_remove_minor_node(dip, "devctl"); + mutex_destroy(&nvme->n_minor.nm_mutex); + if (nvme->n_ns) { for (i = 0; i != nvme->n_namespace_count; i++) { + ddi_remove_minor_node(dip, nvme->n_ns[i].ns_name); + mutex_destroy(&nvme->n_ns[i].ns_minor.nm_mutex); + if (nvme->n_ns[i].ns_bd_hdl) { (void) bd_detach_handle( nvme->n_ns[i].ns_bd_hdl); @@ -2745,7 +3042,7 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) nvme_free_qpair(nvme->n_adminq); if (nvme->n_idctl) - kmem_free(nvme->n_idctl, sizeof (nvme_identify_ctrl_t)); + kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE); if (nvme->n_progress & NVME_REGS_MAPPED) ddi_regs_map_free(&nvme->n_regh); @@ -3042,3 +3339,531 @@ nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid) strlen(ns->ns_devid), ns->ns_devid, devid)); } } + +static int +nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ +#ifndef __lock_lint + _NOTE(ARGUNUSED(cred_p)); +#endif + minor_t minor = getminor(*devp); + nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); + int nsid = NVME_MINOR_NSID(minor); + nvme_minor_state_t *nm; + int rv = 0; + + if (otyp != OTYP_CHR) + return (EINVAL); + + if (nvme == NULL) + 
return (ENXIO); + + if (nsid > nvme->n_namespace_count) + return (ENXIO); + + nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; + + mutex_enter(&nm->nm_mutex); + if (nm->nm_oexcl) { + rv = EBUSY; + goto out; + } + + if (flag & FEXCL) { + if (nm->nm_ocnt != 0) { + rv = EBUSY; + goto out; + } + nm->nm_oexcl = B_TRUE; + } + + nm->nm_ocnt++; + +out: + mutex_exit(&nm->nm_mutex); + return (rv); + +} + +static int +nvme_close(dev_t dev, int flag, int otyp, cred_t *cred_p) +{ +#ifndef __lock_lint + _NOTE(ARGUNUSED(cred_p)); + _NOTE(ARGUNUSED(flag)); +#endif + minor_t minor = getminor(dev); + nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); + int nsid = NVME_MINOR_NSID(minor); + nvme_minor_state_t *nm; + + if (otyp != OTYP_CHR) + return (ENXIO); + + if (nvme == NULL) + return (ENXIO); + + if (nsid > nvme->n_namespace_count) + return (ENXIO); + + nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; + + mutex_enter(&nm->nm_mutex); + if (nm->nm_oexcl) + nm->nm_oexcl = B_FALSE; + + ASSERT(nm->nm_ocnt > 0); + nm->nm_ocnt--; + mutex_exit(&nm->nm_mutex); + + return (0); +} + +static int +nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, + cred_t *cred_p) +{ + _NOTE(ARGUNUSED(cred_p)); + int rv = 0; + void *idctl; + + if ((mode & FREAD) == 0) + return (EPERM); + + if (nioc->n_len < NVME_IDENTIFY_BUFSIZE) + return (EINVAL); + + idctl = nvme_identify(nvme, nsid); + if (idctl == NULL) + return (EIO); + + if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode) + != 0) + rv = EFAULT; + + kmem_free(idctl, NVME_IDENTIFY_BUFSIZE); + + return (rv); +} + +static int +nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, + int mode, cred_t *cred_p) +{ + _NOTE(ARGUNUSED(nsid, cred_p)); + int rv = 0; + nvme_reg_cap_t cap = { 0 }; + nvme_capabilities_t nc; + + if ((mode & FREAD) == 0) + return (EPERM); + + if (nioc->n_len < sizeof (nc)) + return (EINVAL); + + cap.r = nvme_get64(nvme, 
NVME_REG_CAP); + + /* + * The MPSMIN and MPSMAX fields in the CAP register use 0 to + * specify the base page size of 4k (1<<12), so add 12 here to + * get the real page size value. + */ + nc.mpsmax = 1 << (12 + cap.b.cap_mpsmax); + nc.mpsmin = 1 << (12 + cap.b.cap_mpsmin); + + if (ddi_copyout(&nc, (void *)nioc->n_buf, sizeof (nc), mode) != 0) + rv = EFAULT; + + return (rv); +} + +static int +nvme_ioctl_get_logpage(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, + int mode, cred_t *cred_p) +{ + _NOTE(ARGUNUSED(cred_p)); + void *log = NULL; + size_t bufsize = 0; + int rv = 0; + + if ((mode & FREAD) == 0) + return (EPERM); + + switch (nioc->n_arg) { + case NVME_LOGPAGE_ERROR: + if (nsid != 0) + return (EINVAL); + break; + case NVME_LOGPAGE_HEALTH: + if (nsid != 0 && nvme->n_idctl->id_lpa.lp_smart == 0) + return (EINVAL); + + if (nsid == 0) + nsid = (uint32_t)-1; + + break; + case NVME_LOGPAGE_FWSLOT: + if (nsid != 0) + return (EINVAL); + break; + default: + return (EINVAL); + } + + if (nvme_get_logpage(nvme, &log, &bufsize, nioc->n_arg, nsid) + != DDI_SUCCESS) + return (EIO); + + if (nioc->n_len < bufsize) { + kmem_free(log, bufsize); + return (EINVAL); + } + + if (ddi_copyout(log, (void *)nioc->n_buf, bufsize, mode) != 0) + rv = EFAULT; + + nioc->n_len = bufsize; + kmem_free(log, bufsize); + + return (rv); +} + +static int +nvme_ioctl_get_features(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, + int mode, cred_t *cred_p) +{ + _NOTE(ARGUNUSED(cred_p)); + void *buf = NULL; + size_t bufsize = 0; + uint32_t res = 0; + uint8_t feature; + int rv = 0; + + if ((mode & FREAD) == 0) + return (EPERM); + + if ((nioc->n_arg >> 32) > 0xff) + return (EINVAL); + + feature = (uint8_t)(nioc->n_arg >> 32); + + switch (feature) { + case NVME_FEAT_ARBITRATION: + case NVME_FEAT_POWER_MGMT: + case NVME_FEAT_TEMPERATURE: + case NVME_FEAT_ERROR: + case NVME_FEAT_NQUEUES: + case NVME_FEAT_INTR_COAL: + case NVME_FEAT_WRITE_ATOM: + case NVME_FEAT_ASYNC_EVENT: + case NVME_FEAT_PROGRESS: + if (nsid != 
0) + return (EINVAL); + break; + + case NVME_FEAT_INTR_VECT: + if (nsid != 0) + return (EINVAL); + + res = nioc->n_arg & 0xffffffffUL; + if (res >= nvme->n_intr_cnt) + return (EINVAL); + break; + + case NVME_FEAT_LBA_RANGE: + if (nvme->n_lba_range_supported == B_FALSE) + return (EINVAL); + + if (nsid == 0 || + nsid > nvme->n_namespace_count) + return (EINVAL); + + break; + + case NVME_FEAT_WRITE_CACHE: + if (nsid != 0) + return (EINVAL); + + if (!nvme->n_write_cache_present) + return (EINVAL); + + break; + + case NVME_FEAT_AUTO_PST: + if (nsid != 0) + return (EINVAL); + + if (!nvme->n_auto_pst_supported) + return (EINVAL); + + break; + + default: + return (EINVAL); + } + + if (nvme_get_features(nvme, nsid, feature, &res, &buf, &bufsize) == + B_FALSE) + return (EIO); + + if (nioc->n_len < bufsize) { + kmem_free(buf, bufsize); + return (EINVAL); + } + + if (buf && ddi_copyout(buf, (void*)nioc->n_buf, bufsize, mode) != 0) + rv = EFAULT; + + kmem_free(buf, bufsize); + nioc->n_arg = res; + nioc->n_len = bufsize; + + return (rv); +} + +static int +nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, + cred_t *cred_p) +{ + _NOTE(ARGUNUSED(nsid, mode, cred_p)); + + if ((mode & FREAD) == 0) + return (EPERM); + + nioc->n_arg = nvme->n_intr_cnt; + return (0); +} + +static int +nvme_ioctl_version(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, + cred_t *cred_p) +{ + _NOTE(ARGUNUSED(nsid, cred_p)); + int rv = 0; + + if ((mode & FREAD) == 0) + return (EPERM); + + if (nioc->n_len < sizeof (nvme->n_version)) + return (ENOMEM); + + if (ddi_copyout(&nvme->n_version, (void *)nioc->n_buf, + sizeof (nvme->n_version), mode) != 0) + rv = EFAULT; + + return (rv); +} + +static int +nvme_ioctl_format(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, + cred_t *cred_p) +{ + _NOTE(ARGUNUSED(mode)); + nvme_format_nvm_t frmt = { 0 }; + int c_nsid = nsid != 0 ? 
nsid - 1 : 0; + + if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) + return (EPERM); + + frmt.r = nioc->n_arg & 0xffffffff; + + /* + * Check whether the FORMAT NVM command is supported. + */ + if (nvme->n_idctl->id_oacs.oa_format == 0) + return (EINVAL); + + /* + * Don't allow format or secure erase of individual namespace if that + * would cause a format or secure erase of all namespaces. + */ + if (nsid != 0 && nvme->n_idctl->id_fna.fn_format != 0) + return (EINVAL); + + if (nsid != 0 && frmt.b.fm_ses != NVME_FRMT_SES_NONE && + nvme->n_idctl->id_fna.fn_sec_erase != 0) + return (EINVAL); + + /* + * Don't allow formatting with Protection Information. + */ + if (frmt.b.fm_pi != 0 || frmt.b.fm_pil != 0 || frmt.b.fm_ms != 0) + return (EINVAL); + + /* + * Don't allow formatting using an illegal LBA format, or any LBA format + * that uses metadata. + */ + if (frmt.b.fm_lbaf > nvme->n_ns[c_nsid].ns_idns->id_nlbaf || + nvme->n_ns[c_nsid].ns_idns->id_lbaf[frmt.b.fm_lbaf].lbaf_ms != 0) + return (EINVAL); + + /* + * Don't allow formatting using an illegal Secure Erase setting. 
+ */ + if (frmt.b.fm_ses > NVME_FRMT_MAX_SES || + (frmt.b.fm_ses == NVME_FRMT_SES_CRYPTO && + nvme->n_idctl->id_fna.fn_crypt_erase == 0)) + return (EINVAL); + + if (nsid == 0) + nsid = (uint32_t)-1; + + return (nvme_format_nvm(nvme, nsid, frmt.b.fm_lbaf, B_FALSE, 0, B_FALSE, + frmt.b.fm_ses)); +} + +static int +nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, + cred_t *cred_p) +{ + _NOTE(ARGUNUSED(nioc, mode)); + int rv = 0; + + if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) + return (EPERM); + + if (nsid == 0) + return (EINVAL); + + rv = bd_detach_handle(nvme->n_ns[nsid - 1].ns_bd_hdl); + if (rv != DDI_SUCCESS) + rv = EBUSY; + + return (rv); +} + +static int +nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, + cred_t *cred_p) +{ + _NOTE(ARGUNUSED(nioc, mode)); + nvme_identify_nsid_t *idns; + int rv = 0; + + if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) + return (EPERM); + + if (nsid == 0) + return (EINVAL); + + /* + * Identify namespace again, free old identify data. 
+ */ + idns = nvme->n_ns[nsid - 1].ns_idns; + if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS) + return (EIO); + + kmem_free(idns, sizeof (nvme_identify_nsid_t)); + + rv = bd_attach_handle(nvme->n_dip, nvme->n_ns[nsid - 1].ns_bd_hdl); + if (rv != DDI_SUCCESS) + rv = EBUSY; + + return (rv); +} + +static int +nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, + int *rval_p) +{ +#ifndef __lock_lint + _NOTE(ARGUNUSED(rval_p)); +#endif + minor_t minor = getminor(dev); + nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); + int nsid = NVME_MINOR_NSID(minor); + int rv = 0; + nvme_ioctl_t nioc; + + int (*nvme_ioctl[])(nvme_t *, int, nvme_ioctl_t *, int, cred_t *) = { + NULL, + nvme_ioctl_identify, + nvme_ioctl_identify, + nvme_ioctl_capabilities, + nvme_ioctl_get_logpage, + nvme_ioctl_get_features, + nvme_ioctl_intr_cnt, + nvme_ioctl_version, + nvme_ioctl_format, + nvme_ioctl_detach, + nvme_ioctl_attach + }; + + if (nvme == NULL) + return (ENXIO); + + if (nsid > nvme->n_namespace_count) + return (ENXIO); + + if (IS_DEVCTL(cmd)) + return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0)); + +#ifdef _MULTI_DATAMODEL + switch (ddi_model_convert_from(mode & FMODELS)) { + case DDI_MODEL_ILP32: { + nvme_ioctl32_t nioc32; + if (ddi_copyin((void*)arg, &nioc32, sizeof (nvme_ioctl32_t), + mode) != 0) + return (EFAULT); + nioc.n_len = nioc32.n_len; + nioc.n_buf = nioc32.n_buf; + nioc.n_arg = nioc32.n_arg; + break; + } + case DDI_MODEL_NONE: +#endif + if (ddi_copyin((void*)arg, &nioc, sizeof (nvme_ioctl_t), mode) + != 0) + return (EFAULT); +#ifdef _MULTI_DATAMODEL + break; + } +#endif + + if (cmd == NVME_IOC_IDENTIFY_CTRL) { + /* + * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and + * attachment point nodes. + */ + nsid = 0; + } else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) { + /* + * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node, it + * will always return identify data for namespace 1. 
+ */ + nsid = 1; + } + + if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL) + rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode, + cred_p); + else + rv = EINVAL; + +#ifdef _MULTI_DATAMODEL + switch (ddi_model_convert_from(mode & FMODELS)) { + case DDI_MODEL_ILP32: { + nvme_ioctl32_t nioc32; + + nioc32.n_len = (size32_t)nioc.n_len; + nioc32.n_buf = (uintptr32_t)nioc.n_buf; + nioc32.n_arg = nioc.n_arg; + + if (ddi_copyout(&nioc32, (void *)arg, sizeof (nvme_ioctl32_t), + mode) != 0) + return (EFAULT); + break; + } + case DDI_MODEL_NONE: +#endif + if (ddi_copyout(&nioc, (void *)arg, sizeof (nvme_ioctl_t), mode) + != 0) + return (EFAULT); +#ifdef _MULTI_DATAMODEL + break; + } +#endif + + return (rv); +} diff --git a/usr/src/uts/common/io/nvme/nvme_reg.h b/usr/src/uts/common/io/nvme/nvme_reg.h index 3e4b77079b..acff0e2362 100644 --- a/usr/src/uts/common/io/nvme/nvme_reg.h +++ b/usr/src/uts/common/io/nvme/nvme_reg.h @@ -20,6 +20,8 @@ #ifndef _NVME_REG_H #define _NVME_REG_H +#include <sys/nvme.h> + #pragma pack(1) #ifdef __cplusplus @@ -33,22 +35,6 @@ extern "C" { #define NVME_MAX_ADMIN_QUEUE_LEN 4096 /* - * NVMe version - */ -typedef struct { - uint16_t v_minor; - uint16_t v_major; -} nvme_version_t; - -#define NVME_VERSION_ATLEAST(v, maj, min) \ - (((v)->v_major) > (maj) || \ - ((v)->v_major == (maj) && (v)->v_minor >= (min))) - -#define NVME_VERSION_HIGHER(v, maj, min) \ - (((v)->v_major) > (maj) || \ - ((v)->v_major == (maj) && (v)->v_minor > (min))) - -/* * NVMe registers and register fields */ #define NVME_REG_CAP 0x0 /* Controller Capabilities */ @@ -258,15 +244,6 @@ typedef struct { * NVMe completion queue entry */ typedef struct { - uint16_t sf_p:1; /* Phase Tag */ - uint16_t sf_sc:8; /* Status Code */ - uint16_t sf_sct:3; /* Status Code Type */ - uint16_t sf_rsvd2:2; - uint16_t sf_m:1; /* More */ - uint16_t sf_dnr:1; /* Do Not Retry */ -} nvme_cqe_sf_t; - -typedef struct { uint32_t cqe_dw0; /* Command Specific */ uint32_t cqe_rsvd1; uint16_t 
cqe_sqhd; /* SQ Head Pointer */ @@ -408,203 +385,6 @@ typedef union { #define NVME_IDENTIFY_CTRL 0x1 /* Identify Controller */ #define NVME_IDENTIFY_LIST 0x2 /* Identify List Namespaces */ -#define NVME_IDENTIFY_BUFSIZE 4096 /* buffer size for Identify */ - -/* NVMe Queue Entry Size bitfield */ -typedef struct { - uint8_t qes_min:4; /* minimum entry size */ - uint8_t qes_max:4; /* maximum entry size */ -} nvme_idctl_qes_t; - -/* NVMe Power State Descriptor */ -typedef struct { - uint16_t psd_mp; /* Maximum Power */ - uint8_t psd_rsvd1; - uint8_t psd_mps:1; /* Max Power Scale (1.1) */ - uint8_t psd_nops:1; /* Non-Operational State (1.1) */ - uint8_t psd_rsvd2:6; - uint32_t psd_enlat; /* Entry Latency */ - uint32_t psd_exlat; /* Exit Latency */ - uint8_t psd_rrt:5; /* Relative Read Throughput */ - uint8_t psd_rsvd3:3; - uint8_t psd_rrl:5; /* Relative Read Latency */ - uint8_t psd_rsvd4:3; - uint8_t psd_rwt:5; /* Relative Write Throughput */ - uint8_t psd_rsvd5:3; - uint8_t psd_rwl:5; /* Relative Write Latency */ - uint8_t psd_rsvd6:3; - uint8_t psd_rsvd7[16]; -} nvme_idctl_psd_t; - -/* NVMe Identify Controller Data Structure */ -typedef struct { - /* Controller Capabilities & Features */ - uint16_t id_vid; /* PCI vendor ID */ - uint16_t id_ssvid; /* PCI subsystem vendor ID */ - char id_serial[20]; /* Serial Number */ - char id_model[40]; /* Model Number */ - char id_fwrev[8]; /* Firmware Revision */ - uint8_t id_rab; /* Recommended Arbitration Burst */ - uint8_t id_oui[3]; /* vendor IEEE OUI */ - struct { /* Multi-Interface Capabilities */ - uint8_t m_multi_pci:1; /* HW has multiple PCIe interfaces */ - uint8_t m_multi_ctrl:1; /* HW has multiple controllers (1.1) */ - uint8_t m_sr_iov:1; /* controller is SR-IOV virt fn (1.1) */ - uint8_t m_rsvd:5; - } id_mic; - uint8_t id_mdts; /* Maximum Data Transfer Size */ - uint16_t id_cntlid; /* Unique Controller Identifier (1.1) */ - uint8_t id_rsvd_cc[256 - 80]; - - /* Admin Command Set Attributes */ - struct { /* Optional 
Admin Command Support */ - uint16_t oa_security:1; /* Security Send & Receive */ - uint16_t oa_format:1; /* Format NVM */ - uint16_t oa_firmare:1; /* Firmware Activate & Download */ - uint16_t oa_rsvd:13; - } id_oacs; - uint8_t id_acl; /* Abort Command Limit */ - uint8_t id_aerl; /* Asynchronous Event Request Limit */ - struct { /* Firmware Updates */ - uint8_t fw_readonly:1; /* Slot 1 is Read-Only */ - uint8_t fw_nslot:3; /* number of firmware slots */ - uint8_t fw_rsvd:4; - } id_frmw; - struct { /* Log Page Attributes */ - uint8_t lp_smart:1; /* SMART/Health information per NS */ - uint8_t lp_rsvd:7; - } id_lpa; - uint8_t id_elpe; /* Error Log Page Entries */ - uint8_t id_npss; /* Number of Power States */ - struct { /* Admin Vendor Specific Command Conf */ - uint8_t av_spec:1; /* use format from spec */ - uint8_t av_rsvd:7; - } id_avscc; - struct { /* Autonomous Power State Trans (1.1) */ - uint8_t ap_sup:1; /* APST supported (1.1) */ - uint8_t ap_rsvd:7; - } id_apsta; - uint8_t id_rsvd_ac[256 - 10]; - - /* NVM Command Set Attributes */ - nvme_idctl_qes_t id_sqes; /* Submission Queue Entry Size */ - nvme_idctl_qes_t id_cqes; /* Completion Queue Entry Size */ - uint16_t id_rsvd_nc_1; - uint32_t id_nn; /* Number of Namespaces */ - struct { /* Optional NVM Command Support */ - uint16_t on_compare:1; /* Compare */ - uint16_t on_wr_unc:1; /* Write Uncorrectable */ - uint16_t on_dset_mgmt:1; /* Dataset Management */ - uint16_t on_wr_zero:1; /* Write Zeros (1.1) */ - uint16_t on_save:1; /* Save/Select in Get/Set Feat (1.1) */ - uint16_t on_reserve:1; /* Reservations (1.1) */ - uint16_t on_rsvd:10; - } id_oncs; - struct { /* Fused Operation Support */ - uint16_t f_cmp_wr:1; /* Compare and Write */ - uint16_t f_rsvd:15; - } id_fuses; - struct { /* Format NVM Attributes */ - uint8_t fn_format:1; /* Format applies to all NS */ - uint8_t fn_sec_erase:1; /* Secure Erase applies to all NS */ - uint8_t fn_crypt_erase:1; /* Cryptographic Erase supported */ - uint8_t fn_rsvd:5; 
- } id_fna; - struct { /* Volatile Write Cache */ - uint8_t vwc_present:1; /* Volatile Write Cache present */ - uint8_t rsvd:7; - } id_vwc; - uint16_t id_awun; /* Atomic Write Unit Normal */ - uint16_t id_awupf; /* Atomic Write Unit Power Fail */ - struct { /* NVM Vendor Specific Command Conf */ - uint8_t nv_spec:1; /* use format from spec */ - uint8_t nv_rsvd:7; - } id_nvscc; - uint8_t id_rsvd_nc_2; - uint16_t id_acwu; /* Atomic Compare & Write Unit (1.1) */ - uint16_t id_rsvd_nc_3; - struct { /* SGL Support (1.1) */ - uint16_t sgl_sup:1; /* SGL Supported in NVM cmds (1.1) */ - uint16_t sgl_rsvd1:15; - uint16_t sgl_bucket:1; /* SGL Bit Bucket supported (1.1) */ - uint16_t sgl_rsvd2:15; - } id_sgls; - uint8_t id_rsvd_nc_4[192 - 28]; - - /* I/O Command Set Attributes */ - uint8_t id_rsvd_ioc[1344]; - - /* Power State Descriptors */ - nvme_idctl_psd_t id_psd[32]; - - /* Vendor Specific */ - uint8_t id_vs[1024]; -} nvme_identify_ctrl_t; - -/* NVMe Identify Namespace LBA Format */ -typedef struct { - uint16_t lbaf_ms; /* Metadata Size */ - uint8_t lbaf_lbads; /* LBA Data Size */ - uint8_t lbaf_rp:2; /* Relative Performance */ - uint8_t lbaf_rsvd1:6; -} nvme_idns_lbaf_t; - -/* NVMe Identify Namespace Data Structure */ -typedef struct { - uint64_t id_nsize; /* Namespace Size */ - uint64_t id_ncap; /* Namespace Capacity */ - uint64_t id_nuse; /* Namespace Utilization */ - struct { /* Namespace Features */ - uint8_t f_thin:1; /* Thin Provisioning */ - uint8_t f_rsvd:7; - } id_nsfeat; - uint8_t id_nlbaf; /* Number of LBA formats */ - struct { /* Formatted LBA size */ - uint8_t lba_format:4; /* LBA format */ - uint8_t lba_extlba:1; /* extended LBA (includes metadata) */ - uint8_t lba_rsvd:3; - } id_flbas; - struct { /* Metadata Capabilities */ - uint8_t mc_extlba:1; /* extended LBA transfers */ - uint8_t mc_separate:1; /* separate metadata transfers */ - uint8_t mc_rsvd:6; - } id_mc; - struct { /* Data Protection Capabilities */ - uint8_t dp_type1:1; /* Protection 
Information Type 1 */ - uint8_t dp_type2:1; /* Protection Information Type 2 */ - uint8_t dp_type3:1; /* Protection Information Type 3 */ - uint8_t dp_first:1; /* first 8 bytes of metadata */ - uint8_t dp_last:1; /* last 8 bytes of metadata */ - uint8_t dp_rsvd:3; - } id_dpc; - struct { /* Data Protection Settings */ - uint8_t dp_pinfo:3; /* Protection Information enabled */ - uint8_t dp_first:1; /* first 8 bytes of metadata */ - uint8_t dp_rsvd:4; - } id_dps; - struct { /* NS Multi-Path/Sharing Cap (1.1) */ - uint8_t nm_shared:1; /* NS is shared (1.1) */ - uint8_t nm_rsvd:7; - } id_nmic; - struct { /* Reservation Capabilities (1.1) */ - uint8_t rc_persist:1; /* Persist Through Power Loss (1.1) */ - uint8_t rc_wr_excl:1; /* Write Exclusive (1.1) */ - uint8_t rc_excl:1; /* Exclusive Access (1.1) */ - uint8_t rc_wr_excl_r:1; /* Wr Excl - Registrants Only (1.1) */ - uint8_t rc_excl_r:1; /* Excl Acc - Registrants Only (1.1) */ - uint8_t rc_wr_excl_a:1; /* Wr Excl - All Registrants (1.1) */ - uint8_t rc_excl_a:1; /* Excl Acc - All Registrants (1.1) */ - uint8_t rc_rsvd:1; - } id_rescap; - uint8_t id_rsvd1[120 - 32]; - uint8_t id_eui64[8]; /* IEEE Extended Unique Id (1.1) */ - nvme_idns_lbaf_t id_lbaf[16]; /* LBA Formats */ - - uint8_t id_rsvd2[192]; - - uint8_t id_vs[3712]; /* Vendor Specific */ -} nvme_identify_nsid_t; - /* * NVMe Abort Command @@ -619,79 +399,8 @@ typedef union { /* - * NVMe Get / Set Features - */ -#define NVME_FEAT_ARBITRATION 0x1 /* Command Arbitration */ -#define NVME_FEAT_POWER_MGMT 0x2 /* Power Management */ -#define NVME_FEAT_LBA_RANGE 0x3 /* LBA Range Type */ -#define NVME_FEAT_TEMPERATURE 0x4 /* Temperature Threshold */ -#define NVME_FEAT_ERROR 0x5 /* Error Recovery */ -#define NVME_FEAT_WRITE_CACHE 0x6 /* Volatile Write Cache */ -#define NVME_FEAT_NQUEUES 0x7 /* Number of Queues */ -#define NVME_FEAT_INTR_COAL 0x8 /* Interrupt Coalescing */ -#define NVME_FEAT_INTR_VECT 0x9 /* Interrupt Vector Configuration */ -#define NVME_FEAT_WRITE_ATOM 
0xa /* Write Atomicity */ -#define NVME_FEAT_ASYNC_EVENT 0xb /* Asynchronous Event Configuration */ -#define NVME_FEAT_AUTO_PST 0xc /* Autonomous Power State Transition */ - /* (1.1) */ - -#define NVME_FEAT_PROGRESS 0x80 /* Software Progress Marker */ - -/* Arbitration Feature */ -typedef struct { - uint8_t arb_ab:3; /* Arbitration Burst */ - uint8_t arb_rsvd:5; - uint8_t arb_lpw; /* Low Priority Weight */ - uint8_t arb_mpw; /* Medium Priority Weight */ - uint8_t arb_hpw; /* High Priority Weight */ -} nvme_arbitration_dw11_t; - -/* LBA Range Type Feature */ -typedef struct { - uint32_t lr_num:6; /* Number of LBA ranges */ - uint32_t lr_rsvd:26; -} nvme_lba_range_type_dw11_t; - -typedef struct { - uint8_t lr_type; /* Type */ - struct { /* Attributes */ - uint8_t lr_write:1; /* may be overwritten */ - uint8_t lr_hidden:1; /* hidden from OS/EFI/BIOS */ - uint8_t lr_rsvd1:6; - } lr_attr; - uint8_t lr_rsvd2[14]; - uint64_t lr_slba; /* Starting LBA */ - uint64_t lr_nlb; /* Number of Logical Blocks */ - uint8_t lr_guid[16]; /* Unique Identifier */ - uint8_t lr_rsvd3[16]; -} nvme_lba_range_type_t; - -/* Volatile Write Cache Feature */ -typedef union { - struct { - uint32_t wc_wce:1; /* Volatile Write Cache Enable */ - uint32_t wc_rsvd:31; - } b; - uint32_t r; -} nvme_write_cache_t; - -/* Number of Queues */ -typedef union { - struct { - uint16_t nq_nsq; /* Number of Submission Queues */ - uint16_t nq_ncq; /* Number of Completion Queues */ - } b; - uint32_t r; -} nvme_nqueue_t; - - -/* * NVMe Get Log Page */ -#define NVME_LOGPAGE_ERROR 0x1 /* Error Information */ -#define NVME_LOGPAGE_HEALTH 0x2 /* SMART/Health Information */ -#define NVME_LOGPAGE_FWSLOT 0x3 /* Firmware Slot Information */ - typedef union { struct { uint8_t lp_lid; /* Log Page Identifier */ @@ -702,52 +411,6 @@ typedef union { uint32_t r; } nvme_getlogpage_t; -typedef struct { - uint64_t el_count; /* Error Count */ - uint16_t el_sqid; /* Submission Queue ID */ - uint16_t el_cid; /* Command ID */ - 
nvme_cqe_sf_t el_sf; /* Status Field */ - uint8_t el_byte; /* Parameter Error Location byte */ - uint8_t el_bit:3; /* Parameter Error Location bit */ - uint8_t el_rsvd1:5; - uint64_t el_lba; /* Logical Block Address */ - uint32_t el_nsid; /* Namespace ID */ - uint8_t el_vendor; /* Vendor Specific Information avail */ - uint8_t el_rsvd2[64 - 29]; -} nvme_error_log_entry_t; - -typedef struct { - uint64_t lo; - uint64_t hi; -} nvme_uint128_t; - -typedef struct { - uint8_t hl_crit_warn; /* Critical Warning */ - uint16_t hl_temp; /* Temperature */ - uint8_t hl_avail_spare; /* Available Spare */ - uint8_t hl_avail_spare_thr; /* Available Spare Threshold */ - uint8_t hl_used; /* Percentage Used */ - uint8_t hl_rsvd1[32 - 6]; - nvme_uint128_t hl_data_read; /* Data Units Read */ - nvme_uint128_t hl_data_write; /* Data Units Written */ - nvme_uint128_t hl_host_read; /* Host Read Commands */ - nvme_uint128_t hl_host_write; /* Host Write Commands */ - nvme_uint128_t hl_ctrl_busy; /* Controller Busy Time */ - nvme_uint128_t hl_power_cycles; /* Power Cycles */ - nvme_uint128_t hl_power_on_hours; /* Power On Hours */ - nvme_uint128_t hl_unsafe_shutdn; /* Unsafe Shutdowns */ - nvme_uint128_t hl_media_errors; /* Media Errors */ - nvme_uint128_t hl_errors_logged; /* Number of errors logged */ - uint8_t hl_rsvd2[512 - 192]; -} nvme_health_log_t; - -typedef struct { - uint8_t fw_afi:3; /* Active Firmware Slot */ - uint8_t fw_rsvd1:5; - uint8_t fw_rsvd2[7]; - char fw_frs[7][8]; /* Firmware Revision / Slot */ - uint8_t fw_rsvd3[512 - 64]; -} nvme_fwslot_log_t; #ifdef __cplusplus } diff --git a/usr/src/uts/common/io/nvme/nvme_var.h b/usr/src/uts/common/io/nvme/nvme_var.h index fd6f93af88..651adaec8c 100644 --- a/usr/src/uts/common/io/nvme/nvme_var.h +++ b/usr/src/uts/common/io/nvme/nvme_var.h @@ -27,7 +27,7 @@ */ #ifdef __cplusplus -/* extern "C" { */ +extern "C" { #endif #define NVME_FMA_INIT 0x1 @@ -47,11 +47,18 @@ typedef struct nvme nvme_t; typedef struct nvme_namespace 
nvme_namespace_t; +typedef struct nvme_minor_state nvme_minor_state_t; typedef struct nvme_dma nvme_dma_t; typedef struct nvme_cmd nvme_cmd_t; typedef struct nvme_qpair nvme_qpair_t; typedef struct nvme_task_arg nvme_task_arg_t; +struct nvme_minor_state { + kmutex_t nm_mutex; + boolean_t nm_oexcl; + uint_t nm_ocnt; +}; + struct nvme_dma { ddi_dma_handle_t nd_dmah; ddi_acc_handle_t nd_acch; @@ -69,6 +76,7 @@ struct nvme_cmd { void (*nc_callback)(void *); bd_xfer_t *nc_xfer; boolean_t nc_completed; + boolean_t nc_dontpanic; uint16_t nc_sqid; nvme_dma_t *nc_dma; @@ -137,6 +145,8 @@ struct nvme { boolean_t n_write_cache_present; boolean_t n_write_cache_enabled; int n_error_log_len; + boolean_t n_lba_range_supported; + boolean_t n_auto_pst_supported; int n_nssr_supported; int n_doorbell_stride; @@ -168,9 +178,8 @@ struct nvme { ddi_taskq_t *n_cmd_taskq; - nvme_error_log_entry_t *n_error_log; - nvme_health_log_t *n_health_log; - nvme_fwslot_log_t *n_fwslot_log; + /* state for devctl minor node */ + nvme_minor_state_t n_minor; /* errors detected by driver */ uint32_t n_dma_bind_err; @@ -217,6 +226,7 @@ struct nvme { struct nvme_namespace { nvme_t *ns_nvme; uint8_t ns_eui64[8]; + char ns_name[17]; bd_handle_t ns_bd_hdl; @@ -229,6 +239,9 @@ struct nvme_namespace { nvme_identify_nsid_t *ns_idns; + /* state for attachment point minor node */ + nvme_minor_state_t ns_minor; + /* * If a namespace has no EUI64, we create a devid in * nvme_prepare_devid(). 
@@ -241,8 +254,9 @@ struct nvme_task_arg { nvme_cmd_t *nt_cmd; }; + #ifdef __cplusplus -/* } */ +} #endif #endif /* _NVME_VAR_H */ diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 1c7662c28a..7ce40a658a 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -38,6 +38,7 @@ FILEMODE=644 # neither installed or shipped as part of the product: # cpuid_drv.h: Private interface for cpuid consumers # unix_bb_info.h: Private interface to kcov +# nvme.h: Private interface to nvme # i386_HDRS= \ @@ -54,6 +55,7 @@ i386_HDRS= \ firmload.h \ gfx_private.h \ mouse.h \ + nvme.h \ ucode.h sparc_HDRS= \ @@ -422,6 +424,7 @@ CHKHDRS= \ nexusdefs.h \ note.h \ null.h \ + nvme.h \ nvpair.h \ nvpair_impl.h \ objfs.h \ diff --git a/usr/src/uts/common/sys/nvme.h b/usr/src/uts/common/sys/nvme.h new file mode 100644 index 0000000000..916b439f3f --- /dev/null +++ b/usr/src/uts/common/sys/nvme.h @@ -0,0 +1,574 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Nexenta Systems, Inc. 
+ */ + +#ifndef _SYS_NVME_H +#define _SYS_NVME_H + +#include <sys/types.h> + +#ifdef _KERNEL +#include <sys/types32.h> +#else +#include <stdint.h> +#endif + +/* + * Declarations used for communication between nvmeadm(1M) and nvme(7D) + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * NVMe ioctl definitions + */ + +#define NVME_IOC (('N' << 24) | ('V' << 16) | ('M' << 8)) +#define NVME_IOC_IDENTIFY_CTRL (NVME_IOC | 1) +#define NVME_IOC_IDENTIFY_NSID (NVME_IOC | 2) +#define NVME_IOC_CAPABILITIES (NVME_IOC | 3) +#define NVME_IOC_GET_LOGPAGE (NVME_IOC | 4) +#define NVME_IOC_GET_FEATURES (NVME_IOC | 5) +#define NVME_IOC_INTR_CNT (NVME_IOC | 6) +#define NVME_IOC_VERSION (NVME_IOC | 7) +#define NVME_IOC_FORMAT (NVME_IOC | 8) +#define NVME_IOC_DETACH (NVME_IOC | 9) +#define NVME_IOC_ATTACH (NVME_IOC | 10) +#define NVME_IOC_MAX NVME_IOC_ATTACH + +#define IS_NVME_IOC(x) ((x) > NVME_IOC && (x) <= NVME_IOC_MAX) +#define NVME_IOC_CMD(x) ((x) & 0xff) + +typedef struct { + size_t n_len; + uintptr_t n_buf; + uint64_t n_arg; +} nvme_ioctl_t; + +#ifdef _KERNEL +typedef struct { + size32_t n_len; + uintptr32_t n_buf; + uint64_t n_arg; +} nvme_ioctl32_t; +#endif + +/* + * NVMe capabilities + */ +typedef struct { + uint32_t mpsmax; /* Memory Page Size Maximum */ + uint32_t mpsmin; /* Memory Page Size Minimum */ +} nvme_capabilities_t; + +/* + * NVMe version + */ +typedef struct { + uint16_t v_minor; + uint16_t v_major; +} nvme_version_t; + +#define NVME_VERSION_ATLEAST(v, maj, min) \ + (((v)->v_major) > (maj) || \ + ((v)->v_major == (maj) && (v)->v_minor >= (min))) + +#define NVME_VERSION_HIGHER(v, maj, min) \ + (((v)->v_major) > (maj) || \ + ((v)->v_major == (maj) && (v)->v_minor > (min))) + + +#pragma pack(1) + +/* + * NVMe Identify data structures + */ + +#define NVME_IDENTIFY_BUFSIZE 4096 /* buffer size for Identify */ + +/* NVMe Queue Entry Size bitfield */ +typedef struct { + uint8_t qes_min:4; /* minimum entry size */ + uint8_t qes_max:4; /* maximum entry size */ +} 
nvme_idctl_qes_t; + +/* NVMe Power State Descriptor */ +typedef struct { + uint16_t psd_mp; /* Maximum Power */ + uint8_t psd_rsvd1; + uint8_t psd_mps:1; /* Max Power Scale (1.1) */ + uint8_t psd_nops:1; /* Non-Operational State (1.1) */ + uint8_t psd_rsvd2:6; + uint32_t psd_enlat; /* Entry Latency */ + uint32_t psd_exlat; /* Exit Latency */ + uint8_t psd_rrt:5; /* Relative Read Throughput */ + uint8_t psd_rsvd3:3; + uint8_t psd_rrl:5; /* Relative Read Latency */ + uint8_t psd_rsvd4:3; + uint8_t psd_rwt:5; /* Relative Write Throughput */ + uint8_t psd_rsvd5:3; + uint8_t psd_rwl:5; /* Relative Write Latency */ + uint8_t psd_rsvd6:3; + uint8_t psd_rsvd7[16]; +} nvme_idctl_psd_t; + +/* NVMe Identify Controller Data Structure */ +typedef struct { + /* Controller Capabilities & Features */ + uint16_t id_vid; /* PCI vendor ID */ + uint16_t id_ssvid; /* PCI subsystem vendor ID */ + char id_serial[20]; /* Serial Number */ + char id_model[40]; /* Model Number */ + char id_fwrev[8]; /* Firmware Revision */ + uint8_t id_rab; /* Recommended Arbitration Burst */ + uint8_t id_oui[3]; /* vendor IEEE OUI */ + struct { /* Multi-Interface Capabilities */ + uint8_t m_multi_pci:1; /* HW has multiple PCIe interfaces */ + uint8_t m_multi_ctrl:1; /* HW has multiple controllers (1.1) */ + uint8_t m_sr_iov:1; /* controller is SR-IOV virt fn (1.1) */ + uint8_t m_rsvd:5; + } id_mic; + uint8_t id_mdts; /* Maximum Data Transfer Size */ + uint16_t id_cntlid; /* Unique Controller Identifier (1.1) */ + uint8_t id_rsvd_cc[256 - 80]; + + /* Admin Command Set Attributes */ + struct { /* Optional Admin Command Support */ + uint16_t oa_security:1; /* Security Send & Receive */ + uint16_t oa_format:1; /* Format NVM */ + uint16_t oa_firmware:1; /* Firmware Activate & Download */ + uint16_t oa_rsvd:13; + } id_oacs; + uint8_t id_acl; /* Abort Command Limit */ + uint8_t id_aerl; /* Asynchronous Event Request Limit */ + struct { /* Firmware Updates */ + uint8_t fw_readonly:1; /* Slot 1 is Read-Only */ + 
uint8_t fw_nslot:3; /* number of firmware slots */ + uint8_t fw_rsvd:4; + } id_frmw; + struct { /* Log Page Attributes */ + uint8_t lp_smart:1; /* SMART/Health information per NS */ + uint8_t lp_rsvd:7; + } id_lpa; + uint8_t id_elpe; /* Error Log Page Entries */ + uint8_t id_npss; /* Number of Power States */ + struct { /* Admin Vendor Specific Command Conf */ + uint8_t av_spec:1; /* use format from spec */ + uint8_t av_rsvd:7; + } id_avscc; + struct { /* Autonomous Power State Trans (1.1) */ + uint8_t ap_sup:1; /* APST supported (1.1) */ + uint8_t ap_rsvd:7; + } id_apsta; + uint8_t id_rsvd_ac[256 - 10]; + + /* NVM Command Set Attributes */ + nvme_idctl_qes_t id_sqes; /* Submission Queue Entry Size */ + nvme_idctl_qes_t id_cqes; /* Completion Queue Entry Size */ + uint16_t id_rsvd_nc_1; + uint32_t id_nn; /* Number of Namespaces */ + struct { /* Optional NVM Command Support */ + uint16_t on_compare:1; /* Compare */ + uint16_t on_wr_unc:1; /* Write Uncorrectable */ + uint16_t on_dset_mgmt:1; /* Dataset Management */ + uint16_t on_wr_zero:1; /* Write Zeros (1.1) */ + uint16_t on_save:1; /* Save/Select in Get/Set Feat (1.1) */ + uint16_t on_reserve:1; /* Reservations (1.1) */ + uint16_t on_rsvd:10; + } id_oncs; + struct { /* Fused Operation Support */ + uint16_t f_cmp_wr:1; /* Compare and Write */ + uint16_t f_rsvd:15; + } id_fuses; + struct { /* Format NVM Attributes */ + uint8_t fn_format:1; /* Format applies to all NS */ + uint8_t fn_sec_erase:1; /* Secure Erase applies to all NS */ + uint8_t fn_crypt_erase:1; /* Cryptographic Erase supported */ + uint8_t fn_rsvd:5; + } id_fna; + struct { /* Volatile Write Cache */ + uint8_t vwc_present:1; /* Volatile Write Cache present */ + uint8_t rsvd:7; + } id_vwc; + uint16_t id_awun; /* Atomic Write Unit Normal */ + uint16_t id_awupf; /* Atomic Write Unit Power Fail */ + struct { /* NVM Vendor Specific Command Conf */ + uint8_t nv_spec:1; /* use format from spec */ + uint8_t nv_rsvd:7; + } id_nvscc; + uint8_t id_rsvd_nc_2; + 
uint16_t id_acwu; /* Atomic Compare & Write Unit (1.1) */ + uint16_t id_rsvd_nc_3; + struct { /* SGL Support (1.1) */ + uint16_t sgl_sup:1; /* SGL Supported in NVM cmds (1.1) */ + uint16_t sgl_rsvd1:15; + uint16_t sgl_bucket:1; /* SGL Bit Bucket supported (1.1) */ + uint16_t sgl_rsvd2:15; + } id_sgls; + uint8_t id_rsvd_nc_4[192 - 28]; + + /* I/O Command Set Attributes */ + uint8_t id_rsvd_ioc[1344]; + + /* Power State Descriptors */ + nvme_idctl_psd_t id_psd[32]; + + /* Vendor Specific */ + uint8_t id_vs[1024]; +} nvme_identify_ctrl_t; + +/* NVMe Identify Namespace LBA Format */ +typedef struct { + uint16_t lbaf_ms; /* Metadata Size */ + uint8_t lbaf_lbads; /* LBA Data Size */ + uint8_t lbaf_rp:2; /* Relative Performance */ + uint8_t lbaf_rsvd1:6; +} nvme_idns_lbaf_t; + +/* NVMe Identify Namespace Data Structure */ +typedef struct { + uint64_t id_nsize; /* Namespace Size */ + uint64_t id_ncap; /* Namespace Capacity */ + uint64_t id_nuse; /* Namespace Utilization */ + struct { /* Namespace Features */ + uint8_t f_thin:1; /* Thin Provisioning */ + uint8_t f_rsvd:7; + } id_nsfeat; + uint8_t id_nlbaf; /* Number of LBA formats */ + struct { /* Formatted LBA size */ + uint8_t lba_format:4; /* LBA format */ + uint8_t lba_extlba:1; /* extended LBA (includes metadata) */ + uint8_t lba_rsvd:3; + } id_flbas; + struct { /* Metadata Capabilities */ + uint8_t mc_extlba:1; /* extended LBA transfers */ + uint8_t mc_separate:1; /* separate metadata transfers */ + uint8_t mc_rsvd:6; + } id_mc; + struct { /* Data Protection Capabilities */ + uint8_t dp_type1:1; /* Protection Information Type 1 */ + uint8_t dp_type2:1; /* Protection Information Type 2 */ + uint8_t dp_type3:1; /* Protection Information Type 3 */ + uint8_t dp_first:1; /* first 8 bytes of metadata */ + uint8_t dp_last:1; /* last 8 bytes of metadata */ + uint8_t dp_rsvd:3; + } id_dpc; + struct { /* Data Protection Settings */ + uint8_t dp_pinfo:3; /* Protection Information enabled */ + uint8_t dp_first:1; /* first 8 bytes 
of metadata */ + uint8_t dp_rsvd:4; + } id_dps; + struct { /* NS Multi-Path/Sharing Cap (1.1) */ + uint8_t nm_shared:1; /* NS is shared (1.1) */ + uint8_t nm_rsvd:7; + } id_nmic; + struct { /* Reservation Capabilities (1.1) */ + uint8_t rc_persist:1; /* Persist Through Power Loss (1.1) */ + uint8_t rc_wr_excl:1; /* Write Exclusive (1.1) */ + uint8_t rc_excl:1; /* Exclusive Access (1.1) */ + uint8_t rc_wr_excl_r:1; /* Wr Excl - Registrants Only (1.1) */ + uint8_t rc_excl_r:1; /* Excl Acc - Registrants Only (1.1) */ + uint8_t rc_wr_excl_a:1; /* Wr Excl - All Registrants (1.1) */ + uint8_t rc_excl_a:1; /* Excl Acc - All Registrants (1.1) */ + uint8_t rc_rsvd:1; + } id_rescap; + uint8_t id_rsvd1[120 - 32]; + uint8_t id_eui64[8]; /* IEEE Extended Unique Id (1.1) */ + nvme_idns_lbaf_t id_lbaf[16]; /* LBA Formats */ + + uint8_t id_rsvd2[192]; + + uint8_t id_vs[3712]; /* Vendor Specific */ +} nvme_identify_nsid_t; + + +/* + * NVMe completion queue entry status field + */ +typedef struct { + uint16_t sf_p:1; /* Phase Tag */ + uint16_t sf_sc:8; /* Status Code */ + uint16_t sf_sct:3; /* Status Code Type */ + uint16_t sf_rsvd2:2; + uint16_t sf_m:1; /* More */ + uint16_t sf_dnr:1; /* Do Not Retry */ +} nvme_cqe_sf_t; + + +/* + * NVMe Get Log Page + */ +#define NVME_LOGPAGE_ERROR 0x1 /* Error Information */ +#define NVME_LOGPAGE_HEALTH 0x2 /* SMART/Health Information */ +#define NVME_LOGPAGE_FWSLOT 0x3 /* Firmware Slot Information */ + +typedef struct { + uint64_t el_count; /* Error Count */ + uint16_t el_sqid; /* Submission Queue ID */ + uint16_t el_cid; /* Command ID */ + nvme_cqe_sf_t el_sf; /* Status Field */ + uint8_t el_byte; /* Parameter Error Location byte */ + uint8_t el_bit:3; /* Parameter Error Location bit */ + uint8_t el_rsvd1:5; + uint64_t el_lba; /* Logical Block Address */ + uint32_t el_nsid; /* Namespace ID */ + uint8_t el_vendor; /* Vendor Specific Information avail */ + uint8_t el_rsvd2[64 - 29]; +} nvme_error_log_entry_t; + +typedef struct { + uint64_t lo; + 
uint64_t hi; +} nvme_uint128_t; + +typedef struct { + struct { /* Critical Warning */ + uint8_t cw_avail:1; /* available space too low */ + uint8_t cw_temp:1; /* temperature too high */ + uint8_t cw_reliab:1; /* degraded reliability */ + uint8_t cw_readonly:1; /* media is read-only */ + uint8_t cw_volatile:1; /* volatile memory backup failed */ + uint8_t cw_rsvd:3; + } hl_crit_warn; + uint16_t hl_temp; /* Temperature */ + uint8_t hl_avail_spare; /* Available Spare */ + uint8_t hl_avail_spare_thr; /* Available Spare Threshold */ + uint8_t hl_used; /* Percentage Used */ + uint8_t hl_rsvd1[32 - 6]; + nvme_uint128_t hl_data_read; /* Data Units Read */ + nvme_uint128_t hl_data_write; /* Data Units Written */ + nvme_uint128_t hl_host_read; /* Host Read Commands */ + nvme_uint128_t hl_host_write; /* Host Write Commands */ + nvme_uint128_t hl_ctrl_busy; /* Controller Busy Time */ + nvme_uint128_t hl_power_cycles; /* Power Cycles */ + nvme_uint128_t hl_power_on_hours; /* Power On Hours */ + nvme_uint128_t hl_unsafe_shutdn; /* Unsafe Shutdowns */ + nvme_uint128_t hl_media_errors; /* Media Errors */ + nvme_uint128_t hl_errors_logged; /* Number of errors logged */ + uint8_t hl_rsvd2[512 - 192]; +} nvme_health_log_t; + +typedef struct { + uint8_t fw_afi:3; /* Active Firmware Slot */ + uint8_t fw_rsvd1:5; + uint8_t fw_rsvd2[7]; + char fw_frs[7][8]; /* Firmware Revision / Slot */ + uint8_t fw_rsvd3[512 - 64]; +} nvme_fwslot_log_t; + + +/* + * NVMe Format NVM + */ +#define NVME_FRMT_SES_NONE 0 +#define NVME_FRMT_SES_USER 1 +#define NVME_FRMT_SES_CRYPTO 2 +#define NVME_FRMT_MAX_SES 2 + +#define NVME_FRMT_MAX_LBAF 15 + +typedef union { + struct { + uint32_t fm_lbaf:4; /* LBA Format */ + uint32_t fm_ms:1; /* Metadata Settings */ + uint32_t fm_pi:3; /* Protection Information */ + uint32_t fm_pil:1; /* Prot. 
Information Location */ + uint32_t fm_ses:3; /* Secure Erase Settings */ + uint32_t fm_resvd:20; + } b; + uint32_t r; +} nvme_format_nvm_t; + + +/* + * NVMe Get / Set Features + */ +#define NVME_FEAT_ARBITRATION 0x1 /* Command Arbitration */ +#define NVME_FEAT_POWER_MGMT 0x2 /* Power Management */ +#define NVME_FEAT_LBA_RANGE 0x3 /* LBA Range Type */ +#define NVME_FEAT_TEMPERATURE 0x4 /* Temperature Threshold */ +#define NVME_FEAT_ERROR 0x5 /* Error Recovery */ +#define NVME_FEAT_WRITE_CACHE 0x6 /* Volatile Write Cache */ +#define NVME_FEAT_NQUEUES 0x7 /* Number of Queues */ +#define NVME_FEAT_INTR_COAL 0x8 /* Interrupt Coalescing */ +#define NVME_FEAT_INTR_VECT 0x9 /* Interrupt Vector Configuration */ +#define NVME_FEAT_WRITE_ATOM 0xa /* Write Atomicity */ +#define NVME_FEAT_ASYNC_EVENT 0xb /* Asynchronous Event Configuration */ +#define NVME_FEAT_AUTO_PST 0xc /* Autonomous Power State Transition */ + /* (1.1) */ + +#define NVME_FEAT_PROGRESS 0x80 /* Software Progress Marker */ + +/* Arbitration Feature */ +typedef union { + struct { + uint8_t arb_ab:3; /* Arbitration Burst */ + uint8_t arb_rsvd:5; + uint8_t arb_lpw; /* Low Priority Weight */ + uint8_t arb_mpw; /* Medium Priority Weight */ + uint8_t arb_hpw; /* High Priority Weight */ + } b; + uint32_t r; +} nvme_arbitration_t; + +/* Power Management Feature */ +typedef union { + struct { + uint32_t pm_ps:5; /* Power State */ + uint32_t pm_rsvd:27; + } b; + uint32_t r; +} nvme_power_mgmt_t; + +/* LBA Range Type Feature */ +typedef union { + struct { + uint32_t lr_num:6; /* Number of LBA ranges */ + uint32_t lr_rsvd:26; + } b; + uint32_t r; +} nvme_lba_range_type_t; + +typedef struct { + uint8_t lr_type; /* Type */ + struct { /* Attributes */ + uint8_t lr_write:1; /* may be overwritten */ + uint8_t lr_hidden:1; /* hidden from OS/EFI/BIOS */ + uint8_t lr_rsvd1:6; + } lr_attr; + uint8_t lr_rsvd2[14]; + uint64_t lr_slba; /* Starting LBA */ + uint64_t lr_nlb; /* Number of Logical Blocks */ + uint8_t lr_guid[16]; /* 
Unique Identifier */ + uint8_t lr_rsvd3[16]; +} nvme_lba_range_t; + +#define NVME_LBA_RANGE_BUFSIZE 4096 + +/* Temperature Threshold Feature */ +typedef union { + struct { + uint16_t tt_tmpth; /* Temperature Threshold */ + uint16_t tt_rsvd; + } b; + uint32_t r; +} nvme_temp_threshold_t; + +/* Error Recovery Feature */ +typedef union { + struct { + uint16_t er_tler; /* Time-Limited Error Recovery */ + uint16_t er_rsvd; + } b; + uint32_t r; +} nvme_error_recovery_t; + +/* Volatile Write Cache Feature */ +typedef union { + struct { + uint32_t wc_wce:1; /* Volatile Write Cache Enable */ + uint32_t wc_rsvd:31; + } b; + uint32_t r; +} nvme_write_cache_t; + +/* Number of Queues Feature */ +typedef union { + struct { + uint16_t nq_nsq; /* Number of Submission Queues */ + uint16_t nq_ncq; /* Number of Completion Queues */ + } b; + uint32_t r; +} nvme_nqueues_t; + +/* Interrupt Coalescing Feature */ +typedef union { + struct { + uint8_t ic_thr; /* Aggregation Threshold */ + uint8_t ic_time; /* Aggregation Time */ + uint16_t ic_rsvd; + } b; + uint32_t r; +} nvme_intr_coal_t; + +/* Interrupt Vector Configuration Feature */ +typedef union { + struct { + uint16_t iv_iv; /* Interrupt Vector */ + uint16_t iv_cd:1; /* Coalescing Disable */ + uint16_t iv_rsvd:15; + } b; + uint32_t r; +} nvme_intr_vect_t; + +/* Write Atomicity Feature */ +typedef union { + struct { + uint32_t wa_dn:1; /* Disable Normal */ + uint32_t wa_rsvd:31; + } b; + uint32_t r; +} nvme_write_atomicity_t; + +/* Asynchronous Event Configuration Feature */ +typedef union { + struct { + uint8_t aec_avail:1; /* available space too low */ + uint8_t aec_temp:1; /* temperature too high */ + uint8_t aec_reliab:1; /* degraded reliability */ + uint8_t aec_readonly:1; /* media is read-only */ + uint8_t aec_volatile:1; /* volatile memory backup failed */ + uint8_t aec_rsvd1:3; + uint8_t aec_rsvd2[3]; + } b; + uint32_t r; +} nvme_async_event_conf_t; + +/* Autonomous Power State Transition Feature (1.1) */ +typedef union { + struct { 
+ uint8_t apst_apste:1; /* APST enabled */ + uint8_t apst_rsvd:7; + } b; + uint8_t r; +} nvme_auto_power_state_trans_t; + +typedef struct { + uint32_t apst_rsvd1:3; + uint32_t apst_itps:5; /* Idle Transition Power State */ + uint32_t apst_itpt:24; /* Idle Time Prior to Transition */ + uint32_t apst_rsvd2; +} nvme_auto_power_state_t; + +#define NVME_AUTO_PST_BUFSIZE 256 + +/* Software Progress Marker Feature */ +typedef union { + struct { + uint8_t spm_pbslc; /* Pre-Boot Software Load Count */ + uint8_t spm_rsvd[3]; + } b; + uint32_t r; +} nvme_software_progress_marker_t; + +#pragma pack() /* pack(1) */ + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_NVME_H */ diff --git a/usr/src/uts/common/sys/sunddi.h b/usr/src/uts/common/sys/sunddi.h index d5e52dbbfc..1d94c8fd2c 100644 --- a/usr/src/uts/common/sys/sunddi.h +++ b/usr/src/uts/common/sys/sunddi.h @@ -212,6 +212,8 @@ extern "C" { #define DDI_NT_NEXUS "ddi_ctl:devctl" /* nexus drivers */ +#define DDI_NT_NVME_NEXUS "ddi_ctl:devctl:nvme" /* nexus drivers */ + #define DDI_NT_SCSI_NEXUS "ddi_ctl:devctl:scsi" /* nexus drivers */ #define DDI_NT_SATA_NEXUS "ddi_ctl:devctl:sata" /* nexus drivers */ @@ -220,6 +222,9 @@ extern "C" { #define DDI_NT_ATTACHMENT_POINT "ddi_ctl:attachment_point" /* attachment pt */ +#define DDI_NT_NVME_ATTACHMENT_POINT "ddi_ctl:attachment_point:nvme" + /* nvme attachment pt */ + #define DDI_NT_SCSI_ATTACHMENT_POINT "ddi_ctl:attachment_point:scsi" /* scsi attachment pt */ |