Diffstat (limited to 'usr/src/uts/common')
-rw-r--r-- | usr/src/uts/common/io/blkdev/blkdev.c | 16
-rw-r--r-- | usr/src/uts/common/io/nvme/nvme.c | 1059
-rw-r--r-- | usr/src/uts/common/io/nvme/nvme_reg.h | 341
-rw-r--r-- | usr/src/uts/common/io/nvme/nvme_var.h | 24
-rw-r--r-- | usr/src/uts/common/sys/Makefile | 3
-rw-r--r-- | usr/src/uts/common/sys/nvme.h | 574
-rw-r--r-- | usr/src/uts/common/sys/sunddi.h | 5
7 files changed, 1561 insertions, 461 deletions
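
The bulk of this change adds character-device minor nodes and an ioctl interface to nvme(7D), intended for use by nvmeadm(1M), with the shared definitions moving into the new private header sys/nvme.h. As a reading aid only (not part of the commit), here is a minimal sketch of how a userspace consumer might drive that interface, using the nvme_ioctl_t structure and the NVME_IOC_VERSION command defined in the header below; the device path is a placeholder, since the actual path of a controller's "devctl" minor node depends on the instance and its position in the device tree.

/*
 * Hedged example, not part of this commit.  Queries the controller's NVMe
 * version through the new ioctl interface.  The device path is hypothetical.
 */
#include <sys/nvme.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	nvme_version_t vers;
	nvme_ioctl_t nioc;
	int fd;

	/* Placeholder path for an nvme controller's devctl minor node. */
	fd = open("/devices/pci@0,0/pci8086,1234@1:devctl", O_RDONLY);
	if (fd < 0) {
		perror("open");
		return (1);
	}

	(void) memset(&nioc, 0, sizeof (nioc));
	nioc.n_len = sizeof (vers);
	nioc.n_buf = (uintptr_t)&vers;

	/* NVME_IOC_VERSION copies the controller's nvme_version_t out. */
	if (ioctl(fd, NVME_IOC_VERSION, &nioc) != 0) {
		perror("ioctl");
		(void) close(fd);
		return (1);
	}

	(void) printf("NVMe version %u.%u\n", vers.v_major, vers.v_minor);
	(void) close(fd);
	return (0);
}

The same nvme_ioctl_t argument carries a buffer length, a buffer pointer, and a 64-bit argument word for all of the NVME_IOC_* commands added here; the 32-bit compatibility path in nvme_ioctl() converts through nvme_ioctl32_t.
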
diff --git a/usr/src/uts/common/io/blkdev/blkdev.c b/usr/src/uts/common/io/blkdev/blkdev.c index 0c80d15cfe..d3b96c9f8a 100644 --- a/usr/src/uts/common/io/blkdev/blkdev.c +++ b/usr/src/uts/common/io/blkdev/blkdev.c @@ -1819,6 +1819,16 @@ bd_attach_handle(dev_info_t *dip, bd_handle_t hdl) dev_info_t *child; bd_drive_t drive = { 0 }; + /* + * It's not an error if bd_attach_handle() is called on a handle that + * already is attached. We just ignore the request to attach and return. + * This way drivers using blkdev don't have to keep track about blkdev + * state, they can just call this function to make sure it attached. + */ + if (hdl->h_child != NULL) { + return (DDI_SUCCESS); + } + /* if drivers don't override this, make it assume none */ drive.d_lun = -1; hdl->h_ops.o_drive_info(hdl->h_private, &drive); @@ -1882,6 +1892,12 @@ bd_detach_handle(bd_handle_t hdl) int rv; char *devnm; + /* + * It's not an error if bd_detach_handle() is called on a handle that + * already is detached. We just ignore the request to detach and return. + * This way drivers using blkdev don't have to keep track about blkdev + * state, they can just call this function to make sure it detached. + */ if (hdl->h_child == NULL) { return (DDI_SUCCESS); } diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c index cb2e9bdd22..c87be0d3f0 100644 --- a/usr/src/uts/common/io/nvme/nvme.c +++ b/usr/src/uts/common/io/nvme/nvme.c @@ -83,6 +83,19 @@ * passes it to blkdev to use it in the device node names. As this is currently * untested namespaces with EUI64 are ignored by default. * + * We currently support only (2 << NVME_MINOR_INST_SHIFT) - 2 namespaces in a + * single controller. This is an artificial limit imposed by the driver to be + * able to address a reasonable number of controllers and namespaces using a + * 32bit minor node number. + * + * + * Minor nodes: + * + * For each NVMe device the driver exposes one minor node for the controller and + * one minor node for each namespace. The only operations supported by those + * minor nodes are open(9E), close(9E), and ioctl(9E). This serves as the + * interface for the nvmeadm(1M) utility. 
+ * * * Blkdev Interface: * @@ -164,7 +177,6 @@ * - polled I/O support to support kernel core dumping * - FMA handling of media errors * - support for devices supporting very large I/O requests using chained PRPs - * - support for querying log pages from user space * - support for configuring hardware parameters like interrupt coalescing * - support for media formatting and hard partitioning into namespaces * - support for big-endian systems @@ -186,6 +198,7 @@ #include <sys/devops.h> #include <sys/ddi.h> #include <sys/sunddi.h> +#include <sys/sunndi.h> #include <sys/bitmap.h> #include <sys/sysmacros.h> #include <sys/param.h> @@ -196,6 +209,10 @@ #include <sys/atomic.h> #include <sys/archsystm.h> #include <sys/sata/sata_hba.h> +#include <sys/stat.h> +#include <sys/policy.h> + +#include <sys/nvme.h> #ifdef __x86 #include <sys/x86_archext.h> @@ -210,7 +227,10 @@ static const int nvme_version_major = 1; static const int nvme_version_minor = 2; /* tunable for admin command timeout in seconds, default is 1s */ -static volatile int nvme_admin_cmd_timeout = 1; +int nvme_admin_cmd_timeout = 1; + +/* tunable for FORMAT NVM command timeout in seconds, default is 600s */ +int nvme_format_cmd_timeout = 600; static int nvme_attach(dev_info_t *, ddi_attach_cmd_t); static int nvme_detach(dev_info_t *, ddi_detach_cmd_t); @@ -243,10 +263,14 @@ static inline int nvme_check_cmd_status(nvme_cmd_t *); static void nvme_abort_cmd(nvme_cmd_t *); static int nvme_async_event(nvme_t *); -static void *nvme_get_logpage(nvme_t *, uint8_t, ...); +static int nvme_format_nvm(nvme_t *, uint32_t, uint8_t, boolean_t, uint8_t, + boolean_t, uint8_t); +static int nvme_get_logpage(nvme_t *, void **, size_t *, uint8_t, ...); static void *nvme_identify(nvme_t *, uint32_t); static boolean_t nvme_set_features(nvme_t *, uint32_t, uint8_t, uint32_t, uint32_t *); +static boolean_t nvme_get_features(nvme_t *, uint32_t, uint8_t, uint32_t *, + void **, size_t *); static boolean_t nvme_write_cache_set(nvme_t *, boolean_t); static int nvme_set_nqueues(nvme_t *, uint16_t); @@ -283,6 +307,16 @@ static void nvme_prp_dma_destructor(void *, void *); static void nvme_prepare_devid(nvme_t *, uint32_t); +static int nvme_open(dev_t *, int, int, cred_t *); +static int nvme_close(dev_t, int, int, cred_t *); +static int nvme_ioctl(dev_t, int, intptr_t, int, cred_t *, int *); + +#define NVME_MINOR_INST_SHIFT 14 +#define NVME_MINOR(inst, nsid) (((inst) << NVME_MINOR_INST_SHIFT) | (nsid)) +#define NVME_MINOR_INST(minor) ((minor) >> NVME_MINOR_INST_SHIFT) +#define NVME_MINOR_NSID(minor) ((minor) & ((1 << NVME_MINOR_INST_SHIFT) - 1)) +#define NVME_MINOR_MAX (NVME_MINOR(1, 0) - 2) + static void *nvme_state; static kmem_cache_t *nvme_cmd_cache; @@ -358,6 +392,27 @@ static ddi_device_acc_attr_t nvme_reg_acc_attr = { .devacc_attr_dataorder = DDI_STRICTORDER_ACC }; +static struct cb_ops nvme_cb_ops = { + .cb_open = nvme_open, + .cb_close = nvme_close, + .cb_strategy = nodev, + .cb_print = nodev, + .cb_dump = nodev, + .cb_read = nodev, + .cb_write = nodev, + .cb_ioctl = nvme_ioctl, + .cb_devmap = nodev, + .cb_mmap = nodev, + .cb_segmap = nodev, + .cb_chpoll = nochpoll, + .cb_prop_op = ddi_prop_op, + .cb_str = 0, + .cb_flag = D_NEW | D_MP, + .cb_rev = CB_REV, + .cb_aread = nodev, + .cb_awrite = nodev +}; + static struct dev_ops nvme_dev_ops = { .devo_rev = DEVO_REV, .devo_refcnt = 0, @@ -367,7 +422,7 @@ static struct dev_ops nvme_dev_ops = { .devo_attach = nvme_attach, .devo_detach = nvme_detach, .devo_reset = nodev, - .devo_cb_ops = NULL, + .devo_cb_ops = 
&nvme_cb_ops, .devo_bus_ops = NULL, .devo_power = NULL, .devo_quiesce = nvme_quiesce, @@ -844,7 +899,8 @@ nvme_check_unknown_cmd_status(nvme_cmd_t *cmd) cqe->cqe_sqid, cqe->cqe_cid, cqe->cqe_sf.sf_sc, cqe->cqe_sf.sf_sct, cqe->cqe_sf.sf_dnr, cqe->cqe_sf.sf_m); - bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); if (cmd->nc_nvme->n_strict_version) { cmd->nc_nvme->n_dead = B_TRUE; @@ -881,13 +937,15 @@ nvme_check_integrity_cmd_status(nvme_cmd_t *cmd) case NVME_CQE_SC_INT_NVM_WRITE: /* write fail */ /* TODO: post ereport */ - bd_error(cmd->nc_xfer, BD_ERR_MEDIA); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_MEDIA); return (EIO); case NVME_CQE_SC_INT_NVM_READ: /* read fail */ /* TODO: post ereport */ - bd_error(cmd->nc_xfer, BD_ERR_MEDIA); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_MEDIA); return (EIO); default: @@ -915,9 +973,11 @@ nvme_check_generic_cmd_status(nvme_cmd_t *cmd) case NVME_CQE_SC_GEN_INV_FLD: /* Invalid Field in Command */ - dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " - "invalid field in cmd %p", (void *)cmd); - return (0); + if (!cmd->nc_dontpanic) + dev_err(cmd->nc_nvme->n_dip, CE_PANIC, + "programming error: invalid field in cmd %p", + (void *)cmd); + return (EIO); case NVME_CQE_SC_GEN_ID_CNFL: /* Command ID Conflict */ @@ -927,9 +987,11 @@ nvme_check_generic_cmd_status(nvme_cmd_t *cmd) case NVME_CQE_SC_GEN_INV_NS: /* Invalid Namespace or Format */ - dev_err(cmd->nc_nvme->n_dip, CE_PANIC, "programming error: " - "invalid NS/format in cmd %p", (void *)cmd); - return (0); + if (!cmd->nc_dontpanic) + dev_err(cmd->nc_nvme->n_dip, CE_PANIC, + "programming error: " "invalid NS/format in cmd %p", + (void *)cmd); + return (EINVAL); case NVME_CQE_SC_GEN_NVM_LBA_RANGE: /* LBA Out Of Range */ @@ -944,7 +1006,8 @@ nvme_check_generic_cmd_status(nvme_cmd_t *cmd) /* Data Transfer Error (DMA) */ /* TODO: post ereport */ atomic_inc_32(&cmd->nc_nvme->n_data_xfr_err); - bd_error(cmd->nc_xfer, BD_ERR_NTRDY); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_NTRDY); return (EIO); case NVME_CQE_SC_GEN_INTERNAL_ERR: @@ -955,7 +1018,8 @@ nvme_check_generic_cmd_status(nvme_cmd_t *cmd) * in the async event handler. 
*/ atomic_inc_32(&cmd->nc_nvme->n_internal_err); - bd_error(cmd->nc_xfer, BD_ERR_NTRDY); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_NTRDY); return (EIO); case NVME_CQE_SC_GEN_ABORT_REQUEST: @@ -981,13 +1045,15 @@ nvme_check_generic_cmd_status(nvme_cmd_t *cmd) case NVME_CQE_SC_GEN_NVM_CAP_EXC: /* Capacity Exceeded */ atomic_inc_32(&cmd->nc_nvme->n_nvm_cap_exc); - bd_error(cmd->nc_xfer, BD_ERR_MEDIA); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_MEDIA); return (EIO); case NVME_CQE_SC_GEN_NVM_NS_NOTRDY: /* Namespace Not Ready */ atomic_inc_32(&cmd->nc_nvme->n_nvm_ns_notrdy); - bd_error(cmd->nc_xfer, BD_ERR_NTRDY); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_NTRDY); return (EIO); default: @@ -1048,14 +1114,14 @@ nvme_check_specific_cmd_status(nvme_cmd_t *cmd) /* Invalid Log Page */ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_GET_LOG_PAGE); atomic_inc_32(&cmd->nc_nvme->n_inv_log_page); - bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); return (EINVAL); case NVME_CQE_SC_SPC_INV_FORMAT: /* Invalid Format */ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_FORMAT); atomic_inc_32(&cmd->nc_nvme->n_inv_format); - bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); return (EINVAL); case NVME_CQE_SC_SPC_INV_Q_DEL: @@ -1070,7 +1136,8 @@ nvme_check_specific_cmd_status(nvme_cmd_t *cmd) cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); atomic_inc_32(&cmd->nc_nvme->n_cnfl_attr); - bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); return (EINVAL); case NVME_CQE_SC_SPC_NVM_INV_PROT: @@ -1079,14 +1146,16 @@ nvme_check_specific_cmd_status(nvme_cmd_t *cmd) cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_READ || cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); atomic_inc_32(&cmd->nc_nvme->n_inv_prot); - bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); return (EINVAL); case NVME_CQE_SC_SPC_NVM_READONLY: /* Write to Read Only Range */ ASSERT(cmd->nc_sqe.sqe_opc == NVME_OPC_NVM_WRITE); atomic_inc_32(&cmd->nc_nvme->n_readonly); - bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); + if (cmd->nc_xfer != NULL) + bd_error(cmd->nc_xfer, BD_ERR_ILLRQ); return (EROFS); default: @@ -1281,6 +1350,7 @@ nvme_async_event_task(void *arg) nvme_t *nvme = cmd->nc_nvme; nvme_error_log_entry_t *error_log = NULL; nvme_health_log_t *health_log = NULL; + size_t logsize = 0; nvme_async_event_t event; int ret; @@ -1328,8 +1398,8 @@ nvme_async_event_task(void *arg) switch (event.b.ae_type) { case NVME_ASYNC_TYPE_ERROR: if (event.b.ae_logpage == NVME_LOGPAGE_ERROR) { - error_log = (nvme_error_log_entry_t *) - nvme_get_logpage(nvme, event.b.ae_logpage); + (void) nvme_get_logpage(nvme, (void **)&error_log, + &logsize, event.b.ae_logpage); } else { dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " "async event reply: %d", event.b.ae_logpage); @@ -1379,8 +1449,8 @@ nvme_async_event_task(void *arg) case NVME_ASYNC_TYPE_HEALTH: if (event.b.ae_logpage == NVME_LOGPAGE_HEALTH) { - health_log = (nvme_health_log_t *) - nvme_get_logpage(nvme, event.b.ae_logpage, -1); + (void) nvme_get_logpage(nvme, (void **)&health_log, + &logsize, event.b.ae_logpage, -1); } else { dev_err(nvme->n_dip, CE_WARN, "!wrong logpage in " "async event reply: %d", event.b.ae_logpage); @@ -1427,11 +1497,10 @@ nvme_async_event_task(void *arg) } if (error_log) - kmem_free(error_log, sizeof (nvme_error_log_entry_t) * - nvme->n_error_log_len); + kmem_free(error_log, logsize); if 
(health_log) - kmem_free(health_log, sizeof (nvme_health_log_t)); + kmem_free(health_log, logsize); } static int @@ -1485,14 +1554,58 @@ nvme_async_event(nvme_t *nvme) return (DDI_SUCCESS); } -static void * -nvme_get_logpage(nvme_t *nvme, uint8_t logpage, ...) +static int +nvme_format_nvm(nvme_t *nvme, uint32_t nsid, uint8_t lbaf, boolean_t ms, + uint8_t pi, boolean_t pil, uint8_t ses) +{ + nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); + nvme_format_nvm_t format_nvm = { 0 }; + int ret; + + format_nvm.b.fm_lbaf = lbaf & 0xf; + format_nvm.b.fm_ms = ms ? 1 : 0; + format_nvm.b.fm_pi = pi & 0x7; + format_nvm.b.fm_pil = pil ? 1 : 0; + format_nvm.b.fm_ses = ses & 0x7; + + cmd->nc_sqid = 0; + cmd->nc_callback = nvme_wakeup_cmd; + cmd->nc_sqe.sqe_nsid = nsid; + cmd->nc_sqe.sqe_opc = NVME_OPC_NVM_FORMAT; + cmd->nc_sqe.sqe_cdw10 = format_nvm.r; + + /* + * Some devices like Samsung SM951 don't allow formatting of all + * namespaces in one command. Handle that gracefully. + */ + if (nsid == (uint32_t)-1) + cmd->nc_dontpanic = B_TRUE; + + if ((ret = nvme_admin_cmd(cmd, nvme_format_cmd_timeout)) + != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_admin_cmd failed for FORMAT NVM"); + return (EIO); + } + + if ((ret = nvme_check_cmd_status(cmd)) != 0) { + dev_err(nvme->n_dip, CE_WARN, + "!FORMAT failed with sct = %x, sc = %x", + cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc); + } + + nvme_free_cmd(cmd); + return (ret); +} + +static int +nvme_get_logpage(nvme_t *nvme, void **buf, size_t *bufsize, uint8_t logpage, + ...) { nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); - void *buf = NULL; nvme_getlogpage_t getlogpage = { 0 }; - size_t bufsize; va_list ap; + int ret = DDI_FAILURE; va_start(ap, logpage); @@ -1505,18 +1618,22 @@ nvme_get_logpage(nvme_t *nvme, uint8_t logpage, ...) switch (logpage) { case NVME_LOGPAGE_ERROR: cmd->nc_sqe.sqe_nsid = (uint32_t)-1; - bufsize = nvme->n_error_log_len * - sizeof (nvme_error_log_entry_t); + /* + * The GET LOG PAGE command can use at most 2 pages to return + * data, PRP lists are not supported. + */ + *bufsize = MIN(2 * nvme->n_pagesize, + nvme->n_error_log_len * sizeof (nvme_error_log_entry_t)); break; case NVME_LOGPAGE_HEALTH: cmd->nc_sqe.sqe_nsid = va_arg(ap, uint32_t); - bufsize = sizeof (nvme_health_log_t); + *bufsize = sizeof (nvme_health_log_t); break; case NVME_LOGPAGE_FWSLOT: cmd->nc_sqe.sqe_nsid = (uint32_t)-1; - bufsize = sizeof (nvme_fwslot_log_t); + *bufsize = sizeof (nvme_fwslot_log_t); break; default: @@ -1528,7 +1645,7 @@ nvme_get_logpage(nvme_t *nvme, uint8_t logpage, ...) va_end(ap); - getlogpage.b.lp_numd = bufsize / sizeof (uint32_t) - 1; + getlogpage.b.lp_numd = *bufsize / sizeof (uint32_t) - 1; cmd->nc_sqe.sqe_cdw10 = getlogpage.r; @@ -1557,7 +1674,7 @@ nvme_get_logpage(nvme_t *nvme, uint8_t logpage, ...) if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) { dev_err(nvme->n_dip, CE_WARN, "!nvme_admin_cmd failed for GET LOG PAGE"); - return (NULL); + return (ret); } if (nvme_check_cmd_status(cmd)) { @@ -1567,13 +1684,15 @@ nvme_get_logpage(nvme_t *nvme, uint8_t logpage, ...) 
goto fail; } - buf = kmem_alloc(bufsize, KM_SLEEP); - bcopy(cmd->nc_dma->nd_memp, buf, bufsize); + *buf = kmem_alloc(*bufsize, KM_SLEEP); + bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize); + + ret = DDI_SUCCESS; fail: nvme_free_cmd(cmd); - return (buf); + return (ret); } static void * @@ -1684,6 +1803,130 @@ fail: } static boolean_t +nvme_get_features(nvme_t *nvme, uint32_t nsid, uint8_t feature, uint32_t *res, + void **buf, size_t *bufsize) +{ + nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP); + boolean_t ret = B_FALSE; + + ASSERT(res != NULL); + + if (bufsize != NULL) + *bufsize = 0; + + cmd->nc_sqid = 0; + cmd->nc_callback = nvme_wakeup_cmd; + cmd->nc_sqe.sqe_opc = NVME_OPC_GET_FEATURES; + cmd->nc_sqe.sqe_cdw10 = feature; + cmd->nc_sqe.sqe_cdw11 = *res; + + switch (feature) { + case NVME_FEAT_ARBITRATION: + case NVME_FEAT_POWER_MGMT: + case NVME_FEAT_TEMPERATURE: + case NVME_FEAT_ERROR: + case NVME_FEAT_NQUEUES: + case NVME_FEAT_INTR_COAL: + case NVME_FEAT_INTR_VECT: + case NVME_FEAT_WRITE_ATOM: + case NVME_FEAT_ASYNC_EVENT: + case NVME_FEAT_PROGRESS: + break; + + case NVME_FEAT_WRITE_CACHE: + if (!nvme->n_write_cache_present) + goto fail; + break; + + case NVME_FEAT_LBA_RANGE: + if (!nvme->n_lba_range_supported) + goto fail; + + /* + * The LBA Range Type feature is optional. There doesn't seem + * be a method of detecting whether it is supported other than + * using it. This will cause a "invalid field in command" error, + * which is normally considered a programming error and causes + * panic in nvme_check_generic_cmd_status(). + */ + cmd->nc_dontpanic = B_TRUE; + cmd->nc_sqe.sqe_nsid = nsid; + ASSERT(bufsize != NULL); + *bufsize = NVME_LBA_RANGE_BUFSIZE; + + break; + + case NVME_FEAT_AUTO_PST: + if (!nvme->n_auto_pst_supported) + goto fail; + + ASSERT(bufsize != NULL); + *bufsize = NVME_AUTO_PST_BUFSIZE; + break; + + default: + goto fail; + } + + if (bufsize != NULL && *bufsize != 0) { + if (nvme_zalloc_dma(nvme, *bufsize, DDI_DMA_READ, + &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_zalloc_dma failed for GET FEATURES"); + goto fail; + } + + if (cmd->nc_dma->nd_ncookie > 2) { + dev_err(nvme->n_dip, CE_WARN, + "!too many DMA cookies for GET FEATURES"); + atomic_inc_32(&nvme->n_too_many_cookies); + goto fail; + } + + cmd->nc_sqe.sqe_dptr.d_prp[0] = + cmd->nc_dma->nd_cookie.dmac_laddress; + if (cmd->nc_dma->nd_ncookie > 1) { + ddi_dma_nextcookie(cmd->nc_dma->nd_dmah, + &cmd->nc_dma->nd_cookie); + cmd->nc_sqe.sqe_dptr.d_prp[1] = + cmd->nc_dma->nd_cookie.dmac_laddress; + } + } + + if (nvme_admin_cmd(cmd, nvme_admin_cmd_timeout) != DDI_SUCCESS) { + dev_err(nvme->n_dip, CE_WARN, + "!nvme_admin_cmd failed for GET FEATURES"); + return (ret); + } + + if (nvme_check_cmd_status(cmd)) { + if (feature == NVME_FEAT_LBA_RANGE && + cmd->nc_cqe.cqe_sf.sf_sct == NVME_CQE_SCT_GENERIC && + cmd->nc_cqe.cqe_sf.sf_sc == NVME_CQE_SC_GEN_INV_FLD) + nvme->n_lba_range_supported = B_FALSE; + else + dev_err(nvme->n_dip, CE_WARN, + "!GET FEATURES %d failed with sct = %x, sc = %x", + feature, cmd->nc_cqe.cqe_sf.sf_sct, + cmd->nc_cqe.cqe_sf.sf_sc); + goto fail; + } + + if (bufsize != NULL && *bufsize != 0) { + ASSERT(buf != NULL); + *buf = kmem_alloc(*bufsize, KM_SLEEP); + bcopy(cmd->nc_dma->nd_memp, *buf, *bufsize); + } + + *res = cmd->nc_cqe.cqe_dw0; + ret = B_TRUE; + +fail: + nvme_free_cmd(cmd); + return (ret); +} + +static boolean_t nvme_write_cache_set(nvme_t *nvme, boolean_t enable) { nvme_write_cache_t nwc = { 0 }; @@ -1700,7 +1943,7 @@ 
nvme_write_cache_set(nvme_t *nvme, boolean_t enable) static int nvme_set_nqueues(nvme_t *nvme, uint16_t nqueues) { - nvme_nqueue_t nq = { 0 }; + nvme_nqueues_t nq = { 0 }; nq.b.nq_nsq = nq.b.nq_ncq = nqueues - 1; @@ -1866,6 +2109,89 @@ nvme_prepare_devid(nvme_t *nvme, uint32_t nsid) } static int +nvme_init_ns(nvme_t *nvme, int nsid) +{ + nvme_namespace_t *ns = &nvme->n_ns[nsid - 1]; + nvme_identify_nsid_t *idns; + int last_rp; + + ns->ns_nvme = nvme; + idns = nvme_identify(nvme, nsid); + + if (idns == NULL) { + dev_err(nvme->n_dip, CE_WARN, + "!failed to identify namespace %d", nsid); + return (DDI_FAILURE); + } + + ns->ns_idns = idns; + ns->ns_id = nsid; + ns->ns_block_count = idns->id_nsize; + ns->ns_block_size = + 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; + ns->ns_best_block_size = ns->ns_block_size; + + /* + * Get the EUI64 if present. Use it for devid and device node names. + */ + if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) + bcopy(idns->id_eui64, ns->ns_eui64, sizeof (ns->ns_eui64)); + + /*LINTED: E_BAD_PTR_CAST_ALIGN*/ + if (*(uint64_t *)ns->ns_eui64 != 0) { + uint8_t *eui64 = ns->ns_eui64; + + (void) snprintf(ns->ns_name, sizeof (ns->ns_name), + "%02x%02x%02x%02x%02x%02x%02x%02x", + eui64[0], eui64[1], eui64[2], eui64[3], + eui64[4], eui64[5], eui64[6], eui64[7]); + } else { + (void) snprintf(ns->ns_name, sizeof (ns->ns_name), "%d", + ns->ns_id); + + nvme_prepare_devid(nvme, ns->ns_id); + } + + /* + * Find the LBA format with no metadata and the best relative + * performance. A value of 3 means "degraded", 0 is best. + */ + last_rp = 3; + for (int j = 0; j <= idns->id_nlbaf; j++) { + if (idns->id_lbaf[j].lbaf_lbads == 0) + break; + if (idns->id_lbaf[j].lbaf_ms != 0) + continue; + if (idns->id_lbaf[j].lbaf_rp >= last_rp) + continue; + last_rp = idns->id_lbaf[j].lbaf_rp; + ns->ns_best_block_size = + 1 << idns->id_lbaf[j].lbaf_lbads; + } + + if (ns->ns_best_block_size < nvme->n_min_block_size) + ns->ns_best_block_size = nvme->n_min_block_size; + + /* + * We currently don't support namespaces that use either: + * - thin provisioning + * - protection information + */ + if (idns->id_nsfeat.f_thin || + idns->id_dps.dp_pinfo) { + dev_err(nvme->n_dip, CE_WARN, + "!ignoring namespace %d, unsupported features: " + "thin = %d, pinfo = %d", nsid, + idns->id_nsfeat.f_thin, idns->id_dps.dp_pinfo); + ns->ns_ignore = B_TRUE; + } else { + ns->ns_ignore = B_FALSE; + } + + return (DDI_SUCCESS); +} + +static int nvme_init(nvme_t *nvme) { nvme_reg_cc_t cc = { 0 }; @@ -2150,90 +2476,37 @@ nvme_init(nvme_t *nvme) nvme->n_write_cache_enabled ? 1 : 0); /* - * Grab a copy of all mandatory log pages. - * - * TODO: should go away once user space tool exists to print logs + * Assume LBA Range Type feature is supported. If it isn't this + * will be set to B_FALSE by nvme_get_features(). */ - nvme->n_error_log = (nvme_error_log_entry_t *) - nvme_get_logpage(nvme, NVME_LOGPAGE_ERROR); - nvme->n_health_log = (nvme_health_log_t *) - nvme_get_logpage(nvme, NVME_LOGPAGE_HEALTH, -1); - nvme->n_fwslot_log = (nvme_fwslot_log_t *) - nvme_get_logpage(nvme, NVME_LOGPAGE_FWSLOT); + nvme->n_lba_range_supported = B_TRUE; + + /* + * Check support for Autonomous Power State Transition. + */ + if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) + nvme->n_auto_pst_supported = + nvme->n_idctl->id_apsta.ap_sup == 0 ? 
B_FALSE : B_TRUE; /* * Identify Namespaces */ nvme->n_namespace_count = nvme->n_idctl->id_nn; + if (nvme->n_namespace_count > NVME_MINOR_MAX) { + dev_err(nvme->n_dip, CE_WARN, + "!too many namespaces: %d, limiting to %d\n", + nvme->n_namespace_count, NVME_MINOR_MAX); + nvme->n_namespace_count = NVME_MINOR_MAX; + } + nvme->n_ns = kmem_zalloc(sizeof (nvme_namespace_t) * nvme->n_namespace_count, KM_SLEEP); for (i = 0; i != nvme->n_namespace_count; i++) { - nvme_identify_nsid_t *idns; - int last_rp; - - nvme->n_ns[i].ns_nvme = nvme; - nvme->n_ns[i].ns_idns = idns = nvme_identify(nvme, i + 1); - - if (idns == NULL) { - dev_err(nvme->n_dip, CE_WARN, - "!failed to identify namespace %d", i + 1); + mutex_init(&nvme->n_ns[i].ns_minor.nm_mutex, NULL, MUTEX_DRIVER, + NULL); + if (nvme_init_ns(nvme, i + 1) != DDI_SUCCESS) goto fail; - } - - nvme->n_ns[i].ns_id = i + 1; - nvme->n_ns[i].ns_block_count = idns->id_nsize; - nvme->n_ns[i].ns_block_size = - 1 << idns->id_lbaf[idns->id_flbas.lba_format].lbaf_lbads; - nvme->n_ns[i].ns_best_block_size = nvme->n_ns[i].ns_block_size; - - /* - * Get the EUI64 if present. If not present prepare the devid - * from other device data. - */ - if (NVME_VERSION_ATLEAST(&nvme->n_version, 1, 1)) - bcopy(idns->id_eui64, nvme->n_ns[i].ns_eui64, - sizeof (nvme->n_ns[i].ns_eui64)); - - /*LINTED: E_BAD_PTR_CAST_ALIGN*/ - if (*(uint64_t *)nvme->n_ns[i].ns_eui64 == 0) { - nvme_prepare_devid(nvme, nvme->n_ns[i].ns_id); - } - - /* - * Find the LBA format with no metadata and the best relative - * performance. A value of 3 means "degraded", 0 is best. - */ - last_rp = 3; - for (int j = 0; j <= idns->id_nlbaf; j++) { - if (idns->id_lbaf[j].lbaf_lbads == 0) - break; - if (idns->id_lbaf[j].lbaf_ms != 0) - continue; - if (idns->id_lbaf[j].lbaf_rp >= last_rp) - continue; - last_rp = idns->id_lbaf[j].lbaf_rp; - nvme->n_ns[i].ns_best_block_size = - 1 << idns->id_lbaf[j].lbaf_lbads; - } - - if (nvme->n_ns[i].ns_best_block_size < nvme->n_min_block_size) - nvme->n_ns[i].ns_best_block_size = - nvme->n_min_block_size; - - /* - * We currently don't support namespaces that use either: - * - thin provisioning - * - protection information - */ - if (idns->id_nsfeat.f_thin || - idns->id_dps.dp_pinfo) { - dev_err(nvme->n_dip, CE_WARN, - "!ignoring namespace %d, unsupported features: " - "thin = %d, pinfo = %d", i + 1, - idns->id_nsfeat.f_thin, idns->id_dps.dp_pinfo); - nvme->n_ns[i].ns_ignore = B_TRUE; - } } /* @@ -2520,6 +2793,8 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) ddi_set_driver_private(dip, nvme); nvme->n_dip = dip; + mutex_init(&nvme->n_minor.nm_mutex, NULL, MUTEX_DRIVER, NULL); + nvme->n_strict_version = ddi_prop_get_int(DDI_DEV_T_ANY, dip, DDI_PROP_DONTPASS, "strict-version", 1) == 1 ? B_TRUE : B_FALSE; nvme->n_ignore_unknown_vendor_status = ddi_prop_get_int(DDI_DEV_T_ANY, @@ -2640,6 +2915,14 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) * Attach the blkdev driver for each namespace. 
*/ for (i = 0; i != nvme->n_namespace_count; i++) { + if (ddi_create_minor_node(nvme->n_dip, nvme->n_ns[i].ns_name, + S_IFCHR, NVME_MINOR(ddi_get_instance(nvme->n_dip), i + 1), + DDI_NT_NVME_ATTACHMENT_POINT, 0) != DDI_SUCCESS) { + dev_err(dip, CE_WARN, + "!failed to create minor node for namespace %d", i); + goto fail; + } + if (nvme->n_ns[i].ns_ignore) continue; @@ -2661,6 +2944,14 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) } } + if (ddi_create_minor_node(dip, "devctl", S_IFCHR, + NVME_MINOR(ddi_get_instance(dip), 0), DDI_NT_NVME_NEXUS, 0) + != DDI_SUCCESS) { + dev_err(dip, CE_WARN, "nvme_attach: " + "cannot create devctl minor node"); + goto fail; + } + return (DDI_SUCCESS); fail: @@ -2689,8 +2980,14 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) if (nvme == NULL) return (DDI_FAILURE); + ddi_remove_minor_node(dip, "devctl"); + mutex_destroy(&nvme->n_minor.nm_mutex); + if (nvme->n_ns) { for (i = 0; i != nvme->n_namespace_count; i++) { + ddi_remove_minor_node(dip, nvme->n_ns[i].ns_name); + mutex_destroy(&nvme->n_ns[i].ns_minor.nm_mutex); + if (nvme->n_ns[i].ns_bd_hdl) { (void) bd_detach_handle( nvme->n_ns[i].ns_bd_hdl); @@ -2745,7 +3042,7 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) nvme_free_qpair(nvme->n_adminq); if (nvme->n_idctl) - kmem_free(nvme->n_idctl, sizeof (nvme_identify_ctrl_t)); + kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE); if (nvme->n_progress & NVME_REGS_MAPPED) ddi_regs_map_free(&nvme->n_regh); @@ -3042,3 +3339,531 @@ nvme_bd_devid(void *arg, dev_info_t *devinfo, ddi_devid_t *devid) strlen(ns->ns_devid), ns->ns_devid, devid)); } } + +static int +nvme_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) +{ +#ifndef __lock_lint + _NOTE(ARGUNUSED(cred_p)); +#endif + minor_t minor = getminor(*devp); + nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); + int nsid = NVME_MINOR_NSID(minor); + nvme_minor_state_t *nm; + int rv = 0; + + if (otyp != OTYP_CHR) + return (EINVAL); + + if (nvme == NULL) + return (ENXIO); + + if (nsid > nvme->n_namespace_count) + return (ENXIO); + + nm = nsid == 0 ? &nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; + + mutex_enter(&nm->nm_mutex); + if (nm->nm_oexcl) { + rv = EBUSY; + goto out; + } + + if (flag & FEXCL) { + if (nm->nm_ocnt != 0) { + rv = EBUSY; + goto out; + } + nm->nm_oexcl = B_TRUE; + } + + nm->nm_ocnt++; + +out: + mutex_exit(&nm->nm_mutex); + return (rv); + +} + +static int +nvme_close(dev_t dev, int flag, int otyp, cred_t *cred_p) +{ +#ifndef __lock_lint + _NOTE(ARGUNUSED(cred_p)); + _NOTE(ARGUNUSED(flag)); +#endif + minor_t minor = getminor(dev); + nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); + int nsid = NVME_MINOR_NSID(minor); + nvme_minor_state_t *nm; + + if (otyp != OTYP_CHR) + return (ENXIO); + + if (nvme == NULL) + return (ENXIO); + + if (nsid > nvme->n_namespace_count) + return (ENXIO); + + nm = nsid == 0 ? 
&nvme->n_minor : &nvme->n_ns[nsid - 1].ns_minor; + + mutex_enter(&nm->nm_mutex); + if (nm->nm_oexcl) + nm->nm_oexcl = B_FALSE; + + ASSERT(nm->nm_ocnt > 0); + nm->nm_ocnt--; + mutex_exit(&nm->nm_mutex); + + return (0); +} + +static int +nvme_ioctl_identify(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, + cred_t *cred_p) +{ + _NOTE(ARGUNUSED(cred_p)); + int rv = 0; + void *idctl; + + if ((mode & FREAD) == 0) + return (EPERM); + + if (nioc->n_len < NVME_IDENTIFY_BUFSIZE) + return (EINVAL); + + idctl = nvme_identify(nvme, nsid); + if (idctl == NULL) + return (EIO); + + if (ddi_copyout(idctl, (void *)nioc->n_buf, NVME_IDENTIFY_BUFSIZE, mode) + != 0) + rv = EFAULT; + + kmem_free(idctl, NVME_IDENTIFY_BUFSIZE); + + return (rv); +} + +static int +nvme_ioctl_capabilities(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, + int mode, cred_t *cred_p) +{ + _NOTE(ARGUNUSED(nsid, cred_p)); + int rv = 0; + nvme_reg_cap_t cap = { 0 }; + nvme_capabilities_t nc; + + if ((mode & FREAD) == 0) + return (EPERM); + + if (nioc->n_len < sizeof (nc)) + return (EINVAL); + + cap.r = nvme_get64(nvme, NVME_REG_CAP); + + /* + * The MPSMIN and MPSMAX fields in the CAP register use 0 to + * specify the base page size of 4k (1<<12), so add 12 here to + * get the real page size value. + */ + nc.mpsmax = 1 << (12 + cap.b.cap_mpsmax); + nc.mpsmin = 1 << (12 + cap.b.cap_mpsmin); + + if (ddi_copyout(&nc, (void *)nioc->n_buf, sizeof (nc), mode) != 0) + rv = EFAULT; + + return (rv); +} + +static int +nvme_ioctl_get_logpage(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, + int mode, cred_t *cred_p) +{ + _NOTE(ARGUNUSED(cred_p)); + void *log = NULL; + size_t bufsize = 0; + int rv = 0; + + if ((mode & FREAD) == 0) + return (EPERM); + + switch (nioc->n_arg) { + case NVME_LOGPAGE_ERROR: + if (nsid != 0) + return (EINVAL); + break; + case NVME_LOGPAGE_HEALTH: + if (nsid != 0 && nvme->n_idctl->id_lpa.lp_smart == 0) + return (EINVAL); + + if (nsid == 0) + nsid = (uint32_t)-1; + + break; + case NVME_LOGPAGE_FWSLOT: + if (nsid != 0) + return (EINVAL); + break; + default: + return (EINVAL); + } + + if (nvme_get_logpage(nvme, &log, &bufsize, nioc->n_arg, nsid) + != DDI_SUCCESS) + return (EIO); + + if (nioc->n_len < bufsize) { + kmem_free(log, bufsize); + return (EINVAL); + } + + if (ddi_copyout(log, (void *)nioc->n_buf, bufsize, mode) != 0) + rv = EFAULT; + + nioc->n_len = bufsize; + kmem_free(log, bufsize); + + return (rv); +} + +static int +nvme_ioctl_get_features(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, + int mode, cred_t *cred_p) +{ + _NOTE(ARGUNUSED(cred_p)); + void *buf = NULL; + size_t bufsize = 0; + uint32_t res = 0; + uint8_t feature; + int rv = 0; + + if ((mode & FREAD) == 0) + return (EPERM); + + if ((nioc->n_arg >> 32) > 0xff) + return (EINVAL); + + feature = (uint8_t)(nioc->n_arg >> 32); + + switch (feature) { + case NVME_FEAT_ARBITRATION: + case NVME_FEAT_POWER_MGMT: + case NVME_FEAT_TEMPERATURE: + case NVME_FEAT_ERROR: + case NVME_FEAT_NQUEUES: + case NVME_FEAT_INTR_COAL: + case NVME_FEAT_WRITE_ATOM: + case NVME_FEAT_ASYNC_EVENT: + case NVME_FEAT_PROGRESS: + if (nsid != 0) + return (EINVAL); + break; + + case NVME_FEAT_INTR_VECT: + if (nsid != 0) + return (EINVAL); + + res = nioc->n_arg & 0xffffffffUL; + if (res >= nvme->n_intr_cnt) + return (EINVAL); + break; + + case NVME_FEAT_LBA_RANGE: + if (nvme->n_lba_range_supported == B_FALSE) + return (EINVAL); + + if (nsid == 0 || + nsid > nvme->n_namespace_count) + return (EINVAL); + + break; + + case NVME_FEAT_WRITE_CACHE: + if (nsid != 0) + return (EINVAL); + + if 
(!nvme->n_write_cache_present) + return (EINVAL); + + break; + + case NVME_FEAT_AUTO_PST: + if (nsid != 0) + return (EINVAL); + + if (!nvme->n_auto_pst_supported) + return (EINVAL); + + break; + + default: + return (EINVAL); + } + + if (nvme_get_features(nvme, nsid, feature, &res, &buf, &bufsize) == + B_FALSE) + return (EIO); + + if (nioc->n_len < bufsize) { + kmem_free(buf, bufsize); + return (EINVAL); + } + + if (buf && ddi_copyout(buf, (void*)nioc->n_buf, bufsize, mode) != 0) + rv = EFAULT; + + kmem_free(buf, bufsize); + nioc->n_arg = res; + nioc->n_len = bufsize; + + return (rv); +} + +static int +nvme_ioctl_intr_cnt(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, + cred_t *cred_p) +{ + _NOTE(ARGUNUSED(nsid, mode, cred_p)); + + if ((mode & FREAD) == 0) + return (EPERM); + + nioc->n_arg = nvme->n_intr_cnt; + return (0); +} + +static int +nvme_ioctl_version(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, + cred_t *cred_p) +{ + _NOTE(ARGUNUSED(nsid, cred_p)); + int rv = 0; + + if ((mode & FREAD) == 0) + return (EPERM); + + if (nioc->n_len < sizeof (nvme->n_version)) + return (ENOMEM); + + if (ddi_copyout(&nvme->n_version, (void *)nioc->n_buf, + sizeof (nvme->n_version), mode) != 0) + rv = EFAULT; + + return (rv); +} + +static int +nvme_ioctl_format(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, + cred_t *cred_p) +{ + _NOTE(ARGUNUSED(mode)); + nvme_format_nvm_t frmt = { 0 }; + int c_nsid = nsid != 0 ? nsid - 1 : 0; + + if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) + return (EPERM); + + frmt.r = nioc->n_arg & 0xffffffff; + + /* + * Check whether the FORMAT NVM command is supported. + */ + if (nvme->n_idctl->id_oacs.oa_format == 0) + return (EINVAL); + + /* + * Don't allow format or secure erase of individual namespace if that + * would cause a format or secure erase of all namespaces. + */ + if (nsid != 0 && nvme->n_idctl->id_fna.fn_format != 0) + return (EINVAL); + + if (nsid != 0 && frmt.b.fm_ses != NVME_FRMT_SES_NONE && + nvme->n_idctl->id_fna.fn_sec_erase != 0) + return (EINVAL); + + /* + * Don't allow formatting with Protection Information. + */ + if (frmt.b.fm_pi != 0 || frmt.b.fm_pil != 0 || frmt.b.fm_ms != 0) + return (EINVAL); + + /* + * Don't allow formatting using an illegal LBA format, or any LBA format + * that uses metadata. + */ + if (frmt.b.fm_lbaf > nvme->n_ns[c_nsid].ns_idns->id_nlbaf || + nvme->n_ns[c_nsid].ns_idns->id_lbaf[frmt.b.fm_lbaf].lbaf_ms != 0) + return (EINVAL); + + /* + * Don't allow formatting using an illegal Secure Erase setting. 
+ */ + if (frmt.b.fm_ses > NVME_FRMT_MAX_SES || + (frmt.b.fm_ses == NVME_FRMT_SES_CRYPTO && + nvme->n_idctl->id_fna.fn_crypt_erase == 0)) + return (EINVAL); + + if (nsid == 0) + nsid = (uint32_t)-1; + + return (nvme_format_nvm(nvme, nsid, frmt.b.fm_lbaf, B_FALSE, 0, B_FALSE, + frmt.b.fm_ses)); +} + +static int +nvme_ioctl_detach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, + cred_t *cred_p) +{ + _NOTE(ARGUNUSED(nioc, mode)); + int rv = 0; + + if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) + return (EPERM); + + if (nsid == 0) + return (EINVAL); + + rv = bd_detach_handle(nvme->n_ns[nsid - 1].ns_bd_hdl); + if (rv != DDI_SUCCESS) + rv = EBUSY; + + return (rv); +} + +static int +nvme_ioctl_attach(nvme_t *nvme, int nsid, nvme_ioctl_t *nioc, int mode, + cred_t *cred_p) +{ + _NOTE(ARGUNUSED(nioc, mode)); + nvme_identify_nsid_t *idns; + int rv = 0; + + if ((mode & FWRITE) == 0 || secpolicy_sys_config(cred_p, B_FALSE) != 0) + return (EPERM); + + if (nsid == 0) + return (EINVAL); + + /* + * Identify namespace again, free old identify data. + */ + idns = nvme->n_ns[nsid - 1].ns_idns; + if (nvme_init_ns(nvme, nsid) != DDI_SUCCESS) + return (EIO); + + kmem_free(idns, sizeof (nvme_identify_nsid_t)); + + rv = bd_attach_handle(nvme->n_dip, nvme->n_ns[nsid - 1].ns_bd_hdl); + if (rv != DDI_SUCCESS) + rv = EBUSY; + + return (rv); +} + +static int +nvme_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *cred_p, + int *rval_p) +{ +#ifndef __lock_lint + _NOTE(ARGUNUSED(rval_p)); +#endif + minor_t minor = getminor(dev); + nvme_t *nvme = ddi_get_soft_state(nvme_state, NVME_MINOR_INST(minor)); + int nsid = NVME_MINOR_NSID(minor); + int rv = 0; + nvme_ioctl_t nioc; + + int (*nvme_ioctl[])(nvme_t *, int, nvme_ioctl_t *, int, cred_t *) = { + NULL, + nvme_ioctl_identify, + nvme_ioctl_identify, + nvme_ioctl_capabilities, + nvme_ioctl_get_logpage, + nvme_ioctl_get_features, + nvme_ioctl_intr_cnt, + nvme_ioctl_version, + nvme_ioctl_format, + nvme_ioctl_detach, + nvme_ioctl_attach + }; + + if (nvme == NULL) + return (ENXIO); + + if (nsid > nvme->n_namespace_count) + return (ENXIO); + + if (IS_DEVCTL(cmd)) + return (ndi_devctl_ioctl(nvme->n_dip, cmd, arg, mode, 0)); + +#ifdef _MULTI_DATAMODEL + switch (ddi_model_convert_from(mode & FMODELS)) { + case DDI_MODEL_ILP32: { + nvme_ioctl32_t nioc32; + if (ddi_copyin((void*)arg, &nioc32, sizeof (nvme_ioctl32_t), + mode) != 0) + return (EFAULT); + nioc.n_len = nioc32.n_len; + nioc.n_buf = nioc32.n_buf; + nioc.n_arg = nioc32.n_arg; + break; + } + case DDI_MODEL_NONE: +#endif + if (ddi_copyin((void*)arg, &nioc, sizeof (nvme_ioctl_t), mode) + != 0) + return (EFAULT); +#ifdef _MULTI_DATAMODEL + break; + } +#endif + + if (cmd == NVME_IOC_IDENTIFY_CTRL) { + /* + * This makes NVME_IOC_IDENTIFY_CTRL work the same on devctl and + * attachment point nodes. + */ + nsid = 0; + } else if (cmd == NVME_IOC_IDENTIFY_NSID && nsid == 0) { + /* + * This makes NVME_IOC_IDENTIFY_NSID work on a devctl node, it + * will always return identify data for namespace 1. 
+ */ + nsid = 1; + } + + if (IS_NVME_IOC(cmd) && nvme_ioctl[NVME_IOC_CMD(cmd)] != NULL) + rv = nvme_ioctl[NVME_IOC_CMD(cmd)](nvme, nsid, &nioc, mode, + cred_p); + else + rv = EINVAL; + +#ifdef _MULTI_DATAMODEL + switch (ddi_model_convert_from(mode & FMODELS)) { + case DDI_MODEL_ILP32: { + nvme_ioctl32_t nioc32; + + nioc32.n_len = (size32_t)nioc.n_len; + nioc32.n_buf = (uintptr32_t)nioc.n_buf; + nioc32.n_arg = nioc.n_arg; + + if (ddi_copyout(&nioc32, (void *)arg, sizeof (nvme_ioctl32_t), + mode) != 0) + return (EFAULT); + break; + } + case DDI_MODEL_NONE: +#endif + if (ddi_copyout(&nioc, (void *)arg, sizeof (nvme_ioctl_t), mode) + != 0) + return (EFAULT); +#ifdef _MULTI_DATAMODEL + break; + } +#endif + + return (rv); +} diff --git a/usr/src/uts/common/io/nvme/nvme_reg.h b/usr/src/uts/common/io/nvme/nvme_reg.h index 3e4b77079b..acff0e2362 100644 --- a/usr/src/uts/common/io/nvme/nvme_reg.h +++ b/usr/src/uts/common/io/nvme/nvme_reg.h @@ -20,6 +20,8 @@ #ifndef _NVME_REG_H #define _NVME_REG_H +#include <sys/nvme.h> + #pragma pack(1) #ifdef __cplusplus @@ -33,22 +35,6 @@ extern "C" { #define NVME_MAX_ADMIN_QUEUE_LEN 4096 /* - * NVMe version - */ -typedef struct { - uint16_t v_minor; - uint16_t v_major; -} nvme_version_t; - -#define NVME_VERSION_ATLEAST(v, maj, min) \ - (((v)->v_major) > (maj) || \ - ((v)->v_major == (maj) && (v)->v_minor >= (min))) - -#define NVME_VERSION_HIGHER(v, maj, min) \ - (((v)->v_major) > (maj) || \ - ((v)->v_major == (maj) && (v)->v_minor > (min))) - -/* * NVMe registers and register fields */ #define NVME_REG_CAP 0x0 /* Controller Capabilities */ @@ -258,15 +244,6 @@ typedef struct { * NVMe completion queue entry */ typedef struct { - uint16_t sf_p:1; /* Phase Tag */ - uint16_t sf_sc:8; /* Status Code */ - uint16_t sf_sct:3; /* Status Code Type */ - uint16_t sf_rsvd2:2; - uint16_t sf_m:1; /* More */ - uint16_t sf_dnr:1; /* Do Not Retry */ -} nvme_cqe_sf_t; - -typedef struct { uint32_t cqe_dw0; /* Command Specific */ uint32_t cqe_rsvd1; uint16_t cqe_sqhd; /* SQ Head Pointer */ @@ -408,203 +385,6 @@ typedef union { #define NVME_IDENTIFY_CTRL 0x1 /* Identify Controller */ #define NVME_IDENTIFY_LIST 0x2 /* Identify List Namespaces */ -#define NVME_IDENTIFY_BUFSIZE 4096 /* buffer size for Identify */ - -/* NVMe Queue Entry Size bitfield */ -typedef struct { - uint8_t qes_min:4; /* minimum entry size */ - uint8_t qes_max:4; /* maximum entry size */ -} nvme_idctl_qes_t; - -/* NVMe Power State Descriptor */ -typedef struct { - uint16_t psd_mp; /* Maximum Power */ - uint8_t psd_rsvd1; - uint8_t psd_mps:1; /* Max Power Scale (1.1) */ - uint8_t psd_nops:1; /* Non-Operational State (1.1) */ - uint8_t psd_rsvd2:6; - uint32_t psd_enlat; /* Entry Latency */ - uint32_t psd_exlat; /* Exit Latency */ - uint8_t psd_rrt:5; /* Relative Read Throughput */ - uint8_t psd_rsvd3:3; - uint8_t psd_rrl:5; /* Relative Read Latency */ - uint8_t psd_rsvd4:3; - uint8_t psd_rwt:5; /* Relative Write Throughput */ - uint8_t psd_rsvd5:3; - uint8_t psd_rwl:5; /* Relative Write Latency */ - uint8_t psd_rsvd6:3; - uint8_t psd_rsvd7[16]; -} nvme_idctl_psd_t; - -/* NVMe Identify Controller Data Structure */ -typedef struct { - /* Controller Capabilities & Features */ - uint16_t id_vid; /* PCI vendor ID */ - uint16_t id_ssvid; /* PCI subsystem vendor ID */ - char id_serial[20]; /* Serial Number */ - char id_model[40]; /* Model Number */ - char id_fwrev[8]; /* Firmware Revision */ - uint8_t id_rab; /* Recommended Arbitration Burst */ - uint8_t id_oui[3]; /* vendor IEEE OUI */ - struct { /* Multi-Interface 
Capabilities */ - uint8_t m_multi_pci:1; /* HW has multiple PCIe interfaces */ - uint8_t m_multi_ctrl:1; /* HW has multiple controllers (1.1) */ - uint8_t m_sr_iov:1; /* controller is SR-IOV virt fn (1.1) */ - uint8_t m_rsvd:5; - } id_mic; - uint8_t id_mdts; /* Maximum Data Transfer Size */ - uint16_t id_cntlid; /* Unique Controller Identifier (1.1) */ - uint8_t id_rsvd_cc[256 - 80]; - - /* Admin Command Set Attributes */ - struct { /* Optional Admin Command Support */ - uint16_t oa_security:1; /* Security Send & Receive */ - uint16_t oa_format:1; /* Format NVM */ - uint16_t oa_firmare:1; /* Firmware Activate & Download */ - uint16_t oa_rsvd:13; - } id_oacs; - uint8_t id_acl; /* Abort Command Limit */ - uint8_t id_aerl; /* Asynchronous Event Request Limit */ - struct { /* Firmware Updates */ - uint8_t fw_readonly:1; /* Slot 1 is Read-Only */ - uint8_t fw_nslot:3; /* number of firmware slots */ - uint8_t fw_rsvd:4; - } id_frmw; - struct { /* Log Page Attributes */ - uint8_t lp_smart:1; /* SMART/Health information per NS */ - uint8_t lp_rsvd:7; - } id_lpa; - uint8_t id_elpe; /* Error Log Page Entries */ - uint8_t id_npss; /* Number of Power States */ - struct { /* Admin Vendor Specific Command Conf */ - uint8_t av_spec:1; /* use format from spec */ - uint8_t av_rsvd:7; - } id_avscc; - struct { /* Autonomous Power State Trans (1.1) */ - uint8_t ap_sup:1; /* APST supported (1.1) */ - uint8_t ap_rsvd:7; - } id_apsta; - uint8_t id_rsvd_ac[256 - 10]; - - /* NVM Command Set Attributes */ - nvme_idctl_qes_t id_sqes; /* Submission Queue Entry Size */ - nvme_idctl_qes_t id_cqes; /* Completion Queue Entry Size */ - uint16_t id_rsvd_nc_1; - uint32_t id_nn; /* Number of Namespaces */ - struct { /* Optional NVM Command Support */ - uint16_t on_compare:1; /* Compare */ - uint16_t on_wr_unc:1; /* Write Uncorrectable */ - uint16_t on_dset_mgmt:1; /* Dataset Management */ - uint16_t on_wr_zero:1; /* Write Zeros (1.1) */ - uint16_t on_save:1; /* Save/Select in Get/Set Feat (1.1) */ - uint16_t on_reserve:1; /* Reservations (1.1) */ - uint16_t on_rsvd:10; - } id_oncs; - struct { /* Fused Operation Support */ - uint16_t f_cmp_wr:1; /* Compare and Write */ - uint16_t f_rsvd:15; - } id_fuses; - struct { /* Format NVM Attributes */ - uint8_t fn_format:1; /* Format applies to all NS */ - uint8_t fn_sec_erase:1; /* Secure Erase applies to all NS */ - uint8_t fn_crypt_erase:1; /* Cryptographic Erase supported */ - uint8_t fn_rsvd:5; - } id_fna; - struct { /* Volatile Write Cache */ - uint8_t vwc_present:1; /* Volatile Write Cache present */ - uint8_t rsvd:7; - } id_vwc; - uint16_t id_awun; /* Atomic Write Unit Normal */ - uint16_t id_awupf; /* Atomic Write Unit Power Fail */ - struct { /* NVM Vendor Specific Command Conf */ - uint8_t nv_spec:1; /* use format from spec */ - uint8_t nv_rsvd:7; - } id_nvscc; - uint8_t id_rsvd_nc_2; - uint16_t id_acwu; /* Atomic Compare & Write Unit (1.1) */ - uint16_t id_rsvd_nc_3; - struct { /* SGL Support (1.1) */ - uint16_t sgl_sup:1; /* SGL Supported in NVM cmds (1.1) */ - uint16_t sgl_rsvd1:15; - uint16_t sgl_bucket:1; /* SGL Bit Bucket supported (1.1) */ - uint16_t sgl_rsvd2:15; - } id_sgls; - uint8_t id_rsvd_nc_4[192 - 28]; - - /* I/O Command Set Attributes */ - uint8_t id_rsvd_ioc[1344]; - - /* Power State Descriptors */ - nvme_idctl_psd_t id_psd[32]; - - /* Vendor Specific */ - uint8_t id_vs[1024]; -} nvme_identify_ctrl_t; - -/* NVMe Identify Namespace LBA Format */ -typedef struct { - uint16_t lbaf_ms; /* Metadata Size */ - uint8_t lbaf_lbads; /* LBA Data Size */ - uint8_t 
lbaf_rp:2; /* Relative Performance */ - uint8_t lbaf_rsvd1:6; -} nvme_idns_lbaf_t; - -/* NVMe Identify Namespace Data Structure */ -typedef struct { - uint64_t id_nsize; /* Namespace Size */ - uint64_t id_ncap; /* Namespace Capacity */ - uint64_t id_nuse; /* Namespace Utilization */ - struct { /* Namespace Features */ - uint8_t f_thin:1; /* Thin Provisioning */ - uint8_t f_rsvd:7; - } id_nsfeat; - uint8_t id_nlbaf; /* Number of LBA formats */ - struct { /* Formatted LBA size */ - uint8_t lba_format:4; /* LBA format */ - uint8_t lba_extlba:1; /* extended LBA (includes metadata) */ - uint8_t lba_rsvd:3; - } id_flbas; - struct { /* Metadata Capabilities */ - uint8_t mc_extlba:1; /* extended LBA transfers */ - uint8_t mc_separate:1; /* separate metadata transfers */ - uint8_t mc_rsvd:6; - } id_mc; - struct { /* Data Protection Capabilities */ - uint8_t dp_type1:1; /* Protection Information Type 1 */ - uint8_t dp_type2:1; /* Protection Information Type 2 */ - uint8_t dp_type3:1; /* Protection Information Type 3 */ - uint8_t dp_first:1; /* first 8 bytes of metadata */ - uint8_t dp_last:1; /* last 8 bytes of metadata */ - uint8_t dp_rsvd:3; - } id_dpc; - struct { /* Data Protection Settings */ - uint8_t dp_pinfo:3; /* Protection Information enabled */ - uint8_t dp_first:1; /* first 8 bytes of metadata */ - uint8_t dp_rsvd:4; - } id_dps; - struct { /* NS Multi-Path/Sharing Cap (1.1) */ - uint8_t nm_shared:1; /* NS is shared (1.1) */ - uint8_t nm_rsvd:7; - } id_nmic; - struct { /* Reservation Capabilities (1.1) */ - uint8_t rc_persist:1; /* Persist Through Power Loss (1.1) */ - uint8_t rc_wr_excl:1; /* Write Exclusive (1.1) */ - uint8_t rc_excl:1; /* Exclusive Access (1.1) */ - uint8_t rc_wr_excl_r:1; /* Wr Excl - Registrants Only (1.1) */ - uint8_t rc_excl_r:1; /* Excl Acc - Registrants Only (1.1) */ - uint8_t rc_wr_excl_a:1; /* Wr Excl - All Registrants (1.1) */ - uint8_t rc_excl_a:1; /* Excl Acc - All Registrants (1.1) */ - uint8_t rc_rsvd:1; - } id_rescap; - uint8_t id_rsvd1[120 - 32]; - uint8_t id_eui64[8]; /* IEEE Extended Unique Id (1.1) */ - nvme_idns_lbaf_t id_lbaf[16]; /* LBA Formats */ - - uint8_t id_rsvd2[192]; - - uint8_t id_vs[3712]; /* Vendor Specific */ -} nvme_identify_nsid_t; - /* * NVMe Abort Command @@ -619,79 +399,8 @@ typedef union { /* - * NVMe Get / Set Features - */ -#define NVME_FEAT_ARBITRATION 0x1 /* Command Arbitration */ -#define NVME_FEAT_POWER_MGMT 0x2 /* Power Management */ -#define NVME_FEAT_LBA_RANGE 0x3 /* LBA Range Type */ -#define NVME_FEAT_TEMPERATURE 0x4 /* Temperature Threshold */ -#define NVME_FEAT_ERROR 0x5 /* Error Recovery */ -#define NVME_FEAT_WRITE_CACHE 0x6 /* Volatile Write Cache */ -#define NVME_FEAT_NQUEUES 0x7 /* Number of Queues */ -#define NVME_FEAT_INTR_COAL 0x8 /* Interrupt Coalescing */ -#define NVME_FEAT_INTR_VECT 0x9 /* Interrupt Vector Configuration */ -#define NVME_FEAT_WRITE_ATOM 0xa /* Write Atomicity */ -#define NVME_FEAT_ASYNC_EVENT 0xb /* Asynchronous Event Configuration */ -#define NVME_FEAT_AUTO_PST 0xc /* Autonomous Power State Transition */ - /* (1.1) */ - -#define NVME_FEAT_PROGRESS 0x80 /* Software Progress Marker */ - -/* Arbitration Feature */ -typedef struct { - uint8_t arb_ab:3; /* Arbitration Burst */ - uint8_t arb_rsvd:5; - uint8_t arb_lpw; /* Low Priority Weight */ - uint8_t arb_mpw; /* Medium Priority Weight */ - uint8_t arb_hpw; /* High Priority Weight */ -} nvme_arbitration_dw11_t; - -/* LBA Range Type Feature */ -typedef struct { - uint32_t lr_num:6; /* Number of LBA ranges */ - uint32_t lr_rsvd:26; -} 
nvme_lba_range_type_dw11_t; - -typedef struct { - uint8_t lr_type; /* Type */ - struct { /* Attributes */ - uint8_t lr_write:1; /* may be overwritten */ - uint8_t lr_hidden:1; /* hidden from OS/EFI/BIOS */ - uint8_t lr_rsvd1:6; - } lr_attr; - uint8_t lr_rsvd2[14]; - uint64_t lr_slba; /* Starting LBA */ - uint64_t lr_nlb; /* Number of Logical Blocks */ - uint8_t lr_guid[16]; /* Unique Identifier */ - uint8_t lr_rsvd3[16]; -} nvme_lba_range_type_t; - -/* Volatile Write Cache Feature */ -typedef union { - struct { - uint32_t wc_wce:1; /* Volatile Write Cache Enable */ - uint32_t wc_rsvd:31; - } b; - uint32_t r; -} nvme_write_cache_t; - -/* Number of Queues */ -typedef union { - struct { - uint16_t nq_nsq; /* Number of Submission Queues */ - uint16_t nq_ncq; /* Number of Completion Queues */ - } b; - uint32_t r; -} nvme_nqueue_t; - - -/* * NVMe Get Log Page */ -#define NVME_LOGPAGE_ERROR 0x1 /* Error Information */ -#define NVME_LOGPAGE_HEALTH 0x2 /* SMART/Health Information */ -#define NVME_LOGPAGE_FWSLOT 0x3 /* Firmware Slot Information */ - typedef union { struct { uint8_t lp_lid; /* Log Page Identifier */ @@ -702,52 +411,6 @@ typedef union { uint32_t r; } nvme_getlogpage_t; -typedef struct { - uint64_t el_count; /* Error Count */ - uint16_t el_sqid; /* Submission Queue ID */ - uint16_t el_cid; /* Command ID */ - nvme_cqe_sf_t el_sf; /* Status Field */ - uint8_t el_byte; /* Parameter Error Location byte */ - uint8_t el_bit:3; /* Parameter Error Location bit */ - uint8_t el_rsvd1:5; - uint64_t el_lba; /* Logical Block Address */ - uint32_t el_nsid; /* Namespace ID */ - uint8_t el_vendor; /* Vendor Specific Information avail */ - uint8_t el_rsvd2[64 - 29]; -} nvme_error_log_entry_t; - -typedef struct { - uint64_t lo; - uint64_t hi; -} nvme_uint128_t; - -typedef struct { - uint8_t hl_crit_warn; /* Critical Warning */ - uint16_t hl_temp; /* Temperature */ - uint8_t hl_avail_spare; /* Available Spare */ - uint8_t hl_avail_spare_thr; /* Available Spare Threshold */ - uint8_t hl_used; /* Percentage Used */ - uint8_t hl_rsvd1[32 - 6]; - nvme_uint128_t hl_data_read; /* Data Units Read */ - nvme_uint128_t hl_data_write; /* Data Units Written */ - nvme_uint128_t hl_host_read; /* Host Read Commands */ - nvme_uint128_t hl_host_write; /* Host Write Commands */ - nvme_uint128_t hl_ctrl_busy; /* Controller Busy Time */ - nvme_uint128_t hl_power_cycles; /* Power Cycles */ - nvme_uint128_t hl_power_on_hours; /* Power On Hours */ - nvme_uint128_t hl_unsafe_shutdn; /* Unsafe Shutdowns */ - nvme_uint128_t hl_media_errors; /* Media Errors */ - nvme_uint128_t hl_errors_logged; /* Number of errors logged */ - uint8_t hl_rsvd2[512 - 192]; -} nvme_health_log_t; - -typedef struct { - uint8_t fw_afi:3; /* Active Firmware Slot */ - uint8_t fw_rsvd1:5; - uint8_t fw_rsvd2[7]; - char fw_frs[7][8]; /* Firmware Revision / Slot */ - uint8_t fw_rsvd3[512 - 64]; -} nvme_fwslot_log_t; #ifdef __cplusplus } diff --git a/usr/src/uts/common/io/nvme/nvme_var.h b/usr/src/uts/common/io/nvme/nvme_var.h index fd6f93af88..651adaec8c 100644 --- a/usr/src/uts/common/io/nvme/nvme_var.h +++ b/usr/src/uts/common/io/nvme/nvme_var.h @@ -27,7 +27,7 @@ */ #ifdef __cplusplus -/* extern "C" { */ +extern "C" { #endif #define NVME_FMA_INIT 0x1 @@ -47,11 +47,18 @@ typedef struct nvme nvme_t; typedef struct nvme_namespace nvme_namespace_t; +typedef struct nvme_minor_state nvme_minor_state_t; typedef struct nvme_dma nvme_dma_t; typedef struct nvme_cmd nvme_cmd_t; typedef struct nvme_qpair nvme_qpair_t; typedef struct nvme_task_arg nvme_task_arg_t; 
+struct nvme_minor_state { + kmutex_t nm_mutex; + boolean_t nm_oexcl; + uint_t nm_ocnt; +}; + struct nvme_dma { ddi_dma_handle_t nd_dmah; ddi_acc_handle_t nd_acch; @@ -69,6 +76,7 @@ struct nvme_cmd { void (*nc_callback)(void *); bd_xfer_t *nc_xfer; boolean_t nc_completed; + boolean_t nc_dontpanic; uint16_t nc_sqid; nvme_dma_t *nc_dma; @@ -137,6 +145,8 @@ struct nvme { boolean_t n_write_cache_present; boolean_t n_write_cache_enabled; int n_error_log_len; + boolean_t n_lba_range_supported; + boolean_t n_auto_pst_supported; int n_nssr_supported; int n_doorbell_stride; @@ -168,9 +178,8 @@ struct nvme { ddi_taskq_t *n_cmd_taskq; - nvme_error_log_entry_t *n_error_log; - nvme_health_log_t *n_health_log; - nvme_fwslot_log_t *n_fwslot_log; + /* state for devctl minor node */ + nvme_minor_state_t n_minor; /* errors detected by driver */ uint32_t n_dma_bind_err; @@ -217,6 +226,7 @@ struct nvme { struct nvme_namespace { nvme_t *ns_nvme; uint8_t ns_eui64[8]; + char ns_name[17]; bd_handle_t ns_bd_hdl; @@ -229,6 +239,9 @@ struct nvme_namespace { nvme_identify_nsid_t *ns_idns; + /* state for attachment point minor node */ + nvme_minor_state_t ns_minor; + /* * If a namespace has no EUI64, we create a devid in * nvme_prepare_devid(). @@ -241,8 +254,9 @@ struct nvme_task_arg { nvme_cmd_t *nt_cmd; }; + #ifdef __cplusplus -/* } */ +} #endif #endif /* _NVME_VAR_H */ diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile index 1c7662c28a..7ce40a658a 100644 --- a/usr/src/uts/common/sys/Makefile +++ b/usr/src/uts/common/sys/Makefile @@ -38,6 +38,7 @@ FILEMODE=644 # neither installed or shipped as part of the product: # cpuid_drv.h: Private interface for cpuid consumers # unix_bb_info.h: Private interface to kcov +# nvme.h Private interface to nvme # i386_HDRS= \ @@ -54,6 +55,7 @@ i386_HDRS= \ firmload.h \ gfx_private.h \ mouse.h \ + nvme.h \ ucode.h sparc_HDRS= \ @@ -422,6 +424,7 @@ CHKHDRS= \ nexusdefs.h \ note.h \ null.h \ + nvme.h \ nvpair.h \ nvpair_impl.h \ objfs.h \ diff --git a/usr/src/uts/common/sys/nvme.h b/usr/src/uts/common/sys/nvme.h new file mode 100644 index 0000000000..916b439f3f --- /dev/null +++ b/usr/src/uts/common/sys/nvme.h @@ -0,0 +1,574 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Nexenta Systems, Inc. 
+ */ + +#ifndef _SYS_NVME_H +#define _SYS_NVME_H + +#include <sys/types.h> + +#ifdef _KERNEL +#include <sys/types32.h> +#else +#include <stdint.h> +#endif + +/* + * Declarations used for communication between nvmeadm(1M) and nvme(7D) + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * NVMe ioctl definitions + */ + +#define NVME_IOC (('N' << 24) | ('V' << 16) | ('M' << 8)) +#define NVME_IOC_IDENTIFY_CTRL (NVME_IOC | 1) +#define NVME_IOC_IDENTIFY_NSID (NVME_IOC | 2) +#define NVME_IOC_CAPABILITIES (NVME_IOC | 3) +#define NVME_IOC_GET_LOGPAGE (NVME_IOC | 4) +#define NVME_IOC_GET_FEATURES (NVME_IOC | 5) +#define NVME_IOC_INTR_CNT (NVME_IOC | 6) +#define NVME_IOC_VERSION (NVME_IOC | 7) +#define NVME_IOC_FORMAT (NVME_IOC | 8) +#define NVME_IOC_DETACH (NVME_IOC | 9) +#define NVME_IOC_ATTACH (NVME_IOC | 10) +#define NVME_IOC_MAX NVME_IOC_ATTACH + +#define IS_NVME_IOC(x) ((x) > NVME_IOC && (x) <= NVME_IOC_MAX) +#define NVME_IOC_CMD(x) ((x) & 0xff) + +typedef struct { + size_t n_len; + uintptr_t n_buf; + uint64_t n_arg; +} nvme_ioctl_t; + +#ifdef _KERNEL +typedef struct { + size32_t n_len; + uintptr32_t n_buf; + uint64_t n_arg; +} nvme_ioctl32_t; +#endif + +/* + * NVMe capabilities + */ +typedef struct { + uint32_t mpsmax; /* Memory Page Size Maximum */ + uint32_t mpsmin; /* Memory Page Size Minimum */ +} nvme_capabilities_t; + +/* + * NVMe version + */ +typedef struct { + uint16_t v_minor; + uint16_t v_major; +} nvme_version_t; + +#define NVME_VERSION_ATLEAST(v, maj, min) \ + (((v)->v_major) > (maj) || \ + ((v)->v_major == (maj) && (v)->v_minor >= (min))) + +#define NVME_VERSION_HIGHER(v, maj, min) \ + (((v)->v_major) > (maj) || \ + ((v)->v_major == (maj) && (v)->v_minor > (min))) + + +#pragma pack(1) + +/* + * NVMe Identify data structures + */ + +#define NVME_IDENTIFY_BUFSIZE 4096 /* buffer size for Identify */ + +/* NVMe Queue Entry Size bitfield */ +typedef struct { + uint8_t qes_min:4; /* minimum entry size */ + uint8_t qes_max:4; /* maximum entry size */ +} nvme_idctl_qes_t; + +/* NVMe Power State Descriptor */ +typedef struct { + uint16_t psd_mp; /* Maximum Power */ + uint8_t psd_rsvd1; + uint8_t psd_mps:1; /* Max Power Scale (1.1) */ + uint8_t psd_nops:1; /* Non-Operational State (1.1) */ + uint8_t psd_rsvd2:6; + uint32_t psd_enlat; /* Entry Latency */ + uint32_t psd_exlat; /* Exit Latency */ + uint8_t psd_rrt:5; /* Relative Read Throughput */ + uint8_t psd_rsvd3:3; + uint8_t psd_rrl:5; /* Relative Read Latency */ + uint8_t psd_rsvd4:3; + uint8_t psd_rwt:5; /* Relative Write Throughput */ + uint8_t psd_rsvd5:3; + uint8_t psd_rwl:5; /* Relative Write Latency */ + uint8_t psd_rsvd6:3; + uint8_t psd_rsvd7[16]; +} nvme_idctl_psd_t; + +/* NVMe Identify Controller Data Structure */ +typedef struct { + /* Controller Capabilities & Features */ + uint16_t id_vid; /* PCI vendor ID */ + uint16_t id_ssvid; /* PCI subsystem vendor ID */ + char id_serial[20]; /* Serial Number */ + char id_model[40]; /* Model Number */ + char id_fwrev[8]; /* Firmware Revision */ + uint8_t id_rab; /* Recommended Arbitration Burst */ + uint8_t id_oui[3]; /* vendor IEEE OUI */ + struct { /* Multi-Interface Capabilities */ + uint8_t m_multi_pci:1; /* HW has multiple PCIe interfaces */ + uint8_t m_multi_ctrl:1; /* HW has multiple controllers (1.1) */ + uint8_t m_sr_iov:1; /* controller is SR-IOV virt fn (1.1) */ + uint8_t m_rsvd:5; + } id_mic; + uint8_t id_mdts; /* Maximum Data Transfer Size */ + uint16_t id_cntlid; /* Unique Controller Identifier (1.1) */ + uint8_t id_rsvd_cc[256 - 80]; + + /* Admin Command Set 
Attributes */ + struct { /* Optional Admin Command Support */ + uint16_t oa_security:1; /* Security Send & Receive */ + uint16_t oa_format:1; /* Format NVM */ + uint16_t oa_firmware:1; /* Firmware Activate & Download */ + uint16_t oa_rsvd:13; + } id_oacs; + uint8_t id_acl; /* Abort Command Limit */ + uint8_t id_aerl; /* Asynchronous Event Request Limit */ + struct { /* Firmware Updates */ + uint8_t fw_readonly:1; /* Slot 1 is Read-Only */ + uint8_t fw_nslot:3; /* number of firmware slots */ + uint8_t fw_rsvd:4; + } id_frmw; + struct { /* Log Page Attributes */ + uint8_t lp_smart:1; /* SMART/Health information per NS */ + uint8_t lp_rsvd:7; + } id_lpa; + uint8_t id_elpe; /* Error Log Page Entries */ + uint8_t id_npss; /* Number of Power States */ + struct { /* Admin Vendor Specific Command Conf */ + uint8_t av_spec:1; /* use format from spec */ + uint8_t av_rsvd:7; + } id_avscc; + struct { /* Autonomous Power State Trans (1.1) */ + uint8_t ap_sup:1; /* APST supported (1.1) */ + uint8_t ap_rsvd:7; + } id_apsta; + uint8_t id_rsvd_ac[256 - 10]; + + /* NVM Command Set Attributes */ + nvme_idctl_qes_t id_sqes; /* Submission Queue Entry Size */ + nvme_idctl_qes_t id_cqes; /* Completion Queue Entry Size */ + uint16_t id_rsvd_nc_1; + uint32_t id_nn; /* Number of Namespaces */ + struct { /* Optional NVM Command Support */ + uint16_t on_compare:1; /* Compare */ + uint16_t on_wr_unc:1; /* Write Uncorrectable */ + uint16_t on_dset_mgmt:1; /* Dataset Management */ + uint16_t on_wr_zero:1; /* Write Zeros (1.1) */ + uint16_t on_save:1; /* Save/Select in Get/Set Feat (1.1) */ + uint16_t on_reserve:1; /* Reservations (1.1) */ + uint16_t on_rsvd:10; + } id_oncs; + struct { /* Fused Operation Support */ + uint16_t f_cmp_wr:1; /* Compare and Write */ + uint16_t f_rsvd:15; + } id_fuses; + struct { /* Format NVM Attributes */ + uint8_t fn_format:1; /* Format applies to all NS */ + uint8_t fn_sec_erase:1; /* Secure Erase applies to all NS */ + uint8_t fn_crypt_erase:1; /* Cryptographic Erase supported */ + uint8_t fn_rsvd:5; + } id_fna; + struct { /* Volatile Write Cache */ + uint8_t vwc_present:1; /* Volatile Write Cache present */ + uint8_t rsvd:7; + } id_vwc; + uint16_t id_awun; /* Atomic Write Unit Normal */ + uint16_t id_awupf; /* Atomic Write Unit Power Fail */ + struct { /* NVM Vendor Specific Command Conf */ + uint8_t nv_spec:1; /* use format from spec */ + uint8_t nv_rsvd:7; + } id_nvscc; + uint8_t id_rsvd_nc_2; + uint16_t id_acwu; /* Atomic Compare & Write Unit (1.1) */ + uint16_t id_rsvd_nc_3; + struct { /* SGL Support (1.1) */ + uint16_t sgl_sup:1; /* SGL Supported in NVM cmds (1.1) */ + uint16_t sgl_rsvd1:15; + uint16_t sgl_bucket:1; /* SGL Bit Bucket supported (1.1) */ + uint16_t sgl_rsvd2:15; + } id_sgls; + uint8_t id_rsvd_nc_4[192 - 28]; + + /* I/O Command Set Attributes */ + uint8_t id_rsvd_ioc[1344]; + + /* Power State Descriptors */ + nvme_idctl_psd_t id_psd[32]; + + /* Vendor Specific */ + uint8_t id_vs[1024]; +} nvme_identify_ctrl_t; + +/* NVMe Identify Namespace LBA Format */ +typedef struct { + uint16_t lbaf_ms; /* Metadata Size */ + uint8_t lbaf_lbads; /* LBA Data Size */ + uint8_t lbaf_rp:2; /* Relative Performance */ + uint8_t lbaf_rsvd1:6; +} nvme_idns_lbaf_t; + +/* NVMe Identify Namespace Data Structure */ +typedef struct { + uint64_t id_nsize; /* Namespace Size */ + uint64_t id_ncap; /* Namespace Capacity */ + uint64_t id_nuse; /* Namespace Utilization */ + struct { /* Namespace Features */ + uint8_t f_thin:1; /* Thin Provisioning */ + uint8_t f_rsvd:7; + } id_nsfeat; + uint8_t 
id_nlbaf; /* Number of LBA formats */ + struct { /* Formatted LBA size */ + uint8_t lba_format:4; /* LBA format */ + uint8_t lba_extlba:1; /* extended LBA (includes metadata) */ + uint8_t lba_rsvd:3; + } id_flbas; + struct { /* Metadata Capabilities */ + uint8_t mc_extlba:1; /* extended LBA transfers */ + uint8_t mc_separate:1; /* separate metadata transfers */ + uint8_t mc_rsvd:6; + } id_mc; + struct { /* Data Protection Capabilities */ + uint8_t dp_type1:1; /* Protection Information Type 1 */ + uint8_t dp_type2:1; /* Protection Information Type 2 */ + uint8_t dp_type3:1; /* Protection Information Type 3 */ + uint8_t dp_first:1; /* first 8 bytes of metadata */ + uint8_t dp_last:1; /* last 8 bytes of metadata */ + uint8_t dp_rsvd:3; + } id_dpc; + struct { /* Data Protection Settings */ + uint8_t dp_pinfo:3; /* Protection Information enabled */ + uint8_t dp_first:1; /* first 8 bytes of metadata */ + uint8_t dp_rsvd:4; + } id_dps; + struct { /* NS Multi-Path/Sharing Cap (1.1) */ + uint8_t nm_shared:1; /* NS is shared (1.1) */ + uint8_t nm_rsvd:7; + } id_nmic; + struct { /* Reservation Capabilities (1.1) */ + uint8_t rc_persist:1; /* Persist Through Power Loss (1.1) */ + uint8_t rc_wr_excl:1; /* Write Exclusive (1.1) */ + uint8_t rc_excl:1; /* Exclusive Access (1.1) */ + uint8_t rc_wr_excl_r:1; /* Wr Excl - Registrants Only (1.1) */ + uint8_t rc_excl_r:1; /* Excl Acc - Registrants Only (1.1) */ + uint8_t rc_wr_excl_a:1; /* Wr Excl - All Registrants (1.1) */ + uint8_t rc_excl_a:1; /* Excl Acc - All Registrants (1.1) */ + uint8_t rc_rsvd:1; + } id_rescap; + uint8_t id_rsvd1[120 - 32]; + uint8_t id_eui64[8]; /* IEEE Extended Unique Id (1.1) */ + nvme_idns_lbaf_t id_lbaf[16]; /* LBA Formats */ + + uint8_t id_rsvd2[192]; + + uint8_t id_vs[3712]; /* Vendor Specific */ +} nvme_identify_nsid_t; + + +/* + * NVMe completion queue entry status field + */ +typedef struct { + uint16_t sf_p:1; /* Phase Tag */ + uint16_t sf_sc:8; /* Status Code */ + uint16_t sf_sct:3; /* Status Code Type */ + uint16_t sf_rsvd2:2; + uint16_t sf_m:1; /* More */ + uint16_t sf_dnr:1; /* Do Not Retry */ +} nvme_cqe_sf_t; + + +/* + * NVMe Get Log Page + */ +#define NVME_LOGPAGE_ERROR 0x1 /* Error Information */ +#define NVME_LOGPAGE_HEALTH 0x2 /* SMART/Health Information */ +#define NVME_LOGPAGE_FWSLOT 0x3 /* Firmware Slot Information */ + +typedef struct { + uint64_t el_count; /* Error Count */ + uint16_t el_sqid; /* Submission Queue ID */ + uint16_t el_cid; /* Command ID */ + nvme_cqe_sf_t el_sf; /* Status Field */ + uint8_t el_byte; /* Parameter Error Location byte */ + uint8_t el_bit:3; /* Parameter Error Location bit */ + uint8_t el_rsvd1:5; + uint64_t el_lba; /* Logical Block Address */ + uint32_t el_nsid; /* Namespace ID */ + uint8_t el_vendor; /* Vendor Specific Information avail */ + uint8_t el_rsvd2[64 - 29]; +} nvme_error_log_entry_t; + +typedef struct { + uint64_t lo; + uint64_t hi; +} nvme_uint128_t; + +typedef struct { + struct { /* Critical Warning */ + uint8_t cw_avail:1; /* available space too low */ + uint8_t cw_temp:1; /* temperature too high */ + uint8_t cw_reliab:1; /* degraded reliability */ + uint8_t cw_readonly:1; /* media is read-only */ + uint8_t cw_volatile:1; /* volatile memory backup failed */ + uint8_t cw_rsvd:3; + } hl_crit_warn; + uint16_t hl_temp; /* Temperature */ + uint8_t hl_avail_spare; /* Available Spare */ + uint8_t hl_avail_spare_thr; /* Available Spare Threshold */ + uint8_t hl_used; /* Percentage Used */ + uint8_t hl_rsvd1[32 - 6]; + nvme_uint128_t hl_data_read; /* Data Units Read */ + 
nvme_uint128_t hl_data_write; /* Data Units Written */ + nvme_uint128_t hl_host_read; /* Host Read Commands */ + nvme_uint128_t hl_host_write; /* Host Write Commands */ + nvme_uint128_t hl_ctrl_busy; /* Controller Busy Time */ + nvme_uint128_t hl_power_cycles; /* Power Cycles */ + nvme_uint128_t hl_power_on_hours; /* Power On Hours */ + nvme_uint128_t hl_unsafe_shutdn; /* Unsafe Shutdowns */ + nvme_uint128_t hl_media_errors; /* Media Errors */ + nvme_uint128_t hl_errors_logged; /* Number of errors logged */ + uint8_t hl_rsvd2[512 - 192]; +} nvme_health_log_t; + +typedef struct { + uint8_t fw_afi:3; /* Active Firmware Slot */ + uint8_t fw_rsvd1:5; + uint8_t fw_rsvd2[7]; + char fw_frs[7][8]; /* Firmware Revision / Slot */ + uint8_t fw_rsvd3[512 - 64]; +} nvme_fwslot_log_t; + + +/* + * NVMe Format NVM + */ +#define NVME_FRMT_SES_NONE 0 +#define NVME_FRMT_SES_USER 1 +#define NVME_FRMT_SES_CRYPTO 2 +#define NVME_FRMT_MAX_SES 2 + +#define NVME_FRMT_MAX_LBAF 15 + +typedef union { + struct { + uint32_t fm_lbaf:4; /* LBA Format */ + uint32_t fm_ms:1; /* Metadata Settings */ + uint32_t fm_pi:3; /* Protection Information */ + uint32_t fm_pil:1; /* Prot. Information Location */ + uint32_t fm_ses:3; /* Secure Erase Settings */ + uint32_t fm_resvd:20; + } b; + uint32_t r; +} nvme_format_nvm_t; + + +/* + * NVMe Get / Set Features + */ +#define NVME_FEAT_ARBITRATION 0x1 /* Command Arbitration */ +#define NVME_FEAT_POWER_MGMT 0x2 /* Power Management */ +#define NVME_FEAT_LBA_RANGE 0x3 /* LBA Range Type */ +#define NVME_FEAT_TEMPERATURE 0x4 /* Temperature Threshold */ +#define NVME_FEAT_ERROR 0x5 /* Error Recovery */ +#define NVME_FEAT_WRITE_CACHE 0x6 /* Volatile Write Cache */ +#define NVME_FEAT_NQUEUES 0x7 /* Number of Queues */ +#define NVME_FEAT_INTR_COAL 0x8 /* Interrupt Coalescing */ +#define NVME_FEAT_INTR_VECT 0x9 /* Interrupt Vector Configuration */ +#define NVME_FEAT_WRITE_ATOM 0xa /* Write Atomicity */ +#define NVME_FEAT_ASYNC_EVENT 0xb /* Asynchronous Event Configuration */ +#define NVME_FEAT_AUTO_PST 0xc /* Autonomous Power State Transition */ + /* (1.1) */ + +#define NVME_FEAT_PROGRESS 0x80 /* Software Progress Marker */ + +/* Arbitration Feature */ +typedef union { + struct { + uint8_t arb_ab:3; /* Arbitration Burst */ + uint8_t arb_rsvd:5; + uint8_t arb_lpw; /* Low Priority Weight */ + uint8_t arb_mpw; /* Medium Priority Weight */ + uint8_t arb_hpw; /* High Priority Weight */ + } b; + uint32_t r; +} nvme_arbitration_t; + +/* Power Management Feature */ +typedef union { + struct { + uint32_t pm_ps:5; /* Power State */ + uint32_t pm_rsvd:27; + } b; + uint32_t r; +} nvme_power_mgmt_t; + +/* LBA Range Type Feature */ +typedef union { + struct { + uint32_t lr_num:6; /* Number of LBA ranges */ + uint32_t lr_rsvd:26; + } b; + uint32_t r; +} nvme_lba_range_type_t; + +typedef struct { + uint8_t lr_type; /* Type */ + struct { /* Attributes */ + uint8_t lr_write:1; /* may be overwritten */ + uint8_t lr_hidden:1; /* hidden from OS/EFI/BIOS */ + uint8_t lr_rsvd1:6; + } lr_attr; + uint8_t lr_rsvd2[14]; + uint64_t lr_slba; /* Starting LBA */ + uint64_t lr_nlb; /* Number of Logical Blocks */ + uint8_t lr_guid[16]; /* Unique Identifier */ + uint8_t lr_rsvd3[16]; +} nvme_lba_range_t; + +#define NVME_LBA_RANGE_BUFSIZE 4096 + +/* Temperature Threshold Feature */ +typedef union { + struct { + uint16_t tt_tmpth; /* Temperature Threshold */ + uint16_t tt_rsvd; + } b; + uint32_t r; +} nvme_temp_threshold_t; + +/* Error Recovery Feature */ +typedef union { + struct { + uint16_t er_tler; /* Time-Limited Error 
Recovery */
+ uint16_t er_rsvd;
+ } b;
+ uint32_t r;
+} nvme_error_recovery_t;
+
+/* Volatile Write Cache Feature */
+typedef union {
+ struct {
+ uint32_t wc_wce:1; /* Volatile Write Cache Enable */
+ uint32_t wc_rsvd:31;
+ } b;
+ uint32_t r;
+} nvme_write_cache_t;
+
+/* Number of Queues Feature */
+typedef union {
+ struct {
+ uint16_t nq_nsq; /* Number of Submission Queues */
+ uint16_t nq_ncq; /* Number of Completion Queues */
+ } b;
+ uint32_t r;
+} nvme_nqueues_t;
+
+/* Interrupt Coalescing Feature */
+typedef union {
+ struct {
+ uint8_t ic_thr; /* Aggregation Threshold */
+ uint8_t ic_time; /* Aggregation Time */
+ uint16_t ic_rsvd;
+ } b;
+ uint32_t r;
+} nvme_intr_coal_t;
+
+/* Interrupt Configuration Features */
+typedef union {
+ struct {
+ uint16_t iv_iv; /* Interrupt Vector */
+ uint16_t iv_cd:1; /* Coalescing Disable */
+ uint16_t iv_rsvd:15;
+ } b;
+ uint32_t r;
+} nvme_intr_vect_t;
+
+/* Write Atomicity Feature */
+typedef union {
+ struct {
+ uint32_t wa_dn:1; /* Disable Normal */
+ uint32_t wa_rsvd:31;
+ } b;
+ uint32_t r;
+} nvme_write_atomicity_t;
+
+/* Asynchronous Event Configuration Feature */
+typedef union {
+ struct {
+ uint8_t aec_avail:1; /* available space too low */
+ uint8_t aec_temp:1; /* temperature too high */
+ uint8_t aec_reliab:1; /* degraded reliability */
+ uint8_t aec_readonly:1; /* media is read-only */
+ uint8_t aec_volatile:1; /* volatile memory backup failed */
+ uint8_t aec_rsvd1:3;
+ uint8_t aec_rsvd2[3];
+ } b;
+ uint32_t r;
+} nvme_async_event_conf_t;
+
+/* Autonomous Power State Transition Feature (1.1) */
+typedef union {
+ struct {
+ uint8_t apst_apste:1; /* APST enabled */
+ uint8_t apst_rsvd:7;
+ } b;
+ uint8_t r;
+} nvme_auto_power_state_trans_t;
+
+typedef struct {
+ uint32_t apst_rsvd1:3;
+ uint32_t apst_itps:5; /* Idle Transition Power State */
+ uint32_t apst_itpt:24; /* Idle Time Prior to Transition */
+ uint32_t apst_rsvd2;
+} nvme_auto_power_state_t;
+
+#define NVME_AUTO_PST_BUFSIZE 256
+
+/* Software Progress Marker Feature */
+typedef union {
+ struct {
+ uint8_t spm_pbslc; /* Pre-Boot Software Load Count */
+ uint8_t spm_rsvd[3];
+ } b;
+ uint32_t r;
+} nvme_software_progress_marker_t;
+
+#pragma pack() /* pack(1) */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_NVME_H */
diff --git a/usr/src/uts/common/sys/sunddi.h b/usr/src/uts/common/sys/sunddi.h
index d5e52dbbfc..1d94c8fd2c 100644
--- a/usr/src/uts/common/sys/sunddi.h
+++ b/usr/src/uts/common/sys/sunddi.h
@@ -212,6 +212,8 @@ extern "C" {
 #define DDI_NT_NEXUS "ddi_ctl:devctl" /* nexus drivers */
+#define DDI_NT_NVME_NEXUS "ddi_ctl:devctl:nvme" /* nexus drivers */
+
 #define DDI_NT_SCSI_NEXUS "ddi_ctl:devctl:scsi" /* nexus drivers */
 #define DDI_NT_SATA_NEXUS "ddi_ctl:devctl:sata" /* nexus drivers */
@@ -220,6 +222,9 @@ extern "C" {
 #define DDI_NT_ATTACHMENT_POINT "ddi_ctl:attachment_point" /* attachment pt */
+#define DDI_NT_NVME_ATTACHMENT_POINT "ddi_ctl:attachment_point:nvme"
+ /* nvme attachment pt */
+
 #define DDI_NT_SCSI_ATTACHMENT_POINT "ddi_ctl:attachment_point:scsi" /* scsi attachment pt */
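
For context, here is a minimal userland sketch of how a consumer in the spirit of nvmeadm(1M) might drive the ioctls declared in the new <sys/nvme.h>. It is not part of this changeset: the controller device path is a made-up example, and the assumption that the driver copies its result into the buffer described by n_buf/n_len lives in nvme.c rather than in anything shown above.

/*
 * Illustrative sketch only -- not part of this patch.  Assumes the nvme(7D)
 * controller minor node can be opened at the (hypothetical) path below and
 * that the driver fills the caller-supplied buffer (n_buf, up to n_len
 * bytes) for NVME_IOC_VERSION and NVME_IOC_IDENTIFY_CTRL.
 */
#include <sys/nvme.h>

#include <fcntl.h>
#include <stdio.h>
#include <stropts.h>
#include <unistd.h>

int
main(void)
{
	nvme_ioctl_t nioc;
	nvme_version_t vs;
	nvme_identify_ctrl_t idctl;
	int fd;

	/* Hypothetical controller minor node path; adjust for the system. */
	fd = open("/devices/pci@0,0/pci8086,2f04@2/pci8086,3703@0:devctl",
	    O_RDONLY);
	if (fd == -1) {
		perror("open");
		return (1);
	}

	/* Ask the driver which NVMe spec version the controller reports. */
	nioc.n_len = sizeof (vs);
	nioc.n_buf = (uintptr_t)&vs;
	nioc.n_arg = 0;
	if (ioctl(fd, NVME_IOC_VERSION, &nioc) == 0) {
		(void) printf("NVMe %u.%u\n", vs.v_major, vs.v_minor);
		if (NVME_VERSION_ATLEAST(&vs, 1, 1))
			(void) printf("1.1 optional fields may be valid\n");
	}

	/* Fetch the 4096-byte Identify Controller data structure. */
	nioc.n_len = NVME_IDENTIFY_BUFSIZE;
	nioc.n_buf = (uintptr_t)&idctl;
	nioc.n_arg = 0;
	if (ioctl(fd, NVME_IOC_IDENTIFY_CTRL, &nioc) == 0) {
		(void) printf("model %.40s, serial %.20s\n",
		    idctl.id_model, idctl.id_serial);
	}

	(void) close(fd);
	return (0);
}

The same nvme_ioctl_t carries every command; which of n_len, n_buf, and n_arg is meaningful depends on the individual NVME_IOC_* request, so other commands (for example NVME_IOC_GET_LOGPAGE or NVME_IOC_FORMAT) would be used analogously but with request-specific arguments.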