author      Paul Winder <Paul.Winder@wdc.com>        2019-06-05 14:02:04 +0100
committer   Dan McDonald <danmcd@joyent.com>         2019-06-20 10:02:46 -0400
commit      0999c1123c1ab769df080ccc5f1626d50663e7a8
tree        78ca9f53ced438d9d1a49145c70dbb32979e33d6
parent      c89583d1669aac784ae44a473d81f8c5f564c728
download    illumos-joyent-0999c1123c1ab769df080ccc5f1626d50663e7a8.tar.gz
11202 Allow the number of NVMe submission and completion queues to be different
11228 nvme may queue more submissions than allowed
11229 nvme_get_logpage() can allocate a too small buffer to receive logpage data
11230 Panic in nvme_fill_prp() because of miscalculation of the number of PRPs per page
11231 nvme in polled mode ignores the command call back
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Reviewed by: Gergő Mihály Doma <domag02@gmail.com>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Approved by: Dan McDonald <danmcd@joyent.com>
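
Bug 11228 above comes down to queue-full accounting: an NVMe queue with N entries can only hold N - 1 outstanding commands, because a head index equal to the tail index must mean "empty". The fix seeds each submission semaphore with nentry - 1 (see sema_init() in nvme_alloc_qpair() in the diff below). A minimal user-space sketch of that invariant, using made-up names and a plain counter in place of the kernel semaphore:

#include <assert.h>
#include <stdio.h>

/*
 * Sketch of the 11228 fix: a ring with QLEN slots may hold at most
 * QLEN - 1 in-flight commands, since head == tail is reserved to mean
 * "empty". Seeding the counter with QLEN (as before the fix) would let
 * one submission too many through and make a full queue look empty.
 */
#define QLEN    16

int
main(void)
{
    unsigned int sema = QLEN - 1;   /* was QLEN before the fix */
    unsigned int head = 0, tail = 0, inflight = 0;

    /* Submit until the semaphore runs dry. */
    while (sema > 0) {
        sema--;                     /* stands in for sema_p() */
        tail = (tail + 1) % QLEN;
        inflight++;
    }

    /* A full queue is still distinguishable from an empty one. */
    assert(tail != head);
    printf("%u commands in flight on a %u-entry queue\n",
        inflight, (unsigned int)QLEN);
    return (0);
}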
-rw-r--r--   usr/src/man/man7d/nvme.7d               33
-rw-r--r--   usr/src/uts/common/io/nvme/nvme.c      460
-rw-r--r--   usr/src/uts/common/io/nvme/nvme.conf    26
-rw-r--r--   usr/src/uts/common/io/nvme/nvme_var.h   40
4 files changed, 459 insertions, 100 deletions
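
The heart of 11202 is in nvme_alloc_qpair() below: when submission queues outnumber completion queues, the surplus submission queues are spread round-robin across the shared I/O completion queues, with index 0 reserved for the admin pair. A stand-alone sketch of exactly that mapping, using example queue counts rather than driver defaults:

#include <stdio.h>

/*
 * The cq_idx computation from nvme_alloc_qpair(): index 0 is the admin
 * queue pair with its private completion queue; I/O submission queues
 * 1..n map round-robin onto the shared I/O completion queues
 * 1..(n_cq_count - 1).
 */
int
main(void)
{
    unsigned int n_cq_count = 5;    /* 1 admin + 4 I/O cqs (example) */
    unsigned int n_ioq_count = 8;   /* 8 I/O submission queues */
    unsigned int idx;

    for (idx = 0; idx <= n_ioq_count; idx++) {
        unsigned int cq_idx = idx == 0 ?
            0 : 1 + (idx - 1) % (n_cq_count - 1);
        printf("qpair %2u -> completion queue %u\n", idx, cq_idx);
    }
    return (0);
}

With these example counts, qpairs 1-4 get completion queues 1-4 and qpairs 5-8 wrap around to share them.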
diff --git a/usr/src/man/man7d/nvme.7d b/usr/src/man/man7d/nvme.7d
index 83f72f346a..2819a788e5 100644
--- a/usr/src/man/man7d/nvme.7d
+++ b/usr/src/man/man7d/nvme.7d
@@ -11,6 +11,7 @@
 .\"
 .\" Copyright 2016 Nexenta Systems, Inc. All rights reserved.
 .\" Copyright (c) 2018, Joyent, Inc.
+.\" Copyright 2019, Western Digital Corporation
 .\"
 .Dd July 31, 2018
 .Dt NVME 7D
@@ -58,9 +59,17 @@ status.
 .It Va admin-queue-len
 This is the number of entries in the admin command queue.
 Legal values are between 16 and 4096, the default value is 256.
-.It Va io-queue-len
-This is the number of entries in each I/O command queue.
+.It Va io-squeue-len
+This is the number of entries in each I/O submission queue.
 Legal values are between 16 and 65536, the default value is 1024.
+.It Va io-cqueue-len
+This is the number of entries in each I/O completion queue.
+Legal values are between 16 and 65536, the default value is 2048.
+When the number of submission and completion queues is the same,
+the queue lengths will both be set to the lesser of
+.Va io-squeue-len
+and
+.Va io-cqueue-len .
 .It Va async-event-limit
 This is the maximum number of asynchronous event requests issued by
 the driver.
@@ -78,6 +87,26 @@ This is the minimum physical block size that is reported to
 This value must be a power of 2 greater than or equal to 512.
 If the device reports a best block size larger than what is specified
 here the driver will ignore the value specified here.
+.It Va max-submission-queues
+This is the maximum number of submission queues the driver will create per
+device.
+Legal values are between 1 and 65535, the default is to
+match the number for
+.Em max-completion-queues .
+The number of queues created will not be allowed to exceed the
+drive's hardware limitation.
+If the number of submission queues is larger than
+.Em max-completion-queues
+the completion queues will be shared across the submission
+queues.
+.It Va max-completion-queues
+This is the maximum number of completion queues the driver will create per
+device.
+Legal values are between 1 and 65535, the default is to match
+the number of interrupt vectors allocated to the drive.
+The number of queues created will not exceed the number of interrupt vectors,
+.Em max-submission-queues ,
+or the drive's hardware limitation.
 .El
 .
 .Sh FILES
diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c
index cfe314b03e..8a3af7d7d7 100644
--- a/usr/src/uts/common/io/nvme/nvme.c
+++ b/usr/src/uts/common/io/nvme/nvme.c
@@ -14,6 +14,7 @@
  * Copyright 2016 Tegile Systems, Inc. All rights reserved.
  * Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
  * Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Western Digital Corporation.
  */

 /*
@@ -56,8 +57,8 @@
  * From the hardware perspective both queues of a queue pair are independent,
  * but they share some driver state: the command array (holding pointers to
  * commands currently being processed by the hardware) and the active command
- * counter. Access to a queue pair and the shared state is protected by
- * nq_mutex.
+ * counter. Access to a submission queue and the shared state is protected by
+ * nq_mutex; the completion queue is protected by ncq_mutex.
  *
  * When a command is submitted to a queue pair the active command counter is
  * incremented and a pointer to the command is stored in the command array. The
@@ -161,11 +162,12 @@
  *
  * Locking:
  *
- * Each queue pair has its own nq_mutex, which must be held when accessing the
- * associated queue registers or the shared state of the queue pair. Callers of
- * nvme_unqueue_cmd() must make sure that nq_mutex is held, while
- * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of this
- * themselves.
+ * Each queue pair has a nq_mutex and ncq_mutex. The nq_mutex must be held
+ * when accessing shared state and submission queue registers; ncq_mutex
+ * is held when accessing completion queue state and registers.
+ * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while
+ * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
+ * mutexes themselves.
  *
  * Each command also has its own nc_mutex, which is associated with the
  * condition variable nc_cv. It is only used on admin commands which are run
@@ -180,6 +182,14 @@
  * nvme_abort_cmd() to prevent the command from completing while the abort is in
  * progress.
  *
+ * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be
+ * acquired first. More than one nq_mutex is never held by a single thread.
+ * The ncq_mutex is only held by nvme_retrieve_cmd() and
+ * nvme_process_iocq(). nvme_process_iocq() is only called from the
+ * interrupt thread and nvme_retrieve_cmd() during polled I/O, so the
+ * mutex is non-contentious but is required for implementation completeness
+ * and safety.
+ *
  * Each minor node has its own nm_mutex, which protects the open count nm_ocnt
  * and exclusive-open flag nm_oexcl.
  *
@@ -200,13 +210,18 @@
  * - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
  *   specific command status as a fatal error leading device faulting
  * - admin-queue-len: the maximum length of the admin queue (16-4096)
- * - io-queue-len: the maximum length of the I/O queues (16-65536)
+ * - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
+ * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
  * - async-event-limit: the maximum number of asynchronous event requests to be
  *   posted by the driver
  * - volatile-write-cache-enable: can be set to 0 to disable the volatile write
  *   cache
  * - min-phys-block-size: the minimum physical block size to report to blkdev,
  *   which is among other things the basis for ZFS vdev ashift
+ * - max-submission-queues: the maximum number of I/O submission queues.
+ * - max-completion-queues: the maximum number of I/O completion queues,
+ *   can be less than max-submission-queues, in which case the completion
+ *   queues are shared.
 *
 *
 * TODO:
@@ -334,7 +349,7 @@ static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t,
 static int nvme_get_features(nvme_t *, boolean_t, uint32_t, uint8_t,
     uint32_t *, void **, size_t *);
 static int nvme_write_cache_set(nvme_t *, boolean_t);
-static int nvme_set_nqueues(nvme_t *, uint16_t *);
+static int nvme_set_nqueues(nvme_t *);

 static void nvme_free_dma(nvme_dma_t *);
 static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
@@ -342,7 +357,7 @@ static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
 static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
     nvme_dma_t **);
 static void nvme_free_qpair(nvme_qpair_t *);
-static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, int);
+static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t);
 static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);

 static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
@@ -762,8 +777,6 @@ nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,

     len = roundup(len, nvme->n_pagesize);

-    q_dma_attr.dma_attr_minxfer = len;
-
     if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma) != DDI_SUCCESS) {
         dev_err(nvme->n_dip, CE_WARN,
@@ -789,6 +802,17 @@ fail:
 }

 static void
+nvme_free_cq(nvme_cq_t *cq)
+{
+    mutex_destroy(&cq->ncq_mutex);
+
+    if (cq->ncq_dma != NULL)
+        nvme_free_dma(cq->ncq_dma);
+
+    kmem_free(cq, sizeof (*cq));
+}
+
+static void
 nvme_free_qpair(nvme_qpair_t *qp)
 {
     int i;
@@ -798,8 +822,6 @@ nvme_free_qpair(nvme_qpair_t *qp)

     if (qp->nq_sqdma != NULL)
         nvme_free_dma(qp->nq_sqdma);
-    if (qp->nq_cqdma != NULL)
-        nvme_free_dma(qp->nq_cqdma);

     if (qp->nq_active_cmds > 0)
         for (i = 0; i != qp->nq_nentry; i++)
@@ -812,30 +834,122 @@ nvme_free_qpair(nvme_qpair_t *qp)

     kmem_free(qp, sizeof (nvme_qpair_t));
 }

+/*
+ * Destroy the pre-allocated cq array, but only free individual completion
+ * queues from the given starting index.
+ */
+static void
+nvme_destroy_cq_array(nvme_t *nvme, uint_t start)
+{
+    uint_t i;
+
+    for (i = start; i < nvme->n_cq_count; i++)
+        if (nvme->n_cq[i] != NULL)
+            nvme_free_cq(nvme->n_cq[i]);
+
+    kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count);
+}
+
+static int
+nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx)
+{
+    nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP);
+
+    mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER,
+        DDI_INTR_PRI(nvme->n_intr_pri));
+
+    if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
+        DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS)
+        goto fail;
+
+    cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp;
+    cq->ncq_nentry = nentry;
+    cq->ncq_id = idx;
+    cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx);
+
+    *cqp = cq;
+    return (DDI_SUCCESS);
+
+fail:
+    nvme_free_cq(cq);
+    *cqp = NULL;
+
+    return (DDI_FAILURE);
+}
+
+/*
+ * Create the n_cq array big enough to hold "ncq" completion queues.
+ * If the array already exists it will be re-sized (but only larger).
+ * The admin queue is included in this array, which boosts the
+ * max number of entries to UINT16_MAX + 1.
+ */
+static int
+nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry)
+{
+    nvme_cq_t **cq;
+    uint_t i, cq_count;
+
+    ASSERT3U(ncq, >, nvme->n_cq_count);
+
+    cq = nvme->n_cq;
+    cq_count = nvme->n_cq_count;
+
+    nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP);
+    nvme->n_cq_count = ncq;
+
+    for (i = 0; i < cq_count; i++)
+        nvme->n_cq[i] = cq[i];
+
+    for (; i < nvme->n_cq_count; i++)
+        if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i) !=
+            DDI_SUCCESS)
+            goto fail;
+
+    if (cq != NULL)
+        kmem_free(cq, sizeof (*cq) * cq_count);
+
+    return (DDI_SUCCESS);
+
+fail:
+    nvme_destroy_cq_array(nvme, cq_count);
+    /*
+     * Restore the original array
+     */
+    nvme->n_cq_count = cq_count;
+    nvme->n_cq = cq;
+
+    return (DDI_FAILURE);
+}
+
 static int
 nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
-    int idx)
+    uint_t idx)
 {
     nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
+    uint_t cq_idx;

     mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
         DDI_INTR_PRI(nvme->n_intr_pri));
-    sema_init(&qp->nq_sema, nentry, NULL, SEMA_DRIVER, NULL);
+
+    /*
+     * The NVMe spec defines that a full queue has one empty (unused) slot;
+     * initialize the semaphore accordingly.
+     */
+    sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL);

     if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
         DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
         goto fail;

-    if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
-        DDI_DMA_READ, &qp->nq_cqdma) != DDI_SUCCESS)
-        goto fail;
-
+    /*
+     * idx == 0 is adminq, those above 0 are shared io completion queues.
+     */
+    cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1);
+    qp->nq_cq = nvme->n_cq[cq_idx];
     qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
-    qp->nq_cq = (nvme_cqe_t *)qp->nq_cqdma->nd_memp;
     qp->nq_nentry = nentry;

     qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
-    qp->nq_cqhdbl = NVME_REG_CQHDBL(nvme, idx);

     qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
     qp->nq_next_cmd = 0;
@@ -962,43 +1076,102 @@ nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid)
     return (cmd);
 }

+/*
+ * Get the command tied to the next completed cqe and bump along completion
+ * queue head counter.
+ */
 static nvme_cmd_t *
-nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
+nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq)
 {
-    nvme_reg_cqhdbl_t head = { 0 };
-
+    nvme_qpair_t *qp;
     nvme_cqe_t *cqe;
     nvme_cmd_t *cmd;

-    (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
-        sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
+    ASSERT(mutex_owned(&cq->ncq_mutex));

-    mutex_enter(&qp->nq_mutex);
-    cqe = &qp->nq_cq[qp->nq_cqhead];
+    cqe = &cq->ncq_cq[cq->ncq_head];

     /* Check phase tag of CQE. Hardware inverts it for new entries. */
-    if (cqe->cqe_sf.sf_p == qp->nq_phase) {
-        mutex_exit(&qp->nq_mutex);
+    if (cqe->cqe_sf.sf_p == cq->ncq_phase)
         return (NULL);
-    }

-    ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
+    qp = nvme->n_ioq[cqe->cqe_sqid];

+    mutex_enter(&qp->nq_mutex);
     cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);
+    mutex_exit(&qp->nq_mutex);

     ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
     bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));

     qp->nq_sqhead = cqe->cqe_sqhd;

-    head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
+    cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry;

     /* Toggle phase on wrap-around. */
-    if (qp->nq_cqhead == 0)
-        qp->nq_phase = qp->nq_phase ? 0 : 1;
+    if (cq->ncq_head == 0)
+        cq->ncq_phase = cq->ncq_phase ? 0 : 1;
-    nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
-    mutex_exit(&qp->nq_mutex);
+    return (cmd);
+}
+
+/*
+ * Process all completed commands on the io completion queue.
+ */
+static uint_t
+nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq)
+{
+    nvme_reg_cqhdbl_t head = { 0 };
+    nvme_cmd_t *cmd;
+    uint_t completed = 0;
+
+    if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
+        DDI_SUCCESS)
+        dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
+            __func__);
+
+    mutex_enter(&cq->ncq_mutex);
+
+    while ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
+        taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
+            cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
+
+        completed++;
+    }
+
+    if (completed > 0) {
+        /*
+         * Update the completion queue head doorbell.
+         */
+        head.b.cqhdbl_cqh = cq->ncq_head;
+        nvme_put32(nvme, cq->ncq_hdbl, head.r);
+    }
+
+    mutex_exit(&cq->ncq_mutex);
+
+    return (completed);
+}
+
+static nvme_cmd_t *
+nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
+{
+    nvme_cq_t *cq = qp->nq_cq;
+    nvme_reg_cqhdbl_t head = { 0 };
+    nvme_cmd_t *cmd;
+
+    if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
+        DDI_SUCCESS)
+        dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
+            __func__);
+
+    mutex_enter(&cq->ncq_mutex);
+
+    if ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
+        head.b.cqhdbl_cqh = cq->ncq_head;
+        nvme_put32(nvme, cq->ncq_hdbl, head.r);
+    }
+
+    mutex_exit(&cq->ncq_mutex);

     return (cmd);
 }
@@ -1724,7 +1897,7 @@ nvme_get_logpage(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize,

     cmd->nc_sqe.sqe_cdw10 = getlogpage.r;

-    if (nvme_zalloc_dma(nvme, getlogpage.b.lp_numd * sizeof (uint32_t),
+    if (nvme_zalloc_dma(nvme, *bufsize,
         DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
         dev_err(nvme->n_dip, CE_WARN,
             "!nvme_zalloc_dma failed for GET LOG PAGE");
@@ -2036,50 +2209,80 @@ nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
 }

 static int
-nvme_set_nqueues(nvme_t *nvme, uint16_t *nqueues)
+nvme_set_nqueues(nvme_t *nvme)
 {
     nvme_nqueues_t nq = { 0 };
     int ret;

-    nq.b.nq_nsq = nq.b.nq_ncq = *nqueues - 1;
+    /*
+     * The default is to allocate one completion queue per vector.
+     */
+    if (nvme->n_completion_queues == -1)
+        nvme->n_completion_queues = nvme->n_intr_cnt;
+
+    /*
+     * There is no point in having more completion queues than
+     * interrupt vectors.
+     */
+    nvme->n_completion_queues = MIN(nvme->n_completion_queues,
+        nvme->n_intr_cnt);
+
+    /*
+     * The default is to use one submission queue per completion queue.
+     */
+    if (nvme->n_submission_queues == -1)
+        nvme->n_submission_queues = nvme->n_completion_queues;
+
+    /*
+     * There is no point in having more completion queues than
+     * submission queues.
+     */
+    nvme->n_completion_queues = MIN(nvme->n_completion_queues,
+        nvme->n_submission_queues);
+
+    ASSERT(nvme->n_submission_queues > 0);
+    ASSERT(nvme->n_completion_queues > 0);
+
+    nq.b.nq_nsq = nvme->n_submission_queues - 1;
+    nq.b.nq_ncq = nvme->n_completion_queues - 1;

     ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r,
         &nq.r);

     if (ret == 0) {
         /*
-         * Always use the same number of submission and completion
-         * queues, and never use more than the requested number of
-         * queues.
+         * Never use more than the requested number of queues.
         */
-        *nqueues = MIN(*nqueues, MIN(nq.b.nq_nsq, nq.b.nq_ncq) + 1);
+        nvme->n_submission_queues = MIN(nvme->n_submission_queues,
+            nq.b.nq_nsq + 1);
+        nvme->n_completion_queues = MIN(nvme->n_completion_queues,
+            nq.b.nq_ncq + 1);
     }

     return (ret);
 }

 static int
-nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
+nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq)
 {
     nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
     nvme_create_queue_dw10_t dw10 = { 0 };
     nvme_create_cq_dw11_t c_dw11 = { 0 };
-    nvme_create_sq_dw11_t s_dw11 = { 0 };
     int ret;

-    dw10.b.q_qid = idx;
-    dw10.b.q_qsize = qp->nq_nentry - 1;
+    dw10.b.q_qid = cq->ncq_id;
+    dw10.b.q_qsize = cq->ncq_nentry - 1;

     c_dw11.b.cq_pc = 1;
     c_dw11.b.cq_ien = 1;
-    c_dw11.b.cq_iv = idx % nvme->n_intr_cnt;
+    c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt;

     cmd->nc_sqid = 0;
     cmd->nc_callback = nvme_wakeup_cmd;
     cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
     cmd->nc_sqe.sqe_cdw10 = dw10.r;
     cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
-    cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_cqdma->nd_cookie.dmac_laddress;
+    cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress;

     nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);

@@ -2087,13 +2290,36 @@ nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
         dev_err(nvme->n_dip, CE_WARN,
             "!CREATE CQUEUE failed with sct = %x, sc = %x",
             cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
-        goto fail;
     }

     nvme_free_cmd(cmd);

+    return (ret);
+}
+
+static int
+nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
+{
+    nvme_cq_t *cq = qp->nq_cq;
+    nvme_cmd_t *cmd;
+    nvme_create_queue_dw10_t dw10 = { 0 };
+    nvme_create_sq_dw11_t s_dw11 = { 0 };
+    int ret;
+
+    /*
+     * It is possible to have more qpairs than completion queues,
+     * and when the idx > ncq_id, that completion queue is shared
+     * and has already been created.
+     */
+    if (idx <= cq->ncq_id &&
+        nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS)
+        return (DDI_FAILURE);
+
+    dw10.b.q_qid = idx;
+    dw10.b.q_qsize = qp->nq_nentry - 1;
+
     s_dw11.b.sq_pc = 1;
-    s_dw11.b.sq_cqid = idx;
+    s_dw11.b.sq_cqid = cq->ncq_id;

+    cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
     cmd->nc_sqid = 0;
@@ -2109,10 +2335,8 @@ nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
         dev_err(nvme->n_dip, CE_WARN,
             "!CREATE SQUEUE failed with sct = %x, sc = %x",
             cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
-        goto fail;
     }

-fail:
     nvme_free_cmd(cmd);

     return (ret);
@@ -2365,6 +2589,16 @@ nvme_init(nvme_t *nvme)
     }

     /*
+     * Create the cq array with one completion queue to be assigned
+     * to the admin queue pair.
+     */
+    if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len) !=
+        DDI_SUCCESS) {
+        dev_err(nvme->n_dip, CE_WARN,
+            "!failed to pre-allocate admin completion queue");
+        goto fail;
+    }
+
+    /*
     * Create the admin queue pair.
     */
     if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0)
@@ -2383,7 +2617,7 @@ nvme_init(nvme_t *nvme)

     aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1;
     asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress;
-    acq = nvme->n_adminq->nq_cqdma->nd_cookie.dmac_laddress;
+    acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress;

     ASSERT((asq & (nvme->n_pagesize - 1)) == 0);
     ASSERT((acq & (nvme->n_pagesize - 1)) == 0);
@@ -2635,13 +2869,11 @@ nvme_init(nvme_t *nvme)
         }
     }

-    nqueues = nvme->n_intr_cnt;
-
     /*
     * Create I/O queue pairs.
     */
-    if (nvme_set_nqueues(nvme, &nqueues) != 0) {
+    if (nvme_set_nqueues(nvme) != 0) {
         dev_err(nvme->n_dip, CE_WARN,
             "!failed to set number of I/O queues to %d",
             nvme->n_intr_cnt);
@@ -2653,20 +2885,55 @@ nvme_init(nvme_t *nvme)
     */
     kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
     nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
-        (nqueues + 1), KM_SLEEP);
+        (nvme->n_submission_queues + 1), KM_SLEEP);
     nvme->n_ioq[0] = nvme->n_adminq;

-    nvme->n_ioq_count = nqueues;
+    /*
+     * There should always be at least as many submission queues
+     * as completion queues.
+     */
+    ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues);
+
+    nvme->n_ioq_count = nvme->n_submission_queues;
+
+    nvme->n_io_squeue_len =
+        MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries);
+
+    (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len",
+        nvme->n_io_squeue_len);
+
+    /*
+     * Pre-allocate completion queues.
+     * When there are the same number of submission and completion
+     * queues there is no value in having a larger completion
+     * queue length.
+     */
+    if (nvme->n_submission_queues == nvme->n_completion_queues)
+        nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
+            nvme->n_io_squeue_len);
+
+    nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
+        nvme->n_max_queue_entries);
+
+    (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len",
+        nvme->n_io_cqueue_len);
+
+    if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1,
+        nvme->n_io_cqueue_len) != DDI_SUCCESS) {
+        dev_err(nvme->n_dip, CE_WARN,
+            "!failed to pre-allocate completion queues");
+        goto fail;
+    }

     /*
-     * If we got less queues than we asked for we might as well give
+     * If we use less completion queues than interrupt vectors return
     * some of the interrupt vectors back to the system.
     */
-    if (nvme->n_ioq_count < nvme->n_intr_cnt) {
+    if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) {
         nvme_release_interrupts(nvme);

         if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
-            nvme->n_ioq_count) != DDI_SUCCESS) {
+            nvme->n_completion_queues + 1) != DDI_SUCCESS) {
             dev_err(nvme->n_dip, CE_WARN,
                 "!failed to reduce number of interrupts");
             goto fail;
@@ -2676,13 +2943,9 @@ nvme_init(nvme_t *nvme)
     /*
     * Alloc & register I/O queue pairs
     */
-    nvme->n_io_queue_len =
-        MIN(nvme->n_io_queue_len, nvme->n_max_queue_entries);
-    (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-queue-len",
-        nvme->n_io_queue_len);

     for (i = 1; i != nvme->n_ioq_count + 1; i++) {
-        if (nvme_alloc_qpair(nvme, nvme->n_io_queue_len,
+        if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len,
             &nvme->n_ioq[i], i) != DDI_SUCCESS) {
             dev_err(nvme->n_dip, CE_WARN,
                 "!unable to allocate I/O qpair %d", i);
@@ -2720,7 +2983,6 @@ nvme_intr(caddr_t arg1, caddr_t arg2)
     int inum = (int)(uintptr_t)arg2;
     int ccnt = 0;
     int qnum;
-    nvme_cmd_t *cmd;

     if (inum >= nvme->n_intr_cnt)
         return (DDI_INTR_UNCLAIMED);
@@ -2735,13 +2997,9 @@ nvme_intr(caddr_t arg1, caddr_t arg2)
     * in steps of n_intr_cnt to process all queues using this vector.
     */
     for (qnum = inum;
-        qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
+        qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL;
         qnum += nvme->n_intr_cnt) {
-        while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) {
-            taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
-                cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
-            ccnt++;
-        }
+        ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]);
     }

     return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
@@ -2912,8 +3170,14 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
         B_TRUE : B_FALSE;
     nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
         DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN);
-    nvme->n_io_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
-        DDI_PROP_DONTPASS, "io-queue-len", NVME_DEFAULT_IO_QUEUE_LEN);
+    nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+        DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN);
+    /*
+     * Double up the default for completion queues in case of
+     * queue sharing.
+     */
+    nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+        DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN);
     nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
         DDI_PROP_DONTPASS, "async-event-limit",
         NVME_DEFAULT_ASYNC_EVENT_LIMIT);
@@ -2923,6 +3187,10 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
     nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
         DDI_PROP_DONTPASS, "min-phys-block-size",
         NVME_DEFAULT_MIN_BLOCK_SIZE);
+    nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+        DDI_PROP_DONTPASS, "max-submission-queues", -1);
+    nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+        DDI_PROP_DONTPASS, "max-completion-queues", -1);

     if (!ISP2(nvme->n_min_block_size) ||
         (nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) {
@@ -2933,13 +3201,33 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
         nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
     }

+    if (nvme->n_submission_queues != -1 &&
+        (nvme->n_submission_queues < 1 ||
+        nvme->n_submission_queues > UINT16_MAX)) {
+        dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not "
+            "valid. Must be [1..%d]", nvme->n_submission_queues,
+            UINT16_MAX);
+        nvme->n_submission_queues = -1;
+    }
+
+    if (nvme->n_completion_queues != -1 &&
+        (nvme->n_completion_queues < 1 ||
+        nvme->n_completion_queues > UINT16_MAX)) {
+        dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not "
+            "valid. Must be [1..%d]", nvme->n_completion_queues,
+            UINT16_MAX);
+        nvme->n_completion_queues = -1;
+    }
+
     if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN)
         nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN;
     else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN)
         nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN;

-    if (nvme->n_io_queue_len < NVME_MIN_IO_QUEUE_LEN)
-        nvme->n_io_queue_len = NVME_MIN_IO_QUEUE_LEN;
+    if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN)
+        nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN;
+    if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN)
+        nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN;

     if (nvme->n_async_event_limit < 1)
         nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT;
@@ -3151,6 +3439,12 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
     if (nvme->n_progress & NVME_ADMIN_QUEUE)
         nvme_free_qpair(nvme->n_adminq);

+    if (nvme->n_cq_count > 0) {
+        nvme_destroy_cq_array(nvme, 0);
+        nvme->n_cq = NULL;
+        nvme->n_cq_count = 0;
+    }
+
     if (nvme->n_idctl)
         kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE);
@@ -3222,7 +3516,7 @@ nvme_fill_prp(nvme_cmd_t *cmd, bd_xfer_t *xfer)

     xfer->x_ndmac--;

-    nprp_page = nvme->n_pagesize / sizeof (uint64_t) - 1;
+    nprp_page = nvme->n_pagesize / sizeof (uint64_t);
     ASSERT(nprp_page > 0);
     nprp = (xfer->x_ndmac + nprp_page - 1) / nprp_page;
@@ -3325,7 +3619,7 @@ nvme_bd_driveinfo(void *arg, bd_drive_t *drive)
     * TODO: need to figure out a sane default, or use per-NS I/O queues,
     * or change blkdev to handle EAGAIN
     */
-    drive->d_qsize = nvme->n_ioq_count * nvme->n_io_queue_len
+    drive->d_qsize = nvme->n_ioq_count * nvme->n_io_squeue_len
         / nvme->n_namespace_count;

     /*
@@ -3405,7 +3699,7 @@ nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
     do {
         cmd = nvme_retrieve_cmd(nvme, ioq);
         if (cmd != NULL)
-            nvme_bd_xfer_done(cmd);
+            cmd->nc_callback(cmd);
         else
             drv_usecwait(10);
     } while (ioq->nq_active_cmds != 0);
diff --git a/usr/src/uts/common/io/nvme/nvme.conf b/usr/src/uts/common/io/nvme/nvme.conf
index f174e9cd80..982be2d538 100644
--- a/usr/src/uts/common/io/nvme/nvme.conf
+++ b/usr/src/uts/common/io/nvme/nvme.conf
@@ -9,6 +9,7 @@
 #
 #
 # Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+# Copyright 2019 Western Digital Corporation
 #
 #

@@ -28,9 +29,28 @@
 #admin-queue-len=256;

 #
-# The maximum length of the individual I/O queues can be overridden here
-# (16-65536).
-#io-queue-len=1024;
+# The maximum length of the individual I/O submission queues can be
+# overridden here (16-65536).
+#io-squeue-len=1024;
+
+#
+# The maximum length of the individual I/O completion queues can be
+# overridden here (16-65536).
+#io-cqueue-len=2048;
+
+#
+# The number of submission queues can be configured here. The default is
+# to match submission queues 1 for 1 with completion queues.
+# The range is 1-65535.
+#max-submission-queues=65535;
+
+#
+# The number of completion queues can be configured here. The default is
+# one per interrupt vector. Should there be fewer completion queues than
+# vectors, the number of vectors is reduced. The number of completion
+# queues will also be limited to the number of submission queues.
+# The range is 1-65535.
+#max-completion-queues=65535;

 #
 # The maximum number of outstanding asynchronous event requests can
diff --git a/usr/src/uts/common/io/nvme/nvme_var.h b/usr/src/uts/common/io/nvme/nvme_var.h
index 49d900040e..780205d145 100644
--- a/usr/src/uts/common/io/nvme/nvme_var.h
+++ b/usr/src/uts/common/io/nvme/nvme_var.h
@@ -13,6 +13,7 @@
 * Copyright 2018 Nexenta Systems, Inc.
 * Copyright 2016 The MathWorks, Inc. All rights reserved.
 * Copyright 2017 Joyent, Inc.
+ * Copyright 2019 Western Digital Corporation.
 */

 #ifndef _NVME_VAR_H
@@ -33,10 +34,10 @@ extern "C" {
 #endif

 #define NVME_FMA_INIT           0x1
-#define NVME_REGS_MAPPED        0x2
-#define NVME_ADMIN_QUEUE        0x4
-#define NVME_CTRL_LIMITS        0x8
-#define NVME_INTERRUPTS         0x10
+#define NVME_REGS_MAPPED        0x2
+#define NVME_ADMIN_QUEUE        0x4
+#define NVME_CTRL_LIMITS        0x8
+#define NVME_INTERRUPTS         0x10

 #define NVME_MIN_ADMIN_QUEUE_LEN        16
 #define NVME_MIN_IO_QUEUE_LEN           16
@@ -52,6 +53,7 @@ typedef struct nvme_namespace nvme_namespace_t;
 typedef struct nvme_minor_state nvme_minor_state_t;
 typedef struct nvme_dma nvme_dma_t;
 typedef struct nvme_cmd nvme_cmd_t;
+typedef struct nvme_cq nvme_cq_t;
 typedef struct nvme_qpair nvme_qpair_t;
 typedef struct nvme_task_arg nvme_task_arg_t;

@@ -92,6 +94,20 @@ struct nvme_cmd {
     nvme_t *nc_nvme;
 };

+struct nvme_cq {
+    size_t ncq_nentry;
+    uint16_t ncq_id;
+
+    nvme_dma_t *ncq_dma;
+    nvme_cqe_t *ncq_cq;
+    uint_t ncq_head;
+    uint_t ncq_tail;
+    uintptr_t ncq_hdbl;
+    int ncq_phase;
+
+    kmutex_t ncq_mutex;
+};
+
 struct nvme_qpair {
     size_t nq_nentry;

@@ -101,16 +117,11 @@ struct nvme_qpair {
     uint_t nq_sqtail;
     uintptr_t nq_sqtdbl;

-    nvme_dma_t *nq_cqdma;
-    nvme_cqe_t *nq_cq;
-    uint_t nq_cqhead;
-    uint_t nq_cqtail;
-    uintptr_t nq_cqhdbl;
+    nvme_cq_t *nq_cq;

     nvme_cmd_t **nq_cmd;
     uint16_t nq_next_cmd;
     uint_t nq_active_cmds;
-    int nq_phase;

     kmutex_t nq_mutex;
     ksema_t nq_sema;
@@ -142,7 +153,8 @@ struct nvme {
     boolean_t n_strict_version;
     boolean_t n_ignore_unknown_vendor_status;
     uint32_t n_admin_queue_len;
-    uint32_t n_io_queue_len;
+    uint32_t n_io_squeue_len;
+    uint32_t n_io_cqueue_len;
     uint16_t n_async_event_limit;
     uint_t n_min_block_size;
     uint16_t n_abort_command_limit;
@@ -154,6 +166,8 @@ struct nvme {
     boolean_t n_auto_pst_supported;
     boolean_t n_async_event_supported;
     boolean_t n_progress_supported;
+    int n_submission_queues;
+    int n_completion_queues;

     int n_nssr_supported;
     int n_doorbell_stride;
@@ -165,12 +179,14 @@ struct nvme {
     int n_pagesize;
     int n_namespace_count;
-    uint16_t n_ioq_count;
+    uint_t n_ioq_count;
+    uint_t n_cq_count;

     nvme_identify_ctrl_t *n_idctl;

     nvme_qpair_t *n_adminq;
     nvme_qpair_t **n_ioq;
+    nvme_cq_t **n_cq;

     nvme_namespace_t *n_ns;
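
The nvme_get_completed()/nvme_process_iocq() split above preserves the usual NVMe phase-tag protocol: a completion entry is fresh only while its phase bit differs from the phase the consumer expects, and the expected phase flips each time the head index wraps. A minimal stand-alone model of that protocol (all names invented for illustration, not the driver's):

#include <stdio.h>

#define CQ_LEN  4

/*
 * Toy completion queue entry carrying only the phase bit that
 * nvme_get_completed() checks via cqe_sf.sf_p.
 */
struct cqe {
    int phase;
};

int
main(void)
{
    struct cqe cq[CQ_LEN] = {{ 0 }};
    unsigned int head = 0, pass, slot;
    int phase = 0;      /* consumer starts expecting "old" == 0 */

    for (pass = 0; pass < 2; pass++) {
        /* "Hardware" posts a full ring; it inverts the phase each lap. */
        for (slot = 0; slot < CQ_LEN; slot++)
            cq[slot].phase = !pass;

        /* Consume while the entry's phase differs from the expected one. */
        while (cq[head].phase != phase) {
            printf("pass %u: consumed slot %u\n", pass, head);
            head = (head + 1) % CQ_LEN;
            if (head == 0)      /* toggle phase on wrap-around */
                phase = !phase;
        }
    }
    return (0);
}

Each lap drains all CQ_LEN slots and then stops cleanly at the first stale entry, which is exactly the loop structure nvme_process_iocq() runs under ncq_mutex.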
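Bug 11230's fix is likewise a one-token change to the PRP arithmetic in nvme_fill_prp(): each PRP list page holds pagesize / sizeof (uint64_t) entries, and the number of list pages is the ceiling division of the remaining DMA cookies by that figure. A quick check of the corrected expressions, with example inputs rather than real driver state:

#include <stdio.h>
#include <stdint.h>

/*
 * Corrected 11230 arithmetic: entries per PRP list page and the page
 * count as a plain ceiling division. Before the fix a stray "- 1" in
 * nprp_page made the per-page entry count disagree with the layout the
 * transfer actually uses. The values below are examples only.
 */
int
main(void)
{
    size_t pagesize = 4096;
    size_t nprp_page = pagesize / sizeof (uint64_t);    /* 512 */
    size_t ndmac = 1024;        /* remaining DMA cookies (example) */
    size_t nprp = (ndmac + nprp_page - 1) / nprp_page;  /* ceil() */

    printf("%zu PRP entries per page, %zu list page(s) for %zu cookies\n",
        nprp_page, nprp, ndmac);
    return (0);
}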