author    Paul Winder <Paul.Winder@wdc.com>    2019-06-05 14:02:04 +0100
committer Dan McDonald <danmcd@joyent.com>     2019-06-20 10:02:46 -0400
commit    0999c1123c1ab769df080ccc5f1626d50663e7a8 (patch)
tree      78ca9f53ced438d9d1a49145c70dbb32979e33d6
parent    c89583d1669aac784ae44a473d81f8c5f564c728 (diff)
download  illumos-joyent-0999c1123c1ab769df080ccc5f1626d50663e7a8.tar.gz
11202 Allow the number of NVMe submission and completion queues to be different
11228 nvme may queue more submissions than allowed
11229 nvme_get_logpage() can allocate a too small buffer to receive logpage data
11230 Panic in nvme_fill_prp() because of miscalculation of the number of PRPs per page
11231 nvme in polled mode ignores the command call back
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org>
Reviewed by: Gergő Mihály Doma <domag02@gmail.com>
Reviewed by: Youzhong Yang <youzhong@gmail.com>
Approved by: Dan McDonald <danmcd@joyent.com>
-rw-r--r--  usr/src/man/man7d/nvme.7d             |  33
-rw-r--r--  usr/src/uts/common/io/nvme/nvme.c     | 460
-rw-r--r--  usr/src/uts/common/io/nvme/nvme.conf  |  26
-rw-r--r--  usr/src/uts/common/io/nvme/nvme_var.h |  40
4 files changed, 459 insertions, 100 deletions
diff --git a/usr/src/man/man7d/nvme.7d b/usr/src/man/man7d/nvme.7d
index 83f72f346a..2819a788e5 100644
--- a/usr/src/man/man7d/nvme.7d
+++ b/usr/src/man/man7d/nvme.7d
@@ -11,6 +11,7 @@
.\"
.\" Copyright 2016 Nexenta Systems, Inc. All rights reserved.
.\" Copyright (c) 2018, Joyent, Inc.
+.\" Copyright 2019, Western Digital Corporation
.\"
.Dd July 31, 2018
.Dt NVME 7D
@@ -58,9 +59,17 @@ status.
.It Va admin-queue-len
This is the number of entries in the admin command queue.
Legal values are between 16 and 4096, the default value is 256.
-.It Va io-queue-len
-This is the number of entries in each I/O command queue.
+.It Va io-squeue-len
+This is the number of entries in each I/O submission queue.
Legal values are between 16 and 65536, the default value is 1024.
+.It Va io-cqueue-len
+This is the number of entries in each I/O completion queue.
+Legal values are between 16 and 65536, the default value is 2048.
+When the number of submission and completion queues is the same,
+the queue lengths will both be set to the lesser of
+.Va io-squeue-len
+and
+.Va io-cqueue-len .
.It Va async-event-limit
This is the maximum number of asynchronous event requests issued by
the driver.
@@ -78,6 +87,26 @@ This is the minimum physical block size that is reported to
This value must be a power of 2 greater than or equal to 512.
If the device reports a best block size larger than what is
specified here the driver will ignore the value specified here.
+.It Va max-submission-queues
+This is the maximum number of submission queues the driver will create per
+device.
+Legal values are between 1 and 65535; the default is to
+match the value of
+.Em max-completion-queues .
+The number of queues created will not be allowed to exceed the
+drive's hardware limitation.
+If the number of submission queues is larger than
+.Em max-completion-queues
+the completion queues will be shared across the submission
+queues.
+.It Va max-completion-queues
+This is the maximum number of completion queues the driver will create per
+device.
+Legal values are between 1 and 65535; the default is to match
+the number of interrupt vectors allocated to the drive.
+The number of queues created will not exceed the number of interrupt vectors,
+.Em max-submission-queues ,
+or the drive's hardware limitation.
.El
.
.Sh FILES
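The rule for combining io-squeue-len and io-cqueue-len condenses to a few
lines of C. Below is a minimal standalone sketch of it; the function and
variable names are illustrative only, not the driver's:

/*
 * Sketch of the length rule described above: when the submission and
 * completion queue counts match, the completion queue length is
 * clamped to the submission queue length.
 */
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

static unsigned int
effective_cqueue_len(unsigned int squeue_len, unsigned int cqueue_len,
    unsigned int nsq, unsigned int ncq)
{
	if (nsq == ncq)
		return (MIN(cqueue_len, squeue_len));
	return (cqueue_len);	/* shared CQs keep their configured length */
}

int
main(void)
{
	/* Defaults: io-squeue-len=1024, io-cqueue-len=2048. */
	printf("%u\n", effective_cqueue_len(1024, 2048, 8, 8));	/* 1024 */
	return (0);
}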
diff --git a/usr/src/uts/common/io/nvme/nvme.c b/usr/src/uts/common/io/nvme/nvme.c
index cfe314b03e..8a3af7d7d7 100644
--- a/usr/src/uts/common/io/nvme/nvme.c
+++ b/usr/src/uts/common/io/nvme/nvme.c
@@ -14,6 +14,7 @@
* Copyright 2016 Tegile Systems, Inc. All rights reserved.
* Copyright (c) 2016 The MathWorks, Inc. All rights reserved.
* Copyright 2018 Joyent, Inc.
+ * Copyright 2019 Western Digital Corporation.
*/
/*
@@ -56,8 +57,8 @@
* From the hardware perspective both queues of a queue pair are independent,
* but they share some driver state: the command array (holding pointers to
* commands currently being processed by the hardware) and the active command
- * counter. Access to a queue pair and the shared state is protected by
- * nq_mutex.
+ * counter. Access to a submission queue and the shared state is protected by
+ * nq_mutex; the completion queue is protected by ncq_mutex.
*
* When a command is submitted to a queue pair the active command counter is
* incremented and a pointer to the command is stored in the command array. The
@@ -161,11 +162,12 @@
*
* Locking:
*
- * Each queue pair has its own nq_mutex, which must be held when accessing the
- * associated queue registers or the shared state of the queue pair. Callers of
- * nvme_unqueue_cmd() must make sure that nq_mutex is held, while
- * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of this
- * themselves.
+ * Each queue pair has an nq_mutex and ncq_mutex. The nq_mutex must be held
+ * when accessing shared state and submission queue registers; ncq_mutex
+ * is held when accessing completion queue state and registers.
+ * Callers of nvme_unqueue_cmd() must make sure that nq_mutex is held, while
+ * nvme_submit_{admin,io}_cmd() and nvme_retrieve_cmd() take care of both
+ * mutexes themselves.
*
* Each command also has its own nc_mutex, which is associated with the
* condition variable nc_cv. It is only used on admin commands which are run
@@ -180,6 +182,14 @@
* nvme_abort_cmd() to prevent the command from completing while the abort is in
* progress.
*
+ * If both nq_mutex and ncq_mutex need to be held, ncq_mutex must be
+ * acquired first. More than one nq_mutex is never held by a single thread.
+ * The ncq_mutex is only held by nvme_retrieve_cmd() and
+ * nvme_process_iocq(). nvme_process_iocq() is only called from the
+ * interrupt thread and nvme_retrieve_cmd() during polled I/O, so the
+ * mutex is non-contentious but is required for implementation completeness
+ * and safety.
+ *
* Each minor node has its own nm_mutex, which protects the open count nm_ocnt
* and exclusive-open flag nm_oexcl.
*
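A standalone model of the lock order stated above, with pthread mutexes
standing in for kernel mutexes; the types and names are simplified
stand-ins, not the driver's:

/* Lock order model: the CQ lock is always taken before a qpair lock. */
#include <pthread.h>

typedef struct { pthread_mutex_t ncq_mutex; } cq_model_t;
typedef struct { pthread_mutex_t nq_mutex; } qp_model_t;

void
retrieve_one(cq_model_t *cq, qp_model_t *qp)
{
	pthread_mutex_lock(&cq->ncq_mutex);	/* ncq_mutex first */
	pthread_mutex_lock(&qp->nq_mutex);	/* then nq_mutex */
	/* ... unqueue the completed command from the qpair ... */
	pthread_mutex_unlock(&qp->nq_mutex);
	pthread_mutex_unlock(&cq->ncq_mutex);
}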
@@ -200,13 +210,18 @@
* - ignore-unknown-vendor-status: can be set to 1 to not handle any vendor
* specific command status as a fatal error leading device faulting
* - admin-queue-len: the maximum length of the admin queue (16-4096)
- * - io-queue-len: the maximum length of the I/O queues (16-65536)
+ * - io-squeue-len: the maximum length of the I/O submission queues (16-65536)
+ * - io-cqueue-len: the maximum length of the I/O completion queues (16-65536)
* - async-event-limit: the maximum number of asynchronous event requests to be
* posted by the driver
* - volatile-write-cache-enable: can be set to 0 to disable the volatile write
* cache
* - min-phys-block-size: the minimum physical block size to report to blkdev,
* which is among other things the basis for ZFS vdev ashift
+ * - max-submission-queues: the maximum number of I/O submission queues.
+ * - max-completion-queues: the maximum number of I/O completion queues;
+ *   this can be less than max-submission-queues, in which case the completion
+ * queues are shared.
*
*
* TODO:
@@ -334,7 +349,7 @@ static int nvme_set_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t,
static int nvme_get_features(nvme_t *, boolean_t, uint32_t, uint8_t, uint32_t *,
void **, size_t *);
static int nvme_write_cache_set(nvme_t *, boolean_t);
-static int nvme_set_nqueues(nvme_t *, uint16_t *);
+static int nvme_set_nqueues(nvme_t *);
static void nvme_free_dma(nvme_dma_t *);
static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
@@ -342,7 +357,7 @@ static int nvme_zalloc_dma(nvme_t *, size_t, uint_t, ddi_dma_attr_t *,
static int nvme_zalloc_queue_dma(nvme_t *, uint32_t, uint16_t, uint_t,
nvme_dma_t **);
static void nvme_free_qpair(nvme_qpair_t *);
-static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, int);
+static int nvme_alloc_qpair(nvme_t *, uint32_t, nvme_qpair_t **, uint_t);
static int nvme_create_io_qpair(nvme_t *, nvme_qpair_t *, uint16_t);
static inline void nvme_put64(nvme_t *, uintptr_t, uint64_t);
@@ -762,8 +777,6 @@ nvme_zalloc_queue_dma(nvme_t *nvme, uint32_t nentry, uint16_t qe_len,
len = roundup(len, nvme->n_pagesize);
- q_dma_attr.dma_attr_minxfer = len;
-
if (nvme_zalloc_dma(nvme, len, flags, &q_dma_attr, dma)
!= DDI_SUCCESS) {
dev_err(nvme->n_dip, CE_WARN,
@@ -789,6 +802,17 @@ fail:
}
static void
+nvme_free_cq(nvme_cq_t *cq)
+{
+ mutex_destroy(&cq->ncq_mutex);
+
+ if (cq->ncq_dma != NULL)
+ nvme_free_dma(cq->ncq_dma);
+
+ kmem_free(cq, sizeof (*cq));
+}
+
+static void
nvme_free_qpair(nvme_qpair_t *qp)
{
int i;
@@ -798,8 +822,6 @@ nvme_free_qpair(nvme_qpair_t *qp)
if (qp->nq_sqdma != NULL)
nvme_free_dma(qp->nq_sqdma);
- if (qp->nq_cqdma != NULL)
- nvme_free_dma(qp->nq_cqdma);
if (qp->nq_active_cmds > 0)
for (i = 0; i != qp->nq_nentry; i++)
@@ -812,30 +834,122 @@ nvme_free_qpair(nvme_qpair_t *qp)
kmem_free(qp, sizeof (nvme_qpair_t));
}
+/*
+ * Destroy the pre-allocated cq array, but only free individual completion
+ * queues from the given starting index.
+ */
+static void
+nvme_destroy_cq_array(nvme_t *nvme, uint_t start)
+{
+ uint_t i;
+
+ for (i = start; i < nvme->n_cq_count; i++)
+ if (nvme->n_cq[i] != NULL)
+ nvme_free_cq(nvme->n_cq[i]);
+
+ kmem_free(nvme->n_cq, sizeof (*nvme->n_cq) * nvme->n_cq_count);
+}
+
+static int
+nvme_alloc_cq(nvme_t *nvme, uint32_t nentry, nvme_cq_t **cqp, uint16_t idx)
+{
+ nvme_cq_t *cq = kmem_zalloc(sizeof (*cq), KM_SLEEP);
+
+ mutex_init(&cq->ncq_mutex, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(nvme->n_intr_pri));
+
+ if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
+ DDI_DMA_READ, &cq->ncq_dma) != DDI_SUCCESS)
+ goto fail;
+
+ cq->ncq_cq = (nvme_cqe_t *)cq->ncq_dma->nd_memp;
+ cq->ncq_nentry = nentry;
+ cq->ncq_id = idx;
+ cq->ncq_hdbl = NVME_REG_CQHDBL(nvme, idx);
+
+ *cqp = cq;
+ return (DDI_SUCCESS);
+
+fail:
+ nvme_free_cq(cq);
+ *cqp = NULL;
+
+ return (DDI_FAILURE);
+}
+
+/*
+ * Create the n_cq array big enough to hold "ncq" completion queues.
+ * If the array already exists it will be re-sized (but only larger).
+ * The admin queue is included in this array, which boosts the
+ * max number of entries to UINT16_MAX + 1.
+ */
+static int
+nvme_create_cq_array(nvme_t *nvme, uint_t ncq, uint32_t nentry)
+{
+ nvme_cq_t **cq;
+ uint_t i, cq_count;
+
+ ASSERT3U(ncq, >, nvme->n_cq_count);
+
+ cq = nvme->n_cq;
+ cq_count = nvme->n_cq_count;
+
+ nvme->n_cq = kmem_zalloc(sizeof (*nvme->n_cq) * ncq, KM_SLEEP);
+ nvme->n_cq_count = ncq;
+
+ for (i = 0; i < cq_count; i++)
+ nvme->n_cq[i] = cq[i];
+
+ for (; i < nvme->n_cq_count; i++)
+ if (nvme_alloc_cq(nvme, nentry, &nvme->n_cq[i], i) !=
+ DDI_SUCCESS)
+ goto fail;
+
+ if (cq != NULL)
+ kmem_free(cq, sizeof (*cq) * cq_count);
+
+ return (DDI_SUCCESS);
+
+fail:
+ nvme_destroy_cq_array(nvme, cq_count);
+ /*
+ * Restore the original array
+ */
+ nvme->n_cq_count = cq_count;
+ nvme->n_cq = cq;
+
+ return (DDI_FAILURE);
+}
+
static int
nvme_alloc_qpair(nvme_t *nvme, uint32_t nentry, nvme_qpair_t **nqp,
- int idx)
+ uint_t idx)
{
nvme_qpair_t *qp = kmem_zalloc(sizeof (*qp), KM_SLEEP);
+ uint_t cq_idx;
mutex_init(&qp->nq_mutex, NULL, MUTEX_DRIVER,
DDI_INTR_PRI(nvme->n_intr_pri));
- sema_init(&qp->nq_sema, nentry, NULL, SEMA_DRIVER, NULL);
+
+ /*
+ * The NVMe spec defines that a full queue has one empty (unused) slot;
+ * initialize the semaphore accordingly.
+ */
+ sema_init(&qp->nq_sema, nentry - 1, NULL, SEMA_DRIVER, NULL);
if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_sqe_t),
DDI_DMA_WRITE, &qp->nq_sqdma) != DDI_SUCCESS)
goto fail;
- if (nvme_zalloc_queue_dma(nvme, nentry, sizeof (nvme_cqe_t),
- DDI_DMA_READ, &qp->nq_cqdma) != DDI_SUCCESS)
- goto fail;
-
+ /*
+ * idx == 0 is adminq; those above 0 are shared I/O completion queues.
+ */
+ cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (nvme->n_cq_count - 1);
+ qp->nq_cq = nvme->n_cq[cq_idx];
qp->nq_sq = (nvme_sqe_t *)qp->nq_sqdma->nd_memp;
- qp->nq_cq = (nvme_cqe_t *)qp->nq_cqdma->nd_memp;
qp->nq_nentry = nentry;
qp->nq_sqtdbl = NVME_REG_SQTDBL(nvme, idx);
- qp->nq_cqhdbl = NVME_REG_CQHDBL(nvme, idx);
qp->nq_cmd = kmem_zalloc(sizeof (nvme_cmd_t *) * nentry, KM_SLEEP);
qp->nq_next_cmd = 0;
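The cq_idx expression above distributes qpairs round-robin over the shared
I/O completion queues. A standalone demo with hypothetical counts:

/*
 * Demo of the mapping cq_idx = idx == 0 ? 0 : 1 + (idx - 1) % (n_cq_count - 1).
 * With 8 I/O qpairs sharing 4 I/O completion queues (n_cq_count = 5,
 * including the admin CQ at index 0), qpairs 1..8 map to CQs
 * 1 2 3 4 1 2 3 4.
 */
#include <stdio.h>

int
main(void)
{
	unsigned int n_cq_count = 5;	/* admin CQ + 4 I/O CQs */
	unsigned int idx;

	for (idx = 0; idx <= 8; idx++) {
		unsigned int cq_idx =
		    idx == 0 ? 0 : 1 + (idx - 1) % (n_cq_count - 1);
		printf("qpair %u -> cq %u\n", idx, cq_idx);
	}
	return (0);
}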
@@ -962,43 +1076,102 @@ nvme_unqueue_cmd(nvme_t *nvme, nvme_qpair_t *qp, int cid)
return (cmd);
}
+/*
+ * Get the command tied to the next completed cqe and bump along the
+ * completion queue head counter.
+ */
static nvme_cmd_t *
-nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
+nvme_get_completed(nvme_t *nvme, nvme_cq_t *cq)
{
- nvme_reg_cqhdbl_t head = { 0 };
-
+ nvme_qpair_t *qp;
nvme_cqe_t *cqe;
nvme_cmd_t *cmd;
- (void) ddi_dma_sync(qp->nq_cqdma->nd_dmah, 0,
- sizeof (nvme_cqe_t) * qp->nq_nentry, DDI_DMA_SYNC_FORKERNEL);
+ ASSERT(mutex_owned(&cq->ncq_mutex));
- mutex_enter(&qp->nq_mutex);
- cqe = &qp->nq_cq[qp->nq_cqhead];
+ cqe = &cq->ncq_cq[cq->ncq_head];
/* Check phase tag of CQE. Hardware inverts it for new entries. */
- if (cqe->cqe_sf.sf_p == qp->nq_phase) {
- mutex_exit(&qp->nq_mutex);
+ if (cqe->cqe_sf.sf_p == cq->ncq_phase)
return (NULL);
- }
- ASSERT(nvme->n_ioq[cqe->cqe_sqid] == qp);
+ qp = nvme->n_ioq[cqe->cqe_sqid];
+ mutex_enter(&qp->nq_mutex);
cmd = nvme_unqueue_cmd(nvme, qp, cqe->cqe_cid);
+ mutex_exit(&qp->nq_mutex);
ASSERT(cmd->nc_sqid == cqe->cqe_sqid);
bcopy(cqe, &cmd->nc_cqe, sizeof (nvme_cqe_t));
qp->nq_sqhead = cqe->cqe_sqhd;
- head.b.cqhdbl_cqh = qp->nq_cqhead = (qp->nq_cqhead + 1) % qp->nq_nentry;
+ cq->ncq_head = (cq->ncq_head + 1) % cq->ncq_nentry;
/* Toggle phase on wrap-around. */
- if (qp->nq_cqhead == 0)
- qp->nq_phase = qp->nq_phase ? 0 : 1;
+ if (cq->ncq_head == 0)
+ cq->ncq_phase = cq->ncq_phase ? 0 : 1;
- nvme_put32(cmd->nc_nvme, qp->nq_cqhdbl, head.r);
- mutex_exit(&qp->nq_mutex);
+ return (cmd);
+}
+
+/*
+ * Process all completed commands on the io completion queue.
+ */
+static uint_t
+nvme_process_iocq(nvme_t *nvme, nvme_cq_t *cq)
+{
+ nvme_reg_cqhdbl_t head = { 0 };
+ nvme_cmd_t *cmd;
+ uint_t completed = 0;
+
+ if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
+ DDI_SUCCESS)
+ dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
+ __func__);
+
+ mutex_enter(&cq->ncq_mutex);
+
+ while ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
+ taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
+ cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
+
+ completed++;
+ }
+
+ if (completed > 0) {
+ /*
+ * Update the completion queue head doorbell.
+ */
+ head.b.cqhdbl_cqh = cq->ncq_head;
+ nvme_put32(nvme, cq->ncq_hdbl, head.r);
+ }
+
+ mutex_exit(&cq->ncq_mutex);
+
+ return (completed);
+}
+
+static nvme_cmd_t *
+nvme_retrieve_cmd(nvme_t *nvme, nvme_qpair_t *qp)
+{
+ nvme_cq_t *cq = qp->nq_cq;
+ nvme_reg_cqhdbl_t head = { 0 };
+ nvme_cmd_t *cmd;
+
+ if (ddi_dma_sync(cq->ncq_dma->nd_dmah, 0, 0, DDI_DMA_SYNC_FORKERNEL) !=
+ DDI_SUCCESS)
+ dev_err(nvme->n_dip, CE_WARN, "!ddi_dma_sync() failed in %s",
+ __func__);
+
+ mutex_enter(&cq->ncq_mutex);
+
+ if ((cmd = nvme_get_completed(nvme, cq)) != NULL) {
+ head.b.cqhdbl_cqh = cq->ncq_head;
+ nvme_put32(nvme, cq->ncq_hdbl, head.r);
+ }
+
+ mutex_exit(&cq->ncq_mutex);
return (cmd);
}
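The phase-tag handling in nvme_get_completed() can be modeled standalone.
This sketch uses illustrative structures (not the driver's): the consumer's
phase field holds the phase value of already-consumed entries, the
controller writes new entries with the opposite bit, and the consumer
toggles its phase on every wrap-around.

#include <stddef.h>

typedef struct {
	int cqe_phase;		/* phase bit in the completion entry */
	int cqe_cid;		/* command identifier */
} cqe_model_t;

typedef struct {
	cqe_model_t *cq;	/* completion queue memory */
	unsigned int head;
	unsigned int nentry;
	int phase;		/* phase of entries already consumed */
} cq_model_t;

/* Return the next new entry, or NULL if nothing has completed. */
cqe_model_t *
cq_next(cq_model_t *cq)
{
	cqe_model_t *cqe = &cq->cq[cq->head];

	if (cqe->cqe_phase == cq->phase)	/* stale: nothing new */
		return (NULL);

	cq->head = (cq->head + 1) % cq->nentry;
	if (cq->head == 0)			/* wrapped: toggle phase */
		cq->phase = !cq->phase;

	return (cqe);
}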
@@ -1724,7 +1897,7 @@ nvme_get_logpage(nvme_t *nvme, boolean_t user, void **buf, size_t *bufsize,
cmd->nc_sqe.sqe_cdw10 = getlogpage.r;
- if (nvme_zalloc_dma(nvme, getlogpage.b.lp_numd * sizeof (uint32_t),
+ if (nvme_zalloc_dma(nvme, *bufsize,
DDI_DMA_READ, &nvme->n_prp_dma_attr, &cmd->nc_dma) != DDI_SUCCESS) {
dev_err(nvme->n_dip, CE_WARN,
"!nvme_zalloc_dma failed for GET LOG PAGE");
@@ -2036,50 +2209,80 @@ nvme_write_cache_set(nvme_t *nvme, boolean_t enable)
}
static int
-nvme_set_nqueues(nvme_t *nvme, uint16_t *nqueues)
+nvme_set_nqueues(nvme_t *nvme)
{
nvme_nqueues_t nq = { 0 };
int ret;
- nq.b.nq_nsq = nq.b.nq_ncq = *nqueues - 1;
+ /*
+ * The default is to allocate one completion queue per vector.
+ */
+ if (nvme->n_completion_queues == -1)
+ nvme->n_completion_queues = nvme->n_intr_cnt;
+
+ /*
+ * There is no point in having more completion queues than
+ * interrupt vectors.
+ */
+ nvme->n_completion_queues = MIN(nvme->n_completion_queues,
+ nvme->n_intr_cnt);
+
+ /*
+ * The default is to use one submission queue per completion queue.
+ */
+ if (nvme->n_submission_queues == -1)
+ nvme->n_submission_queues = nvme->n_completion_queues;
+
+ /*
+ * There is no point in having more completion queues than
+ * submission queues.
+ */
+ nvme->n_completion_queues = MIN(nvme->n_completion_queues,
+ nvme->n_submission_queues);
+
+ ASSERT(nvme->n_submission_queues > 0);
+ ASSERT(nvme->n_completion_queues > 0);
+
+ nq.b.nq_nsq = nvme->n_submission_queues - 1;
+ nq.b.nq_ncq = nvme->n_completion_queues - 1;
ret = nvme_set_features(nvme, B_FALSE, 0, NVME_FEAT_NQUEUES, nq.r,
&nq.r);
if (ret == 0) {
/*
- * Always use the same number of submission and completion
- * queues, and never use more than the requested number of
- * queues.
+ * Never use more than the requested number of queues.
*/
- *nqueues = MIN(*nqueues, MIN(nq.b.nq_nsq, nq.b.nq_ncq) + 1);
+ nvme->n_submission_queues = MIN(nvme->n_submission_queues,
+ nq.b.nq_nsq + 1);
+ nvme->n_completion_queues = MIN(nvme->n_completion_queues,
+ nq.b.nq_ncq + 1);
}
return (ret);
}
static int
-nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
+nvme_create_completion_queue(nvme_t *nvme, nvme_cq_t *cq)
{
nvme_cmd_t *cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
nvme_create_queue_dw10_t dw10 = { 0 };
nvme_create_cq_dw11_t c_dw11 = { 0 };
- nvme_create_sq_dw11_t s_dw11 = { 0 };
int ret;
- dw10.b.q_qid = idx;
- dw10.b.q_qsize = qp->nq_nentry - 1;
+ dw10.b.q_qid = cq->ncq_id;
+ dw10.b.q_qsize = cq->ncq_nentry - 1;
c_dw11.b.cq_pc = 1;
c_dw11.b.cq_ien = 1;
- c_dw11.b.cq_iv = idx % nvme->n_intr_cnt;
+ c_dw11.b.cq_iv = cq->ncq_id % nvme->n_intr_cnt;
cmd->nc_sqid = 0;
cmd->nc_callback = nvme_wakeup_cmd;
cmd->nc_sqe.sqe_opc = NVME_OPC_CREATE_CQUEUE;
cmd->nc_sqe.sqe_cdw10 = dw10.r;
cmd->nc_sqe.sqe_cdw11 = c_dw11.r;
- cmd->nc_sqe.sqe_dptr.d_prp[0] = qp->nq_cqdma->nd_cookie.dmac_laddress;
+ cmd->nc_sqe.sqe_dptr.d_prp[0] = cq->ncq_dma->nd_cookie.dmac_laddress;
nvme_admin_cmd(cmd, nvme_admin_cmd_timeout);
@@ -2087,13 +2290,36 @@ nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
dev_err(nvme->n_dip, CE_WARN,
"!CREATE CQUEUE failed with sct = %x, sc = %x",
cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
- goto fail;
}
nvme_free_cmd(cmd);
+ return (ret);
+}
+
+static int
+nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
+{
+ nvme_cq_t *cq = qp->nq_cq;
+ nvme_cmd_t *cmd;
+ nvme_create_queue_dw10_t dw10 = { 0 };
+ nvme_create_sq_dw11_t s_dw11 = { 0 };
+ int ret;
+
+ /*
+ * It is possible to have more qpairs than completion queues,
+ * and when idx > ncq_id, that completion queue is shared
+ * and has already been created.
+ */
+ if (idx <= cq->ncq_id &&
+ nvme_create_completion_queue(nvme, cq) != DDI_SUCCESS)
+ return (DDI_FAILURE);
+
+ dw10.b.q_qid = idx;
+ dw10.b.q_qsize = qp->nq_nentry - 1;
+
s_dw11.b.sq_pc = 1;
- s_dw11.b.sq_cqid = idx;
+ s_dw11.b.sq_cqid = cq->ncq_id;
cmd = nvme_alloc_cmd(nvme, KM_SLEEP);
cmd->nc_sqid = 0;
@@ -2109,10 +2335,8 @@ nvme_create_io_qpair(nvme_t *nvme, nvme_qpair_t *qp, uint16_t idx)
dev_err(nvme->n_dip, CE_WARN,
"!CREATE SQUEUE failed with sct = %x, sc = %x",
cmd->nc_cqe.cqe_sf.sf_sct, cmd->nc_cqe.cqe_sf.sf_sc);
- goto fail;
}
-fail:
nvme_free_cmd(cmd);
return (ret);
@@ -2365,6 +2589,16 @@ nvme_init(nvme_t *nvme)
}
/*
+ * Create the cq array with one completion queue to be assigned
+ * to the admin queue pair.
+ */
+ if (nvme_create_cq_array(nvme, 1, nvme->n_admin_queue_len) !=
+ DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to pre-allocate admin completion queue");
+ goto fail;
+ }
+ /*
* Create the admin queue pair.
*/
if (nvme_alloc_qpair(nvme, nvme->n_admin_queue_len, &nvme->n_adminq, 0)
@@ -2383,7 +2617,7 @@ nvme_init(nvme_t *nvme)
aqa.b.aqa_asqs = aqa.b.aqa_acqs = nvme->n_admin_queue_len - 1;
asq = nvme->n_adminq->nq_sqdma->nd_cookie.dmac_laddress;
- acq = nvme->n_adminq->nq_cqdma->nd_cookie.dmac_laddress;
+ acq = nvme->n_adminq->nq_cq->ncq_dma->nd_cookie.dmac_laddress;
ASSERT((asq & (nvme->n_pagesize - 1)) == 0);
ASSERT((acq & (nvme->n_pagesize - 1)) == 0);
@@ -2635,13 +2869,11 @@ nvme_init(nvme_t *nvme)
}
}
- nqueues = nvme->n_intr_cnt;
-
/*
* Create I/O queue pairs.
*/
- if (nvme_set_nqueues(nvme, &nqueues) != 0) {
+ if (nvme_set_nqueues(nvme) != 0) {
dev_err(nvme->n_dip, CE_WARN,
"!failed to set number of I/O queues to %d",
nvme->n_intr_cnt);
@@ -2653,20 +2885,55 @@ nvme_init(nvme_t *nvme)
*/
kmem_free(nvme->n_ioq, sizeof (nvme_qpair_t *));
nvme->n_ioq = kmem_zalloc(sizeof (nvme_qpair_t *) *
- (nqueues + 1), KM_SLEEP);
+ (nvme->n_submission_queues + 1), KM_SLEEP);
nvme->n_ioq[0] = nvme->n_adminq;
- nvme->n_ioq_count = nqueues;
+ /*
+ * There should always be at least as many submission queues
+ * as completion queues.
+ */
+ ASSERT(nvme->n_submission_queues >= nvme->n_completion_queues);
+
+ nvme->n_ioq_count = nvme->n_submission_queues;
+
+ nvme->n_io_squeue_len =
+ MIN(nvme->n_io_squeue_len, nvme->n_max_queue_entries);
+
+ (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-squeue-len",
+ nvme->n_io_squeue_len);
+
+ /*
+ * Pre-allocate completion queues.
+ * When the number of submission and completion queues is the same,
+ * there is no value in having a larger completion
+ * queue length.
+ */
+ if (nvme->n_submission_queues == nvme->n_completion_queues)
+ nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
+ nvme->n_io_squeue_len);
+
+ nvme->n_io_cqueue_len = MIN(nvme->n_io_cqueue_len,
+ nvme->n_max_queue_entries);
+
+ (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-cqueue-len",
+ nvme->n_io_cqueue_len);
+
+ if (nvme_create_cq_array(nvme, nvme->n_completion_queues + 1,
+ nvme->n_io_cqueue_len) != DDI_SUCCESS) {
+ dev_err(nvme->n_dip, CE_WARN,
+ "!failed to pre-allocate completion queues");
+ goto fail;
+ }
/*
- * If we got less queues than we asked for we might as well give
+ * If we use fewer completion queues than interrupt vectors, return
* some of the interrupt vectors back to the system.
*/
- if (nvme->n_ioq_count < nvme->n_intr_cnt) {
+ if (nvme->n_completion_queues + 1 < nvme->n_intr_cnt) {
nvme_release_interrupts(nvme);
if (nvme_setup_interrupts(nvme, nvme->n_intr_type,
- nvme->n_ioq_count) != DDI_SUCCESS) {
+ nvme->n_completion_queues + 1) != DDI_SUCCESS) {
dev_err(nvme->n_dip, CE_WARN,
"!failed to reduce number of interrupts");
goto fail;
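A worked walk-through of the negotiation above, with hypothetical numbers;
MIN and all the variables here are local stand-ins, not the driver's:

/*
 * Suppose 8 interrupt vectors, no max-*-queues settings (-1), and a
 * controller that grants 4 SQs and 4 CQs via Set Features.
 */
#include <stdio.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	int intr_cnt = 8;
	int sq = -1, cq = -1;			/* max-*-queues unset */
	int granted_sq = 4, granted_cq = 4;	/* from the controller */

	if (cq == -1)
		cq = intr_cnt;		/* default: one CQ per vector */
	cq = MIN(cq, intr_cnt);
	if (sq == -1)
		sq = cq;		/* default: one SQ per CQ */
	cq = MIN(cq, sq);

	sq = MIN(sq, granted_sq);	/* never exceed what was granted */
	cq = MIN(cq, granted_cq);

	printf("using %d SQs, %d CQs\n", sq, cq);	/* 4 and 4 */
	return (0);
}

In this scenario n_completion_queues + 1 = 5 is below the eight vectors
originally allocated, so the driver would redo interrupt setup with five
vectors, as the hunk above shows.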
@@ -2676,13 +2943,9 @@ nvme_init(nvme_t *nvme)
/*
* Alloc & register I/O queue pairs
*/
- nvme->n_io_queue_len =
- MIN(nvme->n_io_queue_len, nvme->n_max_queue_entries);
- (void) ddi_prop_update_int(DDI_DEV_T_NONE, nvme->n_dip, "io-queue-len",
- nvme->n_io_queue_len);
for (i = 1; i != nvme->n_ioq_count + 1; i++) {
- if (nvme_alloc_qpair(nvme, nvme->n_io_queue_len,
+ if (nvme_alloc_qpair(nvme, nvme->n_io_squeue_len,
&nvme->n_ioq[i], i) != DDI_SUCCESS) {
dev_err(nvme->n_dip, CE_WARN,
"!unable to allocate I/O qpair %d", i);
@@ -2720,7 +2983,6 @@ nvme_intr(caddr_t arg1, caddr_t arg2)
int inum = (int)(uintptr_t)arg2;
int ccnt = 0;
int qnum;
- nvme_cmd_t *cmd;
if (inum >= nvme->n_intr_cnt)
return (DDI_INTR_UNCLAIMED);
@@ -2735,13 +2997,9 @@ nvme_intr(caddr_t arg1, caddr_t arg2)
* in steps of n_intr_cnt to process all queues using this vector.
*/
for (qnum = inum;
- qnum < nvme->n_ioq_count + 1 && nvme->n_ioq[qnum] != NULL;
+ qnum < nvme->n_cq_count && nvme->n_cq[qnum] != NULL;
qnum += nvme->n_intr_cnt) {
- while ((cmd = nvme_retrieve_cmd(nvme, nvme->n_ioq[qnum]))) {
- taskq_dispatch_ent((taskq_t *)cmd->nc_nvme->n_cmd_taskq,
- cmd->nc_callback, cmd, TQ_NOSLEEP, &cmd->nc_tqent);
- ccnt++;
- }
+ ccnt += nvme_process_iocq(nvme, nvme->n_cq[qnum]);
}
return (ccnt > 0 ? DDI_INTR_CLAIMED : DDI_INTR_UNCLAIMED);
@@ -2912,8 +3170,14 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
B_TRUE : B_FALSE;
nvme->n_admin_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
DDI_PROP_DONTPASS, "admin-queue-len", NVME_DEFAULT_ADMIN_QUEUE_LEN);
- nvme->n_io_queue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
- DDI_PROP_DONTPASS, "io-queue-len", NVME_DEFAULT_IO_QUEUE_LEN);
+ nvme->n_io_squeue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+ DDI_PROP_DONTPASS, "io-squeue-len", NVME_DEFAULT_IO_QUEUE_LEN);
+ /*
+ * Double up the default for completion queues in case of
+ * queue sharing.
+ */
+ nvme->n_io_cqueue_len = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+ DDI_PROP_DONTPASS, "io-cqueue-len", 2 * NVME_DEFAULT_IO_QUEUE_LEN);
nvme->n_async_event_limit = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
DDI_PROP_DONTPASS, "async-event-limit",
NVME_DEFAULT_ASYNC_EVENT_LIMIT);
@@ -2923,6 +3187,10 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
nvme->n_min_block_size = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
DDI_PROP_DONTPASS, "min-phys-block-size",
NVME_DEFAULT_MIN_BLOCK_SIZE);
+ nvme->n_submission_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+ DDI_PROP_DONTPASS, "max-submission-queues", -1);
+ nvme->n_completion_queues = ddi_prop_get_int(DDI_DEV_T_ANY, dip,
+ DDI_PROP_DONTPASS, "max-completion-queues", -1);
if (!ISP2(nvme->n_min_block_size) ||
(nvme->n_min_block_size < NVME_DEFAULT_MIN_BLOCK_SIZE)) {
@@ -2933,13 +3201,33 @@ nvme_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
nvme->n_min_block_size = NVME_DEFAULT_MIN_BLOCK_SIZE;
}
+ if (nvme->n_submission_queues != -1 &&
+ (nvme->n_submission_queues < 1 ||
+ nvme->n_submission_queues > UINT16_MAX)) {
+ dev_err(dip, CE_WARN, "!\"submission-queues\"=%d is not "
+ "valid. Must be [1..%d]", nvme->n_submission_queues,
+ UINT16_MAX);
+ nvme->n_submission_queues = -1;
+ }
+
+ if (nvme->n_completion_queues != -1 &&
+ (nvme->n_completion_queues < 1 ||
+ nvme->n_completion_queues > UINT16_MAX)) {
+ dev_err(dip, CE_WARN, "!\"completion-queues\"=%d is not "
+ "valid. Must be [1..%d]", nvme->n_completion_queues,
+ UINT16_MAX);
+ nvme->n_completion_queues = -1;
+ }
+
if (nvme->n_admin_queue_len < NVME_MIN_ADMIN_QUEUE_LEN)
nvme->n_admin_queue_len = NVME_MIN_ADMIN_QUEUE_LEN;
else if (nvme->n_admin_queue_len > NVME_MAX_ADMIN_QUEUE_LEN)
nvme->n_admin_queue_len = NVME_MAX_ADMIN_QUEUE_LEN;
- if (nvme->n_io_queue_len < NVME_MIN_IO_QUEUE_LEN)
- nvme->n_io_queue_len = NVME_MIN_IO_QUEUE_LEN;
+ if (nvme->n_io_squeue_len < NVME_MIN_IO_QUEUE_LEN)
+ nvme->n_io_squeue_len = NVME_MIN_IO_QUEUE_LEN;
+ if (nvme->n_io_cqueue_len < NVME_MIN_IO_QUEUE_LEN)
+ nvme->n_io_cqueue_len = NVME_MIN_IO_QUEUE_LEN;
if (nvme->n_async_event_limit < 1)
nvme->n_async_event_limit = NVME_DEFAULT_ASYNC_EVENT_LIMIT;
@@ -3151,6 +3439,12 @@ nvme_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
if (nvme->n_progress & NVME_ADMIN_QUEUE)
nvme_free_qpair(nvme->n_adminq);
+ if (nvme->n_cq_count > 0) {
+ nvme_destroy_cq_array(nvme, 0);
+ nvme->n_cq = NULL;
+ nvme->n_cq_count = 0;
+ }
+
if (nvme->n_idctl)
kmem_free(nvme->n_idctl, NVME_IDENTIFY_BUFSIZE);
@@ -3222,7 +3516,7 @@ nvme_fill_prp(nvme_cmd_t *cmd, bd_xfer_t *xfer)
xfer->x_ndmac--;
- nprp_page = nvme->n_pagesize / sizeof (uint64_t) - 1;
+ nprp_page = nvme->n_pagesize / sizeof (uint64_t);
ASSERT(nprp_page > 0);
nprp = (xfer->x_ndmac + nprp_page - 1) / nprp_page;
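In concrete numbers, the corrected PRP arithmetic (issue 11230) works out
as in this standalone sketch; the cookie count is hypothetical:

/*
 * With a 4 KiB page, a PRP list page holds 4096 / 8 = 512 entries, so
 * a transfer with 1000 remaining DMA cookies needs ceil(1000/512) = 2
 * PRP list pages.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	size_t pagesize = 4096;
	size_t nprp_page = pagesize / sizeof (uint64_t);	/* 512 */
	unsigned int ndmac = 1000;	/* hypothetical cookie count */
	size_t nprp = (ndmac + nprp_page - 1) / nprp_page;

	printf("%zu PRPs per page, %zu list pages\n", nprp_page, nprp);
	return (0);
}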
@@ -3325,7 +3619,7 @@ nvme_bd_driveinfo(void *arg, bd_drive_t *drive)
* TODO: need to figure out a sane default, or use per-NS I/O queues,
* or change blkdev to handle EAGAIN
*/
- drive->d_qsize = nvme->n_ioq_count * nvme->n_io_queue_len
+ drive->d_qsize = nvme->n_ioq_count * nvme->n_io_squeue_len
/ nvme->n_namespace_count;
/*
@@ -3405,7 +3699,7 @@ nvme_bd_cmd(nvme_namespace_t *ns, bd_xfer_t *xfer, uint8_t opc)
do {
cmd = nvme_retrieve_cmd(nvme, ioq);
if (cmd != NULL)
- nvme_bd_xfer_done(cmd);
+ cmd->nc_callback(cmd);
else
drv_usecwait(10);
} while (ioq->nq_active_cmds != 0);
diff --git a/usr/src/uts/common/io/nvme/nvme.conf b/usr/src/uts/common/io/nvme/nvme.conf
index f174e9cd80..982be2d538 100644
--- a/usr/src/uts/common/io/nvme/nvme.conf
+++ b/usr/src/uts/common/io/nvme/nvme.conf
@@ -9,6 +9,7 @@
#
#
# Copyright 2016 Nexenta Systems, Inc. All rights reserved.
+# Copyright 2019 Western Digital Corporation
#
#
@@ -28,9 +29,28 @@
#admin-queue-len=256;
#
-# The maximum length of the individual I/O queues can be overriden here
-# (16-65536).
-#io-queue-len=1024;
+# The maximum length of the individual I/O submission queues can be
+# overridden here (16-65536).
+#io-squeue-len=1024;
+
+#
+# The maximum length of the individual I/O completion queues can be
+# overridden here (16-65536).
+#io-cqueue-len=2048;
+
+#
+# The number of submission queues can be configured here. The default is
+# to match submission queues 1 for 1 with completion queues.
+# The range is 1-65535.
+#max-submission-queues=65535;
+
+#
+# The number of completion queues can be configured here. The default is
+# one per interrupt vector. Should there be fewer completion queues than
+# vectors, the number of vectors is reduced. The number of completion
+# queues will also be limited to the number of submission queues.
+# The range is 1-65535.
+#max-completion-queues=65535;
#
# The maximum number of outstanding asynchronous event requests can
diff --git a/usr/src/uts/common/io/nvme/nvme_var.h b/usr/src/uts/common/io/nvme/nvme_var.h
index 49d900040e..780205d145 100644
--- a/usr/src/uts/common/io/nvme/nvme_var.h
+++ b/usr/src/uts/common/io/nvme/nvme_var.h
@@ -13,6 +13,7 @@
* Copyright 2018 Nexenta Systems, Inc.
* Copyright 2016 The MathWorks, Inc. All rights reserved.
* Copyright 2017 Joyent, Inc.
+ * Copyright 2019 Western Digital Corporation.
*/
#ifndef _NVME_VAR_H
@@ -33,10 +34,10 @@ extern "C" {
#endif
#define NVME_FMA_INIT 0x1
-#define NVME_REGS_MAPPED 0x2
-#define NVME_ADMIN_QUEUE 0x4
-#define NVME_CTRL_LIMITS 0x8
-#define NVME_INTERRUPTS 0x10
+#define NVME_REGS_MAPPED 0x2
+#define NVME_ADMIN_QUEUE 0x4
+#define NVME_CTRL_LIMITS 0x8
+#define NVME_INTERRUPTS 0x10
#define NVME_MIN_ADMIN_QUEUE_LEN 16
#define NVME_MIN_IO_QUEUE_LEN 16
@@ -52,6 +53,7 @@ typedef struct nvme_namespace nvme_namespace_t;
typedef struct nvme_minor_state nvme_minor_state_t;
typedef struct nvme_dma nvme_dma_t;
typedef struct nvme_cmd nvme_cmd_t;
+typedef struct nvme_cq nvme_cq_t;
typedef struct nvme_qpair nvme_qpair_t;
typedef struct nvme_task_arg nvme_task_arg_t;
@@ -92,6 +94,20 @@ struct nvme_cmd {
nvme_t *nc_nvme;
};
+struct nvme_cq {
+ size_t ncq_nentry;
+ uint16_t ncq_id;
+
+ nvme_dma_t *ncq_dma;
+ nvme_cqe_t *ncq_cq;
+ uint_t ncq_head;
+ uint_t ncq_tail;
+ uintptr_t ncq_hdbl;
+ int ncq_phase;
+
+ kmutex_t ncq_mutex;
+};
+
struct nvme_qpair {
size_t nq_nentry;
@@ -101,16 +117,11 @@ struct nvme_qpair {
uint_t nq_sqtail;
uintptr_t nq_sqtdbl;
- nvme_dma_t *nq_cqdma;
- nvme_cqe_t *nq_cq;
- uint_t nq_cqhead;
- uint_t nq_cqtail;
- uintptr_t nq_cqhdbl;
+ nvme_cq_t *nq_cq;
nvme_cmd_t **nq_cmd;
uint16_t nq_next_cmd;
uint_t nq_active_cmds;
- int nq_phase;
kmutex_t nq_mutex;
ksema_t nq_sema;
@@ -142,7 +153,8 @@ struct nvme {
boolean_t n_strict_version;
boolean_t n_ignore_unknown_vendor_status;
uint32_t n_admin_queue_len;
- uint32_t n_io_queue_len;
+ uint32_t n_io_squeue_len;
+ uint32_t n_io_cqueue_len;
uint16_t n_async_event_limit;
uint_t n_min_block_size;
uint16_t n_abort_command_limit;
@@ -154,6 +166,8 @@ struct nvme {
boolean_t n_auto_pst_supported;
boolean_t n_async_event_supported;
boolean_t n_progress_supported;
+ int n_submission_queues;
+ int n_completion_queues;
int n_nssr_supported;
int n_doorbell_stride;
@@ -165,12 +179,14 @@ struct nvme {
int n_pagesize;
int n_namespace_count;
- uint16_t n_ioq_count;
+ uint_t n_ioq_count;
+ uint_t n_cq_count;
nvme_identify_ctrl_t *n_idctl;
nvme_qpair_t *n_adminq;
nvme_qpair_t **n_ioq;
+ nvme_cq_t **n_cq;
nvme_namespace_t *n_ns;