summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPaul Winder <pwinder@racktopsystems.com>2020-07-24 12:16:05 +0100
committerPaul Winder <paul@winder.uk.net>2020-08-26 15:21:08 +0100
commite1447ca93391f31609bda487cb922dbff9dcdef5 (patch)
treefc963f1786bf7af1a35884957127903bfbc7ad3c
parentdbe930bf51e0d7458b24d30e9f25756c5da54ddf (diff)
downloadillumos-joyent-e1447ca93391f31609bda487cb922dbff9dcdef5.tar.gz
12980 attempting to change MTU on mlxcx based aggregation can induce FMA event
12987 devo_power misconfigured in mlxcx
12988 potential hang in mlxcx when async and ring vectors end up on same CPU
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Reviewed by: Dan McDonald <danmcd@joyent.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
Approved by: Garrett D'Amore <garrett@damore.org>
-rw-r--r--usr/src/uts/common/io/mlxcx/mlxcx.c5
-rw-r--r--usr/src/uts/common/io/mlxcx/mlxcx.h1
-rw-r--r--usr/src/uts/common/io/mlxcx/mlxcx_cmd.c3
-rw-r--r--usr/src/uts/common/io/mlxcx/mlxcx_gld.c27
-rw-r--r--usr/src/uts/common/io/mlxcx/mlxcx_intr.c66
5 files changed, 90 insertions, 12 deletions
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.c b/usr/src/uts/common/io/mlxcx/mlxcx.c
index dbad9be958..6421b47126 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx.c
@@ -1800,7 +1800,7 @@ mlxcx_setup_ports(mlxcx_t *mlxp)
p->mlx_port_event.mla_mlx = mlxp;
p->mlx_port_event.mla_port = p;
mutex_init(&p->mlx_port_event.mla_mtx, NULL,
- MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri));
+ MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
p->mlp_init |= MLXCX_PORT_INIT;
mutex_init(&p->mlp_mtx, NULL, MUTEX_DRIVER,
DDI_INTR_PRI(mlxp->mlx_intr_pri));
@@ -2716,7 +2716,7 @@ mlxcx_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
for (i = 0; i <= MLXCX_FUNC_ID_MAX; i++) {
mlxp->mlx_npages_req[i].mla_mlx = mlxp;
mutex_init(&mlxp->mlx_npages_req[i].mla_mtx, NULL,
- MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_intr_pri));
+ MUTEX_DRIVER, DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
}
mlxp->mlx_attach |= MLXCX_ATTACH_ASYNC_TQ;
@@ -2913,7 +2913,6 @@ static struct dev_ops mlxcx_dev_ops = {
.devo_attach = mlxcx_attach,
.devo_detach = mlxcx_detach,
.devo_reset = nodev,
- .devo_power = ddi_power,
.devo_quiesce = ddi_quiesce_not_supported,
.devo_cb_ops = &mlxcx_cb_ops
};
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h
index 77d36447c6..dd91ea3734 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx.h
+++ b/usr/src/uts/common/io/mlxcx/mlxcx.h
@@ -1082,6 +1082,7 @@ struct mlxcx {
* Interrupts
*/
uint_t mlx_intr_pri;
+ uint_t mlx_async_intr_pri;
uint_t mlx_intr_type; /* always MSI-X */
int mlx_intr_count;
size_t mlx_intr_size; /* allocation size */
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c
index c8eb1335ea..3cb531df39 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_cmd.c
@@ -667,7 +667,8 @@ static void
mlxcx_cmd_init(mlxcx_t *mlxp, mlxcx_cmd_t *cmd)
{
bzero(cmd, sizeof (*cmd));
- mutex_init(&cmd->mlcmd_lock, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&cmd->mlcmd_lock, NULL, MUTEX_DRIVER,
+ DDI_INTR_PRI(mlxp->mlx_async_intr_pri));
cv_init(&cmd->mlcmd_cv, NULL, CV_DRIVER, NULL);
cmd->mlcmd_token = id_alloc(mlxp->mlx_cmd.mcmd_tokens);
cmd->mlcmd_poll = mlxp->mlx_cmd.mcmd_polled;
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
index 89645bb2b1..941eb0f9e7 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
@@ -809,19 +809,32 @@ mlxcx_mac_ring_stop(mac_ring_driver_t rh)
if (wq->mlwq_state & MLXCX_WQ_BUFFERS) {
+ list_t cq_buffers;
+
+ /*
+ * Take the buffers away from the CQ. If the CQ is being
+ * processed and the WQ has been stopped, a completion
+ * which does not match to a buffer will be ignored.
+ */
+ list_create(&cq_buffers, sizeof (mlxcx_buffer_t),
+ offsetof(mlxcx_buffer_t, mlb_cq_entry));
+
+ list_move_tail(&cq_buffers, &cq->mlcq_buffers);
+
+ mutex_enter(&cq->mlcq_bufbmtx);
+ list_move_tail(&cq_buffers, &cq->mlcq_buffers_b);
+ mutex_exit(&cq->mlcq_bufbmtx);
+
+ cq->mlcq_bufcnt = 0;
+
mutex_exit(&wq->mlwq_mtx);
mutex_exit(&cq->mlcq_mtx);
/* Return any outstanding buffers to the free pool. */
- while ((buf = list_remove_head(&cq->mlcq_buffers)) != NULL) {
+ while ((buf = list_remove_head(&cq_buffers)) != NULL) {
mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
}
- mutex_enter(&cq->mlcq_bufbmtx);
- while ((buf = list_remove_head(&cq->mlcq_buffers_b)) != NULL) {
- mlxcx_buf_return_chain(mlxp, buf, B_FALSE);
- }
- mutex_exit(&cq->mlcq_bufbmtx);
- cq->mlcq_bufcnt = 0;
+ list_destroy(&cq_buffers);
s = wq->mlwq_bufs;
mutex_enter(&s->mlbs_mtx);
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c
index f79c148d20..10d8d19b2f 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_intr.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_intr.c
@@ -922,6 +922,20 @@ lookagain:
if (added)
goto lookagain;
+ /*
+ * This check could go just after the lookagain
+ * label, but it is a hot code path so we don't
+ * want to unnecessarily grab a lock and check
+ * a flag for a relatively rare event (the ring
+ * being stopped).
+ */
+ mutex_enter(&wq->mlwq_mtx);
+ if ((wq->mlwq_state & MLXCX_WQ_STARTED) == 0) {
+ mutex_exit(&wq->mlwq_mtx);
+ goto nextcq;
+ }
+ mutex_exit(&wq->mlwq_mtx);
+
buf = list_head(&mlcq->mlcq_buffers);
mlxcx_warn(mlxp, "got completion on CQ %x but "
"no buffer matching wqe found: %x (first "
@@ -1165,6 +1179,7 @@ mlxcx_intr_setup(mlxcx_t *mlxp)
ret = ddi_intr_get_supported_types(dip, &types);
if (ret != DDI_SUCCESS) {
+ mlxcx_warn(mlxp, "Failed to get supported interrupt types");
return (B_FALSE);
}
@@ -1176,6 +1191,7 @@ mlxcx_intr_setup(mlxcx_t *mlxp)
ret = ddi_intr_get_nintrs(dip, DDI_INTR_TYPE_MSIX, &nintrs);
if (ret != DDI_SUCCESS) {
+ mlxcx_warn(mlxp, "Failed to get number of interrupts");
return (B_FALSE);
}
if (nintrs < 2) {
@@ -1203,10 +1219,14 @@ mlxcx_intr_setup(mlxcx_t *mlxp)
ret = ddi_intr_alloc(dip, mlxp->mlx_intr_handles, DDI_INTR_TYPE_MSIX,
0, navail, &mlxp->mlx_intr_count, DDI_INTR_ALLOC_NORMAL);
if (ret != DDI_SUCCESS) {
+ mlxcx_warn(mlxp, "Failed to allocate %d interrupts", navail);
mlxcx_intr_teardown(mlxp);
return (B_FALSE);
}
if (mlxp->mlx_intr_count < mlxp->mlx_intr_cq0 + 1) {
+ mlxcx_warn(mlxp, "%d MSI-X interrupts allocated, but mlxcx "
+ "requires %d", mlxp->mlx_intr_count,
+ mlxp->mlx_intr_cq0 + 1);
mlxcx_intr_teardown(mlxp);
return (B_FALSE);
}
@@ -1214,10 +1234,29 @@ mlxcx_intr_setup(mlxcx_t *mlxp)
ret = ddi_intr_get_pri(mlxp->mlx_intr_handles[0], &mlxp->mlx_intr_pri);
if (ret != DDI_SUCCESS) {
+ mlxcx_warn(mlxp, "Failed to get interrupt priority");
mlxcx_intr_teardown(mlxp);
return (B_FALSE);
}
+ /*
+ * Set the interrupt priority for the asynchronous handler higher
+ * than the ring handlers. Some operations which issue commands,
+ * and thus rely on the async interrupt handler for posting
+ * completion, do so with a CQ mutex held. The CQ mutex is also
+ * acquired during ring processing, so if the ring processing vector
+ * happens to be assigned to the same CPU as the async vector
+ * it can hold off the async interrupt thread and lead to a deadlock.
+ * By assigning a higher priority to the async vector, it will
+ * always be dispatched.
+ */
+ mlxp->mlx_async_intr_pri = mlxp->mlx_intr_pri;
+ if (mlxp->mlx_async_intr_pri < LOCK_LEVEL) {
+ mlxp->mlx_async_intr_pri++;
+ } else {
+ mlxp->mlx_intr_pri--;
+ }
+
mlxp->mlx_eqs_size = mlxp->mlx_intr_count *
sizeof (mlxcx_event_queue_t);
mlxp->mlx_eqs = kmem_zalloc(mlxp->mlx_eqs_size, KM_SLEEP);
@@ -1227,8 +1266,11 @@ mlxcx_intr_setup(mlxcx_t *mlxp)
* mutex and avl tree to be init'ed - so do it now.
*/
for (i = 0; i < mlxp->mlx_intr_count; ++i) {
+ uint_t pri = (i == 0) ? mlxp->mlx_async_intr_pri :
+ mlxp->mlx_intr_pri;
+
mutex_init(&mlxp->mlx_eqs[i].mleq_mtx, NULL, MUTEX_DRIVER,
- DDI_INTR_PRI(mlxp->mlx_intr_pri));
+ DDI_INTR_PRI(pri));
cv_init(&mlxp->mlx_eqs[i].mleq_cv, NULL, CV_DRIVER, NULL);
if (i < mlxp->mlx_intr_cq0)
@@ -1239,9 +1281,19 @@ mlxcx_intr_setup(mlxcx_t *mlxp)
offsetof(mlxcx_completion_queue_t, mlcq_eq_entry));
}
+ ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[0],
+ mlxp->mlx_async_intr_pri);
+ if (ret != DDI_SUCCESS) {
+ mlxcx_warn(mlxp, "Failed to set interrupt priority to %u for "
+ "async interrupt vector", mlxp->mlx_async_intr_pri);
+ mlxcx_intr_teardown(mlxp);
+ return (B_FALSE);
+ }
+
ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[0], mlxcx_intr_async,
(caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[0]);
if (ret != DDI_SUCCESS) {
+ mlxcx_warn(mlxp, "Failed to add async interrupt handler");
mlxcx_intr_teardown(mlxp);
return (B_FALSE);
}
@@ -1268,9 +1320,21 @@ mlxcx_intr_setup(mlxcx_t *mlxp)
eqt = MLXCX_EQ_TYPE_RX;
}
+ ret = ddi_intr_set_pri(mlxp->mlx_intr_handles[i],
+ mlxp->mlx_intr_pri);
+ if (ret != DDI_SUCCESS) {
+ mlxcx_warn(mlxp, "Failed to set interrupt priority to "
+ "%u for interrupt vector %d", mlxp->mlx_intr_pri,
+ i);
+ mlxcx_intr_teardown(mlxp);
+ return (B_FALSE);
+ }
+
ret = ddi_intr_add_handler(mlxp->mlx_intr_handles[i],
mlxcx_intr_n, (caddr_t)mlxp, (caddr_t)&mlxp->mlx_eqs[i]);
if (ret != DDI_SUCCESS) {
+ mlxcx_warn(mlxp, "Failed to add interrupt handler %d",
+ i);
mlxcx_intr_teardown(mlxp);
return (B_FALSE);
}