commit    6fcbeecca7f56c616db6edcc081a5ad4fea1ac05
tree      64e2369c2120a6098fd184d78494ce5c6af48a32
parent    6b8372c7be33da3e049ec99ed01e05fc957aa76c
parent    31f2b09b95899b5b69759d52e8abc6cf122d0e74
author    John Levon <john.levon@joyent.com>  2020-04-27 12:13:35 +0100
committer GitHub <noreply@github.com>  2020-04-27 12:13:35 +0100
    Merge branch 'master' into OS-8161
-rw-r--r--  usr/src/boot/sys/boot/forth/joyent.menu.rc  |  32
-rw-r--r--  usr/src/uts/common/fs/zfs/zvol.c             |   4
-rw-r--r--  usr/src/uts/common/inet/cc/cc_cubic.c        |  79
-rw-r--r--  usr/src/uts/common/inet/cc/cc_cubic.h        |  17
-rw-r--r--  usr/src/uts/common/inet/cc/cc_newreno.c      |  18
-rw-r--r--  usr/src/uts/common/io/bge/bge_chip2.c        |   4
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx.h          |   4
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_gld.c      |  13
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_reg.h      |  12
-rw-r--r--  usr/src/uts/common/io/mlxcx/mlxcx_ring.c     | 146
-rw-r--r--  usr/src/uts/intel/os/driver_aliases          |  40
-rw-r--r--  usr/src/uts/sun4u/cpu/spitfire.c             |  28

12 files changed, 268 insertions(+), 129 deletions(-)
diff --git a/usr/src/boot/sys/boot/forth/joyent.menu.rc b/usr/src/boot/sys/boot/forth/joyent.menu.rc
index b9c24cc790..7342c1fd6d 100644
--- a/usr/src/boot/sys/boot/forth/joyent.menu.rc
+++ b/usr/src/boot/sys/boot/forth/joyent.menu.rc
@@ -77,29 +77,17 @@
 set optionsmenu_keycode[3]=118
 set optionsansi_caption[3]="^[1mV^[merbose............... ^[34;1mOff^[m"
 set optionstoggled_ansi[3]="^[1mV^[merbose............... ^[32;7mOn^[m"
 
-set optionsmenu_init[4]="init_kmdb"
-set optionsmenu_caption[4]="Load [k]mdb........... Off"
-set optionstoggled_text[4]="Load [k]mdb........... On"
-set optionsmenu_command[4]="toggle_kmdb"
+set optionsmenu_kmdb=4
+set optionsmenu_command[4]="cycle_kmdb"
 set optionsmenu_keycode[4]=107
-set optionsansi_caption[4]="Load ^[1mk^[mmdb............. ^[34;1mOff^[m"
-set optionstoggled_ansi[4]="Load ^[1mk^[mmdb............. ^[32;7mOn^[m"
-
-set optionsmenu_init[5]="init_drop_into_kmdb"
-set optionsmenu_caption[5]="[D]rop into kmdb...... Off"
-set optionstoggled_text[5]="[D]rop into kmdb...... On"
-set optionsmenu_command[5]="toggle_drop_into_kmdb"
-set optionsmenu_keycode[5]=100
-set optionsansi_caption[5]="^[1mD^[mrop into kmdb........ ^[34;1mOff^[m"
-set optionstoggled_ansi[5]="^[1mD^[mrop into kmdb........ ^[32;7mOn^[m"
-
-set optionsmenu_init[6]="init_rescue"
-set optionsmenu_caption[6]="[R]escue Mode......... Off"
-set optionstoggled_text[6]="[R]escue Mode......... On"
-set optionsmenu_command[6]="toggle_rescue"
-set optionsmenu_keycode[6]=114
-set optionsansi_caption[6]="^[1mR^[mescue Mode........... ^[34;1mOff^[m"
-set optionstoggled_ansi[6]="^[1mR^[mescue Mode........... ^[32;7mOn^[m"
+
+set optionsmenu_init[5]="init_rescue"
+set optionsmenu_caption[5]="[R]escue Mode......... Off"
+set optionstoggled_text[5]="[R]escue Mode......... On"
+set optionsmenu_command[5]="toggle_rescue"
+set optionsmenu_keycode[5]=114
+set optionsansi_caption[5]="^[1mR^[mescue Mode........... ^[34;1mOff^[m"
+set optionstoggled_ansi[5]="^[1mR^[mescue Mode........... ^[32;7mOn^[m"
 \ Enable automatic booting (add ``autoboot_delay=N'' to loader.conf(5) to
 \ customize the timeout; default is 10-seconds)
diff --git a/usr/src/uts/common/fs/zfs/zvol.c b/usr/src/uts/common/fs/zfs/zvol.c
index 3ed5977c20..2e684a5ff0 100644
--- a/usr/src/uts/common/fs/zfs/zvol.c
+++ b/usr/src/uts/common/fs/zfs/zvol.c
@@ -24,7 +24,7 @@
  * Portions Copyright 2010 Robert Milkowski
  *
  * Copyright 2017 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2019 Joyent, Inc.
  */
@@ -1499,7 +1499,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr)
 			bytes = volsize - off;
 
 		tot_bytes += bytes;
-		dmu_tx_hold_write(tx, ZVOL_OBJ, off, bytes);
+		dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes);
 		error = dmu_tx_assign(tx, TXG_WAIT);
 		if (error) {
 			dmu_tx_abort(tx);
diff --git a/usr/src/uts/common/inet/cc/cc_cubic.c b/usr/src/uts/common/inet/cc/cc_cubic.c
index 11c238afd8..bb26a2358d 100644
--- a/usr/src/uts/common/inet/cc/cc_cubic.c
+++ b/usr/src/uts/common/inet/cc/cc_cubic.c
@@ -4,6 +4,7 @@
  * All rights reserved.
  * Copyright (c) 2017 by Delphix. All rights reserved.
  * Copyright 2019 Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
@@ -85,6 +86,7 @@ static void	cubic_conn_init(struct cc_var *ccv);
 static void	cubic_post_recovery(struct cc_var *ccv);
 static void	cubic_record_rtt(struct cc_var *ccv);
 static void	cubic_ssthresh_update(struct cc_var *ccv);
+static void	cubic_after_idle(struct cc_var *ccv);
 
 struct cubic {
 	/* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */
@@ -115,6 +117,7 @@ struct cc_algo cubic_cc_algo = {
 	.cong_signal = cubic_cong_signal,
 	.conn_init = cubic_conn_init,
 	.post_recovery = cubic_post_recovery,
+	.after_idle = cubic_after_idle,
 };
 
 int
@@ -129,7 +132,7 @@ _init(void)
 		if ((err = mod_install(&cc_cubic_modlinkage)) != 0)
 			(void) cc_deregister_algo(&cubic_cc_algo);
 	}
-	cubic_cc_algo.after_idle = newreno_cc_algo->after_idle;
+
 	return (err);
 }
 
@@ -195,19 +198,22 @@ cubic_ack_received(struct cc_var *ccv, uint16_t type)
 				 * TCP-friendly region, follow tf
 				 * cwnd growth.
 				 */
-				CCV(ccv, tcp_cwnd) = w_tf;
+				if (CCV(ccv, tcp_cwnd) < w_tf)
+					CCV(ccv, tcp_cwnd) = w_tf;
 			} else if (CCV(ccv, tcp_cwnd) < w_cubic_next) {
 				/*
 				 * Concave or convex region, follow CUBIC
 				 * cwnd growth.
 				 */
 				if (CC_ABC(ccv))
-					CCV(ccv, tcp_cwnd) = w_cubic_next;
+					CCV(ccv, tcp_cwnd) = MIN(w_cubic_next,
+					    INT_MAX);
 				else
-					CCV(ccv, tcp_cwnd) += ((w_cubic_next -
+					CCV(ccv, tcp_cwnd) += MAX(1,
+					    ((MIN(w_cubic_next, INT_MAX) -
 					    CCV(ccv, tcp_cwnd)) *
 					    CCV(ccv, tcp_mss)) /
-					    CCV(ccv, tcp_cwnd);
+					    CCV(ccv, tcp_cwnd));
 			}
 
 			/*
@@ -218,12 +224,34 @@ cubic_ack_received(struct cc_var *ccv, uint16_t type)
 			 * max_cwnd.
 			 */
 			if (cubic_data->num_cong_events == 0 &&
-			    cubic_data->max_cwnd < CCV(ccv, tcp_cwnd))
+			    cubic_data->max_cwnd < CCV(ccv, tcp_cwnd)) {
 				cubic_data->max_cwnd = CCV(ccv, tcp_cwnd);
+				cubic_data->K = cubic_k(cubic_data->max_cwnd /
+				    CCV(ccv, tcp_mss));
+			}
 		}
 	}
 }
 
+/*
+ * This is a Cubic specific implementation of after_idle.
+ * - Reset cwnd by calling New Reno implementation of after_idle.
+ * - Reset t_last_cong.
+ */
+static void
+cubic_after_idle(struct cc_var *ccv)
+{
+	struct cubic *cubic_data;
+
+	cubic_data = ccv->cc_data;
+
+	cubic_data->max_cwnd = max(cubic_data->max_cwnd, CCV(ccv, tcp_cwnd));
+	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, tcp_mss));
+
+	newreno_cc_algo->after_idle(ccv);
+	cubic_data->t_last_cong = gethrtime();
+}
+
 static void
 cubic_cb_destroy(struct cc_var *ccv)
 {
@@ -237,7 +265,7 @@ cubic_cb_init(struct cc_var *ccv)
 {
 	struct cubic *cubic_data;
 
-	cubic_data = kmem_alloc(sizeof (struct cubic), KM_NOSLEEP);
+	cubic_data = kmem_zalloc(sizeof (struct cubic), KM_NOSLEEP);
 	if (cubic_data == NULL)
 		return (ENOMEM);
 
@@ -330,6 +358,7 @@ static void
 cubic_post_recovery(struct cc_var *ccv)
 {
 	struct cubic *cubic_data;
+	uint32_t mss, pipe;
 
 	cubic_data = ccv->cc_data;
 
@@ -339,11 +368,39 @@ cubic_post_recovery(struct cc_var *ccv)
 		    >> CUBIC_SHIFT;
 	}
 
+	/*
+	 * There is a risk that if the cwnd becomes less than mss, and
+	 * we do not get enough acks to drive it back up beyond mss,
+	 * we will stop transmitting data altogether.
+	 *
+	 * The Cubic RFC defines values in terms of units of mss. Therefore
+	 * we must make sure we have at least 1 mss to make progress
+	 * since the algorthm is written that way.
+	 */
+	mss = CCV(ccv, tcp_mss);
+
 	if (IN_FASTRECOVERY(ccv->flags)) {
-		/* Update cwnd based on beta and adjusted max_cwnd. */
-		CCV(ccv, tcp_cwnd) = max(1, ((CUBIC_BETA *
-		    cubic_data->max_cwnd) >> CUBIC_SHIFT));
+		/*
+		 * If inflight data is less than ssthresh, set cwnd
+		 * conservatively to avoid a burst of data, as suggested in
+		 * the NewReno RFC. Otherwise, use the CUBIC method.
+		 */
+		pipe = CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna);
+		if (pipe < CCV(ccv, tcp_cwnd_ssthresh)) {
+			/*
+			 * Ensure that cwnd does not collapse to 1 MSS under
+			 * adverse conditions. Implements RFC6582
+			 */
+			CCV(ccv, tcp_cwnd) = MAX(pipe, mss) + mss;
+		} else {
+			/* Update cwnd based on beta and adjusted max_cwnd. */
+			CCV(ccv, tcp_cwnd) = max(mss, ((CUBIC_BETA *
+			    cubic_data->max_cwnd) >> CUBIC_SHIFT));
+		}
+	} else {
+		CCV(ccv, tcp_cwnd) = max(mss, CCV(ccv, tcp_cwnd));
 	}
+
 	cubic_data->t_last_cong = gethrtime();
 
 	/* Calculate the average RTT between congestion epochs. */
@@ -355,7 +412,7 @@ cubic_post_recovery(struct cc_var *ccv)
 
 	cubic_data->epoch_ack_count = 0;
 	cubic_data->sum_rtt_nsecs = 0;
-	cubic_data->K = cubic_k(cubic_data->max_cwnd / CCV(ccv, tcp_mss));
+	cubic_data->K = cubic_k(cubic_data->max_cwnd / mss);
 }
 
 /*
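Note: the MAX(1, ...) floor added to cubic_ack_received() above exists because the per-ACK increment is computed with integer division, which rounds down to zero once the gap to the CUBIC target window is small relative to cwnd, silently stalling growth. A minimal userland sketch of the failure mode (values are illustrative, not kernel code):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint32_t cwnd = 1000000;		/* current congestion window, bytes */
	uint32_t w_cubic_next = 1000400;	/* CUBIC target window */
	uint32_t mss = 1460;

	/* The pre-floor per-ACK increment from cubic_ack_received(). */
	uint32_t incr = (uint32_t)(((uint64_t)(w_cubic_next - cwnd) * mss) /
	    cwnd);

	printf("raw increment = %u\n", incr);	/* 400 * 1460 / 1000000 == 0 */
	printf("floored increment = %u\n", incr > 0 ? incr : 1);
	return (0);
}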
diff --git a/usr/src/uts/common/inet/cc/cc_cubic.h b/usr/src/uts/common/inet/cc/cc_cubic.h
index c87751d257..cc6e6e459a 100644
--- a/usr/src/uts/common/inet/cc/cc_cubic.h
+++ b/usr/src/uts/common/inet/cc/cc_cubic.h
@@ -4,6 +4,7 @@
  * All rights reserved.
  * Copyright (c) 2017 by Delphix. All rights reserved.
  * Copyright 2019 Joyent, Inc.
+ * Copyright 2020 RackTop Systems, Inc.
 *
 * This software was developed by Lawrence Stewart while studying at the Centre
 * for Advanced Internet Architectures, Swinburne University of Technology, made
@@ -70,6 +71,12 @@
 /* Don't trust s_rtt until this many rtt samples have been taken. */
 #define	CUBIC_MIN_RTT_SAMPLES	8
 
+/*
+ * (2^21)^3 is long max. Dividing (2^63) by Cubic_C_factor
+ * and taking cube-root yields 448845 as the effective useful limit
+ */
+#define	CUBED_ROOT_MAX_ULONG	448845
+
 /* Userland only bits. */
 #ifndef _KERNEL
 
@@ -188,6 +195,11 @@ cubic_cwnd(hrtime_t nsecs_since_cong, uint32_t wmax, uint32_t smss, int64_t K)
 	 */
 	cwnd = (t - K * MILLISEC) / MILLISEC;
 
+	if (cwnd > CUBED_ROOT_MAX_ULONG)
+		return (INT_MAX);
+	if (cwnd < -CUBED_ROOT_MAX_ULONG)
+		return (0);
+
 	/* cwnd = (t - K)^3, with CUBIC_SHIFT^3 worth of precision. */
 	cwnd *= (cwnd * cwnd);
 
@@ -199,7 +211,10 @@ cubic_cwnd(hrtime_t nsecs_since_cong, uint32_t wmax, uint32_t smss, int64_t K)
 	 */
 	cwnd = ((cwnd * CUBIC_C_FACTOR * smss) >> CUBIC_SHIFT_4) + wmax;
 
-	return ((uint32_t)cwnd);
+	/*
+	 * for negative cwnd, limiting to zero as lower bound
+	 */
+	return (max(0, cwnd));
 }
 
 /*
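Note: the bound in the new CUBED_ROOT_MAX_ULONG guard follows from the comment's own derivation (assuming CUBIC_C_FACTOR is 102, i.e. roughly 0.4 scaled up by CUBIC_SHIFT, as in the FreeBSD code this header derives from):

\[
\left(\frac{2^{63}}{102}\right)^{1/3} \approx \left(9.04 \times 10^{16}\right)^{1/3} \approx 448845
\]

Beyond that magnitude, cubing cwnd and then multiplying by CUBIC_C_FACTOR would overflow the signed 64-bit intermediate, so cubic_cwnd() now saturates early: INT_MAX on the positive side, 0 on the negative.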
diff --git a/usr/src/uts/common/inet/cc/cc_newreno.c b/usr/src/uts/common/inet/cc/cc_newreno.c
index ceb76d8643..5cb1c32534 100644
--- a/usr/src/uts/common/inet/cc/cc_newreno.c
+++ b/usr/src/uts/common/inet/cc/cc_newreno.c
@@ -7,6 +7,7 @@
  * Copyright (c) 2010 The FreeBSD Foundation
  * All rights reserved.
  * Copyright (c) 2017 by Delphix. All rights reserved.
+ * Copyright 2020 RackTop Systems, Inc.
 *
 * This software was developed at the Centre for Advanced Internet
 * Architectures, Swinburne University of Technology, by Lawrence Stewart, James
@@ -256,12 +257,25 @@ newreno_cong_signal(struct cc_var *ccv, uint32_t type)
 static void
 newreno_post_recovery(struct cc_var *ccv)
 {
+	uint32_t pipe;
+
 	if (IN_FASTRECOVERY(ccv->flags)) {
 		/*
 		 * Fast recovery will conclude after returning from this
-		 * function.
+		 * function. Window inflation should have left us with
+		 * approximately cwnd_ssthresh outstanding data. But in case we
+		 * would be inclined to send a burst, better to do it via the
+		 * slow start mechanism.
 		 */
-		if (CCV(ccv, tcp_cwnd) > CCV(ccv, tcp_cwnd_ssthresh)) {
+		pipe = CCV(ccv, tcp_snxt) - CCV(ccv, tcp_suna);
+		if (pipe < CCV(ccv, tcp_cwnd_ssthresh)) {
+			/*
+			 * Ensure that cwnd does not collapse to 1 MSS under
+			 * adverse conditions. Implements RFC6582
+			 */
+			CCV(ccv, tcp_cwnd) = MAX(pipe, CCV(ccv, tcp_mss)) +
+			    CCV(ccv, tcp_mss);
+		} else if (CCV(ccv, tcp_cwnd) > CCV(ccv, tcp_cwnd_ssthresh)) {
 			CCV(ccv, tcp_cwnd) = CCV(ccv, tcp_cwnd_ssthresh);
 		}
 	}
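Note: the pipe-based deflation here is the same RFC 6582 guard applied by the cc_cubic.c change above. A compilable userland sketch of the shared calculation (names are simplified stand-ins; the final line represents the per-algorithm fallback, ssthresh in NewReno's case):

#include <stdio.h>
#include <stdint.h>

#define	MAX(a, b)	((a) > (b) ? (a) : (b))

static uint32_t
post_recovery_cwnd(uint32_t snxt, uint32_t suna, uint32_t ssthresh,
    uint32_t mss, uint32_t cwnd)
{
	uint32_t pipe = snxt - suna;	/* bytes still in flight */

	/*
	 * With little data outstanding, deflate to pipe plus one MSS so
	 * at least one new segment can always be sent, instead of
	 * letting cwnd collapse toward a single MSS.
	 */
	if (pipe < ssthresh)
		return (MAX(pipe, mss) + mss);
	return (cwnd > ssthresh ? ssthresh : cwnd);
}

int
main(void)
{
	/* Nearly everything was acked during recovery: pipe is tiny. */
	printf("%u\n", post_recovery_cwnd(101000, 100000, 20000, 1460,
	    30000));	/* prints 2920, i.e. MAX(pipe, mss) + mss */
	return (0);
}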
diff --git a/usr/src/uts/common/io/bge/bge_chip2.c b/usr/src/uts/common/io/bge/bge_chip2.c
index 14797ac90f..48d7ed0e0a 100644
--- a/usr/src/uts/common/io/bge/bge_chip2.c
+++ b/usr/src/uts/common/io/bge/bge_chip2.c
@@ -1005,8 +1005,8 @@ bge_nic_get64(bge_t *bgep, bge_regno_t addr)
 #elif defined(__sparc)
 	if (DEVICE_5723_SERIES_CHIPSETS(bgep) ||
 	    DEVICE_5717_SERIES_CHIPSETS(bgep) ||
-	    DEVICE_5725_SERIES_CHIPSETS(bgep ||
-	    DEVICE_57765_SERIES_CHIPSETS(bgep))) {
+	    DEVICE_5725_SERIES_CHIPSETS(bgep) ||
+	    DEVICE_57765_SERIES_CHIPSETS(bgep)) {
 		data = ddi_get32(bgep->io_handle, PIO_ADDR(bgep, addr));
 		data <<= 32;
 		data |= ddi_get32(bgep->io_handle,
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx.h b/usr/src/uts/common/io/mlxcx/mlxcx.h
index bf07691095..da048b4ac3 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx.h
+++ b/usr/src/uts/common/io/mlxcx/mlxcx.h
@@ -1172,8 +1172,8 @@ extern void mlxcx_buf_return(mlxcx_t *, mlxcx_buffer_t *);
 extern void mlxcx_buf_return_chain(mlxcx_t *, mlxcx_buffer_t *, boolean_t);
 extern void mlxcx_buf_destroy(mlxcx_t *, mlxcx_buffer_t *);
 
-extern mlxcx_buffer_t *mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *,
-    mblk_t *, size_t);
+extern uint_t mlxcx_buf_bind_or_copy(mlxcx_t *, mlxcx_work_queue_t *,
+    mblk_t *, size_t, mlxcx_buffer_t **);
 
 extern boolean_t mlxcx_rx_group_setup(mlxcx_t *, mlxcx_ring_group_t *);
 extern boolean_t mlxcx_tx_group_setup(mlxcx_t *, mlxcx_ring_group_t *);
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
index a1d50659c1..a08cec3980 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_gld.c
@@ -395,6 +395,7 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
 	uint32_t chkflags = 0;
 	boolean_t ok;
 	size_t take = 0;
+	uint_t bcount;
 
 	VERIFY(mp->b_next == NULL);
 
@@ -430,8 +431,8 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
 		}
 	}
 
-	b = mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take);
-	if (b == NULL) {
+	bcount = mlxcx_buf_bind_or_copy(mlxp, sq, kmp, take, &b);
+	if (bcount == 0) {
 		atomic_or_uint(&sq->mlwq_state, MLXCX_WQ_BLOCKED_MAC);
 		return (mp);
 	}
@@ -457,10 +458,12 @@ mlxcx_mac_ring_tx(void *arg, mblk_t *mp)
 	}
 
 	/*
-	 * Similar logic here: bufcnt is only manipulated atomically, and
-	 * bufhwm is set at startup.
+	 * If the completion queue buffer count is already at or above
+	 * the high water mark, or the addition of this new chain will
+	 * exceed the CQ ring size, then indicate we are blocked.
 	 */
-	if (cq->mlcq_bufcnt >= cq->mlcq_bufhwm) {
+	if (cq->mlcq_bufcnt >= cq->mlcq_bufhwm ||
+	    (cq->mlcq_bufcnt + bcount) > cq->mlcq_nents) {
 		atomic_or_uint(&cq->mlcq_state, MLXCX_CQ_BLOCKED_MAC);
 		goto blocked;
 	}
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h
index f65280d41d..6d09abea5c 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_reg.h
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_reg.h
@@ -390,8 +390,16 @@ typedef enum {
 	MLXCX_WQE_OP_RDMA_R		= 0x10,
 } mlxcx_wqe_opcode_t;
 
+#define	MLXCX_WQE_OCTOWORD	16
 #define	MLXCX_SQE_MAX_DS	((1 << 6) - 1)
-#define	MLXCX_SQE_MAX_PTRS	61
+/*
+ * Calculate the max number of address pointers in a single ethernet
+ * send message. This is the remainder from MLXCX_SQE_MAX_DS
+ * after accounting for the Control and Ethernet segements.
+ */
+#define	MLXCX_SQE_MAX_PTRS	(MLXCX_SQE_MAX_DS - \
+	(sizeof (mlxcx_wqe_eth_seg_t) + sizeof (mlxcx_wqe_control_seg_t)) / \
+	MLXCX_WQE_OCTOWORD)
 
 typedef enum {
 	MLXCX_SQE_FENCE_NONE		= 0x0,
@@ -2497,6 +2505,8 @@ typedef struct {
 
 #pragma pack()
 
+CTASSERT(MLXCX_SQE_MAX_PTRS > 0);
+
 #ifdef __cplusplus
 }
 #endif
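Note: working the new MLXCX_SQE_MAX_PTRS macro through with concrete numbers (the segment sizes are assumptions here, taken from the nominal 16-byte mlxcx_wqe_control_seg_t and 32-byte mlxcx_wqe_eth_seg_t definitions elsewhere in this header), where each descriptor-slot unit is one 16-byte octoword:

	MLXCX_SQE_MAX_DS    = (1 << 6) - 1     = 63 octowords
	segment overhead    = (32 + 16) / 16   =  3 octowords
	MLXCX_SQE_MAX_PTRS  = 63 - 3           = 60 pointers

i.e. one fewer than the hardcoded 61 it replaces, and the CTASSERT added at the bottom of the header guards the subtraction against ever going non-positive if those structures grow.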
diff --git a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c
index 2305f943a7..492f8fd8a5 100644
--- a/usr/src/uts/common/io/mlxcx/mlxcx_ring.c
+++ b/usr/src/uts/common/io/mlxcx/mlxcx_ring.c
@@ -25,6 +25,7 @@
 #include <sys/sysmacros.h>
 #include <sys/atomic.h>
 #include <sys/cpuvar.h>
+#include <sys/sdt.h>
 
 #include <sys/pattr.h>
 #include <sys/dlpi.h>
@@ -1567,8 +1568,8 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
 		    inlinelen);
 	}
 
-	ent0->mlsqe_control.mlcs_ds =
-	    offsetof(mlxcx_sendq_ent_t, mlsqe_data) / 16;
+	ent0->mlsqe_control.mlcs_ds = offsetof(mlxcx_sendq_ent_t, mlsqe_data) /
+	    MLXCX_WQE_OCTOWORD;
 
 	if (chkflags & HCK_IPV4_HDRCKSUM) {
 		ASSERT(mlxp->mlx_caps->mlc_checksum);
@@ -1653,7 +1654,20 @@ mlxcx_sq_add_buffer(mlxcx_t *mlxp, mlxcx_work_queue_t *mlwq,
 	/*
 	 * Make sure the workqueue entry is flushed out before updating
 	 * the doorbell.
+	 * If the ring has wrapped, we need to flush the front and back.
 	 */
+	if ((first + ents) > mlwq->mlwq_nents) {
+		uint_t sync_cnt = mlwq->mlwq_nents - first;
+
+		VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
+		    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
+		    sync_cnt * sizeof (mlxcx_sendq_ent_t),
+		    DDI_DMA_SYNC_FORDEV));
+
+		ent0 = &mlwq->mlwq_send_ent[0];
+		ents -= sync_cnt;
+	}
+
 	VERIFY0(ddi_dma_sync(mlwq->mlwq_dma.mxdb_dma_handle,
 	    (uintptr_t)ent0 - (uintptr_t)mlwq->mlwq_send_ent,
 	    ents * sizeof (mlxcx_sendq_ent_t), DDI_DMA_SYNC_FORDEV));
@@ -2205,58 +2219,64 @@ copyb:
 	return (b);
 }
 
-mlxcx_buffer_t *
-mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
-    mblk_t *mpb, size_t off)
+static mlxcx_buffer_t *
+mlxcx_bind_or_copy_mblk(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
+    mblk_t *mp, size_t off)
 {
-	mlxcx_buffer_t *b, *b0 = NULL;
-	boolean_t first = B_TRUE;
-	mblk_t *mp;
+	mlxcx_buffer_t *b;
 	uint8_t *rptr;
 	size_t sz;
-	size_t ncookies = 0;
 	boolean_t ret;
 
-	for (mp = mpb; mp != NULL; mp = mp->b_cont) {
-		rptr = mp->b_rptr;
-		sz = MBLKL(mp);
+	rptr = mp->b_rptr;
+	sz = MBLKL(mp);
 
-		if (off > 0)
-			ASSERT3U(off, <, sz);
-		rptr += off;
-		sz -= off;
+#ifdef DEBUG
+	if (off > 0) {
+		ASSERT3U(off, <, sz);
+	}
+#endif
 
-		if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
-			b = mlxcx_copy_data(mlxp, wq, rptr, sz);
-			if (b == NULL)
-				goto failed;
-		} else {
-			b = mlxcx_buf_take_foreign(mlxp, wq);
-			if (b == NULL)
-				goto failed;
+	rptr += off;
+	sz -= off;
 
-			ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
-			    B_FALSE);
+	if (sz < mlxp->mlx_props.mldp_tx_bind_threshold) {
+		b = mlxcx_copy_data(mlxp, wq, rptr, sz);
+	} else {
+		b = mlxcx_buf_take_foreign(mlxp, wq);
+		if (b == NULL)
+			return (NULL);
 
-			if (!ret) {
-				mlxcx_buf_return(mlxp, b);
+		ret = mlxcx_dma_bind_mblk(mlxp, &b->mlb_dma, mp, off,
+		    B_FALSE);
 
-				b = mlxcx_copy_data(mlxp, wq, rptr, sz);
-				if (b == NULL)
-					goto failed;
-			}
+		if (!ret) {
+			mlxcx_buf_return(mlxp, b);
+
+			b = mlxcx_copy_data(mlxp, wq, rptr, sz);
 		}
+	}
+
+	return (b);
+}
+
+uint_t
+mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
+    mblk_t *mpb, size_t off, mlxcx_buffer_t **bp)
+{
+	mlxcx_buffer_t *b, *b0 = NULL;
+	boolean_t first = B_TRUE;
+	mblk_t *mp;
+	size_t offset = off;
+	size_t ncookies = 0;
+	uint_t count = 0;
+
+	for (mp = mpb; mp != NULL && ncookies <= MLXCX_SQE_MAX_PTRS;
+	    mp = mp->b_cont) {
+		b = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, offset);
+		if (b == NULL)
+			goto failed;
 
-		/*
-		 * We might overestimate here when we've copied data, since
-		 * the buffer might be longer than what we copied into it. This
-		 * is safe since it's always wrong in the conservative
-		 * direction (and we will blow up later when we actually
-		 * generate the WQE anyway).
-		 *
-		 * If the assert below ever blows, we'll have to come and fix
-		 * this up so we can transmit these packets.
-		 */
 		ncookies += b->mlb_dma.mxdb_ncookies;
 
 		if (first)
@@ -2267,23 +2287,55 @@ mlxcx_buf_bind_or_copy(mlxcx_t *mlxp, mlxcx_work_queue_t *wq,
 		b->mlb_tx_mp = mp;
 		b->mlb_tx_head = b0;
-		b->mlb_used = sz;
+		b->mlb_used = MBLKL(mp) - offset;
 
 		if (!first)
 			list_insert_tail(&b0->mlb_tx_chain, b);
 		first = B_FALSE;
-		off = 0;
+		offset = 0;
+
+		count++;
+	}
+
+	/*
+	 * The chain of mblks has resulted in too many cookies for
+	 * a single message. This is unusual, so take the hit to tidy
+	 * up, do a pullup to a single mblk and allocate the requisite
+	 * buf.
+	 */
+	if (ncookies > MLXCX_SQE_MAX_PTRS) {
+		DTRACE_PROBE4(pullup, mlxcx_t *, mlxp, mlxcx_work_queue_t *, wq,
+		    mblk_t *, mpb, size_t, ncookies);
+
+		if (b0 != NULL)
+			mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
+
+		if ((mp = msgpullup(mpb, -1)) == NULL)
+			return (0);
+
+		b0 = mlxcx_bind_or_copy_mblk(mlxp, wq, mp, off);
+		if (b0 == NULL) {
+			freemsg(mp);
+			return (0);
+		}
+		freemsg(mpb);
+
+		b0->mlb_tx_mp = mp;
+		b0->mlb_tx_head = b0;
+		b0->mlb_used = MBLKL(mp) - off;
+
+		count = 1;
 	}
 
-	ASSERT3U(ncookies, <=, MLXCX_SQE_MAX_PTRS);
+	*bp = b0;
 
-	return (b0);
+	return (count);
 
 failed:
 	if (b0 != NULL)
 		mlxcx_buf_return_chain(mlxp, b0, B_TRUE);
 
-	return (NULL);
+	return (0);
 }
 
 mlxcx_buffer_t *
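Note: a compilable userland sketch of the ring-wrap arithmetic behind the two-part ddi_dma_sync() added to mlxcx_sq_add_buffer() above; the types and names here are illustrative, not the driver's:

#include <stdio.h>
#include <stdint.h>

struct range {
	uint32_t start;	/* first entry index */
	uint32_t count;	/* number of entries */
};

/*
 * Split a run of `ents` entries beginning at `first` in a ring of
 * `nents` slots into the one or two contiguous ranges that must be
 * flushed; returns how many of r[0..1] were filled.
 */
static int
wrap_ranges(uint32_t first, uint32_t ents, uint32_t nents, struct range r[2])
{
	if (first + ents <= nents) {
		r[0] = (struct range){ first, ents };
		return (1);
	}
	r[0] = (struct range){ first, nents - first };	/* tail of the ring */
	r[1] = (struct range){ 0, ents - r[0].count };	/* wrapped remainder */
	return (2);
}

int
main(void)
{
	struct range r[2];
	/* 4 entries starting at slot 62 of a 64-slot ring: 62-63, then 0-1. */
	int n = wrap_ranges(62, 4, 64, r);

	for (int i = 0; i < n; i++)
		printf("sync %u entries at index %u\n", r[i].count, r[i].start);
	return (0);
}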
diff --git a/usr/src/uts/intel/os/driver_aliases b/usr/src/uts/intel/os/driver_aliases
index 7e8c801f03..3e3bcb1a7f 100644
--- a/usr/src/uts/intel/os/driver_aliases
+++ b/usr/src/uts/intel/os/driver_aliases
@@ -142,16 +142,16 @@ bge "pci108e,1647"
 bge "pci108e,1648"
 bge "pci108e,16a7"
 bge "pci108e,16a8"
-bge "pci14e4,0x1682"
-bge "pci14e4,0x1686"
-bge "pci14e4,0x16b0"
-bge "pci14e4,0x16b1"
-bge "pci14e4,0x16b2"
-bge "pci14e4,0x16b3"
-bge "pci14e4,0x16b4"
-bge "pci14e4,0x16b5"
-bge "pci14e4,0x16b6"
-bge "pci14e4,0x16b7"
+bge "pci14e4,1682"
+bge "pci14e4,1686"
+bge "pci14e4,16b0"
+bge "pci14e4,16b1"
+bge "pci14e4,16b2"
+bge "pci14e4,16b3"
+bge "pci14e4,16b4"
+bge "pci14e4,16b5"
+bge "pci14e4,16b6"
+bge "pci14e4,16b7"
 bge "pci14e4,1600"
 bge "pci14e4,1601"
 bge "pci14e4,1643"
@@ -186,16 +186,16 @@ bge "pci14e4,16a7"
 bge "pci14e4,16a8"
 bge "pci14e4,16c7"
 bge "pci14e4,16f3"
-bge "pciex14e4,0x1682"
-bge "pciex14e4,0x1686"
-bge "pciex14e4,0x16b0"
-bge "pciex14e4,0x16b1"
-bge "pciex14e4,0x16b2"
-bge "pciex14e4,0x16b3"
-bge "pciex14e4,0x16b4"
-bge "pciex14e4,0x16b5"
-bge "pciex14e4,0x16b6"
-bge "pciex14e4,0x16b7"
+bge "pciex14e4,1682"
+bge "pciex14e4,1686"
+bge "pciex14e4,16b0"
+bge "pciex14e4,16b1"
+bge "pciex14e4,16b2"
+bge "pciex14e4,16b3"
+bge "pciex14e4,16b4"
+bge "pciex14e4,16b5"
+bge "pciex14e4,16b6"
+bge "pciex14e4,16b7"
 bge "pciex14e4,1643"
 bge "pciex14e4,1655"
 bge "pciex14e4,1656"
diff --git a/usr/src/uts/sun4u/cpu/spitfire.c b/usr/src/uts/sun4u/cpu/spitfire.c
index b4c35566ce..705d871557 100644
--- a/usr/src/uts/sun4u/cpu/spitfire.c
+++ b/usr/src/uts/sun4u/cpu/spitfire.c
@@ -376,7 +376,7 @@ ecc_psynd_score(ushort_t p_synd)
 #define	CPU_SPACE	0x0002		/* print flt_status (data or instr) */
 #define	CPU_ERRID	0x0004		/* print flt_id */
 #define	CPU_TL		0x0008		/* print flt_tl */
-#define	CPU_ERRID_FIRST 0x0010		/* print flt_id first in message */
+#define	CPU_ERRID_FIRST	0x0010		/* print flt_id first in message */
 #define	CPU_AFSR	0x0020		/* print flt_stat as decoded %afsr */
 #define	CPU_AFAR	0x0040		/* print flt_addr as %afar */
 #define	CPU_AF_PSYND	0x0080		/* print flt_stat %afsr.PSYND */
@@ -808,7 +808,7 @@ cpu_aflt_size(void)
 /*ARGSUSED*/
 void
 cpu_ce_error(struct regs *rp, ulong_t p_afar, ulong_t p_afsr,
-	uint_t p_afsr_high, uint_t p_afar_high)
+    uint_t p_afsr_high, uint_t p_afar_high)
 {
 	ushort_t sdbh, sdbl;
 	ushort_t e_syndh, e_syndl;
@@ -1241,7 +1241,7 @@ cpu_ue_log_err(struct async_flt *aflt)
 /*ARGSUSED*/
 void
 cpu_async_error(struct regs *rp, ulong_t p_afar, ulong_t p_afsr,
-	uint_t p_afsr_high, uint_t p_afar_high)
+    uint_t p_afsr_high, uint_t p_afar_high)
 {
 	ushort_t sdbh, sdbl, ttype, tl;
 	spitf_async_flt spf_flt;
@@ -1937,7 +1937,7 @@ cpu_get_mem_unum_aflt(int synd_status, struct async_flt *aflt,
  */
 int
 cpu_get_mem_name(uint64_t synd, uint64_t *afsr, uint64_t afar,
-	char *buf, int buflen, int *lenp)
+    char *buf, int buflen, int *lenp)
 {
 	int synd_status, flt_in_memory, ret;
 	char unum[UNUM_NAMLEN];
@@ -2777,7 +2777,7 @@ static uint64_t hb_eclk[HB_LOWEST_DIV + 1] = {
 	uint64_t count, new_count;				\
 	clock_t delay;						\
 	data = lddphysio(HB_MEM_CNTRL0);			\
-	count = (data & HB_REFRESH_COUNT_MASK) >>		\
+	count = (data & HB_REFRESH_COUNT_MASK) >>	\
 	    HB_REFRESH_COUNT_SHIFT;				\
 	new_count = (HB_REFRESH_INTERVAL *			\
 	    cpunodes[CPU->cpu_id].clock_freq) /			\
@@ -2785,7 +2785,7 @@ static uint64_t hb_eclk[HB_LOWEST_DIV + 1] = {
 	data = (data & ~HB_REFRESH_COUNT_MASK) |		\
 	    (new_count << HB_REFRESH_COUNT_SHIFT);		\
 	stdphysio(HB_MEM_CNTRL0, data);				\
-	data = lddphysio(HB_MEM_CNTRL0);			\
+	data = lddphysio(HB_MEM_CNTRL0);	\
 	/*							\
 	 * If we are slowing down the cpu and Memory		\
 	 * Self Refresh is not enabled, it is required		\
@@ -2951,7 +2951,7 @@ clear_errors(spitf_async_flt *spf_flt, uint64_t *acc_afsr)
  */
static void
 scan_ecache(uint64_t *t_afar, ec_data_t *ecache_data,
-	uint64_t *ecache_tag, int *linecnt, uint64_t *t_afsr)
+    uint64_t *ecache_tag, int *linecnt, uint64_t *t_afsr)
 {
 	ec_data_t t_ecdata[8];
 	uint64_t t_etag, oafsr;
@@ -3176,7 +3176,7 @@ cpu_log_ecmem_info(spitf_async_flt *spf_flt)
 /*PRINTFLIKE6*/
 static void
 cpu_aflt_log(int ce_code, int tagnum, spitf_async_flt *spflt, uint_t logflags,
-	const char *endstr, const char *fmt, ...)
+    const char *endstr, const char *fmt, ...)
 {
 	struct async_flt *aflt = (struct async_flt *)spflt;
 	char buf[400], *p, *q;	/* see comments about buf[] size above */
@@ -3255,7 +3255,7 @@ cpu_aflt_log(int ce_code, int tagnum, spitf_async_flt *spflt, uint_t logflags,
 
 	if (logflags & CPU_AFSR) {
 		(void) snprintf(p, (size_t)(q - p),
-		    "\n    AFSR 0x%08b.%08b",
+		    "\n    AFSR 0x%8b.%8b",
 		    (uint32_t)(aflt->flt_stat >> 32), AFSR_FMTSTR0,
 		    (uint32_t)aflt->flt_stat, AFSR_FMTSTR1);
 		p += strlen(p);
@@ -3292,7 +3292,7 @@ cpu_aflt_log(int ce_code, int tagnum, spitf_async_flt *spflt, uint_t logflags,
 
 	if (logflags & CPU_UDBH) {
 		(void) snprintf(p, (size_t)(q - p),
-		    "\n    UDBH 0x%04b   UDBH.ESYND 0x%02x",
+		    "\n    UDBH 0x%4b   UDBH.ESYND 0x%02x",
 		    spflt->flt_sdbh, UDB_FMTSTR,
 		    spflt->flt_sdbh & 0xFF);
 		p += strlen(p);
@@ -3300,7 +3300,7 @@ cpu_aflt_log(int ce_code, int tagnum, spitf_async_flt *spflt, uint_t logflags,
 
 	if (logflags & CPU_UDBL) {
 		(void) snprintf(p, (size_t)(q - p),
-		    "    UDBL 0x%04b   UDBL.ESYND 0x%02x",
+		    "    UDBL 0x%4b   UDBL.ESYND 0x%02x",
 		    spflt->flt_sdbl, UDB_FMTSTR,
 		    spflt->flt_sdbl & 0xFF);
 		p += strlen(p);
@@ -3348,8 +3348,8 @@ cpu_aflt_log(int ce_code, int tagnum, spitf_async_flt *spflt, uint_t logflags,
 *	modify	parity	busy	idle
 *	----------------------------
 *	clean	good		X
- *	clean	bad		X	X
- *	dirty	good			X
+ *	clean	bad		X	X
+ *	dirty	good			X
 *	dirty	bad
 *
 * Bad or good refers to whether a line has an E$ parity error or not.
@@ -3885,7 +3885,7 @@ ecache_kstat_init(struct cpu *cp)
  */
 static void
 ecache_scrub_log(ec_data_t *ec_data, uint64_t ec_tag, uint64_t paddr, int mpb,
-	uint64_t afsr)
+    uint64_t afsr)
 {
 	spitf_async_flt spf_flt;
 	struct async_flt *aflt;
