author	Yuri Pankov <yuri.pankov@nexenta.com>	2017-10-07 05:09:40 +0300
committer	Dan McDonald <danmcd@joyent.com>	2017-10-13 15:41:47 -0400
commit	9276b3991ba20d5a5660887ba81b0bc7bed25a0c (patch)
tree	ae01db1ba588a449d545b244bf36475e435fd5b5 /usr/src
parent	5ee44debdc8aa52cdcbf27fa252332a2403ef693 (diff)
download	illumos-joyent-9276b3991ba20d5a5660887ba81b0bc7bed25a0c.tar.gz
7186 xnf: panic on Xen 4.x
Contributed by: Frank Salzmann <frank@delphix.com>
Contributed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Ken Mays <maybird1776@yahoo.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src')
-rw-r--r--	usr/src/uts/common/xen/io/xnf.c	1617
-rw-r--r--	usr/src/uts/common/xen/io/xnf.h	75
2 files changed, 1133 insertions(+), 559 deletions(-)
diff --git a/usr/src/uts/common/xen/io/xnf.c b/usr/src/uts/common/xen/io/xnf.c
index 2f895a33d7..e2475b5942 100644
--- a/usr/src/uts/common/xen/io/xnf.c
+++ b/usr/src/uts/common/xen/io/xnf.c
@@ -25,6 +25,10 @@
*/
/*
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
+ */
+
+/*
*
* Copyright (c) 2004 Christian Limpach.
* All rights reserved.
@@ -122,6 +126,8 @@
#include <sys/pattr.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
+#include <inet/tcp.h>
+#include <netinet/udp.h>
#include <sys/gld.h>
#include <sys/modctl.h>
#include <sys/mac_provider.h>
@@ -162,7 +168,9 @@ xnf_t *xnf_debug_instance = NULL;
*/
#define xnf_btop(addr) ((addr) >> PAGESHIFT)
-unsigned int xnf_max_tx_frags = 1;
+/*
+ * The parameters below should only be changed in /etc/system, never in mdb.
+ */
/*
* Should we use the multicast control feature if the backend provides
@@ -171,6 +179,32 @@ unsigned int xnf_max_tx_frags = 1;
boolean_t xnf_multicast_control = B_TRUE;
/*
+ * Should we allow scatter-gather for tx if backend allows it?
+ */
+boolean_t xnf_enable_tx_sg = B_TRUE;
+
+/*
+ * Should we allow scatter-gather for rx if backend allows it?
+ */
+boolean_t xnf_enable_rx_sg = B_TRUE;
+
+/*
+ * Should we allow LSO for tx sends if the backend allows it?
+ * Requires xnf_enable_tx_sg to also be set to B_TRUE.
+ */
+boolean_t xnf_enable_lso = B_TRUE;
+
+/*
+ * Should we allow LRO on rx if the backend supports it?
+ * Requires xnf_enable_rx_sg to also be set to B_TRUE.
+ *
+ * !! WARNING !!
+ * LRO is not yet supported in the OS so this should be left as FALSE.
+ * !! WARNING !!
+ */
+boolean_t xnf_enable_lro = B_FALSE;
+
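The tunables above are module globals; per the comment earlier in this hunk they
belong in /etc/system rather than being patched live with mdb. A minimal sketch
of the corresponding /etc/system entries (variable names as introduced by this
patch; the values shown are the defaults):

    * Tunables for the xnf driver; a reboot is required to take effect.
    set xnf:xnf_enable_tx_sg = 1
    set xnf:xnf_enable_rx_sg = 1
    set xnf:xnf_enable_lso = 1
    set xnf:xnf_enable_lro = 0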
+/*
* Received packets below this size are copied to a new streams buffer
* rather than being desballoc'ed.
*
@@ -194,7 +228,14 @@ size_t xnf_rx_copy_limit = 64;
#define INVALID_TX_ID ((uint16_t)-1)
#define TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
-#define TX_ID_VALID(i) (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
+#define TX_ID_VALID(i) \
+ (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
+
+/*
+ * calculate how many pages are spanned by an mblk fragment
+ */
+#define xnf_mblk_pages(mp) (MBLKL(mp) == 0 ? 0 : \
+ xnf_btop((uintptr_t)mp->b_wptr - 1) - xnf_btop((uintptr_t)mp->b_rptr) + 1)
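As a quick check of the arithmetic: a fragment starting at byte 0xf00 of a page
with length 0x300 has its last byte at 0x11ff, so it spans two pages, while an
empty fragment spans none. A standalone sketch of the same computation
(assuming a 4K page, i.e. a PAGESHIFT of 12):

    #include <stdint.h>
    #include <stddef.h>

    #define	PAGESHIFT	12
    #define	btop(x)		((uintptr_t)(x) >> PAGESHIFT)

    /* Number of pages touched by [rptr, rptr + len); 0 when len == 0. */
    static size_t
    pages_spanned(uintptr_t rptr, size_t len)
    {
    	return (len == 0 ? 0 : btop(rptr + len - 1) - btop(rptr) + 1);
    }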
/* Required system entry points */
static int xnf_attach(dev_info_t *, ddi_attach_cmd_t);
@@ -210,6 +251,11 @@ static mblk_t *xnf_send(void *, mblk_t *);
static uint_t xnf_intr(caddr_t);
static int xnf_stat(void *, uint_t, uint64_t *);
static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
+static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
+static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t,
+ const void *);
+static void xnf_propinfo(void *, const char *, mac_prop_id_t,
+ mac_prop_info_handle_t);
/* Driver private functions */
static int xnf_alloc_dma_resources(xnf_t *);
@@ -229,17 +275,16 @@ static void xnf_buf_recycle(xnf_buf_t *);
static int xnf_tx_buf_constructor(void *, void *, int);
static void xnf_tx_buf_destructor(void *, void *);
-static grant_ref_t gref_get(xnf_t *);
-#pragma inline(gref_get)
-static void gref_put(xnf_t *, grant_ref_t);
-#pragma inline(gref_put)
+static grant_ref_t xnf_gref_get(xnf_t *);
+#pragma inline(xnf_gref_get)
+static void xnf_gref_put(xnf_t *, grant_ref_t);
+#pragma inline(xnf_gref_put)
-static xnf_txid_t *txid_get(xnf_t *);
-#pragma inline(txid_get)
-static void txid_put(xnf_t *, xnf_txid_t *);
-#pragma inline(txid_put)
+static xnf_txid_t *xnf_txid_get(xnf_t *);
+#pragma inline(xnf_txid_get)
+static void xnf_txid_put(xnf_t *, xnf_txid_t *);
+#pragma inline(xnf_txid_put)
-void xnf_send_driver_status(int, int);
static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
static int xnf_tx_clean_ring(xnf_t *);
static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
@@ -247,50 +292,69 @@ static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
static boolean_t xnf_kstat_init(xnf_t *);
static void xnf_rx_collect(xnf_t *);
+#define XNF_CALLBACK_FLAGS (MC_GETCAPAB | MC_PROPERTIES)
+
static mac_callbacks_t xnf_callbacks = {
- MC_GETCAPAB,
- xnf_stat,
- xnf_start,
- xnf_stop,
- xnf_set_promiscuous,
- xnf_set_multicast,
- xnf_set_mac_addr,
- xnf_send,
- NULL,
- NULL,
- xnf_getcapab
+ .mc_callbacks = XNF_CALLBACK_FLAGS,
+ .mc_getstat = xnf_stat,
+ .mc_start = xnf_start,
+ .mc_stop = xnf_stop,
+ .mc_setpromisc = xnf_set_promiscuous,
+ .mc_multicst = xnf_set_multicast,
+ .mc_unicst = xnf_set_mac_addr,
+ .mc_tx = xnf_send,
+ .mc_getcapab = xnf_getcapab,
+ .mc_setprop = xnf_setprop,
+ .mc_getprop = xnf_getprop,
+ .mc_propinfo = xnf_propinfo,
};
/* DMA attributes for network ring buffer */
static ddi_dma_attr_t ringbuf_dma_attr = {
- DMA_ATTR_V0, /* version of this structure */
- 0, /* lowest usable address */
- 0xffffffffffffffffULL, /* highest usable address */
- 0x7fffffff, /* maximum DMAable byte count */
- MMU_PAGESIZE, /* alignment in bytes */
- 0x7ff, /* bitmap of burst sizes */
- 1, /* minimum transfer */
- 0xffffffffU, /* maximum transfer */
- 0xffffffffffffffffULL, /* maximum segment length */
- 1, /* maximum number of segments */
- 1, /* granularity */
- 0, /* flags (reserved) */
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0,
+ .dma_attr_addr_hi = 0xffffffffffffffffULL,
+ .dma_attr_count_max = 0x7fffffff,
+ .dma_attr_align = MMU_PAGESIZE,
+ .dma_attr_burstsizes = 0x7ff,
+ .dma_attr_minxfer = 1,
+ .dma_attr_maxxfer = 0xffffffffU,
+ .dma_attr_seg = 0xffffffffffffffffULL,
+ .dma_attr_sgllen = 1,
+ .dma_attr_granular = 1,
+ .dma_attr_flags = 0
+};
+
+/* DMA attributes for receive data */
+static ddi_dma_attr_t rx_buf_dma_attr = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0,
+ .dma_attr_addr_hi = 0xffffffffffffffffULL,
+ .dma_attr_count_max = MMU_PAGEOFFSET,
+ .dma_attr_align = MMU_PAGESIZE, /* allocation alignment */
+ .dma_attr_burstsizes = 0x7ff,
+ .dma_attr_minxfer = 1,
+ .dma_attr_maxxfer = 0xffffffffU,
+ .dma_attr_seg = 0xffffffffffffffffULL,
+ .dma_attr_sgllen = 1,
+ .dma_attr_granular = 1,
+ .dma_attr_flags = 0
};
-/* DMA attributes for transmit and receive data */
-static ddi_dma_attr_t buf_dma_attr = {
- DMA_ATTR_V0, /* version of this structure */
- 0, /* lowest usable address */
- 0xffffffffffffffffULL, /* highest usable address */
- 0x7fffffff, /* maximum DMAable byte count */
- MMU_PAGESIZE, /* alignment in bytes */
- 0x7ff, /* bitmap of burst sizes */
- 1, /* minimum transfer */
- 0xffffffffU, /* maximum transfer */
- 0xffffffffffffffffULL, /* maximum segment length */
- 1, /* maximum number of segments */
- 1, /* granularity */
- 0, /* flags (reserved) */
+/* DMA attributes for transmit data */
+static ddi_dma_attr_t tx_buf_dma_attr = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0,
+ .dma_attr_addr_hi = 0xffffffffffffffffULL,
+ .dma_attr_count_max = MMU_PAGEOFFSET,
+ .dma_attr_align = 1,
+ .dma_attr_burstsizes = 0x7ff,
+ .dma_attr_minxfer = 1,
+ .dma_attr_maxxfer = 0xffffffffU,
+ .dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */
+ .dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */
+ .dma_attr_granular = 1,
+ .dma_attr_flags = 0
};
/* DMA access attributes for registers and descriptors */
@@ -349,7 +413,7 @@ _info(struct modinfo *modinfop)
* Acquire a grant reference.
*/
static grant_ref_t
-gref_get(xnf_t *xnfp)
+xnf_gref_get(xnf_t *xnfp)
{
grant_ref_t gref;
@@ -379,7 +443,7 @@ gref_get(xnf_t *xnfp)
* Release a grant reference.
*/
static void
-gref_put(xnf_t *xnfp, grant_ref_t gref)
+xnf_gref_put(xnf_t *xnfp, grant_ref_t gref)
{
ASSERT(gref != INVALID_GRANT_REF);
@@ -394,7 +458,7 @@ gref_put(xnf_t *xnfp, grant_ref_t gref)
* Acquire a transmit id.
*/
static xnf_txid_t *
-txid_get(xnf_t *xnfp)
+xnf_txid_get(xnf_t *xnfp)
{
xnf_txid_t *tidp;
@@ -418,7 +482,7 @@ txid_get(xnf_t *xnfp)
* Release a transmit id.
*/
static void
-txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
+xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
{
ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
ASSERT(TX_ID_VALID(tidp->id));
@@ -429,6 +493,93 @@ txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
xnfp->xnf_tx_pkt_id_head = tidp->id;
}
+static void
+xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp)
+{
+ ASSERT3U(txp->tx_type, ==, TX_DATA);
+
+ /*
+ * We are either using a lookaside buffer or we are mapping existing
+ * buffers.
+ */
+ if (txp->tx_bdesc != NULL) {
+ ASSERT(!txp->tx_handle_bound);
+ xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
+ } else {
+ if (txp->tx_txreq.gref != INVALID_GRANT_REF) {
+ if (gnttab_query_foreign_access(txp->tx_txreq.gref) !=
+ 0) {
+ cmn_err(CE_PANIC, "tx grant %d still in use by "
+ "backend domain", txp->tx_txreq.gref);
+ }
+ (void) gnttab_end_foreign_access_ref(
+ txp->tx_txreq.gref, 1);
+ xnf_gref_put(xnfp, txp->tx_txreq.gref);
+ }
+
+ if (txp->tx_handle_bound)
+ (void) ddi_dma_unbind_handle(txp->tx_dma_handle);
+ }
+
+ if (txp->tx_mp != NULL)
+ freemsg(txp->tx_mp);
+
+ if (txp->tx_prev != NULL) {
+ ASSERT3P(txp->tx_prev->tx_next, ==, txp);
+ txp->tx_prev->tx_next = NULL;
+ }
+
+ if (txp->tx_txreq.id != INVALID_TX_ID) {
+ /*
+ * This should be only possible when resuming from a suspend.
+ */
+ ASSERT(!xnfp->xnf_connected);
+ xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id));
+ txp->tx_txreq.id = INVALID_TX_ID;
+ }
+
+ kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
+}
+
+static void
+xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp)
+{
+ if (txp == NULL)
+ return;
+
+ while (txp->tx_next != NULL)
+ txp = txp->tx_next;
+
+ /*
+ * We free the chain in reverse order so that grants can be released
+ * for all dma chunks before unbinding the dma handles. The mblk is
+ * freed last, after all its fragments' dma handles are unbound.
+ */
+ xnf_txbuf_t *prev;
+ for (; txp != NULL; txp = prev) {
+ prev = txp->tx_prev;
+ xnf_data_txbuf_free(xnfp, txp);
+ }
+}
+
+static xnf_txbuf_t *
+xnf_data_txbuf_alloc(xnf_t *xnfp)
+{
+ xnf_txbuf_t *txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
+ txp->tx_type = TX_DATA;
+ txp->tx_next = NULL;
+ txp->tx_prev = NULL;
+ txp->tx_head = txp;
+ txp->tx_frags_to_ack = 0;
+ txp->tx_mp = NULL;
+ txp->tx_bdesc = NULL;
+ txp->tx_handle_bound = B_FALSE;
+ txp->tx_txreq.gref = INVALID_GRANT_REF;
+ txp->tx_txreq.id = INVALID_TX_ID;
+
+ return (txp);
+}
+
/*
* Get `wanted' slots in the transmit ring, waiting for at least that
* number if `wait' is B_TRUE. Force the ring to be cleaned by setting
@@ -437,7 +588,7 @@ txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
* Return the number of slots available.
*/
static int
-tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
+xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
{
int slotsfree;
boolean_t forced_clean = (wanted == 0);
@@ -513,45 +664,24 @@ xnf_setup_rings(xnf_t *xnfp)
mutex_enter(&xnfp->xnf_txlock);
/*
- * Setup/cleanup the TX ring. Note that this can lose packets
- * after a resume, but we expect to stagger on.
+ * We first clean up the TX ring in case we are doing a resume.
+ * Note that this can lose packets, but we expect to stagger on.
*/
xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
i < NET_TX_RING_SIZE;
i++, tidp++) {
- xnf_txbuf_t *txp;
-
- tidp->id = i;
-
- txp = tidp->txbuf;
- if (txp == NULL) {
- tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
- txid_put(xnfp, tidp);
+ xnf_txbuf_t *txp = tidp->txbuf;
+ if (txp == NULL)
continue;
- }
-
- ASSERT(txp->tx_txreq.gref != INVALID_GRANT_REF);
- ASSERT(txp->tx_mp != NULL);
switch (txp->tx_type) {
case TX_DATA:
- VERIFY(gnttab_query_foreign_access(txp->tx_txreq.gref)
- == 0);
-
- if (txp->tx_bdesc == NULL) {
- (void) gnttab_end_foreign_access_ref(
- txp->tx_txreq.gref, 1);
- gref_put(xnfp, txp->tx_txreq.gref);
- (void) ddi_dma_unbind_handle(
- txp->tx_dma_handle);
- } else {
- xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
- }
-
- freemsg(txp->tx_mp);
- txid_put(xnfp, tidp);
- kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
+ /*
+ * xnf_txid_put() will be called for each txbuf's txid in
+ * the chain, which will result in clearing tidp->txbuf.
+ */
+ xnf_data_txbuf_free_chain(xnfp, txp);
break;
@@ -566,8 +696,7 @@ xnf_setup_rings(xnf_t *xnfp)
* over the empty slot.
*/
i++;
- ASSERT(i < NET_TX_RING_SIZE);
-
+ ASSERT3U(i, <, NET_TX_RING_SIZE);
break;
case TX_MCAST_RSP:
@@ -575,6 +704,19 @@ xnf_setup_rings(xnf_t *xnfp)
}
}
+ /*
+ * Now purge the old list and add each txid to the new free list.
+ */
+ xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
+ for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
+ i < NET_TX_RING_SIZE;
+ i++, tidp++) {
+ tidp->id = i;
+ ASSERT3P(tidp->txbuf, ==, NULL);
+ tidp->next = INVALID_TX_ID; /* Appease xnf_txid_put(). */
+ xnf_txid_put(xnfp, tidp);
+ }
+
/* LINTED: constant in conditional context */
SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
/* LINTED: constant in conditional context */
@@ -708,6 +850,27 @@ again:
}
}
+ /*
+ * Tell backend if we support scatter-gather lists on the rx side.
+ */
+ err = xenbus_printf(xbt, xsname, "feature-sg", "%d",
+ xnf_enable_rx_sg ? 1 : 0);
+ if (err != 0) {
+ message = "writing feature-sg";
+ goto abort_transaction;
+ }
+
+ /*
+ * Tell backend if we support LRO for IPv4. Scatter-gather on rx is
+ * a prerequisite.
+ */
+ err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d",
+ (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0);
+ if (err != 0) {
+ message = "writing feature-gso-tcpv4";
+ goto abort_transaction;
+ }
+
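Once the transaction commits, the backend sees these keys in the frontend's
xenstore directory. An illustrative listing from dom0 (the vif path varies by
domain and device index; values reflect the tunables above, other keys
omitted):

    # xenstore-ls /local/domain/<domid>/device/vif/0
    feature-sg = "1"
    feature-gso-tcpv4 = "0"
    ...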
err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
if (err != 0) {
message = "switching state to XenbusStateConnected";
@@ -778,6 +941,43 @@ xnf_read_config(xnf_t *xnfp)
if (err != 0)
be_cap = 0;
xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
+
+ /*
+ * See if back-end supports scatter-gather for transmits. If not,
+ * we will not support LSO and limit the mtu to 1500.
+ */
+ err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap);
+ if (err != 0) {
+ be_cap = 0;
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
+ "'feature-sg' from backend driver");
+ }
+ if (be_cap == 0) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not "
+ "supported for transmits in the backend driver. LSO is "
+ "disabled and MTU is restricted to 1500 bytes.");
+ }
+ xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg;
+
+ if (xnfp->xnf_be_tx_sg) {
+ /*
+ * Check if LSO is supported. Currently we only check for
+ * IPv4 as Illumos doesn't support LSO for IPv6.
+ */
+ err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d",
+ &be_cap);
+ if (err != 0) {
+ be_cap = 0;
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
+ "'feature-gso-tcpv4' from backend driver");
+ }
+ if (be_cap == 0) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not "
+ "supported by the backend driver. Performance "
+ "will be affected.");
+ }
+ xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso;
+ }
}
/*
@@ -829,6 +1029,12 @@ xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
return (DDI_FAILURE);
xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
+ xnfp->xnf_tx_pkt_id =
+ kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP);
+
+ xnfp->xnf_rx_pkt_info =
+ kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP);
+
macp->m_dip = devinfo;
macp->m_driver = xnfp;
xnfp->xnf_devinfo = devinfo;
@@ -837,7 +1043,8 @@ xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
macp->m_src_addr = xnfp->xnf_mac_addr;
macp->m_callbacks = &xnf_callbacks;
macp->m_min_sdu = 0;
- macp->m_max_sdu = XNF_MAXPKT;
+ xnfp->xnf_mtu = ETHERMTU;
+ macp->m_max_sdu = xnfp->xnf_mtu;
xnfp->xnf_running = B_FALSE;
xnfp->xnf_connected = B_FALSE;
@@ -1156,11 +1363,11 @@ xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
* 5. Wait for the response via xnf_tx_clean_ring().
*/
- n_slots = tx_slots_get(xnfp, 2, B_TRUE);
+ n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE);
ASSERT(n_slots >= 2);
slot = xnfp->xnf_tx_ring.req_prod_pvt;
- tidp = txid_get(xnfp);
+ tidp = xnf_txid_get(xnfp);
VERIFY(tidp != NULL);
txp->tx_type = TX_MCAST_REQ;
@@ -1196,10 +1403,9 @@ xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
ec_notify_via_evtchn(xnfp->xnf_evtchn);
while (txp->tx_type == TX_MCAST_REQ)
- cv_wait(&xnfp->xnf_cv_multicast,
- &xnfp->xnf_txlock);
+ cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock);
- ASSERT(txp->tx_type == TX_MCAST_RSP);
+ ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP);
mutex_enter(&xnfp->xnf_schedlock);
xnfp->xnf_pending_multicast--;
@@ -1207,7 +1413,7 @@ xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
result = (txp->tx_status == NETIF_RSP_OKAY);
- txid_put(xnfp, tidp);
+ xnf_txid_put(xnfp, tidp);
mutex_exit(&xnfp->xnf_txlock);
@@ -1261,39 +1467,44 @@ loop:
xnf_txbuf_t *txp;
trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
+ /*
+ * If this slot was occupied by a netif_extra_info_t,
+ * then the response will be NETIF_RSP_NULL. In this
+ * case there are no resources to clean up.
+ */
+ if (trp->status == NETIF_RSP_NULL)
+ continue;
+
ASSERT(TX_ID_VALID(trp->id));
tidp = TX_ID_TO_TXID(xnfp, trp->id);
- ASSERT(tidp->id == trp->id);
- ASSERT(tidp->next == INVALID_TX_ID);
+ ASSERT3U(tidp->id, ==, trp->id);
+ ASSERT3U(tidp->next, ==, INVALID_TX_ID);
txp = tidp->txbuf;
ASSERT(txp != NULL);
- ASSERT(txp->tx_txreq.id == trp->id);
+ ASSERT3U(txp->tx_txreq.id, ==, trp->id);
switch (txp->tx_type) {
case TX_DATA:
- if (gnttab_query_foreign_access(
- txp->tx_txreq.gref) != 0)
- cmn_err(CE_PANIC,
- "tx grant %d still in use by "
- "backend domain",
- txp->tx_txreq.gref);
-
- if (txp->tx_bdesc == NULL) {
- (void) gnttab_end_foreign_access_ref(
- txp->tx_txreq.gref, 1);
- gref_put(xnfp, txp->tx_txreq.gref);
- (void) ddi_dma_unbind_handle(
- txp->tx_dma_handle);
- } else {
- xnf_buf_put(xnfp, txp->tx_bdesc,
- B_TRUE);
- }
-
- freemsg(txp->tx_mp);
- txid_put(xnfp, tidp);
- kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
+ /*
+ * We must put the txid for each response we
+ * acknowledge to make sure that we never have
+ * more free slots than txids. Because of this
+ * we do it here instead of waiting for it to
+ * be done in xnf_data_txbuf_free_chain().
+ */
+ xnf_txid_put(xnfp, tidp);
+ txp->tx_txreq.id = INVALID_TX_ID;
+ ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0);
+ txp->tx_head->tx_frags_to_ack--;
+
+ /*
+ * We clean the whole chain once we got a
+ * response for each fragment.
+ */
+ if (txp->tx_head->tx_frags_to_ack == 0)
+ xnf_data_txbuf_free_chain(xnfp, txp);
break;
@@ -1304,9 +1515,6 @@ loop:
break;
- case TX_MCAST_RSP:
- break;
-
default:
cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
"invalid xnf_txbuf_t type: %d",
@@ -1336,7 +1544,7 @@ loop:
* within a single page.
*/
static xnf_buf_t *
-xnf_tx_pullup(xnf_t *xnfp, mblk_t *mp)
+xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen)
{
xnf_buf_t *bd;
caddr_t bp;
@@ -1355,68 +1563,101 @@ xnf_tx_pullup(xnf_t *xnfp, mblk_t *mp)
mp = mp->b_cont;
}
- ASSERT((bp - bd->buf) <= PAGESIZE);
+ *plen = bp - bd->buf;
+ ASSERT3U(*plen, <=, PAGESIZE);
- xnfp->xnf_stat_tx_pullup++;
+ xnfp->xnf_stat_tx_lookaside++;
return (bd);
}
/*
- * Insert the pseudo-header checksum into the packet `buf'.
+ * Insert the pseudo-header checksum into the packet.
+ * Assumes packet is IPv4, TCP/UDP since we only advertised support for
+ * HCKSUM_INET_FULL_V4.
*/
-void
-xnf_pseudo_cksum(caddr_t buf, int length)
+int
+xnf_pseudo_cksum(mblk_t *mp)
{
struct ether_header *ehp;
- uint16_t sap, len, *stuff;
+ uint16_t sap, iplen, *stuff;
uint32_t cksum;
- size_t offset;
+ size_t len;
ipha_t *ipha;
ipaddr_t src, dst;
+ uchar_t *ptr;
+
+ ptr = mp->b_rptr;
+ len = MBLKL(mp);
+
+ /* Each header must fit completely in an mblk. */
+ ASSERT3U(len, >=, sizeof (*ehp));
- ASSERT(length >= sizeof (*ehp));
- ehp = (struct ether_header *)buf;
+ ehp = (struct ether_header *)ptr;
if (ntohs(ehp->ether_type) == VLAN_TPID) {
struct ether_vlan_header *evhp;
-
- ASSERT(length >= sizeof (*evhp));
- evhp = (struct ether_vlan_header *)buf;
+ ASSERT3U(len, >=, sizeof (*evhp));
+ evhp = (struct ether_vlan_header *)ptr;
sap = ntohs(evhp->ether_type);
- offset = sizeof (*evhp);
+ ptr += sizeof (*evhp);
+ len -= sizeof (*evhp);
} else {
sap = ntohs(ehp->ether_type);
- offset = sizeof (*ehp);
+ ptr += sizeof (*ehp);
+ len -= sizeof (*ehp);
}
- ASSERT(sap == ETHERTYPE_IP);
+ ASSERT3U(sap, ==, ETHERTYPE_IP);
- /* Packet should have been pulled up by the caller. */
- if ((offset + sizeof (ipha_t)) > length) {
- cmn_err(CE_WARN, "xnf_pseudo_cksum: no room for checksum");
- return;
+ /*
+ * Ethernet and IP headers may be in different mblks.
+ */
+ ASSERT3P(ptr, <=, mp->b_wptr);
+ if (ptr == mp->b_wptr) {
+ mp = mp->b_cont;
+ ptr = mp->b_rptr;
+ len = MBLKL(mp);
}
- ipha = (ipha_t *)(buf + offset);
+ ASSERT3U(len, >=, sizeof (ipha_t));
+ ipha = (ipha_t *)ptr;
- ASSERT(IPH_HDR_LENGTH(ipha) == IP_SIMPLE_HDR_LENGTH);
+ /*
+ * We assume the IP header has no options. (This is enforced in
+ * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM).
+ */
+ ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH);
+ iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
- len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
+ ptr += IP_SIMPLE_HDR_LENGTH;
+ len -= IP_SIMPLE_HDR_LENGTH;
+
+ /*
+ * IP and L4 headers may be in different mblks.
+ */
+ ASSERT3P(ptr, <=, mp->b_wptr);
+ if (ptr == mp->b_wptr) {
+ mp = mp->b_cont;
+ ptr = mp->b_rptr;
+ len = MBLKL(mp);
+ }
switch (ipha->ipha_protocol) {
case IPPROTO_TCP:
- stuff = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
+ ASSERT3U(len, >=, sizeof (tcph_t));
+ stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET);
cksum = IP_TCP_CSUM_COMP;
break;
case IPPROTO_UDP:
- stuff = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
+ ASSERT3U(len, >=, sizeof (struct udphdr));
+ stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET);
cksum = IP_UDP_CSUM_COMP;
break;
default:
cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
ipha->ipha_protocol);
- return;
+ return (EINVAL);
}
src = ipha->ipha_src;
@@ -1424,7 +1665,7 @@ xnf_pseudo_cksum(caddr_t buf, int length)
cksum += (dst >> 16) + (dst & 0xFFFF);
cksum += (src >> 16) + (src & 0xFFFF);
- cksum += htons(len);
+ cksum += htons(iplen);
cksum = (cksum >> 16) + (cksum & 0xFFFF);
cksum = (cksum >> 16) + (cksum & 0xFFFF);
@@ -1432,40 +1673,38 @@ xnf_pseudo_cksum(caddr_t buf, int length)
ASSERT(cksum <= 0xFFFF);
*stuff = (uint16_t)(cksum ? cksum : ~cksum);
+
+ return (0);
}
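The routine above seeds the transport checksum field with the ones' complement
sum of the IPv4 pseudo-header so the backend (or hardware) only has to sum the
L4 payload. A standalone sketch of the same folding, under the assumption that
htons(proto) stands in for the IP_TCP_CSUM_COMP/IP_UDP_CSUM_COMP constants
from inet/ip.h; src and dst are taken raw, in network order, from the IP
header:

    #include <stdint.h>
    #include <arpa/inet.h>

    static uint16_t
    pseudo_hdr_cksum(uint32_t src, uint32_t dst, uint8_t proto,
        uint16_t l4len /* host order */)
    {
    	uint32_t sum = htons(proto);		/* zero byte + protocol */

    	sum += (src >> 16) + (src & 0xFFFF);
    	sum += (dst >> 16) + (dst & 0xFFFF);
    	sum += htons(l4len);			/* TCP/UDP length word */
    	sum = (sum >> 16) + (sum & 0xFFFF);	/* fold the carries */
    	sum = (sum >> 16) + (sum & 0xFFFF);
    	return ((uint16_t)(sum != 0 ? sum : 0xFFFF));
    }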
/*
- * Push a list of prepared packets (`txp') into the transmit ring.
+ * Push a packet into the transmit ring.
+ *
+ * Note: the format of a tx packet that spans multiple slots is similar to
+ * what is described in xnf_rx_one_packet().
*/
-static xnf_txbuf_t *
-tx_push_packets(xnf_t *xnfp, xnf_txbuf_t *txp)
+static void
+xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head)
{
- int slots_free;
+ int nslots = 0;
+ int extras = 0;
RING_IDX slot;
boolean_t notify;
- mutex_enter(&xnfp->xnf_txlock);
-
+ ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
ASSERT(xnfp->xnf_running);
- /*
- * Wait until we are connected to the backend.
- */
- while (!xnfp->xnf_connected)
- cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
-
- slots_free = tx_slots_get(xnfp, 1, B_FALSE);
- DTRACE_PROBE1(xnf_send_slotsfree, int, slots_free);
-
slot = xnfp->xnf_tx_ring.req_prod_pvt;
- while ((txp != NULL) && (slots_free > 0)) {
+ /*
+ * The caller has already checked that we have enough slots to proceed.
+ */
+ for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) {
xnf_txid_t *tidp;
netif_tx_request_t *txrp;
- tidp = txid_get(xnfp);
+ tidp = xnf_txid_get(xnfp);
VERIFY(tidp != NULL);
-
txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
txp->tx_slot = slot;
@@ -1473,281 +1712,353 @@ tx_push_packets(xnf_t *xnfp, xnf_txbuf_t *txp)
*txrp = txp->tx_txreq;
tidp->txbuf = txp;
-
- xnfp->xnf_stat_opackets++;
- xnfp->xnf_stat_obytes += txp->tx_txreq.size;
-
- txp = txp->tx_next;
- slots_free--;
slot++;
+ nslots++;
+ /*
+ * When present, LSO info is placed in a slot after the first
+ * data segment, and doesn't require a txid.
+ */
+ if (txp->tx_txreq.flags & NETTXF_extra_info) {
+ netif_extra_info_t *extra;
+ ASSERT3U(nslots, ==, 1);
+
+ extra = (netif_extra_info_t *)
+ RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
+ *extra = txp->tx_extra;
+ slot++;
+ nslots++;
+ extras = 1;
+ }
}
+ ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX);
+
+ /*
+ * Store the number of data fragments.
+ */
+ head->tx_frags_to_ack = nslots - extras;
+
xnfp->xnf_tx_ring.req_prod_pvt = slot;
/*
* Tell the peer that we sent something, if it cares.
*/
/* LINTED: constant in conditional context */
- RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
- notify);
+ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify);
if (notify)
ec_notify_via_evtchn(xnfp->xnf_evtchn);
+}
- mutex_exit(&xnfp->xnf_txlock);
+static xnf_txbuf_t *
+xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp)
+{
+ xnf_txbuf_t *txp = xnf_data_txbuf_alloc(xnfp);
+ size_t length;
+
+ txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length);
+ if (txp->tx_bdesc == NULL) {
+ xnf_data_txbuf_free(xnfp, txp);
+ return (NULL);
+ }
+ txp->tx_mfn = txp->tx_bdesc->buf_mfn;
+ txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
+ txp->tx_txreq.size = length;
+ txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET;
+ txp->tx_txreq.flags = 0;
return (txp);
}
-/*
- * Send the chain of packets `mp'. Called by the MAC framework.
- */
-static mblk_t *
-xnf_send(void *arg, mblk_t *mp)
+static xnf_txbuf_t *
+xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp)
{
- xnf_t *xnfp = arg;
+ xnf_txbuf_t *head = NULL;
+ xnf_txbuf_t *tail = NULL;
domid_t oeid;
- xnf_txbuf_t *head, *tail;
- mblk_t *ml;
- int prepared;
+ int nsegs = 0;
oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
- /*
- * Prepare packets for transmission.
- */
- head = tail = NULL;
- prepared = 0;
- while (mp != NULL) {
+ for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) {
+ ddi_dma_handle_t dma_handle;
+ ddi_dma_cookie_t dma_cookie;
+ uint_t ncookies;
xnf_txbuf_t *txp;
- int n_chunks, length;
- boolean_t page_oops;
- uint32_t pflags;
- for (ml = mp, n_chunks = length = 0, page_oops = B_FALSE;
- ml != NULL;
- ml = ml->b_cont, n_chunks++) {
+ if (MBLKL(ml) == 0)
+ continue;
- /*
- * Test if this buffer includes a page
- * boundary. The test assumes that the range
- * b_rptr...b_wptr can include only a single
- * boundary.
- */
- if (xnf_btop((size_t)ml->b_rptr) !=
- xnf_btop((size_t)ml->b_wptr)) {
- xnfp->xnf_stat_tx_pagebndry++;
- page_oops = B_TRUE;
- }
+ txp = xnf_data_txbuf_alloc(xnfp);
- length += MBLKL(ml);
+ if (head == NULL) {
+ head = txp;
+ } else {
+ ASSERT(tail != NULL);
+ TXBUF_SETNEXT(tail, txp);
+ txp->tx_head = head;
}
- DTRACE_PROBE1(xnf_send_b_cont, int, n_chunks);
/*
- * Make sure packet isn't too large.
+ * The necessary segmentation rules (e.g. not crossing a page
+ * boundary) are enforced by the dma attributes of the handle.
*/
- if (length > XNF_FRAMESIZE) {
- cmn_err(CE_WARN,
- "xnf%d: oversized packet (%d bytes) dropped",
- ddi_get_instance(xnfp->xnf_devinfo), length);
- freemsg(mp);
- continue;
+ dma_handle = txp->tx_dma_handle;
+ int ret = ddi_dma_addr_bind_handle(dma_handle,
+ NULL, (char *)ml->b_rptr, MBLKL(ml),
+ DDI_DMA_WRITE | DDI_DMA_STREAMING,
+ DDI_DMA_DONTWAIT, 0, &dma_cookie,
+ &ncookies);
+ if (ret != DDI_DMA_MAPPED) {
+ if (ret != DDI_DMA_NORESOURCES) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN,
+ "ddi_dma_addr_bind_handle() failed "
+ "[dma_error=%d]", ret);
+ }
+ goto error;
}
-
- txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
-
- txp->tx_type = TX_DATA;
-
- if ((n_chunks > xnf_max_tx_frags) || page_oops) {
- /*
- * Loan a side buffer rather than the mblk
- * itself.
- */
- txp->tx_bdesc = xnf_tx_pullup(xnfp, mp);
- if (txp->tx_bdesc == NULL) {
- kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
- break;
+ txp->tx_handle_bound = B_TRUE;
+
+ ASSERT(ncookies > 0);
+ for (int i = 0; i < ncookies; i++) {
+ if (nsegs == XEN_MAX_TX_DATA_PAGES) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN,
+ "xnf_dmamap_alloc() failed: "
+ "too many segments");
+ goto error;
}
-
- txp->tx_bufp = txp->tx_bdesc->buf;
- txp->tx_mfn = txp->tx_bdesc->buf_mfn;
- txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
-
- } else {
- int rc;
- ddi_dma_cookie_t dma_cookie;
- uint_t ncookies;
-
- rc = ddi_dma_addr_bind_handle(txp->tx_dma_handle,
- NULL, (char *)mp->b_rptr, length,
- DDI_DMA_WRITE | DDI_DMA_STREAMING,
- DDI_DMA_DONTWAIT, 0, &dma_cookie,
- &ncookies);
- if (rc != DDI_DMA_MAPPED) {
- ASSERT(rc != DDI_DMA_INUSE);
- ASSERT(rc != DDI_DMA_PARTIAL_MAP);
-
-#ifdef XNF_DEBUG
- if (rc != DDI_DMA_NORESOURCES)
- cmn_err(CE_WARN,
- "xnf%d: bind_handle failed (%x)",
- ddi_get_instance(xnfp->xnf_devinfo),
- rc);
-#endif
- kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
- break;
+ if (i > 0) {
+ txp = xnf_data_txbuf_alloc(xnfp);
+ ASSERT(tail != NULL);
+ TXBUF_SETNEXT(tail, txp);
+ txp->tx_head = head;
}
- ASSERT(ncookies == 1);
- txp->tx_bdesc = NULL;
- txp->tx_bufp = (caddr_t)mp->b_rptr;
txp->tx_mfn =
xnf_btop(pa_to_ma(dma_cookie.dmac_laddress));
- txp->tx_txreq.gref = gref_get(xnfp);
+ txp->tx_txreq.gref = xnf_gref_get(xnfp);
if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
- (void) ddi_dma_unbind_handle(
- txp->tx_dma_handle);
- kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
- break;
+ dev_err(xnfp->xnf_devinfo, CE_WARN,
+ "xnf_dmamap_alloc() failed: "
+ "invalid grant ref");
+ goto error;
}
gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
oeid, txp->tx_mfn, 1);
- }
+ txp->tx_txreq.offset =
+ dma_cookie.dmac_laddress & PAGEOFFSET;
+ txp->tx_txreq.size = dma_cookie.dmac_size;
+ txp->tx_txreq.flags = 0;
- txp->tx_next = NULL;
- txp->tx_mp = mp;
- txp->tx_txreq.size = length;
- txp->tx_txreq.offset = (uintptr_t)txp->tx_bufp & PAGEOFFSET;
- txp->tx_txreq.flags = 0;
- mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &pflags);
- if (pflags != 0) {
- /*
- * If the local protocol stack requests checksum
- * offload we set the 'checksum blank' flag,
- * indicating to the peer that we need the checksum
- * calculated for us.
- *
- * We _don't_ set the validated flag, because we haven't
- * validated that the data and the checksum match.
- */
- xnf_pseudo_cksum(txp->tx_bufp, length);
- txp->tx_txreq.flags |= NETTXF_csum_blank;
+ ddi_dma_nextcookie(dma_handle, &dma_cookie);
+ nsegs++;
- xnfp->xnf_stat_tx_cksum_deferred++;
+ if (tail != NULL)
+ tail->tx_txreq.flags = NETTXF_more_data;
+ tail = txp;
}
+ }
- if (head == NULL) {
- ASSERT(tail == NULL);
-
- head = txp;
- } else {
- ASSERT(tail != NULL);
-
- tail->tx_next = txp;
- }
- tail = txp;
+ *countp = nsegs;
+ return (head);
- mp = mp->b_next;
- prepared++;
+error:
+ xnf_data_txbuf_free_chain(xnfp, head);
+ return (NULL);
+}
+static void
+xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head,
+ uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss)
+{
+ if (lso_flags != 0) {
+ ASSERT3U(lso_flags, ==, HW_LSO);
+ ASSERT3P(head->tx_bdesc, ==, NULL);
+
+ head->tx_txreq.flags |= NETTXF_extra_info;
+ netif_extra_info_t *extra = &head->tx_extra;
+ extra->type = XEN_NETIF_EXTRA_TYPE_GSO;
+ extra->flags = 0;
+ extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
+ extra->u.gso.size = mss;
+ extra->u.gso.features = 0;
+ extra->u.gso.pad = 0;
+ } else if (cksum_flags != 0) {
+ ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
/*
- * There is no point in preparing more than
- * NET_TX_RING_SIZE, as we won't be able to push them
- * into the ring in one go and would hence have to
- * un-prepare the extra.
+ * If the local protocol stack requests checksum
+ * offload we set the 'checksum blank' flag,
+ * indicating to the peer that we need the checksum
+ * calculated for us.
+ *
+ * We _don't_ set the validated flag, because we haven't
+ * validated that the data and the checksum match.
+ *
+ * Note: we already called xnf_pseudo_cksum() in
+ * xnf_send(), so we just set the txreq flag here.
*/
- if (prepared == NET_TX_RING_SIZE)
- break;
+ head->tx_txreq.flags |= NETTXF_csum_blank;
+ xnfp->xnf_stat_tx_cksum_deferred++;
}
+}
- DTRACE_PROBE1(xnf_send_prepared, int, prepared);
+/*
+ * Send packet mp. Called by the MAC framework.
+ */
+static mblk_t *
+xnf_send(void *arg, mblk_t *mp)
+{
+ xnf_t *xnfp = arg;
+ xnf_txbuf_t *head;
+ mblk_t *ml;
+ int length;
+ int pages, chunks, slots, slots_free;
+ uint32_t cksum_flags, lso_flags, mss;
+ boolean_t pulledup = B_FALSE;
+ boolean_t force_copy = B_FALSE;
- if (mp != NULL) {
-#ifdef XNF_DEBUG
- int notprepared = 0;
- mblk_t *l = mp;
+ ASSERT3P(mp->b_next, ==, NULL);
- while (l != NULL) {
- notprepared++;
- l = l->b_next;
- }
+ mutex_enter(&xnfp->xnf_txlock);
+
+ /*
+ * Wait until we are connected to the backend.
+ */
+ while (!xnfp->xnf_connected)
+ cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
+
+ /*
+ * To simplify logic and be in sync with the rescheduling mechanism,
+ * we require the maximum amount of slots that could be used by a
+ * transaction to be free before proceeding. The only downside of doing
+ * this is that it slightly reduces the effective size of the ring.
+ */
+ slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE);
+ if (slots_free < XEN_MAX_SLOTS_PER_TX) {
+ /*
+ * We need to ask for a re-schedule later as the ring is full.
+ */
+ mutex_enter(&xnfp->xnf_schedlock);
+ xnfp->xnf_need_sched = B_TRUE;
+ mutex_exit(&xnfp->xnf_schedlock);
- DTRACE_PROBE1(xnf_send_notprepared, int, notprepared);
-#else /* !XNF_DEBUG */
- DTRACE_PROBE1(xnf_send_notprepared, int, -1);
-#endif /* XNF_DEBUG */
+ xnfp->xnf_stat_tx_defer++;
+ mutex_exit(&xnfp->xnf_txlock);
+ return (mp);
}
/*
- * Push the packets we have prepared into the ring. They may
- * not all go.
+ * Get hw offload parameters.
+ * This must be done before pulling up the mp as those parameters
+ * are not copied over.
*/
- if (head != NULL)
- head = tx_push_packets(xnfp, head);
+ mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags);
+ mac_lso_get(mp, &mss, &lso_flags);
/*
- * If some packets that we prepared were not sent, unprepare
- * them and add them back to the head of those we didn't
- * prepare.
+ * XXX: fix MAC framework so that we can advertise support for
+ * partial checksum for IPv4 only. This way we won't need to calculate
+ * the pseudo header checksum ourselves.
*/
- {
- xnf_txbuf_t *loop;
- mblk_t *mp_head, *mp_tail;
- int unprepared = 0;
-
- mp_head = mp_tail = NULL;
- loop = head;
-
- while (loop != NULL) {
- xnf_txbuf_t *next = loop->tx_next;
-
- if (loop->tx_bdesc == NULL) {
- (void) gnttab_end_foreign_access_ref(
- loop->tx_txreq.gref, 1);
- gref_put(xnfp, loop->tx_txreq.gref);
- (void) ddi_dma_unbind_handle(
- loop->tx_dma_handle);
- } else {
- xnf_buf_put(xnfp, loop->tx_bdesc, B_TRUE);
- }
+ if (cksum_flags != 0) {
+ ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
+ (void) xnf_pseudo_cksum(mp);
+ }
- ASSERT(loop->tx_mp != NULL);
- if (mp_head == NULL)
- mp_head = loop->tx_mp;
- mp_tail = loop->tx_mp;
+pulledup:
+ for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL;
+ ml = ml->b_cont, chunks++) {
+ pages += xnf_mblk_pages(ml);
+ length += MBLKL(ml);
+ }
+ DTRACE_PROBE3(packet, int, length, int, chunks, int, pages);
+ DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss);
- kmem_cache_free(xnfp->xnf_tx_buf_cache, loop);
- loop = next;
- unprepared++;
- }
+ /*
+ * If the ethernet header crosses a page boundary the packet
+ * will be dropped by the backend. In practice it seems like
+ * this happens fairly rarely so we'll do nothing unless the
+ * packet is small enough to fit in a look-aside buffer.
+ */
+ if (((uintptr_t)mp->b_rptr & PAGEOFFSET) +
+ sizeof (struct ether_header) > PAGESIZE) {
+ xnfp->xnf_stat_tx_eth_hdr_split++;
+ if (length <= PAGESIZE)
+ force_copy = B_TRUE;
+ }
- if (mp_tail == NULL) {
- ASSERT(mp_head == NULL);
- } else {
- ASSERT(mp_head != NULL);
+ if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) {
+ /*
+ * If the packet spans several pages and scatter-gather is not
+ * supported then use a look-aside buffer.
+ */
+ ASSERT3U(length, <=, PAGESIZE);
+ head = xnf_mblk_copy(xnfp, mp);
+ if (head == NULL) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN,
+ "xnf_mblk_copy() failed");
+ goto drop;
+ }
+ } else {
+ /*
+ * There's a limit for how many pages can be passed to the
+ * backend. If we pass that limit, the packet will be dropped
+ * and some backend implementations (e.g. Linux) could even
+ * offline the interface.
+ */
+ if (pages > XEN_MAX_TX_DATA_PAGES) {
+ if (pulledup) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN,
+ "too many pages, even after pullup: %d.",
+ pages);
+ goto drop;
+ }
- mp_tail->b_next = mp;
- mp = mp_head;
+ /*
+ * Defragment packet if it spans too many pages.
+ */
+ mblk_t *newmp = msgpullup(mp, -1);
+ freemsg(mp);
+ mp = newmp;
+ xnfp->xnf_stat_tx_pullup++;
+ pulledup = B_TRUE;
+ goto pulledup;
}
- DTRACE_PROBE1(xnf_send_unprepared, int, unprepared);
+ head = xnf_mblk_map(xnfp, mp, &slots);
+ if (head == NULL)
+ goto drop;
+
+ IMPLY(slots > 1, xnfp->xnf_be_tx_sg);
}
/*
- * If any mblks are left then we have deferred for some reason
- * and need to ask for a re-schedule later. This is typically
- * due to the ring filling.
+ * Set tx_mp so that mblk is freed when the txbuf chain is freed.
*/
- if (mp != NULL) {
- mutex_enter(&xnfp->xnf_schedlock);
- xnfp->xnf_need_sched = B_TRUE;
- mutex_exit(&xnfp->xnf_schedlock);
+ head->tx_mp = mp;
- xnfp->xnf_stat_tx_defer++;
- }
+ xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss);
+
+ /*
+ * The first request must store the total length of the packet.
+ */
+ head->tx_txreq.size = length;
+
+ /*
+ * Push the packet we have prepared into the ring.
+ */
+ xnf_tx_push_packet(xnfp, head);
+ xnfp->xnf_stat_opackets++;
+ xnfp->xnf_stat_obytes += length;
+
+ mutex_exit(&xnfp->xnf_txlock);
+ return (NULL);
- return (mp);
+drop:
+ freemsg(mp);
+ xnfp->xnf_stat_tx_drop++;
+ mutex_exit(&xnfp->xnf_txlock);
+ return (NULL);
}
/*
@@ -1834,9 +2145,9 @@ xnf_intr(caddr_t arg)
int free_slots;
mutex_enter(&xnfp->xnf_txlock);
- free_slots = tx_slots_get(xnfp, 0, B_FALSE);
+ free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE);
- if (need_sched && (free_slots > 0)) {
+ if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) {
mutex_enter(&xnfp->xnf_schedlock);
xnfp->xnf_need_sched = B_FALSE;
mutex_exit(&xnfp->xnf_schedlock);
@@ -1922,74 +2233,126 @@ xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
}
/*
- * Collect packets from the RX ring, storing them in `xnfp' for later
- * use.
+ * Receive an entire packet from the ring, starting from slot *consp.
+ * prod indicates the slot of the latest response.
+ * On return, *consp will point to the head of the next packet.
+ *
+ * Note: If slot prod was reached before we could gather a full packet, we will
+ * drop the partial packet; this would most likely indicate a bug in either
+ * the front-end or the back-end driver.
+ *
+ * An rx packet can consist of several fragments and thus span multiple slots.
+ * Each fragment can contain up to 4k of data.
+ *
+ * A typical 9000 MTU packet will look like this:
+ * +------+---------------------+-------------------+-----------------------+
+ * | SLOT | TYPE | CONTENTS | FLAGS |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 1 | netif_rx_response_t | 1st data fragment | more_data |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 2 | netif_rx_response_t | 2nd data fragment | more_data |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 3 | netif_rx_response_t | 3rd data fragment | [none] |
+ * +------+---------------------+-------------------+-----------------------+
+ *
+ * Fragments are chained by setting NETRXF_more_data in the previous
+ * response's flags. If there are additional flags, such as
+ * NETRXF_data_validated or NETRXF_extra_info, those should be set on the
+ * first fragment.
+ *
+ * Sometimes extra info can be present. If so, it will follow the first
+ * fragment, and NETRXF_extra_info flag will be set on the first response.
+ * If LRO is set on a packet, it will be stored in the extra info. Conforming
+ * to the spec, extra info can also be chained, but must all be present right
+ * after the first fragment.
+ *
+ * Example of a packet with 2 extra infos:
+ * +------+---------------------+-------------------+-----------------------+
+ * | SLOT | TYPE | CONTENTS | FLAGS |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 1 | netif_rx_response_t | 1st data fragment | extra_info, more_data |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 2 | netif_extra_info_t | 1st extra info | EXTRA_FLAG_MORE |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 3 | netif_extra_info_t | 2nd extra info | [none] |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 4 | netif_rx_response_t | 2nd data fragment | more_data |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 5 | netif_rx_response_t | 3rd data fragment | more_data |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 6 | netif_rx_response_t | 4th data fragment | [none] |
+ * +------+---------------------+-------------------+-----------------------+
+ *
+ * In practice, the only extra we expect is for LRO, but only if we advertise
+ * that we support it to the backend (xnf_enable_lro == TRUE).
*/
-static void
-xnf_rx_collect(xnf_t *xnfp)
+static int
+xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp)
{
- mblk_t *head, *tail;
+ mblk_t *head = NULL;
+ mblk_t *tail = NULL;
+ mblk_t *mp;
+ int error = 0;
+ RING_IDX cons = *consp;
+ netif_extra_info_t lro;
+ boolean_t is_lro = B_FALSE;
+ boolean_t is_extra = B_FALSE;
- ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
+ netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
- /*
- * Loop over unconsumed responses:
- * 1. get a response
- * 2. take corresponding buffer off recv. ring
- * 3. indicate this by setting slot to NULL
- * 4. create a new message and
- * 5. copy data in, adjust ptr
- */
+ boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0;
+ boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0;
+ boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0;
- head = tail = NULL;
+ IMPLY(more_data, xnf_enable_rx_sg);
- while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
- netif_rx_response_t *rxpkt;
+ while (cons != prod) {
xnf_buf_t *bdesc;
- ssize_t len;
- size_t off;
- mblk_t *mp = NULL;
- boolean_t hwcsum = B_FALSE;
- grant_ref_t ref;
+ int len, off;
+ int rxidx = cons & (NET_RX_RING_SIZE - 1);
+
+ bdesc = xnfp->xnf_rx_pkt_info[rxidx];
+ xnfp->xnf_rx_pkt_info[rxidx] = NULL;
+
+ if (is_extra) {
+ netif_extra_info_t *extra = (netif_extra_info_t *)&rsp;
+ /*
+ * The only extra we expect is for LRO, and it should
+ * only be present once.
+ */
+ if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO &&
+ !is_lro) {
+ ASSERT(xnf_enable_lro);
+ lro = *extra;
+ is_lro = B_TRUE;
+ DTRACE_PROBE1(lro, netif_extra_info_t *, &lro);
+ } else {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet "
+ "contains unexpected extra info of type %d",
+ extra->type);
+ error = EINVAL;
+ }
+ more_extra =
+ (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0;
- /* 1. */
- rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
- xnfp->xnf_rx_ring.rsp_cons);
+ goto hang_buf;
+ }
- DTRACE_PROBE4(xnf_rx_got_rsp, int, (int)rxpkt->id,
- int, (int)rxpkt->offset,
- int, (int)rxpkt->flags,
- int, (int)rxpkt->status);
+ ASSERT3U(bdesc->id, ==, rsp.id);
/*
- * 2.
+ * status stores packet length when >= 0, or errors when < 0.
*/
- bdesc = xnfp->xnf_rx_pkt_info[rxpkt->id];
+ len = rsp.status;
+ off = rsp.offset;
+ more_data = (rsp.flags & NETRXF_more_data) != 0;
/*
- * 3.
+ * sanity checks.
*/
- xnfp->xnf_rx_pkt_info[rxpkt->id] = NULL;
- ASSERT(bdesc->id == rxpkt->id);
-
- ref = bdesc->grant_ref;
- off = rxpkt->offset;
- len = rxpkt->status;
-
if (!xnfp->xnf_running) {
- DTRACE_PROBE4(xnf_rx_not_running,
- int, rxpkt->status,
- char *, bdesc->buf, int, rxpkt->offset,
- char *, ((char *)bdesc->buf) + rxpkt->offset);
-
- xnfp->xnf_stat_drop++;
-
+ error = EBUSY;
} else if (len <= 0) {
- DTRACE_PROBE4(xnf_rx_pkt_status_negative,
- int, rxpkt->status,
- char *, bdesc->buf, int, rxpkt->offset,
- char *, ((char *)bdesc->buf) + rxpkt->offset);
-
xnfp->xnf_stat_errrx++;
switch (len) {
@@ -2003,148 +2366,204 @@ xnf_rx_collect(xnf_t *xnfp)
xnfp->xnf_stat_norxbuf++;
break;
}
-
+ error = EINVAL;
} else if (bdesc->grant_ref == INVALID_GRANT_REF) {
- cmn_err(CE_WARN, "Bad rx grant reference %d "
- "from domain %d", ref,
- xvdi_get_oeid(xnfp->xnf_devinfo));
-
+ dev_err(xnfp->xnf_devinfo, CE_WARN,
+ "Bad rx grant reference, rsp id %d", rsp.id);
+ error = EINVAL;
} else if ((off + len) > PAGESIZE) {
- cmn_err(CE_WARN, "Rx packet overflows page "
- "(offset %ld, length %ld) from domain %d",
- off, len, xvdi_get_oeid(xnfp->xnf_devinfo));
- } else {
- xnf_buf_t *nbuf = NULL;
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses "
+ "page boundary (offset %d, length %d)", off, len);
+ error = EINVAL;
+ }
- DTRACE_PROBE4(xnf_rx_packet, int, len,
- char *, bdesc->buf, int, off,
- char *, ((char *)bdesc->buf) + off);
+ if (error != 0) {
+ /*
+ * If an error has been detected, we do not attempt
+ * to read the data but we still need to replace
+ * the rx bufs.
+ */
+ goto hang_buf;
+ }
- ASSERT(off + len <= PAGEOFFSET);
+ xnf_buf_t *nbuf = NULL;
+
+ /*
+ * If the packet is below a pre-determined size we will
+ * copy data out of the buf rather than replace it.
+ */
+ if (len > xnf_rx_copy_limit)
+ nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
- if (rxpkt->flags & NETRXF_data_validated)
- hwcsum = B_TRUE;
+ if (nbuf != NULL) {
+ mp = desballoc((unsigned char *)bdesc->buf,
+ bdesc->len, 0, &bdesc->free_rtn);
+
+ if (mp == NULL) {
+ xnfp->xnf_stat_rx_desballoc_fail++;
+ xnfp->xnf_stat_norxbuf++;
+ error = ENOMEM;
+ /*
+ * we free the buf we just allocated as we
+ * will re-hang the old buf.
+ */
+ xnf_buf_put(xnfp, nbuf, B_FALSE);
+ goto hang_buf;
+ }
+
+ mp->b_rptr = mp->b_rptr + off;
+ mp->b_wptr = mp->b_rptr + len;
/*
- * If the packet is below a pre-determined
- * size we will copy data out rather than
- * replace it.
+ * Release the grant as the backend doesn't need to
+ * access this buffer anymore and grants are scarce.
*/
- if (len > xnf_rx_copy_limit)
- nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
+ (void) gnttab_end_foreign_access_ref(bdesc->grant_ref,
+ 0);
+ xnf_gref_put(xnfp, bdesc->grant_ref);
+ bdesc->grant_ref = INVALID_GRANT_REF;
+ bdesc = nbuf;
+ } else {
/*
- * If we have a replacement buffer, attempt to
- * wrap the existing one with an mblk_t in
- * order that the upper layers of the stack
- * might use it directly.
+ * We failed to allocate a new buf or decided to reuse
+ * the old one. In either case we copy the data off it
+ * and put it back into the ring.
*/
- if (nbuf != NULL) {
- mp = desballoc((unsigned char *)bdesc->buf,
- bdesc->len, 0, &bdesc->free_rtn);
- if (mp == NULL) {
- xnfp->xnf_stat_rx_desballoc_fail++;
- xnfp->xnf_stat_norxbuf++;
-
- xnf_buf_put(xnfp, nbuf, B_FALSE);
- nbuf = NULL;
- } else {
- mp->b_rptr = mp->b_rptr + off;
- mp->b_wptr = mp->b_rptr + len;
-
- /*
- * Release the grant reference
- * associated with this buffer
- * - they are scarce and the
- * upper layers of the stack
- * don't need it.
- */
- (void) gnttab_end_foreign_access_ref(
- bdesc->grant_ref, 0);
- gref_put(xnfp, bdesc->grant_ref);
- bdesc->grant_ref = INVALID_GRANT_REF;
-
- bdesc = nbuf;
- }
- }
-
- if (nbuf == NULL) {
- /*
- * No replacement buffer allocated -
- * attempt to copy the data out and
- * re-hang the existing buffer.
- */
-
- /* 4. */
- mp = allocb(len, BPRI_MED);
- if (mp == NULL) {
- xnfp->xnf_stat_rx_allocb_fail++;
- xnfp->xnf_stat_norxbuf++;
- } else {
- /* 5. */
- bcopy(bdesc->buf + off, mp->b_wptr,
- len);
- mp->b_wptr += len;
- }
+ mp = allocb(len, 0);
+ if (mp == NULL) {
+ xnfp->xnf_stat_rx_allocb_fail++;
+ xnfp->xnf_stat_norxbuf++;
+ error = ENOMEM;
+ goto hang_buf;
}
+ bcopy(bdesc->buf + off, mp->b_wptr, len);
+ mp->b_wptr += len;
}
- /* Re-hang the buffer. */
+ if (head == NULL)
+ head = mp;
+ else
+ tail->b_cont = mp;
+ tail = mp;
+
+hang_buf:
+ /*
+ * No matter what happens, for each response we need to hang
+ * a new buf on the rx ring. Put either the old one, or a new
+ * one if the old one is borrowed by the kernel via desballoc().
+ */
xnf_rxbuf_hang(xnfp, bdesc);
+ cons++;
- if (mp != NULL) {
- if (hwcsum) {
- /*
- * If the peer says that the data has
- * been validated then we declare that
- * the full checksum has been
- * verified.
- *
- * We don't look at the "checksum
- * blank" flag, and hence could have a
- * packet here that we are asserting
- * is good with a blank checksum.
- */
- mac_hcksum_set(mp, 0, 0, 0, 0,
- HCK_FULLCKSUM_OK);
- xnfp->xnf_stat_rx_cksum_no_need++;
- }
- if (head == NULL) {
- ASSERT(tail == NULL);
+ /* next response is an extra */
+ is_extra = more_extra;
- head = mp;
- } else {
- ASSERT(tail != NULL);
+ if (!more_data && !more_extra)
+ break;
- tail->b_next = mp;
- }
- tail = mp;
+ /*
+ * Note that since requests and responses are union'd on the
+ * same ring, we copy the response to a local variable instead
+ * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have
+ * overwritten contents of rsp.
+ */
+ rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
+ }
- ASSERT(mp->b_next == NULL);
+ /*
+ * Check that we do not get stuck in a loop.
+ */
+ ASSERT3U(*consp, !=, cons);
+ *consp = cons;
- xnfp->xnf_stat_ipackets++;
- xnfp->xnf_stat_rbytes += len;
- }
+ /*
+ * We ran out of responses but the flags indicate there is more data.
+ */
+ if (more_data) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments.");
+ error = EINVAL;
+ }
+ if (more_extra) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments "
+ "(extras).");
+ error = EINVAL;
+ }
- xnfp->xnf_rx_ring.rsp_cons++;
+ /*
+ * An error means the packet must be dropped. If we have already formed
+ * a partial packet, then discard it.
+ */
+ if (error != 0) {
+ if (head != NULL)
+ freemsg(head);
+ xnfp->xnf_stat_rx_drop++;
+ return (error);
}
+ ASSERT(head != NULL);
+
+ if (hwcsum) {
+ /*
+ * If the peer says that the data has been validated then we
+ * declare that the full checksum has been verified.
+ *
+ * We don't look at the "checksum blank" flag, and hence could
+ * have a packet here that we are asserting is good with
+ * a blank checksum.
+ */
+ mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
+ xnfp->xnf_stat_rx_cksum_no_need++;
+ }
+
+ /* XXX: set lro info for packet once LRO is supported in OS. */
+
+ *mpp = head;
+
+ return (0);
+}
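The borrow-or-copy logic above leans on desballoc(9F): buffers larger than
xnf_rx_copy_limit are loaned to the stack and come back through the buffer's
free routine, while smaller ones are bcopy'd so the original can be re-hung
immediately. A minimal sketch of the loaning pattern (my_buf_t and
my_buf_recycle are hypothetical stand-ins for xnf_buf_t and its recycle
callback):

    #include <sys/types.h>
    #include <sys/stream.h>

    typedef struct my_buf {
    	frtn_t	mb_frtn;	/* must outlive the loaned mblk */
    	caddr_t	mb_base;
    	size_t	mb_len;
    } my_buf_t;

    static void
    my_buf_recycle(caddr_t arg)
    {
    	my_buf_t *mbp = (my_buf_t *)arg;
    	/* Return mbp to the driver's cache/free list here. */
    }

    static mblk_t *
    my_buf_loan(my_buf_t *mbp)
    {
    	mbp->mb_frtn.free_func = my_buf_recycle;
    	mbp->mb_frtn.free_arg = (caddr_t)mbp;
    	/* The stack invokes my_buf_recycle() when it frees the mblk. */
    	return (desballoc((unsigned char *)mbp->mb_base, mbp->mb_len,
    	    0, &mbp->mb_frtn));
    }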
+
+/*
+ * Collect packets from the RX ring, storing them in `xnfp' for later use.
+ */
+static void
+xnf_rx_collect(xnf_t *xnfp)
+{
+ RING_IDX prod;
+
+ ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
+
+ prod = xnfp->xnf_rx_ring.sring->rsp_prod;
/*
- * Store the mblks we have collected.
+ * Ensure we see queued responses up to 'prod'.
*/
- if (head != NULL) {
- ASSERT(tail != NULL);
+ membar_consumer();
- if (xnfp->xnf_rx_head == NULL) {
- ASSERT(xnfp->xnf_rx_tail == NULL);
+ while (xnfp->xnf_rx_ring.rsp_cons != prod) {
+ mblk_t *mp;
- xnfp->xnf_rx_head = head;
- } else {
- ASSERT(xnfp->xnf_rx_tail != NULL);
+ /*
+ * Collect a packet.
+ * rsp_cons is updated inside xnf_rx_one_packet().
+ */
+ int error = xnf_rx_one_packet(xnfp, prod,
+ &xnfp->xnf_rx_ring.rsp_cons, &mp);
+ if (error == 0) {
+ xnfp->xnf_stat_ipackets++;
+ xnfp->xnf_stat_rbytes += xmsgsize(mp);
- xnfp->xnf_rx_tail->b_next = head;
+ /*
+ * Append the mblk to the rx list.
+ */
+ if (xnfp->xnf_rx_head == NULL) {
+ ASSERT3P(xnfp->xnf_rx_tail, ==, NULL);
+ xnfp->xnf_rx_head = mp;
+ } else {
+ ASSERT(xnfp->xnf_rx_tail != NULL);
+ xnfp->xnf_rx_tail->b_next = mp;
+ }
+ xnfp->xnf_rx_tail = mp;
}
- xnfp->xnf_rx_tail = tail;
}
}
@@ -2306,7 +2725,7 @@ xnf_release_mblks(xnf_t *xnfp)
ASSERT(txp->tx_mp != NULL);
freemsg(txp->tx_mp);
- txid_put(xnfp, tidp);
+ xnf_txid_put(xnfp, tidp);
kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
}
}
@@ -2326,7 +2745,7 @@ xnf_buf_constructor(void *buf, void *arg, int kmflag)
ddiflags = DDI_DMA_DONTWAIT;
/* Allocate a DMA access handle for the buffer. */
- if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
+ if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr,
ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
goto failure;
@@ -2391,17 +2810,17 @@ xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
* Usually grant references are more scarce than memory, so we
* attempt to acquire a grant reference first.
*/
- gref = gref_get(xnfp);
+ gref = xnf_gref_get(xnfp);
if (gref == INVALID_GRANT_REF)
return (NULL);
bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
if (bufp == NULL) {
- gref_put(xnfp, gref);
+ xnf_gref_put(xnfp, gref);
return (NULL);
}
- ASSERT(bufp->grant_ref == INVALID_GRANT_REF);
+ ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF);
bufp->grant_ref = gref;
@@ -2423,7 +2842,7 @@ xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
if (bufp->grant_ref != INVALID_GRANT_REF) {
(void) gnttab_end_foreign_access_ref(
bufp->grant_ref, readonly ? 1 : 0);
- gref_put(xnfp, bufp->grant_ref);
+ xnf_gref_put(xnfp, bufp->grant_ref);
bufp->grant_ref = INVALID_GRANT_REF;
}
@@ -2464,7 +2883,7 @@ xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
if (kmflag & KM_NOSLEEP)
ddiflags = DDI_DMA_DONTWAIT;
- if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
+ if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr,
ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
return (-1);
@@ -2491,8 +2910,9 @@ static char *xnf_aux_statistics[] = {
"interrupts",
"unclaimed_interrupts",
"tx_pullup",
- "tx_pagebndry",
- "tx_attempt",
+ "tx_lookaside",
+ "tx_drop",
+ "tx_eth_hdr_split",
"buf_allocated",
"buf_outstanding",
"gref_outstanding",
@@ -2524,8 +2944,9 @@ xnf_kstat_aux_update(kstat_t *ksp, int flag)
(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
- (knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
- (knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
+ (knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside;
+ (knp++)->value.ui64 = xnfp->xnf_stat_tx_drop;
+ (knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split;
(knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
(knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
@@ -2629,10 +3050,94 @@ xnf_stat(void *arg, uint_t stat, uint64_t *val)
return (0);
}
+static int
+xnf_change_mtu(xnf_t *xnfp, uint32_t mtu)
+{
+ if (mtu > ETHERMTU) {
+ if (!xnf_enable_tx_sg) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
+ "because scatter-gather is disabled for transmit "
+ "in driver settings", ETHERMTU);
+ return (EINVAL);
+ } else if (!xnf_enable_rx_sg) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
+ "because scatter-gather is disabled for receive "
+ "in driver settings", ETHERMTU);
+ return (EINVAL);
+ } else if (!xnfp->xnf_be_tx_sg) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
+ "because backend doesn't support scatter-gather",
+ ETHERMTU);
+ return (EINVAL);
+ }
+ if (mtu > XNF_MAXPKT)
+ return (EINVAL);
+ }
+ int error = mac_maxsdu_update(xnfp->xnf_mh, mtu);
+ if (error == 0)
+ xnfp->xnf_mtu = mtu;
+
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
+ uint_t prop_val_size, void *prop_val)
+{
+ xnf_t *xnfp = data;
+
+ switch (prop_id) {
+ case MAC_PROP_MTU:
+ ASSERT(prop_val_size >= sizeof (uint32_t));
+ bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t));
+ break;
+ default:
+ return (ENOTSUP);
+ }
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
+ uint_t prop_val_size, const void *prop_val)
+{
+ xnf_t *xnfp = data;
+ uint32_t new_mtu;
+ int error;
+
+ switch (prop_id) {
+ case MAC_PROP_MTU:
+ ASSERT(prop_val_size >= sizeof (uint32_t));
+ bcopy(prop_val, &new_mtu, sizeof (new_mtu));
+ error = xnf_change_mtu(xnfp, new_mtu);
+ break;
+ default:
+ return (ENOTSUP);
+ }
+
+ return (error);
+}
+
+/*ARGSUSED*/
+static void
+xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id,
+ mac_prop_info_handle_t prop_handle)
+{
+ switch (prop_id) {
+ case MAC_PROP_MTU:
+ mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT);
+ break;
+ default:
+ break;
+ }
+}
+
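With MTU exposed as a MAC property, it becomes administrable through the
standard dladm interface once the driver registers with mac. A usage sketch
(the link name xnf0 is illustrative):

    # dladm show-linkprop -p mtu xnf0
    # dladm set-linkprop -p mtu=9000 xnf0

Per xnf_change_mtu() above, values over 1500 succeed only when scatter-gather
is enabled in both the driver tunables and the backend.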
static boolean_t
xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
- _NOTE(ARGUNUSED(arg));
+ xnf_t *xnfp = arg;
switch (cap) {
case MAC_CAPAB_HCKSUM: {
@@ -2656,6 +3161,21 @@ xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
* before passing the packet to the IO domain.
*/
*capab = HCKSUM_INET_FULL_V4;
+
+ /*
+ * TODO: query the "feature-ipv6-csum-offload" capability.
+ * If enabled, that could allow us to use HCKSUM_INET_PARTIAL.
+ */
+
+ break;
+ }
+ case MAC_CAPAB_LSO: {
+ if (!xnfp->xnf_be_lso)
+ return (B_FALSE);
+
+ mac_capab_lso_t *lso = cap_data;
+ lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
+ lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET;
break;
}
default:
@@ -2710,6 +3230,13 @@ oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
*/
mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
+ /*
+ * We do not know if some features such as LSO are supported
+ * until we connect to the backend. We request the MAC layer
+ * to poll our capabilities again.
+ */
+ mac_capab_update(xnfp->xnf_mh);
+
break;
case XenbusStateConnected:
diff --git a/usr/src/uts/common/xen/io/xnf.h b/usr/src/uts/common/xen/io/xnf.h
index 0c8eb2e373..63ce31020f 100644
--- a/usr/src/uts/common/xen/io/xnf.h
+++ b/usr/src/uts/common/xen/io/xnf.h
@@ -24,6 +24,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
+ */
+
#ifndef _SYS_XNF_H
#define _SYS_XNF_H
@@ -31,10 +35,19 @@
extern "C" {
#endif
+/*
+ * As of April 2017, TX and RX ring sizes are fixed at one page each and
+ * Xen doesn't support changing them; this yields 256 entries per ring.
+ */
#define NET_TX_RING_SIZE __CONST_RING_SIZE(netif_tx, PAGESIZE)
#define NET_RX_RING_SIZE __CONST_RING_SIZE(netif_rx, PAGESIZE)
-#define XNF_MAXPKT 1500 /* MTU size */
+/*
+ * There is no hard MTU limit; however, for all practical purposes hardware
+ * won't support anything much larger than 9k, so we impose an arbitrary
+ * 16k limit.
+ */
+#define XNF_MAXPKT 16384
#define XNF_FRAMESIZE 1514 /* frame size including MAC header */
/* DEBUG flags */
@@ -42,6 +55,18 @@ extern "C" {
#define XNF_DEBUG_TRACE 0x02
/*
+ * Based on XEN_NETIF_NR_SLOTS_MIN in Linux. Packets that span more pages
+ * than this must be defragmented or dropped.
+ */
+#define XEN_MAX_TX_DATA_PAGES 18
+/*
+ * We keep one extra slot for the LSO extra info.
+ */
+#define XEN_MAX_SLOTS_PER_TX (XEN_MAX_TX_DATA_PAGES + 1)
+
+#define XEN_DATA_BOUNDARY 0x1000
+
+/*
* Information about each receive buffer and any transmit look-aside
* buffers.
*/
@@ -63,23 +88,41 @@ typedef struct xnf_buf {
/*
* Information about each transmit buffer.
*/
+typedef enum xnf_txbuf_type {
+ TX_DATA = 1,
+ TX_MCAST_REQ,
+ TX_MCAST_RSP
+} xnf_txbuf_type_t;
+
+/*
+ * A xnf_txbuf is used to store ancillary data for a netif_tx_request_t.
+ * A tx packet can span multiple xnf_txbuf's, linked together through tx_next
+ * and tx_prev; tx_head points to the head of the chain.
+ */
typedef struct xnf_txbuf {
struct xnf_txbuf *tx_next;
- mblk_t *tx_mp; /* mblk associated with packet */
+ struct xnf_txbuf *tx_prev;
+ struct xnf_txbuf *tx_head;
+ xnf_txbuf_type_t tx_type;
netif_tx_request_t tx_txreq;
- caddr_t tx_bufp;
+ netif_extra_info_t tx_extra;
+ /* Used for TX_DATA types */
ddi_dma_handle_t tx_dma_handle;
- mfn_t tx_mfn;
+ boolean_t tx_handle_bound;
+ mblk_t *tx_mp;
xnf_buf_t *tx_bdesc; /* Look-aside buffer, if used. */
- unsigned char tx_type;
+ int tx_frags_to_ack;
+ /* Used for TX_MCAST types */
int16_t tx_status;
+ /* Used for debugging */
+ mfn_t tx_mfn;
RING_IDX tx_slot;
-
-#define TX_DATA 1
-#define TX_MCAST_REQ 2
-#define TX_MCAST_RSP 3
} xnf_txbuf_t;
+#define TXBUF_SETNEXT(head, next)	\
+	do {	\
+		(head)->tx_next = (next);	\
+		(next)->tx_prev = (head);	\
+	} while (0)
+
/*
* Information about each outstanding transmit operation.
*/
@@ -97,6 +140,7 @@ typedef struct xnf {
dev_info_t *xnf_devinfo;
mac_handle_t xnf_mh;
unsigned char xnf_mac_addr[ETHERADDRL];
+ uint32_t xnf_mtu;
unsigned int xnf_gen; /* Increments on resume. */
@@ -105,17 +149,20 @@ typedef struct xnf {
boolean_t xnf_be_rx_copy;
boolean_t xnf_be_mcast_control;
+ boolean_t xnf_be_tx_sg;
+ boolean_t xnf_be_lso;
uint64_t xnf_stat_interrupts;
uint64_t xnf_stat_unclaimed_interrupts;
uint64_t xnf_stat_norxbuf;
- uint64_t xnf_stat_drop;
+ uint64_t xnf_stat_rx_drop;
uint64_t xnf_stat_errrx;
- uint64_t xnf_stat_tx_attempt;
uint64_t xnf_stat_tx_pullup;
- uint64_t xnf_stat_tx_pagebndry;
+ uint64_t xnf_stat_tx_lookaside;
uint64_t xnf_stat_tx_defer;
+ uint64_t xnf_stat_tx_drop;
+ uint64_t xnf_stat_tx_eth_hdr_split;
uint64_t xnf_stat_mac_rcv_error;
uint64_t xnf_stat_runt;
@@ -145,7 +192,7 @@ typedef struct xnf {
paddr_t xnf_tx_ring_phys_addr;
grant_ref_t xnf_tx_ring_ref;
- xnf_txid_t xnf_tx_pkt_id[NET_TX_RING_SIZE];
+ xnf_txid_t *xnf_tx_pkt_id;
uint16_t xnf_tx_pkt_id_head;
kmutex_t xnf_txlock;
kmutex_t xnf_schedlock;
@@ -159,7 +206,7 @@ typedef struct xnf {
paddr_t xnf_rx_ring_phys_addr;
grant_ref_t xnf_rx_ring_ref;
- xnf_buf_t *xnf_rx_pkt_info[NET_RX_RING_SIZE];
+ xnf_buf_t **xnf_rx_pkt_info;
kmutex_t xnf_rxlock;
mblk_t *xnf_rx_head;
mblk_t *xnf_rx_tail;