author	Yuri Pankov <yuri.pankov@nexenta.com>	2017-10-07 05:09:40 +0300
committer	Dan McDonald <danmcd@joyent.com>	2017-10-13 15:41:47 -0400
commit	9276b3991ba20d5a5660887ba81b0bc7bed25a0c (patch)
tree	ae01db1ba588a449d545b244bf36475e435fd5b5 /usr/src
parent	5ee44debdc8aa52cdcbf27fa252332a2403ef693 (diff)
download	illumos-joyent-9276b3991ba20d5a5660887ba81b0bc7bed25a0c.tar.gz
7186 xnf: panic on Xen 4.x
Contributed by: Frank Salzmann <frank@delphix.com>
Contributed by: Pavel Zakharov <pavel.zakharov@delphix.com>
Reviewed by: Toomas Soome <tsoome@me.com>
Reviewed by: Ken Mays <maybird1776@yahoo.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr/src')
-rw-r--r--	usr/src/uts/common/xen/io/xnf.c	1617
-rw-r--r--	usr/src/uts/common/xen/io/xnf.h	75
2 files changed, 1133 insertions(+), 559 deletions(-)
diff --git a/usr/src/uts/common/xen/io/xnf.c b/usr/src/uts/common/xen/io/xnf.c
index 2f895a33d7..e2475b5942 100644
--- a/usr/src/uts/common/xen/io/xnf.c
+++ b/usr/src/uts/common/xen/io/xnf.c
@@ -25,6 +25,10 @@
*/
/*
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
+ */
+
+/*
*
* Copyright (c) 2004 Christian Limpach.
* All rights reserved.
@@ -122,6 +126,8 @@
#include <sys/pattr.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
+#include <inet/tcp.h>
+#include <netinet/udp.h>
#include <sys/gld.h>
#include <sys/modctl.h>
#include <sys/mac_provider.h>
@@ -162,7 +168,9 @@ xnf_t *xnf_debug_instance = NULL;
*/
#define xnf_btop(addr) ((addr) >> PAGESHIFT)
-unsigned int xnf_max_tx_frags = 1;
+/*
+ * The parameters below should only be changed in /etc/system, never in mdb.
+ */
/*
* Should we use the multicast control feature if the backend provides
@@ -171,6 +179,32 @@ unsigned int xnf_max_tx_frags = 1;
boolean_t xnf_multicast_control = B_TRUE;
/*
+ * Should we allow scatter-gather for tx if backend allows it?
+ */
+boolean_t xnf_enable_tx_sg = B_TRUE;
+
+/*
+ * Should we allow scatter-gather for rx if backend allows it?
+ */
+boolean_t xnf_enable_rx_sg = B_TRUE;
+
+/*
+ * Should we allow LSO for tx sends if the backend allows it?
+ * Requires xnf_enable_tx_sg to also be set to B_TRUE.
+ */
+boolean_t xnf_enable_lso = B_TRUE;
+
+/*
+ * Should we allow LRO on rx if the backend supports it?
+ * Requires xnf_enable_rx_sg to also be set to B_TRUE.
+ *
+ * !! WARNING !!
+ * LRO is not yet supported in the OS so this should be left as FALSE.
+ * !! WARNING !!
+ */
+boolean_t xnf_enable_lro = B_FALSE;
+
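The tunables above are module globals; per the comment earlier in this hunk they
belong in /etc/system rather than being patched live with mdb. A minimal sketch
of the corresponding /etc/system entries (variable names as introduced by this
patch; the values shown are the defaults):

    * Tunables for the xnf driver; a reboot is required to take effect.
    set xnf:xnf_enable_tx_sg = 1
    set xnf:xnf_enable_rx_sg = 1
    set xnf:xnf_enable_lso = 1
    set xnf:xnf_enable_lro = 0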
+/*
* Received packets below this size are copied to a new streams buffer
* rather than being desballoc'ed.
*
@@ -194,7 +228,14 @@ size_t xnf_rx_copy_limit = 64;
#define INVALID_TX_ID ((uint16_t)-1)
#define TX_ID_TO_TXID(p, id) (&((p)->xnf_tx_pkt_id[(id)]))
-#define TX_ID_VALID(i) (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
+#define TX_ID_VALID(i) \
+ (((i) != INVALID_TX_ID) && ((i) < NET_TX_RING_SIZE))
+
+/*
+ * calculate how many pages are spanned by an mblk fragment
+ */
+#define xnf_mblk_pages(mp) (MBLKL(mp) == 0 ? 0 : \
+ xnf_btop((uintptr_t)mp->b_wptr - 1) - xnf_btop((uintptr_t)mp->b_rptr) + 1)
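As a quick check of the arithmetic: a fragment starting at byte 0xf00 of a page
with length 0x300 has its last byte at 0x11ff, so it spans two pages, while an
empty fragment spans none. A standalone sketch of the same computation
(assuming a 4K page, i.e. a PAGESHIFT of 12):

    #include <stdint.h>
    #include <stddef.h>

    #define	PAGESHIFT	12
    #define	btop(x)		((uintptr_t)(x) >> PAGESHIFT)

    /* Number of pages touched by [rptr, rptr + len); 0 when len == 0. */
    static size_t
    pages_spanned(uintptr_t rptr, size_t len)
    {
    	return (len == 0 ? 0 : btop(rptr + len - 1) - btop(rptr) + 1);
    }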
/* Required system entry points */
static int xnf_attach(dev_info_t *, ddi_attach_cmd_t);
@@ -210,6 +251,11 @@ static mblk_t *xnf_send(void *, mblk_t *);
static uint_t xnf_intr(caddr_t);
static int xnf_stat(void *, uint_t, uint64_t *);
static boolean_t xnf_getcapab(void *, mac_capab_t, void *);
+static int xnf_getprop(void *, const char *, mac_prop_id_t, uint_t, void *);
+static int xnf_setprop(void *, const char *, mac_prop_id_t, uint_t,
+ const void *);
+static void xnf_propinfo(void *, const char *, mac_prop_id_t,
+ mac_prop_info_handle_t);
/* Driver private functions */
static int xnf_alloc_dma_resources(xnf_t *);
@@ -229,17 +275,16 @@ static void xnf_buf_recycle(xnf_buf_t *);
static int xnf_tx_buf_constructor(void *, void *, int);
static void xnf_tx_buf_destructor(void *, void *);
-static grant_ref_t gref_get(xnf_t *);
-#pragma inline(gref_get)
-static void gref_put(xnf_t *, grant_ref_t);
-#pragma inline(gref_put)
+static grant_ref_t xnf_gref_get(xnf_t *);
+#pragma inline(xnf_gref_get)
+static void xnf_gref_put(xnf_t *, grant_ref_t);
+#pragma inline(xnf_gref_put)
-static xnf_txid_t *txid_get(xnf_t *);
-#pragma inline(txid_get)
-static void txid_put(xnf_t *, xnf_txid_t *);
-#pragma inline(txid_put)
+static xnf_txid_t *xnf_txid_get(xnf_t *);
+#pragma inline(xnf_txid_get)
+static void xnf_txid_put(xnf_t *, xnf_txid_t *);
+#pragma inline(xnf_txid_put)
-void xnf_send_driver_status(int, int);
static void xnf_rxbuf_hang(xnf_t *, xnf_buf_t *);
static int xnf_tx_clean_ring(xnf_t *);
static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
@@ -247,50 +292,69 @@ static void oe_state_change(dev_info_t *, ddi_eventcookie_t,
static boolean_t xnf_kstat_init(xnf_t *);
static void xnf_rx_collect(xnf_t *);
+#define XNF_CALLBACK_FLAGS (MC_GETCAPAB | MC_PROPERTIES)
+
static mac_callbacks_t xnf_callbacks = {
- MC_GETCAPAB,
- xnf_stat,
- xnf_start,
- xnf_stop,
- xnf_set_promiscuous,
- xnf_set_multicast,
- xnf_set_mac_addr,
- xnf_send,
- NULL,
- NULL,
- xnf_getcapab
+ .mc_callbacks = XNF_CALLBACK_FLAGS,
+ .mc_getstat = xnf_stat,
+ .mc_start = xnf_start,
+ .mc_stop = xnf_stop,
+ .mc_setpromisc = xnf_set_promiscuous,
+ .mc_multicst = xnf_set_multicast,
+ .mc_unicst = xnf_set_mac_addr,
+ .mc_tx = xnf_send,
+ .mc_getcapab = xnf_getcapab,
+ .mc_setprop = xnf_setprop,
+ .mc_getprop = xnf_getprop,
+ .mc_propinfo = xnf_propinfo,
};
/* DMA attributes for network ring buffer */
static ddi_dma_attr_t ringbuf_dma_attr = {
- DMA_ATTR_V0, /* version of this structure */
- 0, /* lowest usable address */
- 0xffffffffffffffffULL, /* highest usable address */
- 0x7fffffff, /* maximum DMAable byte count */
- MMU_PAGESIZE, /* alignment in bytes */
- 0x7ff, /* bitmap of burst sizes */
- 1, /* minimum transfer */
- 0xffffffffU, /* maximum transfer */
- 0xffffffffffffffffULL, /* maximum segment length */
- 1, /* maximum number of segments */
- 1, /* granularity */
- 0, /* flags (reserved) */
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0,
+ .dma_attr_addr_hi = 0xffffffffffffffffULL,
+ .dma_attr_count_max = 0x7fffffff,
+ .dma_attr_align = MMU_PAGESIZE,
+ .dma_attr_burstsizes = 0x7ff,
+ .dma_attr_minxfer = 1,
+ .dma_attr_maxxfer = 0xffffffffU,
+ .dma_attr_seg = 0xffffffffffffffffULL,
+ .dma_attr_sgllen = 1,
+ .dma_attr_granular = 1,
+ .dma_attr_flags = 0
+};
+
+/* DMA attributes for receive data */
+static ddi_dma_attr_t rx_buf_dma_attr = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0,
+ .dma_attr_addr_hi = 0xffffffffffffffffULL,
+ .dma_attr_count_max = MMU_PAGEOFFSET,
+ .dma_attr_align = MMU_PAGESIZE, /* allocation alignment */
+ .dma_attr_burstsizes = 0x7ff,
+ .dma_attr_minxfer = 1,
+ .dma_attr_maxxfer = 0xffffffffU,
+ .dma_attr_seg = 0xffffffffffffffffULL,
+ .dma_attr_sgllen = 1,
+ .dma_attr_granular = 1,
+ .dma_attr_flags = 0
};
-/* DMA attributes for transmit and receive data */
-static ddi_dma_attr_t buf_dma_attr = {
- DMA_ATTR_V0, /* version of this structure */
- 0, /* lowest usable address */
- 0xffffffffffffffffULL, /* highest usable address */
- 0x7fffffff, /* maximum DMAable byte count */
- MMU_PAGESIZE, /* alignment in bytes */
- 0x7ff, /* bitmap of burst sizes */
- 1, /* minimum transfer */
- 0xffffffffU, /* maximum transfer */
- 0xffffffffffffffffULL, /* maximum segment length */
- 1, /* maximum number of segments */
- 1, /* granularity */
- 0, /* flags (reserved) */
+/* DMA attributes for transmit data */
+static ddi_dma_attr_t tx_buf_dma_attr = {
+ .dma_attr_version = DMA_ATTR_V0,
+ .dma_attr_addr_lo = 0,
+ .dma_attr_addr_hi = 0xffffffffffffffffULL,
+ .dma_attr_count_max = MMU_PAGEOFFSET,
+ .dma_attr_align = 1,
+ .dma_attr_burstsizes = 0x7ff,
+ .dma_attr_minxfer = 1,
+ .dma_attr_maxxfer = 0xffffffffU,
+ .dma_attr_seg = XEN_DATA_BOUNDARY - 1, /* segment boundary */
+ .dma_attr_sgllen = XEN_MAX_TX_DATA_PAGES, /* max number of segments */
+ .dma_attr_granular = 1,
+ .dma_attr_flags = 0
};
/* DMA access attributes for registers and descriptors */
@@ -349,7 +413,7 @@ _info(struct modinfo *modinfop)
* Acquire a grant reference.
*/
static grant_ref_t
-gref_get(xnf_t *xnfp)
+xnf_gref_get(xnf_t *xnfp)
{
grant_ref_t gref;
@@ -379,7 +443,7 @@ gref_get(xnf_t *xnfp)
* Release a grant reference.
*/
static void
-gref_put(xnf_t *xnfp, grant_ref_t gref)
+xnf_gref_put(xnf_t *xnfp, grant_ref_t gref)
{
ASSERT(gref != INVALID_GRANT_REF);
@@ -394,7 +458,7 @@ gref_put(xnf_t *xnfp, grant_ref_t gref)
* Acquire a transmit id.
*/
static xnf_txid_t *
-txid_get(xnf_t *xnfp)
+xnf_txid_get(xnf_t *xnfp)
{
xnf_txid_t *tidp;
@@ -418,7 +482,7 @@ txid_get(xnf_t *xnfp)
* Release a transmit id.
*/
static void
-txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
+xnf_txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
{
ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
ASSERT(TX_ID_VALID(tidp->id));
@@ -429,6 +493,93 @@ txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
xnfp->xnf_tx_pkt_id_head = tidp->id;
}
+static void
+xnf_data_txbuf_free(xnf_t *xnfp, xnf_txbuf_t *txp)
+{
+ ASSERT3U(txp->tx_type, ==, TX_DATA);
+
+ /*
+ * We are either using a lookaside buffer or we are mapping existing
+ * buffers.
+ */
+ if (txp->tx_bdesc != NULL) {
+ ASSERT(!txp->tx_handle_bound);
+ xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
+ } else {
+ if (txp->tx_txreq.gref != INVALID_GRANT_REF) {
+ if (gnttab_query_foreign_access(txp->tx_txreq.gref) !=
+ 0) {
+ cmn_err(CE_PANIC, "tx grant %d still in use by "
+ "backend domain", txp->tx_txreq.gref);
+ }
+ (void) gnttab_end_foreign_access_ref(
+ txp->tx_txreq.gref, 1);
+ xnf_gref_put(xnfp, txp->tx_txreq.gref);
+ }
+
+ if (txp->tx_handle_bound)
+ (void) ddi_dma_unbind_handle(txp->tx_dma_handle);
+ }
+
+ if (txp->tx_mp != NULL)
+ freemsg(txp->tx_mp);
+
+ if (txp->tx_prev != NULL) {
+ ASSERT3P(txp->tx_prev->tx_next, ==, txp);
+ txp->tx_prev->tx_next = NULL;
+ }
+
+ if (txp->tx_txreq.id != INVALID_TX_ID) {
+ /*
+ * This should be only possible when resuming from a suspend.
+ */
+ ASSERT(!xnfp->xnf_connected);
+ xnf_txid_put(xnfp, TX_ID_TO_TXID(xnfp, txp->tx_txreq.id));
+ txp->tx_txreq.id = INVALID_TX_ID;
+ }
+
+ kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
+}
+
+static void
+xnf_data_txbuf_free_chain(xnf_t *xnfp, xnf_txbuf_t *txp)
+{
+ if (txp == NULL)
+ return;
+
+ while (txp->tx_next != NULL)
+ txp = txp->tx_next;
+
+ /*
+ * We free the chain in reverse order so that grants can be released
+ * for all dma chunks before unbinding the dma handles. The mblk is
+ * freed last, after all its fragments' dma handles are unbound.
+ */
+ xnf_txbuf_t *prev;
+ for (; txp != NULL; txp = prev) {
+ prev = txp->tx_prev;
+ xnf_data_txbuf_free(xnfp, txp);
+ }
+}
+
+static xnf_txbuf_t *
+xnf_data_txbuf_alloc(xnf_t *xnfp)
+{
+ xnf_txbuf_t *txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
+ txp->tx_type = TX_DATA;
+ txp->tx_next = NULL;
+ txp->tx_prev = NULL;
+ txp->tx_head = txp;
+ txp->tx_frags_to_ack = 0;
+ txp->tx_mp = NULL;
+ txp->tx_bdesc = NULL;
+ txp->tx_handle_bound = B_FALSE;
+ txp->tx_txreq.gref = INVALID_GRANT_REF;
+ txp->tx_txreq.id = INVALID_TX_ID;
+
+ return (txp);
+}
+
/*
* Get `wanted' slots in the transmit ring, waiting for at least that
* number if `wait' is B_TRUE. Force the ring to be cleaned by setting
@@ -437,7 +588,7 @@ txid_put(xnf_t *xnfp, xnf_txid_t *tidp)
* Return the number of slots available.
*/
static int
-tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
+xnf_tx_slots_get(xnf_t *xnfp, int wanted, boolean_t wait)
{
int slotsfree;
boolean_t forced_clean = (wanted == 0);
@@ -513,45 +664,24 @@ xnf_setup_rings(xnf_t *xnfp)
mutex_enter(&xnfp->xnf_txlock);
/*
- * Setup/cleanup the TX ring. Note that this can lose packets
- * after a resume, but we expect to stagger on.
+ * We first clean up the TX ring in case we are doing a resume.
+ * Note that this can lose packets, but we expect to stagger on.
*/
xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
i < NET_TX_RING_SIZE;
i++, tidp++) {
- xnf_txbuf_t *txp;
-
- tidp->id = i;
-
- txp = tidp->txbuf;
- if (txp == NULL) {
- tidp->next = INVALID_TX_ID; /* Appease txid_put(). */
- txid_put(xnfp, tidp);
+ xnf_txbuf_t *txp = tidp->txbuf;
+ if (txp == NULL)
continue;
- }
-
- ASSERT(txp->tx_txreq.gref != INVALID_GRANT_REF);
- ASSERT(txp->tx_mp != NULL);
switch (txp->tx_type) {
case TX_DATA:
- VERIFY(gnttab_query_foreign_access(txp->tx_txreq.gref)
- == 0);
-
- if (txp->tx_bdesc == NULL) {
- (void) gnttab_end_foreign_access_ref(
- txp->tx_txreq.gref, 1);
- gref_put(xnfp, txp->tx_txreq.gref);
- (void) ddi_dma_unbind_handle(
- txp->tx_dma_handle);
- } else {
- xnf_buf_put(xnfp, txp->tx_bdesc, B_TRUE);
- }
-
- freemsg(txp->tx_mp);
- txid_put(xnfp, tidp);
- kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
+ /*
+ * xnf_txid_put() will be called for each txbuf's txid in
+ * the chain, which will result in clearing tidp->txbuf.
+ */
+ xnf_data_txbuf_free_chain(xnfp, txp);
break;
@@ -566,8 +696,7 @@ xnf_setup_rings(xnf_t *xnfp)
* over the empty slot.
*/
i++;
- ASSERT(i < NET_TX_RING_SIZE);
-
+ ASSERT3U(i, <, NET_TX_RING_SIZE);
break;
case TX_MCAST_RSP:
@@ -575,6 +704,19 @@ xnf_setup_rings(xnf_t *xnfp)
}
}
+ /*
+ * Now purge the old list and add each txid to the new free list.
+ */
+ xnfp->xnf_tx_pkt_id_head = INVALID_TX_ID; /* I.e. empty list. */
+ for (i = 0, tidp = &xnfp->xnf_tx_pkt_id[0];
+ i < NET_TX_RING_SIZE;
+ i++, tidp++) {
+ tidp->id = i;
+ ASSERT3P(tidp->txbuf, ==, NULL);
+ tidp->next = INVALID_TX_ID; /* Appease xnf_txid_put(). */
+ xnf_txid_put(xnfp, tidp);
+ }
+
/* LINTED: constant in conditional context */
SHARED_RING_INIT(xnfp->xnf_tx_ring.sring);
/* LINTED: constant in conditional context */
@@ -708,6 +850,27 @@ again:
}
}
+ /*
+ * Tell backend if we support scatter-gather lists on the rx side.
+ */
+ err = xenbus_printf(xbt, xsname, "feature-sg", "%d",
+ xnf_enable_rx_sg ? 1 : 0);
+ if (err != 0) {
+ message = "writing feature-sg";
+ goto abort_transaction;
+ }
+
+ /*
+ * Tell backend if we support LRO for IPv4. Scatter-gather on rx is
+ * a prerequisite.
+ */
+ err = xenbus_printf(xbt, xsname, "feature-gso-tcpv4", "%d",
+ (xnf_enable_rx_sg && xnf_enable_lro) ? 1 : 0);
+ if (err != 0) {
+ message = "writing feature-gso-tcpv4";
+ goto abort_transaction;
+ }
+
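Once the transaction commits, the backend sees these keys in the frontend's
xenstore directory. An illustrative listing from dom0 (the vif path varies by
domain and device index; values reflect the tunables above, other keys
omitted):

    # xenstore-ls /local/domain/<domid>/device/vif/0
    feature-sg = "1"
    feature-gso-tcpv4 = "0"
    ...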
err = xvdi_switch_state(xnfp->xnf_devinfo, xbt, XenbusStateConnected);
if (err != 0) {
message = "switching state to XenbusStateConnected";
@@ -778,6 +941,43 @@ xnf_read_config(xnf_t *xnfp)
if (err != 0)
be_cap = 0;
xnfp->xnf_be_mcast_control = (be_cap != 0) && xnf_multicast_control;
+
+ /*
+ * See if back-end supports scatter-gather for transmits. If not,
+ * we will not support LSO and limit the mtu to 1500.
+ */
+ err = xenbus_scanf(XBT_NULL, oename, "feature-sg", "%d", &be_cap);
+ if (err != 0) {
+ be_cap = 0;
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
+ "'feature-sg' from backend driver");
+ }
+ if (be_cap == 0) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "scatter-gather is not "
+ "supported for transmits in the backend driver. LSO is "
+ "disabled and MTU is restricted to 1500 bytes.");
+ }
+ xnfp->xnf_be_tx_sg = (be_cap != 0) && xnf_enable_tx_sg;
+
+ if (xnfp->xnf_be_tx_sg) {
+ /*
+ * Check if LSO is supported. Currently we only check for
+ * IPv4 as Illumos doesn't support LSO for IPv6.
+ */
+ err = xenbus_scanf(XBT_NULL, oename, "feature-gso-tcpv4", "%d",
+ &be_cap);
+ if (err != 0) {
+ be_cap = 0;
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "error reading "
+ "'feature-gso-tcpv4' from backend driver");
+ }
+ if (be_cap == 0) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "LSO is not "
+ "supported by the backend driver. Performance "
+ "will be affected.");
+ }
+ xnfp->xnf_be_lso = (be_cap != 0) && xnf_enable_lso;
+ }
}
/*
@@ -829,6 +1029,12 @@ xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
return (DDI_FAILURE);
xnfp = kmem_zalloc(sizeof (*xnfp), KM_SLEEP);
+ xnfp->xnf_tx_pkt_id =
+ kmem_zalloc(sizeof (xnf_txid_t) * NET_TX_RING_SIZE, KM_SLEEP);
+
+ xnfp->xnf_rx_pkt_info =
+ kmem_zalloc(sizeof (xnf_buf_t *) * NET_RX_RING_SIZE, KM_SLEEP);
+
macp->m_dip = devinfo;
macp->m_driver = xnfp;
xnfp->xnf_devinfo = devinfo;
@@ -837,7 +1043,8 @@ xnf_attach(dev_info_t *devinfo, ddi_attach_cmd_t cmd)
macp->m_src_addr = xnfp->xnf_mac_addr;
macp->m_callbacks = &xnf_callbacks;
macp->m_min_sdu = 0;
- macp->m_max_sdu = XNF_MAXPKT;
+ xnfp->xnf_mtu = ETHERMTU;
+ macp->m_max_sdu = xnfp->xnf_mtu;
xnfp->xnf_running = B_FALSE;
xnfp->xnf_connected = B_FALSE;
@@ -1156,11 +1363,11 @@ xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
* 5. Wait for the response via xnf_tx_clean_ring().
*/
- n_slots = tx_slots_get(xnfp, 2, B_TRUE);
+ n_slots = xnf_tx_slots_get(xnfp, 2, B_TRUE);
ASSERT(n_slots >= 2);
slot = xnfp->xnf_tx_ring.req_prod_pvt;
- tidp = txid_get(xnfp);
+ tidp = xnf_txid_get(xnfp);
VERIFY(tidp != NULL);
txp->tx_type = TX_MCAST_REQ;
@@ -1196,10 +1403,9 @@ xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
ec_notify_via_evtchn(xnfp->xnf_evtchn);
while (txp->tx_type == TX_MCAST_REQ)
- cv_wait(&xnfp->xnf_cv_multicast,
- &xnfp->xnf_txlock);
+ cv_wait(&xnfp->xnf_cv_multicast, &xnfp->xnf_txlock);
- ASSERT(txp->tx_type == TX_MCAST_RSP);
+ ASSERT3U(txp->tx_type, ==, TX_MCAST_RSP);
mutex_enter(&xnfp->xnf_schedlock);
xnfp->xnf_pending_multicast--;
@@ -1207,7 +1413,7 @@ xnf_set_multicast(void *arg, boolean_t add, const uint8_t *mca)
result = (txp->tx_status == NETIF_RSP_OKAY);
- txid_put(xnfp, tidp);
+ xnf_txid_put(xnfp, tidp);
mutex_exit(&xnfp->xnf_txlock);
@@ -1261,39 +1467,44 @@ loop:
xnf_txbuf_t *txp;
trp = RING_GET_RESPONSE(&xnfp->xnf_tx_ring, i);
+ /*
+ * If this slot was occupied by a netif_extra_info_t,
+ * then the response will be NETIF_RSP_NULL. In this
+ * case there are no resources to clean up.
+ */
+ if (trp->status == NETIF_RSP_NULL)
+ continue;
+
ASSERT(TX_ID_VALID(trp->id));
tidp = TX_ID_TO_TXID(xnfp, trp->id);
- ASSERT(tidp->id == trp->id);
- ASSERT(tidp->next == INVALID_TX_ID);
+ ASSERT3U(tidp->id, ==, trp->id);
+ ASSERT3U(tidp->next, ==, INVALID_TX_ID);
txp = tidp->txbuf;
ASSERT(txp != NULL);
- ASSERT(txp->tx_txreq.id == trp->id);
+ ASSERT3U(txp->tx_txreq.id, ==, trp->id);
switch (txp->tx_type) {
case TX_DATA:
- if (gnttab_query_foreign_access(
- txp->tx_txreq.gref) != 0)
- cmn_err(CE_PANIC,
- "tx grant %d still in use by "
- "backend domain",
- txp->tx_txreq.gref);
-
- if (txp->tx_bdesc == NULL) {
- (void) gnttab_end_foreign_access_ref(
- txp->tx_txreq.gref, 1);
- gref_put(xnfp, txp->tx_txreq.gref);
- (void) ddi_dma_unbind_handle(
- txp->tx_dma_handle);
- } else {
- xnf_buf_put(xnfp, txp->tx_bdesc,
- B_TRUE);
- }
-
- freemsg(txp->tx_mp);
- txid_put(xnfp, tidp);
- kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
+ /*
+ * We must put the txid for each response we
+ * acknowledge to make sure that we never have
+ * more free slots than txids. Because of this
+ * we do it here instead of waiting for it to
+ * be done in xnf_data_txbuf_free_chain().
+ */
+ xnf_txid_put(xnfp, tidp);
+ txp->tx_txreq.id = INVALID_TX_ID;
+ ASSERT3S(txp->tx_head->tx_frags_to_ack, >, 0);
+ txp->tx_head->tx_frags_to_ack--;
+
+ /*
+ * We clean the whole chain once we got a
+ * response for each fragment.
+ */
+ if (txp->tx_head->tx_frags_to_ack == 0)
+ xnf_data_txbuf_free_chain(xnfp, txp);
break;
@@ -1304,9 +1515,6 @@ loop:
break;
- case TX_MCAST_RSP:
- break;
-
default:
cmn_err(CE_PANIC, "xnf_tx_clean_ring: "
"invalid xnf_txbuf_t type: %d",
@@ -1336,7 +1544,7 @@ loop:
* within a single page.
*/
static xnf_buf_t *
-xnf_tx_pullup(xnf_t *xnfp, mblk_t *mp)
+xnf_tx_get_lookaside(xnf_t *xnfp, mblk_t *mp, size_t *plen)
{
xnf_buf_t *bd;
caddr_t bp;
@@ -1355,68 +1563,101 @@ xnf_tx_pullup(xnf_t *xnfp, mblk_t *mp)
mp = mp->b_cont;
}
- ASSERT((bp - bd->buf) <= PAGESIZE);
+ *plen = bp - bd->buf;
+ ASSERT3U(*plen, <=, PAGESIZE);
- xnfp->xnf_stat_tx_pullup++;
+ xnfp->xnf_stat_tx_lookaside++;
return (bd);
}
/*
- * Insert the pseudo-header checksum into the packet `buf'.
+ * Insert the pseudo-header checksum into the packet.
+ * Assumes packet is IPv4, TCP/UDP since we only advertised support for
+ * HCKSUM_INET_FULL_V4.
*/
-void
-xnf_pseudo_cksum(caddr_t buf, int length)
+int
+xnf_pseudo_cksum(mblk_t *mp)
{
struct ether_header *ehp;
- uint16_t sap, len, *stuff;
+ uint16_t sap, iplen, *stuff;
uint32_t cksum;
- size_t offset;
+ size_t len;
ipha_t *ipha;
ipaddr_t src, dst;
+ uchar_t *ptr;
+
+ ptr = mp->b_rptr;
+ len = MBLKL(mp);
+
+ /* Each header must fit completely in an mblk. */
+ ASSERT3U(len, >=, sizeof (*ehp));
- ASSERT(length >= sizeof (*ehp));
- ehp = (struct ether_header *)buf;
+ ehp = (struct ether_header *)ptr;
if (ntohs(ehp->ether_type) == VLAN_TPID) {
struct ether_vlan_header *evhp;
-
- ASSERT(length >= sizeof (*evhp));
- evhp = (struct ether_vlan_header *)buf;
+ ASSERT3U(len, >=, sizeof (*evhp));
+ evhp = (struct ether_vlan_header *)ptr;
sap = ntohs(evhp->ether_type);
- offset = sizeof (*evhp);
+ ptr += sizeof (*evhp);
+ len -= sizeof (*evhp);
} else {
sap = ntohs(ehp->ether_type);
- offset = sizeof (*ehp);
+ ptr += sizeof (*ehp);
+ len -= sizeof (*ehp);
}
- ASSERT(sap == ETHERTYPE_IP);
+ ASSERT3U(sap, ==, ETHERTYPE_IP);
- /* Packet should have been pulled up by the caller. */
- if ((offset + sizeof (ipha_t)) > length) {
- cmn_err(CE_WARN, "xnf_pseudo_cksum: no room for checksum");
- return;
+ /*
+ * Ethernet and IP headers may be in different mblks.
+ */
+ ASSERT3P(ptr, <=, mp->b_wptr);
+ if (ptr == mp->b_wptr) {
+ mp = mp->b_cont;
+ ptr = mp->b_rptr;
+ len = MBLKL(mp);
}
- ipha = (ipha_t *)(buf + offset);
+ ASSERT3U(len, >=, sizeof (ipha_t));
+ ipha = (ipha_t *)ptr;
- ASSERT(IPH_HDR_LENGTH(ipha) == IP_SIMPLE_HDR_LENGTH);
+ /*
+ * We assume the IP header has no options. (This is enforced in
+ * ire_send_wire_v4() -- search for IXAF_NO_HW_CKSUM).
+ */
+ ASSERT3U(IPH_HDR_LENGTH(ipha), ==, IP_SIMPLE_HDR_LENGTH);
+ iplen = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
- len = ntohs(ipha->ipha_length) - IP_SIMPLE_HDR_LENGTH;
+ ptr += IP_SIMPLE_HDR_LENGTH;
+ len -= IP_SIMPLE_HDR_LENGTH;
+
+ /*
+ * IP and L4 headers may be in different mblks.
+ */
+ ASSERT3P(ptr, <=, mp->b_wptr);
+ if (ptr == mp->b_wptr) {
+ mp = mp->b_cont;
+ ptr = mp->b_rptr;
+ len = MBLKL(mp);
+ }
switch (ipha->ipha_protocol) {
case IPPROTO_TCP:
- stuff = IPH_TCPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
+ ASSERT3U(len, >=, sizeof (tcph_t));
+ stuff = (uint16_t *)(ptr + TCP_CHECKSUM_OFFSET);
cksum = IP_TCP_CSUM_COMP;
break;
case IPPROTO_UDP:
- stuff = IPH_UDPH_CHECKSUMP(ipha, IP_SIMPLE_HDR_LENGTH);
+ ASSERT3U(len, >=, sizeof (struct udphdr));
+ stuff = (uint16_t *)(ptr + UDP_CHECKSUM_OFFSET);
cksum = IP_UDP_CSUM_COMP;
break;
default:
cmn_err(CE_WARN, "xnf_pseudo_cksum: unexpected protocol %d",
ipha->ipha_protocol);
- return;
+ return (EINVAL);
}
src = ipha->ipha_src;
@@ -1424,7 +1665,7 @@ xnf_pseudo_cksum(caddr_t buf, int length)
cksum += (dst >> 16) + (dst & 0xFFFF);
cksum += (src >> 16) + (src & 0xFFFF);
- cksum += htons(len);
+ cksum += htons(iplen);
cksum = (cksum >> 16) + (cksum & 0xFFFF);
cksum = (cksum >> 16) + (cksum & 0xFFFF);
@@ -1432,40 +1673,38 @@ xnf_pseudo_cksum(caddr_t buf, int length)
ASSERT(cksum <= 0xFFFF);
*stuff = (uint16_t)(cksum ? cksum : ~cksum);
+
+ return (0);
}
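The routine above seeds the transport checksum field with the ones' complement
sum of the IPv4 pseudo-header so the backend (or hardware) only has to sum the
L4 payload. A standalone sketch of the same folding, under the assumption that
htons(proto) stands in for the IP_TCP_CSUM_COMP/IP_UDP_CSUM_COMP constants
from inet/ip.h; src and dst are taken raw, in network order, from the IP
header:

    #include <stdint.h>
    #include <arpa/inet.h>

    static uint16_t
    pseudo_hdr_cksum(uint32_t src, uint32_t dst, uint8_t proto,
        uint16_t l4len /* host order */)
    {
    	uint32_t sum = htons(proto);		/* zero byte + protocol */

    	sum += (src >> 16) + (src & 0xFFFF);
    	sum += (dst >> 16) + (dst & 0xFFFF);
    	sum += htons(l4len);			/* TCP/UDP length word */
    	sum = (sum >> 16) + (sum & 0xFFFF);	/* fold the carries */
    	sum = (sum >> 16) + (sum & 0xFFFF);
    	return ((uint16_t)(sum != 0 ? sum : 0xFFFF));
    }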
/*
- * Push a list of prepared packets (`txp') into the transmit ring.
+ * Push a packet into the transmit ring.
+ *
+ * Note: the format of a tx packet that spans multiple slots is similar to
+ * what is described in xnf_rx_one_packet().
*/
-static xnf_txbuf_t *
-tx_push_packets(xnf_t *xnfp, xnf_txbuf_t *txp)
+static void
+xnf_tx_push_packet(xnf_t *xnfp, xnf_txbuf_t *head)
{
- int slots_free;
+ int nslots = 0;
+ int extras = 0;
RING_IDX slot;
boolean_t notify;
- mutex_enter(&xnfp->xnf_txlock);
-
+ ASSERT(MUTEX_HELD(&xnfp->xnf_txlock));
ASSERT(xnfp->xnf_running);
- /*
- * Wait until we are connected to the backend.
- */
- while (!xnfp->xnf_connected)
- cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
-
- slots_free = tx_slots_get(xnfp, 1, B_FALSE);
- DTRACE_PROBE1(xnf_send_slotsfree, int, slots_free);
-
slot = xnfp->xnf_tx_ring.req_prod_pvt;
- while ((txp != NULL) && (slots_free > 0)) {
+ /*
+ * The caller has already checked that we have enough slots to proceed.
+ */
+ for (xnf_txbuf_t *txp = head; txp != NULL; txp = txp->tx_next) {
xnf_txid_t *tidp;
netif_tx_request_t *txrp;
- tidp = txid_get(xnfp);
+ tidp = xnf_txid_get(xnfp);
VERIFY(tidp != NULL);
-
txrp = RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
txp->tx_slot = slot;
@@ -1473,281 +1712,353 @@ tx_push_packets(xnf_t *xnfp, xnf_txbuf_t *txp)
*txrp = txp->tx_txreq;
tidp->txbuf = txp;
-
- xnfp->xnf_stat_opackets++;
- xnfp->xnf_stat_obytes += txp->tx_txreq.size;
-
- txp = txp->tx_next;
- slots_free--;
slot++;
+ nslots++;
+ /*
+ * When present, LSO info is placed in a slot after the first
+ * data segment, and doesn't require a txid.
+ */
+ if (txp->tx_txreq.flags & NETTXF_extra_info) {
+ netif_extra_info_t *extra;
+ ASSERT3U(nslots, ==, 1);
+
+ extra = (netif_extra_info_t *)
+ RING_GET_REQUEST(&xnfp->xnf_tx_ring, slot);
+ *extra = txp->tx_extra;
+ slot++;
+ nslots++;
+ extras = 1;
+ }
}
+ ASSERT3U(nslots, <=, XEN_MAX_SLOTS_PER_TX);
+
+ /*
+ * Store the number of data fragments.
+ */
+ head->tx_frags_to_ack = nslots - extras;
+
xnfp->xnf_tx_ring.req_prod_pvt = slot;
/*
* Tell the peer that we sent something, if it cares.
*/
/* LINTED: constant in conditional context */
- RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring,
- notify);
+ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&xnfp->xnf_tx_ring, notify);
if (notify)
ec_notify_via_evtchn(xnfp->xnf_evtchn);
+}
- mutex_exit(&xnfp->xnf_txlock);
+static xnf_txbuf_t *
+xnf_mblk_copy(xnf_t *xnfp, mblk_t *mp)
+{
+ xnf_txbuf_t *txp = xnf_data_txbuf_alloc(xnfp);
+ size_t length;
+
+ txp->tx_bdesc = xnf_tx_get_lookaside(xnfp, mp, &length);
+ if (txp->tx_bdesc == NULL) {
+ xnf_data_txbuf_free(xnfp, txp);
+ return (NULL);
+ }
+ txp->tx_mfn = txp->tx_bdesc->buf_mfn;
+ txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
+ txp->tx_txreq.size = length;
+ txp->tx_txreq.offset = (uintptr_t)txp->tx_bdesc->buf & PAGEOFFSET;
+ txp->tx_txreq.flags = 0;
return (txp);
}
-/*
- * Send the chain of packets `mp'. Called by the MAC framework.
- */
-static mblk_t *
-xnf_send(void *arg, mblk_t *mp)
+static xnf_txbuf_t *
+xnf_mblk_map(xnf_t *xnfp, mblk_t *mp, int *countp)
{
- xnf_t *xnfp = arg;
+ xnf_txbuf_t *head = NULL;
+ xnf_txbuf_t *tail = NULL;
domid_t oeid;
- xnf_txbuf_t *head, *tail;
- mblk_t *ml;
- int prepared;
+ int nsegs = 0;
oeid = xvdi_get_oeid(xnfp->xnf_devinfo);
- /*
- * Prepare packets for transmission.
- */
- head = tail = NULL;
- prepared = 0;
- while (mp != NULL) {
+ for (mblk_t *ml = mp; ml != NULL; ml = ml->b_cont) {
+ ddi_dma_handle_t dma_handle;
+ ddi_dma_cookie_t dma_cookie;
+ uint_t ncookies;
xnf_txbuf_t *txp;
- int n_chunks, length;
- boolean_t page_oops;
- uint32_t pflags;
- for (ml = mp, n_chunks = length = 0, page_oops = B_FALSE;
- ml != NULL;
- ml = ml->b_cont, n_chunks++) {
+ if (MBLKL(ml) == 0)
+ continue;
- /*
- * Test if this buffer includes a page
- * boundary. The test assumes that the range
- * b_rptr...b_wptr can include only a single
- * boundary.
- */
- if (xnf_btop((size_t)ml->b_rptr) !=
- xnf_btop((size_t)ml->b_wptr)) {
- xnfp->xnf_stat_tx_pagebndry++;
- page_oops = B_TRUE;
- }
+ txp = xnf_data_txbuf_alloc(xnfp);
- length += MBLKL(ml);
+ if (head == NULL) {
+ head = txp;
+ } else {
+ ASSERT(tail != NULL);
+ TXBUF_SETNEXT(tail, txp);
+ txp->tx_head = head;
}
- DTRACE_PROBE1(xnf_send_b_cont, int, n_chunks);
/*
- * Make sure packet isn't too large.
+ * The necessary segmentation rules (e.g. not crossing a page
+ * boundary) are enforced by the dma attributes of the handle.
*/
- if (length > XNF_FRAMESIZE) {
- cmn_err(CE_WARN,
- "xnf%d: oversized packet (%d bytes) dropped",
- ddi_get_instance(xnfp->xnf_devinfo), length);
- freemsg(mp);
- continue;
+ dma_handle = txp->tx_dma_handle;
+ int ret = ddi_dma_addr_bind_handle(dma_handle,
+ NULL, (char *)ml->b_rptr, MBLKL(ml),
+ DDI_DMA_WRITE | DDI_DMA_STREAMING,
+ DDI_DMA_DONTWAIT, 0, &dma_cookie,
+ &ncookies);
+ if (ret != DDI_DMA_MAPPED) {
+ if (ret != DDI_DMA_NORESOURCES) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN,
+ "ddi_dma_addr_bind_handle() failed "
+ "[dma_error=%d]", ret);
+ }
+ goto error;
}
-
- txp = kmem_cache_alloc(xnfp->xnf_tx_buf_cache, KM_SLEEP);
-
- txp->tx_type = TX_DATA;
-
- if ((n_chunks > xnf_max_tx_frags) || page_oops) {
- /*
- * Loan a side buffer rather than the mblk
- * itself.
- */
- txp->tx_bdesc = xnf_tx_pullup(xnfp, mp);
- if (txp->tx_bdesc == NULL) {
- kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
- break;
+ txp->tx_handle_bound = B_TRUE;
+
+ ASSERT(ncookies > 0);
+ for (int i = 0; i < ncookies; i++) {
+ if (nsegs == XEN_MAX_TX_DATA_PAGES) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN,
+ "xnf_dmamap_alloc() failed: "
+ "too many segments");
+ goto error;
}
-
- txp->tx_bufp = txp->tx_bdesc->buf;
- txp->tx_mfn = txp->tx_bdesc->buf_mfn;
- txp->tx_txreq.gref = txp->tx_bdesc->grant_ref;
-
- } else {
- int rc;
- ddi_dma_cookie_t dma_cookie;
- uint_t ncookies;
-
- rc = ddi_dma_addr_bind_handle(txp->tx_dma_handle,
- NULL, (char *)mp->b_rptr, length,
- DDI_DMA_WRITE | DDI_DMA_STREAMING,
- DDI_DMA_DONTWAIT, 0, &dma_cookie,
- &ncookies);
- if (rc != DDI_DMA_MAPPED) {
- ASSERT(rc != DDI_DMA_INUSE);
- ASSERT(rc != DDI_DMA_PARTIAL_MAP);
-
-#ifdef XNF_DEBUG
- if (rc != DDI_DMA_NORESOURCES)
- cmn_err(CE_WARN,
- "xnf%d: bind_handle failed (%x)",
- ddi_get_instance(xnfp->xnf_devinfo),
- rc);
-#endif
- kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
- break;
+ if (i > 0) {
+ txp = xnf_data_txbuf_alloc(xnfp);
+ ASSERT(tail != NULL);
+ TXBUF_SETNEXT(tail, txp);
+ txp->tx_head = head;
}
- ASSERT(ncookies == 1);
- txp->tx_bdesc = NULL;
- txp->tx_bufp = (caddr_t)mp->b_rptr;
txp->tx_mfn =
xnf_btop(pa_to_ma(dma_cookie.dmac_laddress));
- txp->tx_txreq.gref = gref_get(xnfp);
+ txp->tx_txreq.gref = xnf_gref_get(xnfp);
if (txp->tx_txreq.gref == INVALID_GRANT_REF) {
- (void) ddi_dma_unbind_handle(
- txp->tx_dma_handle);
- kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
- break;
+ dev_err(xnfp->xnf_devinfo, CE_WARN,
+ "xnf_dmamap_alloc() failed: "
+ "invalid grant ref");
+ goto error;
}
gnttab_grant_foreign_access_ref(txp->tx_txreq.gref,
oeid, txp->tx_mfn, 1);
- }
+ txp->tx_txreq.offset =
+ dma_cookie.dmac_laddress & PAGEOFFSET;
+ txp->tx_txreq.size = dma_cookie.dmac_size;
+ txp->tx_txreq.flags = 0;
- txp->tx_next = NULL;
- txp->tx_mp = mp;
- txp->tx_txreq.size = length;
- txp->tx_txreq.offset = (uintptr_t)txp->tx_bufp & PAGEOFFSET;
- txp->tx_txreq.flags = 0;
- mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &pflags);
- if (pflags != 0) {
- /*
- * If the local protocol stack requests checksum
- * offload we set the 'checksum blank' flag,
- * indicating to the peer that we need the checksum
- * calculated for us.
- *
- * We _don't_ set the validated flag, because we haven't
- * validated that the data and the checksum match.
- */
- xnf_pseudo_cksum(txp->tx_bufp, length);
- txp->tx_txreq.flags |= NETTXF_csum_blank;
+ ddi_dma_nextcookie(dma_handle, &dma_cookie);
+ nsegs++;
- xnfp->xnf_stat_tx_cksum_deferred++;
+ if (tail != NULL)
+ tail->tx_txreq.flags = NETTXF_more_data;
+ tail = txp;
}
+ }
- if (head == NULL) {
- ASSERT(tail == NULL);
-
- head = txp;
- } else {
- ASSERT(tail != NULL);
-
- tail->tx_next = txp;
- }
- tail = txp;
+ *countp = nsegs;
+ return (head);
- mp = mp->b_next;
- prepared++;
+error:
+ xnf_data_txbuf_free_chain(xnfp, head);
+ return (NULL);
+}
+static void
+xnf_tx_setup_offload(xnf_t *xnfp, xnf_txbuf_t *head,
+ uint32_t cksum_flags, uint32_t lso_flags, uint32_t mss)
+{
+ if (lso_flags != 0) {
+ ASSERT3U(lso_flags, ==, HW_LSO);
+ ASSERT3P(head->tx_bdesc, ==, NULL);
+
+ head->tx_txreq.flags |= NETTXF_extra_info;
+ netif_extra_info_t *extra = &head->tx_extra;
+ extra->type = XEN_NETIF_EXTRA_TYPE_GSO;
+ extra->flags = 0;
+ extra->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4;
+ extra->u.gso.size = mss;
+ extra->u.gso.features = 0;
+ extra->u.gso.pad = 0;
+ } else if (cksum_flags != 0) {
+ ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
/*
- * There is no point in preparing more than
- * NET_TX_RING_SIZE, as we won't be able to push them
- * into the ring in one go and would hence have to
- * un-prepare the extra.
+ * If the local protocol stack requests checksum
+ * offload we set the 'checksum blank' flag,
+ * indicating to the peer that we need the checksum
+ * calculated for us.
+ *
+ * We _don't_ set the validated flag, because we haven't
+ * validated that the data and the checksum match.
+ *
+ * Note: we already called xnf_pseudo_cksum() in
+ * xnf_send(), so we just set the txreq flag here.
*/
- if (prepared == NET_TX_RING_SIZE)
- break;
+ head->tx_txreq.flags |= NETTXF_csum_blank;
+ xnfp->xnf_stat_tx_cksum_deferred++;
}
+}
- DTRACE_PROBE1(xnf_send_prepared, int, prepared);
+/*
+ * Send packet mp. Called by the MAC framework.
+ */
+static mblk_t *
+xnf_send(void *arg, mblk_t *mp)
+{
+ xnf_t *xnfp = arg;
+ xnf_txbuf_t *head;
+ mblk_t *ml;
+ int length;
+ int pages, chunks, slots, slots_free;
+ uint32_t cksum_flags, lso_flags, mss;
+ boolean_t pulledup = B_FALSE;
+ boolean_t force_copy = B_FALSE;
- if (mp != NULL) {
-#ifdef XNF_DEBUG
- int notprepared = 0;
- mblk_t *l = mp;
+ ASSERT3P(mp->b_next, ==, NULL);
- while (l != NULL) {
- notprepared++;
- l = l->b_next;
- }
+ mutex_enter(&xnfp->xnf_txlock);
+
+ /*
+ * Wait until we are connected to the backend.
+ */
+ while (!xnfp->xnf_connected)
+ cv_wait(&xnfp->xnf_cv_state, &xnfp->xnf_txlock);
+
+ /*
+ * To simplify logic and be in sync with the rescheduling mechanism,
+ * we require the maximum amount of slots that could be used by a
+ * transaction to be free before proceeding. The only downside of doing
+ * this is that it slightly reduces the effective size of the ring.
+ */
+ slots_free = xnf_tx_slots_get(xnfp, XEN_MAX_SLOTS_PER_TX, B_FALSE);
+ if (slots_free < XEN_MAX_SLOTS_PER_TX) {
+ /*
+ * We need to ask for a re-schedule later as the ring is full.
+ */
+ mutex_enter(&xnfp->xnf_schedlock);
+ xnfp->xnf_need_sched = B_TRUE;
+ mutex_exit(&xnfp->xnf_schedlock);
- DTRACE_PROBE1(xnf_send_notprepared, int, notprepared);
-#else /* !XNF_DEBUG */
- DTRACE_PROBE1(xnf_send_notprepared, int, -1);
-#endif /* XNF_DEBUG */
+ xnfp->xnf_stat_tx_defer++;
+ mutex_exit(&xnfp->xnf_txlock);
+ return (mp);
}
/*
- * Push the packets we have prepared into the ring. They may
- * not all go.
+ * Get hw offload parameters.
+ * This must be done before pulling up the mp as those parameters
+ * are not copied over.
*/
- if (head != NULL)
- head = tx_push_packets(xnfp, head);
+ mac_hcksum_get(mp, NULL, NULL, NULL, NULL, &cksum_flags);
+ mac_lso_get(mp, &mss, &lso_flags);
/*
- * If some packets that we prepared were not sent, unprepare
- * them and add them back to the head of those we didn't
- * prepare.
+ * XXX: fix MAC framework so that we can advertise support for
+ * partial checksum for IPv4 only. This way we won't need to calculate
+ * the pseudo header checksum ourselves.
*/
- {
- xnf_txbuf_t *loop;
- mblk_t *mp_head, *mp_tail;
- int unprepared = 0;
-
- mp_head = mp_tail = NULL;
- loop = head;
-
- while (loop != NULL) {
- xnf_txbuf_t *next = loop->tx_next;
-
- if (loop->tx_bdesc == NULL) {
- (void) gnttab_end_foreign_access_ref(
- loop->tx_txreq.gref, 1);
- gref_put(xnfp, loop->tx_txreq.gref);
- (void) ddi_dma_unbind_handle(
- loop->tx_dma_handle);
- } else {
- xnf_buf_put(xnfp, loop->tx_bdesc, B_TRUE);
- }
+ if (cksum_flags != 0) {
+ ASSERT3U(cksum_flags, ==, HCK_FULLCKSUM);
+ (void) xnf_pseudo_cksum(mp);
+ }
- ASSERT(loop->tx_mp != NULL);
- if (mp_head == NULL)
- mp_head = loop->tx_mp;
- mp_tail = loop->tx_mp;
+pulledup:
+ for (ml = mp, pages = 0, chunks = 0, length = 0; ml != NULL;
+ ml = ml->b_cont, chunks++) {
+ pages += xnf_mblk_pages(ml);
+ length += MBLKL(ml);
+ }
+ DTRACE_PROBE3(packet, int, length, int, chunks, int, pages);
+ DTRACE_PROBE3(lso, int, length, uint32_t, lso_flags, uint32_t, mss);
- kmem_cache_free(xnfp->xnf_tx_buf_cache, loop);
- loop = next;
- unprepared++;
- }
+ /*
+ * If the ethernet header crosses a page boundary the packet
+ * will be dropped by the backend. In practice it seems like
+ * this happens fairly rarely so we'll do nothing unless the
+ * packet is small enough to fit in a look-aside buffer.
+ */
+ if (((uintptr_t)mp->b_rptr & PAGEOFFSET) +
+ sizeof (struct ether_header) > PAGESIZE) {
+ xnfp->xnf_stat_tx_eth_hdr_split++;
+ if (length <= PAGESIZE)
+ force_copy = B_TRUE;
+ }
- if (mp_tail == NULL) {
- ASSERT(mp_head == NULL);
- } else {
- ASSERT(mp_head != NULL);
+ if (force_copy || (pages > 1 && !xnfp->xnf_be_tx_sg)) {
+ /*
+ * If the packet spans several pages and scatter-gather is not
+ * supported then use a look-aside buffer.
+ */
+ ASSERT3U(length, <=, PAGESIZE);
+ head = xnf_mblk_copy(xnfp, mp);
+ if (head == NULL) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN,
+ "xnf_mblk_copy() failed");
+ goto drop;
+ }
+ } else {
+ /*
+ * There's a limit for how many pages can be passed to the
+ * backend. If we pass that limit, the packet will be dropped
+ * and some backend implementations (e.g. Linux) could even
+ * offline the interface.
+ */
+ if (pages > XEN_MAX_TX_DATA_PAGES) {
+ if (pulledup) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN,
+ "too many pages, even after pullup: %d.",
+ pages);
+ goto drop;
+ }
- mp_tail->b_next = mp;
- mp = mp_head;
+ /*
+ * Defragment packet if it spans too many pages.
+ */
+ mblk_t *newmp = msgpullup(mp, -1);
+ freemsg(mp);
+ mp = newmp;
+ xnfp->xnf_stat_tx_pullup++;
+ pulledup = B_TRUE;
+ goto pulledup;
}
- DTRACE_PROBE1(xnf_send_unprepared, int, unprepared);
+ head = xnf_mblk_map(xnfp, mp, &slots);
+ if (head == NULL)
+ goto drop;
+
+ IMPLY(slots > 1, xnfp->xnf_be_tx_sg);
}
/*
- * If any mblks are left then we have deferred for some reason
- * and need to ask for a re-schedule later. This is typically
- * due to the ring filling.
+ * Set tx_mp so that mblk is freed when the txbuf chain is freed.
*/
- if (mp != NULL) {
- mutex_enter(&xnfp->xnf_schedlock);
- xnfp->xnf_need_sched = B_TRUE;
- mutex_exit(&xnfp->xnf_schedlock);
+ head->tx_mp = mp;
- xnfp->xnf_stat_tx_defer++;
- }
+ xnf_tx_setup_offload(xnfp, head, cksum_flags, lso_flags, mss);
+
+ /*
+ * The first request must store the total length of the packet.
+ */
+ head->tx_txreq.size = length;
+
+ /*
+ * Push the packet we have prepared into the ring.
+ */
+ xnf_tx_push_packet(xnfp, head);
+ xnfp->xnf_stat_opackets++;
+ xnfp->xnf_stat_obytes += length;
+
+ mutex_exit(&xnfp->xnf_txlock);
+ return (NULL);
- return (mp);
+drop:
+ freemsg(mp);
+ xnfp->xnf_stat_tx_drop++;
+ mutex_exit(&xnfp->xnf_txlock);
+ return (NULL);
}
/*
@@ -1834,9 +2145,9 @@ xnf_intr(caddr_t arg)
int free_slots;
mutex_enter(&xnfp->xnf_txlock);
- free_slots = tx_slots_get(xnfp, 0, B_FALSE);
+ free_slots = xnf_tx_slots_get(xnfp, 0, B_FALSE);
- if (need_sched && (free_slots > 0)) {
+ if (need_sched && (free_slots >= XEN_MAX_SLOTS_PER_TX)) {
mutex_enter(&xnfp->xnf_schedlock);
xnfp->xnf_need_sched = B_FALSE;
mutex_exit(&xnfp->xnf_schedlock);
@@ -1922,74 +2233,126 @@ xnf_rxbuf_hang(xnf_t *xnfp, xnf_buf_t *bdesc)
}
/*
- * Collect packets from the RX ring, storing them in `xnfp' for later
- * use.
+ * Receive an entire packet from the ring, starting from slot *consp.
+ * prod indicates the slot of the latest response.
+ * On return, *consp will point to the head of the next packet.
+ *
+ * Note: If slot prod was reached before we could gather a full packet, we will
+ * drop the partial packet; this would most likely indicate a bug in either
+ * the front-end or the back-end driver.
+ *
+ * An rx packet can consist of several fragments and thus span multiple slots.
+ * Each fragment can contain up to 4k of data.
+ *
+ * A typical 9000 MTU packet will look like this:
+ * +------+---------------------+-------------------+-----------------------+
+ * | SLOT | TYPE | CONTENTS | FLAGS |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 1 | netif_rx_response_t | 1st data fragment | more_data |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 2 | netif_rx_response_t | 2nd data fragment | more_data |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 3 | netif_rx_response_t | 3rd data fragment | [none] |
+ * +------+---------------------+-------------------+-----------------------+
+ *
+ * Fragments are chained by setting NETRXF_more_data in the previous
+ * response's flags. If there are additional flags, such as
+ * NETRXF_data_validated or NETRXF_extra_info, those should be set on the
+ * first fragment.
+ *
+ * Sometimes extra info can be present. If so, it will follow the first
+ * fragment, and NETRXF_extra_info flag will be set on the first response.
+ * If LRO is set on a packet, it will be stored in the extra info. Conforming
+ * to the spec, extra info can also be chained, but must all be present right
+ * after the first fragment.
+ *
+ * Example of a packet with 2 extra infos:
+ * +------+---------------------+-------------------+-----------------------+
+ * | SLOT | TYPE | CONTENTS | FLAGS |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 1 | netif_rx_response_t | 1st data fragment | extra_info, more_data |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 2 | netif_extra_info_t | 1st extra info | EXTRA_FLAG_MORE |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 3 | netif_extra_info_t | 2nd extra info | [none] |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 4 | netif_rx_response_t | 2nd data fragment | more_data |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 5 | netif_rx_response_t | 3rd data fragment | more_data |
+ * +------+---------------------+-------------------+-----------------------+
+ * | 6 | netif_rx_response_t | 4th data fragment | [none] |
+ * +------+---------------------+-------------------+-----------------------+
+ *
+ * In practice, the only extra we expect is for LRO, but only if we advertise
+ * that we support it to the backend (xnf_enable_lro == TRUE).
*/
-static void
-xnf_rx_collect(xnf_t *xnfp)
+static int
+xnf_rx_one_packet(xnf_t *xnfp, RING_IDX prod, RING_IDX *consp, mblk_t **mpp)
{
- mblk_t *head, *tail;
+ mblk_t *head = NULL;
+ mblk_t *tail = NULL;
+ mblk_t *mp;
+ int error = 0;
+ RING_IDX cons = *consp;
+ netif_extra_info_t lro;
+ boolean_t is_lro = B_FALSE;
+ boolean_t is_extra = B_FALSE;
- ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
+ netif_rx_response_t rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
- /*
- * Loop over unconsumed responses:
- * 1. get a response
- * 2. take corresponding buffer off recv. ring
- * 3. indicate this by setting slot to NULL
- * 4. create a new message and
- * 5. copy data in, adjust ptr
- */
+ boolean_t hwcsum = (rsp.flags & NETRXF_data_validated) != 0;
+ boolean_t more_data = (rsp.flags & NETRXF_more_data) != 0;
+ boolean_t more_extra = (rsp.flags & NETRXF_extra_info) != 0;
- head = tail = NULL;
+ IMPLY(more_data, xnf_enable_rx_sg);
- while (RING_HAS_UNCONSUMED_RESPONSES(&xnfp->xnf_rx_ring)) {
- netif_rx_response_t *rxpkt;
+ while (cons != prod) {
xnf_buf_t *bdesc;
- ssize_t len;
- size_t off;
- mblk_t *mp = NULL;
- boolean_t hwcsum = B_FALSE;
- grant_ref_t ref;
+ int len, off;
+ int rxidx = cons & (NET_RX_RING_SIZE - 1);
+
+ bdesc = xnfp->xnf_rx_pkt_info[rxidx];
+ xnfp->xnf_rx_pkt_info[rxidx] = NULL;
+
+ if (is_extra) {
+ netif_extra_info_t *extra = (netif_extra_info_t *)&rsp;
+ /*
+ * The only extra we expect is for LRO, and it should
+ * only be present once.
+ */
+ if (extra->type == XEN_NETIF_EXTRA_TYPE_GSO &&
+ !is_lro) {
+ ASSERT(xnf_enable_lro);
+ lro = *extra;
+ is_lro = B_TRUE;
+ DTRACE_PROBE1(lro, netif_extra_info_t *, &lro);
+ } else {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "rx packet "
+ "contains unexpected extra info of type %d",
+ extra->type);
+ error = EINVAL;
+ }
+ more_extra =
+ (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE) != 0;
- /* 1. */
- rxpkt = RING_GET_RESPONSE(&xnfp->xnf_rx_ring,
- xnfp->xnf_rx_ring.rsp_cons);
+ goto hang_buf;
+ }
- DTRACE_PROBE4(xnf_rx_got_rsp, int, (int)rxpkt->id,
- int, (int)rxpkt->offset,
- int, (int)rxpkt->flags,
- int, (int)rxpkt->status);
+ ASSERT3U(bdesc->id, ==, rsp.id);
/*
- * 2.
+ * status stores packet length when >= 0, or errors when < 0.
*/
- bdesc = xnfp->xnf_rx_pkt_info[rxpkt->id];
+ len = rsp.status;
+ off = rsp.offset;
+ more_data = (rsp.flags & NETRXF_more_data) != 0;
/*
- * 3.
+ * sanity checks.
*/
- xnfp->xnf_rx_pkt_info[rxpkt->id] = NULL;
- ASSERT(bdesc->id == rxpkt->id);
-
- ref = bdesc->grant_ref;
- off = rxpkt->offset;
- len = rxpkt->status;
-
if (!xnfp->xnf_running) {
- DTRACE_PROBE4(xnf_rx_not_running,
- int, rxpkt->status,
- char *, bdesc->buf, int, rxpkt->offset,
- char *, ((char *)bdesc->buf) + rxpkt->offset);
-
- xnfp->xnf_stat_drop++;
-
+ error = EBUSY;
} else if (len <= 0) {
- DTRACE_PROBE4(xnf_rx_pkt_status_negative,
- int, rxpkt->status,
- char *, bdesc->buf, int, rxpkt->offset,
- char *, ((char *)bdesc->buf) + rxpkt->offset);
-
xnfp->xnf_stat_errrx++;
switch (len) {
@@ -2003,148 +2366,204 @@ xnf_rx_collect(xnf_t *xnfp)
xnfp->xnf_stat_norxbuf++;
break;
}
-
+ error = EINVAL;
} else if (bdesc->grant_ref == INVALID_GRANT_REF) {
- cmn_err(CE_WARN, "Bad rx grant reference %d "
- "from domain %d", ref,
- xvdi_get_oeid(xnfp->xnf_devinfo));
-
+ dev_err(xnfp->xnf_devinfo, CE_WARN,
+ "Bad rx grant reference, rsp id %d", rsp.id);
+ error = EINVAL;
} else if ((off + len) > PAGESIZE) {
- cmn_err(CE_WARN, "Rx packet overflows page "
- "(offset %ld, length %ld) from domain %d",
- off, len, xvdi_get_oeid(xnfp->xnf_devinfo));
- } else {
- xnf_buf_t *nbuf = NULL;
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "Rx packet crosses "
+ "page boundary (offset %d, length %d)", off, len);
+ error = EINVAL;
+ }
- DTRACE_PROBE4(xnf_rx_packet, int, len,
- char *, bdesc->buf, int, off,
- char *, ((char *)bdesc->buf) + off);
+ if (error != 0) {
+ /*
+ * If an error has been detected, we do not attempt
+ * to read the data but we still need to replace
+ * the rx bufs.
+ */
+ goto hang_buf;
+ }
- ASSERT(off + len <= PAGEOFFSET);
+ xnf_buf_t *nbuf = NULL;
+
+ /*
+ * If the packet is below a pre-determined size we will
+ * copy data out of the buf rather than replace it.
+ */
+ if (len > xnf_rx_copy_limit)
+ nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
- if (rxpkt->flags & NETRXF_data_validated)
- hwcsum = B_TRUE;
+ if (nbuf != NULL) {
+ mp = desballoc((unsigned char *)bdesc->buf,
+ bdesc->len, 0, &bdesc->free_rtn);
+
+ if (mp == NULL) {
+ xnfp->xnf_stat_rx_desballoc_fail++;
+ xnfp->xnf_stat_norxbuf++;
+ error = ENOMEM;
+ /*
+ * we free the buf we just allocated as we
+ * will re-hang the old buf.
+ */
+ xnf_buf_put(xnfp, nbuf, B_FALSE);
+ goto hang_buf;
+ }
+
+ mp->b_rptr = mp->b_rptr + off;
+ mp->b_wptr = mp->b_rptr + len;
/*
- * If the packet is below a pre-determined
- * size we will copy data out rather than
- * replace it.
+ * Release the grant as the backend doesn't need to
+ * access this buffer anymore and grants are scarce.
*/
- if (len > xnf_rx_copy_limit)
- nbuf = xnf_buf_get(xnfp, KM_NOSLEEP, B_FALSE);
+ (void) gnttab_end_foreign_access_ref(bdesc->grant_ref,
+ 0);
+ xnf_gref_put(xnfp, bdesc->grant_ref);
+ bdesc->grant_ref = INVALID_GRANT_REF;
+ bdesc = nbuf;
+ } else {
/*
- * If we have a replacement buffer, attempt to
- * wrap the existing one with an mblk_t in
- * order that the upper layers of the stack
- * might use it directly.
+ * We failed to allocate a new buf or decided to reuse
+ * the old one. In either case we copy the data off it
+ * and put it back into the ring.
*/
- if (nbuf != NULL) {
- mp = desballoc((unsigned char *)bdesc->buf,
- bdesc->len, 0, &bdesc->free_rtn);
- if (mp == NULL) {
- xnfp->xnf_stat_rx_desballoc_fail++;
- xnfp->xnf_stat_norxbuf++;
-
- xnf_buf_put(xnfp, nbuf, B_FALSE);
- nbuf = NULL;
- } else {
- mp->b_rptr = mp->b_rptr + off;
- mp->b_wptr = mp->b_rptr + len;
-
- /*
- * Release the grant reference
- * associated with this buffer
- * - they are scarce and the
- * upper layers of the stack
- * don't need it.
- */
- (void) gnttab_end_foreign_access_ref(
- bdesc->grant_ref, 0);
- gref_put(xnfp, bdesc->grant_ref);
- bdesc->grant_ref = INVALID_GRANT_REF;
-
- bdesc = nbuf;
- }
- }
-
- if (nbuf == NULL) {
- /*
- * No replacement buffer allocated -
- * attempt to copy the data out and
- * re-hang the existing buffer.
- */
-
- /* 4. */
- mp = allocb(len, BPRI_MED);
- if (mp == NULL) {
- xnfp->xnf_stat_rx_allocb_fail++;
- xnfp->xnf_stat_norxbuf++;
- } else {
- /* 5. */
- bcopy(bdesc->buf + off, mp->b_wptr,
- len);
- mp->b_wptr += len;
- }
+ mp = allocb(len, 0);
+ if (mp == NULL) {
+ xnfp->xnf_stat_rx_allocb_fail++;
+ xnfp->xnf_stat_norxbuf++;
+ error = ENOMEM;
+ goto hang_buf;
}
+ bcopy(bdesc->buf + off, mp->b_wptr, len);
+ mp->b_wptr += len;
}
- /* Re-hang the buffer. */
+ if (head == NULL)
+ head = mp;
+ else
+ tail->b_cont = mp;
+ tail = mp;
+
+hang_buf:
+ /*
+ * No matter what happens, for each response we need to hang
+ * a new buf on the rx ring. Put either the old one, or a new
+ * one if the old one is borrowed by the kernel via desballoc().
+ */
xnf_rxbuf_hang(xnfp, bdesc);
+ cons++;
- if (mp != NULL) {
- if (hwcsum) {
- /*
- * If the peer says that the data has
- * been validated then we declare that
- * the full checksum has been
- * verified.
- *
- * We don't look at the "checksum
- * blank" flag, and hence could have a
- * packet here that we are asserting
- * is good with a blank checksum.
- */
- mac_hcksum_set(mp, 0, 0, 0, 0,
- HCK_FULLCKSUM_OK);
- xnfp->xnf_stat_rx_cksum_no_need++;
- }
- if (head == NULL) {
- ASSERT(tail == NULL);
+ /* next response is an extra */
+ is_extra = more_extra;
- head = mp;
- } else {
- ASSERT(tail != NULL);
+ if (!more_data && !more_extra)
+ break;
- tail->b_next = mp;
- }
- tail = mp;
+ /*
+ * Note that since requests and responses are union'd on the
+ * same ring, we copy the response to a local variable instead
+ * of keeping a pointer. Otherwise xnf_rxbuf_hang() would have
+ * overwritten contents of rsp.
+ */
+ rsp = *RING_GET_RESPONSE(&xnfp->xnf_rx_ring, cons);
+ }
- ASSERT(mp->b_next == NULL);
+ /*
+ * Check that we do not get stuck in a loop.
+ */
+ ASSERT3U(*consp, !=, cons);
+ *consp = cons;
- xnfp->xnf_stat_ipackets++;
- xnfp->xnf_stat_rbytes += len;
- }
+ /*
+ * We ran out of responses but the flags indicate there is more data.
+ */
+ if (more_data) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments.");
+ error = EINVAL;
+ }
+ if (more_extra) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "rx: need more fragments "
+ "(extras).");
+ error = EINVAL;
+ }
- xnfp->xnf_rx_ring.rsp_cons++;
+ /*
+ * An error means the packet must be dropped. If we have already formed
+ * a partial packet, then discard it.
+ */
+ if (error != 0) {
+ if (head != NULL)
+ freemsg(head);
+ xnfp->xnf_stat_rx_drop++;
+ return (error);
}
+ ASSERT(head != NULL);
+
+ if (hwcsum) {
+ /*
+ * If the peer says that the data has been validated then we
+ * declare that the full checksum has been verified.
+ *
+ * We don't look at the "checksum blank" flag, and hence could
+ * have a packet here that we are asserting is good with
+ * a blank checksum.
+ */
+ mac_hcksum_set(head, 0, 0, 0, 0, HCK_FULLCKSUM_OK);
+ xnfp->xnf_stat_rx_cksum_no_need++;
+ }
+
+ /* XXX: set lro info for packet once LRO is supported in OS. */
+
+ *mpp = head;
+
+ return (0);
+}
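The borrow-or-copy logic above leans on desballoc(9F): buffers larger than
xnf_rx_copy_limit are loaned to the stack and come back through the buffer's
free routine, while smaller ones are bcopy'd so the original can be re-hung
immediately. A minimal sketch of the loaning pattern (my_buf_t and
my_buf_recycle are hypothetical stand-ins for xnf_buf_t and its recycle
callback):

    #include <sys/types.h>
    #include <sys/stream.h>

    typedef struct my_buf {
    	frtn_t	mb_frtn;	/* must outlive the loaned mblk */
    	caddr_t	mb_base;
    	size_t	mb_len;
    } my_buf_t;

    static void
    my_buf_recycle(caddr_t arg)
    {
    	my_buf_t *mbp = (my_buf_t *)arg;
    	/* Return mbp to the driver's cache/free list here. */
    }

    static mblk_t *
    my_buf_loan(my_buf_t *mbp)
    {
    	mbp->mb_frtn.free_func = my_buf_recycle;
    	mbp->mb_frtn.free_arg = (caddr_t)mbp;
    	/* The stack invokes my_buf_recycle() when it frees the mblk. */
    	return (desballoc((unsigned char *)mbp->mb_base, mbp->mb_len,
    	    0, &mbp->mb_frtn));
    }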
+
+/*
+ * Collect packets from the RX ring, storing them in `xnfp' for later use.
+ */
+static void
+xnf_rx_collect(xnf_t *xnfp)
+{
+ RING_IDX prod;
+
+ ASSERT(MUTEX_HELD(&xnfp->xnf_rxlock));
+
+ prod = xnfp->xnf_rx_ring.sring->rsp_prod;
/*
- * Store the mblks we have collected.
+ * Ensure we see queued responses up to 'prod'.
*/
- if (head != NULL) {
- ASSERT(tail != NULL);
+ membar_consumer();
- if (xnfp->xnf_rx_head == NULL) {
- ASSERT(xnfp->xnf_rx_tail == NULL);
+ while (xnfp->xnf_rx_ring.rsp_cons != prod) {
+ mblk_t *mp;
- xnfp->xnf_rx_head = head;
- } else {
- ASSERT(xnfp->xnf_rx_tail != NULL);
+ /*
+ * Collect a packet.
+ * rsp_cons is updated inside xnf_rx_one_packet().
+ */
+ int error = xnf_rx_one_packet(xnfp, prod,
+ &xnfp->xnf_rx_ring.rsp_cons, &mp);
+ if (error == 0) {
+ xnfp->xnf_stat_ipackets++;
+ xnfp->xnf_stat_rbytes += xmsgsize(mp);
- xnfp->xnf_rx_tail->b_next = head;
+ /*
+ * Append the mblk to the rx list.
+ */
+ if (xnfp->xnf_rx_head == NULL) {
+ ASSERT3P(xnfp->xnf_rx_tail, ==, NULL);
+ xnfp->xnf_rx_head = mp;
+ } else {
+ ASSERT(xnfp->xnf_rx_tail != NULL);
+ xnfp->xnf_rx_tail->b_next = mp;
+ }
+ xnfp->xnf_rx_tail = mp;
}
- xnfp->xnf_rx_tail = tail;
}
}
@@ -2306,7 +2725,7 @@ xnf_release_mblks(xnf_t *xnfp)
ASSERT(txp->tx_mp != NULL);
freemsg(txp->tx_mp);
- txid_put(xnfp, tidp);
+ xnf_txid_put(xnfp, tidp);
kmem_cache_free(xnfp->xnf_tx_buf_cache, txp);
}
}
@@ -2326,7 +2745,7 @@ xnf_buf_constructor(void *buf, void *arg, int kmflag)
ddiflags = DDI_DMA_DONTWAIT;
/* Allocate a DMA access handle for the buffer. */
- if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
+ if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &rx_buf_dma_attr,
ddiflags, 0, &bdesc->dma_handle) != DDI_SUCCESS)
goto failure;
@@ -2391,17 +2810,17 @@ xnf_buf_get(xnf_t *xnfp, int flags, boolean_t readonly)
* Usually grant references are more scarce than memory, so we
* attempt to acquire a grant reference first.
*/
- gref = gref_get(xnfp);
+ gref = xnf_gref_get(xnfp);
if (gref == INVALID_GRANT_REF)
return (NULL);
bufp = kmem_cache_alloc(xnfp->xnf_buf_cache, flags);
if (bufp == NULL) {
- gref_put(xnfp, gref);
+ xnf_gref_put(xnfp, gref);
return (NULL);
}
- ASSERT(bufp->grant_ref == INVALID_GRANT_REF);
+ ASSERT3U(bufp->grant_ref, ==, INVALID_GRANT_REF);
bufp->grant_ref = gref;
@@ -2423,7 +2842,7 @@ xnf_buf_put(xnf_t *xnfp, xnf_buf_t *bufp, boolean_t readonly)
if (bufp->grant_ref != INVALID_GRANT_REF) {
(void) gnttab_end_foreign_access_ref(
bufp->grant_ref, readonly ? 1 : 0);
- gref_put(xnfp, bufp->grant_ref);
+ xnf_gref_put(xnfp, bufp->grant_ref);
bufp->grant_ref = INVALID_GRANT_REF;
}
@@ -2464,7 +2883,7 @@ xnf_tx_buf_constructor(void *buf, void *arg, int kmflag)
if (kmflag & KM_NOSLEEP)
ddiflags = DDI_DMA_DONTWAIT;
- if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &buf_dma_attr,
+ if (ddi_dma_alloc_handle(xnfp->xnf_devinfo, &tx_buf_dma_attr,
ddiflags, 0, &txp->tx_dma_handle) != DDI_SUCCESS) {
ASSERT(kmflag & KM_NOSLEEP); /* Cannot fail for KM_SLEEP. */
return (-1);
@@ -2491,8 +2910,9 @@ static char *xnf_aux_statistics[] = {
"interrupts",
"unclaimed_interrupts",
"tx_pullup",
- "tx_pagebndry",
- "tx_attempt",
+ "tx_lookaside",
+ "tx_drop",
+ "tx_eth_hdr_split",
"buf_allocated",
"buf_outstanding",
"gref_outstanding",
@@ -2524,8 +2944,9 @@ xnf_kstat_aux_update(kstat_t *ksp, int flag)
(knp++)->value.ui64 = xnfp->xnf_stat_interrupts;
(knp++)->value.ui64 = xnfp->xnf_stat_unclaimed_interrupts;
(knp++)->value.ui64 = xnfp->xnf_stat_tx_pullup;
- (knp++)->value.ui64 = xnfp->xnf_stat_tx_pagebndry;
- (knp++)->value.ui64 = xnfp->xnf_stat_tx_attempt;
+ (knp++)->value.ui64 = xnfp->xnf_stat_tx_lookaside;
+ (knp++)->value.ui64 = xnfp->xnf_stat_tx_drop;
+ (knp++)->value.ui64 = xnfp->xnf_stat_tx_eth_hdr_split;
(knp++)->value.ui64 = xnfp->xnf_stat_buf_allocated;
(knp++)->value.ui64 = xnfp->xnf_stat_buf_outstanding;
@@ -2629,10 +3050,94 @@ xnf_stat(void *arg, uint_t stat, uint64_t *val)
return (0);
}
+static int
+xnf_change_mtu(xnf_t *xnfp, uint32_t mtu)
+{
+ if (mtu > ETHERMTU) {
+ if (!xnf_enable_tx_sg) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
+ "because scatter-gather is disabled for transmit "
+ "in driver settings", ETHERMTU);
+ return (EINVAL);
+ } else if (!xnf_enable_rx_sg) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
+ "because scatter-gather is disabled for receive "
+ "in driver settings", ETHERMTU);
+ return (EINVAL);
+ } else if (!xnfp->xnf_be_tx_sg) {
+ dev_err(xnfp->xnf_devinfo, CE_WARN, "MTU limited to %d "
+ "because backend doesn't support scatter-gather",
+ ETHERMTU);
+ return (EINVAL);
+ }
+ if (mtu > XNF_MAXPKT)
+ return (EINVAL);
+ }
+ int error = mac_maxsdu_update(xnfp->xnf_mh, mtu);
+ if (error == 0)
+ xnfp->xnf_mtu = mtu;
+
+ return (error);
+}
+
+/*ARGSUSED*/
+static int
+xnf_getprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
+ uint_t prop_val_size, void *prop_val)
+{
+ xnf_t *xnfp = data;
+
+ switch (prop_id) {
+ case MAC_PROP_MTU:
+ ASSERT(prop_val_size >= sizeof (uint32_t));
+ bcopy(&xnfp->xnf_mtu, prop_val, sizeof (uint32_t));
+ break;
+ default:
+ return (ENOTSUP);
+ }
+ return (0);
+}
+
+/*ARGSUSED*/
+static int
+xnf_setprop(void *data, const char *prop_name, mac_prop_id_t prop_id,
+ uint_t prop_val_size, const void *prop_val)
+{
+ xnf_t *xnfp = data;
+ uint32_t new_mtu;
+ int error;
+
+ switch (prop_id) {
+ case MAC_PROP_MTU:
+ ASSERT(prop_val_size >= sizeof (uint32_t));
+ bcopy(prop_val, &new_mtu, sizeof (new_mtu));
+ error = xnf_change_mtu(xnfp, new_mtu);
+ break;
+ default:
+ return (ENOTSUP);
+ }
+
+ return (error);
+}
+
+/*ARGSUSED*/
+static void
+xnf_propinfo(void *data, const char *prop_name, mac_prop_id_t prop_id,
+ mac_prop_info_handle_t prop_handle)
+{
+ switch (prop_id) {
+ case MAC_PROP_MTU:
+ mac_prop_info_set_range_uint32(prop_handle, 0, XNF_MAXPKT);
+ break;
+ default:
+ break;
+ }
+}
+
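With MTU exposed as a MAC property, it becomes administrable through the
standard dladm interface once the driver registers with mac. A usage sketch
(the link name xnf0 is illustrative):

    # dladm show-linkprop -p mtu xnf0
    # dladm set-linkprop -p mtu=9000 xnf0

Per xnf_change_mtu() above, values over 1500 succeed only when scatter-gather
is enabled in both the driver tunables and the backend.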
static boolean_t
xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
{
- _NOTE(ARGUNUSED(arg));
+ xnf_t *xnfp = arg;
switch (cap) {
case MAC_CAPAB_HCKSUM: {
@@ -2656,6 +3161,21 @@ xnf_getcapab(void *arg, mac_capab_t cap, void *cap_data)
* before passing the packet to the IO domain.
*/
*capab = HCKSUM_INET_FULL_V4;
+
+ /*
+ * TODO: query the "feature-ipv6-csum-offload" capability.
+ * If enabled, that could allow us to use HCKSUM_INET_PARTIAL.
+ */
+
+ break;
+ }
+ case MAC_CAPAB_LSO: {
+ if (!xnfp->xnf_be_lso)
+ return (B_FALSE);
+
+ mac_capab_lso_t *lso = cap_data;
+ lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
+ lso->lso_basic_tcp_ipv4.lso_max = IP_MAXPACKET;
break;
}
default:
@@ -2710,6 +3230,13 @@ oe_state_change(dev_info_t *dip, ddi_eventcookie_t id,
*/
mac_unicst_update(xnfp->xnf_mh, xnfp->xnf_mac_addr);
+ /*
+ * We do not know if some features such as LSO are supported
+ * until we connect to the backend. We request the MAC layer
+ * to poll our capabilities again.
+ */
+ mac_capab_update(xnfp->xnf_mh);
+
break;
case XenbusStateConnected:
diff --git a/usr/src/uts/common/xen/io/xnf.h b/usr/src/uts/common/xen/io/xnf.h
index 0c8eb2e373..63ce31020f 100644
--- a/usr/src/uts/common/xen/io/xnf.h
+++ b/usr/src/uts/common/xen/io/xnf.h
@@ -24,6 +24,10 @@
* Use is subject to license terms.
*/
+/*
+ * Copyright (c) 2014, 2017 by Delphix. All rights reserved.
+ */
+
#ifndef _SYS_XNF_H
#define _SYS_XNF_H
@@ -31,10 +35,19 @@
extern "C" {
#endif
+/*
+ * As of April 2017, TX and RX ring sizes are fixed at one page each and
+ * Xen doesn't support changing them; this yields 256 entries per ring.
+ */
#define NET_TX_RING_SIZE __CONST_RING_SIZE(netif_tx, PAGESIZE)
#define NET_RX_RING_SIZE __CONST_RING_SIZE(netif_rx, PAGESIZE)
-#define XNF_MAXPKT 1500 /* MTU size */
+/*
+ * There is no hard MTU limit; however, for all practical purposes hardware
+ * won't support anything much larger than 9k, so we impose an arbitrary
+ * 16k limit.
+ */
+#define XNF_MAXPKT 16384
#define XNF_FRAMESIZE 1514 /* frame size including MAC header */
/* DEBUG flags */
@@ -42,6 +55,18 @@ extern "C" {
#define XNF_DEBUG_TRACE 0x02
/*
+ * Based on XEN_NETIF_NR_SLOTS_MIN in Linux. Packets that span more pages
+ * than this must be defragmented or dropped.
+ */
+#define XEN_MAX_TX_DATA_PAGES 18
+/*
+ * We keep one extra slot for the LSO extra info.
+ */
+#define XEN_MAX_SLOTS_PER_TX (XEN_MAX_TX_DATA_PAGES + 1)
+
+#define XEN_DATA_BOUNDARY 0x1000
+
+/*
* Information about each receive buffer and any transmit look-aside
* buffers.
*/
@@ -63,23 +88,41 @@ typedef struct xnf_buf {
/*
* Information about each transmit buffer.
*/
+typedef enum xnf_txbuf_type {
+ TX_DATA = 1,
+ TX_MCAST_REQ,
+ TX_MCAST_RSP
+} xnf_txbuf_type_t;
+
+/*
+ * A xnf_txbuf is used to store ancillary data for a netif_tx_request_t.
+ * A tx packet can span multiple xnf_txbuf's, linked together through tx_next
+ * and tx_prev; tx_head points to the head of the chain.
+ */
typedef struct xnf_txbuf {
struct xnf_txbuf *tx_next;
- mblk_t *tx_mp; /* mblk associated with packet */
+ struct xnf_txbuf *tx_prev;
+ struct xnf_txbuf *tx_head;
+ xnf_txbuf_type_t tx_type;
netif_tx_request_t tx_txreq;
- caddr_t tx_bufp;
+ netif_extra_info_t tx_extra;
+ /* Used for TX_DATA types */
ddi_dma_handle_t tx_dma_handle;
- mfn_t tx_mfn;
+ boolean_t tx_handle_bound;
+ mblk_t *tx_mp;
xnf_buf_t *tx_bdesc; /* Look-aside buffer, if used. */
- unsigned char tx_type;
+ int tx_frags_to_ack;
+ /* Used for TX_MCAST types */
int16_t tx_status;
+ /* Used for debugging */
+ mfn_t tx_mfn;
RING_IDX tx_slot;
-
-#define TX_DATA 1
-#define TX_MCAST_REQ 2
-#define TX_MCAST_RSP 3
} xnf_txbuf_t;
+#define TXBUF_SETNEXT(head, next)	\
+	do {	\
+		(head)->tx_next = (next);	\
+		(next)->tx_prev = (head);	\
+	} while (0)
+
/*
* Information about each outstanding transmit operation.
*/
@@ -97,6 +140,7 @@ typedef struct xnf {
dev_info_t *xnf_devinfo;
mac_handle_t xnf_mh;
unsigned char xnf_mac_addr[ETHERADDRL];
+ uint32_t xnf_mtu;
unsigned int xnf_gen; /* Increments on resume. */
@@ -105,17 +149,20 @@ typedef struct xnf {
boolean_t xnf_be_rx_copy;
boolean_t xnf_be_mcast_control;
+ boolean_t xnf_be_tx_sg;
+ boolean_t xnf_be_lso;
uint64_t xnf_stat_interrupts;
uint64_t xnf_stat_unclaimed_interrupts;
uint64_t xnf_stat_norxbuf;
- uint64_t xnf_stat_drop;
+ uint64_t xnf_stat_rx_drop;
uint64_t xnf_stat_errrx;
- uint64_t xnf_stat_tx_attempt;
uint64_t xnf_stat_tx_pullup;
- uint64_t xnf_stat_tx_pagebndry;
+ uint64_t xnf_stat_tx_lookaside;
uint64_t xnf_stat_tx_defer;
+ uint64_t xnf_stat_tx_drop;
+ uint64_t xnf_stat_tx_eth_hdr_split;
uint64_t xnf_stat_mac_rcv_error;
uint64_t xnf_stat_runt;
@@ -145,7 +192,7 @@ typedef struct xnf {
paddr_t xnf_tx_ring_phys_addr;
grant_ref_t xnf_tx_ring_ref;
- xnf_txid_t xnf_tx_pkt_id[NET_TX_RING_SIZE];
+ xnf_txid_t *xnf_tx_pkt_id;
uint16_t xnf_tx_pkt_id_head;
kmutex_t xnf_txlock;
kmutex_t xnf_schedlock;
@@ -159,7 +206,7 @@ typedef struct xnf {
paddr_t xnf_rx_ring_phys_addr;
grant_ref_t xnf_rx_ring_ref;
- xnf_buf_t *xnf_rx_pkt_info[NET_RX_RING_SIZE];
+ xnf_buf_t **xnf_rx_pkt_info;
kmutex_t xnf_rxlock;
mblk_t *xnf_rx_head;
mblk_t *xnf_rx_tail;