Diffstat (limited to 'usr/src/uts/common/inet')
-rw-r--r--  usr/src/uts/common/inet/ip.h           |  49
-rw-r--r--  usr/src/uts/common/inet/ip/ip.c        | 283
-rw-r--r--  usr/src/uts/common/inet/ip/ip6.c       |   5
-rw-r--r--  usr/src/uts/common/inet/ip/ip_if.c     |   2
-rw-r--r--  usr/src/uts/common/inet/ip_impl.h      |  84
-rw-r--r--  usr/src/uts/common/inet/ip_stack.h     |   7
-rw-r--r--  usr/src/uts/common/inet/ipclassifier.h |   1
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp.c      |   5
-rw-r--r--  usr/src/uts/common/inet/udp/udp.c      |  79
-rw-r--r--  usr/src/uts/common/inet/udp_impl.h     |   7
10 files changed, 412 insertions(+), 110 deletions(-)
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index ff820814bf..a18c3d0f4c 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -1128,7 +1128,7 @@ typedef struct iulp_s {
extern const iulp_t ire_uinfo_null;
/*
- * The conn drain list structure.
+ * The conn drain list structure (idl_t).
* The list is protected by idl_lock. Each conn_t inserted in the list
* points back at this idl_t using conn_idl. IP primes the draining of the
* conns queued in these lists, by qenabling the 1st conn of each list. This
@@ -1137,8 +1137,27 @@ extern const iulp_t ire_uinfo_null;
* idl_lock protects all other members of idl_t and conn_drain_next
* and conn_drain_prev of conn_t. The conn_lock protects IPCF_DRAIN_DISABLED
* flag of the conn_t and conn_idl.
+ *
+ * The conn drain list, idl_t, is itself part of the tx cookie list
+ * structure. A tx cookie list points to a blocked Tx ring and contains
+ * the list of all conn's that are blocked due to that flow-controlled
+ * Tx ring (via the idl drain list). Note that a link can have multiple
+ * Tx rings; each drain list stores the conn's blocked because its Tx
+ * ring is flow controlled.
*/
-typedef struct idl_s {
+
+typedef uintptr_t ip_mac_tx_cookie_t;
+typedef struct idl_s idl_t;
+typedef struct idl_tx_list_s idl_tx_list_t;
+
+struct idl_tx_list_s {
+ ip_mac_tx_cookie_t txl_cookie;
+ kmutex_t txl_lock; /* Lock for this list */
+ idl_t *txl_drain_list;
+ int txl_drain_index;
+};
+
+struct idl_s {
conn_t *idl_conn; /* Head of drain list */
kmutex_t idl_lock; /* Lock for this list */
conn_t *idl_conn_draining; /* conn that is draining */
@@ -1146,7 +1165,8 @@ typedef struct idl_s {
idl_repeat : 1, /* Last conn must re-enable */
/* drain list again */
idl_unused : 31;
-} idl_t;
+ idl_tx_list_t *idl_itl;
+};
#define CONN_DRAIN_LIST_LOCK(connp) (&((connp)->conn_idl->idl_lock))
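A short hedged sketch of the resulting containment, using only the fields defined above plus IDLHASHINDEX() and ips_idl_tx_list from ip_stack.h (later in this diff), and assuming cookie, ipst, and i are in scope:

	/* from the cookie, find the per-ring tx list ... */
	idl_tx_list_t *txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
	/* ... one of its conn drain lists ... */
	idl_t *idl = &txl->txl_drain_list[i];	/* i < ips_conn_drain_list_cnt */
	/* ... and the first blocked conn on that list */
	conn_t *head = idl->idl_conn;
	ASSERT(idl->idl_itl == txl);	/* each idl back-references its tx list */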
/*
@@ -3336,8 +3356,8 @@ extern ill_t *ipmp_ipif_hold_bound_ill(const ipif_t *);
extern boolean_t ipmp_ipif_is_dataaddr(const ipif_t *);
extern boolean_t ipmp_ipif_is_stubaddr(const ipif_t *);
-extern void conn_drain_insert(conn_t *connp);
-extern int conn_ipsec_length(conn_t *connp);
+extern void conn_drain_insert(conn_t *, idl_tx_list_t *);
+extern int conn_ipsec_length(conn_t *);
extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *,
ire_t *);
extern ipaddr_t ip_get_dst(ipha_t *);
@@ -3587,13 +3607,16 @@ typedef enum {
* we need to duplicate the definitions here because we cannot
* include mac/dls header files here.
*/
-typedef void *ip_mac_tx_cookie_t;
-typedef void (*ip_mac_intr_disable_t)(void *);
-typedef void (*ip_mac_intr_enable_t)(void *);
-typedef void *(*ip_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t);
-typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t);
-typedef void *(*ip_dld_callb_t)(void *, ip_flow_enable_t, void *);
-typedef int (*ip_capab_func_t)(void *, uint_t, void *, uint_t);
+typedef void (*ip_mac_intr_disable_t)(void *);
+typedef void (*ip_mac_intr_enable_t)(void *);
+typedef ip_mac_tx_cookie_t (*ip_dld_tx_t)(void *, mblk_t *,
+ uint64_t, uint16_t);
+typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t);
+typedef void *(*ip_dld_callb_t)(void *,
+ ip_flow_enable_t, void *);
+typedef boolean_t (*ip_dld_fctl_t)(void *, ip_mac_tx_cookie_t);
+typedef int (*ip_capab_func_t)(void *, uint_t,
+ void *, uint_t);
/*
* POLLING README
@@ -3640,6 +3663,8 @@ typedef struct ill_dld_direct_s { /* DLD provided driver Tx */
void *idd_tx_dh; /* dld_str_t *dsp */
ip_dld_callb_t idd_tx_cb_df; /* mac_tx_srs_notify */
void *idd_tx_cb_dh; /* mac_client_handle_t *mch */
+ ip_dld_fctl_t idd_tx_fctl_df; /* mac_tx_is_flow_blocked */
+ void *idd_tx_fctl_dh; /* mac_client_handle */
} ill_dld_direct_t;
/* IP - DLD polling capability */
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index b040d36c8a..116ae8ccec 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -451,29 +451,115 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t,
* policy change may affect them.
*
* IP Flow control notes:
+ * ---------------------
+ * Non-TCP streams are flow controlled by IP. How this is accomplished
+ * differs depending on whether ILL_CAPAB_DLD_DIRECT is enabled for the IP
+ * instance. When ILL_DIRECT_CAPABLE(ill) is TRUE, IP can make direct
+ * function calls into GLDv3. Otherwise packets are sent down to the lower
+ * layers using STREAMS functions.
*
- * Non-TCP streams are flow controlled by IP. On the send side, if the packet
- * cannot be sent down to the driver by IP, because of a canput failure, IP
- * does a putq on the conn_wq. This will cause ip_wsrv to run on the conn_wq.
- * ip_wsrv in turn, inserts the conn in a list of conn's that need to be drained
- * when the flowcontrol condition subsides. Ultimately STREAMS backenables the
- * ip_wsrv on the IP module, which in turn does a qenable of the conn_wq of the
- * first conn in the list of conn's to be drained. ip_wsrv on this conn drains
- * the queued messages, and removes the conn from the drain list, if all
- * messages were drained. It also qenables the next conn in the drain list to
- * continue the drain process.
+ * Per Tx ring UDP flow control:
+ * This is applicable only when the ILL_CAPAB_DLD_DIRECT capability is set
+ * in the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true).
+ *
+ * The underlying link can expose multiple Tx rings to the GLDv3 mac layer.
+ * To achieve best performance, outgoing traffic needs to be fanned out
+ * among these Tx rings. mac_tx() is called (via str_mdata_fastpath_put())
+ * to send traffic out of the NIC and it takes a fanout hint. UDP
+ * connections pass the address of the connp as the fanout hint to
+ * mac_tx(). Under flow-controlled conditions, mac_tx() returns a non-NULL
+ * cookie (ip_mac_tx_cookie_t). The cookie identifies the specific Tx ring
+ * that is blocked and is used to hash into an entry of the
+ * ips_idl_tx_list[] array. Each idl_tx_list_t points to a set of drain
+ * lists (idl_t's) that store the blocked UDP connp's; the drain list is
+ * not a single list but a configurable number of lists.
+ *
+ * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t
+ * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE,
+ * which is 128. Each entry in turn points to an idl_t[] array, the IP
+ * drain lists. The idl_t[] array size is MIN(max_ncpus, 8). Each drain
+ * list points to the list of connp's that are flow controlled.
+ *
+ * --------------- ------- ------- -------
+ * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
+ * | --------------- ------- ------- -------
+ * | --------------- ------- ------- -------
+ * |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
+ * ---------------- | --------------- ------- ------- -------
+ * |idl_tx_list[0]|->| --------------- ------- ------- -------
+ * ---------------- |->|drain_list[2]|-->|connp|-->|connp|-->|connp|-->
+ * | --------------- ------- ------- -------
+ * . . . . .
+ * | --------------- ------- ------- -------
+ * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
+ * --------------- ------- ------- -------
+ * --------------- ------- ------- -------
+ * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|-->
+ * | --------------- ------- ------- -------
+ * | --------------- ------- ------- -------
+ * ---------------- |->|drain_list[1]|-->|connp|-->|connp|-->|connp|-->
+ * |idl_tx_list[1]|->| --------------- ------- ------- -------
+ * ---------------- | . . . .
+ * | --------------- ------- ------- -------
+ * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|-->
+ * --------------- ------- ------- -------
+ * .....
+ * ----------------
+ * |idl_tx_list[n]|-> ...
+ * ----------------
+ *
+ * When mac_tx() returns a cookie, the cookie is hashed into an
+ * idl_tx_list in the ips_idl_tx_list[] array. Then conn_drain_insert()
+ * is called, passing the idl_tx_list. The connp gets inserted in a drain
+ * list pointed to by the idl_tx_list. conn_drain_insert() asserts flow
+ * control for non-STREAMS sockets (via su_txq_full) and sets the QFULL
+ * condition on conn_wq for STREAMS endpoints.
+ * connp->conn_direct_blocked is set to indicate the blocked condition.
+ *
+ * The GLDv3 mac layer calls ill_flow_enable() when flow control is
+ * relieved. A cookie passed in the call to ill_flow_enable() identifies
+ * the blocked Tx ring. This cookie is used to get to the idl_tx_list
+ * that contains the blocked connp's. conn_walk_drain() takes the
+ * idl_tx_list_t and goes through each of its drain lists, qenabling the
+ * conn_wq of the first conn in each drain list. This causes ip_wsrv to
+ * run for that conn. ip_wsrv drains the queued messages and removes the
+ * conn from the drain list if all messages were drained. It also
+ * qenables the next conn in the drain list to continue the drain process.
*
* In reality the drain list is not a single list, but a configurable number
- * of lists. The ip_wsrv on the IP module, qenables the first conn in each
- * list. If the ip_wsrv of the next qenabled conn does not run, because the
+ * of lists. conn_walk_drain() in the IP module qenables the first conn in
+ * each list. If the ip_wsrv of the next qenabled conn does not run because
+ * the stream closes, ip_close takes responsibility to qenable the next conn
+ * in the drain list. conn_drain_insert and conn_drain_tail are the only
+ * functions that manipulate this drain list. conn_drain_insert is called in
+ * ip_wput context itself (as opposed to from ip_wsrv context for the STREAMS
+ * case -- see below). The synchronization between drain insertion and flow
+ * control wakeup is handled by using idl_txl->txl_lock.
+ *
+ * Flow control using STREAMS:
+ * When ILL_DIRECT_CAPABLE() is not TRUE, the STREAMS flow control
+ * mechanism is used. On the send side, if the packet cannot be sent down
+ * to the driver by IP because of a canput failure, IP does a putq on the
+ * conn_wq. This causes ip_wsrv to run on the conn_wq. ip_wsrv, in turn,
+ * inserts the conn in a list of conn's that need to be drained when the
+ * flow control condition subsides. The blocked connps are put in the
+ * first member of the ips_idl_tx_list[] array. Ultimately STREAMS
+ * backenables the ip_wsrv on the IP module, which calls conn_walk_drain()
+ * passing ips_idl_tx_list[0]. ips_idl_tx_list[0] contains the drain lists
+ * of blocked conns. The conn_wq of the first conn in each drain list is
+ * qenabled to run. ip_wsrv on this conn drains the queued messages and
+ * removes the conn from the drain list if all messages were drained. It
+ * also qenables the next conn in the drain list to continue the drain
+ * process.
+ *
+ * If the ip_wsrv of the next qenabled conn does not run, because the
* stream closes, ip_close takes responsibility to qenable the next conn in
* the drain list. The directly called ip_wput path always does a putq, if
* it cannot putnext. Thus synchronization problems are handled between
* ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only
* functions that manipulate this drain list. Furthermore conn_drain_insert
- * is called only from ip_wsrv, and there can be only 1 instance of ip_wsrv
- * running on a queue at any time. conn_drain_tail can be simultaneously called
- * from both ip_wsrv and ip_close.
+ * is called only from ip_wsrv for the STREAMS case, and there can be only 1
+ * instance of ip_wsrv running on a queue at any time. conn_drain_tail can
+ * be simultaneously called from both ip_wsrv and ip_close.
*
* IPQOS notes:
*
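The direct-path blocking sequence described in the notes above can be condensed into a short sketch. This is a simplified illustration only, mirroring the ILL_SEND_TX() macro and udp_xmit() changes later in this diff; DTrace probes, stats, and the cookie-collision case are omitted, and connp, idd, and ipst are assumed to be in scope as they are in udp_xmit():

	/*
	 * Sketch: direct (ILL_DIRECT_CAPABLE) send with per-ring flow
	 * control. A non-NULL cookie from the dld transmit entry point
	 * identifies a blocked Tx ring; hash it into ips_idl_tx_list[]
	 * and park the conn on a drain list.
	 */
	cookie = idd->idd_tx_df(idd->idd_tx_dh, mp, (uintptr_t)connp, 0);
	if (cookie != NULL) {
		idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
		mutex_enter(&idl_txl->txl_lock);
		/* Recheck under txl_lock: the ring may have unblocked. */
		if (!connp->conn_direct_blocked &&
		    idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, cookie) != 0) {
			connp->conn_direct_blocked = B_TRUE;
			idl_txl->txl_cookie = cookie;
			conn_drain_insert(connp, idl_txl); /* assert flow ctl */
		}
		mutex_exit(&idl_txl->txl_lock);
	}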
@@ -732,9 +818,11 @@ static void conn_drain_init(ip_stack_t *);
static void conn_drain_fini(ip_stack_t *);
static void conn_drain_tail(conn_t *connp, boolean_t closing);
-static void conn_walk_drain(ip_stack_t *);
+static void conn_walk_drain(ip_stack_t *, idl_tx_list_t *);
static void conn_walk_fanout_table(connf_t *, uint_t, pfv_t, void *,
zoneid_t);
+static void conn_setqfull(conn_t *);
+static void conn_clrqfull(conn_t *);
static void *ip_stack_init(netstackid_t stackid, netstack_t *ns);
static void ip_stack_shutdown(netstackid_t stackid, void *arg);
@@ -5372,6 +5460,7 @@ ip_modclose(ill_t *ill)
ipif_t *ipif;
queue_t *q = ill->ill_rq;
ip_stack_t *ipst = ill->ill_ipst;
+ int i;
/*
* The punlink prior to this may have initiated a capability
@@ -5463,7 +5552,9 @@ ip_modclose(ill_t *ill)
* get unblocked.
*/
ip1dbg(("ip_wsrv: walking\n"));
- conn_walk_drain(ipst);
+ for (i = 0; i < TX_FANOUT_SIZE; i++) {
+ conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]);
+ }
mutex_enter(&ipst->ips_ip_mi_lock);
mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill);
@@ -13908,8 +13999,7 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp)
ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone,
ALL_ZONES, ill, IPV4_VERSION, hlen, ipst);
}
-
- ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC);
+ ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC, NULL);
}
return (ire);
@@ -22341,8 +22431,13 @@ another:;
if (!IP_FLOW_CONTROLLED_ULP(PROTO)) {
queue_t *dev_q = stq->q_next;
- /* flow controlled */
- if (DEV_Q_FLOW_BLOCKED(dev_q))
+ /*
+ * For DIRECT_CAPABLE, we do flow control at
+ * the time of sending the packet. See
+ * ILL_SEND_TX().
+ */
+ if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) &&
+ (DEV_Q_FLOW_BLOCKED(dev_q)))
goto blocked;
if ((PROTO == IPPROTO_UDP) &&
@@ -22765,7 +22860,8 @@ broadcast:
} else {
queue_t *dev_q = stq->q_next;
- if (DEV_Q_FLOW_BLOCKED(dev_q)) {
+ if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) &&
+ (DEV_Q_FLOW_BLOCKED(dev_q))) {
blocked:
ipha->ipha_ident = ip_hdr_included;
/*
@@ -22780,10 +22876,15 @@ blocked:
connp != NULL &&
caller != IRE_SEND) {
if (caller == IP_WSRV) {
+ idl_tx_list_t *idl_txl;
+
+ idl_txl =
+ &ipst->ips_idl_tx_list[0];
connp->conn_did_putbq = 1;
(void) putbq(connp->conn_wq,
first_mp);
- conn_drain_insert(connp);
+ conn_drain_insert(connp,
+ idl_txl);
/*
* This is the service thread,
* and the queue is already
@@ -24401,7 +24502,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill,
ipha_t *, ipha, ip6_t *, NULL, int, 0);
- ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0);
+ ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0, connp);
BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits);
UPDATE_MIB(out_ill->ill_ip_mib,
@@ -24708,7 +24809,8 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag,
__dtrace_ipsr_ill_t *, out_ill, ipha_t *,
ipha, ip6_t *, NULL, int, 0);
- ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0);
+ ILL_SEND_TX(out_ill, ire, connp,
+ xmit_mp, 0, connp);
BUMP_MIB(out_ill->ill_ip_mib,
ipIfStatsHCOutTransmits);
@@ -27921,7 +28023,8 @@ bad_src_route:
static void
conn_drain_init(ip_stack_t *ipst)
{
- int i;
+ int i, j;
+ idl_tx_list_t *itl_tx;
ipst->ips_conn_drain_list_cnt = conn_drain_nthreads;
@@ -27937,12 +28040,19 @@ conn_drain_init(ip_stack_t *ipst)
ipst->ips_conn_drain_list_cnt = MIN(max_ncpus, 8);
}
- ipst->ips_conn_drain_list = kmem_zalloc(ipst->ips_conn_drain_list_cnt *
- sizeof (idl_t), KM_SLEEP);
-
- for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) {
- mutex_init(&ipst->ips_conn_drain_list[i].idl_lock, NULL,
- MUTEX_DEFAULT, NULL);
+ ipst->ips_idl_tx_list =
+ kmem_zalloc(TX_FANOUT_SIZE * sizeof (idl_tx_list_t), KM_SLEEP);
+ for (i = 0; i < TX_FANOUT_SIZE; i++) {
+ itl_tx = &ipst->ips_idl_tx_list[i];
+ itl_tx->txl_drain_list =
+ kmem_zalloc(ipst->ips_conn_drain_list_cnt *
+ sizeof (idl_t), KM_SLEEP);
+ mutex_init(&itl_tx->txl_lock, NULL, MUTEX_DEFAULT, NULL);
+ for (j = 0; j < ipst->ips_conn_drain_list_cnt; j++) {
+ mutex_init(&itl_tx->txl_drain_list[j].idl_lock, NULL,
+ MUTEX_DEFAULT, NULL);
+ itl_tx->txl_drain_list[j].idl_itl = itl_tx;
+ }
}
}
@@ -27950,12 +28060,16 @@ static void
conn_drain_fini(ip_stack_t *ipst)
{
int i;
+ idl_tx_list_t *itl_tx;
- for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++)
- mutex_destroy(&ipst->ips_conn_drain_list[i].idl_lock);
- kmem_free(ipst->ips_conn_drain_list,
- ipst->ips_conn_drain_list_cnt * sizeof (idl_t));
- ipst->ips_conn_drain_list = NULL;
+ for (i = 0; i < TX_FANOUT_SIZE; i++) {
+ itl_tx = &ipst->ips_idl_tx_list[i];
+ kmem_free(itl_tx->txl_drain_list,
+ ipst->ips_conn_drain_list_cnt * sizeof (idl_t));
+ }
+ kmem_free(ipst->ips_idl_tx_list,
+ TX_FANOUT_SIZE * sizeof (idl_tx_list_t));
+ ipst->ips_idl_tx_list = NULL;
}
/*
@@ -27968,16 +28082,11 @@ conn_drain_fini(ip_stack_t *ipst)
* the first conn in each of these drain lists. Each of these qenabled conns
* in turn enables the next in the list, after it runs, or when it closes,
* thus sustaining the drain process.
- *
- * The only possible calling sequence is ip_wsrv (on conn) -> ip_wput ->
- * conn_drain_insert. Thus there can be only 1 instance of conn_drain_insert
- * running at any time, on a given conn, since there can be only 1 service proc
- * running on a queue at any time.
*/
void
-conn_drain_insert(conn_t *connp)
+conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list)
{
- idl_t *idl;
+ idl_t *idl = tx_list->txl_drain_list;
uint_t index;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
@@ -27996,13 +28105,13 @@ conn_drain_insert(conn_t *connp)
* Atomicity of load/stores is enough to make sure that
* conn_drain_list_index is always within bounds.
*/
- index = ipst->ips_conn_drain_list_index;
+ index = tx_list->txl_drain_index;
ASSERT(index < ipst->ips_conn_drain_list_cnt);
- connp->conn_idl = &ipst->ips_conn_drain_list[index];
+ connp->conn_idl = &tx_list->txl_drain_list[index];
index++;
if (index == ipst->ips_conn_drain_list_cnt)
index = 0;
- ipst->ips_conn_drain_list_index = index;
+ tx_list->txl_drain_index = index;
}
mutex_exit(&connp->conn_lock);
@@ -28044,8 +28153,12 @@ conn_drain_insert(conn_t *connp)
* For non streams based sockets assert flow control.
*/
if (IPCL_IS_NONSTR(connp)) {
+ DTRACE_PROBE1(su__txq__full, conn_t *, connp);
(*connp->conn_upcalls->su_txq_full)
(connp->conn_upper_handle, B_TRUE);
+ } else {
+ conn_setqfull(connp);
+ noenable(connp->conn_wq);
}
mutex_exit(CONN_DRAIN_LIST_LOCK(connp));
}
@@ -28167,6 +28280,9 @@ conn_drain_tail(conn_t *connp, boolean_t closing)
if (IPCL_IS_NONSTR(connp)) {
(*connp->conn_upcalls->su_txq_full)
(connp->conn_upper_handle, B_FALSE);
+ } else {
+ conn_clrqfull(connp);
+ enableok(connp->conn_wq);
}
}
@@ -28194,6 +28310,8 @@ ip_wsrv(queue_t *q)
if (q->q_next) {
ill = (ill_t *)q->q_ptr;
if (ill->ill_state_flags == 0) {
+ ip_stack_t *ipst = ill->ill_ipst;
+
/*
* The device flow control has opened up.
* Walk through conn drain lists and qenable the
@@ -28202,7 +28320,7 @@ ip_wsrv(queue_t *q)
* Hence the if check above.
*/
ip1dbg(("ip_wsrv: walking\n"));
- conn_walk_drain(ill->ill_ipst);
+ conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]);
}
return;
}
@@ -28229,12 +28347,14 @@ ip_wsrv(queue_t *q)
* (causing an infinite loop).
*/
ASSERT(!connp->conn_did_putbq);
+
while ((q->q_first != NULL) && !connp->conn_did_putbq) {
connp->conn_draining = 1;
noenable(q);
while ((mp = getq(q)) != NULL) {
ASSERT(CONN_Q(q));
+ DTRACE_PROBE1(ip__wsrv__ip__output, conn_t *, connp);
ip_output(Q_TO_CONN(q), mp, q, IP_WSRV);
if (connp->conn_did_putbq) {
/* ip_wput did a putbq */
@@ -28253,12 +28373,23 @@ ip_wsrv(queue_t *q)
*/
connp->conn_draining = 0;
enableok(q);
-
}
/* Enable the next conn for draining */
conn_drain_tail(connp, B_FALSE);
+ /*
+ * conn_direct_blocked is used to indicate a blocked
+ * condition for the direct path (ILL_DIRECT_CAPABLE()).
+ * This is the only place where it is cleared without
+ * checking for ILL_DIRECT_CAPABLE(), and clearing it
+ * is ok even if the ill is not ILL_DIRECT_CAPABLE().
+ */
+ if (!connp->conn_did_putbq && connp->conn_direct_blocked) {
+ DTRACE_PROBE1(ip__wsrv__direct__blocked, conn_t *, connp);
+ connp->conn_direct_blocked = B_FALSE;
+ }
+
connp->conn_did_putbq = 0;
}
@@ -28274,11 +28405,18 @@ ip_wsrv(queue_t *q)
* function and wakes up corresponding mac worker threads, which in turn
* calls this callback function, and disables flow control.
*/
-/* ARGSUSED */
void
-ill_flow_enable(void *ill, ip_mac_tx_cookie_t cookie)
+ill_flow_enable(void *arg, ip_mac_tx_cookie_t cookie)
{
- qenable(((ill_t *)ill)->ill_wq);
+ ill_t *ill = (ill_t *)arg;
+ ip_stack_t *ipst = ill->ill_ipst;
+ idl_tx_list_t *idl_txl;
+
+ idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
+ mutex_enter(&idl_txl->txl_lock);
+ /* add code to set a flag to indicate idl_txl is enabled */
+ conn_walk_drain(ipst, idl_txl);
+ mutex_exit(&idl_txl->txl_lock);
}
/*
@@ -28315,7 +28453,7 @@ conn_walk_fanout(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst)
* in turn qenable the next conn, when it is done/blocked/closing.
*/
static void
-conn_walk_drain(ip_stack_t *ipst)
+conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list)
{
int i;
idl_t *idl;
@@ -28323,7 +28461,7 @@ conn_walk_drain(ip_stack_t *ipst)
IP_STAT(ipst, ip_conn_walk_drain);
for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) {
- idl = &ipst->ips_conn_drain_list[i];
+ idl = &tx_list->txl_drain_list[i];
mutex_enter(&idl->idl_lock);
if (idl->idl_conn == NULL) {
mutex_exit(&idl->idl_lock);
@@ -28521,6 +28659,41 @@ conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags,
return (found);
}
+static void
+conn_setqfull(conn_t *connp)
+{
+ queue_t *q = connp->conn_wq;
+
+ if (!(q->q_flag & QFULL)) {
+ mutex_enter(QLOCK(q));
+ if (!(q->q_flag & QFULL)) {
+ /* still need to set QFULL */
+ q->q_flag |= QFULL;
+ mutex_exit(QLOCK(q));
+ } else {
+ mutex_exit(QLOCK(q));
+ }
+ }
+}
+
+static void
+conn_clrqfull(conn_t *connp)
+{
+ queue_t *q = connp->conn_wq;
+
+ if (q->q_flag & QFULL) {
+ mutex_enter(QLOCK(q));
+ if (q->q_flag & QFULL) {
+ q->q_flag &= ~QFULL;
+ mutex_exit(QLOCK(q));
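+ /* a writer is waiting on this queue; back-enable it */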
+ if (q->q_flag & QWANTW)
+ qbackenable(q, 0);
+ } else {
+ mutex_exit(QLOCK(q));
+ }
+ }
+}
+
/*
* Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp.
*/
@@ -29666,7 +29839,7 @@ ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io,
0);
ILL_SEND_TX(out_ill,
- ire, connp, first_mp, 0);
+ ire, connp, first_mp, 0, connp);
} else {
BUMP_MIB(out_ill->ill_ip_mib,
ipIfStatsOutDiscards);
diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c
index 686e2ad94e..fe10ea8110 100644
--- a/usr/src/uts/common/inet/ip/ip6.c
+++ b/usr/src/uts/common/inet/ip/ip6.c
@@ -10807,9 +10807,12 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src,
if (ipst->ips_ip_output_queue && connp != NULL &&
!mctl_present && caller != IRE_SEND) {
if (caller == IP_WSRV) {
+ idl_tx_list_t *idl_txl;
+
+ idl_txl = &ipst->ips_idl_tx_list[0];
connp->conn_did_putbq = 1;
(void) putbq(connp->conn_wq, mp);
- conn_drain_insert(connp);
+ conn_drain_insert(connp, idl_txl);
/*
* caller == IP_WSRV implies we are
* the service thread, and the
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index 64f9789fe9..3628dd4f56 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -3083,6 +3083,8 @@ ill_capability_direct_enable(ill_t *ill)
idd->idd_tx_dh = direct.di_tx_dh;
idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df;
idd->idd_tx_cb_dh = direct.di_tx_cb_dh;
+ idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df;
+ idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh;
/*
* One time registration of flow enable callback function
*/
diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h
index 369ba60005..7711a2fedc 100644
--- a/usr/src/uts/common/inet/ip_impl.h
+++ b/usr/src/uts/common/inet/ip_impl.h
@@ -503,24 +503,72 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t;
#define ILL_DIRECT_CAPABLE(ill) \
(((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0)
-#define ILL_SEND_TX(ill, ire, hint, mp, flag) { \
- if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) { \
- ill_dld_direct_t *idd; \
- \
- idd = &(ill)->ill_dld_capab->idc_direct; \
- /* \
- * Send the packet directly to DLD, where it \
- * may be queued depending on the availability \
- * of transmit resources at the media layer. \
- * Ignore the returned value for the time being \
- * In future, we may want to take this into \
- * account and flow control the TCP. \
- */ \
- (void) idd->idd_tx_df(idd->idd_tx_dh, mp, \
- (uintptr_t)(hint), flag); \
- } else { \
- putnext((ire)->ire_stq, mp); \
- } \
+#define ILL_SEND_TX(ill, ire, hint, mp, flag, connp) { \
+ if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) { \
+ ill_dld_direct_t *idd; \
+ uintptr_t cookie; \
+ conn_t *udp_connp = (conn_t *)connp; \
+ \
+ idd = &(ill)->ill_dld_capab->idc_direct; \
+ /* \
+ * Send the packet directly to DLD, where it \
+ * may be queued depending on the availability \
+ * of transmit resources at the media layer. \
+ * The returned cookie is acted on below only \
+ * for UDP conns. For TCP it is still ignored; \
+ * in the future we may want to use it to flow \
+ * control TCP as well. \
+ */ \
+ cookie = idd->idd_tx_df(idd->idd_tx_dh, mp, \
+ (uintptr_t)(hint), flag); \
+ \
+ /* \
+ * A non-NULL cookie indicates a flow control \
+ * situation; the cookie identifies the specific \
+ * Tx ring that is blocked. It is used to block \
+ * the UDP conn that is sending packets over \
+ * that Tx ring. \
+ */ \
+ if ((cookie != NULL) && (udp_connp != NULL) && \
+ (udp_connp->conn_ulp == IPPROTO_UDP)) { \
+ idl_tx_list_t *idl_txl; \
+ ip_stack_t *ipst; \
+ \
+ /* \
+ * Flow controlled. \
+ */ \
+ DTRACE_PROBE2(ill__send__tx__cookie, \
+ uintptr_t, cookie, conn_t *, udp_connp); \
+ ipst = udp_connp->conn_netstack->netstack_ip; \
+ idl_txl = \
+ &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];\
+ mutex_enter(&idl_txl->txl_lock); \
+ if (udp_connp->conn_direct_blocked || \
+ (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, \
+ cookie) == 0)) { \
+ DTRACE_PROBE1(ill__tx__not__blocked, \
+ boolean, \
+ udp_connp->conn_direct_blocked); \
+ } else if (idl_txl->txl_cookie != NULL && \
+ idl_txl->txl_cookie != cookie) { \
+ udp_t *udp = udp_connp->conn_udp; \
+ udp_stack_t *us = udp->udp_us; \
+ \
+ DTRACE_PROBE2(ill__send__tx__collision, \
+ uintptr_t, cookie, \
+ uintptr_t, idl_txl->txl_cookie); \
+ UDP_STAT(us, udp_cookie_coll); \
+ } else { \
+ udp_connp->conn_direct_blocked = B_TRUE;\
+ idl_txl->txl_cookie = cookie; \
+ conn_drain_insert(udp_connp, idl_txl); \
+ DTRACE_PROBE1(ill__send__tx__insert, \
+ conn_t *, udp_connp); \
+ } \
+ mutex_exit(&idl_txl->txl_lock); \
+ } \
+ } else { \
+ putnext((ire)->ire_stq, mp); \
+ } \
}
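For reference, the call sites elsewhere in this diff show the intended use of the new sixth argument: TCP callers pass NULL (TCP is flow controlled by its own mechanisms), while the UDP and fragmentation paths pass the sending connp so a blocked ring can park it:

	ILL_SEND_TX(ill, ire, connp, mp, 0, NULL);		/* tcp_send_data() */
	ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0, connp);	/* ip_wput_frag() */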
#define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \
diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h
index 750378f587..d6f0b033ff 100644
--- a/usr/src/uts/common/inet/ip_stack.h
+++ b/usr/src/uts/common/inet/ip_stack.h
@@ -131,6 +131,9 @@ typedef struct ire_stats {
uint64_t ire_stats_deleted; /* # of ires deleted from the bucket */
} ire_stats_t;
+#define TX_FANOUT_SIZE 128
+#define IDLHASHINDEX(X) \
+ ((((uintptr_t)(X) >> 2) + ((uintptr_t)(X) >> 9)) & (TX_FANOUT_SIZE - 1))
/*
* IP stack instances
@@ -348,9 +351,9 @@ struct ip_stack {
kstat_t *ips_loopback_ksp;
- struct idl_s *ips_conn_drain_list; /* Array of conn drain lists */
+ /* Array of conn drain lists */
+ struct idl_tx_list_s *ips_idl_tx_list;
uint_t ips_conn_drain_list_cnt; /* Count of conn_drain_list */
- int ips_conn_drain_list_index; /* Next drain_list */
/*
* ID used to assign next free one.
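As an aside, a minimal sketch of how the new IDLHASHINDEX() macro folds a Tx cookie into the 128-entry fanout array; the cookie value here is hypothetical and purely for illustration:

	ip_mac_tx_cookie_t cookie;	/* opaque value from the mac layer */
	uint_t idx;

	cookie = (ip_mac_tx_cookie_t)0xd4c08a40;	/* hypothetical */
	idx = IDLHASHINDEX(cookie);	/* ((c >> 2) + (c >> 9)) & (128 - 1) */
	ASSERT(idx < TX_FANOUT_SIZE);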
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h
index 2ecc445a56..716689989f 100644
--- a/usr/src/uts/common/inet/ipclassifier.h
+++ b/usr/src/uts/common/inet/ipclassifier.h
@@ -245,6 +245,7 @@ struct conn_s {
unsigned int
conn_lso_ok : 1; /* LSO is usable */
+ boolean_t conn_direct_blocked; /* conn is flow-controlled */
squeue_t *conn_initial_sqp; /* Squeue at open time */
squeue_t *conn_final_sqp; /* Squeue after connect */
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 95b2551008..1b0df0f335 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -86,6 +86,7 @@
#include <inet/kstatcom.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
+#include <inet/udp_impl.h>
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/ipdrop.h>
@@ -19431,7 +19432,7 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst);
}
- ILL_SEND_TX(ill, ire, connp, mp, 0);
+ ILL_SEND_TX(ill, ire, connp, mp, 0, NULL);
}
IRE_REFRELE(ire);
@@ -21418,7 +21419,7 @@ tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss,
ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst);
}
- ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0);
+ ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0, NULL);
}
}
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index 30c876b45f..c4f7be3b93 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -5604,6 +5604,7 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid)
udp_stack_t *us = udp->udp_us;
ip_stack_t *ipst = connp->conn_netstack->netstack_ip;
boolean_t ll_multicast = B_FALSE;
+ boolean_t direct_send;
dev_q = ire->ire_stq->q_next;
ASSERT(dev_q != NULL);
@@ -5611,16 +5612,24 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid)
ill = ire_to_ill(ire);
ASSERT(ill != NULL);
+ /*
+ * For the direct send case, if the resetting of conn_direct_blocked
+ * was missed, it is still ok because putq() will enable the queue
+ * and the write service will drain it out.
+ */
+ direct_send = ILL_DIRECT_CAPABLE(ill);
+
/* is queue flow controlled? */
- if (q->q_first != NULL || connp->conn_draining ||
- DEV_Q_FLOW_BLOCKED(dev_q)) {
+ if ((!direct_send) && (q->q_first != NULL || connp->conn_draining ||
+ DEV_Q_FLOW_BLOCKED(dev_q))) {
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests);
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards);
-
- if (ipst->ips_ip_output_queue)
+ if (ipst->ips_ip_output_queue) {
+ DTRACE_PROBE1(udp__xmit__putq, conn_t *, connp);
(void) putq(connp->conn_wq, mp);
- else
+ } else {
freemsg(mp);
+ }
ire_refrele(ire);
return;
}
@@ -5718,20 +5727,60 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid)
ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst);
}
- if (mp != NULL) {
- DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
- void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill,
- ipha_t *, ipha, ip6_t *, NULL, int, 0);
+ if (mp == NULL)
+ goto bail;
- if (ILL_DIRECT_CAPABLE(ill)) {
- ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct;
+ DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL,
+ void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill,
+ ipha_t *, ipha, ip6_t *, NULL, int, 0);
- (void) idd->idd_tx_df(idd->idd_tx_dh, mp,
- (uintptr_t)connp, 0);
- } else {
- putnext(ire->ire_stq, mp);
+ if (direct_send) {
+ uintptr_t cookie;
+ ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct;
+
+ cookie = idd->idd_tx_df(idd->idd_tx_dh, mp,
+ (uintptr_t)connp, 0);
+ if (cookie != NULL) {
+ idl_tx_list_t *idl_txl;
+
+ /*
+ * Flow controlled.
+ */
+ DTRACE_PROBE2(non__null__cookie, uintptr_t,
+ cookie, conn_t *, connp);
+ idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];
+ mutex_enter(&idl_txl->txl_lock);
+ /*
+ * Check again after holding txl_lock to see if Tx
+ * ring is still blocked and only then insert the
+ * connp into the drain list.
+ */
+ if (connp->conn_direct_blocked ||
+ (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh,
+ cookie) == 0)) {
+ mutex_exit(&idl_txl->txl_lock);
+ goto bail;
+ }
+ if (idl_txl->txl_cookie != NULL &&
+ idl_txl->txl_cookie != cookie) {
+ DTRACE_PROBE2(udp__xmit__collision,
+ uintptr_t, cookie,
+ uintptr_t, idl_txl->txl_cookie);
+ UDP_STAT(us, udp_cookie_coll);
+ } else {
+ connp->conn_direct_blocked = B_TRUE;
+ idl_txl->txl_cookie = cookie;
+ conn_drain_insert(connp, idl_txl);
+ DTRACE_PROBE1(udp__xmit__insert,
+ conn_t *, connp);
+ }
+ mutex_exit(&idl_txl->txl_lock);
}
+ } else {
+ DTRACE_PROBE1(udp__xmit__putnext, mblk_t *, mp);
+ putnext(ire->ire_stq, mp);
}
+bail:
IRE_REFRELE(ire);
}
diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h
index 34b38e67bd..96f84e43bc 100644
--- a/usr/src/uts/common/inet/udp_impl.h
+++ b/usr/src/uts/common/inet/udp_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -192,10 +192,7 @@ typedef struct udp_stat { /* Class "net" kstats */
kstat_named_t udp_in_recvtclass;
kstat_named_t udp_in_timestamp;
kstat_named_t udp_ip_rcvpktinfo;
- kstat_named_t udp_direct_send;
- kstat_named_t udp_bwsq_send;
- kstat_named_t udp_connected_direct_send;
- kstat_named_t udp_connected_bwsq_send;
+ kstat_named_t udp_cookie_coll;
#ifdef DEBUG
kstat_named_t udp_data_conn;
kstat_named_t udp_data_notconn;