diff options
Diffstat (limited to 'usr/src/uts/common/inet')
| -rw-r--r-- | usr/src/uts/common/inet/ip.h | 49 | ||||
| -rw-r--r-- | usr/src/uts/common/inet/ip/ip.c | 283 | ||||
| -rw-r--r-- | usr/src/uts/common/inet/ip/ip6.c | 5 | ||||
| -rw-r--r-- | usr/src/uts/common/inet/ip/ip_if.c | 2 | ||||
| -rw-r--r-- | usr/src/uts/common/inet/ip_impl.h | 84 | ||||
| -rw-r--r-- | usr/src/uts/common/inet/ip_stack.h | 7 | ||||
| -rw-r--r-- | usr/src/uts/common/inet/ipclassifier.h | 1 | ||||
| -rw-r--r-- | usr/src/uts/common/inet/tcp/tcp.c | 5 | ||||
| -rw-r--r-- | usr/src/uts/common/inet/udp/udp.c | 79 | ||||
| -rw-r--r-- | usr/src/uts/common/inet/udp_impl.h | 7 |
10 files changed, 412 insertions, 110 deletions
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index ff820814bf..a18c3d0f4c 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -1128,7 +1128,7 @@ typedef struct iulp_s { extern const iulp_t ire_uinfo_null; /* - * The conn drain list structure. + * The conn drain list structure (idl_t). * The list is protected by idl_lock. Each conn_t inserted in the list * points back at this idl_t using conn_idl. IP primes the draining of the * conns queued in these lists, by qenabling the 1st conn of each list. This @@ -1137,8 +1137,27 @@ extern const iulp_t ire_uinfo_null; * idl_lock protects all other members of idl_t and conn_drain_next * and conn_drain_prev of conn_t. The conn_lock protects IPCF_DRAIN_DISABLED * flag of the conn_t and conn_idl. + * + * The conn drain list, idl_t, itself is part of tx cookie list structure. + * A tx cookie list points to a blocked Tx ring and contains the list of + * all conn's that are blocked due to the flow-controlled Tx ring (via + * the idl drain list). Note that a link can have multiple Tx rings. The + * drain list will store the conn's blocked due to Tx ring being flow + * controlled. 
*/ -typedef struct idl_s { + +typedef uintptr_t ip_mac_tx_cookie_t; +typedef struct idl_s idl_t; +typedef struct idl_tx_list_s idl_tx_list_t; + +struct idl_tx_list_s { + ip_mac_tx_cookie_t txl_cookie; + kmutex_t txl_lock; /* Lock for this list */ + idl_t *txl_drain_list; + int txl_drain_index; +}; + +struct idl_s { conn_t *idl_conn; /* Head of drain list */ kmutex_t idl_lock; /* Lock for this list */ conn_t *idl_conn_draining; /* conn that is draining */ @@ -1146,7 +1165,8 @@ typedef struct idl_s { idl_repeat : 1, /* Last conn must re-enable */ /* drain list again */ idl_unused : 31; -} idl_t; + idl_tx_list_t *idl_itl; +}; #define CONN_DRAIN_LIST_LOCK(connp) (&((connp)->conn_idl->idl_lock)) /* @@ -3336,8 +3356,8 @@ extern ill_t *ipmp_ipif_hold_bound_ill(const ipif_t *); extern boolean_t ipmp_ipif_is_dataaddr(const ipif_t *); extern boolean_t ipmp_ipif_is_stubaddr(const ipif_t *); -extern void conn_drain_insert(conn_t *connp); -extern int conn_ipsec_length(conn_t *connp); +extern void conn_drain_insert(conn_t *, idl_tx_list_t *); +extern int conn_ipsec_length(conn_t *); extern void ip_wput_ipsec_out(queue_t *, mblk_t *, ipha_t *, ill_t *, ire_t *); extern ipaddr_t ip_get_dst(ipha_t *); @@ -3587,13 +3607,16 @@ typedef enum { * we need to duplicate the definitions here because we cannot * include mac/dls header files here. 
*/ -typedef void *ip_mac_tx_cookie_t; -typedef void (*ip_mac_intr_disable_t)(void *); -typedef void (*ip_mac_intr_enable_t)(void *); -typedef void *(*ip_dld_tx_t)(void *, mblk_t *, uint64_t, uint16_t); -typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t); -typedef void *(*ip_dld_callb_t)(void *, ip_flow_enable_t, void *); -typedef int (*ip_capab_func_t)(void *, uint_t, void *, uint_t); +typedef void (*ip_mac_intr_disable_t)(void *); +typedef void (*ip_mac_intr_enable_t)(void *); +typedef ip_mac_tx_cookie_t (*ip_dld_tx_t)(void *, mblk_t *, + uint64_t, uint16_t); +typedef void (*ip_flow_enable_t)(void *, ip_mac_tx_cookie_t); +typedef void *(*ip_dld_callb_t)(void *, + ip_flow_enable_t, void *); +typedef boolean_t (*ip_dld_fctl_t)(void *, ip_mac_tx_cookie_t); +typedef int (*ip_capab_func_t)(void *, uint_t, + void *, uint_t); /* * POLLING README @@ -3640,6 +3663,8 @@ typedef struct ill_dld_direct_s { /* DLD provided driver Tx */ void *idd_tx_dh; /* dld_str_t *dsp */ ip_dld_callb_t idd_tx_cb_df; /* mac_tx_srs_notify */ void *idd_tx_cb_dh; /* mac_client_handle_t *mch */ + ip_dld_fctl_t idd_tx_fctl_df; /* mac_tx_is_flow_blocked */ + void *idd_tx_fctl_dh; /* mac_client_handle */ } ill_dld_direct_t; /* IP - DLD polling capability */ diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index b040d36c8a..116ae8ccec 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -451,29 +451,115 @@ void (*cl_inet_idlesa)(netstackid_t, uint8_t, uint32_t, sa_family_t, * policy change may affect them. * * IP Flow control notes: + * --------------------- + * Non-TCP streams are flow controlled by IP. The way this is accomplished + * differs when ILL_CAPAB_DLD_DIRECT is enabled for that IP instance. When + * ILL_DIRECT_CAPABLE(ill) is TRUE, IP can do direct function calls into + * GLDv3. Otherwise packets are sent down to lower layers using STREAMS + * functions. * - * Non-TCP streams are flow controlled by IP. 
On the send side, if the packet - * cannot be sent down to the driver by IP, because of a canput failure, IP - * does a putq on the conn_wq. This will cause ip_wsrv to run on the conn_wq. - * ip_wsrv in turn, inserts the conn in a list of conn's that need to be drained - * when the flowcontrol condition subsides. Ultimately STREAMS backenables the - * ip_wsrv on the IP module, which in turn does a qenable of the conn_wq of the - * first conn in the list of conn's to be drained. ip_wsrv on this conn drains - * the queued messages, and removes the conn from the drain list, if all - * messages were drained. It also qenables the next conn in the drain list to - * continue the drain process. + * Per Tx ring udp flow control: + * This is applicable only when ILL_CAPAB_DLD_DIRECT capability is set in + * the ill (i.e. ILL_DIRECT_CAPABLE(ill) is true). + * + * The underlying link can expose multiple Tx rings to the GLDv3 mac layer. + * To achieve best performance, outgoing traffic needs to be fanned out among + * these Tx rings. mac_tx() is called (via str_mdata_fastpath_put()) to send + * traffic out of the NIC and it takes a fanout hint. UDP connections pass + * the address of connp as fanout hint to mac_tx(). Under flow controlled + * condition, mac_tx() returns a non-NULL cookie (ip_mac_tx_cookie_t). This + * cookie points to a specific Tx ring that is blocked. The cookie is used to + * hash into an entry in the idl_tx_list[] array. Each idl_tx_list_t + * points to drain_lists (idl_t's). These drain lists will store the blocked UDP + * connp's. The drain list is not a single list but a configurable number of + * lists. + * + * The diagram below shows idl_tx_list_t's and their drain_lists. ip_stack_t + * has an array of idl_tx_list_t. The size of the array is TX_FANOUT_SIZE + * which is equal to 128. This array in turn contains a pointer to idl_t[], + * the ip drain list. The idl_t[] array size is MIN(max_ncpus, 8).
The drain + * list will point to the list of connp's that are flow controlled. + * + * --------------- ------- ------- ------- + * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|--> + * | --------------- ------- ------- ------- + * | --------------- ------- ------- ------- + * |->|drain_list[1]|-->|connp|-->|connp|-->|connp|--> + * ---------------- | --------------- ------- ------- ------- + * |idl_tx_list[0]|->| --------------- ------- ------- ------- + * ---------------- |->|drain_list[2]|-->|connp|-->|connp|-->|connp|--> + * | --------------- ------- ------- ------- + * . . . . . + * | --------------- ------- ------- ------- + * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|--> + * --------------- ------- ------- ------- + * --------------- ------- ------- ------- + * |->|drain_list[0]|-->|connp|-->|connp|-->|connp|--> + * | --------------- ------- ------- ------- + * | --------------- ------- ------- ------- + * ---------------- |->|drain_list[1]|-->|connp|-->|connp|-->|connp|--> + * |idl_tx_list[1]|->| --------------- ------- ------- ------- + * ---------------- | . . . . + * | --------------- ------- ------- ------- + * |->|drain_list[n]|-->|connp|-->|connp|-->|connp|--> + * --------------- ------- ------- ------- + * ..... + * ---------------- + * |idl_tx_list[n]|-> ... + * ---------------- + * + * When mac_tx() returns a cookie, the cookie is used to hash into an + * idl_tx_list in ips_idl_tx_list[] array. Then conn_drain_insert() is + * called passing idl_tx_list. The connp gets inserted in a drain list + * pointed to by idl_tx_list. conn_drain_insert() asserts flow control for + * the sockets (non stream based) and sets QFULL condition for conn_wq. + * connp->conn_direct_blocked will be set to indicate the blocked + * condition. + * + * GLDv3 mac layer calls ill_flow_enable() when flow control is relieved. + * A cookie is passed in the call to ill_flow_enable() that identifies the + * blocked Tx ring.
This cookie is used to get to the idl_tx_list that + * contains the blocked connp's. conn_walk_drain() uses the idl_tx_list_t + * and goes through each of the drain lists, (q)enabling the conn_wq of the + * first conn in each of the drain lists. This causes ip_wsrv to run for the + * conn. ip_wsrv drains the queued messages, and removes the conn from the + * drain list, if all messages were drained. It also qenables the next conn + * in the drain list to continue the drain process. * * In reality the drain list is not a single list, but a configurable number - * of lists. The ip_wsrv on the IP module, qenables the first conn in each - * list. If the ip_wsrv of the next qenabled conn does not run, because the + * of lists. conn_walk_drain() in the IP module, qenables the first conn in + * each list. If the ip_wsrv of the next qenabled conn does not run, because + * the stream closes, ip_close takes responsibility to qenable the next conn + * in the drain list. conn_drain_insert and conn_drain_tail are the only + * functions that manipulate this drain list. conn_drain_insert is called in + * ip_wput context itself (as opposed to from ip_wsrv context for STREAMS + * case -- see below). The synchronization between drain insertion and flow + * control wakeup is handled by using idl_txl->txl_lock. + * + * Flow control using STREAMS: + * When ILL_DIRECT_CAPABLE() is not TRUE, STREAMS flow control mechanism + * is used. On the send side, if the packet cannot be sent down to the + * driver by IP, because of a canput failure, IP does a putq on the conn_wq. + * This will cause ip_wsrv to run on the conn_wq. ip_wsrv in turn, inserts + * the conn in a list of conn's that need to be drained when the flow + * control condition subsides. The blocked connps are put in the first member + * of the ips_idl_tx_list[] array. Ultimately STREAMS backenables the ip_wsrv + * on the IP module. It calls conn_walk_drain() passing ips_idl_tx_list[0].
+ * ips_idl_tx_list[0] contains the drain lists of blocked conns. The + * conn_wq of the first conn in the drain lists is (q)enabled to run. + * ip_wsrv on this conn drains the queued messages, and removes the conn + * from the drain list, if all messages were drained. It also qenables the + * next conn in the drain list to continue the drain process. + * + * If the ip_wsrv of the next qenabled conn does not run, because the * stream closes, ip_close takes responsibility to qenable the next conn in * the drain list. The directly called ip_wput path always does a putq, if * it cannot putnext. Thus synchronization problems are handled between * ip_wsrv and ip_close. conn_drain_insert and conn_drain_tail are the only * functions that manipulate this drain list. Furthermore conn_drain_insert - * is called only from ip_wsrv, and there can be only 1 instance of ip_wsrv - * running on a queue at any time. conn_drain_tail can be simultaneously called - * from both ip_wsrv and ip_close. + * is called only from ip_wsrv for the STREAMS case, and there can be only 1 + * instance of ip_wsrv running on a queue at any time. conn_drain_tail can + * be simultaneously called from both ip_wsrv and ip_close. 
* * IPQOS notes: * @@ -732,9 +818,11 @@ static void conn_drain_init(ip_stack_t *); static void conn_drain_fini(ip_stack_t *); static void conn_drain_tail(conn_t *connp, boolean_t closing); -static void conn_walk_drain(ip_stack_t *); +static void conn_walk_drain(ip_stack_t *, idl_tx_list_t *); static void conn_walk_fanout_table(connf_t *, uint_t, pfv_t, void *, zoneid_t); +static void conn_setqfull(conn_t *); +static void conn_clrqfull(conn_t *); static void *ip_stack_init(netstackid_t stackid, netstack_t *ns); static void ip_stack_shutdown(netstackid_t stackid, void *arg); @@ -5372,6 +5460,7 @@ ip_modclose(ill_t *ill) ipif_t *ipif; queue_t *q = ill->ill_rq; ip_stack_t *ipst = ill->ill_ipst; + int i; /* * The punlink prior to this may have initiated a capability @@ -5463,7 +5552,9 @@ ip_modclose(ill_t *ill) * get unblocked. */ ip1dbg(("ip_wsrv: walking\n")); - conn_walk_drain(ipst); + for (i = 0; i < TX_FANOUT_SIZE; i++) { + conn_walk_drain(ipst, &ipst->ips_idl_tx_list[i]); + } mutex_enter(&ipst->ips_ip_mi_lock); mi_close_unlink(&ipst->ips_ip_g_head, (IDP)ill); @@ -13908,8 +13999,7 @@ ip_fast_forward(ire_t *ire, ipaddr_t dst, ill_t *ill, mblk_t *mp) ipobs_hook(mp, IPOBS_HOOK_OUTBOUND, szone, ALL_ZONES, ill, IPV4_VERSION, hlen, ipst); } - - ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC); + ILL_SEND_TX(stq_ill, ire, dst, mp, IP_DROP_ON_NO_DESC, NULL); } return (ire); @@ -22341,8 +22431,13 @@ another:; if (!IP_FLOW_CONTROLLED_ULP(PROTO)) { queue_t *dev_q = stq->q_next; - /* flow controlled */ - if (DEV_Q_FLOW_BLOCKED(dev_q)) + /* + * For DIRECT_CAPABLE, we do flow control at + * the time of sending the packet. See + * ILL_SEND_TX(). 
+ */ + if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) && + (DEV_Q_FLOW_BLOCKED(dev_q))) goto blocked; if ((PROTO == IPPROTO_UDP) && @@ -22765,7 +22860,8 @@ broadcast: } else { queue_t *dev_q = stq->q_next; - if (DEV_Q_FLOW_BLOCKED(dev_q)) { + if (!ILL_DIRECT_CAPABLE((ill_t *)stq->q_ptr) && + (DEV_Q_FLOW_BLOCKED(dev_q))) { blocked: ipha->ipha_ident = ip_hdr_included; /* @@ -22780,10 +22876,15 @@ blocked: connp != NULL && caller != IRE_SEND) { if (caller == IP_WSRV) { + idl_tx_list_t *idl_txl; + + idl_txl = + &ipst->ips_idl_tx_list[0]; connp->conn_did_putbq = 1; (void) putbq(connp->conn_wq, first_mp); - conn_drain_insert(connp); + conn_drain_insert(connp, + idl_txl); /* * This is the service thread, * and the queue is already @@ -24401,7 +24502,7 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, void_ip_t *, ipha, __dtrace_ipsr_ill_t *, out_ill, ipha_t *, ipha, ip6_t *, NULL, int, 0); - ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0); + ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0, connp); BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits); UPDATE_MIB(out_ill->ill_ip_mib, @@ -24708,7 +24809,8 @@ ip_wput_frag(ire_t *ire, mblk_t *mp_orig, ip_pkt_t pkt_type, uint32_t max_frag, __dtrace_ipsr_ill_t *, out_ill, ipha_t *, ipha, ip6_t *, NULL, int, 0); - ILL_SEND_TX(out_ill, ire, connp, xmit_mp, 0); + ILL_SEND_TX(out_ill, ire, connp, + xmit_mp, 0, connp); BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsHCOutTransmits); @@ -27921,7 +28023,8 @@ bad_src_route: static void conn_drain_init(ip_stack_t *ipst) { - int i; + int i, j; + idl_tx_list_t *itl_tx; ipst->ips_conn_drain_list_cnt = conn_drain_nthreads; @@ -27937,12 +28040,19 @@ conn_drain_init(ip_stack_t *ipst) ipst->ips_conn_drain_list_cnt = MIN(max_ncpus, 8); } - ipst->ips_conn_drain_list = kmem_zalloc(ipst->ips_conn_drain_list_cnt * - sizeof (idl_t), KM_SLEEP); - - for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) { - mutex_init(&ipst->ips_conn_drain_list[i].idl_lock, NULL, - MUTEX_DEFAULT, NULL); + 
ipst->ips_idl_tx_list = + kmem_zalloc(TX_FANOUT_SIZE * sizeof (idl_tx_list_t), KM_SLEEP); + for (i = 0; i < TX_FANOUT_SIZE; i++) { + itl_tx = &ipst->ips_idl_tx_list[i]; + itl_tx->txl_drain_list = + kmem_zalloc(ipst->ips_conn_drain_list_cnt * + sizeof (idl_t), KM_SLEEP); + mutex_init(&itl_tx->txl_lock, NULL, MUTEX_DEFAULT, NULL); + for (j = 0; j < ipst->ips_conn_drain_list_cnt; j++) { + mutex_init(&itl_tx->txl_drain_list[j].idl_lock, NULL, + MUTEX_DEFAULT, NULL); + itl_tx->txl_drain_list[j].idl_itl = itl_tx; + } } } @@ -27950,12 +28060,16 @@ static void conn_drain_fini(ip_stack_t *ipst) { int i; + idl_tx_list_t *itl_tx; - for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) - mutex_destroy(&ipst->ips_conn_drain_list[i].idl_lock); - kmem_free(ipst->ips_conn_drain_list, - ipst->ips_conn_drain_list_cnt * sizeof (idl_t)); - ipst->ips_conn_drain_list = NULL; + for (i = 0; i < TX_FANOUT_SIZE; i++) { + itl_tx = &ipst->ips_idl_tx_list[i]; + kmem_free(itl_tx->txl_drain_list, + ipst->ips_conn_drain_list_cnt * sizeof (idl_t)); + } + kmem_free(ipst->ips_idl_tx_list, + TX_FANOUT_SIZE * sizeof (idl_tx_list_t)); + ipst->ips_idl_tx_list = NULL; } /* @@ -27968,16 +28082,11 @@ conn_drain_fini(ip_stack_t *ipst) * the first conn in each of these drain lists. Each of these qenabled conns * in turn enables the next in the list, after it runs, or when it closes, * thus sustaining the drain process. - * - * The only possible calling sequence is ip_wsrv (on conn) -> ip_wput -> - * conn_drain_insert. Thus there can be only 1 instance of conn_drain_insert - * running at any time, on a given conn, since there can be only 1 service proc - * running on a queue at any time. 
*/ void -conn_drain_insert(conn_t *connp) +conn_drain_insert(conn_t *connp, idl_tx_list_t *tx_list) { - idl_t *idl; + idl_t *idl = tx_list->txl_drain_list; uint_t index; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; @@ -27996,13 +28105,13 @@ conn_drain_insert(conn_t *connp) * Atomicity of load/stores is enough to make sure that * conn_drain_list_index is always within bounds. */ - index = ipst->ips_conn_drain_list_index; + index = tx_list->txl_drain_index; ASSERT(index < ipst->ips_conn_drain_list_cnt); - connp->conn_idl = &ipst->ips_conn_drain_list[index]; + connp->conn_idl = &tx_list->txl_drain_list[index]; index++; if (index == ipst->ips_conn_drain_list_cnt) index = 0; - ipst->ips_conn_drain_list_index = index; + tx_list->txl_drain_index = index; } mutex_exit(&connp->conn_lock); @@ -28044,8 +28153,12 @@ conn_drain_insert(conn_t *connp) * For non streams based sockets assert flow control. */ if (IPCL_IS_NONSTR(connp)) { + DTRACE_PROBE1(su__txq__full, conn_t *, connp); (*connp->conn_upcalls->su_txq_full) (connp->conn_upper_handle, B_TRUE); + } else { + conn_setqfull(connp); + noenable(connp->conn_wq); } mutex_exit(CONN_DRAIN_LIST_LOCK(connp)); } @@ -28167,6 +28280,9 @@ conn_drain_tail(conn_t *connp, boolean_t closing) if (IPCL_IS_NONSTR(connp)) { (*connp->conn_upcalls->su_txq_full) (connp->conn_upper_handle, B_FALSE); + } else { + conn_clrqfull(connp); + enableok(connp->conn_wq); } } @@ -28194,6 +28310,8 @@ ip_wsrv(queue_t *q) if (q->q_next) { ill = (ill_t *)q->q_ptr; if (ill->ill_state_flags == 0) { + ip_stack_t *ipst = ill->ill_ipst; + /* * The device flow control has opened up. * Walk through conn drain lists and qenable the @@ -28202,7 +28320,7 @@ ip_wsrv(queue_t *q) * Hence the if check above. */ ip1dbg(("ip_wsrv: walking\n")); - conn_walk_drain(ill->ill_ipst); + conn_walk_drain(ipst, &ipst->ips_idl_tx_list[0]); } return; } @@ -28229,12 +28347,14 @@ ip_wsrv(queue_t *q) * (causing an infinite loop). 
*/ ASSERT(!connp->conn_did_putbq); + while ((q->q_first != NULL) && !connp->conn_did_putbq) { connp->conn_draining = 1; noenable(q); while ((mp = getq(q)) != NULL) { ASSERT(CONN_Q(q)); + DTRACE_PROBE1(ip__wsrv__ip__output, conn_t *, connp); ip_output(Q_TO_CONN(q), mp, q, IP_WSRV); if (connp->conn_did_putbq) { /* ip_wput did a putbq */ @@ -28253,12 +28373,23 @@ ip_wsrv(queue_t *q) */ connp->conn_draining = 0; enableok(q); - } /* Enable the next conn for draining */ conn_drain_tail(connp, B_FALSE); + /* + * conn_direct_blocked is used to indicate blocked + * condition for direct path (ILL_DIRECT_CAPABLE()). + * This is the only place where it is set without + * checking for ILL_DIRECT_CAPABLE() and setting it + * to 0 is ok even if it is not ILL_DIRECT_CAPABLE(). + */ + if (!connp->conn_did_putbq && connp->conn_direct_blocked) { + DTRACE_PROBE1(ip__wsrv__direct__blocked, conn_t *, connp); + connp->conn_direct_blocked = B_FALSE; + } + connp->conn_did_putbq = 0; } @@ -28274,11 +28405,18 @@ ip_wsrv(queue_t *q) * function and wakes up corresponding mac worker threads, which in turn * calls this callback function, and disables flow control. */ -/* ARGSUSED */ void -ill_flow_enable(void *ill, ip_mac_tx_cookie_t cookie) +ill_flow_enable(void *arg, ip_mac_tx_cookie_t cookie) { - qenable(((ill_t *)ill)->ill_wq); + ill_t *ill = (ill_t *)arg; + ip_stack_t *ipst = ill->ill_ipst; + idl_tx_list_t *idl_txl; + + idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)]; + mutex_enter(&idl_txl->txl_lock); + /* add code to to set a flag to indicate idl_txl is enabled */ + conn_walk_drain(ipst, idl_txl); + mutex_exit(&idl_txl->txl_lock); } /* @@ -28315,7 +28453,7 @@ conn_walk_fanout(pfv_t func, void *arg, zoneid_t zoneid, ip_stack_t *ipst) * in turn qenable the next conn, when it is done/blocked/closing. 
*/ static void -conn_walk_drain(ip_stack_t *ipst) +conn_walk_drain(ip_stack_t *ipst, idl_tx_list_t *tx_list) { int i; idl_t *idl; @@ -28323,7 +28461,7 @@ conn_walk_drain(ip_stack_t *ipst) IP_STAT(ipst, ip_conn_walk_drain); for (i = 0; i < ipst->ips_conn_drain_list_cnt; i++) { - idl = &ipst->ips_conn_drain_list[i]; + idl = &tx_list->txl_drain_list[i]; mutex_enter(&idl->idl_lock); if (idl->idl_conn == NULL) { mutex_exit(&idl->idl_lock); @@ -28521,6 +28659,41 @@ conn_wantpacket(conn_t *connp, ill_t *ill, ipha_t *ipha, int fanout_flags, return (found); } +static void +conn_setqfull(conn_t *connp) +{ + queue_t *q = connp->conn_wq; + + if (!(q->q_flag & QFULL)) { + mutex_enter(QLOCK(q)); + if (!(q->q_flag & QFULL)) { + /* still need to set QFULL */ + q->q_flag |= QFULL; + mutex_exit(QLOCK(q)); + } else { + mutex_exit(QLOCK(q)); + } + } +} + +static void +conn_clrqfull(conn_t *connp) +{ + queue_t *q = connp->conn_wq; + + if (q->q_flag & QFULL) { + mutex_enter(QLOCK(q)); + if (q->q_flag & QFULL) { + q->q_flag &= ~QFULL; + mutex_exit(QLOCK(q)); + if (q->q_flag & QWANTW) + qbackenable(q, 0); + } else { + mutex_exit(QLOCK(q)); + } + } +} + /* * Finish processing of "arp_up" when AR_DLPIOP_DONE is received from arp. 
*/ @@ -29666,7 +29839,7 @@ ip_xmit_v4(mblk_t *mp, ire_t *ire, ipsec_out_t *io, 0); ILL_SEND_TX(out_ill, - ire, connp, first_mp, 0); + ire, connp, first_mp, 0, connp); } else { BUMP_MIB(out_ill->ill_ip_mib, ipIfStatsOutDiscards); diff --git a/usr/src/uts/common/inet/ip/ip6.c b/usr/src/uts/common/inet/ip/ip6.c index 686e2ad94e..fe10ea8110 100644 --- a/usr/src/uts/common/inet/ip/ip6.c +++ b/usr/src/uts/common/inet/ip/ip6.c @@ -10807,9 +10807,12 @@ ip_wput_ire_v6(queue_t *q, mblk_t *mp, ire_t *ire, int unspec_src, if (ipst->ips_ip_output_queue && connp != NULL && !mctl_present && caller != IRE_SEND) { if (caller == IP_WSRV) { + idl_tx_list_t *idl_txl; + + idl_txl = &ipst->ips_idl_tx_list[0]; connp->conn_did_putbq = 1; (void) putbq(connp->conn_wq, mp); - conn_drain_insert(connp); + conn_drain_insert(connp, idl_txl); /* * caller == IP_WSRV implies we are * the service thread, and the diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index 64f9789fe9..3628dd4f56 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -3083,6 +3083,8 @@ ill_capability_direct_enable(ill_t *ill) idd->idd_tx_dh = direct.di_tx_dh; idd->idd_tx_cb_df = (ip_dld_callb_t)direct.di_tx_cb_df; idd->idd_tx_cb_dh = direct.di_tx_cb_dh; + idd->idd_tx_fctl_df = (ip_dld_fctl_t)direct.di_tx_fctl_df; + idd->idd_tx_fctl_dh = direct.di_tx_fctl_dh; /* * One time registration of flow enable callback function */ diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index 369ba60005..7711a2fedc 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -503,24 +503,72 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; #define ILL_DIRECT_CAPABLE(ill) \ (((ill)->ill_capabilities & ILL_CAPAB_DLD_DIRECT) != 0) -#define ILL_SEND_TX(ill, ire, hint, mp, flag) { \ - if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) { \ - ill_dld_direct_t *idd; \ - \ - idd = 
&(ill)->ill_dld_capab->idc_direct; \ - /* \ - * Send the packet directly to DLD, where it \ - * may be queued depending on the availability \ - * of transmit resources at the media layer. \ - * Ignore the returned value for the time being \ - * In future, we may want to take this into \ - * account and flow control the TCP. \ - */ \ - (void) idd->idd_tx_df(idd->idd_tx_dh, mp, \ - (uintptr_t)(hint), flag); \ - } else { \ - putnext((ire)->ire_stq, mp); \ - } \ +#define ILL_SEND_TX(ill, ire, hint, mp, flag, connp) { \ + if (ILL_DIRECT_CAPABLE(ill) && DB_TYPE(mp) == M_DATA) { \ + ill_dld_direct_t *idd; \ + uintptr_t cookie; \ + conn_t *udp_connp = (conn_t *)connp; \ + \ + idd = &(ill)->ill_dld_capab->idc_direct; \ + /* \ + * Send the packet directly to DLD, where it \ + * may be queued depending on the availability \ + * of transmit resources at the media layer. \ + * Ignore the returned value for the time being \ + * In future, we may want to take this into \ + * account and flow control the TCP. \ + */ \ + cookie = idd->idd_tx_df(idd->idd_tx_dh, mp, \ + (uintptr_t)(hint), flag); \ + \ + /* \ + * non-NULL cookie indicates flow control situation \ + * and the cookie itself identifies this specific \ + * Tx ring that is blocked. This cookie is used to \ + * block the UDP conn that is sending packets over \ + * this specific Tx ring. \ + */ \ + if ((cookie != NULL) && (udp_connp != NULL) && \ + (udp_connp->conn_ulp == IPPROTO_UDP)) { \ + idl_tx_list_t *idl_txl; \ + ip_stack_t *ipst; \ + \ + /* \ + * Flow controlled. 
\ + */ \ + DTRACE_PROBE2(ill__send__tx__cookie, \ + uintptr_t, cookie, conn_t *, udp_connp); \ + ipst = udp_connp->conn_netstack->netstack_ip; \ + idl_txl = \ + &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)];\ + mutex_enter(&idl_txl->txl_lock); \ + if (udp_connp->conn_direct_blocked || \ + (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, \ + cookie) == 0)) { \ + DTRACE_PROBE1(ill__tx__not__blocked, \ + boolean, \ + udp_connp->conn_direct_blocked); \ + } else if (idl_txl->txl_cookie != NULL && \ + idl_txl->txl_cookie != cookie) { \ + udp_t *udp = udp_connp->conn_udp; \ + udp_stack_t *us = udp->udp_us; \ + \ + DTRACE_PROBE2(ill__send__tx__collision, \ + uintptr_t, cookie, \ + uintptr_t, idl_txl->txl_cookie); \ + UDP_STAT(us, udp_cookie_coll); \ + } else { \ + udp_connp->conn_direct_blocked = B_TRUE;\ + idl_txl->txl_cookie = cookie; \ + conn_drain_insert(udp_connp, idl_txl); \ + DTRACE_PROBE1(ill__send__tx__insert, \ + conn_t *, udp_connp); \ + } \ + mutex_exit(&idl_txl->txl_lock); \ + } \ + } else { \ + putnext((ire)->ire_stq, mp); \ + } \ } #define MBLK_RX_FANOUT_SLOWPATH(mp, ipha) \ diff --git a/usr/src/uts/common/inet/ip_stack.h b/usr/src/uts/common/inet/ip_stack.h index 750378f587..d6f0b033ff 100644 --- a/usr/src/uts/common/inet/ip_stack.h +++ b/usr/src/uts/common/inet/ip_stack.h @@ -131,6 +131,9 @@ typedef struct ire_stats { uint64_t ire_stats_deleted; /* # of ires deleted from the bucket */ } ire_stats_t; +#define TX_FANOUT_SIZE 128 +#define IDLHASHINDEX(X) \ + ((((uintptr_t)(X) >> 2) + ((uintptr_t)(X) >> 9)) & (TX_FANOUT_SIZE - 1)) /* * IP stack instances @@ -348,9 +351,9 @@ struct ip_stack { kstat_t *ips_loopback_ksp; - struct idl_s *ips_conn_drain_list; /* Array of conn drain lists */ + /* Array of conn drain lists */ + struct idl_tx_list_s *ips_idl_tx_list; uint_t ips_conn_drain_list_cnt; /* Count of conn_drain_list */ - int ips_conn_drain_list_index; /* Next drain_list */ /* * ID used to assign next free one. 
diff --git a/usr/src/uts/common/inet/ipclassifier.h b/usr/src/uts/common/inet/ipclassifier.h index 2ecc445a56..716689989f 100644 --- a/usr/src/uts/common/inet/ipclassifier.h +++ b/usr/src/uts/common/inet/ipclassifier.h @@ -245,6 +245,7 @@ struct conn_s { unsigned int conn_lso_ok : 1; /* LSO is usable */ + boolean_t conn_direct_blocked; /* conn is flow-controlled */ squeue_t *conn_initial_sqp; /* Squeue at open time */ squeue_t *conn_final_sqp; /* Squeue after connect */ diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 95b2551008..1b0df0f335 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -86,6 +86,7 @@ #include <inet/kstatcom.h> #include <inet/tcp.h> #include <inet/tcp_impl.h> +#include <inet/udp_impl.h> #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> #include <inet/ipdrop.h> @@ -19431,7 +19432,7 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst); } - ILL_SEND_TX(ill, ire, connp, mp, 0); + ILL_SEND_TX(ill, ire, connp, mp, 0, NULL); } IRE_REFRELE(ire); @@ -21418,7 +21419,7 @@ tcp_lsosend_data(tcp_t *tcp, mblk_t *mp, ire_t *ire, ill_t *ill, const int mss, ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst); } - ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0); + ILL_SEND_TX(ill, ire, tcp->tcp_connp, mp, 0, NULL); } } diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index 30c876b45f..c4f7be3b93 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -5604,6 +5604,7 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) udp_stack_t *us = udp->udp_us; ip_stack_t *ipst = connp->conn_netstack->netstack_ip; boolean_t ll_multicast = B_FALSE; + boolean_t direct_send; dev_q = ire->ire_stq->q_next; ASSERT(dev_q != NULL); @@ -5611,16 +5612,24 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) ill = ire_to_ill(ire); 
ASSERT(ill != NULL); + /* + * For the direct send case, if resetting of conn_direct_blocked + * was missed, it is still ok because the putq() would enable + * the queue and write service will drain it out. + */ + direct_send = ILL_DIRECT_CAPABLE(ill); + /* is queue flow controlled? */ - if (q->q_first != NULL || connp->conn_draining || - DEV_Q_FLOW_BLOCKED(dev_q)) { + if ((!direct_send) && (q->q_first != NULL || connp->conn_draining || + DEV_Q_FLOW_BLOCKED(dev_q))) { BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsHCOutRequests); BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsOutDiscards); - - if (ipst->ips_ip_output_queue) + if (ipst->ips_ip_output_queue) { + DTRACE_PROBE1(udp__xmit__putq, conn_t *, connp); (void) putq(connp->conn_wq, mp); - else + } else { freemsg(mp); + } ire_refrele(ire); return; } @@ -5718,20 +5727,60 @@ udp_xmit(queue_t *q, mblk_t *mp, ire_t *ire, conn_t *connp, zoneid_t zoneid) ALL_ZONES, ill, IPV4_VERSION, ire_fp_mp_len, ipst); } - if (mp != NULL) { - DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, - void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, - ipha_t *, ipha, ip6_t *, NULL, int, 0); + if (mp == NULL) + goto bail; - if (ILL_DIRECT_CAPABLE(ill)) { - ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct; + DTRACE_IP7(send, mblk_t *, mp, conn_t *, NULL, + void_ip_t *, ipha, __dtrace_ipsr_ill_t *, ill, + ipha_t *, ipha, ip6_t *, NULL, int, 0); - (void) idd->idd_tx_df(idd->idd_tx_dh, mp, - (uintptr_t)connp, 0); - } else { - putnext(ire->ire_stq, mp); + if (direct_send) { + uintptr_t cookie; + ill_dld_direct_t *idd = &ill->ill_dld_capab->idc_direct; + + cookie = idd->idd_tx_df(idd->idd_tx_dh, mp, + (uintptr_t)connp, 0); + if (cookie != NULL) { + idl_tx_list_t *idl_txl; + + /* + * Flow controlled. 
+ */ + DTRACE_PROBE2(non__null__cookie, uintptr_t, + cookie, conn_t *, connp); + idl_txl = &ipst->ips_idl_tx_list[IDLHASHINDEX(cookie)]; + mutex_enter(&idl_txl->txl_lock); + /* + * Check again after holding txl_lock to see if Tx + * ring is still blocked and only then insert the + * connp into the drain list. + */ + if (connp->conn_direct_blocked || + (idd->idd_tx_fctl_df(idd->idd_tx_fctl_dh, + cookie) == 0)) { + mutex_exit(&idl_txl->txl_lock); + goto bail; + } + if (idl_txl->txl_cookie != NULL && + idl_txl->txl_cookie != cookie) { + DTRACE_PROBE2(udp__xmit__collision, + uintptr_t, cookie, + uintptr_t, idl_txl->txl_cookie); + UDP_STAT(us, udp_cookie_coll); + } else { + connp->conn_direct_blocked = B_TRUE; + idl_txl->txl_cookie = cookie; + conn_drain_insert(connp, idl_txl); + DTRACE_PROBE1(udp__xmit__insert, + conn_t *, connp); + } + mutex_exit(&idl_txl->txl_lock); } + } else { + DTRACE_PROBE1(udp__xmit__putnext, mblk_t *, mp); + putnext(ire->ire_stq, mp); } +bail: IRE_REFRELE(ire); } diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h index 34b38e67bd..96f84e43bc 100644 --- a/usr/src/uts/common/inet/udp_impl.h +++ b/usr/src/uts/common/inet/udp_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -192,10 +192,7 @@ typedef struct udp_stat { /* Class "net" kstats */ kstat_named_t udp_in_recvtclass; kstat_named_t udp_in_timestamp; kstat_named_t udp_ip_rcvpktinfo; - kstat_named_t udp_direct_send; - kstat_named_t udp_bwsq_send; - kstat_named_t udp_connected_direct_send; - kstat_named_t udp_connected_bwsq_send; + kstat_named_t udp_cookie_coll; #ifdef DEBUG kstat_named_t udp_data_conn; kstat_named_t udp_data_notconn; |
