| field | value | date |
|---|---|---|
| author | krgopi <none@none> | 2005-12-23 10:29:12 -0800 |
| committer | krgopi <none@none> | 2005-12-23 10:29:12 -0800 |
| commit | 4b46d1ef625bf17cc3dd4b14b9ad807be97dc558 | |
| tree | f7a47bdc523d89874ab78528527c14b8bd0aaef9 /usr/src | |
| parent | 5805a1baa6594684bbf7d7fa108cea093396ea31 | |
| download | illumos-gate-4b46d1ef625bf17cc3dd4b14b9ad807be97dc558.tar.gz | |
PSARC 2005/654 Nemo soft rings
6306717 For Nemo based drivers, IP can ask dls to do the fanout
Diffstat (limited to 'usr/src')
| mode | file | lines changed |
|---|---|---|
| -rw-r--r-- | usr/src/uts/common/Makefile.files | 2 |
| -rw-r--r-- | usr/src/uts/common/inet/ip.h | 47 |
| -rw-r--r-- | usr/src/uts/common/inet/ip/ip.c | 11 |
| -rw-r--r-- | usr/src/uts/common/inet/ip/ip_if.c | 357 |
| -rw-r--r-- | usr/src/uts/common/inet/ip/ip_squeue.c | 437 |
| -rw-r--r-- | usr/src/uts/common/inet/ip_impl.h | 19 |
| -rw-r--r-- | usr/src/uts/common/inet/tcp/tcp.c | 4 |
| -rw-r--r-- | usr/src/uts/common/inet/udp/udp.c | 4 |
| -rw-r--r-- | usr/src/uts/common/io/dld/dld_proto.c | 256 |
| -rw-r--r-- | usr/src/uts/common/io/dld/dld_str.c | 7 |
| -rw-r--r-- | usr/src/uts/common/io/dls/dls.c | 40 |
| -rw-r--r-- | usr/src/uts/common/io/dls/dls_soft_ring.c | 667 |
| -rw-r--r-- | usr/src/uts/common/io/dls/dls_stat.c | 4 |
| -rw-r--r-- | usr/src/uts/common/io/dls/dls_vlan.c | 4 |
| -rw-r--r-- | usr/src/uts/common/io/mac/mac.c | 5 |
| -rw-r--r-- | usr/src/uts/common/os/space.c | 20 |
| -rw-r--r-- | usr/src/uts/common/sys/dld_impl.h | 3 |
| -rw-r--r-- | usr/src/uts/common/sys/dlpi.h | 60 |
| -rw-r--r-- | usr/src/uts/common/sys/dls_impl.h | 10 |
| -rw-r--r-- | usr/src/uts/common/sys/dls_soft_ring.h | 99 |
| -rw-r--r-- | usr/src/uts/sun4v/os/mach_startup.c | 15 |
21 files changed, 1779 insertions, 292 deletions
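The patch below threads a new DL_CAPAB_SOFT_RING capability through dld/dls and IP so that, for Nemo (GLDv3) drivers, dls can fan incoming packets out across several worker threads ("soft rings") before handing them to IP. As a rough, user-level illustration of the idea only (the names soft_ring_t, sr_enqueue() and sr_worker() are invented for this sketch and are not the kernel interfaces added by the patch), a soft ring is essentially a queue drained by a dedicated worker thread, with a hash over the packet selecting which ring, and therefore which thread and CPU, services it:

```c
/* Minimal user-level sketch of soft-ring style fanout (illustrative only). */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

typedef struct pkt {
	struct pkt	*next;
	unsigned	src_hash;	/* stands in for a header hash */
} pkt_t;

typedef struct soft_ring {
	pthread_mutex_t	lock;
	pthread_cond_t	async;
	pkt_t		*head, *tail;
	int		done;
	pthread_t	worker;
} soft_ring_t;

/* Worker: drain this ring's queue, processing packets in arrival order. */
static void *
sr_worker(void *arg)
{
	soft_ring_t *sr = arg;

	pthread_mutex_lock(&sr->lock);
	for (;;) {
		while (sr->head == NULL && !sr->done)
			pthread_cond_wait(&sr->async, &sr->lock);
		if (sr->head == NULL && sr->done)
			break;
		pkt_t *chain = sr->head;
		sr->head = sr->tail = NULL;
		pthread_mutex_unlock(&sr->lock);
		for (pkt_t *p = chain; p != NULL; ) {
			pkt_t *next = p->next;
			printf("ring %p consumed pkt hash %u\n",
			    (void *)sr, p->src_hash);
			free(p);
			p = next;
		}
		pthread_mutex_lock(&sr->lock);
	}
	pthread_mutex_unlock(&sr->lock);
	return (NULL);
}

/* Fanout: pick a ring from the packet hash, enqueue, and wake its worker. */
static void
sr_enqueue(soft_ring_t *rings, int nrings, pkt_t *p)
{
	soft_ring_t *sr = &rings[p->src_hash % nrings];

	pthread_mutex_lock(&sr->lock);
	p->next = NULL;
	if (sr->tail != NULL)
		sr->tail->next = p;
	else
		sr->head = p;
	sr->tail = p;
	pthread_cond_signal(&sr->async);
	pthread_mutex_unlock(&sr->lock);
}

int
main(void)
{
	enum { NRINGS = 2, NPKTS = 8 };
	soft_ring_t rings[NRINGS];

	for (int i = 0; i < NRINGS; i++) {
		pthread_mutex_init(&rings[i].lock, NULL);
		pthread_cond_init(&rings[i].async, NULL);
		rings[i].head = rings[i].tail = NULL;
		rings[i].done = 0;
		pthread_create(&rings[i].worker, NULL, sr_worker, &rings[i]);
	}
	for (int i = 0; i < NPKTS; i++) {
		pkt_t *p = malloc(sizeof (*p));
		p->src_hash = (unsigned)rand();
		sr_enqueue(rings, NRINGS, p);
	}
	for (int i = 0; i < NRINGS; i++) {
		pthread_mutex_lock(&rings[i].lock);
		rings[i].done = 1;		/* let the worker drain and exit */
		pthread_cond_signal(&rings[i].async);
		pthread_mutex_unlock(&rings[i].lock);
		pthread_join(rings[i].worker, NULL);
	}
	return (0);
}
```

In the patch itself the corresponding pieces are dls_soft_ring.c (the ring and worker implementation), ip_soft_ring_assignment() and ip_squeue_soft_ring_affinity() in ip_squeue.c (binding rings to squeues and CPUs), and the dl_capab_dls_t capability negotiation in dld_proto.c and ip_if.c.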
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index 7be9d00998..9d21d041b5 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -494,7 +494,7 @@ CN_OBJS += cons.o DLD_OBJS += dld_drv.o dld_proto.o dld_str.o -DLS_OBJS += dls.o dls_link.o dls_mod.o dls_stat.o dls_vlan.o +DLS_OBJS += dls.o dls_link.o dls_mod.o dls_stat.o dls_vlan.o dls_soft_ring.o GLD_OBJS += gld.o gldutil.o diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h index f286253080..358e67d354 100644 --- a/usr/src/uts/common/inet/ip.h +++ b/usr/src/uts/common/inet/ip.h @@ -1593,6 +1593,7 @@ extern ill_g_head_t ill_g_heads[]; /* ILL List Head */ #define ILL_CAPAB_HCKSUM 0x08 /* Hardware checksumming */ #define ILL_CAPAB_ZEROCOPY 0x10 /* Zero-copy */ #define ILL_CAPAB_POLL 0x20 /* Polling Toggle */ +#define ILL_CAPAB_SOFT_RING 0x40 /* Soft_Ring capability */ /* * Per-ill Multidata Transmit capabilities. @@ -1615,9 +1616,9 @@ typedef struct ill_hcksum_capab_s ill_hcksum_capab_t; typedef struct ill_zerocopy_capab_s ill_zerocopy_capab_t; /* - * Per-ill Polling capbilities. + * Per-ill Polling/soft ring capbilities. */ -typedef struct ill_poll_capab_s ill_poll_capab_t; +typedef struct ill_dls_capab_s ill_dls_capab_t; /* * Per-ill polling resource map. @@ -1629,6 +1630,7 @@ typedef struct ill_rx_ring ill_rx_ring_t; #define ILL_CONDEMNED 0x02 /* No more new ref's to the ILL */ #define ILL_CHANGING 0x04 /* ILL not globally visible */ #define ILL_DL_UNBIND_DONE 0x08 /* UNBIND_REQ has been Acked */ +#define ILL_SOFT_RING_ASSIGN 0x10 /* Makeing soft ring assigment */ /* Is this an ILL whose source address is used by other ILL's ? */ #define IS_USESRC_ILL(ill) \ @@ -1775,7 +1777,7 @@ typedef struct ill_s { ill_ipsec_capab_t *ill_ipsec_capab_esp; /* IPsec ESP capabilities */ ill_hcksum_capab_t *ill_hcksum_capab; /* H/W cksumming capabilities */ ill_zerocopy_capab_t *ill_zerocopy_capab; /* Zero-copy capabilities */ - ill_poll_capab_t *ill_poll_capab; /* Polling capabilities */ + ill_dls_capab_t *ill_dls_capab; /* Polling, soft ring capabilities */ /* * New fields for IPv6 @@ -2962,11 +2964,16 @@ struct ill_zerocopy_capab_s { #define ILL_POLLING 0x01 /* Polling in use */ /* - * This function pointer type is exported by the mac layer. - * we need to duplicate the definition here because we cannot - * include mac.h in this file. + * These functions pointer types are exported by the mac/dls layer. + * we need to duplicate the definitions here because we cannot + * include mac/dls header files here. */ typedef void (*ip_mac_blank_t)(void *, time_t, uint_t); +typedef void (*ip_dld_tx_t)(void *, mblk_t *); + +typedef void (*ip_dls_chg_soft_ring_t)(void *, int); +typedef void (*ip_dls_bind_t)(void *, processorid_t); +typedef void (*ip_dls_unbind_t)(void *); struct ill_rx_ring { ip_mac_blank_t rr_blank; /* Driver interrupt blanking func */ @@ -2984,15 +2991,15 @@ struct ill_rx_ring { uint32_t rr_ring_state; /* State of this ring */ }; -/* - * This is exported by dld and is meant to be invoked from a ULP. 
- */ -typedef void (*ip_dld_tx_t)(void *, mblk_t *); - -struct ill_poll_capab_s { - ip_dld_tx_t ill_tx; /* dld-supplied tx routine */ - void *ill_tx_handle; /* dld-supplied tx handle */ +struct ill_dls_capab_s { + ip_dld_tx_t ill_tx; /* Driver Tx routine */ + void *ill_tx_handle; /* Driver Tx handle */ + ip_dls_chg_soft_ring_t ill_dls_change_status; + /* change soft ring fanout */ + ip_dls_bind_t ill_dls_bind; /* to add CPU affinity */ + ip_dls_unbind_t ill_dls_unbind; /* remove CPU affinity */ ill_rx_ring_t *ill_ring_tbl; /* Ring to Sqp mapping table */ + uint_t ill_dls_soft_ring_cnt; /* Number of soft ring */ conn_t *ill_unbind_conn; /* Conn used during unplumb */ }; @@ -3002,6 +3009,10 @@ struct ill_poll_capab_s { extern int ip_squeue_profile; extern int ip_squeue_bind; extern boolean_t ip_squeue_fanout; +extern boolean_t ip_squeue_soft_ring; +extern uint_t ip_threads_per_cpu; +extern uint_t ip_squeues_per_cpu; +extern uint_t ip_soft_rings_cnt; typedef struct squeue_set_s { kmutex_t sqs_lock; @@ -3012,10 +3023,8 @@ typedef struct squeue_set_s { } squeue_set_t; #define IP_SQUEUE_GET(hint) \ - (!ip_squeue_fanout ? \ - (CPU->cpu_squeue_set->sqs_list[hint % \ - CPU->cpu_squeue_set->sqs_size]) : \ - ip_squeue_random(hint)) + ((!ip_squeue_fanout) ? (CPU->cpu_squeue_set->sqs_list[0]) : \ + ip_squeue_random(hint)) typedef void (*squeue_func_t)(squeue_t *, mblk_t *, sqproc_t, void *, uint8_t); @@ -3027,6 +3036,8 @@ extern int ip_squeue_bind_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); extern int ip_squeue_bind_get(queue_t *, mblk_t *, caddr_t, cred_t *); extern void ip_squeue_clean(void *, mblk_t *, void *); extern void ip_resume_tcp_bind(void *, mblk_t *, void *); +extern void ip_soft_ring_assignment(ill_t *, ill_rx_ring_t *, + mblk_t *, size_t); extern void tcp_wput(queue_t *, mblk_t *); diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c index d587a746c5..33ac6bd126 100644 --- a/usr/src/uts/common/inet/ip/ip.c +++ b/usr/src/uts/common/inet/ip/ip.c @@ -743,7 +743,7 @@ extern int ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp, cred_t *cr); -static int ip_fanout_set(queue_t *, mblk_t *, char *, caddr_t, +static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *); static squeue_func_t ip_squeue_switch(int); @@ -941,10 +941,12 @@ static ipndp_t lcl_ndp_arr[] = { (caddr_t)&ip_squeue_bind, "ip_squeue_bind" }, { ip_param_generic_get, ip_input_proc_set, (caddr_t)&ip_squeue_enter, "ip_squeue_enter" }, - { ip_param_generic_get, ip_fanout_set, + { ip_param_generic_get, ip_int_set, (caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" }, { ip_cgtp_filter_get, ip_cgtp_filter_set, (caddr_t)&ip_cgtp_filter, - "ip_cgtp_filter" } + "ip_cgtp_filter" }, + { ip_param_generic_get, ip_int_set, + (caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" } }; /* @@ -25996,7 +25998,7 @@ ip_input_proc_set(queue_t *q, mblk_t *mp, char *value, /* ARGSUSED */ static int -ip_fanout_set(queue_t *q, mblk_t *mp, char *value, +ip_int_set(queue_t *q, mblk_t *mp, char *value, caddr_t addr, cred_t *cr) { int *v = (int *)addr; @@ -26009,7 +26011,6 @@ ip_fanout_set(queue_t *q, mblk_t *mp, char *value, return (0); } - static void ip_kstat_init(void) { diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c index d04760f02c..adc05133fb 100644 --- a/usr/src/uts/common/inet/ip/ip_if.c +++ b/usr/src/uts/common/inet/ip/ip_if.c @@ -235,9 +235,10 @@ static void 
ill_capability_zerocopy_ack(ill_t *, mblk_t *, dl_capability_sub_t *); static void ill_capability_zerocopy_reset(ill_t *, mblk_t **); -static void ill_capability_poll_ack(ill_t *, mblk_t *, dl_capability_sub_t *); +static void ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *); static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *); -static void ill_capability_poll_reset(ill_t *, mblk_t **); +static void ill_capability_dls_reset(ill_t *, mblk_t **); +static void ill_capability_dls_disable(ill_t *); static void illgrp_cache_delete(ire_t *, char *); static void illgrp_delete(ill_t *ill); @@ -560,6 +561,16 @@ static phyint_list_t phyint_g_list; /* start of phyint list */ */ static boolean_t ipmp_enable_failback = B_TRUE; +/* + * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout + * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is + * set through platform specific code (Niagara/Ontario). + */ +#define SOFT_RINGS_ENABLED() (ip_soft_rings_cnt ? \ + (ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE) + +#define ILL_CAPAB_DLS (ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL) + static uint_t ipif_rand(void) { @@ -770,7 +781,7 @@ ill_delete_tail(ill_t *ill) * to this ill. */ mutex_enter(&ill->ill_lock); - if (ill->ill_capabilities & ILL_CAPAB_POLL) { + if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) { while (!(ill->ill_state_flags & ILL_DL_UNBIND_DONE)) cv_wait(&ill->ill_cv, &ill->ill_lock); } @@ -820,18 +831,18 @@ ill_delete_tail(ill_t *ill) } /* - * Clean up polling capabilities + * Clean up polling and soft ring capabilities */ - if (ill->ill_capabilities & ILL_CAPAB_POLL) - ipsq_clean_all(ill); + if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) + ill_capability_dls_disable(ill); - if (ill->ill_poll_capab != NULL) { - CONN_DEC_REF(ill->ill_poll_capab->ill_unbind_conn); - ill->ill_poll_capab->ill_unbind_conn = NULL; - kmem_free(ill->ill_poll_capab, - sizeof (ill_poll_capab_t) + + if (ill->ill_dls_capab != NULL) { + CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn); + ill->ill_dls_capab->ill_unbind_conn = NULL; + kmem_free(ill->ill_dls_capab, + sizeof (ill_dls_capab_t) + (sizeof (ill_rx_ring_t) * ILL_MAX_RINGS)); - ill->ill_poll_capab = NULL; + ill->ill_dls_capab = NULL; } ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL)); @@ -1801,7 +1812,7 @@ ill_capability_reset(ill_t *ill) ill_capability_hcksum_reset(ill, &sc_mp); ill_capability_zerocopy_reset(ill, &sc_mp); ill_capability_ipsec_reset(ill, &sc_mp); - ill_capability_poll_reset(ill, &sc_mp); + ill_capability_dls_reset(ill, &sc_mp); /* Nothing to send down in order to disable the capabilities? 
*/ if (sc_mp == NULL) @@ -2627,7 +2638,12 @@ ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp, ill_capability_zerocopy_ack(ill, mp, subp); break; case DL_CAPAB_POLL: - ill_capability_poll_ack(ill, mp, subp); + if (!SOFT_RINGS_ENABLED()) + ill_capability_dls_ack(ill, mp, subp); + break; + case DL_CAPAB_SOFT_RING: + if (SOFT_RINGS_ENABLED()) + ill_capability_dls_ack(ill, mp, subp); break; default: ip1dbg(("ill_capability_dispatch: unknown capab type %d\n", @@ -2672,16 +2688,16 @@ ill_ring_add(void *arg, mac_resource_t *mrp) ill_rx_ring_t *rx_ring; int ip_rx_index; + ASSERT(mrp != NULL); if (mrp->mr_type != MAC_RX_FIFO) { return (NULL); } ASSERT(ill != NULL); - ASSERT(ill->ill_poll_capab != NULL); - ASSERT(mrp != NULL); + ASSERT(ill->ill_dls_capab != NULL); mutex_enter(&ill->ill_lock); for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) { - rx_ring = &ill->ill_poll_capab->ill_ring_tbl[ip_rx_index]; + rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index]; ASSERT(rx_ring != NULL); if (rx_ring->rr_ring_state == ILL_RING_FREE) { @@ -2732,107 +2748,129 @@ ill_ring_add(void *arg, mac_resource_t *mrp) } static boolean_t -ill_capability_poll_init(ill_t *ill) +ill_capability_dls_init(ill_t *ill) { - ill_poll_capab_t *ill_poll = ill->ill_poll_capab; + ill_dls_capab_t *ill_dls = ill->ill_dls_capab; conn_t *connp; size_t sz; - if (ill->ill_capabilities & ILL_CAPAB_POLL) { - if (ill_poll == NULL) { - cmn_err(CE_PANIC, "ill_capability_poll_init: " - "polling enabled for ill=%s (%p) but data " + if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { + if (ill_dls == NULL) { + cmn_err(CE_PANIC, "ill_capability_dls_init: " + "soft_ring enabled for ill=%s (%p) but data " "structs uninitialized\n", ill->ill_name, (void *)ill); } return (B_TRUE); + } else if (ill->ill_capabilities & ILL_CAPAB_POLL) { + if (ill_dls == NULL) { + cmn_err(CE_PANIC, "ill_capability_dls_init: " + "polling enabled for ill=%s (%p) but data " + "structs uninitialized\n", ill->ill_name, + (void *)ill); + } + return (B_TRUE); } - if (ill_poll != NULL) { - ill_rx_ring_t *rx_ring = ill_poll->ill_ring_tbl; - /* Polling is being re-enabled */ + if (ill_dls != NULL) { + ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl; + /* Soft_Ring or polling is being re-enabled */ - connp = ill_poll->ill_unbind_conn; + connp = ill_dls->ill_unbind_conn; ASSERT(rx_ring != NULL); - bzero((void *)ill_poll, sizeof (ill_poll_capab_t)); + bzero((void *)ill_dls, sizeof (ill_dls_capab_t)); bzero((void *)rx_ring, sizeof (ill_rx_ring_t) * ILL_MAX_RINGS); - ill_poll->ill_ring_tbl = rx_ring; - ill_poll->ill_unbind_conn = connp; + ill_dls->ill_ring_tbl = rx_ring; + ill_dls->ill_unbind_conn = connp; return (B_TRUE); } if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL) return (B_FALSE); - sz = sizeof (ill_poll_capab_t); + sz = sizeof (ill_dls_capab_t); sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS; - ill_poll = kmem_zalloc(sz, KM_NOSLEEP); - if (ill_poll == NULL) { - cmn_err(CE_WARN, "ill_capability_poll_init: could not " - "allocate poll_capab for %s (%p)\n", ill->ill_name, + ill_dls = kmem_zalloc(sz, KM_NOSLEEP); + if (ill_dls == NULL) { + cmn_err(CE_WARN, "ill_capability_dls_init: could not " + "allocate dls_capab for %s (%p)\n", ill->ill_name, (void *)ill); CONN_DEC_REF(connp); return (B_FALSE); } /* Allocate space to hold ring table */ - ill_poll->ill_ring_tbl = (ill_rx_ring_t *)&ill_poll[1]; - ill->ill_poll_capab = ill_poll; - ill_poll->ill_unbind_conn = connp; + ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1]; + 
ill->ill_dls_capab = ill_dls; + ill_dls->ill_unbind_conn = connp; return (B_TRUE); } /* - * ill_capability_poll_disable: disable polling capability. Since - * any of the rings might already be in use, need to call ipsq_clean_all() - * which gets behind the squeue to disable direct calls if necessary. - * Clean up the direct tx function pointers as well. + * ill_capability_dls_disable: disable soft_ring and/or polling + * capability. Since any of the rings might already be in use, need + * to call ipsq_clean_all() which gets behind the squeue to disable + * direct calls if necessary. */ static void -ill_capability_poll_disable(ill_t *ill) +ill_capability_dls_disable(ill_t *ill) { - ill_poll_capab_t *ill_poll = ill->ill_poll_capab; + ill_dls_capab_t *ill_dls = ill->ill_dls_capab; - if (ill->ill_capabilities & ILL_CAPAB_POLL) { + if (ill->ill_capabilities & ILL_CAPAB_DLS) { ipsq_clean_all(ill); - ill_poll->ill_tx = NULL; - ill_poll->ill_tx_handle = NULL; + ill_dls->ill_tx = NULL; + ill_dls->ill_tx_handle = NULL; + ill_dls->ill_dls_change_status = NULL; + ill_dls->ill_dls_bind = NULL; + ill_dls->ill_dls_unbind = NULL; } - ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL)); + ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS)); } static void -ill_capability_poll_capable(ill_t *ill, dl_capab_poll_t *ipoll, +ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls, dl_capability_sub_t *isub) { uint_t size; uchar_t *rptr; - dl_capab_poll_t poll, *opoll; - ill_poll_capab_t *ill_poll; + dl_capab_dls_t dls, *odls; + ill_dls_capab_t *ill_dls; mblk_t *nmp = NULL; dl_capability_req_t *ocap; + uint_t sub_dl_cap = isub->dl_cap; - if (!ill_capability_poll_init(ill)) + if (!ill_capability_dls_init(ill)) return; - ill_poll = ill->ill_poll_capab; + ill_dls = ill->ill_dls_capab; /* Copy locally to get the members aligned */ - bcopy((void *)ipoll, (void *)&poll, sizeof (dl_capab_poll_t)); + bcopy((void *)idls, (void *)&dls, + sizeof (dl_capab_dls_t)); /* Get the tx function and handle from dld */ - ill_poll->ill_tx = (ip_dld_tx_t)poll.poll_tx; - ill_poll->ill_tx_handle = (void *)poll.poll_tx_handle; + ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx; + ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle; + + if (sub_dl_cap == DL_CAPAB_SOFT_RING) { + ill_dls->ill_dls_change_status = + (ip_dls_chg_soft_ring_t)dls.dls_ring_change_status; + ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind; + ill_dls->ill_dls_unbind = + (ip_dls_unbind_t)dls.dls_ring_unbind; + ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt; + } size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) + isub->dl_length; if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) { - cmn_err(CE_WARN, "ill_capability_poll_ack: could not allocate " - "memory for CAPAB_REQ for %s (%p)\n", ill->ill_name, - (void *)ill); + cmn_err(CE_WARN, "ill_capability_dls_capable: could " + "not allocate memory for CAPAB_REQ for %s (%p)\n", + ill->ill_name, (void *)ill); return; } @@ -2847,46 +2885,93 @@ ill_capability_poll_capable(ill_t *ill, dl_capab_poll_t *ipoll, bcopy(isub, rptr, sizeof (*isub)); rptr += sizeof (*isub); - opoll = (dl_capab_poll_t *)rptr; - rptr += sizeof (dl_capab_poll_t); + odls = (dl_capab_dls_t *)rptr; + rptr += sizeof (dl_capab_dls_t); - /* initialize dl_capab_poll_t to be sent down */ - poll.poll_rx_handle = (uintptr_t)ill; - poll.poll_rx = (uintptr_t)ip_input; - poll.poll_ring_add = (uintptr_t)ill_ring_add; - poll.poll_flags = POLL_ENABLE; - bcopy((void *)&poll, (void *)opoll, sizeof (dl_capab_poll_t)); - ASSERT(nmp->b_wptr == 
(nmp->b_rptr + size)); - - ip1dbg(("ill_capability_poll_capable: asking interface %s " - "to enable polling\n", ill->ill_name)); + /* initialize dl_capab_dls_t to be sent down */ + dls.dls_rx_handle = (uintptr_t)ill; + dls.dls_rx = (uintptr_t)ip_input; + dls.dls_ring_add = (uintptr_t)ill_ring_add; - /* nmp points to a DL_CAPABILITY_REQ message to enable polling */ + if (sub_dl_cap == DL_CAPAB_SOFT_RING) { + dls.dls_ring_cnt = ip_soft_rings_cnt; + dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment; + dls.dls_flags = SOFT_RING_ENABLE; + } else { + dls.dls_flags = POLL_ENABLE; + ip1dbg(("ill_capability_dls_capable: asking interface %s " + "to enable polling\n", ill->ill_name)); + } + bcopy((void *)&dls, (void *)odls, + sizeof (dl_capab_dls_t)); + ASSERT(nmp->b_wptr == (nmp->b_rptr + size)); + /* + * nmp points to a DL_CAPABILITY_REQ message to + * enable either soft_ring or polling + */ ill_dlpi_send(ill, nmp); } +static void +ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp) +{ + mblk_t *mp; + dl_capab_dls_t *idls; + dl_capability_sub_t *dl_subcap; + int size; + + if (!(ill->ill_capabilities & ILL_CAPAB_DLS)) + return; + + ASSERT(ill->ill_dls_capab != NULL); + + size = sizeof (*dl_subcap) + sizeof (*idls); + + mp = allocb(size, BPRI_HI); + if (mp == NULL) { + ip1dbg(("ill_capability_dls_reset: unable to allocate " + "request to disable soft_ring\n")); + return; + } + + mp->b_wptr = mp->b_rptr + size; + + dl_subcap = (dl_capability_sub_t *)mp->b_rptr; + dl_subcap->dl_length = sizeof (*idls); + if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) + dl_subcap->dl_cap = DL_CAPAB_SOFT_RING; + else + dl_subcap->dl_cap = DL_CAPAB_POLL; + + idls = (dl_capab_dls_t *)(dl_subcap + 1); + if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) + idls->dls_flags = SOFT_RING_DISABLE; + else + idls->dls_flags = POLL_DISABLE; + + if (*sc_mp != NULL) + linkb(*sc_mp, mp); + else + *sc_mp = mp; +} /* - * Process a polling capability negotiation ack received - * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_POLL) - * of a DL_CAPABILITY_ACK message. + * Process a soft_ring/poll capability negotiation ack received + * from a DLS Provider.isub must point to the sub-capability + * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message. */ static void -ill_capability_poll_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) +ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) { - dl_capab_poll_t *ipoll; + dl_capab_dls_t *idls; uint_t sub_dl_cap = isub->dl_cap; uint8_t *capend; + ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING || + sub_dl_cap == DL_CAPAB_POLL); - ASSERT(sub_dl_cap == DL_CAPAB_POLL); - - /* - * Don't enable polling for ipv6 ill's - */ - if (ill->ill_isv6) { + if (ill->ill_isv6) return; - } /* * Note: range checks here are not absolutely sufficient to @@ -2897,7 +2982,7 @@ ill_capability_poll_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) */ capend = (uint8_t *)(isub + 1) + isub->dl_length; if (capend > mp->b_wptr) { - cmn_err(CE_WARN, "ill_capability_poll_ack: " + cmn_err(CE_WARN, "ill_capability_dls_ack: " "malformed sub-capability too long for mblk"); return; } @@ -2905,17 +2990,17 @@ ill_capability_poll_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) /* * There are two types of acks we process here: * 1. acks in reply to a (first form) generic capability req - * (poll_flag will be set to POLL_CAPABLE) - * 2. acks in reply to a POLL_ENABLE capability req. 
- * (POLL_ENABLE flag set) + * (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE) + * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE + * capability req. */ - ipoll = (dl_capab_poll_t *)(isub + 1); + idls = (dl_capab_dls_t *)(isub + 1); - if (!dlcapabcheckqid(&ipoll->poll_mid, ill->ill_lmod_rq)) { - ip1dbg(("ill_capability_poll_ack: mid token for polling " + if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) { + ip1dbg(("ill_capability_dls_ack: mid token for dls " "capability isn't as expected; pass-thru " "module(s) detected, discarding capability\n")); - if (ill->ill_capabilities & ILL_CAPAB_POLL) { + if (ill->ill_capabilities & ILL_CAPAB_DLS) { /* * This is a capability renegotitation case. * The interface better be unusable at this @@ -2923,80 +3008,48 @@ ill_capability_poll_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub) * if we disable direct calls on a running * and up interface. */ - ill_capability_poll_disable(ill); + ill_capability_dls_disable(ill); } return; } - switch (ipoll->poll_flags) { + switch (idls->dls_flags) { default: /* Disable if unknown flag */ + case SOFT_RING_DISABLE: case POLL_DISABLE: - ill_capability_poll_disable(ill); + ill_capability_dls_disable(ill); break; + case SOFT_RING_CAPABLE: case POLL_CAPABLE: /* * If the capability was already enabled, its safe * to disable it first to get rid of stale information * and then start enabling it again. */ - ill_capability_poll_disable(ill); - ill_capability_poll_capable(ill, ipoll, isub); + ill_capability_dls_disable(ill); + ill_capability_dls_capable(ill, idls, isub); break; + case SOFT_RING_ENABLE: case POLL_ENABLE: - if (!(ill->ill_capabilities & ILL_CAPAB_POLL)) { - ASSERT(ill->ill_poll_capab != NULL); + mutex_enter(&ill->ill_lock); + if (sub_dl_cap == DL_CAPAB_SOFT_RING && + !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) { + ASSERT(ill->ill_dls_capab != NULL); + ill->ill_capabilities |= ILL_CAPAB_SOFT_RING; + } + if (sub_dl_cap == DL_CAPAB_POLL && + !(ill->ill_capabilities & ILL_CAPAB_POLL)) { + ASSERT(ill->ill_dls_capab != NULL); ill->ill_capabilities |= ILL_CAPAB_POLL; + ip1dbg(("ill_capability_dls_ack: interface %s " + "has enabled polling\n", ill->ill_name)); } - ip1dbg(("ill_capability_poll_ack: interface %s " - "has enabled polling\n", ill->ill_name)); + mutex_exit(&ill->ill_lock); break; } } -static void -ill_capability_poll_reset(ill_t *ill, mblk_t **sc_mp) -{ - mblk_t *mp; - dl_capab_poll_t *ipoll; - dl_capability_sub_t *dl_subcap; - int size; - - if (!(ill->ill_capabilities & ILL_CAPAB_POLL)) - return; - - ASSERT(ill->ill_poll_capab != NULL); - - /* - * Disable polling capability - */ - ill_capability_poll_disable(ill); - - size = sizeof (*dl_subcap) + sizeof (*ipoll); - - mp = allocb(size, BPRI_HI); - if (mp == NULL) { - ip1dbg(("ill_capability_poll_reset: unable to allocate " - "request to disable polling\n")); - return; - } - - mp->b_wptr = mp->b_rptr + size; - - dl_subcap = (dl_capability_sub_t *)mp->b_rptr; - dl_subcap->dl_cap = DL_CAPAB_POLL; - dl_subcap->dl_length = sizeof (*ipoll); - - ipoll = (dl_capab_poll_t *)(dl_subcap + 1); - ipoll->poll_flags = POLL_DISABLE; - - if (*sc_mp != NULL) - linkb(*sc_mp, mp); - else - *sc_mp = mp; -} - - /* * Process a hardware checksum offload capability negotiation ack received * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM) @@ -7340,6 +7393,12 @@ ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring) /* Just clean one squeue */ mutex_enter(&ill->ill_lock); + /* + * Reset the ILL_SOFT_RING_ASSIGN bit so that 
+ * ip_squeue_soft_ring_affinty() will not go + * ahead with assigning rings. + */ + ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN; while (rx_ring->rr_ring_state == ILL_RING_INPROC) /* Some operations pending on the ring. Wait */ cv_wait(&ill->ill_cv, &ill->ill_lock); @@ -7376,7 +7435,7 @@ ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring) /* * Use the preallocated ill_unbind_conn for this purpose */ - connp = ill->ill_poll_capab->ill_unbind_conn; + connp = ill->ill_dls_capab->ill_unbind_conn; mp = &connp->conn_tcp->tcp_closemp; CONN_INC_REF(connp); squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL); @@ -7396,15 +7455,15 @@ ipsq_clean_all(ill_t *ill) /* * No need to clean if poll_capab isn't set for this ill */ - if (!(ill->ill_capabilities & ILL_CAPAB_POLL)) + if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))) return; - ill->ill_capabilities &= ~ILL_CAPAB_POLL; - for (idx = 0; idx < ILL_MAX_RINGS; idx++) { - ill_rx_ring_t *ipr = &ill->ill_poll_capab->ill_ring_tbl[idx]; + ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx]; ipsq_clean_ring(ill, ipr); } + + ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING); } /* ARGSUSED */ diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c index ae7731ac7b..78db295c78 100644 --- a/usr/src/uts/common/inet/ip/ip_squeue.c +++ b/usr/src/uts/common/inet/ip/ip_squeue.c @@ -125,13 +125,16 @@ * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1 * mapping between squeue and NIC (or Rx ring) for performance reasons so * each squeue can uniquely own a NIC or a Rx ring and do polling - * (PSARC 2004/630). So we allow up to MAX_THREAD_PER_CPU squeues per CPU. - * We start by creating MIN_THREAD_PER_CPU squeues per CPU but more squeues + * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU. + * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues * can be created dynamically as needed. */ -#define MAX_THREAD_PER_CPU 32 -#define MIN_THREAD_PER_CPU 1 -uint_t ip_threads_per_cpu = MIN_THREAD_PER_CPU; +#define MAX_SQUEUES_PER_CPU 32 +#define MIN_SQUEUES_PER_CPU 1 +uint_t ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU; + +#define IP_NUM_SOFT_RINGS 2 +uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS; /* * List of all created squeue sets. The size is protected by cpu_lock @@ -155,11 +158,12 @@ static int ip_squeue_cpu_setup(cpu_setup_t, int, void *); static void ip_squeue_set_bind(squeue_set_t *); static void ip_squeue_set_unbind(squeue_set_t *); +static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t); #define CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS)) /* - * Create squeue set containing ip_threads_per_cpu number of squeues + * Create squeue set containing ip_squeues_per_cpu number of squeues * for this CPU and bind them all to the CPU. 
*/ static squeue_set_t * @@ -186,13 +190,13 @@ ip_squeue_set_create(cpu_t *cp, boolean_t reuse) } sqs = kmem_zalloc(sizeof (squeue_set_t) + - (sizeof (squeue_t *) * MAX_THREAD_PER_CPU), KM_SLEEP); + (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP); mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL); sqs->sqs_list = (squeue_t **)&sqs[1]; - sqs->sqs_max_size = MAX_THREAD_PER_CPU; + sqs->sqs_max_size = MAX_SQUEUES_PER_CPU; sqs->sqs_bind = id; - for (i = 0; i < ip_threads_per_cpu; i++) { + for (i = 0; i < ip_squeues_per_cpu; i++) { bzero(sqname, sizeof (sqname)); (void) snprintf(sqname, sizeof (sqname), @@ -202,6 +206,12 @@ ip_squeue_set_create(cpu_t *cp, boolean_t reuse) sqp = squeue_create(sqname, id, ip_squeue_worker_wait, minclsyspri); + /* + * The first squeue in each squeue_set is the DEFAULT + * squeue. + */ + sqp->sq_state |= SQS_DEFAULT; + ASSERT(sqp != NULL); squeue_profile_enable(sqp); @@ -229,10 +239,10 @@ ip_squeue_init(void (*callback)(squeue_t *)) ASSERT(sqset_global_list == NULL); - if (ip_threads_per_cpu < MIN_THREAD_PER_CPU) - ip_threads_per_cpu = MIN_THREAD_PER_CPU; - else if (ip_threads_per_cpu > MAX_THREAD_PER_CPU) - ip_threads_per_cpu = MAX_THREAD_PER_CPU; + if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU) + ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU; + else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU) + ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU; ip_squeue_create_callback = callback; squeue_init(); @@ -293,6 +303,10 @@ ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2) mutex_exit(&sqp->sq_lock); ill = ring->rr_ill; + if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { + ASSERT(ring->rr_handle != NULL); + ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle); + } /* * Cleanup the ring @@ -338,15 +352,20 @@ ip_squeue_extend(void *arg) ill_t *ill = sq_arg->ip_taskq_ill; ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring; cpu_t *intr_cpu = sq_arg->ip_taskq_cpu; - squeue_set_t *sqs; + squeue_set_t *sqs; squeue_t *sqp = NULL; - char sqname[64]; - int i; ASSERT(ill != NULL); ASSERT(ill_rx_ring != NULL); kmem_free(arg, sizeof (ip_taskq_arg_t)); + /* + * Make sure the CPU that originally took the interrupt still + * exists. + */ + if (!CPU_ISON(intr_cpu)) + intr_cpu = CPU; + sqs = intr_cpu->cpu_squeue_set; /* @@ -356,10 +375,337 @@ ip_squeue_extend(void *arg) * is sequential, we need to hold the ill_lock. */ mutex_enter(&ill->ill_lock); + sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE); + if (sqp == NULL) { + /* + * We hit the max limit of squeues allowed per CPU. + * Assign this rx_ring to DEFAULT squeue of the + * interrupted CPU but the squeue will not manage + * the ring. Also print a warning. + */ + cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already " + "has max number of squeues. System performance might " + "become suboptimal\n", sqs->sqs_bind, (void *)sqs); + + /* the first squeue in the list is the default squeue */ + sqp = sqs->sqs_list[0]; + ASSERT(sqp != NULL); + ill_rx_ring->rr_sqp = sqp; + ill_rx_ring->rr_ring_state = ILL_RING_INUSE; + + mutex_exit(&ill->ill_lock); + ill_waiter_dcr(ill); + return; + } + + ASSERT(MUTEX_HELD(&sqp->sq_lock)); + sqp->sq_rx_ring = ill_rx_ring; + ill_rx_ring->rr_sqp = sqp; + ill_rx_ring->rr_ring_state = ILL_RING_INUSE; + + sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB); + mutex_exit(&sqp->sq_lock); + + mutex_exit(&ill->ill_lock); + + /* ill_waiter_dcr will also signal any waiters on ill_ring_state */ + ill_waiter_dcr(ill); +} + +/* + * Do a Rx ring to squeue binding. 
Find a unique squeue that is not + * managing a receive ring. If no such squeue exists, dynamically + * create a new one in the squeue set. + * + * The function runs via the system taskq. The ill passed as an + * argument can't go away since we hold a ref. The lock order is + * ill_lock -> sqs_lock -> sq_lock. + * + * If we are binding a Rx ring to a squeue attached to the offline CPU, + * no need to check that because squeues are never destroyed once + * created. + */ +/* ARGSUSED */ +static void +ip_squeue_soft_ring_affinity(void *arg) +{ + ip_taskq_arg_t *sq_arg = (ip_taskq_arg_t *)arg; + ill_t *ill = sq_arg->ip_taskq_ill; + ill_dls_capab_t *ill_soft_ring = ill->ill_dls_capab; + ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring; + cpu_t *intr_cpu = sq_arg->ip_taskq_cpu; + cpu_t *bind_cpu; + int cpu_id = intr_cpu->cpu_id; + int min_cpu_id, max_cpu_id; + boolean_t enough_uniq_cpus = B_FALSE; + boolean_t enough_cpus = B_FALSE; + squeue_set_t *sqs, *last_sqs; + squeue_t *sqp = NULL; + int i, j; + + ASSERT(ill != NULL); + kmem_free(arg, sizeof (ip_taskq_arg_t)); + + /* + * Make sure the CPU that originally took the interrupt still + * exists. + */ + if (!CPU_ISON(intr_cpu)) { + intr_cpu = CPU; + cpu_id = intr_cpu->cpu_id; + } + + /* + * If this ill represents link aggregation, then there might be + * multiple NICs trying to register them selves at the same time + * and in order to ensure that test and assignment of free rings + * is sequential, we need to hold the ill_lock. + */ + mutex_enter(&ill->ill_lock); + + if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) { + mutex_exit(&ill->ill_lock); + return; + } + /* + * We need to fanout the interrupts from the NIC. We do that by + * telling the driver underneath to create soft rings and use + * worker threads (if the driver advertized SOFT_RING capability) + * Its still a big performance win to if we can fanout to the + * threads on the same core that is taking interrupts. + * + * Since we don't know the interrupt to CPU binding, we don't + * assign any squeues or affinity to worker threads in the NIC. + * At the time of the first interrupt, we know which CPU is + * taking interrupts and try to find other threads on the same + * core. Assuming, ip_threads_per_cpu is correct and cpus are + * numbered sequentially for each core (XXX need something better + * than this in future), find the lowest number and highest + * number thread for that core. + * + * If we have one more thread per core than number of soft rings, + * then don't assign any worker threads to the H/W thread (cpu) + * taking interrupts (capability negotiation tries to ensure this) + * + * If the number of threads per core are same as the number of + * soft rings, then assign the worker affinity and squeue to + * the same cpu. + * + * Otherwise, just fanout to higher number CPUs starting from + * the interrupted CPU. + */ + + min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu; + max_cpu_id = min_cpu_id + ip_threads_per_cpu; + + cmn_err(CE_CONT, "soft_ring_affinity: min/max/intr = %d/%d/%d\n", + min_cpu_id, max_cpu_id, (int)intr_cpu->cpu_id); + + /* + * Quickly check if there are enough CPUs present for fanout + * and also max_cpu_id is less than the id of the active CPU. + * We use the cpu_id stored in the last squeue_set to get + * an idea. The scheme is by no means perfect since it doesn't + * take into account CPU DR operations and the fact that + * interrupts themselves might change. 
An ideal scenario + * would be to ensure that interrupts run cpus by themselves + * and worker threads never have affinity to those CPUs. If + * the interrupts move to CPU which had a worker thread, it + * should be changed. Probably callbacks similar to CPU offline + * are needed to make it work perfectly. + */ + last_sqs = sqset_global_list[sqset_global_size - 1]; + if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) { + if ((max_cpu_id - min_cpu_id) > + ill_soft_ring->ill_dls_soft_ring_cnt) + enough_uniq_cpus = B_TRUE; + else if ((max_cpu_id - min_cpu_id) >= + ill_soft_ring->ill_dls_soft_ring_cnt) + enough_cpus = B_TRUE; + } + + j = 0; + for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) { + if (enough_uniq_cpus) { + if ((min_cpu_id + i) == cpu_id) { + j++; + continue; + } + bind_cpu = cpu[min_cpu_id + i]; + } else if (enough_cpus) { + bind_cpu = cpu[min_cpu_id + i]; + } else { + /* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */ + bind_cpu = cpu[(cpu_id + i) % ncpus]; + } + + /* + * Check if the CPU actually exist and active. If not, + * use the interrupted CPU. ip_find_unused_squeue() will + * find the right CPU to fanout anyway. + */ + if (!CPU_ISON(bind_cpu)) + bind_cpu = intr_cpu; + + sqs = bind_cpu->cpu_squeue_set; + ASSERT(sqs != NULL); + ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j]; + + sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE); + if (sqp == NULL) { + /* + * We hit the max limit of squeues allowed per CPU. + * Assign this rx_ring to DEFAULT squeue of the + * interrupted CPU but thesqueue will not manage + * the ring. Also print a warning. + */ + cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = " + "%d/%p already has max number of squeues. System " + "performance might become suboptimal\n", + sqs->sqs_bind, (void *)sqs); + + /* the first squeue in the list is the default squeue */ + sqp = intr_cpu->cpu_squeue_set->sqs_list[0]; + ASSERT(sqp != NULL); + + ill_rx_ring->rr_sqp = sqp; + ill_rx_ring->rr_ring_state = ILL_RING_INUSE; + continue; + + } + ASSERT(MUTEX_HELD(&sqp->sq_lock)); + ill_rx_ring->rr_sqp = sqp; + sqp->sq_rx_ring = ill_rx_ring; + ill_rx_ring->rr_ring_state = ILL_RING_INUSE; + sqp->sq_state |= SQS_ILL_BOUND; + + /* assign affinity to soft ring */ + if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) { + ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle, + sqp->sq_bind); + } + mutex_exit(&sqp->sq_lock); + + cmn_err(CE_CONT, "soft_ring_affinity: ring = %d, bind = %d\n", + i - j, sqp->sq_bind); + } + mutex_exit(&ill->ill_lock); + + ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle, + SOFT_RING_SRC_HASH); + + /* ill_waiter_dcr will also signal any waiters on ill_ring_state */ + ill_waiter_dcr(ill); +} + +void +ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring, +mblk_t *mp_chain, size_t hdrlen) +{ + ip_taskq_arg_t *taskq_arg; + boolean_t refheld; + + ASSERT(servicing_interrupt()); + ASSERT(ip_ring == NULL); + + mutex_enter(&ill->ill_lock); + if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) { + taskq_arg = (ip_taskq_arg_t *) + kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP); + + if (taskq_arg == NULL) + goto out; + + taskq_arg->ip_taskq_ill = ill; + taskq_arg->ip_taskq_ill_rx_ring = ip_ring; + taskq_arg->ip_taskq_cpu = CPU; + + /* + * Set ILL_SOFT_RING_ASSIGN flag. 
We don't want + * the next interrupt to schedule a task for calling + * ip_squeue_soft_ring_affinity(); + */ + ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN; + } else { + mutex_exit(&ill->ill_lock); + goto out; + } + mutex_exit(&ill->ill_lock); + refheld = ill_waiter_inc(ill); + if (refheld) { + if (taskq_dispatch(system_taskq, + ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP)) + goto out; + + /* release ref on ill if taskq dispatch fails */ + ill_waiter_dcr(ill); + } + /* + * Turn on CAPAB_SOFT_RING so that affinity assignment + * can be tried again later. + */ + mutex_enter(&ill->ill_lock); + ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN; + mutex_exit(&ill->ill_lock); + kmem_free(taskq_arg, sizeof (ip_taskq_arg_t)); + +out: + ip_input(ill, ip_ring, mp_chain, hdrlen); +} + +static squeue_t * +ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout) +{ + int i; + squeue_set_t *best_sqs = NULL; + squeue_set_t *curr_sqs = NULL; + int min_sq = 0; + squeue_t *sqp = NULL; + char sqname[64]; + + /* + * If fanout is set and the passed squeue_set already has some + * squeues which are managing the NICs, try to find squeues on + * unused CPU. + */ + if (sqs->sqs_size > 1 && fanout) { + /* + * First check to see if any squeue on the CPU passed + * is managing a NIC. + */ + for (i = 0; i < sqs->sqs_size; i++) { + mutex_enter(&sqs->sqs_list[i]->sq_lock); + if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) && + !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) { + mutex_exit(&sqs->sqs_list[i]->sq_lock); + break; + } + mutex_exit(&sqs->sqs_list[i]->sq_lock); + } + if (i != sqs->sqs_size) { + best_sqs = sqset_global_list[sqset_global_size - 1]; + min_sq = best_sqs->sqs_size; + + for (i = sqset_global_size - 2; i >= 0; i--) { + curr_sqs = sqset_global_list[i]; + if (curr_sqs->sqs_size < min_sq) { + best_sqs = curr_sqs; + min_sq = curr_sqs->sqs_size; + } + } + + ASSERT(best_sqs != NULL); + sqs = best_sqs; + bind_cpu = cpu[sqs->sqs_bind]; + } + } + mutex_enter(&sqs->sqs_lock); + for (i = 0; i < sqs->sqs_size; i++) { mutex_enter(&sqs->sqs_list[i]->sq_lock); - if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) == 0) { + if ((sqs->sqs_list[i]->sq_state & + (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) { sqp = sqs->sqs_list[i]; break; } @@ -371,29 +717,19 @@ ip_squeue_extend(void *arg) if (sqs->sqs_size == sqs->sqs_max_size) { /* * Reached the max limit for squeue - * we can allocate on this CPU. Leave - * ill_ring_state set to ILL_RING_INPROC - * so that ip_squeue_direct will just - * assign the default squeue for this - * ring for future connections. + * we can allocate on this CPU. 
*/ -#ifdef DEBUG - cmn_err(CE_NOTE, "ip_squeue_add: Reached max " - " threads per CPU for sqp = %p\n", (void *)sqp); -#endif mutex_exit(&sqs->sqs_lock); - mutex_exit(&ill->ill_lock); - ill_waiter_dcr(ill); - return; + return (NULL); } bzero(sqname, sizeof (sqname)); (void) snprintf(sqname, sizeof (sqname), - "ip_squeue_cpu_%d/%d/%d", CPU->cpu_seqid, - CPU->cpu_id, sqs->sqs_size); + "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid, + bind_cpu->cpu_id, sqs->sqs_size); - sqp = squeue_create(sqname, CPU->cpu_id, ip_squeue_worker_wait, - minclsyspri); + sqp = squeue_create(sqname, bind_cpu->cpu_id, + ip_squeue_worker_wait, minclsyspri); ASSERT(sqp != NULL); @@ -403,26 +739,18 @@ ip_squeue_extend(void *arg) if (ip_squeue_create_callback != NULL) ip_squeue_create_callback(sqp); - if (ip_squeue_bind) { + mutex_enter(&cpu_lock); + if (ip_squeue_bind && cpu_is_online(bind_cpu)) { squeue_bind(sqp, -1); } + mutex_exit(&cpu_lock); + mutex_enter(&sqp->sq_lock); } - ASSERT(sqp != NULL); - - sqp->sq_rx_ring = ill_rx_ring; - ill_rx_ring->rr_sqp = sqp; - ill_rx_ring->rr_ring_state = ILL_RING_INUSE; - - sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB); - mutex_exit(&sqp->sq_lock); mutex_exit(&sqs->sqs_lock); - - mutex_exit(&ill->ill_lock); - - /* ill_waiter_dcr will also signal any waiters on ill_ring_state */ - ill_waiter_dcr(ill); + ASSERT(sqp != NULL); + return (sqp); } /* @@ -657,6 +985,21 @@ ip_squeue_set_unbind(squeue_set_t *sqs) mutex_enter(&sqs->sqs_lock); for (i = 0; i < sqs->sqs_size; i++) { sqp = sqs->sqs_list[i]; + + /* + * CPU is going offline. Remove the thread affinity + * for any soft ring threads the squeue is managing. + */ + if (sqp->sq_state & SQS_ILL_BOUND) { + ill_rx_ring_t *ring = sqp->sq_rx_ring; + ill_t *ill = ring->rr_ill; + + if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) { + ASSERT(ring->rr_handle != NULL); + ill->ill_dls_capab->ill_dls_unbind( + ring->rr_handle); + } + } if (!(sqp->sq_state & SQS_BOUND)) continue; squeue_unbind(sqp); diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h index 8a9f611fab..dd8f1db3da 100644 --- a/usr/src/uts/common/inet/ip_impl.h +++ b/usr/src/uts/common/inet/ip_impl.h @@ -466,20 +466,21 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t; putnext((connp)->conn_rq, mp); \ } -#define ILL_POLL_CAPABLE(ill) \ - (((ill)->ill_capabilities & ILL_CAPAB_POLL) != 0) +#define ILL_DLS_CAPABLE(ill) \ + (((ill)->ill_capabilities & \ + (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) != 0) /* * Macro that hands off one or more messages directly to DLD * when the interface is marked with ILL_CAPAB_POLL. 
*/ -#define IP_POLL_ILL_TX(ill, mp) { \ - ill_poll_capab_t *ill_poll = ill->ill_poll_capab; \ - ASSERT(ILL_POLL_CAPABLE(ill)); \ - ASSERT(ill_poll != NULL); \ - ASSERT(ill_poll->ill_tx != NULL); \ - ASSERT(ill_poll->ill_tx_handle != NULL); \ - ill_poll->ill_tx(ill_poll->ill_tx_handle, mp); \ +#define IP_DLS_ILL_TX(ill, mp) { \ + ill_dls_capab_t *ill_dls = ill->ill_dls_capab; \ + ASSERT(ILL_DLS_CAPABLE(ill)); \ + ASSERT(ill_dls != NULL); \ + ASSERT(ill_dls->ill_tx != NULL); \ + ASSERT(ill_dls->ill_tx_handle != NULL); \ + ill_dls->ill_tx(ill_dls->ill_tx_handle, mp); \ } extern int ip_wput_frag_mdt_min; diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c index 61495f4705..886f6a00e5 100644 --- a/usr/src/uts/common/inet/tcp/tcp.c +++ b/usr/src/uts/common/inet/tcp/tcp.c @@ -18126,13 +18126,13 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp) ire->ire_last_used_time = lbolt; BUMP_MIB(&ip_mib, ipOutRequests); - if (ILL_POLL_CAPABLE(ill)) { + if (ILL_DLS_CAPABLE(ill)) { /* * Send the packet directly to DLD, where it may be queued * depending on the availability of transmit resources at * the media layer. */ - IP_POLL_ILL_TX(ill, mp); + IP_DLS_ILL_TX(ill, mp); } else { putnext(ire->ire_stq, mp); } diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c index c13d7c485f..3888f36ce2 100644 --- a/usr/src/uts/common/inet/udp/udp.c +++ b/usr/src/uts/common/inet/udp/udp.c @@ -6168,13 +6168,13 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha) UPDATE_OB_PKT_COUNT(ire); ire->ire_last_used_time = lbolt; - if (ILL_POLL_CAPABLE(ill)) { + if (ILL_DLS_CAPABLE(ill)) { /* * Send the packet directly to DLD, where it may be queued * depending on the availability of transmit resources at * the media layer. */ - IP_POLL_ILL_TX(ill, mp); + IP_DLS_ILL_TX(ill, mp); } else { putnext(ire->ire_stq, mp); } diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c index 4dce2ecc6d..8511f99890 100644 --- a/usr/src/uts/common/io/dld/dld_proto.c +++ b/usr/src/uts/common/io/dld/dld_proto.c @@ -37,6 +37,7 @@ #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/strsun.h> +#include <sys/cpuvar.h> #include <sys/dlpi.h> #include <netinet/in.h> #include <sys/sdt.h> @@ -46,6 +47,7 @@ #include <sys/dls.h> #include <sys/dld.h> #include <sys/dld_impl.h> +#include <sys/dls_soft_ring.h> typedef boolean_t proto_reqfunc_t(dld_str_t *, union DL_primitives *, mblk_t *); @@ -56,9 +58,15 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req, proto_notify_req, proto_unitdata_req, proto_passive_req; static void proto_poll_disable(dld_str_t *); -static boolean_t proto_poll_enable(dld_str_t *, dl_capab_poll_t *); +static boolean_t proto_poll_enable(dld_str_t *, dl_capab_dls_t *); static boolean_t proto_capability_advertise(dld_str_t *, mblk_t *); +static void proto_soft_ring_disable(dld_str_t *); +static boolean_t proto_soft_ring_enable(dld_str_t *, dl_capab_dls_t *); +static boolean_t proto_capability_advertise(dld_str_t *, mblk_t *); +static void proto_change_soft_ring_fanout(dld_str_t *, int); +static void proto_stop_soft_ring_threads(void *); + #define DL_ACK_PENDING(state) \ ((state) == DL_ATTACH_PENDING || \ (state) == DL_DETACH_PENDING || \ @@ -606,6 +614,22 @@ proto_unbind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) */ dsp->ds_mode = DLD_UNITDATA; + /* + * If soft rings were enabled, the workers + * should be quiesced. Start a task that will + * get this in motion. 
We cannot check for + * ds_soft_ring flag because + * proto_soft_ring_disable() called from + * proto_capability_req() would have reset it. + */ + if (dls_soft_ring_workers(dsp->ds_dc)) { + dsp->ds_unbind_req = mp; + dsp->ds_task_id = taskq_dispatch(system_taskq, + proto_stop_soft_ring_threads, (void *)dsp, TQ_SLEEP); + rw_exit(&dsp->ds_lock); + return (B_TRUE); + } + dsp->ds_dlstate = DL_UNBOUND; rw_exit(&dsp->ds_lock); @@ -1055,6 +1079,20 @@ failed: return (B_FALSE); } +static boolean_t +check_ip_above(queue_t *q) +{ + queue_t *next_q; + boolean_t ret = B_TRUE; + + claimstr(q); + next_q = q->q_next; + if (strcmp(next_q->q_qinfo->qi_minfo->mi_idname, "ip") != 0) + ret = B_FALSE; + releasestr(q); + return (ret); +} + /* * DL_CAPABILITY_REQ */ @@ -1141,14 +1179,14 @@ proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) * IP polling interface. */ case DL_CAPAB_POLL: { - dl_capab_poll_t *pollp; - dl_capab_poll_t poll; + dl_capab_dls_t *pollp; + dl_capab_dls_t poll; - pollp = (dl_capab_poll_t *)&sp[1]; + pollp = (dl_capab_dls_t *)&sp[1]; /* * Copy for alignment. */ - bcopy(pollp, &poll, sizeof (dl_capab_poll_t)); + bcopy(pollp, &poll, sizeof (dl_capab_dls_t)); /* * We need to become writer before enabling and/or @@ -1168,7 +1206,7 @@ proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) } upgraded = B_TRUE; - switch (poll.poll_flags) { + switch (poll.dls_flags) { default: /*FALLTHRU*/ case POLL_DISABLE: @@ -1186,16 +1224,81 @@ proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp) /* * Now attempt enable it. */ - if (!proto_poll_enable(dsp, &poll)) - break; + if (check_ip_above(dsp->ds_rq) && + proto_poll_enable(dsp, &poll)) { + bzero(&poll, sizeof (dl_capab_dls_t)); + poll.dls_flags = POLL_ENABLE; + } + break; + } + + dlcapabsetqid(&(poll.dls_mid), dsp->ds_rq); + bcopy(&poll, pollp, sizeof (dl_capab_dls_t)); + break; + } + case DL_CAPAB_SOFT_RING: { + dl_capab_dls_t *soft_ringp; + dl_capab_dls_t soft_ring; + + soft_ringp = (dl_capab_dls_t *)&sp[1]; + /* + * Copy for alignment. + */ + bcopy(soft_ringp, &soft_ring, + sizeof (dl_capab_dls_t)); - bzero(&poll, sizeof (dl_capab_poll_t)); - poll.poll_flags = POLL_ENABLE; + /* + * We need to become writer before enabling and/or + * disabling the soft_ring interface. If we couldn' + * upgrade, check state again after re-acquiring the + * lock to make sure we can proceed. + */ + if (!upgraded && !rw_tryupgrade(&dsp->ds_lock)) { + rw_exit(&dsp->ds_lock); + rw_enter(&dsp->ds_lock, RW_WRITER); + + if (dsp->ds_dlstate == DL_UNATTACHED || + DL_ACK_PENDING(dsp->ds_dlstate)) { + dl_err = DL_OUTSTATE; + goto failed; + } + } + upgraded = B_TRUE; + + switch (soft_ring.dls_flags) { + default: + /*FALLTHRU*/ + case SOFT_RING_DISABLE: + proto_soft_ring_disable(dsp); + break; + + case SOFT_RING_ENABLE: + /* + * Make sure soft_ring is disabled. + */ + proto_soft_ring_disable(dsp); + + /* + * Now attempt enable it. 
+ */ + if (check_ip_above(dsp->ds_rq) && + proto_soft_ring_enable(dsp, &soft_ring)) { + bzero(&soft_ring, + sizeof (dl_capab_dls_t)); + soft_ring.dls_flags = + SOFT_RING_ENABLE; + } else { + bzero(&soft_ring, + sizeof (dl_capab_dls_t)); + soft_ring.dls_flags = + SOFT_RING_DISABLE; + } break; } - dlcapabsetqid(&(poll.poll_mid), dsp->ds_rq); - bcopy(&poll, pollp, sizeof (dl_capab_poll_t)); + dlcapabsetqid(&(soft_ring.dls_mid), dsp->ds_rq); + bcopy(&soft_ring, soft_ringp, + sizeof (dl_capab_dls_t)); break; } default: @@ -1440,6 +1543,7 @@ proto_poll_disable(dld_str_t *dsp) */ mh = dls_mac(dsp->ds_dc); mac_resource_set(mh, NULL, NULL); + mac_resources(mh); /* * Set receive function back to default. @@ -1454,7 +1558,7 @@ proto_poll_disable(dld_str_t *dsp) } static boolean_t -proto_poll_enable(dld_str_t *dsp, dl_capab_poll_t *pollp) +proto_poll_enable(dld_str_t *dsp, dl_capab_dls_t *pollp) { mac_handle_t mh; @@ -1473,15 +1577,15 @@ proto_poll_enable(dld_str_t *dsp, dl_capab_poll_t *pollp) /* * Register resources. */ - mac_resource_set(mh, (mac_resource_add_t)pollp->poll_ring_add, - (void *)pollp->poll_rx_handle); + mac_resource_set(mh, (mac_resource_add_t)pollp->dls_ring_add, + (void *)pollp->dls_rx_handle); mac_resources(mh); /* * Set the receive function. */ - dls_rx_set(dsp->ds_dc, (dls_rx_t)pollp->poll_rx, - (void *)pollp->poll_rx_handle); + dls_rx_set(dsp->ds_dc, (dls_rx_t)pollp->dls_rx, + (void *)pollp->dls_rx_handle); /* * Note that polling is enabled. This prevents further DLIOCHDRINFO @@ -1491,6 +1595,74 @@ proto_poll_enable(dld_str_t *dsp, dl_capab_poll_t *pollp) return (B_TRUE); } +static void +proto_soft_ring_disable(dld_str_t *dsp) +{ + ASSERT(RW_WRITE_HELD(&dsp->ds_lock)); + + if (!dsp->ds_soft_ring) + return; + + /* + * It should be impossible to enable raw mode if soft_ring is turned on. + */ + ASSERT(dsp->ds_mode != DLD_RAW); + proto_change_soft_ring_fanout(dsp, SOFT_RING_NONE); + /* + * Note that fanout is disabled. + */ + dsp->ds_soft_ring = B_FALSE; +} + +static boolean_t +proto_soft_ring_enable(dld_str_t *dsp, dl_capab_dls_t *soft_ringp) +{ + ASSERT(RW_WRITE_HELD(&dsp->ds_lock)); + ASSERT(!dsp->ds_soft_ring); + + /* + * We cannot enable soft_ring if raw mode + * has been enabled. + */ + if (dsp->ds_mode == DLD_RAW) + return (B_FALSE); + + if (dls_soft_ring_enable(dsp->ds_dc, soft_ringp) == B_FALSE) + return (B_FALSE); + + dsp->ds_soft_ring = B_TRUE; + return (B_TRUE); +} + +static void +proto_change_soft_ring_fanout(dld_str_t *dsp, int type) +{ + dls_rx_t rx; + + if (type == SOFT_RING_NONE) { + rx = (dsp->ds_mode == DLD_FASTPATH) ? 
+ dld_str_rx_fastpath : dld_str_rx_unitdata; + } else { + rx = (dls_rx_t)dls_ether_soft_ring_fanout; + } + dls_soft_ring_rx_set(dsp->ds_dc, rx, dsp, type); +} + +static void +proto_stop_soft_ring_threads(void *arg) +{ + dld_str_t *dsp = (dld_str_t *)arg; + + rw_enter(&dsp->ds_lock, RW_WRITER); + dls_soft_ring_disable(dsp->ds_dc); + dsp->ds_dlstate = DL_UNBOUND; + rw_exit(&dsp->ds_lock); + dlokack(dsp->ds_wq, dsp->ds_unbind_req, DL_UNBIND_REQ); + rw_enter(&dsp->ds_lock, RW_WRITER); + dsp->ds_task_id = NULL; + rw_exit(&dsp->ds_lock); +} + /* * DL_CAPABILITY_ACK/DL_ERROR_ACK */ @@ -1500,7 +1672,8 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) dl_capability_ack_t *dlap; dl_capability_sub_t *dlsp; size_t subsize; - dl_capab_poll_t poll; + dl_capab_dls_t poll; + dl_capab_dls_t soft_ring; dl_capab_hcksum_t hcksum; dl_capab_zerocopy_t zcopy; uint8_t *ptr; @@ -1516,6 +1689,9 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) */ subsize = 0; + /* Always advertize soft ring capability for GLDv3 drivers */ + subsize += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dls_t); + /* * Check if polling can be enabled on this interface. * If advertising DL_CAPAB_POLL has not been explicitly disabled @@ -1525,7 +1701,7 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) !(dld_opt & DLD_OPT_NO_POLL) && (dsp->ds_vid == VLAN_ID_NONE)); if (poll_cap) { subsize += sizeof (dl_capability_sub_t) + - sizeof (dl_capab_poll_t); + sizeof (dl_capab_dls_t); } /* @@ -1550,7 +1726,7 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) * If there are no capabilities to advertise or if we * can't allocate a response, send a DL_ERROR_ACK. */ - if (subsize == 0 || (mp1 = reallocb(mp, + if ((mp1 = reallocb(mp, sizeof (dl_capability_ack_t) + subsize, 0)) == NULL) { rw_exit(&dsp->ds_lock); dlerrorack(q, mp, DL_CAPABILITY_REQ, DL_NOTSUPPORTED, 0); @@ -1594,7 +1770,7 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) rw_downgrade(&dsp->ds_lock); poll_capab_size = sizeof (dl_capability_sub_t) + - sizeof (dl_capab_poll_t); + sizeof (dl_capab_dls_t); mp->b_wptr -= poll_capab_size; subsize -= poll_capab_size; @@ -1607,23 +1783,43 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp) dlsp = (dl_capability_sub_t *)ptr; dlsp->dl_cap = DL_CAPAB_POLL; - dlsp->dl_length = sizeof (dl_capab_poll_t); + dlsp->dl_length = sizeof (dl_capab_dls_t); ptr += sizeof (dl_capability_sub_t); - bzero(&poll, sizeof (dl_capab_poll_t)); - poll.poll_version = POLL_VERSION_1; - poll.poll_flags = POLL_CAPABLE; - poll.poll_tx_handle = (uintptr_t)dsp; - poll.poll_tx = (uintptr_t)str_mdata_fastpath_put; + bzero(&poll, sizeof (dl_capab_dls_t)); + poll.dls_version = POLL_VERSION_1; + poll.dls_flags = POLL_CAPABLE; + poll.dls_tx_handle = (uintptr_t)dsp; + poll.dls_tx = (uintptr_t)str_mdata_fastpath_put; - dlcapabsetqid(&(poll.poll_mid), dsp->ds_rq); - bcopy(&poll, ptr, sizeof (dl_capab_poll_t)); - ptr += sizeof (dl_capab_poll_t); + dlcapabsetqid(&(poll.dls_mid), dsp->ds_rq); + bcopy(&poll, ptr, sizeof (dl_capab_dls_t)); + ptr += sizeof (dl_capab_dls_t); } } ASSERT(RW_READ_HELD(&dsp->ds_lock)); + dlsp = (dl_capability_sub_t *)ptr; + + dlsp->dl_cap = DL_CAPAB_SOFT_RING; + dlsp->dl_length = sizeof (dl_capab_dls_t); + ptr += sizeof (dl_capability_sub_t); + + bzero(&soft_ring, sizeof (dl_capab_dls_t)); + soft_ring.dls_version = SOFT_RING_VERSION_1; + soft_ring.dls_flags = SOFT_RING_CAPABLE; + soft_ring.dls_tx_handle = (uintptr_t)dsp; + soft_ring.dls_tx = (uintptr_t)str_mdata_fastpath_put; + soft_ring.dls_ring_change_status = + 
(uintptr_t)proto_change_soft_ring_fanout; + soft_ring.dls_ring_bind = (uintptr_t)soft_ring_bind; + soft_ring.dls_ring_unbind = (uintptr_t)soft_ring_unbind; + + dlcapabsetqid(&(soft_ring.dls_mid), dsp->ds_rq); + bcopy(&soft_ring, ptr, sizeof (dl_capab_dls_t)); + ptr += sizeof (dl_capab_dls_t); + /* * TCP/IP checksum offload. */ diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c index d723bd7450..c2b8c63e43 100644 --- a/usr/src/uts/common/io/dld/dld_str.c +++ b/usr/src/uts/common/io/dld/dld_str.c @@ -259,6 +259,8 @@ dld_close(queue_t *rq) { dld_str_t *dsp = rq->q_ptr; + ASSERT(dsp->ds_task_id == NULL); + /* * Disable the queue srv(9e) routine. */ @@ -859,6 +861,7 @@ dld_str_detach(dld_str_t *dsp) * Clear the polling and promisc flags. */ dsp->ds_polling = B_FALSE; + dsp->ds_soft_ring = B_FALSE; dsp->ds_promisc = 0; /* @@ -1494,7 +1497,7 @@ ioc_raw(dld_str_t *dsp, mblk_t *mp) queue_t *q = dsp->ds_wq; rw_enter(&dsp->ds_lock, RW_WRITER); - if (dsp->ds_polling) { + if (dsp->ds_polling || dsp->ds_soft_ring) { rw_exit(&dsp->ds_lock); miocnak(q, mp, 0, EPROTO); return; @@ -1604,7 +1607,7 @@ ioc_fast(dld_str_t *dsp, mblk_t *mp) /* * Set the receive callback (unless polling is enabled). */ - if (!dsp->ds_polling) + if (!dsp->ds_polling && !dsp->ds_soft_ring) dls_rx_set(dc, dld_str_rx_fastpath, (void *)dsp); /* diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c index 0968818e07..dbf4edc280 100644 --- a/usr/src/uts/common/io/dls/dls.c +++ b/usr/src/uts/common/io/dls/dls.c @@ -43,10 +43,18 @@ #include <sys/dls.h> #include <sys/dls_impl.h> +#include <sys/dls_soft_ring.h> static kmem_cache_t *i_dls_impl_cachep; static uint32_t i_dls_impl_count; +static kstat_t *dls_ksp = (kstat_t *)NULL; +struct dls_kstats dls_kstat = +{ + { "soft_ring_pkt_drop", KSTAT_DATA_UINT32 }, +}; + + /* * Private functions. */ @@ -257,6 +265,27 @@ vlan: dhip->dhi_vid = VLAN_ID(tci); } +static void +dls_stat_init() +{ + if ((dls_ksp = kstat_create("dls", 0, "dls_stat", + "net", KSTAT_TYPE_NAMED, + sizeof (dls_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL)) == NULL) { + cmn_err(CE_WARN, + "DLS: failed to create kstat structure for dls stats"); + return; + } + dls_ksp->ks_data = (void *)&dls_kstat; + kstat_install(dls_ksp); +} + +static void +dls_stat_destroy() +{ + kstat_delete(dls_ksp); +} + /* * Module initialization functions. */ @@ -271,6 +300,8 @@ dls_init(void) sizeof (dls_impl_t), 0, i_dls_constructor, i_dls_destructor, NULL, NULL, NULL, 0); ASSERT(i_dls_impl_cachep != NULL); + soft_ring_init(); + dls_stat_init(); } int @@ -286,6 +317,7 @@ dls_fini(void) * Destroy the kmem_cache. */ kmem_cache_destroy(i_dls_impl_cachep); + dls_stat_destroy(); return (0); } @@ -423,6 +455,14 @@ dls_close(dls_channel_t dc) */ dip->di_dvp = NULL; dip->di_txinfo = NULL; + + if (dip->di_soft_ring_list != NULL) { + soft_ring_set_destroy(dip->di_soft_ring_list, + dip->di_soft_ring_size); + dip->di_soft_ring_list = NULL; + } + dip->di_soft_ring_size = 0; + kmem_cache_free(i_dls_impl_cachep, dip); /* diff --git a/usr/src/uts/common/io/dls/dls_soft_ring.c b/usr/src/uts/common/io/dls/dls_soft_ring.c new file mode 100644 index 0000000000..cfd75e724a --- /dev/null +++ b/usr/src/uts/common/io/dls/dls_soft_ring.c @@ -0,0 +1,667 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * General Soft rings - Simulating Rx rings in S/W. + * + * This is a general purpose high-performance soft ring mechanism. It is + * similar to a taskq with a single worker thread. The dls creates a + * set of these rings to simulate the H/W Rx ring (DMA channels) some + * NICs have. The purpose is to present a common interface to IP + * so the individual squeues can control these rings and switch them + * between polling and interrupt mode. + * + * This code also serves as a fanout mechanism for fast NIC feeding slow + * CPU where incoming traffic can be separated into multiple soft rings + * based on capability negotiation with IP and IP also creates thread + * affinity to soft ring worker threads to CPU so that conenction to + * CPU/Squeue affinity is never broken. + * + * The soft rings can also be driven by a classifier which can direct + * traffic to individual soft rings based on the input from IP. + */ + +#include <sys/types.h> +#include <sys/cmn_err.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/cpuvar.h> +#include <sys/condvar_impl.h> +#include <sys/systm.h> +#include <sys/callb.h> +#include <sys/sdt.h> +#include <sys/ddi.h> +#include <sys/strsubr.h> +#include <inet/common.h> +#include <inet/ip.h> + +#include <sys/dls_impl.h> +#include <sys/dls_soft_ring.h> + +static void soft_ring_fire(void *); +static void soft_ring_drain(soft_ring_t *, clock_t); +static void soft_ring_worker(soft_ring_t *); +static void soft_ring_stop_workers(soft_ring_t **, int); + +kmem_cache_t *soft_ring_cache; + + +int soft_ring_workerwait_ms = 10; +int soft_ring_max_q_cnt = (4 * 1024 * 1024); + +/* The values above converted to ticks */ +static int soft_ring_workerwait_tick = 0; + +#define SOFT_RING_WORKER_WAKEUP(ringp) { \ + timeout_id_t tid = (ringp)->s_ring_tid; \ + \ + ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ + /* \ + * Queue isn't being processed, so take \ + * any post enqueue actions needed before leaving. \ + */ \ + if (tid != 0) { \ + /* \ + * Waiting for an enter() to process mblk(s). \ + */ \ + clock_t waited = lbolt - (ringp)->s_ring_awaken; \ + \ + if (TICK_TO_MSEC(waited) >= (ringp)->s_ring_wait) { \ + /* \ + * Times up and have a worker thread \ + * waiting for work, so schedule it. \ + */ \ + (ringp)->s_ring_tid = 0; \ + cv_signal(&(ringp)->s_ring_async); \ + mutex_exit(&(ringp)->s_ring_lock); \ + (void) untimeout(tid); \ + } else { \ + mutex_exit(&(ringp)->s_ring_lock); \ + } \ + } else if ((ringp)->s_ring_wait != 0) { \ + (ringp)->s_ring_awaken = lbolt; \ + (ringp)->s_ring_tid = timeout(soft_ring_fire, (ringp), \ + (ringp)->s_ring_wait); \ + mutex_exit(&(ringp)->s_ring_lock); \ + } else { \ + /* \ + * Schedule the worker thread. 
\ + */ \ + cv_signal(&(ringp)->s_ring_async); \ + mutex_exit(&(ringp)->s_ring_lock); \ + } \ + ASSERT(MUTEX_NOT_HELD(&(ringp)->s_ring_lock)); \ +} + + +#define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt) { \ + /* \ + * Enqueue our mblk chain. \ + */ \ + ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ + \ + if ((ringp)->s_ring_last != NULL) \ + (ringp)->s_ring_last->b_next = (mp); \ + else \ + (ringp)->s_ring_first = (mp); \ + (ringp)->s_ring_last = (tail); \ + (ringp)->s_ring_count += (cnt); \ + ASSERT((ringp)->s_ring_count > 0); \ +} + +void +soft_ring_init(void) +{ + soft_ring_cache = kmem_cache_create("soft_ring_cache", + sizeof (soft_ring_t), 64, NULL, NULL, NULL, NULL, NULL, 0); + + soft_ring_workerwait_tick = + MSEC_TO_TICK_ROUNDUP(soft_ring_workerwait_ms); +} + +/* ARGSUSED */ +soft_ring_t * +soft_ring_create(char *name, processorid_t bind, clock_t wait, + uint_t type, pri_t pri) +{ + soft_ring_t *ringp; + + ringp = kmem_cache_alloc(soft_ring_cache, KM_NOSLEEP); + if (ringp == NULL) + return (NULL); + + bzero(ringp, sizeof (soft_ring_t)); + (void) strncpy(ringp->s_ring_name, name, S_RING_NAMELEN + 1); + ringp->s_ring_name[S_RING_NAMELEN] = '\0'; + mutex_init(&ringp->s_ring_lock, NULL, MUTEX_DEFAULT, NULL); + + ringp->s_ring_type = type; + ringp->s_ring_bind = bind; + if (bind != S_RING_BIND_NONE) + soft_ring_bind(ringp, bind); + ringp->s_ring_wait = MSEC_TO_TICK(wait); + + ringp->s_ring_worker = thread_create(NULL, 0, soft_ring_worker, + ringp, 0, &p0, TS_RUN, pri); + + return (ringp); +} + +soft_ring_t ** +soft_ring_set_create(char *name, processorid_t bind, clock_t wait, + uint_t type, pri_t pri, int cnt) +{ + int i; + soft_ring_t **ringp_list; + + if ((ringp_list = + (soft_ring_t **) kmem_zalloc(sizeof (soft_ring_t *) * cnt, + KM_NOSLEEP)) != NULL) { + for (i = 0; i < cnt; i++) { + ringp_list[i] = soft_ring_create(name, bind, wait, + type, pri); + if (ringp_list[i] == NULL) + break; + } + if (i != cnt) { + soft_ring_stop_workers(ringp_list, i); + soft_ring_set_destroy(ringp_list, i); + ringp_list = NULL; + } + } + return (ringp_list); +} + +static void +soft_ring_stop_workers(soft_ring_t **ringp_set, int cnt) +{ + int i; + soft_ring_t *ringp; + timeout_id_t tid; + kt_did_t t_did; + + for (i = 0; i < cnt; i++) { + ringp = ringp_set[i]; + + soft_ring_unbind((void *)ringp); + mutex_enter(&ringp->s_ring_lock); + if ((tid = ringp->s_ring_tid) != 0) + (void) untimeout(tid); + + ringp->s_ring_tid = 0; + + if (!(ringp->s_ring_state & S_RING_DEAD)) { + ringp->s_ring_state |= S_RING_DESTROY; + t_did = ringp->s_ring_worker->t_did; + + + /* Wake the worker so it can exit */ + cv_signal(&(ringp)->s_ring_async); + } + mutex_exit(&ringp->s_ring_lock); + + /* + * Here comes the tricky part. IP and driver ensure + * that packet flow has stopped but worker thread + * might still be draining the soft ring. We have + * already set the S_RING_DESTROY flag. We wait till + * the worker thread takes notice and stops processing + * the soft_ring and exits. It sets S_RING_DEAD on + * exiting. 
+ */ + if (t_did) + thread_join(t_did); + } +} + +void +soft_ring_set_destroy(soft_ring_t **ringp_set, int cnt) +{ + int i; + mblk_t *mp; + soft_ring_t *ringp; + + for (i = 0; i < cnt; i++) { + ringp = ringp_set[i]; + + mutex_enter(&ringp->s_ring_lock); + + ASSERT(ringp->s_ring_state & S_RING_DEAD); + + while ((mp = ringp->s_ring_first) != NULL) { + ringp->s_ring_first = mp->b_next; + mp->b_next = NULL; + freemsg(mp); + } + ringp->s_ring_last = NULL; + mutex_exit(&ringp->s_ring_lock); + + /* + * IP/driver ensure that no packets are flowing + * when we are destroying the soft rings otherwise bad + * things will happen. + */ + kmem_cache_free(soft_ring_cache, ringp); + ringp_set[i] = NULL; + } + kmem_free(ringp_set, sizeof (soft_ring_t *) * cnt); +} + +/* ARGSUSED */ +void +soft_ring_bind(void *arg, processorid_t bind) +{ + cpu_t *cp; + soft_ring_t *ringp = (soft_ring_t *)arg; + + mutex_enter(&ringp->s_ring_lock); + if (ringp->s_ring_state & S_RING_BOUND) { + mutex_exit(&ringp->s_ring_lock); + return; + } + + ringp->s_ring_state |= S_RING_BOUND; + ringp->s_ring_bind = bind; + mutex_exit(&ringp->s_ring_lock); + + cp = cpu[bind]; + mutex_enter(&cpu_lock); + if (cpu_is_online(cp)) { + thread_affinity_set(ringp->s_ring_worker, ringp->s_ring_bind); + } + mutex_exit(&cpu_lock); +} + +void +soft_ring_unbind(void *arg) +{ + soft_ring_t *ringp = (soft_ring_t *)arg; + + mutex_enter(&ringp->s_ring_lock); + if (!(ringp->s_ring_state & S_RING_BOUND)) { + mutex_exit(&ringp->s_ring_lock); + return; + } + + ringp->s_ring_state &= ~S_RING_BOUND; + ringp->s_ring_bind = S_RING_BIND_NONE; + mutex_exit(&ringp->s_ring_lock); + + thread_affinity_clear(ringp->s_ring_worker); +} + +/* + * soft_ring_enter() - enter soft_ring sqp with mblk mp (which can be + * a chain), while tail points to the end and cnt in number of + * mblks in the chain. + * + * For a chain of single packet (i.e. mp == tail), go through the + * fast path if no one is processing the soft_ring and nothing is queued. + * + * The proc and arg for each mblk is already stored in the mblk in + * appropriate places. + */ +/* ARGSUSED */ +void +soft_ring_process(soft_ring_t *ringp, mblk_t *mp_chain, uint8_t tag) +{ + void *arg1, *arg2; + s_ring_proc_t proc; + mblk_t *tail; + int cnt = 1; + + ASSERT(ringp != NULL); + ASSERT(mp_chain != NULL); + ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); + + tail = mp_chain; + while (tail->b_next != NULL) { + tail = tail->b_next; + cnt++; + } + mutex_enter(&ringp->s_ring_lock); + + ringp->s_ring_total_inpkt += cnt; + if (!(ringp->s_ring_state & S_RING_PROC) && + !(ringp->s_ring_type == S_RING_WORKER_ONLY)) { + /* + * See if anything is already queued. If we are the + * first packet, do inline processing else queue the + * packet and do the drain. + */ + if (ringp->s_ring_first == NULL && cnt == 1) { + /* + * Fast-path, ok to process and nothing queued. + */ + ringp->s_ring_run = curthread; + ringp->s_ring_state |= (S_RING_PROC); + + /* + * We are the chain of 1 packet so + * go through this fast path. + */ + ASSERT(mp_chain->b_next == NULL); + proc = ringp->s_ring_upcall; + arg1 = ringp->s_ring_upcall_arg1; + arg2 = ringp->s_ring_upcall_arg2; + + mutex_exit(&ringp->s_ring_lock); + (*proc)(arg1, arg2, mp_chain, -1); + + ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); + mutex_enter(&ringp->s_ring_lock); + ringp->s_ring_run = NULL; + ringp->s_ring_state &= ~S_RING_PROC; + if (ringp->s_ring_first == NULL) { + /* + * We processed inline our packet and + * nothing new has arrived. We are done. 
+ */ + mutex_exit(&ringp->s_ring_lock); + return; + } + } else { + SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt); + } + + /* + * We are here because either we couldn't do inline + * processing (because something was already queued), + * or we had a chanin of more than one packet, + * or something else arrived after we were done with + * inline processing. + */ + ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); + ASSERT(ringp->s_ring_first != NULL); + + + soft_ring_drain(ringp, -1); + mutex_exit(&ringp->s_ring_lock); + return; + } else { + /* + * Queue is already being processed. Just enqueue + * the packet and go away. + */ + if (ringp->s_ring_count > soft_ring_max_q_cnt) { + freemsgchain(mp_chain); + DLS_BUMP_STAT(dlss_soft_ring_pkt_drop, cnt); + } else + SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt); + if (!(ringp->s_ring_state & S_RING_PROC)) { + SOFT_RING_WORKER_WAKEUP(ringp); + } else { + ASSERT(ringp->s_ring_run != NULL); + mutex_exit(&ringp->s_ring_lock); + } + return; + } +} + +/* + * PRIVATE FUNCTIONS + */ + +static void +soft_ring_fire(void *arg) +{ + soft_ring_t *ringp = arg; + + mutex_enter(&ringp->s_ring_lock); + if (ringp->s_ring_tid == 0) { + mutex_exit(&ringp->s_ring_lock); + return; + } + + ringp->s_ring_tid = 0; + + if (!(ringp->s_ring_state & S_RING_PROC)) { + cv_signal(&ringp->s_ring_async); + } + mutex_exit(&ringp->s_ring_lock); +} + +/* ARGSUSED */ +static void +soft_ring_drain(soft_ring_t *ringp, clock_t expire) +{ + mblk_t *mp; + s_ring_proc_t proc; + void *arg1, *arg2; + timeout_id_t tid; + + ringp->s_ring_run = curthread; + ASSERT(mutex_owned(&ringp->s_ring_lock)); + ASSERT(!(ringp->s_ring_state & S_RING_PROC)); + + if ((tid = ringp->s_ring_tid) != 0) + ringp->s_ring_tid = 0; + + ringp->s_ring_state |= S_RING_PROC; + + + proc = ringp->s_ring_upcall; + arg1 = ringp->s_ring_upcall_arg1; + arg2 = ringp->s_ring_upcall_arg2; + + while (ringp->s_ring_first != NULL) { + mp = ringp->s_ring_first; + ringp->s_ring_first = NULL; + ringp->s_ring_last = NULL; + ringp->s_ring_count = 0; + mutex_exit(&ringp->s_ring_lock); + + if (tid != 0) { + (void) untimeout(tid); + tid = 0; + } + + (*proc)(arg1, arg2, mp, -1); + + mutex_enter(&ringp->s_ring_lock); + } + + ringp->s_ring_state &= ~S_RING_PROC; + ringp->s_ring_run = NULL; +} + +static void +soft_ring_worker(soft_ring_t *ringp) +{ + kmutex_t *lock = &ringp->s_ring_lock; + kcondvar_t *async = &ringp->s_ring_async; + callb_cpr_t cprinfo; + + CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "soft_ring"); + mutex_enter(lock); + + for (;;) { + while (ringp->s_ring_first == NULL || + (ringp->s_ring_state & S_RING_PROC)) { + CALLB_CPR_SAFE_BEGIN(&cprinfo); + if (ringp->s_ring_state & S_RING_DESTROY) + goto destroy; +still_wait: + cv_wait(async, lock); + if (ringp->s_ring_state & S_RING_DESTROY) { +destroy: + if (ringp->s_ring_state & S_RING_DESTROY) { + ringp->s_ring_state |= S_RING_DEAD; + CALLB_CPR_EXIT(&cprinfo); + thread_exit(); + } + } + if (ringp->s_ring_state & S_RING_PROC) { + goto still_wait; + } + CALLB_CPR_SAFE_END(&cprinfo, lock); + } + soft_ring_drain(ringp, -1); + } +} + +void +dls_soft_ring_rx_set(dls_channel_t dc, dls_rx_t rx, void *arg, int type) +{ + dls_impl_t *dip = (dls_impl_t *)dc; + + rw_enter(&(dip->di_lock), RW_WRITER); + dip->di_soft_ring_fanout_type = type; + dip->di_rx = rx; + if (type == SOFT_RING_NONE) + dip->di_rx_arg = arg; + else + dip->di_rx_arg = (void *)dip; + rw_exit(&(dip->di_lock)); +} + +boolean_t +dls_soft_ring_workers(dls_channel_t dc) +{ + dls_impl_t *dip = (dls_impl_t *)dc; + boolean_t 
ret = B_FALSE; + + rw_enter(&(dip->di_lock), RW_WRITER); + if (dip->di_soft_ring_list != NULL) + ret = B_TRUE; + rw_exit(&(dip->di_lock)); + return (ret); +} + +void +dls_soft_ring_disable(dls_channel_t dc) +{ + dls_impl_t *dip = (dls_impl_t *)dc; + + rw_enter(&(dip->di_lock), RW_WRITER); + if (dip->di_soft_ring_list != NULL) + soft_ring_stop_workers(dip->di_soft_ring_list, + dip->di_soft_ring_size); + rw_exit(&(dip->di_lock)); +} + +boolean_t +dls_soft_ring_enable(dls_channel_t dc, dl_capab_dls_t *soft_ringp) +{ + dls_impl_t *dip; + int i; + soft_ring_t **softring_set; + soft_ring_t *softring; + mac_rx_fifo_t mrf; + char name[64]; + + dip = (dls_impl_t *)dc; + + rw_enter(&(dip->di_lock), RW_WRITER); + + if (dip->di_soft_ring_list != NULL) { + soft_ring_stop_workers(dip->di_soft_ring_list, + dip->di_soft_ring_size); + soft_ring_set_destroy(dip->di_soft_ring_list, + dip->di_soft_ring_size); + dip->di_soft_ring_list = NULL; + } + dip->di_soft_ring_size = 0; + + bzero(name, sizeof (name)); + (void) snprintf(name, sizeof (name), "dls_soft_ring_%p", dip); + dip->di_soft_ring_list = soft_ring_set_create(name, S_RING_BIND_NONE, + 0, S_RING_WORKER_ONLY, minclsyspri, soft_ringp->dls_ring_cnt); + + if (dip->di_soft_ring_list == NULL) { + rw_exit(&(dip->di_lock)); + return (B_FALSE); + } + + dip->di_soft_ring_size = soft_ringp->dls_ring_cnt; + softring_set = dip->di_soft_ring_list; + + dip->di_ring_add = (mac_resource_add_t)soft_ringp->dls_ring_add; + dip->di_rx = (dls_rx_t)soft_ringp->dls_ring_assign; + dip->di_rx_arg = (void *)soft_ringp->dls_rx_handle; + + bzero(&mrf, sizeof (mac_rx_fifo_t)); + mrf.mrf_type = MAC_RX_FIFO; + for (i = 0; i < soft_ringp->dls_ring_cnt; i++) { + softring = softring_set[i]; + mrf.mrf_arg = softring; + softring->s_ring_upcall_arg1 = + (void *)soft_ringp->dls_rx_handle; + softring->s_ring_upcall_arg2 = + dip->di_ring_add((void *)soft_ringp->dls_rx_handle, + (mac_resource_t *)&mrf); + softring->s_ring_upcall = + (s_ring_proc_t)soft_ringp->dls_rx; + } + + /* + * Note that soft_ring is enabled. This prevents further DLIOCHDRINFO + * ioctls from overwriting the receive function pointer. + */ + rw_exit(&(dip->di_lock)); + return (B_TRUE); +} + +#define COMPUTE_HASH(key, sz) (key % sz) + +/* ARGSUSED */ +void +dls_ether_soft_ring_fanout(void *rx_handle, void *rx_cookie, mblk_t *mp_chain, + size_t hdrlen) +{ + ipha_t *ipha = (ipha_t *)mp_chain->b_rptr; + dls_impl_t *dip = (dls_impl_t *)rx_handle; + int indx; + int key; + int hdr_len; + uint16_t port1, port2; + + switch (dip->di_soft_ring_fanout_type) { + case SOFT_RING_SRC_HASH: + /* + * We get a chain of packets from the same remote. Make + * sure the same remote goes to same ring. 
+ */ + hdr_len = IPH_HDR_LENGTH(ipha); + port1 = *((uint16_t *)(&mp_chain->b_rptr[hdr_len])); + port2 = *((uint16_t *)(&mp_chain->b_rptr[hdr_len+2])); + key = port1 + port2; + indx = COMPUTE_HASH(key, dip->di_soft_ring_size); + soft_ring_process(dip->di_soft_ring_list[indx], + mp_chain, 0); + break; + case SOFT_RING_RND_ROBIN: + case SOFT_RING_RANDOM: + /* + * Just send it to any possible soft ring + */ + soft_ring_process(dip->di_soft_ring_list[ + lbolt % dip->di_soft_ring_size], mp_chain, 0); + break; + } +} diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c index d0025c7fb9..3d58c49cbd 100644 --- a/usr/src/uts/common/io/dls/dls_stat.c +++ b/usr/src/uts/common/io/dls/dls_stat.c @@ -121,7 +121,7 @@ i_dls_stat_update(kstat_t *ksp, int rw) */ void -dls_stat_create(dls_vlan_t *dvp) +dls_mac_stat_create(dls_vlan_t *dvp) { dls_link_t *dlp = dvp->dv_dlp; char module[IFNAMSIZ]; @@ -172,7 +172,7 @@ done: } void -dls_stat_destroy(dls_vlan_t *dvp) +dls_mac_stat_destroy(dls_vlan_t *dvp) { kstat_delete(dvp->dv_ksp); dvp->dv_ksp = NULL; diff --git a/usr/src/uts/common/io/dls/dls_vlan.c b/usr/src/uts/common/io/dls/dls_vlan.c index 872dc29522..9e20730ea6 100644 --- a/usr/src/uts/common/io/dls/dls_vlan.c +++ b/usr/src/uts/common/io/dls/dls_vlan.c @@ -305,7 +305,7 @@ again: } if (dvp->dv_ref++ == 0) - dls_stat_create(dvp); + dls_mac_stat_create(dvp); *dvpp = dvp; done: @@ -334,7 +334,7 @@ dls_vlan_rele(dls_vlan_t *dvp) mac_stop(dlp->dl_mh); dls_mac_rele(dlp); if (--dvp->dv_ref == 0) { - dls_stat_destroy(dvp); + dls_mac_stat_destroy(dvp); /* * Tagged vlans get destroyed when dv_ref drops * to 0. We need to copy dv_name here because diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c index 288bb79298..53bcc5fc94 100644 --- a/usr/src/uts/common/io/mac/mac.c +++ b/usr/src/uts/common/io/mac/mac.c @@ -1297,7 +1297,10 @@ mac_resource_add(mac_t *mp, mac_resource_t *mrp) add = mip->mi_resource_add; arg = mip->mi_resource_add_arg; - mrh = add(arg, mrp); + if (add != NULL) + mrh = add(arg, mrp); + else + mrh = NULL; rw_exit(&mip->mi_resource_lock); return (mrh); diff --git a/usr/src/uts/common/os/space.c b/usr/src/uts/common/os/space.c index 071d12bc39..0a61cae511 100644 --- a/usr/src/uts/common/os/space.c +++ b/usr/src/uts/common/os/space.c @@ -20,7 +20,7 @@ * CDDL HEADER END */ /* - * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -371,3 +371,21 @@ space_free(char *key) #include <sys/crc32.h> const uint32_t crc32_table[256] = { CRC32_TABLE }; + + +/* + * We need to fanout load from NIC which can overwhelm a single + * CPU. A 10Gb NIC interrupting a single CPU is a good example. + * Instead of fanning out to random CPUs, it a big performance + * win if you can fanout to the threads on the same core (niagara) + * that is taking interrupts. + * + * We need a better mechanism to figure out the other threads on + * the same core or cores on the same chip which share caches etc. + * but for time being, this will suffice. 
+ */ +#define NUMBER_OF_THREADS_PER_CPU 4 +uint_t ip_threads_per_cpu = NUMBER_OF_THREADS_PER_CPU; + +/* Global flag to enable/disable soft ring facility */ +boolean_t ip_squeue_soft_ring = B_FALSE; diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h index cafc32433a..d7d869189d 100644 --- a/usr/src/uts/common/sys/dld_impl.h +++ b/usr/src/uts/common/sys/dld_impl.h @@ -163,6 +163,7 @@ struct dld_str { * IP polling is operational if this flag is set. */ boolean_t ds_polling; + boolean_t ds_soft_ring; /* * State of DLPI user: may be active (regular network layer), @@ -194,7 +195,9 @@ struct dld_str { */ kmutex_t ds_thr_lock; uint_t ds_thr; + taskqid_t ds_task_id; mblk_t *ds_detach_req; + mblk_t *ds_unbind_req; } dld_str; /* diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h index 1169d68d68..b6f3715289 100644 --- a/usr/src/uts/common/sys/dlpi.h +++ b/usr/src/uts/common/sys/dlpi.h @@ -577,7 +577,9 @@ union DL_qos_types { #define DL_CAPAB_ZEROCOPY 0x05 /* Zero-copy capability */ /* dl_data is dl_capab_zerocopy_t */ #define DL_CAPAB_POLL 0x06 /* Polling capability */ - /* dl_data is dl_capab_poll_t */ + /* dl_data is dl_capab_dls_t */ +#define DL_CAPAB_SOFT_RING 0x07 /* Soft ring capable */ + /* dl_data is dl_capab_dls_t */ typedef struct { t_uscalar_t dl_cap; /* capability type */ @@ -696,37 +698,57 @@ typedef struct { #ifdef _KERNEL /* - * This defines the DL_CAPAB_POLL capability. Currently it provides a - * mechanism for IP to exchange function pointers with a gldv3-based driver - * to enable streams-bypassing data-paths and interrupt blanking. True polling - * support will be added in the future. + * This structure is used by DL_CAPAB_POLL and DL_CAPAB_SOFT_RING + * capabilities. It provides a mechanism for IP to exchange function + * pointers with a gldv3-based driver to enable it to bypass streams- + * data-paths. DL_CAPAB_POLL mechanism provides a way to blank + * interrupts. Note: True polling support will be added in the future. + * DL_CAPAB_SOFT_RING provides a mechanism to create soft ring at the + * dls layer. 
*/ -typedef struct dl_capab_poll_s { - t_uscalar_t poll_version; - t_uscalar_t poll_flags; +typedef struct dl_capab_dls_s { + t_uscalar_t dls_version; + t_uscalar_t dls_flags; /* DLD provided information */ - uintptr_t poll_tx_handle; - uintptr_t poll_tx; + uintptr_t dls_tx_handle; + uintptr_t dls_tx; + uintptr_t dls_ring_change_status; + uintptr_t dls_ring_bind; + uintptr_t dls_ring_unbind; /* IP provided information */ - uintptr_t poll_rx_handle; - uintptr_t poll_rx; - uintptr_t poll_ring_add; + uintptr_t dls_rx_handle; + uintptr_t dls_ring_assign; + uintptr_t dls_rx; + uintptr_t dls_ring_add; + t_uscalar_t dls_ring_cnt; - dl_mid_t poll_mid; /* module ID */ -} dl_capab_poll_t; + dl_mid_t dls_mid; /* module ID */ +} dl_capab_dls_t; #define POLL_CURRENT_VERSION 0x01 #define POLL_VERSION_1 0x01 -/* - * Values for poll_flags - */ +#define SOFT_RING_VERSION_1 0x01 + +/* Values for poll_flags */ #define POLL_ENABLE 0x01 /* Set to enable polling */ /* capability */ #define POLL_CAPABLE 0x02 /* Polling ability exists */ -#define POLL_DISABLE 0x04 /* Disable Polling */ +#define POLL_DISABLE 0x03 /* Disable Polling */ + +/* Values for soft_ring_flags */ +#define SOFT_RING_ENABLE 0x04 /* Set to enable soft_ring */ + /* capability */ +#define SOFT_RING_CAPABLE 0x05 /* Soft_Ring ability exists */ +#define SOFT_RING_DISABLE 0x06 /* Disable Soft_Ring */ + +/* Soft_Ring fanout types (used by soft_ring_change_status) */ +#define SOFT_RING_NONE 0x00 +#define SOFT_RING_RANDOM 0x01 +#define SOFT_RING_SRC_HASH 0x02 +#define SOFT_RING_RND_ROBIN 0x03 #endif /* _KERNEL */ diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h index 69f9b4b3dc..cf26f5cb21 100644 --- a/usr/src/uts/common/sys/dls_impl.h +++ b/usr/src/uts/common/sys/dls_impl.h @@ -35,6 +35,8 @@ #include <sys/modhash.h> #include <sys/kstat.h> #include <net/if.h> +#include <sys/dlpi.h> +#include <sys/dls_soft_ring.h> #ifdef __cplusplus extern "C" { @@ -99,6 +101,7 @@ struct dls_impl_s { dls_multicst_addr_t *di_dmap; dls_rx_t di_rx; void *di_rx_arg; + mac_resource_add_t di_ring_add; const mac_txinfo_t *di_txinfo; boolean_t di_bound; boolean_t di_removing; @@ -106,6 +109,9 @@ struct dls_impl_s { uint8_t di_unicst_addr[MAXADDRLEN]; dls_priv_header_t di_header; dls_priv_header_info_t di_header_info; + soft_ring_t **di_soft_ring_list; + uint_t di_soft_ring_size; + int di_soft_ring_fanout_type; }; struct dls_head_s { @@ -123,8 +129,8 @@ extern void dls_link_remove(dls_link_t *, dls_impl_t *); extern int dls_mac_hold(dls_link_t *); extern void dls_mac_rele(dls_link_t *); -extern void dls_stat_create(dls_vlan_t *); -extern void dls_stat_destroy(dls_vlan_t *); +extern void dls_mac_stat_create(dls_vlan_t *); +extern void dls_mac_stat_destroy(dls_vlan_t *); extern void dls_vlan_init(void); extern int dls_vlan_fini(void); diff --git a/usr/src/uts/common/sys/dls_soft_ring.h b/usr/src/uts/common/sys/dls_soft_ring.h new file mode 100644 index 0000000000..96a5e7ccc4 --- /dev/null +++ b/usr/src/uts/common/sys/dls_soft_ring.h @@ -0,0 +1,99 @@ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_DLS_SOFT_RING_H +#define _SYS_DLS_SOFT_RING_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/processor.h> +#include <sys/stream.h> +#include <sys/squeue.h> + +#define S_RING_NAMELEN 64 + +typedef void (*s_ring_proc_t)(void *, void *, mblk_t *, size_t); + +typedef struct soft_ring_s { + /* Keep the most used members 64bytes cache aligned */ + kmutex_t s_ring_lock; /* lock before using any member */ + uint16_t s_ring_type; /* processing model of the sq */ + uint16_t s_ring_state; /* state flags and message count */ + int s_ring_count; /* # of mblocks in soft_ring */ + mblk_t *s_ring_first; /* first mblk chain or NULL */ + mblk_t *s_ring_last; /* last mblk chain or NULL */ + s_ring_proc_t s_ring_upcall; /* Upcall func pointer */ + void *s_ring_upcall_arg1; /* upcall argument 1 */ + void *s_ring_upcall_arg2; /* upcall argument 2 */ + clock_t s_ring_awaken; /* time async thread was awakened */ + + kthread_t *s_ring_run; /* Current thread processing sq */ + processorid_t s_ring_bind; /* processor to bind to */ + kcondvar_t s_ring_async; /* async thread blocks on */ + clock_t s_ring_wait; /* lbolts to wait after a fill() */ + timeout_id_t s_ring_tid; /* timer id of pending timeout() */ + kthread_t *s_ring_worker; /* kernel thread id */ + char s_ring_name[S_RING_NAMELEN + 1]; + uint32_t s_ring_total_inpkt; +} soft_ring_t; + + +/* + * type flags - combination allowed to process and drain the queue + */ +#define S_RING_WORKER_ONLY 0x0001 /* Worker thread only */ +#define S_RING_ANY 0x0002 /* Any thread can process the queue */ + +/* + * State flags. + */ +#define S_RING_PROC 0x0001 /* being processed */ +#define S_RING_WORKER 0x0002 /* worker thread */ +#define S_RING_BOUND 0x0004 /* Worker thread is bound */ +#define S_RING_DESTROY 0x0008 /* Ring is being destroyed */ +#define S_RING_DEAD 0x0010 /* Worker thread is no more */ + +/* + * arguments for processors to bind to + */ +#define S_RING_BIND_NONE -1 + +/* + * Structure for dls statistics + */ +struct dls_kstats { + kstat_named_t dlss_soft_ring_pkt_drop; +}; + +extern struct dls_kstats dls_kstat; + +#define DLS_BUMP_STAT(x, y) (dls_kstat.x.value.ui32 += y) + +extern void soft_ring_init(void); +extern soft_ring_t *soft_ring_create(char *, processorid_t, clock_t, + uint_t, pri_t); +extern soft_ring_t **soft_ring_set_create(char *, processorid_t, clock_t, + uint_t, pri_t, int); +extern void soft_ring_set_destroy(soft_ring_t **, int); +extern void soft_ring_process(soft_ring_t *, mblk_t *, uint8_t); +extern void soft_ring_bind(void *, processorid_t); +extern void soft_ring_unbind(void *); +extern void dls_ether_soft_ring_fanout(void *, + void *, mblk_t *, size_t); +extern boolean_t dls_soft_ring_enable(dls_channel_t, dl_capab_dls_t *); +extern void dls_soft_ring_disable(dls_channel_t); +extern boolean_t dls_soft_ring_workers(dls_channel_t); +extern void dls_soft_ring_rx_set(dls_channel_t, dls_rx_t, void *, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DLS_SOFT_RING_H */ diff --git a/usr/src/uts/sun4v/os/mach_startup.c b/usr/src/uts/sun4v/os/mach_startup.c index 179a830d60..986a2fa488 100644 --- a/usr/src/uts/sun4v/os/mach_startup.c +++ b/usr/src/uts/sun4v/os/mach_startup.c @@ -289,6 +289,21 @@ mach_hw_copy_limit(void) } /* + * We need to enable soft ring functionality on Niagara platform since + * one strand can't handle interrupts for a 1Gb NIC. Set the tunable + * ip_squeue_soft_ring by default on this platform. 
We can also set
+ * ip_threads_per_cpu to track the number of threads per core. The variables
+ * themselves are defined in space.c and used by the IP module.
+ */
+extern uint_t ip_threads_per_cpu;
+extern boolean_t ip_squeue_soft_ring;
+void
+startup_platform(void)
+{
+	ip_squeue_soft_ring = B_TRUE;
+}
+
+/*
  * This function sets up hypervisor traptrace buffer
  * This routine is called by the boot cpu only
  */
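
A few plain user-level C sketches may help when reading the new code above; they are illustrative analogues built on assumptions noted in each one, not the kernel implementation itself. The first concerns proto_capability_advertise(): a sub-capability is just a dl_capability_sub_t header followed by its payload, appended into the DL_CAPABILITY_ACK, and for DL_CAPAB_SOFT_RING the payload is a dl_capab_dls_t carrying the version, flags and the dld-side transmit entry points. The sketch uses simplified stand-in structures and hypothetical names (cap_sub_t, cap_dls_t, append_soft_ring_cap); the real dl_capab_dls_t also carries the ring change/bind/unbind pointers, the IP-side fields and the dls_mid module id.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Simplified stand-ins for dl_capability_sub_t and dl_capab_dls_t. */
typedef struct {
	uint32_t	dl_cap;		/* capability type */
	uint32_t	dl_length;	/* length of the payload that follows */
} cap_sub_t;

typedef struct {
	uint32_t	dls_version;
	uint32_t	dls_flags;
	uintptr_t	dls_tx_handle;	/* dld-side transmit cookie */
	uintptr_t	dls_tx;		/* dld-side transmit entry point */
} cap_dls_t;

#define	CAP_SOFT_RING		0x07	/* DL_CAPAB_SOFT_RING */
#define	SOFT_RING_VERSION_1	0x01
#define	SOFT_RING_CAPABLE	0x05

/* Append one sub-capability (header followed by payload) at *pp. */
static void
append_soft_ring_cap(uint8_t **pp, void *tx_handle, void (*tx)(void *))
{
	cap_sub_t sub = { CAP_SOFT_RING, sizeof (cap_dls_t) };
	cap_dls_t dls = { SOFT_RING_VERSION_1, SOFT_RING_CAPABLE,
	    (uintptr_t)tx_handle, (uintptr_t)tx };

	memcpy(*pp, &sub, sizeof (sub));
	*pp += sizeof (sub);
	memcpy(*pp, &dls, sizeof (dls));
	*pp += sizeof (dls);
}

static void
dummy_tx(void *arg)
{
	(void) arg;
}

int
main(void)
{
	uint8_t buf[128];
	uint8_t *ptr = buf;

	append_soft_ring_cap(&ptr, buf, dummy_tx);
	printf("advertised %zu bytes of capability data\n",
	    (size_t)(ptr - buf));
	return (0);
}

Roughly speaking, IP parses these sub-capabilities out of the ack and, to turn the facility on, sends its own dl_capab_dls_t back with the enable flag and the IP-side handles filled in.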
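
proto_change_soft_ring_fanout() and dls_soft_ring_rx_set() above switch the channel's receive path by swapping a function pointer and its argument: with fanout off the stream's fastpath or unitdata receive routine is restored, and with fanout on dls_ether_soft_ring_fanout becomes the receive function with the dls_impl_t itself as the argument. A minimal sketch of that switch, with hypothetical names (chan_t, chan_rx_set, rx_direct, rx_fanout):

#include <stdio.h>

typedef struct msg {
	const char	*m_data;
} msg_t;

typedef void (*rx_fn_t)(void *, msg_t *);

typedef struct chan {
	rx_fn_t		ch_rx;		/* current receive entry point */
	void		*ch_rx_arg;	/* argument handed to ch_rx */
} chan_t;

static void
rx_direct(void *arg, msg_t *mp)
{
	printf("direct delivery of \"%s\" to %s\n", mp->m_data, (char *)arg);
}

static void
rx_fanout(void *arg, msg_t *mp)
{
	chan_t *ch = arg;

	printf("fanout path queues \"%s\" (channel %p)\n", mp->m_data,
	    (void *)ch);
}

/* Switch the receive path, as dls_soft_ring_rx_set() does for a channel. */
static void
chan_rx_set(chan_t *ch, rx_fn_t rx, void *arg, int fanout)
{
	ch->ch_rx = rx;
	ch->ch_rx_arg = fanout ? ch : arg;
}

int
main(void)
{
	static char upper[] = "upper layer";
	chan_t ch;
	msg_t m = { "hello" };

	chan_rx_set(&ch, rx_direct, upper, 0);
	ch.ch_rx(ch.ch_rx_arg, &m);

	chan_rx_set(&ch, rx_fanout, NULL, 1);
	ch.ch_rx(ch.ch_rx_arg, &m);
	return (0);
}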
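
The core of dls_soft_ring.c is a single worker thread per ring draining a packet queue: soft_ring_process() appends the incoming mblk chain under s_ring_lock and wakes the worker through s_ring_async, and the worker hands the chain to the squeue upcall with the lock dropped. Below is a user-space analogue of that producer/worker handoff using POSIX threads; the names are made up, and the kernel version's extras are omitted, notably the inline fast path for a single packet, the timeout-coalesced wakeup in SOFT_RING_WORKER_WAKEUP, the drop-and-count path once soft_ring_max_q_cnt packets are queued, and CPU binding.

#include <pthread.h>
#include <stdlib.h>
#include <stdio.h>

typedef struct pkt {
	struct pkt	*p_next;
	int		p_id;
} pkt_t;

typedef struct soft_ring {
	pthread_mutex_t	sr_lock;
	pthread_cond_t	sr_async;	/* worker waits here for work */
	pkt_t		*sr_first;	/* head of the queued chain */
	pkt_t		*sr_last;	/* tail of the queued chain */
	int		sr_destroy;	/* ask the worker to exit */
	void		(*sr_upcall)(pkt_t *);	/* consumer, e.g. an squeue */
} soft_ring_t;

/* Append a packet chain and wake the worker, like soft_ring_process(). */
static void
ring_enqueue(soft_ring_t *sr, pkt_t *chain)
{
	pkt_t *tail = chain;

	while (tail->p_next != NULL)
		tail = tail->p_next;

	pthread_mutex_lock(&sr->sr_lock);
	if (sr->sr_last != NULL)
		sr->sr_last->p_next = chain;
	else
		sr->sr_first = chain;
	sr->sr_last = tail;
	pthread_cond_signal(&sr->sr_async);
	pthread_mutex_unlock(&sr->sr_lock);
}

/* Worker: take the whole chain, run the upcall on each packet unlocked. */
static void *
ring_worker(void *arg)
{
	soft_ring_t *sr = arg;

	pthread_mutex_lock(&sr->sr_lock);
	for (;;) {
		while (sr->sr_first == NULL) {
			if (sr->sr_destroy) {
				pthread_mutex_unlock(&sr->sr_lock);
				return (NULL);
			}
			pthread_cond_wait(&sr->sr_async, &sr->sr_lock);
		}
		pkt_t *chain = sr->sr_first;
		sr->sr_first = sr->sr_last = NULL;
		pthread_mutex_unlock(&sr->sr_lock);

		while (chain != NULL) {
			pkt_t *next = chain->p_next;
			chain->p_next = NULL;
			sr->sr_upcall(chain);
			chain = next;
		}
		pthread_mutex_lock(&sr->sr_lock);
	}
}

static void
consume(pkt_t *p)
{
	printf("drained packet %d\n", p->p_id);
	free(p);
}

int
main(void)
{
	soft_ring_t sr = { .sr_upcall = consume };
	pthread_t worker;
	int i;

	pthread_mutex_init(&sr.sr_lock, NULL);
	pthread_cond_init(&sr.sr_async, NULL);
	pthread_create(&worker, NULL, ring_worker, &sr);

	for (i = 0; i < 8; i++) {
		pkt_t *p = calloc(1, sizeof (*p));
		p->p_id = i;
		ring_enqueue(&sr, p);
	}

	pthread_mutex_lock(&sr.sr_lock);
	sr.sr_destroy = 1;		/* worker exits once the queue drains */
	pthread_cond_signal(&sr.sr_async);
	pthread_mutex_unlock(&sr.sr_lock);
	pthread_join(worker, NULL);
	return (0);
}

Build with -lpthread. The teardown mirrors soft_ring_stop_workers(): set the destroy flag, signal the worker, and join it before freeing anything still queued.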
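
dls_ether_soft_ring_fanout() selects a ring for SOFT_RING_SRC_HASH by summing the two 16-bit words that follow the IP header (the TCP/UDP source and destination ports when the header carries no options) and reducing the sum modulo the ring count, so every packet of a given connection lands on the same ring and keeps its squeue affinity; SOFT_RING_RANDOM and SOFT_RING_RND_ROBIN instead spread chains by lbolt modulo the ring count. The kernel code hashes the raw network-order halfwords, which is fine because only consistency matters; the stand-alone sketch below converts them to host order purely so the ports are recognizable, and assumes an IPv4 header followed directly by TCP or UDP.

#include <stdint.h>
#include <stdio.h>

/*
 * Pick a soft ring index from the L4 ports of an IPv4 packet.  hdr points
 * at the IP header; the two 16-bit words that follow it are the TCP/UDP
 * source and destination ports.
 */
static unsigned int
fanout_index(const uint8_t *hdr, unsigned int nrings)
{
	unsigned int ihl = (hdr[0] & 0x0f) * 4;	/* IP header length, bytes */
	uint16_t sport = (uint16_t)((hdr[ihl] << 8) | hdr[ihl + 1]);
	uint16_t dport = (uint16_t)((hdr[ihl + 2] << 8) | hdr[ihl + 3]);

	return ((sport + dport) % nrings);
}

int
main(void)
{
	/* 20-byte IPv4 header (version 4, IHL 5), then TCP ports 80 -> 50000 */
	uint8_t pkt[24] = { 0x45 };

	pkt[20] = 80 >> 8;    pkt[21] = 80 & 0xff;
	pkt[22] = 50000 >> 8; pkt[23] = 50000 & 0xff;

	printf("packet hashes to ring %u of 4\n", fanout_index(pkt, 4));
	return (0);
}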
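
soft_ring_bind() pins a ring's worker thread to a CPU with thread_affinity_set() after taking cpu_lock and checking cpu_is_online(), and soft_ring_unbind() clears the binding; this is how IP keeps a connection's soft ring on the same core as its squeue. There is no user-level thread_affinity_set(), but the Solaris processor_bind(2) call expresses the same idea; the sketch below binds and then unbinds the calling LWP and is only an analogue of the kernel behaviour, not the code above.

#include <sys/types.h>
#include <sys/processor.h>
#include <sys/procset.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

/*
 * Bind the calling LWP to a processor; the user-level counterpart of what
 * soft_ring_bind() does to the ring's worker thread in the kernel.
 */
static int
bind_self(processorid_t cpu)
{
	if (processor_bind(P_LWPID, P_MYID, cpu, NULL) != 0) {
		fprintf(stderr, "bind to cpu %d failed: %s\n",
		    (int)cpu, strerror(errno));
		return (-1);
	}
	return (0);
}

int
main(void)
{
	if (bind_self(0) != 0)
		return (1);
	/* PBIND_NONE clears the binding, like soft_ring_unbind(). */
	(void) processor_bind(P_LWPID, P_MYID, PBIND_NONE, NULL);
	return (0);
}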
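
Finally, the space.c and mach_startup.c pieces only introduce policy knobs: sun4v's startup_platform() turns ip_squeue_soft_ring on because a single Niagara strand cannot keep up with a 1Gb NIC's interrupts, and ip_threads_per_cpu records how many hardware strands share a core so the fanout can target them. The actual sizing logic lives in ip_squeue.c, which is part of this change but not shown in this hunk; the toy function below only illustrates the stated intent of the two tunables and is an assumption, not the real policy.

#include <stdio.h>

/* Tunables, defaulted as in space.c and sun4v startup_platform(). */
static unsigned int ip_threads_per_cpu = 4;	/* strands per core on Niagara */
static int ip_squeue_soft_ring = 1;		/* B_TRUE on sun4v */

/*
 * Illustrative only: pick how many soft rings to ask dls for.  The real
 * policy lives in ip_squeue.c (not shown in this hunk); the intent stated
 * in space.c is one ring per hardware thread on the core taking the
 * NIC's interrupts.
 */
static unsigned int
soft_ring_fanout_cnt(void)
{
	if (!ip_squeue_soft_ring)
		return (1);
	return (ip_threads_per_cpu);
}

int
main(void)
{
	printf("fanning out to %u soft rings\n", soft_ring_fanout_cnt());
	return (0);
}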
