author     krgopi <none@none>  2005-12-23 10:29:12 -0800
committer  krgopi <none@none>  2005-12-23 10:29:12 -0800
commit     4b46d1ef625bf17cc3dd4b14b9ad807be97dc558 (patch)
tree       f7a47bdc523d89874ab78528527c14b8bd0aaef9 /usr/src
parent     5805a1baa6594684bbf7d7fa108cea093396ea31 (diff)
PSARC 2005/654 Nemo soft rings
6306717 For Nemo based drivers, IP can ask dls to do the fanout
Diffstat (limited to 'usr/src')
-rw-r--r--  usr/src/uts/common/Makefile.files          |   2
-rw-r--r--  usr/src/uts/common/inet/ip.h               |  47
-rw-r--r--  usr/src/uts/common/inet/ip/ip.c            |  11
-rw-r--r--  usr/src/uts/common/inet/ip/ip_if.c         | 357
-rw-r--r--  usr/src/uts/common/inet/ip/ip_squeue.c     | 437
-rw-r--r--  usr/src/uts/common/inet/ip_impl.h          |  19
-rw-r--r--  usr/src/uts/common/inet/tcp/tcp.c          |   4
-rw-r--r--  usr/src/uts/common/inet/udp/udp.c          |   4
-rw-r--r--  usr/src/uts/common/io/dld/dld_proto.c      | 256
-rw-r--r--  usr/src/uts/common/io/dld/dld_str.c        |   7
-rw-r--r--  usr/src/uts/common/io/dls/dls.c            |  40
-rw-r--r--  usr/src/uts/common/io/dls/dls_soft_ring.c  | 667
-rw-r--r--  usr/src/uts/common/io/dls/dls_stat.c       |   4
-rw-r--r--  usr/src/uts/common/io/dls/dls_vlan.c       |   4
-rw-r--r--  usr/src/uts/common/io/mac/mac.c            |   5
-rw-r--r--  usr/src/uts/common/os/space.c              |  20
-rw-r--r--  usr/src/uts/common/sys/dld_impl.h          |   3
-rw-r--r--  usr/src/uts/common/sys/dlpi.h              |  60
-rw-r--r--  usr/src/uts/common/sys/dls_impl.h          |  10
-rw-r--r--  usr/src/uts/common/sys/dls_soft_ring.h     |  99
-rw-r--r--  usr/src/uts/sun4v/os/mach_startup.c        |  15
21 files changed, 1779 insertions, 292 deletions
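Taken as a whole, the change lets dls fan inbound traffic from a Nemo (GLDv3) driver out to several kernel worker threads ("soft rings") instead of delivering every packet to IP on the interrupting CPU. The fragment below is only a sketch of hash-based fanout under assumed names (soft_ring_s, fanout_enqueue); it is not code from this commit.

/* Hypothetical sketch: fan a packet out to one of 'cnt' soft rings. */
typedef struct soft_ring_s {
	kmutex_t	sr_lock;	/* protects the queued chain */
	mblk_t		*sr_head;	/* packets waiting for the worker */
	mblk_t		*sr_tail;
	kcondvar_t	sr_async;	/* the ring's worker sleeps on this */
} soft_ring_t;

static void
fanout_enqueue(soft_ring_t *tbl, uint_t cnt, mblk_t *mp, uint_t flow_hash)
{
	/* the same flow always maps to the same ring, preserving ordering */
	soft_ring_t *ringp = &tbl[flow_hash % cnt];

	mutex_enter(&ringp->sr_lock);
	if (ringp->sr_tail == NULL)
		ringp->sr_head = mp;
	else
		ringp->sr_tail->b_next = mp;
	ringp->sr_tail = mp;
	cv_signal(&ringp->sr_async);	/* wake the worker thread */
	mutex_exit(&ringp->sr_lock);
}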
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 7be9d00998..9d21d041b5 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -494,7 +494,7 @@ CN_OBJS += cons.o
DLD_OBJS += dld_drv.o dld_proto.o dld_str.o
-DLS_OBJS += dls.o dls_link.o dls_mod.o dls_stat.o dls_vlan.o
+DLS_OBJS += dls.o dls_link.o dls_mod.o dls_stat.o dls_vlan.o dls_soft_ring.o
GLD_OBJS += gld.o gldutil.o
diff --git a/usr/src/uts/common/inet/ip.h b/usr/src/uts/common/inet/ip.h
index f286253080..358e67d354 100644
--- a/usr/src/uts/common/inet/ip.h
+++ b/usr/src/uts/common/inet/ip.h
@@ -1593,6 +1593,7 @@ extern ill_g_head_t ill_g_heads[]; /* ILL List Head */
#define ILL_CAPAB_HCKSUM 0x08 /* Hardware checksumming */
#define ILL_CAPAB_ZEROCOPY 0x10 /* Zero-copy */
#define ILL_CAPAB_POLL 0x20 /* Polling Toggle */
+#define ILL_CAPAB_SOFT_RING 0x40 /* Soft_Ring capability */
/*
* Per-ill Multidata Transmit capabilities.
@@ -1615,9 +1616,9 @@ typedef struct ill_hcksum_capab_s ill_hcksum_capab_t;
typedef struct ill_zerocopy_capab_s ill_zerocopy_capab_t;
/*
- * Per-ill Polling capbilities.
+ * Per-ill Polling/soft ring capabilities.
*/
-typedef struct ill_poll_capab_s ill_poll_capab_t;
+typedef struct ill_dls_capab_s ill_dls_capab_t;
/*
* Per-ill polling resource map.
@@ -1629,6 +1630,7 @@ typedef struct ill_rx_ring ill_rx_ring_t;
#define ILL_CONDEMNED 0x02 /* No more new ref's to the ILL */
#define ILL_CHANGING 0x04 /* ILL not globally visible */
#define ILL_DL_UNBIND_DONE 0x08 /* UNBIND_REQ has been Acked */
+#define ILL_SOFT_RING_ASSIGN 0x10 /* Making soft ring assignment */
/* Is this an ILL whose source address is used by other ILL's ? */
#define IS_USESRC_ILL(ill) \
@@ -1775,7 +1777,7 @@ typedef struct ill_s {
ill_ipsec_capab_t *ill_ipsec_capab_esp; /* IPsec ESP capabilities */
ill_hcksum_capab_t *ill_hcksum_capab; /* H/W cksumming capabilities */
ill_zerocopy_capab_t *ill_zerocopy_capab; /* Zero-copy capabilities */
- ill_poll_capab_t *ill_poll_capab; /* Polling capabilities */
+ ill_dls_capab_t *ill_dls_capab; /* Polling, soft ring capabilities */
/*
* New fields for IPv6
@@ -2962,11 +2964,16 @@ struct ill_zerocopy_capab_s {
#define ILL_POLLING 0x01 /* Polling in use */
/*
- * This function pointer type is exported by the mac layer.
- * we need to duplicate the definition here because we cannot
- * include mac.h in this file.
+ * These function pointer types are exported by the mac/dls layer.
+ * We need to duplicate the definitions here because we cannot
+ * include the mac/dls header files here.
*/
typedef void (*ip_mac_blank_t)(void *, time_t, uint_t);
+typedef void (*ip_dld_tx_t)(void *, mblk_t *);
+
+typedef void (*ip_dls_chg_soft_ring_t)(void *, int);
+typedef void (*ip_dls_bind_t)(void *, processorid_t);
+typedef void (*ip_dls_unbind_t)(void *);
struct ill_rx_ring {
ip_mac_blank_t rr_blank; /* Driver interrupt blanking func */
@@ -2984,15 +2991,15 @@ struct ill_rx_ring {
uint32_t rr_ring_state; /* State of this ring */
};
-/*
- * This is exported by dld and is meant to be invoked from a ULP.
- */
-typedef void (*ip_dld_tx_t)(void *, mblk_t *);
-
-struct ill_poll_capab_s {
- ip_dld_tx_t ill_tx; /* dld-supplied tx routine */
- void *ill_tx_handle; /* dld-supplied tx handle */
+struct ill_dls_capab_s {
+ ip_dld_tx_t ill_tx; /* Driver Tx routine */
+ void *ill_tx_handle; /* Driver Tx handle */
+ ip_dls_chg_soft_ring_t ill_dls_change_status;
+ /* change soft ring fanout */
+ ip_dls_bind_t ill_dls_bind; /* to add CPU affinity */
+ ip_dls_unbind_t ill_dls_unbind; /* remove CPU affinity */
ill_rx_ring_t *ill_ring_tbl; /* Ring to Sqp mapping table */
+ uint_t ill_dls_soft_ring_cnt; /* Number of soft rings */
conn_t *ill_unbind_conn; /* Conn used during unplumb */
};
@@ -3002,6 +3009,10 @@ struct ill_poll_capab_s {
extern int ip_squeue_profile;
extern int ip_squeue_bind;
extern boolean_t ip_squeue_fanout;
+extern boolean_t ip_squeue_soft_ring;
+extern uint_t ip_threads_per_cpu;
+extern uint_t ip_squeues_per_cpu;
+extern uint_t ip_soft_rings_cnt;
typedef struct squeue_set_s {
kmutex_t sqs_lock;
@@ -3012,10 +3023,8 @@ typedef struct squeue_set_s {
} squeue_set_t;
#define IP_SQUEUE_GET(hint) \
- (!ip_squeue_fanout ? \
- (CPU->cpu_squeue_set->sqs_list[hint % \
- CPU->cpu_squeue_set->sqs_size]) : \
- ip_squeue_random(hint))
+ ((!ip_squeue_fanout) ? (CPU->cpu_squeue_set->sqs_list[0]) : \
+ ip_squeue_random(hint))
typedef void (*squeue_func_t)(squeue_t *, mblk_t *, sqproc_t, void *, uint8_t);
@@ -3027,6 +3036,8 @@ extern int ip_squeue_bind_set(queue_t *, mblk_t *, char *, caddr_t, cred_t *);
extern int ip_squeue_bind_get(queue_t *, mblk_t *, caddr_t, cred_t *);
extern void ip_squeue_clean(void *, mblk_t *, void *);
extern void ip_resume_tcp_bind(void *, mblk_t *, void *);
+extern void ip_soft_ring_assignment(ill_t *, ill_rx_ring_t *,
+ mblk_t *, size_t);
extern void tcp_wput(queue_t *, mblk_t *);
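The ill_dls_capab_t added above holds the entry points that dld/dls hands to IP during capability negotiation (Tx routine, soft ring bind/unbind, fanout mode change). The sketch below shows how IP might drive those pointers once negotiation is complete; the function name and the loop are illustrative, while the fields and SOFT_RING_SRC_HASH are taken from this diff.

/* Illustrative only: bind each soft ring worker and switch fanout mode. */
static void
example_assign_soft_rings(ill_t *ill, processorid_t cpu_id)
{
	ill_dls_capab_t *dls = ill->ill_dls_capab;
	uint_t i;

	if (dls == NULL)
		return;			/* capability was never negotiated */

	for (i = 0; i < dls->ill_dls_soft_ring_cnt; i++) {
		ill_rx_ring_t *ring = &dls->ill_ring_tbl[i];

		/* give this ring's worker thread affinity to cpu_id */
		dls->ill_dls_bind(ring->rr_handle, cpu_id);
	}

	/* ask dls to start fanning out packets by source hash */
	dls->ill_dls_change_status(dls->ill_tx_handle, SOFT_RING_SRC_HASH);
}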
diff --git a/usr/src/uts/common/inet/ip/ip.c b/usr/src/uts/common/inet/ip/ip.c
index d587a746c5..33ac6bd126 100644
--- a/usr/src/uts/common/inet/ip/ip.c
+++ b/usr/src/uts/common/inet/ip/ip.c
@@ -743,7 +743,7 @@ extern int ip_squeue_profile_set(queue_t *, mblk_t *, char *, caddr_t,
cred_t *);
static int ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
-static int ip_fanout_set(queue_t *, mblk_t *, char *, caddr_t,
+static int ip_int_set(queue_t *, mblk_t *, char *, caddr_t,
cred_t *);
static squeue_func_t ip_squeue_switch(int);
@@ -941,10 +941,12 @@ static ipndp_t lcl_ndp_arr[] = {
(caddr_t)&ip_squeue_bind, "ip_squeue_bind" },
{ ip_param_generic_get, ip_input_proc_set,
(caddr_t)&ip_squeue_enter, "ip_squeue_enter" },
- { ip_param_generic_get, ip_fanout_set,
+ { ip_param_generic_get, ip_int_set,
(caddr_t)&ip_squeue_fanout, "ip_squeue_fanout" },
{ ip_cgtp_filter_get, ip_cgtp_filter_set, (caddr_t)&ip_cgtp_filter,
- "ip_cgtp_filter" }
+ "ip_cgtp_filter" },
+ { ip_param_generic_get, ip_int_set,
+ (caddr_t)&ip_soft_rings_cnt, "ip_soft_rings_cnt" }
};
/*
@@ -25996,7 +25998,7 @@ ip_input_proc_set(queue_t *q, mblk_t *mp, char *value,
/* ARGSUSED */
static int
-ip_fanout_set(queue_t *q, mblk_t *mp, char *value,
+ip_int_set(queue_t *q, mblk_t *mp, char *value,
caddr_t addr, cred_t *cr)
{
int *v = (int *)addr;
@@ -26009,7 +26011,6 @@ ip_fanout_set(queue_t *q, mblk_t *mp, char *value,
return (0);
}
-
static void
ip_kstat_init(void)
{
diff --git a/usr/src/uts/common/inet/ip/ip_if.c b/usr/src/uts/common/inet/ip/ip_if.c
index d04760f02c..adc05133fb 100644
--- a/usr/src/uts/common/inet/ip/ip_if.c
+++ b/usr/src/uts/common/inet/ip/ip_if.c
@@ -235,9 +235,10 @@ static void ill_capability_zerocopy_ack(ill_t *, mblk_t *,
dl_capability_sub_t *);
static void ill_capability_zerocopy_reset(ill_t *, mblk_t **);
-static void ill_capability_poll_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
+static void ill_capability_dls_ack(ill_t *, mblk_t *, dl_capability_sub_t *);
static mac_resource_handle_t ill_ring_add(void *, mac_resource_t *);
-static void ill_capability_poll_reset(ill_t *, mblk_t **);
+static void ill_capability_dls_reset(ill_t *, mblk_t **);
+static void ill_capability_dls_disable(ill_t *);
static void illgrp_cache_delete(ire_t *, char *);
static void illgrp_delete(ill_t *ill);
@@ -560,6 +561,16 @@ static phyint_list_t phyint_g_list; /* start of phyint list */
*/
static boolean_t ipmp_enable_failback = B_TRUE;
+/*
+ * Enable soft rings if ip_squeue_soft_ring or ip_squeue_fanout
+ * is set and ip_soft_rings_cnt > 0. ip_squeue_soft_ring is
+ * set through platform specific code (Niagara/Ontario).
+ */
+#define SOFT_RINGS_ENABLED() (ip_soft_rings_cnt ? \
+ (ip_squeue_soft_ring || ip_squeue_fanout) : B_FALSE)
+
+#define ILL_CAPAB_DLS (ILL_CAPAB_SOFT_RING | ILL_CAPAB_POLL)
+
static uint_t
ipif_rand(void)
{
@@ -770,7 +781,7 @@ ill_delete_tail(ill_t *ill)
* to this ill.
*/
mutex_enter(&ill->ill_lock);
- if (ill->ill_capabilities & ILL_CAPAB_POLL) {
+ if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) {
while (!(ill->ill_state_flags & ILL_DL_UNBIND_DONE))
cv_wait(&ill->ill_cv, &ill->ill_lock);
}
@@ -820,18 +831,18 @@ ill_delete_tail(ill_t *ill)
}
/*
- * Clean up polling capabilities
+ * Clean up polling and soft ring capabilities
*/
- if (ill->ill_capabilities & ILL_CAPAB_POLL)
- ipsq_clean_all(ill);
+ if (ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING))
+ ill_capability_dls_disable(ill);
- if (ill->ill_poll_capab != NULL) {
- CONN_DEC_REF(ill->ill_poll_capab->ill_unbind_conn);
- ill->ill_poll_capab->ill_unbind_conn = NULL;
- kmem_free(ill->ill_poll_capab,
- sizeof (ill_poll_capab_t) +
+ if (ill->ill_dls_capab != NULL) {
+ CONN_DEC_REF(ill->ill_dls_capab->ill_unbind_conn);
+ ill->ill_dls_capab->ill_unbind_conn = NULL;
+ kmem_free(ill->ill_dls_capab,
+ sizeof (ill_dls_capab_t) +
(sizeof (ill_rx_ring_t) * ILL_MAX_RINGS));
- ill->ill_poll_capab = NULL;
+ ill->ill_dls_capab = NULL;
}
ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL));
@@ -1801,7 +1812,7 @@ ill_capability_reset(ill_t *ill)
ill_capability_hcksum_reset(ill, &sc_mp);
ill_capability_zerocopy_reset(ill, &sc_mp);
ill_capability_ipsec_reset(ill, &sc_mp);
- ill_capability_poll_reset(ill, &sc_mp);
+ ill_capability_dls_reset(ill, &sc_mp);
/* Nothing to send down in order to disable the capabilities? */
if (sc_mp == NULL)
@@ -2627,7 +2638,12 @@ ill_capability_dispatch(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp,
ill_capability_zerocopy_ack(ill, mp, subp);
break;
case DL_CAPAB_POLL:
- ill_capability_poll_ack(ill, mp, subp);
+ if (!SOFT_RINGS_ENABLED())
+ ill_capability_dls_ack(ill, mp, subp);
+ break;
+ case DL_CAPAB_SOFT_RING:
+ if (SOFT_RINGS_ENABLED())
+ ill_capability_dls_ack(ill, mp, subp);
break;
default:
ip1dbg(("ill_capability_dispatch: unknown capab type %d\n",
@@ -2672,16 +2688,16 @@ ill_ring_add(void *arg, mac_resource_t *mrp)
ill_rx_ring_t *rx_ring;
int ip_rx_index;
+ ASSERT(mrp != NULL);
if (mrp->mr_type != MAC_RX_FIFO) {
return (NULL);
}
ASSERT(ill != NULL);
- ASSERT(ill->ill_poll_capab != NULL);
- ASSERT(mrp != NULL);
+ ASSERT(ill->ill_dls_capab != NULL);
mutex_enter(&ill->ill_lock);
for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
- rx_ring = &ill->ill_poll_capab->ill_ring_tbl[ip_rx_index];
+ rx_ring = &ill->ill_dls_capab->ill_ring_tbl[ip_rx_index];
ASSERT(rx_ring != NULL);
if (rx_ring->rr_ring_state == ILL_RING_FREE) {
@@ -2732,107 +2748,129 @@ ill_ring_add(void *arg, mac_resource_t *mrp)
}
static boolean_t
-ill_capability_poll_init(ill_t *ill)
+ill_capability_dls_init(ill_t *ill)
{
- ill_poll_capab_t *ill_poll = ill->ill_poll_capab;
+ ill_dls_capab_t *ill_dls = ill->ill_dls_capab;
conn_t *connp;
size_t sz;
- if (ill->ill_capabilities & ILL_CAPAB_POLL) {
- if (ill_poll == NULL) {
- cmn_err(CE_PANIC, "ill_capability_poll_init: "
- "polling enabled for ill=%s (%p) but data "
+ if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
+ if (ill_dls == NULL) {
+ cmn_err(CE_PANIC, "ill_capability_dls_init: "
+ "soft_ring enabled for ill=%s (%p) but data "
"structs uninitialized\n", ill->ill_name,
(void *)ill);
}
return (B_TRUE);
+ } else if (ill->ill_capabilities & ILL_CAPAB_POLL) {
+ if (ill_dls == NULL) {
+ cmn_err(CE_PANIC, "ill_capability_dls_init: "
+ "polling enabled for ill=%s (%p) but data "
+ "structs uninitialized\n", ill->ill_name,
+ (void *)ill);
+ }
+ return (B_TRUE);
}
- if (ill_poll != NULL) {
- ill_rx_ring_t *rx_ring = ill_poll->ill_ring_tbl;
- /* Polling is being re-enabled */
+ if (ill_dls != NULL) {
+ ill_rx_ring_t *rx_ring = ill_dls->ill_ring_tbl;
+ /* Soft_Ring or polling is being re-enabled */
- connp = ill_poll->ill_unbind_conn;
+ connp = ill_dls->ill_unbind_conn;
ASSERT(rx_ring != NULL);
- bzero((void *)ill_poll, sizeof (ill_poll_capab_t));
+ bzero((void *)ill_dls, sizeof (ill_dls_capab_t));
bzero((void *)rx_ring,
sizeof (ill_rx_ring_t) * ILL_MAX_RINGS);
- ill_poll->ill_ring_tbl = rx_ring;
- ill_poll->ill_unbind_conn = connp;
+ ill_dls->ill_ring_tbl = rx_ring;
+ ill_dls->ill_unbind_conn = connp;
return (B_TRUE);
}
if ((connp = ipcl_conn_create(IPCL_TCPCONN, KM_NOSLEEP)) == NULL)
return (B_FALSE);
- sz = sizeof (ill_poll_capab_t);
+ sz = sizeof (ill_dls_capab_t);
sz += sizeof (ill_rx_ring_t) * ILL_MAX_RINGS;
- ill_poll = kmem_zalloc(sz, KM_NOSLEEP);
- if (ill_poll == NULL) {
- cmn_err(CE_WARN, "ill_capability_poll_init: could not "
- "allocate poll_capab for %s (%p)\n", ill->ill_name,
+ ill_dls = kmem_zalloc(sz, KM_NOSLEEP);
+ if (ill_dls == NULL) {
+ cmn_err(CE_WARN, "ill_capability_dls_init: could not "
+ "allocate dls_capab for %s (%p)\n", ill->ill_name,
(void *)ill);
CONN_DEC_REF(connp);
return (B_FALSE);
}
/* Allocate space to hold ring table */
- ill_poll->ill_ring_tbl = (ill_rx_ring_t *)&ill_poll[1];
- ill->ill_poll_capab = ill_poll;
- ill_poll->ill_unbind_conn = connp;
+ ill_dls->ill_ring_tbl = (ill_rx_ring_t *)&ill_dls[1];
+ ill->ill_dls_capab = ill_dls;
+ ill_dls->ill_unbind_conn = connp;
return (B_TRUE);
}
/*
- * ill_capability_poll_disable: disable polling capability. Since
- * any of the rings might already be in use, need to call ipsq_clean_all()
- * which gets behind the squeue to disable direct calls if necessary.
- * Clean up the direct tx function pointers as well.
+ * ill_capability_dls_disable: disable soft_ring and/or polling
+ * capability. Since any of the rings might already be in use, need
+ * to call ipsq_clean_all() which gets behind the squeue to disable
+ * direct calls if necessary.
*/
static void
-ill_capability_poll_disable(ill_t *ill)
+ill_capability_dls_disable(ill_t *ill)
{
- ill_poll_capab_t *ill_poll = ill->ill_poll_capab;
+ ill_dls_capab_t *ill_dls = ill->ill_dls_capab;
- if (ill->ill_capabilities & ILL_CAPAB_POLL) {
+ if (ill->ill_capabilities & ILL_CAPAB_DLS) {
ipsq_clean_all(ill);
- ill_poll->ill_tx = NULL;
- ill_poll->ill_tx_handle = NULL;
+ ill_dls->ill_tx = NULL;
+ ill_dls->ill_tx_handle = NULL;
+ ill_dls->ill_dls_change_status = NULL;
+ ill_dls->ill_dls_bind = NULL;
+ ill_dls->ill_dls_unbind = NULL;
}
- ASSERT(!(ill->ill_capabilities & ILL_CAPAB_POLL));
+ ASSERT(!(ill->ill_capabilities & ILL_CAPAB_DLS));
}
static void
-ill_capability_poll_capable(ill_t *ill, dl_capab_poll_t *ipoll,
+ill_capability_dls_capable(ill_t *ill, dl_capab_dls_t *idls,
dl_capability_sub_t *isub)
{
uint_t size;
uchar_t *rptr;
- dl_capab_poll_t poll, *opoll;
- ill_poll_capab_t *ill_poll;
+ dl_capab_dls_t dls, *odls;
+ ill_dls_capab_t *ill_dls;
mblk_t *nmp = NULL;
dl_capability_req_t *ocap;
+ uint_t sub_dl_cap = isub->dl_cap;
- if (!ill_capability_poll_init(ill))
+ if (!ill_capability_dls_init(ill))
return;
- ill_poll = ill->ill_poll_capab;
+ ill_dls = ill->ill_dls_capab;
/* Copy locally to get the members aligned */
- bcopy((void *)ipoll, (void *)&poll, sizeof (dl_capab_poll_t));
+ bcopy((void *)idls, (void *)&dls,
+ sizeof (dl_capab_dls_t));
/* Get the tx function and handle from dld */
- ill_poll->ill_tx = (ip_dld_tx_t)poll.poll_tx;
- ill_poll->ill_tx_handle = (void *)poll.poll_tx_handle;
+ ill_dls->ill_tx = (ip_dld_tx_t)dls.dls_tx;
+ ill_dls->ill_tx_handle = (void *)dls.dls_tx_handle;
+
+ if (sub_dl_cap == DL_CAPAB_SOFT_RING) {
+ ill_dls->ill_dls_change_status =
+ (ip_dls_chg_soft_ring_t)dls.dls_ring_change_status;
+ ill_dls->ill_dls_bind = (ip_dls_bind_t)dls.dls_ring_bind;
+ ill_dls->ill_dls_unbind =
+ (ip_dls_unbind_t)dls.dls_ring_unbind;
+ ill_dls->ill_dls_soft_ring_cnt = ip_soft_rings_cnt;
+ }
size = sizeof (dl_capability_req_t) + sizeof (dl_capability_sub_t) +
isub->dl_length;
if ((nmp = ip_dlpi_alloc(size, DL_CAPABILITY_REQ)) == NULL) {
- cmn_err(CE_WARN, "ill_capability_poll_ack: could not allocate "
- "memory for CAPAB_REQ for %s (%p)\n", ill->ill_name,
- (void *)ill);
+ cmn_err(CE_WARN, "ill_capability_dls_capable: could "
+ "not allocate memory for CAPAB_REQ for %s (%p)\n",
+ ill->ill_name, (void *)ill);
return;
}
@@ -2847,46 +2885,93 @@ ill_capability_poll_capable(ill_t *ill, dl_capab_poll_t *ipoll,
bcopy(isub, rptr, sizeof (*isub));
rptr += sizeof (*isub);
- opoll = (dl_capab_poll_t *)rptr;
- rptr += sizeof (dl_capab_poll_t);
+ odls = (dl_capab_dls_t *)rptr;
+ rptr += sizeof (dl_capab_dls_t);
- /* initialize dl_capab_poll_t to be sent down */
- poll.poll_rx_handle = (uintptr_t)ill;
- poll.poll_rx = (uintptr_t)ip_input;
- poll.poll_ring_add = (uintptr_t)ill_ring_add;
- poll.poll_flags = POLL_ENABLE;
- bcopy((void *)&poll, (void *)opoll, sizeof (dl_capab_poll_t));
- ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
-
- ip1dbg(("ill_capability_poll_capable: asking interface %s "
- "to enable polling\n", ill->ill_name));
+ /* initialize dl_capab_dls_t to be sent down */
+ dls.dls_rx_handle = (uintptr_t)ill;
+ dls.dls_rx = (uintptr_t)ip_input;
+ dls.dls_ring_add = (uintptr_t)ill_ring_add;
- /* nmp points to a DL_CAPABILITY_REQ message to enable polling */
+ if (sub_dl_cap == DL_CAPAB_SOFT_RING) {
+ dls.dls_ring_cnt = ip_soft_rings_cnt;
+ dls.dls_ring_assign = (uintptr_t)ip_soft_ring_assignment;
+ dls.dls_flags = SOFT_RING_ENABLE;
+ } else {
+ dls.dls_flags = POLL_ENABLE;
+ ip1dbg(("ill_capability_dls_capable: asking interface %s "
+ "to enable polling\n", ill->ill_name));
+ }
+ bcopy((void *)&dls, (void *)odls,
+ sizeof (dl_capab_dls_t));
+ ASSERT(nmp->b_wptr == (nmp->b_rptr + size));
+ /*
+ * nmp points to a DL_CAPABILITY_REQ message to
+ * enable either soft_ring or polling
+ */
ill_dlpi_send(ill, nmp);
}
+static void
+ill_capability_dls_reset(ill_t *ill, mblk_t **sc_mp)
+{
+ mblk_t *mp;
+ dl_capab_dls_t *idls;
+ dl_capability_sub_t *dl_subcap;
+ int size;
+
+ if (!(ill->ill_capabilities & ILL_CAPAB_DLS))
+ return;
+
+ ASSERT(ill->ill_dls_capab != NULL);
+
+ size = sizeof (*dl_subcap) + sizeof (*idls);
+
+ mp = allocb(size, BPRI_HI);
+ if (mp == NULL) {
+ ip1dbg(("ill_capability_dls_reset: unable to allocate "
+ "request to disable soft_ring\n"));
+ return;
+ }
+
+ mp->b_wptr = mp->b_rptr + size;
+
+ dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
+ dl_subcap->dl_length = sizeof (*idls);
+ if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
+ dl_subcap->dl_cap = DL_CAPAB_SOFT_RING;
+ else
+ dl_subcap->dl_cap = DL_CAPAB_POLL;
+
+ idls = (dl_capab_dls_t *)(dl_subcap + 1);
+ if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING)
+ idls->dls_flags = SOFT_RING_DISABLE;
+ else
+ idls->dls_flags = POLL_DISABLE;
+
+ if (*sc_mp != NULL)
+ linkb(*sc_mp, mp);
+ else
+ *sc_mp = mp;
+}
/*
- * Process a polling capability negotiation ack received
- * from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_POLL)
- * of a DL_CAPABILITY_ACK message.
+ * Process a soft_ring/poll capability negotiation ack received
+ * from a DLS Provider. isub must point to the sub-capability
+ * (DL_CAPAB_SOFT_RING/DL_CAPAB_POLL) of a DL_CAPABILITY_ACK message.
*/
static void
-ill_capability_poll_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
+ill_capability_dls_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
{
- dl_capab_poll_t *ipoll;
+ dl_capab_dls_t *idls;
uint_t sub_dl_cap = isub->dl_cap;
uint8_t *capend;
+ ASSERT(sub_dl_cap == DL_CAPAB_SOFT_RING ||
+ sub_dl_cap == DL_CAPAB_POLL);
- ASSERT(sub_dl_cap == DL_CAPAB_POLL);
-
- /*
- * Don't enable polling for ipv6 ill's
- */
- if (ill->ill_isv6) {
+ if (ill->ill_isv6)
return;
- }
/*
* Note: range checks here are not absolutely sufficient to
@@ -2897,7 +2982,7 @@ ill_capability_poll_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
*/
capend = (uint8_t *)(isub + 1) + isub->dl_length;
if (capend > mp->b_wptr) {
- cmn_err(CE_WARN, "ill_capability_poll_ack: "
+ cmn_err(CE_WARN, "ill_capability_dls_ack: "
"malformed sub-capability too long for mblk");
return;
}
@@ -2905,17 +2990,17 @@ ill_capability_poll_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
/*
* There are two types of acks we process here:
* 1. acks in reply to a (first form) generic capability req
- * (poll_flag will be set to POLL_CAPABLE)
- * 2. acks in reply to a POLL_ENABLE capability req.
- * (POLL_ENABLE flag set)
+ * (dls_flag will be set to SOFT_RING_CAPABLE or POLL_CAPABLE)
+ * 2. acks in reply to a SOFT_RING_ENABLE or POLL_ENABLE
+ * capability req.
*/
- ipoll = (dl_capab_poll_t *)(isub + 1);
+ idls = (dl_capab_dls_t *)(isub + 1);
- if (!dlcapabcheckqid(&ipoll->poll_mid, ill->ill_lmod_rq)) {
- ip1dbg(("ill_capability_poll_ack: mid token for polling "
+ if (!dlcapabcheckqid(&idls->dls_mid, ill->ill_lmod_rq)) {
+ ip1dbg(("ill_capability_dls_ack: mid token for dls "
"capability isn't as expected; pass-thru "
"module(s) detected, discarding capability\n"));
- if (ill->ill_capabilities & ILL_CAPAB_POLL) {
+ if (ill->ill_capabilities & ILL_CAPAB_DLS) {
/*
* This is a capability renegotitation case.
* The interface better be unusable at this
@@ -2923,80 +3008,48 @@ ill_capability_poll_ack(ill_t *ill, mblk_t *mp, dl_capability_sub_t *isub)
* if we disable direct calls on a running
* and up interface.
*/
- ill_capability_poll_disable(ill);
+ ill_capability_dls_disable(ill);
}
return;
}
- switch (ipoll->poll_flags) {
+ switch (idls->dls_flags) {
default:
/* Disable if unknown flag */
+ case SOFT_RING_DISABLE:
case POLL_DISABLE:
- ill_capability_poll_disable(ill);
+ ill_capability_dls_disable(ill);
break;
+ case SOFT_RING_CAPABLE:
case POLL_CAPABLE:
/*
* If the capability was already enabled, its safe
* to disable it first to get rid of stale information
* and then start enabling it again.
*/
- ill_capability_poll_disable(ill);
- ill_capability_poll_capable(ill, ipoll, isub);
+ ill_capability_dls_disable(ill);
+ ill_capability_dls_capable(ill, idls, isub);
break;
+ case SOFT_RING_ENABLE:
case POLL_ENABLE:
- if (!(ill->ill_capabilities & ILL_CAPAB_POLL)) {
- ASSERT(ill->ill_poll_capab != NULL);
+ mutex_enter(&ill->ill_lock);
+ if (sub_dl_cap == DL_CAPAB_SOFT_RING &&
+ !(ill->ill_capabilities & ILL_CAPAB_SOFT_RING)) {
+ ASSERT(ill->ill_dls_capab != NULL);
+ ill->ill_capabilities |= ILL_CAPAB_SOFT_RING;
+ }
+ if (sub_dl_cap == DL_CAPAB_POLL &&
+ !(ill->ill_capabilities & ILL_CAPAB_POLL)) {
+ ASSERT(ill->ill_dls_capab != NULL);
ill->ill_capabilities |= ILL_CAPAB_POLL;
+ ip1dbg(("ill_capability_dls_ack: interface %s "
+ "has enabled polling\n", ill->ill_name));
}
- ip1dbg(("ill_capability_poll_ack: interface %s "
- "has enabled polling\n", ill->ill_name));
+ mutex_exit(&ill->ill_lock);
break;
}
}
-static void
-ill_capability_poll_reset(ill_t *ill, mblk_t **sc_mp)
-{
- mblk_t *mp;
- dl_capab_poll_t *ipoll;
- dl_capability_sub_t *dl_subcap;
- int size;
-
- if (!(ill->ill_capabilities & ILL_CAPAB_POLL))
- return;
-
- ASSERT(ill->ill_poll_capab != NULL);
-
- /*
- * Disable polling capability
- */
- ill_capability_poll_disable(ill);
-
- size = sizeof (*dl_subcap) + sizeof (*ipoll);
-
- mp = allocb(size, BPRI_HI);
- if (mp == NULL) {
- ip1dbg(("ill_capability_poll_reset: unable to allocate "
- "request to disable polling\n"));
- return;
- }
-
- mp->b_wptr = mp->b_rptr + size;
-
- dl_subcap = (dl_capability_sub_t *)mp->b_rptr;
- dl_subcap->dl_cap = DL_CAPAB_POLL;
- dl_subcap->dl_length = sizeof (*ipoll);
-
- ipoll = (dl_capab_poll_t *)(dl_subcap + 1);
- ipoll->poll_flags = POLL_DISABLE;
-
- if (*sc_mp != NULL)
- linkb(*sc_mp, mp);
- else
- *sc_mp = mp;
-}
-
-
/*
* Process a hardware checksum offload capability negotiation ack received
* from a DLS Provider.isub must point to the sub-capability (DL_CAPAB_HCKSUM)
@@ -7340,6 +7393,12 @@ ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
/* Just clean one squeue */
mutex_enter(&ill->ill_lock);
+ /*
+ * Reset the ILL_SOFT_RING_ASSIGN bit so that
+ * ip_squeue_soft_ring_affinity() will not go
+ * ahead with assigning rings.
+ */
+ ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
while (rx_ring->rr_ring_state == ILL_RING_INPROC)
/* Some operations pending on the ring. Wait */
cv_wait(&ill->ill_cv, &ill->ill_lock);
@@ -7376,7 +7435,7 @@ ipsq_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
/*
* Use the preallocated ill_unbind_conn for this purpose
*/
- connp = ill->ill_poll_capab->ill_unbind_conn;
+ connp = ill->ill_dls_capab->ill_unbind_conn;
mp = &connp->conn_tcp->tcp_closemp;
CONN_INC_REF(connp);
squeue_enter(sqp, mp, ip_squeue_clean, connp, NULL);
@@ -7396,15 +7455,15 @@ ipsq_clean_all(ill_t *ill)
/*
* No need to clean if poll_capab isn't set for this ill
*/
- if (!(ill->ill_capabilities & ILL_CAPAB_POLL))
+ if (!(ill->ill_capabilities & (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)))
return;
- ill->ill_capabilities &= ~ILL_CAPAB_POLL;
-
for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
- ill_rx_ring_t *ipr = &ill->ill_poll_capab->ill_ring_tbl[idx];
+ ill_rx_ring_t *ipr = &ill->ill_dls_capab->ill_ring_tbl[idx];
ipsq_clean_ring(ill, ipr);
}
+
+ ill->ill_capabilities &= ~(ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING);
}
/* ARGSUSED */
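Pulling the ip_if.c pieces together: an ill negotiates either DL_CAPAB_SOFT_RING or DL_CAPAB_POLL, never both, and the choice follows the SOFT_RINGS_ENABLED() tunables. The fragment below is a condensed sketch of that decision under assumed helper names; the committed code makes the same test inline in ill_capability_dispatch().

/* Sketch: which dls sub-capability IP acknowledges for this ill. */
static boolean_t
soft_rings_wanted(void)
{
	/* mirrors the SOFT_RINGS_ENABLED() macro added in this diff */
	return (ip_soft_rings_cnt != 0 &&
	    (ip_squeue_soft_ring || ip_squeue_fanout));
}

static void
choose_dls_capab(ill_t *ill, mblk_t *mp, dl_capability_sub_t *subp)
{
	if (subp->dl_cap == DL_CAPAB_SOFT_RING && soft_rings_wanted())
		ill_capability_dls_ack(ill, mp, subp);	/* soft rings */
	else if (subp->dl_cap == DL_CAPAB_POLL && !soft_rings_wanted())
		ill_capability_dls_ack(ill, mp, subp);	/* classic polling */
}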
diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c
index ae7731ac7b..78db295c78 100644
--- a/usr/src/uts/common/inet/ip/ip_squeue.c
+++ b/usr/src/uts/common/inet/ip/ip_squeue.c
@@ -125,13 +125,16 @@
* We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
* mapping between squeue and NIC (or Rx ring) for performance reasons so
* each squeue can uniquely own a NIC or a Rx ring and do polling
- * (PSARC 2004/630). So we allow up to MAX_THREAD_PER_CPU squeues per CPU.
- * We start by creating MIN_THREAD_PER_CPU squeues per CPU but more squeues
+ * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
+ * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
* can be created dynamically as needed.
*/
-#define MAX_THREAD_PER_CPU 32
-#define MIN_THREAD_PER_CPU 1
-uint_t ip_threads_per_cpu = MIN_THREAD_PER_CPU;
+#define MAX_SQUEUES_PER_CPU 32
+#define MIN_SQUEUES_PER_CPU 1
+uint_t ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
+
+#define IP_NUM_SOFT_RINGS 2
+uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;
/*
* List of all created squeue sets. The size is protected by cpu_lock
@@ -155,11 +158,12 @@ static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
+static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);
#define CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
/*
- * Create squeue set containing ip_threads_per_cpu number of squeues
+ * Create squeue set containing ip_squeues_per_cpu number of squeues
* for this CPU and bind them all to the CPU.
*/
static squeue_set_t *
@@ -186,13 +190,13 @@ ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
}
sqs = kmem_zalloc(sizeof (squeue_set_t) +
- (sizeof (squeue_t *) * MAX_THREAD_PER_CPU), KM_SLEEP);
+ (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
sqs->sqs_list = (squeue_t **)&sqs[1];
- sqs->sqs_max_size = MAX_THREAD_PER_CPU;
+ sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
sqs->sqs_bind = id;
- for (i = 0; i < ip_threads_per_cpu; i++) {
+ for (i = 0; i < ip_squeues_per_cpu; i++) {
bzero(sqname, sizeof (sqname));
(void) snprintf(sqname, sizeof (sqname),
@@ -202,6 +206,12 @@ ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
minclsyspri);
+ /*
+ * The first squeue in each squeue_set is the DEFAULT
+ * squeue.
+ */
+ sqp->sq_state |= SQS_DEFAULT;
+
ASSERT(sqp != NULL);
squeue_profile_enable(sqp);
@@ -229,10 +239,10 @@ ip_squeue_init(void (*callback)(squeue_t *))
ASSERT(sqset_global_list == NULL);
- if (ip_threads_per_cpu < MIN_THREAD_PER_CPU)
- ip_threads_per_cpu = MIN_THREAD_PER_CPU;
- else if (ip_threads_per_cpu > MAX_THREAD_PER_CPU)
- ip_threads_per_cpu = MAX_THREAD_PER_CPU;
+ if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
+ ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
+ else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
+ ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;
ip_squeue_create_callback = callback;
squeue_init();
@@ -293,6 +303,10 @@ ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
mutex_exit(&sqp->sq_lock);
ill = ring->rr_ill;
+ if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
+ ASSERT(ring->rr_handle != NULL);
+ ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
+ }
/*
* Cleanup the ring
@@ -338,15 +352,20 @@ ip_squeue_extend(void *arg)
ill_t *ill = sq_arg->ip_taskq_ill;
ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
cpu_t *intr_cpu = sq_arg->ip_taskq_cpu;
- squeue_set_t *sqs;
+ squeue_set_t *sqs;
squeue_t *sqp = NULL;
- char sqname[64];
- int i;
ASSERT(ill != NULL);
ASSERT(ill_rx_ring != NULL);
kmem_free(arg, sizeof (ip_taskq_arg_t));
+ /*
+ * Make sure the CPU that originally took the interrupt still
+ * exists.
+ */
+ if (!CPU_ISON(intr_cpu))
+ intr_cpu = CPU;
+
sqs = intr_cpu->cpu_squeue_set;
/*
@@ -356,10 +375,337 @@ ip_squeue_extend(void *arg)
* is sequential, we need to hold the ill_lock.
*/
mutex_enter(&ill->ill_lock);
+ sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
+ if (sqp == NULL) {
+ /*
+ * We hit the max limit of squeues allowed per CPU.
+ * Assign this rx_ring to DEFAULT squeue of the
+ * interrupted CPU but the squeue will not manage
+ * the ring. Also print a warning.
+ */
+ cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
+ "has max number of squeues. System performance might "
+ "become suboptimal\n", sqs->sqs_bind, (void *)sqs);
+
+ /* the first squeue in the list is the default squeue */
+ sqp = sqs->sqs_list[0];
+ ASSERT(sqp != NULL);
+ ill_rx_ring->rr_sqp = sqp;
+ ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
+
+ mutex_exit(&ill->ill_lock);
+ ill_waiter_dcr(ill);
+ return;
+ }
+
+ ASSERT(MUTEX_HELD(&sqp->sq_lock));
+ sqp->sq_rx_ring = ill_rx_ring;
+ ill_rx_ring->rr_sqp = sqp;
+ ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
+
+ sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
+ mutex_exit(&sqp->sq_lock);
+
+ mutex_exit(&ill->ill_lock);
+
+ /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
+ ill_waiter_dcr(ill);
+}
+
+/*
+ * Do a Rx ring to squeue binding. Find a unique squeue that is not
+ * managing a receive ring. If no such squeue exists, dynamically
+ * create a new one in the squeue set.
+ *
+ * The function runs via the system taskq. The ill passed as an
+ * argument can't go away since we hold a ref. The lock order is
+ * ill_lock -> sqs_lock -> sq_lock.
+ *
+ * If we are binding a Rx ring to a squeue attached to the offline CPU,
+ * no need to check that because squeues are never destroyed once
+ * created.
+ */
+/* ARGSUSED */
+static void
+ip_squeue_soft_ring_affinity(void *arg)
+{
+ ip_taskq_arg_t *sq_arg = (ip_taskq_arg_t *)arg;
+ ill_t *ill = sq_arg->ip_taskq_ill;
+ ill_dls_capab_t *ill_soft_ring = ill->ill_dls_capab;
+ ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
+ cpu_t *intr_cpu = sq_arg->ip_taskq_cpu;
+ cpu_t *bind_cpu;
+ int cpu_id = intr_cpu->cpu_id;
+ int min_cpu_id, max_cpu_id;
+ boolean_t enough_uniq_cpus = B_FALSE;
+ boolean_t enough_cpus = B_FALSE;
+ squeue_set_t *sqs, *last_sqs;
+ squeue_t *sqp = NULL;
+ int i, j;
+
+ ASSERT(ill != NULL);
+ kmem_free(arg, sizeof (ip_taskq_arg_t));
+
+ /*
+ * Make sure the CPU that originally took the interrupt still
+ * exists.
+ */
+ if (!CPU_ISON(intr_cpu)) {
+ intr_cpu = CPU;
+ cpu_id = intr_cpu->cpu_id;
+ }
+
+ /*
+ * If this ill represents link aggregation, then there might be
+ * multiple NICs trying to register themselves at the same time
+ * and in order to ensure that test and assignment of free rings
+ * is sequential, we need to hold the ill_lock.
+ */
+ mutex_enter(&ill->ill_lock);
+
+ if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
+ mutex_exit(&ill->ill_lock);
+ return;
+ }
+ /*
+ * We need to fanout the interrupts from the NIC. We do that by
+ * telling the driver underneath to create soft rings and use
+ * worker threads (if the driver advertised SOFT_RING capability).
+ * It's still a big performance win if we can fan out to the
+ * threads on the same core that is taking interrupts.
+ *
+ * Since we don't know the interrupt to CPU binding, we don't
+ * assign any squeues or affinity to worker threads in the NIC.
+ * At the time of the first interrupt, we know which CPU is
+ * taking interrupts and try to find other threads on the same
+ * core. Assuming, ip_threads_per_cpu is correct and cpus are
+ * numbered sequentially for each core (XXX need something better
+ * than this in future), find the lowest number and highest
+ * number thread for that core.
+ *
+ * If we have one more thread per core than number of soft rings,
+ * then don't assign any worker threads to the H/W thread (cpu)
+ * taking interrupts (capability negotiation tries to ensure this)
+ *
+ * If the number of threads per core are same as the number of
+ * soft rings, then assign the worker affinity and squeue to
+ * the same cpu.
+ *
+ * Otherwise, just fanout to higher number CPUs starting from
+ * the interrupted CPU.
+ */
+
+ min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
+ max_cpu_id = min_cpu_id + ip_threads_per_cpu;
+
+ cmn_err(CE_CONT, "soft_ring_affinity: min/max/intr = %d/%d/%d\n",
+ min_cpu_id, max_cpu_id, (int)intr_cpu->cpu_id);
+
+ /*
+ * Quickly check if there are enough CPUs present for fanout
+ * and also max_cpu_id is less than the id of the active CPU.
+ * We use the cpu_id stored in the last squeue_set to get
+ * an idea. The scheme is by no means perfect since it doesn't
+ * take into account CPU DR operations and the fact that
+ * interrupts themselves might change. An ideal scenario
+ * would be to ensure that interrupts run cpus by themselves
+ * and worker threads never have affinity to those CPUs. If
+ * the interrupts move to CPU which had a worker thread, it
+ * should be changed. Probably callbacks similar to CPU offline
+ * are needed to make it work perfectly.
+ */
+ last_sqs = sqset_global_list[sqset_global_size - 1];
+ if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
+ if ((max_cpu_id - min_cpu_id) >
+ ill_soft_ring->ill_dls_soft_ring_cnt)
+ enough_uniq_cpus = B_TRUE;
+ else if ((max_cpu_id - min_cpu_id) >=
+ ill_soft_ring->ill_dls_soft_ring_cnt)
+ enough_cpus = B_TRUE;
+ }
+
+ j = 0;
+ for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
+ if (enough_uniq_cpus) {
+ if ((min_cpu_id + i) == cpu_id) {
+ j++;
+ continue;
+ }
+ bind_cpu = cpu[min_cpu_id + i];
+ } else if (enough_cpus) {
+ bind_cpu = cpu[min_cpu_id + i];
+ } else {
+ /* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
+ bind_cpu = cpu[(cpu_id + i) % ncpus];
+ }
+
+ /*
+ * Check if the CPU actually exist and active. If not,
+ * use the interrupted CPU. ip_find_unused_squeue() will
+ * find the right CPU to fanout anyway.
+ */
+ if (!CPU_ISON(bind_cpu))
+ bind_cpu = intr_cpu;
+
+ sqs = bind_cpu->cpu_squeue_set;
+ ASSERT(sqs != NULL);
+ ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
+
+ sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
+ if (sqp == NULL) {
+ /*
+ * We hit the max limit of squeues allowed per CPU.
+ * Assign this rx_ring to DEFAULT squeue of the
+ * interrupted CPU but the squeue will not manage
+ * the ring. Also print a warning.
+ */
+ cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
+ "%d/%p already has max number of squeues. System "
+ "performance might become suboptimal\n",
+ sqs->sqs_bind, (void *)sqs);
+
+ /* the first squeue in the list is the default squeue */
+ sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
+ ASSERT(sqp != NULL);
+
+ ill_rx_ring->rr_sqp = sqp;
+ ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
+ continue;
+
+ }
+ ASSERT(MUTEX_HELD(&sqp->sq_lock));
+ ill_rx_ring->rr_sqp = sqp;
+ sqp->sq_rx_ring = ill_rx_ring;
+ ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
+ sqp->sq_state |= SQS_ILL_BOUND;
+
+ /* assign affinity to soft ring */
+ if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
+ ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
+ sqp->sq_bind);
+ }
+ mutex_exit(&sqp->sq_lock);
+
+ cmn_err(CE_CONT, "soft_ring_affinity: ring = %d, bind = %d\n",
+ i - j, sqp->sq_bind);
+ }
+ mutex_exit(&ill->ill_lock);
+
+ ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
+ SOFT_RING_SRC_HASH);
+
+ /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
+ ill_waiter_dcr(ill);
+}
+
+void
+ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
+mblk_t *mp_chain, size_t hdrlen)
+{
+ ip_taskq_arg_t *taskq_arg;
+ boolean_t refheld;
+
+ ASSERT(servicing_interrupt());
+ ASSERT(ip_ring == NULL);
+
+ mutex_enter(&ill->ill_lock);
+ if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
+ taskq_arg = (ip_taskq_arg_t *)
+ kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);
+
+ if (taskq_arg == NULL)
+ goto out;
+
+ taskq_arg->ip_taskq_ill = ill;
+ taskq_arg->ip_taskq_ill_rx_ring = ip_ring;
+ taskq_arg->ip_taskq_cpu = CPU;
+
+ /*
+ * Set ILL_SOFT_RING_ASSIGN flag. We don't want
+ * the next interrupt to schedule a task for calling
+ * ip_squeue_soft_ring_affinity();
+ */
+ ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
+ } else {
+ mutex_exit(&ill->ill_lock);
+ goto out;
+ }
+ mutex_exit(&ill->ill_lock);
+ refheld = ill_waiter_inc(ill);
+ if (refheld) {
+ if (taskq_dispatch(system_taskq,
+ ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
+ goto out;
+
+ /* release ref on ill if taskq dispatch fails */
+ ill_waiter_dcr(ill);
+ }
+ /*
+ * Clear ILL_SOFT_RING_ASSIGN so that affinity assignment
+ * can be tried again later.
+ */
+ mutex_enter(&ill->ill_lock);
+ ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
+ mutex_exit(&ill->ill_lock);
+ kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
+
+out:
+ ip_input(ill, ip_ring, mp_chain, hdrlen);
+}
+
+static squeue_t *
+ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
+{
+ int i;
+ squeue_set_t *best_sqs = NULL;
+ squeue_set_t *curr_sqs = NULL;
+ int min_sq = 0;
+ squeue_t *sqp = NULL;
+ char sqname[64];
+
+ /*
+ * If fanout is set and the passed squeue_set already has some
+ * squeues which are managing the NICs, try to find squeues on
+ * unused CPU.
+ */
+ if (sqs->sqs_size > 1 && fanout) {
+ /*
+ * First check to see if any squeue on the CPU passed
+ * is managing a NIC.
+ */
+ for (i = 0; i < sqs->sqs_size; i++) {
+ mutex_enter(&sqs->sqs_list[i]->sq_lock);
+ if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
+ !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
+ mutex_exit(&sqs->sqs_list[i]->sq_lock);
+ break;
+ }
+ mutex_exit(&sqs->sqs_list[i]->sq_lock);
+ }
+ if (i != sqs->sqs_size) {
+ best_sqs = sqset_global_list[sqset_global_size - 1];
+ min_sq = best_sqs->sqs_size;
+
+ for (i = sqset_global_size - 2; i >= 0; i--) {
+ curr_sqs = sqset_global_list[i];
+ if (curr_sqs->sqs_size < min_sq) {
+ best_sqs = curr_sqs;
+ min_sq = curr_sqs->sqs_size;
+ }
+ }
+
+ ASSERT(best_sqs != NULL);
+ sqs = best_sqs;
+ bind_cpu = cpu[sqs->sqs_bind];
+ }
+ }
+
mutex_enter(&sqs->sqs_lock);
+
for (i = 0; i < sqs->sqs_size; i++) {
mutex_enter(&sqs->sqs_list[i]->sq_lock);
- if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) == 0) {
+ if ((sqs->sqs_list[i]->sq_state &
+ (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
sqp = sqs->sqs_list[i];
break;
}
@@ -371,29 +717,19 @@ ip_squeue_extend(void *arg)
if (sqs->sqs_size == sqs->sqs_max_size) {
/*
* Reached the max limit for squeue
- * we can allocate on this CPU. Leave
- * ill_ring_state set to ILL_RING_INPROC
- * so that ip_squeue_direct will just
- * assign the default squeue for this
- * ring for future connections.
+ * we can allocate on this CPU.
*/
-#ifdef DEBUG
- cmn_err(CE_NOTE, "ip_squeue_add: Reached max "
- " threads per CPU for sqp = %p\n", (void *)sqp);
-#endif
mutex_exit(&sqs->sqs_lock);
- mutex_exit(&ill->ill_lock);
- ill_waiter_dcr(ill);
- return;
+ return (NULL);
}
bzero(sqname, sizeof (sqname));
(void) snprintf(sqname, sizeof (sqname),
- "ip_squeue_cpu_%d/%d/%d", CPU->cpu_seqid,
- CPU->cpu_id, sqs->sqs_size);
+ "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
+ bind_cpu->cpu_id, sqs->sqs_size);
- sqp = squeue_create(sqname, CPU->cpu_id, ip_squeue_worker_wait,
- minclsyspri);
+ sqp = squeue_create(sqname, bind_cpu->cpu_id,
+ ip_squeue_worker_wait, minclsyspri);
ASSERT(sqp != NULL);
@@ -403,26 +739,18 @@ ip_squeue_extend(void *arg)
if (ip_squeue_create_callback != NULL)
ip_squeue_create_callback(sqp);
- if (ip_squeue_bind) {
+ mutex_enter(&cpu_lock);
+ if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
squeue_bind(sqp, -1);
}
+ mutex_exit(&cpu_lock);
+
mutex_enter(&sqp->sq_lock);
}
- ASSERT(sqp != NULL);
-
- sqp->sq_rx_ring = ill_rx_ring;
- ill_rx_ring->rr_sqp = sqp;
- ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
-
- sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
- mutex_exit(&sqp->sq_lock);
mutex_exit(&sqs->sqs_lock);
-
- mutex_exit(&ill->ill_lock);
-
- /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
- ill_waiter_dcr(ill);
+ ASSERT(sqp != NULL);
+ return (sqp);
}
/*
@@ -657,6 +985,21 @@ ip_squeue_set_unbind(squeue_set_t *sqs)
mutex_enter(&sqs->sqs_lock);
for (i = 0; i < sqs->sqs_size; i++) {
sqp = sqs->sqs_list[i];
+
+ /*
+ * CPU is going offline. Remove the thread affinity
+ * for any soft ring threads the squeue is managing.
+ */
+ if (sqp->sq_state & SQS_ILL_BOUND) {
+ ill_rx_ring_t *ring = sqp->sq_rx_ring;
+ ill_t *ill = ring->rr_ill;
+
+ if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
+ ASSERT(ring->rr_handle != NULL);
+ ill->ill_dls_capab->ill_dls_unbind(
+ ring->rr_handle);
+ }
+ }
if (!(sqp->sq_state & SQS_BOUND))
continue;
squeue_unbind(sqp);
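A worked example of the core-range arithmetic used by ip_squeue_soft_ring_affinity(), assuming a Niagara-style layout in which the strands of one core have consecutive CPU ids; the numeric values below are assumptions chosen only for illustration.

static void
soft_ring_affinity_example(void)
{
	uint_t threads_per_cpu = 4;	/* assumed strands per core */
	int cpu_id = 9;			/* CPU that took the interrupt */
	int min_cpu_id = (cpu_id / threads_per_cpu) * threads_per_cpu;
	int max_cpu_id = min_cpu_id + threads_per_cpu;

	/*
	 * min_cpu_id == 8 and max_cpu_id == 12.  With 2 soft rings,
	 * max_cpu_id - min_cpu_id > ill_dls_soft_ring_cnt, so
	 * enough_uniq_cpus is B_TRUE: the interrupted strand (9) is
	 * skipped by the assignment loop and the two soft ring workers
	 * are bound to strands 8 and 10 of the same core.
	 */
}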
diff --git a/usr/src/uts/common/inet/ip_impl.h b/usr/src/uts/common/inet/ip_impl.h
index 8a9f611fab..dd8f1db3da 100644
--- a/usr/src/uts/common/inet/ip_impl.h
+++ b/usr/src/uts/common/inet/ip_impl.h
@@ -466,20 +466,21 @@ typedef struct ip_pdescinfo_s PDESCINFO_STRUCT(2) ip_pdescinfo_t;
putnext((connp)->conn_rq, mp); \
}
-#define ILL_POLL_CAPABLE(ill) \
- (((ill)->ill_capabilities & ILL_CAPAB_POLL) != 0)
+#define ILL_DLS_CAPABLE(ill) \
+ (((ill)->ill_capabilities & \
+ (ILL_CAPAB_POLL|ILL_CAPAB_SOFT_RING)) != 0)
/*
* Macro that hands off one or more messages directly to DLD
* when the interface is marked with ILL_CAPAB_POLL.
*/
-#define IP_POLL_ILL_TX(ill, mp) { \
- ill_poll_capab_t *ill_poll = ill->ill_poll_capab; \
- ASSERT(ILL_POLL_CAPABLE(ill)); \
- ASSERT(ill_poll != NULL); \
- ASSERT(ill_poll->ill_tx != NULL); \
- ASSERT(ill_poll->ill_tx_handle != NULL); \
- ill_poll->ill_tx(ill_poll->ill_tx_handle, mp); \
+#define IP_DLS_ILL_TX(ill, mp) { \
+ ill_dls_capab_t *ill_dls = ill->ill_dls_capab; \
+ ASSERT(ILL_DLS_CAPABLE(ill)); \
+ ASSERT(ill_dls != NULL); \
+ ASSERT(ill_dls->ill_tx != NULL); \
+ ASSERT(ill_dls->ill_tx_handle != NULL); \
+ ill_dls->ill_tx(ill_dls->ill_tx_handle, mp); \
}
extern int ip_wput_frag_mdt_min;
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index 61495f4705..886f6a00e5 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -18126,13 +18126,13 @@ tcp_send_data(tcp_t *tcp, queue_t *q, mblk_t *mp)
ire->ire_last_used_time = lbolt;
BUMP_MIB(&ip_mib, ipOutRequests);
- if (ILL_POLL_CAPABLE(ill)) {
+ if (ILL_DLS_CAPABLE(ill)) {
/*
* Send the packet directly to DLD, where it may be queued
* depending on the availability of transmit resources at
* the media layer.
*/
- IP_POLL_ILL_TX(ill, mp);
+ IP_DLS_ILL_TX(ill, mp);
} else {
putnext(ire->ire_stq, mp);
}
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index c13d7c485f..3888f36ce2 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -6168,13 +6168,13 @@ udp_send_data(udp_t *udp, queue_t *q, mblk_t *mp, ipha_t *ipha)
UPDATE_OB_PKT_COUNT(ire);
ire->ire_last_used_time = lbolt;
- if (ILL_POLL_CAPABLE(ill)) {
+ if (ILL_DLS_CAPABLE(ill)) {
/*
* Send the packet directly to DLD, where it may be queued
* depending on the availability of transmit resources at
* the media layer.
*/
- IP_POLL_ILL_TX(ill, mp);
+ IP_DLS_ILL_TX(ill, mp);
} else {
putnext(ire->ire_stq, mp);
}
diff --git a/usr/src/uts/common/io/dld/dld_proto.c b/usr/src/uts/common/io/dld/dld_proto.c
index 4dce2ecc6d..8511f99890 100644
--- a/usr/src/uts/common/io/dld/dld_proto.c
+++ b/usr/src/uts/common/io/dld/dld_proto.c
@@ -37,6 +37,7 @@
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/strsun.h>
+#include <sys/cpuvar.h>
#include <sys/dlpi.h>
#include <netinet/in.h>
#include <sys/sdt.h>
@@ -46,6 +47,7 @@
#include <sys/dls.h>
#include <sys/dld.h>
#include <sys/dld_impl.h>
+#include <sys/dls_soft_ring.h>
typedef boolean_t proto_reqfunc_t(dld_str_t *, union DL_primitives *, mblk_t *);
@@ -56,9 +58,15 @@ static proto_reqfunc_t proto_info_req, proto_attach_req, proto_detach_req,
proto_notify_req, proto_unitdata_req, proto_passive_req;
static void proto_poll_disable(dld_str_t *);
-static boolean_t proto_poll_enable(dld_str_t *, dl_capab_poll_t *);
+static boolean_t proto_poll_enable(dld_str_t *, dl_capab_dls_t *);
static boolean_t proto_capability_advertise(dld_str_t *, mblk_t *);
+static void proto_soft_ring_disable(dld_str_t *);
+static boolean_t proto_soft_ring_enable(dld_str_t *, dl_capab_dls_t *);
+static boolean_t proto_capability_advertise(dld_str_t *, mblk_t *);
+static void proto_change_soft_ring_fanout(dld_str_t *, int);
+static void proto_stop_soft_ring_threads(void *);
+
#define DL_ACK_PENDING(state) \
((state) == DL_ATTACH_PENDING || \
(state) == DL_DETACH_PENDING || \
@@ -606,6 +614,22 @@ proto_unbind_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
*/
dsp->ds_mode = DLD_UNITDATA;
+ /*
+ * If soft rings were enabled, the workers
+ * should be quiesced. Start a task that will
+ * get this in motion. We cannot check for
+ * ds_soft_ring flag because
+ * proto_soft_ring_disable() called from
+ * proto_capability_req() would have reset it.
+ */
+ if (dls_soft_ring_workers(dsp->ds_dc)) {
+ dsp->ds_unbind_req = mp;
+ dsp->ds_task_id = taskq_dispatch(system_taskq,
+ proto_stop_soft_ring_threads, (void *)dsp, TQ_SLEEP);
+ rw_exit(&dsp->ds_lock);
+ return (B_TRUE);
+ }
+
dsp->ds_dlstate = DL_UNBOUND;
rw_exit(&dsp->ds_lock);
@@ -1055,6 +1079,20 @@ failed:
return (B_FALSE);
}
+static boolean_t
+check_ip_above(queue_t *q)
+{
+ queue_t *next_q;
+ boolean_t ret = B_TRUE;
+
+ claimstr(q);
+ next_q = q->q_next;
+ if (strcmp(next_q->q_qinfo->qi_minfo->mi_idname, "ip") != 0)
+ ret = B_FALSE;
+ releasestr(q);
+ return (ret);
+}
+
/*
* DL_CAPABILITY_REQ
*/
@@ -1141,14 +1179,14 @@ proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
* IP polling interface.
*/
case DL_CAPAB_POLL: {
- dl_capab_poll_t *pollp;
- dl_capab_poll_t poll;
+ dl_capab_dls_t *pollp;
+ dl_capab_dls_t poll;
- pollp = (dl_capab_poll_t *)&sp[1];
+ pollp = (dl_capab_dls_t *)&sp[1];
/*
* Copy for alignment.
*/
- bcopy(pollp, &poll, sizeof (dl_capab_poll_t));
+ bcopy(pollp, &poll, sizeof (dl_capab_dls_t));
/*
* We need to become writer before enabling and/or
@@ -1168,7 +1206,7 @@ proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
}
upgraded = B_TRUE;
- switch (poll.poll_flags) {
+ switch (poll.dls_flags) {
default:
/*FALLTHRU*/
case POLL_DISABLE:
@@ -1186,16 +1224,81 @@ proto_capability_req(dld_str_t *dsp, union DL_primitives *udlp, mblk_t *mp)
/*
* Now attempt enable it.
*/
- if (!proto_poll_enable(dsp, &poll))
- break;
+ if (check_ip_above(dsp->ds_rq) &&
+ proto_poll_enable(dsp, &poll)) {
+ bzero(&poll, sizeof (dl_capab_dls_t));
+ poll.dls_flags = POLL_ENABLE;
+ }
+ break;
+ }
+
+ dlcapabsetqid(&(poll.dls_mid), dsp->ds_rq);
+ bcopy(&poll, pollp, sizeof (dl_capab_dls_t));
+ break;
+ }
+ case DL_CAPAB_SOFT_RING: {
+ dl_capab_dls_t *soft_ringp;
+ dl_capab_dls_t soft_ring;
+
+ soft_ringp = (dl_capab_dls_t *)&sp[1];
+ /*
+ * Copy for alignment.
+ */
+ bcopy(soft_ringp, &soft_ring,
+ sizeof (dl_capab_dls_t));
- bzero(&poll, sizeof (dl_capab_poll_t));
- poll.poll_flags = POLL_ENABLE;
+ /*
+ * We need to become writer before enabling and/or
+ * disabling the soft_ring interface. If we couldn't
+ * upgrade, check state again after re-acquiring the
+ * lock to make sure we can proceed.
+ */
+ if (!upgraded && !rw_tryupgrade(&dsp->ds_lock)) {
+ rw_exit(&dsp->ds_lock);
+ rw_enter(&dsp->ds_lock, RW_WRITER);
+
+ if (dsp->ds_dlstate == DL_UNATTACHED ||
+ DL_ACK_PENDING(dsp->ds_dlstate)) {
+ dl_err = DL_OUTSTATE;
+ goto failed;
+ }
+ }
+ upgraded = B_TRUE;
+
+ switch (soft_ring.dls_flags) {
+ default:
+ /*FALLTHRU*/
+ case SOFT_RING_DISABLE:
+ proto_soft_ring_disable(dsp);
+ break;
+
+ case SOFT_RING_ENABLE:
+ /*
+ * Make sure soft_ring is disabled.
+ */
+ proto_soft_ring_disable(dsp);
+
+ /*
+ * Now attempt enable it.
+ */
+ if (check_ip_above(dsp->ds_rq) &&
+ proto_soft_ring_enable(dsp, &soft_ring)) {
+ bzero(&soft_ring,
+ sizeof (dl_capab_dls_t));
+ soft_ring.dls_flags =
+ SOFT_RING_ENABLE;
+ } else {
+ bzero(&soft_ring,
+ sizeof (dl_capab_dls_t));
+ soft_ring.dls_flags =
+ SOFT_RING_DISABLE;
+ }
break;
}
- dlcapabsetqid(&(poll.poll_mid), dsp->ds_rq);
- bcopy(&poll, pollp, sizeof (dl_capab_poll_t));
+ dlcapabsetqid(&(soft_ring.dls_mid), dsp->ds_rq);
+ bcopy(&soft_ring, soft_ringp,
+ sizeof (dl_capab_dls_t));
break;
}
default:
@@ -1440,6 +1543,7 @@ proto_poll_disable(dld_str_t *dsp)
*/
mh = dls_mac(dsp->ds_dc);
mac_resource_set(mh, NULL, NULL);
+ mac_resources(mh);
/*
* Set receive function back to default.
@@ -1454,7 +1558,7 @@ proto_poll_disable(dld_str_t *dsp)
}
static boolean_t
-proto_poll_enable(dld_str_t *dsp, dl_capab_poll_t *pollp)
+proto_poll_enable(dld_str_t *dsp, dl_capab_dls_t *pollp)
{
mac_handle_t mh;
@@ -1473,15 +1577,15 @@ proto_poll_enable(dld_str_t *dsp, dl_capab_poll_t *pollp)
/*
* Register resources.
*/
- mac_resource_set(mh, (mac_resource_add_t)pollp->poll_ring_add,
- (void *)pollp->poll_rx_handle);
+ mac_resource_set(mh, (mac_resource_add_t)pollp->dls_ring_add,
+ (void *)pollp->dls_rx_handle);
mac_resources(mh);
/*
* Set the receive function.
*/
- dls_rx_set(dsp->ds_dc, (dls_rx_t)pollp->poll_rx,
- (void *)pollp->poll_rx_handle);
+ dls_rx_set(dsp->ds_dc, (dls_rx_t)pollp->dls_rx,
+ (void *)pollp->dls_rx_handle);
/*
* Note that polling is enabled. This prevents further DLIOCHDRINFO
@@ -1491,6 +1595,74 @@ proto_poll_enable(dld_str_t *dsp, dl_capab_poll_t *pollp)
return (B_TRUE);
}
+static void
+proto_soft_ring_disable(dld_str_t *dsp)
+{
+ ASSERT(RW_WRITE_HELD(&dsp->ds_lock));
+
+ if (!dsp->ds_soft_ring)
+ return;
+
+ /*
+ * It should be impossible to enable raw mode if soft_ring is turned on.
+ */
+ ASSERT(dsp->ds_mode != DLD_RAW);
+ proto_change_soft_ring_fanout(dsp, SOFT_RING_NONE);
+ /*
+ * Note that fanout is disabled.
+ */
+ dsp->ds_soft_ring = B_FALSE;
+}
+
+static boolean_t
+proto_soft_ring_enable(dld_str_t *dsp, dl_capab_dls_t *soft_ringp)
+{
+ ASSERT(RW_WRITE_HELD(&dsp->ds_lock));
+ ASSERT(!dsp->ds_soft_ring);
+
+ /*
+ * We cannot enable soft_ring if raw mode
+ * has been enabled.
+ */
+ if (dsp->ds_mode == DLD_RAW)
+ return (B_FALSE);
+
+ if (dls_soft_ring_enable(dsp->ds_dc, soft_ringp) == B_FALSE)
+ return (B_FALSE);
+
+ dsp->ds_soft_ring = B_TRUE;
+ return (B_TRUE);
+}
+
+static void
+proto_change_soft_ring_fanout(dld_str_t *dsp, int type)
+{
+ dls_rx_t rx;
+
+ if (type == SOFT_RING_NONE) {
+ rx = (dsp->ds_mode == DLD_FASTPATH) ?
+ dld_str_rx_fastpath : dld_str_rx_unitdata;
+ } else {
+ rx = (dls_rx_t)dls_ether_soft_ring_fanout;
+ }
+ dls_soft_ring_rx_set(dsp->ds_dc, rx, dsp, type);
+}
+
+static void
+proto_stop_soft_ring_threads(void *arg)
+{
+ dld_str_t *dsp = (dld_str_t *)arg;
+
+ rw_enter(&dsp->ds_lock, RW_WRITER);
+ dls_soft_ring_disable(dsp->ds_dc);
+ dsp->ds_dlstate = DL_UNBOUND;
+ rw_exit(&dsp->ds_lock);
+ dlokack(dsp->ds_wq, dsp->ds_unbind_req, DL_UNBIND_REQ);
+ rw_enter(&dsp->ds_lock, RW_WRITER);
+ dsp->ds_task_id = NULL;
+ rw_exit(&dsp->ds_lock);
+}
+
/*
* DL_CAPABILITY_ACK/DL_ERROR_ACK
*/
@@ -1500,7 +1672,8 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
dl_capability_ack_t *dlap;
dl_capability_sub_t *dlsp;
size_t subsize;
- dl_capab_poll_t poll;
+ dl_capab_dls_t poll;
+ dl_capab_dls_t soft_ring;
dl_capab_hcksum_t hcksum;
dl_capab_zerocopy_t zcopy;
uint8_t *ptr;
@@ -1516,6 +1689,9 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
*/
subsize = 0;
+ /* Always advertise soft ring capability for GLDv3 drivers */
+ subsize += sizeof (dl_capability_sub_t) + sizeof (dl_capab_dls_t);
+
/*
* Check if polling can be enabled on this interface.
* If advertising DL_CAPAB_POLL has not been explicitly disabled
@@ -1525,7 +1701,7 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
!(dld_opt & DLD_OPT_NO_POLL) && (dsp->ds_vid == VLAN_ID_NONE));
if (poll_cap) {
subsize += sizeof (dl_capability_sub_t) +
- sizeof (dl_capab_poll_t);
+ sizeof (dl_capab_dls_t);
}
/*
@@ -1550,7 +1726,7 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
* If there are no capabilities to advertise or if we
* can't allocate a response, send a DL_ERROR_ACK.
*/
- if (subsize == 0 || (mp1 = reallocb(mp,
+ if ((mp1 = reallocb(mp,
sizeof (dl_capability_ack_t) + subsize, 0)) == NULL) {
rw_exit(&dsp->ds_lock);
dlerrorack(q, mp, DL_CAPABILITY_REQ, DL_NOTSUPPORTED, 0);
@@ -1594,7 +1770,7 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
rw_downgrade(&dsp->ds_lock);
poll_capab_size = sizeof (dl_capability_sub_t) +
- sizeof (dl_capab_poll_t);
+ sizeof (dl_capab_dls_t);
mp->b_wptr -= poll_capab_size;
subsize -= poll_capab_size;
@@ -1607,23 +1783,43 @@ proto_capability_advertise(dld_str_t *dsp, mblk_t *mp)
dlsp = (dl_capability_sub_t *)ptr;
dlsp->dl_cap = DL_CAPAB_POLL;
- dlsp->dl_length = sizeof (dl_capab_poll_t);
+ dlsp->dl_length = sizeof (dl_capab_dls_t);
ptr += sizeof (dl_capability_sub_t);
- bzero(&poll, sizeof (dl_capab_poll_t));
- poll.poll_version = POLL_VERSION_1;
- poll.poll_flags = POLL_CAPABLE;
- poll.poll_tx_handle = (uintptr_t)dsp;
- poll.poll_tx = (uintptr_t)str_mdata_fastpath_put;
+ bzero(&poll, sizeof (dl_capab_dls_t));
+ poll.dls_version = POLL_VERSION_1;
+ poll.dls_flags = POLL_CAPABLE;
+ poll.dls_tx_handle = (uintptr_t)dsp;
+ poll.dls_tx = (uintptr_t)str_mdata_fastpath_put;
- dlcapabsetqid(&(poll.poll_mid), dsp->ds_rq);
- bcopy(&poll, ptr, sizeof (dl_capab_poll_t));
- ptr += sizeof (dl_capab_poll_t);
+ dlcapabsetqid(&(poll.dls_mid), dsp->ds_rq);
+ bcopy(&poll, ptr, sizeof (dl_capab_dls_t));
+ ptr += sizeof (dl_capab_dls_t);
}
}
ASSERT(RW_READ_HELD(&dsp->ds_lock));
+ dlsp = (dl_capability_sub_t *)ptr;
+
+ dlsp->dl_cap = DL_CAPAB_SOFT_RING;
+ dlsp->dl_length = sizeof (dl_capab_dls_t);
+ ptr += sizeof (dl_capability_sub_t);
+
+ bzero(&soft_ring, sizeof (dl_capab_dls_t));
+ soft_ring.dls_version = SOFT_RING_VERSION_1;
+ soft_ring.dls_flags = SOFT_RING_CAPABLE;
+ soft_ring.dls_tx_handle = (uintptr_t)dsp;
+ soft_ring.dls_tx = (uintptr_t)str_mdata_fastpath_put;
+ soft_ring.dls_ring_change_status =
+ (uintptr_t)proto_change_soft_ring_fanout;
+ soft_ring.dls_ring_bind = (uintptr_t)soft_ring_bind;
+ soft_ring.dls_ring_unbind = (uintptr_t)soft_ring_unbind;
+
+ dlcapabsetqid(&(soft_ring.dls_mid), dsp->ds_rq);
+ bcopy(&soft_ring, ptr, sizeof (dl_capab_dls_t));
+ ptr += sizeof (dl_capab_dls_t);
+
/*
* TCP/IP checksum offload.
*/
diff --git a/usr/src/uts/common/io/dld/dld_str.c b/usr/src/uts/common/io/dld/dld_str.c
index d723bd7450..c2b8c63e43 100644
--- a/usr/src/uts/common/io/dld/dld_str.c
+++ b/usr/src/uts/common/io/dld/dld_str.c
@@ -259,6 +259,8 @@ dld_close(queue_t *rq)
{
dld_str_t *dsp = rq->q_ptr;
+ ASSERT(dsp->ds_task_id == NULL);
+
/*
* Disable the queue srv(9e) routine.
*/
@@ -859,6 +861,7 @@ dld_str_detach(dld_str_t *dsp)
* Clear the polling and promisc flags.
*/
dsp->ds_polling = B_FALSE;
+ dsp->ds_soft_ring = B_FALSE;
dsp->ds_promisc = 0;
/*
@@ -1494,7 +1497,7 @@ ioc_raw(dld_str_t *dsp, mblk_t *mp)
queue_t *q = dsp->ds_wq;
rw_enter(&dsp->ds_lock, RW_WRITER);
- if (dsp->ds_polling) {
+ if (dsp->ds_polling || dsp->ds_soft_ring) {
rw_exit(&dsp->ds_lock);
miocnak(q, mp, 0, EPROTO);
return;
@@ -1604,7 +1607,7 @@ ioc_fast(dld_str_t *dsp, mblk_t *mp)
/*
* Set the receive callback (unless polling is enabled).
*/
- if (!dsp->ds_polling)
+ if (!dsp->ds_polling && !dsp->ds_soft_ring)
dls_rx_set(dc, dld_str_rx_fastpath, (void *)dsp);
/*
diff --git a/usr/src/uts/common/io/dls/dls.c b/usr/src/uts/common/io/dls/dls.c
index 0968818e07..dbf4edc280 100644
--- a/usr/src/uts/common/io/dls/dls.c
+++ b/usr/src/uts/common/io/dls/dls.c
@@ -43,10 +43,18 @@
#include <sys/dls.h>
#include <sys/dls_impl.h>
+#include <sys/dls_soft_ring.h>
static kmem_cache_t *i_dls_impl_cachep;
static uint32_t i_dls_impl_count;
+static kstat_t *dls_ksp = (kstat_t *)NULL;
+struct dls_kstats dls_kstat =
+{
+ { "soft_ring_pkt_drop", KSTAT_DATA_UINT32 },
+};
+
+
/*
* Private functions.
*/
@@ -257,6 +265,27 @@ vlan:
dhip->dhi_vid = VLAN_ID(tci);
}
+static void
+dls_stat_init()
+{
+ if ((dls_ksp = kstat_create("dls", 0, "dls_stat",
+ "net", KSTAT_TYPE_NAMED,
+ sizeof (dls_kstat) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL)) == NULL) {
+ cmn_err(CE_WARN,
+ "DLS: failed to create kstat structure for dls stats");
+ return;
+ }
+ dls_ksp->ks_data = (void *)&dls_kstat;
+ kstat_install(dls_ksp);
+}
+
+static void
+dls_stat_destroy()
+{
+ kstat_delete(dls_ksp);
+}
+
/*
* Module initialization functions.
*/
@@ -271,6 +300,8 @@ dls_init(void)
sizeof (dls_impl_t), 0, i_dls_constructor, i_dls_destructor, NULL,
NULL, NULL, 0);
ASSERT(i_dls_impl_cachep != NULL);
+ soft_ring_init();
+ dls_stat_init();
}
int
@@ -286,6 +317,7 @@ dls_fini(void)
* Destroy the kmem_cache.
*/
kmem_cache_destroy(i_dls_impl_cachep);
+ dls_stat_destroy();
return (0);
}
@@ -423,6 +455,14 @@ dls_close(dls_channel_t dc)
*/
dip->di_dvp = NULL;
dip->di_txinfo = NULL;
+
+ if (dip->di_soft_ring_list != NULL) {
+ soft_ring_set_destroy(dip->di_soft_ring_list,
+ dip->di_soft_ring_size);
+ dip->di_soft_ring_list = NULL;
+ }
+ dip->di_soft_ring_size = 0;
+
kmem_cache_free(i_dls_impl_cachep, dip);
/*
diff --git a/usr/src/uts/common/io/dls/dls_soft_ring.c b/usr/src/uts/common/io/dls/dls_soft_ring.c
new file mode 100644
index 0000000000..cfd75e724a
--- /dev/null
+++ b/usr/src/uts/common/io/dls/dls_soft_ring.c
@@ -0,0 +1,667 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+/*
+ * General Soft rings - Simulating Rx rings in S/W.
+ *
+ * This is a general-purpose, high-performance soft ring mechanism. It is
+ * similar to a taskq with a single worker thread. The dls layer creates a
+ * set of these rings to simulate the H/W Rx rings (DMA channels) that some
+ * NICs have. The purpose is to present a common interface to IP
+ * so the individual squeues can control these rings and switch them
+ * between polling and interrupt mode.
+ *
+ * This code also serves as a fanout mechanism for a fast NIC feeding slow
+ * CPUs, where incoming traffic can be separated into multiple soft rings
+ * based on capability negotiation with IP. IP also sets the affinity of
+ * the soft ring worker threads to CPUs so that the connection-to-CPU/squeue
+ * affinity is never broken.
+ *
+ * The soft rings can also be driven by a classifier which can direct
+ * traffic to individual soft rings based on the input from IP.
+ */
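As an illustration of the interface described above, a hedged usage sketch follows; the ring name, count and priority are made up, and only the functions defined in this file and its header are assumed.

	static void
	example_soft_ring_usage(mblk_t *mp_chain)
	{
		soft_ring_t **set;

		/* Four worker-only rings, unbound, no wait; values made up */
		set = soft_ring_set_create("example_ring", S_RING_BIND_NONE,
		    0, S_RING_WORKER_ONLY, minclsyspri, 4);
		if (set == NULL)
			return;

		/*
		 * In real use the consumer first sets each ring's
		 * s_ring_upcall and arguments (as dls_soft_ring_enable()
		 * does below) before handing it a chain.
		 */
		soft_ring_process(set[0], mp_chain, 0);
	}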
+
+#include <sys/types.h>
+#include <sys/cmn_err.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/cpuvar.h>
+#include <sys/condvar_impl.h>
+#include <sys/systm.h>
+#include <sys/callb.h>
+#include <sys/sdt.h>
+#include <sys/ddi.h>
+#include <sys/strsubr.h>
+#include <inet/common.h>
+#include <inet/ip.h>
+
+#include <sys/dls_impl.h>
+#include <sys/dls_soft_ring.h>
+
+static void soft_ring_fire(void *);
+static void soft_ring_drain(soft_ring_t *, clock_t);
+static void soft_ring_worker(soft_ring_t *);
+static void soft_ring_stop_workers(soft_ring_t **, int);
+
+kmem_cache_t *soft_ring_cache;
+
+
+int soft_ring_workerwait_ms = 10;
+int soft_ring_max_q_cnt = (4 * 1024 * 1024);
+
+/* The values above converted to ticks */
+static int soft_ring_workerwait_tick = 0;
+
+#define SOFT_RING_WORKER_WAKEUP(ringp) { \
+ timeout_id_t tid = (ringp)->s_ring_tid; \
+ \
+ ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \
+ /* \
+ * Queue isn't being processed, so take \
+ * any post enqueue actions needed before leaving. \
+ */ \
+ if (tid != 0) { \
+ /* \
+ * Waiting for an enter() to process mblk(s). \
+ */ \
+ clock_t waited = lbolt - (ringp)->s_ring_awaken; \
+ \
+ if (TICK_TO_MSEC(waited) >= (ringp)->s_ring_wait) { \
+ /* \
+ * Time's up and we have a worker thread \
+ * waiting for work, so schedule it. \
+ */ \
+ (ringp)->s_ring_tid = 0; \
+ cv_signal(&(ringp)->s_ring_async); \
+ mutex_exit(&(ringp)->s_ring_lock); \
+ (void) untimeout(tid); \
+ } else { \
+ mutex_exit(&(ringp)->s_ring_lock); \
+ } \
+ } else if ((ringp)->s_ring_wait != 0) { \
+ (ringp)->s_ring_awaken = lbolt; \
+ (ringp)->s_ring_tid = timeout(soft_ring_fire, (ringp), \
+ (ringp)->s_ring_wait); \
+ mutex_exit(&(ringp)->s_ring_lock); \
+ } else { \
+ /* \
+ * Schedule the worker thread. \
+ */ \
+ cv_signal(&(ringp)->s_ring_async); \
+ mutex_exit(&(ringp)->s_ring_lock); \
+ } \
+ ASSERT(MUTEX_NOT_HELD(&(ringp)->s_ring_lock)); \
+}
+
+
+#define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt) { \
+ /* \
+ * Enqueue our mblk chain. \
+ */ \
+ ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \
+ \
+ if ((ringp)->s_ring_last != NULL) \
+ (ringp)->s_ring_last->b_next = (mp); \
+ else \
+ (ringp)->s_ring_first = (mp); \
+ (ringp)->s_ring_last = (tail); \
+ (ringp)->s_ring_count += (cnt); \
+ ASSERT((ringp)->s_ring_count > 0); \
+}
+
+void
+soft_ring_init(void)
+{
+ soft_ring_cache = kmem_cache_create("soft_ring_cache",
+ sizeof (soft_ring_t), 64, NULL, NULL, NULL, NULL, NULL, 0);
+
+ soft_ring_workerwait_tick =
+ MSEC_TO_TICK_ROUNDUP(soft_ring_workerwait_ms);
+}
+
+/* ARGSUSED */
+soft_ring_t *
+soft_ring_create(char *name, processorid_t bind, clock_t wait,
+ uint_t type, pri_t pri)
+{
+ soft_ring_t *ringp;
+
+ ringp = kmem_cache_alloc(soft_ring_cache, KM_NOSLEEP);
+ if (ringp == NULL)
+ return (NULL);
+
+ bzero(ringp, sizeof (soft_ring_t));
+ (void) strncpy(ringp->s_ring_name, name, S_RING_NAMELEN + 1);
+ ringp->s_ring_name[S_RING_NAMELEN] = '\0';
+ mutex_init(&ringp->s_ring_lock, NULL, MUTEX_DEFAULT, NULL);
+
+ ringp->s_ring_type = type;
+ ringp->s_ring_bind = bind;
+ if (bind != S_RING_BIND_NONE)
+ soft_ring_bind(ringp, bind);
+ ringp->s_ring_wait = MSEC_TO_TICK(wait);
+
+ ringp->s_ring_worker = thread_create(NULL, 0, soft_ring_worker,
+ ringp, 0, &p0, TS_RUN, pri);
+
+ return (ringp);
+}
+
+soft_ring_t **
+soft_ring_set_create(char *name, processorid_t bind, clock_t wait,
+ uint_t type, pri_t pri, int cnt)
+{
+ int i;
+ soft_ring_t **ringp_list;
+
+ if ((ringp_list =
+ (soft_ring_t **) kmem_zalloc(sizeof (soft_ring_t *) * cnt,
+ KM_NOSLEEP)) != NULL) {
+ for (i = 0; i < cnt; i++) {
+ ringp_list[i] = soft_ring_create(name, bind, wait,
+ type, pri);
+ if (ringp_list[i] == NULL)
+ break;
+ }
+ if (i != cnt) {
+ soft_ring_stop_workers(ringp_list, i);
+ soft_ring_set_destroy(ringp_list, i);
+ ringp_list = NULL;
+ }
+ }
+ return (ringp_list);
+}
+
+static void
+soft_ring_stop_workers(soft_ring_t **ringp_set, int cnt)
+{
+ int i;
+ soft_ring_t *ringp;
+ timeout_id_t tid;
+ kt_did_t t_did;
+
+ for (i = 0; i < cnt; i++) {
+ ringp = ringp_set[i];
+
+ soft_ring_unbind((void *)ringp);
+ mutex_enter(&ringp->s_ring_lock);
+ if ((tid = ringp->s_ring_tid) != 0)
+ (void) untimeout(tid);
+
+ ringp->s_ring_tid = 0;
+
+ if (!(ringp->s_ring_state & S_RING_DEAD)) {
+ ringp->s_ring_state |= S_RING_DESTROY;
+ t_did = ringp->s_ring_worker->t_did;
+
+
+ /* Wake the worker so it can exit */
+ cv_signal(&(ringp)->s_ring_async);
+ }
+ mutex_exit(&ringp->s_ring_lock);
+
+ /*
+ * Here comes the tricky part. IP and the driver ensure
+ * that packet flow has stopped, but the worker thread
+ * might still be draining the soft ring. We have
+ * already set the S_RING_DESTROY flag. We wait until
+ * the worker thread takes notice, stops processing
+ * the soft ring and exits; it sets S_RING_DEAD on
+ * exiting.
+ */
+ if (t_did)
+ thread_join(t_did);
+ }
+}
+
+void
+soft_ring_set_destroy(soft_ring_t **ringp_set, int cnt)
+{
+ int i;
+ mblk_t *mp;
+ soft_ring_t *ringp;
+
+ for (i = 0; i < cnt; i++) {
+ ringp = ringp_set[i];
+
+ mutex_enter(&ringp->s_ring_lock);
+
+ ASSERT(ringp->s_ring_state & S_RING_DEAD);
+
+ while ((mp = ringp->s_ring_first) != NULL) {
+ ringp->s_ring_first = mp->b_next;
+ mp->b_next = NULL;
+ freemsg(mp);
+ }
+ ringp->s_ring_last = NULL;
+ mutex_exit(&ringp->s_ring_lock);
+
+ /*
+ * IP/driver ensure that no packets are flowing
+ * when we are destroying the soft rings; otherwise bad
+ * things will happen.
+ */
+ kmem_cache_free(soft_ring_cache, ringp);
+ ringp_set[i] = NULL;
+ }
+ kmem_free(ringp_set, sizeof (soft_ring_t *) * cnt);
+}
+
+/* ARGSUSED */
+void
+soft_ring_bind(void *arg, processorid_t bind)
+{
+ cpu_t *cp;
+ soft_ring_t *ringp = (soft_ring_t *)arg;
+
+ mutex_enter(&ringp->s_ring_lock);
+ if (ringp->s_ring_state & S_RING_BOUND) {
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+
+ ringp->s_ring_state |= S_RING_BOUND;
+ ringp->s_ring_bind = bind;
+ mutex_exit(&ringp->s_ring_lock);
+
+ cp = cpu[bind];
+ mutex_enter(&cpu_lock);
+ if (cpu_is_online(cp)) {
+ thread_affinity_set(ringp->s_ring_worker, ringp->s_ring_bind);
+ }
+ mutex_exit(&cpu_lock);
+}
+
+void
+soft_ring_unbind(void *arg)
+{
+ soft_ring_t *ringp = (soft_ring_t *)arg;
+
+ mutex_enter(&ringp->s_ring_lock);
+ if (!(ringp->s_ring_state & S_RING_BOUND)) {
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+
+ ringp->s_ring_state &= ~S_RING_BOUND;
+ ringp->s_ring_bind = S_RING_BIND_NONE;
+ mutex_exit(&ringp->s_ring_lock);
+
+ thread_affinity_clear(ringp->s_ring_worker);
+}
+
+/*
+ * soft_ring_process() - enter the soft ring ringp with the mblk chain
+ * mp_chain; the tail is located by walking the chain and cnt is the
+ * number of mblks in the chain.
+ *
+ * For a chain of a single packet (i.e. mp_chain == tail), go through the
+ * fast path if no one is processing the soft ring and nothing is queued.
+ *
+ * The proc and arg for each mblk are already stored in the mblk in the
+ * appropriate places.
+ */
+/* ARGSUSED */
+void
+soft_ring_process(soft_ring_t *ringp, mblk_t *mp_chain, uint8_t tag)
+{
+ void *arg1, *arg2;
+ s_ring_proc_t proc;
+ mblk_t *tail;
+ int cnt = 1;
+
+ ASSERT(ringp != NULL);
+ ASSERT(mp_chain != NULL);
+ ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
+
+ tail = mp_chain;
+ while (tail->b_next != NULL) {
+ tail = tail->b_next;
+ cnt++;
+ }
+ mutex_enter(&ringp->s_ring_lock);
+
+ ringp->s_ring_total_inpkt += cnt;
+ if (!(ringp->s_ring_state & S_RING_PROC) &&
+ !(ringp->s_ring_type == S_RING_WORKER_ONLY)) {
+ /*
+ * See if anything is already queued. If we are the
+ * first packet, do inline processing; else queue the
+ * packet and do the drain.
+ */
+ if (ringp->s_ring_first == NULL && cnt == 1) {
+ /*
+ * Fast-path, ok to process and nothing queued.
+ */
+ ringp->s_ring_run = curthread;
+ ringp->s_ring_state |= (S_RING_PROC);
+
+ /*
+ * We are the chain of 1 packet so
+ * go through this fast path.
+ */
+ ASSERT(mp_chain->b_next == NULL);
+ proc = ringp->s_ring_upcall;
+ arg1 = ringp->s_ring_upcall_arg1;
+ arg2 = ringp->s_ring_upcall_arg2;
+
+ mutex_exit(&ringp->s_ring_lock);
+ (*proc)(arg1, arg2, mp_chain, -1);
+
+ ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
+ mutex_enter(&ringp->s_ring_lock);
+ ringp->s_ring_run = NULL;
+ ringp->s_ring_state &= ~S_RING_PROC;
+ if (ringp->s_ring_first == NULL) {
+ /*
+ * We processed inline our packet and
+ * nothing new has arrived. We are done.
+ */
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+ } else {
+ SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt);
+ }
+
+ /*
+ * We are here because either we couldn't do inline
+ * processing (because something was already queued),
+ * or we had a chain of more than one packet,
+ * or something else arrived after we were done with
+ * inline processing.
+ */
+ ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
+ ASSERT(ringp->s_ring_first != NULL);
+
+
+ soft_ring_drain(ringp, -1);
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ } else {
+ /*
+ * Queue is already being processed. Just enqueue
+ * the packet and go away.
+ */
+ if (ringp->s_ring_count > soft_ring_max_q_cnt) {
+ freemsgchain(mp_chain);
+ DLS_BUMP_STAT(dlss_soft_ring_pkt_drop, cnt);
+ } else
+ SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt);
+ if (!(ringp->s_ring_state & S_RING_PROC)) {
+ SOFT_RING_WORKER_WAKEUP(ringp);
+ } else {
+ ASSERT(ringp->s_ring_run != NULL);
+ mutex_exit(&ringp->s_ring_lock);
+ }
+ return;
+ }
+}
+
+/*
+ * PRIVATE FUNCTIONS
+ */
+
+static void
+soft_ring_fire(void *arg)
+{
+ soft_ring_t *ringp = arg;
+
+ mutex_enter(&ringp->s_ring_lock);
+ if (ringp->s_ring_tid == 0) {
+ mutex_exit(&ringp->s_ring_lock);
+ return;
+ }
+
+ ringp->s_ring_tid = 0;
+
+ if (!(ringp->s_ring_state & S_RING_PROC)) {
+ cv_signal(&ringp->s_ring_async);
+ }
+ mutex_exit(&ringp->s_ring_lock);
+}
+
+/* ARGSUSED */
+static void
+soft_ring_drain(soft_ring_t *ringp, clock_t expire)
+{
+ mblk_t *mp;
+ s_ring_proc_t proc;
+ void *arg1, *arg2;
+ timeout_id_t tid;
+
+ ringp->s_ring_run = curthread;
+ ASSERT(mutex_owned(&ringp->s_ring_lock));
+ ASSERT(!(ringp->s_ring_state & S_RING_PROC));
+
+ if ((tid = ringp->s_ring_tid) != 0)
+ ringp->s_ring_tid = 0;
+
+ ringp->s_ring_state |= S_RING_PROC;
+
+
+ proc = ringp->s_ring_upcall;
+ arg1 = ringp->s_ring_upcall_arg1;
+ arg2 = ringp->s_ring_upcall_arg2;
+
+ while (ringp->s_ring_first != NULL) {
+ mp = ringp->s_ring_first;
+ ringp->s_ring_first = NULL;
+ ringp->s_ring_last = NULL;
+ ringp->s_ring_count = 0;
+ mutex_exit(&ringp->s_ring_lock);
+
+ if (tid != 0) {
+ (void) untimeout(tid);
+ tid = 0;
+ }
+
+ (*proc)(arg1, arg2, mp, -1);
+
+ mutex_enter(&ringp->s_ring_lock);
+ }
+
+ ringp->s_ring_state &= ~S_RING_PROC;
+ ringp->s_ring_run = NULL;
+}
+
+static void
+soft_ring_worker(soft_ring_t *ringp)
+{
+ kmutex_t *lock = &ringp->s_ring_lock;
+ kcondvar_t *async = &ringp->s_ring_async;
+ callb_cpr_t cprinfo;
+
+ CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "soft_ring");
+ mutex_enter(lock);
+
+ for (;;) {
+ while (ringp->s_ring_first == NULL ||
+ (ringp->s_ring_state & S_RING_PROC)) {
+ CALLB_CPR_SAFE_BEGIN(&cprinfo);
+ if (ringp->s_ring_state & S_RING_DESTROY)
+ goto destroy;
+still_wait:
+ cv_wait(async, lock);
+ if (ringp->s_ring_state & S_RING_DESTROY) {
+destroy:
+ if (ringp->s_ring_state & S_RING_DESTROY) {
+ ringp->s_ring_state |= S_RING_DEAD;
+ CALLB_CPR_EXIT(&cprinfo);
+ thread_exit();
+ }
+ }
+ if (ringp->s_ring_state & S_RING_PROC) {
+ goto still_wait;
+ }
+ CALLB_CPR_SAFE_END(&cprinfo, lock);
+ }
+ soft_ring_drain(ringp, -1);
+ }
+}
+
+void
+dls_soft_ring_rx_set(dls_channel_t dc, dls_rx_t rx, void *arg, int type)
+{
+ dls_impl_t *dip = (dls_impl_t *)dc;
+
+ rw_enter(&(dip->di_lock), RW_WRITER);
+ dip->di_soft_ring_fanout_type = type;
+ dip->di_rx = rx;
+ if (type == SOFT_RING_NONE)
+ dip->di_rx_arg = arg;
+ else
+ dip->di_rx_arg = (void *)dip;
+ rw_exit(&(dip->di_lock));
+}
+
+boolean_t
+dls_soft_ring_workers(dls_channel_t dc)
+{
+ dls_impl_t *dip = (dls_impl_t *)dc;
+ boolean_t ret = B_FALSE;
+
+ rw_enter(&(dip->di_lock), RW_WRITER);
+ if (dip->di_soft_ring_list != NULL)
+ ret = B_TRUE;
+ rw_exit(&(dip->di_lock));
+ return (ret);
+}
+
+void
+dls_soft_ring_disable(dls_channel_t dc)
+{
+ dls_impl_t *dip = (dls_impl_t *)dc;
+
+ rw_enter(&(dip->di_lock), RW_WRITER);
+ if (dip->di_soft_ring_list != NULL)
+ soft_ring_stop_workers(dip->di_soft_ring_list,
+ dip->di_soft_ring_size);
+ rw_exit(&(dip->di_lock));
+}
+
+boolean_t
+dls_soft_ring_enable(dls_channel_t dc, dl_capab_dls_t *soft_ringp)
+{
+ dls_impl_t *dip;
+ int i;
+ soft_ring_t **softring_set;
+ soft_ring_t *softring;
+ mac_rx_fifo_t mrf;
+ char name[64];
+
+ dip = (dls_impl_t *)dc;
+
+ rw_enter(&(dip->di_lock), RW_WRITER);
+
+ if (dip->di_soft_ring_list != NULL) {
+ soft_ring_stop_workers(dip->di_soft_ring_list,
+ dip->di_soft_ring_size);
+ soft_ring_set_destroy(dip->di_soft_ring_list,
+ dip->di_soft_ring_size);
+ dip->di_soft_ring_list = NULL;
+ }
+ dip->di_soft_ring_size = 0;
+
+ bzero(name, sizeof (name));
+ (void) snprintf(name, sizeof (name), "dls_soft_ring_%p", dip);
+ dip->di_soft_ring_list = soft_ring_set_create(name, S_RING_BIND_NONE,
+ 0, S_RING_WORKER_ONLY, minclsyspri, soft_ringp->dls_ring_cnt);
+
+ if (dip->di_soft_ring_list == NULL) {
+ rw_exit(&(dip->di_lock));
+ return (B_FALSE);
+ }
+
+ dip->di_soft_ring_size = soft_ringp->dls_ring_cnt;
+ softring_set = dip->di_soft_ring_list;
+
+ dip->di_ring_add = (mac_resource_add_t)soft_ringp->dls_ring_add;
+ dip->di_rx = (dls_rx_t)soft_ringp->dls_ring_assign;
+ dip->di_rx_arg = (void *)soft_ringp->dls_rx_handle;
+
+ bzero(&mrf, sizeof (mac_rx_fifo_t));
+ mrf.mrf_type = MAC_RX_FIFO;
+ for (i = 0; i < soft_ringp->dls_ring_cnt; i++) {
+ softring = softring_set[i];
+ mrf.mrf_arg = softring;
+ softring->s_ring_upcall_arg1 =
+ (void *)soft_ringp->dls_rx_handle;
+ softring->s_ring_upcall_arg2 =
+ dip->di_ring_add((void *)soft_ringp->dls_rx_handle,
+ (mac_resource_t *)&mrf);
+ softring->s_ring_upcall =
+ (s_ring_proc_t)soft_ringp->dls_rx;
+ }
+
+ /*
+ * Note that soft_ring is enabled. This prevents further DLIOCHDRINFO
+ * ioctls from overwriting the receive function pointer.
+ */
+ rw_exit(&(dip->di_lock));
+ return (B_TRUE);
+}
+
+#define COMPUTE_HASH(key, sz) (key % sz)
+
+/* ARGSUSED */
+void
+dls_ether_soft_ring_fanout(void *rx_handle, void *rx_cookie, mblk_t *mp_chain,
+ size_t hdrlen)
+{
+ ipha_t *ipha = (ipha_t *)mp_chain->b_rptr;
+ dls_impl_t *dip = (dls_impl_t *)rx_handle;
+ int indx;
+ int key;
+ int hdr_len;
+ uint16_t port1, port2;
+
+ switch (dip->di_soft_ring_fanout_type) {
+ case SOFT_RING_SRC_HASH:
+ /*
+ * We get a chain of packets from the same remote. Make
+ * sure the same remote goes to the same ring.
+ */
+ hdr_len = IPH_HDR_LENGTH(ipha);
+ port1 = *((uint16_t *)(&mp_chain->b_rptr[hdr_len]));
+ port2 = *((uint16_t *)(&mp_chain->b_rptr[hdr_len+2]));
+ key = port1 + port2;
+ indx = COMPUTE_HASH(key, dip->di_soft_ring_size);
+ soft_ring_process(dip->di_soft_ring_list[indx],
+ mp_chain, 0);
+ break;
+ case SOFT_RING_RND_ROBIN:
+ case SOFT_RING_RANDOM:
+ /*
+ * Just send it to any possible soft ring
+ */
+ soft_ring_process(dip->di_soft_ring_list[
+ lbolt % dip->di_soft_ring_size], mp_chain, 0);
+ break;
+ }
+}
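For the SOFT_RING_SRC_HASH case above, the two 16-bit values immediately following the IP header (the transport source and destination ports) are summed and reduced modulo the ring count, so packets from the same remote endpoint keep hashing to the same ring. A worked sketch with invented values:

	static int
	example_src_hash(void)
	{
		uint16_t sport = 2049, dport = 80;	/* invented values */

		/* (2049 + 80) % 4 == 1: same remote -> same soft ring */
		return (COMPUTE_HASH(sport + dport, 4));
	}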
diff --git a/usr/src/uts/common/io/dls/dls_stat.c b/usr/src/uts/common/io/dls/dls_stat.c
index d0025c7fb9..3d58c49cbd 100644
--- a/usr/src/uts/common/io/dls/dls_stat.c
+++ b/usr/src/uts/common/io/dls/dls_stat.c
@@ -121,7 +121,7 @@ i_dls_stat_update(kstat_t *ksp, int rw)
*/
void
-dls_stat_create(dls_vlan_t *dvp)
+dls_mac_stat_create(dls_vlan_t *dvp)
{
dls_link_t *dlp = dvp->dv_dlp;
char module[IFNAMSIZ];
@@ -172,7 +172,7 @@ done:
}
void
-dls_stat_destroy(dls_vlan_t *dvp)
+dls_mac_stat_destroy(dls_vlan_t *dvp)
{
kstat_delete(dvp->dv_ksp);
dvp->dv_ksp = NULL;
diff --git a/usr/src/uts/common/io/dls/dls_vlan.c b/usr/src/uts/common/io/dls/dls_vlan.c
index 872dc29522..9e20730ea6 100644
--- a/usr/src/uts/common/io/dls/dls_vlan.c
+++ b/usr/src/uts/common/io/dls/dls_vlan.c
@@ -305,7 +305,7 @@ again:
}
if (dvp->dv_ref++ == 0)
- dls_stat_create(dvp);
+ dls_mac_stat_create(dvp);
*dvpp = dvp;
done:
@@ -334,7 +334,7 @@ dls_vlan_rele(dls_vlan_t *dvp)
mac_stop(dlp->dl_mh);
dls_mac_rele(dlp);
if (--dvp->dv_ref == 0) {
- dls_stat_destroy(dvp);
+ dls_mac_stat_destroy(dvp);
/*
* Tagged vlans get destroyed when dv_ref drops
* to 0. We need to copy dv_name here because
diff --git a/usr/src/uts/common/io/mac/mac.c b/usr/src/uts/common/io/mac/mac.c
index 288bb79298..53bcc5fc94 100644
--- a/usr/src/uts/common/io/mac/mac.c
+++ b/usr/src/uts/common/io/mac/mac.c
@@ -1297,7 +1297,10 @@ mac_resource_add(mac_t *mp, mac_resource_t *mrp)
add = mip->mi_resource_add;
arg = mip->mi_resource_add_arg;
- mrh = add(arg, mrp);
+ if (add != NULL)
+ mrh = add(arg, mrp);
+ else
+ mrh = NULL;
rw_exit(&mip->mi_resource_lock);
return (mrh);
diff --git a/usr/src/uts/common/os/space.c b/usr/src/uts/common/os/space.c
index 071d12bc39..0a61cae511 100644
--- a/usr/src/uts/common/os/space.c
+++ b/usr/src/uts/common/os/space.c
@@ -20,7 +20,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -371,3 +371,21 @@ space_free(char *key)
#include <sys/crc32.h>
const uint32_t crc32_table[256] = { CRC32_TABLE };
+
+
+/*
+ * We need to fan out the load from a NIC which can overwhelm a single
+ * CPU. A 10Gb NIC interrupting a single CPU is a good example.
+ * Instead of fanning out to random CPUs, it is a big performance
+ * win if you can fan out to the threads on the same core (Niagara)
+ * that is taking the interrupts.
+ *
+ * We need a better mechanism to figure out the other threads on
+ * the same core, or cores on the same chip, which share caches, etc.,
+ * but for the time being this will suffice.
+ */
+#define NUMBER_OF_THREADS_PER_CPU 4
+uint_t ip_threads_per_cpu = NUMBER_OF_THREADS_PER_CPU;
+
+/* Global flag to enable/disable soft ring facility */
+boolean_t ip_squeue_soft_ring = B_FALSE;
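A hedged sketch of how a consumer (for example the ip_squeue.c changes made elsewhere in this commit) might size its soft ring fanout from these tunables; the actual policy lives in the IP module and this function name is made up.

	extern uint_t ip_threads_per_cpu;
	extern boolean_t ip_squeue_soft_ring;

	static uint_t
	example_soft_ring_fanout_cnt(void)
	{
		if (!ip_squeue_soft_ring)
			return (0);		/* facility disabled */
		return (ip_threads_per_cpu);	/* e.g. one ring per strand */
	}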
diff --git a/usr/src/uts/common/sys/dld_impl.h b/usr/src/uts/common/sys/dld_impl.h
index cafc32433a..d7d869189d 100644
--- a/usr/src/uts/common/sys/dld_impl.h
+++ b/usr/src/uts/common/sys/dld_impl.h
@@ -163,6 +163,7 @@ struct dld_str {
* IP polling is operational if this flag is set.
*/
boolean_t ds_polling;
+ boolean_t ds_soft_ring;
/*
* State of DLPI user: may be active (regular network layer),
@@ -194,7 +195,9 @@ struct dld_str {
*/
kmutex_t ds_thr_lock;
uint_t ds_thr;
+ taskqid_t ds_task_id;
mblk_t *ds_detach_req;
+ mblk_t *ds_unbind_req;
} dld_str;
/*
diff --git a/usr/src/uts/common/sys/dlpi.h b/usr/src/uts/common/sys/dlpi.h
index 1169d68d68..b6f3715289 100644
--- a/usr/src/uts/common/sys/dlpi.h
+++ b/usr/src/uts/common/sys/dlpi.h
@@ -577,7 +577,9 @@ union DL_qos_types {
#define DL_CAPAB_ZEROCOPY 0x05 /* Zero-copy capability */
/* dl_data is dl_capab_zerocopy_t */
#define DL_CAPAB_POLL 0x06 /* Polling capability */
- /* dl_data is dl_capab_poll_t */
+ /* dl_data is dl_capab_dls_t */
+#define DL_CAPAB_SOFT_RING 0x07 /* Soft ring capable */
+ /* dl_data is dl_capab_dls_t */
typedef struct {
t_uscalar_t dl_cap; /* capability type */
@@ -696,37 +698,57 @@ typedef struct {
#ifdef _KERNEL
/*
- * This defines the DL_CAPAB_POLL capability. Currently it provides a
- * mechanism for IP to exchange function pointers with a gldv3-based driver
- * to enable streams-bypassing data-paths and interrupt blanking. True polling
- * support will be added in the future.
+ * This structure is used by the DL_CAPAB_POLL and DL_CAPAB_SOFT_RING
+ * capabilities. It provides a mechanism for IP to exchange function
+ * pointers with a gldv3-based driver, enabling it to bypass the streams
+ * data-paths. The DL_CAPAB_POLL mechanism provides a way to blank
+ * interrupts. Note: true polling support will be added in the future.
+ * DL_CAPAB_SOFT_RING provides a mechanism to create soft rings at the
+ * dls layer.
*/
-typedef struct dl_capab_poll_s {
- t_uscalar_t poll_version;
- t_uscalar_t poll_flags;
+typedef struct dl_capab_dls_s {
+ t_uscalar_t dls_version;
+ t_uscalar_t dls_flags;
/* DLD provided information */
- uintptr_t poll_tx_handle;
- uintptr_t poll_tx;
+ uintptr_t dls_tx_handle;
+ uintptr_t dls_tx;
+ uintptr_t dls_ring_change_status;
+ uintptr_t dls_ring_bind;
+ uintptr_t dls_ring_unbind;
/* IP provided information */
- uintptr_t poll_rx_handle;
- uintptr_t poll_rx;
- uintptr_t poll_ring_add;
+ uintptr_t dls_rx_handle;
+ uintptr_t dls_ring_assign;
+ uintptr_t dls_rx;
+ uintptr_t dls_ring_add;
+ t_uscalar_t dls_ring_cnt;
- dl_mid_t poll_mid; /* module ID */
-} dl_capab_poll_t;
+ dl_mid_t dls_mid; /* module ID */
+} dl_capab_dls_t;
#define POLL_CURRENT_VERSION 0x01
#define POLL_VERSION_1 0x01
-/*
- * Values for poll_flags
- */
+#define SOFT_RING_VERSION_1 0x01
+
+/* Values for poll_flags */
#define POLL_ENABLE 0x01 /* Set to enable polling */
/* capability */
#define POLL_CAPABLE 0x02 /* Polling ability exists */
-#define POLL_DISABLE 0x04 /* Disable Polling */
+#define POLL_DISABLE 0x03 /* Disable Polling */
+
+/* Values for soft_ring_flags */
+#define SOFT_RING_ENABLE 0x04 /* Set to enable soft_ring */
+ /* capability */
+#define SOFT_RING_CAPABLE 0x05 /* Soft_Ring ability exists */
+#define SOFT_RING_DISABLE 0x06 /* Disable Soft_Ring */
+
+/* Soft_Ring fanout types (used by soft_ring_change_status) */
+#define SOFT_RING_NONE 0x00
+#define SOFT_RING_RANDOM 0x01
+#define SOFT_RING_SRC_HASH 0x02
+#define SOFT_RING_RND_ROBIN 0x03
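A hedged sketch of how the IP side might fill dl_capab_dls_t before sending a DL_CAPABILITY_REQ with SOFT_RING_ENABLE; all handles, upcall names and the helper itself are made up for illustration, only the structure fields and constants above are real.

	static void
	fill_soft_ring_capab(dl_capab_dls_t *dlsp, uintptr_t rx_handle,
	    uintptr_t ring_assign, uintptr_t rx, uintptr_t ring_add,
	    t_uscalar_t nrings)
	{
		bzero(dlsp, sizeof (dl_capab_dls_t));
		dlsp->dls_version = SOFT_RING_VERSION_1;
		dlsp->dls_flags = SOFT_RING_ENABLE;
		dlsp->dls_rx_handle = rx_handle;	/* e.g. the ill */
		dlsp->dls_ring_assign = ring_assign;	/* IP's assign upcall */
		dlsp->dls_rx = rx;			/* IP's receive upcall */
		dlsp->dls_ring_add = ring_add;		/* resource-add upcall */
		dlsp->dls_ring_cnt = nrings;		/* requested fanout */
	}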
#endif /* _KERNEL */
diff --git a/usr/src/uts/common/sys/dls_impl.h b/usr/src/uts/common/sys/dls_impl.h
index 69f9b4b3dc..cf26f5cb21 100644
--- a/usr/src/uts/common/sys/dls_impl.h
+++ b/usr/src/uts/common/sys/dls_impl.h
@@ -35,6 +35,8 @@
#include <sys/modhash.h>
#include <sys/kstat.h>
#include <net/if.h>
+#include <sys/dlpi.h>
+#include <sys/dls_soft_ring.h>
#ifdef __cplusplus
extern "C" {
@@ -99,6 +101,7 @@ struct dls_impl_s {
dls_multicst_addr_t *di_dmap;
dls_rx_t di_rx;
void *di_rx_arg;
+ mac_resource_add_t di_ring_add;
const mac_txinfo_t *di_txinfo;
boolean_t di_bound;
boolean_t di_removing;
@@ -106,6 +109,9 @@ struct dls_impl_s {
uint8_t di_unicst_addr[MAXADDRLEN];
dls_priv_header_t di_header;
dls_priv_header_info_t di_header_info;
+ soft_ring_t **di_soft_ring_list;
+ uint_t di_soft_ring_size;
+ int di_soft_ring_fanout_type;
};
struct dls_head_s {
@@ -123,8 +129,8 @@ extern void dls_link_remove(dls_link_t *, dls_impl_t *);
extern int dls_mac_hold(dls_link_t *);
extern void dls_mac_rele(dls_link_t *);
-extern void dls_stat_create(dls_vlan_t *);
-extern void dls_stat_destroy(dls_vlan_t *);
+extern void dls_mac_stat_create(dls_vlan_t *);
+extern void dls_mac_stat_destroy(dls_vlan_t *);
extern void dls_vlan_init(void);
extern int dls_vlan_fini(void);
diff --git a/usr/src/uts/common/sys/dls_soft_ring.h b/usr/src/uts/common/sys/dls_soft_ring.h
new file mode 100644
index 0000000000..96a5e7ccc4
--- /dev/null
+++ b/usr/src/uts/common/sys/dls_soft_ring.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_DLS_SOFT_RING_H
+#define _SYS_DLS_SOFT_RING_H
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+#include <sys/processor.h>
+#include <sys/stream.h>
+#include <sys/squeue.h>
+
+#define S_RING_NAMELEN 64
+
+typedef void (*s_ring_proc_t)(void *, void *, mblk_t *, size_t);
+
+typedef struct soft_ring_s {
+ /* Keep the most used members 64 bytes cache-aligned */
+ kmutex_t s_ring_lock; /* lock before using any member */
+ uint16_t s_ring_type; /* processing model of the sq */
+ uint16_t s_ring_state; /* state flags and message count */
+ int s_ring_count; /* # of mblocks in soft_ring */
+ mblk_t *s_ring_first; /* first mblk chain or NULL */
+ mblk_t *s_ring_last; /* last mblk chain or NULL */
+ s_ring_proc_t s_ring_upcall; /* Upcall func pointer */
+ void *s_ring_upcall_arg1; /* upcall argument 1 */
+ void *s_ring_upcall_arg2; /* upcall argument 2 */
+ clock_t s_ring_awaken; /* time async thread was awakened */
+
+ kthread_t *s_ring_run; /* Current thread processing sq */
+ processorid_t s_ring_bind; /* processor to bind to */
+ kcondvar_t s_ring_async; /* async thread blocks on */
+ clock_t s_ring_wait; /* lbolts to wait after a fill() */
+ timeout_id_t s_ring_tid; /* timer id of pending timeout() */
+ kthread_t *s_ring_worker; /* kernel thread id */
+ char s_ring_name[S_RING_NAMELEN + 1];
+ uint32_t s_ring_total_inpkt;
+} soft_ring_t;
+
+
+/*
+ * type flags - combination allowed to process and drain the queue
+ */
+#define S_RING_WORKER_ONLY 0x0001 /* Worker thread only */
+#define S_RING_ANY 0x0002 /* Any thread can process the queue */
+
+/*
+ * State flags.
+ */
+#define S_RING_PROC 0x0001 /* being processed */
+#define S_RING_WORKER 0x0002 /* worker thread */
+#define S_RING_BOUND 0x0004 /* Worker thread is bound */
+#define S_RING_DESTROY 0x0008 /* Ring is being destroyed */
+#define S_RING_DEAD 0x0010 /* Worker thread is no more */
+
+/*
+ * arguments for processors to bind to
+ */
+#define S_RING_BIND_NONE -1
+
+/*
+ * Structure for dls statistics
+ */
+struct dls_kstats {
+ kstat_named_t dlss_soft_ring_pkt_drop;
+};
+
+extern struct dls_kstats dls_kstat;
+
+#define DLS_BUMP_STAT(x, y) (dls_kstat.x.value.ui32 += y)
+
+extern void soft_ring_init(void);
+extern soft_ring_t *soft_ring_create(char *, processorid_t, clock_t,
+ uint_t, pri_t);
+extern soft_ring_t **soft_ring_set_create(char *, processorid_t, clock_t,
+ uint_t, pri_t, int);
+extern void soft_ring_set_destroy(soft_ring_t **, int);
+extern void soft_ring_process(soft_ring_t *, mblk_t *, uint8_t);
+extern void soft_ring_bind(void *, processorid_t);
+extern void soft_ring_unbind(void *);
+extern void dls_ether_soft_ring_fanout(void *,
+ void *, mblk_t *, size_t);
+extern boolean_t dls_soft_ring_enable(dls_channel_t, dl_capab_dls_t *);
+extern void dls_soft_ring_disable(dls_channel_t);
+extern boolean_t dls_soft_ring_workers(dls_channel_t);
+extern void dls_soft_ring_rx_set(dls_channel_t, dls_rx_t, void *, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DLS_SOFT_RING_H */
diff --git a/usr/src/uts/sun4v/os/mach_startup.c b/usr/src/uts/sun4v/os/mach_startup.c
index 179a830d60..986a2fa488 100644
--- a/usr/src/uts/sun4v/os/mach_startup.c
+++ b/usr/src/uts/sun4v/os/mach_startup.c
@@ -289,6 +289,21 @@ mach_hw_copy_limit(void)
}
/*
+ * We need to enable soft ring functionality on the Niagara platform since
+ * one strand can't handle interrupts for a 1Gb NIC. Set the tunable
+ * ip_squeue_soft_ring by default on this platform. We can also set
+ * ip_threads_per_cpu to track the number of threads per core. The variables
+ * themselves are defined in space.c and used by the IP module.
+ */
+extern uint_t ip_threads_per_cpu;
+extern boolean_t ip_squeue_soft_ring;
+void
+startup_platform(void)
+{
+ ip_squeue_soft_ring = B_TRUE;
+}
+
+/*
* This function sets up hypervisor traptrace buffer
* This routine is called by the boot cpu only
*/