Diffstat (limited to 'usr/src/uts/common/inet/ip/ip_squeue.c')
-rw-r--r--  usr/src/uts/common/inet/ip/ip_squeue.c  |  437
1 file changed, 390 insertions(+), 47 deletions(-)
diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c
index ae7731ac7b..78db295c78 100644
--- a/usr/src/uts/common/inet/ip/ip_squeue.c
+++ b/usr/src/uts/common/inet/ip/ip_squeue.c
@@ -125,13 +125,16 @@
* We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
* mapping between squeue and NIC (or Rx ring) for performance reasons so
* each squeue can uniquely own a NIC or a Rx ring and do polling
- * (PSARC 2004/630). So we allow up to MAX_THREAD_PER_CPU squeues per CPU.
- * We start by creating MIN_THREAD_PER_CPU squeues per CPU but more squeues
+ * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
+ * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
* can be created dynamically as needed.
*/
-#define MAX_THREAD_PER_CPU 32
-#define MIN_THREAD_PER_CPU 1
-uint_t ip_threads_per_cpu = MIN_THREAD_PER_CPU;
+#define MAX_SQUEUES_PER_CPU 32
+#define MIN_SQUEUES_PER_CPU 1
+uint_t ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
+
+#define IP_NUM_SOFT_RINGS 2
+uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;
/*
* List of all created squeue sets. The size is protected by cpu_lock
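Note: ip_squeues_per_cpu is a plain global tunable, so on Solaris-derived systems it can presumably be overridden at boot through an /etc/system entry such as the purely illustrative line below; whatever value is chosen is clamped back into the [MIN_SQUEUES_PER_CPU, MAX_SQUEUES_PER_CPU] range by ip_squeue_init() later in this patch:

    set ip:ip_squeues_per_cpu = 2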
@@ -155,11 +158,12 @@ static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
+static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);
#define CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
/*
- * Create squeue set containing ip_threads_per_cpu number of squeues
+ * Create squeue set containing ip_squeues_per_cpu number of squeues
* for this CPU and bind them all to the CPU.
*/
static squeue_set_t *
@@ -186,13 +190,13 @@ ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
}
sqs = kmem_zalloc(sizeof (squeue_set_t) +
- (sizeof (squeue_t *) * MAX_THREAD_PER_CPU), KM_SLEEP);
+ (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
sqs->sqs_list = (squeue_t **)&sqs[1];
- sqs->sqs_max_size = MAX_THREAD_PER_CPU;
+ sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
sqs->sqs_bind = id;
- for (i = 0; i < ip_threads_per_cpu; i++) {
+ for (i = 0; i < ip_squeues_per_cpu; i++) {
bzero(sqname, sizeof (sqname));
(void) snprintf(sqname, sizeof (sqname),
@@ -202,6 +206,12 @@ ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
minclsyspri);
+ /*
+ * The first squeue in each squeue_set is the DEFAULT
+ * squeue.
+ */
+ sqp->sq_state |= SQS_DEFAULT;
+
ASSERT(sqp != NULL);
squeue_profile_enable(sqp);
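The kmem_zalloc() call in this hunk carves the per-set array of squeue pointers out of the same allocation as the squeue_set_t itself, which is why sqs_list is pointed at &sqs[1]. A minimal user-land sketch of that layout trick, using hypothetical stand-in types rather than the kernel structures:

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct sqset {
            int     sqs_size;       /* squeues currently in the set */
            int     sqs_max_size;   /* capacity of the pointer array */
            void    **sqs_list;     /* pointer array placed right after the struct */
    } sqset_t;

    static sqset_t *
    sqset_alloc(int max)
    {
            /* one allocation: the struct followed by 'max' pointer slots */
            sqset_t *sqs = calloc(1, sizeof (sqset_t) + max * sizeof (void *));

            if (sqs != NULL) {
                    sqs->sqs_list = (void **)&sqs[1];   /* the &sqs[1] trick */
                    sqs->sqs_max_size = max;
            }
            return (sqs);
    }

    int
    main(void)
    {
            sqset_t *sqs = sqset_alloc(32);

            if (sqs != NULL) {
                    printf("struct at %p, pointer array at %p\n",
                        (void *)sqs, (void *)sqs->sqs_list);
                    free(sqs);
            }
            return (0);
    }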
@@ -229,10 +239,10 @@ ip_squeue_init(void (*callback)(squeue_t *))
ASSERT(sqset_global_list == NULL);
- if (ip_threads_per_cpu < MIN_THREAD_PER_CPU)
- ip_threads_per_cpu = MIN_THREAD_PER_CPU;
- else if (ip_threads_per_cpu > MAX_THREAD_PER_CPU)
- ip_threads_per_cpu = MAX_THREAD_PER_CPU;
+ if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
+ ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
+ else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
+ ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;
ip_squeue_create_callback = callback;
squeue_init();
@@ -293,6 +303,10 @@ ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
mutex_exit(&sqp->sq_lock);
ill = ring->rr_ill;
+ if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
+ ASSERT(ring->rr_handle != NULL);
+ ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
+ }
/*
* Cleanup the ring
@@ -338,15 +352,20 @@ ip_squeue_extend(void *arg)
ill_t *ill = sq_arg->ip_taskq_ill;
ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
cpu_t *intr_cpu = sq_arg->ip_taskq_cpu;
- squeue_set_t *sqs;
+ squeue_set_t *sqs;
squeue_t *sqp = NULL;
- char sqname[64];
- int i;
ASSERT(ill != NULL);
ASSERT(ill_rx_ring != NULL);
kmem_free(arg, sizeof (ip_taskq_arg_t));
+ /*
+ * Make sure the CPU that originally took the interrupt still
+ * exists.
+ */
+ if (!CPU_ISON(intr_cpu))
+ intr_cpu = CPU;
+
sqs = intr_cpu->cpu_squeue_set;
/*
@@ -356,10 +375,337 @@ ip_squeue_extend(void *arg)
* is sequential, we need to hold the ill_lock.
*/
mutex_enter(&ill->ill_lock);
+ sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
+ if (sqp == NULL) {
+ /*
+ * We hit the max limit of squeues allowed per CPU.
+ * Assign this rx_ring to the DEFAULT squeue of the
+ * interrupted CPU, but the squeue will not manage
+ * the ring. Also print a warning.
+ */
+ cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
+ "has max number of squeues. System performance might "
+ "become suboptimal\n", sqs->sqs_bind, (void *)sqs);
+
+ /* the first squeue in the list is the default squeue */
+ sqp = sqs->sqs_list[0];
+ ASSERT(sqp != NULL);
+ ill_rx_ring->rr_sqp = sqp;
+ ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
+
+ mutex_exit(&ill->ill_lock);
+ ill_waiter_dcr(ill);
+ return;
+ }
+
+ ASSERT(MUTEX_HELD(&sqp->sq_lock));
+ sqp->sq_rx_ring = ill_rx_ring;
+ ill_rx_ring->rr_sqp = sqp;
+ ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
+
+ sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
+ mutex_exit(&sqp->sq_lock);
+
+ mutex_exit(&ill->ill_lock);
+
+ /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
+ ill_waiter_dcr(ill);
+}
+
+/*
+ * Do a Rx ring to squeue binding. Find a unique squeue that is not
+ * managing a receive ring. If no such squeue exists, dynamically
+ * create a new one in the squeue set.
+ *
+ * The function runs via the system taskq. The ill passed as an
+ * argument can't go away since we hold a ref. The lock order is
+ * ill_lock -> sqs_lock -> sq_lock.
+ *
+ * If we end up binding a Rx ring to a squeue attached to an offline
+ * CPU, there is no need to check for that case, because squeues are
+ * never destroyed once created.
+ */
+/* ARGSUSED */
+static void
+ip_squeue_soft_ring_affinity(void *arg)
+{
+ ip_taskq_arg_t *sq_arg = (ip_taskq_arg_t *)arg;
+ ill_t *ill = sq_arg->ip_taskq_ill;
+ ill_dls_capab_t *ill_soft_ring = ill->ill_dls_capab;
+ ill_rx_ring_t *ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
+ cpu_t *intr_cpu = sq_arg->ip_taskq_cpu;
+ cpu_t *bind_cpu;
+ int cpu_id = intr_cpu->cpu_id;
+ int min_cpu_id, max_cpu_id;
+ boolean_t enough_uniq_cpus = B_FALSE;
+ boolean_t enough_cpus = B_FALSE;
+ squeue_set_t *sqs, *last_sqs;
+ squeue_t *sqp = NULL;
+ int i, j;
+
+ ASSERT(ill != NULL);
+ kmem_free(arg, sizeof (ip_taskq_arg_t));
+
+ /*
+ * Make sure the CPU that originally took the interrupt still
+ * exists.
+ */
+ if (!CPU_ISON(intr_cpu)) {
+ intr_cpu = CPU;
+ cpu_id = intr_cpu->cpu_id;
+ }
+
+ /*
+ * If this ill represents link aggregation, then there might be
+ * multiple NICs trying to register themselves at the same time,
+ * and in order to ensure that the test and assignment of free
+ * rings is sequential, we need to hold the ill_lock.
+ */
+ mutex_enter(&ill->ill_lock);
+
+ if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
+ mutex_exit(&ill->ill_lock);
+ return;
+ }
+ /*
+ * We need to fan out the interrupts from the NIC. We do that by
+ * telling the driver underneath to create soft rings and use
+ * worker threads (if the driver advertised the SOFT_RING
+ * capability). It is still a big performance win if we can fan
+ * out to the threads on the same core that is taking the
+ * interrupts.
+ *
+ * Since we don't know the interrupt to CPU binding, we don't
+ * assign any squeues or affinity to worker threads in the NIC.
+ * At the time of the first interrupt, we know which CPU is
+ * taking interrupts and try to find other threads on the same
+ * core. Assuming ip_squeues_per_cpu is correct and CPUs are
+ * numbered sequentially for each core (XXX need something better
+ * than this in the future), find the lowest- and highest-numbered
+ * thread for that core.
+ *
+ * If we have one more thread per core than the number of soft
+ * rings, then don't assign any worker threads to the H/W thread
+ * (cpu) taking interrupts (capability negotiation tries to
+ * ensure this).
+ *
+ * If the number of threads per core is the same as the number of
+ * soft rings, then assign the worker affinity and squeue to
+ * the same cpu.
+ *
+ * Otherwise, just fan out to higher-numbered CPUs starting from
+ * the interrupted CPU.
+ */
+
+ min_cpu_id = (cpu_id / ip_squeues_per_cpu) * ip_squeues_per_cpu;
+ max_cpu_id = min_cpu_id + ip_squeues_per_cpu;
+
+ cmn_err(CE_CONT, "soft_ring_affinity: min/max/intr = %d/%d/%d\n",
+ min_cpu_id, max_cpu_id, (int)intr_cpu->cpu_id);
+
+ /*
+ * Quickly check that there are enough CPUs present for fanout
+ * and that max_cpu_id does not exceed the id of the last active
+ * CPU. We use the cpu_id stored in the last squeue_set to get
+ * an idea. The scheme is by no means perfect since it doesn't
+ * take into account CPU DR operations or the fact that the
+ * interrupt binding itself might change. An ideal scenario
+ * would be to ensure that interrupts run on CPUs by themselves
+ * and that worker threads never have affinity to those CPUs. If
+ * an interrupt moves to a CPU which had a worker thread, the
+ * affinity should be changed. Callbacks similar to the CPU
+ * offline ones are probably needed to make this work perfectly.
+ */
+ last_sqs = sqset_global_list[sqset_global_size - 1];
+ if (ip_squeues_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
+ if ((max_cpu_id - min_cpu_id) >
+ ill_soft_ring->ill_dls_soft_ring_cnt)
+ enough_uniq_cpus = B_TRUE;
+ else if ((max_cpu_id - min_cpu_id) >=
+ ill_soft_ring->ill_dls_soft_ring_cnt)
+ enough_cpus = B_TRUE;
+ }
+
+ j = 0;
+ for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
+ if (enough_uniq_cpus) {
+ if ((min_cpu_id + i) == cpu_id) {
+ j++;
+ continue;
+ }
+ bind_cpu = cpu[min_cpu_id + i];
+ } else if (enough_cpus) {
+ bind_cpu = cpu[min_cpu_id + i];
+ } else {
+ /* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
+ bind_cpu = cpu[(cpu_id + i) % ncpus];
+ }
+
+ /*
+ * Check that the CPU actually exists and is active. If not,
+ * use the interrupted CPU; ip_find_unused_squeue() will
+ * find the right CPU to fan out to anyway.
+ */
+ if (!CPU_ISON(bind_cpu))
+ bind_cpu = intr_cpu;
+
+ sqs = bind_cpu->cpu_squeue_set;
+ ASSERT(sqs != NULL);
+ ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
+
+ sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
+ if (sqp == NULL) {
+ /*
+ * We hit the max limit of squeues allowed per CPU.
+ * Assign this rx_ring to the DEFAULT squeue of the
+ * interrupted CPU, but the squeue will not manage
+ * the ring. Also print a warning.
+ */
+ cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
+ "%d/%p already has max number of squeues. System "
+ "performance might become suboptimal\n",
+ sqs->sqs_bind, (void *)sqs);
+
+ /* the first squeue in the list is the default squeue */
+ sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
+ ASSERT(sqp != NULL);
+
+ ill_rx_ring->rr_sqp = sqp;
+ ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
+ continue;
+ }
+ ASSERT(MUTEX_HELD(&sqp->sq_lock));
+ ill_rx_ring->rr_sqp = sqp;
+ sqp->sq_rx_ring = ill_rx_ring;
+ ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
+ sqp->sq_state |= SQS_ILL_BOUND;
+
+ /* assign affinity to soft ring */
+ if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
+ ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
+ sqp->sq_bind);
+ }
+ mutex_exit(&sqp->sq_lock);
+
+ cmn_err(CE_CONT, "soft_ring_affinity: ring = %d, bind = %d\n",
+ i - j, sqp->sq_bind);
+ }
+ mutex_exit(&ill->ill_lock);
+
+ ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
+ SOFT_RING_SRC_HASH);
+
+ /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
+ ill_waiter_dcr(ill);
+}
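To make the fanout arithmetic above concrete, here is a small stand-alone sketch of the core-range computation and the "enough unique CPUs" walk, with made-up values (4 squeues per CPU, 2 soft rings, interrupt taken on CPU 5); it mirrors the min_cpu_id/max_cpu_id logic of ip_squeue_soft_ring_affinity() but is not kernel code:

    #include <stdio.h>

    int
    main(void)
    {
            int ip_squeues_per_cpu = 4;     /* assumed squeues (threads) per core */
            int soft_ring_cnt = 2;          /* assumed number of soft rings */
            int cpu_id = 5;                 /* CPU that took the interrupt */
            int min_cpu_id, max_cpu_id, i, j;

            /* lowest and highest CPU ids belonging to the interrupted core */
            min_cpu_id = (cpu_id / ip_squeues_per_cpu) * ip_squeues_per_cpu;
            max_cpu_id = min_cpu_id + ip_squeues_per_cpu;
            printf("core range [%d, %d), interrupted CPU %d\n",
                min_cpu_id, max_cpu_id, cpu_id);

            if ((max_cpu_id - min_cpu_id) > soft_ring_cnt) {
                    /* enough unique CPUs: skip the interrupted CPU itself */
                    j = 0;
                    for (i = 0; i < soft_ring_cnt + j; i++) {
                            if (min_cpu_id + i == cpu_id) {
                                    j++;
                                    continue;
                            }
                            printf("soft ring %d -> CPU %d\n",
                                i - j, min_cpu_id + i);
                    }
            }
            return (0);
    }

With these values the two soft rings land on CPUs 4 and 6, leaving CPU 5 free to keep taking interrupts.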
+
+void
+ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
+    mblk_t *mp_chain, size_t hdrlen)
+{
+ ip_taskq_arg_t *taskq_arg;
+ boolean_t refheld;
+
+ ASSERT(servicing_interrupt());
+ ASSERT(ip_ring == NULL);
+
+ mutex_enter(&ill->ill_lock);
+ if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
+ taskq_arg = (ip_taskq_arg_t *)
+ kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);
+
+ if (taskq_arg == NULL)
+ goto out;
+
+ taskq_arg->ip_taskq_ill = ill;
+ taskq_arg->ip_taskq_ill_rx_ring = ip_ring;
+ taskq_arg->ip_taskq_cpu = CPU;
+
+ /*
+ * Set the ILL_SOFT_RING_ASSIGN flag so that the next
+ * interrupt does not schedule another task to call
+ * ip_squeue_soft_ring_affinity().
+ */
+ ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
+ } else {
+ mutex_exit(&ill->ill_lock);
+ goto out;
+ }
+ mutex_exit(&ill->ill_lock);
+ refheld = ill_waiter_inc(ill);
+ if (refheld) {
+ if (taskq_dispatch(system_taskq,
+ ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
+ goto out;
+
+ /* release ref on ill if taskq dispatch fails */
+ ill_waiter_dcr(ill);
+ }
+ /*
+ * The taskq dispatch failed or we couldn't get a ref on the
+ * ill. Clear ILL_SOFT_RING_ASSIGN so that the affinity
+ * assignment can be tried again later.
+ */
+ mutex_enter(&ill->ill_lock);
+ ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
+ mutex_exit(&ill->ill_lock);
+ kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
+
+out:
+ ip_input(ill, ip_ring, mp_chain, hdrlen);
+}
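The flag handling in ip_soft_ring_assignment() above is a dispatch-once pattern: set a "work scheduled" bit under the lock, hand the work off to the system taskq, and clear the bit again if the dispatch (or the ill ref) fails so that a later packet can retry. A stripped-down user-land sketch of that control flow, with a pthread mutex standing in for ill_lock and a stubbed dispatch function (all names here are hypothetical):

    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define ASSIGN_PENDING  0x1     /* analogous to ILL_SOFT_RING_ASSIGN */

    static pthread_mutex_t  state_lock = PTHREAD_MUTEX_INITIALIZER;
    static int              state_flags;

    static void
    affinity_work(void *arg)
    {
            (void) arg;     /* the deferred work would run here */
    }

    /* stand-in for taskq_dispatch(); pretend it can fail (e.g. TQ_NOSLEEP) */
    static bool
    dispatch(void (*fn)(void *), void *arg)
    {
            (void) fn;
            (void) arg;
            return (false);
    }

    static void
    assignment_attempt(void)
    {
            pthread_mutex_lock(&state_lock);
            if (state_flags & ASSIGN_PENDING) {
                    /* work already scheduled by an earlier packet */
                    pthread_mutex_unlock(&state_lock);
                    return;
            }
            state_flags |= ASSIGN_PENDING;  /* block further dispatch attempts */
            pthread_mutex_unlock(&state_lock);

            if (dispatch(affinity_work, NULL))
                    return;         /* handed off; the worker owns the flag now */

            /* dispatch failed: clear the flag so a later attempt can retry */
            pthread_mutex_lock(&state_lock);
            state_flags &= ~ASSIGN_PENDING;
            pthread_mutex_unlock(&state_lock);
            printf("dispatch failed, will retry on a later packet\n");
    }

    int
    main(void)
    {
            assignment_attempt();
            return (0);
    }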
+
+static squeue_t *
+ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
+{
+ int i;
+ squeue_set_t *best_sqs = NULL;
+ squeue_set_t *curr_sqs = NULL;
+ int min_sq = 0;
+ squeue_t *sqp = NULL;
+ char sqname[64];
+
+ /*
+ * If fanout is set and the passed squeue_set already has some
+ * squeues which are managing NICs, try to find a squeue on a
+ * less loaded CPU instead.
+ */
+ if (sqs->sqs_size > 1 && fanout) {
+ /*
+ * First check to see if any squeue on the CPU passed
+ * is managing a NIC.
+ */
+ for (i = 0; i < sqs->sqs_size; i++) {
+ mutex_enter(&sqs->sqs_list[i]->sq_lock);
+ if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
+ !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
+ mutex_exit(&sqs->sqs_list[i]->sq_lock);
+ break;
+ }
+ mutex_exit(&sqs->sqs_list[i]->sq_lock);
+ }
+ if (i != sqs->sqs_size) {
+ best_sqs = sqset_global_list[sqset_global_size - 1];
+ min_sq = best_sqs->sqs_size;
+
+ for (i = sqset_global_size - 2; i >= 0; i--) {
+ curr_sqs = sqset_global_list[i];
+ if (curr_sqs->sqs_size < min_sq) {
+ best_sqs = curr_sqs;
+ min_sq = curr_sqs->sqs_size;
+ }
+ }
+
+ ASSERT(best_sqs != NULL);
+ sqs = best_sqs;
+ bind_cpu = cpu[sqs->sqs_bind];
+ }
+ }
+
mutex_enter(&sqs->sqs_lock);
+
for (i = 0; i < sqs->sqs_size; i++) {
mutex_enter(&sqs->sqs_list[i]->sq_lock);
- if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) == 0) {
+ if ((sqs->sqs_list[i]->sq_state &
+ (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
sqp = sqs->sqs_list[i];
break;
}
@@ -371,29 +717,19 @@ ip_squeue_extend(void *arg)
if (sqs->sqs_size == sqs->sqs_max_size) {
/*
* Reached the max limit for squeue
- * we can allocate on this CPU. Leave
- * ill_ring_state set to ILL_RING_INPROC
- * so that ip_squeue_direct will just
- * assign the default squeue for this
- * ring for future connections.
+ * we can allocate on this CPU.
*/
-#ifdef DEBUG
- cmn_err(CE_NOTE, "ip_squeue_add: Reached max "
- " threads per CPU for sqp = %p\n", (void *)sqp);
-#endif
mutex_exit(&sqs->sqs_lock);
- mutex_exit(&ill->ill_lock);
- ill_waiter_dcr(ill);
- return;
+ return (NULL);
}
bzero(sqname, sizeof (sqname));
(void) snprintf(sqname, sizeof (sqname),
- "ip_squeue_cpu_%d/%d/%d", CPU->cpu_seqid,
- CPU->cpu_id, sqs->sqs_size);
+ "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
+ bind_cpu->cpu_id, sqs->sqs_size);
- sqp = squeue_create(sqname, CPU->cpu_id, ip_squeue_worker_wait,
- minclsyspri);
+ sqp = squeue_create(sqname, bind_cpu->cpu_id,
+ ip_squeue_worker_wait, minclsyspri);
ASSERT(sqp != NULL);
@@ -403,26 +739,18 @@ ip_squeue_extend(void *arg)
if (ip_squeue_create_callback != NULL)
ip_squeue_create_callback(sqp);
- if (ip_squeue_bind) {
+ mutex_enter(&cpu_lock);
+ if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
squeue_bind(sqp, -1);
}
+ mutex_exit(&cpu_lock);
+
mutex_enter(&sqp->sq_lock);
}
- ASSERT(sqp != NULL);
-
- sqp->sq_rx_ring = ill_rx_ring;
- ill_rx_ring->rr_sqp = sqp;
- ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
-
- sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
- mutex_exit(&sqp->sq_lock);
mutex_exit(&sqs->sqs_lock);
-
- mutex_exit(&ill->ill_lock);
-
- /* ill_waiter_dcr will also signal any waiters on ill_ring_state */
- ill_waiter_dcr(ill);
+ ASSERT(sqp != NULL);
+ return (sqp);
}
/*
@@ -657,6 +985,21 @@ ip_squeue_set_unbind(squeue_set_t *sqs)
mutex_enter(&sqs->sqs_lock);
for (i = 0; i < sqs->sqs_size; i++) {
sqp = sqs->sqs_list[i];
+
+ /*
+ * CPU is going offline. Remove the thread affinity
+ * for any soft ring threads the squeue is managing.
+ */
+ if (sqp->sq_state & SQS_ILL_BOUND) {
+ ill_rx_ring_t *ring = sqp->sq_rx_ring;
+ ill_t *ill = ring->rr_ill;
+
+ if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
+ ASSERT(ring->rr_handle != NULL);
+ ill->ill_dls_capab->ill_dls_unbind(
+ ring->rr_handle);
+ }
+ }
if (!(sqp->sq_state & SQS_BOUND))
continue;
squeue_unbind(sqp);