Diffstat (limited to 'usr/src/uts/common/inet/ip/ip_squeue.c')
-rw-r--r--	usr/src/uts/common/inet/ip/ip_squeue.c	437
1 file changed, 390 insertions, 47 deletions
diff --git a/usr/src/uts/common/inet/ip/ip_squeue.c b/usr/src/uts/common/inet/ip/ip_squeue.c
index ae7731ac7b..78db295c78 100644
--- a/usr/src/uts/common/inet/ip/ip_squeue.c
+++ b/usr/src/uts/common/inet/ip/ip_squeue.c
@@ -125,13 +125,16 @@
  * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
  * mapping between squeue and NIC (or Rx ring) for performance reasons so
  * each squeue can uniquely own a NIC or a Rx ring and do polling
- * (PSARC 2004/630). So we allow up to  MAX_THREAD_PER_CPU squeues per CPU.
- * We start by creating MIN_THREAD_PER_CPU squeues per CPU but more squeues
+ * (PSARC 2004/630). So we allow up to  MAX_SQUEUES_PER_CPU squeues per CPU.
+ * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
  * can be created dynamically as needed.
  */
-#define	MAX_THREAD_PER_CPU	32
-#define	MIN_THREAD_PER_CPU	1
-uint_t	ip_threads_per_cpu = MIN_THREAD_PER_CPU;
+#define	MAX_SQUEUES_PER_CPU	32
+#define	MIN_SQUEUES_PER_CPU	1
+uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
+
+#define	IP_NUM_SOFT_RINGS	2
+uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;
 
 /*
  * List of all created squeue sets. The size is protected by cpu_lock
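The renamed variables keep the old semantics: ip_squeues_per_cpu is clamped to the [MIN_SQUEUES_PER_CPU, MAX_SQUEUES_PER_CPU] range by ip_squeue_init() further down, and ip_soft_rings_cnt sets the default soft ring fanout per NIC. A minimal sketch of overriding them, assuming both globals stay patchable through /etc/system like other ip module tunables:

	* /etc/system fragment (sketch; assumes the ip module exports
	* both variables as tunables)
	set ip:ip_squeues_per_cpu = 2
	set ip:ip_soft_rings_cnt = 4

A reboot is required for /etc/system changes to take effect, which is why ip_squeue_init() re-validates the value at boot.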
@@ -155,11 +158,12 @@ static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
 
 static void ip_squeue_set_bind(squeue_set_t *);
 static void ip_squeue_set_unbind(squeue_set_t *);
+static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);
 
 #define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
 
 /*
- * Create squeue set containing ip_threads_per_cpu number of squeues
+ * Create squeue set containing ip_squeues_per_cpu number of squeues
  * for this CPU and bind them all to the CPU.
  */
 static squeue_set_t *
@@ -186,13 +190,13 @@ ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
 	}
 
 	sqs = kmem_zalloc(sizeof (squeue_set_t) +
-	    (sizeof (squeue_t *) * MAX_THREAD_PER_CPU), KM_SLEEP);
+	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
 	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
 	sqs->sqs_list = (squeue_t **)&sqs[1];
-	sqs->sqs_max_size = MAX_THREAD_PER_CPU;
+	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
 	sqs->sqs_bind = id;
 
-	for (i = 0; i < ip_threads_per_cpu; i++) {
+	for (i = 0; i < ip_squeues_per_cpu; i++) {
 		bzero(sqname, sizeof (sqname));
 
 		(void) snprintf(sqname, sizeof (sqname),
@@ -202,6 +206,12 @@ ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
 		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
 		    minclsyspri);
 
+		/*
+		 * The first squeue in each squeue_set is the DEFAULT
+		 * squeue.
+		 */
+		sqp->sq_state |= SQS_DEFAULT;
+
 		ASSERT(sqp != NULL);
 		squeue_profile_enable(sqp);
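Because ip_squeue_set_create() tags the first squeue it creates with SQS_DEFAULT, the rest of the patch can treat slot 0 of sqs_list as the per-CPU fallback squeue. A minimal sketch of that convention (the helper is hypothetical, not part of this change):

	/* Hypothetical accessor: the default squeue is always slot 0. */
	static squeue_t *
	ip_squeue_default(squeue_set_t *sqs)
	{
		squeue_t *sqp = sqs->sqs_list[0];

		ASSERT(sqp != NULL);
		ASSERT(sqp->sq_state & SQS_DEFAULT);
		return (sqp);
	}

Both ip_squeue_extend() and ip_squeue_soft_ring_affinity() below fall back to exactly this squeue when a CPU already has its maximum number of squeues.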
@@ -229,10 +239,10 @@ ip_squeue_init(void (*callback)(squeue_t *))
 
 	ASSERT(sqset_global_list == NULL);
 
-	if (ip_threads_per_cpu < MIN_THREAD_PER_CPU)
-		ip_threads_per_cpu = MIN_THREAD_PER_CPU;
-	else if (ip_threads_per_cpu > MAX_THREAD_PER_CPU)
-		ip_threads_per_cpu = MAX_THREAD_PER_CPU;
+	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
+		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
+	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
+		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;
 
 	ip_squeue_create_callback = callback;
 	squeue_init();
@@ -293,6 +303,10 @@ ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
 	mutex_exit(&sqp->sq_lock);
 
 	ill = ring->rr_ill;
+	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
+		ASSERT(ring->rr_handle != NULL);
+		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
+	}
 
 	/*
 	 * Cleanup the ring
@@ -338,15 +352,20 @@ ip_squeue_extend(void *arg)
 	ill_t		*ill = sq_arg->ip_taskq_ill;
 	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
 	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
-	squeue_set_t *sqs;
+	squeue_set_t 	*sqs;
 	squeue_t 	*sqp = NULL;
-	char		sqname[64];
-	int		i;
 
 	ASSERT(ill != NULL);
 	ASSERT(ill_rx_ring != NULL);
 	kmem_free(arg, sizeof (ip_taskq_arg_t));
 
+	/*
+	 * Make sure the CPU that originally took the interrupt still
+	 * exists.
+	 */
+	if (!CPU_ISON(intr_cpu))
+		intr_cpu = CPU;
+
 	sqs = intr_cpu->cpu_squeue_set;
 
 	/*
@@ -356,10 +375,337 @@ ip_squeue_extend(void *arg)
 	 * is sequential, we need to hold the ill_lock.
 	 */
 	mutex_enter(&ill->ill_lock);
+	sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
+	if (sqp == NULL) {
+		/*
+		 * We hit the max limit of squeues allowed per CPU.
+		 * Assign this rx_ring to the DEFAULT squeue of the
+		 * interrupted CPU, but the squeue will not manage
+		 * the ring. Also print a warning.
+		 */
+		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
+		    "has max number of squeues. System performance might "
+		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);
+
+		/* the first squeue in the list is the default squeue */
+		sqp = sqs->sqs_list[0];
+		ASSERT(sqp != NULL);
+		ill_rx_ring->rr_sqp = sqp;
+		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
+
+		mutex_exit(&ill->ill_lock);
+		ill_waiter_dcr(ill);
+		return;
+	}
+
+	ASSERT(MUTEX_HELD(&sqp->sq_lock));
+	sqp->sq_rx_ring = ill_rx_ring;
+	ill_rx_ring->rr_sqp = sqp;
+	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
+
+	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
+	mutex_exit(&sqp->sq_lock);
+
+	mutex_exit(&ill->ill_lock);
+
+	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
+	ill_waiter_dcr(ill);
+}
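On success, ip_squeue_extend() gives the squeue exclusive ownership of the Rx ring and sets SQS_ILL_BOUND|SQS_POLL_CAPAB so the squeue may also poll the ring; on failure, the ring is parked on the CPU's default squeue without polling. A sketch of the consumer side of that binding; ring_deliver() and squeue_dispatch_chain() are hypothetical names used only to show how rr_sqp is consumed once rr_ring_state is ILL_RING_INUSE:

	/* hypothetical hand-off into the squeue entry points */
	extern void squeue_dispatch_chain(squeue_t *, mblk_t *);

	static void
	ring_deliver(ill_rx_ring_t *ring, mblk_t *mp_chain)
	{
		squeue_t *sqp = ring->rr_sqp;	/* set by ip_squeue_extend() */

		ASSERT(ring->rr_ring_state == ILL_RING_INUSE);
		squeue_dispatch_chain(sqp, mp_chain);
	}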
+
+/*
+ * Do a Rx ring to squeue binding. Find a unique squeue that is not
+ * managing a receive ring. If no such squeue exists, dynamically
+ * create a new one in the squeue set.
+ *
+ * The function runs via the system taskq. The ill passed as an
+ * argument can't go away since we hold a ref. The lock order is
+ * ill_lock -> sqs_lock -> sq_lock.
+ *
+ * If we end up binding a Rx ring to a squeue attached to an offline
+ * CPU, no extra check is needed: squeues are never destroyed once
+ * created.
+ */
+/* ARGSUSED */
+static void
+ip_squeue_soft_ring_affinity(void *arg)
+{
+	ip_taskq_arg_t		*sq_arg = (ip_taskq_arg_t *)arg;
+	ill_t			*ill = sq_arg->ip_taskq_ill;
+	ill_dls_capab_t	*ill_soft_ring = ill->ill_dls_capab;
+	ill_rx_ring_t		*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
+	cpu_t			*intr_cpu = sq_arg->ip_taskq_cpu;
+	cpu_t			*bind_cpu;
+	int			cpu_id = intr_cpu->cpu_id;
+	int			min_cpu_id, max_cpu_id;
+	boolean_t		enough_uniq_cpus = B_FALSE;
+	boolean_t		enough_cpus = B_FALSE;
+	squeue_set_t 		*sqs, *last_sqs;
+	squeue_t 		*sqp = NULL;
+	int			i, j;
+
+	ASSERT(ill != NULL);
+	kmem_free(arg, sizeof (ip_taskq_arg_t));
+
+	/*
+	 * Make sure the CPU that originally took the interrupt still
+	 * exists.
+	 */
+	if (!CPU_ISON(intr_cpu)) {
+		intr_cpu = CPU;
+		cpu_id = intr_cpu->cpu_id;
+	}
+
+	/*
+	 * If this ill represents link aggregation, then there might be
+	 * multiple NICs trying to register themselves at the same time
+	 * and in order to ensure that test and assignment of free rings
+	 * is sequential, we need to hold the ill_lock.
+	 */
+	mutex_enter(&ill->ill_lock);
+
+	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
+		mutex_exit(&ill->ill_lock);
+		return;
+	}
+	/*
+	 * We need to fanout the interrupts from the NIC. We do that by
+	 * telling the driver underneath to create soft rings and use
+	 * worker threads (if the driver advertised the SOFT_RING
+	 * capability). It is still a big performance win if we can
+	 * fanout to the threads on the same core that is taking
+	 * interrupts.
+	 *
+	 * Since we don't know the interrupt to CPU binding, we don't
+	 * assign any squeues or affinity to worker threads in the NIC.
+	 * At the time of the first interrupt, we know which CPU is
+	 * taking interrupts and try to find other threads on the same
+	 * core. Assuming ip_threads_per_cpu is correct and cpus are
+	 * numbered sequentially for each core (XXX need something better
+	 * than this in future), find the lowest numbered and highest
+	 * numbered thread for that core.
+	 *
+	 * If we have one more thread per core than the number of soft
+	 * rings, don't assign any worker threads to the H/W thread (cpu)
+	 * taking interrupts (capability negotiation tries to ensure this).
+	 *
+	 * If the number of threads per core is the same as the number of
+	 * soft rings, then assign the worker affinity and squeue to
+	 * the same cpu.
+	 *
+	 * Otherwise, just fanout to the higher numbered CPUs starting
+	 * from the interrupted CPU.
+	 */
+
+	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
+	max_cpu_id = min_cpu_id + ip_threads_per_cpu;
+
+	cmn_err(CE_CONT, "soft_ring_affinity: min/max/intr = %d/%d/%d\n",
+	    min_cpu_id, max_cpu_id, (int)intr_cpu->cpu_id);
+
+	/*
+	 * Quickly check if there are enough CPUs present for fanout
+	 * and also that max_cpu_id does not exceed the id of the
+	 * highest CPU owning a squeue set. We use the cpu_id stored in
+	 * the last squeue_set to get an idea. The scheme is by no means
+	 * perfect since it doesn't take into account CPU DR operations
+	 * or the fact that the interrupt binding itself might change.
+	 * An ideal scenario would be to ensure that interrupts run on
+	 * CPUs by themselves and worker threads never have affinity to
+	 * those CPUs. If an interrupt moves to a CPU which has a worker
+	 * thread, the affinity should be changed. Probably callbacks
+	 * similar to CPU offline are needed to make it work perfectly.
+	 */
+	last_sqs = sqset_global_list[sqset_global_size - 1];
+	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
+		if ((max_cpu_id - min_cpu_id) >
+		    ill_soft_ring->ill_dls_soft_ring_cnt)
+			enough_uniq_cpus = B_TRUE;
+		else if ((max_cpu_id - min_cpu_id) >=
+		    ill_soft_ring->ill_dls_soft_ring_cnt)
+			enough_cpus = B_TRUE;
+	}
+
+	j = 0;
+	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
+		if (enough_uniq_cpus) {
+			if ((min_cpu_id + i) == cpu_id) {
+				j++;
+				continue;
+			}
+			bind_cpu = cpu[min_cpu_id + i];
+		} else if (enough_cpus) {
+			bind_cpu = cpu[min_cpu_id + i];
+		} else {
+			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
+			bind_cpu = cpu[(cpu_id + i) % ncpus];
+		}
+
+		/*
+		 * Check that the CPU actually exists and is active. If
+		 * not, use the interrupted CPU. ip_find_unused_squeue()
+		 * will find the right CPU to fanout anyway.
+		 */
+		if (!CPU_ISON(bind_cpu))
+			bind_cpu = intr_cpu;
+
+		sqs = bind_cpu->cpu_squeue_set;
+		ASSERT(sqs != NULL);
+		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
+
+		sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
+		if (sqp == NULL) {
+			/*
+			 * We hit the max limit of squeues allowed per CPU.
+			 * Assign this rx_ring to the DEFAULT squeue of the
+			 * interrupted CPU, but the squeue will not manage
+			 * the ring. Also print a warning.
+			 */
+			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
+			    "%d/%p already has max number of squeues. System "
+			    "performance might become suboptimal\n",
+			    sqs->sqs_bind, (void *)sqs);
+
+			/* the first squeue in the list is the default squeue */
+			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
+			ASSERT(sqp != NULL);
+
+			ill_rx_ring->rr_sqp = sqp;
+			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
+			continue;
+		}
+
+		ASSERT(MUTEX_HELD(&sqp->sq_lock));
+		ill_rx_ring->rr_sqp = sqp;
+		sqp->sq_rx_ring = ill_rx_ring;
+		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
+		sqp->sq_state |= SQS_ILL_BOUND;
+
+		/* assign affinity to the soft ring */
+		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
+			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
+			    sqp->sq_bind);
+		}
+		mutex_exit(&sqp->sq_lock);
+
+		cmn_err(CE_CONT, "soft_ring_affinity: ring = %d, bind = %d\n",
+		    i - j, sqp->sq_bind);
+	}
+	mutex_exit(&ill->ill_lock);
+
+	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
+	    SOFT_RING_SRC_HASH);
+
+	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
+	ill_waiter_dcr(ill);
+}
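The fanout arithmetic above leans on the assumption, flagged in the XXX comment, that CPU ids are dense within a core. A worked example: with ip_threads_per_cpu = 4 and the interrupt on CPU 5, the core spans CPUs 4..7; with 3 soft rings there is one spare hardware thread, so the enough_uniq_cpus case skips the interrupted CPU. A self-contained sketch (userland C, all values hypothetical):

	#include <stdio.h>

	int
	main(void)
	{
		int ip_threads_per_cpu = 4;	/* CPUs (threads) per core */
		int soft_ring_cnt = 3;		/* ill_dls_soft_ring_cnt */
		int cpu_id = 5;			/* CPU taking interrupts */
		int min_cpu_id, max_cpu_id, i, j = 0;

		min_cpu_id = (cpu_id / ip_threads_per_cpu) *
		    ip_threads_per_cpu;
		max_cpu_id = min_cpu_id + ip_threads_per_cpu;
		(void) printf("core spans CPUs %d..%d\n",
		    min_cpu_id, max_cpu_id - 1);

		/* 4 CPUs on the core > 3 soft rings: enough_uniq_cpus */
		for (i = 0; i < soft_ring_cnt + j; i++) {
			if (min_cpu_id + i == cpu_id) {
				j++;	/* skip the interrupted CPU */
				continue;
			}
			(void) printf("soft ring %d -> CPU %d\n",
			    i - j, min_cpu_id + i);
		}
		return (0);
	}

Output: rings 0, 1 and 2 land on CPUs 4, 6 and 7, while CPU 5 keeps taking interrupts.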
+
+void
+ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
+    mblk_t *mp_chain, size_t hdrlen)
+{
+	ip_taskq_arg_t	*taskq_arg;
+	boolean_t	refheld;
+
+	ASSERT(servicing_interrupt());
+	ASSERT(ip_ring == NULL);
+
+	mutex_enter(&ill->ill_lock);
+	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
+		taskq_arg = (ip_taskq_arg_t *)
+		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);
+
+		if (taskq_arg == NULL) {
+			mutex_exit(&ill->ill_lock);
+			goto out;
+		}
+
+		taskq_arg->ip_taskq_ill = ill;
+		taskq_arg->ip_taskq_ill_rx_ring = ip_ring;
+		taskq_arg->ip_taskq_cpu = CPU;
+
+		/*
+		 * Set the ILL_SOFT_RING_ASSIGN flag so that the next
+		 * interrupt does not schedule another task to call
+		 * ip_squeue_soft_ring_affinity().
+		 */
+		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
+	} else {
+		mutex_exit(&ill->ill_lock);
+		goto out;
+	}
+	mutex_exit(&ill->ill_lock);
+	refheld = ill_waiter_inc(ill);
+	if (refheld) {
+		if (taskq_dispatch(system_taskq,
+		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
+			goto out;
+
+		/* release ref on ill if taskq dispatch fails */
+		ill_waiter_dcr(ill);
+	}
+	/*
+	 * Clear the ILL_SOFT_RING_ASSIGN flag so that the affinity
+	 * assignment can be tried again later.
+	 */
+	mutex_enter(&ill->ill_lock);
+	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
+	mutex_exit(&ill->ill_lock);
+	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
+
+out:
+	ip_input(ill, ip_ring, mp_chain, hdrlen);
+}
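ip_soft_ring_assignment() is a transitional receive entry point: the first interrupt dispatches ip_squeue_soft_ring_affinity() through the system taskq, ILL_SOFT_RING_ASSIGN keeps later interrupts from queueing duplicate tasks, every failure path clears the flag so the assignment is retried, and in all cases the packet chain still flows into ip_input(). A sketch of how a driver's Rx hook might start out pointed here (the typedef and variable are hypothetical):

	/* hypothetical wiring for the initial interrupt path */
	typedef void (*ip_rx_t)(ill_t *, ill_rx_ring_t *, mblk_t *, size_t);

	static ip_rx_t ill_rx_entry = ip_soft_ring_assignment;

Once the taskq has bound squeues to the soft rings, delivery no longer needs to pass through this assignment step.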
+
+static squeue_t *
+ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
+{
+	int 		i;
+	squeue_set_t	*best_sqs = NULL;
+	squeue_set_t	*curr_sqs = NULL;
+	int		min_sq = 0;
+	squeue_t 	*sqp = NULL;
+	char		sqname[64];
+
+	/*
+	 * If fanout is set and the passed squeue_set already has some
+	 * squeues which are managing the NICs, try to find squeues on
+	 * an unused CPU.
+	 */
+	if (sqs->sqs_size > 1 && fanout) {
+		/*
+		 * First check to see if any squeue on the CPU passed
+		 * in is managing a NIC.
+		 */
+		for (i = 0; i < sqs->sqs_size; i++) {
+			mutex_enter(&sqs->sqs_list[i]->sq_lock);
+			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
+			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
+				mutex_exit(&sqs->sqs_list[i]->sq_lock);
+				break;
+			}
+			mutex_exit(&sqs->sqs_list[i]->sq_lock);
+		}
+		if (i != sqs->sqs_size) {
+			best_sqs = sqset_global_list[sqset_global_size - 1];
+			min_sq = best_sqs->sqs_size;
+
+			for (i = sqset_global_size - 2; i >= 0; i--) {
+				curr_sqs = sqset_global_list[i];
+				if (curr_sqs->sqs_size < min_sq) {
+					best_sqs = curr_sqs;
+					min_sq = curr_sqs->sqs_size;
+				}
+			}
+
+			ASSERT(best_sqs != NULL);
+			sqs = best_sqs;
+			bind_cpu = cpu[sqs->sqs_bind];
+		}
+	}
+
 	mutex_enter(&sqs->sqs_lock);
+
 	for (i = 0; i < sqs->sqs_size; i++) {
 		mutex_enter(&sqs->sqs_list[i]->sq_lock);
-		if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) == 0) {
+		if ((sqs->sqs_list[i]->sq_state &
+		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
 			sqp = sqs->sqs_list[i];
 			break;
 		}
@@ -371,29 +717,19 @@ ip_squeue_extend(void *arg)
 		if (sqs->sqs_size == sqs->sqs_max_size) {
 			/*
 			 * Reached the max limit for squeue
-			 * we can allocate on this CPU. Leave
-			 * ill_ring_state set to ILL_RING_INPROC
-			 * so that ip_squeue_direct will just
-			 * assign the default squeue for this
-			 * ring for future connections.
+			 * we can allocate on this CPU.
 			 */
-#ifdef DEBUG
-			cmn_err(CE_NOTE, "ip_squeue_add: Reached max "
-			    " threads per CPU for sqp = %p\n", (void *)sqp);
-#endif
 			mutex_exit(&sqs->sqs_lock);
-			mutex_exit(&ill->ill_lock);
-			ill_waiter_dcr(ill);
-			return;
+			return (NULL);
 		}
 
 		bzero(sqname, sizeof (sqname));
 
 		(void) snprintf(sqname, sizeof (sqname),
-		    "ip_squeue_cpu_%d/%d/%d", CPU->cpu_seqid,
-		    CPU->cpu_id, sqs->sqs_size);
+		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
+		    bind_cpu->cpu_id, sqs->sqs_size);
 
-		sqp = squeue_create(sqname, CPU->cpu_id, ip_squeue_worker_wait,
-		    minclsyspri);
+		sqp = squeue_create(sqname, bind_cpu->cpu_id,
+		    ip_squeue_worker_wait, minclsyspri);
 
 		ASSERT(sqp != NULL);
 
@@ -403,26 +739,18 @@ ip_squeue_extend(void *arg)
 		if (ip_squeue_create_callback != NULL)
 			ip_squeue_create_callback(sqp);
 
-		if (ip_squeue_bind) {
+		mutex_enter(&cpu_lock);
+		if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
 			squeue_bind(sqp, -1);
 		}
+		mutex_exit(&cpu_lock);
+
 		mutex_enter(&sqp->sq_lock);
 	}
 
-	ASSERT(sqp != NULL);
-
-	sqp->sq_rx_ring = ill_rx_ring;
-	ill_rx_ring->rr_sqp = sqp;
-	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
-
-	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
-	mutex_exit(&sqp->sq_lock);
 	mutex_exit(&sqs->sqs_lock);
-
-	mutex_exit(&ill->ill_lock);
-
-	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
-	ill_waiter_dcr(ill);
+	ASSERT(sqp != NULL);
+	return (sqp);
 }
 
 /*
@@ -657,6 +985,21 @@ ip_squeue_set_unbind(squeue_set_t *sqs)
 	mutex_enter(&sqs->sqs_lock);
 	for (i = 0; i < sqs->sqs_size; i++) {
 		sqp = sqs->sqs_list[i];
+
+		/*
+		 * CPU is going offline. Remove the thread affinity
+		 * for any soft ring threads the squeue is managing.
+		 */
+		if (sqp->sq_state & SQS_ILL_BOUND) {
+			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
+			ill_t		*ill = ring->rr_ill;
+
+			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
+				ASSERT(ring->rr_handle != NULL);
+				ill->ill_dls_capab->ill_dls_unbind(
+				    ring->rr_handle);
+			}
+		}
 		if (!(sqp->sq_state & SQS_BOUND))
 			continue;
 		squeue_unbind(sqp);
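Two fanout details in ip_find_unused_squeue() above are easy to miss: if the preferred CPU's set already has a squeue managing a NIC, the function walks sqset_global_list backwards and retargets the least loaded set, and a newly created squeue is only bound to its CPU under cpu_lock when that CPU is online. A standalone illustration of the least-loaded scan (userland C, sizes hypothetical):

	#include <stdio.h>

	int
	main(void)
	{
		int sqs_size[] = { 3, 1, 4, 2 };	/* squeues per set */
		int n = 4, i, best = n - 1, min_sq = sqs_size[n - 1];

		for (i = n - 2; i >= 0; i--) {
			if (sqs_size[i] < min_sq) {
				best = i;
				min_sq = sqs_size[i];
			}
		}
		/* prints: fan out to squeue set 1 (1 squeue(s)) */
		(void) printf("fan out to squeue set %d (%d squeue(s))\n",
		    best, min_sq);
		return (0);
	}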
