/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/*
* Copyright 2019 Joyent, Inc.
*/
/*
* Copyright 2019 Peter Tribble.
*/
#include <sys/sysmacros.h>
#include <sys/stack.h>
#include <sys/cpuvar.h>
#include <sys/ivintr.h>
#include <sys/intreg.h>
#include <sys/membar.h>
#include <sys/kmem.h>
#include <sys/intr.h>
#include <sys/sunddi.h>
#include <sys/sunndi.h>
#include <sys/cmn_err.h>
#include <sys/privregs.h>
#include <sys/systm.h>
#include <sys/archsystm.h>
#include <sys/machsystm.h>
#include <sys/x_call.h>
#include <vm/seg_kp.h>
#include <sys/debug.h>
#include <sys/cyclic.h>
#include <sys/kdi_impl.h>
#include <sys/ddi_periodic.h>
#include <sys/cpu_sgnblk_defs.h>
/* Global locks which protect the interrupt distribution lists */
static kmutex_t intr_dist_lock;
static kmutex_t intr_dist_cpu_lock;
/* Head of the interrupt distribution lists */
static struct intr_dist *intr_dist_head = NULL;
static struct intr_dist *intr_dist_whead = NULL;
static uint64_t siron_inum[DDI_IPL_10]; /* software interrupt numbers */
uint64_t *siron_cpu_inum = NULL;
uint64_t siron_poke_cpu_inum;
static int siron_cpu_setup(cpu_setup_t, int, void *);
extern uint_t softlevel1();
static uint64_t siron1_inum; /* backward compatibility */
uint64_t poke_cpu_inum;
uint_t poke_cpu_intr(caddr_t arg1, caddr_t arg2);
uint_t siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2);
/*
* Variable to enable/disable printing a message when an invalid vecintr
* is received.
*/
uint_t ignore_invalid_vecintr = 0;
/*
* Note:
* siron_pending was originally created to prevent a resource over-consumption
* bug in setsoftint() (exhaustion of the interrupt pool free list).
* Its original purpose is obsolete with the use of iv_pending in
* setsoftint(). However, siron_pending stayed around, acting as a second
* gatekeeper preventing soft interrupts from being queued. In this capacity,
* it can lead to hangs on MP systems, where due to global visibility issues
* it can end up set while iv_pending is reset, preventing soft interrupts from
* ever being processed. In addition to its gatekeeper role, intr_init() also
* uses it to flag the situation where siron() was called before siron_inum has
* been defined.
*
* siron() does not need an extra gatekeeper; any cpu that wishes should be
* allowed to queue a soft interrupt. It is softint()'s job to ensure
* correct handling of the queues. Therefore, siron_pending has been
* stripped of its gatekeeper task, retaining only its intr_init() job, where
* it indicates that there is a pending need to call siron().
*/
static int siron_pending[DDI_IPL_10]; /* software interrupt pending flags */
static int siron1_pending; /* backward compatibility */
int intr_policy = INTR_WEIGHTED_DIST; /* interrupt distribution policy */
int intr_dist_debug = 0;
int32_t intr_dist_weight_max = 1;
int32_t intr_dist_weight_maxmax = 1000;
int intr_dist_weight_maxfactor = 2;
#define INTR_DEBUG(args) if (intr_dist_debug) cmn_err args
/*
* intr_init() - Interrupt initialization
* Initialize the system's interrupt vector table.
*/
void
intr_init(cpu_t *cp)
{
int i;
extern uint_t softlevel1();
init_ivintr();
/*
* Register these software interrupts for ddi timer.
* Software interrupts up to the level 10 are supported.
*/
for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
siron_inum[i - 1] = add_softintr(i,
(softintrfunc)ddi_periodic_softintr,
(caddr_t)(uintptr_t)(i), SOFTINT_ST);
}
siron1_inum = add_softintr(PIL_1, softlevel1, 0, SOFTINT_ST);
poke_cpu_inum = add_softintr(PIL_13, poke_cpu_intr, 0, SOFTINT_MT);
siron_poke_cpu_inum = add_softintr(PIL_13,
siron_poke_cpu_intr, 0, SOFTINT_MT);
cp->cpu_m.poke_cpu_outstanding = B_FALSE;
mutex_init(&intr_dist_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&intr_dist_cpu_lock, NULL, MUTEX_DEFAULT, NULL);
/*
* A soft interrupt may have been requested prior to the initialization
* of soft interrupts. Soft interrupts can't be dispatched until after
* intr_init(), so we have to wait until now before we can dispatch the
* pending soft interrupt (if any).
*/
for (i = DDI_IPL_1; i <= DDI_IPL_10; i++) {
if (siron_pending[i-1]) {
siron_pending[i-1] = 0;
sir_on(i);
}
}
if (siron1_pending) {
siron1_pending = 0;
siron();
}
}
/*
* poke_cpu_intr - handler for the soft interrupt sent by poke_cpu();
* it simply clears the poke_cpu_outstanding flag
*/
/* ARGSUSED */
uint_t
poke_cpu_intr(caddr_t arg1, caddr_t arg2)
{
CPU->cpu_m.poke_cpu_outstanding = B_FALSE;
membar_stld_stst();
return (1);
}
/*
* Trigger software interrupts dedicated to ddi timer.
*/
void
sir_on(int level)
{
ASSERT(level >= DDI_IPL_1 && level <= DDI_IPL_10);
if (siron_inum[level-1])
setsoftint(siron_inum[level-1]);
else
siron_pending[level-1] = 1;
}
/*
* kmdb uses siron (and thus setsoftint) while the world is stopped in order to
* inform its driver component that there's work to be done. We need to keep
* DTrace from instrumenting kmdb's siron and setsoftint. We duplicate siron,
* giving kmdb's version a kdi_ prefix to keep DTrace at bay. The
* implementation of setsoftint is complicated enough that we don't want to
* duplicate it, but at the same time we don't want to preclude tracing either.
* The meat of setsoftint() therefore goes into kdi_setsoftint, with
* setsoftint() implemented as a wrapper. This allows tracing, while still
* providing a way for kmdb to sneak in unmolested.
*/
void
kdi_siron(void)
{
if (siron1_inum != 0)
kdi_setsoftint(siron1_inum);
else
siron1_pending = 1;
}
void
setsoftint(uint64_t inum)
{
kdi_setsoftint(inum);
}
/*
* Generates a softlevel1 interrupt on the current CPU if one
* is not already pending.
*/
void
siron(void)
{
uint64_t inum;
if (siron1_inum != 0) {
/*
* Once siron_cpu_inum has been allocated, we can
* use per-CPU siron inum.
*/
if (siron_cpu_inum && siron_cpu_inum[CPU->cpu_id] != 0)
inum = siron_cpu_inum[CPU->cpu_id];
else
inum = siron1_inum;
setsoftint(inum);
} else
siron1_pending = 1;
}
static void
siron_init(void)
{
/*
* We just allocate memory for the per-CPU siron inums right now. The rest
* of the work is done when each CPU is configured.
*/
siron_cpu_inum = kmem_zalloc(sizeof (uint64_t) * NCPU, KM_SLEEP);
}
/*
* This routine creates per-CPU siron inums for CPUs which are
* configured during boot.
*/
void
siron_mp_init()
{
cpu_t *c;
/*
* Get the memory for per-CPU siron inums
*/
siron_init();
mutex_enter(&cpu_lock);
c = cpu_list;
do {
(void) siron_cpu_setup(CPU_CONFIG, c->cpu_id, NULL);
} while ((c = c->cpu_next) != cpu_list);
register_cpu_setup_func(siron_cpu_setup, NULL);
mutex_exit(&cpu_lock);
}
/*
* siron_poke_cpu_intr - cross-call handler.
*/
/* ARGSUSED */
uint_t
siron_poke_cpu_intr(caddr_t arg1, caddr_t arg2)
{
/* generate level1 softint */
siron();
return (1);
}
/*
* This routine generates a cross-call on target CPU(s).
*/
void
siron_poke_cpu(cpuset_t poke)
{
int cpuid = CPU->cpu_id;
if (CPU_IN_SET(poke, cpuid)) {
siron();
CPUSET_DEL(poke, cpuid);
if (CPUSET_ISNULL(poke))
return;
}
xt_some(poke, setsoftint_tl1, siron_poke_cpu_inum, 0);
}
/*
* This callback function allows us to create the per-CPU siron inums.
*/
/* ARGSUSED */
static int
siron_cpu_setup(cpu_setup_t what, int id, void *arg)
{
cpu_t *cp = cpu[id];
ASSERT(MUTEX_HELD(&cpu_lock));
ASSERT(cp != NULL);
switch (what) {
case CPU_CONFIG:
siron_cpu_inum[cp->cpu_id] = add_softintr(PIL_1,
(softintrfunc)softlevel1, 0, SOFTINT_ST);
break;
case CPU_UNCONFIG:
(void) rem_softintr(siron_cpu_inum[cp->cpu_id]);
siron_cpu_inum[cp->cpu_id] = 0;
break;
default:
break;
}
return (0);
}
/*
* no_ivintr()
* called by setvecint_tl1() through sys_trap()
* vector interrupt received but not valid or not
* registered in intr_vec_table
* considered as a spurious mondo interrupt
*/
/* ARGSUSED */
void
no_ivintr(struct regs *rp, int inum, int pil)
{
if (!ignore_invalid_vecintr)
cmn_err(CE_WARN, "invalid vector intr: number 0x%x, pil 0x%x",
inum, pil);
#ifdef DEBUG_VEC_INTR
prom_enter_mon();
#endif /* DEBUG_VEC_INTR */
}
void
intr_dequeue_req(uint_t pil, uint64_t inum)
{
intr_vec_t *iv, *next, *prev;
struct machcpu *mcpu;
uint32_t clr;
processorid_t cpu_id;
extern uint_t getpstate(void);
ASSERT((getpstate() & PSTATE_IE) == 0);
mcpu = &CPU->cpu_m;
cpu_id = CPU->cpu_id;
iv = (intr_vec_t *)inum;
prev = NULL;
next = mcpu->intr_head[pil];
/* Find a matching entry in the list */
while (next != NULL) {
if (next == iv)
break;
prev = next;
next = IV_GET_PIL_NEXT(next, cpu_id);
}
if (next != NULL) {
intr_vec_t *next_iv = IV_GET_PIL_NEXT(next, cpu_id);
/* Remove entry from list */
if (prev != NULL)
IV_SET_PIL_NEXT(prev, cpu_id, next_iv); /* non-head */
else
mcpu->intr_head[pil] = next_iv; /* head */
if (next_iv == NULL)
mcpu->intr_tail[pil] = prev; /* tail */
}
/* Clear pending interrupts at this level if the list is empty */
if (mcpu->intr_head[pil] == NULL) {
clr = 1 << pil;
if (pil == PIL_14)
clr |= (TICK_INT_MASK | STICK_INT_MASK);
wr_clr_softint(clr);
}
}
/*
* Send a directed interrupt with the specified interrupt number id to a cpu.
*/
void
send_dirint(
int cpuix, /* cpu to be interrupted */
int intr_id) /* interrupt number id */
{
xt_one(cpuix, setsoftint_tl1, intr_id, 0);
}
/*
* Take the specified CPU out of participation in interrupts.
* Called by p_online(2) when a processor is being taken off-line.
* This allows interrupt threads being handled on the processor to
* complete before the processor is idled.
*/
int
cpu_disable_intr(struct cpu *cp)
{
ASSERT(MUTEX_HELD(&cpu_lock));
/*
* Turn off the CPU_ENABLE flag before calling the redistribution
* function, since it checks for this in the cpu flags.
*/
cp->cpu_flags &= ~CPU_ENABLE;
ncpus_intr_enabled--;
intr_redist_all_cpus();
return (0);
}
/*
* Allow the specified CPU to participate in interrupts.
* Called by p_online(2) if a processor could not be taken off-line
* because of bound threads, in order to resume processing interrupts.
* Also called after starting a processor.
*/
void
cpu_enable_intr(struct cpu *cp)
{
ASSERT(MUTEX_HELD(&cpu_lock));
cp->cpu_flags |= CPU_ENABLE;
ncpus_intr_enabled++;
intr_redist_all_cpus();
}
/*
* Add function to callback list for intr_redist_all_cpus. We keep two lists,
* one for weighted callbacks and one for normal callbacks. Weighted callbacks
* are issued to redirect interrupts of a specified weight, from heavy to
* light. This allows all the interrupts of a given weight to be redistributed
* for all weighted nexus drivers prior to those of less weight.
*/
static void
intr_dist_add_list(struct intr_dist **phead, void (*func)(void *), void *arg)
{
struct intr_dist *new = kmem_alloc(sizeof (*new), KM_SLEEP);
struct intr_dist *iptr;
struct intr_dist **pptr;
ASSERT(func);
new->func = func;
new->arg = arg;
new->next = NULL;
/* Add to tail so that redistribution occurs in original order. */
mutex_enter(&intr_dist_lock);
for (iptr = *phead, pptr = phead; iptr != NULL;
pptr = &iptr->next, iptr = iptr->next) {
/* check for problems as we locate the tail */
if ((iptr->func == func) && (iptr->arg == arg)) {
cmn_err(CE_PANIC, "intr_dist_add_list(): duplicate");
/*NOTREACHED*/
}
}
*pptr = new;
mutex_exit(&intr_dist_lock);
}
void
intr_dist_add(void (*func)(void *), void *arg)
{
intr_dist_add_list(&intr_dist_head, (void (*)(void *))func, arg);
}
void
intr_dist_add_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
{
intr_dist_add_list(&intr_dist_whead, (void (*)(void *))func, arg);
}
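/*
* Illustrative sketch (not built) of how a weighted nexus driver might pair
* the add and rem calls; the mynex names are hypothetical and not part of
* this file. The callback is registered once at attach time and removed at
* detach time:
*
*	static void mynex_intr_dist(void *arg, int32_t weight_max,
*	    int32_t weight);
*
*	attach:	intr_dist_add_weighted(mynex_intr_dist, softsp);
*	detach:	intr_dist_rem_weighted(mynex_intr_dist, softsp);
*/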
/*
* Search the specified interrupt distribution list for the entry with the
* given callback function and argument. If a match is found, delete the
* entry from the list. The caller remains responsible for retargeting the
* underlying interrupts (modifying the mondo vector registers).
*/
static void
intr_dist_rem_list(struct intr_dist **headp, void (*func)(void *), void *arg)
{
struct intr_dist *iptr;
struct intr_dist **vect;
mutex_enter(&intr_dist_lock);
for (iptr = *headp, vect = headp;
iptr != NULL; vect = &iptr->next, iptr = iptr->next) {
if ((iptr->func == func) && (iptr->arg == arg)) {
*vect = iptr->next;
kmem_free(iptr, sizeof (struct intr_dist));
mutex_exit(&intr_dist_lock);
return;
}
}
if (!panicstr)
cmn_err(CE_PANIC, "intr_dist_rem_list: not found");
mutex_exit(&intr_dist_lock);
}
void
intr_dist_rem(void (*func)(void *), void *arg)
{
intr_dist_rem_list(&intr_dist_head, (void (*)(void *))func, arg);
}
void
intr_dist_rem_weighted(void (*func)(void *, int32_t, int32_t), void *arg)
{
intr_dist_rem_list(&intr_dist_whead, (void (*)(void *))func, arg);
}
/*
* Initiate interrupt redistribution. Redistribution improves the isolation
* associated with interrupt weights by ordering operations from heavy weight
* to light weight. When a CPU's orientation changes relative to interrupts,
* there is *always* a redistribution to accommodate this change (call to
* intr_redist_all_cpus()). As devices (not CPUs) attach/detach it is possible
* that a redistribution could improve the quality of an initialization. For
* example, if you are not using a NIC it may not be attached with s10 (devfs).
* If you then configure the NIC (ifconfig), this may cause the NIC to attach
* and plumb interrupts. The CPU assignment for the NIC's interrupts is
* occurring late, so optimal "isolation" relative to weight is not occurring.
* The same applies to detach, although in this case doing the redistribution
* might improve "spread" for medium weight devices since the "isolation" of
* a higher weight device may no longer be present.
*
* NB: We should provide a utility to trigger redistribution (ala "intradm -r").
*
* NB: There is risk associated with automatically triggering execution of the
* redistribution code at arbitrary times. The risk comes from the fact that
* there is a lot of low-level hardware interaction associated with a
* redistribution. At some point we may want this code to perform automatic
* redistribution (redistribution thread; trigger timeout when add/remove
* weight delta is large enough, and call cv_signal from timeout - causing
* thread to call i_ddi_intr_redist_all_cpus()) but this is considered too
* risky at this time.
*/
void
i_ddi_intr_redist_all_cpus()
{
mutex_enter(&cpu_lock);
INTR_DEBUG((CE_CONT, "intr_dist: i_ddi_intr_redist_all_cpus\n"));
intr_redist_all_cpus();
mutex_exit(&cpu_lock);
}
/*
* Redistribute all interrupts
*
* This function redistributes all interrupting devices, running the
* parent callback functions for each node.
*/
void
intr_redist_all_cpus(void)
{
struct cpu *cp;
struct intr_dist *iptr;
int32_t weight, max_weight;
ASSERT(MUTEX_HELD(&cpu_lock));
mutex_enter(&intr_dist_lock);
/*
* zero cpu_intr_weight on all cpus - it is safe to traverse
* cpu_list since we hold cpu_lock.
*/
cp = cpu_list;
do {
cp->cpu_intr_weight = 0;
} while ((cp = cp->cpu_next) != cpu_list);
/*
* Assume that this redistribution may encounter a device weight
* via driver.conf tuning of "ddi-intr-weight" that is at most
* intr_dist_weight_maxfactor times larger.
*/
max_weight = intr_dist_weight_max * intr_dist_weight_maxfactor;
if (max_weight > intr_dist_weight_maxmax)
max_weight = intr_dist_weight_maxmax;
intr_dist_weight_max = 1;
INTR_DEBUG((CE_CONT, "intr_dist: "
"intr_redist_all_cpus: %d-0\n", max_weight));
/*
* Redistribute weighted, from heavy to light. The callback that
* specifies a weight equal to weight_max should redirect all
* interrupts of weight weight_max or greater [weight_max, inf.).
* Interrupts of lesser weight should be processed on the call with
* the matching weight. This allows all the heavier weight interrupts
* on all weighted busses (multiple pci busses) to be redirected prior
* to any lesser weight interrupts.
*/
for (weight = max_weight; weight >= 0; weight--)
for (iptr = intr_dist_whead; iptr != NULL; iptr = iptr->next)
((void (*)(void *, int32_t, int32_t))iptr->func)
(iptr->arg, max_weight, weight);
/* redistribute normal (non-weighted) interrupts */
for (iptr = intr_dist_head; iptr != NULL; iptr = iptr->next)
((void (*)(void *))iptr->func)(iptr->arg);
mutex_exit(&intr_dist_lock);
}
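/*
* Hedged sketch of the weighted-callback contract described above; the mynex
* names are hypothetical. On the pass where the weight argument equals
* weight_max the callback retargets every interrupt whose weight is greater
* than or equal to weight_max; on later passes it retargets only interrupts
* whose weight exactly matches the (smaller) weight argument:
*
*	static void
*	mynex_intr_dist(void *arg, int32_t weight_max, int32_t weight)
*	{
*		mynex_state_t *softsp = arg;
*		mynex_ino_t *ino;
*
*		for (ino = softsp->ino_list; ino != NULL; ino = ino->next) {
*			int32_t w = mynex_ino_weight(ino);
*
*			if ((weight == weight_max && w >= weight_max) ||
*			    (weight < weight_max && w == weight))
*				mynex_retarget(softsp, ino, intr_dist_cpuid());
*		}
*	}
*/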
void
intr_redist_all_cpus_shutdown(void)
{
intr_policy = INTR_CURRENT_CPU;
intr_redist_all_cpus();
}
/*
* Determine what CPU to target, based on interrupt policy.
*
* INTR_FLAT_DIST: hold a current CPU pointer in a static variable and
* advance through interrupt enabled cpus (round-robin).
*
* INTR_WEIGHTED_DIST: search for an enabled CPU with the lowest
* cpu_intr_weight, round robin when all equal.
*
* Weighted interrupt distribution provides two things: "spread" of weight
* (associated with the algorithm itself) and "isolation" (associated with a
* particular device weight). A redistribution is what provides optimal
* "isolation" of heavy weight interrupts; optimal "spread" of weight
* (relative to what came before) is always occurring.
*
* An interrupt weight is a subjective number that represents the
* percentage of a CPU required to service a device's interrupts: the
* default weight is 0% (however the algorithm still maintains
* round-robin), a network interface controller (NIC) may have a large
* weight (35%). Interrupt weight only has meaning relative to the
* interrupt weight of other devices: a CPU can be weighted more than
* 100%, and a single device might consume more than 100% of a CPU.
*
* A coarse interrupt weight can be defined by the parent nexus driver
* based on bus specific information, like pci class codes. A nexus
* driver that supports device interrupt weighting for its children
* should call intr_dist_cpuid_add/rem_device_weight(), which adds
* and removes the weight of a device from the CPU that an interrupt
* is directed at. The quality of initialization improves when the
* device interrupt weights more accurately reflect actual run-time weights,
* and as the assignments are ordered from heavy to light.
*
* The implementation also supports interrupt weight being specified in
* driver.conf files via the property "ddi-intr-weight", which takes
* precedence over the nexus supplied weight. This support is added to
* permit possible tweaking in the product in response to customer
* problems. This is not a formal or committed interface.
*
* While a weighted approach chooses the CPU providing the best spread
* given past weights, less than optimal isolation can result in cases
* where heavy weight devices show up last. The nexus driver's interrupt
* redistribution logic should use intr_dist_add/rem_weighted so that
* interrupts can be redistributed heavy first for optimal isolation.
*/
uint32_t
intr_dist_cpuid(void)
{
static struct cpu *curr_cpu;
struct cpu *start_cpu;
struct cpu *new_cpu;
struct cpu *cp;
int cpuid = -1;
/* Establish exclusion for curr_cpu and cpu_intr_weight manipulation */
mutex_enter(&intr_dist_cpu_lock);
switch (intr_policy) {
case INTR_CURRENT_CPU:
cpuid = CPU->cpu_id;
break;
case INTR_BOOT_CPU:
panic("INTR_BOOT_CPU no longer supported.");
/*NOTREACHED*/
case INTR_FLAT_DIST:
case INTR_WEIGHTED_DIST:
default:
/*
* Ensure that curr_cpu is valid - cpu_next will be NULL if
* the cpu has been deleted (cpu structs are never freed).
*/
if (curr_cpu == NULL || curr_cpu->cpu_next == NULL)
curr_cpu = CPU;
/*
* Advance to the online CPU after curr_cpu (round-robin). For
* INTR_WEIGHTED_DIST we choose the cpu with the lightest
* weight; ties degrade to round-robin behavior. For a nexus
* that does not support weight, the default weight of zero is
* used and round-robin behavior continues.
*
* Disable preemption while traversing cpu_next_onln to
* ensure the list does not change. This works because
* modifiers of this list and other lists in a struct cpu
* call pause_cpus() before making changes.
*/
kpreempt_disable();
cp = start_cpu = curr_cpu->cpu_next_onln;
new_cpu = NULL;
do {
/* Skip CPUs with interrupts disabled */
if ((cp->cpu_flags & CPU_ENABLE) == 0)
continue;
if (intr_policy == INTR_FLAT_DIST) {
/* select CPU */
new_cpu = cp;
break;
} else if ((new_cpu == NULL) ||
(cp->cpu_intr_weight < new_cpu->cpu_intr_weight)) {
/* Choose if lighter weight */
new_cpu = cp;
}
} while ((cp = cp->cpu_next_onln) != start_cpu);
ASSERT(new_cpu);
cpuid = new_cpu->cpu_id;
INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: "
"targeted\n", cpuid, new_cpu->cpu_intr_weight));
/* update static pointer for next round-robin */
curr_cpu = new_cpu;
kpreempt_enable();
break;
}
mutex_exit(&intr_dist_cpu_lock);
return (cpuid);
}
/*
* Add or remove the weight of a device from a CPU's interrupt weight.
*
* We expect nexus drivers to call intr_dist_cpuid_add/rem_device_weight for
* their children to improve the overall quality of interrupt initialization.
*
* If a nexus shares the CPU returned by a single intr_dist_cpuid() call
* among multiple devices (sharing ino) then the nexus should call
* intr_dist_cpuid_add/rem_device_weight for each device separately. Devices
* that share must specify the same cpuid.
*
* If a nexus driver is unable to determine the cpu at remove_intr time
* for some of its interrupts, then it should not call add_device_weight -
* intr_dist_cpuid will still provide round-robin.
*
* An established device weight (from dev_info node) takes precedence over
* the weight passed in. If a device weight is not already established
* then the passed in nexus weight is established.
*/
void
intr_dist_cpuid_add_device_weight(uint32_t cpuid,
dev_info_t *dip, int32_t nweight)
{
int32_t eweight;
/*
* For non-weighted policy everything has weight of zero (and we get
* round-robin distribution from intr_dist_cpuid).
* NB: intr_policy is limited to this file. A weighted nexus driver
* calls this routine even if intr_policy has been patched to
* INTR_FLAT_DIST.
*/
ASSERT(dip);
if (intr_policy != INTR_WEIGHTED_DIST)
return;
eweight = i_ddi_get_intr_weight(dip);
INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: +%2d/%2d for "
"%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight,
nweight, eweight, ddi_driver_name(ddi_get_parent(dip)),
ddi_get_instance(ddi_get_parent(dip)),
ddi_driver_name(dip), ddi_get_instance(dip)));
/* if no established weight, establish the nexus weight */
if (eweight < 0) {
if (nweight > 0)
(void) i_ddi_set_intr_weight(dip, nweight);
else
nweight = 0;
} else
nweight = eweight; /* use established weight */
/* Establish exclusion for cpu_intr_weight manipulation */
mutex_enter(&intr_dist_cpu_lock);
cpu[cpuid]->cpu_intr_weight += nweight;
/* update intr_dist_weight_max */
if (nweight > intr_dist_weight_max)
intr_dist_weight_max = nweight;
mutex_exit(&intr_dist_cpu_lock);
}
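/*
* Illustrative sketch only; the mynex names are hypothetical. A weighted
* nexus typically picks a target CPU with intr_dist_cpuid() when enabling a
* child's interrupt, records the device weight against that CPU, and reverses
* the accounting against the same cpuid when the interrupt is removed:
*
*	cpuid = intr_dist_cpuid();
*	intr_dist_cpuid_add_device_weight(cpuid, child_dip, nweight);
*	mynex_program_mondo(softsp, ino, cpuid);
*	...
*	intr_dist_cpuid_rem_device_weight(cpuid, child_dip);
*/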
void
intr_dist_cpuid_rem_device_weight(uint32_t cpuid, dev_info_t *dip)
{
struct cpu *cp;
int32_t weight;
ASSERT(dip);
if (intr_policy != INTR_WEIGHTED_DIST)
return;
/* remove weight of device from cpu */
weight = i_ddi_get_intr_weight(dip);
if (weight < 0)
weight = 0;
INTR_DEBUG((CE_CONT, "intr_dist: cpu %2d weight %3d: -%2d for "
"%s#%d/%s#%d\n", cpuid, cpu[cpuid]->cpu_intr_weight, weight,
ddi_driver_name(ddi_get_parent(dip)),
ddi_get_instance(ddi_get_parent(dip)),
ddi_driver_name(dip), ddi_get_instance(dip)));
/* Establish exclusion for cpu_intr_weight manipulation */
mutex_enter(&intr_dist_cpu_lock);
cp = cpu[cpuid];
cp->cpu_intr_weight -= weight;
if (cp->cpu_intr_weight < 0)
cp->cpu_intr_weight = 0; /* sanity */
mutex_exit(&intr_dist_cpu_lock);
}
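/*
* The routines below are a simple handle-based wrapper around the software
* interrupt interfaces: create_softint() allocates a soft interrupt at the
* given PIL, invoke_softint() triggers it on the specified CPU (locally via
* setsoftint() or remotely via a cross-trap), remove_softint() frees it, and
* sync_softint() waits for outstanding cross-trap activity to the given CPU
* set to complete.
*/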
ulong_t
create_softint(uint_t pil, uint_t (*func)(caddr_t, caddr_t), caddr_t arg1)
{
uint64_t inum;
inum = add_softintr(pil, func, arg1, SOFTINT_MT);
return ((ulong_t)inum);
}
void
invoke_softint(processorid_t cpuid, ulong_t hdl)
{
uint64_t inum = hdl;
if (cpuid == CPU->cpu_id)
setsoftint(inum);
else
xt_one(cpuid, setsoftint_tl1, inum, 0);
}
void
remove_softint(ulong_t hdl)
{
uint64_t inum = hdl;
(void) rem_softintr(inum);
}
void
sync_softint(cpuset_t set)
{
xt_sync(set);
}
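/*
* Hedged usage sketch for the handle wrappers above; the handler name and
* state argument are made up for illustration:
*
*	ulong_t hdl = create_softint(PIL_1, my_softint_handler,
*	    (caddr_t)state);
*	invoke_softint(CPU->cpu_id, hdl);	(runs locally via setsoftint)
*	invoke_softint(other_cpuid, hdl);	(sent via cross-trap)
*	remove_softint(hdl);
*/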