9 files changed, 576 insertions, 64 deletions
diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
index d68260a8f8..48578a52f4 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
@@ -4059,6 +4059,9 @@ static const mdb_dcmd_t dcmds[] = {
 
 	/* from netstack.c */
 	{ "netstack", "", "show stack instances", netstack },
+	{ "netstackid2netstack", ":",
+		"translate a netstack id to its netstack_t",
+		netstackid2netstack },
 
 	/* from nvpair.c */
 	{ NVPAIR_DCMD_NAME, NVPAIR_DCMD_USAGE, NVPAIR_DCMD_DESCR,
@@ -4149,6 +4152,10 @@ static const mdb_dcmd_t dcmds[] = {
 		pfiles_help },
 
 	/* from zone.c */
+	{ "zid2zone", ":", "find the zone_t with the given zone id",
+		zid2zone },
+	{ "zdid2zone", ":", "find the zone_t with the given zone debug id",
+		zdid2zone },
 	{ "zone", "?[-r [-v]]", "display kernel zone(s)", zoneprt },
 	{ "zsd", ":[-v] [zsd_key]", "display zone-specific-data entries for "
 	    "selected zones", zsd },
diff --git a/usr/src/cmd/mdb/common/modules/genunix/netstack.c b/usr/src/cmd/mdb/common/modules/genunix/netstack.c
index 588bd6dbf3..d46bd85d1f 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/netstack.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/netstack.c
@@ -21,10 +21,9 @@
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright (c) 2012, Joyent, Inc.  All rights reserved.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <mdb/mdb_modapi.h>
 #include <mdb/mdb_ks.h>
 #include <mdb/mdb_ctf.h>
@@ -121,3 +120,30 @@ netstack(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 
 	return (DCMD_OK);
 }
+
+static int
+netstackid_lookup_cb(uintptr_t addr, const netstack_t *ns, void *arg)
+{
+	netstackid_t nid = *(uintptr_t *)arg;
+	if (ns->netstack_stackid == nid)
+		mdb_printf("%p\n", addr);
+
+	return (WALK_NEXT);
+}
+
+/*ARGSUSED*/
+int
+netstackid2netstack(uintptr_t addr, uint_t flags, int argc,
+    const mdb_arg_t *argv)
+{
+	if (!(flags & DCMD_ADDRSPEC) || argc != 0)
+		return (DCMD_USAGE);
+
+	if (mdb_walk("netstack", (mdb_walk_cb_t)netstackid_lookup_cb, &addr) ==
+	    -1) {
+		mdb_warn("failed to walk netstack");
+		return (DCMD_ERR);
+	}
+
+	return (DCMD_OK);
+}
diff --git a/usr/src/cmd/mdb/common/modules/genunix/netstack.h b/usr/src/cmd/mdb/common/modules/genunix/netstack.h
index 392565caca..f5773c36c1 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/netstack.h
+++ b/usr/src/cmd/mdb/common/modules/genunix/netstack.h
@@ -26,8 +26,6 @@
 #ifndef	_NETSTACK_H
 #define	_NETSTACK_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <mdb/mdb_modapi.h>
 
 #ifdef	__cplusplus
@@ -38,6 +36,7 @@ int netstack_walk_init(mdb_walk_state_t *);
 int netstack_walk_step(mdb_walk_state_t *);
 
 int netstack(uintptr_t, uint_t, int, const mdb_arg_t *);
+int netstackid2netstack(uintptr_t, uint_t, int, const mdb_arg_t *);
 
 #ifdef	__cplusplus
 }
diff --git a/usr/src/cmd/mdb/common/modules/genunix/zone.c b/usr/src/cmd/mdb/common/modules/genunix/zone.c
index 96f6b598ec..fc243061cb 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/zone.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/zone.c
@@ -20,6 +20,7 @@
  */
 /*
  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc.  All rights reserved.
  */
 
 #include <mdb/mdb_param.h>
@@ -54,6 +55,56 @@ char *zone_status_names[] = {
 	"dead"			/* ZONE_IS_DEAD */
 };
 
+static int
+zid_lookup_cb(uintptr_t addr, const zone_t *zone, void *arg)
+{
+	zoneid_t zid = *(uintptr_t *)arg;
+	if (zone->zone_id == zid)
+		mdb_printf("%p\n", addr);
+
+	return (WALK_NEXT);
+}
+
+/*ARGSUSED*/
+int
+zid2zone(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	if (!(flags & DCMD_ADDRSPEC) || argc != 0)
+		return (DCMD_USAGE);
+
+	if (mdb_walk("zone", (mdb_walk_cb_t)zid_lookup_cb, &addr) == -1) {
+		mdb_warn("failed to walk zone");
+		return (DCMD_ERR);
+	}
+
+	return (DCMD_OK);
+}
+
+static int
+zdid_lookup_cb(uintptr_t addr, const zone_t *zone, void *arg)
+{
+	zoneid_t zdid = *(uintptr_t *)arg;
+	if (zone->zone_did == zdid)
+		mdb_printf("%p\n", addr);
+
+	return (WALK_NEXT);
+}
+
+/*ARGSUSED*/
+int
+zdid2zone(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	if (!(flags & DCMD_ADDRSPEC) || argc != 0)
+		return (DCMD_USAGE);
+
+	if (mdb_walk("zone", (mdb_walk_cb_t)zdid_lookup_cb, &addr) == -1) {
+		mdb_warn("failed to walk zone");
+		return (DCMD_ERR);
+	}
+
+	return (DCMD_OK);
+}
+
 int
 zoneprt(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
 {
diff --git a/usr/src/cmd/mdb/common/modules/genunix/zone.h b/usr/src/cmd/mdb/common/modules/genunix/zone.h
index e0e5038527..94a383e41c 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/zone.h
+++ b/usr/src/cmd/mdb/common/modules/genunix/zone.h
@@ -27,14 +27,14 @@
 #ifndef	_ZONE_H
 #define	_ZONE_H
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <mdb/mdb_modapi.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
+extern int zid2zone(uintptr_t, uint_t, int argc, const mdb_arg_t *);
+extern int zdid2zone(uintptr_t, uint_t, int argc, const mdb_arg_t *);
 extern int zoneprt(uintptr_t, uint_t, int argc, const mdb_arg_t *);
 
 extern int zone_walk_init(mdb_walk_state_t *);
diff --git a/usr/src/uts/common/os/netstack.c b/usr/src/uts/common/os/netstack.c
index b8467fbe13..93fd1a387d 100644
--- a/usr/src/uts/common/os/netstack.c
+++ b/usr/src/uts/common/os/netstack.c
@@ -22,6 +22,7 @@
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
+ * Copyright (c) 2012, Joyent, Inc.  All rights reserved.
  */
 
 #include <sys/param.h>
@@ -205,6 +206,7 @@ void
 netstack_unregister(int moduleid)
 {
 	netstack_t *ns;
+	boolean_t created = B_FALSE;
 
 	ASSERT(moduleid >= 0 && moduleid < NS_MAX);
 
@@ -223,7 +225,33 @@ netstack_unregister(int moduleid)
 		nm_state_t *nms = &ns->netstack_m_state[moduleid];
 
 		mutex_enter(&ns->netstack_lock);
-		if (ns_reg[moduleid].nr_shutdown != NULL &&
+
+		/*
+		 * We need to be careful here. We could actually have a netstack
+		 * being created as we speak waiting for us to let go of this
+		 * lock to proceed. It may have set NSS_CREATE_NEEDED, but not
+		 * have gotten to the point of completing it yet. If
+		 * NSS_CREATE_NEEDED, we can safely just remove it here and
+		 * never create the module. However, if NSS_CREATE_INPROGRESS is
+		 * set, we need to still flag this module for shutdown and
+		 * deletion, just as though it had reached NSS_CREATE_COMPLETED.
+		 *
+		 * It is safe to do that because of two different guarantees
+		 * that exist in the system. The first is that before we do a
+		 * create, shutdown, or destroy, we ensure that nothing else is
+		 * in progress in the system for this netstack and wait for it
+		 * to complete. Secondly, because the zone is being created, we
+		 * know that the following call to apply_all_netstack will block
+		 * on the zone finishing its initialization.
+		 */
+		if (nms->nms_flags & NSS_CREATE_NEEDED)
+			nms->nms_flags &= ~NSS_CREATE_NEEDED;
+
+		if (nms->nms_flags & NSS_CREATE_INPROGRESS ||
+		    nms->nms_flags & NSS_CREATE_COMPLETED)
+			created = B_TRUE;
+
+		if (ns_reg[moduleid].nr_shutdown != NULL && created &&
 		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
 		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
 			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
@@ -231,8 +259,7 @@ netstack_unregister(int moduleid)
 			    netstack_t *, ns, int, moduleid);
 		}
 		if ((ns_reg[moduleid].nr_flags & NRF_REGISTERED) &&
-		    ns_reg[moduleid].nr_destroy != NULL &&
-		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
+		    ns_reg[moduleid].nr_destroy != NULL && created &&
 		    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
 			nms->nms_flags |= NSS_DESTROY_NEEDED;
 			DTRACE_PROBE2(netstack__destroy__needed,
diff --git a/usr/src/uts/i86pc/io/apix/apix.c b/usr/src/uts/i86pc/io/apix/apix.c
index 8c4ccb6a0a..81b88dc426 100644
--- a/usr/src/uts/i86pc/io/apix/apix.c
+++ b/usr/src/uts/i86pc/io/apix/apix.c
@@ -635,10 +635,10 @@ apix_send_eoi(void)
 /*
  * platform_intr_enter
  *
- *	Called at the beginning of the interrupt service routine to
- *	mask all level equal to and below the interrupt priority
- *	of the interrupting vector.  An EOI should be given to
- *	the interrupt controller to enable other HW interrupts.
+ *	Called at the beginning of the interrupt service routine, but unlike
+ *	pcplusmp, does not mask interrupts. An EOI is given to the interrupt
+ *	controller to enable other HW interrupts but interrupts are still
+ * 	masked by the IF flag.
  *
  *	Return -1 for spurious interrupts
  *
@@ -750,58 +750,30 @@ apix_intr_exit(int prev_ipl, int arg2)
 }
 
 /*
- * Mask all interrupts below or equal to the given IPL.
- * Any changes made to this function must also change X2APIC
- * version of setspl.
+ * The pcplusmp setspl code uses the TPR to mask all interrupts at or below the
+ * given ipl, but apix never uses the TPR and we never mask a subset of the
+ * interrupts. They are either all blocked by the IF flag or all can come in.
+ *
+ * For setspl, we mask all interrupts for XC_HI_PIL, otherwise, interrupts can
+ * come in if currently enabled by the IF flag. This table shows the state of
+ * the IF flag when we leave this function.
+ *
+ *    curr IF |	ipl == 15	ipl != 15
+ *    --------+---------------------------
+ *       0    |    0		    0
+ *       1    |    0		    1
  */
 static void
 apix_setspl(int ipl)
 {
-	/* interrupts at ipl above this cannot be in progress */
-	apic_cpus[psm_get_cpu_id()].aci_ISR_in_progress &= (2 << ipl) - 1;
-
 	/*
-	 * Mask all interrupts for XC_HI_PIL (i.e set TPR to 0xf).
-	 * Otherwise, enable all interrupts (i.e. set TPR to 0).
+	 * Interrupts at ipl above this cannot be in progress, so the following
+	 * mask is ok.
 	 */
-	if (ipl != XC_HI_PIL)
-		ipl = 0;
-
-#if defined(__amd64)
-	setcr8((ulong_t)ipl);
-#else
-	if (apic_have_32bit_cr8)
-		setcr8((ulong_t)ipl);
-	else
-		apicadr[APIC_TASK_REG] = ipl << APIC_IPL_SHIFT;
-#endif
-
-	/*
-	 * this is a patch fix for the ALR QSMP P5 machine, so that interrupts
-	 * have enough time to come in before the priority is raised again
-	 * during the idle() loop.
-	 */
-	if (apic_setspl_delay)
-		(void) apic_reg_ops->apic_get_pri();
-}
-
-/*
- * X2APIC version of setspl.
- */
-static void
-x2apix_setspl(int ipl)
-{
-	/* interrupts at ipl above this cannot be in progress */
 	apic_cpus[psm_get_cpu_id()].aci_ISR_in_progress &= (2 << ipl) - 1;
 
-	/*
-	 * Mask all interrupts for XC_HI_PIL (i.e set TPR to 0xf).
-	 * Otherwise, enable all interrupts (i.e. set TPR to 0).
-	 */
-	if (ipl != XC_HI_PIL)
-		ipl = 0;
-
-	X2APIC_WRITE(APIC_TASK_REG, ipl << APIC_IPL_SHIFT);
+	if (ipl == XC_HI_PIL)
+		cli();
 }
 
 int
@@ -1112,6 +1084,10 @@ apix_post_cyclic_setup(void *arg)
 	    apic_redistribute_sample_interval, DDI_IPL_2);
 }
 
+/*
+ * Called the first time we enable x2apic mode on this cpu.
+ * Update some of the function pointers to use x2apic routines.
+ */
 void
 x2apic_update_psm()
 {
@@ -1120,14 +1096,17 @@ x2apic_update_psm()
 	ASSERT(pops != NULL);
 
 	/*
-	 * The xxx_intr_exit() sets TPR and sends back EOI. The
-	 * xxx_setspl() sets TPR. These two routines are not
-	 * needed in new design.
+	 * The pcplusmp module x2apic_update_psm function does this:
 	 *
-	 * pops->psm_intr_exit = x2apic_intr_exit;
-	 * pops->psm_setspl = x2apic_setspl;
+	 *	pops->psm_intr_exit = x2apic_intr_exit;
+	 *	pops->psm_setspl = x2apic_setspl;
+	 *	pops->psm_send_ipi =  x2apic_send_ipi;
+	 *
+	 * Note the x2apic prefix vs. our apix prefix for setspl.
+	 * The x2apic_intr_exit() sets TPR and sends back EOI. The
+	 * x2apic_setspl() sets TPR.  This functionality is not
+	 * used in new design.
 	 */
-	pops->psm_setspl = x2apix_setspl;
 	pops->psm_send_ipi = x2apic_send_ipi;
 
 	send_dirintf = pops->psm_send_ipi;
@@ -2077,6 +2056,9 @@ apix_intx_get_pending(int irqno)
 	return (pending);
 }
 
+/*
+ * This function will mask the interrupt on the I/O APIC
+ */
 static void
 apix_intx_set_mask(int irqno)
 {
@@ -2106,6 +2088,9 @@ apix_intx_set_mask(int irqno)
 	intr_restore(iflag);
 }
 
+/*
+ * This function will clear the mask for the interrupt on the I/O APIC
+ */
 static void
 apix_intx_clear_mask(int irqno)
 {
diff --git a/usr/src/uts/i86pc/io/apix/apix_intr.c b/usr/src/uts/i86pc/io/apix/apix_intr.c
index e5d072b525..d870a4d365 100644
--- a/usr/src/uts/i86pc/io/apix/apix_intr.c
+++ b/usr/src/uts/i86pc/io/apix/apix_intr.c
@@ -862,6 +862,9 @@ apix_dispatch_lowlevel(uint_t vector, uint_t oldipl)
 	apix_intr_thread_epilog(cpu, oldipl);
 }
 
+/*
+ * Interrupt service routine, called with interrupts disabled.
+ */
 void
 apix_do_interrupt(struct regs *rp, trap_trace_rec_t *ttp)
 {
@@ -904,7 +907,7 @@ apix_do_interrupt(struct regs *rp, trap_trace_rec_t *ttp)
 	}
 
 	/*
-	 * Raise the interrupt priority. Send EOI to local APIC
+	 * Send EOI to local APIC
 	 */
 	newipl = (*setlvl)(oldipl, (int *)&rp->r_trapno);
 #ifdef TRAPTRACE
diff --git a/usr/src/uts/i86pc/os/intr.c b/usr/src/uts/i86pc/os/intr.c
index 91d7afcf36..8acaf8fc73 100644
--- a/usr/src/uts/i86pc/os/intr.c
+++ b/usr/src/uts/i86pc/os/intr.c
@@ -21,6 +21,420 @@
 
 /*
  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc.  All rights reserved.
+ */
+
+/*
+ * To understand the present state of interrupt handling on i86pc, we must
+ * first consider the history of interrupt controllers and our way of handling
+ * interrupts.
+ *
+ * History of Interrupt Controllers on i86pc
+ * -----------------------------------------
+ *
+ *    Intel 8259 and 8259A
+ *
+ * The first interrupt controller that attained widespread use on i86pc was
+ * the Intel 8259(A) Programmable Interrupt Controller that first saw use with
+ * the 8086. It took up to 8 interrupt sources and combined them into one
+ * output wire. Up to 8 8259s could be slaved together providing up to 64 IRQs.
+ * With the switch to the 8259A, level mode interrupts became possible. For a
+ * long time on i86pc the 8259A was the only way to handle interrupts and it
+ * had its own set of quirks. The 8259A and its corresponding interval timer
+ * the 8254 are programmed using outb and inb instructions.
+ *
+ *    Intel Advanced Programmable Interrupt Controller (APIC)
+ *
+ * Starting around the time of the introduction of the P6 family
+ * microarchitecture (i686) Intel introduced a new interrupt controller.
+ * Instead of having the series of slaved 8259A devices, Intel opted to outfit
+ * each processor with a Local APIC (lapic) and to outfit the system with at
+ * least one, but potentially more, I/O APICs (ioapic). The lapics and ioapics
+ * initially communicated over a dedicated bus, but this has since been
+ * replaced. Each physical core and even hyperthread currently contains its
+ * own local apic, which is not shared. There are a few exceptions for
+ * hyperthreads, but that does not usually concern us.
+ *
+ * Instead of talking directly to 8259 for status, sending End Of Interrupt
+ * (EOI), etc. a microprocessor now communicates directly to the lapic. This
+ * also allows for each microprocessor to be able to have independent controls.
+ * The programming method is different from the 8259. Consumers map the lapic
+ * registers into uncacheable memory to read and manipulate the state.
+ *
+ * The number of addressable interrupt vectors was increased to 256. However
+ * vectors 0-31 are reserved for the processor exception handling, leaving the
+ * remaining vectors for general use. In addition to hardware generated
+ * interrupts, the lapic provides a way for generating inter-processor
+ * interrupts (IPI) which are the basis for CPU cross calls and CPU pokes.
+ *
+ * AMD ended up implementing the Intel APIC architecture in lieu of their work
+ * with Cyrix.
+ *
+ *    Intel x2apic
+ *
+ * The x2apic is an extension to the lapic which started showing up around the
+ * same time as the Sandy Bridge chipsets. It provides a new programming mode
+ * as well as new features. The goal of the x2apic is to solve a few problems
+ * with the previous generation of lapic and the x2apic is backwards compatible
+ * with the previous programming model. The only downside to using the
+ * backward compatibility is that you are not able to take advantage of the new
+ * x2apic features.
+ *
+ *    o The APIC ID is increased from an 8-bit value to a 32-bit value. This
+ *    increases the maximum number of addressable physical processors beyond
+ *    256. This new ID is assembled in a similar manner as the information that
+ *    is obtainable by the extended cpuid topology leaves.
+ *
+ *    o A new means of generating IPIs was introduced.
+ *
+ *    o Instead of memory mapping the registers, the x2apic only allows for
+ *    programming it through a series of wrmsrs. This has important semantic
+ *    side effects. Recall that the registers were previously all mapped to
+ *    uncacheable memory which meant that all operations to the local apic were
+ *    serializing instructions. With the switch to using wrmsrs this has been
+ *    relaxed and these operations can no longer be assumed to be serializing
+ *    instructions.
+ *
+ * Note for the rest of this we are only going to concern ourselves with the
+ * apic and x2apic which practically all of i86pc has been using now for
+ * quite some time.
+ *
+ * Interrupt Priority Levels
+ * -------------------------
+ *
+ * On i86pc systems there are a total of fifteen interrupt priority levels
+ * (ipls) which range from 1-15. Level 0 is for normal processing and
+ * non-interrupt processing. To manipulate these values the family of spl
+ * functions (which date back to UNIX on the PDP-11) are used. Specifically,
+ * splr() to raise the priority level and splx() to lower it. One should not
+ * generally call setspl() directly.
+ *
+ * Both i86pc and the supported SPARC platforms honor the same conventions for
+ * the meaning behind these IPLs. The most important IPL is the platform's
+ * LOCK_LEVEL (0xa on i86pc). If a thread is above LOCK_LEVEL it _must_ not
+ * sleep on any synchronization object. The only allowed synchronization
+ * primitive is a mutex that has been specifically initialized to be a spin
+ * lock (see mutex_init(9F)). Another important level is DISP_LEVEL (0xb on
+ * i86pc). You must be at DISP_LEVEL if you want to control the dispatcher.
+ * The XC_HI_PIL is the highest level (0xf) and is used during cross-calls.
+ *
+ * Each interrupt that is registered in the system fires at a specific IPL.
+ * Generally most interrupts fire below LOCK_LEVEL.
+ *
+ * PSM Drivers
+ * -----------
+ *
+ * We currently have three sets of PSM drivers available: uppc, pcplusmp, and
+ * apix. uppc (uni-processor PC) is the original driver that interacts with the
+ * 8259A and 8254. In general, it is not used anymore given the prevalence of
+ * the apic.
+ *
+ * The system prefers to use the apix driver over the pcplusmp driver. The apix
+ * driver requires HW support for an x2apic. If there is no x2apic HW, apix
+ * will not be used. In general we prefer using the apix driver over the
+ * pcplusmp driver because it gives us much more flexibility with respect to
+ * interrupts. In the apix driver each local apic has its own independent set
+ * of interrupts, whereas the pcplusmp driver only has a single global set of
+ * interrupts. This is why pcplusmp only supports a finite number of interrupts
+ * per IPL -- generally 16, often less. The apix driver supports using either
+ * the x2apic or the local apic programming modes. The programming mode does not
+ * change the number of interrupts available, just the number of processors
+ * that we can address. For the apix driver, the x2apic mode is enabled if the
+ * system supports interrupt re-mapping, otherwise the module manages the
+ * x2apic in local mode.
+ *
+ * When there is no x2apic present, we default back to the pcplusmp PSM driver.
+ * In general, this is not problematic unless you have more than 256
+ * processors in the machine or you do not have enough interrupts available.
+ *
+ * Controlling Interrupt Generation on i86pc
+ * -----------------------------------------
+ *
+ * There are two different ways to manipulate which interrupts will be
+ * generated on i86pc. Each offers different degrees of control.
+ *
+ * The first is through the flags register (eflags and rflags on i386 and amd64
+ * respectively). The IF bit determines whether or not interrupts are enabled
+ * or disabled. This is manipulated in one of several ways. The most common way
+ * is through the cli and sti instructions. These clear the IF flag and set it,
+ * respectively, for the current processor. The other common way is through the
+ * use of the intr_clear and intr_restore functions.
+ *
+ * Assuming interrupts are not blocked by the IF flag, then the second form is
+ * through the Processor-Priority Register (PPR). The PPR is used to determine
+ * whether or not a pending interrupt should be delivered. If the ipl of the
+ * new interrupt is higher than the current value in the PPR, then the lapic
+ * will either deliver it immediately (if interrupts are not in progress) or it
+ * will deliver it once the current interrupt processing has issued an EOI. The
+ * highest unmasked interrupt will be the one delivered.
+ *
+ * The PPR register is based upon the max of the following two registers in the
+ * lapic: the TPR register (also known as CR8 on amd64) that can be used to
+ * mask interrupt levels, and the current vector. Because the pcplusmp module
+ * always sets TPR appropriately early in the do_interrupt path, we can usually
+ * just think that the PPR is the TPR. The pcplusmp module also issues an EOI
+ * once it has set the TPR, so higher priority interrupts can come in while
+ * we're servicing a lower priority interrupt.
+ *
+ * Handling Interrupts
+ * -------------------
+ *
+ * Interrupts can be broken down into three categories based on priority and
+ * source:
+ *
+ *   o High level interrupts
+ *   o Low level hardware interrupts
+ *   o Low level software interrupts
+ *
+ *   High Level Interrupts
+ *
+ * High level interrupts encompass both hardware-sourced and software-sourced
+ * interrupts. Examples of high level hardware interrupts include the serial
+ * console. High level software-sourced interrupts are still delivered through
+ * the local apic through IPIs. This is primarily cross calls.
+ *
+ * When a high level interrupt comes in, we will raise the SPL and then pin the
+ * current lwp to the processor. We will use its lwp, but our own interrupt
+ * stack and process the high level interrupt in-situ. These handlers are
+ * designed to be very short in nature and cannot go to sleep, only block on a
+ * spin lock. If the interrupt has a lot of work to do, it must generate a
+ * low-priority software interrupt that will be processed later.
+ *
+ *   Low level hardware interrupts
+ *
+ * Low level hardware interrupts start off like their high-level cousins. The
+ * current CPU contains a number of kernel threads (kthread_t) that can be used
+ * to process low level interrupts. These are shared between both low level
+ * hardware and software interrupts. Note that while we run with our
+ * kthread_t, we borrow the pinned thread's lwp_t until such a time as we hit a
+ * synchronization object. If we hit one and need to sleep, then the scheduler
+ * will instead create the rest of what we need.
+ *
+ *   Low level software interrupts
+ *
+ * Low level software interrupts are handled in a similar way as hardware
+ * interrupts, but the notification vector is different. Each CPU has a bitmask
+ * of pending software interrupts. We can notify a CPU to process software
+ * interrupts through a specific trap vector as well as through several
+ * checks that are performed throughout the code. These checks will look at
+ * processing software interrupts as we lower our spl.
+ *
+ * We attempt to process the highest pending software interrupt that we can
+ * which is greater than our current IPL. If none currently exist, then we move
+ * on. We process a software interrupt in a similar fashion to a hardware
+ * interrupt.
+ *
+ * Traditional Interrupt Flow
+ * --------------------------
+ *
+ * The following diagram tracks the flow of the traditional uppc and pcplusmp
+ * interrupt handlers. The apix driver has its own version of do_interrupt().
+ * We come into the interrupt handler with all interrupts masked by the IF
+ * flag. This is because we set up the handler using an interrupt-gate, which
+ * is defined architecturally to have cleared the IF flag for us.
+ *
+ * +--------------+    +----------------+    +-----------+
+ * | _interrupt() |--->| do_interrupt() |--->| *setlvl() |
+ * +--------------+    +----------------+    +-----------+
+ *                       |      |     |
+ *                       |      |     |
+ *              low-level|      |     | softint
+ *                HW int |      |     +---------------------------------------+
+ * +--------------+      |      |                                             |
+ * | intr_thread_ |<-----+      | hi-level int                                |
+ * | prolog()     |             |    +----------+                             |
+ * +--------------+             +--->| hilevel_ |      Not on intr stack      |
+ *       |                           | intr_    |-----------------+           |
+ *       |                           | prolog() |                 |           |
+ * +------------+                    +----------+                 |           |
+ * | switch_sp_ |                        | On intr                v           |
+ * | and_call() |                        | Stack          +------------+      |
+ * +------------+                        |                | switch_sp_ |      |
+ *       |                               v                | and_call() |      |
+ *       v                             +-----------+      +------------+      |
+ * +-----------+                       | dispatch_ |             |            |
+ * | dispatch_ |   +-------------------| hilevel() |<------------+            |
+ * | hardint() |   |                   +-----------+                          |
+ * +-----------+   |                                                          |
+ *       |         v                                                          |
+ *       |     +-----+  +----------------------+  +-----+  hi-level           |
+ *       +---->| sti |->| av_dispatch_autovect |->| cli |---------+           |
+ *             +-----+  +----------------------+  +-----+         |           |
+ *                                |                |              |           |
+ *                                v                |              |           |
+ *                         +----------+            |              |           |
+ *                         | for each |            |              |           |
+ *                         | handler  |            |              |           |
+ *                         |  *intr() |            |              v           |
+ * +--------------+        +----------+            |      +----------------+  |
+ * | intr_thread_ |                      low-level |      | hilevel_intr_  |  |
+ * | epilog()     |<-------------------------------+      | epilog()       |  |
+ * +--------------+                                       +----------------+  |
+ *   |       |                                                   |            |
+ *   |       +----------------------v      v---------------------+            |
+ *   |                           +------------+                               |
+ *   |   +---------------------->| *setlvlx() |                               |
+ *   |   |                       +------------+                               |
+ *   |   |                              |                                     |
+ *   |   |                              v                                     |
+ *   |   |      +--------+     +------------------+      +-------------+      |
+ *   |   |      | return |<----| softint pending? |----->| dosoftint() |<-----+
+ *   |   |      +--------+  no +------------------+ yes  +-------------+
+ *   |   |           ^                                      |     |
+ *   |   |           |  softint pil too low                 |     |
+ *   |   |           +--------------------------------------+     |
+ *   |   |                                                        v
+ *   |   |    +-----------+      +------------+          +-----------+
+ *   |   |    | dispatch_ |<-----| switch_sp_ |<---------| *setspl() |
+ *   |   |    | softint() |      | and_call() |          +-----------+
+ *   |   |    +-----------+      +------------+
+ *   |   |        |
+ *   |   |        v
+ *   |   |      +-----+  +----------------------+  +-----+  +------------+
+ *   |   |      | sti |->| av_dispatch_autovect |->| cli |->| dosoftint_ |
+ *   |   |      +-----+  +----------------------+  +-----+  | epilog()   |
+ *   |   |                                                  +------------+
+ *   |   |                                                    |     |
+ *   |   +----------------------------------------------------+     |
+ *   v                                                              |
+ * +-----------+                                                    |
+ * | interrupt |                                                    |
+ * | thread    |<---------------------------------------------------+
+ * | blocked   |
+ * +-----------+
+ *      |
+ *      v
+ *  +----------------+  +------------+  +-----------+  +-------+  +---------+
+ *  | set_base_spl() |->| *setlvlx() |->| splhigh() |->| sti() |->| swtch() |
+ *  +----------------+  +------------+  +-----------+  +-------+  +---------+
+ *
+ *    Calls made on Interrupt Stacks and Epilogue routines
+ *
+ * We use the switch_sp_and_call() assembly routine to switch our sp to the
+ * interrupt stacks and then call the appropriate dispatch function. In the
+ * case of interrupts which may block, softints and hardints, we always ensure
+ * that we are still on the interrupt thread when we call the epilog routine.
+ * This is not just important, it's necessary. If the interrupt thread blocked,
+ * we won't return from our switch_sp_and_call() function and instead we'll go
+ * through and set ourselves up to swtch() directly.
+ *
+ * New Interrupt Flow
+ * ------------------
+ *
+ * The apix module has its own interrupt path. This is done for various
+ * reasons. The first is that rather than having global interrupt vectors, we
+ * now have per-cpu vectors.
+ *
+ * The other substantial change is that the apix design does not use the TPR to
+ * mask interrupts below the current level. In fact, except for one special
+ * case, it does not use the TPR at all. Instead, it only uses the IF flag
+ * (cli/sti) to either block all interrupts or allow any interrupts to come in.
+ * The design is such that when interrupts are allowed to come in, if we are
+ * currently servicing a higher priority interrupt, the new interrupt is treated
+ * as pending and serviced later. Specifically, in the pcplusmp module's
+ * apic_intr_enter function the code masks interrupts at or below the current
+ * IPL using the TPR before sending EOI, whereas the apix module's
+ * apix_intr_enter function simply sends EOI.
+ *
+ * The one special case where the apix code uses the TPR is when it calls
+ * through the apic_reg_ops function pointer apic_write_task_reg in
+ * apix_init_intr() to initially mask all levels and then finally to enable all
+ * levels.
+ *
+ * Recall that we come into the interrupt handler with all interrupts masked
+ * by the IF flag. This is because we set up the handler using an
+ * interrupt-gate which is defined architecturally to have cleared the IF flag
+ * for us.
+ *
+ * +--------------+    +---------------------+
+ * | _interrupt() |--->| apix_do_interrupt() |
+ * +--------------+    +---------------------+
+ *                               |
+ *                hard int? +----+--------+ softint?
+ *                          |             | (but no low-level looping)
+ *                   +-----------+        |
+ *                   | *setlvl() |        |
+ * +---------+       +-----------+        +----------------------------------+
+ * |apix_add_|    check IPL |                                                |
+ * |pending_ |<-------------+------+----------------------+                  |
+ * |hardint()|        low-level int|          hi-level int|                  |
+ * +---------+                     v                      v                  |
+ *     | check IPL       +-----------------+     +---------------+           |
+ *  +--+-----+           | apix_intr_      |     | apix_hilevel_ |           |
+ *  |        |           | thread_prolog() |     | intr_prolog() |           |
+ *  |      return        +-----------------+     +---------------+           |
+ *  |                         |                    | On intr                 |
+ *  |                   +------------+             | stack?  +------------+  |
+ *  |                   | switch_sp_ |             +---------| switch_sp_ |  |
+ *  |                   | and_call() |             |         | and_call() |  |
+ *  |                   +------------+             |         +------------+  |
+ *  |                         |                    |          |              |
+ *  |                   +----------------+     +----------------+            |
+ *  |                   | apix_dispatch_ |     | apix_dispatch_ |            |
+ *  |                   | lowlevel()     |     | hilevel()      |            |
+ *  |                   +----------------+     +----------------+            |
+ *  |                                |             |                         |
+ *  |                                v             v                         |
+ *  |                       +-------------------------+                      |
+ *  |                       |apix_dispatch_by_vector()|----+                 |
+ *  |                       +-------------------------+    |                 |
+ *  |               !XC_HI_PIL|         |         |        |                 |
+ *  |                       +---+   +-------+   +---+      |                 |
+ *  |                       |sti|   |*intr()|   |cli|      |                 |
+ *  |                       +---+   +-------+   +---+      |  hi-level?      |
+ *  |                          +---------------------------+----+            |
+ *  |                          v                low-level?      v            |
+ *  |                  +----------------+               +----------------+   |
+ *  |                  | apix_intr_     |               | apix_hilevel_  |   |
+ *  |                  | thread_epilog()|               | intr_epilog()  |   |
+ *  |                  +----------------+               +----------------+   |
+ *  |                          |                                |            |
+ *  |        v-----------------+--------------------------------+            |
+ *  |  +------------+                                                        |
+ *  |  | *setlvlx() |   +----------------------------------------------------+
+ *  |  +------------+   |
+ *  |      |            |            +--------------------------------+ low
+ *  v      v     v------+            v                                | level
+ * +------------------+      +------------------+      +-----------+  | pending?
+ * | apix_do_pending_ |----->| apix_do_pending_ |----->| apix_do_  |--+
+ * | hilevel()        |      | hardint()        |      | softint() |  |
+ * +------------------+      +------------------+      +-----------+    return
+ *     |                       |                         |
+ *     | while pending         | while pending           | while pending
+ *     | hi-level              | low-level               | softint
+ *     |                       |                         |
+ *  +---------------+        +-----------------+       +-----------------+
+ *  | apix_hilevel_ |        | apix_intr_      |       | apix_do_        |
+ *  | intr_prolog() |        | thread_prolog() |       | softint_prolog()|
+ *  +---------------+        +-----------------+       +-----------------+
+ *     | On intr                       |                      |
+ *     | stack? +------------+    +------------+        +------------+
+ *     +--------| switch_sp_ |    | switch_sp_ |        | switch_sp_ |
+ *     |        | and_call() |    | and_call() |        | and_call() |
+ *     |        +------------+    +------------+        +------------+
+ *     |           |                   |                      |
+ *  +------------------+   +------------------+   +------------------------+
+ *  | apix_dispatch_   |   | apix_dispatch_   |   | apix_dispatch_softint()|
+ *  | pending_hilevel()|   | pending_hardint()|   +------------------------+
+ *  +------------------+   +------------------+      |    |      |      |
+ *    |         |           |         |              |    |      |      |
+ *    | +----------------+  | +----------------+     |    |      |      |
+ *    | | apix_hilevel_  |  | | apix_intr_     |     |    |      |      |
+ *    | | intr_epilog()  |  | | thread_epilog()|     |    |      |      |
+ *    | +----------------+  | +----------------+     |    |      |      |
+ *    |         |           |       |                |    |      |      |
+ *    |   +------------+    |  +----------+   +------+    |      |      |
+ *    |   | *setlvlx() |    |  |*setlvlx()|   |           |      |      |
+ *    |   +------------+    |  +----------+   |   +----------+   |   +---------+
+ *    |                     |               +---+ |av_       | +---+ |apix_do_ |
+ * +---------------------------------+      |sti| |dispatch_ | |cli| |softint_ |
+ * | apix_dispatch_pending_autovect()|      +---+ |softvect()| +---+ |epilog() |
+ * +---------------------------------+            +----------+       +---------+
+ *  |!XC_HI_PIL  |       |         |                    |
+ * +---+  +-------+    +---+  +----------+          +-------+
+ * |sti|  |*intr()|    |cli|  |apix_post_|          |*intr()|
+ * +---+  +-------+    +---+  |hardint() |          +-------+
+ *                            +----------+
  */
 
 #include <sys/cpuvar.h>