summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/genunix.c7
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/netstack.c30
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/netstack.h3
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/zone.c51
-rw-r--r--usr/src/cmd/mdb/common/modules/genunix/zone.h4
-rw-r--r--usr/src/uts/common/os/netstack.c33
-rw-r--r--usr/src/uts/i86pc/io/apix/apix.c93
-rw-r--r--usr/src/uts/i86pc/io/apix/apix_intr.c5
-rw-r--r--usr/src/uts/i86pc/os/intr.c414
9 files changed, 576 insertions, 64 deletions
diff --git a/usr/src/cmd/mdb/common/modules/genunix/genunix.c b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
index d68260a8f8..48578a52f4 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/genunix.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/genunix.c
@@ -4059,6 +4059,9 @@ static const mdb_dcmd_t dcmds[] = {
/* from netstack.c */
{ "netstack", "", "show stack instances", netstack },
+ { "netstackid2netstack", ":",
+ "translate a netstack id to its netstack_t",
+ netstackid2netstack },
/* from nvpair.c */
{ NVPAIR_DCMD_NAME, NVPAIR_DCMD_USAGE, NVPAIR_DCMD_DESCR,
@@ -4149,6 +4152,10 @@ static const mdb_dcmd_t dcmds[] = {
pfiles_help },
/* from zone.c */
+ { "zid2zone", ":", "find the zone_t with the given zone id",
+ zid2zone },
+ { "zdid2zone", ":", "find the zone_t with the given zone debug id",
+ zdid2zone },
{ "zone", "?[-r [-v]]", "display kernel zone(s)", zoneprt },
{ "zsd", ":[-v] [zsd_key]", "display zone-specific-data entries for "
"selected zones", zsd },
diff --git a/usr/src/cmd/mdb/common/modules/genunix/netstack.c b/usr/src/cmd/mdb/common/modules/genunix/netstack.c
index 588bd6dbf3..d46bd85d1f 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/netstack.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/netstack.c
@@ -21,10 +21,9 @@
/*
* Copyright 2007 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <mdb/mdb_modapi.h>
#include <mdb/mdb_ks.h>
#include <mdb/mdb_ctf.h>
@@ -121,3 +120,30 @@ netstack(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
return (DCMD_OK);
}
+
+/*
+ * Callback for the "netstack" walk started by netstackid2netstack(): arg
+ * points at the requested stack id (stored as a uintptr_t).  Prints the
+ * address of each netstack_t whose netstack_stackid matches and always lets
+ * the walk continue.
+ */
+static int
+netstackid_lookup_cb(uintptr_t addr, const netstack_t *ns, void *arg)
+{
+	const uintptr_t *wanted = arg;
+	netstackid_t nid = *wanted;
+
+	if (ns->netstack_stackid != nid)
+		return (WALK_NEXT);
+
+	mdb_printf("%p\n", addr);
+	return (WALK_NEXT);
+}
+
+/*
+ * ::netstackid2netstack dcmd.  The address supplied to the dcmd is treated as
+ * a netstack id; the "netstack" walker is run with netstackid_lookup_cb(),
+ * which prints the address of every netstack_t whose id matches.
+ *
+ * Returns DCMD_USAGE when no address was supplied or when any arguments are
+ * given, DCMD_ERR when the walk fails, and DCMD_OK otherwise.
+ */
+/*ARGSUSED*/
+int
+netstackid2netstack(uintptr_t addr, uint_t flags, int argc,
+    const mdb_arg_t *argv)
+{
+	if (!(flags & DCMD_ADDRSPEC) || argc != 0)
+		return (DCMD_USAGE);
+
+	if (mdb_walk("netstack", (mdb_walk_cb_t)netstackid_lookup_cb, &addr) ==
+	    -1) {
+		/* The walker here is "netstack", so name it in the warning. */
+		mdb_warn("failed to walk netstack");
+		return (DCMD_ERR);
+	}
+
+	return (DCMD_OK);
+}
diff --git a/usr/src/cmd/mdb/common/modules/genunix/netstack.h b/usr/src/cmd/mdb/common/modules/genunix/netstack.h
index 392565caca..f5773c36c1 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/netstack.h
+++ b/usr/src/cmd/mdb/common/modules/genunix/netstack.h
@@ -26,8 +26,6 @@
#ifndef _NETSTACK_H
#define _NETSTACK_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <mdb/mdb_modapi.h>
#ifdef __cplusplus
@@ -38,6 +36,7 @@ int netstack_walk_init(mdb_walk_state_t *);
int netstack_walk_step(mdb_walk_state_t *);
int netstack(uintptr_t, uint_t, int, const mdb_arg_t *);
+int netstackid2netstack(uintptr_t, uint_t, int, const mdb_arg_t *);
#ifdef __cplusplus
}
diff --git a/usr/src/cmd/mdb/common/modules/genunix/zone.c b/usr/src/cmd/mdb/common/modules/genunix/zone.c
index 96f6b598ec..fc243061cb 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/zone.c
+++ b/usr/src/cmd/mdb/common/modules/genunix/zone.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <mdb/mdb_param.h>
@@ -54,6 +55,56 @@ char *zone_status_names[] = {
"dead" /* ZONE_IS_DEAD */
};
+/*
+ * Callback for the "zone" walk started by zid2zone(): arg points at the
+ * requested zone id (stored as a uintptr_t).  Prints the address of each
+ * zone_t whose zone_id matches and always lets the walk continue.
+ */
+static int
+zid_lookup_cb(uintptr_t addr, const zone_t *zone, void *arg)
+{
+	const uintptr_t *wanted = arg;
+	zoneid_t zid = *wanted;
+
+	if (zone->zone_id != zid)
+		return (WALK_NEXT);
+
+	mdb_printf("%p\n", addr);
+	return (WALK_NEXT);
+}
+
+/*
+ * ::zid2zone dcmd.  The address supplied to the dcmd is treated as a zone id;
+ * the "zone" walker is run with zid_lookup_cb() to print each matching zone_t.
+ * Returns DCMD_USAGE without an address or with any arguments, DCMD_ERR if the
+ * walk fails, DCMD_OK otherwise.
+ */
+/*ARGSUSED*/
+int
+zid2zone(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	int have_addr = (flags & DCMD_ADDRSPEC) != 0;
+
+	if (!have_addr || argc != 0)
+		return (DCMD_USAGE);
+
+	if (mdb_walk("zone", (mdb_walk_cb_t)zid_lookup_cb, &addr) != -1)
+		return (DCMD_OK);
+
+	mdb_warn("failed to walk zone");
+	return (DCMD_ERR);
+}
+
+/*
+ * Callback for the "zone" walk started by zdid2zone(): arg points at the
+ * requested zone debug id (stored as a uintptr_t).  Prints the address of
+ * each zone_t whose zone_did matches and always lets the walk continue.
+ */
+static int
+zdid_lookup_cb(uintptr_t addr, const zone_t *zone, void *arg)
+{
+	const uintptr_t *wanted = arg;
+	zoneid_t zdid = *wanted;
+
+	if (zone->zone_did != zdid)
+		return (WALK_NEXT);
+
+	mdb_printf("%p\n", addr);
+	return (WALK_NEXT);
+}
+
+/*
+ * ::zdid2zone dcmd.  The address supplied to the dcmd is treated as a zone
+ * debug id; the "zone" walker is run with zdid_lookup_cb() to print each
+ * matching zone_t.  Returns DCMD_USAGE without an address or with any
+ * arguments, DCMD_ERR if the walk fails, DCMD_OK otherwise.
+ */
+/*ARGSUSED*/
+int
+zdid2zone(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+	int have_addr = (flags & DCMD_ADDRSPEC) != 0;
+
+	if (!have_addr || argc != 0)
+		return (DCMD_USAGE);
+
+	if (mdb_walk("zone", (mdb_walk_cb_t)zdid_lookup_cb, &addr) != -1)
+		return (DCMD_OK);
+
+	mdb_warn("failed to walk zone");
+	return (DCMD_ERR);
+}
+
int
zoneprt(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
{
diff --git a/usr/src/cmd/mdb/common/modules/genunix/zone.h b/usr/src/cmd/mdb/common/modules/genunix/zone.h
index e0e5038527..94a383e41c 100644
--- a/usr/src/cmd/mdb/common/modules/genunix/zone.h
+++ b/usr/src/cmd/mdb/common/modules/genunix/zone.h
@@ -27,14 +27,14 @@
#ifndef _ZONE_H
#define _ZONE_H
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <mdb/mdb_modapi.h>
#ifdef __cplusplus
extern "C" {
#endif
+extern int zid2zone(uintptr_t, uint_t, int argc, const mdb_arg_t *);
+extern int zdid2zone(uintptr_t, uint_t, int argc, const mdb_arg_t *);
extern int zoneprt(uintptr_t, uint_t, int argc, const mdb_arg_t *);
extern int zone_walk_init(mdb_walk_state_t *);
diff --git a/usr/src/uts/common/os/netstack.c b/usr/src/uts/common/os/netstack.c
index b8467fbe13..93fd1a387d 100644
--- a/usr/src/uts/common/os/netstack.c
+++ b/usr/src/uts/common/os/netstack.c
@@ -22,6 +22,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
*/
#include <sys/param.h>
@@ -205,6 +206,7 @@ void
netstack_unregister(int moduleid)
{
netstack_t *ns;
+ boolean_t created = B_FALSE;
ASSERT(moduleid >= 0 && moduleid < NS_MAX);
@@ -223,7 +225,33 @@ netstack_unregister(int moduleid)
nm_state_t *nms = &ns->netstack_m_state[moduleid];
mutex_enter(&ns->netstack_lock);
- if (ns_reg[moduleid].nr_shutdown != NULL &&
+
+ /*
+ * We need to be careful here. We could actually have a netstack
+ * being created as we speak waiting for us to let go of this
+ * lock to proceed. It may have set NSS_CREATE_NEEDED, but not
+ * have gotten to the point of completing it yet. If
+ * NSS_CREATE_NEEDED, we can safely just remove it here and
+ * never create the module. However, if NSS_CREATE_INPROGRESS is
+ * set, we need to still flag this module for shutdown and
+ * deletion, just as though it had reached NSS_CREATE_COMPLETED.
+ *
+ * It is safe to do that because of two different guarantees
+ * that exist in the system. The first is that before we do a
+ * create, shutdown, or destroy, we ensure that nothing else is
+ * in progress in the system for this netstack and wait for it
+ * to complete. Secondly, because the zone is being created, we
+ * know that the following call to apply_all_netstack will block
+ * on the zone finishing its initialization.
+ */
+ if (nms->nms_flags & NSS_CREATE_NEEDED)
+ nms->nms_flags &= ~NSS_CREATE_NEEDED;
+
+ if (nms->nms_flags & NSS_CREATE_INPROGRESS ||
+ nms->nms_flags & NSS_CREATE_COMPLETED)
+ created = B_TRUE;
+
+ if (ns_reg[moduleid].nr_shutdown != NULL && created &&
(nms->nms_flags & NSS_CREATE_COMPLETED) &&
(nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
@@ -231,8 +259,7 @@ netstack_unregister(int moduleid)
netstack_t *, ns, int, moduleid);
}
if ((ns_reg[moduleid].nr_flags & NRF_REGISTERED) &&
- ns_reg[moduleid].nr_destroy != NULL &&
- (nms->nms_flags & NSS_CREATE_COMPLETED) &&
+ ns_reg[moduleid].nr_destroy != NULL && created &&
(nms->nms_flags & NSS_DESTROY_ALL) == 0) {
nms->nms_flags |= NSS_DESTROY_NEEDED;
DTRACE_PROBE2(netstack__destroy__needed,
diff --git a/usr/src/uts/i86pc/io/apix/apix.c b/usr/src/uts/i86pc/io/apix/apix.c
index 8c4ccb6a0a..81b88dc426 100644
--- a/usr/src/uts/i86pc/io/apix/apix.c
+++ b/usr/src/uts/i86pc/io/apix/apix.c
@@ -635,10 +635,10 @@ apix_send_eoi(void)
/*
* platform_intr_enter
*
- * Called at the beginning of the interrupt service routine to
- * mask all level equal to and below the interrupt priority
- * of the interrupting vector. An EOI should be given to
- * the interrupt controller to enable other HW interrupts.
+ * Called at the beginning of the interrupt service routine, but unlike
+ * pcplusmp, does not mask interrupts. An EOI is given to the interrupt
+ * controller to enable other HW interrupts but interrupts are still
+ * masked by the IF flag.
*
* Return -1 for spurious interrupts
*
@@ -750,58 +750,30 @@ apix_intr_exit(int prev_ipl, int arg2)
}
/*
- * Mask all interrupts below or equal to the given IPL.
- * Any changes made to this function must also change X2APIC
- * version of setspl.
+ * The pcplusmp setspl code uses the TPR to mask all interrupts at or below the
+ * given ipl, but apix never uses the TPR and we never mask a subset of the
+ * interrupts. They are either all blocked by the IF flag or all can come in.
+ *
+ * For setspl, we mask all interrupts for XC_HI_PIL, otherwise, interrupts can
+ * come in if currently enabled by the IF flag. This table shows the state of
+ * the IF flag when we leave this function.
+ *
+ * curr IF | ipl == 15 ipl != 15
+ * --------+---------------------------
+ * 0 | 0 0
+ * 1 | 0 1
*/
static void
apix_setspl(int ipl)
{
- /* interrupts at ipl above this cannot be in progress */
- apic_cpus[psm_get_cpu_id()].aci_ISR_in_progress &= (2 << ipl) - 1;
-
/*
- * Mask all interrupts for XC_HI_PIL (i.e set TPR to 0xf).
- * Otherwise, enable all interrupts (i.e. set TPR to 0).
+ * Interrupts at ipl above this cannot be in progress, so the following
+ * mask is ok.
*/
- if (ipl != XC_HI_PIL)
- ipl = 0;
-
-#if defined(__amd64)
- setcr8((ulong_t)ipl);
-#else
- if (apic_have_32bit_cr8)
- setcr8((ulong_t)ipl);
- else
- apicadr[APIC_TASK_REG] = ipl << APIC_IPL_SHIFT;
-#endif
-
- /*
- * this is a patch fix for the ALR QSMP P5 machine, so that interrupts
- * have enough time to come in before the priority is raised again
- * during the idle() loop.
- */
- if (apic_setspl_delay)
- (void) apic_reg_ops->apic_get_pri();
-}
-
-/*
- * X2APIC version of setspl.
- */
-static void
-x2apix_setspl(int ipl)
-{
- /* interrupts at ipl above this cannot be in progress */
apic_cpus[psm_get_cpu_id()].aci_ISR_in_progress &= (2 << ipl) - 1;
- /*
- * Mask all interrupts for XC_HI_PIL (i.e set TPR to 0xf).
- * Otherwise, enable all interrupts (i.e. set TPR to 0).
- */
- if (ipl != XC_HI_PIL)
- ipl = 0;
-
- X2APIC_WRITE(APIC_TASK_REG, ipl << APIC_IPL_SHIFT);
+ if (ipl == XC_HI_PIL)
+ cli();
}
int
@@ -1112,6 +1084,10 @@ apix_post_cyclic_setup(void *arg)
apic_redistribute_sample_interval, DDI_IPL_2);
}
+/*
+ * Called the first time we enable x2apic mode on this cpu.
+ * Update some of the function pointers to use x2apic routines.
+ */
void
x2apic_update_psm()
{
@@ -1120,14 +1096,17 @@ x2apic_update_psm()
ASSERT(pops != NULL);
/*
- * The xxx_intr_exit() sets TPR and sends back EOI. The
- * xxx_setspl() sets TPR. These two routines are not
- * needed in new design.
+ * The pcplusmp module x2apic_update_psm function does this:
*
- * pops->psm_intr_exit = x2apic_intr_exit;
- * pops->psm_setspl = x2apic_setspl;
+ * pops->psm_intr_exit = x2apic_intr_exit;
+ * pops->psm_setspl = x2apic_setspl;
+ * pops->psm_send_ipi = x2apic_send_ipi;
+ *
+ * Note the x2apic prefix vs. our apix prefix for setspl.
+ * The x2apic_intr_exit() sets TPR and sends back EOI. The
+ * x2apic_setspl() sets TPR. This functionality is not
+ * used in new design.
*/
- pops->psm_setspl = x2apix_setspl;
pops->psm_send_ipi = x2apic_send_ipi;
send_dirintf = pops->psm_send_ipi;
@@ -2077,6 +2056,9 @@ apix_intx_get_pending(int irqno)
return (pending);
}
+/*
+ * This function will mask the interrupt on the I/O APIC
+ */
static void
apix_intx_set_mask(int irqno)
{
@@ -2106,6 +2088,9 @@ apix_intx_set_mask(int irqno)
intr_restore(iflag);
}
+/*
+ * This function will clear the mask for the interrupt on the I/O APIC
+ */
static void
apix_intx_clear_mask(int irqno)
{
diff --git a/usr/src/uts/i86pc/io/apix/apix_intr.c b/usr/src/uts/i86pc/io/apix/apix_intr.c
index e5d072b525..d870a4d365 100644
--- a/usr/src/uts/i86pc/io/apix/apix_intr.c
+++ b/usr/src/uts/i86pc/io/apix/apix_intr.c
@@ -862,6 +862,9 @@ apix_dispatch_lowlevel(uint_t vector, uint_t oldipl)
apix_intr_thread_epilog(cpu, oldipl);
}
+/*
+ * Interrupt service routine, called with interrupts disabled.
+ */
void
apix_do_interrupt(struct regs *rp, trap_trace_rec_t *ttp)
{
@@ -904,7 +907,7 @@ apix_do_interrupt(struct regs *rp, trap_trace_rec_t *ttp)
}
/*
- * Raise the interrupt priority. Send EOI to local APIC
+ * Send EOI to local APIC
*/
newipl = (*setlvl)(oldipl, (int *)&rp->r_trapno);
#ifdef TRAPTRACE
diff --git a/usr/src/uts/i86pc/os/intr.c b/usr/src/uts/i86pc/os/intr.c
index 91d7afcf36..8acaf8fc73 100644
--- a/usr/src/uts/i86pc/os/intr.c
+++ b/usr/src/uts/i86pc/os/intr.c
@@ -21,6 +21,420 @@
/*
* Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+/*
+ * To understand the present state of interrupt handling on i86pc, we must
+ * first consider the history of interrupt controllers and our way of handling
+ * interrupts.
+ *
+ * History of Interrupt Controllers on i86pc
+ * -----------------------------------------
+ *
+ * Intel 8259 and 8259A
+ *
+ * The first interrupt controller that attained widespread use on i86pc was
+ * the Intel 8259(A) Programmable Interrupt Controller that first saw use with
+ * the 8086. It took up to 8 interrupt sources and combined them into one
+ * output wire. Up to 8 8259s could be slaved together providing up to 64 IRQs.
+ * With the switch to the 8259A, level mode interrupts became possible. For a
+ * long time on i86pc the 8259A was the only way to handle interrupts and it
+ * had its own set of quirks. The 8259A and its corresponding interval timer
+ * the 8254 are programmed using outb and inb instructions.
+ *
+ * Intel Advanced Programmable Interrupt Controller (APIC)
+ *
+ * Starting around the time of the introduction of the P6 family
+ * microarchitecture (i686) Intel introduced a new interrupt controller.
+ * Instead of having the series of slaved 8259A devices, Intel opted to outfit
+ * each processor with a Local APIC (lapic) and to outfit the system with at
+ * least one, but potentially more, I/O APICs (ioapic). The lapics and ioapics
+ * initially communicated over a dedicated bus, but this has since been
+ * replaced. Each physical core and even hyperthread currently contains its
+ * own local apic, which is not shared. There are a few exceptions for
+ * hyperthreads, but that does not usually concern us.
+ *
+ * Instead of talking directly to 8259 for status, sending End Of Interrupt
+ * (EOI), etc. a microprocessor now communicates directly to the lapic. This
+ * also allows for each microprocessor to be able to have independent controls.
+ * The programming method is different from the 8259. Consumers map the lapic
+ * registers into uncacheable memory to read and manipulate the state.
+ *
+ * The number of addressable interrupt vectors was increased to 256. However
+ * vectors 0-31 are reserved for the processor exception handling, leaving the
+ * remaining vectors for general use. In addition to hardware generated
+ * interrupts, the lapic provides a way for generating inter-processor
+ * interrupts (IPI) which are the basis for CPU cross calls and CPU pokes.
+ *
+ * AMD ended up implementing the Intel APIC architecture in lieu of their work
+ * with Cyrix.
+ *
+ * Intel x2apic
+ *
+ * The x2apic is an extension to the lapic which started showing up around the
+ * same time as the Sandy Bridge chipsets. It provides a new programming mode
+ * as well as new features. The goal of the x2apic is to solve a few problems
+ * with the previous generation of lapic and the x2apic is backwards compatible
+ * with the previous programming model. The only downside to using the
+ * backward compatibility is that you are not able to take advantage of the new
+ * x2apic features.
+ *
+ * o The APIC ID is increased from an 8-bit value to a 32-bit value. This
+ * increases the maximum number of addressable physical processors beyond
+ * 256. This new ID is assembled in a similar manner as the information that
+ * is obtainable by the extended cpuid topology leaves.
+ *
+ * o A new means of generating IPIs was introduced.
+ *
+ * o Instead of memory mapping the registers, the x2apic only allows for
+ * programming it through a series of wrmsrs. This has important semantic
+ * side effects. Recall that the registers were previously all mapped to
+ * uncacheable memory which meant that all operations to the local apic were
+ * serializing instructions. With the switch to using wrmsrs this has been
+ * relaxed and these operations can no longer be assumed to be serializing
+ * instructions.
+ *
+ * Note for the rest of this we are only going to concern ourselves with the
+ * apic and x2apic which practically all of i86pc has been using now for
+ * quite some time.
+ *
+ * Interrupt Priority Levels
+ * -------------------------
+ *
+ * On i86pc systems there are a total of fifteen interrupt priority levels
+ * (ipls) which range from 1-15. Level 0 is for normal processing and
+ * non-interrupt processing. To manipulate these values the family of spl
+ * functions (which date back to UNIX on the PDP-11) are used. Specifically,
+ * splr() to raise the priority level and splx() to lower it. One should not
+ * generally call setspl() directly.
+ *
+ * Both i86pc and the supported SPARC platforms honor the same conventions for
+ * the meaning behind these IPLs. The most important IPL is the platform's
+ * LOCK_LEVEL (0xa on i86pc). If a thread is above LOCK_LEVEL it _must_ not
+ * sleep on any synchronization object. The only allowed synchronization
+ * primitive is a mutex that has been specifically initialized to be a spin
+ * lock (see mutex_init(9F)). Another important level is DISP_LEVEL (0xb on
+ * i86pc). You must be at DISP_LEVEL if you want to control the dispatcher.
+ * The XC_HI_PIL is the highest level (0xf) and is used during cross-calls.
+ *
+ * Each interrupt that is registered in the system fires at a specific IPL.
+ * Generally most interrupts fire below LOCK_LEVEL.
+ *
+ * PSM Drivers
+ * -----------
+ *
+ * We currently have three sets of PSM drivers available. uppc, pcplusmp, and
+ * apix. uppc (uni-processor PC) is the original driver that interacts with the
+ * 8259A and 8254. In general, it is not used anymore given the prevalence of
+ * the apic.
+ *
+ * The system prefers to use the apix driver over the pcplusmp driver. The apix
+ * driver requires HW support for an x2apic. If there is no x2apic HW, apix
+ * will not be used. In general we prefer using the apix driver over the
+ * pcplusmp driver because it gives us much more flexibility with respect to
+ * interrupts. In the apix driver each local apic has its own independent set
+ * of interrupts, whereas the pcplusmp driver only has a single global set of
+ * interrupts. This is why pcplusmp only supports a finite number of interrupts
+ * per IPL -- generally 16, often less. The apix driver supports using either
+ * the x2apic or the local apic programming modes. The programming mode does not
+ * change the number of interrupts available, just the number of processors
+ * that we can address. For the apix driver, the x2apic mode is enabled if the
+ * system supports interrupt re-mapping, otherwise the module manages the
+ * x2apic in local mode.
+ *
+ * When there is no x2apic present, we default back to the pcplusmp PSM driver.
+ * In general, this is not problematic unless you have more than 256
+ * processors in the machine or you do not have enough interrupts available.
+ *
+ * Controlling Interrupt Generation on i86pc
+ * -----------------------------------------
+ *
+ * There are two different ways to manipulate which interrupts will be
+ * generated on i86pc. Each offers different degrees of control.
+ *
+ * The first is through the flags register (eflags and rflags on i386 and amd64
+ * respectively). The IF bit determines whether or not interrupts are enabled
+ * or disabled. This is manipulated in one of several ways. The most common way
+ * is through the cli and sti instructions. These clear the IF flag and set it,
+ * respectively, for the current processor. The other common way is through the
+ * use of the intr_clear and intr_restore functions.
+ *
+ * Assuming interrupts are not blocked by the IF flag, then the second form is
+ * through the Processor-Priority Register (PPR). The PPR is used to determine
+ * whether or not a pending interrupt should be delivered. If the ipl of the
+ * new interrupt is higher than the current value in the PPR, then the lapic
+ * will either deliver it immediately (if interrupts are not in progress) or it
+ * will deliver it once the current interrupt processing has issued an EOI. The
+ * highest unmasked interrupt will be the one delivered.
+ *
+ * The PPR register is based upon the max of the following two registers in the
+ * lapic: the TPR register (also known as CR8 on amd64) that can be used to
+ * mask interrupt levels, and the current vector. Because the pcplusmp module
+ * always sets TPR appropriately early in the do_interrupt path, we can usually
+ * just think that the PPR is the TPR. The pcplusmp module also issues an EOI
+ * once it has set the TPR, so higher priority interrupts can come in while
+ * we're servicing a lower priority interrupt.
+ *
+ * Handling Interrupts
+ * -------------------
+ *
+ * Interrupts can be broken down into three categories based on priority and
+ * source:
+ *
+ * o High level interrupts
+ * o Low level hardware interrupts
+ * o Low level software interrupts
+ *
+ * High Level Interrupts
+ *
+ * High level interrupts encompass both hardware-sourced and software-sourced
+ * interrupts. Examples of high level hardware interrupts include the serial
+ * console. High level software-sourced interrupts are still delivered through
+ * the local apic through IPIs. This is primarily cross calls.
+ *
+ * When a high level interrupt comes in, we will raise the SPL and then pin the
+ * current lwp to the processor. We will use its lwp, but our own interrupt
+ * stack and process the high level interrupt in-situ. These handlers are
+ * designed to be very short in nature and cannot go to sleep, only block on a
+ * spin lock. If the interrupt has a lot of work to do, it must generate a
+ * low-priority software interrupt that will be processed later.
+ *
+ * Low level hardware interrupts
+ *
+ * Low level hardware interrupts start off like their high-level cousins. The
+ * current CPU contains a number of kernel threads (kthread_t) that can be used
+ * to process low level interrupts. These are shared between both low level
+ * hardware and software interrupts. Note that while we run with our
+ * kthread_t, we borrow the pinned thread's lwp_t until such a time as we hit a
+ * synchronization object. If we hit one and need to sleep, then the scheduler
+ * will instead create the rest of what we need.
+ *
+ * Low level software interrupts
+ *
+ * Low level software interrupts are handled in a similar way as hardware
+ * interrupts, but the notification vector is different. Each CPU has a bitmask
+ * of pending software interrupts. We can notify a CPU to process software
+ * interrupts through a specific trap vector as well as through several
+ * checks that are performed throughout the code. These checks will look at
+ * processing software interrupts as we lower our spl.
+ *
+ * We attempt to process the highest pending software interrupt that we can
+ * which is greater than our current IPL. If none currently exist, then we move
+ * on. We process a software interrupt in a similar fashion to a hardware
+ * interrupt.
+ *
+ * Traditional Interrupt Flow
+ * --------------------------
+ *
+ * The following diagram tracks the flow of the traditional uppc and pcplusmp
+ * interrupt handlers. The apix driver has its own version of do_interrupt().
+ * We come into the interrupt handler with all interrupts masked by the IF
+ * flag. This is because we set up the handler using an interrupt-gate, which
+ * is defined architecturally to have cleared the IF flag for us.
+ *
+ * +--------------+ +----------------+ +-----------+
+ * | _interrupt() |--->| do_interrupt() |--->| *setlvl() |
+ * +--------------+ +----------------+ +-----------+
+ * | | |
+ * | | |
+ * low-level| | | softint
+ * HW int | | +---------------------------------------+
+ * +--------------+ | | |
+ * | intr_thread_ |<-----+ | hi-level int |
+ * | prolog() | | +----------+ |
+ * +--------------+ +--->| hilevel_ | Not on intr stack |
+ * | | intr_ |-----------------+ |
+ * | | prolog() | | |
+ * +------------+ +----------+ | |
+ * | switch_sp_ | | On intr v |
+ * | and_call() | | Stack +------------+ |
+ * +------------+ | | switch_sp_ | |
+ * | v | and_call() | |
+ * v +-----------+ +------------+ |
+ * +-----------+ | dispatch_ | | |
+ * | dispatch_ | +-------------------| hilevel() |<------------+ |
+ * | hardint() | | +-----------+ |
+ * +-----------+ | |
+ * | v |
+ * | +-----+ +----------------------+ +-----+ hi-level |
+ * +---->| sti |->| av_dispatch_autovect |->| cli |---------+ |
+ * +-----+ +----------------------+ +-----+ | |
+ * | | | |
+ * v | | |
+ * +----------+ | | |
+ * | for each | | | |
+ * | handler | | | |
+ * | *intr() | | v |
+ * +--------------+ +----------+ | +----------------+ |
+ * | intr_thread_ | low-level | | hilevel_intr_ | |
+ * | epilog() |<-------------------------------+ | epilog() | |
+ * +--------------+ +----------------+ |
+ * | | | |
+ * | +----------------------v v---------------------+ |
+ * | +------------+ |
+ * | +---------------------->| *setlvlx() | |
+ * | | +------------+ |
+ * | | | |
+ * | | v |
+ * | | +--------+ +------------------+ +-------------+ |
+ * | | | return |<----| softint pending? |----->| dosoftint() |<-----+
+ * | | +--------+ no +------------------+ yes +-------------+
+ * | | ^ | |
+ * | | | softint pil too low | |
+ * | | +--------------------------------------+ |
+ * | | v
+ * | | +-----------+ +------------+ +-----------+
+ * | | | dispatch_ |<-----| switch_sp_ |<---------| *setspl() |
+ * | | | softint() | | and_call() | +-----------+
+ * | | +-----------+ +------------+
+ * | | |
+ * | | v
+ * | | +-----+ +----------------------+ +-----+ +------------+
+ * | | | sti |->| av_dispatch_autovect |->| cli |->| dosoftint_ |
+ * | | +-----+ +----------------------+ +-----+ | epilog() |
+ * | | +------------+
+ * | | | |
+ * | +----------------------------------------------------+ |
+ * v |
+ * +-----------+ |
+ * | interrupt | |
+ * | thread |<---------------------------------------------------+
+ * | blocked |
+ * +-----------+
+ * |
+ * v
+ * +----------------+ +------------+ +-----------+ +-------+ +---------+
+ * | set_base_spl() |->| *setlvlx() |->| splhigh() |->| sti() |->| swtch() |
+ * +----------------+ +------------+ +-----------+ +-------+ +---------+
+ *
+ * Calls made on Interrupt Stacks and Epilogue routines
+ *
+ * We use the switch_sp_and_call() assembly routine to switch our sp to the
+ * interrupt stacks and then call the appropriate dispatch function. In the
+ * case of interrupts which may block, softints and hardints, we always ensure
+ * that we are still on the interrupt thread when we call the epilog routine.
+ * This is not just important, it's necessary. If the interrupt thread blocked,
+ * we won't return from our switch_sp_and_call() function and instead we'll go
+ * through and set ourselves up to swtch() directly.
+ *
+ * New Interrupt Flow
+ * ------------------
+ *
+ * The apix module has its own interrupt path. This is done for various
+ * reasons. The first is that rather than having global interrupt vectors, we
+ * now have per-cpu vectors.
+ *
+ * The other substantial change is that the apix design does not use the TPR to
+ * mask interrupts below the current level. In fact, except for one special
+ * case, it does not use the TPR at all. Instead, it only uses the IF flag
+ * (cli/sti) to either block all interrupts or allow any interrupts to come in.
+ * The design is such that when interrupts are allowed to come in, if we are
+ * currently servicing a higher priority interrupt, the new interrupt is treated
+ * as pending and serviced later. Specifically, in the pcplusmp module's
+ * apic_intr_enter function the code masks interrupts at or below the current
+ * IPL using the TPR before sending EOI, whereas the apix module's
+ * apix_intr_enter function simply sends EOI.
+ *
+ * The one special case where the apix code uses the TPR is when it calls
+ * through the apic_reg_ops function pointer apic_write_task_reg in
+ * apix_init_intr() to initially mask all levels and then finally to enable all
+ * levels.
+ *
+ * Recall that we come into the interrupt handler with all interrupts masked
+ * by the IF flag. This is because we set up the handler using an
+ * interrupt-gate which is defined architecturally to have cleared the IF flag
+ * for us.
+ *
+ * +--------------+ +---------------------+
+ * | _interrupt() |--->| apix_do_interrupt() |
+ * +--------------+ +---------------------+
+ * |
+ * hard int? +----+--------+ softint?
+ * | | (but no low-level looping)
+ * +-----------+ |
+ * | *setlvl() | |
+ * +---------+ +-----------+ +----------------------------------+
+ * |apix_add_| check IPL | |
+ * |pending_ |<-------------+------+----------------------+ |
+ * |hardint()| low-level int| hi-level int| |
+ * +---------+ v v |
+ * | check IPL +-----------------+ +---------------+ |
+ * +--+-----+ | apix_intr_ | | apix_hilevel_ | |
+ * | | | thread_prolog() | | intr_prolog() | |
+ * | return +-----------------+ +---------------+ |
+ * | | | On intr |
+ * | +------------+ | stack? +------------+ |
+ * | | switch_sp_ | +---------| switch_sp_ | |
+ * | | and_call() | | | and_call() | |
+ * | +------------+ | +------------+ |
+ * | | | | |
+ * | +----------------+ +----------------+ |
+ * | | apix_dispatch_ | | apix_dispatch_ | |
+ * | | lowlevel() | | hilevel() | |
+ * | +----------------+ +----------------+ |
+ * | | | |
+ * | v v |
+ * | +-------------------------+ |
+ * | |apix_dispatch_by_vector()|----+ |
+ * | +-------------------------+ | |
+ * | !XC_HI_PIL| | | | |
+ * | +---+ +-------+ +---+ | |
+ * | |sti| |*intr()| |cli| | |
+ * | +---+ +-------+ +---+ | hi-level? |
+ * | +---------------------------+----+ |
+ * | v low-level? v |
+ * | +----------------+ +----------------+ |
+ * | | apix_intr_ | | apix_hilevel_ | |
+ * | | thread_epilog()| | intr_epilog() | |
+ * | +----------------+ +----------------+ |
+ * | | | |
+ * | v-----------------+--------------------------------+ |
+ * | +------------+ |
+ * | | *setlvlx() | +----------------------------------------------------+
+ * | +------------+ |
+ * | | | +--------------------------------+ low
+ * v v v------+ v | level
+ * +------------------+ +------------------+ +-----------+ | pending?
+ * | apix_do_pending_ |----->| apix_do_pending_ |----->| apix_do_ |--+
+ * | hilevel() | | hardint() | | softint() | |
+ * +------------------+ +------------------+ +-----------+ return
+ * | | |
+ * | while pending | while pending | while pending
+ * | hi-level | low-level | softint
+ * | | |
+ * +---------------+ +-----------------+ +-----------------+
+ * | apix_hilevel_ | | apix_intr_ | | apix_do_ |
+ * | intr_prolog() | | thread_prolog() | | softint_prolog()|
+ * +---------------+ +-----------------+ +-----------------+
+ * | On intr | |
+ * | stack? +------------+ +------------+ +------------+
+ * +--------| switch_sp_ | | switch_sp_ | | switch_sp_ |
+ * | | and_call() | | and_call() | | and_call() |
+ * | +------------+ +------------+ +------------+
+ * | | | |
+ * +------------------+ +------------------+ +------------------------+
+ * | apix_dispatch_ | | apix_dispatch_ | | apix_dispatch_softint()|
+ * | pending_hilevel()| | pending_hardint()| +------------------------+
+ * +------------------+ +------------------+ | | | |
+ * | | | | | | | |
+ * | +----------------+ | +----------------+ | | | |
+ * | | apix_hilevel_ | | | apix_intr_ | | | | |
+ * | | intr_epilog() | | | thread_epilog()| | | | |
+ * | +----------------+ | +----------------+ | | | |
+ * | | | | | | | |
+ * | +------------+ | +----------+ +------+ | | |
+ * | | *setlvlx() | | |*setlvlx()| | | | |
+ * | +------------+ | +----------+ | +----------+ | +---------+
+ * | | +---+ |av_ | +---+ |apix_do_ |
+ * +---------------------------------+ |sti| |dispatch_ | |cli| |softint_ |
+ * | apix_dispatch_pending_autovect()| +---+ |softvect()| +---+ |epilog() |
+ * +---------------------------------+ +----------+ +---------+
+ * |!XC_HI_PIL | | | |
+ * +---+ +-------+ +---+ +----------+ +-------+
+ * |sti| |*intr()| |cli| |apix_post_| |*intr()|
+ * +---+ +-------+ +---+ |hardint() | +-------+
+ * +----------+
*/
#include <sys/cpuvar.h>