author     Patrick Mooney <pmooney@pfmooney.com>      2020-08-01 22:16:45 +0000
committer  Patrick Mooney <pmooney@oxide.computer>    2020-11-24 20:30:25 +0000
commit     c74a40a584c9d875009f725565896fd7e8ee38d6 (patch)
tree       90307bd32af113964bbdc8a157fa3b4974375d9f
parent     273d774d1d685415fd99d31224bdae55e7cfb793 (diff)
13007 bhyve vlapic should set TMR on intr accept
13106 clarify PPR transitions in bhyve vLAPIC
13132 VMX event injection can race in bhyve
13259 SVM event injection can race in bhyve
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Reviewed by: Toomas Soome <tsoome@me.com>
Approved by: Dan McDonald <danmcd@joyent.com>
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm.c          | 504
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/vmcb.h         |  12
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/vmx.c        | 720
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/vmx.h        |   2
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/vioapic.c       | 151
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/vlapic.c        | 200
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/vlapic.h        |   9
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h   |  20
-rw-r--r--  usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h   |  28
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm.c              |  50
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_lapic.c        |   5
-rw-r--r--  usr/src/uts/i86pc/sys/vmm.h                 |   1
12 files changed, 788 insertions, 914 deletions
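
The SVM and VMX changes below share one shape: a first pass (svm_inject_events()/vmx_inject_events()) runs while host interrupts are still enabled, since it may contend on mutexes; a second, lock-free pass (svm_inject_vlapic()/vmx_inject_vlapic()) runs after host interrupts are disabled; and svm_inject_recheck()/vmx_inject_recheck() closes the race window for notifications that arrived between the two, either by taking another lap or by arranging an immediate exit after VM entry. The standalone C sketch below models only that control flow. It is illustrative, not part of the patch: the enum values are assumed to match the flag usage visible in the diff, and the *_queued stubs are hypothetical stand-ins for the real VMCB/VMCS and vLAPIC queries.

/*
 * Minimal model of the two-phase injection loop introduced here (compare
 * with svm_vmrun()/vmx_run() in the diff).  Stub state stands in for the
 * real VMCB/VMCS and vLAPIC checks; enum values are assumptions.
 */
#include <stdbool.h>
#include <stdio.h>

enum event_inject_state {
	EIS_CAN_INJECT	= 0,		/* nothing queued or blocking */
	EIS_EV_EXISTING	= 1,		/* event already staged for entry */
	EIS_EV_INJECTED	= 2,		/* event staged during this pass */
	EIS_GI_BLOCK	= 3,		/* guest interruptability blocks it */
	EIS_REQ_EXIT	= (1 << 15),	/* flag: force exit right after entry */
};

static bool nmi_queued = true;		/* stub: pending NMI */
static bool vlapic_vector_queued = true; /* stub: pending vLAPIC interrupt */

/* Phase 1: may sleep on mutexes, so host interrupts stay enabled. */
static enum event_inject_state
inject_events(void)
{
	if (nmi_queued) {
		nmi_queued = false;
		return (EIS_EV_INJECTED);
	}
	return (EIS_CAN_INJECT);
}

/* Phase 2: lock-free vLAPIC sync/injection with host interrupts disabled. */
static enum event_inject_state
inject_vlapic(enum event_inject_state ev)
{
	if (ev != EIS_CAN_INJECT)
		return (ev);
	if (vlapic_vector_queued) {
		vlapic_vector_queued = false;
		return (EIS_EV_INJECTED);
	}
	return (EIS_CAN_INJECT);
}

/* Phase 3: catch events queued after phase 1 but before interrupts went off. */
static bool
inject_recheck(enum event_inject_state ev)
{
	if (ev == EIS_CAN_INJECT)
		return (nmi_queued);	/* another lap can inject it directly */
	if ((ev & EIS_REQ_EXIT) != 0) {
		/* the real code self-IPIs (poke_cpu) to exit right after entry */
	}
	return (false);
}

int
main(void)
{
	for (int lap = 0; ; lap++) {
		enum event_inject_state ev = inject_events();
		/* disable_gintr()/disable_intr() happens here */
		ev = inject_vlapic(ev);
		if (inject_recheck(ev)) {
			/* re-enable interrupts and take another lap */
			continue;
		}
		printf("lap %d: VM entry with inject state %#x\n", lap,
		    (unsigned)ev);
		break;
	}
	return (0);
}
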
diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c index dd9d5a55a8..8c12f4ba04 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm.c +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -903,67 +903,6 @@ svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit, vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa); } -#ifdef KTR -static const char * -intrtype_to_str(int intr_type) -{ - switch (intr_type) { - case VMCB_EVENTINJ_TYPE_INTR: - return ("hwintr"); - case VMCB_EVENTINJ_TYPE_NMI: - return ("nmi"); - case VMCB_EVENTINJ_TYPE_INTn: - return ("swintr"); - case VMCB_EVENTINJ_TYPE_EXCEPTION: - return ("exception"); - default: - panic("%s: unknown intr_type %d", __func__, intr_type); - } -} -#endif - -/* - * Inject an event to vcpu as described in section 15.20, "Event injection". - */ -static void -svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector, - uint32_t error, bool ec_valid) -{ - struct vmcb_ctrl *ctrl; - - ctrl = svm_get_vmcb_ctrl(sc, vcpu); - - KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, - ("%s: event already pending %lx", __func__, ctrl->eventinj)); - - KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d", - __func__, vector)); - - switch (intr_type) { - case VMCB_EVENTINJ_TYPE_INTR: - case VMCB_EVENTINJ_TYPE_NMI: - case VMCB_EVENTINJ_TYPE_INTn: - break; - case VMCB_EVENTINJ_TYPE_EXCEPTION: - if (vector >= 0 && vector <= 31 && vector != 2) - break; - /* FALLTHROUGH */ - default: - panic("%s: invalid intr_type/vector: %d/%d", __func__, - intr_type, vector); - } - ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID; - if (ec_valid) { - ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; - ctrl->eventinj |= (uint64_t)error << 32; - VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %x", - intrtype_to_str(intr_type), vector, error); - } else { - VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d", - intrtype_to_str(intr_type), vector); - } -} - static void svm_update_virqinfo(struct svm_softc *sc, int vcpu) { @@ -984,7 +923,7 @@ svm_update_virqinfo(struct svm_softc *sc, int vcpu) } static void -svm_save_intinfo(struct svm_softc *svm_sc, int vcpu) +svm_save_exitintinfo(struct svm_softc *svm_sc, int vcpu) { struct vmcb_ctrl *ctrl; uint64_t intinfo; @@ -1014,12 +953,14 @@ vintr_intercept_enabled(struct svm_softc *sc, int vcpu) VMCB_INTCPT_VINTR)); } -static __inline void -enable_intr_window_exiting(struct svm_softc *sc, int vcpu) +static void +svm_enable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; + struct vmcb_state *state; ctrl = svm_get_vmcb_ctrl(sc, vcpu); + state = svm_get_vmcb_state(sc, vcpu); if ((ctrl->v_irq & V_IRQ) != 0 && ctrl->v_intr_vector == 0) { KASSERT(ctrl->v_intr_prio & V_IGN_TPR, @@ -1029,6 +970,17 @@ enable_intr_window_exiting(struct svm_softc *sc, int vcpu) return; } + /* + * We use V_IRQ in conjunction with the VINTR intercept to trap into the + * hypervisor as soon as a virtual interrupt can be delivered. + * + * Since injected events are not subject to intercept checks we need to + * ensure that the V_IRQ is not actually going to be delivered on VM + * entry. 
+ */ + VERIFY((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || + (state->rflags & PSL_I) == 0 || ctrl->intr_shadow); + VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting"); ctrl->v_irq |= V_IRQ; ctrl->v_intr_prio |= V_IGN_TPR; @@ -1037,8 +989,8 @@ enable_intr_window_exiting(struct svm_softc *sc, int vcpu) svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); } -static __inline void -disable_intr_window_exiting(struct svm_softc *sc, int vcpu) +static void +svm_disable_intr_window_exiting(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; @@ -1063,30 +1015,18 @@ disable_intr_window_exiting(struct svm_softc *sc, int vcpu) * to track when the vcpu is done handling the NMI. */ static int -nmi_blocked(struct svm_softc *sc, int vcpu) +svm_nmi_blocked(struct svm_softc *sc, int vcpu) { - int blocked; - - blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, - VMCB_INTCPT_IRET); - return (blocked); -} - -static void -enable_nmi_blocking(struct svm_softc *sc, int vcpu) -{ - - KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked")); - VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled"); - svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); + return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_IRET)); } static void -clear_nmi_blocking(struct svm_softc *sc, int vcpu) +svm_clear_nmi_blocking(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; - KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked")); + KASSERT(svm_nmi_blocked(sc, vcpu), ("vNMI already unblocked")); VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared"); /* * When the IRET intercept is cleared the vcpu will attempt to execute @@ -1102,13 +1042,80 @@ clear_nmi_blocking(struct svm_softc *sc, int vcpu) svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); /* - * Set 'intr_shadow' to prevent an NMI from being injected on the - * immediate VMRUN. + * Set an interrupt shadow to prevent an NMI from being immediately + * injected on the next VMRUN. */ ctrl = svm_get_vmcb_ctrl(sc, vcpu); ctrl->intr_shadow = 1; } +static void +svm_inject_event(struct svm_softc *sc, int vcpu, uint64_t intinfo) +{ + struct vmcb_ctrl *ctrl; + uint8_t vector; + uint32_t evtype; + + ASSERT(VMCB_EXITINTINFO_VALID(intinfo)); + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + vector = VMCB_EXITINTINFO_VECTOR(intinfo); + evtype = VMCB_EXITINTINFO_TYPE(intinfo); + + switch (evtype) { + case VMCB_EVENTINJ_TYPE_INTR: + case VMCB_EVENTINJ_TYPE_NMI: + case VMCB_EVENTINJ_TYPE_INTn: + break; + case VMCB_EVENTINJ_TYPE_EXCEPTION: + VERIFY(vector <= 31); + /* + * NMIs are expected to be injected with VMCB_EVENTINJ_TYPE_NMI, + * rather than as an exception with the NMI vector. + */ + VERIFY(vector != 2); + break; + default: + panic("unexpected event type %x", evtype); + } + + ctrl->eventinj = VMCB_EVENTINJ_VALID | evtype | vector; + if (VMCB_EXITINTINFO_EC_VALID(intinfo)) { + ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; + ctrl->eventinj |= (uint64_t)VMCB_EXITINTINFO_EC(intinfo) << 32; + } +} + +static void +svm_inject_nmi(struct svm_softc *sc, int vcpu) +{ + struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + ASSERT(!svm_nmi_blocked(sc, vcpu)); + + ctrl->eventinj = VMCB_EVENTINJ_VALID | VMCB_EVENTINJ_TYPE_NMI; + vm_nmi_clear(sc->vm, vcpu); + + /* + * Virtual NMI blocking is now in effect. + * + * Not only does this block a subsequent NMI injection from taking + * place, it also configures an intercept on the IRET so we can track + * when the next injection can take place. 
+ */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); +} + +static void +svm_inject_irq(struct svm_softc *sc, int vcpu, int vector) +{ + struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + ASSERT(vector >= 0 && vector <= 255); + + ctrl->eventinj = VMCB_EVENTINJ_VALID | vector; +} + #define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL static int @@ -1335,7 +1342,7 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) vmexit->inst_length, code, info1, info2)); svm_update_virqinfo(svm_sc, vcpu); - svm_save_intinfo(svm_sc, vcpu); + svm_save_exitintinfo(svm_sc, vcpu); switch (code) { case VMCB_EXIT_IRET: @@ -1343,11 +1350,12 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) * Restart execution at "iret" but with the intercept cleared. */ vmexit->inst_length = 0; - clear_nmi_blocking(svm_sc, vcpu); + svm_clear_nmi_blocking(svm_sc, vcpu); handled = 1; break; case VMCB_EXIT_VINTR: /* interrupt window exiting */ vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); + svm_disable_intr_window_exiting(svm_sc, vcpu); handled = 1; break; case VMCB_EXIT_INTR: /* external interrupt */ @@ -1571,51 +1579,40 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) return (handled); } -static void -svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu) -{ - uint64_t intinfo; - - if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) - return; - - KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " - "valid: %lx", __func__, intinfo)); - - svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo), - VMCB_EXITINTINFO_VECTOR(intinfo), - VMCB_EXITINTINFO_EC(intinfo), - VMCB_EXITINTINFO_EC_VALID(intinfo)); - vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); - VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %lx", intinfo); -} - /* - * Inject event to virtual cpu. + * Inject exceptions, NMIs, and ExtINTs. + * + * The logic behind these are complicated and may involve mutex contention, so + * the injection is performed without the protection of host CPU interrupts + * being disabled. This means a racing notification could be "lost", + * necessitating a later call to svm_inject_recheck() to close that window + * of opportunity. */ -static void -svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) +static enum event_inject_state +svm_inject_events(struct svm_softc *sc, int vcpu) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; struct svm_vcpu *vcpustate; - uint8_t v_tpr; - int vector, need_intr_window; - int extint_pending; + uint64_t intinfo; + enum event_inject_state ev_state; state = svm_get_vmcb_state(sc, vcpu); ctrl = svm_get_vmcb_ctrl(sc, vcpu); vcpustate = svm_get_vcpu(sc, vcpu); + ev_state = EIS_CAN_INJECT; - need_intr_window = 0; - - vlapic_tmr_update(vlapic); - + /* Clear any interrupt shadow if guest %rip has changed */ if (vcpustate->nextrip != state->rip) { ctrl->intr_shadow = 0; - VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " - "cleared due to rip change: %lx/%lx", - vcpustate->nextrip, state->rip); + } + + /* + * An event is already pending for injection. This can occur when the + * vCPU exits prior to VM entry (like for an AST). + */ + if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + return (EIS_EV_EXISTING | EIS_REQ_EXIT); } /* @@ -1627,118 +1624,79 @@ svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) * An event might also be pending because an exception was injected * by the hypervisor (e.g. #PF during instruction emulation). 
*/ - svm_inj_intinfo(sc, vcpu); + if (vm_entry_intinfo(sc->vm, vcpu, &intinfo)) { + ASSERT(VMCB_EXITINTINFO_VALID(intinfo)); + + svm_inject_event(sc, vcpu, intinfo); + vmm_stat_incr(sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); + ev_state = EIS_EV_INJECTED; + } /* NMI event has priority over interrupts. */ - if (vm_nmi_pending(sc->vm, vcpu)) { - if (nmi_blocked(sc, vcpu)) { - /* - * Can't inject another NMI if the guest has not - * yet executed an "iret" after the last NMI. - */ - VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " - "to NMI-blocking"); - } else if (ctrl->intr_shadow) { - /* - * Can't inject an NMI if the vcpu is in an intr_shadow. - */ - VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to " - "interrupt shadow"); - need_intr_window = 1; - goto done; - } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { - /* - * If there is already an exception/interrupt pending - * then defer the NMI until after that. - */ - VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " - "eventinj %lx", ctrl->eventinj); + if (vm_nmi_pending(sc->vm, vcpu) && !svm_nmi_blocked(sc, vcpu)) { + if (ev_state == EIS_CAN_INJECT) { + /* Can't inject NMI if vcpu is in an intr_shadow. */ + if (ctrl->intr_shadow) { + return (EIS_GI_BLOCK); + } - /* - * Use self-IPI to trigger a VM-exit as soon as - * possible after the event injection is completed. - * - * This works only if the external interrupt exiting - * is at a lower priority than the event injection. - * - * Although not explicitly specified in APMv2 the - * relative priorities were verified empirically. - */ - ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? */ + svm_inject_nmi(sc, vcpu); + ev_state = EIS_EV_INJECTED; } else { - vm_nmi_clear(sc->vm, vcpu); + return (ev_state | EIS_REQ_EXIT); + } + } - /* Inject NMI, vector number is not used */ - svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI, - IDT_NMI, 0, false); + if (vm_extint_pending(sc->vm, vcpu)) { + int vector; - /* virtual NMI blocking is now in effect */ - enable_nmi_blocking(sc, vcpu); + if (ev_state != EIS_CAN_INJECT) { + return (ev_state | EIS_REQ_EXIT); + } - VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI"); + /* + * If the guest has disabled interrupts or is in an interrupt + * shadow then we cannot inject the pending interrupt. + */ + if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) { + return (EIS_GI_BLOCK); } - } - extint_pending = vm_extint_pending(sc->vm, vcpu); - if (!extint_pending) { - if (!vlapic_pending_intr(vlapic, &vector)) - goto done; - KASSERT(vector >= 16 && vector <= 255, - ("invalid vector %d from local APIC", vector)); - } else { /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(sc->vm, &vector); KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); - } - /* - * If the guest has disabled interrupts or is in an interrupt shadow - * then we cannot inject the pending interrupt. 
- */ - if ((state->rflags & PSL_I) == 0) { - VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " - "rflags %lx", vector, state->rflags); - need_intr_window = 1; - goto done; - } - - if (ctrl->intr_shadow) { - VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to " - "interrupt shadow", vector); - need_intr_window = 1; - goto done; + svm_inject_irq(sc, vcpu, vector); + vm_extint_clear(sc->vm, vcpu); + vatpic_intr_accepted(sc->vm, vector); + ev_state = EIS_EV_INJECTED; } - if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { - VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " - "eventinj %lx", vector, ctrl->eventinj); - need_intr_window = 1; - goto done; - } + return (ev_state); +} - svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); +/* + * Synchronize vLAPIC state and inject any interrupts pending on it. + * + * This is done with host CPU interrupts disabled so notification IPIs will be + * queued on the host APIC and recognized when entering SVM guest context. + */ +static enum event_inject_state +svm_inject_vlapic(struct svm_softc *sc, int vcpu, struct vlapic *vlapic, + enum event_inject_state ev_state) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + int vector; + uint8_t v_tpr; - if (!extint_pending) { - vlapic_intr_accepted(vlapic, vector); - } else { - vm_extint_clear(sc->vm, vcpu); - vatpic_intr_accepted(sc->vm, vector); - } + state = svm_get_vmcb_state(sc, vcpu); + ctrl = svm_get_vmcb_ctrl(sc, vcpu); /* - * Force a VM-exit as soon as the vcpu is ready to accept another - * interrupt. This is done because the PIC might have another vector - * that it wants to inject. Also, if the APIC has a pending interrupt - * that was preempted by the ExtInt then it allows us to inject the - * APIC vector as soon as possible. - */ - need_intr_window = 1; -done: - /* - * The guest can modify the TPR by writing to %CR8. In guest mode - * the processor reflects this write to V_TPR without hypervisor - * intervention. + * The guest can modify the TPR by writing to %cr8. In guest mode the + * CPU reflects this write to V_TPR without hypervisor intervention. * * The guest can also modify the TPR by writing to it via the memory * mapped APIC page. In this case, the write will be emulated by the @@ -1748,33 +1706,88 @@ done: v_tpr = vlapic_get_cr8(vlapic); KASSERT(v_tpr <= 15, ("invalid v_tpr %x", v_tpr)); if (ctrl->v_tpr != v_tpr) { - VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %x to %x", - ctrl->v_tpr, v_tpr); ctrl->v_tpr = v_tpr; svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); } - if (need_intr_window) { + /* If an event cannot otherwise be injected, we are done for now */ + if (ev_state != EIS_CAN_INJECT) { + return (ev_state); + } + + if (!vlapic_pending_intr(vlapic, &vector)) { + return (EIS_CAN_INJECT); + } + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + + /* + * If the guest has disabled interrupts or is in an interrupt shadow + * then we cannot inject the pending interrupt. + */ + if ((state->rflags & PSL_I) == 0 || ctrl->intr_shadow) { + return (EIS_GI_BLOCK); + } + + svm_inject_irq(sc, vcpu, vector); + vlapic_intr_accepted(vlapic, vector); + return (EIS_EV_INJECTED); +} + +/* + * Re-check for events to be injected. + * + * Once host CPU interrupts are disabled, check for the presence of any events + * which require injection processing. If an exit is required upon injection, + * or once the guest becomes interruptable, that will be configured too. 
+ */ +static bool +svm_inject_recheck(struct svm_softc *sc, int vcpu, + enum event_inject_state ev_state) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + if (ev_state == EIS_CAN_INJECT) { /* - * We use V_IRQ in conjunction with the VINTR intercept to - * trap into the hypervisor as soon as a virtual interrupt - * can be delivered. - * - * Since injected events are not subject to intercept checks - * we need to ensure that the V_IRQ is not actually going to - * be delivered on VM entry. The KASSERT below enforces this. + * An active interrupt shadow would preclude us from injecting + * any events picked up during a re-check. */ - KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || - (state->rflags & PSL_I) == 0 || ctrl->intr_shadow, - ("Bogus intr_window_exiting: eventinj (%lx), " - "intr_shadow (%lu), rflags (%lx)", - ctrl->eventinj, ctrl->intr_shadow, state->rflags)); - enable_intr_window_exiting(sc, vcpu); + if (ctrl->intr_shadow != 0) { + return (false); + } + + if (vm_nmi_pending(sc->vm, vcpu) && + !svm_nmi_blocked(sc, vcpu)) { + /* queued NMI not blocked by NMI-window-exiting */ + return (true); + } + if (vm_extint_pending(sc->vm, vcpu)) { + /* queued ExtINT not blocked by existing injection */ + return (true); + } } else { - disable_intr_window_exiting(sc, vcpu); + if ((ev_state & EIS_REQ_EXIT) != 0) { + /* + * Use a self-IPI to force an immediate exit after + * event injection has occurred. + */ + poke_cpu(CPU->cpu_id); + } else { + /* + * If any event is being injected, an exit immediately + * upon becoming interruptable again will allow pending + * or newly queued events to be injected in a timely + * manner. + */ + svm_enable_intr_window_exiting(sc, vcpu); + } } + return (false); } + #ifdef __FreeBSD__ static void check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) @@ -2039,15 +2052,15 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap, state->rip = rip; do { -#ifndef __FreeBSD__ + enum event_inject_state inject_state; + /* - * Interrupt injection may involve mutex contention which, on - * illumos bhyve, are blocking/non-spin. Doing so with global - * interrupts disabled is a recipe for deadlock, so it is - * performed here. + * Initial event injection is complex and may involve mutex + * contention, so it must be performed with global interrupts + * still enabled. */ - svm_inj_interrupts(svm_sc, vcpu, vlapic); -#endif + inject_state = svm_inject_events(svm_sc, vcpu); + handled = 0; /* * Disable global interrupts to guarantee atomicity during @@ -2058,6 +2071,13 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap, */ disable_gintr(); + /* + * Synchronizing and injecting vlapic state is lock-free and is + * safe (and prudent) to perform with interrupts disabled. + */ + inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic, + inject_state); + if (vcpu_suspended(evinfo)) { enable_gintr(); vm_exit_suspended(vm, vcpu, state->rip); @@ -2090,6 +2110,16 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap, } /* + * If subsequent activity queued events which require injection + * handling, take another lap to handle them. + */ + if (svm_inject_recheck(svm_sc, vcpu, inject_state)) { + enable_gintr(); + handled = 1; + continue; + } + + /* * #VMEXIT resumes the host with the guest LDTR, so * save the current LDT selector so it can be restored * after an exit. 
The userspace hypervisor probably @@ -2098,10 +2128,6 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap, */ ldt_sel = sldt(); -#ifdef __FreeBSD__ - svm_inj_interrupts(svm_sc, vcpu, vlapic); -#endif - /* Activate the nested pmap on 'curcpu' */ CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active); diff --git a/usr/src/uts/i86pc/io/vmm/amd/vmcb.h b/usr/src/uts/i86pc/io/vmm/amd/vmcb.h index 63b088253d..1c002aee7b 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/vmcb.h +++ b/usr/src/uts/i86pc/io/vmm/amd/vmcb.h @@ -136,9 +136,9 @@ struct svm_softc; /* Event types that can be injected */ #define VMCB_EVENTINJ_TYPE_INTR 0 -#define VMCB_EVENTINJ_TYPE_NMI 2 -#define VMCB_EVENTINJ_TYPE_EXCEPTION 3 -#define VMCB_EVENTINJ_TYPE_INTn 4 +#define VMCB_EVENTINJ_TYPE_NMI (2 << 8) +#define VMCB_EVENTINJ_TYPE_EXCEPTION (3 << 8) +#define VMCB_EVENTINJ_TYPE_INTn (4 << 8) /* VMCB exit code, APM vol2 Appendix C */ #define VMCB_EXIT_MC 0x52 @@ -187,9 +187,9 @@ struct svm_softc; * Section 15.7.2, Intercepts during IDT Interrupt Delivery. */ #define VMCB_EXITINTINFO_VECTOR(x) ((x) & 0xFF) -#define VMCB_EXITINTINFO_TYPE(x) (((x) >> 8) & 0x7) -#define VMCB_EXITINTINFO_EC_VALID(x) (((x) & BIT(11)) ? 1 : 0) -#define VMCB_EXITINTINFO_VALID(x) (((x) & BIT(31)) ? 1 : 0) +#define VMCB_EXITINTINFO_TYPE(x) ((x) & (0x7 << 8)) +#define VMCB_EXITINTINFO_EC_VALID(x) (((x) & BIT(11)) != 0) +#define VMCB_EXITINTINFO_VALID(x) (((x) & BIT(31)) != 0) #define VMCB_EXITINTINFO_EC(x) (((x) >> 32) & 0xFFFFFFFF) /* Offset of various VMCB fields. */ diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c index edd5a64443..8302beff1f 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -338,8 +338,10 @@ SDT_PROBE_DEFINE4(vmm, vmx, exit, return, static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); -static void vmx_inject_pir(struct vlapic *vlapic); static void vmx_apply_tsc_adjust(struct vmx *, int); +static void vmx_apicv_sync_tmr(struct vlapic *vlapic); +static void vmx_tpr_shadow_enter(struct vlapic *vlapic); +static void vmx_tpr_shadow_exit(struct vlapic *vlapic); #ifdef KTR static const char * @@ -1270,26 +1272,27 @@ vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); } +static __inline bool +vmx_nmi_window_exiting(struct vmx *vmx, int vcpu) +{ + return ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0); +} + static __inline void vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) { - - if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) { + if (!vmx_nmi_window_exiting(vmx, vcpu)) { vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); - VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting"); } } static __inline void vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) { - - KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, - ("nmi_window_exiting not set %x", vmx->cap[vcpu].proc_ctls)); + ASSERT(vmx_nmi_window_exiting(vmx, vcpu)); vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); - VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); } /* @@ -1319,60 +1322,46 @@ vmx_apply_tsc_adjust(struct vmx *vmx, int vcpu) #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) 
-#ifndef __FreeBSD__ -static uint32_t -vmx_inject_nmi(struct vmx *vmx, int vcpu) -#else static void vmx_inject_nmi(struct vmx *vmx, int vcpu) -#endif { - uint32_t gi, info; - - gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); - KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " - "interruptibility-state %x", gi)); - - info = vmcs_read(VMCS_ENTRY_INTR_INFO); - KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " - "VM-entry interruption information %x", info)); + ASSERT0(vmcs_read(VMCS_GUEST_INTERRUPTIBILITY) & NMI_BLOCKING); + ASSERT0(vmcs_read(VMCS_ENTRY_INTR_INFO) & VMCS_INTR_VALID); /* * Inject the virtual NMI. The vector must be the NMI IDT entry * or the VMCS entry check will fail. */ - info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID; - vmcs_write(VMCS_ENTRY_INTR_INFO, info); - - VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI"); + vmcs_write(VMCS_ENTRY_INTR_INFO, + IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID); /* Clear the request */ vm_nmi_clear(vmx->vm, vcpu); - -#ifndef __FreeBSD__ - return (info); -#endif } -static void -vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, - uint64_t guestrip) +/* + * Inject exceptions, NMIs, and ExtINTs. + * + * The logic behind these are complicated and may involve mutex contention, so + * the injection is performed without the protection of host CPU interrupts + * being disabled. This means a racing notification could be "lost", + * necessitating a later call to vmx_inject_recheck() to close that window + * of opportunity. + */ +static enum event_inject_state +vmx_inject_events(struct vmx *vmx, int vcpu, uint64_t rip) { - uint64_t entryinfo, rflags; + uint64_t entryinfo; uint32_t gi, info; int vector; - boolean_t extint_pending = B_FALSE; - - vlapic_tmr_update(vlapic); + enum event_inject_state state; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); info = vmcs_read(VMCS_ENTRY_INTR_INFO); + state = EIS_CAN_INJECT; - if (vmx->state[vcpu].nextrip != guestrip && - (gi & HWINTR_BLOCKING) != 0) { - VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " - "cleared due to rip change: %lx/%lx", - vmx->state[vcpu].nextrip, guestrip); + /* Clear any interrupt blocking if the guest %rip has changed */ + if (vmx->state[vcpu].nextrip != rip && (gi & HWINTR_BLOCKING) != 0) { gi &= ~HWINTR_BLOCKING; vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } @@ -1383,15 +1372,11 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, * such as an AST before a vm-entry delivered the injection. */ if ((info & VMCS_INTR_VALID) != 0) { - goto cantinject; + return (EIS_EV_EXISTING | EIS_REQ_EXIT); } if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { - KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " - "intinfo is not valid: %lx", __func__, entryinfo)); - - KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " - "pending exception: %lx/%x", __func__, entryinfo, info)); + ASSERT(entryinfo & VMCS_INTR_VALID); info = entryinfo; vector = info & 0xff; @@ -1404,50 +1389,49 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, info |= VMCS_INTR_T_SWEXCEPTION; } - if (info & VMCS_INTR_DEL_ERRCODE) + if (info & VMCS_INTR_DEL_ERRCODE) { vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); + } vmcs_write(VMCS_ENTRY_INTR_INFO, info); + state = EIS_EV_INJECTED; } if (vm_nmi_pending(vmx->vm, vcpu)) { - int need_nmi_exiting = 1; - /* - * If there are no conditions blocking NMI injection then - * inject it directly here otherwise enable "NMI window - * exiting" to inject it as soon as we can. 
+ * If there are no conditions blocking NMI injection then inject + * it directly here otherwise enable "NMI window exiting" to + * inject it as soon as we can. * - * We also check for STI_BLOCKING because some implementations - * don't allow NMI injection in this case. If we are running - * on a processor that doesn't have this restriction it will - * immediately exit and the NMI will be injected in the - * "NMI window exiting" handler. + * According to the Intel manual, some CPUs do not allow NMI + * injection when STI_BLOCKING is active. That check is + * enforced here, regardless of CPU capability. If running on a + * CPU without such a restriction it will immediately exit and + * the NMI will be injected in the "NMI window exiting" handler. */ if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { - if ((info & VMCS_INTR_VALID) == 0) { - info = vmx_inject_nmi(vmx, vcpu); - need_nmi_exiting = 0; + if (state == EIS_CAN_INJECT) { + vmx_inject_nmi(vmx, vcpu); + state = EIS_EV_INJECTED; } else { - VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " - "due to VM-entry intr info %x", info); + return (state | EIS_REQ_EXIT); } } else { - VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " - "Guest Interruptibility-state %x", gi); - } - - if (need_nmi_exiting) { vmx_set_nmi_window_exiting(vmx, vcpu); - return; } } - /* Check the AT-PIC and APIC for interrupts. */ if (vm_extint_pending(vmx->vm, vcpu)) { + if (state != EIS_CAN_INJECT) { + return (state | EIS_REQ_EXIT); + } + if ((gi & HWINTR_BLOCKING) != 0 || + (vmcs_read(VMCS_GUEST_RFLAGS) & PSL_I) == 0) { + return (EIS_GI_BLOCK); + } + /* Ask the legacy pic for a vector to inject */ vatpic_pending_intr(vmx->vm, &vector); - extint_pending = B_TRUE; /* * From the Intel SDM, Volume 3, Section "Maskable @@ -1457,80 +1441,131 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, */ KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d from INTR", vector)); - } else if (!vmx_cap_en(vmx, VMX_CAP_APICV)) { - /* Ask the local apic for a vector to inject */ - if (!vlapic_pending_intr(vlapic, &vector)) - return; - /* - * From the Intel SDM, Volume 3, Section "Maskable - * Hardware Interrupts": - * - maskable interrupt vectors [16,255] can be delivered - * through the local APIC. - */ - KASSERT(vector >= 16 && vector <= 255, - ("invalid vector %d from local APIC", vector)); - } else { - /* No futher injection needed */ - return; - } + /* Inject the interrupt */ + vmcs_write(VMCS_ENTRY_INTR_INFO, + VMCS_INTR_T_HWINTR | VMCS_INTR_VALID | vector); - /* - * Verify that the guest is interruptable and the above logic has not - * already queued an event for injection. - */ - if ((gi & HWINTR_BLOCKING) != 0) { - VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " - "Guest Interruptibility-state %x", vector, gi); - goto cantinject; - } - if ((info & VMCS_INTR_VALID) != 0) { - VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " - "VM-entry intr info %x", vector, info); - goto cantinject; + vm_extint_clear(vmx->vm, vcpu); + vatpic_intr_accepted(vmx->vm, vector); + state = EIS_EV_INJECTED; } - rflags = vmcs_read(VMCS_GUEST_RFLAGS); - if ((rflags & PSL_I) == 0) { - VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " - "rflags %lx", vector, rflags); - goto cantinject; + + return (state); +} + +/* + * Inject any interrupts pending on the vLAPIC. 
+ * + * This is done with host CPU interrupts disabled so notification IPIs, either + * from the standard vCPU notification or APICv posted interrupts, will be + * queued on the host APIC and recognized when entering VMX context. + */ +static enum event_inject_state +vmx_inject_vlapic(struct vmx *vmx, int vcpu, struct vlapic *vlapic) +{ + int vector; + + if (!vlapic_pending_intr(vlapic, &vector)) { + return (EIS_CAN_INJECT); } - /* Inject the interrupt */ - info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; - info |= vector; - vmcs_write(VMCS_ENTRY_INTR_INFO, info); + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [16,255] can be delivered + * through the local APIC. + */ + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); - if (extint_pending) { - vm_extint_clear(vmx->vm, vcpu); - vatpic_intr_accepted(vmx->vm, vector); + if (vmx_cap_en(vmx, VMX_CAP_APICV)) { + uint16_t status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); + uint16_t status_new = (status_old & 0xff00) | vector; /* - * After we accepted the current ExtINT the PIC may - * have posted another one. If that is the case, set - * the Interrupt Window Exiting execution control so - * we can inject that one too. - * - * Also, interrupt window exiting allows us to inject any - * pending APIC vector that was preempted by the ExtINT - * as soon as possible. This applies both for the software - * emulated vlapic and the hardware assisted virtual APIC. + * The APICv state will have been synced into the vLAPIC + * as part of vlapic_pending_intr(). Prepare the VMCS + * for the to-be-injected pending interrupt. */ - vmx_set_int_window_exiting(vmx, vcpu); - } else { - /* Update the Local APIC ISR */ - vlapic_intr_accepted(vlapic, vector); + if (status_new > status_old) { + vmcs_write(VMCS_GUEST_INTR_STATUS, status_new); + VCPU_CTR2(vlapic->vm, vlapic->vcpuid, + "vmx_inject_interrupts: guest_intr_status " + "changed from 0x%04x to 0x%04x", + status_old, status_new); + } + + /* + * Ensure VMCS state regarding EOI traps is kept in sync + * with the TMRs in the vlapic. + */ + vmx_apicv_sync_tmr(vlapic); + + /* + * The rest of the injection process for injecting the + * interrupt(s) is handled by APICv. It does not preclude other + * event injection from occurring. + */ + return (EIS_CAN_INJECT); } - VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); - return; + ASSERT0(vmcs_read(VMCS_ENTRY_INTR_INFO) & VMCS_INTR_VALID); -cantinject: - /* - * Set the Interrupt Window Exiting execution control so we can inject - * the interrupt as soon as blocking condition goes away. - */ - vmx_set_int_window_exiting(vmx, vcpu); + /* Does guest interruptability block injection? */ + if ((vmcs_read(VMCS_GUEST_INTERRUPTIBILITY) & HWINTR_BLOCKING) != 0 || + (vmcs_read(VMCS_GUEST_RFLAGS) & PSL_I) == 0) { + return (EIS_GI_BLOCK); + } + + /* Inject the interrupt */ + vmcs_write(VMCS_ENTRY_INTR_INFO, + VMCS_INTR_T_HWINTR | VMCS_INTR_VALID | vector); + + /* Update the Local APIC ISR */ + vlapic_intr_accepted(vlapic, vector); + + return (EIS_EV_INJECTED); +} + +/* + * Re-check for events to be injected. + * + * Once host CPU interrupts are disabled, check for the presence of any events + * which require injection processing. If an exit is required upon injection, + * or once the guest becomes interruptable, that will be configured too. 
+ */ +static bool +vmx_inject_recheck(struct vmx *vmx, int vcpu, enum event_inject_state state) +{ + if (state == EIS_CAN_INJECT) { + if (vm_nmi_pending(vmx->vm, vcpu) && + !vmx_nmi_window_exiting(vmx, vcpu)) { + /* queued NMI not blocked by NMI-window-exiting */ + return (true); + } + if (vm_extint_pending(vmx->vm, vcpu)) { + /* queued ExtINT not blocked by existing injection */ + return (true); + } + } else { + if ((state & EIS_REQ_EXIT) != 0) { + /* + * Use a self-IPI to force an immediate exit after + * event injection has occurred. + */ + poke_cpu(CPU->cpu_id); + } else { + /* + * If any event is being injected, an exit immediately + * upon becoming interruptable again will allow pending + * or newly queued events to be injected in a timely + * manner. + */ + vmx_set_int_window_exiting(vmx, vcpu); + } + } + return (false); } /* @@ -2437,12 +2472,6 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); - if (vmx_cap_en(vmx, VMX_CAP_APICV)) { - vmexit->u.hlt.intr_status = - vmcs_read(VMCS_GUEST_INTR_STATUS); - } else { - vmexit->u.hlt.intr_status = 0; - } break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); @@ -2871,6 +2900,7 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap, struct region_descriptor gdtr, idtr; uint16_t ldt_sel; #endif + bool tpr_shadow_active; vmx = arg; vm = vmx->vm; @@ -2879,6 +2909,9 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap, vlapic = vm_lapic(vm, vcpu); vmexit = vm_exitinfo(vm, vcpu); launched = 0; + tpr_shadow_active = vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW) && + !vmx_cap_en(vmx, VMX_CAP_APICV) && + (vmx->cap[vcpu].proc_ctls & PROCBASED_USE_TPR_SHADOW) != 0; KASSERT(vmxctx->pmap == pmap, ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); @@ -2905,10 +2938,19 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap, vmcs_write(VMCS_GUEST_RIP, rip); vmx_set_pcpu_defaults(vmx, vcpu, pmap); do { + enum event_inject_state inject_state; + KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " "%lx/%lx", __func__, vmcs_guest_rip(), rip)); handled = UNHANDLED; + + /* + * Perform initial event/exception/interrupt injection before + * host CPU interrupts are disabled. + */ + inject_state = vmx_inject_events(vmx, vcpu, rip); + /* * Interrupts are disabled from this point on until the * guest starts executing. This is done for the following @@ -2919,27 +2961,28 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap, * will cause a VM exit due to external interrupt as soon as * the guest state is loaded. * - * A posted interrupt after 'vmx_inject_interrupts()' will - * not be "lost" because it will be held pending in the host - * APIC because interrupts are disabled. The pending interrupt - * will be recognized as soon as the guest state is loaded. + * A posted interrupt after vmx_inject_vlapic() will not be + * "lost" because it will be held pending in the host APIC + * because interrupts are disabled. The pending interrupt will + * be recognized as soon as the guest state is loaded. * * The same reasoning applies to the IPI generated by * pmap_invalidate_ept(). - * - * The bulk of guest interrupt injection is done without - * interrupts disabled on the host CPU. This is necessary - * since contended mutexes might force the thread to sleep. 
*/ - vmx_inject_interrupts(vmx, vcpu, vlapic, rip); disable_intr(); - if (vmx_cap_en(vmx, VMX_CAP_APICV)) { - vmx_inject_pir(vlapic); + + /* + * If not precluded by existing events, inject any interrupt + * pending on the vLAPIC. As a lock-less operation, it is safe + * (and prudent) to perform with host CPU interrupts disabled. + */ + if (inject_state == EIS_CAN_INJECT) { + inject_state = vmx_inject_vlapic(vmx, vcpu, vlapic); } /* * Check for vcpu suspension after injecting events because - * vmx_inject_interrupts() can suspend the vcpu due to a + * vmx_inject_events() can suspend the vcpu due to a * triple fault. */ if (vcpu_suspended(evinfo)) { @@ -2974,6 +3017,16 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap, break; } + /* + * If subsequent activity queued events which require injection + * handling, take another lap to handle them. + */ + if (vmx_inject_recheck(vmx, vcpu, inject_state)) { + enable_intr(); + handled = HANDLED; + continue; + } + #ifndef __FreeBSD__ if ((rc = smt_acquire()) != 1) { enable_intr(); @@ -3032,17 +3085,8 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap, ldt_sel = sldt(); #endif - /* - * If TPR Shadowing is enabled, the TPR Threshold must be - * updated right before entering the guest. - */ - if (vmx_cap_en(vmx, VMX_CAP_TPR_SHADOW) && - !vmx_cap_en(vmx, VMX_CAP_APICV)) { - if ((vmx->cap[vcpu].proc_ctls & - PROCBASED_USE_TPR_SHADOW) != 0) { - vmcs_write(VMCS_TPR_THRESHOLD, - vlapic_get_cr8(vlapic)); - } + if (tpr_shadow_active) { + vmx_tpr_shadow_enter(vlapic); } vmx_run_trace(vmx, vcpu); @@ -3059,6 +3103,10 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap, lldt(ldt_sel); #endif + if (tpr_shadow_active) { + vmx_tpr_shadow_exit(vlapic); + } + /* Collect some information for VM exit processing */ vmexit->rip = rip = vmcs_guest_rip(); vmexit->inst_length = vmexit_instruction_length(); @@ -3524,47 +3572,73 @@ vmx_setcap(void *arg, int vcpu, int type, int val) struct vlapic_vtx { struct vlapic vlapic; + + /* Align to the nearest cacheline */ + uint8_t _pad[64 - (sizeof (struct vlapic) % 64)]; + + /* TMR handling state for posted interrupts */ + uint32_t tmr_active[8]; + uint32_t pending_level[8]; + uint32_t pending_edge[8]; + struct pir_desc *pir_desc; struct vmx *vmx; u_int pending_prio; + boolean_t tmr_sync; }; -#define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) +CTASSERT((offsetof (struct vlapic_vtx, tmr_active) & 63) == 0); -#define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ -do { \ - VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ - level ? "level" : "edge", vector); \ - VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \ - VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \ - VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \ - VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \ - VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\ -} while (0) +#define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) -/* - * vlapic->ops handlers that utilize the APICv hardware assist described in - * Chapter 29 of the Intel SDM. 
- */ -static int -vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) +static vcpu_notify_t +vmx_apicv_set_ready(struct vlapic *vlapic, int vector, bool level) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; - uint64_t mask; - int idx, notify = 0; + uint32_t mask, tmrval; + int idx; + vcpu_notify_t notify = VCPU_NOTIFY_NONE; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; + idx = vector / 32; + mask = 1UL << (vector % 32); /* - * Keep track of interrupt requests in the PIR descriptor. This is - * because the virtual APIC page pointed to by the VMCS cannot be - * modified if the vcpu is running. + * If the currently asserted TMRs do not match the state requested by + * the incoming interrupt, an exit will be required to reconcile those + * bits in the APIC page. This will keep the vLAPIC behavior in line + * with the architecturally defined expectations. + * + * If actors of mixed types (edge and level) are racing against the same + * vector (toggling its TMR bit back and forth), the results could + * inconsistent. Such circumstances are considered a rare edge case and + * are never expected to be found in the wild. */ - idx = vector / 64; - mask = 1UL << (vector % 64); - atomic_set_long(&pir_desc->pir[idx], mask); + tmrval = atomic_load_acq_int(&vlapic_vtx->tmr_active[idx]); + if (!level) { + if ((tmrval & mask) != 0) { + /* Edge-triggered interrupt needs TMR de-asserted */ + atomic_set_int(&vlapic_vtx->pending_edge[idx], mask); + atomic_store_rel_long(&pir_desc->pending, 1); + return (VCPU_NOTIFY_EXIT); + } + } else { + if ((tmrval & mask) == 0) { + /* Level-triggered interrupt needs TMR asserted */ + atomic_set_int(&vlapic_vtx->pending_level[idx], mask); + atomic_store_rel_long(&pir_desc->pending, 1); + return (VCPU_NOTIFY_EXIT); + } + } + + /* + * If the interrupt request does not require manipulation of the TMRs + * for delivery, set it in PIR descriptor. It cannot be inserted into + * the APIC page while the vCPU might be running. + */ + atomic_set_int(&pir_desc->pir[idx], mask); /* * A notification is required whenever the 'pending' bit makes a @@ -3585,7 +3659,7 @@ vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) * cleared whenever the 'pending' bit makes another 0->1 transition. */ if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { - notify = 1; + notify = VCPU_NOTIFY_APIC; vlapic_vtx->pending_prio = 0; } else { const u_int old_prio = vlapic_vtx->pending_prio; @@ -3593,113 +3667,44 @@ vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); - notify = 1; + notify = VCPU_NOTIFY_APIC; } } - VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, - level, "vmx_set_intr_ready"); return (notify); } -static int -vmx_pending_intr(struct vlapic *vlapic, int *vecptr) +static void +vmx_apicv_accepted(struct vlapic *vlapic, int vector) { - struct vlapic_vtx *vlapic_vtx; - struct pir_desc *pir_desc; - struct LAPIC *lapic; - uint64_t pending, pirval; - uint32_t ppr, vpr; - int i; - - /* - * This function is only expected to be called from the 'HLT' exit - * handler which does not care about the vector that is pending. 
- */ - KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL")); - - vlapic_vtx = (struct vlapic_vtx *)vlapic; - pir_desc = vlapic_vtx->pir_desc; - - pending = atomic_load_acq_long(&pir_desc->pending); - if (!pending) { - /* - * While a virtual interrupt may have already been - * processed the actual delivery maybe pending the - * interruptibility of the guest. Recognize a pending - * interrupt by reevaluating virtual interrupts - * following Section 29.2.1 in the Intel SDM Volume 3. - */ - struct vm_exit *vmexit; - uint8_t rvi, ppr; - - vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); - rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; - lapic = vlapic->apic_page; - ppr = lapic->ppr & APIC_TPR_INT; - if (rvi > ppr) { - return (1); - } - - return (0); - } - /* - * If there is an interrupt pending then it will be recognized only - * if its priority is greater than the processor priority. - * - * Special case: if the processor priority is zero then any pending - * interrupt will be recognized. + * When APICv is enabled for an instance, the traditional interrupt + * injection method (populating ENTRY_INTR_INFO in the VMCS) is not + * used and the CPU does the heavy lifting of virtual interrupt + * delivery. For that reason vmx_intr_accepted() should never be called + * when APICv is enabled. */ - lapic = vlapic->apic_page; - ppr = lapic->ppr & APIC_TPR_INT; - if (ppr == 0) - return (1); - - VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", - lapic->ppr); - - vpr = 0; - for (i = 3; i >= 0; i--) { - pirval = pir_desc->pir[i]; - if (pirval != 0) { - vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; - break; - } - } - - /* - * If the highest-priority pending interrupt falls short of the - * processor priority of this vCPU, ensure that 'pending_prio' does not - * have any stale bits which would preclude a higher-priority interrupt - * from incurring a notification later. 
- */ - if (vpr <= ppr) { - const u_int prio_bit = VPR_PRIO_BIT(vpr); - const u_int old = vlapic_vtx->pending_prio; - - if (old > prio_bit && (old & prio_bit) == 0) { - vlapic_vtx->pending_prio = prio_bit; - } - return (0); - } - return (1); + panic("vmx_intr_accepted: not expected to be called"); } static void -vmx_intr_accepted(struct vlapic *vlapic, int vector) +vmx_apicv_sync_tmr(struct vlapic *vlapic) { + struct vlapic_vtx *vlapic_vtx; + const uint32_t *tmrs; - panic("vmx_intr_accepted: not expected to be called"); -} + vlapic_vtx = (struct vlapic_vtx *)vlapic; + tmrs = &vlapic_vtx->tmr_active[0]; -static void -vmx_set_tmr(struct vlapic *vlapic, const uint32_t *masks) -{ - vmcs_write(VMCS_EOI_EXIT0, ((uint64_t)masks[1] << 32) | masks[0]); - vmcs_write(VMCS_EOI_EXIT1, ((uint64_t)masks[3] << 32) | masks[2]); - vmcs_write(VMCS_EOI_EXIT2, ((uint64_t)masks[5] << 32) | masks[4]); - vmcs_write(VMCS_EOI_EXIT3, ((uint64_t)masks[7] << 32) | masks[6]); + if (!vlapic_vtx->tmr_sync) { + return; + } + + vmcs_write(VMCS_EOI_EXIT0, ((uint64_t)tmrs[1] << 32) | tmrs[0]); + vmcs_write(VMCS_EOI_EXIT1, ((uint64_t)tmrs[3] << 32) | tmrs[2]); + vmcs_write(VMCS_EOI_EXIT2, ((uint64_t)tmrs[5] << 32) | tmrs[4]); + vmcs_write(VMCS_EOI_EXIT3, ((uint64_t)tmrs[7] << 32) | tmrs[6]); + vlapic_vtx->tmr_sync = B_FALSE; } static void @@ -3765,107 +3770,99 @@ vmx_enable_x2apic_mode_vid(struct vlapic *vlapic) } static void -vmx_post_intr(struct vlapic *vlapic, int hostcpu) +vmx_apicv_notify(struct vlapic *vlapic, int hostcpu) { -#ifdef __FreeBSD__ - ipi_cpu(hostcpu, pirvec); -#else psm_send_pir_ipi(hostcpu); -#endif } -/* - * Transfer the pending interrupts in the PIR descriptor to the IRR - * in the virtual APIC page. - */ static void -vmx_inject_pir(struct vlapic *vlapic) +vmx_apicv_sync(struct vlapic *vlapic) { struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; struct LAPIC *lapic; - uint64_t val, pirval; - int rvi, pirbase = -1; - uint16_t intr_status_old, intr_status_new; + uint_t i; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; + lapic = vlapic->apic_page; + if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) { - VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " - "no posted interrupt pending"); return; } - pirval = 0; - pirbase = -1; - lapic = vlapic->apic_page; + vlapic_vtx->pending_prio = 0; - val = atomic_readandclear_long(&pir_desc->pir[0]); - if (val != 0) { - lapic->irr0 |= val; - lapic->irr1 |= val >> 32; - pirbase = 0; - pirval = val; - } + /* Make sure the invalid (0-15) vectors are not set */ + ASSERT0(vlapic_vtx->pending_level[0] & 0xffff); + ASSERT0(vlapic_vtx->pending_edge[0] & 0xffff); + ASSERT0(pir_desc->pir[0] & 0xffff); - val = atomic_readandclear_long(&pir_desc->pir[1]); - if (val != 0) { - lapic->irr2 |= val; - lapic->irr3 |= val >> 32; - pirbase = 64; - pirval = val; - } + for (i = 0; i <= 7; i++) { + uint32_t *tmrp = &lapic->tmr0 + (i * 4); + uint32_t *irrp = &lapic->irr0 + (i * 4); - val = atomic_readandclear_long(&pir_desc->pir[2]); - if (val != 0) { - lapic->irr4 |= val; - lapic->irr5 |= val >> 32; - pirbase = 128; - pirval = val; - } + const uint32_t pending_level = + atomic_readandclear_int(&vlapic_vtx->pending_level[i]); + const uint32_t pending_edge = + atomic_readandclear_int(&vlapic_vtx->pending_edge[i]); + const uint32_t pending_inject = + atomic_readandclear_int(&pir_desc->pir[i]); + + if (pending_level != 0) { + /* + * Level-triggered interrupts assert their corresponding + * bit in the TMR when queued in IRR. 
+ */ + *tmrp |= pending_level; + *irrp |= pending_level; + } + if (pending_edge != 0) { + /* + * When queuing an edge-triggered interrupt in IRR, the + * corresponding bit in the TMR is cleared. + */ + *tmrp &= ~pending_edge; + *irrp |= pending_edge; + } + if (pending_inject != 0) { + /* + * Interrupts which do not require a change to the TMR + * (because it already matches the necessary state) can + * simply be queued in IRR. + */ + *irrp |= pending_inject; + } - val = atomic_readandclear_long(&pir_desc->pir[3]); - if (val != 0) { - lapic->irr6 |= val; - lapic->irr7 |= val >> 32; - pirbase = 192; - pirval = val; + if (*tmrp != vlapic_vtx->tmr_active[i]) { + /* Check if VMX EOI triggers require updating. */ + vlapic_vtx->tmr_active[i] = *tmrp; + vlapic_vtx->tmr_sync = B_TRUE; + } } +} - VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir"); +static void +vmx_tpr_shadow_enter(struct vlapic *vlapic) +{ + /* + * When TPR shadowing is enabled, VMX will initiate a guest exit if its + * TPR falls below a threshold priority. That threshold is set to the + * current TPR priority, since guest interrupt status should be + * re-evaluated if its TPR is set lower. + */ + vmcs_write(VMCS_TPR_THRESHOLD, vlapic_get_cr8(vlapic)); +} +static void +vmx_tpr_shadow_exit(struct vlapic *vlapic) +{ /* - * Update RVI so the processor can evaluate pending virtual - * interrupts on VM-entry. - * - * It is possible for pirval to be 0 here, even though the - * pending bit has been set. The scenario is: - * CPU-Y is sending a posted interrupt to CPU-X, which - * is running a guest and processing posted interrupts in h/w. - * CPU-X will eventually exit and the state seen in s/w is - * the pending bit set, but no PIR bits set. - * - * CPU-X CPU-Y - * (vm running) (host running) - * rx posted interrupt - * CLEAR pending bit - * SET PIR bit - * READ/CLEAR PIR bits - * SET pending bit - * (vm exit) - * pending bit set, PIR 0 + * Unlike full APICv, where changes to the TPR are reflected in the PPR, + * with TPR shadowing, that duty is relegated to the VMM. Upon exit, + * the PPR is updated to reflect any change in the TPR here. 
*/ - if (pirval != 0) { - rvi = pirbase + flsl(pirval) - 1; - intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS); - intr_status_new = (intr_status_old & 0xFF00) | rvi; - if (intr_status_new > intr_status_old) { - vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new); - VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: " - "guest_intr_status changed from 0x%04x to 0x%04x", - intr_status_old, intr_status_new); - } - } + vlapic_sync_tpr(vlapic); } static struct vlapic * @@ -3890,14 +3887,13 @@ vmx_vlapic_init(void *arg, int vcpuid) vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_ts; } if (vmx_cap_en(vmx, VMX_CAP_APICV)) { - vlapic->ops.set_intr_ready = vmx_set_intr_ready; - vlapic->ops.pending_intr = vmx_pending_intr; - vlapic->ops.intr_accepted = vmx_intr_accepted; - vlapic->ops.set_tmr = vmx_set_tmr; + vlapic->ops.set_intr_ready = vmx_apicv_set_ready; + vlapic->ops.sync_state = vmx_apicv_sync; + vlapic->ops.intr_accepted = vmx_apicv_accepted; vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode_vid; if (vmx_cap_en(vmx, VMX_CAP_APICV_PIR)) { - vlapic->ops.post_intr = vmx_post_intr; + vlapic->ops.post_intr = vmx_apicv_notify; } } diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.h b/usr/src/uts/i86pc/io/vmm/intel/vmx.h index 7943c1fd0e..b78f146755 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h @@ -110,7 +110,7 @@ CTASSERT(sizeof(struct apic_page) == PAGE_SIZE); /* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */ struct pir_desc { - uint64_t pir[4]; + uint32_t pir[8]; uint64_t pending; uint64_t unused[3]; } __aligned(64); diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.c b/usr/src/uts/i86pc/io/vmm/io/vioapic.c index 1e8ee1fa7a..89d3bf79df 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vioapic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.c @@ -237,141 +237,6 @@ vioapic_pulse_irq(struct vm *vm, int irq) return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } -#define REDIR_IS_PHYS(reg) (((reg) & IOART_DESTMOD) == IOART_DESTPHY) -#define REDIR_IS_LOWPRIO(reg) (((reg) & IOART_DELMOD) == IOART_DELLOPRI) -/* Level-triggered interrupts only valid in fixed and low-priority modes */ -#define REDIR_IS_LVLTRIG(reg) \ - (((reg) & IOART_TRGRLVL) != 0 && \ - (((reg) & IOART_DELMOD) == IOART_DELFIXED || REDIR_IS_LOWPRIO(reg))) -#define REDIR_DEST(reg) ((reg) >> (32 + APIC_ID_SHIFT)) -#define REDIR_VECTOR(reg) ((reg) & IOART_INTVEC) - -/* - * Given a redirection entry, determine which vCPUs would be targeted. - */ -static void -vioapic_calcdest(struct vioapic *vioapic, uint64_t redir_ent, cpuset_t *dmask) -{ - - /* - * When calculating interrupt destinations with vlapic_calcdest(), the - * legacy xAPIC format is assumed, since the system lacks interrupt - * redirection hardware. - * See vlapic_deliver_intr() for more details. - */ - vlapic_calcdest(vioapic->vm, dmask, REDIR_DEST(redir_ent), - REDIR_IS_PHYS(redir_ent), REDIR_IS_LOWPRIO(redir_ent), false); -} - -/* - * Across all redirection entries utilizing a specified vector, determine the - * set of vCPUs which would be targeted by a level-triggered interrupt. 
- */ -static void -vioapic_tmr_active(struct vioapic *vioapic, uint8_t vec, cpuset_t *result) -{ - u_int i; - - CPU_ZERO(result); - if (vec == 0) { - return; - } - - for (i = 0; i < REDIR_ENTRIES; i++) { - cpuset_t dest; - const uint64_t val = vioapic->rtbl[i].reg; - - if (!REDIR_IS_LVLTRIG(val) || REDIR_VECTOR(val) != vec) { - continue; - } - - CPU_ZERO(&dest); - vioapic_calcdest(vioapic, val, &dest); - CPU_OR(result, &dest); - } -} - -/* - * Update TMR state in vLAPICs after changes to vIOAPIC pin configuration - */ -static void -vioapic_update_tmrs(struct vioapic *vioapic, int vcpuid, uint64_t oldval, - uint64_t newval) -{ - cpuset_t active, allset, newset, oldset; - struct vm *vm; - uint8_t newvec, oldvec; - - vm = vioapic->vm; - CPU_ZERO(&allset); - CPU_ZERO(&newset); - CPU_ZERO(&oldset); - newvec = oldvec = 0; - - if (REDIR_IS_LVLTRIG(oldval)) { - vioapic_calcdest(vioapic, oldval, &oldset); - CPU_OR(&allset, &oldset); - oldvec = REDIR_VECTOR(oldval); - } - - if (REDIR_IS_LVLTRIG(newval)) { - vioapic_calcdest(vioapic, newval, &newset); - CPU_OR(&allset, &newset); - newvec = REDIR_VECTOR(newval); - } - - if (CPU_EMPTY(&allset) || - (CPU_CMP(&oldset, &newset) == 0 && oldvec == newvec)) { - return; - } - - /* - * Since the write to the redirection table has already occurred, a - * scan of level-triggered entries referencing the old vector will find - * only entries which are now currently valid. - */ - vioapic_tmr_active(vioapic, oldvec, &active); - - while (!CPU_EMPTY(&allset)) { - struct vlapic *vlapic; - u_int i; - - i = CPU_FFS(&allset) - 1; - CPU_CLR(i, &allset); - - if (oldvec == newvec && - CPU_ISSET(i, &oldset) && CPU_ISSET(i, &newset)) { - continue; - } - - if (i != vcpuid) { - vcpu_block_run(vm, i); - } - - vlapic = vm_lapic(vm, i); - if (CPU_ISSET(i, &oldset)) { - /* - * Perform the deassertion if no other level-triggered - * IOAPIC entries target this vCPU with the old vector - * - * Note: Sharing of vectors like that should be - * extremely rare in modern operating systems and was - * previously unsupported by the bhyve vIOAPIC. - */ - if (!CPU_ISSET(i, &active)) { - vlapic_tmr_set(vlapic, oldvec, false); - } - } - if (CPU_ISSET(i, &newset)) { - vlapic_tmr_set(vlapic, newvec, true); - } - - if (i != vcpuid) { - vcpu_unblock_run(vm, i); - } - } -} - static uint32_t vioapic_read(struct vioapic *vioapic, int vcpuid, uint32_t addr) { @@ -411,7 +276,6 @@ static void vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) { uint64_t data64, mask64; - uint64_t last, changed; int regnum, pin, lshift; regnum = addr & 0xff; @@ -436,8 +300,6 @@ vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) else lshift = 0; - last = vioapic->rtbl[pin].reg; - data64 = (uint64_t)data << lshift; mask64 = (uint64_t)0xffffffff << lshift; vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS; @@ -447,19 +309,6 @@ vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) pin, vioapic->rtbl[pin].reg); /* - * If any fields in the redirection table entry (except mask - * or polarity) have changed then update the trigger-mode - * registers on all the vlapics. 
- */ - changed = last ^ vioapic->rtbl[pin].reg; - if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { - VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " - "vlapic trigger-mode register", pin); - vioapic_update_tmrs(vioapic, vcpuid, last, - vioapic->rtbl[pin].reg); - } - - /* * Generate an interrupt if the following conditions are met: * - pin is not masked * - previous interrupt has been EOIed diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c index 038c17ca78..8af77a387b 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c @@ -70,7 +70,13 @@ __FBSDID("$FreeBSD$"); #include "vlapic_priv.h" #include "vioapic.h" -#define PRIO(x) ((x) >> 4) + +/* + * The 4 high bits of a given interrupt vector represent its priority. The same + * is true for the contents of the TPR when it is used to calculate the ultimate + * PPR of an APIC - the 4 high bits hold the priority. + */ +#define PRIO(x) ((x) & 0xf0) #define VLAPIC_VERSION (16) @@ -94,7 +100,6 @@ __FBSDID("$FreeBSD$"); #define VLAPIC_BUS_FREQ (128 * 1024 * 1024) static void vlapic_set_error(struct vlapic *, uint32_t, bool); -static void vlapic_tmr_reset(struct vlapic *); #ifdef __ISRVEC_DEBUG static void vlapic_isrstk_accept(struct vlapic *, int); @@ -289,52 +294,60 @@ vlapic_esr_write_handler(struct vlapic *vlapic) vlapic->esr_pending = 0; } -int +vcpu_notify_t vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) { struct LAPIC *lapic; - uint32_t *irrptr, *tmrptr, mask; + uint32_t *irrptr, *tmrptr, mask, tmr; int idx; KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector)); lapic = vlapic->apic_page; if (!(lapic->svr & APIC_SVR_ENABLE)) { - VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring " - "interrupt %d", vector); - return (0); + /* ignore interrupt on software-disabled APIC */ + return (VCPU_NOTIFY_NONE); } if (vector < 16) { vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR, false); - VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", - vector); - return (1); + + /* + * If the error LVT is configured to interrupt the vCPU, it will + * have delivered a notification through that mechanism. + */ + return (VCPU_NOTIFY_NONE); } - if (vlapic->ops.set_intr_ready) + if (vlapic->ops.set_intr_ready) { return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level)); + } idx = (vector / 32) * 4; mask = 1 << (vector % 32); - + tmrptr = &lapic->tmr0; irrptr = &lapic->irr0; - atomic_set_int(&irrptr[idx], mask); /* - * Verify that the trigger-mode of the interrupt matches with - * the vlapic TMR registers. + * Update TMR for requested vector, if necessary. + * This must be done prior to asserting the bit in IRR so that the + * proper TMR state is always visible before the to-be-queued interrupt + * can be injected. */ - tmrptr = &lapic->tmr0; - if ((tmrptr[idx] & mask) != (level ? mask : 0)) { - VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but " - "interrupt is %s-triggered", idx / 4, tmrptr[idx], - level ? "level" : "edge"); + tmr = atomic_load_acq_32(&tmrptr[idx]); + if ((tmr & mask) != (level ? 
mask : 0)) { + if (level) { + atomic_set_int(&tmrptr[idx], mask); + } else { + atomic_clear_int(&tmrptr[idx], mask); + } } - VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready"); - return (1); + /* Now set the bit in IRR */ + atomic_set_int(&irrptr[idx], mask); + + return (VCPU_NOTIFY_EXIT); } static __inline uint32_t * @@ -472,6 +485,7 @@ static int vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) { uint32_t mode, reg, vec; + vcpu_notify_t notify; reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]); @@ -487,8 +501,8 @@ vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) lvt == APIC_LVT_ERROR); return (0); } - if (vlapic_set_intr_ready(vlapic, vec, false)) - vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true); + notify = vlapic_set_intr_ready(vlapic, vec, false); + vcpu_notify_event_type(vlapic->vm, vlapic->vcpuid, notify); break; case APIC_LVT_DM_NMI: vm_inject_nmi(vlapic->vm, vlapic->vcpuid); @@ -532,8 +546,8 @@ vlapic_active_isr(struct vlapic *vlapic) } /* - * Algorithm adopted from section "Interrupt, Task and Processor Priority" - * in Intel Architecture Manual Vol 3a. + * After events which might arbitrarily change the value of PPR, such as a TPR + * write or an EOI, calculate that new PPR value and store it in the APIC page. */ static void vlapic_update_ppr(struct vlapic *vlapic) @@ -543,19 +557,44 @@ vlapic_update_ppr(struct vlapic *vlapic) isrvec = vlapic_active_isr(vlapic); tpr = vlapic->apic_page->tpr; -#ifdef __ISRVEC_DEBUG - vlapic_isrstk_verify(vlapic); -#endif - - if (PRIO(tpr) >= PRIO(isrvec)) + /* + * Algorithm adopted from section "Interrupt, Task and Processor + * Priority" in Intel Architecture Manual Vol 3a. + */ + if (PRIO(tpr) >= PRIO(isrvec)) { ppr = tpr; - else - ppr = isrvec & 0xf0; + } else { + ppr = PRIO(isrvec); + } vlapic->apic_page->ppr = ppr; VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } +/* + * When a vector is asserted in ISR as in-service, the PPR must be raised to the + * priority of that vector, as the vCPU would have been at a lower priority in + * order for the vector to be accepted. 
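A worked example of the PRIO() change and the raise-on-accept rule, with illustrative values: accepting vector 0x61 while TPR is 0x30 gives PRIO(0x30) = 0x30 and PRIO(0x61) = 0x60, so the resulting PPR is 0x60. Because PRIO() now masks rather than shifts, its result is already in the high-nibble form that PPR stores, which is what lets vlapic_raise_ppr() assign it directly.

/* Worked sketch, not code from this patch; PRIO() as defined above. */
static uint8_t
example_ppr(uint8_t tpr, uint8_t isrvec)
{
	/* e.g. tpr = 0x30, isrvec = 0x61 -> PRIO() values 0x30 vs 0x60 */
	if (PRIO(tpr) >= PRIO(isrvec))
		return (tpr);
	return (PRIO(isrvec));		/* 0x60: storable as-is in PPR */
}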
+ */ +static void +vlapic_raise_ppr(struct vlapic *vlapic, int vec) +{ + struct LAPIC *lapic = vlapic->apic_page; + int ppr; + + ppr = PRIO(vec); + +#ifdef __ISRVEC_DEBUG + KASSERT(vec >= 16 && vec < 256, ("invalid vector %d", vec)); + KASSERT(ppr > lapic->tpr, ("ppr %x <= tpr %x", ppr, lapic->tpr)); + KASSERT(ppr > lapic->ppr, ("ppr %x <= old ppr %x", ppr, lapic->ppr)); + KASSERT(vec == (int)vlapic_active_isr(vlapic), ("ISR missing for ppr")); +#endif /* __ISRVEC_DEBUG */ + + lapic->ppr = ppr; + VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); +} + void vlapic_sync_tpr(struct vlapic *vlapic) { @@ -1087,10 +1126,9 @@ vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) int idx, i, bitpos, vector; uint32_t *irrptr, val; - vlapic_update_ppr(vlapic); - - if (vlapic->ops.pending_intr) - return ((*vlapic->ops.pending_intr)(vlapic, vecptr)); + if (vlapic->ops.sync_state) { + (*vlapic->ops.sync_state)(vlapic); + } irrptr = &lapic->irr0; @@ -1119,6 +1157,8 @@ vlapic_intr_accepted(struct vlapic *vlapic, int vector) uint32_t *irrptr, *isrptr; int idx; + KASSERT(vector >= 16 && vector < 256, ("invalid vector %d", vector)); + if (vlapic->ops.intr_accepted) return ((*vlapic->ops.intr_accepted)(vlapic, vector)); @@ -1136,6 +1176,13 @@ vlapic_intr_accepted(struct vlapic *vlapic, int vector) isrptr[idx] |= 1 << (vector % 32); VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted"); + /* + * The only way a fresh vector could be accepted into ISR is if it was + * of a higher priority than the current PPR. With that vector now + * in-service, the PPR must be raised. + */ + vlapic_raise_ppr(vlapic, vector); + #ifdef __ISRVEC_DEBUG vlapic_isrstk_accept(vlapic, vector); #endif @@ -1425,7 +1472,6 @@ vlapic_reset(struct vlapic *vlapic) lapic->dfr = 0xffffffff; lapic->svr = APIC_SVR_VECTOR; vlapic_mask_lvts(vlapic); - vlapic_tmr_reset(vlapic); lapic->dcr_timer = 0; vlapic_dcr_write_handler(vlapic); @@ -1592,82 +1638,6 @@ vlapic_enabled(struct vlapic *vlapic) return (false); } -static void -vlapic_tmr_reset(struct vlapic *vlapic) -{ - struct LAPIC *lapic; - - lapic = vlapic->apic_page; - lapic->tmr0 = lapic->tmr1 = lapic->tmr2 = lapic->tmr3 = 0; - lapic->tmr4 = lapic->tmr5 = lapic->tmr6 = lapic->tmr7 = 0; - vlapic->tmr_pending = 1; -} - -/* - * Synchronize TMR designations into the LAPIC state. - * The vCPU must be in the VCPU_RUNNING state. - */ -void -vlapic_tmr_update(struct vlapic *vlapic) -{ - struct LAPIC *lapic; - uint32_t *tmrptr; - uint32_t result[VLAPIC_TMR_CNT]; - u_int i, tmr_idx; - - if (vlapic->tmr_pending == 0) { - return; - } - - lapic = vlapic->apic_page; - tmrptr = &lapic->tmr0; - - VLAPIC_CTR0(vlapic, "synchronizing TMR"); - for (i = 0; i < VLAPIC_TMR_CNT; i++) { - tmr_idx = i * 4; - - tmrptr[tmr_idx] &= ~vlapic->tmr_vec_deassert[i]; - tmrptr[tmr_idx] |= vlapic->tmr_vec_assert[i]; - vlapic->tmr_vec_deassert[i] = 0; - vlapic->tmr_vec_assert[i] = 0; - result[i] = tmrptr[tmr_idx]; - } - vlapic->tmr_pending = 0; - - if (vlapic->ops.set_tmr != NULL) { - (*vlapic->ops.set_tmr)(vlapic, result); - } -} - -/* - * Designate the TMR state for a given interrupt vector. - * The caller must hold the vIOAPIC lock and prevent the vCPU corresponding to - * this vLAPIC instance from being-in or entering the VCPU_RUNNING state. - */ -void -vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active) -{ - const uint32_t idx = vector / 32; - const uint32_t mask = 1 << (vector % 32); - - VLAPIC_CTR2(vlapic, "TMR for vector %u %sasserted", vector, - active ? 
"" : "de"); - if (active) { - vlapic->tmr_vec_assert[idx] |= mask; - vlapic->tmr_vec_deassert[idx] &= ~mask; - } else { - vlapic->tmr_vec_deassert[idx] |= mask; - vlapic->tmr_vec_assert[idx] &= ~mask; - } - - /* - * Track the number of TMR changes between calls to vlapic_tmr_update. - * While a simple boolean would suffice, this count may be useful when - * tracing or debugging, and is cheap to calculate. - */ - vlapic->tmr_pending = MIN(UINT32_MAX - 1, vlapic->tmr_pending) + 1; -} - #ifndef __FreeBSD__ void vlapic_localize_resources(struct vlapic *vlapic) @@ -1685,6 +1655,7 @@ vlapic_isrstk_eoi(struct vlapic *vlapic, int vector) vlapic->isrvec_stk_top); } vlapic->isrvec_stk_top--; + vlapic_isrstk_verify(vlapic); } static void @@ -1699,6 +1670,7 @@ vlapic_isrstk_accept(struct vlapic *vlapic, int vector) panic("isrvec_stk_top overflow %d", stk_top); vlapic->isrvec_stk[stk_top] = vector; + vlapic_isrstk_verify(vlapic); } static void diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.h b/usr/src/uts/i86pc/io/vmm/io/vlapic.h index 746699393f..f34cf1ec4b 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.h +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h @@ -63,10 +63,8 @@ int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr); */ void vlapic_intr_accepted(struct vlapic *vlapic, int vector); -/* - * Returns 1 if the vcpu needs to be notified of the interrupt and 0 otherwise. - */ -int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); +vcpu_notify_t vlapic_set_intr_ready(struct vlapic *vlapic, int vector, + bool level); /* * Post an interrupt to the vcpu running on 'hostcpu'. This will use a @@ -91,9 +89,6 @@ void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, bool lowprio, bool x2apic_dest); -void vlapic_tmr_update(struct vlapic *vlapic); -void vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active); - void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); uint64_t vlapic_get_cr8(struct vlapic *vlapic); diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h index 8a0d594de3..1329ab5b36 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h @@ -159,11 +159,11 @@ enum boot_state { struct vlapic; struct vlapic_ops { - int (*set_intr_ready)(struct vlapic *vlapic, int vector, bool level); - int (*pending_intr)(struct vlapic *vlapic, int *vecptr); + vcpu_notify_t (*set_intr_ready)(struct vlapic *vlapic, int vector, + bool level); + void (*sync_state)(struct vlapic *vlapic); void (*intr_accepted)(struct vlapic *vlapic, int vector); void (*post_intr)(struct vlapic *vlapic, int hostcpu); - void (*set_tmr)(struct vlapic *vlapic, const uint32_t *result); void (*enable_x2apic_mode)(struct vlapic *vlapic); }; @@ -174,7 +174,6 @@ struct vlapic { struct vlapic_ops ops; uint32_t esr_pending; - uint32_t tmr_pending; struct callout callout; /* vlapic timer */ struct bintime timer_fire_bt; /* callout expiry time */ @@ -194,19 +193,6 @@ struct vlapic { uint32_t svr_last; uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; - /* - * Store intended modifications to the trigger-mode register state. - * Along with the tmr_pending counter above, these are protected by the - * vIOAPIC lock and can only be modified under specific conditions: - * - * 1. When holding the vIOAPIC lock, and the vCPU to which the vLAPIC - * belongs is prevented from entering the VCPU_RUNNING state. - * 2. 
When the owning vCPU is in the VCPU_RUNNING state, and is - * applying the TMR modifications prior to interrupt injection. - */ - uint32_t tmr_vec_deassert[VLAPIC_TMR_CNT]; - uint32_t tmr_vec_assert[VLAPIC_TMR_CNT]; - #ifdef __ISRVEC_DEBUG /* * The 'isrvec_stk' is a stack of vectors injected by the local APIC. diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h index acdabf556f..b566e503e0 100644 --- a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h +++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h @@ -280,8 +280,15 @@ vcpu_should_yield(struct vm *vm, int vcpu) } #endif /* _SYS_THREAD_H */ +typedef enum vcpu_notify { + VCPU_NOTIFY_NONE, + VCPU_NOTIFY_APIC, /* Posted intr notification (if possible) */ + VCPU_NOTIFY_EXIT, /* IPI to cause VM exit */ +} vcpu_notify_t; + void *vcpu_stats(struct vm *vm, int vcpu); -void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); +void vcpu_notify_event(struct vm *vm, int vcpuid); +void vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t); struct vmspace *vm_get_vmspace(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); @@ -374,6 +381,25 @@ void vm_inject_ac(struct vm *vm, int vcpuid, int errcode); void vm_inject_ss(struct vm *vm, int vcpuid, int errcode); void vm_inject_pf(struct vm *vm, int vcpuid, int errcode, uint64_t cr2); +/* + * Both SVM and VMX have complex logic for injecting events such as exceptions + * or interrupts into the guest. Within those two backends, the progress of + * event injection is tracked by event_inject_state, hopefully making it easier + * to reason about. + */ +enum event_inject_state { + EIS_CAN_INJECT = 0, /* exception/interrupt can be injected */ + EIS_EV_EXISTING = 1, /* blocked by existing event */ + EIS_EV_INJECTED = 2, /* blocked by injected event */ + EIS_GI_BLOCK = 3, /* blocked by guest interruptability */ + + /* + * Flag to request an immediate exit from VM context after event + * injection in order to perform more processing + */ + EIS_REQ_EXIT = (1 << 15), +}; + #ifndef __FreeBSD__ void vmm_sol_glue_init(void); diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index 1821a96fd7..3cd89f9fe6 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -297,7 +297,7 @@ SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, static void vm_free_memmap(struct vm *vm, int ident); static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); -static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); +static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t); #ifndef __FreeBSD__ static void vm_clear_memseg(struct vm *, int); @@ -1338,7 +1338,7 @@ vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, if (from_idle) { while (vcpu->state != VCPU_IDLE) { vcpu->reqidle = 1; - vcpu_notify_event_locked(vcpu, false); + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " "idle requested", vcpu_state2str(vcpu->state)); #ifdef __FreeBSD__ @@ -1839,7 +1839,7 @@ vm_handle_suspend(struct vm *vm, int vcpuid) */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->suspended_cpus)) { - vcpu_notify_event(vm, i, false); + vcpu_notify_event(vm, i); } } @@ -1909,7 +1909,7 @@ vm_suspend(struct vm *vm, enum vm_suspend_how how) */ for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) - vcpu_notify_event(vm, i, false); + vcpu_notify_event(vm, i); } 
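The event_inject_state values added to vmm_kernel.h above are consumed by the backend entry paths, which are not shown in this hunk. The sketch below is a hypothetical outline of that pattern; inject_events() and request_intr_window_exit() are stand-in names, not functions from this patch.

/* Sketch of a backend acting on event_inject_state (hypothetical). */
static bool
example_handle_eis(void *backend_sc, int vcpu)
{
	enum event_inject_state eis;
	bool immediate_exit = false;

	eis = inject_events(backend_sc, vcpu);	/* stand-in for backend logic */
	if ((eis & EIS_REQ_EXIT) != 0) {
		/* Finish processing right after the next VM entry. */
		immediate_exit = true;
		eis &= ~EIS_REQ_EXIT;
	}

	switch (eis) {
	case EIS_CAN_INJECT:
		/* Nothing blocked injection; no window exiting needed. */
		break;
	case EIS_EV_EXISTING:
	case EIS_EV_INJECTED:
	case EIS_GI_BLOCK:
		/* Trap as soon as another event could be injected. */
		request_intr_window_exit(backend_sc, vcpu);
		break;
	default:
		break;
	}
	return (immediate_exit);
}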
return (0); @@ -2620,6 +2620,14 @@ vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, return (EINVAL); /* + * NMIs (which bear an exception vector of 2) are to be injected via + * their own specialized path using vm_inject_nmi(). + */ + if (vector == 2) { + return (EINVAL); + } + + /* * A double fault exception should never be injected directly into * the guest. It is a derived exception that results from specific * combinations of nested faults. @@ -2728,7 +2736,7 @@ vm_inject_nmi(struct vm *vm, int vcpuid) vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; - vcpu_notify_event(vm, vcpuid, false); + vcpu_notify_event(vm, vcpuid); return (0); } @@ -2775,7 +2783,7 @@ vm_inject_extint(struct vm *vm, int vcpuid) vcpu = &vm->vcpu[vcpuid]; vcpu->extint_pending = 1; - vcpu_notify_event(vm, vcpuid, false); + vcpu_notify_event(vm, vcpuid); return (0); } @@ -2956,7 +2964,7 @@ vcpu_block_run(struct vm *vm, int vcpuid) vcpu_lock(vcpu); vcpu->runblock++; if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) { - vcpu_notify_event_locked(vcpu, false); + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); } while (vcpu->state == VCPU_RUNNING) { #ifdef __FreeBSD__ @@ -3026,14 +3034,14 @@ vm_suspend_cpu(struct vm *vm, int vcpuid) vm->debug_cpus = vm->active_cpus; for (i = 0; i < vm->maxcpus; i++) { if (CPU_ISSET(i, &vm->active_cpus)) - vcpu_notify_event(vm, i, false); + vcpu_notify_event(vm, i); } } else { if (!CPU_ISSET(vcpuid, &vm->active_cpus)) return (EINVAL); CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); - vcpu_notify_event(vm, vcpuid, false); + vcpu_notify_event(vm, vcpuid); } return (0); } @@ -3126,15 +3134,17 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) * to the host_cpu to cause the vcpu to trap into the hypervisor. 
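As a usage note for the vm_inject_exception() guard above: an NMI is exception vector 2 architecturally, but it must be requested through vm_inject_nmi() so it follows the pending-NMI path; asking for vector 2 via vm_inject_exception() now fails with EINVAL. A hypothetical caller is sketched below, with the trailing exception arguments (error code, restart) written as assumed rather than taken from this hunk.

/* Sketch (hypothetical caller), not code from this patch. */
static int
example_inject(struct vm *vm, int vcpuid, bool want_nmi)
{
	if (want_nmi) {
		/* Vector 2 through vm_inject_exception() is rejected. */
		return (vm_inject_nmi(vm, vcpuid));
	}

	/* #GP(0): vector 13 with a valid error code of 0. */
	return (vm_inject_exception(vm, vcpuid, 13, 1, 0, 1));
}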
*/ static void -vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) +vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t ntype) { int hostcpu; + ASSERT(ntype == VCPU_NOTIFY_APIC || VCPU_NOTIFY_EXIT); + hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); if (hostcpu != curcpu) { - if (lapic_intr) { + if (ntype == VCPU_NOTIFY_APIC) { vlapic_post_intr(vcpu->vlapic, hostcpu, vmm_ipinum); } else { @@ -3162,12 +3172,26 @@ vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) } void -vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) +vcpu_notify_event(struct vm *vm, int vcpuid) { struct vcpu *vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); - vcpu_notify_event_locked(vcpu, lapic_intr); + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); + vcpu_unlock(vcpu); +} + +void +vcpu_notify_event_type(struct vm *vm, int vcpuid, vcpu_notify_t ntype) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + if (ntype == VCPU_NOTIFY_NONE) { + return; + } + + vcpu_lock(vcpu); + vcpu_notify_event_locked(vcpu, ntype); vcpu_unlock(vcpu); } diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c index f8d8970807..3de67f012d 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c @@ -67,6 +67,7 @@ int lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) { struct vlapic *vlapic; + vcpu_notify_t notify; if (cpu < 0 || cpu >= vm_get_maxcpus(vm)) return (EINVAL); @@ -79,8 +80,8 @@ lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) return (EINVAL); vlapic = vm_lapic(vm, cpu); - if (vlapic_set_intr_ready(vlapic, vector, level)) - vcpu_notify_event(vm, cpu, true); + notify = vlapic_set_intr_ready(vlapic, vector, level); + vcpu_notify_event_type(vm, cpu, notify); return (0); } diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h index 2322919d29..c6859a3c00 100644 --- a/usr/src/uts/i86pc/sys/vmm.h +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -353,7 +353,6 @@ struct vm_exit { } spinup_ap; struct { uint64_t rflags; - uint64_t intr_status; } hlt; struct { int vector; |
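Finally, tying the notify types together: an ops.set_intr_ready implementation that queues the vector in hardware, as the APICv path does, can return VCPU_NOTIFY_APIC, which vcpu_notify_event_locked() above turns into a posted-interrupt IPI via vlapic_post_intr() rather than a forced exit, while VCPU_NOTIFY_NONE suppresses the notification entirely. The op sketched below is hypothetical; it is not the patch's vmx_apicv_set_ready().

/* Sketch (hypothetical APICv-style op). */
static vcpu_notify_t
example_apicv_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
{
	/* ...record the vector in the posted-interrupt descriptor... */

	/* Let the caller post the interrupt to a running vCPU. */
	return (VCPU_NOTIFY_APIC);
}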