author    Patrick Mooney <pmooney@pfmooney.com>  2019-02-28 20:12:58 +0000
committer Patrick Mooney <pmooney@pfmooney.com>  2019-06-20 13:28:50 +0000
commit    ec6f18e946abf1cebedfd0d7ca47dd5a5d183ff8 (patch)
tree      537fa3a59398af947af43a350661f676198e47bc /usr/src/uts/i86pc
parent    0d05320d3398b8b3197651d60300801cee9d054e (diff)
OS-7622 bhyve vioapic writes can deadlock instance
Reviewed by: Hans Rosenfeld <hans.rosenfeld@joyent.com>
Reviewed by: John Baldwin <jhb@FreeBSD.org>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
Diffstat (limited to 'usr/src/uts/i86pc')
-rw-r--r--  usr/src/uts/i86pc/io/vmm/amd/svm.c         |  10
-rw-r--r--  usr/src/uts/i86pc/io/vmm/intel/vmx.c       |  36
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/vioapic.c      | 164
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/vlapic.c       | 102
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/vlapic.h       |  13
-rw-r--r--  usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h  |  18
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm.c             | 264
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c    |  31
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_stat.c        |   2
-rw-r--r--  usr/src/uts/i86pc/io/vmm/vmm_stat.h        |   2
-rw-r--r--  usr/src/uts/i86pc/sys/vmm.h                |  30
11 files changed, 336 insertions, 336 deletions
diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c
index 114eec365a..25dc3a63fa 100644
--- a/usr/src/uts/i86pc/io/vmm/amd/svm.c
+++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c
@@ -1629,6 +1629,8 @@ svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic)
need_intr_window = 0;
+ vlapic_tmr_update(vlapic);
+
if (vcpustate->nextrip != state->rip) {
ctrl->intr_shadow = 0;
VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking "
@@ -2060,8 +2062,8 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
* XXX
* Setting 'vcpustate->lastcpu' here is bit premature because
* we may return from this function without actually executing
- * the VMRUN instruction. This could happen if a rendezvous
- * or an AST is pending on the first time through the loop.
+ * the VMRUN instruction. This could happen if an AST or yield
+ * condition is pending on the first time through the loop.
*
* This works for now but any new side-effects of vcpu
* migration should take this case into account.
@@ -2106,9 +2108,9 @@ svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
break;
}
- if (vcpu_rendezvous_pending(evinfo)) {
+ if (vcpu_runblocked(evinfo)) {
enable_gintr();
- vm_exit_rendezvous(vm, vcpu, state->rip);
+ vm_exit_runblock(vm, vcpu, state->rip);
break;
}
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c
index cf6b0e69c3..131615d576 100644
--- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c
@@ -1525,6 +1525,8 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
int vector;
boolean_t extint_pending = B_FALSE;
+ vlapic_tmr_update(vlapic);
+
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
info = vmcs_read(VMCS_ENTRY_INTR_INFO);
@@ -1701,6 +1703,8 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic,
uint64_t rflags, entryinfo;
uint32_t gi, info;
+ vlapic_tmr_update(vlapic);
+
if (vmx->state[vcpu].nextrip != guestrip) {
gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
if (gi & HWINTR_BLOCKING) {
@@ -3321,9 +3325,9 @@ vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap,
break;
}
- if (vcpu_rendezvous_pending(evinfo)) {
+ if (vcpu_runblocked(evinfo)) {
enable_intr();
- vm_exit_rendezvous(vmx->vm, vcpu, rip);
+ vm_exit_runblock(vmx->vm, vcpu, rip);
break;
}
@@ -4046,30 +4050,12 @@ vmx_intr_accepted(struct vlapic *vlapic, int vector)
}
static void
-vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
+vmx_set_tmr(struct vlapic *vlapic, const uint32_t *masks)
{
- struct vlapic_vtx *vlapic_vtx;
- struct vmx *vmx;
- struct vmcs *vmcs;
- uint64_t mask, val;
-
- KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
- KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
- ("vmx_set_tmr: vcpu cannot be running"));
-
- vlapic_vtx = (struct vlapic_vtx *)vlapic;
- vmx = vlapic_vtx->vmx;
- vmcs = &vmx->vmcs[vlapic->vcpuid];
- mask = 1UL << (vector % 64);
-
- VMPTRLD(vmcs);
- val = vmcs_read(VMCS_EOI_EXIT(vector));
- if (level)
- val |= mask;
- else
- val &= ~mask;
- vmcs_write(VMCS_EOI_EXIT(vector), val);
- VMCLEAR(vmcs);
+ vmcs_write(VMCS_EOI_EXIT0, ((uint64_t)masks[1] << 32) | masks[0]);
+ vmcs_write(VMCS_EOI_EXIT1, ((uint64_t)masks[3] << 32) | masks[2]);
+ vmcs_write(VMCS_EOI_EXIT2, ((uint64_t)masks[5] << 32) | masks[4]);
+ vmcs_write(VMCS_EOI_EXIT3, ((uint64_t)masks[7] << 32) | masks[6]);
}
static void
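The rewritten vmx_set_tmr above receives all eight 32-bit TMR words at once and stores each adjacent pair as a single 64-bit EOI-exit bitmap field. Because its caller, vlapic_tmr_update, now runs on the vCPU itself while it is in the VCPU_RUNNING state, the VMCS is already current and the old VMPTRLD/VMCLEAR dance and the cannot-be-running assertion fall away. A minimal standalone sketch of the packing, with a hypothetical print stub standing in for the real vmcs_write():

#include <stdint.h>
#include <stdio.h>

/* Print stub standing in for the real vmcs_write() (hypothetical). */
static void
vmcs_write_stub(unsigned int field, uint64_t val)
{
	(void) printf("EOI_EXIT%u = 0x%016llx\n", field,
	    (unsigned long long)val);
}

/* Pack eight 32-bit TMR words into four 64-bit EOI-exit bitmaps. */
static void
set_tmr_sketch(const uint32_t masks[8])
{
	unsigned int i;

	for (i = 0; i < 4; i++) {
		vmcs_write_stub(i, ((uint64_t)masks[i * 2 + 1] << 32) |
		    masks[i * 2]);
	}
}

int
main(void)
{
	/* Vector 0x21 marked level-triggered: bit 1 of TMR word 1. */
	uint32_t masks[8] = { 0, 1u << 1, 0, 0, 0, 0, 0, 0 };

	set_tmr_sketch(masks);
	return (0);
}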
diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.c b/usr/src/uts/i86pc/io/vmm/io/vioapic.c
index c001ffd933..dbd3420420 100644
--- a/usr/src/uts/i86pc/io/vmm/io/vioapic.c
+++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.c
@@ -52,6 +52,7 @@ __FBSDID("$FreeBSD$");
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
+#include <sys/cpuset.h>
#include <x86/apicreg.h>
#include <machine/vmm.h>
@@ -236,48 +237,139 @@ vioapic_pulse_irq(struct vm *vm, int irq)
return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE));
}
+#define REDIR_IS_PHYS(reg) (((reg) & IOART_DESTMOD) == IOART_DESTPHY)
+#define REDIR_IS_LOWPRIO(reg) (((reg) & IOART_DELMOD) == IOART_DELLOPRI)
+/* Level-triggered interrupts only valid in fixed and low-priority modes */
+#define REDIR_IS_LVLTRIG(reg) \
+ (((reg) & IOART_TRGRLVL) != 0 && \
+ (((reg) & IOART_DELMOD) == IOART_DELFIXED || REDIR_IS_LOWPRIO(reg)))
+#define REDIR_DEST(reg) ((reg) >> (32 + APIC_ID_SHIFT))
+#define REDIR_VECTOR(reg) ((reg) & IOART_INTVEC)
+
/*
- * Reset the vlapic's trigger-mode register to reflect the ioapic pin
- * configuration.
+ * Given a redirection entry, determine which vCPUs would be targeted.
*/
static void
-vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg)
+vioapic_calcdest(struct vioapic *vioapic, uint64_t redir_ent, cpuset_t *dmask)
{
- struct vioapic *vioapic;
- struct vlapic *vlapic;
- uint32_t low, high, dest;
- int delmode, pin, vector;
- bool level, phys;
- vlapic = vm_lapic(vm, vcpuid);
- vioapic = vm_ioapic(vm);
+ /*
+ * When calculating interrupt destinations with vlapic_calcdest(), the
+ * legacy xAPIC format is assumed, since the system lacks interrupt
+ * redirection hardware.
+ * See vlapic_deliver_intr() for more details.
+ */
+ vlapic_calcdest(vioapic->vm, dmask, REDIR_DEST(redir_ent),
+ REDIR_IS_PHYS(redir_ent), REDIR_IS_LOWPRIO(redir_ent), false);
+}
+
+/*
+ * Across all redirection entries utilizing a specified vector, determine the
+ * set of vCPUs which would be targeted by a level-triggered interrupt.
+ */
+static void
+vioapic_tmr_active(struct vioapic *vioapic, uint8_t vec, cpuset_t *result)
+{
+ u_int i;
+
+ CPU_ZERO(result);
+ if (vec == 0) {
+ return;
+ }
+
+ for (i = 0; i < REDIR_ENTRIES; i++) {
+ cpuset_t dest;
+ const uint64_t val = vioapic->rtbl[i].reg;
+
+ if (!REDIR_IS_LVLTRIG(val) || REDIR_VECTOR(val) != vec) {
+ continue;
+ }
+
+ CPU_ZERO(&dest);
+ vioapic_calcdest(vioapic, val, &dest);
+ CPU_OR(result, &dest);
+ }
+}
+
+/*
+ * Update TMR state in vLAPICs after changes to vIOAPIC pin configuration
+ */
+static void
+vioapic_update_tmrs(struct vioapic *vioapic, int vcpuid, uint64_t oldval,
+ uint64_t newval)
+{
+ cpuset_t active, allset, newset, oldset;
+ struct vm *vm;
+ uint8_t newvec, oldvec;
+
+ vm = vioapic->vm;
+ CPU_ZERO(&allset);
+ CPU_ZERO(&newset);
+ CPU_ZERO(&oldset);
+ newvec = oldvec = 0;
+
+ if (REDIR_IS_LVLTRIG(oldval)) {
+ vioapic_calcdest(vioapic, oldval, &oldset);
+ CPU_OR(&allset, &oldset);
+ oldvec = REDIR_VECTOR(oldval);
+ }
+
+ if (REDIR_IS_LVLTRIG(newval)) {
+ vioapic_calcdest(vioapic, newval, &newset);
+ CPU_OR(&allset, &newset);
+ newvec = REDIR_VECTOR(newval);
+ }
+
+ if (CPU_EMPTY(&allset) ||
+ (CPU_CMP(&oldset, &newset) == 0 && oldvec == newvec)) {
+ return;
+ }
- VIOAPIC_LOCK(vioapic);
/*
- * Reset all vectors to be edge-triggered.
+ * Since the write to the redirection table has already occurred, a
+ * scan of level-triggered entries referencing the old vector will find
+ * only entries which are now currently valid.
*/
- vlapic_reset_tmr(vlapic);
- for (pin = 0; pin < REDIR_ENTRIES; pin++) {
- low = vioapic->rtbl[pin].reg;
- high = vioapic->rtbl[pin].reg >> 32;
+ vioapic_tmr_active(vioapic, oldvec, &active);
- level = low & IOART_TRGRLVL ? true : false;
- if (!level)
+ while (!CPU_EMPTY(&allset)) {
+ struct vlapic *vlapic;
+ u_int i;
+
+ i = CPU_FFS(&allset) - 1;
+ CPU_CLR(i, &allset);
+
+ if (oldvec == newvec &&
+ CPU_ISSET(i, &oldset) && CPU_ISSET(i, &newset)) {
continue;
+ }
- /*
- * For a level-triggered 'pin' let the vlapic figure out if
- * an assertion on this 'pin' would result in an interrupt
- * being delivered to it. If yes, then it will modify the
- * TMR bit associated with this vector to level-triggered.
- */
- phys = ((low & IOART_DESTMOD) == IOART_DESTPHY);
- delmode = low & IOART_DELMOD;
- vector = low & IOART_INTVEC;
- dest = high >> APIC_ID_SHIFT;
- vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector);
+ if (i != vcpuid) {
+ vcpu_block_run(vm, i);
+ }
+
+ vlapic = vm_lapic(vm, i);
+ if (CPU_ISSET(i, &oldset)) {
+ /*
+ * Perform the deassertion if no other level-triggered
+ * IOAPIC entries target this vCPU with the old vector
+ *
+ * Note: Sharing of vectors like that should be
+ * extremely rare in modern operating systems and was
+ * previously unsupported by the bhyve vIOAPIC.
+ */
+ if (!CPU_ISSET(i, &active)) {
+ vlapic_tmr_set(vlapic, oldvec, false);
+ }
+ }
+ if (CPU_ISSET(i, &newset)) {
+ vlapic_tmr_set(vlapic, newvec, true);
+ }
+
+ if (i != vcpuid) {
+ vcpu_unblock_run(vm, i);
+ }
}
- VIOAPIC_UNLOCK(vioapic);
}
static uint32_t
@@ -321,7 +413,6 @@ vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data)
uint64_t data64, mask64;
uint64_t last, changed;
int regnum, pin, lshift;
- cpuset_t allvcpus;
regnum = addr & 0xff;
switch (regnum) {
@@ -357,18 +448,15 @@ vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data)
/*
* If any fields in the redirection table entry (except mask
- * or polarity) have changed then rendezvous all the vcpus
- * to update their vlapic trigger-mode registers.
+ * or polarity) have changed then update the trigger-mode
+ * registers on all the vlapics.
*/
changed = last ^ vioapic->rtbl[pin].reg;
if (changed & ~(IOART_INTMASK | IOART_INTPOL)) {
VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate "
"vlapic trigger-mode register", pin);
- VIOAPIC_UNLOCK(vioapic);
- allvcpus = vm_active_cpus(vioapic->vm);
- vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus,
- vioapic_update_tmr, NULL);
- VIOAPIC_LOCK(vioapic);
+ vioapic_update_tmrs(vioapic, vcpuid, last,
+ vioapic->rtbl[pin].reg);
}
/*
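The REDIR_* macros introduced above decode a 64-bit redirection entry: trigger mode, delivery mode, destination (an 8-bit xAPIC ID in the top byte), and vector. vioapic_update_tmrs then visits only the union of the old and new destination sets, briefly holding each remote vCPU out of the run state rather than rendezvousing every vCPU in the VM, which is the deadlock this change removes. A self-contained sketch of the decoding, with field constants assumed to match x86/apicreg.h:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Redirection-entry fields, assumed to match x86/apicreg.h. */
#define	IOART_INTVEC	0x000000ffULL
#define	IOART_DELMOD	0x00000700ULL
#define	IOART_DELFIXED	0x00000000ULL
#define	IOART_DELLOPRI	0x00000100ULL
#define	IOART_TRGRLVL	0x00008000ULL
#define	APIC_ID_SHIFT	24

/* Mirror of REDIR_IS_LVLTRIG: level trigger needs fixed/lowprio mode. */
static bool
redir_is_lvltrig(uint64_t reg)
{
	return ((reg & IOART_TRGRLVL) != 0 &&
	    ((reg & IOART_DELMOD) == IOART_DELFIXED ||
	    (reg & IOART_DELMOD) == IOART_DELLOPRI));
}

int
main(void)
{
	/* Level-triggered fixed delivery, vector 0x30, xAPIC dest ID 2. */
	const uint64_t ent = ((uint64_t)2 << (32 + APIC_ID_SHIFT)) |
	    IOART_TRGRLVL | IOART_DELFIXED | 0x30;

	(void) printf("level=%d vector=0x%02llx dest=%llu\n",
	    redir_is_lvltrig(ent),
	    (unsigned long long)(ent & IOART_INTVEC),
	    (unsigned long long)(ent >> (32 + APIC_ID_SHIFT)));
	return (0);
}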
diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c
index 98dfc6ee56..4e58249c8d 100644
--- a/usr/src/uts/i86pc/io/vmm/io/vlapic.c
+++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c
@@ -93,6 +93,7 @@ __FBSDID("$FreeBSD$");
#define VLAPIC_BUS_FREQ (128 * 1024 * 1024)
static void vlapic_set_error(struct vlapic *, uint32_t, bool);
+static void vlapic_tmr_reset(struct vlapic *);
static __inline uint32_t
vlapic_get_id(struct vlapic *vlapic)
@@ -824,11 +825,11 @@ vlapic_icrtmr_write_handler(struct vlapic *vlapic)
/*
* This function populates 'dmask' with the set of vcpus that match the
* addressing specified by the (dest, phys, lowprio) tuple.
- *
+ *
* 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit)
* or xAPIC (8-bit) destination field.
*/
-static void
+void
vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
bool lowprio, bool x2apic_dest)
{
@@ -1452,7 +1453,7 @@ vlapic_reset(struct vlapic *vlapic)
lapic->dfr = 0xffffffff;
lapic->svr = APIC_SVR_VECTOR;
vlapic_mask_lvts(vlapic);
- vlapic_reset_tmr(vlapic);
+ vlapic_tmr_reset(vlapic);
lapic->dcr_timer = 0;
vlapic_dcr_write_handler(vlapic);
@@ -1620,62 +1621,79 @@ vlapic_enabled(struct vlapic *vlapic)
}
static void
-vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level)
+vlapic_tmr_reset(struct vlapic *vlapic)
{
struct LAPIC *lapic;
- uint32_t *tmrptr, mask;
- int idx;
lapic = vlapic->apic_page;
- tmrptr = &lapic->tmr0;
- idx = (vector / 32) * 4;
- mask = 1 << (vector % 32);
- if (level)
- tmrptr[idx] |= mask;
- else
- tmrptr[idx] &= ~mask;
-
- if (vlapic->ops.set_tmr != NULL)
- (*vlapic->ops.set_tmr)(vlapic, vector, level);
+ lapic->tmr0 = lapic->tmr1 = lapic->tmr2 = lapic->tmr3 = 0;
+ lapic->tmr4 = lapic->tmr5 = lapic->tmr6 = lapic->tmr7 = 0;
+ vlapic->tmr_pending = 1;
}
+/*
+ * Synchronize TMR designations into the LAPIC state.
+ * The vCPU must be in the VCPU_RUNNING state.
+ */
void
-vlapic_reset_tmr(struct vlapic *vlapic)
+vlapic_tmr_update(struct vlapic *vlapic)
{
- int vector;
+ struct LAPIC *lapic;
+ uint32_t *tmrptr;
+ uint32_t result[VLAPIC_TMR_CNT];
+ u_int i, tmr_idx;
+
+ if (vlapic->tmr_pending == 0) {
+ return;
+ }
+
+ lapic = vlapic->apic_page;
+ tmrptr = &lapic->tmr0;
+
+ VLAPIC_CTR0(vlapic, "synchronizing TMR");
+ for (i = 0; i < VLAPIC_TMR_CNT; i++) {
+ tmr_idx = i * 4;
- VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered");
+ tmrptr[tmr_idx] &= ~vlapic->tmr_vec_deassert[i];
+ tmrptr[tmr_idx] |= vlapic->tmr_vec_assert[i];
+ vlapic->tmr_vec_deassert[i] = 0;
+ vlapic->tmr_vec_assert[i] = 0;
+ result[i] = tmrptr[tmr_idx];
+ }
+ vlapic->tmr_pending = 0;
- for (vector = 0; vector <= 255; vector++)
- vlapic_set_tmr(vlapic, vector, false);
+ if (vlapic->ops.set_tmr != NULL) {
+ (*vlapic->ops.set_tmr)(vlapic, result);
+ }
}
+/*
+ * Designate the TMR state for a given interrupt vector.
+ * The caller must hold the vIOAPIC lock and prevent the vCPU corresponding to
+ * this vLAPIC instance from being in or entering the VCPU_RUNNING state.
+ */
void
-vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
- int delmode, int vector)
+vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active)
{
- cpuset_t dmask;
- bool lowprio;
-
- KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
+ const uint32_t idx = vector / 32;
+ const uint32_t mask = 1 << (vector % 32);
+
+ VLAPIC_CTR2(vlapic, "TMR for vector %u %sasserted", vector,
+ active ? "" : "de");
+ if (active) {
+ vlapic->tmr_vec_assert[idx] |= mask;
+ vlapic->tmr_vec_deassert[idx] &= ~mask;
+ } else {
+ vlapic->tmr_vec_deassert[idx] |= mask;
+ vlapic->tmr_vec_assert[idx] &= ~mask;
+ }
/*
- * A level trigger is valid only for fixed and lowprio delivery modes.
+ * Track the number of TMR changes between calls to vlapic_tmr_update.
+ * While a simple boolean would suffice, this count may be useful when
+ * tracing or debugging, and is cheap to calculate.
*/
- if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) {
- VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for "
- "delivery-mode %d", delmode);
- return;
- }
-
- lowprio = (delmode == APIC_DELMODE_LOWPRIO);
- vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false);
-
- if (!CPU_ISSET(vlapic->vcpuid, &dmask))
- return;
-
- VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector);
- vlapic_set_tmr(vlapic, vector, true);
+ vlapic->tmr_pending = MIN(UINT32_MAX - 1, vlapic->tmr_pending) + 1;
}
#ifndef __FreeBSD__
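vlapic_tmr_set above only stages intent: assert and deassert bits accumulate in tmr_vec_assert/tmr_vec_deassert under the vIOAPIC lock, and the owning vCPU folds them into the architectural TMR in vlapic_tmr_update just before interrupt injection. Note the apply order: deassert bits are cleared first, then assert bits are OR'd in, so a vector staged in both directions ends up asserted. A standalone sketch of the staging/apply split (the structure is illustrative, not the real struct vlapic):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define	TMR_CNT	8	/* 8 x 32 bits covers vectors 0-255 */

/* Illustrative stand-in for the TMR fields added to struct vlapic. */
struct tmr_state {
	uint32_t tmr[TMR_CNT];		/* architectural TMR words */
	uint32_t vec_assert[TMR_CNT];	/* staged level designations */
	uint32_t vec_deassert[TMR_CNT];	/* staged edge designations */
	uint32_t pending;		/* staged-change counter */
};

/* Stage a change; in the kernel this runs under the vIOAPIC lock. */
static void
tmr_set(struct tmr_state *ts, uint8_t vector, bool active)
{
	const uint32_t idx = vector / 32;
	const uint32_t mask = 1u << (vector % 32);

	if (active) {
		ts->vec_assert[idx] |= mask;
		ts->vec_deassert[idx] &= ~mask;
	} else {
		ts->vec_deassert[idx] |= mask;
		ts->vec_assert[idx] &= ~mask;
	}
	ts->pending++;
}

/* Apply staged changes; in the kernel this runs on the vCPU itself. */
static void
tmr_update(struct tmr_state *ts)
{
	uint32_t i;

	if (ts->pending == 0)
		return;
	for (i = 0; i < TMR_CNT; i++) {
		/* Deassert first, then assert, so assert wins a tie. */
		ts->tmr[i] &= ~ts->vec_deassert[i];
		ts->tmr[i] |= ts->vec_assert[i];
		ts->vec_deassert[i] = ts->vec_assert[i] = 0;
	}
	ts->pending = 0;
}

int
main(void)
{
	struct tmr_state ts = { { 0 } };

	tmr_set(&ts, 0x30, true);	/* vector 0x30 -> level */
	tmr_update(&ts);
	(void) printf("tmr[1] = 0x%08x\n", ts.tmr[1]);	/* 0x00010000 */
	return (0);
}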
diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.h b/usr/src/uts/i86pc/io/vmm/io/vlapic.h
index b7fb918b0f..e1a52551a9 100644
--- a/usr/src/uts/i86pc/io/vmm/io/vlapic.h
+++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h
@@ -86,16 +86,11 @@ bool vlapic_enabled(struct vlapic *vlapic);
void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
int delmode, int vec);
-/* Reset the trigger-mode bits for all vectors to be edge-triggered */
-void vlapic_reset_tmr(struct vlapic *vlapic);
+void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
+ bool lowprio, bool x2apic_dest);
-/*
- * Set the trigger-mode bit associated with 'vector' to level-triggered if
- * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to
- * this 'vlapic'.
- */
-void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
- int delmode, int vector);
+void vlapic_tmr_update(struct vlapic *vlapic);
+void vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active);
void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val);
uint64_t vlapic_get_cr8(struct vlapic *vlapic);
diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h
index fe7965cb65..5795d48d52 100644
--- a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h
+++ b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h
@@ -138,6 +138,8 @@ enum boot_state {
#define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI
+#define VLAPIC_TMR_CNT 8
+
struct vlapic;
struct vlapic_ops {
@@ -145,7 +147,7 @@ struct vlapic_ops {
int (*pending_intr)(struct vlapic *vlapic, int *vecptr);
void (*intr_accepted)(struct vlapic *vlapic, int vector);
void (*post_intr)(struct vlapic *vlapic, int hostcpu);
- void (*set_tmr)(struct vlapic *vlapic, int vector, bool level);
+ void (*set_tmr)(struct vlapic *vlapic, const uint32_t *result);
void (*enable_x2apic_mode)(struct vlapic *vlapic);
};
@@ -156,6 +158,7 @@ struct vlapic {
struct vlapic_ops ops;
uint32_t esr_pending;
+ uint32_t tmr_pending;
struct callout callout; /* vlapic timer */
struct bintime timer_fire_bt; /* callout expiry time */
@@ -183,6 +186,19 @@ struct vlapic {
*/
uint32_t svr_last;
uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1];
+
+ /*
+ * Store intended modifications to the trigger-mode register state.
+ * Along with the tmr_pending counter above, these are protected by the
+ * vIOAPIC lock and can only be modified under specific conditions:
+ *
+ * 1. When holding the vIOAPIC lock, and the vCPU to which the vLAPIC
+ * belongs is prevented from entering the VCPU_RUNNING state.
+ * 2. When the owning vCPU is in the VCPU_RUNNING state, and is
+ * applying the TMR modifications prior to interrupt injection.
+ */
+ uint32_t tmr_vec_deassert[VLAPIC_TMR_CNT];
+ uint32_t tmr_vec_assert[VLAPIC_TMR_CNT];
};
void vlapic_init(struct vlapic *vlapic);
diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c
index 11915220d2..991d6c7850 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm.c
@@ -118,6 +118,7 @@ struct vcpu {
#ifndef __FreeBSD__
int lastloccpu; /* (o) last host cpu localized to */
#endif
+ u_int runblock; /* (i) block vcpu from run state */
int reqidle; /* (i) request vcpu to idle */
struct vlapic *vlapic; /* (i) APIC device model */
enum x2apic_state x2apic_state; /* (i) APIC mode */
@@ -185,14 +186,6 @@ struct vm {
int suspend; /* (i) stop VM execution */
volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */
volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */
- cpuset_t rendezvous_req_cpus; /* (x) rendezvous requested */
- cpuset_t rendezvous_done_cpus; /* (x) rendezvous finished */
- void *rendezvous_arg; /* (x) rendezvous func/arg */
- vm_rendezvous_func_t rendezvous_func;
- struct mtx rendezvous_mtx; /* (o) rendezvous lock */
-#ifndef __FreeBSD__
- kcondvar_t rendezvous_cv; /* (0) rendezvous condvar */
-#endif
struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */
struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */
struct vmspace *vmspace; /* (o) guest's address space */
@@ -356,6 +349,7 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create)
vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
+ vcpu->runblock = 0;
vcpu->reqidle = 0;
vcpu->exitintinfo = 0;
vcpu->nmi_pending = 0;
@@ -586,7 +580,6 @@ vm_create(const char *name, struct vm **retvm)
vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
strcpy(vm->name, name);
vm->vmspace = vmspace;
- mtx_init(&vm->rendezvous_mtx, "vm rendezvous lock", 0, MTX_DEF);
vm->sockets = 1;
vm->cores = cores_per_package; /* XXX backwards compatibility */
@@ -1412,6 +1405,16 @@ vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
break;
}
+ if (newstate == VCPU_RUNNING) {
+ while (vcpu->runblock != 0) {
+#ifdef __FreeBSD__
+ msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
+#else
+ cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
+#endif
+ }
+ }
+
if (error)
return (EBUSY);
@@ -1424,7 +1427,8 @@ vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate,
else
vcpu->hostcpu = NOCPU;
- if (newstate == VCPU_IDLE) {
+ if (newstate == VCPU_IDLE ||
+ (newstate == VCPU_FROZEN && vcpu->runblock != 0)) {
#ifdef __FreeBSD__
wakeup(&vcpu->state);
#else
@@ -1453,88 +1457,6 @@ vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate)
panic("Error %d setting state to %d", error, newstate);
}
-static void
-vm_set_rendezvous_func(struct vm *vm, vm_rendezvous_func_t func)
-{
-
- KASSERT(mtx_owned(&vm->rendezvous_mtx), ("rendezvous_mtx not locked"));
-
- /*
- * Update 'rendezvous_func' and execute a write memory barrier to
- * ensure that it is visible across all host cpus. This is not needed
- * for correctness but it does ensure that all the vcpus will notice
- * that the rendezvous is requested immediately.
- */
- vm->rendezvous_func = func;
-#ifdef __FreeBSD__
- wmb();
-#else
- membar_producer();
-#endif
-}
-
-#define RENDEZVOUS_CTR0(vm, vcpuid, fmt) \
- do { \
- if (vcpuid >= 0) \
- VCPU_CTR0(vm, vcpuid, fmt); \
- else \
- VM_CTR0(vm, fmt); \
- } while (0)
-
-static void
-vm_handle_rendezvous(struct vm *vm, int vcpuid)
-{
-
- KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus),
- ("vm_handle_rendezvous: invalid vcpuid %d", vcpuid));
-
- mtx_lock(&vm->rendezvous_mtx);
- while (vm->rendezvous_func != NULL) {
- /* 'rendezvous_req_cpus' must be a subset of 'active_cpus' */
- CPU_AND(&vm->rendezvous_req_cpus, &vm->active_cpus);
-
- if (vcpuid != -1 &&
- CPU_ISSET(vcpuid, &vm->rendezvous_req_cpus) &&
- !CPU_ISSET(vcpuid, &vm->rendezvous_done_cpus)) {
- VCPU_CTR0(vm, vcpuid, "Calling rendezvous func");
- (*vm->rendezvous_func)(vm, vcpuid, vm->rendezvous_arg);
- CPU_SET(vcpuid, &vm->rendezvous_done_cpus);
- }
- if (CPU_CMP(&vm->rendezvous_req_cpus,
- &vm->rendezvous_done_cpus) == 0) {
- VCPU_CTR0(vm, vcpuid, "Rendezvous completed");
- vm_set_rendezvous_func(vm, NULL);
-#ifdef __FreeBSD__
- wakeup(&vm->rendezvous_func);
-#else
- cv_broadcast(&vm->rendezvous_cv);
-#endif
- break;
- }
- RENDEZVOUS_CTR0(vm, vcpuid, "Wait for rendezvous completion");
-#ifdef __FreeBSD__
- mtx_sleep(&vm->rendezvous_func, &vm->rendezvous_mtx, 0,
- "vmrndv", 0);
-#else
- /*
- * A cv_wait() call should be adequate for this, but since the
- * bhyve process could be killed in the middle of an unfinished
- * vm_smp_rendezvous, rendering its completion impossible, a
- * timed wait is necessary. When bailing out early from a
- * rendezvous, the instance may be left in a bizarre state, but
- * that is preferable to a thread stuck waiting in the kernel.
- */
- if (cv_reltimedwait_sig(&vm->rendezvous_cv,
- &vm->rendezvous_mtx.m, hz, TR_CLOCK_TICK) <= 0) {
- if ((curproc->p_flag & SEXITING) != 0) {
- break;
- }
- }
-#endif
- }
- mtx_unlock(&vm->rendezvous_mtx);
-}
-
/*
* Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
*/
@@ -1566,7 +1488,7 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
* vcpu returned from VMRUN() and before it acquired the
* vcpu lock above.
*/
- if (vm->rendezvous_func != NULL || vm->suspend || vcpu->reqidle)
+ if (vm->suspend || vcpu->reqidle)
break;
if (vm_nmi_pending(vm, vcpuid))
break;
@@ -1773,10 +1695,6 @@ vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
/*
* Wait until all 'active_cpus' have suspended themselves.
- *
- * Since a VM may be suspended at any time including when one or
- * more vcpus are doing a rendezvous we need to call the rendezvous
- * handler while we are waiting to prevent a deadlock.
*/
vcpu_lock(vcpu);
while (1) {
@@ -1785,35 +1703,27 @@ vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu)
break;
}
- if (vm->rendezvous_func == NULL) {
- VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
- vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
+ VCPU_CTR0(vm, vcpuid, "Sleeping during suspend");
+ vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING);
#ifdef __FreeBSD__
- msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
+ msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz);
#else
- /*
- * Like vm_handle_rendezvous, vm_handle_suspend could
- * become stuck in the kernel if the bhyve process
- * driving its vCPUs is killed. Offer a bail-out in
- * that case, even though not all the vCPUs have
- * reached the suspended state.
- */
- if (cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m,
- hz, TR_CLOCK_TICK) <= 0) {
- if ((curproc->p_flag & SEXITING) != 0) {
- vcpu_require_state_locked(vm, vcpuid,
- VCPU_FROZEN);
- break;
- }
+ /*
+ * To prevent vm_handle_suspend from becoming stuck in the
+ * kernel if the bhyve process driving its vCPUs is killed,
+ * offer a bail-out, even though not all the vCPUs have reached
+ * the suspended state.
+ */
+ if (cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m,
+ hz, TR_CLOCK_TICK) <= 0) {
+ if ((curproc->p_flag & SEXITING) != 0) {
+ vcpu_require_state_locked(vm, vcpuid,
+ VCPU_FROZEN);
+ break;
}
-#endif
- vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
- } else {
- VCPU_CTR0(vm, vcpuid, "Rendezvous during suspend");
- vcpu_unlock(vcpu);
- vm_handle_rendezvous(vm, vcpuid);
- vcpu_lock(vcpu);
}
+#endif
+ vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN);
}
vcpu_unlock(vcpu);
@@ -1915,17 +1825,15 @@ vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip)
}
void
-vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip)
+vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip)
{
struct vm_exit *vmexit;
- KASSERT(vm->rendezvous_func != NULL, ("rendezvous not in progress"));
-
vmexit = vm_exitinfo(vm, vcpuid);
vmexit->rip = rip;
vmexit->inst_length = 0;
- vmexit->exitcode = VM_EXITCODE_RENDEZVOUS;
- vmm_stat_incr(vm, vcpuid, VMEXIT_RENDEZVOUS, 1);
+ vmexit->exitcode = VM_EXITCODE_RUNBLOCK;
+ vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1);
}
void
@@ -2087,7 +1995,7 @@ vm_run(struct vm *vm, struct vm_run *vmrun)
pmap = vmspace_pmap(vm->vmspace);
vcpu = &vm->vcpu[vcpuid];
vme = &vcpu->exitinfo;
- evinfo.rptr = &vm->rendezvous_func;
+ evinfo.rptr = &vcpu->runblock;
evinfo.sptr = &vm->suspend;
evinfo.iptr = &vcpu->reqidle;
@@ -2177,9 +2085,7 @@ restart:
vioapic_process_eoi(vm, vcpuid,
vme->u.ioapic_eoi.vector);
break;
- case VM_EXITCODE_RENDEZVOUS:
- vm_handle_rendezvous(vm, vcpuid);
- error = 0;
+ case VM_EXITCODE_RUNBLOCK:
break;
case VM_EXITCODE_HLT:
intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
@@ -2807,6 +2713,54 @@ vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
return (state);
}
+void
+vcpu_block_run(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ vcpu->runblock++;
+ if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) {
+ vcpu_notify_event_locked(vcpu, false);
+ }
+ while (vcpu->state == VCPU_RUNNING) {
+#ifdef __FreeBSD__
+ msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0);
+#else
+ cv_wait(&vcpu->state_cv, &vcpu->mtx.m);
+#endif
+ }
+ vcpu_unlock(vcpu);
+}
+
+void
+vcpu_unblock_run(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vcpu_block_run: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ KASSERT(vcpu->runblock != 0, ("expected non-zero runblock"));
+ vcpu->runblock--;
+ if (vcpu->runblock == 0) {
+#ifdef __FreeBSD__
+ wakeup(&vcpu->state);
+#else
+ cv_broadcast(&vcpu->state_cv);
+#endif
+ }
+ vcpu_unlock(vcpu);
+}
+
#ifndef __FreeBSD__
uint64_t
vcpu_tsc_offset(struct vm *vm, int vcpuid)
@@ -3003,54 +2957,6 @@ vm_apicid2vcpuid(struct vm *vm, int apicid)
return (apicid);
}
-void
-vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
- vm_rendezvous_func_t func, void *arg)
-{
- int i;
-
- /*
- * Enforce that this function is called without any locks
- */
- WITNESS_WARN(WARN_PANIC, NULL, "vm_smp_rendezvous");
- KASSERT(vcpuid == -1 || (vcpuid >= 0 && vcpuid < vm->maxcpus),
- ("vm_smp_rendezvous: invalid vcpuid %d", vcpuid));
-
-restart:
- mtx_lock(&vm->rendezvous_mtx);
- if (vm->rendezvous_func != NULL) {
- /*
- * If a rendezvous is already in progress then we need to
- * call the rendezvous handler in case this 'vcpuid' is one
- * of the targets of the rendezvous.
- */
- RENDEZVOUS_CTR0(vm, vcpuid, "Rendezvous already in progress");
- mtx_unlock(&vm->rendezvous_mtx);
- vm_handle_rendezvous(vm, vcpuid);
- goto restart;
- }
- KASSERT(vm->rendezvous_func == NULL, ("vm_smp_rendezvous: previous "
- "rendezvous is still in progress"));
-
- RENDEZVOUS_CTR0(vm, vcpuid, "Initiating rendezvous");
- vm->rendezvous_req_cpus = dest;
- CPU_ZERO(&vm->rendezvous_done_cpus);
- vm->rendezvous_arg = arg;
- vm_set_rendezvous_func(vm, func);
- mtx_unlock(&vm->rendezvous_mtx);
-
- /*
- * Wake up any sleeping vcpus and trigger a VM-exit in any running
- * vcpus so they handle the rendezvous as soon as possible.
- */
- for (i = 0; i < vm->maxcpus; i++) {
- if (CPU_ISSET(i, &dest))
- vcpu_notify_event(vm, i, false);
- }
-
- vm_handle_rendezvous(vm, vcpuid);
-}
-
struct vatpic *
vm_atpic(struct vm *vm)
{
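vcpu_block_run/vcpu_unblock_run above form a counted gate against the VCPU_RUNNING state: the first blocker kicks a running vCPU out of the guest (via vcpu_notify_event_locked) and waits for it to leave VCPU_RUNNING, while vcpu_set_state_locked refuses to grant VCPU_RUNNING until the count drains. A userland sketch of the same pattern, with POSIX threads standing in for the vcpu spinlock and state_cv (the guest-kick step is omitted):

#include <pthread.h>

/*
 * Userland sketch of the runblock gate: a counted barrier against a
 * worker's "running" state.
 */
struct run_gate {
	pthread_mutex_t lock;
	pthread_cond_t cv;
	unsigned int runblock;	/* active blockers */
	int running;		/* worker in its critical state? */
};

#define	RUN_GATE_INIT \
	{ PTHREAD_MUTEX_INITIALIZER, PTHREAD_COND_INITIALIZER, 0, 0 }

/* Blocker side: raise the count, wait for the worker to stop running. */
static void
gate_block(struct run_gate *g)
{
	(void) pthread_mutex_lock(&g->lock);
	g->runblock++;
	while (g->running)
		(void) pthread_cond_wait(&g->cv, &g->lock);
	(void) pthread_mutex_unlock(&g->lock);
}

static void
gate_unblock(struct run_gate *g)
{
	(void) pthread_mutex_lock(&g->lock);
	if (--g->runblock == 0)
		(void) pthread_cond_broadcast(&g->cv);
	(void) pthread_mutex_unlock(&g->lock);
}

/* Worker side: may not enter "running" while any blocker is active. */
static void
gate_enter_running(struct run_gate *g)
{
	(void) pthread_mutex_lock(&g->lock);
	while (g->runblock != 0)
		(void) pthread_cond_wait(&g->cv, &g->lock);
	g->running = 1;
	(void) pthread_mutex_unlock(&g->lock);
}

static void
gate_leave_running(struct run_gate *g)
{
	(void) pthread_mutex_lock(&g->lock);
	g->running = 0;
	(void) pthread_cond_broadcast(&g->cv);
	(void) pthread_mutex_unlock(&g->lock);
}

For simplicity the sketch broadcasts one condvar for both directions; the kernel code instead wakes waiters only on the VCPU_IDLE and blocked VCPU_FROZEN transitions.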
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c
index d03955ec06..8dc7118ee1 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c
@@ -125,21 +125,24 @@ pmap_kextract(vm_offset_t va)
int
cpusetobj_ffs(const cpuset_t *set)
{
-#if CPUSET_WORDS > 1
- int i, cbit;
-
- cbit = 0;
- for (i = 0; i < CPUSET_WORDS; i++) {
- if (set->cpub[i] != 0) {
- cbit = ffsl(set->cpub[i]);
- cbit += i * sizeof (set->cpub[0]);
- break;
- }
+ uint_t large, small;
+
+ /*
+ * Rather than reaching into the cpuset_t ourselves, leave that task to
+ * cpuset_bounds(). The simplicity is worth the extra wasted work to
+ * find the upper bound.
+ */
+ cpuset_bounds(set, &small, &large);
+
+ if (small == CPUSET_NOTINSET) {
+ /* The FreeBSD version returns 0 if it finds nothing */
+ return (0);
}
- return (cbit);
-#else
- return (ffsl(*set));
-#endif
+
+ ASSERT3U(small, <=, INT_MAX);
+
+ /* Least significant bit index starts at 1 for valid results */
+ return (small + 1);
}
struct kmem_item {
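The new cpusetobj_ffs leans on the illumos cpuset_bounds() helper but preserves the FreeBSD contract: a 1-based index of the lowest set bit, or 0 for an empty set, matching libc ffsl() semantics. A quick illustration of that contract on a plain word:

#include <stdio.h>
#include <strings.h>

int
main(void)
{
	(void) printf("%d\n", ffsl(0L));	/* 0: empty set */
	(void) printf("%d\n", ffsl(1L));	/* 1: bit 0 */
	(void) printf("%d\n", ffsl(0x10L));	/* 5: bit 4 */
	return (0);
}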
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.c b/usr/src/uts/i86pc/io/vmm/vmm_stat.c
index f61272c49c..2cbcce9590 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_stat.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.c
@@ -168,5 +168,5 @@ VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason");
VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit");
VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit");
VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace");
-VMM_STAT(VMEXIT_RENDEZVOUS, "number of times rendezvous pending at exit");
+VMM_STAT(VMEXIT_RUNBLOCK, "number of times runblock at exit");
VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions");
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.h b/usr/src/uts/i86pc/io/vmm/vmm_stat.h
index 1193cea368..3232e23888 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_stat.h
+++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.h
@@ -166,7 +166,7 @@ VMM_STAT_DECLARE(VMEXIT_INST_EMUL);
VMM_STAT_DECLARE(VMEXIT_UNKNOWN);
VMM_STAT_DECLARE(VMEXIT_ASTPENDING);
VMM_STAT_DECLARE(VMEXIT_USERSPACE);
-VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS);
+VMM_STAT_DECLARE(VMEXIT_RUNBLOCK);
VMM_STAT_DECLARE(VMEXIT_EXCEPTION);
VMM_STAT_DECLARE(VMEXIT_REQIDLE);
#endif
diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h
index d6313469a5..ac8f14b042 100644
--- a/usr/src/uts/i86pc/sys/vmm.h
+++ b/usr/src/uts/i86pc/sys/vmm.h
@@ -146,7 +146,7 @@ struct vm_guest_paging;
struct pmap;
struct vm_eventinfo {
- void *rptr; /* rendezvous cookie */
+ u_int *rptr; /* runblock cookie */
int *sptr; /* suspend cookie */
int *iptr; /* reqidle cookie */
};
@@ -275,38 +275,21 @@ int vm_resume_cpu(struct vm *vm, int vcpu);
struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip);
-void vm_exit_rendezvous(struct vm *vm, int vcpuid, uint64_t rip);
+void vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip);
void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip);
#ifdef _SYS__CPUSET_H_
-/*
- * Rendezvous all vcpus specified in 'dest' and execute 'func(arg)'.
- * The rendezvous 'func(arg)' is not allowed to do anything that will
- * cause the thread to be put to sleep.
- *
- * If the rendezvous is being initiated from a vcpu context then the
- * 'vcpuid' must refer to that vcpu, otherwise it should be set to -1.
- *
- * The caller cannot hold any locks when initiating the rendezvous.
- *
- * The implementation of this API may cause vcpus other than those specified
- * by 'dest' to be stalled. The caller should not rely on any vcpus making
- * forward progress when the rendezvous is in progress.
- */
-typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg);
-void vm_smp_rendezvous(struct vm *vm, int vcpuid, cpuset_t dest,
- vm_rendezvous_func_t func, void *arg);
cpuset_t vm_active_cpus(struct vm *vm);
cpuset_t vm_debug_cpus(struct vm *vm);
cpuset_t vm_suspended_cpus(struct vm *vm);
#endif /* _SYS__CPUSET_H_ */
static __inline int
-vcpu_rendezvous_pending(struct vm_eventinfo *info)
+vcpu_runblocked(struct vm_eventinfo *info)
{
- return (*((uintptr_t *)(info->rptr)) != 0);
+ return (*info->rptr != 0);
}
static __inline int
@@ -345,6 +328,9 @@ enum vcpu_state {
int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state,
bool from_idle);
enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu);
+void vcpu_block_run(struct vm *, int);
+void vcpu_unblock_run(struct vm *, int);
+
#ifndef __FreeBSD__
uint64_t vcpu_tsc_offset(struct vm *vm, int vcpuid);
#endif
@@ -579,7 +565,7 @@ enum vm_exitcode {
VM_EXITCODE_INST_EMUL,
VM_EXITCODE_SPINUP_AP,
VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */
- VM_EXITCODE_RENDEZVOUS,
+ VM_EXITCODE_RUNBLOCK,
VM_EXITCODE_IOAPIC_EOI,
VM_EXITCODE_SUSPENDED,
VM_EXITCODE_INOUT_STR,
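With the rendezvous machinery removed, the VMX/SVM run loops poll the per-vCPU runblock counter through the vm_eventinfo cookie (vm_run points rptr at &vcpu->runblock), so the check is now a direct non-zero test. When it fires, the vCPU exits with VM_EXITCODE_RUNBLOCK, drops to VCPU_FROZEN, which wakes the blocker, and is then held in vcpu_set_state_locked until the count drains. A compact sketch of the cookie shape and the check (names are illustrative):

/* Illustrative mirror of the vm_eventinfo cookies wired up in vm_run(). */
struct eventinfo_sketch {
	unsigned int *rptr;	/* -> vcpu->runblock */
	int *sptr;		/* -> vm->suspend */
	int *iptr;		/* -> vcpu->reqidle */
};

/* The whole runblock check: any active blocker forces a VM-exit. */
static inline int
runblocked_sketch(const struct eventinfo_sketch *info)
{
	return (*info->rptr != 0);
}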