author      Dan McDonald <danmcd@joyent.com>   2021-01-11 08:50:27 -0500
committer   Dan McDonald <danmcd@joyent.com>   2021-01-11 08:50:40 -0500
commit      7fab0ca77cdcb2a9ec0d10bd58bca415c5138dc0 (patch)
tree        1d068a17f4139b3e9a10c3565a3aa9ad20951e1e
parent      77a2bdc4fcb9b0ace15a0459d94a9c4ef6203329 (diff)
parent      2606939d92dd3044a9851b2930ebf533c3c03892 (diff)
download    illumos-joyent-7fab0ca77cdcb2a9ec0d10bd58bca415c5138dc0.tar.gz
[illumos-gate merge]

commit 2606939d92dd3044a9851b2930ebf533c3c03892
    13275 bhyve needs richer INIT/SIPI support
commit 78f846c0ab4f41678386d3e1b49c16cc8db07a8b
    13438 Update prototypes to 2021
commit ab2fdd80a620c2b88e5ac2c4247ab79880761b18
    13409 cxgbe: replace zero sized array by flexible array
commit 6dc7d05754d992040097e8ba8f85e77512125c60
    8040 NFSv4 client: 3-way deadlock between nfs4_bio(), nfs4_do_delegreturn(), and nfs4_flush_pages()

Conflicts:
    usr/src/cmd/bhyve/bhyverun.c

40 files changed, 787 insertions, 485 deletions
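
For context before the diff itself: with the in-kernel INIT/SIPI handling merged here, bhyve on illumos no longer spins up APs through a VM_EXITCODE_SPINUP_AP round trip to userspace. Only the BSP is marked runnable, and every AP is reset and parked halted until the guest delivers INIT/SIPI. The following is a condensed sketch of that flow at the libvmmapi level, using the vm_set_run_state() and vcpu_reset() calls added below; the start_vcpus() wrapper and the include paths are illustrative assumptions rather than code from this commit, and error handling is reduced to asserts in the style of the diff.

	#include <assert.h>
	#include <machine/vmm.h>	/* enum vcpu_run_state (VRS_*); path assumed */
	#include <vmmapi.h>

	static void
	start_vcpus(struct vmctx *ctx, int guest_ncpus)
	{
		int error;

		/* Only the BSP (vCPU 0) begins life in the running state. */
		error = vm_set_run_state(ctx, 0, VRS_RUN, 0);
		assert(error == 0);

		/*
		 * Every AP is cold-reset and parked in VRS_HALT, mirroring
		 * spinup_halted_ap() below; it moves to VRS_INIT and then
		 * VRS_RUN entirely in-kernel once the guest sends INIT and
		 * SIPI IPIs.
		 */
		for (int i = 1; i < guest_ncpus; i++) {
			error = vcpu_reset(ctx, i);
			assert(error == 0);
			error = vm_set_run_state(ctx, i, VRS_HALT, 0);
			assert(error == 0);
		}
	}
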
diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c index 4392f8640b..5029134ede 100644 --- a/usr/src/cmd/bhyve/bhyverun.c +++ b/usr/src/cmd/bhyve/bhyverun.c @@ -518,13 +518,14 @@ void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) #else void -fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip, - bool suspend) +fbsdrun_addcpu(struct vmctx *ctx, int newcpu, uint64_t rip, bool suspend) #endif { int error; +#ifdef __FreeBSD__ assert(fromcpu == BSP); +#endif /* * The 'newcpu' must be activated in the context of 'fromcpu'. If @@ -577,7 +578,7 @@ vmentry_mmio_read(int vcpu, uint64_t gpa, uint8_t bytes, uint64_t data) assert(entry->cmd == VEC_DEFAULT); - entry->cmd = VEC_COMPLETE_MMIO; + entry->cmd = VEC_FULFILL_MMIO; mmio->bytes = bytes; mmio->read = 1; mmio->gpa = gpa; @@ -592,7 +593,7 @@ vmentry_mmio_write(int vcpu, uint64_t gpa, uint8_t bytes) assert(entry->cmd == VEC_DEFAULT); - entry->cmd = VEC_COMPLETE_MMIO; + entry->cmd = VEC_FULFILL_MMIO; mmio->bytes = bytes; mmio->read = 0; mmio->gpa = gpa; @@ -607,7 +608,7 @@ vmentry_inout_read(int vcpu, uint16_t port, uint8_t bytes, uint32_t data) assert(entry->cmd == VEC_DEFAULT); - entry->cmd = VEC_COMPLETE_INOUT; + entry->cmd = VEC_FULFILL_INOUT; inout->bytes = bytes; inout->flags = INOUT_IN; inout->port = port; @@ -622,7 +623,7 @@ vmentry_inout_write(int vcpu, uint16_t port, uint8_t bytes) assert(entry->cmd == VEC_DEFAULT); - entry->cmd = VEC_COMPLETE_INOUT; + entry->cmd = VEC_FULFILL_INOUT; inout->bytes = bytes; inout->flags = 0; inout->port = port; @@ -731,6 +732,7 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) return (VMEXIT_CONTINUE); } +#ifdef __FreeBSD__ static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { @@ -740,6 +742,18 @@ vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) return (VMEXIT_CONTINUE); } +#else +static int +vmexit_run_state(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) +{ + /* + * Run-state transitions (INIT, SIPI, etc) are handled in-kernel, so an + * exit to userspace with that code is not expected. 
+ */ + fprintf(stderr, "unexpected run-state VM exit"); + return (VMEXIT_ABORT); +} +#endif /* __FreeBSD__ */ #ifdef __FreeBSD__ #define DEBUG_EPT_MISCONFIG @@ -1017,7 +1031,11 @@ static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, +#ifdef __FreeBSD__ [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, +#else + [VM_EXITCODE_RUN_STATE] = vmexit_run_state, +#endif [VM_EXITCODE_SUSPENDED] = vmexit_suspend, [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, [VM_EXITCODE_DEBUG] = vmexit_debug, @@ -1547,14 +1565,21 @@ main(int argc, char *argv[]) errx(EX_OSERR, "cap_enter() failed"); #endif +#ifdef __FreeBSD__ /* * Add CPU 0 */ -#ifdef __FreeBSD__ fbsdrun_addcpu(ctx, BSP, BSP, rip); #else - fbsdrun_addcpu(ctx, BSP, BSP, rip, suspend); + /* Set BSP to run (unlike the APs which wait for INIT) */ + error = vm_set_run_state(ctx, BSP, VRS_RUN, 0); + assert(error == 0); + fbsdrun_addcpu(ctx, BSP, rip, suspend); + /* Add subsequent CPUs, which will wait until INIT/SIPI-ed */ + for (uint_t i = 1; i < guest_ncpus; i++) { + spinup_halted_ap(ctx, i); + } mark_provisioned(); #endif diff --git a/usr/src/cmd/bhyve/bhyverun.h b/usr/src/cmd/bhyve/bhyverun.h index 99ce739f70..f2582512ea 100644 --- a/usr/src/cmd/bhyve/bhyverun.h +++ b/usr/src/cmd/bhyve/bhyverun.h @@ -58,8 +58,7 @@ void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu); #ifdef __FreeBSD__ void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip); #else -void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip, - bool suspend); +void fbsdrun_addcpu(struct vmctx *ctx, int newcpu, uint64_t rip, bool suspend); #endif int fbsdrun_muxed(void); int fbsdrun_vmexit_on_hlt(void); diff --git a/usr/src/cmd/bhyve/spinup_ap.c b/usr/src/cmd/bhyve/spinup_ap.c index 80caafe78e..96eef44bcf 100644 --- a/usr/src/cmd/bhyve/spinup_ap.c +++ b/usr/src/cmd/bhyve/spinup_ap.c @@ -56,6 +56,7 @@ __FBSDID("$FreeBSD$"); #include "bhyverun.h" #include "spinup_ap.h" +#ifdef __FreeBSD__ static void spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip) { @@ -101,7 +102,6 @@ spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip) fbsdrun_set_capabilities(ctx, newcpu); -#ifdef __FreeBSD__ /* * Enable the 'unrestricted guest' mode for 'newcpu'. 
* @@ -110,17 +110,30 @@ spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip) */ error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1); assert(error == 0); -#else - /* Unrestricted Guest is always enabled on illumos */ -#endif spinup_ap_realmode(ctx, newcpu, &rip); -#ifdef __FreeBSD__ fbsdrun_addcpu(ctx, vcpu, newcpu, rip); -#else - fbsdrun_addcpu(ctx, vcpu, newcpu, rip, false); -#endif return (newcpu); } +#else /* __FreeBSD__ */ +void +spinup_halted_ap(struct vmctx *ctx, int newcpu) +{ + int error; + + assert(newcpu != 0); + assert(newcpu < guest_ncpus); + + error = vcpu_reset(ctx, newcpu); + assert(error == 0); + + fbsdrun_set_capabilities(ctx, newcpu); + + error = vm_set_run_state(ctx, newcpu, VRS_HALT, 0); + assert(error == 0); + + fbsdrun_addcpu(ctx, newcpu, 0, false); +} +#endif /* __FreeBSD__ */ diff --git a/usr/src/cmd/bhyve/spinup_ap.h b/usr/src/cmd/bhyve/spinup_ap.h index 226542f6c3..bce4f3878c 100644 --- a/usr/src/cmd/bhyve/spinup_ap.h +++ b/usr/src/cmd/bhyve/spinup_ap.h @@ -31,6 +31,10 @@ #ifndef _SPINUP_AP_H_ #define _SPINUP_AP_H_ +#ifdef __FreeBSD__ int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip); +#else +void spinup_halted_ap(struct vmctx *ctx, int newcpu); +#endif /* __FreeBSD__ */ #endif diff --git a/usr/src/lib/libvmmapi/common/mapfile-vers b/usr/src/lib/libvmmapi/common/mapfile-vers index 9e972b8d60..26cfd15426 100644 --- a/usr/src/lib/libvmmapi/common/mapfile-vers +++ b/usr/src/lib/libvmmapi/common/mapfile-vers @@ -123,6 +123,8 @@ SYMBOL_VERSION ILLUMOSprivate { vm_unassign_pptdev; vm_pmtmr_set_location; vm_wrlock_cycle; + vm_get_run_state; + vm_set_run_state; local: *; diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c index 5b2cb4c235..0b22ca7522 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.c +++ b/usr/src/lib/libvmmapi/common/vmmapi.c @@ -1302,6 +1302,18 @@ vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state) return (error); } +#ifndef __FreeBSD__ +int +vcpu_reset(struct vmctx *vmctx, int vcpu) +{ + struct vm_vcpu_reset vvr; + + vvr.vcpuid = vcpu; + vvr.kind = VRK_RESET; + + return (ioctl(vmctx->fd, VM_RESET_CPU, &vvr)); +} +#else /* __FreeBSD__ */ /* * From Intel Vol 3a: * Table 9-1. 
IA-32 Processor States Following Power-up, Reset or INIT @@ -1458,6 +1470,7 @@ vcpu_reset(struct vmctx *vmctx, int vcpu) done: return (error); } +#endif /* __FreeBSD__ */ int vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num) @@ -1839,6 +1852,39 @@ vm_wrlock_cycle(struct vmctx *ctx) } return (0); } + +int +vm_get_run_state(struct vmctx *ctx, int vcpu, enum vcpu_run_state *state, + uint8_t *sipi_vector) +{ + struct vm_run_state data; + + data.vcpuid = vcpu; + if (ioctl(ctx->fd, VM_GET_RUN_STATE, &data) != 0) { + return (errno); + } + + *state = data.state; + *sipi_vector = data.sipi_vector; + return (0); +} + +int +vm_set_run_state(struct vmctx *ctx, int vcpu, enum vcpu_run_state state, + uint8_t sipi_vector) +{ + struct vm_run_state data; + + data.vcpuid = vcpu; + data.state = state; + data.sipi_vector = sipi_vector; + if (ioctl(ctx->fd, VM_SET_RUN_STATE, &data) != 0) { + return (errno); + } + + return (0); +} + #endif /* __FreeBSD__ */ #ifdef __FreeBSD__ diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h index 96102ec925..f7aaa02087 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.h +++ b/usr/src/lib/libvmmapi/common/vmmapi.h @@ -304,6 +304,10 @@ int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, /* illumos-specific APIs */ int vm_pmtmr_set_location(struct vmctx *ctx, uint16_t ioport); int vm_wrlock_cycle(struct vmctx *ctx); +int vm_get_run_state(struct vmctx *ctx, int vcpu, enum vcpu_run_state *state, + uint8_t *sipi_vector); +int vm_set_run_state(struct vmctx *ctx, int vcpu, enum vcpu_run_state state, + uint8_t sipi_vector); #endif /* __FreeBSD__ */ #ifdef __FreeBSD__ diff --git a/usr/src/prototypes/README b/usr/src/prototypes/README index 85dfe77bcd..b377b9e60a 100644 --- a/usr/src/prototypes/README +++ b/usr/src/prototypes/README @@ -17,5 +17,5 @@ CDDL version 1.0 for each new file introduced in illumos. 
*/ /* - * Copyright 2020 <contributor> + * Copyright 2021 <contributor> */ diff --git a/usr/src/prototypes/prototype.Makefile b/usr/src/prototypes/prototype.Makefile index 76e4c9f402..f3d19ae8e3 100644 --- a/usr/src/prototypes/prototype.Makefile +++ b/usr/src/prototypes/prototype.Makefile @@ -10,6 +10,6 @@ # # -# Copyright 2020 <contributor> +# Copyright 2021 <contributor> # diff --git a/usr/src/prototypes/prototype.c b/usr/src/prototypes/prototype.c index 697ca2f594..36cd2f91ac 100644 --- a/usr/src/prototypes/prototype.c +++ b/usr/src/prototypes/prototype.c @@ -10,7 +10,7 @@ */ /* - * Copyright 2020 <contributor> + * Copyright 2021 <contributor> */ /* diff --git a/usr/src/prototypes/prototype.csh b/usr/src/prototypes/prototype.csh index 60fa7f3e8f..4f7d0b6f4e 100644 --- a/usr/src/prototypes/prototype.csh +++ b/usr/src/prototypes/prototype.csh @@ -12,6 +12,6 @@ # # -# Copyright 2020 <contributor> +# Copyright 2021 <contributor> # diff --git a/usr/src/prototypes/prototype.h b/usr/src/prototypes/prototype.h index b92e6aae09..6cdfd3116e 100644 --- a/usr/src/prototypes/prototype.h +++ b/usr/src/prototypes/prototype.h @@ -10,7 +10,7 @@ */ /* - * Copyright 2020 <contributor> + * Copyright 2021 <contributor> */ #ifndef _PROTOTYPE_H diff --git a/usr/src/prototypes/prototype.java b/usr/src/prototypes/prototype.java index 4de4a78a94..f4997b9dc1 100644 --- a/usr/src/prototypes/prototype.java +++ b/usr/src/prototypes/prototype.java @@ -10,7 +10,7 @@ */ /* - * Copyright 2020 <contributor> + * Copyright 2021 <contributor> */ /* diff --git a/usr/src/prototypes/prototype.ksh b/usr/src/prototypes/prototype.ksh index f69153663f..b63044b74c 100644 --- a/usr/src/prototypes/prototype.ksh +++ b/usr/src/prototypes/prototype.ksh @@ -12,6 +12,6 @@ # # -# Copyright 2020 <contributor> +# Copyright 2021 <contributor> # diff --git a/usr/src/prototypes/prototype.man b/usr/src/prototypes/prototype.man index 4c5cc3e514..bb262e9a5b 100644 --- a/usr/src/prototypes/prototype.man +++ b/usr/src/prototypes/prototype.man @@ -9,5 +9,5 @@ .\" http://www.illumos.org/license/CDDL. .\" .\" -.\" Copyright 2020 <contributor> +.\" Copyright 2021 <contributor> .\" diff --git a/usr/src/prototypes/prototype.man1 b/usr/src/prototypes/prototype.man1 index 2ab3d426dc..d1e1d6fedf 100644 --- a/usr/src/prototypes/prototype.man1 +++ b/usr/src/prototypes/prototype.man1 @@ -9,7 +9,7 @@ .\" http://www.illumos.org/license/CDDL. .\" .\" -.\" Copyright 2020 <contributor> +.\" Copyright 2021 <contributor> .\" .Dd Month Day, Year .Dt COMMAND 1 diff --git a/usr/src/prototypes/prototype.man3x b/usr/src/prototypes/prototype.man3x index 3604b4a0e2..c5ea8f639f 100644 --- a/usr/src/prototypes/prototype.man3x +++ b/usr/src/prototypes/prototype.man3x @@ -9,7 +9,7 @@ .\" http://www.illumos.org/license/CDDL. .\" .\" -.\" Copyright 2020 <contributor> +.\" Copyright 2021 <contributor> .\" .Dd Month Day, Year .Dt MANUALPAGE 3SECTION diff --git a/usr/src/prototypes/prototype.man7d b/usr/src/prototypes/prototype.man7d index ee51f21a64..12e25f490e 100644 --- a/usr/src/prototypes/prototype.man7d +++ b/usr/src/prototypes/prototype.man7d @@ -9,7 +9,7 @@ .\" http://www.illumos.org/license/CDDL. .\" .\" -.\" Copyright 2020 <contributor> +.\" Copyright 2021 <contributor> .\" .Dd Month Day, Year .Dt DRIVERNAME 7D diff --git a/usr/src/prototypes/prototype.man9e b/usr/src/prototypes/prototype.man9e index dc229ad6fd..81c8f73e6e 100644 --- a/usr/src/prototypes/prototype.man9e +++ b/usr/src/prototypes/prototype.man9e @@ -9,7 +9,7 @@ .\" http://www.illumos.org/license/CDDL. 
.\" .\" -.\" Copyright 2020 <contributor> +.\" Copyright 2021 <contributor> .\" .Dd Month Day, Year .Dt ENTRYNAME 9E diff --git a/usr/src/prototypes/prototype.man9f b/usr/src/prototypes/prototype.man9f index 29e7c76d89..5617e8c988 100644 --- a/usr/src/prototypes/prototype.man9f +++ b/usr/src/prototypes/prototype.man9f @@ -9,7 +9,7 @@ .\" http://www.illumos.org/license/CDDL. .\" .\" -.\" Copyright 2020 <contributor> +.\" Copyright 2021 <contributor> .\" .Dd Month Day, Year .Dt FUNCNAME 9F diff --git a/usr/src/prototypes/prototype.mapfile-vers b/usr/src/prototypes/prototype.mapfile-vers index 08463224c1..379b4bbca5 100644 --- a/usr/src/prototypes/prototype.mapfile-vers +++ b/usr/src/prototypes/prototype.mapfile-vers @@ -10,7 +10,7 @@ # # -# Copyright 2020 <contributor> +# Copyright 2021 <contributor> # # diff --git a/usr/src/prototypes/prototype.pl b/usr/src/prototypes/prototype.pl index 459d961062..21590b137f 100644 --- a/usr/src/prototypes/prototype.pl +++ b/usr/src/prototypes/prototype.pl @@ -11,7 +11,7 @@ # # -# Copyright 2020 <contributor> +# Copyright 2021 <contributor> # # diff --git a/usr/src/prototypes/prototype.py b/usr/src/prototypes/prototype.py index d6240365a5..391565ca96 100644 --- a/usr/src/prototypes/prototype.py +++ b/usr/src/prototypes/prototype.py @@ -11,7 +11,7 @@ # # -# Copyright 2020 <contributor> +# Copyright 2021 <contributor> # # diff --git a/usr/src/prototypes/prototype.s b/usr/src/prototypes/prototype.s index 91ca220507..8601df9754 100644 --- a/usr/src/prototypes/prototype.s +++ b/usr/src/prototypes/prototype.s @@ -10,7 +10,7 @@ */ /* - * Copyright 2020 <contributor> + * Copyright 2021 <contributor> */ .file "prototype.s" diff --git a/usr/src/prototypes/prototype.sh b/usr/src/prototypes/prototype.sh index fae70c599c..54f8dee725 100644 --- a/usr/src/prototypes/prototype.sh +++ b/usr/src/prototypes/prototype.sh @@ -12,6 +12,6 @@ # # -# Copyright 2020 <contributor> +# Copyright 2021 <contributor> # diff --git a/usr/src/uts/common/fs/nfs/nfs4_client.c b/usr/src/uts/common/fs/nfs/nfs4_client.c index 5456fc7c63..856da430ea 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_client.c +++ b/usr/src/uts/common/fs/nfs/nfs4_client.c @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. + * Copyright (c) 1983,1984,1985,1986,1987,1988,1989 AT&T. * All Rights Reserved */ @@ -464,33 +464,15 @@ nfs4_attr_cache(vnode_t *vp, nfs4_ga_res_t *garp, rp = VTOR4(vp); mutex_enter(&rp->r_statelock); was_serial = (rp->r_serial == curthread); - if (rp->r_serial && !was_serial) { - klwp_t *lwp = ttolwp(curthread); - + if (rp->r_serial != NULL && !was_serial) { /* - * If we're the recovery thread, then purge current attrs - * and bail out to avoid potential deadlock between another - * thread caching attrs (r_serial thread), recov thread, - * and an async writer thread. + * Purge current attrs and bail out to avoid potential deadlock + * between another thread caching attrs (r_serial thread), this + * thread, and a thread trying to read or write pages. 
*/ - if (recov) { - PURGE_ATTRCACHE4_LOCKED(rp); - mutex_exit(&rp->r_statelock); - return; - } - - if (lwp != NULL) - lwp->lwp_nostop++; - while (rp->r_serial != NULL) { - if (!cv_wait_sig(&rp->r_cv, &rp->r_statelock)) { - mutex_exit(&rp->r_statelock); - if (lwp != NULL) - lwp->lwp_nostop--; - return; - } - } - if (lwp != NULL) - lwp->lwp_nostop--; + PURGE_ATTRCACHE4_LOCKED(rp); + mutex_exit(&rp->r_statelock); + return; } /* @@ -3067,7 +3049,7 @@ nfs_free_mi4(mntinfo4_t *mi) nfs4_oo_hash_bucket_t *bucketp; nfs4_debug_msg_t *msgp; int i; - servinfo4_t *svp; + servinfo4_t *svp; /* * Code introduced here should be carefully evaluated to make diff --git a/usr/src/uts/common/fs/nfs/nfs4_vnops.c b/usr/src/uts/common/fs/nfs/nfs4_vnops.c index 15c6445146..6a3fbff48e 100644 --- a/usr/src/uts/common/fs/nfs/nfs4_vnops.c +++ b/usr/src/uts/common/fs/nfs/nfs4_vnops.c @@ -2596,12 +2596,6 @@ nfs4close_otw(rnode4_t *rp, cred_t *cred_otw, nfs4_open_owner_t *oop, osp->os_ref_count--; if (ep->error == 0) { - /* - * Avoid a deadlock with the r_serial thread waiting for - * os_sync_lock in nfs4_get_otw_cred_by_osp() which might be - * held by us. We will wait in nfs4_attr_cache() for the - * completion of the r_serial thread. - */ mutex_exit(&osp->os_sync_lock); *have_sync_lockp = 0; diff --git a/usr/src/uts/common/io/cxgbe/common/t4_msg.h b/usr/src/uts/common/io/cxgbe/common/t4_msg.h index aeb018f72f..69334be7fd 100644 --- a/usr/src/uts/common/io/cxgbe/common/t4_msg.h +++ b/usr/src/uts/common/io/cxgbe/common/t4_msg.h @@ -2769,7 +2769,7 @@ struct ulptx_sgl { __be64 addr0; #if !(defined C99_NOT_SUPPORTED) - struct ulptx_sge_pair sge[0]; + struct ulptx_sge_pair sge[]; #endif }; @@ -2785,7 +2785,7 @@ struct ulptx_isgl { __be32 rsvd; #if !(defined C99_NOT_SUPPORTED) - struct ulptx_isge sge[0]; + struct ulptx_isge sge[]; #endif }; diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c index aaa19d4bab..02926b6b12 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm.c +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -1917,8 +1917,7 @@ svm_dr_leave_guest(struct svm_regctx *gctx) * Start vcpu with specified RIP. */ static int -svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap, - struct vm_eventinfo *evinfo) +svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap) { struct svm_regctx *gctx; struct svm_softc *svm_sc; @@ -2010,34 +2009,18 @@ svm_vmrun(void *arg, int vcpu, uint64_t rip, pmap_t pmap, inject_state = svm_inject_vlapic(svm_sc, vcpu, vlapic, inject_state); - if (vcpu_suspended(evinfo)) { - enable_gintr(); - vm_exit_suspended(vm, vcpu, state->rip); - break; - } - - if (vcpu_runblocked(evinfo)) { - enable_gintr(); - vm_exit_runblock(vm, vcpu, state->rip); - break; - } - - if (vcpu_reqidle(evinfo)) { - enable_gintr(); - vm_exit_reqidle(vm, vcpu, state->rip); - break; - } - - /* We are asked to give the cpu by scheduler. */ - if (vcpu_should_yield(vm, vcpu)) { + /* + * Check for vCPU bail-out conditions. This must be done after + * svm_inject_events() to detect a triple-fault condition. 
+ */ + if (vcpu_entry_bailout_checks(vm, vcpu, state->rip)) { enable_gintr(); - vm_exit_astpending(vm, vcpu, state->rip); break; } - if (vcpu_debugged(vm, vcpu)) { + if (vcpu_run_state_pending(vm, vcpu)) { enable_gintr(); - vm_exit_debug(vm, vcpu, state->rip); + vm_exit_run_state(vm, vcpu, state->rip); break; } @@ -2303,7 +2286,7 @@ svm_setreg(void *arg, int vcpu, int ident, uint64_t val) } static int -svm_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +svm_setdesc(void *arg, int vcpu, int reg, const struct seg_desc *desc) { struct vmcb *vmcb; struct svm_softc *sc; diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c index 54a03b1d3e..b5c713891c 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -2738,8 +2738,7 @@ vmx_dr_leave_guest(struct vmxctx *vmxctx) } static int -vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap, - struct vm_eventinfo *evinfo) +vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap) { int rc, handled, launched; struct vmx *vmx; @@ -2834,39 +2833,17 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap, } /* - * Check for vcpu suspension after injecting events because - * vmx_inject_events() can suspend the vcpu due to a - * triple fault. + * Check for vCPU bail-out conditions. This must be done after + * vmx_inject_events() to detect a triple-fault condition. */ - if (vcpu_suspended(evinfo)) { + if (vcpu_entry_bailout_checks(vmx->vm, vcpu, rip)) { enable_intr(); - vm_exit_suspended(vmx->vm, vcpu, rip); break; } - if (vcpu_runblocked(evinfo)) { + if (vcpu_run_state_pending(vm, vcpu)) { enable_intr(); - vm_exit_runblock(vmx->vm, vcpu, rip); - break; - } - - if (vcpu_reqidle(evinfo)) { - enable_intr(); - vm_exit_reqidle(vmx->vm, vcpu, rip); - break; - } - - if (vcpu_should_yield(vm, vcpu)) { - enable_intr(); - vm_exit_astpending(vmx->vm, vcpu, rip); - vmx_astpending_trace(vmx, vcpu, rip); - handled = HANDLED; - break; - } - - if (vcpu_debugged(vm, vcpu)) { - enable_intr(); - vm_exit_debug(vmx->vm, vcpu, rip); + vm_exit_run_state(vmx->vm, vcpu, rip); break; } @@ -2985,19 +2962,12 @@ vmx_run(void *arg, int vcpu, uint64_t rip, pmap_t pmap, rip = vmexit->rip; } while (handled); - /* - * If a VM exit has been handled then the exitcode must be BOGUS - * If a VM exit is not handled then the exitcode must not be BOGUS - */ - if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) || - (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) { - panic("Mismatch between handled (%d) and exitcode (%d)", - handled, vmexit->exitcode); + /* If a VM exit has been handled then the exitcode must be BOGUS */ + if (handled && vmexit->exitcode != VM_EXITCODE_BOGUS) { + panic("Non-BOGUS exitcode (%d) unexpected for handled VM exit", + vmexit->exitcode); } - if (!handled) - vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); - VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", vmexit->exitcode); @@ -3261,7 +3231,7 @@ vmx_getdesc(void *arg, int vcpu, int seg, struct seg_desc *desc) } static int -vmx_setdesc(void *arg, int vcpu, int seg, struct seg_desc *desc) +vmx_setdesc(void *arg, int vcpu, int seg, const struct seg_desc *desc) { int hostcpu, running; struct vmx *vmx = arg; diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c index e84520de46..8c054a52fb 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c @@ -992,13 +992,10 @@ int vlapic_icrlo_write_handler(struct vlapic *vlapic) { int i; - bool phys; cpuset_t 
dmask; uint64_t icrval; - uint32_t dest, vec, mode; - struct vlapic *vlapic2; + uint32_t dest, vec, mode, dsh; struct LAPIC *lapic; - uint16_t maxcpus; lapic = vlapic->apic_page; lapic->icr_lo &= ~APIC_DELSTAT_PEND; @@ -1010,93 +1007,79 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic) dest = icrval >> (32 + 24); vec = icrval & APIC_VECTOR_MASK; mode = icrval & APIC_DELMODE_MASK; + dsh = icrval & APIC_DEST_MASK; if (mode == APIC_DELMODE_FIXED && vec < 16) { vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false); - VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); return (0); } + if (mode == APIC_DELMODE_INIT && + (icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) { + /* No work required to deassert INIT */ + return (0); + } + if ((mode == APIC_DELMODE_STARTUP || mode == APIC_DELMODE_INIT) && + !(dsh == APIC_DEST_DESTFLD || dsh == APIC_DEST_ALLESELF)) { + /* + * While Intel makes no mention of restrictions for destination + * shorthand when sending INIT or SIPI, AMD requires either a + * specific destination or all-excluding self. Common use seems + * to be restricted to those two cases. + */ + return (-1); + } - VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec); + switch (dsh) { + case APIC_DEST_DESTFLD: + vlapic_calcdest(vlapic->vm, &dmask, dest, + (icrval & APIC_DESTMODE_LOG) == 0, false, x2apic(vlapic)); + break; + case APIC_DEST_SELF: + CPU_SETOF(vlapic->vcpuid, &dmask); + break; + case APIC_DEST_ALLISELF: + dmask = vm_active_cpus(vlapic->vm); + break; + case APIC_DEST_ALLESELF: + dmask = vm_active_cpus(vlapic->vm); + CPU_CLR(vlapic->vcpuid, &dmask); + break; + default: + /* + * All possible delivery notations are covered above. + * We should never end up here. + */ + panic("unknown delivery shorthand: %x", dsh); + } - if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) { - switch (icrval & APIC_DEST_MASK) { - case APIC_DEST_DESTFLD: - phys = ((icrval & APIC_DESTMODE_LOG) == 0); - vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false, - x2apic(vlapic)); + while ((i = CPU_FFS(&dmask)) != 0) { + i--; + CPU_CLR(i, &dmask); + switch (mode) { + case APIC_DELMODE_FIXED: + lapic_intr_edge(vlapic->vm, i, vec); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, + VLAPIC_IPI_SEND, 1); + vmm_stat_incr(vlapic->vm, i, + VLAPIC_IPI_RECV, 1); break; - case APIC_DEST_SELF: - CPU_SETOF(vlapic->vcpuid, &dmask); + case APIC_DELMODE_NMI: + vm_inject_nmi(vlapic->vm, i); break; - case APIC_DEST_ALLISELF: - dmask = vm_active_cpus(vlapic->vm); + case APIC_DELMODE_INIT: + (void) vm_inject_init(vlapic->vm, i); break; - case APIC_DEST_ALLESELF: - dmask = vm_active_cpus(vlapic->vm); - CPU_CLR(vlapic->vcpuid, &dmask); + case APIC_DELMODE_STARTUP: + (void) vm_inject_sipi(vlapic->vm, i, vec); break; + case APIC_DELMODE_LOWPRIO: + case APIC_DELMODE_SMI: default: - CPU_ZERO(&dmask); /* satisfy gcc */ + /* Unhandled IPI modes (for now) */ break; } - - while ((i = CPU_FFS(&dmask)) != 0) { - i--; - CPU_CLR(i, &dmask); - if (mode == APIC_DELMODE_FIXED) { - lapic_intr_edge(vlapic->vm, i, vec); - vmm_stat_incr(vlapic->vm, vlapic->vcpuid, - VLAPIC_IPI_SEND, 1); - vmm_stat_incr(vlapic->vm, i, - VLAPIC_IPI_RECV, 1); - VLAPIC_CTR2(vlapic, "vlapic sending ipi %d " - "to vcpuid %d", vec, i); - } else { - vm_inject_nmi(vlapic->vm, i); - VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi " - "to vcpuid %d", i); - } - } - - return (0); /* handled completely in the kernel */ - } - - maxcpus = vm_get_maxcpus(vlapic->vm); - if (mode == APIC_DELMODE_INIT) { - if ((icrval & APIC_LEVEL_MASK) == 
APIC_LEVEL_DEASSERT) - return (0); - - if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { - vlapic2 = vm_lapic(vlapic->vm, dest); - - /* move from INIT to waiting-for-SIPI state */ - if (vlapic2->boot_state == BS_INIT) { - vlapic2->boot_state = BS_SIPI; - } - - return (0); - } - } - - if (mode == APIC_DELMODE_STARTUP) { - if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { - vlapic2 = vm_lapic(vlapic->vm, dest); - - /* - * Ignore SIPIs in any state other than wait-for-SIPI - */ - if (vlapic2->boot_state != BS_SIPI) - return (0); - - vlapic2->boot_state = BS_RUNNING; - vm_req_spinup_ap(vlapic->vm, dest, vec << PAGE_SHIFT); - return (0); - } } - - /* Return to userland. */ - return (-1); + return (0); } void @@ -1450,30 +1433,72 @@ vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, return (retval); } -static void +void vlapic_reset(struct vlapic *vlapic) { - struct LAPIC *lapic; + struct LAPIC *lapic = vlapic->apic_page; + uint32_t *isrptr, *tmrptr, *irrptr; - lapic = vlapic->apic_page; - bzero(lapic, sizeof (struct LAPIC)); + /* Reset any timer-related state first */ + VLAPIC_TIMER_LOCK(vlapic); + callout_stop(&vlapic->callout); + lapic->icr_timer = 0; + lapic->ccr_timer = 0; + VLAPIC_TIMER_UNLOCK(vlapic); + lapic->dcr_timer = 0; + vlapic_dcr_write_handler(vlapic); + + /* + * Sync any APIC acceleration (APICv/AVIC) state into the APIC page so + * it is not leftover after the reset. This is performed after the APIC + * timer has been stopped, in case it happened to fire just prior to + * being deactivated. + */ + if (vlapic->ops.sync_state) { + (*vlapic->ops.sync_state)(vlapic); + } lapic->id = vlapic_get_id(vlapic); lapic->version = VLAPIC_VERSION; lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT); - lapic->dfr = 0xffffffff; - lapic->svr = APIC_SVR_VECTOR; - vlapic_mask_lvts(vlapic); - lapic->dcr_timer = 0; - vlapic_dcr_write_handler(vlapic); + lapic->tpr = 0; + lapic->apr = 0; + lapic->ppr = 0; - if (vlapic->vcpuid == 0) - vlapic->boot_state = BS_RUNNING; /* BSP */ - else - vlapic->boot_state = BS_INIT; /* AP */ +#ifdef __ISRVEC_DEBUG + /* With the PPR cleared, the isrvec tracking should be reset too */ + vlapic->isrvec_stk_top = 0; +#endif + lapic->eoi = 0; + lapic->ldr = 0; + lapic->dfr = 0xffffffff; + lapic->svr = APIC_SVR_VECTOR; vlapic->svr_last = lapic->svr; + + isrptr = &lapic->isr0; + tmrptr = &lapic->tmr0; + irrptr = &lapic->irr0; + for (uint_t i = 0; i < 8; i++) { + atomic_store_rel_int(&isrptr[i * 4], 0); + atomic_store_rel_int(&tmrptr[i * 4], 0); + atomic_store_rel_int(&irrptr[i * 4], 0); + } + + lapic->esr = 0; + vlapic->esr_pending = 0; + lapic->icr_lo = 0; + lapic->icr_hi = 0; + + lapic->lvt_cmci = 0; + lapic->lvt_timer = 0; + lapic->lvt_thermal = 0; + lapic->lvt_pcint = 0; + lapic->lvt_lint0 = 0; + lapic->lvt_lint1 = 0; + lapic->lvt_error = 0; + vlapic_mask_lvts(vlapic); } void diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.h b/usr/src/uts/i86pc/io/vmm/io/vlapic.h index f34cf1ec4b..6072b46101 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.h +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h @@ -30,6 +30,7 @@ /* * Copyright 2018 Joyent, Inc. 
+ * Copyright 2020 Oxide Computer Company */ #ifndef _VLAPIC_H_ @@ -38,6 +39,8 @@ struct vm; enum x2apic_state; +void vlapic_reset(struct vlapic *vlapic); + int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset, uint64_t data); int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset, diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h index 69daf3652c..8d739bcfcc 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h @@ -137,12 +137,6 @@ do { \ VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \ } while (0) -enum boot_state { - BS_INIT, - BS_SIPI, - BS_RUNNING -}; - /* * 16 priority levels with at most one vector injected per level. */ @@ -182,7 +176,6 @@ struct vlapic { struct mtx timer_mtx; uint64_t msr_apicbase; - enum boot_state boot_state; /* * Copies of some registers in the virtual APIC page. We do this for diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h index 863d31cfeb..a7dbf84061 100644 --- a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h +++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h @@ -64,18 +64,12 @@ struct vm_object; struct vm_guest_paging; struct pmap; -struct vm_eventinfo { - uint_t *rptr; /* runblock cookie */ - int *sptr; /* suspend cookie */ - int *iptr; /* reqidle cookie */ -}; - typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); typedef void (*vmm_resume_func_t)(void); typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); typedef int (*vmi_run_func_t)(void *vmi, int vcpu, uint64_t rip, - struct pmap *pmap, struct vm_eventinfo *info); + struct pmap *pmap); typedef void (*vmi_cleanup_func_t)(void *vmi); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); @@ -84,7 +78,7 @@ typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, - struct seg_desc *desc); + const struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); typedef struct vmspace *(*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); @@ -169,9 +163,13 @@ bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, - struct seg_desc *ret_desc); + struct seg_desc *ret_desc); int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, - struct seg_desc *desc); + const struct seg_desc *desc); +int vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, + uint8_t *sipi_vec); +int vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, + uint8_t sipi_vec); int vm_run(struct vm *vm, int vcpuid, const struct vm_entry *); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); @@ -180,6 +178,8 @@ void vm_nmi_clear(struct vm *vm, int vcpuid); int vm_inject_extint(struct vm *vm, int vcpu); int vm_extint_pending(struct vm *vm, int vcpuid); void vm_extint_clear(struct vm *vm, int vcpuid); +int vm_inject_init(struct vm *vm, int vcpuid); +int vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vec); struct vlapic *vm_lapic(struct vm *vm, int cpu); struct vioapic 
*vm_ioapic(struct vm *vm); struct vhpet *vm_hpet(struct vm *vm); @@ -195,14 +195,13 @@ struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); struct vie *vm_vie_ctx(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); -void vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip); int vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval, int rsize); int vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval, int wsize); -void vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip); #ifdef _SYS__CPUSET_H_ cpuset_t vm_active_cpus(struct vm *vm); @@ -210,28 +209,9 @@ cpuset_t vm_debug_cpus(struct vm *vm); cpuset_t vm_suspended_cpus(struct vm *vm); #endif /* _SYS__CPUSET_H_ */ -static __inline int -vcpu_runblocked(struct vm_eventinfo *info) -{ - - return (*info->rptr != 0); -} - -static __inline int -vcpu_suspended(struct vm_eventinfo *info) -{ - - return (*info->sptr); -} - -static __inline int -vcpu_reqidle(struct vm_eventinfo *info) -{ - - return (*info->iptr); -} - -int vcpu_debugged(struct vm *vm, int vcpuid); +bool vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip); +bool vcpu_run_state_pending(struct vm *vm, int vcpuid); +int vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only); /* * Return true if device indicated by bus/slot/func is supposed to be a diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index ed7a97921a..8c9a284cbf 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -109,17 +109,15 @@ struct vlapic; * (x) initialized before use */ struct vcpu { - struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ + /* (o) protects state, run_state, hostcpu, sipi_vector */ + struct mtx mtx; + enum vcpu_state state; /* (o) vcpu state */ -#ifndef __FreeBSD__ + enum vcpu_run_state run_state; /* (i) vcpu init/sipi/run state */ kcondvar_t vcpu_cv; /* (o) cpu waiter cv */ kcondvar_t state_cv; /* (o) IDLE-transition cv */ -#endif /* __FreeBSD__ */ int hostcpu; /* (o) vcpu's current host cpu */ -#ifndef __FreeBSD__ int lastloccpu; /* (o) last host cpu localized to */ -#endif - uint_t runblock; /* (i) block vcpu from run state */ int reqidle; /* (i) request vcpu to idle */ struct vlapic *vlapic; /* (i) APIC device model */ enum x2apic_state x2apic_state; /* (i) APIC mode */ @@ -130,6 +128,7 @@ struct vcpu { int exc_vector; /* (x) exception collateral */ int exc_errcode_valid; uint32_t exc_errcode; + uint8_t sipi_vector; /* (i) SIPI vector */ struct savefpu *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ @@ -200,15 +199,6 @@ struct vm { uint16_t maxcpus; /* (o) max pluggable cpus */ struct ioport_config ioports; /* (o) ioport handling */ - - bool sipi_req; /* (i) SIPI requested */ - int sipi_req_vcpu; /* (i) SIPI destination */ - uint64_t sipi_req_rip; /* (i) SIPI start %rip */ - - /* Miscellaneous VM-wide statistics and counters */ - struct vm_wide_stats { - uint64_t sipi_supersede; - } stats; }; static int vmm_initialized; @@ -249,8 +239,8 @@ static struct vmm_ops *ops = &vmm_ops_null; #define VMM_RESUME() ((*ops->resume)()) #define VMINIT(vm, pmap) ((*ops->vminit)(vm, pmap)) -#define 
VMRUN(vmi, vcpu, rip, pmap, evinfo) \ - ((*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo)) +#define VMRUN(vmi, vcpu, rip, pmap) \ + ((*ops->vmrun)(vmi, vcpu, rip, pmap)) #define VMCLEANUP(vmi) ((*ops->vmcleanup)(vmi)) #define VMSPACE_ALLOC(min, max) ((*ops->vmspace_alloc)(min, max)) #define VMSPACE_FREE(vmspace) ((*ops->vmspace_free)(vmspace)) @@ -292,6 +282,8 @@ static int trace_guest_exceptions; static void vm_free_memmap(struct vm *vm, int ident); static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); static void vcpu_notify_event_locked(struct vcpu *vcpu, vcpu_notify_t); +static bool vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid); +static int vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector); #ifndef __FreeBSD__ static void vm_clear_memseg(struct vm *, int); @@ -370,9 +362,9 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create) bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo)); } + vcpu->run_state = VRS_HALT; vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); - vcpu->runblock = 0; vcpu->reqidle = 0; vcpu->exitintinfo = 0; vcpu->nmi_pending = 0; @@ -1233,7 +1225,7 @@ vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) } int -vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) +vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc) { if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); @@ -1244,6 +1236,49 @@ vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) return (VMSETDESC(vm->cookie, vcpu, reg, desc)); } +int +vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) { + return (EINVAL); + } + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + *state = vcpu->run_state; + *sipi_vec = vcpu->sipi_vector; + vcpu_unlock(vcpu); + + return (0); +} + +int +vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) { + return (EINVAL); + } + if (!VRS_IS_VALID(state)) { + return (EINVAL); + } + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu->run_state = state; + vcpu->sipi_vector = sipi_vec; + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); + vcpu_unlock(vcpu); + + return (0); +} + + static void restore_guest_fpustate(struct vcpu *vcpu) { @@ -1354,16 +1389,6 @@ vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, break; } - if (newstate == VCPU_RUNNING) { - while (vcpu->runblock != 0) { -#ifdef __FreeBSD__ - msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); -#else - cv_wait(&vcpu->state_cv, &vcpu->mtx.m); -#endif - } - } - if (error) return (EBUSY); @@ -1376,8 +1401,7 @@ vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, else vcpu->hostcpu = NOCPU; - if (newstate == VCPU_IDLE || - (newstate == VCPU_FROZEN && vcpu->runblock != 0)) { + if (newstate == VCPU_IDLE) { #ifdef __FreeBSD__ wakeup(&vcpu->state); #else @@ -1413,12 +1437,8 @@ static int vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) { struct vcpu *vcpu; -#ifdef __FreeBSD__ - const char *wmesg; -#else - const char *wmesg __unused; -#endif int t, vcpu_halted, vm_halted; + bool userspace_exit = false; KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); @@ -1429,18 +1449,13 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) vcpu_lock(vcpu); while (1) { /* - * Do a final check for pending NMI or interrupts 
before - * really putting this thread to sleep. Also check for - * software events that would cause this vcpu to wakeup. - * - * These interrupts/events could have happened after the - * vcpu returned from VMRUN() and before it acquired the - * vcpu lock above. + * Do a final check for pending interrupts (including NMI and + * INIT) before putting this thread to sleep. */ - if (vm->suspend || vcpu->reqidle) - break; if (vm_nmi_pending(vm, vcpuid)) break; + if (vcpu_run_state_pending(vm, vcpuid)) + break; if (!intr_disabled) { if (vm_extint_pending(vm, vcpuid) || vlapic_pending_intr(vcpu->vlapic, NULL)) { @@ -1448,12 +1463,15 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) } } - /* Don't go to sleep if the vcpu thread needs to yield */ - if (vcpu_should_yield(vm, vcpuid)) - break; - - if (vcpu_debugged(vm, vcpuid)) + /* + * Also check for software events which would cause a wake-up. + * This will set the appropriate exitcode directly, rather than + * requiring a trip through VM_RUN(). + */ + if (vcpu_sleep_bailout_checks(vm, vcpuid)) { + userspace_exit = true; break; + } /* * Some Linux guests implement "halt" by having all vcpus @@ -1462,8 +1480,6 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) * vcpus enter the halted state the virtual machine is halted. */ if (intr_disabled) { - wmesg = "vmhalt"; - VCPU_CTR0(vm, vcpuid, "Halted"); if (!vcpu_halted && halt_detection_enabled) { vcpu_halted = 1; CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); @@ -1472,25 +1488,11 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) vm_halted = 1; break; } - } else { - wmesg = "vmidle"; } t = ticks; vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); -#ifdef __FreeBSD__ - /* - * XXX msleep_spin() cannot be interrupted by signals so - * wake up periodically to check pending signals. - */ - msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); -#else - /* - * Fortunately, cv_wait_sig can be interrupted by signals, so - * there is no need to periodically wake up. - */ (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m); -#endif vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); } @@ -1503,7 +1505,7 @@ vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled) if (vm_halted) vm_suspend(vm, VM_SUSPEND_HALT); - return (0); + return (userspace_exit ? -1 : 0); } static int @@ -1832,6 +1834,62 @@ vm_handle_reqidle(struct vm *vm, int vcpuid) return (-1); } +static int +vm_handle_run_state(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + bool handled = false; + + vcpu_lock(vcpu); + while (1) { + if ((vcpu->run_state & VRS_PEND_INIT) != 0) { + vcpu_unlock(vcpu); + VERIFY0(vcpu_arch_reset(vm, vcpuid, true)); + vcpu_lock(vcpu); + + vcpu->run_state &= ~(VRS_RUN | VRS_PEND_INIT); + vcpu->run_state |= VRS_INIT; + } + + if ((vcpu->run_state & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI)) == + (VRS_INIT | VRS_PEND_SIPI)) { + const uint8_t vector = vcpu->sipi_vector; + + vcpu_unlock(vcpu); + VERIFY0(vcpu_vector_sipi(vm, vcpuid, vector)); + vcpu_lock(vcpu); + + vcpu->run_state &= ~VRS_PEND_SIPI; + vcpu->run_state |= VRS_RUN; + } + + /* + * If the vCPU is now in the running state, there is no need to + * wait for anything prior to re-entry. + */ + if ((vcpu->run_state & VRS_RUN) != 0) { + handled = true; + break; + } + + /* + * Also check for software events which would cause a wake-up. + * This will set the appropriate exitcode directly, rather than + * requiring a trip through VM_RUN(). 
+ */ + if (vcpu_sleep_bailout_checks(vm, vcpuid)) { + break; + } + + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); + (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m); + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); + } + vcpu_unlock(vcpu); + + return (handled ? 0 : -1); +} + #ifndef __FreeBSD__ static int vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) @@ -1850,18 +1908,6 @@ vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) } #endif /* __FreeBSD__ */ -void -vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip) -{ - if (vm->sipi_req) { - /* This should never occur if userspace is doing its job. */ - vm->stats.sipi_supersede++; - } - vm->sipi_req = true; - vm->sipi_req_vcpu = req_vcpuid; - vm->sipi_req_rip = req_rip; -} - int vm_suspend(struct vm *vm, enum vm_suspend_how how) { @@ -1890,66 +1936,17 @@ vm_suspend(struct vm *vm, enum vm_suspend_how how) } void -vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) -{ - struct vm_exit *vmexit; - - KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, - ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); - - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->rip = rip; - vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_SUSPENDED; - vmexit->u.suspended.how = vm->suspend; -} - -void -vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip) +vm_exit_run_state(struct vm *vm, int vcpuid, uint64_t rip) { struct vm_exit *vmexit; vmexit = vm_exitinfo(vm, vcpuid); vmexit->rip = rip; vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_DEBUG; + vmexit->exitcode = VM_EXITCODE_RUN_STATE; + vmm_stat_incr(vm, vcpuid, VMEXIT_RUN_STATE, 1); } -void -vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip) -{ - struct vm_exit *vmexit; - - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->rip = rip; - vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_RUNBLOCK; - vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1); -} - -void -vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip) -{ - struct vm_exit *vmexit; - - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->rip = rip; - vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_REQIDLE; - vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); -} - -void -vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) -{ - struct vm_exit *vmexit; - - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->rip = rip; - vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_BOGUS; - vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); -} #ifndef __FreeBSD__ /* @@ -2072,7 +2069,7 @@ vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, case VEC_DISCARD_INSTR: vie_reset(vie); return (0); - case VEC_COMPLETE_MMIO: + case VEC_FULFILL_MMIO: err = vie_fulfill_mmio(vie, &entry->u.mmio); if (err == 0) { err = vie_emulate_mmio(vie, vm, vcpuid); @@ -2091,7 +2088,7 @@ vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, } } break; - case VEC_COMPLETE_INOUT: + case VEC_FULFILL_INOUT: err = vie_fulfill_inout(vie, &entry->u.inout); if (err == 0) { err = vie_emulate_inout(vie, vm, vcpuid); @@ -2132,25 +2129,12 @@ vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) return (-1); } - if (vcpuid == 0 && vm->sipi_req) { - /* The boot vCPU has sent a SIPI to one of the other CPUs */ - vme->exitcode = VM_EXITCODE_SPINUP_AP; - vme->u.spinup_ap.vcpu = vm->sipi_req_vcpu; - vme->u.spinup_ap.rip = vm->sipi_req_rip; - - vm->sipi_req = false; - vm->sipi_req_vcpu = 0; - vm->sipi_req_rip = 0; - return (-1); - } - return (0); } int 
vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) { - struct vm_eventinfo evinfo; int error; struct vcpu *vcpu; #ifdef __FreeBSD__ @@ -2177,9 +2161,6 @@ vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; - evinfo.rptr = &vcpu->runblock; - evinfo.sptr = &vm->suspend; - evinfo.iptr = &vcpu->reqidle; #ifndef __FreeBSD__ vtc.vtc_vm = vm; @@ -2242,7 +2223,7 @@ restart: #endif vcpu_require_state(vm, vcpuid, VCPU_RUNNING); - error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo); + error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap); vcpu_require_state(vm, vcpuid, VCPU_FROZEN); #ifdef __FreeBSD__ @@ -2273,6 +2254,9 @@ restart: case VM_EXITCODE_REQIDLE: error = vm_handle_reqidle(vm, vcpuid); break; + case VM_EXITCODE_RUN_STATE: + error = vm_handle_run_state(vm, vcpuid); + break; case VM_EXITCODE_SUSPENDED: error = vm_handle_suspend(vm, vcpuid); break; @@ -2280,8 +2264,6 @@ restart: vioapic_process_eoi(vm, vcpuid, vme->u.ioapic_eoi.vector); break; - case VM_EXITCODE_RUNBLOCK: - break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vm, vcpuid, intr_disabled); @@ -2792,6 +2774,196 @@ vm_extint_clear(struct vm *vm, int vcpuid) } int +vm_inject_init(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + vcpu_lock(vcpu); + vcpu->run_state |= VRS_PEND_INIT; + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); + vcpu_unlock(vcpu); + return (0); +} + +int +vm_inject_sipi(struct vm *vm, int vcpuid, uint8_t vector) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + vcpu_lock(vcpu); + vcpu->run_state |= VRS_PEND_SIPI; + vcpu->sipi_vector = vector; + /* SIPI is only actionable if the CPU is waiting in INIT state */ + if ((vcpu->run_state & (VRS_INIT | VRS_RUN)) == VRS_INIT) { + vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); + } + vcpu_unlock(vcpu); + return (0); +} + +bool +vcpu_run_state_pending(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); + vcpu = &vm->vcpu[vcpuid]; + + /* Of interest: vCPU not in running state or with pending INIT */ + return ((vcpu->run_state & (VRS_RUN | VRS_PEND_INIT)) != VRS_RUN); +} + +int +vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) +{ + struct seg_desc desc; + const enum vm_reg_name clear_regs[] = { + VM_REG_GUEST_CR2, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_DR0, + VM_REG_GUEST_DR1, + VM_REG_GUEST_DR2, + VM_REG_GUEST_DR3, + VM_REG_GUEST_EFER, + }; + const enum vm_reg_name data_segs[] = { + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_ES, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + }; + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + for (uint_t i = 0; i < nitems(clear_regs); i++) { + VERIFY0(vm_set_register(vm, vcpuid, clear_regs[i], 0)); + } + + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, 2)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0xfff0)); + VERIFY0(vm_set_register(vm, vcpuid, 
VM_REG_GUEST_CR0, 0x60000010)); + + /* + * The prescribed contents of %rdx differ slightly between the Intel and + * AMD architectural definitions. The former expects the Extended Model + * in bits 16-19 where the latter expects all the Family, Model, and + * Stepping be there. Common boot ROMs appear to disregard this + * anyways, so we stick with a compromise value similar to what is + * spelled out in the Intel SDM. + */ + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RDX, 0x600)); + + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR6, 0xffff0ff0)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_DR7, 0x400)); + + /* CS: Present, R/W, Accessed */ + desc.access = 0x0093; + desc.base = 0xffff0000; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, 0xf000)); + + /* SS, DS, ES, FS, GS: Present, R/W, Accessed */ + desc.access = 0x0093; + desc.base = 0; + desc.limit = 0xffff; + for (uint_t i = 0; i < nitems(data_segs); i++) { + VERIFY0(vm_set_seg_desc(vm, vcpuid, data_segs[i], &desc)); + VERIFY0(vm_set_register(vm, vcpuid, data_segs[i], 0)); + } + + /* GDTR, IDTR */ + desc.base = 0; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_GDTR, &desc)); + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_IDTR, &desc)); + + /* LDTR: Present, LDT */ + desc.access = 0x0082; + desc.base = 0; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_LDTR, &desc)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_LDTR, 0)); + + /* TR: Present, 32-bit TSS */ + desc.access = 0x008b; + desc.base = 0; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_TR, &desc)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_TR, 0)); + + vlapic_reset(vm_lapic(vm, vcpuid)); + + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0)); + + vcpu->exitintinfo = 0; + vcpu->exception_pending = 0; + vcpu->nmi_pending = 0; + vcpu->extint_pending = 0; + + /* + * A CPU reset caused by power-on or system reset clears more state than + * one which is trigged from an INIT IPI. 
+ */ + if (!init_only) { + vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; + fpu_save_area_reset(vcpu->guestfpu); + + /* XXX: clear MSRs and other pieces */ + } + + return (0); +} + +static int +vcpu_vector_sipi(struct vm *vm, int vcpuid, uint8_t vector) +{ + struct seg_desc desc; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + /* CS: Present, R/W, Accessed */ + desc.access = 0x0093; + desc.base = (uint64_t)vector << 12; + desc.limit = 0xffff; + VERIFY0(vm_set_seg_desc(vm, vcpuid, VM_REG_GUEST_CS, &desc)); + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_CS, + (uint64_t)vector << 8)); + + VERIFY0(vm_set_register(vm, vcpuid, VM_REG_GUEST_RIP, 0)); + + return (0); +} + +int vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) { if (vcpu < 0 || vcpu >= vm->maxcpus) @@ -2894,7 +3066,7 @@ vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, struct vcpu *vcpu; if (vcpuid < 0 || vcpuid >= vm->maxcpus) - panic("vm_set_run_state: invalid vcpuid %d", vcpuid); + panic("vcpu_set_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -2912,7 +3084,7 @@ vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) enum vcpu_state state; if (vcpuid < 0 || vcpuid >= vm->maxcpus) - panic("vm_get_run_state: invalid vcpuid %d", vcpuid); + panic("vcpu_get_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -2925,54 +3097,6 @@ vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) return (state); } -void -vcpu_block_run(struct vm *vm, int vcpuid) -{ - struct vcpu *vcpu; - - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) - panic("vcpu_block_run: invalid vcpuid %d", vcpuid); - - vcpu = &vm->vcpu[vcpuid]; - - vcpu_lock(vcpu); - vcpu->runblock++; - if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) { - vcpu_notify_event_locked(vcpu, VCPU_NOTIFY_EXIT); - } - while (vcpu->state == VCPU_RUNNING) { -#ifdef __FreeBSD__ - msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); -#else - cv_wait(&vcpu->state_cv, &vcpu->mtx.m); -#endif - } - vcpu_unlock(vcpu); -} - -void -vcpu_unblock_run(struct vm *vm, int vcpuid) -{ - struct vcpu *vcpu; - - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) - panic("vcpu_block_run: invalid vcpuid %d", vcpuid); - - vcpu = &vm->vcpu[vcpuid]; - - vcpu_lock(vcpu); - KASSERT(vcpu->runblock != 0, ("expected non-zero runblock")); - vcpu->runblock--; - if (vcpu->runblock == 0) { -#ifdef __FreeBSD__ - wakeup(&vcpu->state); -#else - cv_broadcast(&vcpu->state_cv); -#endif - } - vcpu_unlock(vcpu); -} - #ifndef __FreeBSD__ uint64_t vcpu_tsc_offset(struct vm *vm, int vcpuid) @@ -3038,11 +3162,93 @@ vm_resume_cpu(struct vm *vm, int vcpuid) return (0); } -int -vcpu_debugged(struct vm *vm, int vcpuid) +static bool +vcpu_bailout_checks(struct vm *vm, int vcpuid, bool on_entry, + uint64_t entry_rip) { + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + struct vm_exit *vme = &vcpu->exitinfo; + bool bail = false; - return (CPU_ISSET(vcpuid, &vm->debug_cpus)); + ASSERT(vcpuid >= 0 && vcpuid < vm->maxcpus); + + if (vm->suspend) { + if (on_entry) { + VERIFY(vm->suspend > VM_SUSPEND_NONE && + vm->suspend < VM_SUSPEND_LAST); + + vme->exitcode = VM_EXITCODE_SUSPENDED; + vme->u.suspended.how = vm->suspend; + } else { + /* + * Handling VM suspend is complicated, so if that + * condition is detected outside of VM-entry itself, + * just emit a BOGUS exitcode so we take a lap to pick + * up the event during an entry and are directed into + * the vm_handle_suspend() logic. 
+ */ + vme->exitcode = VM_EXITCODE_BOGUS; + } + bail = true; + } + if (vcpu->reqidle) { + vme->exitcode = VM_EXITCODE_REQIDLE; + vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); + + if (!on_entry) { + /* + * A reqidle request detected outside of VM-entry can be + * handled directly by clearing the request (and taking + * a lap to userspace). + */ + vcpu_assert_locked(vcpu); + vcpu->reqidle = 0; + } + bail = true; + } + if (vcpu_should_yield(vm, vcpuid)) { + vme->exitcode = VM_EXITCODE_BOGUS; + vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); + bail = true; + } + if (CPU_ISSET(vcpuid, &vm->debug_cpus)) { + vme->exitcode = VM_EXITCODE_DEBUG; + bail = true; + } + + if (bail) { + if (on_entry) { + /* + * If bailing out during VM-entry, the current %rip must + * be recorded in the exitinfo. + */ + vme->rip = entry_rip; + } + vme->inst_length = 0; + } + return (bail); +} + +static bool +vcpu_sleep_bailout_checks(struct vm *vm, int vcpuid) +{ + /* + * Bail-out check done prior to sleeping (in vCPU contexts like HLT or + * wait-for-SIPI) expect that %rip is already populated in the vm_exit + * structure, and we would only modify the exitcode. + */ + return (vcpu_bailout_checks(vm, vcpuid, false, 0)); +} + +bool +vcpu_entry_bailout_checks(struct vm *vm, int vcpuid, uint64_t rip) +{ + /* + * Bail-out checks done as part of VM entry require an updated %rip to + * populate the vm_exit struct if any of the conditions of interest are + * matched in the check. + */ + return (vcpu_bailout_checks(vm, vcpuid, true, rip)); } cpuset_t diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c index b62c6d1043..df127d4021 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -443,6 +443,9 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, case VM_RESTART_INSTRUCTION: case VM_SET_KERNEMU_DEV: case VM_GET_KERNEMU_DEV: + case VM_RESET_CPU: + case VM_GET_RUN_STATE: + case VM_SET_RUN_STATE: /* * Copy in the ID of the vCPU chosen for this operation. 
* Since a nefarious caller could update their struct between @@ -989,6 +992,45 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, } break; } + case VM_RESET_CPU: { + struct vm_vcpu_reset vvr; + + if (ddi_copyin(datap, &vvr, sizeof (vvr), md)) { + error = EFAULT; + break; + } + if (vvr.kind != VRK_RESET && vvr.kind != VRK_INIT) { + error = EINVAL; + } + + error = vcpu_arch_reset(sc->vmm_vm, vcpu, vvr.kind == VRK_INIT); + break; + } + case VM_GET_RUN_STATE: { + struct vm_run_state vrs; + + bzero(&vrs, sizeof (vrs)); + error = vm_get_run_state(sc->vmm_vm, vcpu, &vrs.state, + &vrs.sipi_vector); + if (error == 0) { + if (ddi_copyout(&vrs, datap, sizeof (vrs), md)) { + error = EFAULT; + break; + } + } + break; + } + case VM_SET_RUN_STATE: { + struct vm_run_state vrs; + + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { + error = EFAULT; + break; + } + error = vm_set_run_state(sc->vmm_vm, vcpu, vrs.state, + vrs.sipi_vector); + break; + } case VM_SET_KERNEMU_DEV: case VM_GET_KERNEMU_DEV: { diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.c b/usr/src/uts/i86pc/io/vmm/vmm_stat.c index 26e3573bc9..da38bb7de5 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_stat.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.c @@ -167,6 +167,5 @@ VMM_STAT(VMEXIT_MMIO_EMUL, "vm exits for mmio emulation"); VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit"); -VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); -VMM_STAT(VMEXIT_RUNBLOCK, "number of times runblock at exit"); VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); +VMM_STAT(VMEXIT_RUN_STATE, "number of vm exits due to run_state change"); diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.h b/usr/src/uts/i86pc/io/vmm/vmm_stat.h index 68e43c7bfc..2975a4a914 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_stat.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.h @@ -165,8 +165,7 @@ VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT); VMM_STAT_DECLARE(VMEXIT_MMIO_EMUL); VMM_STAT_DECLARE(VMEXIT_UNKNOWN); VMM_STAT_DECLARE(VMEXIT_ASTPENDING); -VMM_STAT_DECLARE(VMEXIT_USERSPACE); -VMM_STAT_DECLARE(VMEXIT_RUNBLOCK); VMM_STAT_DECLARE(VMEXIT_EXCEPTION); VMM_STAT_DECLARE(VMEXIT_REQIDLE); +VMM_STAT_DECLARE(VMEXIT_RUN_STATE); #endif diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h index c6859a3c00..65fdb19349 100644 --- a/usr/src/uts/i86pc/sys/vmm.h +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -217,9 +217,9 @@ enum vm_exitcode { VM_EXITCODE_PAUSE, VM_EXITCODE_PAGING, VM_EXITCODE_INST_EMUL, - VM_EXITCODE_SPINUP_AP, + VM_EXITCODE_RUN_STATE, VM_EXITCODE_MMIO_EMUL, - VM_EXITCODE_RUNBLOCK, + VM_EXITCODE_DEPRECATED, /* formerly RUNBLOCK */ VM_EXITCODE_IOAPIC_EOI, VM_EXITCODE_SUSPENDED, VM_EXITCODE_MMIO, @@ -287,6 +287,18 @@ struct vm_task_switch { struct vm_guest_paging paging; }; +enum vcpu_run_state { + VRS_HALT = 0, + VRS_INIT = (1 << 0), + VRS_RUN = (1 << 1), + + VRS_PEND_INIT = (1 << 14), + VRS_PEND_SIPI = (1 << 15), +}; +#define VRS_MASK_VALID(v) \ + ((v) & (VRS_INIT | VRS_RUN | VRS_PEND_SIPI | VRS_PEND_SIPI)) +#define VRS_IS_VALID(v) ((v) == VRS_MASK_VALID(v)) + struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ @@ -348,10 +360,6 @@ struct vm_exit { uint64_t wval; } msr; struct { - int vcpu; - uint64_t rip; - } spinup_ap; - struct { uint64_t rflags; } hlt; struct { @@ -367,8 +375,8 @@ struct vm_exit { enum vm_entry_cmds { VEC_DEFAULT = 0, VEC_DISCARD_INSTR, /* discard inst emul 
state */ - VEC_COMPLETE_MMIO, /* entry includes result for mmio emul */ - VEC_COMPLETE_INOUT, /* entry includes result for inout emul */ + VEC_FULFILL_MMIO, /* entry includes result for mmio emul */ + VEC_FULFILL_INOUT, /* entry includes result for inout emul */ }; struct vm_entry { diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h index c894c6aeb0..f5d031bfd4 100644 --- a/usr/src/uts/i86pc/sys/vmm_dev.h +++ b/usr/src/uts/i86pc/sys/vmm_dev.h @@ -259,6 +259,28 @@ struct vm_readwrite_kernemu_device { }; _Static_assert(sizeof(struct vm_readwrite_kernemu_device) == 24, "ABI"); +enum vcpu_reset_kind { + VRK_RESET = 0, + /* + * The reset performed by an INIT IPI clears much of the CPU state, but + * some portions are left untouched, unlike VRK_RESET, which represents + * a "full" reset as if the system was freshly powered on. + */ + VRK_INIT = 1, +}; + +struct vm_vcpu_reset { + int vcpuid; + uint32_t kind; /* contains: enum vcpu_reset_kind */ +}; + +struct vm_run_state { + int vcpuid; + uint32_t state; /* of enum cpu_init_status type */ + uint8_t sipi_vector; /* vector of SIPI, if any */ + uint8_t _pad[3]; +}; + #define VMMCTL_IOC_BASE (('V' << 16) | ('M' << 8)) #define VMM_IOC_BASE (('v' << 16) | ('m' << 8)) #define VMM_LOCK_IOC_BASE (('v' << 16) | ('l' << 8)) @@ -291,6 +313,9 @@ _Static_assert(sizeof(struct vm_readwrite_kernemu_device) == 24, "ABI"); #define VM_RESTART_INSTRUCTION (VMM_CPU_IOC_BASE | 0x13) #define VM_SET_KERNEMU_DEV (VMM_CPU_IOC_BASE | 0x14) #define VM_GET_KERNEMU_DEV (VMM_CPU_IOC_BASE | 0x15) +#define VM_RESET_CPU (VMM_CPU_IOC_BASE | 0x16) +#define VM_GET_RUN_STATE (VMM_CPU_IOC_BASE | 0x17) +#define VM_SET_RUN_STATE (VMM_CPU_IOC_BASE | 0x18) /* Operations requiring write-locking the VM */ #define VM_REINIT (VMM_LOCK_IOC_BASE | 0x01) |
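
The run state introduced above is also queryable from userspace through the new vm_get_run_state() wrapper (or the VM_GET_RUN_STATE ioctl directly), so a tool outside bhyve can observe where an AP sits in its INIT/SIPI handshake. A minimal sketch follows; the VM name "testvm", the include paths, and the reporting format are assumptions, not part of this commit.

	#include <stdio.h>
	#include <machine/vmm.h>	/* enum vcpu_run_state (VRS_*); path assumed */
	#include <vmmapi.h>

	int
	main(void)
	{
		struct vmctx *ctx;
		enum vcpu_run_state state;
		uint8_t sipi_vec;

		/* "testvm" stands in for any existing bhyve instance. */
		if ((ctx = vm_open("testvm")) == NULL) {
			perror("vm_open");
			return (1);
		}

		/* Inspect vCPU 1; vm_get_run_state() returns an errno value. */
		if (vm_get_run_state(ctx, 1, &state, &sipi_vec) != 0) {
			fprintf(stderr, "vm_get_run_state failed\n");
			return (1);
		}

		if (state & VRS_RUN)
			printf("vcpu1 is running\n");
		else if (state & VRS_PEND_SIPI)
			printf("vcpu1 has SIPI vector %#x pending\n", sipi_vec);
		else if (state & VRS_INIT)
			printf("vcpu1 is waiting for SIPI\n");
		else
			printf("vcpu1 is halted, waiting for INIT\n");

		return (0);
	}
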