| author | Patrick Mooney <pmooney@pfmooney.com> | 2020-07-18 19:49:51 +0000 |
|---|---|---|
| committer | Patrick Mooney <pmooney@oxide.computer> | 2020-08-20 19:19:51 +0000 |
| commit | e0c0d44e917080841514d0dd031a696c74e8c435 (patch) | |
| tree | 232fc454b760fd2fdced128fec0e84b065e4f025 /usr/src | |
| parent | 76f19f5fdc974fe5be5c82a556e43a4df93f1de1 (diff) | |
| download | illumos-joyent-e0c0d44e917080841514d0dd031a696c74e8c435.tar.gz | |
12989 improve interface boundary for bhyve MMIO
12990 improve interface boundary for bhyve ins/outs
12991 bhyve vlapic should SIPI more carefully
Reviewed by: Mike Zeller <mike.zeller@joyent.com>
Reviewed by: Joshua M. Clulow <josh@sysmgr.org>
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Gordon Ross <gordon.w.ross@gmail.com>
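For issue 12991, the patch replaces the old `VM_EXITCODE_SPINUP_AP` round-trip through userspace: `vlapic_icrlo_write_handler()` now calls `vm_req_spinup_ap()` instead of building a vmexit. The body of `vm_req_spinup_ap()` falls outside this excerpt; a plausible minimal shape, inferred from the `sipi_req` fields and the `sipi_supersede` counter the patch adds to `struct vm`, might look like the following (hypothetical, not the committed implementation):

```c
/*
 * Hypothetical sketch only: vm_req_spinup_ap() is declared in
 * vmm_kernel.h by this patch, but its definition is not visible in
 * this diff excerpt.  The shape below is inferred from the sipi_req
 * fields added to struct vm.
 */
void
vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip)
{
	/* assumption: caller-side locking covers the (i)-tagged fields */
	if (vm->sipi_req) {
		/* a still-pending SIPI request is superseded by this one */
		vm->stats.sipi_supersede++;
	}
	vm->sipi_req = true;
	vm->sipi_req_vcpu = req_vcpuid;
	vm->sipi_req_rip = req_rip;
}
```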
Diffstat (limited to 'usr/src')
32 files changed, 1897 insertions, 1168 deletions
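The heart of issues 12989/12990 is a reworked interface boundary: `vm_run()` now takes a `struct vm_entry` alongside the `struct vm_exit`, so userspace completes an MMIO or in/out operation by filling in the entry payload (`VEC_COMPLETE_MMIO` / `VEC_COMPLETE_INOUT`) before the next `vm_run()` call, rather than the kernel parking decoded instruction state in the exit structure. The following is a condensed sketch of that handshake, modeled on the `bhyverun.c` hunks below; the handler table, statistics, and error reporting are stripped out, and the device emulation itself is reduced to comments, so treat it as an illustration rather than the committed code:

```c
#include <strings.h>
#include <machine/vmm.h>
#include <vmmapi.h>

/*
 * Condensed sketch of the reworked run loop: submit any pending
 * completion via vm_entry, run the vCPU, then translate the exit
 * into the next completion.
 */
static void
run_vcpu(struct vmctx *ctx, int vcpu)
{
	struct vm_entry entry;
	struct vm_exit vexit;

	/* An all-zero entry carries VEC_DEFAULT: nothing to complete. */
	bzero(&entry, sizeof (entry));

	for (;;) {
		if (vm_run(ctx, vcpu, &entry, &vexit) != 0)
			break;

		/* Entry state was consumed by vm_run(); start fresh. */
		bzero(&entry, sizeof (entry));

		switch (vexit.exitcode) {
		case VM_EXITCODE_INOUT:
			/* ...emulate the port access, updating eax on IN... */
			entry.cmd = VEC_COMPLETE_INOUT;
			entry.u.inout = vexit.u.inout;
			break;
		case VM_EXITCODE_MMIO:
			/* ...emulate the access, filling data on a read... */
			entry.cmd = VEC_COMPLETE_MMIO;
			entry.u.mmio = vexit.u.mmio;
			break;
		default:
			return;	/* unhandled exit */
		}
	}
}
```

Compare this with the real `vm_loop()` in the diff, which routes each exit through a handler table and rebuilds the completion payload field by field via the `vmentry_inout_*()` and `vmentry_mmio_*()` helpers.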
diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile index 348a8988fe..7126fdda17 100644 --- a/usr/src/cmd/bhyve/Makefile +++ b/usr/src/cmd/bhyve/Makefile @@ -12,6 +12,7 @@ # # Copyright 2014 Pluribus Networks Inc. # Copyright 2019 Joyent, Inc. +# Copyright 2020 Oxide Computer Company # PROG = bhyve @@ -74,7 +75,6 @@ SRCS = acpi.c \ usb_mouse.c \ vga.c \ virtio.c \ - vmm_instruction_emul.c \ vmgenc.c \ xmsr.c \ spinup_ap.c \ @@ -161,7 +161,3 @@ $(SUBDIRS): FRC @cd $@; pwd; $(MAKE) $(TARGET) FRC: - -%.o: $(SRC)/uts/i86pc/io/vmm/%.c - $(COMPILE.c) $< - $(POST_PROCESS_O) diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c index fbc9fab6b1..d2a4032682 100644 --- a/usr/src/cmd/bhyve/bhyverun.c +++ b/usr/src/cmd/bhyve/bhyverun.c @@ -213,6 +213,7 @@ static cpuset_t cpumask; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); static struct vm_exit vmexit[VM_MAXCPU]; +static struct vm_entry vmentry[VM_MAXCPU]; struct bhyvestats { uint64_t vmexit_bogus; @@ -220,15 +221,18 @@ struct bhyvestats { uint64_t vmexit_hlt; uint64_t vmexit_pause; uint64_t vmexit_mtrap; - uint64_t vmexit_inst_emul; + uint64_t vmexit_mmio; + uint64_t vmexit_inout; uint64_t cpu_switch_rotate; uint64_t cpu_switch_direct; + uint64_t mmio_unhandled; } stats; struct mt_vmm_info { pthread_t mt_thr; struct vmctx *mt_ctx; - int mt_vcpu; + int mt_vcpu; + uint64_t mt_startrip; } mt_vmm_info[VM_MAXCPU]; #ifdef __FreeBSD__ @@ -498,7 +502,7 @@ fbsdrun_start_thread(void *param) if (gdb_port != 0) gdb_cpu_add(vcpu); - vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); + vm_loop(mtp->mt_ctx, vcpu, mtp->mt_startrip); /* not reached */ exit(1); @@ -539,11 +543,9 @@ fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip, * Set up the vmexit struct to allow execution to start * at the given RIP */ - vmexit[newcpu].rip = rip; - vmexit[newcpu].inst_length = 0; - mt_vmm_info[newcpu].mt_ctx = ctx; mt_vmm_info[newcpu].mt_vcpu = newcpu; + mt_vmm_info[newcpu].mt_startrip = rip; error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL, fbsdrun_start_thread, &mt_vmm_info[newcpu]); @@ -563,6 +565,66 @@ fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) return (CPU_EMPTY(&cpumask)); } +static void +vmentry_mmio_read(int vcpu, uint64_t gpa, uint8_t bytes, uint64_t data) +{ + struct vm_entry *entry = &vmentry[vcpu]; + struct vm_mmio *mmio = &entry->u.mmio; + + assert(entry->cmd == VEC_DEFAULT); + + entry->cmd = VEC_COMPLETE_MMIO; + mmio->bytes = bytes; + mmio->read = 1; + mmio->gpa = gpa; + mmio->data = data; +} + +static void +vmentry_mmio_write(int vcpu, uint64_t gpa, uint8_t bytes) +{ + struct vm_entry *entry = &vmentry[vcpu]; + struct vm_mmio *mmio = &entry->u.mmio; + + assert(entry->cmd == VEC_DEFAULT); + + entry->cmd = VEC_COMPLETE_MMIO; + mmio->bytes = bytes; + mmio->read = 0; + mmio->gpa = gpa; + mmio->data = 0; +} + +static void +vmentry_inout_read(int vcpu, uint16_t port, uint8_t bytes, uint32_t data) +{ + struct vm_entry *entry = &vmentry[vcpu]; + struct vm_inout *inout = &entry->u.inout; + + assert(entry->cmd == VEC_DEFAULT); + + entry->cmd = VEC_COMPLETE_INOUT; + inout->bytes = bytes; + inout->flags = INOUT_IN; + inout->port = port; + inout->eax = data; +} + +static void +vmentry_inout_write(int vcpu, uint16_t port, uint8_t bytes) +{ + struct vm_entry *entry = &vmentry[vcpu]; + struct vm_inout *inout = &entry->u.inout; + + assert(entry->cmd == VEC_DEFAULT); + + entry->cmd = VEC_COMPLETE_INOUT; + inout->bytes = bytes; + inout->flags = 0; + inout->port = port; + inout->eax = 0; +} + static 
int vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, uint32_t eax) @@ -579,30 +641,42 @@ static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; - int bytes, port, in, out; int vcpu; + struct vm_inout inout; + bool in; + uint8_t bytes; - vcpu = *pvcpu; + stats.vmexit_inout++; - port = vme->u.inout.port; - bytes = vme->u.inout.bytes; - in = vme->u.inout.in; - out = !in; + vcpu = *pvcpu; + inout = vme->u.inout; + in = (inout.flags & INOUT_IN) != 0; + bytes = inout.bytes; /* Extra-special case of host notifications */ - if (out && port == GUEST_NIO_PORT) { - error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax); + if (!in && inout.port == GUEST_NIO_PORT) { + error = vmexit_handle_notify(ctx, vme, pvcpu, inout.eax); + vmentry_inout_write(vcpu, inout.port, bytes); return (error); } - error = emulate_inout(ctx, vcpu, vme, strictio); + error = emulate_inout(ctx, vcpu, &inout, strictio != 0); if (error) { fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n", in ? "in" : "out", bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'), - port, vmexit->rip); + inout.port, vmexit->rip); return (VMEXIT_ABORT); } else { + /* + * Communicate the status of the inout operation back to the + * in-kernel instruction emulation. + */ + if (in) { + vmentry_inout_read(vcpu, inout.port, bytes, inout.eax); + } else { + vmentry_inout_write(vcpu, inout.port, bytes); + } return (VMEXIT_CONTINUE); } } @@ -792,29 +866,70 @@ vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) static int vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { - int err, i; - struct vie *vie; + uint8_t i, valid; + + fprintf(stderr, "Failed to emulate instruction sequence "); + + valid = vmexit->u.inst_emul.num_valid; + if (valid != 0) { + assert(valid <= sizeof (vmexit->u.inst_emul.inst)); + fprintf(stderr, "["); + for (i = 0; i < valid; i++) { + if (i == 0) { + fprintf(stderr, "%02x", + vmexit->u.inst_emul.inst[i]); + } else { + fprintf(stderr, ", %02x", + vmexit->u.inst_emul.inst[i]); + } + } + fprintf(stderr, "] "); + } + fprintf(stderr, "@ %rip = %x\n", vmexit->rip); - stats.vmexit_inst_emul++; + return (VMEXIT_ABORT); +} - vie = &vmexit->u.inst_emul.vie; - err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa, - vie, &vmexit->u.inst_emul.paging); +static int +vmexit_mmio(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + int vcpu, err; + struct vm_mmio mmio; + bool is_read; - if (err) { - if (err == ESRCH) { - EPRINTLN("Unhandled memory access to 0x%lx\n", - vmexit->u.inst_emul.gpa); - } + stats.vmexit_mmio++; - fprintf(stderr, "Failed to emulate instruction sequence [ "); - for (i = 0; i < vie->num_valid; i++) - fprintf(stderr, "%02x", vie->inst[i]); - FPRINTLN(stderr, " ] at 0x%lx", vmexit->rip); - return (VMEXIT_ABORT); + vcpu = *pvcpu; + mmio = vmexit->u.mmio; + is_read = (mmio.read != 0); + + err = emulate_mem(ctx, vcpu, &mmio); + + if (err == ESRCH) { + fprintf(stderr, "Unhandled memory access to 0x%lx\n", mmio.gpa); + stats.mmio_unhandled++; + + /* + * Access to non-existent physical addresses is not likely to + * result in fatal errors on hardware machines, but rather reads + * of all-ones or discarded-but-acknowledged writes. 
+ */ + mmio.data = ~0UL; + err = 0; } - return (VMEXIT_CONTINUE); + if (err == 0) { + if (is_read) { + vmentry_mmio_read(vcpu, mmio.gpa, mmio.bytes, + mmio.data); + } else { + vmentry_mmio_write(vcpu, mmio.gpa, mmio.bytes); + } + return (VMEXIT_CONTINUE); + } + + fprintf(stderr, "Unhandled mmio error to 0x%lx: %d\n", mmio.gpa, err); + return (VMEXIT_ABORT); } static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; @@ -884,7 +999,7 @@ vmexit_breakpoint(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, - [VM_EXITCODE_INOUT_STR] = vmexit_inout, + [VM_EXITCODE_MMIO] = vmexit_mmio, [VM_EXITCODE_VMX] = vmexit_vmx, [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, @@ -906,6 +1021,8 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) int error, rc; enum vm_exitcode exitcode; cpuset_t active_cpus; + struct vm_exit *vexit; + struct vm_entry *ventry; #ifdef __FreeBSD__ if (vcpumap[vcpu] != NULL) { @@ -920,19 +1037,30 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); assert(error == 0); + ventry = &vmentry[vcpu]; + vexit = &vmexit[vcpu]; + while (1) { - error = vm_run(ctx, vcpu, &vmexit[vcpu]); + error = vm_run(ctx, vcpu, ventry, vexit); if (error != 0) break; - exitcode = vmexit[vcpu].exitcode; + if (ventry->cmd != VEC_DEFAULT) { + /* + * Discard any lingering entry state after it has been + * submitted via vm_run(). + */ + bzero(ventry, sizeof (*ventry)); + } + + exitcode = vexit->exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); exit(4); } - rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); + rc = (*handler[exitcode])(ctx, vexit, &vcpu); switch (rc) { case VMEXIT_CONTINUE: diff --git a/usr/src/cmd/bhyve/inout.c b/usr/src/cmd/bhyve/inout.c index b460ee2988..27068023d3 100644 --- a/usr/src/cmd/bhyve/inout.c +++ b/usr/src/cmd/bhyve/inout.c @@ -27,6 +27,18 @@ * * $FreeBSD$ */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ * + * Copyright 2020 Oxide Computer Company + */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); @@ -40,7 +52,6 @@ __FBSDID("$FreeBSD$"); #include <x86/segments.h> #include <machine/vmm.h> -#include <machine/vmm_instruction_emul.h> #include <vmmapi.h> #include <stdio.h> @@ -57,12 +68,14 @@ SET_DECLARE(inout_port_set, struct inout_port); #define VERIFY_IOPORT(port, size) \ assert((port) >= 0 && (size) > 0 && ((port) + (size)) <= MAX_IOPORTS) -static struct { +struct inout_handler { const char *name; int flags; inout_func_t handler; void *arg; -} inout_handlers[MAX_IOPORTS]; +}; + +static struct inout_handler inout_handlers[MAX_IOPORTS]; static int default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes, @@ -85,11 +98,11 @@ default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes, return (0); } -static void +static void register_default_iohandler(int start, int size) { struct inout_port iop; - + VERIFY_IOPORT(start, size); bzero(&iop, sizeof(iop)); @@ -103,136 +116,37 @@ register_default_iohandler(int start, int size) } int -emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) +emulate_inout(struct vmctx *ctx, int vcpu, struct vm_inout *inout, bool strict) { - int addrsize, bytes, flags, in, port, prot, rep; - uint32_t eax, val; - inout_func_t handler; - void *arg; - int error, fault, retval; - enum vm_reg_name idxreg; - uint64_t gla, index, iterations, count; - struct vm_inout_str *vis; - struct iovec iov[2]; - - bytes = vmexit->u.inout.bytes; - in = vmexit->u.inout.in; - port = vmexit->u.inout.port; - - assert(port < MAX_IOPORTS); + struct inout_handler handler; + inout_func_t hfunc; + void *harg; + int error; + uint8_t bytes; + bool in; + + bytes = inout->bytes; + in = (inout->flags & INOUT_IN) != 0; + assert(bytes == 1 || bytes == 2 || bytes == 4); - handler = inout_handlers[port].handler; + handler = inout_handlers[inout->port]; + hfunc = handler.handler; + harg = handler.arg; - if (strict && handler == default_inout) + if (strict && hfunc == default_inout) return (-1); - flags = inout_handlers[port].flags; - arg = inout_handlers[port].arg; - if (in) { - if (!(flags & IOPORT_F_IN)) + if (!(handler.flags & IOPORT_F_IN)) return (-1); } else { - if (!(flags & IOPORT_F_OUT)) + if (!(handler.flags & IOPORT_F_OUT)) return (-1); } - retval = 0; - if (vmexit->u.inout.string) { - vis = &vmexit->u.inout_str; - rep = vis->inout.rep; - addrsize = vis->addrsize; - prot = in ? PROT_WRITE : PROT_READ; - assert(addrsize == 2 || addrsize == 4 || addrsize == 8); - - /* Index register */ - idxreg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; - index = vis->index & vie_size2mask(addrsize); - - /* Count register */ - count = vis->count & vie_size2mask(addrsize); - - /* Limit number of back-to-back in/out emulations to 16 */ - iterations = MIN(count, 16); - while (iterations > 0) { - assert(retval == 0); - if (vie_calculate_gla(vis->paging.cpu_mode, - vis->seg_name, &vis->seg_desc, index, bytes, - addrsize, prot, &gla)) { - vm_inject_gp(ctx, vcpu); - break; - } - - error = vm_copy_setup(ctx, vcpu, &vis->paging, gla, - bytes, prot, iov, nitems(iov), &fault); - if (error) { - retval = -1; /* Unrecoverable error */ - break; - } else if (fault) { - retval = 0; /* Resume guest to handle fault */ - break; - } - - if (vie_alignment_check(vis->paging.cpl, bytes, - vis->cr0, vis->rflags, gla)) { - vm_inject_ac(ctx, vcpu, 0); - break; - } - - val = 0; - if (!in) - vm_copyin(ctx, vcpu, iov, &val, bytes); - - retval = handler(ctx, vcpu, in, port, bytes, &val, arg); - if (retval != 0) - break; - - if (in) - vm_copyout(ctx, vcpu, &val, iov, bytes); - - /* Update index */ - if (vis->rflags & PSL_D) - index -= bytes; - else - index += bytes; - - count--; - iterations--; - } - - /* Update index register */ - error = vie_update_register(ctx, vcpu, idxreg, index, addrsize); - assert(error == 0); - - /* - * Update count register only if the instruction had a repeat - * prefix. - */ - if (rep) { - error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX, - count, addrsize); - assert(error == 0); - } - - /* Restart the instruction if more iterations remain */ - if (retval == 0 && count != 0) { - error = vm_restart_instruction(ctx, vcpu); - assert(error == 0); - } - } else { - eax = vmexit->u.inout.eax; - val = eax & vie_size2mask(bytes); - retval = handler(ctx, vcpu, in, port, bytes, &val, arg); - if (retval == 0 && in) { - eax &= ~vie_size2mask(bytes); - eax |= val & vie_size2mask(bytes); - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, - eax); - assert(error == 0); - } - } - return (retval); + error = hfunc(ctx, vcpu, in, inout->port, bytes, &inout->eax, harg); + return (error); } void diff --git a/usr/src/cmd/bhyve/inout.h b/usr/src/cmd/bhyve/inout.h index b72ee5d93e..b026e18e92 100644 --- a/usr/src/cmd/bhyve/inout.h +++ b/usr/src/cmd/bhyve/inout.h @@ -47,6 +47,7 @@ struct vmctx; struct vm_exit; +struct vm_inout; /* * inout emulation handlers return 0 on success and -1 on failure. @@ -82,10 +83,10 @@ struct inout_port { 0 \ }; \ DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__)) - + void init_inout(void); -int emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit, - int strict); +int emulate_inout(struct vmctx *, int vcpu, struct vm_inout *inout, + bool strict); int register_inout(struct inout_port *iop); int unregister_inout(struct inout_port *iop); void init_bvmcons(void); diff --git a/usr/src/cmd/bhyve/mem.c b/usr/src/cmd/bhyve/mem.c index 90aefe45c8..1afc8bf5f0 100644 --- a/usr/src/cmd/bhyve/mem.c +++ b/usr/src/cmd/bhyve/mem.c @@ -27,6 +27,18 @@ * * $FreeBSD$ */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2020 Oxide Computer Company + */ /* * Memory ranges are represented with an RB tree. 
On insertion, the range @@ -41,7 +53,6 @@ __FBSDID("$FreeBSD$"); #include <sys/errno.h> #include <sys/tree.h> #include <machine/vmm.h> -#include <machine/vmm_instruction_emul.h> #include <assert.h> #include <err.h> @@ -96,7 +107,7 @@ mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr, *entry = res; return (0); } - + return (ENOENT); } @@ -170,7 +181,7 @@ access_memory(struct vmctx *ctx, int vcpu, uint64_t paddr, mem_cb_t *cb, { struct mmio_rb_range *entry; int err, perror, immutable; - + pthread_rwlock_rdlock(&mmio_rwlock); /* * First check the per-vCPU cache @@ -185,7 +196,7 @@ access_memory(struct vmctx *ctx, int vcpu, uint64_t paddr, mem_cb_t *cb, if (entry == NULL) { if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) { /* Update the per-vCPU cache */ - mmio_hint[vcpu] = entry; + mmio_hint[vcpu] = entry; } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) { perror = pthread_rwlock_unlock(&mmio_rwlock); assert(perror == 0); @@ -223,32 +234,28 @@ access_memory(struct vmctx *ctx, int vcpu, uint64_t paddr, mem_cb_t *cb, return (err); } -struct emulate_mem_args { - struct vie *vie; - struct vm_guest_paging *paging; -}; - static int emulate_mem_cb(struct vmctx *ctx, int vcpu, uint64_t paddr, struct mem_range *mr, void *arg) { - struct emulate_mem_args *ema; + struct vm_mmio *mmio; + int err = 0; + + mmio = arg; - ema = arg; - return (vmm_emulate_instruction(ctx, vcpu, paddr, ema->vie, ema->paging, - mem_read, mem_write, mr)); + if (mmio->read != 0) { + err = mem_read(ctx, vcpu, paddr, &mmio->data, mmio->bytes, mr); + } else { + err = mem_write(ctx, vcpu, paddr, mmio->data, mmio->bytes, mr); + } + + return (err); } int -emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, - struct vm_guest_paging *paging) - +emulate_mem(struct vmctx *ctx, int vcpu, struct vm_mmio *mmio) { - struct emulate_mem_args ema; - - ema.vie = vie; - ema.paging = paging; - return (access_memory(ctx, vcpu, paddr, emulate_mem_cb, &ema)); + return (access_memory(ctx, vcpu, mmio->gpa, emulate_mem_cb, mmio)); } struct rw_mem_args { @@ -333,23 +340,23 @@ register_mem_fallback(struct mem_range *memp) return (register_mem_int(&mmio_rb_fallback, memp)); } -int +int unregister_mem(struct mem_range *memp) { struct mem_range *mr; struct mmio_rb_range *entry = NULL; int err, perror, i; - + pthread_rwlock_wrlock(&mmio_rwlock); err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry); if (err == 0) { mr = &entry->mr_param; assert(mr->name == memp->name); - assert(mr->base == memp->base && mr->size == memp->size); + assert(mr->base == memp->base && mr->size == memp->size); assert((mr->flags & MEM_F_IMMUTABLE) == 0); RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry); - /* flush Per-vCPU cache */ + /* flush Per-vCPU cache */ for (i=0; i < VM_MAXCPU; i++) { if (mmio_hint[i] == entry) mmio_hint[i] = NULL; @@ -360,7 +367,7 @@ unregister_mem(struct mem_range *memp) if (entry) free(entry); - + return (err); } diff --git a/usr/src/cmd/bhyve/mem.h b/usr/src/cmd/bhyve/mem.h index 38d773c43f..8b81b93a02 100644 --- a/usr/src/cmd/bhyve/mem.h +++ b/usr/src/cmd/bhyve/mem.h @@ -53,8 +53,8 @@ struct mem_range { #define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */ void init_mem(void); -int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, - struct vm_guest_paging *paging); + +int emulate_mem(struct vmctx *ctx, int vcpu, struct vm_mmio *mmio); int read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size); diff --git a/usr/src/cmd/bhyve/task_switch.c 
b/usr/src/cmd/bhyve/task_switch.c index f1b564d560..c4a087b54f 100644 --- a/usr/src/cmd/bhyve/task_switch.c +++ b/usr/src/cmd/bhyve/task_switch.c @@ -25,6 +25,18 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2020 Oxide Computer Company + */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); @@ -37,7 +49,6 @@ __FBSDID("$FreeBSD$"); #include <x86/segments.h> #include <x86/specialreg.h> #include <machine/vmm.h> -#include <machine/vmm_instruction_emul.h> #include <assert.h> #include <errno.h> @@ -618,6 +629,150 @@ tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, return (0); } + +/* + * Copy of vie_alignment_check() from vmm_instruction_emul.c + */ +static int +alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla) +{ + assert(size == 1 || size == 2 || size == 4 || size == 8); + assert(cpl >= 0 && cpl <= 3); + + if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0) + return (0); + + return ((gla & (size - 1)) ? 1 : 0); +} + +/* + * Copy of vie_size2mask() from vmm_instruction_emul.c + */ +static uint64_t +size2mask(int size) +{ + switch (size) { + case 1: + return (0xff); + case 2: + return (0xffff); + case 4: + return (0xffffffff); + case 8: + return (0xffffffffffffffff); + default: + assert(0); + /* not reached */ + return (0); + } +} + +/* + * Copy of vie_calculate_gla() from vmm_instruction_emul.c + */ +static int +calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t offset, int length, int addrsize, + int prot, uint64_t *gla) +{ + uint64_t firstoff, low_limit, high_limit, segbase; + int glasize, type; + + assert(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS); + assert((length == 1 || length == 2 || length == 4 || length == 8)); + assert((prot & ~(PROT_READ | PROT_WRITE)) == 0); + + firstoff = offset; + if (cpu_mode == CPU_MODE_64BIT) { + assert(addrsize == 4 || addrsize == 8); + glasize = 8; + } else { + assert(addrsize == 2 || addrsize == 4); + glasize = 4; + /* + * If the segment selector is loaded with a NULL selector + * then the descriptor is unusable and attempting to use + * it results in a #GP(0). + */ + if (SEG_DESC_UNUSABLE(desc->access)) + return (-1); + + /* + * The processor generates a #NP exception when a segment + * register is loaded with a selector that points to a + * descriptor that is not present. If this was the case then + * it would have been checked before the VM-exit. + */ + assert(SEG_DESC_PRESENT(desc->access)); + + /* + * The descriptor type must indicate a code/data segment. + */ + type = SEG_DESC_TYPE(desc->access); + assert(type >= 16 && type <= 31); + + if (prot & PROT_READ) { + /* #GP on a read access to a exec-only code segment */ + if ((type & 0xA) == 0x8) + return (-1); + } + + if (prot & PROT_WRITE) { + /* + * #GP on a write access to a code segment or a + * read-only data segment. + */ + if (type & 0x8) /* code segment */ + return (-1); + + if ((type & 0xA) == 0) /* read-only data seg */ + return (-1); + } + + /* + * 'desc->limit' is fully expanded taking granularity into + * account. 
+ */ + if ((type & 0xC) == 0x4) { + /* expand-down data segment */ + low_limit = desc->limit + 1; + high_limit = SEG_DESC_DEF32(desc->access) ? + 0xffffffff : 0xffff; + } else { + /* code segment or expand-up data segment */ + low_limit = 0; + high_limit = desc->limit; + } + + while (length > 0) { + offset &= size2mask(addrsize); + if (offset < low_limit || offset > high_limit) + return (-1); + offset++; + length--; + } + } + + /* + * In 64-bit mode all segments except %fs and %gs have a segment + * base address of 0. + */ + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + segbase = desc->base; + } + + /* + * Truncate 'firstoff' to the effective address size before adding + * it to the segment base. + */ + firstoff &= size2mask(addrsize); + *gla = (segbase + firstoff) & size2mask(glasize); + return (0); +} + /* * Push an error code on the stack of the new task. This is needed if the * task switch was triggered by a hardware exception that causes an error @@ -667,14 +822,14 @@ push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); esp -= bytes; - if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, + if (calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { sel_exception(ctx, vcpu, IDT_SS, stacksel, 1); *faultptr = 1; return (0); } - if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { + if (alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { vm_inject_ac(ctx, vcpu, 1); *faultptr = 1; return (0); diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c index 5299791091..22c72cf5df 100644 --- a/usr/src/cmd/bhyvectl/bhyvectl.c +++ b/usr/src/cmd/bhyvectl/bhyvectl.c @@ -36,11 +36,10 @@ * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. - */ - -/* + * * Copyright 2015 Pluribus Networks Inc. * Copyright 2019 Joyent, Inc. + * Copyright 2020 Oxide Computer Company */ #include <sys/cdefs.h> @@ -358,14 +357,20 @@ dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) switch (vmexit->exitcode) { case VM_EXITCODE_INOUT: printf("\treason\t\tINOUT\n"); - printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT"); + printf("\tdirection\t%s\n", + (vmexit->u.inout.flags & INOUT_IN) ? "IN" : "OUT"); printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes); - printf("\tflags\t\t%s%s\n", - vmexit->u.inout.string ? "STRING " : "", - vmexit->u.inout.rep ? "REP " : ""); printf("\tport\t\t0x%04x\n", vmexit->u.inout.port); printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax); break; + case VM_EXITCODE_MMIO: + printf("\treason\t\tMMIO\n"); + printf("\toperation\t%s\n", + vmexit->u.mmio.read ? 
"READ" : "WRITE"); + printf("\tbytes\t\t%d\n", vmexit->u.mmio.bytes); + printf("\tgpa\t\t0x%08x\n", vmexit->u.mmio.gpa); + printf("\tdata\t\t0x%08x\n", vmexit->u.mmio.data); + break; case VM_EXITCODE_VMX: printf("\treason\t\tVMX\n"); printf("\tstatus\t\t%d\n", vmexit->u.vmx.status); @@ -2366,7 +2371,11 @@ main(int argc, char *argv[]) } if (!error && run) { - error = vm_run(ctx, vcpu, &vmexit); + struct vm_entry entry; + + bzero(&entry, sizeof (entry)); + + error = vm_run(ctx, vcpu, &entry, &vmexit); if (error == 0) dump_vm_run_exitcode(&vmexit, vcpu); else diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c index 7d3446a845..6d5145431e 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.c +++ b/usr/src/lib/libvmmapi/common/vmmapi.c @@ -772,17 +772,16 @@ vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, } int -vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) +vm_run(struct vmctx *ctx, int vcpu, const struct vm_entry *vm_entry, + struct vm_exit *vm_exit) { - int error; - struct vm_run vmrun; + struct vm_entry entry; - bzero(&vmrun, sizeof(vmrun)); - vmrun.cpuid = vcpu; + bcopy(vm_entry, &entry, sizeof (entry)); + entry.cpuid = vcpu; + entry.exit_data = vm_exit; - error = ioctl(ctx->fd, VM_RUN, &vmrun); - bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit)); - return (error); + return (ioctl(ctx->fd, VM_RUN, &entry)); } int diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h index 997267b8cc..4656f417b4 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.h +++ b/usr/src/lib/libvmmapi/common/vmmapi.h @@ -165,7 +165,8 @@ int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals); int vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, const int *regnums, uint64_t *regvals); -int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); +int vm_run(struct vmctx *ctx, int vcpu, const struct vm_entry *vm_entry, + struct vm_exit *vm_exit); int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c index 615d3cd029..e78d401e68 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/svm.c +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -68,7 +68,7 @@ __FBSDID("$FreeBSD$"); #include <machine/smp.h> #include <machine/vmm.h> #include <machine/vmm_dev.h> -#include <machine/vmm_instruction_emul.h> +#include <sys/vmm_instruction_emul.h> #include "vmm_lapic.h" #include "vmm_stat.h" @@ -717,61 +717,6 @@ svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) /* * ins/outs utility routines */ -static uint64_t -svm_inout_str_index(struct svm_regctx *regs, int in) -{ - uint64_t val; - - val = in ? regs->sctx_rdi : regs->sctx_rsi; - - return (val); -} - -static uint64_t -svm_inout_str_count(struct svm_regctx *regs, int rep) -{ - uint64_t val; - - val = rep ? 
regs->sctx_rcx : 1; - - return (val); -} - -static void -svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1, - int in, struct vm_inout_str *vis) -{ - int error, s; - - if (in) { - vis->seg_name = VM_REG_GUEST_ES; - } else { - /* The segment field has standard encoding */ - s = (info1 >> 10) & 0x7; - vis->seg_name = vm_segment_name(s); - } - - error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc); - KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error)); -} - -static int -svm_inout_str_addrsize(uint64_t info1) -{ - uint32_t size; - - size = (info1 >> 7) & 0x7; - switch (size) { - case 1: - return (2); /* 16 bit */ - case 2: - return (4); /* 32 bit */ - case 4: - return (8); /* 64 bit */ - default: - panic("%s: invalid size encoding %d", __func__, size); - } -} static void svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) @@ -792,53 +737,78 @@ svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) * Handle guest I/O intercept. */ static int -svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) +svm_handle_inout(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) { struct vmcb_ctrl *ctrl; struct vmcb_state *state; - struct svm_regctx *regs; - struct vm_inout_str *vis; + struct vm_inout *inout; + struct vie *vie; uint64_t info1; - int inout_string; + struct vm_guest_paging paging; state = svm_get_vmcb_state(svm_sc, vcpu); - ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); - regs = svm_get_guest_regctx(svm_sc, vcpu); - + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + inout = &vmexit->u.inout; info1 = ctrl->exitinfo1; - inout_string = info1 & BIT(2) ? 1 : 0; - /* - * The effective segment number in EXITINFO1[12:10] is populated - * only if the processor has the DecodeAssist capability. - * - * XXX this is not specified explicitly in APMv2 but can be verified - * empirically. - */ - if (inout_string && !decode_assist()) - return (UNHANDLED); - - vmexit->exitcode = VM_EXITCODE_INOUT; - vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0; - vmexit->u.inout.string = inout_string; - vmexit->u.inout.rep = (info1 & BIT(3)) ? 1 : 0; - vmexit->u.inout.bytes = (info1 >> 4) & 0x7; - vmexit->u.inout.port = (uint16_t)(info1 >> 16); - vmexit->u.inout.eax = (uint32_t)(state->rax); - - if (inout_string) { - vmexit->exitcode = VM_EXITCODE_INOUT_STR; - vis = &vmexit->u.inout_str; - svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging); - vis->rflags = state->rflags; - vis->cr0 = state->cr0; - vis->index = svm_inout_str_index(regs, vmexit->u.inout.in); - vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep); - vis->addrsize = svm_inout_str_addrsize(info1); - svm_inout_str_seginfo(svm_sc, vcpu, info1, - vmexit->u.inout.in, vis); + inout->bytes = (info1 >> 4) & 0x7; + inout->flags = 0; + inout->flags |= (info1 & BIT(0)) ? INOUT_IN : 0; + inout->flags |= (info1 & BIT(3)) ? INOUT_REP : 0; + inout->flags |= (info1 & BIT(2)) ? INOUT_STR : 0; + inout->port = (uint16_t)(info1 >> 16); + inout->eax = (uint32_t)(state->rax); + + if ((inout->flags & INOUT_STR) != 0) { + /* + * The effective segment number in EXITINFO1[12:10] is populated + * only if the processor has the DecodeAssist capability. + * + * This is not specified explicitly in APMv2 but can be verified + * empirically. + */ + if (!decode_assist()) { + /* + * Without decoding assistance, force the task of + * emulating the ins/outs on userspace. 
+ */ + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + bzero(&vmexit->u.inst_emul, + sizeof (vmexit->u.inst_emul)); + return (UNHANDLED); + } + + /* + * Bits 7-9 encode the address size of ins/outs operations where + * the 1/2/4 values correspond to 16/32/64 bit sizes. + */ + inout->addrsize = 2 * ((info1 >> 7) & 0x7); + VERIFY(inout->addrsize == 2 || inout->addrsize == 4 || + inout->addrsize == 8); + + if (inout->flags & INOUT_IN) { + /* + * For INS instructions, %es (encoded as 0) is the + * implied segment for the operation. + */ + inout->segment = 0; + } else { + /* + * Bits 10-12 encode the segment for OUTS. + * This value follows the standard x86 segment order. + */ + inout->segment = (info1 >> 10) & 0x7; + } } + vmexit->exitcode = VM_EXITCODE_INOUT; + svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &paging); + vie = vm_vie_ctx(svm_sc->vm, vcpu); + vie_init_inout(vie, inout, vmexit->inst_length, &paging); + + /* The in/out emulation will handle advancing %rip */ + vmexit->inst_length = 0; + return (UNHANDLED); } @@ -857,7 +827,6 @@ npf_fault_type(uint64_t exitinfo1) static bool svm_npf_emul_fault(uint64_t exitinfo1) { - if (exitinfo1 & VMCB_NPF_INFO1_ID) { return (false); } @@ -870,48 +839,52 @@ svm_npf_emul_fault(uint64_t exitinfo1) return (false); } - return (true); + return (true); } static void -svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) +svm_handle_mmio_emul(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit, + uint64_t gpa) { - struct vm_guest_paging *paging; - struct vmcb_segment seg; struct vmcb_ctrl *ctrl; - char *inst_bytes; - int error, inst_len; + struct vmcb *vmcb; + struct vie *vie; + struct vm_guest_paging paging; + struct vmcb_segment seg; + char *inst_bytes = NULL; + uint8_t inst_len = 0; + int error; + vmcb = svm_get_vmcb(svm_sc, vcpu); ctrl = &vmcb->ctrl; - paging = &vmexit->u.inst_emul.paging; - vmexit->exitcode = VM_EXITCODE_INST_EMUL; - vmexit->u.inst_emul.gpa = gpa; - vmexit->u.inst_emul.gla = VIE_INVALID_GLA; - svm_paging_info(vmcb, paging); + vmexit->exitcode = VM_EXITCODE_MMIO_EMUL; + vmexit->u.mmio_emul.gpa = gpa; + vmexit->u.mmio_emul.gla = VIE_INVALID_GLA; + svm_paging_info(vmcb, &paging); error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error)); - switch(paging->cpu_mode) { + switch (paging.cpu_mode) { case CPU_MODE_REAL: - vmexit->u.inst_emul.cs_base = seg.base; - vmexit->u.inst_emul.cs_d = 0; + vmexit->u.mmio_emul.cs_base = seg.base; + vmexit->u.mmio_emul.cs_d = 0; break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: - vmexit->u.inst_emul.cs_base = seg.base; + vmexit->u.mmio_emul.cs_base = seg.base; /* * Section 4.8.1 of APM2, Default Operand Size or D bit. */ - vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ? + vmexit->u.mmio_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ? 
1 : 0; break; default: - vmexit->u.inst_emul.cs_base = 0; - vmexit->u.inst_emul.cs_d = 0; - break; + vmexit->u.mmio_emul.cs_base = 0; + vmexit->u.mmio_emul.cs_d = 0; + break; } /* @@ -920,11 +893,9 @@ svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) if (decode_assist() && !disable_npf_assist) { inst_len = ctrl->inst_len; inst_bytes = (char *)ctrl->inst_bytes; - } else { - inst_len = 0; - inst_bytes = NULL; } - vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); + vie = vm_vie_ctx(svm_sc->vm, vcpu); + vie_init_mmio(vie, inst_bytes, inst_len, &paging, gpa); } #ifdef KTR @@ -1520,7 +1491,7 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) } break; case VMCB_EXIT_IO: - handled = svm_handle_io(svm_sc, vcpu, vmexit); + handled = svm_handle_inout(svm_sc, vcpu, vmexit); vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); break; case VMCB_EXIT_CPUID: @@ -1552,9 +1523,9 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) "on gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } else if (svm_npf_emul_fault(info1)) { - svm_handle_inst_emul(vmcb, info2, vmexit); - vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1); - VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault " + svm_handle_mmio_emul(svm_sc, vcpu, vmexit, info2); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_MMIO_EMUL, 1); + VCPU_CTR3(svm_sc->vm, vcpu, "mmio_emul fault " "for gpa %#lx/%#lx at rip %#lx", info2, info1, state->rip); } @@ -1568,7 +1539,7 @@ svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) default: vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); break; - } + } VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d", handled ? "handled" : "unhandled", exit_reason_to_str(code), diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c index 83f149c6b7..8156121571 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -77,7 +77,7 @@ __FBSDID("$FreeBSD$"); #include <machine/vmm.h> #include <machine/vmm_dev.h> -#include <machine/vmm_instruction_emul.h> +#include <sys/vmm_instruction_emul.h> #include "vmm_lapic.h" #include "vmm_host.h" #include "vmm_ioport.h" @@ -1887,69 +1887,6 @@ vmx_paging_mode(void) return (PAGING_MODE_PAE); } -static uint64_t -inout_str_index(struct vmx *vmx, int vcpuid, int in) -{ - uint64_t val; - int error; - enum vm_reg_name reg; - - reg = in ? 
VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; - error = vmx_getreg(vmx, vcpuid, reg, &val); - KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error)); - return (val); -} - -static uint64_t -inout_str_count(struct vmx *vmx, int vcpuid, int rep) -{ - uint64_t val; - int error; - - if (rep) { - error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val); - KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error)); - } else { - val = 1; - } - return (val); -} - -static int -inout_str_addrsize(uint32_t inst_info) -{ - uint32_t size; - - size = (inst_info >> 7) & 0x7; - switch (size) { - case 0: - return (2); /* 16 bit */ - case 1: - return (4); /* 32 bit */ - case 2: - return (8); /* 64 bit */ - default: - panic("%s: invalid size encoding %d", __func__, size); - } -} - -static void -inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in, - struct vm_inout_str *vis) -{ - int error, s; - - if (in) { - vis->seg_name = VM_REG_GUEST_ES; - } else { - s = (inst_info >> 15) & 0x7; - vis->seg_name = vm_segment_name(s); - } - - error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc); - KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error)); -} - static void vmx_paging_info(struct vm_guest_paging *paging) { @@ -1960,35 +1897,89 @@ vmx_paging_info(struct vm_guest_paging *paging) } static void -vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) +vmexit_mmio_emul(struct vm_exit *vmexit, struct vie *vie, uint64_t gpa, + uint64_t gla) { - struct vm_guest_paging *paging; + struct vm_guest_paging paging; uint32_t csar; - paging = &vmexit->u.inst_emul.paging; - - vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->exitcode = VM_EXITCODE_MMIO_EMUL; vmexit->inst_length = 0; - vmexit->u.inst_emul.gpa = gpa; - vmexit->u.inst_emul.gla = gla; - vmx_paging_info(paging); - switch (paging->cpu_mode) { + vmexit->u.mmio_emul.gpa = gpa; + vmexit->u.mmio_emul.gla = gla; + vmx_paging_info(&paging); + + switch (paging.cpu_mode) { case CPU_MODE_REAL: - vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); - vmexit->u.inst_emul.cs_d = 0; + vmexit->u.mmio_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + vmexit->u.mmio_emul.cs_d = 0; break; case CPU_MODE_PROTECTED: case CPU_MODE_COMPATIBILITY: - vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); + vmexit->u.mmio_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE); csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS); - vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar); + vmexit->u.mmio_emul.cs_d = SEG_DESC_DEF32(csar); break; default: - vmexit->u.inst_emul.cs_base = 0; - vmexit->u.inst_emul.cs_d = 0; + vmexit->u.mmio_emul.cs_base = 0; + vmexit->u.mmio_emul.cs_d = 0; break; } - vie_init(&vmexit->u.inst_emul.vie, NULL, 0); + + vie_init_mmio(vie, NULL, 0, &paging, gpa); +} + +static void +vmexit_inout(struct vm_exit *vmexit, struct vie *vie, uint64_t qual, + uint32_t eax) +{ + struct vm_guest_paging paging; + struct vm_inout *inout; + + inout = &vmexit->u.inout; + + inout->bytes = (qual & 0x7) + 1; + inout->flags = 0; + inout->flags |= (qual & 0x8) ? INOUT_IN : 0; + inout->flags |= (qual & 0x10) ? INOUT_STR : 0; + inout->flags |= (qual & 0x20) ? INOUT_REP : 0; + inout->port = (uint16_t)(qual >> 16); + inout->eax = eax; + if (inout->flags & INOUT_STR) { + uint64_t inst_info; + + inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); + + /* + * Bits 7-9 encode the address size of ins/outs operations where + * the 0/1/2 values correspond to 16/32/64 bit sizes. 
+ */ + inout->addrsize = 2 << (1 + ((inst_info >> 7) & 0x3)); + VERIFY(inout->addrsize == 2 || inout->addrsize == 4 || + inout->addrsize == 8); + + if (inout->flags & INOUT_IN) { + /* + * The bits describing the segment in INSTRUCTION_INFO + * are not defined for ins, leaving it to system + * software to assume %es (encoded as 0) + */ + inout->segment = 0; + } else { + /* + * Bits 15-17 encode the segment for OUTS. + * This value follows the standard x86 segment order. + */ + inout->segment = (inst_info >> 15) & 0x7; + } + } + + vmexit->exitcode = VM_EXITCODE_INOUT; + vmx_paging_info(&paging); + vie_init_inout(vie, inout, vmexit->inst_length, &paging); + + /* The in/out emulation will handle advancing %rip */ + vmexit->inst_length = 0; } static int @@ -2136,6 +2127,7 @@ vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) { uint64_t qual; int access_type, offset, allowed; + struct vie *vie; if (!apic_access_virtualization(vmx, vcpuid)) return (UNHANDLED); @@ -2182,7 +2174,8 @@ vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) } if (allowed) { - vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, + vie = vm_vie_ctx(vmx->vm, vcpuid); + vmexit_mmio_emul(vmexit, vie, DEFAULT_APIC_BASE + offset, VIE_INVALID_GLA); } @@ -2264,10 +2257,10 @@ emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { - int error, errcode, errcode_valid, handled, in; + int error, errcode, errcode_valid, handled; struct vmxctx *vmxctx; + struct vie *vie; struct vlapic *vlapic; - struct vm_inout_str *vis; struct vm_task_switch *ts; uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; uint32_t intr_type, intr_vec, reason; @@ -2524,25 +2517,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) return (1); case EXIT_REASON_INOUT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1); - vmexit->exitcode = VM_EXITCODE_INOUT; - vmexit->u.inout.bytes = (qual & 0x7) + 1; - vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0; - vmexit->u.inout.string = (qual & 0x10) ? 1 : 0; - vmexit->u.inout.rep = (qual & 0x20) ? 
1 : 0; - vmexit->u.inout.port = (uint16_t)(qual >> 16); - vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax); - if (vmexit->u.inout.string) { - inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO); - vmexit->exitcode = VM_EXITCODE_INOUT_STR; - vis = &vmexit->u.inout_str; - vmx_paging_info(&vis->paging); - vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS); - vis->cr0 = vmcs_read(VMCS_GUEST_CR0); - vis->index = inout_str_index(vmx, vcpu, in); - vis->count = inout_str_count(vmx, vcpu, vis->inout.rep); - vis->addrsize = inout_str_addrsize(inst_info); - inout_str_seginfo(vmx, vcpu, inst_info, in, vis); - } + vie = vm_vie_ctx(vmx->vm, vcpu); + vmexit_inout(vmexit, vie, qual, (uint32_t)vmxctx->guest_rax); SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); break; case EXIT_REASON_CPUID: @@ -2653,8 +2629,9 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) SDT_PROBE5(vmm, vmx, exit, nestedfault, vmx, vcpu, vmexit, gpa, qual); } else if (ept_emulation_fault(qual)) { - vmexit_inst_emul(vmexit, gpa, vmcs_gla()); - vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); + vie = vm_vie_ctx(vmx->vm, vcpu); + vmexit_mmio_emul(vmexit, vie, gpa, vmcs_gla()); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MMIO_EMUL, 1); SDT_PROBE4(vmm, vmx, exit, mmiofault, vmx, vcpu, vmexit, gpa); } diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.c b/usr/src/uts/i86pc/io/vmm/io/vatpic.c index ba4cd7785e..817c815fd6 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.c @@ -709,8 +709,8 @@ vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port, } int -vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, - uint32_t *eax) +vatpic_master_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; @@ -729,8 +729,8 @@ vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, } int -vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, - uint32_t *eax) +vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *eax) { struct vatpic *vatpic; struct atpic *atpic; @@ -749,8 +749,8 @@ vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, } int -vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, - uint32_t *eax) +vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *eax) { struct vatpic *vatpic; bool is_master; diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.h b/usr/src/uts/i86pc/io/vmm/io/vatpic.h index d4a1be1820..dcb8ea6c6f 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpic.h +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.h @@ -39,12 +39,12 @@ struct vatpic *vatpic_init(struct vm *vm); void vatpic_cleanup(struct vatpic *vatpic); -int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, - int bytes, uint32_t *eax); -int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, - int bytes, uint32_t *eax); -int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, - uint32_t *eax); +int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *eax); +int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *eax); +int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *eax); int vatpic_assert_irq(struct vm *vm, int irq); int 
vatpic_deassert_irq(struct vm *vm, int irq); diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.c b/usr/src/uts/i86pc/io/vmm/io/vatpit.c index 03f63798e7..47cb40f9bd 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpit.c +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.c @@ -336,7 +336,7 @@ vatpit_update_mode(struct vatpit *vatpit, uint8_t val) } int -vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, +vatpit_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, uint8_t bytes, uint32_t *eax) { struct vatpit *vatpit; @@ -419,8 +419,8 @@ vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, } int -vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, - uint32_t *eax) +vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *eax) { struct vatpit *vatpit; diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.h b/usr/src/uts/i86pc/io/vmm/io/vatpit.h index 4bf9fe048d..512ce20735 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpit.h +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.h @@ -39,10 +39,10 @@ struct vatpit *vatpit_init(struct vm *vm); void vatpit_cleanup(struct vatpit *vatpit); -int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, - uint32_t *eax); -int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, - int bytes, uint32_t *eax); +int vatpit_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *eax); +int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *eax); #ifndef __FreeBSD__ void vatpit_localize_resources(struct vatpit *); diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c index af902ba40e..60fc907b85 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c @@ -988,7 +988,6 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) uint64_t icrval; uint32_t dest, vec, mode; struct vlapic *vlapic2; - struct vm_exit *vmexit; struct LAPIC *lapic; uint16_t maxcpus; @@ -1082,13 +1081,7 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) return (0); vlapic2->boot_state = BS_RUNNING; - - *retu = true; - vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); - vmexit->exitcode = VM_EXITCODE_SPINUP_AP; - vmexit->u.spinup_ap.vcpu = dest; - vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT; - + vm_req_spinup_ap(vlapic->vm, dest, vec << PAGE_SHIFT); return (0); } } diff --git a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c index 4df909777d..0dce2b0a1f 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c +++ b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c @@ -80,7 +80,7 @@ vpmtmr_cleanup(struct vpmtmr *vpmtmr) } int -vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, +vpmtmr_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, uint8_t bytes, uint32_t *val) { struct vpmtmr *vpmtmr; diff --git a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h index e6562da5c0..c06825b970 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h +++ b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h @@ -38,7 +38,7 @@ struct vpmtmr; struct vpmtmr *vpmtmr_init(struct vm *vm); void vpmtmr_cleanup(struct vpmtmr *pmtmr); -int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, - uint32_t *val); +int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *val); #endif diff --git a/usr/src/uts/i86pc/io/vmm/io/vrtc.c 
b/usr/src/uts/i86pc/io/vmm/io/vrtc.c index a3635fc9f0..7a98cd75ad 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vrtc.c +++ b/usr/src/uts/i86pc/io/vmm/io/vrtc.c @@ -874,8 +874,8 @@ vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) } int -vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, - uint32_t *val) +vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *val) { struct vrtc *vrtc; @@ -897,8 +897,8 @@ vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, } int -vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, - uint32_t *val) +vrtc_data_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *val) { struct vrtc *vrtc; struct rtcdev *rtc; diff --git a/usr/src/uts/i86pc/io/vmm/io/vrtc.h b/usr/src/uts/i86pc/io/vmm/io/vrtc.h index 13abbedeb9..92a060cb8e 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vrtc.h +++ b/usr/src/uts/i86pc/io/vmm/io/vrtc.h @@ -48,10 +48,10 @@ int vrtc_set_time(struct vm *vm, time_t secs); int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value); int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval); -int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, - uint32_t *val); -int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, - uint32_t *val); +int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *val); +int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, uint16_t port, + uint8_t bytes, uint32_t *val); #ifndef __FreeBSD__ void vrtc_localize_resources(struct vrtc *); diff --git a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_instruction_emul.h index d084301aee..d3a07b0f99 100644 --- a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h +++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_instruction_emul.h @@ -27,64 +27,57 @@ * * $FreeBSD$ */ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2020 Oxide Computer Company + */ #ifndef _VMM_INSTRUCTION_EMUL_H_ #define _VMM_INSTRUCTION_EMUL_H_ #include <sys/mman.h> +#include <machine/vmm.h> -/* - * Callback functions to read and write memory regions. - */ -typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa, - uint64_t *rval, int rsize, void *arg); - -typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa, - uint64_t wval, int wsize, void *arg); +struct vie; -/* - * Emulate the decoded 'vie' instruction. - * - * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region - * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the - * callback functions. - * - * 'void *vm' should be 'struct vm *' when called from kernel context and - * 'struct vmctx *' when called from user context. 
- * s - */ -int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie, - struct vm_guest_paging *paging, mem_region_read_t mrr, - mem_region_write_t mrw, void *mrarg); +struct vie *vie_alloc(); +void vie_free(struct vie *); -int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, - uint64_t val, int size); +void vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length, + const struct vm_guest_paging *paging, uint64_t gpa); +void vie_init_inout(struct vie *vie, const struct vm_inout *inout, + uint8_t inst_len, const struct vm_guest_paging *paging); -/* - * Returns 1 if an alignment check exception should be injected and 0 otherwise. - */ -int vie_alignment_check(int cpl, int operand_size, uint64_t cr0, - uint64_t rflags, uint64_t gla); +int vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *res); +int vie_fulfill_inout(struct vie *vie, const struct vm_inout *res); -/* Returns 1 if the 'gla' is not canonical and 0 otherwise. */ -int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); +bool vie_needs_fetch(const struct vie *vie); +bool vie_pending(const struct vie *vie); +uint64_t vie_mmio_gpa(const struct vie *vie); +void vie_exitinfo(const struct vie *vie, struct vm_exit *vme); +void vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme); -uint64_t vie_size2mask(int size); +void vie_reset(struct vie *vie); +void vie_advance_pc(struct vie *vie, uint64_t *nextrip); -int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, - struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot, - uint64_t *gla); +int vie_emulate_mmio(struct vie *vie, void *vm, int vcpuid); +int vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid); -#ifdef _KERNEL /* * APIs to fetch and decode the instruction from nested page fault handler. * - * 'vie' must be initialized before calling 'vmm_fetch_instruction()' + * 'vie' must be initialized before calling 'vie_fetch_instruction()' */ -int vmm_fetch_instruction(struct vm *vm, int cpuid, - struct vm_guest_paging *guest_paging, - uint64_t rip, int inst_length, struct vie *vie, - int *is_fault); +int vie_fetch_instruction(struct vie *vie, struct vm *vm, int cpuid, + uint64_t rip, int *is_fault); /* * Translate the guest linear address 'gla' to a guest physical address. @@ -101,34 +94,23 @@ int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, * Like vm_gla2gpa, but no exceptions are injected into the guest and * PTEs are not changed. */ -int vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa, int *is_fault); -#endif /* _KERNEL */ - -void vie_init(struct vie *vie, const char *inst_bytes, int inst_length); +int vm_gla2gpa_nofault(struct vm *vm, int vcpuid, + struct vm_guest_paging *paging, uint64_t gla, int prot, uint64_t *gpa, + int *is_fault); +int vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla); /* * Decode the instruction fetched into 'vie' so it can be emulated. * * 'gla' is the guest linear address provided by the hardware assist * that caused the nested page table fault. It is used to verify that * the software instruction decoding is in agreement with the hardware. - * + * * Some hardware assists do not provide the 'gla' to the hypervisor. * To skip the 'gla' verification for this or any other reason pass * in VIE_INVALID_GLA instead. 
*/ -#ifdef _KERNEL #define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */ -int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, - enum vm_cpu_mode cpu_mode, int csd, struct vie *vie); -#else /* !_KERNEL */ -/* - * Permit instruction decoding logic to be compiled outside of the kernel for - * rapid iteration and validation. No GLA validation is performed, obviously. - */ -int vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int csd, - struct vie *vie); -#endif /* _KERNEL */ +int vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int csd); #endif /* _VMM_INSTRUCTION_EMUL_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h index 2a884e6e0e..fbd2884b84 100644 --- a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h +++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h @@ -54,6 +54,7 @@ struct vm; struct vm_exception; struct seg_desc; struct vm_exit; +struct vie; struct vm_run; struct vhpet; struct vioapic; @@ -171,7 +172,7 @@ int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *ret_desc); int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc); -int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_run(struct vm *vm, int vcpuid, const struct vm_entry *); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); int vm_nmi_pending(struct vm *vm, int vcpuid); @@ -191,11 +192,17 @@ int vm_activate_cpu(struct vm *vm, int vcpu); int vm_suspend_cpu(struct vm *vm, int vcpu); int vm_resume_cpu(struct vm *vm, int vcpu); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); +struct vie *vm_vie_ctx(struct vm *vm, int vcpuid); void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); +int vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval, + int rsize); +int vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval, + int wsize); +void vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip); #ifdef _SYS__CPUSET_H_ cpuset_t vm_active_cpus(struct vm *vm); diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index 579ca12e84..f4c22c13dd 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -79,7 +79,7 @@ __FBSDID("$FreeBSD$"); #include <machine/vmm.h> #include <machine/vmm_dev.h> -#include <machine/vmm_instruction_emul.h> +#include <sys/vmm_instruction_emul.h> #include "vmm_ioport.h" #include "vmm_ktr.h" @@ -135,6 +135,7 @@ struct vcpu { void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ uint64_t nextrip; /* (x) next instruction to execute */ + struct vie *vie_ctx; /* (x) instruction emulation context */ #ifndef __FreeBSD__ uint64_t tsc_offset; /* (x) offset from host TSC */ #endif @@ -200,6 +201,14 @@ struct vm { #ifndef __FreeBSD__ list_t ioport_hooks; #endif /* __FreeBSD__ */ + bool sipi_req; /* (i) SIPI requested */ + int sipi_req_vcpu; /* (i) SIPI destination */ + uint64_t sipi_req_rip; /* (i) SIPI start %rip */ + + /* Miscellaneous VM-wide statistics and counters */ + struct vm_wide_stats { + uint64_t sipi_supersede; + } stats; }; static int vmm_initialized; @@ -341,6 +350,8 @@ vcpu_cleanup(struct vm *vm, int i, bool destroy) if 
(destroy) { vmm_stat_free(vcpu->stats); fpu_save_area_free(vcpu->guestfpu); + vie_free(vcpu->vie_ctx); + vcpu->vie_ctx = NULL; } } @@ -367,6 +378,10 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create) #endif vcpu->guestfpu = fpu_save_area_alloc(); vcpu->stats = vmm_stat_alloc(); + vcpu->vie_ctx = vie_alloc(); + } else { + vie_reset(vcpu->vie_ctx); + bzero(&vcpu->exitinfo, sizeof (vcpu->exitinfo)); } vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); @@ -402,6 +417,15 @@ vm_exitinfo(struct vm *vm, int cpuid) return (&vcpu->exitinfo); } +struct vie * +vm_vie_ctx(struct vm *vm, int cpuid) +{ + if (cpuid < 0 || cpuid >= vm->maxcpus) + panic("vm_vie_ctx: invalid cpuid %d", cpuid); + + return (vm->vcpu[cpuid].vie_ctx); +} + static int vmm_init(void) { @@ -1558,85 +1582,190 @@ done: return (0); } +int +vm_service_mmio_read(struct vm *vm, int cpuid, uint64_t gpa, uint64_t *rval, + int rsize) +{ + int err = ESRCH; + void *arg = NULL; + + if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { + err = lapic_mmio_read(vm, cpuid, gpa, rval, rsize, &arg); + } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { + err = vioapic_mmio_read(vm, cpuid, gpa, rval, rsize, &arg); + } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { + err = vhpet_mmio_read(vm, cpuid, gpa, rval, rsize, &arg); + } + + return (err); +} + +int +vm_service_mmio_write(struct vm *vm, int cpuid, uint64_t gpa, uint64_t wval, + int wsize) +{ + int err = ESRCH; + void *arg = NULL; + + if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { + err = lapic_mmio_write(vm, cpuid, gpa, wval, wsize, &arg); + } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { + err = vioapic_mmio_write(vm, cpuid, gpa, wval, wsize, &arg); + } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { + err = vhpet_mmio_write(vm, cpuid, gpa, wval, wsize, &arg); + } + + return (err); +} + static int -vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) +vm_handle_mmio_emul(struct vm *vm, int vcpuid, bool *retu) { struct vie *vie; struct vcpu *vcpu; struct vm_exit *vme; - uint64_t gla, gpa, cs_base; - struct vm_guest_paging *paging; - mem_region_read_t mread; - mem_region_write_t mwrite; - enum vm_cpu_mode cpu_mode; - int cs_d, error, fault; + uint64_t inst_addr; + int error, fault, cs_d; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; + vie = vcpu->vie_ctx; KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", __func__, vme->inst_length)); - gla = vme->u.inst_emul.gla; - gpa = vme->u.inst_emul.gpa; - cs_base = vme->u.inst_emul.cs_base; - cs_d = vme->u.inst_emul.cs_d; - vie = &vme->u.inst_emul.vie; - paging = &vme->u.inst_emul.paging; - cpu_mode = paging->cpu_mode; + inst_addr = vme->rip + vme->u.mmio_emul.cs_base; + cs_d = vme->u.mmio_emul.cs_d; - VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa); + VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", + vme->u.mmio_emul.gpa); - /* Fetch, decode and emulate the faulting instruction */ - if (vie->num_valid == 0) { - error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + - cs_base, VIE_INST_SIZE, vie, &fault); - } else { - /* - * The instruction bytes have already been copied into 'vie' - */ - error = fault = 0; + /* Fetch the faulting instruction */ + if (vie_needs_fetch(vie)) { + error = vie_fetch_instruction(vie, vm, vcpuid, inst_addr, + &fault); + if (error != 0) { + return (error); + } else if (fault) { + /* + * If a fault during instruction fetch was encountered, it + * will have asserted
that the appropriate exception be + * injected at next entry. No further work is required. + */ + return (0); + } } - if (error || fault) - return (error); - if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { + if (vie_decode_instruction(vie, vm, vcpuid, cs_d) != 0) { VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", - vme->rip + cs_base); - *retu = true; /* dump instruction bytes in userspace */ + inst_addr); + /* Dump (unrecognized) instruction bytes in userspace */ + vie_fallback_exitinfo(vie, vme); + *retu = true; return (0); } - - /* - * Update 'nextrip' based on the length of the emulated instruction. - */ - vme->inst_length = vie->num_processed; - vcpu->nextrip += vie->num_processed; - VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction " - "decoding", vcpu->nextrip); - - /* return to userland unless this is an in-kernel emulated device */ - if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { - mread = lapic_mmio_read; - mwrite = lapic_mmio_write; - } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) { - mread = vioapic_mmio_read; - mwrite = vioapic_mmio_write; - } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) { - mread = vhpet_mmio_read; - mwrite = vhpet_mmio_write; - } else { + if (vme->u.mmio_emul.gla != VIE_INVALID_GLA && + vie_verify_gla(vie, vm, vcpuid, vme->u.mmio_emul.gla) != 0) { + /* Decoded GLA does not match GLA from VM exit state */ + vie_fallback_exitinfo(vie, vme); *retu = true; return (0); } - error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging, - mread, mwrite, retu); - +repeat: + error = vie_emulate_mmio(vie, vm, vcpuid); + if (error < 0) { + /* + * MMIO not handled by any of the in-kernel-emulated devices, so + * make a trip out to userspace for it. + */ + vie_exitinfo(vie, vme); + *retu = true; + error = 0; + } else if (error == EAGAIN) { + /* + * Continue emulating the rep-prefixed instruction, which has + * not completed its iterations. + * + * In case this can be emulated in-kernel and has a high + * repetition count (causing a tight spin), it should be + * deferential to yield conditions. + */ + if (!vcpu_should_yield(vm, vcpuid)) { + goto repeat; + } else { + /* + * Defer to the contending load by making a trip to + * userspace with a no-op (BOGUS) exit reason. + */ + vie_reset(vie); + vme->exitcode = VM_EXITCODE_BOGUS; + *retu = true; + return (0); + } + } else if (error == 0) { + /* Update %rip now that instruction has been emulated */ + vie_advance_pc(vie, &vcpu->nextrip); + } return (error); } static int +vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu) +{ + struct vcpu *vcpu; + struct vie *vie; + int err; + + vcpu = &vm->vcpu[vcpuid]; + vie = vcpu->vie_ctx; + +repeat: + err = vie_emulate_inout(vie, vm, vcpuid); + + if (err < 0) { + /* + * In/out not handled by any of the in-kernel-emulated devices, + * so make a trip out to userspace for it. + */ + vie_exitinfo(vie, vme); + *retu = true; + return (0); + } else if (err == EAGAIN) { + /* + * Continue emulating the rep-prefixed ins/outs, which has not + * completed its iterations. + * + * In case this can be emulated in-kernel and has a high + * repetition count (causing a tight spin), it should be + * deferential to yield conditions. + */ + if (!vcpu_should_yield(vm, vcpuid)) { + goto repeat; + } else { + /* + * Defer to the contending load by making a trip to + * userspace with a no-op (BOGUS) exit reason. 
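 *
 * (Editorial sketch, not part of this change: a BOGUS exit carries no
 * completion data, so the userspace exit loop can re-enter with default
 * entry actions, e.g.:
 *
 *	case VM_EXITCODE_BOGUS:
 *		entry->cmd = VEC_DEFAULT;
 *		break;
 *
 * Since vie_reset() discarded the decode state and %rip was never
 * advanced, the vCPU re-executes the instruction and the remaining
 * iterations proceed on the next pass.)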
+ */ + vie_reset(vie); + vme->exitcode = VM_EXITCODE_BOGUS; + *retu = true; + return (0); + } + } else if (err != 0) { + /* Emulation failure. Bail all the way out to userspace. */ + vme->exitcode = VM_EXITCODE_INST_EMUL; + bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul)); + *retu = true; + return (0); + } + + vie_advance_pc(vie, &vcpu->nextrip); + *retu = false; + return (0); +} + +static int vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) { #ifdef __FreeBSD__ @@ -1768,6 +1897,18 @@ vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) } #endif /* __FreeBSD__ */ +void +vm_req_spinup_ap(struct vm *vm, int req_vcpuid, uint64_t req_rip) +{ + if (vm->sipi_req) { + /* This should never occur if userspace is doing its job. */ + vm->stats.sipi_supersede++; + } + vm->sipi_req = true; + vm->sipi_req_vcpu = req_vcpuid; + vm->sipi_req_rip = req_rip; +} + int vm_suspend(struct vm *vm, enum vm_suspend_how how) { @@ -1960,11 +2101,104 @@ vmm_freectx(void *arg, int isexec) #endif /* __FreeBSD */ +static int +vm_entry_actions(struct vm *vm, int vcpuid, const struct vm_entry *entry, + struct vm_exit *vme) +{ + struct vcpu *vcpu; + struct vie *vie; + int err; + + vcpu = &vm->vcpu[vcpuid]; + vie = vcpu->vie_ctx; + err = 0; + + switch (entry->cmd) { + case VEC_DEFAULT: + return (0); + case VEC_DISCARD_INSTR: + vie_reset(vie); + return (0); + case VEC_COMPLETE_MMIO: + err = vie_fulfill_mmio(vie, &entry->u.mmio); + if (err == 0) { + err = vie_emulate_mmio(vie, vm, vcpuid); + if (err == 0) { + vie_advance_pc(vie, &vcpu->nextrip); + } else if (err < 0) { + vie_exitinfo(vie, vme); + } else if (err == EAGAIN) { + /* + * Clear the instruction emulation state in + * order to re-enter VM context and continue + * this 'rep <instruction>' + */ + vie_reset(vie); + err = 0; + } + } + break; + case VEC_COMPLETE_INOUT: + err = vie_fulfill_inout(vie, &entry->u.inout); + if (err == 0) { + err = vie_emulate_inout(vie, vm, vcpuid); + if (err == 0) { + vie_advance_pc(vie, &vcpu->nextrip); + } else if (err < 0) { + vie_exitinfo(vie, vme); + } else if (err == EAGAIN) { + /* + * Clear the instruction emulation state in + * order to re-enter VM context and continue + * this 'rep ins/outs' + */ + vie_reset(vie); + err = 0; + } + } + break; + default: + return (EINVAL); + } + return (err); +} + +static int +vm_loop_checks(struct vm *vm, int vcpuid, struct vm_exit *vme) +{ + struct vie *vie; + + vie = vm->vcpu[vcpuid].vie_ctx; + + if (vie_pending(vie)) { + /* + * Userspace has not fulfilled the pending needs of the + * instruction emulation, so bail back out. 
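 *
 * (Editorial sketch, not part of this change: fulfillment arrives via
 * the vm_entry commands handled above. For a pending MMIO read,
 * userspace would echo the gpa/bytes from the exit and supply the data,
 * along the lines of:
 *
 *	struct vm_entry entry = { .cmd = VEC_COMPLETE_MMIO };
 *	entry.u.mmio.read = 1;
 *	entry.u.mmio.gpa = vme->u.mmio.gpa;
 *	entry.u.mmio.bytes = vme->u.mmio.bytes;
 *	entry.u.mmio.data = device_read_result;
 *
 * where 'device_read_result' stands in for whatever the userspace device
 * model produced.)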
+ */ + vie_exitinfo(vie, vme); + return (-1); + } + + if (vcpuid == 0 && vm->sipi_req) { + /* The boot vCPU has sent a SIPI to one of the other CPUs */ + vme->exitcode = VM_EXITCODE_SPINUP_AP; + vme->u.spinup_ap.vcpu = vm->sipi_req_vcpu; + vme->u.spinup_ap.rip = vm->sipi_req_rip; + + vm->sipi_req = false; + vm->sipi_req_vcpu = 0; + vm->sipi_req_rip = 0; + return (-1); + } + + return (0); +} + int -vm_run(struct vm *vm, struct vm_run *vmrun) +vm_run(struct vm *vm, int vcpuid, const struct vm_entry *entry) { struct vm_eventinfo evinfo; - int error, vcpuid; + int error; struct vcpu *vcpu; #ifdef __FreeBSD__ struct pcb *pcb; @@ -1978,8 +2212,6 @@ vm_run(struct vm *vm, struct vm_run *vmrun) int affinity_type = CPU_CURRENT; #endif - vcpuid = vmrun->cpuid; - if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); @@ -2005,7 +2237,21 @@ vm_run(struct vm *vm, struct vm_run *vmrun) NULL, vmm_freectx); #endif + error = vm_entry_actions(vm, vcpuid, entry, vme); + if (error < 0) { + /* Exit condition to be serviced by userspace */ + error = 0; + goto exit; + } else if (error != 0) { + goto exit; + } + restart: + if (vm_loop_checks(vm, vcpuid, vme) != 0) { + error = 0; + goto exit; + } + #ifndef __FreeBSD__ thread_affinity_set(curthread, affinity_type); /* @@ -2091,11 +2337,10 @@ restart: case VM_EXITCODE_PAGING: error = vm_handle_paging(vm, vcpuid, &retu); break; - case VM_EXITCODE_INST_EMUL: - error = vm_handle_inst_emul(vm, vcpuid, &retu); + case VM_EXITCODE_MMIO_EMUL: + error = vm_handle_mmio_emul(vm, vcpuid, &retu); break; case VM_EXITCODE_INOUT: - case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vm, vcpuid, vme, &retu); break; case VM_EXITCODE_MONITOR: @@ -2114,12 +2359,12 @@ restart: affinity_type = CPU_BEST; break; } +#endif case VM_EXITCODE_MTRAP: vm_suspend_cpu(vm, vcpuid); retu = true; break; -#endif default: retu = true; /* handled in userland */ break; @@ -2129,6 +2374,7 @@ restart: if (error == 0 && retu == false) goto restart; +exit: #ifndef __FreeBSD__ removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL, NULL, vmm_freectx); @@ -2136,8 +2382,6 @@ restart: VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); - /* copy the exit information */ - bcopy(vme, &vmrun->vm_exit, sizeof (struct vm_exit)); return (error); } @@ -3206,21 +3450,21 @@ vm_ioport_handle_hook(struct vm *vm, int cpuid, bool in, int port, int bytes, } } if (hook == NULL) { - return (ENOENT); + return (ESRCH); } if (in) { uint64_t tval; if (hook->vmih_rmem_cb == NULL) { - return (ENOENT); + return (ESRCH); } err = hook->vmih_rmem_cb(hook->vmih_arg, (uintptr_t)port, (uint_t)bytes, &tval); *val = (uint32_t)tval; } else { if (hook->vmih_wmem_cb == NULL) { - return (ENOENT); + return (ESRCH); } err = hook->vmih_wmem_cb(hook->vmih_arg, (uintptr_t)port, (uint_t)bytes, (uint64_t)*val); diff --git a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c index 0d32fe0b9a..f8bb7a1646 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c @@ -40,12 +40,12 @@ * * Copyright 2015 Pluribus Networks Inc. * Copyright 2018 Joyent, Inc. 
+ * Copyright 2020 Oxide Computer Company */ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); -#ifdef _KERNEL #include <sys/param.h> #include <sys/pcpu.h> #include <sys/systm.h> @@ -56,27 +56,109 @@ __FBSDID("$FreeBSD$"); #include <machine/vmparam.h> #include <machine/vmm.h> -#else /* !_KERNEL */ -#include <sys/types.h> -#include <sys/errno.h> -#include <sys/_iovec.h> +#include <sys/vmm_kernel.h> -#include <machine/vmm.h> - -#include <err.h> -#include <assert.h> -#include <stdbool.h> -#include <stdio.h> -#include <strings.h> -#include <vmmapi.h> -#define KASSERT(exp,msg) assert((exp)) -#define panic(...) errx(4, __VA_ARGS__) -#endif /* _KERNEL */ - -#include <machine/vmm_instruction_emul.h> +#include <sys/vmm_instruction_emul.h> #include <x86/psl.h> #include <x86/specialreg.h> +#include "vmm_ioport.h" +#include "vmm_ktr.h" + +enum vie_status { + VIES_INIT = (1U << 0), + VIES_MMIO = (1U << 1), + VIES_INOUT = (1U << 2), + VIES_INST_FETCH = (1U << 3), + VIES_INST_DECODE = (1U << 4), + VIES_PENDING_MMIO = (1U << 5), + VIES_PENDING_INOUT = (1U << 6), + VIES_REPEAT = (1U << 7), + VIES_COMPLETE = (1U << 8), +}; + +/* State of request to perform emulated access (inout or MMIO) */ +enum vie_req { + VR_NONE, + VR_PENDING, + VR_DONE, +}; + +struct vie_mmio { + uint64_t data; + uint64_t gpa; + uint8_t bytes; + enum vie_req state; +}; + +struct vie_op { + uint8_t op_byte; /* actual opcode byte */ + uint8_t op_type; /* type of operation (e.g. MOV) */ + uint16_t op_flags; +}; + +#define VIE_INST_SIZE 15 +struct vie { + uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */ + uint8_t num_valid; /* size of the instruction */ + uint8_t num_processed; + + uint8_t addrsize:4, opsize:4; /* address and operand sizes */ + uint8_t rex_w:1, /* REX prefix */ + rex_r:1, + rex_x:1, + rex_b:1, + rex_present:1, + repz_present:1, /* REP/REPE/REPZ prefix */ + repnz_present:1, /* REPNE/REPNZ prefix */ + opsize_override:1, /* Operand size override */ + addrsize_override:1, /* Address size override */ + segment_override:1; /* Segment override */ + + uint8_t mod:2, /* ModRM byte */ + reg:4, + rm:4; + + uint8_t ss:2, /* SIB byte */ + vex_present:1, /* VEX prefixed */ + vex_l:1, /* L bit */ + index:4, /* SIB byte */ + base:4; /* SIB byte */ + + uint8_t disp_bytes; + uint8_t imm_bytes; + + uint8_t scale; + + uint8_t vex_reg:4, /* vvvv: first source register specifier */ + vex_pp:2, /* pp */ + _sparebits:2; + + uint8_t _sparebytes[2]; + + int base_register; /* VM_REG_GUEST_xyz */ + int index_register; /* VM_REG_GUEST_xyz */ + int segment_register; /* VM_REG_GUEST_xyz */ + + int64_t displacement; /* optional addr displacement */ + int64_t immediate; /* optional immediate operand */ + + struct vie_op op; /* opcode description */ + + enum vie_status status; + + struct vm_guest_paging paging; /* guest paging state */ + + uint64_t mmio_gpa; /* faulting GPA */ + struct vie_mmio mmio_req_read; + struct vie_mmio mmio_req_write; + + struct vm_inout inout; /* active in/out op */ + enum vie_req inout_req_state; + uint32_t inout_req_val; /* value from userspace */ +}; + + /* struct vie_op.op_type */ enum { VIE_OP_TYPE_NONE = 0, @@ -299,14 +381,29 @@ static uint64_t size2mask[] = { [8] = 0xffffffffffffffff, }; -static int -vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval) -{ - int error; - error = vm_get_register(vm, vcpuid, reg, rval); +static int vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, + uint64_t gpa, uint64_t *rval, int bytes); +static int vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, + 
uint64_t gpa, uint64_t wval, int bytes); +static int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, + struct seg_desc *desc, uint64_t offset, int length, int addrsize, + int prot, uint64_t *gla); +static int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla); +static int vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, + uint64_t gla); +static uint64_t vie_size2mask(int size); + +struct vie * +vie_alloc() +{ + return (kmem_zalloc(sizeof (struct vie), KM_SLEEP)); +} - return (error); +void +vie_free(struct vie *vie) +{ + kmem_free(vie, sizeof (struct vie)); } static void @@ -336,7 +433,7 @@ vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr) } static int -vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) +vie_read_bytereg(struct vie *vie, void *vm, int vcpuid, uint8_t *rval) { uint64_t val; int error, lhbr; @@ -357,7 +454,7 @@ vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval) } static int -vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) +vie_write_bytereg(struct vie *vie, void *vm, int vcpuid, uint8_t byte) { uint64_t origval, val, mask; int error, lhbr; @@ -382,9 +479,9 @@ vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte) return (error); } -int -vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, - uint64_t val, int size) +static int +vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t val, + int size) { int error; uint64_t origval; @@ -392,7 +489,7 @@ vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, switch (size) { case 1: case 2: - error = vie_read_register(vm, vcpuid, reg, &origval); + error = vm_get_register(vm, vcpuid, reg, &origval); if (error) return (error); val &= size2mask[size]; @@ -411,6 +508,29 @@ vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg, return (error); } +static int +vie_repeat(struct vie *vie) +{ + vie->status |= VIES_REPEAT; + + /* + * Clear out any cached operation values so the repeated instruction can + * begin without using that stale state. Other state, such as the + * decoding results, is kept around as it will not vary between + * iterations of a rep-prefixed instruction.
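 *
 * (Editorial note, not part of this change: the EAGAIN returned here is
 * consumed by the vm_handle_* callers in vmm.c, which loop roughly as
 *
 *	do {
 *		err = vie_emulate_mmio(vie, vm, vcpuid);
 *	} while (err == EAGAIN && !vcpu_should_yield(vm, vcpuid));
 *
 * so a high-repetition access spins in-kernel only until a yield is
 * required.)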
+ */ + if ((vie->status & VIES_MMIO) != 0) { + vie->mmio_req_read.state = VR_NONE; + vie->mmio_req_write.state = VR_NONE; + } else if ((vie->status & VIES_INOUT) != 0) { + vie->inout_req_state = VR_NONE; + } else { + panic("unexpected emulation state"); + } + + return (EAGAIN); +} + #define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V) /* @@ -519,8 +639,7 @@ getandflags(int opsize, uint64_t x, uint64_t y) } static int -emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { int error, size; enum vm_reg_name reg; @@ -538,9 +657,9 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available) */ size = 1; /* override for byte operation */ - error = vie_read_bytereg(vm, vcpuid, vie, &byte); + error = vie_read_bytereg(vie, vm, vcpuid, &byte); if (error == 0) - error = memwrite(vm, vcpuid, gpa, byte, size, arg); + error = vie_mmio_write(vie, vm, vcpuid, gpa, byte, size); break; case 0x89: /* @@ -550,10 +669,10 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * REX.W + 89/r mov r/m64, r64 */ reg = gpr_map[vie->reg]; - error = vie_read_register(vm, vcpuid, reg, &val); + error = vm_get_register(vm, vcpuid, reg, &val); if (error == 0) { val &= size2mask[size]; - error = memwrite(vm, vcpuid, gpa, val, size, arg); + error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); } break; case 0x8A: @@ -563,9 +682,9 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * REX + 8A/r: mov r8, r/m8 */ size = 1; /* override for byte operation */ - error = memread(vm, vcpuid, gpa, &val, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); if (error == 0) - error = vie_write_bytereg(vm, vcpuid, vie, val); + error = vie_write_bytereg(vie, vm, vcpuid, val); break; case 0x8B: /* @@ -574,7 +693,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * 8B/r: mov r32, r/m32 * REX.W 8B/r: mov r64, r/m64 */ - error = memread(vm, vcpuid, gpa, &val, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); if (error == 0) { reg = gpr_map[vie->reg]; error = vie_update_register(vm, vcpuid, reg, val, size); @@ -587,7 +706,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * A1: mov EAX, moffs32 * REX.W + A1: mov RAX, moffs64 */ - error = memread(vm, vcpuid, gpa, &val, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, size); if (error == 0) { reg = VM_REG_GUEST_RAX; error = vie_update_register(vm, vcpuid, reg, val, size); @@ -597,13 +716,13 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, /* * MOV from AX/EAX/RAX to seg:moffset * A3: mov moffs16, AX - * A3: mov moffs32, EAX + * A3: mov moffs32, EAX * REX.W + A3: mov moffs64, RAX */ - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); if (error == 0) { val &= size2mask[size]; - error = memwrite(vm, vcpuid, gpa, val, size, arg); + error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); } break; case 0xC6: @@ -613,7 +732,8 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * REX + C6/0 mov r/m8, imm8 */ size = 1; /* override for byte operation */ - error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg); + val = vie->immediate; + error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); break; case 0xC7: /* @@ -623,7 +743,7 @@ 
emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits) */ val = vie->immediate & size2mask[size]; - error = memwrite(vm, vcpuid, gpa, val, size, arg); + error = vie_mmio_write(vie, vm, vcpuid, gpa, val, size); break; default: break; @@ -633,9 +753,7 @@ emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, } static int -emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - mem_region_read_t memread, mem_region_write_t memwrite, - void *arg) +emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { int error, size; enum vm_reg_name reg; @@ -656,7 +774,7 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, */ /* get the first operand */ - error = memread(vm, vcpuid, gpa, &val, 1, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1); if (error) break; @@ -677,7 +795,7 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * 0F B7/r movzx r32, r/m16 * REX.W + 0F B7/r movzx r64, r/m16 */ - error = memread(vm, vcpuid, gpa, &val, 2, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 2); if (error) return (error); @@ -699,7 +817,7 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, */ /* get the first operand */ - error = memread(vm, vcpuid, gpa, &val, 1, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, 1); if (error) break; @@ -722,25 +840,27 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * Helper function to calculate and validate a linear address. */ static int -get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, - int opsize, int addrsize, int prot, enum vm_reg_name seg, - enum vm_reg_name gpr, uint64_t *gla, int *fault) +vie_get_gla(struct vie *vie, void *vm, int vcpuid, int opsize, int addrsize, + int prot, enum vm_reg_name seg, enum vm_reg_name gpr, uint64_t *gla) { struct seg_desc desc; uint64_t cr0, val, rflags; int error; + struct vm_guest_paging *paging; + + paging = &vie->paging; - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); error = vm_get_seg_desc(vm, vcpuid, seg, &desc); KASSERT(error == 0, ("%s: error %d getting segment descriptor %d", __func__, error, seg)); - error = vie_read_register(vm, vcpuid, gpr, &val); + error = vm_get_register(vm, vcpuid, gpr, &val); KASSERT(error == 0, ("%s: error %d getting register %d", __func__, error, gpr)); @@ -750,7 +870,7 @@ get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); - goto guest_fault; + return (-1); } if (vie_canonical_check(paging->cpu_mode, *gla)) { @@ -758,39 +878,30 @@ get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); - goto guest_fault; + return (-1); } if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { vm_inject_ac(vm, vcpuid, 0); - goto guest_fault; + return (-1); } - *fault = 0; - return (0); - -guest_fault: - *fault = 1; return (0); } static int -emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - struct vm_guest_paging *paging, mem_region_read_t 
memread, - mem_region_write_t memwrite, void *arg) +emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { -#ifdef _KERNEL struct vm_copyinfo copyinfo[2]; -#else - struct iovec copyinfo[2]; -#endif uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; uint64_t rcx, rdi, rsi, rflags; int error, fault, opsize, seg, repeat; + struct vm_guest_paging *paging; opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; val = 0; error = 0; + paging = &vie->paging; /* * XXX although the MOVS instruction is only supposed to be used with @@ -802,7 +913,7 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, repeat = vie->repz_present | vie->repnz_present; if (repeat) { - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); /* @@ -832,10 +943,10 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, */ seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; - error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, - PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault); - if (error || fault) + if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, PROT_READ, seg, + VM_REG_GUEST_RSI, &srcaddr) != 0) { goto done; + } error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, copyinfo, nitems(copyinfo), &fault); @@ -848,7 +959,7 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, */ vm_copyin(vm, vcpuid, copyinfo, &val, opsize); vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); - error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize); if (error) goto done; } else { @@ -857,11 +968,11 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * if 'srcaddr' is in the mmio space. */ - error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, - PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr, - &fault); - if (error || fault) + if (vie_get_gla(vie, vm, vcpuid, opsize, vie->addrsize, + PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, + &dstaddr) != 0) { goto done; + } error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, PROT_WRITE, copyinfo, nitems(copyinfo), &fault); @@ -878,7 +989,8 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * injected into the guest then it will happen * before the MMIO read is attempted. 
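 *
 * (Editorial sketch, not part of this change: the guest-memory half of
 * each case uses the same copy pattern, shown here trimmed of error and
 * fault handling. Two vm_copyinfo entries are passed, presumably so an
 * access straddling a page boundary can be covered:
 *
 *	struct vm_copyinfo copyinfo[2];
 *	int fault;
 *
 *	error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
 *	    PROT_WRITE, copyinfo, nitems(copyinfo), &fault);
 *	if (error == 0 && !fault) {
 *		vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
 *		vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
 *	}
 *
 * with vm_copyin() used instead on the PROT_READ side.)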
*/ - error = memread(vm, vcpuid, gpa, &val, opsize, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, + opsize); if (error) goto done; @@ -903,23 +1015,25 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, if (error || fault) goto done; - error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); + error = vie_mmio_read(vie, vm, vcpuid, srcgpa, &val, + opsize); if (error) goto done; - error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); + error = vie_mmio_write(vie, vm, vcpuid, dstgpa, val, + opsize); if (error) goto done; } } - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi); KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error)); - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); if (rflags & PSL_D) { @@ -948,18 +1062,14 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * Repeat the instruction if the count register is not zero. */ if ((rcx & vie_size2mask(vie->addrsize)) != 0) - vm_restart_instruction(vm, vcpuid); + return (vie_repeat(vie)); } done: - KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d", - __func__, error)); return (error); } static int -emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - struct vm_guest_paging *paging, mem_region_read_t memread, - mem_region_write_t memwrite, void *arg) +emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { int error, opsize, repeat; uint64_t val; @@ -969,7 +1079,7 @@ emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, repeat = vie->repz_present | vie->repnz_present; if (repeat) { - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx); KASSERT(!error, ("%s: error %d getting rcx", __func__, error)); /* @@ -980,17 +1090,17 @@ emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (0); } - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RAX, &val); KASSERT(!error, ("%s: error %d getting rax", __func__, error)); - error = memwrite(vm, vcpuid, gpa, val, opsize, arg); + error = vie_mmio_write(vie, vm, vcpuid, gpa, val, opsize); if (error) return (error); - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi); KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error)); - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); if (rflags & PSL_D) @@ -1012,15 +1122,14 @@ emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * Repeat the instruction if the count register is not zero. 
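 *
 * (Editorial worked example, not part of this change: a 'rep movsw'
 * touching MMIO with %rcx == 3 and PSL_D clear proceeds as
 *
 *	iteration 1: move 2 bytes, %rsi += 2, %rdi += 2, %rcx -> 2
 *	iteration 2: move 2 bytes, %rsi += 2, %rdi += 2, %rcx -> 1
 *	iteration 3: move 2 bytes, %rsi += 2, %rdi += 2, %rcx -> 0, done
 *
 * with each iteration passing through this function once and
 * vie_repeat() scheduling the next pass while %rcx is non-zero. With
 * PSL_D set, the index registers step downward instead.)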
*/ if ((rcx & vie_size2mask(vie->addrsize)) != 0) - vm_restart_instruction(vm, vcpuid); + return (vie_repeat(vie)); } return (0); } static int -emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { int error, size; enum vm_reg_name reg; @@ -1042,12 +1151,12 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, /* get the first operand */ reg = gpr_map[vie->reg]; - error = vie_read_register(vm, vcpuid, reg, &val1); + error = vm_get_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ - error = memread(vm, vcpuid, gpa, &val2, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); if (error) break; @@ -1071,7 +1180,7 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, */ /* get the first operand */ - error = memread(vm, vcpuid, gpa, &val1, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size); if (error) break; @@ -1080,7 +1189,7 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * operand and write the result */ result = val1 & vie->immediate; - error = memwrite(vm, vcpuid, gpa, result, size, arg); + error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size); break; default: break; @@ -1088,7 +1197,7 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, if (error) return (error); - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); @@ -1107,8 +1216,7 @@ emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, } static int -emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { int error, size; enum vm_reg_name reg; @@ -1130,12 +1238,12 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, /* get the first operand */ reg = gpr_map[vie->reg]; - error = vie_read_register(vm, vcpuid, reg, &val1); + error = vm_get_register(vm, vcpuid, reg, &val1); if (error) break; - + /* get the second operand */ - error = memread(vm, vcpuid, gpa, &val2, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); if (error) break; @@ -1159,7 +1267,7 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, */ /* get the first operand */ - error = memread(vm, vcpuid, gpa, &val1, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val1, size); if (error) break; @@ -1168,7 +1276,7 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * operand and write the result */ result = val1 | vie->immediate; - error = memwrite(vm, vcpuid, gpa, result, size, arg); + error = vie_mmio_write(vie, vm, vcpuid, gpa, result, size); break; default: break; @@ -1176,7 +1284,7 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, if (error) return (error); - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); @@ -1195,8 +1303,7 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, } static int -emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { int error, size; uint64_t 
regop, memop, op1, op2, rflags, rflags2; @@ -1223,12 +1330,12 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, /* Get the register operand */ reg = gpr_map[vie->reg]; - error = vie_read_register(vm, vcpuid, reg, ®op); + error = vm_get_register(vm, vcpuid, reg, ®op); if (error) return (error); /* Get the memory operand */ - error = memread(vm, vcpuid, gpa, &memop, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &memop, size); if (error) return (error); @@ -1267,7 +1374,7 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, size = 1; /* get the first operand */ - error = memread(vm, vcpuid, gpa, &op1, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size); if (error) return (error); @@ -1276,7 +1383,7 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, default: return (EINVAL); } - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); rflags &= ~RFLAGS_STATUS_BITS; @@ -1287,8 +1394,7 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, } static int -emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { int error, size; uint64_t op1, rflags, rflags2; @@ -1311,7 +1417,7 @@ emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, if ((vie->reg & 7) != 0) return (EINVAL); - error = memread(vm, vcpuid, gpa, &op1, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &op1, size); if (error) return (error); @@ -1320,7 +1426,7 @@ emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, default: return (EINVAL); } - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); @@ -1336,16 +1442,16 @@ emulate_test(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, } static int -emulate_bextr(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - struct vm_guest_paging *paging, mem_region_read_t memread, - mem_region_write_t memwrite, void *arg) +emulate_bextr(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { uint64_t src1, src2, dst, rflags; unsigned start, len; int error, size; + struct vm_guest_paging *paging; size = vie->opsize; error = EINVAL; + paging = &vie->paging; /* * VEX.LZ.0F38.W0 F7 /r BEXTR r32a, r/m32, r32b @@ -1364,13 +1470,13 @@ emulate_bextr(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * operand) using an index and length specified in the second /source/ * operand (third operand). 
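 *
 * (Editorial worked example, not part of this change: bits 7:0 of the
 * register source select the start position and bits 15:8 the length,
 * so with src2 == 0x0604 and a memory operand of 0x12345678 the result
 * is
 *
 *	(0x12345678 >> 4) & ((1 << 6) - 1) == 0x27
 *
 * i.e. six bits extracted starting at bit 4.)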
*/ - error = memread(vm, vcpuid, gpa, &src1, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &src1, size); if (error) return (error); - error = vie_read_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2); + error = vm_get_register(vm, vcpuid, gpr_map[vie->vex_reg], &src2); if (error) return (error); - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); @@ -1413,8 +1519,7 @@ done: } static int -emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { int error, size; uint64_t nval, rflags, rflags2, val1, val2; @@ -1435,12 +1540,12 @@ emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, /* get the first operand */ reg = gpr_map[vie->reg]; - error = vie_read_register(vm, vcpuid, reg, &val1); + error = vm_get_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ - error = memread(vm, vcpuid, gpa, &val2, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); if (error) break; @@ -1454,7 +1559,7 @@ emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, if (!error) { rflags2 = getaddflags(size, val1, val2); - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); @@ -1469,8 +1574,7 @@ emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, } static int -emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { int error, size; uint64_t nval, rflags, rflags2, val1, val2; @@ -1483,7 +1587,7 @@ emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, case 0x2B: /* * SUB r/m from r and store the result in r - * + * * 2B/r SUB r16, r/m16 * 2B/r SUB r32, r/m32 * REX.W + 2B/r SUB r64, r/m64 @@ -1491,12 +1595,12 @@ emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, /* get the first operand */ reg = gpr_map[vie->reg]; - error = vie_read_register(vm, vcpuid, reg, &val1); + error = vm_get_register(vm, vcpuid, reg, &val1); if (error) break; /* get the second operand */ - error = memread(vm, vcpuid, gpa, &val2, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val2, size); if (error) break; @@ -1510,7 +1614,7 @@ emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, if (!error) { rflags2 = getcc(size, val1, val2); - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); if (error) return (error); @@ -1525,22 +1629,18 @@ emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, } static int -emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, - struct vm_guest_paging *paging, mem_region_read_t memread, - mem_region_write_t memwrite, void *arg) +emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie) { -#ifdef _KERNEL struct vm_copyinfo copyinfo[2]; -#else - struct iovec copyinfo[2]; -#endif struct seg_desc ss_desc; uint64_t cr0, rflags, rsp, stack_gla, val; int error, fault, size, stackaddrsize, pushop; + struct vm_guest_paging *paging; val = 0; size = vie->opsize; pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 
1 : 0; + paging = &vie->paging; /* * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1 @@ -1572,13 +1672,13 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, stackaddrsize = 2; } - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0); KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error)); - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp); KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error)); if (pushop) { rsp -= size; @@ -1608,12 +1708,12 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, return (error); if (pushop) { - error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); + error = vie_mmio_read(vie, vm, vcpuid, mmio_gpa, &val, size); if (error == 0) vm_copyout(vm, vcpuid, &val, copyinfo, size); } else { vm_copyin(vm, vcpuid, copyinfo, &val, size); - error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg); + error = vie_mmio_write(vie, vm, vcpuid, mmio_gpa, val, size); rsp += size; } vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); @@ -1627,9 +1727,7 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, } static int -emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, - struct vm_guest_paging *paging, mem_region_read_t memread, - mem_region_write_t memwrite, void *arg) +emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie) { int error; @@ -1642,15 +1740,12 @@ emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, if ((vie->reg & 7) != 6) return (EINVAL); - error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, - memwrite, arg); + error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie); return (error); } static int -emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, - struct vm_guest_paging *paging, mem_region_read_t memread, - mem_region_write_t memwrite, void *arg) +emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie) { int error; @@ -1663,30 +1758,24 @@ emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, if ((vie->reg & 7) != 0) return (EINVAL); - error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread, - memwrite, arg); + error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie); return (error); } static int -emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - struct vm_guest_paging *paging, mem_region_read_t memread, - mem_region_write_t memwrite, void *memarg) +emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { int error; switch (vie->reg & 7) { case 0x1: /* OR */ - error = emulate_or(vm, vcpuid, gpa, vie, - memread, memwrite, memarg); + error = emulate_or(vm, vcpuid, gpa, vie); break; case 0x4: /* AND */ - error = emulate_and(vm, vcpuid, gpa, vie, - memread, memwrite, memarg); + error = emulate_and(vm, vcpuid, gpa, vie); break; case 0x7: /* CMP */ - error = emulate_cmp(vm, vcpuid, gpa, vie, - memread, memwrite, memarg); + error = emulate_cmp(vm, vcpuid, gpa, vie); break; default: error = EINVAL; @@ -1697,8 +1786,7 @@ emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, } static int -emulate_bittest(void *vm, int vcpuid, 
uint64_t gpa, struct vie *vie, - mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) +emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { uint64_t val, rflags; int error, bitmask, bitoff; @@ -1712,10 +1800,10 @@ emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, if ((vie->reg & 7) != 4) return (EINVAL); - error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); - error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &val, vie->opsize); if (error) return (error); @@ -1739,8 +1827,7 @@ emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, } static int -emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) +emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie) { int error; uint64_t buf; @@ -1758,7 +1845,7 @@ emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * CLFLUSH, CLFLUSHOPT. Only check for access * rights. */ - error = memread(vm, vcpuid, gpa, &buf, 1, memarg); + error = vie_mmio_read(vie, vm, vcpuid, gpa, &buf, 1); } break; default: @@ -1769,91 +1856,460 @@ emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (error); } +static int +vie_mmio_read(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa, + uint64_t *rval, int bytes) +{ + int err; + + if (vie->mmio_req_read.state == VR_DONE) { + ASSERT(vie->mmio_req_read.bytes == bytes); + ASSERT(vie->mmio_req_read.gpa == gpa); + + *rval = vie->mmio_req_read.data; + return (0); + } + + err = vm_service_mmio_read(vm, cpuid, gpa, rval, bytes); + if (err == 0) { + /* + * A successful read from an in-kernel-emulated device may come + * with side effects, so stash the result in case it's used for + * an instruction which subsequently needs to issue an MMIO + * write to userspace. + */ + ASSERT(vie->mmio_req_read.state == VR_NONE); + + vie->mmio_req_read.bytes = bytes; + vie->mmio_req_read.gpa = gpa; + vie->mmio_req_read.data = *rval; + vie->mmio_req_read.state = VR_DONE; + + } else if (err == ESRCH) { + /* Hope that userspace emulation can fulfill this read */ + vie->mmio_req_read.bytes = bytes; + vie->mmio_req_read.gpa = gpa; + vie->mmio_req_read.state = VR_PENDING; + vie->status |= VIES_PENDING_MMIO; + } + return (err); +} + +static int +vie_mmio_write(struct vie *vie, struct vm *vm, int cpuid, uint64_t gpa, + uint64_t wval, int bytes) +{ + int err; + + if (vie->mmio_req_write.state == VR_DONE) { + ASSERT(vie->mmio_req_write.bytes == bytes); + ASSERT(vie->mmio_req_write.gpa == gpa); + + return (0); + } + + err = vm_service_mmio_write(vm, cpuid, gpa, wval, bytes); + if (err == 0) { + /* + * A successful write to an in-kernel-emulated device probably + * results in side effects, so stash the fact that such a write + * succeeded in case the operation requires other work. 
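 *
 * (Editorial illustration, not part of this change: the cached VR_DONE
 * state matters when a single instruction touches both an in-kernel
 * device and a userspace-emulated one, e.g. a MOVS between the two:
 *
 *	1. vie_mmio_read() succeeds in-kernel; the result is cached
 *	   as VR_DONE
 *	2. vie_mmio_write() gets ESRCH, so it goes VR_PENDING and the
 *	   vCPU exits to userspace
 *	3. once vie_fulfill_mmio() lands, the emulate function is re-run
 *	   and step 1 is satisfied from the cache, sparing the device a
 *	   second, side-effect-laden read)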
+ */ + vie->mmio_req_write.bytes = bytes; + vie->mmio_req_write.gpa = gpa; + vie->mmio_req_write.data = wval; + vie->mmio_req_write.state = VR_DONE; + } else if (err == ESRCH) { + /* Hope that userspace emulation can fulfill this write */ + vie->mmio_req_write.bytes = bytes; + vie->mmio_req_write.gpa = gpa; + vie->mmio_req_write.data = wval; + vie->mmio_req_write.state = VR_PENDING; + vie->status |= VIES_PENDING_MMIO; + } + return (err); +} + int -vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, - struct vm_guest_paging *paging, mem_region_read_t memread, - mem_region_write_t memwrite, void *memarg) +vie_emulate_mmio(struct vie *vie, void *vm, int vcpuid) { int error; + uint64_t gpa; - if (!vie->decoded) + if ((vie->status & (VIES_INST_DECODE | VIES_MMIO)) != + (VIES_INST_DECODE | VIES_MMIO)) { return (EINVAL); + } + + gpa = vie->mmio_gpa; switch (vie->op.op_type) { case VIE_OP_TYPE_GROUP1: - error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread, - memwrite, memarg); + error = emulate_group1(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_POP: - error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread, - memwrite, memarg); + error = emulate_pop(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_PUSH: - error = emulate_push(vm, vcpuid, gpa, vie, paging, memread, - memwrite, memarg); + error = emulate_push(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_CMP: - error = emulate_cmp(vm, vcpuid, gpa, vie, - memread, memwrite, memarg); + error = emulate_cmp(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_MOV: - error = emulate_mov(vm, vcpuid, gpa, vie, - memread, memwrite, memarg); + error = emulate_mov(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_MOVSX: case VIE_OP_TYPE_MOVZX: - error = emulate_movx(vm, vcpuid, gpa, vie, - memread, memwrite, memarg); + error = emulate_movx(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_MOVS: - error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread, - memwrite, memarg); + error = emulate_movs(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_STOS: - error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread, - memwrite, memarg); + error = emulate_stos(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_AND: - error = emulate_and(vm, vcpuid, gpa, vie, - memread, memwrite, memarg); + error = emulate_and(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_OR: - error = emulate_or(vm, vcpuid, gpa, vie, - memread, memwrite, memarg); + error = emulate_or(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_SUB: - error = emulate_sub(vm, vcpuid, gpa, vie, - memread, memwrite, memarg); + error = emulate_sub(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_BITTEST: - error = emulate_bittest(vm, vcpuid, gpa, vie, - memread, memwrite, memarg); + error = emulate_bittest(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_TWOB_GRP15: - error = emulate_twob_group15(vm, vcpuid, gpa, vie, - memread, memwrite, memarg); + error = emulate_twob_group15(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_ADD: - error = emulate_add(vm, vcpuid, gpa, vie, memread, - memwrite, memarg); + error = emulate_add(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_TEST: - error = emulate_test(vm, vcpuid, gpa, vie, - memread, memwrite, memarg); + error = emulate_test(vm, vcpuid, gpa, vie); break; case VIE_OP_TYPE_BEXTR: - error = emulate_bextr(vm, vcpuid, gpa, vie, paging, - memread, memwrite, memarg); + error = emulate_bextr(vm, vcpuid, gpa, vie); break; default: error = EINVAL; break; } + if (error == ESRCH) { + /* Return to userspace with the mmio request */ + return (-1); + } + return 
(error); } +static int +vie_emulate_inout_port(struct vie *vie, struct vm *vm, int vcpuid) +{ + uint32_t mask, val; + bool in; + int err; + + mask = vie_size2mask(vie->inout.bytes); + in = (vie->inout.flags & INOUT_IN) != 0; + + if (!in) { + val = vie->inout.eax & mask; + } + + if (vie->inout_req_state != VR_DONE) { + err = vm_inout_access(vm, vcpuid, in, vie->inout.port, + vie->inout.bytes, &val); + } else { + /* + * This port access was handled in userspace and the result was + * injected in to be handled now. + */ + val = vie->inout_req_val; + vie->inout_req_state = VR_NONE; + err = 0; + } + + if (err == ESRCH) { + vie->status |= VIES_PENDING_INOUT; + vie->inout_req_state = VR_PENDING; + return (err); + } else if (err != 0) { + return (err); + } + + if (in) { + val &= mask; + val |= (vie->inout.eax & ~mask); + err = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX, val); + KASSERT(err == 0, ("emulate_ioport: error %d setting guest " + "rax register", err)); + } + return (0); +} + +static enum vm_reg_name +vie_inout_segname(const struct vie *vie) +{ + uint8_t segidx = vie->inout.segment; + const enum vm_reg_name segmap[] = { + VM_REG_GUEST_ES, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS, + }; + const uint8_t maxidx = (sizeof (segmap) / sizeof (segmap[0])); + + if (segidx >= maxidx) { + panic("unexpected segment index %u", segidx); + } + return (segmap[segidx]); +} + +static int +vie_emulate_inout_str(struct vie *vie, struct vm *vm, int vcpuid) +{ + uint8_t bytes, addrsize; + uint64_t index, count = 0, gla, rflags; + int prot, err, fault; + bool in, repeat; + enum vm_reg_name seg_reg, idx_reg; + struct vm_copyinfo copyinfo[2]; + + in = (vie->inout.flags & INOUT_IN) != 0; + bytes = vie->inout.bytes; + addrsize = vie->inout.addrsize; + prot = in ? PROT_WRITE : PROT_READ; + + ASSERT(bytes == 1 || bytes == 2 || bytes == 4); + ASSERT(addrsize == 2 || addrsize == 4 || addrsize == 8); + + idx_reg = (in) ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI; + seg_reg = vie_inout_segname(vie); + err = vm_get_register(vm, vcpuid, idx_reg, &index); + ASSERT(err == 0); + index = index & vie_size2mask(addrsize); + + repeat = (vie->inout.flags & INOUT_REP) != 0; + + /* Count register */ + if (repeat) { + err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &count); + count &= vie_size2mask(addrsize); + + if (count == 0) { + /* + * If we were asked to emulate a REP INS/OUTS when the + * count register is zero, no further work is required. + */ + return (0); + } + } else { + count = 1; + } + + gla = 0; + if (vie_get_gla(vie, vm, vcpuid, bytes, addrsize, prot, seg_reg, + idx_reg, &gla) != 0) { + /* vie_get_gla() already injected the appropriate fault */ + return (0); + } + + /* + * The INS/OUTS emulate currently assumes that the memory target resides + * within the guest system memory, rather than a device MMIO region. If + * such a case becomes a necessity, that additional handling could be + * put in place. 
+static enum vm_reg_name
+vie_inout_segname(const struct vie *vie)
+{
+	uint8_t segidx = vie->inout.segment;
+	const enum vm_reg_name segmap[] = {
+		VM_REG_GUEST_ES,
+		VM_REG_GUEST_CS,
+		VM_REG_GUEST_SS,
+		VM_REG_GUEST_DS,
+		VM_REG_GUEST_FS,
+		VM_REG_GUEST_GS,
+	};
+	const uint8_t maxidx = (sizeof (segmap) / sizeof (segmap[0]));
+
+	if (segidx >= maxidx) {
+		panic("unexpected segment index %u", segidx);
+	}
+	return (segmap[segidx]);
+}
+
+static int
+vie_emulate_inout_str(struct vie *vie, struct vm *vm, int vcpuid)
+{
+	uint8_t bytes, addrsize;
+	uint64_t index, count = 0, gla, rflags;
+	int prot, err, fault;
+	bool in, repeat;
+	enum vm_reg_name seg_reg, idx_reg;
+	struct vm_copyinfo copyinfo[2];
+
+	in = (vie->inout.flags & INOUT_IN) != 0;
+	bytes = vie->inout.bytes;
+	addrsize = vie->inout.addrsize;
+	prot = in ? PROT_WRITE : PROT_READ;
+
+	ASSERT(bytes == 1 || bytes == 2 || bytes == 4);
+	ASSERT(addrsize == 2 || addrsize == 4 || addrsize == 8);
+
+	idx_reg = (in) ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
+	seg_reg = vie_inout_segname(vie);
+	err = vm_get_register(vm, vcpuid, idx_reg, &index);
+	ASSERT(err == 0);
+	index = index & vie_size2mask(addrsize);
+
+	repeat = (vie->inout.flags & INOUT_REP) != 0;
+
+	/* Count register */
+	if (repeat) {
+		err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RCX, &count);
+		count &= vie_size2mask(addrsize);
+
+		if (count == 0) {
+			/*
+			 * If we were asked to emulate a REP INS/OUTS when the
+			 * count register is zero, no further work is required.
+			 */
+			return (0);
+		}
+	} else {
+		count = 1;
+	}
+
+	gla = 0;
+	if (vie_get_gla(vie, vm, vcpuid, bytes, addrsize, prot, seg_reg,
+	    idx_reg, &gla) != 0) {
+		/* vie_get_gla() already injected the appropriate fault */
+		return (0);
+	}
+
+	/*
+	 * The INS/OUTS emulation currently assumes that the memory target
+	 * resides within the guest system memory, rather than a device MMIO
+	 * region.  If such a case becomes a necessity, that additional
+	 * handling could be put in place.
+	 */
+	err = vm_copy_setup(vm, vcpuid, &vie->paging, gla, bytes, prot,
+	    copyinfo, nitems(copyinfo), &fault);
+
+	if (err) {
+		/* Unrecoverable error */
+		return (err);
+	} else if (fault) {
+		/* Resume guest to handle fault */
+		return (0);
+	}
+
+	if (!in) {
+		vm_copyin(vm, vcpuid, copyinfo, &vie->inout.eax, bytes);
+	}
+
+	err = vie_emulate_inout_port(vie, vm, vcpuid);
+
+	if (err == 0 && in) {
+		vm_copyout(vm, vcpuid, &vie->inout.eax, copyinfo, bytes);
+	}
+
+	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+
+	if (err == 0) {
+		err = vm_get_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
+		    &rflags);
+		ASSERT(err == 0);
+
+		/* Update index */
+		if (rflags & PSL_D) {
+			index -= bytes;
+		} else {
+			index += bytes;
+		}
+
+		/* Update index register */
+		err = vie_update_register(vm, vcpuid, idx_reg, index,
+		    addrsize);
+		ASSERT(err == 0);
+
+		/*
+		 * Update count register only if the instruction had a repeat
+		 * prefix.
+		 */
+		if ((vie->inout.flags & INOUT_REP) != 0) {
+			count--;
+			err = vie_update_register(vm, vcpuid,
+			    VM_REG_GUEST_RCX, count, addrsize);
+			ASSERT(err == 0);
+
+			if (count != 0) {
+				return (vie_repeat(vie));
+			}
+		}
+	}
+
+	return (err);
+}
+
+int
+vie_emulate_inout(struct vie *vie, struct vm *vm, int vcpuid)
+{
+	int err = 0;
+
+	if ((vie->status & VIES_INOUT) == 0) {
+		return (EINVAL);
+	}
+
+	if ((vie->inout.flags & INOUT_STR) == 0) {
+		/*
+		 * For now, using the 'rep' prefixes with plain (non-string)
+		 * in/out is not supported.
+		 */
+		if ((vie->inout.flags & INOUT_REP) != 0) {
+			return (EINVAL);
+		}
+
+		err = vie_emulate_inout_port(vie, vm, vcpuid);
+
+		if (err == ESRCH) {
+			ASSERT(vie->status & VIES_PENDING_INOUT);
+			/* Return to userspace with the in/out request */
+			err = -1;
+		}
+	} else {
+		vie->status &= ~VIES_REPEAT;
+		err = vie_emulate_inout_str(vie, vm, vcpuid);
+
+		if (err == ESRCH) {
+			ASSERT(vie->status & VIES_PENDING_INOUT);
+			/* Return to userspace with the in/out request */
+			err = -1;
+		}
+	}
+
+	return (err);
+}
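When vie_emulate_inout() hands back -1, vm_run() returns to userspace with a VM_EXITCODE_INOUT exit, and the next entry carries the result back in via VEC_COMPLETE_INOUT. A hypothetical userspace fragment of that completion (handle_in() is an assumed device-model helper, and the vmm headers above are assumed to be in scope):

    /* Assumed device-model lookup; not part of this change. */
    extern uint32_t handle_in(uint16_t port, uint8_t bytes);

    static void
    complete_inout(struct vm_entry *entry, const struct vm_exit *vme)
    {
            struct vm_inout *result = &entry->u.inout;

            /* Echo the port/size back so the kernel can match the request */
            *result = vme->u.inout;
            if (vme->u.inout.flags & INOUT_IN) {
                    /* Supply the data the guest's 'in' will consume */
                    result->eax = handle_in(result->port, result->bytes);
            }
            entry->cmd = VEC_COMPLETE_INOUT;
    }

Echoing the request fields back is not optional: the kernel-side fulfillment below rejects a completion whose direction, size, or port does not match what is pending.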
+
+void
+vie_reset(struct vie *vie)
+{
+	vie->status = 0;
+	vie->num_processed = vie->num_valid = 0;
+}
+
+void
+vie_advance_pc(struct vie *vie, uint64_t *nextrip)
+{
+	VERIFY((vie->status & VIES_REPEAT) == 0);
+
+	*nextrip += vie->num_processed;
+	vie_reset(vie);
+}
+
+void
+vie_exitinfo(const struct vie *vie, struct vm_exit *vme)
+{
+	if (vie->status & VIES_MMIO) {
+		vme->exitcode = VM_EXITCODE_MMIO;
+		if (vie->mmio_req_read.state == VR_PENDING) {
+			vme->u.mmio.gpa = vie->mmio_req_read.gpa;
+			vme->u.mmio.data = 0;
+			vme->u.mmio.bytes = vie->mmio_req_read.bytes;
+			vme->u.mmio.read = 1;
+		} else if (vie->mmio_req_write.state == VR_PENDING) {
+			vme->u.mmio.gpa = vie->mmio_req_write.gpa;
+			vme->u.mmio.data = vie->mmio_req_write.data &
+			    vie_size2mask(vie->mmio_req_write.bytes);
+			vme->u.mmio.bytes = vie->mmio_req_write.bytes;
+			vme->u.mmio.read = 0;
+		} else {
+			panic("bad pending MMIO state");
+		}
+	} else if (vie->status & VIES_INOUT) {
+		vme->exitcode = VM_EXITCODE_INOUT;
+		vme->u.inout.port = vie->inout.port;
+		vme->u.inout.bytes = vie->inout.bytes;
+		if ((vie->inout.flags & INOUT_IN) != 0) {
+			vme->u.inout.flags = INOUT_IN;
+			vme->u.inout.eax = 0;
+		} else {
+			vme->u.inout.flags = 0;
+			vme->u.inout.eax = vie->inout.eax &
+			    vie_size2mask(vie->inout.bytes);
+		}
+	} else {
+		panic("no pending operation");
+	}
+}
+
+/*
+ * In the case of a decoding or verification failure, bailing out to userspace
+ * to do the instruction emulation is our only option for now.
+ */
+void
+vie_fallback_exitinfo(const struct vie *vie, struct vm_exit *vme)
+{
+	if ((vie->status & VIES_INST_FETCH) == 0) {
+		bzero(&vme->u.inst_emul, sizeof (vme->u.inst_emul));
+	} else {
+		ASSERT(sizeof (vie->inst) == sizeof (vme->u.inst_emul.inst));
+
+		bcopy(vie->inst, vme->u.inst_emul.inst, sizeof (vie->inst));
+		vme->u.inst_emul.num_valid = vie->num_valid;
+	}
+	vme->exitcode = VM_EXITCODE_INST_EMUL;
+}
+
+bool
+vie_pending(const struct vie *vie)
+{
+	return ((vie->status & (VIES_PENDING_MMIO|VIES_PENDING_INOUT)) != 0);
+}
+
+bool
+vie_needs_fetch(const struct vie *vie)
+{
+	if (vie->status & VIES_INST_FETCH) {
+		ASSERT(vie->num_valid != 0);
+		return (false);
+	}
+	return (true);
+}
+
-int
+static int
 vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
 {
 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
@@ -1866,7 +2322,7 @@ vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
 	return ((gla & (size - 1)) ? 1 : 0);
 }
 
-int
+static int
 vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
 {
 	uint64_t mask;
@@ -1885,7 +2341,7 @@ vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
 	return ((gla & mask) != 0);
 }
 
-uint64_t
+static uint64_t
 vie_size2mask(int size)
 {
 	KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
@@ -1893,7 +2349,7 @@ vie_size2mask(int size)
 	return (size2mask[size]);
 }
-int
+static int
 vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
     struct seg_desc *desc, uint64_t offset, int length, int addrsize,
     int prot, uint64_t *gla)
@@ -1905,13 +2361,8 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
 	    ("%s: invalid segment %d", __func__, seg));
 	KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
 	    ("%s: invalid operand size %d", __func__, length));
-#ifdef __FreeBSD__
-	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
-	    ("%s: invalid prot %#x", __func__, prot));
-#else
 	KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
 	    ("%s: invalid prot %x", __func__, prot));
-#endif
 
 	firstoff = offset;
 	if (cpu_mode == CPU_MODE_64BIT) {
@@ -1930,31 +2381,21 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
 		if (SEG_DESC_UNUSABLE(desc->access))
 			return (-1);
 
-		/* 
+		/*
 		 * The processor generates a #NP exception when a segment
 		 * register is loaded with a selector that points to a
 		 * descriptor that is not present. If this was the case then
 		 * it would have been checked before the VM-exit.
 		 */
-#ifdef __FreeBSD__
-		KASSERT(SEG_DESC_PRESENT(desc->access),
-		    ("segment %d not present: %#x", seg, desc->access));
-#else
 		KASSERT(SEG_DESC_PRESENT(desc->access),
 		    ("segment %d not present: %x", seg, desc->access));
-#endif
 
 		/*
 		 * The descriptor type must indicate a code/data segment.
 		 */
 		type = SEG_DESC_TYPE(desc->access);
-#ifdef __FreeBSD__
-		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
-		    "descriptor type %#x", seg, type));
-#else
 		KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
 		    "descriptor type %x", seg, type));
-#endif
 
 		if (prot & PROT_READ) {
 			/* #GP on a read access to an exec-only code segment */
@@ -2019,24 +2460,107 @@ vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
 }
 
 void
-vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
+vie_init_mmio(struct vie *vie, const char *inst_bytes, uint8_t inst_length,
+    const struct vm_guest_paging *paging, uint64_t gpa)
 {
-	KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
+	KASSERT(inst_length <= VIE_INST_SIZE,
 	    ("%s: invalid instruction length (%d)", __func__, inst_length));
 
-	bzero(vie, sizeof(struct vie));
+	bzero(vie, sizeof (struct vie));
 
 	vie->base_register = VM_REG_LAST;
 	vie->index_register = VM_REG_LAST;
 	vie->segment_register = VM_REG_LAST;
+	vie->status = VIES_INIT | VIES_MMIO;
 
-	if (inst_length) {
+	if (inst_length != 0) {
 		bcopy(inst_bytes, vie->inst, inst_length);
 		vie->num_valid = inst_length;
+		vie->status |= VIES_INST_FETCH;
 	}
+
+	vie->paging = *paging;
+	vie->mmio_gpa = gpa;
+}
+
+void
+vie_init_inout(struct vie *vie, const struct vm_inout *inout, uint8_t inst_len,
+    const struct vm_guest_paging *paging)
+{
+	bzero(vie, sizeof (struct vie));
+
+	vie->status = VIES_INIT | VIES_INOUT;
+
+	vie->inout = *inout;
+	vie->paging = *paging;
+
+	/*
+	 * Since VMX/SVM assists already decoded the nature of the in/out
+	 * instruction, let the status reflect that.
+	 */
+	vie->status |= VIES_INST_FETCH | VIES_INST_DECODE;
+	vie->num_processed = inst_len;
+}
+
+int
+vie_fulfill_mmio(struct vie *vie, const struct vm_mmio *result)
+{
+	struct vie_mmio *pending;
+
+	if ((vie->status & VIES_MMIO) == 0 ||
+	    (vie->status & VIES_PENDING_MMIO) == 0) {
+		return (EINVAL);
+	}
+
+	if (result->read) {
+		pending = &vie->mmio_req_read;
+	} else {
+		pending = &vie->mmio_req_write;
+	}
+
+	if (pending->state != VR_PENDING ||
+	    pending->bytes != result->bytes || pending->gpa != result->gpa) {
+		return (EINVAL);
+	}
+
+	if (result->read) {
+		pending->data = result->data & vie_size2mask(pending->bytes);
+	}
+	pending->state = VR_DONE;
+	vie->status &= ~VIES_PENDING_MMIO;
+
+	return (0);
+}
+
+int
+vie_fulfill_inout(struct vie *vie, const struct vm_inout *result)
+{
+	if ((vie->status & VIES_INOUT) == 0 ||
+	    (vie->status & VIES_PENDING_INOUT) == 0) {
+		return (EINVAL);
+	}
+
+	if ((vie->inout.flags & INOUT_IN) != (result->flags & INOUT_IN) ||
+	    vie->inout.bytes != result->bytes ||
+	    vie->inout.port != result->port) {
+		return (EINVAL);
+	}
+
+	if (result->flags & INOUT_IN) {
+		vie->inout_req_val = result->eax &
+		    vie_size2mask(vie->inout.bytes);
+	}
+	vie->inout_req_state = VR_DONE;
+	vie->status &= ~(VIES_PENDING_INOUT);
+
+	return (0);
+}
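vie_fulfill_mmio() and vie_fulfill_inout() are the kernel-side landing points for those userspace completions. A sketch of how the next VM_RUN might consume a VEC_COMPLETE_MMIO entry (vcpu_vie() is an assumed accessor used only for illustration, not part of this change):

    /* Assumed accessor for the per-vcpu vie state; illustrative only. */
    extern struct vie *vcpu_vie(struct vm *, int);

    static int
    vm_entry_complete_mmio(struct vm *vm, int vcpuid,
        const struct vm_entry *entry)
    {
            struct vie *vie = vcpu_vie(vm, vcpuid);
            int err;

            /* EINVAL if the result does not match the pending request */
            err = vie_fulfill_mmio(vie, &entry->u.mmio);
            if (err != 0)
                    return (err);

            /* with the pending access satisfied, emulation can finish */
            return (vie_emulate_mmio(vie, vm, vcpuid));
    }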
+
+uint64_t
+vie_mmio_gpa(const struct vie *vie)
+{
+	return (vie->mmio_gpa);
 }
 
-#ifdef _KERNEL
 static int
 pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
 {
@@ -2299,27 +2823,28 @@ vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
 }
 
 int
-vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
-    uint64_t rip, int inst_length, struct vie *vie, int *faultptr)
+vie_fetch_instruction(struct vie *vie, struct vm *vm, int vcpuid, uint64_t rip,
+    int *faultptr)
 {
 	struct vm_copyinfo copyinfo[2];
 	int error, prot;
 
-	if (inst_length > VIE_INST_SIZE)
-		panic("vmm_fetch_instruction: invalid length %d", inst_length);
+	if (vie->status != (VIES_INIT|VIES_MMIO)) {
+		return (EINVAL);
+	}
 
 	prot = PROT_READ | PROT_EXEC;
-	error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
-	    copyinfo, nitems(copyinfo), faultptr);
+	error = vm_copy_setup(vm, vcpuid, &vie->paging, rip, VIE_INST_SIZE,
+	    prot, copyinfo, nitems(copyinfo), faultptr);
 	if (error || *faultptr)
 		return (error);
 
-	vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
+	vm_copyin(vm, vcpuid, copyinfo, vie->inst, VIE_INST_SIZE);
 	vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
-	vie->num_valid = inst_length;
+	vie->num_valid = VIE_INST_SIZE;
+	vie->status |= VIES_INST_FETCH;
 	return (0);
 }
-#endif	/* _KERNEL */
 
 static int
 vie_peek(struct vie *vie, uint8_t *x)
@@ -2821,23 +3346,28 @@ decode_moffset(struct vie *vie)
 	return (0);
 }
 
-#ifdef _KERNEL
 /*
  * Verify that the 'guest linear address' provided as collateral of the nested
  * page table fault matches with our instruction decoding.
  */
-static int
-verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
-    enum vm_cpu_mode cpu_mode)
+int
+vie_verify_gla(struct vie *vie, struct vm *vm, int cpuid, uint64_t gla)
 {
 	int error;
 	uint64_t base, segbase, idx, gla2;
 	enum vm_reg_name seg;
 	struct seg_desc desc;
 
-	/* Skip 'gla' verification */
-	if (gla == VIE_INVALID_GLA)
+	ASSERT((vie->status & VIES_INST_DECODE) != 0);
+
+	/*
+	 * If there was no valid GLA context with the exit, or the decoded
+	 * instruction acts on more than one address, verification is done.
+	 */
+	if (gla == VIE_INVALID_GLA ||
+	    (vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) != 0) {
 		return (0);
+	}
 
 	base = 0;
 	if (vie->base_register != VM_REG_LAST) {
@@ -2879,15 +3409,16 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
 	 * string destination the DS segment is the default.  These
 	 * can be overridden to allow other segments to be accessed.
 	 */
-	if (vie->segment_override)
+	if (vie->segment_override) {
 		seg = vie->segment_register;
-	else if (vie->base_register == VM_REG_GUEST_RSP ||
-	    vie->base_register == VM_REG_GUEST_RBP)
+	} else if (vie->base_register == VM_REG_GUEST_RSP ||
+	    vie->base_register == VM_REG_GUEST_RBP) {
 		seg = VM_REG_GUEST_SS;
-	else
+	} else {
 		seg = VM_REG_GUEST_DS;
-	if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
-	    seg != VM_REG_GUEST_GS) {
+	}
+	if (vie->paging.cpu_mode == CPU_MODE_64BIT &&
+	    seg != VM_REG_GUEST_FS && seg != VM_REG_GUEST_GS) {
 		segbase = 0;
 	} else {
 		error = vm_get_seg_desc(vm, cpuid, seg, &desc);
@@ -2913,16 +3444,17 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie,
 	return (0);
 }
-#endif	/* _KERNEL */
 
 int
-#ifdef _KERNEL
-vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
-    enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
-#else
-vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
-#endif
+vie_decode_instruction(struct vie *vie, struct vm *vm, int cpuid, int cs_d)
 {
+	enum vm_cpu_mode cpu_mode;
+
+	if ((vie->status & VIES_INST_FETCH) == 0) {
+		return (EINVAL);
+	}
+
+	cpu_mode = vie->paging.cpu_mode;
+
 	if (decode_prefixes(vie, cpu_mode, cs_d))
 		return (-1);
 
@@ -2945,14 +3477,7 @@ vmm_decode_instruction(enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
 	if (decode_moffset(vie))
 		return (-1);
 
-#ifdef _KERNEL
-	if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
-		if (verify_gla(vm, cpuid, gla, vie, cpu_mode))
-			return (-1);
-	}
-#endif
-
-	vie->decoded = 1;	/* success */
+	vie->status |= VIES_INST_DECODE;
 
 	return (0);
 }
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c
index 3d08fd5e85..01fae7d584 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c
@@ -25,6 +25,18 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2020 Oxide Computer Company
+ */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
@@ -33,18 +45,16 @@ __FBSDID("$FreeBSD$");
 #include <sys/systm.h>
 
 #include <machine/vmm.h>
-#include <machine/vmm_instruction_emul.h>
 
 #include "vatpic.h"
 #include "vatpit.h"
 #include "vpmtmr.h"
 #include "vrtc.h"
 #include "vmm_ioport.h"
-#include "vmm_ktr.h"
 
 #define	MAX_IOPORTS	1280
 
-ioport_handler_func_t ioport_handler[MAX_IOPORTS] = {
+static ioport_handler_func_t ioport_handler[MAX_IOPORTS] = {
 	[TIMER_MODE] = vatpit_handler,
 	[TIMER_CNTR0] = vatpit_handler,
 	[TIMER_CNTR1] = vatpit_handler,
@@ -61,144 +71,24 @@ ioport_handler_func_t ioport_handler[MAX_IOPORTS] = {
 	[IO_RTC + 1] = vrtc_data_handler,
 };
 
-#ifdef KTR
-static const char *
-inout_instruction(struct vm_exit *vmexit)
-{
-	int index;
-
-	static const char *iodesc[] = {
-		"outb", "outw", "outl",
-		"inb", "inw", "inl",
-		"outsb", "outsw", "outsd",
-		"insb", "insw", "insd",
-	};
-
-	switch (vmexit->u.inout.bytes) {
-	case 1:
-		index = 0;
-		break;
-	case 2:
-		index = 1;
-		break;
-	default:
-		index = 2;
-		break;
-	}
-
-	if (vmexit->u.inout.in)
-		index += 3;
-
-	if (vmexit->u.inout.string)
-		index += 6;
-
-	KASSERT(index < nitems(iodesc), ("%s: invalid index %d",
-	    __func__, index));
-
-	return (iodesc[index]);
-}
-#endif	/* KTR */
-
-static int
-emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit,
-    bool *retu)
+int
+vm_inout_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
+    uint8_t bytes, uint32_t *val)
 {
 	ioport_handler_func_t handler;
-	uint32_t mask, val;
 	int error;
 
-#ifdef __FreeBSD__
-	/*
-	 * If there is no handler for the I/O port then punt to userspace.
-	 */
-	if (vmexit->u.inout.port >= MAX_IOPORTS ||
-	    (handler = ioport_handler[vmexit->u.inout.port]) == NULL) {
-		*retu = true;
-		return (0);
-	}
-#else /* __FreeBSD__ */
 	handler = NULL;
-	if (vmexit->u.inout.port < MAX_IOPORTS) {
-		handler = ioport_handler[vmexit->u.inout.port];
+	if (port < MAX_IOPORTS) {
+		handler = ioport_handler[port];
 	}
-	/* Look for hooks, if a standard handler is not present */
-	if (handler == NULL) {
-		mask = vie_size2mask(vmexit->u.inout.bytes);
-		if (!vmexit->u.inout.in) {
-			val = vmexit->u.inout.eax & mask;
-		}
-		error = vm_ioport_handle_hook(vm, vcpuid, vmexit->u.inout.in,
-		    vmexit->u.inout.port, vmexit->u.inout.bytes, &val);
-		if (error == 0) {
-			goto finish;
-		}
-		*retu = true;
-		return (0);
+	if (handler != NULL) {
+		error = (*handler)(vm, vcpuid, in, port, bytes, val);
+	} else {
+		/* Look for hooks, if a standard handler is not present */
+		error = vm_ioport_handle_hook(vm, vcpuid, in, port, bytes, val);
 	}
-#endif /* __FreeBSD__ */
-
-	mask = vie_size2mask(vmexit->u.inout.bytes);
-
-	if (!vmexit->u.inout.in) {
-		val = vmexit->u.inout.eax & mask;
-	}
-
-	error = (*handler)(vm, vcpuid, vmexit->u.inout.in,
-	    vmexit->u.inout.port, vmexit->u.inout.bytes, &val);
-	if (error) {
-		/*
-		 * The value returned by this function is also the return value
-		 * of vm_run().  This needs to be a positive number otherwise
-		 * it can be interpreted as a "pseudo-error" like ERESTART.
-		 *
-		 * Enforce this by mapping all errors to EIO.
-		 */
-		return (EIO);
-	}
-
-#ifndef __FreeBSD__
-finish:
-#endif /* __FreeBSD__ */
-	if (vmexit->u.inout.in) {
-		vmexit->u.inout.eax &= ~mask;
-		vmexit->u.inout.eax |= val & mask;
-		error = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
-		    vmexit->u.inout.eax);
-		KASSERT(error == 0, ("emulate_ioport: error %d setting guest "
-		    "rax register", error));
-	}
-	*retu = false;
-	return (0);
-}
-
-static int
-emulate_inout_str(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu)
-{
-	*retu = true;
-	return (0);	/* Return to userspace to finish emulation */
-}
-
-int
-vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu)
-{
-	int bytes, error;
-
-	bytes = vmexit->u.inout.bytes;
-	KASSERT(bytes == 1 || bytes == 2 || bytes == 4,
-	    ("vm_handle_inout: invalid operand size %d", bytes));
-
-	if (vmexit->u.inout.string)
-		error = emulate_inout_str(vm, vcpuid, vmexit, retu);
-	else
-		error = emulate_inout_port(vm, vcpuid, vmexit, retu);
-
-	VCPU_CTR4(vm, vcpuid, "%s%s 0x%04x: %s",
-	    vmexit->u.inout.rep ? "rep " : "",
-	    inout_instruction(vmexit),
-	    vmexit->u.inout.port,
-	    error ? "error" : (*retu ? "userspace" : "handled"));
 
 	return (error);
 }
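Entries in ioport_handler[] (and registered hooks) share one signature, now carrying explicit uint16_t/uint8_t types for port and size. A minimal hypothetical handler for a one-byte scratch register, written to match the ioport_handler_func_t typedef below (the port itself and the handler are inventions for illustration):

    static uint8_t scratch_val;

    /* Hypothetical one-byte scratch-port handler; not part of this change. */
    static int
    scratch_handler(struct vm *vm, int vcpuid, bool in, uint16_t port,
        uint8_t bytes, uint32_t *val)
    {
            if (bytes != 1)
                    return (EINVAL);

            if (in)
                    *val = scratch_val;     /* guest 'in' reads the latch */
            else
                    scratch_val = (uint8_t)*val;    /* 'out' updates it */

            return (0);
    }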
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h
index 14e315f400..7c51906e85 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h
+++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h
@@ -32,8 +32,9 @@
 #define	_VMM_IOPORT_H_
 
 typedef int (*ioport_handler_func_t)(struct vm *vm, int vcpuid,
-    bool in, int port, int bytes, uint32_t *val);
+    bool in, uint16_t port, uint8_t bytes, uint32_t *val);
 
-int vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu);
+int vm_inout_access(struct vm *vm, int vcpuid, bool in, uint16_t port,
+    uint8_t bytes, uint32_t *val);
 
 #endif	/* _VMM_IOPORT_H_ */
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
index 6526188b1c..3fd7f862d1 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
@@ -500,25 +500,27 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
 	/* Execute the primary logic for the ioctl. */
 	switch (cmd) {
 	case VM_RUN: {
-		struct vm_run vmrun;
+		struct vm_entry entry;
 
-		if (ddi_copyin(datap, &vmrun, sizeof (vmrun), md)) {
+		if (ddi_copyin(datap, &entry, sizeof (entry), md)) {
 			error = EFAULT;
 			break;
 		}
-		vmrun.cpuid = vcpu;
 
 		if (!(curthread->t_schedflag & TS_VCPU))
 			smt_mark_as_vcpu();
 
-		error = vm_run(sc->vmm_vm, &vmrun);
-		/*
-		 * XXXJOY: I think it's necessary to do copyout, even in the
-		 * face of errors, since the exit state is communicated out.
-		 */
-		if (ddi_copyout(&vmrun, datap, sizeof (vmrun), md)) {
-			error = EFAULT;
-			break;
+		error = vm_run(sc->vmm_vm, vcpu, &entry);
+
+		if (error == 0) {
+			const struct vm_exit *vme;
+			void *outp = entry.exit_data;
+
+			vme = vm_exitinfo(sc->vmm_vm, vcpu);
+			if (ddi_copyout(vme, outp, sizeof (*vme), md)) {
+				error = EFAULT;
+				break;
+			}
 		}
 		break;
 	}
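With vm_run() now taking a vm_entry and copying the vm_exit out through entry.exit_data, the shape of a userspace run loop follows directly. A hypothetical fragment (ioctl plumbing and error handling pared down; vmfd is an assumed open vmm device descriptor, and complete_inout() refers to the sketch earlier in this diff):

    struct vm_exit vmexit;
    struct vm_entry entry = { 0 };

    entry.cpuid = vcpu;             /* vcpu under operation */
    entry.exit_data = &vmexit;      /* kernel copies the vm_exit out here */
    entry.cmd = VEC_DEFAULT;

    for (;;) {
            if (ioctl(vmfd, VM_RUN, &entry) != 0)
                    break;

            /* default for the next entry unless a handler overrides it */
            entry.cmd = VEC_DEFAULT;

            switch (vmexit.exitcode) {
            case VM_EXITCODE_INOUT:
                    /* fills entry.u.inout, sets VEC_COMPLETE_INOUT */
                    complete_inout(&entry, &vmexit);
                    break;
            case VM_EXITCODE_MMIO:
                    /* service vmexit.u.mmio, then set VEC_COMPLETE_MMIO */
                    break;
            default:
                    break;
            }
    }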
@@ -982,9 +984,6 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
 	case VM_GET_KERNEMU_DEV: {
 		struct vm_readwrite_kernemu_device kemu;
 		size_t size = 0;
-		mem_region_write_t mwrite = NULL;
-		mem_region_read_t mread = NULL;
-		uint64_t ignored = 0;
 
 		if (ddi_copyin(datap, &kemu, sizeof (kemu), md)) {
 			error = EFAULT;
@@ -998,31 +997,12 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md,
 		size = (1 << kemu.access_width);
 		ASSERT(size >= 1 && size <= 8);
 
-		if (kemu.gpa >= DEFAULT_APIC_BASE &&
-		    kemu.gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
-			mread = lapic_mmio_read;
-			mwrite = lapic_mmio_write;
-		} else if (kemu.gpa >= VIOAPIC_BASE &&
-		    kemu.gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
-			mread = vioapic_mmio_read;
-			mwrite = vioapic_mmio_write;
-		} else if (kemu.gpa >= VHPET_BASE &&
-		    kemu.gpa < VHPET_BASE + VHPET_SIZE) {
-			mread = vhpet_mmio_read;
-			mwrite = vhpet_mmio_write;
-		} else {
-			error = EINVAL;
-			break;
-		}
-
 		if (cmd == VM_SET_KERNEMU_DEV) {
-			VERIFY(mwrite != NULL);
-			error = mwrite(sc->vmm_vm, vcpu, kemu.gpa, kemu.value,
-			    size, &ignored);
+			error = vm_service_mmio_write(sc->vmm_vm, vcpu,
+			    kemu.gpa, kemu.value, size);
 		} else {
-			VERIFY(mread != NULL);
-			error = mread(sc->vmm_vm, vcpu, kemu.gpa, &kemu.value,
-			    size, &ignored);
+			error = vm_service_mmio_read(sc->vmm_vm, vcpu,
+			    kemu.gpa, &kemu.value, size);
 		}
 
 		if (error == 0) {
@@ -2004,6 +1984,11 @@ vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
 	vmm_softc_t *sc;
 	minor_t minor;
 
+	/* The structs in bhyve ioctls assume a 64-bit datamodel */
+	if (ddi_model_convert_from(mode & FMODELS) != DDI_MODEL_NONE) {
+		return (ENOTSUP);
+	}
+
 	minor = getminor(dev);
 
 	if (minor == VMM_CTL_MINOR) {
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.c b/usr/src/uts/i86pc/io/vmm/vmm_stat.c
index a6af75e40a..42d6f8cfa3 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_stat.c
+++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.c
@@ -163,7 +163,7 @@ VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening");
 VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted");
 VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted");
 VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault");
-VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation");
+VMM_STAT(VMEXIT_MMIO_EMUL, "vm exits for mmio emulation");
 VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason");
 VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit");
 VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit");
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.h b/usr/src/uts/i86pc/io/vmm/vmm_stat.h
index 3232e23888..bfe35e9f67 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_stat.h
+++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.h
@@ -162,7 +162,7 @@ VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW);
 VMM_STAT_DECLARE(VMEXIT_INOUT);
 VMM_STAT_DECLARE(VMEXIT_CPUID);
 VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT);
-VMM_STAT_DECLARE(VMEXIT_INST_EMUL);
+VMM_STAT_DECLARE(VMEXIT_MMIO_EMUL);
 VMM_STAT_DECLARE(VMEXIT_UNKNOWN);
 VMM_STAT_DECLARE(VMEXIT_ASTPENDING);
 VMM_STAT_DECLARE(VMEXIT_USERSPACE);
diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h
index 45838e343e..d6d24f0c37 100644
--- a/usr/src/uts/i86pc/sys/vmm.h
+++ b/usr/src/uts/i86pc/sys/vmm.h
@@ -122,31 +122,13 @@ enum x2apic_state {
 #define	VM_INTINFO_HWEXCEPTION	(3 << 8)
 #define	VM_INTINFO_SWINTR	(4 << 8)
 
-#ifndef __FreeBSD__
 /*
  * illumos doesn't have a limitation based on SPECNAMELEN like FreeBSD does.
  * Instead of picking an arbitrary value we will just rely on the same
  * calculation that's made below.  If this calculation ever changes we need to
 * update the VM_MAX_NAMELEN mapping in the bhyve brand's boot.c file.
  */
-#else
-/*
- * The VM name has to fit into the pathname length constraints of devfs,
- * governed primarily by SPECNAMELEN.  The length is the total number of
- * characters in the full path, relative to the mount point and not
- * including any leading '/' characters.
- * A prefix and a suffix are added to the name specified by the user.
- * The prefix is usually "vmm/" or "vmm.io/", but can be a few characters
- * longer for future use.
- * The suffix is a string that identifies a bootrom image or some similar
- * image that is attached to the VM.  A separator character gets added to
- * the suffix automatically when generating the full path, so it must be
- * accounted for, reducing the effective length by 1.
- * The effective length of a VM name is 229 bytes for FreeBSD 13 and 37
- * bytes for FreeBSD 12.  A minimum length is set for safety and supports
- * a SPECNAMELEN as small as 32 on old systems.
- */
-#endif
+
 #define	VM_MAX_PREFIXLEN 10
 #define	VM_MAX_SUFFIXLEN 15
 #define	VM_MIN_NAMELEN	6
@@ -224,76 +206,6 @@ struct vm_guest_paging {
 	enum vm_paging_mode paging_mode;
 };
 
-/*
- * The data structures 'vie' and 'vie_op' are meant to be opaque to the
- * consumers of instruction decoding.  The only reason why their contents
- * need to be exposed is because they are part of the 'vm_exit' structure.
- */
-struct vie_op {
-	uint8_t		op_byte;	/* actual opcode byte */
-	uint8_t		op_type;	/* type of operation (e.g. MOV) */
-	uint16_t	op_flags;
-};
-_Static_assert(sizeof(struct vie_op) == 4, "ABI");
-_Static_assert(_Alignof(struct vie_op) == 2, "ABI");
-
-#define	VIE_INST_SIZE	15
-struct vie {
-	uint8_t		inst[VIE_INST_SIZE];	/* instruction bytes */
-	uint8_t		num_valid;		/* size of the instruction */
-	uint8_t		num_processed;
-
-	uint8_t		addrsize:4, opsize:4;	/* address and operand sizes */
-	uint8_t		rex_w:1,		/* REX prefix */
-			rex_r:1,
-			rex_x:1,
-			rex_b:1,
-			rex_present:1,
-			repz_present:1,		/* REP/REPE/REPZ prefix */
-			repnz_present:1,	/* REPNE/REPNZ prefix */
-			opsize_override:1,	/* Operand size override */
-			addrsize_override:1,	/* Address size override */
-			segment_override:1;	/* Segment override */
-
-	uint8_t		mod:2,			/* ModRM byte */
-			reg:4,
-			rm:4;
-
-	uint8_t		ss:2,			/* SIB byte */
-			vex_present:1,		/* VEX prefixed */
-			vex_l:1,		/* L bit */
-			index:4,		/* SIB byte */
-			base:4;			/* SIB byte */
-
-	uint8_t		disp_bytes;
-	uint8_t		imm_bytes;
-
-	uint8_t		scale;
-
-	uint8_t		vex_reg:4,	/* vvvv: first source register specifier */
-			vex_pp:2,	/* pp */
-			_sparebits:2;
-
-	uint8_t		_sparebytes[2];
-
-	int		base_register;		/* VM_REG_GUEST_xyz */
-	int		index_register;		/* VM_REG_GUEST_xyz */
-	int		segment_register;	/* VM_REG_GUEST_xyz */
-
-	int64_t		displacement;		/* optional addr displacement */
-	int64_t		immediate;		/* optional immediate operand */
-
-	uint8_t		decoded;	/* set to 1 if successfully decoded */
-
-	uint8_t		_sparebyte;
-
-	struct vie_op	op;			/* opcode description */
-};
-_Static_assert(sizeof(struct vie) == 64, "ABI");
-_Static_assert(__offsetof(struct vie, disp_bytes) == 22, "ABI");
-_Static_assert(__offsetof(struct vie, scale) == 24, "ABI");
-_Static_assert(__offsetof(struct vie, base_register) == 28, "ABI");
-
 enum vm_exitcode {
 	VM_EXITCODE_INOUT,
 	VM_EXITCODE_VMX,
@@ -306,11 +218,11 @@ enum vm_exitcode {
 	VM_EXITCODE_PAGING,
 	VM_EXITCODE_INST_EMUL,
 	VM_EXITCODE_SPINUP_AP,
-	VM_EXITCODE_DEPRECATED1,	/* used to be SPINDOWN_CPU */
+	VM_EXITCODE_MMIO_EMUL,
 	VM_EXITCODE_RUNBLOCK,
 	VM_EXITCODE_IOAPIC_EOI,
 	VM_EXITCODE_SUSPENDED,
-	VM_EXITCODE_INOUT_STR,
+	VM_EXITCODE_MMIO,
 	VM_EXITCODE_TASK_SWITCH,
 	VM_EXITCODE_MONITOR,
 	VM_EXITCODE_MWAIT,
@@ -325,25 +237,38 @@ enum vm_exitcode {
 	VM_EXITCODE_MAX
 };
 
+enum inout_flags {
+	INOUT_IN	= (1U << 0), /* direction: 'in' when set, else 'out' */
+
+	/*
+	 * The following flags are used only for in-kernel emulation logic and
+	 * are not exposed to userspace.
+	 */
+	INOUT_STR	= (1U << 1), /* ins/outs operation */
+	INOUT_REP	= (1U << 2), /* 'rep' prefix present on instruction */
+};
+
 struct vm_inout {
-	uint16_t	bytes:3;	/* 1 or 2 or 4 */
-	uint16_t	in:1;
-	uint16_t	string:1;
-	uint16_t	rep:1;
+	uint32_t	eax;
 	uint16_t	port;
-	uint32_t	eax;		/* valid for out */
+	uint8_t		bytes;		/* 1 or 2 or 4 */
+	uint8_t		flags;		/* see: inout_flags */
+
+	/*
+	 * The address size and segment are relevant to INS/OUTS operations.
+	 * Userspace is not concerned with them since the in-kernel emulation
+	 * handles those specific aspects.
+	 */
+	uint8_t		addrsize;
+	uint8_t		segment;
 };
 
-struct vm_inout_str {
-	struct vm_inout	inout;		/* must be the first element */
-	struct vm_guest_paging paging;
-	uint64_t	rflags;
-	uint64_t	cr0;
-	uint64_t	index;
-	uint64_t	count;		/* rep=1 (%rcx), rep=0 (1) */
-	int		addrsize;
-	enum vm_reg_name seg_name;
-	struct seg_desc seg_desc;
+struct vm_mmio {
+	uint8_t		bytes;		/* 1/2/4/8 bytes */
+	uint8_t		read;		/* read: 1, write: 0 */
+	uint16_t	_pad[3];
+	uint64_t	gpa;
+	uint64_t	data;
 };
 
 enum task_switch_reason {
@@ -368,18 +293,25 @@ struct vm_exit {
 	uint64_t	rip;
 	union {
 		struct vm_inout	inout;
-		struct vm_inout_str inout_str;
+		struct vm_mmio	mmio;
 		struct {
 			uint64_t	gpa;
 			int		fault_type;
 		} paging;
+		/*
+		 * Kernel-internal MMIO decoding and emulation.
+		 * Userspace should not expect to see this, but rather a
+		 * VM_EXITCODE_MMIO with the above 'mmio' context.
+		 */
 		struct {
 			uint64_t	gpa;
 			uint64_t	gla;
 			uint64_t	cs_base;
 			int		cs_d;		/* CS.D */
-			struct vm_guest_paging paging;
-			struct vie	vie;
+		} mmio_emul;
+		struct {
+			uint8_t		inst[15];
+			uint8_t		num_valid;
 		} inst_emul;
 		/*
 		 * VMX specific payload. Used when there is no "better"
@@ -433,6 +365,23 @@ struct vm_exit {
 	} u;
 };
 
+enum vm_entry_cmds {
+	VEC_DEFAULT = 0,
+	VEC_DISCARD_INSTR,	/* discard inst emul state */
+	VEC_COMPLETE_MMIO,	/* entry includes result for mmio emul */
+	VEC_COMPLETE_INOUT,	/* entry includes result for inout emul */
+};
+
+struct vm_entry {
+	int		cpuid;
+	uint_t		cmd;		/* see: vm_entry_cmds */
+	void		*exit_data;
+	union {
+		struct vm_inout	inout;
+		struct vm_mmio	mmio;
+	} u;
+};
+
 void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
 
 int vm_restart_instruction(void *vm, int vcpuid);
diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h
index 40e0857945..4e89b712dc 100644
--- a/usr/src/uts/i86pc/sys/vmm_dev.h
+++ b/usr/src/uts/i86pc/sys/vmm_dev.h
@@ -83,11 +83,6 @@ struct vm_register_set {
 	uint64_t *regvals;
 };
 
-struct vm_run {
-	int cpuid;
-	struct vm_exit vm_exit;
-};
-
 struct vm_exception {
 	int cpuid;
 	int vector;