diff options
author | Patrick Mooney <pmooney@pfmooney.com> | 2022-01-28 21:26:53 +0000 |
---|---|---|
committer | Patrick Mooney <pmooney@oxide.computer> | 2022-02-18 02:23:05 +0000 |
commit | 957246c9e6c47389c40079995d73eebcc659fb29 (patch) | |
tree | 0abb0e86445192958a251fd636adcc436195cf74 /usr | |
parent | 4dde95dacc64b35aa9882fcbd0a847355d130734 (diff) | |
download | illumos-gate-957246c9e6c47389c40079995d73eebcc659fb29.tar.gz |
14456 bhyve needs fpu import/export
Reviewed by: Dan Cross <cross@oxidecomputer.com>
Reviewed by: C Fraire <cfraire@me.com>
Reviewed by: Robert Mustacchi <rm@fingolfin.org>
Approved by: Dan McDonald <danmcd@joyent.com>
Diffstat (limited to 'usr')
23 files changed, 967 insertions, 184 deletions
diff --git a/usr/src/cmd/bhyvectl/Makefile b/usr/src/cmd/bhyvectl/Makefile index 01d331c823..486f39da31 100644 --- a/usr/src/cmd/bhyvectl/Makefile +++ b/usr/src/cmd/bhyvectl/Makefile @@ -35,6 +35,9 @@ CPPFLAGS = -I$(COMPAT)/bhyve -I$(CONTRIB)/bhyve \ -I$(SRC)/uts/i86pc LDLIBS += -lvmmapi +# Force c99 for everything +CSTD= $(CSTD_GNU99) + CERRWARN += -_gcc=-Wno-uninitialized # main() is too hairy for smatch diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c index 4fc6ddc251..cbe779a4ea 100644 --- a/usr/src/cmd/bhyvectl/bhyvectl.c +++ b/usr/src/cmd/bhyvectl/bhyvectl.c @@ -39,7 +39,7 @@ * * Copyright 2015 Pluribus Networks Inc. * Copyright 2019 Joyent, Inc. - * Copyright 2021 Oxide Computer Company + * Copyright 2022 Oxide Computer Company */ #include <sys/cdefs.h> @@ -51,6 +51,9 @@ __FBSDID("$FreeBSD$"); #include <sys/errno.h> #include <sys/mman.h> #include <sys/cpuset.h> +#ifndef __FreeBSD__ +#include <sys/fp.h> +#endif /* __FreeBSD__ */ #include <stdio.h> #include <stdlib.h> @@ -312,6 +315,7 @@ static int get_cpu_topology; #ifndef __FreeBSD__ static int pmtmr_port; static int wrlock_cycle; +static int get_fpu; #endif /* @@ -1534,6 +1538,7 @@ setup_options(bool cpu_intel) #ifndef __FreeBSD__ { "pmtmr-port", REQ_ARG, 0, PMTMR_PORT }, { "wrlock-cycle", NO_ARG, &wrlock_cycle, 1 }, + { "get-fpu", NO_ARG, &get_fpu, 1 }, #endif }; @@ -1752,6 +1757,93 @@ show_memseg(struct vmctx *ctx) } } +#ifndef __FreeBSD__ +static int +show_fpu(struct vmctx *ctx, int vcpu) +{ + int res, fd; + + struct vm_fpu_desc_entry entries[64]; + struct vm_fpu_desc desc = { + .vfd_entry_data = entries, + .vfd_num_entries = 64, + }; + fd = vm_get_device_fd(ctx); + res = ioctl(fd, VM_DESC_FPU_AREA, &desc); + if (res != 0) { + return (errno); + } + for (uint_t i = 0; i < desc.vfd_num_entries; i++) { + const struct vm_fpu_desc_entry *entry = &entries[i]; + + /* confirm that AVX fields are where we expect */ + if (entry->vfde_feature == XFEATURE_AVX) { + if (entry->vfde_size 
!= 0x100 || + entry->vfde_off != 0x240) { + (void) fprintf(stderr, + "show_fpu: unexpected AVX size/placement " + "- size:%x off:%x\n", + entry->vfde_size, entry->vfde_off); + return (EINVAL); + } + } + } + void *buf = malloc(desc.vfd_req_size); + if (buf == NULL) { + return (ENOMEM); + } + struct vm_fpu_state req = { + .vcpuid = vcpu, + .buf = buf, + .len = desc.vfd_req_size, + }; + res = ioctl(fd, VM_GET_FPU, &req); + if (res != 0) { + res = errno; + free(buf); + return (res); + } + + const struct xsave_state *state = buf; + const struct fxsave_state *fx = &state->xs_fxsave; + (void) printf("fpu_fcw[%d]\t\t0x%04x\n", vcpu, fx->fx_fcw); + (void) printf("fpu_fsw[%d]\t\t0x%04x\n", vcpu, fx->fx_fsw); + (void) printf("fpu_ftw[%d]\t\t0x%04x\n", vcpu, fx->fx_fctw); + (void) printf("fpu_fop[%d]\t\t0x%04x\n", vcpu, fx->fx_fop); + (void) printf("fpu_rip[%d]\t\t0x%016lx\n", vcpu, fx->fx_rip); + (void) printf("fpu_rdp[%d]\t\t0x%016lx\n", vcpu, fx->fx_rdp); + (void) printf("fpu_mxcsr[%d]\t\t0x%08x\n", vcpu, fx->fx_mxcsr); + (void) printf("fpu_mxcsr_mask[%d]\t0x%08x\n", vcpu, + fx->fx_mxcsr_mask); + /* ST/MMX regs */ + for (uint_t i = 0; i < 8; i++) { + (void) printf("fpu_st%u[%d]\t\t0x%08x%08x%08x%08x\n", vcpu, i, + fx->fx_st[i].__fpr_pad[0], fx->fx_st[i].__fpr_pad[1], + fx->fx_st[i].__fpr_pad[2], fx->fx_st[i].__fpr_pad[3]); + } + /* SSE regs */ + for (uint_t i = 0; i < 16; i++) { + (void) printf("fpu_xmm%u[%d]\t\t0x%08x%08x%08x%08x\n", + i, vcpu, + fx->fx_xmm[i]._l[0], fx->fx_xmm[i]._l[1], + fx->fx_xmm[i]._l[2], fx->fx_xmm[i]._l[3]); + } + + if (state->xs_header.xsh_xstate_bv & XFEATURE_AVX) { + /* AVX regs */ + for (uint_t i = 0; i < 16; i++) { + (void) printf("fpu_ymm%u[%d]\t\t0x%08x%08x%08x%08x\n", + i, vcpu, + state->xs_ymm[i]._l[0], state->xs_ymm[i]._l[1], + state->xs_ymm[i]._l[2], state->xs_ymm[i]._l[3]); + } + } + + free(buf); + return (0); +} +#endif /*__FreeBSD__ */ + int main(int argc, char *argv[]) { @@ -2150,6 +2242,12 @@ main(int argc, char *argv[]) if (!error) 
error = get_all_segments(ctx, vcpu); +#ifndef __FreeBSD__ + if (!error && (get_fpu || get_all)) { + error = show_fpu(ctx, vcpu); + } +#endif /* __FreeBSD__ */ + if (!error) { if (cpu_intel) error = get_misc_vmcs(ctx, vcpu); diff --git a/usr/src/compat/bhyve/amd64/machine/fpu.h b/usr/src/compat/bhyve/amd64/machine/fpu.h deleted file mode 100644 index 6bc651d996..0000000000 --- a/usr/src/compat/bhyve/amd64/machine/fpu.h +++ /dev/null @@ -1,28 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2014 Pluribus Networks Inc. - * Copyright (c) 2018, Joyent, Inc. - */ - -#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ -#define _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ - -void fpuexit(kthread_t *td); -void fpurestore(void *); -void fpusave(void *); - -struct savefpu *fpu_save_area_alloc(void); -void fpu_save_area_free(struct savefpu *fsa); -void fpu_save_area_reset(struct savefpu *fsa); - -#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ */ diff --git a/usr/src/compat/bhyve/amd64/machine/pcb.h b/usr/src/compat/bhyve/amd64/machine/pcb.h deleted file mode 100644 index 75b5de640c..0000000000 --- a/usr/src/compat/bhyve/amd64/machine/pcb.h +++ /dev/null @@ -1,21 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. 
- */ - -/* - * Copyright 2014 Pluribus Networks Inc. - */ - -#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PCB_H_ -#define _COMPAT_FREEBSD_AMD64_MACHINE_PCB_H_ - -#include <machine/fpu.h> - -#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PCB_H_ */ diff --git a/usr/src/pkg/manifests/system-bhyve-tests.p5m b/usr/src/pkg/manifests/system-bhyve-tests.p5m index 5b4a7351c4..823ed69a60 100644 --- a/usr/src/pkg/manifests/system-bhyve-tests.p5m +++ b/usr/src/pkg/manifests/system-bhyve-tests.p5m @@ -38,6 +38,7 @@ file path=opt/bhyve-tests/tests/mevent/read_requeue mode=0555 file path=opt/bhyve-tests/tests/mevent/vnode_file mode=0555 file path=opt/bhyve-tests/tests/mevent/vnode_zvol mode=0555 dir path=opt/bhyve-tests/tests/vmm +file path=opt/bhyve-tests/tests/vmm/fpu_getset mode=0555 file path=opt/bhyve-tests/tests/vmm/mem_partial mode=0555 file path=opt/bhyve-tests/tests/vmm/mem_seg_map mode=0555 license lic_CDDL license=lic_CDDL diff --git a/usr/src/test/bhyve-tests/runfiles/default.run b/usr/src/test/bhyve-tests/runfiles/default.run index babfa0f7e9..3055f3e2d8 100644 --- a/usr/src/test/bhyve-tests/runfiles/default.run +++ b/usr/src/test/bhyve-tests/runfiles/default.run @@ -20,7 +20,7 @@ post = outputdir = /var/tmp/test_results [/opt/bhyve-tests/tests/vmm] -tests = ['mem_partial', 'mem_seg_map'] +tests = ['mem_partial', 'mem_seg_map', 'fpu_getset'] # Tests of userspace mevent system, built from cmd/bhyve [/opt/bhyve-tests/tests/mevent] diff --git a/usr/src/test/bhyve-tests/tests/vmm/Makefile b/usr/src/test/bhyve-tests/tests/vmm/Makefile index c91ed9a7e4..30d06a0f6b 100644 --- a/usr/src/test/bhyve-tests/tests/vmm/Makefile +++ b/usr/src/test/bhyve-tests/tests/vmm/Makefile @@ -16,7 +16,8 @@ include $(SRC)/cmd/Makefile.cmd.64 include $(SRC)/test/Makefile.com PROG = mem_partial \ - mem_seg_map + mem_seg_map \ + fpu_getset COMMON_OBJS = common.o CLEAN_OBJS = $(PROG:%=%.o) diff --git a/usr/src/test/bhyve-tests/tests/vmm/common.c b/usr/src/test/bhyve-tests/tests/vmm/common.c index 
b7f0a30ed0..622a14c61f 100644 --- a/usr/src/test/bhyve-tests/tests/vmm/common.c +++ b/usr/src/test/bhyve-tests/tests/vmm/common.c @@ -23,12 +23,13 @@ #include <vmmapi.h> struct vmctx * -create_test_vm(void) +create_test_vm(const char *test_suite_name) { char name[VM_MAX_NAMELEN]; int res; - (void) snprintf(name, sizeof (name), "bhyve-test-memmap-%d", getpid()); + (void) snprintf(name, sizeof (name), "bhyve-test-%s-%d", + test_suite_name, getpid()); res = vm_create(name, 0); if (res != 0) { diff --git a/usr/src/test/bhyve-tests/tests/vmm/common.h b/usr/src/test/bhyve-tests/tests/vmm/common.h index 7b64574cf2..f210408b71 100644 --- a/usr/src/test/bhyve-tests/tests/vmm/common.h +++ b/usr/src/test/bhyve-tests/tests/vmm/common.h @@ -16,7 +16,7 @@ #ifndef _COMMON_H_ #define _COMMON_H_ -struct vmctx *create_test_vm(void); +struct vmctx *create_test_vm(const char *); int alloc_memseg(struct vmctx *, int, size_t, const char *); #define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) diff --git a/usr/src/test/bhyve-tests/tests/vmm/fpu_getset.c b/usr/src/test/bhyve-tests/tests/vmm/fpu_getset.c new file mode 100644 index 0000000000..814e15dec3 --- /dev/null +++ b/usr/src/test/bhyve-tests/tests/vmm/fpu_getset.c @@ -0,0 +1,333 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2022 Oxide Computer Company + */ + + +#include <stdio.h> +#include <unistd.h> +#include <stdlib.h> +#include <stropts.h> +#include <strings.h> +#include <signal.h> +#include <setjmp.h> +#include <libgen.h> +#include <sys/debug.h> +#include <sys/fp.h> + +#include <sys/vmm.h> +#include <sys/vmm_dev.h> +#include <sys/x86_archext.h> +#include <vmmapi.h> + +#include "common.h" + +/* Minimal xsave state area (sans any AVX storage) */ +struct xsave_min { + struct fxsave_state legacy; + struct xsave_header header; +}; + +CTASSERT(sizeof (struct xsave_min) == MIN_XSAVE_SIZE); + +struct avx_state { + /* 16 x 128-bit: high portions of the ymm registers */ + uint64_t ymm[32]; +}; + +static bool +get_fpu(int fd, struct vm_fpu_state *req) +{ + int res = ioctl(fd, VM_GET_FPU, req); + if (res != 0) { + perror("could not read FPU for vCPU"); + return (false); + } + return (true); +} + +static bool +set_fpu(int fd, struct vm_fpu_state *req) +{ + int res = ioctl(fd, VM_SET_FPU, req); + if (res != 0) { + perror("could not write FPU for vCPU"); + return (false); + } + return (true); +} + +static bool +check_sse(int fd, const struct vm_fpu_desc *desc, void *fpu_area, + size_t fpu_size) +{ + /* Make sure the x87/MMX/SSE state is described as present */ + bool found_fp = false, found_sse = false; + for (uint_t i = 0; i < desc->vfd_num_entries; i++) { + const struct vm_fpu_desc_entry *ent = &desc->vfd_entry_data[i]; + + switch (ent->vfde_feature) { + case XFEATURE_LEGACY_FP: + found_fp = true; + if (ent->vfde_off != 0 || + ent->vfde_size != sizeof (struct fxsave_state)) { + (void) fprintf(stderr, + "unexpected entity for %x: " + "size=%x off=%x\n", ent->vfde_feature, + ent->vfde_size, ent->vfde_off); + return (false); + } + break; + case XFEATURE_SSE: + found_sse = true; + if (ent->vfde_off != 0 || + ent->vfde_size != sizeof (struct fxsave_state)) { + (void) fprintf(stderr, + "unexpected entity for %x: " + "size=%x off=%x\n", ent->vfde_feature, + ent->vfde_size, 
ent->vfde_off); + return (false); + } + break; + } + } + + if (!found_fp || !found_sse) { + (void) fprintf(stderr, "did not find x87 and SSE area " + "descriptors as expected in initial FPU\n"); + return (false); + } + + struct vm_fpu_state req = { + .vcpuid = 0, + .buf = fpu_area, + .len = fpu_size, + }; + + if (!get_fpu(fd, &req)) { + return (false); + } + + struct xsave_min *xs = fpu_area; + /* + * Executing this test on a freshly-created instance, we expect the FPU + * to only have the legacy and SSE features present in its active state. + */ + if (xs->header.xsh_xstate_bv != (XFEATURE_LEGACY_FP | XFEATURE_SSE)) { + (void) fprintf(stderr, "bad xstate_bv %lx, expected %lx", + xs->header.xsh_xstate_bv, + (XFEATURE_LEGACY_FP | XFEATURE_SSE)); + return (false); + } + + /* load some SSE values to check for a get/set cycle */ + uint64_t *xmm = (void *)&xs->legacy.fx_xmm[0]; + xmm[0] = UINT64_MAX; + xmm[2] = 1; + + if (!set_fpu(fd, &req)) { + return (false); + } + + /* check that those values made it in/out of the guest FPU */ + bzero(fpu_area, fpu_size); + if (!get_fpu(fd, &req)) { + return (false); + } + if (xmm[0] != UINT64_MAX || xmm[2] != 1) { + (void) fprintf(stderr, "SSE test registers not saved\n"); + return (false); + } + + /* Make sure that a bogus MXCSR value is rejected */ + xs->legacy.fx_mxcsr = UINT32_MAX; + int res = ioctl(fd, VM_SET_FPU, &req); + if (res == 0) { + (void) fprintf(stderr, + "write of invalid MXCSR erroneously allowed\n"); + return (false); + } + + return (true); +} + +static bool +check_avx(int fd, const struct vm_fpu_desc *desc, void *fpu_area, + size_t fpu_size) +{ + bool found_avx = false; + size_t avx_size, avx_off; + for (uint_t i = 0; i < desc->vfd_num_entries; i++) { + const struct vm_fpu_desc_entry *ent = &desc->vfd_entry_data[i]; + + if (ent->vfde_feature == XFEATURE_AVX) { + found_avx = true; + avx_size = ent->vfde_size; + avx_off = ent->vfde_off; + break; + } + } + + if (!found_avx) { + (void) printf("AVX capability not found 
on host CPU, " + "skipping related tests\n"); + return (true); + } + + if (avx_size != sizeof (struct avx_state)) { + (void) fprintf(stderr, "unexpected AVX state size: %x, " + "expected %x\n", avx_size, sizeof (struct avx_state)); + return (false); + } + if ((avx_off + avx_size) > fpu_size) { + (void) fprintf(stderr, "AVX data falls outside fpu size: " + "%x > %x\n", avx_off + avx_size, fpu_size); + return (false); + } + + struct xsave_min *xs = fpu_area; + struct avx_state *avx = fpu_area + avx_off; + + /* do a simple data round-trip */ + struct vm_fpu_state req = { + .vcpuid = 0, + .buf = fpu_area, + .len = fpu_size, + }; + if (!get_fpu(fd, &req)) { + return (false); + } + + /* With AVX unused so far, we expect it to be absent from the BV */ + if (xs->header.xsh_xstate_bv != (XFEATURE_LEGACY_FP | XFEATURE_SSE)) { + (void) fprintf(stderr, "bad xstate_bv %lx, expected %lx\n", + xs->header.xsh_xstate_bv, + (XFEATURE_LEGACY_FP | XFEATURE_SSE)); + return (false); + } + + avx->ymm[0] = UINT64_MAX; + avx->ymm[2] = 2; + + /* first write without asserting AVX in BV */ + if (!set_fpu(fd, &req)) { + return (false); + } + + /* And check that the AVX state stays empty */ + bzero(fpu_area, fpu_size); + if (!get_fpu(fd, &req)) { + return (false); + } + if (xs->header.xsh_xstate_bv != (XFEATURE_LEGACY_FP | XFEATURE_SSE)) { + (void) fprintf(stderr, "xstate_bv changed unexpectedly %lx\n", + xs->header.xsh_xstate_bv); + return (false); + } + if (avx->ymm[0] != 0 || avx->ymm[2] != 0) { + (void) fprintf(stderr, "YMM state changed unexpectedly " + "%lx %lx\n", avx->ymm[0], avx->ymm[2]); + return (false); + } + + /* Now write YMM and set the appropriate AVX BV state */ + avx->ymm[0] = UINT64_MAX; + avx->ymm[2] = 2; + xs->header.xsh_xstate_bv |= XFEATURE_AVX; + if (!set_fpu(fd, &req)) { + return (false); + } + + /* ... 
and now check that it stuck */ + bzero(fpu_area, fpu_size); + if (!get_fpu(fd, &req)) { + return (false); + } + if ((xs->header.xsh_xstate_bv & XFEATURE_AVX) == 0) { + (void) fprintf(stderr, "AVX missing from xstate_bv %lx\n", + xs->header.xsh_xstate_bv); + return (false); + } + if (avx->ymm[0] != UINT64_MAX || avx->ymm[2] != 2) { + (void) fprintf(stderr, "YMM state not preserved " + "%lx != %lx | %lx != %lx\n", + avx->ymm[0], UINT64_MAX, avx->ymm[2], 2); + return (false); + } + + + return (true); +} + +int +main(int argc, char *argv[]) +{ + struct vmctx *ctx; + int res, fd; + const char *suite_name = basename(argv[0]); + + ctx = create_test_vm(suite_name); + if (ctx == NULL) { + perror("could not open test VM"); + return (EXIT_FAILURE); + } + fd = vm_get_device_fd(ctx); + + struct vm_fpu_desc_entry entries[64]; + struct vm_fpu_desc desc = { + .vfd_entry_data = entries, + .vfd_num_entries = 64, + }; + + res = ioctl(fd, VM_DESC_FPU_AREA, &desc); + if (res != 0) { + perror("could not query fpu area description"); + goto bail; + } + + /* Make sure the XSAVE area described for this machine is reasonable */ + if (desc.vfd_num_entries == 0) { + (void) fprintf(stderr, "no FPU description entries found\n"); + goto bail; + } + if (desc.vfd_req_size < MIN_XSAVE_SIZE) { + (void) fprintf(stderr, "required XSAVE size %lu < " + "expected %lu\n", desc.vfd_req_size, MIN_XSAVE_SIZE); + goto bail; + } + + const size_t fpu_size = desc.vfd_req_size; + void *fpu_area = malloc(fpu_size); + if (fpu_area == NULL) { + perror("could not allocate fpu area"); + goto bail; + } + bzero(fpu_area, fpu_size); + + if (!check_sse(fd, &desc, fpu_area, fpu_size)) { + goto bail; + } + if (!check_avx(fd, &desc, fpu_area, fpu_size)) { + goto bail; + } + + /* mission accomplished */ + vm_destroy(ctx); + (void) printf("%s\tPASS\n", suite_name); + return (EXIT_SUCCESS); + +bail: + vm_destroy(ctx); + (void) printf("%s\tFAIL\n", suite_name); + return (EXIT_FAILURE); +} diff --git 
a/usr/src/test/bhyve-tests/tests/vmm/mem_partial.c b/usr/src/test/bhyve-tests/tests/vmm/mem_partial.c index b410c673ab..964fdf95c5 100644 --- a/usr/src/test/bhyve-tests/tests/vmm/mem_partial.c +++ b/usr/src/test/bhyve-tests/tests/vmm/mem_partial.c @@ -57,8 +57,9 @@ main(int argc, char *argv[]) struct vmctx *ctx; int res, fd; void *guest_mem; + const char *suite_name = basename(argv[0]); - ctx = create_test_vm(); + ctx = create_test_vm(suite_name); if (ctx == NULL) { perror("could open test VM"); return (1); @@ -192,7 +193,7 @@ main(int argc, char *argv[]) } /* mission accomplished */ - (void) printf("%s\tPASS\n", basename(argv[0])); + (void) printf("%s\tPASS\n", suite_name); vm_destroy(ctx); return (0); diff --git a/usr/src/test/bhyve-tests/tests/vmm/mem_seg_map.c b/usr/src/test/bhyve-tests/tests/vmm/mem_seg_map.c index e80f18547e..92d90bbf28 100644 --- a/usr/src/test/bhyve-tests/tests/vmm/mem_seg_map.c +++ b/usr/src/test/bhyve-tests/tests/vmm/mem_seg_map.c @@ -40,8 +40,9 @@ main(int argc, char *argv[]) struct vmctx *ctx; int res, fd; void *seg_obj, *guest_mem; + const char *suite_name = basename(argv[0]); - ctx = create_test_vm(); + ctx = create_test_vm(suite_name); if (ctx == NULL) { perror("could open test VM"); return (1); @@ -129,7 +130,7 @@ main(int argc, char *argv[]) /* mission accomplished */ vm_destroy(ctx); - (void) printf("%s\tPASS\n", basename(argv[0])); + (void) printf("%s\tPASS\n", suite_name); return (0); bail: diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c index cf00426300..844e8b9708 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c @@ -42,7 +42,6 @@ __FBSDID("$FreeBSD$"); #include <machine/clock.h> #include <machine/cpufunc.h> #include <machine/md_var.h> -#include <machine/pcb.h> #include <machine/specialreg.h> #include <machine/vmm.h> diff --git a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h index 
7584213d39..e94f7a876b 100644 --- a/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h +++ b/usr/src/uts/i86pc/io/vmm/sys/vmm_kernel.h @@ -39,7 +39,7 @@ * * Copyright 2015 Pluribus Networks Inc. * Copyright 2019 Joyent, Inc. - * Copyright 2021 Oxide Computer Company + * Copyright 2022 Oxide Computer Company * Copyright 2021 OmniOS Community Edition (OmniOSce) Association. */ @@ -161,6 +161,8 @@ int vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec); int vm_set_run_state(struct vm *vm, int vcpuid, uint32_t state, uint8_t sipi_vec); +int vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len); +int vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len); int vm_run(struct vm *vm, int vcpuid, const struct vm_entry *); int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index 16acc1ea2c..78a810880d 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -60,8 +60,8 @@ __FBSDID("$FreeBSD$"); #include <sys/sched.h> #include <sys/systm.h> #include <sys/sunddi.h> +#include <sys/hma.h> -#include <machine/pcb.h> #include <machine/md_var.h> #include <x86/psl.h> #include <x86/apicreg.h> @@ -132,7 +132,7 @@ struct vcpu { int exc_errcode_valid; uint32_t exc_errcode; uint8_t sipi_vector; /* (i) SIPI vector */ - struct savefpu *guestfpu; /* (a,i) guest fpu state */ + hma_fpu_t *guestfpu; /* (a,i) guest fpu state */ uint64_t guest_xcr0; /* (i) guest %xcr0 register */ void *stats; /* (a,i) statistics */ struct vm_exit exitinfo; /* (x) exit reason and collateral */ @@ -318,7 +318,8 @@ vcpu_cleanup(struct vm *vm, int i, bool destroy) VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); if (destroy) { vmm_stat_free(vcpu->stats); - fpu_save_area_free(vcpu->guestfpu); + hma_fpu_free(vcpu->guestfpu); + vcpu->guestfpu = NULL; vie_free(vcpu->vie_ctx); vcpu->vie_ctx = NULL; vmc_destroy(vcpu->vmclient); @@ -342,7 +343,7 @@ 
vcpu_init(struct vm *vm, int vcpu_id, bool create) vcpu->state = VCPU_IDLE; vcpu->hostcpu = NOCPU; vcpu->lastloccpu = NOCPU; - vcpu->guestfpu = fpu_save_area_alloc(); + vcpu->guestfpu = hma_fpu_alloc(KM_SLEEP); vcpu->stats = vmm_stat_alloc(); vcpu->vie_ctx = vie_alloc(); @@ -369,7 +370,7 @@ vcpu_init(struct vm *vm, int vcpu_id, bool create) vcpu->extint_pending = 0; vcpu->exception_pending = 0; vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; - fpu_save_area_reset(vcpu->guestfpu); + hma_fpu_init(vcpu->guestfpu); vmm_stat_init(vcpu->stats); vcpu->tsc_offset = 0; } @@ -1168,6 +1169,50 @@ vm_set_seg_desc(struct vm *vm, int vcpu, int reg, const struct seg_desc *desc) return (VMSETDESC(vm->cookie, vcpu, reg, desc)); } +static int +translate_hma_xsave_result(hma_fpu_xsave_result_t res) +{ + switch (res) { + case HFXR_OK: + return (0); + case HFXR_NO_SPACE: + return (ENOSPC); + case HFXR_BAD_ALIGN: + case HFXR_UNSUP_FMT: + case HFXR_UNSUP_FEAT: + case HFXR_INVALID_DATA: + return (EINVAL); + default: + panic("unexpected xsave result"); + } +} + +int +vm_get_fpu(struct vm *vm, int vcpuid, void *buf, size_t len) +{ + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + hma_fpu_xsave_result_t res; + + res = hma_fpu_get_xsave_state(vcpu->guestfpu, buf, len); + return (translate_hma_xsave_result(res)); +} + +int +vm_set_fpu(struct vm *vm, int vcpuid, void *buf, size_t len) +{ + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + hma_fpu_xsave_result_t res; + + res = hma_fpu_set_xsave_state(vcpu->guestfpu, buf, len); + return (translate_hma_xsave_result(res)); +} + int vm_get_run_state(struct vm *vm, int vcpuid, uint32_t *state, uint8_t *sipi_vec) { @@ -1220,13 +1265,9 @@ vm_track_dirty_pages(struct vm *vm, uint64_t gpa, size_t len, uint8_t *bitmap) static void restore_guest_fpustate(struct vcpu *vcpu) { - - /* flush host state to the pcb */ - fpuexit(curthread); - - /* restore guest 
FPU state */ + /* Save host FPU and restore guest FPU */ fpu_stop_emulating(); - fpurestore(vcpu->guestfpu); + hma_fpu_start_guest(vcpu->guestfpu); /* restore guest XCR0 if XSAVE is enabled in the host */ if (rcr4() & CR4_XSAVE) @@ -1252,9 +1293,9 @@ save_guest_fpustate(struct vcpu *vcpu) load_xcr(0, vmm_get_host_xcr0()); } - /* save guest FPU state */ + /* save guest FPU and restore host FPU */ fpu_stop_emulating(); - fpusave(vcpu->guestfpu); + hma_fpu_stop_guest(vcpu->guestfpu); /* * When the host state has been restored, we should not re-enable * CR0.TS on illumos for eager FPU. @@ -2912,7 +2953,7 @@ vcpu_arch_reset(struct vm *vm, int vcpuid, bool init_only) */ if (!init_only) { vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; - fpu_save_area_reset(vcpu->guestfpu); + hma_fpu_init(vcpu->guestfpu); /* XXX: clear MSRs and other pieces */ } diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c index a83989e9eb..4ef2e5f583 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -414,6 +414,8 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, case VM_RESET_CPU: case VM_GET_RUN_STATE: case VM_SET_RUN_STATE: + case VM_GET_FPU: + case VM_SET_FPU: /* * Copy in the ID of the vCPU chosen for this operation. 
* Since a nefarious caller could update their struct between @@ -469,6 +471,7 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, case VM_GET_GPA_PMAP: case VM_IOAPIC_PINCOUNT: case VM_SUSPEND: + case VM_DESC_FPU_AREA: default: break; } @@ -755,6 +758,53 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, } break; } + case VM_DESC_FPU_AREA: { + struct vm_fpu_desc desc; + void *buf = NULL; + + if (ddi_copyin(datap, &desc, sizeof (desc), md)) { + error = EFAULT; + break; + } + if (desc.vfd_num_entries > 64) { + error = EINVAL; + break; + } + const size_t buf_sz = sizeof (struct vm_fpu_desc_entry) * + desc.vfd_num_entries; + if (buf_sz != 0) { + buf = kmem_zalloc(buf_sz, KM_SLEEP); + } + + /* + * For now, we are depending on vm_fpu_desc_entry and + * hma_xsave_state_desc_t having the same format. + */ + CTASSERT(sizeof (struct vm_fpu_desc_entry) == + sizeof (hma_xsave_state_desc_t)); + + size_t req_size; + const uint_t max_entries = hma_fpu_describe_xsave_state( + (hma_xsave_state_desc_t *)buf, + desc.vfd_num_entries, + &req_size); + + desc.vfd_req_size = req_size; + desc.vfd_num_entries = max_entries; + if (buf_sz != 0) { + if (ddi_copyout(buf, desc.vfd_entry_data, buf_sz, md)) { + error = EFAULT; + } + kmem_free(buf, buf_sz); + } + + if (error == 0) { + if (ddi_copyout(&desc, datap, sizeof (desc), md)) { + error = EFAULT; + } + } + break; + } case VM_ISA_ASSERT_IRQ: { struct vm_isa_irq isa_irq; @@ -1040,6 +1090,51 @@ vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, vrs.sipi_vector); break; } + case VM_GET_FPU: { + struct vm_fpu_state req; + const size_t max_len = (PAGESIZE * 2); + void *kbuf; + + if (ddi_copyin(datap, &req, sizeof (req), md)) { + error = EFAULT; + break; + } + if (req.len > max_len || req.len == 0) { + error = EINVAL; + break; + } + kbuf = kmem_zalloc(req.len, KM_SLEEP); + error = vm_get_fpu(sc->vmm_vm, vcpu, kbuf, req.len); + if (error == 0) { + if (ddi_copyout(kbuf, req.buf, req.len, md)) { + error = 
EFAULT; + } + } + kmem_free(kbuf, req.len); + break; + } + case VM_SET_FPU: { + struct vm_fpu_state req; + const size_t max_len = (PAGESIZE * 2); + void *kbuf; + + if (ddi_copyin(datap, &req, sizeof (req), md)) { + error = EFAULT; + break; + } + if (req.len > max_len || req.len == 0) { + error = EINVAL; + break; + } + kbuf = kmem_alloc(req.len, KM_SLEEP); + if (ddi_copyin(req.buf, kbuf, req.len, md)) { + error = EFAULT; + } else { + error = vm_set_fpu(sc->vmm_vm, vcpu, kbuf, req.len); + } + kmem_free(kbuf, req.len); + break; + } case VM_SET_KERNEMU_DEV: case VM_GET_KERNEMU_DEV: { diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c index f78db731d6..cdcebc71d4 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c @@ -58,7 +58,6 @@ #include <sys/x86_archext.h> #include <machine/cpufunc.h> -#include <machine/fpu.h> #include <machine/md_var.h> #include <machine/specialreg.h> #include <machine/vmm.h> @@ -434,67 +433,6 @@ vmm_cpuid_init(void) cpu_exthigh = regs[0]; } -/* - * FreeBSD uses the struct savefpu for managing the FPU state. That is mimicked - * by our hypervisor multiplexor framework structure. - */ -struct savefpu * -fpu_save_area_alloc(void) -{ - return ((struct savefpu *)hma_fpu_alloc(KM_SLEEP)); -} - -void -fpu_save_area_free(struct savefpu *fsa) -{ - hma_fpu_t *fpu = (hma_fpu_t *)fsa; - hma_fpu_free(fpu); -} - -void -fpu_save_area_reset(struct savefpu *fsa) -{ - hma_fpu_t *fpu = (hma_fpu_t *)fsa; - hma_fpu_init(fpu); -} - -/* - * This glue function is supposed to save the host's FPU state. This is always - * paired in the general bhyve code with a call to fpusave. Therefore, we treat - * this as a nop and do all the work in fpusave(), which will have the context - * argument that we want anyways. - */ -void -fpuexit(kthread_t *td) -{ -} - -/* - * This glue function is supposed to restore the guest's FPU state from the save - * area back to the host. 
In FreeBSD, it is assumed that the host state has - * already been saved by a call to fpuexit(); however, we do both here. - */ -void -fpurestore(void *arg) -{ - hma_fpu_t *fpu = arg; - - hma_fpu_start_guest(fpu); -} - -/* - * This glue function is supposed to save the guest's FPU state. The host's FPU - * state is not expected to be restored necessarily due to the use of FPU - * emulation through CR0.TS. However, we can and do restore it here. - */ -void -fpusave(void *arg) -{ - hma_fpu_t *fpu = arg; - - hma_fpu_stop_guest(fpu); -} - void vmm_sol_glue_init(void) { diff --git a/usr/src/uts/i86pc/os/hma_fpu.c b/usr/src/uts/i86pc/os/hma_fpu.c index 14cfa8baed..138af7a32a 100644 --- a/usr/src/uts/i86pc/os/hma_fpu.c +++ b/usr/src/uts/i86pc/os/hma_fpu.c @@ -11,6 +11,7 @@ /* * Copyright (c) 2018, Joyent, Inc. + * Copyright 2022 Oxide Computer Company */ /* @@ -28,6 +29,12 @@ #include <sys/hma.h> #include <sys/x86_archext.h> #include <sys/archsystm.h> +#include <sys/controlregs.h> +#include <sys/sysmacros.h> +#include <sys/stdbool.h> +#include <sys/ontrap.h> +#include <sys/cpuvar.h> +#include <sys/disp.h> struct hma_fpu { fpu_ctx_t hf_guest_fpu; @@ -57,7 +64,7 @@ hma_fpu_init(hma_fpu_t *fpu) xs = fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_xs; bzero(xs, cpuid_get_xsave_size()); bcopy(&avx_initial, xs, sizeof (*xs)); - xs->xs_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE; + xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE; fpu->hf_guest_fpu.fpu_xsave_mask = XFEATURE_FP_ALL; break; default: @@ -140,6 +147,36 @@ hma_fpu_start_guest(hma_fpu_t *fpu) fpu->hf_guest_fpu.fpu_flags &= ~FPU_VALID; } +/* + * Since fp_save() assumes a thread-centric view of the FPU usage -- it will + * assert if attempting to save elsewhere than the thread PCB, and will elide + * action if the FPU is not enabled -- we cannot use it for the manual saving of + * FPU contents. To work around that, we call the save mechanism directly. 
+ */ +static void +do_fp_save(fpu_ctx_t *fpu) +{ + /* + * For our manual saving, we expect that the thread PCB never be the + * landing zone for the data. + */ + ASSERT(curthread->t_lwp == NULL || + fpu != &curthread->t_lwp->lwp_pcb.pcb_fpu); + + switch (fp_save_mech) { + case FP_FXSAVE: + fpxsave(fpu->fpu_regs.kfpu_u.kfpu_fx); + break; + case FP_XSAVE: + xsavep(fpu->fpu_regs.kfpu_u.kfpu_xs, fpu->fpu_xsave_mask); + break; + default: + panic("Invalid fp_save_mech"); + } + fpu->fpu_flags |= FPU_VALID; +} + + void hma_fpu_stop_guest(hma_fpu_t *fpu) { @@ -148,29 +185,232 @@ hma_fpu_stop_guest(hma_fpu_t *fpu) ASSERT3U(fpu->hf_guest_fpu.fpu_flags & FPU_EN, !=, 0); ASSERT3U(fpu->hf_guest_fpu.fpu_flags & FPU_VALID, ==, 0); + do_fp_save(&fpu->hf_guest_fpu); + + fp_restore(&curthread->t_lwp->lwp_pcb.pcb_fpu); + + fpu->hf_inguest = B_FALSE; + fpu->hf_curthread = NULL; +} + +/* + * Will output up to `ndesc` records into `descp`. The required size for an + * XSAVE area containing all of the data fields supported by the host will be + * placed in `req_sizep` (if non-NULL). Returns the number of feature bits + * supported by the host. + */ +uint_t +hma_fpu_describe_xsave_state(hma_xsave_state_desc_t *descp, uint_t ndesc, + size_t *req_sizep) +{ + uint64_t features; + + switch (fp_save_mech) { + case FP_FXSAVE: + /* + * Even without xsave support, the FPU will have legacy x87 + * float and SSE state contained within. 
+ */ + features = XFEATURE_LEGACY_FP | XFEATURE_SSE; + break; + case FP_XSAVE: + features = get_xcr(XFEATURE_ENABLED_MASK); + break; + default: + panic("Invalid fp_save_mech"); + } + + uint_t count, pos; + uint_t max_size = MIN_XSAVE_SIZE; + for (count = 0, pos = 0; pos <= 63; pos++) { + const uint64_t bit = (1 << pos); + uint32_t size, off; + + if ((features & bit) == 0) { + continue; + } + + if (bit == XFEATURE_LEGACY_FP || bit == XFEATURE_SSE) { + size = sizeof (struct fxsave_state); + off = 0; + } else { + /* + * Size and position of data types within the XSAVE area + * is described in leaf 0xD in the subfunction + * corresponding to the bit position (for pos > 1). + */ + struct cpuid_regs regs = { + .cp_eax = 0xD, + .cp_ecx = pos, + }; + + ASSERT3U(pos, >, 1); + + (void) __cpuid_insn(&regs); + size = regs.cp_eax; + off = regs.cp_ebx; + } + max_size = MAX(max_size, off + size); + + if (count < ndesc) { + hma_xsave_state_desc_t *desc = &descp[count]; + + desc->hxsd_bit = bit; + desc->hxsd_size = size; + desc->hxsd_off = off; + } + count++; + } + if (req_sizep != NULL) { + *req_sizep = max_size; + } + return (count); +} + +hma_fpu_xsave_result_t +hma_fpu_get_xsave_state(const hma_fpu_t *fpu, void *buf, size_t len) +{ + ASSERT(!fpu->hf_inguest); + + size_t valid_len; + switch (fp_save_mech) { + case FP_FXSAVE: { + if (len < MIN_XSAVE_SIZE) { + return (HFXR_NO_SPACE); + } + bcopy(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic, buf, + sizeof (struct fxsave_state)); + + struct xsave_header hdr = { + .xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE, + }; + bcopy(&hdr, buf + sizeof (struct fxsave_state), sizeof (hdr)); + + break; + } + case FP_XSAVE: + (void) hma_fpu_describe_xsave_state(NULL, 0, &valid_len); + if (len < valid_len) { + return (HFXR_NO_SPACE); + } + bcopy(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_generic, buf, + valid_len); + break; + default: + panic("Invalid fp_save_mech"); + } + + return (HFXR_OK); +} + +hma_fpu_xsave_result_t
+hma_fpu_set_xsave_state(hma_fpu_t *fpu, void *buf, size_t len) +{ + ASSERT(!fpu->hf_inguest); + + if (len < MIN_XSAVE_SIZE) { + return (HFXR_NO_SPACE); + } + /* 64-byte alignment is demanded of the FPU-related operations */ + if (((uintptr_t)buf & 63) != 0) { + return (HFXR_BAD_ALIGN); + } + + struct xsave_header *hdr = buf + sizeof (struct fxsave_state); + if (hdr->xsh_xcomp_bv != 0) { + /* XSAVEC formatting not supported at this time */ + return (HFXR_UNSUP_FMT); + } + + uint64_t allowed_bits; + size_t save_area_size; + switch (fp_save_mech) { + case FP_FXSAVE: + allowed_bits = XFEATURE_LEGACY_FP | XFEATURE_SSE; + save_area_size = sizeof (struct fxsave_state); + break; + case FP_XSAVE: + allowed_bits = get_xcr(XFEATURE_ENABLED_MASK); + save_area_size = cpuid_get_xsave_size(); + break; + default: + panic("Invalid fp_save_mech"); + } + if ((hdr->xsh_xstate_bv & ~(allowed_bits)) != 0) { + return (HFXR_UNSUP_FEAT); + } + /* - * Note, we can't use fp_save because it assumes that we're saving to - * the thread's PCB and not somewhere else. Because this is a different - * FPU context, we instead have to do this ourselves. + * We validate the incoming state with the FPU itself prior to saving it + * into the guest FPU context area. In order to preserve any state + * currently housed in the FPU, we save it to a temporarily allocated + * FPU context. It is important to note that we are not following the + * normal rules around state management detailed in uts/intel/os/fpu.c. + * This saving is unconditional, uncaring about the state in the FPU or + * the value of CR0_TS, simplifying our process before returning to the + * caller (without needing to check for an lwp, etc). To prevent + * interrupting threads from encountering this unusual FPU state, we + * keep interrupts disabled for the duration.
*/ + fpu_ctx_t temp_ctx = { + .fpu_xsave_mask = XFEATURE_FP_ALL, + }; + temp_ctx.fpu_regs.kfpu_u.kfpu_generic = + kmem_cache_alloc(fpsave_cachep, KM_SLEEP); + bzero(temp_ctx.fpu_regs.kfpu_u.kfpu_generic, save_area_size); + + ulong_t iflag; + iflag = intr_clear(); + bool disable_when_done = (getcr0() & CR0_TS) != 0; + do_fp_save(&temp_ctx); + + /* + * If the provided data is invalid, it will cause a #GP when we attempt + * to load it into the FPU, so protect against that with on_trap(). + * Should the data load successfully, we can then be confident that its + * later use via hma_fpu_start_guest() will be safe. + */ + on_trap_data_t otd; + volatile hma_fpu_xsave_result_t res = HFXR_OK; + if (on_trap(&otd, OT_DATA_EC) != 0) { + res = HFXR_INVALID_DATA; + goto done; + } + switch (fp_save_mech) { case FP_FXSAVE: - fpxsave(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_fx); + if (hdr->xsh_xstate_bv == 0) { + /* + * An empty xstate_bv means we can simply load the + * legacy FP/SSE area with their initial state.
+ */ + bcopy(&sse_initial, + fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_fx, + sizeof (sse_initial)); + } else { + fpxrestore(buf); + fpxsave(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_fx); + } break; case FP_XSAVE: + xrestore(buf, XFEATURE_FP_ALL); xsavep(fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_xs, fpu->hf_guest_fpu.fpu_xsave_mask); break; default: panic("Invalid fp_save_mech"); - /*NOTREACHED*/ } - fpu->hf_guest_fpu.fpu_flags |= FPU_VALID; - fp_restore(&curthread->t_lwp->lwp_pcb.pcb_fpu); +done: + no_trap(); + fp_restore(&temp_ctx); + if (disable_when_done) { + fpdisable(); + } + intr_restore(iflag); + kmem_cache_free(fpsave_cachep, temp_ctx.fpu_regs.kfpu_u.kfpu_generic); - fpu->hf_inguest = B_FALSE; - fpu->hf_curthread = NULL; + return (res); } void @@ -214,11 +454,11 @@ hma_fpu_set_fxsave_state(hma_fpu_t *fpu, const struct fxsave_state *fx) gxs = fpu->hf_guest_fpu.fpu_regs.kfpu_u.kfpu_xs; bzero(gxs, cpuid_get_xsave_size()); bcopy(fx, &gxs->xs_fxsave, sizeof (*fx)); - gxs->xs_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE; + gxs->xs_header.xsh_xstate_bv = + XFEATURE_LEGACY_FP | XFEATURE_SSE; break; default: panic("Invalid fp_save_mech"); - /* NOTREACHED */ } return (0); diff --git a/usr/src/uts/i86pc/sys/hma.h b/usr/src/uts/i86pc/sys/hma.h index 16ab708896..e15cd60d5e 100644 --- a/usr/src/uts/i86pc/sys/hma.h +++ b/usr/src/uts/i86pc/sys/hma.h @@ -11,6 +11,7 @@ /* * Copyright 2019 Joyent, Inc. 
+ * Copyright 2022 Oxide Computer Company */ #ifndef _SYS_HMA_H @@ -117,6 +118,43 @@ extern void hma_fpu_start_guest(hma_fpu_t *); */ extern void hma_fpu_stop_guest(hma_fpu_t *); +typedef enum { + HFXR_OK = 0, + HFXR_NO_SPACE, /* buffer is not large enough */ + HFXR_BAD_ALIGN, /* buffer is not properly (64-byte) aligned */ + HFXR_UNSUP_FMT, /* data using unsupported (compressed) format */ + HFXR_UNSUP_FEAT, /* data has unsupported features set */ + HFXR_INVALID_DATA, /* CPU determined xsave data is invalid */ +} hma_fpu_xsave_result_t; + +/* + * Get and set the contents of the FPU save area, formatted as XSAVE-style + * information. If XSAVE is not supported by the host, the input and output + * values will be translated to and from the FXSAVE format. Attempts to set + * XSAVE values not supported by the host will result in an error. + * + * These functions cannot be called while the FPU is in use by the guest. It is + * up to callers to guarantee this invariant. + */ +extern hma_fpu_xsave_result_t hma_fpu_get_xsave_state(const hma_fpu_t *, void *, + size_t); +extern hma_fpu_xsave_result_t hma_fpu_set_xsave_state(hma_fpu_t *, void *, + size_t); + +typedef struct hma_xsave_state_desc { + uint64_t hxsd_bit; + uint32_t hxsd_size; + uint32_t hxsd_off; +} hma_xsave_state_desc_t; + +/* + * Get a description of the data fields supported by the host via the XSAVE APIs + * for getting/setting guest FPU data. See the function definition for more + * detailed parameter usage. + */ +extern uint_t hma_fpu_describe_xsave_state(hma_xsave_state_desc_t *, uint_t, + size_t *); + /* * Get and set the contents of the FPU save area. This sets the fxsave style * information. 
In all cases when this is in use, if an XSAVE state is actually diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h index 3282fa86bf..027a7da214 100644 --- a/usr/src/uts/i86pc/sys/vmm_dev.h +++ b/usr/src/uts/i86pc/sys/vmm_dev.h @@ -302,6 +302,25 @@ struct vm_run_state { uint8_t _pad[3]; }; +/* Transfer data for VM_GET_FPU and VM_SET_FPU */ +struct vm_fpu_state { + int vcpuid; + void *buf; + size_t len; +}; + +struct vm_fpu_desc_entry { + uint64_t vfde_feature; + uint32_t vfde_size; + uint32_t vfde_off; +}; + +struct vm_fpu_desc { + struct vm_fpu_desc_entry *vfd_entry_data; + size_t vfd_req_size; + uint32_t vfd_num_entries; +}; + struct vmm_resv_query { size_t vrq_free_sz; size_t vrq_alloc_sz; @@ -370,6 +389,8 @@ struct vmm_dirty_tracker { #define VM_RESET_CPU (VMM_CPU_IOC_BASE | 0x16) #define VM_GET_RUN_STATE (VMM_CPU_IOC_BASE | 0x17) #define VM_SET_RUN_STATE (VMM_CPU_IOC_BASE | 0x18) +#define VM_GET_FPU (VMM_CPU_IOC_BASE | 0x19) +#define VM_SET_FPU (VMM_CPU_IOC_BASE | 0x1a) /* Operations requiring write-locking the VM */ #define VM_REINIT (VMM_LOCK_IOC_BASE | 0x01) @@ -428,6 +449,7 @@ struct vmm_dirty_tracker { /* Note: forces a barrier on a flush operation before returning. 
*/ #define VM_TRACK_DIRTY_PAGES (VMM_IOC_BASE | 0x20) +#define VM_DESC_FPU_AREA (VMM_IOC_BASE | 0x21) #define VM_DEVMEM_GETOFFSET (VMM_IOC_BASE | 0xff) diff --git a/usr/src/uts/intel/os/archdep.c b/usr/src/uts/intel/os/archdep.c index 3d2996880d..9ef480a69a 100644 --- a/usr/src/uts/intel/os/archdep.c +++ b/usr/src/uts/intel/os/archdep.c @@ -269,7 +269,7 @@ setfpregs(klwp_t *lwp, fpregset_t *fp) &fpu->fpu_regs.kfpu_u.kfpu_xs->xs_fxsave); fpu->fpu_regs.kfpu_xstatus = fp->fp_reg_set.fpchip_state.xstatus; - fpu->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= + fpu->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |= (XFEATURE_LEGACY_FP | XFEATURE_SSE); break; default: diff --git a/usr/src/uts/intel/os/fpu.c b/usr/src/uts/intel/os/fpu.c index 0a9b828288..9644282429 100644 --- a/usr/src/uts/intel/os/fpu.c +++ b/usr/src/uts/intel/os/fpu.c @@ -22,7 +22,7 @@ * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2021 Joyent, Inc. * Copyright 2021 RackTop Systems, Inc. - * Copyright 2021 Oxide Computer Company + * Copyright 2022 Oxide Computer Company */ /* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ @@ -528,23 +528,18 @@ const struct xsave_state avx_initial = { * The definition below needs to be identical with sse_initial * defined above. */ - { - FPU_CW_INIT, /* fx_fcw */ - 0, /* fx_fsw */ - 0, /* fx_fctw */ - 0, /* fx_fop */ - 0, /* fx_rip */ - 0, /* fx_rdp */ - SSE_MXCSR_INIT /* fx_mxcsr */ - /* rest of structure is zero */ + .xs_fxsave = { + .fx_fcw = FPU_CW_INIT, + .fx_mxcsr = SSE_MXCSR_INIT, + }, + .xs_header = { + /* + * bit0 = 1 for XSTATE_BV to indicate that legacy fields are + * valid, and CPU should initialize XMM/YMM. + */ + .xsh_xstate_bv = 1, + .xsh_xcomp_bv = 0, }, - /* - * bit0 = 1 for XSTATE_BV to indicate that legacy fields are valid, - * and CPU should initialize XMM/YMM. 
- */ - 1, - 0 /* xs_xcomp_bv */ - /* rest of structure is zero */ }; /* @@ -656,8 +651,8 @@ fp_new_lwp(void *parent, void *child) bcopy(&avx_initial, cxs, sizeof (*cxs)); cfx->fx_mxcsr = fx->fx_mxcsr & ~SSE_MXCSR_EFLAGS; cfx->fx_fcw = fx->fx_fcw; - cxs->xs_xstate_bv |= (get_xcr(XFEATURE_ENABLED_MASK) & - XFEATURE_FP_INITIAL); + cxs->xs_header.xsh_xstate_bv |= + (get_xcr(XFEATURE_ENABLED_MASK) & XFEATURE_FP_INITIAL); break; default: panic("Invalid fp_save_mech"); @@ -973,7 +968,8 @@ fpexterrflt(struct regs *rp) * Always set LEGACY_FP as it may have been cleared by XSAVE * instruction */ - fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP; + fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |= + XFEATURE_LEGACY_FP; break; default: panic("Invalid fp_save_mech"); @@ -1154,7 +1150,8 @@ fpsetcw(uint16_t fcw, uint32_t mxcsr) * Always set LEGACY_FP as it may have been cleared by XSAVE * instruction */ - fp->fpu_regs.kfpu_u.kfpu_xs->xs_xstate_bv |= XFEATURE_LEGACY_FP; + fp->fpu_regs.kfpu_u.kfpu_xs->xs_header.xsh_xstate_bv |= + XFEATURE_LEGACY_FP; break; default: panic("Invalid fp_save_mech"); @@ -1177,7 +1174,7 @@ kernel_fpu_fpstate_init(kfpu_state_t *kfpu) xs = kfpu->kfpu_ctx.fpu_regs.kfpu_u.kfpu_xs; bzero(xs, cpuid_get_xsave_size()); bcopy(&avx_initial, xs, sizeof (*xs)); - xs->xs_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE; + xs->xs_header.xsh_xstate_bv = XFEATURE_LEGACY_FP | XFEATURE_SSE; kfpu->kfpu_ctx.fpu_xsave_mask = XFEATURE_FP_ALL; break; default: diff --git a/usr/src/uts/intel/sys/fp.h b/usr/src/uts/intel/sys/fp.h index dfbcf7dc1c..7423444c60 100644 --- a/usr/src/uts/intel/sys/fp.h +++ b/usr/src/uts/intel/sys/fp.h @@ -21,6 +21,7 @@ /* * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2018, Joyent, Inc. + * Copyright 2022 Oxide Computer Company * * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. 
*/ @@ -230,11 +231,23 @@ struct fxsave_state { } __aligned(16); /* 512 bytes */ /* + * This structure represents the header portion of the data layout used by the + * 'xsave' instruction variants. It is documented in section 13.4.2 of the + * Intel 64 and IA-32 Architectures Software Developer’s Manual, Volume 1 + * (IASDv1). Although "header" is somewhat of a misnomer, considering the data + * begins at offset 512 of the xsave area, its contents dictate which portions + * of the area are present and how they may be formatted. + */ +struct xsave_header { + uint64_t xsh_xstate_bv; + uint64_t xsh_xcomp_bv; + uint64_t xsh_reserved[6]; +}; + +/* * This structure is written to memory by one of the 'xsave' instruction * variants. The first 512 bytes are compatible with the format of the 'fxsave' - * area. The header portion of the xsave layout is documented in section - * 13.4.2 of the Intel 64 and IA-32 Architectures Software Developer’s Manual, - * Volume 1 (IASDv1). The extended portion is documented in section 13.4.3. + * area. The extended portion is documented in section 13.4.3. * * Our size is at least AVX_XSAVE_SIZE (832 bytes), which is asserted * statically. Enabling additional xsave-related CPU features requires an @@ -245,9 +258,10 @@ struct fxsave_state { * determined dynamically by querying the CPU. See the xsave_info structure in * cpuid.c. * - * xsave component usage is tracked using bits in the xs_xstate_bv field. The - * components are documented in section 13.1 of IASDv1. For easy reference, - * this is a summary of the currently defined component bit definitions: + * xsave component usage is tracked using bits in the xstate_bv field of the + * header. The components are documented in section 13.1 of IASDv1. 
For easy + * reference, this is a summary of the currently defined component bit + * definitions: * x87 0x0001 * SSE 0x0002 * AVX 0x0004 @@ -259,21 +273,28 @@ struct fxsave_state { * PT 0x0100 * PKRU 0x0200 * When xsaveopt_ctxt is being used to save into the xsave_state area, the - * xs_xstate_bv field is updated by the xsaveopt instruction to indicate which + * xstate_bv field is updated by the xsaveopt instruction to indicate which * elements of the xsave area are active. * - * xs_xcomp_bv should always be 0, since we do not currently use the compressed - * form of xsave (xsavec). + * The xcomp_bv field should always be 0, since we do not currently use the + * compressed form of xsave (xsavec). */ struct xsave_state { struct fxsave_state xs_fxsave; /* 0-511 legacy region */ - uint64_t xs_xstate_bv; /* 512-519 start xsave header */ - uint64_t xs_xcomp_bv; /* 520-527 */ - uint64_t xs_reserved[6]; /* 528-575 end xsave header */ + struct xsave_header xs_header; /* 512-575 XSAVE header */ upad128_t xs_ymm[16]; /* 576 AVX component */ } __aligned(64); /* + * While AVX_XSTATE_SIZE is the smallest the kernel will allocate for FPU + * state-saving, other consumers may constrain themselves to the minimum + * possible xsave state structure, which features only the legacy area and the + * bare xsave header. + */ +#define MIN_XSAVE_SIZE (sizeof (struct fxsave_state) + \ + sizeof (struct xsave_header)) + +/* * Kernel's FPU save area */ typedef struct { |