From a68aefedafdc78f1f25e9c888f61357d59c87311 Mon Sep 17 00:00:00 2001
From: Toomas Soome
Date: Sat, 18 Apr 2020 15:22:13 +0300
Subject: 12540 mdb: build with gcore

Reviewed by: John Levon
Approved by: Joshua M. Clulow
---
 usr/src/cmd/mdb/sparc/modules/genunix/gcore_isadep.c | 4 ----
 usr/src/cmd/mdb/sparc/v9/genunix/Makefile | 1 +
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/usr/src/cmd/mdb/sparc/modules/genunix/gcore_isadep.c b/usr/src/cmd/mdb/sparc/modules/genunix/gcore_isadep.c
index 50f60516a1..2839f47219 100644
--- a/usr/src/cmd/mdb/sparc/modules/genunix/gcore_isadep.c
+++ b/usr/src/cmd/mdb/sparc/modules/genunix/gcore_isadep.c
@@ -17,8 +17,6 @@
  * implemented.
  */
 
-#ifndef _KMDB
-
 #include 
 
 /* ARGSUSED */
@@ -54,5 +52,3 @@ gcore_prgetrvals(mdb_klwp_t *lwp, long *rval1, long *rval2)
 {
 	return (0);
 }
-
-#endif /* _KMDB */
diff --git a/usr/src/cmd/mdb/sparc/v9/genunix/Makefile b/usr/src/cmd/mdb/sparc/v9/genunix/Makefile
index 00dadda34b..8bfb17c607 100644
--- a/usr/src/cmd/mdb/sparc/v9/genunix/Makefile
+++ b/usr/src/cmd/mdb/sparc/v9/genunix/Makefile
@@ -38,6 +38,7 @@ KMODSRCS = \
 
 MODSRCS = \
 	$(COMMONSRCS) \
+	gcore.c \
 	typegraph.c
 
 #
--
cgit v1.2.3


From 4c87aefe8930bd07275b8dd2e96ea5f24d93a52e Mon Sep 17 00:00:00 2001
From: Patrick Mooney
Date: Tue, 10 Oct 2017 12:37:29 +0200
Subject: 12665 want modern bhyve

Portions contributed by: Hans Rosenfeld
Portions contributed by: John Levon
Portions contributed by: Mike Gerdts
Portions contributed by: Andy Fiddaman
Portions contributed by: Dominik Hassler
Portions contributed by: Jerry Jelinek
Portions contributed by: Robert Mustacchi
Portions contributed by: Mike Zeller
Reviewed by: Andy Fiddaman
Approved by: Dan McDonald
---
 exception_lists/check_rtime | 2 +
 exception_lists/copyright | 84 +
 exception_lists/cstyle | 83 +
 exception_lists/hdrchk | 50 +
 exception_lists/packaging | 6 +
 exception_lists/wscheck | 81 +
 usr/contrib/freebsd/amd64/machine/pmap.h | 455 ++++
 usr/contrib/freebsd/amd64/machine/specialreg.h | 6 -
 usr/contrib/freebsd/dev/io/iodev.h | 44 +
 usr/contrib/freebsd/dev/mii/mii.h | 239 ++
 usr/contrib/freebsd/dev/nvme/nvme.h | 1511 +++++++++++
 usr/contrib/freebsd/dev/usb/controller/xhcireg.h | 224 ++
 usr/contrib/freebsd/dev/usb/usb.h | 801 ++++++
 usr/contrib/freebsd/dev/usb/usb_endian.h | 121 +
 usr/contrib/freebsd/dev/usb/usb_freebsd.h | 101 +
 usr/contrib/freebsd/dev/usb/usbdi.h | 657 +++++
 usr/contrib/freebsd/isa/rtc.h | 125 +
 usr/contrib/freebsd/lib/libutil/humanize_number.c | 179 ++
 usr/contrib/freebsd/sys/ata.h | 392 ++-
 usr/contrib/freebsd/sys/linker_set.h | 119 -
 usr/contrib/freebsd/sys/pciio.h | 146 ++
 usr/contrib/freebsd/sys/queue.h | 787 ++++++
 usr/contrib/freebsd/x86/segments.h | 274 ++
 usr/contrib/freebsd/x86/specialreg.h | 320 ++-
 usr/src/Makefile.master | 6 +
 usr/src/cmd/Makefile | 2 +
 usr/src/cmd/bhyve/Makefile | 129 +-
 usr/src/cmd/bhyve/Makefile.com | 94 -
 usr/src/cmd/bhyve/acpi.c | 1007 ++++++++
 usr/src/cmd/bhyve/acpi.h | 4 +-
 usr/src/cmd/bhyve/ahci.h | 534 ++--
 usr/src/cmd/bhyve/amd64/Makefile | 21 -
 usr/src/cmd/bhyve/atkbdc.c | 218 +-
 usr/src/cmd/bhyve/atkbdc.h | 2 +-
 usr/src/cmd/bhyve/bhyve_sol_glue.c | 57 +-
 usr/src/cmd/bhyve/bhyvegc.c | 35 +-
 usr/src/cmd/bhyve/bhyvegc.h | 6 +-
 usr/src/cmd/bhyve/bhyverun.c | 772 +++++-
 usr/src/cmd/bhyve/bhyverun.h | 11 +-
 usr/src/cmd/bhyve/block_if.c | 576 ++++-
 usr/src/cmd/bhyve/block_if.h | 25 +-
 usr/src/cmd/bhyve/bootrom.c | 113 +
 usr/src/cmd/bhyve/bootrom.h | 40 +
 usr/src/cmd/bhyve/console.c | 41 +-
 usr/src/cmd/bhyve/console.h | 19 +-
 usr/src/cmd/bhyve/consport.c | 45 +-
 usr/src/cmd/bhyve/dbgport.c | 180 ++
 usr/src/cmd/bhyve/dbgport.h | 4 +-
 usr/src/cmd/bhyve/fwctl.c | 552 ++++
 usr/src/cmd/bhyve/fwctl.h | 56 +
 usr/src/cmd/bhyve/gdb.c | 1523 +++++++++++
 usr/src/cmd/bhyve/gdb.h | 38 +
 usr/src/cmd/bhyve/inout.c | 44 +-
 usr/src/cmd/bhyve/inout.h | 4 +-
 usr/src/cmd/bhyve/ioapic.c | 15 +-
 usr/src/cmd/bhyve/ioapic.h | 10 +-
 usr/src/cmd/bhyve/iov.c | 148 ++
 usr/src/cmd/bhyve/iov.h | 44 +
 usr/src/cmd/bhyve/mem.c | 135 +-
 usr/src/cmd/bhyve/mem.h | 10 +-
 usr/src/cmd/bhyve/mevent.c | 680 +++++
 usr/src/cmd/bhyve/mevent.h | 53 +
 usr/src/cmd/bhyve/mevent_test.c | 282 ++
 usr/src/cmd/bhyve/mptbl.c | 6 +-
 usr/src/cmd/bhyve/mptbl.h | 4 +-
 usr/src/cmd/bhyve/pci_ahci.c | 1060 +++++---
 usr/src/cmd/bhyve/pci_e82545.c | 2418 +++++++++++++++++
 usr/src/cmd/bhyve/pci_emul.c | 210 +-
 usr/src/cmd/bhyve/pci_emul.h | 12 +-
 usr/src/cmd/bhyve/pci_fbuf.c | 467 ++++
 usr/src/cmd/bhyve/pci_hostbridge.c | 170 +-
 usr/src/cmd/bhyve/pci_irq.c | 47 +-
 usr/src/cmd/bhyve/pci_irq.h | 8 +-
 usr/src/cmd/bhyve/pci_lpc.c | 66 +-
 usr/src/cmd/bhyve/pci_lpc.h | 6 +-
 usr/src/cmd/bhyve/pci_nvme.c | 1953 ++++++++++++++
 usr/src/cmd/bhyve/pci_passthru.c | 937 +++++++
 usr/src/cmd/bhyve/pci_uart.c | 121 +
 usr/src/cmd/bhyve/pci_virtio_block.c | 284 +-
 usr/src/cmd/bhyve/pci_virtio_console.c | 701 +++++
 usr/src/cmd/bhyve/pci_virtio_net.c | 723 ++++--
 usr/src/cmd/bhyve/pci_virtio_rnd.c | 209 ++
 usr/src/cmd/bhyve/pci_virtio_scsi.c | 737 ++++++
 usr/src/cmd/bhyve/pci_virtio_viona.c | 9 +-
 usr/src/cmd/bhyve/pci_xhci.c | 2855 +++++++++++++++++++++
 usr/src/cmd/bhyve/pci_xhci.h | 355 +++
 usr/src/cmd/bhyve/pm.c | 65 +-
 usr/src/cmd/bhyve/pmtmr.c | 212 --
 usr/src/cmd/bhyve/post.c | 6 +-
 usr/src/cmd/bhyve/ps2kbd.c | 317 +--
 usr/src/cmd/bhyve/ps2kbd.h | 2 +
 usr/src/cmd/bhyve/ps2mouse.c | 79 +-
 usr/src/cmd/bhyve/ps2mouse.h | 6 +-
 usr/src/cmd/bhyve/rfb.c | 914 ++++++-
 usr/src/cmd/bhyve/rfb.h | 8 +-
 usr/src/cmd/bhyve/rtc.c | 317 +--
 usr/src/cmd/bhyve/rtc.h | 8 +-
 usr/src/cmd/bhyve/smbiostbl.c | 98 +-
 usr/src/cmd/bhyve/smbiostbl.h | 9 +-
 usr/src/cmd/bhyve/sockstream.c | 86 +
 usr/src/cmd/bhyve/sockstream.h | 35 +
 usr/src/cmd/bhyve/spinup_ap.c | 6 +-
 usr/src/cmd/bhyve/spinup_ap.h | 4 +-
 usr/src/cmd/bhyve/task_switch.c | 941 +++++++
 usr/src/cmd/bhyve/test/Makefile | 18 +
 usr/src/cmd/bhyve/test/Makefile.com | 61 +
 usr/src/cmd/bhyve/test/Makefile.subdirs | 29 +
 usr/src/cmd/bhyve/test/Makefile.targ | 55 +
 usr/src/cmd/bhyve/test/scripts/Makefile | 28 +
 usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh | 231 ++
 usr/src/cmd/bhyve/test/tst/Makefile | 18 +
 usr/src/cmd/bhyve/test/tst/mevent/Makefile | 30 +
 usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c | 172 ++
 usr/src/cmd/bhyve/test/tst/mevent/mevent.c | 57 +
 usr/src/cmd/bhyve/test/tst/mevent/read.disable.c | 163 ++
 usr/src/cmd/bhyve/test/tst/mevent/read.pause.c | 152 ++
 usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c | 108 +
 usr/src/cmd/bhyve/test/tst/mevent/testlib.c | 70 +
 usr/src/cmd/bhyve/test/tst/mevent/testlib.h | 93 +
 usr/src/cmd/bhyve/uart_emul.c | 987 ++++---
 usr/src/cmd/bhyve/uart_emul.h | 4 +-
 usr/src/cmd/bhyve/usb_emul.c | 78 +
 usr/src/cmd/bhyve/usb_emul.h | 164 ++
 usr/src/cmd/bhyve/usb_mouse.c | 809 ++++++
 usr/src/cmd/bhyve/vga.c | 106 +-
 usr/src/cmd/bhyve/vga.h | 80 +-
 usr/src/cmd/bhyve/virtio.c | 68 +-
 usr/src/cmd/bhyve/virtio.h | 75 +-
 usr/src/cmd/bhyve/xmsr.c | 21 +-
 usr/src/cmd/bhyve/xmsr.h | 4 +-
 usr/src/cmd/bhyveconsole/Makefile | 41 -
 usr/src/cmd/bhyveconsole/bhyveconsole.c | 360 ---
 usr/src/cmd/bhyveconsole/i386/Makefile | 43 -
 usr/src/cmd/bhyvectl/Makefile | 45 +-
 usr/src/cmd/bhyvectl/Makefile.com | 48 -
 usr/src/cmd/bhyvectl/amd64/Makefile | 21 -
 usr/src/cmd/bhyvectl/bhyvectl.c | 2347 +++++++++++------
 usr/src/cmd/bhyveload-uefi/Makefile | 41 -
 usr/src/cmd/bhyveload-uefi/Makefile.com | 52 -
 usr/src/cmd/bhyveload-uefi/amd64/Makefile | 21 -
 usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c | 190 --
 usr/src/cmd/bhyveload-uefi/i386/Makefile | 18 -
 usr/src/cmd/devfsadm/i386/misc_link_i386.c | 18 +
 usr/src/cmd/mdb/intel/amd64/vmm/Makefile | 20 -
 usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile | 32 -
 usr/src/cmd/mdb/intel/amd64/vmm/vmm.c | 238 --
 usr/src/compat/freebsd/amd64/machine/asmacros.h | 3 +
 usr/src/compat/freebsd/amd64/machine/atomic.h | 92 +-
 usr/src/compat/freebsd/amd64/machine/cpufunc.h | 154 +-
 usr/src/compat/freebsd/amd64/machine/fpu.h | 3 +-
 usr/src/compat/freebsd/amd64/machine/iodev.h | 19 +
 usr/src/compat/freebsd/amd64/machine/md_var.h | 4 +
 usr/src/compat/freebsd/amd64/machine/param.h | 2 +
 usr/src/compat/freebsd/amd64/machine/pmap.h | 461 +++-
 usr/src/compat/freebsd/amd64/machine/reg.h | 23 +
 usr/src/compat/freebsd/amd64/machine/smp.h | 11 +
 usr/src/compat/freebsd/amd64/machine/specialreg.h | 61 +
 usr/src/compat/freebsd/amd64/machine/vmm.h | 3 +
 usr/src/compat/freebsd/amd64/machine/vmparam.h | 26 +
 usr/src/compat/freebsd/err.h | 23 +
 usr/src/compat/freebsd/libutil.h | 14 +
 usr/src/compat/freebsd/net/ethernet.h | 14 +
 usr/src/compat/freebsd/pthread_np.h | 4 +-
 usr/src/compat/freebsd/sys/_cpuset.h | 33 +
 usr/src/compat/freebsd/sys/callout.h | 4 +
 usr/src/compat/freebsd/sys/cdefs.h | 67 +-
 usr/src/compat/freebsd/sys/clock.h | 110 +
 usr/src/compat/freebsd/sys/cpuset.h | 112 +-
 usr/src/compat/freebsd/sys/endian.h | 11 +
 usr/src/compat/freebsd/sys/eventhandler.h | 19 +
 usr/src/compat/freebsd/sys/ioctl.h | 2 +
 usr/src/compat/freebsd/sys/kernel.h | 19 +-
 usr/src/compat/freebsd/sys/limits.h | 5 +
 usr/src/compat/freebsd/sys/lock.h | 23 +
 usr/src/compat/freebsd/sys/malloc.h | 5 +
 usr/src/compat/freebsd/sys/mutex.h | 5 +-
 usr/src/compat/freebsd/sys/param.h | 9 +
 usr/src/compat/freebsd/sys/sdt.h | 37 +
 usr/src/compat/freebsd/sys/sglist.h | 29 +
 usr/src/compat/freebsd/sys/smp.h | 6 +-
 usr/src/compat/freebsd/sys/socket.h | 23 +
 usr/src/compat/freebsd/sys/systm.h | 9 -
 usr/src/compat/freebsd/sys/time.h | 28 +-
 usr/src/compat/freebsd/sys/types.h | 15 +-
 usr/src/compat/freebsd/unistd.h | 23 +
 usr/src/compat/freebsd/vm/pmap.h | 21 -
 usr/src/compat/freebsd/vm/vm.h | 31 +-
 usr/src/compat/freebsd/vm/vm_param.h | 21 +
 usr/src/compat/freebsd/x86/_types.h | 2 +
 usr/src/compat/freebsd/x86/segments.h | 15 +-
 usr/src/head/bhyve.h | 25 -
 usr/src/lib/Makefile | 6 +-
 usr/src/lib/libvmmapi/Makefile | 4 +-
 usr/src/lib/libvmmapi/Makefile.com | 16 +-
 usr/src/lib/libvmmapi/common/mapfile-vers | 140 +-
 usr/src/lib/libvmmapi/common/vmmapi.c | 911 +++++--
 usr/src/lib/libvmmapi/common/vmmapi.h | 126 +-
 usr/src/pkg/manifests/system-bhyve-tests.mf | 35 +
 usr/src/pkg/manifests/system-bhyve.mf | 46 +
 usr/src/pkg/manifests/system-library-bhyve.mf | 31 +
 usr/src/req.flg | 2 +
 usr/src/tools/scripts/build_cscope.conf | 4 +-
 usr/src/tools/scripts/gensetdefs.pl | 31 -
 usr/src/uts/Makefile.targ | 3 +-
 usr/src/uts/common/Makefile.files | 6 +-
 usr/src/uts/i86pc/Makefile.files | 42 +-
 usr/src/uts/i86pc/Makefile.i86pc | 1 +
 usr/src/uts/i86pc/Makefile.rules | 29 +
 usr/src/uts/i86pc/io/viona/viona.c | 17 +-
 usr/src/uts/i86pc/io/vmm/README.sync | 18 +
 usr/src/uts/i86pc/io/vmm/amd/amdv.c | 137 +-
 usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c | 1461 +++++++++++
 usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h | 431 ++++
 usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c | 735 ++++++
 usr/src/uts/i86pc/io/vmm/amd/npt.c | 87 +
 usr/src/uts/i86pc/io/vmm/amd/npt.h | 38 +
 usr/src/uts/i86pc/io/vmm/amd/offsets.in | 36 +
 usr/src/uts/i86pc/io/vmm/amd/svm.c | 2446 ++++++++++++++++++
 usr/src/uts/i86pc/io/vmm/amd/svm.h | 74 +
 usr/src/uts/i86pc/io/vmm/amd/svm_msr.c | 199 ++
 usr/src/uts/i86pc/io/vmm/amd/svm_msr.h | 46 +
 usr/src/uts/i86pc/io/vmm/amd/svm_softc.h | 131 +
 usr/src/uts/i86pc/io/vmm/amd/svm_support.s | 164 ++
 usr/src/uts/i86pc/io/vmm/amd/vmcb.c | 454 ++++
 usr/src/uts/i86pc/io/vmm/amd/vmcb.h | 336 +++
 usr/src/uts/i86pc/io/vmm/intel/ept.c | 362 +--
 usr/src/uts/i86pc/io/vmm/intel/ept.h | 18 +-
 usr/src/uts/i86pc/io/vmm/intel/offsets.in | 62 +
 usr/src/uts/i86pc/io/vmm/intel/vmcs.c | 127 +-
 usr/src/uts/i86pc/io/vmm/intel/vmcs.h | 102 +-
 usr/src/uts/i86pc/io/vmm/intel/vmx.c | 2407 +++++++++++++----
 usr/src/uts/i86pc/io/vmm/intel/vmx.h | 92 +-
 usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h | 4 +-
 usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h | 39 +-
 usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c | 101 +-
 usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h | 4 +-
 usr/src/uts/i86pc/io/vmm/intel/vmx_support.s | 445 ++--
 usr/src/uts/i86pc/io/vmm/intel/vtd.c | 690 +++++
 usr/src/uts/i86pc/io/vmm/io/iommu.h | 76 +
 usr/src/uts/i86pc/io/vmm/io/ppt.h | 56 +
 usr/src/uts/i86pc/io/vmm/io/sol_iommu.c | 86 +
 usr/src/uts/i86pc/io/vmm/io/sol_ppt.c | 92 +
 usr/src/uts/i86pc/io/vmm/io/vatpic.c | 5 +-
 usr/src/uts/i86pc/io/vmm/io/vatpic.h | 2 +-
 usr/src/uts/i86pc/io/vmm/io/vatpit.c | 87 +-
 usr/src/uts/i86pc/io/vmm/io/vatpit.h | 8 +-
 usr/src/uts/i86pc/io/vmm/io/vhpet.c | 104 +-
 usr/src/uts/i86pc/io/vmm/io/vhpet.h | 12 +-
 usr/src/uts/i86pc/io/vmm/io/vioapic.c | 176 +-
 usr/src/uts/i86pc/io/vmm/io/vioapic.h | 8 +-
 usr/src/uts/i86pc/io/vmm/io/vlapic.c | 236 +-
 usr/src/uts/i86pc/io/vmm/io/vlapic.h | 27 +-
 usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h | 23 +-
 usr/src/uts/i86pc/io/vmm/io/vpmtmr.c | 105 +
 usr/src/uts/i86pc/io/vmm/io/vpmtmr.h | 44 +
 usr/src/uts/i86pc/io/vmm/io/vrtc.c | 1061 ++++++++
 usr/src/uts/i86pc/io/vmm/io/vrtc.h | 60 +
 usr/src/uts/i86pc/io/vmm/offsets.in | 72 -
 usr/src/uts/i86pc/io/vmm/vm/pmap.h | 27 +
 usr/src/uts/i86pc/io/vmm/vm/vm_extern.h | 35 +
 usr/src/uts/i86pc/io/vmm/vm/vm_glue.h | 99 +
 usr/src/uts/i86pc/io/vmm/vm/vm_map.h | 63 +
 usr/src/uts/i86pc/io/vmm/vm/vm_object.h | 31 +
 usr/src/uts/i86pc/io/vmm/vm/vm_page.h | 28 +
 usr/src/uts/i86pc/io/vmm/vm/vm_pager.h | 23 +
 usr/src/uts/i86pc/io/vmm/vmm.c | 2235 ++++++++++++----
 usr/src/uts/i86pc/io/vmm/vmm.mapfile | 62 +
 usr/src/uts/i86pc/io/vmm/vmm_host.c | 57 +-
 usr/src/uts/i86pc/io/vmm/vmm_host.h | 17 +-
 usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c | 552 +++-
 usr/src/uts/i86pc/io/vmm/vmm_ioport.c | 38 +-
 usr/src/uts/i86pc/io/vmm/vmm_ioport.h | 4 +-
 usr/src/uts/i86pc/io/vmm/vmm_ipi.h | 37 -
 usr/src/uts/i86pc/io/vmm/vmm_ktr.h | 4 +-
 usr/src/uts/i86pc/io/vmm/vmm_lapic.c | 17 +-
 usr/src/uts/i86pc/io/vmm/vmm_lapic.h | 4 +-
 usr/src/uts/i86pc/io/vmm/vmm_mem.c | 124 +
 usr/src/uts/i86pc/io/vmm/vmm_mem.h | 12 +-
 usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c | 2378 ++++++++++++-----
 usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c | 268 ++
 usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c | 907 +++----
 usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c | 111 -
 usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c | 297 +++
 usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c | 1016 ++++++++
 usr/src/uts/i86pc/io/vmm/vmm_stat.c | 172 ++
 usr/src/uts/i86pc/io/vmm/vmm_stat.h | 69 +-
 usr/src/uts/i86pc/io/vmm/vmm_support.s | 54 +
 usr/src/uts/i86pc/io/vmm/vmm_util.c | 6 +-
 usr/src/uts/i86pc/io/vmm/vmm_util.h | 4 +-
 usr/src/uts/i86pc/io/vmm/vmm_zsd.c | 218 ++
 usr/src/uts/i86pc/io/vmm/vmx_assym.s | 1 -
 usr/src/uts/i86pc/io/vmm/x86.c | 475 +++-
 usr/src/uts/i86pc/io/vmm/x86.h | 19 +-
 usr/src/uts/i86pc/os/gipt.c | 566 ++++
 usr/src/uts/i86pc/sys/gipt.h | 92 +
 usr/src/uts/i86pc/sys/viona_io.h | 5 +-
 usr/src/uts/i86pc/sys/vmm.h | 258 +-
 usr/src/uts/i86pc/sys/vmm_dev.h | 181 +-
 usr/src/uts/i86pc/sys/vmm_drv.h | 50 +
 usr/src/uts/i86pc/sys/vmm_impl.h | 105 +-
 usr/src/uts/i86pc/sys/vmm_instruction_emul.h | 23 +-
 usr/src/uts/i86pc/viona/Makefile | 9 +-
 usr/src/uts/i86pc/vmm/Makefile | 74 +-
 usr/src/uts/req.flg | 3 +
 304 files changed, 56348 insertions(+), 9173 deletions(-)
 create mode 100644 usr/contrib/freebsd/amd64/machine/pmap.h
 delete mode 100644 usr/contrib/freebsd/amd64/machine/specialreg.h
 create mode 100644 usr/contrib/freebsd/dev/io/iodev.h
 create mode 100644 usr/contrib/freebsd/dev/mii/mii.h
 create mode 100644 usr/contrib/freebsd/dev/nvme/nvme.h
 create mode 100644 usr/contrib/freebsd/dev/usb/controller/xhcireg.h
 create mode 100644 usr/contrib/freebsd/dev/usb/usb.h
 create mode 100644 usr/contrib/freebsd/dev/usb/usb_endian.h
 create mode 100644 usr/contrib/freebsd/dev/usb/usb_freebsd.h
 create mode 100644 usr/contrib/freebsd/dev/usb/usbdi.h
 create mode 100644 usr/contrib/freebsd/isa/rtc.h
 create mode 100644 usr/contrib/freebsd/lib/libutil/humanize_number.c
 delete mode 100644 usr/contrib/freebsd/sys/linker_set.h
 create mode 100644 usr/contrib/freebsd/sys/pciio.h
 create mode 100644 usr/contrib/freebsd/sys/queue.h
 create mode 100644 usr/contrib/freebsd/x86/segments.h
 delete mode 100644 usr/src/cmd/bhyve/Makefile.com
 create mode 100644 usr/src/cmd/bhyve/acpi.c
 delete mode 100644 usr/src/cmd/bhyve/amd64/Makefile
 create mode 100644 usr/src/cmd/bhyve/bootrom.c
 create mode 100644 usr/src/cmd/bhyve/bootrom.h
 create mode 100644 usr/src/cmd/bhyve/dbgport.c
 create mode 100644 usr/src/cmd/bhyve/fwctl.c
 create mode 100644 usr/src/cmd/bhyve/fwctl.h
 create mode 100644 usr/src/cmd/bhyve/gdb.c
 create mode 100644 usr/src/cmd/bhyve/gdb.h
 create mode 100644 usr/src/cmd/bhyve/iov.c
 create mode 100644 usr/src/cmd/bhyve/iov.h
 create mode 100644 usr/src/cmd/bhyve/mevent.c
 create mode 100644 usr/src/cmd/bhyve/mevent.h
 create mode 100644 usr/src/cmd/bhyve/mevent_test.c
 create mode 100644 usr/src/cmd/bhyve/pci_e82545.c
 create mode 100644 usr/src/cmd/bhyve/pci_fbuf.c
 create mode 100644 usr/src/cmd/bhyve/pci_nvme.c
 create mode 100644 usr/src/cmd/bhyve/pci_passthru.c
 create mode 100644 usr/src/cmd/bhyve/pci_uart.c
 create mode 100644 usr/src/cmd/bhyve/pci_virtio_console.c
 create mode 100644 usr/src/cmd/bhyve/pci_virtio_rnd.c
 create mode 100644 usr/src/cmd/bhyve/pci_virtio_scsi.c
 create mode 100644 usr/src/cmd/bhyve/pci_xhci.c
 create mode 100644 usr/src/cmd/bhyve/pci_xhci.h
 delete mode 100644 usr/src/cmd/bhyve/pmtmr.c
 create mode 100644 usr/src/cmd/bhyve/sockstream.c
 create mode 100644 usr/src/cmd/bhyve/sockstream.h
 create mode 100644 usr/src/cmd/bhyve/task_switch.c
 create mode 100644 usr/src/cmd/bhyve/test/Makefile
 create mode 100644 usr/src/cmd/bhyve/test/Makefile.com
 create mode 100644 usr/src/cmd/bhyve/test/Makefile.subdirs
 create mode 100644 usr/src/cmd/bhyve/test/Makefile.targ
 create mode 100644 usr/src/cmd/bhyve/test/scripts/Makefile
 create mode 100644 usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh
 create mode 100644 usr/src/cmd/bhyve/test/tst/Makefile
 create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/Makefile
 create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c
 create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/mevent.c
 create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/read.disable.c
 create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/read.pause.c
 create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c
 create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/testlib.c
 create mode 100644 usr/src/cmd/bhyve/test/tst/mevent/testlib.h
 create mode 100644 usr/src/cmd/bhyve/usb_emul.c
 create mode 100644 usr/src/cmd/bhyve/usb_emul.h
 create mode 100644 usr/src/cmd/bhyve/usb_mouse.c
 delete mode 100644 usr/src/cmd/bhyveconsole/Makefile
 delete mode 100644 usr/src/cmd/bhyveconsole/bhyveconsole.c
 delete mode 100644 usr/src/cmd/bhyveconsole/i386/Makefile
 delete mode 100644 usr/src/cmd/bhyvectl/Makefile.com
 delete mode 100644 usr/src/cmd/bhyvectl/amd64/Makefile
 delete mode 100644 usr/src/cmd/bhyveload-uefi/Makefile
 delete mode 100644 usr/src/cmd/bhyveload-uefi/Makefile.com
 delete mode 100644 usr/src/cmd/bhyveload-uefi/amd64/Makefile
 delete mode 100644 usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c
 delete mode 100644 usr/src/cmd/bhyveload-uefi/i386/Makefile
 delete mode 100644 usr/src/cmd/mdb/intel/amd64/vmm/Makefile
 delete mode 100644 usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile
 delete mode 100644 usr/src/cmd/mdb/intel/amd64/vmm/vmm.c
 create mode 100644 usr/src/compat/freebsd/amd64/machine/iodev.h
 create mode 100644 usr/src/compat/freebsd/amd64/machine/reg.h
 create mode 100644 usr/src/compat/freebsd/amd64/machine/specialreg.h
 create mode 100644 usr/src/compat/freebsd/err.h
 create mode 100644 usr/src/compat/freebsd/sys/_cpuset.h
 create mode 100644 usr/src/compat/freebsd/sys/clock.h
 create mode 100644 usr/src/compat/freebsd/sys/eventhandler.h
 create mode 100644 usr/src/compat/freebsd/sys/lock.h
 create mode 100644 usr/src/compat/freebsd/sys/sdt.h
 create mode 100644 usr/src/compat/freebsd/sys/sglist.h
 create mode 100644 usr/src/compat/freebsd/sys/socket.h
 create mode 100644 usr/src/compat/freebsd/unistd.h
 delete mode 100644 usr/src/compat/freebsd/vm/pmap.h
 create mode 100644 usr/src/compat/freebsd/vm/vm_param.h
 delete mode 100644 usr/src/head/bhyve.h
 create mode 100644 usr/src/pkg/manifests/system-bhyve-tests.mf
 create mode 100644 usr/src/pkg/manifests/system-bhyve.mf
 create mode 100644 usr/src/pkg/manifests/system-library-bhyve.mf
 delete mode 100644 usr/src/tools/scripts/gensetdefs.pl
 create mode 100644 usr/src/uts/i86pc/io/vmm/README.sync
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/npt.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/npt.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/offsets.in
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/svm.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/svm.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/svm_msr.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/svm_msr.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/svm_softc.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/svm_support.s
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/vmcb.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/amd/vmcb.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/intel/offsets.in
 create mode 100644 usr/src/uts/i86pc/io/vmm/intel/vtd.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/io/iommu.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/io/ppt.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/io/sol_iommu.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/io/sol_ppt.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/io/vpmtmr.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/io/vpmtmr.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/io/vrtc.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/io/vrtc.h
 delete mode 100644 usr/src/uts/i86pc/io/vmm/offsets.in
 create mode 100644 usr/src/uts/i86pc/io/vmm/vm/pmap.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/vm/vm_extern.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/vm/vm_glue.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/vm/vm_map.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/vm/vm_object.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/vm/vm_page.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/vm/vm_pager.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/vmm.mapfile
 delete mode 100644 usr/src/uts/i86pc/io/vmm/vmm_ipi.h
 create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_mem.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c
 delete mode 100644 usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_stat.c
 create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_support.s
 create mode 100644 usr/src/uts/i86pc/io/vmm/vmm_zsd.c
 delete mode 100644 usr/src/uts/i86pc/io/vmm/vmx_assym.s
 create mode 100644 usr/src/uts/i86pc/os/gipt.c
 create mode 100644 usr/src/uts/i86pc/sys/gipt.h
 create mode 100644 usr/src/uts/i86pc/sys/vmm_drv.h

diff --git a/exception_lists/check_rtime b/exception_lists/check_rtime
index 01bb189dca..42964957d4 100644
--- a/exception_lists/check_rtime
+++ b/exception_lists/check_rtime
@@ -24,6 +24,7 @@
 # Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
 # Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
 # Copyright 2019 Peter Tribble.
+# Copyright 2018 Joyent, Inc.
# Copyright 2020 Oxide Computer Company # @@ -83,6 +84,7 @@ EXEC_STACK ^opt/os-tests/tests/secflags/stacky$ # Objects for which we allow relocations to the text segment TEXTREL ^platform/.*/MACH(kernel)/unix$ +TEXTREL ^usr/sbin/amd64/bhyve$ # Directories and files that are allowed to have no direct bound symbols NODIRECT ^platform/.*/MACH(kernel)/unix$ diff --git a/exception_lists/copyright b/exception_lists/copyright index 647bc46b60..c62835e304 100644 --- a/exception_lists/copyright +++ b/exception_lists/copyright @@ -466,3 +466,87 @@ usr/src/uts/common/sys/THIRDPARTYLICENSE.firmload usr/src/uts/common/sys/THIRDPARTYLICENSE.firmload.descrip usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/* usr/src/uts/sparc/nsmb/ioc_check.ref + +# bhyve sources +usr/src/cmd/bhyve/acpi.[ch] +usr/src/cmd/bhyve/ahci.h +usr/src/cmd/bhyve/atkbdc.[ch] +usr/src/cmd/bhyve/bhyvegc.[ch] +usr/src/cmd/bhyve/bhyverun.[ch] +usr/src/cmd/bhyve/block_if.[ch] +usr/src/cmd/bhyve/bootrom.[ch] +usr/src/cmd/bhyve/console.[ch] +usr/src/cmd/bhyve/consport.c +usr/src/cmd/bhyve/dbgport.[ch] +usr/src/cmd/bhyve/fwctl.[ch] +usr/src/cmd/bhyve/gdb.[ch] +usr/src/cmd/bhyve/inout.[ch] +usr/src/cmd/bhyve/ioapic.[ch] +usr/src/cmd/bhyve/mem.[ch] +usr/src/cmd/bhyve/mevent.[ch] +usr/src/cmd/bhyve/mevent_test.c +usr/src/cmd/bhyve/mptbl.[ch] +usr/src/cmd/bhyve/pci_ahci.c +usr/src/cmd/bhyve/pci_e82545.c +usr/src/cmd/bhyve/pci_emul.[ch] +usr/src/cmd/bhyve/pci_fbuf.c +usr/src/cmd/bhyve/pci_hostbridge.c +usr/src/cmd/bhyve/pci_irq.[ch] +usr/src/cmd/bhyve/pci_lpc.[ch] +usr/src/cmd/bhyve/pci_nvme.c +usr/src/cmd/bhyve/pci_passthru.c +usr/src/cmd/bhyve/pci_uart.c +usr/src/cmd/bhyve/pci_virtio_block.c +usr/src/cmd/bhyve/pci_virtio_net.c +usr/src/cmd/bhyve/pci_virtio_rnd.c +usr/src/cmd/bhyve/pci_virtio_scsi.c +usr/src/cmd/bhyve/pci_xhci.[ch] +usr/src/cmd/bhyve/pm.c +usr/src/cmd/bhyve/pmtmr.c +usr/src/cmd/bhyve/post.c +usr/src/cmd/bhyve/ps2kbd.[ch] +usr/src/cmd/bhyve/ps2mouse.[ch] +usr/src/cmd/bhyve/rfb.[ch] +usr/src/cmd/bhyve/rtc.[ch] +usr/src/cmd/bhyve/smbiostbl.[ch] +usr/src/cmd/bhyve/sockstream.[ch] +usr/src/cmd/bhyve/spinup_ap.[ch] +usr/src/cmd/bhyve/task_switch.c +usr/src/cmd/bhyve/uart_emul.[ch] +usr/src/cmd/bhyve/usb_emul.[ch] +usr/src/cmd/bhyve/usb_mouse.c +usr/src/cmd/bhyve/vga.[ch] +usr/src/cmd/bhyve/virtio.[ch] +usr/src/cmd/bhyve/xmsr.[ch] +usr/src/cmd/bhyvectl/bhyvectl.c +usr/src/compat/freebsd/*.h +usr/src/compat/freebsd/*/*.h +usr/src/compat/freebsd/amd64/machine/*.h +usr/contrib/freebsd/*/*.h +usr/contrib/freebsd/*/*/*.h +usr/contrib/freebsd/lib/libutil/*.c +usr/src/lib/libvmmapi/common/vmmapi.[ch] +usr/src/tools/scripts/gensetdefs.pl +usr/src/uts/i86pc/io/vmm/amd/*.[chs] +usr/src/uts/i86pc/io/vmm/intel/*.[chs] +usr/src/uts/i86pc/io/vmm/intel/offsets.in +usr/src/uts/i86pc/io/vmm/io/*.[ch] +usr/src/uts/i86pc/io/vmm/README.sync +usr/src/uts/i86pc/io/vmm/vmm.c +usr/src/uts/i86pc/io/vmm/vmm_host.[ch] +usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +usr/src/uts/i86pc/io/vmm/vmm_ioport.[ch] +usr/src/uts/i86pc/io/vmm/vmm_ipi.h +usr/src/uts/i86pc/io/vmm/vmm_ktr.h +usr/src/uts/i86pc/io/vmm/vmm_lapic.[ch] +usr/src/uts/i86pc/io/vmm/vmm_mem.[ch] +usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c +usr/src/uts/i86pc/io/vmm/vmm_stat.[ch] +usr/src/uts/i86pc/io/vmm/vmm_util.[ch] +usr/src/uts/i86pc/io/vmm/vmx_assym.s +usr/src/uts/i86pc/io/vmm/x86.[ch] +usr/src/uts/i86pc/sys/vmm.h +usr/src/uts/i86pc/sys/vmm_dev.h +usr/src/uts/i86pc/sys/vmm_instruction_emul.h diff --git 
a/exception_lists/cstyle b/exception_lists/cstyle index d320dcfacc..73edc10e88 100644 --- a/exception_lists/cstyle +++ b/exception_lists/cstyle @@ -1326,3 +1326,86 @@ usr/src/uts/intel/sys/acpi/platform/acos2.h usr/src/uts/intel/sys/acpi/platform/acsolaris.h usr/src/uts/intel/sys/acpi/platform/acwin.h usr/src/uts/intel/sys/acpi/platform/acwin64.h + +# bhyve sources +syntax: glob +usr/src/cmd/bhyve/acpi.[ch] +usr/src/cmd/bhyve/ahci.h +usr/src/cmd/bhyve/atkbdc.[ch] +usr/src/cmd/bhyve/bhyvegc.[ch] +usr/src/cmd/bhyve/bhyverun.[ch] +usr/src/cmd/bhyve/block_if.[ch] +usr/src/cmd/bhyve/bootrom.[ch] +usr/src/cmd/bhyve/console.[ch] +usr/src/cmd/bhyve/consport.c +usr/src/cmd/bhyve/dbgport.[ch] +usr/src/cmd/bhyve/fwctl.[ch] +usr/src/cmd/bhyve/gdb.[ch] +usr/src/cmd/bhyve/inout.[ch] +usr/src/cmd/bhyve/ioapic.[ch] +usr/src/cmd/bhyve/iov.[ch] +usr/src/cmd/bhyve/mem.[ch] +usr/src/cmd/bhyve/mevent.[ch] +usr/src/cmd/bhyve/mevent_test.c +usr/src/cmd/bhyve/mptbl.[ch] +usr/src/cmd/bhyve/pci_ahci.c +usr/src/cmd/bhyve/pci_e82545.c +usr/src/cmd/bhyve/pci_emul.[ch] +usr/src/cmd/bhyve/pci_fbuf.c +usr/src/cmd/bhyve/pci_hostbridge.c +usr/src/cmd/bhyve/pci_irq.[ch] +usr/src/cmd/bhyve/pci_lpc.[ch] +usr/src/cmd/bhyve/pci_nvme.c +usr/src/cmd/bhyve/pci_passthru.c +usr/src/cmd/bhyve/pci_uart.c +usr/src/cmd/bhyve/pci_virtio_block.c +usr/src/cmd/bhyve/pci_virtio_console.c +usr/src/cmd/bhyve/pci_virtio_net.c +usr/src/cmd/bhyve/pci_virtio_rnd.c +usr/src/cmd/bhyve/pci_virtio_scsi.c +usr/src/cmd/bhyve/pci_xhci.[ch] +usr/src/cmd/bhyve/pm.c +usr/src/cmd/bhyve/pmtmr.c +usr/src/cmd/bhyve/post.c +usr/src/cmd/bhyve/ps2kbd.[ch] +usr/src/cmd/bhyve/ps2mouse.[ch] +usr/src/cmd/bhyve/rfb.[ch] +usr/src/cmd/bhyve/rtc.[ch] +usr/src/cmd/bhyve/smbiostbl.[ch] +usr/src/cmd/bhyve/sockstream.[ch] +usr/src/cmd/bhyve/spinup_ap.[ch] +usr/src/cmd/bhyve/task_switch.c +usr/src/cmd/bhyve/uart_emul.[ch] +usr/src/cmd/bhyve/usb_emul.[ch] +usr/src/cmd/bhyve/usb_mouse.c +usr/src/cmd/bhyve/vga.[ch] +usr/src/cmd/bhyve/virtio.[ch] +usr/src/cmd/bhyve/xmsr.[ch] +usr/src/cmd/bhyveconsole/bhyveconsole.c +usr/src/cmd/bhyvectl/bhyvectl.c +usr/src/compat/freebsd/*.h +usr/src/compat/freebsd/*/*.h +usr/src/compat/freebsd/amd64/machine/*.h +usr/contrib/freebsd/*/*.h +usr/contrib/freebsd/*/*/*.h +usr/contrib/freebsd/lib/libutil/*.c +usr/src/lib/libvmmapi/common/vmmapi.[ch] +usr/src/uts/i86pc/io/vmm/amd/*.[ch] +usr/src/uts/i86pc/io/vmm/intel/*.[chs] +usr/src/uts/i86pc/io/vmm/io/*.[ch] +usr/src/uts/i86pc/io/vmm/vmm.c +usr/src/uts/i86pc/io/vmm/vmm_host.[ch] +usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +usr/src/uts/i86pc/io/vmm/vmm_ioport.[ch] +usr/src/uts/i86pc/io/vmm/vmm_ipi.h +usr/src/uts/i86pc/io/vmm/vmm_ktr.h +usr/src/uts/i86pc/io/vmm/vmm_lapic.[ch] +usr/src/uts/i86pc/io/vmm/vmm_mem.[ch] +usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +usr/src/uts/i86pc/io/vmm/vmm_stat.[ch] +usr/src/uts/i86pc/io/vmm/vmm_util.[ch] +usr/src/uts/i86pc/io/vmm/vmx_assym.s +usr/src/uts/i86pc/io/vmm/x86.[ch] +usr/src/uts/i86pc/sys/vmm.h +usr/src/uts/i86pc/sys/vmm_dev.h +usr/src/uts/i86pc/sys/vmm_instruction_emul.h diff --git a/exception_lists/hdrchk b/exception_lists/hdrchk index c8edb3e5ae..7fa467f735 100644 --- a/exception_lists/hdrchk +++ b/exception_lists/hdrchk @@ -374,3 +374,53 @@ usr/src/uts/intel/sys/acpi/acresrc.h usr/src/uts/intel/sys/acpi/acstruct.h usr/src/uts/intel/sys/acpi/amlresrc.h usr/src/uts/intel/sys/acpi/platform/acwin64.h + +# bhyve sources +syntax: glob +usr/src/cmd/bhyve/acpi.h +usr/src/cmd/bhyve/ahci.h +usr/src/cmd/bhyve/atkbdc.h +usr/src/cmd/bhyve/bhyvegc.h 
+usr/src/cmd/bhyve/bhyverun.h +usr/src/cmd/bhyve/block_if.h +usr/src/cmd/bhyve/bootrom.h +usr/src/cmd/bhyve/console.h +usr/src/cmd/bhyve/dbgport.h +usr/src/cmd/bhyve/inout.h +usr/src/cmd/bhyve/ioapic.h +usr/src/cmd/bhyve/mem.h +usr/src/cmd/bhyve/mptbl.h +usr/src/cmd/bhyve/pci_emul.h +usr/src/cmd/bhyve/pci_irq.h +usr/src/cmd/bhyve/pci_lpc.h +usr/src/cmd/bhyve/ps2kbd.h +usr/src/cmd/bhyve/ps2mouse.h +usr/src/cmd/bhyve/rfb.h +usr/src/cmd/bhyve/rtc.h +usr/src/cmd/bhyve/smbiostbl.h +usr/src/cmd/bhyve/sockstream.h +usr/src/cmd/bhyve/spinup_ap.h +usr/src/cmd/bhyve/uart_emul.h +usr/src/cmd/bhyve/vga.h +usr/src/cmd/bhyve/virtio.h +usr/src/cmd/bhyve/xmsr.h +usr/src/compat/freebsd/*.h +usr/src/compat/freebsd/*/*.h +usr/src/compat/freebsd/amd64/machine/*.h +usr/contrib/freebsd/*/*.h +usr/contrib/freebsd/*/*/*.h +usr/src/lib/libvmmapi/common/vmmapi.h +usr/src/uts/i86pc/io/vmm/intel/*.h +usr/src/uts/i86pc/io/vmm/io/*.h +usr/src/uts/i86pc/io/vmm/vmm_host.h +usr/src/uts/i86pc/io/vmm/vmm_ioport.h +usr/src/uts/i86pc/io/vmm/vmm_ipi.h +usr/src/uts/i86pc/io/vmm/vmm_ktr.h +usr/src/uts/i86pc/io/vmm/vmm_lapic.h +usr/src/uts/i86pc/io/vmm/vmm_mem.h +usr/src/uts/i86pc/io/vmm/vmm_stat.h +usr/src/uts/i86pc/io/vmm/vmm_util.h +usr/src/uts/i86pc/io/vmm/x86.h +usr/src/uts/i86pc/sys/vmm.h +usr/src/uts/i86pc/sys/vmm_dev.h +usr/src/uts/i86pc/sys/vmm_instruction_emul.h diff --git a/exception_lists/packaging b/exception_lists/packaging index 41ca551cc2..cd1e8ed230 100644 --- a/exception_lists/packaging +++ b/exception_lists/packaging @@ -816,6 +816,12 @@ usr/lib/amd64/libsff.so i386 usr/lib/sparcv9/libsff.so sparc usr/lib/libsff.so +# +# private bhyve files +# +lib/amd64/libvmmapi.so i386 +usr/include/vmmapi.h i386 + # # libcustr is private # diff --git a/exception_lists/wscheck b/exception_lists/wscheck index 489668a350..ac16cc54b2 100644 --- a/exception_lists/wscheck +++ b/exception_lists/wscheck @@ -26,3 +26,84 @@ usr/src/uts/common/io/e1000api/* usr/src/uts/common/io/qede/* usr/src/uts/common/io/i40e/core/* usr/src/uts/common/io/ixgbe/core/* + +# bhyve sources +usr/src/cmd/bhyve/acpi.[ch] +usr/src/cmd/bhyve/ahci.h +usr/src/cmd/bhyve/atkbdc.[ch] +usr/src/cmd/bhyve/bhyvegc.[ch] +usr/src/cmd/bhyve/bhyverun.[ch] +usr/src/cmd/bhyve/block_if.[ch] +usr/src/cmd/bhyve/bootrom.[ch] +usr/src/cmd/bhyve/console.[ch] +usr/src/cmd/bhyve/consport.c +usr/src/cmd/bhyve/dbgport.[ch] +usr/src/cmd/bhyve/fwctl.[ch] +usr/src/cmd/bhyve/gdb.[ch] +usr/src/cmd/bhyve/inout.[ch] +usr/src/cmd/bhyve/ioapic.[ch] +usr/src/cmd/bhyve/mem.[ch] +usr/src/cmd/bhyve/mevent.[ch] +usr/src/cmd/bhyve/mevent_test.c +usr/src/cmd/bhyve/mptbl.[ch] +usr/src/cmd/bhyve/pci_ahci.c +usr/src/cmd/bhyve/pci_e82545.c +usr/src/cmd/bhyve/pci_emul.[ch] +usr/src/cmd/bhyve/pci_fbuf.c +usr/src/cmd/bhyve/pci_hostbridge.c +usr/src/cmd/bhyve/pci_irq.[ch] +usr/src/cmd/bhyve/pci_lpc.[ch] +usr/src/cmd/bhyve/pci_nvme.c +usr/src/cmd/bhyve/pci_passthru.c +usr/src/cmd/bhyve/pci_uart.c +usr/src/cmd/bhyve/pci_virtio_block.c +usr/src/cmd/bhyve/pci_virtio_console.c +usr/src/cmd/bhyve/pci_virtio_net.c +usr/src/cmd/bhyve/pci_virtio_rnd.c +usr/src/cmd/bhyve/pci_virtio_scsi.c +usr/src/cmd/bhyve/pci_xhci.[ch] +usr/src/cmd/bhyve/pm.c +usr/src/cmd/bhyve/pmtmr.c +usr/src/cmd/bhyve/post.c +usr/src/cmd/bhyve/ps2kbd.[ch] +usr/src/cmd/bhyve/ps2mouse.[ch] +usr/src/cmd/bhyve/rfb.[ch] +usr/src/cmd/bhyve/rtc.[ch] +usr/src/cmd/bhyve/smbiostbl.[ch] +usr/src/cmd/bhyve/sockstream.[ch] +usr/src/cmd/bhyve/spinup_ap.[ch] +usr/src/cmd/bhyve/task_switch.c +usr/src/cmd/bhyve/uart_emul.[ch] +usr/src/cmd/bhyve/usb_emul.[ch] 
+usr/src/cmd/bhyve/usb_mouse.c +usr/src/cmd/bhyve/vga.[ch] +usr/src/cmd/bhyve/virtio.[ch] +usr/src/cmd/bhyve/xmsr.[ch] +usr/src/cmd/bhyveconsole/bhyveconsole.c +usr/src/cmd/bhyvectl/bhyvectl.c +usr/src/compat/freebsd/*.h +usr/src/compat/freebsd/*/*.h +usr/src/compat/freebsd/amd64/machine/*.h +usr/contrib/freebsd/*/*.h +usr/contrib/freebsd/*/*/*.h +usr/contrib/freebsd/lib/libutil/*.c +usr/src/lib/libvmmapi/common/vmmapi.[ch] +usr/src/uts/i86pc/io/vmm/amd/*.[ch] +usr/src/uts/i86pc/io/vmm/intel/*.[chs] +usr/src/uts/i86pc/io/vmm/io/*.[ch] +usr/src/uts/i86pc/io/vmm/vmm.c +usr/src/uts/i86pc/io/vmm/vmm_host.[ch] +usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +usr/src/uts/i86pc/io/vmm/vmm_ioport.[ch] +usr/src/uts/i86pc/io/vmm/vmm_ipi.h +usr/src/uts/i86pc/io/vmm/vmm_ktr.h +usr/src/uts/i86pc/io/vmm/vmm_lapic.[ch] +usr/src/uts/i86pc/io/vmm/vmm_mem.[ch] +usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +usr/src/uts/i86pc/io/vmm/vmm_stat.[ch] +usr/src/uts/i86pc/io/vmm/vmm_util.[ch] +usr/src/uts/i86pc/io/vmm/vmx_assym.s +usr/src/uts/i86pc/io/vmm/x86.[ch] +usr/src/uts/i86pc/sys/vmm.h +usr/src/uts/i86pc/sys/vmm_dev.h +usr/src/uts/i86pc/sys/vmm_instruction_emul.h diff --git a/usr/contrib/freebsd/amd64/machine/pmap.h b/usr/contrib/freebsd/amd64/machine/pmap.h new file mode 100644 index 0000000000..a0b8ee37f2 --- /dev/null +++ b/usr/contrib/freebsd/amd64/machine/pmap.h @@ -0,0 +1,455 @@ +/*- + * Copyright (c) 2003 Peter Wemm. + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Derived from hp300 version by Mike Hibler, this version by William + * Jolitz uses a recursive map [a pde points to the page directory] to + * map the page tables using the pagetables themselves. 
This is done to + * reduce the impact on kernel virtual memory for lots of sparse address + * space, and to reduce the cost of memory to each process. + * + * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 + * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 + * $FreeBSD$ + */ + +#ifndef _MACHINE_PMAP_H_ +#define _MACHINE_PMAP_H_ + +/* + * Page-directory and page-table entries follow this format, with a few + * of the fields not present here and there, depending on a lot of things. + */ + /* ---- Intel Nomenclature ---- */ +#define X86_PG_V 0x001 /* P Valid */ +#define X86_PG_RW 0x002 /* R/W Read/Write */ +#define X86_PG_U 0x004 /* U/S User/Supervisor */ +#define X86_PG_NC_PWT 0x008 /* PWT Write through */ +#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */ +#define X86_PG_A 0x020 /* A Accessed */ +#define X86_PG_M 0x040 /* D Dirty */ +#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ +#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */ +#define X86_PG_G 0x100 /* G Global */ +#define X86_PG_AVAIL1 0x200 /* / Available for system */ +#define X86_PG_AVAIL2 0x400 /* < programmers use */ +#define X86_PG_AVAIL3 0x800 /* \ */ +#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */ +#define X86_PG_NX (1ul<<63) /* No-execute */ +#define X86_PG_AVAIL(x) (1ul << (x)) + +/* Page level cache control fields used to determine the PAT type */ +#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) +#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) + +/* + * Intel extended page table (EPT) bit definitions. + */ +#define EPT_PG_READ 0x001 /* R Read */ +#define EPT_PG_WRITE 0x002 /* W Write */ +#define EPT_PG_EXECUTE 0x004 /* X Execute */ +#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */ +#define EPT_PG_PS 0x080 /* PS Page size */ +#define EPT_PG_A 0x100 /* A Accessed */ +#define EPT_PG_M 0x200 /* D Dirty */ +#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */ + +/* + * Define the PG_xx macros in terms of the bits on x86 PTEs. + */ +#define PG_V X86_PG_V +#define PG_RW X86_PG_RW +#define PG_U X86_PG_U +#define PG_NC_PWT X86_PG_NC_PWT +#define PG_NC_PCD X86_PG_NC_PCD +#define PG_A X86_PG_A +#define PG_M X86_PG_M +#define PG_PS X86_PG_PS +#define PG_PTE_PAT X86_PG_PTE_PAT +#define PG_G X86_PG_G +#define PG_AVAIL1 X86_PG_AVAIL1 +#define PG_AVAIL2 X86_PG_AVAIL2 +#define PG_AVAIL3 X86_PG_AVAIL3 +#define PG_PDE_PAT X86_PG_PDE_PAT +#define PG_NX X86_PG_NX +#define PG_PDE_CACHE X86_PG_PDE_CACHE +#define PG_PTE_CACHE X86_PG_PTE_CACHE + +/* Our various interpretations of the above */ +#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */ +#define PG_MANAGED X86_PG_AVAIL2 +#define EPT_PG_EMUL_V X86_PG_AVAIL(52) +#define EPT_PG_EMUL_RW X86_PG_AVAIL(53) +#define PG_PROMOTED X86_PG_AVAIL(54) /* PDE only */ +#define PG_FRAME (0x000ffffffffff000ul) +#define PG_PS_FRAME (0x000fffffffe00000ul) + +/* + * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB + * (PTE) page mappings have identical settings for the following fields: + */ +#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \ + PG_M | PG_A | PG_U | PG_RW | PG_V) + +/* + * Page Protection Exception bits + */ + +#define PGEX_P 0x01 /* Protection violation vs. 
not present */ +#define PGEX_W 0x02 /* during a Write cycle */ +#define PGEX_U 0x04 /* access from User mode (UPL) */ +#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */ +#define PGEX_I 0x10 /* during an instruction fetch */ + +/* + * undef the PG_xx macros that define bits in the regular x86 PTEs that + * have a different position in nested PTEs. This is done when compiling + * code that needs to be aware of the differences between regular x86 and + * nested PTEs. + * + * The appropriate bitmask will be calculated at runtime based on the pmap + * type. + */ +#ifdef AMD64_NPT_AWARE +#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */ +#undef PG_G +#undef PG_A +#undef PG_M +#undef PG_PDE_PAT +#undef PG_PDE_CACHE +#undef PG_PTE_PAT +#undef PG_PTE_CACHE +#undef PG_RW +#undef PG_V +#endif + +/* + * Pte related macros. This is complicated by having to deal with + * the sign extension of the 48th bit. + */ +#define KVADDR(l4, l3, l2, l1) ( \ + ((unsigned long)-1 << 47) | \ + ((unsigned long)(l4) << PML4SHIFT) | \ + ((unsigned long)(l3) << PDPSHIFT) | \ + ((unsigned long)(l2) << PDRSHIFT) | \ + ((unsigned long)(l1) << PAGE_SHIFT)) + +#define UVADDR(l4, l3, l2, l1) ( \ + ((unsigned long)(l4) << PML4SHIFT) | \ + ((unsigned long)(l3) << PDPSHIFT) | \ + ((unsigned long)(l2) << PDRSHIFT) | \ + ((unsigned long)(l1) << PAGE_SHIFT)) + +/* + * Number of kernel PML4 slots. Can be anywhere from 1 to 64 or so, + * but setting it larger than NDMPML4E makes no sense. + * + * Each slot provides .5 TB of kernel virtual space. + */ +#define NKPML4E 4 + +#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */ +#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */ +#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */ + +/* + * NDMPML4E is the maximum number of PML4 entries that will be + * used to implement the direct map. It must be a power of two, + * and should generally exceed NKPML4E. The maximum possible + * value is 64; using 128 will make the direct map intrude into + * the recursive page table map. + */ +#define NDMPML4E 8 + +/* + * These values control the layout of virtual memory. The starting address + * of the direct map, which is controlled by DMPML4I, must be a multiple of + * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.) + * + * Note: KPML4I is the index of the (single) level 4 page that maps + * the KVA that holds KERNBASE, while KPML4BASE is the index of the + * first level 4 page that maps VM_MIN_KERNEL_ADDRESS. If NKPML4E + * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra + * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to + * KERNBASE. + * + * (KPML4I combines with KPDPI to choose where KERNBASE starts. + * Or, in other words, KPML4I provides bits 39..47 of KERNBASE, + * and KPDPI provides bits 30..38.) + */ +#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */ + +#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */ +#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */ + +#define KPML4I (NPML4EPG-1) +#define KPDPI (NPDPEPG-2) /* kernbase at -2GB */ + +/* + * XXX doesn't really belong here I guess... 
+ */ +#define ISA_HOLE_START 0xa0000 +#define ISA_HOLE_LENGTH (0x100000-ISA_HOLE_START) + +#define PMAP_PCID_NONE 0xffffffff +#define PMAP_PCID_KERN 0 +#define PMAP_PCID_OVERMAX 0x1000 + +#ifndef LOCORE + +#include +#include +#include +#include + +#include + +typedef u_int64_t pd_entry_t; +typedef u_int64_t pt_entry_t; +typedef u_int64_t pdp_entry_t; +typedef u_int64_t pml4_entry_t; + +/* + * Address of current address space page table maps and directories. + */ +#ifdef _KERNEL +#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0)) +#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0)) +#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0)) +#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)) +#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t))) +#define PTmap ((pt_entry_t *)(addr_PTmap)) +#define PDmap ((pd_entry_t *)(addr_PDmap)) +#define PDPmap ((pd_entry_t *)(addr_PDPmap)) +#define PML4map ((pd_entry_t *)(addr_PML4map)) +#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e)) + +extern int nkpt; /* Initial number of kernel page tables */ +extern u_int64_t KPDPphys; /* physical address of kernel level 3 */ +extern u_int64_t KPML4phys; /* physical address of kernel level 4 */ + +/* + * virtual address to page table entry and + * to physical address. + * Note: these work recursively, thus vtopte of a pte will give + * the corresponding pde that in turn maps it. + */ +pt_entry_t *vtopte(vm_offset_t); +#define vtophys(va) pmap_kextract(((vm_offset_t) (va))) + +#define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) +#define pte_load_clear(ptep) atomic_swap_long(ptep, 0) +#define pte_store(ptep, pte) do { \ + *(u_long *)(ptep) = (u_long)(pte); \ +} while (0) +#define pte_clear(ptep) pte_store(ptep, 0) + +#define pde_store(pdep, pde) pte_store(pdep, pde) + +extern pt_entry_t pg_nx; + +#endif /* _KERNEL */ + +/* + * Pmap stuff + */ +struct pv_entry; +struct pv_chunk; + +/* + * Locks + * (p) PV list lock + */ +struct md_page { + TAILQ_HEAD(, pv_entry) pv_list; /* (p) */ + int pv_gen; /* (p) */ + int pat_mode; +}; + +enum pmap_type { + PT_X86, /* regular x86 page tables */ + PT_EPT, /* Intel's nested page tables */ + PT_RVI, /* AMD's nested page tables */ +}; + +struct pmap_pcids { + uint32_t pm_pcid; + uint32_t pm_gen; +}; + +/* + * The kernel virtual address (KVA) of the level 4 page table page is always + * within the direct map (DMAP) region. 
+ */ +struct pmap { + struct mtx pm_mtx; + pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ + uint64_t pm_cr3; + TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ + cpuset_t pm_active; /* active on cpus */ + enum pmap_type pm_type; /* regular or nested tables */ + struct pmap_statistics pm_stats; /* pmap statistics */ + struct vm_radix pm_root; /* spare page table pages */ + long pm_eptgen; /* EPT pmap generation id */ + int pm_flags; + struct pmap_pcids pm_pcids[MAXCPU]; +}; + +/* flags */ +#define PMAP_NESTED_IPIMASK 0xff +#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ +#define PMAP_EMULATE_AD_BITS (1 << 9) /* needs A/D bits emulation */ +#define PMAP_SUPPORTS_EXEC_ONLY (1 << 10) /* execute only mappings ok */ + +typedef struct pmap *pmap_t; + +#ifdef _KERNEL +extern struct pmap kernel_pmap_store; +#define kernel_pmap (&kernel_pmap_store) + +#define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx) +#define PMAP_LOCK_ASSERT(pmap, type) \ + mtx_assert(&(pmap)->pm_mtx, (type)) +#define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) +#define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \ + NULL, MTX_DEF | MTX_DUPOK) +#define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) +#define PMAP_MTX(pmap) (&(pmap)->pm_mtx) +#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) +#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) + +int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags); +int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); +#endif + +/* + * For each vm_page_t, there is a list of all currently valid virtual + * mappings of that page. An entry is a pv_entry_t, the list is pv_list. + */ +typedef struct pv_entry { + vm_offset_t pv_va; /* virtual address for mapping */ + TAILQ_ENTRY(pv_entry) pv_next; +} *pv_entry_t; + +/* + * pv_entries are allocated in chunks per-process. This avoids the + * need to track per-pmap assignments. 
+ */ +#define _NPCM 3 +#define _NPCPV 168 +struct pv_chunk { + pmap_t pc_pmap; + TAILQ_ENTRY(pv_chunk) pc_list; + uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ + TAILQ_ENTRY(pv_chunk) pc_lru; + struct pv_entry pc_pventry[_NPCPV]; +}; + +#ifdef _KERNEL + +extern caddr_t CADDR1; +extern pt_entry_t *CMAP1; +extern vm_paddr_t phys_avail[]; +extern vm_paddr_t dump_avail[]; +extern vm_offset_t virtual_avail; +extern vm_offset_t virtual_end; +extern vm_paddr_t dmaplimit; +extern int pmap_pcid_enabled; +extern int invpcid_works; + +#define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) +#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) +#define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) + +struct thread; + +void pmap_activate_sw(struct thread *); +void pmap_bootstrap(vm_paddr_t *); +int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde); +int pmap_change_attr(vm_offset_t, vm_size_t, int); +void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate); +void pmap_init_pat(void); +void pmap_kenter(vm_offset_t va, vm_paddr_t pa); +void *pmap_kenter_temporary(vm_paddr_t pa, int i); +vm_paddr_t pmap_kextract(vm_offset_t); +void pmap_kremove(vm_offset_t); +void *pmap_mapbios(vm_paddr_t, vm_size_t); +void *pmap_mapdev(vm_paddr_t, vm_size_t); +void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); +boolean_t pmap_page_is_mapped(vm_page_t m); +void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma); +void pmap_pinit_pml4(vm_page_t); +void pmap_unmapdev(vm_offset_t, vm_size_t); +void pmap_invalidate_page(pmap_t, vm_offset_t); +void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); +void pmap_invalidate_all(pmap_t); +void pmap_invalidate_cache(void); +void pmap_invalidate_cache_pages(vm_page_t *pages, int count); +void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, + boolean_t force); +void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); +boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +#endif /* _KERNEL */ + +/* Return various clipped indexes for a given VA */ +static __inline vm_pindex_t +pmap_pte_index(vm_offset_t va) +{ + + return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pde_index(vm_offset_t va) +{ + + return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pdpe_index(vm_offset_t va) +{ + + return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pml4e_index(vm_offset_t va) +{ + + return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); +} + +#endif /* !LOCORE */ + +#endif /* !_MACHINE_PMAP_H_ */ diff --git a/usr/contrib/freebsd/amd64/machine/specialreg.h b/usr/contrib/freebsd/amd64/machine/specialreg.h deleted file mode 100644 index 41d4125cb9..0000000000 --- a/usr/contrib/freebsd/amd64/machine/specialreg.h +++ /dev/null @@ -1,6 +0,0 @@ -/*- - * This file is in the public domain. - */ -/* $FreeBSD: head/sys/amd64/include/specialreg.h 233207 2012-03-19 21:34:11Z tijl $ */ - -#include diff --git a/usr/contrib/freebsd/dev/io/iodev.h b/usr/contrib/freebsd/dev/io/iodev.h new file mode 100644 index 0000000000..d040fcccf4 --- /dev/null +++ b/usr/contrib/freebsd/dev/io/iodev.h @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2010 Marcel Moolenaar + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _DEV_IODEV_H_ +#define _DEV_IODEV_H_ + +#define IODEV_PIO_READ 0 +#define IODEV_PIO_WRITE 1 + +struct iodev_pio_req { + u_int access; + u_int port; + u_int width; + u_int val; +}; + +#define IODEV_PIO _IOWR('I', 0, struct iodev_pio_req) + +#endif /* _DEV_IODEV_H_ */ diff --git a/usr/contrib/freebsd/dev/mii/mii.h b/usr/contrib/freebsd/dev/mii/mii.h new file mode 100644 index 0000000000..fa1ec84eaa --- /dev/null +++ b/usr/contrib/freebsd/dev/mii/mii.h @@ -0,0 +1,239 @@ +/* $NetBSD: mii.h,v 1.18 2014/06/16 14:43:22 msaitoh Exp $ */ + +/*- + * Copyright (c) 1997 Manuel Bouyer. All rights reserved. + * + * Modification to match BSD/OS 3.0 MII interface by Jason R. Thorpe, + * Numerical Aerospace Simulation Facility, NASA Ames Research Center. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _DEV_MII_MII_H_ +#define _DEV_MII_MII_H_ + +/* + * Registers common to all PHYs. 
+ */ + +#define MII_NPHY 32 /* max # of PHYs per MII */ + +/* + * MII commands, used if a device must drive the MII lines + * manually. + */ +#define MII_COMMAND_START 0x01 +#define MII_COMMAND_READ 0x02 +#define MII_COMMAND_WRITE 0x01 +#define MII_COMMAND_ACK 0x02 + +#define MII_BMCR 0x00 /* Basic mode control register (rw) */ +#define BMCR_RESET 0x8000 /* reset */ +#define BMCR_LOOP 0x4000 /* loopback */ +#define BMCR_SPEED0 0x2000 /* speed selection (LSB) */ +#define BMCR_AUTOEN 0x1000 /* autonegotiation enable */ +#define BMCR_PDOWN 0x0800 /* power down */ +#define BMCR_ISO 0x0400 /* isolate */ +#define BMCR_STARTNEG 0x0200 /* restart autonegotiation */ +#define BMCR_FDX 0x0100 /* Set duplex mode */ +#define BMCR_CTEST 0x0080 /* collision test */ +#define BMCR_SPEED1 0x0040 /* speed selection (MSB) */ + +#define BMCR_S10 0x0000 /* 10 Mb/s */ +#define BMCR_S100 BMCR_SPEED0 /* 100 Mb/s */ +#define BMCR_S1000 BMCR_SPEED1 /* 1000 Mb/s */ + +#define BMCR_SPEED(x) ((x) & (BMCR_SPEED0|BMCR_SPEED1)) + +#define MII_BMSR 0x01 /* Basic mode status register (ro) */ +#define BMSR_100T4 0x8000 /* 100 base T4 capable */ +#define BMSR_100TXFDX 0x4000 /* 100 base Tx full duplex capable */ +#define BMSR_100TXHDX 0x2000 /* 100 base Tx half duplex capable */ +#define BMSR_10TFDX 0x1000 /* 10 base T full duplex capable */ +#define BMSR_10THDX 0x0800 /* 10 base T half duplex capable */ +#define BMSR_100T2FDX 0x0400 /* 100 base T2 full duplex capable */ +#define BMSR_100T2HDX 0x0200 /* 100 base T2 half duplex capable */ +#define BMSR_EXTSTAT 0x0100 /* Extended status in register 15 */ +#define BMSR_MFPS 0x0040 /* MII Frame Preamble Suppression */ +#define BMSR_ACOMP 0x0020 /* Autonegotiation complete */ +#define BMSR_RFAULT 0x0010 /* Link partner fault */ +#define BMSR_ANEG 0x0008 /* Autonegotiation capable */ +#define BMSR_LINK 0x0004 /* Link status */ +#define BMSR_JABBER 0x0002 /* Jabber detected */ +#define BMSR_EXTCAP 0x0001 /* Extended capability */ + +#define BMSR_DEFCAPMASK 0xffffffff + +/* + * Note that the EXTSTAT bit indicates that there is extended status + * info available in register 15, but 802.3 section 22.2.4.3 also + * states that all 1000 Mb/s capable PHYs will set this bit to 1. + */ + +#define BMSR_MEDIAMASK (BMSR_100T4|BMSR_100TXFDX|BMSR_100TXHDX| \ + BMSR_10TFDX|BMSR_10THDX|BMSR_100T2FDX|BMSR_100T2HDX) + +/* + * Convert BMSR media capabilities to ANAR bits for autonegotiation. + * Note the shift chopps off the BMSR_ANEG bit. 
+ */
+#define BMSR_MEDIA_TO_ANAR(x) (((x) & BMSR_MEDIAMASK) >> 6)
+
+#define MII_PHYIDR1 0x02 /* ID register 1 (ro) */
+
+#define MII_PHYIDR2 0x03 /* ID register 2 (ro) */
+#define IDR2_OUILSB 0xfc00 /* OUI LSB */
+#define IDR2_MODEL 0x03f0 /* vendor model */
+#define IDR2_REV 0x000f /* vendor revision */
+
+#define MII_ANAR 0x04 /* Autonegotiation advertisement (rw) */
+ /* section 28.2.4.1 and 37.2.6.1 */
+#define ANAR_NP 0x8000 /* Next page (ro) */
+#define ANAR_ACK 0x4000 /* link partner abilities acknowledged (ro) */
+#define ANAR_RF 0x2000 /* remote fault (ro) */
+ /* Annex 28B.2 */
+#define ANAR_FC 0x0400 /* local device supports PAUSE */
+#define ANAR_T4 0x0200 /* local device supports 100bT4 */
+#define ANAR_TX_FD 0x0100 /* local device supports 100bTx FD */
+#define ANAR_TX 0x0080 /* local device supports 100bTx */
+#define ANAR_10_FD 0x0040 /* local device supports 10bT FD */
+#define ANAR_10 0x0020 /* local device supports 10bT */
+#define ANAR_CSMA 0x0001 /* protocol selector CSMA/CD */
+#define ANAR_PAUSE_NONE (0 << 10)
+#define ANAR_PAUSE_SYM (1 << 10)
+#define ANAR_PAUSE_ASYM (2 << 10)
+#define ANAR_PAUSE_TOWARDS (3 << 10)
+
+ /* Annex 28D */
+#define ANAR_X_FD 0x0020 /* local device supports 1000BASE-X FD */
+#define ANAR_X_HD 0x0040 /* local device supports 1000BASE-X HD */
+#define ANAR_X_PAUSE_NONE (0 << 7)
+#define ANAR_X_PAUSE_SYM (1 << 7)
+#define ANAR_X_PAUSE_ASYM (2 << 7)
+#define ANAR_X_PAUSE_TOWARDS (3 << 7)
+
+#define MII_ANLPAR 0x05 /* Autonegotiation link partner abilities (rw) */
+ /* section 28.2.4.1 and 37.2.6.1 */
+#define ANLPAR_NP 0x8000 /* Next page (ro) */
+#define ANLPAR_ACK 0x4000 /* link partner accepted ACK (ro) */
+#define ANLPAR_RF 0x2000 /* remote fault (ro) */
+#define ANLPAR_FC 0x0400 /* link partner supports PAUSE */
+#define ANLPAR_T4 0x0200 /* link partner supports 100bT4 */
+#define ANLPAR_TX_FD 0x0100 /* link partner supports 100bTx FD */
+#define ANLPAR_TX 0x0080 /* link partner supports 100bTx */
+#define ANLPAR_10_FD 0x0040 /* link partner supports 10bT FD */
+#define ANLPAR_10 0x0020 /* link partner supports 10bT */
+#define ANLPAR_CSMA 0x0001 /* protocol selector CSMA/CD */
+#define ANLPAR_PAUSE_MASK (3 << 10)
+#define ANLPAR_PAUSE_NONE (0 << 10)
+#define ANLPAR_PAUSE_SYM (1 << 10)
+#define ANLPAR_PAUSE_ASYM (2 << 10)
+#define ANLPAR_PAUSE_TOWARDS (3 << 10)
+
+#define ANLPAR_X_FD 0x0020 /* link partner supports 1000BASE-X FD */
+#define ANLPAR_X_HD 0x0040 /* link partner supports 1000BASE-X HD */
+#define ANLPAR_X_PAUSE_MASK (3 << 7)
+#define ANLPAR_X_PAUSE_NONE (0 << 7)
+#define ANLPAR_X_PAUSE_SYM (1 << 7)
+#define ANLPAR_X_PAUSE_ASYM (2 << 7)
+#define ANLPAR_X_PAUSE_TOWARDS (3 << 7)
+
+#define MII_ANER 0x06 /* Autonegotiation expansion (ro) */
+ /* section 28.2.4.1 and 37.2.6.1 */
+#define ANER_MLF 0x0010 /* multiple link detection fault */
+#define ANER_LPNP 0x0008 /* link partner next page-able */
+#define ANER_NP 0x0004 /* next page-able */
+#define ANER_PAGE_RX 0x0002 /* Page received */
+#define ANER_LPAN 0x0001 /* link partner autoneg-able */
+
+#define MII_ANNP 0x07 /* Autonegotiation next page */
+ /* section 28.2.4.1 and 37.2.6.1 */
+
+#define MII_ANLPRNP 0x08 /* Autonegotiation link partner rx next page */
+ /* section 32.5.1 and 37.2.6.1 */
+
+ /* This is also the 1000baseT control register */
+#define MII_100T2CR 0x09 /* 100base-T2 control register */
+#define GTCR_TEST_MASK 0xe000 /* see 802.3ab ss. 40.6.1.1.2 */
+#define GTCR_MAN_MS 0x1000 /* enable manual master/slave control */
+#define GTCR_ADV_MS 0x0800 /* 1 = adv.
master, 0 = adv. slave */ +#define GTCR_PORT_TYPE 0x0400 /* 1 = DCE, 0 = DTE (NIC) */ +#define GTCR_ADV_1000TFDX 0x0200 /* adv. 1000baseT FDX */ +#define GTCR_ADV_1000THDX 0x0100 /* adv. 1000baseT HDX */ + + /* This is also the 1000baseT status register */ +#define MII_100T2SR 0x0a /* 100base-T2 status register */ +#define GTSR_MAN_MS_FLT 0x8000 /* master/slave config fault */ +#define GTSR_MS_RES 0x4000 /* result: 1 = master, 0 = slave */ +#define GTSR_LRS 0x2000 /* local rx status, 1 = ok */ +#define GTSR_RRS 0x1000 /* remote rx status, 1 = ok */ +#define GTSR_LP_1000TFDX 0x0800 /* link partner 1000baseT FDX capable */ +#define GTSR_LP_1000THDX 0x0400 /* link partner 1000baseT HDX capable */ +#define GTSR_LP_ASM_DIR 0x0200 /* link partner asym. pause dir. capable */ +#define GTSR_IDLE_ERR 0x00ff /* IDLE error count */ + +#define MII_PSECR 0x0b /* PSE control register */ +#define PSECR_PACTLMASK 0x000c /* pair control mask */ +#define PSECR_PSEENMASK 0x0003 /* PSE enable mask */ +#define PSECR_PINOUTB 0x0008 /* PSE pinout Alternative B */ +#define PSECR_PINOUTA 0x0004 /* PSE pinout Alternative A */ +#define PSECR_FOPOWTST 0x0002 /* Force Power Test Mode */ +#define PSECR_PSEEN 0x0001 /* PSE Enabled */ +#define PSECR_PSEDIS 0x0000 /* PSE Disabled */ + +#define MII_PSESR 0x0c /* PSE status register */ +#define PSESR_PWRDENIED 0x1000 /* Power Denied */ +#define PSESR_VALSIG 0x0800 /* Valid PD signature detected */ +#define PSESR_INVALSIG 0x0400 /* Invalid PD signature detected */ +#define PSESR_SHORTCIRC 0x0200 /* Short circuit condition detected */ +#define PSESR_OVERLOAD 0x0100 /* Overload condition detected */ +#define PSESR_MPSABSENT 0x0080 /* MPS absent condition detected */ +#define PSESR_PDCLMASK 0x0070 /* PD Class mask */ +#define PSESR_STATMASK 0x000e /* PSE Status mask */ +#define PSESR_PAIRCTABL 0x0001 /* PAIR Control Ability */ +#define PSESR_PDCL_4 (4 << 4) /* Class 4 */ +#define PSESR_PDCL_3 (3 << 4) /* Class 3 */ +#define PSESR_PDCL_2 (2 << 4) /* Class 2 */ +#define PSESR_PDCL_1 (1 << 4) /* Class 1 */ +#define PSESR_PDCL_0 (0 << 4) /* Class 0 */ + +#define MII_MMDACR 0x0d /* MMD access control register */ +#define MMDACR_FUNCMASK 0xc000 /* function */ +#define MMDACR_DADDRMASK 0x001f /* device address */ +#define MMDACR_FN_ADDRESS (0 << 14) /* address */ +#define MMDACR_FN_DATANPI (1 << 14) /* data, no post increment */ +#define MMDACR_FN_DATAPIRW (2 << 14) /* data, post increment on r/w */ +#define MMDACR_FN_DATAPIW (3 << 14) /* data, post increment on wr only */ + +#define MII_MMDAADR 0x0e /* MMD access address data register */ + +#define MII_EXTSR 0x0f /* Extended status register */ +#define EXTSR_1000XFDX 0x8000 /* 1000X full-duplex capable */ +#define EXTSR_1000XHDX 0x4000 /* 1000X half-duplex capable */ +#define EXTSR_1000TFDX 0x2000 /* 1000T full-duplex capable */ +#define EXTSR_1000THDX 0x1000 /* 1000T half-duplex capable */ + +#define EXTSR_MEDIAMASK (EXTSR_1000XFDX|EXTSR_1000XHDX| \ + EXTSR_1000TFDX|EXTSR_1000THDX) + +#endif /* _DEV_MII_MII_H_ */ diff --git a/usr/contrib/freebsd/dev/nvme/nvme.h b/usr/contrib/freebsd/dev/nvme/nvme.h new file mode 100644 index 0000000000..c7f6496426 --- /dev/null +++ b/usr/contrib/freebsd/dev/nvme/nvme.h @@ -0,0 +1,1511 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (C) 2012-2013 Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef __NVME_H__
+#define __NVME_H__
+
+#ifdef _KERNEL
+#include
+#endif
+
+#include
+#include
+
+#define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command)
+#define NVME_RESET_CONTROLLER _IO('n', 1)
+
+#define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test)
+#define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test)
+
+/*
+ * Macros to deal with NVME revisions, as defined in the VS register
+ */
+#define NVME_REV(x, y) (((x) << 16) | ((y) << 8))
+#define NVME_MAJOR(r) (((r) >> 16) & 0xffff)
+#define NVME_MINOR(r) (((r) >> 8) & 0xff)
+
+/*
+ * Used to mark a command to apply to all namespaces, or to retrieve global
+ * log pages.
+ */
+#define NVME_GLOBAL_NAMESPACE_TAG ((uint32_t)0xFFFFFFFF)
+
+/* Cap nvme to 1MB transfers; the driver explodes with larger sizes */
+#define NVME_MAX_XFER_SIZE (MAXPHYS < (1<<20) ?
MAXPHYS : (1<<20))
+
+/* Register field definitions */
+#define NVME_CAP_LO_REG_MQES_SHIFT (0)
+#define NVME_CAP_LO_REG_MQES_MASK (0xFFFF)
+#define NVME_CAP_LO_REG_CQR_SHIFT (16)
+#define NVME_CAP_LO_REG_CQR_MASK (0x1)
+#define NVME_CAP_LO_REG_AMS_SHIFT (17)
+#define NVME_CAP_LO_REG_AMS_MASK (0x3)
+#define NVME_CAP_LO_REG_TO_SHIFT (24)
+#define NVME_CAP_LO_REG_TO_MASK (0xFF)
+
+#define NVME_CAP_HI_REG_DSTRD_SHIFT (0)
+#define NVME_CAP_HI_REG_DSTRD_MASK (0xF)
+#define NVME_CAP_HI_REG_CSS_NVM_SHIFT (5)
+#define NVME_CAP_HI_REG_CSS_NVM_MASK (0x1)
+#define NVME_CAP_HI_REG_MPSMIN_SHIFT (16)
+#define NVME_CAP_HI_REG_MPSMIN_MASK (0xF)
+#define NVME_CAP_HI_REG_MPSMAX_SHIFT (20)
+#define NVME_CAP_HI_REG_MPSMAX_MASK (0xF)
+
+#define NVME_CC_REG_EN_SHIFT (0)
+#define NVME_CC_REG_EN_MASK (0x1)
+#define NVME_CC_REG_CSS_SHIFT (4)
+#define NVME_CC_REG_CSS_MASK (0x7)
+#define NVME_CC_REG_MPS_SHIFT (7)
+#define NVME_CC_REG_MPS_MASK (0xF)
+#define NVME_CC_REG_AMS_SHIFT (11)
+#define NVME_CC_REG_AMS_MASK (0x7)
+#define NVME_CC_REG_SHN_SHIFT (14)
+#define NVME_CC_REG_SHN_MASK (0x3)
+#define NVME_CC_REG_IOSQES_SHIFT (16)
+#define NVME_CC_REG_IOSQES_MASK (0xF)
+#define NVME_CC_REG_IOCQES_SHIFT (20)
+#define NVME_CC_REG_IOCQES_MASK (0xF)
+
+#define NVME_CSTS_REG_RDY_SHIFT (0)
+#define NVME_CSTS_REG_RDY_MASK (0x1)
+#define NVME_CSTS_REG_CFS_SHIFT (1)
+#define NVME_CSTS_REG_CFS_MASK (0x1)
+#define NVME_CSTS_REG_SHST_SHIFT (2)
+#define NVME_CSTS_REG_SHST_MASK (0x3)
+
+#define NVME_CSTS_GET_SHST(csts) (((csts) >> NVME_CSTS_REG_SHST_SHIFT) & NVME_CSTS_REG_SHST_MASK)
+
+#define NVME_AQA_REG_ASQS_SHIFT (0)
+#define NVME_AQA_REG_ASQS_MASK (0xFFF)
+#define NVME_AQA_REG_ACQS_SHIFT (16)
+#define NVME_AQA_REG_ACQS_MASK (0xFFF)
+
+/* Command field definitions */
+
+#define NVME_CMD_FUSE_SHIFT (8)
+#define NVME_CMD_FUSE_MASK (0x3)
+
+#define NVME_STATUS_P_SHIFT (0)
+#define NVME_STATUS_P_MASK (0x1)
+#define NVME_STATUS_SC_SHIFT (1)
+#define NVME_STATUS_SC_MASK (0xFF)
+#define NVME_STATUS_SCT_SHIFT (9)
+#define NVME_STATUS_SCT_MASK (0x7)
+#define NVME_STATUS_M_SHIFT (14)
+#define NVME_STATUS_M_MASK (0x1)
+#define NVME_STATUS_DNR_SHIFT (15)
+#define NVME_STATUS_DNR_MASK (0x1)
+
+#define NVME_STATUS_GET_P(st) (((st) >> NVME_STATUS_P_SHIFT) & NVME_STATUS_P_MASK)
+#define NVME_STATUS_GET_SC(st) (((st) >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK)
+#define NVME_STATUS_GET_SCT(st) (((st) >> NVME_STATUS_SCT_SHIFT) & NVME_STATUS_SCT_MASK)
+#define NVME_STATUS_GET_M(st) (((st) >> NVME_STATUS_M_SHIFT) & NVME_STATUS_M_MASK)
+#define NVME_STATUS_GET_DNR(st) (((st) >> NVME_STATUS_DNR_SHIFT) & NVME_STATUS_DNR_MASK)
+
+#define NVME_PWR_ST_MPS_SHIFT (0)
+#define NVME_PWR_ST_MPS_MASK (0x1)
+#define NVME_PWR_ST_NOPS_SHIFT (1)
+#define NVME_PWR_ST_NOPS_MASK (0x1)
+#define NVME_PWR_ST_RRT_SHIFT (0)
+#define NVME_PWR_ST_RRT_MASK (0x1F)
+#define NVME_PWR_ST_RRL_SHIFT (0)
+#define NVME_PWR_ST_RRL_MASK (0x1F)
+#define NVME_PWR_ST_RWT_SHIFT (0)
+#define NVME_PWR_ST_RWT_MASK (0x1F)
+#define NVME_PWR_ST_RWL_SHIFT (0)
+#define NVME_PWR_ST_RWL_MASK (0x1F)
+#define NVME_PWR_ST_IPS_SHIFT (6)
+#define NVME_PWR_ST_IPS_MASK (0x3)
+#define NVME_PWR_ST_APW_SHIFT (0)
+#define NVME_PWR_ST_APW_MASK (0x7)
+#define NVME_PWR_ST_APS_SHIFT (6)
+#define NVME_PWR_ST_APS_MASK (0x3)
+
+/** Controller Multi-path I/O and Namespace Sharing Capabilities */
+/* More than one port */
+#define NVME_CTRLR_DATA_MIC_MPORTS_SHIFT (0)
+#define NVME_CTRLR_DATA_MIC_MPORTS_MASK (0x1)
+/* More than one controller */
+#define NVME_CTRLR_DATA_MIC_MCTRLRS_SHIFT (1)
+#define
NVME_CTRLR_DATA_MIC_MCTRLRS_MASK (0x1) +/* SR-IOV Virtual Function */ +#define NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT (2) +#define NVME_CTRLR_DATA_MIC_SRIOVVF_MASK (0x1) + +/** OACS - optional admin command support */ +/* supports security send/receive commands */ +#define NVME_CTRLR_DATA_OACS_SECURITY_SHIFT (0) +#define NVME_CTRLR_DATA_OACS_SECURITY_MASK (0x1) +/* supports format nvm command */ +#define NVME_CTRLR_DATA_OACS_FORMAT_SHIFT (1) +#define NVME_CTRLR_DATA_OACS_FORMAT_MASK (0x1) +/* supports firmware activate/download commands */ +#define NVME_CTRLR_DATA_OACS_FIRMWARE_SHIFT (2) +#define NVME_CTRLR_DATA_OACS_FIRMWARE_MASK (0x1) +/* supports namespace management commands */ +#define NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT (3) +#define NVME_CTRLR_DATA_OACS_NSMGMT_MASK (0x1) +/* supports Device Self-test command */ +#define NVME_CTRLR_DATA_OACS_SELFTEST_SHIFT (4) +#define NVME_CTRLR_DATA_OACS_SELFTEST_MASK (0x1) +/* supports Directives */ +#define NVME_CTRLR_DATA_OACS_DIRECTIVES_SHIFT (5) +#define NVME_CTRLR_DATA_OACS_DIRECTIVES_MASK (0x1) +/* supports NVMe-MI Send/Receive */ +#define NVME_CTRLR_DATA_OACS_NVMEMI_SHIFT (6) +#define NVME_CTRLR_DATA_OACS_NVMEMI_MASK (0x1) +/* supports Virtualization Management */ +#define NVME_CTRLR_DATA_OACS_VM_SHIFT (7) +#define NVME_CTRLR_DATA_OACS_VM_MASK (0x1) +/* supports Doorbell Buffer Config */ +#define NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT (8) +#define NVME_CTRLR_DATA_OACS_DBBUFFER_MASK (0x1) + +/** firmware updates */ +/* first slot is read-only */ +#define NVME_CTRLR_DATA_FRMW_SLOT1_RO_SHIFT (0) +#define NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK (0x1) +/* number of firmware slots */ +#define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT (1) +#define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK (0x7) + +/** log page attributes */ +/* per namespace smart/health log page */ +#define NVME_CTRLR_DATA_LPA_NS_SMART_SHIFT (0) +#define NVME_CTRLR_DATA_LPA_NS_SMART_MASK (0x1) + +/** AVSCC - admin vendor specific command configuration */ +/* admin vendor specific commands use spec format */ +#define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_SHIFT (0) +#define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_MASK (0x1) + +/** Autonomous Power State Transition Attributes */ +/* Autonomous Power State Transitions supported */ +#define NVME_CTRLR_DATA_APSTA_APST_SUPP_SHIFT (0) +#define NVME_CTRLR_DATA_APSTA_APST_SUPP_MASK (0x1) + +/** submission queue entry size */ +#define NVME_CTRLR_DATA_SQES_MIN_SHIFT (0) +#define NVME_CTRLR_DATA_SQES_MIN_MASK (0xF) +#define NVME_CTRLR_DATA_SQES_MAX_SHIFT (4) +#define NVME_CTRLR_DATA_SQES_MAX_MASK (0xF) + +/** completion queue entry size */ +#define NVME_CTRLR_DATA_CQES_MIN_SHIFT (0) +#define NVME_CTRLR_DATA_CQES_MIN_MASK (0xF) +#define NVME_CTRLR_DATA_CQES_MAX_SHIFT (4) +#define NVME_CTRLR_DATA_CQES_MAX_MASK (0xF) + +/** optional nvm command support */ +#define NVME_CTRLR_DATA_ONCS_COMPARE_SHIFT (0) +#define NVME_CTRLR_DATA_ONCS_COMPARE_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_WRITE_UNC_SHIFT (1) +#define NVME_CTRLR_DATA_ONCS_WRITE_UNC_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_DSM_SHIFT (2) +#define NVME_CTRLR_DATA_ONCS_DSM_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_WRZERO_SHIFT (3) +#define NVME_CTRLR_DATA_ONCS_WRZERO_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_SAVEFEAT_SHIFT (4) +#define NVME_CTRLR_DATA_ONCS_SAVEFEAT_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_RESERV_SHIFT (5) +#define NVME_CTRLR_DATA_ONCS_RESERV_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT (6) +#define NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK (0x1) + +/** Fused Operation Support */ +#define 
NVME_CTRLR_DATA_FUSES_CNW_SHIFT (0) +#define NVME_CTRLR_DATA_FUSES_CNW_MASK (0x1) + +/** Format NVM Attributes */ +#define NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT (0) +#define NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK (0x1) +#define NVME_CTRLR_DATA_FNA_ERASE_ALL_SHIFT (1) +#define NVME_CTRLR_DATA_FNA_ERASE_ALL_MASK (0x1) +#define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_SHIFT (2) +#define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK (0x1) + +/** volatile write cache */ +#define NVME_CTRLR_DATA_VWC_PRESENT_SHIFT (0) +#define NVME_CTRLR_DATA_VWC_PRESENT_MASK (0x1) + +/** namespace features */ +/* thin provisioning */ +#define NVME_NS_DATA_NSFEAT_THIN_PROV_SHIFT (0) +#define NVME_NS_DATA_NSFEAT_THIN_PROV_MASK (0x1) +/* NAWUN, NAWUPF, and NACWU fields are valid */ +#define NVME_NS_DATA_NSFEAT_NA_FIELDS_SHIFT (1) +#define NVME_NS_DATA_NSFEAT_NA_FIELDS_MASK (0x1) +/* Deallocated or Unwritten Logical Block errors supported */ +#define NVME_NS_DATA_NSFEAT_DEALLOC_SHIFT (2) +#define NVME_NS_DATA_NSFEAT_DEALLOC_MASK (0x1) +/* NGUID and EUI64 fields are not reusable */ +#define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_SHIFT (3) +#define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_MASK (0x1) + +/** formatted lba size */ +#define NVME_NS_DATA_FLBAS_FORMAT_SHIFT (0) +#define NVME_NS_DATA_FLBAS_FORMAT_MASK (0xF) +#define NVME_NS_DATA_FLBAS_EXTENDED_SHIFT (4) +#define NVME_NS_DATA_FLBAS_EXTENDED_MASK (0x1) + +/** metadata capabilities */ +/* metadata can be transferred as part of data prp list */ +#define NVME_NS_DATA_MC_EXTENDED_SHIFT (0) +#define NVME_NS_DATA_MC_EXTENDED_MASK (0x1) +/* metadata can be transferred with separate metadata pointer */ +#define NVME_NS_DATA_MC_POINTER_SHIFT (1) +#define NVME_NS_DATA_MC_POINTER_MASK (0x1) + +/** end-to-end data protection capabilities */ +/* protection information type 1 */ +#define NVME_NS_DATA_DPC_PIT1_SHIFT (0) +#define NVME_NS_DATA_DPC_PIT1_MASK (0x1) +/* protection information type 2 */ +#define NVME_NS_DATA_DPC_PIT2_SHIFT (1) +#define NVME_NS_DATA_DPC_PIT2_MASK (0x1) +/* protection information type 3 */ +#define NVME_NS_DATA_DPC_PIT3_SHIFT (2) +#define NVME_NS_DATA_DPC_PIT3_MASK (0x1) +/* first eight bytes of metadata */ +#define NVME_NS_DATA_DPC_MD_START_SHIFT (3) +#define NVME_NS_DATA_DPC_MD_START_MASK (0x1) +/* last eight bytes of metadata */ +#define NVME_NS_DATA_DPC_MD_END_SHIFT (4) +#define NVME_NS_DATA_DPC_MD_END_MASK (0x1) + +/** end-to-end data protection type settings */ +/* protection information type */ +#define NVME_NS_DATA_DPS_PIT_SHIFT (0) +#define NVME_NS_DATA_DPS_PIT_MASK (0x7) +/* 1 == protection info transferred at start of metadata */ +/* 0 == protection info transferred at end of metadata */ +#define NVME_NS_DATA_DPS_MD_START_SHIFT (3) +#define NVME_NS_DATA_DPS_MD_START_MASK (0x1) + +/** Namespace Multi-path I/O and Namespace Sharing Capabilities */ +/* the namespace may be attached to two or more controllers */ +#define NVME_NS_DATA_NMIC_MAY_BE_SHARED_SHIFT (0) +#define NVME_NS_DATA_NMIC_MAY_BE_SHARED_MASK (0x1) + +/** Reservation Capabilities */ +/* Persist Through Power Loss */ +#define NVME_NS_DATA_RESCAP_PTPL_SHIFT (0) +#define NVME_NS_DATA_RESCAP_PTPL_MASK (0x1) +/* supports the Write Exclusive */ +#define NVME_NS_DATA_RESCAP_WR_EX_SHIFT (1) +#define NVME_NS_DATA_RESCAP_WR_EX_MASK (0x1) +/* supports the Exclusive Access */ +#define NVME_NS_DATA_RESCAP_EX_AC_SHIFT (2) +#define NVME_NS_DATA_RESCAP_EX_AC_MASK (0x1) +/* supports the Write Exclusive – Registrants Only */ +#define NVME_NS_DATA_RESCAP_WR_EX_RO_SHIFT (3) +#define NVME_NS_DATA_RESCAP_WR_EX_RO_MASK (0x1) +/* 
supports the Exclusive Access - Registrants Only */ +#define NVME_NS_DATA_RESCAP_EX_AC_RO_SHIFT (4) +#define NVME_NS_DATA_RESCAP_EX_AC_RO_MASK (0x1) +/* supports the Write Exclusive – All Registrants */ +#define NVME_NS_DATA_RESCAP_WR_EX_AR_SHIFT (5) +#define NVME_NS_DATA_RESCAP_WR_EX_AR_MASK (0x1) +/* supports the Exclusive Access - All Registrants */ +#define NVME_NS_DATA_RESCAP_EX_AC_AR_SHIFT (6) +#define NVME_NS_DATA_RESCAP_EX_AC_AR_MASK (0x1) +/* Ignore Existing Key is used as defined in revision 1.3 or later */ +#define NVME_NS_DATA_RESCAP_IEKEY13_SHIFT (7) +#define NVME_NS_DATA_RESCAP_IEKEY13_MASK (0x1) + +/** Format Progress Indicator */ +/* percentage of the Format NVM command that remains to be completed */ +#define NVME_NS_DATA_FPI_PERC_SHIFT (0) +#define NVME_NS_DATA_FPI_PERC_MASK (0x7f) +/* namespace supports the Format Progress Indicator */ +#define NVME_NS_DATA_FPI_SUPP_SHIFT (7) +#define NVME_NS_DATA_FPI_SUPP_MASK (0x1) + +/** lba format support */ +/* metadata size */ +#define NVME_NS_DATA_LBAF_MS_SHIFT (0) +#define NVME_NS_DATA_LBAF_MS_MASK (0xFFFF) +/* lba data size */ +#define NVME_NS_DATA_LBAF_LBADS_SHIFT (16) +#define NVME_NS_DATA_LBAF_LBADS_MASK (0xFF) +/* relative performance */ +#define NVME_NS_DATA_LBAF_RP_SHIFT (24) +#define NVME_NS_DATA_LBAF_RP_MASK (0x3) + +enum nvme_critical_warning_state { + NVME_CRIT_WARN_ST_AVAILABLE_SPARE = 0x1, + NVME_CRIT_WARN_ST_TEMPERATURE = 0x2, + NVME_CRIT_WARN_ST_DEVICE_RELIABILITY = 0x4, + NVME_CRIT_WARN_ST_READ_ONLY = 0x8, + NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP = 0x10, +}; +#define NVME_CRIT_WARN_ST_RESERVED_MASK (0xE0) + +/* slot for current FW */ +#define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0) +#define NVME_FIRMWARE_PAGE_AFI_SLOT_MASK (0x7) + +/* CC register SHN field values */ +enum shn_value { + NVME_SHN_NORMAL = 0x1, + NVME_SHN_ABRUPT = 0x2, +}; + +/* CSTS register SHST field values */ +enum shst_value { + NVME_SHST_NORMAL = 0x0, + NVME_SHST_OCCURRING = 0x1, + NVME_SHST_COMPLETE = 0x2, +}; + +struct nvme_registers +{ + /** controller capabilities */ + uint32_t cap_lo; + uint32_t cap_hi; + + uint32_t vs; /* version */ + uint32_t intms; /* interrupt mask set */ + uint32_t intmc; /* interrupt mask clear */ + + /** controller configuration */ + uint32_t cc; + + uint32_t reserved1; + + /** controller status */ + uint32_t csts; + + uint32_t reserved2; + + /** admin queue attributes */ + uint32_t aqa; + + uint64_t asq; /* admin submission queue base addr */ + uint64_t acq; /* admin completion queue base addr */ + uint32_t reserved3[0x3f2]; + + struct { + uint32_t sq_tdbl; /* submission queue tail doorbell */ + uint32_t cq_hdbl; /* completion queue head doorbell */ + } doorbell[1] __packed; +} __packed; + +_Static_assert(sizeof(struct nvme_registers) == 0x1008, "bad size for nvme_registers"); + +struct nvme_command +{ + /* dword 0 */ + uint8_t opc; /* opcode */ + uint8_t fuse; /* fused operation */ + uint16_t cid; /* command identifier */ + + /* dword 1 */ + uint32_t nsid; /* namespace identifier */ + + /* dword 2-3 */ + uint32_t rsvd2; + uint32_t rsvd3; + + /* dword 4-5 */ + uint64_t mptr; /* metadata pointer */ + + /* dword 6-7 */ + uint64_t prp1; /* prp entry 1 */ + + /* dword 8-9 */ + uint64_t prp2; /* prp entry 2 */ + + /* dword 10-15 */ + uint32_t cdw10; /* command-specific */ + uint32_t cdw11; /* command-specific */ + uint32_t cdw12; /* command-specific */ + uint32_t cdw13; /* command-specific */ + uint32_t cdw14; /* command-specific */ + uint32_t cdw15; /* command-specific */ +} __packed; + +_Static_assert(sizeof(struct 
nvme_command) == 16 * 4, "bad size for nvme_command");
+
+struct nvme_completion {
+
+ /* dword 0 */
+ uint32_t cdw0; /* command-specific */
+
+ /* dword 1 */
+ uint32_t rsvd1;
+
+ /* dword 2 */
+ uint16_t sqhd; /* submission queue head pointer */
+ uint16_t sqid; /* submission queue identifier */
+
+ /* dword 3 */
+ uint16_t cid; /* command identifier */
+ uint16_t status;
+} __packed;
+
+_Static_assert(sizeof(struct nvme_completion) == 4 * 4, "bad size for nvme_completion");
+
+struct nvme_dsm_range {
+ uint32_t attributes;
+ uint32_t length;
+ uint64_t starting_lba;
+} __packed;
+
+/* Largest DSM Trim that can be done */
+#define NVME_MAX_DSM_TRIM 4096
+
+_Static_assert(sizeof(struct nvme_dsm_range) == 16, "bad size for nvme_dsm_range");
+
+/* status code types */
+enum nvme_status_code_type {
+ NVME_SCT_GENERIC = 0x0,
+ NVME_SCT_COMMAND_SPECIFIC = 0x1,
+ NVME_SCT_MEDIA_ERROR = 0x2,
+ /* 0x3-0x6 - reserved */
+ NVME_SCT_VENDOR_SPECIFIC = 0x7,
+};
+
+/* generic command status codes */
+enum nvme_generic_command_status_code {
+ NVME_SC_SUCCESS = 0x00,
+ NVME_SC_INVALID_OPCODE = 0x01,
+ NVME_SC_INVALID_FIELD = 0x02,
+ NVME_SC_COMMAND_ID_CONFLICT = 0x03,
+ NVME_SC_DATA_TRANSFER_ERROR = 0x04,
+ NVME_SC_ABORTED_POWER_LOSS = 0x05,
+ NVME_SC_INTERNAL_DEVICE_ERROR = 0x06,
+ NVME_SC_ABORTED_BY_REQUEST = 0x07,
+ NVME_SC_ABORTED_SQ_DELETION = 0x08,
+ NVME_SC_ABORTED_FAILED_FUSED = 0x09,
+ NVME_SC_ABORTED_MISSING_FUSED = 0x0a,
+ NVME_SC_INVALID_NAMESPACE_OR_FORMAT = 0x0b,
+ NVME_SC_COMMAND_SEQUENCE_ERROR = 0x0c,
+ NVME_SC_INVALID_SGL_SEGMENT_DESCR = 0x0d,
+ NVME_SC_INVALID_NUMBER_OF_SGL_DESCR = 0x0e,
+ NVME_SC_DATA_SGL_LENGTH_INVALID = 0x0f,
+ NVME_SC_METADATA_SGL_LENGTH_INVALID = 0x10,
+ NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID = 0x11,
+ NVME_SC_INVALID_USE_OF_CMB = 0x12,
+ NVME_SC_PRP_OFFET_INVALID = 0x13,
+ NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED = 0x14,
+ NVME_SC_OPERATION_DENIED = 0x15,
+ NVME_SC_SGL_OFFSET_INVALID = 0x16,
+ /* 0x17 - reserved */
+ NVME_SC_HOST_ID_INCONSISTENT_FORMAT = 0x18,
+ NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED = 0x19,
+ NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID = 0x1a,
+ NVME_SC_ABORTED_DUE_TO_PREEMPT = 0x1b,
+ NVME_SC_SANITIZE_FAILED = 0x1c,
+ NVME_SC_SANITIZE_IN_PROGRESS = 0x1d,
+ NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID = 0x1e,
+ NVME_SC_NOT_SUPPORTED_IN_CMB = 0x1f,
+
+ NVME_SC_LBA_OUT_OF_RANGE = 0x80,
+ NVME_SC_CAPACITY_EXCEEDED = 0x81,
+ NVME_SC_NAMESPACE_NOT_READY = 0x82,
+ NVME_SC_RESERVATION_CONFLICT = 0x83,
+ NVME_SC_FORMAT_IN_PROGRESS = 0x84,
+};
+
+/* command specific status codes */
+enum nvme_command_specific_status_code {
+ NVME_SC_COMPLETION_QUEUE_INVALID = 0x00,
+ NVME_SC_INVALID_QUEUE_IDENTIFIER = 0x01,
+ NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED = 0x02,
+ NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED = 0x03,
+ /* 0x04 - reserved */
+ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED = 0x05,
+ NVME_SC_INVALID_FIRMWARE_SLOT = 0x06,
+ NVME_SC_INVALID_FIRMWARE_IMAGE = 0x07,
+ NVME_SC_INVALID_INTERRUPT_VECTOR = 0x08,
+ NVME_SC_INVALID_LOG_PAGE = 0x09,
+ NVME_SC_INVALID_FORMAT = 0x0a,
+ NVME_SC_FIRMWARE_REQUIRES_RESET = 0x0b,
+ NVME_SC_INVALID_QUEUE_DELETION = 0x0c,
+ NVME_SC_FEATURE_NOT_SAVEABLE = 0x0d,
+ NVME_SC_FEATURE_NOT_CHANGEABLE = 0x0e,
+ NVME_SC_FEATURE_NOT_NS_SPECIFIC = 0x0f,
+ NVME_SC_FW_ACT_REQUIRES_NVMS_RESET = 0x10,
+ NVME_SC_FW_ACT_REQUIRES_RESET = 0x11,
+ NVME_SC_FW_ACT_REQUIRES_TIME = 0x12,
+ NVME_SC_FW_ACT_PROHIBITED = 0x13,
+ NVME_SC_OVERLAPPING_RANGE = 0x14,
+ NVME_SC_NS_INSUFFICIENT_CAPACITY = 0x15,
+ NVME_SC_NS_ID_UNAVAILABLE = 0x16,
+ /* 0x17 - reserved */
+
NVME_SC_NS_ALREADY_ATTACHED = 0x18,
+ NVME_SC_NS_IS_PRIVATE = 0x19,
+ NVME_SC_NS_NOT_ATTACHED = 0x1a,
+ NVME_SC_THIN_PROV_NOT_SUPPORTED = 0x1b,
+ NVME_SC_CTRLR_LIST_INVALID = 0x1c,
+ NVME_SC_SELT_TEST_IN_PROGRESS = 0x1d,
+ NVME_SC_BOOT_PART_WRITE_PROHIB = 0x1e,
+ NVME_SC_INVALID_CTRLR_ID = 0x1f,
+ NVME_SC_INVALID_SEC_CTRLR_STATE = 0x20,
+ NVME_SC_INVALID_NUM_OF_CTRLR_RESRC = 0x21,
+ NVME_SC_INVALID_RESOURCE_ID = 0x22,
+
+ NVME_SC_CONFLICTING_ATTRIBUTES = 0x80,
+ NVME_SC_INVALID_PROTECTION_INFO = 0x81,
+ NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE = 0x82,
+};
+
+/* media error status codes */
+enum nvme_media_error_status_code {
+ NVME_SC_WRITE_FAULTS = 0x80,
+ NVME_SC_UNRECOVERED_READ_ERROR = 0x81,
+ NVME_SC_GUARD_CHECK_ERROR = 0x82,
+ NVME_SC_APPLICATION_TAG_CHECK_ERROR = 0x83,
+ NVME_SC_REFERENCE_TAG_CHECK_ERROR = 0x84,
+ NVME_SC_COMPARE_FAILURE = 0x85,
+ NVME_SC_ACCESS_DENIED = 0x86,
+ NVME_SC_DEALLOCATED_OR_UNWRITTEN = 0x87,
+};
+
+/* admin opcodes */
+enum nvme_admin_opcode {
+ NVME_OPC_DELETE_IO_SQ = 0x00,
+ NVME_OPC_CREATE_IO_SQ = 0x01,
+ NVME_OPC_GET_LOG_PAGE = 0x02,
+ /* 0x03 - reserved */
+ NVME_OPC_DELETE_IO_CQ = 0x04,
+ NVME_OPC_CREATE_IO_CQ = 0x05,
+ NVME_OPC_IDENTIFY = 0x06,
+ /* 0x07 - reserved */
+ NVME_OPC_ABORT = 0x08,
+ NVME_OPC_SET_FEATURES = 0x09,
+ NVME_OPC_GET_FEATURES = 0x0a,
+ /* 0x0b - reserved */
+ NVME_OPC_ASYNC_EVENT_REQUEST = 0x0c,
+ NVME_OPC_NAMESPACE_MANAGEMENT = 0x0d,
+ /* 0x0e-0x0f - reserved */
+ NVME_OPC_FIRMWARE_ACTIVATE = 0x10,
+ NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11,
+ NVME_OPC_DEVICE_SELF_TEST = 0x14,
+ NVME_OPC_NAMESPACE_ATTACHMENT = 0x15,
+ NVME_OPC_KEEP_ALIVE = 0x18,
+ NVME_OPC_DIRECTIVE_SEND = 0x19,
+ NVME_OPC_DIRECTIVE_RECEIVE = 0x1a,
+ NVME_OPC_VIRTUALIZATION_MANAGEMENT = 0x1c,
+ NVME_OPC_NVME_MI_SEND = 0x1d,
+ NVME_OPC_NVME_MI_RECEIVE = 0x1e,
+ NVME_OPC_DOORBELL_BUFFER_CONFIG = 0x7c,
+
+ NVME_OPC_FORMAT_NVM = 0x80,
+ NVME_OPC_SECURITY_SEND = 0x81,
+ NVME_OPC_SECURITY_RECEIVE = 0x82,
+ NVME_OPC_SANITIZE = 0x84,
+};
+
+/* nvme nvm opcodes */
+enum nvme_nvm_opcode {
+ NVME_OPC_FLUSH = 0x00,
+ NVME_OPC_WRITE = 0x01,
+ NVME_OPC_READ = 0x02,
+ /* 0x03 - reserved */
+ NVME_OPC_WRITE_UNCORRECTABLE = 0x04,
+ NVME_OPC_COMPARE = 0x05,
+ /* 0x06-0x07 - reserved */
+ NVME_OPC_WRITE_ZEROES = 0x08,
+ NVME_OPC_DATASET_MANAGEMENT = 0x09,
+ /* 0x0a-0x0c - reserved */
+ NVME_OPC_RESERVATION_REGISTER = 0x0d,
+ NVME_OPC_RESERVATION_REPORT = 0x0e,
+ /* 0x0f-0x10 - reserved */
+ NVME_OPC_RESERVATION_ACQUIRE = 0x11,
+ /* 0x12-0x14 - reserved */
+ NVME_OPC_RESERVATION_RELEASE = 0x15,
+};
+
+enum nvme_feature {
+ /* 0x00 - reserved */
+ NVME_FEAT_ARBITRATION = 0x01,
+ NVME_FEAT_POWER_MANAGEMENT = 0x02,
+ NVME_FEAT_LBA_RANGE_TYPE = 0x03,
+ NVME_FEAT_TEMPERATURE_THRESHOLD = 0x04,
+ NVME_FEAT_ERROR_RECOVERY = 0x05,
+ NVME_FEAT_VOLATILE_WRITE_CACHE = 0x06,
+ NVME_FEAT_NUMBER_OF_QUEUES = 0x07,
+ NVME_FEAT_INTERRUPT_COALESCING = 0x08,
+ NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION = 0x09,
+ NVME_FEAT_WRITE_ATOMICITY = 0x0A,
+ NVME_FEAT_ASYNC_EVENT_CONFIGURATION = 0x0B,
+ NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION = 0x0C,
+ NVME_FEAT_HOST_MEMORY_BUFFER = 0x0D,
+ NVME_FEAT_TIMESTAMP = 0x0E,
+ NVME_FEAT_KEEP_ALIVE_TIMER = 0x0F,
+ NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT = 0x10,
+ NVME_FEAT_NON_OP_POWER_STATE_CONFIG = 0x11,
+ /* 0x12-0x77 - reserved */
+ /* 0x78-0x7f - NVMe Management Interface */
+ NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80,
+ /* 0x81-0xBF - command set specific (reserved) */
+ /* 0xC0-0xFF - vendor specific */
+};
+
+enum nvme_dsm_attribute {
+
NVME_DSM_ATTR_INTEGRAL_READ = 0x1, + NVME_DSM_ATTR_INTEGRAL_WRITE = 0x2, + NVME_DSM_ATTR_DEALLOCATE = 0x4, +}; + +enum nvme_activate_action { + NVME_AA_REPLACE_NO_ACTIVATE = 0x0, + NVME_AA_REPLACE_ACTIVATE = 0x1, + NVME_AA_ACTIVATE = 0x2, +}; + +struct nvme_power_state { + /** Maximum Power */ + uint16_t mp; /* Maximum Power */ + uint8_t ps_rsvd1; + uint8_t mps_nops; /* Max Power Scale, Non-Operational State */ + + uint32_t enlat; /* Entry Latency */ + uint32_t exlat; /* Exit Latency */ + + uint8_t rrt; /* Relative Read Throughput */ + uint8_t rrl; /* Relative Read Latency */ + uint8_t rwt; /* Relative Write Throughput */ + uint8_t rwl; /* Relative Write Latency */ + + uint16_t idlp; /* Idle Power */ + uint8_t ips; /* Idle Power Scale */ + uint8_t ps_rsvd8; + + uint16_t actp; /* Active Power */ + uint8_t apw_aps; /* Active Power Workload, Active Power Scale */ + uint8_t ps_rsvd10[9]; +} __packed; + +_Static_assert(sizeof(struct nvme_power_state) == 32, "bad size for nvme_power_state"); + +#define NVME_SERIAL_NUMBER_LENGTH 20 +#define NVME_MODEL_NUMBER_LENGTH 40 +#define NVME_FIRMWARE_REVISION_LENGTH 8 + +struct nvme_controller_data { + + /* bytes 0-255: controller capabilities and features */ + + /** pci vendor id */ + uint16_t vid; + + /** pci subsystem vendor id */ + uint16_t ssvid; + + /** serial number */ + uint8_t sn[NVME_SERIAL_NUMBER_LENGTH]; + + /** model number */ + uint8_t mn[NVME_MODEL_NUMBER_LENGTH]; + + /** firmware revision */ + uint8_t fr[NVME_FIRMWARE_REVISION_LENGTH]; + + /** recommended arbitration burst */ + uint8_t rab; + + /** ieee oui identifier */ + uint8_t ieee[3]; + + /** multi-interface capabilities */ + uint8_t mic; + + /** maximum data transfer size */ + uint8_t mdts; + + /** Controller ID */ + uint16_t ctrlr_id; + + /** Version */ + uint32_t ver; + + /** RTD3 Resume Latency */ + uint32_t rtd3r; + + /** RTD3 Enter Latency */ + uint32_t rtd3e; + + /** Optional Asynchronous Events Supported */ + uint32_t oaes; /* bitfield really */ + + /** Controller Attributes */ + uint32_t ctratt; /* bitfield really */ + + uint8_t reserved1[12]; + + /** FRU Globally Unique Identifier */ + uint8_t fguid[16]; + + uint8_t reserved2[128]; + + /* bytes 256-511: admin command set attributes */ + + /** optional admin command support */ + uint16_t oacs; + + /** abort command limit */ + uint8_t acl; + + /** asynchronous event request limit */ + uint8_t aerl; + + /** firmware updates */ + uint8_t frmw; + + /** log page attributes */ + uint8_t lpa; + + /** error log page entries */ + uint8_t elpe; + + /** number of power states supported */ + uint8_t npss; + + /** admin vendor specific command configuration */ + uint8_t avscc; + + /** Autonomous Power State Transition Attributes */ + uint8_t apsta; + + /** Warning Composite Temperature Threshold */ + uint16_t wctemp; + + /** Critical Composite Temperature Threshold */ + uint16_t cctemp; + + /** Maximum Time for Firmware Activation */ + uint16_t mtfa; + + /** Host Memory Buffer Preferred Size */ + uint32_t hmpre; + + /** Host Memory Buffer Minimum Size */ + uint32_t hmmin; + + /** Name space capabilities */ + struct { + /* if nsmgmt, report tnvmcap and unvmcap */ + uint8_t tnvmcap[16]; + uint8_t unvmcap[16]; + } __packed untncap; + + /** Replay Protected Memory Block Support */ + uint32_t rpmbs; /* Really a bitfield */ + + /** Extended Device Self-test Time */ + uint16_t edstt; + + /** Device Self-test Options */ + uint8_t dsto; /* Really a bitfield */ + + /** Firmware Update Granularity */ + uint8_t fwug; + + /** Keep Alive Support */ + 
uint16_t kas;
+
+ /** Host Controlled Thermal Management Attributes */
+ uint16_t hctma; /* Really a bitfield */
+
+ /** Minimum Thermal Management Temperature */
+ uint16_t mntmt;
+
+ /** Maximum Thermal Management Temperature */
+ uint16_t mxtmt;
+
+ /** Sanitize Capabilities */
+ uint32_t sanicap; /* Really a bitfield */
+
+ uint8_t reserved3[180];
+ /* bytes 512-703: nvm command set attributes */
+
+ /** submission queue entry size */
+ uint8_t sqes;
+
+ /** completion queue entry size */
+ uint8_t cqes;
+
+ /** Maximum Outstanding Commands */
+ uint16_t maxcmd;
+
+ /** number of namespaces */
+ uint32_t nn;
+
+ /** optional nvm command support */
+ uint16_t oncs;
+
+ /** fused operation support */
+ uint16_t fuses;
+
+ /** format nvm attributes */
+ uint8_t fna;
+
+ /** volatile write cache */
+ uint8_t vwc;
+
+ /** Atomic Write Unit Normal */
+ uint16_t awun;
+
+ /** Atomic Write Unit Power Fail */
+ uint16_t awupf;
+
+ /** NVM Vendor Specific Command Configuration */
+ uint8_t nvscc;
+ uint8_t reserved5;
+
+ /** Atomic Compare & Write Unit */
+ uint16_t acwu;
+ uint16_t reserved6;
+
+ /** SGL Support */
+ uint32_t sgls;
+
+ /* bytes 540-767: Reserved */
+ uint8_t reserved7[228];
+
+ /** NVM Subsystem NVMe Qualified Name */
+ uint8_t subnqn[256];
+
+ /* bytes 1024-1791: Reserved */
+ uint8_t reserved8[768];
+
+ /* bytes 1792-2047: NVMe over Fabrics specification */
+ uint8_t reserved9[256];
+
+ /* bytes 2048-3071: power state descriptors */
+ struct nvme_power_state power_state[32];
+
+ /* bytes 3072-4095: vendor specific */
+ uint8_t vs[1024];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_controller_data) == 4096, "bad size for nvme_controller_data");
+
+struct nvme_namespace_data {
+
+ /** namespace size */
+ uint64_t nsze;
+
+ /** namespace capacity */
+ uint64_t ncap;
+
+ /** namespace utilization */
+ uint64_t nuse;
+
+ /** namespace features */
+ uint8_t nsfeat;
+
+ /** number of lba formats */
+ uint8_t nlbaf;
+
+ /** formatted lba size */
+ uint8_t flbas;
+
+ /** metadata capabilities */
+ uint8_t mc;
+
+ /** end-to-end data protection capabilities */
+ uint8_t dpc;
+
+ /** end-to-end data protection type settings */
+ uint8_t dps;
+
+ /** Namespace Multi-path I/O and Namespace Sharing Capabilities */
+ uint8_t nmic;
+
+ /** Reservation Capabilities */
+ uint8_t rescap;
+
+ /** Format Progress Indicator */
+ uint8_t fpi;
+
+ /** Deallocate Logical Block Features */
+ uint8_t dlfeat;
+
+ /** Namespace Atomic Write Unit Normal */
+ uint16_t nawun;
+
+ /** Namespace Atomic Write Unit Power Fail */
+ uint16_t nawupf;
+
+ /** Namespace Atomic Compare & Write Unit */
+ uint16_t nacwu;
+
+ /** Namespace Atomic Boundary Size Normal */
+ uint16_t nabsn;
+
+ /** Namespace Atomic Boundary Offset */
+ uint16_t nabo;
+
+ /** Namespace Atomic Boundary Size Power Fail */
+ uint16_t nabspf;
+
+ /** Namespace Optimal IO Boundary */
+ uint16_t noiob;
+
+ /** NVM Capacity */
+ uint8_t nvmcap[16];
+
+ /* bytes 64-103: Reserved */
+ uint8_t reserved5[40];
+
+ /** Namespace Globally Unique Identifier */
+ uint8_t nguid[16];
+
+ /** IEEE Extended Unique Identifier */
+ uint8_t eui64[8];
+
+ /** lba format support */
+ uint32_t lbaf[16];
+
+ uint8_t reserved6[192];
+
+ uint8_t vendor_specific[3712];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_namespace_data) == 4096, "bad size for nvme_namespace_data");
+
+enum nvme_log_page {
+
+ /* 0x00 - reserved */
+ NVME_LOG_ERROR = 0x01,
+ NVME_LOG_HEALTH_INFORMATION = 0x02,
+ NVME_LOG_FIRMWARE_SLOT = 0x03,
+
NVME_LOG_CHANGED_NAMESPACE = 0x04,
+ NVME_LOG_COMMAND_EFFECT = 0x05,
+ /* 0x06-0x7F - reserved */
+ /* 0x80-0xBF - I/O command set specific */
+ NVME_LOG_RES_NOTIFICATION = 0x80,
+ /* 0xC0-0xFF - vendor specific */
+
+ /*
+ * The following are Intel Specific log pages, but they seem
+ * to be widely implemented.
+ */
+ INTEL_LOG_READ_LAT_LOG = 0xc1,
+ INTEL_LOG_WRITE_LAT_LOG = 0xc2,
+ INTEL_LOG_TEMP_STATS = 0xc5,
+ INTEL_LOG_ADD_SMART = 0xca,
+ INTEL_LOG_DRIVE_MKT_NAME = 0xdd,
+
+ /*
+ * HGST log page, with lots of sub pages.
+ */
+ HGST_INFO_LOG = 0xc1,
+};
+
+struct nvme_error_information_entry {
+
+ uint64_t error_count;
+ uint16_t sqid;
+ uint16_t cid;
+ uint16_t status;
+ uint16_t error_location;
+ uint64_t lba;
+ uint32_t nsid;
+ uint8_t vendor_specific;
+ uint8_t reserved[35];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_error_information_entry) == 64, "bad size for nvme_error_information_entry");
+
+struct nvme_health_information_page {
+
+ uint8_t critical_warning;
+ uint16_t temperature;
+ uint8_t available_spare;
+ uint8_t available_spare_threshold;
+ uint8_t percentage_used;
+
+ uint8_t reserved[26];
+
+ /*
+ * Note that the following are 128-bit values, but are
+ * defined as an array of 2 64-bit values.
+ */
+ /* Data Units Read is always in 512-byte units. */
+ uint64_t data_units_read[2];
+ /* Data Units Written is always in 512-byte units. */
+ uint64_t data_units_written[2];
+ /* For NVM command set, this includes Compare commands. */
+ uint64_t host_read_commands[2];
+ uint64_t host_write_commands[2];
+ /* Controller Busy Time is reported in minutes. */
+ uint64_t controller_busy_time[2];
+ uint64_t power_cycles[2];
+ uint64_t power_on_hours[2];
+ uint64_t unsafe_shutdowns[2];
+ uint64_t media_errors[2];
+ uint64_t num_error_info_log_entries[2];
+ uint32_t warning_temp_time;
+ uint32_t error_temp_time;
+ uint16_t temp_sensor[8];
+
+ uint8_t reserved2[296];
+} __packed __aligned(4);
+
+/* Currently sparse/smatch incorrectly packs this struct in some situations. */
+#ifndef __CHECKER__
+_Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page");
+#endif
+
+struct nvme_firmware_page {
+
+ uint8_t afi;
+ uint8_t reserved[7];
+ uint64_t revision[7]; /* revisions for 7 slots */
+ uint8_t reserved2[448];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_firmware_page) == 512, "bad size for nvme_firmware_page");
+
+struct nvme_ns_list {
+ uint32_t ns[1024];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_ns_list) == 4096, "bad size for nvme_ns_list");
+
+struct intel_log_temp_stats
+{
+ uint64_t current;
+ uint64_t overtemp_flag_last;
+ uint64_t overtemp_flag_life;
+ uint64_t max_temp;
+ uint64_t min_temp;
+ uint64_t _rsvd[5];
+ uint64_t max_oper_temp;
+ uint64_t min_oper_temp;
+ uint64_t est_offset;
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct intel_log_temp_stats) == 13 * 8, "bad size for intel_log_temp_stats");
+
+#define NVME_TEST_MAX_THREADS 128
+
+struct nvme_io_test {
+
+ enum nvme_nvm_opcode opc;
+ uint32_t size;
+ uint32_t time; /* in seconds */
+ uint32_t num_threads;
+ uint32_t flags;
+ uint64_t io_completed[NVME_TEST_MAX_THREADS];
+};
+
+enum nvme_io_test_flags {
+
+ /*
+ * Specifies whether dev_refthread/dev_relthread should be
+ * called during NVME_BIO_TEST. Ignored for other test
+ * types.
+ */
+ NVME_TEST_FLAG_REFTHREAD = 0x1,
+};
+
+struct nvme_pt_command {
+
+ /*
+ * cmd is used to specify a passthrough command to a controller or
+ * namespace.
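+ * A passthrough request reaches the driver through the
+ * NVME_PASSTHROUGH_CMD ioctl defined near the top of this header.
+ * A minimal sketch (editorial addition; fd is assumed to be an open
+ * controller device descriptor, and a CNS value of 1 in cdw10 selects
+ * the Identify Controller data structure):
+ *
+ *	struct nvme_pt_command pt = { 0 };
+ *	struct nvme_controller_data cd;
+ *
+ *	pt.cmd.opc = NVME_OPC_IDENTIFY;
+ *	pt.cmd.cdw10 = htole32(1);
+ *	pt.buf = &cd;
+ *	pt.len = sizeof (cd);
+ *	pt.is_read = 1;
+ *	if (ioctl(fd, NVME_PASSTHROUGH_CMD, &pt) == 0 &&
+ *	    !nvme_completion_is_error(&pt.cpl))
+ *		nvme_controller_data_swapbytes(&cd);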
+ * + * The following fields from cmd may be specified by the caller: + * * opc (opcode) + * * nsid (namespace id) - for admin commands only + * * cdw10-cdw15 + * + * Remaining fields must be set to 0 by the caller. + */ + struct nvme_command cmd; + + /* + * cpl returns completion status for the passthrough command + * specified by cmd. + * + * The following fields will be filled out by the driver, for + * consumption by the caller: + * * cdw0 + * * status (except for phase) + * + * Remaining fields will be set to 0 by the driver. + */ + struct nvme_completion cpl; + + /* buf is the data buffer associated with this passthrough command. */ + void * buf; + + /* + * len is the length of the data buffer associated with this + * passthrough command. + */ + uint32_t len; + + /* + * is_read = 1 if the passthrough command will read data into the + * supplied buffer from the controller. + * + * is_read = 0 if the passthrough command will write data from the + * supplied buffer to the controller. + */ + uint32_t is_read; + + /* + * driver_lock is used by the driver only. It must be set to 0 + * by the caller. + */ + struct mtx * driver_lock; +}; + +#define nvme_completion_is_error(cpl) \ + (NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0) + +void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen); + +#ifdef _KERNEL + +struct bio; + +struct nvme_namespace; +struct nvme_controller; +struct nvme_consumer; + +typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *); + +typedef void *(*nvme_cons_ns_fn_t)(struct nvme_namespace *, void *); +typedef void *(*nvme_cons_ctrlr_fn_t)(struct nvme_controller *); +typedef void (*nvme_cons_async_fn_t)(void *, const struct nvme_completion *, + uint32_t, void *, uint32_t); +typedef void (*nvme_cons_fail_fn_t)(void *); + +enum nvme_namespace_flags { + NVME_NS_DEALLOCATE_SUPPORTED = 0x1, + NVME_NS_FLUSH_SUPPORTED = 0x2, +}; + +int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, + struct nvme_pt_command *pt, + uint32_t nsid, int is_user_buffer, + int is_admin_cmd); + +/* Admin functions */ +void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, + uint8_t feature, uint32_t cdw11, + void *payload, uint32_t payload_size, + nvme_cb_fn_t cb_fn, void *cb_arg); +void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, + uint8_t feature, uint32_t cdw11, + void *payload, uint32_t payload_size, + nvme_cb_fn_t cb_fn, void *cb_arg); +void nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr, + uint8_t log_page, uint32_t nsid, + void *payload, uint32_t payload_size, + nvme_cb_fn_t cb_fn, void *cb_arg); + +/* NVM I/O functions */ +int nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload, + uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, + void *cb_arg); +int nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp, + nvme_cb_fn_t cb_fn, void *cb_arg); +int nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload, + uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, + void *cb_arg); +int nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp, + nvme_cb_fn_t cb_fn, void *cb_arg); +int nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload, + uint8_t num_ranges, nvme_cb_fn_t cb_fn, + void *cb_arg); +int nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn, + void *cb_arg); +int nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset, + size_t len); + +/* Registration functions */ +struct nvme_consumer * 
nvme_register_consumer(nvme_cons_ns_fn_t ns_fn,
+ nvme_cons_ctrlr_fn_t ctrlr_fn,
+ nvme_cons_async_fn_t async_fn,
+ nvme_cons_fail_fn_t fail_fn);
+void nvme_unregister_consumer(struct nvme_consumer *consumer);
+
+/* Controller helper functions */
+device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr);
+const struct nvme_controller_data *
+ nvme_ctrlr_get_data(struct nvme_controller *ctrlr);
+
+/* Namespace helper functions */
+uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_sector_size(struct nvme_namespace *ns);
+uint64_t nvme_ns_get_num_sectors(struct nvme_namespace *ns);
+uint64_t nvme_ns_get_size(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_flags(struct nvme_namespace *ns);
+const char * nvme_ns_get_serial_number(struct nvme_namespace *ns);
+const char * nvme_ns_get_model_number(struct nvme_namespace *ns);
+const struct nvme_namespace_data *
+ nvme_ns_get_data(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns);
+
+int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp,
+ nvme_cb_fn_t cb_fn);
+
+/*
+ * Command building helper functions -- shared with CAM.
+ * These functions assume the allocator zeros out the cmd structure;
+ * CAM's xpt_get_ccb and the request allocator for nvme both
+ * return zeroed allocations.
+ */
+static inline
+void nvme_ns_flush_cmd(struct nvme_command *cmd, uint32_t nsid)
+{
+
+ cmd->opc = NVME_OPC_FLUSH;
+ cmd->nsid = htole32(nsid);
+}
+
+static inline
+void nvme_ns_rw_cmd(struct nvme_command *cmd, uint32_t rwcmd, uint32_t nsid,
+ uint64_t lba, uint32_t count)
+{
+ cmd->opc = rwcmd;
+ cmd->nsid = htole32(nsid);
+ cmd->cdw10 = htole32(lba & 0xffffffffu);
+ cmd->cdw11 = htole32(lba >> 32);
+ cmd->cdw12 = htole32(count-1);
+}
+
+static inline
+void nvme_ns_write_cmd(struct nvme_command *cmd, uint32_t nsid,
+ uint64_t lba, uint32_t count)
+{
+ nvme_ns_rw_cmd(cmd, NVME_OPC_WRITE, nsid, lba, count);
+}
+
+static inline
+void nvme_ns_read_cmd(struct nvme_command *cmd, uint32_t nsid,
+ uint64_t lba, uint32_t count)
+{
+ nvme_ns_rw_cmd(cmd, NVME_OPC_READ, nsid, lba, count);
+}
+
+static inline
+void nvme_ns_trim_cmd(struct nvme_command *cmd, uint32_t nsid,
+ uint32_t num_ranges)
+{
+ cmd->opc = NVME_OPC_DATASET_MANAGEMENT;
+ cmd->nsid = htole32(nsid);
+ cmd->cdw10 = htole32(num_ranges - 1);
+ cmd->cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
+}
+
+extern int nvme_use_nvd;
+
+#endif /* _KERNEL */
+
+/* Endianness conversion functions for NVMe structs */
+static inline
+void nvme_completion_swapbytes(struct nvme_completion *s)
+{
+
+ s->cdw0 = le32toh(s->cdw0);
+ /* omit rsvd1 */
+ s->sqhd = le16toh(s->sqhd);
+ s->sqid = le16toh(s->sqid);
+ /* omit cid */
+ s->status = le16toh(s->status);
+}
+
+static inline
+void nvme_power_state_swapbytes(struct nvme_power_state *s)
+{
+
+ s->mp = le16toh(s->mp);
+ s->enlat = le32toh(s->enlat);
+ s->exlat = le32toh(s->exlat);
+ s->idlp = le16toh(s->idlp);
+ s->actp = le16toh(s->actp);
+}
+
+static inline
+void nvme_controller_data_swapbytes(struct nvme_controller_data *s)
+{
+ int i;
+
+ s->vid = le16toh(s->vid);
+ s->ssvid = le16toh(s->ssvid);
+ s->ctrlr_id = le16toh(s->ctrlr_id);
+ s->ver = le32toh(s->ver);
+ s->rtd3r = le32toh(s->rtd3r);
+ s->rtd3e = le32toh(s->rtd3e);
+ s->oaes = le32toh(s->oaes);
+ s->ctratt = le32toh(s->ctratt);
+ s->oacs = le16toh(s->oacs);
+ s->wctemp = le16toh(s->wctemp);
+ s->cctemp = le16toh(s->cctemp);
+ s->mtfa = le16toh(s->mtfa);
+ s->hmpre = le32toh(s->hmpre);
+ s->hmmin = le32toh(s->hmmin);
+ s->rpmbs =
le32toh(s->rpmbs); + s->edstt = le16toh(s->edstt); + s->kas = le16toh(s->kas); + s->hctma = le16toh(s->hctma); + s->mntmt = le16toh(s->mntmt); + s->mxtmt = le16toh(s->mxtmt); + s->sanicap = le32toh(s->sanicap); + s->maxcmd = le16toh(s->maxcmd); + s->nn = le32toh(s->nn); + s->oncs = le16toh(s->oncs); + s->fuses = le16toh(s->fuses); + s->awun = le16toh(s->awun); + s->awupf = le16toh(s->awupf); + s->acwu = le16toh(s->acwu); + s->sgls = le32toh(s->sgls); + for (i = 0; i < 32; i++) + nvme_power_state_swapbytes(&s->power_state[i]); +} + +static inline +void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s) +{ + int i; + + s->nsze = le64toh(s->nsze); + s->ncap = le64toh(s->ncap); + s->nuse = le64toh(s->nuse); + s->nawun = le16toh(s->nawun); + s->nawupf = le16toh(s->nawupf); + s->nacwu = le16toh(s->nacwu); + s->nabsn = le16toh(s->nabsn); + s->nabo = le16toh(s->nabo); + s->nabspf = le16toh(s->nabspf); + s->noiob = le16toh(s->noiob); + for (i = 0; i < 16; i++) + s->lbaf[i] = le32toh(s->lbaf[i]); +} + +static inline +void nvme_error_information_entry_swapbytes(struct nvme_error_information_entry *s) +{ + + s->error_count = le64toh(s->error_count); + s->sqid = le16toh(s->sqid); + s->cid = le16toh(s->cid); + s->status = le16toh(s->status); + s->error_location = le16toh(s->error_location); + s->lba = le64toh(s->lba); + s->nsid = le32toh(s->nsid); +} + +static inline +void nvme_le128toh(void *p) +{ + /* + * Upstream, this uses the following comparison: + * #if _BYTE_ORDER != _LITTLE_ENDIAN + * + * Rather than keep this file in compat with only that little bit + * changed, we'll just float a little patch here for now. + */ +#ifndef _LITTLE_ENDIAN + /* Swap 16 bytes in place */ + char *tmp = (char*)p; + char b; + int i; + for (i = 0; i < 8; i++) { + b = tmp[i]; + tmp[i] = tmp[15-i]; + tmp[15-i] = b; + } +#else + (void)p; +#endif +} + +static inline +void nvme_health_information_page_swapbytes(struct nvme_health_information_page *s) +{ + int i; + + s->temperature = le16toh(s->temperature); + nvme_le128toh((void *)s->data_units_read); + nvme_le128toh((void *)s->data_units_written); + nvme_le128toh((void *)s->host_read_commands); + nvme_le128toh((void *)s->host_write_commands); + nvme_le128toh((void *)s->controller_busy_time); + nvme_le128toh((void *)s->power_cycles); + nvme_le128toh((void *)s->power_on_hours); + nvme_le128toh((void *)s->unsafe_shutdowns); + nvme_le128toh((void *)s->media_errors); + nvme_le128toh((void *)s->num_error_info_log_entries); + s->warning_temp_time = le32toh(s->warning_temp_time); + s->error_temp_time = le32toh(s->error_temp_time); + for (i = 0; i < 8; i++) + s->temp_sensor[i] = le16toh(s->temp_sensor[i]); +} + + +static inline +void nvme_firmware_page_swapbytes(struct nvme_firmware_page *s) +{ + int i; + + for (i = 0; i < 7; i++) + s->revision[i] = le64toh(s->revision[i]); +} + +static inline +void nvme_ns_list_swapbytes(struct nvme_ns_list *s) +{ + int i; + + for (i = 0; i < 1024; i++) + s->ns[i] = le32toh(s->ns[i]); +} + +static inline +void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s) +{ + + s->current = le64toh(s->current); + s->overtemp_flag_last = le64toh(s->overtemp_flag_last); + s->overtemp_flag_life = le64toh(s->overtemp_flag_life); + s->max_temp = le64toh(s->max_temp); + s->min_temp = le64toh(s->min_temp); + /* omit _rsvd[] */ + s->max_oper_temp = le64toh(s->max_oper_temp); + s->min_oper_temp = le64toh(s->min_oper_temp); + s->est_offset = le64toh(s->est_offset); +} + +#endif /* __NVME_H__ */ diff --git 
a/usr/contrib/freebsd/dev/usb/controller/xhcireg.h b/usr/contrib/freebsd/dev/usb/controller/xhcireg.h new file mode 100644 index 0000000000..0e588ecba3 --- /dev/null +++ b/usr/contrib/freebsd/dev/usb/controller/xhcireg.h @@ -0,0 +1,224 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (c) 2010 Hans Petter Selasky. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _XHCIREG_H_ +#define _XHCIREG_H_ + +/* XHCI PCI config registers */ +#define PCI_XHCI_CBMEM 0x10 /* configuration base MEM */ +#define PCI_XHCI_USBREV 0x60 /* RO USB protocol revision */ +#define PCI_USB_REV_3_0 0x30 /* USB 3.0 */ +#define PCI_XHCI_FLADJ 0x61 /* RW frame length adjust */ + +#define PCI_XHCI_INTEL_XUSB2PR 0xD0 /* Intel USB2 Port Routing */ +#define PCI_XHCI_INTEL_USB2PRM 0xD4 /* Intel USB2 Port Routing Mask */ +#define PCI_XHCI_INTEL_USB3_PSSEN 0xD8 /* Intel USB3 Port SuperSpeed Enable */ +#define PCI_XHCI_INTEL_USB3PRM 0xDC /* Intel USB3 Port Routing Mask */ + +/* XHCI capability registers */ +#define XHCI_CAPLENGTH 0x00 /* RO capability */ +#define XHCI_RESERVED 0x01 /* Reserved */ +#define XHCI_HCIVERSION 0x02 /* RO Interface version number */ +#define XHCI_HCIVERSION_0_9 0x0090 /* xHCI version 0.9 */ +#define XHCI_HCIVERSION_1_0 0x0100 /* xHCI version 1.0 */ +#define XHCI_HCSPARAMS1 0x04 /* RO structural parameters 1 */ +#define XHCI_HCS1_DEVSLOT_MAX(x)((x) & 0xFF) +#define XHCI_HCS1_IRQ_MAX(x) (((x) >> 8) & 0x3FF) +#define XHCI_HCS1_N_PORTS(x) (((x) >> 24) & 0xFF) +#define XHCI_HCSPARAMS2 0x08 /* RO structural parameters 2 */ +#define XHCI_HCS2_IST(x) ((x) & 0xF) +#define XHCI_HCS2_ERST_MAX(x) (((x) >> 4) & 0xF) +#define XHCI_HCS2_SPR(x) (((x) >> 26) & 0x1) +#define XHCI_HCS2_SPB_MAX(x) ((((x) >> 16) & 0x3E0) | (((x) >> 27) & 0x1F)) +#define XHCI_HCSPARAMS3 0x0C /* RO structural parameters 3 */ +#define XHCI_HCS3_U1_DEL(x) ((x) & 0xFF) +#define XHCI_HCS3_U2_DEL(x) (((x) >> 16) & 0xFFFF) +#define XHCI_HCSPARAMS0 0x10 /* RO capability parameters */ +#define XHCI_HCS0_AC64(x) ((x) & 0x1) /* 64-bit capable */ +#define XHCI_HCS0_BNC(x) (((x) >> 1) & 0x1) /* BW negotiation */ +#define XHCI_HCS0_CSZ(x) (((x) >> 2) & 0x1) /* context size */ +#define XHCI_HCS0_PPC(x) (((x) >> 3) & 0x1) /* port power control */ +#define XHCI_HCS0_PIND(x) (((x) >> 4) & 0x1) /* port 
indicators */ +#define XHCI_HCS0_LHRC(x) (((x) >> 5) & 0x1) /* light HC reset */ +#define XHCI_HCS0_LTC(x) (((x) >> 6) & 0x1) /* latency tolerance msg */ +#define XHCI_HCS0_NSS(x) (((x) >> 7) & 0x1) /* no secondary sid */ +#define XHCI_HCS0_PSA_SZ_MAX(x) (((x) >> 12) & 0xF) /* max pri. stream array size */ +#define XHCI_HCS0_XECP(x) (((x) >> 16) & 0xFFFF) /* extended capabilities pointer */ +#define XHCI_DBOFF 0x14 /* RO doorbell offset */ +#define XHCI_RTSOFF 0x18 /* RO runtime register space offset */ + +/* XHCI operational registers. Offset given by XHCI_CAPLENGTH register */ +#define XHCI_USBCMD 0x00 /* XHCI command */ +#define XHCI_CMD_RS 0x00000001 /* RW Run/Stop */ +#define XHCI_CMD_HCRST 0x00000002 /* RW Host Controller Reset */ +#define XHCI_CMD_INTE 0x00000004 /* RW Interrupter Enable */ +#define XHCI_CMD_HSEE 0x00000008 /* RW Host System Error Enable */ +#define XHCI_CMD_LHCRST 0x00000080 /* RO/RW Light Host Controller Reset */ +#define XHCI_CMD_CSS 0x00000100 /* RW Controller Save State */ +#define XHCI_CMD_CRS 0x00000200 /* RW Controller Restore State */ +#define XHCI_CMD_EWE 0x00000400 /* RW Enable Wrap Event */ +#define XHCI_CMD_EU3S 0x00000800 /* RW Enable U3 MFINDEX Stop */ +#define XHCI_USBSTS 0x04 /* XHCI status */ +#define XHCI_STS_HCH 0x00000001 /* RO - Host Controller Halted */ +#define XHCI_STS_HSE 0x00000004 /* RW - Host System Error */ +#define XHCI_STS_EINT 0x00000008 /* RW - Event Interrupt */ +#define XHCI_STS_PCD 0x00000010 /* RW - Port Change Detect */ +#define XHCI_STS_SSS 0x00000100 /* RO - Save State Status */ +#define XHCI_STS_RSS 0x00000200 /* RO - Restore State Status */ +#define XHCI_STS_SRE 0x00000400 /* RW - Save/Restore Error */ +#define XHCI_STS_CNR 0x00000800 /* RO - Controller Not Ready */ +#define XHCI_STS_HCE 0x00001000 /* RO - Host Controller Error */ +#define XHCI_PAGESIZE 0x08 /* XHCI page size mask */ +#define XHCI_PAGESIZE_4K 0x00000001 /* 4K Page Size */ +#define XHCI_PAGESIZE_8K 0x00000002 /* 8K Page Size */ +#define XHCI_PAGESIZE_16K 0x00000004 /* 16K Page Size */ +#define XHCI_PAGESIZE_32K 0x00000008 /* 32K Page Size */ +#define XHCI_PAGESIZE_64K 0x00000010 /* 64K Page Size */ +#define XHCI_DNCTRL 0x14 /* XHCI device notification control */ +#define XHCI_DNCTRL_MASK(n) (1U << (n)) +#define XHCI_CRCR_LO 0x18 /* XHCI command ring control */ +#define XHCI_CRCR_LO_RCS 0x00000001 /* RW - consumer cycle state */ +#define XHCI_CRCR_LO_CS 0x00000002 /* RW - command stop */ +#define XHCI_CRCR_LO_CA 0x00000004 /* RW - command abort */ +#define XHCI_CRCR_LO_CRR 0x00000008 /* RW - command ring running */ +#define XHCI_CRCR_LO_MASK 0x0000000F +#define XHCI_CRCR_HI 0x1C /* XHCI command ring control */ +#define XHCI_DCBAAP_LO 0x30 /* XHCI dev context BA pointer */ +#define XHCI_DCBAAP_HI 0x34 /* XHCI dev context BA pointer */ +#define XHCI_CONFIG 0x38 +#define XHCI_CONFIG_SLOTS_MASK 0x000000FF /* RW - number of device slots enabled */ + +/* XHCI port status registers */ +#define XHCI_PORTSC(n) (0x3F0 + (0x10 * (n))) /* XHCI port status */ +#define XHCI_PS_CCS 0x00000001 /* RO - current connect status */ +#define XHCI_PS_PED 0x00000002 /* RW - port enabled / disabled */ +#define XHCI_PS_OCA 0x00000008 /* RO - over current active */ +#define XHCI_PS_PR 0x00000010 /* RW - port reset */ +#define XHCI_PS_PLS_GET(x) (((x) >> 5) & 0xF) /* RW - port link state */ +#define XHCI_PS_PLS_SET(x) (((x) & 0xF) << 5) /* RW - port link state */ +#define XHCI_PS_PP 0x00000200 /* RW - port power */ +#define XHCI_PS_SPEED_GET(x) (((x) >> 10) & 0xF) /* RO - port speed */ 
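+/*
+ * Illustrative decode (editorial addition): a PORTSC value of
+ * 0x00000a03 has XHCI_PS_CCS and XHCI_PS_PED set, XHCI_PS_PP set,
+ * and XHCI_PS_SPEED_GET() returning 2 -- a connected, enabled,
+ * powered port.
+ */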
+#define XHCI_PS_PIC_GET(x)	(((x) >> 14) & 0x3)	/* RW - port indicator */
+#define XHCI_PS_PIC_SET(x)	(((x) & 0x3) << 14)	/* RW - port indicator */
+#define XHCI_PS_LWS		0x00010000	/* RW - port link state write strobe */
+#define XHCI_PS_CSC		0x00020000	/* RW - connect status change */
+#define XHCI_PS_PEC		0x00040000	/* RW - port enable/disable change */
+#define XHCI_PS_WRC		0x00080000	/* RW - warm port reset change */
+#define XHCI_PS_OCC		0x00100000	/* RW - over-current change */
+#define XHCI_PS_PRC		0x00200000	/* RW - port reset change */
+#define XHCI_PS_PLC		0x00400000	/* RW - port link state change */
+#define XHCI_PS_CEC		0x00800000	/* RW - config error change */
+#define XHCI_PS_CAS		0x01000000	/* RO - cold attach status */
+#define XHCI_PS_WCE		0x02000000	/* RW - wake on connect enable */
+#define XHCI_PS_WDE		0x04000000	/* RW - wake on disconnect enable */
+#define XHCI_PS_WOE		0x08000000	/* RW - wake on over-current enable */
+#define XHCI_PS_DR		0x40000000	/* RO - device removable */
+#define XHCI_PS_WPR		0x80000000U	/* RW - warm port reset */
+#define XHCI_PS_CLEAR		0x80FF01FFU	/* command bits */
+
+#define XHCI_PORTPMSC(n)	(0x3F4 + (0x10 * (n)))	/* XHCI status and control */
+#define XHCI_PM3_U1TO_GET(x)	(((x) >> 0) & 0xFF)	/* RW - U1 timeout */
+#define XHCI_PM3_U1TO_SET(x)	(((x) & 0xFF) << 0)	/* RW - U1 timeout */
+#define XHCI_PM3_U2TO_GET(x)	(((x) >> 8) & 0xFF)	/* RW - U2 timeout */
+#define XHCI_PM3_U2TO_SET(x)	(((x) & 0xFF) << 8)	/* RW - U2 timeout */
+#define XHCI_PM3_FLA		0x00010000	/* RW - Force Link PM Accept */
+#define XHCI_PM2_L1S_GET(x)	(((x) >> 0) & 0x7)	/* RO - L1 status */
+#define XHCI_PM2_RWE		0x00000008	/* RW - remote wakeup enable */
+#define XHCI_PM2_HIRD_GET(x)	(((x) >> 4) & 0xF)	/* RW - host initiated resume duration */
+#define XHCI_PM2_HIRD_SET(x)	(((x) & 0xF) << 4)	/* RW - host initiated resume duration */
+#define XHCI_PM2_L1SLOT_GET(x)	(((x) >> 8) & 0xFF)	/* RW - L1 device slot */
+#define XHCI_PM2_L1SLOT_SET(x)	(((x) & 0xFF) << 8)	/* RW - L1 device slot */
+#define XHCI_PM2_HLE		0x00010000	/* RW - hardware LPM enable */
+#define XHCI_PORTLI(n)		(0x3F8 + (0x10 * (n)))	/* XHCI port link info */
+#define XHCI_PLI3_ERR_GET(x)	(((x) >> 0) & 0xFFFF)	/* RO - port link errors */
+#define XHCI_PORTRSV(n)		(0x3FC + (0x10 * (n)))	/* XHCI port reserved */
+
+/* XHCI runtime registers.
Offset given by XHCI_CAPLENGTH + XHCI_RTSOFF registers */ +#define XHCI_MFINDEX 0x0000 /* RO - microframe index */ +#define XHCI_MFINDEX_GET(x) ((x) & 0x3FFF) +#define XHCI_IMAN(n) (0x0020 + (0x20 * (n))) /* XHCI interrupt management */ +#define XHCI_IMAN_INTR_PEND 0x00000001 /* RW - interrupt pending */ +#define XHCI_IMAN_INTR_ENA 0x00000002 /* RW - interrupt enable */ +#define XHCI_IMOD(n) (0x0024 + (0x20 * (n))) /* XHCI interrupt moderation */ +#define XHCI_IMOD_IVAL_GET(x) (((x) >> 0) & 0xFFFF) /* 250ns unit */ +#define XHCI_IMOD_IVAL_SET(x) (((x) & 0xFFFF) << 0) /* 250ns unit */ +#define XHCI_IMOD_ICNT_GET(x) (((x) >> 16) & 0xFFFF) /* 250ns unit */ +#define XHCI_IMOD_ICNT_SET(x) (((x) & 0xFFFF) << 16) /* 250ns unit */ +#define XHCI_IMOD_DEFAULT 0x000001F4U /* 8000 IRQs/second */ +#define XHCI_IMOD_DEFAULT_LP 0x000003F8U /* 4000 IRQs/second - LynxPoint */ +#define XHCI_ERSTSZ(n) (0x0028 + (0x20 * (n))) /* XHCI event ring segment table size */ +#define XHCI_ERSTS_GET(x) ((x) & 0xFFFF) +#define XHCI_ERSTS_SET(x) ((x) & 0xFFFF) +#define XHCI_ERSTBA_LO(n) (0x0030 + (0x20 * (n))) /* XHCI event ring segment table BA */ +#define XHCI_ERSTBA_HI(n) (0x0034 + (0x20 * (n))) /* XHCI event ring segment table BA */ +#define XHCI_ERDP_LO(n) (0x0038 + (0x20 * (n))) /* XHCI event ring dequeue pointer */ +#define XHCI_ERDP_LO_SINDEX(x) ((x) & 0x7) /* RO - dequeue segment index */ +#define XHCI_ERDP_LO_BUSY 0x00000008 /* RW - event handler busy */ +#define XHCI_ERDP_HI(n) (0x003C + (0x20 * (n))) /* XHCI event ring dequeue pointer */ + +/* XHCI doorbell registers. Offset given by XHCI_CAPLENGTH + XHCI_DBOFF registers */ +#define XHCI_DOORBELL(n) (0x0000 + (4 * (n))) +#define XHCI_DB_TARGET_GET(x) ((x) & 0xFF) /* RW - doorbell target */ +#define XHCI_DB_TARGET_SET(x) ((x) & 0xFF) /* RW - doorbell target */ +#define XHCI_DB_SID_GET(x) (((x) >> 16) & 0xFFFF) /* RW - doorbell stream ID */ +#define XHCI_DB_SID_SET(x) (((x) & 0xFFFF) << 16) /* RW - doorbell stream ID */ + +/* XHCI legacy support */ +#define XHCI_XECP_ID(x) ((x) & 0xFF) +#define XHCI_XECP_NEXT(x) (((x) >> 8) & 0xFF) +#define XHCI_XECP_BIOS_SEM 0x0002 +#define XHCI_XECP_OS_SEM 0x0003 + +/* XHCI capability ID's */ +#define XHCI_ID_USB_LEGACY 0x0001 +#define XHCI_ID_PROTOCOLS 0x0002 +#define XHCI_ID_POWER_MGMT 0x0003 +#define XHCI_ID_VIRTUALIZATION 0x0004 +#define XHCI_ID_MSG_IRQ 0x0005 +#define XHCI_ID_USB_LOCAL_MEM 0x0006 + +/* XHCI register R/W wrappers */ +#define XREAD1(sc, what, a) \ + bus_space_read_1((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off) +#define XREAD2(sc, what, a) \ + bus_space_read_2((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off) +#define XREAD4(sc, what, a) \ + bus_space_read_4((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off) +#define XWRITE1(sc, what, a, x) \ + bus_space_write_1((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off, (x)) +#define XWRITE2(sc, what, a, x) \ + bus_space_write_2((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off, (x)) +#define XWRITE4(sc, what, a, x) \ + bus_space_write_4((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off, (x)) + +#endif /* _XHCIREG_H_ */ diff --git a/usr/contrib/freebsd/dev/usb/usb.h b/usr/contrib/freebsd/dev/usb/usb.h new file mode 100644 index 0000000000..bcea2ac8bd --- /dev/null +++ b/usr/contrib/freebsd/dev/usb/usb.h @@ -0,0 +1,801 @@ +/* $FreeBSD$ */ +/*- + * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. + * Copyright (c) 1998 The NetBSD Foundation, Inc. 
All rights reserved. + * Copyright (c) 1998 Lennart Augustsson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file contains standard definitions for the following USB + * protocol versions: + * + * USB v1.0 + * USB v1.1 + * USB v2.0 + * USB v3.0 + */ + +#ifndef _USB_STANDARD_H_ +#define _USB_STANDARD_H_ + +#if defined(_KERNEL) +#ifndef USB_GLOBAL_INCLUDE_FILE +#include "opt_usb.h" +#endif + +/* Declare parent SYSCTL USB node. */ +#ifdef SYSCTL_DECL +SYSCTL_DECL(_hw_usb); +#endif + +#ifndef USB_GLOBAL_INCLUDE_FILE +#include +#endif + +MALLOC_DECLARE(M_USB); +MALLOC_DECLARE(M_USBDEV); +#endif /* _KERNEL */ + +#ifndef USB_GLOBAL_INCLUDE_FILE +#include +#include +#endif + +#define USB_STACK_VERSION 2000 /* 2.0 */ + +/* Definition of some hardcoded USB constants. */ + +#define USB_MAX_IPACKET 8 /* initial USB packet size */ +#define USB_EP_MAX (2*16) /* hardcoded */ +#define USB_ROOT_HUB_ADDR 1 /* index */ +#define USB_MIN_DEVICES 2 /* unused + root HUB */ +#define USB_UNCONFIG_INDEX 0xFF /* internal use only */ +#define USB_IFACE_INDEX_ANY 0xFF /* internal use only */ +#define USB_START_ADDR 0 /* default USB device BUS address + * after USB bus reset */ +#define USB_CONTROL_ENDPOINT 0 /* default control endpoint */ + +#define USB_FRAMES_PER_SECOND_FS 1000 /* full speed */ +#define USB_FRAMES_PER_SECOND_HS 8000 /* high speed */ + +#define USB_FS_BYTES_PER_HS_UFRAME 188 /* bytes */ +#define USB_HS_MICRO_FRAMES_MAX 8 /* units */ + +#define USB_ISOC_TIME_MAX 128 /* ms */ + +/* + * Minimum time a device needs to be powered down to go through a + * power cycle. These values are not in the USB specification. + */ +#define USB_POWER_DOWN_TIME 200 /* ms */ +#define USB_PORT_POWER_DOWN_TIME 100 /* ms */ + +/* Definition of software USB power modes */ +#define USB_POWER_MODE_OFF 0 /* turn off device */ +#define USB_POWER_MODE_ON 1 /* always on */ +#define USB_POWER_MODE_SAVE 2 /* automatic suspend and resume */ +#define USB_POWER_MODE_SUSPEND 3 /* force suspend */ +#define USB_POWER_MODE_RESUME 4 /* force resume */ + +/* These are the values from the USB specification. 
*/ +#define USB_PORT_RESET_DELAY_SPEC 10 /* ms */ +#define USB_PORT_ROOT_RESET_DELAY_SPEC 50 /* ms */ +#define USB_PORT_RESET_RECOVERY_SPEC 10 /* ms */ +#define USB_PORT_POWERUP_DELAY_SPEC 100 /* ms */ +#define USB_PORT_RESUME_DELAY_SPEC 20 /* ms */ +#define USB_SET_ADDRESS_SETTLE_SPEC 2 /* ms */ +#define USB_RESUME_DELAY_SPEC (20*5) /* ms */ +#define USB_RESUME_WAIT_SPEC 10 /* ms */ +#define USB_RESUME_RECOVERY_SPEC 10 /* ms */ +#define USB_EXTRA_POWER_UP_TIME_SPEC 0 /* ms */ + +/* Allow for marginal and non-conforming devices. */ +#define USB_PORT_RESET_DELAY 50 /* ms */ +#define USB_PORT_ROOT_RESET_DELAY 200 /* ms */ +#define USB_PORT_RESET_RECOVERY 250 /* ms */ +#define USB_PORT_POWERUP_DELAY 300 /* ms */ +#define USB_PORT_RESUME_DELAY (20*2) /* ms */ +#define USB_SET_ADDRESS_SETTLE 10 /* ms */ +#define USB_RESUME_DELAY (50*5) /* ms */ +#define USB_RESUME_WAIT 50 /* ms */ +#define USB_RESUME_RECOVERY 50 /* ms */ +#define USB_EXTRA_POWER_UP_TIME 20 /* ms */ + +#define USB_MIN_POWER 100 /* mA */ +#define USB_MAX_POWER 500 /* mA */ + +#define USB_BUS_RESET_DELAY 100 /* ms */ + +/* + * USB record layout in memory: + * + * - USB config 0 + * - USB interfaces + * - USB alternative interfaces + * - USB endpoints + * + * - USB config 1 + * - USB interfaces + * - USB alternative interfaces + * - USB endpoints + */ + +/* Declaration of USB records */ + +struct usb_device_request { + uByte bmRequestType; + uByte bRequest; + uWord wValue; + uWord wIndex; + uWord wLength; +} __packed; +typedef struct usb_device_request usb_device_request_t; + +#define UT_WRITE 0x00 +#define UT_READ 0x80 +#define UT_STANDARD 0x00 +#define UT_CLASS 0x20 +#define UT_VENDOR 0x40 +#define UT_DEVICE 0x00 +#define UT_INTERFACE 0x01 +#define UT_ENDPOINT 0x02 +#define UT_OTHER 0x03 + +#define UT_READ_DEVICE (UT_READ | UT_STANDARD | UT_DEVICE) +#define UT_READ_INTERFACE (UT_READ | UT_STANDARD | UT_INTERFACE) +#define UT_READ_ENDPOINT (UT_READ | UT_STANDARD | UT_ENDPOINT) +#define UT_WRITE_DEVICE (UT_WRITE | UT_STANDARD | UT_DEVICE) +#define UT_WRITE_INTERFACE (UT_WRITE | UT_STANDARD | UT_INTERFACE) +#define UT_WRITE_ENDPOINT (UT_WRITE | UT_STANDARD | UT_ENDPOINT) +#define UT_READ_CLASS_DEVICE (UT_READ | UT_CLASS | UT_DEVICE) +#define UT_READ_CLASS_INTERFACE (UT_READ | UT_CLASS | UT_INTERFACE) +#define UT_READ_CLASS_OTHER (UT_READ | UT_CLASS | UT_OTHER) +#define UT_READ_CLASS_ENDPOINT (UT_READ | UT_CLASS | UT_ENDPOINT) +#define UT_WRITE_CLASS_DEVICE (UT_WRITE | UT_CLASS | UT_DEVICE) +#define UT_WRITE_CLASS_INTERFACE (UT_WRITE | UT_CLASS | UT_INTERFACE) +#define UT_WRITE_CLASS_OTHER (UT_WRITE | UT_CLASS | UT_OTHER) +#define UT_WRITE_CLASS_ENDPOINT (UT_WRITE | UT_CLASS | UT_ENDPOINT) +#define UT_READ_VENDOR_DEVICE (UT_READ | UT_VENDOR | UT_DEVICE) +#define UT_READ_VENDOR_INTERFACE (UT_READ | UT_VENDOR | UT_INTERFACE) +#define UT_READ_VENDOR_OTHER (UT_READ | UT_VENDOR | UT_OTHER) +#define UT_READ_VENDOR_ENDPOINT (UT_READ | UT_VENDOR | UT_ENDPOINT) +#define UT_WRITE_VENDOR_DEVICE (UT_WRITE | UT_VENDOR | UT_DEVICE) +#define UT_WRITE_VENDOR_INTERFACE (UT_WRITE | UT_VENDOR | UT_INTERFACE) +#define UT_WRITE_VENDOR_OTHER (UT_WRITE | UT_VENDOR | UT_OTHER) +#define UT_WRITE_VENDOR_ENDPOINT (UT_WRITE | UT_VENDOR | UT_ENDPOINT) + +/* Requests */ +#define UR_GET_STATUS 0x00 +#define UR_CLEAR_FEATURE 0x01 +#define UR_SET_FEATURE 0x03 +#define UR_SET_ADDRESS 0x05 +#define UR_GET_DESCRIPTOR 0x06 +#define UDESC_DEVICE 0x01 +#define UDESC_CONFIG 0x02 +#define UDESC_STRING 0x03 +#define USB_LANGUAGE_TABLE 0x00 /* language ID string index */ 
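/*
 * Usage sketch (not part of the imported header): composing the standard
 * GET_DESCRIPTOR request for the language-ID string descriptor from the
 * constants above.  USETW()/USETW2() and the uByte/uWord types come from
 * dev/usb/usb_endian.h, which is also added by this patch, and the
 * function name is hypothetical.
 */
static void
req_get_lang_table(struct usb_device_request *req)
{
	req->bmRequestType = UT_READ_DEVICE;	/* device-to-host, standard */
	req->bRequest = UR_GET_DESCRIPTOR;
	/* wValue carries the descriptor type in its high byte, index in its low */
	USETW2(req->wValue, UDESC_STRING, USB_LANGUAGE_TABLE);
	USETW(req->wIndex, 0);	/* language ID: 0 when fetching the table itself */
	USETW(req->wLength, 4);	/* bLength + bDescriptorType + one LANGID */
}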
+#define UDESC_INTERFACE 0x04 +#define UDESC_ENDPOINT 0x05 +#define UDESC_DEVICE_QUALIFIER 0x06 +#define UDESC_OTHER_SPEED_CONFIGURATION 0x07 +#define UDESC_INTERFACE_POWER 0x08 +#define UDESC_OTG 0x09 +#define UDESC_DEBUG 0x0A +#define UDESC_IFACE_ASSOC 0x0B /* interface association */ +#define UDESC_BOS 0x0F /* binary object store */ +#define UDESC_DEVICE_CAPABILITY 0x10 +#define UDESC_CS_DEVICE 0x21 /* class specific */ +#define UDESC_CS_CONFIG 0x22 +#define UDESC_CS_STRING 0x23 +#define UDESC_CS_INTERFACE 0x24 +#define UDESC_CS_ENDPOINT 0x25 +#define UDESC_HUB 0x29 +#define UDESC_SS_HUB 0x2A /* super speed */ +#define UDESC_ENDPOINT_SS_COMP 0x30 /* super speed */ +#define UR_SET_DESCRIPTOR 0x07 +#define UR_GET_CONFIG 0x08 +#define UR_SET_CONFIG 0x09 +#define UR_GET_INTERFACE 0x0a +#define UR_SET_INTERFACE 0x0b +#define UR_SYNCH_FRAME 0x0c +#define UR_SET_SEL 0x30 +#define UR_ISOCH_DELAY 0x31 + +/* HUB specific request */ +#define UR_GET_BUS_STATE 0x02 +#define UR_CLEAR_TT_BUFFER 0x08 +#define UR_RESET_TT 0x09 +#define UR_GET_TT_STATE 0x0a +#define UR_STOP_TT 0x0b +#define UR_SET_AND_TEST 0x0c /* USB 2.0 only */ +#define UR_SET_HUB_DEPTH 0x0c /* USB 3.0 only */ +#define USB_SS_HUB_DEPTH_MAX 5 +#define UR_GET_PORT_ERR_COUNT 0x0d + +/* Feature numbers */ +#define UF_ENDPOINT_HALT 0 +#define UF_DEVICE_REMOTE_WAKEUP 1 +#define UF_TEST_MODE 2 +#define UF_U1_ENABLE 0x30 +#define UF_U2_ENABLE 0x31 +#define UF_LTM_ENABLE 0x32 + +/* HUB specific features */ +#define UHF_C_HUB_LOCAL_POWER 0 +#define UHF_C_HUB_OVER_CURRENT 1 +#define UHF_PORT_CONNECTION 0 +#define UHF_PORT_ENABLE 1 +#define UHF_PORT_SUSPEND 2 +#define UHF_PORT_OVER_CURRENT 3 +#define UHF_PORT_RESET 4 +#define UHF_PORT_LINK_STATE 5 +#define UHF_PORT_POWER 8 +#define UHF_PORT_LOW_SPEED 9 +#define UHF_PORT_L1 10 +#define UHF_C_PORT_CONNECTION 16 +#define UHF_C_PORT_ENABLE 17 +#define UHF_C_PORT_SUSPEND 18 +#define UHF_C_PORT_OVER_CURRENT 19 +#define UHF_C_PORT_RESET 20 +#define UHF_PORT_TEST 21 +#define UHF_PORT_INDICATOR 22 +#define UHF_C_PORT_L1 23 + +/* SuperSpeed HUB specific features */ +#define UHF_PORT_U1_TIMEOUT 23 +#define UHF_PORT_U2_TIMEOUT 24 +#define UHF_C_PORT_LINK_STATE 25 +#define UHF_C_PORT_CONFIG_ERROR 26 +#define UHF_PORT_REMOTE_WAKE_MASK 27 +#define UHF_BH_PORT_RESET 28 +#define UHF_C_BH_PORT_RESET 29 +#define UHF_FORCE_LINKPM_ACCEPT 30 + +struct usb_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDescriptorSubtype; +} __packed; +typedef struct usb_descriptor usb_descriptor_t; + +struct usb_device_descriptor { + uByte bLength; + uByte bDescriptorType; + uWord bcdUSB; +#define UD_USB_2_0 0x0200 +#define UD_USB_3_0 0x0300 +#define UD_IS_USB2(d) ((d)->bcdUSB[1] == 0x02) +#define UD_IS_USB3(d) ((d)->bcdUSB[1] == 0x03) + uByte bDeviceClass; + uByte bDeviceSubClass; + uByte bDeviceProtocol; + uByte bMaxPacketSize; + /* The fields below are not part of the initial descriptor. 
*/ + uWord idVendor; + uWord idProduct; + uWord bcdDevice; + uByte iManufacturer; + uByte iProduct; + uByte iSerialNumber; + uByte bNumConfigurations; +} __packed; +typedef struct usb_device_descriptor usb_device_descriptor_t; + +/* Binary Device Object Store (BOS) */ +struct usb_bos_descriptor { + uByte bLength; + uByte bDescriptorType; + uWord wTotalLength; + uByte bNumDeviceCaps; +} __packed; +typedef struct usb_bos_descriptor usb_bos_descriptor_t; + +/* Binary Device Object Store Capability */ +struct usb_bos_cap_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDevCapabilityType; +#define USB_DEVCAP_RESERVED 0x00 +#define USB_DEVCAP_WUSB 0x01 +#define USB_DEVCAP_USB2EXT 0x02 +#define USB_DEVCAP_SUPER_SPEED 0x03 +#define USB_DEVCAP_CONTAINER_ID 0x04 + /* data ... */ +} __packed; +typedef struct usb_bos_cap_descriptor usb_bos_cap_descriptor_t; + +struct usb_devcap_usb2ext_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDevCapabilityType; + uDWord bmAttributes; +#define USB_V2EXT_LPM (1U << 1) +#define USB_V2EXT_BESL_SUPPORTED (1U << 2) +#define USB_V2EXT_BESL_BASELINE_VALID (1U << 3) +#define USB_V2EXT_BESL_DEEP_VALID (1U << 4) +#define USB_V2EXT_BESL_BASELINE_GET(x) (((x) >> 8) & 0xF) +#define USB_V2EXT_BESL_DEEP_GET(x) (((x) >> 12) & 0xF) +} __packed; +typedef struct usb_devcap_usb2ext_descriptor usb_devcap_usb2ext_descriptor_t; + +struct usb_devcap_ss_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDevCapabilityType; + uByte bmAttributes; + uWord wSpeedsSupported; + uByte bFunctionalitySupport; + uByte bU1DevExitLat; + uWord wU2DevExitLat; +} __packed; +typedef struct usb_devcap_ss_descriptor usb_devcap_ss_descriptor_t; + +struct usb_devcap_container_id_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDevCapabilityType; + uByte bReserved; + uByte bContainerID; +} __packed; +typedef struct usb_devcap_container_id_descriptor + usb_devcap_container_id_descriptor_t; + +/* Device class codes */ +#define UDCLASS_IN_INTERFACE 0x00 +#define UDCLASS_COMM 0x02 +#define UDCLASS_HUB 0x09 +#define UDSUBCLASS_HUB 0x00 +#define UDPROTO_FSHUB 0x00 +#define UDPROTO_HSHUBSTT 0x01 +#define UDPROTO_HSHUBMTT 0x02 +#define UDPROTO_SSHUB 0x03 +#define UDCLASS_DIAGNOSTIC 0xdc +#define UDCLASS_WIRELESS 0xe0 +#define UDSUBCLASS_RF 0x01 +#define UDPROTO_BLUETOOTH 0x01 +#define UDCLASS_VENDOR 0xff + +struct usb_config_descriptor { + uByte bLength; + uByte bDescriptorType; + uWord wTotalLength; + uByte bNumInterface; + uByte bConfigurationValue; +#define USB_UNCONFIG_NO 0 + uByte iConfiguration; + uByte bmAttributes; +#define UC_BUS_POWERED 0x80 +#define UC_SELF_POWERED 0x40 +#define UC_REMOTE_WAKEUP 0x20 + uByte bMaxPower; /* max current in 2 mA units */ +#define UC_POWER_FACTOR 2 +} __packed; +typedef struct usb_config_descriptor usb_config_descriptor_t; + +struct usb_interface_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bInterfaceNumber; + uByte bAlternateSetting; + uByte bNumEndpoints; + uByte bInterfaceClass; + uByte bInterfaceSubClass; + uByte bInterfaceProtocol; + uByte iInterface; +} __packed; +typedef struct usb_interface_descriptor usb_interface_descriptor_t; + +struct usb_interface_assoc_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bFirstInterface; + uByte bInterfaceCount; + uByte bFunctionClass; + uByte bFunctionSubClass; + uByte bFunctionProtocol; + uByte iFunction; +} __packed; +typedef struct usb_interface_assoc_descriptor usb_interface_assoc_descriptor_t; + +/* Interface class codes */ +#define 
UICLASS_UNSPEC 0x00 +#define UICLASS_AUDIO 0x01 /* audio */ +#define UISUBCLASS_AUDIOCONTROL 1 +#define UISUBCLASS_AUDIOSTREAM 2 +#define UISUBCLASS_MIDISTREAM 3 + +#define UICLASS_CDC 0x02 /* communication */ +#define UISUBCLASS_DIRECT_LINE_CONTROL_MODEL 1 +#define UISUBCLASS_ABSTRACT_CONTROL_MODEL 2 +#define UISUBCLASS_TELEPHONE_CONTROL_MODEL 3 +#define UISUBCLASS_MULTICHANNEL_CONTROL_MODEL 4 +#define UISUBCLASS_CAPI_CONTROLMODEL 5 +#define UISUBCLASS_ETHERNET_NETWORKING_CONTROL_MODEL 6 +#define UISUBCLASS_ATM_NETWORKING_CONTROL_MODEL 7 +#define UISUBCLASS_WIRELESS_HANDSET_CM 8 +#define UISUBCLASS_DEVICE_MGMT 9 +#define UISUBCLASS_MOBILE_DIRECT_LINE_MODEL 10 +#define UISUBCLASS_OBEX 11 +#define UISUBCLASS_ETHERNET_EMULATION_MODEL 12 +#define UISUBCLASS_NETWORK_CONTROL_MODEL 13 + +#define UIPROTO_CDC_NONE 0 +#define UIPROTO_CDC_AT 1 + +#define UICLASS_HID 0x03 +#define UISUBCLASS_BOOT 1 +#define UIPROTO_BOOT_KEYBOARD 1 +#define UIPROTO_MOUSE 2 + +#define UICLASS_PHYSICAL 0x05 +#define UICLASS_IMAGE 0x06 +#define UISUBCLASS_SIC 1 /* still image class */ +#define UICLASS_PRINTER 0x07 +#define UISUBCLASS_PRINTER 1 +#define UIPROTO_PRINTER_UNI 1 +#define UIPROTO_PRINTER_BI 2 +#define UIPROTO_PRINTER_1284 3 + +#define UICLASS_MASS 0x08 +#define UISUBCLASS_RBC 1 +#define UISUBCLASS_SFF8020I 2 +#define UISUBCLASS_QIC157 3 +#define UISUBCLASS_UFI 4 +#define UISUBCLASS_SFF8070I 5 +#define UISUBCLASS_SCSI 6 +#define UIPROTO_MASS_CBI_I 0 +#define UIPROTO_MASS_CBI 1 +#define UIPROTO_MASS_BBB_OLD 2 /* Not in the spec anymore */ +#define UIPROTO_MASS_BBB 80 /* 'P' for the Iomega Zip drive */ + +#define UICLASS_HUB 0x09 +#define UISUBCLASS_HUB 0 +#define UIPROTO_FSHUB 0 +#define UIPROTO_HSHUBSTT 0 /* Yes, same as previous */ +#define UIPROTO_HSHUBMTT 1 + +#define UICLASS_CDC_DATA 0x0a +#define UISUBCLASS_DATA 0x00 +#define UIPROTO_DATA_ISDNBRI 0x30 /* Physical iface */ +#define UIPROTO_DATA_HDLC 0x31 /* HDLC */ +#define UIPROTO_DATA_TRANSPARENT 0x32 /* Transparent */ +#define UIPROTO_DATA_Q921M 0x50 /* Management for Q921 */ +#define UIPROTO_DATA_Q921 0x51 /* Data for Q921 */ +#define UIPROTO_DATA_Q921TM 0x52 /* TEI multiplexer for Q921 */ +#define UIPROTO_DATA_V42BIS 0x90 /* Data compression */ +#define UIPROTO_DATA_Q931 0x91 /* Euro-ISDN */ +#define UIPROTO_DATA_V120 0x92 /* V.24 rate adaption */ +#define UIPROTO_DATA_CAPI 0x93 /* CAPI 2.0 commands */ +#define UIPROTO_DATA_HOST_BASED 0xfd /* Host based driver */ +#define UIPROTO_DATA_PUF 0xfe /* see Prot. Unit Func. Desc. 
*/ +#define UIPROTO_DATA_VENDOR 0xff /* Vendor specific */ +#define UIPROTO_DATA_NCM 0x01 /* Network Control Model */ + +#define UICLASS_SMARTCARD 0x0b +#define UICLASS_FIRM_UPD 0x0c +#define UICLASS_SECURITY 0x0d +#define UICLASS_DIAGNOSTIC 0xdc +#define UICLASS_WIRELESS 0xe0 +#define UISUBCLASS_RF 0x01 +#define UIPROTO_BLUETOOTH 0x01 +#define UIPROTO_RNDIS 0x03 + +#define UICLASS_IAD 0xEF /* Interface Association Descriptor */ +#define UISUBCLASS_SYNC 0x01 +#define UIPROTO_ACTIVESYNC 0x01 + +#define UICLASS_APPL_SPEC 0xfe +#define UISUBCLASS_FIRMWARE_DOWNLOAD 1 +#define UISUBCLASS_IRDA 2 +#define UIPROTO_IRDA 0 + +#define UICLASS_VENDOR 0xff +#define UISUBCLASS_XBOX360_CONTROLLER 0x5d +#define UIPROTO_XBOX360_GAMEPAD 0x01 + +struct usb_endpoint_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bEndpointAddress; +#define UE_GET_DIR(a) ((a) & 0x80) +#define UE_SET_DIR(a,d) ((a) | (((d)&1) << 7)) +#define UE_DIR_IN 0x80 /* IN-token endpoint, fixed */ +#define UE_DIR_OUT 0x00 /* OUT-token endpoint, fixed */ +#define UE_DIR_RX 0xfd /* for internal use only! */ +#define UE_DIR_TX 0xfe /* for internal use only! */ +#define UE_DIR_ANY 0xff /* for internal use only! */ +#define UE_ADDR 0x0f +#define UE_ADDR_ANY 0xff /* for internal use only! */ +#define UE_GET_ADDR(a) ((a) & UE_ADDR) + uByte bmAttributes; +#define UE_XFERTYPE 0x03 +#define UE_CONTROL 0x00 +#define UE_ISOCHRONOUS 0x01 +#define UE_BULK 0x02 +#define UE_INTERRUPT 0x03 +#define UE_BULK_INTR 0xfe /* for internal use only! */ +#define UE_TYPE_ANY 0xff /* for internal use only! */ +#define UE_GET_XFERTYPE(a) ((a) & UE_XFERTYPE) +#define UE_ISO_TYPE 0x0c +#define UE_ISO_ASYNC 0x04 +#define UE_ISO_ADAPT 0x08 +#define UE_ISO_SYNC 0x0c +#define UE_GET_ISO_TYPE(a) ((a) & UE_ISO_TYPE) +#define UE_ISO_USAGE 0x30 +#define UE_ISO_USAGE_DATA 0x00 +#define UE_ISO_USAGE_FEEDBACK 0x10 +#define UE_ISO_USAGE_IMPLICT_FB 0x20 +#define UE_GET_ISO_USAGE(a) ((a) & UE_ISO_USAGE) + uWord wMaxPacketSize; +#define UE_ZERO_MPS 0xFFFF /* for internal use only */ + uByte bInterval; +} __packed; +typedef struct usb_endpoint_descriptor usb_endpoint_descriptor_t; + +struct usb_endpoint_ss_comp_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bMaxBurst; + uByte bmAttributes; +#define UE_GET_BULK_STREAMS(x) ((x) & 0x0F) +#define UE_GET_SS_ISO_MULT(x) ((x) & 0x03) + uWord wBytesPerInterval; +} __packed; +typedef struct usb_endpoint_ss_comp_descriptor + usb_endpoint_ss_comp_descriptor_t; + +struct usb_string_descriptor { + uByte bLength; + uByte bDescriptorType; + uWord bString[126]; + uByte bUnused; +} __packed; +typedef struct usb_string_descriptor usb_string_descriptor_t; + +#define USB_MAKE_STRING_DESC(m,name) \ +static const struct { \ + uByte bLength; \ + uByte bDescriptorType; \ + uByte bData[sizeof((uint8_t []){m})]; \ +} __packed name = { \ + .bLength = sizeof(name), \ + .bDescriptorType = UDESC_STRING, \ + .bData = { m }, \ +} + +struct usb_string_lang { + uByte bLength; + uByte bDescriptorType; + uByte bData[2]; +} __packed; +typedef struct usb_string_lang usb_string_lang_t; + +struct usb_hub_descriptor { + uByte bDescLength; + uByte bDescriptorType; + uByte bNbrPorts; + uWord wHubCharacteristics; +#define UHD_PWR 0x0003 +#define UHD_PWR_GANGED 0x0000 +#define UHD_PWR_INDIVIDUAL 0x0001 +#define UHD_PWR_NO_SWITCH 0x0002 +#define UHD_COMPOUND 0x0004 +#define UHD_OC 0x0018 +#define UHD_OC_GLOBAL 0x0000 +#define UHD_OC_INDIVIDUAL 0x0008 +#define UHD_OC_NONE 0x0010 +#define UHD_TT_THINK 0x0060 +#define UHD_TT_THINK_8 0x0000 +#define 
UHD_TT_THINK_16 0x0020 +#define UHD_TT_THINK_24 0x0040 +#define UHD_TT_THINK_32 0x0060 +#define UHD_PORT_IND 0x0080 + uByte bPwrOn2PwrGood; /* delay in 2 ms units */ +#define UHD_PWRON_FACTOR 2 + uByte bHubContrCurrent; + uByte DeviceRemovable[32]; /* max 255 ports */ +#define UHD_NOT_REMOV(desc, i) \ + (((desc)->DeviceRemovable[(i)/8] >> ((i) % 8)) & 1) + uByte PortPowerCtrlMask[1]; /* deprecated */ +} __packed; +typedef struct usb_hub_descriptor usb_hub_descriptor_t; + +struct usb_hub_ss_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bNbrPorts; + uWord wHubCharacteristics; + uByte bPwrOn2PwrGood; /* delay in 2 ms units */ + uByte bHubContrCurrent; + uByte bHubHdrDecLat; + uWord wHubDelay; + uByte DeviceRemovable[32]; /* max 255 ports */ +} __packed; +typedef struct usb_hub_ss_descriptor usb_hub_ss_descriptor_t; + +/* minimum HUB descriptor (8-ports maximum) */ +struct usb_hub_descriptor_min { + uByte bDescLength; + uByte bDescriptorType; + uByte bNbrPorts; + uWord wHubCharacteristics; + uByte bPwrOn2PwrGood; + uByte bHubContrCurrent; + uByte DeviceRemovable[1]; + uByte PortPowerCtrlMask[1]; +} __packed; +typedef struct usb_hub_descriptor_min usb_hub_descriptor_min_t; + +struct usb_device_qualifier { + uByte bLength; + uByte bDescriptorType; + uWord bcdUSB; + uByte bDeviceClass; + uByte bDeviceSubClass; + uByte bDeviceProtocol; + uByte bMaxPacketSize0; + uByte bNumConfigurations; + uByte bReserved; +} __packed; +typedef struct usb_device_qualifier usb_device_qualifier_t; + +struct usb_otg_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bmAttributes; +#define UOTG_SRP 0x01 +#define UOTG_HNP 0x02 +} __packed; +typedef struct usb_otg_descriptor usb_otg_descriptor_t; + +/* OTG feature selectors */ +#define UOTG_B_HNP_ENABLE 3 +#define UOTG_A_HNP_SUPPORT 4 +#define UOTG_A_ALT_HNP_SUPPORT 5 + +struct usb_status { + uWord wStatus; +/* Device status flags */ +#define UDS_SELF_POWERED 0x0001 +#define UDS_REMOTE_WAKEUP 0x0002 +/* Endpoint status flags */ +#define UES_HALT 0x0001 +} __packed; +typedef struct usb_status usb_status_t; + +struct usb_hub_status { + uWord wHubStatus; +#define UHS_LOCAL_POWER 0x0001 +#define UHS_OVER_CURRENT 0x0002 + uWord wHubChange; +} __packed; +typedef struct usb_hub_status usb_hub_status_t; + +struct usb_port_status { + uWord wPortStatus; +#define UPS_CURRENT_CONNECT_STATUS 0x0001 +#define UPS_PORT_ENABLED 0x0002 +#define UPS_SUSPEND 0x0004 +#define UPS_OVERCURRENT_INDICATOR 0x0008 +#define UPS_RESET 0x0010 +#define UPS_PORT_L1 0x0020 /* USB 2.0 only */ +/* The link-state bits are valid for Super-Speed USB HUBs */ +#define UPS_PORT_LINK_STATE_GET(x) (((x) >> 5) & 0xF) +#define UPS_PORT_LINK_STATE_SET(x) (((x) & 0xF) << 5) +#define UPS_PORT_LS_U0 0x00 +#define UPS_PORT_LS_U1 0x01 +#define UPS_PORT_LS_U2 0x02 +#define UPS_PORT_LS_U3 0x03 +#define UPS_PORT_LS_SS_DIS 0x04 +#define UPS_PORT_LS_RX_DET 0x05 +#define UPS_PORT_LS_SS_INA 0x06 +#define UPS_PORT_LS_POLL 0x07 +#define UPS_PORT_LS_RECOVER 0x08 +#define UPS_PORT_LS_HOT_RST 0x09 +#define UPS_PORT_LS_COMP_MODE 0x0A +#define UPS_PORT_LS_LOOPBACK 0x0B +#define UPS_PORT_LS_RESUME 0x0F +#define UPS_PORT_POWER 0x0100 +#define UPS_PORT_POWER_SS 0x0200 /* super-speed only */ +#define UPS_LOW_SPEED 0x0200 +#define UPS_HIGH_SPEED 0x0400 +#define UPS_OTHER_SPEED 0x0600 /* currently FreeBSD specific */ +#define UPS_PORT_TEST 0x0800 +#define UPS_PORT_INDICATOR 0x1000 +#define UPS_PORT_MODE_DEVICE 0x8000 /* currently FreeBSD specific */ + uWord wPortChange; +#define UPS_C_CONNECT_STATUS 0x0001 
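/*
 * Decoding sketch (not part of the imported header): wPortStatus and
 * wPortChange are uWord byte pairs, so consumers read them with UGETW()
 * from dev/usb/usb_endian.h (also added by this patch) rather than
 * casting to uint16_t.  With a hypothetical struct usb_port_status
 * pointer "ps", a freshly plugged-in device shows up as:
 *
 *	(UGETW(ps->wPortStatus) & UPS_CURRENT_CONNECT_STATUS) != 0 &&
 *	(UGETW(ps->wPortChange) & UPS_C_CONNECT_STATUS) != 0
 */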
+#define UPS_C_PORT_ENABLED 0x0002 +#define UPS_C_SUSPEND 0x0004 +#define UPS_C_OVERCURRENT_INDICATOR 0x0008 +#define UPS_C_PORT_RESET 0x0010 +#define UPS_C_PORT_L1 0x0020 /* USB 2.0 only */ +#define UPS_C_BH_PORT_RESET 0x0020 /* USB 3.0 only */ +#define UPS_C_PORT_LINK_STATE 0x0040 +#define UPS_C_PORT_CONFIG_ERROR 0x0080 +} __packed; +typedef struct usb_port_status usb_port_status_t; + +/* + * The "USB_SPEED" macros defines all the supported USB speeds. + */ +enum usb_dev_speed { + USB_SPEED_VARIABLE, + USB_SPEED_LOW, + USB_SPEED_FULL, + USB_SPEED_HIGH, + USB_SPEED_SUPER, +}; +#define USB_SPEED_MAX (USB_SPEED_SUPER+1) + +/* + * The "USB_REV" macros defines all the supported USB revisions. + */ +enum usb_revision { + USB_REV_UNKNOWN, + USB_REV_PRE_1_0, + USB_REV_1_0, + USB_REV_1_1, + USB_REV_2_0, + USB_REV_2_5, + USB_REV_3_0 +}; +#define USB_REV_MAX (USB_REV_3_0+1) + +/* + * Supported host controller modes. + */ +enum usb_hc_mode { + USB_MODE_HOST, /* initiates transfers */ + USB_MODE_DEVICE, /* bus transfer target */ + USB_MODE_DUAL /* can be host or device */ +}; +#define USB_MODE_MAX (USB_MODE_DUAL+1) + +/* + * The "USB_STATE" enums define all the supported device states. + */ +enum usb_dev_state { + USB_STATE_DETACHED, + USB_STATE_ATTACHED, + USB_STATE_POWERED, + USB_STATE_ADDRESSED, + USB_STATE_CONFIGURED, +}; +#define USB_STATE_MAX (USB_STATE_CONFIGURED+1) + +/* + * The "USB_EP_MODE" macros define all the currently supported + * endpoint modes. + */ +enum usb_ep_mode { + USB_EP_MODE_DEFAULT, + USB_EP_MODE_STREAMS, /* USB3.0 specific */ + USB_EP_MODE_HW_MASS_STORAGE, + USB_EP_MODE_HW_SERIAL, + USB_EP_MODE_HW_ETHERNET_CDC, + USB_EP_MODE_HW_ETHERNET_NCM, + USB_EP_MODE_MAX +}; +#endif /* _USB_STANDARD_H_ */ diff --git a/usr/contrib/freebsd/dev/usb/usb_endian.h b/usr/contrib/freebsd/dev/usb/usb_endian.h new file mode 100644 index 0000000000..0bbcb9bf82 --- /dev/null +++ b/usr/contrib/freebsd/dev/usb/usb_endian.h @@ -0,0 +1,121 @@ +/* $FreeBSD$ */ +/* + * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _USB_ENDIAN_H_ +#define _USB_ENDIAN_H_ + +#ifndef USB_GLOBAL_INCLUDE_FILE +#include +#include +#endif + +/* + * Declare the basic USB record types. 
USB records have an alignment + * of 1 byte and are always packed. + */ +typedef uint8_t uByte; +typedef uint8_t uWord[2]; +typedef uint8_t uDWord[4]; +typedef uint8_t uQWord[8]; + +/* + * Define a set of macros that can get and set data independent of + * CPU endianness and CPU alignment requirements: + */ +#define UGETB(w) \ + ((w)[0]) + +#define UGETW(w) \ + ((w)[0] | \ + (((uint16_t)((w)[1])) << 8)) + +#define UGETDW(w) \ + ((w)[0] | \ + (((uint16_t)((w)[1])) << 8) | \ + (((uint32_t)((w)[2])) << 16) | \ + (((uint32_t)((w)[3])) << 24)) + +#define UGETQW(w) \ + ((w)[0] | \ + (((uint16_t)((w)[1])) << 8) | \ + (((uint32_t)((w)[2])) << 16) | \ + (((uint32_t)((w)[3])) << 24) | \ + (((uint64_t)((w)[4])) << 32) | \ + (((uint64_t)((w)[5])) << 40) | \ + (((uint64_t)((w)[6])) << 48) | \ + (((uint64_t)((w)[7])) << 56)) + +#define USETB(w,v) do { \ + (w)[0] = (uint8_t)(v); \ +} while (0) + +#define USETW(w,v) do { \ + (w)[0] = (uint8_t)(v); \ + (w)[1] = (uint8_t)((v) >> 8); \ +} while (0) + +#define USETDW(w,v) do { \ + (w)[0] = (uint8_t)(v); \ + (w)[1] = (uint8_t)((v) >> 8); \ + (w)[2] = (uint8_t)((v) >> 16); \ + (w)[3] = (uint8_t)((v) >> 24); \ +} while (0) + +#define USETQW(w,v) do { \ + (w)[0] = (uint8_t)(v); \ + (w)[1] = (uint8_t)((v) >> 8); \ + (w)[2] = (uint8_t)((v) >> 16); \ + (w)[3] = (uint8_t)((v) >> 24); \ + (w)[4] = (uint8_t)((v) >> 32); \ + (w)[5] = (uint8_t)((v) >> 40); \ + (w)[6] = (uint8_t)((v) >> 48); \ + (w)[7] = (uint8_t)((v) >> 56); \ +} while (0) + +#define USETW2(w,b1,b0) do { \ + (w)[0] = (uint8_t)(b0); \ + (w)[1] = (uint8_t)(b1); \ +} while (0) + +#define USETW4(w,b3,b2,b1,b0) do { \ + (w)[0] = (uint8_t)(b0); \ + (w)[1] = (uint8_t)(b1); \ + (w)[2] = (uint8_t)(b2); \ + (w)[3] = (uint8_t)(b3); \ +} while (0) + +#define USETW8(w,b7,b6,b5,b4,b3,b2,b1,b0) do { \ + (w)[0] = (uint8_t)(b0); \ + (w)[1] = (uint8_t)(b1); \ + (w)[2] = (uint8_t)(b2); \ + (w)[3] = (uint8_t)(b3); \ + (w)[4] = (uint8_t)(b4); \ + (w)[5] = (uint8_t)(b5); \ + (w)[6] = (uint8_t)(b6); \ + (w)[7] = (uint8_t)(b7); \ +} while (0) + +#endif /* _USB_ENDIAN_H_ */ diff --git a/usr/contrib/freebsd/dev/usb/usb_freebsd.h b/usr/contrib/freebsd/dev/usb/usb_freebsd.h new file mode 100644 index 0000000000..3bc9d2c1eb --- /dev/null +++ b/usr/contrib/freebsd/dev/usb/usb_freebsd.h @@ -0,0 +1,101 @@ +/* $FreeBSD$ */ +/*- + * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Including this file is mandatory for all USB related c-files in the kernel. + */ + +#ifndef _USB_FREEBSD_H_ +#define _USB_FREEBSD_H_ + +/* Default USB configuration */ +#define USB_HAVE_UGEN 1 +#define USB_HAVE_DEVCTL 1 +#define USB_HAVE_BUSDMA 1 +#define USB_HAVE_COMPAT_LINUX 1 +#define USB_HAVE_USER_IO 1 +#define USB_HAVE_MBUF 1 +#define USB_HAVE_TT_SUPPORT 1 +#define USB_HAVE_POWERD 1 +#define USB_HAVE_MSCTEST 1 +#define USB_HAVE_MSCTEST_DETACH 1 +#define USB_HAVE_PF 1 +#define USB_HAVE_ROOT_MOUNT_HOLD 1 +#define USB_HAVE_ID_SECTION 1 +#define USB_HAVE_PER_BUS_PROCESS 1 +#define USB_HAVE_FIXED_ENDPOINT 0 +#define USB_HAVE_FIXED_IFACE 0 +#define USB_HAVE_FIXED_CONFIG 0 +#define USB_HAVE_FIXED_PORT 0 +#define USB_HAVE_DISABLE_ENUM 1 + +/* define zero ticks callout value */ +#define USB_CALLOUT_ZERO_TICKS 1 + +#define USB_TD_GET_PROC(td) (td)->td_proc +#define USB_PROC_GET_GID(td) (td)->p_pgid + +#if (!defined(USB_HOST_ALIGN)) || (USB_HOST_ALIGN <= 0) +/* Use default value. */ +#undef USB_HOST_ALIGN +#if defined(__arm__) || defined(__mips__) || defined(__powerpc__) +#define USB_HOST_ALIGN 32 /* Arm and MIPS need at least this much, if not more */ +#else +#define USB_HOST_ALIGN 8 /* bytes, must be power of two */ +#endif +#endif +/* Sanity check for USB_HOST_ALIGN: Verify power of two. */ +#if ((-USB_HOST_ALIGN) & USB_HOST_ALIGN) != USB_HOST_ALIGN +#error "USB_HOST_ALIGN is not power of two." +#endif +#define USB_FS_ISOC_UFRAME_MAX 4 /* exclusive unit */ +#define USB_BUS_MAX 256 /* units */ +#define USB_MAX_DEVICES 128 /* units */ +#define USB_CONFIG_MAX 65535 /* bytes */ +#define USB_IFACE_MAX 32 /* units */ +#define USB_FIFO_MAX 128 /* units */ +#define USB_MAX_EP_STREAMS 8 /* units */ +#define USB_MAX_EP_UNITS 32 /* units */ +#define USB_MAX_PORTS 255 /* units */ + +#define USB_MAX_FS_ISOC_FRAMES_PER_XFER (120) /* units */ +#define USB_MAX_HS_ISOC_FRAMES_PER_XFER (8*120) /* units */ + +#define USB_HUB_MAX_DEPTH 5 +#define USB_EP0_BUFSIZE 1024 /* bytes */ +#define USB_CS_RESET_LIMIT 20 /* failures = 20 * 50 ms = 1sec */ + +#define USB_MAX_AUTO_QUIRK 8 /* maximum number of dynamic quirks */ + +typedef uint32_t usb_timeout_t; /* milliseconds */ +typedef uint32_t usb_frlength_t; /* bytes */ +typedef uint32_t usb_frcount_t; /* units */ +typedef uint32_t usb_size_t; /* bytes */ +typedef uint32_t usb_ticks_t; /* system defined */ +typedef uint16_t usb_power_mask_t; /* see "USB_HW_POWER_XXX" */ +typedef uint16_t usb_stream_t; /* stream ID */ + +#endif /* _USB_FREEBSD_H_ */ diff --git a/usr/contrib/freebsd/dev/usb/usbdi.h b/usr/contrib/freebsd/dev/usb/usbdi.h new file mode 100644 index 0000000000..202ad89fa7 --- /dev/null +++ b/usr/contrib/freebsd/dev/usb/usbdi.h @@ -0,0 +1,657 @@ +/*- + * Copyright (c) 2009 Andrew Thompson + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ +#ifndef _USB_USBDI_H_ +#define _USB_USBDI_H_ + +struct usb_fifo; +struct usb_xfer; +struct usb_device; +struct usb_attach_arg; +struct usb_interface; +struct usb_endpoint; +struct usb_page_cache; +struct usb_page_search; +struct usb_process; +struct usb_proc_msg; +struct usb_mbuf; +struct usb_fs_privdata; +struct mbuf; + +typedef enum { /* keep in sync with usb_errstr_table */ + USB_ERR_NORMAL_COMPLETION = 0, + USB_ERR_PENDING_REQUESTS, /* 1 */ + USB_ERR_NOT_STARTED, /* 2 */ + USB_ERR_INVAL, /* 3 */ + USB_ERR_NOMEM, /* 4 */ + USB_ERR_CANCELLED, /* 5 */ + USB_ERR_BAD_ADDRESS, /* 6 */ + USB_ERR_BAD_BUFSIZE, /* 7 */ + USB_ERR_BAD_FLAG, /* 8 */ + USB_ERR_NO_CALLBACK, /* 9 */ + USB_ERR_IN_USE, /* 10 */ + USB_ERR_NO_ADDR, /* 11 */ + USB_ERR_NO_PIPE, /* 12 */ + USB_ERR_ZERO_NFRAMES, /* 13 */ + USB_ERR_ZERO_MAXP, /* 14 */ + USB_ERR_SET_ADDR_FAILED, /* 15 */ + USB_ERR_NO_POWER, /* 16 */ + USB_ERR_TOO_DEEP, /* 17 */ + USB_ERR_IOERROR, /* 18 */ + USB_ERR_NOT_CONFIGURED, /* 19 */ + USB_ERR_TIMEOUT, /* 20 */ + USB_ERR_SHORT_XFER, /* 21 */ + USB_ERR_STALLED, /* 22 */ + USB_ERR_INTERRUPTED, /* 23 */ + USB_ERR_DMA_LOAD_FAILED, /* 24 */ + USB_ERR_BAD_CONTEXT, /* 25 */ + USB_ERR_NO_ROOT_HUB, /* 26 */ + USB_ERR_NO_INTR_THREAD, /* 27 */ + USB_ERR_NOT_LOCKED, /* 28 */ + USB_ERR_MAX +} usb_error_t; + +/* + * Flags for transfers + */ +#define USB_FORCE_SHORT_XFER 0x0001 /* force a short transmit last */ +#define USB_SHORT_XFER_OK 0x0004 /* allow short reads */ +#define USB_DELAY_STATUS_STAGE 0x0010 /* insert delay before STATUS stage */ +#define USB_USER_DATA_PTR 0x0020 /* internal flag */ +#define USB_MULTI_SHORT_OK 0x0040 /* allow multiple short frames */ +#define USB_MANUAL_STATUS 0x0080 /* manual ctrl status */ + +#define USB_NO_TIMEOUT 0 +#define USB_DEFAULT_TIMEOUT 5000 /* 5000 ms = 5 seconds */ + +#if defined(_KERNEL) +/* typedefs */ + +typedef void (usb_callback_t)(struct usb_xfer *, usb_error_t); +typedef void (usb_proc_callback_t)(struct usb_proc_msg *); +typedef usb_error_t (usb_handle_req_t)(struct usb_device *, + struct usb_device_request *, const void **, uint16_t *); + +typedef int (usb_fifo_open_t)(struct usb_fifo *fifo, int fflags); +typedef void (usb_fifo_close_t)(struct usb_fifo *fifo, int fflags); +typedef int (usb_fifo_ioctl_t)(struct usb_fifo *fifo, u_long cmd, void *addr, int fflags); +typedef void (usb_fifo_cmd_t)(struct usb_fifo *fifo); +typedef void (usb_fifo_filter_t)(struct usb_fifo 
*fifo, struct usb_mbuf *m);
+
+
+/* USB events */
+#ifndef USB_GLOBAL_INCLUDE_FILE
+#include
+#endif
+typedef void (*usb_dev_configured_t)(void *, struct usb_device *,
+    struct usb_attach_arg *);
+EVENTHANDLER_DECLARE(usb_dev_configured, usb_dev_configured_t);
+
+/*
+ * The following macros are used to convert milliseconds into
+ * HZ. We use 1024 instead of 1000 milliseconds per second to save a
+ * full division.
+ */
+#define USB_MS_HZ 1024
+
+#define USB_MS_TO_TICKS(ms) \
+ (((uint32_t)((((uint32_t)(ms)) * ((uint32_t)(hz))) + USB_MS_HZ - 1)) / USB_MS_HZ)
+
+/*
+ * Common queue structure for USB transfers.
+ */
+struct usb_xfer_queue {
+	TAILQ_HEAD(, usb_xfer) head;
+	struct usb_xfer *curr;	/* current USB transfer processed */
+	void (*command) (struct usb_xfer_queue *pq);
+	uint8_t recurse_1:1;
+	uint8_t recurse_2:1;
+	uint8_t recurse_3:1;
+	uint8_t reserved:5;
+};
+
+/*
+ * The following structure defines an USB endpoint.
+ */
+struct usb_endpoint {
+	/* queue of USB transfers */
+	struct usb_xfer_queue endpoint_q[USB_MAX_EP_STREAMS];
+
+	struct usb_endpoint_descriptor *edesc;
+	struct usb_endpoint_ss_comp_descriptor *ecomp;
+	const struct usb_pipe_methods *methods;	/* set by HC driver */
+
+	uint16_t isoc_next;
+
+	uint8_t toggle_next:1;	/* next data toggle value */
+	uint8_t is_stalled:1;	/* set if endpoint is stalled */
+	uint8_t is_synced:1;	/* set if we are synchronised */
+	uint8_t unused:5;
+	uint8_t iface_index;	/* not used by "default endpoint" */
+
+	uint8_t refcount_alloc;	/* allocation refcount */
+	uint8_t refcount_bw;	/* bandwidth refcount */
+#define USB_EP_REF_MAX 0x3f
+
+	/* High-Speed resource allocation (valid if "refcount_bw" > 0) */
+
+	uint8_t usb_smask;	/* USB start mask */
+	uint8_t usb_cmask;	/* USB complete mask */
+	uint8_t usb_uframe;	/* USB microframe */
+
+	/* USB endpoint mode, see USB_EP_MODE_XXX */
+
+	uint8_t ep_mode;
+};
+
+/*
+ * The following structure defines an USB interface.
+ */
+struct usb_interface {
+	struct usb_interface_descriptor *idesc;
+	device_t subdev;
+	uint8_t alt_index;
+	uint8_t parent_iface_index;
+
+	/* Linux compat */
+	struct usb_host_interface *altsetting;
+	struct usb_host_interface *cur_altsetting;
+	struct usb_device *linux_udev;
+	void *bsd_priv_sc;	/* device specific information */
+	char *pnpinfo;		/* additional PnP-info for this interface */
+	uint8_t num_altsetting;	/* number of alternate settings */
+	uint8_t bsd_iface_index;
+};
+
+/*
+ * The following structure defines a set of USB transfer flags.
+ */
+struct usb_xfer_flags {
+	uint8_t force_short_xfer:1;	/* force a short transmit transfer
+					 * last */
+	uint8_t short_xfer_ok:1;	/* allow short receive transfers */
+	uint8_t short_frames_ok:1;	/* allow short frames */
+	uint8_t pipe_bof:1;	/* block pipe on failure */
+	uint8_t proxy_buffer:1;	/* makes buffer size a factor of
+				 * "max_frame_size" */
+	uint8_t ext_buffer:1;	/* uses external DMA buffer */
+	uint8_t manual_status:1;	/* non automatic status stage on
+					 * control transfers */
+	uint8_t no_pipe_ok:1;	/* set if "USB_ERR_NO_PIPE" error can
+				 * be ignored */
+	uint8_t stall_pipe:1;	/* set if the endpoint belonging to
+				 * this USB transfer should be stalled
+				 * before starting this transfer! */
+	uint8_t pre_scale_frames:1;	/* "usb_config->frames" is
+					 * assumed to give the
+					 * buffering time in
+					 * milliseconds and is
+					 * converted into the nearest
+					 * number of frames when the
+					 * USB transfer is setup. This
+					 * option only has effect for
+					 * ISOCHRONOUS transfers.
+ */ +}; + +/* + * The following structure define an USB configuration, that basically + * is used when setting up an USB transfer. + */ +struct usb_config { + usb_callback_t *callback; /* USB transfer callback */ + usb_frlength_t bufsize; /* total pipe buffer size in bytes */ + usb_frcount_t frames; /* maximum number of USB frames */ + usb_timeout_t interval; /* interval in milliseconds */ +#define USB_DEFAULT_INTERVAL 0 + usb_timeout_t timeout; /* transfer timeout in milliseconds */ + struct usb_xfer_flags flags; /* transfer flags */ + usb_stream_t stream_id; /* USB3.0 specific */ + enum usb_hc_mode usb_mode; /* host or device mode */ + uint8_t type; /* pipe type */ + uint8_t endpoint; /* pipe number */ + uint8_t direction; /* pipe direction */ + uint8_t ep_index; /* pipe index match to use */ + uint8_t if_index; /* "ifaces" index to use */ +}; + +/* + * Use these macro when defining USB device ID arrays if you want to + * have your driver module automatically loaded in host, device or + * both modes respectively: + */ +#if USB_HAVE_ID_SECTION +#define STRUCT_USB_HOST_ID \ + struct usb_device_id __section("usb_host_id") +#define STRUCT_USB_DEVICE_ID \ + struct usb_device_id __section("usb_device_id") +#define STRUCT_USB_DUAL_ID \ + struct usb_device_id __section("usb_dual_id") +#else +#define STRUCT_USB_HOST_ID \ + struct usb_device_id +#define STRUCT_USB_DEVICE_ID \ + struct usb_device_id +#define STRUCT_USB_DUAL_ID \ + struct usb_device_id +#endif /* USB_HAVE_ID_SECTION */ + +/* + * The following structure is used when looking up an USB driver for + * an USB device. It is inspired by the Linux structure called + * "usb_device_id". + */ +struct usb_device_id { + + /* Select which fields to match against */ +#if BYTE_ORDER == LITTLE_ENDIAN + uint16_t + match_flag_vendor:1, + match_flag_product:1, + match_flag_dev_lo:1, + match_flag_dev_hi:1, + + match_flag_dev_class:1, + match_flag_dev_subclass:1, + match_flag_dev_protocol:1, + match_flag_int_class:1, + + match_flag_int_subclass:1, + match_flag_int_protocol:1, + match_flag_unused:6; +#else + uint16_t + match_flag_unused:6, + match_flag_int_protocol:1, + match_flag_int_subclass:1, + + match_flag_int_class:1, + match_flag_dev_protocol:1, + match_flag_dev_subclass:1, + match_flag_dev_class:1, + + match_flag_dev_hi:1, + match_flag_dev_lo:1, + match_flag_product:1, + match_flag_vendor:1; +#endif + + /* Used for product specific matches; the BCD range is inclusive */ + uint16_t idVendor; + uint16_t idProduct; + uint16_t bcdDevice_lo; + uint16_t bcdDevice_hi; + + /* Used for device class matches */ + uint8_t bDeviceClass; + uint8_t bDeviceSubClass; + uint8_t bDeviceProtocol; + + /* Used for interface class matches */ + uint8_t bInterfaceClass; + uint8_t bInterfaceSubClass; + uint8_t bInterfaceProtocol; + +#if USB_HAVE_COMPAT_LINUX + /* which fields to match against */ + uint16_t match_flags; +#define USB_DEVICE_ID_MATCH_VENDOR 0x0001 +#define USB_DEVICE_ID_MATCH_PRODUCT 0x0002 +#define USB_DEVICE_ID_MATCH_DEV_LO 0x0004 +#define USB_DEVICE_ID_MATCH_DEV_HI 0x0008 +#define USB_DEVICE_ID_MATCH_DEV_CLASS 0x0010 +#define USB_DEVICE_ID_MATCH_DEV_SUBCLASS 0x0020 +#define USB_DEVICE_ID_MATCH_DEV_PROTOCOL 0x0040 +#define USB_DEVICE_ID_MATCH_INT_CLASS 0x0080 +#define USB_DEVICE_ID_MATCH_INT_SUBCLASS 0x0100 +#define USB_DEVICE_ID_MATCH_INT_PROTOCOL 0x0200 +#endif + + /* Hook for driver specific information */ + unsigned long driver_info; +} __aligned(32); + +#define USB_STD_PNP_INFO "M16:mask;U16:vendor;U16:product;L16:product;G16:product;" \ + 
"U8:devclass;U8:devsubclass;U8:devprotocol;" \ + "U8:intclass;U8:intsubclass;U8:intprotocol;" +#define USB_STD_PNP_HOST_INFO USB_STD_PNP_INFO "T:mode=host;" +#define USB_STD_PNP_DEVICE_INFO USB_STD_PNP_INFO "T:mode=device;" +#define USB_PNP_HOST_INFO(table) \ + MODULE_PNP_INFO(USB_STD_PNP_HOST_INFO, usb, table, table, sizeof(table[0]), \ + sizeof(table) / sizeof(table[0])) +#define USB_PNP_DEVICE_INFO(table) \ + MODULE_PNP_INFO(USB_STD_PNP_DEVICE_INFO, usb, table, table, sizeof(table[0]), \ + sizeof(table) / sizeof(table[0])) +#define USB_PNP_DUAL_INFO(table) \ + MODULE_PNP_INFO(USB_STD_PNP_INFO, usb, table, table, sizeof(table[0]), \ + sizeof(table) / sizeof(table[0])) + +/* check that the size of the structure above is correct */ +extern char usb_device_id_assert[(sizeof(struct usb_device_id) == 32) ? 1 : -1]; + +#define USB_VENDOR(vend) \ + .match_flag_vendor = 1, .idVendor = (vend) + +#define USB_PRODUCT(prod) \ + .match_flag_product = 1, .idProduct = (prod) + +#define USB_VP(vend,prod) \ + USB_VENDOR(vend), USB_PRODUCT(prod) + +#define USB_VPI(vend,prod,info) \ + USB_VENDOR(vend), USB_PRODUCT(prod), USB_DRIVER_INFO(info) + +#define USB_DEV_BCD_GTEQ(lo) /* greater than or equal */ \ + .match_flag_dev_lo = 1, .bcdDevice_lo = (lo) + +#define USB_DEV_BCD_LTEQ(hi) /* less than or equal */ \ + .match_flag_dev_hi = 1, .bcdDevice_hi = (hi) + +#define USB_DEV_CLASS(dc) \ + .match_flag_dev_class = 1, .bDeviceClass = (dc) + +#define USB_DEV_SUBCLASS(dsc) \ + .match_flag_dev_subclass = 1, .bDeviceSubClass = (dsc) + +#define USB_DEV_PROTOCOL(dp) \ + .match_flag_dev_protocol = 1, .bDeviceProtocol = (dp) + +#define USB_IFACE_CLASS(ic) \ + .match_flag_int_class = 1, .bInterfaceClass = (ic) + +#define USB_IFACE_SUBCLASS(isc) \ + .match_flag_int_subclass = 1, .bInterfaceSubClass = (isc) + +#define USB_IFACE_PROTOCOL(ip) \ + .match_flag_int_protocol = 1, .bInterfaceProtocol = (ip) + +#define USB_IF_CSI(class,subclass,info) \ + USB_IFACE_CLASS(class), USB_IFACE_SUBCLASS(subclass), USB_DRIVER_INFO(info) + +#define USB_DRIVER_INFO(n) \ + .driver_info = (n) + +#define USB_GET_DRIVER_INFO(did) \ + (did)->driver_info + +/* + * The following structure keeps information that is used to match + * against an array of "usb_device_id" elements. + */ +struct usbd_lookup_info { + uint16_t idVendor; + uint16_t idProduct; + uint16_t bcdDevice; + uint8_t bDeviceClass; + uint8_t bDeviceSubClass; + uint8_t bDeviceProtocol; + uint8_t bInterfaceClass; + uint8_t bInterfaceSubClass; + uint8_t bInterfaceProtocol; + uint8_t bIfaceIndex; + uint8_t bIfaceNum; + uint8_t bConfigIndex; + uint8_t bConfigNum; +}; + +/* Structure used by probe and attach */ + +struct usb_attach_arg { + struct usbd_lookup_info info; + device_t temp_dev; /* for internal use */ + unsigned long driver_info; /* for internal use */ + void *driver_ivar; + struct usb_device *device; /* current device */ + struct usb_interface *iface; /* current interface */ + enum usb_hc_mode usb_mode; /* host or device mode */ + uint8_t port; + uint8_t dev_state; +#define UAA_DEV_READY 0 +#define UAA_DEV_DISABLED 1 +#define UAA_DEV_EJECTING 2 +}; + +/* + * The following is a wrapper for the callout structure to ease + * porting the code to other platforms. 
+ */ +struct usb_callout { + struct callout co; +}; +#define usb_callout_init_mtx(c,m,f) callout_init_mtx(&(c)->co,m,f) +#define usb_callout_reset(c,t,f,d) callout_reset(&(c)->co,t,f,d) +#define usb_callout_stop(c) callout_stop(&(c)->co) +#define usb_callout_drain(c) callout_drain(&(c)->co) +#define usb_callout_pending(c) callout_pending(&(c)->co) + +/* USB transfer states */ + +#define USB_ST_SETUP 0 +#define USB_ST_TRANSFERRED 1 +#define USB_ST_ERROR 2 + +/* USB handle request states */ +#define USB_HR_NOT_COMPLETE 0 +#define USB_HR_COMPLETE_OK 1 +#define USB_HR_COMPLETE_ERR 2 + +/* + * The following macro will return the current state of an USB + * transfer like defined by the "USB_ST_XXX" enums. + */ +#define USB_GET_STATE(xfer) (usbd_xfer_state(xfer)) + +/* + * The following structure defines the USB process message header. + */ +struct usb_proc_msg { + TAILQ_ENTRY(usb_proc_msg) pm_qentry; + usb_proc_callback_t *pm_callback; + usb_size_t pm_num; +}; + +#define USB_FIFO_TX 0 +#define USB_FIFO_RX 1 + +/* + * Locking note for the following functions. All the + * "usb_fifo_cmd_t" and "usb_fifo_filter_t" functions are called + * locked. The others are called unlocked. + */ +struct usb_fifo_methods { + usb_fifo_open_t *f_open; + usb_fifo_close_t *f_close; + usb_fifo_ioctl_t *f_ioctl; + /* + * NOTE: The post-ioctl callback is called after the USB reference + * gets locked in the IOCTL handler: + */ + usb_fifo_ioctl_t *f_ioctl_post; + usb_fifo_cmd_t *f_start_read; + usb_fifo_cmd_t *f_stop_read; + usb_fifo_cmd_t *f_start_write; + usb_fifo_cmd_t *f_stop_write; + usb_fifo_filter_t *f_filter_read; + usb_fifo_filter_t *f_filter_write; + const char *basename[4]; + const char *postfix[4]; +}; + +struct usb_fifo_sc { + struct usb_fifo *fp[2]; + struct usb_fs_privdata *dev; +}; + +const char *usbd_errstr(usb_error_t error); +void *usbd_find_descriptor(struct usb_device *udev, void *id, + uint8_t iface_index, uint8_t type, uint8_t type_mask, + uint8_t subtype, uint8_t subtype_mask); +struct usb_config_descriptor *usbd_get_config_descriptor( + struct usb_device *udev); +struct usb_device_descriptor *usbd_get_device_descriptor( + struct usb_device *udev); +struct usb_interface *usbd_get_iface(struct usb_device *udev, + uint8_t iface_index); +struct usb_interface_descriptor *usbd_get_interface_descriptor( + struct usb_interface *iface); +struct usb_endpoint *usbd_get_endpoint(struct usb_device *udev, uint8_t iface_index, + const struct usb_config *setup); +struct usb_endpoint *usbd_get_ep_by_addr(struct usb_device *udev, uint8_t ea_val); +usb_error_t usbd_interface_count(struct usb_device *udev, uint8_t *count); +enum usb_hc_mode usbd_get_mode(struct usb_device *udev); +enum usb_dev_speed usbd_get_speed(struct usb_device *udev); +void device_set_usb_desc(device_t dev); +void usb_pause_mtx(struct mtx *mtx, int _ticks); +usb_error_t usbd_set_pnpinfo(struct usb_device *udev, + uint8_t iface_index, const char *pnpinfo); +usb_error_t usbd_add_dynamic_quirk(struct usb_device *udev, + uint16_t quirk); +usb_error_t usbd_set_endpoint_mode(struct usb_device *udev, + struct usb_endpoint *ep, uint8_t ep_mode); +uint8_t usbd_get_endpoint_mode(struct usb_device *udev, + struct usb_endpoint *ep); + +const struct usb_device_id *usbd_lookup_id_by_info( + const struct usb_device_id *id, usb_size_t sizeof_id, + const struct usbd_lookup_info *info); +int usbd_lookup_id_by_uaa(const struct usb_device_id *id, + usb_size_t sizeof_id, struct usb_attach_arg *uaa); + +usb_error_t usbd_do_request_flags(struct usb_device *udev, 
struct mtx *mtx, + struct usb_device_request *req, void *data, uint16_t flags, + uint16_t *actlen, usb_timeout_t timeout); +#define usbd_do_request(u,m,r,d) \ + usbd_do_request_flags(u,m,r,d,0,NULL,USB_DEFAULT_TIMEOUT) + +uint8_t usbd_clear_stall_callback(struct usb_xfer *xfer1, + struct usb_xfer *xfer2); +uint8_t usbd_get_interface_altindex(struct usb_interface *iface); +usb_error_t usbd_set_alt_interface_index(struct usb_device *udev, + uint8_t iface_index, uint8_t alt_index); +uint32_t usbd_get_isoc_fps(struct usb_device *udev); +usb_error_t usbd_transfer_setup(struct usb_device *udev, + const uint8_t *ifaces, struct usb_xfer **pxfer, + const struct usb_config *setup_start, uint16_t n_setup, + void *priv_sc, struct mtx *priv_mtx); +void usbd_transfer_submit(struct usb_xfer *xfer); +void usbd_transfer_clear_stall(struct usb_xfer *xfer); +void usbd_transfer_drain(struct usb_xfer *xfer); +uint8_t usbd_transfer_pending(struct usb_xfer *xfer); +void usbd_transfer_start(struct usb_xfer *xfer); +void usbd_transfer_stop(struct usb_xfer *xfer); +void usbd_transfer_unsetup(struct usb_xfer **pxfer, uint16_t n_setup); +void usbd_transfer_poll(struct usb_xfer **ppxfer, uint16_t max); +void usbd_set_parent_iface(struct usb_device *udev, uint8_t iface_index, + uint8_t parent_index); +uint8_t usbd_get_bus_index(struct usb_device *udev); +uint8_t usbd_get_device_index(struct usb_device *udev); +void usbd_set_power_mode(struct usb_device *udev, uint8_t power_mode); +uint8_t usbd_filter_power_mode(struct usb_device *udev, uint8_t power_mode); +uint8_t usbd_device_attached(struct usb_device *udev); + +usb_frlength_t + usbd_xfer_old_frame_length(struct usb_xfer *xfer, usb_frcount_t frindex); +void usbd_xfer_status(struct usb_xfer *xfer, int *actlen, int *sumlen, + int *aframes, int *nframes); +struct usb_page_cache *usbd_xfer_get_frame(struct usb_xfer *, usb_frcount_t); +void *usbd_xfer_get_frame_buffer(struct usb_xfer *, usb_frcount_t); +void *usbd_xfer_softc(struct usb_xfer *xfer); +void *usbd_xfer_get_priv(struct usb_xfer *xfer); +void usbd_xfer_set_priv(struct usb_xfer *xfer, void *); +void usbd_xfer_set_interval(struct usb_xfer *xfer, int); +uint8_t usbd_xfer_state(struct usb_xfer *xfer); +void usbd_xfer_set_frame_data(struct usb_xfer *xfer, usb_frcount_t frindex, + void *ptr, usb_frlength_t len); +void usbd_xfer_frame_data(struct usb_xfer *xfer, usb_frcount_t frindex, + void **ptr, int *len); +void usbd_xfer_set_frame_offset(struct usb_xfer *xfer, usb_frlength_t offset, + usb_frcount_t frindex); +usb_frlength_t usbd_xfer_max_len(struct usb_xfer *xfer); +usb_frlength_t usbd_xfer_max_framelen(struct usb_xfer *xfer); +usb_frcount_t usbd_xfer_max_frames(struct usb_xfer *xfer); +uint8_t usbd_xfer_get_fps_shift(struct usb_xfer *xfer); +usb_frlength_t usbd_xfer_frame_len(struct usb_xfer *xfer, + usb_frcount_t frindex); +void usbd_xfer_set_frame_len(struct usb_xfer *xfer, usb_frcount_t frindex, + usb_frlength_t len); +void usbd_xfer_set_timeout(struct usb_xfer *xfer, int timeout); +void usbd_xfer_set_frames(struct usb_xfer *xfer, usb_frcount_t n); +void usbd_xfer_set_stall(struct usb_xfer *xfer); +int usbd_xfer_is_stalled(struct usb_xfer *xfer); +void usbd_xfer_set_flag(struct usb_xfer *xfer, int flag); +void usbd_xfer_clr_flag(struct usb_xfer *xfer, int flag); +uint16_t usbd_xfer_get_timestamp(struct usb_xfer *xfer); +uint8_t usbd_xfer_maxp_was_clamped(struct usb_xfer *xfer); + +void usbd_copy_in(struct usb_page_cache *cache, usb_frlength_t offset, + const void *ptr, usb_frlength_t len); +int 
usbd_copy_in_user(struct usb_page_cache *cache, usb_frlength_t offset, + const void *ptr, usb_frlength_t len); +void usbd_copy_out(struct usb_page_cache *cache, usb_frlength_t offset, + void *ptr, usb_frlength_t len); +int usbd_copy_out_user(struct usb_page_cache *cache, usb_frlength_t offset, + void *ptr, usb_frlength_t len); +void usbd_get_page(struct usb_page_cache *pc, usb_frlength_t offset, + struct usb_page_search *res); +void usbd_m_copy_in(struct usb_page_cache *cache, usb_frlength_t dst_offset, + struct mbuf *m, usb_size_t src_offset, usb_frlength_t src_len); +void usbd_frame_zero(struct usb_page_cache *cache, usb_frlength_t offset, + usb_frlength_t len); +void usbd_start_re_enumerate(struct usb_device *udev); +usb_error_t + usbd_start_set_config(struct usb_device *, uint8_t); + +int usb_fifo_attach(struct usb_device *udev, void *priv_sc, + struct mtx *priv_mtx, struct usb_fifo_methods *pm, + struct usb_fifo_sc *f_sc, uint16_t unit, int16_t subunit, + uint8_t iface_index, uid_t uid, gid_t gid, int mode); +void usb_fifo_detach(struct usb_fifo_sc *f_sc); +int usb_fifo_alloc_buffer(struct usb_fifo *f, uint32_t bufsize, + uint16_t nbuf); +void usb_fifo_free_buffer(struct usb_fifo *f); +uint32_t usb_fifo_put_bytes_max(struct usb_fifo *fifo); +void usb_fifo_put_data(struct usb_fifo *fifo, struct usb_page_cache *pc, + usb_frlength_t offset, usb_frlength_t len, uint8_t what); +void usb_fifo_put_data_linear(struct usb_fifo *fifo, void *ptr, + usb_size_t len, uint8_t what); +uint8_t usb_fifo_put_data_buffer(struct usb_fifo *f, void *ptr, usb_size_t len); +void usb_fifo_put_data_error(struct usb_fifo *fifo); +uint8_t usb_fifo_get_data(struct usb_fifo *fifo, struct usb_page_cache *pc, + usb_frlength_t offset, usb_frlength_t len, usb_frlength_t *actlen, + uint8_t what); +uint8_t usb_fifo_get_data_linear(struct usb_fifo *fifo, void *ptr, + usb_size_t len, usb_size_t *actlen, uint8_t what); +uint8_t usb_fifo_get_data_buffer(struct usb_fifo *f, void **pptr, + usb_size_t *plen); +void usb_fifo_reset(struct usb_fifo *f); +void usb_fifo_wakeup(struct usb_fifo *f); +void usb_fifo_get_data_error(struct usb_fifo *fifo); +void *usb_fifo_softc(struct usb_fifo *fifo); +void usb_fifo_set_close_zlp(struct usb_fifo *, uint8_t); +void usb_fifo_set_write_defrag(struct usb_fifo *, uint8_t); +void usb_fifo_free(struct usb_fifo *f); +#endif /* _KERNEL */ +#endif /* _USB_USBDI_H_ */ diff --git a/usr/contrib/freebsd/isa/rtc.h b/usr/contrib/freebsd/isa/rtc.h new file mode 100644 index 0000000000..bb964ddf6a --- /dev/null +++ b/usr/contrib/freebsd/isa/rtc.h @@ -0,0 +1,125 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)rtc.h 7.1 (Berkeley) 5/12/91
+ * $FreeBSD$
+ */
+
+#ifndef _I386_ISA_RTC_H_
+#define _I386_ISA_RTC_H_ 1
+
+/*
+ * MC146818 RTC Register locations
+ */
+
+#define RTC_SEC 0x00 /* seconds */
+#define RTC_SECALRM 0x01 /* seconds alarm */
+#define RTC_MIN 0x02 /* minutes */
+#define RTC_MINALRM 0x03 /* minutes alarm */
+#define RTC_HRS 0x04 /* hours */
+#define RTC_HRSALRM 0x05 /* hours alarm */
+#define RTC_WDAY 0x06 /* week day */
+#define RTC_DAY 0x07 /* day of month */
+#define RTC_MONTH 0x08 /* month of year */
+#define RTC_YEAR 0x09 /* year */
+
+#define RTC_STATUSA 0x0a /* status register A */
+#define RTCSA_TUP 0x80 /* time update, don't look now */
+#define RTCSA_RESET 0x70 /* reset divider */
+#define RTCSA_DIVIDER 0x20 /* divider correct for 32768 Hz */
+#define RTCSA_8192 0x03 /* 8192 Hz interrupt */
+#define RTCSA_4096 0x04
+#define RTCSA_2048 0x05
+#define RTCSA_1024 0x06 /* default for profiling */
+#define RTCSA_PROF RTCSA_1024
+#define RTC_PROFRATE 1024
+#define RTCSA_512 0x07
+#define RTCSA_256 0x08
+#define RTCSA_128 0x09
+#define RTCSA_NOPROF RTCSA_128
+#define RTC_NOPROFRATE 128
+#define RTCSA_64 0x0a
+#define RTCSA_32 0x0b /* 32 Hz interrupt */
+
+#define RTC_STATUSB 0x0b /* status register B */
+#define RTCSB_DST 0x01 /* USA Daylight Savings Time enable */
+#define RTCSB_24HR 0x02 /* 0 = 12 hours, 1 = 24 hours */
+#define RTCSB_BCD 0x04 /* 0 = BCD, 1 = Binary coded time */
+#define RTCSB_SQWE 0x08 /* 1 = output square wave at SQW pin */
+#define RTCSB_UINTR 0x10 /* 1 = enable update-ended interrupt */
+#define RTCSB_AINTR 0x20 /* 1 = enable alarm interrupt */
+#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */
+#define RTCSB_HALT 0x80 /* stop clock updates */
+
+#define RTC_INTR 0x0c /* status register C (R) interrupt source */
+#define RTCIR_UPDATE 0x10 /* update intr */
+#define RTCIR_ALARM 0x20 /* alarm intr */
+#define RTCIR_PERIOD 0x40 /* periodic intr */
+#define RTCIR_INT 0x80 /* interrupt output signal */
+
+#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */
+#define RTCSD_PWR 0x80 /* clock power OK */
+
+#define RTC_DIAG 0x0e /* status register E - bios diagnostic */
+#define RTCDG_BITS "\020\010clock_battery\007ROM_cksum\006config_unit\005memory_size\004fixed_disk\003invalid_time"
+
+#define RTC_RESET 0x0f /* status register F - reset code byte */
+#define RTCRS_RST 0x00 /* normal reset */
+#define RTCRS_LOAD 0x04 /* load system */
+
+#define RTC_FDISKETTE 0x10 /* diskette drive type in upper/lower nibble */
+#define RTCFDT_NONE 0 /* none present */
+#define RTCFDT_360K 0x10 /* 360K */
+#define RTCFDT_12M 0x20 /* 1.2M */
+#define RTCFDT_720K 0x30 /* 720K */
+#define RTCFDT_144M 0x40 /* 1.44M */
+#define RTCFDT_288M_1 0x50 /* 2.88M, some BIOSes */
+#define RTCFDT_288M 0x60 /* 2.88M */
+
+#define RTC_BASELO 0x15 /* low byte of basemem size */
+#define RTC_BASEHI 0x16 /* high byte of basemem size */
+#define RTC_EXTLO 0x17 /* low byte of extended mem size */
+#define RTC_EXTHI 0x18 /* high byte of extended mem size */
+
+#define RTC_CENTURY 0x32 /* current century */
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+extern struct mtx clock_lock;
+extern int atrtcclock_disable;
+int rtcin(int reg);
+void atrtc_restore(void);
+void writertc(int reg, u_char val);
+void atrtc_set(struct timespec *ts);
+#endif
+#endif
+
+#endif /* _I386_ISA_RTC_H_ */
diff --git a/usr/contrib/freebsd/lib/libutil/humanize_number.c b/usr/contrib/freebsd/lib/libutil/humanize_number.c
new file mode 100644
index 0000000000..675a969aaa
--- /dev/null
+++ b/usr/contrib/freebsd/lib/libutil/humanize_number.c
@@ -0,0 +1,179 @@
+/* $NetBSD: humanize_number.c,v 1.14 2008/04/28 20:22:59 martin Exp $ */
+
+/*
+ * Copyright (c) 1997, 1998, 1999, 2002 The NetBSD Foundation, Inc.
+ * Copyright 2013 John-Mark Gurney
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
+ * NASA Ames Research Center, by Luke Mewburn and by Tomas Svensson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <libutil.h>
+#include <locale.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+static const int maxscale = 6;
+
+int
+humanize_number(char *buf, size_t len, int64_t quotient,
+    const char *suffix, int scale, int flags)
+{
+	const char *prefixes, *sep;
+	int i, r, remainder, s1, s2, sign;
+	int divisordeccut;
+	int64_t divisor, max;
+	size_t baselen;
+
+	/* Since so many callers don't check -1, NUL terminate the buffer */
+	if (len > 0)
+		buf[0] = '\0';
+
+	/* validate args */
+	if (buf == NULL || suffix == NULL)
+		return (-1);
+	if (scale < 0)
+		return (-1);
+	else if (scale > maxscale &&
+	    ((scale & ~(HN_AUTOSCALE|HN_GETSCALE)) != 0))
+		return (-1);
+	if ((flags & HN_DIVISOR_1000) && (flags & HN_IEC_PREFIXES))
+		return (-1);
+
+	/* setup parameters */
+	remainder = 0;
+
+	if (flags & HN_IEC_PREFIXES) {
+		baselen = 2;
+		/*
+		 * Use the prefixes for power of two recommended by
+		 * the International Electrotechnical Commission
+		 * (IEC) in IEC 80000-3 (i.e. Ki, Mi, Gi...).
+		 *
+		 * HN_IEC_PREFIXES implies a divisor of 1024 here
+		 * (use of HN_DIVISOR_1000 would have triggered
+		 * an assertion earlier).
+		 */
+		divisor = 1024;
+		divisordeccut = 973;	/* ceil(.95 * 1024) */
+		if (flags & HN_B)
+			prefixes = "B\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei";
+		else
+			prefixes = "\0\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei";
+	} else {
+		baselen = 1;
+		if (flags & HN_DIVISOR_1000) {
+			divisor = 1000;
+			divisordeccut = 950;
+			if (flags & HN_B)
+				prefixes = "B\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E";
+			else
+				prefixes = "\0\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E";
+		} else {
+			divisor = 1024;
+			divisordeccut = 973;	/* ceil(.95 * 1024) */
+			if (flags & HN_B)
+				prefixes = "B\0\0K\0\0M\0\0G\0\0T\0\0P\0\0E";
+			else
+				prefixes = "\0\0\0K\0\0M\0\0G\0\0T\0\0P\0\0E";
+		}
+	}
+
+#define SCALE2PREFIX(scale) (&prefixes[(scale) * 3])
+
+	if (quotient < 0) {
+		sign = -1;
+		quotient = -quotient;
+		baselen += 2;	/* sign, digit */
+	} else {
+		sign = 1;
+		baselen += 1;	/* digit */
+	}
+	if (flags & HN_NOSPACE)
+		sep = "";
+	else {
+		sep = " ";
+		baselen++;
+	}
+	baselen += strlen(suffix);
+
+	/* Check if enough room for `x y' + suffix + `\0' */
+	if (len < baselen + 1)
+		return (-1);
+
+	if (scale & (HN_AUTOSCALE | HN_GETSCALE)) {
+		/* See if there are additional columns that can be used. */
+		for (max = 1, i = len - baselen; i-- > 0;)
+			max *= 10;
+
+		/*
+		 * Divide the number until it fits the given column.
+		 * If there will be an overflow by the rounding below,
+		 * divide once more.
+		 */
+		for (i = 0;
+		    (quotient >= max || (quotient == max - 1 &&
+		    remainder >= divisordeccut)) && i < maxscale; i++) {
+			remainder = quotient % divisor;
+			quotient /= divisor;
+		}
+
+		if (scale & HN_GETSCALE)
+			return (i);
+	} else {
+		for (i = 0; i < scale && i < maxscale; i++) {
+			remainder = quotient % divisor;
+			quotient /= divisor;
+		}
+	}
+
+	/* If a value <= 9.9 after rounding and ... */
+	/*
+	 * XXX - should we make sure there is enough space for the decimal
+	 * place and if not, don't do HN_DECIMAL?
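+	 *
+	 * As a usage illustration (hypothetical buffer; not part of the
+	 * original file):
+	 *
+	 *	char b[6];
+	 *	humanize_number(b, sizeof(b), 10240, "B", HN_AUTOSCALE,
+	 *	    HN_DECIMAL);
+	 *
+	 * fills b with "10 KB" under the default 1024 divisor.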
+ */ + if (((quotient == 9 && remainder < divisordeccut) || quotient < 9) && + i > 0 && flags & HN_DECIMAL) { + s1 = (int)quotient + ((remainder * 10 + divisor / 2) / + divisor / 10); + s2 = ((remainder * 10 + divisor / 2) / divisor) % 10; + r = snprintf(buf, len, "%d%s%d%s%s%s", + sign * s1, localeconv()->decimal_point, s2, + sep, SCALE2PREFIX(i), suffix); + } else + r = snprintf(buf, len, "%" PRId64 "%s%s%s", + sign * (quotient + (remainder + divisor / 2) / divisor), + sep, SCALE2PREFIX(i), suffix); + + return (r); +} diff --git a/usr/contrib/freebsd/sys/ata.h b/usr/contrib/freebsd/sys/ata.h index 705460355f..223bd7b3eb 100644 --- a/usr/contrib/freebsd/sys/ata.h +++ b/usr/contrib/freebsd/sys/ata.h @@ -23,7 +23,7 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: head/sys/sys/ata.h 264853 2014-04-24 01:28:14Z smh $ + * $FreeBSD$ */ #ifndef _SYS_ATA_H_ @@ -105,6 +105,10 @@ struct ata_params { /*069*/ u_int16_t support3; #define ATA_SUPPORT_RZAT 0x0020 #define ATA_SUPPORT_DRAT 0x4000 +#define ATA_SUPPORT_ZONE_MASK 0x0003 +#define ATA_SUPPORT_ZONE_NR 0x0000 +#define ATA_SUPPORT_ZONE_HOST_AWARE 0x0001 +#define ATA_SUPPORT_ZONE_DEV_MANAGED 0x0002 u_int16_t reserved70; /*071*/ u_int16_t rlsovlap; /* rel time (us) for overlap */ /*072*/ u_int16_t rlsservice; /* rel time (us) for service */ @@ -228,7 +232,14 @@ struct ata_params { #define ATA_SUPPORT_RWLOGDMAEXT 0x0008 #define ATA_SUPPORT_MICROCODE3 0x0010 #define ATA_SUPPORT_FREEFALL 0x0020 +#define ATA_SUPPORT_SENSE_REPORT 0x0040 +#define ATA_SUPPORT_EPC 0x0080 /*120*/ u_int16_t enabled2; +#define ATA_ENABLED_WRITEREADVERIFY 0x0002 +#define ATA_ENABLED_WRITEUNCORREXT 0x0004 +#define ATA_ENABLED_FREEFALL 0x0020 +#define ATA_ENABLED_SENSE_REPORT 0x0040 +#define ATA_ENABLED_EPC 0x0080 u_int16_t reserved121[6]; /*127*/ u_int16_t removable_status; /*128*/ u_int16_t security_status; @@ -252,7 +263,7 @@ struct ata_params { u_int16_t reserved170[6]; /*176*/ u_int8_t media_serial[60]; /*206*/ u_int16_t sct; - u_int16_t reserved206[2]; + u_int16_t reserved207[2]; /*209*/ u_int16_t lsalign; /*210*/ u_int16_t wrv_sectors_m3_1; u_int16_t wrv_sectors_m3_2; @@ -298,8 +309,14 @@ struct ata_params { #define ATA_MAX_28BIT_LBA 268435455UL /* ATA Status Register */ -#define ATA_STATUS_ERROR 0x01 -#define ATA_STATUS_DEVICE_FAULT 0x20 +#define ATA_STATUS_ERROR 0x01 +#define ATA_STATUS_SENSE_AVAIL 0x02 +#define ATA_STATUS_ALIGN_ERR 0x04 +#define ATA_STATUS_DATA_REQ 0x08 +#define ATA_STATUS_DEF_WRITE_ERR 0x10 +#define ATA_STATUS_DEVICE_FAULT 0x20 +#define ATA_STATUS_DEVICE_READY 0x40 +#define ATA_STATUS_BUSY 0x80 /* ATA Error Register */ #define ATA_ERROR_ABORT 0x04 @@ -335,6 +352,7 @@ struct ata_params { #define ATA_UDMA6 0x46 #define ATA_SA150 0x47 #define ATA_SA300 0x48 +#define ATA_SA600 0x49 #define ATA_DMA_MAX 0x4f @@ -367,13 +385,36 @@ struct ata_params { #define ATA_WRITE_LOG_EXT 0x3f #define ATA_READ_VERIFY 0x40 #define ATA_READ_VERIFY48 0x42 +#define ATA_WRITE_UNCORRECTABLE48 0x45 /* write uncorrectable 48bit LBA */ +#define ATA_WU_PSEUDO 0x55 /* pseudo-uncorrectable error */ +#define ATA_WU_FLAGGED 0xaa /* flagged-uncorrectable error */ #define ATA_READ_LOG_DMA_EXT 0x47 /* read log DMA ext - PIO Data-In */ +#define ATA_ZAC_MANAGEMENT_IN 0x4a /* ZAC management in */ +#define ATA_ZM_REPORT_ZONES 0x00 /* report zones */ #define ATA_READ_FPDMA_QUEUED 0x60 /* read DMA NCQ */ #define ATA_WRITE_FPDMA_QUEUED 0x61 /* write DMA NCQ */ +#define 
ATA_NCQ_NON_DATA 0x63 /* NCQ non-data command */ +#define ATA_ABORT_NCQ_QUEUE 0x00 /* abort NCQ queue */ +#define ATA_DEADLINE_HANDLING 0x01 /* deadline handling */ +#define ATA_SET_FEATURES 0x05 /* set features */ +#define ATA_ZERO_EXT 0x06 /* zero ext */ +#define ATA_NCQ_ZAC_MGMT_OUT 0x07 /* NCQ ZAC mgmt out no data */ #define ATA_SEND_FPDMA_QUEUED 0x64 /* send DMA NCQ */ -#define ATA_RECV_FPDMA_QUEUED 0x65 /* recieve DMA NCQ */ +#define ATA_SFPDMA_DSM 0x00 /* Data set management */ +#define ATA_SFPDMA_DSM_TRIM 0x01 /* Set trim bit in auxiliary */ +#define ATA_SFPDMA_HYBRID_EVICT 0x01 /* Hybrid Evict */ +#define ATA_SFPDMA_WLDMA 0x02 /* Write Log DMA EXT */ +#define ATA_SFPDMA_ZAC_MGMT_OUT 0x03 /* NCQ ZAC mgmt out w/data */ +#define ATA_RECV_FPDMA_QUEUED 0x65 /* receive DMA NCQ */ +#define ATA_RFPDMA_RL_DMA_EXT 0x00 /* Read Log DMA EXT */ +#define ATA_RFPDMA_ZAC_MGMT_IN 0x02 /* NCQ ZAC mgmt in w/data */ #define ATA_SEP_ATTN 0x67 /* SEP request */ #define ATA_SEEK 0x70 /* seek */ +#define ATA_ZAC_MANAGEMENT_OUT 0x9f /* ZAC management out */ +#define ATA_ZM_CLOSE_ZONE 0x01 /* close zone */ +#define ATA_ZM_FINISH_ZONE 0x02 /* finish zone */ +#define ATA_ZM_OPEN_ZONE 0x03 /* open zone */ +#define ATA_ZM_RWP 0x04 /* reset write pointer */ #define ATA_PACKET_CMD 0xa0 /* packet command */ #define ATA_ATAPI_IDENTIFY 0xa1 /* get ATAPI params*/ #define ATA_SERVICE 0xa2 /* service command */ @@ -393,24 +434,36 @@ struct ata_params { #define ATA_IDLE_CMD 0xe3 /* idle */ #define ATA_READ_BUFFER 0xe4 /* read buffer */ #define ATA_READ_PM 0xe4 /* read portmultiplier */ +#define ATA_CHECK_POWER_MODE 0xe5 /* device power mode */ #define ATA_SLEEP 0xe6 /* sleep */ #define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */ #define ATA_WRITE_PM 0xe8 /* write portmultiplier */ #define ATA_FLUSHCACHE48 0xea /* flush cache to disk */ #define ATA_ATA_IDENTIFY 0xec /* get ATA params */ #define ATA_SETFEATURES 0xef /* features command */ -#define ATA_SF_SETXFER 0x03 /* set transfer mode */ #define ATA_SF_ENAB_WCACHE 0x02 /* enable write cache */ #define ATA_SF_DIS_WCACHE 0x82 /* disable write cache */ +#define ATA_SF_SETXFER 0x03 /* set transfer mode */ +#define ATA_SF_APM 0x05 /* Enable APM feature set */ #define ATA_SF_ENAB_PUIS 0x06 /* enable PUIS */ #define ATA_SF_DIS_PUIS 0x86 /* disable PUIS */ #define ATA_SF_PUIS_SPINUP 0x07 /* PUIS spin-up */ +#define ATA_SF_WRV 0x0b /* Enable Write-Read-Verify */ +#define ATA_SF_DLC 0x0c /* Enable device life control */ +#define ATA_SF_SATA 0x10 /* Enable use of SATA feature */ +#define ATA_SF_FFC 0x41 /* Free-fall Control */ +#define ATA_SF_MHIST 0x43 /* Set Max Host Sect. 
Times */
+#define ATA_SF_RATE 0x45 /* Set Rate Basis */
+#define ATA_SF_EPC 0x4A /* Extended Power Conditions */
 #define ATA_SF_ENAB_RCACHE 0xaa /* enable readahead cache */
 #define ATA_SF_DIS_RCACHE 0x55 /* disable readahead cache */
 #define ATA_SF_ENAB_RELIRQ 0x5d /* enable release interrupt */
 #define ATA_SF_DIS_RELIRQ 0xdd /* disable release interrupt */
 #define ATA_SF_ENAB_SRVIRQ 0x5e /* enable service interrupt */
 #define ATA_SF_DIS_SRVIRQ 0xde /* disable service interrupt */
+#define ATA_SF_LPSAERC 0x62 /* Long Phys Sect Align ErrRep*/
+#define ATA_SF_DSN 0x63 /* Device Stats Notification */
+#define ATA_CHECK_POWER_MODE 0xe5 /* Check Power Mode */
 #define ATA_SECURITY_SET_PASSWORD 0xf1 /* set drive password */
 #define ATA_SECURITY_UNLOCK 0xf2 /* unlock drive using passwd */
 #define ATA_SECURITY_ERASE_PREPARE 0xf3 /* prepare to erase drive */
@@ -537,6 +590,333 @@ struct atapi_sense {
 	u_int8_t specific2; /* sense key specific */
 } __packed;
 
+/*
+ * SET FEATURES subcommands
+ */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * These values go in the LBA 3:0.
+ */
+#define ATA_SF_EPC_RESTORE 0x00 /* Restore Power Condition Settings */
+#define ATA_SF_EPC_GOTO 0x01 /* Go To Power Condition */
+#define ATA_SF_EPC_SET_TIMER 0x02 /* Set Power Condition Timer */
+#define ATA_SF_EPC_SET_STATE 0x03 /* Set Power Condition State */
+#define ATA_SF_EPC_ENABLE 0x04 /* Enable the EPC feature set */
+#define ATA_SF_EPC_DISABLE 0x05 /* Disable the EPC feature set */
+#define ATA_SF_EPC_SET_SOURCE 0x06 /* Set EPC Power Source */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * Power Condition ID field
+ * These values go in the count register.
+ */
+#define ATA_EPC_STANDBY_Z 0x00 /* Substate of PM2:Standby */
+#define ATA_EPC_STANDBY_Y 0x01 /* Substate of PM2:Standby */
+#define ATA_EPC_IDLE_A 0x81 /* Substate of PM1:Idle */
+#define ATA_EPC_IDLE_B 0x82 /* Substate of PM1:Idle */
+#define ATA_EPC_IDLE_C 0x83 /* Substate of PM1:Idle */
+#define ATA_EPC_ALL 0xff /* All supported power conditions */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * Restore Power Conditions Settings subcommand
+ * These values go in the LBA register.
+ */
+#define ATA_SF_EPC_RST_DFLT 0x40 /* 1=Rst from Default, 0= from Saved */
+#define ATA_SF_EPC_RST_SAVE 0x10 /* 1=Save on completion */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * Go To Power Condition subcommand
+ * These values go in the LBA register.
+ */
+#define ATA_SF_EPC_GOTO_DELAY 0x02000000 /* Delayed entry bit */
+#define ATA_SF_EPC_GOTO_HOLD 0x01000000 /* Hold Power Cond bit */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * Set Power Condition Timer subcommand
+ * These values go in the LBA register.
+ */
+#define ATA_SF_EPC_TIMER_MASK 0x00ffff00 /* Timer field */
+#define ATA_SF_EPC_TIMER_SHIFT 8
+#define ATA_SF_EPC_TIMER_SEC 0x00000080 /* Timer units, 1=sec, 0=.1s */
+#define ATA_SF_EPC_TIMER_EN 0x00000020 /* Enable/disable cond. */
+#define ATA_SF_EPC_TIMER_SAVE 0x00000010 /* Save settings on comp. */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * Set Power Condition State subcommand
+ * These values go in the LBA register.
+ */
+#define ATA_SF_EPC_SETCON_EN 0x00000020 /* Enable power cond. */
+#define ATA_SF_EPC_SETCON_SAVE 0x00000010 /* Save settings on comp */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * Set EPC Power Source subcommand
+ * These values go in the count register.
+ */
+#define ATA_SF_EPC_SRC_UNKNOWN 0x0000 /* Unknown source */
+#define ATA_SF_EPC_SRC_BAT 0x0001 /* battery source */
+#define ATA_SF_EPC_SRC_NOT_BAT 0x0002 /* not battery source */
+
+#define ATA_LOG_DIRECTORY 0x00 /* Directory of all logs */
+#define ATA_POWER_COND_LOG 0x08 /* Power Conditions Log */
+#define ATA_PCL_IDLE 0x00 /* Idle Power Conditions Page */
+#define ATA_PCL_STANDBY 0x01 /* Standby Power Conditions Page */
+#define ATA_IDENTIFY_DATA_LOG 0x30 /* Identify Device Data Log */
+#define ATA_IDL_PAGE_LIST 0x00 /* List of supported pages */
+#define ATA_IDL_IDENTIFY_DATA 0x01 /* Copy of Identify Device data */
+#define ATA_IDL_CAPACITY 0x02 /* Capacity */
+#define ATA_IDL_SUP_CAP 0x03 /* Supported Capabilities */
+#define ATA_IDL_CUR_SETTINGS 0x04 /* Current Settings */
+#define ATA_IDL_ATA_STRINGS 0x05 /* ATA Strings */
+#define ATA_IDL_SECURITY 0x06 /* Security */
+#define ATA_IDL_PARALLEL_ATA 0x07 /* Parallel ATA */
+#define ATA_IDL_SERIAL_ATA 0x08 /* Serial ATA */
+#define ATA_IDL_ZDI 0x09 /* Zoned Device Information */
+
+struct ata_gp_log_dir {
+	uint8_t header[2];
+#define ATA_GP_LOG_DIR_VERSION 0x0001
+	uint8_t num_pages[255*2]; /* Number of log pages at address */
+};
+
+/*
+ * ATA Power Conditions log descriptor
+ */
+struct ata_power_cond_log_desc {
+	uint8_t reserved1;
+	uint8_t flags;
+#define ATA_PCL_COND_SUPPORTED 0x80
+#define ATA_PCL_COND_SAVEABLE 0x40
+#define ATA_PCL_COND_CHANGEABLE 0x20
+#define ATA_PCL_DEFAULT_TIMER_EN 0x10
+#define ATA_PCL_SAVED_TIMER_EN 0x08
+#define ATA_PCL_CURRENT_TIMER_EN 0x04
+#define ATA_PCL_HOLD_PC_NOT_SUP 0x02
+	uint8_t reserved2[2];
+	uint8_t default_timer[4];
+	uint8_t saved_timer[4];
+	uint8_t current_timer[4];
+	uint8_t nom_time_to_active[4];
+	uint8_t min_timer[4];
+	uint8_t max_timer[4];
+	uint8_t num_transitions_to_pc[4];
+	uint8_t hours_in_pc[4];
+	uint8_t reserved3[28];
+};
+
+/*
+ * ATA Power Conditions Log (0x08), Idle power conditions page (0x00)
+ */
+struct ata_power_cond_log_idle {
+	struct ata_power_cond_log_desc idle_a_desc;
+	struct ata_power_cond_log_desc idle_b_desc;
+	struct ata_power_cond_log_desc idle_c_desc;
+	uint8_t reserved[320];
+};
+
+/*
+ * ATA Power Conditions Log (0x08), Standby power conditions page (0x01)
+ */
+struct ata_power_cond_log_standby {
+	uint8_t reserved[384];
+	struct ata_power_cond_log_desc standby_y_desc;
+	struct ata_power_cond_log_desc standby_z_desc;
+};
+
+/*
+ * ATA IDENTIFY DEVICE data log (0x30) page 0x00
+ * List of Supported IDENTIFY DEVICE data pages.
+ */
+struct ata_identify_log_pages {
+	uint8_t header[8];
+#define ATA_IDLOG_REVISION 0x0000000000000001
+	uint8_t entry_count;
+	uint8_t entries[503];
+};
+
+/*
+ * ATA IDENTIFY DEVICE data log (0x30)
+ * Capacity (Page 0x02).
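+ *
+ * Each field below is an 8-byte little-endian quantity; an illustrative
+ * decode (hypothetical "log" pointer, le64dec() from sys/endian.h):
+ *
+ *	uint64_t cap = le64dec(log->capacity);
+ *	if (cap & ATA_CAP_CAPACITY_VALID)
+ *		nsectors = cap & ATA_CAP_ACCESSIBLE_CAP;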
+ */ +struct ata_identify_log_capacity { + uint8_t header[8]; +#define ATA_CAP_HEADER_VALID 0x8000000000000000 +#define ATA_CAP_PAGE_NUM_MASK 0x0000000000ff0000 +#define ATA_CAP_PAGE_NUM_SHIFT 16 +#define ATA_CAP_REV_MASK 0x00000000000000ff + uint8_t capacity[8]; +#define ATA_CAP_CAPACITY_VALID 0x8000000000000000 +#define ATA_CAP_ACCESSIBLE_CAP 0x0000ffffffffffff + uint8_t phys_logical_sect_size[8]; +#define ATA_CAP_PL_VALID 0x8000000000000000 +#define ATA_CAP_LTOP_REL_SUP 0x4000000000000000 +#define ATA_CAP_LOG_SECT_SUP 0x2000000000000000 +#define ATA_CAP_ALIGN_ERR_MASK 0x0000000000300000 +#define ATA_CAP_LTOP_MASK 0x00000000000f0000 +#define ATA_CAP_LOG_SECT_OFF 0x000000000000ffff + uint8_t logical_sect_size[8]; +#define ATA_CAP_LOG_SECT_VALID 0x8000000000000000 +#define ATA_CAP_LOG_SECT_SIZE 0x00000000ffffffff + uint8_t nominal_buffer_size[8]; +#define ATA_CAP_NOM_BUF_VALID 0x8000000000000000 +#define ATA_CAP_NOM_BUF_SIZE 0x7fffffffffffffff + uint8_t reserved[472]; +}; + +/* + * ATA IDENTIFY DEVICE data log (0x30) + * Supported Capabilities (Page 0x03). + */ + +struct ata_identify_log_sup_cap { + uint8_t header[8]; +#define ATA_SUP_CAP_HEADER_VALID 0x8000000000000000 +#define ATA_SUP_CAP_PAGE_NUM_MASK 0x0000000000ff0000 +#define ATA_SUP_CAP_PAGE_NUM_SHIFT 16 +#define ATA_SUP_CAP_REV_MASK 0x00000000000000ff + uint8_t sup_cap[8]; +#define ATA_SUP_CAP_VALID 0x8000000000000000 +#define ATA_SC_SET_SECT_CONFIG_SUP 0x0002000000000000 /* Set Sect Conf*/ +#define ATA_SC_ZERO_EXT_SUP 0x0001000000000000 /* Zero EXT */ +#define ATA_SC_SUCC_NCQ_SENSE_SUP 0x0000800000000000 /* Succ. NCQ Sns */ +#define ATA_SC_DLC_SUP 0x0000400000000000 /* DLC */ +#define ATA_SC_RQSN_DEV_FAULT_SUP 0x0000200000000000 /* Req Sns Dev Flt*/ +#define ATA_SC_DSN_SUP 0x0000100000000000 /* DSN */ +#define ATA_SC_LP_STANDBY_SUP 0x0000080000000000 /* LP Standby */ +#define ATA_SC_SET_EPC_PS_SUP 0x0000040000000000 /* Set EPC PS */ +#define ATA_SC_AMAX_ADDR_SUP 0x0000020000000000 /* AMAX Addr */ +#define ATA_SC_DRAT_SUP 0x0000008000000000 /* DRAT */ +#define ATA_SC_LPS_MISALGN_SUP 0x0000004000000000 /* LPS Misalign */ +#define ATA_SC_RB_DMA_SUP 0x0000001000000000 /* Read Buf DMA */ +#define ATA_SC_WB_DMA_SUP 0x0000000800000000 /* Write Buf DMA */ +#define ATA_SC_DNLD_MC_DMA_SUP 0x0000000200000000 /* DL MCode DMA */ +#define ATA_SC_28BIT_SUP 0x0000000100000000 /* 28-bit */ +#define ATA_SC_RZAT_SUP 0x0000000080000000 /* RZAT */ +#define ATA_SC_NOP_SUP 0x0000000020000000 /* NOP */ +#define ATA_SC_READ_BUFFER_SUP 0x0000000010000000 /* Read Buffer */ +#define ATA_SC_WRITE_BUFFER_SUP 0x0000000008000000 /* Write Buffer */ +#define ATA_SC_READ_LOOK_AHEAD_SUP 0x0000000002000000 /* Read Look-Ahead*/ +#define ATA_SC_VOLATILE_WC_SUP 0x0000000001000000 /* Volatile WC */ +#define ATA_SC_SMART_SUP 0x0000000000800000 /* SMART */ +#define ATA_SC_FLUSH_CACHE_EXT_SUP 0x0000000000400000 /* Flush Cache Ext */ +#define ATA_SC_48BIT_SUP 0x0000000000100000 /* 48-Bit */ +#define ATA_SC_SPINUP_SUP 0x0000000000040000 /* Spin-Up */ +#define ATA_SC_PUIS_SUP 0x0000000000020000 /* PUIS */ +#define ATA_SC_APM_SUP 0x0000000000010000 /* APM */ +#define ATA_SC_DL_MICROCODE_SUP 0x0000000000004000 /* DL Microcode */ +#define ATA_SC_UNLOAD_SUP 0x0000000000002000 /* Unload */ +#define ATA_SC_WRITE_FUA_EXT_SUP 0x0000000000001000 /* Write FUA EXT */ +#define ATA_SC_GPL_SUP 0x0000000000000800 /* GPL */ +#define ATA_SC_STREAMING_SUP 0x0000000000000400 /* Streaming */ +#define ATA_SC_SMART_SELFTEST_SUP 0x0000000000000100 /* SMART self-test */ +#define 
ATA_SC_SMART_ERR_LOG_SUP 0x0000000000000080 /* SMART Err Log */ +#define ATA_SC_EPC_SUP 0x0000000000000040 /* EPC */ +#define ATA_SC_SENSE_SUP 0x0000000000000020 /* Sense data */ +#define ATA_SC_FREEFALL_SUP 0x0000000000000010 /* Free-Fall */ +#define ATA_SC_DM_MODE3_SUP 0x0000000000000008 /* DM Mode 3 */ +#define ATA_SC_GPL_DMA_SUP 0x0000000000000004 /* GPL DMA */ +#define ATA_SC_WRITE_UNCOR_SUP 0x0000000000000002 /* Write uncorr. */ +#define ATA_SC_WRV_SUP 0x0000000000000001 /* WRV */ + uint8_t download_code_cap[8]; +#define ATA_DL_CODE_VALID 0x8000000000000000 +#define ATA_DLC_DM_OFFSETS_DEFER_SUP 0x0000000400000000 +#define ATA_DLC_DM_IMMED_SUP 0x0000000200000000 +#define ATA_DLC_DM_OFF_IMMED_SUP 0x0000000100000000 +#define ATA_DLC_DM_MAX_XFER_SIZE_MASK 0x00000000ffff0000 +#define ATA_DLC_DM_MAX_XFER_SIZE_SHIFT 16 +#define ATA_DLC_DM_MIN_XFER_SIZE_MASK 0x000000000000ffff + uint8_t nom_media_rotation_rate[8]; +#define ATA_NOM_MEDIA_ROTATION_VALID 0x8000000000000000 +#define ATA_ROTATION_MASK 0x000000000000ffff + uint8_t form_factor[8]; +#define ATA_FORM_FACTOR_VALID 0x8000000000000000 +#define ATA_FF_MASK 0x000000000000000f +#define ATA_FF_NOT_REPORTED 0x0000000000000000 /* Not reported */ +#define ATA_FF_525_IN 0x0000000000000001 /* 5.25 inch */ +#define ATA_FF_35_IN 0x0000000000000002 /* 3.5 inch */ +#define ATA_FF_25_IN 0x0000000000000003 /* 2.5 inch */ +#define ATA_FF_18_IN 0x0000000000000004 /* 1.8 inch */ +#define ATA_FF_LT_18_IN 0x0000000000000005 /* < 1.8 inch */ +#define ATA_FF_MSATA 0x0000000000000006 /* mSATA */ +#define ATA_FF_M2 0x0000000000000007 /* M.2 */ +#define ATA_FF_MICROSSD 0x0000000000000008 /* MicroSSD */ +#define ATA_FF_CFAST 0x0000000000000009 /* CFast */ + uint8_t wrv_sec_cnt_mode3[8]; +#define ATA_WRV_MODE3_VALID 0x8000000000000000 +#define ATA_WRV_MODE3_COUNT 0x00000000ffffffff + uint8_t wrv_sec_cnt_mode2[8]; +#define ATA_WRV_MODE2_VALID 0x8000000000000000 +#define ATA_WRV_MODE2_COUNT 0x00000000ffffffff + uint8_t wwn[16]; + /* XXX KDM need to figure out how to handle 128-bit fields */ + uint8_t dsm[8]; +#define ATA_DSM_VALID 0x8000000000000000 +#define ATA_LB_MARKUP_SUP 0x000000000000ff00 +#define ATA_TRIM_SUP 0x0000000000000001 + uint8_t util_per_unit_time[16]; + /* XXX KDM need to figure out how to handle 128-bit fields */ + uint8_t util_usage_rate_sup[8]; +#define ATA_UTIL_USAGE_RATE_VALID 0x8000000000000000 +#define ATA_SETTING_RATE_SUP 0x0000000000800000 +#define ATA_SINCE_POWERON_SUP 0x0000000000000100 +#define ATA_POH_RATE_SUP 0x0000000000000010 +#define ATA_DATE_TIME_RATE_SUP 0x0000000000000001 + uint8_t zoned_cap[8]; +#define ATA_ZONED_VALID 0x8000000000000000 +#define ATA_ZONED_MASK 0x0000000000000003 + uint8_t sup_zac_cap[8]; +#define ATA_SUP_ZAC_CAP_VALID 0x8000000000000000 +#define ATA_ND_RWP_SUP 0x0000000000000010 /* Reset Write Ptr*/ +#define ATA_ND_FINISH_ZONE_SUP 0x0000000000000008 /* Finish Zone */ +#define ATA_ND_CLOSE_ZONE_SUP 0x0000000000000004 /* Close Zone */ +#define ATA_ND_OPEN_ZONE_SUP 0x0000000000000002 /* Open Zone */ +#define ATA_REPORT_ZONES_SUP 0x0000000000000001 /* Report Zones */ + uint8_t reserved[392]; +}; + +/* + * ATA Identify Device Data Log Zoned Device Information Page (0x09). + * Current as of ZAC r04a, August 25, 2015. 
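+ *
+ * A decode sketch in the same style (hypothetical "zlog" pointer,
+ * le64dec() assumed from sys/endian.h):
+ *
+ *	uint64_t zc = le64dec(zlog->zoned_cap);
+ *	if ((zc & ATA_ZDI_CAP_VALID) && (zc & ATA_ZDI_CAP_URSWRZ))
+ *		(reads are unrestricted in sequential-write-required zones)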
+ */ +struct ata_zoned_info_log { + uint8_t header[8]; +#define ATA_ZDI_HEADER_VALID 0x8000000000000000 +#define ATA_ZDI_PAGE_NUM_MASK 0x0000000000ff0000 +#define ATA_ZDI_PAGE_NUM_SHIFT 16 +#define ATA_ZDI_REV_MASK 0x00000000000000ff + uint8_t zoned_cap[8]; +#define ATA_ZDI_CAP_VALID 0x8000000000000000 +#define ATA_ZDI_CAP_URSWRZ 0x0000000000000001 + uint8_t zoned_settings[8]; +#define ATA_ZDI_SETTINGS_VALID 0x8000000000000000 + uint8_t optimal_seq_zones[8]; +#define ATA_ZDI_OPT_SEQ_VALID 0x8000000000000000 +#define ATA_ZDI_OPT_SEQ_MASK 0x00000000ffffffff + uint8_t optimal_nonseq_zones[8]; +#define ATA_ZDI_OPT_NS_VALID 0x8000000000000000 +#define ATA_ZDI_OPT_NS_MASK 0x00000000ffffffff + uint8_t max_seq_req_zones[8]; +#define ATA_ZDI_MAX_SEQ_VALID 0x8000000000000000 +#define ATA_ZDI_MAX_SEQ_MASK 0x00000000ffffffff + uint8_t version_info[8]; +#define ATA_ZDI_VER_VALID 0x8000000000000000 +#define ATA_ZDI_VER_ZAC_SUP 0x0100000000000000 +#define ATA_ZDI_VER_ZAC_MASK 0x00000000000000ff + uint8_t reserved[456]; +}; + struct ata_ioc_request { union { struct { diff --git a/usr/contrib/freebsd/sys/linker_set.h b/usr/contrib/freebsd/sys/linker_set.h deleted file mode 100644 index 393dfbc131..0000000000 --- a/usr/contrib/freebsd/sys/linker_set.h +++ /dev/null @@ -1,119 +0,0 @@ -/*- - * Copyright (c) 1999 John D. Polstra - * Copyright (c) 1999,2001 Peter Wemm - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: head/sys/sys/linker_set.h 215701 2010-11-22 19:32:54Z dim $ - */ - -#ifndef _SYS_LINKER_SET_H_ -#define _SYS_LINKER_SET_H_ - -#ifdef __FreeBSD__ -#ifndef _SYS_CDEFS_H_ -#error this file needs sys/cdefs.h as a prerequisite -#endif -#else -#ifndef _COMPAT_FREEBSD_SYS_CDEFS_H_ -#error this file needs sys/cdefs.h as a prerequisite -#endif -#endif - -/* - * The following macros are used to declare global sets of objects, which - * are collected by the linker into a `linker_set' as defined below. - * For ELF, this is done by constructing a separate segment for each set. - */ - -/* - * Private macros, not to be used outside this header file. 
- */ -#ifdef __GNUCLIKE___SECTION -#ifdef __FreeBSD__ -#define __MAKE_SET(set, sym) \ - __GLOBL(__CONCAT(__start_set_,set)); \ - __GLOBL(__CONCAT(__stop_set_,set)); \ - static void const * const __set_##set##_sym_##sym \ - __section("set_" #set) __used = &sym -#else -#define __MAKE_SET(set, sym) \ - static void const * const __set_##set##_sym_##sym \ - __section("set_" #set) __used = &sym -#endif -#else /* !__GNUCLIKE___SECTION */ -#ifndef lint -#error this file needs to be ported to your compiler -#endif /* lint */ -#define __MAKE_SET(set, sym) extern void const * const (__set_##set##_sym_##sym) -#endif /* __GNUCLIKE___SECTION */ - -/* - * Public macros. - */ -#define TEXT_SET(set, sym) __MAKE_SET(set, sym) -#define DATA_SET(set, sym) __MAKE_SET(set, sym) -#define BSS_SET(set, sym) __MAKE_SET(set, sym) -#define ABS_SET(set, sym) __MAKE_SET(set, sym) -#define SET_ENTRY(set, sym) __MAKE_SET(set, sym) - -/* - * Initialize before referring to a given linker set. - */ -#ifdef __FreeBSD__ -#define SET_DECLARE(set, ptype) \ - extern ptype *__CONCAT(__start_set_,set); \ - extern ptype *__CONCAT(__stop_set_,set) -#else -#define SET_DECLARE(set, ptype) \ - _Pragma(__XSTRING(weak __CONCAT(__start_set_,set))) \ - _Pragma(__XSTRING(weak __CONCAT(__stop_set_,set))) \ - extern ptype *__CONCAT(__start_set_,set); \ - extern ptype *__CONCAT(__stop_set_,set) -#endif - -#define SET_BEGIN(set) \ - (&__CONCAT(__start_set_,set)) -#define SET_LIMIT(set) \ - (&__CONCAT(__stop_set_,set)) - -/* - * Iterate over all the elements of a set. - * - * Sets always contain addresses of things, and "pvar" points to words - * containing those addresses. Thus is must be declared as "type **pvar", - * and the address of each set item is obtained inside the loop by "*pvar". - */ -#define SET_FOREACH(pvar, set) \ - for (pvar = SET_BEGIN(set); pvar < SET_LIMIT(set); pvar++) - -#define SET_ITEM(set, i) \ - ((SET_BEGIN(set))[i]) - -/* - * Provide a count of the items in a set. - */ -#define SET_COUNT(set) \ - (SET_LIMIT(set) - SET_BEGIN(set)) - -#endif /* _SYS_LINKER_SET_H_ */ diff --git a/usr/contrib/freebsd/sys/pciio.h b/usr/contrib/freebsd/sys/pciio.h new file mode 100644 index 0000000000..d70bfbcf6f --- /dev/null +++ b/usr/contrib/freebsd/sys/pciio.h @@ -0,0 +1,146 @@ +/*- + * Copyright (c) 1997, Stefan Esser + * Copyright (c) 1997, 1998, 1999, Kenneth D. Merry + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + * + */ + +#ifndef _SYS_PCIIO_H_ +#define _SYS_PCIIO_H_ + +#include + +#define PCI_MAXNAMELEN 16 + +typedef enum { + PCI_GETCONF_LAST_DEVICE, + PCI_GETCONF_LIST_CHANGED, + PCI_GETCONF_MORE_DEVS, + PCI_GETCONF_ERROR +} pci_getconf_status; + +typedef enum { + PCI_GETCONF_NO_MATCH = 0x0000, + PCI_GETCONF_MATCH_DOMAIN = 0x0001, + PCI_GETCONF_MATCH_BUS = 0x0002, + PCI_GETCONF_MATCH_DEV = 0x0004, + PCI_GETCONF_MATCH_FUNC = 0x0008, + PCI_GETCONF_MATCH_NAME = 0x0010, + PCI_GETCONF_MATCH_UNIT = 0x0020, + PCI_GETCONF_MATCH_VENDOR = 0x0040, + PCI_GETCONF_MATCH_DEVICE = 0x0080, + PCI_GETCONF_MATCH_CLASS = 0x0100 +} pci_getconf_flags; + +struct pcisel { + u_int32_t pc_domain; /* domain number */ + u_int8_t pc_bus; /* bus number */ + u_int8_t pc_dev; /* device on this bus */ + u_int8_t pc_func; /* function on this device */ +}; + +struct pci_conf { + struct pcisel pc_sel; /* domain+bus+slot+function */ + u_int8_t pc_hdr; /* PCI header type */ + u_int16_t pc_subvendor; /* card vendor ID */ + u_int16_t pc_subdevice; /* card device ID, assigned by + card vendor */ + u_int16_t pc_vendor; /* chip vendor ID */ + u_int16_t pc_device; /* chip device ID, assigned by + chip vendor */ + u_int8_t pc_class; /* chip PCI class */ + u_int8_t pc_subclass; /* chip PCI subclass */ + u_int8_t pc_progif; /* chip PCI programming interface */ + u_int8_t pc_revid; /* chip revision ID */ + char pd_name[PCI_MAXNAMELEN + 1]; /* device name */ + u_long pd_unit; /* device unit number */ +}; + +struct pci_match_conf { + struct pcisel pc_sel; /* domain+bus+slot+function */ + char pd_name[PCI_MAXNAMELEN + 1]; /* device name */ + u_long pd_unit; /* Unit number */ + u_int16_t pc_vendor; /* PCI Vendor ID */ + u_int16_t pc_device; /* PCI Device ID */ + u_int8_t pc_class; /* PCI class */ + pci_getconf_flags flags; /* Matching expression */ +}; + +struct pci_conf_io { + u_int32_t pat_buf_len; /* pattern buffer length */ + u_int32_t num_patterns; /* number of patterns */ + struct pci_match_conf *patterns; /* pattern buffer */ + u_int32_t match_buf_len; /* match buffer length */ + u_int32_t num_matches; /* number of matches returned */ + struct pci_conf *matches; /* match buffer */ + u_int32_t offset; /* offset into device list */ + u_int32_t generation; /* device list generation */ + pci_getconf_status status; /* request status */ +}; + +struct pci_io { + struct pcisel pi_sel; /* device to operate on */ + int pi_reg; /* configuration register to examine */ + int pi_width; /* width (in bytes) of read or write */ + u_int32_t pi_data; /* data to write or result of read */ +}; + +struct pci_bar_io { + struct pcisel pbi_sel; /* device to operate on */ + int pbi_reg; /* starting address of BAR */ + int pbi_enabled; /* decoding enabled */ + uint64_t pbi_base; /* current value of BAR */ + uint64_t pbi_length; /* length of BAR */ +}; + +struct pci_vpd_element { + char pve_keyword[2]; + uint8_t pve_flags; + uint8_t pve_datalen; + uint8_t pve_data[0]; +}; + +#define PVE_FLAG_IDENT 0x01 /* Element is the string identifier */ 
+#define PVE_FLAG_RW 0x02 /* Element is read/write */ + +#define PVE_NEXT(pve) \ + ((struct pci_vpd_element *)((char *)(pve) + \ + sizeof(struct pci_vpd_element) + (pve)->pve_datalen)) + +struct pci_list_vpd_io { + struct pcisel plvi_sel; /* device to operate on */ + size_t plvi_len; /* size of the data area */ + struct pci_vpd_element *plvi_data; +}; + +#define PCIOCGETCONF _IOWR('p', 5, struct pci_conf_io) +#define PCIOCREAD _IOWR('p', 2, struct pci_io) +#define PCIOCWRITE _IOWR('p', 3, struct pci_io) +#define PCIOCATTACHED _IOWR('p', 4, struct pci_io) +#define PCIOCGETBAR _IOWR('p', 6, struct pci_bar_io) +#define PCIOCLISTVPD _IOWR('p', 7, struct pci_list_vpd_io) + +#endif /* !_SYS_PCIIO_H_ */ diff --git a/usr/contrib/freebsd/sys/queue.h b/usr/contrib/freebsd/sys/queue.h new file mode 100644 index 0000000000..f26c492af1 --- /dev/null +++ b/usr/contrib/freebsd/sys/queue.h @@ -0,0 +1,787 @@ +/*- + * Copyright (c) 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)queue.h 8.5 (Berkeley) 8/20/94 + * $FreeBSD$ + */ + +#ifndef _SYS_QUEUE_H_ +#define _SYS_QUEUE_H_ + +#include + +/* + * This file defines four types of data structures: singly-linked lists, + * singly-linked tail queues, lists and tail queues. + * + * A singly-linked list is headed by a single forward pointer. The elements + * are singly linked for minimum space and pointer manipulation overhead at + * the expense of O(n) removal for arbitrary elements. New elements can be + * added to the list after an existing element or at the head of the list. + * Elements being removed from the head of the list should use the explicit + * macro for this purpose for optimum efficiency. A singly-linked list may + * only be traversed in the forward direction. Singly-linked lists are ideal + * for applications with large datasets and few or no removals or for + * implementing a LIFO queue. + * + * A singly-linked tail queue is headed by a pair of pointers, one to the + * head of the list and the other to the tail of the list. 
The elements are + * singly linked for minimum space and pointer manipulation overhead at the + * expense of O(n) removal for arbitrary elements. New elements can be added + * to the list after an existing element, at the head of the list, or at the + * end of the list. Elements being removed from the head of the tail queue + * should use the explicit macro for this purpose for optimum efficiency. + * A singly-linked tail queue may only be traversed in the forward direction. + * Singly-linked tail queues are ideal for applications with large datasets + * and few or no removals or for implementing a FIFO queue. + * + * A list is headed by a single forward pointer (or an array of forward + * pointers for a hash table header). The elements are doubly linked + * so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before + * or after an existing element or at the head of the list. A list + * may be traversed in either direction. + * + * A tail queue is headed by a pair of pointers, one to the head of the + * list and the other to the tail of the list. The elements are doubly + * linked so that an arbitrary element can be removed without a need to + * traverse the list. New elements can be added to the list before or + * after an existing element, at the head of the list, or at the end of + * the list. A tail queue may be traversed in either direction. + * + * For details on the use of these macros, see the queue(3) manual page. + * + * Below is a summary of implemented functions where: + * + means the macro is available + * - means the macro is not available + * s means the macro is available but is slow (runs in O(n) time) + * + * SLIST LIST STAILQ TAILQ + * _HEAD + + + + + * _CLASS_HEAD + + + + + * _HEAD_INITIALIZER + + + + + * _ENTRY + + + + + * _CLASS_ENTRY + + + + + * _INIT + + + + + * _EMPTY + + + + + * _FIRST + + + + + * _NEXT + + + + + * _PREV - + - + + * _LAST - - + + + * _FOREACH + + + + + * _FOREACH_FROM + + + + + * _FOREACH_SAFE + + + + + * _FOREACH_FROM_SAFE + + + + + * _FOREACH_REVERSE - - - + + * _FOREACH_REVERSE_FROM - - - + + * _FOREACH_REVERSE_SAFE - - - + + * _FOREACH_REVERSE_FROM_SAFE - - - + + * _INSERT_HEAD + + + + + * _INSERT_BEFORE - + - + + * _INSERT_AFTER + + + + + * _INSERT_TAIL - - + + + * _CONCAT s s + + + * _REMOVE_AFTER + - + - + * _REMOVE_HEAD + - + - + * _REMOVE s + s + + * _SWAP + + + + + * + */ +#ifdef QUEUE_MACRO_DEBUG +/* Store the last 2 places the queue element or head was altered */ +struct qm_trace { + unsigned long lastline; + unsigned long prevline; + const char *lastfile; + const char *prevfile; +}; + +#define TRACEBUF struct qm_trace trace; +#define TRACEBUF_INITIALIZER { __LINE__, 0, __FILE__, NULL } , +#define TRASHIT(x) do {(x) = (void *)-1;} while (0) +#define QMD_SAVELINK(name, link) void **name = (void *)&(link) + +#define QMD_TRACE_HEAD(head) do { \ + (head)->trace.prevline = (head)->trace.lastline; \ + (head)->trace.prevfile = (head)->trace.lastfile; \ + (head)->trace.lastline = __LINE__; \ + (head)->trace.lastfile = __FILE__; \ +} while (0) + +#define QMD_TRACE_ELEM(elem) do { \ + (elem)->trace.prevline = (elem)->trace.lastline; \ + (elem)->trace.prevfile = (elem)->trace.lastfile; \ + (elem)->trace.lastline = __LINE__; \ + (elem)->trace.lastfile = __FILE__; \ +} while (0) + +#else +#define QMD_TRACE_ELEM(elem) +#define QMD_TRACE_HEAD(head) +#define QMD_SAVELINK(name, link) +#define TRACEBUF +#define TRACEBUF_INITIALIZER +#define TRASHIT(x) +#endif /* QUEUE_MACRO_DEBUG */ 
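+
+/*
+ * As an illustrative sketch of the macros summarized above
+ * (hypothetical element type and pointer; not part of the original
+ * header):
+ *
+ *	struct entry { int value; SLIST_ENTRY(entry) next; };
+ *	SLIST_HEAD(entryhead, entry) head = SLIST_HEAD_INITIALIZER(head);
+ *
+ *	SLIST_INSERT_HEAD(&head, elem, next);
+ *	SLIST_FOREACH(elem, &head, next)
+ *		(visit elem->value)
+ *	SLIST_REMOVE_HEAD(&head, next);
+ */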
+ +#ifdef __cplusplus +/* + * In C++ there can be structure lists and class lists: + */ +#define QUEUE_TYPEOF(type) type +#else +#define QUEUE_TYPEOF(type) struct type +#endif + +/* + * Singly-linked List declarations. + */ +#define SLIST_HEAD(name, type) \ +struct name { \ + struct type *slh_first; /* first element */ \ +} + +#define SLIST_CLASS_HEAD(name, type) \ +struct name { \ + class type *slh_first; /* first element */ \ +} + +#define SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define SLIST_ENTRY(type) \ +struct { \ + struct type *sle_next; /* next element */ \ +} + +#define SLIST_CLASS_ENTRY(type) \ +struct { \ + class type *sle_next; /* next element */ \ +} + +/* + * Singly-linked List functions. + */ +#define SLIST_CONCAT(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) *curelm = SLIST_FIRST(head1); \ + if (curelm == NULL) { \ + if ((SLIST_FIRST(head1) = SLIST_FIRST(head2)) != NULL) \ + SLIST_INIT(head2); \ + } else if (SLIST_FIRST(head2) != NULL) { \ + while (SLIST_NEXT(curelm, field) != NULL) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_NEXT(curelm, field) = SLIST_FIRST(head2); \ + SLIST_INIT(head2); \ + } \ +} while (0) + +#define SLIST_EMPTY(head) ((head)->slh_first == NULL) + +#define SLIST_FIRST(head) ((head)->slh_first) + +#define SLIST_FOREACH(var, head, field) \ + for ((var) = SLIST_FIRST((head)); \ + (var); \ + (var) = SLIST_NEXT((var), field)) + +#define SLIST_FOREACH_FROM(var, head, field) \ + for ((var) = ((var) ? (var) : SLIST_FIRST((head))); \ + (var); \ + (var) = SLIST_NEXT((var), field)) + +#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = SLIST_FIRST((head)); \ + (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define SLIST_FOREACH_FROM_SAFE(var, head, field, tvar) \ + for ((var) = ((var) ? (var) : SLIST_FIRST((head))); \ + (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ + for ((varp) = &SLIST_FIRST((head)); \ + ((var) = *(varp)) != NULL; \ + (varp) = &SLIST_NEXT((var), field)) + +#define SLIST_INIT(head) do { \ + SLIST_FIRST((head)) = NULL; \ +} while (0) + +#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ + SLIST_NEXT((slistelm), field) = (elm); \ +} while (0) + +#define SLIST_INSERT_HEAD(head, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ + SLIST_FIRST((head)) = (elm); \ +} while (0) + +#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#define SLIST_REMOVE(head, elm, type, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.sle_next); \ + if (SLIST_FIRST((head)) == (elm)) { \ + SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + QUEUE_TYPEOF(type) *curelm = SLIST_FIRST(head); \ + while (SLIST_NEXT(curelm, field) != (elm)) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_REMOVE_AFTER(curelm, field); \ + } \ + TRASHIT(*oldnext); \ +} while (0) + +#define SLIST_REMOVE_AFTER(elm, field) do { \ + SLIST_NEXT(elm, field) = \ + SLIST_NEXT(SLIST_NEXT(elm, field), field); \ +} while (0) + +#define SLIST_REMOVE_HEAD(head, field) do { \ + SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ +} while (0) + +#define SLIST_SWAP(head1, head2, type) do { \ + QUEUE_TYPEOF(type) *swap_first = SLIST_FIRST(head1); \ + SLIST_FIRST(head1) = SLIST_FIRST(head2); \ + SLIST_FIRST(head2) = swap_first; \ +} while (0) + +/* + * Singly-linked Tail queue declarations. 
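+ *
+ * A minimal FIFO sketch using the STAILQ macros below (hypothetical
+ * element type and pointers; not part of the original header):
+ *
+ *	struct pkt { STAILQ_ENTRY(pkt) link; };
+ *	STAILQ_HEAD(pkthead, pkt) q = STAILQ_HEAD_INITIALIZER(q);
+ *
+ *	STAILQ_INSERT_TAIL(&q, p, link);	(enqueue p)
+ *	p = STAILQ_FIRST(&q);			(peek at the head)
+ *	STAILQ_REMOVE_HEAD(&q, link);		(dequeue)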
+ */ +#define STAILQ_HEAD(name, type) \ +struct name { \ + struct type *stqh_first;/* first element */ \ + struct type **stqh_last;/* addr of last next element */ \ +} + +#define STAILQ_CLASS_HEAD(name, type) \ +struct name { \ + class type *stqh_first; /* first element */ \ + class type **stqh_last; /* addr of last next element */ \ +} + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } + +#define STAILQ_ENTRY(type) \ +struct { \ + struct type *stqe_next; /* next element */ \ +} + +#define STAILQ_CLASS_ENTRY(type) \ +struct { \ + class type *stqe_next; /* next element */ \ +} + +/* + * Singly-linked Tail queue functions. + */ +#define STAILQ_CONCAT(head1, head2) do { \ + if (!STAILQ_EMPTY((head2))) { \ + *(head1)->stqh_last = (head2)->stqh_first; \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_INIT((head2)); \ + } \ +} while (0) + +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) + +#define STAILQ_FIRST(head) ((head)->stqh_first) + +#define STAILQ_FOREACH(var, head, field) \ + for((var) = STAILQ_FIRST((head)); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_FOREACH_FROM(var, head, field) \ + for ((var) = ((var) ? (var) : STAILQ_FIRST((head))); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = STAILQ_FIRST((head)); \ + (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define STAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \ + for ((var) = ((var) ? (var) : STAILQ_FIRST((head))); \ + (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define STAILQ_INIT(head) do { \ + STAILQ_FIRST((head)) = NULL; \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT((tqelm), field) = (elm); \ +} while (0) + +#define STAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_FIRST((head)) = (elm); \ +} while (0) + +#define STAILQ_INSERT_TAIL(head, elm, field) do { \ + STAILQ_NEXT((elm), field) = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY((head)) ? 
NULL : \ + __containerof((head)->stqh_last, \ + QUEUE_TYPEOF(type), field.stqe_next)) + +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +#define STAILQ_REMOVE(head, elm, type, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.stqe_next); \ + if (STAILQ_FIRST((head)) == (elm)) { \ + STAILQ_REMOVE_HEAD((head), field); \ + } \ + else { \ + QUEUE_TYPEOF(type) *curelm = STAILQ_FIRST(head); \ + while (STAILQ_NEXT(curelm, field) != (elm)) \ + curelm = STAILQ_NEXT(curelm, field); \ + STAILQ_REMOVE_AFTER(head, curelm, field); \ + } \ + TRASHIT(*oldnext); \ +} while (0) + +#define STAILQ_REMOVE_AFTER(head, elm, field) do { \ + if ((STAILQ_NEXT(elm, field) = \ + STAILQ_NEXT(STAILQ_NEXT(elm, field), field)) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_REMOVE_HEAD(head, field) do { \ + if ((STAILQ_FIRST((head)) = \ + STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_SWAP(head1, head2, type) do { \ + QUEUE_TYPEOF(type) *swap_first = STAILQ_FIRST(head1); \ + QUEUE_TYPEOF(type) **swap_last = (head1)->stqh_last; \ + STAILQ_FIRST(head1) = STAILQ_FIRST(head2); \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_FIRST(head2) = swap_first; \ + (head2)->stqh_last = swap_last; \ + if (STAILQ_EMPTY(head1)) \ + (head1)->stqh_last = &STAILQ_FIRST(head1); \ + if (STAILQ_EMPTY(head2)) \ + (head2)->stqh_last = &STAILQ_FIRST(head2); \ +} while (0) + + +/* + * List declarations. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_CLASS_HEAD(name, type) \ +struct name { \ + class type *lh_first; /* first element */ \ +} + +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +#define LIST_CLASS_ENTRY(type) \ +struct { \ + class type *le_next; /* next element */ \ + class type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. 
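+ *
+ * For illustration, typical LIST usage per queue(3) (hypothetical
+ * element type and pointers; not part of the original header):
+ *
+ *	struct entry { LIST_ENTRY(entry) entries; } *n1, *n2;
+ *	LIST_HEAD(listhead, entry) head = LIST_HEAD_INITIALIZER(head);
+ *
+ *	LIST_INSERT_HEAD(&head, n1, entries);
+ *	LIST_INSERT_AFTER(n1, n2, entries);
+ *	LIST_REMOVE(n1, entries);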
+ */ + +#if (defined(_KERNEL) && defined(INVARIANTS)) +#define QMD_LIST_CHECK_HEAD(head, field) do { \ + if (LIST_FIRST((head)) != NULL && \ + LIST_FIRST((head))->field.le_prev != \ + &LIST_FIRST((head))) \ + panic("Bad list head %p first->prev != head", (head)); \ +} while (0) + +#define QMD_LIST_CHECK_NEXT(elm, field) do { \ + if (LIST_NEXT((elm), field) != NULL && \ + LIST_NEXT((elm), field)->field.le_prev != \ + &((elm)->field.le_next)) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ +} while (0) + +#define QMD_LIST_CHECK_PREV(elm, field) do { \ + if (*(elm)->field.le_prev != (elm)) \ + panic("Bad link elm %p prev->next != elm", (elm)); \ +} while (0) +#else +#define QMD_LIST_CHECK_HEAD(head, field) +#define QMD_LIST_CHECK_NEXT(elm, field) +#define QMD_LIST_CHECK_PREV(elm, field) +#endif /* (_KERNEL && INVARIANTS) */ + +#define LIST_CONCAT(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) *curelm = LIST_FIRST(head1); \ + if (curelm == NULL) { \ + if ((LIST_FIRST(head1) = LIST_FIRST(head2)) != NULL) { \ + LIST_FIRST(head2)->field.le_prev = \ + &LIST_FIRST((head1)); \ + LIST_INIT(head2); \ + } \ + } else if (LIST_FIRST(head2) != NULL) { \ + while (LIST_NEXT(curelm, field) != NULL) \ + curelm = LIST_NEXT(curelm, field); \ + LIST_NEXT(curelm, field) = LIST_FIRST(head2); \ + LIST_FIRST(head2)->field.le_prev = &LIST_NEXT(curelm, field); \ + LIST_INIT(head2); \ + } \ +} while (0) + +#define LIST_EMPTY(head) ((head)->lh_first == NULL) + +#define LIST_FIRST(head) ((head)->lh_first) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = LIST_FIRST((head)); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_FOREACH_FROM(var, head, field) \ + for ((var) = ((var) ? (var) : LIST_FIRST((head))); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = LIST_FIRST((head)); \ + (var) && ((tvar) = LIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define LIST_FOREACH_FROM_SAFE(var, head, field, tvar) \ + for ((var) = ((var) ? (var) : LIST_FIRST((head))); \ + (var) && ((tvar) = LIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define LIST_INIT(head) do { \ + LIST_FIRST((head)) = NULL; \ +} while (0) + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + QMD_LIST_CHECK_NEXT(listelm, field); \ + if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ + LIST_NEXT((listelm), field)->field.le_prev = \ + &LIST_NEXT((elm), field); \ + LIST_NEXT((listelm), field) = (elm); \ + (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ +} while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + QMD_LIST_CHECK_PREV(listelm, field); \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + LIST_NEXT((elm), field) = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ +} while (0) + +#define LIST_INSERT_HEAD(head, elm, field) do { \ + QMD_LIST_CHECK_HEAD((head), field); \ + if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ + LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ + LIST_FIRST((head)) = (elm); \ + (elm)->field.le_prev = &LIST_FIRST((head)); \ +} while (0) + +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +#define LIST_PREV(elm, head, type, field) \ + ((elm)->field.le_prev == &LIST_FIRST((head)) ? 
NULL : \ + __containerof((elm)->field.le_prev, \ + QUEUE_TYPEOF(type), field.le_next)) + +#define LIST_REMOVE(elm, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.le_next); \ + QMD_SAVELINK(oldprev, (elm)->field.le_prev); \ + QMD_LIST_CHECK_NEXT(elm, field); \ + QMD_LIST_CHECK_PREV(elm, field); \ + if (LIST_NEXT((elm), field) != NULL) \ + LIST_NEXT((elm), field)->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = LIST_NEXT((elm), field); \ + TRASHIT(*oldnext); \ + TRASHIT(*oldprev); \ +} while (0) + +#define LIST_SWAP(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) *swap_tmp = LIST_FIRST(head1); \ + LIST_FIRST((head1)) = LIST_FIRST((head2)); \ + LIST_FIRST((head2)) = swap_tmp; \ + if ((swap_tmp = LIST_FIRST((head1))) != NULL) \ + swap_tmp->field.le_prev = &LIST_FIRST((head1)); \ + if ((swap_tmp = LIST_FIRST((head2))) != NULL) \ + swap_tmp->field.le_prev = &LIST_FIRST((head2)); \ +} while (0) + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_CLASS_HEAD(name, type) \ +struct name { \ + class type *tqh_first; /* first element */ \ + class type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first, TRACEBUF_INITIALIZER } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +#define TAILQ_CLASS_ENTRY(type) \ +struct { \ + class type *tqe_next; /* next element */ \ + class type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +/* + * Tail queue functions. + */ +#if (defined(_KERNEL) && defined(INVARIANTS)) +#define QMD_TAILQ_CHECK_HEAD(head, field) do { \ + if (!TAILQ_EMPTY(head) && \ + TAILQ_FIRST((head))->field.tqe_prev != \ + &TAILQ_FIRST((head))) \ + panic("Bad tailq head %p first->prev != head", (head)); \ +} while (0) + +#define QMD_TAILQ_CHECK_TAIL(head, field) do { \ + if (*(head)->tqh_last != NULL) \ + panic("Bad tailq NEXT(%p->tqh_last) != NULL", (head)); \ +} while (0) + +#define QMD_TAILQ_CHECK_NEXT(elm, field) do { \ + if (TAILQ_NEXT((elm), field) != NULL && \ + TAILQ_NEXT((elm), field)->field.tqe_prev != \ + &((elm)->field.tqe_next)) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ +} while (0) + +#define QMD_TAILQ_CHECK_PREV(elm, field) do { \ + if (*(elm)->field.tqe_prev != (elm)) \ + panic("Bad link elm %p prev->next != elm", (elm)); \ +} while (0) +#else +#define QMD_TAILQ_CHECK_HEAD(head, field) +#define QMD_TAILQ_CHECK_TAIL(head, headname) +#define QMD_TAILQ_CHECK_NEXT(elm, field) +#define QMD_TAILQ_CHECK_PREV(elm, field) +#endif /* (_KERNEL && INVARIANTS) */ + +#define TAILQ_CONCAT(head1, head2, field) do { \ + if (!TAILQ_EMPTY(head2)) { \ + *(head1)->tqh_last = (head2)->tqh_first; \ + (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ + (head1)->tqh_last = (head2)->tqh_last; \ + TAILQ_INIT((head2)); \ + QMD_TRACE_HEAD(head1); \ + QMD_TRACE_HEAD(head2); \ + } \ +} while (0) + +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_FROM(var, head, field) \ + for ((var) = ((var) ? 
(var) : TAILQ_FIRST((head))); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \ + for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_FOREACH_REVERSE_FROM(var, head, headname, field) \ + for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_REVERSE_FROM_SAFE(var, head, headname, field, tvar) \ + for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \ + (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ + (var) = (tvar)) + +#define TAILQ_INIT(head) do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ +} while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + QMD_TAILQ_CHECK_NEXT(listelm, field); \ + if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else { \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + } \ + TAILQ_NEXT((listelm), field) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&(listelm)->field); \ +} while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + QMD_TAILQ_CHECK_PREV(listelm, field); \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&(listelm)->field); \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + QMD_TAILQ_CHECK_HEAD(head, field); \ + if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + QMD_TAILQ_CHECK_TAIL(head, field); \ + TAILQ_NEXT((elm), field) = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +#define TAILQ_REMOVE(head, elm, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.tqe_next); \ + QMD_SAVELINK(oldprev, (elm)->field.tqe_prev); \ + QMD_TAILQ_CHECK_NEXT(elm, field); \ + QMD_TAILQ_CHECK_PREV(elm, field); 
\ + if ((TAILQ_NEXT((elm), field)) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else { \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + QMD_TRACE_HEAD(head); \ + } \ + *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ + TRASHIT(*oldnext); \ + TRASHIT(*oldprev); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_SWAP(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) *swap_first = (head1)->tqh_first; \ + QUEUE_TYPEOF(type) **swap_last = (head1)->tqh_last; \ + (head1)->tqh_first = (head2)->tqh_first; \ + (head1)->tqh_last = (head2)->tqh_last; \ + (head2)->tqh_first = swap_first; \ + (head2)->tqh_last = swap_last; \ + if ((swap_first = (head1)->tqh_first) != NULL) \ + swap_first->field.tqe_prev = &(head1)->tqh_first; \ + else \ + (head1)->tqh_last = &(head1)->tqh_first; \ + if ((swap_first = (head2)->tqh_first) != NULL) \ + swap_first->field.tqe_prev = &(head2)->tqh_first; \ + else \ + (head2)->tqh_last = &(head2)->tqh_first; \ +} while (0) + +#endif /* !_SYS_QUEUE_H_ */ diff --git a/usr/contrib/freebsd/x86/segments.h b/usr/contrib/freebsd/x86/segments.h new file mode 100644 index 0000000000..1b8c4a3c1c --- /dev/null +++ b/usr/contrib/freebsd/x86/segments.h @@ -0,0 +1,274 @@ +/*- + * Copyright (c) 1989, 1990 William F. Jolitz + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
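
The queue.h macros imported above are intrusive: the linkage fields live
inside each element via the *_ENTRY macros, so no separate node allocation
is needed. A minimal usage sketch for the tail-queue flavor; "struct item"
and its "link" field are hypothetical, purely for illustration:

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct item {
	int			val;
	TAILQ_ENTRY(item)	link;	/* embeds tqe_next/tqe_prev */
};

TAILQ_HEAD(itemq, item);

int
main(void)
{
	struct itemq q = TAILQ_HEAD_INITIALIZER(q);
	struct item *ip, *tmp;
	int i;

	for (i = 0; i < 3; i++) {
		if ((ip = malloc(sizeof (*ip))) == NULL)
			return (1);
		ip->val = i;
		TAILQ_INSERT_TAIL(&q, ip, link);
	}

	/* The _SAFE variant tolerates removing the current element. */
	TAILQ_FOREACH_SAFE(ip, &q, link, tmp) {
		printf("%d\n", ip->val);
		TAILQ_REMOVE(&q, ip, link);
		free(ip);
	}
	return (0);
}
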
+ * + * from: @(#)segments.h 7.1 (Berkeley) 5/9/91 + * $FreeBSD$ + */ + +#ifndef _X86_SEGMENTS_H_ +#define _X86_SEGMENTS_H_ + +/* + * X86 Segmentation Data Structures and definitions + */ + +/* + * Selectors + */ +#define SEL_RPL_MASK 3 /* requester priv level */ +#define ISPL(s) ((s)&3) /* priority level of a selector */ +#define SEL_KPL 0 /* kernel priority level */ +#define SEL_UPL 3 /* user priority level */ +#define ISLDT(s) ((s)&SEL_LDT) /* is it local or global */ +#define SEL_LDT 4 /* local descriptor table */ +#define IDXSEL(s) (((s)>>3) & 0x1fff) /* index of selector */ +#define LSEL(s,r) (((s)<<3) | SEL_LDT | r) /* a local selector */ +#define GSEL(s,r) (((s)<<3) | r) /* a global selector */ + +/* + * User segment descriptors (%cs, %ds etc for i386 apps. 64 bit wide) + * For long-mode apps, %cs only has the conforming bit in sd_type, the sd_dpl, + * sd_p, sd_l and sd_def32 which must be zero). %ds only has sd_p. + */ +struct segment_descriptor { + unsigned sd_lolimit:16; /* segment extent (lsb) */ + unsigned sd_lobase:24; /* segment base address (lsb) */ + unsigned sd_type:5; /* segment type */ + unsigned sd_dpl:2; /* segment descriptor priority level */ + unsigned sd_p:1; /* segment descriptor present */ + unsigned sd_hilimit:4; /* segment extent (msb) */ + unsigned sd_xx:2; /* unused */ + unsigned sd_def32:1; /* default 32 vs 16 bit size */ + unsigned sd_gran:1; /* limit granularity (byte/page units)*/ + unsigned sd_hibase:8; /* segment base address (msb) */ +} __packed; + +struct user_segment_descriptor { + unsigned sd_lolimit:16; /* segment extent (lsb) */ + unsigned sd_lobase:24; /* segment base address (lsb) */ + unsigned sd_type:5; /* segment type */ + unsigned sd_dpl:2; /* segment descriptor priority level */ + unsigned sd_p:1; /* segment descriptor present */ + unsigned sd_hilimit:4; /* segment extent (msb) */ + unsigned sd_xx:1; /* unused */ + unsigned sd_long:1; /* long mode (cs only) */ + unsigned sd_def32:1; /* default 32 vs 16 bit size */ + unsigned sd_gran:1; /* limit granularity (byte/page units)*/ + unsigned sd_hibase:8; /* segment base address (msb) */ +} __packed; + +#define USD_GETBASE(sd) (((sd)->sd_lobase) | (sd)->sd_hibase << 24) +#define USD_SETBASE(sd, b) (sd)->sd_lobase = (b); \ + (sd)->sd_hibase = ((b) >> 24); +#define USD_GETLIMIT(sd) (((sd)->sd_lolimit) | (sd)->sd_hilimit << 16) +#define USD_SETLIMIT(sd, l) (sd)->sd_lolimit = (l); \ + (sd)->sd_hilimit = ((l) >> 16); + +#ifdef __i386__ +/* + * Gate descriptors (e.g. indirect descriptors) + */ +struct gate_descriptor { + unsigned gd_looffset:16; /* gate offset (lsb) */ + unsigned gd_selector:16; /* gate segment selector */ + unsigned gd_stkcpy:5; /* number of stack wds to cpy */ + unsigned gd_xx:3; /* unused */ + unsigned gd_type:5; /* segment type */ + unsigned gd_dpl:2; /* segment descriptor priority level */ + unsigned gd_p:1; /* segment descriptor present */ + unsigned gd_hioffset:16; /* gate offset (msb) */ +} __packed; + +/* + * Generic descriptor + */ +union descriptor { + struct segment_descriptor sd; + struct gate_descriptor gd; +}; +#else +/* + * Gate descriptors (e.g. indirect descriptors, trap, interrupt etc. 128 bit) + * Only interrupt and trap gates have gd_ist. 
+ */ +struct gate_descriptor { + uint64_t gd_looffset:16; /* gate offset (lsb) */ + uint64_t gd_selector:16; /* gate segment selector */ + uint64_t gd_ist:3; /* IST table index */ + uint64_t gd_xx:5; /* unused */ + uint64_t gd_type:5; /* segment type */ + uint64_t gd_dpl:2; /* segment descriptor priority level */ + uint64_t gd_p:1; /* segment descriptor present */ + uint64_t gd_hioffset:48; /* gate offset (msb) */ + uint64_t sd_xx1:32; +} __packed; + +/* + * Generic descriptor + */ +union descriptor { + struct user_segment_descriptor sd; + struct gate_descriptor gd; +}; +#endif + + /* system segments and gate types */ +#define SDT_SYSNULL 0 /* system null */ +#define SDT_SYS286TSS 1 /* system 286 TSS available */ +#define SDT_SYSLDT 2 /* system local descriptor table */ +#define SDT_SYS286BSY 3 /* system 286 TSS busy */ +#define SDT_SYS286CGT 4 /* system 286 call gate */ +#define SDT_SYSTASKGT 5 /* system task gate */ +#define SDT_SYS286IGT 6 /* system 286 interrupt gate */ +#define SDT_SYS286TGT 7 /* system 286 trap gate */ +#define SDT_SYSNULL2 8 /* system null again */ +#define SDT_SYS386TSS 9 /* system 386 TSS available */ +#define SDT_SYSTSS 9 /* system available 64 bit TSS */ +#define SDT_SYSNULL3 10 /* system null again */ +#define SDT_SYS386BSY 11 /* system 386 TSS busy */ +#define SDT_SYSBSY 11 /* system busy 64 bit TSS */ +#define SDT_SYS386CGT 12 /* system 386 call gate */ +#define SDT_SYSCGT 12 /* system 64 bit call gate */ +#define SDT_SYSNULL4 13 /* system null again */ +#define SDT_SYS386IGT 14 /* system 386 interrupt gate */ +#define SDT_SYSIGT 14 /* system 64 bit interrupt gate */ +#define SDT_SYS386TGT 15 /* system 386 trap gate */ +#define SDT_SYSTGT 15 /* system 64 bit trap gate */ + + /* memory segment types */ +#define SDT_MEMRO 16 /* memory read only */ +#define SDT_MEMROA 17 /* memory read only accessed */ +#define SDT_MEMRW 18 /* memory read write */ +#define SDT_MEMRWA 19 /* memory read write accessed */ +#define SDT_MEMROD 20 /* memory read only expand dwn limit */ +#define SDT_MEMRODA 21 /* memory read only expand dwn limit accessed */ +#define SDT_MEMRWD 22 /* memory read write expand dwn limit */ +#define SDT_MEMRWDA 23 /* memory read write expand dwn limit accessed*/ +#define SDT_MEME 24 /* memory execute only */ +#define SDT_MEMEA 25 /* memory execute only accessed */ +#define SDT_MEMER 26 /* memory execute read */ +#define SDT_MEMERA 27 /* memory execute read accessed */ +#define SDT_MEMEC 28 /* memory execute only conforming */ +#define SDT_MEMEAC 29 /* memory execute only accessed conforming */ +#define SDT_MEMERC 30 /* memory execute read conforming */ +#define SDT_MEMERAC 31 /* memory execute read accessed conforming */ + +/* + * Size of IDT table + */ +#define NIDT 256 /* 32 reserved, 0x80 syscall, most are h/w */ +#define NRSVIDT 32 /* reserved entries for cpu exceptions */ + +/* + * Entries in the Interrupt Descriptor Table (IDT) + */ +#define IDT_DE 0 /* #DE: Divide Error */ +#define IDT_DB 1 /* #DB: Debug */ +#define IDT_NMI 2 /* Nonmaskable External Interrupt */ +#define IDT_BP 3 /* #BP: Breakpoint */ +#define IDT_OF 4 /* #OF: Overflow */ +#define IDT_BR 5 /* #BR: Bound Range Exceeded */ +#define IDT_UD 6 /* #UD: Undefined/Invalid Opcode */ +#define IDT_NM 7 /* #NM: No Math Coprocessor */ +#define IDT_DF 8 /* #DF: Double Fault */ +#define IDT_FPUGP 9 /* Coprocessor Segment Overrun */ +#define IDT_TS 10 /* #TS: Invalid TSS */ +#define IDT_NP 11 /* #NP: Segment Not Present */ +#define IDT_SS 12 /* #SS: Stack Segment Fault */ +#define IDT_GP 13 /* 
#GP: General Protection Fault */ +#define IDT_PF 14 /* #PF: Page Fault */ +#define IDT_MF 16 /* #MF: FPU Floating-Point Error */ +#define IDT_AC 17 /* #AC: Alignment Check */ +#define IDT_MC 18 /* #MC: Machine Check */ +#define IDT_XF 19 /* #XF: SIMD Floating-Point Exception */ +#define IDT_IO_INTS NRSVIDT /* Base of IDT entries for I/O interrupts. */ +#define IDT_SYSCALL 0x80 /* System Call Interrupt Vector */ +#define IDT_DTRACE_RET 0x92 /* DTrace pid provider Interrupt Vector */ +#define IDT_EVTCHN 0x93 /* Xen HVM Event Channel Interrupt Vector */ + +#if defined(__i386__) +/* + * Entries in the Global Descriptor Table (GDT) + * Note that each 4 entries share a single 32 byte L1 cache line. + * Some of the fast syscall instructions require a specific order here. + */ +#define GNULL_SEL 0 /* Null Descriptor */ +#define GPRIV_SEL 1 /* SMP Per-Processor Private Data */ +#define GUFS_SEL 2 /* User %fs Descriptor (order critical: 1) */ +#define GUGS_SEL 3 /* User %gs Descriptor (order critical: 2) */ +#define GCODE_SEL 4 /* Kernel Code Descriptor (order critical: 1) */ +#define GDATA_SEL 5 /* Kernel Data Descriptor (order critical: 2) */ +#define GUCODE_SEL 6 /* User Code Descriptor (order critical: 3) */ +#define GUDATA_SEL 7 /* User Data Descriptor (order critical: 4) */ +#define GBIOSLOWMEM_SEL 8 /* BIOS low memory access (must be entry 8) */ +#define GPROC0_SEL 9 /* Task state process slot zero and up */ +#define GLDT_SEL 10 /* Default User LDT */ +#define GUSERLDT_SEL 11 /* User LDT */ +#define GPANIC_SEL 12 /* Task state to consider panic from */ +#define GBIOSCODE32_SEL 13 /* BIOS interface (32bit Code) */ +#define GBIOSCODE16_SEL 14 /* BIOS interface (16bit Code) */ +#define GBIOSDATA_SEL 15 /* BIOS interface (Data) */ +#define GBIOSUTIL_SEL 16 /* BIOS interface (Utility) */ +#define GBIOSARGS_SEL 17 /* BIOS interface (Arguments) */ +#define GNDIS_SEL 18 /* For the NDIS layer */ +#define NGDT 19 + +/* + * Entries in the Local Descriptor Table (LDT) + */ +#define LSYS5CALLS_SEL 0 /* forced by intel BCS */ +#define LSYS5SIGR_SEL 1 +#define LUCODE_SEL 3 +#define LUDATA_SEL 5 +#define NLDT (LUDATA_SEL + 1) + +#else /* !__i386__ */ +/* + * Entries in the Global Descriptor Table (GDT) + */ +#define GNULL_SEL 0 /* Null Descriptor */ +#define GNULL2_SEL 1 /* Null Descriptor */ +#define GUFS32_SEL 2 /* User 32 bit %fs Descriptor */ +#define GUGS32_SEL 3 /* User 32 bit %gs Descriptor */ +#define GCODE_SEL 4 /* Kernel Code Descriptor */ +#define GDATA_SEL 5 /* Kernel Data Descriptor */ +#define GUCODE32_SEL 6 /* User 32 bit code Descriptor */ +#define GUDATA_SEL 7 /* User 32/64 bit Data Descriptor */ +#define GUCODE_SEL 8 /* User 64 bit Code Descriptor */ +#define GPROC0_SEL 9 /* TSS for entering kernel etc */ +/* slot 10 is second half of GPROC0_SEL */ +#define GUSERLDT_SEL 11 /* LDT */ +/* slot 12 is second half of GUSERLDT_SEL */ +#define NGDT 13 +#endif /* __i386__ */ + +#endif /* !_X86_SEGMENTS_H_ */ diff --git a/usr/contrib/freebsd/x86/specialreg.h b/usr/contrib/freebsd/x86/specialreg.h index bea3122423..f528bad55c 100644 --- a/usr/contrib/freebsd/x86/specialreg.h +++ b/usr/contrib/freebsd/x86/specialreg.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * @@ -10,7 +12,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
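
A short sketch of the USD_{GET,SET}BASE and USD_{GET,SET}LIMIT accessors from
the segments.h hunk above, showing how a 32-bit base and 20-bit limit are
split across the lobase/hibase and lolimit/hilimit bitfields. The include
path is an assumption (on FreeBSD these definitions arrive via
<machine/segments.h>):

#include <machine/segments.h>
#include <assert.h>
#include <stdint.h>

int
main(void)
{
	struct user_segment_descriptor sd = { 0 };
	uint32_t base = 0x12345678;
	uint32_t limit = 0xfffff;	/* 20 bits; page units if sd_gran */

	USD_SETBASE(&sd, base);		/* low 24 bits, then high 8 bits */
	USD_SETLIMIT(&sd, limit);	/* low 16 bits, then high 4 bits */
	assert(USD_GETBASE(&sd) == base);
	assert(USD_GETLIMIT(&sd) == limit);
	return (0);
}
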
- * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -27,7 +29,7 @@ * SUCH DAMAGE. * * from: @(#)specialreg.h 7.1 (Berkeley) 5/9/91 - * $FreeBSD: head/sys/x86/include/specialreg.h 273338 2014-10-20 18:09:33Z neel $ + * $FreeBSD$ */ #ifndef _MACHINE_SPECIALREG_H_ @@ -53,6 +55,7 @@ #define CR0_CD 0x40000000 /* Cache Disable */ #define CR3_PCID_SAVE 0x8000000000000000 +#define CR3_PCID_MASK 0xfff /* * Bits in PPro special registers @@ -73,6 +76,8 @@ #define CR4_PCIDE 0x00020000 /* Enable Context ID */ #define CR4_XSAVE 0x00040000 /* XSETBV/XGETBV */ #define CR4_SMEP 0x00100000 /* Supervisor-Mode Execution Prevention */ +#define CR4_SMAP 0x00200000 /* Supervisor-Mode Access Prevention */ +#define CR4_PKE 0x00400000 /* Protection Keys Enable */ /* * Bits in AMD64 special registers. EFER is 64 bits wide. @@ -82,6 +87,9 @@ #define EFER_LMA 0x000000400 /* Long mode active (R) */ #define EFER_NXE 0x000000800 /* PTE No-Execute bit enable (R/W) */ #define EFER_SVM 0x000001000 /* SVM enable bit for AMD, reserved for Intel */ +#define EFER_LMSLE 0x000002000 /* Long Mode Segment Limit Enable */ +#define EFER_FFXSR 0x000004000 /* Fast FXSAVE/FSRSTOR */ +#define EFER_TCE 0x000008000 /* Translation Cache Extension */ /* * Intel Extended Features registers @@ -154,6 +162,7 @@ #define CPUID2_TM2 0x00000100 #define CPUID2_SSSE3 0x00000200 #define CPUID2_CNXTID 0x00000400 +#define CPUID2_SDBG 0x00000800 #define CPUID2_FMA 0x00001000 #define CPUID2_CX16 0x00002000 #define CPUID2_XTPR 0x00004000 @@ -181,8 +190,43 @@ #define CPUTPM1_SENSOR 0x00000001 #define CPUTPM1_TURBO 0x00000002 #define CPUTPM1_ARAT 0x00000004 +#define CPUTPM1_HWP 0x00000080 +#define CPUTPM1_HWP_NOTIFICATION 0x00000100 +#define CPUTPM1_HWP_ACTIVITY_WINDOW 0x00000200 +#define CPUTPM1_HWP_PERF_PREF 0x00000400 +#define CPUTPM1_HWP_PKG 0x00000800 +#define CPUTPM1_HWP_FLEXIBLE 0x00020000 #define CPUTPM2_EFFREQ 0x00000001 +/* Intel Processor Trace CPUID. */ + +/* Leaf 0 ebx. */ +#define CPUPT_CR3 (1 << 0) /* CR3 Filtering Support */ +#define CPUPT_PSB (1 << 1) /* Configurable PSB and Cycle-Accurate Mode Supported */ +#define CPUPT_IPF (1 << 2) /* IP Filtering and TraceStop supported */ +#define CPUPT_MTC (1 << 3) /* MTC Supported */ +#define CPUPT_PRW (1 << 4) /* PTWRITE Supported */ +#define CPUPT_PWR (1 << 5) /* Power Event Trace Supported */ + +/* Leaf 0 ecx. */ +#define CPUPT_TOPA (1 << 0) /* ToPA Output Supported */ +#define CPUPT_TOPA_MULTI (1 << 1) /* ToPA Tables Allow Multiple Output Entries */ +#define CPUPT_SINGLE (1 << 2) /* Single-Range Output Supported */ +#define CPUPT_TT_OUT (1 << 3) /* Output to Trace Transport Subsystem Supported */ +#define CPUPT_LINEAR_IP (1 << 31) /* IP Payloads are Linear IP, otherwise IP is effective */ + +/* Leaf 1 eax. */ +#define CPUPT_NADDR_S 0 /* Number of Address Ranges */ +#define CPUPT_NADDR_M (0x7 << CPUPT_NADDR_S) +#define CPUPT_MTC_BITMAP_S 16 /* Bitmap of supported MTC Period Encodings */ +#define CPUPT_MTC_BITMAP_M (0xffff << CPUPT_MTC_BITMAP_S) + +/* Leaf 1 ebx. 
*/ +#define CPUPT_CT_BITMAP_S 0 /* Bitmap of supported Cycle Threshold values */ +#define CPUPT_CT_BITMAP_M (0xffff << CPUPT_CT_BITMAP_S) +#define CPUPT_PFE_BITMAP_S 16 /* Bitmap of supported Configurable PSB Frequency encoding */ +#define CPUPT_PFE_BITMAP_M (0xffff << CPUPT_PFE_BITMAP_S) + /* * Important bits in the AMD extended cpuid flags */ @@ -190,7 +234,7 @@ #define AMDID_MP 0x00080000 #define AMDID_NX 0x00100000 #define AMDID_EXT_MMX 0x00400000 -#define AMDID_FFXSR 0x01000000 +#define AMDID_FFXSR 0x02000000 #define AMDID_PAGE1GB 0x04000000 #define AMDID_RDTSCP 0x08000000 #define AMDID_LM 0x20000000 @@ -222,6 +266,7 @@ #define AMDID2_DBE 0x04000000 #define AMDID2_PTSC 0x08000000 #define AMDID2_PTSCEL2I 0x10000000 +#define AMDID2_MWAITX 0x20000000 /* * CPUID instruction 1 eax info @@ -301,6 +346,15 @@ #define CPUID_EXTSTATE_XINUSE 0x00000004 #define CPUID_EXTSTATE_XSAVES 0x00000008 +/* + * AMD extended function 8000_0007h ebx info + */ +#define AMDRAS_MCA_OF_RECOV 0x00000001 +#define AMDRAS_SUCCOR 0x00000002 +#define AMDRAS_HW_ASSERT 0x00000004 +#define AMDRAS_SCALABLE_MCA 0x00000008 +#define AMDRAS_PFEH_SUPPORT 0x00000010 + /* * AMD extended function 8000_0007h edx info */ @@ -315,6 +369,24 @@ #define AMDPM_TSC_INVARIANT 0x00000100 #define AMDPM_CPB 0x00000200 +/* + * AMD extended function 8000_0008h ebx info (amd_extended_feature_extensions) + */ +#define AMDFEID_CLZERO 0x00000001 +#define AMDFEID_IRPERF 0x00000002 +#define AMDFEID_XSAVEERPTR 0x00000004 +#define AMDFEID_IBPB 0x00001000 +#define AMDFEID_IBRS 0x00004000 +#define AMDFEID_STIBP 0x00008000 +/* The below are only defined if the corresponding base feature above exists. */ +#define AMDFEID_IBRS_ALWAYSON 0x00010000 +#define AMDFEID_STIBP_ALWAYSON 0x00020000 +#define AMDFEID_PREFER_IBRS 0x00040000 +#define AMDFEID_SSBD 0x01000000 +/* SSBD via MSRC001_011F instead of MSR 0x48: */ +#define AMDFEID_VIRT_SSBD 0x02000000 +#define AMDFEID_SSB_NO 0x04000000 + /* * AMD extended function 8000_0008h ecx info */ @@ -327,25 +399,83 @@ */ #define CPUID_STDEXT_FSGSBASE 0x00000001 #define CPUID_STDEXT_TSC_ADJUST 0x00000002 +#define CPUID_STDEXT_SGX 0x00000004 #define CPUID_STDEXT_BMI1 0x00000008 #define CPUID_STDEXT_HLE 0x00000010 #define CPUID_STDEXT_AVX2 0x00000020 +#define CPUID_STDEXT_FDP_EXC 0x00000040 #define CPUID_STDEXT_SMEP 0x00000080 #define CPUID_STDEXT_BMI2 0x00000100 #define CPUID_STDEXT_ERMS 0x00000200 #define CPUID_STDEXT_INVPCID 0x00000400 #define CPUID_STDEXT_RTM 0x00000800 +#define CPUID_STDEXT_PQM 0x00001000 +#define CPUID_STDEXT_NFPUSG 0x00002000 #define CPUID_STDEXT_MPX 0x00004000 +#define CPUID_STDEXT_PQE 0x00008000 #define CPUID_STDEXT_AVX512F 0x00010000 +#define CPUID_STDEXT_AVX512DQ 0x00020000 #define CPUID_STDEXT_RDSEED 0x00040000 #define CPUID_STDEXT_ADX 0x00080000 #define CPUID_STDEXT_SMAP 0x00100000 +#define CPUID_STDEXT_AVX512IFMA 0x00200000 +#define CPUID_STDEXT_PCOMMIT 0x00400000 #define CPUID_STDEXT_CLFLUSHOPT 0x00800000 +#define CPUID_STDEXT_CLWB 0x01000000 #define CPUID_STDEXT_PROCTRACE 0x02000000 #define CPUID_STDEXT_AVX512PF 0x04000000 #define CPUID_STDEXT_AVX512ER 0x08000000 #define CPUID_STDEXT_AVX512CD 0x10000000 #define CPUID_STDEXT_SHA 0x20000000 +#define CPUID_STDEXT_AVX512BW 0x40000000 +#define CPUID_STDEXT_AVX512VL 0x80000000 + +/* + * CPUID instruction 7 Structured Extended Features, leaf 0 ecx info + */ +#define CPUID_STDEXT2_PREFETCHWT1 0x00000001 +#define CPUID_STDEXT2_AVX512VBMI 0x00000002 +#define CPUID_STDEXT2_UMIP 0x00000004 +#define CPUID_STDEXT2_PKU 0x00000008 +#define 
CPUID_STDEXT2_OSPKE 0x00000010 +#define CPUID_STDEXT2_WAITPKG 0x00000020 +#define CPUID_STDEXT2_AVX512VBMI2 0x00000040 +#define CPUID_STDEXT2_GFNI 0x00000100 +#define CPUID_STDEXT2_VAES 0x00000200 +#define CPUID_STDEXT2_VPCLMULQDQ 0x00000400 +#define CPUID_STDEXT2_AVX512VNNI 0x00000800 +#define CPUID_STDEXT2_AVX512BITALG 0x00001000 +#define CPUID_STDEXT2_AVX512VPOPCNTDQ 0x00004000 +#define CPUID_STDEXT2_RDPID 0x00400000 +#define CPUID_STDEXT2_CLDEMOTE 0x02000000 +#define CPUID_STDEXT2_MOVDIRI 0x08000000 +#define CPUID_STDEXT2_MOVDIRI64B 0x10000000 +#define CPUID_STDEXT2_ENQCMD 0x20000000 +#define CPUID_STDEXT2_SGXLC 0x40000000 + +/* + * CPUID instruction 7 Structured Extended Features, leaf 0 edx info + */ +#define CPUID_STDEXT3_AVX5124VNNIW 0x00000004 +#define CPUID_STDEXT3_AVX5124FMAPS 0x00000008 +#define CPUID_STDEXT3_AVX512VP2INTERSECT 0x00000100 +#define CPUID_STDEXT3_MD_CLEAR 0x00000400 +#define CPUID_STDEXT3_TSXFA 0x00002000 +#define CPUID_STDEXT3_PCONFIG 0x00040000 +#define CPUID_STDEXT3_IBPB 0x04000000 +#define CPUID_STDEXT3_STIBP 0x08000000 +#define CPUID_STDEXT3_L1D_FLUSH 0x10000000 +#define CPUID_STDEXT3_ARCH_CAP 0x20000000 +#define CPUID_STDEXT3_CORE_CAP 0x40000000 +#define CPUID_STDEXT3_SSBD 0x80000000 + +/* MSR IA32_ARCH_CAP(ABILITIES) bits */ +#define IA32_ARCH_CAP_RDCL_NO 0x00000001 +#define IA32_ARCH_CAP_IBRS_ALL 0x00000002 +#define IA32_ARCH_CAP_RSBA 0x00000004 +#define IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY 0x00000008 +#define IA32_ARCH_CAP_SSB_NO 0x00000010 +#define IA32_ARCH_CAP_MDS_NO 0x00000020 /* * CPUID manufacturers identifiers @@ -375,6 +505,8 @@ #define MSR_EBL_CR_POWERON 0x02a #define MSR_TEST_CTL 0x033 #define MSR_IA32_FEATURE_CONTROL 0x03a +#define MSR_IA32_SPEC_CTRL 0x048 +#define MSR_IA32_PRED_CMD 0x049 #define MSR_BIOS_UPDT_TRIG 0x079 #define MSR_BBL_CR_D0 0x088 #define MSR_BBL_CR_D1 0x089 @@ -387,6 +519,9 @@ #define MSR_APERF 0x0e8 #define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */ #define MSR_MTRRcap 0x0fe +#define MSR_IA32_ARCH_CAP 0x10a +#define MSR_IA32_FLUSH_CMD 0x10b +#define MSR_TSX_FORCE_ABORT 0x10f #define MSR_BBL_CR_ADDR 0x116 #define MSR_BBL_CR_DECC 0x118 #define MSR_BBL_CR_CTL 0x119 @@ -446,6 +581,14 @@ #define MSR_DRAM_ENERGY_STATUS 0x619 #define MSR_PP0_ENERGY_STATUS 0x639 #define MSR_PP1_ENERGY_STATUS 0x641 +#define MSR_PPERF 0x64e +#define MSR_TSC_DEADLINE 0x6e0 /* Writes are not serializing */ +#define MSR_IA32_PM_ENABLE 0x770 +#define MSR_IA32_HWP_CAPABILITIES 0x771 +#define MSR_IA32_HWP_REQUEST_PKG 0x772 +#define MSR_IA32_HWP_INTERRUPT 0x773 +#define MSR_IA32_HWP_REQUEST 0x774 +#define MSR_IA32_HWP_STATUS 0x777 /* * VMX MSRs @@ -467,8 +610,10 @@ #define MSR_VMX_TRUE_ENTRY_CTLS 0x490 /* - * X2APIC MSRs + * X2APIC MSRs. + * Writes are not serializing. */ +#define MSR_APIC_000 0x800 #define MSR_APIC_ID 0x802 #define MSR_APIC_VERSION 0x803 #define MSR_APIC_TPR 0x808 @@ -501,6 +646,85 @@ #define MSR_IA32_XSS 0xda0 +/* + * Intel Processor Trace (PT) MSRs. 
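
The IA32_ARCH_CAP bits added just above are normally consulted together with
the CPUID leaf-7 %edx feature bits. A hedged sketch of that decision, where
rdmsr() stands in for a kernel-mode MSR read (it is not a userland call) and
the CPUID_STDEXT3_*/IA32_ARCH_CAP_* definitions above are assumed in scope:

#include <stdint.h>
#include <stdbool.h>

static bool
mds_mitigation_needed(uint32_t cpuid7_edx)
{
	/* Without IA32_ARCH_CAP there is no MDS_NO bit to consult. */
	if ((cpuid7_edx & CPUID_STDEXT3_ARCH_CAP) == 0)
		return (true);

	/* MDS_NO set means the CPU reports itself not vulnerable. */
	return ((rdmsr(MSR_IA32_ARCH_CAP) & IA32_ARCH_CAP_MDS_NO) == 0);
}
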
+ */ +#define MSR_IA32_RTIT_OUTPUT_BASE 0x560 /* Trace Output Base Register (R/W) */ +#define MSR_IA32_RTIT_OUTPUT_MASK_PTRS 0x561 /* Trace Output Mask Pointers Register (R/W) */ +#define MSR_IA32_RTIT_CTL 0x570 /* Trace Control Register (R/W) */ +#define RTIT_CTL_TRACEEN (1 << 0) +#define RTIT_CTL_CYCEN (1 << 1) +#define RTIT_CTL_OS (1 << 2) +#define RTIT_CTL_USER (1 << 3) +#define RTIT_CTL_PWREVTEN (1 << 4) +#define RTIT_CTL_FUPONPTW (1 << 5) +#define RTIT_CTL_FABRICEN (1 << 6) +#define RTIT_CTL_CR3FILTER (1 << 7) +#define RTIT_CTL_TOPA (1 << 8) +#define RTIT_CTL_MTCEN (1 << 9) +#define RTIT_CTL_TSCEN (1 << 10) +#define RTIT_CTL_DISRETC (1 << 11) +#define RTIT_CTL_PTWEN (1 << 12) +#define RTIT_CTL_BRANCHEN (1 << 13) +#define RTIT_CTL_MTC_FREQ_S 14 +#define RTIT_CTL_MTC_FREQ(n) ((n) << RTIT_CTL_MTC_FREQ_S) +#define RTIT_CTL_MTC_FREQ_M (0xf << RTIT_CTL_MTC_FREQ_S) +#define RTIT_CTL_CYC_THRESH_S 19 +#define RTIT_CTL_CYC_THRESH_M (0xf << RTIT_CTL_CYC_THRESH_S) +#define RTIT_CTL_PSB_FREQ_S 24 +#define RTIT_CTL_PSB_FREQ_M (0xf << RTIT_CTL_PSB_FREQ_S) +#define RTIT_CTL_ADDR_CFG_S(n) (32 + (n) * 4) +#define RTIT_CTL_ADDR0_CFG_S 32 +#define RTIT_CTL_ADDR0_CFG_M (0xfULL << RTIT_CTL_ADDR0_CFG_S) +#define RTIT_CTL_ADDR1_CFG_S 36 +#define RTIT_CTL_ADDR1_CFG_M (0xfULL << RTIT_CTL_ADDR1_CFG_S) +#define RTIT_CTL_ADDR2_CFG_S 40 +#define RTIT_CTL_ADDR2_CFG_M (0xfULL << RTIT_CTL_ADDR2_CFG_S) +#define RTIT_CTL_ADDR3_CFG_S 44 +#define RTIT_CTL_ADDR3_CFG_M (0xfULL << RTIT_CTL_ADDR3_CFG_S) +#define MSR_IA32_RTIT_STATUS 0x571 /* Tracing Status Register (R/W) */ +#define RTIT_STATUS_FILTEREN (1 << 0) +#define RTIT_STATUS_CONTEXTEN (1 << 1) +#define RTIT_STATUS_TRIGGEREN (1 << 2) +#define RTIT_STATUS_ERROR (1 << 4) +#define RTIT_STATUS_STOPPED (1 << 5) +#define RTIT_STATUS_PACKETBYTECNT_S 32 +#define RTIT_STATUS_PACKETBYTECNT_M (0x1ffffULL << RTIT_STATUS_PACKETBYTECNT_S) +#define MSR_IA32_RTIT_CR3_MATCH 0x572 /* Trace Filter CR3 Match Register (R/W) */ +#define MSR_IA32_RTIT_ADDR_A(n) (0x580 + (n) * 2) +#define MSR_IA32_RTIT_ADDR_B(n) (0x581 + (n) * 2) +#define MSR_IA32_RTIT_ADDR0_A 0x580 /* Region 0 Start Address (R/W) */ +#define MSR_IA32_RTIT_ADDR0_B 0x581 /* Region 0 End Address (R/W) */ +#define MSR_IA32_RTIT_ADDR1_A 0x582 /* Region 1 Start Address (R/W) */ +#define MSR_IA32_RTIT_ADDR1_B 0x583 /* Region 1 End Address (R/W) */ +#define MSR_IA32_RTIT_ADDR2_A 0x584 /* Region 2 Start Address (R/W) */ +#define MSR_IA32_RTIT_ADDR2_B 0x585 /* Region 2 End Address (R/W) */ +#define MSR_IA32_RTIT_ADDR3_A 0x586 /* Region 3 Start Address (R/W) */ +#define MSR_IA32_RTIT_ADDR3_B 0x587 /* Region 3 End Address (R/W) */ + +/* Intel Processor Trace Table of Physical Addresses (ToPA). 
*/ +#define TOPA_SIZE_S 6 +#define TOPA_SIZE_M (0xf << TOPA_SIZE_S) +#define TOPA_SIZE_4K (0 << TOPA_SIZE_S) +#define TOPA_SIZE_8K (1 << TOPA_SIZE_S) +#define TOPA_SIZE_16K (2 << TOPA_SIZE_S) +#define TOPA_SIZE_32K (3 << TOPA_SIZE_S) +#define TOPA_SIZE_64K (4 << TOPA_SIZE_S) +#define TOPA_SIZE_128K (5 << TOPA_SIZE_S) +#define TOPA_SIZE_256K (6 << TOPA_SIZE_S) +#define TOPA_SIZE_512K (7 << TOPA_SIZE_S) +#define TOPA_SIZE_1M (8 << TOPA_SIZE_S) +#define TOPA_SIZE_2M (9 << TOPA_SIZE_S) +#define TOPA_SIZE_4M (10 << TOPA_SIZE_S) +#define TOPA_SIZE_8M (11 << TOPA_SIZE_S) +#define TOPA_SIZE_16M (12 << TOPA_SIZE_S) +#define TOPA_SIZE_32M (13 << TOPA_SIZE_S) +#define TOPA_SIZE_64M (14 << TOPA_SIZE_S) +#define TOPA_SIZE_128M (15 << TOPA_SIZE_S) +#define TOPA_STOP (1 << 4) +#define TOPA_INT (1 << 2) +#define TOPA_END (1 << 0) + /* * Constants related to MSR's. */ @@ -515,6 +739,55 @@ #define IA32_FEATURE_CONTROL_SMX_EN 0x02 /* enable VMX inside SMX */ #define IA32_FEATURE_CONTROL_VMX_EN 0x04 /* enable VMX outside SMX */ +/* MSR IA32_MISC_ENABLE */ +#define IA32_MISC_EN_FASTSTR 0x0000000000000001ULL +#define IA32_MISC_EN_ATCCE 0x0000000000000008ULL +#define IA32_MISC_EN_PERFMON 0x0000000000000080ULL +#define IA32_MISC_EN_PEBSU 0x0000000000001000ULL +#define IA32_MISC_EN_ESSTE 0x0000000000010000ULL +#define IA32_MISC_EN_MONE 0x0000000000040000ULL +#define IA32_MISC_EN_LIMCPUID 0x0000000000400000ULL +#define IA32_MISC_EN_xTPRD 0x0000000000800000ULL +#define IA32_MISC_EN_XDD 0x0000000400000000ULL + +/* + * IA32_SPEC_CTRL and IA32_PRED_CMD MSRs are described in the Intel' + * document 336996-001 Speculative Execution Side Channel Mitigations. + * + * AMD uses the same MSRs and bit definitions, as described in 111006-B + * "Indirect Branch Control Extension" and 124441 "Speculative Store Bypass + * Disable." + */ +/* MSR IA32_SPEC_CTRL */ +#define IA32_SPEC_CTRL_IBRS 0x00000001 +#define IA32_SPEC_CTRL_STIBP 0x00000002 +#define IA32_SPEC_CTRL_SSBD 0x00000004 + +/* MSR IA32_PRED_CMD */ +#define IA32_PRED_CMD_IBPB_BARRIER 0x0000000000000001ULL + +/* MSR IA32_FLUSH_CMD */ +#define IA32_FLUSH_CMD_L1D 0x00000001 + +/* MSR IA32_HWP_CAPABILITIES */ +#define IA32_HWP_CAPABILITIES_HIGHEST_PERFORMANCE(x) (((x) >> 0) & 0xff) +#define IA32_HWP_CAPABILITIES_GUARANTEED_PERFORMANCE(x) (((x) >> 8) & 0xff) +#define IA32_HWP_CAPABILITIES_EFFICIENT_PERFORMANCE(x) (((x) >> 16) & 0xff) +#define IA32_HWP_CAPABILITIES_LOWEST_PERFORMANCE(x) (((x) >> 24) & 0xff) + +/* MSR IA32_HWP_REQUEST */ +#define IA32_HWP_REQUEST_MINIMUM_VALID (1ULL << 63) +#define IA32_HWP_REQUEST_MAXIMUM_VALID (1ULL << 62) +#define IA32_HWP_REQUEST_DESIRED_VALID (1ULL << 61) +#define IA32_HWP_REQUEST_EPP_VALID (1ULL << 60) +#define IA32_HWP_REQUEST_ACTIVITY_WINDOW_VALID (1ULL << 59) +#define IA32_HWP_REQUEST_PACKAGE_CONTROL (1ULL << 42) +#define IA32_HWP_ACTIVITY_WINDOW (0x3ffULL << 32) +#define IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE (0xffULL << 24) +#define IA32_HWP_DESIRED_PERFORMANCE (0xffULL << 16) +#define IA32_HWP_REQUEST_MAXIMUM_PERFORMANCE (0xffULL << 8) +#define IA32_HWP_MINIMUM_PERFORMANCE (0xffULL << 0) + /* * PAT modes. 
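
The ToPA size field defined above packs a power-of-two region size into bits
9:6 of each table entry (4 KiB for encoding 0 up through 128 MiB for 15), so
decoding an entry's byte count reduces to a shift. A small sketch, assuming
the TOPA_* definitions above are in scope:

#include <stdint.h>

static uint64_t
topa_entry_bytes(uint64_t entry)
{
	/* TOPA_SIZE_4K (n == 0) through TOPA_SIZE_128M (n == 15). */
	uint64_t n = (entry & TOPA_SIZE_M) >> TOPA_SIZE_S;

	return ((uint64_t)4096 << n);
}
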
*/ @@ -665,6 +938,33 @@ #define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */ #define MC_CTL2_THRESHOLD 0x0000000000007fff #define MC_CTL2_CMCI_EN 0x0000000040000000 +#define MC_AMDNB_BANK 4 +#define MC_MISC_AMD_VAL 0x8000000000000000 /* Counter presence valid */ +#define MC_MISC_AMD_CNTP 0x4000000000000000 /* Counter present */ +#define MC_MISC_AMD_LOCK 0x2000000000000000 /* Register locked */ +#define MC_MISC_AMD_INTP 0x1000000000000000 /* Int. type can generate interrupts */ +#define MC_MISC_AMD_LVT_MASK 0x00f0000000000000 /* Extended LVT offset */ +#define MC_MISC_AMD_LVT_SHIFT 52 +#define MC_MISC_AMD_CNTEN 0x0008000000000000 /* Counter enabled */ +#define MC_MISC_AMD_INT_MASK 0x0006000000000000 /* Interrupt type */ +#define MC_MISC_AMD_INT_LVT 0x0002000000000000 /* Interrupt via Extended LVT */ +#define MC_MISC_AMD_INT_SMI 0x0004000000000000 /* SMI */ +#define MC_MISC_AMD_OVERFLOW 0x0001000000000000 /* Counter overflow */ +#define MC_MISC_AMD_CNT_MASK 0x00000fff00000000 /* Counter value */ +#define MC_MISC_AMD_CNT_SHIFT 32 +#define MC_MISC_AMD_CNT_MAX 0xfff +#define MC_MISC_AMD_PTR_MASK 0x00000000ff000000 /* Pointer to additional registers */ +#define MC_MISC_AMD_PTR_SHIFT 24 + +/* AMD Scalable MCA */ +#define MSR_SMCA_MC0_CTL 0xc0002000 +#define MSR_SMCA_MC0_STATUS 0xc0002001 +#define MSR_SMCA_MC0_ADDR 0xc0002002 +#define MSR_SMCA_MC0_MISC0 0xc0002003 +#define MSR_SMCA_MC_CTL(x) (MSR_SMCA_MC0_CTL + 0x10 * (x)) +#define MSR_SMCA_MC_STATUS(x) (MSR_SMCA_MC0_STATUS + 0x10 * (x)) +#define MSR_SMCA_MC_ADDR(x) (MSR_SMCA_MC0_ADDR + 0x10 * (x)) +#define MSR_SMCA_MC_MISC(x) (MSR_SMCA_MC0_MISC0 + 0x10 * (x)) /* * The following four 3-byte registers control the non-cacheable regions. @@ -768,6 +1068,7 @@ #define MSR_FSBASE 0xc0000100 /* base address of the %fs "segment" */ #define MSR_GSBASE 0xc0000101 /* base address of the %gs "segment" */ #define MSR_KGSBASE 0xc0000102 /* base address of the kernel %gs */ +#define MSR_TSC_AUX 0xc0000103 #define MSR_PERFEVSEL0 0xc0010000 #define MSR_PERFEVSEL1 0xc0010001 #define MSR_PERFEVSEL2 0xc0010002 @@ -785,17 +1086,20 @@ #define MSR_TOP_MEM 0xc001001a /* boundary for ram below 4G */ #define MSR_TOP_MEM2 0xc001001d /* boundary for ram above 4G */ #define MSR_NB_CFG1 0xc001001f /* NB configuration 1 */ +#define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ +#define MSR_MC0_CTL_MASK 0xc0010044 #define MSR_P_STATE_LIMIT 0xc0010061 /* P-state Current Limit Register */ #define MSR_P_STATE_CONTROL 0xc0010062 /* P-state Control Register */ #define MSR_P_STATE_STATUS 0xc0010063 /* P-state Status Register */ #define MSR_P_STATE_CONFIG(n) (0xc0010064 + (n)) /* P-state Config */ #define MSR_SMM_ADDR 0xc0010112 /* SMM TSEG base address */ #define MSR_SMM_MASK 0xc0010113 /* SMM TSEG address mask */ +#define MSR_VM_CR 0xc0010114 /* SVM: feature control */ +#define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */ +#define MSR_AMD_CPUID07 0xc0011002 /* CPUID 07 %ebx override */ +#define MSR_EXTFEATURES 0xc0011005 /* Extended CPUID Features override */ +#define MSR_LS_CFG 0xc0011020 #define MSR_IC_CFG 0xc0011021 /* Instruction Cache Configuration */ -#define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ -#define MSR_MC0_CTL_MASK 0xc0010044 -#define MSR_VM_CR 0xc0010114 /* SVM: feature control */ -#define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */ /* MSR_VM_CR related */ #define VM_CR_SVMDIS 0x10 /* SVM: disabled by BIOS */ diff --git a/usr/src/Makefile.master b/usr/src/Makefile.master index 
a6d4d763d6..aa7bd524bd 100644 --- a/usr/src/Makefile.master +++ b/usr/src/Makefile.master @@ -57,6 +57,12 @@ $(NO_ADJUNCT_PROTO)HAVE_ADJUNCT_PROTO=$(POUND_SIGN) # NATIVE_ADJUNCT= /usr +# +# Compatibility code for FreeBSD etc. +# +COMPAT= $(SRC)/compat +CONTRIB= $(SRC)/../contrib + # # RELEASE_BUILD should be cleared for final release builds. # NOT_RELEASE_BUILD is exactly what the name implies. diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile index a4a744fe95..f20274bd35 100644 --- a/usr/src/cmd/Makefile +++ b/usr/src/cmd/Makefile @@ -488,6 +488,8 @@ i386_SUBDIRS= \ acpihpd \ addbadsec \ ahciem \ + bhyve \ + bhyvectl \ biosdev \ cxgbetool \ diskscan \ diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile index f47daead31..e96868e006 100644 --- a/usr/src/cmd/bhyve/Makefile +++ b/usr/src/cmd/bhyve/Makefile @@ -11,13 +11,16 @@ # # Copyright 2014 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. # PROG = bhyve include ../Makefile.cmd +include ../Makefile.cmd.64 +include ../Makefile.ctf -$(BUILD64)SUBDIRS += $(MACH64) +SUBDIRS = test all := TARGET = all install := TARGET = install @@ -25,17 +28,127 @@ clean := TARGET = clean clobber := TARGET = clobber lint := TARGET = lint +SRCS = acpi.c \ + atkbdc.c \ + bhyvegc.c \ + bhyverun.c \ + block_if.c \ + bootrom.c \ + console.c \ + consport.c \ + dbgport.c \ + fwctl.c \ + gdb.c \ + inout.c \ + ioapic.c \ + mem.c \ + mevent.c \ + mptbl.c \ + pci_ahci.c \ + pci_e82545.c \ + pci_emul.c \ + pci_fbuf.c \ + pci_hostbridge.c \ + pci_irq.c \ + pci_lpc.c \ + pci_nvme.c \ + pci_passthru.c \ + pci_uart.c \ + pci_virtio_block.c \ + pci_virtio_console.c \ + pci_virtio_net.c \ + pci_virtio_rnd.c \ + pci_xhci.c \ + pm.c \ + post.c \ + ps2kbd.c \ + ps2mouse.c \ + rfb.c \ + rtc.c \ + smbiostbl.c \ + sockstream.c \ + task_switch.c \ + uart_emul.c \ + usb_emul.c \ + usb_mouse.c \ + vga.c \ + virtio.c \ + vmm_instruction_emul.c \ + xmsr.c \ + spinup_ap.c \ + iov.c \ + bhyve_sol_glue.c + +# The virtio-scsi driver appears to include a slew of materials from FreeBSD's +# native SCSI implementation. We will omit that complexity for now. 
+ #ctl_util.c \ + #ctl_scsi_all.c \ + #pci_virtio_scsi.c \ + + +OBJS = $(SRCS:.c=.o) + +CLOBBERFILES = $(ROOTUSRSBINPROG) + +MEVENT_TEST_PROG = mevent_test +MEVENT_TEST_SRCS = mevent.c mevent_test.c +MEVENT_TEST_OBJS = $(MEVENT_TEST_SRCS:.c=.o) + +CLEANFILES = $(PROG) $(MEVENT_TEST_PROG) $(MEVENT_TEST_OBJS) + +CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration -_gcc=-Wno-parentheses +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 \ + -I$(CONTRIB)/freebsd/dev/usb/controller \ + -I$(CONTRIB)/freebsd/dev/mii \ + -I$(SRC)/uts/common/io/e1000api \ + $(CPPFLAGS.master) \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/common \ + -I$(SRC)/uts/i86pc \ + -DWITHOUT_CAPSICUM + +# Disable the crypto code until it is wired up +CPPFLAGS += -DNO_OPENSSL + +pci_nvme.o := CERRWARN += -_gcc=-Wno-pointer-sign + +SMOFF += all_func_returns,leaks,no_if_block + +# Force c99 for everything +CSTD= $(CSTD_GNU99) +C99MODE= -xc99=%all +C99LMODE= -Xc99=%all + +$(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -lmd -luuid -lvmmapi -lz +$(MEVENT_TEST_PROG) := LDLIBS += -lsocket + .KEEP_STATE: -all clean clobber lint: $(SUBDIRS) +all: $(PROG) $(MEVENT_TEST_PROG) $(SUBDIRS) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +$(MEVENT_TEST_PROG): $(MEVENT_TEST_OBJS) + $(LINK.c) -o $@ $(MEVENT_TEST_OBJS) $(LDFLAGS) $(LDLIBS) + +install: all $(ROOTUSRSBINPROG) $(SUBDIRS) + +clean: $(SUBDIRS) + $(RM) $(OBJS) $(CLEANFILES) + +clobber: clean $(SUBDIRS) + $(RM) $(CLOBBERFILES) -install: $(SUBDIRS) - -$(RM) $(ROOTUSRSBINPROG) - -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) +lint: lint_SRCS $(SUBDIRS) -$(SUBDIRS): FRC - @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) FRC: -include ../Makefile.targ +%.o: $(SRC)/uts/i86pc/io/vmm/%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) diff --git a/usr/src/cmd/bhyve/Makefile.com b/usr/src/cmd/bhyve/Makefile.com deleted file mode 100644 index 4a92b622ab..0000000000 --- a/usr/src/cmd/bhyve/Makefile.com +++ /dev/null @@ -1,94 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2015 Pluribus Networks Inc. 
-# - -PROG= bhyve - -SRCS = atkbdc.c \ - bhyvegc.c \ - bhyverun.c \ - block_if.c \ - console.c \ - consport.c \ - inout.c \ - ioapic.c \ - mem.c \ - mptbl.c \ - pci_ahci.c \ - pci_emul.c \ - pci_hostbridge.c \ - pci_irq.c \ - pci_lpc.c \ - pci_virtio_block.c \ - pci_virtio_net.c \ - pci_virtio_viona.c \ - pm.c \ - pmtmr.c \ - post.c \ - ps2kbd.c \ - ps2mouse.c \ - rfb.c \ - rtc.c \ - smbiostbl.c \ - uart_emul.c \ - vga.c \ - virtio.c \ - vmm_instruction_emul.c \ - xmsr.c \ - spinup_ap.c \ - bhyve_sol_glue.c - -OBJS = $(SRCS:.c=.o) - -include ../../Makefile.cmd - -.KEEP_STATE: - -CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration -CFLAGS64 += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration -CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ - -I$(ROOT)/usr/platform/i86pc/include \ - -I$(SRC)/uts/i86pc/io/vmm \ - -I$(SRC)/uts/common \ - -I$(SRC)/uts/i86pc \ - -I$(SRC)/lib/libdladm/common -LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lkstat -lmd -luuid -lvmmapi - -POST_PROCESS += ; $(GENSETDEFS) $@ - -all: $(PROG) - -$(PROG): $(OBJS) - $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) - $(POST_PROCESS) - -install: all $(ROOTUSRSBINPROG) - -clean: - $(RM) $(OBJS) - -lint: lint_SRCS - -include ../../Makefile.targ - -%.o: ../%.c - $(COMPILE.c) $< - $(POST_PROCESS_O) - -%.o: $(SRC)/uts/i86pc/io/vmm/%.c - $(COMPILE.c) $< - $(POST_PROCESS_O) - -%.o: ../%.s - $(COMPILE.s) $< diff --git a/usr/src/cmd/bhyve/acpi.c b/usr/src/cmd/bhyve/acpi.c new file mode 100644 index 0000000000..862f4512f8 --- /dev/null +++ b/usr/src/cmd/bhyve/acpi.c @@ -0,0 +1,1007 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * bhyve ACPI table generator. + * + * Create the minimal set of ACPI tables required to boot FreeBSD (and + * hopefully other o/s's) by writing out ASL template files for each of + * the tables and the compiling them to AML with the Intel iasl compiler. + * The AML files are then read into guest memory. + * + * The tables are placed in the guest's ROM area just below 1MB physical, + * above the MPTable. 
+ * + * Layout (No longer correct at FADT and beyond due to properly + * calculating the size of the MADT to allow for changes to + * VM_MAXCPU above 21 which overflows this layout.) + * ------ + * RSDP -> 0xf2400 (36 bytes fixed) + * RSDT -> 0xf2440 (36 bytes + 4*7 table addrs, 4 used) + * XSDT -> 0xf2480 (36 bytes + 8*7 table addrs, 4 used) + * MADT -> 0xf2500 (depends on #CPUs) + * FADT -> 0xf2600 (268 bytes) + * HPET -> 0xf2740 (56 bytes) + * MCFG -> 0xf2780 (60 bytes) + * FACS -> 0xf27C0 (64 bytes) + * DSDT -> 0xf2800 (variable - can go up to 0x100000) + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "bhyverun.h" +#include "acpi.h" +#include "pci_emul.h" + +/* + * Define the base address of the ACPI tables, the sizes of some tables, + * and the offsets to the individual tables, + */ +#define BHYVE_ACPI_BASE 0xf2400 +#define RSDT_OFFSET 0x040 +#define XSDT_OFFSET 0x080 +#define MADT_OFFSET 0x100 +/* + * The MADT consists of: + * 44 Fixed Header + * 8 * maxcpu Processor Local APIC entries + * 12 I/O APIC entry + * 2 * 10 Interrupt Source Override entires + * 6 Local APIC NMI entry + */ +#define MADT_SIZE (44 + VM_MAXCPU*8 + 12 + 2*10 + 6) +#define FADT_OFFSET (MADT_OFFSET + MADT_SIZE) +#define FADT_SIZE 0x140 +#define HPET_OFFSET (FADT_OFFSET + FADT_SIZE) +#define HPET_SIZE 0x40 +#define MCFG_OFFSET (HPET_OFFSET + HPET_SIZE) +#define MCFG_SIZE 0x40 +#define FACS_OFFSET (MCFG_OFFSET + MCFG_SIZE) +#define FACS_SIZE 0x40 +#define DSDT_OFFSET (FACS_OFFSET + FACS_SIZE) + +#define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX" +#define BHYVE_ASL_SUFFIX ".aml" +#define BHYVE_ASL_COMPILER "/usr/sbin/iasl" + +static int basl_keep_temps; +static int basl_verbose_iasl; +static int basl_ncpu; +static uint32_t basl_acpi_base = BHYVE_ACPI_BASE; +static uint32_t hpet_capabilities; + +/* + * Contains the full pathname of the template to be passed + * to mkstemp/mktemps(3) + */ +static char basl_template[MAXPATHLEN]; +static char basl_stemplate[MAXPATHLEN]; + +/* + * State for dsdt_line(), dsdt_indent(), and dsdt_unindent(). + */ +static FILE *dsdt_fp; +static int dsdt_indent_level; +static int dsdt_error; + +struct basl_fio { + int fd; + FILE *fp; + char f_name[MAXPATHLEN]; +}; + +#define EFPRINTF(...) 
\ + if (fprintf(__VA_ARGS__) < 0) goto err_exit; + +#define EFFLUSH(x) \ + if (fflush(x) != 0) goto err_exit; + +static int +basl_fwrite_rsdp(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve RSDP template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0008]\t\tSignature : \"RSD PTR \"\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 43\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 02\n"); + EFPRINTF(fp, "[0004]\t\tRSDT Address : %08X\n", + basl_acpi_base + RSDT_OFFSET); + EFPRINTF(fp, "[0004]\t\tLength : 00000024\n"); + EFPRINTF(fp, "[0008]\t\tXSDT Address : 00000000%08X\n", + basl_acpi_base + XSDT_OFFSET); + EFPRINTF(fp, "[0001]\t\tExtended Checksum : 00\n"); + EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_rsdt(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve RSDT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"RSDT\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVRSDT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Add in pointers to the MADT, FADT and HPET */ + EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : %08X\n", + basl_acpi_base + MADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : %08X\n", + basl_acpi_base + FADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n", + basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n", + basl_acpi_base + MCFG_OFFSET); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_xsdt(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve XSDT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"XSDT\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVXSDT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Add in pointers to the MADT, FADT and HPET */ + EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : 00000000%08X\n", + basl_acpi_base + MADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : 00000000%08X\n", + basl_acpi_base + FADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n", + basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n", + basl_acpi_base + MCFG_OFFSET); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_madt(FILE *fp) +{ + int i; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve MADT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"APIC\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + 
EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMADT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tLocal Apic Address : FEE00000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\tPC-AT Compatibility : 1\n"); + EFPRINTF(fp, "\n"); + + /* Add a Processor Local APIC entry for each CPU */ + for (i = 0; i < basl_ncpu; i++) { + EFPRINTF(fp, "[0001]\t\tSubtable Type : 00\n"); + EFPRINTF(fp, "[0001]\t\tLength : 08\n"); + /* iasl expects hex values for the proc and apic id's */ + EFPRINTF(fp, "[0001]\t\tProcessor ID : %02x\n", i); + EFPRINTF(fp, "[0001]\t\tLocal Apic ID : %02x\n", i); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\tProcessor Enabled : 1\n"); +#ifdef __FreeBSD__ + EFPRINTF(fp, "\t\t\tRuntime Online Capable : 0\n"); +#else + /* + * Until iasl is updated to support the "Runtime Online + * Capable" entry, it must be omitted. This should be + * re-checked when illumos receives an acpica update. + */ +#endif /* __FreeBSD__ */ + EFPRINTF(fp, "\n"); + } + + /* Always a single IOAPIC entry, with ID 0 */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 01\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0C\n"); + /* iasl expects a hex value for the i/o apic id */ + EFPRINTF(fp, "[0001]\t\tI/O Apic ID : %02x\n", 0); + EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); + EFPRINTF(fp, "[0004]\t\tAddress : fec00000\n"); + EFPRINTF(fp, "[0004]\t\tInterrupt : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Legacy IRQ0 is connected to pin 2 of the IOAPIC */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); + EFPRINTF(fp, "[0001]\t\tBus : 00\n"); + EFPRINTF(fp, "[0001]\t\tSource : 00\n"); + EFPRINTF(fp, "[0004]\t\tInterrupt : 00000002\n"); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); + EFPRINTF(fp, "\t\t\tPolarity : 1\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); + EFPRINTF(fp, "[0001]\t\tBus : 00\n"); + EFPRINTF(fp, "[0001]\t\tSource : %02X\n", SCI_INT); + EFPRINTF(fp, "[0004]\t\tInterrupt : %08X\n", SCI_INT); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n"); + EFPRINTF(fp, "\t\t\tPolarity : 3\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 3\n"); + EFPRINTF(fp, "\n"); + + /* Local APIC NMI is connected to LINT 1 on all CPUs */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 04\n"); + EFPRINTF(fp, "[0001]\t\tLength : 06\n"); + EFPRINTF(fp, "[0001]\t\tProcessorId : FF\n"); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); + EFPRINTF(fp, "\t\t\tPolarity : 1\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); + EFPRINTF(fp, "[0001]\t\tInterrupt : 01\n"); + EFPRINTF(fp, "\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_fadt(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve FADT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"FACP\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 0000010C\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 05\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, 
"[0008]\t\tOem Table ID : \"BVFACP \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tFACS Address : %08X\n", + basl_acpi_base + FACS_OFFSET); + EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n", + basl_acpi_base + DSDT_OFFSET); + EFPRINTF(fp, "[0001]\t\tModel : 01\n"); + EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n"); + EFPRINTF(fp, "[0002]\t\tSCI Interrupt : %04X\n", + SCI_INT); + EFPRINTF(fp, "[0004]\t\tSMI Command Port : %08X\n", + SMI_CMD); + EFPRINTF(fp, "[0001]\t\tACPI Enable Value : %02X\n", + BHYVE_ACPI_ENABLE); + EFPRINTF(fp, "[0001]\t\tACPI Disable Value : %02X\n", + BHYVE_ACPI_DISABLE); + EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n"); + EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n"); + EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : %08X\n", + PM1A_EVT_ADDR); + EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : %08X\n", + PM1A_CNT_ADDR); + EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n", + IO_PMTMR); + EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n"); + EFPRINTF(fp, "[0001]\t\tPM1 Control Block Length : 02\n"); + EFPRINTF(fp, "[0001]\t\tPM2 Control Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tPM Timer Block Length : 04\n"); + EFPRINTF(fp, "[0001]\t\tGPE0 Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tGPE1 Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tGPE1 Base Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\t_CST Support : 00\n"); + EFPRINTF(fp, "[0002]\t\tC2 Latency : 0000\n"); + EFPRINTF(fp, "[0002]\t\tC3 Latency : 0000\n"); + EFPRINTF(fp, "[0002]\t\tCPU Cache Size : 0000\n"); + EFPRINTF(fp, "[0002]\t\tCache Flush Stride : 0000\n"); + EFPRINTF(fp, "[0001]\t\tDuty Cycle Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tDuty Cycle Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Day Alarm Index : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Month Alarm Index : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Century Index : 32\n"); + EFPRINTF(fp, "[0002]\t\tBoot Flags (decoded below) : 0000\n"); + EFPRINTF(fp, "\t\t\tLegacy Devices Supported (V2) : 0\n"); + EFPRINTF(fp, "\t\t\t8042 Present on ports 60/64 (V2) : 0\n"); + EFPRINTF(fp, "\t\t\tVGA Not Present (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tMSI Not Supported (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tPCIe ASPM Not Supported (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tCMOS RTC Not Present (V5) : 0\n"); + EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\tWBINVD instruction is operational (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n"); + EFPRINTF(fp, "\t\t\t32-bit PM Timer (V1) : 1\n"); + EFPRINTF(fp, 
"\t\t\tDocking Supported (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tReset Register Supported (V2) : 1\n"); + EFPRINTF(fp, "\t\t\tSealed Case (V3) : 0\n"); + EFPRINTF(fp, "\t\t\tHeadless - No Video (V3) : 1\n"); + EFPRINTF(fp, "\t\t\tUse native instr after SLP_TYPx (V3) : 0\n"); + EFPRINTF(fp, "\t\t\tPCIEXP_WAK Bits Supported (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse Platform Timer (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tRTC_STS valid on S4 wake (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tRemote Power-on capable (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse APIC Cluster Model (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse APIC Physical Destination Mode (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tHardware Reduced (V5) : 0\n"); + EFPRINTF(fp, "\t\t\tLow Power S0 Idle (V5) : 0\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tReset Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000CF9\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tValue to cause reset : 06\n"); + EFPRINTF(fp, "[0002]\t\tARM Flags (decoded below): 0000\n"); + EFPRINTF(fp, "\t\t\tPSCI Compliant : 0\n"); + EFPRINTF(fp, "\t\t\tMust use HVC for PSCI : 0\n"); + EFPRINTF(fp, "[0001]\t\tFADT Minor Revision : 01\n"); + EFPRINTF(fp, "[0008]\t\tFACS Address : 00000000%08X\n", + basl_acpi_base + FACS_OFFSET); + EFPRINTF(fp, "[0008]\t\tDSDT Address : 00000000%08X\n", + basl_acpi_base + DSDT_OFFSET); + EFPRINTF(fp, + "[0012]\t\tPM1A Event Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 20\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + PM1A_EVT_ADDR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1B Event Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1A Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 10\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + PM1A_CNT_ADDR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1B Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM2 Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); 
+ EFPRINTF(fp, "\n"); + + /* Valid for bhyve */ + EFPRINTF(fp, + "[0012]\t\tPM Timer Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 20\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + IO_PMTMR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0012]\t\tGPE1 Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tSleep Control Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tSleep Status Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_hpet(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve HPET template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"HPET\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVHPET \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tTimer Block ID : %08X\n", hpet_capabilities); + EFPRINTF(fp, + "[0012]\t\tTimer Block Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 00 [SystemMemory]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000FED00000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tHPET Number : 00\n"); + EFPRINTF(fp, "[0002]\t\tMinimum Clock Ticks : 0000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\t4K Page Protect : 1\n"); + EFPRINTF(fp, "\t\t\t64K Page Protect : 0\n"); + EFPRINTF(fp, "\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + 
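Each of the basl_fwrite_*() table writers above follows the same error-handling pattern: the EFPRINTF()/EFFLUSH() macros check every stdio call inline and jump to a shared err_exit label on the first failure, handing errno back to the caller, which acpi_build() then propagates. The following standalone sketch of that pattern is illustrative only; write_template() and its demo fields are hypothetical and not part of this patch:

#include <errno.h>
#include <stdio.h>

/* Check each formatted write; bail to err_exit on the first failure. */
#define EFPRINTF(...) \
	if (fprintf(__VA_ARGS__) < 0) goto err_exit;

#define EFFLUSH(x) \
	if (fflush(x) != 0) goto err_exit;

static int
write_template(FILE *fp)
{
	/* Demo fields in the iasl template style used above. */
	EFPRINTF(fp, "[0004]\t\tSignature : \"DEMO\"\n");
	EFPRINTF(fp, "[0001]\t\tRevision : %02X\n", 1);
	EFFLUSH(fp);

	return (0);

err_exit:
	/* First failed write lands here; report errno to the caller. */
	return (errno);
}

int
main(void)
{
	int err = write_template(stdout);

	if (err != 0)
		fprintf(stderr, "template write failed: %d\n", err);
	return (err != 0);
}

Compared with checking every fprintf() return by hand, the macros keep the long runs of template output readable while still stopping at, and propagating, the first I/O error.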
+static int +basl_fwrite_mcfg(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve MCFG template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "[0008]\t\tReserved : 0\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base()); + EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n"); + EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n"); + EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n"); + EFPRINTF(fp, "[0004]\t\tReserved : 0\n"); + EFFLUSH(fp); + return (0); +err_exit: + return (errno); +} + +static int +basl_fwrite_facs(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve FACS template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"FACS\"\n"); + EFPRINTF(fp, "[0004]\t\tLength : 00000040\n"); + EFPRINTF(fp, "[0004]\t\tHardware Signature : 00000000\n"); + EFPRINTF(fp, "[0004]\t\t32 Firmware Waking Vector : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tGlobal Lock : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\tS4BIOS Support Present : 0\n"); + EFPRINTF(fp, "\t\t\t64-bit Wake Supported (V2) : 0\n"); + EFPRINTF(fp, + "[0008]\t\t64 Firmware Waking Vector : 0000000000000000\n"); + EFPRINTF(fp, "[0001]\t\tVersion : 02\n"); + EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); + EFPRINTF(fp, "[0004]\t\tOspmFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\t64-bit Wake Env Required (V2) : 0\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +/* + * Helper routines for writing to the DSDT from other modules. + */ +void +dsdt_line(const char *fmt, ...) 
+{ + va_list ap; + + if (dsdt_error != 0) + return; + + if (strcmp(fmt, "") != 0) { + if (dsdt_indent_level != 0) + EFPRINTF(dsdt_fp, "%*c", dsdt_indent_level * 2, ' '); + va_start(ap, fmt); + if (vfprintf(dsdt_fp, fmt, ap) < 0) { + va_end(ap); + goto err_exit; + } + va_end(ap); + } + EFPRINTF(dsdt_fp, "\n"); + return; + +err_exit: + dsdt_error = errno; +} + +void +dsdt_indent(int levels) +{ + + dsdt_indent_level += levels; + assert(dsdt_indent_level >= 0); +} + +void +dsdt_unindent(int levels) +{ + + assert(dsdt_indent_level >= levels); + dsdt_indent_level -= levels; +} + +void +dsdt_fixed_ioport(uint16_t iobase, uint16_t length) +{ + + dsdt_line("IO (Decode16,"); + dsdt_line(" 0x%04X, // Range Minimum", iobase); + dsdt_line(" 0x%04X, // Range Maximum", iobase); + dsdt_line(" 0x01, // Alignment"); + dsdt_line(" 0x%02X, // Length", length); + dsdt_line(" )"); +} + +void +dsdt_fixed_irq(uint8_t irq) +{ + + dsdt_line("IRQNoFlags ()"); + dsdt_line(" {%d}", irq); +} + +void +dsdt_fixed_mem32(uint32_t base, uint32_t length) +{ + + dsdt_line("Memory32Fixed (ReadWrite,"); + dsdt_line(" 0x%08X, // Address Base", base); + dsdt_line(" 0x%08X, // Address Length", length); + dsdt_line(" )"); +} + +static int +basl_fwrite_dsdt(FILE *fp) +{ + dsdt_fp = fp; + dsdt_error = 0; + dsdt_indent_level = 0; + + dsdt_line("/*"); + dsdt_line(" * bhyve DSDT template"); + dsdt_line(" */"); + dsdt_line("DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2," + "\"BHYVE \", \"BVDSDT \", 0x00000001)"); + dsdt_line("{"); + dsdt_line(" Name (_S5, Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x05,"); + dsdt_line(" Zero,"); + dsdt_line(" })"); + + pci_write_dsdt(); + + dsdt_line(""); + dsdt_line(" Scope (_SB.PC00)"); + dsdt_line(" {"); + dsdt_line(" Device (HPET)"); + dsdt_line(" {"); + dsdt_line(" Name (_HID, EISAID(\"PNP0103\"))"); + dsdt_line(" Name (_UID, 0)"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(4); + dsdt_fixed_mem32(0xFED00000, 0x400); + dsdt_unindent(4); + dsdt_line(" })"); + dsdt_line(" }"); + dsdt_line(" }"); + dsdt_line("}"); + + if (dsdt_error != 0) + return (dsdt_error); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_open(struct basl_fio *bf, int suffix) +{ + int err; + + err = 0; + + if (suffix) { + strlcpy(bf->f_name, basl_stemplate, MAXPATHLEN); + bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX)); + } else { + strlcpy(bf->f_name, basl_template, MAXPATHLEN); + bf->fd = mkstemp(bf->f_name); + } + + if (bf->fd > 0) { + bf->fp = fdopen(bf->fd, "w+"); + if (bf->fp == NULL) { + unlink(bf->f_name); + close(bf->fd); + } + } else { + err = 1; + } + + return (err); +} + +static void +basl_close(struct basl_fio *bf) +{ + + if (!basl_keep_temps) + unlink(bf->f_name); + fclose(bf->fp); +} + +static int +basl_start(struct basl_fio *in, struct basl_fio *out) +{ + int err; + + err = basl_open(in, 0); + if (!err) { + err = basl_open(out, 1); + if (err) { + basl_close(in); + } + } + + return (err); +} + +static void +basl_end(struct basl_fio *in, struct basl_fio *out) +{ + + basl_close(in); + basl_close(out); +} + +static int +basl_load(struct vmctx *ctx, int fd, uint64_t off) +{ + struct stat sb; + void *gaddr; + + if (fstat(fd, &sb) < 0) + return (errno); + + gaddr = paddr_guest2host(ctx, basl_acpi_base + off, sb.st_size); + if (gaddr == NULL) + return (EFAULT); + + if (read(fd, gaddr, sb.st_size) < 0) + return (errno); + + return (0); +} + +static int +basl_compile(struct vmctx *ctx, int (*fwrite_section)(FILE *), uint64_t offset) +{ 
+	struct basl_fio io[2];
+	static char iaslbuf[3*MAXPATHLEN + 10];
+	char *fmt;
+	int err;
+
+	err = basl_start(&io[0], &io[1]);
+	if (!err) {
+		err = (*fwrite_section)(io[0].fp);
+
+		if (!err) {
+			/*
+			 * iasl sends the results of the compilation to
+			 * stdout. Shut this down by using the shell to
+			 * redirect stdout to /dev/null, unless the user
+			 * has requested verbose output for debugging
+			 * purposes
+			 */
+			fmt = basl_verbose_iasl ?
+				"%s -p %s %s" :
+				"/bin/sh -c \"%s -p %s %s\" 1> /dev/null";
+
+			snprintf(iaslbuf, sizeof(iaslbuf),
+				 fmt,
+				 BHYVE_ASL_COMPILER,
+				 io[1].f_name, io[0].f_name);
+			err = system(iaslbuf);
+
+			if (!err) {
+				/*
+				 * Copy the aml output file into guest
+				 * memory at the specified location
+				 */
+				err = basl_load(ctx, io[1].fd, offset);
+			}
+		}
+		basl_end(&io[0], &io[1]);
+	}
+
+	return (err);
+}
+
+static int
+basl_make_templates(void)
+{
+	const char *tmpdir;
+	int err;
+	int len;
+
+	err = 0;
+
+	/*
+	 * Locate a writable directory for the template files: prefer
+	 * BHYVE_TMPDIR, then TMPDIR, then fall back to _PATH_TMP.
+	 */
+	if (((tmpdir = getenv("BHYVE_TMPDIR")) == NULL || *tmpdir == '\0') &&
+	    ((tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0')) {
+		tmpdir = _PATH_TMP;
+	}
+
+	len = strlen(tmpdir);
+
+	if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) {
+		strcpy(basl_template, tmpdir);
+		while (len > 0 && basl_template[len - 1] == '/')
+			len--;
+		basl_template[len] = '/';
+		strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE);
+	} else
+		err = E2BIG;
+
+	if (!err) {
+		/*
+		 * len has been initialized (and maybe adjusted) above
+		 */
+		if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 +
+		    sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) {
+			strcpy(basl_stemplate, tmpdir);
+			basl_stemplate[len] = '/';
+			strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE);
+			len = strlen(basl_stemplate);
+			strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX);
+		} else
+			err = E2BIG;
+	}
+
+	return (err);
+}
+
+static struct {
+	int	(*wsect)(FILE *fp);
+	uint64_t  offset;
+} basl_ftables[] =
+{
+	{ basl_fwrite_rsdp, 0},
+	{ basl_fwrite_rsdt, RSDT_OFFSET },
+	{ basl_fwrite_xsdt, XSDT_OFFSET },
+	{ basl_fwrite_madt, MADT_OFFSET },
+	{ basl_fwrite_fadt, FADT_OFFSET },
+	{ basl_fwrite_hpet, HPET_OFFSET },
+	{ basl_fwrite_mcfg, MCFG_OFFSET },
+	{ basl_fwrite_facs, FACS_OFFSET },
+	{ basl_fwrite_dsdt, DSDT_OFFSET },
+	{ NULL }
+};
+
+int
+acpi_build(struct vmctx *ctx, int ncpu)
+{
+	int err;
+	int i;
+
+	basl_ncpu = ncpu;
+
+	err = vm_get_hpet_capabilities(ctx, &hpet_capabilities);
+	if (err != 0)
+		return (err);
+
+	/*
+	 * For debug, allow the user to have iasl compiler output sent
+	 * to stdout rather than /dev/null
+	 */
+	if (getenv("BHYVE_ACPI_VERBOSE_IASL"))
+		basl_verbose_iasl = 1;
+
+	/*
+	 * Allow the user to keep the generated ASL files for debugging
+	 * instead of deleting them following use
+	 */
+	if (getenv("BHYVE_ACPI_KEEPTMPS"))
+		basl_keep_temps = 1;
+
+	i = 0;
+	err = basl_make_templates();
+
+	/*
+	 * Run through all the ASL files, compiling them and
+	 * copying them into guest memory
+	 */
+	while (!err && basl_ftables[i].wsect != NULL) {
+		err = basl_compile(ctx, basl_ftables[i].wsect,
+		    basl_ftables[i].offset);
+		i++;
+	}
+
+	return (err);
+}
diff --git a/usr/src/cmd/bhyve/acpi.h b/usr/src/cmd/bhyve/acpi.h
index 477f827286..4c6d86d091 100644
--- a/usr/src/cmd/bhyve/acpi.h
+++ b/usr/src/cmd/bhyve/acpi.h
@@ -1,4 +1,6 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
  * Copyright (c) 2012 NetApp, Inc.
  * All rights reserved.
  *
@@ -23,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
* - * $FreeBSD: head/usr.sbin/bhyve/acpi.h 266125 2014-05-15 14:16:55Z jhb $ + * $FreeBSD$ */ #ifndef _ACPI_H_ diff --git a/usr/src/cmd/bhyve/ahci.h b/usr/src/cmd/bhyve/ahci.h index 1cf09adcbf..691d4bd438 100644 --- a/usr/src/cmd/bhyve/ahci.h +++ b/usr/src/cmd/bhyve/ahci.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1998 - 2008 Søren Schmidt * Copyright (c) 2009-2012 Alexander Motin * All rights reserved. @@ -24,281 +26,299 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/ahci.h 256056 2013-10-04 18:31:38Z grehan $ + * $FreeBSD$ */ #ifndef _AHCI_H_ #define _AHCI_H_ /* ATA register defines */ -#define ATA_DATA 0 /* (RW) data */ - -#define ATA_FEATURE 1 /* (W) feature */ -#define ATA_F_DMA 0x01 /* enable DMA */ -#define ATA_F_OVL 0x02 /* enable overlap */ - -#define ATA_COUNT 2 /* (W) sector count */ - -#define ATA_SECTOR 3 /* (RW) sector # */ -#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */ -#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */ -#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */ -#define ATA_D_LBA 0x40 /* use LBA addressing */ -#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */ - -#define ATA_COMMAND 7 /* (W) command */ - -#define ATA_ERROR 8 /* (R) error */ -#define ATA_E_ILI 0x01 /* illegal length */ -#define ATA_E_NM 0x02 /* no media */ -#define ATA_E_ABORT 0x04 /* command aborted */ -#define ATA_E_MCR 0x08 /* media change request */ -#define ATA_E_IDNF 0x10 /* ID not found */ -#define ATA_E_MC 0x20 /* media changed */ -#define ATA_E_UNC 0x40 /* uncorrectable data */ -#define ATA_E_ICRC 0x80 /* UDMA crc error */ -#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */ - -#define ATA_IREASON 9 /* (R) interrupt reason */ -#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */ -#define ATA_I_IN 0x02 /* read (1) | write (0) */ -#define ATA_I_RELEASE 0x04 /* released bus (1) */ -#define ATA_I_TAGMASK 0xf8 /* tag mask */ - -#define ATA_STATUS 10 /* (R) status */ -#define ATA_ALTSTAT 11 /* (R) alternate status */ -#define ATA_S_ERROR 0x01 /* error */ -#define ATA_S_INDEX 0x02 /* index */ -#define ATA_S_CORR 0x04 /* data corrected */ -#define ATA_S_DRQ 0x08 /* data request */ -#define ATA_S_DSC 0x10 /* drive seek completed */ -#define ATA_S_SERVICE 0x10 /* drive needs service */ -#define ATA_S_DWF 0x20 /* drive write fault */ -#define ATA_S_DMA 0x20 /* DMA ready */ -#define ATA_S_READY 0x40 /* drive ready */ -#define ATA_S_BUSY 0x80 /* busy */ - -#define ATA_CONTROL 12 /* (W) control */ -#define ATA_A_IDS 0x02 /* disable interrupts */ -#define ATA_A_RESET 0x04 /* RESET controller */ -#define ATA_A_4BIT 0x08 /* 4 head bits */ -#define ATA_A_HOB 0x80 /* High Order Byte enable */ +#define ATA_DATA 0 /* (RW) data */ + +#define ATA_FEATURE 1 /* (W) feature */ +#define ATA_F_DMA 0x01 /* enable DMA */ +#define ATA_F_OVL 0x02 /* enable overlap */ + +#define ATA_COUNT 2 /* (W) sector count */ + +#define ATA_SECTOR 3 /* (RW) sector # */ +#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */ +#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */ +#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */ +#define ATA_D_LBA 0x40 /* use LBA addressing */ +#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */ + +#define ATA_COMMAND 7 /* (W) command */ + +#define ATA_ERROR 8 /* (R) error */ +#define ATA_E_ILI 0x01 /* illegal length */ +#define ATA_E_NM 0x02 /* no media */ +#define ATA_E_ABORT 0x04 /* command aborted */ +#define ATA_E_MCR 0x08 /* media 
change request */ +#define ATA_E_IDNF 0x10 /* ID not found */ +#define ATA_E_MC 0x20 /* media changed */ +#define ATA_E_UNC 0x40 /* uncorrectable data */ +#define ATA_E_ICRC 0x80 /* UDMA crc error */ +#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */ + +#define ATA_IREASON 9 /* (R) interrupt reason */ +#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */ +#define ATA_I_IN 0x02 /* read (1) | write (0) */ +#define ATA_I_RELEASE 0x04 /* released bus (1) */ +#define ATA_I_TAGMASK 0xf8 /* tag mask */ + +#define ATA_STATUS 10 /* (R) status */ +#define ATA_ALTSTAT 11 /* (R) alternate status */ +#define ATA_S_ERROR 0x01 /* error */ +#define ATA_S_INDEX 0x02 /* index */ +#define ATA_S_CORR 0x04 /* data corrected */ +#define ATA_S_DRQ 0x08 /* data request */ +#define ATA_S_DSC 0x10 /* drive seek completed */ +#define ATA_S_SERVICE 0x10 /* drive needs service */ +#define ATA_S_DWF 0x20 /* drive write fault */ +#define ATA_S_DMA 0x20 /* DMA ready */ +#define ATA_S_READY 0x40 /* drive ready */ +#define ATA_S_BUSY 0x80 /* busy */ + +#define ATA_CONTROL 12 /* (W) control */ +#define ATA_A_IDS 0x02 /* disable interrupts */ +#define ATA_A_RESET 0x04 /* RESET controller */ +#define ATA_A_4BIT 0x08 /* 4 head bits */ +#define ATA_A_HOB 0x80 /* High Order Byte enable */ /* SATA register defines */ -#define ATA_SSTATUS 13 -#define ATA_SS_DET_MASK 0x0000000f -#define ATA_SS_DET_NO_DEVICE 0x00000000 -#define ATA_SS_DET_DEV_PRESENT 0x00000001 -#define ATA_SS_DET_PHY_ONLINE 0x00000003 -#define ATA_SS_DET_PHY_OFFLINE 0x00000004 - -#define ATA_SS_SPD_MASK 0x000000f0 -#define ATA_SS_SPD_NO_SPEED 0x00000000 -#define ATA_SS_SPD_GEN1 0x00000010 -#define ATA_SS_SPD_GEN2 0x00000020 -#define ATA_SS_SPD_GEN3 0x00000040 - -#define ATA_SS_IPM_MASK 0x00000f00 -#define ATA_SS_IPM_NO_DEVICE 0x00000000 -#define ATA_SS_IPM_ACTIVE 0x00000100 -#define ATA_SS_IPM_PARTIAL 0x00000200 -#define ATA_SS_IPM_SLUMBER 0x00000600 - -#define ATA_SERROR 14 -#define ATA_SE_DATA_CORRECTED 0x00000001 -#define ATA_SE_COMM_CORRECTED 0x00000002 -#define ATA_SE_DATA_ERR 0x00000100 -#define ATA_SE_COMM_ERR 0x00000200 -#define ATA_SE_PROT_ERR 0x00000400 -#define ATA_SE_HOST_ERR 0x00000800 -#define ATA_SE_PHY_CHANGED 0x00010000 -#define ATA_SE_PHY_IERROR 0x00020000 -#define ATA_SE_COMM_WAKE 0x00040000 -#define ATA_SE_DECODE_ERR 0x00080000 -#define ATA_SE_PARITY_ERR 0x00100000 -#define ATA_SE_CRC_ERR 0x00200000 -#define ATA_SE_HANDSHAKE_ERR 0x00400000 -#define ATA_SE_LINKSEQ_ERR 0x00800000 -#define ATA_SE_TRANSPORT_ERR 0x01000000 -#define ATA_SE_UNKNOWN_FIS 0x02000000 -#define ATA_SE_EXCHANGED 0x04000000 - -#define ATA_SCONTROL 15 -#define ATA_SC_DET_MASK 0x0000000f -#define ATA_SC_DET_IDLE 0x00000000 -#define ATA_SC_DET_RESET 0x00000001 -#define ATA_SC_DET_DISABLE 0x00000004 - -#define ATA_SC_SPD_MASK 0x000000f0 -#define ATA_SC_SPD_NO_SPEED 0x00000000 -#define ATA_SC_SPD_SPEED_GEN1 0x00000010 -#define ATA_SC_SPD_SPEED_GEN2 0x00000020 -#define ATA_SC_SPD_SPEED_GEN3 0x00000040 - -#define ATA_SC_IPM_MASK 0x00000f00 -#define ATA_SC_IPM_NONE 0x00000000 -#define ATA_SC_IPM_DIS_PARTIAL 0x00000100 -#define ATA_SC_IPM_DIS_SLUMBER 0x00000200 - -#define ATA_SACTIVE 16 - -#define AHCI_MAX_PORTS 32 -#define AHCI_MAX_SLOTS 32 +#define ATA_SSTATUS 13 +#define ATA_SS_DET_MASK 0x0000000f +#define ATA_SS_DET_NO_DEVICE 0x00000000 +#define ATA_SS_DET_DEV_PRESENT 0x00000001 +#define ATA_SS_DET_PHY_ONLINE 0x00000003 +#define ATA_SS_DET_PHY_OFFLINE 0x00000004 + +#define ATA_SS_SPD_MASK 0x000000f0 +#define ATA_SS_SPD_NO_SPEED 0x00000000 +#define ATA_SS_SPD_GEN1 0x00000010 
+#define ATA_SS_SPD_GEN2 0x00000020 +#define ATA_SS_SPD_GEN3 0x00000030 + +#define ATA_SS_IPM_MASK 0x00000f00 +#define ATA_SS_IPM_NO_DEVICE 0x00000000 +#define ATA_SS_IPM_ACTIVE 0x00000100 +#define ATA_SS_IPM_PARTIAL 0x00000200 +#define ATA_SS_IPM_SLUMBER 0x00000600 +#define ATA_SS_IPM_DEVSLEEP 0x00000800 + +#define ATA_SERROR 14 +#define ATA_SE_DATA_CORRECTED 0x00000001 +#define ATA_SE_COMM_CORRECTED 0x00000002 +#define ATA_SE_DATA_ERR 0x00000100 +#define ATA_SE_COMM_ERR 0x00000200 +#define ATA_SE_PROT_ERR 0x00000400 +#define ATA_SE_HOST_ERR 0x00000800 +#define ATA_SE_PHY_CHANGED 0x00010000 +#define ATA_SE_PHY_IERROR 0x00020000 +#define ATA_SE_COMM_WAKE 0x00040000 +#define ATA_SE_DECODE_ERR 0x00080000 +#define ATA_SE_PARITY_ERR 0x00100000 +#define ATA_SE_CRC_ERR 0x00200000 +#define ATA_SE_HANDSHAKE_ERR 0x00400000 +#define ATA_SE_LINKSEQ_ERR 0x00800000 +#define ATA_SE_TRANSPORT_ERR 0x01000000 +#define ATA_SE_UNKNOWN_FIS 0x02000000 +#define ATA_SE_EXCHANGED 0x04000000 + +#define ATA_SCONTROL 15 +#define ATA_SC_DET_MASK 0x0000000f +#define ATA_SC_DET_IDLE 0x00000000 +#define ATA_SC_DET_RESET 0x00000001 +#define ATA_SC_DET_DISABLE 0x00000004 + +#define ATA_SC_SPD_MASK 0x000000f0 +#define ATA_SC_SPD_NO_SPEED 0x00000000 +#define ATA_SC_SPD_SPEED_GEN1 0x00000010 +#define ATA_SC_SPD_SPEED_GEN2 0x00000020 +#define ATA_SC_SPD_SPEED_GEN3 0x00000030 + +#define ATA_SC_IPM_MASK 0x00000f00 +#define ATA_SC_IPM_NONE 0x00000000 +#define ATA_SC_IPM_DIS_PARTIAL 0x00000100 +#define ATA_SC_IPM_DIS_SLUMBER 0x00000200 +#define ATA_SC_IPM_DIS_DEVSLEEP 0x00000400 + +#define ATA_SACTIVE 16 + +#define AHCI_MAX_PORTS 32 +#define AHCI_MAX_SLOTS 32 +#define AHCI_MAX_IRQS 16 /* SATA AHCI v1.0 register defines */ -#define AHCI_CAP 0x00 -#define AHCI_CAP_NPMASK 0x0000001f -#define AHCI_CAP_SXS 0x00000020 -#define AHCI_CAP_EMS 0x00000040 -#define AHCI_CAP_CCCS 0x00000080 -#define AHCI_CAP_NCS 0x00001F00 -#define AHCI_CAP_NCS_SHIFT 8 -#define AHCI_CAP_PSC 0x00002000 -#define AHCI_CAP_SSC 0x00004000 -#define AHCI_CAP_PMD 0x00008000 -#define AHCI_CAP_FBSS 0x00010000 -#define AHCI_CAP_SPM 0x00020000 -#define AHCI_CAP_SAM 0x00080000 -#define AHCI_CAP_ISS 0x00F00000 -#define AHCI_CAP_ISS_SHIFT 20 -#define AHCI_CAP_SCLO 0x01000000 -#define AHCI_CAP_SAL 0x02000000 -#define AHCI_CAP_SALP 0x04000000 -#define AHCI_CAP_SSS 0x08000000 -#define AHCI_CAP_SMPS 0x10000000 -#define AHCI_CAP_SSNTF 0x20000000 -#define AHCI_CAP_SNCQ 0x40000000 -#define AHCI_CAP_64BIT 0x80000000 - -#define AHCI_GHC 0x04 -#define AHCI_GHC_AE 0x80000000 -#define AHCI_GHC_MRSM 0x00000004 -#define AHCI_GHC_IE 0x00000002 -#define AHCI_GHC_HR 0x00000001 - -#define AHCI_IS 0x08 -#define AHCI_PI 0x0c -#define AHCI_VS 0x10 - -#define AHCI_CCCC 0x14 -#define AHCI_CCCC_TV_MASK 0xffff0000 -#define AHCI_CCCC_TV_SHIFT 16 -#define AHCI_CCCC_CC_MASK 0x0000ff00 -#define AHCI_CCCC_CC_SHIFT 8 -#define AHCI_CCCC_INT_MASK 0x000000f8 -#define AHCI_CCCC_INT_SHIFT 3 -#define AHCI_CCCC_EN 0x00000001 -#define AHCI_CCCP 0x18 - -#define AHCI_EM_LOC 0x1C -#define AHCI_EM_CTL 0x20 -#define AHCI_EM_MR 0x00000001 -#define AHCI_EM_TM 0x00000100 -#define AHCI_EM_RST 0x00000200 -#define AHCI_EM_LED 0x00010000 -#define AHCI_EM_SAFTE 0x00020000 -#define AHCI_EM_SES2 0x00040000 -#define AHCI_EM_SGPIO 0x00080000 -#define AHCI_EM_SMB 0x01000000 -#define AHCI_EM_XMT 0x02000000 -#define AHCI_EM_ALHD 0x04000000 -#define AHCI_EM_PM 0x08000000 - -#define AHCI_CAP2 0x24 -#define AHCI_CAP2_BOH 0x00000001 -#define AHCI_CAP2_NVMP 0x00000002 -#define AHCI_CAP2_APST 0x00000004 - -#define AHCI_OFFSET 0x100 
-#define AHCI_STEP 0x80 - -#define AHCI_P_CLB 0x00 -#define AHCI_P_CLBU 0x04 -#define AHCI_P_FB 0x08 -#define AHCI_P_FBU 0x0c -#define AHCI_P_IS 0x10 -#define AHCI_P_IE 0x14 -#define AHCI_P_IX_DHR 0x00000001 -#define AHCI_P_IX_PS 0x00000002 -#define AHCI_P_IX_DS 0x00000004 -#define AHCI_P_IX_SDB 0x00000008 -#define AHCI_P_IX_UF 0x00000010 -#define AHCI_P_IX_DP 0x00000020 -#define AHCI_P_IX_PC 0x00000040 -#define AHCI_P_IX_MP 0x00000080 - -#define AHCI_P_IX_PRC 0x00400000 -#define AHCI_P_IX_IPM 0x00800000 -#define AHCI_P_IX_OF 0x01000000 -#define AHCI_P_IX_INF 0x04000000 -#define AHCI_P_IX_IF 0x08000000 -#define AHCI_P_IX_HBD 0x10000000 -#define AHCI_P_IX_HBF 0x20000000 -#define AHCI_P_IX_TFE 0x40000000 -#define AHCI_P_IX_CPD 0x80000000 - -#define AHCI_P_CMD 0x18 -#define AHCI_P_CMD_ST 0x00000001 -#define AHCI_P_CMD_SUD 0x00000002 -#define AHCI_P_CMD_POD 0x00000004 -#define AHCI_P_CMD_CLO 0x00000008 -#define AHCI_P_CMD_FRE 0x00000010 -#define AHCI_P_CMD_CCS_MASK 0x00001f00 -#define AHCI_P_CMD_CCS_SHIFT 8 -#define AHCI_P_CMD_ISS 0x00002000 -#define AHCI_P_CMD_FR 0x00004000 -#define AHCI_P_CMD_CR 0x00008000 -#define AHCI_P_CMD_CPS 0x00010000 -#define AHCI_P_CMD_PMA 0x00020000 -#define AHCI_P_CMD_HPCP 0x00040000 -#define AHCI_P_CMD_MPSP 0x00080000 -#define AHCI_P_CMD_CPD 0x00100000 -#define AHCI_P_CMD_ESP 0x00200000 -#define AHCI_P_CMD_FBSCP 0x00400000 -#define AHCI_P_CMD_APSTE 0x00800000 -#define AHCI_P_CMD_ATAPI 0x01000000 -#define AHCI_P_CMD_DLAE 0x02000000 -#define AHCI_P_CMD_ALPE 0x04000000 -#define AHCI_P_CMD_ASP 0x08000000 -#define AHCI_P_CMD_ICC_MASK 0xf0000000 -#define AHCI_P_CMD_NOOP 0x00000000 -#define AHCI_P_CMD_ACTIVE 0x10000000 -#define AHCI_P_CMD_PARTIAL 0x20000000 -#define AHCI_P_CMD_SLUMBER 0x60000000 - -#define AHCI_P_TFD 0x20 -#define AHCI_P_SIG 0x24 -#define AHCI_P_SSTS 0x28 -#define AHCI_P_SCTL 0x2c -#define AHCI_P_SERR 0x30 -#define AHCI_P_SACT 0x34 -#define AHCI_P_CI 0x38 -#define AHCI_P_SNTF 0x3C -#define AHCI_P_FBS 0x40 -#define AHCI_P_FBS_EN 0x00000001 -#define AHCI_P_FBS_DEC 0x00000002 -#define AHCI_P_FBS_SDE 0x00000004 -#define AHCI_P_FBS_DEV 0x00000f00 -#define AHCI_P_FBS_DEV_SHIFT 8 -#define AHCI_P_FBS_ADO 0x0000f000 -#define AHCI_P_FBS_ADO_SHIFT 12 -#define AHCI_P_FBS_DWE 0x000f0000 -#define AHCI_P_FBS_DWE_SHIFT 16 +#define AHCI_CAP 0x00 +#define AHCI_CAP_NPMASK 0x0000001f +#define AHCI_CAP_SXS 0x00000020 +#define AHCI_CAP_EMS 0x00000040 +#define AHCI_CAP_CCCS 0x00000080 +#define AHCI_CAP_NCS 0x00001F00 +#define AHCI_CAP_NCS_SHIFT 8 +#define AHCI_CAP_PSC 0x00002000 +#define AHCI_CAP_SSC 0x00004000 +#define AHCI_CAP_PMD 0x00008000 +#define AHCI_CAP_FBSS 0x00010000 +#define AHCI_CAP_SPM 0x00020000 +#define AHCI_CAP_SAM 0x00080000 +#define AHCI_CAP_ISS 0x00F00000 +#define AHCI_CAP_ISS_SHIFT 20 +#define AHCI_CAP_SCLO 0x01000000 +#define AHCI_CAP_SAL 0x02000000 +#define AHCI_CAP_SALP 0x04000000 +#define AHCI_CAP_SSS 0x08000000 +#define AHCI_CAP_SMPS 0x10000000 +#define AHCI_CAP_SSNTF 0x20000000 +#define AHCI_CAP_SNCQ 0x40000000 +#define AHCI_CAP_64BIT 0x80000000 + +#define AHCI_GHC 0x04 +#define AHCI_GHC_AE 0x80000000 +#define AHCI_GHC_MRSM 0x00000004 +#define AHCI_GHC_IE 0x00000002 +#define AHCI_GHC_HR 0x00000001 + +#define AHCI_IS 0x08 +#define AHCI_PI 0x0c +#define AHCI_VS 0x10 + +#define AHCI_CCCC 0x14 +#define AHCI_CCCC_TV_MASK 0xffff0000 +#define AHCI_CCCC_TV_SHIFT 16 +#define AHCI_CCCC_CC_MASK 0x0000ff00 +#define AHCI_CCCC_CC_SHIFT 8 +#define AHCI_CCCC_INT_MASK 0x000000f8 +#define AHCI_CCCC_INT_SHIFT 3 +#define AHCI_CCCC_EN 0x00000001 +#define AHCI_CCCP 0x18 + 
+#define AHCI_EM_LOC 0x1C +#define AHCI_EM_CTL 0x20 +#define AHCI_EM_MR 0x00000001 +#define AHCI_EM_TM 0x00000100 +#define AHCI_EM_RST 0x00000200 +#define AHCI_EM_LED 0x00010000 +#define AHCI_EM_SAFTE 0x00020000 +#define AHCI_EM_SES2 0x00040000 +#define AHCI_EM_SGPIO 0x00080000 +#define AHCI_EM_SMB 0x01000000 +#define AHCI_EM_XMT 0x02000000 +#define AHCI_EM_ALHD 0x04000000 +#define AHCI_EM_PM 0x08000000 + +#define AHCI_CAP2 0x24 +#define AHCI_CAP2_BOH 0x00000001 +#define AHCI_CAP2_NVMP 0x00000002 +#define AHCI_CAP2_APST 0x00000004 +#define AHCI_CAP2_SDS 0x00000008 +#define AHCI_CAP2_SADM 0x00000010 +#define AHCI_CAP2_DESO 0x00000020 + +#define AHCI_OFFSET 0x100 +#define AHCI_STEP 0x80 + +#define AHCI_P_CLB 0x00 +#define AHCI_P_CLBU 0x04 +#define AHCI_P_FB 0x08 +#define AHCI_P_FBU 0x0c +#define AHCI_P_IS 0x10 +#define AHCI_P_IE 0x14 +#define AHCI_P_IX_DHR 0x00000001 +#define AHCI_P_IX_PS 0x00000002 +#define AHCI_P_IX_DS 0x00000004 +#define AHCI_P_IX_SDB 0x00000008 +#define AHCI_P_IX_UF 0x00000010 +#define AHCI_P_IX_DP 0x00000020 +#define AHCI_P_IX_PC 0x00000040 +#define AHCI_P_IX_MP 0x00000080 + +#define AHCI_P_IX_PRC 0x00400000 +#define AHCI_P_IX_IPM 0x00800000 +#define AHCI_P_IX_OF 0x01000000 +#define AHCI_P_IX_INF 0x04000000 +#define AHCI_P_IX_IF 0x08000000 +#define AHCI_P_IX_HBD 0x10000000 +#define AHCI_P_IX_HBF 0x20000000 +#define AHCI_P_IX_TFE 0x40000000 +#define AHCI_P_IX_CPD 0x80000000 + +#define AHCI_P_CMD 0x18 +#define AHCI_P_CMD_ST 0x00000001 +#define AHCI_P_CMD_SUD 0x00000002 +#define AHCI_P_CMD_POD 0x00000004 +#define AHCI_P_CMD_CLO 0x00000008 +#define AHCI_P_CMD_FRE 0x00000010 +#define AHCI_P_CMD_CCS_MASK 0x00001f00 +#define AHCI_P_CMD_CCS_SHIFT 8 +#define AHCI_P_CMD_ISS 0x00002000 +#define AHCI_P_CMD_FR 0x00004000 +#define AHCI_P_CMD_CR 0x00008000 +#define AHCI_P_CMD_CPS 0x00010000 +#define AHCI_P_CMD_PMA 0x00020000 +#define AHCI_P_CMD_HPCP 0x00040000 +#define AHCI_P_CMD_MPSP 0x00080000 +#define AHCI_P_CMD_CPD 0x00100000 +#define AHCI_P_CMD_ESP 0x00200000 +#define AHCI_P_CMD_FBSCP 0x00400000 +#define AHCI_P_CMD_APSTE 0x00800000 +#define AHCI_P_CMD_ATAPI 0x01000000 +#define AHCI_P_CMD_DLAE 0x02000000 +#define AHCI_P_CMD_ALPE 0x04000000 +#define AHCI_P_CMD_ASP 0x08000000 +#define AHCI_P_CMD_ICC_MASK 0xf0000000 +#define AHCI_P_CMD_NOOP 0x00000000 +#define AHCI_P_CMD_ACTIVE 0x10000000 +#define AHCI_P_CMD_PARTIAL 0x20000000 +#define AHCI_P_CMD_SLUMBER 0x60000000 +#define AHCI_P_CMD_DEVSLEEP 0x80000000 + +#define AHCI_P_TFD 0x20 +#define AHCI_P_SIG 0x24 +#define AHCI_P_SSTS 0x28 +#define AHCI_P_SCTL 0x2c +#define AHCI_P_SERR 0x30 +#define AHCI_P_SACT 0x34 +#define AHCI_P_CI 0x38 +#define AHCI_P_SNTF 0x3C +#define AHCI_P_FBS 0x40 +#define AHCI_P_FBS_EN 0x00000001 +#define AHCI_P_FBS_DEC 0x00000002 +#define AHCI_P_FBS_SDE 0x00000004 +#define AHCI_P_FBS_DEV 0x00000f00 +#define AHCI_P_FBS_DEV_SHIFT 8 +#define AHCI_P_FBS_ADO 0x0000f000 +#define AHCI_P_FBS_ADO_SHIFT 12 +#define AHCI_P_FBS_DWE 0x000f0000 +#define AHCI_P_FBS_DWE_SHIFT 16 +#define AHCI_P_DEVSLP 0x44 +#define AHCI_P_DEVSLP_ADSE 0x00000001 +#define AHCI_P_DEVSLP_DSP 0x00000002 +#define AHCI_P_DEVSLP_DETO 0x000003fc +#define AHCI_P_DEVSLP_DETO_SHIFT 2 +#define AHCI_P_DEVSLP_MDAT 0x00007c00 +#define AHCI_P_DEVSLP_MDAT_SHIFT 10 +#define AHCI_P_DEVSLP_DITO 0x01ff8000 +#define AHCI_P_DEVSLP_DITO_SHIFT 15 +#define AHCI_P_DEVSLP_DM 0x0e000000 +#define AHCI_P_DEVSLP_DM_SHIFT 25 /* Just to be sure, if building as module. 
*/ #if MAXPHYS < 512 * 1024 #undef MAXPHYS -#define MAXPHYS 512 * 1024 +#define MAXPHYS 512 * 1024 #endif /* Pessimistic prognosis on number of required S/G entries */ -#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8)) +#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8)) /* Command list. 32 commands. First, 1Kbyte aligned. */ -#define AHCI_CL_OFFSET 0 -#define AHCI_CL_SIZE 32 +#define AHCI_CL_OFFSET 0 +#define AHCI_CL_SIZE 32 /* Command tables. Up to 32 commands, Each, 128byte aligned. */ -#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS) -#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16) +#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS) +#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16) /* Total main work area. */ -#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots) +#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots) #endif /* _AHCI_H_ */ diff --git a/usr/src/cmd/bhyve/amd64/Makefile b/usr/src/cmd/bhyve/amd64/Makefile deleted file mode 100644 index 13cdae6663..0000000000 --- a/usr/src/cmd/bhyve/amd64/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2015 Pluribus Networks Inc. -# - -include ../Makefile.com -include ../../Makefile.cmd.64 - -CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 - -install: all $(ROOTUSRSBINPROG64) diff --git a/usr/src/cmd/bhyve/atkbdc.c b/usr/src/cmd/bhyve/atkbdc.c index 4d09d88266..1c1838c2e8 100644 --- a/usr/src/cmd/bhyve/atkbdc.c +++ b/usr/src/cmd/bhyve/atkbdc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. 
@@ -26,7 +28,7 @@ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/atkbdc.c 267611 2014-06-18 17:20:02Z neel $"); +__FBSDID("$FreeBSD$"); #include @@ -45,6 +47,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/atkbdc.c 267611 2014-06-18 17:20:02Z nee #include #include "acpi.h" +#include "atkbdc.h" #include "inout.h" #include "pci_emul.h" #include "pci_irq.h" @@ -99,19 +102,21 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/atkbdc.c 267611 2014-06-18 17:20:02Z nee #define KBDO_AUX_OUTFULL 0x20 #define RAMSZ 32 +#define FIFOSZ 15 +#define CTRL_CMD_FLAG 0x8000 struct kbd_dev { bool irq_active; int irq; - uint8_t buffer; + uint8_t buffer[FIFOSZ]; + int brd, bwr; + int bcnt; }; struct aux_dev { bool irq_active; int irq; - - uint8_t buffer; }; struct atkbdc_softc { @@ -126,6 +131,7 @@ struct atkbdc_softc { uint8_t ram[RAMSZ]; /* byte0 = controller config */ uint32_t curcmd; /* current command for next byte */ + uint32_t ctrlbyte; struct kbd_dev kbd; struct aux_dev aux; @@ -134,72 +140,37 @@ struct atkbdc_softc { static void atkbdc_assert_kbd_intr(struct atkbdc_softc *sc) { - if (!sc->kbd.irq_active && - (sc->ram[0] & KBD_ENABLE_KBD_INT) != 0) { + if ((sc->ram[0] & KBD_ENABLE_KBD_INT) != 0) { sc->kbd.irq_active = true; - vm_isa_assert_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); - } -} - -static void -atkbdc_deassert_kbd_intr(struct atkbdc_softc *sc) -{ - if (sc->kbd.irq_active) { - vm_isa_deassert_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); - sc->kbd.irq_active = false; + vm_isa_pulse_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); } } static void atkbdc_assert_aux_intr(struct atkbdc_softc *sc) { - if (!sc->aux.irq_active && - (sc->ram[0] & KBD_ENABLE_AUX_INT) != 0) { + if ((sc->ram[0] & KBD_ENABLE_AUX_INT) != 0) { sc->aux.irq_active = true; - vm_isa_assert_irq(sc->ctx, sc->aux.irq, sc->aux.irq); + vm_isa_pulse_irq(sc->ctx, sc->aux.irq, sc->aux.irq); } } -static void -atkbdc_deassert_aux_intr(struct atkbdc_softc *sc) -{ - if (sc->aux.irq_active) { - vm_isa_deassert_irq(sc->ctx, sc->aux.irq, sc->aux.irq); - sc->aux.irq_active = false; - } -} - -static void -atkbdc_aux_queue_data(struct atkbdc_softc *sc, uint8_t val) -{ - assert(pthread_mutex_isowned_np(&sc->mtx)); - - sc->aux.buffer = val; - sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); - sc->outport |= KBDO_AUX_OUTFULL; - atkbdc_assert_aux_intr(sc); -} - -static void +static int atkbdc_kbd_queue_data(struct atkbdc_softc *sc, uint8_t val) { assert(pthread_mutex_isowned_np(&sc->mtx)); - sc->kbd.buffer = val; - sc->status |= KBDS_KBD_BUFFER_FULL; - sc->outport |= KBDO_KBD_OUTFULL; - atkbdc_assert_kbd_intr(sc); -} - -static void -atkbdc_aux_read(struct atkbdc_softc *sc) -{ - uint8_t val; - - assert(pthread_mutex_isowned_np(&sc->mtx)); + if (sc->kbd.bcnt < FIFOSZ) { + sc->kbd.buffer[sc->kbd.bwr] = val; + sc->kbd.bwr = (sc->kbd.bwr + 1) % FIFOSZ; + sc->kbd.bcnt++; + sc->status |= KBDS_KBD_BUFFER_FULL; + sc->outport |= KBDO_KBD_OUTFULL; + } else { + printf("atkbd data buffer full\n"); + } - if (ps2mouse_read(sc->ps2mouse_sc, &val) != -1) - atkbdc_aux_queue_data(sc, val); + return (sc->kbd.bcnt < FIFOSZ); } static void @@ -252,21 +223,31 @@ atkbdc_kbd_read(struct atkbdc_softc *sc) } else { val = translation[val] | release; } - atkbdc_kbd_queue_data(sc, val); break; } } else { - if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) - atkbdc_kbd_queue_data(sc, val); + while (sc->kbd.bcnt < FIFOSZ) { + if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) + atkbdc_kbd_queue_data(sc, val); + else + break; + } } + + if (((sc->ram[0] & KBD_DISABLE_AUX_PORT) || + 
ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) && sc->kbd.bcnt > 0) + atkbdc_assert_kbd_intr(sc); } static void atkbdc_aux_poll(struct atkbdc_softc *sc) { - if ((sc->outport & KBDO_AUX_OUTFULL) == 0) - atkbdc_aux_read(sc); + if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0) { + sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; + sc->outport |= KBDO_AUX_OUTFULL; + atkbdc_assert_aux_intr(sc); + } } static void @@ -274,8 +255,7 @@ atkbdc_kbd_poll(struct atkbdc_softc *sc) { assert(pthread_mutex_isowned_np(&sc->mtx)); - if ((sc->outport & KBDO_KBD_OUTFULL) == 0) - atkbdc_kbd_read(sc); + atkbdc_kbd_read(sc); } static void @@ -290,22 +270,35 @@ atkbdc_dequeue_data(struct atkbdc_softc *sc, uint8_t *buf) { assert(pthread_mutex_isowned_np(&sc->mtx)); - if (sc->outport & KBDO_AUX_OUTFULL) { - *buf = sc->aux.buffer; - sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); - sc->outport &= ~KBDO_AUX_OUTFULL; - atkbdc_deassert_aux_intr(sc); + if (ps2mouse_read(sc->ps2mouse_sc, buf) == 0) { + if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) { + if (sc->kbd.bcnt == 0) + sc->status &= ~(KBDS_AUX_BUFFER_FULL | + KBDS_KBD_BUFFER_FULL); + else + sc->status &= ~(KBDS_AUX_BUFFER_FULL); + sc->outport &= ~KBDO_AUX_OUTFULL; + } atkbdc_poll(sc); return; } - *buf = sc->kbd.buffer; - sc->status &= ~KBDS_KBD_BUFFER_FULL; - sc->outport &= ~KBDO_KBD_OUTFULL; - atkbdc_deassert_kbd_intr(sc); + if (sc->kbd.bcnt > 0) { + *buf = sc->kbd.buffer[sc->kbd.brd]; + sc->kbd.brd = (sc->kbd.brd + 1) % FIFOSZ; + sc->kbd.bcnt--; + if (sc->kbd.bcnt == 0) { + sc->status &= ~KBDS_KBD_BUFFER_FULL; + sc->outport &= ~KBDO_KBD_OUTFULL; + } - atkbdc_poll(sc); + atkbdc_poll(sc); + } + + if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0 && sc->kbd.bcnt == 0) { + sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + } } static int @@ -318,19 +311,22 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, if (bytes != 1) return (-1); - sc = arg; retval = 0; pthread_mutex_lock(&sc->mtx); if (in) { sc->curcmd = 0; - sc->status &= ~KBDS_CTRL_FLAG; - - /* read device buffer; includes kbd cmd responses */ - atkbdc_dequeue_data(sc, &buf); - *eax = buf; + if (sc->ctrlbyte != 0) { + *eax = sc->ctrlbyte & 0xff; + sc->ctrlbyte = 0; + } else { + /* read device buffer; includes kbd cmd responses */ + atkbdc_dequeue_data(sc, &buf); + *eax = buf; + } + sc->status &= ~KBDS_CTRL_FLAG; pthread_mutex_unlock(&sc->mtx); return (retval); } @@ -345,29 +341,22 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, if (sc->ram[0] & KBD_SYS_FLAG_BIT) sc->status |= KBDS_SYS_FLAG; else - sc->status &= KBDS_SYS_FLAG; - if (sc->outport & KBDO_AUX_OUTFULL) - atkbdc_assert_aux_intr(sc); - else if (sc->outport & KBDO_KBD_OUTFULL) - atkbdc_assert_kbd_intr(sc); + sc->status &= ~KBDS_SYS_FLAG; break; case KBDC_WRITE_OUTPORT: sc->outport = *eax; - if (sc->outport & KBDO_AUX_OUTFULL) - sc->status |= (KBDS_AUX_BUFFER_FULL | - KBDS_KBD_BUFFER_FULL); - if (sc->outport & KBDO_KBD_OUTFULL) - sc->status |= KBDS_KBD_BUFFER_FULL; break; case KBDC_WRITE_TO_AUX: - ps2mouse_write(sc->ps2mouse_sc, *eax); + ps2mouse_write(sc->ps2mouse_sc, *eax, 0); atkbdc_poll(sc); break; case KBDC_WRITE_KBD_OUTBUF: atkbdc_kbd_queue_data(sc, *eax); break; case KBDC_WRITE_AUX_OUTBUF: - atkbdc_aux_queue_data(sc, *eax); + ps2mouse_write(sc->ps2mouse_sc, *eax, 1); + sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + atkbdc_aux_poll(sc); break; default: /* write to particular RAM byte */ @@ -398,7 +387,6 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, 
int in, int port, int bytes, return (retval); } - static int atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) @@ -421,25 +409,27 @@ atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, return (retval); } + sc->curcmd = 0; sc->status |= KBDS_CTRL_FLAG; + sc->ctrlbyte = 0; switch (*eax) { case KBDC_GET_COMMAND_BYTE: - atkbdc_kbd_queue_data(sc, sc->ram[0]); + sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[0]; break; case KBDC_TEST_CTRL: - atkbdc_kbd_queue_data(sc, 0x55); + sc->ctrlbyte = CTRL_CMD_FLAG | 0x55; break; case KBDC_TEST_AUX_PORT: case KBDC_TEST_KBD_PORT: - atkbdc_kbd_queue_data(sc, 0); + sc->ctrlbyte = CTRL_CMD_FLAG | 0; break; case KBDC_READ_INPORT: - atkbdc_kbd_queue_data(sc, 0); + sc->ctrlbyte = CTRL_CMD_FLAG | 0; break; case KBDC_READ_OUTPORT: - atkbdc_kbd_queue_data(sc, sc->outport); + sc->ctrlbyte = CTRL_CMD_FLAG | sc->outport; break; case KBDC_SET_COMMAND_BYTE: case KBDC_WRITE_OUTPORT: @@ -452,6 +442,8 @@ atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, break; case KBDC_ENABLE_KBD_PORT: sc->ram[0] &= ~KBD_DISABLE_KBD_PORT; + if (sc->kbd.bcnt > 0) + sc->status |= KBDS_KBD_BUFFER_FULL; atkbdc_poll(sc); break; case KBDC_WRITE_TO_AUX: @@ -459,17 +451,19 @@ atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, break; case KBDC_DISABLE_AUX_PORT: sc->ram[0] |= KBD_DISABLE_AUX_PORT; + ps2mouse_toggle(sc->ps2mouse_sc, 0); + sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + sc->outport &= ~KBDS_AUX_BUFFER_FULL; break; case KBDC_ENABLE_AUX_PORT: sc->ram[0] &= ~KBD_DISABLE_AUX_PORT; + ps2mouse_toggle(sc->ps2mouse_sc, 1); + if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0) + sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; break; case KBDC_RESET: /* Pulse "reset" line */ -#ifdef __FreeBSD__ error = vm_suspend(ctx, VM_SUSPEND_RESET); assert(error == 0 || errno == EALREADY); -#else - exit(0); -#endif break; default: if (*eax >= 0x21 && *eax <= 0x3f) { @@ -477,21 +471,38 @@ atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int byten; byten = (*eax - 0x20) & 0x1f; - atkbdc_kbd_queue_data(sc, sc->ram[byten]); + sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[byten]; } break; } pthread_mutex_unlock(&sc->mtx); + if (sc->ctrlbyte != 0) { + sc->status |= KBDS_KBD_BUFFER_FULL; + sc->status &= ~KBDS_AUX_BUFFER_FULL; + atkbdc_assert_kbd_intr(sc); + } else if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0 && + (sc->ram[0] & KBD_DISABLE_AUX_PORT) == 0) { + sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; + atkbdc_assert_aux_intr(sc); + } else if (sc->kbd.bcnt > 0 && (sc->ram[0] & KBD_DISABLE_KBD_PORT) == 0) { + sc->status |= KBDS_KBD_BUFFER_FULL; + atkbdc_assert_kbd_intr(sc); + } + return (retval); } void -atkbdc_event(struct atkbdc_softc *sc) +atkbdc_event(struct atkbdc_softc *sc, int iskbd) { pthread_mutex_lock(&sc->mtx); - atkbdc_poll(sc); + + if (iskbd) + atkbdc_kbd_poll(sc); + else + atkbdc_aux_poll(sc); pthread_mutex_unlock(&sc->mtx); } @@ -539,7 +550,6 @@ atkbdc_init(struct vmctx *ctx) sc->ps2mouse_sc = ps2mouse_init(sc); } -#ifdef __FreeBSD__ static void atkbdc_dsdt(void) { @@ -573,4 +583,4 @@ atkbdc_dsdt(void) dsdt_line("}"); } LPC_DSDT(atkbdc_dsdt); -#endif + diff --git a/usr/src/cmd/bhyve/atkbdc.h b/usr/src/cmd/bhyve/atkbdc.h index 48b3a8b00c..85c8a7141e 100644 --- a/usr/src/cmd/bhyve/atkbdc.h +++ b/usr/src/cmd/bhyve/atkbdc.h @@ -33,6 +33,6 @@ struct atkbdc_softc; struct vmctx; void atkbdc_init(struct vmctx *ctx); -void atkbdc_event(struct atkbdc_softc *sc); 
+void atkbdc_event(struct atkbdc_softc *sc, int iskbd); #endif /* _ATKBDC_H_ */ diff --git a/usr/src/cmd/bhyve/bhyve_sol_glue.c b/usr/src/cmd/bhyve/bhyve_sol_glue.c index 633faacc5f..7b24ea7f5d 100644 --- a/usr/src/cmd/bhyve/bhyve_sol_glue.c +++ b/usr/src/cmd/bhyve/bhyve_sol_glue.c @@ -11,6 +11,7 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include @@ -25,62 +26,14 @@ void cfmakeraw(struct termios *t) { - t->c_iflag &= ~(IMAXBEL|IXOFF|INPCK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR|ICRNL|IXON|IGNPAR); + t->c_iflag &= ~(IMAXBEL|IXOFF|INPCK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR| + ICRNL|IXON|IGNPAR); t->c_iflag |= IGNBRK; t->c_oflag &= ~OPOST; - t->c_lflag &= ~(ECHO|ECHOE|ECHOK|ECHONL|ICANON|ISIG|IEXTEN|NOFLSH|TOSTOP |PENDIN); + t->c_lflag &= ~(ECHO|ECHOE|ECHOK|ECHONL|ICANON|ISIG|IEXTEN|NOFLSH| + TOSTOP|PENDIN); t->c_cflag &= ~(CSIZE|PARENB); t->c_cflag |= CS8|CREAD; t->c_cc[VMIN] = 1; t->c_cc[VTIME] = 0; } - -ssize_t -preadv(int d, const struct iovec *iov, int iovcnt, off_t offset) -{ - off_t old_offset; - ssize_t n; - - old_offset = lseek(d, (off_t)0, SEEK_CUR); - if (old_offset == -1) - return (-1); - - offset = lseek(d, offset, SEEK_SET); - if (offset == -1) - return (-1); - - n = readv(d, iov, iovcnt); - if (n == -1) - return (-1); - - offset = lseek(d, old_offset, SEEK_SET); - if (offset == -1) - return (-1); - - return (n); -} - -ssize_t -pwritev(int d, const struct iovec *iov, int iovcnt, off_t offset) -{ - off_t old_offset; - ssize_t n; - - old_offset = lseek(d, (off_t)0, SEEK_CUR); - if (old_offset == -1) - return (-1); - - offset = lseek(d, offset, SEEK_SET); - if (offset == -1) - return (-1); - - n = writev(d, iov, iovcnt); - if (n == -1) - return (-1); - - offset = lseek(d, old_offset, SEEK_SET); - if (offset == -1) - return (-1); - - return (n); -} diff --git a/usr/src/cmd/bhyve/bhyvegc.c b/usr/src/cmd/bhyve/bhyvegc.c index 7a13c4c83f..4bd49ded79 100644 --- a/usr/src/cmd/bhyve/bhyvegc.c +++ b/usr/src/cmd/bhyve/bhyvegc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. 
* @@ -37,10 +39,11 @@ __FBSDID("$FreeBSD$"); struct bhyvegc { struct bhyvegc_image *gc_image; + int raw; }; struct bhyvegc * -bhyvegc_init(int width, int height) +bhyvegc_init(int width, int height, void *fbaddr) { struct bhyvegc *gc; struct bhyvegc_image *gc_image; @@ -50,13 +53,28 @@ bhyvegc_init(int width, int height) gc_image = calloc(1, sizeof(struct bhyvegc_image)); gc_image->width = width; gc_image->height = height; - gc_image->data = calloc(width * height, sizeof (uint32_t)); + if (fbaddr) { + gc_image->data = fbaddr; + gc->raw = 1; + } else { + gc_image->data = calloc(width * height, sizeof (uint32_t)); + gc->raw = 0; + } gc->gc_image = gc_image; return (gc); } +void +bhyvegc_set_fbaddr(struct bhyvegc *gc, void *fbaddr) +{ + gc->raw = 1; + if (gc->gc_image->data && gc->gc_image->data != fbaddr) + free(gc->gc_image->data); + gc->gc_image->data = fbaddr; +} + void bhyvegc_resize(struct bhyvegc *gc, int width, int height) { @@ -66,13 +84,20 @@ bhyvegc_resize(struct bhyvegc *gc, int width, int height) gc_image->width = width; gc_image->height = height; - gc_image->data = realloc(gc_image->data, - sizeof (uint32_t) * width * height); - memset(gc_image->data, 0, width * height * sizeof (uint32_t)); + if (!gc->raw) { + gc_image->data = reallocarray(gc_image->data, width * height, + sizeof (uint32_t)); + if (gc_image->data != NULL) + memset(gc_image->data, 0, width * height * + sizeof (uint32_t)); + } } struct bhyvegc_image * bhyvegc_get_image(struct bhyvegc *gc) { + if (gc == NULL) + return (NULL); + return (gc->gc_image); } diff --git a/usr/src/cmd/bhyve/bhyvegc.h b/usr/src/cmd/bhyve/bhyvegc.h index 19648f98af..11323586df 100644 --- a/usr/src/cmd/bhyve/bhyvegc.h +++ b/usr/src/cmd/bhyve/bhyvegc.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * @@ -32,12 +34,14 @@ struct bhyvegc; struct bhyvegc_image { + int vgamode; int width; int height; uint32_t *data; }; -struct bhyvegc *bhyvegc_init(int width, int height); +struct bhyvegc *bhyvegc_init(int width, int height, void *fbaddr); +void bhyvegc_set_fbaddr(struct bhyvegc *gc, void *fbaddr); void bhyvegc_resize(struct bhyvegc *gc, int width, int height); struct bhyvegc_image *bhyvegc_get_image(struct bhyvegc *gc); diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c index b985a2286e..928d2dc811 100644 --- a/usr/src/cmd/bhyve/bhyverun.c +++ b/usr/src/cmd/bhyve/bhyverun.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,30 +38,50 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z neel $"); +__FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include +#include + +#ifdef __FreeBSD__ +#include +#else +#include +#endif +#include #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include +#include #include #include #include -#include #include #include #include +#include +#include #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include "bhyverun.h" @@ -68,11 +90,11 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z n #include "console.h" #include "inout.h" #include "dbgport.h" +#include "fwctl.h" +#include "gdb.h" #include "ioapic.h" #include "mem.h" -#ifdef __FreeBSD__ #include "mevent.h" -#endif #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" @@ -89,11 +111,81 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z n #define MB (1024UL * 1024) #define GB (1024UL * MB) +static const char * const vmx_exit_reason_desc[] = { + [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", + [EXIT_REASON_EXT_INTR] = "External interrupt", + [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", + [EXIT_REASON_INIT] = "INIT signal", + [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", + [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", + [EXIT_REASON_SMI] = "Other SMI", + [EXIT_REASON_INTR_WINDOW] = "Interrupt window", + [EXIT_REASON_NMI_WINDOW] = "NMI window", + [EXIT_REASON_TASK_SWITCH] = "Task switch", + [EXIT_REASON_CPUID] = "CPUID", + [EXIT_REASON_GETSEC] = "GETSEC", + [EXIT_REASON_HLT] = "HLT", + [EXIT_REASON_INVD] = "INVD", + [EXIT_REASON_INVLPG] = "INVLPG", + [EXIT_REASON_RDPMC] = "RDPMC", + [EXIT_REASON_RDTSC] = "RDTSC", + [EXIT_REASON_RSM] = "RSM", + [EXIT_REASON_VMCALL] = "VMCALL", + [EXIT_REASON_VMCLEAR] = "VMCLEAR", + [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", + [EXIT_REASON_VMPTRLD] = "VMPTRLD", + [EXIT_REASON_VMPTRST] = "VMPTRST", + [EXIT_REASON_VMREAD] = "VMREAD", + [EXIT_REASON_VMRESUME] = "VMRESUME", + [EXIT_REASON_VMWRITE] = "VMWRITE", + [EXIT_REASON_VMXOFF] = "VMXOFF", + [EXIT_REASON_VMXON] = "VMXON", + [EXIT_REASON_CR_ACCESS] = "Control-register accesses", + [EXIT_REASON_DR_ACCESS] = "MOV DR", + [EXIT_REASON_INOUT] = "I/O instruction", + [EXIT_REASON_RDMSR] = "RDMSR", + [EXIT_REASON_WRMSR] = "WRMSR", + [EXIT_REASON_INVAL_VMCS] = + "VM-entry failure due to invalid guest state", + [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", + [EXIT_REASON_MWAIT] = "MWAIT", + [EXIT_REASON_MTF] = "Monitor trap flag", + [EXIT_REASON_MONITOR] = "MONITOR", + [EXIT_REASON_PAUSE] = "PAUSE", + [EXIT_REASON_MCE_DURING_ENTRY] = + "VM-entry failure due to machine-check event", + [EXIT_REASON_TPR] = "TPR below threshold", + [EXIT_REASON_APIC_ACCESS] = "APIC access", + [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", + [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", + [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", + [EXIT_REASON_EPT_FAULT] = "EPT violation", + [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", + [EXIT_REASON_INVEPT] = "INVEPT", + [EXIT_REASON_RDTSCP] = "RDTSCP", + [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", + [EXIT_REASON_INVVPID] = "INVVPID", + [EXIT_REASON_WBINVD] = "WBINVD", + [EXIT_REASON_XSETBV] = "XSETBV", + [EXIT_REASON_APIC_WRITE] = "APIC write", + [EXIT_REASON_RDRAND] = "RDRAND", + [EXIT_REASON_INVPCID] = "INVPCID", + [EXIT_REASON_VMFUNC] = "VMFUNC", + [EXIT_REASON_ENCLS] = 
"ENCLS", + [EXIT_REASON_RDSEED] = "RDSEED", + [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", + [EXIT_REASON_XSAVES] = "XSAVES", + [EXIT_REASON_XRSTORS] = "XRSTORS" +}; + typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); +extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); char *vmname; int guest_ncpus; +uint16_t cores, maxcpus, sockets, threads; + char *guest_uuid_str; static int guest_vmexit_on_hlt, guest_vmexit_on_pause; @@ -103,9 +195,7 @@ static int x2apic_mode = 0; /* default is xAPIC */ static int strictio; static int strictmsr = 1; -#ifdef __FreeBSD__ static int acpi; -#endif static char *progname; static const int BSP = 0; @@ -124,15 +214,14 @@ static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); static struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { - uint64_t vmexit_bogus; - uint64_t vmexit_bogus_switch; - uint64_t vmexit_hlt; - uint64_t vmexit_pause; - uint64_t vmexit_mtrap; - uint64_t vmexit_inst_emul; - uint64_t cpu_switch_rotate; - uint64_t cpu_switch_direct; - int io_reset; + uint64_t vmexit_bogus; + uint64_t vmexit_reqidle; + uint64_t vmexit_hlt; + uint64_t vmexit_pause; + uint64_t vmexit_mtrap; + uint64_t vmexit_inst_emul; + uint64_t cpu_switch_rotate; + uint64_t cpu_switch_direct; } stats; struct mt_vmm_info { @@ -141,55 +230,211 @@ struct mt_vmm_info { int mt_vcpu; } mt_vmm_info[VM_MAXCPU]; +#ifdef __FreeBSD__ +static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; +#endif + static void usage(int code) { -#ifdef __FreeBSD__ fprintf(stderr, - "Usage: %s [-aehwAHIPW] [-g ] [-s ] [-c vcpus]\n" - " %*s [-p vcpu:hostcpu] [-m mem] [-l ] \n" + "Usage: %s [-abehuwxACHPSWY]\n" + " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" + " %*s [-g ] [-l ]\n" +#ifdef __FreeBSD__ + " %*s [-m mem] [-p vcpu:hostcpu] [-s ] [-U uuid] \n" +#else + " %*s [-m mem] [-s ] [-U uuid] \n" +#endif " -a: local apic is in xAPIC mode (deprecated)\n" - " -A: create an ACPI table\n" - " -g: gdb port\n" - " -c: # cpus (default 1)\n" + " -A: create ACPI tables\n" + " -c: number of cpus and/or topology specification\n" " -C: include guest memory in core file\n" - " -p: pin 'vcpu' to 'hostcpu'\n" - " -H: vmexit from the guest on hlt\n" - " -P: vmexit from the guest on pause\n" - " -W: force virtio to use single-vector MSI\n" " -e: exit on unhandled I/O access\n" + " -g: gdb port\n" " -h: help\n" - " -s: PCI slot config\n" - " -l: LPC device configuration\n" - " -m: memory size in MB\n" - " -w: ignore unimplemented MSRs\n" - " -x: local apic is in x2APIC mode\n" - " -Y: disable MPtable generation\n" - " -U: uuid\n", - progname, (int)strlen(progname), ""); -#else - fprintf(stderr, - "Usage: %s [-ehwHPW] [-s ] [-c vcpus]\n" - " %*s [-p vcpu:hostcpu] [-m mem] [-l ] \n" - " -c: # cpus (default 1)\n" " -H: vmexit from the guest on hlt\n" + " -l: LPC device configuration\n" + " -m: memory size\n" +#ifdef __FreeBSD__ + " -p: pin 'vcpu' to 'hostcpu'\n" +#endif " -P: vmexit from the guest on pause\n" - " -W: force virtio to use single-vector MSI\n" - " -e: exit on unhandled I/O access\n" - " -h: help\n" " -s: PCI slot config\n" - " -l: LPC device configuration\n" - " -m: memory size in MB\n" + " -S: guest memory cannot be swapped\n" + " -u: RTC keeps UTC time\n" + " -U: uuid\n" " -w: ignore unimplemented MSRs\n" - " -Y: disable MPtable generation\n" - " -U: uuid\n", - progname, (int)strlen(progname), ""); -#endif + " -W: force virtio to use single-vector MSI\n" + " -x: local apic is in x2APIC mode\n" + " -Y: disable MPtable 
generation\n", + progname, (int)strlen(progname), "", (int)strlen(progname), "", + (int)strlen(progname), ""); exit(code); } +/* + * XXX This parser is known to have the following issues: + * 1. It accepts null key=value tokens ",,". + * 2. It accepts whitespace after = and before value. + * 3. Values out of range of INT are silently wrapped. + * 4. It doesn't check non-final values. + * 5. The apparently bogus limits of UINT16_MAX are for future expansion. + * + * The acceptance of a null specification ('-c ""') is by design to match the + * manual page syntax specification, this results in a topology of 1 vCPU. + */ +static int +topology_parse(const char *opt) +{ + uint64_t ncpus; + int c, chk, n, s, t, tmp; + char *cp, *str; + bool ns, scts; + + c = 1, n = 1, s = 1, t = 1; + ns = false, scts = false; + str = strdup(opt); + if (str == NULL) + goto out; + + while ((cp = strsep(&str, ",")) != NULL) { + if (sscanf(cp, "%i%n", &tmp, &chk) == 1) { + n = tmp; + ns = true; + } else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) { + n = tmp; + ns = true; + } else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) { + s = tmp; + scts = true; + } else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) { + c = tmp; + scts = true; + } else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) { + t = tmp; + scts = true; +#ifdef notyet /* Do not expose this until vmm.ko implements it */ + } else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) { + m = tmp; +#endif + /* Skip the empty argument case from -c "" */ + } else if (cp[0] == '\0') + continue; + else + goto out; + /* Any trailing garbage causes an error */ + if (cp[chk] != '\0') + goto out; + } + free(str); + str = NULL; + + /* + * Range check 1 <= n <= UINT16_MAX all values + */ + if (n < 1 || s < 1 || c < 1 || t < 1 || + n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX || + t > UINT16_MAX) + return (-1); + + /* If only the cpus was specified, use that as sockets */ + if (!scts) + s = n; + /* + * Compute sockets * cores * threads avoiding overflow + * The range check above insures these are 16 bit values + * If n was specified check it against computed ncpus + */ + ncpus = (uint64_t)s * c * t; + if (ncpus > UINT16_MAX || (ns && n != ncpus)) + return (-1); + + guest_ncpus = ncpus; + sockets = s; + cores = c; + threads = t; + return(0); + +out: + free(str); + return (-1); +} + +#ifndef WITHOUT_CAPSICUM +/* + * 11-stable capsicum helpers + */ +static void +bhyve_caph_cache_catpages(void) +{ + + (void)catopen("libc", NL_CAT_LOCALE); +} + +static int +bhyve_caph_limit_stdoe(void) +{ + cap_rights_t rights; + unsigned long cmds[] = { TIOCGETA, TIOCGWINSZ }; + int i, fds[] = { STDOUT_FILENO, STDERR_FILENO }; + + cap_rights_init(&rights, CAP_FCNTL, CAP_FSTAT, CAP_IOCTL); + cap_rights_set(&rights, CAP_WRITE); + + for (i = 0; i < nitems(fds); i++) { + if (cap_rights_limit(fds[i], &rights) < 0 && errno != ENOSYS) + return (-1); + + if (cap_ioctls_limit(fds[i], cmds, nitems(cmds)) < 0 && errno != ENOSYS) + return (-1); + + if (cap_fcntls_limit(fds[i], CAP_FCNTL_GETFL) < 0 && errno != ENOSYS) + return (-1); + } + + return (0); +} + +#endif + +#ifdef __FreeBSD__ +static int +pincpu_parse(const char *opt) +{ + int vcpu, pcpu; + + if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { + fprintf(stderr, "invalid format: %s\n", opt); + return (-1); + } + + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", + vcpu, VM_MAXCPU - 1); + return (-1); + } + + if (pcpu < 0 || pcpu >= CPU_SETSIZE) { + fprintf(stderr, "hostcpu '%d' 
outside valid range from " + "0 to %d\n", pcpu, CPU_SETSIZE - 1); + return (-1); + } + + if (vcpumap[vcpu] == NULL) { + if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { + perror("malloc"); + return (-1); + } + CPU_ZERO(vcpumap[vcpu]); + } + CPU_SET(pcpu, vcpumap[vcpu]); + return (0); +} +#endif + void vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) @@ -246,6 +491,8 @@ fbsdrun_start_thread(void *param) snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); + gdb_cpu_add(vcpu); + vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); /* not reached */ @@ -267,7 +514,8 @@ fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) * with vm_suspend(). */ error = vm_activate_cpu(ctx, newcpu); - assert(error == 0); + if (error != 0) + err(EX_OSERR, "could not activate CPU %d", newcpu); CPU_SET_ATOMIC(newcpu, &cpumask); @@ -286,6 +534,19 @@ fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) assert(error == 0); } +static int +fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) +{ + + if (!CPU_ISSET(vcpu, &cpumask)) { + fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); + exit(4); + } + + CPU_CLR_ATOMIC(vcpu, &cpumask); + return (CPU_EMPTY(&cpumask)); +} + static int vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, uint32_t eax) @@ -295,21 +556,20 @@ vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, * put guest-driven debug here */ #endif - return (VMEXIT_CONTINUE); + return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; - int bytes, port, in, out, string; + int bytes, port, in, out; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; - string = vme->u.inout.string; in = vme->u.inout.in; out = !in; @@ -380,13 +640,29 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { - int newcpu; - int retval = VMEXIT_CONTINUE; - newcpu = spinup_ap(ctx, *pvcpu, - vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); + (void)spinup_ap(ctx, *pvcpu, + vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); + + return (VMEXIT_CONTINUE); +} + +#define DEBUG_EPT_MISCONFIG +#ifdef DEBUG_EPT_MISCONFIG +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; +static int ept_misconfig_ptenum; +#endif + +static const char * +vmexit_vmx_desc(uint32_t exit_reason) +{ - return (retval); + if (exit_reason >= nitems(vmx_exit_reason_desc) || + vmx_exit_reason_desc[exit_reason] == NULL) + return ("Unknown"); + return (vmx_exit_reason_desc[exit_reason]); } static int @@ -398,12 +674,41 @@ vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); - fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason); + fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason, + vmexit_vmx_desc(vmexit->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); +#ifdef DEBUG_EPT_MISCONFIG + if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { + vm_get_register(ctx, *pvcpu, + 
VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), + &ept_misconfig_gpa); + vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, + &ept_misconfig_ptenum); + fprintf(stderr, "\tEPT misconfiguration:\n"); + fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); + fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", + ept_misconfig_ptenum, ept_misconfig_pte[0], + ept_misconfig_pte[1], ept_misconfig_pte[2], + ept_misconfig_pte[3]); + } +#endif /* DEBUG_EPT_MISCONFIG */ + return (VMEXIT_ABORT); +} +static int +vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + fprintf(stderr, "vm exit[%d]\n", *pvcpu); + fprintf(stderr, "\treason\t\tSVM\n"); + fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); + fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); + fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); + fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); + fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); return (VMEXIT_ABORT); } @@ -411,11 +716,24 @@ static int vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { + assert(vmexit->inst_length == 0); + stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } +static int +vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + assert(vmexit->inst_length == 0); + + stats.vmexit_reqidle++; + + return (VMEXIT_CONTINUE); +} + static int vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { @@ -443,8 +761,12 @@ static int vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { + assert(vmexit->inst_length == 0); + stats.vmexit_mtrap++; + gdb_cpu_mtrap(*pvcpu); + return (VMEXIT_CONTINUE); } @@ -478,35 +800,88 @@ vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) return (VMEXIT_CONTINUE); } +static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; + +static int +vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + enum vm_suspend_how how; + + how = vmexit->u.suspended.how; + + fbsdrun_deletecpu(ctx, *pvcpu); + + if (*pvcpu != BSP) { + pthread_mutex_lock(&resetcpu_mtx); + pthread_cond_signal(&resetcpu_cond); + pthread_mutex_unlock(&resetcpu_mtx); + pthread_exit(NULL); + } + + pthread_mutex_lock(&resetcpu_mtx); + while (!CPU_EMPTY(&cpumask)) { + pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); + } + pthread_mutex_unlock(&resetcpu_mtx); + + switch (how) { + case VM_SUSPEND_RESET: + exit(0); + case VM_SUSPEND_POWEROFF: + exit(1); + case VM_SUSPEND_HALT: + exit(2); + case VM_SUSPEND_TRIPLEFAULT: + exit(3); + default: + fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); + exit(100); + } + return (0); /* NOTREACHED */ +} + +static int +vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + gdb_cpu_suspend(*pvcpu); + return (VMEXIT_CONTINUE); +} + static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, + [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, + [VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, + [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, + [VM_EXITCODE_SUSPENDED] = vmexit_suspend, + [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, + [VM_EXITCODE_DEBUG] = vmexit_debug, }; static void vm_loop(struct vmctx *ctx, int vcpu, 
uint64_t startrip) { -#ifdef __FreeBSD__ - cpuset_t mask; -#endif - int error, rc, prevcpu; + int error, rc; enum vm_exitcode exitcode; + cpuset_t active_cpus; #ifdef __FreeBSD__ - if (pincpu >= 0) { - CPU_ZERO(&mask); - CPU_SET(pincpu + vcpu, &mask); + if (vcpumap[vcpu] != NULL) { error = pthread_setaffinity_np(pthread_self(), - sizeof(mask), &mask); + sizeof(cpuset_t), vcpumap[vcpu]); assert(error == 0); } #endif + error = vm_active_cpus(ctx, &active_cpus); + assert(CPU_ISSET(vcpu, &active_cpus)); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); assert(error == 0); @@ -516,16 +891,14 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) if (error != 0) break; - prevcpu = vcpu; - exitcode = vmexit[vcpu].exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); - exit(1); + exit(4); } - rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); + rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); switch (rc) { case VMEXIT_CONTINUE: @@ -533,7 +906,7 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) case VMEXIT_ABORT: abort(); default: - exit(1); + exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); @@ -565,7 +938,7 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); - exit(1); + exit(4); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) @@ -580,7 +953,7 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); - exit(1); + exit(4); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) @@ -594,7 +967,7 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); - exit(1); + exit(4); } #ifdef __FreeBSD__ @@ -602,70 +975,175 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) #endif } +static struct vmctx * +do_open(const char *vmname) +{ + struct vmctx *ctx; + int error; + bool reinit, romboot; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + const cap_ioctl_t *cmds; + size_t ncmds; +#endif + + reinit = romboot = false; + + if (lpc_bootrom()) + romboot = true; + + error = vm_create(vmname); + if (error) { + if (errno == EEXIST) { + if (romboot) { + reinit = true; + } else { + /* + * The virtual machine has been setup by the + * userspace bootloader. + */ + } + } else { + perror("vm_create"); + exit(4); + } + } else { + if (!romboot) { + /* + * If the virtual machine was just created then a + * bootrom must be configured to boot it. 
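 *
 * The four paths through do_open() in table form (a sketch of the
 * logic above and below):
 *
 *	vm_create()	lpc_bootrom()	outcome
 *	0		set		fresh VM, boots from the ROM
 *	0		unset		"cannot be booted", exit(4)
 *	EEXIST		set		reinit = true, vm_reinit() below
 *	EEXIST		unset		VM prepared by userspace bootloader
 *	other error	-		perror("vm_create"), exit(4)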
+ */ + fprintf(stderr, "virtual machine cannot be booted\n"); + exit(4); + } + } + + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(4); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); + if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + vm_get_ioctls(&ncmds); + cmds = vm_get_ioctls(NULL); + if (cmds == NULL) + errx(EX_OSERR, "out of memory"); + if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + free((cap_ioctl_t *)cmds); +#endif + + if (reinit) { + error = vm_reinit(ctx); + if (error) { + perror("vm_reinit"); + exit(4); + } + } + error = vm_set_topology(ctx, sockets, cores, threads, maxcpus); + if (error) + errx(EX_OSERR, "vm_set_topology"); + return (ctx); +} + int main(int argc, char *argv[]) { - int c, error, gdb_port, rfb_port, err, bvmcons; - int max_vcpus; + int c, error, dbg_port, gdb_port, err, bvmcons; + int max_vcpus, mptgen, memflags; + int rtc_localtime; + bool gdb_stop; struct vmctx *ctx; uint64_t rip; size_t memsize; + char *optstr; bvmcons = 0; progname = basename(argv[0]); + dbg_port = 0; gdb_port = 0; - rfb_port = -1; + gdb_stop = false; guest_ncpus = 1; + sockets = cores = threads = 1; + maxcpus = 0; memsize = 256 * MB; - + mptgen = 1; + rtc_localtime = 1; + memflags = 0; #ifdef __FreeBSD__ - while ((c = getopt(argc, argv, "abehwxACHIPWYp:r:g:c:s:m:l:U:")) != -1) { + optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:B:U:"; #else - while ((c = getopt(argc, argv, "abehwxHIPWYr:c:s:m:l:U:")) != -1) { + optstr = "abehuwxACHIPSWYg:G:c:s:m:l:B:U:"; #endif + while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': x2apic_mode = 0; break; -#ifdef __FreeBSD__ case 'A': acpi = 1; break; -#endif case 'b': bvmcons = 1; break; + case 'B': + if (smbios_parse(optarg) != 0) { + errx(EX_USAGE, "invalid SMBIOS " + "configuration '%s'", optarg); + } + break; #ifdef __FreeBSD__ case 'p': - pincpu = atoi(optarg); + if (pincpu_parse(optarg) != 0) { + errx(EX_USAGE, "invalid vcpu pinning " + "configuration '%s'", optarg); + } break; #endif - case 'r': - if (optarg[0] == ':') - rfb_port = atoi(optarg + 1) + RFB_PORT; - else - rfb_port = atoi(optarg); - break; case 'c': - guest_ncpus = atoi(optarg); + if (topology_parse(optarg) != 0) { + errx(EX_USAGE, "invalid cpu topology " + "'%s'", optarg); + } + break; + case 'C': + memflags |= VM_MEM_F_INCORE; break; -#ifdef __FreeBSD__ case 'g': + dbg_port = atoi(optarg); + break; + case 'G': + if (optarg[0] == 'w') { + gdb_stop = true; + optarg++; + } gdb_port = atoi(optarg); break; -#endif case 'l': - if (lpc_device_parse(optarg) != 0) { + if (strncmp(optarg, "help", strlen(optarg)) == 0) { + lpc_print_supported_devices(); + exit(0); + } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; case 's': - if (pci_parse_slot(optarg) != 0) - exit(1); + if (strncmp(optarg, "help", strlen(optarg)) == 0) { + pci_print_supported_devices(); + exit(0); + } else if (pci_parse_slot(optarg) != 0) + exit(4); else break; + case 'S': + memflags |= VM_MEM_F_WIRED; + break; case 'm': error = vm_parse_memsize(optarg, &memsize); if (error) @@ -689,15 +1167,24 @@ main(int argc, char *argv[]) case 'e': strictio = 1; break; + case 'u': + rtc_localtime = 0; + break; case 'U': guest_uuid_str = optarg; break; + case 'w': + strictmsr = 0; + break; case 'W': virtio_msix = 0; break; case 'x': x2apic_mode = 1; break; + 
case 'Y': + mptgen = 0; + break; case 'h': usage(0); default: @@ -711,32 +1198,41 @@ main(int argc, char *argv[]) usage(1); vmname = argv[0]; - - ctx = vm_open(vmname); - if (ctx == NULL) { - perror("vm_open"); - exit(1); - } + ctx = do_open(vmname); max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); - exit(1); + exit(4); } fbsdrun_set_capabilities(ctx, BSP); + vm_set_memflags(ctx, memflags); +#ifdef __FreeBSD__ err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); +#else + do { + errno = 0; + err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); + error = errno; + if (err != 0 && error == ENOMEM) { + (void) fprintf(stderr, "Unable to allocate memory " + "(%llu), retrying in 1 second\n", memsize); + sleep(1); + } + } while (error == ENOMEM); +#endif if (err) { - fprintf(stderr, "Unable to setup memory (%d)\n", err); - exit(1); + fprintf(stderr, "Unable to setup memory (%d)\n", errno); + exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); - exit(1); + exit(4); } init_mem(); @@ -745,26 +1241,37 @@ main(int argc, char *argv[]) pci_irq_init(ctx); ioapic_init(ctx); - rtc_init(ctx); + rtc_init(ctx, rtc_localtime); + sci_init(ctx); /* - * Exit if a device emulation finds an error in it's initilization + * Exit if a device emulation finds an error in its initilization */ - if (init_pci(ctx) != 0) - exit(1); + if (init_pci(ctx) != 0) { + perror("device emulation initialization error"); + exit(4); + } + + if (dbg_port != 0) + init_dbgport(dbg_port); -#ifdef __FreeBSD__ if (gdb_port != 0) - init_dbgport(gdb_port); -#endif + init_gdb(ctx, gdb_port, gdb_stop); if (bvmcons) init_bvmcons(); - console_init(); - vga_init(); - if (rfb_port != -1) - rfb_init(rfb_port); + vga_init(1); + + if (lpc_bootrom()) { + if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { + fprintf(stderr, "ROM boot failed: unrestricted guest " + "capability not available\n"); + exit(4); + } + error = vcpu_reset(ctx, BSP); + assert(error == 0); + } error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); @@ -772,22 +1279,41 @@ main(int argc, char *argv[]) /* * build the guest tables, MP etc. */ - mptable_build(ctx, guest_ncpus); + if (mptgen) { + error = mptable_build(ctx, guest_ncpus); + if (error) { + perror("error to build the guest tables"); + exit(4); + } + } error = smbios_build(ctx); assert(error == 0); -#ifdef __FreeBSD__ if (acpi) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } + if (lpc_bootrom()) + fwctl_init(); + /* * Change the proc title to include the VM name. */ - setproctitle("%s", vmname); -#else + setproctitle("%s", vmname); + +#ifndef WITHOUT_CAPSICUM + caph_cache_catpages(); + + if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + + if (caph_enter() == -1) + errx(EX_OSERR, "cap_enter() failed"); +#endif + +#ifndef __FreeBSD__ /* * If applicable, wait for bhyveconsole */ @@ -810,11 +1336,7 @@ main(int argc, char *argv[]) /* * Head off to the main event dispatch loop */ -#ifdef __FreeBSD__ mevent_dispatch(); -#else - pthread_exit(NULL); -#endif - exit(1); + exit(4); } diff --git a/usr/src/cmd/bhyve/bhyverun.h b/usr/src/cmd/bhyve/bhyverun.h index be89314c09..78b3f1111f 100644 --- a/usr/src/cmd/bhyve/bhyverun.h +++ b/usr/src/cmd/bhyve/bhyverun.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. 
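 *
 * Illustrative -c strings accepted by topology_parse() earlier; when
 * both a cpu count and a topology are given, sockets * cores *
 * threads must equal cpus:
 *
 *	-c 4				cpus 4; sockets 4, cores 1, threads 1
 *	-c cpus=8,sockets=2,cores=4	accepted: 2 * 4 * 1 == 8
 *	-c cpus=8,sockets=2,cores=2	rejected: 2 * 2 * 1 != 8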
* @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/bhyverun.h 277310 2015-01-18 03:08:30Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -41,17 +43,12 @@ #ifndef _FBSDRUN_H_ #define _FBSDRUN_H_ -#ifndef CTASSERT /* Allow lint to override */ -#define CTASSERT(x) _CTASSERT(x, __LINE__) -#define _CTASSERT(x, y) __CTASSERT(x, y) -#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1] -#endif - #define VMEXIT_CONTINUE (0) #define VMEXIT_ABORT (-1) struct vmctx; extern int guest_ncpus; +extern uint16_t cores, sockets, threads; extern char *guest_uuid_str; extern char *vmname; #ifndef __FreeBSD__ diff --git a/usr/src/cmd/bhyve/block_if.c b/usr/src/cmd/bhyve/block_if.c index 2da946d420..72c5b02a0d 100644 --- a/usr/src/cmd/bhyve/block_if.c +++ b/usr/src/cmd/bhyve/block_if.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Peter Grehan * All rights reserved. * @@ -23,20 +25,36 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z tychon $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z tychon $"); +__FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include #include +#include +#include +#ifndef __FreeBSD__ +#include +#endif #include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include #include #include #include @@ -44,6 +62,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z t #include #include #include +#include #include #include @@ -56,16 +75,27 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z t #define BLOCKIF_SIG 0xb109b109 -#define BLOCKIF_MAXREQ 33 +#ifdef __FreeBSD__ +#define BLOCKIF_NUMTHR 8 +#else +/* Enlarge to keep pace with the virtio-block ring size */ +#define BLOCKIF_NUMTHR 16 +#endif +#define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) enum blockop { BOP_READ, BOP_WRITE, - BOP_FLUSH +#ifndef __FreeBSD__ + BOP_WRITE_SYNC, +#endif + BOP_FLUSH, + BOP_DELETE }; enum blockstat { BST_FREE, + BST_BLOCK, BST_PEND, BST_BUSY, BST_DONE @@ -77,24 +107,40 @@ struct blockif_elem { enum blockop be_op; enum blockstat be_status; pthread_t be_tid; + off_t be_block; }; +#ifndef __FreeBSD__ +enum blockif_wce { + WCE_NONE = 0, + WCE_IOCTL, + WCE_FCNTL +}; +#endif + struct blockif_ctxt { int bc_magic; int bc_fd; + int bc_ischr; + int bc_isgeom; + int bc_candelete; +#ifndef __FreeBSD__ + enum blockif_wce bc_wce; +#endif int bc_rdonly; off_t bc_size; int bc_sectsz; - pthread_t bc_btid; - pthread_mutex_t bc_mtx; - pthread_cond_t bc_cond; + int bc_psectsz; + int bc_psectoff; int bc_closing; + pthread_t bc_btid[BLOCKIF_NUMTHR]; + pthread_mutex_t bc_mtx; + pthread_cond_t bc_cond; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; TAILQ_HEAD(, blockif_elem) bc_pendq; TAILQ_HEAD(, blockif_elem) bc_busyq; - u_int bc_req_count; struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; }; @@ -113,83 +159,214 @@ static int blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { - struct blockif_elem *be; - - assert(bc->bc_req_count < BLOCKIF_MAXREQ); + struct blockif_elem *be, *tbe; + off_t off; + int i; be = TAILQ_FIRST(&bc->bc_freeq); 
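	/*
	 * Dispatch-ordering sketch for the logic below: the new element
	 * is parked as BST_BLOCK rather than BST_PEND when some pending
	 * or busy request ends exactly at this request's starting offset
	 * (be_block holds the end offset for reads, writes and deletes,
	 * OFF_MAX otherwise).  blockif_complete() flips matching
	 * BST_BLOCK entries back to BST_PEND, so chains of contiguous
	 * I/O keep submission order across the BLOCKIF_NUMTHR workers.
	 */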
assert(be != NULL); assert(be->be_status == BST_FREE); - TAILQ_REMOVE(&bc->bc_freeq, be, be_link); - be->be_status = BST_PEND; be->be_req = breq; be->be_op = op; + switch (op) { + case BOP_READ: + case BOP_WRITE: +#ifndef __FreeBSD__ + case BOP_WRITE_SYNC: +#endif + case BOP_DELETE: + off = breq->br_offset; + for (i = 0; i < breq->br_iovcnt; i++) + off += breq->br_iov[i].iov_len; + break; + default: + off = OFF_MAX; + } + be->be_block = off; + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + if (tbe == NULL) { + TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + } + if (tbe == NULL) + be->be_status = BST_PEND; + else + be->be_status = BST_BLOCK; TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); - - bc->bc_req_count++; - - return (0); + return (be->be_status == BST_PEND); } static int -blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep) +blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) { struct blockif_elem *be; - if (bc->bc_req_count == 0) - return (ENOENT); - - be = TAILQ_FIRST(&bc->bc_pendq); - assert(be != NULL); - assert(be->be_status == BST_PEND); + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_status == BST_PEND) + break; + assert(be->be_status == BST_BLOCK); + } + if (be == NULL) + return (0); TAILQ_REMOVE(&bc->bc_pendq, be, be_link); be->be_status = BST_BUSY; - be->be_tid = bc->bc_btid; + be->be_tid = t; TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); - *bep = be; - - return (0); + return (1); } static void blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) { - assert(be->be_status == BST_DONE); + struct blockif_elem *tbe; - TAILQ_REMOVE(&bc->bc_busyq, be, be_link); + if (be->be_status == BST_DONE || be->be_status == BST_BUSY) + TAILQ_REMOVE(&bc->bc_busyq, be, be_link); + else + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_req->br_offset == be->be_block) + tbe->be_status = BST_PEND; + } be->be_tid = 0; be->be_status = BST_FREE; be->be_req = NULL; TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); - - bc->bc_req_count--; } static void -blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be) +blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { struct blockif_req *br; - int err; +#ifdef __FreeBSD__ + off_t arg[2]; +#endif + ssize_t clen, len, off, boff, voff; + int i, err; br = be->be_req; + if (br->br_iovcnt <= 1) + buf = NULL; err = 0; - switch (be->be_op) { case BOP_READ: - if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, - br->br_offset) < 0) - err = errno; + if (buf == NULL) { + if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + if (pread(bc->bc_fd, buf, len, br->br_offset + + off) < 0) { + err = errno; + break; + } + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy(br->br_iov[i].iov_base + voff, + buf + boff, clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + off += len; + br->br_resid -= len; + } break; case BOP_WRITE: - if (bc->bc_rdonly) + if (bc->bc_rdonly) { err = EROFS; - else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, - br->br_offset) < 0) - err = errno; + break; + } + if (buf == NULL) { + if ((len = 
pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy(buf + boff, + br->br_iov[i].iov_base + voff, clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + if (pwrite(bc->bc_fd, buf, len, br->br_offset + + off) < 0) { + err = errno; + break; + } + off += len; + br->br_resid -= len; + } break; case BOP_FLUSH: +#ifdef __FreeBSD__ + if (bc->bc_ischr) { + if (ioctl(bc->bc_fd, DIOCGFLUSH)) + err = errno; + } else if (fsync(bc->bc_fd)) + err = errno; +#else + /* + * This fsync() should be adequate to flush the cache of a file + * or device. In VFS, the VOP_SYNC operation is converted to + * the appropriate ioctl in both sdev (for real devices) and + * zfs (for zvols). + */ + if (fsync(bc->bc_fd)) + err = errno; +#endif + break; + case BOP_DELETE: + if (!bc->bc_candelete) + err = EOPNOTSUPP; + else if (bc->bc_rdonly) + err = EROFS; +#ifdef __FreeBSD__ + else if (bc->bc_ischr) { + arg[0] = br->br_offset; + arg[1] = br->br_resid; + if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) + err = errno; + else + br->br_resid = 0; + } +#endif + else + err = EOPNOTSUPP; break; default: err = EINVAL; @@ -206,28 +383,34 @@ blockif_thr(void *arg) { struct blockif_ctxt *bc; struct blockif_elem *be; + pthread_t t; + uint8_t *buf; bc = arg; + if (bc->bc_isgeom) + buf = malloc(MAXPHYS); + else + buf = NULL; + t = pthread_self(); + pthread_mutex_lock(&bc->bc_mtx); for (;;) { - pthread_mutex_lock(&bc->bc_mtx); - while (!blockif_dequeue(bc, &be)) { + while (blockif_dequeue(bc, t, &be)) { pthread_mutex_unlock(&bc->bc_mtx); - blockif_proc(bc, be); + blockif_proc(bc, be, buf); pthread_mutex_lock(&bc->bc_mtx); blockif_complete(bc, be); } - pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); - pthread_mutex_unlock(&bc->bc_mtx); - - /* - * Check ctxt status here to see if exit requested - */ + /* Check ctxt status here to see if exit requested */ if (bc->bc_closing) - pthread_exit(NULL); + break; + pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); } + pthread_mutex_unlock(&bc->bc_mtx); - /* Not reached */ + if (buf) + free(buf); + pthread_exit(NULL); return (NULL); } @@ -276,15 +459,31 @@ struct blockif_ctxt * blockif_open(const char *optstr, const char *ident) { char tname[MAXCOMLEN + 1]; - char *nopt, *xopts; +#ifdef __FreeBSD__ + char name[MAXPATHLEN]; + char *nopt, *xopts, *cp; +#else + char *nopt, *xopts, *cp = NULL; +#endif struct blockif_ctxt *bc; struct stat sbuf; - off_t size; +#ifdef __FreeBSD__ + struct diocgattr_arg arg; +#else + enum blockif_wce wce = WCE_NONE; +#endif + off_t size, psectsz, psectoff; int extra, fd, i, sectsz; - int nocache, sync, ro; + int nocache, sync, ro, candelete, geom, ssopt, pssopt; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; +#endif pthread_once(&blockif_once, blockif_init); + fd = -1; + ssopt = 0; nocache = 0; sync = 0; ro = 0; @@ -293,16 +492,25 @@ blockif_open(const char *optstr, const char *ident) * The first element in the optstring is always a pathname. 
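 * (illustrative optstrings exercising the options parsed below:
 *	/dev/zvol/rdsk/tank/vm0,nocache
 *	/var/tmp/disk.img,ro,sectorsize=512/4096 )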
* Optional elements follow */ - nopt = strdup(optstr); - for (xopts = strtok(nopt, ","); - xopts != NULL; - xopts = strtok(NULL, ",")) { - if (!strcmp(xopts, "nocache")) + nopt = xopts = strdup(optstr); + while (xopts != NULL) { + cp = strsep(&xopts, ","); + if (cp == nopt) /* file or device pathname */ + continue; + else if (!strcmp(cp, "nocache")) nocache = 1; - else if (!strcmp(xopts, "sync")) + else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) sync = 1; - else if (!strcmp(xopts, "ro")) + else if (!strcmp(cp, "ro")) ro = 1; + else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) + ; + else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) + pssopt = ssopt; + else { + fprintf(stderr, "Invalid device option \"%s\"\n", cp); + goto err; + } } extra = 0; @@ -319,62 +527,185 @@ blockif_open(const char *optstr, const char *ident) } if (fd < 0) { - perror("Could not open backing file"); - return (NULL); + warn("Could not open backing file: %s", nopt); + goto err; } if (fstat(fd, &sbuf) < 0) { - perror("Could not stat backing file"); - close(fd); - return (NULL); + warn("Could not stat backing file %s", nopt); + goto err; } +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, + CAP_WRITE); + if (ro) + cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); + + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + /* * Deal with raw devices */ size = sbuf.st_size; sectsz = DEV_BSIZE; + psectsz = psectoff = 0; + candelete = geom = 0; #ifdef __FreeBSD__ if (S_ISCHR(sbuf.st_mode)) { if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || ioctl(fd, DIOCGSECTORSIZE, §sz)) { perror("Could not fetch dev blk/sector size"); - close(fd); - return (NULL); + goto err; } assert(size != 0); assert(sectsz != 0); + if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) + ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); + strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); + arg.len = sizeof(arg.value.i); + if (ioctl(fd, DIOCGATTR, &arg) == 0) + candelete = arg.value.i; + if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) + geom = 1; + } else { + psectsz = sbuf.st_blksize; } +#else + psectsz = sbuf.st_blksize; + if (S_ISCHR(sbuf.st_mode)) { + struct dk_minfo_ext dkmext; + int wce_val; + + /* Look for a more accurate physical blocksize */ + if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) { + psectsz = dkmext.dki_pbsize; + } + /* See if a configurable write cache is present and working */ + if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) { + /* + * If WCE is already active, disable it until the + * specific device driver calls for its return. If it + * is not active, toggle it on and off to verify that + * such actions are possible. + */ + if (wce_val != 0) { + wce_val = 0; + /* + * Inability to disable the cache is a threat + * to data durability. + */ + assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0); + wce = WCE_IOCTL; + } else { + int r1, r2; + + wce_val = 1; + r1 = ioctl(fd, DKIOCSETWCE, &wce_val); + wce_val = 0; + r2 = ioctl(fd, DKIOCSETWCE, &wce_val); + + if (r1 == 0 && r2 == 0) { + wce = WCE_IOCTL; + } else { + /* + * If the cache cache toggle was not + * successful, ensure that the cache + * was not left enabled. 
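 *
 * In outcome form: r1 == 0 && r2 == 0 means the cache toggles
 * cleanly and WCE_IOCTL is recorded; r1 != 0 means enabling failed,
 * so the cache never came on and wce safely stays WCE_NONE; r1 == 0
 * with r2 != 0 would strand the cache enabled, which the assert
 * below treats as fatal.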
+ */ + assert(r1 != 0); + } + } + } + } else { + int flags; + + if ((flags = fcntl(fd, F_GETFL)) >= 0) { + flags |= O_DSYNC; + if (fcntl(fd, F_SETFL, flags) != -1) { + wce = WCE_FCNTL; + } + } + } +#endif + +#ifndef WITHOUT_CAPSICUM + if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif + if (ssopt != 0) { + if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || + ssopt > pssopt) { + fprintf(stderr, "Invalid sector size %d/%d\n", + ssopt, pssopt); + goto err; + } + + /* + * Some backend drivers (e.g. cd0, ada0) require that the I/O + * size be a multiple of the device's sector size. + * + * Validate that the emulated sector size complies with this + * requirement. + */ + if (S_ISCHR(sbuf.st_mode)) { + if (ssopt < sectsz || (ssopt % sectsz) != 0) { + fprintf(stderr, "Sector size %d incompatible " + "with underlying device sector size %d\n", + ssopt, sectsz); + goto err; + } + } + + sectsz = ssopt; + psectsz = pssopt; + psectoff = 0; + } + bc = calloc(1, sizeof(struct blockif_ctxt)); if (bc == NULL) { - close(fd); - return (NULL); + perror("calloc"); + goto err; } bc->bc_magic = BLOCKIF_SIG; bc->bc_fd = fd; + bc->bc_ischr = S_ISCHR(sbuf.st_mode); + bc->bc_isgeom = geom; + bc->bc_candelete = candelete; +#ifndef __FreeBSD__ + bc->bc_wce = wce; +#endif bc->bc_rdonly = ro; bc->bc_size = size; bc->bc_sectsz = sectsz; + bc->bc_psectsz = psectsz; + bc->bc_psectoff = psectoff; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); TAILQ_INIT(&bc->bc_freeq); TAILQ_INIT(&bc->bc_pendq); TAILQ_INIT(&bc->bc_busyq); - bc->bc_req_count = 0; for (i = 0; i < BLOCKIF_MAXREQ; i++) { bc->bc_reqs[i].be_status = BST_FREE; TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); } - pthread_create(&bc->bc_btid, NULL, blockif_thr, bc); - - snprintf(tname, sizeof(tname), "blk-%s", ident); - pthread_set_name_np(bc->bc_btid, tname); + for (i = 0; i < BLOCKIF_NUMTHR; i++) { + pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); + snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); + pthread_set_name_np(bc->bc_btid[i], tname); + } return (bc); +err: + if (fd >= 0) + close(fd); + free(nopt); + return (NULL); } static int @@ -386,13 +717,13 @@ blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, err = 0; pthread_mutex_lock(&bc->bc_mtx); - if (bc->bc_req_count < BLOCKIF_MAXREQ) { + if (!TAILQ_EMPTY(&bc->bc_freeq)) { /* * Enqueue and inform the block i/o thread * that there is work available */ - blockif_enqueue(bc, breq, op); - pthread_cond_signal(&bc->bc_cond); + if (blockif_enqueue(bc, breq, op)) + pthread_cond_signal(&bc->bc_cond); } else { /* * Callers are not allowed to enqueue more than @@ -431,6 +762,14 @@ blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) return (blockif_request(bc, breq, BOP_FLUSH)); } +int +blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_DELETE)); +} + int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { @@ -450,11 +789,7 @@ blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) /* * Found it. 
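 * Calling blockif_complete() here, instead of the old open-coded
 * removal, also flips back to BST_PEND any BST_BLOCK request that
 * was queued behind the cancelled element's ending offset.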
*/ - TAILQ_REMOVE(&bc->bc_pendq, be, be_link); - be->be_status = BST_FREE; - be->be_req = NULL; - TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); - bc->bc_req_count--; + blockif_complete(bc, be); pthread_mutex_unlock(&bc->bc_mtx); return (0); @@ -515,18 +850,19 @@ int blockif_close(struct blockif_ctxt *bc) { void *jval; - int err; - - err = 0; + int i; assert(bc->bc_magic == BLOCKIF_SIG); /* * Stop the block i/o thread */ + pthread_mutex_lock(&bc->bc_mtx); bc->bc_closing = 1; - pthread_cond_signal(&bc->bc_cond); - pthread_join(bc->bc_btid, &jval); + pthread_mutex_unlock(&bc->bc_mtx); + pthread_cond_broadcast(&bc->bc_cond); + for (i = 0; i < BLOCKIF_NUMTHR; i++) + pthread_join(bc->bc_btid[i], &jval); /* XXX Cancel queued i/o's ??? */ @@ -608,6 +944,15 @@ blockif_sectsz(struct blockif_ctxt *bc) return (bc->bc_sectsz); } +void +blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + *size = bc->bc_psectsz; + *off = bc->bc_psectoff; +} + int blockif_queuesz(struct blockif_ctxt *bc) { @@ -623,3 +968,54 @@ blockif_is_ro(struct blockif_ctxt *bc) assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_rdonly); } + +int +blockif_candelete(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_candelete); +} + +#ifndef __FreeBSD__ +int +blockif_set_wce(struct blockif_ctxt *bc, int wc_enable) +{ + int res = 0, flags; + int clean_val = (wc_enable != 0) ? 1 : 0; + + (void) pthread_mutex_lock(&bc->bc_mtx); + switch (bc->bc_wce) { + case WCE_IOCTL: + res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val); + break; + case WCE_FCNTL: + if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) { + if (wc_enable == 0) { + flags |= O_DSYNC; + } else { + flags &= ~O_DSYNC; + } + if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) { + res = -1; + } + } else { + res = -1; + } + break; + default: + break; + } + + /* + * After a successful disable of the write cache, ensure that any + * lingering data in the cache is synced out. + */ + if (res == 0 && wc_enable == 0) { + res = fsync(bc->bc_fd); + } + (void) pthread_mutex_unlock(&bc->bc_mtx); + + return (res); +} +#endif /* __FreeBSD__ */ diff --git a/usr/src/cmd/bhyve/block_if.h b/usr/src/cmd/bhyve/block_if.h index 5ef120933c..bff2b42768 100644 --- a/usr/src/cmd/bhyve/block_if.h +++ b/usr/src/cmd/bhyve/block_if.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Peter Grehan * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/block_if.h 268638 2014-07-15 00:25:54Z grehan $ + * $FreeBSD$ */ /* @@ -39,18 +41,21 @@ #include #include -#ifdef __FreeBSD__ -#define BLOCKIF_IOV_MAX 32 /* not practical to be IOV_MAX */ -#else -#define BLOCKIF_IOV_MAX 16 /* not practical to be IOV_MAX */ -#endif +/* + * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in + * a single request. BLOCKIF_RING_MAX is the maxmimum number of + * pending requests that can be queued. 
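 * (BLOCKIF_MAXREQ in block_if.c is sized as BLOCKIF_RING_MAX +
 * BLOCKIF_NUMTHR, presumably so a full ring plus one in-flight
 * request per worker thread never exhausts the element pool.)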
+ */ +#define BLOCKIF_IOV_MAX 128 /* not practical to be IOV_MAX */ +#define BLOCKIF_RING_MAX 128 struct blockif_req { - struct iovec br_iov[BLOCKIF_IOV_MAX]; int br_iovcnt; off_t br_offset; + ssize_t br_resid; void (*br_callback)(struct blockif_req *req, int err); void *br_param; + struct iovec br_iov[BLOCKIF_IOV_MAX]; }; struct blockif_ctxt; @@ -59,11 +64,17 @@ off_t blockif_size(struct blockif_ctxt *bc); void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s); int blockif_sectsz(struct blockif_ctxt *bc); +void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off); int blockif_queuesz(struct blockif_ctxt *bc); int blockif_is_ro(struct blockif_ctxt *bc); +int blockif_candelete(struct blockif_ctxt *bc); +#ifndef __FreeBSD__ +int blockif_set_wce(struct blockif_ctxt *bc, int enable); +#endif int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_close(struct blockif_ctxt *bc); diff --git a/usr/src/cmd/bhyve/bootrom.c b/usr/src/cmd/bhyve/bootrom.c new file mode 100644 index 0000000000..b8c63828c8 --- /dev/null +++ b/usr/src/cmd/bhyve/bootrom.c @@ -0,0 +1,113 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
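 *
 * (A minimal consumer sketch of the blockif API declared above;
 * names and sizes are illustrative and error handling is elided:
 *
 *	static void
 *	done_cb(struct blockif_req *br, int err)
 *	{
 *		... runs on a blk-<ident>-<N> worker thread ...
 *	}
 *
 *	bc = blockif_open("/var/tmp/disk.img,ro", "disk0");
 *	br.br_iov[0].iov_base = buf;
 *	br.br_iov[0].iov_len = 512;
 *	br.br_iovcnt = 1;
 *	br.br_offset = 0;
 *	br.br_resid = 512;
 *	br.br_callback = done_cb;
 *	blockif_read(bc, &br); )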
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include "bhyverun.h" +#include "bootrom.h" + +#define MAX_BOOTROM_SIZE (16 * 1024 * 1024) /* 16 MB */ + +int +bootrom_init(struct vmctx *ctx, const char *romfile) +{ + struct stat sbuf; + vm_paddr_t gpa; + ssize_t rlen; + char *ptr; + int fd, i, rv, prot; + + rv = -1; + fd = open(romfile, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "Error opening bootrom \"%s\": %s\n", + romfile, strerror(errno)); + goto done; + } + + if (fstat(fd, &sbuf) < 0) { + fprintf(stderr, "Could not fstat bootrom file \"%s\": %s\n", + romfile, strerror(errno)); + goto done; + } + + /* + * Limit bootrom size to 16MB so it doesn't encroach into reserved + * MMIO space (e.g. APIC, HPET, MSI). + */ + if (sbuf.st_size > MAX_BOOTROM_SIZE || sbuf.st_size < PAGE_SIZE) { + fprintf(stderr, "Invalid bootrom size %ld\n", sbuf.st_size); + goto done; + } + + if (sbuf.st_size & PAGE_MASK) { + fprintf(stderr, "Bootrom size %ld is not a multiple of the " + "page size\n", sbuf.st_size); + goto done; + } + + ptr = vm_create_devmem(ctx, VM_BOOTROM, "bootrom", sbuf.st_size); + if (ptr == MAP_FAILED) + goto done; + + /* Map the bootrom into the guest address space */ + prot = PROT_READ | PROT_EXEC; + gpa = (1ULL << 32) - sbuf.st_size; + if (vm_mmap_memseg(ctx, gpa, VM_BOOTROM, 0, sbuf.st_size, prot) != 0) + goto done; + + /* Read 'romfile' into the guest address space */ + for (i = 0; i < sbuf.st_size / PAGE_SIZE; i++) { + rlen = read(fd, ptr + i * PAGE_SIZE, PAGE_SIZE); + if (rlen != PAGE_SIZE) { + fprintf(stderr, "Incomplete read of page %d of bootrom " + "file %s: %ld bytes\n", i, romfile, rlen); + goto done; + } + } + rv = 0; +done: + if (fd >= 0) + close(fd); + return (rv); +} diff --git a/usr/src/cmd/bhyve/bootrom.h b/usr/src/cmd/bhyve/bootrom.h new file mode 100644 index 0000000000..7fb12181dd --- /dev/null +++ b/usr/src/cmd/bhyve/bootrom.h @@ -0,0 +1,40 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
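 *
 * (Checking the mapping arithmetic in bootrom_init() above for a
 * hypothetical 1 MB image: gpa = (1ULL << 32) - 0x100000 =
 * 0xfff00000, so the last byte of the ROM lands at 0xffffffff and
 * the x86 reset vector, 0xfffffff0, falls inside the mapped image.)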
+ * + * $FreeBSD$ + */ + +#ifndef _BOOTROM_H_ +#define _BOOTROM_H_ + +#include + +struct vmctx; + +int bootrom_init(struct vmctx *ctx, const char *romfile); + +#endif diff --git a/usr/src/cmd/bhyve/console.c b/usr/src/cmd/bhyve/console.c index a8d07709be..2567f69959 100644 --- a/usr/src/cmd/bhyve/console.c +++ b/usr/src/cmd/bhyve/console.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * @@ -40,15 +42,23 @@ static struct { kbd_event_func_t kbd_event_cb; void *kbd_arg; + int kbd_priority; ptr_event_func_t ptr_event_cb; void *ptr_arg; + int ptr_priority; } console; void -console_init(void) +console_init(int w, int h, void *fbaddr) +{ + console.gc = bhyvegc_init(w, h, fbaddr); +} + +void +console_set_fbaddr(void *fbaddr) { - console.gc = bhyvegc_init(640, 400); + bhyvegc_set_fbaddr(console.gc, fbaddr); } struct bhyvegc_image * @@ -71,31 +81,40 @@ console_fb_register(fb_render_func_t render_cb, void *arg) void console_refresh(void) { - (*console.fb_render_cb)(console.gc, console.fb_arg); + if (console.fb_render_cb) + (*console.fb_render_cb)(console.gc, console.fb_arg); } void -console_kbd_register(kbd_event_func_t event_cb, void *arg) +console_kbd_register(kbd_event_func_t event_cb, void *arg, int pri) { - console.kbd_event_cb = event_cb; - console.kbd_arg = arg; + if (pri > console.kbd_priority) { + console.kbd_event_cb = event_cb; + console.kbd_arg = arg; + console.kbd_priority = pri; + } } void -console_ptr_register(ptr_event_func_t event_cb, void *arg) +console_ptr_register(ptr_event_func_t event_cb, void *arg, int pri) { - console.ptr_event_cb = event_cb; - console.ptr_arg = arg; + if (pri > console.ptr_priority) { + console.ptr_event_cb = event_cb; + console.ptr_arg = arg; + console.ptr_priority = pri; + } } void console_key_event(int down, uint32_t keysym) { - (*console.kbd_event_cb)(down, keysym, console.kbd_arg); + if (console.kbd_event_cb) + (*console.kbd_event_cb)(down, keysym, console.kbd_arg); } void console_ptr_event(uint8_t button, int x, int y) { - (*console.ptr_event_cb)(button, x, y, console.ptr_arg); + if (console.ptr_event_cb) + (*console.ptr_event_cb)(button, x, y, console.ptr_arg); } diff --git a/usr/src/cmd/bhyve/console.h b/usr/src/cmd/bhyve/console.h index bffb7c2456..0d0a854866 100644 --- a/usr/src/cmd/bhyve/console.h +++ b/usr/src/cmd/bhyve/console.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. 
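 *
 * (Registration-priority sketch for the console.c changes above,
 * with illustrative values: a ps2 device registering its handler at
 * priority 1 is accepted; a USB tablet registering later at priority
 * 10 replaces it, since 10 > 1; a further registration at priority 5
 * is then ignored.)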
* @@ -35,16 +37,19 @@ typedef void (*fb_render_func_t)(struct bhyvegc *gc, void *arg); typedef void (*kbd_event_func_t)(int down, uint32_t keysym, void *arg); typedef void (*ptr_event_func_t)(uint8_t mask, int x, int y, void *arg); -void console_init(void); +void console_init(int w, int h, void *fbaddr); + +void console_set_fbaddr(void *fbaddr); + struct bhyvegc_image *console_get_image(void); -void console_fb_register(fb_render_func_t render_cb, void *arg); -void console_refresh(void); +void console_fb_register(fb_render_func_t render_cb, void *arg); +void console_refresh(void); -void console_kbd_register(kbd_event_func_t event_cb, void *arg); -void console_key_event(int down, uint32_t keysym); +void console_kbd_register(kbd_event_func_t event_cb, void *arg, int pri); +void console_key_event(int down, uint32_t keysym); -void console_ptr_register(ptr_event_func_t event_cb, void *arg); -void console_ptr_event(uint8_t button, int x, int y); +void console_ptr_register(ptr_event_func_t event_cb, void *arg, int pri); +void console_ptr_event(uint8_t button, int x, int y); #endif /* _CONSOLE_H_ */ diff --git a/usr/src/cmd/bhyve/consport.c b/usr/src/cmd/bhyve/consport.c index 69b6dfddf1..cda2df2414 100644 --- a/usr/src/cmd/bhyve/consport.c +++ b/usr/src/cmd/bhyve/consport.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,20 +25,29 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z jhb $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z jhb $"); +__FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include #include #include #include #include #include +#include #include "inout.h" #include "pci_lpc.h" @@ -44,6 +55,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z j #define BVM_CONSOLE_PORT 0x220 #define BVM_CONS_SIG ('b' << 8 | 'v') +#ifdef __FreeBSD__ static struct termios tio_orig, tio_new; static void @@ -51,6 +63,7 @@ ttyclose(void) { tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); } +#endif static void ttyopen(void) @@ -68,14 +81,14 @@ ttyopen(void) static bool tty_char_available(void) { - fd_set rfds; - struct timeval tv; - - FD_ZERO(&rfds); - FD_SET(STDIN_FILENO, &rfds); - tv.tv_sec = 0; - tv.tv_usec = 0; - if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { + fd_set rfds; + struct timeval tv; + + FD_ZERO(&rfds); + FD_SET(STDIN_FILENO, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 0; + if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { return (true); } else { return (false); @@ -106,6 +119,10 @@ console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { static int opened; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; +#endif if (bytes == 2 && in) { *eax = BVM_CONS_SIG; @@ -125,6 +142,14 @@ console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, return (-1); if (!opened) { +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, + CAP_WRITE); + if (caph_rights_limit(STDIN_FILENO, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(STDIN_FILENO, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for 
sandbox"); +#endif ttyopen(); opened = 1; } diff --git a/usr/src/cmd/bhyve/dbgport.c b/usr/src/cmd/bhyve/dbgport.c new file mode 100644 index 0000000000..88a616b50d --- /dev/null +++ b/usr/src/cmd/bhyve/dbgport.c @@ -0,0 +1,180 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +#include "inout.h" +#include "dbgport.h" +#include "pci_lpc.h" + +#define BVM_DBG_PORT 0x224 +#define BVM_DBG_SIG ('B' << 8 | 'V') + +static int listen_fd, conn_fd; + +static struct sockaddr_in sin; + +static int +dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int nwritten, nread, printonce; + int on = 1; + char ch; + + if (bytes == 2 && in) { + *eax = BVM_DBG_SIG; + return (0); + } + + if (bytes != 4) + return (-1); + +again: + printonce = 0; + while (conn_fd < 0) { + if (!printonce) { + printf("Waiting for connection from gdb\r\n"); + printonce = 1; + } + conn_fd = accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK); + if (conn_fd >= 0) { + /* Avoid EPIPE after the client drops off. */ + (void)setsockopt(conn_fd, SOL_SOCKET, SO_NOSIGPIPE, + &on, sizeof(on)); + /* Improve latency for one byte at a time tranfers. 
*/ + (void)setsockopt(conn_fd, IPPROTO_TCP, TCP_NODELAY, + &on, sizeof(on)); + } else if (errno != EINTR) { + perror("accept"); + } + } + + if (in) { + nread = read(conn_fd, &ch, 1); + if (nread == -1 && errno == EAGAIN) + *eax = -1; + else if (nread == 1) + *eax = ch; + else { + close(conn_fd); + conn_fd = -1; + goto again; + } + } else { + ch = *eax; + nwritten = write(conn_fd, &ch, 1); + if (nwritten != 1) { + close(conn_fd); + conn_fd = -1; + goto again; + } + } + return (0); +} + +static struct inout_port dbgport = { + "bvmdbg", + BVM_DBG_PORT, + 1, + IOPORT_F_INOUT, + dbg_handler +}; + +SYSRES_IO(BVM_DBG_PORT, 4); + +void +init_dbgport(int sport) +{ + int reuse; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + conn_fd = -1; + + if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("cannot create socket"); + exit(4); + } + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(sport); + + reuse = 1; + if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, + sizeof(reuse)) < 0) { + perror("cannot set socket options"); + exit(4); + } + + if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("cannot bind socket"); + exit(4); + } + + if (listen(listen_fd, 1) < 0) { + perror("cannot listen socket"); + exit(4); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(listen_fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + register_inout(&dbgport); +} diff --git a/usr/src/cmd/bhyve/dbgport.h b/usr/src/cmd/bhyve/dbgport.h index b95df0bd31..407ff3ffbf 100644 --- a/usr/src/cmd/bhyve/dbgport.h +++ b/usr/src/cmd/bhyve/dbgport.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/dbgport.h 256156 2013-10-08 16:36:17Z neel $ + * $FreeBSD$ */ #ifndef _DBGPORT_H_ diff --git a/usr/src/cmd/bhyve/fwctl.c b/usr/src/cmd/bhyve/fwctl.c new file mode 100644 index 0000000000..0640bc28ba --- /dev/null +++ b/usr/src/cmd/bhyve/fwctl.c @@ -0,0 +1,552 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Peter Grehan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Guest firmware interface. Uses i/o ports x510/x511 as Qemu does, + * but with a request/response messaging protocol. + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "bhyverun.h" +#include "inout.h" +#include "fwctl.h" + +/* + * Messaging protocol base operations + */ +#define OP_NULL 1 +#define OP_ECHO 2 +#define OP_GET 3 +#define OP_GET_LEN 4 +#define OP_SET 5 +#define OP_MAX OP_SET + +/* I/O ports */ +#define FWCTL_OUT 0x510 +#define FWCTL_IN 0x511 + +/* + * Back-end state-machine + */ +enum state { + DORMANT, + IDENT_WAIT, + IDENT_SEND, + REQ, + RESP +} be_state = DORMANT; + +static uint8_t sig[] = { 'B', 'H', 'Y', 'V' }; +static u_int ident_idx; + +struct op_info { + int op; + int (*op_start)(uint32_t len); + void (*op_data)(uint32_t data, uint32_t len); + int (*op_result)(struct iovec **data); + void (*op_done)(struct iovec *data); +}; +static struct op_info *ops[OP_MAX+1]; + +/* Return 0-padded uint32_t */ +static uint32_t +fwctl_send_rest(uint32_t *data, size_t len) +{ + union { + uint8_t c[4]; + uint32_t w; + } u; + uint8_t *cdata; + int i; + + cdata = (uint8_t *) data; + u.w = 0; + + for (i = 0, u.w = 0; i < len; i++) + u.c[i] = *cdata++; + + return (u.w); +} + +/* + * error op dummy proto - drop all data sent and return an error +*/ +static int errop_code; + +static void +errop_set(int err) +{ + + errop_code = err; +} + +static int +errop_start(uint32_t len) +{ + errop_code = ENOENT; + + /* accept any length */ + return (errop_code); +} + +static void +errop_data(uint32_t data, uint32_t len) +{ + + /* ignore */ +} + +static int +errop_result(struct iovec **data) +{ + + /* no data to send back; always successful */ + *data = NULL; + return (errop_code); +} + +static void +errop_done(struct iovec *data) +{ + + /* assert data is NULL */ +} + +static struct op_info errop_info = { + .op_start = errop_start, + .op_data = errop_data, + .op_result = errop_result, + .op_done = errop_done +}; + +/* OID search */ +SET_DECLARE(ctl_set, struct ctl); + +CTL_NODE("hw.ncpu", &guest_ncpus, sizeof(guest_ncpus)); + +static struct ctl * +ctl_locate(const char *str, int maxlen) +{ + struct ctl *cp, **cpp; + + SET_FOREACH(cpp, ctl_set) { + cp = *cpp; + if (!strncmp(str, cp->c_oid, maxlen)) + return (cp); + } + + return (NULL); +} + +/* uefi-sysctl get-len */ +#define FGET_STRSZ 80 +static struct iovec fget_biov[2]; +static char fget_str[FGET_STRSZ]; +static struct { + size_t f_sz; + uint32_t f_data[1024]; +} fget_buf; +static int fget_cnt; +static size_t fget_size; + +static int +fget_start(uint32_t len) +{ + + if (len > FGET_STRSZ) + return(E2BIG); + + fget_cnt = 0; + + return (0); +} + +static void +fget_data(uint32_t data, uint32_t len) +{ + + *((uint32_t *) &fget_str[fget_cnt]) = data; + fget_cnt += sizeof(uint32_t); +} + +static int +fget_result(struct iovec **data, int val) +{ + struct ctl *cp; + int err; + + err = 0; + + /* Locate the OID */ + cp = 
ctl_locate(fget_str, fget_cnt); + if (cp == NULL) { + *data = NULL; + err = ENOENT; + } else { + if (val) { + /* For now, copy the len/data into a buffer */ + memset(&fget_buf, 0, sizeof(fget_buf)); + fget_buf.f_sz = cp->c_len; + memcpy(fget_buf.f_data, cp->c_data, cp->c_len); + fget_biov[0].iov_base = (char *)&fget_buf; + fget_biov[0].iov_len = sizeof(fget_buf.f_sz) + + cp->c_len; + } else { + fget_size = cp->c_len; + fget_biov[0].iov_base = (char *)&fget_size; + fget_biov[0].iov_len = sizeof(fget_size); + } + + fget_biov[1].iov_base = NULL; + fget_biov[1].iov_len = 0; + *data = fget_biov; + } + + return (err); +} + +static void +fget_done(struct iovec *data) +{ + + /* nothing needs to be freed */ +} + +static int +fget_len_result(struct iovec **data) +{ + return (fget_result(data, 0)); +} + +static int +fget_val_result(struct iovec **data) +{ + return (fget_result(data, 1)); +} + +static struct op_info fgetlen_info = { + .op_start = fget_start, + .op_data = fget_data, + .op_result = fget_len_result, + .op_done = fget_done +}; + +static struct op_info fgetval_info = { + .op_start = fget_start, + .op_data = fget_data, + .op_result = fget_val_result, + .op_done = fget_done +}; + +static struct req_info { + int req_error; + u_int req_count; + uint32_t req_size; + uint32_t req_type; + uint32_t req_txid; + struct op_info *req_op; + int resp_error; + int resp_count; + size_t resp_size; + size_t resp_off; + struct iovec *resp_biov; +} rinfo; + +static void +fwctl_response_done(void) +{ + + (*rinfo.req_op->op_done)(rinfo.resp_biov); + + /* reinit the req data struct */ + memset(&rinfo, 0, sizeof(rinfo)); +} + +static void +fwctl_request_done(void) +{ + + rinfo.resp_error = (*rinfo.req_op->op_result)(&rinfo.resp_biov); + + /* XXX only a single vector supported at the moment */ + rinfo.resp_off = 0; + if (rinfo.resp_biov == NULL) { + rinfo.resp_size = 0; + } else { + rinfo.resp_size = rinfo.resp_biov[0].iov_len; + } +} + +static int +fwctl_request_start(void) +{ + int err; + + /* Data size doesn't include header */ + rinfo.req_size -= 12; + + rinfo.req_op = &errop_info; + if (rinfo.req_type <= OP_MAX && ops[rinfo.req_type] != NULL) + rinfo.req_op = ops[rinfo.req_type]; + + err = (*rinfo.req_op->op_start)(rinfo.req_size); + + if (err) { + errop_set(err); + rinfo.req_op = &errop_info; + } + + /* Catch case of zero-length message here */ + if (rinfo.req_size == 0) { + fwctl_request_done(); + return (1); + } + + return (0); +} + +static int +fwctl_request_data(uint32_t value) +{ + + /* Make sure remaining size is >= 0 */ + if (rinfo.req_size <= sizeof(uint32_t)) + rinfo.req_size = 0; + else + rinfo.req_size -= sizeof(uint32_t); + + (*rinfo.req_op->op_data)(value, rinfo.req_size); + + if (rinfo.req_size < sizeof(uint32_t)) { + fwctl_request_done(); + return (1); + } + + return (0); +} + +static int +fwctl_request(uint32_t value) +{ + + int ret; + + ret = 0; + + switch (rinfo.req_count) { + case 0: + /* Verify size */ + if (value < 12) { + printf("msg size error"); + exit(4); + } + rinfo.req_size = value; + rinfo.req_count = 1; + break; + case 1: + rinfo.req_type = value; + rinfo.req_count++; + break; + case 2: + rinfo.req_txid = value; + rinfo.req_count++; + ret = fwctl_request_start(); + break; + default: + ret = fwctl_request_data(value); + break; + } + + return (ret); +} + +static int +fwctl_response(uint32_t *retval) +{ + uint32_t *dp; + ssize_t remlen; + + switch(rinfo.resp_count) { + case 0: + /* 4 x u32 header len + data */ + *retval = 4*sizeof(uint32_t) + + roundup(rinfo.resp_size, 
sizeof(uint32_t)); + rinfo.resp_count++; + break; + case 1: + *retval = rinfo.req_type; + rinfo.resp_count++; + break; + case 2: + *retval = rinfo.req_txid; + rinfo.resp_count++; + break; + case 3: + *retval = rinfo.resp_error; + rinfo.resp_count++; + break; + default: + remlen = rinfo.resp_size - rinfo.resp_off; + dp = (uint32_t *) + ((uint8_t *)rinfo.resp_biov->iov_base + rinfo.resp_off); + if (remlen >= sizeof(uint32_t)) { + *retval = *dp; + } else if (remlen > 0) { + *retval = fwctl_send_rest(dp, remlen); + } + rinfo.resp_off += sizeof(uint32_t); + break; + } + + if (rinfo.resp_count > 3 && + rinfo.resp_off >= rinfo.resp_size) { + fwctl_response_done(); + return (1); + } + + return (0); +} + + +/* + * i/o port handling. + */ +static uint8_t +fwctl_inb(void) +{ + uint8_t retval; + + retval = 0xff; + + switch (be_state) { + case IDENT_SEND: + retval = sig[ident_idx++]; + if (ident_idx >= sizeof(sig)) + be_state = REQ; + break; + default: + break; + } + + return (retval); +} + +static void +fwctl_outw(uint16_t val) +{ + switch (be_state) { + case IDENT_WAIT: + if (val == 0) { + be_state = IDENT_SEND; + ident_idx = 0; + } + break; + default: + /* ignore */ + break; + } +} + +static uint32_t +fwctl_inl(void) +{ + uint32_t retval; + + switch (be_state) { + case RESP: + if (fwctl_response(&retval)) + be_state = REQ; + break; + default: + retval = 0xffffffff; + break; + } + + return (retval); +} + +static void +fwctl_outl(uint32_t val) +{ + + switch (be_state) { + case REQ: + if (fwctl_request(val)) + be_state = RESP; + default: + break; + } + +} + +static int +fwctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + if (in) { + if (bytes == 1) + *eax = fwctl_inb(); + else if (bytes == 4) + *eax = fwctl_inl(); + else + *eax = 0xffff; + } else { + if (bytes == 2) + fwctl_outw(*eax); + else if (bytes == 4) + fwctl_outl(*eax); + } + + return (0); +} +INOUT_PORT(fwctl_wreg, FWCTL_OUT, IOPORT_F_INOUT, fwctl_handler); +INOUT_PORT(fwctl_rreg, FWCTL_IN, IOPORT_F_IN, fwctl_handler); + +void +fwctl_init(void) +{ + + ops[OP_GET_LEN] = &fgetlen_info; + ops[OP_GET] = &fgetval_info; + + be_state = IDENT_WAIT; +} diff --git a/usr/src/cmd/bhyve/fwctl.h b/usr/src/cmd/bhyve/fwctl.h new file mode 100644 index 0000000000..6dad244811 --- /dev/null +++ b/usr/src/cmd/bhyve/fwctl.h @@ -0,0 +1,56 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Peter Grehan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _FWCTL_H_ +#define _FWCTL_H_ + +#include + +/* + * Linker set api for export of information to guest firmware via + * a sysctl-like OID interface + */ +struct ctl { + const char *c_oid; + const void *c_data; + const int c_len; +}; + +#define CTL_NODE(oid, data, len) \ + static struct ctl __CONCAT(__ctl, __LINE__) = { \ + oid, \ + (data), \ + (len), \ + }; \ + DATA_SET(ctl_set, __CONCAT(__ctl, __LINE__)) + +void fwctl_init(void); + +#endif /* _FWCTL_H_ */ diff --git a/usr/src/cmd/bhyve/gdb.c b/usr/src/cmd/bhyve/gdb.c new file mode 100644 index 0000000000..20c2de1dec --- /dev/null +++ b/usr/src/cmd/bhyve/gdb.c @@ -0,0 +1,1523 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017-2018 John H. Baldwin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "mem.h" +#include "mevent.h" + +/* + * GDB_SIGNAL_* numbers are part of the GDB remote protocol. Most stops + * use SIGTRAP. + */ +#define GDB_SIGNAL_TRAP 5 + +static void gdb_resume_vcpus(void); +static void check_command(int fd); + +static struct mevent *read_event, *write_event; + +static cpuset_t vcpus_active, vcpus_suspended, vcpus_waiting; +static pthread_mutex_t gdb_lock; +static pthread_cond_t idle_vcpus; +static bool stop_pending, first_stop; +static int stepping_vcpu, stopped_vcpu; + +/* + * An I/O buffer contains 'capacity' bytes of room at 'data'. 
For a + * read buffer, 'start' is unused and 'len' contains the number of + * valid bytes in the buffer. For a write buffer, 'start' is set to + * the index of the next byte in 'data' to send, and 'len' contains + * the remaining number of valid bytes to send. + */ +struct io_buffer { + uint8_t *data; + size_t capacity; + size_t start; + size_t len; +}; + +static struct io_buffer cur_comm, cur_resp; +static uint8_t cur_csum; +static int cur_vcpu; +static struct vmctx *ctx; +static int cur_fd = -1; + +const int gdb_regset[] = { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_ES, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS +}; + +const int gdb_regsize[] = { + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 4, + 4, + 4, + 4, + 4, + 4, + 4 +}; + +#ifdef GDB_LOG +#include +#include + +static void __printflike(1, 2) +debug(const char *fmt, ...) +{ + static FILE *logfile; + va_list ap; + + if (logfile == NULL) { + logfile = fopen("/tmp/bhyve_gdb.log", "w"); + if (logfile == NULL) + return; +#ifndef WITHOUT_CAPSICUM + if (caph_limit_stream(fileno(logfile), CAPH_WRITE) == -1) { + fclose(logfile); + logfile = NULL; + return; + } +#endif + setlinebuf(logfile); + } + va_start(ap, fmt); + vfprintf(logfile, fmt, ap); + va_end(ap); +} +#else +#define debug(...) +#endif + +static int +guest_paging_info(int vcpu, struct vm_guest_paging *paging) +{ + uint64_t regs[4]; + const int regset[4] = { + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_EFER + }; + + if (vm_get_register_set(ctx, vcpu, nitems(regset), regset, regs) == -1) + return (-1); + + /* + * For the debugger, always pretend to be the kernel (CPL 0), + * and if long-mode is enabled, always parse addresses as if + * in 64-bit mode. + */ + paging->cr3 = regs[1]; + paging->cpl = 0; + if (regs[3] & EFER_LMA) + paging->cpu_mode = CPU_MODE_64BIT; + else if (regs[0] & CR0_PE) + paging->cpu_mode = CPU_MODE_PROTECTED; + else + paging->cpu_mode = CPU_MODE_REAL; + if (!(regs[0] & CR0_PG)) + paging->paging_mode = PAGING_MODE_FLAT; + else if (!(regs[2] & CR4_PAE)) + paging->paging_mode = PAGING_MODE_32; + else if (regs[3] & EFER_LME) + paging->paging_mode = PAGING_MODE_64; + else + paging->paging_mode = PAGING_MODE_PAE; + return (0); +} + +/* + * Map a guest virtual address to a physical address (for a given vcpu). + * If a guest virtual address is valid, return 1. If the address is + * not valid, return 0. If an error occurs obtaining the mapping, + * return -1. + */ +static int +guest_vaddr2paddr(int vcpu, uint64_t vaddr, uint64_t *paddr) +{ + struct vm_guest_paging paging; + int fault; + + if (guest_paging_info(vcpu, &paging) == -1) + return (-1); + + /* + * Always use PROT_READ. We really care if the VA is + * accessible, not if the current vCPU can write. + */ + if (vm_gla2gpa_nofault(ctx, vcpu, &paging, vaddr, PROT_READ, paddr, + &fault) == -1) + return (-1); + if (fault) + return (0); + return (1); +} + +static void +io_buffer_reset(struct io_buffer *io) +{ + + io->start = 0; + io->len = 0; +} + +/* Available room for adding data. 
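+ * (That is, capacity - (start + len): e.g. with capacity 8, start 2
+ * and len 3, three bytes of tail room remain.)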
*/ +static size_t +io_buffer_avail(struct io_buffer *io) +{ + + return (io->capacity - (io->start + io->len)); +} + +static uint8_t * +io_buffer_head(struct io_buffer *io) +{ + + return (io->data + io->start); +} + +static uint8_t * +io_buffer_tail(struct io_buffer *io) +{ + + return (io->data + io->start + io->len); +} + +static void +io_buffer_advance(struct io_buffer *io, size_t amount) +{ + + assert(amount <= io->len); + io->start += amount; + io->len -= amount; +} + +static void +io_buffer_consume(struct io_buffer *io, size_t amount) +{ + + io_buffer_advance(io, amount); + if (io->len == 0) { + io->start = 0; + return; + } + + /* + * XXX: Consider making this move optional and compacting on a + * future read() before realloc(). + */ + memmove(io->data, io_buffer_head(io), io->len); + io->start = 0; +} + +static void +io_buffer_grow(struct io_buffer *io, size_t newsize) +{ + uint8_t *new_data; + size_t avail, new_cap; + + avail = io_buffer_avail(io); + if (newsize <= avail) + return; + + new_cap = io->capacity + (newsize - avail); + new_data = realloc(io->data, new_cap); + if (new_data == NULL) + err(1, "Failed to grow GDB I/O buffer"); + io->data = new_data; + io->capacity = new_cap; +} + +static bool +response_pending(void) +{ + + if (cur_resp.start == 0 && cur_resp.len == 0) + return (false); + if (cur_resp.start + cur_resp.len == 1 && cur_resp.data[0] == '+') + return (false); + return (true); +} + +static void +close_connection(void) +{ + + /* + * XXX: This triggers a warning because mevent does the close + * before the EV_DELETE. + */ + pthread_mutex_lock(&gdb_lock); + mevent_delete(write_event); + mevent_delete_close(read_event); + write_event = NULL; + read_event = NULL; + io_buffer_reset(&cur_comm); + io_buffer_reset(&cur_resp); + cur_fd = -1; + + /* Resume any stopped vCPUs. */ + gdb_resume_vcpus(); + pthread_mutex_unlock(&gdb_lock); +} + +static uint8_t +hex_digit(uint8_t nibble) +{ + + if (nibble <= 9) + return (nibble + '0'); + else + return (nibble + 'a' - 10); +} + +static uint8_t +parse_digit(uint8_t v) +{ + + if (v >= '0' && v <= '9') + return (v - '0'); + if (v >= 'a' && v <= 'f') + return (v - 'a' + 10); + if (v >= 'A' && v <= 'F') + return (v - 'A' + 10); + return (0xF); +} + +/* Parses big-endian hexadecimal. */ +static uintmax_t +parse_integer(const uint8_t *p, size_t len) +{ + uintmax_t v; + + v = 0; + while (len > 0) { + v <<= 4; + v |= parse_digit(*p); + p++; + len--; + } + return (v); +} + +static uint8_t +parse_byte(const uint8_t *p) +{ + + return (parse_digit(p[0]) << 4 | parse_digit(p[1])); +} + +static void +send_pending_data(int fd) +{ + ssize_t nwritten; + + if (cur_resp.len == 0) { + mevent_disable(write_event); + return; + } + nwritten = write(fd, io_buffer_head(&cur_resp), cur_resp.len); + if (nwritten == -1) { + warn("Write to GDB socket failed"); + close_connection(); + } else { + io_buffer_advance(&cur_resp, nwritten); + if (cur_resp.len == 0) + mevent_disable(write_event); + else + mevent_enable(write_event); + } +} + +/* Append a single character to the output buffer. */ +static void +send_char(uint8_t data) +{ + io_buffer_grow(&cur_resp, 1); + *io_buffer_tail(&cur_resp) = data; + cur_resp.len++; +} + +/* Append an array of bytes to the output buffer. 
*/ +static void +send_data(const uint8_t *data, size_t len) +{ + + io_buffer_grow(&cur_resp, len); + memcpy(io_buffer_tail(&cur_resp), data, len); + cur_resp.len += len; +} + +static void +format_byte(uint8_t v, uint8_t *buf) +{ + + buf[0] = hex_digit(v >> 4); + buf[1] = hex_digit(v & 0xf); +} + +/* + * Append a single byte (formatted as two hex characters) to the + * output buffer. + */ +static void +send_byte(uint8_t v) +{ + uint8_t buf[2]; + + format_byte(v, buf); + send_data(buf, sizeof(buf)); +} + +static void +start_packet(void) +{ + + send_char('$'); + cur_csum = 0; +} + +static void +finish_packet(void) +{ + + send_char('#'); + send_byte(cur_csum); + debug("-> %.*s\n", (int)cur_resp.len, io_buffer_head(&cur_resp)); +} + +/* + * Append a single character (for the packet payload) and update the + * checksum. + */ +static void +append_char(uint8_t v) +{ + + send_char(v); + cur_csum += v; +} + +/* + * Append an array of bytes (for the packet payload) and update the + * checksum. + */ +static void +append_packet_data(const uint8_t *data, size_t len) +{ + + send_data(data, len); + while (len > 0) { + cur_csum += *data; + data++; + len--; + } +} + +static void +append_string(const char *str) +{ + +#ifdef __FreeBSD__ + append_packet_data(str, strlen(str)); +#else + append_packet_data((const uint8_t *)str, strlen(str)); +#endif +} + +static void +append_byte(uint8_t v) +{ + uint8_t buf[2]; + + format_byte(v, buf); + append_packet_data(buf, sizeof(buf)); +} + +static void +append_unsigned_native(uintmax_t value, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + append_byte(value); + value >>= 8; + } +} + +static void +append_unsigned_be(uintmax_t value, size_t len) +{ + char buf[len * 2]; + size_t i; + + for (i = 0; i < len; i++) { +#ifdef __FreeBSD__ + format_byte(value, buf + (len - i - 1) * 2); +#else + format_byte(value, (uint8_t *)(buf + (len - i - 1) * 2)); +#endif + value >>= 8; + } +#ifdef __FreeBSD__ + append_packet_data(buf, sizeof(buf)); +#else + append_packet_data((const uint8_t *)buf, sizeof(buf)); +#endif +} + +static void +append_integer(unsigned int value) +{ + + if (value == 0) + append_char('0'); + else + append_unsigned_be(value, fls(value) + 7 / 8); +} + +static void +append_asciihex(const char *str) +{ + + while (*str != '\0') { + append_byte(*str); + str++; + } +} + +static void +send_empty_response(void) +{ + + start_packet(); + finish_packet(); +} + +static void +send_error(int error) +{ + + start_packet(); + append_char('E'); + append_byte(error); + finish_packet(); +} + +static void +send_ok(void) +{ + + start_packet(); + append_string("OK"); + finish_packet(); +} + +static int +parse_threadid(const uint8_t *data, size_t len) +{ + + if (len == 1 && *data == '0') + return (0); + if (len == 2 && memcmp(data, "-1", 2) == 0) + return (-1); + if (len == 0) + return (-2); + return (parse_integer(data, len)); +} + +static void +report_stop(void) +{ + + start_packet(); + if (stopped_vcpu == -1) + append_char('S'); + else + append_char('T'); + append_byte(GDB_SIGNAL_TRAP); + if (stopped_vcpu != -1) { + append_string("thread:"); + append_integer(stopped_vcpu + 1); + append_char(';'); + } + stopped_vcpu = -1; + finish_packet(); +} + +static void +gdb_finish_suspend_vcpus(void) +{ + + if (first_stop) { + first_stop = false; + stopped_vcpu = -1; + } else if (response_pending()) + stop_pending = true; + else { + report_stop(); + send_pending_data(cur_fd); + } +} + +static void +_gdb_cpu_suspend(int vcpu, bool report_stop) +{ + + debug("$vCPU %d suspending\n", vcpu); + 
CPU_SET(vcpu, &vcpus_waiting); + if (report_stop && CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) + gdb_finish_suspend_vcpus(); + while (CPU_ISSET(vcpu, &vcpus_suspended) && vcpu != stepping_vcpu) + pthread_cond_wait(&idle_vcpus, &gdb_lock); + CPU_CLR(vcpu, &vcpus_waiting); + debug("$vCPU %d resuming\n", vcpu); +} + +void +gdb_cpu_add(int vcpu) +{ + + debug("$vCPU %d starting\n", vcpu); + pthread_mutex_lock(&gdb_lock); + CPU_SET(vcpu, &vcpus_active); + + /* + * If a vcpu is added while vcpus are stopped, suspend the new + * vcpu so that it will pop back out with a debug exit before + * executing the first instruction. + */ + if (!CPU_EMPTY(&vcpus_suspended)) { + CPU_SET(vcpu, &vcpus_suspended); + _gdb_cpu_suspend(vcpu, false); + } + pthread_mutex_unlock(&gdb_lock); +} + +void +gdb_cpu_suspend(int vcpu) +{ + + pthread_mutex_lock(&gdb_lock); + _gdb_cpu_suspend(vcpu, true); + pthread_mutex_unlock(&gdb_lock); +} + +void +gdb_cpu_mtrap(int vcpu) +{ + + debug("$vCPU %d MTRAP\n", vcpu); + pthread_mutex_lock(&gdb_lock); + if (vcpu == stepping_vcpu) { + stepping_vcpu = -1; + vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 0); + vm_suspend_cpu(ctx, vcpu); + assert(stopped_vcpu == -1); + stopped_vcpu = vcpu; + _gdb_cpu_suspend(vcpu, true); + } + pthread_mutex_unlock(&gdb_lock); +} + +static void +gdb_suspend_vcpus(void) +{ + + assert(pthread_mutex_isowned_np(&gdb_lock)); + debug("suspending all CPUs\n"); + vcpus_suspended = vcpus_active; + vm_suspend_cpu(ctx, -1); + if (CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) + gdb_finish_suspend_vcpus(); +} + +static bool +gdb_step_vcpu(int vcpu) +{ + int error, val; + + debug("$vCPU %d step\n", vcpu); + error = vm_get_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, &val); + if (error < 0) + return (false); + error = vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 1); + vm_resume_cpu(ctx, vcpu); + stepping_vcpu = vcpu; + pthread_cond_broadcast(&idle_vcpus); + return (true); +} + +static void +gdb_resume_vcpus(void) +{ + + assert(pthread_mutex_isowned_np(&gdb_lock)); + vm_resume_cpu(ctx, -1); + debug("resuming all CPUs\n"); + CPU_ZERO(&vcpus_suspended); + pthread_cond_broadcast(&idle_vcpus); +} + +static void +gdb_read_regs(void) +{ + uint64_t regvals[nitems(gdb_regset)]; + int i; + + if (vm_get_register_set(ctx, cur_vcpu, nitems(gdb_regset), + gdb_regset, regvals) == -1) { + send_error(errno); + return; + } + start_packet(); + for (i = 0; i < nitems(regvals); i++) + append_unsigned_native(regvals[i], gdb_regsize[i]); + finish_packet(); +} + +static void +gdb_read_mem(const uint8_t *data, size_t len) +{ + uint64_t gpa, gva, val; + uint8_t *cp; + size_t resid, todo, bytes; + bool started; + int error; + + /* Skip 'm' */ + data += 1; + len -= 1; + + /* Parse and consume address. */ + cp = memchr(data, ',', len); + if (cp == NULL || cp == data) { + send_error(EINVAL); + return; + } + gva = parse_integer(data, cp - data); + len -= (cp - data) + 1; + data += (cp - data) + 1; + + /* Parse length. */ + resid = parse_integer(data, len); + + started = false; + while (resid > 0) { + error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); + if (error == -1) { + if (started) + finish_packet(); + else + send_error(errno); + return; + } + if (error == 0) { + if (started) + finish_packet(); + else + send_error(EFAULT); + return; + } + + /* Read bytes from current page. */ + todo = getpagesize() - gpa % getpagesize(); + if (todo > resid) + todo = resid; + + cp = paddr_guest2host(ctx, gpa, todo); + if (cp != NULL) { + /* + * If this page is guest RAM, read it a byte + * at a time. 
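+ * (Each byte is appended as two hex digits via append_byte(), so
+ * e.g. a guest byte 0x5a reaches the debugger as "5a".)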
+ */ + if (!started) { + start_packet(); + started = true; + } + while (todo > 0) { + append_byte(*cp); + cp++; + gpa++; + gva++; + resid--; + todo--; + } + } else { + /* + * If this page isn't guest RAM, try to handle + * it via MMIO. For MMIO requests, use + * aligned reads of words when possible. + */ + while (todo > 0) { + if (gpa & 1 || todo == 1) + bytes = 1; + else if (gpa & 2 || todo == 2) + bytes = 2; + else + bytes = 4; + error = read_mem(ctx, cur_vcpu, gpa, &val, + bytes); + if (error == 0) { + if (!started) { + start_packet(); + started = true; + } + gpa += bytes; + gva += bytes; + resid -= bytes; + todo -= bytes; + while (bytes > 0) { + append_byte(val); + val >>= 8; + bytes--; + } + } else { + if (started) + finish_packet(); + else + send_error(EFAULT); + return; + } + } + } + assert(resid == 0 || gpa % getpagesize() == 0); + } + if (!started) + start_packet(); + finish_packet(); +} + +static void +gdb_write_mem(const uint8_t *data, size_t len) +{ + uint64_t gpa, gva, val; + uint8_t *cp; + size_t resid, todo, bytes; + int error; + + /* Skip 'M' */ + data += 1; + len -= 1; + + /* Parse and consume address. */ + cp = memchr(data, ',', len); + if (cp == NULL || cp == data) { + send_error(EINVAL); + return; + } + gva = parse_integer(data, cp - data); + len -= (cp - data) + 1; + data += (cp - data) + 1; + + /* Parse and consume length. */ + cp = memchr(data, ':', len); + if (cp == NULL || cp == data) { + send_error(EINVAL); + return; + } + resid = parse_integer(data, cp - data); + len -= (cp - data) + 1; + data += (cp - data) + 1; + + /* Verify the available bytes match the length. */ + if (len != resid * 2) { + send_error(EINVAL); + return; + } + + while (resid > 0) { + error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); + if (error == -1) { + send_error(errno); + return; + } + if (error == 0) { + send_error(EFAULT); + return; + } + + /* Write bytes to current page. */ + todo = getpagesize() - gpa % getpagesize(); + if (todo > resid) + todo = resid; + + cp = paddr_guest2host(ctx, gpa, todo); + if (cp != NULL) { + /* + * If this page is guest RAM, write it a byte + * at a time. + */ + while (todo > 0) { + assert(len >= 2); + *cp = parse_byte(data); + data += 2; + len -= 2; + cp++; + gpa++; + gva++; + resid--; + todo--; + } + } else { + /* + * If this page isn't guest RAM, try to handle + * it via MMIO. For MMIO requests, use + * aligned writes of words when possible. 
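+ * (For example, an 8-byte write starting at an address that is
+ * 2 modulo 4 goes out as a 2-byte, then a 4-byte, then a 2-byte
+ * access.)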
+ */ + while (todo > 0) { + if (gpa & 1 || todo == 1) { + bytes = 1; + val = parse_byte(data); + } else if (gpa & 2 || todo == 2) { + bytes = 2; + val = parse_byte(data) | + (parse_byte(data + 2) << 8); + } else { + bytes = 4; + val = parse_byte(data) | + (parse_byte(data + 2) << 8) | + (parse_byte(data + 4) << 16) | + (parse_byte(data + 6) << 24); + } + error = write_mem(ctx, cur_vcpu, gpa, val, + bytes); + if (error == 0) { + gpa += bytes; + gva += bytes; + resid -= bytes; + todo -= bytes; + data += 2 * bytes; + len -= 2 * bytes; + } else { + send_error(EFAULT); + return; + } + } + } + assert(resid == 0 || gpa % getpagesize() == 0); + } + assert(len == 0); + send_ok(); +} + +static bool +command_equals(const uint8_t *data, size_t len, const char *cmd) +{ + + if (strlen(cmd) > len) + return (false); + return (memcmp(data, cmd, strlen(cmd)) == 0); +} + +static void +check_features(const uint8_t *data, size_t len) +{ + char *feature, *next_feature, *str, *value; + bool supported; + + str = malloc(len + 1); + memcpy(str, data, len); + str[len] = '\0'; + next_feature = str; + + while ((feature = strsep(&next_feature, ";")) != NULL) { + /* + * Null features shouldn't exist, but skip if they + * do. + */ + if (strcmp(feature, "") == 0) + continue; + + /* + * Look for the value or supported / not supported + * flag. + */ + value = strchr(feature, '='); + if (value != NULL) { + *value = '\0'; + value++; + supported = true; + } else { + value = feature + strlen(feature) - 1; + switch (*value) { + case '+': + supported = true; + break; + case '-': + supported = false; + break; + default: + /* + * This is really a protocol error, + * but we just ignore malformed + * features for ease of + * implementation. + */ + continue; + } + value = NULL; + } + + /* No currently supported features. */ +#ifndef __FreeBSD__ + /* + * The compiler dislikes 'supported' being set but never used. + * Make it happy here. + */ + if (supported) { + debug("feature '%s' supported\n", feature); + } +#endif /* __FreeBSD__ */ + } + free(str); + + start_packet(); + + /* This is an arbitrary limit. 
*/ + append_string("PacketSize=4096"); + finish_packet(); +} + +static void +gdb_query(const uint8_t *data, size_t len) +{ + + /* + * TODO: + * - qSearch + */ + if (command_equals(data, len, "qAttached")) { + start_packet(); + append_char('1'); + finish_packet(); + } else if (command_equals(data, len, "qC")) { + start_packet(); + append_string("QC"); + append_integer(cur_vcpu + 1); + finish_packet(); + } else if (command_equals(data, len, "qfThreadInfo")) { + cpuset_t mask; + bool first; + int vcpu; + + if (CPU_EMPTY(&vcpus_active)) { + send_error(EINVAL); + return; + } + mask = vcpus_active; + start_packet(); + append_char('m'); + first = true; + while (!CPU_EMPTY(&mask)) { + vcpu = CPU_FFS(&mask) - 1; + CPU_CLR(vcpu, &mask); + if (first) + first = false; + else + append_char(','); + append_integer(vcpu + 1); + } + finish_packet(); + } else if (command_equals(data, len, "qsThreadInfo")) { + start_packet(); + append_char('l'); + finish_packet(); + } else if (command_equals(data, len, "qSupported")) { + data += strlen("qSupported"); + len -= strlen("qSupported"); + check_features(data, len); + } else if (command_equals(data, len, "qThreadExtraInfo")) { + char buf[16]; + int tid; + + data += strlen("qThreadExtraInfo"); + len -= strlen("qThreadExtraInfo"); + if (*data != ',') { + send_error(EINVAL); + return; + } + tid = parse_threadid(data + 1, len - 1); + if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { + send_error(EINVAL); + return; + } + + snprintf(buf, sizeof(buf), "vCPU %d", tid - 1); + start_packet(); + append_asciihex(buf); + finish_packet(); + } else + send_empty_response(); +} + +static void +handle_command(const uint8_t *data, size_t len) +{ + + /* Reject packets with a sequence-id. */ + if (len >= 3 && data[0] >= '0' && data[0] <= '9' && + data[0] >= '0' && data[0] <= '9' && data[2] == ':') { + send_empty_response(); + return; + } + + switch (*data) { + case 'c': + if (len != 1) { + send_error(EINVAL); + break; + } + + /* Don't send a reply until a stop occurs. */ + gdb_resume_vcpus(); + break; + case 'D': + send_ok(); + + /* TODO: Resume any stopped CPUs. */ + break; + case 'g': { + gdb_read_regs(); + break; + } + case 'H': { + int tid; + + if (data[1] != 'g' && data[1] != 'c') { + send_error(EINVAL); + break; + } + tid = parse_threadid(data + 2, len - 2); + if (tid == -2) { + send_error(EINVAL); + break; + } + + if (CPU_EMPTY(&vcpus_active)) { + send_error(EINVAL); + break; + } + if (tid == -1 || tid == 0) + cur_vcpu = CPU_FFS(&vcpus_active) - 1; + else if (CPU_ISSET(tid - 1, &vcpus_active)) + cur_vcpu = tid - 1; + else { + send_error(EINVAL); + break; + } + send_ok(); + break; + } + case 'm': + gdb_read_mem(data, len); + break; + case 'M': + gdb_write_mem(data, len); + break; + case 'T': { + int tid; + + tid = parse_threadid(data + 1, len - 1); + if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { + send_error(EINVAL); + return; + } + send_ok(); + break; + } + case 'q': + gdb_query(data, len); + break; + case 's': + if (len != 1) { + send_error(EINVAL); + break; + } + + /* Don't send a reply until a stop occurs. */ + if (!gdb_step_vcpu(cur_vcpu)) { + send_error(EOPNOTSUPP); + break; + } + break; + case '?': + /* XXX: Only if stopped? */ + /* For now, just report that we are always stopped. 
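+ * (The reply built below is "S05": 'S' followed by GDB_SIGNAL_TRAP
+ * rendered as two hex digits.)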
*/ + start_packet(); + append_char('S'); + append_byte(GDB_SIGNAL_TRAP); + finish_packet(); + break; + case 'G': /* TODO */ + case 'v': + /* Handle 'vCont' */ + /* 'vCtrlC' */ + case 'p': /* TODO */ + case 'P': /* TODO */ + case 'Q': /* TODO */ + case 't': /* TODO */ + case 'X': /* TODO */ + case 'z': /* TODO */ + case 'Z': /* TODO */ + default: + send_empty_response(); + } +} + +/* Check for a valid packet in the command buffer. */ +static void +check_command(int fd) +{ + uint8_t *head, *hash, *p, sum; + size_t avail, plen; + + for (;;) { + avail = cur_comm.len; + if (avail == 0) + return; + head = io_buffer_head(&cur_comm); + switch (*head) { + case 0x03: + debug("<- Ctrl-C\n"); + io_buffer_consume(&cur_comm, 1); + + gdb_suspend_vcpus(); + break; + case '+': + /* ACK of previous response. */ + debug("<- +\n"); + if (response_pending()) + io_buffer_reset(&cur_resp); + io_buffer_consume(&cur_comm, 1); + if (stop_pending) { + stop_pending = false; + report_stop(); + send_pending_data(fd); + } + break; + case '-': + /* NACK of previous response. */ + debug("<- -\n"); + if (response_pending()) { + cur_resp.len += cur_resp.start; + cur_resp.start = 0; + if (cur_resp.data[0] == '+') + io_buffer_advance(&cur_resp, 1); + debug("-> %.*s\n", (int)cur_resp.len, + io_buffer_head(&cur_resp)); + } + io_buffer_consume(&cur_comm, 1); + send_pending_data(fd); + break; + case '$': + /* Packet. */ + + if (response_pending()) { + warnx("New GDB command while response in " + "progress"); + io_buffer_reset(&cur_resp); + } + + /* Is packet complete? */ + hash = memchr(head, '#', avail); + if (hash == NULL) + return; + plen = (hash - head + 1) + 2; + if (avail < plen) + return; + debug("<- %.*s\n", (int)plen, head); + + /* Verify checksum. */ + for (sum = 0, p = head + 1; p < hash; p++) + sum += *p; + if (sum != parse_byte(hash + 1)) { + io_buffer_consume(&cur_comm, plen); + debug("-> -\n"); + send_char('-'); + send_pending_data(fd); + break; + } + send_char('+'); + + handle_command(head + 1, hash - (head + 1)); + io_buffer_consume(&cur_comm, plen); + if (!response_pending()) { + debug("-> +\n"); + } + send_pending_data(fd); + break; + default: + /* XXX: Possibly drop connection instead. */ + debug("-> %02x\n", *head); + io_buffer_consume(&cur_comm, 1); + break; + } + } +} + +static void +gdb_readable(int fd, enum ev_type event, void *arg) +{ + ssize_t nread; + int pending; + + if (ioctl(fd, FIONREAD, &pending) == -1) { + warn("FIONREAD on GDB socket"); + return; + } + + /* + * 'pending' might be zero due to EOF. We need to call read + * with a non-zero length to detect EOF. + */ + if (pending == 0) + pending = 1; + + /* Ensure there is room in the command buffer. */ + io_buffer_grow(&cur_comm, pending); + assert(io_buffer_avail(&cur_comm) >= pending); + + nread = read(fd, io_buffer_tail(&cur_comm), io_buffer_avail(&cur_comm)); + if (nread == 0) { + close_connection(); + } else if (nread == -1) { + if (errno == EAGAIN) + return; + + warn("Read from GDB socket"); + close_connection(); + } else { + cur_comm.len += nread; + pthread_mutex_lock(&gdb_lock); + check_command(fd); + pthread_mutex_unlock(&gdb_lock); + } +} + +static void +gdb_writable(int fd, enum ev_type event, void *arg) +{ + + send_pending_data(fd); +} + +static void +new_connection(int fd, enum ev_type event, void *arg) +{ + int optval, s; + + s = accept4(fd, NULL, NULL, SOCK_NONBLOCK); + if (s == -1) { + if (arg != NULL) + err(1, "Failed accepting initial GDB connection"); + + /* Silently ignore errors post-startup. 
*/ + return; + } + + optval = 1; + if (setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, &optval, sizeof(optval)) == + -1) { + warn("Failed to disable SIGPIPE for GDB connection"); + close(s); + return; + } + + pthread_mutex_lock(&gdb_lock); + if (cur_fd != -1) { + close(s); + warnx("Ignoring additional GDB connection."); + } + + read_event = mevent_add(s, EVF_READ, gdb_readable, NULL); + if (read_event == NULL) { + if (arg != NULL) + err(1, "Failed to setup initial GDB connection"); + pthread_mutex_unlock(&gdb_lock); + return; + } + write_event = mevent_add(s, EVF_WRITE, gdb_writable, NULL); + if (write_event == NULL) { + if (arg != NULL) + err(1, "Failed to setup initial GDB connection"); + mevent_delete_close(read_event); + read_event = NULL; + } + + cur_fd = s; + cur_vcpu = 0; + stepping_vcpu = -1; + stopped_vcpu = -1; + stop_pending = false; + + /* Break on attach. */ + first_stop = true; + gdb_suspend_vcpus(); + pthread_mutex_unlock(&gdb_lock); +} + +#ifndef WITHOUT_CAPSICUM +void +limit_gdb_socket(int s) +{ + cap_rights_t rights; + unsigned long ioctls[] = { FIONREAD }; + + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE, + CAP_SETSOCKOPT, CAP_IOCTL); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(s, ioctls, nitems(ioctls)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +} +#endif + +void +init_gdb(struct vmctx *_ctx, int sport, bool wait) +{ + struct sockaddr_in sin; + int error, flags, s; + + debug("==> starting on %d, %swaiting\n", sport, wait ? "" : "not "); + + error = pthread_mutex_init(&gdb_lock, NULL); + if (error != 0) + errc(1, error, "gdb mutex init"); + error = pthread_cond_init(&idle_vcpus, NULL); + if (error != 0) + errc(1, error, "gdb cv init"); + + ctx = _ctx; + s = socket(PF_INET, SOCK_STREAM, 0); + if (s < 0) + err(1, "gdb socket create"); + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(sport); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) + err(1, "gdb socket bind"); + + if (listen(s, 1) < 0) + err(1, "gdb socket listen"); + + if (wait) { + /* + * Set vcpu 0 in vcpus_suspended. This will trigger the + * logic in gdb_cpu_add() to suspend the first vcpu before + * it starts execution. The vcpu will remain suspended + * until a debugger connects. + */ + stepping_vcpu = -1; + stopped_vcpu = -1; + CPU_SET(0, &vcpus_suspended); + } + + flags = fcntl(s, F_GETFL); + if (fcntl(s, F_SETFL, flags | O_NONBLOCK) == -1) + err(1, "Failed to mark gdb socket non-blocking"); + +#ifndef WITHOUT_CAPSICUM + limit_gdb_socket(s); +#endif + mevent_add(s, EVF_READ, new_connection, NULL); +} diff --git a/usr/src/cmd/bhyve/gdb.h b/usr/src/cmd/bhyve/gdb.h new file mode 100644 index 0000000000..09ebc34f24 --- /dev/null +++ b/usr/src/cmd/bhyve/gdb.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017 John H. Baldwin + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __GDB_H__ +#define __GDB_H__ + +void gdb_cpu_add(int vcpu); +void gdb_cpu_mtrap(int vcpu); +void gdb_cpu_suspend(int vcpu); +void init_gdb(struct vmctx *ctx, int sport, bool wait); + +#endif /* !__GDB_H__ */ diff --git a/usr/src/cmd/bhyve/inout.c b/usr/src/cmd/bhyve/inout.c index 510649893a..b460ee2988 100644 --- a/usr/src/cmd/bhyve/inout.c +++ b/usr/src/cmd/bhyve/inout.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/inout.c 277310 2015-01-18 03:08:30Z neel $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/inout.c 277310 2015-01-18 03:08:30Z neel $"); +__FBSDID("$FreeBSD$"); #include #include @@ -66,21 +68,21 @@ static int default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { - if (in) { - switch (bytes) { - case 4: - *eax = 0xffffffff; - break; - case 2: - *eax = 0xffff; - break; - case 1: - *eax = 0xff; - break; - } - } - - return (0); + if (in) { + switch (bytes) { + case 4: + *eax = 0xffffffff; + break; + case 2: + *eax = 0xffff; + break; + case 1: + *eax = 0xff; + break; + } + } + + return (0); } static void @@ -107,7 +109,7 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) uint32_t eax, val; inout_func_t handler; void *arg; - int error, retval; + int error, fault, retval; enum vm_reg_name idxreg; uint64_t gla, index, iterations, count; struct vm_inout_str *vis; @@ -163,11 +165,11 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) } error = vm_copy_setup(ctx, vcpu, &vis->paging, gla, - bytes, prot, iov, nitems(iov)); - if (error == -1) { + bytes, prot, iov, nitems(iov), &fault); + if (error) { retval = -1; /* Unrecoverable error */ break; - } else if (error == 1) { + } else if (fault) { retval = 0; /* Resume guest to handle fault */ break; } diff --git a/usr/src/cmd/bhyve/inout.h b/usr/src/cmd/bhyve/inout.h index 0d4046bd61..b72ee5d93e 100644 --- a/usr/src/cmd/bhyve/inout.h +++ b/usr/src/cmd/bhyve/inout.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/inout.h 269094 2014-07-25 20:18:35Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the diff --git a/usr/src/cmd/bhyve/ioapic.c b/usr/src/cmd/bhyve/ioapic.c index 86ff5c6580..acdbb5111b 100644 --- a/usr/src/cmd/bhyve/ioapic.c +++ b/usr/src/cmd/bhyve/ioapic.c @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2014 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * @@ -26,14 +28,17 @@ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/ioapic.c 261268 2014-01-29 14:56:48Z jhb $"); +__FBSDID("$FreeBSD$"); #include +#include #include #include #include "ioapic.h" +#include "pci_emul.h" +#include "pci_lpc.h" /* * Assign PCI INTx interrupts to I/O APIC pins in a round-robin @@ -64,11 +69,15 @@ ioapic_init(struct vmctx *ctx) } int -ioapic_pci_alloc_irq(void) +ioapic_pci_alloc_irq(struct pci_devinst *pi) { static int last_pin; if (pci_pins == 0) return (-1); + if (lpc_bootrom()) { + /* For external bootrom use fixed mapping. */ + return (16 + (4 + pi->pi_slot + pi->pi_lintr.pin) % 8); + } return (16 + (last_pin++ % pci_pins)); } diff --git a/usr/src/cmd/bhyve/ioapic.h b/usr/src/cmd/bhyve/ioapic.h index 789f90fea9..3a7fa76192 100644 --- a/usr/src/cmd/bhyve/ioapic.h +++ b/usr/src/cmd/bhyve/ioapic.h @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2014 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * @@ -24,16 +26,18 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/ioapic.h 261268 2014-01-29 14:56:48Z jhb $ + * $FreeBSD$ */ #ifndef _IOAPIC_H_ #define _IOAPIC_H_ +struct pci_devinst; + /* * Allocate a PCI IRQ from the I/O APIC. */ void ioapic_init(struct vmctx *ctx); -int ioapic_pci_alloc_irq(void); +int ioapic_pci_alloc_irq(struct pci_devinst *pi); #endif diff --git a/usr/src/cmd/bhyve/iov.c b/usr/src/cmd/bhyve/iov.c new file mode 100644 index 0000000000..54ea22aa94 --- /dev/null +++ b/usr/src/cmd/bhyve/iov.c @@ -0,0 +1,148 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama . + * Copyright (c) 2018 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include "iov.h" + +void +seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, int *niov2, + size_t seek) +{ + size_t remainder = 0; + size_t left = seek; + int i, j; + + for (i = 0; i < niov1; i++) { + size_t toseek = MIN(left, iov1[i].iov_len); + left -= toseek; + + if (toseek == iov1[i].iov_len) + continue; + + if (left == 0) { + remainder = toseek; + break; + } + } + + for (j = i; j < niov1; j++) { + iov2[j - i].iov_base = (char *)iov1[j].iov_base + remainder; + iov2[j - i].iov_len = iov1[j].iov_len - remainder; + remainder = 0; + } + + *niov2 = j - i; +} + +size_t +count_iov(const struct iovec *iov, int niov) +{ + size_t total = 0; + int i; + + for (i = 0; i < niov; i++) + total += iov[i].iov_len; + + return (total); +} + +void +truncate_iov(struct iovec *iov, int *niov, size_t length) +{ + size_t done = 0; + int i; + + for (i = 0; i < *niov; i++) { + size_t toseek = MIN(length - done, iov[i].iov_len); + done += toseek; + + if (toseek <= iov[i].iov_len) { + iov[i].iov_len = toseek; + *niov = i + 1; + return; + } + } +} + +ssize_t +iov_to_buf(const struct iovec *iov, int niov, void **buf) +{ + size_t ptr, total; + int i; + + total = count_iov(iov, niov); + *buf = realloc(*buf, total); + if (*buf == NULL) + return (-1); + + for (i = 0, ptr = 0; i < niov; i++) { + memcpy(*buf + ptr, iov[i].iov_base, iov[i].iov_len); + ptr += iov[i].iov_len; + } + + return (total); +} + +ssize_t +buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov, + size_t seek) +{ + struct iovec *diov; + int ndiov, i; + size_t off = 0, len; + + if (seek > 0) { + diov = malloc(sizeof(struct iovec) * niov); + seek_iov(iov, niov, diov, &ndiov, seek); + } else { + diov = iov; + ndiov = niov; + } + + for (i = 0; i < ndiov && off < buflen; i++) { + len = MIN(diov[i].iov_len, buflen - off); + memcpy(diov[i].iov_base, buf + off, len); + off += len; + } + + if (seek > 0) + free(diov); + + return ((ssize_t)off); +} + diff --git a/usr/src/cmd/bhyve/iov.h b/usr/src/cmd/bhyve/iov.h new file mode 100644 index 0000000000..e3b5916edb --- /dev/null +++ b/usr/src/cmd/bhyve/iov.h @@ -0,0 +1,44 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama . + * Copyright (c) 2018 Alexander Motin + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IOV_H_ +#define _IOV_H_ + +void seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, + int *niov2, size_t seek); +void truncate_iov(struct iovec *iov, int *niov, size_t length); +size_t count_iov(const struct iovec *iov, int niov); +ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf); +ssize_t buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov, + size_t seek); + +#endif /* _IOV_H_ */ diff --git a/usr/src/cmd/bhyve/mem.c b/usr/src/cmd/bhyve/mem.c index a153a8e960..90aefe45c8 100644 --- a/usr/src/cmd/bhyve/mem.c +++ b/usr/src/cmd/bhyve/mem.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/mem.c 269700 2014-08-08 03:49:01Z neel $ + * $FreeBSD$ */ /* @@ -33,18 +35,19 @@ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/mem.c 269700 2014-08-08 03:49:01Z neel $"); +__FBSDID("$FreeBSD$"); #include -#include #include +#include #include #include -#include -#include #include +#include #include +#include +#include #include "mem.h" @@ -121,6 +124,7 @@ mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new) static void mmio_rb_dump(struct mmio_rb_tree *rbt) { + int perror; struct mmio_rb_range *np; pthread_rwlock_rdlock(&mmio_rwlock); @@ -128,12 +132,16 @@ mmio_rb_dump(struct mmio_rb_tree *rbt) printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, np->mr_param.name); } - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); } #endif RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); +typedef int (mem_cb_t)(struct vmctx *ctx, int vcpu, uint64_t gpa, + struct mem_range *mr, void *arg); + static int mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) { @@ -156,13 +164,12 @@ mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) return (error); } -int -emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, - struct vm_guest_paging *paging) - +static int +access_memory(struct vmctx *ctx, int vcpu, uint64_t paddr, mem_cb_t *cb, + void *arg) { struct mmio_rb_range *entry; - int err, immutable; + int err, perror, immutable; pthread_rwlock_rdlock(&mmio_rwlock); /* @@ -180,7 +187,8 @@ emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, /* Update the per-vCPU cache */ mmio_hint[vcpu] = entry; } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) { - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); 
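+ /*
+ * Neither the primary tree nor the fallback tree has a
+ * handler covering this address, so report ESRCH.
+ */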
return (ESRCH); } } @@ -199,40 +207,114 @@ emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, * config space window as 'immutable' the deadlock can be avoided. */ immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); - if (immutable) - pthread_rwlock_unlock(&mmio_rwlock); + if (immutable) { + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); + } - err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging, - mem_read, mem_write, &entry->mr_param); + err = cb(ctx, vcpu, paddr, &entry->mr_param, arg); + + if (!immutable) { + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); + } - if (!immutable) - pthread_rwlock_unlock(&mmio_rwlock); return (err); } +struct emulate_mem_args { + struct vie *vie; + struct vm_guest_paging *paging; +}; + +static int +emulate_mem_cb(struct vmctx *ctx, int vcpu, uint64_t paddr, struct mem_range *mr, + void *arg) +{ + struct emulate_mem_args *ema; + + ema = arg; + return (vmm_emulate_instruction(ctx, vcpu, paddr, ema->vie, ema->paging, + mem_read, mem_write, mr)); +} + +int +emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging) + +{ + struct emulate_mem_args ema; + + ema.vie = vie; + ema.paging = paging; + return (access_memory(ctx, vcpu, paddr, emulate_mem_cb, &ema)); +} + +struct rw_mem_args { + uint64_t *val; + int size; + int operation; +}; + +static int +rw_mem_cb(struct vmctx *ctx, int vcpu, uint64_t paddr, struct mem_range *mr, + void *arg) +{ + struct rw_mem_args *rma; + + rma = arg; + return (mr->handler(ctx, vcpu, rma->operation, paddr, rma->size, + rma->val, mr->arg1, mr->arg2)); +} + +int +read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size) +{ + struct rw_mem_args rma; + + rma.val = rval; + rma.size = size; + rma.operation = MEM_F_READ; + return (access_memory(ctx, vcpu, gpa, rw_mem_cb, &rma)); +} + +int +write_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size) +{ + struct rw_mem_args rma; + + rma.val = &wval; + rma.size = size; + rma.operation = MEM_F_WRITE; + return (access_memory(ctx, vcpu, gpa, rw_mem_cb, &rma)); +} + static int register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp) { struct mmio_rb_range *entry, *mrp; - int err; + int err, perror; err = 0; mrp = malloc(sizeof(struct mmio_rb_range)); - - if (mrp != NULL) { + if (mrp == NULL) { + warn("%s: couldn't allocate memory for mrp\n", + __func__); + err = ENOMEM; + } else { mrp->mr_param = *memp; mrp->mr_base = memp->base; mrp->mr_end = memp->base + memp->size - 1; pthread_rwlock_wrlock(&mmio_rwlock); if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) err = mmio_rb_add(rbt, mrp); - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); if (err) free(mrp); - } else - err = ENOMEM; + } return (err); } @@ -256,7 +338,7 @@ unregister_mem(struct mem_range *memp) { struct mem_range *mr; struct mmio_rb_range *entry = NULL; - int err, i; + int err, perror, i; pthread_rwlock_wrlock(&mmio_rwlock); err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry); @@ -273,7 +355,8 @@ unregister_mem(struct mem_range *memp) mmio_hint[i] = NULL; } } - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); if (entry) free(entry); diff --git a/usr/src/cmd/bhyve/mem.h b/usr/src/cmd/bhyve/mem.h index 09cf56b72e..38d773c43f 100644 --- a/usr/src/cmd/bhyve/mem.h +++ b/usr/src/cmd/bhyve/mem.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: 
BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/mem.h 269700 2014-08-08 03:49:01Z neel $ + * $FreeBSD$ */ #ifndef _MEM_H_ @@ -53,9 +55,13 @@ struct mem_range { void init_mem(void); int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, struct vm_guest_paging *paging); - + +int read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval, + int size); int register_mem(struct mem_range *memp); int register_mem_fallback(struct mem_range *memp); int unregister_mem(struct mem_range *memp); +int write_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t wval, + int size); #endif /* _MEM_H_ */ diff --git a/usr/src/cmd/bhyve/mevent.c b/usr/src/cmd/bhyve/mevent.c new file mode 100644 index 0000000000..a258fd3047 --- /dev/null +++ b/usr/src/cmd/bhyve/mevent.c @@ -0,0 +1,680 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Micro event library for FreeBSD, designed for a single i/o thread + * using kqueue, and having events be persistent by default. 
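+ *
+ * The illumos build keeps the same interface but backs it with event
+ * ports instead of kqueue: port_associate()/port_get() stand in for
+ * the kevent() calls, and EVF_TIMER events are delivered to the port
+ * by timer_create() notifications (see the non-__FreeBSD__ paths
+ * below).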
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#ifdef __FreeBSD__ +#include +#else +#include +#include +#include +#include +#endif +#include + +#include +#include + +#include "mevent.h" + +#define MEVENT_MAX 64 + +#define MEV_ADD 1 +#define MEV_ENABLE 2 +#define MEV_DISABLE 3 +#define MEV_DEL_PENDING 4 + +extern char *vmname; + +static pthread_t mevent_tid; +static int mevent_timid = 43; +static int mevent_pipefd[2]; +static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; + +struct mevent { + void (*me_func)(int, enum ev_type, void *); +#define me_msecs me_fd + int me_fd; +#ifdef __FreeBSD__ + int me_timid; +#else + timer_t me_timid; +#endif + enum ev_type me_type; + void *me_param; + int me_cq; + int me_state; + int me_closefd; +#ifndef __FreeBSD__ + port_notify_t me_notify; + struct sigevent me_sigev; + boolean_t me_auto_requeue; +#endif + LIST_ENTRY(mevent) me_list; +}; + +static LIST_HEAD(listhead, mevent) global_head, change_head; + +static void +mevent_qlock(void) +{ + pthread_mutex_lock(&mevent_lmutex); +} + +static void +mevent_qunlock(void) +{ + pthread_mutex_unlock(&mevent_lmutex); +} + +static void +mevent_pipe_read(int fd, enum ev_type type, void *param) +{ + char buf[MEVENT_MAX]; + int status; + + /* + * Drain the pipe read side. The fd is non-blocking so this is + * safe to do. + */ + do { + status = read(fd, buf, sizeof(buf)); + } while (status == MEVENT_MAX); +} + +static void +mevent_notify(void) +{ + char c; + + /* + * If calling from outside the i/o thread, write a byte on the + * pipe to force the i/o thread to exit the blocking kevent call. + */ + if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { + write(mevent_pipefd[1], &c, 1); + } +} +#ifdef __FreeBSD__ +static int +mevent_kq_filter(struct mevent *mevp) +{ + int retval; + + retval = 0; + + if (mevp->me_type == EVF_READ) + retval = EVFILT_READ; + + if (mevp->me_type == EVF_WRITE) + retval = EVFILT_WRITE; + + if (mevp->me_type == EVF_TIMER) + retval = EVFILT_TIMER; + + if (mevp->me_type == EVF_SIGNAL) + retval = EVFILT_SIGNAL; + + return (retval); +} + +static int +mevent_kq_flags(struct mevent *mevp) +{ + int ret; + + switch (mevp->me_state) { + case MEV_ADD: + ret = EV_ADD; /* implicitly enabled */ + break; + case MEV_ENABLE: + ret = EV_ENABLE; + break; + case MEV_DISABLE: + ret = EV_DISABLE; + break; + case MEV_DEL_PENDING: + ret = EV_DELETE; + break; + default: + assert(0); + break; + } + + return (ret); +} + +static int +mevent_kq_fflags(struct mevent *mevp) +{ + /* XXX nothing yet, perhaps EV_EOF for reads ? 
*/ + return (0); +} + +static int +mevent_build(int mfd, struct kevent *kev) +{ + struct mevent *mevp, *tmpp; + int i; + + i = 0; + + mevent_qlock(); + + LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { + if (mevp->me_closefd) { + /* + * A close of the file descriptor will remove the + * event + */ + close(mevp->me_fd); + } else { + if (mevp->me_type == EVF_TIMER) { + kev[i].ident = mevp->me_timid; + kev[i].data = mevp->me_msecs; + } else { + kev[i].ident = mevp->me_fd; + kev[i].data = 0; + } + kev[i].filter = mevent_kq_filter(mevp); + kev[i].flags = mevent_kq_flags(mevp); + kev[i].fflags = mevent_kq_fflags(mevp); + kev[i].udata = mevp; + i++; + } + + mevp->me_cq = 0; + LIST_REMOVE(mevp, me_list); + + if (mevp->me_state == MEV_DEL_PENDING) { + free(mevp); + } else { + LIST_INSERT_HEAD(&global_head, mevp, me_list); + } + + assert(i < MEVENT_MAX); + } + + mevent_qunlock(); + + return (i); +} + +static void +mevent_handle(struct kevent *kev, int numev) +{ + struct mevent *mevp; + int i; + + for (i = 0; i < numev; i++) { + mevp = kev[i].udata; + + /* XXX check for EV_ERROR ? */ + + (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); + } +} + +#else /* __FreeBSD__ */ + +static void +mevent_update_one(struct mevent *mevp) +{ + int portfd = mevp->me_notify.portnfy_port; + + switch (mevp->me_type) { + case EVF_READ: + case EVF_WRITE: + mevp->me_auto_requeue = B_FALSE; + + switch (mevp->me_state) { + case MEV_ADD: + case MEV_ENABLE: + { + int events; + + events = (mevp->me_type == EVF_READ) ? POLLIN : POLLOUT; + + if (port_associate(portfd, PORT_SOURCE_FD, mevp->me_fd, + events, mevp) != 0) { + (void) fprintf(stderr, + "port_associate fd %d %p failed: %s\n", + mevp->me_fd, mevp, strerror(errno)); + } + return; + } + case MEV_DISABLE: + case MEV_DEL_PENDING: + /* + * A disable that comes in while an event is being + * handled will result in an ENOENT. + */ + if (port_dissociate(portfd, PORT_SOURCE_FD, + mevp->me_fd) != 0 && errno != ENOENT) { + (void) fprintf(stderr, "port_dissociate " + "portfd %d fd %d mevp %p failed: %s\n", + portfd, mevp->me_fd, mevp, strerror(errno)); + } + return; + default: + goto abort; + } + + case EVF_TIMER: + mevp->me_auto_requeue = B_TRUE; + + switch (mevp->me_state) { + case MEV_ADD: + case MEV_ENABLE: + { + struct itimerspec it = { 0 }; + + mevp->me_sigev.sigev_notify = SIGEV_PORT; + mevp->me_sigev.sigev_value.sival_ptr = &mevp->me_notify; + + if (timer_create(CLOCK_REALTIME, &mevp->me_sigev, + &mevp->me_timid) != 0) { + (void) fprintf(stderr, + "timer_create failed: %s", strerror(errno)); + return; + } + + /* The first timeout */ + it.it_value.tv_sec = mevp->me_msecs / MILLISEC; + it.it_value.tv_nsec = + MSEC2NSEC(mevp->me_msecs % MILLISEC); + /* Repeat at the same interval */ + it.it_interval = it.it_value; + + if (timer_settime(mevp->me_timid, 0, &it, NULL) != 0) { + (void) fprintf(stderr, "timer_settime failed: " + "%s", strerror(errno)); + } + return; + } + case MEV_DISABLE: + case MEV_DEL_PENDING: + if (timer_delete(mevp->me_timid) != 0) { + (void) fprintf(stderr, "timer_delete failed: " + "%s", strerror(errno)); + } + return; + default: + goto abort; + } + default: + /* EVF_SIGNAL not yet implemented. 
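Wiring it
+		 * up would need sigevent delivery similar to the EVF_TIMER
+		 * case above.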
*/ + goto abort; + } + +abort: + (void) fprintf(stderr, "%s: unhandled type %d state %d\n", __func__, + mevp->me_type, mevp->me_state); + abort(); +} + +static void +mevent_update_pending(int portfd) +{ + struct mevent *mevp, *tmpp; + + mevent_qlock(); + + LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { + mevp->me_notify.portnfy_port = portfd; + mevp->me_notify.portnfy_user = mevp; + if (mevp->me_closefd) { + /* + * A close of the file descriptor will remove the + * event + */ + (void) close(mevp->me_fd); + mevp->me_fd = -1; + } else { + mevent_update_one(mevp); + } + + mevp->me_cq = 0; + LIST_REMOVE(mevp, me_list); + + if (mevp->me_state == MEV_DEL_PENDING) { + free(mevp); + } else { + LIST_INSERT_HEAD(&global_head, mevp, me_list); + } + } + + mevent_qunlock(); +} + +static void +mevent_handle_pe(port_event_t *pe) +{ + struct mevent *mevp = pe->portev_user; + + mevent_qunlock(); + + (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); + + mevent_qlock(); + if (!mevp->me_cq && !mevp->me_auto_requeue) { + mevent_update_one(mevp); + } + mevent_qunlock(); +} +#endif + +struct mevent * +mevent_add(int tfd, enum ev_type type, + void (*func)(int, enum ev_type, void *), void *param) +{ + struct mevent *lp, *mevp; + + if (tfd < 0 || func == NULL) { + return (NULL); + } + + mevp = NULL; + + mevent_qlock(); + + /* + * Verify that the fd/type tuple is not present in any list + */ + LIST_FOREACH(lp, &global_head, me_list) { + if (type != EVF_TIMER && lp->me_fd == tfd && + lp->me_type == type) { + goto exit; + } + } + + LIST_FOREACH(lp, &change_head, me_list) { + if (type != EVF_TIMER && lp->me_fd == tfd && + lp->me_type == type) { + goto exit; + } + } + + /* + * Allocate an entry, populate it, and add it to the change list. + */ + mevp = calloc(1, sizeof(struct mevent)); + if (mevp == NULL) { + goto exit; + } + + if (type == EVF_TIMER) { + mevp->me_msecs = tfd; + mevp->me_timid = mevent_timid++; + } else + mevp->me_fd = tfd; + mevp->me_type = type; + mevp->me_func = func; + mevp->me_param = param; + + LIST_INSERT_HEAD(&change_head, mevp, me_list); + mevp->me_cq = 1; + mevp->me_state = MEV_ADD; + mevent_notify(); + +exit: + mevent_qunlock(); + + return (mevp); +} + +static int +mevent_update(struct mevent *evp, int newstate) +{ + /* + * It's not possible to enable/disable a deleted event + */ + if (evp->me_state == MEV_DEL_PENDING) + return (EINVAL); + + /* + * No update needed if state isn't changing + */ + if (evp->me_state == newstate) + return (0); + + mevent_qlock(); + + evp->me_state = newstate; + + /* + * Place the entry onto the changed list if not already there. + */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + + mevent_qunlock(); + + return (0); +} + +int +mevent_enable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_ENABLE)); +} + +int +mevent_disable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_DISABLE)); +} + +static int +mevent_delete_event(struct mevent *evp, int closefd) +{ + mevent_qlock(); + + /* + * Place the entry onto the changed list if not already there, and + * mark as to be deleted. 
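+	 * The entry itself is freed later, on the dispatch thread, when
+	 * the change list is next processed; callers must not touch evp
+	 * after this returns.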
+ */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + evp->me_state = MEV_DEL_PENDING; + + if (closefd) + evp->me_closefd = 1; + + mevent_qunlock(); + + return (0); +} + +int +mevent_delete(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 0)); +} + +int +mevent_delete_close(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 1)); +} + +static void +mevent_set_name(void) +{ + + pthread_set_name_np(mevent_tid, "mevent"); +} + +void +mevent_dispatch(void) +{ +#ifdef __FreeBSD__ + struct kevent changelist[MEVENT_MAX]; + struct kevent eventlist[MEVENT_MAX]; + struct mevent *pipev; + int mfd; + int numev; +#else + struct mevent *pipev; + int portfd; +#endif + int ret; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + mevent_tid = pthread_self(); + mevent_set_name(); + +#ifdef __FreeBSD__ + mfd = kqueue(); + assert(mfd > 0); +#else + portfd = port_create(); + assert(portfd >= 0); +#endif + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_KQUEUE); + if (caph_rights_limit(mfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Open the pipe that will be used for other threads to force + * the blocking kqueue call to exit by writing to it. Set the + * descriptor to non-blocking. + */ + ret = pipe(mevent_pipefd); + if (ret < 0) { + perror("pipe"); + exit(0); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(mevent_pipefd[0], &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_rights_limit(mevent_pipefd[1], &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Add internal event handler for the pipe write fd + */ + pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); + assert(pipev != NULL); + + for (;;) { +#ifdef __FreeBSD__ + /* + * Build changelist if required. + * XXX the changelist can be put into the blocking call + * to eliminate the extra syscall. Currently better for + * debug. + */ + numev = mevent_build(mfd, changelist); + if (numev) { + ret = kevent(mfd, changelist, numev, NULL, 0, NULL); + if (ret == -1) { + perror("Error return from kevent change"); + } + } + + /* + * Block awaiting events + */ + ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); + if (ret == -1 && errno != EINTR) { + perror("Error return from kevent monitor"); + } + + /* + * Handle reported events + */ + mevent_handle(eventlist, ret); + +#else /* __FreeBSD__ */ + port_event_t pev; + + /* Handle any pending updates */ + mevent_update_pending(portfd); + + /* Block awaiting events */ + ret = port_get(portfd, &pev, NULL); + if (ret != 0 && errno != EINTR) { + perror("Error return from port_get"); + continue; + } + + /* Handle reported event */ + mevent_handle_pe(&pev); +#endif /* __FreeBSD__ */ + } +} diff --git a/usr/src/cmd/bhyve/mevent.h b/usr/src/cmd/bhyve/mevent.h new file mode 100644 index 0000000000..e6b96f0a7c --- /dev/null +++ b/usr/src/cmd/bhyve/mevent.h @@ -0,0 +1,53 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MEVENT_H_ +#define _MEVENT_H_ + +enum ev_type { + EVF_READ, + EVF_WRITE, + EVF_TIMER, + EVF_SIGNAL +}; + +struct mevent; + +struct mevent *mevent_add(int fd, enum ev_type type, + void (*func)(int, enum ev_type, void *), + void *param); +int mevent_enable(struct mevent *evp); +int mevent_disable(struct mevent *evp); +int mevent_delete(struct mevent *evp); +int mevent_delete_close(struct mevent *evp); + +void mevent_dispatch(void); + +#endif /* _MEVENT_H_ */ diff --git a/usr/src/cmd/bhyve/mevent_test.c b/usr/src/cmd/bhyve/mevent_test.c new file mode 100644 index 0000000000..4da3adb5ae --- /dev/null +++ b/usr/src/cmd/bhyve/mevent_test.c @@ -0,0 +1,282 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Test program for the micro event library. Set up a simple TCP echo + * service. 
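+ * Connect to TCP port 4321 to have input echoed back; the first
+ * connection also starts a 1 millisecond timer, and after 4096 ticks
+ * the observed min/max/mean intervals are printed in microseconds.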
+ * + * cc mevent_test.c mevent.c -lpthread + */ + +#include +#include +#ifdef __FreeBSD__ +#include +#endif +#include +#include +#ifdef __FreeBSD__ +#include +#endif + +#include +#include +#include +#include + +#include "mevent.h" + +#define TEST_PORT 4321 + +static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER; + +static struct mevent *tevp; + +char *vmname = "test vm"; + + +#define MEVENT_ECHO + +/* Number of timer events to capture */ +#define TEVSZ 4096 +uint64_t tevbuf[TEVSZ]; + +static void +timer_print(void) +{ + uint64_t min, max, diff, sum; +#ifdef __FreeBSD__ + uint64_t tsc_freq; + size_t len; +#endif + int j; + + min = UINT64_MAX; + max = 0; + sum = 0; + +#ifdef __FreeBSD__ + len = sizeof(tsc_freq); + sysctlbyname("machdep.tsc_freq", &tsc_freq, &len, NULL, 0); +#endif + + for (j = 1; j < TEVSZ; j++) { +#ifdef __FreeBSD__ + /* Convert a tsc diff into microseconds */ + diff = (tevbuf[j] - tevbuf[j-1]) * 1000000 / tsc_freq; +#else + diff = (tevbuf[j] - tevbuf[j-1]) / 1000; +#endif + sum += diff; + if (min > diff) + min = diff; + if (max < diff) + max = diff; + } + + printf("timers done: usecs, min %ld, max %ld, mean %ld\n", min, max, + sum/(TEVSZ - 1)); +} + +static void +timer_callback(int fd, enum ev_type type, void *param) +{ + static int i; + + if (i >= TEVSZ) + abort(); + +#ifdef __FreeBSD__ + tevbuf[i++] = rdtsc(); +#else + tevbuf[i++] = gethrtime(); +#endif + + if (i == TEVSZ) { + mevent_delete(tevp); + timer_print(); + } +} + + +#ifdef MEVENT_ECHO +struct esync { + pthread_mutex_t e_mt; + pthread_cond_t e_cond; +}; + +static void +echoer_callback(int fd, enum ev_type type, void *param) +{ + struct esync *sync = param; + + pthread_mutex_lock(&sync->e_mt); + pthread_cond_signal(&sync->e_cond); + pthread_mutex_unlock(&sync->e_mt); +} + +static void * +echoer(void *param) +{ + struct esync sync; + struct mevent *mev; + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + pthread_mutex_init(&sync.e_mt, NULL); + pthread_cond_init(&sync.e_cond, NULL); + + pthread_mutex_lock(&sync.e_mt); + + mev = mevent_add(fd, EVF_READ, echoer_callback, &sync); + if (mev == NULL) { + printf("Could not allocate echoer event\n"); + exit(4); + } + + while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) { + len = read(fd, buf, sizeof(buf)); + if (len > 0) { + write(fd, buf, len); + write(0, buf, len); + } else { + break; + } + } + + mevent_delete_close(mev); + + pthread_mutex_unlock(&sync.e_mt); + pthread_mutex_destroy(&sync.e_mt); + pthread_cond_destroy(&sync.e_cond); + + return (NULL); +} + +#else + +static void * +echoer(void *param) +{ + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + while ((len = read(fd, buf, sizeof(buf))) > 0) { + write(1, buf, len); + } + + return (NULL); +} +#endif /* MEVENT_ECHO */ + +static void +acceptor_callback(int fd, enum ev_type type, void *param) +{ + pthread_mutex_lock(&accept_mutex); + pthread_cond_signal(&accept_condvar); + pthread_mutex_unlock(&accept_mutex); +} + +static void * +acceptor(void *param) +{ + struct sockaddr_in sin; + pthread_t tid; + int news; + int s; + + if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("cannot create socket"); + exit(4); + } + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(TEST_PORT); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("cannot bind socket"); + exit(4); + } + + if (listen(s, 1) < 
0) { + perror("cannot listen socket"); + exit(4); + } + + (void) mevent_add(s, EVF_READ, acceptor_callback, NULL); + + pthread_mutex_lock(&accept_mutex); + + while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) { + news = accept(s, NULL, NULL); + if (news < 0) { + perror("accept error"); + } else { + static int first = 1; + + if (first) { + /* + * Start a timer + */ + first = 0; + tevp = mevent_add(1, EVF_TIMER, timer_callback, + NULL); + } + + printf("incoming connection, spawning thread\n"); + pthread_create(&tid, NULL, echoer, + (void *)(uintptr_t)news); + } + } + + return (NULL); +} + +int +main() +{ + pthread_t tid; + + pthread_create(&tid, NULL, acceptor, NULL); + + mevent_dispatch(); + return (0); +} diff --git a/usr/src/cmd/bhyve/mptbl.c b/usr/src/cmd/bhyve/mptbl.c index 9d03765c7a..e78f88f074 100644 --- a/usr/src/cmd/bhyve/mptbl.c +++ b/usr/src/cmd/bhyve/mptbl.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/mptbl.c 266125 2014-05-15 14:16:55Z jhb $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/mptbl.c 266125 2014-05-15 14:16:55Z jhb $"); +__FBSDID("$FreeBSD$"); #include #include diff --git a/usr/src/cmd/bhyve/mptbl.h b/usr/src/cmd/bhyve/mptbl.h index d78ea6da09..ebc8d85ea8 100644 --- a/usr/src/cmd/bhyve/mptbl.h +++ b/usr/src/cmd/bhyve/mptbl.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/mptbl.h 257423 2013-10-31 05:44:45Z neel $ + * $FreeBSD$ */ #ifndef _MPTBL_H_ diff --git a/usr/src/cmd/bhyve/pci_ahci.c b/usr/src/cmd/bhyve/pci_ahci.c index b68c977c1f..1e3feffcc2 100644 --- a/usr/src/cmd/bhyve/pci_ahci.c +++ b/usr/src/cmd/bhyve/pci_ahci.c @@ -1,5 +1,8 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Zhixiang Yu + * Copyright (c) 2015-2016 Alexander Motin * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -23,11 +26,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z tychon $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include @@ -50,13 +53,15 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z t #include #include #include +#include #include "bhyverun.h" #include "pci_emul.h" #include "ahci.h" #include "block_if.h" -#define MAX_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ +#define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ +#define MAX_PORTS 32 /* AHCI supports 32 ports */ #define PxSIG_ATA 0x00000101 /* ATA drive */ #define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ @@ -86,6 +91,7 @@ enum sata_fis_type { #define READ_TOC 0x43 #define GET_EVENT_STATUS_NOTIFICATION 0x4A #define MODE_SENSE_10 0x5A +#define REPORT_LUNS 0xA0 #define READ_12 0xA8 #define READ_CD 0xBE @@ -99,7 +105,7 @@ enum sata_fis_type { * ATA commands */ #define ATA_SF_ENAB_SATA_SF 0x10 -#define ATA_SATA_SF_AN 0x05 +#define ATA_SATA_SF_AN 0x05 #define ATA_SF_DIS_SATA_SF 0x90 /* @@ -113,6 +119,8 @@ static FILE *dbg; #endif #define WPRINTF(format, arg...) printf(format, ##arg) +#define AHCI_PORT_IDENT 20 + 1 + struct ahci_ioreq { struct blockif_req io_req; struct ahci_port *io_pr; @@ -122,7 +130,7 @@ struct ahci_ioreq { uint32_t len; uint32_t done; int slot; - int prdtl; + int more; }; struct ahci_port { @@ -130,12 +138,17 @@ struct ahci_port { struct pci_ahci_softc *pr_sc; uint8_t *cmd_lst; uint8_t *rfis; + char ident[AHCI_PORT_IDENT]; + int port; int atapi; int reset; + int waitforclear; int mult_sectors; uint8_t xfermode; + uint8_t err_cfis[20]; uint8_t sense_key; uint8_t asc; + u_int ccs; uint32_t pending; uint32_t clb; @@ -200,6 +213,8 @@ struct pci_ahci_softc { }; #define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) +static void ahci_handle_port(struct ahci_port *p); + static inline void lba_to_msf(uint8_t *buf, int lba) { lba += 150; @@ -209,47 +224,95 @@ static inline void lba_to_msf(uint8_t *buf, int lba) } /* - * generate HBA intr depending on whether or not ports within - * the controller have an interrupt pending. + * Generate HBA interrupts on global IS register write. */ static void -ahci_generate_intr(struct pci_ahci_softc *sc) +ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask) { - struct pci_devinst *pi; - int i; - - pi = sc->asc_pi; + struct pci_devinst *pi = sc->asc_pi; + struct ahci_port *p; + int i, nmsg; + uint32_t mmask; + /* Update global IS from PxIS/PxIE. */ for (i = 0; i < sc->ports; i++) { - struct ahci_port *pr; - pr = &sc->port[i]; - if (pr->is & pr->ie) + p = &sc->port[i]; + if (p->is & p->ie) sc->is |= (1 << i); } + DPRINTF("%s(%08x) %08x\n", __func__, mask, sc->is); + + /* If there is nothing enabled -- clear legacy interrupt and exit. */ + if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) { + if (sc->lintr) { + pci_lintr_deassert(pi); + sc->lintr = 0; + } + return; + } - DPRINTF("%s %x\n", __func__, sc->is); - - if (sc->is && (sc->ghc & AHCI_GHC_IE)) { - if (pci_msi_enabled(pi)) { - /* - * Generate an MSI interrupt on every edge - */ - pci_generate_msi(pi, 0); - } else if (!sc->lintr) { - /* - * Only generate a pin-based interrupt if one wasn't - * in progress - */ + /* If there is anything and no MSI -- assert legacy interrupt. 
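The pin
+	 * is deasserted by a later IS/IE update once nothing is pending.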
*/ + nmsg = pci_msi_maxmsgnum(pi); + if (nmsg == 0) { + if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } - } else if (sc->lintr) { - /* - * No interrupts: deassert pin-based signal if it had - * been asserted - */ - pci_lintr_deassert(pi); - sc->lintr = 0; + return; + } + + /* Assert respective MSIs for ports that were touched. */ + for (i = 0; i < nmsg; i++) { + if (sc->ports <= nmsg || i < nmsg - 1) + mmask = 1 << i; + else + mmask = 0xffffffff << i; + if (sc->is & mask && mmask & mask) + pci_generate_msi(pi, i); + } +} + +/* + * Generate HBA interrupt on specific port event. + */ +static void +ahci_port_intr(struct ahci_port *p) +{ + struct pci_ahci_softc *sc = p->pr_sc; + struct pci_devinst *pi = sc->asc_pi; + int nmsg; + + DPRINTF("%s(%d) %08x/%08x %08x\n", __func__, + p->port, p->is, p->ie, sc->is); + + /* If there is nothing enabled -- we are done. */ + if ((p->is & p->ie) == 0) + return; + + /* In case of non-shared MSI always generate interrupt. */ + nmsg = pci_msi_maxmsgnum(pi); + if (sc->ports <= nmsg || p->port < nmsg - 1) { + sc->is |= (1 << p->port); + if ((sc->ghc & AHCI_GHC_IE) == 0) + return; + pci_generate_msi(pi, p->port); + return; + } + + /* If IS for this port is already set -- do nothing. */ + if (sc->is & (1 << p->port)) + return; + + sc->is |= (1 << p->port); + + /* If interrupts are enabled -- generate one. */ + if ((sc->ghc & AHCI_GHC_IE) == 0) + return; + if (nmsg > 0) { + pci_generate_msi(pi, nmsg - 1); + } else if (!sc->lintr) { + sc->lintr = 1; + pci_lintr_assert(pi); } } @@ -265,26 +328,32 @@ ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) case FIS_TYPE_REGD2H: offset = 0x40; len = 20; - irq = AHCI_P_IX_DHR; + irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; break; case FIS_TYPE_SETDEVBITS: offset = 0x58; len = 8; - irq = AHCI_P_IX_SDB; + irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; break; case FIS_TYPE_PIOSETUP: offset = 0x20; len = 20; - irq = 0; + irq = (fis[1] & (1 << 6)) ? 
AHCI_P_IX_PS : 0; break; default: WPRINTF("unsupported fis type %d\n", ft); return; } + if (fis[2] & ATA_S_ERROR) { + p->waitforclear = 1; + irq |= AHCI_P_IX_TFE; + } memcpy(p->rfis + offset, fis, len); if (irq) { - p->is |= irq; - ahci_generate_intr(p->pr_sc); + if (~p->is & irq) { + p->is |= irq; + ahci_port_intr(p); + } } } @@ -299,19 +368,29 @@ ahci_write_fis_piosetup(struct ahci_port *p) } static void -ahci_write_fis_sdb(struct ahci_port *p, int slot, uint32_t tfd) +ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[8]; uint8_t error; error = (tfd >> 8) & 0xff; + tfd &= 0x77; memset(fis, 0, sizeof(fis)); - fis[0] = error; - fis[2] = tfd & 0x77; - *(uint32_t *)(fis + 4) = (1 << slot); - if (fis[2] & ATA_S_ERROR) - p->is |= AHCI_P_IX_TFE; - p->tfd = tfd; + fis[0] = FIS_TYPE_SETDEVBITS; + fis[1] = (1 << 6); + fis[2] = tfd; + fis[3] = error; + if (fis[2] & ATA_S_ERROR) { + p->err_cfis[0] = slot; + p->err_cfis[2] = tfd; + p->err_cfis[3] = error; + memcpy(&p->err_cfis[4], cfis + 4, 16); + } else { + *(uint32_t *)(fis + 4) = (1 << slot); + p->sact &= ~(1 << slot); + } + p->tfd &= ~0x77; + p->tfd |= tfd; ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); } @@ -337,14 +416,32 @@ ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) fis[11] = cfis[11]; fis[12] = cfis[12]; fis[13] = cfis[13]; - if (fis[2] & ATA_S_ERROR) - p->is |= AHCI_P_IX_TFE; - else + if (fis[2] & ATA_S_ERROR) { + p->err_cfis[0] = 0x80; + p->err_cfis[2] = tfd & 0xff; + p->err_cfis[3] = error; + memcpy(&p->err_cfis[4], cfis + 4, 16); + } else p->ci &= ~(1 << slot); p->tfd = tfd; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } +static void +ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) +{ + uint8_t fis[20]; + + p->tfd = ATA_S_READY | ATA_S_DSC; + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[1] = 0; /* No interrupt */ + fis[2] = p->tfd; /* Status */ + fis[3] = 0; /* No error */ + p->ci &= ~(1 << slot); + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + static void ahci_write_reset_fis_d2h(struct ahci_port *p) { @@ -372,9 +469,11 @@ ahci_check_stopped(struct ahci_port *p) */ if (!(p->cmd & AHCI_P_CMD_ST)) { if (p->pending == 0) { + p->ccs = 0; p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); p->ci = 0; p->sact = 0; + p->waitforclear = 0; } } } @@ -385,7 +484,6 @@ ahci_port_stop(struct ahci_port *p) struct ahci_ioreq *aior; uint8_t *cfis; int slot; - int ncq; int error; assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); @@ -401,11 +499,9 @@ ahci_port_stop(struct ahci_port *p) slot = aior->slot; cfis = aior->cfis; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || - cfis[2] == ATA_READ_FPDMA_QUEUED) - ncq = 1; - - if (ncq) - p->sact &= ~(1 << slot); + cfis[2] == ATA_READ_FPDMA_QUEUED || + cfis[2] == ATA_SEND_FPDMA_QUEUED) + p->sact &= ~(1 << slot); /* NCQ */ else p->ci &= ~(1 << slot); @@ -431,7 +527,6 @@ ahci_port_stop(struct ahci_port *p) static void ahci_port_reset(struct ahci_port *pr) { - pr->sctl = 0; pr->serr = 0; pr->sact = 0; pr->xfermode = ATA_UDMA6; @@ -443,8 +538,11 @@ ahci_port_reset(struct ahci_port *pr) pr->tfd = 0x7F; return; } - pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_SPD_GEN2 | - ATA_SS_IPM_ACTIVE; + pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; + if (pr->sctl & ATA_SC_SPD_MASK) + pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); + else + pr->ssts |= ATA_SS_SPD_GEN3; pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; if (!pr->atapi) { pr->sig = PxSIG_ATA; @@ -470,6 +568,10 @@ ahci_reset(struct pci_ahci_softc *sc) for (i = 0; i < sc->ports; i++) { 
sc->port[i].ie = 0; sc->port[i].is = 0; + sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); + if (sc->port[i].bctx) + sc->port[i].cmd |= AHCI_P_CMD_CPS; + sc->port[i].sctl = 0; ahci_port_reset(&sc->port[i]); } } @@ -500,32 +602,87 @@ atapi_string(uint8_t *dest, const char *src, int len) } } +/* + * Build up the iovec based on the PRDT, 'done' and 'len'. + */ static void -ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done, - int seek) +ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, + struct ahci_prdt_entry *prdt, uint16_t prdtl) +{ + struct blockif_req *breq = &aior->io_req; + int i, j, skip, todo, left, extra; + uint32_t dbcsz; + + /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ + skip = aior->done; + left = aior->len - aior->done; + todo = 0; + for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; + i++, prdt++) { + dbcsz = (prdt->dbc & DBCMASK) + 1; + /* Skip already done part of the PRDT */ + if (dbcsz <= skip) { + skip -= dbcsz; + continue; + } + dbcsz -= skip; + if (dbcsz > left) + dbcsz = left; + breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), + prdt->dba + skip, dbcsz); + breq->br_iov[j].iov_len = dbcsz; + todo += dbcsz; + left -= dbcsz; + skip = 0; + j++; + } + + /* If we got limited by IOV length, round I/O down to sector size. */ + if (j == BLOCKIF_IOV_MAX) { + extra = todo % blockif_sectsz(p->bctx); + todo -= extra; + assert(todo > 0); + while (extra > 0) { + if (breq->br_iov[j - 1].iov_len > extra) { + breq->br_iov[j - 1].iov_len -= extra; + break; + } + extra -= breq->br_iov[j - 1].iov_len; + j--; + } + } + + breq->br_iovcnt = j; + breq->br_resid = todo; + aior->done += todo; + aior->more = (aior->done < aior->len && i < prdtl); +} + +static void +ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; - struct pci_ahci_softc *sc; struct ahci_prdt_entry *prdt; struct ahci_cmd_hdr *hdr; uint64_t lba; uint32_t len; - int i, err, iovcnt, ncq, readop; + int err, first, ncq, readop; - sc = p->pr_sc; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); ncq = 0; readop = 1; + first = (done == 0); - prdt += seek; - if (cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || - cfis[2] == ATA_WRITE_FPDMA_QUEUED) + if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || + cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || + cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || + cfis[2] == ATA_WRITE_FPDMA_QUEUED) readop = 0; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || - cfis[2] == ATA_READ_FPDMA_QUEUED) { + cfis[2] == ATA_READ_FPDMA_QUEUED) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | @@ -536,7 +693,9 @@ ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done, if (!len) len = 65536; ncq = 1; - } else if (cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { + } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || + cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || + cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | @@ -556,57 +715,33 @@ ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done, lba *= blockif_sectsz(p->bctx); len *= blockif_sectsz(p->bctx); - /* - * Pull request off free list - */ + /* Pull request off free list */ aior = 
STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; breq = &aior->io_req; breq->br_offset = lba + done; - iovcnt = hdr->prdtl - seek; - if (iovcnt > BLOCKIF_IOV_MAX) { - aior->prdtl = iovcnt - BLOCKIF_IOV_MAX; - iovcnt = BLOCKIF_IOV_MAX; - } else - aior->prdtl = 0; - breq->br_iovcnt = iovcnt; + ahci_build_iov(p, aior, prdt, hdr->prdtl); - /* - * Mark this command in-flight. - */ + /* Mark this command in-flight. */ p->pending |= 1 << slot; - /* - * Stuff request onto busy list - */ + /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); - /* - * Build up the iovec based on the prdt - */ - for (i = 0; i < iovcnt; i++) { - uint32_t dbcsz; + if (ncq && first) + ahci_write_fis_d2h_ncq(p, slot); - dbcsz = (prdt->dbc & DBCMASK) + 1; - breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc), - prdt->dba, dbcsz); - breq->br_iov[i].iov_len = dbcsz; - aior->done += dbcsz; - prdt++; - } if (readop) err = blockif_read(p->bctx, breq); else err = blockif_write(p->bctx, breq); assert(err == 0); - - if (ncq) - p->ci &= ~(1 << slot); } static void @@ -626,7 +761,7 @@ ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) aior->slot = slot; aior->len = 0; aior->done = 0; - aior->prdtl = 0; + aior->more = 0; breq = &aior->io_req; /* @@ -643,6 +778,120 @@ ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) assert(err == 0); } +static inline void +read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, + void *buf, int size) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + void *to; + int i, len; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + len = size; + to = buf; + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + for (i = 0; i < hdr->prdtl && len; i++) { + uint8_t *ptr; + uint32_t dbcsz; + int sublen; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); + sublen = MIN(len, dbcsz); + memcpy(to, ptr, sublen); + len -= sublen; + to += sublen; + prdt++; + } +} + +static void +ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + uint8_t *entry; + uint64_t elba; + uint32_t len, elen; + int err, first, ncq; + uint8_t buf[512]; + + first = (done == 0); + if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { + len = (uint16_t)cfis[13] << 8 | cfis[12]; + len *= 512; + ncq = 0; + } else { /* ATA_SEND_FPDMA_QUEUED */ + len = (uint16_t)cfis[11] << 8 | cfis[3]; + len *= 512; + ncq = 1; + } + read_prdt(p, slot, cfis, buf, sizeof(buf)); + +next: + entry = &buf[done]; + elba = ((uint64_t)entry[5] << 40) | + ((uint64_t)entry[4] << 32) | + ((uint64_t)entry[3] << 24) | + ((uint64_t)entry[2] << 16) | + ((uint64_t)entry[1] << 8) | + entry[0]; + elen = (uint16_t)entry[7] << 8 | entry[6]; + done += 8; + if (elen == 0) { + if (done >= len) { + if (ncq) { + if (first) + ahci_write_fis_d2h_ncq(p, slot); + ahci_write_fis_sdb(p, slot, cfis, + ATA_S_READY | ATA_S_DSC); + } else { + ahci_write_fis_d2h(p, slot, cfis, + ATA_S_READY | ATA_S_DSC); + } + p->pending &= ~(1 << slot); + ahci_check_stopped(p); + if (!first) + ahci_handle_port(p); + return; + } + goto next; + } + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + aior->more = (len != 
done); + + breq = &aior->io_req; + breq->br_offset = elba * blockif_sectsz(p->bctx); + breq->br_resid = elen * blockif_sectsz(p->bctx); + + /* + * Mark this command in-flight. + */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + if (ncq && first) + ahci_write_fis_d2h_ncq(p, slot); + + err = blockif_delete(p->bctx, breq); + assert(err == 0); +} + static inline void write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, int size) @@ -663,7 +912,7 @@ write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); - sublen = len < dbcsz ? len : dbcsz; + sublen = MIN(len, dbcsz); memcpy(ptr, from, sublen); len -= sublen; from += sublen; @@ -672,6 +921,58 @@ write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, hdr->prdbc = size - len; } +static void +ahci_checksum(uint8_t *buf, int size) +{ + int i; + uint8_t sum = 0; + + for (i = 0; i < size - 1; i++) + sum += buf[i]; + buf[size - 1] = 0x100 - sum; +} + +static void +ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_cmd_hdr *hdr; + uint32_t buf[128]; + uint8_t *buf8 = (uint8_t *)buf; + uint16_t *buf16 = (uint16_t *)buf; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 || + cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + return; + } + + memset(buf, 0, sizeof(buf)); + if (cfis[4] == 0x00) { /* Log directory */ + buf16[0x00] = 1; /* Version -- 1 */ + buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */ + buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */ + } else if (cfis[4] == 0x10) { /* NCQ Command Error Log */ + memcpy(buf8, p->err_cfis, sizeof(p->err_cfis)); + ahci_checksum(buf8, sizeof(buf)); + } else if (cfis[4] == 0x13) { /* SATA NCQ Send and Receive Log */ + if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) { + buf[0x00] = 1; /* SFQ DSM supported */ + buf[0x01] = 1; /* SFQ DSM TRIM supported */ + } + } else { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + return; + } + + if (cfis[2] == ATA_READ_LOG_EXT) + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); +} + static void handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) { @@ -679,82 +980,116 @@ handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0) { - p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - p->is |= AHCI_P_IX_TFE; + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; uint64_t sectors; + int sectsz, psectsz, psectoff, candelete, ro; uint16_t cyl; uint8_t sech, heads; - sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + ro = blockif_is_ro(p->bctx); + candelete = blockif_candelete(p->bctx); + sectsz = blockif_sectsz(p->bctx); + sectors = blockif_size(p->bctx) / sectsz; blockif_chs(p->bctx, &cyl, &heads, &sech); + blockif_psectsz(p->bctx, &psectsz, &psectoff); memset(buf, 0, sizeof(buf)); buf[0] = 0x0040; buf[1] = cyl; buf[3] = heads; buf[6] = sech; - /* TODO emulate different serial? 
*/ - ata_string((uint8_t *)(buf+10), "123456", 20); + ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40); buf[47] = (0x8000 | 128); - buf[48] = 0x1; + buf[48] = 0; buf[49] = (1 << 8 | 1 << 9 | 1 << 11); buf[50] = (1 << 14); buf[53] = (1 << 1 | 1 << 2); if (p->mult_sectors) buf[59] = (0x100 | p->mult_sectors); - buf[60] = sectors; - buf[61] = (sectors >> 16); + if (sectors <= 0x0fffffff) { + buf[60] = sectors; + buf[61] = (sectors >> 16); + } else { + buf[60] = 0xffff; + buf[61] = 0x0fff; + } buf[63] = 0x7; if (p->xfermode & ATA_WDMA0) buf[63] |= (1 << ((p->xfermode & 7) + 8)); buf[64] = 0x3; - buf[65] = 100; - buf[66] = 100; - buf[67] = 100; - buf[68] = 100; + buf[65] = 120; + buf[66] = 120; + buf[67] = 120; + buf[68] = 120; + buf[69] = 0; buf[75] = 31; - buf[76] = (1 << 8 | 1 << 2); - buf[80] = 0x1f0; + buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | + ATA_SUPPORT_NCQ); + buf[77] = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | + (p->ssts & ATA_SS_SPD_MASK) >> 3); + buf[80] = 0x3f0; buf[81] = 0x28; - buf[82] = (1 << 5 | 1 << 14); - buf[83] = (1 << 10 | 1 << 12 | 1 << 13 | 1 << 14); + buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| + ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); + buf[83] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | + ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); buf[84] = (1 << 14); - buf[85] = (1 << 5 | 1 << 14); - buf[86] = (1 << 10 | 1 << 12 | 1 << 13); + buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| + ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); + buf[86] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | + ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); buf[87] = (1 << 14); buf[88] = 0x7f; if (p->xfermode & ATA_UDMA0) buf[88] |= (1 << ((p->xfermode & 7) + 8)); - buf[93] = (1 | 1 <<14); buf[100] = sectors; buf[101] = (sectors >> 16); buf[102] = (sectors >> 32); buf[103] = (sectors >> 48); + if (candelete && !ro) { + buf[69] |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; + buf[105] = 1; + buf[169] = ATA_SUPPORT_DSM_TRIM; + } + buf[106] = 0x4000; + buf[209] = 0x4000; + if (psectsz > sectsz) { + buf[106] |= 0x2000; + buf[106] |= ffsl(psectsz / sectsz) - 1; + buf[209] |= (psectoff / sectsz); + } + if (sectsz > 512) { + buf[106] |= 0x1000; + buf[117] = sectsz / 2; + buf[118] = ((sectsz / 2) >> 16); + } + buf[119] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); + buf[120] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); + buf[222] = 0x1020; + buf[255] = 0x00a5; + ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); - p->tfd = ATA_S_DSC | ATA_S_READY; - p->is |= AHCI_P_IX_DP; - p->ci &= ~(1 << slot); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } - ahci_generate_intr(p->pr_sc); } static void handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) { if (!p->atapi) { - p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - p->is |= AHCI_P_IX_TFE; + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; memset(buf, 0, sizeof(buf)); buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5); - /* TODO emulate different serial? 
*/ - ata_string((uint8_t *)(buf+10), "123456", 20); + ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40); buf[49] = (1 << 9 | 1 << 8); @@ -762,27 +1097,34 @@ handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) buf[53] = (1 << 2 | 1 << 1); buf[62] = 0x3f; buf[63] = 7; + if (p->xfermode & ATA_WDMA0) + buf[63] |= (1 << ((p->xfermode & 7) + 8)); buf[64] = 3; - buf[65] = 100; - buf[66] = 100; - buf[67] = 100; - buf[68] = 100; - buf[76] = (1 << 2 | 1 << 1); + buf[65] = 120; + buf[66] = 120; + buf[67] = 120; + buf[68] = 120; + buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); + buf[77] = ((p->ssts & ATA_SS_SPD_MASK) >> 3); buf[78] = (1 << 5); - buf[80] = (0x1f << 4); - buf[82] = (1 << 4); + buf[80] = 0x3f0; + buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | + ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[83] = (1 << 14); buf[84] = (1 << 14); - buf[85] = (1 << 4); + buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | + ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[87] = (1 << 14); - buf[88] = (1 << 14 | 0x7f); + buf[88] = 0x7f; + if (p->xfermode & ATA_UDMA0) + buf[88] |= (1 << ((p->xfermode & 7) + 8)); + buf[222] = 0x1020; + buf[255] = 0x00a5; + ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); - p->tfd = ATA_S_DSC | ATA_S_READY; - p->is |= AHCI_P_IX_DHR; - p->ci &= ~(1 << slot); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } - ahci_generate_intr(p->pr_sc); } static void @@ -791,22 +1133,41 @@ atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) uint8_t buf[36]; uint8_t *acmd; int len; + uint32_t tfd; acmd = cfis + 0x40; - buf[0] = 0x05; - buf[1] = 0x80; - buf[2] = 0x00; - buf[3] = 0x21; - buf[4] = 31; - buf[5] = 0; - buf[6] = 0; - buf[7] = 0; - atapi_string(buf + 8, "BHYVE", 8); - atapi_string(buf + 16, "BHYVE DVD-ROM", 16); - atapi_string(buf + 32, "001", 4); - - len = sizeof(buf); + if (acmd[1] & 1) { /* VPD */ + if (acmd[2] == 0) { /* Supported VPD pages */ + buf[0] = 0x05; + buf[1] = 0; + buf[2] = 0; + buf[3] = 1; + buf[4] = 0; + len = 4 + buf[3]; + } else { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + return; + } + } else { + buf[0] = 0x05; + buf[1] = 0x80; + buf[2] = 0x00; + buf[3] = 0x21; + buf[4] = 31; + buf[5] = 0; + buf[6] = 0; + buf[7] = 0; + atapi_string(buf + 8, "BHYVE", 8); + atapi_string(buf + 16, "BHYVE DVD-ROM", 16); + atapi_string(buf + 32, "001", 4); + len = sizeof(buf); + } + if (len > acmd[4]) len = acmd[4]; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; @@ -918,10 +1279,9 @@ atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) { int msf, size; uint64_t sectors; - uint8_t start_track, *bp, buf[50]; + uint8_t *bp, buf[50]; msf = (acmd[1] >> 1) & 1; - start_track = acmd[6]; bp = buf + 2; *bp++ = 1; *bp++ = 1; @@ -1010,25 +1370,34 @@ atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) } static void -atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, - uint32_t done, int seek) +atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[16]; + + memset(buf, 0, sizeof(buf)); + buf[3] = 8; + + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + 
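+/*
+ * Service an ATAPI READ(10)/READ(12): decode the big-endian LBA and
+ * transfer length from the CDB, scale both to the 2048-byte CD sector
+ * size, and queue an asynchronous blockif read with the PRDT mapped
+ * into the request's iovec by ahci_build_iov().
+ */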
+static void +atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; struct blockif_req *breq; - struct pci_ahci_softc *sc; uint8_t *acmd; uint64_t lba; uint32_t len; - int i, err, iovcnt; + int err; - sc = p->pr_sc; acmd = cfis + 0x40; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); prdt = (struct ahci_prdt_entry *)(cfis + 0x80); - prdt += seek; lba = be32dec(acmd + 2); if (acmd[0] == READ_10) len = be16dec(acmd + 7); @@ -1053,37 +1422,14 @@ atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, aior->done = done; breq = &aior->io_req; breq->br_offset = lba + done; - iovcnt = hdr->prdtl - seek; - if (iovcnt > BLOCKIF_IOV_MAX) { - aior->prdtl = iovcnt - BLOCKIF_IOV_MAX; - iovcnt = BLOCKIF_IOV_MAX; - } else - aior->prdtl = 0; - breq->br_iovcnt = iovcnt; + ahci_build_iov(p, aior, prdt, hdr->prdtl); - /* - * Mark this command in-flight. - */ + /* Mark this command in-flight. */ p->pending |= 1 << slot; - /* - * Stuff request onto busy list - */ + /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); - /* - * Build up the iovec based on the prdt - */ - for (i = 0; i < iovcnt; i++) { - uint32_t dbcsz; - - dbcsz = (prdt->dbc & DBCMASK) + 1; - breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc), - prdt->dba, dbcsz); - breq->br_iov[i].iov_len = dbcsz; - aior->done += dbcsz; - prdt++; - } err = blockif_read(p->bctx, breq); assert(err == 0); } @@ -1137,7 +1483,7 @@ static void atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; - uint32_t tfd; + uint32_t tfd = 0; uint8_t pc, code; int len; @@ -1278,9 +1624,12 @@ handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) case READ_TOC: atapi_read_toc(p, slot, cfis); break; + case REPORT_LUNS: + atapi_report_luns(p, slot, cfis); + break; case READ_10: case READ_12: - atapi_read(p, slot, cfis, 0, 0); + atapi_read(p, slot, cfis, 0); break; case REQUEST_SENSE: atapi_request_sense(p, slot, cfis); @@ -1308,6 +1657,7 @@ static void ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { + p->tfd |= ATA_S_BUSY; switch (cfis[2]) { case ATA_ATA_IDENTIFY: handle_identify(p, slot, cfis); @@ -1363,28 +1713,68 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) p->mult_sectors = cfis[12]; p->tfd = ATA_S_DSC | ATA_S_READY; } - p->is |= AHCI_P_IX_DP; - p->ci &= ~(1 << slot); - ahci_generate_intr(p->pr_sc); + ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; + case ATA_READ: + case ATA_WRITE: + case ATA_READ48: + case ATA_WRITE48: + case ATA_READ_MUL: + case ATA_WRITE_MUL: + case ATA_READ_MUL48: + case ATA_WRITE_MUL48: case ATA_READ_DMA: case ATA_WRITE_DMA: case ATA_READ_DMA48: case ATA_WRITE_DMA48: case ATA_READ_FPDMA_QUEUED: case ATA_WRITE_FPDMA_QUEUED: - ahci_handle_dma(p, slot, cfis, 0, 0); + ahci_handle_rw(p, slot, cfis, 0); break; case ATA_FLUSHCACHE: case ATA_FLUSHCACHE48: ahci_handle_flush(p, slot, cfis); break; - case ATA_STANDBY_CMD: + case ATA_DATA_SET_MANAGEMENT: + if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && + cfis[13] == 0 && cfis[12] == 1) { + ahci_handle_dsm_trim(p, slot, cfis, 0); + break; + } + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_SEND_FPDMA_QUEUED: + if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && + cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && + cfis[11] == 0 && cfis[3] == 1) { + ahci_handle_dsm_trim(p, slot, cfis, 0); + break; + } + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT 
<< 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_READ_LOG_EXT: + case ATA_READ_LOG_DMA_EXT: + ahci_handle_read_log(p, slot, cfis); break; + case ATA_SECURITY_FREEZE_LOCK: + case ATA_SMART_CMD: case ATA_NOP: + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_CHECK_POWER_MODE: + cfis[12] = 0xff; /* always on */ + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case ATA_STANDBY_CMD: case ATA_STANDBY_IMMEDIATE: + case ATA_IDLE_CMD: case ATA_IDLE_IMMEDIATE: case ATA_SLEEP: + case ATA_READ_VERIFY: + case ATA_READ_VERIFY48: ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_ATAPI_IDENTIFY: @@ -1392,17 +1782,15 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) break; case ATA_PACKET_CMD: if (!p->atapi) { - p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - p->is |= AHCI_P_IX_TFE; - ahci_generate_intr(p->pr_sc); + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else handle_packet_cmd(p, slot, cfis); break; default: WPRINTF("Unsupported cmd:%02x\n", cfis[2]); - p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - p->is |= AHCI_P_IX_TFE; - ahci_generate_intr(p->pr_sc); + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; } } @@ -1411,19 +1799,25 @@ static void ahci_handle_slot(struct ahci_port *p, int slot) { struct ahci_cmd_hdr *hdr; +#ifdef AHCI_DEBUG struct ahci_prdt_entry *prdt; +#endif struct pci_ahci_softc *sc; uint8_t *cfis; - int cfl; +#ifdef AHCI_DEBUG + int cfl, i; +#endif sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); +#ifdef AHCI_DEBUG cfl = (hdr->flags & 0x1f) * 4; +#endif cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); +#ifdef AHCI_DEBUG prdt = (struct ahci_prdt_entry *)(cfis + 0x80); -#ifdef AHCI_DEBUG DPRINTF("\ncfis:"); for (i = 0; i < cfl; i++) { if (i % 10 == 0) @@ -1459,20 +1853,23 @@ ahci_handle_slot(struct ahci_port *p, int slot) static void ahci_handle_port(struct ahci_port *p) { - int i; if (!(p->cmd & AHCI_P_CMD_ST)) return; /* * Search for any new commands to issue ignoring those that - * are already in-flight. + * are already in-flight. Stop if device is busy or in error. 
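The issue loop that follows resumes scanning from the saved command-currently-selected slot (p->ccs) instead of restarting at slot 0 each pass, so all 32 slots are serviced fairly and the CCS field stays accurate. A minimal standalone sketch of that scan, with hypothetical names (issue_mask, in_flight, start) standing in for p->ci, p->pending and p->ccs:

static int
next_slot_round_robin(uint32_t issue_mask, uint32_t in_flight, int start)
{
        int i, slot;

        for (i = 0; i < 32; i++) {
                slot = (start + i) & 31;
                if ((issue_mask & ~in_flight & (1U << slot)) != 0)
                        return (slot);
        }
        return (-1);    /* nothing ready to issue */
}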
*/ - for (i = 0; (i < 32) && p->ci; i++) { - if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) { + for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { + if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) + break; + if (p->waitforclear) + break; + if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { p->cmd &= ~AHCI_P_CMD_CCS_MASK; - p->cmd |= i << AHCI_P_CMD_CCS_SHIFT; - ahci_handle_slot(p, i); + p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; + ahci_handle_slot(p, p->ccs); } } } @@ -1490,22 +1887,26 @@ ata_ioreq_cb(struct blockif_req *br, int err) struct pci_ahci_softc *sc; uint32_t tfd; uint8_t *cfis; - int pending, slot, ncq; + int slot, ncq, dsm; DPRINTF("%s %d\n", __func__, err); - ncq = 0; + ncq = dsm = 0; aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; - pending = aior->prdtl; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || - cfis[2] == ATA_READ_FPDMA_QUEUED) + cfis[2] == ATA_READ_FPDMA_QUEUED || + cfis[2] == ATA_SEND_FPDMA_QUEUED) ncq = 1; + if (cfis[2] == ATA_DATA_SET_MANAGEMENT || + (cfis[2] == ATA_SEND_FPDMA_QUEUED && + (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) + dsm = 1; pthread_mutex_lock(&sc->mtx); @@ -1519,29 +1920,24 @@ ata_ioreq_cb(struct blockif_req *br, int err) */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); - if (pending && !err) { - ahci_handle_dma(p, slot, cfis, aior->done, - hdr->prdtl - pending); + if (!err) + hdr->prdbc = aior->done; + + if (!err && aior->more) { + if (dsm) + ahci_handle_dsm_trim(p, slot, cfis, aior->done); + else + ahci_handle_rw(p, slot, cfis, aior->done); goto out; } - if (!err && aior->done == aior->len) { + if (!err) tfd = ATA_S_READY | ATA_S_DSC; - if (ncq) - hdr->prdbc = 0; - else - hdr->prdbc = aior->len; - } else { + else tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - hdr->prdbc = 0; - if (ncq) - p->serr |= (1 << slot); - } - - if (ncq) { - p->sact &= ~(1 << slot); - ahci_write_fis_sdb(p, slot, tfd); - } else + if (ncq) + ahci_write_fis_sdb(p, slot, cfis, tfd); + else ahci_write_fis_d2h(p, slot, cfis, tfd); /* @@ -1550,6 +1946,7 @@ ata_ioreq_cb(struct blockif_req *br, int err) p->pending &= ~(1 << slot); ahci_check_stopped(p); + ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit\n", __func__); @@ -1564,7 +1961,7 @@ atapi_ioreq_cb(struct blockif_req *br, int err) struct pci_ahci_softc *sc; uint8_t *cfis; uint32_t tfd; - int pending, slot; + int slot; DPRINTF("%s %d\n", __func__, err); @@ -1572,7 +1969,6 @@ atapi_ioreq_cb(struct blockif_req *br, int err) p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; - pending = aior->prdtl; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); @@ -1588,21 +1984,21 @@ atapi_ioreq_cb(struct blockif_req *br, int err) */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); - if (pending && !err) { - atapi_read(p, slot, cfis, aior->done, hdr->prdtl - pending); + if (!err) + hdr->prdbc = aior->done; + + if (!err && aior->more) { + atapi_read(p, slot, cfis, aior->done); goto out; } - if (!err && aior->done == aior->len) { + if (!err) { tfd = ATA_S_READY | ATA_S_DSC; - hdr->prdbc = aior->len; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x21; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; - hdr->prdbc = 0; } - cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); @@ -1612,6 +2008,7 @@ atapi_ioreq_cb(struct blockif_req *br, int err) p->pending &= ~(1 << slot); ahci_check_stopped(p); + 
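Both completion callbacks in this hunk share one continuation pattern: a transfer whose PRDT needs more vectors than blockif allows is issued in chunks, and the callback re-queues the remainder (aior->more) before any status FIS is posted. A minimal sketch of that shape, with a hypothetical issue_chunk() standing in for ahci_handle_rw()/atapi_read():

struct xfer {
        uint32_t done;  /* bytes moved so far */
        int      more;  /* nonzero while chunks remain */
};

static void issue_chunk(struct xfer *, uint32_t offset);

static void
xfer_done_cb(struct xfer *x, int err)
{
        if (err == 0 && x->more) {
                /* re-queue the rest; completion is reported later */
                issue_chunk(x, x->done);
                return;
        }
        /* final completion: post the D2H or SDB FIS here */
}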
ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit\n", __func__); @@ -1669,15 +2066,23 @@ pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) break; case AHCI_P_IS: p->is &= ~value; + ahci_port_intr(p); break; case AHCI_P_IE: p->ie = value & 0xFDC000FF; - ahci_generate_intr(sc); + ahci_port_intr(p); break; case AHCI_P_CMD: { - p->cmd = value; - + p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | + AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | + AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | + AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); + p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | + AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | + AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | + AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; + if (!(value & AHCI_P_CMD_ST)) { ahci_port_stop(p); } else { @@ -1701,10 +2106,14 @@ pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) } if (value & AHCI_P_CMD_CLO) { - p->tfd = 0; + p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ); p->cmd &= ~AHCI_P_CMD_CLO; } + if (value & AHCI_P_CMD_ICC_MASK) { + p->cmd &= ~AHCI_P_CMD_ICC_MASK; + } + ahci_handle_port(p); break; } @@ -1714,10 +2123,10 @@ pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"\n", offset); break; case AHCI_P_SCTL: + p->sctl = value; if (!(p->cmd & AHCI_P_CMD_ST)) { if (value & ATA_SC_DET_RESET) ahci_port_reset(p); - p->sctl = value; } break; case AHCI_P_SERR: @@ -1751,16 +2160,19 @@ pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n", offset); break; case AHCI_GHC: - if (value & AHCI_GHC_HR) + if (value & AHCI_GHC_HR) { ahci_reset(sc); - else if (value & AHCI_GHC_IE) { - sc->ghc |= AHCI_GHC_IE; - ahci_generate_intr(sc); + break; } + if (value & AHCI_GHC_IE) + sc->ghc |= AHCI_GHC_IE; + else + sc->ghc &= ~AHCI_GHC_IE; + ahci_generate_intr(sc, 0xffffffff); break; case AHCI_IS: sc->is &= ~value; - ahci_generate_intr(sc); + ahci_generate_intr(sc, value); break; default: break; @@ -1774,7 +2186,7 @@ pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, struct pci_ahci_softc *sc = pi->pi_arg; assert(baridx == 5); - assert(size == 4); + assert((offset % 4) == 0 && size == 4); pthread_mutex_lock(&sc->mtx); @@ -1863,24 +2275,29 @@ pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) static uint64_t pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, - uint64_t offset, int size) + uint64_t regoff, int size) { struct pci_ahci_softc *sc = pi->pi_arg; + uint64_t offset; uint32_t value; assert(baridx == 5); - assert(size == 4); + assert(size == 1 || size == 2 || size == 4); + assert((regoff & (size - 1)) == 0); pthread_mutex_lock(&sc->mtx); + offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */ if (offset < AHCI_OFFSET) value = pci_ahci_host_read(sc, offset); else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) value = pci_ahci_port_read(sc, offset); else { value = 0; - WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", offset); + WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", + regoff); } + value >>= 8 * (regoff & 0x3); pthread_mutex_unlock(&sc->mtx); @@ -1890,18 +2307,16 @@ pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, static int pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) { - 
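A detail worth noting from pci_ahci_read() above: it now honors 1-, 2- and 4-byte accesses by reading the naturally aligned 32-bit register and shifting the addressed bytes down. A minimal sketch of that narrowing, assuming a hypothetical reg32_read() that returns the aligned container:

static uint32_t reg32_read(uint64_t offset);

static uint32_t
narrow_read(uint64_t regoff, int size)
{
        uint32_t value;

        value = reg32_read(regoff & ~0x3UL);    /* aligned 32-bit register */
        value >>= 8 * (regoff & 0x3);           /* move selected bytes down */
        if (size < 4)
                value &= (1U << (size * 8)) - 1;
        return (value);
}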
char bident[sizeof("XX:X:X")]; + char bident[sizeof("XX:XX:XX")]; struct blockif_ctxt *bctxt; struct pci_ahci_softc *sc; - int ret, slots; + int ret, slots, p; + MD5_CTX mdctx; + u_char digest[16]; + char *next, *next2; ret = 0; - if (opts == NULL) { - fprintf(stderr, "pci_ahci: backing device required\n"); - return (1); - } - #ifdef AHCI_DEBUG dbg = fopen("/tmp/log", "w+"); #endif @@ -1909,48 +2324,96 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) sc = calloc(1, sizeof(struct pci_ahci_softc)); pi->pi_arg = sc; sc->asc_pi = pi; - sc->ports = MAX_PORTS; + pthread_mutex_init(&sc->mtx, NULL); + sc->ports = 0; + sc->pi = 0; + slots = 32; + + for (p = 0; p < MAX_PORTS && opts != NULL; p++, opts = next) { + /* Identify and cut off type of present port. */ + if (strncmp(opts, "hd:", 3) == 0) { + atapi = 0; + opts += 3; + } else if (strncmp(opts, "cd:", 3) == 0) { + atapi = 1; + opts += 3; + } - /* - * Only use port 0 for a backing device. All other ports will be - * marked as unused - */ - sc->port[0].atapi = atapi; + /* Find and cut off the next port options. */ + next = strstr(opts, ",hd:"); + next2 = strstr(opts, ",cd:"); + if (next == NULL || (next2 != NULL && next2 < next)) + next = next2; + if (next != NULL) { + next[0] = 0; + next++; + } - /* - * Attempt to open the backing image. Use the PCI - * slot/func for the identifier string. - */ - snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); - bctxt = blockif_open(opts, bident); - if (bctxt == NULL) { - ret = 1; - goto open_fail; - } - sc->port[0].bctx = bctxt; - sc->port[0].pr_sc = sc; + if (opts[0] == 0) + continue; - /* - * Allocate blockif request structures and add them - * to the free list - */ - pci_ahci_ioreq_init(&sc->port[0]); + /* + * Attempt to open the backing image. Use the PCI slot/func + * and the port number for the identifier string. + */ + snprintf(bident, sizeof(bident), "%d:%d:%d", pi->pi_slot, + pi->pi_func, p); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { + sc->ports = p; + ret = 1; + goto open_fail; + } + sc->port[p].bctx = bctxt; + sc->port[p].pr_sc = sc; + sc->port[p].port = p; + sc->port[p].atapi = atapi; + +#ifndef __FreeBSD__ + /* + * Attempt to enable the write cache for this device, as the + * guest will issue FLUSH commands when it requires durability. + * + * Failure here is fine, since an always-sync device will not + * have an impact on correctness. + */ + (void) blockif_set_wce(bctxt, 1); +#endif - pthread_mutex_init(&sc->mtx, NULL); + /* + * Create an identifier for the backing file. 
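The option parser in this loop peels one "hd:" or "cd:" device off the front of a comma-separated list each iteration. A condensed sketch of the split step (assuming <string.h> and a writable opts buffer), returning the remainder or NULL:

static char *
split_next_port(char *opts)
{
        char *next, *next2;

        next = strstr(opts, ",hd:");
        next2 = strstr(opts, ",cd:");
        if (next == NULL || (next2 != NULL && next2 < next))
                next = next2;   /* whichever marker comes first */
        if (next != NULL) {
                *next = '\0';   /* terminate current port's options */
                next++;         /* remainder begins at "hd:"/"cd:" */
        }
        return (next);
}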
+ * Use parts of the md5 sum of the filename + */ + MD5Init(&mdctx); + MD5Update(&mdctx, opts, strlen(opts)); + MD5Final(digest, &mdctx); + snprintf(sc->port[p].ident, AHCI_PORT_IDENT, + "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], + digest[5]); + + /* + * Allocate blockif request structures and add them + * to the free list + */ + pci_ahci_ioreq_init(&sc->port[p]); + + sc->pi |= (1 << p); + if (sc->port[p].ioqsz < slots) + slots = sc->port[p].ioqsz; + } + sc->ports = p; /* Intel ICH8 AHCI */ - slots = sc->port[0].ioqsz; - if (slots > 32) - slots = 32; --slots; + if (sc->ports < DEF_PORTS) + sc->ports = DEF_PORTS; sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); - /* Only port 0 implemented */ - sc->pi = 1; sc->vs = 0x10300; sc->cap2 = AHCI_CAP2_APST; ahci_reset(sc); @@ -1960,7 +2423,9 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); - pci_emul_add_msicap(pi, 1); + p = MIN(sc->ports, 16); + p = flsl(p) - ((p & (p - 1)) ? 0 : 1); + pci_emul_add_msicap(pi, 1 << p); pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, AHCI_OFFSET + sc->ports * AHCI_STEP); @@ -1968,7 +2433,10 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) open_fail: if (ret) { - blockif_close(sc->port[0].bctx); + for (p = 0; p < sc->ports; p++) { + if (sc->port[p].bctx != NULL) + blockif_close(sc->port[p].bctx); + } free(sc); } @@ -1992,6 +2460,14 @@ pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) /* * Use separate emulation names to distinguish drive and atapi devices */ +struct pci_devemu pci_de_ahci = { + .pe_emu = "ahci", + .pe_init = pci_ahci_hd_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; +PCI_EMUL_SET(pci_de_ahci); + struct pci_devemu pci_de_ahci_hd = { .pe_emu = "ahci-hd", .pe_init = pci_ahci_hd_init, diff --git a/usr/src/cmd/bhyve/pci_e82545.c b/usr/src/cmd/bhyve/pci_e82545.c new file mode 100644 index 0000000000..e211b5cf9c --- /dev/null +++ b/usr/src/cmd/bhyve/pci_e82545.c @@ -0,0 +1,2418 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Alexander Motin + * Copyright (c) 2015 Peter Grehan + * Copyright (c) 2013 Jeremiah Lott, Avere Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#ifndef __FreeBSD__ +#include +#endif + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "e1000_regs.h" +#include "e1000_defines.h" +#include "mii.h" + +#include "bhyverun.h" +#include "pci_emul.h" +#include "mevent.h" + +/* Hardware/register definitions XXX: move some to common code. */ +#define E82545_VENDOR_ID_INTEL 0x8086 +#define E82545_DEV_ID_82545EM_COPPER 0x100F +#define E82545_SUBDEV_ID 0x1008 + +#define E82545_REVISION_4 4 + +#define E82545_MDIC_DATA_MASK 0x0000FFFF +#define E82545_MDIC_OP_MASK 0x0c000000 +#define E82545_MDIC_IE 0x20000000 + +#define E82545_EECD_FWE_DIS 0x00000010 /* Flash writes disabled */ +#define E82545_EECD_FWE_EN 0x00000020 /* Flash writes enabled */ +#define E82545_EECD_FWE_MASK 0x00000030 /* Flash writes mask */ + +#define E82545_BAR_REGISTER 0 +#define E82545_BAR_REGISTER_LEN (128*1024) +#define E82545_BAR_FLASH 1 +#define E82545_BAR_FLASH_LEN (64*1024) +#define E82545_BAR_IO 2 +#define E82545_BAR_IO_LEN 8 + +#define E82545_IOADDR 0x00000000 +#define E82545_IODATA 0x00000004 +#define E82545_IO_REGISTER_MAX 0x0001FFFF +#define E82545_IO_FLASH_BASE 0x00080000 +#define E82545_IO_FLASH_MAX 0x000FFFFF + +#define E82545_ARRAY_ENTRY(reg, offset) (reg + (offset<<2)) +#define E82545_RAR_MAX 15 +#define E82545_MTA_MAX 127 +#define E82545_VFTA_MAX 127 + +/* Slightly modified from the driver versions, hardcoded for 3 opcode bits, + * followed by 6 address bits. + * TODO: make opcode bits and addr bits configurable? + * NVM Commands - Microwire */ +#define E82545_NVM_OPCODE_BITS 3 +#define E82545_NVM_ADDR_BITS 6 +#define E82545_NVM_DATA_BITS 16 +#define E82545_NVM_OPADDR_BITS (E82545_NVM_OPCODE_BITS + E82545_NVM_ADDR_BITS) +#define E82545_NVM_ADDR_MASK ((1 << E82545_NVM_ADDR_BITS)-1) +#define E82545_NVM_OPCODE_MASK \ + (((1 << E82545_NVM_OPCODE_BITS) - 1) << E82545_NVM_ADDR_BITS) +#define E82545_NVM_OPCODE_READ (0x6 << E82545_NVM_ADDR_BITS) /* read */ +#define E82545_NVM_OPCODE_WRITE (0x5 << E82545_NVM_ADDR_BITS) /* write */ +#define E82545_NVM_OPCODE_ERASE (0x7 << E82545_NVM_ADDR_BITS) /* erase */ +#define E82545_NVM_OPCODE_EWEN (0x4 << E82545_NVM_ADDR_BITS) /* wr-enable */ + +#define E82545_NVM_EEPROM_SIZE 64 /* 64 * 16-bit values == 128K */ + +#define E1000_ICR_SRPD 0x00010000 + +/* This is an arbitrary number. There is no hard limit on the chip. 
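With the Microwire layout defined below, every EEPROM transaction clocks in a 9-bit opcode-plus-address word before any data moves. A small decode sketch with the bit widths written out so the snippet stands alone (they mirror E82545_NVM_OPCODE_BITS and E82545_NVM_ADDR_BITS):

#define NVM_ADDR_BITS   6
#define NVM_ADDR_MASK   ((1 << NVM_ADDR_BITS) - 1)
#define NVM_OP_MASK     (0x7 << NVM_ADDR_BITS)
#define NVM_OP_READ     (0x6 << NVM_ADDR_BITS)

static int
nvm_is_read(uint16_t opaddr, uint16_t *addrp)
{
        *addrp = opaddr & NVM_ADDR_MASK;        /* low 6 bits: word address */
        return ((opaddr & NVM_OP_MASK) == NVM_OP_READ);
}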
*/ +#define I82545_MAX_TXSEGS 64 + +/* Legacy receive descriptor */ +struct e1000_rx_desc { + uint64_t buffer_addr; /* Address of the descriptor's data buffer */ + uint16_t length; /* Length of data DMAed into data buffer */ + uint16_t csum; /* Packet checksum */ + uint8_t status; /* Descriptor status */ + uint8_t errors; /* Descriptor Errors */ + uint16_t special; +}; + +/* Transmit descriptor types */ +#define E1000_TXD_MASK (E1000_TXD_CMD_DEXT | 0x00F00000) +#define E1000_TXD_TYP_L (0) +#define E1000_TXD_TYP_C (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C) +#define E1000_TXD_TYP_D (E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D) + +/* Legacy transmit descriptor */ +struct e1000_tx_desc { + uint64_t buffer_addr; /* Address of the descriptor's data buffer */ + union { + uint32_t data; + struct { + uint16_t length; /* Data buffer length */ + uint8_t cso; /* Checksum offset */ + uint8_t cmd; /* Descriptor control */ + } flags; + } lower; + union { + uint32_t data; + struct { + uint8_t status; /* Descriptor status */ + uint8_t css; /* Checksum start */ + uint16_t special; + } fields; + } upper; +}; + +/* Context descriptor */ +struct e1000_context_desc { + union { + uint32_t ip_config; + struct { + uint8_t ipcss; /* IP checksum start */ + uint8_t ipcso; /* IP checksum offset */ + uint16_t ipcse; /* IP checksum end */ + } ip_fields; + } lower_setup; + union { + uint32_t tcp_config; + struct { + uint8_t tucss; /* TCP checksum start */ + uint8_t tucso; /* TCP checksum offset */ + uint16_t tucse; /* TCP checksum end */ + } tcp_fields; + } upper_setup; + uint32_t cmd_and_length; + union { + uint32_t data; + struct { + uint8_t status; /* Descriptor status */ + uint8_t hdr_len; /* Header length */ + uint16_t mss; /* Maximum segment size */ + } fields; + } tcp_seg_setup; +}; + +/* Data descriptor */ +struct e1000_data_desc { + uint64_t buffer_addr; /* Address of the descriptor's buffer address */ + union { + uint32_t data; + struct { + uint16_t length; /* Data buffer length */ + uint8_t typ_len_ext; + uint8_t cmd; + } flags; + } lower; + union { + uint32_t data; + struct { + uint8_t status; /* Descriptor status */ + uint8_t popts; /* Packet Options */ + uint16_t special; + } fields; + } upper; +}; + +union e1000_tx_udesc { + struct e1000_tx_desc td; + struct e1000_context_desc cd; + struct e1000_data_desc dd; +}; + +/* Tx checksum info for a packet. */ +struct ck_info { + int ck_valid; /* ck_info is valid */ + uint8_t ck_start; /* start byte of cksum calcuation */ + uint8_t ck_off; /* offset of cksum insertion */ + uint16_t ck_len; /* length of cksum calc: 0 is to packet-end */ +}; + +/* + * Debug printf + */ +static int e82545_debug = 0; +#define DPRINTF(msg,params...) if (e82545_debug) fprintf(stderr, "e82545: " msg, params) +#define WPRINTF(msg,params...) 
fprintf(stderr, "e82545: " msg, params) + +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) + +/* s/w representation of the RAL/RAH regs */ +struct eth_uni { + int eu_valid; + int eu_addrsel; + struct ether_addr eu_eth; +}; + + +struct e82545_softc { + struct pci_devinst *esc_pi; + struct vmctx *esc_ctx; + struct mevent *esc_mevp; + struct mevent *esc_mevpitr; + pthread_mutex_t esc_mtx; + struct ether_addr esc_mac; + int esc_tapfd; + + /* General */ + uint32_t esc_CTRL; /* x0000 device ctl */ + uint32_t esc_FCAL; /* x0028 flow ctl addr lo */ + uint32_t esc_FCAH; /* x002C flow ctl addr hi */ + uint32_t esc_FCT; /* x0030 flow ctl type */ + uint32_t esc_VET; /* x0038 VLAN eth type */ + uint32_t esc_FCTTV; /* x0170 flow ctl tx timer */ + uint32_t esc_LEDCTL; /* x0E00 LED control */ + uint32_t esc_PBA; /* x1000 pkt buffer allocation */ + + /* Interrupt control */ + int esc_irq_asserted; + uint32_t esc_ICR; /* x00C0 cause read/clear */ + uint32_t esc_ITR; /* x00C4 intr throttling */ + uint32_t esc_ICS; /* x00C8 cause set */ + uint32_t esc_IMS; /* x00D0 mask set/read */ + uint32_t esc_IMC; /* x00D8 mask clear */ + + /* Transmit */ + union e1000_tx_udesc *esc_txdesc; + struct e1000_context_desc esc_txctx; + pthread_t esc_tx_tid; + pthread_cond_t esc_tx_cond; + int esc_tx_enabled; + int esc_tx_active; + uint32_t esc_TXCW; /* x0178 transmit config */ + uint32_t esc_TCTL; /* x0400 transmit ctl */ + uint32_t esc_TIPG; /* x0410 inter-packet gap */ + uint16_t esc_AIT; /* x0458 Adaptive Interframe Throttle */ + uint64_t esc_tdba; /* verified 64-bit desc table addr */ + uint32_t esc_TDBAL; /* x3800 desc table addr, low bits */ + uint32_t esc_TDBAH; /* x3804 desc table addr, hi 32-bits */ + uint32_t esc_TDLEN; /* x3808 # descriptors in bytes */ + uint16_t esc_TDH; /* x3810 desc table head idx */ + uint16_t esc_TDHr; /* internal read version of TDH */ + uint16_t esc_TDT; /* x3818 desc table tail idx */ + uint32_t esc_TIDV; /* x3820 intr delay */ + uint32_t esc_TXDCTL; /* x3828 desc control */ + uint32_t esc_TADV; /* x382C intr absolute delay */ + + /* L2 frame acceptance */ + struct eth_uni esc_uni[16]; /* 16 x unicast MAC addresses */ + uint32_t esc_fmcast[128]; /* Multicast filter bit-match */ + uint32_t esc_fvlan[128]; /* VLAN 4096-bit filter */ + + /* Receive */ + struct e1000_rx_desc *esc_rxdesc; + pthread_cond_t esc_rx_cond; + int esc_rx_enabled; + int esc_rx_active; + int esc_rx_loopback; + uint32_t esc_RCTL; /* x0100 receive ctl */ + uint32_t esc_FCRTL; /* x2160 flow cntl thresh, low */ + uint32_t esc_FCRTH; /* x2168 flow cntl thresh, hi */ + uint64_t esc_rdba; /* verified 64-bit desc table addr */ + uint32_t esc_RDBAL; /* x2800 desc table addr, low bits */ + uint32_t esc_RDBAH; /* x2804 desc table addr, hi 32-bits*/ + uint32_t esc_RDLEN; /* x2808 #descriptors */ + uint16_t esc_RDH; /* x2810 desc table head idx */ + uint16_t esc_RDT; /* x2818 desc table tail idx */ + uint32_t esc_RDTR; /* x2820 intr delay */ + uint32_t esc_RXDCTL; /* x2828 desc control */ + uint32_t esc_RADV; /* x282C intr absolute delay */ + uint32_t esc_RSRPD; /* x2C00 recv small packet detect */ + uint32_t esc_RXCSUM; /* x5000 receive cksum ctl */ + + /* IO Port register access */ + uint32_t io_addr; + + /* Shadow copy of MDIC */ + uint32_t mdi_control; + /* Shadow copy of EECD */ + uint32_t eeprom_control; + /* Latest NVM in/out */ + uint16_t nvm_data; + uint16_t nvm_opaddr; + /* stats */ + uint32_t missed_pkt_count; /* dropped for no room in rx queue */ + uint32_t pkt_rx_by_size[6]; + uint32_t 
pkt_tx_by_size[6]; + uint32_t good_pkt_rx_count; + uint32_t bcast_pkt_rx_count; + uint32_t mcast_pkt_rx_count; + uint32_t good_pkt_tx_count; + uint32_t bcast_pkt_tx_count; + uint32_t mcast_pkt_tx_count; + uint32_t oversize_rx_count; + uint32_t tso_tx_count; + uint64_t good_octets_rx; + uint64_t good_octets_tx; + uint64_t missed_octets; /* counts missed and oversized */ + + uint8_t nvm_bits:6; /* number of bits remaining in/out */ + uint8_t nvm_mode:2; +#define E82545_NVM_MODE_OPADDR 0x0 +#define E82545_NVM_MODE_DATAIN 0x1 +#define E82545_NVM_MODE_DATAOUT 0x2 + /* EEPROM data */ + uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE]; +}; + +static void e82545_reset(struct e82545_softc *sc, int dev); +static void e82545_rx_enable(struct e82545_softc *sc); +static void e82545_rx_disable(struct e82545_softc *sc); +#ifdef __FreeBSD__ +static void e82545_tap_callback(int fd, enum ev_type type, void *param); +#endif +static void e82545_tx_start(struct e82545_softc *sc); +static void e82545_tx_enable(struct e82545_softc *sc); +static void e82545_tx_disable(struct e82545_softc *sc); + +static inline int +e82545_size_stat_index(uint32_t size) +{ + if (size <= 64) { + return 0; + } else if (size >= 1024) { + return 5; + } else { + /* should be 1-4 */ + return (ffs(size) - 6); + } +} + +static void +e82545_init_eeprom(struct e82545_softc *sc) +{ + uint16_t checksum, i; + + /* mac addr */ + sc->eeprom_data[NVM_MAC_ADDR] = ((uint16_t)sc->esc_mac.octet[0]) | + (((uint16_t)sc->esc_mac.octet[1]) << 8); + sc->eeprom_data[NVM_MAC_ADDR+1] = ((uint16_t)sc->esc_mac.octet[2]) | + (((uint16_t)sc->esc_mac.octet[3]) << 8); + sc->eeprom_data[NVM_MAC_ADDR+2] = ((uint16_t)sc->esc_mac.octet[4]) | + (((uint16_t)sc->esc_mac.octet[5]) << 8); + + /* pci ids */ + sc->eeprom_data[NVM_SUB_DEV_ID] = E82545_SUBDEV_ID; + sc->eeprom_data[NVM_SUB_VEN_ID] = E82545_VENDOR_ID_INTEL; + sc->eeprom_data[NVM_DEV_ID] = E82545_DEV_ID_82545EM_COPPER; + sc->eeprom_data[NVM_VEN_ID] = E82545_VENDOR_ID_INTEL; + + /* fill in the checksum */ + checksum = 0; + for (i = 0; i < NVM_CHECKSUM_REG; i++) { + checksum += sc->eeprom_data[i]; + } + checksum = NVM_SUM - checksum; + sc->eeprom_data[NVM_CHECKSUM_REG] = checksum; + DPRINTF("eeprom checksum: 0x%x\r\n", checksum); +} + +static void +e82545_write_mdi(struct e82545_softc *sc, uint8_t reg_addr, + uint8_t phy_addr, uint32_t data) +{ + DPRINTF("Write mdi reg:0x%x phy:0x%x data: 0x%x\r\n", reg_addr, phy_addr, data); +} + +static uint32_t +e82545_read_mdi(struct e82545_softc *sc, uint8_t reg_addr, + uint8_t phy_addr) +{ + //DPRINTF("Read mdi reg:0x%x phy:0x%x\r\n", reg_addr, phy_addr); + switch (reg_addr) { + case PHY_STATUS: + return (MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS | + MII_SR_AUTONEG_COMPLETE); + case PHY_AUTONEG_ADV: + return NWAY_AR_SELECTOR_FIELD; + case PHY_LP_ABILITY: + return 0; + case PHY_1000T_STATUS: + return (SR_1000T_LP_FD_CAPS | SR_1000T_REMOTE_RX_STATUS | + SR_1000T_LOCAL_RX_STATUS); + case PHY_ID1: + return (M88E1011_I_PHY_ID >> 16) & 0xFFFF; + case PHY_ID2: + return (M88E1011_I_PHY_ID | E82545_REVISION_4) & 0xFFFF; + default: + DPRINTF("Unknown mdi read reg:0x%x phy:0x%x\r\n", reg_addr, phy_addr); + return 0; + } + /* not reached */ +} + +static void +e82545_eecd_strobe(struct e82545_softc *sc) +{ + /* Microwire state machine */ + /* + DPRINTF("eeprom state machine srtobe " + "0x%x 0x%x 0x%x 0x%x\r\n", + sc->nvm_mode, sc->nvm_bits, + sc->nvm_opaddr, sc->nvm_data);*/ + + if (sc->nvm_bits == 0) { + DPRINTF("eeprom state machine not expecting data! 
" + "0x%x 0x%x 0x%x 0x%x\r\n", + sc->nvm_mode, sc->nvm_bits, + sc->nvm_opaddr, sc->nvm_data); + return; + } + sc->nvm_bits--; + if (sc->nvm_mode == E82545_NVM_MODE_DATAOUT) { + /* shifting out */ + if (sc->nvm_data & 0x8000) { + sc->eeprom_control |= E1000_EECD_DO; + } else { + sc->eeprom_control &= ~E1000_EECD_DO; + } + sc->nvm_data <<= 1; + if (sc->nvm_bits == 0) { + /* read done, back to opcode mode. */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + } + } else if (sc->nvm_mode == E82545_NVM_MODE_DATAIN) { + /* shifting in */ + sc->nvm_data <<= 1; + if (sc->eeprom_control & E1000_EECD_DI) { + sc->nvm_data |= 1; + } + if (sc->nvm_bits == 0) { + /* eeprom write */ + uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; + uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; + if (op != E82545_NVM_OPCODE_WRITE) { + DPRINTF("Illegal eeprom write op 0x%x\r\n", + sc->nvm_opaddr); + } else if (addr >= E82545_NVM_EEPROM_SIZE) { + DPRINTF("Illegal eeprom write addr 0x%x\r\n", + sc->nvm_opaddr); + } else { + DPRINTF("eeprom write eeprom[0x%x] = 0x%x\r\n", + addr, sc->nvm_data); + sc->eeprom_data[addr] = sc->nvm_data; + } + /* back to opcode mode */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + } + } else if (sc->nvm_mode == E82545_NVM_MODE_OPADDR) { + sc->nvm_opaddr <<= 1; + if (sc->eeprom_control & E1000_EECD_DI) { + sc->nvm_opaddr |= 1; + } + if (sc->nvm_bits == 0) { + uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; + switch (op) { + case E82545_NVM_OPCODE_EWEN: + DPRINTF("eeprom write enable: 0x%x\r\n", + sc->nvm_opaddr); + /* back to opcode mode */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + break; + case E82545_NVM_OPCODE_READ: + { + uint16_t addr = sc->nvm_opaddr & + E82545_NVM_ADDR_MASK; + sc->nvm_mode = E82545_NVM_MODE_DATAOUT; + sc->nvm_bits = E82545_NVM_DATA_BITS; + if (addr < E82545_NVM_EEPROM_SIZE) { + sc->nvm_data = sc->eeprom_data[addr]; + DPRINTF("eeprom read: eeprom[0x%x] = 0x%x\r\n", + addr, sc->nvm_data); + } else { + DPRINTF("eeprom illegal read: 0x%x\r\n", + sc->nvm_opaddr); + sc->nvm_data = 0; + } + break; + } + case E82545_NVM_OPCODE_WRITE: + sc->nvm_mode = E82545_NVM_MODE_DATAIN; + sc->nvm_bits = E82545_NVM_DATA_BITS; + sc->nvm_data = 0; + break; + default: + DPRINTF("eeprom unknown op: 0x%x\r\r", + sc->nvm_opaddr); + /* back to opcode mode */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + } + } + } else { + DPRINTF("eeprom state machine wrong state! 
" + "0x%x 0x%x 0x%x 0x%x\r\n", + sc->nvm_mode, sc->nvm_bits, + sc->nvm_opaddr, sc->nvm_data); + } +} + +#ifdef __FreeBSD__ +static void +e82545_itr_callback(int fd, enum ev_type type, void *param) +{ + uint32_t new; + struct e82545_softc *sc = param; + + pthread_mutex_lock(&sc->esc_mtx); + new = sc->esc_ICR & sc->esc_IMS; + if (new && !sc->esc_irq_asserted) { + DPRINTF("itr callback: lintr assert %x\r\n", new); + sc->esc_irq_asserted = 1; + pci_lintr_assert(sc->esc_pi); + } else { + mevent_delete(sc->esc_mevpitr); + sc->esc_mevpitr = NULL; + } + pthread_mutex_unlock(&sc->esc_mtx); +} +#endif + +static void +e82545_icr_assert(struct e82545_softc *sc, uint32_t bits) +{ + uint32_t new; + + DPRINTF("icr assert: 0x%x\r\n", bits); + + /* + * An interrupt is only generated if bits are set that + * aren't already in the ICR, these bits are unmasked, + * and there isn't an interrupt already pending. + */ + new = bits & ~sc->esc_ICR & sc->esc_IMS; + sc->esc_ICR |= bits; + + if (new == 0) { + DPRINTF("icr assert: masked %x, ims %x\r\n", new, sc->esc_IMS); + } else if (sc->esc_mevpitr != NULL) { + DPRINTF("icr assert: throttled %x, ims %x\r\n", new, sc->esc_IMS); + } else if (!sc->esc_irq_asserted) { + DPRINTF("icr assert: lintr assert %x\r\n", new); + sc->esc_irq_asserted = 1; + pci_lintr_assert(sc->esc_pi); + if (sc->esc_ITR != 0) { +#ifdef __FreeBSD__ + sc->esc_mevpitr = mevent_add( + (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ + EVF_TIMER, e82545_itr_callback, sc); +#endif + } + } +} + +static void +e82545_ims_change(struct e82545_softc *sc, uint32_t bits) +{ + uint32_t new; + + /* + * Changing the mask may allow previously asserted + * but masked interrupt requests to generate an interrupt. + */ + new = bits & sc->esc_ICR & ~sc->esc_IMS; + sc->esc_IMS |= bits; + + if (new == 0) { + DPRINTF("ims change: masked %x, ims %x\r\n", new, sc->esc_IMS); + } else if (sc->esc_mevpitr != NULL) { + DPRINTF("ims change: throttled %x, ims %x\r\n", new, sc->esc_IMS); + } else if (!sc->esc_irq_asserted) { + DPRINTF("ims change: lintr assert %x\n\r", new); + sc->esc_irq_asserted = 1; + pci_lintr_assert(sc->esc_pi); + if (sc->esc_ITR != 0) { +#ifdef __FreeBSD__ + sc->esc_mevpitr = mevent_add( + (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ + EVF_TIMER, e82545_itr_callback, sc); +#endif + } + } +} + +static void +e82545_icr_deassert(struct e82545_softc *sc, uint32_t bits) +{ + + DPRINTF("icr deassert: 0x%x\r\n", bits); + sc->esc_ICR &= ~bits; + + /* + * If there are no longer any interrupt sources and there + * was an asserted interrupt, clear it + */ + if (sc->esc_irq_asserted && !(sc->esc_ICR & sc->esc_IMS)) { + DPRINTF("icr deassert: lintr deassert %x\r\n", bits); + pci_lintr_deassert(sc->esc_pi); + sc->esc_irq_asserted = 0; + } +} + +static void +e82545_intr_write(struct e82545_softc *sc, uint32_t offset, uint32_t value) +{ + + DPRINTF("intr_write: off %x, val %x\n\r", offset, value); + + switch (offset) { + case E1000_ICR: + e82545_icr_deassert(sc, value); + break; + case E1000_ITR: + sc->esc_ITR = value; + break; + case E1000_ICS: + sc->esc_ICS = value; /* not used: store for debug */ + e82545_icr_assert(sc, value); + break; + case E1000_IMS: + e82545_ims_change(sc, value); + break; + case E1000_IMC: + sc->esc_IMC = value; /* for debug */ + sc->esc_IMS &= ~value; + // XXX clear interrupts if all ICR bits now masked + // and interrupt was pending ? 
+ break; + default: + break; + } +} + +static uint32_t +e82545_intr_read(struct e82545_softc *sc, uint32_t offset) +{ + uint32_t retval; + + retval = 0; + + DPRINTF("intr_read: off %x\n\r", offset); + + switch (offset) { + case E1000_ICR: + retval = sc->esc_ICR; + sc->esc_ICR = 0; + e82545_icr_deassert(sc, ~0); + break; + case E1000_ITR: + retval = sc->esc_ITR; + break; + case E1000_ICS: + /* write-only register */ + break; + case E1000_IMS: + retval = sc->esc_IMS; + break; + case E1000_IMC: + /* write-only register */ + break; + default: + break; + } + + return (retval); +} + +static void +e82545_devctl(struct e82545_softc *sc, uint32_t val) +{ + + sc->esc_CTRL = val & ~E1000_CTRL_RST; + + if (val & E1000_CTRL_RST) { + DPRINTF("e1k: s/w reset, ctl %x\n", val); + e82545_reset(sc, 1); + } + /* XXX check for phy reset ? */ +} + +static void +e82545_rx_update_rdba(struct e82545_softc *sc) +{ + + /* XXX verify desc base/len within phys mem range */ + sc->esc_rdba = (uint64_t)sc->esc_RDBAH << 32 | + sc->esc_RDBAL; + + /* Cache host mapping of guest descriptor array */ + sc->esc_rxdesc = paddr_guest2host(sc->esc_ctx, + sc->esc_rdba, sc->esc_RDLEN); +} + +static void +e82545_rx_ctl(struct e82545_softc *sc, uint32_t val) +{ + int on; + + on = ((val & E1000_RCTL_EN) == E1000_RCTL_EN); + + /* Save RCTL after stripping reserved bits 31:27,24,21,14,11:10,0 */ + sc->esc_RCTL = val & ~0xF9204c01; + + DPRINTF("rx_ctl - %s RCTL %x, val %x\n", + on ? "on" : "off", sc->esc_RCTL, val); + + /* state change requested */ + if (on != sc->esc_rx_enabled) { + if (on) { + /* Catch disallowed/unimplemented settings */ + //assert(!(val & E1000_RCTL_LBM_TCVR)); + + if (sc->esc_RCTL & E1000_RCTL_LBM_TCVR) { + sc->esc_rx_loopback = 1; + } else { + sc->esc_rx_loopback = 0; + } + + e82545_rx_update_rdba(sc); + e82545_rx_enable(sc); + } else { + e82545_rx_disable(sc); + sc->esc_rx_loopback = 0; + sc->esc_rdba = 0; + sc->esc_rxdesc = NULL; + } + } +} + +static void +e82545_tx_update_tdba(struct e82545_softc *sc) +{ + + /* XXX verify desc base/len within phys mem range */ + sc->esc_tdba = (uint64_t)sc->esc_TDBAH << 32 | sc->esc_TDBAL; + + /* Cache host mapping of guest descriptor array */ + sc->esc_txdesc = paddr_guest2host(sc->esc_ctx, sc->esc_tdba, + sc->esc_TDLEN); +} + +static void +e82545_tx_ctl(struct e82545_softc *sc, uint32_t val) +{ + int on; + + on = ((val & E1000_TCTL_EN) == E1000_TCTL_EN); + + /* ignore TCTL_EN settings that don't change state */ + if (on == sc->esc_tx_enabled) + return; + + if (on) { + e82545_tx_update_tdba(sc); + e82545_tx_enable(sc); + } else { + e82545_tx_disable(sc); + sc->esc_tdba = 0; + sc->esc_txdesc = NULL; + } + + /* Save TCTL value after stripping reserved bits 31:25,23,2,0 */ + sc->esc_TCTL = val & ~0xFE800005; +} + +int +e82545_bufsz(uint32_t rctl) +{ + + switch (rctl & (E1000_RCTL_BSEX | E1000_RCTL_SZ_256)) { + case (E1000_RCTL_SZ_2048): return (2048); + case (E1000_RCTL_SZ_1024): return (1024); + case (E1000_RCTL_SZ_512): return (512); + case (E1000_RCTL_SZ_256): return (256); + case (E1000_RCTL_BSEX|E1000_RCTL_SZ_16384): return (16384); + case (E1000_RCTL_BSEX|E1000_RCTL_SZ_8192): return (8192); + case (E1000_RCTL_BSEX|E1000_RCTL_SZ_4096): return (4096); + } + return (256); /* Forbidden value. 
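E1000_RCTL_BSEX extends the two BSIZE bits, which is why e82545_bufsz() keys on both fields at once. A usage sketch: bounding how many receive descriptors the largest legal frame can consume for a given RCTL value (long-packet enable raises the cap from 1522 to 16384 bytes, as in the receive path below):

static int
max_desc_per_packet(uint32_t rctl, int lpe)
{
        int bufsz = e82545_bufsz(rctl);
        int maxpktsz = lpe ? 16384 : 1522;

        return ((maxpktsz + bufsz - 1) / bufsz);        /* round up */
}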
*/ +} + +#ifdef __FreeBSD__ +static uint8_t dummybuf[2048]; + +/* XXX one packet at a time until this is debugged */ +static void +e82545_tap_callback(int fd, enum ev_type type, void *param) +{ + struct e82545_softc *sc = param; + struct e1000_rx_desc *rxd; + struct iovec vec[64]; + int left, len, lim, maxpktsz, maxpktdesc, bufsz, i, n, size; + uint32_t cause = 0; + uint16_t *tp, tag, head; + + pthread_mutex_lock(&sc->esc_mtx); + DPRINTF("rx_run: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT); + + if (!sc->esc_rx_enabled || sc->esc_rx_loopback) { + DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped\r\n", + sc->esc_rx_enabled, sc->esc_rx_loopback); + while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) { + } + goto done1; + } + bufsz = e82545_bufsz(sc->esc_RCTL); + maxpktsz = (sc->esc_RCTL & E1000_RCTL_LPE) ? 16384 : 1522; + maxpktdesc = (maxpktsz + bufsz - 1) / bufsz; + size = sc->esc_RDLEN / 16; + head = sc->esc_RDH; + left = (size + sc->esc_RDT - head) % size; + if (left < maxpktdesc) { + DPRINTF("rx overflow (%d < %d) -- packet(s) dropped\r\n", + left, maxpktdesc); + while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) { + } + goto done1; + } + + sc->esc_rx_active = 1; + pthread_mutex_unlock(&sc->esc_mtx); + + for (lim = size / 4; lim > 0 && left >= maxpktdesc; lim -= n) { + + /* Grab rx descriptor pointed to by the head pointer */ + for (i = 0; i < maxpktdesc; i++) { + rxd = &sc->esc_rxdesc[(head + i) % size]; + vec[i].iov_base = paddr_guest2host(sc->esc_ctx, + rxd->buffer_addr, bufsz); + vec[i].iov_len = bufsz; + } + len = readv(sc->esc_tapfd, vec, maxpktdesc); + if (len <= 0) { + DPRINTF("tap: readv() returned %d\n", len); + goto done; + } + + /* + * Adjust the packet length based on whether the CRC needs + * to be stripped or if the packet is less than the minimum + * eth packet size. + */ + if (len < ETHER_MIN_LEN - ETHER_CRC_LEN) + len = ETHER_MIN_LEN - ETHER_CRC_LEN; + if (!(sc->esc_RCTL & E1000_RCTL_SECRC)) + len += ETHER_CRC_LEN; + n = (len + bufsz - 1) / bufsz; + + DPRINTF("packet read %d bytes, %d segs, head %d\r\n", + len, n, head); + + /* Apply VLAN filter. */ + tp = (uint16_t *)vec[0].iov_base + 6; + if ((sc->esc_RCTL & E1000_RCTL_VFE) && + (ntohs(tp[0]) == sc->esc_VET)) { + tag = ntohs(tp[1]) & 0x0fff; + if ((sc->esc_fvlan[tag >> 5] & + (1 << (tag & 0x1f))) != 0) { + DPRINTF("known VLAN %d\r\n", tag); + } else { + DPRINTF("unknown VLAN %d\r\n", tag); + n = 0; + continue; + } + } + + /* Update all consumed descriptors. */ + for (i = 0; i < n - 1; i++) { + rxd = &sc->esc_rxdesc[(head + i) % size]; + rxd->length = bufsz; + rxd->csum = 0; + rxd->errors = 0; + rxd->special = 0; + rxd->status = E1000_RXD_STAT_DD; + } + rxd = &sc->esc_rxdesc[(head + i) % size]; + rxd->length = len % bufsz; + rxd->csum = 0; + rxd->errors = 0; + rxd->special = 0; + /* XXX signal no checksum for now */ + rxd->status = E1000_RXD_STAT_PIF | E1000_RXD_STAT_IXSM | + E1000_RXD_STAT_EOP | E1000_RXD_STAT_DD; + + /* Schedule receive interrupts. */ + if (len <= sc->esc_RSRPD) { + cause |= E1000_ICR_SRPD | E1000_ICR_RXT0; + } else { + /* XXX: RDRT and RADV timers should be here. 
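The ring arithmetic in this receive path deserves a note: with head == tail meaning an empty device queue, the number of descriptors the device may still fill is (size + tail - head) % size, and a packet is only accepted when that count covers a worst-case frame. The same bookkeeping as a standalone helper:

static int
ring_space(uint16_t head, uint16_t tail, uint16_t size)
{
        /* descriptors the device may consume before catching the tail */
        return ((size + tail - head) % size);
}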
*/ + cause |= E1000_ICR_RXT0; + } + + head = (head + n) % size; + left -= n; + } + +done: + pthread_mutex_lock(&sc->esc_mtx); + sc->esc_rx_active = 0; + if (sc->esc_rx_enabled == 0) + pthread_cond_signal(&sc->esc_rx_cond); + + sc->esc_RDH = head; + /* Respect E1000_RCTL_RDMTS */ + left = (size + sc->esc_RDT - head) % size; + if (left < (size >> (((sc->esc_RCTL >> 8) & 3) + 1))) + cause |= E1000_ICR_RXDMT0; + /* Assert all accumulated interrupts. */ + if (cause != 0) + e82545_icr_assert(sc, cause); +done1: + DPRINTF("rx_run done: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT); + pthread_mutex_unlock(&sc->esc_mtx); +} +#endif + +static uint16_t +e82545_carry(uint32_t sum) +{ + + sum = (sum & 0xFFFF) + (sum >> 16); + if (sum > 0xFFFF) + sum -= 0xFFFF; + return (sum); +} + +static uint16_t +#ifdef __FreeBSD__ +e82545_buf_checksum(uint8_t *buf, int len) +#else +e82545_buf_checksum(caddr_t buf, int len) +#endif +{ + int i; + uint32_t sum = 0; + + /* Checksum all the pairs of bytes first... */ + for (i = 0; i < (len & ~1U); i += 2) + sum += *((u_int16_t *)(buf + i)); + + /* + * If there's a single byte left over, checksum it, too. + * Network byte order is big-endian, so the remaining byte is + * the high byte. + */ + if (i < len) + sum += htons(buf[i] << 8); + + return (e82545_carry(sum)); +} + +static uint16_t +e82545_iov_checksum(struct iovec *iov, int iovcnt, int off, int len) +{ + int now, odd; + uint32_t sum = 0, s; + + /* Skip completely unneeded vectors. */ + while (iovcnt > 0 && iov->iov_len <= off && off > 0) { + off -= iov->iov_len; + iov++; + iovcnt--; + } + + /* Calculate checksum of requested range. */ + odd = 0; + while (len > 0 && iovcnt > 0) { + now = MIN(len, iov->iov_len - off); + s = e82545_buf_checksum(iov->iov_base + off, now); + sum += odd ? (s << 8) : s; + odd ^= (now & 1); + len -= now; + off = 0; + iov++; + iovcnt--; + } + + return (e82545_carry(sum)); +} + +/* + * Return the transmit descriptor type. + */ +int +e82545_txdesc_type(uint32_t lower) +{ + int type; + + type = 0; + + if (lower & E1000_TXD_CMD_DEXT) + type = lower & E1000_TXD_MASK; + + return (type); +} + +static void +e82545_transmit_checksum(struct iovec *iov, int iovcnt, struct ck_info *ck) +{ + uint16_t cksum; + int cklen; + + DPRINTF("tx cksum: iovcnt/s/off/len %d/%d/%d/%d\r\n", + iovcnt, ck->ck_start, ck->ck_off, ck->ck_len); + cklen = ck->ck_len ? 
ck->ck_len - ck->ck_start + 1 : INT_MAX; + cksum = e82545_iov_checksum(iov, iovcnt, ck->ck_start, cklen); + *(uint16_t *)((uint8_t *)iov[0].iov_base + ck->ck_off) = ~cksum; +} + +static void +e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt) +{ + + if (sc->esc_tapfd == -1) + return; + + (void) writev(sc->esc_tapfd, iov, iovcnt); +} + +static void +e82545_transmit_done(struct e82545_softc *sc, uint16_t head, uint16_t tail, + uint16_t dsize, int *tdwb) +{ + union e1000_tx_udesc *dsc; + + for ( ; head != tail; head = (head + 1) % dsize) { + dsc = &sc->esc_txdesc[head]; + if (dsc->td.lower.data & E1000_TXD_CMD_RS) { + dsc->td.upper.data |= E1000_TXD_STAT_DD; + *tdwb = 1; + } + } +} + +static int +e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail, + uint16_t dsize, uint16_t *rhead, int *tdwb) +{ +#ifdef __FreeBSD__ + uint8_t *hdr, *hdrp; +#else + caddr_t hdr, hdrp; +#endif + struct iovec iovb[I82545_MAX_TXSEGS + 2]; + struct iovec tiov[I82545_MAX_TXSEGS + 2]; + struct e1000_context_desc *cd; + struct ck_info ckinfo[2]; + struct iovec *iov; + union e1000_tx_udesc *dsc; + int desc, dtype, len, ntype, iovcnt, tlen, hdrlen, vlen, tcp, tso; + int mss, paylen, seg, tiovcnt, left, now, nleft, nnow, pv, pvoff; + uint32_t tcpsum, tcpseq; + uint16_t ipcs, tcpcs, ipid, ohead; + + ckinfo[0].ck_valid = ckinfo[1].ck_valid = 0; + iovcnt = 0; + tlen = 0; + ntype = 0; + tso = 0; + ohead = head; + hdr = NULL; + + /* iovb[0/1] may be used for writable copy of headers. */ + iov = &iovb[2]; + + for (desc = 0; ; desc++, head = (head + 1) % dsize) { + if (head == tail) { + *rhead = head; + return (0); + } + dsc = &sc->esc_txdesc[head]; + dtype = e82545_txdesc_type(dsc->td.lower.data); + + if (desc == 0) { + switch (dtype) { + case E1000_TXD_TYP_C: + DPRINTF("tx ctxt desc idx %d: %016jx " + "%08x%08x\r\n", + head, dsc->td.buffer_addr, + dsc->td.upper.data, dsc->td.lower.data); + /* Save context and return */ + sc->esc_txctx = dsc->cd; + goto done; + case E1000_TXD_TYP_L: + DPRINTF("tx legacy desc idx %d: %08x%08x\r\n", + head, dsc->td.upper.data, dsc->td.lower.data); + /* + * legacy cksum start valid in first descriptor + */ + ntype = dtype; + ckinfo[0].ck_start = dsc->td.upper.fields.css; + break; + case E1000_TXD_TYP_D: + DPRINTF("tx data desc idx %d: %08x%08x\r\n", + head, dsc->td.upper.data, dsc->td.lower.data); + ntype = dtype; + break; + default: + break; + } + } else { + /* Descriptor type must be consistent */ + assert(dtype == ntype); + DPRINTF("tx next desc idx %d: %08x%08x\r\n", + head, dsc->td.upper.data, dsc->td.lower.data); + } + + len = (dtype == E1000_TXD_TYP_L) ? dsc->td.lower.flags.length : + dsc->dd.lower.data & 0xFFFFF; + + if (len > 0) { + /* Strip checksum supplied by guest. */ + if ((dsc->td.lower.data & E1000_TXD_CMD_EOP) != 0 && + (dsc->td.lower.data & E1000_TXD_CMD_IFCS) == 0) + len -= 2; + tlen += len; + if (iovcnt < I82545_MAX_TXSEGS) { + iov[iovcnt].iov_base = paddr_guest2host( + sc->esc_ctx, dsc->td.buffer_addr, len); + iov[iovcnt].iov_len = len; + } + iovcnt++; + } + + /* + * Pull out info that is valid in the final descriptor + * and exit descriptor loop. 
+ */ + if (dsc->td.lower.data & E1000_TXD_CMD_EOP) { + if (dtype == E1000_TXD_TYP_L) { + if (dsc->td.lower.data & E1000_TXD_CMD_IC) { + ckinfo[0].ck_valid = 1; + ckinfo[0].ck_off = + dsc->td.lower.flags.cso; + ckinfo[0].ck_len = 0; + } + } else { + cd = &sc->esc_txctx; + if (dsc->dd.lower.data & E1000_TXD_CMD_TSE) + tso = 1; + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_IXSM) + ckinfo[0].ck_valid = 1; + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_IXSM || tso) { + ckinfo[0].ck_start = + cd->lower_setup.ip_fields.ipcss; + ckinfo[0].ck_off = + cd->lower_setup.ip_fields.ipcso; + ckinfo[0].ck_len = + cd->lower_setup.ip_fields.ipcse; + } + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_TXSM) + ckinfo[1].ck_valid = 1; + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_TXSM || tso) { + ckinfo[1].ck_start = + cd->upper_setup.tcp_fields.tucss; + ckinfo[1].ck_off = + cd->upper_setup.tcp_fields.tucso; + ckinfo[1].ck_len = + cd->upper_setup.tcp_fields.tucse; + } + } + break; + } + } + + if (iovcnt > I82545_MAX_TXSEGS) { + WPRINTF("tx too many descriptors (%d > %d) -- dropped\r\n", + iovcnt, I82545_MAX_TXSEGS); + goto done; + } + + hdrlen = vlen = 0; + /* Estimate writable space for VLAN header insertion. */ + if ((sc->esc_CTRL & E1000_CTRL_VME) && + (dsc->td.lower.data & E1000_TXD_CMD_VLE)) { + hdrlen = ETHER_ADDR_LEN*2; + vlen = ETHER_VLAN_ENCAP_LEN; + } + if (!tso) { + /* Estimate required writable space for checksums. */ + if (ckinfo[0].ck_valid) + hdrlen = MAX(hdrlen, ckinfo[0].ck_off + 2); + if (ckinfo[1].ck_valid) + hdrlen = MAX(hdrlen, ckinfo[1].ck_off + 2); + /* Round up writable space to the first vector. */ + if (hdrlen != 0 && iov[0].iov_len > hdrlen && + iov[0].iov_len < hdrlen + 100) + hdrlen = iov[0].iov_len; + } else { + /* In case of TSO header length provided by software. */ + hdrlen = sc->esc_txctx.tcp_seg_setup.fields.hdr_len; + } + + /* Allocate, fill and prepend writable header vector. */ + if (hdrlen != 0) { + hdr = __builtin_alloca(hdrlen + vlen); + hdr += vlen; + for (left = hdrlen, hdrp = hdr; left > 0; + left -= now, hdrp += now) { + now = MIN(left, iov->iov_len); + memcpy(hdrp, iov->iov_base, now); + iov->iov_base += now; + iov->iov_len -= now; + if (iov->iov_len == 0) { + iov++; + iovcnt--; + } + } + iov--; + iovcnt++; + iov->iov_base = hdr; + iov->iov_len = hdrlen; + } + + /* Insert VLAN tag. */ + if (vlen != 0) { + hdr -= ETHER_VLAN_ENCAP_LEN; + memmove(hdr, hdr + ETHER_VLAN_ENCAP_LEN, ETHER_ADDR_LEN*2); + hdrlen += ETHER_VLAN_ENCAP_LEN; + hdr[ETHER_ADDR_LEN*2 + 0] = sc->esc_VET >> 8; + hdr[ETHER_ADDR_LEN*2 + 1] = sc->esc_VET & 0xff; + hdr[ETHER_ADDR_LEN*2 + 2] = dsc->td.upper.fields.special >> 8; + hdr[ETHER_ADDR_LEN*2 + 3] = dsc->td.upper.fields.special & 0xff; + iov->iov_base = hdr; + iov->iov_len += ETHER_VLAN_ENCAP_LEN; + /* Correct checksum offsets after VLAN tag insertion. */ + ckinfo[0].ck_start += ETHER_VLAN_ENCAP_LEN; + ckinfo[0].ck_off += ETHER_VLAN_ENCAP_LEN; + if (ckinfo[0].ck_len != 0) + ckinfo[0].ck_len += ETHER_VLAN_ENCAP_LEN; + ckinfo[1].ck_start += ETHER_VLAN_ENCAP_LEN; + ckinfo[1].ck_off += ETHER_VLAN_ENCAP_LEN; + if (ckinfo[1].ck_len != 0) + ckinfo[1].ck_len += ETHER_VLAN_ENCAP_LEN; + } + + /* Simple non-TSO case. */ + if (!tso) { + /* Calculate checksums and transmit. */ + if (ckinfo[0].ck_valid) + e82545_transmit_checksum(iov, iovcnt, &ckinfo[0]); + if (ckinfo[1].ck_valid) + e82545_transmit_checksum(iov, iovcnt, &ckinfo[1]); + e82545_transmit_backend(sc, iov, iovcnt); + goto done; + } + + /* Doing TSO. 
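The TSO branch below slices paylen payload bytes into MSS-sized segments, replaying the consolidated header in front of each one and patching IP length/ID and the TCP sequence number per segment. The walk reduced to its counters, with a hypothetical emit_seg() in place of the per-segment IOV construction:

static void emit_seg(int seg, int payload_off, int len);

static void
tso_walk(int paylen, int mss)
{
        int seg, left, now;

        for (seg = 0, left = paylen; left > 0; seg++, left -= now) {
                now = (left < mss) ? left : mss;        /* last may be short */
                emit_seg(seg, paylen - left, now);
        }
}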
*/ + tcp = (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) != 0; + mss = sc->esc_txctx.tcp_seg_setup.fields.mss; + paylen = (sc->esc_txctx.cmd_and_length & 0x000fffff); + DPRINTF("tx %s segmentation offload %d+%d/%d bytes %d iovs\r\n", + tcp ? "TCP" : "UDP", hdrlen, paylen, mss, iovcnt); + ipid = ntohs(*(uint16_t *)&hdr[ckinfo[0].ck_start + 4]); + tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]); + ipcs = *(uint16_t *)&hdr[ckinfo[0].ck_off]; + tcpcs = 0; + if (ckinfo[1].ck_valid) /* Save partial pseudo-header checksum. */ + tcpcs = *(uint16_t *)&hdr[ckinfo[1].ck_off]; + pv = 1; + pvoff = 0; + for (seg = 0, left = paylen; left > 0; seg++, left -= now) { + now = MIN(left, mss); + + /* Construct IOVs for the segment. */ + /* Include whole original header. */ + tiov[0].iov_base = hdr; + tiov[0].iov_len = hdrlen; + tiovcnt = 1; + /* Include respective part of payload IOV. */ + for (nleft = now; pv < iovcnt && nleft > 0; nleft -= nnow) { + nnow = MIN(nleft, iov[pv].iov_len - pvoff); + tiov[tiovcnt].iov_base = iov[pv].iov_base + pvoff; + tiov[tiovcnt++].iov_len = nnow; + if (pvoff + nnow == iov[pv].iov_len) { + pv++; + pvoff = 0; + } else + pvoff += nnow; + } + DPRINTF("tx segment %d %d+%d bytes %d iovs\r\n", + seg, hdrlen, now, tiovcnt); + + /* Update IP header. */ + if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_IP) { + /* IPv4 -- set length and ID */ + *(uint16_t *)&hdr[ckinfo[0].ck_start + 2] = + htons(hdrlen - ckinfo[0].ck_start + now); + *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = + htons(ipid + seg); + } else { + /* IPv6 -- set length */ + *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = + htons(hdrlen - ckinfo[0].ck_start - 40 + + now); + } + + /* Update pseudo-header checksum. */ + tcpsum = tcpcs; + tcpsum += htons(hdrlen - ckinfo[1].ck_start + now); + + /* Update TCP/UDP headers. */ + if (tcp) { + /* Update sequence number and FIN/PUSH flags. */ + *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = + htonl(tcpseq + paylen - left); + if (now < left) { + hdr[ckinfo[1].ck_start + 13] &= + ~(TH_FIN | TH_PUSH); + } + } else { + /* Update payload length. */ + *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = + hdrlen - ckinfo[1].ck_start + now; + } + + /* Calculate checksums and transmit. 
*/ + if (ckinfo[0].ck_valid) { + *(uint16_t *)&hdr[ckinfo[0].ck_off] = ipcs; + e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[0]); + } + if (ckinfo[1].ck_valid) { + *(uint16_t *)&hdr[ckinfo[1].ck_off] = + e82545_carry(tcpsum); + e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[1]); + } + e82545_transmit_backend(sc, tiov, tiovcnt); + } + +done: + head = (head + 1) % dsize; + e82545_transmit_done(sc, ohead, head, dsize, tdwb); + + *rhead = head; + return (desc + 1); +} + +static void +e82545_tx_run(struct e82545_softc *sc) +{ + uint32_t cause; + uint16_t head, rhead, tail, size; + int lim, tdwb, sent; + + head = sc->esc_TDH; + tail = sc->esc_TDT; + size = sc->esc_TDLEN / 16; + DPRINTF("tx_run: head %x, rhead %x, tail %x\r\n", + sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); + + pthread_mutex_unlock(&sc->esc_mtx); + rhead = head; + tdwb = 0; + for (lim = size / 4; sc->esc_tx_enabled && lim > 0; lim -= sent) { + sent = e82545_transmit(sc, head, tail, size, &rhead, &tdwb); + if (sent == 0) + break; + head = rhead; + } + pthread_mutex_lock(&sc->esc_mtx); + + sc->esc_TDH = head; + sc->esc_TDHr = rhead; + cause = 0; + if (tdwb) + cause |= E1000_ICR_TXDW; + if (lim != size / 4 && sc->esc_TDH == sc->esc_TDT) + cause |= E1000_ICR_TXQE; + if (cause) + e82545_icr_assert(sc, cause); + + DPRINTF("tx_run done: head %x, rhead %x, tail %x\r\n", + sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); +} + +static void * +e82545_tx_thread(void *param) +{ + struct e82545_softc *sc = param; + + pthread_mutex_lock(&sc->esc_mtx); + for (;;) { + while (!sc->esc_tx_enabled || sc->esc_TDHr == sc->esc_TDT) { + if (sc->esc_tx_enabled && sc->esc_TDHr != sc->esc_TDT) + break; + sc->esc_tx_active = 0; + if (sc->esc_tx_enabled == 0) + pthread_cond_signal(&sc->esc_tx_cond); + pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); + } + sc->esc_tx_active = 1; + + /* Process some tx descriptors. Lock dropped inside. 
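Because e82545_tx_run() drops the softc lock while walking descriptors, disabling transmit must wait for the worker to go idle: the worker clears esc_tx_active and signals the condvar before sleeping, and e82545_tx_disable() below blocks until that happens. A minimal sketch of the stop-and-drain handshake (assuming <pthread.h>):

static void
worker_disable(pthread_mutex_t *mtx, pthread_cond_t *cv,
    int *enabled, int *active)
{
        pthread_mutex_lock(mtx);
        *enabled = 0;
        while (*active)         /* drain: worker still mid-run */
                pthread_cond_wait(cv, mtx);
        pthread_mutex_unlock(mtx);
}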
*/ + e82545_tx_run(sc); + } +#ifndef __FreeBSD__ + return (NULL); +#endif +} + +static void +e82545_tx_start(struct e82545_softc *sc) +{ + + if (sc->esc_tx_active == 0) + pthread_cond_signal(&sc->esc_tx_cond); +} + +static void +e82545_tx_enable(struct e82545_softc *sc) +{ + + sc->esc_tx_enabled = 1; +} + +static void +e82545_tx_disable(struct e82545_softc *sc) +{ + + sc->esc_tx_enabled = 0; + while (sc->esc_tx_active) + pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); +} + +static void +e82545_rx_enable(struct e82545_softc *sc) +{ + + sc->esc_rx_enabled = 1; +} + +static void +e82545_rx_disable(struct e82545_softc *sc) +{ + + sc->esc_rx_enabled = 0; + while (sc->esc_rx_active) + pthread_cond_wait(&sc->esc_rx_cond, &sc->esc_mtx); +} + +static void +e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval) +{ + struct eth_uni *eu; + int idx; + + idx = reg >> 1; + assert(idx < 15); + + eu = &sc->esc_uni[idx]; + + if (reg & 0x1) { + /* RAH */ + eu->eu_valid = ((wval & E1000_RAH_AV) == E1000_RAH_AV); + eu->eu_addrsel = (wval >> 16) & 0x3; + eu->eu_eth.octet[5] = wval >> 8; + eu->eu_eth.octet[4] = wval; + } else { + /* RAL */ + eu->eu_eth.octet[3] = wval >> 24; + eu->eu_eth.octet[2] = wval >> 16; + eu->eu_eth.octet[1] = wval >> 8; + eu->eu_eth.octet[0] = wval; + } +} + +static uint32_t +e82545_read_ra(struct e82545_softc *sc, int reg) +{ + struct eth_uni *eu; + uint32_t retval; + int idx; + + idx = reg >> 1; + assert(idx < 15); + + eu = &sc->esc_uni[idx]; + + if (reg & 0x1) { + /* RAH */ + retval = (eu->eu_valid << 31) | + (eu->eu_addrsel << 16) | + (eu->eu_eth.octet[5] << 8) | + eu->eu_eth.octet[4]; + } else { + /* RAL */ + retval = (eu->eu_eth.octet[3] << 24) | + (eu->eu_eth.octet[2] << 16) | + (eu->eu_eth.octet[1] << 8) | + eu->eu_eth.octet[0]; + } + + return (retval); +} + +static void +e82545_write_register(struct e82545_softc *sc, uint32_t offset, uint32_t value) +{ + int ridx; + + if (offset & 0x3) { + DPRINTF("Unaligned register write offset:0x%x value:0x%x\r\n", offset, value); + return; + } + DPRINTF("Register write: 0x%x value: 0x%x\r\n", offset, value); + + switch (offset) { + case E1000_CTRL: + case E1000_CTRL_DUP: + e82545_devctl(sc, value); + break; + case E1000_FCAL: + sc->esc_FCAL = value; + break; + case E1000_FCAH: + sc->esc_FCAH = value & ~0xFFFF0000; + break; + case E1000_FCT: + sc->esc_FCT = value & ~0xFFFF0000; + break; + case E1000_VET: + sc->esc_VET = value & ~0xFFFF0000; + break; + case E1000_FCTTV: + sc->esc_FCTTV = value & ~0xFFFF0000; + break; + case E1000_LEDCTL: + sc->esc_LEDCTL = value & ~0x30303000; + break; + case E1000_PBA: + sc->esc_PBA = value & 0x0000FF80; + break; + case E1000_ICR: + case E1000_ITR: + case E1000_ICS: + case E1000_IMS: + case E1000_IMC: + e82545_intr_write(sc, offset, value); + break; + case E1000_RCTL: + e82545_rx_ctl(sc, value); + break; + case E1000_FCRTL: + sc->esc_FCRTL = value & ~0xFFFF0007; + break; + case E1000_FCRTH: + sc->esc_FCRTH = value & ~0xFFFF0007; + break; + case E1000_RDBAL(0): + sc->esc_RDBAL = value & ~0xF; + if (sc->esc_rx_enabled) { + /* Apparently legal: update cached address */ + e82545_rx_update_rdba(sc); + } + break; + case E1000_RDBAH(0): + assert(!sc->esc_rx_enabled); + sc->esc_RDBAH = value; + break; + case E1000_RDLEN(0): + assert(!sc->esc_rx_enabled); + sc->esc_RDLEN = value & ~0xFFF0007F; + break; + case E1000_RDH(0): + /* XXX should only ever be zero ? Range check ? */ + sc->esc_RDH = value; + break; + case E1000_RDT(0): + /* XXX if this opens up the rx ring, do something ? 
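One register pair above is easy to misread: e82545_write_ra()/e82545_read_ra() split each receive-address slot across RAL (low four MAC octets, little-endian) and RAH (top two octets, the address-select field, and the address-valid bit). A standalone packing sketch, assuming the e1000 AV bit sits at 1U << 31:

static void
pack_ra(const uint8_t mac[6], int valid, uint32_t *ral, uint32_t *rah)
{
        *ral = (uint32_t)mac[0] | ((uint32_t)mac[1] << 8) |
            ((uint32_t)mac[2] << 16) | ((uint32_t)mac[3] << 24);
        *rah = (uint32_t)mac[4] | ((uint32_t)mac[5] << 8) |
            (valid ? (1U << 31) : 0);   /* address valid (E1000_RAH_AV) */
}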
*/ + sc->esc_RDT = value; + break; + case E1000_RDTR: + /* ignore FPD bit 31 */ + sc->esc_RDTR = value & ~0xFFFF0000; + break; + case E1000_RXDCTL(0): + sc->esc_RXDCTL = value & ~0xFEC0C0C0; + break; + case E1000_RADV: + sc->esc_RADV = value & ~0xFFFF0000; + break; + case E1000_RSRPD: + sc->esc_RSRPD = value & ~0xFFFFF000; + break; + case E1000_RXCSUM: + sc->esc_RXCSUM = value & ~0xFFFFF800; + break; + case E1000_TXCW: + sc->esc_TXCW = value & ~0x3FFF0000; + break; + case E1000_TCTL: + e82545_tx_ctl(sc, value); + break; + case E1000_TIPG: + sc->esc_TIPG = value; + break; + case E1000_AIT: + sc->esc_AIT = value; + break; + case E1000_TDBAL(0): + sc->esc_TDBAL = value & ~0xF; + if (sc->esc_tx_enabled) { + /* Apparently legal */ + e82545_tx_update_tdba(sc); + } + break; + case E1000_TDBAH(0): + //assert(!sc->esc_tx_enabled); + sc->esc_TDBAH = value; + break; + case E1000_TDLEN(0): + //assert(!sc->esc_tx_enabled); + sc->esc_TDLEN = value & ~0xFFF0007F; + break; + case E1000_TDH(0): + //assert(!sc->esc_tx_enabled); + /* XXX should only ever be zero ? Range check ? */ + sc->esc_TDHr = sc->esc_TDH = value; + break; + case E1000_TDT(0): + /* XXX range check ? */ + sc->esc_TDT = value; + if (sc->esc_tx_enabled) + e82545_tx_start(sc); + break; + case E1000_TIDV: + sc->esc_TIDV = value & ~0xFFFF0000; + break; + case E1000_TXDCTL(0): + //assert(!sc->esc_tx_enabled); + sc->esc_TXDCTL = value & ~0xC0C0C0; + break; + case E1000_TADV: + sc->esc_TADV = value & ~0xFFFF0000; + break; + case E1000_RAL(0) ... E1000_RAH(15): + /* convert to u32 offset */ + ridx = (offset - E1000_RAL(0)) >> 2; + e82545_write_ra(sc, ridx, value); + break; + case E1000_MTA ... (E1000_MTA + (127*4)): + sc->esc_fmcast[(offset - E1000_MTA) >> 2] = value; + break; + case E1000_VFTA ... (E1000_VFTA + (127*4)): + sc->esc_fvlan[(offset - E1000_VFTA) >> 2] = value; + break; + case E1000_EECD: + { + //DPRINTF("EECD write 0x%x -> 0x%x\r\n", sc->eeprom_control, value); + /* edge triggered low->high */ + uint32_t eecd_strobe = ((sc->eeprom_control & E1000_EECD_SK) ? + 0 : (value & E1000_EECD_SK)); + uint32_t eecd_mask = (E1000_EECD_SK|E1000_EECD_CS| + E1000_EECD_DI|E1000_EECD_REQ); + sc->eeprom_control &= ~eecd_mask; + sc->eeprom_control |= (value & eecd_mask); + /* grant/revoke immediately */ + if (value & E1000_EECD_REQ) { + sc->eeprom_control |= E1000_EECD_GNT; + } else { + sc->eeprom_control &= ~E1000_EECD_GNT; + } + if (eecd_strobe && (sc->eeprom_control & E1000_EECD_CS)) { + e82545_eecd_strobe(sc); + } + return; + } + case E1000_MDIC: + { + uint8_t reg_addr = (uint8_t)((value & E1000_MDIC_REG_MASK) >> + E1000_MDIC_REG_SHIFT); + uint8_t phy_addr = (uint8_t)((value & E1000_MDIC_PHY_MASK) >> + E1000_MDIC_PHY_SHIFT); + sc->mdi_control = + (value & ~(E1000_MDIC_ERROR|E1000_MDIC_DEST)); + if ((value & E1000_MDIC_READY) != 0) { + DPRINTF("Incorrect MDIC ready bit: 0x%x\r\n", value); + return; + } + switch (value & E82545_MDIC_OP_MASK) { + case E1000_MDIC_OP_READ: + sc->mdi_control &= ~E82545_MDIC_DATA_MASK; + sc->mdi_control |= e82545_read_mdi(sc, reg_addr, phy_addr); + break; + case E1000_MDIC_OP_WRITE: + e82545_write_mdi(sc, reg_addr, phy_addr, + value & E82545_MDIC_DATA_MASK); + break; + default: + DPRINTF("Unknown MDIC op: 0x%x\r\n", value); + return; + } + /* TODO: barrier? 
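+		 * (The worry: E1000_MDIC_READY must not become visible
+		 * before the data bits set above.  Since every register
+		 * access in this model is serialized under esc_mtx, no
+		 * explicit barrier should be needed.)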
*/ + sc->mdi_control |= E1000_MDIC_READY; + if (value & E82545_MDIC_IE) { + // TODO: generate interrupt + } + return; + } + case E1000_MANC: + case E1000_STATUS: + return; + default: + DPRINTF("Unknown write register: 0x%x value:%x\r\n", offset, value); + return; + } +} + +static uint32_t +e82545_read_register(struct e82545_softc *sc, uint32_t offset) +{ + uint32_t retval; + int ridx; + + if (offset & 0x3) { + DPRINTF("Unaligned register read offset:0x%x\r\n", offset); + return 0; + } + + DPRINTF("Register read: 0x%x\r\n", offset); + + switch (offset) { + case E1000_CTRL: + retval = sc->esc_CTRL; + break; + case E1000_STATUS: + retval = E1000_STATUS_FD | E1000_STATUS_LU | + E1000_STATUS_SPEED_1000; + break; + case E1000_FCAL: + retval = sc->esc_FCAL; + break; + case E1000_FCAH: + retval = sc->esc_FCAH; + break; + case E1000_FCT: + retval = sc->esc_FCT; + break; + case E1000_VET: + retval = sc->esc_VET; + break; + case E1000_FCTTV: + retval = sc->esc_FCTTV; + break; + case E1000_LEDCTL: + retval = sc->esc_LEDCTL; + break; + case E1000_PBA: + retval = sc->esc_PBA; + break; + case E1000_ICR: + case E1000_ITR: + case E1000_ICS: + case E1000_IMS: + case E1000_IMC: + retval = e82545_intr_read(sc, offset); + break; + case E1000_RCTL: + retval = sc->esc_RCTL; + break; + case E1000_FCRTL: + retval = sc->esc_FCRTL; + break; + case E1000_FCRTH: + retval = sc->esc_FCRTH; + break; + case E1000_RDBAL(0): + retval = sc->esc_RDBAL; + break; + case E1000_RDBAH(0): + retval = sc->esc_RDBAH; + break; + case E1000_RDLEN(0): + retval = sc->esc_RDLEN; + break; + case E1000_RDH(0): + retval = sc->esc_RDH; + break; + case E1000_RDT(0): + retval = sc->esc_RDT; + break; + case E1000_RDTR: + retval = sc->esc_RDTR; + break; + case E1000_RXDCTL(0): + retval = sc->esc_RXDCTL; + break; + case E1000_RADV: + retval = sc->esc_RADV; + break; + case E1000_RSRPD: + retval = sc->esc_RSRPD; + break; + case E1000_RXCSUM: + retval = sc->esc_RXCSUM; + break; + case E1000_TXCW: + retval = sc->esc_TXCW; + break; + case E1000_TCTL: + retval = sc->esc_TCTL; + break; + case E1000_TIPG: + retval = sc->esc_TIPG; + break; + case E1000_AIT: + retval = sc->esc_AIT; + break; + case E1000_TDBAL(0): + retval = sc->esc_TDBAL; + break; + case E1000_TDBAH(0): + retval = sc->esc_TDBAH; + break; + case E1000_TDLEN(0): + retval = sc->esc_TDLEN; + break; + case E1000_TDH(0): + retval = sc->esc_TDH; + break; + case E1000_TDT(0): + retval = sc->esc_TDT; + break; + case E1000_TIDV: + retval = sc->esc_TIDV; + break; + case E1000_TXDCTL(0): + retval = sc->esc_TXDCTL; + break; + case E1000_TADV: + retval = sc->esc_TADV; + break; + case E1000_RAL(0) ... E1000_RAH(15): + /* convert to u32 offset */ + ridx = (offset - E1000_RAL(0)) >> 2; + retval = e82545_read_ra(sc, ridx); + break; + case E1000_MTA ... (E1000_MTA + (127*4)): + retval = sc->esc_fmcast[(offset - E1000_MTA) >> 2]; + break; + case E1000_VFTA ... (E1000_VFTA + (127*4)): + retval = sc->esc_fvlan[(offset - E1000_VFTA) >> 2]; + break; + case E1000_EECD: + //DPRINTF("EECD read %x\r\n", sc->eeprom_control); + retval = sc->eeprom_control; + break; + case E1000_MDIC: + retval = sc->mdi_control; + break; + case E1000_MANC: + retval = 0; + break; + /* stats that we emulate. 
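+	 * The 64-bit octet counters are exposed as lo/hi register pairs,
+	 * so a reader reassembles them along the lines of (illustrative
+	 * sketch only):
+	 *
+	 *	rx_octets = GORCL | ((uint64_t)GORCH << 32);
+	 *
+	 * both halves below are served from the same soft counter,
+	 * e.g. good_octets_rx.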
*/ + case E1000_MPC: + retval = sc->missed_pkt_count; + break; + case E1000_PRC64: + retval = sc->pkt_rx_by_size[0]; + break; + case E1000_PRC127: + retval = sc->pkt_rx_by_size[1]; + break; + case E1000_PRC255: + retval = sc->pkt_rx_by_size[2]; + break; + case E1000_PRC511: + retval = sc->pkt_rx_by_size[3]; + break; + case E1000_PRC1023: + retval = sc->pkt_rx_by_size[4]; + break; + case E1000_PRC1522: + retval = sc->pkt_rx_by_size[5]; + break; + case E1000_GPRC: + retval = sc->good_pkt_rx_count; + break; + case E1000_BPRC: + retval = sc->bcast_pkt_rx_count; + break; + case E1000_MPRC: + retval = sc->mcast_pkt_rx_count; + break; + case E1000_GPTC: + case E1000_TPT: + retval = sc->good_pkt_tx_count; + break; + case E1000_GORCL: + retval = (uint32_t)sc->good_octets_rx; + break; + case E1000_GORCH: + retval = (uint32_t)(sc->good_octets_rx >> 32); + break; + case E1000_TOTL: + case E1000_GOTCL: + retval = (uint32_t)sc->good_octets_tx; + break; + case E1000_TOTH: + case E1000_GOTCH: + retval = (uint32_t)(sc->good_octets_tx >> 32); + break; + case E1000_ROC: + retval = sc->oversize_rx_count; + break; + case E1000_TORL: + retval = (uint32_t)(sc->good_octets_rx + sc->missed_octets); + break; + case E1000_TORH: + retval = (uint32_t)((sc->good_octets_rx + + sc->missed_octets) >> 32); + break; + case E1000_TPR: + retval = sc->good_pkt_rx_count + sc->missed_pkt_count + + sc->oversize_rx_count; + break; + case E1000_PTC64: + retval = sc->pkt_tx_by_size[0]; + break; + case E1000_PTC127: + retval = sc->pkt_tx_by_size[1]; + break; + case E1000_PTC255: + retval = sc->pkt_tx_by_size[2]; + break; + case E1000_PTC511: + retval = sc->pkt_tx_by_size[3]; + break; + case E1000_PTC1023: + retval = sc->pkt_tx_by_size[4]; + break; + case E1000_PTC1522: + retval = sc->pkt_tx_by_size[5]; + break; + case E1000_MPTC: + retval = sc->mcast_pkt_tx_count; + break; + case E1000_BPTC: + retval = sc->bcast_pkt_tx_count; + break; + case E1000_TSCTC: + retval = sc->tso_tx_count; + break; + /* stats that are always 0. 
*/ + case E1000_CRCERRS: + case E1000_ALGNERRC: + case E1000_SYMERRS: + case E1000_RXERRC: + case E1000_SCC: + case E1000_ECOL: + case E1000_MCC: + case E1000_LATECOL: + case E1000_COLC: + case E1000_DC: + case E1000_TNCRS: + case E1000_SEC: + case E1000_CEXTERR: + case E1000_RLEC: + case E1000_XONRXC: + case E1000_XONTXC: + case E1000_XOFFRXC: + case E1000_XOFFTXC: + case E1000_FCRUC: + case E1000_RNBC: + case E1000_RUC: + case E1000_RFC: + case E1000_RJC: + case E1000_MGTPRC: + case E1000_MGTPDC: + case E1000_MGTPTC: + case E1000_TSCTFC: + retval = 0; + break; + default: + DPRINTF("Unknown read register: 0x%x\r\n", offset); + retval = 0; + break; + } + + return (retval); +} + +static void +e82545_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + struct e82545_softc *sc; + + //DPRINTF("Write bar:%d offset:0x%lx value:0x%lx size:%d\r\n", baridx, offset, value, size); + + sc = pi->pi_arg; + + pthread_mutex_lock(&sc->esc_mtx); + + switch (baridx) { + case E82545_BAR_IO: + switch (offset) { + case E82545_IOADDR: + if (size != 4) { + DPRINTF("Wrong io addr write sz:%d value:0x%lx\r\n", size, value); + } else + sc->io_addr = (uint32_t)value; + break; + case E82545_IODATA: + if (size != 4) { + DPRINTF("Wrong io data write size:%d value:0x%lx\r\n", size, value); + } else if (sc->io_addr > E82545_IO_REGISTER_MAX) { + DPRINTF("Non-register io write addr:0x%x value:0x%lx\r\n", sc->io_addr, value); + } else + e82545_write_register(sc, sc->io_addr, + (uint32_t)value); + break; + default: + DPRINTF("Unknown io bar write offset:0x%lx value:0x%lx size:%d\r\n", offset, value, size); + break; + } + break; + case E82545_BAR_REGISTER: + if (size != 4) { + DPRINTF("Wrong register write size:%d offset:0x%lx value:0x%lx\r\n", size, offset, value); + } else + e82545_write_register(sc, (uint32_t)offset, + (uint32_t)value); + break; + default: + DPRINTF("Unknown write bar:%d off:0x%lx val:0x%lx size:%d\r\n", + baridx, offset, value, size); + } + + pthread_mutex_unlock(&sc->esc_mtx); +} + +static uint64_t +e82545_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct e82545_softc *sc; + uint64_t retval; + + //DPRINTF("Read bar:%d offset:0x%lx size:%d\r\n", baridx, offset, size); + sc = pi->pi_arg; + retval = 0; + + pthread_mutex_lock(&sc->esc_mtx); + + switch (baridx) { + case E82545_BAR_IO: + switch (offset) { + case E82545_IOADDR: + if (size != 4) { + DPRINTF("Wrong io addr read sz:%d\r\n", size); + } else + retval = sc->io_addr; + break; + case E82545_IODATA: + if (size != 4) { + DPRINTF("Wrong io data read sz:%d\r\n", size); + } + if (sc->io_addr > E82545_IO_REGISTER_MAX) { + DPRINTF("Non-register io read addr:0x%x\r\n", + sc->io_addr); + } else + retval = e82545_read_register(sc, sc->io_addr); + break; + default: + DPRINTF("Unknown io bar read offset:0x%lx size:%d\r\n", + offset, size); + break; + } + break; + case E82545_BAR_REGISTER: + if (size != 4) { + DPRINTF("Wrong register read size:%d offset:0x%lx\r\n", + size, offset); + } else + retval = e82545_read_register(sc, (uint32_t)offset); + break; + default: + DPRINTF("Unknown read bar:%d offset:0x%lx size:%d\r\n", + baridx, offset, size); + break; + } + + pthread_mutex_unlock(&sc->esc_mtx); + + return (retval); +} + +static void +e82545_reset(struct e82545_softc *sc, int drvr) +{ + int i; + + e82545_rx_disable(sc); + e82545_tx_disable(sc); + + /* clear outstanding interrupts */ + if (sc->esc_irq_asserted) + 
pci_lintr_deassert(sc->esc_pi); + + /* misc */ + if (!drvr) { + sc->esc_FCAL = 0; + sc->esc_FCAH = 0; + sc->esc_FCT = 0; + sc->esc_VET = 0; + sc->esc_FCTTV = 0; + } + sc->esc_LEDCTL = 0x07061302; + sc->esc_PBA = 0x00100030; + + /* start nvm in opcode mode. */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + sc->eeprom_control = E1000_EECD_PRES | E82545_EECD_FWE_EN; + e82545_init_eeprom(sc); + + /* interrupt */ + sc->esc_ICR = 0; + sc->esc_ITR = 250; + sc->esc_ICS = 0; + sc->esc_IMS = 0; + sc->esc_IMC = 0; + + /* L2 filters */ + if (!drvr) { + memset(sc->esc_fvlan, 0, sizeof(sc->esc_fvlan)); + memset(sc->esc_fmcast, 0, sizeof(sc->esc_fmcast)); + memset(sc->esc_uni, 0, sizeof(sc->esc_uni)); + + /* XXX not necessary on 82545 ?? */ + sc->esc_uni[0].eu_valid = 1; + memcpy(sc->esc_uni[0].eu_eth.octet, sc->esc_mac.octet, + ETHER_ADDR_LEN); + } else { + /* Clear RAH valid bits */ + for (i = 0; i < 16; i++) + sc->esc_uni[i].eu_valid = 0; + } + + /* receive */ + if (!drvr) { + sc->esc_RDBAL = 0; + sc->esc_RDBAH = 0; + } + sc->esc_RCTL = 0; + sc->esc_FCRTL = 0; + sc->esc_FCRTH = 0; + sc->esc_RDLEN = 0; + sc->esc_RDH = 0; + sc->esc_RDT = 0; + sc->esc_RDTR = 0; + sc->esc_RXDCTL = (1 << 24) | (1 << 16); /* default GRAN/WTHRESH */ + sc->esc_RADV = 0; + sc->esc_RXCSUM = 0; + + /* transmit */ + if (!drvr) { + sc->esc_TDBAL = 0; + sc->esc_TDBAH = 0; + sc->esc_TIPG = 0; + sc->esc_AIT = 0; + sc->esc_TIDV = 0; + sc->esc_TADV = 0; + } + sc->esc_tdba = 0; + sc->esc_txdesc = NULL; + sc->esc_TXCW = 0; + sc->esc_TCTL = 0; + sc->esc_TDLEN = 0; + sc->esc_TDT = 0; + sc->esc_TDHr = sc->esc_TDH = 0; + sc->esc_TXDCTL = 0; +} + +static void +e82545_open_tap(struct e82545_softc *sc, char *opts) +{ + char tbuf[80]; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + if (opts == NULL) { + sc->esc_tapfd = -1; + return; + } + + strcpy(tbuf, "/dev/"); + strlcat(tbuf, opts, sizeof(tbuf)); + + sc->esc_tapfd = open(tbuf, O_RDWR); + if (sc->esc_tapfd == -1) { + DPRINTF("unable to open tap device %s\n", opts); + exit(4); + } + + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + int opt = 1; + if (ioctl(sc->esc_tapfd, FIONBIO, &opt) < 0) { + WPRINTF("tap device O_NONBLOCK failed: %d\n", errno); + close(sc->esc_tapfd); + sc->esc_tapfd = -1; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(sc->esc_tapfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + +#ifdef __FreeBSD__ + sc->esc_mevp = mevent_add(sc->esc_tapfd, + EVF_READ, + e82545_tap_callback, + sc); + if (sc->esc_mevp == NULL) { + DPRINTF("Could not register mevent %d\n", EVF_READ); + close(sc->esc_tapfd); + sc->esc_tapfd = -1; + } +#endif +} + +static int +e82545_parsemac(char *mac_str, uint8_t *mac_addr) +{ + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + + tmpstr = strsep(&mac_str,"="); + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + fprintf(stderr, "Invalid MAC %s\n", mac_str); + return (1); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } + return (0); +} + +static int +e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + DPRINTF("Loading with options: %s\r\n", opts); + + MD5_CTX mdctx; + unsigned char digest[16]; + char nstr[80]; + struct 
e82545_softc *sc; + char *devname; + char *vtopts; + int mac_provided; + + /* Setup our softc */ + sc = calloc(1, sizeof(*sc)); + + pi->pi_arg = sc; + sc->esc_pi = pi; + sc->esc_ctx = ctx; + + pthread_mutex_init(&sc->esc_mtx, NULL); + pthread_cond_init(&sc->esc_rx_cond, NULL); + pthread_cond_init(&sc->esc_tx_cond, NULL); + pthread_create(&sc->esc_tx_tid, NULL, e82545_tx_thread, sc); + snprintf(nstr, sizeof(nstr), "e82545-%d:%d tx", pi->pi_slot, + pi->pi_func); + pthread_set_name_np(sc->esc_tx_tid, nstr); + + pci_set_cfgdata16(pi, PCIR_DEVICE, E82545_DEV_ID_82545EM_COPPER); + pci_set_cfgdata16(pi, PCIR_VENDOR, E82545_VENDOR_ID_INTEL); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_NETWORK_ETHERNET); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, E82545_SUBDEV_ID); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, E82545_VENDOR_ID_INTEL); + + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); + pci_set_cfgdata8(pi, PCIR_INTPIN, 0x1); + + /* TODO: this card also supports msi, but the freebsd driver for it + * does not, so I have not implemented it. */ + pci_lintr_request(pi); + + pci_emul_alloc_bar(pi, E82545_BAR_REGISTER, PCIBAR_MEM32, + E82545_BAR_REGISTER_LEN); + pci_emul_alloc_bar(pi, E82545_BAR_FLASH, PCIBAR_MEM32, + E82545_BAR_FLASH_LEN); + pci_emul_alloc_bar(pi, E82545_BAR_IO, PCIBAR_IO, + E82545_BAR_IO_LEN); + + /* + * Attempt to open the tap device and read the MAC address + * if specified. Copied from virtio-net, slightly modified. + */ + mac_provided = 0; + sc->esc_tapfd = -1; + if (opts != NULL) { + int err; + + devname = vtopts = strdup(opts); + (void) strsep(&vtopts, ","); + + if (vtopts != NULL) { + err = e82545_parsemac(vtopts, sc->esc_mac.octet); + if (err != 0) { + free(devname); + return (err); + } + mac_provided = 1; + } + + if (strncmp(devname, "tap", 3) == 0 || + strncmp(devname, "vmnet", 5) == 0) + e82545_open_tap(sc, devname); + + free(devname); + } + + /* + * The default MAC address is the standard NetApp OUI of 00-a0-98, + * followed by an MD5 of the PCI slot/func number and dev name + */ + if (!mac_provided) { + snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, + pi->pi_func, vmname); + + MD5Init(&mdctx); + MD5Update(&mdctx, nstr, strlen(nstr)); + MD5Final(digest, &mdctx); + + sc->esc_mac.octet[0] = 0x00; + sc->esc_mac.octet[1] = 0xa0; + sc->esc_mac.octet[2] = 0x98; + sc->esc_mac.octet[3] = digest[0]; + sc->esc_mac.octet[4] = digest[1]; + sc->esc_mac.octet[5] = digest[2]; + } + + /* H/w initiated reset */ + e82545_reset(sc, 0); + + return (0); +} + +struct pci_devemu pci_de_e82545 = { + .pe_emu = "e1000", + .pe_init = e82545_init, + .pe_barwrite = e82545_write, + .pe_barread = e82545_read +}; +PCI_EMUL_SET(pci_de_e82545); + diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c index 3b4ca805cc..5118b31534 100644 --- a/usr/src/cmd/bhyve/pci_emul.c +++ b/usr/src/cmd/bhyve/pci_emul.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,16 +38,17 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z neel $"); +__FBSDID("$FreeBSD$"); #include #include -#include #include +#include #include #include #include @@ -66,22 +69,11 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z n #include "pci_irq.h" #include "pci_lpc.h" -#define CONF1_ADDR_PORT 0x0cf8 -#define CONF1_DATA_PORT 0x0cfc +#define CONF1_ADDR_PORT 0x0cf8 +#define CONF1_DATA_PORT 0x0cfc #define CONF1_ENABLE 0x80000000ul -#define CFGWRITE(pi,off,val,b) \ -do { \ - if ((b) == 1) { \ - pci_set_cfgdata8((pi),(off),(val)); \ - } else if ((b) == 2) { \ - pci_set_cfgdata16((pi),(off),(val)); \ - } else { \ - pci_set_cfgdata32((pi),(off),(val)); \ - } \ -} while (0) - #define MAXBUSES (PCI_BUSMAX + 1) #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) @@ -136,6 +128,30 @@ static void pci_lintr_update(struct pci_devinst *pi); static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, int coff, int bytes, uint32_t *val); +static __inline void +CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) +{ + + if (bytes == 1) + pci_set_cfgdata8(pi, coff, val); + else if (bytes == 2) + pci_set_cfgdata16(pi, coff, val); + else + pci_set_cfgdata32(pi, coff, val); +} + +static __inline uint32_t +CFGREAD(struct pci_devinst *pi, int coff, int bytes) +{ + + if (bytes == 1) + return (pci_get_cfgdata8(pi, coff)); + else if (bytes == 2) + return (pci_get_cfgdata16(pi, coff)); + else + return (pci_get_cfgdata32(pi, coff)); +} + /* * I/O access */ @@ -234,6 +250,17 @@ done: return (error); } +void +pci_print_supported_devices() +{ + struct pci_devemu **pdpp, *pdp; + + SET_FOREACH(pdpp, pci_devemu_set) { + pdp = *pdpp; + printf("%s\n", pdp->pe_emu); + } +} + static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { @@ -294,7 +321,7 @@ pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X - * table but we also allow 1 byte access to accomodate reads from + * table but we also allow 1 byte access to accommodate reads from * ddb. */ if (size != 1 && size != 4 && size != 8) @@ -465,7 +492,7 @@ modify_bar_registration(struct pci_devinst *pi, int idx, int registration) iop.handler = pci_emul_io_handler; iop.arg = pi; error = register_inout(&iop); - } else + } else error = unregister_inout(&iop); break; case PCIBAR_MEM32: @@ -533,7 +560,7 @@ memen(struct pci_devinst *pi) * the address range decoded by the BAR register. 
*/ static void -update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) +update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) { int decode; @@ -570,8 +597,10 @@ int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, enum pcibar_type type, uint64_t size) { + uint64_t *baseptr = NULL; + uint64_t limit = 0, lobits = 0; + uint64_t addr, mask, bar; int error; - uint64_t *baseptr, limit, addr, mask, lobits, bar; assert(idx >= 0 && idx <= PCI_BARMAX); @@ -634,7 +663,11 @@ pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); +#ifdef FreeBSD assert(0); +#else + abort(); +#endif } if (baseptr != NULL) { @@ -656,7 +689,7 @@ pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } - + register_bar(pdi, idx); return (0); @@ -759,8 +792,6 @@ pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; - CTASSERT(sizeof(struct msicap) == 14); - /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); mmc = ffs(msgnum) - 1; @@ -785,7 +816,6 @@ static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { - CTASSERT(sizeof(struct msixcap) == 12); assert(msix_tab_size % 4096 == 0); @@ -832,7 +862,7 @@ pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); - + tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ @@ -862,10 +892,9 @@ msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; - int off, table_bar; - + int off; + off = offset - capoff; - table_bar = pi->pi_msix.table_bar; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; @@ -877,8 +906,8 @@ msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); - } - + } + CFGWRITE(pi, offset, val, bytes); } @@ -937,8 +966,6 @@ pci_emul_add_pciecap(struct pci_devinst *pi, int type) int err; struct pciecap pciecap; - CTASSERT(sizeof(struct pciecap) == 60); - if (type != PCIEM_TYPE_ROOT_PORT) return (-1); @@ -1085,7 +1112,7 @@ init_pci(struct vmctx *ctx) for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; - /* + /* * Keep track of the i/o and memory resources allocated to * this bus. 
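	 * (These per-bus bases are carved from the same global
	 * pci_emul_iobase/pci_emul_membase* allocators that
	 * pci_emul_alloc_pbar() bumps as each BAR is placed.)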
*/ @@ -1186,7 +1213,6 @@ init_pci(struct vmctx *ctx) return (0); } -#ifdef __FreeBSD__ static void pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) @@ -1340,11 +1366,11 @@ pci_bus_write_dsdt(int bus) dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); - dsdt_line("})"); + dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); - dsdt_line("})"); + dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If (PICM)"); @@ -1392,7 +1418,6 @@ pci_write_dsdt(void) dsdt_line("}"); dsdt_unindent(1); } -#endif int pci_bus_configured(int bus) @@ -1511,7 +1536,7 @@ pci_lintr_route(struct pci_devinst *pi) * is not yet assigned. */ if (ii->ii_ioapic_irq == 0) - ii->ii_ioapic_irq = ioapic_pci_alloc_irq(); + ii->ii_ioapic_irq = ioapic_pci_alloc_irq(pi); assert(ii->ii_ioapic_irq > 0); /* @@ -1519,7 +1544,7 @@ pci_lintr_route(struct pci_devinst *pi) * not yet assigned. */ if (ii->ii_pirq_pin == 0) - ii->ii_pirq_pin = pirq_alloc_pin(pi->pi_vmctx); + ii->ii_pirq_pin = pirq_alloc_pin(pi); assert(ii->ii_pirq_pin > 0); pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; @@ -1667,27 +1692,31 @@ pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) } } -static uint32_t -bits_changed(uint32_t old, uint32_t new, uint32_t mask) -{ - - return ((old ^ new) & mask); -} - static void -pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) +pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) { - int i; - uint16_t old; + int i, rshift; + uint32_t cmd, cmd2, changed, old, readonly; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ /* - * The command register is at an offset of 4 bytes and thus the - * guest could write 1, 2 or 4 bytes starting at this offset. + * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. + * + * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are + * 'write 1 to clear'. However these bits are not set to '1' by + * any device emulation so it is simpler to treat them as readonly. */ + rshift = (coff & 0x3) * 8; + readonly = 0xFFFFF880 >> rshift; + + old = CFGREAD(pi, coff, bytes); + new &= ~readonly; + new |= (old & readonly); + CFGWRITE(pi, coff, new, bytes); /* update config */ - old = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ - CFGWRITE(pi, PCIR_COMMAND, new, bytes); /* update config */ - new = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ + cmd2 = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ + changed = cmd ^ cmd2; /* * If the MMIO or I/O address space decoding has changed then @@ -1700,7 +1729,7 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) break; case PCIBAR_IO: /* I/O address space decoding changed? */ - if (bits_changed(old, new, PCIM_CMD_PORTEN)) { + if (changed & PCIM_CMD_PORTEN) { if (porten(pi)) register_bar(pi, i); else @@ -1710,15 +1739,15 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ - if (bits_changed(old, new, PCIM_CMD_MEMEN)) { + if (changed & PCIM_CMD_MEMEN) { if (memen(pi)) register_bar(pi, i); else unregister_bar(pi, i); } - break; + break; default: - assert(0); + assert(0); } } @@ -1727,7 +1756,7 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) * interrupt. 
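	 *
	 * (Worked example of the read-only masking above: a 2-byte guest
	 * write to the status register has coff == 0x06, so rshift is
	 * (0x06 & 0x3) * 8 == 16 and readonly is 0xFFFFF880 >> 16 ==
	 * 0xFFFF -- every bit of the write is dropped and the emulated
	 * status bits are preserved.)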
*/ pci_lintr_update(pi); -} +} static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, @@ -1738,7 +1767,8 @@ pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, struct pci_devinst *pi; struct pci_devemu *pe; int idx, needcfg; - uint64_t addr, bar, mask; + uint64_t addr, mask; + uint64_t bar = 0; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; @@ -1790,14 +1820,8 @@ pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, needcfg = 1; } - if (needcfg) { - if (bytes == 1) - *eax = pci_get_cfgdata8(pi, coff); - else if (bytes == 2) - *eax = pci_get_cfgdata16(pi, coff); - else - *eax = pci_get_cfgdata32(pi, coff); - } + if (needcfg) + *eax = CFGREAD(pi, coff, bytes); pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); } else { @@ -1867,8 +1891,8 @@ pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *eax); - } else if (coff == PCIR_COMMAND) { - pci_emul_cmdwrite(pi, *eax, bytes); + } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { + pci_emul_cmdsts_write(pi, coff, *eax, bytes); } else { CFGWRITE(pi, coff, *eax, bytes); } @@ -1940,8 +1964,8 @@ INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { - uint8_t ioregs[DIOSZ]; - uint8_t memregs[DMEMSZ]; + uint8_t ioregs[DIOSZ]; + uint8_t memregs[2][DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 @@ -1970,6 +1994,9 @@ pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); + error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + return (0); } @@ -2009,31 +2036,33 @@ pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, } } - if (baridx == 1) { + if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("diow: memw too large, offset %ld size %d\n", offset, size); return; } + i = baridx - 1; /* 'memregs' index */ + if (size == 1) { - sc->memregs[offset] = value; + sc->memregs[i][offset] = value; } else if (size == 2) { - *(uint16_t *)&sc->memregs[offset] = value; + *(uint16_t *)&sc->memregs[i][offset] = value; } else if (size == 4) { - *(uint32_t *)&sc->memregs[offset] = value; + *(uint32_t *)&sc->memregs[i][offset] = value; } else if (size == 8) { - *(uint64_t *)&sc->memregs[offset] = value; + *(uint64_t *)&sc->memregs[i][offset] = value; } else { printf("diow: memw unknown size %d\n", size); } - + /* * magic interrupt ?? 
*/ } - if (baridx > 1) { + if (baridx > 2 || baridx < 0) { printf("diow: unknown bar idx %d\n", baridx); } } @@ -2044,14 +2073,17 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; + int i; + value = 0; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } - + + value = 0; if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { @@ -2062,29 +2094,31 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, printf("dior: ior unknown size %d\n", size); } } - - if (baridx == 1) { + + if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } - + + i = baridx - 1; /* 'memregs' index */ + if (size == 1) { - value = sc->memregs[offset]; + value = sc->memregs[i][offset]; } else if (size == 2) { - value = *(uint16_t *) &sc->memregs[offset]; + value = *(uint16_t *) &sc->memregs[i][offset]; } else if (size == 4) { - value = *(uint32_t *) &sc->memregs[offset]; + value = *(uint32_t *) &sc->memregs[i][offset]; } else if (size == 8) { - value = *(uint64_t *) &sc->memregs[offset]; + value = *(uint64_t *) &sc->memregs[i][offset]; } else { printf("dior: ior unknown size %d\n", size); } } - if (baridx > 1) { + if (baridx > 2 || baridx < 0) { printf("dior: unknown bar idx %d\n", baridx); return (0); } diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h index 6af01c4c3c..853badaadb 100644 --- a/usr/src/cmd/bhyve/pci_emul.h +++ b/usr/src/cmd/bhyve/pci_emul.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/pci_emul.h 269700 2014-08-08 03:49:01Z neel $ + * $FreeBSD$ */ #ifndef _PCI_EMUL_H_ @@ -142,6 +144,8 @@ struct pci_devinst { int pba_size; int function_mask; struct msix_table_entry *table; /* allocated at runtime */ + void *pba_page; + int pba_page_offset; } pi_msix; void *pi_arg; /* devemu-private data */ @@ -158,6 +162,7 @@ struct msicap { uint32_t addrhi; uint16_t msgdata; } __packed; +static_assert(sizeof(struct msicap) == 14, "compile-time assertion failed"); struct msixcap { uint8_t capid; @@ -166,6 +171,7 @@ struct msixcap { uint32_t table_info; /* bar index and offset within it */ uint32_t pba_info; /* bar index and offset within it */ } __packed; +static_assert(sizeof(struct msixcap) == 12, "compile-time assertion failed"); struct pciecap { uint8_t capid; @@ -200,6 +206,7 @@ struct pciecap { uint16_t slot_control2; uint16_t slot_status2; } __packed; +static_assert(sizeof(struct pciecap) == 60, "compile-time assertion failed"); typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, int ioapic_irq, void *arg); @@ -225,8 +232,9 @@ int pci_msi_enabled(struct pci_devinst *pi); int pci_msix_enabled(struct pci_devinst *pi); int pci_msix_table_bar(struct pci_devinst *pi); int pci_msix_pba_bar(struct pci_devinst *pi); -int pci_msi_msgnum(struct pci_devinst *pi); +int pci_msi_maxmsgnum(struct pci_devinst *pi); int pci_parse_slot(char *opt); +void pci_print_supported_devices(); void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, diff --git a/usr/src/cmd/bhyve/pci_fbuf.c b/usr/src/cmd/bhyve/pci_fbuf.c new file mode 100644 index 0000000000..8d24dde9da --- /dev/null +++ b/usr/src/cmd/bhyve/pci_fbuf.c @@ -0,0 +1,467 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Nahanni Systems, Inc. + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include + +#include +#include +#include + +#include +#include + +#include "bhyvegc.h" +#include "bhyverun.h" +#include "console.h" +#include "inout.h" +#include "pci_emul.h" +#include "rfb.h" +#include "vga.h" + +/* + * bhyve Framebuffer device emulation. + * BAR0 points to the current mode information. + * BAR1 is the 32-bit framebuffer address. + * + * -s ,fbuf,wait,vga=on|io|off,rfb=:port,w=width,h=height + */ + +static int fbuf_debug = 1; +#define DEBUG_INFO 1 +#define DEBUG_VERBOSE 4 +#define DPRINTF(level, params) if (level <= fbuf_debug) printf params + + +#define KB (1024UL) +#define MB (1024 * 1024UL) + +#define DMEMSZ 128 + +#define FB_SIZE (16*MB) + +#define COLS_MAX 1920 +#define ROWS_MAX 1200 + +#define COLS_DEFAULT 1024 +#define ROWS_DEFAULT 768 + +#define COLS_MIN 640 +#define ROWS_MIN 480 + +struct pci_fbuf_softc { + struct pci_devinst *fsc_pi; + struct { + uint32_t fbsize; + uint16_t width; + uint16_t height; + uint16_t depth; + uint16_t refreshrate; + uint8_t reserved[116]; + } __packed memregs; + + /* rfb server */ + char *rfb_host; + char *rfb_password; + int rfb_port; +#ifndef __FreeBSD__ + char *rfb_unix; +#endif + int rfb_wait; + int vga_enabled; + int vga_full; + + uint32_t fbaddr; + char *fb_base; + uint16_t gc_width; + uint16_t gc_height; + void *vgasc; + struct bhyvegc_image *gc_image; +}; + +static struct pci_fbuf_softc *fbuf_sc; + +#define PCI_FBUF_MSI_MSGS 4 + +static void +pci_fbuf_usage(char *opt) +{ + + fprintf(stderr, "Invalid fbuf emulation option \"%s\"\r\n", opt); + fprintf(stderr, "fbuf: {wait,}{vga=on|io|off,}rfb=:port" + "{,w=width}{,h=height}\r\n"); +} + +static void +pci_fbuf_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_fbuf_softc *sc; + uint8_t *p; + + assert(baridx == 0); + + sc = pi->pi_arg; + + DPRINTF(DEBUG_VERBOSE, + ("fbuf wr: offset 0x%lx, size: %d, value: 0x%lx\n", + offset, size, value)); + + if (offset + size > DMEMSZ) { + printf("fbuf: write too large, offset %ld size %d\n", + offset, size); + return; + } + + p = (uint8_t *)&sc->memregs + offset; + + switch (size) { + case 1: + *p = value; + break; + case 2: + *(uint16_t *)p = value; + break; + case 4: + *(uint32_t *)p = value; + break; + case 8: + *(uint64_t *)p = value; + break; + default: + printf("fbuf: write unknown size %d\n", size); + break; + } + + if (!sc->gc_image->vgamode && sc->memregs.width == 0 && + sc->memregs.height == 0) { + DPRINTF(DEBUG_INFO, ("switching to VGA mode\r\n")); + sc->gc_image->vgamode = 1; + sc->gc_width = 0; + sc->gc_height = 0; + } else if (sc->gc_image->vgamode && sc->memregs.width != 0 && + sc->memregs.height != 0) { + DPRINTF(DEBUG_INFO, ("switching to VESA mode\r\n")); + sc->gc_image->vgamode = 0; + } +} + +uint64_t +pci_fbuf_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + struct pci_fbuf_softc *sc; + uint8_t *p; + uint64_t value; + + assert(baridx == 0); + + sc = pi->pi_arg; + + + if (offset + size > DMEMSZ) { + printf("fbuf: read too large, offset %ld size %d\n", + offset, size); + return (0); + } + + p = (uint8_t *)&sc->memregs + offset; + value = 0; + switch (size) { + case 1: + value = *p; + break; + case 2: + value = *(uint16_t *)p; + break; + case 4: + value = *(uint32_t *)p; + break; + case 8: + value = *(uint64_t *)p; + break; + default: + printf("fbuf: read unknown size %d\n", size); + break; + } + + 
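+	/*
+	 * (Reference sketch: BAR0 exposes the memregs structure above, so
+	 * fbsize lives at offset 0 and the 16-bit width/height/depth/
+	 * refreshrate fields at offsets 4/6/8/10.  A hypothetical guest
+	 * driver would program a 1024x768 mode with two 16-bit MMIO
+	 * writes --
+	 *
+	 *	*(volatile uint16_t *)(bar0 + 4) = 1024;   // width
+	 *	*(volatile uint16_t *)(bar0 + 6) = 768;    // height
+	 *
+	 * -- which pci_fbuf_write() above turns into a VESA-mode switch
+	 * once both fields are non-zero.)
+	 */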
DPRINTF(DEBUG_VERBOSE, + ("fbuf rd: offset 0x%lx, size: %d, value: 0x%lx\n", + offset, size, value)); + + return (value); +} + +static int +pci_fbuf_parse_opts(struct pci_fbuf_softc *sc, char *opts) +{ + char *uopts, *xopts, *config; + char *tmpstr; + int ret; + + ret = 0; + uopts = strdup(opts); + for (xopts = strtok(uopts, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + if (strcmp(xopts, "wait") == 0) { + sc->rfb_wait = 1; + continue; + } + + if ((config = strchr(xopts, '=')) == NULL) { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + + *config++ = '\0'; + + DPRINTF(DEBUG_VERBOSE, ("pci_fbuf option %s = %s\r\n", + xopts, config)); + + if (!strcmp(xopts, "tcp") || !strcmp(xopts, "rfb")) { + /* + * IPv4 -- host-ip:port + * IPv6 -- [host-ip%zone]:port + * XXX for now port is mandatory. + */ + tmpstr = strsep(&config, "]"); + if (config) { + if (tmpstr[0] == '[') + tmpstr++; + sc->rfb_host = tmpstr; + if (config[0] == ':') + config++; + else { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + sc->rfb_port = atoi(config); + } else { + config = tmpstr; + tmpstr = strsep(&config, ":"); + if (!config) + sc->rfb_port = atoi(tmpstr); + else { + sc->rfb_port = atoi(config); + sc->rfb_host = tmpstr; + } + } +#ifndef __FreeBSD__ + } else if (!strcmp(xopts, "unix")) { + sc->rfb_unix = config; +#endif + } else if (!strcmp(xopts, "vga")) { + if (!strcmp(config, "off")) { + sc->vga_enabled = 0; + } else if (!strcmp(config, "io")) { + sc->vga_enabled = 1; + sc->vga_full = 0; + } else if (!strcmp(config, "on")) { + sc->vga_enabled = 1; + sc->vga_full = 1; + } else { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + } else if (!strcmp(xopts, "w")) { + sc->memregs.width = atoi(config); + if (sc->memregs.width > COLS_MAX) { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } else if (sc->memregs.width == 0) + sc->memregs.width = 1920; + } else if (!strcmp(xopts, "h")) { + sc->memregs.height = atoi(config); + if (sc->memregs.height > ROWS_MAX) { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } else if (sc->memregs.height == 0) + sc->memregs.height = 1080; + } else if (!strcmp(xopts, "password")) { + sc->rfb_password = config; + } else { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + } + +done: + return (ret); +} + + +extern void vga_render(struct bhyvegc *gc, void *arg); + +void +pci_fbuf_render(struct bhyvegc *gc, void *arg) +{ + struct pci_fbuf_softc *sc; + + sc = arg; + + if (sc->vga_full && sc->gc_image->vgamode) { + /* TODO: mode switching to vga and vesa should use the special + * EFI-bhyve protocol port. 
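+		 * Until then, the width/height heuristic in
+		 * pci_fbuf_write() -- both zero selects VGA, both non-zero
+		 * selects VESA -- is what actually drives the switch.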
+ */ + vga_render(gc, sc->vgasc); + return; + } + if (sc->gc_width != sc->memregs.width || + sc->gc_height != sc->memregs.height) { + bhyvegc_resize(gc, sc->memregs.width, sc->memregs.height); + sc->gc_width = sc->memregs.width; + sc->gc_height = sc->memregs.height; + } + + return; +} + +static int +pci_fbuf_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int error, prot; + struct pci_fbuf_softc *sc; + + if (fbuf_sc != NULL) { + fprintf(stderr, "Only one frame buffer device is allowed.\n"); + return (-1); + } + + sc = calloc(1, sizeof(struct pci_fbuf_softc)); + + pi->pi_arg = sc; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x40FB); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_DISPLAY); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_DISPLAY_VGA); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, FB_SIZE); + assert(error == 0); + + error = pci_emul_add_msicap(pi, PCI_FBUF_MSI_MSGS); + assert(error == 0); + + sc->fbaddr = pi->pi_bar[1].addr; + sc->memregs.fbsize = FB_SIZE; + sc->memregs.width = COLS_DEFAULT; + sc->memregs.height = ROWS_DEFAULT; + sc->memregs.depth = 32; + + sc->vga_enabled = 1; + sc->vga_full = 0; + + sc->fsc_pi = pi; + + error = pci_fbuf_parse_opts(sc, opts); + if (error != 0) + goto done; + + /* XXX until VGA rendering is enabled */ + if (sc->vga_full != 0) { + fprintf(stderr, "pci_fbuf: VGA rendering not enabled"); + goto done; + } + + sc->fb_base = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", FB_SIZE); + if (sc->fb_base == MAP_FAILED) { + error = -1; + goto done; + } + DPRINTF(DEBUG_INFO, ("fbuf frame buffer base: %p [sz %lu]\r\n", + sc->fb_base, FB_SIZE)); + + /* + * Map the framebuffer into the guest address space. + * XXX This may fail if the BAR is different than a prior + * run. In this case flag the error. This will be fixed + * when a change_memseg api is available. + */ + prot = PROT_READ | PROT_WRITE; + if (vm_mmap_memseg(ctx, sc->fbaddr, VM_FRAMEBUFFER, 0, FB_SIZE, prot) != 0) { + fprintf(stderr, "pci_fbuf: mapseg failed - try deleting VM and restarting\n"); + error = -1; + goto done; + } + + console_init(sc->memregs.width, sc->memregs.height, sc->fb_base); + console_fb_register(pci_fbuf_render, sc); + + if (sc->vga_enabled) + sc->vgasc = vga_init(!sc->vga_full); + sc->gc_image = console_get_image(); + + fbuf_sc = sc; + + memset((void *)sc->fb_base, 0, FB_SIZE); + +#ifdef __FreeBSD__ + error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, sc->rfb_password); +#else + if (sc->rfb_unix != NULL) { + error = rfb_init_unix(sc->rfb_unix, sc->rfb_wait, + sc->rfb_password); + } else { + error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, + sc->rfb_password); + } +#endif +done: + if (error) + free(sc); + + return (error); +} + +struct pci_devemu pci_fbuf = { + .pe_emu = "fbuf", + .pe_init = pci_fbuf_init, + .pe_barwrite = pci_fbuf_write, + .pe_barread = pci_fbuf_read +}; +PCI_EMUL_SET(pci_fbuf); diff --git a/usr/src/cmd/bhyve/pci_hostbridge.c b/usr/src/cmd/bhyve/pci_hostbridge.c index 08956d082e..b926c7817e 100644 --- a/usr/src/cmd/bhyve/pci_hostbridge.c +++ b/usr/src/cmd/bhyve/pci_hostbridge.c @@ -1,5 +1,8 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2018 Joyent, Inc. * All rights reserved. 
 *
 * Redistribution and use in source and binary forms, with or without
@@ -23,14 +26,21 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
- * $FreeBSD: head/usr.sbin/bhyve/pci_hostbridge.c 283264 2015-05-21 20:11:52Z tychon $
+ * $FreeBSD$
 */
 
 #include 
-__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_hostbridge.c 283264 2015-05-21 20:11:52Z tychon $");
+#ifndef __FreeBSD__
+#include 
+#include 
+#include 
+#include 
+#endif
+__FBSDID("$FreeBSD$");
 
 #include "pci_emul.h"
 
+#ifdef __FreeBSD__
 static int
 pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 {
@@ -56,6 +66,162 @@ pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
 	return (0);
 }
 
+#else
+static void
+pci_hostbridge_setup(struct pci_devinst *pi, uint16_t vendor, uint16_t device)
+{
+	/* config space */
+	pci_set_cfgdata16(pi, PCIR_VENDOR, vendor);
+	pci_set_cfgdata16(pi, PCIR_DEVICE, device);
+	pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL);
+	pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
+	pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
+
+	pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT);
+}
+
+
+static int
+pci_hostbridge_parse_pci_val(const char *in, uint16_t *val)
+{
+	long num;
+	char *endp = NULL;
+
+	errno = 0;
+	num = strtol(in, &endp, 0);
+	if (errno != 0 || endp == NULL || *endp != '\0') {
+		fprintf(stderr, "pci_hostbridge: invalid num '%s'", in);
+		return (-1);
+	} else if (num < 1 || num > UINT16_MAX) {
+		fprintf(stderr, "pci_hostbridge: 0x%04lx out of range", num);
+		return (-1);
+	}
+	*val = num;
+	return (0);
+}
+
+static struct pci_hostbridge_model {
+	const char	*phm_model;
+	uint16_t	phm_vendor;
+	uint16_t	phm_device;
+} pci_hb_models[] = {
+	{ "amd",	0x1022, 0x7432 },	/* AMD/made-up */
+	{ "netapp",	0x1275, 0x1275 },	/* NetApp/NetApp */
+	{ "i440fx",	0x8086, 0x1237 },	/* Intel/82441 */
+	{ "q35",	0x8086, 0x29b0 },	/* Intel/Q35 HB */
+};
+
+#define	NUM_HB_MODELS	(sizeof (pci_hb_models) / sizeof (pci_hb_models[0]))
+
+static int
+pci_hostbridge_parse_args(char *opts, uint16_t *vendorp, uint16_t *devicep)
+{
+	const char *model = NULL;
+	char *next;
+	uint16_t vendor = 0, device = 0;
+	int err = 0;
+
+	for (; opts != NULL && *opts != '\0'; opts = next) {
+		char *val, *cp;
+
+		if ((cp = strchr(opts, ',')) != NULL) {
+			*cp = '\0';
+			next = cp + 1;
+		} else {
+			next = NULL;
+		}
+
+		if ((cp = strchr(opts, '=')) == NULL) {
+			fprintf(stderr,
+			    "pci_hostbridge: expected value for param"
+			    " (%s=VAL)", opts);
+			err = -1;
+			continue;
+		}
+
+		/* <param>=<value> handling */
+		val = cp + 1;
+		*cp = '\0';
+		if (strcmp(opts, "model") == 0) {
+			model = val;
+		} else if (strcmp(opts, "vendor") == 0) {
+			if (pci_hostbridge_parse_pci_val(val, &vendor) != 0) {
+				err = -1;
+				continue;
+			}
+		} else if (strcmp(opts, "device") == 0) {
+			if (pci_hostbridge_parse_pci_val(val, &device) != 0) {
+				err = -1;
+				continue;
+			}
+		} else {
+			fprintf(stderr,
+			    "pci_hostbridge: unrecognized option '%s'", opts);
+			err = -1;
+			continue;
+		}
+	}
+	if (err != 0) {
+		return (err);
+	}
+
+	if (model != NULL && (vendor != 0 || device != 0)) {
+		fprintf(stderr, "pci_hostbridge: cannot specify model "
+		    "and vendor/device");
+		return (-1);
+	} else if ((vendor != 0 && device == 0) ||
+	    (vendor == 0 && device != 0)) {
+		fprintf(stderr, "pci_hostbridge: must specify both vendor and "
+		    "device for custom hostbridge");
+		return (-1);
+	}
+	if (model != NULL) {
+		uint_t i;
+
+		for (i = 0; i < NUM_HB_MODELS; i++) {
+			if (strcmp(model, pci_hb_models[i].phm_model) != 0)
+				continue;
+
/* found a model match */ + *vendorp = pci_hb_models[i].phm_vendor; + *devicep = pci_hb_models[i].phm_device; + return (0); + } + fprintf(stderr, "pci_hostbridge: invalid model '%s'", model); + return (-1); + } + + /* custom hostbridge ID was specified */ + *vendorp = vendor; + *devicep = device; + return (0); +} + +static int +pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + uint16_t vendor, device; + + if (opts == NULL) { + /* Fall back to NetApp default if no options are specified */ + vendor = 0x1275; + device = 0x1275; + } else if (pci_hostbridge_parse_args(opts, &vendor, &device) != 0) { + return (-1); + } + + pci_hostbridge_setup(pi, vendor, device); + return (0); +} + +static int +pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + pci_hostbridge_setup(pi, 0x1022, 0x7432); + return (0); +} + +#endif /* __FreeBSD__ */ struct pci_devemu pci_de_amd_hostbridge = { .pe_emu = "amd_hostbridge", diff --git a/usr/src/cmd/bhyve/pci_irq.c b/usr/src/cmd/bhyve/pci_irq.c index 97ee330c65..4ecb3eddb0 100644 --- a/usr/src/cmd/bhyve/pci_irq.c +++ b/usr/src/cmd/bhyve/pci_irq.c @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2014 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * @@ -27,7 +29,7 @@ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_irq.c 266125 2014-05-15 14:16:55Z jhb $"); +__FBSDID("$FreeBSD$"); #include #include @@ -115,7 +117,7 @@ void pci_irq_reserve(int irq) { - assert(irq < nitems(irq_counts)); + assert(irq >= 0 && irq < nitems(irq_counts)); assert(pirq_cold); assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED); irq_counts[irq] = IRQ_DISABLED; @@ -125,10 +127,10 @@ void pci_irq_use(int irq) { - assert(irq < nitems(irq_counts)); + assert(irq >= 0 && irq < nitems(irq_counts)); assert(pirq_cold); - if (irq_counts[irq] != IRQ_DISABLED) - irq_counts[irq]++; + assert(irq_counts[irq] != IRQ_DISABLED); + irq_counts[irq]++; } void @@ -193,19 +195,25 @@ pci_irq_deassert(struct pci_devinst *pi) } int -pirq_alloc_pin(struct vmctx *ctx) +pirq_alloc_pin(struct pci_devinst *pi) { + struct vmctx *ctx = pi->pi_vmctx; int best_count, best_irq, best_pin, irq, pin; - pirq_cold = 1; - - /* First, find the least-used PIRQ pin. */ - best_pin = 0; - best_count = pirqs[0].use_count; - for (pin = 1; pin < nitems(pirqs); pin++) { - if (pirqs[pin].use_count < best_count) { - best_pin = pin; - best_count = pirqs[pin].use_count; + pirq_cold = 0; + + if (lpc_bootrom()) { + /* For external bootrom use fixed mapping. */ + best_pin = (4 + pi->pi_slot + pi->pi_lintr.pin) % 8; + } else { + /* Find the least-used PIRQ pin. */ + best_pin = 0; + best_count = pirqs[0].use_count; + for (pin = 1; pin < nitems(pirqs); pin++) { + if (pirqs[pin].use_count < best_count) { + best_pin = pin; + best_count = pirqs[pin].use_count; + } } } pirqs[best_pin].use_count++; @@ -222,7 +230,7 @@ pirq_alloc_pin(struct vmctx *ctx) best_count = irq_counts[irq]; } } - assert(best_irq != 0); + assert(best_irq >= 0); irq_counts[best_irq]++; pirqs[best_pin].reg = best_irq; vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER); @@ -234,16 +242,12 @@ pirq_alloc_pin(struct vmctx *ctx) int pirq_irq(int pin) { - - if (pin == -1) - return (255); assert(pin > 0 && pin <= nitems(pirqs)); return (pirqs[pin - 1].reg & PIRQ_IRQ); } /* XXX: Generate $PIR table. 
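 * (Such a table would publish the pirqs[] routing chosen by
 * pirq_alloc_pin() above; note the bootrom case there uses the fixed
 * mapping (4 + slot + pin) % 8, e.g. slot 3/INTA# lands on pin 0.)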
 */
 
-#ifdef __FreeBSD__
 static void
 pirq_dsdt(void)
 {
@@ -348,4 +352,3 @@ pirq_dsdt(void)
 	free(irq_prs);
 }
 LPC_DSDT(pirq_dsdt);
-#endif
diff --git a/usr/src/cmd/bhyve/pci_irq.h b/usr/src/cmd/bhyve/pci_irq.h
index 483f12b61e..1ae56efc8f 100644
--- a/usr/src/cmd/bhyve/pci_irq.h
+++ b/usr/src/cmd/bhyve/pci_irq.h
@@ -1,5 +1,7 @@
 /*-
- * Copyright (c) 2014 Advanced Computing Technologies LLC
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2014 Hudson River Trading LLC
 * Written by: John H. Baldwin 
 * All rights reserved.
 *
@@ -24,7 +26,7 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
- * $FreeBSD: head/usr.sbin/bhyve/pci_irq.h 266125 2014-05-15 14:16:55Z jhb $
+ * $FreeBSD$
 */
 
 #ifndef __PCI_IRQ_H__
@@ -37,7 +39,7 @@ void pci_irq_deassert(struct pci_devinst *pi);
 void pci_irq_init(struct vmctx *ctx);
 void pci_irq_reserve(int irq);
 void pci_irq_use(int irq);
-int pirq_alloc_pin(struct vmctx *ctx);
+int pirq_alloc_pin(struct pci_devinst *pi);
 int pirq_irq(int pin);
 uint8_t pirq_read(int pin);
 void pirq_write(struct vmctx *ctx, int pin, uint8_t val);
diff --git a/usr/src/cmd/bhyve/pci_lpc.c b/usr/src/cmd/bhyve/pci_lpc.c
index 8c060150dc..b7ddb772a1 100644
--- a/usr/src/cmd/bhyve/pci_lpc.c
+++ b/usr/src/cmd/bhyve/pci_lpc.c
@@ -1,4 +1,6 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
 * Copyright (c) 2013 Neel Natu 
 * Copyright (c) 2013 Tycho Nightingale 
 * All rights reserved.
@@ -24,11 +26,15 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
- * $FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z neel $
+ * $FreeBSD$
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
 */
 
 #include 
-__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z neel $");
+__FBSDID("$FreeBSD$");
 
 #include 
 #include 
@@ -40,6 +46,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z ne
 #include 
 
 #include "acpi.h"
+#include "bootrom.h"
 #include "inout.h"
 #include "pci_emul.h"
 #include "pci_irq.h"
@@ -62,6 +69,8 @@ SYSRES_IO(NMISC_PORT, 1);
 
 static struct pci_devinst *lpc_bridge;
 
+static const char *romfile;
+
 #define	LPC_UART_NUM	2
 static struct lpc_uart_softc {
 	struct uart_softc *uart_softc;
@@ -76,7 +85,7 @@ static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" };
 /*
 * LPC device configuration is in the following form:
 * <lpc_device_name>[,<options>]
- * For e.g. "com1,stdio"
+ * For e.g. 
"com1,stdio" or "bootrom,/var/romfile" */ int lpc_device_parse(const char *opts) @@ -88,6 +97,11 @@ lpc_device_parse(const char *opts) str = cpy = strdup(opts); lpcdev = strsep(&str, ","); if (lpcdev != NULL) { + if (strcasecmp(lpcdev, "bootrom") == 0) { + romfile = str; + error = 0; + goto done; + } for (unit = 0; unit < LPC_UART_NUM; unit++) { if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) { lpc_uart_softc[unit].opts = str; @@ -104,6 +118,23 @@ done: return (error); } +void +lpc_print_supported_devices() +{ + size_t i; + + printf("bootrom\n"); + for (i = 0; i < LPC_UART_NUM; i++) + printf("%s\n", lpc_uart_names[i]); +} + +const char * +lpc_bootrom(void) +{ + + return (romfile); +} + static void lpc_uart_intr_assert(void *arg) { @@ -148,6 +179,21 @@ lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uart_write(sc->uart_softc, offset + 1, *eax >> 8); } break; +#ifndef __FreeBSD__ + case 4: + if (in) { + *eax = uart_read(sc->uart_softc, offset); + *eax |= uart_read(sc->uart_softc, offset + 1) << 8; + *eax |= uart_read(sc->uart_softc, offset + 2) << 16; + *eax |= uart_read(sc->uart_softc, offset + 3) << 24; + } else { + uart_write(sc->uart_softc, offset, *eax); + uart_write(sc->uart_softc, offset + 1, *eax >> 8); + uart_write(sc->uart_softc, offset + 2, *eax >> 16); + uart_write(sc->uart_softc, offset + 3, *eax >> 24); + } + break; +#endif default: return (-1); } @@ -156,13 +202,19 @@ lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, } static int -lpc_init(void) +lpc_init(struct vmctx *ctx) { struct lpc_uart_softc *sc; struct inout_port iop; const char *name; int unit, error; + if (romfile != NULL) { + error = bootrom_init(ctx, romfile); + if (error) + return (error); + } + /* COM1 and COM2 */ for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = &lpc_uart_softc[unit]; @@ -200,7 +252,6 @@ lpc_init(void) return (0); } -#ifdef __FreeBSD__ static void pci_lpc_write_dsdt(struct pci_devinst *pi) { @@ -320,7 +371,6 @@ pci_lpc_uart_dsdt(void) } } LPC_DSDT(pci_lpc_uart_dsdt); -#endif static int pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, @@ -381,7 +431,7 @@ pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) return (-1); } - if (lpc_init() != 0) + if (lpc_init(ctx) != 0) return (-1); /* initialize config space */ @@ -423,9 +473,7 @@ lpc_pirq_routed(void) struct pci_devemu pci_de_lpc = { .pe_emu = "lpc", .pe_init = pci_lpc_init, -#ifdef __FreeBSD__ .pe_write_dsdt = pci_lpc_write_dsdt, -#endif .pe_cfgwrite = pci_lpc_cfgwrite, .pe_barwrite = pci_lpc_write, .pe_barread = pci_lpc_read diff --git a/usr/src/cmd/bhyve/pci_lpc.h b/usr/src/cmd/bhyve/pci_lpc.h index 4f725b1dd3..9041f79c50 100644 --- a/usr/src/cmd/bhyve/pci_lpc.h +++ b/usr/src/cmd/bhyve/pci_lpc.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Neel Natu * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/pci_lpc.h 266125 2014-05-15 14:16:55Z jhb $ + * $FreeBSD$ */ #ifndef _LPC_H_ @@ -66,7 +68,9 @@ struct lpc_sysres { #define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length) int lpc_device_parse(const char *opt); +void lpc_print_supported_devices(); char *lpc_pirq_name(int pin); void lpc_pirq_routed(void); +const char *lpc_bootrom(void); #endif diff --git a/usr/src/cmd/bhyve/pci_nvme.c b/usr/src/cmd/bhyve/pci_nvme.c new file mode 100644 index 0000000000..a56c1d6959 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_nvme.c @@ -0,0 +1,1953 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017 Shunsuke Mie + * Copyright (c) 2018 Leon Dang + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * bhyve PCIe-NVMe device emulation. 
+ * + * options: + * -s ,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z + * + * accepted devpath: + * /dev/blockdev + * /path/to/image + * ram=size_in_MiB + * + * maxq = max number of queues + * qsz = max elements in each queue + * ioslots = max number of concurrent io requests + * sectsz = sector size (defaults to blockif sector size) + * ser = serial number (20-chars max) + * + */ + +/* TODO: + - create async event for smart and log + - intr coalesce + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "bhyverun.h" +#include "block_if.h" +#include "pci_emul.h" + + +static int nvme_debug = 0; +#define DPRINTF(params) if (nvme_debug) printf params +#define WPRINTF(params) printf params + +/* defaults; can be overridden */ +#define NVME_MSIX_BAR 4 + +#define NVME_IOSLOTS 8 + +/* The NVMe spec defines bits 13:4 in BAR0 as reserved */ +#define NVME_MMIO_SPACE_MIN (1 << 14) + +#define NVME_QUEUES 16 +#define NVME_MAX_QENTRIES 2048 + +#define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) +#define NVME_MAX_BLOCKIOVS 512 + +/* helpers */ + +/* Convert a zero-based value into a one-based value */ +#define ONE_BASED(zero) ((zero) + 1) +/* Convert a one-based value into a zero-based value */ +#define ZERO_BASED(one) ((one) - 1) + +/* Encode number of SQ's and CQ's for Set/Get Features */ +#define NVME_FEATURE_NUM_QUEUES(sc) \ + (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ + (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; + +#define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) + +enum nvme_controller_register_offsets { + NVME_CR_CAP_LOW = 0x00, + NVME_CR_CAP_HI = 0x04, + NVME_CR_VS = 0x08, + NVME_CR_INTMS = 0x0c, + NVME_CR_INTMC = 0x10, + NVME_CR_CC = 0x14, + NVME_CR_CSTS = 0x1c, + NVME_CR_NSSR = 0x20, + NVME_CR_AQA = 0x24, + NVME_CR_ASQ_LOW = 0x28, + NVME_CR_ASQ_HI = 0x2c, + NVME_CR_ACQ_LOW = 0x30, + NVME_CR_ACQ_HI = 0x34, +}; + +enum nvme_cmd_cdw11 { + NVME_CMD_CDW11_PC = 0x0001, + NVME_CMD_CDW11_IEN = 0x0002, + NVME_CMD_CDW11_IV = 0xFFFF0000, +}; + +#define NVME_CQ_INTEN 0x01 +#define NVME_CQ_INTCOAL 0x02 + +struct nvme_completion_queue { + struct nvme_completion *qbase; + uint32_t size; + uint16_t tail; /* nvme progress */ + uint16_t head; /* guest progress */ + uint16_t intr_vec; + uint32_t intr_en; + pthread_mutex_t mtx; +}; + +struct nvme_submission_queue { + struct nvme_command *qbase; + uint32_t size; + uint16_t head; /* nvme progress */ + uint16_t tail; /* guest progress */ + uint16_t cqid; /* completion queue id */ + int busy; /* queue is being processed */ + int qpriority; +}; + +enum nvme_storage_type { + NVME_STOR_BLOCKIF = 0, + NVME_STOR_RAM = 1, +}; + +struct pci_nvme_blockstore { + enum nvme_storage_type type; + void *ctx; + uint64_t size; + uint32_t sectsz; + uint32_t sectsz_bits; +}; + +struct pci_nvme_ioreq { + struct pci_nvme_softc *sc; + struct pci_nvme_ioreq *next; + struct nvme_submission_queue *nvme_sq; + uint16_t sqid; + + /* command information */ + uint16_t opc; + uint16_t cid; + uint32_t nsid; + + uint64_t prev_gpaddr; + size_t prev_size; + + /* + * lock if all iovs consumed (big IO); + * complete transaction before continuing + */ + pthread_mutex_t mtx; + pthread_cond_t cv; + + struct blockif_req io_req; + + /* pad to fit up to 512 page descriptors from guest IO request */ + struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; +}; + +struct pci_nvme_softc { + struct pci_devinst *nsc_pi; + + 
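+	/* guards register state, the reset path and the ioreq free list */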
pthread_mutex_t mtx; + + struct nvme_registers regs; + + struct nvme_namespace_data nsdata; + struct nvme_controller_data ctrldata; + struct nvme_error_information_entry err_log; + struct nvme_health_information_page health_log; + struct nvme_firmware_page fw_log; + + struct pci_nvme_blockstore nvstore; + + uint16_t max_qentries; /* max entries per queue */ + uint32_t max_queues; /* max number of IO SQ's or CQ's */ + uint32_t num_cqueues; + uint32_t num_squeues; + + struct pci_nvme_ioreq *ioreqs; + struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */ + uint32_t pending_ios; + uint32_t ioslots; + sem_t iosemlock; + + /* + * Memory mapped Submission and Completion queues + * Each array includes both Admin and IO queues + */ + struct nvme_completion_queue *compl_queues; + struct nvme_submission_queue *submit_queues; + + /* controller features */ + uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */ + uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */ + uint32_t async_ev_config; /* 0x0B: async event config */ +}; + + +static void pci_nvme_io_partial(struct blockif_req *br, int err); + +/* Controller Configuration utils */ +#define NVME_CC_GET_EN(cc) \ + ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) +#define NVME_CC_GET_CSS(cc) \ + ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) +#define NVME_CC_GET_SHN(cc) \ + ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) +#define NVME_CC_GET_IOSQES(cc) \ + ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) +#define NVME_CC_GET_IOCQES(cc) \ + ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) + +#define NVME_CC_WRITE_MASK \ + ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ + (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ + (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) + +#define NVME_CC_NEN_WRITE_MASK \ + ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ + (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ + (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) + +/* Controller Status utils */ +#define NVME_CSTS_GET_RDY(sts) \ + ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) + +#define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) + +/* Completion Queue status word utils */ +#define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) +#define NVME_STATUS_MASK \ + ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ + (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) + +static __inline void +cpywithpad(char *dst, size_t dst_size, const char *src, char pad) +{ + size_t len; + + len = strnlen(src, dst_size); + memset(dst, pad, dst_size); + memcpy(dst, src, len); +} + +static __inline void +pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) +{ + + *status &= ~NVME_STATUS_MASK; + *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | + (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; +} + +static __inline void +pci_nvme_status_genc(uint16_t *status, uint16_t code) +{ + + pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); +} + +static __inline void +pci_nvme_toggle_phase(uint16_t *status, int prev) +{ + + if (prev) + *status &= ~NVME_STATUS_P; + else + *status |= NVME_STATUS_P; +} + +static void +pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) +{ + struct nvme_controller_data *cd = &sc->ctrldata; + + cd->vid = 0xFB5D; + cd->ssvid = 0x0000; + + cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); + cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); + + /* Num of submission commands that we can handle at a time (2^rab) */ + 
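+	/*
+	 * e.g. rab = 4 advertises an arbitration burst of 2^4 = 16 commands;
+	 * likewise mdts = 9 below caps one transfer at 2^9 pages, i.e. 2 MiB
+	 * with the minimum 4 KiB page size.
+	 */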
cd->rab = 4; + + /* FreeBSD OUI */ + cd->ieee[0] = 0x58; + cd->ieee[1] = 0x9c; + cd->ieee[2] = 0xfc; + + cd->mic = 0; + + cd->mdts = 9; /* max data transfer size (2^mdts * CAP.MPSMIN) */ + + cd->ver = 0x00010300; + + cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; + cd->acl = 2; + cd->aerl = 4; + + cd->lpa = 0; /* TODO: support some simple things like SMART */ + cd->elpe = 0; /* max error log page entries */ + cd->npss = 1; /* number of power states support */ + + /* Warning Composite Temperature Threshold */ + cd->wctemp = 0x0157; + + cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | + (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); + cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | + (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); + cd->nn = 1; /* number of namespaces */ + + cd->fna = 0x03; + + cd->power_state[0].mp = 10; +} + +static void +pci_nvme_init_nsdata(struct pci_nvme_softc *sc) +{ + struct nvme_namespace_data *nd; + + nd = &sc->nsdata; + + nd->nsze = sc->nvstore.size / sc->nvstore.sectsz; + nd->ncap = nd->nsze; + nd->nuse = nd->nsze; + + /* Get LBA and backstore information from backing store */ + nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ + /* LBA data-sz = 2^lbads */ + nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; + + nd->flbas = 0; +} + +static void +pci_nvme_init_logpages(struct pci_nvme_softc *sc) +{ + + memset(&sc->err_log, 0, sizeof(sc->err_log)); + memset(&sc->health_log, 0, sizeof(sc->health_log)); + memset(&sc->fw_log, 0, sizeof(sc->fw_log)); +} + +static void +pci_nvme_reset_locked(struct pci_nvme_softc *sc) +{ + DPRINTF(("%s\r\n", __func__)); + + sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | + (1 << NVME_CAP_LO_REG_CQR_SHIFT) | + (60 << NVME_CAP_LO_REG_TO_SHIFT); + + sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; + + sc->regs.vs = 0x00010300; /* NVMe v1.3 */ + + sc->regs.cc = 0; + sc->regs.csts = 0; + + sc->num_cqueues = sc->num_squeues = sc->max_queues; + if (sc->submit_queues != NULL) { + for (int i = 0; i < sc->num_squeues + 1; i++) { + /* + * The Admin Submission Queue is at index 0. + * It must not be changed at reset otherwise the + * emulation will be out of sync with the guest. 
+ */ + if (i != 0) { + sc->submit_queues[i].qbase = NULL; + sc->submit_queues[i].size = 0; + sc->submit_queues[i].cqid = 0; + } + sc->submit_queues[i].tail = 0; + sc->submit_queues[i].head = 0; + sc->submit_queues[i].busy = 0; + } + } else + sc->submit_queues = calloc(sc->num_squeues + 1, + sizeof(struct nvme_submission_queue)); + + if (sc->compl_queues != NULL) { + for (int i = 0; i < sc->num_cqueues + 1; i++) { + /* See Admin Submission Queue note above */ + if (i != 0) { + sc->compl_queues[i].qbase = NULL; + sc->compl_queues[i].size = 0; + } + + sc->compl_queues[i].tail = 0; + sc->compl_queues[i].head = 0; + } + } else { + sc->compl_queues = calloc(sc->num_cqueues + 1, + sizeof(struct nvme_completion_queue)); + + for (int i = 0; i < sc->num_cqueues + 1; i++) + pthread_mutex_init(&sc->compl_queues[i].mtx, NULL); + } +} + +static void +pci_nvme_reset(struct pci_nvme_softc *sc) +{ + pthread_mutex_lock(&sc->mtx); + pci_nvme_reset_locked(sc); + pthread_mutex_unlock(&sc->mtx); +} + +static void +pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) +{ + uint16_t acqs, asqs; + + DPRINTF(("%s\r\n", __func__)); + + asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; + sc->submit_queues[0].size = asqs; + sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, + sizeof(struct nvme_command) * asqs); + + DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n", + __func__, sc->regs.asq, sc->submit_queues[0].qbase)); + + acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & + NVME_AQA_REG_ACQS_MASK) + 1; + sc->compl_queues[0].size = acqs; + sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, + sizeof(struct nvme_completion) * acqs); + DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n", + __func__, sc->regs.acq, sc->compl_queues[0].qbase)); +} + +static int +nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src, + size_t len) +{ + uint8_t *dst; + size_t bytes; + + if (len > (8 * 1024)) { + return (-1); + } + + /* Copy from the start of prp1 to the end of the physical page */ + bytes = PAGE_SIZE - (prp1 & PAGE_MASK); + bytes = MIN(bytes, len); + + dst = vm_map_gpa(ctx, prp1, bytes); + if (dst == NULL) { + return (-1); + } + + memcpy(dst, src, bytes); + + src += bytes; + + len -= bytes; + if (len == 0) { + return (0); + } + + len = MIN(len, PAGE_SIZE); + + dst = vm_map_gpa(ctx, prp2, len); + if (dst == NULL) { + return (-1); + } + + memcpy(dst, src, len); + + return (0); +} + +static int +nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t qid = command->cdw10 & 0xffff; + + DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid)); + if (qid == 0 || qid > sc->num_squeues) { + WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n", + __func__, qid, sc->num_squeues)); + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + sc->submit_queues[qid].qbase = NULL; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + if (command->cdw11 & NVME_CMD_CDW11_PC) { + uint16_t qid = command->cdw10 & 0xffff; + struct nvme_submission_queue *nsq; + + if ((qid == 0) || (qid > sc->num_squeues)) { + WPRINTF(("%s queue index %u > num_squeues %u\r\n", + __func__, qid, sc->num_squeues)); + pci_nvme_status_tc(&compl->status, + NVME_SCT_COMMAND_SPECIFIC, + 
	    NVME_SC_INVALID_QUEUE_IDENTIFIER);
+			return (1);
+		}
+
+		nsq = &sc->submit_queues[qid];
+		nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
+
+		nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1,
+		              sizeof(struct nvme_command) * (size_t)nsq->size);
+		nsq->cqid = (command->cdw11 >> 16) & 0xffff;
+		nsq->qpriority = (command->cdw11 >> 1) & 0x03;
+
+		DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__,
+		        qid, nsq->size, nsq->qbase, nsq->cqid));
+
+		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+
+		DPRINTF(("%s completed creating IOSQ qid %u\r\n",
+		         __func__, qid));
+	} else {
+		/*
+		 * Guest sent non-cont submission queue request.
+		 * This setting is unsupported by this emulation.
+		 */
+		WPRINTF(("%s unsupported non-contig (list-based) "
+		         "create i/o submission queue\r\n", __func__));
+
+		pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD);
+	}
+	return (1);
+}
+
+static int
+nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
+	struct nvme_completion* compl)
+{
+	uint16_t qid = command->cdw10 & 0xffff;
+
+	DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid));
+	if (qid == 0 || qid > sc->num_cqueues) {
+		WPRINTF(("%s queue index %u / num_cqueues %u\r\n",
+		        __func__, qid, sc->num_cqueues));
+		pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC,
+		    NVME_SC_INVALID_QUEUE_IDENTIFIER);
+		return (1);
+	}
+
+	sc->compl_queues[qid].qbase = NULL;
+	pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+	return (1);
+}
+
+static int
+nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command,
+	struct nvme_completion* compl)
+{
+	if (command->cdw11 & NVME_CMD_CDW11_PC) {
+		uint16_t qid = command->cdw10 & 0xffff;
+		struct nvme_completion_queue *ncq;
+
+		if ((qid == 0) || (qid > sc->num_cqueues)) {
+			WPRINTF(("%s queue index %u > num_cqueues %u\r\n",
+			        __func__, qid, sc->num_cqueues));
+			pci_nvme_status_tc(&compl->status,
+			    NVME_SCT_COMMAND_SPECIFIC,
+			    NVME_SC_INVALID_QUEUE_IDENTIFIER);
+			return (1);
+		}
+
+		ncq = &sc->compl_queues[qid];
+		ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1;
+		ncq->intr_vec = (command->cdw11 >> 16) & 0xffff;
+		ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff);
+
+		/* CQ entries are 16-byte completions, not 64-byte commands */
+		ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx,
+		    command->prp1,
+		    sizeof(struct nvme_completion) * (size_t)ncq->size);
+
+		pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS);
+	} else {
+		/*
+		 * Non-contig completion queue unsupported.
+ */ + WPRINTF(("%s unsupported non-contig (list-based) " + "create i/o completion queue\r\n", + __func__)); + + /* 0x12 = Invalid Use of Controller Memory Buffer */ + pci_nvme_status_genc(&compl->status, 0x12); + } + + return (1); +} + +static int +nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2; + uint8_t logpage = command->cdw10 & 0xFF; + + DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize)); + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + + switch (logpage) { + case NVME_LOG_ERROR: + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->err_log, logsize); + break; + case NVME_LOG_HEALTH_INFORMATION: + /* TODO: present some smart info */ + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->health_log, logsize); + break; + case NVME_LOG_FIRMWARE_SLOT: + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->fw_log, logsize); + break; + default: + WPRINTF(("%s get log page %x command not supported\r\n", + __func__, logpage)); + + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_LOG_PAGE); + } + + return (1); +} + +static int +nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + void *dest; + + DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__, + command->cdw10 & 0xFF, command->nsid)); + + switch (command->cdw10 & 0xFF) { + case 0x00: /* return Identify Namespace data structure */ + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata)); + break; + case 0x01: /* return Identify Controller data structure */ + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->ctrldata, + sizeof(sc->ctrldata)); + break; + case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ + dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(uint32_t) * 1024); + ((uint32_t *)dest)[0] = 1; + ((uint32_t *)dest)[1] = 0; + break; + case 0x11: + pci_nvme_status_genc(&compl->status, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return (1); + case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ + case 0x10: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + default: + DPRINTF(("%s unsupported identify command requested 0x%x\r\n", + __func__, command->cdw10 & 0xFF)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t nqr; /* Number of Queues Requested */ + + nqr = command->cdw11 & 0xFFFF; + if (nqr == 0xffff) { + WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (-1); + } + + sc->num_squeues = ONE_BASED(nqr); + if (sc->num_squeues > sc->max_queues) { + DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues, + sc->max_queues)); + sc->num_squeues = sc->max_queues; + } + + nqr = (command->cdw11 >> 16) & 0xFFFF; + if (nqr == 0xffff) { + WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (-1); + } + + sc->num_cqueues = ONE_BASED(nqr); + if (sc->num_cqueues > 
sc->max_queues) { + DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues, + sc->max_queues)); + sc->num_cqueues = sc->max_queues; + } + + compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); + + return (0); +} + +static int +nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + int feature = command->cdw10 & 0xFF; + uint32_t iv; + + DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); + compl->cdw0 = 0; + + switch (feature) { + case NVME_FEAT_ARBITRATION: + DPRINTF((" arbitration 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_POWER_MANAGEMENT: + DPRINTF((" power management 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_LBA_RANGE_TYPE: + DPRINTF((" lba range 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_TEMPERATURE_THRESHOLD: + DPRINTF((" temperature threshold 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_ERROR_RECOVERY: + DPRINTF((" error recovery 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_VOLATILE_WRITE_CACHE: + DPRINTF((" volatile write cache 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_NUMBER_OF_QUEUES: + nvme_set_feature_queues(sc, command, compl); + break; + case NVME_FEAT_INTERRUPT_COALESCING: + DPRINTF((" interrupt coalescing 0x%x\r\n", command->cdw11)); + + /* in uS */ + sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100; + + sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF; + break; + case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: + iv = command->cdw11 & 0xFFFF; + + DPRINTF((" interrupt vector configuration 0x%x\r\n", + command->cdw11)); + + for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) { + if (sc->compl_queues[i].intr_vec == iv) { + if (command->cdw11 & (1 << 16)) + sc->compl_queues[i].intr_en |= + NVME_CQ_INTCOAL; + else + sc->compl_queues[i].intr_en &= + ~NVME_CQ_INTCOAL; + } + } + break; + case NVME_FEAT_WRITE_ATOMICITY: + DPRINTF((" write atomicity 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + DPRINTF((" async event configuration 0x%x\r\n", + command->cdw11)); + sc->async_ev_config = command->cdw11; + break; + case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: + DPRINTF((" software progress marker 0x%x\r\n", + command->cdw11)); + break; + case 0x0C: + DPRINTF((" autonomous power state transition 0x%x\r\n", + command->cdw11)); + break; + default: + WPRINTF(("%s invalid feature\r\n", __func__)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + int feature = command->cdw10 & 0xFF; + + DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); + + compl->cdw0 = 0; + + switch (feature) { + case NVME_FEAT_ARBITRATION: + DPRINTF((" arbitration\r\n")); + break; + case NVME_FEAT_POWER_MANAGEMENT: + DPRINTF((" power management\r\n")); + break; + case NVME_FEAT_LBA_RANGE_TYPE: + DPRINTF((" lba range\r\n")); + break; + case NVME_FEAT_TEMPERATURE_THRESHOLD: + DPRINTF((" temperature threshold\r\n")); + switch ((command->cdw11 >> 20) & 0x3) { + case 0: + /* Over temp threshold */ + compl->cdw0 = 0xFFFF; + break; + case 1: + /* Under temp threshold */ + compl->cdw0 = 0; + break; + default: + WPRINTF((" invalid threshold type select\r\n")); + pci_nvme_status_genc(&compl->status, + NVME_SC_INVALID_FIELD); + return (1); + } + break; + case NVME_FEAT_ERROR_RECOVERY: + DPRINTF((" error recovery\r\n")); + 
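+		/* cdw0 stays 0: no time-limited error recovery configured */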
break; + case NVME_FEAT_VOLATILE_WRITE_CACHE: + DPRINTF((" volatile write cache\r\n")); + break; + case NVME_FEAT_NUMBER_OF_QUEUES: + compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); + + DPRINTF((" number of queues (submit %u, completion %u)\r\n", + compl->cdw0 & 0xFFFF, + (compl->cdw0 >> 16) & 0xFFFF)); + + break; + case NVME_FEAT_INTERRUPT_COALESCING: + DPRINTF((" interrupt coalescing\r\n")); + break; + case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: + DPRINTF((" interrupt vector configuration\r\n")); + break; + case NVME_FEAT_WRITE_ATOMICITY: + DPRINTF((" write atomicity\r\n")); + break; + case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + DPRINTF((" async event configuration\r\n")); + sc->async_ev_config = command->cdw11; + break; + case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: + DPRINTF((" software progress marker\r\n")); + break; + case 0x0C: + DPRINTF((" autonomous power state transition\r\n")); + break; + default: + WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__, + command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF)); + + /* TODO: search for the command ID and abort it */ + + compl->cdw0 = 1; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +#ifdef __FreeBSD__ +static int +nvme_opc_async_event_req(struct pci_nvme_softc* sc, + struct nvme_command* command, struct nvme_completion* compl) +{ + DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11)); + + /* + * TODO: raise events when they happen based on the Set Features cmd. + * These events happen async, so only set completion successful if + * there is an event reflective of the request to get event. + */ + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); + return (0); +} +#else +/* This is kept behind an ifdef while it's unused to appease the compiler. 
*/ +#endif /* __FreeBSD__ */ + +static void +pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) +{ + struct nvme_completion compl; + struct nvme_command *cmd; + struct nvme_submission_queue *sq; + struct nvme_completion_queue *cq; + int do_intr = 0; + uint16_t sqhead; + + DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value)); + + sq = &sc->submit_queues[0]; + + sqhead = atomic_load_acq_short(&sq->head); + + if (atomic_testandset_int(&sq->busy, 1)) { + DPRINTF(("%s SQ busy, head %u, tail %u\r\n", + __func__, sqhead, sq->tail)); + return; + } + + DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail)); + + while (sqhead != atomic_load_acq_short(&sq->tail)) { + cmd = &(sq->qbase)[sqhead]; + compl.status = 0; + + switch (cmd->opc) { + case NVME_OPC_DELETE_IO_SQ: + DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__)); + do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl); + break; + case NVME_OPC_CREATE_IO_SQ: + DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__)); + do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl); + break; + case NVME_OPC_DELETE_IO_CQ: + DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__)); + do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl); + break; + case NVME_OPC_CREATE_IO_CQ: + DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__)); + do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl); + break; + case NVME_OPC_GET_LOG_PAGE: + DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__)); + do_intr |= nvme_opc_get_log_page(sc, cmd, &compl); + break; + case NVME_OPC_IDENTIFY: + DPRINTF(("%s command IDENTIFY\r\n", __func__)); + do_intr |= nvme_opc_identify(sc, cmd, &compl); + break; + case NVME_OPC_ABORT: + DPRINTF(("%s command ABORT\r\n", __func__)); + do_intr |= nvme_opc_abort(sc, cmd, &compl); + break; + case NVME_OPC_SET_FEATURES: + DPRINTF(("%s command SET_FEATURES\r\n", __func__)); + do_intr |= nvme_opc_set_features(sc, cmd, &compl); + break; + case NVME_OPC_GET_FEATURES: + DPRINTF(("%s command GET_FEATURES\r\n", __func__)); + do_intr |= nvme_opc_get_features(sc, cmd, &compl); + break; + case NVME_OPC_ASYNC_EVENT_REQUEST: + DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__)); + /* XXX dont care, unhandled for now + do_intr |= nvme_opc_async_event_req(sc, cmd, &compl); + */ + break; + default: + WPRINTF(("0x%x command is not implemented\r\n", + cmd->opc)); + } + + /* for now skip async event generation */ + if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) { + struct nvme_completion *cp; + int phase; + + cq = &sc->compl_queues[0]; + + cp = &(cq->qbase)[cq->tail]; + cp->cdw0 = compl.cdw0; + cp->sqid = 0; + cp->sqhd = sqhead; + cp->cid = cmd->cid; + + phase = NVME_STATUS_GET_P(cp->status); + cp->status = compl.status; + pci_nvme_toggle_phase(&cp->status, phase); + + cq->tail = (cq->tail + 1) % cq->size; + } + sqhead = (sqhead + 1) % sq->size; + } + + DPRINTF(("setting sqhead %u\r\n", sqhead)); + atomic_store_short(&sq->head, sqhead); + atomic_store_int(&sq->busy, 0); + + if (do_intr) + pci_generate_msix(sc->nsc_pi, 0); + +} + +static int +pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, + uint64_t gpaddr, size_t size, int do_write, uint64_t lba) +{ + int iovidx; + + if (req != NULL) { + /* concatenate contig block-iovs to minimize number of iovs */ + if ((req->prev_gpaddr + req->prev_size) == gpaddr) { + iovidx = req->io_req.br_iovcnt - 1; + + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + req->prev_gpaddr, size); + + req->prev_size += size; + req->io_req.br_resid += size; + + 
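+			/*
+			 * e.g. two adjacent 4 KiB PRP entries (guest-physical
+			 * 0x1000, then 0x2000) collapse into a single 8 KiB
+			 * iov here instead of consuming another of the
+			 * NVME_MAX_BLOCKIOVS slots.
+			 */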
req->io_req.br_iov[iovidx].iov_len = req->prev_size; + } else { + pthread_mutex_lock(&req->mtx); + + iovidx = req->io_req.br_iovcnt; + if (iovidx == NVME_MAX_BLOCKIOVS) { + int err = 0; + + DPRINTF(("large I/O, doing partial req\r\n")); + + iovidx = 0; + req->io_req.br_iovcnt = 0; + + req->io_req.br_callback = pci_nvme_io_partial; + + if (!do_write) + err = blockif_read(sc->nvstore.ctx, + &req->io_req); + else + err = blockif_write(sc->nvstore.ctx, + &req->io_req); + + /* wait until req completes before cont */ + if (err == 0) + pthread_cond_wait(&req->cv, &req->mtx); + } + if (iovidx == 0) { + req->io_req.br_offset = lba; + req->io_req.br_resid = 0; + req->io_req.br_param = req; + } + + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + gpaddr, size); + + req->io_req.br_iov[iovidx].iov_len = size; + + req->prev_gpaddr = gpaddr; + req->prev_size = size; + req->io_req.br_resid += size; + + req->io_req.br_iovcnt++; + + pthread_mutex_unlock(&req->mtx); + } + } else { + /* RAM buffer: read/write directly */ + void *p = sc->nvstore.ctx; + void *gptr; + + if ((lba + size) > sc->nvstore.size) { + WPRINTF(("%s write would overflow RAM\r\n", __func__)); + return (-1); + } + + p = (void *)((uintptr_t)p + (uintptr_t)lba); + gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size); + if (do_write) + memcpy(p, gptr, size); + else + memcpy(gptr, p, size); + } + return (0); +} + +static void +pci_nvme_set_completion(struct pci_nvme_softc *sc, + struct nvme_submission_queue *sq, int sqid, uint16_t cid, + uint32_t cdw0, uint16_t status, int ignore_busy) +{ + struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid]; + struct nvme_completion *compl; + int do_intr = 0; + int phase; + + DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n", + __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status), + NVME_STATUS_GET_SC(status))); + + pthread_mutex_lock(&cq->mtx); + + assert(cq->qbase != NULL); + + compl = &cq->qbase[cq->tail]; + + compl->sqhd = atomic_load_acq_short(&sq->head); + compl->sqid = sqid; + compl->cid = cid; + + // toggle phase + phase = NVME_STATUS_GET_P(compl->status); + compl->status = status; + pci_nvme_toggle_phase(&compl->status, phase); + + cq->tail = (cq->tail + 1) % cq->size; + + if (cq->intr_en & NVME_CQ_INTEN) + do_intr = 1; + + pthread_mutex_unlock(&cq->mtx); + + if (ignore_busy || !atomic_load_acq_int(&sq->busy)) + if (do_intr) + pci_generate_msix(sc->nsc_pi, cq->intr_vec); +} + +static void +pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req) +{ + req->sc = NULL; + req->nvme_sq = NULL; + req->sqid = 0; + + pthread_mutex_lock(&sc->mtx); + + req->next = sc->ioreqs_free; + sc->ioreqs_free = req; + sc->pending_ios--; + + /* when no more IO pending, can set to ready if device reset/enabled */ + if (sc->pending_ios == 0 && + NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts))) + sc->regs.csts |= NVME_CSTS_RDY; + + pthread_mutex_unlock(&sc->mtx); + + sem_post(&sc->iosemlock); +} + +static struct pci_nvme_ioreq * +pci_nvme_get_ioreq(struct pci_nvme_softc *sc) +{ + struct pci_nvme_ioreq *req = NULL;; + + sem_wait(&sc->iosemlock); + pthread_mutex_lock(&sc->mtx); + + req = sc->ioreqs_free; + assert(req != NULL); + + sc->ioreqs_free = req->next; + + req->next = NULL; + req->sc = sc; + + sc->pending_ios++; + + pthread_mutex_unlock(&sc->mtx); + + req->io_req.br_iovcnt = 0; + req->io_req.br_offset = 0; + req->io_req.br_resid = 0; + req->io_req.br_param = req; + req->prev_gpaddr = 0; + req->prev_size = 0; + + 
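+	/*
+	 * Together with pci_nvme_release_ioreq() above, this implements a
+	 * bounded object pool: iosemlock counts free slots, so sem_wait()
+	 * blocks submitters once all ioslots are in flight, while sc->mtx
+	 * protects the singly-linked free list.  A minimal standalone
+	 * sketch of the same pattern (hypothetical obj/obj_pool names,
+	 * not taken from this emulation):
+	 *
+	 *	struct obj { struct obj *next; };
+	 *	struct obj_pool {
+	 *		pthread_mutex_t mtx;	(protects free_list)
+	 *		sem_t sem;		(counts free objects)
+	 *		struct obj *free_list;
+	 *	};
+	 *
+	 *	struct obj *
+	 *	obj_get(struct obj_pool *p)
+	 *	{
+	 *		struct obj *o;
+	 *
+	 *		sem_wait(&p->sem);	(blocks while pool is empty)
+	 *		pthread_mutex_lock(&p->mtx);
+	 *		o = p->free_list;	(sem guarantees o != NULL)
+	 *		p->free_list = o->next;
+	 *		pthread_mutex_unlock(&p->mtx);
+	 *		return (o);
+	 *	}
+	 *
+	 *	void
+	 *	obj_put(struct obj_pool *p, struct obj *o)
+	 *	{
+	 *		pthread_mutex_lock(&p->mtx);
+	 *		o->next = p->free_list;
+	 *		p->free_list = o;
+	 *		pthread_mutex_unlock(&p->mtx);
+	 *		sem_post(&p->sem);	(wakes one blocked obj_get)
+	 *	}
+	 */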
return req; +} + +static void +pci_nvme_io_done(struct blockif_req *br, int err) +{ + struct pci_nvme_ioreq *req = br->br_param; + struct nvme_submission_queue *sq = req->nvme_sq; + uint16_t code, status = 0; + + DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err))); + + /* TODO return correct error */ + code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; + pci_nvme_status_genc(&status, code); + + pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0); + pci_nvme_release_ioreq(req->sc, req); +} + +static void +pci_nvme_io_partial(struct blockif_req *br, int err) +{ + struct pci_nvme_ioreq *req = br->br_param; + + DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err))); + + pthread_cond_signal(&req->cv); +} + + +static void +pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) +{ + struct nvme_submission_queue *sq; + uint16_t status = 0; + uint16_t sqhead; + int err; + + /* handle all submissions up to sq->tail index */ + sq = &sc->submit_queues[idx]; + + if (atomic_testandset_int(&sq->busy, 1)) { + DPRINTF(("%s sqid %u busy\r\n", __func__, idx)); + return; + } + + sqhead = atomic_load_acq_short(&sq->head); + + DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n", + idx, sqhead, sq->tail, sq->qbase)); + + while (sqhead != atomic_load_acq_short(&sq->tail)) { + struct nvme_command *cmd; + struct pci_nvme_ioreq *req = NULL; + uint64_t lba; + uint64_t nblocks, bytes, size, cpsz; + + /* TODO: support scatter gather list handling */ + + cmd = &sq->qbase[sqhead]; + sqhead = (sqhead + 1) % sq->size; + + lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; + + if (cmd->opc == NVME_OPC_FLUSH) { + pci_nvme_status_genc(&status, NVME_SC_SUCCESS); + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } else if (cmd->opc == 0x08) { + /* TODO: write zeroes */ + WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n", + __func__, lba, cmd->cdw12 & 0xFFFF)); + pci_nvme_status_genc(&status, NVME_SC_SUCCESS); + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } + + nblocks = (cmd->cdw12 & 0xFFFF) + 1; + + bytes = nblocks * sc->nvstore.sectsz; + + if (sc->nvstore.type == NVME_STOR_BLOCKIF) { + req = pci_nvme_get_ioreq(sc); + req->nvme_sq = sq; + req->sqid = idx; + } + + /* + * If data starts mid-page and flows into the next page, then + * increase page count + */ + + DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu " + "(%lu-bytes)\r\n", + sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size, + cmd->opc == NVME_OPC_WRITE ? 
+ "WRITE" : "READ", + lba, nblocks, bytes)); + + cmd->prp1 &= ~(0x03UL); + cmd->prp2 &= ~(0x03UL); + + DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2)); + + size = bytes; + lba *= sc->nvstore.sectsz; + + cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); + + if (cpsz > bytes) + cpsz = bytes; + + if (req != NULL) { + req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | + cmd->cdw10; + req->opc = cmd->opc; + req->cid = cmd->cid; + req->nsid = cmd->nsid; + } + + err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, + cmd->opc == NVME_OPC_WRITE, lba); + lba += cpsz; + size -= cpsz; + + if (size == 0) + goto iodone; + + if (size <= PAGE_SIZE) { + /* prp2 is second (and final) page in transfer */ + + err = pci_nvme_append_iov_req(sc, req, cmd->prp2, + size, + cmd->opc == NVME_OPC_WRITE, + lba); + } else { + uint64_t *prp_list; + int i; + + /* prp2 is pointer to a physical region page list */ + prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, + cmd->prp2, PAGE_SIZE); + + i = 0; + while (size != 0) { + cpsz = MIN(size, PAGE_SIZE); + + /* + * Move to linked physical region page list + * in last item. + */ + if (i == (NVME_PRP2_ITEMS-1) && + size > PAGE_SIZE) { + assert((prp_list[i] & (PAGE_SIZE-1)) == 0); + prp_list = paddr_guest2host( + sc->nsc_pi->pi_vmctx, + prp_list[i], PAGE_SIZE); + i = 0; + } + if (prp_list[i] == 0) { + WPRINTF(("PRP2[%d] = 0 !!!\r\n", i)); + err = 1; + break; + } + + err = pci_nvme_append_iov_req(sc, req, + prp_list[i], cpsz, + cmd->opc == NVME_OPC_WRITE, lba); + if (err) + break; + + lba += cpsz; + size -= cpsz; + i++; + } + } + +iodone: + if (sc->nvstore.type == NVME_STOR_RAM) { + uint16_t code, status = 0; + + code = err ? NVME_SC_LBA_OUT_OF_RANGE : + NVME_SC_SUCCESS; + pci_nvme_status_genc(&status, code); + + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } + + + if (err) + goto do_error; + + req->io_req.br_callback = pci_nvme_io_done; + + err = 0; + switch (cmd->opc) { + case NVME_OPC_READ: + err = blockif_read(sc->nvstore.ctx, &req->io_req); + break; + case NVME_OPC_WRITE: + err = blockif_write(sc->nvstore.ctx, &req->io_req); + break; + default: + WPRINTF(("%s unhandled io command 0x%x\r\n", + __func__, cmd->opc)); + err = 1; + } + +do_error: + if (err) { + uint16_t status = 0; + + pci_nvme_status_genc(&status, + NVME_SC_DATA_TRANSFER_ERROR); + + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + pci_nvme_release_ioreq(sc, req); + } + } + + atomic_store_short(&sq->head, sqhead); + atomic_store_int(&sq->busy, 0); +} + +static void +pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, + uint64_t idx, int is_sq, uint64_t value) +{ + DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n", + idx, is_sq ? "SQ" : "CQ", value & 0xFFFF)); + + if (is_sq) { + atomic_store_short(&sc->submit_queues[idx].tail, + (uint16_t)value); + + if (idx == 0) { + pci_nvme_handle_admin_cmd(sc, value); + } else { + /* submission queue; handle new entries in SQ */ + if (idx > sc->num_squeues) { + WPRINTF(("%s SQ index %lu overflow from " + "guest (max %u)\r\n", + __func__, idx, sc->num_squeues)); + return; + } + pci_nvme_handle_io_cmd(sc, (uint16_t)idx); + } + } else { + if (idx > sc->num_cqueues) { + WPRINTF(("%s queue index %lu overflow from " + "guest (max %u)\r\n", + __func__, idx, sc->num_cqueues)); + return; + } + + sc->compl_queues[idx].head = (uint16_t)value; + } +} + +static void +pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) +{ + const char *s = iswrite ? 
"WRITE" : "READ"; + + switch (offset) { + case NVME_CR_CAP_LOW: + DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s)); + break; + case NVME_CR_CAP_HI: + DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s)); + break; + case NVME_CR_VS: + DPRINTF(("%s %s NVME_CR_VS\r\n", func, s)); + break; + case NVME_CR_INTMS: + DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s)); + break; + case NVME_CR_INTMC: + DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s)); + break; + case NVME_CR_CC: + DPRINTF(("%s %s NVME_CR_CC\r\n", func, s)); + break; + case NVME_CR_CSTS: + DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s)); + break; + case NVME_CR_NSSR: + DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s)); + break; + case NVME_CR_AQA: + DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s)); + break; + case NVME_CR_ASQ_LOW: + DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s)); + break; + case NVME_CR_ASQ_HI: + DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s)); + break; + case NVME_CR_ACQ_LOW: + DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s)); + break; + case NVME_CR_ACQ_HI: + DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s)); + break; + default: + DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset)); + } + +} + +static void +pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, + uint64_t offset, int size, uint64_t value) +{ + uint32_t ccreg; + + if (offset >= NVME_DOORBELL_OFFSET) { + uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; + uint64_t idx = belloffset / 8; /* door bell size = 2*int */ + int is_sq = (belloffset % 8) < 4; + + if (belloffset > ((sc->max_queues+1) * 8 - 4)) { + WPRINTF(("guest attempted an overflow write offset " + "0x%lx, val 0x%lx in %s", + offset, value, __func__)); + return; + } + + pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); + return; + } + + DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n", + offset, size, value)); + + if (size != 4) { + WPRINTF(("guest wrote invalid size %d (offset 0x%lx, " + "val 0x%lx) to bar0 in %s", + size, offset, value, __func__)); + /* TODO: shutdown device */ + return; + } + + pci_nvme_bar0_reg_dumps(__func__, offset, 1); + + pthread_mutex_lock(&sc->mtx); + + switch (offset) { + case NVME_CR_CAP_LOW: + case NVME_CR_CAP_HI: + /* readonly */ + break; + case NVME_CR_VS: + /* readonly */ + break; + case NVME_CR_INTMS: + /* MSI-X, so ignore */ + break; + case NVME_CR_INTMC: + /* MSI-X, so ignore */ + break; + case NVME_CR_CC: + ccreg = (uint32_t)value; + + DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u " + "iocqes %u\r\n", + __func__, + NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), + NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), + NVME_CC_GET_IOCQES(ccreg))); + + if (NVME_CC_GET_SHN(ccreg)) { + /* perform shutdown - flush out data to backend */ + sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << + NVME_CSTS_REG_SHST_SHIFT); + sc->regs.csts |= NVME_SHST_COMPLETE << + NVME_CSTS_REG_SHST_SHIFT; + } + if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { + if (NVME_CC_GET_EN(ccreg) == 0) + /* transition 1-> causes controller reset */ + pci_nvme_reset_locked(sc); + else + pci_nvme_init_controller(ctx, sc); + } + + /* Insert the iocqes, iosqes and en bits from the write */ + sc->regs.cc &= ~NVME_CC_WRITE_MASK; + sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; + if (NVME_CC_GET_EN(ccreg) == 0) { + /* Insert the ams, mps and css bit fields */ + sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; + sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; + sc->regs.csts &= ~NVME_CSTS_RDY; + } else if (sc->pending_ios == 0) { + sc->regs.csts |= NVME_CSTS_RDY; + } + break; + case NVME_CR_CSTS: 
+ break; + case NVME_CR_NSSR: + /* ignore writes; don't support subsystem reset */ + break; + case NVME_CR_AQA: + sc->regs.aqa = (uint32_t)value; + break; + case NVME_CR_ASQ_LOW: + sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | + (0xFFFFF000 & value); + break; + case NVME_CR_ASQ_HI: + sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | + (value << 32); + break; + case NVME_CR_ACQ_LOW: + sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | + (0xFFFFF000 & value); + break; + case NVME_CR_ACQ_HI: + sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | + (value << 32); + break; + default: + DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n", + __func__, offset, value, size)); + } + pthread_mutex_unlock(&sc->mtx); +} + +static void +pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_nvme_softc* sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, " + " value 0x%lx\r\n", baridx, offset, size, value)); + + pci_emul_msix_twrite(pi, offset, size, value); + return; + } + + switch (baridx) { + case 0: + pci_nvme_write_bar_0(ctx, sc, offset, size, value); + break; + + default: + DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n", + __func__, baridx, value)); + } +} + +static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, + uint64_t offset, int size) +{ + uint64_t value; + + pci_nvme_bar0_reg_dumps(__func__, offset, 0); + + if (offset < NVME_DOORBELL_OFFSET) { + void *p = &(sc->regs); + pthread_mutex_lock(&sc->mtx); + memcpy(&value, (void *)((uintptr_t)p + offset), size); + pthread_mutex_unlock(&sc->mtx); + } else { + value = 0; + WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset)); + } + + switch (size) { + case 1: + value &= 0xFF; + break; + case 2: + value &= 0xFFFF; + break; + case 4: + value &= 0xFFFFFFFF; + break; + } + + DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x\r\n", + offset, size, (uint32_t)value)); + + return (value); +} + + + +static uint64_t +pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_nvme_softc* sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n", + baridx, offset, size)); + + return pci_emul_msix_tread(pi, offset, size); + } + + switch (baridx) { + case 0: + return pci_nvme_read_bar_0(sc, offset, size); + + default: + DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset)); + } + + return (0); +} + + +static int +pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) +{ + char bident[sizeof("XX:X:X")]; + char *uopt, *xopts, *config; + uint32_t sectsz; + int optidx; + + sc->max_queues = NVME_QUEUES; + sc->max_qentries = NVME_MAX_QENTRIES; + sc->ioslots = NVME_IOSLOTS; + sc->num_squeues = sc->max_queues; + sc->num_cqueues = sc->max_queues; + sectsz = 0; + + uopt = strdup(opts); + optidx = 0; + snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), + "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); + for (xopts = strtok(uopt, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + + if ((config = strchr(xopts, '=')) != NULL) + *config++ = '\0'; + + if (!strcmp("maxq", xopts)) { + sc->max_queues = atoi(config); + } else if (!strcmp("qsz", xopts)) { + sc->max_qentries = atoi(config); + } else if (!strcmp("ioslots", xopts)) { + sc->ioslots = atoi(config); + } 
else if (!strcmp("sectsz", xopts)) { + sectsz = atoi(config); + } else if (!strcmp("ser", xopts)) { + /* + * This field indicates the Product Serial Number in + * 7-bit ASCII, unused bytes should be space characters. + * Ref: NVMe v1.3c. + */ + cpywithpad((char *)sc->ctrldata.sn, + sizeof(sc->ctrldata.sn), config, ' '); + } else if (!strcmp("ram", xopts)) { + uint64_t sz = strtoull(&xopts[4], NULL, 10); + + sc->nvstore.type = NVME_STOR_RAM; + sc->nvstore.size = sz * 1024 * 1024; + sc->nvstore.ctx = calloc(1, sc->nvstore.size); + sc->nvstore.sectsz = 4096; + sc->nvstore.sectsz_bits = 12; + if (sc->nvstore.ctx == NULL) { + perror("Unable to allocate RAM"); + free(uopt); + return (-1); + } + } else if (optidx == 0) { + snprintf(bident, sizeof(bident), "%d:%d", + sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); + sc->nvstore.ctx = blockif_open(xopts, bident); + if (sc->nvstore.ctx == NULL) { + perror("Could not open backing file"); + free(uopt); + return (-1); + } + sc->nvstore.type = NVME_STOR_BLOCKIF; + sc->nvstore.size = blockif_size(sc->nvstore.ctx); + } else { + fprintf(stderr, "Invalid option %s\n", xopts); + free(uopt); + return (-1); + } + + optidx++; + } + free(uopt); + + if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { + fprintf(stderr, "backing store not specified\n"); + return (-1); + } + if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) + sc->nvstore.sectsz = sectsz; + else if (sc->nvstore.type != NVME_STOR_RAM) + sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); + for (sc->nvstore.sectsz_bits = 9; + (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; + sc->nvstore.sectsz_bits++); + + if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) + sc->max_queues = NVME_QUEUES; + + if (sc->max_qentries <= 0) { + fprintf(stderr, "Invalid qsz option\n"); + return (-1); + } + if (sc->ioslots <= 0) { + fprintf(stderr, "Invalid ioslots option\n"); + return (-1); + } + + return (0); +} + +static int +pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_nvme_softc *sc; + uint32_t pci_membar_sz; + int error; + + error = 0; + + sc = calloc(1, sizeof(struct pci_nvme_softc)); + pi->pi_arg = sc; + sc->nsc_pi = pi; + + error = pci_nvme_parse_opts(sc, opts); + if (error < 0) + goto done; + else + error = 0; + + sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); + for (int i = 0; i < sc->ioslots; i++) { + if (i < (sc->ioslots-1)) + sc->ioreqs[i].next = &sc->ioreqs[i+1]; + pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); + pthread_cond_init(&sc->ioreqs[i].cv, NULL); + } + sc->ioreqs_free = sc->ioreqs; + sc->intr_coales_aggr_thresh = 1; + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); + pci_set_cfgdata8(pi, PCIR_PROGIF, + PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); + + /* + * Allocate size of NVMe registers + doorbell space for all queues. + * + * The specification requires a minimum memory I/O window size of 16K. + * The Windows driver will refuse to start a device with a smaller + * window. 
+ */ + pci_membar_sz = sizeof(struct nvme_registers) + + 2 * sizeof(uint32_t) * (sc->max_queues + 1); + pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); + + DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz)); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); + if (error) { + WPRINTF(("%s pci alloc mem bar failed\r\n", __func__)); + goto done; + } + + error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); + if (error) { + WPRINTF(("%s pci add msixcap failed\r\n", __func__)); + goto done; + } + + pthread_mutex_init(&sc->mtx, NULL); + sem_init(&sc->iosemlock, 0, sc->ioslots); + + pci_nvme_reset(sc); + pci_nvme_init_ctrldata(sc); + pci_nvme_init_nsdata(sc); + pci_nvme_init_logpages(sc); + + pci_lintr_request(pi); + +done: + return (error); +} + + +struct pci_devemu pci_de_nvme = { + .pe_emu = "nvme", + .pe_init = pci_nvme_init, + .pe_barwrite = pci_nvme_write, + .pe_barread = pci_nvme_read +}; +PCI_EMUL_SET(pci_de_nvme); diff --git a/usr/src/cmd/bhyve/pci_passthru.c b/usr/src/cmd/bhyve/pci_passthru.c new file mode 100644 index 0000000000..d2c69e795c --- /dev/null +++ b/usr/src/cmd/bhyve/pci_passthru.c @@ -0,0 +1,937 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include + +#include +#include + +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "pci_emul.h" +#include "mem.h" + +#ifndef _PATH_DEVPCI +#define _PATH_DEVPCI "/dev/pci" +#endif + +#ifndef _PATH_DEVIO +#define _PATH_DEVIO "/dev/io" +#endif + +#ifndef _PATH_MEM +#define _PATH_MEM "/dev/mem" +#endif + +#define LEGACY_SUPPORT 1 + +#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) +#define MSIX_CAPLEN 12 + +static int pcifd = -1; +static int iofd = -1; +static int memfd = -1; + +struct passthru_softc { + struct pci_devinst *psc_pi; + struct pcibar psc_bar[PCI_BARMAX + 1]; + struct { + int capoff; + int msgctrl; + int emulated; + } psc_msi; + struct { + int capoff; + } psc_msix; + struct pcisel psc_sel; +}; + +static int +msi_caplen(int msgctrl) +{ + int len; + + len = 10; /* minimum length of msi capability */ + + if (msgctrl & PCIM_MSICTRL_64BIT) + len += 4; + +#if 0 + /* + * Ignore the 'mask' and 'pending' bits in the MSI capability. + * We'll let the guest manipulate them directly. + */ + if (msgctrl & PCIM_MSICTRL_VECTOR) + len += 10; +#endif + + return (len); +} + +static uint32_t +read_config(const struct pcisel *sel, long reg, int width) +{ + struct pci_io pi; + + bzero(&pi, sizeof(pi)); + pi.pi_sel = *sel; + pi.pi_reg = reg; + pi.pi_width = width; + + if (ioctl(pcifd, PCIOCREAD, &pi) < 0) + return (0); /* XXX */ + else + return (pi.pi_data); +} + +static void +write_config(const struct pcisel *sel, long reg, int width, uint32_t data) +{ + struct pci_io pi; + + bzero(&pi, sizeof(pi)); + pi.pi_sel = *sel; + pi.pi_reg = reg; + pi.pi_width = width; + pi.pi_data = data; + + (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ +} + +#ifdef LEGACY_SUPPORT +static int +passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) +{ + int capoff, i; + struct msicap msicap; + u_char *capdata; + + pci_populate_msicap(&msicap, msgnum, nextptr); + + /* + * XXX + * Copy the msi capability structure in the last 16 bytes of the + * config space. This is wrong because it could shadow something + * useful to the device. + */ + capoff = 256 - roundup(sizeof(msicap), 4); + capdata = (u_char *)&msicap; + for (i = 0; i < sizeof(msicap); i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + return (capoff); +} +#endif /* LEGACY_SUPPORT */ + +static int +cfginitmsi(struct passthru_softc *sc) +{ + int i, ptr, capptr, cap, sts, caplen, table_size; + uint32_t u32; + struct pcisel sel; + struct pci_devinst *pi; + struct msixcap msixcap; + uint32_t *msixcap_ptr; + + pi = sc->psc_pi; + sel = sc->psc_sel; + + /* + * Parse the capabilities and cache the location of the MSI + * and MSI-X capabilities. 
+ */ + sts = read_config(&sel, PCIR_STATUS, 2); + if (sts & PCIM_STATUS_CAPPRESENT) { + ptr = read_config(&sel, PCIR_CAP_PTR, 1); + while (ptr != 0 && ptr != 0xff) { + cap = read_config(&sel, ptr + PCICAP_ID, 1); + if (cap == PCIY_MSI) { + /* + * Copy the MSI capability into the config + * space of the emulated pci device + */ + sc->psc_msi.capoff = ptr; + sc->psc_msi.msgctrl = read_config(&sel, + ptr + 2, 2); + sc->psc_msi.emulated = 0; + caplen = msi_caplen(sc->psc_msi.msgctrl); + capptr = ptr; + while (caplen > 0) { + u32 = read_config(&sel, capptr, 4); + pci_set_cfgdata32(pi, capptr, u32); + caplen -= 4; + capptr += 4; + } + } else if (cap == PCIY_MSIX) { + /* + * Copy the MSI-X capability + */ + sc->psc_msix.capoff = ptr; + caplen = 12; + msixcap_ptr = (uint32_t*) &msixcap; + capptr = ptr; + while (caplen > 0) { + u32 = read_config(&sel, capptr, 4); + *msixcap_ptr = u32; + pci_set_cfgdata32(pi, capptr, u32); + caplen -= 4; + capptr += 4; + msixcap_ptr++; + } + } + ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); + } + } + + if (sc->psc_msix.capoff != 0) { + pi->pi_msix.pba_bar = + msixcap.pba_info & PCIM_MSIX_BIR_MASK; + pi->pi_msix.pba_offset = + msixcap.pba_info & ~PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_bar = + msixcap.table_info & PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_offset = + msixcap.table_info & ~PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); + pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count); + + /* Allocate the emulated MSI-X table array */ + table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; + pi->pi_msix.table = calloc(1, table_size); + + /* Mask all table entries */ + for (i = 0; i < pi->pi_msix.table_count; i++) { + pi->pi_msix.table[i].vector_control |= + PCIM_MSIX_VCTRL_MASK; + } + } + +#ifdef LEGACY_SUPPORT + /* + * If the passthrough device does not support MSI then craft a + * MSI capability for it. We link the new MSI capability at the + * head of the list of capabilities. 
+ */ + if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { + int origptr, msiptr; + origptr = read_config(&sel, PCIR_CAP_PTR, 1); + msiptr = passthru_add_msicap(pi, 1, origptr); + sc->psc_msi.capoff = msiptr; + sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); + sc->psc_msi.emulated = 1; + pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); + } +#endif + + /* Make sure one of the capabilities is present */ + if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) + return (-1); + else + return (0); +} + +static uint64_t +msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) +{ + struct pci_devinst *pi; + struct msix_table_entry *entry; + uint8_t *src8; + uint16_t *src16; + uint32_t *src32; + uint64_t *src64; + uint64_t data; + size_t entry_offset; + int index; + + pi = sc->psc_pi; + if (offset >= pi->pi_msix.pba_offset && + offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + switch(size) { + case 1: + src8 = (uint8_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src8; + break; + case 2: + src16 = (uint16_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src16; + break; + case 4: + src32 = (uint32_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src32; + break; + case 8: + src64 = (uint64_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src64; + break; + default: + return (-1); + } + return (data); + } + + if (offset < pi->pi_msix.table_offset) + return (-1); + + offset -= pi->pi_msix.table_offset; + index = offset / MSIX_TABLE_ENTRY_SIZE; + if (index >= pi->pi_msix.table_count) + return (-1); + + entry = &pi->pi_msix.table[index]; + entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + switch(size) { + case 1: + src8 = (uint8_t *)((void *)entry + entry_offset); + data = *src8; + break; + case 2: + src16 = (uint16_t *)((void *)entry + entry_offset); + data = *src16; + break; + case 4: + src32 = (uint32_t *)((void *)entry + entry_offset); + data = *src32; + break; + case 8: + src64 = (uint64_t *)((void *)entry + entry_offset); + data = *src64; + break; + default: + return (-1); + } + + return (data); +} + +static void +msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, + uint64_t offset, int size, uint64_t data) +{ + struct pci_devinst *pi; + struct msix_table_entry *entry; + uint8_t *dest8; + uint16_t *dest16; + uint32_t *dest32; + uint64_t *dest64; + size_t entry_offset; + uint32_t vector_control; + int index; + + pi = sc->psc_pi; + if (offset >= pi->pi_msix.pba_offset && + offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + switch(size) { + case 1: + dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest8 = data; + break; + case 2: + dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest16 = data; + break; + case 4: + dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest32 = data; + break; + case 8: + dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest64 = data; + break; + default: + break; + } + return; + } + + if (offset < pi->pi_msix.table_offset) + return; + + offset -= pi->pi_msix.table_offset; + index = offset / MSIX_TABLE_ENTRY_SIZE; + if (index >= pi->pi_msix.table_count) + return; + + entry = &pi->pi_msix.table[index]; + entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* Only 4 byte naturally-aligned writes are supported */ + 
assert(size == 4); + assert(entry_offset % 4 == 0); + + vector_control = entry->vector_control; + dest32 = (uint32_t *)((void *)entry + entry_offset); + *dest32 = data; + /* If MSI-X hasn't been enabled, do nothing */ + if (pi->pi_msix.enabled) { + /* If the entry is masked, don't set it up */ + if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || + (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + (void)vm_setup_pptdev_msix(ctx, vcpu, + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, index, entry->addr, + entry->msg_data, entry->vector_control); + } + } +} + +static int +init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) +{ + int b, s, f; + int error, idx; + size_t len, remaining; + uint32_t table_size, table_offset; + uint32_t pba_size, pba_offset; + vm_paddr_t start; + struct pci_devinst *pi = sc->psc_pi; + + assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); + + b = sc->psc_sel.pc_bus; + s = sc->psc_sel.pc_dev; + f = sc->psc_sel.pc_func; + + /* + * If the MSI-X table BAR maps memory intended for + * other uses, it is at least assured that the table + * either resides in its own page within the region, + * or it resides in a page shared with only the PBA. + */ + table_offset = rounddown2(pi->pi_msix.table_offset, 4096); + + table_size = pi->pi_msix.table_offset - table_offset; + table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; + table_size = roundup2(table_size, 4096); + + idx = pi->pi_msix.table_bar; + start = pi->pi_bar[idx].addr; + remaining = pi->pi_bar[idx].size; + + if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) { + pba_offset = pi->pi_msix.pba_offset; + pba_size = pi->pi_msix.pba_size; + if (pba_offset >= table_offset + table_size || + table_offset >= pba_offset + pba_size) { + /* + * If the PBA does not share a page with the MSI-x + * tables, no PBA emulation is required. + */ + pi->pi_msix.pba_page = NULL; + pi->pi_msix.pba_page_offset = 0; + } else { + /* + * The PBA overlaps with either the first or last + * page of the MSI-X table region. Map the + * appropriate page. 
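+			 *
+			 * Hypothetical example: a 4KB BAR holding a small
+			 * table at offset 0 and the PBA at offset 0x800
+			 * gives table_offset 0 and table_size 4096, so the
+			 * computed PBA page offset is 0 and the single
+			 * mapped page covers both structures.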
+ */ + if (pba_offset <= table_offset) + pi->pi_msix.pba_page_offset = table_offset; + else + pi->pi_msix.pba_page_offset = table_offset + + table_size - 4096; + pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ | + PROT_WRITE, MAP_SHARED, memfd, start + + pi->pi_msix.pba_page_offset); + if (pi->pi_msix.pba_page == MAP_FAILED) { + warn( + "Failed to map PBA page for MSI-X on %d/%d/%d", + b, s, f); + return (-1); + } + } + } + + /* Map everything before the MSI-X table */ + if (table_offset > 0) { + len = table_offset; + error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); + if (error) + return (error); + + base += len; + start += len; + remaining -= len; + } + + /* Skip the MSI-X table */ + base += table_size; + start += table_size; + remaining -= table_size; + + /* Map everything beyond the end of the MSI-X table */ + if (remaining > 0) { + len = remaining; + error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); + if (error) + return (error); + } + + return (0); +} + +static int +cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) +{ + int i, error; + struct pci_devinst *pi; + struct pci_bar_io bar; + enum pcibar_type bartype; + uint64_t base, size; + + pi = sc->psc_pi; + + /* + * Initialize BAR registers + */ + for (i = 0; i <= PCI_BARMAX; i++) { + bzero(&bar, sizeof(bar)); + bar.pbi_sel = sc->psc_sel; + bar.pbi_reg = PCIR_BAR(i); + + if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) + continue; + + if (PCI_BAR_IO(bar.pbi_base)) { + bartype = PCIBAR_IO; + base = bar.pbi_base & PCIM_BAR_IO_BASE; + } else { + switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) { + case PCIM_BAR_MEM_64: + bartype = PCIBAR_MEM64; + break; + default: + bartype = PCIBAR_MEM32; + break; + } + base = bar.pbi_base & PCIM_BAR_MEM_BASE; + } + size = bar.pbi_length; + + if (bartype != PCIBAR_IO) { + if (((base | size) & PAGE_MASK) != 0) { + warnx("passthru device %d/%d/%d BAR %d: " + "base %#lx or size %#lx not page aligned\n", + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, i, base, size); + return (-1); + } + } + + /* Cache information about the "real" BAR */ + sc->psc_bar[i].type = bartype; + sc->psc_bar[i].size = size; + sc->psc_bar[i].addr = base; + + /* Allocate the BAR in the guest I/O or MMIO space */ + error = pci_emul_alloc_pbar(pi, i, base, bartype, size); + if (error) + return (-1); + + /* The MSI-X table needs special handling */ + if (i == pci_msix_table_bar(pi)) { + error = init_msix_table(ctx, sc, base); + if (error) + return (-1); + } else if (bartype != PCIBAR_IO) { + /* Map the physical BAR in the guest MMIO space */ + error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, + pi->pi_bar[i].addr, pi->pi_bar[i].size, base); + if (error) + return (-1); + } + + /* + * 64-bit BAR takes up two slots so skip the next one. 
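+		 * A 64-bit memory BAR spans two consecutive 32-bit BAR
+		 * registers, low dword first, so BAR i + 1 carries the
+		 * upper address bits rather than an independent BAR.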
+ */ + if (bartype == PCIBAR_MEM64) { + i++; + assert(i <= PCI_BARMAX); + sc->psc_bar[i].type = PCIBAR_MEMHI64; + } + } + return (0); +} + +static int +cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) +{ + int error; + struct passthru_softc *sc; + + error = 1; + sc = pi->pi_arg; + + bzero(&sc->psc_sel, sizeof(struct pcisel)); + sc->psc_sel.pc_bus = bus; + sc->psc_sel.pc_dev = slot; + sc->psc_sel.pc_func = func; + + if (cfginitmsi(sc) != 0) { + warnx("failed to initialize MSI for PCI %d/%d/%d", + bus, slot, func); + goto done; + } + + if (cfginitbar(ctx, sc) != 0) { + warnx("failed to initialize BARs for PCI %d/%d/%d", + bus, slot, func); + goto done; + } + + error = 0; /* success */ +done: + return (error); +} + +static int +passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int bus, slot, func, error, memflags; + struct passthru_softc *sc; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t pci_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR }; + cap_ioctl_t io_ioctls[] = { IODEV_PIO }; +#endif + + sc = NULL; + error = 1; + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE); +#endif + + memflags = vm_get_memflags(ctx); + if (!(memflags & VM_MEM_F_WIRED)) { + warnx("passthru requires guest memory to be wired"); + goto done; + } + + if (pcifd < 0) { + pcifd = open(_PATH_DEVPCI, O_RDWR, 0); + if (pcifd < 0) { + warn("failed to open %s", _PATH_DEVPCI); + goto done; + } + } + +#ifndef WITHOUT_CAPSICUM + if (cap_rights_limit(pcifd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (cap_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + if (iofd < 0) { + iofd = open(_PATH_DEVIO, O_RDWR, 0); + if (iofd < 0) { + warn("failed to open %s", _PATH_DEVIO); + goto done; + } + } + +#ifndef WITHOUT_CAPSICUM + if (cap_rights_limit(iofd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (cap_ioctls_limit(iofd, io_ioctls, nitems(io_ioctls)) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + if (memfd < 0) { + memfd = open(_PATH_MEM, O_RDWR, 0); + if (memfd < 0) { + warn("failed to open %s", _PATH_MEM); + goto done; + } + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_clear(&rights, CAP_IOCTL); + cap_rights_set(&rights, CAP_MMAP_RW); + if (cap_rights_limit(memfd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + if (opts == NULL || + sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) { + warnx("invalid passthru options"); + goto done; + } + + if (vm_assign_pptdev(ctx, bus, slot, func) != 0) { + warnx("PCI device at %d/%d/%d is not using the ppt(4) driver", + bus, slot, func); + goto done; + } + + sc = calloc(1, sizeof(struct passthru_softc)); + + pi->pi_arg = sc; + sc->psc_pi = pi; + + /* initialize config space */ + if ((error = cfginit(ctx, pi, bus, slot, func)) != 0) + goto done; + + error = 0; /* success */ +done: + if (error) { + free(sc); + vm_unassign_pptdev(ctx, bus, slot, func); + } + return (error); +} + +static int +bar_access(int coff) +{ + if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) + return (1); + else + return (0); +} + +static int +msicap_access(struct passthru_softc *sc, int coff) +{ + int caplen; + + if (sc->psc_msi.capoff == 0) + return (0); + + caplen = msi_caplen(sc->psc_msi.msgctrl); + + if (coff >= 
sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen) + return (1); + else + return (0); +} + +static int +msixcap_access(struct passthru_softc *sc, int coff) +{ + if (sc->psc_msix.capoff == 0) + return (0); + + return (coff >= sc->psc_msix.capoff && + coff < sc->psc_msix.capoff + MSIX_CAPLEN); +} + +static int +passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t *rv) +{ + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs and MSI capability is emulated. + */ + if (bar_access(coff) || msicap_access(sc, coff)) + return (-1); + +#ifdef LEGACY_SUPPORT + /* + * Emulate PCIR_CAP_PTR if this device does not support MSI capability + * natively. + */ + if (sc->psc_msi.emulated) { + if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) + return (-1); + } +#endif + + /* Everything else just read from the device's config space */ + *rv = read_config(&sc->psc_sel, coff, bytes); + + return (0); +} + +static int +passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t val) +{ + int error, msix_table_entries, i; + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs are emulated + */ + if (bar_access(coff)) + return (-1); + + /* + * MSI capability is emulated + */ + if (msicap_access(sc, coff)) { + msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val); + + error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, + pi->pi_msi.addr, pi->pi_msi.msg_data, + pi->pi_msi.maxmsgnum); + if (error != 0) + err(1, "vm_setup_pptdev_msi"); + return (0); + } + + if (msixcap_access(sc, coff)) { + msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val); + if (pi->pi_msix.enabled) { + msix_table_entries = pi->pi_msix.table_count; + for (i = 0; i < msix_table_entries; i++) { + error = vm_setup_pptdev_msix(ctx, vcpu, + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, i, + pi->pi_msix.table[i].addr, + pi->pi_msix.table[i].msg_data, + pi->pi_msix.table[i].vector_control); + + if (error) + err(1, "vm_setup_pptdev_msix"); + } + } + return (0); + } + +#ifdef LEGACY_SUPPORT + /* + * If this device does not support MSI natively then we cannot let + * the guest disable legacy interrupts from the device. It is the + * legacy interrupt that is triggering the virtual MSI to the guest. 
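+	 *
+	 * Concretely, if the guest sets PCIM_CMD_INTxDIS (bit 10 of
+	 * the command register) it would mask the very interrupt that
+	 * backs the emulated MSI, so that bit is stripped from the
+	 * written value below.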
+ */ + if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { + if (coff == PCIR_COMMAND && bytes == 2) + val &= ~PCIM_CMD_INTxDIS; + } +#endif + + write_config(&sc->psc_sel, coff, bytes, val); + + return (0); +} + +static void +passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + struct passthru_softc *sc; + struct iodev_pio_req pio; + + sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi)) { + msix_table_write(ctx, vcpu, sc, offset, size, value); + } else { + assert(pi->pi_bar[baridx].type == PCIBAR_IO); + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_WRITE; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = value; + + (void)ioctl(iofd, IODEV_PIO, &pio); + } +} + +static uint64_t +passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct passthru_softc *sc; + struct iodev_pio_req pio; + uint64_t val; + + sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi)) { + val = msix_table_read(sc, offset, size); + } else { + assert(pi->pi_bar[baridx].type == PCIBAR_IO); + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_READ; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = 0; + + (void)ioctl(iofd, IODEV_PIO, &pio); + + val = pio.val; + } + + return (val); +} + +struct pci_devemu passthru = { + .pe_emu = "passthru", + .pe_init = passthru_init, + .pe_cfgwrite = passthru_cfgwrite, + .pe_cfgread = passthru_cfgread, + .pe_barwrite = passthru_write, + .pe_barread = passthru_read, +}; +PCI_EMUL_SET(passthru); diff --git a/usr/src/cmd/bhyve/pci_uart.c b/usr/src/cmd/bhyve/pci_uart.c new file mode 100644 index 0000000000..093d0cb361 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_uart.c @@ -0,0 +1,121 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "uart_emul.h" + +/* + * Pick a PCI vid/did of a chip with a single uart at + * BAR0, that most versions of FreeBSD can understand: + * Siig CyberSerial 1-port. + */ +#define COM_VENDOR 0x131f +#define COM_DEV 0x2000 + +static void +pci_uart_intr_assert(void *arg) +{ + struct pci_devinst *pi = arg; + + pci_lintr_assert(pi); +} + +static void +pci_uart_intr_deassert(void *arg) +{ + struct pci_devinst *pi = arg; + + pci_lintr_deassert(pi); +} + +static void +pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + + assert(baridx == 0); + assert(size == 1); + + uart_write(pi->pi_arg, offset, value); +} + +uint64_t +pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + uint8_t val; + + assert(baridx == 0); + assert(size == 1); + + val = uart_read(pi->pi_arg, offset); + return (val); +} + +static int +pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct uart_softc *sc; + + pci_emul_alloc_bar(pi, 0, PCIBAR_IO, UART_IO_BAR_SIZE); + pci_lintr_request(pi); + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV); + pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); + + sc = uart_init(pci_uart_intr_assert, pci_uart_intr_deassert, pi); + pi->pi_arg = sc; + + if (uart_set_backend(sc, opts) != 0) { + fprintf(stderr, "Unable to initialize backend '%s' for " + "pci uart at %d:%d\n", opts, pi->pi_slot, pi->pi_func); + return (-1); + } + + return (0); +} + +struct pci_devemu pci_de_com = { + .pe_emu = "uart", + .pe_init = pci_uart_init, + .pe_barwrite = pci_uart_write, + .pe_barread = pci_uart_read +}; +PCI_EMUL_SET(pci_de_com); diff --git a/usr/src/cmd/bhyve/pci_virtio_block.c b/usr/src/cmd/bhyve/pci_virtio_block.c index 65e2d9c57d..5a7ecbfe9e 100644 --- a/usr/src/cmd/bhyve/pci_virtio_block.c +++ b/usr/src/cmd/bhyve/pci_virtio_block.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02:47:09Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,10 +39,11 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02:47:09Z neel $"); +__FBSDID("$FreeBSD$"); #include #include @@ -63,24 +67,23 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02: #include "bhyverun.h" #include "pci_emul.h" #include "virtio.h" +#include "block_if.h" -#define VTBLK_RINGSZ 64 +#define VTBLK_RINGSZ 128 -#ifdef __FreeBSD__ -#define VTBLK_MAXSEGS 32 -#else -#define VTBLK_MAXSEGS 16 -#endif +_Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); #define VTBLK_S_OK 0 #define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 -#define VTBLK_BLK_ID_BYTES 20 +#define VTBLK_BLK_ID_BYTES 20 + 1 /* Capability bits */ #define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */ -#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ +#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ +#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */ +#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */ /* * Host capabilities @@ -88,6 +91,8 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02: #define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ + VTBLK_F_FLUSH | \ + VTBLK_F_TOPOLOGY | \ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* @@ -97,11 +102,19 @@ struct vtblk_config { uint64_t vbc_capacity; uint32_t vbc_size_max; uint32_t vbc_seg_max; - uint16_t vbc_geom_c; - uint8_t vbc_geom_h; - uint8_t vbc_geom_s; + struct { + uint16_t cylinders; + uint8_t heads; + uint8_t sectors; + } vbc_geometry; uint32_t vbc_blk_size; - uint32_t vbc_sectors_max; + struct { + uint8_t physical_block_exp; + uint8_t alignment_offset; + uint16_t min_io_size; + uint32_t opt_io_size; + } vbc_topology; + uint8_t vbc_writeback; } __packed; /* @@ -110,9 +123,11 @@ struct vtblk_config { struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 -#define VBH_OP_IDENT 8 +#define VBH_OP_FLUSH 4 +#define VBH_OP_FLUSH_OUT 5 +#define VBH_OP_IDENT 8 #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ - uint32_t vbh_type; + uint32_t vbh_type; uint32_t vbh_ioprio; uint64_t vbh_sector; } __packed; @@ -124,6 +139,13 @@ static int pci_vtblk_debug; #define DPRINTF(params) if (pci_vtblk_debug) printf params #define WPRINTF(params) printf params +struct pci_vtblk_ioreq { + struct blockif_req io_req; + struct pci_vtblk_softc *io_sc; + uint8_t *io_status; + uint16_t io_idx; +}; + /* * Per-device softc */ @@ -131,24 +153,36 @@ struct pci_vtblk_softc { struct virtio_softc vbsc_vs; pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; - int vbsc_fd; - struct vtblk_config vbsc_cfg; + struct vtblk_config vbsc_cfg; + struct blockif_ctxt *bc; +#ifndef __FreeBSD__ + int vbsc_wce; +#endif char vbsc_ident[VTBLK_BLK_ID_BYTES]; + struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; }; static void pci_vtblk_reset(void *); static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); +#ifndef __FreeBSD__ +static void pci_vtblk_apply_feats(void *, uint64_t); +#endif static struct virtio_consts vtblk_vi_consts = { "vtblk", /* our name */ 1, /* we support 1 virtqueue */ - sizeof(struct vtblk_config), /* config reg size */ + sizeof(struct vtblk_config), /* config reg size */ pci_vtblk_reset, /* reset */ pci_vtblk_notify, /* device-wide qnotify */ pci_vtblk_cfgread, /* read PCI config */ pci_vtblk_cfgwrite, /* write PCI config */ +#ifndef 
__FreeBSD__ + pci_vtblk_apply_feats, /* apply negotiated features */ +#else + NULL, /* apply negotiated features */ +#endif VTBLK_S_HOSTCAPS, /* our capabilities */ }; @@ -159,22 +193,58 @@ pci_vtblk_reset(void *vsc) DPRINTF(("vtblk: device reset requested !\n")); vi_reset_dev(&sc->vbsc_vs); +#ifndef __FreeBSD__ + /* Disable write cache until FLUSH feature is negotiated */ + (void) blockif_set_wce(sc->bc, 0); + sc->vbsc_wce = 0; +#endif +} + +static void +pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err) +{ + struct pci_vtblk_softc *sc = io->io_sc; + + /* convert errno into a virtio block error return */ + if (err == EOPNOTSUPP || err == ENOSYS) + *io->io_status = VTBLK_S_UNSUPP; + else if (err != 0) + *io->io_status = VTBLK_S_IOERR; + else + *io->io_status = VTBLK_S_OK; + + /* + * Return the descriptor back to the host. + * We wrote 1 byte (our status) to host. + */ + vq_relchain(&sc->vbsc_vq, io->io_idx, 1); + vq_endchains(&sc->vbsc_vq, 0); +} + +static void +pci_vtblk_done(struct blockif_req *br, int err) +{ + struct pci_vtblk_ioreq *io = br->br_param; + struct pci_vtblk_softc *sc = io->io_sc; + + pthread_mutex_lock(&sc->vsc_mtx); + pci_vtblk_done_locked(io, err); + pthread_mutex_unlock(&sc->vsc_mtx); } static void pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) { struct virtio_blk_hdr *vbh; - uint8_t *status; + struct pci_vtblk_ioreq *io; int i, n; int err; - int iolen; + ssize_t iolen; int writeop, type; - off_t offset; - struct iovec iov[VTBLK_MAXSEGS + 2]; - uint16_t flags[VTBLK_MAXSEGS + 2]; + struct iovec iov[BLOCKIF_IOV_MAX + 2]; + uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; - n = vq_getchain(vq, iov, VTBLK_MAXSEGS + 2, flags); + n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); /* * The first descriptor will be the read-only fixed header, @@ -184,13 +254,16 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) * XXX - note - this fails on crash dump, which does a * VIRTIO_BLK_T_FLUSH with a zero transfer length */ - assert(n >= 2 && n <= VTBLK_MAXSEGS + 2); + assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); + io = &sc->vbsc_ios[idx]; assert((flags[0] & VRING_DESC_F_WRITE) == 0); assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); - vbh = (struct virtio_block_hdr *)iov[0].iov_base; - - status = iov[--n].iov_base; + vbh = (struct virtio_blk_hdr *)iov[0].iov_base; + memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); + io->io_req.br_iovcnt = n - 2; + io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE; + io->io_status = (uint8_t *)iov[--n].iov_base; assert(iov[n].iov_len == 1); assert(flags[n] & VRING_DESC_F_WRITE); @@ -202,8 +275,6 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) type = vbh->vbh_type & ~VBH_FLAG_BARRIER; writeop = (type == VBH_OP_WRITE); - offset = vbh->vbh_sector * DEV_BSIZE; - iolen = 0; for (i = 1; i < n; i++) { /* @@ -215,42 +286,36 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); iolen += iov[i].iov_len; } + io->io_req.br_resid = iolen; - DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r", - writeop ? "write" : "read/ident", iolen, i - 1, offset)); + DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r", + writeop ? 
"write" : "read/ident", iolen, i - 1, + io->io_req.br_offset)); switch (type) { + case VBH_OP_READ: + err = blockif_read(sc->bc, &io->io_req); + break; case VBH_OP_WRITE: - err = pwritev(sc->vbsc_fd, iov + 1, i - 1, offset); + err = blockif_write(sc->bc, &io->io_req); break; - case VBH_OP_READ: - err = preadv(sc->vbsc_fd, iov + 1, i - 1, offset); + case VBH_OP_FLUSH: + case VBH_OP_FLUSH_OUT: + err = blockif_flush(sc->bc, &io->io_req); break; case VBH_OP_IDENT: /* Assume a single buffer */ - strlcpy(iov[1].iov_base, sc->vbsc_ident, + /* S/n equal to buffer is not zero-terminated. */ + memset(iov[1].iov_base, 0, iov[1].iov_len); + strncpy(iov[1].iov_base, sc->vbsc_ident, MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); - err = 0; - break; + pci_vtblk_done_locked(io, 0); + return; default: - err = -ENOSYS; - break; + pci_vtblk_done_locked(io, EOPNOTSUPP); + return; } - - /* convert errno into a virtio block error return */ - if (err < 0) { - if (err == -ENOSYS) - *status = VTBLK_S_UNSUPP; - else - *status = VTBLK_S_IOERR; - } else - *status = VTBLK_S_OK; - - /* - * Return the descriptor back to the host. - * We wrote 1 byte (our status) to host. - */ - vq_relchain(vq, 1); + assert(err == 0); } static void @@ -258,22 +323,20 @@ pci_vtblk_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtblk_softc *sc = vsc; - vq_startchains(vq); while (vq_has_descs(vq)) pci_vtblk_proc(sc, vq); - vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ } static int pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { - struct stat sbuf; + char bident[sizeof("XX:X:X")]; + struct blockif_ctxt *bctxt; MD5_CTX mdctx; u_char digest[16]; struct pci_vtblk_softc *sc; - off_t size; - int fd; - int sectsz; + off_t size; + int i, sectsz, sts, sto; if (opts == NULL) { printf("virtio-block: backing device required\n"); @@ -283,40 +346,32 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) /* * The supplied backing file has to exist */ - fd = open(opts, O_RDWR); - if (fd < 0) { + snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { perror("Could not open backing file"); return (1); } - if (fstat(fd, &sbuf) < 0) { - perror("Could not stat backing file"); - close(fd); - return (1); - } - - /* - * Deal with raw devices - */ - size = sbuf.st_size; - sectsz = DEV_BSIZE; -#ifdef __FreeBSD__ - if (S_ISCHR(sbuf.st_mode)) { - if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || - ioctl(fd, DIOCGSECTORSIZE, §sz)) { - perror("Could not fetch dev blk/sector size"); - close(fd); - return (1); - } - assert(size != 0); - assert(sectsz != 0); - } -#endif + size = blockif_size(bctxt); + sectsz = blockif_sectsz(bctxt); + blockif_psectsz(bctxt, &sts, &sto); sc = calloc(1, sizeof(struct pci_vtblk_softc)); + sc->bc = bctxt; + for (i = 0; i < VTBLK_RINGSZ; i++) { + struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; + io->io_req.br_callback = pci_vtblk_done; + io->io_req.br_param = io; + io->io_sc = sc; + io->io_idx = i; + } - /* record fd of storage device/file */ - sc->vbsc_fd = fd; +#ifndef __FreeBSD__ + /* Disable write cache until FLUSH feature is negotiated */ + (void) blockif_set_wce(sc->bc, 0); + sc->vbsc_wce = 0; +#endif pthread_mutex_init(&sc->vsc_mtx, NULL); @@ -333,19 +388,34 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) */ MD5Init(&mdctx); MD5Update(&mdctx, opts, strlen(opts)); - MD5Final(digest, &mdctx); - sprintf(sc->vbsc_ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X", + MD5Final(digest, &mdctx); + 
snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, + "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); /* setup virtio block config space */ sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */ - sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS; - sc->vbsc_cfg.vbc_blk_size = sectsz; sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ - sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */ - sc->vbsc_cfg.vbc_geom_h = 0; - sc->vbsc_cfg.vbc_geom_s = 0; - sc->vbsc_cfg.vbc_sectors_max = 0; + + /* + * If Linux is presented with a seg_max greater than the virtio queue + * size, it can stumble into situations where it violates its own + * invariants and panics. For safety, we keep seg_max clamped, paying + * heed to the two extra descriptors needed for the header and status + * of a request. + */ + sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); + sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ + sc->vbsc_cfg.vbc_geometry.heads = 0; + sc->vbsc_cfg.vbc_geometry.sectors = 0; + sc->vbsc_cfg.vbc_blk_size = sectsz; + sc->vbsc_cfg.vbc_topology.physical_block_exp = + (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; + sc->vbsc_cfg.vbc_topology.alignment_offset = + (sto != 0) ? ((sts - sto) / sectsz) : 0; + sc->vbsc_cfg.vbc_topology.min_io_size = 0; + sc->vbsc_cfg.vbc_topology.opt_io_size = 0; + sc->vbsc_cfg.vbc_writeback = 0; /* * Should we move some of this into virtio.c? Could @@ -356,9 +426,13 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); - if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) + if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { + blockif_close(sc->bc); + free(sc); return (1); + } vi_set_io_bar(&sc->vbsc_vs, 0); return (0); } @@ -383,6 +457,20 @@ pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) return (0); } +#ifndef __FreeBSD__ +void +pci_vtblk_apply_feats(void *vsc, uint64_t caps) +{ + struct pci_vtblk_softc *sc = vsc; + const int wce_next = ((caps & VTBLK_F_FLUSH) != 0) ? 1 : 0; + + if (sc->vbsc_wce != wce_next) { + (void) blockif_set_wce(sc->bc, wce_next); + sc->vbsc_wce = wce_next; + } +} +#endif /* __FreeBSD__ */ + struct pci_devemu pci_de_vblk = { .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, diff --git a/usr/src/cmd/bhyve/pci_virtio_console.c b/usr/src/cmd/bhyve/pci_virtio_console.c new file mode 100644 index 0000000000..90437662df --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_console.c @@ -0,0 +1,701 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 iXsystems Inc. + * All rights reserved. + * + * This software was developed by Jakub Klama + * under sponsorship from iXsystems Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" +#include "mevent.h" +#include "sockstream.h" + +#define VTCON_RINGSZ 64 +#define VTCON_MAXPORTS 16 +#define VTCON_MAXQ (VTCON_MAXPORTS * 2 + 2) + +#define VTCON_DEVICE_READY 0 +#define VTCON_DEVICE_ADD 1 +#define VTCON_DEVICE_REMOVE 2 +#define VTCON_PORT_READY 3 +#define VTCON_CONSOLE_PORT 4 +#define VTCON_CONSOLE_RESIZE 5 +#define VTCON_PORT_OPEN 6 +#define VTCON_PORT_NAME 7 + +#define VTCON_F_SIZE 0 +#define VTCON_F_MULTIPORT 1 +#define VTCON_F_EMERG_WRITE 2 +#define VTCON_S_HOSTCAPS \ + (VTCON_F_SIZE | VTCON_F_MULTIPORT | VTCON_F_EMERG_WRITE) + +static int pci_vtcon_debug; +#define DPRINTF(params) if (pci_vtcon_debug) printf params +#define WPRINTF(params) printf params + +struct pci_vtcon_softc; +struct pci_vtcon_port; +struct pci_vtcon_config; +typedef void (pci_vtcon_cb_t)(struct pci_vtcon_port *, void *, struct iovec *, + int); + +struct pci_vtcon_port { + struct pci_vtcon_softc * vsp_sc; + int vsp_id; + const char * vsp_name; + bool vsp_enabled; + bool vsp_console; + bool vsp_rx_ready; + bool vsp_open; + int vsp_rxq; + int vsp_txq; + void * vsp_arg; + pci_vtcon_cb_t * vsp_cb; +}; + +struct pci_vtcon_sock +{ + struct pci_vtcon_port * vss_port; + const char * vss_path; + struct mevent * vss_server_evp; + struct mevent * vss_conn_evp; + int vss_server_fd; + int vss_conn_fd; + bool vss_open; +}; + +struct pci_vtcon_softc { + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTCON_MAXQ]; + pthread_mutex_t vsc_mtx; + uint64_t vsc_cfg; + uint64_t vsc_features; + char * vsc_rootdir; + int vsc_kq; + int vsc_nports; + bool vsc_ready; + struct pci_vtcon_port vsc_control_port; + struct pci_vtcon_port vsc_ports[VTCON_MAXPORTS]; + struct pci_vtcon_config *vsc_config; +}; + +struct pci_vtcon_config { + uint16_t cols; + uint16_t rows; + uint32_t max_nr_ports; + uint32_t emerg_wr; +} __attribute__((packed)); + +struct pci_vtcon_control { + uint32_t id; + uint16_t event; + uint16_t value; +} __attribute__((packed)); + +struct pci_vtcon_console_resize { + uint16_t cols; + uint16_t rows; +} __attribute__((packed)); + +static void pci_vtcon_reset(void *); +static void pci_vtcon_notify_rx(void *, struct vqueue_info *); +static void pci_vtcon_notify_tx(void *, struct vqueue_info *); +static int pci_vtcon_cfgread(void *, int, int, uint32_t *); +static int pci_vtcon_cfgwrite(void *, int, int, uint32_t); 
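+
+/*
+ * Queue layout, following the virtio console spec: queues 0 and 1
+ * serve port 0, queues 2 and 3 carry the control channel, and each
+ * additional port N (N >= 1) uses queues 2N + 2 and 2N + 3; see
+ * pci_vtcon_vq_to_port() below for the reverse mapping.
+ */
+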
+static void pci_vtcon_neg_features(void *, uint64_t); +static void pci_vtcon_sock_accept(int, enum ev_type, void *); +static void pci_vtcon_sock_rx(int, enum ev_type, void *); +static void pci_vtcon_sock_tx(struct pci_vtcon_port *, void *, struct iovec *, + int); +static void pci_vtcon_control_send(struct pci_vtcon_softc *, + struct pci_vtcon_control *, const void *, size_t); +static void pci_vtcon_announce_port(struct pci_vtcon_port *); +static void pci_vtcon_open_port(struct pci_vtcon_port *, bool); + +static struct virtio_consts vtcon_vi_consts = { + "vtcon", /* our name */ + VTCON_MAXQ, /* we support VTCON_MAXQ virtqueues */ + sizeof(struct pci_vtcon_config), /* config reg size */ + pci_vtcon_reset, /* reset */ + NULL, /* device-wide qnotify */ + pci_vtcon_cfgread, /* read virtio config */ + pci_vtcon_cfgwrite, /* write virtio config */ + pci_vtcon_neg_features, /* apply negotiated features */ + VTCON_S_HOSTCAPS, /* our capabilities */ +}; + + +static void +pci_vtcon_reset(void *vsc) +{ + struct pci_vtcon_softc *sc; + + sc = vsc; + + DPRINTF(("vtcon: device reset requested!\n")); + vi_reset_dev(&sc->vsc_vs); +} + +static void +pci_vtcon_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtcon_softc *sc = vsc; + + sc->vsc_features = negotiated_features; +} + +static int +pci_vtcon_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtcon_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)sc->vsc_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static int +pci_vtcon_cfgwrite(void *vsc, int offset, int size, uint32_t val) +{ + + return (0); +} + +static inline struct pci_vtcon_port * +pci_vtcon_vq_to_port(struct pci_vtcon_softc *sc, struct vqueue_info *vq) +{ + uint16_t num = vq->vq_num; + + if (num == 0 || num == 1) + return (&sc->vsc_ports[0]); + + if (num == 2 || num == 3) + return (&sc->vsc_control_port); + + return (&sc->vsc_ports[(num / 2) - 1]); +} + +static inline struct vqueue_info * +pci_vtcon_port_to_vq(struct pci_vtcon_port *port, bool tx_queue) +{ + int qnum; + + qnum = tx_queue ? port->vsp_txq : port->vsp_rxq; + return (&port->vsp_sc->vsc_queues[qnum]); +} + +static struct pci_vtcon_port * +pci_vtcon_port_add(struct pci_vtcon_softc *sc, const char *name, + pci_vtcon_cb_t *cb, void *arg) +{ + struct pci_vtcon_port *port; + + if (sc->vsc_nports == VTCON_MAXPORTS) { + errno = EBUSY; + return (NULL); + } + + port = &sc->vsc_ports[sc->vsc_nports++]; + port->vsp_id = sc->vsc_nports - 1; + port->vsp_sc = sc; + port->vsp_name = name; + port->vsp_cb = cb; + port->vsp_arg = arg; + + if (port->vsp_id == 0) { + /* port0 */ + port->vsp_txq = 0; + port->vsp_rxq = 1; + } else { + port->vsp_txq = sc->vsc_nports * 2; + port->vsp_rxq = port->vsp_txq + 1; + } + + port->vsp_enabled = true; + return (port); +} + +static int +pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *name, + const char *path) +{ + struct pci_vtcon_sock *sock; +#ifdef __FreeBSD__ + struct sockaddr_un sun; + char *pathcopy; +#else + /* Our compiler #defines 'sun' as '1'. Awesome. 
*/ + struct sockaddr_un addr; +#endif + int s = -1, fd = -1, error = 0; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + sock = calloc(1, sizeof(struct pci_vtcon_sock)); + if (sock == NULL) { + error = -1; + goto out; + } + + s = socket(AF_UNIX, SOCK_STREAM, 0); + if (s < 0) { + error = -1; + goto out; + } + +#ifdef __FreeBSD__ + pathcopy = strdup(path); + if (pathcopy == NULL) { + error = -1; + goto out; + } + + fd = open(dirname(pathcopy), O_RDONLY | O_DIRECTORY); + if (fd < 0) { + free(pathcopy); + error = -1; + goto out; + } + + sun.sun_family = AF_UNIX; + sun.sun_len = sizeof(struct sockaddr_un); + strcpy(pathcopy, path); + strlcpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path)); + free(pathcopy); + + if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0) { + error = -1; + goto out; + } +#else /* __FreeBSD__ */ + /* Do a simple bind rather than the FreeBSD bindat() */ + addr.sun_family = AF_UNIX; + (void) strlcpy(addr.sun_path, path, sizeof (addr.sun_path)); + if (bind(fd, (struct sockaddr *)&addr, sizeof (addr)) < 0) { + error = -1; + goto out; + } +#endif /* __FreeBSD__ */ + + if (fcntl(s, F_SETFL, O_NONBLOCK) < 0) { + error = -1; + goto out; + } + + if (listen(s, 1) < 0) { + error = -1; + goto out; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + sock->vss_port = pci_vtcon_port_add(sc, name, pci_vtcon_sock_tx, sock); + if (sock->vss_port == NULL) { + error = -1; + goto out; + } + + sock->vss_open = false; + sock->vss_conn_fd = -1; + sock->vss_server_fd = s; + sock->vss_server_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_accept, + sock); + + if (sock->vss_server_evp == NULL) { + error = -1; + goto out; + } + +out: + if (fd != -1) + close(fd); + + if (error != 0 && s != -1) + close(s); + + return (error); +} + +static void +pci_vtcon_sock_accept(int fd __unused, enum ev_type t __unused, void *arg) +{ + struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; + int s; + + s = accept(sock->vss_server_fd, NULL, NULL); + if (s < 0) + return; + + if (sock->vss_open) { + close(s); + return; + } + + sock->vss_open = true; + sock->vss_conn_fd = s; + sock->vss_conn_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_rx, sock); + + pci_vtcon_open_port(sock->vss_port, true); +} + +static void +pci_vtcon_sock_rx(int fd __unused, enum ev_type t __unused, void *arg) +{ + struct pci_vtcon_port *port; + struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; + struct vqueue_info *vq; + struct iovec iov; + static char dummybuf[2048]; + int len, n; + uint16_t idx; + + port = sock->vss_port; + vq = pci_vtcon_port_to_vq(port, true); + + if (!sock->vss_open || !port->vsp_rx_ready) { + len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); + if (len == 0) + goto close; + + return; + } + + if (!vq_has_descs(vq)) { + len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); + vq_endchains(vq, 1); + if (len == 0) + goto close; + + return; + } + + do { + n = vq_getchain(vq, &idx, &iov, 1, NULL); + len = readv(sock->vss_conn_fd, &iov, n); + + if (len == 0 || (len < 0 && errno == EWOULDBLOCK)) { + vq_retchain(vq); + vq_endchains(vq, 0); + if (len == 0) + goto close; + + return; + } + + vq_relchain(vq, idx, len); + } while (vq_has_descs(vq)); + + vq_endchains(vq, 1); + +close: + mevent_delete_close(sock->vss_conn_evp); + sock->vss_conn_fd = -1; + sock->vss_open = false; +} + +static void +pci_vtcon_sock_tx(struct 
pci_vtcon_port *port, void *arg, struct iovec *iov, + int niov) +{ + struct pci_vtcon_sock *sock; +#ifdef __FreeBSD__ + int i, ret; +#else + int i, ret = 0; +#endif + + sock = (struct pci_vtcon_sock *)arg; + + if (sock->vss_conn_fd == -1) + return; + + for (i = 0; i < niov; i++) { + ret = stream_write(sock->vss_conn_fd, iov[i].iov_base, + iov[i].iov_len); + if (ret <= 0) + break; + } + + if (ret <= 0) { + mevent_delete_close(sock->vss_conn_evp); + sock->vss_conn_fd = -1; + sock->vss_open = false; + } +} + +static void +pci_vtcon_control_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, + int niov) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *tmp; + struct pci_vtcon_control resp, *ctrl; + int i; + + assert(niov == 1); + + sc = port->vsp_sc; + ctrl = (struct pci_vtcon_control *)iov->iov_base; + + switch (ctrl->event) { + case VTCON_DEVICE_READY: + sc->vsc_ready = true; + /* set port ready events for registered ports */ + for (i = 0; i < VTCON_MAXPORTS; i++) { + tmp = &sc->vsc_ports[i]; + if (tmp->vsp_enabled) + pci_vtcon_announce_port(tmp); + + if (tmp->vsp_open) + pci_vtcon_open_port(tmp, true); + } + break; + + case VTCON_PORT_READY: + if (ctrl->id >= sc->vsc_nports) { + WPRINTF(("VTCON_PORT_READY event for unknown port %d\n", + ctrl->id)); + return; + } + + tmp = &sc->vsc_ports[ctrl->id]; + if (tmp->vsp_console) { + resp.event = VTCON_CONSOLE_PORT; + resp.id = ctrl->id; + resp.value = 1; + pci_vtcon_control_send(sc, &resp, NULL, 0); + } + break; + } +} + +static void +pci_vtcon_announce_port(struct pci_vtcon_port *port) +{ + struct pci_vtcon_control event; + + event.id = port->vsp_id; + event.event = VTCON_DEVICE_ADD; + event.value = 1; + pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); + + event.event = VTCON_PORT_NAME; + pci_vtcon_control_send(port->vsp_sc, &event, port->vsp_name, + strlen(port->vsp_name)); +} + +static void +pci_vtcon_open_port(struct pci_vtcon_port *port, bool open) +{ + struct pci_vtcon_control event; + + if (!port->vsp_sc->vsc_ready) { + port->vsp_open = true; + return; + } + + event.id = port->vsp_id; + event.event = VTCON_PORT_OPEN; + event.value = (int)open; + pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); +} + +static void +pci_vtcon_control_send(struct pci_vtcon_softc *sc, + struct pci_vtcon_control *ctrl, const void *payload, size_t len) +{ + struct vqueue_info *vq; + struct iovec iov; + uint16_t idx; + int n; + + vq = pci_vtcon_port_to_vq(&sc->vsc_control_port, true); + + if (!vq_has_descs(vq)) + return; + + n = vq_getchain(vq, &idx, &iov, 1, NULL); + + assert(n == 1); + + memcpy(iov.iov_base, ctrl, sizeof(struct pci_vtcon_control)); + if (payload != NULL && len > 0) + memcpy(iov.iov_base + sizeof(struct pci_vtcon_control), + payload, len); + + vq_relchain(vq, idx, sizeof(struct pci_vtcon_control) + len); + vq_endchains(vq, 1); +} + + +static void +pci_vtcon_notify_tx(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *port; + struct iovec iov[1]; + uint16_t idx, n; + uint16_t flags[8]; + + sc = vsc; + port = pci_vtcon_vq_to_port(sc, vq); + + while (vq_has_descs(vq)) { + n = vq_getchain(vq, &idx, iov, 1, flags); + assert(n >= 1); + if (port != NULL) + port->vsp_cb(port, port->vsp_arg, iov, 1); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, 0); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. 
*/ +} + +static void +pci_vtcon_notify_rx(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *port; + + sc = vsc; + port = pci_vtcon_vq_to_port(sc, vq); + + if (!port->vsp_rx_ready) { + port->vsp_rx_ready = 1; + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + } +} + +static int +pci_vtcon_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtcon_softc *sc; + char *portname = NULL; + char *portpath = NULL; + char *opt; + int i; + + sc = calloc(1, sizeof(struct pci_vtcon_softc)); + sc->vsc_config = calloc(1, sizeof(struct pci_vtcon_config)); + sc->vsc_config->max_nr_ports = VTCON_MAXPORTS; + sc->vsc_config->cols = 80; + sc->vsc_config->rows = 25; + + vi_softc_linkup(&sc->vsc_vs, &vtcon_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + + for (i = 0; i < VTCON_MAXQ; i++) { + sc->vsc_queues[i].vq_qsize = VTCON_RINGSZ; + sc->vsc_queues[i].vq_notify = i % 2 == 0 + ? pci_vtcon_notify_rx + : pci_vtcon_notify_tx; + } + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_CONSOLE); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_CONSOLE); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vsc_vs, 0); + + /* create control port */ + sc->vsc_control_port.vsp_sc = sc; + sc->vsc_control_port.vsp_txq = 2; + sc->vsc_control_port.vsp_rxq = 3; + sc->vsc_control_port.vsp_cb = pci_vtcon_control_tx; + sc->vsc_control_port.vsp_enabled = true; + + while ((opt = strsep(&opts, ",")) != NULL) { + portname = strsep(&opt, "="); + portpath = opt; + + /* create port */ + if (pci_vtcon_sock_add(sc, portname, portpath) < 0) { + fprintf(stderr, "cannot create port %s: %s\n", + portname, strerror(errno)); + return (1); + } + } + + return (0); +} + +struct pci_devemu pci_de_vcon = { + .pe_emu = "virtio-console", + .pe_init = pci_vtcon_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vcon); diff --git a/usr/src/cmd/bhyve/pci_virtio_net.c b/usr/src/cmd/bhyve/pci_virtio_net.c index e58bdd0115..aa188a3e59 100644 --- a/usr/src/cmd/bhyve/pci_virtio_net.c +++ b/usr/src/cmd/bhyve/pci_virtio_net.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37:33Z grehan $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,18 +38,33 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37:33Z grehan $"); +__FBSDID("$FreeBSD$"); #include +#ifndef WITHOUT_CAPSICUM +#include +#endif #include #include #include #include +#include #include +#ifdef __FreeBSD__ +#ifndef NETMAP_WITH_LIBS +#define NETMAP_WITH_LIBS +#endif +#include +#endif +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include #include #include #include @@ -60,21 +77,22 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37 #include #include #include -#ifndef __FreeBSD__ +#include +#ifndef __FreeBSD__ #include #include #endif #include "bhyverun.h" #include "pci_emul.h" -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ #include "mevent.h" #endif #include "virtio.h" #define VTNET_RINGSZ 1024 -#define VTNET_MAXSEGS 32 +#define VTNET_MAXSEGS 256 /* * Host capabilities. Note that we only offer a few of these. @@ -101,7 +119,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37 #define VTNET_S_HOSTCAPS \ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \ - VIRTIO_F_NOTIFY_ON_EMPTY) + VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) /* * PCI config-space "registers" @@ -155,25 +173,35 @@ struct pci_vtnet_softc { dlpi_handle_t vsc_dhp; int vsc_dlpifd; #endif + struct nm_desc *vsc_nmd; + int vsc_rx_ready; volatile int resetting; /* set and checked outside lock */ - uint32_t vsc_features; + uint64_t vsc_features; /* negotiated features */ + struct virtio_net_config vsc_config; pthread_mutex_t rx_mtx; int rx_in_progress; + int rx_vhdrlen; + int rx_merge; /* merged rx bufs in use */ pthread_t tx_tid; pthread_mutex_t tx_mtx; pthread_cond_t tx_cond; int tx_in_progress; + + void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc); + void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov, + int iovcnt, int len); }; static void pci_vtnet_reset(void *); /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ static int pci_vtnet_cfgread(void *, int, int, uint32_t *); static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); +static void pci_vtnet_neg_features(void *, uint64_t); static struct virtio_consts vtnet_vi_consts = { "vtnet", /* our name */ @@ -183,6 +211,7 @@ static struct virtio_consts vtnet_vi_consts = { NULL, /* device-wide qnotify -- not used */ pci_vtnet_cfgread, /* read PCI config */ pci_vtnet_cfgwrite, /* write PCI config */ + pci_vtnet_neg_features, /* apply negotiated features */ VTNET_S_HOSTCAPS, /* our capabilities */ }; @@ -235,6 +264,8 @@ pci_vtnet_reset(void *vsc) pci_vtnet_rxwait(sc); sc->vsc_rx_ready = 0; + sc->rx_merge = 1; + sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); /* now reset rings, MSI-X vectors, and negotiated capabilities */ vi_reset_dev(&sc->vsc_vs); @@ -245,7 +276,7 @@ pci_vtnet_reset(void *vsc) /* * Called to send a buffer chain out to the tap device */ -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ static void pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, int len) @@ -275,13 +306,13 @@ pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, int i; for (i = 0; i < iovcnt; i++) { - (void) dlpi_send(sc->vsc_dhp, NULL, NULL, - iov[i].iov_base, iov[i].iov_len, NULL); + (void) dlpi_send(sc->vsc_dhp, NULL, 0, + iov[i].iov_base, iov[i].iov_len, NULL); } } -#endif +#endif /* __FreeBSD__ */ -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ /* * Called when there is read activity on the tap file descriptor. 
* Each buffer posted by the guest is assumed to be able to contain @@ -290,23 +321,43 @@ pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, * is no need for it to be per-vtnet or locked. */ static uint8_t dummybuf[2048]; -#endif +#endif /* __FreeBSD__ */ + +static __inline struct iovec * +rx_iov_trim(struct iovec *iov, int *niov, int tlen) +{ + struct iovec *riov; + + /* XXX short-cut: assume first segment is >= tlen */ + assert(iov[0].iov_len >= tlen); + + iov[0].iov_len -= tlen; + if (iov[0].iov_len == 0) { + assert(*niov > 1); + *niov -= 1; + riov = &iov[1]; + } else { + iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen); + riov = &iov[0]; + } + + return (riov); +} static void pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) { + struct iovec iov[VTNET_MAXSEGS], *riov; struct vqueue_info *vq; - struct virtio_net_rxhdr *vrx; - uint8_t *buf; + void *vrx; + int n; #ifdef __FreeBSD__ int len; -#endif - struct iovec iov[VTNET_MAXSEGS]; -#ifndef __FreeBSD__ +#else size_t len; int ret; #endif - int total_len = 0; + uint16_t idx; /* * Should never be called without a valid tap fd @@ -335,7 +386,6 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) * Check for available rx buffers */ vq = &sc->vsc_queues[VTNET_RXQ]; - vq_startchains(vq); if (!vq_has_descs(vq)) { /* * Drop the packet and try later. Interrupt on @@ -352,109 +402,267 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) /* * Get descriptor chain */ - if (sc->vsc_vs.vs_negotiated_caps & VIRTIO_NET_F_MRG_RXBUF) { - assert(vq_getchain(vq, iov, 1, NULL) == 1); + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); - /* - * Get a pointer to the rx header, and use the - * data immediately following it for the packet buffer. - */ - vrx = (struct virtio_net_rxhdr *)iov[0].iov_base; - buf = (uint8_t *)(vrx + 1); - total_len = iov[0].iov_len; + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vrx = iov[0].iov_base; + riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); #ifdef __FreeBSD__ - len = read(sc->vsc_tapfd, buf, - iov[0].iov_len - sizeof(struct virtio_net_rxhdr)); - - if (len < 0 && errno == EWOULDBLOCK) { - /* - * No more packets, but still some avail ring - * entries. Interrupt if needed/appropriate. - */ - vq_endchains(vq, 0); - return; - } + len = readv(sc->vsc_tapfd, riov, n); #else - len = iov[0].iov_len - sizeof(struct virtio_net_rxhdr); - ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, buf, - &len, 0, NULL); - if (ret != DLPI_SUCCESS) { - /* - * No more packets, but still some avail ring - * entries. Interrupt if needed/appropriate. - */ - vq_endchains(vq, 0); - return; - } + len = riov[0].iov_len; + ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, + (uint8_t *)riov[0].iov_base, &len, 0, NULL); + if (ret != DLPI_SUCCESS) { + errno = EWOULDBLOCK; + len = 0; + } #endif - } else { - int i; - int num_segs; - num_segs = vq_getchain(vq, iov, - VTNET_MAXSEGS, NULL); - vrx = (struct virtio_net_rxhrd *)iov[0].iov_base; - total_len = iov[0].iov_len; - for (i = 1; i < num_segs; i++) { - buf = (uint8_t *)iov[i].iov_base; - total_len += iov[i].iov_len; + if (len <= 0 && errno == EWOULDBLOCK) { + /* + * No more packets, but still some avail ring + * entries. Interrupt if needed/appropriate. + */ + vq_retchain(vq); + vq_endchains(vq, 0); + return; + } + + /* + * The only valid field in the rx packet header is the + * number of buffers if merged rx bufs were negotiated. 
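+	 * With merged rx bufs (VIRTIO_NET_F_MRG_RXBUF) each packet is
+	 * delivered in a single descriptor chain here, so the count
+	 * written is always 1.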
+ */ + memset(vrx, 0, sc->rx_vhdrlen); + + if (sc->rx_merge) { + struct virtio_net_rxhdr *vrxh; + + vrxh = vrx; + vrxh->vrh_bufs = 1; + } + + /* + * Release this chain and handle more chains. + */ + vq_relchain(vq, idx, len + sc->rx_vhdrlen); + } while (vq_has_descs(vq)); + + /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ + vq_endchains(vq, 1); +} + #ifdef __FreeBSD__ - len = read(sc->vsc_tapfd, buf, iov[i].iov_len); - if (len < 0 && errno == EWOULDBLOCK) { - /* - * No more packets, - * but still some avail ring entries. - * Interrupt if needed/appropriate. - */ - break; - } -#else - len = iov[i].iov_len; - ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, buf, - &len, 0, NULL); - if (ret != DLPI_SUCCESS) { - /* - * No more packets, - * but still some avail ring entries. - * Interrupt if needed/appropriate. - */ - total_len = 0; - break; - } -#endif - } - if (total_len == 0) { - vq_endchains(vq, 0); - return; - } +static __inline int +pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt) +{ + int r, i; + int len = 0; + + for (r = nmd->cur_tx_ring; ; ) { + struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r); + uint32_t cur, idx; + char *buf; + + if (nm_ring_empty(ring)) { + r++; + if (r > nmd->last_tx_ring) + r = nmd->first_tx_ring; + if (r == nmd->cur_tx_ring) + break; + continue; + } + cur = ring->cur; + idx = ring->slot[cur].buf_idx; + buf = NETMAP_BUF(ring, idx); + + for (i = 0; i < iovcnt; i++) { + if (len + iov[i].iov_len > 2048) + break; + memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len); + len += iov[i].iov_len; + } + ring->slot[cur].len = len; + ring->head = ring->cur = nm_ring_next(ring, cur); + nmd->cur_tx_ring = r; + ioctl(nmd->fd, NIOCTXSYNC, NULL); + break; + } + + return (len); +} + +static __inline int +pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt) +{ + int len = 0; + int i = 0; + int r; + + for (r = nmd->cur_rx_ring; ; ) { + struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r); + uint32_t cur, idx; + char *buf; + size_t left; + + if (nm_ring_empty(ring)) { + r++; + if (r > nmd->last_rx_ring) + r = nmd->first_rx_ring; + if (r == nmd->cur_rx_ring) + break; + continue; + } + cur = ring->cur; + idx = ring->slot[cur].buf_idx; + buf = NETMAP_BUF(ring, idx); + left = ring->slot[cur].len; + + for (i = 0; i < iovcnt && left > 0; i++) { + if (iov[i].iov_len > left) + iov[i].iov_len = left; + memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len); + len += iov[i].iov_len; + left -= iov[i].iov_len; + } + ring->head = ring->cur = nm_ring_next(ring, cur); + nmd->cur_rx_ring = r; + ioctl(nmd->fd, NIOCRXSYNC, NULL); + break; + } + for (; i < iovcnt; i++) + iov[i].iov_len = 0; + + return (len); +} + +/* + * Called to send a buffer chain out to the vale port + */ +static void +pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + static char pad[60]; /* all zero bytes */ + + if (sc->vsc_nmd == NULL) + return; + + /* + * If the length is < 60, pad out to that and add the + * extra zero'd segment to the iov. It is guaranteed that + * there is always an extra iov available by the caller. 
+ */ + if (len < 60) { + iov[iovcnt].iov_base = pad; + iov[iovcnt].iov_len = 60 - len; + iovcnt++; + } + (void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt); +} + +static void +pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc) +{ + struct iovec iov[VTNET_MAXSEGS], *riov; + struct vqueue_info *vq; + void *vrx; + int len, n; + uint16_t idx; + + /* + * Should never be called without a valid netmap descriptor + */ + assert(sc->vsc_nmd != NULL); + + /* + * But, will be called when the rx ring hasn't yet + * been set up or the guest is resetting the device. + */ + if (!sc->vsc_rx_ready || sc->resetting) { + /* + * Drop the packet and try later. + */ + (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); + return; + } + + /* + * Check for available rx buffers + */ + vq = &sc->vsc_queues[VTNET_RXQ]; + if (!vq_has_descs(vq)) { + /* + * Drop the packet and try later. Interrupt on + * empty, if that's negotiated. + */ + (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); + vq_endchains(vq, 1); + return; + } + + do { + /* + * Get descriptor chain. + */ + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vrx = iov[0].iov_base; + riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); + + len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n); + + if (len == 0) { + /* + * No more packets, but still some avail ring + * entries. Interrupt if needed/appropriate. + */ + vq_retchain(vq); + vq_endchains(vq, 0); + return; } /* * The only valid field in the rx packet header is the - * number of buffers, which is always 1 without TSO - * support. + * number of buffers if merged rx bufs were negotiated. */ - memset(vrx, 0, sizeof(struct virtio_net_rxhdr)); - vrx->vrh_bufs = 1; + memset(vrx, 0, sc->rx_vhdrlen); + + if (sc->rx_merge) { + struct virtio_net_rxhdr *vrxh; + + vrxh = vrx; + vrxh->vrh_bufs = 1; + } /* * Release this chain and handle more chains. */ - vq_relchain(vq, total_len); + vq_relchain(vq, idx, len + sc->rx_vhdrlen); } while (vq_has_descs(vq)); /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ vq_endchains(vq, 1); } +#endif /* __FreeBSD__ */ -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ static void -pci_vtnet_tap_callback(int fd, enum ev_type type, void *param) +pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) { struct pci_vtnet_softc *sc = param; pthread_mutex_lock(&sc->rx_mtx); sc->rx_in_progress = 1; - pci_vtnet_tap_rx(sc); + sc->pci_vtnet_rx(sc); sc->rx_in_progress = 0; pthread_mutex_unlock(&sc->rx_mtx); @@ -477,11 +685,15 @@ pci_vtnet_poll_thread(void *param) continue; } pthread_mutex_lock(&sc->vsc_mtx); + sc->rx_in_progress = 1; pci_vtnet_tap_rx(sc); + sc->rx_in_progress = 0; pthread_mutex_unlock(&sc->vsc_mtx); } + + return (NULL); } -#endif +#endif /* __FreeBSD__ */ static void pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) @@ -493,6 +705,7 @@ pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) */ if (sc->vsc_rx_ready == 0) { sc->vsc_rx_ready = 1; + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; } } @@ -502,13 +715,14 @@ pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) struct iovec iov[VTNET_MAXSEGS + 1]; int i, n; int plen, tlen; + uint16_t idx; /* * Obtain chain of descriptors. The first one is * really the header descriptor, so we need to sum * up two lengths: packet length and transfer length. 
*/ - n = vq_getchain(vq, iov, VTNET_MAXSEGS, NULL); + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); plen = 0; tlen = iov[0].iov_len; @@ -518,10 +732,10 @@ pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) } DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n)); - pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen); + sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen); /* chain is processed, release it and set tlen */ - vq_relchain(vq, tlen); + vq_relchain(vq, idx, tlen); } static void @@ -537,6 +751,7 @@ pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) /* Signal the tx thread for processing */ pthread_mutex_lock(&sc->tx_mtx); + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; if (sc->tx_in_progress == 0) pthread_cond_signal(&sc->tx_cond); pthread_mutex_unlock(&sc->tx_mtx); @@ -550,7 +765,7 @@ pci_vtnet_tx_thread(void *param) { struct pci_vtnet_softc *sc = param; struct vqueue_info *vq; - int have_work, error; + int error; vq = &sc->vsc_queues[VTNET_TXQ]; @@ -564,23 +779,20 @@ pci_vtnet_tx_thread(void *param) for (;;) { /* note - tx mutex is locked here */ - do { - if (sc->resetting) - have_work = 0; - else - have_work = vq_has_descs(vq); - - if (!have_work) { - sc->tx_in_progress = 0; - error = pthread_cond_wait(&sc->tx_cond, - &sc->tx_mtx); - assert(error == 0); - } - } while (!have_work); + while (sc->resetting || !vq_has_descs(vq)) { + vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; + mb(); + if (!sc->resetting && vq_has_descs(vq)) + break; + + sc->tx_in_progress = 0; + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + } + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; sc->tx_in_progress = 1; pthread_mutex_unlock(&sc->tx_mtx); - vq_startchains(vq); do { /* * Run through entries, placing them into @@ -597,42 +809,161 @@ pci_vtnet_tx_thread(void *param) pthread_mutex_lock(&sc->tx_mtx); } + return (NULL); } -#ifdef notyet +#ifdef __FreeBSD__ static void pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) { DPRINTF(("vtnet: control qnotify!\n\r")); } -#endif +#endif /* __FreeBSD__ */ -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ static int pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr) { - struct ether_addr *ea; - char *tmpstr; - char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + + tmpstr = strsep(&mac_str,"="); - tmpstr = strsep(&mac_str,"="); - - if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { - ea = ether_aton(mac_str); + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); - if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || - memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { fprintf(stderr, "Invalid MAC %s\n", mac_str); - return (EINVAL); - } else - memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); - } + return (EINVAL); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } - return (0); + return (0); } +#endif /* __FreeBSD__ */ + +static void +pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname) +{ + char tbuf[80]; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif +#ifndef __FreeBSD__ + uchar_t physaddr[DLPI_PHYSADDR_MAX]; + size_t physaddrlen = DLPI_PHYSADDR_MAX; + int error; +#endif + + strcpy(tbuf, "/dev/"); + strlcat(tbuf, devname, sizeof(tbuf)); + + sc->pci_vtnet_rx = pci_vtnet_tap_rx; + sc->pci_vtnet_tx = pci_vtnet_tap_tx; 
+#ifdef __FreeBSD__ + sc->vsc_tapfd = open(tbuf, O_RDWR); + if (sc->vsc_tapfd == -1) { + WPRINTF(("open of tap device %s failed\n", tbuf)); + return; + } + + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + int opt = 1; + if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { + WPRINTF(("tap device O_NONBLOCK failed\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + sc->vsc_mevp = mevent_add(sc->vsc_tapfd, + EVF_READ, + pci_vtnet_rx_callback, + sc); + if (sc->vsc_mevp == NULL) { + WPRINTF(("Could not register event\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } +#else + if (dlpi_open(devname, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) { + WPRINTF(("open of vnic device %s failed\n", devname)); + } + + if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr, + &physaddrlen) != DLPI_SUCCESS) { + WPRINTF(("read MAC address of vnic device %s failed\n", + devname)); + } + if (physaddrlen != ETHERADDRL) { + WPRINTF(("bad MAC address len %d on vnic device %s\n", + physaddrlen, devname)); + } + memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL); + + if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) { + WPRINTF(("bind of vnic device %s failed\n", devname)); + } + + if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) { + WPRINTF(("enable promiscous mode(physical) of vnic device %s " + "failed\n", devname)); + } + if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) { + WPRINTF(("enable promiscous mode(SAP) of vnic device %s " + "failed\n", devname)); + } + + sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp); + + if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) { + WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n", + devname)); + dlpi_close(sc->vsc_dhp); + sc->vsc_dlpifd = -1; + } + + error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc); + assert(error == 0); #endif +} + +#ifdef __FreeBSD__ +static void +pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname) +{ + sc->pci_vtnet_rx = pci_vtnet_netmap_rx; + sc->pci_vtnet_tx = pci_vtnet_netmap_tx; + + sc->vsc_nmd = nm_open(ifname, NULL, 0, 0); + if (sc->vsc_nmd == NULL) { + WPRINTF(("open of netmap device %s failed\n", ifname)); + return; + } + sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd, + EVF_READ, + pci_vtnet_rx_callback, + sc); + if (sc->vsc_mevp == NULL) { + WPRINTF(("Could not register event\n")); + nm_close(sc->vsc_nmd); + sc->vsc_nmd = NULL; + } +} +#endif /* __FreeBSD__ */ static int pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) @@ -640,31 +971,30 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) #ifdef __FreeBSD__ MD5_CTX mdctx; unsigned char digest[16]; -#else - uchar_t physaddr[DLPI_PHYSADDR_MAX]; - size_t physaddrlen = DLPI_PHYSADDR_MAX; - int error; -#endif char nstr[80]; +#endif char tname[MAXCOMLEN + 1]; struct pci_vtnet_softc *sc; const char *env_msi; char *devname; char *vtopts; +#ifdef __FreeBSD__ int mac_provided; +#endif int use_msix; - sc = malloc(sizeof(struct pci_vtnet_softc)); - memset(sc, 0, sizeof(struct pci_vtnet_softc)); + sc = calloc(1, sizeof(struct pci_vtnet_softc)); pthread_mutex_init(&sc->vsc_mtx, NULL); vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; 
sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; -#ifdef notyet +#ifdef __FreeBSD__ sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; #endif @@ -682,13 +1012,15 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) * Attempt to open the tap device and read the MAC address * if specified */ - mac_provided = 0; #ifdef __FreeBSD__ + mac_provided = 0; sc->vsc_tapfd = -1; #endif + sc->vsc_nmd = NULL; if (opts != NULL) { - char tbuf[80]; +#ifdef __FreeBSD__ int err; +#endif devname = vtopts = strdup(opts); (void) strsep(&vtopts, ","); @@ -704,72 +1036,15 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) } #endif - strcpy(tbuf, "/dev/"); - strlcat(tbuf, devname, sizeof(tbuf)); +#ifdef __FreeBSD__ + if (strncmp(devname, "vale", 4) == 0) + pci_vtnet_netmap_setup(sc, devname); +#endif + if (strncmp(devname, "tap", 3) == 0 || + strncmp(devname, "vmnet", 5) == 0) + pci_vtnet_tap_setup(sc, devname); free(devname); - -#ifdef __FreeBSD__ - sc->vsc_tapfd = open(tbuf, O_RDWR); - if (sc->vsc_tapfd == -1) { - WPRINTF(("open of tap device %s failed\n", tbuf)); - } else { - /* - * Set non-blocking and register for read - * notifications with the event loop - */ - int opt = 1; - if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { - WPRINTF(("tap device O_NONBLOCK failed\n")); - close(sc->vsc_tapfd); - sc->vsc_tapfd = -1; - } - - sc->vsc_mevp = mevent_add(sc->vsc_tapfd, - EVF_READ, - pci_vtnet_tap_callback, - sc); - if (sc->vsc_mevp == NULL) { - WPRINTF(("Could not register event\n")); - close(sc->vsc_tapfd); - sc->vsc_tapfd = -1; - } - } -#else - if (dlpi_open(opts, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) { - WPRINTF(("open of vnic device %s failed\n", opts)); - } - - if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr, &physaddrlen) != DLPI_SUCCESS) { - WPRINTF(("read MAC address of vnic device %s failed\n", opts)); - } - if (physaddrlen != ETHERADDRL) { - WPRINTF(("bad MAC address len %d on vnic device %s\n", physaddrlen, opts)); - } - memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL); - - if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) { - WPRINTF(("bind of vnic device %s failed\n", opts)); - } - - if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) { - WPRINTF(("enable promiscous mode(physical) of vnic device %s failed\n", opts)); - } - if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) { - WPRINTF(("enable promiscous mode(SAP) of vnic device %s failed\n", opts)); - } - - sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp); - - if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) { - WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n", opts)); - dlpi_close(sc->vsc_dhp); - sc->vsc_dlpifd = -1; - } - - error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc); - assert(error == 0); -#endif } #ifdef __FreeBSD__ @@ -799,9 +1074,15 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); - /* link always up */ - sc->vsc_config.status = 1; + /* Link is up if we managed to open tap device or vale port. 
*/ +#ifdef __FreeBSD__ + sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 || +#else + sc->vsc_config.status = (opts == NULL || sc->vsc_dlpifd >= 0 || +#endif + sc->vsc_nmd != NULL); /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ if (vi_intr_init(&sc->vsc_vs, 1, use_msix)) @@ -812,6 +1093,8 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) sc->resetting = 0; + sc->rx_merge = 1; + sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); sc->rx_in_progress = 0; pthread_mutex_init(&sc->rx_mtx, NULL); @@ -824,8 +1107,9 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pthread_mutex_init(&sc->tx_mtx, NULL); pthread_cond_init(&sc->tx_cond, NULL); pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); - snprintf(tname, sizeof(tname), "%s vtnet%d tx", vmname, pi->pi_slot); - pthread_set_name_np(sc->tx_tid, tname); + snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, + pi->pi_func); + pthread_set_name_np(sc->tx_tid, tname); return (0); } @@ -844,9 +1128,10 @@ pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) ptr = &sc->vsc_config.mac[offset]; memcpy(ptr, &value, size); } else { + /* silently ignore other writes */ DPRINTF(("vtnet: write to readonly reg %d\n\r", offset)); - return (1); } + return (0); } @@ -861,6 +1146,20 @@ pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) return (0); } +static void +pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtnet_softc *sc = vsc; + + sc->vsc_features = negotiated_features; + + if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) { + sc->rx_merge = 0; + /* non-merge rx header is 2 bytes shorter */ + sc->rx_vhdrlen -= 2; + } +} + struct pci_devemu pci_de_vnet = { .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, diff --git a/usr/src/cmd/bhyve/pci_virtio_rnd.c b/usr/src/cmd/bhyve/pci_virtio_rnd.c new file mode 100644 index 0000000000..5f470c03a6 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_rnd.c @@ -0,0 +1,209 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * virtio entropy device emulation. 
+ * Randomness is sourced from /dev/random which does not block + * once it has been seeded at bootup. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include + +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +#define VTRND_RINGSZ 64 + + +static int pci_vtrnd_debug; +#define DPRINTF(params) if (pci_vtrnd_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtrnd_softc { + struct virtio_softc vrsc_vs; + struct vqueue_info vrsc_vq; + pthread_mutex_t vrsc_mtx; + uint64_t vrsc_cfg; + int vrsc_fd; +}; + +static void pci_vtrnd_reset(void *); +static void pci_vtrnd_notify(void *, struct vqueue_info *); + +static struct virtio_consts vtrnd_vi_consts = { + "vtrnd", /* our name */ + 1, /* we support 1 virtqueue */ + 0, /* config reg size */ + pci_vtrnd_reset, /* reset */ + pci_vtrnd_notify, /* device-wide qnotify */ + NULL, /* read virtio config */ + NULL, /* write virtio config */ + NULL, /* apply negotiated features */ + 0, /* our capabilities */ +}; + + +static void +pci_vtrnd_reset(void *vsc) +{ + struct pci_vtrnd_softc *sc; + + sc = vsc; + + DPRINTF(("vtrnd: device reset requested !\n")); + vi_reset_dev(&sc->vrsc_vs); +} + + +static void +pci_vtrnd_notify(void *vsc, struct vqueue_info *vq) +{ + struct iovec iov; + struct pci_vtrnd_softc *sc; + int len; + uint16_t idx; + + sc = vsc; + + if (sc->vrsc_fd < 0) { + vq_endchains(vq, 0); + return; + } + + while (vq_has_descs(vq)) { + vq_getchain(vq, &idx, &iov, 1, NULL); + + len = read(sc->vrsc_fd, iov.iov_base, iov.iov_len); + + DPRINTF(("vtrnd: vtrnd_notify(): %d\r\n", len)); + + /* Catastrophe if unable to read from /dev/random */ + assert(len > 0); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, len); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ +} + + +static int +pci_vtrnd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtrnd_softc *sc; + int fd; + int len; + uint8_t v; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + /* + * Should always be able to open /dev/random. + */ + fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + + assert(fd >= 0); + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_READ); + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Check that device is seeded and non-blocking. 
+ */ + len = read(fd, &v, sizeof(v)); + if (len <= 0) { + WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len)); + close(fd); + return (1); + } + + sc = calloc(1, sizeof(struct pci_vtrnd_softc)); + + vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq); + sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx; + + sc->vrsc_vq.vq_qsize = VTRND_RINGSZ; + + /* keep /dev/random opened while emulating */ + sc->vrsc_fd = fd; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_RANDOM); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_ENTROPY); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vrsc_vs, 0); + + return (0); +} + + +struct pci_devemu pci_de_vrnd = { + .pe_emu = "virtio-rnd", + .pe_init = pci_vtrnd_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vrnd); diff --git a/usr/src/cmd/bhyve/pci_virtio_scsi.c b/usr/src/cmd/bhyve/pci_virtio_scsi.c new file mode 100644 index 0000000000..38e7d918a0 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_scsi.c @@ -0,0 +1,737 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama . + * Copyright (c) 2018 Marcelo Araujo . + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" +#include "iov.h" + +#define VTSCSI_RINGSZ 64 +#define VTSCSI_REQUESTQ 1 +#define VTSCSI_THR_PER_Q 16 +#define VTSCSI_MAXQ (VTSCSI_REQUESTQ + 2) +#define VTSCSI_MAXSEG 64 + +#define VTSCSI_IN_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_rd) + _sc->vss_config.cdb_size) + +#define VTSCSI_OUT_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_wr) + _sc->vss_config.sense_size) + +#define VIRTIO_SCSI_MAX_CHANNEL 0 +#define VIRTIO_SCSI_MAX_TARGET 0 +#define VIRTIO_SCSI_MAX_LUN 16383 + +#define VIRTIO_SCSI_F_INOUT (1 << 0) +#define VIRTIO_SCSI_F_HOTPLUG (1 << 1) +#define VIRTIO_SCSI_F_CHANGE (1 << 2) + +static int pci_vtscsi_debug = 0; +#define DPRINTF(params) if (pci_vtscsi_debug) printf params +#define WPRINTF(params) printf params + +struct pci_vtscsi_config { + uint32_t num_queues; + uint32_t seg_max; + uint32_t max_sectors; + uint32_t cmd_per_lun; + uint32_t event_info_size; + uint32_t sense_size; + uint32_t cdb_size; + uint16_t max_channel; + uint16_t max_target; + uint32_t max_lun; +} __attribute__((packed)); + +struct pci_vtscsi_queue { + struct pci_vtscsi_softc * vsq_sc; + struct vqueue_info * vsq_vq; + pthread_mutex_t vsq_mtx; + pthread_mutex_t vsq_qmtx; + pthread_cond_t vsq_cv; + STAILQ_HEAD(, pci_vtscsi_request) vsq_requests; + LIST_HEAD(, pci_vtscsi_worker) vsq_workers; +}; + +struct pci_vtscsi_worker { + struct pci_vtscsi_queue * vsw_queue; + pthread_t vsw_thread; + bool vsw_exiting; + LIST_ENTRY(pci_vtscsi_worker) vsw_link; +}; + +struct pci_vtscsi_request { + struct pci_vtscsi_queue * vsr_queue; + struct iovec vsr_iov_in[VTSCSI_MAXSEG]; + int vsr_niov_in; + struct iovec vsr_iov_out[VTSCSI_MAXSEG]; + int vsr_niov_out; + uint32_t vsr_idx; + STAILQ_ENTRY(pci_vtscsi_request) vsr_link; +}; + +/* + * Per-device softc + */ +struct pci_vtscsi_softc { + struct virtio_softc vss_vs; + struct vqueue_info vss_vq[VTSCSI_MAXQ]; + struct pci_vtscsi_queue vss_queues[VTSCSI_REQUESTQ]; + pthread_mutex_t vss_mtx; + int vss_iid; + int vss_ctl_fd; + uint32_t vss_features; + struct pci_vtscsi_config vss_config; +}; + +#define VIRTIO_SCSI_T_TMF 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 +#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 +#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 +#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 +#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 + +/* command-specific response values */ +#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 +#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 +#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 + +struct pci_vtscsi_ctrl_tmf { + uint32_t type; + uint32_t subtype; + uint8_t lun[8]; + uint64_t id; + uint8_t response; +} __attribute__((packed)); + +#define VIRTIO_SCSI_T_AN_QUERY 1 +#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 +#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 +#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 +#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 +#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 +#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 + +struct pci_vtscsi_ctrl_an { + uint32_t type; + uint8_t lun[8]; + uint32_t event_requested; + uint32_t 
event_actual; + uint8_t response; +} __attribute__((packed)); + +/* command-specific response values */ +#define VIRTIO_SCSI_S_OK 0 +#define VIRTIO_SCSI_S_OVERRUN 1 +#define VIRTIO_SCSI_S_ABORTED 2 +#define VIRTIO_SCSI_S_BAD_TARGET 3 +#define VIRTIO_SCSI_S_RESET 4 +#define VIRTIO_SCSI_S_BUSY 5 +#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 +#define VIRTIO_SCSI_S_TARGET_FAILURE 7 +#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 +#define VIRTIO_SCSI_S_FAILURE 9 +#define VIRTIO_SCSI_S_INCORRECT_LUN 12 + +/* task_attr */ +#define VIRTIO_SCSI_S_SIMPLE 0 +#define VIRTIO_SCSI_S_ORDERED 1 +#define VIRTIO_SCSI_S_HEAD 2 +#define VIRTIO_SCSI_S_ACA 3 + +struct pci_vtscsi_event { + uint32_t event; + uint8_t lun[8]; + uint32_t reason; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_rd { + uint8_t lun[8]; + uint64_t id; + uint8_t task_attr; + uint8_t prio; + uint8_t crn; + uint8_t cdb[]; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_wr { + uint32_t sense_len; + uint32_t residual; + uint16_t status_qualifier; + uint8_t status; + uint8_t response; + uint8_t sense[]; +} __attribute__((packed)); + +static void *pci_vtscsi_proc(void *); +static void pci_vtscsi_reset(void *); +static void pci_vtscsi_neg_features(void *, uint64_t); +static int pci_vtscsi_cfgread(void *, int, int, uint32_t *); +static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t); +static inline int pci_vtscsi_get_lun(uint8_t *); +static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t); +static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_tmf *); +static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_an *); +static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *, + int, struct iovec *, int); +static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *); +static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *, + struct pci_vtscsi_queue *, int); +static int pci_vtscsi_init(struct vmctx *, struct pci_devinst *, char *); + +static struct virtio_consts vtscsi_vi_consts = { + "vtscsi", /* our name */ + VTSCSI_MAXQ, /* we support 2+n virtqueues */ + sizeof(struct pci_vtscsi_config), /* config reg size */ + pci_vtscsi_reset, /* reset */ + NULL, /* device-wide qnotify */ + pci_vtscsi_cfgread, /* read virtio config */ + pci_vtscsi_cfgwrite, /* write virtio config */ + pci_vtscsi_neg_features, /* apply negotiated features */ + 0, /* our capabilities */ +}; + +static void * +pci_vtscsi_proc(void *arg) +{ + struct pci_vtscsi_worker *worker = (struct pci_vtscsi_worker *)arg; + struct pci_vtscsi_queue *q = worker->vsw_queue; + struct pci_vtscsi_request *req; + int iolen; + + for (;;) { + pthread_mutex_lock(&q->vsq_mtx); + + while (STAILQ_EMPTY(&q->vsq_requests) + && !worker->vsw_exiting) + pthread_cond_wait(&q->vsq_cv, &q->vsq_mtx); + + if (worker->vsw_exiting) + break; + + req = STAILQ_FIRST(&q->vsq_requests); + STAILQ_REMOVE_HEAD(&q->vsq_requests, vsr_link); + + pthread_mutex_unlock(&q->vsq_mtx); + iolen = pci_vtscsi_request_handle(q, req->vsr_iov_in, + req->vsr_niov_in, req->vsr_iov_out, req->vsr_niov_out); + + pthread_mutex_lock(&q->vsq_qmtx); + vq_relchain(q->vsq_vq, req->vsr_idx, iolen); + vq_endchains(q->vsq_vq, 0); + pthread_mutex_unlock(&q->vsq_qmtx); + + DPRINTF(("virtio-scsi: request completed\n", + req->vsr_idx)); + free(req); + } + + pthread_mutex_unlock(&q->vsq_mtx); + return 
(NULL); +} + +static void +pci_vtscsi_reset(void *vsc) +{ + struct pci_vtscsi_softc *sc; + + sc = vsc; + + DPRINTF(("vtscsi: device reset requested\n")); + vi_reset_dev(&sc->vss_vs); + + /* initialize config structure */ + sc->vss_config = (struct pci_vtscsi_config){ + .num_queues = VTSCSI_REQUESTQ, + .seg_max = VTSCSI_MAXSEG, + .max_sectors = 2, + .cmd_per_lun = 1, + .event_info_size = sizeof(struct pci_vtscsi_event), + .sense_size = 96, + .cdb_size = 32, + .max_channel = VIRTIO_SCSI_MAX_CHANNEL, + .max_target = VIRTIO_SCSI_MAX_TARGET, + .max_lun = VIRTIO_SCSI_MAX_LUN + }; +} + +static void +pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtscsi_softc *sc = vsc; + + sc->vss_features = negotiated_features; +} + +static int +pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtscsi_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)&sc->vss_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static int +pci_vtscsi_cfgwrite(void *vsc, int offset, int size, uint32_t val) +{ + + return (0); +} + +static inline int +pci_vtscsi_get_lun(uint8_t *lun) +{ + + return (((lun[2] << 8) | lun[3]) & 0x3fff); +} + +static int +pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf, + size_t bufsize) +{ + struct pci_vtscsi_ctrl_tmf *tmf; + struct pci_vtscsi_ctrl_an *an; + uint32_t type; + + type = *(uint32_t *)buf; + + if (type == VIRTIO_SCSI_T_TMF) { + tmf = (struct pci_vtscsi_ctrl_tmf *)buf; + return (pci_vtscsi_tmf_handle(sc, tmf)); + } + + if (type == VIRTIO_SCSI_T_AN_QUERY) { + an = (struct pci_vtscsi_ctrl_an *)buf; + return (pci_vtscsi_an_handle(sc, an)); + } + + return (0); +} + +static int +pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_ctrl_tmf *tmf) +{ + union ctl_io *io; + int err; + + io = ctl_scsi_alloc_io(sc->vss_iid); + ctl_scsi_zero_io(io); + + io->io_hdr.io_type = CTL_IO_TASK; + io->io_hdr.nexus.initid = sc->vss_iid; + io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun); + io->taskio.tag_type = CTL_TAG_SIMPLE; + io->taskio.tag_num = (uint32_t)tmf->id; + + switch (tmf->subtype) { + case VIRTIO_SCSI_T_TMF_ABORT_TASK: + io->taskio.task_action = CTL_TASK_ABORT_TASK; + break; + + case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: + io->taskio.task_action = CTL_TASK_ABORT_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_ACA: + io->taskio.task_action = CTL_TASK_CLEAR_ACA; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: + io->taskio.task_action = CTL_TASK_CLEAR_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: + io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET; + break; + + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + io->taskio.task_action = CTL_TASK_LUN_RESET; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK: + io->taskio.task_action = CTL_TASK_QUERY_TASK; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: + io->taskio.task_action = CTL_TASK_QUERY_TASK_SET; + break; + } + + if (pci_vtscsi_debug) { + struct sbuf *sb = sbuf_new_auto(); + ctl_io_sbuf(io, sb); + sbuf_finish(sb); + DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); + sbuf_delete(sb); + } + + err = ioctl(sc->vss_ctl_fd, CTL_IO, io); + if (err != 0) + WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno))); + + tmf->response = io->taskio.task_status; + ctl_scsi_free_io(io); + return (1); +} + +static int +pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_ctrl_an *an) +{ + + return (0); +} + +static int +pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in, + int 
niov_in, struct iovec *iov_out, int niov_out) +{ + struct pci_vtscsi_softc *sc = q->vsq_sc; + struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL; + struct pci_vtscsi_req_cmd_wr *cmd_wr; + struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG]; + union ctl_io *io; + int data_niov_in, data_niov_out; + void *ext_data_ptr = NULL; + uint32_t ext_data_len = 0, ext_sg_entries = 0; + int err; + + seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in, + VTSCSI_IN_HEADER_LEN(sc)); + seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out, + VTSCSI_OUT_HEADER_LEN(sc)); + + truncate_iov(iov_in, &niov_in, VTSCSI_IN_HEADER_LEN(sc)); + truncate_iov(iov_out, &niov_out, VTSCSI_OUT_HEADER_LEN(sc)); + iov_to_buf(iov_in, niov_in, (void **)&cmd_rd); + + cmd_wr = malloc(VTSCSI_OUT_HEADER_LEN(sc)); + io = ctl_scsi_alloc_io(sc->vss_iid); + ctl_scsi_zero_io(io); + + io->io_hdr.nexus.initid = sc->vss_iid; + io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun); + + io->io_hdr.io_type = CTL_IO_SCSI; + + if (data_niov_in > 0) { + ext_data_ptr = (void *)data_iov_in; + ext_sg_entries = data_niov_in; + ext_data_len = count_iov(data_iov_in, data_niov_in); + io->io_hdr.flags |= CTL_FLAG_DATA_OUT; + } else if (data_niov_out > 0) { + ext_data_ptr = (void *)data_iov_out; + ext_sg_entries = data_niov_out; + ext_data_len = count_iov(data_iov_out, data_niov_out); + io->io_hdr.flags |= CTL_FLAG_DATA_IN; + } + + io->scsiio.sense_len = sc->vss_config.sense_size; + io->scsiio.tag_num = (uint32_t)cmd_rd->id; + switch (cmd_rd->task_attr) { + case VIRTIO_SCSI_S_ORDERED: + io->scsiio.tag_type = CTL_TAG_ORDERED; + break; + case VIRTIO_SCSI_S_HEAD: + io->scsiio.tag_type = CTL_TAG_HEAD_OF_QUEUE; + break; + case VIRTIO_SCSI_S_ACA: + io->scsiio.tag_type = CTL_TAG_ACA; + break; + case VIRTIO_SCSI_S_SIMPLE: + default: + io->scsiio.tag_type = CTL_TAG_SIMPLE; + break; + } + io->scsiio.ext_sg_entries = ext_sg_entries; + io->scsiio.ext_data_ptr = ext_data_ptr; + io->scsiio.ext_data_len = ext_data_len; + io->scsiio.ext_data_filled = 0; + io->scsiio.cdb_len = sc->vss_config.cdb_size; + memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size); + + if (pci_vtscsi_debug) { + struct sbuf *sb = sbuf_new_auto(); + ctl_io_sbuf(io, sb); + sbuf_finish(sb); + DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); + sbuf_delete(sb); + } + + err = ioctl(sc->vss_ctl_fd, CTL_IO, io); + if (err != 0) { + WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno))); + cmd_wr->response = VIRTIO_SCSI_S_FAILURE; + } else { + cmd_wr->sense_len = MIN(io->scsiio.sense_len, + sc->vss_config.sense_size); + cmd_wr->residual = io->scsiio.residual; + cmd_wr->status = io->scsiio.scsi_status; + cmd_wr->response = VIRTIO_SCSI_S_OK; + memcpy(&cmd_wr->sense, &io->scsiio.sense_data, + cmd_wr->sense_len); + } + + buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0); + free(cmd_rd); + free(cmd_wr); + ctl_scsi_free_io(io); + return (VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled); +} + +static void +pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtscsi_softc *sc; + struct iovec iov[VTSCSI_MAXSEG]; + uint16_t idx, n; + void *buf = NULL; + size_t bufsize; + int iolen; + + sc = vsc; + + while (vq_has_descs(vq)) { + n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, NULL); + bufsize = iov_to_buf(iov, n, &buf); + iolen = pci_vtscsi_control_handle(sc, buf, bufsize); + buf_to_iov(buf + bufsize - iolen, iolen, iov, n, + bufsize - iolen); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, iolen); + } + 
vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ + free(buf); +} + +static void +pci_vtscsi_eventq_notify(void *vsc, struct vqueue_info *vq) +{ + + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; +} + +static void +pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtscsi_softc *sc; + struct pci_vtscsi_queue *q; + struct pci_vtscsi_request *req; + struct iovec iov[VTSCSI_MAXSEG]; + uint16_t flags[VTSCSI_MAXSEG]; + uint16_t idx, n, i; + int readable; + + sc = vsc; + q = &sc->vss_queues[vq->vq_num - 2]; + + while (vq_has_descs(vq)) { + readable = 0; + n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, flags); + + /* Count readable descriptors */ + for (i = 0; i < n; i++) { + if (flags[i] & VRING_DESC_F_WRITE) + break; + + readable++; + } + + req = calloc(1, sizeof(struct pci_vtscsi_request)); + req->vsr_idx = idx; + req->vsr_queue = q; + req->vsr_niov_in = readable; + req->vsr_niov_out = n - readable; + memcpy(req->vsr_iov_in, iov, + req->vsr_niov_in * sizeof(struct iovec)); + memcpy(req->vsr_iov_out, iov + readable, + req->vsr_niov_out * sizeof(struct iovec)); + + pthread_mutex_lock(&q->vsq_mtx); + STAILQ_INSERT_TAIL(&q->vsq_requests, req, vsr_link); + pthread_cond_signal(&q->vsq_cv); + pthread_mutex_unlock(&q->vsq_mtx); + + DPRINTF(("virtio-scsi: request enqueued\n", idx)); + } +} + +static int +pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_queue *queue, int num) +{ + struct pci_vtscsi_worker *worker; + char tname[MAXCOMLEN + 1]; + int i; + + queue->vsq_sc = sc; + queue->vsq_vq = &sc->vss_vq[num + 2]; + + pthread_mutex_init(&queue->vsq_mtx, NULL); + pthread_mutex_init(&queue->vsq_qmtx, NULL); + pthread_cond_init(&queue->vsq_cv, NULL); + STAILQ_INIT(&queue->vsq_requests); + LIST_INIT(&queue->vsq_workers); + + for (i = 0; i < VTSCSI_THR_PER_Q; i++) { + worker = calloc(1, sizeof(struct pci_vtscsi_worker)); + worker->vsw_queue = queue; + + pthread_create(&worker->vsw_thread, NULL, &pci_vtscsi_proc, + (void *)worker); + + snprintf(tname, sizeof(tname), "vtscsi:%d-%d", num, i); + pthread_set_name_np(worker->vsw_thread, tname); + LIST_INSERT_HEAD(&queue->vsq_workers, worker, vsw_link); + } + + return (0); +} + +static int +pci_vtscsi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtscsi_softc *sc; + char *opt, *optname; + const char *devname; + int i, optidx = 0; + + sc = calloc(1, sizeof(struct pci_vtscsi_softc)); + devname = "/dev/cam/ctl"; + while ((opt = strsep(&opts, ",")) != NULL) { + optname = strsep(&opt, "="); + if (opt == NULL && optidx == 0) { + if (optname[0] != 0) + devname = optname; + } else if (strcmp(optname, "dev") == 0 && opt != NULL) { + devname = opt; + } else if (strcmp(optname, "iid") == 0 && opt != NULL) { + sc->vss_iid = strtoul(opt, NULL, 10); + } else { + fprintf(stderr, "Invalid option %s\n", optname); + free(sc); + return (1); + } + optidx++; + } + + sc->vss_ctl_fd = open(devname, O_RDWR); + if (sc->vss_ctl_fd < 0) { + WPRINTF(("cannot open %s: %s\n", devname, strerror(errno))); + free(sc); + return (1); + } + + vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq); + sc->vss_vs.vs_mtx = &sc->vss_mtx; + + /* controlq */ + sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify; + + /* eventq */ + sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify; + + /* request queues */ + for (i = 2; i < VTSCSI_MAXQ; i++) { + sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[i].vq_notify = 
pci_vtscsi_requestq_notify; + pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2); + } + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_SCSI); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vss_vs, 0); + + return (0); +} + + +struct pci_devemu pci_de_vscsi = { + .pe_emu = "virtio-scsi", + .pe_init = pci_vtscsi_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vscsi); diff --git a/usr/src/cmd/bhyve/pci_virtio_viona.c b/usr/src/cmd/bhyve/pci_virtio_viona.c index f4d5d528be..e5a5cb584f 100644 --- a/usr/src/cmd/bhyve/pci_virtio_viona.c +++ b/usr/src/cmd/bhyve/pci_virtio_viona.c @@ -34,6 +34,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include @@ -289,6 +290,8 @@ pci_viona_tx_thread(void *param) sc->vsc_tx_kick_lock_held = B_FALSE; } pthread_mutex_unlock(&sc->tx_mtx); + + return (NULL); } static void @@ -347,8 +350,10 @@ static int pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc) { vioc_create_t vna_create; +#if notyet char devname[MAXNAMELEN]; int ctlfd; +#endif int error; sc->vsc_vnafd = open("/devices/pseudo/viona@0:ctl", O_RDWR | O_EXCL); @@ -360,10 +365,12 @@ pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc) vna_create.c_linkid = sc->vsc_linkid; strlcpy(vna_create.c_vmname, vmname, sizeof (vna_create.c_vmname)); +#if notyet vm_get_memory_seg(ctx, 1 * (1024 * 1024UL), &vna_create.c_lomem_size, NULL); vm_get_memory_seg(ctx, 4 * (1024 * 1024 * 1024UL), &vna_create.c_himem_size, NULL); +#endif error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create); if (error != 0) { WPRINTF(("ioctl viona create failed %d\n", error)); @@ -495,7 +502,7 @@ viona_adjust_offset(struct pci_devinst *pi, uint64_t offset) static void pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, - int baridx, uint64_t offset, int size, uint64_t value) + int baridx, uint64_t offset, int size, uint64_t value) { struct pci_viona_softc *sc = pi->pi_arg; void *ptr; diff --git a/usr/src/cmd/bhyve/pci_xhci.c b/usr/src/cmd/bhyve/pci_xhci.c new file mode 100644 index 0000000000..29d56ec32c --- /dev/null +++ b/usr/src/cmd/bhyve/pci_xhci.c @@ -0,0 +1,2855 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + XHCI options: + -s ,xhci,{devices} + + devices: + tablet USB tablet mouse + */ +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "bhyverun.h" +#include "pci_emul.h" +#include "pci_xhci.h" +#include "usb_emul.h" + + +static int xhci_debug = 0; +#define DPRINTF(params) if (xhci_debug) printf params +#define WPRINTF(params) printf params + + +#define XHCI_NAME "xhci" +#define XHCI_MAX_DEVS 8 /* 4 USB3 + 4 USB2 devs */ + +#define XHCI_MAX_SLOTS 64 /* min allowed by Windows drivers */ + +/* + * XHCI data structures can be up to 64k, but limit paddr_guest2host mapping + * to 4k to avoid going over the guest physical memory barrier. + */ +#define XHCI_PADDR_SZ 4096 /* paddr_guest2host max size */ + +#define XHCI_ERST_MAX 0 /* max 2^entries event ring seg tbl */ + +#define XHCI_CAPLEN (4*8) /* offset of op register space */ +#define XHCI_HCCPRAMS2 0x1C /* offset of HCCPARAMS2 register */ +#define XHCI_PORTREGS_START 0x400 +#define XHCI_DOORBELL_MAX 256 + +#define XHCI_STREAMS_MAX 1 /* 4-15 in XHCI spec */ + +/* caplength and hci-version registers */ +#define XHCI_SET_CAPLEN(x) ((x) & 0xFF) +#define XHCI_SET_HCIVERSION(x) (((x) & 0xFFFF) << 16) +#define XHCI_GET_HCIVERSION(x) (((x) >> 16) & 0xFFFF) + +/* hcsparams1 register */ +#define XHCI_SET_HCSP1_MAXSLOTS(x) ((x) & 0xFF) +#define XHCI_SET_HCSP1_MAXINTR(x) (((x) & 0x7FF) << 8) +#define XHCI_SET_HCSP1_MAXPORTS(x) (((x) & 0xFF) << 24) + +/* hcsparams2 register */ +#define XHCI_SET_HCSP2_IST(x) ((x) & 0x0F) +#define XHCI_SET_HCSP2_ERSTMAX(x) (((x) & 0x0F) << 4) +#define XHCI_SET_HCSP2_MAXSCRATCH_HI(x) (((x) & 0x1F) << 21) +#define XHCI_SET_HCSP2_MAXSCRATCH_LO(x) (((x) & 0x1F) << 27) + +/* hcsparams3 register */ +#define XHCI_SET_HCSP3_U1EXITLATENCY(x) ((x) & 0xFF) +#define XHCI_SET_HCSP3_U2EXITLATENCY(x) (((x) & 0xFFFF) << 16) + +/* hccparams1 register */ +#define XHCI_SET_HCCP1_AC64(x) ((x) & 0x01) +#define XHCI_SET_HCCP1_BNC(x) (((x) & 0x01) << 1) +#define XHCI_SET_HCCP1_CSZ(x) (((x) & 0x01) << 2) +#define XHCI_SET_HCCP1_PPC(x) (((x) & 0x01) << 3) +#define XHCI_SET_HCCP1_PIND(x) (((x) & 0x01) << 4) +#define XHCI_SET_HCCP1_LHRC(x) (((x) & 0x01) << 5) +#define XHCI_SET_HCCP1_LTC(x) (((x) & 0x01) << 6) +#define XHCI_SET_HCCP1_NSS(x) (((x) & 0x01) << 7) +#define XHCI_SET_HCCP1_PAE(x) (((x) & 0x01) << 8) +#define XHCI_SET_HCCP1_SPC(x) (((x) & 0x01) << 9) +#define XHCI_SET_HCCP1_SEC(x) (((x) & 0x01) << 10) +#define XHCI_SET_HCCP1_CFC(x) (((x) & 0x01) << 11) +#define XHCI_SET_HCCP1_MAXPSA(x) (((x) & 0x0F) << 12) +#define XHCI_SET_HCCP1_XECP(x) (((x) & 0xFFFF) << 16) + +/* hccparams2 register */ +#define XHCI_SET_HCCP2_U3C(x) ((x) & 0x01) +#define XHCI_SET_HCCP2_CMC(x) (((x) & 0x01) << 1) +#define XHCI_SET_HCCP2_FSC(x) (((x) & 0x01) << 2) +#define XHCI_SET_HCCP2_CTC(x) (((x) & 0x01) << 3) +#define XHCI_SET_HCCP2_LEC(x) (((x) & 0x01) << 4) +#define XHCI_SET_HCCP2_CIC(x) 
(((x) & 0x01) << 5) + +/* other registers */ +#define XHCI_SET_DOORBELL(x) ((x) & ~0x03) +#define XHCI_SET_RTSOFFSET(x) ((x) & ~0x0F) + +/* register masks */ +#define XHCI_PS_PLS_MASK (0xF << 5) /* port link state */ +#define XHCI_PS_SPEED_MASK (0xF << 10) /* port speed */ +#define XHCI_PS_PIC_MASK (0x3 << 14) /* port indicator */ + +/* port register set */ +#define XHCI_PORTREGS_BASE 0x400 /* base offset */ +#define XHCI_PORTREGS_PORT0 0x3F0 +#define XHCI_PORTREGS_SETSZ 0x10 /* size of a set */ + +#define MASK_64_HI(x) ((x) & ~0xFFFFFFFFULL) +#define MASK_64_LO(x) ((x) & 0xFFFFFFFFULL) + +#define FIELD_REPLACE(a,b,m,s) (((a) & ~((m) << (s))) | \ + (((b) & (m)) << (s))) +#define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \ + (((b) & ((m) << (s))))) + +struct pci_xhci_trb_ring { + uint64_t ringaddr; /* current dequeue guest address */ + uint32_t ccs; /* consumer cycle state */ +}; + +/* device endpoint transfer/stream rings */ +struct pci_xhci_dev_ep { + union { + struct xhci_trb *_epu_tr; + struct xhci_stream_ctx *_epu_sctx; + } _ep_trbsctx; +#define ep_tr _ep_trbsctx._epu_tr +#define ep_sctx _ep_trbsctx._epu_sctx + + union { + struct pci_xhci_trb_ring _epu_trb; + struct pci_xhci_trb_ring *_epu_sctx_trbs; + } _ep_trb_rings; +#define ep_ringaddr _ep_trb_rings._epu_trb.ringaddr +#define ep_ccs _ep_trb_rings._epu_trb.ccs +#define ep_sctx_trbs _ep_trb_rings._epu_sctx_trbs + + struct usb_data_xfer *ep_xfer; /* transfer chain */ +}; + +/* device context base address array: maps slot->device context */ +struct xhci_dcbaa { + uint64_t dcba[USB_MAX_DEVICES+1]; /* xhci_dev_ctx ptrs */ +}; + +/* port status registers */ +struct pci_xhci_portregs { + uint32_t portsc; /* port status and control */ + uint32_t portpmsc; /* port pwr mgmt status & control */ + uint32_t portli; /* port link info */ + uint32_t porthlpmc; /* port hardware LPM control */ +} __packed; +#define XHCI_PS_SPEED_SET(x) (((x) & 0xF) << 10) + +/* xHC operational registers */ +struct pci_xhci_opregs { + uint32_t usbcmd; /* usb command */ + uint32_t usbsts; /* usb status */ + uint32_t pgsz; /* page size */ + uint32_t dnctrl; /* device notification control */ + uint64_t crcr; /* command ring control */ + uint64_t dcbaap; /* device ctx base addr array ptr */ + uint32_t config; /* configure */ + + /* guest mapped addresses: */ + struct xhci_trb *cr_p; /* crcr dequeue */ + struct xhci_dcbaa *dcbaa_p; /* dev ctx array ptr */ +}; + +/* xHC runtime registers */ +struct pci_xhci_rtsregs { + uint32_t mfindex; /* microframe index */ + struct { /* interrupter register set */ + uint32_t iman; /* interrupter management */ + uint32_t imod; /* interrupter moderation */ + uint32_t erstsz; /* event ring segment table size */ + uint32_t rsvd; + uint64_t erstba; /* event ring seg-tbl base addr */ + uint64_t erdp; /* event ring dequeue ptr */ + } intrreg __packed; + + /* guest mapped addresses */ + struct xhci_event_ring_seg *erstba_p; + struct xhci_trb *erst_p; /* event ring segment tbl */ + int er_deq_seg; /* event ring dequeue segment */ + int er_enq_idx; /* event ring enqueue index - xHCI */ + int er_enq_seg; /* event ring enqueue segment */ + uint32_t er_events_cnt; /* number of events in ER */ + uint32_t event_pcs; /* producer cycle state flag */ +}; + + +struct pci_xhci_softc; + + +/* + * USB device emulation container. + * This is referenced from usb_hci->hci_sc; 1 pci_xhci_dev_emu for each + * emulated device instance. 
+ */ +struct pci_xhci_dev_emu { + struct pci_xhci_softc *xsc; + + /* XHCI contexts */ + struct xhci_dev_ctx *dev_ctx; + struct pci_xhci_dev_ep eps[XHCI_MAX_ENDPOINTS]; + int dev_slotstate; + + struct usb_devemu *dev_ue; /* USB emulated dev */ + void *dev_sc; /* device's softc */ + + struct usb_hci hci; +}; + +struct pci_xhci_softc { + struct pci_devinst *xsc_pi; + + pthread_mutex_t mtx; + + uint32_t caplength; /* caplen & hciversion */ + uint32_t hcsparams1; /* structural parameters 1 */ + uint32_t hcsparams2; /* structural parameters 2 */ + uint32_t hcsparams3; /* structural parameters 3 */ + uint32_t hccparams1; /* capability parameters 1 */ + uint32_t dboff; /* doorbell offset */ + uint32_t rtsoff; /* runtime register space offset */ + uint32_t hccparams2; /* capability parameters 2 */ + + uint32_t regsend; /* end of configuration registers */ + + struct pci_xhci_opregs opregs; + struct pci_xhci_rtsregs rtsregs; + + struct pci_xhci_portregs *portregs; + struct pci_xhci_dev_emu **devices; /* XHCI[port] = device */ + struct pci_xhci_dev_emu **slots; /* slots assigned from 1 */ + int ndevices; + + int usb2_port_start; + int usb3_port_start; +}; + + +/* portregs and devices arrays are set up to start from idx=1 */ +#define XHCI_PORTREG_PTR(x,n) &(x)->portregs[(n)] +#define XHCI_DEVINST_PTR(x,n) (x)->devices[(n)] +#define XHCI_SLOTDEV_PTR(x,n) (x)->slots[(n)] + +#define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH) + +#define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \ + (a), \ + XHCI_PADDR_SZ - ((a) & (XHCI_PADDR_SZ-1))) + +static int xhci_in_use; + +/* map USB errors to XHCI */ +static const int xhci_usb_errors[USB_ERR_MAX] = { + [USB_ERR_NORMAL_COMPLETION] = XHCI_TRB_ERROR_SUCCESS, + [USB_ERR_PENDING_REQUESTS] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NOT_STARTED] = XHCI_TRB_ERROR_ENDP_NOT_ON, + [USB_ERR_INVAL] = XHCI_TRB_ERROR_INVALID, + [USB_ERR_NOMEM] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_CANCELLED] = XHCI_TRB_ERROR_STOPPED, + [USB_ERR_BAD_ADDRESS] = XHCI_TRB_ERROR_PARAMETER, + [USB_ERR_BAD_BUFSIZE] = XHCI_TRB_ERROR_PARAMETER, + [USB_ERR_BAD_FLAG] = XHCI_TRB_ERROR_PARAMETER, + [USB_ERR_NO_CALLBACK] = XHCI_TRB_ERROR_STALL, + [USB_ERR_IN_USE] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NO_ADDR] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NO_PIPE] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_ZERO_NFRAMES] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_ZERO_MAXP] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_SET_ADDR_FAILED] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NO_POWER] = XHCI_TRB_ERROR_ENDP_NOT_ON, + [USB_ERR_TOO_DEEP] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_IOERROR] = XHCI_TRB_ERROR_TRB, + [USB_ERR_NOT_CONFIGURED] = XHCI_TRB_ERROR_ENDP_NOT_ON, + [USB_ERR_TIMEOUT] = XHCI_TRB_ERROR_CMD_ABORTED, + [USB_ERR_SHORT_XFER] = XHCI_TRB_ERROR_SHORT_PKT, + [USB_ERR_STALLED] = XHCI_TRB_ERROR_STALL, + [USB_ERR_INTERRUPTED] = XHCI_TRB_ERROR_CMD_ABORTED, + [USB_ERR_DMA_LOAD_FAILED] = XHCI_TRB_ERROR_DATA_BUF, + [USB_ERR_BAD_CONTEXT] = XHCI_TRB_ERROR_TRB, + [USB_ERR_NO_ROOT_HUB] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_NO_INTR_THREAD] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_NOT_LOCKED] = XHCI_TRB_ERROR_UNDEFINED, +}; +#define USB_TO_XHCI_ERR(e) ((e) < USB_ERR_MAX ? 
xhci_usb_errors[(e)] : \ + XHCI_TRB_ERROR_INVALID) + +static int pci_xhci_insert_event(struct pci_xhci_softc *sc, + struct xhci_trb *evtrb, int do_intr); +static void pci_xhci_dump_trb(struct xhci_trb *trb); +static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc); +static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot); +static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm); +static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, + struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, + struct xhci_endp_ctx *ep_ctx, uint32_t streamid, + uint64_t ringaddr, int ccs); + +static void +pci_xhci_set_evtrb(struct xhci_trb *evtrb, uint64_t port, uint32_t errcode, + uint32_t evtype) +{ + evtrb->qwTrb0 = port << 24; + evtrb->dwTrb2 = XHCI_TRB_2_ERROR_SET(errcode); + evtrb->dwTrb3 = XHCI_TRB_3_TYPE_SET(evtype); +} + + +/* controller reset */ +static void +pci_xhci_reset(struct pci_xhci_softc *sc) +{ + int i; + + sc->rtsregs.er_enq_idx = 0; + sc->rtsregs.er_events_cnt = 0; + sc->rtsregs.event_pcs = 1; + + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + pci_xhci_reset_slot(sc, i); + } +} + +static uint32_t +pci_xhci_usbcmd_write(struct pci_xhci_softc *sc, uint32_t cmd) +{ + int do_intr = 0; + int i; + + if (cmd & XHCI_CMD_RS) { + do_intr = (sc->opregs.usbcmd & XHCI_CMD_RS) == 0; + + sc->opregs.usbcmd |= XHCI_CMD_RS; + sc->opregs.usbsts &= ~XHCI_STS_HCH; + sc->opregs.usbsts |= XHCI_STS_PCD; + + /* Queue port change event on controller run from stop */ + if (do_intr) + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + struct pci_xhci_dev_emu *dev; + struct pci_xhci_portregs *port; + struct xhci_trb evtrb; + + if ((dev = XHCI_DEVINST_PTR(sc, i)) == NULL) + continue; + + port = XHCI_PORTREG_PTR(sc, i); + port->portsc |= XHCI_PS_CSC | XHCI_PS_CCS; + port->portsc &= ~XHCI_PS_PLS_MASK; + + /* + * XHCI 4.19.3 USB2 RxDetect->Polling, + * USB3 Polling->U0 + */ + if (dev->dev_ue->ue_usbver == 2) + port->portsc |= + XHCI_PS_PLS_SET(UPS_PORT_LS_POLL); + else + port->portsc |= + XHCI_PS_PLS_SET(UPS_PORT_LS_U0); + + pci_xhci_set_evtrb(&evtrb, i, + XHCI_TRB_ERROR_SUCCESS, + XHCI_TRB_EVENT_PORT_STS_CHANGE); + + if (pci_xhci_insert_event(sc, &evtrb, 0) != + XHCI_TRB_ERROR_SUCCESS) + break; + } + } else { + sc->opregs.usbcmd &= ~XHCI_CMD_RS; + sc->opregs.usbsts |= XHCI_STS_HCH; + sc->opregs.usbsts &= ~XHCI_STS_PCD; + } + + /* start execution of schedule; stop when set to 0 */ + cmd |= sc->opregs.usbcmd & XHCI_CMD_RS; + + if (cmd & XHCI_CMD_HCRST) { + /* reset controller */ + pci_xhci_reset(sc); + cmd &= ~XHCI_CMD_HCRST; + } + + cmd &= ~(XHCI_CMD_CSS | XHCI_CMD_CRS); + + if (do_intr) + pci_xhci_assert_interrupt(sc); + + return (cmd); +} + +static void +pci_xhci_portregs_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + struct xhci_trb evtrb; + struct pci_xhci_portregs *p; + int port; + uint32_t oldpls, newpls; + + if (sc->portregs == NULL) + return; + + port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ; + offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ; + + DPRINTF(("pci_xhci: portregs wr offset 0x%lx, port %u: 0x%lx\r\n", + offset, port, value)); + + assert(port >= 0); + + if (port > XHCI_MAX_DEVS) { + DPRINTF(("pci_xhci: portregs_write port %d > ndevices\r\n", + port)); + return; + } + + if (XHCI_DEVINST_PTR(sc, port) == NULL) { + DPRINTF(("pci_xhci: portregs_write to unattached port %d\r\n", + port)); + } + + p = XHCI_PORTREG_PTR(sc, port); + switch (offset) { + case 0: + /* port reset or warm reset */ + if (value & 
(XHCI_PS_PR | XHCI_PS_WPR)) { + pci_xhci_reset_port(sc, port, value & XHCI_PS_WPR); + break; + } + + if ((p->portsc & XHCI_PS_PP) == 0) { + WPRINTF(("pci_xhci: portregs_write to unpowered " + "port %d\r\n", port)); + break; + } + + /* Port status and control register */ + oldpls = XHCI_PS_PLS_GET(p->portsc); + newpls = XHCI_PS_PLS_GET(value); + + p->portsc &= XHCI_PS_PED | XHCI_PS_PLS_MASK | + XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK; + + if (XHCI_DEVINST_PTR(sc, port)) + p->portsc |= XHCI_PS_CCS; + + p->portsc |= (value & + ~(XHCI_PS_OCA | + XHCI_PS_PR | + XHCI_PS_PED | + XHCI_PS_PLS_MASK | /* link state */ + XHCI_PS_SPEED_MASK | + XHCI_PS_PIC_MASK | /* port indicator */ + XHCI_PS_LWS | XHCI_PS_DR | XHCI_PS_WPR)); + + /* clear control bits */ + p->portsc &= ~(value & + (XHCI_PS_CSC | + XHCI_PS_PEC | + XHCI_PS_WRC | + XHCI_PS_OCC | + XHCI_PS_PRC | + XHCI_PS_PLC | + XHCI_PS_CEC | + XHCI_PS_CAS)); + + /* port disable request; for USB3, don't care */ + if (value & XHCI_PS_PED) + DPRINTF(("Disable port %d request\r\n", port)); + + if (!(value & XHCI_PS_LWS)) + break; + + DPRINTF(("Port new PLS: %d\r\n", newpls)); + switch (newpls) { + case 0: /* U0 */ + case 3: /* U3 */ + if (oldpls != newpls) { + p->portsc &= ~XHCI_PS_PLS_MASK; + p->portsc |= XHCI_PS_PLS_SET(newpls) | + XHCI_PS_PLC; + + if (oldpls != 0 && newpls == 0) { + pci_xhci_set_evtrb(&evtrb, port, + XHCI_TRB_ERROR_SUCCESS, + XHCI_TRB_EVENT_PORT_STS_CHANGE); + + pci_xhci_insert_event(sc, &evtrb, 1); + } + } + break; + + default: + DPRINTF(("Unhandled change port %d PLS %u\r\n", + port, newpls)); + break; + } + break; + case 4: + /* Port power management status and control register */ + p->portpmsc = value; + break; + case 8: + /* Port link information register */ + DPRINTF(("pci_xhci attempted write to PORTLI, port %d\r\n", + port)); + break; + case 12: + /* + * Port hardware LPM control register. + * For USB3, this register is reserved. 
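+		 * The written value is retained below but has no
+		 * further effect in this emulation.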
+ */ + p->porthlpmc = value; + break; + } +} + +struct xhci_dev_ctx * +pci_xhci_get_dev_ctx(struct pci_xhci_softc *sc, uint32_t slot) +{ + uint64_t devctx_addr; + struct xhci_dev_ctx *devctx; + + assert(slot > 0 && slot <= sc->ndevices); + assert(sc->opregs.dcbaa_p != NULL); + + devctx_addr = sc->opregs.dcbaa_p->dcba[slot]; + + if (devctx_addr == 0) { + DPRINTF(("get_dev_ctx devctx_addr == 0\r\n")); + return (NULL); + } + + DPRINTF(("pci_xhci: get dev ctx, slot %u devctx addr %016lx\r\n", + slot, devctx_addr)); + devctx = XHCI_GADDR(sc, devctx_addr & ~0x3FUL); + + return (devctx); +} + +struct xhci_trb * +pci_xhci_trb_next(struct pci_xhci_softc *sc, struct xhci_trb *curtrb, + uint64_t *guestaddr) +{ + struct xhci_trb *next; + + assert(curtrb != NULL); + + if (XHCI_TRB_3_TYPE_GET(curtrb->dwTrb3) == XHCI_TRB_TYPE_LINK) { + if (guestaddr) + *guestaddr = curtrb->qwTrb0 & ~0xFUL; + + next = XHCI_GADDR(sc, curtrb->qwTrb0 & ~0xFUL); + } else { + if (guestaddr) + *guestaddr += sizeof(struct xhci_trb) & ~0xFUL; + + next = curtrb + 1; + } + + return (next); +} + +static void +pci_xhci_assert_interrupt(struct pci_xhci_softc *sc) +{ + + sc->rtsregs.intrreg.erdp |= XHCI_ERDP_LO_BUSY; + sc->rtsregs.intrreg.iman |= XHCI_IMAN_INTR_PEND; + sc->opregs.usbsts |= XHCI_STS_EINT; + + /* only trigger interrupt if permitted */ + if ((sc->opregs.usbcmd & XHCI_CMD_INTE) && + (sc->rtsregs.intrreg.iman & XHCI_IMAN_INTR_ENA)) { + if (pci_msi_enabled(sc->xsc_pi)) + pci_generate_msi(sc->xsc_pi, 0); + else + pci_lintr_assert(sc->xsc_pi); + } +} + +static void +pci_xhci_deassert_interrupt(struct pci_xhci_softc *sc) +{ + + if (!pci_msi_enabled(sc->xsc_pi)) + pci_lintr_assert(sc->xsc_pi); +} + +static void +pci_xhci_init_ep(struct pci_xhci_dev_emu *dev, int epid) +{ + struct xhci_dev_ctx *dev_ctx; + struct pci_xhci_dev_ep *devep; + struct xhci_endp_ctx *ep_ctx; + uint32_t pstreams; + int i; + + dev_ctx = dev->dev_ctx; + ep_ctx = &dev_ctx->ctx_ep[epid]; + devep = &dev->eps[epid]; + pstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0); + if (pstreams > 0) { + DPRINTF(("init_ep %d with pstreams %d\r\n", epid, pstreams)); + assert(devep->ep_sctx_trbs == NULL); + + devep->ep_sctx = XHCI_GADDR(dev->xsc, ep_ctx->qwEpCtx2 & + XHCI_EPCTX_2_TR_DQ_PTR_MASK); + devep->ep_sctx_trbs = calloc(pstreams, + sizeof(struct pci_xhci_trb_ring)); + for (i = 0; i < pstreams; i++) { + devep->ep_sctx_trbs[i].ringaddr = + devep->ep_sctx[i].qwSctx0 & + XHCI_SCTX_0_TR_DQ_PTR_MASK; + devep->ep_sctx_trbs[i].ccs = + XHCI_SCTX_0_DCS_GET(devep->ep_sctx[i].qwSctx0); + } + } else { + DPRINTF(("init_ep %d with no pstreams\r\n", epid)); + devep->ep_ringaddr = ep_ctx->qwEpCtx2 & + XHCI_EPCTX_2_TR_DQ_PTR_MASK; + devep->ep_ccs = XHCI_EPCTX_2_DCS_GET(ep_ctx->qwEpCtx2); + devep->ep_tr = XHCI_GADDR(dev->xsc, devep->ep_ringaddr); + DPRINTF(("init_ep tr DCS %x\r\n", devep->ep_ccs)); + } + + if (devep->ep_xfer == NULL) { + devep->ep_xfer = malloc(sizeof(struct usb_data_xfer)); + USB_DATA_XFER_INIT(devep->ep_xfer); + } +} + +static void +pci_xhci_disable_ep(struct pci_xhci_dev_emu *dev, int epid) +{ + struct xhci_dev_ctx *dev_ctx; + struct pci_xhci_dev_ep *devep; + struct xhci_endp_ctx *ep_ctx; + + DPRINTF(("pci_xhci disable_ep %d\r\n", epid)); + + dev_ctx = dev->dev_ctx; + ep_ctx = &dev_ctx->ctx_ep[epid]; + ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_DISABLED; + + devep = &dev->eps[epid]; + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0 && + devep->ep_sctx_trbs != NULL) + free(devep->ep_sctx_trbs); + + if (devep->ep_xfer != NULL) { + 
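+		/*
+		 * The transfer chain was allocated on first use in
+		 * pci_xhci_init_ep(); release it as part of endpoint
+		 * teardown.
+		 */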
free(devep->ep_xfer); + devep->ep_xfer = NULL; + } + + memset(devep, 0, sizeof(struct pci_xhci_dev_ep)); +} + + +/* reset device at slot and data structures related to it */ +static void +pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot) +{ + struct pci_xhci_dev_emu *dev; + + dev = XHCI_SLOTDEV_PTR(sc, slot); + + if (!dev) { + DPRINTF(("xhci reset unassigned slot (%d)?\r\n", slot)); + } else { + dev->dev_slotstate = XHCI_ST_DISABLED; + } + + /* TODO: reset ring buffer pointers */ +} + +static int +pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, + int do_intr) +{ + struct pci_xhci_rtsregs *rts; + uint64_t erdp; + int erdp_idx; + int err; + struct xhci_trb *evtrbptr; + + err = XHCI_TRB_ERROR_SUCCESS; + + rts = &sc->rtsregs; + + erdp = rts->intrreg.erdp & ~0xF; + erdp_idx = (erdp - rts->erstba_p[rts->er_deq_seg].qwEvrsTablePtr) / + sizeof(struct xhci_trb); + + DPRINTF(("pci_xhci: insert event 0[%lx] 2[%x] 3[%x]\r\n" + "\terdp idx %d/seg %d, enq idx %d/seg %d, pcs %u\r\n" + "\t(erdp=0x%lx, erst=0x%lx, tblsz=%u, do_intr %d)\r\n", + evtrb->qwTrb0, evtrb->dwTrb2, evtrb->dwTrb3, + erdp_idx, rts->er_deq_seg, rts->er_enq_idx, + rts->er_enq_seg, + rts->event_pcs, erdp, rts->erstba_p->qwEvrsTablePtr, + rts->erstba_p->dwEvrsTableSize, do_intr)); + + evtrbptr = &rts->erst_p[rts->er_enq_idx]; + + /* TODO: multi-segment table */ + if (rts->er_events_cnt >= rts->erstba_p->dwEvrsTableSize) { + DPRINTF(("pci_xhci[%d] cannot insert event; ring full\r\n", + __LINE__)); + err = XHCI_TRB_ERROR_EV_RING_FULL; + goto done; + } + + if (rts->er_events_cnt == rts->erstba_p->dwEvrsTableSize - 1) { + struct xhci_trb errev; + + if ((evtrbptr->dwTrb3 & 0x1) == (rts->event_pcs & 0x1)) { + + DPRINTF(("pci_xhci[%d] insert evt err: ring full\r\n", + __LINE__)); + + errev.qwTrb0 = 0; + errev.dwTrb2 = XHCI_TRB_2_ERROR_SET( + XHCI_TRB_ERROR_EV_RING_FULL); + errev.dwTrb3 = XHCI_TRB_3_TYPE_SET( + XHCI_TRB_EVENT_HOST_CTRL) | + rts->event_pcs; + rts->er_events_cnt++; + memcpy(&rts->erst_p[rts->er_enq_idx], &errev, + sizeof(struct xhci_trb)); + rts->er_enq_idx = (rts->er_enq_idx + 1) % + rts->erstba_p->dwEvrsTableSize; + err = XHCI_TRB_ERROR_EV_RING_FULL; + do_intr = 1; + + goto done; + } + } else { + rts->er_events_cnt++; + } + + evtrb->dwTrb3 &= ~XHCI_TRB_3_CYCLE_BIT; + evtrb->dwTrb3 |= rts->event_pcs; + + memcpy(&rts->erst_p[rts->er_enq_idx], evtrb, sizeof(struct xhci_trb)); + rts->er_enq_idx = (rts->er_enq_idx + 1) % + rts->erstba_p->dwEvrsTableSize; + + if (rts->er_enq_idx == 0) + rts->event_pcs ^= 1; + +done: + if (do_intr) + pci_xhci_assert_interrupt(sc); + + return (err); +} + +static uint32_t +pci_xhci_cmd_enable_slot(struct pci_xhci_softc *sc, uint32_t *slot) +{ + struct pci_xhci_dev_emu *dev; + uint32_t cmderr; + int i; + + cmderr = XHCI_TRB_ERROR_NO_SLOTS; + if (sc->portregs != NULL) + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + dev = XHCI_SLOTDEV_PTR(sc, i); + if (dev && dev->dev_slotstate == XHCI_ST_DISABLED) { + *slot = i; + dev->dev_slotstate = XHCI_ST_ENABLED; + cmderr = XHCI_TRB_ERROR_SUCCESS; + dev->hci.hci_address = i; + break; + } + } + + DPRINTF(("pci_xhci enable slot (error=%d) slot %u\r\n", + cmderr != XHCI_TRB_ERROR_SUCCESS, *slot)); + + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_disable_slot(struct pci_xhci_softc *sc, uint32_t slot) +{ + struct pci_xhci_dev_emu *dev; + uint32_t cmderr; + + DPRINTF(("pci_xhci disable slot %u\r\n", slot)); + + cmderr = XHCI_TRB_ERROR_NO_SLOTS; + if (sc->portregs == NULL) + goto done; + + if (slot > sc->ndevices) { + cmderr = 
XHCI_TRB_ERROR_SLOT_NOT_ON; + goto done; + } + + dev = XHCI_SLOTDEV_PTR(sc, slot); + if (dev) { + if (dev->dev_slotstate == XHCI_ST_DISABLED) { + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; + } else { + dev->dev_slotstate = XHCI_ST_DISABLED; + cmderr = XHCI_TRB_ERROR_SUCCESS; + /* TODO: reset events and endpoints */ + } + } + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_reset_device(struct pci_xhci_softc *sc, uint32_t slot) +{ + struct pci_xhci_dev_emu *dev; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + uint32_t cmderr; + int i; + + cmderr = XHCI_TRB_ERROR_NO_SLOTS; + if (sc->portregs == NULL) + goto done; + + DPRINTF(("pci_xhci reset device slot %u\r\n", slot)); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + if (!dev || dev->dev_slotstate == XHCI_ST_DISABLED) + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; + else { + dev->dev_slotstate = XHCI_ST_DEFAULT; + + dev->hci.hci_address = 0; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + /* slot state */ + dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_DEFAULT, + 0x1F, 27); + + /* number of contexts */ + dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); + + /* reset all eps other than ep-0 */ + for (i = 2; i <= 31; i++) { + ep_ctx = &dev_ctx->ctx_ep[i]; + ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, + XHCI_ST_EPCTX_DISABLED, 0x7, 0); + } + + cmderr = XHCI_TRB_ERROR_SUCCESS; + } + + pci_xhci_reset_slot(sc, slot); + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_address_device(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct pci_xhci_dev_emu *dev; + struct xhci_input_dev_ctx *input_ctx; + struct xhci_slot_ctx *islot_ctx; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep0_ctx; + uint32_t cmderr; + + input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); + islot_ctx = &input_ctx->ctx_slot; + ep0_ctx = &input_ctx->ctx_ep[1]; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + DPRINTF(("pci_xhci: address device, input ctl: D 0x%08x A 0x%08x,\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, + islot_ctx->dwSctx0, islot_ctx->dwSctx1, + islot_ctx->dwSctx2, islot_ctx->dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + + /* when setting address: drop-ctx=0, add-ctx=slot+ep0 */ + if ((input_ctx->ctx_input.dwInCtx0 != 0) || + (input_ctx->ctx_input.dwInCtx1 & 0x03) != 0x03) { + DPRINTF(("pci_xhci: address device, input ctl invalid\r\n")); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + /* assign address to slot */ + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + DPRINTF(("pci_xhci: address device, dev ctx\r\n" + " slot %08x %08x %08x %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + dev->hci.hci_address = slot; + dev->dev_ctx = dev_ctx; + + if (dev->dev_ue->ue_reset == NULL || + dev->dev_ue->ue_reset(dev->dev_sc) < 0) { + cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; + goto done; + } + + memcpy(&dev_ctx->ctx_slot, islot_ctx, sizeof(struct xhci_slot_ctx)); + + dev_ctx->ctx_slot.dwSctx3 = + XHCI_SCTX_3_SLOT_STATE_SET(XHCI_ST_SLCTX_ADDRESSED) | + XHCI_SCTX_3_DEV_ADDR_SET(slot); + + memcpy(&dev_ctx->ctx_ep[1], ep0_ctx, sizeof(struct xhci_endp_ctx)); + ep0_ctx = &dev_ctx->ctx_ep[1]; + ep0_ctx->dwEpCtx0 = (ep0_ctx->dwEpCtx0 & ~0x7) | + 
XHCI_EPCTX_0_EPSTATE_SET(XHCI_ST_EPCTX_RUNNING); + + pci_xhci_init_ep(dev, 1); + + dev->dev_slotstate = XHCI_ST_ADDRESSED; + + DPRINTF(("pci_xhci: address device, output ctx\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_config_ep(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct xhci_input_dev_ctx *input_ctx; + struct pci_xhci_dev_emu *dev; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx, *iep_ctx; + uint32_t cmderr; + int i; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + DPRINTF(("pci_xhci config_ep slot %u\r\n", slot)); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + if ((trb->dwTrb3 & XHCI_TRB_3_DCEP_BIT) != 0) { + DPRINTF(("pci_xhci config_ep - deconfigure ep slot %u\r\n", + slot)); + if (dev->dev_ue->ue_stop != NULL) + dev->dev_ue->ue_stop(dev->dev_sc); + + dev->dev_slotstate = XHCI_ST_ADDRESSED; + + dev->hci.hci_address = 0; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + /* number of contexts */ + dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); + + /* slot state */ + dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_ADDRESSED, + 0x1F, 27); + + /* disable endpoints */ + for (i = 2; i < 32; i++) + pci_xhci_disable_ep(dev, i); + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + goto done; + } + + if (dev->dev_slotstate < XHCI_ST_ADDRESSED) { + DPRINTF(("pci_xhci: config_ep slotstate x%x != addressed\r\n", + dev->dev_slotstate)); + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; + goto done; + } + + /* In addressed/configured state; + * for each drop endpoint ctx flag: + * ep->state = DISABLED + * for each add endpoint ctx flag: + * cp(ep-in, ep-out) + * ep->state = RUNNING + * for each drop+add endpoint flag: + * reset ep resources + * cp(ep-in, ep-out) + * ep->state = RUNNING + * if input->DisabledCtx[2-31] < 30: (at least 1 ep not disabled) + * slot->state = configured + */ + + input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); + dev_ctx = dev->dev_ctx; + DPRINTF(("pci_xhci: config_ep inputctx: D:x%08x A:x%08x 7:x%08x\r\n", + input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, + input_ctx->ctx_input.dwInCtx7)); + + for (i = 2; i <= 31; i++) { + ep_ctx = &dev_ctx->ctx_ep[i]; + + if (input_ctx->ctx_input.dwInCtx0 & + XHCI_INCTX_0_DROP_MASK(i)) { + DPRINTF((" config ep - dropping ep %d\r\n", i)); + pci_xhci_disable_ep(dev, i); + } + + if (input_ctx->ctx_input.dwInCtx1 & + XHCI_INCTX_1_ADD_MASK(i)) { + iep_ctx = &input_ctx->ctx_ep[i]; + + DPRINTF((" enable ep[%d] %08x %08x %016lx %08x\r\n", + i, iep_ctx->dwEpCtx0, iep_ctx->dwEpCtx1, + iep_ctx->qwEpCtx2, iep_ctx->dwEpCtx4)); + + memcpy(ep_ctx, iep_ctx, sizeof(struct xhci_endp_ctx)); + + pci_xhci_init_ep(dev, i); + + /* ep state */ + ep_ctx->dwEpCtx0 = FIELD_REPLACE( + ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); + } + } + + /* slot state to configured */ + dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_CONFIGURED, 0x1F, 27); + dev_ctx->ctx_slot.dwSctx0 = FIELD_COPY( + dev_ctx->ctx_slot.dwSctx0, input_ctx->ctx_slot.dwSctx0, 0x1F, 27); + dev->dev_slotstate = XHCI_ST_CONFIGURED; + + DPRINTF(("EP configured; slot %u [0]=0x%08x [1]=0x%08x [2]=0x%08x " + "[3]=0x%08x\r\n", + slot, dev_ctx->ctx_slot.dwSctx0, 
dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_reset_ep(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + uint32_t cmderr, epid; + uint32_t type; + + epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); + + DPRINTF(("pci_xhci: reset ep %u: slot %u\r\n", epid, slot)); + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + if (type == XHCI_TRB_TYPE_STOP_EP && + (trb->dwTrb3 & XHCI_TRB_3_SUSP_EP_BIT) != 0) { + /* XXX suspend endpoint for 10ms */ + } + + if (epid < 1 || epid > 31) { + DPRINTF(("pci_xhci: reset ep: invalid epid %u\r\n", epid)); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + devep = &dev->eps[epid]; + if (devep->ep_xfer != NULL) + USB_DATA_XFER_RESET(devep->ep_xfer); + + dev_ctx = dev->dev_ctx; + assert(dev_ctx != NULL); + + ep_ctx = &dev_ctx->ctx_ep[epid]; + + ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; + + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) == 0) + ep_ctx->qwEpCtx2 = devep->ep_ringaddr | devep->ep_ccs; + + DPRINTF(("pci_xhci: reset ep[%u] %08x %08x %016lx %08x\r\n", + epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, + ep_ctx->dwEpCtx4)); + + if (type == XHCI_TRB_TYPE_RESET_EP && + (dev->dev_ue->ue_reset == NULL || + dev->dev_ue->ue_reset(dev->dev_sc) < 0)) { + cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; + goto done; + } + +done: + return (cmderr); +} + + +static uint32_t +pci_xhci_find_stream(struct pci_xhci_softc *sc, struct xhci_endp_ctx *ep, + uint32_t streamid, struct xhci_stream_ctx **osctx) +{ + struct xhci_stream_ctx *sctx; + uint32_t maxpstreams; + + maxpstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep->dwEpCtx0); + if (maxpstreams == 0) + return (XHCI_TRB_ERROR_TRB); + + if (maxpstreams > XHCI_STREAMS_MAX) + return (XHCI_TRB_ERROR_INVALID_SID); + + if (XHCI_EPCTX_0_LSA_GET(ep->dwEpCtx0) == 0) { + DPRINTF(("pci_xhci: find_stream; LSA bit not set\r\n")); + return (XHCI_TRB_ERROR_INVALID_SID); + } + + /* only support primary stream */ + if (streamid > maxpstreams) + return (XHCI_TRB_ERROR_STREAM_TYPE); + + sctx = XHCI_GADDR(sc, ep->qwEpCtx2 & ~0xFUL) + streamid; + if (!XHCI_SCTX_0_SCT_GET(sctx->qwSctx0)) + return (XHCI_TRB_ERROR_STREAM_TYPE); + + *osctx = sctx; + + return (XHCI_TRB_ERROR_SUCCESS); +} + + +static uint32_t +pci_xhci_cmd_set_tr(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + uint32_t cmderr, epid; + uint32_t streamid; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + DPRINTF(("pci_xhci set_tr: new-tr x%016lx, SCT %u DCS %u\r\n" + " stream-id %u, slot %u, epid %u, C %u\r\n", + (trb->qwTrb0 & ~0xF), (uint32_t)((trb->qwTrb0 >> 1) & 0x7), + (uint32_t)(trb->qwTrb0 & 0x1), (trb->dwTrb2 >> 16) & 0xFFFF, + XHCI_TRB_3_SLOT_GET(trb->dwTrb3), + XHCI_TRB_3_EP_GET(trb->dwTrb3), trb->dwTrb3 & 0x1)); + + epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); + if (epid < 1 || epid > 31) { + DPRINTF(("pci_xhci: set_tr_deq: invalid epid %u\r\n", epid)); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + dev_ctx = dev->dev_ctx; + assert(dev_ctx != NULL); + + ep_ctx = &dev_ctx->ctx_ep[epid]; + devep = &dev->eps[epid]; + + 
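+	/*
+	 * Set TR Dequeue Pointer is only valid while the endpoint is in
+	 * the Stopped or Error state (XHCI 4.6.10); any other state is
+	 * rejected with a Context State error.
+	 */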
switch (XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0)) { + case XHCI_ST_EPCTX_STOPPED: + case XHCI_ST_EPCTX_ERROR: + break; + default: + DPRINTF(("pci_xhci cmd set_tr invalid state %x\r\n", + XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0))); + cmderr = XHCI_TRB_ERROR_CONTEXT_STATE; + goto done; + } + + streamid = XHCI_TRB_2_STREAM_GET(trb->dwTrb2); + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0) { + struct xhci_stream_ctx *sctx; + + sctx = NULL; + cmderr = pci_xhci_find_stream(sc, ep_ctx, streamid, &sctx); + if (sctx != NULL) { + assert(devep->ep_sctx != NULL); + + devep->ep_sctx[streamid].qwSctx0 = trb->qwTrb0; + devep->ep_sctx_trbs[streamid].ringaddr = + trb->qwTrb0 & ~0xF; + devep->ep_sctx_trbs[streamid].ccs = + XHCI_EPCTX_2_DCS_GET(trb->qwTrb0); + } + } else { + if (streamid != 0) { + DPRINTF(("pci_xhci cmd set_tr streamid %x != 0\r\n", + streamid)); + } + ep_ctx->qwEpCtx2 = trb->qwTrb0 & ~0xFUL; + devep->ep_ringaddr = ep_ctx->qwEpCtx2 & ~0xFUL; + devep->ep_ccs = trb->qwTrb0 & 0x1; + devep->ep_tr = XHCI_GADDR(sc, devep->ep_ringaddr); + + DPRINTF(("pci_xhci set_tr first TRB:\r\n")); + pci_xhci_dump_trb(devep->ep_tr); + } + ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_eval_ctx(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct xhci_input_dev_ctx *input_ctx; + struct xhci_slot_ctx *islot_ctx; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep0_ctx; + uint32_t cmderr; + + input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); + islot_ctx = &input_ctx->ctx_slot; + ep0_ctx = &input_ctx->ctx_ep[1]; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + DPRINTF(("pci_xhci: eval ctx, input ctl: D 0x%08x A 0x%08x,\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, + islot_ctx->dwSctx0, islot_ctx->dwSctx1, + islot_ctx->dwSctx2, islot_ctx->dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + + /* this command expects drop-ctx=0 & add-ctx=slot+ep0 */ + if ((input_ctx->ctx_input.dwInCtx0 != 0) || + (input_ctx->ctx_input.dwInCtx1 & 0x03) == 0) { + DPRINTF(("pci_xhci: eval ctx, input ctl invalid\r\n")); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + /* assign address to slot; in this emulation, slot_id = address */ + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + DPRINTF(("pci_xhci: eval ctx, dev ctx\r\n" + " slot %08x %08x %08x %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); + + if (input_ctx->ctx_input.dwInCtx1 & 0x01) { /* slot ctx */ + /* set max exit latency */ + dev_ctx->ctx_slot.dwSctx1 = FIELD_COPY( + dev_ctx->ctx_slot.dwSctx1, input_ctx->ctx_slot.dwSctx1, + 0xFFFF, 0); + + /* set interrupter target */ + dev_ctx->ctx_slot.dwSctx2 = FIELD_COPY( + dev_ctx->ctx_slot.dwSctx2, input_ctx->ctx_slot.dwSctx2, + 0x3FF, 22); + } + if (input_ctx->ctx_input.dwInCtx1 & 0x02) { /* control ctx */ + /* set max packet size */ + dev_ctx->ctx_ep[1].dwEpCtx1 = FIELD_COPY( + dev_ctx->ctx_ep[1].dwEpCtx1, ep0_ctx->dwEpCtx1, + 0xFFFF, 16); + + ep0_ctx = &dev_ctx->ctx_ep[1]; + } + + DPRINTF(("pci_xhci: eval ctx, output ctx\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + +done: + return 
(cmderr);
+}
+
+static int
+pci_xhci_complete_commands(struct pci_xhci_softc *sc)
+{
+	struct xhci_trb	evtrb;
+	struct xhci_trb	*trb;
+	uint64_t	crcr;
+	uint32_t	ccs;		/* cycle state (XHCI 4.9.2) */
+	uint32_t	type;
+	uint32_t	slot;
+	uint32_t	cmderr;
+	int		error;
+
+	error = 0;
+	sc->opregs.crcr |= XHCI_CRCR_LO_CRR;
+
+	trb = sc->opregs.cr_p;
+	ccs = sc->opregs.crcr & XHCI_CRCR_LO_RCS;
+	crcr = sc->opregs.crcr & ~0xF;
+
+	while (1) {
+		sc->opregs.cr_p = trb;
+
+		type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3);
+
+		if ((trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT) !=
+		    (ccs & XHCI_TRB_3_CYCLE_BIT))
+			break;
+
+		DPRINTF(("pci_xhci: cmd type 0x%x, Trb0 x%016lx dwTrb2 x%08x"
+		    " dwTrb3 x%08x, TRB_CYCLE %u/ccs %u\r\n",
+		    type, trb->qwTrb0, trb->dwTrb2, trb->dwTrb3,
+		    trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT, ccs));
+
+		cmderr = XHCI_TRB_ERROR_SUCCESS;
+		evtrb.dwTrb2 = 0;
+		evtrb.dwTrb3 = (ccs & XHCI_TRB_3_CYCLE_BIT) |
+		    XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_CMD_COMPLETE);
+		slot = 0;
+
+		switch (type) {
+		case XHCI_TRB_TYPE_LINK:			/* 0x06 */
+			if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT)
+				ccs ^= XHCI_CRCR_LO_RCS;
+			break;
+
+		case XHCI_TRB_TYPE_ENABLE_SLOT:			/* 0x09 */
+			cmderr = pci_xhci_cmd_enable_slot(sc, &slot);
+			break;
+
+		case XHCI_TRB_TYPE_DISABLE_SLOT:		/* 0x0A */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			cmderr = pci_xhci_cmd_disable_slot(sc, slot);
+			break;
+
+		case XHCI_TRB_TYPE_ADDRESS_DEVICE:		/* 0x0B */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			cmderr = pci_xhci_cmd_address_device(sc, slot, trb);
+			break;
+
+		case XHCI_TRB_TYPE_CONFIGURE_EP:		/* 0x0C */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			cmderr = pci_xhci_cmd_config_ep(sc, slot, trb);
+			break;
+
+		case XHCI_TRB_TYPE_EVALUATE_CTX:		/* 0x0D */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			cmderr = pci_xhci_cmd_eval_ctx(sc, slot, trb);
+			break;
+
+		case XHCI_TRB_TYPE_RESET_EP:			/* 0x0E */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			DPRINTF(("Reset Endpoint on slot %d\r\n", slot));
+			cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb);
+			break;
+
+		case XHCI_TRB_TYPE_STOP_EP:			/* 0x0F */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			DPRINTF(("Stop Endpoint on slot %d\r\n", slot));
+			cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb);
+			break;
+
+		case XHCI_TRB_TYPE_SET_TR_DEQUEUE:		/* 0x10 */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			cmderr = pci_xhci_cmd_set_tr(sc, slot, trb);
+			break;
+
+		case XHCI_TRB_TYPE_RESET_DEVICE:		/* 0x11 */
+			slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3);
+			cmderr = pci_xhci_cmd_reset_device(sc, slot);
+			break;
+
+		case XHCI_TRB_TYPE_FORCE_EVENT:			/* 0x12 */
+			/* TODO: */
+			break;
+
+		case XHCI_TRB_TYPE_NEGOTIATE_BW:		/* 0x13 */
+			break;
+
+		case XHCI_TRB_TYPE_SET_LATENCY_TOL:		/* 0x14 */
+			break;
+
+		case XHCI_TRB_TYPE_GET_PORT_BW:			/* 0x15 */
+			break;
+
+		case XHCI_TRB_TYPE_FORCE_HEADER:		/* 0x16 */
+			break;
+
+		case XHCI_TRB_TYPE_NOOP_CMD:			/* 0x17 */
+			break;
+
+		default:
+			DPRINTF(("pci_xhci: unsupported cmd %x\r\n", type));
+			break;
+		}
+
+		if (type != XHCI_TRB_TYPE_LINK) {
+			/*
+			 * insert command completion event and assert intr
+			 */
+			evtrb.qwTrb0 = crcr;
+			evtrb.dwTrb2 |= XHCI_TRB_2_ERROR_SET(cmderr);
+			evtrb.dwTrb3 |= XHCI_TRB_3_SLOT_SET(slot);
+			DPRINTF(("pci_xhci: command 0x%x result: 0x%x\r\n",
+			    type, cmderr));
+			pci_xhci_insert_event(sc, &evtrb, 1);
+		}
+
+		trb = pci_xhci_trb_next(sc, trb, &crcr);
+	}
+
+	sc->opregs.crcr = crcr | (sc->opregs.crcr & XHCI_CRCR_LO_CA) | ccs;
+	sc->opregs.crcr &= ~XHCI_CRCR_LO_CRR;
+	return (error);
+}
+
+static void
+pci_xhci_dump_trb(struct xhci_trb *trb)
+{
+	static const char *trbtypes[] = {
+		"RESERVED",
+ "NORMAL", + "SETUP_STAGE", + "DATA_STAGE", + "STATUS_STAGE", + "ISOCH", + "LINK", + "EVENT_DATA", + "NOOP", + "ENABLE_SLOT", + "DISABLE_SLOT", + "ADDRESS_DEVICE", + "CONFIGURE_EP", + "EVALUATE_CTX", + "RESET_EP", + "STOP_EP", + "SET_TR_DEQUEUE", + "RESET_DEVICE", + "FORCE_EVENT", + "NEGOTIATE_BW", + "SET_LATENCY_TOL", + "GET_PORT_BW", + "FORCE_HEADER", + "NOOP_CMD" + }; + uint32_t type; + + type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); + DPRINTF(("pci_xhci: trb[@%p] type x%02x %s 0:x%016lx 2:x%08x 3:x%08x\r\n", + trb, type, + type <= XHCI_TRB_TYPE_NOOP_CMD ? trbtypes[type] : "INVALID", + trb->qwTrb0, trb->dwTrb2, trb->dwTrb3)); +} + +static int +pci_xhci_xfer_complete(struct pci_xhci_softc *sc, struct usb_data_xfer *xfer, + uint32_t slot, uint32_t epid, int *do_intr) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + struct xhci_trb *trb; + struct xhci_trb evtrb; + uint32_t trbflags; + uint32_t edtla; + int i, err; + + dev = XHCI_SLOTDEV_PTR(sc, slot); + devep = &dev->eps[epid]; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + assert(dev_ctx != NULL); + + ep_ctx = &dev_ctx->ctx_ep[epid]; + + err = XHCI_TRB_ERROR_SUCCESS; + *do_intr = 0; + edtla = 0; + + /* go through list of TRBs and insert event(s) */ + for (i = xfer->head; xfer->ndata > 0; ) { + evtrb.qwTrb0 = (uint64_t)xfer->data[i].hci_data; + trb = XHCI_GADDR(sc, evtrb.qwTrb0); + trbflags = trb->dwTrb3; + + DPRINTF(("pci_xhci: xfer[%d] done?%u:%d trb %x %016lx %x " + "(err %d) IOC?%d\r\n", + i, xfer->data[i].processed, xfer->data[i].blen, + XHCI_TRB_3_TYPE_GET(trbflags), evtrb.qwTrb0, + trbflags, err, + trb->dwTrb3 & XHCI_TRB_3_IOC_BIT ? 1 : 0)); + + if (!xfer->data[i].processed) { + xfer->head = i; + break; + } + + xfer->ndata--; + edtla += xfer->data[i].bdone; + + trb->dwTrb3 = (trb->dwTrb3 & ~0x1) | (xfer->data[i].ccs); + + pci_xhci_update_ep_ring(sc, dev, devep, ep_ctx, + xfer->data[i].streamid, xfer->data[i].trbnext, + xfer->data[i].ccs); + + /* Only interrupt if IOC or short packet */ + if (!(trb->dwTrb3 & XHCI_TRB_3_IOC_BIT) && + !((err == XHCI_TRB_ERROR_SHORT_PKT) && + (trb->dwTrb3 & XHCI_TRB_3_ISP_BIT))) { + + i = (i + 1) % USB_MAX_XFER_BLOCKS; + continue; + } + + evtrb.dwTrb2 = XHCI_TRB_2_ERROR_SET(err) | + XHCI_TRB_2_REM_SET(xfer->data[i].blen); + + evtrb.dwTrb3 = XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_TRANSFER) | + XHCI_TRB_3_SLOT_SET(slot) | XHCI_TRB_3_EP_SET(epid); + + if (XHCI_TRB_3_TYPE_GET(trbflags) == XHCI_TRB_TYPE_EVENT_DATA) { + DPRINTF(("pci_xhci EVENT_DATA edtla %u\r\n", edtla)); + evtrb.qwTrb0 = trb->qwTrb0; + evtrb.dwTrb2 = (edtla & 0xFFFFF) | + XHCI_TRB_2_ERROR_SET(err); + evtrb.dwTrb3 |= XHCI_TRB_3_ED_BIT; + edtla = 0; + } + + *do_intr = 1; + + err = pci_xhci_insert_event(sc, &evtrb, 0); + if (err != XHCI_TRB_ERROR_SUCCESS) { + break; + } + + i = (i + 1) % USB_MAX_XFER_BLOCKS; + } + + return (err); +} + +static void +pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, + struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, + uint32_t streamid, uint64_t ringaddr, int ccs) +{ + + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { + devep->ep_sctx[streamid].qwSctx0 = (ringaddr & ~0xFUL) | + (ccs & 0x1); + + devep->ep_sctx_trbs[streamid].ringaddr = ringaddr & ~0xFUL; + devep->ep_sctx_trbs[streamid].ccs = ccs & 0x1; + ep_ctx->qwEpCtx2 = (ep_ctx->qwEpCtx2 & ~0x1) | (ccs & 0x1); + + DPRINTF(("xhci update ep-ring stream %d, addr %lx\r\n", + streamid, devep->ep_sctx[streamid].qwSctx0)); + } else { + 
devep->ep_ringaddr = ringaddr & ~0xFUL; + devep->ep_ccs = ccs & 0x1; + devep->ep_tr = XHCI_GADDR(sc, ringaddr & ~0xFUL); + ep_ctx->qwEpCtx2 = (ringaddr & ~0xFUL) | (ccs & 0x1); + + DPRINTF(("xhci update ep-ring, addr %lx\r\n", + (devep->ep_ringaddr | devep->ep_ccs))); + } +} + +/* + * Outstanding transfer still in progress (device NAK'd earlier) so retry + * the transfer again to see if it succeeds. + */ +static int +pci_xhci_try_usb_xfer(struct pci_xhci_softc *sc, + struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, + struct xhci_endp_ctx *ep_ctx, uint32_t slot, uint32_t epid) +{ + struct usb_data_xfer *xfer; + int err; + int do_intr; + + ep_ctx->dwEpCtx0 = FIELD_REPLACE( + ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); + + err = 0; + do_intr = 0; + + xfer = devep->ep_xfer; +#ifdef __FreeBSD__ + USB_DATA_XFER_LOCK(xfer); +#else + /* + * At least one caller needs to hold this lock across the call to this + * function and other code. To avoid deadlock from a recursive mutex + * enter, we ensure that all callers hold this lock. + */ + assert(USB_DATA_XFER_LOCK_HELD(xfer)); +#endif + + /* outstanding requests queued up */ + if (dev->dev_ue->ue_data != NULL) { + err = dev->dev_ue->ue_data(dev->dev_sc, xfer, + epid & 0x1 ? USB_XFER_IN : USB_XFER_OUT, epid/2); + if (err == USB_ERR_CANCELLED) { + if (USB_DATA_GET_ERRCODE(&xfer->data[xfer->head]) == + USB_NAK) + err = XHCI_TRB_ERROR_SUCCESS; + } else { + err = pci_xhci_xfer_complete(sc, xfer, slot, epid, + &do_intr); + if (err == XHCI_TRB_ERROR_SUCCESS && do_intr) { + pci_xhci_assert_interrupt(sc); + } + + + /* XXX should not do it if error? */ + USB_DATA_XFER_RESET(xfer); + } + } + +#ifdef __FreeBSD__ + USB_DATA_XFER_UNLOCK(xfer); +#endif + + return (err); +} + + +static int +pci_xhci_handle_transfer(struct pci_xhci_softc *sc, + struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, + struct xhci_endp_ctx *ep_ctx, struct xhci_trb *trb, uint32_t slot, + uint32_t epid, uint64_t addr, uint32_t ccs, uint32_t streamid) +{ + struct xhci_trb *setup_trb; + struct usb_data_xfer *xfer; + struct usb_data_xfer_block *xfer_block; + uint64_t val; + uint32_t trbflags; + int do_intr, err; + int do_retry; + + ep_ctx->dwEpCtx0 = FIELD_REPLACE(ep_ctx->dwEpCtx0, + XHCI_ST_EPCTX_RUNNING, 0x7, 0); + + xfer = devep->ep_xfer; + USB_DATA_XFER_LOCK(xfer); + + DPRINTF(("pci_xhci handle_transfer slot %u\r\n", slot)); + +retry: + err = 0; + do_retry = 0; + do_intr = 0; + setup_trb = NULL; + + while (1) { + pci_xhci_dump_trb(trb); + + trbflags = trb->dwTrb3; + + if (XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK && + (trbflags & XHCI_TRB_3_CYCLE_BIT) != + (ccs & XHCI_TRB_3_CYCLE_BIT)) { + DPRINTF(("Cycle-bit changed trbflags %x, ccs %x\r\n", + trbflags & XHCI_TRB_3_CYCLE_BIT, ccs)); + break; + } + + xfer_block = NULL; + + switch (XHCI_TRB_3_TYPE_GET(trbflags)) { + case XHCI_TRB_TYPE_LINK: + if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) + ccs ^= 0x1; + + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + xfer_block->processed = 1; + break; + + case XHCI_TRB_TYPE_SETUP_STAGE: + if ((trbflags & XHCI_TRB_3_IDT_BIT) == 0 || + XHCI_TRB_2_BYTES_GET(trb->dwTrb2) != 8) { + DPRINTF(("pci_xhci: invalid setup trb\r\n")); + err = XHCI_TRB_ERROR_TRB; + goto errout; + } + setup_trb = trb; + + val = trb->qwTrb0; + if (!xfer->ureq) + xfer->ureq = malloc( + sizeof(struct usb_device_request)); + memcpy(xfer->ureq, &val, + sizeof(struct usb_device_request)); + + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + xfer_block->processed = 
1; + break; + + case XHCI_TRB_TYPE_NORMAL: + case XHCI_TRB_TYPE_ISOCH: + if (setup_trb != NULL) { + DPRINTF(("pci_xhci: trb not supposed to be in " + "ctl scope\r\n")); + err = XHCI_TRB_ERROR_TRB; + goto errout; + } + /* fall through */ + + case XHCI_TRB_TYPE_DATA_STAGE: + xfer_block = usb_data_xfer_append(xfer, + (void *)(trbflags & XHCI_TRB_3_IDT_BIT ? + &trb->qwTrb0 : XHCI_GADDR(sc, trb->qwTrb0)), + trb->dwTrb2 & 0x1FFFF, (void *)addr, ccs); + break; + + case XHCI_TRB_TYPE_STATUS_STAGE: + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + break; + + case XHCI_TRB_TYPE_NOOP: + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + xfer_block->processed = 1; + break; + + case XHCI_TRB_TYPE_EVENT_DATA: + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + if ((epid > 1) && (trbflags & XHCI_TRB_3_IOC_BIT)) { + xfer_block->processed = 1; + } + break; + + default: + DPRINTF(("pci_xhci: handle xfer unexpected trb type " + "0x%x\r\n", + XHCI_TRB_3_TYPE_GET(trbflags))); + err = XHCI_TRB_ERROR_TRB; + goto errout; + } + + trb = pci_xhci_trb_next(sc, trb, &addr); + + DPRINTF(("pci_xhci: next trb: 0x%lx\r\n", (uint64_t)trb)); + + if (xfer_block) { + xfer_block->trbnext = addr; + xfer_block->streamid = streamid; + } + + if (!setup_trb && !(trbflags & XHCI_TRB_3_CHAIN_BIT) && + XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK) { + break; + } + + /* handle current batch that requires interrupt on complete */ + if (trbflags & XHCI_TRB_3_IOC_BIT) { + DPRINTF(("pci_xhci: trb IOC bit set\r\n")); + if (epid == 1) + do_retry = 1; + break; + } + } + + DPRINTF(("pci_xhci[%d]: xfer->ndata %u\r\n", __LINE__, xfer->ndata)); + + if (epid == 1) { + err = USB_ERR_NOT_STARTED; + if (dev->dev_ue->ue_request != NULL) + err = dev->dev_ue->ue_request(dev->dev_sc, xfer); + setup_trb = NULL; + } else { + /* handle data transfer */ + pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); + err = XHCI_TRB_ERROR_SUCCESS; + goto errout; + } + + err = USB_TO_XHCI_ERR(err); + if ((err == XHCI_TRB_ERROR_SUCCESS) || + (err == XHCI_TRB_ERROR_SHORT_PKT)) { + err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); + if (err != XHCI_TRB_ERROR_SUCCESS) + do_retry = 0; + } + +errout: + if (err == XHCI_TRB_ERROR_EV_RING_FULL) + DPRINTF(("pci_xhci[%d]: event ring full\r\n", __LINE__)); + + if (!do_retry) + USB_DATA_XFER_UNLOCK(xfer); + + if (do_intr) + pci_xhci_assert_interrupt(sc); + + if (do_retry) { + USB_DATA_XFER_RESET(xfer); + DPRINTF(("pci_xhci[%d]: retry:continuing with next TRBs\r\n", + __LINE__)); + goto retry; + } + + if (epid == 1) + USB_DATA_XFER_RESET(xfer); + + return (err); +} + +static void +pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot, + uint32_t epid, uint32_t streamid) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + struct pci_xhci_trb_ring *sctx_tr; + struct xhci_trb *trb; + uint64_t ringaddr; + uint32_t ccs; + + DPRINTF(("pci_xhci doorbell slot %u epid %u stream %u\r\n", + slot, epid, streamid)); + + if (slot == 0 || slot > sc->ndevices) { + DPRINTF(("pci_xhci: invalid doorbell slot %u\r\n", slot)); + return; + } + + dev = XHCI_SLOTDEV_PTR(sc, slot); + devep = &dev->eps[epid]; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + if (!dev_ctx) { + return; + } + ep_ctx = &dev_ctx->ctx_ep[epid]; + + sctx_tr = NULL; + + DPRINTF(("pci_xhci: device doorbell ep[%u] %08x %08x %016lx %08x\r\n", + epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, 
+ ep_ctx->dwEpCtx4)); + + if (ep_ctx->qwEpCtx2 == 0) + return; + + /* handle pending transfers */ + if (devep->ep_xfer->ndata > 0) { +#ifndef __FreeBSD__ + USB_DATA_XFER_LOCK(devep->ep_xfer); +#endif + pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); +#ifndef __FreeBSD__ + USB_DATA_XFER_UNLOCK(devep->ep_xfer); +#endif + return; + } + + /* get next trb work item */ + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { + sctx_tr = &devep->ep_sctx_trbs[streamid]; + ringaddr = sctx_tr->ringaddr; + ccs = sctx_tr->ccs; + trb = XHCI_GADDR(sc, sctx_tr->ringaddr & ~0xFUL); + DPRINTF(("doorbell, stream %u, ccs %lx, trb ccs %x\r\n", + streamid, ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, + trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); + } else { + ringaddr = devep->ep_ringaddr; + ccs = devep->ep_ccs; + trb = devep->ep_tr; + DPRINTF(("doorbell, ccs %lx, trb ccs %x\r\n", + ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, + trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); + } + + if (XHCI_TRB_3_TYPE_GET(trb->dwTrb3) == 0) { + DPRINTF(("pci_xhci: ring %lx trb[%lx] EP %u is RESERVED?\r\n", + ep_ctx->qwEpCtx2, devep->ep_ringaddr, epid)); + return; + } + + pci_xhci_handle_transfer(sc, dev, devep, ep_ctx, trb, slot, epid, + ringaddr, ccs, streamid); +} + +static void +pci_xhci_dbregs_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + + offset = (offset - sc->dboff) / sizeof(uint32_t); + + DPRINTF(("pci_xhci: doorbell write offset 0x%lx: 0x%lx\r\n", + offset, value)); + + if (XHCI_HALTED(sc)) { + DPRINTF(("pci_xhci: controller halted\r\n")); + return; + } + + if (offset == 0) + pci_xhci_complete_commands(sc); + else if (sc->portregs != NULL) + pci_xhci_device_doorbell(sc, offset, + XHCI_DB_TARGET_GET(value), XHCI_DB_SID_GET(value)); +} + +static void +pci_xhci_rtsregs_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + struct pci_xhci_rtsregs *rts; + + offset -= sc->rtsoff; + + if (offset == 0) { + DPRINTF(("pci_xhci attempted write to MFINDEX\r\n")); + return; + } + + DPRINTF(("pci_xhci: runtime regs write offset 0x%lx: 0x%lx\r\n", + offset, value)); + + offset -= 0x20; /* start of intrreg */ + + rts = &sc->rtsregs; + + switch (offset) { + case 0x00: + if (value & XHCI_IMAN_INTR_PEND) + rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; + rts->intrreg.iman = (value & XHCI_IMAN_INTR_ENA) | + (rts->intrreg.iman & XHCI_IMAN_INTR_PEND); + + if (!(value & XHCI_IMAN_INTR_ENA)) + pci_xhci_deassert_interrupt(sc); + + break; + + case 0x04: + rts->intrreg.imod = value; + break; + + case 0x08: + rts->intrreg.erstsz = value & 0xFFFF; + break; + + case 0x10: + /* ERSTBA low bits */ + rts->intrreg.erstba = MASK_64_HI(sc->rtsregs.intrreg.erstba) | + (value & ~0x3F); + break; + + case 0x14: + /* ERSTBA high bits */ + rts->intrreg.erstba = (value << 32) | + MASK_64_LO(sc->rtsregs.intrreg.erstba); + + rts->erstba_p = XHCI_GADDR(sc, + sc->rtsregs.intrreg.erstba & ~0x3FUL); + + rts->erst_p = XHCI_GADDR(sc, + sc->rtsregs.erstba_p->qwEvrsTablePtr & ~0x3FUL); + + rts->er_enq_idx = 0; + rts->er_events_cnt = 0; + + DPRINTF(("pci_xhci: wr erstba erst (%p) ptr 0x%lx, sz %u\r\n", + rts->erstba_p, + rts->erstba_p->qwEvrsTablePtr, + rts->erstba_p->dwEvrsTableSize)); + break; + + case 0x18: + /* ERDP low bits */ + rts->intrreg.erdp = + MASK_64_HI(sc->rtsregs.intrreg.erdp) | + (rts->intrreg.erdp & XHCI_ERDP_LO_BUSY) | + (value & ~0xF); + if (value & XHCI_ERDP_LO_BUSY) { + rts->intrreg.erdp &= ~XHCI_ERDP_LO_BUSY; + rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; + } + + rts->er_deq_seg = XHCI_ERDP_LO_SINDEX(value); + + 
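+		/*
+		 * The pending event count is only recomputed once the
+		 * high half of ERDP is written (offset 0x1C below).
+		 */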
break; + + case 0x1C: + /* ERDP high bits */ + rts->intrreg.erdp = (value << 32) | + MASK_64_LO(sc->rtsregs.intrreg.erdp); + + if (rts->er_events_cnt > 0) { + uint64_t erdp; + uint32_t erdp_i; + + erdp = rts->intrreg.erdp & ~0xF; + erdp_i = (erdp - rts->erstba_p->qwEvrsTablePtr) / + sizeof(struct xhci_trb); + + if (erdp_i <= rts->er_enq_idx) + rts->er_events_cnt = rts->er_enq_idx - erdp_i; + else + rts->er_events_cnt = + rts->erstba_p->dwEvrsTableSize - + (erdp_i - rts->er_enq_idx); + + DPRINTF(("pci_xhci: erdp 0x%lx, events cnt %u\r\n", + erdp, rts->er_events_cnt)); + } + + break; + + default: + DPRINTF(("pci_xhci attempted write to RTS offset 0x%lx\r\n", + offset)); + break; + } +} + +static uint64_t +pci_xhci_portregs_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + int port; + uint32_t *p; + + if (sc->portregs == NULL) + return (0); + + port = (offset - 0x3F0) / 0x10; + + if (port > XHCI_MAX_DEVS) { + DPRINTF(("pci_xhci: portregs_read port %d >= XHCI_MAX_DEVS\r\n", + port)); + + /* return default value for unused port */ + return (XHCI_PS_SPEED_SET(3)); + } + + offset = (offset - 0x3F0) % 0x10; + + p = &sc->portregs[port].portsc; + p += offset / sizeof(uint32_t); + + DPRINTF(("pci_xhci: portregs read offset 0x%lx port %u -> 0x%x\r\n", + offset, port, *p)); + + return (*p); +} + +static void +pci_xhci_hostop_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + offset -= XHCI_CAPLEN; + + if (offset < 0x400) + DPRINTF(("pci_xhci: hostop write offset 0x%lx: 0x%lx\r\n", + offset, value)); + + switch (offset) { + case XHCI_USBCMD: + sc->opregs.usbcmd = pci_xhci_usbcmd_write(sc, value & 0x3F0F); + break; + + case XHCI_USBSTS: + /* clear bits on write */ + sc->opregs.usbsts &= ~(value & + (XHCI_STS_HSE|XHCI_STS_EINT|XHCI_STS_PCD|XHCI_STS_SSS| + XHCI_STS_RSS|XHCI_STS_SRE|XHCI_STS_CNR)); + break; + + case XHCI_PAGESIZE: + /* read only */ + break; + + case XHCI_DNCTRL: + sc->opregs.dnctrl = value & 0xFFFF; + break; + + case XHCI_CRCR_LO: + if (sc->opregs.crcr & XHCI_CRCR_LO_CRR) { + sc->opregs.crcr &= ~(XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); + sc->opregs.crcr |= value & + (XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); + } else { + sc->opregs.crcr = MASK_64_HI(sc->opregs.crcr) | + (value & (0xFFFFFFC0 | XHCI_CRCR_LO_RCS)); + } + break; + + case XHCI_CRCR_HI: + if (!(sc->opregs.crcr & XHCI_CRCR_LO_CRR)) { + sc->opregs.crcr = MASK_64_LO(sc->opregs.crcr) | + (value << 32); + + sc->opregs.cr_p = XHCI_GADDR(sc, + sc->opregs.crcr & ~0xF); + } + + if (sc->opregs.crcr & XHCI_CRCR_LO_CS) { + /* Stop operation of Command Ring */ + } + + if (sc->opregs.crcr & XHCI_CRCR_LO_CA) { + /* Abort command */ + } + + break; + + case XHCI_DCBAAP_LO: + sc->opregs.dcbaap = MASK_64_HI(sc->opregs.dcbaap) | + (value & 0xFFFFFFC0); + break; + + case XHCI_DCBAAP_HI: + sc->opregs.dcbaap = MASK_64_LO(sc->opregs.dcbaap) | + (value << 32); + sc->opregs.dcbaa_p = XHCI_GADDR(sc, sc->opregs.dcbaap & ~0x3FUL); + + DPRINTF(("pci_xhci: opregs dcbaap = 0x%lx (vaddr 0x%lx)\r\n", + sc->opregs.dcbaap, (uint64_t)sc->opregs.dcbaa_p)); + break; + + case XHCI_CONFIG: + sc->opregs.config = value & 0x03FF; + break; + + default: + if (offset >= 0x400) + pci_xhci_portregs_write(sc, offset, value); + + break; + } +} + + +static void +pci_xhci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_xhci_softc *sc; + + sc = pi->pi_arg; + + assert(baridx == 0); + + + pthread_mutex_lock(&sc->mtx); + if (offset < XHCI_CAPLEN) /* read only registers */ + 
WPRINTF(("pci_xhci: write RO-CAPs offset %ld\r\n", offset)); + else if (offset < sc->dboff) + pci_xhci_hostop_write(sc, offset, value); + else if (offset < sc->rtsoff) + pci_xhci_dbregs_write(sc, offset, value); + else if (offset < sc->regsend) + pci_xhci_rtsregs_write(sc, offset, value); + else + WPRINTF(("pci_xhci: write invalid offset %ld\r\n", offset)); + + pthread_mutex_unlock(&sc->mtx); +} + +static uint64_t +pci_xhci_hostcap_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint64_t value; + + switch (offset) { + case XHCI_CAPLENGTH: /* 0x00 */ + value = sc->caplength; + break; + + case XHCI_HCSPARAMS1: /* 0x04 */ + value = sc->hcsparams1; + break; + + case XHCI_HCSPARAMS2: /* 0x08 */ + value = sc->hcsparams2; + break; + + case XHCI_HCSPARAMS3: /* 0x0C */ + value = sc->hcsparams3; + break; + + case XHCI_HCSPARAMS0: /* 0x10 */ + value = sc->hccparams1; + break; + + case XHCI_DBOFF: /* 0x14 */ + value = sc->dboff; + break; + + case XHCI_RTSOFF: /* 0x18 */ + value = sc->rtsoff; + break; + + case XHCI_HCCPRAMS2: /* 0x1C */ + value = sc->hccparams2; + break; + + default: + value = 0; + break; + } + + DPRINTF(("pci_xhci: hostcap read offset 0x%lx -> 0x%lx\r\n", + offset, value)); + + return (value); +} + +static uint64_t +pci_xhci_hostop_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint64_t value; + + offset = (offset - XHCI_CAPLEN); + + switch (offset) { + case XHCI_USBCMD: /* 0x00 */ + value = sc->opregs.usbcmd; + break; + + case XHCI_USBSTS: /* 0x04 */ + value = sc->opregs.usbsts; + break; + + case XHCI_PAGESIZE: /* 0x08 */ + value = sc->opregs.pgsz; + break; + + case XHCI_DNCTRL: /* 0x14 */ + value = sc->opregs.dnctrl; + break; + + case XHCI_CRCR_LO: /* 0x18 */ + value = sc->opregs.crcr & XHCI_CRCR_LO_CRR; + break; + + case XHCI_CRCR_HI: /* 0x1C */ + value = 0; + break; + + case XHCI_DCBAAP_LO: /* 0x30 */ + value = sc->opregs.dcbaap & 0xFFFFFFFF; + break; + + case XHCI_DCBAAP_HI: /* 0x34 */ + value = (sc->opregs.dcbaap >> 32) & 0xFFFFFFFF; + break; + + case XHCI_CONFIG: /* 0x38 */ + value = sc->opregs.config; + break; + + default: + if (offset >= 0x400) + value = pci_xhci_portregs_read(sc, offset); + else + value = 0; + + break; + } + + if (offset < 0x400) + DPRINTF(("pci_xhci: hostop read offset 0x%lx -> 0x%lx\r\n", + offset, value)); + + return (value); +} + +static uint64_t +pci_xhci_dbregs_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + + /* read doorbell always returns 0 */ + return (0); +} + +static uint64_t +pci_xhci_rtsregs_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint32_t value; + + offset -= sc->rtsoff; + value = 0; + + if (offset == XHCI_MFINDEX) { + value = sc->rtsregs.mfindex; + } else if (offset >= 0x20) { + int item; + uint32_t *p; + + offset -= 0x20; + item = offset % 32; + + assert(offset < sizeof(sc->rtsregs.intrreg)); + + p = &sc->rtsregs.intrreg.iman; + p += item / sizeof(uint32_t); + value = *p; + } + + DPRINTF(("pci_xhci: rtsregs read offset 0x%lx -> 0x%x\r\n", + offset, value)); + + return (value); +} + +static uint64_t +pci_xhci_xecp_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint32_t value; + + offset -= sc->regsend; + value = 0; + + switch (offset) { + case 0: + /* rev major | rev minor | next-cap | cap-id */ + value = (0x02 << 24) | (4 << 8) | XHCI_ID_PROTOCOLS; + break; + case 4: + /* name string = "USB" */ + value = 0x20425355; + break; + case 8: + /* psic | proto-defined | compat # | compat offset */ + value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb2_port_start; + break; + case 12: + break; + case 16: + /* rev major | 
rev minor | next-cap | cap-id */ + value = (0x03 << 24) | XHCI_ID_PROTOCOLS; + break; + case 20: + /* name string = "USB" */ + value = 0x20425355; + break; + case 24: + /* psic | proto-defined | compat # | compat offset */ + value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb3_port_start; + break; + case 28: + break; + default: + DPRINTF(("pci_xhci: xecp invalid offset 0x%lx\r\n", offset)); + break; + } + + DPRINTF(("pci_xhci: xecp read offset 0x%lx -> 0x%x\r\n", + offset, value)); + + return (value); +} + + +static uint64_t +pci_xhci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_xhci_softc *sc; + uint32_t value; + + sc = pi->pi_arg; + + assert(baridx == 0); + + pthread_mutex_lock(&sc->mtx); + if (offset < XHCI_CAPLEN) + value = pci_xhci_hostcap_read(sc, offset); + else if (offset < sc->dboff) + value = pci_xhci_hostop_read(sc, offset); + else if (offset < sc->rtsoff) + value = pci_xhci_dbregs_read(sc, offset); + else if (offset < sc->regsend) + value = pci_xhci_rtsregs_read(sc, offset); + else if (offset < (sc->regsend + 4*32)) + value = pci_xhci_xecp_read(sc, offset); + else { + value = 0; + WPRINTF(("pci_xhci: read invalid offset %ld\r\n", offset)); + } + + pthread_mutex_unlock(&sc->mtx); + + switch (size) { + case 1: + value &= 0xFF; + break; + case 2: + value &= 0xFFFF; + break; + case 4: + value &= 0xFFFFFFFF; + break; + } + + return (value); +} + +static void +pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm) +{ + struct pci_xhci_portregs *port; + struct pci_xhci_dev_emu *dev; + struct xhci_trb evtrb; + int error; + + assert(portn <= XHCI_MAX_DEVS); + + DPRINTF(("xhci reset port %d\r\n", portn)); + + port = XHCI_PORTREG_PTR(sc, portn); + dev = XHCI_DEVINST_PTR(sc, portn); + if (dev) { + port->portsc &= ~(XHCI_PS_PLS_MASK | XHCI_PS_PR | XHCI_PS_PRC); + port->portsc |= XHCI_PS_PED | + XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); + + if (warm && dev->dev_ue->ue_usbver == 3) { + port->portsc |= XHCI_PS_WRC; + } + + if ((port->portsc & XHCI_PS_PRC) == 0) { + port->portsc |= XHCI_PS_PRC; + + pci_xhci_set_evtrb(&evtrb, portn, + XHCI_TRB_ERROR_SUCCESS, + XHCI_TRB_EVENT_PORT_STS_CHANGE); + error = pci_xhci_insert_event(sc, &evtrb, 1); + if (error != XHCI_TRB_ERROR_SUCCESS) + DPRINTF(("xhci reset port insert event " + "failed\r\n")); + } + } +} + +static void +pci_xhci_init_port(struct pci_xhci_softc *sc, int portn) +{ + struct pci_xhci_portregs *port; + struct pci_xhci_dev_emu *dev; + + port = XHCI_PORTREG_PTR(sc, portn); + dev = XHCI_DEVINST_PTR(sc, portn); + if (dev) { + port->portsc = XHCI_PS_CCS | /* connected */ + XHCI_PS_PP; /* port power */ + + if (dev->dev_ue->ue_usbver == 2) { + port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL) | + XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); + } else { + port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0) | + XHCI_PS_PED | /* enabled */ + XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); + } + + DPRINTF(("Init port %d 0x%x\n", portn, port->portsc)); + } else { + port->portsc = XHCI_PS_PLS_SET(UPS_PORT_LS_RX_DET) | XHCI_PS_PP; + DPRINTF(("Init empty port %d 0x%x\n", portn, port->portsc)); + } +} + +static int +pci_xhci_dev_intr(struct usb_hci *hci, int epctx) +{ + struct pci_xhci_dev_emu *dev; + struct xhci_dev_ctx *dev_ctx; + struct xhci_trb evtrb; + struct pci_xhci_softc *sc; + struct pci_xhci_portregs *p; + struct xhci_endp_ctx *ep_ctx; + int error; + int dir_in; + int epid; + + dir_in = epctx & 0x80; + epid = epctx & ~0x80; + + /* HW endpoint contexts are 0-15; convert to epid 
based on dir */ + epid = (epid * 2) + (dir_in ? 1 : 0); + + assert(epid >= 1 && epid <= 31); + + dev = hci->hci_sc; + sc = dev->xsc; + + /* check if device is ready; OS has to initialise it */ + if (sc->rtsregs.erstba_p == NULL || + (sc->opregs.usbcmd & XHCI_CMD_RS) == 0 || + dev->dev_ctx == NULL) + return (0); + + p = XHCI_PORTREG_PTR(sc, hci->hci_port); + + /* raise event if link U3 (suspended) state */ + if (XHCI_PS_PLS_GET(p->portsc) == 3) { + p->portsc &= ~XHCI_PS_PLS_MASK; + p->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_RESUME); + if ((p->portsc & XHCI_PS_PLC) != 0) + return (0); + + p->portsc |= XHCI_PS_PLC; + + pci_xhci_set_evtrb(&evtrb, hci->hci_port, + XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); + error = pci_xhci_insert_event(sc, &evtrb, 0); + if (error != XHCI_TRB_ERROR_SUCCESS) + goto done; + } + + dev_ctx = dev->dev_ctx; + ep_ctx = &dev_ctx->ctx_ep[epid]; + if ((ep_ctx->dwEpCtx0 & 0x7) == XHCI_ST_EPCTX_DISABLED) { + DPRINTF(("xhci device interrupt on disabled endpoint %d\r\n", + epid)); + return (0); + } + + DPRINTF(("xhci device interrupt on endpoint %d\r\n", epid)); + + pci_xhci_device_doorbell(sc, hci->hci_port, epid, 0); + +done: + return (error); +} + +static int +pci_xhci_dev_event(struct usb_hci *hci, enum hci_usbev evid, void *param) +{ + + DPRINTF(("xhci device event port %d\r\n", hci->hci_port)); + return (0); +} + + + +static void +pci_xhci_device_usage(char *opt) +{ + + fprintf(stderr, "Invalid USB emulation \"%s\"\r\n", opt); +} + +static int +pci_xhci_parse_opts(struct pci_xhci_softc *sc, char *opts) +{ + struct pci_xhci_dev_emu **devices; + struct pci_xhci_dev_emu *dev; + struct usb_devemu *ue; + void *devsc; + char *uopt, *xopts, *config; + int usb3_port, usb2_port, i; + + uopt = NULL; + usb3_port = sc->usb3_port_start - 1; + usb2_port = sc->usb2_port_start - 1; + devices = NULL; + + if (opts == NULL) + goto portsfinal; + + devices = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_dev_emu *)); + + sc->slots = calloc(XHCI_MAX_SLOTS, sizeof(struct pci_xhci_dev_emu *)); + sc->devices = devices; + sc->ndevices = 0; + + uopt = strdup(opts); + for (xopts = strtok(uopt, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + if (usb2_port == ((sc->usb2_port_start-1) + XHCI_MAX_DEVS/2) || + usb3_port == ((sc->usb3_port_start-1) + XHCI_MAX_DEVS/2)) { + WPRINTF(("pci_xhci max number of USB 2 or 3 " + "devices reached, max %d\r\n", XHCI_MAX_DEVS/2)); + usb2_port = usb3_port = -1; + goto done; + } + + /* device[=] */ + if ((config = strchr(xopts, '=')) == NULL) + config = ""; /* no config */ + else + *config++ = '\0'; + + ue = usb_emu_finddev(xopts); + if (ue == NULL) { + pci_xhci_device_usage(xopts); + DPRINTF(("pci_xhci device not found %s\r\n", xopts)); + usb2_port = usb3_port = -1; + goto done; + } + + DPRINTF(("pci_xhci adding device %s, opts \"%s\"\r\n", + xopts, config)); + + dev = calloc(1, sizeof(struct pci_xhci_dev_emu)); + dev->xsc = sc; + dev->hci.hci_sc = dev; + dev->hci.hci_intr = pci_xhci_dev_intr; + dev->hci.hci_event = pci_xhci_dev_event; + + if (ue->ue_usbver == 2) { + dev->hci.hci_port = usb2_port + 1; + devices[usb2_port] = dev; + usb2_port++; + } else { + dev->hci.hci_port = usb3_port + 1; + devices[usb3_port] = dev; + usb3_port++; + } + + dev->hci.hci_address = 0; + devsc = ue->ue_init(&dev->hci, config); + if (devsc == NULL) { + pci_xhci_device_usage(xopts); + usb2_port = usb3_port = -1; + goto done; + } + + dev->dev_ue = ue; + dev->dev_sc = devsc; + + /* assign slot number to device */ + sc->slots[sc->ndevices] = dev; + + sc->ndevices++; + } + 
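+	/*
+	 * Both the option-parsing loop above and the no-options case
+	 * arrive here to allocate the port register set.
+	 */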
+portsfinal: + sc->portregs = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_portregs)); + + if (sc->ndevices > 0) { + /* port and slot numbering start from 1 */ + sc->devices--; + sc->portregs--; + sc->slots--; + + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + pci_xhci_init_port(sc, i); + } + } else { + WPRINTF(("pci_xhci no USB devices configured\r\n")); + sc->ndevices = 1; + } + +done: + if (devices != NULL) { + if (usb2_port <= 0 && usb3_port <= 0) { + sc->devices = NULL; + for (i = 0; devices[i] != NULL; i++) + free(devices[i]); + sc->ndevices = -1; + + free(devices); + } + } + free(uopt); + return (sc->ndevices); +} + +static int +pci_xhci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_xhci_softc *sc; + int error; + + if (xhci_in_use) { + WPRINTF(("pci_xhci controller already defined\r\n")); + return (-1); + } + xhci_in_use = 1; + + sc = calloc(1, sizeof(struct pci_xhci_softc)); + pi->pi_arg = sc; + sc->xsc_pi = pi; + + sc->usb2_port_start = (XHCI_MAX_DEVS/2) + 1; + sc->usb3_port_start = 1; + + /* discover devices */ + error = pci_xhci_parse_opts(sc, opts); + if (error < 0) + goto done; + else + error = 0; + + sc->caplength = XHCI_SET_CAPLEN(XHCI_CAPLEN) | + XHCI_SET_HCIVERSION(0x0100); + sc->hcsparams1 = XHCI_SET_HCSP1_MAXPORTS(XHCI_MAX_DEVS) | + XHCI_SET_HCSP1_MAXINTR(1) | /* interrupters */ + XHCI_SET_HCSP1_MAXSLOTS(XHCI_MAX_SLOTS); + sc->hcsparams2 = XHCI_SET_HCSP2_ERSTMAX(XHCI_ERST_MAX) | + XHCI_SET_HCSP2_IST(0x04); + sc->hcsparams3 = 0; /* no latency */ + sc->hccparams1 = XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */ + XHCI_SET_HCCP1_SPC(1) | /* short packet */ + XHCI_SET_HCCP1_MAXPSA(XHCI_STREAMS_MAX); + sc->hccparams2 = XHCI_SET_HCCP2_LEC(1) | + XHCI_SET_HCCP2_U3C(1); + sc->dboff = XHCI_SET_DOORBELL(XHCI_CAPLEN + XHCI_PORTREGS_START + + XHCI_MAX_DEVS * sizeof(struct pci_xhci_portregs)); + + /* dboff must be 32-bit aligned */ + if (sc->dboff & 0x3) + sc->dboff = (sc->dboff + 0x3) & ~0x3; + + /* rtsoff must be 32-byte aligned */ + sc->rtsoff = XHCI_SET_RTSOFFSET(sc->dboff + (XHCI_MAX_SLOTS+1) * 32); + if (sc->rtsoff & 0x1F) + sc->rtsoff = (sc->rtsoff + 0x1F) & ~0x1F; + + DPRINTF(("pci_xhci dboff: 0x%x, rtsoff: 0x%x\r\n", sc->dboff, + sc->rtsoff)); + + sc->opregs.usbsts = XHCI_STS_HCH; + sc->opregs.pgsz = XHCI_PAGESIZE_4K; + + pci_xhci_reset(sc); + + sc->regsend = sc->rtsoff + 0x20 + 32; /* only 1 interrupter */ + + /* + * Set extended capabilities pointer to be after regsend; + * value of xecp field is 32-bit offset.
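+	 * (The xECP field is counted in 32-bit words rather than bytes,
+	 * which is why the byte offset in regsend is divided by 4 below.)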
+ */ + sc->hccparams1 |= XHCI_SET_HCCP1_XECP(sc->regsend/4); + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1E31); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SERIALBUS); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_SERIALBUS_USB); + pci_set_cfgdata8(pi, PCIR_PROGIF,PCIP_SERIALBUS_USB_XHCI); + pci_set_cfgdata8(pi, PCI_USBREV, PCI_USB_REV_3_0); + + pci_emul_add_msicap(pi, 1); + + /* regsend + xecp registers */ + pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, sc->regsend + 4*32); + DPRINTF(("pci_xhci pci_emu_alloc: %d\r\n", sc->regsend + 4*32)); + + + pci_lintr_request(pi); + + pthread_mutex_init(&sc->mtx, NULL); + +done: + if (error) { + free(sc); + } + + return (error); +} + + + +struct pci_devemu pci_de_xhci = { + .pe_emu = "xhci", + .pe_init = pci_xhci_init, + .pe_barwrite = pci_xhci_write, + .pe_barread = pci_xhci_read +}; +PCI_EMUL_SET(pci_de_xhci); diff --git a/usr/src/cmd/bhyve/pci_xhci.h b/usr/src/cmd/bhyve/pci_xhci.h new file mode 100644 index 0000000000..7502f9396a --- /dev/null +++ b/usr/src/cmd/bhyve/pci_xhci.h @@ -0,0 +1,355 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PCI_XHCI_H_ +#define _PCI_XHCI_H_ + +#define PCI_USBREV 0x60 /* USB protocol revision */ + + +enum { /* dsc_slotstate */ + XHCI_ST_DISABLED, + XHCI_ST_ENABLED, + XHCI_ST_DEFAULT, + XHCI_ST_ADDRESSED, + XHCI_ST_CONFIGURED, + XHCI_ST_MAX +}; + +enum { + XHCI_ST_SLCTX_DISABLED, + XHCI_ST_SLCTX_DEFAULT, + XHCI_ST_SLCTX_ADDRESSED, + XHCI_ST_SLCTX_CONFIGURED +}; + +enum { + XHCI_ST_EPCTX_DISABLED, + XHCI_ST_EPCTX_RUNNING, + XHCI_ST_EPCTX_HALTED, + XHCI_ST_EPCTX_STOPPED, + XHCI_ST_EPCTX_ERROR +}; + +#define XHCI_MAX_DEVICES MIN(USB_MAX_DEVICES, 128) +#define XHCI_MAX_ENDPOINTS 32 /* hardcoded - do not change */ +#define XHCI_MAX_SCRATCHPADS 32 +#define XHCI_MAX_EVENTS (16 * 13) +#define XHCI_MAX_COMMANDS (16 * 1) +#define XHCI_MAX_RSEG 1 +#define XHCI_MAX_TRANSFERS 4 +#if USB_MAX_EP_STREAMS == 8 +#define XHCI_MAX_STREAMS 8 +#define XHCI_MAX_STREAMS_LOG 3 +#elif USB_MAX_EP_STREAMS == 1 +#define XHCI_MAX_STREAMS 1 +#define XHCI_MAX_STREAMS_LOG 0 +#else +#error "The USB_MAX_EP_STREAMS value is not supported." 
+#endif +#define XHCI_DEV_CTX_ADDR_ALIGN 64 /* bytes */ +#define XHCI_DEV_CTX_ALIGN 64 /* bytes */ +#define XHCI_INPUT_CTX_ALIGN 64 /* bytes */ +#define XHCI_SLOT_CTX_ALIGN 32 /* bytes */ +#define XHCI_ENDP_CTX_ALIGN 32 /* bytes */ +#define XHCI_STREAM_CTX_ALIGN 16 /* bytes */ +#define XHCI_TRANS_RING_SEG_ALIGN 16 /* bytes */ +#define XHCI_CMD_RING_SEG_ALIGN 64 /* bytes */ +#define XHCI_EVENT_RING_SEG_ALIGN 64 /* bytes */ +#define XHCI_SCRATCH_BUF_ARRAY_ALIGN 64 /* bytes */ +#define XHCI_SCRATCH_BUFFER_ALIGN USB_PAGE_SIZE +#define XHCI_TRB_ALIGN 16 /* bytes */ +#define XHCI_TD_ALIGN 64 /* bytes */ +#define XHCI_PAGE_SIZE 4096 /* bytes */ + +struct xhci_slot_ctx { + volatile uint32_t dwSctx0; +#define XHCI_SCTX_0_ROUTE_SET(x) ((x) & 0xFFFFF) +#define XHCI_SCTX_0_ROUTE_GET(x) ((x) & 0xFFFFF) +#define XHCI_SCTX_0_SPEED_SET(x) (((x) & 0xF) << 20) +#define XHCI_SCTX_0_SPEED_GET(x) (((x) >> 20) & 0xF) +#define XHCI_SCTX_0_MTT_SET(x) (((x) & 0x1) << 25) +#define XHCI_SCTX_0_MTT_GET(x) (((x) >> 25) & 0x1) +#define XHCI_SCTX_0_HUB_SET(x) (((x) & 0x1) << 26) +#define XHCI_SCTX_0_HUB_GET(x) (((x) >> 26) & 0x1) +#define XHCI_SCTX_0_CTX_NUM_SET(x) (((x) & 0x1F) << 27) +#define XHCI_SCTX_0_CTX_NUM_GET(x) (((x) >> 27) & 0x1F) + volatile uint32_t dwSctx1; +#define XHCI_SCTX_1_MAX_EL_SET(x) ((x) & 0xFFFF) +#define XHCI_SCTX_1_MAX_EL_GET(x) ((x) & 0xFFFF) +#define XHCI_SCTX_1_RH_PORT_SET(x) (((x) & 0xFF) << 16) +#define XHCI_SCTX_1_RH_PORT_GET(x) (((x) >> 16) & 0xFF) +#define XHCI_SCTX_1_NUM_PORTS_SET(x) (((x) & 0xFF) << 24) +#define XHCI_SCTX_1_NUM_PORTS_GET(x) (((x) >> 24) & 0xFF) + volatile uint32_t dwSctx2; +#define XHCI_SCTX_2_TT_HUB_SID_SET(x) ((x) & 0xFF) +#define XHCI_SCTX_2_TT_HUB_SID_GET(x) ((x) & 0xFF) +#define XHCI_SCTX_2_TT_PORT_NUM_SET(x) (((x) & 0xFF) << 8) +#define XHCI_SCTX_2_TT_PORT_NUM_GET(x) (((x) >> 8) & 0xFF) +#define XHCI_SCTX_2_TT_THINK_TIME_SET(x) (((x) & 0x3) << 16) +#define XHCI_SCTX_2_TT_THINK_TIME_GET(x) (((x) >> 16) & 0x3) +#define XHCI_SCTX_2_IRQ_TARGET_SET(x) (((x) & 0x3FF) << 22) +#define XHCI_SCTX_2_IRQ_TARGET_GET(x) (((x) >> 22) & 0x3FF) + volatile uint32_t dwSctx3; +#define XHCI_SCTX_3_DEV_ADDR_SET(x) ((x) & 0xFF) +#define XHCI_SCTX_3_DEV_ADDR_GET(x) ((x) & 0xFF) +#define XHCI_SCTX_3_SLOT_STATE_SET(x) (((x) & 0x1F) << 27) +#define XHCI_SCTX_3_SLOT_STATE_GET(x) (((x) >> 27) & 0x1F) + volatile uint32_t dwSctx4; + volatile uint32_t dwSctx5; + volatile uint32_t dwSctx6; + volatile uint32_t dwSctx7; +}; + +struct xhci_endp_ctx { + volatile uint32_t dwEpCtx0; +#define XHCI_EPCTX_0_EPSTATE_SET(x) ((x) & 0x7) +#define XHCI_EPCTX_0_EPSTATE_GET(x) ((x) & 0x7) +#define XHCI_EPCTX_0_MULT_SET(x) (((x) & 0x3) << 8) +#define XHCI_EPCTX_0_MULT_GET(x) (((x) >> 8) & 0x3) +#define XHCI_EPCTX_0_MAXP_STREAMS_SET(x) (((x) & 0x1F) << 10) +#define XHCI_EPCTX_0_MAXP_STREAMS_GET(x) (((x) >> 10) & 0x1F) +#define XHCI_EPCTX_0_LSA_SET(x) (((x) & 0x1) << 15) +#define XHCI_EPCTX_0_LSA_GET(x) (((x) >> 15) & 0x1) +#define XHCI_EPCTX_0_IVAL_SET(x) (((x) & 0xFF) << 16) +#define XHCI_EPCTX_0_IVAL_GET(x) (((x) >> 16) & 0xFF) + volatile uint32_t dwEpCtx1; +#define XHCI_EPCTX_1_CERR_SET(x) (((x) & 0x3) << 1) +#define XHCI_EPCTX_1_CERR_GET(x) (((x) >> 1) & 0x3) +#define XHCI_EPCTX_1_EPTYPE_SET(x) (((x) & 0x7) << 3) +#define XHCI_EPCTX_1_EPTYPE_GET(x) (((x) >> 3) & 0x7) +#define XHCI_EPCTX_1_HID_SET(x) (((x) & 0x1) << 7) +#define XHCI_EPCTX_1_HID_GET(x) (((x) >> 7) & 0x1) +#define XHCI_EPCTX_1_MAXB_SET(x) (((x) & 0xFF) << 8) +#define XHCI_EPCTX_1_MAXB_GET(x) (((x) >> 8) & 0xFF) +#define 
XHCI_EPCTX_1_MAXP_SIZE_SET(x) (((x) & 0xFFFF) << 16) +#define XHCI_EPCTX_1_MAXP_SIZE_GET(x) (((x) >> 16) & 0xFFFF) + volatile uint64_t qwEpCtx2; +#define XHCI_EPCTX_2_DCS_SET(x) ((x) & 0x1) +#define XHCI_EPCTX_2_DCS_GET(x) ((x) & 0x1) +#define XHCI_EPCTX_2_TR_DQ_PTR_MASK 0xFFFFFFFFFFFFFFF0U + volatile uint32_t dwEpCtx4; +#define XHCI_EPCTX_4_AVG_TRB_LEN_SET(x) ((x) & 0xFFFF) +#define XHCI_EPCTX_4_AVG_TRB_LEN_GET(x) ((x) & 0xFFFF) +#define XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_SET(x) (((x) & 0xFFFF) << 16) +#define XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_GET(x) (((x) >> 16) & 0xFFFF) + volatile uint32_t dwEpCtx5; + volatile uint32_t dwEpCtx6; + volatile uint32_t dwEpCtx7; +}; + +struct xhci_input_ctx { +#define XHCI_INCTX_NON_CTRL_MASK 0xFFFFFFFCU + volatile uint32_t dwInCtx0; +#define XHCI_INCTX_0_DROP_MASK(n) (1U << (n)) + volatile uint32_t dwInCtx1; +#define XHCI_INCTX_1_ADD_MASK(n) (1U << (n)) + volatile uint32_t dwInCtx2; + volatile uint32_t dwInCtx3; + volatile uint32_t dwInCtx4; + volatile uint32_t dwInCtx5; + volatile uint32_t dwInCtx6; + volatile uint32_t dwInCtx7; +}; + +struct xhci_input_dev_ctx { + struct xhci_input_ctx ctx_input; + union { + struct xhci_slot_ctx u_slot; + struct xhci_endp_ctx u_ep[XHCI_MAX_ENDPOINTS]; + } ctx_dev_slep; +}; + +struct xhci_dev_ctx { + union { + struct xhci_slot_ctx u_slot; + struct xhci_endp_ctx u_ep[XHCI_MAX_ENDPOINTS]; + } ctx_dev_slep; +} __aligned(XHCI_DEV_CTX_ALIGN); +#define ctx_slot ctx_dev_slep.u_slot +#define ctx_ep ctx_dev_slep.u_ep + +struct xhci_stream_ctx { + volatile uint64_t qwSctx0; +#define XHCI_SCTX_0_DCS_GET(x) ((x) & 0x1) +#define XHCI_SCTX_0_DCS_SET(x) ((x) & 0x1) +#define XHCI_SCTX_0_SCT_SET(x) (((x) & 0x7) << 1) +#define XHCI_SCTX_0_SCT_GET(x) (((x) >> 1) & 0x7) +#define XHCI_SCTX_0_SCT_SEC_TR_RING 0x0 +#define XHCI_SCTX_0_SCT_PRIM_TR_RING 0x1 +#define XHCI_SCTX_0_SCT_PRIM_SSA_8 0x2 +#define XHCI_SCTX_0_SCT_PRIM_SSA_16 0x3 +#define XHCI_SCTX_0_SCT_PRIM_SSA_32 0x4 +#define XHCI_SCTX_0_SCT_PRIM_SSA_64 0x5 +#define XHCI_SCTX_0_SCT_PRIM_SSA_128 0x6 +#define XHCI_SCTX_0_SCT_PRIM_SSA_256 0x7 +#define XHCI_SCTX_0_TR_DQ_PTR_MASK 0xFFFFFFFFFFFFFFF0U + volatile uint32_t dwSctx2; + volatile uint32_t dwSctx3; +}; + +struct xhci_trb { + volatile uint64_t qwTrb0; +#define XHCI_TRB_0_DIR_IN_MASK (0x80ULL << 0) +#define XHCI_TRB_0_WLENGTH_MASK (0xFFFFULL << 48) + volatile uint32_t dwTrb2; +#define XHCI_TRB_2_ERROR_GET(x) (((x) >> 24) & 0xFF) +#define XHCI_TRB_2_ERROR_SET(x) (((x) & 0xFF) << 24) +#define XHCI_TRB_2_TDSZ_GET(x) (((x) >> 17) & 0x1F) +#define XHCI_TRB_2_TDSZ_SET(x) (((x) & 0x1F) << 17) +#define XHCI_TRB_2_REM_GET(x) ((x) & 0xFFFFFF) +#define XHCI_TRB_2_REM_SET(x) ((x) & 0xFFFFFF) +#define XHCI_TRB_2_BYTES_GET(x) ((x) & 0x1FFFF) +#define XHCI_TRB_2_BYTES_SET(x) ((x) & 0x1FFFF) +#define XHCI_TRB_2_IRQ_GET(x) (((x) >> 22) & 0x3FF) +#define XHCI_TRB_2_IRQ_SET(x) (((x) & 0x3FF) << 22) +#define XHCI_TRB_2_STREAM_GET(x) (((x) >> 16) & 0xFFFF) +#define XHCI_TRB_2_STREAM_SET(x) (((x) & 0xFFFF) << 16) + + volatile uint32_t dwTrb3; +#define XHCI_TRB_3_TYPE_GET(x) (((x) >> 10) & 0x3F) +#define XHCI_TRB_3_TYPE_SET(x) (((x) & 0x3F) << 10) +#define XHCI_TRB_3_CYCLE_BIT (1U << 0) +#define XHCI_TRB_3_TC_BIT (1U << 1) /* command ring only */ +#define XHCI_TRB_3_ENT_BIT (1U << 1) /* transfer ring only */ +#define XHCI_TRB_3_ISP_BIT (1U << 2) +#define XHCI_TRB_3_ED_BIT (1U << 2) +#define XHCI_TRB_3_NSNOOP_BIT (1U << 3) +#define XHCI_TRB_3_CHAIN_BIT (1U << 4) +#define XHCI_TRB_3_IOC_BIT (1U << 5) +#define XHCI_TRB_3_IDT_BIT (1U << 6) +#define XHCI_TRB_3_TBC_GET(x) 
(((x) >> 7) & 3) +#define XHCI_TRB_3_TBC_SET(x) (((x) & 3) << 7) +#define XHCI_TRB_3_BEI_BIT (1U << 9) +#define XHCI_TRB_3_DCEP_BIT (1U << 9) +#define XHCI_TRB_3_PRSV_BIT (1U << 9) +#define XHCI_TRB_3_BSR_BIT (1U << 9) +#define XHCI_TRB_3_TRT_MASK (3U << 16) +#define XHCI_TRB_3_TRT_NONE (0U << 16) +#define XHCI_TRB_3_TRT_OUT (2U << 16) +#define XHCI_TRB_3_TRT_IN (3U << 16) +#define XHCI_TRB_3_DIR_IN (1U << 16) +#define XHCI_TRB_3_TLBPC_GET(x) (((x) >> 16) & 0xF) +#define XHCI_TRB_3_TLBPC_SET(x) (((x) & 0xF) << 16) +#define XHCI_TRB_3_EP_GET(x) (((x) >> 16) & 0x1F) +#define XHCI_TRB_3_EP_SET(x) (((x) & 0x1F) << 16) +#define XHCI_TRB_3_FRID_GET(x) (((x) >> 20) & 0x7FF) +#define XHCI_TRB_3_FRID_SET(x) (((x) & 0x7FF) << 20) +#define XHCI_TRB_3_ISO_SIA_BIT (1U << 31) +#define XHCI_TRB_3_SUSP_EP_BIT (1U << 23) +#define XHCI_TRB_3_SLOT_GET(x) (((x) >> 24) & 0xFF) +#define XHCI_TRB_3_SLOT_SET(x) (((x) & 0xFF) << 24) + +/* Commands */ +#define XHCI_TRB_TYPE_RESERVED 0x00 +#define XHCI_TRB_TYPE_NORMAL 0x01 +#define XHCI_TRB_TYPE_SETUP_STAGE 0x02 +#define XHCI_TRB_TYPE_DATA_STAGE 0x03 +#define XHCI_TRB_TYPE_STATUS_STAGE 0x04 +#define XHCI_TRB_TYPE_ISOCH 0x05 +#define XHCI_TRB_TYPE_LINK 0x06 +#define XHCI_TRB_TYPE_EVENT_DATA 0x07 +#define XHCI_TRB_TYPE_NOOP 0x08 +#define XHCI_TRB_TYPE_ENABLE_SLOT 0x09 +#define XHCI_TRB_TYPE_DISABLE_SLOT 0x0A +#define XHCI_TRB_TYPE_ADDRESS_DEVICE 0x0B +#define XHCI_TRB_TYPE_CONFIGURE_EP 0x0C +#define XHCI_TRB_TYPE_EVALUATE_CTX 0x0D +#define XHCI_TRB_TYPE_RESET_EP 0x0E +#define XHCI_TRB_TYPE_STOP_EP 0x0F +#define XHCI_TRB_TYPE_SET_TR_DEQUEUE 0x10 +#define XHCI_TRB_TYPE_RESET_DEVICE 0x11 +#define XHCI_TRB_TYPE_FORCE_EVENT 0x12 +#define XHCI_TRB_TYPE_NEGOTIATE_BW 0x13 +#define XHCI_TRB_TYPE_SET_LATENCY_TOL 0x14 +#define XHCI_TRB_TYPE_GET_PORT_BW 0x15 +#define XHCI_TRB_TYPE_FORCE_HEADER 0x16 +#define XHCI_TRB_TYPE_NOOP_CMD 0x17 + +/* Events */ +#define XHCI_TRB_EVENT_TRANSFER 0x20 +#define XHCI_TRB_EVENT_CMD_COMPLETE 0x21 +#define XHCI_TRB_EVENT_PORT_STS_CHANGE 0x22 +#define XHCI_TRB_EVENT_BW_REQUEST 0x23 +#define XHCI_TRB_EVENT_DOORBELL 0x24 +#define XHCI_TRB_EVENT_HOST_CTRL 0x25 +#define XHCI_TRB_EVENT_DEVICE_NOTIFY 0x26 +#define XHCI_TRB_EVENT_MFINDEX_WRAP 0x27 + +/* Error codes */ +#define XHCI_TRB_ERROR_INVALID 0x00 +#define XHCI_TRB_ERROR_SUCCESS 0x01 +#define XHCI_TRB_ERROR_DATA_BUF 0x02 +#define XHCI_TRB_ERROR_BABBLE 0x03 +#define XHCI_TRB_ERROR_XACT 0x04 +#define XHCI_TRB_ERROR_TRB 0x05 +#define XHCI_TRB_ERROR_STALL 0x06 +#define XHCI_TRB_ERROR_RESOURCE 0x07 +#define XHCI_TRB_ERROR_BANDWIDTH 0x08 +#define XHCI_TRB_ERROR_NO_SLOTS 0x09 +#define XHCI_TRB_ERROR_STREAM_TYPE 0x0A +#define XHCI_TRB_ERROR_SLOT_NOT_ON 0x0B +#define XHCI_TRB_ERROR_ENDP_NOT_ON 0x0C +#define XHCI_TRB_ERROR_SHORT_PKT 0x0D +#define XHCI_TRB_ERROR_RING_UNDERRUN 0x0E +#define XHCI_TRB_ERROR_RING_OVERRUN 0x0F +#define XHCI_TRB_ERROR_VF_RING_FULL 0x10 +#define XHCI_TRB_ERROR_PARAMETER 0x11 +#define XHCI_TRB_ERROR_BW_OVERRUN 0x12 +#define XHCI_TRB_ERROR_CONTEXT_STATE 0x13 +#define XHCI_TRB_ERROR_NO_PING_RESP 0x14 +#define XHCI_TRB_ERROR_EV_RING_FULL 0x15 +#define XHCI_TRB_ERROR_INCOMPAT_DEV 0x16 +#define XHCI_TRB_ERROR_MISSED_SERVICE 0x17 +#define XHCI_TRB_ERROR_CMD_RING_STOP 0x18 +#define XHCI_TRB_ERROR_CMD_ABORTED 0x19 +#define XHCI_TRB_ERROR_STOPPED 0x1A +#define XHCI_TRB_ERROR_LENGTH 0x1B +#define XHCI_TRB_ERROR_BAD_MELAT 0x1D +#define XHCI_TRB_ERROR_ISOC_OVERRUN 0x1F +#define XHCI_TRB_ERROR_EVENT_LOST 0x20 +#define XHCI_TRB_ERROR_UNDEFINED 0x21 +#define XHCI_TRB_ERROR_INVALID_SID 0x22 +#define 
XHCI_TRB_ERROR_SEC_BW 0x23 +#define XHCI_TRB_ERROR_SPLIT_XACT 0x24 +} __aligned(4); + +struct xhci_dev_endpoint_trbs { + struct xhci_trb trb[(XHCI_MAX_STREAMS * + XHCI_MAX_TRANSFERS) + XHCI_MAX_STREAMS]; +}; + +struct xhci_event_ring_seg { + volatile uint64_t qwEvrsTablePtr; + volatile uint32_t dwEvrsTableSize; + volatile uint32_t dwEvrsReserved; +}; + +#endif /* _PCI_XHCI_H_ */ diff --git a/usr/src/cmd/bhyve/pm.c b/usr/src/cmd/bhyve/pm.c index 70c4f1fae8..be188b79f2 100644 --- a/usr/src/cmd/bhyve/pm.c +++ b/usr/src/cmd/bhyve/pm.c @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2013 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Hudson River Trading LLC * Written by: John H. Baldwin * All rights reserved. * @@ -24,14 +26,18 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ +/* + * Copyright 2018 Joyent, Inc. + */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pm.c 266125 2014-05-15 14:16:55Z jhb $"); +__FBSDID("$FreeBSD$"); #include #include #include +#include #include #ifndef __FreeBSD__ #include @@ -51,6 +57,8 @@ static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER; #ifdef __FreeBSD__ static struct mevent *power_button; static sig_t old_power_handler; +#else +struct vmctx *pwr_ctx; #endif /* @@ -63,6 +71,8 @@ static int reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { + int error; + static uint8_t reset_control; if (bytes != 1) @@ -74,12 +84,8 @@ reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, /* Treat hard and soft resets the same. */ if (reset_control & 0x4) { -#ifdef __FreeBSD__ error = vm_suspend(ctx, VM_SUSPEND_RESET); assert(error == 0 || errno == EALREADY); -#else - exit(0); -#endif } } return (0); @@ -220,6 +226,34 @@ power_button_handler(int signal, enum ev_type type, void *arg) } pthread_mutex_unlock(&pm_lock); } + +#else +/* + * Initiate graceful power off. + */ +/*ARGSUSED*/ +static void +power_button_handler(int signal, siginfo_t *type, void *cp) +{ + /* + * In theory, taking the 'pm_lock' mutex from within this signal + * handler could lead to deadlock if the main thread already held this + * mutex. In reality, this mutex is local to this file and all of the + * other usage in this file only occurs in functions which are FreeBSD + * specific (and thus currently not used). Thus, for consistency with + * the other code in this file, we take the mutex, but in the future, + * if these other functions are ever enabled for use on non-FreeBSD + * systems and these functions could be called directly by a thread + * (which would then hold the mutex), then we need to revisit the use + * of this mutex in this signal handler. 
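+	 * Note also that pthread_mutex_lock() is not among the functions
+	 * POSIX declares async-signal-safe, so the reasoning above is what
+	 * makes this usage tolerable rather than strictly correct.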
+ */ + pthread_mutex_lock(&pm_lock); + if (!(pm1_status & PM1_PWRBTN_STS)) { + pm1_status |= PM1_PWRBTN_STS; + sci_update(pwr_ctx); + } + pthread_mutex_unlock(&pm_lock); +} #endif /* @@ -239,6 +273,7 @@ static int pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { + int error; if (bytes != 2) return (-1); @@ -259,12 +294,8 @@ pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, */ if (*eax & PM1_SLP_EN) { if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) { -#ifdef __FreeBSD__ error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); assert(error == 0 || errno == EALREADY); -#else - exit(0); -#endif } } } @@ -330,4 +361,18 @@ sci_init(struct vmctx *ctx) */ pci_irq_use(SCI_INT); vm_isa_set_irq_trigger(ctx, SCI_INT, LEVEL_TRIGGER); + +#ifndef __FreeBSD__ + { + /* + * Install SIGTERM signal handler for graceful power off. + */ + struct sigaction act; + + pwr_ctx = ctx; + act.sa_flags = 0; + act.sa_sigaction = power_button_handler; + (void) sigaction(SIGTERM, &act, NULL); + } +#endif } diff --git a/usr/src/cmd/bhyve/pmtmr.c b/usr/src/cmd/bhyve/pmtmr.c deleted file mode 100644 index 92ab24be57..0000000000 --- a/usr/src/cmd/bhyve/pmtmr.c +++ /dev/null @@ -1,212 +0,0 @@ -/*- - * Copyright (c) 2012 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: head/usr.sbin/bhyve/pmtmr.c 259998 2013-12-28 04:01:05Z jhb $ - */ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * Copyright 2014 Pluribus Networks Inc. 
- */ - -#include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pmtmr.c 259998 2013-12-28 04:01:05Z jhb $"); - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#ifndef __FreeBSD__ -#include -#endif - -#include "acpi.h" -#include "inout.h" - -/* - * The ACPI Power Management timer is a free-running 24- or 32-bit - * timer with a frequency of 3.579545MHz - * - * This implementation will be 32-bits - */ - -#define PMTMR_FREQ 3579545 /* 3.579545MHz */ - -static pthread_mutex_t pmtmr_mtx; -static pthread_once_t pmtmr_once = PTHREAD_ONCE_INIT; - -static uint64_t pmtmr_old; - -static uint64_t pmtmr_tscf; -static uint64_t pmtmr_tsc_old; - -#ifdef __FreeBSD__ -static clockid_t clockid = CLOCK_UPTIME_FAST; -static struct timespec pmtmr_uptime_old; - -#define timespecsub(vvp, uvp) \ - do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_nsec -= (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ - } \ - } while (0) - -static uint64_t -timespec_to_pmtmr(const struct timespec *tsnew, const struct timespec *tsold) -{ - struct timespec tsdiff; - int64_t nsecs; - - tsdiff = *tsnew; - timespecsub(&tsdiff, tsold); - nsecs = tsdiff.tv_sec * 1000000000 + tsdiff.tv_nsec; - assert(nsecs >= 0); - - return (nsecs * PMTMR_FREQ / 1000000000 + pmtmr_old); -} -#endif - -static uint64_t -tsc_to_pmtmr(uint64_t tsc_new, uint64_t tsc_old) -{ - - return ((tsc_new - tsc_old) * PMTMR_FREQ / pmtmr_tscf + pmtmr_old); -} - -static void -pmtmr_init(void) -{ -#ifdef __FreeBSD__ - size_t len; - int smp_tsc, err; - struct timespec tsnew, tsold = { 0 }; - - len = sizeof(smp_tsc); - err = sysctlbyname("kern.timecounter.smp_tsc", &smp_tsc, &len, NULL, 0); - assert(err == 0); - - if (smp_tsc) { - len = sizeof(pmtmr_tscf); - err = sysctlbyname("machdep.tsc_freq", &pmtmr_tscf, &len, - NULL, 0); - assert(err == 0); - - pmtmr_tsc_old = rdtsc(); - pmtmr_old = tsc_to_pmtmr(pmtmr_tsc_old, 0); - } else { - if (getenv("BHYVE_PMTMR_PRECISE") != NULL) - clockid = CLOCK_UPTIME; - - err = clock_gettime(clockid, &tsnew); - assert(err == 0); - - pmtmr_uptime_old = tsnew; - pmtmr_old = timespec_to_pmtmr(&tsnew, &tsold); - } -#else - kstat_ctl_t *kstat_ctl; - kstat_t *kstat; - kstat_named_t *kstat_cpu_freq; - - kstat_ctl = kstat_open(); - kstat = kstat_lookup(kstat_ctl, "cpu_info", 0, NULL); - kstat_read(kstat_ctl, kstat, NULL); - kstat_cpu_freq = kstat_data_lookup(kstat, "current_clock_Hz"); - pmtmr_tscf = kstat_cpu_freq->value.ul; - kstat_close(kstat_ctl); - - pmtmr_tsc_old = rdtsc(); - pmtmr_old = tsc_to_pmtmr(pmtmr_tsc_old, 0); -#endif - pthread_mutex_init(&pmtmr_mtx, NULL); -} - -static uint32_t -pmtmr_val(void) -{ - struct timespec tsnew; - uint64_t pmtmr_tsc_new; - uint64_t pmtmr_new; - int error; - - pthread_once(&pmtmr_once, pmtmr_init); - - pthread_mutex_lock(&pmtmr_mtx); - -#ifdef __FreeBSD__ - if (pmtmr_tscf) { - pmtmr_tsc_new = rdtsc(); - pmtmr_new = tsc_to_pmtmr(pmtmr_tsc_new, pmtmr_tsc_old); - pmtmr_tsc_old = pmtmr_tsc_new; - } else { - error = clock_gettime(clockid, &tsnew); - assert(error == 0); - - pmtmr_new = timespec_to_pmtmr(&tsnew, &pmtmr_uptime_old); - pmtmr_uptime_old = tsnew; - } -#else - pmtmr_tsc_new = rdtsc(); - pmtmr_new = tsc_to_pmtmr(pmtmr_tsc_new, pmtmr_tsc_old); - pmtmr_tsc_old = pmtmr_tsc_new; -#endif - pmtmr_old = pmtmr_new; - - pthread_mutex_unlock(&pmtmr_mtx); - - return (pmtmr_new); -} - -static int -pmtmr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) -{ - assert(in == 1); - - 
if (bytes != 4) - return (-1); - - *eax = pmtmr_val(); - - return (0); -} - -INOUT_PORT(pmtmr, IO_PMTMR, IOPORT_F_IN, pmtmr_handler); diff --git a/usr/src/cmd/bhyve/post.c b/usr/src/cmd/bhyve/post.c index dcb481aac4..d3040a8df7 100644 --- a/usr/src/cmd/bhyve/post.c +++ b/usr/src/cmd/bhyve/post.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/post.c 260206 2014-01-02 21:26:59Z jhb $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/post.c 260206 2014-01-02 21:26:59Z jhb $"); +__FBSDID("$FreeBSD$"); #include diff --git a/usr/src/cmd/bhyve/ps2kbd.c b/usr/src/cmd/bhyve/ps2kbd.c index 22e566ac21..5453a26949 100644 --- a/usr/src/cmd/bhyve/ps2kbd.c +++ b/usr/src/cmd/bhyve/ps2kbd.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. @@ -74,6 +76,107 @@ struct ps2kbd_softc { uint8_t curcmd; /* current command for next byte */ }; +#define SCANCODE_E0_PREFIX 1 +struct extended_translation { + uint32_t keysym; + uint8_t scancode; + int flags; +}; + +/* + * FIXME: Pause/break and Print Screen/SysRq require special handling. + */ +static const struct extended_translation extended_translations[] = { + {0xff08, 0x66}, /* Back space */ + {0xff09, 0x0d}, /* Tab */ + {0xff0d, 0x5a}, /* Return */ + {0xff1b, 0x76}, /* Escape */ + {0xff50, 0x6c, SCANCODE_E0_PREFIX}, /* Home */ + {0xff51, 0x6b, SCANCODE_E0_PREFIX}, /* Left arrow */ + {0xff52, 0x75, SCANCODE_E0_PREFIX}, /* Up arrow */ + {0xff53, 0x74, SCANCODE_E0_PREFIX}, /* Right arrow */ + {0xff54, 0x72, SCANCODE_E0_PREFIX}, /* Down arrow */ + {0xff55, 0x7d, SCANCODE_E0_PREFIX}, /* PgUp */ + {0xff56, 0x7a, SCANCODE_E0_PREFIX}, /* PgDown */ + {0xff57, 0x69, SCANCODE_E0_PREFIX}, /* End */ + {0xff63, 0x70, SCANCODE_E0_PREFIX}, /* Ins */ + {0xff8d, 0x5a, SCANCODE_E0_PREFIX}, /* Keypad Enter */ + {0xffe1, 0x12}, /* Left shift */ + {0xffe2, 0x59}, /* Right shift */ + {0xffe3, 0x14}, /* Left control */ + {0xffe4, 0x14, SCANCODE_E0_PREFIX}, /* Right control */ + /* {0xffe7, XXX}, Left meta */ + /* {0xffe8, XXX}, Right meta */ + {0xffe9, 0x11}, /* Left alt */ + {0xfe03, 0x11, SCANCODE_E0_PREFIX}, /* AltGr */ + {0xffea, 0x11, SCANCODE_E0_PREFIX}, /* Right alt */ + {0xffeb, 0x1f, SCANCODE_E0_PREFIX}, /* Left Windows */ + {0xffec, 0x27, SCANCODE_E0_PREFIX}, /* Right Windows */ + {0xffbe, 0x05}, /* F1 */ + {0xffbf, 0x06}, /* F2 */ + {0xffc0, 0x04}, /* F3 */ + {0xffc1, 0x0c}, /* F4 */ + {0xffc2, 0x03}, /* F5 */ + {0xffc3, 0x0b}, /* F6 */ + {0xffc4, 0x83}, /* F7 */ + {0xffc5, 0x0a}, /* F8 */ + {0xffc6, 0x01}, /* F9 */ + {0xffc7, 0x09}, /* F10 */ + {0xffc8, 0x78}, /* F11 */ + {0xffc9, 0x07}, /* F12 */ + {0xffff, 0x71, SCANCODE_E0_PREFIX}, /* Del */ + {0xff14, 0x7e}, /* ScrollLock */ + /* NumLock and Keypads*/ + {0xff7f, 0x77}, /* NumLock */ + {0xffaf, 0x4a, SCANCODE_E0_PREFIX}, /* Keypad slash */ + {0xffaa, 0x7c}, /* Keypad asterisk */ + {0xffad, 0x7b}, /* Keypad minus */ + {0xffab, 0x79}, /* Keypad plus */ + {0xffb7, 0x6c}, /* Keypad 7 */ + {0xff95, 0x6c}, /* Keypad home */ + {0xffb8, 0x75}, /* Keypad 8 */ + {0xff97, 0x75}, /* Keypad up arrow */ + {0xffb9, 0x7d}, /* Keypad 9 */ + {0xff9a, 0x7d}, /* Keypad PgUp */ + {0xffb4, 0x6b}, /* Keypad 4 */ + {0xff96, 0x6b}, /* Keypad left arrow */ + {0xffb5, 0x73}, /* 
Keypad 5 */ + {0xff9d, 0x73}, /* Keypad empty */ + {0xffb6, 0x74}, /* Keypad 6 */ + {0xff98, 0x74}, /* Keypad right arrow */ + {0xffb1, 0x69}, /* Keypad 1 */ + {0xff9c, 0x69}, /* Keypad end */ + {0xffb2, 0x72}, /* Keypad 2 */ + {0xff99, 0x72}, /* Keypad down arrow */ + {0xffb3, 0x7a}, /* Keypad 3 */ + {0xff9b, 0x7a}, /* Keypad PgDown */ + {0xffb0, 0x70}, /* Keypad 0 */ + {0xff9e, 0x70}, /* Keypad ins */ + {0xffae, 0x71}, /* Keypad . */ + {0xff9f, 0x71}, /* Keypad del */ + {0, 0, 0} /* Terminator */ +}; + +/* ASCII to type 2 scancode lookup table */ +static const uint8_t ascii_translations[128] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x29, 0x16, 0x52, 0x26, 0x25, 0x2e, 0x3d, 0x52, + 0x46, 0x45, 0x3e, 0x55, 0x41, 0x4e, 0x49, 0x4a, + 0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d, + 0x3e, 0x46, 0x4c, 0x4c, 0x41, 0x55, 0x49, 0x4a, + 0x1e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, + 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, + 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x36, 0x4e, + 0x0e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, + 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, + 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x0e, 0x00, +}; + static void fifo_init(struct ps2kbd_softc *sc) { @@ -93,15 +196,6 @@ fifo_reset(struct ps2kbd_softc *sc) fifo->size = sizeof(((struct fifo *)0)->buf); } -static int -fifo_available(struct ps2kbd_softc *sc) -{ - struct fifo *fifo; - - fifo = &sc->fifo; - return (fifo->num < fifo->size); -} - static void fifo_put(struct ps2kbd_softc *sc, uint8_t val) { @@ -166,6 +260,9 @@ ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val) sc->curcmd = 0; } else { switch (val) { + case 0x00: + fifo_put(sc, PS2KC_ACK); + break; case PS2KC_RESET_DEV: fifo_reset(sc); fifo_put(sc, PS2KC_ACK); @@ -216,190 +313,57 @@ static void ps2kbd_keysym_queue(struct ps2kbd_softc *sc, int down, uint32_t keysym) { - /* ASCII to type 2 scancode lookup table */ - const uint8_t translation[128] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x29, 0x16, 0x52, 0x26, 0x25, 0x2e, 0x3d, 0x52, - 0x46, 0x45, 0x3e, 0x55, 0x41, 0x4e, 0x49, 0x4a, - 0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d, - 0x3e, 0x46, 0x4c, 0x4c, 0x41, 0x55, 0x49, 0x4a, - 0x1e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, - 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, - 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, - 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x36, 0x4e, - 0x0e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, - 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, - 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, - 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x0e, 0x00, - }; - assert(pthread_mutex_isowned_np(&sc->mtx)); + int e0_prefix, found; + uint8_t code; + const struct extended_translation *trans; + + found = 0; + if (keysym < 0x80) { + code = ascii_translations[keysym]; + e0_prefix = 0; + found = 1; + } else { + for (trans = &(extended_translations[0]); trans->keysym != 0; + trans++) { + if (keysym == trans->keysym) { + code = trans->scancode; + e0_prefix = trans->flags & SCANCODE_E0_PREFIX; + found = 1; + break; + } + } + } - switch (keysym) { - case 0x0 ... 
0x7f: - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, translation[keysym]); - break; - case 0xff08: /* Back space */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x66); - break; - case 0xff09: /* Tab */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x0d); - break; - case 0xff0d: /* Return */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x5a); - break; - case 0xff1b: /* Escape */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x76); - break; - case 0xff51: /* Left arrow */ - fifo_put(sc, 0xe0); - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x6b); - break; - case 0xff52: /* Up arrow */ - fifo_put(sc, 0xe0); - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x75); - break; - case 0xff53: /* Right arrow */ - fifo_put(sc, 0xe0); - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x74); - break; - case 0xff54: /* Down arrow */ - fifo_put(sc, 0xe0); - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x72); - break; - case 0xffbe: /* F1 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x05); - break; - case 0xffbf: /* F2 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x06); - break; - case 0xffc0: /* F3 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x04); - break; - case 0xffc1: /* F4 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x0c); - break; - case 0xffc2: /* F5 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x03); - break; - case 0xffc3: /* F6 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x0b); - break; - case 0xffc4: /* F7 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x83); - break; - case 0xffc5: /* F8 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x0a); - break; - case 0xffc6: /* F9 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x01); - break; - case 0xffc7: /* F10 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x09); - break; - case 0xffc8: /* F11 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x78); - break; - case 0xffc9: /* F12 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x07); - break; - case 0xffe1: /* Left shift */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x12); - break; - case 0xffe2: /* Right shift */ - /* XXX */ - break; - case 0xffe3: /* Left control */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x14); - break; - case 0xffe4: /* Right control */ - /* XXX */ - break; - case 0xffe7: /* Left meta */ - /* XXX */ - break; - case 0xffe8: /* Right meta */ - /* XXX */ - break; - case 0xffe9: /* Left alt */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x11); - break; - case 0xffea: /* Right alt */ - /* XXX */ - break; - default: - fprintf(stderr, "Unhandled ps2 keyboard keysym 0x%x\n", - keysym); - break; + if (!found) { + fprintf(stderr, "Unhandled ps2 keyboard keysym 0x%x\n", keysym); + return; } + + if (e0_prefix) + fifo_put(sc, 0xe0); + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, code); } static void ps2kbd_event(int down, uint32_t keysym, void *arg) { struct ps2kbd_softc *sc = arg; + int fifo_full; pthread_mutex_lock(&sc->mtx); if (!sc->enabled) { pthread_mutex_unlock(&sc->mtx); return; } - + fifo_full = sc->fifo.num == PS2KBD_FIFOSZ; ps2kbd_keysym_queue(sc, down, keysym); pthread_mutex_unlock(&sc->mtx); - atkbdc_event(sc->atkbdc_sc); + if (!fifo_full) + atkbdc_event(sc->atkbdc_sc, 1); } struct ps2kbd_softc * @@ -412,7 +376,8 @@ ps2kbd_init(struct atkbdc_softc *atkbdc_sc) fifo_init(sc); sc->atkbdc_sc = atkbdc_sc; - console_kbd_register(ps2kbd_event, sc); + console_kbd_register(ps2kbd_event, sc, 1); return (sc); } + diff --git a/usr/src/cmd/bhyve/ps2kbd.h 
b/usr/src/cmd/bhyve/ps2kbd.h index 34c31b1ea8..17be6d0466 100644 --- a/usr/src/cmd/bhyve/ps2kbd.h +++ b/usr/src/cmd/bhyve/ps2kbd.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * diff --git a/usr/src/cmd/bhyve/ps2mouse.c b/usr/src/cmd/bhyve/ps2mouse.c index e96fbbf411..b2e08262b1 100644 --- a/usr/src/cmd/bhyve/ps2mouse.c +++ b/usr/src/cmd/bhyve/ps2mouse.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. @@ -62,6 +64,16 @@ __FBSDID("$FreeBSD$"); /* mouse device id */ #define PS2MOUSE_DEV_ID 0x0 +/* mouse data bits */ +#define PS2M_DATA_Y_OFLOW 0x80 +#define PS2M_DATA_X_OFLOW 0x40 +#define PS2M_DATA_Y_SIGN 0x20 +#define PS2M_DATA_X_SIGN 0x10 +#define PS2M_DATA_AONE 0x08 +#define PS2M_DATA_MID_BUTTON 0x04 +#define PS2M_DATA_RIGHT_BUTTON 0x02 +#define PS2M_DATA_LEFT_BUTTON 0x01 + /* mouse status bits */ #define PS2M_STS_REMOTE_MODE 0x40 #define PS2M_STS_ENABLE_DEV 0x20 @@ -87,6 +99,7 @@ struct ps2mouse_softc { uint8_t status; uint8_t resolution; uint8_t sampling_rate; + int ctrlenable; struct fifo fifo; uint8_t curcmd; /* current command for next byte */ @@ -168,19 +181,20 @@ movement_get(struct ps2mouse_softc *sc) assert(pthread_mutex_isowned_np(&sc->mtx)); - val0 = sc->status & (PS2M_STS_LEFT_BUTTON | - PS2M_STS_RIGHT_BUTTON | PS2M_STS_MID_BUTTON); + val0 = PS2M_DATA_AONE; + val0 |= sc->status & (PS2M_DATA_LEFT_BUTTON | + PS2M_DATA_RIGHT_BUTTON | PS2M_DATA_MID_BUTTON); if (sc->delta_x >= 0) { if (sc->delta_x > 255) { - val0 |= (1 << 6); + val0 |= PS2M_DATA_X_OFLOW; val1 = 255; } else val1 = sc->delta_x; } else { - val0 |= (1 << 4); + val0 |= PS2M_DATA_X_SIGN; if (sc->delta_x < -255) { - val0 |= (1 << 6); + val0 |= PS2M_DATA_X_OFLOW; val1 = 255; } else val1 = sc->delta_x; @@ -189,23 +203,25 @@ movement_get(struct ps2mouse_softc *sc) if (sc->delta_y >= 0) { if (sc->delta_y > 255) { - val0 |= (1 << 7); + val0 |= PS2M_DATA_Y_OFLOW; val2 = 255; } else val2 = sc->delta_y; } else { - val0 |= (1 << 5); + val0 |= PS2M_DATA_Y_SIGN; if (sc->delta_y < -255) { - val0 |= (1 << 7); + val0 |= PS2M_DATA_Y_OFLOW; val2 = 255; } else val2 = sc->delta_y; } sc->delta_y = 0; - fifo_put(sc, val0); - fifo_put(sc, val1); - fifo_put(sc, val2); + if (sc->fifo.num < (sc->fifo.size - 3)) { + fifo_put(sc, val0); + fifo_put(sc, val1); + fifo_put(sc, val2); + } } static void @@ -214,7 +230,7 @@ ps2mouse_reset(struct ps2mouse_softc *sc) assert(pthread_mutex_isowned_np(&sc->mtx)); fifo_reset(sc); movement_reset(sc); - sc->status = 0x8; + sc->status = PS2M_STS_ENABLE_DEV; sc->resolution = 4; sc->sampling_rate = 100; @@ -236,10 +252,32 @@ ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val) return (retval); } +int +ps2mouse_fifocnt(struct ps2mouse_softc *sc) +{ + return (sc->fifo.num); +} + void -ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val) +ps2mouse_toggle(struct ps2mouse_softc *sc, int enable) { pthread_mutex_lock(&sc->mtx); + if (enable) + sc->ctrlenable = 1; + else { + sc->ctrlenable = 0; + sc->fifo.rindex = 0; + sc->fifo.windex = 0; + sc->fifo.num = 0; + } + pthread_mutex_unlock(&sc->mtx); +} + +void +ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert) +{ + pthread_mutex_lock(&sc->mtx); + fifo_reset(sc); if (sc->curcmd) { switch (sc->curcmd) { case PS2MC_SET_SAMPLING_RATE: @@ -256,8 +294,14 @@ ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val) break; } sc->curcmd = 0; + + } 
else if (insert) { + fifo_put(sc, val); } else { switch (val) { + case 0x00: + fifo_put(sc, PS2MC_ACK); + break; case PS2MC_RESET_DEV: ps2mouse_reset(sc); fifo_put(sc, PS2MC_ACK); @@ -313,6 +357,7 @@ ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val) fifo_put(sc, PS2MC_ACK); break; default: + fifo_put(sc, PS2MC_ACK); fprintf(stderr, "Unhandled ps2 mouse command " "0x%02x\n", val); break; @@ -338,7 +383,7 @@ ps2mouse_event(uint8_t button, int x, int y, void *arg) if (button & (1 << 2)) sc->status |= PS2M_STS_RIGHT_BUTTON; - if ((sc->status & PS2M_STS_ENABLE_DEV) == 0) { + if ((sc->status & PS2M_STS_ENABLE_DEV) == 0 || !sc->ctrlenable) { /* no data reporting */ pthread_mutex_unlock(&sc->mtx); return; @@ -347,7 +392,8 @@ ps2mouse_event(uint8_t button, int x, int y, void *arg) movement_get(sc); pthread_mutex_unlock(&sc->mtx); - atkbdc_event(sc->atkbdc_sc); + if (sc->fifo.num > 0) + atkbdc_event(sc->atkbdc_sc, 0); } struct ps2mouse_softc * @@ -364,8 +410,9 @@ ps2mouse_init(struct atkbdc_softc *atkbdc_sc) ps2mouse_reset(sc); pthread_mutex_unlock(&sc->mtx); - console_ptr_register(ps2mouse_event, sc); + console_ptr_register(ps2mouse_event, sc, 1); return (sc); } + diff --git a/usr/src/cmd/bhyve/ps2mouse.h b/usr/src/cmd/bhyve/ps2mouse.h index 1a78934b98..59430b01e2 100644 --- a/usr/src/cmd/bhyve/ps2mouse.h +++ b/usr/src/cmd/bhyve/ps2mouse.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * @@ -34,6 +36,8 @@ struct atkbdc_softc; struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc); int ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val); -void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val); +void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert); +void ps2mouse_toggle(struct ps2mouse_softc *sc, int enable); +int ps2mouse_fifocnt(struct ps2mouse_softc *sc); #endif /* _PS2MOUSE_H_ */ diff --git a/usr/src/cmd/bhyve/rfb.c b/usr/src/cmd/bhyve/rfb.c index 0846316378..39ea1611f9 100644 --- a/usr/src/cmd/bhyve/rfb.c +++ b/usr/src/cmd/bhyve/rfb.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale - * Copyright (c) 2015 Nahanni Systems Inc. + * Copyright (c) 2015 Leon Dang + * Copyright 2018 Joyent, Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -28,30 +31,91 @@ #include __FBSDID("$FreeBSD$"); +#include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include #include +#include +#include +#include +#include +#include #include +#include #include +#ifndef WITHOUT_CAPSICUM +#include +#endif +#include +#include #include +#include #include #include #include #include #include +#include #include +#include + +#ifndef __FreeBSD__ +#include +#endif + #include "bhyvegc.h" #include "console.h" #include "rfb.h" +#include "sockstream.h" + +#ifndef NO_OPENSSL +#include +#endif + +static int rfb_debug = 0; +#define DPRINTF(params) if (rfb_debug) printf params +#define WPRINTF(params) printf params + +#define AUTH_LENGTH 16 +#define PASSWD_LENGTH 8 + +#define SECURITY_TYPE_NONE 1 +#define SECURITY_TYPE_VNC_AUTH 2 + +#define AUTH_FAILED_UNAUTH 1 +#define AUTH_FAILED_ERROR 2 struct rfb_softc { int sfd; pthread_t tid; + int cfd; + int width, height; - bool enc_raw_ok; - bool enc_resize_ok; + char *password; + + bool enc_raw_ok; + bool enc_zlib_ok; + bool enc_resize_ok; + + z_stream zstream; + uint8_t *zbuf; + int zbuflen; + + int conn_wait; + int sending; + pthread_mutex_t mtx; + pthread_cond_t cond; + + int hw_crc; + uint32_t *crc; /* WxH crc cells */ + uint32_t *crc_tmp; /* buffer to store single crc row */ + int crc_width, crc_height; }; struct rfb_pixfmt { @@ -82,8 +146,16 @@ struct rfb_pixfmt_msg { }; #define RFB_ENCODING_RAW 0 +#define RFB_ENCODING_ZLIB 6 #define RFB_ENCODING_RESIZE -223 +#define RFB_MAX_WIDTH 2000 +#define RFB_MAX_HEIGHT 1200 +#define RFB_ZLIB_BUFSZ RFB_MAX_WIDTH*RFB_MAX_HEIGHT*4 + +/* percentage changes to screen before sending the entire screen */ +#define RFB_SEND_ALL_THRESH 25 + struct rfb_enc_msg { uint8_t type; uint8_t pad; @@ -127,60 +199,65 @@ struct rfb_srvr_rect_hdr { uint32_t encoding; }; +struct rfb_cuttext_msg { + uint8_t type; + uint8_t padding[3]; + uint32_t length; +}; + + static void rfb_send_server_init_msg(int cfd) { struct bhyvegc_image *gc_image; struct rfb_srvr_info sinfo; - int len; gc_image = console_get_image(); - sinfo.width = ntohs(gc_image->width); - sinfo.height = ntohs(gc_image->height); + sinfo.width = htons(gc_image->width); + sinfo.height = htons(gc_image->height); sinfo.pixfmt.bpp = 32; sinfo.pixfmt.depth = 32; sinfo.pixfmt.bigendian = 0; sinfo.pixfmt.truecolor = 1; - sinfo.pixfmt.red_max = ntohs(255); - sinfo.pixfmt.green_max = ntohs(255); - sinfo.pixfmt.blue_max = ntohs(255); + sinfo.pixfmt.red_max = htons(255); + sinfo.pixfmt.green_max = htons(255); + sinfo.pixfmt.blue_max = htons(255); sinfo.pixfmt.red_shift = 16; sinfo.pixfmt.green_shift = 8; sinfo.pixfmt.blue_shift = 0; - sinfo.namelen = ntohl(strlen("bhyve")); - len = write(cfd, &sinfo, sizeof(sinfo)); - len = write(cfd, "bhyve", strlen("bhyve")); + sinfo.namelen = htonl(strlen("bhyve")); + (void)stream_write(cfd, &sinfo, sizeof(sinfo)); + (void)stream_write(cfd, "bhyve", strlen("bhyve")); } static void rfb_send_resize_update_msg(struct rfb_softc *rc, int cfd) { struct rfb_srvr_updt_msg supdt_msg; - struct rfb_srvr_rect_hdr srect_hdr; + struct rfb_srvr_rect_hdr srect_hdr; /* Number of rectangles: 1 */ supdt_msg.type = 0; supdt_msg.pad = 0; - supdt_msg.numrects = ntohs(1); - write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); + supdt_msg.numrects = htons(1); + stream_write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); /* Rectangle header */ - srect_hdr.x = ntohs(0); - srect_hdr.y = ntohs(0); - srect_hdr.width = ntohs(rc->width); - srect_hdr.height = 
ntohs(rc->height); - srect_hdr.encoding = ntohl(RFB_ENCODING_RESIZE); - write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); + srect_hdr.x = htons(0); + srect_hdr.y = htons(0); + srect_hdr.width = htons(rc->width); + srect_hdr.height = htons(rc->height); + srect_hdr.encoding = htonl(RFB_ENCODING_RESIZE); + stream_write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); } static void rfb_recv_set_pixfmt_msg(struct rfb_softc *rc, int cfd) { struct rfb_pixfmt_msg pixfmt_msg; - int len; - len = read(cfd, ((void *)&pixfmt_msg) + 1, sizeof(pixfmt_msg) - 1); + (void)stream_read(cfd, ((void *)&pixfmt_msg)+1, sizeof(pixfmt_msg)-1); } @@ -188,18 +265,22 @@ static void rfb_recv_set_encodings_msg(struct rfb_softc *rc, int cfd) { struct rfb_enc_msg enc_msg; - int len, i; + int i; uint32_t encoding; assert((sizeof(enc_msg) - 1) == 3); - len = read(cfd, ((void *)&enc_msg) + 1, sizeof(enc_msg) - 1); + (void)stream_read(cfd, ((void *)&enc_msg)+1, sizeof(enc_msg)-1); - for (i = 0; i < ntohs(enc_msg.numencs); i++) { - len = read(cfd, &encoding, sizeof(encoding)); - switch (ntohl(encoding)) { + for (i = 0; i < htons(enc_msg.numencs); i++) { + (void)stream_read(cfd, &encoding, sizeof(encoding)); + switch (htonl(encoding)) { case RFB_ENCODING_RAW: rc->enc_raw_ok = true; break; + case RFB_ENCODING_ZLIB: + rc->enc_zlib_ok = true; + deflateInit(&rc->zstream, Z_BEST_SPEED); + break; case RFB_ENCODING_RESIZE: rc->enc_resize_ok = true; break; @@ -207,88 +288,460 @@ rfb_recv_set_encodings_msg(struct rfb_softc *rc, int cfd) } } -static void -rfb_resize_update(struct rfb_softc *rc, int fd) +/* + * Calculate CRC32 using SSE4.2; Intel or AMD Bulldozer+ CPUs only + */ +static __inline uint32_t +fast_crc32(void *buf, int len, uint32_t crcval) +{ + uint32_t q = len / sizeof(uint32_t); + uint32_t *p = (uint32_t *)buf; + + while (q--) { + asm volatile ( + ".byte 0xf2, 0xf, 0x38, 0xf1, 0xf1;" + :"=S" (crcval) + :"0" (crcval), "c" (*p) + ); + p++; + } + + return (crcval); +} + + +static int +rfb_send_rect(struct rfb_softc *rc, int cfd, struct bhyvegc_image *gc, + int x, int y, int w, int h) +{ + struct rfb_srvr_updt_msg supdt_msg; + struct rfb_srvr_rect_hdr srect_hdr; + unsigned long zlen; + ssize_t nwrite, total; + int err; + uint32_t *p; + uint8_t *zbufp; + + /* + * Send a single rectangle of the given x, y, w h dimensions. 
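+	 * If the client negotiated the zlib encoding, the pixel data is
+	 * deflated first; on any deflate error the code below falls back
+	 * to raw encoding and disables zlib for the rest of the session.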
+ */ + + /* Number of rectangles: 1 */ + supdt_msg.type = 0; + supdt_msg.pad = 0; + supdt_msg.numrects = htons(1); + nwrite = stream_write(cfd, &supdt_msg, + sizeof(struct rfb_srvr_updt_msg)); + if (nwrite <= 0) + return (nwrite); + + + /* Rectangle header */ + srect_hdr.x = htons(x); + srect_hdr.y = htons(y); + srect_hdr.width = htons(w); + srect_hdr.height = htons(h); + + h = y + h; + w *= sizeof(uint32_t); + if (rc->enc_zlib_ok) { + zbufp = rc->zbuf; + rc->zstream.total_in = 0; + rc->zstream.total_out = 0; + for (p = &gc->data[y * gc->width + x]; y < h; y++) { + rc->zstream.next_in = (Bytef *)p; + rc->zstream.avail_in = w; + rc->zstream.next_out = (Bytef *)zbufp; + rc->zstream.avail_out = RFB_ZLIB_BUFSZ + 16 - + rc->zstream.total_out; + rc->zstream.data_type = Z_BINARY; + + /* Compress with zlib */ + err = deflate(&rc->zstream, Z_SYNC_FLUSH); + if (err != Z_OK) { + WPRINTF(("zlib[rect] deflate err: %d\n", err)); + rc->enc_zlib_ok = false; + deflateEnd(&rc->zstream); + goto doraw; + } + zbufp = rc->zbuf + rc->zstream.total_out; + p += gc->width; + } + srect_hdr.encoding = htonl(RFB_ENCODING_ZLIB); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + zlen = htonl(rc->zstream.total_out); + nwrite = stream_write(cfd, &zlen, sizeof(uint32_t)); + if (nwrite <= 0) + return (nwrite); + return (stream_write(cfd, rc->zbuf, rc->zstream.total_out)); + } + +doraw: + + total = 0; + zbufp = rc->zbuf; + for (p = &gc->data[y * gc->width + x]; y < h; y++) { + memcpy(zbufp, p, w); + zbufp += w; + total += w; + p += gc->width; + } + + srect_hdr.encoding = htonl(RFB_ENCODING_RAW); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + total = stream_write(cfd, rc->zbuf, total); + + return (total); +} + +static int +rfb_send_all(struct rfb_softc *rc, int cfd, struct bhyvegc_image *gc) { struct rfb_srvr_updt_msg supdt_msg; struct rfb_srvr_rect_hdr srect_hdr; + ssize_t nwrite; + unsigned long zlen; + int err; + + /* + * Send the whole thing + */ /* Number of rectangles: 1 */ supdt_msg.type = 0; supdt_msg.pad = 0; - supdt_msg.numrects = ntohs(1); - write(fd, &supdt_msg, sizeof (struct rfb_srvr_updt_msg)); + supdt_msg.numrects = htons(1); + nwrite = stream_write(cfd, &supdt_msg, + sizeof(struct rfb_srvr_updt_msg)); + if (nwrite <= 0) + return (nwrite); /* Rectangle header */ - srect_hdr.x = ntohs(0); - srect_hdr.y = ntohs(0); - srect_hdr.width = ntohs(rc->width); - srect_hdr.height = ntohs(rc->height); - srect_hdr.encoding = ntohl(RFB_ENCODING_RESIZE); - write(fd, &srect_hdr, sizeof (struct rfb_srvr_rect_hdr)); + srect_hdr.x = 0; + srect_hdr.y = 0; + srect_hdr.width = htons(gc->width); + srect_hdr.height = htons(gc->height); + if (rc->enc_zlib_ok) { + rc->zstream.next_in = (Bytef *)gc->data; + rc->zstream.avail_in = gc->width * gc->height * + sizeof(uint32_t); + rc->zstream.next_out = (Bytef *)rc->zbuf; + rc->zstream.avail_out = RFB_ZLIB_BUFSZ + 16; + rc->zstream.data_type = Z_BINARY; + + rc->zstream.total_in = 0; + rc->zstream.total_out = 0; + + /* Compress with zlib */ + err = deflate(&rc->zstream, Z_SYNC_FLUSH); + if (err != Z_OK) { + WPRINTF(("zlib deflate err: %d\n", err)); + rc->enc_zlib_ok = false; + deflateEnd(&rc->zstream); + goto doraw; + } + + srect_hdr.encoding = htonl(RFB_ENCODING_ZLIB); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + zlen = htonl(rc->zstream.total_out); + nwrite = stream_write(cfd, 
&zlen, sizeof(uint32_t)); + if (nwrite <= 0) + return (nwrite); + return (stream_write(cfd, rc->zbuf, rc->zstream.total_out)); + } + +doraw: + srect_hdr.encoding = htonl(RFB_ENCODING_RAW); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + nwrite = stream_write(cfd, gc->data, + gc->width * gc->height * sizeof(uint32_t)); + + return (nwrite); } +#define PIX_PER_CELL 32 +#define PIXCELL_SHIFT 5 +#define PIXCELL_MASK 0x1F + +static int +rfb_send_screen(struct rfb_softc *rc, int cfd, int all) +{ + struct bhyvegc_image *gc_image; + ssize_t nwrite; + int x, y; + int celly, cellwidth; + int xcells, ycells; + int w, h; + uint32_t *p; + int rem_x, rem_y; /* remainder for resolutions not x32 pixels ratio */ + int retval; + uint32_t *crc_p, *orig_crc; + int changes; + + console_refresh(); + gc_image = console_get_image(); + + pthread_mutex_lock(&rc->mtx); + if (rc->sending) { + pthread_mutex_unlock(&rc->mtx); + return (1); + } + rc->sending = 1; + pthread_mutex_unlock(&rc->mtx); + + retval = 0; + + if (all) { + retval = rfb_send_all(rc, cfd, gc_image); + goto done; + } + + /* + * Calculate the checksum for each 32x32 cell. Send each that + * has changed since the last scan. + */ + + /* Resolution changed */ + + rc->crc_width = gc_image->width; + rc->crc_height = gc_image->height; + + w = rc->crc_width; + h = rc->crc_height; + xcells = howmany(rc->crc_width, PIX_PER_CELL); + ycells = howmany(rc->crc_height, PIX_PER_CELL); + + rem_x = w & PIXCELL_MASK; + + rem_y = h & PIXCELL_MASK; + if (!rem_y) + rem_y = PIX_PER_CELL; + + p = gc_image->data; + + /* + * Go through all cells and calculate crc. If significant number + * of changes, then send entire screen. + * crc_tmp is dual purpose: to store the new crc and to flag as + * a cell that has changed. + */ + crc_p = rc->crc_tmp - xcells; + orig_crc = rc->crc - xcells; + changes = 0; + memset(rc->crc_tmp, 0, sizeof(uint32_t) * xcells * ycells); + for (y = 0; y < h; y++) { + if ((y & PIXCELL_MASK) == 0) { + crc_p += xcells; + orig_crc += xcells; + } + + for (x = 0; x < xcells; x++) { + if (x == (xcells - 1) && rem_x > 0) + cellwidth = rem_x; + else + cellwidth = PIX_PER_CELL; + + if (rc->hw_crc) + crc_p[x] = fast_crc32(p, + cellwidth * sizeof(uint32_t), + crc_p[x]); + else + crc_p[x] = (uint32_t)crc32(crc_p[x], + (Bytef *)p, + cellwidth * sizeof(uint32_t)); + + p += cellwidth; + + /* check for crc delta if last row in cell */ + if ((y & PIXCELL_MASK) == PIXCELL_MASK || y == (h-1)) { + if (orig_crc[x] != crc_p[x]) { + orig_crc[x] = crc_p[x]; + crc_p[x] = 1; + changes++; + } else { + crc_p[x] = 0; + } + } + } + } + + /* If number of changes is > THRESH percent, send the whole screen */ + if (((changes * 100) / (xcells * ycells)) >= RFB_SEND_ALL_THRESH) { + retval = rfb_send_all(rc, cfd, gc_image); + goto done; + } + + /* Go through all cells, and send only changed ones */ + crc_p = rc->crc_tmp; + for (y = 0; y < h; y += PIX_PER_CELL) { + /* previous cell's row */ + celly = (y >> PIXCELL_SHIFT); + + /* Delta check crc to previous set */ + for (x = 0; x < xcells; x++) { + if (*crc_p++ == 0) + continue; + + if (x == (xcells - 1) && rem_x > 0) + cellwidth = rem_x; + else + cellwidth = PIX_PER_CELL; + nwrite = rfb_send_rect(rc, cfd, + gc_image, + x * PIX_PER_CELL, + celly * PIX_PER_CELL, + cellwidth, + y + PIX_PER_CELL >= h ? 
rem_y : PIX_PER_CELL); + if (nwrite <= 0) { + retval = nwrite; + goto done; + } + } + } + retval = 1; + +done: + pthread_mutex_lock(&rc->mtx); + rc->sending = 0; + pthread_mutex_unlock(&rc->mtx); + + return (retval); +} + + static void -rfb_recv_update_msg(struct rfb_softc *rc, int cfd) +rfb_recv_update_msg(struct rfb_softc *rc, int cfd, int discardonly) { struct rfb_updt_msg updt_msg; - struct rfb_srvr_updt_msg supdt_msg; - struct rfb_srvr_rect_hdr srect_hdr; struct bhyvegc_image *gc_image; - int len; - len = read(cfd, ((void *)&updt_msg) + 1 , sizeof(updt_msg) - 1); + (void)stream_read(cfd, ((void *)&updt_msg) + 1 , sizeof(updt_msg) - 1); console_refresh(); gc_image = console_get_image(); - if (rc->width != gc_image->width || rc->height != gc_image->height) { + updt_msg.x = htons(updt_msg.x); + updt_msg.y = htons(updt_msg.y); + updt_msg.width = htons(updt_msg.width); + updt_msg.height = htons(updt_msg.height); + + if (updt_msg.width != gc_image->width || + updt_msg.height != gc_image->height) { rc->width = gc_image->width; rc->height = gc_image->height; - rfb_send_resize_update_msg(rc, cfd); + if (rc->enc_resize_ok) + rfb_send_resize_update_msg(rc, cfd); } - /* - * Send the whole thing - */ - /* Number of rectangles: 1 */ - supdt_msg.type = 0; - supdt_msg.pad = 0; - supdt_msg.numrects = ntohs(1); - write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); + if (discardonly) + return; - /* Rectangle header */ - srect_hdr.x = ntohs(0); - srect_hdr.y = ntohs(0); - srect_hdr.width = ntohs(gc_image->width); - srect_hdr.height = ntohs(gc_image->height); - srect_hdr.encoding = ntohl(0); /* raw */ - write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); - - write(cfd, gc_image->data, gc_image->width * gc_image->height * - sizeof(uint32_t)); + rfb_send_screen(rc, cfd, 1); } static void rfb_recv_key_msg(struct rfb_softc *rc, int cfd) { struct rfb_key_msg key_msg; - int len; - len = read(cfd, ((void *)&key_msg) + 1, sizeof(key_msg) - 1); + (void)stream_read(cfd, ((void *)&key_msg) + 1, sizeof(key_msg) - 1); - console_key_event(key_msg.down, ntohl(key_msg.code)); + console_key_event(key_msg.down, htonl(key_msg.code)); } static void rfb_recv_ptr_msg(struct rfb_softc *rc, int cfd) { struct rfb_ptr_msg ptr_msg; + + (void)stream_read(cfd, ((void *)&ptr_msg) + 1, sizeof(ptr_msg) - 1); + + console_ptr_event(ptr_msg.button, htons(ptr_msg.x), htons(ptr_msg.y)); +} + +static void +rfb_recv_cuttext_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_cuttext_msg ct_msg; + unsigned char buf[32]; int len; - len = read(cfd, ((void *)&ptr_msg) + 1, sizeof(ptr_msg) - 1); + len = stream_read(cfd, ((void *)&ct_msg) + 1, sizeof(ct_msg) - 1); + ct_msg.length = htonl(ct_msg.length); + while (ct_msg.length > 0) { + len = stream_read(cfd, buf, ct_msg.length > sizeof(buf) ? 
+	    sizeof(buf) : ct_msg.length);
+		ct_msg.length -= len;
+	}
+}
 
-	console_ptr_event(ptr_msg.button, ntohs(ptr_msg.x), ntohs(ptr_msg.y));
+static int64_t
+timeval_delta(struct timeval *prev, struct timeval *now)
+{
+	int64_t n1, n2;
+	n1 = now->tv_sec * 1000000 + now->tv_usec;
+	n2 = prev->tv_sec * 1000000 + prev->tv_usec;
+	return (n1 - n2);
+}
+
+static void *
+rfb_wr_thr(void *arg)
+{
+	struct rfb_softc *rc;
+	fd_set rfds;
+	struct timeval tv;
+	struct timeval prev_tv;
+	int64_t tdiff;
+	int cfd;
+	int err;
+
+	rc = arg;
+	cfd = rc->cfd;
+
+	prev_tv.tv_sec = 0;
+	prev_tv.tv_usec = 0;
+	while (rc->cfd >= 0) {
+		FD_ZERO(&rfds);
+		FD_SET(cfd, &rfds);
+		tv.tv_sec = 0;
+		tv.tv_usec = 10000;
+
+		err = select(cfd+1, &rfds, NULL, NULL, &tv);
+		if (err < 0)
+			return (NULL);
+
+		/* Determine if it's time to push screen; ~24hz */
+		gettimeofday(&tv, NULL);
+		tdiff = timeval_delta(&prev_tv, &tv);
+		if (tdiff > 40000) {
+			prev_tv.tv_sec = tv.tv_sec;
+			prev_tv.tv_usec = tv.tv_usec;
+			if (rfb_send_screen(rc, cfd, 0) <= 0) {
+				return (NULL);
+			}
+		} else {
+			/* sleep */
+			usleep(40000 - tdiff);
+		}
+	}
+
+	return (NULL);
+}
 
 void
@@ -296,39 +749,145 @@ rfb_handle(struct rfb_softc *rc, int cfd)
 {
 	const char *vbuf = "RFB 003.008\n";
 	unsigned char buf[80];
+	unsigned char *message = NULL;
+
+#ifndef NO_OPENSSL
+	unsigned char challenge[AUTH_LENGTH];
+	unsigned char keystr[PASSWD_LENGTH];
+	unsigned char crypt_expected[AUTH_LENGTH];
+
+	DES_key_schedule ks;
+	int i;
+#endif
+
+	pthread_t tid;
+	uint32_t sres = 0;
 	int len;
-	uint32_t sres;
+	int perror = 1;
+
+	rc->cfd = cfd;
 
 	/* 1a. Send server version */
-	printf("server vers write: (%s), %d bytes\n", vbuf, (int) strlen(vbuf));
-	write(cfd, vbuf, strlen(vbuf));
+	stream_write(cfd, vbuf, strlen(vbuf));
 
 	/* 1b. Read client version */
 	len = read(cfd, buf, sizeof(buf));
 
-	/* 2a. Send security type 'none' */
+	/* 2a. Send security type */
 	buf[0] = 1;
-	buf[1] = 1; /* none */
-	write(cfd, buf, 2);
+#ifndef NO_OPENSSL
+	if (rc->password)
+		buf[1] = SECURITY_TYPE_VNC_AUTH;
+	else
+		buf[1] = SECURITY_TYPE_NONE;
+#else
+	buf[1] = SECURITY_TYPE_NONE;
+#endif
+
+	stream_write(cfd, buf, 2);
 
 	/* 2b. Read agreed security type */
-	len = read(cfd, buf, 1);
+	len = stream_read(cfd, buf, 1);
+
+	/* 2c. Do VNC authentication */
+	switch (buf[0]) {
+	case SECURITY_TYPE_NONE:
+		sres = 0;
+		break;
+	case SECURITY_TYPE_VNC_AUTH:
+		/*
+		 * The client encrypts the challenge with DES, using a password
+		 * supplied by the user as the key.
+		 * To form the key, the password is truncated to
+		 * eight characters, or padded with null bytes on the right.
+		 * The client then sends the resulting 16-byte response.
+		 */
+#ifndef NO_OPENSSL
+		strncpy(keystr, rc->password, PASSWD_LENGTH);
+
+		/* VNC clients encrypt the challenge with all the bit fields
+		 * in each byte of the password mirrored.
+		 * Here we flip each byte of the keystr.
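+		 * For example, the password byte 'a' (0x61, binary
+		 * 01100001) is used as the DES key byte 0x86 (10000110).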
+ */ + for (i = 0; i < PASSWD_LENGTH; i++) { + keystr[i] = (keystr[i] & 0xF0) >> 4 + | (keystr[i] & 0x0F) << 4; + keystr[i] = (keystr[i] & 0xCC) >> 2 + | (keystr[i] & 0x33) << 2; + keystr[i] = (keystr[i] & 0xAA) >> 1 + | (keystr[i] & 0x55) << 1; + } + + /* Initialize a 16-byte random challenge */ + arc4random_buf(challenge, sizeof(challenge)); + stream_write(cfd, challenge, AUTH_LENGTH); + + /* Receive the 16-byte challenge response */ + stream_read(cfd, buf, AUTH_LENGTH); + + memcpy(crypt_expected, challenge, AUTH_LENGTH); + + /* Encrypt the Challenge with DES */ + DES_set_key((const_DES_cblock *)keystr, &ks); + DES_ecb_encrypt((const_DES_cblock *)challenge, + (const_DES_cblock *)crypt_expected, + &ks, DES_ENCRYPT); + DES_ecb_encrypt((const_DES_cblock *)(challenge + PASSWD_LENGTH), + (const_DES_cblock *)(crypt_expected + + PASSWD_LENGTH), + &ks, DES_ENCRYPT); + + if (memcmp(crypt_expected, buf, AUTH_LENGTH) != 0) { + message = "Auth Failed: Invalid Password."; + sres = htonl(1); + } else + sres = 0; +#else + sres = 0; + WPRINTF(("Auth not supported, no OpenSSL in your system")); +#endif - /* 2c. Write back a status of 0 */ - sres = 0; - write(cfd, &sres, 4); + break; + } + + /* 2d. Write back a status */ + stream_write(cfd, &sres, 4); + + if (sres) { +#ifdef __FreeBSD__ + be32enc(buf, strlen(message)); + stream_write(cfd, buf, 4); + stream_write(cfd, message, strlen(message)); +#else + be32enc(buf, strlen((char *)message)); + stream_write(cfd, buf, 4); + stream_write(cfd, message, strlen((char *)message)); +#endif + goto done; + } /* 3a. Read client shared-flag byte */ - len = read(cfd, buf, 1); + len = stream_read(cfd, buf, 1); /* 4a. Write server-init info */ rfb_send_server_init_msg(cfd); + if (!rc->zbuf) { + rc->zbuf = malloc(RFB_ZLIB_BUFSZ + 16); + assert(rc->zbuf != NULL); + } + + rfb_send_screen(rc, cfd, 1); + + perror = pthread_create(&tid, NULL, rfb_wr_thr, rc); + if (perror == 0) + pthread_set_name_np(tid, "rfbout"); + /* Now read in client requests. 
1st byte identifies type */ for (;;) { len = read(cfd, buf, 1); if (len <= 0) { - printf("exiting\n"); + DPRINTF(("rfb client exiting\r\n")); break; } @@ -340,7 +899,7 @@ rfb_handle(struct rfb_softc *rc, int cfd) rfb_recv_set_encodings_msg(rc, cfd); break; case 3: - rfb_recv_update_msg(rc, cfd); + rfb_recv_update_msg(rc, cfd, 1); break; case 4: rfb_recv_key_msg(rc, cfd); @@ -348,11 +907,20 @@ rfb_handle(struct rfb_softc *rc, int cfd) case 5: rfb_recv_ptr_msg(rc, cfd); break; + case 6: + rfb_recv_cuttext_msg(rc, cfd); + break; default: - printf("unknown client code!\n"); - exit(1); + WPRINTF(("rfb unknown cli-code %d!\n", buf[0] & 0xff)); + goto done; } } +done: + rc->cfd = -1; + if (perror == 0) + pthread_join(tid, NULL); + if (rc->enc_zlib_ok) + deflateEnd(&rc->zstream); } static void * @@ -373,48 +941,208 @@ rfb_thr(void *arg) } for (;;) { + rc->enc_raw_ok = false; + rc->enc_zlib_ok = false; + rc->enc_resize_ok = false; + cfd = accept(rc->sfd, NULL, NULL); + if (rc->conn_wait) { + pthread_mutex_lock(&rc->mtx); + pthread_cond_signal(&rc->cond); + pthread_mutex_unlock(&rc->mtx); + rc->conn_wait = 0; + } rfb_handle(rc, cfd); + close(cfd); } /* NOTREACHED */ return (NULL); } +static int +sse42_supported(void) +{ + u_int cpu_registers[4], ecx; + + do_cpuid(1, cpu_registers); + + ecx = cpu_registers[2]; + + return ((ecx & CPUID2_SSE42) != 0); +} + int -rfb_init(int port) +rfb_init(char *hostname, int port, int wait, char *password) { + int e; + char servname[6]; struct rfb_softc *rc; - struct sockaddr_in sin; + struct addrinfo *ai; + struct addrinfo hints; int on = 1; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif rc = calloc(1, sizeof(struct rfb_softc)); - rc->sfd = socket(AF_INET, SOCK_STREAM, 0); + rc->crc = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof(uint32_t)); + rc->crc_tmp = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof(uint32_t)); + rc->crc_width = RFB_MAX_WIDTH; + rc->crc_height = RFB_MAX_HEIGHT; + + rc->password = password; + + snprintf(servname, sizeof(servname), "%d", port ? 
port : 5900); + + if (!hostname || strlen(hostname) == 0) +#if defined(INET) + hostname = "127.0.0.1"; +#elif defined(INET6) + hostname = "[::1]"; +#endif + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_NUMERICHOST | AI_NUMERICSERV | AI_PASSIVE; + + if ((e = getaddrinfo(hostname, servname, &hints, &ai)) != 0) { + fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(e)); + return(-1); + } + + rc->sfd = socket(ai->ai_family, ai->ai_socktype, 0); if (rc->sfd < 0) { perror("socket"); + freeaddrinfo(ai); return (-1); } setsockopt(rc->sfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); -#ifdef __FreeBSD__ - sin.sin_len = sizeof(sin); -#endif - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(INADDR_ANY); - sin.sin_port = htons(port); - if (bind(rc->sfd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + if (bind(rc->sfd, ai->ai_addr, ai->ai_addrlen) < 0) { perror("bind"); + freeaddrinfo(ai); return (-1); } if (listen(rc->sfd, 1) < 0) { perror("listen"); + freeaddrinfo(ai); return (-1); } +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(rc->sfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + rc->hw_crc = sse42_supported(); + + rc->conn_wait = wait; + if (wait) { + pthread_mutex_init(&rc->mtx, NULL); + pthread_cond_init(&rc->cond, NULL); + } + pthread_create(&rc->tid, NULL, rfb_thr, rc); + pthread_set_name_np(rc->tid, "rfb"); + + if (wait) { + DPRINTF(("Waiting for rfb client...\n")); + pthread_mutex_lock(&rc->mtx); + pthread_cond_wait(&rc->cond, &rc->mtx); + pthread_mutex_unlock(&rc->mtx); + } + freeaddrinfo(ai); return (0); } + +#ifndef __FreeBSD__ +int +rfb_init_unix(char *path, int wait, char *password) +{ + struct rfb_softc *rc; + struct sockaddr_un sock; + + if ((rc = calloc(1, sizeof (struct rfb_softc))) == NULL) { + perror("calloc"); + return (-1); + } + rc->sfd = -1; + + if ((rc->crc = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof (uint32_t))) == NULL) { + perror("calloc"); + goto fail; + } + if ((rc->crc_tmp = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof (uint32_t))) == NULL) { + perror("calloc"); + goto fail; + } + rc->crc_width = RFB_MAX_WIDTH; + rc->crc_height = RFB_MAX_HEIGHT; + + rc->password = password; + + rc->sfd = socket(PF_UNIX, SOCK_STREAM, 0); + if (rc->sfd < 0) { + perror("socket"); + goto fail; + } + + sock.sun_family = AF_UNIX; + if (strlcpy(sock.sun_path, path, sizeof (sock.sun_path)) >= + sizeof (sock.sun_path)) { + (void) fprintf(stderr, "socket path '%s' too long\n", path); + goto fail; + } + + (void) unlink(path); + if (bind(rc->sfd, (struct sockaddr *)&sock, sizeof (sock)) < 0) { + perror("bind"); + goto fail; + } + + if (listen(rc->sfd, 1) < 0) { + perror("listen"); + goto fail; + } + + rc->hw_crc = sse42_supported(); + + rc->conn_wait = wait; + if (wait) { + VERIFY3S(pthread_mutex_init(&rc->mtx, NULL), ==, 0); + VERIFY3S(pthread_cond_init(&rc->cond, NULL), ==, 0); + } + + VERIFY3S(pthread_create(&rc->tid, NULL, rfb_thr, rc), ==, 0); + pthread_set_name_np(rc->tid, "rfb"); + + if (wait) { + DPRINTF(("Waiting for rfb client...\n")); + VERIFY3S(pthread_mutex_lock(&rc->mtx), ==, 0); + VERIFY3S(pthread_cond_wait(&rc->cond, &rc->mtx), ==, 0); + VERIFY3S(pthread_mutex_unlock(&rc->mtx), ==, 0); + } + + return (0); + +fail: + if (rc->sfd != -1) { + VERIFY3S(close(rc->sfd), ==, 0); + } + free(rc->crc); + free(rc->crc_tmp); + free(rc); + return (-1); +} +#endif 
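
The dirty-cell scan in rfb_send_screen() above carves the frame buffer into
32x32-pixel cells and resends only the cells whose checksum changed since the
last pass. The following minimal standalone sketch of the cell-grid arithmetic
is illustrative only and not part of the patch; the 1366x768 mode is an
assumed example, and howmany() is written out here to mirror the system macro:

	#include <stdio.h>

	/* howmany() mirrors the <sys/param.h> macro used by rfb.c */
	#define	howmany(x, y)	(((x) + ((y) - 1)) / (y))
	#define	PIX_PER_CELL	32
	#define	PIXCELL_MASK	0x1F

	int
	main(void)
	{
		int w = 1366, h = 768;			/* assumed mode */
		int xcells = howmany(w, PIX_PER_CELL);	/* 43 cells across */
		int ycells = howmany(h, PIX_PER_CELL);	/* 24 cells down */
		int rem_x = w & PIXCELL_MASK;	/* 22: last column is 22px */
		int rem_y = h & PIXCELL_MASK;	/* 0: height is multiple of 32 */

		if (rem_y == 0)
			rem_y = PIX_PER_CELL;	/* treat as full-height cell */

		printf("%dx%d grid, edge cells %dpx wide, %dpx tall\n",
		    xcells, ycells, rem_x, rem_y);
		return (0);
	}

Because PIX_PER_CELL is a power of two, "w & PIXCELL_MASK" is equivalent to
"w % PIX_PER_CELL". Only rem_y needs the zero-case normalization above;
rfb_send_screen() consults rem_x solely when it is non-zero.
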
diff --git a/usr/src/cmd/bhyve/rfb.h b/usr/src/cmd/bhyve/rfb.h index 5504c333ab..990e2075ac 100644 --- a/usr/src/cmd/bhyve/rfb.h +++ b/usr/src/cmd/bhyve/rfb.h @@ -1,5 +1,8 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale + * Copyright 2018 Joyent, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,6 +34,9 @@ #define RFB_PORT 5900 -int rfb_init(int port); +int rfb_init(char *hostname, int port, int wait, char *password); +#ifndef __FreeBSD__ +int rfb_init_unix(char *path, int wait, char *password); +#endif #endif /* _RFB_H_ */ diff --git a/usr/src/cmd/bhyve/rtc.c b/usr/src/cmd/bhyve/rtc.c index 5ab78e060f..09ca3f61ae 100644 --- a/usr/src/cmd/bhyve/rtc.c +++ b/usr/src/cmd/bhyve/rtc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,17 +25,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $"); +__FBSDID("$FreeBSD$"); #include -#include -#include -#include #include #include @@ -41,300 +40,45 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $" #include #include "acpi.h" -#include "inout.h" #include "pci_lpc.h" #include "rtc.h" -#define IO_RTC 0x70 - -#define RTC_SEC 0x00 /* seconds */ -#define RTC_SEC_ALARM 0x01 -#define RTC_MIN 0x02 -#define RTC_MIN_ALARM 0x03 -#define RTC_HRS 0x04 -#define RTC_HRS_ALARM 0x05 -#define RTC_WDAY 0x06 -#define RTC_DAY 0x07 -#define RTC_MONTH 0x08 -#define RTC_YEAR 0x09 -#define RTC_CENTURY 0x32 /* current century */ - -#define RTC_STATUSA 0xA -#define RTCSA_TUP 0x80 /* time update, don't look now */ - -#define RTC_STATUSB 0xB -#define RTCSB_DST 0x01 -#define RTCSB_24HR 0x02 -#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */ -#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */ -#define RTCSB_HALT 0x80 /* stop clock updates */ +#define IO_RTC 0x70 -#define RTC_INTR 0x0c /* status register C (R) interrupt source */ - -#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */ -#define RTCSD_PWR 0x80 /* clock power OK */ - -#define RTC_NVRAM_START 0x0e -#define RTC_NVRAM_END 0x7f -#define RTC_NVRAM_SZ (128 - RTC_NVRAM_START) -#define nvoff(x) ((x) - RTC_NVRAM_START) - -#define RTC_DIAG 0x0e -#define RTC_RSTCODE 0x0f -#define RTC_EQUIPMENT 0x14 #define RTC_LMEM_LSB 0x34 #define RTC_LMEM_MSB 0x35 #define RTC_HMEM_LSB 0x5b #define RTC_HMEM_SB 0x5c #define RTC_HMEM_MSB 0x5d -#define m_64KB (64*1024) +#define m_64KB (64*1024) #define m_16MB (16*1024*1024) #define m_4GB (4ULL*1024*1024*1024) -static int addr; - -static uint8_t rtc_nvram[RTC_NVRAM_SZ]; - -/* XXX initialize these to default values as they would be from BIOS */ -static uint8_t status_a, status_b; - -static struct { - uint8_t hours; - uint8_t mins; - uint8_t secs; -} rtc_alarm; - -static u_char const bin2bcd_data[] = { - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, - 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, - 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, - 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, - 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, - 
0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, - 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, - 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 -}; -#define bin2bcd(bin) (bin2bcd_data[bin]) - -#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val))) - -static void -timevalfix(struct timeval *t1) -{ - - if (t1->tv_usec < 0) { - t1->tv_sec--; - t1->tv_usec += 1000000; - } - if (t1->tv_usec >= 1000000) { - t1->tv_sec++; - t1->tv_usec -= 1000000; - } -} - -static void -timevalsub(struct timeval *t1, const struct timeval *t2) -{ - - t1->tv_sec -= t2->tv_sec; - t1->tv_usec -= t2->tv_usec; - timevalfix(t1); -} - -static int -rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) -{ - if (bytes != 1) - return (-1); - - if (in) { - /* straight read of this register will return 0xFF */ - *eax = 0xff; - return (0); - } - - switch (*eax & 0x7f) { - case RTC_SEC: - case RTC_SEC_ALARM: - case RTC_MIN: - case RTC_MIN_ALARM: - case RTC_HRS: - case RTC_HRS_ALARM: - case RTC_WDAY: - case RTC_DAY: - case RTC_MONTH: - case RTC_YEAR: - case RTC_STATUSA: - case RTC_STATUSB: - case RTC_INTR: - case RTC_STATUSD: - case RTC_NVRAM_START ... RTC_NVRAM_END: - break; - default: - return (-1); - } - - addr = *eax & 0x7f; - return (0); -} - -static int -rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) +/* + * Returns the current RTC time as number of seconds since 00:00:00 Jan 1, 1970 + */ +static time_t +rtc_time(struct vmctx *ctx, int use_localtime) { - int hour; + struct tm tm; time_t t; - struct timeval cur, delta; - - static struct timeval last; - static struct tm tm; - - if (bytes != 1) - return (-1); - - gettimeofday(&cur, NULL); - /* - * Increment the cached time only once per second so we can guarantee - * that the guest has at least one second to read the hour:min:sec - * separately and still get a coherent view of the time. - */ - delta = cur; - timevalsub(&delta, &last); - if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) { - t = cur.tv_sec; + time(&t); + if (use_localtime) { localtime_r(&t, &tm); - last = cur; - } - - if (in) { - switch (addr) { - case RTC_SEC_ALARM: - *eax = rtc_alarm.secs; - break; - case RTC_MIN_ALARM: - *eax = rtc_alarm.mins; - break; - case RTC_HRS_ALARM: - *eax = rtc_alarm.hours; - break; - case RTC_SEC: - *eax = rtcout(tm.tm_sec); - return (0); - case RTC_MIN: - *eax = rtcout(tm.tm_min); - return (0); - case RTC_HRS: - if (status_b & RTCSB_24HR) - hour = tm.tm_hour; - else - hour = (tm.tm_hour % 12) + 1; - - *eax = rtcout(hour); - - /* - * If we are representing time in the 12-hour format - * then set the MSB to indicate PM. - */ - if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12) - *eax |= 0x80; - - return (0); - case RTC_WDAY: - *eax = rtcout(tm.tm_wday + 1); - return (0); - case RTC_DAY: - *eax = rtcout(tm.tm_mday); - return (0); - case RTC_MONTH: - *eax = rtcout(tm.tm_mon + 1); - return (0); - case RTC_YEAR: - *eax = rtcout(tm.tm_year % 100); - return (0); - case RTC_STATUSA: - *eax = status_a; - return (0); - case RTC_STATUSB: - *eax = status_b; - return (0); - case RTC_INTR: - *eax = 0; - return (0); - case RTC_STATUSD: - *eax = RTCSD_PWR; - return (0); - case RTC_NVRAM_START ... 
RTC_NVRAM_END: - *eax = rtc_nvram[addr - RTC_NVRAM_START]; - return (0); - default: - return (-1); - } - } - - switch (addr) { - case RTC_STATUSA: - status_a = *eax & ~RTCSA_TUP; - break; - case RTC_STATUSB: - /* XXX not implemented yet XXX */ - if (*eax & RTCSB_PINTR) - return (-1); - status_b = *eax; - break; - case RTC_STATUSD: - /* ignore write */ - break; - case RTC_SEC_ALARM: - rtc_alarm.secs = *eax; - break; - case RTC_MIN_ALARM: - rtc_alarm.mins = *eax; - break; - case RTC_HRS_ALARM: - rtc_alarm.hours = *eax; - break; - case RTC_SEC: - case RTC_MIN: - case RTC_HRS: - case RTC_WDAY: - case RTC_DAY: - case RTC_MONTH: - case RTC_YEAR: - /* - * Ignore writes to the time of day registers - */ - break; - case RTC_NVRAM_START ... RTC_NVRAM_END: - rtc_nvram[addr - RTC_NVRAM_START] = *eax; - break; - default: - return (-1); + t = timegm(&tm); } - return (0); + return (t); } void -rtc_init(struct vmctx *ctx) +rtc_init(struct vmctx *ctx, int use_localtime) { - struct timeval cur; - struct tm tm; size_t himem; size_t lomem; int err; - err = gettimeofday(&cur, NULL); - assert(err == 0); - (void) localtime_r(&cur.tv_sec, &tm); - - memset(rtc_nvram, 0, sizeof(rtc_nvram)); - - rtc_nvram[nvoff(RTC_CENTURY)] = bin2bcd((tm.tm_year + 1900) / 100); - /* XXX init diag/reset code/equipment/checksum ? */ /* @@ -344,19 +88,23 @@ rtc_init(struct vmctx *ctx) * 0x5b/0x5c/0x5d - 64KB chunks above 4GB */ lomem = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB; - rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem; - rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8; + err = vm_rtc_write(ctx, RTC_LMEM_LSB, lomem); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_LMEM_MSB, lomem >> 8); + assert(err == 0); himem = vm_get_highmem_size(ctx) / m_64KB; - rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem; - rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8; - rtc_nvram[nvoff(RTC_HMEM_MSB)] = himem >> 16; -} + err = vm_rtc_write(ctx, RTC_HMEM_LSB, himem); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_HMEM_SB, himem >> 8); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_HMEM_MSB, himem >> 16); + assert(err == 0); -INOUT_PORT(rtc, IO_RTC, IOPORT_F_INOUT, rtc_addr_handler); -INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler); + err = vm_rtc_settime(ctx, rtc_time(ctx, use_localtime)); + assert(err == 0); +} -#ifdef __FreeBSD__ static void rtc_dsdt(void) { @@ -375,6 +123,9 @@ rtc_dsdt(void) dsdt_line("}"); } LPC_DSDT(rtc_dsdt); -#endif +/* + * Reserve the extended RTC I/O ports although they are not emulated at this + * time. + */ SYSRES_IO(0x72, 6); diff --git a/usr/src/cmd/bhyve/rtc.h b/usr/src/cmd/bhyve/rtc.h index 6406d24c37..1c108eed99 100644 --- a/usr/src/cmd/bhyve/rtc.h +++ b/usr/src/cmd/bhyve/rtc.h @@ -1,4 +1,6 @@ -/* +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Peter Grehan * All rights reserved. * @@ -23,12 +25,12 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/rtc.h 253181 2013-07-11 03:54:35Z grehan $ + * $FreeBSD$ */ #ifndef _RTC_H_ #define _RTC_H_ -void rtc_init(struct vmctx *ctx); +void rtc_init(struct vmctx *ctx, int use_localtime); #endif /* _RTC_H_ */ diff --git a/usr/src/cmd/bhyve/smbiostbl.c b/usr/src/cmd/bhyve/smbiostbl.c index 7ba0f0dfa0..da227f813a 100644 --- a/usr/src/cmd/bhyve/smbiostbl.c +++ b/usr/src/cmd/bhyve/smbiostbl.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. 
* @@ -25,7 +27,7 @@ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/smbiostbl.c 272007 2014-09-23 01:17:22Z grehan $"); +__FBSDID("$FreeBSD$"); #include @@ -33,6 +35,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/smbiostbl.c 272007 2014-09-23 01:17:22Z #include #include #include +#include #include #include #include @@ -321,8 +324,8 @@ struct smbios_table_type0 smbios_type0_template = { const char *smbios_type0_strings[] = { "BHYVE", /* vendor string */ - __TIME__, /* bios version string */ - __DATE__, /* bios release date string */ + "1.00", /* bios version string */ + "03/14/2014", /* bios release date string */ NULL }; @@ -634,7 +637,7 @@ smbios_type4_initializer(struct smbios_structure *template_entry, { int i; - for (i = 0; i < guest_ncpus; i++) { + for (i = 0; i < sockets; i++) { struct smbios_table_type4 *type4; char *p; int nstrings, len; @@ -653,6 +656,16 @@ smbios_type4_initializer(struct smbios_structure *template_entry, *(*endaddr) = '\0'; (*endaddr)++; type4->socket = nstrings + 1; + /* Revise cores and threads after update to smbios 3.0 */ + if (cores > 254) + type4->cores = 0; + else + type4->cores = cores; + /* This threads is total threads in a socket */ + if ((cores * threads) > 254) + type4->threads = 0; + else + type4->threads = (cores * threads); curaddr = *endaddr; } @@ -825,3 +838,80 @@ smbios_build(struct vmctx *ctx) return (0); } + +int +smbios_parse(const char *opts) +{ + char *buf; + char *lasts; + char *token; + char *end; + long type; + struct { + const char *key; + const char **targetp; + } type1_map[] = { + { "manufacturer", &smbios_type1_strings[0] }, + { "product", &smbios_type1_strings[1] }, + { "version", &smbios_type1_strings[2] }, + { "serial", &smbios_type1_strings[3] }, + { "sku", &smbios_type1_strings[4] }, + { "family", &smbios_type1_strings[5] }, + { "uuid", (const char **)&guest_uuid_str }, + { 0 } + }; + + if ((buf = strdup(opts)) == NULL) { + (void) fprintf(stderr, "out of memory\n"); + return (-1); + } + + if ((token = strtok_r(buf, ",", &lasts)) == NULL) { + (void) fprintf(stderr, "too few fields\n"); + goto fail; + } + + errno = 0; + type = strtol(token, &end, 10); + if (errno != 0 || *end != '\0') { + (void) fprintf(stderr, "first token '%s' is not an integer\n", + token); + goto fail; + } + + /* For now, only type 1 is supported. */ + if (type != 1) { + (void) fprintf(stderr, "unsupported type %d\n", type); + goto fail; + } + + while ((token = strtok_r(NULL, ",", &lasts)) != NULL) { + char *val; + int i; + + if ((val = strchr(token, '=')) == NULL) { + (void) fprintf(stderr, "invalid key=value: '%s'\n", + token); + goto fail; + } + *val = '\0'; + val++; + + for (i = 0; type1_map[i].key != NULL; i++) { + if (strcmp(token, type1_map[i].key) == 0) { + break; + } + } + if (type1_map[i].key == NULL) { + (void) fprintf(stderr, "invalid key '%s'\n", token); + goto fail; + } + *type1_map[i].targetp = val; + } + + return (0); + +fail: + free(buf); + return (-1); +} diff --git a/usr/src/cmd/bhyve/smbiostbl.h b/usr/src/cmd/bhyve/smbiostbl.h index fd7f86be80..81e26309e5 100644 --- a/usr/src/cmd/bhyve/smbiostbl.h +++ b/usr/src/cmd/bhyve/smbiostbl.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * @@ -23,7 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/smbiostbl.h 262744 2014-03-04 17:12:06Z tychon $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _SMBIOSTBL_H_ @@ -32,5 +38,6 @@ struct vmctx; int smbios_build(struct vmctx *ctx); +int smbios_parse(const char *opts); #endif /* _SMBIOSTBL_H_ */ diff --git a/usr/src/cmd/bhyve/sockstream.c b/usr/src/cmd/bhyve/sockstream.c new file mode 100644 index 0000000000..b592bce9aa --- /dev/null +++ b/usr/src/cmd/bhyve/sockstream.c @@ -0,0 +1,86 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Nahanni Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include + +#include "sockstream.h" + +ssize_t +stream_read(int fd, void *buf, ssize_t nbytes) +{ + uint8_t *p; + ssize_t len = 0; + ssize_t n; + + p = buf; + + while (len < nbytes) { + n = read(fd, p + len, nbytes - len); + if (n == 0) + break; + + if (n < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + return (n); + } + len += n; + } + return (len); +} + +ssize_t +stream_write(int fd, const void *buf, ssize_t nbytes) +{ + const uint8_t *p; + ssize_t len = 0; + ssize_t n; + + p = buf; + + while (len < nbytes) { + n = write(fd, p + len, nbytes - len); + if (n == 0) + break; + if (n < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + return (n); + } + len += n; + } + return (len); +} diff --git a/usr/src/cmd/bhyve/sockstream.h b/usr/src/cmd/bhyve/sockstream.h new file mode 100644 index 0000000000..ecea849471 --- /dev/null +++ b/usr/src/cmd/bhyve/sockstream.h @@ -0,0 +1,35 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Nahanni Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +#include + +ssize_t stream_read(int fd, void *buf, ssize_t nbytes); +ssize_t stream_write(int fd, const void *buf, ssize_t nbytes); diff --git a/usr/src/cmd/bhyve/spinup_ap.c b/usr/src/cmd/bhyve/spinup_ap.c index e1dd562d3f..7c4186f5ed 100644 --- a/usr/src/cmd/bhyve/spinup_ap.c +++ b/usr/src/cmd/bhyve/spinup_ap.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/spinup_ap.c 263432 2014-03-20 18:15:37Z neel $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/spinup_ap.c 263432 2014-03-20 18:15:37Z neel $"); +__FBSDID("$FreeBSD$"); #include #include diff --git a/usr/src/cmd/bhyve/spinup_ap.h b/usr/src/cmd/bhyve/spinup_ap.h index 090de091ba..226542f6c3 100644 --- a/usr/src/cmd/bhyve/spinup_ap.h +++ b/usr/src/cmd/bhyve/spinup_ap.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/spinup_ap.h 240912 2012-09-25 02:33:25Z neel $ + * $FreeBSD$ */ #ifndef _SPINUP_AP_H_ diff --git a/usr/src/cmd/bhyve/task_switch.c b/usr/src/cmd/bhyve/task_switch.c new file mode 100644 index 0000000000..b5950a19d8 --- /dev/null +++ b/usr/src/cmd/bhyve/task_switch.c @@ -0,0 +1,941 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include "bhyverun.h" + +/* + * Using 'struct i386tss' is tempting but causes myriad sign extension + * issues because all of its fields are defined as signed integers. + */ +struct tss32 { + uint16_t tss_link; + uint16_t rsvd1; + uint32_t tss_esp0; + uint16_t tss_ss0; + uint16_t rsvd2; + uint32_t tss_esp1; + uint16_t tss_ss1; + uint16_t rsvd3; + uint32_t tss_esp2; + uint16_t tss_ss2; + uint16_t rsvd4; + uint32_t tss_cr3; + uint32_t tss_eip; + uint32_t tss_eflags; + uint32_t tss_eax; + uint32_t tss_ecx; + uint32_t tss_edx; + uint32_t tss_ebx; + uint32_t tss_esp; + uint32_t tss_ebp; + uint32_t tss_esi; + uint32_t tss_edi; + uint16_t tss_es; + uint16_t rsvd5; + uint16_t tss_cs; + uint16_t rsvd6; + uint16_t tss_ss; + uint16_t rsvd7; + uint16_t tss_ds; + uint16_t rsvd8; + uint16_t tss_fs; + uint16_t rsvd9; + uint16_t tss_gs; + uint16_t rsvd10; + uint16_t tss_ldt; + uint16_t rsvd11; + uint16_t tss_trap; + uint16_t tss_iomap; +}; +static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed"); + +#define SEL_START(sel) (((sel) & ~0x7)) +#define SEL_LIMIT(sel) (((sel) | 0x7)) +#define TSS_BUSY(type) (((type) & 0x2) != 0) + +static uint64_t +GETREG(struct vmctx *ctx, int vcpu, int reg) +{ + uint64_t val; + int error; + + error = vm_get_register(ctx, vcpu, reg, &val); + assert(error == 0); + return (val); +} + +static void +SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ + int error; + + error = vm_set_register(ctx, vcpu, reg, val); + assert(error == 0); +} + +static struct seg_desc +usd_to_seg_desc(struct user_segment_descriptor *usd) +{ + struct seg_desc seg_desc; + + seg_desc.base = (u_int)USD_GETBASE(usd); + if (usd->sd_gran) + seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; + else + seg_desc.limit = (u_int)USD_GETLIMIT(usd); + seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7; + seg_desc.access |= usd->sd_xx << 12; + seg_desc.access |= usd->sd_def32 << 14; + seg_desc.access |= usd->sd_gran << 15; + + return (seg_desc); +} + +/* + * Inject an exception with an error code that is a segment selector. + * The format of the error code is described in section 6.13, "Error Code", + * Intel SDM volume 3. + * + * Bit 0 (EXT) denotes whether the exception occurred during delivery + * of an external event like an interrupt. + * + * Bit 1 (IDT) indicates whether the selector points to a gate descriptor + * in the IDT. + * + * Bit 2(GDT/LDT) has the usual interpretation of Table Indicator (TI). + */ +static void +sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext) +{ + /* + * Bit 2 from the selector is retained as-is in the error code. + * + * Bit 1 can be safely cleared because none of the selectors + * encountered during task switch emulation refer to a task + * gate in the IDT. 
+ * + * Bit 0 is set depending on the value of 'ext'. + */ + sel &= ~0x3; + if (ext) + sel |= 0x1; + vm_inject_fault(ctx, vcpu, vector, 1, sel); +} + +/* + * Return 0 if the selector 'sel' in within the limits of the GDT/LDT + * and non-zero otherwise. + */ +static int +desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel) +{ + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + assert(error == 0); + + if (reg == VM_REG_GUEST_LDTR) { + if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) + return (-1); + } + + if (limit < SEL_LIMIT(sel)) + return (-1); + else + return (0); +} + +/* + * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced + * by the selector 'sel'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, bool doread, + int *faultptr) +{ + struct iovec iov[2]; + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + assert(error == 0); + assert(limit >= SEL_LIMIT(sel)); + + error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel), + sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov), + faultptr); + if (error || *faultptr) + return (error); + + if (doread) + vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc)); + else + vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc)); + return (0); +} + +static int +desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr)); +} + +static int +desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr)); +} + +/* + * Read the TSS descriptor referenced by 'sel' into 'desc'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. 
+ */ +static int +read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + struct vm_guest_paging sup_paging; + int error; + + assert(!ISLDT(sel)); + assert(IDXSEL(sel) != 0); + + /* Fetch the new TSS descriptor */ + if (desc_table_limit_check(ctx, vcpu, sel)) { + if (ts->reason == TSR_IRET) + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + else + sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext); + return (1); + } + + sup_paging = ts->paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr); + return (error); +} + +static bool +code_desc(int sd_type) +{ + /* code descriptor */ + return ((sd_type & 0x18) == 0x18); +} + +static bool +stack_desc(int sd_type) +{ + /* writable data descriptor */ + return ((sd_type & 0x1A) == 0x12); +} + +static bool +data_desc(int sd_type) +{ + /* data descriptor or a readable code descriptor */ + return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); +} + +static bool +ldt_desc(int sd_type) +{ + + return (sd_type == SDT_SYSLDT); +} + +/* + * Validate the descriptor 'seg_desc' associated with 'segment'. + */ +static int +validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + int segment, struct seg_desc *seg_desc, int *faultptr) +{ + struct vm_guest_paging sup_paging; + struct user_segment_descriptor usd; + int error, idtvec; + int cpl, dpl, rpl; + uint16_t sel, cs; + bool ldtseg, codeseg, stackseg, dataseg, conforming; + + ldtseg = codeseg = stackseg = dataseg = false; + switch (segment) { + case VM_REG_GUEST_LDTR: + ldtseg = true; + break; + case VM_REG_GUEST_CS: + codeseg = true; + break; + case VM_REG_GUEST_SS: + stackseg = true; + break; + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + dataseg = true; + break; + default: + assert(0); + } + + /* Get the segment selector */ + sel = GETREG(ctx, vcpu, segment); + + /* LDT selector must point into the GDT */ + if (ldtseg && ISLDT(sel)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* Descriptor table limit check */ + if (desc_table_limit_check(ctx, vcpu, sel)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* NULL selector */ + if (IDXSEL(sel) == 0) { + /* Code and stack segment selectors cannot be NULL */ + if (codeseg || stackseg) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + seg_desc->base = 0; + seg_desc->limit = 0; + seg_desc->access = 0x10000; /* unusable */ + return (0); + } + + /* Read the descriptor from the GDT/LDT */ + sup_paging = ts->paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr); + if (error || *faultptr) + return (error); + + /* Verify that the descriptor type is compatible with the segment */ + if ((ldtseg && !ldt_desc(usd.sd_type)) || + (codeseg && !code_desc(usd.sd_type)) || + (dataseg && !data_desc(usd.sd_type)) || + (stackseg && !stack_desc(usd.sd_type))) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* Segment must be marked present */ + if (!usd.sd_p) { + if (ldtseg) + idtvec = IDT_TS; + else if (stackseg) + idtvec = IDT_SS; + else + idtvec = IDT_NP; + sel_exception(ctx, vcpu, idtvec, sel, ts->ext); + return (1); + } + + cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); + cpl = cs & SEL_RPL_MASK; + rpl = sel & SEL_RPL_MASK; + dpl = usd.sd_dpl; + + if (stackseg && 
(rpl != cpl || dpl != cpl)) {
+		sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+		return (1);
+	}
+
+	if (codeseg) {
+		conforming = (usd.sd_type & 0x4) ? true : false;
+		if ((conforming && (cpl < dpl)) ||
+		    (!conforming && (cpl != dpl))) {
+			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+			return (1);
+		}
+	}
+
+	if (dataseg) {
+		/*
+		 * A data segment is always non-conforming except when its
+		 * descriptor is a readable, conforming code segment.
+		 */
+		if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0)
+			conforming = true;
+		else
+			conforming = false;
+
+		if (!conforming && (rpl > dpl || cpl > dpl)) {
+			sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext);
+			return (1);
+		}
+	}
+	*seg_desc = usd_to_seg_desc(&usd);
+	return (0);
+}
+
+static void
+tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch,
+    uint32_t eip, struct tss32 *tss, struct iovec *iov)
+{
+
+	/* General purpose registers */
+	tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX);
+	tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX);
+	tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX);
+	tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX);
+	tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP);
+	tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP);
+	tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI);
+	tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI);
+
+	/* Segment selectors */
+	tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES);
+	tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS);
+	tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS);
+	tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS);
+	tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS);
+	tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS);
+
+	/* eflags and eip */
+	tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS);
+	if (task_switch->reason == TSR_IRET)
+		tss->tss_eflags &= ~PSL_NT;
+	tss->tss_eip = eip;
+
+	/* Copy updated old TSS into guest memory */
+	vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32));
+}
+
+static void
+update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd)
+{
+	int error;
+
+	error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access);
+	assert(error == 0);
+}
+
+/*
+ * Update the vcpu registers to reflect the state of the new task.
+ */
+static int
+tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts,
+    uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr)
+{
+	struct seg_desc seg_desc, seg_desc2;
+	uint64_t *pdpte, maxphyaddr, reserved;
+	uint32_t eflags;
+	int error, i;
+	bool nested;
+
+	nested = false;
+	if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) {
+		tss->tss_link = ot_sel;
+		nested = true;
+	}
+
+	eflags = tss->tss_eflags;
+	if (nested)
+		eflags |= PSL_NT;
+
+	/* LDTR */
+	SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt);
+
+	/* PDBR */
+	if (ts->paging.paging_mode != PAGING_MODE_FLAT) {
+		if (ts->paging.paging_mode == PAGING_MODE_PAE) {
+			/*
+			 * XXX Assuming 36-bit MAXPHYADDR.
+			 */
+			maxphyaddr = (1UL << 36) - 1;
+			pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32);
+			for (i = 0; i < 4; i++) {
+				/* Check reserved bits if the PDPTE is valid */
+				if (!(pdpte[i] & 0x1))
+					continue;
+				/*
+				 * Bits 2:1, 8:5 and bits above the processor's
+				 * maximum physical address are reserved.
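+				 * The 0x1E6 mask below covers bits 8:5
+				 * (0x1E0) and bits 2:1 (0x006).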
+ */ + reserved = ~maxphyaddr | 0x1E6; + if (pdpte[i] & reserved) { + vm_inject_gp(ctx, vcpu); + return (1); + } + } + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); + } + SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); + ts->paging.cr3 = tss->tss_cr3; + } + + /* eflags and eip */ + SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags); + SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip); + + /* General purpose registers */ + SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax); + SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); + SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx); + SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); + SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp); + SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); + SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi); + SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi); + + /* Segment selectors */ + SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es); + SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs); + SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss); + SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds); + SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs); + SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs); + + /* + * If this is a nested task then write out the new TSS to update + * the previous link field. + */ + if (nested) + vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss)); + + /* Validate segment descriptors */ + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc); + + /* + * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. + * + * The SS and CS attribute checks on VM-entry are inter-dependent so + * we need to make sure that both segments are valid before updating + * either of them. This ensures that the VMCS state can pass the + * VM-entry checks so the guest can handle any exception injected + * during task switch emulation. + */ + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2); + ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc); + + return (0); +} + +/* + * Push an error code on the stack of the new task. This is needed if the + * task switch was triggered by a hardware exception that causes an error + * code to be saved (e.g. #PF). 
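+ * The code is pushed as a word or a doubleword according to the type of
+ * the new task's TSS (the gate-size check below).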
+ */ +static int +push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + int task_type, uint32_t errcode, int *faultptr) +{ + struct iovec iov[2]; + struct seg_desc seg_desc; + int stacksize, bytes, error; + uint64_t gla, cr0, rflags; + uint32_t esp; + uint16_t stacksel; + + *faultptr = 0; + + cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); + rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); + stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base, + &seg_desc.limit, &seg_desc.access); + assert(error == 0); + + /* + * Section "Error Code" in the Intel SDM vol 3: the error code is + * pushed on the stack as a doubleword or word (depending on the + * default interrupt, trap or task gate size). + */ + if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) + bytes = 4; + else + bytes = 2; + + /* + * PUSH instruction from Intel SDM vol 2: the 'B' flag in the + * stack-segment descriptor determines the size of the stack + * pointer outside of 64-bit mode. + */ + if (SEG_DESC_DEF32(seg_desc.access)) + stacksize = 4; + else + stacksize = 2; + + esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); + esp -= bytes; + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, + &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { + sel_exception(ctx, vcpu, IDT_SS, stacksel, 1); + *faultptr = 1; + return (0); + } + + if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { + vm_inject_ac(ctx, vcpu, 1); + *faultptr = 1; + return (0); + } + + error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE, + iov, nitems(iov), faultptr); + if (error || *faultptr) + return (error); + + vm_copyout(ctx, vcpu, &errcode, iov, bytes); + SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp); + return (0); +} + +/* + * Evaluate return value from helper functions and potentially return to + * the VM run loop. + */ +#define CHKERR(error,fault) \ + do { \ + assert((error == 0) || (error == EFAULT)); \ + if (error) \ + return (VMEXIT_ABORT); \ + else if (fault) \ + return (VMEXIT_CONTINUE); \ + } while (0) + +int +vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + struct seg_desc nt; + struct tss32 oldtss, newtss; + struct vm_task_switch *task_switch; + struct vm_guest_paging *paging, sup_paging; + struct user_segment_descriptor nt_desc, ot_desc; + struct iovec nt_iov[2], ot_iov[2]; + uint64_t cr0, ot_base; + uint32_t eip, ot_lim, access; + int error, ext, fault, minlimit, nt_type, ot_type, vcpu; + enum task_switch_reason reason; + uint16_t nt_sel, ot_sel; + + task_switch = &vmexit->u.task_switch; + nt_sel = task_switch->tsssel; + ext = vmexit->u.task_switch.ext; + reason = vmexit->u.task_switch.reason; + paging = &vmexit->u.task_switch.paging; + vcpu = *pvcpu; + + assert(paging->cpu_mode == CPU_MODE_PROTECTED); + + /* + * Calculate the instruction pointer to store in the old TSS. + */ + eip = vmexit->rip + vmexit->inst_length; + + /* + * Section 4.6, "Access Rights" in Intel SDM Vol 3. 
+ * The following page table accesses are implicitly supervisor mode: + * - accesses to GDT or LDT to load segment descriptors + * - accesses to the task state segment during task switch + */ + sup_paging = *paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + + /* Fetch the new TSS descriptor */ + error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc, + &fault); + CHKERR(error, fault); + + nt = usd_to_seg_desc(&nt_desc); + + /* Verify the type of the new TSS */ + nt_type = SEG_DESC_TYPE(nt.access); + if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS && + nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* TSS descriptor must have present bit set */ + if (!SEG_DESC_PRESENT(nt.access)) { + sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext); + goto done; + } + + /* + * TSS must have a minimum length of 104 bytes for a 32-bit TSS and + * 44 bytes for a 16-bit TSS. + */ + if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS) + minlimit = 104 - 1; + else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) + minlimit = 44 - 1; + else + minlimit = 0; + + assert(minlimit > 0); + if (nt.limit < minlimit) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* TSS must be busy if task switch is due to IRET */ + if (reason == TSR_IRET && !TSS_BUSY(nt_type)) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* + * TSS must be available (not busy) if task switch reason is + * CALL, JMP, exception or interrupt. + */ + if (reason != TSR_IRET && TSS_BUSY(nt_type)) { + sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext); + goto done; + } + + /* Fetch the new TSS */ + error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1, + PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault); + CHKERR(error, fault); + vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1); + + /* Get the old TSS selector from the guest's task register */ + ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR); + if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) { + /* + * This might happen if a task switch was attempted without + * ever loading the task register with LTR. In this case the + * TR would contain the values from power-on: + * (sel = 0, base = 0, limit = 0xffff). + */ + sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext); + goto done; + } + + /* Get the old TSS base and limit from the guest's task register */ + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim, + &access); + assert(error == 0); + assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access)); + ot_type = SEG_DESC_TYPE(access); + assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY); + + /* Fetch the old TSS descriptor */ + error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc, + &fault); + CHKERR(error, fault); + + /* Get the old TSS */ + error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1, + PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault); + CHKERR(error, fault); + vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1); + + /* + * Clear the busy bit in the old TSS descriptor if the task switch + * due to an IRET or JMP instruction. 
+ */ + if (reason == TSR_IRET || reason == TSR_JMP) { + ot_desc.sd_type &= ~0x2; + error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel, + &ot_desc, &fault); + CHKERR(error, fault); + } + + if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) { + fprintf(stderr, "Task switch to 16-bit TSS not supported\n"); + return (VMEXIT_ABORT); + } + + /* Save processor state in old TSS */ + tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov); + + /* + * If the task switch was triggered for any reason other than IRET + * then set the busy bit in the new TSS descriptor. + */ + if (reason != TSR_IRET) { + nt_desc.sd_type |= 0x2; + error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel, + &nt_desc, &fault); + CHKERR(error, fault); + } + + /* Update task register to point at the new TSS */ + SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel); + + /* Update the hidden descriptor state of the task register */ + nt = usd_to_seg_desc(&nt_desc); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt); + + /* Set CR0.TS */ + cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); + SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); + + /* + * We are now committed to the task switch. Any exceptions encountered + * after this point will be handled in the context of the new task and + * the saved instruction pointer will belong to the new task. + */ + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip); + assert(error == 0); + + /* Load processor state from new TSS */ + error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov, + &fault); + CHKERR(error, fault); + + /* + * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception + * caused an error code to be generated, this error code is copied + * to the stack of the new task. + */ + if (task_switch->errcode_valid) { + assert(task_switch->ext); + assert(task_switch->reason == TSR_IDT_GATE); + error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type, + task_switch->errcode, &fault); + CHKERR(error, fault); + } + + /* + * Treatment of virtual-NMI blocking if NMI is delivered through + * a task gate. + * + * Section "Architectural State Before A VM Exit", Intel SDM, Vol3: + * If the virtual NMIs VM-execution control is 1, VM entry injects + * an NMI, and delivery of the NMI causes a task switch that causes + * a VM exit, virtual-NMI blocking is in effect before the VM exit + * commences. + * + * Thus, virtual-NMI blocking is in effect at the time of the task + * switch VM exit. + */ + + /* + * Treatment of virtual-NMI unblocking on IRET from NMI handler task. + * + * Section "Changes to Instruction Behavior in VMX Non-Root Operation" + * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking. + * This unblocking of virtual-NMI occurs even if IRET causes a fault. + * + * Thus, virtual-NMI blocking is cleared at the time of the task switch + * VM exit. + */ + + /* + * If the task switch was triggered by an event delivered through + * the IDT then extinguish the pending event from the vcpu's + * exitintinfo. 
+ */ + if (task_switch->reason == TSR_IDT_GATE) { + error = vm_set_intinfo(ctx, vcpu, 0); + assert(error == 0); + } + + /* + * XXX should inject debug exception if 'T' bit is 1 + */ +done: + return (VMEXIT_CONTINUE); +} diff --git a/usr/src/cmd/bhyve/test/Makefile b/usr/src/cmd/bhyve/test/Makefile new file mode 100644 index 0000000000..7dbee0c5f3 --- /dev/null +++ b/usr/src/cmd/bhyve/test/Makefile @@ -0,0 +1,18 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +SUBDIRS = scripts tst + +include Makefile.subdirs diff --git a/usr/src/cmd/bhyve/test/Makefile.com b/usr/src/cmd/bhyve/test/Makefile.com new file mode 100644 index 0000000000..f5efacc510 --- /dev/null +++ b/usr/src/cmd/bhyve/test/Makefile.com @@ -0,0 +1,61 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +include $(SRC)/Makefile.master +include $(SRC)/cmd/Makefile.cmd +include $(SRC)/cmd/Makefile.cmd.64 + +# +# Force c99 for everything +# +CSTD= $(CSTD_GNU99) +C99MODE= -xc99=%all +C99LMODE= -Xc99=%all + +CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration \ + -_gcc=-Wno-parentheses +CFLAGS64 += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration \ + -_gcc=-Wno-parentheses +CPPFLAGS = -I$(SRC)/cmd/bhyve \ + -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + -I$(CONTRIB)/freebsd/dev/usb/controller \ + -I$(CONTRIB)/freebsd/dev/mii \ + $(CPPFLAGS.master) \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/common \ + -I$(SRC)/uts/i86pc \ + -I$(SRC)/lib/libdladm/common \ + -DWITHOUT_CAPSICUM +CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 + +SMOFF += all_func_returns + +CLEANFILES += $(EXETESTS) +CLOBBERFILES += $(ROOTTESTS) + +# +# Install related definitions +# +ROOTOPTPKG = $(ROOT)/opt/bhyvetest +ROOTBIN = $(ROOTOPTPKG)/bin +ROOTTST = $(ROOTOPTPKG)/tst +ROOTTSTDIR = $(ROOTTST)/$(TSTDIR) +ROOTTSTEXES = $(EXETESTS:%=$(ROOTTSTDIR)/%) +ROOTTSTSH = $(SHTESTS:%=$(ROOTTSTDIR)/%) +ROOTOUT = $(OUTFILES:%=$(ROOTTSTDIR)/%) +ROOTTESTS = $(ROOTTSTEXES) $(ROOTTSTSH) $(ROOTOUT) +FILEMODE = 0555 +LDLIBS = $(LDLIBS.cmd) +LINTEXE = $(EXETESTS:%.exe=%.exe.ln) diff --git a/usr/src/cmd/bhyve/test/Makefile.subdirs b/usr/src/cmd/bhyve/test/Makefile.subdirs new file mode 100644 index 0000000000..45f0aa67fa --- /dev/null +++ b/usr/src/cmd/bhyve/test/Makefile.subdirs @@ -0,0 +1,29 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. 
+#
+
+.KEEP_STATE:
+
+all	:=	TARGET += all
+clean	:=	TARGET += clean
+clobber	:=	TARGET += clobber
+install	:=	TARGET += install
+lint	:=	TARGET += lint
+
+all clean clobber install lint: $(SUBDIRS)
+
+$(SUBDIRS): FRC
+	@cd $@; pwd; $(MAKE) $(TARGET)
+
+FRC:
diff --git a/usr/src/cmd/bhyve/test/Makefile.targ b/usr/src/cmd/bhyve/test/Makefile.targ
new file mode 100644
index 0000000000..e3ec55cfdb
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/Makefile.targ
@@ -0,0 +1,55 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+$(ROOTOPTPKG):
+	$(INS.dir)
+
+$(ROOTBIN): $(ROOTOPTPKG)
+	$(INS.dir)
+
+$(ROOTBIN)/%: %.ksh $(ROOTBIN)
+	$(INS.rename)
+
+$(ROOTTST): $(ROOTOPTPKG)
+	$(INS.dir)
+
+$(ROOTTSTDIR): $(ROOTTST)
+	$(INS.dir)
+
+$(ROOTTSTDIR)/%.ksh: %.ksh $(ROOTTSTDIR)
+	$(INS.file)
+
+$(ROOTTSTDIR)/%.out: %.out $(ROOTTSTDIR)
+	$(INS.file)
+
+%.exe: %.o $(SUPOBJS)
+	$(LINK.c) -o $@ $< $(SUPOBJS) $(LDLIBS)
+	$(POST_PROCESS)
+
+$(ROOTTSTDIR)/%.exe: %.exe $(ROOTTSTDIR)
+	$(INS.file)
+
+all: install
+
+%.exe.ln: %.c $(SUPOBJS)
+	$(LINT.c) $< $(LDLIBS)
+
+lint: $(LINTEXE)
+
+clean:
+	-$(RM) *.o $(CLEANFILES)
+
+clobber: clean
+	-$(RM) $(CLOBBERFILES)
diff --git a/usr/src/cmd/bhyve/test/scripts/Makefile b/usr/src/cmd/bhyve/test/scripts/Makefile
new file mode 100644
index 0000000000..d28a5edb8f
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/scripts/Makefile
@@ -0,0 +1,28 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+include ../Makefile.com
+
+SRCS = bhyvetest
+SCRIPTS = $(SRCS:%=$(ROOTBIN)/%)
+
+SCRIPTS := FILEMODE = 0555
+CLOBBERFILES = $(SCRIPTS)
+
+install: $(SCRIPTS)
+
+lint:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh b/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh
new file mode 100644
index 0000000000..95b7743417
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh
@@ -0,0 +1,231 @@
+#!/bin/ksh
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2018 Joyent, Inc.
+#
+
+#
+# bhyve test suite driver
+#
+unalias -a
+
+bt_arg0=$(basename $0)
+bt_root="$(cd $(dirname $0)/..; pwd -P)"
+bt_ksh="/usr/bin/ksh"
+bt_outdir=
+bt_keep=
+bt_all=
+bt_tnum=0
+bt_tfail=0
+bt_tsuc=0
+
+function usage
+{
+	typeset msg="$*"
+	[[ -z "$msg" ]] || echo "$msg" >&2
+	cat <<USAGE >&2
+Usage: $bt_arg0 [ -o dir ] [ -k ] [ -a | test ... ]
+
+	-o dir		Sets 'dir' as the output directory
+	-a		Runs all tests, ignores tests passed in
+	-k		Keep output from all tests, not just failures
+	-m		mdb binary to test
+USAGE
+	exit 2
+}
+
+function fatal
+{
+	typeset msg="$*"
+	[[ -z "$msg" ]] && msg="failed"
+	echo "$bt_arg0: $msg" >&2
+	exit 1
+}
+
+function setup_outdir
+{
+	bt_outdir="$bt_outdir/$bt_arg0.$$"
+	mkdir -p $bt_outdir || fatal "failed to make output dir $bt_outdir"
+}
+
+function run_single
+{
+	typeset name=$1
+	typeset expect base ext exe command odir res reason
+	typeset iserr
+
+	[[ -z "$name" ]] && fatal "missing test to run"
+	base=${name##*/}
+	ext=${base##*.}
+	expect=${base%%.*}
+	odir="$bt_outdir/current"
+	[[ -z "$ext" ]] && fatal "found test without ext: $name"
+	[[ -z "$expect" ]] && fatal "found test without prefix: $name"
+
+	if [[ "$expect" == "err" || "$expect" == "ecreate" ]]; then
+		iserr="yup"
+	else
+		iserr=""
+	fi
+
+	case "$ext" in
+	"ksh")
+		command="$bt_ksh ./$base"
+		;;
+	"exe")
+		command="./$base"
+		;;
+	"out")
+		#
+		# This is the file format for checking output against.
+		#
+		return 0
+		;;
+	*)
+		echo "skipping test $name (unknown extension)"
+		return 0
+		;;
+	esac
+
+	echo "Executing test $name ... \c"
+	mkdir -p "$odir" >/dev/null || fatal "can't make output directory"
+	cd $(dirname $name) || fatal "failed to enter test directory"
+	$command > "$odir/stdout" 2>"$odir/stderr"
+	res=$?
+	cd - > /dev/null || fatal "failed to leave test directory"
+
+	if [[ -f "$name.out" ]] && \
+	    ! diff "$name.out" "$odir/stdout" >/dev/null; then
+		cp $name.out $odir/$base.out
+		reason="stdout mismatch"
+	elif [[ -n "$iserr" && $res -eq 0 ]]; then
+		reason="test exited $res, not non-zero"
+	elif [[ -z "$iserr" && $res -ne 0 ]]; then
+		reason="test exited $res, not zero"
+	fi
+
+	if [[ -n "$reason" ]]; then
+		echo "$reason"
+		((bt_tfail++))
+		mv "$odir" "$bt_outdir/failure.$bt_tfail" || fatal \
+		    "failed to move test output directory"
+		cp "$name" "$bt_outdir/failure.$bt_tfail/$(basename $name)" || \
+		    fatal "failed to copy test into output directory"
+	else
+		echo "passed"
+		((bt_tsuc++))
+		mv "$odir" "$bt_outdir/success.$bt_tsuc" || fatal \
+		    "failed to move test directory"
+	fi
+
+	((bt_tnum++))
+}
+
+function run_all
+{
+	typeset tests t dir
+
+	tests=$(ls -1 $bt_root/tst/*/*.@(ksh|exe))
+	for t in $tests; do
+		run_single $t
+	done
+}
+
+function welcome
+{
+	cat <
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "testlib.h"
+#include "mevent.h"
+
+static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
+
+static int
+get_count(void)
+{
+	int global = -1, change = -1, del_pending = -1;
+	int total;
+
+	test_mevent_count_lists(&global, &change, &del_pending);
+	ASSERT_INT_NEQ(("count not set"), global, -1);
+	ASSERT_INT_NEQ(("count not set"), change, -1);
+	ASSERT_INT_NEQ(("count not set"), del_pending, -1);
+	ASSERT_INT_EQ(("pending delete not processed"), del_pending, 0);
+
+	total = global + change + del_pending;
+
+	VERBOSE(("count = %d (%d + %d + %d)", total, global, change,
+	    del_pending));
+
+	return (total);
+}
+
+static void
+not_called_cb(int fd, enum ev_type ev, void *arg)
+{
+	FAIL(("this callback should never be called"));
+}
+
+static void
+flush_cb(int fd, enum ev_type ev, void *arg)
+{
+	char buf[32];
+
+	/* Drain the pipe */
+	while (read(fd, buf, sizeof (buf)) > 0)
+		;
+
+	pthread_mutex_lock(&mtx);
+	pthread_cond_signal(&cv);
+	pthread_mutex_unlock(&mtx);
+}
+
+void
+flush_and_wait(int fd)
+{ + uint8_t msg = 42; + + /* + * Lock taken ahead of waking flush_cb so this thread doesn't race + * with the event thread. + */ + pthread_mutex_lock(&mtx); + if (write(fd, &msg, sizeof (msg)) != sizeof (msg)) { + FAIL(("bad write")); + } + + /* Wait for it to be read */ + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); +} + +int +main(int argc, const char *argv[]) +{ + int unused_pipe[2]; + int flush_pipe[2]; + struct mevent *unused_evp, *flush_evp; + int count1, count2; + + start_test(argv[0], 5); + start_event_thread(); + + /* + * Create first pipe and related event + */ + if (pipe(unused_pipe) != 0) { + FAIL_ERRNO("pipe"); + } + VERBOSE(("unused_pipe[] = { %d, %d }", unused_pipe[0], unused_pipe[1])); + if (fcntl(unused_pipe[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + unused_evp = mevent_add(unused_pipe[0], EVF_READ, not_called_cb, NULL); + ASSERT_PTR_NEQ(("mevent_add"), unused_evp, NULL); + + /* + * Create flush pipe and related event + */ + if (pipe(flush_pipe) != 0) { + FAIL_ERRNO("pipe"); + } + VERBOSE(("flush_pipe[] = { %d, %d }", flush_pipe[0], + flush_pipe[1])); + if (fcntl(flush_pipe[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + flush_evp = mevent_add(flush_pipe[0], EVF_READ, flush_cb, NULL); + ASSERT_PTR_NEQ(("mevent_add"), flush_evp, NULL); + + /* Get count before delete. */ + flush_and_wait(flush_pipe[1]); + count1 = get_count(); + + /* + * Delete the first event and flush a read after the delete is + * complete. + */ + if (mevent_delete(unused_evp) != 0) { + FAIL_ERRNO("mevent_delete"); + } + + /* + * Verify count decreased. + */ + flush_and_wait(flush_pipe[1]); + count2 = get_count(); + if (count1 - 1 != count2) { + FAIL(("mevent_delete() did not decrease count by 1: " + "was %d, now %d", count1, count2)); + } + + PASS(); +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/mevent.c b/usr/src/cmd/bhyve/test/tst/mevent/mevent.c new file mode 100644 index 0000000000..17b6546847 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/mevent.c @@ -0,0 +1,57 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include "../../../mevent.c" +#include "testlib.h" + +/* + * Returns by reference the number of events on the global and change lists. + * + * Used by tests that wish to ensure that the event count changes as suggested + * by mevent_add() and mevent_delete(). Note that a delete does not immediately + * delete an event. Events that are pending delete are included in the change + * list until the next pass through the change list to process pending changes. 
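The mevent.c wrapper described above gains access to the event module's file-static lists by textually including mevent.c itself, a common white-box testing trick. Here is a tiny self-contained sketch of the same pattern; the file names counter.c and counter_test.c and their contents are hypothetical, shown as two files in one listing.

```c
/* --- counter.c: module under test; all state is file-static --- */
static int counter;

static void
bump(void)
{
	counter++;
}

/* --- counter_test.c: the test; save the above as counter.c --- */
#include <assert.h>
#include "counter.c"		/* white-box: pull in static internals */

int
main(void)
{
	bump();
	bump();
	assert(counter == 2);	/* the static counter is visible here */
	return (0);
}
```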
+ */
+void
+test_mevent_count_lists(int *ret_global, int *ret_change, int *ret_del_pending)
+{
+	struct mevent *mevp;
+	int global = 0;
+	int change = 0;
+	int del_pending = 0;
+
+	mevent_qlock();
+
+	LIST_FOREACH(mevp, &global_head, me_list) {
+		global++;
+		VERBOSE(("on global: type %d fd %d state %d", mevp->me_type,
+		    mevp->me_fd, mevp->me_state));
+	}
+
+	LIST_FOREACH(mevp, &change_head, me_list) {
+		change++;
+		if (mevp->me_state == MEV_DEL_PENDING) {
+			del_pending++;
+		}
+		VERBOSE(("on change: type %d fd %d state %d", mevp->me_type,
+		    mevp->me_fd, mevp->me_state));
+	}
+
+	mevent_qunlock();
+
+	*ret_global = global;
+	*ret_change = change;
+	*ret_del_pending = del_pending;
+}
diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c b/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c
new file mode 100644
index 0000000000..d23b1af96c
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c
@@ -0,0 +1,163 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * Test: read.disable
+ * Assertion: A read is not requeued if mevent_disable() is called while it is
+ *	      being handled.
+ *
+ * Strategy:	1. Create a pipe
+ *		2. Call mevent_add() to be notified of writes to the pipe. The
+ *		   callback will signal a cv.
+ *		3. Write to the pipe then wait for a wakeup.
+ *		4. From the read event callback, disable the event then awaken
+ *		   the main thread.
+ *		5. In the main thread, add a timer event that will awaken the
+ *		   main thread after a short delay.
+ *		6. Write to the pipe and wait to be awoken. The wakeup should
+ *		   come from the timer event, not the read event.
+ */
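All of these mevent tests coordinate the main thread and the event thread the same way: the callback records what happened and signals a condition variable, and the main thread holds the mutex across both the trigger and the wait so no wakeup can be lost. A compact sketch of that handshake, using a plain pthread in place of the mevent dispatch thread:

```c
/*
 * Sketch of the cv handshake used by the tests: the lock is taken
 * before the trigger, so the "callback" cannot signal until the main
 * thread is parked in pthread_cond_wait().
 */
#include <assert.h>
#include <pthread.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int fired;

static void *
callback(void *arg)
{
	pthread_mutex_lock(&mtx);
	fired = 1;
	pthread_cond_signal(&cv);
	pthread_mutex_unlock(&mtx);
	return (NULL);
}

int
main(void)
{
	pthread_t tid;

	pthread_mutex_lock(&mtx);	/* taken before the trigger */
	pthread_create(&tid, NULL, callback, NULL);
	while (fired == 0)		/* guards spurious wakeups */
		pthread_cond_wait(&cv, &mtx);
	pthread_mutex_unlock(&mtx);
	pthread_join(tid, NULL);
	assert(fired == 1);
	return (0);
}
```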
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "testlib.h"
+#include "mevent.h"
+
+typedef enum {
+	CB_NONE,
+	CB_READ,
+	CB_TIMER,
+} lastwake_t;
+
+static lastwake_t lastwake = CB_NONE;
+
+static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
+
+static struct mevent *read_event;
+
+static void
+munch(int fd, enum ev_type ev, void *arg)
+{
+	ssize_t nbytes;
+	char buf[32] = { 0 };
+	int err;
+
+	if ((nbytes = read(fd, buf, sizeof (buf))) < 0) {
+		FAIL_ERRNO("bad read");
+	}
+	VERBOSE(("read %ld bytes '%s'", nbytes, buf));
+
+	err = mevent_disable(read_event);
+	ASSERT_INT_EQ(("mevent_disable: %s", strerror(err)), err, 0);
+
+	pthread_mutex_lock(&mtx);
+
+	ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_NONE);
+	lastwake = CB_READ;
+
+	pthread_cond_signal(&cv);
+	VERBOSE(("wakeup"));
+
+	pthread_mutex_unlock(&mtx);
+}
+
+static void
+tick(int ms, enum ev_type ev, void *arg)
+{
+	pthread_mutex_lock(&mtx);
+
+	ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_READ);
+	lastwake = CB_TIMER;
+
+	pthread_cond_signal(&cv);
+	VERBOSE(("wakeup"));
+
+	pthread_mutex_unlock(&mtx);
+}
+
+int
+main(int argc, const char *argv[])
+{
+	int pipefds[2];
+	struct mevent *timer;
+	ssize_t written;
+	char *msgs[] = { "first", "second" };
+	char *msg;
+
+	start_test(argv[0], 5);
+	start_event_thread();
+
+	if (pipe(pipefds) != 0) {
+		FAIL_ERRNO("pipe");
+	}
+	if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) {
+		FAIL_ERRNO("set pipe nonblocking");
+	}
+
+	/*
+	 * First write
+	 */
+	msg = msgs[0];
+	read_event = mevent_add(pipefds[0], EVF_READ, munch, msg);
+	ASSERT_PTR_NEQ(("mevent_add pipefd"), read_event, NULL);
+
+	pthread_mutex_lock(&mtx);
+	written = write(pipefds[1], msg, strlen(msg));
+	if (written < 0) {
+		FAIL_ERRNO("bad write");
+	}
+	ASSERT_INT64_EQ(("write '%s' failed", msg), written, strlen(msg));
+
+	/*
+	 * Wait for it to be read
+	 */
+	pthread_cond_wait(&cv, &mtx);
+	ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_READ);
+	pthread_mutex_unlock(&mtx);
+
+	/*
+	 * Add timer, second write.
+	 */
+	msg = msgs[1];
+	timer = mevent_add(50, EVF_TIMER, tick, msg);
+	ASSERT_PTR_NEQ(("mevent_add timer"), timer, NULL);
+
+	pthread_mutex_lock(&mtx);
+	written = write(pipefds[1], msg, strlen(msg));
+	if (written < 0) {
+		FAIL_ERRNO("bad write");
+	}
+	ASSERT_INT64_EQ(("write '%s' failed", msg), written, strlen(msg));
+
+	/*
+	 * Wait for timer to expire
+	 */
+	pthread_cond_wait(&cv, &mtx);
+	ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_TIMER);
+	pthread_mutex_unlock(&mtx);
+
+	PASS();
+}
diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c b/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c
new file mode 100644
index 0000000000..c877f014f6
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c
@@ -0,0 +1,152 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * Test: read.pause
+ * Assertion: mevent_disable() can be used to pause reads.
+ *
+ * Strategy:	1. Create a pipe
+ *		2. Call mevent_add() to be notified of writes to the pipe.
The + * callback will signal a cv. + * 3. In a loop, write to the pipe then wait on the cv. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "testlib.h" +#include "mevent.h" + +static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; + +static char cookie[] = "Chocolate chip with fudge stripes"; + +/* + * After this many bytes are sent, writes will get batched up, progress will be + * made on the write side via an interval timer + */ +const int pauseat = 8; + +static void +munch(int fd, enum ev_type ev, void *arg) +{ + static int i = 0; + char buf[sizeof (cookie)] = { 0 }; + ssize_t nbytes; + ssize_t expected; + + ASSERT_INT_EQ(("bad event"), ev, EVF_READ); + ASSERT_PTR_EQ(("bad cookie"), arg, cookie); + + /* + * For the first while, expect data to come a byte at a time. After the + * pause, we should get a burst with the rest of the data. + */ + if (i > pauseat) { + expected = strlen(cookie) - pauseat - 1; + } else { + expected = 1; + } + + if ((nbytes = read(fd, buf, sizeof (buf))) < 0) { + FAIL_ERRNO("bad read"); + } + VERBOSE(("read %ld bytes '%s'", nbytes, buf)); + + ASSERT_INT64_EQ(("wanted a byte of cookie"), nbytes, expected); + + if (expected == 1) { + ASSERT_CHAR_EQ(("bad byte %d of cookie", i), buf[0], cookie[i]); + } else { + ASSERT_STR_EQ(("bad last half of cookie"), buf, &cookie[i]); + } + + pthread_mutex_lock(&mtx); + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + pthread_mutex_unlock(&mtx); + + i++; +} + +static void +tick(int ms, enum ev_type ev, void *arg) +{ + pthread_mutex_lock(&mtx); + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + pthread_mutex_unlock(&mtx); +} + +int +main(int argc, const char *argv[]) +{ + int pipefds[2]; + struct mevent *evp, *timer; + ssize_t written; + + start_test(argv[0], 5); + start_event_thread(); + + if (pipe(pipefds) != 0) { + FAIL_ERRNO("pipe"); + } + if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + + evp = mevent_add(pipefds[0], EVF_READ, munch, cookie); + ASSERT_PTR_NEQ(("mevent_add pipefd"), evp, NULL); + + for (int i = 0; cookie[i] != 0; i++) { + pthread_mutex_lock(&mtx); + written = write(pipefds[1], cookie + i, 1); + if (written < 0) { + FAIL_ERRNO("bad write"); + } + ASSERT_INT64_EQ(("write byte %d of cookie", i), written, 1); + + /* Wait for it to be read */ + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); + + if (i == pauseat) { + timer = mevent_add(10, EVF_TIMER, tick, + &cookie[pauseat]); + ASSERT_PTR_NEQ(("mevent_add timer"), timer, NULL); + VERBOSE(("disable munch")); + mevent_disable(evp); + } + } + + pthread_mutex_lock(&mtx); + + mevent_enable(evp); + + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); + + PASS(); +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c b/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c new file mode 100644 index 0000000000..ddc3e27235 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c @@ -0,0 +1,108 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. 
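The batching the read.pause comment describes falls out of ordinary pipe semantics: while the read event is disabled nobody drains the pipe, so writes pile up and the next read returns them in one burst. A minimal sketch of that effect with plain pipe I/O, no mevent involved:

```c
/*
 * Sketch: with no reader active, five one-byte writes accumulate in
 * the pipe, and a single read() then drains the whole backlog.
 */
#include <assert.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];
	char buf[32];

	assert(pipe(fds) == 0);
	for (int i = 0; i < 5; i++)	/* "paused": nobody is reading */
		assert(write(fds[1], "x", 1) == 1);
	/* One read now sees the entire burst. */
	assert(read(fds[0], buf, sizeof (buf)) == 5);
	return (0);
}
```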
+ */ + +/* + * Test: read.requeue + * Assertion: A sequence of writes turns into a sequence of events. + * + * Strategy: 1. Create a pipe + * 2. Call mevent_add() to be notified of writes to the pipe. The + * callback will signal a cv. + * 3. In a loop, write to the pipe then wait on the cv. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "testlib.h" +#include "mevent.h" + +static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; + +static char *cookie = "Chocolate chip with fudge stripes"; + +static void +munch(int fd, enum ev_type ev, void *arg) +{ + static int i = 0; + char buf[8] = { 0 }; + ssize_t nbytes; + + ASSERT_INT_EQ(("bad event"), ev, EVF_READ); + ASSERT_PTR_EQ(("bad cookie"), arg, cookie); + + if ((nbytes = read(fd, buf, sizeof (buf))) < 0) { + ASSERT_INT64_EQ(("bad read: %s", strerror(errno)), nbytes, 1); + } + VERBOSE(("read %ld bytes '%s'", nbytes, buf)); + + ASSERT_INT64_EQ(("wanted a byte of cookie"), nbytes, 1); + + ASSERT_CHAR_EQ(("bad byte %d of cookie", i), buf[0], cookie[i]); + + pthread_mutex_lock(&mtx); + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + pthread_mutex_unlock(&mtx); + + i++; +} + +int +main(int argc, const char *argv[]) +{ + int pipefds[2]; + struct mevent *evp; + + start_test(argv[0], 5); + start_event_thread(); + + if (pipe(pipefds) != 0) { + FAIL_ERRNO("pipe"); + } + if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + + evp = mevent_add(pipefds[0], EVF_READ, munch, cookie); + ASSERT_PTR_NEQ(("mevent_add"), evp, NULL); + + for (int i = 0; cookie[i] != '\0'; i++) { + ssize_t written; + + pthread_mutex_lock(&mtx); + written = write(pipefds[1], cookie + i, 1); + if (written < 0) { + FAIL_ERRNO("bad write"); + } + ASSERT_INT64_EQ(("write byte %d of cookie", i), written, 1); + + /* Wait for it to be read */ + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); + } + + PASS(); +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/testlib.c b/usr/src/cmd/bhyve/test/tst/mevent/testlib.c new file mode 100644 index 0000000000..67261b9a31 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/testlib.c @@ -0,0 +1,70 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. 
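read.requeue leans on the readiness notification being level-triggered: a descriptor with unread data keeps reporting readable, so each one-byte write yields its own callback for as long as the event is re-armed. The same property can be shown directly with poll(2), without the mevent wrapper:

```c
/*
 * Sketch: two buffered bytes keep the descriptor readable across two
 * polls; once drained, poll() stops reporting POLLIN.
 */
#include <assert.h>
#include <poll.h>
#include <unistd.h>

int
main(void)
{
	int fds[2];
	char c;
	struct pollfd pfd;

	assert(pipe(fds) == 0);
	assert(write(fds[1], "ab", 2) == 2);

	pfd.fd = fds[0];
	pfd.events = POLLIN;
	for (int i = 0; i < 2; i++) {
		assert(poll(&pfd, 1, 0) == 1);	/* still readable */
		assert(read(fds[0], &c, 1) == 1);
	}
	assert(poll(&pfd, 1, 0) == 0);		/* drained */
	return (0);
}
```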
+ */ + +#include +#include +#include +#include + +#include "testlib.h" +#include "mevent.h" + +const char *testlib_prog; +boolean_t testlib_verbose; + +static void +timed_out(int signo) +{ + ASSERT_INT_EQ(("timeout signal"), signo, SIGALRM); + + FAIL(("Timed out")); +} + +void +start_test(const char *argv0, uint32_t timeout) +{ + char *val; + + testlib_prog = strrchr(argv0, '/'); + if (testlib_prog == NULL) { + testlib_prog = argv0; + } else { + testlib_prog++; + } + + testlib_verbose = ((val = getenv("TEST_VERBOSE")) != NULL) && + val[0] != '\0'; + + signal(SIGALRM, timed_out); + alarm(timeout); +} + +/* ARGSUSED */ +static void * +event_thread(void *arg) +{ + mevent_dispatch(); + return (NULL); +} + +void +start_event_thread(void) +{ + pthread_t tid; + + if (pthread_create(&tid, NULL, event_thread, NULL) != 0) { + FAIL_ERRNO("pthread_create"); + } +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/testlib.h b/usr/src/cmd/bhyve/test/tst/mevent/testlib.h new file mode 100644 index 0000000000..7e5ca2e9c9 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/testlib.h @@ -0,0 +1,93 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _TESTLIB_H_ +#define _TESTLIB_H_ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "mevent.h" + +#define EXIT_PASS 0 +#define EXIT_FAIL 1 + +#define VERBOSE(msg) \ + if (testlib_verbose) { \ + (void) printf("VERBOSE %s: %s:%d %s: ", testlib_prog, \ + __FILE__, __LINE__, __func__); \ + (void) printf msg; \ + (void) printf("\n"); \ + } + +#define FAIL_PROLOGUE() \ + (void) printf("FAIL %s: %s:%d: ", testlib_prog, __FILE__, __LINE__) + +#define FAIL(msg) \ + { \ + FAIL_PROLOGUE(); \ + (void) printf msg; \ + (void) printf("\n"); \ + exit(EXIT_FAIL); \ + } + +#define FAIL_ERRNO(msg) FAIL((msg ": %s", strerror(errno))) + +#define PASS() \ + { \ + (void) printf("PASS %s\n", testlib_prog); \ + exit(EXIT_PASS); \ + } + +#define ASSERT_CMP(msg, got, cmp, exp, nfmt) \ + if (!(got cmp exp)) { \ + FAIL_PROLOGUE(); \ + (void) printf msg; \ + (void) printf(": %s=" nfmt " %s %s=" nfmt "\n", \ + #got, got, #cmp, #exp, exp); \ + exit(EXIT_FAIL); \ + } + +#define ASSERT_CHAR_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%c") +#define ASSERT_INT_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%d") +#define ASSERT_INT_NEQ(msg, got, exp) ASSERT_CMP(msg, got, !=, exp, "%d") +#define ASSERT_INT64_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%ld") +#define ASSERT_PTR_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%p") +#define ASSERT_PTR_NEQ(msg, got, exp) ASSERT_CMP(msg, got, !=, exp, "%p") + +#define ASSERT_STR_EQ(msg, got, exp) \ + if (strcmp(got, exp) != 0) { \ + FAIL_PROLOGUE(); \ + (void) printf msg; \ + (void) printf(": %s='%s' != %s='%s'\n", \ + #got, got, #exp, exp); \ + exit(EXIT_FAIL); \ + } + +extern const char *testlib_prog; +extern boolean_t testlib_verbose; + +extern void start_test(const char *, uint32_t); +extern void start_event_thread(void); +extern void test_mevent_count_lists(int *, int *, int *); + +#endif /* _TESTLIB_H_ */ diff --git a/usr/src/cmd/bhyve/uart_emul.c 
b/usr/src/cmd/bhyve/uart_emul.c index a8b5d40356..c0fff61d00 100644 --- a/usr/src/cmd/bhyve/uart_emul.c +++ b/usr/src/cmd/bhyve/uart_emul.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * Copyright (c) 2013 Neel Natu * All rights reserved. @@ -24,7 +26,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z neel $ + * $FreeBSD$ + * */ /* * This file and its contents are supplied under the terms of the @@ -37,46 +40,42 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z neel $"); +__FBSDID("$FreeBSD$"); #include #include - -#ifndef __FreeBSD__ -#include -#include +#ifndef WITHOUT_CAPSICUM +#include +#include #endif + #include #include #include +#include +#include +#include #include #include #include #include #include +#include #ifndef __FreeBSD__ -#include -#include -#include +#include #endif -#ifndef __FreeBSD__ -#include - -#include "bhyverun.h" -#endif -#ifdef __FreeBSD__ #include "mevent.h" -#endif #include "uart_emul.h" #define COM1_BASE 0x3F8 #define COM1_IRQ 4 -#define COM2_BASE 0x2F8 -#define COM2_IRQ 3 +#define COM2_BASE 0x2F8 +#define COM2_IRQ 3 #define DEFAULT_RCLK 1843200 #define DEFAULT_BAUD 9600 @@ -89,15 +88,13 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z #define MSR_DELTA_MASK 0x0f #ifndef REG_SCR -#define REG_SCR com_scr +#define REG_SCR com_scr #endif #define FIFOSZ 16 static bool uart_stdio; /* stdio in use for i/o */ -#ifndef __FreeBSD__ -static bool uart_bcons; /* bhyveconsole in use for i/o */ -#endif +static struct termios tio_stdio_orig; static struct { int baseaddr; @@ -118,9 +115,15 @@ struct fifo { int size; /* size of the fifo */ }; +struct ttyfd { + bool opened; + int rfd; /* fd for reading */ + int wfd; /* fd for writing, may be == rfd */ +}; + struct uart_softc { pthread_mutex_t mtx; /* protects all softc elements */ - uint8_t data; /* Data register (R/W) */ + uint8_t data; /* Data register (R/W) */ uint8_t ier; /* Interrupt enable register (R/W) */ uint8_t lcr; /* Line control register (R/W) */ uint8_t mcr; /* Modem control register (R/W) */ @@ -133,16 +136,16 @@ struct uart_softc { uint8_t dlh; /* Baudrate divisor latch MSB */ struct fifo rxfifo; + struct mevent *mev; - bool opened; - bool stdio; + struct ttyfd tty; #ifndef __FreeBSD__ - bool bcons; + bool sock; struct { - pid_t clipid; int clifd; /* console client unix domain socket */ int servfd; /* console server unix domain socket */ - } usc_bcons; + struct mevent *servmev; /* mevent for server socket */ + } usc_sock; #endif bool thre_int_pending; /* THRE interrupt pending */ @@ -152,140 +155,222 @@ struct uart_softc { uart_intr_func_t intr_deassert; }; -#ifdef __FreeBSD__ static void uart_drain(int fd, enum ev_type ev, void *arg); -#else -static void uart_tty_drain(struct uart_softc *sc); -static int uart_bcons_drain(struct uart_softc *sc); -#endif - -static struct termios tio_orig, tio_new; /* I/O Terminals */ static void ttyclose(void) { - tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); + tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig); } static void -ttyopen(void) +ttyopen(struct ttyfd *tf) { - - tcgetattr(STDIN_FILENO, &tio_orig); - - tio_new = tio_orig; - cfmakeraw(&tio_new); - tcsetattr(STDIN_FILENO, TCSANOW, &tio_new); - - 
atexit(ttyclose); -} - -static bool -tty_char_available(void) -{ - fd_set rfds; - struct timeval tv; - - FD_ZERO(&rfds); - FD_SET(STDIN_FILENO, &rfds); - tv.tv_sec = 0; - tv.tv_usec = 0; - if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0 ) { - return (true); - } else { - return (false); + struct termios orig, new; + + tcgetattr(tf->rfd, &orig); + new = orig; + cfmakeraw(&new); + new.c_cflag |= CLOCAL; + tcsetattr(tf->rfd, TCSANOW, &new); + if (uart_stdio) { + tio_stdio_orig = orig; + atexit(ttyclose); } } static int -ttyread(void) +ttyread(struct ttyfd *tf) { - char rb; + unsigned char rb; - if (tty_char_available()) { - read(STDIN_FILENO, &rb, 1); - return (rb & 0xff); - } else { + if (read(tf->rfd, &rb, 1) == 1) + return (rb); + else return (-1); - } } static void -ttywrite(unsigned char wb) +ttywrite(struct ttyfd *tf, unsigned char wb) { - (void)write(STDIN_FILENO, &wb, 1); + (void)write(tf->wfd, &wb, 1); } #ifndef __FreeBSD__ static void -bconswrite(struct uart_softc *sc, unsigned char wb) +sockwrite(struct uart_softc *sc, unsigned char wb) { - (void) write(sc->usc_bcons.clifd, &wb, 1); + (void) write(sc->usc_sock.clifd, &wb, 1); } #endif static void -fifo_reset(struct fifo *fifo, int size) +rxfifo_reset(struct uart_softc *sc, int size) { + char flushbuf[32]; + struct fifo *fifo; + ssize_t nread; + int error; + + fifo = &sc->rxfifo; bzero(fifo, sizeof(struct fifo)); fifo->size = size; + + if (sc->tty.opened) { + /* + * Flush any unread input from the tty buffer. + */ + while (1) { + nread = read(sc->tty.rfd, flushbuf, sizeof(flushbuf)); + if (nread != sizeof(flushbuf)) + break; + } + + /* + * Enable mevent to trigger when new characters are available + * on the tty fd. + */ + error = mevent_enable(sc->mev); + assert(error == 0); + } +#ifndef __FreeBSD__ + if (sc->sock && sc->usc_sock.clifd != -1) { + /* Flush any unread input from the socket buffer. */ + do { + nread = read(sc->usc_sock.clifd, flushbuf, + sizeof (flushbuf)); + } while (nread == sizeof (flushbuf)); + + /* Enable mevent to trigger when new data available on sock */ + error = mevent_enable(sc->mev); + assert(error == 0); + } +#endif /* __FreeBSD__ */ +} + +static int +rxfifo_available(struct uart_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->rxfifo; + return (fifo->num < fifo->size); } static int -fifo_putchar(struct fifo *fifo, uint8_t ch) +rxfifo_putchar(struct uart_softc *sc, uint8_t ch) { + struct fifo *fifo; + int error; + + fifo = &sc->rxfifo; if (fifo->num < fifo->size) { fifo->buf[fifo->windex] = ch; fifo->windex = (fifo->windex + 1) % fifo->size; fifo->num++; + if (!rxfifo_available(sc)) { + if (sc->tty.opened) { + /* + * Disable mevent callback if the FIFO is full. + */ + error = mevent_disable(sc->mev); + assert(error == 0); + } +#ifndef __FreeBSD__ + if (sc->sock && sc->usc_sock.clifd != -1) { + /* + * Disable mevent callback if the FIFO is full. 
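The rxfifo_putchar()/rxfifo_getchar() changes in this hunk implement simple backpressure: stop watching the input source when the receive FIFO fills, and resume once a slot frees. A condensed sketch of the scheme, with disable()/enable() standing in for mevent_disable()/mevent_enable() and a four-slot ring in place of the 16550 FIFO:

```c
/* Sketch of FIFO-full backpressure; not the emulator's own code. */
#include <assert.h>
#include <stdbool.h>

#define	RXSZ	4

struct rx {
	char buf[RXSZ];
	int ri, wi, num;
	bool polling;
};

static void disable(struct rx *rx) { rx->polling = false; }
static void enable(struct rx *rx) { rx->polling = true; }

static int
rx_put(struct rx *rx, char c)
{
	if (rx->num == RXSZ)
		return (-1);
	rx->buf[rx->wi] = c;
	rx->wi = (rx->wi + 1) % RXSZ;
	if (++rx->num == RXSZ)
		disable(rx);	/* full: stop draining the source */
	return (0);
}

static int
rx_get(struct rx *rx)
{
	int c;

	if (rx->num == 0)
		return (-1);
	c = rx->buf[rx->ri];
	rx->ri = (rx->ri + 1) % RXSZ;
	if (rx->num-- == RXSZ)
		enable(rx);	/* was full: resume polling */
	return (c);
}

int
main(void)
{
	struct rx rx = { .polling = true };

	for (int i = 0; i < RXSZ; i++)
		assert(rx_put(&rx, 'a' + i) == 0);
	assert(!rx.polling && rx_put(&rx, 'z') == -1);
	assert(rx_get(&rx) == 'a' && rx.polling);
	return (0);
}
```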
+ */ + error = mevent_disable(sc->mev); + assert(error == 0); + } +#endif /* __FreeBSD__ */ + } return (0); } else return (-1); } static int -fifo_getchar(struct fifo *fifo) +rxfifo_getchar(struct uart_softc *sc) { - int c; + struct fifo *fifo; + int c, error, wasfull; + wasfull = 0; + fifo = &sc->rxfifo; if (fifo->num > 0) { + if (!rxfifo_available(sc)) + wasfull = 1; c = fifo->buf[fifo->rindex]; fifo->rindex = (fifo->rindex + 1) % fifo->size; fifo->num--; + if (wasfull) { + if (sc->tty.opened) { + error = mevent_enable(sc->mev); + assert(error == 0); + } +#ifndef __FreeBSD__ + if (sc->sock && sc->usc_sock.clifd != -1) { + error = mevent_enable(sc->mev); + assert(error == 0); + } +#endif /* __FreeBSD__ */ + } return (c); } else return (-1); } static int -fifo_numchars(struct fifo *fifo) +rxfifo_numchars(struct uart_softc *sc) { + struct fifo *fifo = &sc->rxfifo; return (fifo->num); } -static int -fifo_available(struct fifo *fifo) +static void +uart_opentty(struct uart_softc *sc) { - return (fifo->num < fifo->size); + ttyopen(&sc->tty); + sc->mev = mevent_add(sc->tty.rfd, EVF_READ, uart_drain, sc); + assert(sc->mev != NULL); } -static void -uart_opentty(struct uart_softc *sc) +static uint8_t +modem_status(uint8_t mcr) { - struct mevent *mev; + uint8_t msr; - assert(!sc->opened && sc->stdio); + if (mcr & MCR_LOOPBACK) { + /* + * In the loopback mode certain bits from the MCR are + * reflected back into MSR. + */ + msr = 0; + if (mcr & MCR_RTS) + msr |= MSR_CTS; + if (mcr & MCR_DTR) + msr |= MSR_DSR; + if (mcr & MCR_OUT1) + msr |= MSR_RI; + if (mcr & MCR_OUT2) + msr |= MSR_DCD; + } else { + /* + * Always assert DCD and DSR so tty open doesn't block + * even if CLOCAL is turned off. + */ + msr = MSR_DCD | MSR_DSR; + } + assert((msr & MSR_DELTA_MASK) == 0); - ttyopen(); -#ifdef __FreeBSD__ - mev = mevent_add(STDIN_FILENO, EVF_READ, uart_drain, sc); -#endif - assert(mev); + return (msr); } /* @@ -302,7 +387,7 @@ uart_intr_reason(struct uart_softc *sc) if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0) return (IIR_RLS); - else if (fifo_numchars(&sc->rxfifo) > 0 && (sc->ier & IER_ERXRDY) != 0) + else if (rxfifo_numchars(sc) > 0 && (sc->ier & IER_ERXRDY) != 0) return (IIR_RXTOUT); else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0) return (IIR_TXRDY); @@ -319,9 +404,14 @@ uart_reset(struct uart_softc *sc) divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16; sc->dll = divisor; +#ifndef __FreeBSD__ + sc->dlh = 0; +#else sc->dlh = divisor >> 16; +#endif + sc->msr = modem_status(sc->mcr); - fifo_reset(&sc->rxfifo, 1); /* no fifo until enabled by software */ + rxfifo_reset(sc, 1); /* no fifo until enabled by software */ } /* @@ -341,7 +431,6 @@ uart_toggle_intr(struct uart_softc *sc) (*sc->intr_assert)(sc->arg); } -#ifdef __FreeBSD__ static void uart_drain(int fd, enum ev_type ev, void *arg) { @@ -350,7 +439,7 @@ uart_drain(int fd, enum ev_type ev, void *arg) sc = arg; - assert(fd == STDIN_FILENO); + assert(fd == sc->tty.rfd); assert(ev == EVF_READ); /* @@ -361,35 +450,11 @@ uart_drain(int fd, enum ev_type ev, void *arg) pthread_mutex_lock(&sc->mtx); if ((sc->mcr & MCR_LOOPBACK) != 0) { - (void) ttyread(); - } else { - while (fifo_available(&sc->rxfifo) && - ((ch = ttyread()) != -1)) { - fifo_putchar(&sc->rxfifo, ch); - } - uart_toggle_intr(sc); - } - - pthread_mutex_unlock(&sc->mtx); -} -#else -static void -uart_tty_drain(struct uart_softc *sc) -{ - int ch; - - /* - * Take the softc lock to protect against concurrent - * access from a vCPU i/o exit - */ - pthread_mutex_lock(&sc->mtx); - - if 
((sc->mcr & MCR_LOOPBACK) != 0) { - (void) ttyread(); + (void) ttyread(&sc->tty); } else { - while (fifo_available(&sc->rxfifo) && - ((ch = ttyread()) != -1)) { - fifo_putchar(&sc->rxfifo, ch); + while (rxfifo_available(sc) && + ((ch = ttyread(&sc->tty)) != -1)) { + rxfifo_putchar(sc, ch); } uart_toggle_intr(sc); } @@ -397,50 +462,6 @@ uart_tty_drain(struct uart_softc *sc) pthread_mutex_unlock(&sc->mtx); } -static int -uart_bcons_drain(struct uart_softc *sc) -{ - char ch; - int nbytes; - int ret = 0; - - /* - * Take the softc lock to protect against concurrent - * access from a vCPU i/o exit - */ - pthread_mutex_lock(&sc->mtx); - - if ((sc->mcr & MCR_LOOPBACK) != 0) { - (void) read(sc->usc_bcons.clifd, &ch, 1); - } else { - for (;;) { - nbytes = read(sc->usc_bcons.clifd, &ch, 1); - if (nbytes == 0) { - ret = 1; - break; - } - if (nbytes == -1 && - errno != EINTR && errno != EAGAIN) { - ret = -1; - break; - } - if (nbytes == -1) { - break; - } - - if (fifo_available(&sc->rxfifo)) { - fifo_putchar(&sc->rxfifo, ch); - } - } - uart_toggle_intr(sc); - } - - pthread_mutex_unlock(&sc->mtx); - - return (ret); -} -#endif - void uart_write(struct uart_softc *sc, int offset, uint8_t value) { @@ -449,12 +470,6 @@ uart_write(struct uart_softc *sc, int offset, uint8_t value) pthread_mutex_lock(&sc->mtx); - /* Open terminal */ - if (!sc->opened && sc->stdio) { - uart_opentty(sc); - sc->opened = true; - } - /* * Take care of the special case DLAB accesses first */ @@ -473,108 +488,96 @@ uart_write(struct uart_softc *sc, int offset, uint8_t value) switch (offset) { case REG_DATA: if (sc->mcr & MCR_LOOPBACK) { - if (fifo_putchar(&sc->rxfifo, value) != 0) + if (rxfifo_putchar(sc, value) != 0) sc->lsr |= LSR_OE; - } else if (sc->stdio) { - ttywrite(value); + } else if (sc->tty.opened) { + ttywrite(&sc->tty, value); #ifndef __FreeBSD__ - } else if (sc->bcons) { - bconswrite(sc, value); + } else if (sc->sock) { + sockwrite(sc, value); #endif } /* else drop on floor */ sc->thre_int_pending = true; break; case REG_IER: + /* Set pending when IER_ETXRDY is raised (edge-triggered). */ + if ((sc->ier & IER_ETXRDY) == 0 && (value & IER_ETXRDY) != 0) + sc->thre_int_pending = true; /* * Apply mask so that bits 4-7 are 0 * Also enables bits 0-3 only if they're 1 */ sc->ier = value & 0x0F; break; - case REG_FCR: - /* - * When moving from FIFO and 16450 mode and vice versa, - * the FIFO contents are reset. - */ - if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { - fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; - fifo_reset(&sc->rxfifo, fifosz); - } + case REG_FCR: + /* + * When moving from FIFO and 16450 mode and vice versa, + * the FIFO contents are reset. + */ + if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { + fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; + rxfifo_reset(sc, fifosz); + } - /* - * The FCR_ENABLE bit must be '1' for the programming - * of other FCR bits to be effective. 
- */ - if ((value & FCR_ENABLE) == 0) { - sc->fcr = 0; - } else { - if ((value & FCR_RCV_RST) != 0) - fifo_reset(&sc->rxfifo, FIFOSZ); - - sc->fcr = value & - (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); - } - break; - case REG_LCR: - sc->lcr = value; - break; - case REG_MCR: - /* Apply mask so that bits 5-7 are 0 */ - sc->mcr = value & 0x1F; - - msr = 0; - if (sc->mcr & MCR_LOOPBACK) { - /* - * In the loopback mode certain bits from the - * MCR are reflected back into MSR - */ - if (sc->mcr & MCR_RTS) - msr |= MSR_CTS; - if (sc->mcr & MCR_DTR) - msr |= MSR_DSR; - if (sc->mcr & MCR_OUT1) - msr |= MSR_RI; - if (sc->mcr & MCR_OUT2) - msr |= MSR_DCD; - } + /* + * The FCR_ENABLE bit must be '1' for the programming + * of other FCR bits to be effective. + */ + if ((value & FCR_ENABLE) == 0) { + sc->fcr = 0; + } else { + if ((value & FCR_RCV_RST) != 0) + rxfifo_reset(sc, FIFOSZ); + + sc->fcr = value & + (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); + } + break; + case REG_LCR: + sc->lcr = value; + break; + case REG_MCR: + /* Apply mask so that bits 5-7 are 0 */ + sc->mcr = value & 0x1F; + msr = modem_status(sc->mcr); - /* - * Detect if there has been any change between the - * previous and the new value of MSR. If there is - * then assert the appropriate MSR delta bit. - */ - if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) - sc->msr |= MSR_DCTS; - if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) - sc->msr |= MSR_DDSR; - if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) - sc->msr |= MSR_DDCD; - if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) - sc->msr |= MSR_TERI; - - /* - * Update the value of MSR while retaining the delta - * bits. - */ - sc->msr &= MSR_DELTA_MASK; - sc->msr |= msr; - break; - case REG_LSR: - /* - * Line status register is not meant to be written to - * during normal operation. - */ - break; - case REG_MSR: - /* - * As far as I can tell MSR is a read-only register. - */ - break; - case REG_SCR: - sc->scr = value; - break; - default: - break; + /* + * Detect if there has been any change between the + * previous and the new value of MSR. If there is + * then assert the appropriate MSR delta bit. + */ + if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) + sc->msr |= MSR_DCTS; + if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) + sc->msr |= MSR_DDSR; + if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) + sc->msr |= MSR_DDCD; + if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) + sc->msr |= MSR_TERI; + + /* + * Update the value of MSR while retaining the delta + * bits. + */ + sc->msr &= MSR_DELTA_MASK; + sc->msr |= msr; + break; + case REG_LSR: + /* + * Line status register is not meant to be written to + * during normal operation. + */ + break; + case REG_MSR: + /* + * As far as I can tell MSR is a read-only register. 
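The MCR/MSR handling above latches "delta" bits whenever a modem-status line changes, so the guest can observe transitions that happened between its reads of MSR. A small sketch of that latching rule for two of the lines; the constants mirror the 16550 register layout, and this is an illustration rather than the emulator code:

```c
/* Sketch: latch a delta bit on any CTS/DSR transition. */
#include <assert.h>
#include <stdint.h>

#define	MSR_DCTS	0x01	/* delta CTS */
#define	MSR_DDSR	0x02	/* delta DSR */
#define	MSR_CTS		0x10
#define	MSR_DSR		0x20

static uint8_t
msr_update(uint8_t oldmsr, uint8_t newstat)
{
	uint8_t msr = oldmsr & (MSR_DCTS | MSR_DDSR);	/* keep deltas */

	if ((newstat ^ oldmsr) & MSR_CTS)
		msr |= MSR_DCTS;
	if ((newstat ^ oldmsr) & MSR_DSR)
		msr |= MSR_DDSR;
	return (msr | newstat);
}

int
main(void)
{
	uint8_t msr = MSR_DSR;			/* DSR up, no deltas */

	msr = msr_update(msr, MSR_CTS | MSR_DSR); /* CTS comes up */
	assert(msr == (MSR_CTS | MSR_DSR | MSR_DCTS));
	return (0);
}
```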
+ */ + break; + case REG_SCR: + sc->scr = value; + break; + default: + break; } done: @@ -589,12 +592,6 @@ uart_read(struct uart_softc *sc, int offset) pthread_mutex_lock(&sc->mtx); - /* Open terminal */ - if (!sc->opened && sc->stdio) { - uart_opentty(sc); - sc->opened = true; - } - /* * Take care of the special case DLAB accesses first */ @@ -612,7 +609,7 @@ uart_read(struct uart_softc *sc, int offset) switch (offset) { case REG_DATA: - reg = fifo_getchar(&sc->rxfifo); + reg = rxfifo_getchar(sc); break; case REG_IER: reg = sc->ier; @@ -643,7 +640,7 @@ uart_read(struct uart_softc *sc, int offset) sc->lsr |= LSR_TEMT | LSR_THRE; /* Check for new receive data */ - if (fifo_numchars(&sc->rxfifo) > 0) + if (rxfifo_numchars(sc) > 0) sc->lsr |= LSR_RXRDY; else sc->lsr &= ~LSR_RXRDY; @@ -676,277 +673,123 @@ done: } #ifndef __FreeBSD__ -static void * -uart_tty_thread(void *param) -{ - struct uart_softc *sc = param; - pollfd_t pollset; - - pollset.fd = STDIN_FILENO; - pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND; - - for (;;) { - if (poll(&pollset, 1, -1) < 0) { - if (errno != EINTR) { - perror("poll failed"); - break; - } - continue; - } - uart_tty_drain(sc); - } - - return (NULL); -} - -/* - * Read the "ident" string from the client's descriptor; this routine also - * tolerates being called with pid=NULL, for times when you want to "eat" - * the ident string from a client without saving it. - */ -static int -get_client_ident(int clifd, pid_t *pid) +static void +uart_sock_drain(int fd, enum ev_type ev, void *arg) { - char buf[BUFSIZ], *bufp; - size_t buflen = sizeof (buf); - char c = '\0'; - int i = 0, r; - - /* "eat up the ident string" case, for simplicity */ - if (pid == NULL) { - while (read(clifd, &c, 1) == 1) { - if (c == '\n') - return (0); - } - } - - bzero(buf, sizeof (buf)); - while ((buflen > 1) && (r = read(clifd, &c, 1)) == 1) { - buflen--; - if (c == '\n') - break; - - buf[i] = c; - i++; - } - if (r == -1) - return (-1); - - /* - * We've filled the buffer, but still haven't seen \n. Keep eating - * until we find it; we don't expect this to happen, but this is - * defensive. 
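On the read side of this hunk, the emulation never models a busy transmitter, so LSR always reports THRE and TEMT and only RXRDY varies with receive-FIFO occupancy. A sketch of that derivation, with the LSR_* values written out as plain constants:

```c
/* Sketch: LSR as a pure function of rx occupancy; tx is never busy. */
#include <assert.h>
#include <stdint.h>

#define	LSR_RXRDY	0x01
#define	LSR_THRE	0x20
#define	LSR_TEMT	0x40

static uint8_t
lsr_read(uint8_t lsr, int rxchars)
{
	lsr |= LSR_TEMT | LSR_THRE;	/* tx path is always idle */
	if (rxchars > 0)
		lsr |= LSR_RXRDY;
	else
		lsr &= ~LSR_RXRDY;
	return (lsr);
}

int
main(void)
{
	assert(lsr_read(0, 3) == (LSR_TEMT | LSR_THRE | LSR_RXRDY));
	assert(lsr_read(0, 0) == (LSR_TEMT | LSR_THRE));
	return (0);
}
```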
- */ - if (c != '\n') { - while ((r = read(clifd, &c, sizeof (c))) > 0) - if (c == '\n') - break; - } + struct uart_softc *sc = arg; + char ch; /* - * Parse buffer for message of the form: IDENT + * Take the softc lock to protect against concurrent + * access from a vCPU i/o exit */ - bufp = buf; - if (strncmp(bufp, "IDENT ", 6) != 0) - return (-1); - bufp += 6; - errno = 0; - *pid = strtoll(bufp, &bufp, 10); - if (errno != 0) - return (-1); + pthread_mutex_lock(&sc->mtx); - return (0); -} + if ((sc->mcr & MCR_LOOPBACK) != 0) { + (void) read(sc->usc_sock.clifd, &ch, 1); + } else { + bool err_close = false; -static int -uart_bcons_accept_client(struct uart_softc *sc) -{ - int connfd; - struct sockaddr_un cliaddr; - socklen_t clilen; - pid_t pid; - - clilen = sizeof (cliaddr); - connfd = accept(sc->usc_bcons.servfd, - (struct sockaddr *)&cliaddr, &clilen); - if (connfd == -1) - return (-1); - if (get_client_ident(connfd, &pid) == -1) { - (void) shutdown(connfd, SHUT_RDWR); - (void) close(connfd); - return (-1); - } + while (rxfifo_available(sc)) { + int res; - if (fcntl(connfd, F_SETFL, O_NONBLOCK) < 0) { - (void) shutdown(connfd, SHUT_RDWR); - (void) close(connfd); - return (-1); - } - (void) write(connfd, "OK\n", 3); + res = read(sc->usc_sock.clifd, &ch, 1); + if (res == 0) { + err_close = true; + break; + } else if (res == -1) { + if (errno != EAGAIN && errno != EINTR) { + err_close = true; + } + break; + } - sc->usc_bcons.clipid = pid; - sc->usc_bcons.clifd = connfd; + rxfifo_putchar(sc, ch); + } + uart_toggle_intr(sc); - printf("Connection from process ID %lu.\n", pid); + if (err_close) { + (void) fprintf(stderr, "uart: closing client conn\n"); + (void) shutdown(sc->usc_sock.clifd, SHUT_RDWR); + mevent_delete_close(sc->mev); + sc->mev = NULL; + sc->usc_sock.clifd = -1; + } + } - return (0); + pthread_mutex_unlock(&sc->mtx); } static void -uart_bcons_reject_client(struct uart_softc *sc) +uart_sock_accept(int fd, enum ev_type ev, void *arg) { + struct uart_softc *sc = arg; int connfd; - struct sockaddr_un cliaddr; - socklen_t clilen; - char nak[MAXPATHLEN]; - clilen = sizeof (cliaddr); - connfd = accept(sc->usc_bcons.servfd, - (struct sockaddr *)&cliaddr, &clilen); + connfd = accept(sc->usc_sock.servfd, NULL, NULL); + if (connfd == -1) { + return; + } /* - * After hear its ident string, tell client to get lost. + * Do client connection management under protection of the softc lock + * to avoid racing with concurrent UART events. 
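uart_sock_accept() in this hunk enforces a one-client policy: while a client is attached, any new connection is shut down immediately, and an accepted descriptor is made non-blocking before the event loop ever sees it. A sketch of that policy as a hypothetical helper (accept_one is not a bhyve function):

```c
/*
 * Sketch: accept at most one client; reject extras and failed setups.
 * Returns the (possibly unchanged) current client fd.
 */
#include <fcntl.h>
#include <sys/socket.h>
#include <unistd.h>

static int
accept_one(int servfd, int curfd)
{
	int fd = accept(servfd, NULL, NULL);

	if (fd == -1)
		return (curfd);
	if (curfd != -1 || fcntl(fd, F_SETFL, O_NONBLOCK) < 0) {
		(void) shutdown(fd, SHUT_RDWR);	/* busy, or setup failed */
		(void) close(fd);
		return (curfd);
	}
	return (fd);	/* caller registers fd with the event loop */
}
```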
*/ - if (get_client_ident(connfd, NULL) == 0) { - (void) snprintf(nak, sizeof (nak), "%lu\n", - sc->usc_bcons.clipid); - (void) write(connfd, nak, strlen(nak)); - } - (void) shutdown(connfd, SHUT_RDWR); - (void) close(connfd); -} - -static int -uart_bcons_client_event(struct uart_softc *sc) -{ - int res; - - res = uart_bcons_drain(sc); - if (res < 0) - return (-1); - - if (res > 0) { - fprintf(stderr, "Closing connection with bhyve console\n"); - (void) shutdown(sc->usc_bcons.clifd, SHUT_RDWR); - (void) close(sc->usc_bcons.clifd); - sc->usc_bcons.clifd = -1; - } - - return (0); -} - -static void -uart_bcons_server_event(struct uart_softc *sc) -{ - int clifd; + pthread_mutex_lock(&sc->mtx); - if (sc->usc_bcons.clifd != -1) { + if (sc->usc_sock.clifd != -1) { /* we're already handling a client */ - uart_bcons_reject_client(sc); - return; - } - - if (uart_bcons_accept_client(sc) == 0) { - pthread_mutex_lock(&bcons_wait_lock); - bcons_connected = B_TRUE; - pthread_cond_signal(&bcons_wait_done); - pthread_mutex_unlock(&bcons_wait_lock); - } -} - -static void * -uart_bcons_thread(void *param) -{ - struct uart_softc *sc = param; - struct pollfd pollfds[2]; - int res; - - /* read from client and write to vm */ - pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND | - POLLPRI | POLLERR | POLLHUP; - - /* the server socket; watch for events (new connections) */ - pollfds[1].events = pollfds[0].events; - - for (;;) { - pollfds[0].fd = sc->usc_bcons.clifd; - pollfds[1].fd = sc->usc_bcons.servfd; - pollfds[0].revents = pollfds[1].revents = 0; - - res = poll(pollfds, - sizeof (pollfds) / sizeof (struct pollfd), -1); - - if (res == -1 && errno != EINTR) { - perror("poll failed"); - /* we are hosed, close connection */ - break; - } - - /* event from client side */ - if (pollfds[0].revents) { - if (pollfds[0].revents & - (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { - if (uart_bcons_client_event(sc) < 0) - break; - } else { - break; - } - } - - /* event from server socket */ - if (pollfds[1].revents) { - if (pollfds[1].revents & (POLLIN | POLLRDNORM)) { - uart_bcons_server_event(sc); - } else { - break; - } + (void) fprintf(stderr, "uart: unexpected client conn\n"); + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); + } else { + if (fcntl(connfd, F_SETFL, O_NONBLOCK) < 0) { + perror("uart: fcntl(O_NONBLOCK)"); + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); + } else { + sc->usc_sock.clifd = connfd; + sc->mev = mevent_add(sc->usc_sock.clifd, EVF_READ, + uart_sock_drain, sc); } } - if (sc->usc_bcons.clifd != -1) { - fprintf(stderr, "Closing connection with bhyve console\n"); - (void) shutdown(sc->usc_bcons.clifd, SHUT_RDWR); - (void) close(sc->usc_bcons.clifd); - sc->usc_bcons.clifd = -1; - } - - return (NULL); + pthread_mutex_unlock(&sc->mtx); } static int -init_bcons_sock(void) +init_sock(const char *path) { int servfd; struct sockaddr_un servaddr; - if (mkdir(BHYVE_TMPDIR, S_IRWXU) < 0 && errno != EEXIST) { - fprintf(stderr, "bhyve console setup: " - "could not mkdir %s", BHYVE_TMPDIR, strerror(errno)); - return (-1); - } - bzero(&servaddr, sizeof (servaddr)); servaddr.sun_family = AF_UNIX; - (void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path), - BHYVE_CONS_SOCKPATH, vmname); + + if (strlcpy(servaddr.sun_path, path, sizeof (servaddr.sun_path)) >= + sizeof (servaddr.sun_path)) { + (void) fprintf(stderr, "uart: path '%s' too long\n", + path); + return (-1); + } if ((servfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { - fprintf(stderr, "bhyve console setup: " - "could not 
create socket\n"); + (void) fprintf(stderr, "uart: socket() error - %s\n", + strerror(errno)); return (-1); } (void) unlink(servaddr.sun_path); if (bind(servfd, (struct sockaddr *)&servaddr, sizeof (servaddr)) == -1) { - fprintf(stderr, "bhyve console setup: " - "could not bind to socket\n"); + (void) fprintf(stderr, "uart: bind() error - %s\n", + strerror(errno)); goto out; } - if (listen(servfd, 4) == -1) { - fprintf(stderr, "bhyve console setup: " - "could not listen on socket"); + if (listen(servfd, 1) == -1) { + (void) fprintf(stderr, "uart: listen() error - %s\n", + strerror(errno)); goto out; } return (servfd); @@ -956,7 +799,7 @@ out: (void) close(servfd); return (-1); } -#endif +#endif /* not __FreeBSD__ */ int uart_legacy_alloc(int which, int *baseaddr, int *irq) @@ -978,8 +821,7 @@ uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, { struct uart_softc *sc; - sc = malloc(sizeof(struct uart_softc)); - bzero(sc, sizeof(struct uart_softc)); + sc = calloc(1, sizeof(struct uart_softc)); sc->arg = arg; sc->intr_assert = intr_assert; @@ -992,51 +834,130 @@ uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, return (sc); } -int -uart_set_backend(struct uart_softc *sc, const char *opts) +#ifndef __FreeBSD__ +static int +uart_sock_backend(struct uart_softc *sc, const char *inopts) { -#ifndef __FreeBSD__ - int error; + char *opts; + char *opt; + char *nextopt; + char *path = NULL; + + if (strncmp(inopts, "socket,", 7) != 0) { + return (-1); + } + if ((opts = strdup(inopts + 7)) == NULL) { + return (-1); + } + + nextopt = opts; + for (opt = strsep(&nextopt, ","); opt != NULL; + opt = strsep(&nextopt, ",")) { + if (path == NULL && *opt == '/') { + path = opt; + continue; + } + /* + * XXX check for server and client options here. For now, + * everything is a server + */ + free(opts); + return (-1); + } + + sc->usc_sock.clifd = -1; + if ((sc->usc_sock.servfd = init_sock(path)) == -1) { + free(opts); + return (-1); + } + sc->sock = true; + sc->tty.rfd = sc->tty.wfd = -1; + sc->usc_sock.servmev = mevent_add(sc->usc_sock.servfd, EVF_READ, + uart_sock_accept, sc); + assert(sc->usc_sock.servmev != NULL); + + return (0); +} +#endif /* not __FreeBSD__ */ + +static int +uart_stdio_backend(struct uart_softc *sc) +{ +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; #endif - /* - * XXX one stdio backend supported at this time. 
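The socket backend's option string has the shape "socket,/path[,opt...]": strsep() peels off comma-separated tokens after the backend prefix, and the first token beginning with '/' is taken as the socket path. A runnable sketch of just that parsing, using a hypothetical path:

```c
/* Sketch of the "socket,/path" option walk; not the bhyve code. */
#include <assert.h>
#include <stdlib.h>
#include <string.h>

int
main(void)
{
	char *opts = strdup("socket,/tmp/vm.console");
	char *next = opts + 7;		/* skip the "socket," prefix */
	char *path = NULL;

	for (char *opt = strsep(&next, ","); opt != NULL;
	    opt = strsep(&next, ",")) {
		if (path == NULL && *opt == '/')
			path = opt;
		/* unknown options would be rejected here */
	}
	assert(path != NULL && strcmp(path, "/tmp/vm.console") == 0);
	free(opts);
	return (0);
}
```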
- */ - if (opts == NULL) - return (0); -#ifdef __FreeBSD__ - if (strcmp("stdio", opts) == 0 && !uart_stdio) { - sc->stdio = true; - uart_stdio = true; - return (0); -#else - if (strcmp("stdio", opts) == 0 && !uart_stdio && !uart_bcons) { - sc->stdio = true; - uart_stdio = true; + if (uart_stdio) + return (-1); - error = pthread_create(NULL, NULL, uart_tty_thread, sc); - assert(error == 0); + sc->tty.rfd = STDIN_FILENO; + sc->tty.wfd = STDOUT_FILENO; + sc->tty.opened = true; - return (0); - } else if (strstr(opts, "bcons") != 0 && !uart_stdio && !uart_bcons) { - sc->bcons = true; - uart_bcons= true; + if (fcntl(sc->tty.rfd, F_SETFL, O_NONBLOCK) != 0) + return (-1); + if (fcntl(sc->tty.wfd, F_SETFL, O_NONBLOCK) != 0) + return (-1); - if (strstr(opts, "bcons,wait") != 0) { - bcons_wait = true; - } +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ); + if (caph_rights_limit(sc->tty.rfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(sc->tty.rfd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif - sc->usc_bcons.clifd = -1; - if ((sc->usc_bcons.servfd = init_bcons_sock()) == -1) { - fprintf(stderr, "bhyve console setup: " - "socket initialization failed\n"); - return (-1); - } - error = pthread_create(NULL, NULL, uart_bcons_thread, sc); - assert(error == 0); + uart_stdio = true; - return (0); + return (0); +} + +static int +uart_tty_backend(struct uart_softc *sc, const char *opts) +{ +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; #endif - } else + int fd; + + fd = open(opts, O_RDWR | O_NONBLOCK); + if (fd < 0 || !isatty(fd)) return (-1); + + sc->tty.rfd = sc->tty.wfd = fd; + sc->tty.opened = true; + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, CAP_WRITE); + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + return (0); +} + +int +uart_set_backend(struct uart_softc *sc, const char *opts) +{ + int retval; + + if (opts == NULL) + return (0); + +#ifndef __FreeBSD__ + if (strncmp("socket,", opts, 7) == 0) + return (uart_sock_backend(sc, opts)); +#endif + if (strcmp("stdio", opts) == 0) + retval = uart_stdio_backend(sc); + else + retval = uart_tty_backend(sc, opts); + if (retval == 0) + uart_opentty(sc); + + return (retval); } diff --git a/usr/src/cmd/bhyve/uart_emul.h b/usr/src/cmd/bhyve/uart_emul.h index ecff957991..a87202df1f 100644 --- a/usr/src/cmd/bhyve/uart_emul.h +++ b/usr/src/cmd/bhyve/uart_emul.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Neel Natu * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/uart_emul.h 257293 2013-10-29 00:18:11Z neel $ + * $FreeBSD$ */ #ifndef _UART_EMUL_H_ diff --git a/usr/src/cmd/bhyve/usb_emul.c b/usr/src/cmd/bhyve/usb_emul.c new file mode 100644 index 0000000000..6ecdd9530e --- /dev/null +++ b/usr/src/cmd/bhyve/usb_emul.c @@ -0,0 +1,78 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Nahanni Systems Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include + +#include "usb_emul.h" + +SET_DECLARE(usb_emu_set, struct usb_devemu); + +struct usb_devemu * +usb_emu_finddev(char *name) +{ + struct usb_devemu **udpp, *udp; + + SET_FOREACH(udpp, usb_emu_set) { + udp = *udpp; + if (!strcmp(udp->ue_emu, name)) + return (udp); + } + + return (NULL); +} + +struct usb_data_xfer_block * +usb_data_xfer_append(struct usb_data_xfer *xfer, void *buf, int blen, + void *hci_data, int ccs) +{ + struct usb_data_xfer_block *xb; + + if (xfer->ndata >= USB_MAX_XFER_BLOCKS) + return (NULL); + + xb = &xfer->data[xfer->tail]; + xb->buf = buf; + xb->blen = blen; + xb->hci_data = hci_data; + xb->ccs = ccs; + xb->processed = 0; + xb->bdone = 0; + xfer->ndata++; + xfer->tail = (xfer->tail + 1) % USB_MAX_XFER_BLOCKS; + return (xb); +} diff --git a/usr/src/cmd/bhyve/usb_emul.h b/usr/src/cmd/bhyve/usb_emul.h new file mode 100644 index 0000000000..e55a421b6f --- /dev/null +++ b/usr/src/cmd/bhyve/usb_emul.h @@ -0,0 +1,164 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
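usb_emu_finddev() below walks the usb_emu_set linker set, in which each emulation registers itself by name via USB_EMUL_SET(). Here is the lookup logic sketched with a plain static table standing in for the linker set; the linker-set mechanics themselves (SET_DECLARE/SET_FOREACH) are omitted, so the names here are illustrative only:

```c
/* Sketch: name-based device-emulation lookup over a static "set". */
#include <assert.h>
#include <stddef.h>
#include <string.h>

struct devemu {
	const char *name;
};

static const struct devemu tablet = { "tablet" };
static const struct devemu *devemu_set[] = { &tablet };

static const struct devemu *
emu_finddev(const char *name)
{
	for (size_t i = 0; i < sizeof (devemu_set) / sizeof (devemu_set[0]);
	    i++) {
		if (strcmp(devemu_set[i]->name, name) == 0)
			return (devemu_set[i]);
	}
	return (NULL);
}

int
main(void)
{
	assert(emu_finddev("tablet") == &tablet);
	assert(emu_finddev("mouse") == NULL);
	return (0);
}
```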
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _USB_EMUL_H_ +#define _USB_EMUL_H_ + +#include +#include +#include +#ifndef __FreeBSD__ +#include +#endif + +#define USB_MAX_XFER_BLOCKS 8 + +#define USB_XFER_OUT 0 +#define USB_XFER_IN 1 + + + +struct usb_hci; +struct usb_device_request; +struct usb_data_xfer; + +/* Device emulation handlers */ +struct usb_devemu { + char *ue_emu; /* name of device emulation */ + int ue_usbver; /* usb version: 2 or 3 */ + int ue_usbspeed; /* usb device speed */ + + /* instance creation */ + void *(*ue_init)(struct usb_hci *hci, char *opt); + + /* handlers */ + int (*ue_request)(void *sc, struct usb_data_xfer *xfer); + int (*ue_data)(void *sc, struct usb_data_xfer *xfer, int dir, + int epctx); + int (*ue_reset)(void *sc); + int (*ue_remove)(void *sc); + int (*ue_stop)(void *sc); +}; +#define USB_EMUL_SET(x) DATA_SET(usb_emu_set, x); + +/* + * USB device events to notify HCI when state changes + */ +enum hci_usbev { + USBDEV_ATTACH, + USBDEV_RESET, + USBDEV_STOP, + USBDEV_REMOVE, +}; + +/* usb controller, ie xhci, ehci */ +struct usb_hci { + int (*hci_intr)(struct usb_hci *hci, int epctx); + int (*hci_event)(struct usb_hci *hci, enum hci_usbev evid, + void *param); + void *hci_sc; /* private softc for hci */ + + /* controller managed fields */ + int hci_address; + int hci_port; +}; + +/* + * Each xfer block is mapped to the hci transfer block. + * On input into the device handler, blen is set to the length of buf. + * The device handler is to update blen to reflect the residual size + * of the buffer, i.e. len(buf) - len(consumed).
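+ * For example (illustrative only), a handler that copies n bytes
+ * into a block xb would do:
+ *	memcpy(xb->buf, src, n); xb->bdone += n; xb->blen -= n;
+ * so a final blen of zero means the buffer was exactly consumed,
+ * while anything larger signals a short transfer to the HCI.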
+ */ +struct usb_data_xfer_block { + void *buf; /* IN or OUT pointer */ + int blen; /* in:len(buf), out:len(remaining) */ + int bdone; /* bytes transferred */ + uint32_t processed; /* device processed this + errcode */ + void *hci_data; /* HCI private reference */ + int ccs; + uint32_t streamid; + uint64_t trbnext; /* next TRB guest address */ +}; + +struct usb_data_xfer { + struct usb_data_xfer_block data[USB_MAX_XFER_BLOCKS]; + struct usb_device_request *ureq; /* setup ctl request */ + int ndata; /* # of data items */ + int head; + int tail; + pthread_mutex_t mtx; +}; + +enum USB_ERRCODE { + USB_ACK, + USB_NAK, + USB_STALL, + USB_NYET, + USB_ERR, + USB_SHORT +}; + +#define USB_DATA_GET_ERRCODE(x) (x)->processed >> 8 +#define USB_DATA_SET_ERRCODE(x,e) do { \ + (x)->processed = ((x)->processed & 0xFF) | (e << 8); \ + } while (0) + +#define USB_DATA_OK(x,i) ((x)->data[(i)].buf != NULL) + +#define USB_DATA_XFER_INIT(x) do { \ + memset((x), 0, sizeof(*(x))); \ + pthread_mutex_init(&((x)->mtx), NULL); \ + } while (0) + +#define USB_DATA_XFER_RESET(x) do { \ + memset((x)->data, 0, sizeof((x)->data)); \ + (x)->ndata = 0; \ + (x)->head = (x)->tail = 0; \ + } while (0) + +#define USB_DATA_XFER_LOCK(x) do { \ + pthread_mutex_lock(&((x)->mtx)); \ + } while (0) + +#define USB_DATA_XFER_UNLOCK(x) do { \ + pthread_mutex_unlock(&((x)->mtx)); \ + } while (0) +#ifndef __FreeBSD__ +#define USB_DATA_XFER_LOCK_HELD(x) MUTEX_HELD(&((x)->mtx)) +#endif + +struct usb_devemu *usb_emu_finddev(char *name); + +struct usb_data_xfer_block *usb_data_xfer_append(struct usb_data_xfer *xfer, + void *buf, int blen, void *hci_data, int ccs); + + +#endif /* _USB_EMUL_H_ */ diff --git a/usr/src/cmd/bhyve/usb_mouse.c b/usr/src/cmd/bhyve/usb_mouse.c new file mode 100644 index 0000000000..921fce5db9 --- /dev/null +++ b/usr/src/cmd/bhyve/usb_mouse.c @@ -0,0 +1,809 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#include +#include +#include +#include + +#include +#include + +#include "usb_emul.h" +#include "console.h" +#include "bhyvegc.h" + +static int umouse_debug = 0; +#define DPRINTF(params) if (umouse_debug) printf params +#define WPRINTF(params) printf params + +/* USB endpoint context (1-15) for reporting mouse data events*/ +#define UMOUSE_INTR_ENDPT 1 + +#define UMOUSE_REPORT_DESC_TYPE 0x22 + +#define UMOUSE_GET_REPORT 0x01 +#define UMOUSE_GET_IDLE 0x02 +#define UMOUSE_GET_PROTOCOL 0x03 +#define UMOUSE_SET_REPORT 0x09 +#define UMOUSE_SET_IDLE 0x0A +#define UMOUSE_SET_PROTOCOL 0x0B + +#define HSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } + +enum { + UMSTR_LANG, + UMSTR_MANUFACTURER, + UMSTR_PRODUCT, + UMSTR_SERIAL, + UMSTR_CONFIG, + UMSTR_MAX +}; + +static const char *umouse_desc_strings[] = { + "\x04\x09", + "BHYVE", + "HID Tablet", + "01", + "HID Tablet Device", +}; + +struct umouse_hid_descriptor { + uint8_t bLength; + uint8_t bDescriptorType; + uint8_t bcdHID[2]; + uint8_t bCountryCode; + uint8_t bNumDescriptors; + uint8_t bReportDescriptorType; + uint8_t wItemLength[2]; +} __packed; + +struct umouse_config_desc { + struct usb_config_descriptor confd; + struct usb_interface_descriptor ifcd; + struct umouse_hid_descriptor hidd; + struct usb_endpoint_descriptor endpd; + struct usb_endpoint_ss_comp_descriptor sscompd; +} __packed; + +#define MOUSE_MAX_X 0x8000 +#define MOUSE_MAX_Y 0x8000 + +static const uint8_t umouse_report_desc[] = { + 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ + 0x09, 0x02, /* USAGE (Mouse) */ + 0xa1, 0x01, /* COLLECTION (Application) */ + 0x09, 0x01, /* USAGE (Pointer) */ + 0xa1, 0x00, /* COLLECTION (Physical) */ + 0x05, 0x09, /* USAGE_PAGE (Button) */ + 0x19, 0x01, /* USAGE_MINIMUM (Button 1) */ + 0x29, 0x03, /* USAGE_MAXIMUM (Button 3) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ + 0x75, 0x01, /* REPORT_SIZE (1) */ + 0x95, 0x03, /* REPORT_COUNT (3) */ + 0x81, 0x02, /* INPUT (Data,Var,Abs); 3 buttons */ + 0x75, 0x05, /* REPORT_SIZE (5) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x81, 0x03, /* INPUT (Cnst,Var,Abs); padding */ + 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ + 0x09, 0x30, /* USAGE (X) */ + 0x09, 0x31, /* USAGE (Y) */ + 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ + 0x46, 0xff, 0x7f, /* PHYSICAL_MAXIMUM (0x7fff) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x26, 0xff, 0x7f, /* LOGICAL_MAXIMUM (0x7fff) */ + 0x75, 0x10, /* REPORT_SIZE (16) */ + 0x95, 0x02, /* REPORT_COUNT (2) */ + 0x81, 0x02, /* INPUT (Data,Var,Abs) */ + 0x05, 0x01, /* USAGE Page (Generic Desktop) */ + 0x09, 0x38, /* USAGE (Wheel) */ + 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ + 0x45, 0x00, /* PHYSICAL_MAXIMUM (0) */ + 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */ + 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */ + 0x75, 0x08, /* REPORT_SIZE (8) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x81, 0x06, /* INPUT (Data,Var,Rel) */ + 0xc0, /* END_COLLECTION */ + 0xc0 /* END_COLLECTION */ +}; + +struct umouse_report { + uint8_t buttons; /* bits: 0 left, 1 right, 2 middle */ + int16_t x; /* x position */ + int16_t y; /* y position */ + int8_t z; /* z wheel position */ +} __packed; + + +#define MSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } + +static struct usb_device_descriptor umouse_dev_desc = { + .bLength = sizeof(umouse_dev_desc), + .bDescriptorType = UDESC_DEVICE, + MSETW(.bcdUSB, UD_USB_3_0), + .bMaxPacketSize = 8, /* max packet size */ + MSETW(.idVendor, 0xFB5D), /* vendor */ + 
MSETW(.idProduct, 0x0001), /* product */ + MSETW(.bcdDevice, 0), /* device version */ + .iManufacturer = UMSTR_MANUFACTURER, + .iProduct = UMSTR_PRODUCT, + .iSerialNumber = UMSTR_SERIAL, + .bNumConfigurations = 1, +}; + +static struct umouse_config_desc umouse_confd = { + .confd = { + .bLength = sizeof(umouse_confd.confd), + .bDescriptorType = UDESC_CONFIG, + .wTotalLength[0] = sizeof(umouse_confd), + .bNumInterface = 1, + .bConfigurationValue = 1, + .iConfiguration = UMSTR_CONFIG, + .bmAttributes = UC_BUS_POWERED | UC_REMOTE_WAKEUP, + .bMaxPower = 0, + }, + .ifcd = { + .bLength = sizeof(umouse_confd.ifcd), + .bDescriptorType = UDESC_INTERFACE, + .bNumEndpoints = 1, + .bInterfaceClass = UICLASS_HID, + .bInterfaceSubClass = UISUBCLASS_BOOT, + .bInterfaceProtocol = UIPROTO_MOUSE, + }, + .hidd = { + .bLength = sizeof(umouse_confd.hidd), + .bDescriptorType = 0x21, + .bcdHID = { 0x01, 0x10 }, + .bCountryCode = 0, + .bNumDescriptors = 1, + .bReportDescriptorType = UMOUSE_REPORT_DESC_TYPE, + .wItemLength = { sizeof(umouse_report_desc), 0 }, + }, + .endpd = { + .bLength = sizeof(umouse_confd.endpd), + .bDescriptorType = UDESC_ENDPOINT, + .bEndpointAddress = UE_DIR_IN | UMOUSE_INTR_ENDPT, + .bmAttributes = UE_INTERRUPT, + .wMaxPacketSize[0] = 8, + .bInterval = 0xA, + }, + .sscompd = { + .bLength = sizeof(umouse_confd.sscompd), + .bDescriptorType = UDESC_ENDPOINT_SS_COMP, + .bMaxBurst = 0, + .bmAttributes = 0, + MSETW(.wBytesPerInterval, 0), + }, +}; + + +struct umouse_bos_desc { + struct usb_bos_descriptor bosd; + struct usb_devcap_ss_descriptor usbssd; +} __packed; + + +struct umouse_bos_desc umouse_bosd = { + .bosd = { + .bLength = sizeof(umouse_bosd.bosd), + .bDescriptorType = UDESC_BOS, + HSETW(.wTotalLength, sizeof(umouse_bosd)), + .bNumDeviceCaps = 1, + }, + .usbssd = { + .bLength = sizeof(umouse_bosd.usbssd), + .bDescriptorType = UDESC_DEVICE_CAPABILITY, + .bDevCapabilityType = 3, + .bmAttributes = 0, + HSETW(.wSpeedsSupported, 0x08), + .bFunctionalitySupport = 3, + .bU1DevExitLat = 0xa, /* dummy - not used */ + .wU2DevExitLat = { 0x20, 0x00 }, + } +}; + + +struct umouse_softc { + struct usb_hci *hci; + + char *opt; + + struct umouse_report um_report; + int newdata; + struct { + uint8_t idle; + uint8_t protocol; + uint8_t feature; + } hid; + + pthread_mutex_t mtx; + pthread_mutex_t ev_mtx; + int polling; + struct timeval prev_evt; +}; + +static void +umouse_event(uint8_t button, int x, int y, void *arg) +{ + struct umouse_softc *sc; + struct bhyvegc_image *gc; + + gc = console_get_image(); + if (gc == NULL) { + /* not ready */ + return; + } + + sc = arg; + + pthread_mutex_lock(&sc->mtx); + + sc->um_report.buttons = 0; + sc->um_report.z = 0; + + if (button & 0x01) + sc->um_report.buttons |= 0x01; /* left */ + if (button & 0x02) + sc->um_report.buttons |= 0x04; /* middle */ + if (button & 0x04) + sc->um_report.buttons |= 0x02; /* right */ + if (button & 0x8) + sc->um_report.z = 1; + if (button & 0x10) + sc->um_report.z = -1; + + /* scale coords to mouse resolution */ + sc->um_report.x = MOUSE_MAX_X * x / gc->width; + sc->um_report.y = MOUSE_MAX_Y * y / gc->height; + sc->newdata = 1; + pthread_mutex_unlock(&sc->mtx); + + pthread_mutex_lock(&sc->ev_mtx); + sc->hci->hci_intr(sc->hci, UE_DIR_IN | UMOUSE_INTR_ENDPT); + pthread_mutex_unlock(&sc->ev_mtx); +} + +static void * +umouse_init(struct usb_hci *hci, char *opt) +{ + struct umouse_softc *sc; + + sc = calloc(1, sizeof(struct umouse_softc)); + sc->hci = hci; + + sc->hid.protocol = 1; /* REPORT protocol */ + sc->opt = strdup(opt); + 
pthread_mutex_init(&sc->mtx, NULL); + pthread_mutex_init(&sc->ev_mtx, NULL); + + console_ptr_register(umouse_event, sc, 10); + + return (sc); +} + +#define UREQ(x,y) ((x) | ((y) << 8)) + +static int +umouse_request(void *scarg, struct usb_data_xfer *xfer) +{ + struct umouse_softc *sc; + struct usb_data_xfer_block *data; + const char *str; + uint16_t value; + uint16_t index; + uint16_t len; + uint16_t slen; + uint8_t *udata; + int err; + int i, idx; + int eshort; + + sc = scarg; + + data = NULL; + udata = NULL; + idx = xfer->head; + for (i = 0; i < xfer->ndata; i++) { + xfer->data[idx].bdone = 0; + if (data == NULL && USB_DATA_OK(xfer,i)) { + data = &xfer->data[idx]; + udata = data->buf; + } + + xfer->data[idx].processed = 1; + idx = (idx + 1) % USB_MAX_XFER_BLOCKS; + } + + err = USB_ERR_NORMAL_COMPLETION; + eshort = 0; + + if (!xfer->ureq) { + DPRINTF(("umouse_request: port %d\r\n", sc->hci->hci_port)); + goto done; + } + + value = UGETW(xfer->ureq->wValue); + index = UGETW(xfer->ureq->wIndex); + len = UGETW(xfer->ureq->wLength); + + DPRINTF(("umouse_request: port %d, type 0x%x, req 0x%x, val 0x%x, " + "idx 0x%x, len %u\r\n", + sc->hci->hci_port, xfer->ureq->bmRequestType, + xfer->ureq->bRequest, value, index, len)); + + switch (UREQ(xfer->ureq->bRequest, xfer->ureq->bmRequestType)) { + case UREQ(UR_GET_CONFIG, UT_READ_DEVICE): + DPRINTF(("umouse: (UR_GET_CONFIG, UT_READ_DEVICE)\r\n")); + if (!data) + break; + + *udata = umouse_confd.confd.bConfigurationValue; + data->blen = len > 0 ? len - 1 : 0; + eshort = data->blen > 0; + data->bdone += 1; + break; + + case UREQ(UR_GET_DESCRIPTOR, UT_READ_DEVICE): + DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_DEVICE) val %x\r\n", + value >> 8)); + if (!data) + break; + + switch (value >> 8) { + case UDESC_DEVICE: + DPRINTF(("umouse: (->UDESC_DEVICE) len %u ?= " + "sizeof(umouse_dev_desc) %lu\r\n", + len, sizeof(umouse_dev_desc))); + if ((value & 0xFF) != 0) { + err = USB_ERR_IOERROR; + goto done; + } + if (len > sizeof(umouse_dev_desc)) { + data->blen = len - sizeof(umouse_dev_desc); + len = sizeof(umouse_dev_desc); + } else + data->blen = 0; + memcpy(data->buf, &umouse_dev_desc, len); + data->bdone += len; + break; + + case UDESC_CONFIG: + DPRINTF(("umouse: (->UDESC_CONFIG)\r\n")); + if ((value & 0xFF) != 0) { + err = USB_ERR_IOERROR; + goto done; + } + if (len > sizeof(umouse_confd)) { + data->blen = len - sizeof(umouse_confd); + len = sizeof(umouse_confd); + } else + data->blen = 0; + + memcpy(data->buf, &umouse_confd, len); + data->bdone += len; + break; + + case UDESC_STRING: + DPRINTF(("umouse: (->UDESC_STRING)\r\n")); + str = NULL; + if ((value & 0xFF) < UMSTR_MAX) + str = umouse_desc_strings[value & 0xFF]; + else + goto done; + + if ((value & 0xFF) == UMSTR_LANG) { + udata[0] = 4; + udata[1] = UDESC_STRING; + data->blen = len - 2; + len -= 2; + data->bdone += 2; + + if (len >= 2) { + udata[2] = str[0]; + udata[3] = str[1]; + data->blen -= 2; + data->bdone += 2; + } else + data->blen = 0; + + goto done; + } + + slen = 2 + strlen(str) * 2; + udata[0] = slen; + udata[1] = UDESC_STRING; + + if (len > slen) { + data->blen = len - slen; + len = slen; + } else + data->blen = 0; + for (i = 2; i < len; i += 2) { + udata[i] = *str++; + udata[i+1] = '\0'; + } + data->bdone += slen; + + break; + + case UDESC_BOS: + DPRINTF(("umouse: USB3 BOS\r\n")); + if (len > sizeof(umouse_bosd)) { + data->blen = len - sizeof(umouse_bosd); + len = sizeof(umouse_bosd); + } else + data->blen = 0; + memcpy(udata, &umouse_bosd, len); + data->bdone += len; + break; + + 
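+		/*
+		 * Illustrative summary: each descriptor case above uses
+		 * the same clamp-and-record shape.  When the request
+		 * asks for more bytes than the descriptor holds, len is
+		 * clamped to the descriptor size and blen records the
+		 * excess, which the common exit path below turns into
+		 * USB_ERR_SHORT_XFER.
+		 */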
default: + DPRINTF(("umouse: unknown(%d)->ERROR\r\n", value >> 8)); + err = USB_ERR_IOERROR; + goto done; + } + eshort = data->blen > 0; + break; + + case UREQ(UR_GET_DESCRIPTOR, UT_READ_INTERFACE): + DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_INTERFACE) " + "0x%x\r\n", (value >> 8))); + if (!data) + break; + + switch (value >> 8) { + case UMOUSE_REPORT_DESC_TYPE: + if (len > sizeof(umouse_report_desc)) { + data->blen = len - sizeof(umouse_report_desc); + len = sizeof(umouse_report_desc); + } else + data->blen = 0; + memcpy(data->buf, umouse_report_desc, len); + data->bdone += len; + break; + default: + DPRINTF(("umouse: IO ERROR\r\n")); + err = USB_ERR_IOERROR; + goto done; + } + eshort = data->blen > 0; + break; + + case UREQ(UR_GET_INTERFACE, UT_READ_INTERFACE): + DPRINTF(("umouse: (UR_GET_INTERFACE, UT_READ_INTERFACE)\r\n")); + if (index != 0) { + DPRINTF(("umouse get_interface, invalid index %d\r\n", + index)); + err = USB_ERR_IOERROR; + goto done; + } + + if (!data) + break; + + if (len > 0) { + *udata = 0; + data->blen = len - 1; + } + eshort = data->blen > 0; + data->bdone += 1; + break; + + case UREQ(UR_GET_STATUS, UT_READ_DEVICE): + DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_DEVICE)\r\n")); + if (data != NULL && len > 1) { + if (sc->hid.feature == UF_DEVICE_REMOTE_WAKEUP) + USETW(udata, UDS_REMOTE_WAKEUP); + else + USETW(udata, 0); + data->blen = len - 2; + data->bdone += 2; + } + + eshort = data->blen > 0; + break; + + case UREQ(UR_GET_STATUS, UT_READ_INTERFACE): + case UREQ(UR_GET_STATUS, UT_READ_ENDPOINT): + DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_INTERFACE)\r\n")); + if (data != NULL && len > 1) { + USETW(udata, 0); + data->blen = len - 2; + data->bdone += 2; + } + eshort = data->blen > 0; + break; + + case UREQ(UR_SET_ADDRESS, UT_WRITE_DEVICE): + /* XXX Controller should've handled this */ + DPRINTF(("umouse set address %u\r\n", value)); + break; + + case UREQ(UR_SET_CONFIG, UT_WRITE_DEVICE): + DPRINTF(("umouse set config %u\r\n", value)); + break; + + case UREQ(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE): + DPRINTF(("umouse set descriptor %u\r\n", value)); + break; + + + case UREQ(UR_CLEAR_FEATURE, UT_WRITE_DEVICE): + DPRINTF(("umouse: (UR_SET_FEATURE, UT_WRITE_DEVICE) %x\r\n", value)); + if (value == UF_DEVICE_REMOTE_WAKEUP) + sc->hid.feature = 0; + break; + + case UREQ(UR_SET_FEATURE, UT_WRITE_DEVICE): + DPRINTF(("umouse: (UR_SET_FEATURE, UT_WRITE_DEVICE) %x\r\n", value)); + if (value == UF_DEVICE_REMOTE_WAKEUP) + sc->hid.feature = UF_DEVICE_REMOTE_WAKEUP; + break; + + case UREQ(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE): + case UREQ(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT): + case UREQ(UR_SET_FEATURE, UT_WRITE_INTERFACE): + case UREQ(UR_SET_FEATURE, UT_WRITE_ENDPOINT): + DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_INTERFACE)\r\n")); + err = USB_ERR_IOERROR; + goto done; + + case UREQ(UR_SET_INTERFACE, UT_WRITE_INTERFACE): + DPRINTF(("umouse set interface %u\r\n", value)); + break; + + case UREQ(UR_ISOCH_DELAY, UT_WRITE_DEVICE): + DPRINTF(("umouse set isoch delay %u\r\n", value)); + break; + + case UREQ(UR_SET_SEL, 0): + DPRINTF(("umouse set sel\r\n")); + break; + + case UREQ(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT): + DPRINTF(("umouse synch frame\r\n")); + break; + + /* HID device requests */ + + case UREQ(UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE): + DPRINTF(("umouse: (UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE) " + "0x%x\r\n", (value >> 8))); + if (!data) + break; + + if ((value >> 8) == 0x01 && len >= sizeof(sc->um_report)) { + /* TODO read from backend */ + + if (len > 
sizeof(sc->um_report)) { + data->blen = len - sizeof(sc->um_report); + len = sizeof(sc->um_report); + } else + data->blen = 0; + + memcpy(data->buf, &sc->um_report, len); + data->bdone += len; + } else { + err = USB_ERR_IOERROR; + goto done; + } + eshort = data->blen > 0; + break; + + case UREQ(UMOUSE_GET_IDLE, UT_READ_CLASS_INTERFACE): + if (data != NULL && len > 0) { + *udata = sc->hid.idle; + data->blen = len - 1; + data->bdone += 1; + } + eshort = data->blen > 0; + break; + + case UREQ(UMOUSE_GET_PROTOCOL, UT_READ_CLASS_INTERFACE): + if (data != NULL && len > 0) { + *udata = sc->hid.protocol; + data->blen = len - 1; + data->bdone += 1; + } + eshort = data->blen > 0; + break; + + case UREQ(UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE): + DPRINTF(("umouse: (UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE) ignored\r\n")); + break; + + case UREQ(UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE): + sc->hid.idle = UGETW(xfer->ureq->wValue) >> 8; + DPRINTF(("umouse: (UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE) %x\r\n", + sc->hid.idle)); + break; + + case UREQ(UMOUSE_SET_PROTOCOL, UT_WRITE_CLASS_INTERFACE): + sc->hid.protocol = UGETW(xfer->ureq->wValue) >> 8; + DPRINTF(("umouse: (UMOUSE_SET_PROTOCOL, UT_WRITE_CLASS_INTERFACE) %x\r\n", + sc->hid.protocol)); + break; + + default: + DPRINTF(("**** umouse request unhandled\r\n")); + err = USB_ERR_IOERROR; + break; + } + +done: +/* UT_WRITE is 0, so this condition is never true. */ +#ifdef __FreeBSD__ + if (xfer->ureq && (xfer->ureq->bmRequestType & UT_WRITE) && + (err == USB_ERR_NORMAL_COMPLETION) && (data != NULL)) + data->blen = 0; + else if (eshort) + err = USB_ERR_SHORT_XFER; +#else + if (eshort) + err = USB_ERR_SHORT_XFER; +#endif + + + DPRINTF(("umouse request error code %d (0=ok), blen %u txlen %u\r\n", + err, (data ? data->blen : 0), (data ? data->bdone : 0))); + + return (err); +} + +static int +umouse_data_handler(void *scarg, struct usb_data_xfer *xfer, int dir, + int epctx) +{ + struct umouse_softc *sc; + struct usb_data_xfer_block *data; + uint8_t *udata; + int len, i, idx; + int err; + + DPRINTF(("umouse handle data - DIR=%s|EP=%d, blen %d\r\n", + dir ? 
"IN" : "OUT", epctx, xfer->data[0].blen)); + + + /* find buffer to add data */ + udata = NULL; + err = USB_ERR_NORMAL_COMPLETION; + + /* handle xfer at first unprocessed item with buffer */ + data = NULL; + idx = xfer->head; + for (i = 0; i < xfer->ndata; i++) { + data = &xfer->data[idx]; + if (data->buf != NULL && data->blen != 0) { + break; + } else { + data->processed = 1; + data = NULL; + } + idx = (idx + 1) % USB_MAX_XFER_BLOCKS; + } + if (!data) + goto done; + + udata = data->buf; + len = data->blen; + + if (udata == NULL) { + DPRINTF(("umouse no buffer provided for input\r\n")); + err = USB_ERR_NOMEM; + goto done; + } + + sc = scarg; + + if (dir) { + + pthread_mutex_lock(&sc->mtx); + + if (!sc->newdata) { + err = USB_ERR_CANCELLED; + USB_DATA_SET_ERRCODE(&xfer->data[xfer->head], USB_NAK); + pthread_mutex_unlock(&sc->mtx); + goto done; + } + + if (sc->polling) { + err = USB_ERR_STALLED; + USB_DATA_SET_ERRCODE(data, USB_STALL); + pthread_mutex_unlock(&sc->mtx); + goto done; + } + sc->polling = 1; + + if (len > 0) { + sc->newdata = 0; + + data->processed = 1; + data->bdone += 6; + memcpy(udata, &sc->um_report, 6); + data->blen = len - 6; + if (data->blen > 0) + err = USB_ERR_SHORT_XFER; + } + + sc->polling = 0; + pthread_mutex_unlock(&sc->mtx); + } else { + USB_DATA_SET_ERRCODE(data, USB_STALL); + err = USB_ERR_STALLED; + } + +done: + return (err); +} + +static int +umouse_reset(void *scarg) +{ + struct umouse_softc *sc; + + sc = scarg; + + sc->newdata = 0; + + return (0); +} + +static int +umouse_remove(void *scarg) +{ + + return (0); +} + +static int +umouse_stop(void *scarg) +{ + + return (0); +} + + +struct usb_devemu ue_mouse = { + .ue_emu = "tablet", + .ue_usbver = 3, + .ue_usbspeed = USB_SPEED_HIGH, + .ue_init = umouse_init, + .ue_request = umouse_request, + .ue_data = umouse_data_handler, + .ue_reset = umouse_reset, + .ue_remove = umouse_remove, + .ue_stop = umouse_stop +}; +USB_EMUL_SET(ue_mouse); diff --git a/usr/src/cmd/bhyve/vga.c b/usr/src/cmd/bhyve/vga.c index 4330741042..314ddeb1e8 100644 --- a/usr/src/cmd/bhyve/vga.c +++ b/usr/src/cmd/bhyve/vga.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * @@ -24,6 +26,10 @@ * SUCH DAMAGE. */ +/* + * Copyright 2018 Joyent, Inc. + */ + #include __FBSDID("$FreeBSD$"); @@ -161,10 +167,10 @@ struct vga_softc { */ struct { uint8_t dac_state; - int dac_rd_index; - int dac_rd_subindex; - int dac_wr_index; - int dac_wr_subindex; + uint8_t dac_rd_index; + uint8_t dac_rd_subindex; + uint8_t dac_wr_index; + uint8_t dac_wr_subindex; uint8_t dac_palette[3 * 256]; uint32_t dac_palette_rgb[256]; } vga_dac; @@ -187,8 +193,10 @@ vga_check_size(struct bhyvegc *gc, struct vga_softc *sc) if (vga_in_reset(sc)) return; - old_width = sc->gc_width; - old_height = sc->gc_height; + //old_width = sc->gc_width; + //old_height = sc->gc_height; + old_width = sc->gc_image->width; + old_height = sc->gc_image->height; /* * Horizontal Display End: For text modes this is the number @@ -263,7 +271,7 @@ vga_get_text_pixel(struct vga_softc *sc, int x, int y) offset = 2 * sc->vga_crtc.crtc_start_addr; offset += (y / 16 * sc->gc_width / dots) * 2 + (x / dots) * 2; - bit = 7 - (x % dots); + bit = 7 - (x % dots > 7 ? 
7 : x % dots); ch = sc->vga_ram[offset + 0 * 64*KB]; attr = sc->vga_ram[offset + 1 * 64*KB]; @@ -291,7 +299,7 @@ vga_get_text_pixel(struct vga_softc *sc, int x, int y) font = sc->vga_ram[font_offset + 2 * 64*KB]; - if ((bit > 0) && (font & (1 << bit))) + if (font & (1 << bit)) idx = sc->vga_atc.atc_palette[attr & 0xf]; else idx = sc->vga_atc.atc_palette[attr >> 4]; @@ -314,7 +322,7 @@ vga_render_text(struct vga_softc *sc) } } -static void +void vga_render(struct bhyvegc *gc, void *arg) { struct vga_softc *sc = arg; @@ -361,7 +369,11 @@ vga_mem_rd_handler(struct vmctx *ctx, uint64_t addr, void *arg1) /* * monochrome text mode: base 0xb0000 size 32kb */ +#ifdef __FreeBSD__ assert(0); +#else + abort(); +#endif case 0x3: /* * color text mode and CGA: base 0xb8000 size 32kb @@ -425,7 +437,11 @@ vga_mem_wr_handler(struct vmctx *ctx, uint64_t addr, uint8_t val, void *arg1) /* * monochrome text mode: base 0xb0000 size 32kb */ +#ifdef __FreeBSD__ assert(0); +#else + abort(); +#endif case 0x3: /* * color text mode and CGA: base 0xb8000 size 32kb @@ -858,6 +874,7 @@ vga_port_in_handler(struct vmctx *ctx, int in, int port, int bytes, assert(0); break; } + break; case DAC_DATA_PORT: *val = sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_rd_index + sc->vga_dac.dac_rd_subindex]; @@ -914,15 +931,33 @@ vga_port_in_handler(struct vmctx *ctx, int in, int port, int bytes, case GEN_INPUT_STS1_MONO_PORT: case GEN_INPUT_STS1_COLOR_PORT: sc->vga_atc.atc_flipflop = 0; +#ifdef __FreeBSD__ + sc->vga_sts1 = GEN_IS1_VR | GEN_IS1_DE; + //sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE); +#else + /* + * During the bhyve bring-up process, a guest image was failing + * to successfully boot. It appeared to be spinning, waiting + * for this value to be toggled. Until it can be ruled out + * that this is unnecessary (and documentation seems to + * indicate that it should be present), the toggle should + * remain. 
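+	 * A typical guest wait loop is, in outline,
+	 *	do { } while ((inb(GEN_INPUT_STS1_COLOR_PORT) &
+	 *	    GEN_IS1_VR) == 0);
+	 * which never terminates if the bit reads back as a constant.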
+ */ sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE); +#endif *val = sc->vga_sts1; break; case GEN_FEATURE_CTRL_PORT: - assert(0); + // OpenBSD calls this with bytes = 1 + //assert(0); + *val = 0; + break; + case 0x3c3: + *val = 0; break; default: printf("XXX vga_port_in_handler() unhandled port 0x%x\n", port); - assert(0); + //assert(0); return (-1); } @@ -1060,7 +1095,7 @@ vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, sc->vga_atc.atc_color_select_45 = (val & ATC_CS_C45) << 4; sc->vga_atc.atc_color_select_67 = - (val & ATC_CS_C67) << 6; + ((val & ATC_CS_C67) >> 2) << 6; break; default: //printf("XXX VGA ATC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_atc.atc_index); @@ -1095,7 +1130,8 @@ vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, break; case SEQ_MEMORY_MODE: sc->vga_seq.seq_mm = val; - assert((sc->vga_seq.seq_mm & SEQ_MM_C4) == 0); + /* Windows queries Chain4 */ + //assert((sc->vga_seq.seq_mm & SEQ_MM_C4) == 0); break; default: //printf("XXX VGA SEQ: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_seq.seq_index); @@ -1161,6 +1197,9 @@ vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, sc->vga_gc.gc_mode_oe = (val & GC_MODE_OE) != 0; sc->vga_gc.gc_mode_rm = (val >> 3) & 0x1; sc->vga_gc.gc_mode_wm = val & 0x3; + + if (sc->gc_image) + sc->gc_image->vgamode = 1; break; case GC_MISCELLANEOUS: sc->vga_gc.gc_misc = val; @@ -1188,8 +1227,10 @@ vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, case GEN_INPUT_STS1_COLOR_PORT: /* write to Feature Control Register */ break; +// case 0x3c3: +// break; default: - printf("XXX vga_port_out_handler() unhandled port 0x%x\n", port); + printf("XXX vga_port_out_handler() unhandled port 0x%x, val 0x%x\n", port, val); //assert(0); return (-1); } @@ -1248,8 +1289,8 @@ vga_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, return (error); } -int -vga_init(void) +void * +vga_init(int io_only) { struct inout_port iop; struct vga_softc *sc; @@ -1270,6 +1311,12 @@ vga_init(void) assert(error == 0); } + sc->gc_image = console_get_image(); + + /* only handle io ports; vga graphics is disabled */ + if (io_only) + return(sc); + sc->mr.name = "VGA memory"; sc->mr.flags = MEM_F_RW; sc->mr.base = 640 * KB; @@ -1282,8 +1329,29 @@ vga_init(void) sc->vga_ram = malloc(256 * KB); memset(sc->vga_ram, 0, 256 * KB); - sc->gc_image = console_get_image(); - console_fb_register(vga_render, sc); + { + static uint8_t palette[] = { + 0x00,0x00,0x00, 0x00,0x00,0x2a, 0x00,0x2a,0x00, 0x00,0x2a,0x2a, + 0x2a,0x00,0x00, 0x2a,0x00,0x2a, 0x2a,0x2a,0x00, 0x2a,0x2a,0x2a, + 0x00,0x00,0x15, 0x00,0x00,0x3f, 0x00,0x2a,0x15, 0x00,0x2a,0x3f, + 0x2a,0x00,0x15, 0x2a,0x00,0x3f, 0x2a,0x2a,0x15, 0x2a,0x2a,0x3f, + }; + int i; + + memcpy(sc->vga_dac.dac_palette, palette, 16 * 3 * sizeof (uint8_t)); + for (i = 0; i < 16; i++) { + sc->vga_dac.dac_palette_rgb[i] = + ((((sc->vga_dac.dac_palette[3*i + 0] << 2) | + ((sc->vga_dac.dac_palette[3*i + 0] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*i + 0] & 0x1)) << 16) | + (((sc->vga_dac.dac_palette[3*i + 1] << 2) | + ((sc->vga_dac.dac_palette[3*i + 1] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*i + 1] & 0x1)) << 8) | + (((sc->vga_dac.dac_palette[3*i + 2] << 2) | + ((sc->vga_dac.dac_palette[3*i + 2] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*i + 2] & 0x1)) << 0)); + } + } - return (0); + return (sc); } diff --git a/usr/src/cmd/bhyve/vga.h b/usr/src/cmd/bhyve/vga.h index 14637b12b3..36c6dc15fa 100644 --- a/usr/src/cmd/bhyve/vga.h +++ 
b/usr/src/cmd/bhyve/vga.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale * All rights reserved. * @@ -38,8 +40,8 @@ #define GEN_MISC_OUTPUT_PORT 0x3cc #define GEN_INPUT_STS1_MONO_PORT 0x3ba #define GEN_INPUT_STS1_COLOR_PORT 0x3da -#define GEN_IS1_VR 0x08 /* Vertical retrace */ -#define GEN_IS1_DE 0x01 /* Display enable not */ +#define GEN_IS1_VR 0x08 /* Vertical retrace */ +#define GEN_IS1_DE 0x01 /* Display enable not */ /* Attribute controller registers. */ #define ATC_IDX_PORT 0x3c0 @@ -49,14 +51,14 @@ #define ATC_PALETTE0 0 #define ATC_PALETTE15 15 #define ATC_MODE_CONTROL 16 -#define ATC_MC_IPS 0x80 /* Internal palette size */ -#define ATC_MC_GA 0x01 /* Graphics/alphanumeric */ +#define ATC_MC_IPS 0x80 /* Internal palette size */ +#define ATC_MC_GA 0x01 /* Graphics/alphanumeric */ #define ATC_OVERSCAN_COLOR 17 #define ATC_COLOR_PLANE_ENABLE 18 #define ATC_HORIZ_PIXEL_PANNING 19 #define ATC_COLOR_SELECT 20 -#define ATC_CS_C67 0x0c /* Color select bits 6+7 */ -#define ATC_CS_C45 0x03 /* Color select bits 4+5 */ +#define ATC_CS_C67 0x0c /* Color select bits 6+7 */ +#define ATC_CS_C45 0x03 /* Color select bits 4+5 */ /* Sequencer registers. */ #define SEQ_IDX_PORT 0x3c4 @@ -66,22 +68,22 @@ #define SEQ_RESET_ASYNC 0x1 #define SEQ_RESET_SYNC 0x2 #define SEQ_CLOCKING_MODE 1 -#define SEQ_CM_SO 0x20 /* Screen off */ -#define SEQ_CM_89 0x01 /* 8/9 dot clock */ +#define SEQ_CM_SO 0x20 /* Screen off */ +#define SEQ_CM_89 0x01 /* 8/9 dot clock */ #define SEQ_MAP_MASK 2 #define SEQ_CHAR_MAP_SELECT 3 -#define SEQ_CMS_SAH 0x20 /* Char map A bit 2 */ -#define SEQ_CMS_SAH_SHIFT 5 -#define SEQ_CMS_SA 0x0c /* Char map A bits 0+1 */ -#define SEQ_CMS_SA_SHIFT 2 -#define SEQ_CMS_SBH 0x10 /* Char map B bit 2 */ -#define SEQ_CMS_SBH_SHIFT 4 -#define SEQ_CMS_SB 0x03 /* Char map B bits 0+1 */ -#define SEQ_CMS_SB_SHIFT 0 +#define SEQ_CMS_SAH 0x20 /* Char map A bit 2 */ +#define SEQ_CMS_SAH_SHIFT 5 +#define SEQ_CMS_SA 0x0c /* Char map A bits 0+1 */ +#define SEQ_CMS_SA_SHIFT 2 +#define SEQ_CMS_SBH 0x10 /* Char map B bit 2 */ +#define SEQ_CMS_SBH_SHIFT 4 +#define SEQ_CMS_SB 0x03 /* Char map B bits 0+1 */ +#define SEQ_CMS_SB_SHIFT 0 #define SEQ_MEMORY_MODE 4 -#define SEQ_MM_C4 0x08 /* Chain 4 */ -#define SEQ_MM_OE 0x04 /* Odd/even */ -#define SEQ_MM_EM 0x02 /* Extended memory */ +#define SEQ_MM_C4 0x08 /* Chain 4 */ +#define SEQ_MM_OE 0x04 /* Odd/even */ +#define SEQ_MM_EM 0x02 /* Extended memory */ /* Graphics controller registers. 
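Related background for the vga_init hunk above: VGA DAC palette entries are six bits per component, and the new seeding code widens each to eight bits by shifting left two and replicating bit 0 into the vacated low bits, so 0x3f becomes 0xff and 0x00 stays 0x00. A standalone sketch of that widening (dac6_to_8 is an invented name, not from the source):

#include <stdio.h>
#include <stdint.h>

/* Widen a 6-bit DAC component to 8 bits, replicating bit 0. */
static uint8_t
dac6_to_8(uint8_t v)
{
	return ((v << 2) | ((v & 0x1) << 1) | (v & 0x1));
}

int
main(void)
{
	(void) printf("0x00 -> 0x%02x\n", dac6_to_8(0x00));	/* 0x00 */
	(void) printf("0x2a -> 0x%02x\n", dac6_to_8(0x2a));	/* 0xa8 */
	(void) printf("0x3f -> 0x%02x\n", dac6_to_8(0x3f));	/* 0xff */
	return (0);
}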
*/ #define GC_IDX_PORT 0x3ce @@ -93,13 +95,13 @@ #define GC_DATA_ROTATE 3 #define GC_READ_MAP_SELECT 4 #define GC_MODE 5 -#define GC_MODE_OE 0x10 /* Odd/even */ -#define GC_MODE_C4 0x04 /* Chain 4 */ +#define GC_MODE_OE 0x10 /* Odd/even */ +#define GC_MODE_C4 0x04 /* Chain 4 */ #define GC_MISCELLANEOUS 6 -#define GC_MISC_GM 0x01 /* Graphics/alphanumeric */ -#define GC_MISC_MM 0x0c /* memory map */ -#define GC_MISC_MM_SHIFT 2 +#define GC_MISC_GM 0x01 /* Graphics/alphanumeric */ +#define GC_MISC_MM 0x0c /* memory map */ +#define GC_MISC_MM_SHIFT 2 #define GC_COLOR_DONT_CARE 7 #define GC_BIT_MASK 8 @@ -117,36 +119,36 @@ #define CRTC_END_HORIZ_RETRACE 5 #define CRTC_VERT_TOTAL 6 #define CRTC_OVERFLOW 7 -#define CRTC_OF_VRS9 0x80 /* VRS bit 9 */ -#define CRTC_OF_VRS9_SHIFT 7 -#define CRTC_OF_VDE9 0x40 /* VDE bit 9 */ -#define CRTC_OF_VDE9_SHIFT 6 -#define CRTC_OF_VRS8 0x04 /* VRS bit 8 */ -#define CRTC_OF_VRS8_SHIFT 2 -#define CRTC_OF_VDE8 0x02 /* VDE bit 8 */ -#define CRTC_OF_VDE8_SHIFT 1 +#define CRTC_OF_VRS9 0x80 /* VRS bit 9 */ +#define CRTC_OF_VRS9_SHIFT 7 +#define CRTC_OF_VDE9 0x40 /* VDE bit 9 */ +#define CRTC_OF_VDE9_SHIFT 6 +#define CRTC_OF_VRS8 0x04 /* VRS bit 8 */ +#define CRTC_OF_VRS8_SHIFT 2 +#define CRTC_OF_VDE8 0x02 /* VDE bit 8 */ +#define CRTC_OF_VDE8_SHIFT 1 #define CRTC_PRESET_ROW_SCAN 8 #define CRTC_MAX_SCAN_LINE 9 -#define CRTC_MSL_MSL 0x1f +#define CRTC_MSL_MSL 0x1f #define CRTC_CURSOR_START 10 -#define CRTC_CS_CO 0x20 /* Cursor off */ -#define CRTC_CS_CS 0x1f /* Cursor start */ +#define CRTC_CS_CO 0x20 /* Cursor off */ +#define CRTC_CS_CS 0x1f /* Cursor start */ #define CRTC_CURSOR_END 11 -#define CRTC_CE_CE 0x1f /* Cursor end */ +#define CRTC_CE_CE 0x1f /* Cursor end */ #define CRTC_START_ADDR_HIGH 12 #define CRTC_START_ADDR_LOW 13 #define CRTC_CURSOR_LOC_HIGH 14 #define CRTC_CURSOR_LOC_LOW 15 #define CRTC_VERT_RETRACE_START 16 #define CRTC_VERT_RETRACE_END 17 -#define CRTC_VRE_MASK 0xf +#define CRTC_VRE_MASK 0xf #define CRTC_VERT_DISP_END 18 #define CRTC_OFFSET 19 #define CRTC_UNDERLINE_LOC 20 #define CRTC_START_VERT_BLANK 21 #define CRTC_END_VERT_BLANK 22 #define CRTC_MODE_CONTROL 23 -#define CRTC_MC_TE 0x80 /* Timing enable */ +#define CRTC_MC_TE 0x80 /* Timing enable */ #define CRTC_LINE_COMPARE 24 /* DAC registers */ @@ -155,6 +157,6 @@ #define DAC_IDX_WR_PORT 0x3c8 #define DAC_DATA_PORT 0x3c9 -int vga_init(void); +void *vga_init(int io_only); #endif /* _VGA_H_ */ diff --git a/usr/src/cmd/bhyve/virtio.c b/usr/src/cmd/bhyve/virtio.c index c3b11dc439..47a3ed29ba 100644 --- a/usr/src/cmd/bhyve/virtio.c +++ b/usr/src/cmd/bhyve/virtio.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Chris Torek * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -25,11 +28,13 @@ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/virtio.c 270326 2014-08-22 13:01:22Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include +#include + #include #include #include @@ -49,7 +54,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/virtio.c 270326 2014-08-22 13:01:22Z tyc * front of virtio-based device softc" constraint, let's use * this to convert. 
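The comment just above relies on a C layout rule: a pointer to a structure may be converted to a pointer to its first member and back. The "virtio_softc at the front of the device softc" contract can be shown in a few lines (mydev_softc and its fields are stand-ins invented here, not the real types):

#include <stdio.h>

struct virtio_softc {
	int vs_flags;
};

struct mydev_softc {
	struct virtio_softc ms_vs;	/* must be the first member */
	int ms_private;
};

/* DEV_SOFTC-style conversion: valid only because ms_vs is first. */
static struct mydev_softc *
dev_softc(struct virtio_softc *vs)
{
	return ((struct mydev_softc *)vs);
}

int
main(void)
{
	struct mydev_softc md = { { 0 }, 42 };

	(void) printf("%d\n", dev_softc(&md.ms_vs)->ms_private);
	return (0);
}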
*/ -#define DEV_SOFTC(vs) ((void *)(vs)) +#define DEV_SOFTC(vs) ((void *)(vs)) /* * Link a virtio_softc to its constants, the device softc, and @@ -97,6 +102,7 @@ vi_reset_dev(struct virtio_softc *vs) for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { vq->vq_flags = 0; vq->vq_last_avail = 0; + vq->vq_save_used = 0; vq->vq_pfn = 0; vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; } @@ -147,8 +153,13 @@ vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) return (1); } else vs->vs_flags &= ~VIRTIO_USE_MSIX; + /* Only 1 MSI vector for bhyve */ pci_emul_add_msicap(vs->vs_pi, 1); + + /* Legacy interrupts are mandatory for virtio devices */ + pci_lintr_request(vs->vs_pi); + return (0); } @@ -188,6 +199,7 @@ vi_vq_init(struct virtio_softc *vs, uint32_t pfn) /* Mark queue as allocated, and start at 0 when we use it. */ vq->vq_flags = VQ_ALLOC; vq->vq_last_avail = 0; + vq->vq_save_used = 0; } /* @@ -247,12 +259,12 @@ _vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx, * that vq_has_descs() does one). */ int -vq_getchain(struct vqueue_info *vq, +vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, int n_iov, uint16_t *flags) { int i; u_int ndesc, n_indir; - u_int idx, head, next; + u_int idx, next; volatile struct virtio_desc *vdir, *vindir, *vp; struct vmctx *ctx; struct virtio_softc *vs; @@ -295,8 +307,8 @@ vq_getchain(struct vqueue_info *vq, * index, but we just abort if the count gets excessive. */ ctx = vs->vs_pi->pi_vmctx; - head = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; - next = head; + *pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; + vq->vq_last_avail++; for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { if (next >= vq->vq_qsize) { fprintf(stderr, @@ -309,7 +321,7 @@ vq_getchain(struct vqueue_info *vq, if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { _vq_record(i, vdir, ctx, iov, n_iov, flags); i++; - } else if ((vs->vs_negotiated_caps & + } else if ((vs->vs_vc->vc_hv_caps & VIRTIO_RING_F_INDIRECT_DESC) == 0) { fprintf(stderr, "%s: descriptor has forbidden INDIRECT flag, " @@ -370,16 +382,29 @@ loopy: } /* - * Return the currently-first request chain to the guest, setting - * its I/O length to the provided value. + * Return the currently-first request chain back to the available queue. * * (This chain is the one you handled when you called vq_getchain() * and used its positive return value.) */ void -vq_relchain(struct vqueue_info *vq, uint32_t iolen) +vq_retchain(struct vqueue_info *vq) { - uint16_t head, uidx, mask; + + vq->vq_last_avail--; +} + +/* + * Return specified request chain to the guest, setting its I/O length + * to the provided value. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) +{ + uint16_t uidx, mask; volatile struct vring_used *vuh; volatile struct virtio_used *vue; @@ -395,12 +420,17 @@ vq_relchain(struct vqueue_info *vq, uint32_t iolen) */ mask = vq->vq_qsize - 1; vuh = vq->vq_used; - head = vq->vq_avail->va_ring[vq->vq_last_avail++ & mask]; uidx = vuh->vu_idx; vue = &vuh->vu_ring[uidx++ & mask]; - vue->vu_idx = head; /* ie, vue->id = head */ + vue->vu_idx = idx; vue->vu_tlen = iolen; + + /* + * Ensure the used descriptor is visible before updating the index. + * This is necessary on ISAs with memory ordering less strict than x86. 
+ */ + atomic_thread_fence_rel(); vuh->vu_idx = uidx; } @@ -436,8 +466,15 @@ vq_endchains(struct vqueue_info *vq, int used_all_avail) * entire avail was processed, we need to interrupt always. */ vs = vq->vq_vs; - new_idx = vq->vq_used->vu_idx; old_idx = vq->vq_save_used; + vq->vq_save_used = new_idx = vq->vq_used->vu_idx; + + /* + * Use full memory barrier between vu_idx store from preceding + * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or + * va_flags below. + */ + atomic_thread_fence_seq_cst(); if (used_all_avail && (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) intr = 1; @@ -698,6 +735,9 @@ bad: switch (offset) { case VTCFG_R_GUESTCAP: vs->vs_negotiated_caps = value & vc->vc_hv_caps; + if (vc->vc_apply_features) + (*vc->vc_apply_features)(DEV_SOFTC(vs), + vs->vs_negotiated_caps); break; case VTCFG_R_PFN: if (vs->vs_curq >= vc->vc_nvq) diff --git a/usr/src/cmd/bhyve/virtio.h b/usr/src/cmd/bhyve/virtio.h index 1a2ebe8118..a2c3362ec2 100644 --- a/usr/src/cmd/bhyve/virtio.h +++ b/usr/src/cmd/bhyve/virtio.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Chris Torek * All rights reserved. * @@ -23,12 +25,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/virtio.h 268276 2014-07-05 02:38:53Z grehan $ + * $FreeBSD$ */ #ifndef _VIRTIO_H_ #define _VIRTIO_H_ +#include + /* * These are derived from several virtio specifications. * @@ -184,7 +188,7 @@ struct vring_used { /* * PFN register shift amount */ -#define VRING_PFN 12 +#define VRING_PFN 12 /* * Virtio device types @@ -209,7 +213,9 @@ struct vring_used { #define VIRTIO_VENDOR 0x1AF4 #define VIRTIO_DEV_NET 0x1000 #define VIRTIO_DEV_BLOCK 0x1001 -#define VIRTIO_DEV_RANDOM 0x1002 +#define VIRTIO_DEV_CONSOLE 0x1003 +#define VIRTIO_DEV_RANDOM 0x1005 +#define VIRTIO_DEV_SCSI 0x1008 /* * PCI config space constants. @@ -220,19 +226,19 @@ struct vring_used { * If MSI-X is not enabled, those two registers disappear and * the remaining configuration registers start at offset 20. */ -#define VTCFG_R_HOSTCAP 0 -#define VTCFG_R_GUESTCAP 4 -#define VTCFG_R_PFN 8 -#define VTCFG_R_QNUM 12 -#define VTCFG_R_QSEL 14 -#define VTCFG_R_QNOTIFY 16 -#define VTCFG_R_STATUS 18 -#define VTCFG_R_ISR 19 -#define VTCFG_R_CFGVEC 20 -#define VTCFG_R_QVEC 22 -#define VTCFG_R_CFG0 20 /* No MSI-X */ -#define VTCFG_R_CFG1 24 /* With MSI-X */ -#define VTCFG_R_MSIX 20 +#define VTCFG_R_HOSTCAP 0 +#define VTCFG_R_GUESTCAP 4 +#define VTCFG_R_PFN 8 +#define VTCFG_R_QNUM 12 +#define VTCFG_R_QSEL 14 +#define VTCFG_R_QNOTIFY 16 +#define VTCFG_R_STATUS 18 +#define VTCFG_R_ISR 19 +#define VTCFG_R_CFGVEC 20 +#define VTCFG_R_QVEC 22 +#define VTCFG_R_CFG0 20 /* No MSI-X */ +#define VTCFG_R_CFG1 24 /* With MSI-X */ +#define VTCFG_R_MSIX 20 /* * Bits in VTCFG_R_STATUS. Guests need not actually set any of these, @@ -251,7 +257,7 @@ struct vring_used { #define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */ #define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */ -#define VIRTIO_MSI_NO_VECTOR 0xFFFF +#define VIRTIO_MSI_NO_VECTOR 0xFFFF /* * Feature flags. 
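The ordering comments added to vq_relchain and vq_endchains above are the classic publish pattern: the used-ring element must become visible before the index that advertises it, and reading the guest's flags afterwards requires a full barrier. A portable C11 sketch of the release half (layout and names simplified; the real code uses the machine atomic fence helpers rather than stdatomic.h):

#include <stdatomic.h>
#include <stdint.h>

#define	QSZ	8	/* queue size; a power of two, as virtio requires */

struct used_elem {
	uint16_t id;
	uint32_t len;
};

struct used_ring {
	struct used_elem ring[QSZ];
	_Atomic uint16_t idx;
};

/* Publish one completed chain, mirroring vq_relchain's store order. */
static void
used_publish(struct used_ring *u, uint16_t id, uint32_t len)
{
	uint16_t i = atomic_load_explicit(&u->idx, memory_order_relaxed);

	u->ring[i & (QSZ - 1)].id = id;
	u->ring[i & (QSZ - 1)].len = len;
	/* Release: element contents become visible before the index. */
	atomic_store_explicit(&u->idx, (uint16_t)(i + 1),
	    memory_order_release);
}

int
main(void)
{
	static struct used_ring u;

	used_publish(&u, 3, 512);
	return (0);
}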
@@ -352,6 +358,8 @@ struct virtio_consts { /* called to read config regs */ int (*vc_cfgwrite)(void *, int, int, uint32_t); /* called to write config regs */ + void (*vc_apply_features)(void *, uint64_t); + /* called to apply negotiated features */ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ }; @@ -422,20 +430,6 @@ vq_has_descs(struct vqueue_info *vq) vq->vq_avail->va_idx); } -/* - * Called by virtio driver as it starts processing chains. Each - * completed chain (obtained from vq_getchain()) is released by - * calling vq_relchain(), then when all are done, vq_endchains() - * can tell if / how-many chains were processed and know whether - * and how to generate an interrupt. - */ -static inline void -vq_startchains(struct vqueue_info *vq) -{ - - vq->vq_save_used = vq->vq_used->vu_idx; -} - /* * Deliver an interrupt to guest on the given virtual queue * (if possible, or a generic MSI interrupt if not using MSI-X). @@ -447,11 +441,25 @@ vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) if (pci_msix_enabled(vs->vs_pi)) pci_generate_msix(vs->vs_pi, vq->vq_msix_idx); else { +#ifndef __FreeBSD__ + boolean_t unlock = B_FALSE; + + if (vs->vs_mtx && !pthread_mutex_isowned_np(vs->vs_mtx)) { + unlock = B_TRUE; + pthread_mutex_lock(vs->vs_mtx); + } +#else VS_LOCK(vs); +#endif vs->vs_isr |= VTCFG_ISR_QUEUES; pci_generate_msi(vs->vs_pi, 0); pci_lintr_assert(vs->vs_pi); +#ifndef __FreeBSD__ + if (unlock) + pthread_mutex_unlock(vs->vs_mtx); +#else VS_UNLOCK(vs); +#endif } } @@ -463,9 +471,10 @@ int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); void vi_reset_dev(struct virtio_softc *); void vi_set_io_bar(struct virtio_softc *, int); -int vq_getchain(struct vqueue_info *vq, +int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, int n_iov, uint16_t *flags); -void vq_relchain(struct vqueue_info *vq, uint32_t iolen); +void vq_retchain(struct vqueue_info *vq); +void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_endchains(struct vqueue_info *vq, int used_all_avail); uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, diff --git a/usr/src/cmd/bhyve/xmsr.c b/usr/src/cmd/bhyve/xmsr.c index 0c097251e0..994445b3e3 100644 --- a/usr/src/cmd/bhyve/xmsr.c +++ b/usr/src/cmd/bhyve/xmsr.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/xmsr.c 279227 2015-02-24 05:15:40Z neel $ + * $FreeBSD$ */ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/xmsr.c 279227 2015-02-24 05:15:40Z neel $"); +__FBSDID("$FreeBSD$"); #include @@ -77,6 +79,7 @@ emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t val) return (0); case MSR_NB_CFG1: + case MSR_LS_CFG: case MSR_IC_CFG: return (0); /* Ignore writes */ @@ -146,6 +149,7 @@ emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val) break; case MSR_NB_CFG1: + case MSR_LS_CFG: case MSR_IC_CFG: /* * The reset value is processor family dependent so @@ -195,12 +199,23 @@ emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val) /* * OpenBSD guests test bit 0 of this MSR to detect if the * workaround for erratum 721 is already applied. 
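The xmsr changes here follow a small taxonomy that the erratum-721 comment hints at: benign configuration MSRs read back as zero and ignore writes, quirk MSRs read back a value telling the guest a fix is already in place, and everything else returns -1 so the caller can decide how to fault the access. A toy dispatcher in that style (the MSR numbers below are placeholders, not real registers):

#include <stdint.h>
#include <stdio.h>

#define	MSR_IGNORED_CFG		0x10	/* illustrative number only */
#define	MSR_QUIRK_APPLIED	0x20	/* illustrative number only */

/* Returns 0 when the MSR is emulated, -1 when unimplemented. */
static int
rdmsr_emul(uint32_t num, uint64_t *val)
{
	switch (num) {
	case MSR_IGNORED_CFG:
		*val = 0;	/* benign: reads as zero */
		return (0);
	case MSR_QUIRK_APPLIED:
		*val = 1;	/* report the workaround as applied */
		return (0);
	default:
		return (-1);	/* unhandled; caller decides the fault */
	}
}

int
main(void)
{
	uint64_t v = 0;

	(void) printf("0x10 -> %d, v=%llu\n", rdmsr_emul(0x10, &v),
	    (unsigned long long)v);
	(void) printf("0x99 -> %d\n", rdmsr_emul(0x99, &v));
	return (0);
}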
- * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf + * https://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf */ case 0xC0011029: *val = 1; break; +#ifndef __FreeBSD__ + case MSR_VM_CR: + /* + * We currently don't support nested virt. + * Windows seems to ignore the cpuid bits and reads this + * MSR anyways. + */ + *val = VM_CR_SVMDIS; + break; +#endif + default: error = -1; break; diff --git a/usr/src/cmd/bhyve/xmsr.h b/usr/src/cmd/bhyve/xmsr.h index ac3c147442..1fb47c3ae2 100644 --- a/usr/src/cmd/bhyve/xmsr.h +++ b/usr/src/cmd/bhyve/xmsr.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/xmsr.h 271888 2014-09-20 02:35:21Z neel $ + * $FreeBSD$ */ #ifndef _XMSR_H_ diff --git a/usr/src/cmd/bhyveconsole/Makefile b/usr/src/cmd/bhyveconsole/Makefile deleted file mode 100644 index 11d34e6599..0000000000 --- a/usr/src/cmd/bhyveconsole/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -include ../Makefile.cmd - -SUBDIRS= $(MACH) - -all := TARGET = all -install := TARGET = install -clean := TARGET = clean -clobber := TARGET = clobber -lint := TARGET = lint - -.KEEP_STATE: - -all: $(SUBDIRS) - -clean clobber lint: $(SUBDIRS) - -install: $(SUBDIRS) - -$(RM) $(ROOTUSRSBINPROG) - -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) - -$(SUBDIRS): FRC - @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) - -FRC: - -include ../Makefile.targ diff --git a/usr/src/cmd/bhyveconsole/bhyveconsole.c b/usr/src/cmd/bhyveconsole/bhyveconsole.c deleted file mode 100644 index 7f237a72f6..0000000000 --- a/usr/src/cmd/bhyveconsole/bhyveconsole.c +++ /dev/null @@ -1,360 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2013 Pluribus Networks Inc. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -static int masterfd; -static struct termios save_termios; -static int save_fd; - -static int nocmdchar = 0; -static char cmdchar = '~'; - -static const char *pname; - -#define BCONS_BUFSIZ 8192 - -static void -usage(void) -{ - (void) fprintf(stderr, "usage: %s vmname\n", pname); - exit(2); -} - -static void -bcons_error(const char *fmt, ...) 
-{ - va_list alist; - - (void) fprintf(stderr, "%s: ", pname); - va_start(alist, fmt); - (void) vfprintf(stderr, fmt, alist); - va_end(alist); - (void) fprintf(stderr, "\n"); -} - -static void -bcons_perror(const char *str) -{ - const char *estr; - - if ((estr = strerror(errno)) != NULL) - (void) fprintf(stderr, "%s: %s: %s\n", pname, str, estr); - else - (void) fprintf(stderr, "%s: %s: errno %d\n", pname, str, errno); -} - -/* - * Create the unix domain socket and call bhyve; handshake - * with it to determine whether it will allow us to connect. - */ -static int -get_console(const char *vmname) -{ - int sockfd = -1; - struct sockaddr_un servaddr; - char clientid[MAXPATHLEN]; - char handshake[MAXPATHLEN], c; - int msglen; - int i = 0, err = 0; - - if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { - bcons_perror("could not create socket"); - return (-1); - } - - bzero(&servaddr, sizeof (servaddr)); - servaddr.sun_family = AF_UNIX; - (void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path), - BHYVE_CONS_SOCKPATH, vmname); - - if (connect(sockfd, (struct sockaddr *)&servaddr, - sizeof (servaddr)) == -1) { - bcons_perror("Could not connect to console server"); - goto bad; - } - masterfd = sockfd; - - msglen = snprintf(clientid, sizeof (clientid), "IDENT %lu\n", - getpid()); - assert(msglen > 0 && msglen < sizeof (clientid)); - - if (write(masterfd, clientid, msglen) != msglen) { - bcons_error("protocol error"); - goto bad; - } - - /* - * Take care not to accumulate more than our fill, and leave room for - * the NUL at the end. - */ - while ((err = read(masterfd, &c, 1)) == 1) { - if (i >= (sizeof (handshake) - 1)) - break; - if (c == '\n') - break; - handshake[i] = c; - i++; - } - handshake[i] = '\0'; - - /* - * If something went wrong during the handshake we bail; perhaps - * the server died off. - */ - if (err == -1) { - bcons_perror("Could not connect to console server"); - goto bad; - } - - if (strncmp(handshake, "OK", sizeof (handshake)) == 0) - return (0); - - bcons_error("Console is already in use by process ID %s.", - handshake); -bad: - (void) close(sockfd); - masterfd = -1; - return (-1); -} - -/* - * Place terminal into raw mode. - */ -static int -set_tty_rawmode(int fd) -{ - struct termios term; - if (tcgetattr(fd, &term) < 0) { - bcons_perror("failed to get user terminal settings"); - return (-1); - } - - /* Stash for later, so we can revert back to previous mode */ - save_termios = term; - save_fd = fd; - - /* disable 8->7 bit strip, start/stop, enable any char to restart */ - term.c_iflag &= ~(ISTRIP|IXON|IXANY); - /* disable NL->CR, CR->NL, ignore CR, UPPER->lower */ - term.c_iflag &= ~(INLCR|ICRNL|IGNCR|IUCLC); - /* disable output post-processing */ - term.c_oflag &= ~OPOST; - /* disable canonical mode, signal chars, echo & extended functions */ - term.c_lflag &= ~(ICANON|ISIG|ECHO|IEXTEN); - - term.c_cc[VMIN] = 1; /* byte-at-a-time */ - term.c_cc[VTIME] = 0; - - if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &term)) { - bcons_perror("failed to set user terminal to raw mode"); - return (-1); - } - - return (0); -} - -/* - * reset terminal settings for global environment - */ -static void -reset_tty(void) -{ - (void) tcsetattr(save_fd, TCSADRAIN, &save_termios); -} - -/* - * process_user_input watches the input stream for the escape sequence for - * 'quit' (by default, tilde-period). Because we might be fed just one - * keystroke at a time, state associated with the user input (are we at the - * beginning of the line? are we locally echoing the next character?) 
is - * maintained by beginning_of_line and local_echo across calls to the routine. - * - * This routine returns -1 when the 'quit' escape sequence has been issued, - * or an error is encountered and 0 otherwise. - */ -static int -process_user_input(int out_fd, int in_fd) -{ - static boolean_t beginning_of_line = B_TRUE; - static boolean_t local_echo = B_FALSE; - char ibuf[BCONS_BUFSIZ]; - int nbytes; - char *buf = ibuf; - char c; - - nbytes = read(in_fd, ibuf, sizeof (ibuf)); - if (nbytes == -1 && errno != EINTR) - return (-1); - - if (nbytes == -1) /* The read was interrupted. */ - return (0); - - for (c = *buf; nbytes > 0; c = *buf, --nbytes) { - buf++; - if (beginning_of_line && !nocmdchar) { - beginning_of_line = B_FALSE; - if (c == cmdchar) { - local_echo = B_TRUE; - continue; - } - } else if (local_echo) { - local_echo = B_FALSE; - if (c == '.') { - (void) write(STDOUT_FILENO, &cmdchar, 1); - (void) write(STDOUT_FILENO, &c, 1); - return (-1); - } - } - - (void) write(out_fd, &c, 1); - - beginning_of_line = (c == '\r' || c == '\n'); - } - - return (0); -} - -static int -process_output(int in_fd, int out_fd) -{ - int wrote = 0; - int cc; - char ibuf[BCONS_BUFSIZ]; - - cc = read(in_fd, ibuf, sizeof (ibuf)); - if (cc == -1 && errno != EINTR) - return (-1); - if (cc == 0) /* EOF */ - return (-1); - if (cc == -1) /* The read was interrupted. */ - return (0); - - do { - int len; - - len = write(out_fd, ibuf + wrote, cc - wrote); - if (len == -1 && errno != EINTR) - return (-1); - if (len != -1) - wrote += len; - } while (wrote < cc); - - return (0); -} - -/* - * This is the main I/O loop. - */ -static void -doio(void) -{ - struct pollfd pollfds[2]; - int res; - - /* read from vm and write to stdout */ - pollfds[0].fd = masterfd; - pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI; - - /* read from stdin and write to vm */ - pollfds[1].fd = STDIN_FILENO; - pollfds[1].events = pollfds[0].events; - - for (;;) { - pollfds[0].revents = pollfds[1].revents = 0; - - res = poll(pollfds, - sizeof (pollfds) / sizeof (struct pollfd), -1); - - if (res == -1 && errno != EINTR) { - bcons_perror("poll failed"); - /* we are hosed, close connection */ - break; - } - - /* event from master side stdout */ - if (pollfds[0].revents) { - if (pollfds[0].revents & - (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { - if (process_output(masterfd, STDOUT_FILENO) - != 0) - break; - } else { - break; - } - } - - /* event from user stdin side */ - if (pollfds[1].revents) { - if (pollfds[1].revents & - (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { - if (process_user_input(masterfd, STDIN_FILENO) - != 0) - break; - } else { - break; - } - } - } -} - -int -main(int argc, char **argv) -{ - char *vmname; - - pname = basename(argv[0]); - - if (argc == 2) { - vmname = argv[1]; - } else { - usage(); - } - - /* - * Make contact with bhyve - */ - if (get_console(vmname) == -1) - return (1); - - (void) printf("[Connected to vm '%s' console]\n", vmname); - - if (set_tty_rawmode(STDIN_FILENO) == -1) { - reset_tty(); - bcons_perror("failed to set stdin pty to raw mode"); - return (1); - } - - /* - * Run the I/O loop until we get disconnected. 
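The bhyveconsole client being deleted here (its role is taken over by the uart socket backend) is built around one idiom worth noting: a two-descriptor poll(2) loop that relays bytes in both directions until either side errors or reaches EOF. Stripped of the escape-character handling, the shape is roughly as follows (relay is an invented name; the original splits this across doio, process_output, and process_user_input):

#include <poll.h>
#include <unistd.h>

/* Relay bytes between fds a and b until EOF or error on either. */
static void
relay(int a, int b)
{
	struct pollfd pfds[2] = {
		{ .fd = a, .events = POLLIN },
		{ .fd = b, .events = POLLIN },
	};
	char buf[512];
	ssize_t n;
	int i;

	for (;;) {
		if (poll(pfds, 2, -1) == -1)
			return;
		for (i = 0; i < 2; i++) {
			if ((pfds[i].revents & POLLIN) == 0)
				continue;
			n = read(pfds[i].fd, buf, sizeof (buf));
			if (n <= 0)
				return;
			(void) write(pfds[i == 0 ? 1 : 0].fd, buf,
			    (size_t)n);
		}
	}
}

int
main(void)
{
	/* With both ends on a terminal this simply echoes input. */
	relay(STDIN_FILENO, STDOUT_FILENO);
	return (0);
}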
- */ - doio(); - reset_tty(); - (void) printf("\n[Connection to vm '%s' console closed]\n", vmname); - - return (0); -} diff --git a/usr/src/cmd/bhyveconsole/i386/Makefile b/usr/src/cmd/bhyveconsole/i386/Makefile deleted file mode 100644 index c4f317a9fa..0000000000 --- a/usr/src/cmd/bhyveconsole/i386/Makefile +++ /dev/null @@ -1,43 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -PROG= bhyveconsole - -OBJS= bhyveconsole.o - -SRCS= $(OBJS:%.o=../%.c) - -include ../../Makefile.cmd - -CFLAGS += $(CCVERBOSE) -LDLIBS += -lsocket - -.KEEP_STATE: - -%.o: ../%.c - $(COMPILE.c) $< - -all: $(PROG) - -$(PROG): $(OBJS) - $(LINK.c) $(OBJS) -o $@ $(LDLIBS) - $(POST_PROCESS) - -install: all $(ROOTUSRSBINPROG32) - -clean: - $(RM) $(OBJS) - -include ../../Makefile.targ diff --git a/usr/src/cmd/bhyvectl/Makefile b/usr/src/cmd/bhyvectl/Makefile index fe98204056..0a8a96cfc9 100644 --- a/usr/src/cmd/bhyvectl/Makefile +++ b/usr/src/cmd/bhyvectl/Makefile @@ -11,31 +11,50 @@ # # Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. # PROG = bhyvectl include ../Makefile.cmd +include ../Makefile.cmd.64 -$(BUILD64)SUBDIRS += $(MACH64) +SRCS = bhyvectl.c +OBJS = $(SRCS:.c=.o) humanize_number.o -all := TARGET = all -install := TARGET = install -clean := TARGET = clean -clobber := TARGET = clobber -lint := TARGET = lint +CLEANFILES = $(PROG) +CLOBBERFILES += $(ROOTUSRSBINPROG) .KEEP_STATE: -all clean clobber lint: $(SUBDIRS) +CFLAGS += $(CCVERBOSE) +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 \ + $(CPPFLAGS.master) \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/i86pc +LDLIBS += -lvmmapi -install: $(SUBDIRS) - -$(RM) $(ROOTUSRSBINPROG) - -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) +CERRWARN += -_gcc=-Wno-uninitialized -$(SUBDIRS): FRC - @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) +# main() is too hairy for smatch +bhyvectl.o := SMATCH=off -FRC: +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTUSRSBINPROG) + +clean: + $(RM) $(OBJS) $(CLEANFILES) + +lint: lint_SRCS include ../Makefile.targ + +%.o: $(CONTRIB)/freebsd/lib/libutil/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) diff --git a/usr/src/cmd/bhyvectl/Makefile.com b/usr/src/cmd/bhyvectl/Makefile.com deleted file mode 100644 index 03ca34792c..0000000000 --- a/usr/src/cmd/bhyvectl/Makefile.com +++ /dev/null @@ -1,48 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. 
-# - -PROG= bhyvectl - -SRCS = bhyvectl.c -OBJS = $(SRCS:.c=.o) - -include ../../Makefile.cmd - -.KEEP_STATE: - -CFLAGS += $(CCVERBOSE) -CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ - -I$(ROOT)/usr/platform/i86pc/include \ - -I$(SRC)/uts/i86pc/io/vmm -LDLIBS += -lvmmapi - -all: $(PROG) - -$(PROG): $(OBJS) - $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) - $(POST_PROCESS) - -install: all $(ROOTUSRSBINPROG) - -clean: - $(RM) $(OBJS) - -lint: lint_SRCS - -include ../../Makefile.targ - -%.o: ../%.c - $(COMPILE.c) -I$(SRC)/common $< - $(POST_PROCESS_O) diff --git a/usr/src/cmd/bhyvectl/amd64/Makefile b/usr/src/cmd/bhyvectl/amd64/Makefile deleted file mode 100644 index b602c50d05..0000000000 --- a/usr/src/cmd/bhyvectl/amd64/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -include ../Makefile.com -include ../../Makefile.cmd.64 - -CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 - -install: all $(ROOTUSRSBINPROG64) diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c index 07d0a83df5..b8bdf524a9 100644 --- a/usr/src/cmd/bhyvectl/bhyvectl.c +++ b/usr/src/cmd/bhyvectl/bhyvectl.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -38,30 +40,39 @@ /* * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. 
*/ #include -__FBSDID("$FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43Z neel $"); +__FBSDID("$FreeBSD$"); #include #include #include #include #include +#include #include #include +#include +#include #include #include #include #include -#include #include +#include #include +#include +#include +#include #include +#include #include +#include "amd/vmcb.h" #include "intel/vmcs.h" #define MB (1UL << 20) @@ -74,7 +85,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43 static const char *progname; static void -usage(void) +usage(bool cpu_intel) { (void)fprintf(stderr, @@ -82,6 +93,9 @@ usage(void) " [--cpu=]\n" " [--create]\n" " [--destroy]\n" +#ifndef __FreeBSD__ + " [--wrlock-cycle]\n" +#endif " [--get-all]\n" " [--get-stats]\n" " [--set-desc-ds]\n" @@ -113,10 +127,22 @@ usage(void) " [--desc-access=]\n" " [--set-cr0=]\n" " [--get-cr0]\n" + " [--set-cr2=]\n" + " [--get-cr2]\n" " [--set-cr3=]\n" " [--get-cr3]\n" " [--set-cr4=]\n" " [--get-cr4]\n" + " [--set-dr0=]\n" + " [--get-dr0]\n" + " [--set-dr1=]\n" + " [--get-dr1]\n" + " [--set-dr2=]\n" + " [--get-dr2]\n" + " [--set-dr3=]\n" + " [--get-dr3]\n" + " [--set-dr6=]\n" + " [--get-dr6]\n" " [--set-dr7=]\n" " [--get-dr7]\n" " [--set-rsp=]\n" @@ -155,64 +181,108 @@ usage(void) " [--get-ss]\n" " [--get-tr]\n" " [--get-ldtr]\n" - " [--get-vmcs-pinbased-ctls]\n" - " [--get-vmcs-procbased-ctls]\n" - " [--get-vmcs-procbased-ctls2]\n" - " [--get-vmcs-entry-interruption-info]\n" - " [--set-vmcs-entry-interruption-info=]\n" - " [--get-vmcs-eptp]\n" - " [--get-vmcs-guest-physical-address\n" - " [--get-vmcs-guest-linear-address\n" - " [--set-vmcs-exception-bitmap]\n" - " [--get-vmcs-exception-bitmap]\n" - " [--get-vmcs-io-bitmap-address]\n" - " [--get-vmcs-tsc-offset]\n" - " [--get-vmcs-guest-pat]\n" - " [--get-vmcs-host-pat]\n" - " [--get-vmcs-host-cr0]\n" - " [--get-vmcs-host-cr3]\n" - " [--get-vmcs-host-cr4]\n" - " [--get-vmcs-host-rip]\n" - " [--get-vmcs-host-rsp]\n" - " [--get-vmcs-cr0-mask]\n" - " [--get-vmcs-cr0-shadow]\n" - " [--get-vmcs-cr4-mask]\n" - " [--get-vmcs-cr4-shadow]\n" - " [--get-vmcs-cr3-targets]\n" - " [--get-vmcs-apic-access-address]\n" - " [--get-vmcs-virtual-apic-address]\n" - " [--get-vmcs-tpr-threshold]\n" - " [--get-vmcs-msr-bitmap]\n" - " [--get-vmcs-msr-bitmap-address]\n" - " [--get-vmcs-vpid]\n" - " [--get-vmcs-ple-gap]\n" - " [--get-vmcs-ple-window]\n" - " [--get-vmcs-instruction-error]\n" - " [--get-vmcs-exit-ctls]\n" - " [--get-vmcs-entry-ctls]\n" - " [--get-vmcs-guest-sysenter]\n" - " [--get-vmcs-link]\n" - " [--get-vmcs-exit-reason]\n" - " [--get-vmcs-exit-qualification]\n" - " [--get-vmcs-exit-interruption-info]\n" - " [--get-vmcs-exit-interruption-error]\n" - " [--get-vmcs-interruptibility]\n" " [--set-x2apic-state=]\n" " [--get-x2apic-state]\n" " [--unassign-pptdev=]\n" " [--set-mem=]\n" " [--get-lowmem]\n" - " [--get-highmem]\n", + " [--get-highmem]\n" + " [--get-gpa-pmap]\n" + " [--assert-lapic-lvt=]\n" + " [--inject-nmi]\n" + " [--force-reset]\n" + " [--force-poweroff]\n" + " [--get-rtc-time]\n" + " [--set-rtc-time=]\n" + " [--get-rtc-nvram]\n" + " [--set-rtc-nvram=]\n" + " [--rtc-nvram-offset=]\n" + " [--get-active-cpus]\n" + " [--get-suspended-cpus]\n" + " [--get-intinfo]\n" + " [--get-eptp]\n" + " [--set-exception-bitmap]\n" + " [--get-exception-bitmap]\n" + " [--get-tsc-offset]\n" + " [--get-guest-pat]\n" + " [--get-io-bitmap-address]\n" + " [--get-msr-bitmap]\n" + " [--get-msr-bitmap-address]\n" + " [--get-guest-sysenter]\n" + " [--get-exit-reason]\n" + " 
[--get-cpu-topology]\n", progname); + + if (cpu_intel) { + (void)fprintf(stderr, + " [--get-vmcs-pinbased-ctls]\n" + " [--get-vmcs-procbased-ctls]\n" + " [--get-vmcs-procbased-ctls2]\n" + " [--get-vmcs-entry-interruption-info]\n" + " [--set-vmcs-entry-interruption-info=]\n" + " [--get-vmcs-guest-physical-address\n" + " [--get-vmcs-guest-linear-address\n" + " [--get-vmcs-host-pat]\n" + " [--get-vmcs-host-cr0]\n" + " [--get-vmcs-host-cr3]\n" + " [--get-vmcs-host-cr4]\n" + " [--get-vmcs-host-rip]\n" + " [--get-vmcs-host-rsp]\n" + " [--get-vmcs-cr0-mask]\n" + " [--get-vmcs-cr0-shadow]\n" + " [--get-vmcs-cr4-mask]\n" + " [--get-vmcs-cr4-shadow]\n" + " [--get-vmcs-cr3-targets]\n" + " [--get-vmcs-apic-access-address]\n" + " [--get-vmcs-virtual-apic-address]\n" + " [--get-vmcs-tpr-threshold]\n" + " [--get-vmcs-vpid]\n" + " [--get-vmcs-instruction-error]\n" + " [--get-vmcs-exit-ctls]\n" + " [--get-vmcs-entry-ctls]\n" + " [--get-vmcs-link]\n" + " [--get-vmcs-exit-qualification]\n" + " [--get-vmcs-exit-interruption-info]\n" + " [--get-vmcs-exit-interruption-error]\n" + " [--get-vmcs-interruptibility]\n" + ); + } else { + (void)fprintf(stderr, + " [--get-vmcb-intercepts]\n" + " [--get-vmcb-asid]\n" + " [--get-vmcb-exit-details]\n" + " [--get-vmcb-tlb-ctrl]\n" + " [--get-vmcb-virq]\n" + " [--get-avic-apic-bar]\n" + " [--get-avic-backing-page]\n" + " [--get-avic-table]\n" + ); + } exit(1); } -static int get_stats, getcap, setcap, capval; +static int get_rtc_time, set_rtc_time; +static int get_rtc_nvram, set_rtc_nvram; +static int rtc_nvram_offset; +static uint8_t rtc_nvram_value; +static time_t rtc_secs; + +static int get_stats, getcap, setcap, capval, get_gpa_pmap; +static int inject_nmi, assert_lapic_lvt; +static int force_reset, force_poweroff; static const char *capname; -static int create, destroy, get_lowmem, get_highmem; +static int create, destroy, get_memmap, get_memseg; +static int get_intinfo; +static int get_active_cpus, get_suspended_cpus; static uint64_t memsize; -static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4; +static int set_cr0, get_cr0, set_cr2, get_cr2, set_cr3, get_cr3; +static int set_cr4, get_cr4; static int set_efer, get_efer; +static int set_dr0, get_dr0; +static int set_dr1, get_dr1; +static int set_dr2, get_dr2; +static int set_dr3, get_dr3; +static int set_dr6, get_dr6; static int set_dr7, get_dr7; static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags; static int set_rax, get_rax; @@ -234,6 +304,16 @@ static int set_x2apic_state, get_x2apic_state; enum x2apic_state x2apic_state; static int unassign_pptdev, bus, slot, func; static int run; +static int get_cpu_topology; +#ifndef __FreeBSD__ +static int wrlock_cycle; +#endif + +/* + * VMCB specific. 
+ */ +static int get_vmcb_intercept, get_vmcb_exit_details, get_vmcb_tlb_ctrl; +static int get_vmcb_virq, get_avic_table; /* * VMCS-specific fields @@ -250,14 +330,15 @@ static int get_cr4_mask, get_cr4_shadow; static int get_cr3_targets; static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold; static int get_msr_bitmap, get_msr_bitmap_address; -static int get_vpid, get_ple_gap, get_ple_window; +static int get_vpid_asid; static int get_inst_err, get_exit_ctls, get_entry_ctls; static int get_host_cr0, get_host_cr3, get_host_cr4; static int get_host_rip, get_host_rsp; static int get_guest_pat, get_host_pat; static int get_guest_sysenter, get_vmcs_link; -static int get_vmcs_exit_reason, get_vmcs_exit_qualification; +static int get_exit_reason, get_vmcs_exit_qualification; static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error; +static int get_vmcs_exit_inst_length; static uint64_t desc_base; static uint32_t desc_limit, desc_access; @@ -291,29 +372,115 @@ dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) printf("\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); printf("\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); break; + case VM_EXITCODE_SVM: + printf("\treason\t\tSVM\n"); + printf("\texit_reason\t\t%#lx\n", vmexit->u.svm.exitcode); + printf("\texitinfo1\t\t%#lx\n", vmexit->u.svm.exitinfo1); + printf("\texitinfo2\t\t%#lx\n", vmexit->u.svm.exitinfo2); + break; default: printf("*** unknown vm run exitcode %d\n", vmexit->exitcode); break; } } -static int -dump_vmcs_msr_bitmap(int vcpu, u_long addr) +/* AMD 6th generation and Intel compatible MSRs */ +#define MSR_AMD6TH_START 0xC0000000 +#define MSR_AMD6TH_END 0xC0001FFF +/* AMD 7th and 8th generation compatible MSRs */ +#define MSR_AMD7TH_START 0xC0010000 +#define MSR_AMD7TH_END 0xC0011FFF + +static const char * +msr_name(uint32_t msr) { - int error, fd, byte, bit, readable, writeable; - u_int msr; - const char *bitmap; + static char buf[32]; + + switch(msr) { + case MSR_TSC: + return ("MSR_TSC"); + case MSR_EFER: + return ("MSR_EFER"); + case MSR_STAR: + return ("MSR_STAR"); + case MSR_LSTAR: + return ("MSR_LSTAR"); + case MSR_CSTAR: + return ("MSR_CSTAR"); + case MSR_SF_MASK: + return ("MSR_SF_MASK"); + case MSR_FSBASE: + return ("MSR_FSBASE"); + case MSR_GSBASE: + return ("MSR_GSBASE"); + case MSR_KGSBASE: + return ("MSR_KGSBASE"); + case MSR_SYSENTER_CS_MSR: + return ("MSR_SYSENTER_CS_MSR"); + case MSR_SYSENTER_ESP_MSR: + return ("MSR_SYSENTER_ESP_MSR"); + case MSR_SYSENTER_EIP_MSR: + return ("MSR_SYSENTER_EIP_MSR"); + case MSR_PAT: + return ("MSR_PAT"); + } + snprintf(buf, sizeof(buf), "MSR %#08x", msr); + + return (buf); +} - error = -1; - bitmap = MAP_FAILED; +static inline void +print_msr_pm(uint64_t msr, int vcpu, int readable, int writeable) +{ - fd = open("/dev/mem", O_RDONLY, 0); - if (fd < 0) - goto done; + if (readable || writeable) { + printf("%-20s[%d]\t\t%c%c\n", msr_name(msr), vcpu, + readable ? 'R' : '-', writeable ? 'W' : '-'); + } +} - bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, 0, fd, addr); - if (bitmap == MAP_FAILED) - goto done; +/* + * Reference APM vol2, section 15.11 MSR Intercepts. + */ +static void +dump_amd_msr_pm(const char *bitmap, int vcpu) +{ + int byte, bit, readable, writeable; + uint32_t msr; + + for (msr = 0; msr < 0x2000; msr++) { + byte = msr / 4; + bit = (msr % 4) * 2; + + /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[byte] & (2 << bit)) ? 
0 : 1; + print_msr_pm(msr, vcpu, readable, writeable); + + /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ + byte += 2048; + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; + print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, + writeable); + + /* MSR 0xC0010000 to 0xC0011FF is only for AMD */ + byte += 4096; + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; + print_msr_pm(msr + MSR_AMD7TH_START, vcpu, readable, + writeable); + } +} + +/* + * Reference Intel SDM Vol3 Section 24.6.9 MSR-Bitmap Address + */ +static void +dump_intel_msr_pm(const char *bitmap, int vcpu) +{ + int byte, bit, readable, writeable; + uint32_t msr; for (msr = 0; msr < 0x2000; msr++) { byte = msr / 8; @@ -321,31 +488,56 @@ dump_vmcs_msr_bitmap(int vcpu, u_long addr) /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; - writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; - if (readable || writeable) { - printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu, - readable ? 'R' : '-', - writeable ? 'W' : '-'); - } + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + print_msr_pm(msr, vcpu, readable, writeable); /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ byte += 1024; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; - writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; - if (readable || writeable) { - printf("msr 0x%08x[%d]\t\t%c%c\n", - 0xc0000000 + msr, vcpu, - readable ? 'R' : '-', - writeable ? 'W' : '-'); - } + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, + writeable); + } +} + +static int +dump_msr_bitmap(int vcpu, uint64_t addr, bool cpu_intel) +{ + int error, fd, map_size; + const char *bitmap; + + error = -1; + bitmap = MAP_FAILED; + + fd = open("/dev/mem", O_RDONLY, 0); + if (fd < 0) { + perror("Couldn't open /dev/mem"); + goto done; + } + + if (cpu_intel) + map_size = PAGE_SIZE; + else + map_size = 2 * PAGE_SIZE; + + bitmap = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd, addr); + if (bitmap == MAP_FAILED) { + perror("mmap failed"); + goto done; } + + if (cpu_intel) + dump_intel_msr_pm(bitmap, vcpu); + else + dump_amd_msr_pm(bitmap, vcpu); error = 0; done: if (bitmap != MAP_FAILED) - munmap((void *)bitmap, PAGE_SIZE); + munmap((void *)bitmap, map_size); if (fd >= 0) close(fd); + return (error); } @@ -363,14 +555,36 @@ vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val) return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val)); } +static int +vm_get_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, + uint64_t *ret_val) +{ + + return (vm_get_register(ctx, vcpu, VMCB_ACCESS(off, bytes), ret_val)); +} + +static int +vm_set_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, + uint64_t val) +{ + + return (vm_set_register(ctx, vcpu, VMCB_ACCESS(off, bytes), val)); +} + enum { VMNAME = 1000, /* avoid collision with return values from getopt */ VCPU, SET_MEM, SET_EFER, SET_CR0, + SET_CR2, SET_CR3, SET_CR4, + SET_DR0, + SET_DR1, + SET_DR2, + SET_DR3, + SET_DR6, SET_DR7, SET_RSP, SET_RIP, @@ -388,492 +602,158 @@ enum { SET_TR, SET_LDTR, SET_X2APIC_STATE, - SET_VMCS_EXCEPTION_BITMAP, + SET_EXCEPTION_BITMAP, SET_VMCS_ENTRY_INTERRUPTION_INFO, SET_CAP, CAPNAME, UNASSIGN_PPTDEV, + GET_GPA_PMAP, + ASSERT_LAPIC_LVT, + SET_RTC_TIME, + SET_RTC_NVRAM, + RTC_NVRAM_OFFSET, }; -int -main(int argc, char *argv[]) +static void +print_cpus(const char 
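
/*
 * Sketch (not part of this patch): the AMD permission-map layout that
 * dump_amd_msr_pm() above decodes, reduced to a single lookup.  Each
 * MSR consumes two adjacent bits (read intercept, then write
 * intercept), so four MSRs pack into each byte; a set bit means
 * "intercepted", hence the inversion to report accessibility.  Names
 * are illustrative.
 */
#include <stdint.h>

static void
amd_msr_perm(const uint8_t *bitmap, uint32_t msr, int *readable,
    int *writeable)
{
	int byte = msr / 4;		/* 4 MSRs per byte */
	int bit = (msr % 4) * 2;	/* 2 bits per MSR */

	*readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
	*writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1;
}
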
*banner, const cpuset_t *cpus) { - char *vmname; - int error, ch, vcpu; - vm_paddr_t gpa; - size_t len; - struct vm_exit vmexit; - uint64_t ctl, eptp, bm, addr, u64; - struct vmctx *ctx; - int wired; + int i; + int first; + + first = 1; + printf("%s:\t", banner); + if (!CPU_EMPTY(cpus)) { + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, cpus)) { + printf("%s%d", first ? " " : ", ", i); + first = 0; + } + } + } else + printf(" (none)"); + printf("\n"); +} + +static void +print_intinfo(const char *banner, uint64_t info) +{ + int type; + + printf("%s:\t", banner); + if (info & VM_INTINFO_VALID) { + type = info & VM_INTINFO_TYPE; + switch (type) { + case VM_INTINFO_HWINTR: + printf("extint"); + break; + case VM_INTINFO_NMI: + printf("nmi"); + break; + case VM_INTINFO_SWINTR: + printf("swint"); + break; + default: + printf("exception"); + break; + } + printf(" vector %d", (int)VM_INTINFO_VECTOR(info)); + if (info & VM_INTINFO_DEL_ERRCODE) + printf(" errcode %#x", (u_int)(info >> 32)); + } else { + printf("n/a"); + } + printf("\n"); +} - uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat; +static bool +cpu_vendor_intel(void) +{ + u_int regs[4]; + char cpu_vendor[13]; + + do_cpuid(0, regs); + ((u_int *)&cpu_vendor)[0] = regs[1]; + ((u_int *)&cpu_vendor)[1] = regs[3]; + ((u_int *)&cpu_vendor)[2] = regs[2]; + cpu_vendor[12] = '\0'; + + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { + return (false); + } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { + return (true); + } else { + fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor); + exit(1); + } +} + +static int +get_all_registers(struct vmctx *ctx, int vcpu) +{ + uint64_t cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; + uint64_t rsp, rip, rflags, efer; uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; uint64_t r8, r9, r10, r11, r12, r13, r14, r15; - uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + int error = 0; - struct option opts[] = { - { "vm", REQ_ARG, 0, VMNAME }, - { "cpu", REQ_ARG, 0, VCPU }, - { "set-mem", REQ_ARG, 0, SET_MEM }, - { "set-efer", REQ_ARG, 0, SET_EFER }, - { "set-cr0", REQ_ARG, 0, SET_CR0 }, - { "set-cr3", REQ_ARG, 0, SET_CR3 }, - { "set-cr4", REQ_ARG, 0, SET_CR4 }, - { "set-dr7", REQ_ARG, 0, SET_DR7 }, - { "set-rsp", REQ_ARG, 0, SET_RSP }, - { "set-rip", REQ_ARG, 0, SET_RIP }, - { "set-rax", REQ_ARG, 0, SET_RAX }, - { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, - { "desc-base", REQ_ARG, 0, DESC_BASE }, - { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, - { "desc-access",REQ_ARG, 0, DESC_ACCESS }, - { "set-cs", REQ_ARG, 0, SET_CS }, - { "set-ds", REQ_ARG, 0, SET_DS }, - { "set-es", REQ_ARG, 0, SET_ES }, - { "set-fs", REQ_ARG, 0, SET_FS }, - { "set-gs", REQ_ARG, 0, SET_GS }, - { "set-ss", REQ_ARG, 0, SET_SS }, - { "set-tr", REQ_ARG, 0, SET_TR }, - { "set-ldtr", REQ_ARG, 0, SET_LDTR }, - { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, - { "set-vmcs-exception-bitmap", - REQ_ARG, 0, SET_VMCS_EXCEPTION_BITMAP }, - { "set-vmcs-entry-interruption-info", - REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, - { "capname", REQ_ARG, 0, CAPNAME }, - { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, - { "setcap", REQ_ARG, 0, SET_CAP }, - { "getcap", NO_ARG, &getcap, 1 }, - { "get-stats", NO_ARG, &get_stats, 1 }, - { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, - { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, - { "get-desc-es",NO_ARG, &get_desc_es, 1 }, - { "set-desc-es",NO_ARG, &set_desc_es, 1 }, - { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, - { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, - { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, - { 
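
/*
 * Sketch (not part of this patch): a worked decode through
 * print_intinfo() above, assuming the VM_INTINFO_* encoding from
 * <machine/vmm.h> (valid = bit 31, error-code-valid = bit 11, type =
 * bits 10:8, vector = bits 7:0, error code in the upper 32 bits).
 * 0x280000b0e is then a pending hardware exception, vector 14 (#PF),
 * delivering error code 0x2:
 */
print_intinfo("pending", 0x0000000280000b0eULL);
/* prints: pending:	exception vector 14 errcode 0x2 */
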
"set-desc-cs",NO_ARG, &set_desc_cs, 1 }, - { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, - { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, - { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, - { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, - { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, - { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, - { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, - { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, - { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, - { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, - { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, - { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, - { "get-lowmem", NO_ARG, &get_lowmem, 1 }, - { "get-highmem",NO_ARG, &get_highmem, 1 }, - { "get-efer", NO_ARG, &get_efer, 1 }, - { "get-cr0", NO_ARG, &get_cr0, 1 }, - { "get-cr3", NO_ARG, &get_cr3, 1 }, - { "get-cr4", NO_ARG, &get_cr4, 1 }, - { "get-dr7", NO_ARG, &get_dr7, 1 }, - { "get-rsp", NO_ARG, &get_rsp, 1 }, - { "get-rip", NO_ARG, &get_rip, 1 }, - { "get-rax", NO_ARG, &get_rax, 1 }, - { "get-rbx", NO_ARG, &get_rbx, 1 }, - { "get-rcx", NO_ARG, &get_rcx, 1 }, - { "get-rdx", NO_ARG, &get_rdx, 1 }, - { "get-rsi", NO_ARG, &get_rsi, 1 }, - { "get-rdi", NO_ARG, &get_rdi, 1 }, - { "get-rbp", NO_ARG, &get_rbp, 1 }, - { "get-r8", NO_ARG, &get_r8, 1 }, - { "get-r9", NO_ARG, &get_r9, 1 }, - { "get-r10", NO_ARG, &get_r10, 1 }, - { "get-r11", NO_ARG, &get_r11, 1 }, - { "get-r12", NO_ARG, &get_r12, 1 }, - { "get-r13", NO_ARG, &get_r13, 1 }, - { "get-r14", NO_ARG, &get_r14, 1 }, - { "get-r15", NO_ARG, &get_r15, 1 }, - { "get-rflags", NO_ARG, &get_rflags, 1 }, - { "get-cs", NO_ARG, &get_cs, 1 }, - { "get-ds", NO_ARG, &get_ds, 1 }, - { "get-es", NO_ARG, &get_es, 1 }, - { "get-fs", NO_ARG, &get_fs, 1 }, - { "get-gs", NO_ARG, &get_gs, 1 }, - { "get-ss", NO_ARG, &get_ss, 1 }, - { "get-tr", NO_ARG, &get_tr, 1 }, - { "get-ldtr", NO_ARG, &get_ldtr, 1 }, - { "get-vmcs-pinbased-ctls", - NO_ARG, &get_pinbased_ctls, 1 }, - { "get-vmcs-procbased-ctls", - NO_ARG, &get_procbased_ctls, 1 }, - { "get-vmcs-procbased-ctls2", - NO_ARG, &get_procbased_ctls2, 1 }, - { "get-vmcs-guest-linear-address", - NO_ARG, &get_vmcs_gla, 1 }, - { "get-vmcs-guest-physical-address", - NO_ARG, &get_vmcs_gpa, 1 }, - { "get-vmcs-entry-interruption-info", - NO_ARG, &get_vmcs_entry_interruption_info, 1}, - { "get-vmcs-eptp", NO_ARG, &get_eptp, 1 }, - { "get-vmcs-exception-bitmap", - NO_ARG, &get_exception_bitmap, 1 }, - { "get-vmcs-io-bitmap-address", - NO_ARG, &get_io_bitmap, 1 }, - { "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 }, - { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, - { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, - { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, - { "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 }, - { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1}, - { "get-vmcs-apic-access-address", - NO_ARG, &get_apic_access_addr, 1}, - { "get-vmcs-virtual-apic-address", - NO_ARG, &get_virtual_apic_addr, 1}, - { "get-vmcs-tpr-threshold", - NO_ARG, &get_tpr_threshold, 1 }, - { "get-vmcs-msr-bitmap", - NO_ARG, &get_msr_bitmap, 1 }, - { "get-vmcs-msr-bitmap-address", - NO_ARG, &get_msr_bitmap_address, 1 }, - { "get-vmcs-vpid", NO_ARG, &get_vpid, 1 }, - { "get-vmcs-ple-gap", NO_ARG, &get_ple_gap, 1 }, - { "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 }, - { "get-vmcs-instruction-error", - NO_ARG, &get_inst_err, 1 }, - { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, - { "get-vmcs-entry-ctls", - NO_ARG, &get_entry_ctls, 1 }, - { "get-vmcs-guest-pat", NO_ARG, &get_guest_pat, 1 }, - { 
"get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, - { "get-vmcs-host-cr0", - NO_ARG, &get_host_cr0, 1 }, - { "get-vmcs-host-cr3", - NO_ARG, &get_host_cr3, 1 }, - { "get-vmcs-host-cr4", - NO_ARG, &get_host_cr4, 1 }, - { "get-vmcs-host-rip", - NO_ARG, &get_host_rip, 1 }, - { "get-vmcs-host-rsp", - NO_ARG, &get_host_rsp, 1 }, - { "get-vmcs-guest-sysenter", - NO_ARG, &get_guest_sysenter, 1 }, - { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, - { "get-vmcs-exit-reason", - NO_ARG, &get_vmcs_exit_reason, 1 }, - { "get-vmcs-exit-qualification", - NO_ARG, &get_vmcs_exit_qualification, 1 }, - { "get-vmcs-exit-interruption-info", - NO_ARG, &get_vmcs_exit_interruption_info, 1}, - { "get-vmcs-exit-interruption-error", - NO_ARG, &get_vmcs_exit_interruption_error, 1}, - { "get-vmcs-interruptibility", - NO_ARG, &get_vmcs_interruptibility, 1 }, - { "get-x2apic-state",NO_ARG, &get_x2apic_state, 1 }, - { "get-all", NO_ARG, &get_all, 1 }, - { "run", NO_ARG, &run, 1 }, - { "create", NO_ARG, &create, 1 }, - { "destroy", NO_ARG, &destroy, 1 }, - { NULL, 0, NULL, 0 } - }; + if (!error && (get_efer || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); + if (error == 0) + printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); + } - vcpu = 0; - progname = basename(argv[0]); + if (!error && (get_cr0 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); + if (error == 0) + printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + } - while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { - switch (ch) { - case 0: - break; - case VMNAME: - vmname = optarg; - break; - case VCPU: - vcpu = atoi(optarg); - break; - case SET_MEM: - memsize = atoi(optarg) * MB; - memsize = roundup(memsize, 2 * MB); - break; - case SET_EFER: - efer = strtoul(optarg, NULL, 0); - set_efer = 1; - break; - case SET_CR0: - cr0 = strtoul(optarg, NULL, 0); - set_cr0 = 1; - break; - case SET_CR3: - cr3 = strtoul(optarg, NULL, 0); - set_cr3 = 1; - break; - case SET_CR4: - cr4 = strtoul(optarg, NULL, 0); - set_cr4 = 1; - break; - case SET_DR7: - dr7 = strtoul(optarg, NULL, 0); - set_dr7 = 1; - break; - case SET_RSP: - rsp = strtoul(optarg, NULL, 0); - set_rsp = 1; - break; - case SET_RIP: - rip = strtoul(optarg, NULL, 0); - set_rip = 1; - break; - case SET_RAX: - rax = strtoul(optarg, NULL, 0); - set_rax = 1; - break; - case SET_RFLAGS: - rflags = strtoul(optarg, NULL, 0); - set_rflags = 1; - break; - case DESC_BASE: - desc_base = strtoul(optarg, NULL, 0); - break; - case DESC_LIMIT: - desc_limit = strtoul(optarg, NULL, 0); - break; - case DESC_ACCESS: - desc_access = strtoul(optarg, NULL, 0); - break; - case SET_CS: - cs = strtoul(optarg, NULL, 0); - set_cs = 1; - break; - case SET_DS: - ds = strtoul(optarg, NULL, 0); - set_ds = 1; - break; - case SET_ES: - es = strtoul(optarg, NULL, 0); - set_es = 1; - break; - case SET_FS: - fs = strtoul(optarg, NULL, 0); - set_fs = 1; - break; - case SET_GS: - gs = strtoul(optarg, NULL, 0); - set_gs = 1; - break; - case SET_SS: - ss = strtoul(optarg, NULL, 0); - set_ss = 1; - break; - case SET_TR: - tr = strtoul(optarg, NULL, 0); - set_tr = 1; - break; - case SET_LDTR: - ldtr = strtoul(optarg, NULL, 0); - set_ldtr = 1; - break; - case SET_X2APIC_STATE: - x2apic_state = strtol(optarg, NULL, 0); - set_x2apic_state = 1; - break; - case SET_VMCS_EXCEPTION_BITMAP: - exception_bitmap = strtoul(optarg, NULL, 0); - set_exception_bitmap = 1; - break; - case SET_VMCS_ENTRY_INTERRUPTION_INFO: - vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); - set_vmcs_entry_interruption_info = 1; - 
break; - case SET_CAP: - capval = strtoul(optarg, NULL, 0); - setcap = 1; - break; - case CAPNAME: - capname = optarg; - break; - case UNASSIGN_PPTDEV: - unassign_pptdev = 1; - if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) - usage(); - break; - default: - usage(); - } - } - argc -= optind; - argv += optind; - - if (vmname == NULL) - usage(); - - error = 0; - - if (!error && create) - error = vm_create(vmname); - - if (!error) { - ctx = vm_open(vmname); - if (ctx == NULL) - error = -1; - } - - if (!error && memsize) - error = vm_setup_memory(ctx, memsize, VM_MMAP_NONE); - - if (!error && set_efer) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); - - if (!error && set_cr0) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); - - if (!error && set_cr3) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); - - if (!error && set_cr4) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); - - if (!error && set_dr7) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); - - if (!error && set_rsp) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); - - if (!error && set_rip) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); - - if (!error && set_rax) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); - - if (!error && set_rflags) { - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, - rflags); - } - - if (!error && set_desc_ds) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_es) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_ss) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_cs) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_fs) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_gs) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_tr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_ldtr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_gdtr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, - desc_base, desc_limit, 0); - } - - if (!error && set_desc_idtr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, - desc_base, desc_limit, 0); - } - - if (!error && set_cs) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); - - if (!error && set_ds) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); - - if (!error && set_es) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); - - if (!error && set_fs) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); - - if (!error && set_gs) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); - - if (!error && set_ss) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); - - if (!error && set_tr) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); - - if (!error && set_ldtr) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); - - if (!error && set_x2apic_state) - error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); - -#ifdef __FreeBSD__ - if (!error && unassign_pptdev) - error = vm_unassign_pptdev(ctx, bus, slot, func); -#endif - - if 
(!error && set_exception_bitmap) { - error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, - exception_bitmap); + if (!error && (get_cr2 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR2, &cr2); + if (error == 0) + printf("cr2[%d]\t\t0x%016lx\n", vcpu, cr2); } - if (!error && set_vmcs_entry_interruption_info) { - error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, - vmcs_entry_interruption_info); + if (!error && (get_cr3 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); + if (error == 0) + printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); } - if (!error && (get_lowmem || get_all)) { - gpa = 0; - error = vm_get_memory_seg(ctx, gpa, &len, &wired); + if (!error && (get_cr4 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); if (error == 0) - printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len, - wired ? " wired" : ""); + printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); } - if (!error && (get_highmem || get_all)) { - gpa = 4 * GB; - error = vm_get_memory_seg(ctx, gpa, &len, &wired); + if (!error && (get_dr0 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR0, &dr0); if (error == 0) - printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len, - wired ? " wired" : ""); + printf("dr0[%d]\t\t0x%016lx\n", vcpu, dr0); } - if (!error && (get_efer || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); + if (!error && (get_dr1 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR1, &dr1); if (error == 0) - printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); + printf("dr1[%d]\t\t0x%016lx\n", vcpu, dr1); } - if (!error && (get_cr0 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); + if (!error && (get_dr2 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR2, &dr2); if (error == 0) - printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + printf("dr2[%d]\t\t0x%016lx\n", vcpu, dr2); } - if (!error && (get_cr3 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); + if (!error && (get_dr3 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR3, &dr3); if (error == 0) - printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + printf("dr3[%d]\t\t0x%016lx\n", vcpu, dr3); } - if (!error && (get_cr4 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); + if (!error && (get_dr6 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR6, &dr6); if (error == 0) - printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + printf("dr6[%d]\t\t0x%016lx\n", vcpu, dr6); } if (!error && (get_dr7 || get_all)) { @@ -991,30 +871,21 @@ main(int argc, char *argv[]) printf("rflags[%d]\t0x%016lx\n", vcpu, rflags); } -#ifdef __FreeBSD__ - if (!error && (get_stats || get_all)) { - int i, num_stats; - uint64_t *stats; - struct timeval tv; - const char *desc; + return (error); +} - stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); - if (stats != NULL) { - printf("vcpu%d\n", vcpu); - for (i = 0; i < num_stats; i++) { - desc = vm_get_stat_desc(ctx, i); - printf("%-40s\t%ld\n", desc, stats[i]); - } - } - } -#endif +static int +get_all_segments(struct vmctx *ctx, int vcpu) +{ + uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + int error = 0; if (!error && (get_desc_ds || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS, - &desc_base, &desc_limit, &desc_access); + &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, 
desc_access); } } @@ -1023,7 +894,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1032,7 +903,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1041,7 +912,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1050,7 +921,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1059,7 +930,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1068,7 +939,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1077,7 +948,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1086,7 +957,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gdtr[%d]\t\t0x%016lx/0x%08x\n", - vcpu, desc_base, desc_limit); + vcpu, desc_base, desc_limit); } } @@ -1095,7 +966,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("idtr[%d]\t\t0x%016lx/0x%08x\n", - vcpu, desc_base, desc_limit); + vcpu, desc_base, desc_limit); } } @@ -1147,82 +1018,14 @@ main(int argc, char *argv[]) printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr); } - if (!error && (get_x2apic_state || get_all)) { - error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); - if (error == 0) - printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state); - } - - if (!error && (get_pinbased_ctls || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); - if (error == 0) - printf("pinbased_ctls[%d]\t0x%08lx\n", vcpu, ctl); - } - - if (!error && (get_procbased_ctls || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_PRI_PROC_BASED_CTLS, &ctl); - if (error == 0) - printf("procbased_ctls[%d]\t0x%08lx\n", vcpu, ctl); - } - - if (!error && (get_procbased_ctls2 || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_SEC_PROC_BASED_CTLS, &ctl); - if (error == 0) - printf("procbased_ctls2[%d]\t0x%08lx\n", vcpu, ctl); - } - - if (!error && (get_vmcs_gla || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_LINEAR_ADDRESS, &u64); - if (error == 0) - printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); - } - - if (!error && (get_vmcs_gpa || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_PHYSICAL_ADDRESS, &u64); - if (error == 0) - printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); - } - - if (!error && (get_vmcs_entry_interruption_info || get_all)) { - error = 
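
/*
 * Sketch (not part of this patch): the guard idiom these helpers use
 * throughout.  Every query is predicated on !error, so the first
 * failing vmm ioctl short-circuits the remaining queries and the final
 * return (error) reports it; the individual --get-* flags and
 * --get-all share a single code path.
 */
if (!error && (get_rip || get_all)) {
	error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
	if (error == 0)
		printf("rip[%d]\t\t0x%016lx\n", vcpu, rip);
}
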
vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); - if (error == 0) { - printf("entry_interruption_info[%d]\t0x%08lx\n", - vcpu, u64); - } - } - - if (!error && (get_eptp || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); - if (error == 0) - printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp); - } - - if (!error && (get_exception_bitmap || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, - &bm); - if (error == 0) - printf("exception_bitmap[%d]\t0x%08lx\n", vcpu, bm); - } - - if (!error && (get_io_bitmap || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm); - if (error == 0) - printf("io_bitmap_a[%d]\t0x%08lx\n", vcpu, bm); - error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm); - if (error == 0) - printf("io_bitmap_b[%d]\t0x%08lx\n", vcpu, bm); - } + return (error); +} - if (!error && (get_tsc_offset || get_all)) { - uint64_t tscoff; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff); - if (error == 0) - printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); - } +static int +get_misc_vmcs(struct vmctx *ctx, int vcpu) +{ + uint64_t ctl, cr0, cr3, cr4, rsp, rip, pat, addr, u64; + int error = 0; if (!error && (get_cr0_mask || get_all)) { uint64_t cr0mask; @@ -1259,7 +1062,7 @@ main(int argc, char *argv[]) error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT, &target_count); if (error == 0) { - printf("cr3_target_count[%d]\t0x%08lx\n", + printf("cr3_target_count[%d]\t0x%016lx\n", vcpu, target_count); } @@ -1292,57 +1095,55 @@ main(int argc, char *argv[]) } } - if (!error && (get_apic_access_addr || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr); + if (!error && (get_pinbased_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); if (error == 0) - printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr); + printf("pinbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); } - if (!error && (get_virtual_apic_addr || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr); + if (!error && (get_procbased_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_PRI_PROC_BASED_CTLS, &ctl); if (error == 0) - printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr); + printf("procbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); } - if (!error && (get_tpr_threshold || get_all)) { - uint64_t threshold; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, - &threshold); + if (!error && (get_procbased_ctls2 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_SEC_PROC_BASED_CTLS, &ctl); if (error == 0) - printf("tpr_threshold[%d]\t0x%08lx\n", vcpu, threshold); + printf("procbased_ctls2[%d]\t0x%016lx\n", vcpu, ctl); } - if (!error && (get_msr_bitmap_address || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); + if (!error && (get_vmcs_gla || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_LINEAR_ADDRESS, &u64); if (error == 0) - printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr); + printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); } - if (!error && (get_msr_bitmap || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); + if (!error && (get_vmcs_gpa || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_PHYSICAL_ADDRESS, &u64); if (error == 0) - error = dump_vmcs_msr_bitmap(vcpu, addr); + printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); } - if (!error && (get_vpid || get_all)) { - uint64_t vpid; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); - if (error == 0) - 
printf("vpid[%d]\t\t0x%04lx\n", vcpu, vpid); - } - - if (!error && (get_ple_window || get_all)) { - uint64_t window; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window); - if (error == 0) - printf("ple_window[%d]\t\t0x%08lx\n", vcpu, window); + if (!error && (get_vmcs_entry_interruption_info || + get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); + if (error == 0) { + printf("entry_interruption_info[%d]\t0x%016lx\n", + vcpu, u64); + } } - if (!error && (get_ple_gap || get_all)) { - uint64_t gap; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap); + if (!error && (get_tpr_threshold || get_all)) { + uint64_t threshold; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, + &threshold); if (error == 0) - printf("ple_gap[%d]\t\t0x%08lx\n", vcpu, gap); + printf("tpr_threshold[%d]\t0x%016lx\n", vcpu, threshold); } if (!error && (get_inst_err || get_all)) { @@ -1350,7 +1151,7 @@ main(int argc, char *argv[]) error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR, &insterr); if (error == 0) { - printf("instruction_error[%d]\t0x%08lx\n", + printf("instruction_error[%d]\t0x%016lx\n", vcpu, insterr); } } @@ -1358,13 +1159,13 @@ main(int argc, char *argv[]) if (!error && (get_exit_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl); if (error == 0) - printf("exit_ctls[%d]\t\t0x%08lx\n", vcpu, ctl); + printf("exit_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); } if (!error && (get_entry_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl); if (error == 0) - printf("entry_ctls[%d]\t\t0x%08lx\n", vcpu, ctl); + printf("entry_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); } if (!error && (get_host_pat || get_all)) { @@ -1373,12 +1174,6 @@ main(int argc, char *argv[]) printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat); } - if (!error && (get_guest_pat || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat); - if (error == 0) - printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); - } - if (!error && (get_host_cr0 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0); if (error == 0) @@ -1409,55 +1204,25 @@ main(int argc, char *argv[]) printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp); } - if (!error && (get_guest_sysenter || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_IA32_SYSENTER_CS, &cs); - if (error == 0) - printf("guest_sysenter_cs[%d]\t0x%08lx\n", vcpu, cs); - - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_IA32_SYSENTER_ESP, &rsp); - if (error == 0) - printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp); - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_IA32_SYSENTER_EIP, &rip); - if (error == 0) - printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip); - } - if (!error && (get_vmcs_link || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr); if (error == 0) printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr); } - if (!error && (get_vmcs_exit_reason || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64); - if (error == 0) - printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64); - } - - if (!error && (get_vmcs_exit_qualification || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, - &u64); - if (error == 0) - printf("vmcs_exit_qualification[%d]\t0x%016lx\n", - vcpu, u64); - } - if (!error && (get_vmcs_exit_interruption_info || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_INFO, &u64); if (error == 0) { - printf("vmcs_exit_interruption_info[%d]\t0x%08lx\n", + 
printf("vmcs_exit_interruption_info[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_vmcs_exit_interruption_error || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_ERRCODE, - &u64); + &u64); if (error == 0) { - printf("vmcs_exit_interruption_error[%d]\t0x%08lx\n", + printf("vmcs_exit_interruption_error[%d]\t0x%016lx\n", vcpu, u64); } } @@ -1466,58 +1231,1150 @@ main(int argc, char *argv[]) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_INTERRUPTIBILITY, &u64); if (error == 0) { - printf("vmcs_guest_interruptibility[%d]\t0x%08lx\n", + printf("vmcs_guest_interruptibility[%d]\t0x%016lx\n", vcpu, u64); } } - if (!error && setcap) { - int captype; - captype = vm_capability_name2type(capname); - error = vm_set_capability(ctx, vcpu, captype, capval); - if (error != 0 && errno == ENOENT) - printf("Capability \"%s\" is not available\n", capname); + if (!error && (get_vmcs_exit_inst_length || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_EXIT_INSTRUCTION_LENGTH, &u64); + if (error == 0) + printf("vmcs_exit_inst_length[%d]\t0x%08x\n", vcpu, + (uint32_t)u64); } - if (!error && (getcap || get_all)) { - int captype, val, getcaptype; + if (!error && (get_vmcs_exit_qualification || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, + &u64); + if (error == 0) + printf("vmcs_exit_qualification[%d]\t0x%016lx\n", + vcpu, u64); + } + + return (error); +} - if (getcap && capname) - getcaptype = vm_capability_name2type(capname); - else - getcaptype = -1; +static int +get_misc_vmcb(struct vmctx *ctx, int vcpu) +{ + uint64_t ctl, addr; + int error = 0; - for (captype = 0; captype < VM_CAP_MAX; captype++) { - if (getcaptype >= 0 && captype != getcaptype) - continue; - error = vm_get_capability(ctx, vcpu, captype, &val); - if (error == 0) { - printf("Capability \"%s\" is %s on vcpu %d\n", - vm_capability_type2name(captype), - val ? 
"set" : "not set", vcpu); - } else if (errno == ENOENT) { - error = 0; - printf("Capability \"%s\" is not available\n", - vm_capability_type2name(captype)); - } else { - break; - } - } + if (!error && (get_vmcb_intercept || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_CR_INTERCEPT, 4, + &ctl); + if (error == 0) + printf("cr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_DR_INTERCEPT, 4, + &ctl); + if (error == 0) + printf("dr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, + &ctl); + if (error == 0) + printf("exc_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST1_INTERCEPT, + 4, &ctl); + if (error == 0) + printf("inst1_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST2_INTERCEPT, + 4, &ctl); + if (error == 0) + printf("inst2_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); } - if (!error && run) { - error = vm_run(ctx, vcpu, &vmexit); + if (!error && (get_vmcb_tlb_ctrl || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_TLB_CTRL, + 4, &ctl); if (error == 0) - dump_vm_run_exitcode(&vmexit, vcpu); - else - printf("vm_run error %d\n", error); + printf("TLB ctrl[%d]\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_vmcb_exit_details || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO1, + 8, &ctl); + if (error == 0) + printf("exitinfo1[%d]\t0x%016lx\n", vcpu, ctl); + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO2, + 8, &ctl); + if (error == 0) + printf("exitinfo2[%d]\t0x%016lx\n", vcpu, ctl); + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINTINFO, + 8, &ctl); + if (error == 0) + printf("exitintinfo[%d]\t0x%016lx\n", vcpu, ctl); } + if (!error && (get_vmcb_virq || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_VIRQ, + 8, &ctl); + if (error == 0) + printf("v_irq/tpr[%d]\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_apic_access_addr || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_BAR, 8, + &addr); + if (error == 0) + printf("AVIC apic_bar[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_virtual_apic_addr || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PAGE, 8, + &addr); + if (error == 0) + printf("AVIC backing page[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_avic_table || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_LT, 8, + &addr); + if (error == 0) + printf("AVIC logical table[%d]\t0x%016lx\n", + vcpu, addr); + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PT, 8, + &addr); + if (error == 0) + printf("AVIC physical table[%d]\t0x%016lx\n", + vcpu, addr); + } + + return (error); +} + +static struct option * +setup_options(bool cpu_intel) +{ + const struct option common_opts[] = { + { "vm", REQ_ARG, 0, VMNAME }, + { "cpu", REQ_ARG, 0, VCPU }, + { "set-mem", REQ_ARG, 0, SET_MEM }, + { "set-efer", REQ_ARG, 0, SET_EFER }, + { "set-cr0", REQ_ARG, 0, SET_CR0 }, + { "set-cr2", REQ_ARG, 0, SET_CR2 }, + { "set-cr3", REQ_ARG, 0, SET_CR3 }, + { "set-cr4", REQ_ARG, 0, SET_CR4 }, + { "set-dr0", REQ_ARG, 0, SET_DR0 }, + { "set-dr1", REQ_ARG, 0, SET_DR1 }, + { "set-dr2", REQ_ARG, 0, SET_DR2 }, + { "set-dr3", REQ_ARG, 0, SET_DR3 }, + { "set-dr6", REQ_ARG, 0, SET_DR6 }, + { "set-dr7", REQ_ARG, 0, SET_DR7 }, + { "set-rsp", REQ_ARG, 0, SET_RSP }, + { "set-rip", REQ_ARG, 0, SET_RIP }, + { "set-rax", REQ_ARG, 0, SET_RAX }, + { "set-rflags", REQ_ARG, 0, 
SET_RFLAGS }, + { "desc-base", REQ_ARG, 0, DESC_BASE }, + { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, + { "desc-access",REQ_ARG, 0, DESC_ACCESS }, + { "set-cs", REQ_ARG, 0, SET_CS }, + { "set-ds", REQ_ARG, 0, SET_DS }, + { "set-es", REQ_ARG, 0, SET_ES }, + { "set-fs", REQ_ARG, 0, SET_FS }, + { "set-gs", REQ_ARG, 0, SET_GS }, + { "set-ss", REQ_ARG, 0, SET_SS }, + { "set-tr", REQ_ARG, 0, SET_TR }, + { "set-ldtr", REQ_ARG, 0, SET_LDTR }, + { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, + { "set-exception-bitmap", + REQ_ARG, 0, SET_EXCEPTION_BITMAP }, + { "capname", REQ_ARG, 0, CAPNAME }, + { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, + { "setcap", REQ_ARG, 0, SET_CAP }, + { "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP }, + { "assert-lapic-lvt", REQ_ARG, 0, ASSERT_LAPIC_LVT }, + { "get-rtc-time", NO_ARG, &get_rtc_time, 1 }, + { "set-rtc-time", REQ_ARG, 0, SET_RTC_TIME }, + { "rtc-nvram-offset", REQ_ARG, 0, RTC_NVRAM_OFFSET }, + { "get-rtc-nvram", NO_ARG, &get_rtc_nvram, 1 }, + { "set-rtc-nvram", REQ_ARG, 0, SET_RTC_NVRAM }, + { "getcap", NO_ARG, &getcap, 1 }, + { "get-stats", NO_ARG, &get_stats, 1 }, + { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, + { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, + { "get-desc-es",NO_ARG, &get_desc_es, 1 }, + { "set-desc-es",NO_ARG, &set_desc_es, 1 }, + { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, + { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, + { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, + { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, + { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, + { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, + { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, + { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, + { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, + { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, + { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, + { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, + { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, + { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, + { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, + { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, + { "get-memmap", NO_ARG, &get_memmap, 1 }, + { "get-memseg", NO_ARG, &get_memseg, 1 }, + { "get-efer", NO_ARG, &get_efer, 1 }, + { "get-cr0", NO_ARG, &get_cr0, 1 }, + { "get-cr2", NO_ARG, &get_cr2, 1 }, + { "get-cr3", NO_ARG, &get_cr3, 1 }, + { "get-cr4", NO_ARG, &get_cr4, 1 }, + { "get-dr0", NO_ARG, &get_dr0, 1 }, + { "get-dr1", NO_ARG, &get_dr1, 1 }, + { "get-dr2", NO_ARG, &get_dr2, 1 }, + { "get-dr3", NO_ARG, &get_dr3, 1 }, + { "get-dr6", NO_ARG, &get_dr6, 1 }, + { "get-dr7", NO_ARG, &get_dr7, 1 }, + { "get-rsp", NO_ARG, &get_rsp, 1 }, + { "get-rip", NO_ARG, &get_rip, 1 }, + { "get-rax", NO_ARG, &get_rax, 1 }, + { "get-rbx", NO_ARG, &get_rbx, 1 }, + { "get-rcx", NO_ARG, &get_rcx, 1 }, + { "get-rdx", NO_ARG, &get_rdx, 1 }, + { "get-rsi", NO_ARG, &get_rsi, 1 }, + { "get-rdi", NO_ARG, &get_rdi, 1 }, + { "get-rbp", NO_ARG, &get_rbp, 1 }, + { "get-r8", NO_ARG, &get_r8, 1 }, + { "get-r9", NO_ARG, &get_r9, 1 }, + { "get-r10", NO_ARG, &get_r10, 1 }, + { "get-r11", NO_ARG, &get_r11, 1 }, + { "get-r12", NO_ARG, &get_r12, 1 }, + { "get-r13", NO_ARG, &get_r13, 1 }, + { "get-r14", NO_ARG, &get_r14, 1 }, + { "get-r15", NO_ARG, &get_r15, 1 }, + { "get-rflags", NO_ARG, &get_rflags, 1 }, + { "get-cs", NO_ARG, &get_cs, 1 }, + { "get-ds", NO_ARG, &get_ds, 1 }, + { "get-es", NO_ARG, &get_es, 1 }, + { "get-fs", NO_ARG, &get_fs, 1 }, + { "get-gs", NO_ARG, &get_gs, 1 }, + { "get-ss", NO_ARG, &get_ss, 1 }, + { "get-tr", NO_ARG, &get_tr, 1 }, + { "get-ldtr", NO_ARG, &get_ldtr, 1 }, + { "get-eptp", 
NO_ARG, &get_eptp, 1 }, + { "get-exception-bitmap", + NO_ARG, &get_exception_bitmap, 1 }, + { "get-io-bitmap-address", + NO_ARG, &get_io_bitmap, 1 }, + { "get-tsc-offset", NO_ARG, &get_tsc_offset, 1 }, + { "get-msr-bitmap", + NO_ARG, &get_msr_bitmap, 1 }, + { "get-msr-bitmap-address", + NO_ARG, &get_msr_bitmap_address, 1 }, + { "get-guest-pat", NO_ARG, &get_guest_pat, 1 }, + { "get-guest-sysenter", + NO_ARG, &get_guest_sysenter, 1 }, + { "get-exit-reason", + NO_ARG, &get_exit_reason, 1 }, + { "get-x2apic-state", NO_ARG, &get_x2apic_state, 1 }, + { "get-all", NO_ARG, &get_all, 1 }, + { "run", NO_ARG, &run, 1 }, + { "create", NO_ARG, &create, 1 }, + { "destroy", NO_ARG, &destroy, 1 }, + { "inject-nmi", NO_ARG, &inject_nmi, 1 }, + { "force-reset", NO_ARG, &force_reset, 1 }, + { "force-poweroff", NO_ARG, &force_poweroff, 1 }, + { "get-active-cpus", NO_ARG, &get_active_cpus, 1 }, + { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, + { "get-intinfo", NO_ARG, &get_intinfo, 1 }, + { "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 }, +#ifndef __FreeBSD__ + { "wrlock-cycle", NO_ARG, &wrlock_cycle, 1 }, +#endif + }; + + const struct option intel_opts[] = { + { "get-vmcs-pinbased-ctls", + NO_ARG, &get_pinbased_ctls, 1 }, + { "get-vmcs-procbased-ctls", + NO_ARG, &get_procbased_ctls, 1 }, + { "get-vmcs-procbased-ctls2", + NO_ARG, &get_procbased_ctls2, 1 }, + { "get-vmcs-guest-linear-address", + NO_ARG, &get_vmcs_gla, 1 }, + { "get-vmcs-guest-physical-address", + NO_ARG, &get_vmcs_gpa, 1 }, + { "get-vmcs-entry-interruption-info", + NO_ARG, &get_vmcs_entry_interruption_info, 1}, + { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, + { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, + { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, + { "get-vmcs-cr4-shadow", NO_ARG, &get_cr4_shadow, 1 }, + { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1 }, + { "get-vmcs-tpr-threshold", + NO_ARG, &get_tpr_threshold, 1 }, + { "get-vmcs-vpid", NO_ARG, &get_vpid_asid, 1 }, + { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, + { "get-vmcs-entry-ctls", + NO_ARG, &get_entry_ctls, 1 }, + { "get-vmcs-instruction-error", + NO_ARG, &get_inst_err, 1 }, + { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, + { "get-vmcs-host-cr0", + NO_ARG, &get_host_cr0, 1 }, + { "set-vmcs-entry-interruption-info", + REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, + { "get-vmcs-exit-qualification", + NO_ARG, &get_vmcs_exit_qualification, 1 }, + { "get-vmcs-exit-inst-length", + NO_ARG, &get_vmcs_exit_inst_length, 1 }, + { "get-vmcs-interruptibility", + NO_ARG, &get_vmcs_interruptibility, 1 }, + { "get-vmcs-exit-interruption-error", + NO_ARG, &get_vmcs_exit_interruption_error, 1 }, + { "get-vmcs-exit-interruption-info", + NO_ARG, &get_vmcs_exit_interruption_info, 1 }, + { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, + { "get-vmcs-host-cr3", + NO_ARG, &get_host_cr3, 1 }, + { "get-vmcs-host-cr4", + NO_ARG, &get_host_cr4, 1 }, + { "get-vmcs-host-rip", + NO_ARG, &get_host_rip, 1 }, + { "get-vmcs-host-rsp", + NO_ARG, &get_host_rsp, 1 }, + { "get-apic-access-address", + NO_ARG, &get_apic_access_addr, 1}, + { "get-virtual-apic-address", + NO_ARG, &get_virtual_apic_addr, 1} + }; + + const struct option amd_opts[] = { + { "get-vmcb-intercepts", + NO_ARG, &get_vmcb_intercept, 1 }, + { "get-vmcb-asid", + NO_ARG, &get_vpid_asid, 1 }, + { "get-vmcb-exit-details", + NO_ARG, &get_vmcb_exit_details, 1 }, + { "get-vmcb-tlb-ctrl", + NO_ARG, &get_vmcb_tlb_ctrl, 1 }, + { "get-vmcb-virq", + NO_ARG, &get_vmcb_virq, 1 }, + { "get-avic-apic-bar", + 
NO_ARG, &get_apic_access_addr, 1 }, + { "get-avic-backing-page", + NO_ARG, &get_virtual_apic_addr, 1 }, + { "get-avic-table", + NO_ARG, &get_avic_table, 1 } + }; + + const struct option null_opt = { + NULL, 0, NULL, 0 + }; + + struct option *all_opts; + char *cp; + int optlen; + + optlen = sizeof(common_opts); + + if (cpu_intel) + optlen += sizeof(intel_opts); + else + optlen += sizeof(amd_opts); + + optlen += sizeof(null_opt); + + all_opts = malloc(optlen); + + cp = (char *)all_opts; + memcpy(cp, common_opts, sizeof(common_opts)); + cp += sizeof(common_opts); + + if (cpu_intel) { + memcpy(cp, intel_opts, sizeof(intel_opts)); + cp += sizeof(intel_opts); + } else { + memcpy(cp, amd_opts, sizeof(amd_opts)); + cp += sizeof(amd_opts); + } + + memcpy(cp, &null_opt, sizeof(null_opt)); + cp += sizeof(null_opt); + + return (all_opts); +} + +static const char * +wday_str(int idx) +{ + static const char *weekdays[] = { + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" + }; + + if (idx >= 0 && idx < 7) + return (weekdays[idx]); + else + return ("UNK"); +} + +static const char * +mon_str(int idx) +{ + static const char *months[] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" + }; + + if (idx >= 0 && idx < 12) + return (months[idx]); + else + return ("UNK"); +} + +static int +show_memmap(struct vmctx *ctx) +{ + char name[SPECNAMELEN + 1], numbuf[8]; + vm_ooffset_t segoff; + vm_paddr_t gpa; + size_t maplen, seglen; + int error, flags, prot, segid, delim; + + printf("Address Length Segment Offset "); + printf("Prot Flags\n"); + + gpa = 0; + while (1) { + error = vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &maplen, + &prot, &flags); + if (error) + return (errno == ENOENT ? 0 : error); + + error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); + if (error) + return (error); + + printf("%-12lX", gpa); + humanize_number(numbuf, sizeof(numbuf), maplen, "B", + HN_AUTOSCALE, HN_NOSPACE); + printf("%-12s", numbuf); + + printf("%-12s", name[0] ? name : "sysmem"); + printf("%-12lX", segoff); + printf("%c%c%c ", prot & PROT_READ ? 'R' : '-', + prot & PROT_WRITE ? 'W' : '-', + prot & PROT_EXEC ? 'X' : '-'); + + delim = '\0'; + if (flags & VM_MEMMAP_F_WIRED) { + printf("%cwired", delim); + delim = '/'; + } + if (flags & VM_MEMMAP_F_IOMMU) { + printf("%ciommu", delim); + delim = '/'; + } + printf("\n"); + + gpa += maplen; + } +} + +static int +show_memseg(struct vmctx *ctx) +{ + char name[SPECNAMELEN + 1], numbuf[8]; + size_t seglen; + int error, segid; + + printf("ID Length Name\n"); + + segid = 0; + while (1) { + error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); + if (error) + return (errno == EINVAL ? 0 : error); + + if (seglen) { + printf("%-4d", segid); + humanize_number(numbuf, sizeof(numbuf), seglen, "B", + HN_AUTOSCALE, HN_NOSPACE); + printf("%-12s", numbuf); + printf("%s", name[0] ? 
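
/*
 * Sketch (not part of this patch): the iteration contract behind
 * show_memmap() above.  vm_mmap_getnext() returns the first mapping at
 * or above *gpa; bumping gpa past each mapping walks the whole
 * guest-physical map, and ENOENT simply marks the end of the table.
 */
vm_paddr_t gpa = 0;
vm_ooffset_t segoff;
size_t maplen;
int segid, prot, flags;

while (vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &maplen,
    &prot, &flags) == 0) {
	/* ... format one row, e.g. via humanize_number(3) ... */
	gpa += maplen;		/* resume the scan past this mapping */
}
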
name : "sysmem"); + printf("\n"); + } + segid++; + } +} + +int +main(int argc, char *argv[]) +{ + char *vmname; + int error, ch, vcpu, ptenum; + vm_paddr_t gpa_pmap; + struct vm_exit vmexit; + uint64_t rax, cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; + uint64_t rsp, rip, rflags, efer, pat; + uint64_t eptp, bm, addr, u64, pteval[4], *pte, info[2]; + struct vmctx *ctx; + cpuset_t cpus; + bool cpu_intel; + uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + struct tm tm; + struct option *opts; + + cpu_intel = cpu_vendor_intel(); + opts = setup_options(cpu_intel); + + vcpu = 0; + vmname = NULL; + assert_lapic_lvt = -1; + progname = basename(argv[0]); + + while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { + switch (ch) { + case 0: + break; + case VMNAME: + vmname = optarg; + break; + case VCPU: + vcpu = atoi(optarg); + break; + case SET_MEM: + memsize = atoi(optarg) * MB; + memsize = roundup(memsize, 2 * MB); + break; + case SET_EFER: + efer = strtoul(optarg, NULL, 0); + set_efer = 1; + break; + case SET_CR0: + cr0 = strtoul(optarg, NULL, 0); + set_cr0 = 1; + break; + case SET_CR2: + cr2 = strtoul(optarg, NULL, 0); + set_cr2 = 1; + break; + case SET_CR3: + cr3 = strtoul(optarg, NULL, 0); + set_cr3 = 1; + break; + case SET_CR4: + cr4 = strtoul(optarg, NULL, 0); + set_cr4 = 1; + break; + case SET_DR0: + dr0 = strtoul(optarg, NULL, 0); + set_dr0 = 1; + break; + case SET_DR1: + dr1 = strtoul(optarg, NULL, 0); + set_dr1 = 1; + break; + case SET_DR2: + dr2 = strtoul(optarg, NULL, 0); + set_dr2 = 1; + break; + case SET_DR3: + dr3 = strtoul(optarg, NULL, 0); + set_dr3 = 1; + break; + case SET_DR6: + dr6 = strtoul(optarg, NULL, 0); + set_dr6 = 1; + break; + case SET_DR7: + dr7 = strtoul(optarg, NULL, 0); + set_dr7 = 1; + break; + case SET_RSP: + rsp = strtoul(optarg, NULL, 0); + set_rsp = 1; + break; + case SET_RIP: + rip = strtoul(optarg, NULL, 0); + set_rip = 1; + break; + case SET_RAX: + rax = strtoul(optarg, NULL, 0); + set_rax = 1; + break; + case SET_RFLAGS: + rflags = strtoul(optarg, NULL, 0); + set_rflags = 1; + break; + case DESC_BASE: + desc_base = strtoul(optarg, NULL, 0); + break; + case DESC_LIMIT: + desc_limit = strtoul(optarg, NULL, 0); + break; + case DESC_ACCESS: + desc_access = strtoul(optarg, NULL, 0); + break; + case SET_CS: + cs = strtoul(optarg, NULL, 0); + set_cs = 1; + break; + case SET_DS: + ds = strtoul(optarg, NULL, 0); + set_ds = 1; + break; + case SET_ES: + es = strtoul(optarg, NULL, 0); + set_es = 1; + break; + case SET_FS: + fs = strtoul(optarg, NULL, 0); + set_fs = 1; + break; + case SET_GS: + gs = strtoul(optarg, NULL, 0); + set_gs = 1; + break; + case SET_SS: + ss = strtoul(optarg, NULL, 0); + set_ss = 1; + break; + case SET_TR: + tr = strtoul(optarg, NULL, 0); + set_tr = 1; + break; + case SET_LDTR: + ldtr = strtoul(optarg, NULL, 0); + set_ldtr = 1; + break; + case SET_X2APIC_STATE: + x2apic_state = strtol(optarg, NULL, 0); + set_x2apic_state = 1; + break; + case SET_EXCEPTION_BITMAP: + exception_bitmap = strtoul(optarg, NULL, 0); + set_exception_bitmap = 1; + break; + case SET_VMCS_ENTRY_INTERRUPTION_INFO: + vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); + set_vmcs_entry_interruption_info = 1; + break; + case SET_CAP: + capval = strtoul(optarg, NULL, 0); + setcap = 1; + break; + case SET_RTC_TIME: + rtc_secs = strtoul(optarg, NULL, 0); + set_rtc_time = 1; + break; + case SET_RTC_NVRAM: + rtc_nvram_value = (uint8_t)strtoul(optarg, NULL, 0); + set_rtc_nvram = 1; + break; + case RTC_NVRAM_OFFSET: + rtc_nvram_offset = strtoul(optarg, NULL, 0); + 
break; + case GET_GPA_PMAP: + gpa_pmap = strtoul(optarg, NULL, 0); + get_gpa_pmap = 1; + break; + case CAPNAME: + capname = optarg; + break; + case UNASSIGN_PPTDEV: + unassign_pptdev = 1; + if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) + usage(cpu_intel); + break; + case ASSERT_LAPIC_LVT: + assert_lapic_lvt = atoi(optarg); + break; + default: + usage(cpu_intel); + } + } + argc -= optind; + argv += optind; + + if (vmname == NULL) + usage(cpu_intel); + + error = 0; + + if (!error && create) + error = vm_create(vmname); + + if (!error) { + ctx = vm_open(vmname); + if (ctx == NULL) { + printf("VM \"%s\" has not been created.\n", vmname); + exit(1); + } + } + +#ifndef __FreeBSD__ + if (!error && wrlock_cycle) { + error = vm_wrlock_cycle(ctx); + exit(error); + } +#endif /* __FreeBSD__ */ + + if (!error && memsize) + error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); + + if (!error && set_efer) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); + + if (!error && set_cr0) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); + + if (!error && set_cr2) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR2, cr2); + + if (!error && set_cr3) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); + + if (!error && set_cr4) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); + + if (!error && set_dr0) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR0, dr0); + + if (!error && set_dr1) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR1, dr1); + + if (!error && set_dr2) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR2, dr2); + + if (!error && set_dr3) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR3, dr3); + + if (!error && set_dr6) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR6, dr6); + + if (!error && set_dr7) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); + + if (!error && set_rsp) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); + + if (!error && set_rip) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); + + if (!error && set_rax) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); + + if (!error && set_rflags) { + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + rflags); + } + + if (!error && set_desc_ds) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_es) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ss) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_cs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_fs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_tr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ldtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gdtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_desc_idtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_cs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); +
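[Editor's aside] The run of "if (!error && set_*)" guards above and below leans on short-circuit evaluation: the first vm_set_register() or vm_set_desc() call that fails latches a non-zero error, and every later step is skipped, so only the first failure is reported. A minimal sketch of the same idiom in isolation follows; struct regset and set_reg_chain() are hypothetical names invented for illustration (they are not in this patch), while vm_set_register() is the libvmmapi call used throughout, assumed declared via <vmmapi.h>.

/*
 * Illustration only: a table-driven form of the set_* chain above.
 * struct regset and set_reg_chain() are hypothetical; the error
 * handling mirrors main(): the first failure wins, later sets are
 * skipped.
 */
struct regset {
	int		enabled;	/* mirrors one set_* flag */
	int		regnum;		/* a VM_REG_GUEST_* identifier */
	uint64_t	value;		/* value parsed from the option */
};

static int
set_reg_chain(struct vmctx *ctx, int vcpu, const struct regset *rs, int n)
{
	int i, error = 0;

	for (i = 0; i < n && error == 0; i++) {
		if (rs[i].enabled)
			error = vm_set_register(ctx, vcpu, rs[i].regnum,
			    rs[i].value);
	}
	return (error);
}

The table-driven form trades the explicit per-register code for data, at the cost of hiding the per-register special cases (vm_set_desc(), for one, takes different arguments), which is presumably why the utility spells each step out.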
+ if (!error && set_ds) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); + + if (!error && set_es) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); + + if (!error && set_fs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); + + if (!error && set_gs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); + + if (!error && set_ss) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); + + if (!error && set_tr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); + + if (!error && set_ldtr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); + + if (!error && set_x2apic_state) + error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); + + if (!error && unassign_pptdev) + error = vm_unassign_pptdev(ctx, bus, slot, func); + + if (!error && set_exception_bitmap) { + if (cpu_intel) + error = vm_set_vmcs_field(ctx, vcpu, + VMCS_EXCEPTION_BITMAP, + exception_bitmap); + else + error = vm_set_vmcb_field(ctx, vcpu, + VMCB_OFF_EXC_INTERCEPT, + 4, exception_bitmap); + } + + if (!error && cpu_intel && set_vmcs_entry_interruption_info) { + error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, + vmcs_entry_interruption_info); + } + + if (!error && inject_nmi) { + error = vm_inject_nmi(ctx, vcpu); + } + + if (!error && assert_lapic_lvt != -1) { + error = vm_lapic_local_irq(ctx, vcpu, assert_lapic_lvt); + } + + if (!error && (get_memseg || get_all)) + error = show_memseg(ctx); + + if (!error && (get_memmap || get_all)) + error = show_memmap(ctx); + + if (!error) + error = get_all_registers(ctx, vcpu); + + if (!error) + error = get_all_segments(ctx, vcpu); + + if (!error) { + if (cpu_intel) + error = get_misc_vmcs(ctx, vcpu); + else + error = get_misc_vmcb(ctx, vcpu); + } + + if (!error && (get_x2apic_state || get_all)) { + error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); + if (error == 0) + printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state); + } + + if (!error && (get_eptp || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); + else + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_NPT_BASE, + 8, &eptp); + if (error == 0) + printf("%s[%d]\t\t0x%016lx\n", + cpu_intel ? 
"eptp" : "rvi/npt", vcpu, eptp); + } + + if (!error && (get_exception_bitmap || get_all)) { + if(cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_EXCEPTION_BITMAP, &bm); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_EXC_INTERCEPT, + 4, &bm); + if (error == 0) + printf("exception_bitmap[%d]\t%#lx\n", vcpu, bm); + } + + if (!error && (get_io_bitmap || get_all)) { + if (cpu_intel) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, + &bm); + if (error == 0) + printf("io_bitmap_a[%d]\t%#lx\n", vcpu, bm); + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, + &bm); + if (error == 0) + printf("io_bitmap_b[%d]\t%#lx\n", vcpu, bm); + } else { + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_IO_PERM, 8, &bm); + if (error == 0) + printf("io_bitmap[%d]\t%#lx\n", vcpu, bm); + } + } + + if (!error && (get_tsc_offset || get_all)) { + uint64_t tscoff; + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, + &tscoff); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_TSC_OFFSET, + 8, &tscoff); + if (error == 0) + printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); + } + + if (!error && (get_msr_bitmap_address || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, + &addr); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_MSR_PERM, 8, &addr); + if (error == 0) + printf("msr_bitmap[%d]\t\t%#lx\n", vcpu, addr); + } + + if (!error && (get_msr_bitmap || get_all)) { + if (cpu_intel) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_MSR_BITMAP, &addr); + } else { + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_MSR_PERM, 8, + &addr); + } + + if (error == 0) + error = dump_msr_bitmap(vcpu, addr, cpu_intel); + } + + if (!error && (get_vpid_asid || get_all)) { + uint64_t vpid; + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); + else + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_ASID, + 4, &vpid); + if (error == 0) + printf("%s[%d]\t\t0x%04lx\n", + cpu_intel ? 
"vpid" : "asid", vcpu, vpid); + } + + if (!error && (get_guest_pat || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_PAT, &pat); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_GUEST_PAT, 8, &pat); + if (error == 0) + printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); + } + + if (!error && (get_guest_sysenter || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_CS, + &cs); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_SYSENTER_CS, 8, + &cs); + + if (error == 0) + printf("guest_sysenter_cs[%d]\t%#lx\n", vcpu, cs); + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_ESP, + &rsp); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_SYSENTER_ESP, 8, + &rsp); + + if (error == 0) + printf("guest_sysenter_sp[%d]\t%#lx\n", vcpu, rsp); + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_EIP, + &rip); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_SYSENTER_EIP, 8, + &rip); + if (error == 0) + printf("guest_sysenter_ip[%d]\t%#lx\n", vcpu, rip); + } + + if (!error && (get_exit_reason || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, + &u64); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_EXIT_REASON, 8, + &u64); + if (error == 0) + printf("exit_reason[%d]\t%#lx\n", vcpu, u64); + } + + if (!error && setcap) { + int captype; + captype = vm_capability_name2type(capname); + error = vm_set_capability(ctx, vcpu, captype, capval); + if (error != 0 && errno == ENOENT) + printf("Capability \"%s\" is not available\n", capname); + } + + if (!error && get_gpa_pmap) { + error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum); + if (error == 0) { + printf("gpa %#lx:", gpa_pmap); + pte = &pteval[0]; + while (ptenum-- > 0) + printf(" %#lx", *pte++); + printf("\n"); + } + } + + if (!error && set_rtc_nvram) + error = vm_rtc_write(ctx, rtc_nvram_offset, rtc_nvram_value); + + if (!error && (get_rtc_nvram || get_all)) { + error = vm_rtc_read(ctx, rtc_nvram_offset, &rtc_nvram_value); + if (error == 0) { + printf("rtc nvram[%03d]: 0x%02x\n", rtc_nvram_offset, + rtc_nvram_value); + } + } + + if (!error && set_rtc_time) + error = vm_rtc_settime(ctx, rtc_secs); + + if (!error && (get_rtc_time || get_all)) { + error = vm_rtc_gettime(ctx, &rtc_secs); + if (error == 0) { + gmtime_r(&rtc_secs, &tm); + printf("rtc time %#lx: %s %s %02d %02d:%02d:%02d %d\n", + rtc_secs, wday_str(tm.tm_wday), mon_str(tm.tm_mon), + tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, + 1900 + tm.tm_year); + } + } + + if (!error && (getcap || get_all)) { + int captype, val, getcaptype; + + if (getcap && capname) + getcaptype = vm_capability_name2type(capname); + else + getcaptype = -1; + + for (captype = 0; captype < VM_CAP_MAX; captype++) { + if (getcaptype >= 0 && captype != getcaptype) + continue; + error = vm_get_capability(ctx, vcpu, captype, &val); + if (error == 0) { + printf("Capability \"%s\" is %s on vcpu %d\n", + vm_capability_type2name(captype), + val ? 
"set" : "not set", vcpu); + } else if (errno == ENOENT) { + error = 0; + printf("Capability \"%s\" is not available\n", + vm_capability_type2name(captype)); + } else { + break; + } + } + } + + if (!error && (get_active_cpus || get_all)) { + error = vm_active_cpus(ctx, &cpus); + if (!error) + print_cpus("active cpus", &cpus); + } + + if (!error && (get_suspended_cpus || get_all)) { + error = vm_suspended_cpus(ctx, &cpus); + if (!error) + print_cpus("suspended cpus", &cpus); + } + + if (!error && (get_intinfo || get_all)) { + error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]); + if (!error) { + print_intinfo("pending", info[0]); + print_intinfo("current", info[1]); + } + } + + if (!error && (get_stats || get_all)) { + int i, num_stats; + uint64_t *stats; + struct timeval tv; + const char *desc; + + stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); + if (stats != NULL) { + printf("vcpu%d stats:\n", vcpu); + for (i = 0; i < num_stats; i++) { + desc = vm_get_stat_desc(ctx, i); + printf("%-40s\t%ld\n", desc, stats[i]); + } + } + } + + if (!error && (get_cpu_topology || get_all)) { + uint16_t sockets, cores, threads, maxcpus; + + vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); + printf("cpu_topology:\tsockets=%hu, cores=%hu, threads=%hu, " + "maxcpus=%hu\n", sockets, cores, threads, maxcpus); + } + + if (!error && run) { + error = vm_run(ctx, vcpu, &vmexit); + if (error == 0) + dump_vm_run_exitcode(&vmexit, vcpu); + else + printf("vm_run error %d\n", error); + } + + if (!error && force_reset) + error = vm_suspend(ctx, VM_SUSPEND_RESET); + + if (!error && force_poweroff) + error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); + if (error) printf("errno = %d\n", errno); if (!error && destroy) - error = vm_destroy(ctx); + vm_destroy(ctx); + free (opts); exit(error); } diff --git a/usr/src/cmd/bhyveload-uefi/Makefile b/usr/src/cmd/bhyveload-uefi/Makefile deleted file mode 100644 index bbcbacf32f..0000000000 --- a/usr/src/cmd/bhyveload-uefi/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -PROG = bhyveload-uefi - -include ../Makefile.cmd - -$(BUILD64)SUBDIRS += $(MACH64) - -all := TARGET = all -install := TARGET = install -clean := TARGET = clean -clobber := TARGET = clobber -lint := TARGET = lint - -.KEEP_STATE: - -all clean clobber lint: $(SUBDIRS) - -install: $(SUBDIRS) - -$(RM) $(ROOTUSRSBINPROG) - -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) - -$(SUBDIRS): FRC - @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) - -FRC: - -include ../Makefile.targ diff --git a/usr/src/cmd/bhyveload-uefi/Makefile.com b/usr/src/cmd/bhyveload-uefi/Makefile.com deleted file mode 100644 index 7865cca8d8..0000000000 --- a/usr/src/cmd/bhyveload-uefi/Makefile.com +++ /dev/null @@ -1,52 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. 
A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -PROG= bhyveload-uefi - -SRCS = ../bhyveload-uefi.c expand_number.c -OBJS = bhyveload-uefi.o expand_number.o - -include ../../Makefile.cmd - -.KEEP_STATE: - -CFLAGS += $(CCVERBOSE) -CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ - -I$(ROOT)/usr/platform/i86pc/include -LDLIBS += -lvmmapi - -all: $(PROG) - -$(PROG): $(OBJS) - $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) - $(POST_PROCESS) - -install: all $(ROOTUSRSBINPROG) - -clean: - $(RM) $(OBJS) - -lint: lint_SRCS - -include ../../Makefile.targ - -%.o: ../%.c - $(COMPILE.c) $< - $(POST_PROCESS_O) - -%.o: $(CONTRIB)/freebsd/lib/libutil/%.c - $(COMPILE.c) $< - $(POST_PROCESS_O) - diff --git a/usr/src/cmd/bhyveload-uefi/amd64/Makefile b/usr/src/cmd/bhyveload-uefi/amd64/Makefile deleted file mode 100644 index b602c50d05..0000000000 --- a/usr/src/cmd/bhyveload-uefi/amd64/Makefile +++ /dev/null @@ -1,21 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -include ../Makefile.com -include ../../Makefile.cmd.64 - -CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 - -install: all $(ROOTUSRSBINPROG64) diff --git a/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c b/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c deleted file mode 100644 index 62a7ca5d0f..0000000000 --- a/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2013 Pluribus Networks Inc. - */ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include - -#define KB (1024UL) -#define MB (1024 * 1024UL) -#define GB (1024 * 1024 * 1024UL) - -#define UEFI_ROM_ADDR 0xFFE00000 -#define UEFI_ROM_SIZE (2 * MB) -/* - * N.B. the UEFI code zeros the first page in memory so use the second. 
- */ -#define BHYVE_HOB_ADDR 0x00002000 -#define BHYVE_BO_HOB_ADDR 0x00002080 - -#define UEFI_ROM_PATH "/usr/share/bhyve/uefi-rom.bin" - -struct platform_info { - uint32_t ncpus; -}; - -/* - * Boot order code: - * 0 - EFI_CD_HD - * 1 - EFI_CD - * 2 - EFI_HD_CD - * 3 - EFI_HD - * 4 - EFI_NET - * 5 - EFI_NET_CD_HD - * 6 - EFI_HD_HD_CD - * 7 - LEGACY_CD_HD - * 8 - LEGACY_CD - * 9 - LEGACY_HD_CD - * 10 - LEGACY_HD - * 11 - EFI_SHELL - */ - -struct bootorder_info { - uint32_t guestbootorder; -}; - -static char *vmname, *progname; -static struct vmctx *ctx; - -static void -usage(void) -{ - printf("usage: %s " - "[-c vcpus] [-m mem-size] [-b bootorder]" - "\n", progname); - exit(1); -} - -int -main(int argc, char** argv) -{ - int opt, error, fd; - int guest_ncpus; - int guest_bootorder = 0; - uint64_t mem_size; - char *membase, *rombase; - struct platform_info *pi; - struct bootorder_info *bi; - - progname = argv[0]; - - guest_ncpus = 1; - mem_size = 256 * MB; - - while ((opt = getopt(argc, argv, "c:m:b:")) != -1) { - switch (opt) { - case 'c': - guest_ncpus = atoi(optarg); - break; - case 'm': - error = vm_parse_memsize(optarg, &mem_size); - if (error != 0 || mem_size == 0) - errx(EX_USAGE, "Invalid memsize '%s'", optarg); - break; - case 'b': - guest_bootorder = atoi(optarg); - if (guest_bootorder < 0 || guest_bootorder > 11) { - errx(EX_USAGE, "Invalid bootoption: %d\n" - "\tBoot order code:\n" - "\t0 - EFI_CD_HD\n" - "\t1 - EFI_CD\n" - "\t2 - EFI_HD_CD\n" - "\t3 - EFI_HD\n" - "\t4 - EFI_NET\n" - "\t5 - EFI_NET_CD_HD\n" - "\t6 - EFI_HD_HD_CD\n" - "\t7 - LEGACY_CD_HD\n" - "\t8 - LEGACY_CD\n" - "\t9 - LEGACY_HD_CD\n" - "\t10 - LEGACY_HD\n" - "\t11 - EFI_SHELL\n", guest_bootorder); - exit(1); - } - break; - case '?': - usage(); - } - } - - argc -= optind; - argv += optind; - - if (argc != 1) - usage(); - - vmname = argv[0]; - error = vm_create(vmname); - if (error != 0 && errno != EEXIST) { - perror("vm_create"); - exit(1); - - } - - ctx = vm_open(vmname); - if (ctx == NULL) { - perror("vm_open"); - exit(1); - } - - error = vm_set_capability(ctx, 0, VM_CAP_UNRESTRICTED_GUEST, 1); - if (error) { - perror("vm_set_capability(VM_CAP_UNRESTRICTED_GUEST)"); - } - - error = vm_setup_memory(ctx, mem_size, VM_MMAP_ALL); - if (error) { - perror("vm_setup_memory"); - exit(1); - } - membase = vm_map_gpa(ctx, 0, 8 * KB); - - error = vm_setup_rom(ctx, UEFI_ROM_ADDR, UEFI_ROM_SIZE); - if (error) { - perror("vm_setup_rom"); - exit(1); - } - rombase = vm_map_gpa(ctx, UEFI_ROM_ADDR, UEFI_ROM_SIZE); - - fd = open(UEFI_ROM_PATH, O_RDONLY); - if (fd == -1) { - perror("open"); - exit(1); - } - read(fd, rombase, UEFI_ROM_SIZE); - close(fd); - - pi = (struct platform_info *)(membase + BHYVE_HOB_ADDR); - pi->ncpus = guest_ncpus; - bi = (struct bootorder_info *)(membase + BHYVE_BO_HOB_ADDR); - bi->guestbootorder = guest_bootorder; - - error = vcpu_reset(ctx, 0); - if (error) { - perror("vcpu_reset"); - exit(1); - } - - return (0); -} diff --git a/usr/src/cmd/bhyveload-uefi/i386/Makefile b/usr/src/cmd/bhyveload-uefi/i386/Makefile deleted file mode 100644 index f5b7bb6915..0000000000 --- a/usr/src/cmd/bhyveload-uefi/i386/Makefile +++ /dev/null @@ -1,18 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. 
A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -include ../Makefile.com - -install: all $(ROOTUSRSBINPROG32) diff --git a/usr/src/cmd/devfsadm/i386/misc_link_i386.c b/usr/src/cmd/devfsadm/i386/misc_link_i386.c index 2b678df527..4aeea7d294 100644 --- a/usr/src/cmd/devfsadm/i386/misc_link_i386.c +++ b/usr/src/cmd/devfsadm/i386/misc_link_i386.c @@ -45,6 +45,7 @@ static int vt00(di_minor_t minor, di_node_t node); static int kdmouse(di_minor_t minor, di_node_t node); static int ipmi(di_minor_t minor, di_node_t node); static int mc_node(di_minor_t minor, di_node_t node); +static int vmmctl(di_minor_t minor, di_node_t node); static devfsadm_create_t misc_cbt[] = { { "vt00", "ddi_display", NULL, @@ -84,6 +85,9 @@ static devfsadm_create_t misc_cbt[] = { { "pseudo", "ddi_pseudo", "ucode", TYPE_EXACT | DRV_EXACT, ILEVEL_0, ln_minor_name, }, + { "pseudo", "ddi_pseudo", "vmm", + TYPE_EXACT | DRV_EXACT, ILEVEL_0, vmmctl, + } }; DEVFSADM_CREATE_INIT_V0(misc_cbt); @@ -109,6 +113,9 @@ static devfsadm_remove_t misc_remove_cbt[] = { }, { "serial", "^tty[a-z]$", RM_ALWAYS | RM_PRE, ILEVEL_1, devfsadm_rm_all + }, + { "pseudo", "^vmmctl$", RM_ALWAYS | RM_PRE | RM_HOT, + ILEVEL_0, devfsadm_rm_all } }; @@ -345,3 +352,14 @@ mc_node(di_minor_t minor, di_node_t node) (void) devfsadm_mklink(linkpath, node, minor, 0); return (DEVFSADM_CONTINUE); } + +/* + * /dev/vmmctl -> /devices/pseudo/vmm@0:ctl + */ +static int +vmmctl(di_minor_t minor, di_node_t node) +{ + if (strcmp(di_minor_name(minor), "ctl") == 0) + (void) devfsadm_mklink("vmmctl", node, minor, 0); + return (DEVFSADM_CONTINUE); +} diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/Makefile b/usr/src/cmd/mdb/intel/amd64/vmm/Makefile deleted file mode 100644 index bf9219b435..0000000000 --- a/usr/src/cmd/mdb/intel/amd64/vmm/Makefile +++ /dev/null @@ -1,20 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2014 Pluribus Networks Inc. -# - -MAKEVARS = CW_NO_SHADOW=true __GNUC= - -include $(SRC)/Makefile.master -$(BUILD64)SUBDIRS += $(MACH64) -include ../../../Makefile.subdirs diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile b/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile deleted file mode 100644 index 49ca0c5eb3..0000000000 --- a/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. 
-# - -MODULE = vmm.so -MDBTGT = kvm - -MODSRCS = vmm.c - -include ../../../../../Makefile.cmd -include ../../../../../Makefile.cmd.64 -include ../../../Makefile.amd64 -include ../../../../Makefile.module - -CPPFLAGS = -D_KERNEL -D_MACHDEP -CPPFLAGS += -I$(COMPAT)/freebsd -I$(COMPAT)/freebsd/amd64 -CPPFLAGS += -I$(CONTRIB)/freebsd -I$(CONTRIB)/freebsd/amd64 -CPPFLAGS += -I$(SRC)/uts/common -I$(SRC)/uts/i86pc -CPPFLAGS += -I$(SRC)/cmd/mdb/common - -CPPFLAGS += -_cc=-xdryrun diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c b/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c deleted file mode 100644 index 9e29d8662a..0000000000 --- a/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c +++ /dev/null @@ -1,238 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2014 Pluribus Networks Inc. - */ - -#include - -#include -#include -#include -#include -#include - -/* - * VMM trace debug walker/dcmd code - */ - -/* - * Initialize the vmm_trace_dmsg_t walker by either using the given starting - * address, or reading the value of the kernel's vmm_debug_rbuf pointer. - * We also allocate a vmm_trace_dmsg_t for storage, and save this using the - * walk_data pointer. - */ -static int -vmm_dmsg_walk_i(mdb_walk_state_t *wsp) -{ - uintptr_t rbuf_addr; - vmm_trace_rbuf_t rbuf; - - if (wsp->walk_addr == NULL) { - if (mdb_readvar(&rbuf_addr, "vmm_debug_rbuf") == -1) { - mdb_warn("failed to read 'vmm_debug_rbuf'"); - return (WALK_ERR); - } - - if (mdb_vread(&rbuf, sizeof (vmm_trace_rbuf_t), rbuf_addr) - == -1) { - mdb_warn("failed to read vmm_trace_rbuf_t at %p", - rbuf_addr); - return (WALK_ERR); - } - - wsp->walk_addr = (uintptr_t)(vmm_trace_dmsg_t *)rbuf.dmsgh; - } - - /* - * Save ptr to head of ring buffer to prevent looping. - */ - wsp->walk_arg = (void *)wsp->walk_addr; - wsp->walk_data = mdb_alloc(sizeof (vmm_trace_dmsg_t), UM_SLEEP); - return (WALK_NEXT); -} - -/* - * At each step, read a vmm_trace_dmsg_t into our private storage, and then - * invoke the callback function. We terminate when we reach a NULL next - * pointer. - */ -static int -vmm_dmsg_walk_s(mdb_walk_state_t *wsp) -{ - int status; - - if (wsp->walk_addr == NULL) - return (WALK_DONE); - - if (mdb_vread(wsp->walk_data, sizeof (vmm_trace_dmsg_t), - wsp->walk_addr) == -1) { - mdb_warn("failed to read vmm_trace_dmsg_t at %p", - wsp->walk_addr); - return (WALK_ERR); - } - - status = wsp->walk_callback(wsp->walk_addr, wsp->walk_data, - wsp->walk_cbdata); - - wsp->walk_addr = - (uintptr_t)(((vmm_trace_dmsg_t *)wsp->walk_data)->next); - - /* - * If we've looped then we're done. - */ - if (wsp->walk_addr == (uintptr_t)wsp->walk_arg) - wsp->walk_addr = NULL; - - return (status); -} - -/* - * The walker's fini function is invoked at the end of each walk. Since we - * dynamically allocated a vmm_trace_dmsg_t in vmm_dmsg_walk_i, we must - * free it now. - */ -static void -vmm_dmsg_walk_f(mdb_walk_state_t *wsp) -{ - mdb_free(wsp->walk_data, sizeof (vmm_trace_dmsg_t)); -} - -/* - * This routine is used by the vmm_dmsg_dump dcmd to dump content of - * VMM trace ring buffer. 
- */ -int -vmm_dmsg_dump(vmm_trace_dmsg_t *addr, int print_pathname, uint_t *printed) -{ - vmm_trace_dmsg_t dmsg, *dmsgh = addr; - char pathname[MAXPATHLEN]; - char merge[1024]; - - while (addr != NULL) { - if (mdb_vread(&dmsg, sizeof (dmsg), (uintptr_t)addr) != - sizeof (dmsg)) { - mdb_warn("failed to read message pointer in kernel"); - return (DCMD_ERR); - } - - (void) mdb_snprintf(merge, sizeof (merge), - "[%Y:%03d:%03d:%03d] : %s", - dmsg.timestamp.tv_sec, - (int)dmsg.timestamp.tv_nsec/1000000, - (int)(dmsg.timestamp.tv_nsec/1000)%1000, - (int)dmsg.timestamp.tv_nsec%1000, - dmsg.buf); - - mdb_printf("%s", merge); - - if (printed != NULL) { - (*printed)++; - } - - if (((addr = dmsg.next) == NULL) || (dmsg.next == dmsgh)) { - break; - } - } - - return (DCMD_OK); -} - -/* - * 1. Process flag passed to vmm_dmsg_dump dcmd. - * 2. Obtain VMM trace ring buffer pointer. - * 3. Pass VMM trace ring buffer pointer to vmm_dmsg_dump() - * to dump content of VMM trace ring buffer. - */ -int -vmm_rbuf_dump(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) -{ - vmm_trace_rbuf_t rbuf; - uint_t printed = 0; /* have we printed anything? */ - int print_pathname = FALSE; - int rval = DCMD_OK; - - if (argc > 1) { - return (DCMD_USAGE); - } - - if (mdb_getopts(argc, argv, - 'a', MDB_OPT_SETBITS, TRUE, &print_pathname) != argc) { - return (DCMD_USAGE); - } - - /* - * If ring buffer address not provided try to obtain - * it using vmm_debug_rbuf global. - */ - if ((addr == NULL) || !(flags & DCMD_ADDRSPEC)) { - if (mdb_readvar(&addr, "vmm_debug_rbuf") == -1) { - mdb_warn("Failed to read 'vmm_debug_rbuf'."); - return (DCMD_ERR); - } - } - - if (mdb_vread(&rbuf, sizeof (rbuf), addr) != sizeof (rbuf)) { - mdb_warn("Failed to read ring buffer in kernel."); - return (DCMD_ERR); - } - - if (rbuf.dmsgh == NULL) { - mdb_printf("The vmm trace ring buffer is empty.\n"); - return (DCMD_OK); - } - - rval = vmm_dmsg_dump((vmm_trace_dmsg_t *)rbuf.dmsgh, - print_pathname, &printed); - - if (rval != DCMD_OK) { - return (rval); - } - - if (printed == 0) { - mdb_warn("Failed to read vmm trace ring buffer."); - return (DCMD_ERR); - } - - return (rval); -} - -/* - * MDB module linkage information: - * - * We declare a list of structures describing our dcmds, a list of structures - * describing our walkers, and a function named _mdb_init to return a pointer - * to our module information. 
- */ - -static const mdb_dcmd_t dcmds[] = { - { "vmm_dmsg_dump", "[-a]", "Dump vmm trace debug messages", - vmm_rbuf_dump }, - { NULL } -}; - -static const mdb_walker_t walkers[] = { - { "vmm_dmsg", - "walk ring buffer containing vmm trace debug messages", - vmm_dmsg_walk_i, vmm_dmsg_walk_s, vmm_dmsg_walk_f }, - { NULL } -}; - -static const mdb_modinfo_t modinfo = { - MDB_API_VERSION, dcmds, walkers -}; - -const mdb_modinfo_t * -_mdb_init(void) -{ - return (&modinfo); -} diff --git a/usr/src/compat/freebsd/amd64/machine/asmacros.h b/usr/src/compat/freebsd/amd64/machine/asmacros.h index fcf35a7b78..1f6955130b 100644 --- a/usr/src/compat/freebsd/amd64/machine/asmacros.h +++ b/usr/src/compat/freebsd/amd64/machine/asmacros.h @@ -25,4 +25,7 @@ x: #define END(x) \ .size x, [.-x] +#define ALIGN_TEXT \ + .p2align 4,0x90; /* 16-byte alignment, nop filled */ + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_ASMACROS_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/atomic.h b/usr/src/compat/freebsd/amd64/machine/atomic.h index 5b78143d21..1da9724b7d 100644 --- a/usr/src/compat/freebsd/amd64/machine/atomic.h +++ b/usr/src/compat/freebsd/amd64/machine/atomic.h @@ -11,31 +11,20 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ -static __inline u_char -atomic_load_acq_char(volatile u_char *p) -{ - u_char res; - - __asm volatile("lock ; " "cmpxchgb %b0,%1" - : "=a" (res), "=m" (*p) - : "m" (*p) : "memory", "cc"); - return (res); -} - -static __inline u_short +static __inline u_int atomic_load_acq_short(volatile u_short *p) { u_short res; - __asm volatile("lock ; " "cmpxchgw %w0,%1" - : "=a" (res), "=m" (*p) - : "m" (*p) - : "memory", "cc"); + res = *p; + __asm volatile("" : : : "memory"); + return (res); } @@ -44,10 +33,9 @@ atomic_load_acq_int(volatile u_int *p) { u_int res; - __asm volatile("lock ; " "cmpxchgl %0,%1" - : "=a" (res), "=m" (*p) - : "m" (*p) - : "memory", "cc"); + res = *p; + __asm volatile("" : : : "memory"); + return (res); } @@ -56,25 +44,10 @@ atomic_load_acq_long(volatile u_long *p) { u_long res; - __asm volatile("lock ; " "cmpxchgq %0,%1" - : "=a" (res), "=m" (*p) - : "m" (*p) - : "memory", "cc"); - return (res); -} - -static __inline void -atomic_store_rel_char(volatile u_char *p, u_char v) -{ + res = *p; __asm volatile("" : : : "memory"); - *p = v; -} -static __inline void -atomic_store_rel_short(volatile u_short *p, u_short v) -{ - __asm volatile("" : : : "memory"); - *p = v; + return (res); } static __inline void @@ -134,6 +107,23 @@ atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src) return (res); } +static __inline int +atomic_testandset_int(volatile u_int *p, u_int v) +{ + u_char res; + + __asm __volatile( + " lock ; " + " btsl %2,%1 ; " + " setc %0 ; " + "# atomic_testandset_int" + : "=q" (res), /* 0 */ + "+m" (*p) /* 1 */ + : "Ir" (v & 0x1f) /* 2 */ + : "cc"); + return (res); +} + /* * Atomically add the value of v to the integer pointed to by p and return * the previous value of *p. @@ -226,6 +216,13 @@ atomic_swap_long(volatile u_long *p, u_long v) return (v); } + +#define atomic_store_short(p, v) \ + (*(volatile u_short *)(p) = (u_short)(v)) +#define atomic_store_int(p, v) \ + (*(volatile u_int *)(p) = (u_int)(v)) + + #define atomic_readandclear_int(p) atomic_swap_int(p, 0) #define atomic_readandclear_long(p) atomic_swap_long(p, 0) @@ -241,4 +238,25 @@ atomic_swap_long(volatile u_long *p, u_long v) /* Operations on pointers. 
*/ #define atomic_cmpset_ptr atomic_cmpset_long +/* Needed for the membar functions */ +#include_next + +static __inline void +atomic_thread_fence_rel(void) +{ + /* Equivalent to their __compiler_membar() */ + __asm __volatile(" " : : : "memory"); +} + +static __inline void +atomic_thread_fence_seq_cst(void) +{ + /* Equivalent to their !KERNEL storeload_barrer() */ + __asm __volatile("lock; addl $0,-8(%%rsp)" : : : "memory", "cc"); +} + +#define mb() membar_enter() +#define rmb() membar_consumer() +#define wmb() membar_producer() + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/cpufunc.h b/usr/src/compat/freebsd/amd64/machine/cpufunc.h index cf485e947c..0b7bcdaa59 100644 --- a/usr/src/compat/freebsd/amd64/machine/cpufunc.h +++ b/usr/src/compat/freebsd/amd64/machine/cpufunc.h @@ -16,6 +16,8 @@ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ +#include + static __inline u_long bsfq(u_long mask) { @@ -65,6 +67,12 @@ cpuid_count(u_int ax, u_int cx, u_int *p) : "0" (ax), "c" (cx)); } +static __inline void +disable_intr(void) +{ + __asm __volatile("cli"); +} + static __inline void enable_intr(void) { @@ -95,6 +103,15 @@ flsll(long long mask) return (flsl((long)mask)); } +static __inline u_long +read_rflags(void) +{ + u_long rf; + + __asm __volatile("pushfq; popq %0" : "=r" (rf)); + return (rf); +} + static __inline uint64_t rdmsr(u_int msr) { @@ -107,10 +124,10 @@ rdmsr(u_int msr) static __inline uint64_t rdtsc(void) { - uint32_t low, high; - - __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); - return (low | ((uint64_t)high << 32)); + extern hrtime_t tsc_gethrtimeunscaled_delta(void); + + /* Get the TSC reading with any needed synch offset applied */ + return ((uint64_t)tsc_gethrtimeunscaled_delta()); } static __inline void @@ -162,4 +179,133 @@ rcr4(void) return (data); } +static __inline u_long +rxcr(u_int reg) +{ + u_int low, high; + + __asm __volatile("xgetbv" : "=a" (low), "=d" (high) : "c" (reg)); + return (low | ((uint64_t)high << 32)); +} + +static __inline void +load_xcr(u_int reg, u_long val) +{ + u_int low, high; + + low = val; + high = val >> 32; + __asm __volatile("xsetbv" : : "c" (reg), "a" (low), "d" (high)); +} + +static __inline void +write_rflags(u_long rf) +{ + __asm __volatile("pushq %0; popfq" : : "r" (rf)); +} + +static __inline uint64_t +rdr0(void) +{ + uint64_t data; + __asm __volatile("movq %%dr0,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr0(uint64_t dr0) +{ + __asm __volatile("movq %0,%%dr0" : : "r" (dr0)); +} + +static __inline uint64_t +rdr1(void) +{ + uint64_t data; + __asm __volatile("movq %%dr1,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr1(uint64_t dr1) +{ + __asm __volatile("movq %0,%%dr1" : : "r" (dr1)); +} + +static __inline uint64_t +rdr2(void) +{ + uint64_t data; + __asm __volatile("movq %%dr2,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr2(uint64_t dr2) +{ + __asm __volatile("movq %0,%%dr2" : : "r" (dr2)); +} + +static __inline uint64_t +rdr3(void) +{ + uint64_t data; + __asm __volatile("movq %%dr3,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr3(uint64_t dr3) +{ + __asm __volatile("movq %0,%%dr3" : : "r" (dr3)); +} + +static __inline uint64_t +rdr6(void) +{ + uint64_t data; + __asm __volatile("movq %%dr6,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr6(uint64_t dr6) +{ + __asm __volatile("movq %0,%%dr6" : : "r" (dr6)); +} 
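[Editor's aside] The debug-register accessors being added to cpufunc.h (together with rdr7()/load_dr7(), defined just below) pair naturally into save/restore helpers. The sketch that follows is a hypothetical usage example, not part of the patch: struct dbg_regs and both helpers are illustrative names only, and in a real header they would have to sit after the rdr7()/load_dr7() definitions.

/*
 * Illustration only: snapshot and restore the debug registers using
 * the rdrN()/load_drN() accessors above. struct dbg_regs and both
 * helpers are hypothetical.
 */
struct dbg_regs {
	uint64_t dr0, dr1, dr2, dr3, dr6, dr7;
};

static __inline void
dbg_regs_save(struct dbg_regs *d)
{
	d->dr0 = rdr0();
	d->dr1 = rdr1();
	d->dr2 = rdr2();
	d->dr3 = rdr3();
	d->dr6 = rdr6();
	d->dr7 = rdr7();
}

static __inline void
dbg_regs_restore(const struct dbg_regs *d)
{
	load_dr0(d->dr0);
	load_dr1(d->dr1);
	load_dr2(d->dr2);
	load_dr3(d->dr3);
	load_dr6(d->dr6);
	/*
	 * Load DR7 last: it holds the breakpoint enable bits, so the
	 * address registers are already in place when breakpoints re-arm.
	 */
	load_dr7(d->dr7);
}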
+ +static __inline uint64_t +rdr7(void) +{ + uint64_t data; + __asm __volatile("movq %%dr7,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr7(uint64_t dr7) +{ + __asm __volatile("movq %0,%%dr7" : : "r" (dr7)); +} + +#ifdef _KERNEL +/* + * Including the native sys/segments.h in userspace seriously conflicts with + * the FreeBSD compat/contrib headers. + */ +#include + +static __inline void +lldt(u_short sel) +{ + wr_ldtr(sel); +} + +static __inline u_short +sldt() +{ + return (rd_ldtr()); +} +#endif /* _KERNEL */ + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/fpu.h b/usr/src/compat/freebsd/amd64/machine/fpu.h index 48e686780c..6bc651d996 100644 --- a/usr/src/compat/freebsd/amd64/machine/fpu.h +++ b/usr/src/compat/freebsd/amd64/machine/fpu.h @@ -11,13 +11,12 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright (c) 2018, Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ -#define XSAVE_AREA_ALIGN 64 - void fpuexit(kthread_t *td); void fpurestore(void *); void fpusave(void *); diff --git a/usr/src/compat/freebsd/amd64/machine/iodev.h b/usr/src/compat/freebsd/amd64/machine/iodev.h new file mode 100644 index 0000000000..c7cdddc817 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/iodev.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_IODEV_H +#define _COMPAT_FREEBSD_AMD64_MACHINE_IODEV_H + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_IODEV_H */ diff --git a/usr/src/compat/freebsd/amd64/machine/md_var.h b/usr/src/compat/freebsd/amd64/machine/md_var.h index 60fdd566e5..ed57a8bebc 100644 --- a/usr/src/compat/freebsd/amd64/machine/md_var.h +++ b/usr/src/compat/freebsd/amd64/machine/md_var.h @@ -21,4 +21,8 @@ extern u_int cpu_exthigh; /* Highest arg to extended CPUID */ extern u_int cpu_id; /* Stepping ID */ extern char cpu_vendor[]; /* CPU Origin code */ +#include + +#define Maxmem (physmax + 1) + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/param.h b/usr/src/compat/freebsd/amd64/machine/param.h index eaca5ab8d7..b152f4d526 100644 --- a/usr/src/compat/freebsd/amd64/machine/param.h +++ b/usr/src/compat/freebsd/amd64/machine/param.h @@ -36,4 +36,6 @@ /* Size of the level 4 page-map level-4 table units */ #define NPML4EPG (PAGE_SIZE/(sizeof (pml4_entry_t))) +#define CACHE_LINE_SIZE 64 + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PARAM_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/pmap.h b/usr/src/compat/freebsd/amd64/machine/pmap.h index d0303bdd56..ce3185629b 100644 --- a/usr/src/compat/freebsd/amd64/machine/pmap.h +++ b/usr/src/compat/freebsd/amd64/machine/pmap.h @@ -1,3 +1,54 @@ +/* + * All rights reserved. This copyright notice is Copyright Management + * Information under 17 USC 1202 and is included to protect this work and + * deter copyright infringement. 
Removal or alteration of this Copyright + * Management Information without the express written permission from + * Pluribus Networks Inc is prohibited, and any such unauthorized removal + * or alteration will be a violation of federal law. + * + * Copyright (c) 2003 Peter Wemm. + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Derived from hp300 version by Mike Hibler, this version by William + * Jolitz uses a recursive map [a pde points to the page directory] to + * map the page tables using the pagetables themselves. This is done to + * reduce the impact on kernel virtual memory for lots of sparse address + * space, and to reduce the cost of memory to each process. + * + * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 + * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 + * $FreeBSD$ + */ + /* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. @@ -13,32 +64,426 @@ * Copyright 2014 Pluribus Networks Inc. */ + #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ +/* + * Page-directory and page-table entries follow this format, with a few + * of the fields not present here and there, depending on a lot of things. 
+ */ /* ---- Intel Nomenclature ---- */ -#define PG_V 0x001 /* P Valid */ -#define PG_RW 0x002 /* R/W Read/Write */ -#define PG_U 0x004 /* U/S User/Supervisor */ -#define PG_A 0x020 /* A Accessed */ -#define PG_M 0x040 /* D Dirty */ -#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ +#define X86_PG_V 0x001 /* P Valid */ +#define X86_PG_RW 0x002 /* R/W Read/Write */ +#define X86_PG_U 0x004 /* U/S User/Supervisor */ +#define X86_PG_NC_PWT 0x008 /* PWT Write through */ +#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */ +#define X86_PG_A 0x020 /* A Accessed */ +#define X86_PG_M 0x040 /* D Dirty */ +#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ +#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */ +#define X86_PG_G 0x100 /* G Global */ +#define X86_PG_AVAIL1 0x200 /* / Available for system */ +#define X86_PG_AVAIL2 0x400 /* < programmers use */ +#define X86_PG_AVAIL3 0x800 /* \ */ +#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */ +#define X86_PG_NX (1ul<<63) /* No-execute */ +#define X86_PG_AVAIL(x) (1ul << (x)) + +/* Page level cache control fields used to determine the PAT type */ +#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) +#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) + +/* + * Intel extended page table (EPT) bit definitions. + */ +#define EPT_PG_READ 0x001 /* R Read */ +#define EPT_PG_WRITE 0x002 /* W Write */ +#define EPT_PG_EXECUTE 0x004 /* X Execute */ +#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */ +#define EPT_PG_PS 0x080 /* PS Page size */ +#define EPT_PG_A 0x100 /* A Accessed */ +#define EPT_PG_M 0x200 /* D Dirty */ +#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */ + +/* + * Define the PG_xx macros in terms of the bits on x86 PTEs. + */ +#define PG_V X86_PG_V +#define PG_RW X86_PG_RW +#define PG_U X86_PG_U +#define PG_NC_PWT X86_PG_NC_PWT +#define PG_NC_PCD X86_PG_NC_PCD +#define PG_A X86_PG_A +#define PG_M X86_PG_M +#define PG_PS X86_PG_PS +#define PG_PTE_PAT X86_PG_PTE_PAT +#define PG_G X86_PG_G +#define PG_AVAIL1 X86_PG_AVAIL1 +#define PG_AVAIL2 X86_PG_AVAIL2 +#define PG_AVAIL3 X86_PG_AVAIL3 +#define PG_PDE_PAT X86_PG_PDE_PAT +#define PG_NX X86_PG_NX +#define PG_PDE_CACHE X86_PG_PDE_CACHE +#define PG_PTE_CACHE X86_PG_PTE_CACHE + +/* Our various interpretations of the above */ +#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */ +#define PG_MANAGED X86_PG_AVAIL2 +#define EPT_PG_EMUL_V X86_PG_AVAIL(52) +#define EPT_PG_EMUL_RW X86_PG_AVAIL(53) +#define PG_PROMOTED X86_PG_AVAIL(54) /* PDE only */ +#define PG_FRAME (0x000ffffffffff000ul) +#define PG_PS_FRAME (0x000fffffffe00000ul) + +/* + * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB + * (PTE) page mappings have identical settings for the following fields: + */ +#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \ + PG_M | PG_A | PG_U | PG_RW | PG_V) /* * Page Protection Exception bits */ + #define PGEX_P 0x01 /* Protection violation vs. not present */ #define PGEX_W 0x02 /* during a Write cycle */ #define PGEX_U 0x04 /* access from User mode (UPL) */ #define PGEX_RSV 0x08 /* reserved PTE field is non-zero */ #define PGEX_I 0x10 /* during an instruction fetch */ +/* + * undef the PG_xx macros that define bits in the regular x86 PTEs that + * have a different position in nested PTEs. This is done when compiling + * code that needs to be aware of the differences between regular x86 and + * nested PTEs. + * + * The appropriate bitmask will be calculated at runtime based on the pmap + * type. 
+ */ +#ifdef AMD64_NPT_AWARE +#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */ +#undef PG_G +#undef PG_A +#undef PG_M +#undef PG_PDE_PAT +#undef PG_PDE_CACHE +#undef PG_PTE_PAT +#undef PG_PTE_CACHE +#undef PG_RW +#undef PG_V +#endif + +/* + * Pte related macros. This is complicated by having to deal with + * the sign extension of the 48th bit. + */ +#define KVADDR(l4, l3, l2, l1) ( \ + ((unsigned long)-1 << 47) | \ + ((unsigned long)(l4) << PML4SHIFT) | \ + ((unsigned long)(l3) << PDPSHIFT) | \ + ((unsigned long)(l2) << PDRSHIFT) | \ + ((unsigned long)(l1) << PAGE_SHIFT)) + +#define UVADDR(l4, l3, l2, l1) ( \ + ((unsigned long)(l4) << PML4SHIFT) | \ + ((unsigned long)(l3) << PDPSHIFT) | \ + ((unsigned long)(l2) << PDRSHIFT) | \ + ((unsigned long)(l1) << PAGE_SHIFT)) + +/* + * Number of kernel PML4 slots. Can be anywhere from 1 to 64 or so, + * but setting it larger than NDMPML4E makes no sense. + * + * Each slot provides .5 TB of kernel virtual space. + */ +#define NKPML4E 4 + +#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */ +#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */ +#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */ + +/* + * NDMPML4E is the maximum number of PML4 entries that will be + * used to implement the direct map. It must be a power of two, + * and should generally exceed NKPML4E. The maximum possible + * value is 64; using 128 will make the direct map intrude into + * the recursive page table map. + */ +#define NDMPML4E 8 + +/* + * These values control the layout of virtual memory. The starting address + * of the direct map, which is controlled by DMPML4I, must be a multiple of + * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.) + * + * Note: KPML4I is the index of the (single) level 4 page that maps + * the KVA that holds KERNBASE, while KPML4BASE is the index of the + * first level 4 page that maps VM_MIN_KERNEL_ADDRESS. If NKPML4E + * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra + * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to + * KERNBASE. + * + * (KPML4I combines with KPDPI to choose where KERNBASE starts. + * Or, in other words, KPML4I provides bits 39..47 of KERNBASE, + * and KPDPI provides bits 30..38.) + */ +#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */ + +#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */ +#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */ + +#define KPML4I (NPML4EPG-1) +#define KPDPI (NPDPEPG-2) /* kernbase at -2GB */ + +/* + * XXX doesn't really belong here I guess... + */ +#define ISA_HOLE_START 0xa0000 +#define ISA_HOLE_LENGTH (0x100000-ISA_HOLE_START) + +#define PMAP_PCID_NONE 0xffffffff +#define PMAP_PCID_KERN 0 +#define PMAP_PCID_OVERMAX 0x1000 + +#ifndef LOCORE + +#ifdef __FreeBSD__ +#include +#include +#include +#include + +#include +#endif /* __FreeBSD__ */ + typedef u_int64_t pd_entry_t; typedef u_int64_t pt_entry_t; typedef u_int64_t pdp_entry_t; typedef u_int64_t pml4_entry_t; +/* + * Address of current address space page table maps and directories. 
+ */ +#ifdef _KERNEL +#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0)) +#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0)) +#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0)) +#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)) +#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t))) +#define PTmap ((pt_entry_t *)(addr_PTmap)) +#define PDmap ((pd_entry_t *)(addr_PDmap)) +#define PDPmap ((pd_entry_t *)(addr_PDPmap)) +#define PML4map ((pd_entry_t *)(addr_PML4map)) +#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e)) + +extern int nkpt; /* Initial number of kernel page tables */ +extern u_int64_t KPDPphys; /* physical address of kernel level 3 */ +extern u_int64_t KPML4phys; /* physical address of kernel level 4 */ + +/* + * virtual address to page table entry and + * to physical address. + * Note: these work recursively, thus vtopte of a pte will give + * the corresponding pde that in turn maps it. + */ +pt_entry_t *vtopte(vm_offset_t); #define vtophys(va) pmap_kextract(((vm_offset_t) (va))) -vm_paddr_t pmap_kextract(vm_offset_t va); +#ifndef __FreeBSD__ +extern vm_paddr_t pmap_kextract(vm_offset_t); +#endif + +#define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) +#define pte_load_clear(ptep) atomic_swap_long(ptep, 0) +#define pte_store(ptep, pte) do { \ + *(u_long *)(ptep) = (u_long)(pte); \ +} while (0) +#define pte_clear(ptep) pte_store(ptep, 0) + +#define pde_store(pdep, pde) pte_store(pdep, pde) + +extern pt_entry_t pg_nx; + +#endif /* _KERNEL */ + +#ifdef __FreeBSD__ +/* + * Pmap stuff + */ +struct pv_entry; +struct pv_chunk; + +/* + * Locks + * (p) PV list lock + */ +struct md_page { + TAILQ_HEAD(, pv_entry) pv_list; /* (p) */ + int pv_gen; /* (p) */ + int pat_mode; +}; +#endif /* __FreeBSD__ */ + +enum pmap_type { + PT_X86, /* regular x86 page tables */ + PT_EPT, /* Intel's nested page tables */ + PT_RVI, /* AMD's nested page tables */ +}; + +#ifdef __FreeBSD__ +struct pmap_pcids { + uint32_t pm_pcid; + uint32_t pm_gen; +}; + +/* + * The kernel virtual address (KVA) of the level 4 page table page is always + * within the direct map (DMAP) region. 
+ */ +struct pmap { + struct mtx pm_mtx; + pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ + uint64_t pm_cr3; + TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ + cpuset_t pm_active; /* active on cpus */ + enum pmap_type pm_type; /* regular or nested tables */ + struct pmap_statistics pm_stats; /* pmap statistics */ + struct vm_radix pm_root; /* spare page table pages */ + long pm_eptgen; /* EPT pmap generation id */ + int pm_flags; + struct pmap_pcids pm_pcids[MAXCPU]; +}; +#endif /* __FreeBSD__ */ + +/* flags */ +#define PMAP_NESTED_IPIMASK 0xff +#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ +#define PMAP_EMULATE_AD_BITS (1 << 9) /* needs A/D bits emulation */ +#define PMAP_SUPPORTS_EXEC_ONLY (1 << 10) /* execute only mappings ok */ + +typedef struct pmap *pmap_t; + +#ifdef _KERNEL +extern struct pmap kernel_pmap_store; +#define kernel_pmap (&kernel_pmap_store) + +#define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx) +#define PMAP_LOCK_ASSERT(pmap, type) \ + mtx_assert(&(pmap)->pm_mtx, (type)) +#define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) +#define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \ + NULL, MTX_DEF | MTX_DUPOK) +#define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) +#define PMAP_MTX(pmap) (&(pmap)->pm_mtx) +#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) +#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) + +int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags); +int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); +#endif + +#ifdef __FreeBSD__ +/* + * For each vm_page_t, there is a list of all currently valid virtual + * mappings of that page. An entry is a pv_entry_t, the list is pv_list. + */ +typedef struct pv_entry { + vm_offset_t pv_va; /* virtual address for mapping */ + TAILQ_ENTRY(pv_entry) pv_next; +} *pv_entry_t; + +/* + * pv_entries are allocated in chunks per-process. This avoids the + * need to track per-pmap assignments. 
+ */ +#define _NPCM 3 +#define _NPCPV 168 +struct pv_chunk { + pmap_t pc_pmap; + TAILQ_ENTRY(pv_chunk) pc_list; + uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ + TAILQ_ENTRY(pv_chunk) pc_lru; + struct pv_entry pc_pventry[_NPCPV]; +}; + +#ifdef _KERNEL + +extern caddr_t CADDR1; +extern pt_entry_t *CMAP1; +extern vm_paddr_t phys_avail[]; +extern vm_paddr_t dump_avail[]; +extern vm_offset_t virtual_avail; +extern vm_offset_t virtual_end; +extern vm_paddr_t dmaplimit; +extern int pmap_pcid_enabled; +extern int invpcid_works; + +#define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) +#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) +#define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) + +struct thread; + +void pmap_activate_sw(struct thread *); +void pmap_bootstrap(vm_paddr_t *); +int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde); +int pmap_change_attr(vm_offset_t, vm_size_t, int); +void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate); +void pmap_init_pat(void); +void pmap_kenter(vm_offset_t va, vm_paddr_t pa); +void *pmap_kenter_temporary(vm_paddr_t pa, int i); +vm_paddr_t pmap_kextract(vm_offset_t); +void pmap_kremove(vm_offset_t); +void *pmap_mapbios(vm_paddr_t, vm_size_t); +void *pmap_mapdev(vm_paddr_t, vm_size_t); +void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); +boolean_t pmap_page_is_mapped(vm_page_t m); +void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma); +void pmap_pinit_pml4(vm_page_t); +void pmap_unmapdev(vm_offset_t, vm_size_t); +void pmap_invalidate_page(pmap_t, vm_offset_t); +void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); +void pmap_invalidate_all(pmap_t); +void pmap_invalidate_cache(void); +void pmap_invalidate_cache_pages(vm_page_t *pages, int count); +void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, + boolean_t force); +void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); +boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +#endif /* _KERNEL */ + +/* Return various clipped indexes for a given VA */ +static __inline vm_pindex_t +pmap_pte_index(vm_offset_t va) +{ + + return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pde_index(vm_offset_t va) +{ + + return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pdpe_index(vm_offset_t va) +{ + + return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pml4e_index(vm_offset_t va) +{ + + return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); +} + +#endif /* __FreeBSD__ */ +#endif /* !LOCORE */ -#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ */ +#endif /* !_COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/reg.h b/usr/src/compat/freebsd/amd64/machine/reg.h new file mode 100644 index 0000000000..4a73463603 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/reg.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_REG_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_REG_H_ + +#define DBREG_DR6_RESERVED1 0xffff0ff0 +#define DBREG_DR7_RESERVED1 0x0400 + + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_REG_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/smp.h b/usr/src/compat/freebsd/amd64/machine/smp.h index ef719b9684..9c4f2d111b 100644 --- a/usr/src/compat/freebsd/amd64/machine/smp.h +++ b/usr/src/compat/freebsd/amd64/machine/smp.h @@ -11,9 +11,20 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ +#ifdef _KERNEL + +/* + * APIC-related functions are replaced with native calls rather than shims + * which attempt to replicate the FreeBSD interfaces. This is empty, but will + * remain present to appease sources which wish to include the path. + */ + +#endif /* _KERNEL */ + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/specialreg.h b/usr/src/compat/freebsd/amd64/machine/specialreg.h new file mode 100644 index 0000000000..871573ea6b --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/specialreg.h @@ -0,0 +1,61 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SPECIALREG_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_SPECIALREG_H_ + +#ifdef _SYS_X86_ARCHEXT_H +/* Our x86_archext conflicts with BSD header for the XFEATURE_ defines */ +#undef XFEATURE_AVX +#undef XFEATURE_MPX +#undef XFEATURE_AVX512 +#endif + +#ifdef _SYS_CONTROLREGS_H +/* Our CR4 defines conflict with BSD header */ +#undef CR4_VME +#undef CR4_PVI +#undef CR4_TSD +#undef CR4_DE +#undef CR4_PSE +#undef CR4_PAE +#undef CR4_MCE +#undef CR4_PGE +#undef CR4_PCE +#undef CR4_VMXE +#undef CR4_SMEP +#undef CR4_SMAP +#undef CR4_PKE +#undef CR4_PCIDE +#endif /* _SYS_CONTROLREGS_H */ + +#ifdef _SYS_X86_ARCHEXT_H +/* Our IA32 speculation-related defines conflict with BSD header */ +#undef IA32_ARCH_CAP_RDCL_NO +#undef IA32_ARCH_CAP_IBRS_ALL +#undef IA32_ARCH_CAP_RSBA +#undef IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY +#undef IA32_ARCH_CAP_SSB_NO +#undef IA32_ARCH_CAP_MDS_NO +#undef IA32_SPEC_CTRL_IBRS +#undef IA32_SPEC_CTRL_STIBP +#undef IA32_SPEC_CTRL_SSBD +#undef IA32_FLUSH_CMD_L1D +#undef MSR_IA32_SPEC_CTRL +#undef MSR_IA32_PRED_CMD +#endif /* _SYS_X86_ARCHEXT_H */ + +#include +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SPECIALREG_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmm.h b/usr/src/compat/freebsd/amd64/machine/vmm.h index 79c3ec959e..1c54c0830d 100644 --- a/usr/src/compat/freebsd/amd64/machine/vmm.h +++ b/usr/src/compat/freebsd/amd64/machine/vmm.h @@ -11,11 +11,14 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. 
*/ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ +#include + #include #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmparam.h b/usr/src/compat/freebsd/amd64/machine/vmparam.h index c80c2af545..c76a3259f3 100644 --- a/usr/src/compat/freebsd/amd64/machine/vmparam.h +++ b/usr/src/compat/freebsd/amd64/machine/vmparam.h @@ -11,9 +11,35 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ +extern caddr_t kpm_vbase; +extern size_t kpm_size; + +static inline uintptr_t +phys_to_dmap(uintptr_t pa) +{ + ASSERT3U(pa, <, kpm_size); + return ((uintptr_t)kpm_vbase + pa); +} + +static inline uintptr_t +dmap_to_phys(uintptr_t kva) +{ + const uintptr_t base = (uintptr_t)kpm_vbase; + + ASSERT3U(kva, >=, base); + ASSERT3U(kva, <, base + kpm_size); + + return (kva - base); +} + +#define PHYS_TO_DMAP(x) phys_to_dmap(x) +#define DMAP_TO_PHYS(x) dmap_to_phys(x) + + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ */ diff --git a/usr/src/compat/freebsd/err.h b/usr/src/compat/freebsd/err.h new file mode 100644 index 0000000000..40d144e025 --- /dev/null +++ b/usr/src/compat/freebsd/err.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_ERR_H_ +#define _COMPAT_FREEBSD_ERR_H_ + +#define errc(code, num, ...) err(code, __VA_ARGS__) + +#include_next + +#endif /* _COMPAT_FREEBSD_ERR_H_ */ diff --git a/usr/src/compat/freebsd/libutil.h b/usr/src/compat/freebsd/libutil.h index e22ffc0551..f899d4425e 100644 --- a/usr/src/compat/freebsd/libutil.h +++ b/usr/src/compat/freebsd/libutil.h @@ -17,5 +17,19 @@ #define _COMPAT_FREEBSD_LIBUTIL_H_ int expand_number(const char *_buf, uint64_t *_num); +int humanize_number(char *_buf, size_t _len, int64_t _number, + const char *_suffix, int _scale, int _flags); + +/* Values for humanize_number(3)'s flags parameter. */ +#define HN_DECIMAL 0x01 +#define HN_NOSPACE 0x02 +#define HN_B 0x04 +#define HN_DIVISOR_1000 0x08 +#define HN_IEC_PREFIXES 0x10 + +/* Values for humanize_number(3)'s scale parameter. */ +#define HN_GETSCALE 0x10 +#define HN_AUTOSCALE 0x20 + #endif /* _COMPAT_FREEBSD_LIBUTIL_H_ */ diff --git a/usr/src/compat/freebsd/net/ethernet.h b/usr/src/compat/freebsd/net/ethernet.h index a0d5a828c6..dcd3a58925 100644 --- a/usr/src/compat/freebsd/net/ethernet.h +++ b/usr/src/compat/freebsd/net/ethernet.h @@ -11,11 +11,25 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ #define _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ +#define ether_addr_octet octet + #include +/* + * Some basic Ethernet constants. + */ +#define ETHER_ADDR_LEN 6 /* length of an Ethernet address */ +#define ETHER_CRC_LEN 4 /* length of the Ethernet CRC */ +#define ETHER_MIN_LEN 64 /* minimum frame len, including CRC */ + +#define ETHER_VLAN_ENCAP_LEN 4 /* len of 802.1Q VLAN encapsulation */ + +#define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? 
*/ + #endif /* _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ */ diff --git a/usr/src/compat/freebsd/pthread_np.h b/usr/src/compat/freebsd/pthread_np.h index 641c58f406..c4f76b259c 100644 --- a/usr/src/compat/freebsd/pthread_np.h +++ b/usr/src/compat/freebsd/pthread_np.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_PTHREAD_NP_H_ @@ -20,8 +21,9 @@ #include #include +#include -#define pthread_set_name_np(thread, name) +#define pthread_set_name_np pthread_setname_np #define pthread_mutex_isowned_np(x) _mutex_held(x) diff --git a/usr/src/compat/freebsd/sys/_cpuset.h b/usr/src/compat/freebsd/sys/_cpuset.h new file mode 100644 index 0000000000..286d26fc00 --- /dev/null +++ b/usr/src/compat/freebsd/sys/_cpuset.h @@ -0,0 +1,33 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS__CPUSET_H_ +#define _COMPAT_FREEBSD_SYS__CPUSET_H_ + +#ifdef _KERNEL +/* + * The sys/_cpuset.h header is used to communicate the layout of cpuset_t while + * sys/cpuset.h contains the manipulation routines. + * + * The explicit guard definition below is necessary as other contrib headers + * change their behavior based on its presence. + */ +#define _SYS__CPUSET_H_ + +#include + +#endif /* _KERNEL */ + +#endif /* _COMPAT_FREEBSD_SYS__CPUSET_H_ */ diff --git a/usr/src/compat/freebsd/sys/callout.h b/usr/src/compat/freebsd/sys/callout.h index 17b6e31507..6087a09f54 100644 --- a/usr/src/compat/freebsd/sys/callout.h +++ b/usr/src/compat/freebsd/sys/callout.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_CALLOUT_H_ @@ -41,6 +42,9 @@ int vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, int vmm_glue_callout_stop(struct callout *c); int vmm_glue_callout_drain(struct callout *c); +/* illumos-custom function for resource locality optimization */ +void vmm_glue_callout_localize(struct callout *c); + static __inline void callout_init(struct callout *c, int mpsafe) { diff --git a/usr/src/compat/freebsd/sys/cdefs.h b/usr/src/compat/freebsd/sys/cdefs.h index 974e323dbe..0b857437e3 100644 --- a/usr/src/compat/freebsd/sys/cdefs.h +++ b/usr/src/compat/freebsd/sys/cdefs.h @@ -11,48 +11,67 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_CDEFS_H_ #define _COMPAT_FREEBSD_SYS_CDEFS_H_ +/* + * Testing against Clang-specific extensions. + */ +#ifndef __has_extension +#define __has_extension __has_feature +#endif +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + +/* + * Macro to test if we're using a specific version of gcc or later. 
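+ * For example, __GNUC_PREREQ__(4, 6) is true for gcc 4.6 and newer; the
+ * _Static_assert fallback below keys off exactly that check.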
+ */ +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +#define __GNUC_PREREQ__(ma, mi) \ + (__GNUC__ > (ma) || __GNUC__ == (ma) && __GNUC_MINOR__ >= (mi)) +#else +#define __GNUC_PREREQ__(ma, mi) 0 +#endif + #define __FBSDID(s) #ifdef __GNUC__ +#define asm __asm #define inline __inline #define __GNUCLIKE___SECTION 1 #define __dead2 __attribute__((__noreturn__)) -#define __unused __attribute__((__unused__)) #define __used __attribute__((__used__)) #define __packed __attribute__((__packed__)) #define __aligned(x) __attribute__((__aligned__(x))) #define __section(x) __attribute__((__section__(x))) +#define __weak_symbol __attribute__((__weak__)) #endif -/* - * The __CONCAT macro is used to concatenate parts of symbol names, e.g. - * with "#define OLD(foo) __CONCAT(old,foo)", OLD(foo) produces oldfoo. - * The __CONCAT macro is a bit tricky to use if it must work in non-ANSI - * mode -- there must be no spaces between its arguments, and for nested - * __CONCAT's, all the __CONCAT's must be at the left. __CONCAT can also - * concatenate double-quoted strings produced by the __STRING macro, but - * this only works with ANSI C. - * - * __XSTRING is like __STRING, but it expands any macros in its argument - * first. It is only available with ANSI C. - */ -#if defined(__STDC__) || defined(__cplusplus) -#define __P(protos) protos /* full-blown ANSI C */ -#define __CONCAT1(x,y) x ## y -#define __CONCAT(x,y) __CONCAT1(x,y) -#define __STRING(x) #x /* stringify without expanding x */ -#define __XSTRING(x) __STRING(x) /* expand x, then stringify */ -#else /* !(__STDC__ || __cplusplus) */ -#define __P(protos) () /* traditional C preprocessor */ -#define __CONCAT(x,y) x/**/y -#define __STRING(x) "x" -#endif /* !(__STDC__ || __cplusplus) */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 201112L || defined(lint) + +#if !__has_extension(c_static_assert) +#if (defined(__cplusplus) && __cplusplus >= 201103L) || \ + __has_extension(cxx_static_assert) +#define _Static_assert(x, y) static_assert(x, y) +#elif __GNUC_PREREQ__(4,6) +/* Nothing, gcc 4.6 and higher has _Static_assert built-in */ +#elif defined(__COUNTER__) +#define _Static_assert(x, y) __Static_assert(x, __COUNTER__) +#define __Static_assert(x, y) ___Static_assert(x, y) +#define ___Static_assert(x, y) typedef char __assert_ ## y[(x) ? 1 : -1] \ + __unused +#else +#define _Static_assert(x, y) struct __hack +#endif +#endif +#define static_assert(x, y) _Static_assert(x, y) + +#endif /* __STDC_VERSION__ || __STDC_VERSION__ < 201112L */ #endif /* _COMPAT_FREEBSD_SYS_CDEFS_H_ */ diff --git a/usr/src/compat/freebsd/sys/clock.h b/usr/src/compat/freebsd/sys/clock.h new file mode 100644 index 0000000000..ebf7f171a3 --- /dev/null +++ b/usr/src/compat/freebsd/sys/clock.h @@ -0,0 +1,110 @@ +/*- + * Copyright (c) 1996 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Gordon W. Ross + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $NetBSD: clock_subr.h,v 1.7 2000/10/03 13:41:07 tsutsui Exp $
+ *
+ *
+ * This file is the central clearing-house for calendrical issues.
+ *
+ * In general the kernel does not know about minutes, hours, days, timezones,
+ * daylight savings time, leap-years and such. All that is theoretically a
+ * matter for userland only.
+ *
+ * Parts of kernel code do however care: badly designed filesystems store
+ * timestamps in local time and RTC chips sometimes track time in a local
+ * timezone instead of UTC and so on.
+ *
+ * All that code should go here for service.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_CLOCK_H_
+#define _COMPAT_FREEBSD_SYS_CLOCK_H_
+
+#include_next <sys/clock.h>
+
+#ifdef _KERNEL /* No user serviceable parts */
+
+#ifdef __FreeBSD__
+/*
+ * Timezone info from settimeofday(2), usually not used
+ */
+extern int tz_minuteswest;
+extern int tz_dsttime;
+extern struct mtx resettodr_lock;
+
+int utc_offset(void);
+#endif /* __FreeBSD__ */
+
+/*
+ * Structure to hold the values typically reported by time-of-day clocks.
+ * This can be passed to the generic conversion functions to be converted
+ * to a struct timespec.
+ */
+struct clocktime {
+ int year; /* year (4 digit year) */
+ int mon; /* month (1 - 12) */
+ int day; /* day (1 - 31) */
+ int hour; /* hour (0 - 23) */
+ int min; /* minute (0 - 59) */
+ int sec; /* second (0 - 59) */
+ int dow; /* day of week (0 - 6; 0 = Sunday) */
+ long nsec; /* nanoseconds */
+};
+
+int clock_ct_to_ts(struct clocktime *, struct timespec *);
+void clock_ts_to_ct(struct timespec *, struct clocktime *);
+#ifdef __FreeBSD__
+void clock_register(device_t, long);
+#endif
+
+#ifndef __FreeBSD__
+extern u_char const bin2bcd_data[];
+#define bin2bcd(x) (bin2bcd_data[(x)])
+#endif
+
+/*
+ * BCD to decimal and decimal to BCD.
+ */
+#define FROMBCD(x) bcd2bin(x)
+#define TOBCD(x) bin2bcd(x)
+
+/* Some handy constants. */
+#define SECDAY (24 * 60 * 60)
+#define SECYR (SECDAY * 365)
+
+/* Traditional POSIX base year */
+#define POSIX_BASE_YEAR 1970
+
+void timespec2fattime(struct timespec *tsp, int utc, u_int16_t *ddp, u_int16_t *dtp, u_int8_t *dhp);
+void fattime2timespec(unsigned dd, unsigned dt, unsigned dh, int utc, struct timespec *tsp);
+
+#endif /* _KERNEL */
+
+#endif /* _COMPAT_FREEBSD_SYS_CLOCK_H_ */
diff --git a/usr/src/compat/freebsd/sys/cpuset.h b/usr/src/compat/freebsd/sys/cpuset.h
index 8527624b5e..626b323d7d 100644
--- a/usr/src/compat/freebsd/sys/cpuset.h
+++ b/usr/src/compat/freebsd/sys/cpuset.h
@@ -11,6 +11,7 @@
 
 /*
  * Copyright 2014 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
 */
 
 #ifndef _COMPAT_FREEBSD_SYS_CPUSET_H_
@@ -19,26 +20,115 @@
 #define NOCPU -1
 
 #ifdef _KERNEL
-#define CPU_SET(cpu, set) CPUSET_ADD(*(set), cpu)
-#define CPU_SETOF(cpu, set) CPUSET_ONLY(*(set), cpu)
-#define CPU_ZERO(set) CPUSET_ZERO(*(set))
-#define CPU_CLR(cpu, set) CPUSET_DEL(*(set), cpu)
+
+#include 
+
+#define CPU_SET(cpu, set) cpuset_add((set), (cpu))
+#define CPU_SETOF(cpu, set) cpuset_only((set), (cpu))
+#define CPU_ZERO(set) cpuset_zero((cpuset_t *)(set))
+#define CPU_CLR(cpu, set) cpuset_del((set), (cpu))
+#define CPU_EMPTY(set) cpuset_isnull((set))
 #define CPU_FFS(set) cpusetobj_ffs(set)
-#define CPU_ISSET(cpu, set) CPU_IN_SET(*(set), cpu)
-#define CPU_CMP(set1, set2) CPUSET_ISEQUAL(*(set1), *(set2))
-#define CPU_SET_ATOMIC(cpu, set) CPUSET_ATOMIC_ADD(*(set), cpu)
+#define CPU_ISSET(cpu, set) cpu_in_set((cpuset_t *)(set), (cpu))
+#define CPU_AND(dst, src) cpuset_and( \
+ (cpuset_t *)(dst), \
+ (cpuset_t *)(src))
+#define CPU_OR(dst, src) cpuset_or( \
+ (cpuset_t *)(dst), \
+ (cpuset_t *)(src))
+#define CPU_CMP(set1, set2) (cpuset_isequal( \
+ (cpuset_t *)(set1), \
+ (cpuset_t *)(set2)) == 0)
+#define CPU_SET_ATOMIC(cpu, set) cpuset_atomic_add( \
+ (cpuset_t *)(set), \
+ (cpu))
+#define CPU_CLR_ATOMIC(cpu, set) cpuset_atomic_del( \
+ (cpuset_t *)(set), \
+ (cpu))
+
+#define CPU_SET_ATOMIC_ACQ(cpu, set) cpuset_atomic_add((set), (cpu))
 
-#include 
 int cpusetobj_ffs(const cpuset_t *set);
+
 #else
+
+#include 
 #include 
+#include 
+
+/* For now, assume NCPU of 256 */
+#define CPU_SETSIZE (256)
+
+typedef struct {
+ ulong_t _bits[BT_BITOUL(CPU_SETSIZE)];
+} cpuset_t;
+
+static __inline int
+cpuset_isempty(const cpuset_t *set)
+{
+ uint_t i;
 
-typedef int cpuset_t;
+ for (i = 0; i < BT_BITOUL(CPU_SETSIZE); i++) {
+ if (set->_bits[i] != 0)
+ return (0);
+ }
+ return (1);
+}
 
-#define CPUSET(cpu) (1UL << (cpu))
+static __inline void
+cpuset_zero(cpuset_t *dst)
+{
+ uint_t i;
+
+ for (i = 0; i < BT_BITOUL(CPU_SETSIZE); i++) {
+ dst->_bits[i] = 0;
+ }
+}
+
+static __inline int
+cpuset_isequal(cpuset_t *s1, cpuset_t *s2)
+{
+ uint_t i;
+
+ for (i = 0; i < BT_BITOUL(CPU_SETSIZE); i++) {
+ if (s1->_bits[i] != s2->_bits[i])
+ return (0);
+ }
+ return (1);
+}
+
+static __inline uint_t
+cpusetobj_ffs(const cpuset_t *set)
+{
+ uint_t i, cbit;
+
+ cbit = 0;
+ for (i = 0; i < BT_BITOUL(CPU_SETSIZE); i++) {
+ if (set->_bits[i] != 0) {
+ cbit = ffsl(set->_bits[i]);
+ cbit += i * BT_NBIPUL;
+ break;
+ }
+ }
+ return (cbit);
+}
+
+
+#define CPU_SET(cpu, setp) BT_SET((setp)->_bits, cpu)
+#define CPU_CLR(cpu, setp) BT_CLEAR((setp)->_bits, cpu)
+#define CPU_ZERO(setp) cpuset_zero((setp))
+#define CPU_CMP(set1, set2) (cpuset_isequal( \
+ (cpuset_t *)(set1), \
+ (cpuset_t *)(set2)) == 0)
+#define CPU_FFS(set) cpusetobj_ffs(set)
+#define CPU_ISSET(cpu, setp) BT_TEST((setp)->_bits, cpu)
+#define CPU_EMPTY(setp) cpuset_isempty((setp))
+#define CPU_SET_ATOMIC(cpu, setp) \
+ atomic_set_long(&(BT_WIM((setp)->_bits, cpu)), BT_BIW(cpu))
+#define CPU_CLR_ATOMIC(cpu, setp) \
+ atomic_clear_long(&(BT_WIM((setp)->_bits, cpu)), BT_BIW(cpu))
 
-#define CPU_SET_ATOMIC(cpu, set) atomic_set_int((set), CPUSET(cpu))
 #endif
 
 #endif /* _COMPAT_FREEBSD_SYS_CPUSET_H_ */
diff --git a/usr/src/compat/freebsd/sys/endian.h b/usr/src/compat/freebsd/sys/endian.h
index a31bff55d6..24ea02d251 100644
--- a/usr/src/compat/freebsd/sys/endian.h
+++ b/usr/src/compat/freebsd/sys/endian.h
@@ -11,6 +11,7 @@
 
 /*
  * Copyright 2014 Pluribus Networks Inc.
+ * Copyright 2018 Joyent, Inc.
*/ #ifndef _COMPAT_FREEBSD_SYS_ENDIAN_H_ @@ -122,4 +123,14 @@ le64enc(void *pp, uint64_t u) le32enc(p + 4, (uint32_t)(u >> 32)); } +#ifdef _LITTLE_ENDIAN +#define htole16(x) ((uint16_t)(x)) +#define htole32(x) ((uint32_t)(x)) +#define htole64(x) ((uint64_t)(x)) + +#define le16toh(x) ((uint16_t)(x)) +#define le32toh(x) ((uint32_t)(x)) +#define le64toh(x) ((uint64_t)(x)) +#endif + #endif /* _COMPAT_FREEBSD_SYS_ENDIAN_H_ */ diff --git a/usr/src/compat/freebsd/sys/eventhandler.h b/usr/src/compat/freebsd/sys/eventhandler.h new file mode 100644 index 0000000000..133aa664f0 --- /dev/null +++ b/usr/src/compat/freebsd/sys/eventhandler.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_EVENTHANDLER_H_ +#define _COMPAT_FREEBSD_SYS_EVENTHANDLER_H_ + +#endif /* _COMPAT_FREEBSD_SYS_EVENTHANDLER_H_ */ diff --git a/usr/src/compat/freebsd/sys/ioctl.h b/usr/src/compat/freebsd/sys/ioctl.h index e223e1e4c7..72a46b8085 100644 --- a/usr/src/compat/freebsd/sys/ioctl.h +++ b/usr/src/compat/freebsd/sys/ioctl.h @@ -17,6 +17,8 @@ #define _COMPAT_FREEBSD_SYS_IOCTL_H_ #include +/* Get BSD compatibility from the ioctl header */ +#define BSD_COMP #include_next #endif /* _COMPAT_FREEBSD_SYS_IOCTL_H_ */ diff --git a/usr/src/compat/freebsd/sys/kernel.h b/usr/src/compat/freebsd/sys/kernel.h index b1c07674e4..adf96f40fc 100644 --- a/usr/src/compat/freebsd/sys/kernel.h +++ b/usr/src/compat/freebsd/sys/kernel.h @@ -11,15 +11,32 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_KERNEL_H_ #define _COMPAT_FREEBSD_SYS_KERNEL_H_ -#define SYSINIT(uniquifier, subsystem, order, func, ident) +#define TUNABLE_INT_FETCH(path, var) #include +typedef void (*sysinit_func_t)(const void *); + +struct sysinit { + const sysinit_func_t func; + const void *data; +}; + +#define SYSINIT(uniquifier, subsystem, order, func, ident) \ + static struct sysinit uniquifier ## _sys_init = { \ + (const sysinit_func_t)func, \ + (const void *)&(ident) \ + }; \ + DATA_SET(sysinit_set, uniquifier ## _sys_init); + +extern void sysinit(void); + #define ticks ddi_get_lbolt() #endif /* _COMPAT_FREEBSD_SYS_KERNEL_H_ */ diff --git a/usr/src/compat/freebsd/sys/limits.h b/usr/src/compat/freebsd/sys/limits.h index 99ae0f4d64..0e66319791 100644 --- a/usr/src/compat/freebsd/sys/limits.h +++ b/usr/src/compat/freebsd/sys/limits.h @@ -11,9 +11,14 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_LIMITS_H_ #define _COMPAT_FREEBSD_SYS_LIMITS_H_ +#include_next + +#define OFF_MAX ((off_t)-1) + #endif /* _COMPAT_FREEBSD_SYS_LIMITS_H_ */ diff --git a/usr/src/compat/freebsd/sys/lock.h b/usr/src/compat/freebsd/sys/lock.h new file mode 100644 index 0000000000..fd6021a87e --- /dev/null +++ b/usr/src/compat/freebsd/sys/lock.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_LOCK_H_ +#define _COMPAT_FREEBSD_SYS_LOCK_H_ + +#include_next + +#define WITNESS_WARN(...) + +#endif /* _COMPAT_FREEBSD_SYS_LOCK_H_ */ diff --git a/usr/src/compat/freebsd/sys/malloc.h b/usr/src/compat/freebsd/sys/malloc.h index 579df44533..341d57b807 100644 --- a/usr/src/compat/freebsd/sys/malloc.h +++ b/usr/src/compat/freebsd/sys/malloc.h @@ -39,6 +39,11 @@ struct malloc_type { void free(void *addr, struct malloc_type *type); void *malloc(unsigned long size, struct malloc_type *type, int flags); void *old_malloc(unsigned long size, struct malloc_type *type , int flags); +void *contigmalloc(unsigned long, struct malloc_type *, int, vm_paddr_t, + vm_paddr_t, unsigned long, vm_paddr_t); +void contigfree(void *, unsigned long, struct malloc_type *); + + #endif /* _KERNEL */ #endif /* _COMPAT_FREEBSD_SYS_MALLOC_H_ */ diff --git a/usr/src/compat/freebsd/sys/mutex.h b/usr/src/compat/freebsd/sys/mutex.h index b99884b652..9e588cb98a 100644 --- a/usr/src/compat/freebsd/sys/mutex.h +++ b/usr/src/compat/freebsd/sys/mutex.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_MUTEX_H_ @@ -28,15 +29,11 @@ struct mtx; void mtx_init(struct mtx *, char *name, const char *type_name, int opts); void mtx_destroy(struct mtx *); -int mtx_sleep(void *chan, struct mtx *mtx, int priority, const char *wmesg, - int timo); - #endif /* KERNEL */ #include_next #ifdef _KERNEL struct mtx { - kmutex_type_t t; kmutex_t m; }; diff --git a/usr/src/compat/freebsd/sys/param.h b/usr/src/compat/freebsd/sys/param.h index f09e9183f6..b125f9014f 100644 --- a/usr/src/compat/freebsd/sys/param.h +++ b/usr/src/compat/freebsd/sys/param.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_PARAM_H_ @@ -18,8 +19,11 @@ #ifndef _KERNEL #define MAXCOMLEN 16 +/* default value of the kernel tunable 'maxphys' in i86pc */ +#define MAXPHYS (56 * 1024) #endif #define MAXHOSTNAMELEN 256 +#define SPECNAMELEN 63 #ifdef _KERNEL #include @@ -36,13 +40,18 @@ #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x,y) (((x)/(y))*(y)) +#define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x,y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ +#define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) +#define trunc_page(x) ((unsigned long)(x) & ~(PAGE_MASK)) +#define ptoa(x) ((unsigned long)(x) << PAGE_SHIFT) + #include_next #endif /* _COMPAT_FREEBSD_SYS_PARAM_H_ */ diff --git a/usr/src/compat/freebsd/sys/sdt.h b/usr/src/compat/freebsd/sys/sdt.h new file mode 100644 index 0000000000..32d887c0d8 --- /dev/null +++ b/usr/src/compat/freebsd/sys/sdt.h @@ -0,0 +1,37 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_SDT_H_ +#define _COMPAT_FREEBSD_SYS_SDT_H_ + +/* Empty macros to cover FreeBSD's SDT linker tricks */ + +#define SDT_PROVIDER_DECLARE(mod) +#define SDT_PROVIDER_DEFINE(mod) + +#define SDT_PROBE_DEFINE1(...) +#define SDT_PROBE_DEFINE2(...) +#define SDT_PROBE_DEFINE3(...) +#define SDT_PROBE_DEFINE4(...) +#define SDT_PROBE_DEFINE5(...) +#define SDT_PROBE1(...) +#define SDT_PROBE2(...) +#define SDT_PROBE3(...) +#define SDT_PROBE4(...) +#define SDT_PROBE5(...) + +#include_next + +#endif /* _COMPAT_FREEBSD_SYS_SDT_H_ */ diff --git a/usr/src/compat/freebsd/sys/sglist.h b/usr/src/compat/freebsd/sys/sglist.h new file mode 100644 index 0000000000..519c67915f --- /dev/null +++ b/usr/src/compat/freebsd/sys/sglist.h @@ -0,0 +1,29 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_SGLIST_H_ +#define _COMPAT_FREEBSD_SYS_SGLIST_H_ + +#ifdef _KERNEL + +struct sglist; + +struct sglist *sglist_alloc(int, int); +void sglist_free(struct sglist *); +int sglist_append_phys(struct sglist *, vm_paddr_t, size_t); + +#endif /* _KERNEL */ + +#endif /* _COMPAT_FREEBSD_SYS_SGLIST_H_ */ diff --git a/usr/src/compat/freebsd/sys/smp.h b/usr/src/compat/freebsd/sys/smp.h index 46183e8677..3d6413ce16 100644 --- a/usr/src/compat/freebsd/sys/smp.h +++ b/usr/src/compat/freebsd/sys/smp.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_SMP_H_ @@ -18,10 +19,7 @@ #include -void smp_rendezvous(void (*)(void *), - void (*)(void *), - void (*)(void *), - void *arg); +#define IPI_AST 0 void ipi_cpu(int cpu, u_int ipi); diff --git a/usr/src/compat/freebsd/sys/socket.h b/usr/src/compat/freebsd/sys/socket.h new file mode 100644 index 0000000000..3bf7a8f440 --- /dev/null +++ b/usr/src/compat/freebsd/sys/socket.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_SOCKET_H +#define _COMPAT_FREEBSD_SYS_SOCKET_H + +#include_next + +#define SO_NOSIGPIPE 0 + +#endif /* _COMPAT_FREEBSD_SYS_SOCKET_H */ diff --git a/usr/src/compat/freebsd/sys/systm.h b/usr/src/compat/freebsd/sys/systm.h index e25acc0e4a..43fa16d450 100644 --- a/usr/src/compat/freebsd/sys/systm.h +++ b/usr/src/compat/freebsd/sys/systm.h @@ -28,18 +28,9 @@ struct mtx; panic msg; \ } while (0) -#define CTASSERT(x) _CTASSERT(x, __LINE__) -#define _CTASSERT(x,y) __CTASSERT(x,y) -#define __CTASSERT(x,y) typedef char __assert ## y[(x) ? 
1 : -1] - void critical_enter(void); void critical_exit(void); -int msleep_spin(void *chan, struct mtx *mutex, const char *wmesg, - int ticks); -void wakeup(void *chan); -void wakeup_one(void *chan); - struct unrhdr *new_unrhdr(int low, int high, struct mtx *mutex); void delete_unrhdr(struct unrhdr *uh); int alloc_unr(struct unrhdr *uh); diff --git a/usr/src/compat/freebsd/sys/time.h b/usr/src/compat/freebsd/sys/time.h index f8f9da5cdf..4e0fbfc02c 100644 --- a/usr/src/compat/freebsd/sys/time.h +++ b/usr/src/compat/freebsd/sys/time.h @@ -50,7 +50,13 @@ binuptime(struct bintime *bt) ((a)->frac cmp (b)->frac) : \ ((a)->sec cmp (b)->sec)) -#define SBT_1US (1000) +#define SBT_1S ((sbintime_t)1 << 32) +#define SBT_1M (SBT_1S * 60) +#define SBT_1MS (SBT_1S / 1000) +#define SBT_1US (SBT_1S / 1000000) +#define SBT_1NS (SBT_1S / 1000000000) +#define SBT_MAX 0x7fffffffffffffffLL + static __inline void bintime_add(struct bintime *bt, const struct bintime *bt2) @@ -91,14 +97,28 @@ bintime_mul(struct bintime *bt, u_int x) static __inline sbintime_t bttosbt(const struct bintime bt) { - return ((bt.sec * 1000000000) + - (((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32)); + return (((sbintime_t)bt.sec << 32) + (bt.frac >> 32)); +} + +static __inline struct bintime +sbttobt(sbintime_t _sbt) +{ + struct bintime _bt; + + _bt.sec = _sbt >> 32; + _bt.frac = _sbt << 32; + return (_bt); } static __inline sbintime_t sbinuptime(void) { - return (gethrtime()); + hrtime_t hrt = gethrtime(); + uint64_t sec = hrt / NANOSEC; + uint64_t nsec = hrt % NANOSEC; + + return (((sbintime_t)sec << 32) + + (nsec * (((uint64_t)1 << 63) / 500000000) >> 32)); } #endif /* _COMPAT_FREEBSD_SYS_TIME_H_ */ diff --git a/usr/src/compat/freebsd/sys/types.h b/usr/src/compat/freebsd/sys/types.h index 6fc8179f2e..63731da42e 100644 --- a/usr/src/compat/freebsd/sys/types.h +++ b/usr/src/compat/freebsd/sys/types.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_TYPES_H_ @@ -53,6 +54,16 @@ typedef __vm_ooffset_t vm_ooffset_t; typedef __vm_paddr_t vm_paddr_t; #endif +#ifndef __VM_PINDEX_T_DEFINED +#define __VM_PINDEX_T_DEFINED +typedef __uint64_t vm_pindex_t; +#endif + +#ifndef __VM_SIZE_T_DEFINED +#define __VM_SIZE_T_DEFINED +typedef __vm_size_t vm_size_t; +#endif + #ifndef __VM_MEMATTR_T_DEFINED #define __VM_MEMATTR_T_DEFINED typedef char vm_memattr_t; @@ -65,8 +76,8 @@ typedef char vm_memattr_t; typedef _Bool bool; #endif -#if defined(_KERNEL) && !defined(offsetof) -#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#if defined(_KERNEL) +typedef struct __dev_info **device_t; #endif #include_next diff --git a/usr/src/compat/freebsd/unistd.h b/usr/src/compat/freebsd/unistd.h new file mode 100644 index 0000000000..b4357e1da5 --- /dev/null +++ b/usr/src/compat/freebsd/unistd.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_UNISTD_H +#define _COMPAT_FREEBSD_UNISTD_H + +#define setproctitle(fmt, ...) 
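+/*
+ * illumos has no setproctitle(); the empty define above simply discards
+ * the arguments so that FreeBSD-derived callers compile unchanged.
+ */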
+
+#include_next <unistd.h>
+
+#endif /* _COMPAT_FREEBSD_UNISTD_H */
diff --git a/usr/src/compat/freebsd/vm/pmap.h b/usr/src/compat/freebsd/vm/pmap.h
deleted file mode 100644
index 5958c4b101..0000000000
--- a/usr/src/compat/freebsd/vm/pmap.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * This file and its contents are supplied under the terms of the
- * Common Development and Distribution License ("CDDL"), version 1.0.
- * You may only use this file in accordance with the terms of version
- * 1.0 of the CDDL.
- *
- * A full copy of the text of the CDDL should have accompanied this
- * source. A copy of the CDDL is also available via the Internet at
- * http://www.illumos.org/license/CDDL.
- */
-
-/*
- * Copyright 2014 Pluribus Networks Inc.
- */
-
-#ifndef _COMPAT_FREEBSD_VM_PMAP_H_
-#define _COMPAT_FREEBSD_VM_PMAP_H_
-
-#include 
-
-#endif /* _COMPAT_FREEBSD_VM_PMAP_H_ */
diff --git a/usr/src/compat/freebsd/vm/vm.h b/usr/src/compat/freebsd/vm/vm.h
index 7da22099b6..f5bb7b6eb8 100644
--- a/usr/src/compat/freebsd/vm/vm.h
+++ b/usr/src/compat/freebsd/vm/vm.h
@@ -11,23 +11,48 @@
 
 /*
  * Copyright 2014 Pluribus Networks Inc.
+ * Copyright 2017 Joyent, Inc.
 */
 
 #ifndef _FREEBSD_VM_VM_H_
 #define _FREEBSD_VM_VM_H_
 
 #include 
+#include 
 
 typedef u_char vm_prot_t;
 
+/*
+ * Even though the FreeBSD VM_PROT defines happen to match illumos, this
+ * references the native values directly so there's no risk of breakage.
+ */
 #define VM_PROT_NONE ((vm_prot_t) 0x00)
-#define VM_PROT_READ ((vm_prot_t) 0x01)
-#define VM_PROT_WRITE ((vm_prot_t) 0x02)
-#define VM_PROT_EXECUTE ((vm_prot_t) 0x04)
+#define VM_PROT_READ ((vm_prot_t) PROT_READ)
+#define VM_PROT_WRITE ((vm_prot_t) PROT_WRITE)
+#define VM_PROT_EXECUTE ((vm_prot_t) PROT_EXEC)
 
 #define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)
 #define VM_PROT_RW (VM_PROT_READ|VM_PROT_WRITE)
 
+struct vm_page;
+typedef struct vm_page *vm_page_t;
+
+enum obj_type { OBJT_DEFAULT, OBJT_SWAP, OBJT_VNODE, OBJT_DEVICE, OBJT_PHYS,
+ OBJT_DEAD, OBJT_SG, OBJT_MGTDEVICE };
+typedef u_char objtype_t;
+
+union vm_map_object;
+typedef union vm_map_object vm_map_object_t;
+
+struct vm_map_entry;
+typedef struct vm_map_entry *vm_map_entry_t;
+
+struct vm_map;
+typedef struct vm_map *vm_map_t;
+
+struct vm_object;
+typedef struct vm_object *vm_object_t;
+
 /*
 *  contains a troublesome preprocessor define for BYTE.
 * Do this ugly workaround to avoid it.
diff --git a/usr/src/compat/freebsd/vm/vm_param.h b/usr/src/compat/freebsd/vm/vm_param.h
new file mode 100644
index 0000000000..fd76b62a37
--- /dev/null
+++ b/usr/src/compat/freebsd/vm/vm_param.h
@@ -0,0 +1,21 @@
+#ifndef _COMPAT_FREEBSD_VM_VM_PARAM_H_
+#define _COMPAT_FREEBSD_VM_VM_PARAM_H_
+
+#include 
+
+#define KERN_SUCCESS 0
+
+/* Not a direct correlation, but the primary necessity is being non-zero */
+#define KERN_RESOURCE_SHORTAGE ENOMEM
+
+/*
+ * VM_MAXUSER_ADDRESS determines the upper size limit of a vmspace (the
+ * FreeBSD counterpart of an illumos 'struct as'). The compat value is sized
+ * well below our native userlimit, even halving the available space below
+ * the VA hole. This is to avoid Intel EPT limits and leave room available
+ * in the usable VA range for other mmap tricks.
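+ * (The value below, 0x00003ffffffffffful, is 2^46 - 1: one byte short of
+ * 64 TiB, half of the 128 TiB of VA available below the amd64 canonical
+ * address hole.)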
+ */ +#define VM_MAXUSER_ADDRESS 0x00003ffffffffffful + + +#endif /* _COMPAT_FREEBSD_VM_VM_PARAM_H_ */ diff --git a/usr/src/compat/freebsd/x86/_types.h b/usr/src/compat/freebsd/x86/_types.h index a07fc017ad..8bbae549d8 100644 --- a/usr/src/compat/freebsd/x86/_types.h +++ b/usr/src/compat/freebsd/x86/_types.h @@ -41,9 +41,11 @@ typedef __int64_t __register_t; typedef __uint64_t __vm_offset_t; typedef __uint64_t __vm_paddr_t; typedef __int64_t __vm_ooffset_t; +typedef __uint64_t __vm_size_t; #else typedef __int32_t __register_t; typedef __uint32_t __vm_paddr_t; +typedef __uint32_t __vm_size_t; #endif #endif /* _FREEBSD_X86__TYPES_H_ */ diff --git a/usr/src/compat/freebsd/x86/segments.h b/usr/src/compat/freebsd/x86/segments.h index bc6ba976b8..11edc582b5 100644 --- a/usr/src/compat/freebsd/x86/segments.h +++ b/usr/src/compat/freebsd/x86/segments.h @@ -11,18 +11,19 @@ /* * Copyright 2015 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ -#ifndef _COMPAT_FREEBSD_X86_SEGMENTS_H_ -#define _COMPAT_FREEBSD_X86_SEGMENTS_H_ +#ifndef _COMPAT_FREEBSD_X86_SEGMENTS_H +#define _COMPAT_FREEBSD_X86_SEGMENTS_H -/* - * Entries in the Interrupt Descriptor Table (IDT) - */ -#define IDT_BP 3 /* #BP: Breakpoint */ +#if defined(_COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_) || defined(_KERNEL) #define IDT_UD 6 /* #UD: Undefined/Invalid Opcode */ #define IDT_SS 12 /* #SS: Stack Segment Fault */ #define IDT_GP 13 /* #GP: General Protection Fault */ #define IDT_AC 17 /* #AC: Alignment Check */ +#else +#include_next +#endif -#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_ */ +#endif /* _COMPAT_FREEBSD_X86_SEGMENTS_H */ diff --git a/usr/src/head/bhyve.h b/usr/src/head/bhyve.h deleted file mode 100644 index 8c79ca1ccc..0000000000 --- a/usr/src/head/bhyve.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * COPYRIGHT 2013 Pluribus Networks Inc. - * - * All rights reserved. This copyright notice is Copyright Management - * Information under 17 USC 1202 and is included to protect this work and - * deter copyright infringement. Removal or alteration of this Copyright - * Management Information without the express written permission from - * Pluribus Networks Inc is prohibited, and any such unauthorized removal - * or alteration will be a violation of federal law. 
- */ -#ifndef _BHYVE_H -#define _BHYVE_H - -#ifdef __cplusplus -extern "C" { -#endif - -#define BHYVE_TMPDIR "/var/run/bhyve" -#define BHYVE_CONS_SOCKPATH BHYVE_TMPDIR "/%s.console_sock" - -#ifdef __cplusplus -} -#endif - -#endif /* _BHYVE_H */ diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile index e2bbd9a8c0..b64d4c2bc1 100644 --- a/usr/src/lib/Makefile +++ b/usr/src/lib/Makefile @@ -280,7 +280,8 @@ SUBDIRS += \ i386_SUBDIRS= \ libfdisk \ - libsaveargs + libsaveargs \ + libvmmapi sparc_SUBDIRS= \ efcode \ @@ -504,7 +505,8 @@ HDRSUBDIRS= \ i386_HDRSUBDIRS= \ libfdisk \ - libsaveargs + libsaveargs \ + libvmmapi sparc_HDRSUBDIRS= \ libds \ diff --git a/usr/src/lib/libvmmapi/Makefile b/usr/src/lib/libvmmapi/Makefile index 60621fcb75..233fcd5edb 100644 --- a/usr/src/lib/libvmmapi/Makefile +++ b/usr/src/lib/libvmmapi/Makefile @@ -19,11 +19,13 @@ HDRS = vmmapi.h HDRDIR = common +CHECKHDRS = + $(BUILD64)SUBDIRS += $(MACH64) all:= TARGET= all install:= TARGET= install -clean:= TARGET= clean +clean:= TARGET= clean clobber:= TARGET= clobber lint:= TARGET= lint _msg:= TARGET= _msg diff --git a/usr/src/lib/libvmmapi/Makefile.com b/usr/src/lib/libvmmapi/Makefile.com index e41a82f9a2..34240f4331 100644 --- a/usr/src/lib/libvmmapi/Makefile.com +++ b/usr/src/lib/libvmmapi/Makefile.com @@ -12,11 +12,12 @@ # # Copyright 2013 Pluribus Networks Inc. # +# Copyright 2019 Joyent, Inc. -LIBRARY = libvmmapi.a +LIBRARY = libvmmapi.a VERS = .1 -OBJECTS = vmmapi.o expand_number.o +OBJECTS = vmmapi.o expand_number.o # include library definitions include ../../Makefile.lib @@ -24,16 +25,19 @@ include ../../Makefile.lib # install this library in the root filesystem include ../../Makefile.rootfs -SRCDIR = ../common +SRCDIR = ../common -LIBS = $(DYNLIB) $(LINTLIB) +LIBS = $(DYNLIB) $(LINTLIB) -CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ $(CPPFLAGS.master) -I$(SRC)/uts/i86pc +# not linted +SMATCH=off + $(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC) -LDLIBS += -lc +LDLIBS += -lc .KEEP_STATE: diff --git a/usr/src/lib/libvmmapi/common/mapfile-vers b/usr/src/lib/libvmmapi/common/mapfile-vers index 7a8443a2b8..a64231ad1c 100644 --- a/usr/src/lib/libvmmapi/common/mapfile-vers +++ b/usr/src/lib/libvmmapi/common/mapfile-vers @@ -11,6 +11,7 @@ # # Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. 
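+#
+# The stanza below uses version-2 mapfile syntax; "$mapfile_version 2"
+# must precede all other directives in the file.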
 #
 #
@@ -27,51 +28,89 @@
 # MAPFILE HEADER END
 #
 
-SUNWprivate_1.0 {
- global:
- vcpu_reset;
- vm_activate_cpu;
- vm_apicid2vcpu;
- vm_capability_name2type;
- vm_capability_type2name;
- vm_copy_setup;
- vm_copy_teardown;
- vm_copyin;
- vm_copyout;
- vm_create;
- vm_destroy;
- vm_get_capability;
- vm_get_desc;
- vm_get_highmem_size;
- vm_get_lowmem_limit;
- vm_get_lowmem_size;
- vm_get_memory_seg;
- vm_get_register;
- vm_get_seg_desc;
- vm_get_x2apic_state;
- vm_gla2gpa;
- vm_inject_exception;
- vm_isa_assert_irq;
- vm_isa_deassert_irq;
- vm_isa_pulse_irq;
- vm_isa_set_irq_trigger;
- vm_ioapic_assert_irq;
- vm_ioapic_deassert_irq;
- vm_ioapic_pincount;
- vm_ioapic_pulse_irq;
- vm_lapic_irq;
- vm_lapic_msi;
- vm_map_gpa;
- vm_open;
- vm_parse_memsize;
- vm_restart_instruction;
- vm_run;
- vm_set_capability;
- vm_set_desc;
- vm_set_register;
- vm_set_x2apic_state;
- vm_setup_memory;
- vm_setup_rom;
- local:
- *;
+$mapfile_version 2
+
+SYMBOL_VERSION ILLUMOSprivate {
+ global:
+ vcpu_reset;
+ vm_activate_cpu;
+ vm_active_cpus;
+ vm_apicid2vcpu;
+ vm_assign_pptdev;
+ vm_capability_name2type;
+ vm_capability_type2name;
+ vm_copy_setup;
+ vm_copy_teardown;
+ vm_copyin;
+ vm_copyout;
+ vm_create;
+ vm_create_devmem;
+ vm_debug_cpus;
+ vm_destroy;
+ vm_get_capability;
+ vm_get_desc;
+ vm_get_device_fd;
+ vm_get_gpa_pmap;
+ vm_get_hpet_capabilities;
+ vm_get_highmem_size;
+ vm_get_intinfo;
+ vm_get_lowmem_limit;
+ vm_get_lowmem_size;
+ vm_get_memflags;
+ vm_get_memseg;
+ vm_get_register;
+ vm_get_register_set;
+ vm_get_seg_desc;
+ vm_get_stat_desc;
+ vm_get_stats;
+ vm_get_topology;
+ vm_get_x2apic_state;
+ vm_gla2gpa;
+ vm_gla2gpa_nofault;
+ vm_inject_exception;
+ vm_inject_nmi;
+ vm_isa_assert_irq;
+ vm_isa_deassert_irq;
+ vm_isa_pulse_irq;
+ vm_isa_set_irq_trigger;
+ vm_ioapic_assert_irq;
+ vm_ioapic_deassert_irq;
+ vm_ioapic_pincount;
+ vm_ioapic_pulse_irq;
+ vm_lapic_irq;
+ vm_lapic_local_irq;
+ vm_lapic_msi;
+ vm_map_gpa;
+ vm_map_pptdev_mmio;
+ vm_mmap_getnext;
+ vm_mmap_memseg;
+ vm_open;
+ vm_parse_memsize;
+ vm_reinit;
+ vm_restart_instruction;
+ vm_rtc_gettime;
+ vm_rtc_read;
+ vm_rtc_settime;
+ vm_rtc_write;
+ vm_run;
+ vm_set_capability;
+ vm_set_desc;
+ vm_set_intinfo;
+ vm_set_memflags;
+ vm_set_register;
+ vm_set_register_set;
+ vm_set_topology;
+ vm_set_x2apic_state;
+ vm_setup_memory;
+ vm_setup_pptdev_msi;
+ vm_setup_pptdev_msix;
+ vm_suspend;
+ vm_suspend_cpu;
+ vm_suspended_cpus;
+ vm_resume_cpu;
+ vm_unassign_pptdev;
+ vm_wrlock_cycle;
+
+ local:
+ *;
 };
diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c
index bbab3961a9..0b9b871081 100644
--- a/usr/src/lib/libvmmapi/common/vmmapi.c
+++ b/usr/src/lib/libvmmapi/common/vmmapi.c
@@ -1,4 +1,6 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
@@ -23,7 +25,7 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
- * $FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tychon $
+ * $FreeBSD$
 */
 /*
 * This file and its contents are supplied under the terms of the
@@ -36,10 +38,11 @@
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
*/ #include -__FBSDID("$FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include @@ -48,11 +51,10 @@ __FBSDID("$FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tych #include #include +#include #include -#ifndef __FreeBSD__ #include -#endif #include #include #include @@ -70,23 +72,35 @@ __FBSDID("$FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tych #include "vmmapi.h" -#define KB (1024UL) #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) +#ifndef __FreeBSD__ +/* shim to no-op for now */ +#define MAP_NOCORE 0 +#define MAP_ALIGNED_SUPER 0 + +/* Rely on PROT_NONE for guard purposes */ +#define MAP_GUARD (MAP_PRIVATE | MAP_ANON | MAP_NORESERVE) +#endif + +/* + * Size of the guard region before and after the virtual address space + * mapping the guest physical memory. This must be a multiple of the + * superpage size for performance reasons. + */ +#define VM_MMAP_GUARD_SIZE (4 * MB) + +#define PROT_RW (PROT_READ | PROT_WRITE) +#define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) + struct vmctx { int fd; uint32_t lowmem_limit; - enum vm_mmap_style vms; - char *lowermem_addr; - char *biosmem_addr; + int memflags; size_t lowmem; - char *lowmem_addr; size_t highmem; - char *highmem_addr; - uint64_t rombase; - uint64_t romlimit; - char *rom_addr; + char *baseaddr; char *name; }; @@ -94,68 +108,50 @@ struct vmctx { #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) #define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) #else -#define CREATE(x) vmm_vm_create(x) -#define DESTROY(x) vmm_vm_destroy(x) -#endif +#define CREATE(x) vm_do_ctl(VMM_CREATE_VM, (x)) +#define DESTROY(x) vm_do_ctl(VMM_DESTROY_VM, (x)) static int -vm_device_open(const char *name) +vm_do_ctl(int cmd, const char *name) { - int fd, len; - char *vmfile; + int ctl_fd; -#ifdef __FreeBSD__ - len = strlen("/dev/vmm/") + strlen(name) + 1; -#else - len = strlen("/devices/pseudo/vmm@0:") + strlen(name) + 1; -#endif - vmfile = malloc(len); - assert(vmfile != NULL); -#ifdef __FreeBSD__ - snprintf(vmfile, len, "/dev/vmm/%s", name); -#else - snprintf(vmfile, len, "/devices/pseudo/vmm@0:%s", name); -#endif + ctl_fd = open(VMM_CTL_DEV, O_EXCL | O_RDWR); + if (ctl_fd < 0) { + return (-1); + } - /* Open the device file */ - fd = open(vmfile, O_RDWR, 0); + if (ioctl(ctl_fd, cmd, name) == -1) { + int err = errno; - free(vmfile); - return (fd); + /* Do not lose ioctl errno through the close(2) */ + (void) close(ctl_fd); + errno = err; + return (-1); + } + (void) close(ctl_fd); + + return (0); } +#endif -#ifndef __FreeBSD__ static int -vmm_vm_create(const char *name) +vm_device_open(const char *name) { - const char vmm_ctl[] = "/devices/pseudo/vmm@0:ctl"; - struct vmm_ioctl vi; - int err = 0; - int ctl_fd; + int fd, len; + char *vmfile; - (void) strlcpy(vi.vmm_name, name, sizeof (vi.vmm_name) - 1); + len = strlen("/dev/vmm/") + strlen(name) + 1; + vmfile = malloc(len); + assert(vmfile != NULL); + snprintf(vmfile, len, "/dev/vmm/%s", name); - ctl_fd = open(vmm_ctl, O_EXCL | O_RDWR); - if (ctl_fd == -1) { - err = errno; - if ((errno == EPERM) || (errno == EACCES)) { - fprintf(stderr, "you do not have permission to " - "perform that operation.\n"); - } else { - fprintf(stderr, "open: %s: %s\n", vmm_ctl, - strerror(errno)); - } - return (err); - } - if (ioctl(ctl_fd, VMM_CREATE_VM, &vi) == -1) { - err = errno; - fprintf(stderr, "couldn't create vm \"%s\"", name); - } - close (ctl_fd); + /* Open the 
device file */ + fd = open(vmfile, O_RDWR, 0); - return (err); + free(vmfile); + return (fd); } -#endif int vm_create(const char *name) @@ -173,6 +169,7 @@ vm_open(const char *name) assert(vm != NULL); vm->fd = -1; + vm->memflags = 0; vm->lowmem_limit = 3 * GB; vm->name = (char *)(vm + 1); strcpy(vm->name, name); @@ -182,54 +179,20 @@ vm_open(const char *name) return (vm); err: - (void) vm_destroy(vm); + vm_destroy(vm); return (NULL); } -#ifndef __FreeBSD__ -static int -vmm_vm_destroy(const char *name) -{ - const char vmm_ctl[] = "/devices/pseudo/vmm@0:ctl"; - struct vmm_ioctl vi; - int ctl_fd; - int err = 0; - - (void) strlcpy(vi.vmm_name, name, sizeof (vi.vmm_name) - 1); - - ctl_fd = open(vmm_ctl, O_EXCL | O_RDWR); - if (ctl_fd == -1) { - err = errno; - if ((errno == EPERM) || (errno == EACCES)) { - fprintf(stderr, "you do not have permission to " - "perform that operation.\n"); - } else { - fprintf(stderr, "open: %s: %s\n", vmm_ctl, - strerror(errno)); - } - return (err); - } - if (ioctl(ctl_fd, VMM_DESTROY_VM, &vi) == -1) { - err = errno; - fprintf(stderr, "couldn't destroy vm \"%s\"", name); - } - close (ctl_fd); - return (err); -} -#endif - -int +void vm_destroy(struct vmctx *vm) { - int err; assert(vm != NULL); if (vm->fd >= 0) close(vm->fd); - err = DESTROY(vm->name); + DESTROY(vm->name); free(vm); - return (err); } int @@ -256,92 +219,218 @@ vm_parse_memsize(const char *optarg, size_t *ret_memsize) return (error); } -#ifdef __FreeBSD__ -size_t -vmm_get_mem_total(void) +uint32_t +vm_get_lowmem_limit(struct vmctx *ctx) { - size_t mem_total = 0; - size_t oldlen = sizeof(mem_total); - int error; - error = sysctlbyname("hw.vmm.mem_total", &mem_total, &oldlen, NULL, 0); - if (error) - return -1; - return mem_total; + + return (ctx->lowmem_limit); } -size_t -vmm_get_mem_free(void) +void +vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) { - size_t mem_free = 0; - size_t oldlen = sizeof(mem_free); - int error; - error = sysctlbyname("hw.vmm.mem_free", &mem_free, &oldlen, NULL, 0); - if (error) - return -1; - return mem_free; + + ctx->lowmem_limit = limit; +} + +void +vm_set_memflags(struct vmctx *ctx, int flags) +{ + + ctx->memflags = flags; } -#endif int -vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, - int *wired) +vm_get_memflags(struct vmctx *ctx) { - int error; - struct vm_memory_segment seg; - - bzero(&seg, sizeof(seg)); - seg.gpa = gpa; - error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg); - *ret_len = seg.len; - if (wired != NULL) - *wired = seg.wired; + + return (ctx->memflags); +} + +/* + * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). + */ +int +vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot) +{ + struct vm_memmap memmap; + int error, flags; + + memmap.gpa = gpa; + memmap.segid = segid; + memmap.segoff = off; + memmap.len = len; + memmap.prot = prot; + memmap.flags = 0; + + if (ctx->memflags & VM_MEM_F_WIRED) + memmap.flags |= VM_MEMMAP_F_WIRED; + + /* + * If this mapping already exists then don't create it again. This + * is the common case for SYSMEM mappings created by bhyveload(8). 
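+ * Retrying an identical mapping is therefore idempotent and returns 0,
+ * while a conflicting mapping at the same gpa fails with EEXIST.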
+ */ + error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); + if (error == 0 && gpa == memmap.gpa) { + if (segid != memmap.segid || off != memmap.segoff || + prot != memmap.prot || flags != memmap.flags) { + errno = EEXIST; + return (-1); + } else { + return (0); + } + } + + error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); return (error); } -uint32_t -vm_get_lowmem_limit(struct vmctx *ctx) +int +vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { + struct vm_memmap memmap; + int error; - return (ctx->lowmem_limit); + bzero(&memmap, sizeof(struct vm_memmap)); + memmap.gpa = *gpa; + error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); + if (error == 0) { + *gpa = memmap.gpa; + *segid = memmap.segid; + *segoff = memmap.segoff; + *len = memmap.len; + *prot = memmap.prot; + *flags = memmap.flags; + } + return (error); } -void -vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) +/* + * Return 0 if the segments are identical and non-zero otherwise. + * + * This is slightly complicated by the fact that only device memory segments + * are named. + */ +static int +cmpseg(size_t len, const char *str, size_t len2, const char *str2) { - ctx->lowmem_limit = limit; + if (len == len2) { + if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) + return (0); + } + return (-1); } static int -setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr) +vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name) { + struct vm_memseg memseg; + size_t n; int error; - struct vm_memory_segment seg; /* - * Create and optionally map 'len' bytes of memory at guest - * physical address 'gpa' + * If the memory segment has already been created then just return. + * This is the usual case for the SYSMEM segment created by userspace + * loaders like bhyveload(8). 
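+ * A segment that already exists with a different length or name is
+ * rejected with EINVAL rather than silently reused.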
*/ - bzero(&seg, sizeof(seg)); - seg.gpa = gpa; - seg.len = len; - error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg); - if (error == 0 && addr != NULL) { - *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, - ctx->fd, gpa); + error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, + sizeof(memseg.name)); + if (error) + return (error); + + if (memseg.len != 0) { + if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { + errno = EINVAL; + return (-1); + } else { + return (0); + } + } + + bzero(&memseg, sizeof(struct vm_memseg)); + memseg.segid = segid; + memseg.len = len; + if (name != NULL) { + n = strlcpy(memseg.name, name, sizeof(memseg.name)); + if (n >= sizeof(memseg.name)) { + errno = ENAMETOOLONG; + return (-1); + } + } + + error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); + return (error); +} + +int +vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, + size_t bufsize) +{ + struct vm_memseg memseg; + size_t n; + int error; + + memseg.segid = segid; + error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); + if (error == 0) { + *lenp = memseg.len; + n = strlcpy(namebuf, memseg.name, bufsize); + if (n >= bufsize) { + errno = ENAMETOOLONG; + error = -1; + } } return (error); } +static int +#ifdef __FreeBSD__ +setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base) +#else +setup_memory_segment(struct vmctx *ctx, int segid, vm_paddr_t gpa, size_t len, + char *base) +#endif +{ + char *ptr; + int error, flags; + + /* Map 'len' bytes starting at 'gpa' in the guest address space */ +#ifdef __FreeBSD__ + error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL); +#else + /* + * As we use two segments for lowmem/highmem the offset within the + * segment is 0 on illumos. + */ + error = vm_mmap_memseg(ctx, gpa, segid, 0, len, PROT_ALL); +#endif + if (error) + return (error); + + flags = MAP_SHARED | MAP_FIXED; + if ((ctx->memflags & VM_MEM_F_INCORE) == 0) + flags |= MAP_NOCORE; + + /* mmap into the process address space on the host */ + ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); + if (ptr == MAP_FAILED) + return (-1); + + return (0); +} + int vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) { - char **addr; + size_t objsize, len; + vm_paddr_t gpa; + char *baseaddr, *ptr; int error; - /* XXX VM_MMAP_SPARSE not implemented yet */ - assert(vms == VM_MMAP_NONE || vms == VM_MMAP_ALL); - ctx->vms = vms; + assert(vms == VM_MMAP_ALL); /* * If 'memsize' cannot fit entirely in the 'lowmem' segment then @@ -349,81 +438,100 @@ vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) */ if (memsize > ctx->lowmem_limit) { ctx->lowmem = ctx->lowmem_limit; - ctx->highmem = memsize - ctx->lowmem; + ctx->highmem = memsize - ctx->lowmem_limit; + objsize = 4*GB + ctx->highmem; } else { ctx->lowmem = memsize; ctx->highmem = 0; + objsize = ctx->lowmem; } - if (ctx->lowmem > 0) { - addr = (vms == VM_MMAP_ALL) ? &ctx->lowermem_addr : NULL; - error = setup_memory_segment(ctx, 0, 640*KB, addr); - if (error) - return (error); +#ifdef __FreeBSD__ + error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL); + if (error) + return (error); +#endif + + /* + * Stake out a contiguous region covering the guest physical memory + * and the adjoining guard regions. + */ + len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; + ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); + if (ptr == MAP_FAILED) + return (-1); + + baseaddr = ptr + VM_MMAP_GUARD_SIZE; - addr = (vms == VM_MMAP_ALL) ? 
&ctx->biosmem_addr : NULL; - error = setup_memory_segment(ctx, 768*KB, 256*KB, addr); +#ifdef __FreeBSD__ + if (ctx->highmem > 0) { + gpa = 4*GB; + len = ctx->highmem; + error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); + } - addr = (vms == VM_MMAP_ALL) ? &ctx->lowmem_addr : NULL; - error = setup_memory_segment(ctx, 1*MB, ctx->lowmem - 1*MB, addr); + if (ctx->lowmem > 0) { + gpa = 0; + len = ctx->lowmem; + error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } - +#else if (ctx->highmem > 0) { - addr = (vms == VM_MMAP_ALL) ? &ctx->highmem_addr : NULL; - error = setup_memory_segment(ctx, 4*GB, ctx->highmem, addr); + error = vm_alloc_memseg(ctx, VM_HIGHMEM, ctx->highmem, NULL); + if (error) + return (error); + gpa = 4*GB; + len = ctx->highmem; + error = setup_memory_segment(ctx, VM_HIGHMEM, gpa, len, baseaddr); if (error) return (error); } - return (0); -} + if (ctx->lowmem > 0) { + error = vm_alloc_memseg(ctx, VM_LOWMEM, ctx->lowmem, NULL); + if (error) + return (error); + gpa = 0; + len = ctx->lowmem; + error = setup_memory_segment(ctx, VM_LOWMEM, gpa, len, baseaddr); + if (error) + return (error); + } +#endif -int -vm_setup_rom(struct vmctx *ctx, vm_paddr_t gpa, size_t len) -{ - ctx->rombase = gpa; - ctx->romlimit = gpa + len; + ctx->baseaddr = baseaddr; - return (setup_memory_segment(ctx, gpa, len, &ctx->rom_addr)); + return (0); } +/* + * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in + * the lowmem or highmem regions. + * + * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. + * The instruction emulation code depends on this behavior. + */ void * vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { - /* XXX VM_MMAP_SPARSE not implemented yet */ - assert(ctx->vms == VM_MMAP_ALL); - - if (gaddr + len <= 1*MB) { - if (gaddr + len <= 640*KB) - return ((void *)(ctx->lowermem_addr + gaddr)); - - if (768*KB <= gaddr && gaddr + len <= 1*MB) { - gaddr -= 768*KB; - return ((void *)(ctx->biosmem_addr + gaddr)); - } - - return (NULL); - } - - if (gaddr < ctx->lowmem && gaddr + len <= ctx->lowmem) { - gaddr -= 1*MB; - return ((void *)(ctx->lowmem_addr + gaddr)); - } - - if (ctx->rombase <= gaddr && gaddr + len <= ctx->romlimit) { - gaddr -= ctx->rombase; - return ((void *)(ctx->rom_addr + gaddr)); + if (ctx->lowmem > 0) { + if (gaddr < ctx->lowmem && len <= ctx->lowmem && + gaddr + len <= ctx->lowmem) + return (ctx->baseaddr + gaddr); } - if (gaddr >= 4*GB) { - gaddr -= 4*GB; - if (gaddr < ctx->highmem && gaddr + len <= ctx->highmem) - return ((void *)(ctx->highmem_addr + gaddr)); + if (ctx->highmem > 0) { + if (gaddr >= 4*GB) { + if (gaddr < 4*GB + ctx->highmem && + len <= ctx->highmem && + gaddr + len <= 4*GB + ctx->highmem) + return (ctx->baseaddr + gaddr); + } } return (NULL); @@ -443,6 +551,79 @@ vm_get_highmem_size(struct vmctx *ctx) return (ctx->highmem); } +void * +vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) +{ +#ifdef __FreeBSD__ + char pathname[MAXPATHLEN]; +#endif + size_t len2; + char *base, *ptr; + int fd, error, flags; + off_t mapoff; + + fd = -1; + ptr = MAP_FAILED; + if (name == NULL || strlen(name) == 0) { + errno = EINVAL; + goto done; + } + + error = vm_alloc_memseg(ctx, segid, len, name); + if (error) + goto done; + +#ifdef __FreeBSD__ + strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); + strlcat(pathname, ctx->name, sizeof(pathname)); + strlcat(pathname, ".", sizeof(pathname)); + strlcat(pathname, name, sizeof(pathname)); 
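+	/* The resulting node path is "/dev/vmm.io/<vmname>.<segname>". */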
+ + fd = open(pathname, O_RDWR); + if (fd < 0) + goto done; +#else + { + struct vm_devmem_offset vdo; + + vdo.segid = segid; + error = ioctl(ctx->fd, VM_DEVMEM_GETOFFSET, &vdo); + if (error == 0) { + mapoff = vdo.offset; + } else { + goto done; + } + } +#endif + + /* + * Stake out a contiguous region covering the device memory and the + * adjoining guard regions. + */ + len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; + base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, + 0); + if (base == MAP_FAILED) + goto done; + + flags = MAP_SHARED | MAP_FIXED; + if ((ctx->memflags & VM_MEM_F_INCORE) == 0) + flags |= MAP_NOCORE; + +#ifdef __FreeBSD__ + /* mmap the devmem region in the host address space */ + ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); +#else + /* mmap the devmem region in the host address space */ + ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, ctx->fd, + mapoff); +#endif +done: + if (fd >= 0) + close(fd); + return (ptr); +} + int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access) @@ -521,6 +702,40 @@ vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) return (error); } +int +vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, + const int *regnums, uint64_t *regvals) +{ + int error; + struct vm_register_set vmregset; + + bzero(&vmregset, sizeof(vmregset)); + vmregset.cpuid = vcpu; + vmregset.count = count; + vmregset.regnums = regnums; + vmregset.regvals = regvals; + + error = ioctl(ctx->fd, VM_SET_REGISTER_SET, &vmregset); + return (error); +} + +int +vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, + const int *regnums, uint64_t *regvals) +{ + int error; + struct vm_register_set vmregset; + + bzero(&vmregset, sizeof(vmregset)); + vmregset.cpuid = vcpu; + vmregset.count = count; + vmregset.regnums = regnums; + vmregset.regvals = regvals; + + error = ioctl(ctx->fd, VM_GET_REGISTER_SET, &vmregset); + return (error); +} + int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) { @@ -535,19 +750,21 @@ vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) return (error); } -static int -vm_inject_exception_real(struct vmctx *ctx, int vcpu, int vector, - int error_code, int error_code_valid) +int +vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) { - struct vm_exception exc; + struct vm_suspend vmsuspend; - bzero(&exc, sizeof(exc)); - exc.cpuid = vcpu; - exc.vector = vector; - exc.error_code = error_code; - exc.error_code_valid = error_code_valid; + bzero(&vmsuspend, sizeof(vmsuspend)); + vmsuspend.how = how; + return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); +} - return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); +int +vm_reinit(struct vmctx *ctx) +{ + + return (ioctl(ctx->fd, VM_REINIT, 0)); } int @@ -774,7 +991,7 @@ vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) vmcap.cpuid = vcpu; vmcap.captype = cap; vmcap.capval = val; - + return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); } @@ -858,7 +1075,6 @@ vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } -#ifdef __FreeBSD__ uint64_t * vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries) @@ -869,7 +1085,7 @@ vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, vmstats.cpuid = vcpu; - error = ioctl(ctx->fd, VM_STATS, &vmstats); + error = ioctl(ctx->fd, VM_STATS_IOC, &vmstats); if (error == 0) { if (ret_entries) 
*ret_entries = vmstats.num_entries; @@ -891,7 +1107,6 @@ vm_get_stat_desc(struct vmctx *ctx, int index) else return (NULL); } -#endif int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) @@ -1112,9 +1327,9 @@ vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) return (error); } -static int -gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, - uint64_t gla, int prot, int *fault, uint64_t *gpa) +int +vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; @@ -1134,14 +1349,23 @@ gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, } int -vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa) +vm_gla2gpa_nofault(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *fault) { - int error, fault; + struct vm_gla2gpa gg; + int error; + + bzero(&gg, sizeof(struct vm_gla2gpa)); + gg.vcpuid = vcpu; + gg.prot = prot; + gg.gla = gla; + gg.paging = *paging; - error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, gpa); - if (fault) - error = fault; + error = ioctl(ctx->fd, VM_GLA2GPA_NOFAULT, &gg); + if (error == 0) { + *fault = gg.fault; + *gpa = gg.gpa; + } return (error); } @@ -1151,11 +1375,12 @@ vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, - uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt) + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, + int *fault) { void *va; uint64_t gpa; - int error, fault, i, n, off; + int error, i, n, off; for (i = 0; i < iovcnt; i++) { iov[i].iov_base = 0; @@ -1164,18 +1389,16 @@ vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, while (len) { assert(iovcnt > 0); - error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, &gpa); - if (error) - return (-1); - if (fault) - return (1); + error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault); + if (error || *fault) + return (error); off = gpa & PAGE_MASK; n = min(len, PAGE_SIZE - off); va = vm_map_gpa(ctx, gpa, n); if (va == NULL) - return (-1); + return (EFAULT); iov->iov_base = va; iov->iov_len = n; @@ -1236,6 +1459,42 @@ vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov, } } +static int +vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) +{ + struct vm_cpuset vm_cpuset; + int error; + + bzero(&vm_cpuset, sizeof(struct vm_cpuset)); + vm_cpuset.which = which; + vm_cpuset.cpusetsize = sizeof(cpuset_t); + vm_cpuset.cpus = cpus; + + error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); + return (error); +} + +int +vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); +} + +int +vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); +} + +int +vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus)); +} + int vm_activate_cpu(struct vmctx *ctx, int vcpu) { @@ -1248,6 +1507,111 @@ vm_activate_cpu(struct vmctx *ctx, int vcpu) return (error); } +int +vm_suspend_cpu(struct vmctx *ctx, int vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + ac.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); + return (error); +} + +int +vm_resume_cpu(struct 
vmctx *ctx, int vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + ac.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_RESUME_CPU, &ac); + return (error); +} + +int +vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) +{ + struct vm_intinfo vmii; + int error; + + bzero(&vmii, sizeof(struct vm_intinfo)); + vmii.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); + if (error == 0) { + *info1 = vmii.info1; + *info2 = vmii.info2; + } + return (error); +} + +int +vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) +{ + struct vm_intinfo vmii; + int error; + + bzero(&vmii, sizeof(struct vm_intinfo)); + vmii.vcpuid = vcpu; + vmii.info1 = info1; + error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); + return (error); +} + +int +vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) +{ + struct vm_rtc_data rtcdata; + int error; + + bzero(&rtcdata, sizeof(struct vm_rtc_data)); + rtcdata.offset = offset; + rtcdata.value = value; + error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); + return (error); +} + +int +vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) +{ + struct vm_rtc_data rtcdata; + int error; + + bzero(&rtcdata, sizeof(struct vm_rtc_data)); + rtcdata.offset = offset; + error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); + if (error == 0) + *retval = rtcdata.value; + return (error); +} + +int +vm_rtc_settime(struct vmctx *ctx, time_t secs) +{ + struct vm_rtc_time rtctime; + int error; + + bzero(&rtctime, sizeof(struct vm_rtc_time)); + rtctime.secs = secs; + error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); + return (error); +} + +int +vm_rtc_gettime(struct vmctx *ctx, time_t *secs) +{ + struct vm_rtc_time rtctime; + int error; + + bzero(&rtctime, sizeof(struct vm_rtc_time)); + error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); + if (error == 0) + *secs = rtctime.secs; + return (error); +} + int vm_restart_instruction(void *arg, int vcpu) { @@ -1255,3 +1619,92 @@ vm_restart_instruction(void *arg, int vcpu) return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); } + +int +vm_set_topology(struct vmctx *ctx, + uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) +{ + struct vm_cpu_topology topology; + + bzero(&topology, sizeof (struct vm_cpu_topology)); + topology.sockets = sockets; + topology.cores = cores; + topology.threads = threads; + topology.maxcpus = maxcpus; + return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); +} + +int +vm_get_topology(struct vmctx *ctx, + uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) +{ + struct vm_cpu_topology topology; + int error; + + bzero(&topology, sizeof (struct vm_cpu_topology)); + error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); + if (error == 0) { + *sockets = topology.sockets; + *cores = topology.cores; + *threads = topology.threads; + *maxcpus = topology.maxcpus; + } + return (error); +} + +int +vm_get_device_fd(struct vmctx *ctx) +{ + + return (ctx->fd); +} + +#ifndef __FreeBSD__ +int +vm_wrlock_cycle(struct vmctx *ctx) +{ + if (ioctl(ctx->fd, VM_WRLOCK_CYCLE, 0) != 0) { + return (errno); + } + return (0); +} +#endif /* __FreeBSD__ */ + +#ifdef __FreeBSD__ +const cap_ioctl_t * +vm_get_ioctls(size_t *len) +{ + cap_ioctl_t *cmds; + /* keep in sync with machine/vmm_dev.h */ + static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT, + VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_MEMSEG, + VM_MMAP_GETNEXT, VM_SET_REGISTER, VM_GET_REGISTER, + VM_SET_SEGMENT_DESCRIPTOR, 
VM_GET_SEGMENT_DESCRIPTOR, + VM_SET_REGISTER_SET, VM_GET_REGISTER_SET, + VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ, + VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ, + VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ, + VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER, + VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, + VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, + VM_PPTDEV_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, + VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, + VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, + VM_GLA2GPA_NOFAULT, + VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU, + VM_SET_INTINFO, VM_GET_INTINFO, + VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME, + VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY }; + + if (len == NULL) { + cmds = malloc(sizeof(vm_ioctl_cmds)); + if (cmds == NULL) + return (NULL); + bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds)); + return (cmds); + } + + *len = nitems(vm_ioctl_cmds); + return (NULL); +} +#endif /* __FreeBSD__ */ diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h index d7eb67aa58..a1507255cb 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.h +++ b/usr/src/lib/libvmmapi/common/vmmapi.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/lib/libvmmapi/vmmapi.h 280929 2015-04-01 00:15:31Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,12 +38,20 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ #include +#include + +/* + * API version for out-of-tree consumers like grub-bhyve for making compile + * time decisions. + */ +#define VMMAPI_VERSION 0103 /* 2 digit major followed by 2 digit minor */ struct iovec; struct vmctx; @@ -57,19 +67,77 @@ enum vm_mmap_style { VM_MMAP_SPARSE, /* mappings created on-demand */ }; +/* + * 'flags' value passed to 'vm_set_memflags()'. + */ +#define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ +#define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ + +/* + * Identifiers for memory segments: + * - vm_setup_memory() uses VM_SYSMEM for the system memory segment. + * - the remaining identifiers can be used to create devmem segments. + */ +enum { +#ifdef __FreeBSD__ + VM_SYSMEM, +#else + VM_LOWMEM, + VM_HIGHMEM, +#endif + VM_BOOTROM, + VM_FRAMEBUFFER, +}; + +/* + * Get the length and name of the memory segment identified by 'segid'. + * Note that system memory segments are identified with a nul name. + * + * Returns 0 on success and non-zero otherwise. + */ +int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name, + size_t namesiz); + +/* + * Iterate over the guest address space. This function finds an address range + * that starts at an address >= *gpa. + * + * Returns 0 if the next address range was found and non-zero otherwise. + */ +int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); +/* + * Create a device memory segment identified by 'segid'. + * + * Returns a pointer to the memory segment on success and MAP_FAILED otherwise. 
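+ *
+ * For example, a frame buffer emulation might reserve a 16MB segment with:
+ *
+ *	fb = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer",
+ *	    16 * 1024 * 1024);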
+ */ +void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name, + size_t len); + +/* + * Map the memory segment identified by 'segid' into the guest address space + * at [gpa,gpa+len) with protection 'prot'. + */ +int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, + vm_ooffset_t segoff, size_t len, int prot); + int vm_create(const char *name); +int vm_get_device_fd(struct vmctx *ctx); struct vmctx *vm_open(const char *name); -int vm_destroy(struct vmctx *ctx); +void vm_destroy(struct vmctx *ctx); int vm_parse_memsize(const char *optarg, size_t *memsize); -int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, - int *wired); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); -int vm_setup_rom(struct vmctx *ctx, vm_paddr_t gpa, size_t len); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); +int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa); + uint64_t gla, int prot, uint64_t *gpa, int *fault); +int vm_gla2gpa_nofault(struct vmctx *, int vcpuid, + struct vm_guest_paging *paging, uint64_t gla, int prot, + uint64_t *gpa, int *fault); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); +void vm_set_memflags(struct vmctx *ctx, int flags); +int vm_get_memflags(struct vmctx *ctx); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, @@ -80,7 +148,13 @@ int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc); int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); +int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, + const int *regnums, uint64_t *regvals); +int vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, + const int *regnums, uint64_t *regvals); int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); +int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); +int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction); @@ -113,6 +187,13 @@ int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); +int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); + +#ifdef __FreeBSD__ +const cap_ioctl_t *vm_get_ioctls(size_t *len); +#endif + /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. */ @@ -127,11 +208,16 @@ int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); /* * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. - * The 'iovcnt' should be big enough to accomodate all GPA segments. - * Returns 0 on success, 1 on a guest fault condition and -1 otherwise. + * The 'iovcnt' should be big enough to accommodate all GPA segments. 
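+ * On success, each iovec entry holds the host virtual address and length of
+ * one contiguous guest-physical run backing the translated range.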
+ * + * retval fault Interpretation + * 0 0 Success + * 0 1 An exception was injected into the guest + * EFAULT N/A Error */ int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg, - uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt); + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, + int *fault); void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, void *host_dst, size_t len); void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, @@ -139,10 +225,32 @@ void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt); +/* RTC */ +int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); +int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); +int vm_rtc_settime(struct vmctx *ctx, time_t secs); +int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); + /* Reset vcpu register state */ int vcpu_reset(struct vmctx *ctx, int vcpu); +int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); +int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); +int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_activate_cpu(struct vmctx *ctx, int vcpu); +int vm_suspend_cpu(struct vmctx *ctx, int vcpu); +int vm_resume_cpu(struct vmctx *ctx, int vcpu); + +/* CPU topology */ +int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus); +int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus); + +#ifndef __FreeBSD__ +/* illumos-specific APIs */ +int vm_wrlock_cycle(struct vmctx *ctx); +#endif /* __FreeBSD__ */ #ifdef __FreeBSD__ /* diff --git a/usr/src/pkg/manifests/system-bhyve-tests.mf b/usr/src/pkg/manifests/system-bhyve-tests.mf new file mode 100644 index 0000000000..14586b5177 --- /dev/null +++ b/usr/src/pkg/manifests/system-bhyve-tests.mf @@ -0,0 +1,35 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2018 OmniOS Community Edition (OmniOSce) Association. 
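+#
+# The mevent tests delivered here install under /opt/bhyvetest and are
+# driven by the bhyvetest script packaged alongside them.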
+# + +set name=pkg.fmri value=pkg:/system/bhyve/tests@$(PKGVERS) +set name=pkg.description value="BSD hypervisor tests" +set name=pkg.summary value="BSD hypervisor tests" +set name=info.classification \ + value=org.opensolaris.category.2008:System/Virtualization +set name=variant.arch value=i386 +dir path=opt/bhyvetest +dir path=opt/bhyvetest/bin +dir path=opt/bhyvetest/tst +dir path=opt/bhyvetest/tst/mevent +file path=opt/bhyvetest/bin/bhyvetest mode=0555 +file path=opt/bhyvetest/tst/mevent/lists.delete.exe mode=0555 +file path=opt/bhyvetest/tst/mevent/read.disable.exe mode=0555 +file path=opt/bhyvetest/tst/mevent/read.pause.exe mode=0555 +file path=opt/bhyvetest/tst/mevent/read.requeue.exe mode=0555 +license lic_CDDL license=lic_CDDL diff --git a/usr/src/pkg/manifests/system-bhyve.mf b/usr/src/pkg/manifests/system-bhyve.mf new file mode 100644 index 0000000000..2a51d4fc22 --- /dev/null +++ b/usr/src/pkg/manifests/system-bhyve.mf @@ -0,0 +1,46 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2018 Joyent, Inc. +# Copyright 2019 OmniOS Community Edition (OmniOSce) Association. +# + +# +# The default for payload-bearing actions in this package is to appear in the +# global zone only. See the include file for greater detail, as well as +# information about overriding the defaults. +# + +set name=pkg.fmri value=pkg:/system/bhyve@$(PKGVERS) +set name=pkg.description value="BSD hypervisor" +set name=pkg.summary value="BSD hypervisor" +set name=info.classification \ + value=org.opensolaris.category.2008:System/Virtualization +set name=variant.arch value=i386 +dir path=kernel group=sys +dir path=usr group=sys +dir path=usr/kernel/drv group=sys +dir path=usr/kernel/drv/$(ARCH64) group=sys +dir path=usr/sbin +driver name=vmm +file path=usr/kernel/drv/$(ARCH64)/vmm +file path=usr/kernel/drv/vmm.conf +file path=usr/sbin/bhyve mode=0555 +file path=usr/sbin/bhyvectl mode=0555 +license lic_CDDL license=lic_CDDL +depend fmri=developer/acpi type=require +depend fmri=system/bhyve/firmware type=require +depend fmri=system/library/bhyve type=require diff --git a/usr/src/pkg/manifests/system-library-bhyve.mf b/usr/src/pkg/manifests/system-library-bhyve.mf new file mode 100644 index 0000000000..d9a15e1b37 --- /dev/null +++ b/usr/src/pkg/manifests/system-library-bhyve.mf @@ -0,0 +1,31 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 
+# + +set name=pkg.fmri value=pkg:/system/library/bhyve@$(PKGVERS) +set name=pkg.description value="BSD hypervisor (libraries)" +set name=pkg.summary value="BSD hypervisor (libraries)" +set name=info.classification \ + value=org.opensolaris.category.2008:System/Virtualization +set name=variant.arch value=i386 +dir path=lib group=bin +dir path=lib/$(ARCH64) group=bin +dir path=usr group=sys +dir path=usr/lib group=bin +file path=lib/$(ARCH64)/libvmmapi.so.1 +license lic_CDDL license=lic_CDDL diff --git a/usr/src/req.flg b/usr/src/req.flg index 9c992b1120..26415fa51f 100644 --- a/usr/src/req.flg +++ b/usr/src/req.flg @@ -33,3 +33,5 @@ echo_file usr/src/Makefile.master.64 echo_file usr/src/Makefile.msg.targ echo_file usr/src/Makefile.psm echo_file usr/src/Makefile.psm.targ + +find_files "s.*" usr/contrib/freebsd diff --git a/usr/src/tools/scripts/build_cscope.conf b/usr/src/tools/scripts/build_cscope.conf index 859b5137d6..298db1281b 100644 --- a/usr/src/tools/scripts/build_cscope.conf +++ b/usr/src/tools/scripts/build_cscope.conf @@ -22,8 +22,8 @@ # # Copyright 2005 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2018 Joyent, Inc. # -# ident "%Z%%M% %I% %E% SMI" # # This file configures the set of cross-references built by build_cscope. # The format is: @@ -35,6 +35,6 @@ # directories. # -complete -f . +complete "" . uts "" uts uts/sun4u uts/sun4v uts/i86pc psm "" psm/stand psm/stand/boot psm/stand/boot/sparcv9/sun4u psm/stand/boot/sparcv9/sun4v diff --git a/usr/src/tools/scripts/gensetdefs.pl b/usr/src/tools/scripts/gensetdefs.pl deleted file mode 100644 index 8ca5782feb..0000000000 --- a/usr/src/tools/scripts/gensetdefs.pl +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/perl -w -# -# COPYRIGHT 2013 Pluribus Networks Inc. -# -# All rights reserved. This copyright notice is Copyright Management -# Information under 17 USC 1202 and is included to protect this work and -# deter copyright infringement. Removal or alteration of this Copyright -# Management Information without the express written permission from -# Pluribus Networks Inc is prohibited, and any such unauthorized removal -# or alteration will be a violation of federal law. - -use strict; - -my @Sections = split(/\n/, `elfedit -r -e \'shdr:sh_name -osimple\' $ARGV[0] 2>&1`); - -foreach my $Section (@Sections) { - if ($Section =~ "^set_") { - print "\tfixing $Section\n"; - - chomp(my $SectionAddr = `elfedit -r -e \'shdr:sh_addr -onum $Section\' $ARGV[0] 2>&1`); - chomp(my $SectionSize = `elfedit -r -e \'shdr:sh_size -onum $Section\' $ARGV[0] 2>&1`); - my $SectionEnd = hex($SectionAddr) + hex($SectionSize); - - `elfedit -e \'sym:st_bind __start_$Section global\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_value __start_$Section $SectionAddr\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_shndx __start_$Section $Section\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_bind __stop_$Section global\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_value __stop_$Section $SectionEnd\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_shndx __stop_$Section $Section\' $ARGV[0] 2>&1`; - } -} diff --git a/usr/src/uts/Makefile.targ b/usr/src/uts/Makefile.targ index c5c32caa19..c42f458948 100644 --- a/usr/src/uts/Makefile.targ +++ b/usr/src/uts/Makefile.targ @@ -23,6 +23,7 @@ # Copyright 2014 Garrett D'Amore # Copyright 2016 Hans Rosenfeld # Copyright (c) 2017 by Delphix. All rights reserved. +# Copyright 2019 Joyent, Inc. # # This Makefiles contains the common targets and definitions for # all kernels. 
It is to be included in the Makefiles for specific @@ -51,7 +52,7 @@ $(OBJECTS): $(INLINES) # Partially link .o files to generate the kmod. The fake dependency # on modstubs simplifies things... # -$(BINARY): $(OBJECTS) $(DTRACE_MAPFILE) +$(BINARY): $(OBJECTS) $(DTRACE_MAPFILE) $(MAPFILE) $(LD) -r $(LDFLAGS) -o $@ $(OBJECTS) $(CTFMERGE_UNIQUIFY_AGAINST_GENUNIX) $(POST_PROCESS) diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index dbed5ea9cc..63f314ca93 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -1924,9 +1924,9 @@ LINT_DEFS += -Dunix # It is a bug in the current compilation system that the assember # can't process the -Y I, flag. # -NATIVE_INC_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common -AS_INC_PATH += $(INC_PATH) -I$(UTSBASE)/common -INCLUDE_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common +NATIVE_INC_PATH += $(PRE_INC_PATH) $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common +AS_INC_PATH += $(PRE_INC_PATH) $(INC_PATH) -I$(UTSBASE)/common +INCLUDE_PATH += $(PRE_INC_PATH) $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common PCIEB_OBJS += pcieb.o diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 879b8d86cb..ca4ae0cd65 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -23,8 +23,8 @@ # Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. # # Copyright (c) 2010, Intel Corporation. -# Copyright 2018 Joyent, Inc. # Copyright 2019 OmniOS Community Edition (OmniOSce) Association. +# Copyright 2019 Joyent, Inc. # # This Makefile defines file modules in the directory uts/i86pc # and its children. These are the source files which are i86pc @@ -237,6 +237,46 @@ UPPC_OBJS += uppc.o psm_common.o XSVC_OBJS += xsvc.o AMD_IOMMU_OBJS += amd_iommu.o amd_iommu_impl.o amd_iommu_acpi.o \ amd_iommu_cmd.o amd_iommu_log.o amd_iommu_page_tables.o +VMM_OBJS += vmm.o \ + vmm_sol_dev.o \ + vmm_host.o \ + vmm_instruction_emul.o \ + vmm_ioport.o \ + vmm_lapic.o \ + vmm_mem.o \ + vmm_stat.o \ + vmm_util.o \ + x86.o \ + vdev.o \ + vatpic.o \ + vatpit.o \ + vhpet.o \ + vioapic.o \ + vlapic.o \ + vrtc.o \ + vpmtmr.o \ + ept.o \ + vmcs.o \ + vmx_msr.o \ + vmx.o \ + vmx_support.o \ + svm.o \ + svm_msr.o \ + npt.o \ + vmcb.o \ + svm_support.o \ + amdv.o \ + sol_iommu.o \ + sol_ppt.o \ + gipt.o \ + vmm_sol_vm.o \ + vmm_sol_glue.o \ + vmm_sol_ept.o \ + vmm_sol_rvi.o \ + vmm_support.o \ + vmm_zsd.o + +VIONA_OBJS += viona.o # # Build up defines and paths. 
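To make the reworked libvmmapi memory interfaces from earlier in this patch
concrete, the following is a minimal consumer sketch. It is illustrative only
and not part of the patch: the VM name "example" is a placeholder, the
<machine/vmm.h> and <vmmapi.h> include paths are assumed to be reachable, and
creating a VM requires sufficient privilege.

	#include <sys/types.h>
	#include <err.h>
	#include <stdio.h>

	#include <machine/vmm.h>
	#include <vmmapi.h>

	int
	main(void)
	{
		struct vmctx *ctx;
		void *hva;

		if (vm_create("example") != 0)
			err(1, "vm_create");
		if ((ctx = vm_open("example")) == NULL)
			err(1, "vm_open");

		/* Back 1 GiB of guest memory and map it into this process. */
		if (vm_setup_memory(ctx, 1UL << 30, VM_MMAP_ALL) != 0)
			err(1, "vm_setup_memory");

		/* Translate guest-physical address 0 to a host virtual address. */
		hva = vm_map_gpa(ctx, 0, 4096);
		printf("GPA 0 maps to host VA %p\n", hva);

		/* vm_destroy() now returns void; errors are not reported. */
		vm_destroy(ctx);
		return (0);
	}

Note that vm_setup_memory() now stakes out a single contiguous host
reservation (with guard regions on either side) covering both the lowmem and
highmem segments, so vm_map_gpa() reduces to bounds checks plus pointer
arithmetic on ctx->baseaddr.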
diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc index f5021ec738..b66b0ca2da 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc +++ b/usr/src/uts/i86pc/Makefile.i86pc @@ -246,6 +246,7 @@ DRV_KMODS += dr DRV_KMODS += ioat DRV_KMODS += fipe DRV_KMODS += imc imcstub +DRV_KMODS += vmm DRV_KMODS += cpudrv diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules index e4f2fee0a0..3d3c8131c1 100644 --- a/usr/src/uts/i86pc/Makefile.rules +++ b/usr/src/uts/i86pc/Makefile.rules @@ -225,6 +225,35 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/dboot/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/amd/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/intel/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/io/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/intel/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/amd/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/viona/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + # # dboot stuff is always 32 bit, linked to run with phys_addr == virt_addr # diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c index 40bdd80a6e..2371a2f3ae 100644 --- a/usr/src/uts/i86pc/io/viona/viona.c +++ b/usr/src/uts/i86pc/io/viona/viona.c @@ -34,6 +34,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include @@ -194,8 +195,8 @@ static void *viona_state; static dev_info_t *viona_dip; static id_space_t *viona_minor_ids; /* - * copy tx mbufs from virtio ring to avoid necessitating a wait - * for packet transmission to free resources. + * copy tx mbufs from virtio ring to avoid necessitating a wait for packet + * transmission to free resources. */ static boolean_t copy_tx_mblks = B_TRUE; @@ -914,7 +915,7 @@ viona_ioc_tx_intr_clear(viona_link_t *link) static int vq_popchain(viona_link_t *link, viona_vring_hqueue_t *hq, struct iovec *iov, -int n_iov, uint16_t *cookie) + int n_iov, uint16_t *cookie) { int i; int ndesc, nindir; @@ -1139,10 +1140,12 @@ viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, size_t mblklen; int n, i = 0; uint16_t cookie; - struct virtio_net_hdr *vrx; - struct virtio_net_mrgrxhdr *vmrgrx; + struct virtio_net_hdr *vrx = NULL; + struct virtio_net_mrgrxhdr *vmrgrx = NULL; +#if notyet mblk_t *ml; - caddr_t buf; +#endif + caddr_t buf = NULL; int total_len = 0; int copied_buf = 0; int num_bufs = 0; @@ -1312,8 +1315,10 @@ viona_desb_free(viona_desb_t *dp) { viona_link_t *link; viona_vring_hqueue_t *hq; +#if notyet struct virtio_used *vu; int uidx; +#endif uint_t ref; ref = atomic_dec_uint_nv(&dp->d_ref); diff --git a/usr/src/uts/i86pc/io/vmm/README.sync b/usr/src/uts/i86pc/io/vmm/README.sync new file mode 100644 index 0000000000..1cddfd829e --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/README.sync @@ -0,0 +1,18 @@ +The bhyve kernel module and its associated userland consumers have been updated +to the latest upstream FreeBSD sources as of: + + +commit 3b9cb80b242682690203709aaff4eafae41c138f +Author: jhb +Date: Mon Jun 3 23:17:35 2019 +0000 + + Emulate the AMD MSR_LS_CFG MSR used for various Ryzen errata. + + Writes are ignored and reads always return zero. 
+ + Submitted by: José Albornoz (write-only version) + Reviewed by: Patrick Mooney, cem + MFC after: 2 weeks + Differential Revision: https://reviews.freebsd.org/D19506 + +Which corresponds to SVN revision: 348592 diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdv.c b/usr/src/uts/i86pc/io/vmm/amd/amdv.c index 6b62daae6c..c34a1e897b 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/amdv.c +++ b/usr/src/uts/i86pc/io/vmm/amd/amdv.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/amd/amdv.c 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,141 +38,18 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/amd/amdv.c 245678 2013-01-20 03:42:49Z neel $"); +__FBSDID("$FreeBSD$"); #include #include #include -#include #include -#ifdef __FreeBSD__ #include "io/iommu.h" -#endif - -static int -amdv_init(void) -{ - - printf("amdv_init: not implemented\n"); - return (ENXIO); -} - -static int -amdv_cleanup(void) -{ - - printf("amdv_cleanup: not implemented\n"); - return (ENXIO); -} - -static void * -amdv_vminit(struct vm *vm) -{ - - printf("amdv_vminit: not implemented\n"); - return (NULL); -} - -static int -amdv_vmrun(void *arg, int vcpu, register_t rip) -{ - - printf("amdv_vmrun: not implemented\n"); - return (ENXIO); -} - -static void -amdv_vmcleanup(void *arg) -{ - - printf("amdv_vmcleanup: not implemented\n"); - return; -} - -static int -amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, - vm_memattr_t attr, int prot, boolean_t spok) -{ - - printf("amdv_vmmmap_set: not implemented\n"); - return (EINVAL); -} - -static vm_paddr_t -amdv_vmmmap_get(void *arg, vm_paddr_t gpa) -{ - - printf("amdv_vmmmap_get: not implemented\n"); - return (EINVAL); -} - -static int -amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval) -{ - - printf("amdv_getreg: not implemented\n"); - return (EINVAL); -} - -static int -amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val) -{ - - printf("amdv_setreg: not implemented\n"); - return (EINVAL); -} - -static int -amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) -{ - - printf("amdv_get_desc: not implemented\n"); - return (EINVAL); -} - -static int -amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) -{ - - printf("amdv_get_desc: not implemented\n"); - return (EINVAL); -} - -static int -amdv_getcap(void *arg, int vcpu, int type, int *retval) -{ - - printf("amdv_getcap: not implemented\n"); - return (EINVAL); -} - -static int -amdv_setcap(void *arg, int vcpu, int type, int val) -{ - - printf("amdv_setcap: not implemented\n"); - return (EINVAL); -} - -struct vmm_ops vmm_ops_amd = { - amdv_init, - amdv_cleanup, - amdv_vminit, - amdv_vmrun, - amdv_vmcleanup, - amdv_vmmmap_set, - amdv_vmmmap_get, - amdv_getreg, - amdv_setreg, - amdv_getdesc, - amdv_setdesc, - amdv_getcap, - amdv_setcap -}; static int amd_iommu_init(void) @@ -234,14 +113,14 @@ amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len) } static void -amd_iommu_add_device(void *domain, int bus, int slot, int func) +amd_iommu_add_device(void *domain, uint16_t rid) { printf("amd_iommu_add_device: not implemented\n"); } static void 
-amd_iommu_remove_device(void *domain, int bus, int slot, int func) +amd_iommu_remove_device(void *domain, uint16_t rid) { printf("amd_iommu_remove_device: not implemented\n"); @@ -254,7 +133,6 @@ amd_iommu_invalidate_tlb(void *domain) printf("amd_iommu_invalidate_tlb: not implemented\n"); } -#ifdef __FreeBSD__ struct iommu_ops iommu_ops_amd = { amd_iommu_init, amd_iommu_cleanup, @@ -268,4 +146,3 @@ struct iommu_ops iommu_ops_amd = { amd_iommu_remove_device, amd_iommu_invalidate_tlb, }; -#endif diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c new file mode 100644 index 0000000000..f6b6e60363 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c @@ -0,0 +1,1461 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "pcib_if.h" + +#include "io/iommu.h" +#include "amdvi_priv.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW, NULL, NULL); + +#define MOD_INC(a, s, m) (((a) + (s)) % ((m) * (s))) +#define MOD_DEC(a, s, m) (((a) - (s)) % ((m) * (s))) + +/* Print RID or device ID in PCI string format. */ +#define RID2PCI_STR(d) PCI_RID2BUS(d), PCI_RID2SLOT(d), PCI_RID2FUNC(d) + +static void amdvi_dump_cmds(struct amdvi_softc *softc); +static void amdvi_print_dev_cap(struct amdvi_softc *softc); + +MALLOC_DEFINE(M_AMDVI, "amdvi", "amdvi"); + +extern device_t *ivhd_devs; + +extern int ivhd_count; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, count, CTLFLAG_RDTUN, &ivhd_count, + 0, NULL); + +static int amdvi_enable_user = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, enable, CTLFLAG_RDTUN, + &amdvi_enable_user, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi_enable", &amdvi_enable_user); + +#ifdef AMDVI_ATS_ENABLE +/* XXX: ATS is not tested. 
*/ +static int amdvi_enable_iotlb = 1; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, iotlb_enabled, CTLFLAG_RDTUN, + &amdvi_enable_iotlb, 0, NULL); +TUNABLE_INT("hw.vmm.enable_iotlb", &amdvi_enable_iotlb); +#endif + +static int amdvi_host_ptp = 1; /* Use page tables for host. */ +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, host_ptp, CTLFLAG_RDTUN, + &amdvi_host_ptp, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.host_ptp", &amdvi_host_ptp); + +/* Page table level used <= supported by h/w[v1=7]. */ +static int amdvi_ptp_level = 4; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, ptp_level, CTLFLAG_RDTUN, + &amdvi_ptp_level, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.ptp_level", &amdvi_ptp_level); + +/* Disable fault event reporting. */ +static int amdvi_disable_io_fault = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, disable_io_fault, CTLFLAG_RDTUN, + &amdvi_disable_io_fault, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.disable_io_fault", &amdvi_disable_io_fault); + +static uint32_t amdvi_dom_id = 0; /* 0 is reserved for host. */ +SYSCTL_UINT(_hw_vmm_amdvi, OID_AUTO, domain_id, CTLFLAG_RD, + &amdvi_dom_id, 0, NULL); +/* + * Device table entry. + * Bus(256) x Dev(32) x Fun(8) x DTE(256 bits or 32 bytes). + * = 256 * 2 * PAGE_SIZE. + */ +static struct amdvi_dte amdvi_dte[PCI_NUM_DEV_MAX] __aligned(PAGE_SIZE); +CTASSERT(PCI_NUM_DEV_MAX == 0x10000); +CTASSERT(sizeof(amdvi_dte) == 0x200000); + +static SLIST_HEAD (, amdvi_domain) dom_head; + +static inline uint32_t +amdvi_pci_read(struct amdvi_softc *softc, int off) +{ + + return (pci_cfgregread(PCI_RID2BUS(softc->pci_rid), + PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid), + off, 4)); +} + +#ifdef AMDVI_ATS_ENABLE +/* XXX: Should be in pci.c */ +/* + * Check if device has ATS capability and its enabled. + * If ATS is absent or disabled, return (-1), otherwise ATS + * queue length. + */ +static int +amdvi_find_ats_qlen(uint16_t devid) +{ + device_t dev; + uint32_t off, cap; + int qlen = -1; + + dev = pci_find_bsf(PCI_RID2BUS(devid), PCI_RID2SLOT(devid), + PCI_RID2FUNC(devid)); + + if (!dev) { + return (-1); + } +#define PCIM_ATS_EN BIT(31) + + if (pci_find_extcap(dev, PCIZ_ATS, &off) == 0) { + cap = pci_read_config(dev, off + 4, 4); + qlen = (cap & 0x1F); + qlen = qlen ? qlen : 32; + printf("AMD-Vi: PCI device %d.%d.%d ATS %s qlen=%d\n", + RID2PCI_STR(devid), + (cap & PCIM_ATS_EN) ? "enabled" : "Disabled", + qlen); + qlen = (cap & PCIM_ATS_EN) ? qlen : -1; + } + + return (qlen); +} + +/* + * Check if an endpoint device support device IOTLB or ATS. + */ +static inline bool +amdvi_dev_support_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct ivhd_dev_cfg *cfg; + int qlen, i; + bool pci_ats, ivhd_ats; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) + return (false); + + KASSERT(softc, ("softc is NULL")); + cfg = softc->dev_cfg; + + ivhd_ats = false; + for (i = 0; i < softc->dev_cfg_cnt; i++) { + if ((cfg->start_id <= devid) && (cfg->end_id >= devid)) { + ivhd_ats = cfg->enable_ats; + break; + } + cfg++; + } + + pci_ats = (qlen < 0) ? false : true; + if (pci_ats != ivhd_ats) + device_printf(softc->dev, + "BIOS bug: mismatch in ATS setting for %d.%d.%d," + "ATS inv qlen = %d\n", RID2PCI_STR(devid), qlen); + + /* Ignore IVRS setting and respect PCI setting. */ + return (pci_ats); +} +#endif + +/* Enable IOTLB support for IOMMU if its supported. */ +static inline void +amdvi_hw_enable_iotlb(struct amdvi_softc *softc) +{ +#ifndef AMDVI_ATS_ENABLE + softc->iotlb = false; +#else + bool supported; + + supported = (softc->ivhd_flag & IVHD_FLAG_IOTLB) ? 
true : false; + + if (softc->pci_cap & AMDVI_PCI_CAP_IOTLB) { + if (!supported) + device_printf(softc->dev, "IOTLB disabled by BIOS.\n"); + + if (supported && !amdvi_enable_iotlb) { + device_printf(softc->dev, "IOTLB disabled by user.\n"); + supported = false; + } + } else + supported = false; + + softc->iotlb = supported; + +#endif +} + +static int +amdvi_init_cmd(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl = softc->ctrl; + + ctrl->cmd.len = 8; /* Use 256 command buffer entries. */ + softc->cmd_max = 1 << ctrl->cmd.len; + + softc->cmd = malloc(sizeof(struct amdvi_cmd) * + softc->cmd_max, M_AMDVI, M_WAITOK | M_ZERO); + + if ((uintptr_t)softc->cmd & PAGE_MASK) + panic("AMDVi: Command buffer not aligned on page boundary."); + + ctrl->cmd.base = vtophys(softc->cmd) / PAGE_SIZE; + /* + * XXX: Reset the h/w pointers in case IOMMU is restarting, + * h/w doesn't clear these pointers based on empirical data. + */ + ctrl->cmd_tail = 0; + ctrl->cmd_head = 0; + + return (0); +} + +/* + * Note: Update tail pointer after we have written the command since tail + * pointer update cause h/w to execute new commands, see section 3.3 + * of AMD IOMMU spec ver 2.0. + */ +/* Get the command tail pointer w/o updating it. */ +static struct amdvi_cmd * +amdvi_get_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *tail; + + KASSERT(softc, ("softc is NULL")); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + tail = (struct amdvi_cmd *)((uint8_t *)softc->cmd + + ctrl->cmd_tail); + + return (tail); +} + +/* + * Update the command tail pointer which will start command execution. + */ +static void +amdvi_update_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int size; + + size = sizeof(struct amdvi_cmd); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + ctrl->cmd_tail = MOD_INC(ctrl->cmd_tail, size, softc->cmd_max); + softc->total_cmd++; + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "cmd_tail: %s Tail:0x%x, Head:0x%x.\n", + ctrl->cmd_tail, + ctrl->cmd_head); +#endif + +} + +/* + * Various commands supported by IOMMU. + */ + +/* Completion wait command. */ +static void +amdvi_cmd_cmp(struct amdvi_softc *softc, const uint64_t data) +{ + struct amdvi_cmd *cmd; + uint64_t pa; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + pa = vtophys(&softc->cmp_data); + cmd->opcode = AMDVI_CMP_WAIT_OPCODE; + cmd->word0 = (pa & 0xFFFFFFF8) | + (AMDVI_CMP_WAIT_STORE); + //(AMDVI_CMP_WAIT_FLUSH | AMDVI_CMP_WAIT_STORE); + cmd->word1 = (pa >> 32) & 0xFFFFF; + cmd->addr = data; + + amdvi_update_cmd_tail(softc); +} + +/* Invalidate device table entry. */ +static void +amdvi_cmd_inv_dte(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_DTE_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidated DTE:0x%x\n", devid); +#endif +} + +/* Invalidate IOMMU page, use for invalidation of domain. 
*/ +static void +amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id, + uint64_t addr, bool guest_nested, + bool pde, bool page) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + + cmd->opcode = AMDVI_INVD_PAGE_OPCODE; + cmd->word1 = domain_id; + /* + * Invalidate all addresses for this domain. + */ + cmd->addr = addr; + cmd->addr |= pde ? AMDVI_INVD_PAGE_PDE : 0; + cmd->addr |= page ? AMDVI_INVD_PAGE_S : 0; + + amdvi_update_cmd_tail(softc); +} + +#ifdef AMDVI_ATS_ENABLE +/* Invalidate device IOTLB. */ +static void +amdvi_cmd_inv_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + int qlen; + + if (!softc->iotlb) + return; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) { + panic("AMDVI: Invalid ATS qlen(%d) for device %d.%d.%d\n", + qlen, RID2PCI_STR(devid)); + } + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate IOTLB devID 0x%x" + " Qlen:%d\n", devid, qlen); +#endif + cmd->opcode = AMDVI_INVD_IOTLB_OPCODE; + cmd->word0 = devid; + cmd->word1 = qlen; + cmd->addr = AMDVI_INVD_IOTLB_ALL_ADDR | + AMDVI_INVD_IOTLB_S; + amdvi_update_cmd_tail(softc); +} +#endif + +#ifdef notyet /* For Interrupt Remap. */ +static void +amdvi_cmd_inv_intr_map(struct amdvi_softc *softc, + uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_INTR_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate INTR map of devID 0x%x\n", devid); +#endif +} +#endif + +/* Invalidate domain using INVALIDATE_IOMMU_PAGES command. */ +static void +amdvi_inv_domain(struct amdvi_softc *softc, uint16_t domain_id) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + /* + * See section 3.3.3 of IOMMU spec rev 2.0, software note + * for invalidating domain. + */ + amdvi_cmd_inv_iommu_pages(softc, domain_id, AMDVI_INVD_PAGE_ALL_ADDR, + false, true, true); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate domain:0x%x\n", domain_id); + +#endif +} + +static bool +amdvi_cmp_wait(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + const uint64_t VERIFY = 0xA5A5; + volatile uint64_t *read; + int i; + bool status; + + ctrl = softc->ctrl; + read = &softc->cmp_data; + *read = 0; + amdvi_cmd_cmp(softc, VERIFY); + /* Wait for h/w to update completion data. */ + for (i = 0; i < 100 && (*read != VERIFY); i++) { + DELAY(1000); /* 1 ms */ + } + status = (VERIFY == softc->cmp_data) ? true : false; + +#ifdef AMDVI_DEBUG_CMD + if (status) + device_printf(softc->dev, "CMD completion DONE Tail:0x%x, " + "Head:0x%x, loop:%d.\n", ctrl->cmd_tail, + ctrl->cmd_head, loop); +#endif + return (status); +} + +static void +amdvi_wait(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int i; + + KASSERT(softc, ("softc is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + /* Don't wait if h/w is not enabled. 
*/ + if ((ctrl->control & AMDVI_CTRL_EN) == 0) + return; + + for (i = 0; i < 10; i++) { + if (amdvi_cmp_wait(softc)) + return; + } + + device_printf(softc->dev, "Error: completion failed" + " tail:0x%x, head:0x%x.\n", + ctrl->cmd_tail, ctrl->cmd_head); + amdvi_dump_cmds(softc); +} + +static void +amdvi_dump_cmds(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *cmd; + int off, i; + + ctrl = softc->ctrl; + device_printf(softc->dev, "Dump all the commands:\n"); + /* + * If h/w is stuck in completion, it is the previous command, + * start dumping from previous command onward. + */ + off = MOD_DEC(ctrl->cmd_head, sizeof(struct amdvi_cmd), + softc->cmd_max); + for (i = 0; off != ctrl->cmd_tail && + i < softc->cmd_max; i++) { + cmd = (struct amdvi_cmd *)((uint8_t *)softc->cmd + off); + printf(" [CMD%d, off:0x%x] opcode= 0x%x 0x%x" + " 0x%x 0x%lx\n", i, off, cmd->opcode, + cmd->word0, cmd->word1, cmd->addr); + off = (off + sizeof(struct amdvi_cmd)) % + (softc->cmd_max * sizeof(struct amdvi_cmd)); + } +} + +static int +amdvi_init_event(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + + ctrl = softc->ctrl; + ctrl->event.len = 8; + softc->event_max = 1 << ctrl->event.len; + softc->event = malloc(sizeof(struct amdvi_event) * + softc->event_max, M_AMDVI, M_WAITOK | M_ZERO); + if ((uintptr_t)softc->event & PAGE_MASK) { + device_printf(softc->dev, "Event buffer not aligned on page."); + return (false); + } + ctrl->event.base = vtophys(softc->event) / PAGE_SIZE; + + /* Reset the pointers. */ + ctrl->evt_head = 0; + ctrl->evt_tail = 0; + + return (0); +} + +static inline void +amdvi_decode_evt_flag(uint16_t flag) +{ + + flag &= AMDVI_EVENT_FLAG_MASK; + printf(" 0x%b]\n", flag, + "\020" + "\001GN" + "\002NX" + "\003US" + "\004I" + "\005PR" + "\006RW" + "\007PE" + "\010RZ" + "\011TR" + ); +} + +/* See section 2.5.4 of AMD IOMMU spec ver 2.62.*/ +static inline void +amdvi_decode_evt_flag_type(uint8_t type) +{ + + switch (AMDVI_EVENT_FLAG_TYPE(type)) { + case 0: + printf("RSVD\n"); + break; + case 1: + printf("Master Abort\n"); + break; + case 2: + printf("Target Abort\n"); + break; + case 3: + printf("Data Err\n"); + break; + default: + break; + } +} + +static void +amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", + devid, domid, addr); + amdvi_decode_evt_flag(flag); +} + +static void +amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", + devid, domid, addr); + amdvi_decode_evt_flag(flag); +} + +static void +amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid, + uint64_t addr, uint16_t flag) +{ + + printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", devid, domid, addr); + amdvi_decode_evt_flag(flag); + amdvi_decode_evt_flag_type(flag); +} + +static void +amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", devid, domid, addr); + amdvi_decode_evt_flag(flag); + amdvi_decode_evt_flag_type(AMDVI_EVENT_FLAG_TYPE(flag)); +} + +static void +amdvi_decode_evt(struct amdvi_event *evt) +{ + struct amdvi_cmd *cmd; + + switch (evt->opcode) { + case AMDVI_EVENT_INVALID_DTE: + amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_PFAULT: + 
+static int
+amdvi_init_event(struct amdvi_softc *softc)
+{
+	struct amdvi_ctrl *ctrl;
+
+	ctrl = softc->ctrl;
+	ctrl->event.len = 8;
+	softc->event_max = 1 << ctrl->event.len;
+	softc->event = malloc(sizeof(struct amdvi_event) *
+	    softc->event_max, M_AMDVI, M_WAITOK | M_ZERO);
+	if ((uintptr_t)softc->event & PAGE_MASK) {
+		device_printf(softc->dev,
+		    "Event buffer not aligned on page.\n");
+		return (EINVAL);
+	}
+	ctrl->event.base = vtophys(softc->event) / PAGE_SIZE;
+
+	/* Reset the pointers. */
+	ctrl->evt_head = 0;
+	ctrl->evt_tail = 0;
+
+	return (0);
+}
+
+static inline void
+amdvi_decode_evt_flag(uint16_t flag)
+{
+
+	flag &= AMDVI_EVENT_FLAG_MASK;
+	printf(" 0x%b]\n", flag,
+	    "\020"
+	    "\001GN"
+	    "\002NX"
+	    "\003US"
+	    "\004I"
+	    "\005PR"
+	    "\006RW"
+	    "\007PE"
+	    "\010RZ"
+	    "\011TR"
+	    );
+}
+
+/* See section 2.5.4 of AMD IOMMU spec ver 2.62. */
+static inline void
+amdvi_decode_evt_flag_type(uint8_t type)
+{
+
+	switch (AMDVI_EVENT_FLAG_TYPE(type)) {
+	case 0:
+		printf("RSVD\n");
+		break;
+	case 1:
+		printf("Master Abort\n");
+		break;
+	case 2:
+		printf("Target Abort\n");
+		break;
+	case 3:
+		printf("Data Err\n");
+		break;
+	default:
+		break;
+	}
+}
+
+static void
+amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr,
+    uint16_t flag)
+{
+
+	printf("\t[INVALID_DTE EVT: devId:0x%x DomId:0x%x"
+	    " Addr:0x%lx",
+	    devid, domid, addr);
+	amdvi_decode_evt_flag(flag);
+}
+
+static void
+amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr,
+    uint16_t flag)
+{
+
+	printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x"
+	    " Addr:0x%lx",
+	    devid, domid, addr);
+	amdvi_decode_evt_flag(flag);
+}
+
+static void
+amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid,
+    uint64_t addr, uint16_t flag)
+{
+
+	printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x"
+	    " Addr:0x%lx", devid, domid, addr);
+	amdvi_decode_evt_flag(flag);
+	amdvi_decode_evt_flag_type(flag);
+}
+
+static void
+amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr,
+    uint16_t flag)
+{
+
+	printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x"
+	    " Addr:0x%lx", devid, domid, addr);
+	amdvi_decode_evt_flag(flag);
+	amdvi_decode_evt_flag_type(AMDVI_EVENT_FLAG_TYPE(flag));
+}
+
+static void
+amdvi_decode_evt(struct amdvi_event *evt)
+{
+	struct amdvi_cmd *cmd;
+
+	switch (evt->opcode) {
+	case AMDVI_EVENT_INVALID_DTE:
+		amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid,
+		    evt->addr, evt->flag);
+		break;
+
+	case AMDVI_EVENT_PFAULT:
+		amdvi_decode_pf_evt(evt->devid, evt->pasid_domid,
+		    evt->addr, evt->flag);
+		break;
+
+	case AMDVI_EVENT_DTE_HW_ERROR:
+		amdvi_decode_dte_hwerr_evt(evt->devid, evt->pasid_domid,
+		    evt->addr, evt->flag);
+		break;
+
+	case AMDVI_EVENT_PAGE_HW_ERROR:
+		amdvi_decode_page_hwerr_evt(evt->devid, evt->pasid_domid,
+		    evt->addr, evt->flag);
+		break;
+
+	case AMDVI_EVENT_ILLEGAL_CMD:
+		/* FALL THROUGH */
+	case AMDVI_EVENT_CMD_HW_ERROR:
+		printf("\t[%s EVT]\n",
+		    (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ?
+		    "ILLEGAL CMD" : "CMD HW ERR");
+		cmd = (struct amdvi_cmd *)PHYS_TO_DMAP(evt->addr);
+		printf("\tCMD opcode= 0x%x 0x%x 0x%x 0x%lx\n",
+		    cmd->opcode, cmd->word0, cmd->word1, cmd->addr);
+		break;
+
+	case AMDVI_EVENT_IOTLB_TIMEOUT:
+		printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx]\n",
+		    evt->devid, evt->addr);
+		break;
+
+	case AMDVI_EVENT_INVALID_DTE_REQ:
+		printf("\t[INV_DTE devid:0x%x addr:0x%lx type:0x%x tr:%d]\n",
+		    evt->devid, evt->addr, evt->flag >> 9,
+		    (evt->flag >> 8) & 1);
+		break;
+
+	case AMDVI_EVENT_INVALID_PPR_REQ:
+	case AMDVI_EVENT_COUNTER_ZERO:
+		printf("AMD-Vi: v2 events.\n");
+		break;
+
+	default:
+		printf("Unsupported AMD-Vi event:%d\n", evt->opcode);
+	}
+}
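+/*
+ * Editor's note -- illustration only.  The "\020\001GN..." strings
+ * used above are FreeBSD %b bit-name tables: the first octal char
+ * selects the numeric base (\020 == 16, i.e. hex), then each entry is
+ * a 1-based bit number in octal followed by its name.  A minimal
+ * usage sketch:
+ */
+#ifdef AMDVI_EDITOR_EXAMPLE
+static void
+example_decode_flag(void)
+{
+	/*
+	 * Prints the value in hex followed by the names of the set
+	 * bits, e.g. "21<GN,RW>" for 0x21 (bits 1 and 6 set).
+	 */
+	printf("%b\n", 0x21, "\020\001GN\002NX\003US\004I\005PR\006RW");
+}
+#endif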
+static void
+amdvi_print_events(struct amdvi_softc *softc)
+{
+	struct amdvi_ctrl *ctrl;
+	struct amdvi_event *event;
+	int i, size;
+
+	ctrl = softc->ctrl;
+	size = sizeof(struct amdvi_event);
+	for (i = 0; i < softc->event_max; i++) {
+		event = &softc->event[ctrl->evt_head / size];
+		if (!event->opcode)
+			break;
+		device_printf(softc->dev, "\t[Event%d: Head:0x%x Tail:0x%x]\n",
+		    i, ctrl->evt_head, ctrl->evt_tail);
+		amdvi_decode_evt(event);
+		ctrl->evt_head = MOD_INC(ctrl->evt_head, size,
+		    softc->event_max);
+	}
+}
+
+static int
+amdvi_init_dte(struct amdvi_softc *softc)
+{
+	struct amdvi_ctrl *ctrl;
+
+	ctrl = softc->ctrl;
+	ctrl->dte.base = vtophys(amdvi_dte) / PAGE_SIZE;
+	ctrl->dte.size = 0x1FF;		/* 2MB device table. */
+
+	return (0);
+}
+
+/*
+ * Not all capabilities of the IOMMU are available in the ACPI IVHD
+ * flag or EFR entry; read them directly from the device.
+ */
+static int
+amdvi_print_pci_cap(device_t dev)
+{
+	struct amdvi_softc *softc;
+	uint32_t off, cap;
+
+	softc = device_get_softc(dev);
+	off = softc->cap_off;
+
+	/*
+	 * Section 3.7.1 of IOMMU spec rev 2.0.
+	 * Read capability from device.
+	 */
+	cap = amdvi_pci_read(softc, off);
+
+	/* Make sure capability type[18:16] is 3. */
+	KASSERT((((cap >> 16) & 0x7) == 0x3),
+	    ("Not an IOMMU capability 0x%x@0x%x", cap, off));
+
+	softc->pci_cap = cap >> 24;
+	device_printf(softc->dev, "PCI cap 0x%x@0x%x feature:%b\n",
+	    cap, off, softc->pci_cap,
+	    "\20\1IOTLB\2HT\3NPCache\4EFR\5CapExt");
+
+	return (0);
+}
+
+static void
+amdvi_event_intr(void *arg)
+{
+	struct amdvi_softc *softc;
+	struct amdvi_ctrl *ctrl;
+
+	softc = (struct amdvi_softc *)arg;
+	ctrl = softc->ctrl;
+	device_printf(softc->dev, "EVT INTR %ld Status:0x%x"
+	    " EVT Head:0x%x Tail:0x%x\n", softc->event_intr_cnt++,
+	    ctrl->status, ctrl->evt_head, ctrl->evt_tail);
+	printf("  [CMD Total 0x%lx] Tail:0x%x, Head:0x%x.\n",
+	    softc->total_cmd, ctrl->cmd_tail, ctrl->cmd_head);
+
+	amdvi_print_events(softc);
+	ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR;
+}
+
+static void
+amdvi_free_evt_intr_res(device_t dev)
+{
+	struct amdvi_softc *softc;
+
+	softc = device_get_softc(dev);
+	if (softc->event_tag != NULL) {
+		bus_teardown_intr(dev, softc->event_res, softc->event_tag);
+	}
+	if (softc->event_res != NULL) {
+		bus_release_resource(dev, SYS_RES_IRQ, softc->event_rid,
+		    softc->event_res);
+	}
+	bus_delete_resource(dev, SYS_RES_IRQ, softc->event_rid);
+	PCIB_RELEASE_MSI(device_get_parent(device_get_parent(dev)),
+	    dev, 1, &softc->event_irq);
+}
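+/*
+ * Editor's note -- illustration only.  The status update in
+ * amdvi_event_intr() appears to rely on the event bits being
+ * write-one-to-clear (the "Clear interrupt status bits" comment below
+ * suggests the same): masking the register down to the two event bits
+ * and writing it back acknowledges exactly the event-log conditions.
+ */
+#ifdef AMDVI_EDITOR_EXAMPLE
+static void
+example_ack_event_intr(struct amdvi_ctrl *ctrl)
+{
+	/* Write back only the event bits; all other bits stay zero. */
+	ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR;
+}
+#endif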
+static int
+amdvi_alloc_intr_resources(struct amdvi_softc *softc)
+{
+	struct amdvi_ctrl *ctrl;
+	device_t dev, pcib;
+	device_t mmio_dev;
+	uint64_t msi_addr;
+	uint32_t msi_data;
+	int err;
+
+	dev = softc->dev;
+	pcib = device_get_parent(device_get_parent(dev));
+	mmio_dev = pci_find_bsf(PCI_RID2BUS(softc->pci_rid),
+	    PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid));
+	if (device_is_attached(mmio_dev)) {
+		device_printf(dev,
+		    "warning: IOMMU device is claimed by another driver %s\n",
+		    device_get_driver(mmio_dev)->name);
+	}
+
+	softc->event_irq = -1;
+	softc->event_rid = 0;
+
+	/*
+	 * Section 3.7.1 of IOMMU rev 2.0. With MSI, there is only one
+	 * interrupt. XXX: Enable MSI/X support.
+	 */
+	err = PCIB_ALLOC_MSI(pcib, dev, 1, 1, &softc->event_irq);
+	if (err) {
+		device_printf(dev,
+		    "Couldn't find event MSI IRQ resource.\n");
+		return (ENOENT);
+	}
+
+	err = bus_set_resource(dev, SYS_RES_IRQ, softc->event_rid,
+	    softc->event_irq, 1);
+	if (err) {
+		device_printf(dev, "Couldn't set event MSI resource.\n");
+		return (ENXIO);
+	}
+
+	softc->event_res = bus_alloc_resource_any(dev, SYS_RES_IRQ,
+	    &softc->event_rid, RF_ACTIVE);
+	if (!softc->event_res) {
+		device_printf(dev,
+		    "Unable to allocate event INTR resource.\n");
+		return (ENOMEM);
+	}
+
+	if (bus_setup_intr(dev, softc->event_res,
+	    INTR_TYPE_MISC | INTR_MPSAFE, NULL, amdvi_event_intr,
+	    softc, &softc->event_tag)) {
+		device_printf(dev, "Failed to set up event intr\n");
+		bus_release_resource(softc->dev, SYS_RES_IRQ,
+		    softc->event_rid, softc->event_res);
+		softc->event_res = NULL;
+		return (ENXIO);
+	}
+
+	bus_describe_intr(dev, softc->event_res, softc->event_tag,
+	    "fault");
+
+	err = PCIB_MAP_MSI(pcib, dev, softc->event_irq, &msi_addr,
+	    &msi_data);
+	if (err) {
+		device_printf(dev,
+		    "Event interrupt config failed, err=%d.\n",
+		    err);
+		amdvi_free_evt_intr_res(softc->dev);
+		return (err);
+	}
+
+	/* Clear interrupt status bits. */
+	ctrl = softc->ctrl;
+	ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR;
+
+	/* Now enable MSI interrupt. */
+	pci_enable_msi(mmio_dev, msi_addr, msi_data);
+	return (0);
+}
+
+static void
+amdvi_print_dev_cap(struct amdvi_softc *softc)
+{
+	struct ivhd_dev_cfg *cfg;
+	int i;
+
+	cfg = softc->dev_cfg;
+	for (i = 0; i < softc->dev_cfg_cnt; i++) {
+		device_printf(softc->dev, "device [0x%x - 0x%x] "
+		    "config:%b %s\n", cfg->start_id, cfg->end_id,
+		    cfg->data,
+		    "\020\001INIT\002ExtInt\003NMI"
+		    "\007LINT0\010LINT1",
+		    cfg->enable_ats ? "ATS enabled" : "");
+		cfg++;
+	}
+}
+
+static int
+amdvi_handle_sysctl(SYSCTL_HANDLER_ARGS)
+{
+	struct amdvi_softc *softc;
+	int result, type, error = 0;
+
+	softc = (struct amdvi_softc *)arg1;
+	type = arg2;
+
+	switch (type) {
+	case 0:
+		result = softc->ctrl->cmd_head;
+		error = sysctl_handle_int(oidp, &result, 0,
+		    req);
+		break;
+	case 1:
+		result = softc->ctrl->cmd_tail;
+		error = sysctl_handle_int(oidp, &result, 0,
+		    req);
+		break;
+	case 2:
+		result = softc->ctrl->evt_head;
+		error = sysctl_handle_int(oidp, &result, 0,
+		    req);
+		break;
+	case 3:
+		result = softc->ctrl->evt_tail;
+		error = sysctl_handle_int(oidp, &result, 0,
+		    req);
+		break;
+
+	default:
+		device_printf(softc->dev, "Unknown sysctl:%d\n", type);
+	}
+
+	return (error);
+}
+
+static void
+amdvi_add_sysctl(struct amdvi_softc *softc)
+{
+	struct sysctl_oid_list *child;
+	struct sysctl_ctx_list *ctx;
+	device_t dev;
+
+	dev = softc->dev;
+	ctx = device_get_sysctl_ctx(dev);
+	child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev));
+
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "event_intr_count", CTLFLAG_RD,
+	    &softc->event_intr_cnt, "Event interrupt count");
+	SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "command_count", CTLFLAG_RD,
+	    &softc->total_cmd, "Command submitted count");
+	SYSCTL_ADD_U16(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD,
+	    &softc->pci_rid, 0, "IOMMU RID");
+	SYSCTL_ADD_U16(ctx, child, OID_AUTO, "start_dev_rid", CTLFLAG_RD,
+	    &softc->start_dev_rid, 0, "Start of device under this IOMMU");
+	SYSCTL_ADD_U16(ctx, child, OID_AUTO, "end_dev_rid", CTLFLAG_RD,
+	    &softc->end_dev_rid, 0, "End of device under this IOMMU");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_head",
+	    CTLTYPE_UINT | CTLFLAG_RD, softc, 0,
+	    amdvi_handle_sysctl, "IU", "Command head");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_tail",
+	    CTLTYPE_UINT | CTLFLAG_RD, softc, 1,
+	    amdvi_handle_sysctl, "IU", "Command tail");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_head",
+	    CTLTYPE_UINT | CTLFLAG_RD, softc, 2,
+	    amdvi_handle_sysctl, "IU", "Event head");
+	SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_tail",
+	    CTLTYPE_UINT | CTLFLAG_RD, softc, 3,
+	    amdvi_handle_sysctl, "IU", "Event tail");
+}
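+/*
+ * Editor's note -- illustration only.  Once attached, the counters
+ * registered above surface under the device's sysctl tree, so a
+ * hypothetical inspection from a FreeBSD shell would look like:
+ *
+ *	# sysctl dev.ivhd.0.command_head dev.ivhd.0.command_tail
+ *	# sysctl dev.ivhd.0.event_intr_count
+ */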
+int
+amdvi_setup_hw(struct amdvi_softc *softc)
+{
+	device_t dev;
+	int status;
+
+	dev = softc->dev;
+
+	amdvi_hw_enable_iotlb(softc);
+
+	amdvi_print_dev_cap(softc);
+
+	if ((status = amdvi_print_pci_cap(dev)) != 0) {
+		device_printf(dev, "Couldn't read PCI capability.\n");
+		return (status);
+	}
+	if ((status = amdvi_init_cmd(softc)) != 0) {
+		device_printf(dev, "Couldn't configure command buffer.\n");
+		return (status);
+	}
+	if ((status = amdvi_init_event(softc)) != 0) {
+		device_printf(dev, "Couldn't configure event buffer.\n");
+		return (status);
+	}
+	if ((status = amdvi_init_dte(softc)) != 0) {
+		device_printf(dev, "Couldn't configure device table.\n");
+		return (status);
+	}
+	if ((status = amdvi_alloc_intr_resources(softc)) != 0) {
+		return (status);
+	}
+	amdvi_add_sysctl(softc);
+	return (0);
+}
+
+int
+amdvi_teardown_hw(struct amdvi_softc *softc)
+{
+	device_t dev;
+
+	dev = softc->dev;
+
+	/*
+	 * Called after disable; h/w is stopped by now, free all the
+	 * resources.
+	 */
+	amdvi_free_evt_intr_res(dev);
+
+	if (softc->cmd)
+		free(softc->cmd, M_AMDVI);
+
+	if (softc->event)
+		free(softc->event, M_AMDVI);
+
+	return (0);
+}
+
+/*********** bhyve interfaces *********************/
+static int
+amdvi_init(void)
+{
+	if (!ivhd_count) {
+		return (EIO);
+	}
+	if (!amdvi_enable_user && ivhd_count) {
+		printf("bhyve: Found %d AMD-Vi/IOMMU device(s), "
+		    "use hw.vmm.amdvi.enable=1 to enable pass-through.\n",
+		    ivhd_count);
+		return (EINVAL);
+	}
+	return (0);
+}
+
+static void
+amdvi_cleanup(void)
+{
+	/* Nothing. */
+}
+
+static uint16_t
+amdvi_domainId(void)
+{
+
+	/*
+	 * If we hit the maximum domain limit, roll over, leaving the
+	 * host domain (0) alone.
+	 * XXX: make sure that the reused domain is not still in use.
+	 */
+	if (amdvi_dom_id == AMDVI_MAX_DOMAIN)
+		amdvi_dom_id = 1;
+
+	return ((uint16_t)amdvi_dom_id++);
+}
+
+static void
+amdvi_do_inv_domain(uint16_t domain_id, bool create)
+{
+	struct amdvi_softc *softc;
+	int i;
+
+	for (i = 0; i < ivhd_count; i++) {
+		softc = device_get_softc(ivhd_devs[i]);
+		KASSERT(softc, ("softc is NULL"));
+		/*
+		 * If not-present entries are cached, invalidate the
+		 * pages after creating a domain as well.
+		 */
+#if 0
+		if (create && ((softc->pci_cap & AMDVI_PCI_CAP_NPCACHE) == 0))
+			continue;
+#endif
+		amdvi_inv_domain(softc, domain_id);
+		amdvi_wait(softc);
+	}
+}
+
+static void *
+amdvi_create_domain(vm_paddr_t maxaddr)
+{
+	struct amdvi_domain *dom;
+
+	dom = malloc(sizeof(struct amdvi_domain), M_AMDVI, M_ZERO | M_WAITOK);
+	dom->id = amdvi_domainId();
+	/* dom->maxaddr = maxaddr; */
+#ifdef AMDVI_DEBUG_CMD
+	printf("Created domain #%d\n", dom->id);
+#endif
+	/*
+	 * The host domain (#0) gets a translation table only when
+	 * amdvi_host_ptp is set.
+	 */
+	if (dom->id || amdvi_host_ptp)
+		dom->ptp = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO);
+
+	dom->ptp_level = amdvi_ptp_level;
+
+	amdvi_do_inv_domain(dom->id, true);
+	SLIST_INSERT_HEAD(&dom_head, dom, next);
+
+	return (dom);
+}
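+/*
+ * Editor's note -- illustration only.  Each page-table level resolves
+ * 9 address bits on top of the 12-bit page offset, so a table of
+ * depth ptp_level covers guest-physical addresses up to
+ * 1 << (12 + 9 * level); a sketch of that arithmetic:
+ */
+#ifdef AMDVI_EDITOR_EXAMPLE
+static inline uint64_t
+example_ptp_span(int level)
+{
+	/* e.g. level 4 covers 1 << 48 bytes of guest-physical space */
+	return (1ULL << (12 + 9 * level));
+}
+#endif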
+static void
+amdvi_free_ptp(uint64_t *ptp, int level)
+{
+	int i;
+
+	if (level < 1)
+		return;
+
+	for (i = 0; i < NPTEPG; i++) {
+		if ((ptp[i] & AMDVI_PT_PRESENT) == 0)
+			continue;
+		/* XXX: Add super-page or PTE mapping > 4KB. */
+#ifdef notyet
+		/* Super-page mapping. */
+		if (AMDVI_PD_SUPER(ptp[i]))
+			continue;
+#endif
+
+		amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i]
+		    & AMDVI_PT_MASK), level - 1);
+	}
+
+	free(ptp, M_AMDVI);
+}
+
+static void
+amdvi_destroy_domain(void *arg)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+	KASSERT(domain, ("domain is NULL"));
+#ifdef AMDVI_DEBUG_CMD
+	printf("Destroying domain %d\n", domain->id);
+#endif
+	if (domain->ptp)
+		amdvi_free_ptp(domain->ptp, domain->ptp_level);
+
+	amdvi_do_inv_domain(domain->id, false);
+	SLIST_REMOVE(&dom_head, domain, amdvi_domain, next);
+	free(domain, M_AMDVI);
+}
+
+static uint64_t
+amdvi_set_pt(uint64_t *pt, int level, vm_paddr_t gpa,
+    vm_paddr_t hpa, uint64_t pg_size, bool create)
+{
+	uint64_t *page, pa;
+	int shift, index;
+	const int PT_SHIFT = 9;
+	const int PT_INDEX_MASK = (1 << PT_SHIFT) - 1;	/* Based on PT_SHIFT */
+
+	if (!pg_size)
+		return (0);
+
+	if (hpa & (pg_size - 1)) {
+		printf("HPA is not size aligned.\n");
+		return (0);
+	}
+	if (gpa & (pg_size - 1)) {
+		printf("GPA is not size aligned.\n");
+		return (0);
+	}
+	shift = PML4SHIFT;
+	while ((shift > PAGE_SHIFT) && (pg_size < (1UL << shift))) {
+		index = (gpa >> shift) & PT_INDEX_MASK;
+
+		if ((pt[index] == 0) && create) {
+			page = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO);
+			pa = vtophys(page);
+			pt[index] = pa | AMDVI_PT_PRESENT | AMDVI_PT_RW |
+			    ((level - 1) << AMDVI_PD_LEVEL_SHIFT);
+		}
+#ifdef AMDVI_DEBUG_PTE
+		if ((gpa % 0x1000000) == 0)
+			printf("[level%d, shift = %d]PTE:0x%lx\n",
+			    level, shift, pt[index]);
+#endif
+#define	PTE2PA(x)	((uint64_t)(x) & AMDVI_PT_MASK)
+		pa = PTE2PA(pt[index]);
+		pt = (uint64_t *)PHYS_TO_DMAP(pa);
+		shift -= PT_SHIFT;
+		level--;
+	}
+
+	/* Leaf entry. */
+	index = (gpa >> shift) & PT_INDEX_MASK;
+
+	if (create) {
+		pt[index] = hpa | AMDVI_PT_RW | AMDVI_PT_PRESENT;
+	} else
+		pt[index] = 0;
+
+#ifdef AMDVI_DEBUG_PTE
+	if ((gpa % 0x1000000) == 0)
+		printf("[Last level%d, shift = %d]PTE:0x%lx\n",
+		    level, shift, pt[index]);
+#endif
+	return (1ULL << shift);
+}
+
+static uint64_t
+amdvi_update_mapping(struct amdvi_domain *domain, vm_paddr_t gpa,
+    vm_paddr_t hpa, uint64_t size, bool create)
+{
+	uint64_t mapped, *ptp, len;
+	int level;
+
+	KASSERT(domain, ("domain is NULL"));
+	level = domain->ptp_level;
+	KASSERT(level, ("Page table level is 0"));
+
+	ptp = domain->ptp;
+	KASSERT(ptp, ("PTP is NULL"));
+	mapped = 0;
+	while (mapped < size) {
+		len = amdvi_set_pt(ptp, level, gpa + mapped, hpa + mapped,
+		    PAGE_SIZE, create);
+		if (!len) {
+			printf("Error: Couldn't map HPA:0x%lx GPA:0x%lx\n",
+			    hpa, gpa);
+			return (0);
+		}
+		mapped += len;
+	}
+
+	return (mapped);
+}
+
+static uint64_t
+amdvi_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa,
+    uint64_t len)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+
+	if (domain->id && !domain->ptp) {
+		printf("ptp is NULL\n");
+		return (-1);
+	}
+
+	/*
+	 * If the host domain was created w/o a page table, skip the
+	 * IOMMU page table set-up.
+	 */
+	if (domain->ptp)
+		return (amdvi_update_mapping(domain, gpa, hpa, len, true));
+	else
+		return (len);
+}
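+/*
+ * Editor's note -- illustration only.  The walk in amdvi_set_pt()
+ * peels one 9-bit slice of the GPA per level, starting at PML4SHIFT
+ * (39) and stopping at the 4KB leaf; the per-level index is simply:
+ */
+#ifdef AMDVI_EDITOR_EXAMPLE
+static inline int
+example_pt_index(uint64_t gpa, int shift)
+{
+	/* shift is 39, 30, 21 or 12 for levels 4..1 of a 4-level table */
+	return ((gpa >> shift) & 0x1FF);
+}
+#endif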
+static uint64_t
+amdvi_destroy_mapping(void *arg, vm_paddr_t gpa, uint64_t len)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+	/*
+	 * If the host domain was created w/o a page table, skip the
+	 * IOMMU page table set-up.
+	 */
+	if (domain->ptp)
+		return (amdvi_update_mapping(domain, gpa, 0, len, false));
+	return (len);
+}
+
+static struct amdvi_softc *
+amdvi_find_iommu(uint16_t devid)
+{
+	struct amdvi_softc *softc;
+	int i;
+
+	for (i = 0; i < ivhd_count; i++) {
+		softc = device_get_softc(ivhd_devs[i]);
+		if ((devid >= softc->start_dev_rid) &&
+		    (devid <= softc->end_dev_rid))
+			return (softc);
+	}
+
+	/*
+	 * XXX: BIOS bug: the device is not in the IVRS table, assume
+	 * it's handled by the first IOMMU.
+	 */
+	printf("BIOS bug: device(%d.%d.%d) doesn't have an IVHD entry.\n",
+	    RID2PCI_STR(devid));
+
+	return (device_get_softc(ivhd_devs[0]));
+}
+
+/*
+ * Set up a device table entry.
+ * Per IOMMU spec rev 2.0, section 3.2.2.2, some of the fields must
+ * be set concurrently, e.g. the read and write bits.
+ */
+static void
+amdvi_set_dte(struct amdvi_domain *domain, uint16_t devid, bool enable)
+{
+	struct amdvi_softc *softc;
+	struct amdvi_dte *temp;
+
+	KASSERT(domain, ("domain is NULL for pci_rid:0x%x\n", devid));
+
+	softc = amdvi_find_iommu(devid);
+	KASSERT(softc, ("softc is NULL for pci_rid:0x%x\n", devid));
+
+	temp = &amdvi_dte[devid];
+
+#ifdef AMDVI_ATS_ENABLE
+	/* If IOMMU and device support IOTLB, enable it. */
+	if (amdvi_dev_support_iotlb(softc, devid) && softc->iotlb)
+		temp->iotlb_enable = 1;
+#endif
+
+	/* Avoid duplicate I/O faults. */
+	temp->sup_second_io_fault = 1;
+	temp->sup_all_io_fault = amdvi_disable_io_fault;
+
+	temp->dt_valid = 1;
+	temp->domain_id = domain->id;
+
+	if (enable) {
+		if (domain->ptp) {
+			temp->pt_base = vtophys(domain->ptp) >> 12;
+			temp->pt_level = amdvi_ptp_level;
+		}
+		/*
+		 * XXX: The page table valid [TV] bit must be set even
+		 * if host domain page tables are not enabled.
+		 */
+		temp->pt_valid = 1;
+		temp->read_allow = 1;
+		temp->write_allow = 1;
+	}
+}
+
+static void
+amdvi_inv_device(uint16_t devid)
+{
+	struct amdvi_softc *softc;
+
+	softc = amdvi_find_iommu(devid);
+	KASSERT(softc, ("softc is NULL"));
+
+	amdvi_cmd_inv_dte(softc, devid);
+#ifdef AMDVI_ATS_ENABLE
+	if (amdvi_dev_support_iotlb(softc, devid))
+		amdvi_cmd_inv_iotlb(softc, devid);
+#endif
+	amdvi_wait(softc);
+}
+
+static void
+amdvi_add_device(void *arg, uint16_t devid)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+	KASSERT(domain != NULL, ("domain is NULL"));
+#ifdef AMDVI_DEBUG_CMD
+	printf("Assigning device(%d.%d.%d) to domain:%d\n",
+	    RID2PCI_STR(devid), domain->id);
+#endif
+	amdvi_set_dte(domain, devid, true);
+	amdvi_inv_device(devid);
+}
+
+static void
+amdvi_remove_device(void *arg, uint16_t devid)
+{
+	struct amdvi_domain *domain;
+
+	domain = (struct amdvi_domain *)arg;
+#ifdef AMDVI_DEBUG_CMD
+	printf("Remove device(0x%x) from domain:%d\n",
+	    devid, domain->id);
+#endif
+	amdvi_set_dte(domain, devid, false);
+	amdvi_inv_device(devid);
+}
+
+static void
+amdvi_enable(void)
+{
+	struct amdvi_ctrl *ctrl;
+	struct amdvi_softc *softc;
+	uint64_t val;
+	int i;
+
+	for (i = 0; i < ivhd_count; i++) {
+		softc = device_get_softc(ivhd_devs[i]);
+		KASSERT(softc, ("softc is NULL\n"));
+		ctrl = softc->ctrl;
+		KASSERT(ctrl, ("ctrl is NULL\n"));
+
+		val = (AMDVI_CTRL_EN |
+		    AMDVI_CTRL_CMD |
+		    AMDVI_CTRL_ELOG |
+		    AMDVI_CTRL_ELOGINT |
+		    AMDVI_CTRL_INV_TO_1S);
+
+		if (softc->ivhd_flag & IVHD_FLAG_COH)
+			val |= AMDVI_CTRL_COH;
+		if (softc->ivhd_flag & IVHD_FLAG_HTT)
+			val |= AMDVI_CTRL_HTT;
+		if (softc->ivhd_flag & IVHD_FLAG_RPPW)
+			val |= AMDVI_CTRL_RPPW;
+		if (softc->ivhd_flag & IVHD_FLAG_PPW)
+			val |= AMDVI_CTRL_PPW;
+		if (softc->ivhd_flag & IVHD_FLAG_ISOC)
+			val |= AMDVI_CTRL_ISOC;
+
+		ctrl->control =
val; + } +} + +static void +amdvi_disable(void) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL\n")); + ctrl = softc->ctrl; + KASSERT(ctrl, ("ctrl is NULL\n")); + + ctrl->control = 0; + } +} + +static void +amdvi_inv_tlb(void *arg) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain, ("domain is NULL")); + amdvi_do_inv_domain(domain->id, false); +} + +struct iommu_ops iommu_ops_amd = { + amdvi_init, + amdvi_cleanup, + amdvi_enable, + amdvi_disable, + amdvi_create_domain, + amdvi_destroy_domain, + amdvi_create_mapping, + amdvi_destroy_mapping, + amdvi_add_device, + amdvi_remove_device, + amdvi_inv_tlb +}; diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h b/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h new file mode 100644 index 0000000000..6ee6c36632 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h @@ -0,0 +1,431 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _AMDVI_PRIV_H_ +#define _AMDVI_PRIV_H_ + +#include + +#define BIT(n) (1ULL << (n)) +/* Return value of bits[n:m] where n and (n >= ) m are bit positions. */ +#define REG_BITS(x, n, m) (((x) >> (m)) & \ + ((1 << (((n) - (m)) + 1)) - 1)) + +/* + * IOMMU PCI capability. + */ +#define AMDVI_PCI_CAP_IOTLB BIT(0) /* IOTLB is supported. */ +#define AMDVI_PCI_CAP_HT BIT(1) /* HyperTransport tunnel support. */ +#define AMDVI_PCI_CAP_NPCACHE BIT(2) /* Not present page cached. */ +#define AMDVI_PCI_CAP_EFR BIT(3) /* Extended features. */ +#define AMDVI_PCI_CAP_EXT BIT(4) /* Miscellaneous information reg. */ + +/* + * IOMMU extended features. + */ +#define AMDVI_EX_FEA_PREFSUP BIT(0) /* Prefetch command support. */ +#define AMDVI_EX_FEA_PPRSUP BIT(1) /* PPR support */ +#define AMDVI_EX_FEA_XTSUP BIT(2) /* Reserved */ +#define AMDVI_EX_FEA_NXSUP BIT(3) /* No-execute. */ +#define AMDVI_EX_FEA_GTSUP BIT(4) /* Guest translation support. */ +#define AMDVI_EX_FEA_EFRW BIT(5) /* Reserved */ +#define AMDVI_EX_FEA_IASUP BIT(6) /* Invalidate all command supp. 
*/ +#define AMDVI_EX_FEA_GASUP BIT(7) /* Guest APIC or AVIC support. */ +#define AMDVI_EX_FEA_HESUP BIT(8) /* Hardware Error. */ +#define AMDVI_EX_FEA_PCSUP BIT(9) /* Performance counters support. */ +/* XXX: add more EFER bits. */ + +/* + * Device table entry or DTE + * NOTE: Must be 256-bits/32 bytes aligned. + */ +struct amdvi_dte { + uint32_t dt_valid:1; /* Device Table valid. */ + uint32_t pt_valid:1; /* Page translation valid. */ + uint16_t :7; /* Reserved[8:2] */ + uint8_t pt_level:3; /* Paging level, 0 to disable. */ + uint64_t pt_base:40; /* Page table root pointer. */ + uint8_t :3; /* Reserved[54:52] */ + uint8_t gv_valid:1; /* Revision 2, GVA to SPA. */ + uint8_t gv_level:2; /* Revision 2, GLX level. */ + uint8_t gv_cr3_lsb:3; /* Revision 2, GCR3[14:12] */ + uint8_t read_allow:1; /* I/O read enabled. */ + uint8_t write_allow:1; /* I/O write enabled. */ + uint8_t :1; /* Reserved[63] */ + uint16_t domain_id:16; /* Domain ID */ + uint16_t gv_cr3_lsb2:16; /* Revision 2, GCR3[30:15] */ + uint8_t iotlb_enable:1; /* Device support IOTLB */ + uint8_t sup_second_io_fault:1; /* Suppress subsequent I/O faults. */ + uint8_t sup_all_io_fault:1; /* Suppress all I/O page faults. */ + uint8_t IOctl:2; /* Port I/O control. */ + uint8_t iotlb_cache_disable:1; /* IOTLB cache hints. */ + uint8_t snoop_disable:1; /* Snoop disable. */ + uint8_t allow_ex:1; /* Allow exclusion. */ + uint8_t sysmgmt:2; /* System management message.*/ + uint8_t :1; /* Reserved[106] */ + uint32_t gv_cr3_msb:21; /* Revision 2, GCR3[51:31] */ + uint8_t intmap_valid:1; /* Interrupt map valid. */ + uint8_t intmap_len:4; /* Interrupt map table length. */ + uint8_t intmap_ign:1; /* Ignore unmapped interrupts. */ + uint64_t intmap_base:46; /* IntMap base. */ + uint8_t :4; /* Reserved[183:180] */ + uint8_t init_pass:1; /* INIT pass through or PT */ + uint8_t extintr_pass:1; /* External Interrupt PT */ + uint8_t nmi_pass:1; /* NMI PT */ + uint8_t :1; /* Reserved[187] */ + uint8_t intr_ctrl:2; /* Interrupt control */ + uint8_t lint0_pass:1; /* LINT0 PT */ + uint8_t lint1_pass:1; /* LINT1 PT */ + uint64_t :64; /* Reserved[255:192] */ +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_dte) == 32); + +/* + * IOMMU command entry. + */ +struct amdvi_cmd { + uint32_t word0; + uint32_t word1:28; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); + +/* Command opcodes. */ +#define AMDVI_CMP_WAIT_OPCODE 0x1 /* Completion wait. */ +#define AMDVI_INVD_DTE_OPCODE 0x2 /* Invalidate device table entry. */ +#define AMDVI_INVD_PAGE_OPCODE 0x3 /* Invalidate pages. */ +#define AMDVI_INVD_IOTLB_OPCODE 0x4 /* Invalidate IOTLB pages. */ +#define AMDVI_INVD_INTR_OPCODE 0x5 /* Invalidate Interrupt table. */ +#define AMDVI_PREFETCH_PAGES_OPCODE 0x6 /* Prefetch IOMMU pages. */ +#define AMDVI_COMP_PPR_OPCODE 0x7 /* Complete PPR request. */ +#define AMDVI_INV_ALL_OPCODE 0x8 /* Invalidate all. */ + +/* Completion wait attributes. */ +#define AMDVI_CMP_WAIT_STORE BIT(0) /* Write back data. */ +#define AMDVI_CMP_WAIT_INTR BIT(1) /* Completion wait interrupt. */ +#define AMDVI_CMP_WAIT_FLUSH BIT(2) /* Flush queue. */ + +/* Invalidate page. */ +#define AMDVI_INVD_PAGE_S BIT(0) /* Invalidation size. */ +#define AMDVI_INVD_PAGE_PDE BIT(1) /* Invalidate PDE. */ +#define AMDVI_INVD_PAGE_GN_GVA BIT(2) /* GPA or GVA. */ + +#define AMDVI_INVD_PAGE_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) + +/* Invalidate IOTLB. */ +#define AMDVI_INVD_IOTLB_S BIT(0) /* Invalidation size 4k or addr */ +#define AMDVI_INVD_IOTLB_GN_GVA BIT(2) /* GPA or GVA. 
*/ + +#define AMDVI_INVD_IOTLB_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) +/* XXX: add more command entries. */ + +/* + * IOMMU event entry. + */ +struct amdvi_event { + uint16_t devid; + uint16_t pasid_hi; + uint16_t pasid_domid; /* PASID low or DomainID */ + uint16_t flag:12; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_event) == 16); + +/* Various event types. */ +#define AMDVI_EVENT_INVALID_DTE 0x1 +#define AMDVI_EVENT_PFAULT 0x2 +#define AMDVI_EVENT_DTE_HW_ERROR 0x3 +#define AMDVI_EVENT_PAGE_HW_ERROR 0x4 +#define AMDVI_EVENT_ILLEGAL_CMD 0x5 +#define AMDVI_EVENT_CMD_HW_ERROR 0x6 +#define AMDVI_EVENT_IOTLB_TIMEOUT 0x7 +#define AMDVI_EVENT_INVALID_DTE_REQ 0x8 +#define AMDVI_EVENT_INVALID_PPR_REQ 0x9 +#define AMDVI_EVENT_COUNTER_ZERO 0xA + +#define AMDVI_EVENT_FLAG_MASK 0x1FF /* Mask for event flags. */ +#define AMDVI_EVENT_FLAG_TYPE(x) (((x) >> 9) & 0x3) + +/* + * IOMMU control block. + */ +struct amdvi_ctrl { + struct { + uint16_t size:9; + uint16_t :3; + uint64_t base:40; /* Devtable register base. */ + uint16_t :12; + } dte; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } cmd; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } event; + uint16_t control :13; + uint64_t :51; + struct { + uint8_t enable:1; + uint8_t allow:1; + uint16_t :10; + uint64_t base:40; + uint16_t :12; + uint16_t :12; + uint64_t limit:40; + uint16_t :12; + } excl; + /* + * Revision 2 only. + */ + uint64_t ex_feature; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } ppr; + uint64_t first_event; + uint64_t second_event; + uint64_t event_status; + /* Revision 2 only, end. */ + uint8_t pad1[0x1FA8]; /* Padding. */ + uint32_t cmd_head:19; + uint64_t :45; + uint32_t cmd_tail:19; + uint64_t :45; + uint32_t evt_head:19; + uint64_t :45; + uint32_t evt_tail:19; + uint64_t :45; + uint32_t status:19; + uint64_t :45; + uint64_t pad2; + uint8_t :4; + uint16_t ppr_head:15; + uint64_t :45; + uint8_t :4; + uint16_t ppr_tail:15; + uint64_t :45; + uint8_t pad3[0x1FC0]; /* Padding. */ + + /* XXX: More for rev2. */ +} __attribute__((__packed__)); +CTASSERT(offsetof(struct amdvi_ctrl, pad1)== 0x58); +CTASSERT(offsetof(struct amdvi_ctrl, pad2)== 0x2028); +CTASSERT(offsetof(struct amdvi_ctrl, pad3)== 0x2040); + +#define AMDVI_MMIO_V1_SIZE (4 * PAGE_SIZE) /* v1 size */ +/* + * AMF IOMMU v2 size including event counters + */ +#define AMDVI_MMIO_V2_SIZE (8 * PAGE_SIZE) + +CTASSERT(sizeof(struct amdvi_ctrl) == 0x4000); +CTASSERT(sizeof(struct amdvi_ctrl) == AMDVI_MMIO_V1_SIZE); + +/* IVHD flag */ +#define IVHD_FLAG_HTT BIT(0) /* Hypertransport Tunnel. */ +#define IVHD_FLAG_PPW BIT(1) /* Pass posted write. */ +#define IVHD_FLAG_RPPW BIT(2) /* Response pass posted write. */ +#define IVHD_FLAG_ISOC BIT(3) /* Isoc support. */ +#define IVHD_FLAG_IOTLB BIT(4) /* IOTLB support. */ +#define IVHD_FLAG_COH BIT(5) /* Coherent control, default 1 */ +#define IVHD_FLAG_PFS BIT(6) /* Prefetch IOMMU pages. */ +#define IVHD_FLAG_PPRS BIT(7) /* Peripheral page support. */ + +/* IVHD device entry data setting. */ +#define IVHD_DEV_LINT0_PASS BIT(6) /* LINT0 interrupts. */ +#define IVHD_DEV_LINT1_PASS BIT(7) /* LINT1 interrupts. */ + +/* Bit[5:4] for System Mgmt. Bit3 is reserved. */ +#define IVHD_DEV_INIT_PASS BIT(0) /* INIT */ +#define IVHD_DEV_EXTINTR_PASS BIT(1) /* ExtInt */ +#define IVHD_DEV_NMI_PASS BIT(2) /* NMI */ + +/* IVHD 8-byte extended data settings. 
 */
+#define IVHD_DEV_EXT_ATS_DISABLE	BIT(31)	/* Disable ATS */
+
+/* IOMMU control register. */
+#define AMDVI_CTRL_EN		BIT(0)	/* IOMMU enable. */
+#define AMDVI_CTRL_HTT		BIT(1)	/* Hypertransport tunnel enable. */
+#define AMDVI_CTRL_ELOG		BIT(2)	/* Event log enable. */
+#define AMDVI_CTRL_ELOGINT	BIT(3)	/* Event log interrupt. */
+#define AMDVI_CTRL_COMINT	BIT(4)	/* Completion wait interrupt. */
+#define AMDVI_CTRL_PPW		BIT(8)
+#define AMDVI_CTRL_RPPW		BIT(9)
+#define AMDVI_CTRL_COH		BIT(10)
+#define AMDVI_CTRL_ISOC		BIT(11)
+#define AMDVI_CTRL_CMD		BIT(12)	/* Command buffer enable. */
+#define AMDVI_CTRL_PPRLOG	BIT(13)
+#define AMDVI_CTRL_PPRINT	BIT(14)
+#define AMDVI_CTRL_PPREN	BIT(15)
+#define AMDVI_CTRL_GTE		BIT(16)	/* Guest translation enable. */
+#define AMDVI_CTRL_GAE		BIT(17)	/* Guest APIC enable. */
+
+/* Invalidation timeout. */
+#define AMDVI_CTRL_INV_NO_TO	0	/* No timeout. */
+#define AMDVI_CTRL_INV_TO_1ms	1	/* 1 ms */
+#define AMDVI_CTRL_INV_TO_10ms	2	/* 10 ms */
+#define AMDVI_CTRL_INV_TO_100ms	3	/* 100 ms */
+#define AMDVI_CTRL_INV_TO_1S	4	/* 1 second */
+#define AMDVI_CTRL_INV_TO_10S	5	/* 10 seconds */
+#define AMDVI_CTRL_INV_TO_100S	6	/* 100 seconds */
+
+/*
+ * Max number of PCI devices:
+ * 256 buses x 32 slots/devices x 8 functions.
+ */
+#define PCI_NUM_DEV_MAX		0x10000
+
+/* Maximum number of domains supported by the IOMMU. */
+#define AMDVI_MAX_DOMAIN	(BIT(16) - 1)
+
+/*
+ * IOMMU Page Table attributes.
+ */
+#define AMDVI_PT_PRESENT	BIT(0)
+#define AMDVI_PT_COHERENT	BIT(60)
+#define AMDVI_PT_READ		BIT(61)
+#define AMDVI_PT_WRITE		BIT(62)
+
+#define AMDVI_PT_RW		(AMDVI_PT_READ | AMDVI_PT_WRITE)
+#define AMDVI_PT_MASK		0xFFFFFFFFFF000UL /* Only [51:12] for PA */
+
+#define AMDVI_PD_LEVEL_SHIFT	9
+#define AMDVI_PD_SUPER(x)	(((x) >> AMDVI_PD_LEVEL_SHIFT) == 7)
+
+/*
+ * IOMMU Status, offset 0x2020.
+ */
+#define AMDVI_STATUS_EV_OF	BIT(0)	/* Event overflow. */
+#define AMDVI_STATUS_EV_INTR	BIT(1)	/* Event interrupt. */
+/* Completion wait command completed. */
+#define AMDVI_STATUS_CMP	BIT(2)
+
+#define IVRS_CTRL_RID		1	/* MMIO RID */
+
+/* ACPI IVHD */
+struct ivhd_dev_cfg {
+	uint32_t start_id;
+	uint32_t end_id;
+	uint8_t	data;		/* Device configuration. */
+	bool	enable_ats;	/* ATS enabled for the device. */
+	int	ats_qlen;	/* ATS invalidation queue depth. */
+};
+
+struct amdvi_domain {
+	uint64_t *ptp;		/* Highest level page table */
+	int	ptp_level;	/* Level of page tables */
+	u_int	id;		/* Domain id */
+	SLIST_ENTRY(amdvi_domain) next;
+};
+
+/*
+ * I/O Virtualization Hardware Definition Block (IVHD) type 0x10
+ * (legacy) uses the ACPI_IVRS_HARDWARE define in
+ * contrib/dev/acpica/include/actbl2.h.  The new IVHD types 0x11 and
+ * 0x40, defined in the AMD IOMMU spec [48882], are missing from the
+ * ACPI code; these new types add an extra EFR (Extended Feature
+ * Register) field.
+ * XXX: Use the definition from ACPI when it is available.
+ */
+typedef struct acpi_ivrs_hardware_efr_sup
+{
+	ACPI_IVRS_HEADER Header;
+	UINT16 CapabilityOffset;	/* Offset for IOMMU control fields */
+	UINT64 BaseAddress;		/* IOMMU control registers */
+	UINT16 PciSegmentGroup;
+	UINT16 Info;			/* MSI number and unit ID */
+	UINT32 Attr;			/* IOMMU Feature */
+	UINT64 ExtFR;			/* IOMMU Extended Feature */
+	UINT64 Reserved;		/* v1 feature or v2 attribute */
+} __attribute__ ((__packed__)) ACPI_IVRS_HARDWARE_EFRSUP;
+CTASSERT(sizeof(ACPI_IVRS_HARDWARE_EFRSUP) == 40);
+
+/*
+ * Different types of IVHD.
+ * XXX: Use AcpiIvrsType once the new IVHD types are available.
+ */
+enum IvrsType
+{
+	IVRS_TYPE_HARDWARE_LEGACY = 0x10,	/* Legacy without EFR support.
*/ + IVRS_TYPE_HARDWARE_EFR = 0x11, /* With EFR support. */ + IVRS_TYPE_HARDWARE_MIXED = 0x40, /* Mixed with EFR support. */ +}; + +/* + * AMD IOMMU softc. + */ +struct amdvi_softc { + struct amdvi_ctrl *ctrl; /* Control area. */ + device_t dev; /* IOMMU device. */ + enum IvrsType ivhd_type; /* IOMMU IVHD type. */ + bool iotlb; /* IOTLB supported by IOMMU */ + struct amdvi_cmd *cmd; /* Command descriptor area. */ + int cmd_max; /* Max number of commands. */ + uint64_t cmp_data; /* Command completion write back. */ + struct amdvi_event *event; /* Event descriptor area. */ + struct resource *event_res; /* Event interrupt resource. */ + void *event_tag; /* Event interrupt tag. */ + int event_max; /* Max number of events. */ + int event_irq; + int event_rid; + /* ACPI various flags. */ + uint32_t ivhd_flag; /* ACPI IVHD flag. */ + uint32_t ivhd_feature; /* ACPI v1 Reserved or v2 attribute. */ + uint64_t ext_feature; /* IVHD EFR */ + /* PCI related. */ + uint16_t cap_off; /* PCI Capability offset. */ + uint8_t pci_cap; /* PCI capability. */ + uint16_t pci_seg; /* IOMMU PCI domain/segment. */ + uint16_t pci_rid; /* PCI BDF of IOMMU */ + /* Device range under this IOMMU. */ + uint16_t start_dev_rid; /* First device under this IOMMU. */ + uint16_t end_dev_rid; /* Last device under this IOMMU. */ + + /* BIOS provided device configuration for end points. */ + struct ivhd_dev_cfg dev_cfg[10]; + int dev_cfg_cnt; + + /* Software statistics. */ + uint64_t event_intr_cnt; /* Total event INTR count. */ + uint64_t total_cmd; /* Total number of commands. */ +}; + +int amdvi_setup_hw(struct amdvi_softc *softc); +int amdvi_teardown_hw(struct amdvi_softc *softc); +#endif /* _AMDVI_PRIV_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c new file mode 100644 index 0000000000..370c20fb01 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c @@ -0,0 +1,735 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_acpi.h" +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include +#include +#include + +#include "io/iommu.h" +#include "amdvi_priv.h" + +device_t *ivhd_devs; /* IVHD or AMD-Vi device list. */ +int ivhd_count; /* Number of IVHD header. */ +/* + * Cached IVHD header list. + * Single entry for each IVHD, filtered the legacy one. + */ +ACPI_IVRS_HARDWARE *ivhd_hdrs[10]; + +extern int amdvi_ptp_level; /* Page table levels. */ + +typedef int (*ivhd_iter_t)(ACPI_IVRS_HEADER *ptr, void *arg); +/* + * Iterate IVRS table for IVHD and IVMD device type. + */ +static void +ivrs_hdr_iterate_tbl(ivhd_iter_t iter, void *arg) +{ + ACPI_TABLE_IVRS *ivrs; + ACPI_IVRS_HEADER *ivrs_hdr, *end; + ACPI_STATUS status; + + status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); + if (ACPI_FAILURE(status)) + return; + + if (ivrs->Header.Length == 0) { + return; + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)(ivrs + 1); + end = (ACPI_IVRS_HEADER *)((char *)ivrs + ivrs->Header.Length); + + while (ivrs_hdr < end) { + if ((uint8_t *)ivrs_hdr + ivrs_hdr->Length > (uint8_t *)end) { + printf("AMD-Vi:IVHD/IVMD is corrupted, length : %d\n", + ivrs_hdr->Length); + break; + } + + switch (ivrs_hdr->Type) { + case IVRS_TYPE_HARDWARE_LEGACY: /* Legacy */ + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + if (!iter(ivrs_hdr, arg)) + return; + break; + + case ACPI_IVRS_TYPE_MEMORY1: + case ACPI_IVRS_TYPE_MEMORY2: + case ACPI_IVRS_TYPE_MEMORY3: + if (!iter(ivrs_hdr, arg)) + return; + + break; + + default: + printf("AMD-Vi:Not IVHD/IVMD type(%d)", ivrs_hdr->Type); + + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)((uint8_t *)ivrs_hdr + + ivrs_hdr->Length); + } +} + +static bool +ivrs_is_ivhd(UINT8 type) +{ + + switch(type) { + case IVRS_TYPE_HARDWARE_LEGACY: + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + return (true); + + default: + return (false); + } +} + +/* Count the number of AMD-Vi devices in the system. */ +static int +ivhd_count_iter(ACPI_IVRS_HEADER * ivrs_he, void *arg) +{ + + if (ivrs_is_ivhd(ivrs_he->Type)) + ivhd_count++; + + return (1); +} + +struct find_ivrs_hdr_args { + int i; + ACPI_IVRS_HEADER *ptr; +}; + +static int +ivrs_hdr_find_iter(ACPI_IVRS_HEADER * ivrs_hdr, void *args) +{ + struct find_ivrs_hdr_args *fi; + + fi = (struct find_ivrs_hdr_args *)args; + if (ivrs_is_ivhd(ivrs_hdr->Type)) { + if (fi->i == 0) { + fi->ptr = ivrs_hdr; + return (0); + } + fi->i--; + } + + return (1); +} + +static ACPI_IVRS_HARDWARE * +ivhd_find_by_index(int idx) +{ + struct find_ivrs_hdr_args fi; + + fi.i = idx; + fi.ptr = NULL; + + ivrs_hdr_iterate_tbl(ivrs_hdr_find_iter, &fi); + + return ((ACPI_IVRS_HARDWARE *)fi.ptr); +} + +static void +ivhd_dev_add_entry(struct amdvi_softc *softc, uint32_t start_id, + uint32_t end_id, uint8_t cfg, bool ats) +{ + struct ivhd_dev_cfg *dev_cfg; + + /* If device doesn't have special data, don't add it. */ + if (!cfg) + return; + + dev_cfg = &softc->dev_cfg[softc->dev_cfg_cnt++]; + dev_cfg->start_id = start_id; + dev_cfg->end_id = end_id; + dev_cfg->data = cfg; + dev_cfg->enable_ats = ats; +} + +/* + * Record device attributes as suggested by BIOS. 
+ */
+static int
+ivhd_dev_parse(ACPI_IVRS_HARDWARE *ivhd, struct amdvi_softc *softc)
+{
+	ACPI_IVRS_DE_HEADER *de;
+	uint8_t *p, *end;
+	int range_start_id = 0, range_end_id = 0;
+	uint32_t *extended;
+	uint8_t all_data = 0, range_data = 0;
+	bool range_enable_ats = false, enable_ats;
+
+	softc->start_dev_rid = ~0;
+	softc->end_dev_rid = 0;
+
+	switch (ivhd->Header.Type) {
+	case IVRS_TYPE_HARDWARE_LEGACY:
+		p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE);
+		break;
+
+	case IVRS_TYPE_HARDWARE_EFR:
+	case IVRS_TYPE_HARDWARE_MIXED:
+		p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE_EFRSUP);
+		break;
+
+	default:
+		device_printf(softc->dev,
+		    "unknown type: 0x%x\n", ivhd->Header.Type);
+		return (-1);
+	}
+
+	end = (uint8_t *)ivhd + ivhd->Header.Length;
+
+	while (p < end) {
+		de = (ACPI_IVRS_DE_HEADER *)p;
+		softc->start_dev_rid = MIN(softc->start_dev_rid, de->Id);
+		softc->end_dev_rid = MAX(softc->end_dev_rid, de->Id);
+		switch (de->Type) {
+		case ACPI_IVRS_TYPE_ALL:
+			all_data = de->DataSetting;
+			break;
+
+		case ACPI_IVRS_TYPE_SELECT:
+		case ACPI_IVRS_TYPE_ALIAS_SELECT:
+		case ACPI_IVRS_TYPE_EXT_SELECT:
+			enable_ats = false;
+			if (de->Type == ACPI_IVRS_TYPE_EXT_SELECT) {
+				extended = (uint32_t *)(de + 1);
+				enable_ats =
+				    (*extended & IVHD_DEV_EXT_ATS_DISABLE) ?
+				    false : true;
+			}
+			ivhd_dev_add_entry(softc, de->Id, de->Id,
+			    de->DataSetting | all_data, enable_ats);
+			break;
+
+		case ACPI_IVRS_TYPE_START:
+		case ACPI_IVRS_TYPE_ALIAS_START:
+		case ACPI_IVRS_TYPE_EXT_START:
+			range_start_id = de->Id;
+			range_data = de->DataSetting;
+			if (de->Type == ACPI_IVRS_TYPE_EXT_START) {
+				extended = (uint32_t *)(de + 1);
+				range_enable_ats =
+				    (*extended & IVHD_DEV_EXT_ATS_DISABLE) ?
+				    false : true;
+			}
+			break;
+
+		case ACPI_IVRS_TYPE_END:
+			range_end_id = de->Id;
+			ivhd_dev_add_entry(softc, range_start_id, range_end_id,
+			    range_data | all_data, range_enable_ats);
+			range_start_id = range_end_id = 0;
+			range_data = 0;
+			all_data = 0;
+			break;
+
+		case ACPI_IVRS_TYPE_PAD4:
+			break;
+
+		case ACPI_IVRS_TYPE_SPECIAL:
+			/* HPET or IOAPIC */
+			break;
+		default:
+			if ((de->Type < 5) ||
+			    (de->Type >= ACPI_IVRS_TYPE_PAD8))
+				device_printf(softc->dev,
+				    "Unknown dev entry:0x%x\n", de->Type);
+		}
+
+		if (softc->dev_cfg_cnt >=
+		    (sizeof(softc->dev_cfg) / sizeof(softc->dev_cfg[0]))) {
+			device_printf(softc->dev,
+			    "WARN Too many device entries.\n");
+			return (EINVAL);
+		}
+		if (de->Type < 0x40)
+			p += sizeof(ACPI_IVRS_DEVICE4);
+		else if (de->Type < 0x80)
+			p += sizeof(ACPI_IVRS_DEVICE8A);
+		else {
+			printf("Variable size IVHD type 0x%x not supported\n",
+			    de->Type);
+			break;
+		}
+	}
+
+	KASSERT((softc->end_dev_rid >= softc->start_dev_rid),
+	    ("Device end[0x%x] < start[0x%x].\n",
+	    softc->end_dev_rid, softc->start_dev_rid));
+
+	return (0);
+}
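+/*
+ * Editor's note -- illustration only.  IVHD device entries are fixed
+ * size by type range: types below 0x40 occupy 4 bytes, types below
+ * 0x80 occupy 8 bytes, and anything above is variable-length (not
+ * handled by the parser above).  The stride rule as a helper:
+ */
+#ifdef AMDVI_EDITOR_EXAMPLE
+static size_t
+example_ivhd_entry_len(uint8_t type)
+{
+	if (type < 0x40)
+		return (sizeof(ACPI_IVRS_DEVICE4));	/* 4-byte entry */
+	if (type < 0x80)
+		return (sizeof(ACPI_IVRS_DEVICE8A));	/* 8-byte entry */
+	return (0);	/* variable-size entries are unsupported */
+}
+#endif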
+static bool
+ivhd_is_newer(ACPI_IVRS_HEADER *old, ACPI_IVRS_HEADER *new)
+{
+	/*
+	 * A newer IVRS header type takes precedence.
+	 */
+	if ((old->DeviceId == new->DeviceId) &&
+	    (old->Type == IVRS_TYPE_HARDWARE_LEGACY) &&
+	    ((new->Type == IVRS_TYPE_HARDWARE_EFR) ||
+	    (new->Type == IVRS_TYPE_HARDWARE_MIXED))) {
+		return (true);
+	}
+
+	return (false);
+}
+
+static void
+ivhd_identify(driver_t *driver, device_t parent)
+{
+	ACPI_TABLE_IVRS *ivrs;
+	ACPI_IVRS_HARDWARE *ivhd;
+	ACPI_STATUS status;
+	int i, count = 0;
+	uint32_t ivrs_ivinfo;
+
+	if (acpi_disabled("ivhd"))
+		return;
+
+	status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs);
+	if (ACPI_FAILURE(status))
+		return;
+
+	if (ivrs->Header.Length == 0) {
+		return;
+	}
+
+	ivrs_ivinfo = ivrs->Info;
+	printf("AMD-Vi: IVRS Info VAsize = %d PAsize = %d GVAsize = %d"
+	    " flags:%b\n",
+	    REG_BITS(ivrs_ivinfo, 21, 15), REG_BITS(ivrs_ivinfo, 14, 8),
+	    REG_BITS(ivrs_ivinfo, 7, 5), REG_BITS(ivrs_ivinfo, 22, 22),
+	    "\020\001EFRSup");
+
+	ivrs_hdr_iterate_tbl(ivhd_count_iter, NULL);
+	if (!ivhd_count)
+		return;
+
+	for (i = 0; i < ivhd_count; i++) {
+		ivhd = ivhd_find_by_index(i);
+		KASSERT(ivhd, ("ivhd%d is NULL\n", i));
+		ivhd_hdrs[i] = ivhd;
+	}
+
+	/*
+	 * Scan for the presence of both a legacy and a non-legacy
+	 * header for the same AMD-Vi device, and let the newer one
+	 * override the old one.
+	 */
+	for (i = ivhd_count - 1; i > 0; i--) {
+		if (ivhd_is_newer(&ivhd_hdrs[i - 1]->Header,
+		    &ivhd_hdrs[i]->Header)) {
+			ivhd_hdrs[i - 1] = ivhd_hdrs[i];
+			ivhd_count--;
+		}
+	}
+
+	ivhd_devs = malloc(sizeof(device_t) * ivhd_count, M_DEVBUF,
+	    M_WAITOK | M_ZERO);
+	for (i = 0; i < ivhd_count; i++) {
+		ivhd = ivhd_hdrs[i];
+		KASSERT(ivhd, ("ivhd%d is NULL\n", i));
+
+		/*
+		 * Use a high order to ensure that this driver is probed after
+		 * the Host-PCI bridge and the root PCI bus.
+		 */
+		ivhd_devs[i] = BUS_ADD_CHILD(parent,
+		    ACPI_DEV_BASE_ORDER + 10 * 10, "ivhd", i);
+
+		/*
+		 * XXX: If the device was not destroyed before, the add
+		 * will fail; locate the old device instance instead.
+		 */
+		if (ivhd_devs[i] == NULL) {
+			ivhd_devs[i] = device_find_child(parent, "ivhd", i);
+			if (ivhd_devs[i] == NULL) {
+				printf("AMD-Vi: can't find ivhd%d\n", i);
+				break;
+			}
+		}
+		count++;
+	}
+
+	/*
+	 * Update the device count in case some devices failed to
+	 * attach.
+	 */
+	ivhd_count = count;
+}
+
+static int
+ivhd_probe(device_t dev)
+{
+	ACPI_IVRS_HARDWARE *ivhd;
+	int unit;
+
+	if (acpi_get_handle(dev) != NULL)
+		return (ENXIO);
+
+	unit = device_get_unit(dev);
+	KASSERT((unit < ivhd_count),
+	    ("ivhd unit %d > count %d", unit, ivhd_count));
+	ivhd = ivhd_hdrs[unit];
+	KASSERT(ivhd, ("ivhd is NULL"));
+
+	switch (ivhd->Header.Type) {
+	case IVRS_TYPE_HARDWARE_EFR:
+		device_set_desc(dev, "AMD-Vi/IOMMU ivhd with EFR");
+		break;
+
+	case IVRS_TYPE_HARDWARE_MIXED:
+		device_set_desc(dev, "AMD-Vi/IOMMU ivhd in mixed format");
+		break;
+
+	case IVRS_TYPE_HARDWARE_LEGACY:
+	default:
+		device_set_desc(dev, "AMD-Vi/IOMMU ivhd");
+		break;
+	}
+
+	return (BUS_PROBE_NOWILDCARD);
+}
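+/*
+ * Editor's note -- illustration only.  ivhd_identify() walks the
+ * cached headers from the end and drops a legacy (type 0x10) entry
+ * whenever a newer EFR-capable header for the same DeviceId follows
+ * it, so [0x10, 0x11] for one IOMMU collapses to [0x11]; the same
+ * dedup as a standalone sketch:
+ */
+#ifdef AMDVI_EDITOR_EXAMPLE
+static int
+example_dedup(ACPI_IVRS_HARDWARE **hdrs, int count)
+{
+	int i;
+
+	for (i = count - 1; i > 0; i--) {
+		if (ivhd_is_newer(&hdrs[i - 1]->Header, &hdrs[i]->Header)) {
+			hdrs[i - 1] = hdrs[i];	/* keep the newer header */
+			count--;
+		}
+	}
+	return (count);
+}
+#endif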
+/*
+ * The legacy IVHD type has two extra high bits in its flag field;
+ * these have been moved to the EFR for non-legacy devices.
+ */
+static void
+ivhd_print_flag(device_t dev, enum IvrsType ivhd_type, uint8_t flag)
+{
+	switch (ivhd_type) {
+	case IVRS_TYPE_HARDWARE_LEGACY:
+		device_printf(dev, "Flag:%b\n", flag,
+		    "\020"
+		    "\001HtTunEn"
+		    "\002PassPW"
+		    "\003ResPassPW"
+		    "\004Isoc"
+		    "\005IotlbSup"
+		    "\006Coherent"
+		    "\007PreFSup"
+		    "\010PPRSup");
+		break;
+
+	case IVRS_TYPE_HARDWARE_EFR:
+	case IVRS_TYPE_HARDWARE_MIXED:
+		device_printf(dev, "Flag:%b\n", flag,
+		    "\020"
+		    "\001HtTunEn"
+		    "\002PassPW"
+		    "\003ResPassPW"
+		    "\004Isoc"
+		    "\005IotlbSup"
+		    "\006Coherent");
+		break;
+
+	default:
+		device_printf(dev, "Can't decode flag of ivhd type :0x%x\n",
+		    ivhd_type);
+		break;
+	}
+}
+
+/*
+ * Feature in legacy IVHD type (0x10) and attribute in newer types
+ * (0x11 and 0x40).
+ */
+static void
+ivhd_print_feature(device_t dev, enum IvrsType ivhd_type, uint32_t feature)
+{
+	switch (ivhd_type) {
+	case IVRS_TYPE_HARDWARE_LEGACY:
+		device_printf(dev, "Features(type:0x%x) HATS = %d GATS = %d"
+		    " MsiNumPPR = %d PNBanks= %d PNCounters= %d\n",
+		    ivhd_type,
+		    REG_BITS(feature, 31, 30),
+		    REG_BITS(feature, 29, 28),
+		    REG_BITS(feature, 27, 23),
+		    REG_BITS(feature, 22, 17),
+		    REG_BITS(feature, 16, 13));
+		device_printf(dev, "max PASID = %d GLXSup = %d Feature:%b\n",
+		    REG_BITS(feature, 12, 8),
+		    REG_BITS(feature, 4, 3),
+		    feature,
+		    "\020"
+		    "\002NXSup"
+		    "\003GTSup"
+		    "\004"
+		    "\005IASup"
+		    "\006GASup"
+		    "\007HESup");
+		break;
+
+	/* Fewer features or attributes are reported in non-legacy types. */
+	case IVRS_TYPE_HARDWARE_EFR:
+	case IVRS_TYPE_HARDWARE_MIXED:
+		device_printf(dev, "Features(type:0x%x) MsiNumPPR = %d"
+		    " PNBanks= %d PNCounters= %d\n",
+		    ivhd_type,
+		    REG_BITS(feature, 27, 23),
+		    REG_BITS(feature, 22, 17),
+		    REG_BITS(feature, 16, 13));
+		break;
+
+	default:	/* Other ivhd type features are not decoded. */
+		device_printf(dev, "Can't decode ivhd type :0x%x\n",
+		    ivhd_type);
+	}
+}
+
+/* Print extended features of the IOMMU. */
+static void
+ivhd_print_ext_feature(device_t dev, uint64_t ext_feature)
+{
+	uint32_t ext_low, ext_high;
+
+	if (!ext_feature)
+		return;
+
+	ext_low = ext_feature;
+	device_printf(dev, "Extended features[31:0]:%b "
+	    "HATS = 0x%x GATS = 0x%x "
+	    "GLXSup = 0x%x SmiFSup = 0x%x SmiFRC = 0x%x "
+	    "GAMSup = 0x%x DualPortLogSup = 0x%x DualEventLogSup = 0x%x\n",
+	    (int)ext_low,
+	    "\020"
+	    "\001PreFSup"
+	    "\002PPRSup"
+	    "\003"
+	    "\004NXSup"
+	    "\005GTSup"
+	    "\006"
+	    "\007IASup"
+	    "\010GASup"
+	    "\011HESup"
+	    "\012PCSup",
+	    REG_BITS(ext_low, 11, 10),
+	    REG_BITS(ext_low, 13, 12),
+	    REG_BITS(ext_low, 15, 14),
+	    REG_BITS(ext_low, 17, 16),
+	    REG_BITS(ext_low, 20, 18),
+	    REG_BITS(ext_low, 23, 21),
+	    REG_BITS(ext_low, 25, 24),
+	    REG_BITS(ext_low, 29, 28));
+
+	ext_high = ext_feature >> 32;
+	device_printf(dev, "Extended features[62:32]:%b "
+	    "Max PASID: 0x%x DevTblSegSup = 0x%x "
+	    "MarcSup = 0x%x\n",
+	    (int)(ext_high),
+	    "\020"
+	    "\006USSup"
+	    "\011PprOvrflwEarlySup"
+	    "\012PPRAutoRspSup"
+	    "\015BlKStopMrkSup"
+	    "\016PerfOptSup"
+	    "\017MsiCapMmioSup"
+	    "\021GIOSup"
+	    "\022HASup"
+	    "\023EPHSup"
+	    "\024AttrFWSup"
+	    "\025HDSup"
+	    "\027InvIotlbSup",
+	    REG_BITS(ext_high, 5, 0),
+	    REG_BITS(ext_high, 8, 7),
+	    REG_BITS(ext_high, 11, 10));
+}
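+/*
+ * Editor's note -- illustration only.  The bit-name strings above use
+ * octal escapes, so decimal bit numbers 8, 9, 10, ... must be written
+ * \010, \011, \012, ...; an escape such as "\008" would embed a NUL
+ * and truncate the %b table (the escapes above were normalized
+ * accordingly).  Field extraction uses the same REG_BITS macro from
+ * amdvi_priv.h, e.g. for HATS in EFR[11:10] as decoded above:
+ */
+#ifdef AMDVI_EDITOR_EXAMPLE
+static inline uint32_t
+example_efr_hats(uint64_t ext_feature)
+{
+	/* HATS field, bits [11:10] of the low EFR word */
+	return (REG_BITS((uint32_t)ext_feature, 11, 10));
+}
+#endif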
+static int
+ivhd_print_cap(struct amdvi_softc *softc, ACPI_IVRS_HARDWARE *ivhd)
+{
+	device_t dev;
+	int max_ptp_level;
+
+	dev = softc->dev;
+
+	ivhd_print_flag(dev, softc->ivhd_type, softc->ivhd_flag);
+	ivhd_print_feature(dev, softc->ivhd_type, softc->ivhd_feature);
+	ivhd_print_ext_feature(dev, softc->ext_feature);
+	max_ptp_level = 7;
+	/*
+	 * Make sure the device supports the minimum page level
+	 * requested by the user.
+	 */
+	if (max_ptp_level < amdvi_ptp_level) {
+		device_printf(dev, "insufficient PTP level:%d\n",
+		    max_ptp_level);
+		return (EINVAL);
+	} else {
+		device_printf(softc->dev,
+		    "supported paging level:%d, will use only: %d\n",
+		    max_ptp_level, amdvi_ptp_level);
+	}
+
+	device_printf(softc->dev, "device range: 0x%x - 0x%x\n",
+	    softc->start_dev_rid, softc->end_dev_rid);
+
+	return (0);
+}
+
+static int
+ivhd_attach(device_t dev)
+{
+	ACPI_IVRS_HARDWARE *ivhd;
+	ACPI_IVRS_HARDWARE_EFRSUP *ivhd_efr;
+	struct amdvi_softc *softc;
+	int status, unit;
+
+	unit = device_get_unit(dev);
+	KASSERT((unit < ivhd_count),
+	    ("ivhd unit %d > count %d", unit, ivhd_count));
+	/* Make sure it's the same device for which attach was called. */
+	KASSERT((ivhd_devs[unit] == dev),
+	    ("Not same device old %p new %p", ivhd_devs[unit], dev));
+
+	softc = device_get_softc(dev);
+	softc->dev = dev;
+	ivhd = ivhd_hdrs[unit];
+	KASSERT(ivhd, ("ivhd is NULL"));
+
+	softc->ivhd_type = ivhd->Header.Type;
+	softc->pci_seg = ivhd->PciSegmentGroup;
+	softc->pci_rid = ivhd->Header.DeviceId;
+	softc->ivhd_flag = ivhd->Header.Flags;
+	/*
+	 * On the legacy IVHD type (0x10) this field is documented as a
+	 * feature; in newer types it is an attribute.
+	 */
+	softc->ivhd_feature = ivhd->Reserved;
+	/*
+	 * The PCI capability block has more capabilities that are not
+	 * part of the IVRS.
+	 */
+	softc->cap_off = ivhd->CapabilityOffset;
+
+#ifdef notyet
+	/* IVHD Info bit[4:0] is event MSI/X number. */
+	softc->event_msix = ivhd->Info & 0x1F;
+#endif
+	switch (ivhd->Header.Type) {
+	case IVRS_TYPE_HARDWARE_EFR:
+	case IVRS_TYPE_HARDWARE_MIXED:
+		ivhd_efr = (ACPI_IVRS_HARDWARE_EFRSUP *)ivhd;
+		softc->ext_feature = ivhd_efr->ExtFR;
+		break;
+	}
+
+	softc->ctrl = (struct amdvi_ctrl *)PHYS_TO_DMAP(ivhd->BaseAddress);
+	status = ivhd_dev_parse(ivhd, softc);
+	if (status != 0) {
+		device_printf(dev,
+		    "endpoint device parsing error=%d\n", status);
+	}
+
+	status = ivhd_print_cap(softc, ivhd);
+	if (status != 0) {
+		return (status);
+	}
+
+	status = amdvi_setup_hw(softc);
+	if (status != 0) {
+		device_printf(dev, "couldn't be initialised, error=%d\n",
+		    status);
+		return (status);
+	}
+
+	return (0);
+}
+
+static int
+ivhd_detach(device_t dev)
+{
+	struct amdvi_softc *softc;
+
+	softc = device_get_softc(dev);
+
+	amdvi_teardown_hw(softc);
+
+	/*
+	 * XXX: delete the device; don't allow detach, return EBUSY.
+	 */
+	return (0);
+}
+
+static int
+ivhd_suspend(device_t dev)
+{
+
+	return (0);
+}
+
+static int
+ivhd_resume(device_t dev)
+{
+
+	return (0);
+}
+
+static device_method_t ivhd_methods[] = {
+	DEVMETHOD(device_identify, ivhd_identify),
+	DEVMETHOD(device_probe, ivhd_probe),
+	DEVMETHOD(device_attach, ivhd_attach),
+	DEVMETHOD(device_detach, ivhd_detach),
+	DEVMETHOD(device_suspend, ivhd_suspend),
+	DEVMETHOD(device_resume, ivhd_resume),
+	DEVMETHOD_END
+};
+
+static driver_t ivhd_driver = {
+	"ivhd",
+	ivhd_methods,
+	sizeof(struct amdvi_softc),
+};
+
+static devclass_t ivhd_devclass;
+
+/*
+ * Load this module last, after PCI re-probing, so the interrupt can
+ * be configured.
+ */
+DRIVER_MODULE_ORDERED(ivhd, acpi, ivhd_driver, ivhd_devclass, 0, 0,
+    SI_ORDER_ANY);
+MODULE_DEPEND(ivhd, acpi, 1, 1, 1);
+MODULE_DEPEND(ivhd, pci, 1, 1, 1);
diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.c b/usr/src/uts/i86pc/io/vmm/amd/npt.c
new file mode 100644
index 0000000000..e61464a964
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/amd/npt.c
@@ -0,0 +1,87 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com)
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include + +#include "npt.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, npt, CTLFLAG_RW, NULL, NULL); + +static int npt_flags; +SYSCTL_INT(_hw_vmm_npt, OID_AUTO, pmap_flags, CTLFLAG_RD, + &npt_flags, 0, NULL); + +#define NPT_IPIMASK 0xFF + +/* + * AMD nested page table init. + */ +int +svm_npt_init(int ipinum) +{ + int enable_superpage = 1; + + npt_flags = ipinum & NPT_IPIMASK; + TUNABLE_INT_FETCH("hw.vmm.npt.enable_superpage", &enable_superpage); + if (enable_superpage) + npt_flags |= PMAP_PDE_SUPERPAGE; + + return (0); +} + +static int +npt_pinit(pmap_t pmap) +{ + + return (pmap_pinit_type(pmap, PT_RVI, npt_flags)); +} + +struct vmspace * +svm_npt_alloc(vm_offset_t min, vm_offset_t max) +{ + + return (vmspace_alloc(min, max, npt_pinit)); +} + +void +svm_npt_free(struct vmspace *vmspace) +{ + + vmspace_free(vmspace); +} diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.h b/usr/src/uts/i86pc/io/vmm/amd/npt.h new file mode 100644 index 0000000000..35530d7833 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/npt.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_NPT_H_ +#define _SVM_NPT_H_ + +int svm_npt_init(int ipinum); +struct vmspace *svm_npt_alloc(vm_offset_t min, vm_offset_t max); +void svm_npt_free(struct vmspace *vmspace); + +#endif /* _SVM_NPT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/offsets.in b/usr/src/uts/i86pc/io/vmm/amd/offsets.in new file mode 100644 index 0000000000..f8d2a716d7 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/offsets.in @@ -0,0 +1,36 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ +#include + +#include "amd/svm.h" + +svm_regctx + sctx_rbx SCTX_RBX + sctx_rcx SCTX_RCX + sctx_rbp SCTX_RBP + sctx_rdx SCTX_RDX + sctx_rdi SCTX_RDI + sctx_rsi SCTX_RSI + sctx_r8 SCTX_R8 + sctx_r9 SCTX_R9 + sctx_r10 SCTX_R10 + sctx_r11 SCTX_R11 + sctx_r12 SCTX_R12 + sctx_r13 SCTX_R13 + sctx_r14 SCTX_R14 + sctx_r15 SCTX_R15 + +/* Pull in definition for MSR_GSBASE */ +\#include diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c new file mode 100644 index 0000000000..25dc3a63fa --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -0,0 +1,2446 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef __FreeBSD__ +#include +#include +#endif + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "vmm_ktr.h" +#include "vmm_ioport.h" +#include "vatpic.h" +#include "vlapic.h" +#include "vlapic_priv.h" + +#include "x86.h" +#include "vmcb.h" +#include "svm.h" +#include "svm_softc.h" +#include "svm_msr.h" +#include "npt.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL); + +/* + * SVM CPUID function 0x8000_000A, edx bit decoding. + */ +#define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ +#define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ +#define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ +#define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ +#define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */ +#define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ +#define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ +#define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ +#define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ +#define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ +#define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ + +#define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ + VMCB_CACHE_IOPM | \ + VMCB_CACHE_I | \ + VMCB_CACHE_TPR | \ + VMCB_CACHE_CR2 | \ + VMCB_CACHE_CR | \ + VMCB_CACHE_DR | \ + VMCB_CACHE_DT | \ + VMCB_CACHE_SEG | \ + VMCB_CACHE_NP) + +static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; +SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, + 0, NULL); + +static MALLOC_DEFINE(M_SVM, "svm", "svm"); +static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); + +#ifdef __FreeBSD__ +/* Per-CPU context area. */ +extern struct pcpu __pcpu[]; +#endif + +static uint32_t svm_feature = ~0U; /* AMD SVM features. */ +SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0, + "SVM features advertised by CPUID.8000000AH:EDX"); + +static int disable_npf_assist; +SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, + &disable_npf_assist, 0, NULL); + +#ifdef __FreeBSD__ +/* Maximum ASIDs supported by the processor */ +static uint32_t nasid; +SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0, + "Number of ASIDs supported by this processor"); + +/* Current ASID generation for each host cpu */ +static struct asid asid[MAXCPU]; + +/* + * SVM host state saved area of size 4KB for each core. + */ +static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); +#endif /* __FreeBSD__ */ + +static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); +static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); +static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); + +static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); + +static __inline int +flush_by_asid(void) +{ + + return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID); +} + +static __inline int +decode_assist(void) +{ + + return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); +} + +#ifdef __FreeBSD__ +static void +svm_disable(void *arg __unused) +{ + uint64_t efer; + + efer = rdmsr(MSR_EFER); + efer &= ~EFER_SVM; + wrmsr(MSR_EFER, efer); +} + +/* + * Disable SVM on all CPUs. 
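+ *
+ * (Illustrative note, not part of the original change: smp_rendezvous()
+ * runs svm_disable() on every CPU, clearing EFER.SVM so that VMRUN is
+ * no longer usable on that CPU.)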
+ */ +static int +svm_cleanup(void) +{ + + smp_rendezvous(NULL, svm_disable, NULL, NULL); + return (0); +} + +/* + * Verify that all the features required by bhyve are available. + */ +static int +check_svm_features(void) +{ + u_int regs[4]; + + /* CPUID Fn8000_000A is for SVM */ + do_cpuid(0x8000000A, regs); + svm_feature &= regs[3]; + + /* + * The number of ASIDs can be configured to be less than what is + * supported by the hardware but not more. + */ + if (nasid == 0 || nasid > regs[1]) + nasid = regs[1]; + KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid)); + + /* bhyve requires the Nested Paging feature */ + if (!(svm_feature & AMD_CPUID_SVM_NP)) { + printf("SVM: Nested Paging feature not available.\n"); + return (ENXIO); + } + + /* bhyve requires the NRIP Save feature */ + if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) { + printf("SVM: NRIP Save feature not available.\n"); + return (ENXIO); + } + + return (0); +} + +static void +svm_enable(void *arg __unused) +{ + uint64_t efer; + + efer = rdmsr(MSR_EFER); + efer |= EFER_SVM; + wrmsr(MSR_EFER, efer); + + wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu])); +} + +/* + * Return 1 if SVM is enabled on this processor and 0 otherwise. + */ +static int +svm_available(void) +{ + uint64_t msr; + +#ifdef __FreeBSD__ + /* Section 15.4 Enabling SVM from APM2. */ + if ((amd_feature2 & AMDID2_SVM) == 0) { + printf("SVM: not available.\n"); + return (0); + } +#else + if (!is_x86_feature(x86_featureset, X86FSET_SVM)) { + cmn_err(CE_WARN, "processor does not support SVM operation\n"); + return (0); + } +#endif + + msr = rdmsr(MSR_VM_CR); + if ((msr & VM_CR_SVMDIS) != 0) { +#ifdef __FreeBSD__ + printf("SVM: disabled by BIOS.\n"); +#else + cmn_err(CE_WARN, "SVM disabled by BIOS.\n"); +#endif + return (0); + } + + return (1); +} + +static int +svm_init(int ipinum) +{ + int error, cpu; + + if (!svm_available()) + return (ENXIO); + + error = check_svm_features(); + if (error) + return (error); + + vmcb_clean &= VMCB_CACHE_DEFAULT; + + for (cpu = 0; cpu < MAXCPU; cpu++) { + /* + * Initialize the host ASIDs to their "highest" valid values. + * + * The next ASID allocation will rollover both 'gen' and 'num' + * and start off the sequence at {1,1}. + */ + asid[cpu].gen = ~0UL; + asid[cpu].num = nasid - 1; + } + + svm_msr_init(); + svm_npt_init(ipinum); + + /* Enable SVM on all CPUs */ + smp_rendezvous(NULL, svm_enable, NULL, NULL); + + return (0); +} + +static void +svm_restore(void) +{ + + svm_enable(NULL); +} +#else /* __FreeBSD__ */ +static int +svm_cleanup(void) +{ + /* This is taken care of by the hma registration */ + return (0); +} + +static int +svm_init(int ipinum) +{ + vmcb_clean &= VMCB_CACHE_DEFAULT; + + svm_msr_init(); + svm_npt_init(ipinum); + + return (0); +} + +static void +svm_restore(void) +{ + /* No-op on illumos */ +} +#endif /* __FreeBSD__ */ + +/* Pentium compatible MSRs */ +#define MSR_PENTIUM_START 0 +#define MSR_PENTIUM_END 0x1FFF +/* AMD 6th generation and Intel compatible MSRs */ +#define MSR_AMD6TH_START 0xC0000000UL +#define MSR_AMD6TH_END 0xC0001FFFUL +/* AMD 7th and 8th generation compatible MSRs */ +#define MSR_AMD7TH_START 0xC0010000UL +#define MSR_AMD7TH_END 0xC0011FFFUL + +/* + * Get the index and bit position for a MSR in permission bitmap. + * Two bits are used for each MSR: lower bit for read and higher bit for write. 
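+ *
+ * Worked example (illustrative, not part of the original change):
+ * MSR_EFER (0xC0000080) lies in the AMD 6th generation range, so
+ *	off   = 0xC0000080 - MSR_AMD6TH_START = 0x80
+ *	index = (0x80 + 0x2000) / 4 = 0x820
+ *	bit   = (0xC0000080 % 4) * 2 = 0
+ * i.e. bit 0 of perm_bitmap[0x820] gates reads of EFER and bit 1 gates
+ * writes.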
+ */ +static int +svm_msr_index(uint64_t msr, int *index, int *bit) +{ + uint32_t base, off; + + *index = -1; + *bit = (msr % 4) * 2; + base = 0; + + if (msr <= MSR_PENTIUM_END) { + *index = msr / 4; + return (0); + } + + base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); + if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { + off = (msr - MSR_AMD6TH_START); + *index = (off + base) / 4; + return (0); + } + + base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); + if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { + off = (msr - MSR_AMD7TH_START); + *index = (off + base) / 4; + return (0); + } + + return (EINVAL); +} + +/* + * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. + */ +static void +svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) +{ + int index, bit, error; + + error = svm_msr_index(msr, &index, &bit); + KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr)); + KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, + ("%s: invalid index %d for msr %#lx", __func__, index, msr)); + KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " + "msr %#lx", __func__, bit, msr)); + + if (read) + perm_bitmap[index] &= ~(1UL << bit); + + if (write) + perm_bitmap[index] &= ~(2UL << bit); +} + +static void +svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, true); +} + +static void +svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, false); +} + +static __inline int +svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) +{ + struct vmcb_ctrl *ctrl; + + KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + return (ctrl->intercept[idx] & bitmask ? 1 : 0); +} + +static __inline void +svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask, + int enabled) +{ + struct vmcb_ctrl *ctrl; + uint32_t oldval; + + KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + oldval = ctrl->intercept[idx]; + + if (enabled) + ctrl->intercept[idx] |= bitmask; + else + ctrl->intercept[idx] &= ~bitmask; + + if (ctrl->intercept[idx] != oldval) { + svm_set_dirty(sc, vcpu, VMCB_CACHE_I); + VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified " + "from %#x to %#x", idx, oldval, ctrl->intercept[idx]); + } +} + +static __inline void +svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) +{ + + svm_set_intercept(sc, vcpu, off, bitmask, 0); +} + +static __inline void +svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) +{ + + svm_set_intercept(sc, vcpu, off, bitmask, 1); +} + +static void +vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, + uint64_t msrpm_base_pa, uint64_t np_pml4) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + uint32_t mask; + int n; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + state = svm_get_vmcb_state(sc, vcpu); + + ctrl->iopm_base_pa = iopm_base_pa; + ctrl->msrpm_base_pa = msrpm_base_pa; + + /* Enable nested paging */ + ctrl->np_enable = 1; + ctrl->n_cr3 = np_pml4; + + /* + * Intercept accesses to the control registers that are not shadowed + * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. 
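+	 *
+	 * Layout note (illustrative, not part of the original change):
+	 * in the CR intercept vector, bit n intercepts reads of CRn and
+	 * bit (n + 16) intercepts writes of CRn, which is why the loop
+	 * below builds its mask as (BIT(n) << 16) | BIT(n).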
+ */ + for (n = 0; n < 16; n++) { + mask = (BIT(n) << 16) | BIT(n); + if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) + svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); + else + svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); + } + + + /* + * Intercept everything when tracing guest exceptions otherwise + * just intercept machine check exception. + */ + if (vcpu_trace_exceptions(sc->vm, vcpu)) { + for (n = 0; n < 32; n++) { + /* + * Skip unimplemented vectors in the exception bitmap. + */ + if (n == 2 || n == 9) { + continue; + } + svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n)); + } + } else { + svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); + } + + /* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_FERR_FREEZE); + + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); + + /* + * From section "Canonicalization and Consistency Checks" in APMv2 + * the VMRUN intercept bit must be set to pass the consistency check. + */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); + + /* + * The ASID will be set to a non-zero value just before VMRUN. + */ + ctrl->asid = 0; + + /* + * Section 15.21.1, Interrupt Masking in EFLAGS + * Section 15.21.2, Virtualizing APIC.TPR + * + * This must be set for %rflag and %cr8 isolation of guest and host. + */ + ctrl->v_intr_masking = 1; + + /* Enable Last Branch Record aka LBR for debugging */ + ctrl->lbr_virt_en = 1; + state->dbgctl = BIT(0); + + /* EFER_SVM must always be set when the guest is executing */ + state->efer = EFER_SVM; + + /* Set up the PAT to power-on state */ + state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + + /* Set up DR6/7 to power-on state */ + state->dr6 = DBREG_DR6_RESERVED1; + state->dr7 = DBREG_DR7_RESERVED1; +} + +/* + * Initialize a virtual machine. 
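+ *
+ * Sketch of the steps below (illustrative, not part of the original
+ * change): allocate the page-aligned softc and the MSR/IO permission
+ * bitmaps, intercept all MSR and I/O port accesses by default, punch
+ * holes for the MSRs that the VMCB already virtualizes, then run
+ * vmcb_init() and svm_msr_guest_init() for each vcpu.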
+ */ +static void * +svm_vminit(struct vm *vm, pmap_t pmap) +{ + struct svm_softc *svm_sc; + struct svm_vcpu *vcpu; + vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; + int i; + uint16_t maxcpus; + + svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO); + if (((uintptr_t)svm_sc & PAGE_MASK) != 0) + panic("malloc of svm_softc not aligned on page boundary"); + + svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM, + M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); + if (svm_sc->msr_bitmap == NULL) + panic("contigmalloc of SVM MSR bitmap failed"); + svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM, + M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); + if (svm_sc->iopm_bitmap == NULL) + panic("contigmalloc of SVM IO bitmap failed"); + + svm_sc->vm = vm; + svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); + + /* + * Intercept read and write accesses to all MSRs. + */ + memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE); + + /* + * Access to the following MSRs is redirected to the VMCB when the + * guest is executing. Therefore it is safe to allow the guest to + * read/write these MSRs directly without hypervisor involvement. + */ + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); + + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); + + svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); + + /* + * Intercept writes to make sure that the EFER_SVM bit is not cleared. + */ + svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); + + /* Intercept access to all I/O ports. */ + memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE); + + iopm_pa = vtophys(svm_sc->iopm_bitmap); + msrpm_pa = vtophys(svm_sc->msr_bitmap); + pml4_pa = svm_sc->nptp; + maxcpus = vm_get_maxcpus(svm_sc->vm); + for (i = 0; i < maxcpus; i++) { + vcpu = svm_get_vcpu(svm_sc, i); + vcpu->nextrip = ~0; + vcpu->lastcpu = NOCPU; + vcpu->vmcb_pa = vtophys(&vcpu->vmcb); + vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); + svm_msr_guest_init(svm_sc, i); + } + return (svm_sc); +} + +/* + * Collateral for a generic SVM VM-exit. + */ +static void +vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) +{ + + vme->exitcode = VM_EXITCODE_SVM; + vme->u.svm.exitcode = code; + vme->u.svm.exitinfo1 = info1; + vme->u.svm.exitinfo2 = info2; +} + +static int +svm_cpl(struct vmcb_state *state) +{ + + /* + * From APMv2: + * "Retrieve the CPL from the CPL field in the VMCB, not + * from any segment DPL" + */ + return (state->cpl); +} + +static enum vm_cpu_mode +svm_vcpu_mode(struct vmcb *vmcb) +{ + struct vmcb_segment seg; + struct vmcb_state *state; + int error; + + state = &vmcb->state; + + if (state->efer & EFER_LMA) { + error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); + KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__, + error)); + + /* + * Section 4.8.1 for APM2, check if Code Segment has + * Long attribute set in descriptor. 
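+		 *
+		 * (Illustrative summary, not part of the original
+		 * change: with EFER.LMA set, CS.L selects 64-bit mode
+		 * when 1 and compatibility mode when 0; without LMA,
+		 * CR0.PE distinguishes protected mode from real mode.)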
+ */ + if (seg.attrib & VMCB_CS_ATTRIB_L) + return (CPU_MODE_64BIT); + else + return (CPU_MODE_COMPATIBILITY); + } else if (state->cr0 & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } +} + +static enum vm_paging_mode +svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) +{ + + if ((cr0 & CR0_PG) == 0) + return (PAGING_MODE_FLAT); + if ((cr4 & CR4_PAE) == 0) + return (PAGING_MODE_32); + if (efer & EFER_LME) + return (PAGING_MODE_64); + else + return (PAGING_MODE_PAE); +} + +/* + * ins/outs utility routines + */ +static uint64_t +svm_inout_str_index(struct svm_regctx *regs, int in) +{ + uint64_t val; + + val = in ? regs->sctx_rdi : regs->sctx_rsi; + + return (val); +} + +static uint64_t +svm_inout_str_count(struct svm_regctx *regs, int rep) +{ + uint64_t val; + + val = rep ? regs->sctx_rcx : 1; + + return (val); +} + +static void +svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1, + int in, struct vm_inout_str *vis) +{ + int error, s; + + if (in) { + vis->seg_name = VM_REG_GUEST_ES; + } else { + /* The segment field has standard encoding */ + s = (info1 >> 10) & 0x7; + vis->seg_name = vm_segment_name(s); + } + + error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc); + KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error)); +} + +static int +svm_inout_str_addrsize(uint64_t info1) +{ + uint32_t size; + + size = (info1 >> 7) & 0x7; + switch (size) { + case 1: + return (2); /* 16 bit */ + case 2: + return (4); /* 32 bit */ + case 4: + return (8); /* 64 bit */ + default: + panic("%s: invalid size encoding %d", __func__, size); + } +} + +static void +svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) +{ + struct vmcb_state *state; + + state = &vmcb->state; + paging->cr3 = state->cr3; + paging->cpl = svm_cpl(state); + paging->cpu_mode = svm_vcpu_mode(vmcb); + paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, + state->efer); +} + +#define UNHANDLED 0 + +/* + * Handle guest I/O intercept. + */ +static int +svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + struct svm_regctx *regs; + struct vm_inout_str *vis; + uint64_t info1; + int inout_string; + + state = svm_get_vmcb_state(svm_sc, vcpu); + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + regs = svm_get_guest_regctx(svm_sc, vcpu); + + info1 = ctrl->exitinfo1; + inout_string = info1 & BIT(2) ? 1 : 0; + + /* + * The effective segment number in EXITINFO1[12:10] is populated + * only if the processor has the DecodeAssist capability. + * + * XXX this is not specified explicitly in APMv2 but can be verified + * empirically. + */ + if (inout_string && !decode_assist()) + return (UNHANDLED); + + vmexit->exitcode = VM_EXITCODE_INOUT; + vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0; + vmexit->u.inout.string = inout_string; + vmexit->u.inout.rep = (info1 & BIT(3)) ? 
1 : 0; + vmexit->u.inout.bytes = (info1 >> 4) & 0x7; + vmexit->u.inout.port = (uint16_t)(info1 >> 16); + vmexit->u.inout.eax = (uint32_t)(state->rax); + + if (inout_string) { + vmexit->exitcode = VM_EXITCODE_INOUT_STR; + vis = &vmexit->u.inout_str; + svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging); + vis->rflags = state->rflags; + vis->cr0 = state->cr0; + vis->index = svm_inout_str_index(regs, vmexit->u.inout.in); + vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep); + vis->addrsize = svm_inout_str_addrsize(info1); + svm_inout_str_seginfo(svm_sc, vcpu, info1, + vmexit->u.inout.in, vis); + } + + return (UNHANDLED); +} + +static int +npf_fault_type(uint64_t exitinfo1) +{ + + if (exitinfo1 & VMCB_NPF_INFO1_W) + return (VM_PROT_WRITE); + else if (exitinfo1 & VMCB_NPF_INFO1_ID) + return (VM_PROT_EXECUTE); + else + return (VM_PROT_READ); +} + +static bool +svm_npf_emul_fault(uint64_t exitinfo1) +{ + + if (exitinfo1 & VMCB_NPF_INFO1_ID) { + return (false); + } + + if (exitinfo1 & VMCB_NPF_INFO1_GPT) { + return (false); + } + + if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { + return (false); + } + + return (true); +} + +static void +svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) +{ + struct vm_guest_paging *paging; + struct vmcb_segment seg; + struct vmcb_ctrl *ctrl; + char *inst_bytes; + int error, inst_len; + + ctrl = &vmcb->ctrl; + paging = &vmexit->u.inst_emul.paging; + + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->u.inst_emul.gpa = gpa; + vmexit->u.inst_emul.gla = VIE_INVALID_GLA; + svm_paging_info(vmcb, paging); + + error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); + KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error)); + + switch(paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = seg.base; + vmexit->u.inst_emul.cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = seg.base; + + /* + * Section 4.8.1 of APM2, Default Operand Size or D bit. + */ + vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ? + 1 : 0; + break; + default: + vmexit->u.inst_emul.cs_base = 0; + vmexit->u.inst_emul.cs_d = 0; + break; + } + + /* + * Copy the instruction bytes into 'vie' if available. + */ + if (decode_assist() && !disable_npf_assist) { + inst_len = ctrl->inst_len; + inst_bytes = (char *)ctrl->inst_bytes; + } else { + inst_len = 0; + inst_bytes = NULL; + } + vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); +} + +#ifdef KTR +static const char * +intrtype_to_str(int intr_type) +{ + switch (intr_type) { + case VMCB_EVENTINJ_TYPE_INTR: + return ("hwintr"); + case VMCB_EVENTINJ_TYPE_NMI: + return ("nmi"); + case VMCB_EVENTINJ_TYPE_INTn: + return ("swintr"); + case VMCB_EVENTINJ_TYPE_EXCEPTION: + return ("exception"); + default: + panic("%s: unknown intr_type %d", __func__, intr_type); + } +} +#endif + +/* + * Inject an event to vcpu as described in section 15.20, "Event injection". 
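+ *
+ * Field layout assembled below (illustrative, not part of the original
+ * change; see APMv2 for the authoritative definition):
+ *	EVENTINJ[7:0]	vector
+ *	EVENTINJ[10:8]	type (INTR, NMI, INTn or exception)
+ *	EVENTINJ[11]	error code valid
+ *	EVENTINJ[31]	valid
+ *	EVENTINJ[63:32]	error code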
+ */ +static void +svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector, + uint32_t error, bool ec_valid) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, + ("%s: event already pending %#lx", __func__, ctrl->eventinj)); + + KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d", + __func__, vector)); + + switch (intr_type) { + case VMCB_EVENTINJ_TYPE_INTR: + case VMCB_EVENTINJ_TYPE_NMI: + case VMCB_EVENTINJ_TYPE_INTn: + break; + case VMCB_EVENTINJ_TYPE_EXCEPTION: + if (vector >= 0 && vector <= 31 && vector != 2) + break; + /* FALLTHROUGH */ + default: + panic("%s: invalid intr_type/vector: %d/%d", __func__, + intr_type, vector); + } + ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID; + if (ec_valid) { + ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; + ctrl->eventinj |= (uint64_t)error << 32; + VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x", + intrtype_to_str(intr_type), vector, error); + } else { + VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d", + intrtype_to_str(intr_type), vector); + } +} + +static void +svm_update_virqinfo(struct svm_softc *sc, int vcpu) +{ + struct vm *vm; + struct vlapic *vlapic; + struct vmcb_ctrl *ctrl; + + vm = sc->vm; + vlapic = vm_lapic(vm, vcpu); + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + /* Update %cr8 in the emulated vlapic */ + vlapic_set_cr8(vlapic, ctrl->v_tpr); + + /* Virtual interrupt injection is not used. */ + KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid " + "v_intr_vector %d", __func__, ctrl->v_intr_vector)); +} + +static void +svm_save_intinfo(struct svm_softc *svm_sc, int vcpu) +{ + struct vmcb_ctrl *ctrl; + uint64_t intinfo; + + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + intinfo = ctrl->exitintinfo; + if (!VMCB_EXITINTINFO_VALID(intinfo)) + return; + + /* + * From APMv2, Section "Intercepts during IDT interrupt delivery" + * + * If a #VMEXIT happened during event delivery then record the event + * that was being delivered. 
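+	 *
+	 * Concrete example (illustrative, not part of the original
+	 * change): if an interrupt being delivered through the guest
+	 * IDT takes a nested page fault on the IDT access itself, the
+	 * half-delivered event is preserved in EXITINTINFO so it can be
+	 * re-injected on the next entry instead of being lost.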
+ */ + VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n", + intinfo, VMCB_EXITINTINFO_VECTOR(intinfo)); + vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1); + vm_exit_intinfo(svm_sc->vm, vcpu, intinfo); +} + +static __inline int +vintr_intercept_enabled(struct svm_softc *sc, int vcpu) +{ + + return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_VINTR)); +} + +static __inline void +enable_intr_window_exiting(struct svm_softc *sc, int vcpu) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + if (ctrl->v_irq && ctrl->v_intr_vector == 0) { + KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__)); + KASSERT(vintr_intercept_enabled(sc, vcpu), + ("%s: vintr intercept should be enabled", __func__)); + return; + } + + VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting"); + ctrl->v_irq = 1; + ctrl->v_ign_tpr = 1; + ctrl->v_intr_vector = 0; + svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); +} + +static __inline void +disable_intr_window_exiting(struct svm_softc *sc, int vcpu) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + if (!ctrl->v_irq && ctrl->v_intr_vector == 0) { + KASSERT(!vintr_intercept_enabled(sc, vcpu), + ("%s: vintr intercept should be disabled", __func__)); + return; + } + + VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting"); + ctrl->v_irq = 0; + ctrl->v_intr_vector = 0; + svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); + svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR); +} + +static int +svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val) +{ + struct vmcb_ctrl *ctrl; + int oldval, newval; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + oldval = ctrl->intr_shadow; + newval = val ? 1 : 0; + if (newval != oldval) { + ctrl->intr_shadow = newval; + VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval); + } + return (0); +} + +static int +svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + *val = ctrl->intr_shadow; + return (0); +} + +/* + * Once an NMI is injected it blocks delivery of further NMIs until the handler + * executes an IRET. The IRET intercept is enabled when an NMI is injected to + * to track when the vcpu is done handling the NMI. + */ +static int +nmi_blocked(struct svm_softc *sc, int vcpu) +{ + int blocked; + + blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_IRET); + return (blocked); +} + +static void +enable_nmi_blocking(struct svm_softc *sc, int vcpu) +{ + + KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked")); + VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled"); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); +} + +static void +clear_nmi_blocking(struct svm_softc *sc, int vcpu) +{ + int error; + + KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked")); + VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared"); + /* + * When the IRET intercept is cleared the vcpu will attempt to execute + * the "iret" when it runs next. However, it is possible to inject + * another NMI into the vcpu before the "iret" has actually executed. + * + * For e.g. if the "iret" encounters a #NPF when accessing the stack + * it will trap back into the hypervisor. If an NMI is pending for + * the vcpu it will be injected into the guest. 
+ * + * XXX this needs to be fixed + */ + svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); + + /* + * Set 'intr_shadow' to prevent an NMI from being injected on the + * immediate VMRUN. + */ + error = svm_modify_intr_shadow(sc, vcpu, 1); + KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error)); +} + +#define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL + +static int +svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu) +{ + struct vm_exit *vme; + struct vmcb_state *state; + uint64_t changed, lma, oldval; + int error; + + state = svm_get_vmcb_state(sc, vcpu); + + oldval = state->efer; + VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval); + + newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */ + changed = oldval ^ newval; + + if (newval & EFER_MBZ_BITS) + goto gpf; + + /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */ + if (changed & EFER_LME) { + if (state->cr0 & CR0_PG) + goto gpf; + } + + /* EFER.LMA = EFER.LME & CR0.PG */ + if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) + lma = EFER_LMA; + else + lma = 0; + + if ((newval & EFER_LMA) != lma) + goto gpf; + + if (newval & EFER_NXE) { + if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) + goto gpf; + } + + /* + * XXX bhyve does not enforce segment limits in 64-bit mode. Until + * this is fixed flag guest attempt to set EFER_LMSLE as an error. + */ + if (newval & EFER_LMSLE) { + vme = vm_exitinfo(sc->vm, vcpu); + vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0); + *retu = true; + return (0); + } + + if (newval & EFER_FFXSR) { + if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) + goto gpf; + } + + if (newval & EFER_TCE) { + if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) + goto gpf; + } + + error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval); + KASSERT(error == 0, ("%s: error %d updating efer", __func__, error)); + return (0); +gpf: + vm_inject_gp(sc->vm, vcpu); + return (0); +} + +static int +emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, + bool *retu) +{ + int error; + + if (lapic_msr(num)) + error = lapic_wrmsr(sc->vm, vcpu, num, val, retu); + else if (num == MSR_EFER) + error = svm_write_efer(sc, vcpu, val, retu); + else + error = svm_wrmsr(sc, vcpu, num, val, retu); + + return (error); +} + +static int +emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu) +{ + struct vmcb_state *state; + struct svm_regctx *ctx; + uint64_t result; + int error; + + if (lapic_msr(num)) + error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu); + else + error = svm_rdmsr(sc, vcpu, num, &result, retu); + + if (error == 0) { + state = svm_get_vmcb_state(sc, vcpu); + ctx = svm_get_guest_regctx(sc, vcpu); + state->rax = result & 0xffffffff; + ctx->sctx_rdx = result >> 32; + } + + return (error); +} + +#ifdef KTR +static const char * +exit_reason_to_str(uint64_t reason) +{ + static char reasonbuf[32]; + + switch (reason) { + case VMCB_EXIT_INVALID: + return ("invalvmcb"); + case VMCB_EXIT_SHUTDOWN: + return ("shutdown"); + case VMCB_EXIT_NPF: + return ("nptfault"); + case VMCB_EXIT_PAUSE: + return ("pause"); + case VMCB_EXIT_HLT: + return ("hlt"); + case VMCB_EXIT_CPUID: + return ("cpuid"); + case VMCB_EXIT_IO: + return ("inout"); + case VMCB_EXIT_MC: + return ("mchk"); + case VMCB_EXIT_INTR: + return ("extintr"); + case VMCB_EXIT_NMI: + return ("nmi"); + case VMCB_EXIT_VINTR: + return ("vintr"); + case VMCB_EXIT_MSR: + return ("msr"); + case VMCB_EXIT_IRET: + return ("iret"); + case VMCB_EXIT_MONITOR: + return ("monitor"); + case 
VMCB_EXIT_MWAIT: + return ("mwait"); + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason); + return (reasonbuf); + } +} +#endif /* KTR */ + +/* + * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs + * that are due to instruction intercepts as well as MSR and IOIO intercepts + * and exceptions caused by INT3, INTO and BOUND instructions. + * + * Return 1 if the nRIP is valid and 0 otherwise. + */ +static int +nrip_valid(uint64_t exitcode) +{ + switch (exitcode) { + case 0x00 ... 0x0F: /* read of CR0 through CR15 */ + case 0x10 ... 0x1F: /* write of CR0 through CR15 */ + case 0x20 ... 0x2F: /* read of DR0 through DR15 */ + case 0x30 ... 0x3F: /* write of DR0 through DR15 */ + case 0x43: /* INT3 */ + case 0x44: /* INTO */ + case 0x45: /* BOUND */ + case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ + case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ + return (1); + default: + return (0); + } +} + +static int +svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_ctrl *ctrl; + struct svm_regctx *ctx; + uint64_t code, info1, info2, val; + uint32_t eax, ecx, edx; +#ifdef __FreeBSD__ + int error, errcode_valid, handled, idtvec, reflect; +#else + int error, errcode_valid = 0, handled, idtvec, reflect; +#endif + bool retu; + + ctx = svm_get_guest_regctx(svm_sc, vcpu); + vmcb = svm_get_vmcb(svm_sc, vcpu); + state = &vmcb->state; + ctrl = &vmcb->ctrl; + + handled = 0; + code = ctrl->exitcode; + info1 = ctrl->exitinfo1; + info2 = ctrl->exitinfo2; + + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmexit->rip = state->rip; + vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0; + + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1); + + /* + * #VMEXIT(INVALID) needs to be handled early because the VMCB is + * in an inconsistent state and can trigger assertions that would + * never happen otherwise. + */ + if (code == VMCB_EXIT_INVALID) { + vm_exit_svm(vmexit, code, info1, info2); + return (0); + } + + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " + "injection valid bit is set %#lx", __func__, ctrl->eventinj)); + + KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, + ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)", + vmexit->inst_length, code, info1, info2)); + + svm_update_virqinfo(svm_sc, vcpu); + svm_save_intinfo(svm_sc, vcpu); + + switch (code) { + case VMCB_EXIT_IRET: + /* + * Restart execution at "iret" but with the intercept cleared. + */ + vmexit->inst_length = 0; + clear_nmi_blocking(svm_sc, vcpu); + handled = 1; + break; + case VMCB_EXIT_VINTR: /* interrupt window exiting */ + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); + handled = 1; + break; + case VMCB_EXIT_INTR: /* external interrupt */ + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1); + handled = 1; + break; + case VMCB_EXIT_NMI: /* external NMI */ + handled = 1; + break; + case 0x40 ... 0x5F: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1); + reflect = 1; + idtvec = code - 0x40; + switch (idtvec) { + case IDT_MC: + /* + * Call the machine check handler by hand. Also don't + * reflect the machine check back into the guest. 
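+			 *
+			 * (Illustrative note, not part of the original
+			 * change: the #MC belongs to the host, so it is
+			 * forwarded to the host handler, "int $18" on
+			 * FreeBSD and vmm_call_trap(T_MCE) on illumos,
+			 * while 'reflect' stays clear.)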
+ */ + reflect = 0; + VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); +#ifdef __FreeBSD__ + __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif + break; + case IDT_PF: + error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, + info2); + KASSERT(error == 0, ("%s: error %d updating cr2", + __func__, error)); + /* fallthru */ + case IDT_NP: + case IDT_SS: + case IDT_GP: + case IDT_AC: + case IDT_TS: + errcode_valid = 1; + break; + + case IDT_DF: + errcode_valid = 1; + info1 = 0; + break; + + case IDT_BP: + case IDT_OF: + case IDT_BR: + /* + * The 'nrip' field is populated for INT3, INTO and + * BOUND exceptions and this also implies that + * 'inst_length' is non-zero. + * + * Reset 'inst_length' to zero so the guest %rip at + * event injection is identical to what it was when + * the exception originally happened. + */ + VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d " + "to zero before injecting exception %d", + vmexit->inst_length, idtvec); + vmexit->inst_length = 0; + /* fallthru */ + default: + errcode_valid = 0; + info1 = 0; + break; + } + KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) " + "when reflecting exception %d into guest", + vmexit->inst_length, idtvec)); + + if (reflect) { + /* Reflect the exception back into the guest */ + VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception " + "%d/%#x into the guest", idtvec, (int)info1); + error = vm_inject_exception(svm_sc->vm, vcpu, idtvec, + errcode_valid, info1, 0); + KASSERT(error == 0, ("%s: vm_inject_exception error %d", + __func__, error)); + } + handled = 1; + break; + case VMCB_EXIT_MSR: /* MSR access. */ + eax = state->rax; + ecx = ctx->sctx_rcx; + edx = ctx->sctx_rdx; + retu = false; + + if (info1) { + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1); + val = (uint64_t)edx << 32 | eax; + VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx", + ecx, val); + if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) { + vmexit->exitcode = VM_EXITCODE_WRMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = val; + } else if (!retu) { + handled = 1; + } else { + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_wrmsr retu with bogus exitcode")); + } + } else { + VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1); + if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) { + vmexit->exitcode = VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + } else if (!retu) { + handled = 1; + } else { + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_rdmsr retu with bogus exitcode")); + } + } + break; + case VMCB_EXIT_IO: + handled = svm_handle_io(svm_sc, vcpu, vmexit); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); + break; + case VMCB_EXIT_CPUID: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); + handled = x86_emulate_cpuid(svm_sc->vm, vcpu, + (uint32_t *)&state->rax, + (uint32_t *)&ctx->sctx_rbx, + (uint32_t *)&ctx->sctx_rcx, + (uint32_t *)&ctx->sctx_rdx); + break; + case VMCB_EXIT_HLT: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); + vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->u.hlt.rflags = state->rflags; + break; + case VMCB_EXIT_PAUSE: + vmexit->exitcode = VM_EXITCODE_PAUSE; + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1); + break; + case VMCB_EXIT_NPF: + /* EXITINFO2 contains the faulting guest physical address */ + if (info1 & VMCB_NPF_INFO1_RSV) { + VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with " + "reserved bits set: info1(%#lx) info2(%#lx)", + info1, info2); + } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) { + vmexit->exitcode 
= VM_EXITCODE_PAGING; + vmexit->u.paging.gpa = info2; + vmexit->u.paging.fault_type = npf_fault_type(info1); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1); + VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault " + "on gpa %#lx/%#lx at rip %#lx", + info2, info1, state->rip); + } else if (svm_npf_emul_fault(info1)) { + svm_handle_inst_emul(vmcb, info2, vmexit); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1); + VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault " + "for gpa %#lx/%#lx at rip %#lx", + info2, info1, state->rip); + } + break; + case VMCB_EXIT_MONITOR: + vmexit->exitcode = VM_EXITCODE_MONITOR; + break; + case VMCB_EXIT_MWAIT: + vmexit->exitcode = VM_EXITCODE_MWAIT; + break; + default: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); + break; + } + + VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d", + handled ? "handled" : "unhandled", exit_reason_to_str(code), + vmexit->rip, vmexit->inst_length); + + if (handled) { + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + state->rip = vmexit->rip; + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic SVM exit. + */ + vm_exit_svm(vmexit, code, info1, info2); + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + return (handled); +} + +static void +svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu) +{ + uint64_t intinfo; + + if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) + return; + + KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " + "valid: %#lx", __func__, intinfo)); + + svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo), + VMCB_EXITINTINFO_VECTOR(intinfo), + VMCB_EXITINTINFO_EC(intinfo), + VMCB_EXITINTINFO_EC_VALID(intinfo)); + vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); + VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo); +} + +/* + * Inject event to virtual cpu. + */ +static void +svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + struct svm_vcpu *vcpustate; + uint8_t v_tpr; + int vector, need_intr_window; + int extint_pending; + + state = svm_get_vmcb_state(sc, vcpu); + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + vcpustate = svm_get_vcpu(sc, vcpu); + + need_intr_window = 0; + + vlapic_tmr_update(vlapic); + + if (vcpustate->nextrip != state->rip) { + ctrl->intr_shadow = 0; + VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vcpustate->nextrip, state->rip); + } + + /* + * Inject pending events or exceptions for this vcpu. + * + * An event might be pending because the previous #VMEXIT happened + * during event delivery (i.e. ctrl->exitintinfo). + * + * An event might also be pending because an exception was injected + * by the hypervisor (e.g. #PF during instruction emulation). + */ + svm_inj_intinfo(sc, vcpu); + + /* NMI event has priority over interrupts. */ + if (vm_nmi_pending(sc->vm, vcpu)) { + if (nmi_blocked(sc, vcpu)) { + /* + * Can't inject another NMI if the guest has not + * yet executed an "iret" after the last NMI. + */ + VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " + "to NMI-blocking"); + } else if (ctrl->intr_shadow) { + /* + * Can't inject an NMI if the vcpu is in an intr_shadow. 
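+			 *
+			 * (Illustrative note, not part of the original
+			 * change: the shadow covers the instruction
+			 * after e.g. STI or MOV SS, so injection is
+			 * deferred and an interrupt window exit is
+			 * requested to retry once the shadow lapses.)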
+ */ + VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to " + "interrupt shadow"); + need_intr_window = 1; + goto done; + } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + /* + * If there is already an exception/interrupt pending + * then defer the NMI until after that. + */ + VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " + "eventinj %#lx", ctrl->eventinj); + + /* + * Use self-IPI to trigger a VM-exit as soon as + * possible after the event injection is completed. + * + * This works only if the external interrupt exiting + * is at a lower priority than the event injection. + * + * Although not explicitly specified in APMv2 the + * relative priorities were verified empirically. + */ + ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? */ + } else { + vm_nmi_clear(sc->vm, vcpu); + + /* Inject NMI, vector number is not used */ + svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI, + IDT_NMI, 0, false); + + /* virtual NMI blocking is now in effect */ + enable_nmi_blocking(sc, vcpu); + + VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI"); + } + } + + extint_pending = vm_extint_pending(sc->vm, vcpu); + if (!extint_pending) { + if (!vlapic_pending_intr(vlapic, &vector)) + goto done; + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(sc->vm, &vector); + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } + + /* + * If the guest has disabled interrupts or is in an interrupt shadow + * then we cannot inject the pending interrupt. + */ + if ((state->rflags & PSL_I) == 0) { + VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, state->rflags); + need_intr_window = 1; + goto done; + } + + if (ctrl->intr_shadow) { + VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to " + "interrupt shadow", vector); + need_intr_window = 1; + goto done; + } + + if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " + "eventinj %#lx", vector, ctrl->eventinj); + need_intr_window = 1; + goto done; + } + + svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); + + if (!extint_pending) { + vlapic_intr_accepted(vlapic, vector); + } else { + vm_extint_clear(sc->vm, vcpu); + vatpic_intr_accepted(sc->vm, vector); + } + + /* + * Force a VM-exit as soon as the vcpu is ready to accept another + * interrupt. This is done because the PIC might have another vector + * that it wants to inject. Also, if the APIC has a pending interrupt + * that was preempted by the ExtInt then it allows us to inject the + * APIC vector as soon as possible. + */ + need_intr_window = 1; +done: + /* + * The guest can modify the TPR by writing to %CR8. In guest mode + * the processor reflects this write to V_TPR without hypervisor + * intervention. + * + * The guest can also modify the TPR by writing to it via the memory + * mapped APIC page. In this case, the write will be emulated by the + * hypervisor. For this reason V_TPR must be updated before every + * VMRUN. + */ + v_tpr = vlapic_get_cr8(vlapic); + KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); + if (ctrl->v_tpr != v_tpr) { + VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x", + ctrl->v_tpr, v_tpr); + ctrl->v_tpr = v_tpr; + svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); + } + + if (need_intr_window) { + /* + * We use V_IRQ in conjunction with the VINTR intercept to + * trap into the hypervisor as soon as a virtual interrupt + * can be delivered. 
+ * + * Since injected events are not subject to intercept checks + * we need to ensure that the V_IRQ is not actually going to + * be delivered on VM entry. The KASSERT below enforces this. + */ + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || + (state->rflags & PSL_I) == 0 || ctrl->intr_shadow, + ("Bogus intr_window_exiting: eventinj (%#lx), " + "intr_shadow (%u), rflags (%#lx)", + ctrl->eventinj, ctrl->intr_shadow, state->rflags)); + enable_intr_window_exiting(sc, vcpu); + } else { + disable_intr_window_exiting(sc, vcpu); + } +} + +static __inline void +restore_host_tss(void) +{ +#ifdef __FreeBSD__ + struct system_segment_descriptor *tss_sd; + + /* + * The TSS descriptor was in use prior to launching the guest so it + * has been marked busy. + * + * 'ltr' requires the descriptor to be marked available so change the + * type to "64-bit available TSS". + */ + tss_sd = PCPU_GET(tss); + tss_sd->sd_type = SDT_SYSTSS; + ltr(GSEL(GPROC0_SEL, SEL_KPL)); +#else + system_desc_t *tss = (system_desc_t *)&CPU->cpu_gdt[GDT_KTSS]; + + tss->ssd_type = SDT_SYSTSS; + wr_tsr(KTSS_SEL); +#endif +} + +#ifdef __FreeBSD__ +static void +check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) +{ + struct svm_vcpu *vcpustate; + struct vmcb_ctrl *ctrl; + long eptgen; + bool alloc_asid; + + KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not " + "active on cpu %u", __func__, thiscpu)); + + vcpustate = svm_get_vcpu(sc, vcpuid); + ctrl = svm_get_vmcb_ctrl(sc, vcpuid); + + /* + * The TLB entries associated with the vcpu's ASID are not valid + * if either of the following conditions is true: + * + * 1. The vcpu's ASID generation is different than the host cpu's + * ASID generation. This happens when the vcpu migrates to a new + * host cpu. It can also happen when the number of vcpus executing + * on a host cpu is greater than the number of ASIDs available. + * + * 2. The pmap generation number is different than the value cached in + * the 'vcpustate'. This happens when the host invalidates pages + * belonging to the guest. + * + * asidgen eptgen Action + * mismatch mismatch + * 0 0 (a) + * 0 1 (b1) or (b2) + * 1 0 (c) + * 1 1 (d) + * + * (a) There is no mismatch in eptgen or ASID generation and therefore + * no further action is needed. + * + * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is + * retained and the TLB entries associated with this ASID + * are flushed by VMRUN. + * + * (b2) If the cpu does not support FlushByAsid then a new ASID is + * allocated. + * + * (c) A new ASID is allocated. + * + * (d) A new ASID is allocated. + */ + + alloc_asid = false; + eptgen = pmap->pm_eptgen; + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING; + + if (vcpustate->asid.gen != asid[thiscpu].gen) { + alloc_asid = true; /* (c) and (d) */ + } else if (vcpustate->eptgen != eptgen) { + if (flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */ + else + alloc_asid = true; /* (b2) */ + } else { + /* + * This is the common case (a). + */ + KASSERT(!alloc_asid, ("ASID allocation not necessary")); + KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING, + ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl)); + } + + if (alloc_asid) { + if (++asid[thiscpu].num >= nasid) { + asid[thiscpu].num = 1; + if (++asid[thiscpu].gen == 0) + asid[thiscpu].gen = 1; + /* + * If this cpu does not support "flush-by-asid" + * then flush the entire TLB on a generation + * bump. Subsequent ASID allocation in this + * generation can be done without a TLB flush. 
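+			 *
+			 * Worked example (illustrative, not part of the
+			 * original change): with nasid = 4, allocations
+			 * proceed {gen,num} = {1,1}, {1,2}, {1,3}; the
+			 * next request wraps num back to 1, bumps gen,
+			 * and, absent flush-by-asid, performs the full
+			 * TLB flush here.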
+ */ + if (!flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL; + } + vcpustate->asid.gen = asid[thiscpu].gen; + vcpustate->asid.num = asid[thiscpu].num; + + ctrl->asid = vcpustate->asid.num; + svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); + /* + * If this cpu supports "flush-by-asid" then the TLB + * was not flushed after the generation bump. The TLB + * is flushed selectively after every new ASID allocation. + */ + if (flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; + } + vcpustate->eptgen = eptgen; + + KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero")); + KASSERT(ctrl->asid == vcpustate->asid.num, + ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num)); +} +#else /* __FreeBSD__ */ +static void +check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) +{ + struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid); + struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid); + long eptgen; + uint8_t flush; + + eptgen = pmap->pm_eptgen; + flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(), + vcpustate->eptgen == eptgen); + + if (flush != VMCB_TLB_FLUSH_NOTHING) { + ctrl->asid = vcpustate->hma_asid.hsa_asid; + svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); + } + ctrl->tlb_ctrl = flush; + vcpustate->eptgen = eptgen; +} +#endif /* __FreeBSD__ */ + +static __inline void +disable_gintr(void) +{ + + __asm __volatile("clgi"); +} + +static __inline void +enable_gintr(void) +{ + + __asm __volatile("stgi"); +} + +static __inline void +svm_dr_enter_guest(struct svm_regctx *gctx) +{ + + /* Save host control debug registers. */ + gctx->host_dr7 = rdr7(); + gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); + + /* + * Disable debugging in DR7 and DEBUGCTL to avoid triggering + * exceptions in the host based on the guest DRx values. The + * guest DR6, DR7, and DEBUGCTL are saved/restored in the + * VMCB. + */ + load_dr7(0); + wrmsr(MSR_DEBUGCTLMSR, 0); + + /* Save host debug registers. */ + gctx->host_dr0 = rdr0(); + gctx->host_dr1 = rdr1(); + gctx->host_dr2 = rdr2(); + gctx->host_dr3 = rdr3(); + gctx->host_dr6 = rdr6(); + + /* Restore guest debug registers. */ + load_dr0(gctx->sctx_dr0); + load_dr1(gctx->sctx_dr1); + load_dr2(gctx->sctx_dr2); + load_dr3(gctx->sctx_dr3); +} + +static __inline void +svm_dr_leave_guest(struct svm_regctx *gctx) +{ + + /* Save guest debug registers. */ + gctx->sctx_dr0 = rdr0(); + gctx->sctx_dr1 = rdr1(); + gctx->sctx_dr2 = rdr2(); + gctx->sctx_dr3 = rdr3(); + + /* + * Restore host debug registers. Restore DR7 and DEBUGCTL + * last. + */ + load_dr0(gctx->host_dr0); + load_dr1(gctx->host_dr1); + load_dr2(gctx->host_dr2); + load_dr3(gctx->host_dr3); + load_dr6(gctx->host_dr6); + wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl); + load_dr7(gctx->host_dr7); +} + +/* + * Start vcpu with specified RIP. 
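+ *
+ * Shape of the loop below (illustrative, not part of the original
+ * change): inject pending events, disable global interrupts, bail out
+ * on pending suspend/runblock/reqidle/yield/debug requests, refresh
+ * the ASID, svm_launch() the guest, then classify the #VMEXIT; the
+ * loop runs for as long as each exit can be handled in the kernel.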
+ */ +static int +svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap, + struct vm_eventinfo *evinfo) +{ + struct svm_regctx *gctx; + struct svm_softc *svm_sc; + struct svm_vcpu *vcpustate; + struct vmcb_state *state; + struct vmcb_ctrl *ctrl; + struct vm_exit *vmexit; + struct vlapic *vlapic; + struct vm *vm; + uint64_t vmcb_pa; + int handled; + uint16_t ldt_sel; + + svm_sc = arg; + vm = svm_sc->vm; + + vcpustate = svm_get_vcpu(svm_sc, vcpu); + state = svm_get_vmcb_state(svm_sc, vcpu); + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + vmexit = vm_exitinfo(vm, vcpu); + vlapic = vm_lapic(vm, vcpu); + + gctx = svm_get_guest_regctx(svm_sc, vcpu); + vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa; + + if (vcpustate->lastcpu != curcpu) { + /* + * Force new ASID allocation by invalidating the generation. + */ +#ifdef __FreeBSD__ + vcpustate->asid.gen = 0; +#else + vcpustate->hma_asid.hsa_gen = 0; +#endif + + /* + * Invalidate the VMCB state cache by marking all fields dirty. + */ + svm_set_dirty(svm_sc, vcpu, 0xffffffff); + + /* + * XXX + * Setting 'vcpustate->lastcpu' here is bit premature because + * we may return from this function without actually executing + * the VMRUN instruction. This could happen if an AST or yield + * condition is pending on the first time through the loop. + * + * This works for now but any new side-effects of vcpu + * migration should take this case into account. + */ + vcpustate->lastcpu = curcpu; + vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1); + } + + svm_msr_guest_enter(svm_sc, vcpu); + +#ifndef __FreeBSD__ + VERIFY(!vcpustate->loaded && curthread->t_preempt != 0); + vcpustate->loaded = B_TRUE; +#endif + + /* Update Guest RIP */ + state->rip = rip; + + do { +#ifndef __FreeBSD__ + /* + * Interrupt injection may involve mutex contention which, on + * illumos bhyve, are blocking/non-spin. Doing so with global + * interrupts disabled is a recipe for deadlock, so it is + * performed here. + */ + svm_inj_interrupts(svm_sc, vcpu, vlapic); +#endif + + /* + * Disable global interrupts to guarantee atomicity during + * loading of guest state. This includes not only the state + * loaded by the "vmrun" instruction but also software state + * maintained by the hypervisor: suspended and rendezvous + * state, NPT generation number, vlapic interrupts etc. + */ + disable_gintr(); + + if (vcpu_suspended(evinfo)) { + enable_gintr(); + vm_exit_suspended(vm, vcpu, state->rip); + break; + } + + if (vcpu_runblocked(evinfo)) { + enable_gintr(); + vm_exit_runblock(vm, vcpu, state->rip); + break; + } + + if (vcpu_reqidle(evinfo)) { + enable_gintr(); + vm_exit_reqidle(vm, vcpu, state->rip); + break; + } + + /* We are asked to give the cpu by scheduler. */ + if (vcpu_should_yield(vm, vcpu)) { + enable_gintr(); + vm_exit_astpending(vm, vcpu, state->rip); + break; + } + + if (vcpu_debugged(vm, vcpu)) { + enable_gintr(); + vm_exit_debug(vm, vcpu, state->rip); + break; + } + + /* + * #VMEXIT resumes the host with the guest LDTR, so + * save the current LDT selector so it can be restored + * after an exit. The userspace hypervisor probably + * doesn't use a LDT, but save and restore it to be + * safe. + */ + ldt_sel = sldt(); + +#ifdef __FreeBSD__ + svm_inj_interrupts(svm_sc, vcpu, vlapic); +#endif + + /* Activate the nested pmap on 'curcpu' */ + CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active); + + /* + * Check the pmap generation and the ASID generation to + * ensure that the vcpu does not use stale TLB mappings. 
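+		 *
+		 * (Illustrative note, not part of the original change:
+		 * this runs with global interrupts disabled so the
+		 * ASID/eptgen snapshot cannot go stale between the
+		 * check and the VMRUN itself.)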
+ */ + check_asid(svm_sc, vcpu, pmap, curcpu); + + ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; + vcpustate->dirty = 0; + VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean); + + /* Launch Virtual Machine. */ + VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip); + svm_dr_enter_guest(gctx); +#ifdef __FreeBSD__ + svm_launch(vmcb_pa, gctx, &__pcpu[curcpu]); +#else + svm_launch(vmcb_pa, gctx, CPU); +#endif + svm_dr_leave_guest(gctx); + + CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); + + /* + * The host GDTR and IDTR is saved by VMRUN and restored + * automatically on #VMEXIT. However, the host TSS needs + * to be restored explicitly. + */ + restore_host_tss(); + + /* Restore host LDTR. */ + lldt(ldt_sel); + + /* #VMEXIT disables interrupts so re-enable them here. */ + enable_gintr(); + + /* Update 'nextrip' */ + vcpustate->nextrip = state->rip; + + /* Handle #VMEXIT and if required return to user space. */ + handled = svm_vmexit(svm_sc, vcpu, vmexit); + } while (handled); + + svm_msr_guest_exit(svm_sc, vcpu); + +#ifndef __FreeBSD__ + VERIFY(vcpustate->loaded && curthread->t_preempt != 0); + vcpustate->loaded = B_FALSE; +#endif + + return (0); +} + +static void +svm_vmcleanup(void *arg) +{ + struct svm_softc *sc = arg; + + contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM); + contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM); + free(sc, M_SVM); +} + +static register_t * +swctx_regptr(struct svm_regctx *regctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RBX: + return (®ctx->sctx_rbx); + case VM_REG_GUEST_RCX: + return (®ctx->sctx_rcx); + case VM_REG_GUEST_RDX: + return (®ctx->sctx_rdx); + case VM_REG_GUEST_RDI: + return (®ctx->sctx_rdi); + case VM_REG_GUEST_RSI: + return (®ctx->sctx_rsi); + case VM_REG_GUEST_RBP: + return (®ctx->sctx_rbp); + case VM_REG_GUEST_R8: + return (®ctx->sctx_r8); + case VM_REG_GUEST_R9: + return (®ctx->sctx_r9); + case VM_REG_GUEST_R10: + return (®ctx->sctx_r10); + case VM_REG_GUEST_R11: + return (®ctx->sctx_r11); + case VM_REG_GUEST_R12: + return (®ctx->sctx_r12); + case VM_REG_GUEST_R13: + return (®ctx->sctx_r13); + case VM_REG_GUEST_R14: + return (®ctx->sctx_r14); + case VM_REG_GUEST_R15: + return (®ctx->sctx_r15); + case VM_REG_GUEST_DR0: + return (®ctx->sctx_dr0); + case VM_REG_GUEST_DR1: + return (®ctx->sctx_dr1); + case VM_REG_GUEST_DR2: + return (®ctx->sctx_dr2); + case VM_REG_GUEST_DR3: + return (®ctx->sctx_dr3); + default: + return (NULL); + } +} + +static int +svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) +{ + struct svm_softc *svm_sc; + register_t *reg; + + svm_sc = arg; + + if (ident == VM_REG_GUEST_INTR_SHADOW) { + return (svm_get_intr_shadow(svm_sc, vcpu, val)); + } + + if (vmcb_read(svm_sc, vcpu, ident, val) == 0) { + return (0); + } + + reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); + + if (reg != NULL) { + *val = *reg; + return (0); + } + + VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident); + return (EINVAL); +} + +static int +svm_setreg(void *arg, int vcpu, int ident, uint64_t val) +{ + struct svm_softc *svm_sc; + register_t *reg; + + svm_sc = arg; + + if (ident == VM_REG_GUEST_INTR_SHADOW) { + return (svm_modify_intr_shadow(svm_sc, vcpu, val)); + } + + if (vmcb_write(svm_sc, vcpu, ident, val) == 0) { + return (0); + } + + reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); + + if (reg != NULL) { + *reg = val; + return (0); + } + + /* + * XXX deal with CR3 and invalidate TLB entries tagged with the + * vcpu's ASID. 
This needs to be treated differently depending on + * whether 'running' is true/false. + */ + + VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident); + return (EINVAL); +} + +static int +svm_setcap(void *arg, int vcpu, int type, int val) +{ + struct svm_softc *sc; + int error; + + sc = arg; + error = 0; + switch (type) { + case VM_CAP_HALT_EXIT: + svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_HLT, val); + break; + case VM_CAP_PAUSE_EXIT: + svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_PAUSE, val); + break; + case VM_CAP_UNRESTRICTED_GUEST: + /* Unrestricted guest execution cannot be disabled in SVM */ + if (val == 0) + error = EINVAL; + break; + default: + error = ENOENT; + break; + } + return (error); +} + +static int +svm_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct svm_softc *sc; + int error; + + sc = arg; + error = 0; + + switch (type) { + case VM_CAP_HALT_EXIT: + *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_HLT); + break; + case VM_CAP_PAUSE_EXIT: + *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_PAUSE); + break; + case VM_CAP_UNRESTRICTED_GUEST: + *retval = 1; /* unrestricted guest is always enabled */ + break; + default: + error = ENOENT; + break; + } + return (error); +} + +static struct vlapic * +svm_vlapic_init(void *arg, int vcpuid) +{ + struct svm_softc *svm_sc; + struct vlapic *vlapic; + + svm_sc = arg; + vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = svm_sc->vm; + vlapic->vcpuid = vcpuid; + vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) +{ + + vlapic_cleanup(vlapic); + free(vlapic, M_SVM_VLAPIC); +} + +#ifndef __FreeBSD__ +static void +svm_savectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_exit(sc, vcpu); + } +} + +static void +svm_restorectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_enter(sc, vcpu); + } +} +#endif /* __FreeBSD__ */ + +struct vmm_ops vmm_ops_amd = { + svm_init, + svm_cleanup, + svm_restore, + svm_vminit, + svm_vmrun, + svm_vmcleanup, + svm_getreg, + svm_setreg, + vmcb_getdesc, + vmcb_setdesc, + svm_getcap, + svm_setcap, + svm_npt_alloc, + svm_npt_free, + svm_vlapic_init, + svm_vlapic_cleanup, + +#ifndef __FreeBSD__ + svm_savectx, + svm_restorectx, +#endif +}; diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.h b/usr/src/uts/i86pc/io/vmm/amd/svm.h new file mode 100644 index 0000000000..c78f7eb067 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.h @@ -0,0 +1,74 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
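Stepping back to the svm_setcap()/svm_getcap() handlers above: each capability is a single bit in the VMCB's intercept vector, so setting a capability reduces to setting or clearing that bit (the real svm_set_intercept(), not shown in this hunk, additionally marks VMCB_CACHE_I dirty). A standalone model of the bit bookkeeping; the helper below is a simplified stand-in, not the real function:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdint.h>

    #define VMCB_INTCPT_PAUSE   (1UL << 23)     /* mirrors vmcb.h */
    #define VMCB_INTCPT_HLT     (1UL << 24)

    /* Simplified stand-in for svm_set_intercept(); no dirty-bit handling. */
    static void
    set_intercept(uint32_t *vec, uint32_t bitmask, bool enabled)
    {
        if (enabled)
            *vec |= bitmask;
        else
            *vec &= ~bitmask;
    }

    int
    main(void)
    {
        uint32_t ctrl1_intcpt = 0;

        /* VM_CAP_HALT_EXIT, val = 1 */
        set_intercept(&ctrl1_intcpt, VMCB_INTCPT_HLT, true);
        assert(ctrl1_intcpt & VMCB_INTCPT_HLT);

        /* VM_CAP_PAUSE_EXIT, val = 0 */
        set_intercept(&ctrl1_intcpt, VMCB_INTCPT_PAUSE, false);
        assert((ctrl1_intcpt & VMCB_INTCPT_PAUSE) == 0);
        return (0);
    }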
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_H_ +#define _SVM_H_ + +/* + * Guest register state that is saved outside the VMCB. + */ +struct svm_regctx { + register_t sctx_rbp; + register_t sctx_rbx; + register_t sctx_rcx; + register_t sctx_rdx; + register_t sctx_rdi; + register_t sctx_rsi; + register_t sctx_r8; + register_t sctx_r9; + register_t sctx_r10; + register_t sctx_r11; + register_t sctx_r12; + register_t sctx_r13; + register_t sctx_r14; + register_t sctx_r15; + register_t sctx_dr0; + register_t sctx_dr1; + register_t sctx_dr2; + register_t sctx_dr3; + + register_t host_dr0; + register_t host_dr1; + register_t host_dr2; + register_t host_dr3; + register_t host_dr6; + register_t host_dr7; + uint64_t host_debugctl; +}; + +#ifdef __FreeBSD__ +struct pcpu; +void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct pcpu *pcpu); +#else +struct cpu; +void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct cpu *pcpu); +#endif + +#endif /* _SVM_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c new file mode 100644 index 0000000000..0c1ce0e4e0 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c @@ -0,0 +1,199 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
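Returning to svm.h's svm_regctx for a moment: the host_dr0..host_dr7 and host_debugctl fields at its tail exist for the svm_dr_enter_guest()/svm_dr_leave_guest() calls made around svm_launch() in svm_vmrun(); their bodies are not part of this hunk. A minimal sketch of the pairing those fields imply, assuming FreeBSD-style rdr0()/load_dr0() accessors from machine/cpufunc.h and MSR_DEBUGCTLMSR from machine/specialreg.h:

    /* Sketch only: stash host debug state, install the guest's. */
    static void
    dr_enter_guest(struct svm_regctx *gctx)
    {
        gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR);
        wrmsr(MSR_DEBUGCTLMSR, 0);      /* quiesce branch tracing */

        gctx->host_dr0 = rdr0();
        load_dr0(gctx->sctx_dr0);
        /* ... %dr1-%dr3 (and %dr6/%dr7) follow the same save/load pairing */
    }

    /* Sketch only: capture guest debug registers, restore the host's. */
    static void
    dr_leave_guest(struct svm_regctx *gctx)
    {
        gctx->sctx_dr0 = rdr0();
        load_dr0(gctx->host_dr0);
        /* ... remaining registers likewise ... */

        wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl);
    }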
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/systm.h> + +#include <machine/cpufunc.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> + +#include "svm.h" +#include "vmcb.h" +#include "svm_softc.h" +#include "svm_msr.h" + +#ifndef MSR_AMDK8_IPM +#define MSR_AMDK8_IPM 0xc0010055 +#endif + +enum { + IDX_MSR_LSTAR, + IDX_MSR_CSTAR, + IDX_MSR_STAR, + IDX_MSR_SF_MASK, + HOST_MSR_NUM /* must be the last enumeration */ +}; + +#ifdef __FreeBSD__ +static uint64_t host_msrs[HOST_MSR_NUM]; + +void +svm_msr_init(void) +{ + /* + * It is safe to cache the values of the following MSRs because they + * don't change based on curcpu, curproc or curthread. + */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +} +#else + +CTASSERT(HOST_MSR_NUM == SVM_HOST_MSR_NUM); + +void +svm_msr_init(void) +{ + /* + * These MSRs do vary between CPUs on illumos, so saving system-wide + * values for them serves no purpose. + */ +} +#endif /* __FreeBSD__ */ + +void +svm_msr_guest_init(struct svm_softc *sc, int vcpu) +{ + /* + * All the MSRs accessible to the guest are either saved/restored by + * hardware on every #VMEXIT/VMRUN (e.g., G_PAT) or are saved/restored + * by VMSAVE/VMLOAD (e.g., MSR_GSBASE). + * + * There are no guest MSRs that are saved/restored "by hand" so nothing + * more to do here. + */ + return; +} + +void +svm_msr_guest_enter(struct svm_softc *sc, int vcpu) +{ + /* + * Save host MSRs (if any) and restore guest MSRs (if any). + */ +#ifndef __FreeBSD__ + uint64_t *host_msrs = sc->host_msrs[vcpu]; + + /* Save host MSRs */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +#endif /* __FreeBSD__ */ +} + +void +svm_msr_guest_exit(struct svm_softc *sc, int vcpu) +{ +#ifndef __FreeBSD__ + uint64_t *host_msrs = sc->host_msrs[vcpu]; +#endif + /* + * Save guest MSRs (if any) and restore host MSRs. + */ + wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); + + /* MSR_KGSBASE will be restored on the way back to userspace */ +} + +int +svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, + bool *retu) +{ + int error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + *result = 0; + break; + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_SYSCFG: + case MSR_AMDK8_IPM: + case MSR_EXTFEATURES: + *result = 0; + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) +{ + int error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + break; /* ignore writes */ + case MSR_MTRRcap: + vm_inject_gp(sc->vm, vcpu); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_SYSCFG: + break; /* Ignore writes */ + case MSR_AMDK8_IPM: + /* + * Ignore writes to the "Interrupt Pending Message" MSR. + */ + break; + case MSR_K8_UCODE_UPDATE: + /* + * Ignore writes to microcode update register.
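On illumos the four syscall-entry MSRs above vary per CPU, so svm_msr_guest_enter() snapshots them into the per-vCPU host_msrs array and svm_msr_guest_exit() writes them back. Extending the saved set means touching the enum, enter, and exit together, and bumping SVM_HOST_MSR_NUM in svm_softc.h so the CTASSERT still holds. A purely hypothetical fifth entry (MSR_EXAMPLE is illustrative, not a real MSR) would look like:

    /* Hypothetical: adding a fifth slot to the per-vCPU saved set. */
    enum {
        IDX_MSR_LSTAR,
        IDX_MSR_CSTAR,
        IDX_MSR_STAR,
        IDX_MSR_SF_MASK,
        IDX_MSR_EXAMPLE,        /* new entry */
        HOST_MSR_NUM            /* must remain last */
    };

    /* in svm_msr_guest_enter(): */
    host_msrs[IDX_MSR_EXAMPLE] = rdmsr(MSR_EXAMPLE);

    /* in svm_msr_guest_exit(): */
    wrmsr(MSR_EXAMPLE, host_msrs[IDX_MSR_EXAMPLE]);

    /* and SVM_HOST_MSR_NUM in svm_softc.h is bumped to 5 to match. */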
+ */ + break; + case MSR_EXTFEATURES: + break; + default: + error = EINVAL; + break; + } + + return (error); +} diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h new file mode 100644 index 0000000000..1dba8101ab --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h @@ -0,0 +1,46 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_MSR_H_ +#define _SVM_MSR_H_ + +struct svm_softc; + +void svm_msr_init(void); +void svm_msr_guest_init(struct svm_softc *sc, int vcpu); +void svm_msr_guest_enter(struct svm_softc *sc, int vcpu); +void svm_msr_guest_exit(struct svm_softc *sc, int vcpu); + +int svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, + bool *retu); +int svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, + bool *retu); + +#endif /* _SVM_MSR_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h new file mode 100644 index 0000000000..b5ac1903e7 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h @@ -0,0 +1,131 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
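The svm_rdmsr()/svm_wrmsr() prototypes just above encode the contract with the #VMEXIT handler (which is not part of this hunk): EINVAL means "unrecognized MSR", and the dispatcher is expected to turn that into an injected #GP, the same way svm_wrmsr() itself reacts to an MTRRcap write. A sketch of such a dispatcher, under that assumption:

    /* Sketch: how an MSR-intercept exit might consume svm_wrmsr(). */
    static int
    handle_wrmsr_exit(struct svm_softc *sc, int vcpu, u_int num, uint64_t val)
    {
        bool retu = false;

        if (svm_wrmsr(sc, vcpu, num, val, &retu) == EINVAL) {
            vm_inject_gp(sc->vm, vcpu);     /* unknown MSR => #GP */
            return (0);
        }
        return (retu ? -1 : 0);     /* -1: defer to userspace (illustrative) */
    }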
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_SOFTC_H_ +#define _SVM_SOFTC_H_ + +#define SVM_IO_BITMAP_SIZE (3 * PAGE_SIZE) +#define SVM_MSR_BITMAP_SIZE (2 * PAGE_SIZE) + +#ifdef __FreeBSD__ +struct asid { + uint64_t gen; /* range is [1, ~0UL] */ + uint32_t num; /* range is [1, nasid - 1] */ +}; +#else +#include + +/* This must match HOST_MSR_NUM in svm_msr.c (where it is CTASSERTed) */ +#define SVM_HOST_MSR_NUM 4 +#endif /* __FreeBSD__ */ + +/* + * XXX separate out 'struct vmcb' from 'svm_vcpu' to avoid wasting space + * due to VMCB alignment requirements. + */ +struct svm_vcpu { + struct vmcb vmcb; /* hardware saved vcpu context */ + struct svm_regctx swctx; /* software saved vcpu context */ + uint64_t vmcb_pa; /* VMCB physical address */ + uint64_t nextrip; /* next instruction to be executed by guest */ + int lastcpu; /* host cpu that the vcpu last ran on */ + uint32_t dirty; /* state cache bits that must be cleared */ + long eptgen; /* pmap->pm_eptgen when the vcpu last ran */ +#ifdef __FreeBSD__ + struct asid asid; +#else + hma_svm_asid_t hma_asid; + boolean_t loaded; +#endif +} __aligned(PAGE_SIZE); + +/* + * SVM softc, one per virtual machine. + */ +struct svm_softc { + uint8_t apic_page[VM_MAXCPU][PAGE_SIZE]; + struct svm_vcpu vcpu[VM_MAXCPU]; + vm_offset_t nptp; /* nested page table */ + uint8_t *iopm_bitmap; /* shared by all vcpus */ + uint8_t *msr_bitmap; /* shared by all vcpus */ + struct vm *vm; +#ifndef __FreeBSD__ + uint64_t host_msrs[VM_MAXCPU][SVM_HOST_MSR_NUM]; +#endif +}; + +CTASSERT((offsetof(struct svm_softc, nptp) & PAGE_MASK) == 0); + +static __inline struct svm_vcpu * +svm_get_vcpu(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu])); +} + +static __inline struct vmcb * +svm_get_vmcb(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb)); +} + +static __inline struct vmcb_state * +svm_get_vmcb_state(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb.state)); +} + +static __inline struct vmcb_ctrl * +svm_get_vmcb_ctrl(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb.ctrl)); +} + +static __inline struct svm_regctx * +svm_get_guest_regctx(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].swctx)); +} + +static __inline void +svm_set_dirty(struct svm_softc *sc, int vcpu, uint32_t dirtybits) +{ + struct svm_vcpu *vcpustate; + + vcpustate = svm_get_vcpu(sc, vcpu); + + vcpustate->dirty |= dirtybits; +} + +#endif /* _SVM_SOFTC_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s new file mode 100644 index 0000000000..fad994b09c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s @@ -0,0 +1,164 @@ +/*- + * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include + +#include "svm_assym.h" + +/* Porting note: This is named 'svm_support.S' upstream. */ + +#if defined(lint) + +struct svm_regctx; +struct cpu; + +/*ARGSUSED*/ +void +svm_launch(uint64_t pa, struct svm_regctx *gctx, struct cpu *cpu) +{} + +#else /* lint */ + +#define VMLOAD .byte 0x0f, 0x01, 0xda +#define VMRUN .byte 0x0f, 0x01, 0xd8 +#define VMSAVE .byte 0x0f, 0x01, 0xdb + + +/* + * Flush scratch registers to avoid lingering guest state being used for + * Spectre v1 attacks when returning from guest entry. + */ +#define SVM_GUEST_FLUSH_SCRATCH \ + xorl %edi, %edi; \ + xorl %esi, %esi; \ + xorl %edx, %edx; \ + xorl %ecx, %ecx; \ + xorl %r8d, %r8d; \ + xorl %r9d, %r9d; \ + xorl %r10d, %r10d; \ + xorl %r11d, %r11d; + +/* Stack layout (offset from %rsp) for svm_launch */ +#define SVMSTK_R15 0x00 /* callee saved %r15 */ +#define SVMSTK_R14 0x08 /* callee saved %r14 */ +#define SVMSTK_R13 0x10 /* callee saved %r13 */ +#define SVMSTK_R12 0x18 /* callee saved %r12 */ +#define SVMSTK_RBX 0x20 /* callee saved %rbx */ +#define SVMSTK_RDX 0x28 /* save-args %rdx (struct cpu *) */ +#define SVMSTK_RSI 0x30 /* save-args %rsi (struct svm_regctx *) */ +#define SVMSTK_RDI 0x38 /* save-args %rdi (uint64_t vmcb_pa) */ +#define SVMSTK_FP 0x40 /* frame pointer %rbp */ +#define SVMSTKSIZE SVMSTK_FP + +/* + * svm_launch(uint64_t vmcb, struct svm_regctx *gctx, struct pcpu *pcpu) + * %rdi: physical address of VMCB + * %rsi: pointer to guest context + * %rdx: pointer to the pcpu data + */ +ENTRY_NP(svm_launch) + pushq %rbp + movq %rsp, %rbp + subq $SVMSTKSIZE, %rsp + movq %r15, SVMSTK_R15(%rsp) + movq %r14, SVMSTK_R14(%rsp) + movq %r13, SVMSTK_R13(%rsp) + movq %r12, SVMSTK_R12(%rsp) + movq %rbx, SVMSTK_RBX(%rsp) + movq %rdx, SVMSTK_RDX(%rsp) + movq %rsi, SVMSTK_RSI(%rsp) + movq %rdi, SVMSTK_RDI(%rsp) + + /* VMLOAD and VMRUN expect the VMCB physaddr in %rax */ + movq %rdi, %rax + + /* Restore guest state. 
*/ + movq SCTX_R8(%rsi), %r8 + movq SCTX_R9(%rsi), %r9 + movq SCTX_R10(%rsi), %r10 + movq SCTX_R11(%rsi), %r11 + movq SCTX_R12(%rsi), %r12 + movq SCTX_R13(%rsi), %r13 + movq SCTX_R14(%rsi), %r14 + movq SCTX_R15(%rsi), %r15 + movq SCTX_RBP(%rsi), %rbp + movq SCTX_RBX(%rsi), %rbx + movq SCTX_RCX(%rsi), %rcx + movq SCTX_RDX(%rsi), %rdx + movq SCTX_RDI(%rsi), %rdi + movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */ + + VMLOAD + VMRUN + VMSAVE + + /* Grab the svm_regctx pointer */ + movq SVMSTK_RSI(%rsp), %rax + + /* Save guest state. */ + movq %r8, SCTX_R8(%rax) + movq %r9, SCTX_R9(%rax) + movq %r10, SCTX_R10(%rax) + movq %r11, SCTX_R11(%rax) + movq %r12, SCTX_R12(%rax) + movq %r13, SCTX_R13(%rax) + movq %r14, SCTX_R14(%rax) + movq %r15, SCTX_R15(%rax) + movq %rbp, SCTX_RBP(%rax) + movq %rbx, SCTX_RBX(%rax) + movq %rcx, SCTX_RCX(%rax) + movq %rdx, SCTX_RDX(%rax) + movq %rdi, SCTX_RDI(%rax) + movq %rsi, SCTX_RSI(%rax) + + /* Restore callee-saved registers */ + movq SVMSTK_R15(%rsp), %r15 + movq SVMSTK_R14(%rsp), %r14 + movq SVMSTK_R13(%rsp), %r13 + movq SVMSTK_R12(%rsp), %r12 + movq SVMSTK_RBX(%rsp), %rbx + + /* Fix %gsbase to point back to the correct 'struct cpu *' */ + movq SVMSTK_RDX(%rsp), %rdx + movl %edx, %eax + shrq $32, %rdx + movl $MSR_GSBASE, %ecx + wrmsr + + SVM_GUEST_FLUSH_SCRATCH + + addq $SVMSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(svm_launch) + +#endif /* lint */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/vmcb.c b/usr/src/uts/i86pc/io/vmm/amd/vmcb.c new file mode 100644 index 0000000000..5075b69867 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/vmcb.c @@ -0,0 +1,454 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include + +#include "vmm_ktr.h" + +#include "vmcb.h" +#include "svm.h" +#include "svm_softc.h" + +/* + * The VMCB aka Virtual Machine Control Block is a 4KB aligned page + * in memory that describes the virtual machine. + * + * The VMCB contains: + * - instructions or events in the guest to intercept + * - control bits that modify execution environment of the guest + * - guest processor state (e.g. 
general purpose registers) + */ + +/* + * Return VMCB segment area. + */ +static struct vmcb_segment * +vmcb_segptr(struct vmcb *vmcb, int type) +{ + struct vmcb_state *state; + struct vmcb_segment *seg; + + state = &vmcb->state; + + switch (type) { + case VM_REG_GUEST_CS: + seg = &state->cs; + break; + + case VM_REG_GUEST_DS: + seg = &state->ds; + break; + + case VM_REG_GUEST_ES: + seg = &state->es; + break; + + case VM_REG_GUEST_FS: + seg = &state->fs; + break; + + case VM_REG_GUEST_GS: + seg = &state->gs; + break; + + case VM_REG_GUEST_SS: + seg = &state->ss; + break; + + case VM_REG_GUEST_GDTR: + seg = &state->gdt; + break; + + case VM_REG_GUEST_IDTR: + seg = &state->idt; + break; + + case VM_REG_GUEST_LDTR: + seg = &state->ldt; + break; + + case VM_REG_GUEST_TR: + seg = &state->tr; + break; + + default: + seg = NULL; + break; + } + + return (seg); +} + +static int +vmcb_access(struct svm_softc *softc, int vcpu, int write, int ident, + uint64_t *val) +{ + struct vmcb *vmcb; + int off, bytes; + char *ptr; + + vmcb = svm_get_vmcb(softc, vcpu); + off = VMCB_ACCESS_OFFSET(ident); + bytes = VMCB_ACCESS_BYTES(ident); + + if ((off + bytes) >= sizeof (struct vmcb)) + return (EINVAL); + + ptr = (char *)vmcb; + + if (!write) + *val = 0; + + switch (bytes) { + case 8: + case 4: + case 2: + if (write) + memcpy(ptr + off, val, bytes); + else + memcpy(val, ptr + off, bytes); + break; + default: + VCPU_CTR1(softc->vm, vcpu, + "Invalid size %d for VMCB access", bytes); + return (EINVAL); + } + + /* Invalidate all VMCB state cached by h/w. */ + if (write) + svm_set_dirty(softc, vcpu, 0xffffffff); + + return (0); +} + +/* + * Read from segment selector, control and general purpose register of VMCB. + */ +int +vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_segment *seg; + int err; + + vmcb = svm_get_vmcb(sc, vcpu); + state = &vmcb->state; + err = 0; + + if (VMCB_ACCESS_OK(ident)) + return (vmcb_access(sc, vcpu, 0, ident, retval)); + + switch (ident) { + case VM_REG_GUEST_CR0: + *retval = state->cr0; + break; + + case VM_REG_GUEST_CR2: + *retval = state->cr2; + break; + + case VM_REG_GUEST_CR3: + *retval = state->cr3; + break; + + case VM_REG_GUEST_CR4: + *retval = state->cr4; + break; + + case VM_REG_GUEST_DR6: + *retval = state->dr6; + break; + + case VM_REG_GUEST_DR7: + *retval = state->dr7; + break; + + case VM_REG_GUEST_EFER: + *retval = state->efer; + break; + + case VM_REG_GUEST_RAX: + *retval = state->rax; + break; + + case VM_REG_GUEST_RFLAGS: + *retval = state->rflags; + break; + + case VM_REG_GUEST_RIP: + *retval = state->rip; + break; + + case VM_REG_GUEST_RSP: + *retval = state->rsp; + break; + + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_LDTR: + case VM_REG_GUEST_TR: + seg = vmcb_segptr(vmcb, ident); + KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", + __func__, ident)); + *retval = seg->selector; + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + /* GDTR and IDTR don't have segment selectors */ + err = EINVAL; + break; + default: + err = EINVAL; + break; + } + + return (err); +} + +/* + * Write to segment selector, control and general purpose register of VMCB.
+ */ +int +vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_segment *seg; + int err, dirtyseg; + + vmcb = svm_get_vmcb(sc, vcpu); + state = &vmcb->state; + dirtyseg = 0; + err = 0; + + if (VMCB_ACCESS_OK(ident)) + return (vmcb_access(sc, vcpu, 1, ident, &val)); + + switch (ident) { + case VM_REG_GUEST_CR0: + state->cr0 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_CR2: + state->cr2 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR2); + break; + + case VM_REG_GUEST_CR3: + state->cr3 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_CR4: + state->cr4 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_DR6: + state->dr6 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); + break; + + case VM_REG_GUEST_DR7: + state->dr7 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); + break; + + case VM_REG_GUEST_EFER: + /* EFER_SVM must always be set when the guest is executing */ + state->efer = val | EFER_SVM; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_RAX: + state->rax = val; + break; + + case VM_REG_GUEST_RFLAGS: + state->rflags = val; + break; + + case VM_REG_GUEST_RIP: + state->rip = val; + break; + + case VM_REG_GUEST_RSP: + state->rsp = val; + break; + + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_SS: + dirtyseg = 1; /* FALLTHROUGH */ + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_LDTR: + case VM_REG_GUEST_TR: + seg = vmcb_segptr(vmcb, ident); + KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", + __func__, ident)); + seg->selector = val; + if (dirtyseg) + svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + /* GDTR and IDTR don't have segment selectors */ + err = EINVAL; + break; + default: + err = EINVAL; + break; + } + + return (err); +} + +int +vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg2) +{ + struct vmcb_segment *seg; + + seg = vmcb_segptr(vmcb, ident); + if (seg != NULL) { + bcopy(seg, seg2, sizeof(struct vmcb_segment)); + return (0); + } else { + return (EINVAL); + } +} + +int +vmcb_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + struct vmcb_segment *seg; + uint16_t attrib; + + sc = arg; + vmcb = svm_get_vmcb(sc, vcpu); + + seg = vmcb_segptr(vmcb, reg); + KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", + __func__, reg)); + + seg->base = desc->base; + seg->limit = desc->limit; + if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { + /* + * Map seg_desc access to VMCB attribute format. + * + * SVM uses the 'P' bit in the segment attributes to indicate a + * NULL segment so clear it if the segment is marked unusable. 
+ */ + attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF); + if (SEG_DESC_UNUSABLE(desc->access)) { + attrib &= ~0x80; + } + seg->attrib = attrib; + } + + VCPU_CTR4(sc->vm, vcpu, "Setting desc %d: base (%#lx), limit (%#x), " + "attrib (%#x)", reg, seg->base, seg->limit, seg->attrib); + + switch (reg) { + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_SS: + svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); + break; + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + svm_set_dirty(sc, vcpu, VMCB_CACHE_DT); + break; + default: + break; + } + + return (0); +} + +int +vmcb_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + struct vmcb_segment *seg; + + sc = arg; + vmcb = svm_get_vmcb(sc, vcpu); + seg = vmcb_segptr(vmcb, reg); + KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", + __func__, reg)); + + desc->base = seg->base; + desc->limit = seg->limit; + desc->access = 0; + + if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { + /* Map seg_desc access to VMCB attribute format */ + desc->access = ((seg->attrib & 0xF00) << 4) | + (seg->attrib & 0xFF); + + /* + * VT-x uses bit 16 to indicate a segment that has been loaded + * with a NULL selector (aka unusable). The 'desc->access' + * field is interpreted in the VT-x format by the + * processor-independent code. + * + * SVM uses the 'P' bit to convey the same information so + * convert it into the VT-x format. For more details refer to + * section "Segment State in the VMCB" in APMv2. + */ + if (reg != VM_REG_GUEST_CS && reg != VM_REG_GUEST_TR) { + if ((desc->access & 0x80) == 0) + desc->access |= 0x10000; /* Unusable segment */ + } + } + + return (0); +} diff --git a/usr/src/uts/i86pc/io/vmm/amd/vmcb.h b/usr/src/uts/i86pc/io/vmm/amd/vmcb.h new file mode 100644 index 0000000000..ec7caa91f9 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/vmcb.h @@ -0,0 +1,336 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
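The attribute shuffling in vmcb_setdesc()/vmcb_getdesc() above is pure bit surgery between the VT-x style 'access' word used by the processor-independent code (bit 16 = unusable) and SVM's packed attrib field (P bit = usable). The round trip can be checked in isolation; a typical 64-bit code segment makes a convenient worked example:

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* seg_desc access (VT-x layout) -> VMCB attrib, as in vmcb_setdesc(). */
    static uint16_t
    access_to_attrib(uint32_t access)
    {
        return (((access & 0xF000) >> 4) | (access & 0xFF));
    }

    /* VMCB attrib -> seg_desc access, as in vmcb_getdesc(). */
    static uint32_t
    attrib_to_access(uint16_t attrib)
    {
        uint32_t access = ((attrib & 0xF00) << 4) | (attrib & 0xFF);

        if ((access & 0x80) == 0)       /* P clear => unusable in VT-x terms */
            access |= 0x10000;
        return (access);
    }

    int
    main(void)
    {
        uint32_t access = 0xA09B;       /* 64-bit code: type=0xB, S, P, L, G */
        uint16_t attrib = access_to_attrib(access);

        printf("access %#x -> attrib %#x\n", access, attrib);  /* 0xa9b */
        assert(attrib == 0xA9B);
        assert(attrib_to_access(attrib) == access);
        return (0);
    }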
+ * + * $FreeBSD$ + */ + +#ifndef _VMCB_H_ +#define _VMCB_H_ + +struct svm_softc; + +#define BIT(n) (1ULL << n) + +/* + * Secure Virtual Machine: AMD64 Programmer's Manual Vol2, Chapter 15 + * Layout of VMCB: AMD64 Programmer's Manual Vol2, Appendix B + */ + +/* vmcb_ctrl->intercept[] array indices */ +#define VMCB_CR_INTCPT 0 +#define VMCB_DR_INTCPT 1 +#define VMCB_EXC_INTCPT 2 +#define VMCB_CTRL1_INTCPT 3 +#define VMCB_CTRL2_INTCPT 4 + +/* intercept[VMCB_CTRL1_INTCPT] fields */ +#define VMCB_INTCPT_INTR BIT(0) +#define VMCB_INTCPT_NMI BIT(1) +#define VMCB_INTCPT_SMI BIT(2) +#define VMCB_INTCPT_INIT BIT(3) +#define VMCB_INTCPT_VINTR BIT(4) +#define VMCB_INTCPT_CR0_WRITE BIT(5) +#define VMCB_INTCPT_IDTR_READ BIT(6) +#define VMCB_INTCPT_GDTR_READ BIT(7) +#define VMCB_INTCPT_LDTR_READ BIT(8) +#define VMCB_INTCPT_TR_READ BIT(9) +#define VMCB_INTCPT_IDTR_WRITE BIT(10) +#define VMCB_INTCPT_GDTR_WRITE BIT(11) +#define VMCB_INTCPT_LDTR_WRITE BIT(12) +#define VMCB_INTCPT_TR_WRITE BIT(13) +#define VMCB_INTCPT_RDTSC BIT(14) +#define VMCB_INTCPT_RDPMC BIT(15) +#define VMCB_INTCPT_PUSHF BIT(16) +#define VMCB_INTCPT_POPF BIT(17) +#define VMCB_INTCPT_CPUID BIT(18) +#define VMCB_INTCPT_RSM BIT(19) +#define VMCB_INTCPT_IRET BIT(20) +#define VMCB_INTCPT_INTn BIT(21) +#define VMCB_INTCPT_INVD BIT(22) +#define VMCB_INTCPT_PAUSE BIT(23) +#define VMCB_INTCPT_HLT BIT(24) +#define VMCB_INTCPT_INVPG BIT(25) +#define VMCB_INTCPT_INVPGA BIT(26) +#define VMCB_INTCPT_IO BIT(27) +#define VMCB_INTCPT_MSR BIT(28) +#define VMCB_INTCPT_TASK_SWITCH BIT(29) +#define VMCB_INTCPT_FERR_FREEZE BIT(30) +#define VMCB_INTCPT_SHUTDOWN BIT(31) + +/* intercept[VMCB_CTRL2_INTCPT] fields */ +#define VMCB_INTCPT_VMRUN BIT(0) +#define VMCB_INTCPT_VMMCALL BIT(1) +#define VMCB_INTCPT_VMLOAD BIT(2) +#define VMCB_INTCPT_VMSAVE BIT(3) +#define VMCB_INTCPT_STGI BIT(4) +#define VMCB_INTCPT_CLGI BIT(5) +#define VMCB_INTCPT_SKINIT BIT(6) +#define VMCB_INTCPT_RDTSCP BIT(7) +#define VMCB_INTCPT_ICEBP BIT(8) +#define VMCB_INTCPT_WBINVD BIT(9) +#define VMCB_INTCPT_MONITOR BIT(10) +#define VMCB_INTCPT_MWAIT BIT(11) +#define VMCB_INTCPT_MWAIT_ARMED BIT(12) +#define VMCB_INTCPT_XSETBV BIT(13) + +/* VMCB TLB control */ +#define VMCB_TLB_FLUSH_NOTHING 0 /* Flush nothing */ +#define VMCB_TLB_FLUSH_ALL 1 /* Flush entire TLB */ +#define VMCB_TLB_FLUSH_GUEST 3 /* Flush all guest entries */ +#define VMCB_TLB_FLUSH_GUEST_NONGLOBAL 7 /* Flush guest non-PG entries */ + +/* VMCB state caching */ +#define VMCB_CACHE_NONE 0 /* No caching */ +#define VMCB_CACHE_I BIT(0) /* Intercept, TSC off, Pause filter */ +#define VMCB_CACHE_IOPM BIT(1) /* I/O and MSR permission */ +#define VMCB_CACHE_ASID BIT(2) /* ASID */ +#define VMCB_CACHE_TPR BIT(3) /* V_TPR to V_INTR_VECTOR */ +#define VMCB_CACHE_NP BIT(4) /* Nested Paging */ +#define VMCB_CACHE_CR BIT(5) /* CR0, CR3, CR4 & EFER */ +#define VMCB_CACHE_DR BIT(6) /* Debug registers */ +#define VMCB_CACHE_DT BIT(7) /* GDT/IDT */ +#define VMCB_CACHE_SEG BIT(8) /* User segments, CPL */ +#define VMCB_CACHE_CR2 BIT(9) /* page fault address */ +#define VMCB_CACHE_LBR BIT(10) /* Last branch */ + +/* VMCB control event injection */ +#define VMCB_EVENTINJ_EC_VALID BIT(11) /* Error Code valid */ +#define VMCB_EVENTINJ_VALID BIT(31) /* Event valid */ + +/* Event types that can be injected */ +#define VMCB_EVENTINJ_TYPE_INTR 0 +#define VMCB_EVENTINJ_TYPE_NMI 2 +#define VMCB_EVENTINJ_TYPE_EXCEPTION 3 +#define VMCB_EVENTINJ_TYPE_INTn 4 + +/* VMCB exit code, APM vol2 Appendix C */ +#define VMCB_EXIT_MC 0x52 +#define VMCB_EXIT_INTR 0x60 
+#define VMCB_EXIT_NMI 0x61 +#define VMCB_EXIT_VINTR 0x64 +#define VMCB_EXIT_PUSHF 0x70 +#define VMCB_EXIT_POPF 0x71 +#define VMCB_EXIT_CPUID 0x72 +#define VMCB_EXIT_IRET 0x74 +#define VMCB_EXIT_PAUSE 0x77 +#define VMCB_EXIT_HLT 0x78 +#define VMCB_EXIT_IO 0x7B +#define VMCB_EXIT_MSR 0x7C +#define VMCB_EXIT_SHUTDOWN 0x7F +#define VMCB_EXIT_VMSAVE 0x83 +#define VMCB_EXIT_MONITOR 0x8A +#define VMCB_EXIT_MWAIT 0x8B +#define VMCB_EXIT_NPF 0x400 +#define VMCB_EXIT_INVALID -1 + +/* + * Nested page fault. + * Bit definitions to decode EXITINFO1. + */ +#define VMCB_NPF_INFO1_P BIT(0) /* Nested page present. */ +#define VMCB_NPF_INFO1_W BIT(1) /* Access was write. */ +#define VMCB_NPF_INFO1_U BIT(2) /* Access was user access. */ +#define VMCB_NPF_INFO1_RSV BIT(3) /* Reserved bits present. */ +#define VMCB_NPF_INFO1_ID BIT(4) /* Code read. */ + +#define VMCB_NPF_INFO1_GPA BIT(32) /* Guest physical address. */ +#define VMCB_NPF_INFO1_GPT BIT(33) /* Guest page table. */ + +/* + * EXITINTINFO, Interrupt exit info for all intercepts. + * Section 15.7.2, Intercepts during IDT Interrupt Delivery. + */ +#define VMCB_EXITINTINFO_VECTOR(x) ((x) & 0xFF) +#define VMCB_EXITINTINFO_TYPE(x) (((x) >> 8) & 0x7) +#define VMCB_EXITINTINFO_EC_VALID(x) (((x) & BIT(11)) ? 1 : 0) +#define VMCB_EXITINTINFO_VALID(x) (((x) & BIT(31)) ? 1 : 0) +#define VMCB_EXITINTINFO_EC(x) (((x) >> 32) & 0xFFFFFFFF) + +/* Offset of various VMCB fields. */ +#define VMCB_OFF_CTRL(x) (x) +#define VMCB_OFF_STATE(x) ((x) + 0x400) + +#define VMCB_OFF_CR_INTERCEPT VMCB_OFF_CTRL(0x0) +#define VMCB_OFF_DR_INTERCEPT VMCB_OFF_CTRL(0x4) +#define VMCB_OFF_EXC_INTERCEPT VMCB_OFF_CTRL(0x8) +#define VMCB_OFF_INST1_INTERCEPT VMCB_OFF_CTRL(0xC) +#define VMCB_OFF_INST2_INTERCEPT VMCB_OFF_CTRL(0x10) +#define VMCB_OFF_IO_PERM VMCB_OFF_CTRL(0x40) +#define VMCB_OFF_MSR_PERM VMCB_OFF_CTRL(0x48) +#define VMCB_OFF_TSC_OFFSET VMCB_OFF_CTRL(0x50) +#define VMCB_OFF_ASID VMCB_OFF_CTRL(0x58) +#define VMCB_OFF_TLB_CTRL VMCB_OFF_CTRL(0x5C) +#define VMCB_OFF_VIRQ VMCB_OFF_CTRL(0x60) +#define VMCB_OFF_EXIT_REASON VMCB_OFF_CTRL(0x70) +#define VMCB_OFF_EXITINFO1 VMCB_OFF_CTRL(0x78) +#define VMCB_OFF_EXITINFO2 VMCB_OFF_CTRL(0x80) +#define VMCB_OFF_EXITINTINFO VMCB_OFF_CTRL(0x88) +#define VMCB_OFF_AVIC_BAR VMCB_OFF_CTRL(0x98) +#define VMCB_OFF_NPT_BASE VMCB_OFF_CTRL(0xB0) +#define VMCB_OFF_AVIC_PAGE VMCB_OFF_CTRL(0xE0) +#define VMCB_OFF_AVIC_LT VMCB_OFF_CTRL(0xF0) +#define VMCB_OFF_AVIC_PT VMCB_OFF_CTRL(0xF8) +#define VMCB_OFF_SYSENTER_CS VMCB_OFF_STATE(0x228) +#define VMCB_OFF_SYSENTER_ESP VMCB_OFF_STATE(0x230) +#define VMCB_OFF_SYSENTER_EIP VMCB_OFF_STATE(0x238) +#define VMCB_OFF_GUEST_PAT VMCB_OFF_STATE(0x268) + +/* + * Encode the VMCB offset and bytes that we want to read from VMCB. + */ +#define VMCB_ACCESS(o, w) (0x80000000 | (((w) & 0xF) << 16) | \ + ((o) & 0xFFF)) +#define VMCB_ACCESS_OK(v) ((v) & 0x80000000) +#define VMCB_ACCESS_BYTES(v) (((v) >> 16) & 0xF) +#define VMCB_ACCESS_OFFSET(v) ((v) & 0xFFF) + +#ifdef _KERNEL +/* VMCB save state area segment format */ +struct vmcb_segment { + uint16_t selector; + uint16_t attrib; + uint32_t limit; + uint64_t base; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_segment) == 16); + +/* Code segment descriptor attribute in 12-bit format as saved by VMCB. */ +#define VMCB_CS_ATTRIB_L BIT(9) /* Long mode. */ +#define VMCB_CS_ATTRIB_D BIT(10) /* Operand size bit.
*/ + +/* + * The VMCB is divided into two areas - the first one contains various + * control bits including the intercept vector and the second one contains + * the guest state. + */ + +/* VMCB control area - padded up to 1024 bytes */ +struct vmcb_ctrl { + uint32_t intercept[5]; /* all intercepts */ + uint8_t pad1[0x28]; /* Offsets 0x14-0x3B are reserved. */ + uint16_t pause_filthresh; /* Offset 0x3C, PAUSE filter threshold */ + uint16_t pause_filcnt; /* Offset 0x3E, PAUSE filter count */ + uint64_t iopm_base_pa; /* 0x40: IOPM_BASE_PA */ + uint64_t msrpm_base_pa; /* 0x48: MSRPM_BASE_PA */ + uint64_t tsc_offset; /* 0x50: TSC_OFFSET */ + uint32_t asid; /* 0x58: Guest ASID */ + uint8_t tlb_ctrl; /* 0x5C: TLB_CONTROL */ + uint8_t pad2[3]; /* 0x5D-0x5F: Reserved. */ + uint8_t v_tpr; /* 0x60: V_TPR, guest CR8 */ + uint8_t v_irq:1; /* Is virtual interrupt pending? */ + uint8_t :7; /* Padding */ + uint8_t v_intr_prio:4; /* 0x62: Priority for virtual interrupt. */ + uint8_t v_ign_tpr:1; + uint8_t :3; + uint8_t v_intr_masking:1; /* Guest and host sharing of RFLAGS. */ + uint8_t :7; + uint8_t v_intr_vector; /* 0x64: Vector for virtual interrupt. */ + uint8_t pad3[3]; /* 0x65-0x67 Reserved. */ + uint64_t intr_shadow:1; /* 0x68: Interrupt shadow, section15.2.1 APM2 */ + uint64_t :63; + uint64_t exitcode; /* 0x70, Exitcode */ + uint64_t exitinfo1; /* 0x78, EXITINFO1 */ + uint64_t exitinfo2; /* 0x80, EXITINFO2 */ + uint64_t exitintinfo; /* 0x88, Interrupt exit value. */ + uint64_t np_enable:1; /* 0x90, Nested paging enable. */ + uint64_t :63; + uint8_t pad4[0x10]; /* 0x98-0xA7 reserved. */ + uint64_t eventinj; /* 0xA8, Event injection. */ + uint64_t n_cr3; /* B0, Nested page table. */ + uint64_t lbr_virt_en:1; /* Enable LBR virtualization. */ + uint64_t :63; + uint32_t vmcb_clean; /* 0xC0: VMCB clean bits for caching */ + uint32_t :32; /* 0xC4: Reserved */ + uint64_t nrip; /* 0xC8: Guest next nRIP. 
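A quick layout check for the control area above and the fields that close it out just below: inst_len sits at offset 0xD0, the 15 inst_bytes occupy 0xD1-0xDF, so the trailing pad must span 0xE0 up to the 0x400 state-area boundary, which is exactly the 0x320 bytes declared (and the CTASSERT that follows enforces the 1024-byte total). The arithmetic, as a trivially runnable check:

    #include <assert.h>

    int
    main(void)
    {
        /* inst_len at 0xD0, then 15 instruction bytes */
        assert(0xD0 + 1 + 15 == 0xE0);
        /* control area is 0x400 bytes total, hence the 0x320-byte pad */
        assert(0x400 - 0xE0 == 0x320);
        return (0);
    }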
*/ + uint8_t inst_len; /* 0xD0: #NPF decode assist */ + uint8_t inst_bytes[15]; + uint8_t padd6[0x320]; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_ctrl) == 1024); + +struct vmcb_state { + struct vmcb_segment es; + struct vmcb_segment cs; + struct vmcb_segment ss; + struct vmcb_segment ds; + struct vmcb_segment fs; + struct vmcb_segment gs; + struct vmcb_segment gdt; + struct vmcb_segment ldt; + struct vmcb_segment idt; + struct vmcb_segment tr; + uint8_t pad1[0x2b]; /* Reserved: 0xA0-0xCA */ + uint8_t cpl; + uint8_t pad2[4]; + uint64_t efer; + uint8_t pad3[0x70]; /* Reserved: 0xd8-0x147 */ + uint64_t cr4; + uint64_t cr3; /* Guest CR3 */ + uint64_t cr0; + uint64_t dr7; + uint64_t dr6; + uint64_t rflags; + uint64_t rip; + uint8_t pad4[0x58]; /* Reserved: 0x180-0x1D7 */ + uint64_t rsp; + uint8_t pad5[0x18]; /* Reserved 0x1E0-0x1F7 */ + uint64_t rax; + uint64_t star; + uint64_t lstar; + uint64_t cstar; + uint64_t sfmask; + uint64_t kernelgsbase; + uint64_t sysenter_cs; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + uint64_t cr2; + uint8_t pad6[0x20]; + uint64_t g_pat; + uint64_t dbgctl; + uint64_t br_from; + uint64_t br_to; + uint64_t int_from; + uint64_t int_to; + uint8_t pad7[0x968]; /* Reserved up to end of VMCB */ +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_state) == 0xC00); + +struct vmcb { + struct vmcb_ctrl ctrl; + struct vmcb_state state; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb) == PAGE_SIZE); +CTASSERT(offsetof(struct vmcb, state) == 0x400); + +int vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval); +int vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val); +int vmcb_setdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); +int vmcb_getdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); +int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg); + +#endif /* _KERNEL */ +#endif /* _VMCB_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.c b/usr/src/uts/i86pc/io/vmm/intel/ept.c index 5ae9ed2f6a..4915537b0a 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/ept.c +++ b/usr/src/uts/i86pc/io/vmm/intel/ept.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
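One last note on vmcb.h before moving into the EPT rework that follows: the VMCB_ACCESS() encoding lets vmcb_read()/vmcb_write() callers name a raw VMCB field by (offset, width) instead of a VM_REG_* ident, and vmcb_access() decodes it with the _OK/_BYTES/_OFFSET macros. The round trip is easy to verify standalone (constants copied from the header above):

    #include <assert.h>
    #include <stdint.h>

    /* Copied from vmcb.h above. */
    #define VMCB_ACCESS(o, w)       (0x80000000 | (((w) & 0xF) << 16) | \
                                    ((o) & 0xFFF))
    #define VMCB_ACCESS_OK(v)       ((v) & 0x80000000)
    #define VMCB_ACCESS_BYTES(v)    (((v) >> 16) & 0xF)
    #define VMCB_ACCESS_OFFSET(v)   ((v) & 0xFFF)
    #define VMCB_OFF_EXIT_REASON    0x70    /* VMCB_OFF_CTRL(0x70) */

    int
    main(void)
    {
        uint32_t ident = VMCB_ACCESS(VMCB_OFF_EXIT_REASON, 8);

        assert(VMCB_ACCESS_OK(ident));      /* ident == 0x80080070 */
        assert(VMCB_ACCESS_BYTES(ident) == 8);
        assert(VMCB_ACCESS_OFFSET(ident) == VMCB_OFF_EXIT_REASON);
        return (0);
    }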
* - * $FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z grehan $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -39,33 +41,35 @@ */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z grehan $"); +__FBSDID("$FreeBSD$"); +#include +#include #include -#include #include -#include #include +#include +#ifndef __FreeBSD__ +#include +#endif #include #include - -#include -#include -#include -#include +#include #include + #include "vmx_cpufunc.h" -#include "vmx.h" #include "ept.h" +#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0)) #define EPT_PWL4(cap) ((cap) & (1UL << 6)) #define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) #define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ #define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ -#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) #define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) +#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21)) +#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) #define INVVPID_ALL_TYPES_MASK 0xF0000000000UL #define INVVPID_ALL_TYPES_SUPPORTED(cap) \ @@ -75,28 +79,22 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z g #define INVEPT_ALL_TYPES_SUPPORTED(cap) \ (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) -#define EPT_PG_RD (1 << 0) -#define EPT_PG_WR (1 << 1) -#define EPT_PG_EX (1 << 2) -#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) -#define EPT_PG_IGNORE_PAT (1 << 6) -#define EPT_PG_SUPERPAGE (1 << 7) +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPT_ENABLE_AD_BITS (1 << 6) -#define EPT_ADDR_MASK ((uint64_t)-1 << 12) +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL); -MALLOC_DECLARE(M_VMX); +static int ept_enable_ad_bits; -static uint64_t page_sizes_mask; - -/* - * Set this to 1 to have the EPT tables respect the guest PAT settings - */ -static int ept_pat_passthru; +static int ept_pmap_flags; +SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD, + &ept_pmap_flags, 0, NULL); int -ept_init(void) +ept_init(int ipinum) { - int page_shift; + int use_hw_ad_bits, use_superpages, use_exec_only; uint64_t cap; cap = rdmsr(MSR_VMX_EPT_VPID_CAP); @@ -116,17 +114,24 @@ ept_init(void) !INVEPT_ALL_TYPES_SUPPORTED(cap)) return (EINVAL); - /* Set bits in 'page_sizes_mask' for each valid page size */ - page_shift = PAGE_SHIFT; - page_sizes_mask = 1UL << page_shift; /* 4KB page */ + ept_pmap_flags = ipinum & PMAP_NESTED_IPIMASK; - page_shift += 9; - if (EPT_PDE_SUPERPAGE(cap)) - page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */ + use_superpages = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages); + if (use_superpages && EPT_PDE_SUPERPAGE(cap)) + ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */ - page_shift += 9; - if (EPT_PDPTE_SUPERPAGE(cap)) - page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */ + use_hw_ad_bits = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits); + if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap)) + ept_enable_ad_bits = 1; + else + ept_pmap_flags |= PMAP_EMULATE_AD_BITS; + + use_exec_only = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only); + if (use_exec_only && EPT_SUPPORTS_EXEC_ONLY(cap)) + ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY; return (0); } @@ -165,288 +170,61 @@ ept_dump(uint64_t *ptp, int nlevels) } #endif -static size_t -ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, - 
vm_memattr_t attr, vm_prot_t prot, boolean_t spok) -{ - int spshift, ptpshift, ptpindex, nlevels; - - /* - * Compute the size of the mapping that we can accomodate. - * - * This is based on three factors: - * - super page sizes supported by the processor - * - alignment of the region starting at 'gpa' and 'hpa' - * - length of the region 'len' - */ - spshift = PAGE_SHIFT; - if (spok) - spshift += (EPT_PWLEVELS - 1) * 9; - while (spshift >= PAGE_SHIFT) { - uint64_t spsize = 1UL << spshift; - if ((page_sizes_mask & spsize) != 0 && - (gpa & (spsize - 1)) == 0 && - (hpa & (spsize - 1)) == 0 && - length >= spsize) { - break; - } - spshift -= 9; - } - - if (spshift < PAGE_SHIFT) { - panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, " - "length 0x%016lx, page_sizes_mask 0x%016lx", - gpa, hpa, length, page_sizes_mask); - } - - nlevels = EPT_PWLEVELS; - while (--nlevels >= 0) { - ptpshift = PAGE_SHIFT + nlevels * 9; - ptpindex = (gpa >> ptpshift) & 0x1FF; - - /* We have reached the leaf mapping */ - if (spshift >= ptpshift) - break; - - /* - * We are working on a non-leaf page table page. - * - * Create the next level page table page if necessary and point - * to it from the current page table. - */ - if (ptp[ptpindex] == 0) { -#ifdef __FreeBSD__ - void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO); -#else - void *nlp = kmem_zalloc(PAGE_SIZE, KM_SLEEP); - ASSERT((((uintptr_t)nlp) & PAGE_MASK) == 0); -#endif - ptp[ptpindex] = vtophys(nlp); - ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX; - } - - /* Work our way down to the next level page table page */ -#ifdef __FreeBSD__ - ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK); -#else - ptp = (uint64_t *)hat_kpm_pfn2va(btop(ptp[ptpindex] & EPT_ADDR_MASK)); -#endif - } - - if ((gpa & ((1UL << ptpshift) - 1)) != 0) { - panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d " - "mismatch\n", gpa, ptpshift); - } - - if (prot != VM_PROT_NONE) { - /* Do the mapping */ - ptp[ptpindex] = hpa; - - /* Apply the access controls */ - if (prot & VM_PROT_READ) - ptp[ptpindex] |= EPT_PG_RD; - if (prot & VM_PROT_WRITE) - ptp[ptpindex] |= EPT_PG_WR; - if (prot & VM_PROT_EXECUTE) - ptp[ptpindex] |= EPT_PG_EX; - - /* - * By default the PAT type is ignored - this appears to - * be how other hypervisors handle EPT. Allow this to be - * overridden. 
- */ - ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); - if (!ept_pat_passthru) - ptp[ptpindex] |= EPT_PG_IGNORE_PAT; - - if (nlevels > 0) - ptp[ptpindex] |= EPT_PG_SUPERPAGE; - } else { - /* Remove the mapping */ - ptp[ptpindex] = 0; - } - - return (1UL << ptpshift); -} - -static vm_paddr_t -ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa) -{ - int nlevels, ptpshift, ptpindex; - uint64_t ptpval, hpabase, pgmask; - - nlevels = EPT_PWLEVELS; - while (--nlevels >= 0) { - ptpshift = PAGE_SHIFT + nlevels * 9; - ptpindex = (gpa >> ptpshift) & 0x1FF; - - ptpval = ptp[ptpindex]; - - /* Cannot make progress beyond this point */ - if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0) - break; - - if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) { - pgmask = (1UL << ptpshift) - 1; - hpabase = ptpval & ~pgmask; - return (hpabase | (gpa & pgmask)); - } - - /* Work our way down to the next level page table page */ -#ifdef __FreBSD__ - ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); -#else - ptp = (uint64_t *)hat_kpm_pfn2va(btop(ptpval & EPT_ADDR_MASK)); -#endif - } - - return ((vm_paddr_t)-1); -} - -static void -ept_free_pt_entry(pt_entry_t pte) -{ - if (pte == 0) - return; - - /* sanity check */ - if ((pte & EPT_PG_SUPERPAGE) != 0) - panic("ept_free_pt_entry: pte cannot have superpage bit"); - - return; -} - -static void -ept_free_pd_entry(pd_entry_t pde) -{ - pt_entry_t *pt; - int i; - - if (pde == 0) - return; - - if ((pde & EPT_PG_SUPERPAGE) == 0) { -#ifdef __FreeBSD__ - pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK); - for (i = 0; i < NPTEPG; i++) - ept_free_pt_entry(pt[i]); - free(pt, M_VMX); /* free the page table page */ -#else - page_t *pp; - pt = (pt_entry_t *)hat_kpm_pfn2va(btop(pde & EPT_ADDR_MASK)); - for (i = 0; i < NPTEPG; i++) - ept_free_pt_entry(pt[i]); - pp = page_numtopp_nolock(btop(pde & EPT_ADDR_MASK)); - kmem_free((void *)pp->p_offset, PAGE_SIZE); -#endif - } -} - +#ifdef __FreeBSD__ static void -ept_free_pdp_entry(pdp_entry_t pdpe) +invept_single_context(void *arg) { - pd_entry_t *pd; - int i; - - if (pdpe == 0) - return; + struct invept_desc desc = *(struct invept_desc *)arg; - if ((pdpe & EPT_PG_SUPERPAGE) == 0) { -#ifdef __FreeBSD__ - pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK); - for (i = 0; i < NPDEPG; i++) - ept_free_pd_entry(pd[i]); - free(pd, M_VMX); /* free the page directory page */ -#else - page_t *pp; - pd = (pd_entry_t *)hat_kpm_pfn2va(btop(pdpe & EPT_ADDR_MASK)); - for (i = 0; i < NPDEPG; i++) - ept_free_pd_entry(pd[i]); - pp = page_numtopp_nolock(btop(pdpe & EPT_ADDR_MASK)); - kmem_free((void *)pp->p_offset, PAGE_SIZE); -#endif - } + invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); } -static void -ept_free_pml4_entry(pml4_entry_t pml4e) +void +ept_invalidate_mappings(u_long eptp) { - pdp_entry_t *pdp; - int i; + struct invept_desc invept_desc = { 0 }; - if (pml4e == 0) - return; + invept_desc.eptp = eptp; - if ((pml4e & EPT_PG_SUPERPAGE) == 0) { -#ifdef __FreeBSD__ - pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK); - for (i = 0; i < NPDPEPG; i++) - ept_free_pdp_entry(pdp[i]); - free(pdp, M_VMX); /* free the page directory ptr page */ -#else - page_t *pp; - pdp = (pdp_entry_t *)hat_kpm_pfn2va(btop(pml4e - & EPT_ADDR_MASK)); - for (i = 0; i < NPDPEPG; i++) - ept_free_pdp_entry(pdp[i]); - pp = page_numtopp_nolock(btop(pml4e & EPT_ADDR_MASK)); - kmem_free((void *)pp->p_offset, PAGE_SIZE); -#endif - } + smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); } - +#else /* __FreeBSD__ */ void -ept_vmcleanup(struct vmx *vmx) 
+ept_invalidate_mappings(u_long eptp) { - int i; - - for (i = 0; i < NPML4EPG; i++) - ept_free_pml4_entry(vmx->pml4ept[i]); + hma_vmx_invept_allcpus((uintptr_t)eptp); } +#endif /* __FreeBSD__ */ -int -ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, - vm_memattr_t attr, int prot, boolean_t spok) +static int +ept_pinit(pmap_t pmap) { - size_t n; - struct vmx *vmx = arg; - - while (len > 0) { - n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr, - prot, spok); - len -= n; - gpa += n; - hpa += n; - } - return (0); + return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags)); } -vm_paddr_t -ept_vmmmap_get(void *arg, vm_paddr_t gpa) +struct vmspace * +ept_vmspace_alloc(vm_offset_t min, vm_offset_t max) { - vm_paddr_t hpa; - struct vmx *vmx; - vmx = arg; - hpa = ept_lookup_mapping(vmx->pml4ept, gpa); - return (hpa); + return (vmspace_alloc(min, max, ept_pinit)); } -static void -invept_single_context(void *arg) +void +ept_vmspace_free(struct vmspace *vmspace) { - struct invept_desc desc = *(struct invept_desc *)arg; - invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); + vmspace_free(vmspace); } -void -ept_invalidate_mappings(u_long pml4ept) +uint64_t +eptp(uint64_t pml4) { - struct invept_desc invept_desc = { 0 }; + uint64_t eptp_val; - invept_desc.eptp = EPTP(pml4ept); + eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK; + if (ept_enable_ad_bits) + eptp_val |= EPT_ENABLE_AD_BITS; - smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); + return (eptp_val); } diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.h b/usr/src/uts/i86pc/io/vmm/intel/ept.h index d0bcce7ec3..4a029e8b22 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/ept.h +++ b/usr/src/uts/i86pc/io/vmm/intel/ept.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/ept.h 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ #ifndef _EPT_H_ @@ -31,13 +33,9 @@ struct vmx; -#define EPT_PWLEVELS 4 /* page walk levels */ -#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) - -int ept_init(void); -int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, - vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings); -vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa); -void ept_invalidate_mappings(u_long ept_pml4); -void ept_vmcleanup(struct vmx *vmx); +int ept_init(int ipinum); +void ept_invalidate_mappings(u_long eptp); +struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max); +void ept_vmspace_free(struct vmspace *vmspace); +uint64_t eptp(uint64_t pml4); #endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/offsets.in b/usr/src/uts/i86pc/io/vmm/intel/offsets.in new file mode 100644 index 0000000000..d60a2d8f5f --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/offsets.in @@ -0,0 +1,62 @@ +/* + * COPYRIGHT 2014 Pluribus Networks Inc. + * + * All rights reserved. This copyright notice is Copyright Management + * Information under 17 USC 1202 and is included to protect this work and + * deter copyright infringement. Removal or alteration of this Copyright + * Management Information without the express written permission from + * Pluribus Networks Inc is prohibited, and any such unauthorized removal + * or alteration will be a violation of federal law. 
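Returning to the new eptp() helper above: the EPTP value is just the PML4 physical address with the walk length, memory type, and (optionally) the accessed/dirty enable bit OR'd into the low bits. A standalone check of that arithmetic, with the constants mirrored from the code above (PAT_WRITE_BACK is the x86 write-back type, 6):

    #include <assert.h>
    #include <stdint.h>

    /* Values mirrored from the code above; AD-bit enable assumed on. */
    #define EPT_PWLEVELS        4
    #define PAT_WRITE_BACK      6       /* EPT memory type: write-back */
    #define EPT_ENABLE_AD_BITS  (1 << 6)

    static uint64_t
    make_eptp(uint64_t pml4_pa, int enable_ad)
    {
        uint64_t v = pml4_pa | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK;

        if (enable_ad)
            v |= EPT_ENABLE_AD_BITS;
        return (v);
    }

    int
    main(void)
    {
        /* 4-level walk (0x18) + WB (0x6) + AD (0x40) = low bits 0x5e. */
        assert(make_eptp(0x123456000ULL, 1) == 0x12345605eULL);
        return (0);
    }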
+ */ +#include +#include +#include +#include + +#include +#include + +#include "intel/vmx_cpufunc.h" +#include "intel/vmx.h" +#include "vm/vm_glue.h" + +vmxctx + guest_rdi VMXCTX_GUEST_RDI + guest_rsi VMXCTX_GUEST_RSI + guest_rdx VMXCTX_GUEST_RDX + guest_rcx VMXCTX_GUEST_RCX + guest_r8 VMXCTX_GUEST_R8 + guest_r9 VMXCTX_GUEST_R9 + guest_rax VMXCTX_GUEST_RAX + guest_rbx VMXCTX_GUEST_RBX + guest_rbp VMXCTX_GUEST_RBP + guest_r10 VMXCTX_GUEST_R10 + guest_r11 VMXCTX_GUEST_R11 + guest_r12 VMXCTX_GUEST_R12 + guest_r13 VMXCTX_GUEST_R13 + guest_r14 VMXCTX_GUEST_R14 + guest_r15 VMXCTX_GUEST_R15 + guest_cr2 VMXCTX_GUEST_CR2 + inst_fail_status VMXCTX_INST_FAIL_STATUS + pmap VMXCTX_PMAP + +vmx + eptgen VMX_EPTGEN + eptp VMX_EPTP + +pmap + pm_active PM_ACTIVE + pm_eptgen PM_EPTGEN + +cpu + cpu_id + +\#define VM_SUCCESS 0 +\#define VM_FAIL_INVALID 1 +\#define VM_FAIL_VALID 2 + +\#define VMX_GUEST_VMEXIT 0 +\#define VMX_VMRESUME_ERROR 1 +\#define VMX_VMLAUNCH_ERROR 2 +\#define VMX_INVEPT_ERROR 3 +\#define VMX_VMWRITE_ERROR 4 diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.c b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c index bbd2da2a34..d19f6bc262 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmcs.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,6 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifdef __FreeBSD__ @@ -43,9 +46,10 @@ #endif #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z neel $"); +__FBSDID("$FreeBSD$"); #include +#include #include #include @@ -64,6 +68,12 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z #include #endif +SYSCTL_DECL(_hw_vmm_vmx); + +static int no_flush_rsb; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, no_flush_rsb, CTLFLAG_RW, + &no_flush_rsb, 0, "Do not flush RSB upon vmexit"); + static uint64_t vmcs_fix_regval(uint32_t encoding, uint64_t val) { @@ -117,6 +127,14 @@ vmcs_field_encoding(int ident) return (VMCS_GUEST_LDTR_SELECTOR); case VM_REG_GUEST_EFER: return (VMCS_GUEST_IA32_EFER); + case VM_REG_GUEST_PDPTE0: + return (VMCS_GUEST_PDPTE0); + case VM_REG_GUEST_PDPTE1: + return (VMCS_GUEST_PDPTE1); + case VM_REG_GUEST_PDPTE2: + return (VMCS_GUEST_PDPTE2); + case VM_REG_GUEST_PDPTE3: + return (VMCS_GUEST_PDPTE3); default: return (-1); } @@ -332,40 +350,15 @@ done: return (error); } -#ifndef __FreeBSD__ -int -vmcs_set_host_msr_save(struct vmcs *vmcs, u_long h_area, u_int h_count) -{ - int error; - - VMPTRLD(vmcs); - - /* - * Host MSRs are loaded from the VM-exit MSR-load area. 
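
/*
 * Illustrative aside (not part of the patch): the offsets.in listing
 * above is consumed by the build's offset-generation tooling, which turns
 * each named member into an assembly-visible byte offset (VMXCTX_GUEST_RDI
 * and friends) for the guest-entry assembly. offsetof() yields the same
 * numbers; the struct here is a trimmed, hypothetical stand-in.
 */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct toy_vmxctx {
        uint64_t guest_rdi;
        uint64_t guest_rsi;
        uint64_t guest_rdx;
        uint64_t guest_rcx;
};

int
main(void)
{
        printf("VMXCTX_GUEST_RDX would be %zu\n",
            offsetof(struct toy_vmxctx, guest_rdx));
        return (0);
}
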
- */ - if ((error = vmwrite(VMCS_EXIT_MSR_LOAD, h_area)) != 0) - goto done; - if ((error = vmwrite(VMCS_EXIT_MSR_LOAD_COUNT, h_count)) != 0) - goto done; - - error = 0; -done: - VMCLEAR(vmcs); - return (error); -} -#endif - int -vmcs_set_defaults(struct vmcs *vmcs, - u_long host_rip, u_long host_rsp, u_long ept_pml4, - uint32_t pinbased_ctls, uint32_t procbased_ctls, - uint32_t procbased_ctls2, uint32_t exit_ctls, - uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid) +vmcs_init(struct vmcs *vmcs) { int error, codesel, datasel, tsssel; u_long cr0, cr4, efer; - uint64_t eptp, pat, fsbase, idtrbase; - uint32_t exc_bitmap; + uint64_t pat; +#ifdef __FreeBSD__ + uint64_t fsbase, idtrbase; +#endif codesel = vmm_get_host_codesel(); datasel = vmm_get_host_datasel(); @@ -376,34 +369,6 @@ vmcs_set_defaults(struct vmcs *vmcs, */ VMPTRLD(vmcs); - /* - * Load the VMX controls - */ - if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0) - goto done; - if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0) - goto done; - if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0) - goto done; - if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0) - goto done; - if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0) - goto done; - - /* Guest state */ - - /* Initialize guest IA32_PAT MSR with the default value */ - pat = PAT_VALUE(0, PAT_WRITE_BACK) | - PAT_VALUE(1, PAT_WRITE_THROUGH) | - PAT_VALUE(2, PAT_UNCACHED) | - PAT_VALUE(3, PAT_UNCACHEABLE) | - PAT_VALUE(4, PAT_WRITE_BACK) | - PAT_VALUE(5, PAT_WRITE_THROUGH) | - PAT_VALUE(6, PAT_UNCACHED) | - PAT_VALUE(7, PAT_UNCACHEABLE); - if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0) - goto done; - /* Host state */ /* Initialize host IA32_PAT MSR */ @@ -466,37 +431,35 @@ vmcs_set_defaults(struct vmcs *vmcs, fsbase = vmm_get_host_fsbase(); if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0) goto done; -#endif idtrbase = vmm_get_host_idtrbase(); if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0) goto done; - /* instruction pointer */ - if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0) - goto done; - - /* stack pointer */ - if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0) - goto done; - - /* eptp */ - eptp = EPTP(ept_pml4); - if ((error = vmwrite(VMCS_EPTP, eptp)) != 0) +#else /* __FreeBSD__ */ + /* + * Configure host sysenter MSRs to be restored on VM exit. + * The thread-specific MSR_INTC_SEP_ESP value is loaded in vmx_run. 
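
/*
 * Illustrative aside (not part of the patch): the PAT_VALUE() chain being
 * dropped above packs eight 3-bit memory types, one per byte, into the
 * 64-bit IA32_PAT MSR. A compile-alone rendering of the same packing,
 * using the usual x86 memory-type encodings.
 */
#include <stdint.h>
#include <stdio.h>

#define PAT_VALUE(i, m)         ((uint64_t)(m) << ((i) * 8))
#define PAT_UNCACHEABLE         0x0
#define PAT_WRITE_THROUGH       0x4
#define PAT_WRITE_BACK          0x6
#define PAT_UNCACHED            0x7

int
main(void)
{
        uint64_t pat = PAT_VALUE(0, PAT_WRITE_BACK) |
            PAT_VALUE(1, PAT_WRITE_THROUGH) |
            PAT_VALUE(2, PAT_UNCACHED) |
            PAT_VALUE(3, PAT_UNCACHEABLE) |
            PAT_VALUE(4, PAT_WRITE_BACK) |
            PAT_VALUE(5, PAT_WRITE_THROUGH) |
            PAT_VALUE(6, PAT_UNCACHED) |
            PAT_VALUE(7, PAT_UNCACHEABLE);

        printf("IA32_PAT = %#018llx\n", (unsigned long long)pat);
        return (0);
}
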
+ */ + if ((error = vmwrite(VMCS_HOST_IA32_SYSENTER_CS, KCS_SEL)) != 0) goto done; - - /* vpid */ - if ((error = vmwrite(VMCS_VPID, vpid)) != 0) + /* Natively defined as MSR_INTC_SEP_EIP */ + if ((error = vmwrite(VMCS_HOST_IA32_SYSENTER_EIP, + rdmsr(MSR_SYSENTER_EIP_MSR))) != 0) goto done; - /* msr bitmap */ - if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0) - goto done; +#endif /* __FreeBSD__ */ - /* exception bitmap */ - exc_bitmap = 1 << IDT_MC; - if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0) - goto done; + /* instruction pointer */ + if (no_flush_rsb) { + if ((error = vmwrite(VMCS_HOST_RIP, + (u_long)vmx_exit_guest)) != 0) + goto done; + } else { + if ((error = vmwrite(VMCS_HOST_RIP, + (u_long)vmx_exit_guest_flush_rsb)) != 0) + goto done; + } /* link pointer */ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.h b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h index 20e99e8184..edde5c6dd5 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmcs.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,19 +25,40 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmcs.h 276098 2014-12-23 02:14:49Z neel $ + * $FreeBSD$ + */ + +/* + * Copyright 2017 Joyent, Inc. */ #ifndef _VMCS_H_ #define _VMCS_H_ #ifdef _KERNEL +#ifndef _ASM struct vmcs { uint32_t identifier; uint32_t abort_code; char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; +#ifndef __FreeBSD__ + /* + * Keep the physical address of the VMCS cached adjacent for the + * structure so it can be referenced in contexts which are too delicate + * for a call into the HAT. For the moment it means wasting a whole + * page on padding for the PA value to maintain alignment, but it + * allows the consumers of 'struct vmcs *' to easily access the value + * without a significant change to the interface. + */ + uint64_t vmcs_pa; + char _pa_pad[PAGE_SIZE - sizeof (vm_paddr_t)]; +#endif }; +#ifdef __FreeBSD__ CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); +#else +CTASSERT(sizeof(struct vmcs) == (2*PAGE_SIZE)); +#endif /* MSR save region is composed of an array of 'struct msr_entry' */ struct msr_entry { @@ -47,15 +70,6 @@ struct msr_entry { int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); int vmcs_init(struct vmcs *vmcs); -#ifndef __FreeBSD__ -int vmcs_set_host_msr_save(struct vmcs *vmcs, u_long h_area, u_int h_count); -#endif -int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp, - u_long ept_pml4, - uint32_t pinbased_ctls, uint32_t procbased_ctls, - uint32_t procbased_ctls2, uint32_t exit_ctls, - uint32_t entry_ctls, u_long msr_bitmap, - uint16_t vpid); int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv); int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val); int vmcs_getdesc(struct vmcs *vmcs, int running, int ident, @@ -86,6 +100,65 @@ vmcs_write(uint32_t encoding, uint64_t val) error = vmwrite(encoding, val); KASSERT(error == 0, ("vmcs_write(%u) error %d", encoding, error)); } + +#ifndef __FreeBSD__ +/* + * Due to header complexity combined with the need to cache the physical + * address for the VMCS, these must be defined here rather than vmx_cpufunc.h. 
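
/*
 * Illustrative aside (not part of the patch): caching vmcs_pa as above
 * grows struct vmcs to exactly two pages, which the CTASSERT guards. A
 * C11 _Static_assert demonstrates the same invariant on a simplified
 * stand-in type.
 */
#include <stdint.h>

#define PAGE_SIZE       4096

struct toy_vmcs {
        uint32_t        identifier;
        uint32_t        abort_code;
        char            _impl_specific[PAGE_SIZE - sizeof (uint32_t) * 2];
        uint64_t        vmcs_pa;        /* cached physical address */
        char            _pa_pad[PAGE_SIZE - sizeof (uint64_t)];
};

_Static_assert(sizeof (struct toy_vmcs) == 2 * PAGE_SIZE,
    "toy_vmcs must span exactly two pages");

int
main(void)
{
        return (0);
}
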
+ */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr = vmcs->vmcs_pa; + + __asm __volatile("vmclear %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline int +vmptrld(struct vmcs *vmcs) +{ + int error; + uint64_t addr = vmcs->vmcs_pa; + + __asm __volatile("vmptrld %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline void +VMCLEAR(struct vmcs *vmcs) +{ + int err; + + err = vmclear(vmcs); + if (err != 0) + panic("%s: vmclear(%p) error %d", __func__, vmcs, err); + + critical_exit(); +} + +static __inline void +VMPTRLD(struct vmcs *vmcs) +{ + int err; + + critical_enter(); + + err = vmptrld(vmcs); + if (err != 0) + panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); +} +#endif /* __FreeBSD__ */ + #endif /* _VMX_CPUFUNC_H_ */ #define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) @@ -99,6 +172,7 @@ vmcs_write(uint32_t encoding, uint64_t val) #define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO) #define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR) +#endif /* _ASM */ #endif /* _KERNEL */ #define VMCS_INITIAL 0xffffffffffffffff @@ -345,6 +419,14 @@ vmcs_write(uint32_t encoding, uint64_t val) #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_RDRAND 57 +#define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_VMFUNC 59 +#define EXIT_REASON_ENCLS 60 +#define EXIT_REASON_RDSEED 61 +#define EXIT_REASON_PM_LOG_FULL 62 +#define EXIT_REASON_XSAVES 63 +#define EXIT_REASON_XRSTORS 64 /* * NMI unblocking due to IRET. diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c index 7ddf4e2a46..ce42ff8c9c 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. + * Copyright (c) 2018 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,10 +39,11 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
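
/*
 * Illustrative aside (not part of the patch): the VMPTRLD()/VMCLEAR()
 * wrappers above bracket VMCS access with a critical section so the
 * current-VMCS pointer cannot be migrated out from under the caller.
 * The intended call shape, with the primitives stubbed out:
 */
#include <stdint.h>
#include <stdio.h>

static void
critical_enter(void)
{
        /* stub: the real routine prevents preemption on this CPU */
}

static void
critical_exit(void)
{
        /* stub: the real routine allows preemption again */
}

int
main(void)
{
        critical_enter();       /* VMPTRLD() enters first ... */
        printf("... vmread()/vmwrite() against the current VMCS ...\n");
        critical_exit();        /* ... VMCLEAR() exits last */
        return (0);
}
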
*/ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include @@ -50,12 +54,21 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z t #include #include +#ifndef __FreeBSD__ +#include +#include +#include +#include +#include +#endif + #include #include #include #include #include +#include #include #include #include @@ -75,6 +88,7 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z t #include "ept.h" #include "vmx_cpufunc.h" +#include "vmcs.h" #include "vmx.h" #include "vmx_msr.h" #include "x86.h" @@ -90,13 +104,30 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z t (PROCBASED_INT_WINDOW_EXITING | \ PROCBASED_NMI_WINDOW_EXITING) +#ifdef __FreeBSD__ +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_MWAIT_EXITING | \ + PROCBASED_MONITOR_EXITING | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING | \ + PROCBASED_CR8_LOAD_EXITING | \ + PROCBASED_CR8_STORE_EXITING) +#else +/* We consider TSC offset a necessity for unsynched TSC handling */ #define PROCBASED_CTLS_ONE_SETTING \ (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_TSC_OFFSET | \ + PROCBASED_MWAIT_EXITING | \ + PROCBASED_MONITOR_EXITING | \ PROCBASED_IO_EXITING | \ PROCBASED_MSR_BITMAPS | \ PROCBASED_CTLS_WINDOW_SETTING | \ PROCBASED_CR8_LOAD_EXITING | \ PROCBASED_CR8_STORE_EXITING) +#endif /* __FreeBSD__ */ + #define PROCBASED_CTLS_ZERO_SETTING \ (PROCBASED_CR3_LOAD_EXITING | \ PROCBASED_CR3_STORE_EXITING | \ @@ -106,20 +137,21 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z t #define PROCBASED_CTLS2_ZERO_SETTING 0 #define VM_EXIT_CTLS_ONE_SETTING \ - (VM_EXIT_HOST_LMA | \ + (VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_HOST_LMA | \ + VM_EXIT_LOAD_PAT | \ VM_EXIT_SAVE_EFER | \ VM_EXIT_LOAD_EFER | \ - VM_EXIT_LOAD_PAT | \ - VM_EXIT_SAVE_PAT | \ - VM_EXIT_LOAD_PAT) + VM_EXIT_ACKNOWLEDGE_INTERRUPT) -#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS +#define VM_EXIT_CTLS_ZERO_SETTING 0 -#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER | VM_ENTRY_LOAD_PAT) +#define VM_ENTRY_CTLS_ONE_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_LOAD_EFER) #define VM_ENTRY_CTLS_ZERO_SETTING \ - (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ - VM_ENTRY_INTO_SMM | \ + (VM_ENTRY_INTO_SMM | \ VM_ENTRY_DEACTIVATE_DUAL_MONITOR) #define HANDLED 1 @@ -131,11 +163,10 @@ static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); +#ifdef __FreeBSD__ int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); -#ifndef __FreeBSD__ -static vm_paddr_t vmxon_region_pa[MAXCPU]; -#endif +#endif /*__FreeBSD__ */ static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; static uint32_t exit_ctls, entry_ctls; @@ -159,29 +190,135 @@ SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, /* * Optional capabilities */ +SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); + static int cap_halt_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, &cap_halt_exit, 0, + "HLT triggers a VM-exit"); + static int cap_pause_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, + 0, "PAUSE triggers a VM-exit"); + static int cap_unrestricted_guest; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, + 
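
/*
 * Illustrative aside (not part of the patch): each *_ONE_SETTING /
 * *_ZERO_SETTING pair above splits a VMX control word into bits that
 * must be set and bits that must be clear; vmx_set_ctlreg() then checks
 * those demands against the capability MSRs. A loose, compile-alone
 * model of that validation (the masks are invented):
 */
#include <stdint.h>
#include <stdio.h>

static int
ctls_ok(uint32_t val, uint32_t one_setting, uint32_t zero_setting)
{
        if ((val & one_setting) != one_setting)
                return (0);     /* a required control is clear */
        if ((val & zero_setting) != 0)
                return (0);     /* a forbidden control is set */
        return (1);
}

int
main(void)
{
        uint32_t one = (1u << 31) | (1u << 25); /* hypothetical masks */
        uint32_t zero = (1u << 15);

        printf("valid=%d\n", ctls_ok(one, one, zero));
        return (0);
}
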
&cap_unrestricted_guest, 0, "Unrestricted guests"); + static int cap_monitor_trap; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, + &cap_monitor_trap, 0, "Monitor trap flag"); + static int cap_invpcid; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, + 0, "Guests are allowed to use INVPCID"); static int virtual_interrupt_delivery; -SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); static int posted_interrupts; -SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD, +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, &posted_interrupts, 0, "APICv posted interrupt support"); -static int pirvec; +static int pirvec = -1; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, &pirvec, 0, "APICv posted interrupt vector"); +#ifdef __FreeBSD__ static struct unrhdr *vpid_unr; +#endif /* __FreeBSD__ */ static u_int vpid_alloc_failed; SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, &vpid_alloc_failed, 0, NULL); +static int guest_l1d_flush; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD, + &guest_l1d_flush, 0, NULL); +static int guest_l1d_flush_sw; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD, + &guest_l1d_flush_sw, 0, NULL); + +static struct msr_entry msr_load_list[1] __aligned(16); + +/* + * The definitions of SDT probes for VMX. + */ + +SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, + "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, + "struct vmx *", "int", "struct vm_exit *", "uint64_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, + "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, + "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, + "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, + "struct vmx *", "int", "struct vm_exit *", "uint64_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, + "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, + "struct vmx *", "int", "struct vm_exit 
*"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, return, + "struct vmx *", "int", "struct vm_exit *", "int"); + /* * Use the last page below 4GB as the APIC access address. This address is * occupied by the boot firmware so it is guaranteed that it will not conflict @@ -193,6 +330,9 @@ static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); static void vmx_inject_pir(struct vlapic *vlapic); +#ifndef __FreeBSD__ +static int vmx_apply_tsc_adjust(struct vmx *, int); +#endif /* __FreeBSD__ */ #ifdef KTR static const char * @@ -279,8 +419,8 @@ exit_reason_to_str(int reason) return "monitor"; case EXIT_REASON_PAUSE: return "pause"; - case EXIT_REASON_MCE: - return "mce"; + case EXIT_REASON_MCE_DURING_ENTRY: + return "mce-during-entry"; case EXIT_REASON_TPR: return "tpr"; case EXIT_REASON_APIC_ACCESS: @@ -312,83 +452,6 @@ exit_reason_to_str(int reason) return (reasonbuf); } } - -#ifdef SETJMP_TRACE -static const char * -vmx_setjmp_rc2str(int rc) -{ - switch (rc) { - case VMX_RETURN_DIRECT: - return "direct"; - case VMX_RETURN_LONGJMP: - return "longjmp"; - case VMX_RETURN_VMRESUME: - return "vmresume"; - case VMX_RETURN_VMLAUNCH: - return "vmlaunch"; - case VMX_RETURN_AST: - return "ast"; - default: - return "unknown"; - } -} - -#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ - VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ - (vmxctx)->regname) - -static void -vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) -{ - uint64_t host_rip, host_rsp; - - if (vmxctx != &vmx->ctx[vcpu]) - panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", - vmxctx, &vmx->ctx[vcpu]); - - VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); - VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", - vmx_setjmp_rc2str(rc), rc); - - host_rsp = host_rip = ~0; - vmread(VMCS_HOST_RIP, &host_rip); - vmread(VMCS_HOST_RSP, &host_rsp); - VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", - host_rip, host_rsp); - - SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); - - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); - SETJMP_TRACE(vmx, vcpu, vmxctx, 
guest_r15); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); -} -#endif -#else -static void __inline -vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) -{ - return; -} #endif /* KTR */ static int @@ -411,7 +474,7 @@ vmx_allow_x2apic_msrs(struct vmx *vmx) for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); - + for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); @@ -465,7 +528,11 @@ vpid_free(int vpid) */ if (vpid > VM_MAXCPU) +#ifdef __FreeBSD__ free_unr(vpid_unr, vpid); +#else + hma_vmx_vpid_free((uint16_t)vpid); +#endif } static void @@ -490,7 +557,14 @@ vpid_alloc(uint16_t *vpid, int num) * Allocate a unique VPID for each vcpu from the unit number allocator. */ for (i = 0; i < num; i++) { +#ifdef __FreeBSD__ x = alloc_unr(vpid_unr); +#else + uint16_t tmp; + + tmp = hma_vmx_vpid_alloc(); + x = (tmp == 0) ? -1 : tmp; +#endif if (x == -1) break; else @@ -519,6 +593,7 @@ vpid_alloc(uint16_t *vpid, int num) } } +#ifdef __FreeBSD__ static void vpid_init(void) { @@ -535,50 +610,6 @@ vpid_init(void) vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); } -#ifndef __FreeBSD__ -static void -msr_save_area_init(struct msr_entry *g_area, int *g_count) -{ - int cnt; - - static struct msr_entry guest_msrs[] = { - { MSR_KGSBASE, 0, 0 }, - { MSR_LSTAR, 0, 0 }, - { MSR_CSTAR, 0, 0 }, - { MSR_STAR, 0, 0 }, - { MSR_SF_MASK, 0, 0 }, - }; - - cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); - if (cnt > GUEST_MSR_MAX_ENTRIES) - panic("guest msr save area overrun"); - bcopy(guest_msrs, g_area, sizeof(guest_msrs)); - *g_count = cnt; -} - -static void -host_msr_save_area_init(struct msr_entry *h_area, int *h_count) -{ - int i, cnt; - - static struct msr_entry host_msrs[] = { - { MSR_LSTAR, 0, 0 }, - { MSR_CSTAR, 0, 0 }, - { MSR_STAR, 0, 0 }, - { MSR_SF_MASK, 0, 0 }, - }; - - cnt = sizeof(host_msrs) / sizeof(host_msrs[0]); - if (cnt > HOST_MSR_MAX_ENTRIES) - panic("host msr save area overrun"); - for (i = 0; i < cnt; i++) { - host_msrs[i].val = rdmsr(host_msrs[i].index); - } - bcopy(host_msrs, h_area, sizeof(host_msrs)); - *h_count = cnt; -} -#endif - static void vmx_disable(void *arg __unused) { @@ -603,17 +634,18 @@ vmx_disable(void *arg __unused) static int vmx_cleanup(void) { - -#ifdef __FreeBSD__ - if (pirvec != 0) - vmm_ipi_free(pirvec); -#endif + + if (pirvec >= 0) + lapic_ipi_free(pirvec); if (vpid_unr != NULL) { delete_unrhdr(vpid_unr); vpid_unr = NULL; } + if (nmi_flush_l1d_sw == 1) + nmi_flush_l1d_sw = 0; + smp_rendezvous(NULL, vmx_disable, NULL, NULL); return (0); @@ -636,40 +668,50 @@ vmx_enable(void *arg __unused) load_cr4(rcr4() | CR4_VMXE); *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); -#ifdef __FreeBSD__ error = vmxon(vmxon_region[curcpu]); -#else - error = vmxon_pa(vmxon_region_pa[curcpu]); - ASSERT(error == 0); -#endif if (error == 0) vmxon_enabled[curcpu] = 1; } +static void +vmx_restore(void) +{ + + if (vmxon_enabled[curcpu]) + vmxon(vmxon_region[curcpu]); +} +#else /* __FreeBSD__ */ static int -vmx_init(void) +vmx_cleanup(void) { -#define X86FSET_VMX 35 - extern uchar_t x86_featureset[]; - extern boolean_t is_x86_feature(void *featureset, uint_t feature); - int error; - uint64_t fixed0, fixed1, feature_control; - uint32_t tmp; -#ifndef __FreeBSD__ - int i; + /* This is taken care of by the hma registration */ + return (0); +} + +static void +vmx_restore(void) +{ + /* No-op on illumos */ +} +#endif /* __FreeBSD__ */ + +static int +vmx_init(int ipinum) +{ + int error, use_tpr_shadow; +#ifdef __FreeBSD__ + uint64_t basic, fixed0, 
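
/*
 * Illustrative aside (not part of the patch): the FreeBSD-only probe of
 * MSR_VMX_BASIC just below tests bit 54, which advertises instruction
 * information for INS/OUTS exits. The same bit test, compile-alone, on
 * a made-up MSR value:
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t basic = 1ULL << 54;    /* pretend rdmsr(MSR_VMX_BASIC) */

        if ((basic & (1ULL << 54)) == 0) {
                printf("no INS/OUTS decoding; vmx_init would fail EINVAL\n");
                return (1);
        }
        printf("INS/OUTS decoding supported\n");
        return (0);
}
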
fixed1, feature_control; +#else + uint64_t fixed0, fixed1; #endif + uint32_t tmp, procbased2_vid_bits; +#ifdef __FreeBSD__ /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ -#ifdef __FreeBSD__ if (!(cpu_feature2 & CPUID2_VMX)) { printf("vmx_init: processor does not support VMX operation\n"); return (ENXIO); } -#else - if (!is_x86_feature(x86_featureset, X86FSET_VMX)) { - cmn_err(CE_WARN, "vmx_init: processor does not support VMX operation\n"); - } -#endif /* * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits @@ -682,6 +724,18 @@ vmx_init(void) return (ENXIO); } + /* + * Verify capabilities MSR_VMX_BASIC: + * - bit 54 indicates support for INS/OUTS decoding + */ + basic = rdmsr(MSR_VMX_BASIC); + if ((basic & (1UL << 54)) == 0) { + printf("vmx_init: processor does not support desired basic " + "capabilities\n"); + return (EINVAL); + } +#endif /* __FreeBSD__ */ + /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, @@ -769,13 +823,119 @@ vmx_init(void) PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp) == 0); + cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, + &tmp) == 0); + + /* + * Check support for virtual interrupt delivery. + */ + procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | + PROCBASED2_VIRTUALIZE_X2APIC_MODE | + PROCBASED2_APIC_REGISTER_VIRTUALIZATION | + PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); + + use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, + &tmp) == 0); + + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + procbased2_vid_bits, 0, &tmp); + if (error == 0 && use_tpr_shadow) { + virtual_interrupt_delivery = 1; + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", + &virtual_interrupt_delivery); + } + + if (virtual_interrupt_delivery) { + procbased_ctls |= PROCBASED_USE_TPR_SHADOW; + procbased_ctls2 |= procbased2_vid_bits; + procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; + + /* + * No need to emulate accesses to %CR8 if virtual + * interrupt delivery is enabled. + */ + procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; + procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; + + /* + * Check for Posted Interrupts only if Virtual Interrupt + * Delivery is enabled. + */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, + &tmp); + if (error == 0) { +#ifdef __FreeBSD__ + pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); + if (pirvec < 0) { + if (bootverbose) { + printf("vmx_init: unable to allocate " + "posted interrupt vector\n"); + } + } else { + posted_interrupts = 1; + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", + &posted_interrupts); + } +#else + /* + * If the PSM-provided interfaces for requesting and + * using a PIR IPI vector are present, use them for + * posted interrupts. 
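
/*
 * Illustrative aside (not part of the patch): later in vmx_init() the
 * CR0/CR4 fixed-bit MSR pairs are stashed as masks -- bits set in both
 * fixed0 and fixed1 must be 1 under VMX, bits clear in both must be 0.
 * A compile-alone rendering of that mask math with invented MSR values:
 */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t fixed0 = 0x80000021ULL;        /* hypothetical fixed0 */
        uint64_t fixed1 = 0xFFFFFFFFULL;        /* hypothetical fixed1 */
        uint64_t ones_mask = fixed0 & fixed1;
        uint64_t zeros_mask = ~fixed0 & ~fixed1;

        printf("must-be-1: %#llx, must-be-0: %#llx\n",
            (unsigned long long)ones_mask, (unsigned long long)zeros_mask);
        return (0);
}
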
+ */ + if (psm_get_pir_ipivect != NULL && + psm_send_pir_ipi != NULL) { + pirvec = psm_get_pir_ipivect(); + posted_interrupts = 1; + } +#endif + } + } + + if (posted_interrupts) + pinbased_ctls |= PINBASED_POSTED_INTERRUPT; + /* Initialize EPT */ - error = ept_init(); + error = ept_init(ipinum); if (error) { printf("vmx_init: ept initialization failed (%d)\n", error); return (error); } +#ifdef __FreeBSD__ + guest_l1d_flush = (cpu_ia32_arch_caps & + IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0; + TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); + + /* + * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when + * available. Otherwise fall back to the software flush + * method which loads enough data from the kernel text to + * flush existing L1D content, both on VMX entry and on NMI + * return. + */ + if (guest_l1d_flush) { + if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { + guest_l1d_flush_sw = 1; + TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", + &guest_l1d_flush_sw); + } + if (guest_l1d_flush_sw) { + if (nmi_flush_l1d_sw <= 1) + nmi_flush_l1d_sw = 1; + } else { + msr_load_list[0].index = MSR_IA32_FLUSH_CMD; + msr_load_list[0].val = IA32_FLUSH_CMD_L1D; + } + } +#else + /* L1D flushing is taken care of by smt_acquire() and friends */ + guest_l1d_flush = 0; +#endif /* __FreeBSD__ */ + /* * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 */ @@ -801,24 +961,52 @@ vmx_init(void) cr4_ones_mask = fixed0 & fixed1; cr4_zeros_mask = ~fixed0 & ~fixed1; -#ifndef __FreeBSD__ - for (i = 0; i < MAXCPU; i++) { - vmxon_region_pa[i] = vtophys(&vmxon_region[i]); - } -#endif - +#ifdef __FreeBSD__ vpid_init(); +#endif vmx_msr_init(); +#ifdef __FreeBSD__ /* enable VMX operation */ smp_rendezvous(NULL, vmx_enable, NULL, NULL); +#endif vmx_initialized = 1; return (0); } +static void +vmx_trigger_hostintr(int vector) +{ +#ifdef __FreeBSD__ + uintptr_t func; + struct gate_descriptor *gd; + + gd = &idt[vector]; + + KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " + "invalid vector %d", vector)); + KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", + vector)); + KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " + "has invalid type %d", vector, gd->gd_type)); + KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " + "has invalid dpl %d", vector, gd->gd_dpl)); + KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " + "for vector %d has invalid selector %d", vector, gd->gd_selector)); + KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " + "IST %d", vector, gd->gd_ist)); + + func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); + vmx_call_isr(func); +#else + VERIFY(vector >= 32 && vector <= 255); + vmx_call_isr(vector - 32); +#endif /* __FreeBSD__ */ +} + static int vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) { @@ -852,15 +1040,14 @@ vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) static void * -vmx_vminit(struct vm *vm) +vmx_vminit(struct vm *vm, pmap_t pmap) { uint16_t vpid[VM_MAXCPU]; - int i, error, guest_msr_count; -#ifndef __FreeBSD__ - int host_msr_count; -#endif + int i, error; struct vmx *vmx; struct vmcs *vmcs; + uint32_t exc_bitmap; + uint16_t maxcpus; vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); if ((uintptr_t)vmx & PAGE_MASK) { @@ -869,6 +1056,8 @@ vmx_vminit(struct vm *vm) } vmx->vm = vm; + vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); + /* * 
Clean up EPTP-tagged guest physical and combined mappings * @@ -878,7 +1067,7 @@ vmx_vminit(struct vm *vm) * * Combined mappings for this EP4TA are also invalidated for all VPIDs. */ - ept_invalidate_mappings(vtophys(vmx->pml4ept)); + ept_invalidate_mappings(vmx->eptp); msr_bitmap_initialize(vmx->msr_bitmap); @@ -896,10 +1085,6 @@ vmx_vminit(struct vm *vm) * VM exit and entry respectively. It is also restored from the * host VMCS area on a VM exit. * - * MSR_PAT is saved and restored in the guest VMCS are on a VM exit - * and entry respectively. It is also restored from the host VMCS - * area on a VM exit. - * * The TSC MSR is exposed read-only. Writes are disallowed as * that will impact the host TSC. If the guest does a write * the "use TSC offsetting" execution control is enabled and the @@ -912,15 +1097,36 @@ vmx_vminit(struct vm *vm) guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || guest_msr_rw(vmx, MSR_EFER) || - guest_msr_rw(vmx, MSR_PAT) || guest_msr_ro(vmx, MSR_TSC)) panic("vmx_vminit: error setting guest msr access"); vpid_alloc(vpid, VM_MAXCPU); - for (i = 0; i < VM_MAXCPU; i++) { + if (virtual_interrupt_delivery) { + error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, + APIC_ACCESS_ADDRESS); + /* XXX this should really return an error to the caller */ + KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); + } + + maxcpus = vm_get_maxcpus(vm); + for (i = 0; i < maxcpus; i++) { +#ifndef __FreeBSD__ + /* + * Cache physical address lookups for various components which + * may be required inside the critical_enter() section implied + * by VMPTRLD() below. + */ + vm_paddr_t msr_bitmap_pa = vtophys(vmx->msr_bitmap); + vm_paddr_t apic_page_pa = vtophys(&vmx->apic_page[i]); + vm_paddr_t pir_desc_pa = vtophys(&vmx->pir_desc[i]); +#endif /* __FreeBSD__ */ + vmcs = &vmx->vmcs[i]; vmcs->identifier = vmx_revision(); +#ifndef __FreeBSD__ + vmcs->vmcs_pa = (uint64_t)vtophys(vmcs); +#endif error = vmclear(vmcs); if (error != 0) { panic("vmx_vminit: vmclear error %d on vcpu %d\n", @@ -929,42 +1135,83 @@ vmx_vminit(struct vm *vm) vmx_msr_guest_init(vmx, i); - error = vmcs_set_defaults(vmcs, - (u_long)vmx_longjmp, - (u_long)&vmx->ctx[i], - vtophys(vmx->pml4ept), - pinbased_ctls, - procbased_ctls, - procbased_ctls2, - exit_ctls, entry_ctls, - vtophys(vmx->msr_bitmap), - vpid[i]); - - if (error != 0) - panic("vmx_vminit: vmcs_set_defaults error %d", error); + error = vmcs_init(vmcs); + KASSERT(error == 0, ("vmcs_init error %d", error)); - vmx->cap[i].set = 0; - vmx->cap[i].proc_ctls = procbased_ctls; + VMPTRLD(vmcs); + error = 0; +#ifdef __FreeBSD__ + /* + * The illumos vmx_enter_guest implementation avoids some of + * the %rsp-manipulation games which are present in the stock + * one from FreeBSD. 
+ */ + error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); +#endif + error += vmwrite(VMCS_EPTP, vmx->eptp); + error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); + error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); + error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); + error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); + error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); +#ifdef __FreeBSD__ + error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); +#else + error += vmwrite(VMCS_MSR_BITMAP, msr_bitmap_pa); +#endif + error += vmwrite(VMCS_VPID, vpid[i]); + + if (guest_l1d_flush && !guest_l1d_flush_sw) { + vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( + (vm_offset_t)&msr_load_list[0])); + vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, + nitems(msr_load_list)); + vmcs_write(VMCS_EXIT_MSR_STORE, 0); + vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); + } - vmx->state[i].lastcpu = -1; - vmx->state[i].vpid = vpid[i]; + /* exception bitmap */ + if (vcpu_trace_exceptions(vm, i)) + exc_bitmap = 0xffffffff; + else + exc_bitmap = 1 << IDT_MC; + error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); -#ifndef __FreeBSD__ - msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); + vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; + error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); - error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]), - guest_msr_count); - if (error != 0) - panic("vmcs_set_msr_save error %d", error); + if (virtual_interrupt_delivery) { + error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); +#ifdef __FreeBSD__ + error += vmwrite(VMCS_VIRTUAL_APIC, + vtophys(&vmx->apic_page[i])); +#else + error += vmwrite(VMCS_VIRTUAL_APIC, apic_page_pa); +#endif + error += vmwrite(VMCS_EOI_EXIT0, 0); + error += vmwrite(VMCS_EOI_EXIT1, 0); + error += vmwrite(VMCS_EOI_EXIT2, 0); + error += vmwrite(VMCS_EOI_EXIT3, 0); + } + if (posted_interrupts) { + error += vmwrite(VMCS_PIR_VECTOR, pirvec); +#ifdef __FreeBSD__ + error += vmwrite(VMCS_PIR_DESC, + vtophys(&vmx->pir_desc[i])); +#else + error += vmwrite(VMCS_PIR_DESC, pir_desc_pa); +#endif + } + VMCLEAR(vmcs); + KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); - host_msr_save_area_init(vmx->host_msrs[i], &host_msr_count); + vmx->cap[i].set = 0; + vmx->cap[i].proc_ctls = procbased_ctls; + vmx->cap[i].proc_ctls2 = procbased_ctls2; - error = vmcs_set_host_msr_save(&vmx->vmcs[i], - vtophys(vmx->host_msrs[i]), - host_msr_count); - if (error != 0) - panic("vmcs_set_msr_save error %d", error); -#endif + vmx->state[i].nextrip = ~0; + vmx->state[i].lastcpu = NOCPU; + vmx->state[i].vpid = vpid[i]; /* * Set up the CR0/4 shadows, and init the read shadow @@ -979,6 +1226,8 @@ vmx_vminit(struct vm *vm) error = vmx_setup_cr4_shadow(vmcs, 0); if (error != 0) panic("vmx_setup_cr4_shadow %d", error); + + vmx->ctx[i].pmap = pmap; } return (vmx); @@ -987,9 +1236,13 @@ vmx_vminit(struct vm *vm) static int vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) { +#ifdef __FreeBSD__ int handled, func; - + func = vmxctx->guest_rax; +#else + int handled; +#endif handled = x86_emulate_cpuid(vm, vcpu, (uint32_t*)(&vmxctx->guest_rax), @@ -1016,6 +1269,8 @@ vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, handled ? 
"handled" : "unhandled", exit_reason_to_str(exit_reason), rip); #endif + DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, rip, + uint32_t, exit_reason); } static __inline void @@ -1026,36 +1281,40 @@ vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) #endif } -static void -vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) +static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); +static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); + +/* + * Invalidate guest mappings identified by its vpid from the TLB. + */ +static __inline void +vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) { struct vmxstate *vmxstate; - struct invvpid_desc invvpid_desc = { 0 }; -#ifndef __FreeBSD__ - desctbr_t idtr, gdtr; -#endif + struct invvpid_desc invvpid_desc; vmxstate = &vmx->state[vcpu]; - vmcs_write(VMCS_HOST_FS_BASE, vmm_get_host_fsbase()); - if (vmxstate->lastcpu == curcpu) + if (vmxstate->vpid == 0) return; - vmxstate->lastcpu = curcpu; - - vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); - - vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); - vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); - vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + if (!running) { + /* + * Set the 'lastcpu' to an invalid host cpu. + * + * This will invalidate TLB entries tagged with the vcpu's + * vpid the next time it runs via vmx_set_pcpu_defaults(). + */ + vmxstate->lastcpu = NOCPU; + return; + } -#ifndef __FreeBSD__ - vmcs_write(VMCS_HOST_IA32_SYSENTER_CS, rdmsr(MSR_SYSENTER_CS_MSR)); - vmcs_write(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(MSR_SYSENTER_ESP_MSR)); - vmcs_write(VMCS_HOST_IA32_SYSENTER_EIP, rdmsr(MSR_SYSENTER_EIP_MSR)); +#ifdef __FreeBSD__ + KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " + "critical section", __func__, vcpu)); #endif /* - * If we are using VPIDs then invalidate all mappings tagged with 'vpid' + * Invalidate all mappings tagged with 'vpid' * * We do this because this vcpu was executing on a different host * cpu when it last ran. We do not track whether it invalidated @@ -1069,29 +1328,70 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) * Note also that this will invalidate mappings tagged with 'vpid' * for "all" EP4TAs. */ - if (vmxstate->vpid != 0) { + if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { + invvpid_desc._res1 = 0; + invvpid_desc._res2 = 0; invvpid_desc.vpid = vmxstate->vpid; + invvpid_desc.linear_addr = 0; invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); + } else { + /* + * The invvpid can be skipped if an invept is going to + * be performed before entering the guest. The invept + * will invalidate combined mappings tagged with + * 'vmx->eptp' for all vpids. + */ + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); } } -static void -vm_exit_update_rip(struct vm_exit *vmexit) +static void +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) { - int error; - - error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); - if (error) - panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); -} + struct vmxstate *vmxstate; -/* - * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. - */ -CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); +#ifndef __FreeBSD__ + /* + * Regardless of whether the VM appears to have migrated between CPUs, + * save the host sysenter stack pointer. 
As it points to the kernel + * stack of each thread, the correct value must be maintained for every + * trip into the critical section. + */ + vmcs_write(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(MSR_SYSENTER_ESP_MSR)); -static void __inline -vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) + /* + * Perform any needed TSC_OFFSET adjustment based on TSC_MSR writes or + * migration between host CPUs with differing TSC values. + */ + VERIFY0(vmx_apply_tsc_adjust(vmx, vcpu)); +#endif + + vmxstate = &vmx->state[vcpu]; + if (vmxstate->lastcpu == curcpu) + return; + + vmxstate->lastcpu = curcpu; + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + +#ifndef __FreeBSD__ + /* Load the per-CPU IDT address */ + vmcs_write(VMCS_HOST_IDTR_BASE, vmm_get_host_idtrbase()); +#endif + vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); + vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); + vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + vmx_invvpid(vmx, vcpu, pmap, 1); +} + +/* + * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set. + */ +CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); + +static __inline void +vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) { if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) { @@ -1101,23 +1401,18 @@ vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) } } -static void __inline +static __inline void vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) { -#ifdef __FreeBSD__ KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); -#else - KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, - ("intr_window_exiting not set: %x", vmx->cap[vcpu].proc_ctls)); -#endif vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); } -static void __inline +static __inline void vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) { @@ -1128,22 +1423,18 @@ vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) } } -static void __inline +static __inline void vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) { -#ifdef __FreeBSD__ KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); -#else - KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, - ("nmi_window_exiting not set %x", vmx->cap[vcpu].proc_ctls)); -#endif vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); } +#ifdef __FreeBSD__ int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) { @@ -1159,34 +1450,55 @@ vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) return (error); } +#else /* __FreeBSD__ */ +/* + * Set the TSC adjustment, taking into account the offsets measured between + * host physical CPUs. This is required even if the guest has not set a TSC + * offset since vCPUs inherit the TSC offset of whatever physical CPU it has + * migrated onto. Without this mitigation, un-synched host TSCs will convey + * the appearance of TSC time-travel to the guest as its vCPUs migrate. 
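
/*
 * Illustrative aside (not part of the patch): vmx_apply_tsc_adjust(),
 * defined just below, folds the guest-requested offset together with the
 * host CPU's measured TSC delta so a migrating vCPU never sees time move
 * backwards. The arithmetic, compile-alone, with invented numbers:
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t
target_tsc_offset(uint64_t guest_offset, int64_t host_cpu_delta)
{
        /* both components land in the single VMCS TSC_OFFSET field */
        return (guest_offset + (uint64_t)host_cpu_delta);
}

int
main(void)
{
        /* guest asked for +1000 ticks; this host CPU is 250 ticks behind */
        printf("TSC_OFFSET = %llu\n",
            (unsigned long long)target_tsc_offset(1000, 250));
        return (0);
}
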
+ */ +static int +vmx_apply_tsc_adjust(struct vmx *vmx, int vcpu) +{ + extern hrtime_t tsc_gethrtime_tick_delta(void); + const uint64_t target_offset = (vcpu_tsc_offset(vmx->vm, vcpu) + + (uint64_t)tsc_gethrtime_tick_delta()); + int error = 0; + + ASSERT(vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET); + + if (vmx->tsc_offset_active[vcpu] != target_offset) { + error = vmwrite(VMCS_TSC_OFFSET, target_offset); + vmx->tsc_offset_active[vcpu] = target_offset; + } + + return (error); +} +#endif /* __FreeBSD__ */ #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) +#ifndef __FreeBSD__ +static uint32_t +vmx_inject_nmi(struct vmx *vmx, int vcpu) +#else static void vmx_inject_nmi(struct vmx *vmx, int vcpu) +#endif { uint32_t gi, info; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); -#ifdef __FreeBSD__ KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " "interruptibility-state %#x", gi)); -#else - KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " - "interruptibility-state %x", gi)); -#endif info = vmcs_read(VMCS_ENTRY_INTR_INFO); -#ifdef __FreeBSD__ KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " "VM-entry interruption information %#x", info)); -#else - KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " - "VM-entry interruption information %x", info)); -#endif /* * Inject the virtual NMI. The vector must be the NMI IDT entry @@ -1199,32 +1511,220 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu) /* Clear the request */ vm_nmi_clear(vmx->vm, vcpu); + +#ifndef __FreeBSD__ + return (info); +#endif } +#ifndef __FreeBSD__ static void -vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, + uint64_t guestrip) { - int vector, need_nmi_exiting, extint_pending; - uint64_t rflags, entryinfo; + uint64_t entryinfo, rflags; uint32_t gi, info; + int vector; + boolean_t extint_pending = B_FALSE; + + vlapic_tmr_update(vlapic); + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + + if (vmx->state[vcpu].nextrip != guestrip && + (gi & HWINTR_BLOCKING) != 0) { + VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vmx->state[vcpu].nextrip, guestrip); + gi &= ~HWINTR_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); + } + + /* + * It could be that an interrupt is already pending for injection from + * the VMCS. This would be the case if the vCPU exited for conditions + * such as an AST before a vm-entry delivered the injection. + */ + if ((info & VMCS_INTR_VALID) != 0) { + goto cantinject; + } if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { -#ifdef __FreeBSD__ KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " "intinfo is not valid: %#lx", __func__, entryinfo)); + + KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " + "pending exception: %#lx/%#x", __func__, entryinfo, info)); + + info = entryinfo; + vector = info & 0xff; + if (vector == IDT_BP || vector == IDT_OF) { + /* + * VT-x requires #BP and #OF to be injected as software + * exceptions. 
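
/*
 * Illustrative aside (not part of the patch): the injection code here
 * builds the VM-entry interruption-information field from a vector in
 * bits 7:0, an event type in bits 10:8, and a valid bit at 31; #BP and
 * #OF must carry the software-exception type. Encodings per the Intel
 * SDM; the macro names below are local stand-ins.
 */
#include <stdint.h>
#include <stdio.h>

#define INTR_VALID              (1u << 31)
#define INTR_T_HWINTR           (0u << 8)
#define INTR_T_NMI              (2u << 8)
#define INTR_T_SWEXCEPTION      (6u << 8)

static uint32_t
mk_entry_intr_info(uint32_t type, uint8_t vector)
{
        return (INTR_VALID | type | vector);
}

int
main(void)
{
        printf("NMI (vector 2): %#x\n", mk_entry_intr_info(INTR_T_NMI, 2));
        printf("#BP (vector 3): %#x\n",
            mk_entry_intr_info(INTR_T_SWEXCEPTION, 3));
        return (0);
}
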
+ */ + info &= ~VMCS_INTR_T_MASK; + info |= VMCS_INTR_T_SWEXCEPTION; + } + + if (info & VMCS_INTR_DEL_ERRCODE) + vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); + + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + } + + if (vm_nmi_pending(vmx->vm, vcpu)) { + int need_nmi_exiting = 1; + + /* + * If there are no conditions blocking NMI injection then + * inject it directly here otherwise enable "NMI window + * exiting" to inject it as soon as we can. + * + * We also check for STI_BLOCKING because some implementations + * don't allow NMI injection in this case. If we are running + * on a processor that doesn't have this restriction it will + * immediately exit and the NMI will be injected in the + * "NMI window exiting" handler. + */ + if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { + if ((info & VMCS_INTR_VALID) == 0) { + info = vmx_inject_nmi(vmx, vcpu); + need_nmi_exiting = 0; + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " + "due to VM-entry intr info %#x", info); + } + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " + "Guest Interruptibility-state %#x", gi); + } + + if (need_nmi_exiting) { + vmx_set_nmi_window_exiting(vmx, vcpu); + return; + } + } + + /* Check the AT-PIC and APIC for interrupts. */ + if (vm_extint_pending(vmx->vm, vcpu)) { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(vmx->vm, &vector); + extint_pending = B_TRUE; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [0,255] can be delivered + * through the INTR pin. + */ + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } else if (!virtual_interrupt_delivery) { + /* Ask the local apic for a vector to inject */ + if (!vlapic_pending_intr(vlapic, &vector)) + return; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [16,255] can be delivered + * through the local APIC. + */ + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* No futher injection needed */ + return; + } + + /* + * Verify that the guest is interruptable and the above logic has not + * already queued an event for injection. + */ + if ((gi & HWINTR_BLOCKING) != 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "Guest Interruptibility-state %#x", vector, gi); + goto cantinject; + } + if ((info & VMCS_INTR_VALID) != 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "VM-entry intr info %#x", vector, info); + goto cantinject; + } + rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if ((rflags & PSL_I) == 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, rflags); + goto cantinject; + } + + /* Inject the interrupt */ + info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; + info |= vector; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + if (extint_pending) { + vm_extint_clear(vmx->vm, vcpu); + vatpic_intr_accepted(vmx->vm, vector); + + /* + * After we accepted the current ExtINT the PIC may + * have posted another one. If that is the case, set + * the Interrupt Window Exiting execution control so + * we can inject that one too. + * + * Also, interrupt window exiting allows us to inject any + * pending APIC vector that was preempted by the ExtINT + * as soon as possible. This applies both for the software + * emulated vlapic and the hardware assisted virtual APIC. 
+ */ + vmx_set_int_window_exiting(vmx, vcpu); + } else { + /* Update the Local APIC ISR */ + vlapic_intr_accepted(vlapic, vector); + } + + VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. + */ + vmx_set_int_window_exiting(vmx, vcpu); +} #else +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, + uint64_t guestrip) +{ + int vector, need_nmi_exiting, extint_pending; + uint64_t rflags, entryinfo; + uint32_t gi, info; + + vlapic_tmr_update(vlapic); + + if (vmx->state[vcpu].nextrip != guestrip) { + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vmx->state[vcpu].nextrip, guestrip); + gi &= ~HWINTR_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); + } + } + + if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " - "intinfo is not valid: %lx", __func__, entryinfo)); -#endif + "intinfo is not valid: %#lx", __func__, entryinfo)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); -#ifdef __FreeBSD__ KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " "pending exception: %#lx/%#x", __func__, entryinfo, info)); -#else - KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " - "pending exception: %lx/%x", __func__, entryinfo, info)); -#endif info = entryinfo; vector = info & 0xff; @@ -1277,12 +1777,10 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) extint_pending = vm_extint_pending(vmx->vm, vcpu); -#ifdef __FreeBSD__ if (!extint_pending && virtual_interrupt_delivery) { vmx_inject_pir(vlapic); return; } -#endif /* * If interrupt-window exiting is already in effect then don't bother @@ -1388,6 +1886,7 @@ cantinject: */ vmx_set_int_window_exiting(vmx, vcpu); } +#endif /* __FreeBSD__ */ /* * If the Virtual NMIs execution control is '1' then the logical processor @@ -1420,6 +1919,92 @@ vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } +static void +vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, + ("NMI blocking is not in effect %#x", gi)); +} + +static int +vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + struct vmxctx *vmxctx; + uint64_t xcrval; + const struct xsave_limits *limits; + + vmxctx = &vmx->ctx[vcpu]; + limits = vmm_get_xsave_limits(); + + /* + * Note that the processor raises a GP# fault on its own if + * xsetbv is executed for CPL != 0, so we do not have to + * emulate that fault here. + */ + + /* Only xcr0 is supported. */ + if (vmxctx->guest_rcx != 0) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ + if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { + vm_inject_ud(vmx->vm, vcpu); + return (HANDLED); + } + + xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); + if ((xcrval & ~limits->xcr0_allowed) != 0) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + if (!(xcrval & XFEATURE_ENABLED_X87)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* AVX (YMM_Hi128) requires SSE. 
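
/*
 * Illustrative aside (not part of the patch): the surrounding
 * vmx_emulate_xsetbv() rebuilds XCR0 from guest %edx:%eax and rejects
 * inconsistent feature sets before loading it. A condensed model of two
 * of those rules (x87 mandatory, AVX requires SSE); bit positions per
 * the SDM, helper names invented.
 */
#include <stdint.h>
#include <stdio.h>

#define XFEAT_X87       (1ULL << 0)
#define XFEAT_SSE       (1ULL << 1)
#define XFEAT_AVX       (1ULL << 2)

static int
xcr0_valid(uint64_t xcrval)
{
        if ((xcrval & XFEAT_X87) == 0)
                return (0);     /* x87 state may never be disabled */
        if ((xcrval & XFEAT_AVX) != 0 && (xcrval & XFEAT_SSE) == 0)
                return (0);     /* AVX (YMM_Hi128) requires SSE */
        return (1);
}

int
main(void)
{
        uint64_t rdx = 0, rax = XFEAT_X87 | XFEAT_SSE | XFEAT_AVX;
        uint64_t xcrval = rdx << 32 | (rax & 0xffffffff);

        printf("xcr0 %#llx valid=%d\n",
            (unsigned long long)xcrval, xcr0_valid(xcrval));
        return (0);
}
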
*/ + if (xcrval & XFEATURE_ENABLED_AVX && + (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, + * ZMM_Hi256, and Hi16_ZMM. + */ + if (xcrval & XFEATURE_AVX512 && + (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != + (XFEATURE_AVX512 | XFEATURE_AVX)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * Intel MPX requires both bound register state flags to be + * set. + */ + if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != + ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * This runs "inside" vmrun() with the guest's FPU state, so + * modifying xcr0 directly modifies the guest's xcr0, not the + * host's. + */ + load_xcr(0, xcrval); + return (HANDLED); +} + static uint64_t vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) { @@ -1734,6 +2319,7 @@ vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->inst_length = 0; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = gla; vmx_paging_info(paging); @@ -1799,6 +2385,189 @@ ept_emulation_fault(uint64_t ept_qual) return (TRUE); } +static __inline int +apic_access_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); +} + +static __inline int +x2apic_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); +} + +static int +vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, + uint64_t qual) +{ + int error, handled, offset; + uint32_t *apic_regs, vector; + bool retu; + + handled = HANDLED; + offset = APIC_WRITE_OFFSET(qual); + + if (!apic_access_virtualization(vmx, vcpuid)) { + /* + * In general there should not be any APIC write VM-exits + * unless APIC-access virtualization is enabled. + * + * However self-IPI virtualization can legitimately trigger + * an APIC-write VM-exit so treat it specially. + */ + if (x2apic_virtualization(vmx, vcpuid) && + offset == APIC_OFFSET_SELF_IPI) { + apic_regs = (uint32_t *)(vlapic->apic_page); + vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; + vlapic_self_ipi_handler(vlapic, vector); + return (HANDLED); + } else + return (UNHANDLED); + } + + switch (offset) { + case APIC_OFFSET_ID: + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_LDR: + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + retu = false; + error = vlapic_icrlo_write_handler(vlapic, &retu); + if (error != 0 || retu) + handled = UNHANDLED; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + vlapic_lvt_write_handler(vlapic, offset); + break; + case APIC_OFFSET_TIMER_ICR: + vlapic_icrtmr_write_handler(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + vlapic_dcr_write_handler(vlapic); + break; + default: + handled = UNHANDLED; + break; + } + return (handled); +} + +static bool +apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) +{ + + if (apic_access_virtualization(vmx, vcpuid) && + (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) + return (true); + else + return (false); +} + +static int +vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) +{ + uint64_t qual; + int access_type, offset, allowed; + + if (!apic_access_virtualization(vmx, vcpuid)) + return (UNHANDLED); + + qual = vmexit->u.vmx.exit_qualification; + access_type = APIC_ACCESS_TYPE(qual); + offset = APIC_ACCESS_OFFSET(qual); + + allowed = 0; + if (access_type == 0) { + /* + * Read data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } else if (access_type == 1) { + /* + * Write data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } + + if (allowed) { + vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, + VIE_INVALID_GLA); + } + + /* + * Regardless of whether the APIC-access is allowed this handler + * always returns UNHANDLED: + * - if the access is allowed then it is handled by emulating the + * instruction that caused the VM-exit (outside the critical section) + * - if the access is not allowed then it will be converted to an + * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 
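+	 * In the allowed case the emulation is requested through the
+	 * VM_EXITCODE_INST_EMUL exit prepared by vmexit_inst_emul() above.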
+ */ + return (UNHANDLED); +} + +static enum task_switch_reason +vmx_task_switch_reason(uint64_t qual) +{ + int reason; + + reason = (qual >> 30) & 0x3; + switch (reason) { + case 0: + return (TSR_CALL); + case 1: + return (TSR_IRET); + case 2: + return (TSR_JMP); + case 3: + return (TSR_IDT_GATE); + default: + panic("%s: invalid reason %d", __func__, reason); + } +} + static int emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) { @@ -1839,31 +2608,150 @@ emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) return (error); } +#ifndef __FreeBSD__ +#define __predict_false(x) (x) +#endif + static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { - int error, handled, in; - struct vmcs *vmcs; + int error, errcode, errcode_valid, handled, in; struct vmxctx *vmxctx; + struct vlapic *vlapic; struct vm_inout_str *vis; - uint32_t eax, ecx, edx, idtvec_info, intr_info, inst_info; - uint64_t qual, gla, gpa, cr3; + struct vm_task_switch *ts; + uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; + uint32_t intr_type, intr_vec, reason; + uint64_t exitintinfo, qual, gpa; bool retu; CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); handled = UNHANDLED; - vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; + qual = vmexit->u.vmx.exit_qualification; + reason = vmexit->u.vmx.exit_reason; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); + SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); + + /* + * VM-entry failures during or after loading guest state. + * + * These VM-exits are uncommon but must be handled specially + * as most VM-exit fields are not populated as usual. + */ + if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { + VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); +#ifdef __FreeBSD__ + __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif + return (1); + } - switch (vmexit->u.vmx.exit_reason) { + /* + * VM exits that can be triggered during event delivery need to + * be handled specially by re-injecting the event if the IDT + * vectoring information field's valid bit is set. + * + * See "Information for VM Exits During Event Delivery" in Intel SDM + * for details. + */ + idtvec_info = vmcs_idt_vectoring_info(); + if (idtvec_info & VMCS_IDT_VEC_VALID) { + idtvec_info &= ~(1 << 12); /* clear undefined bit */ + exitintinfo = idtvec_info; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + idtvec_err = vmcs_idt_vectoring_err(); + exitintinfo |= (uint64_t)idtvec_err << 32; + } + error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); + KASSERT(error == 0, ("%s: vm_set_intinfo error %d", + __func__, error)); + + /* + * If 'virtual NMIs' are being used and the VM-exit + * happened while injecting an NMI during the previous + * VM-entry, then clear "blocking by NMI" in the + * Guest Interruptibility-State so the NMI can be + * reinjected on the subsequent VM-entry. + * + * However, if the NMI was being delivered through a task + * gate, then the new task must start execution with NMIs + * blocked so don't clear NMI blocking in this case. 
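+		 * That is why vmx_assert_nmi_blocking() is used below instead
+		 * of vmx_clear_nmi_blocking() when the exit reason is a task
+		 * switch.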
+ */ + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type == VMCS_INTR_T_NMI) { + if (reason != EXIT_REASON_TASK_SWITCH) + vmx_clear_nmi_blocking(vmx, vcpu); + else + vmx_assert_nmi_blocking(vmx, vcpu); + } + + /* + * Update VM-entry instruction length if the event being + * delivered was a software interrupt or software exception. + */ + if (intr_type == VMCS_INTR_T_SWINTR || + intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || + intr_type == VMCS_INTR_T_SWEXCEPTION) { + vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); + } + } + + switch (reason) { + case EXIT_REASON_TASK_SWITCH: + ts = &vmexit->u.task_switch; + ts->tsssel = qual & 0xffff; + ts->reason = vmx_task_switch_reason(qual); + ts->ext = 0; + ts->errcode_valid = 0; + vmx_paging_info(&ts->paging); + /* + * If the task switch was due to a CALL, JMP, IRET, software + * interrupt (INT n) or software exception (INT3, INTO), + * then the saved %rip references the instruction that caused + * the task switch. The instruction length field in the VMCS + * is valid in this case. + * + * In all other cases (e.g., NMI, hardware exception) the + * saved %rip is one that would have been saved in the old TSS + * had the task switch completed normally so the instruction + * length field is not needed in this case and is explicitly + * set to 0. + */ + if (ts->reason == TSR_IDT_GATE) { + KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, + ("invalid idtvec_info %#x for IDT task switch", + idtvec_info)); + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type != VMCS_INTR_T_SWINTR && + intr_type != VMCS_INTR_T_SWEXCEPTION && + intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { + /* Task switch triggered by external event */ + ts->ext = 1; + vmexit->inst_length = 0; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + ts->errcode_valid = 1; + ts->errcode = vmcs_idt_vectoring_err(); + } + } + } + vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; + SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); + VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " + "%s errcode 0x%016lx", ts->reason, ts->tsssel, + ts->ext ? 
"external" : "internal", + ((uint64_t)ts->errcode << 32) | ts->errcode_valid); + break; case EXIT_REASON_CR_ACCESS: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); + SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); switch (qual & 0xf) { case 0: handled = vmx_emulate_cr0_access(vmx, vcpu, qual); @@ -1881,6 +2769,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) retu = false; ecx = vmxctx->guest_rcx; VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); + SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); error = emulate_rdmsr(vmx, vcpu, ecx, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_RDMSR; @@ -1901,6 +2790,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) edx = vmxctx->guest_rdx; VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", ecx, (uint64_t)edx << 32 | eax); + SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, + (uint64_t)edx << 32 | eax); error = emulate_wrmsr(vmx, vcpu, ecx, (uint64_t)edx << 32 | eax, &retu); if (error) { @@ -1917,19 +2808,29 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); + SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if (virtual_interrupt_delivery) + vmexit->u.hlt.intr_status = + vmcs_read(VMCS_GUEST_INTR_STATUS); + else + vmexit->u.hlt.intr_status = 0; break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); + SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MTRAP; + vmexit->inst_length = 0; break; case EXIT_REASON_PAUSE: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); + SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_PAUSE; break; case EXIT_REASON_INTR_WINDOW: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); + SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); vmx_clear_int_window_exiting(vmx, vcpu); return (1); case EXIT_REASON_EXT_INTR: @@ -1943,6 +2844,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) * this virtual interrupt during the subsequent VM enter. */ intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + SDT_PROBE4(vmm, vmx, exit, interrupt, + vmx, vcpu, vmexit, intr_info); /* * XXX: Ignore this exit if VMCS_INTR_VALID is not set. @@ -1950,18 +2853,10 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) */ if (!(intr_info & VMCS_INTR_VALID)) return (1); -#ifdef __FreeBSD__ KASSERT((intr_info & VMCS_INTR_VALID) != 0 && (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, ("VM exit interruption info invalid: %#x", intr_info)); -#else - KASSERT((intr_info & VMCS_INTR_VALID) != 0 && - (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, - ("VM exit interruption info invalid: %x", intr_info)); -#endif -#if 0 /* XXX */ vmx_trigger_hostintr(intr_info & 0xff); -#endif /* * This is special. 
We want to treat this as an 'handled' @@ -1970,6 +2865,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); return (1); case EXIT_REASON_NMI_WINDOW: + SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); /* Exit to allow the pending virtual NMI to be injected */ if (vm_nmi_pending(vmx->vm, vcpu)) vmx_inject_nmi(vmx, vcpu); @@ -1997,21 +2893,21 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vis->addrsize = inout_str_addrsize(inst_info); inout_str_seginfo(vmx, vcpu, inst_info, in, vis); } + SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); break; case EXIT_REASON_CPUID: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); + SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EXCEPTION: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); -#ifdef __FreeBSD__ KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); -#else - KASSERT((intr_info & VMCS_INTR_VALID) != 0, - ("VM exit interruption info invalid: %x", intr_info)); -#endif + + intr_vec = intr_info & 0xff; + intr_type = intr_info & VMCS_INTR_T_MASK; /* * If Virtual NMIs control is 1 and the VM-exit is due to a @@ -2020,26 +2916,147 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) * the guest. * * See "Resuming Guest Software after Handling an Exception". + * See "Information for VM Exits Due to Vectored Events". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && - (intr_info & 0xff) != IDT_DF && + (intr_vec != IDT_DF) && (intr_info & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); /* * The NMI has already been handled in vmx_exit_handle_nmi(). */ - if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) + if (intr_type == VMCS_INTR_T_NMI) return (1); - break; + + /* + * Call the machine check handler by hand. Also don't reflect + * the machine check back into the guest. + */ + if (intr_vec == IDT_MC) { + VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); +#ifdef __FreeBSD__ + __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif + return (1); + } + + if (intr_vec == IDT_PF) { + error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); + KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", + __func__, error)); + } + + /* + * Software exceptions exhibit trap-like behavior. This in + * turn requires populating the VM-entry instruction length + * so that the %rip in the trap frame is past the INT3/INTO + * instruction. + */ + if (intr_type == VMCS_INTR_T_SWEXCEPTION) + vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); + + /* Reflect all other exceptions back into the guest */ + errcode_valid = errcode = 0; + if (intr_info & VMCS_INTR_DEL_ERRCODE) { + errcode_valid = 1; + errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); + } + VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " + "the guest", intr_vec, errcode); + SDT_PROBE5(vmm, vmx, exit, exception, + vmx, vcpu, vmexit, intr_vec, errcode); + error = vm_inject_exception(vmx->vm, vcpu, intr_vec, + errcode_valid, errcode, 0); + KASSERT(error == 0, ("%s: vm_inject_exception error %d", + __func__, error)); + return (1); + case EXIT_REASON_EPT_FAULT: + /* + * If 'gpa' lies within the address space allocated to + * memory then this must be a nested page fault otherwise + * this must be an instruction that accesses MMIO space. 
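+		 * A fault on the virtualized APIC-access page is treated as
+		 * a nested page fault as well; see apic_access_fault().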
+ */ gpa = vmcs_gpa(); - if (ept_emulation_fault(qual)) { + if (vm_mem_allocated(vmx->vm, vcpu, gpa) || + apic_access_fault(vmx, vcpu, gpa)) { + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->inst_length = 0; + vmexit->u.paging.gpa = gpa; + vmexit->u.paging.fault_type = ept_fault_type(qual); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); + SDT_PROBE5(vmm, vmx, exit, nestedfault, + vmx, vcpu, vmexit, gpa, qual); + } else if (ept_emulation_fault(qual)) { vmexit_inst_emul(vmexit, gpa, vmcs_gla()); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); + SDT_PROBE4(vmm, vmx, exit, mmiofault, + vmx, vcpu, vmexit, gpa); } + /* + * If Virtual NMIs control is 1 and the VM-exit is due to an + * EPT fault during the execution of IRET then we must restore + * the state of "virtual-NMI blocking" before resuming. + * + * See description of "NMI unblocking due to IRET" in + * "Exit Qualification for EPT Violations". + */ + if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && + (qual & EXIT_QUAL_NMIUDTI) != 0) + vmx_restore_nmi_blocking(vmx, vcpu); + break; + case EXIT_REASON_VIRTUALIZED_EOI: + vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; + vmexit->u.ioapic_eoi.vector = qual & 0xFF; + SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); + vmexit->inst_length = 0; /* trap-like */ + break; + case EXIT_REASON_APIC_ACCESS: + SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); + handled = vmx_handle_apic_access(vmx, vcpu, vmexit); + break; + case EXIT_REASON_APIC_WRITE: + /* + * APIC-write VM exit is trap-like so the %rip is already + * pointing to the next instruction. + */ + vmexit->inst_length = 0; + vlapic = vm_lapic(vmx->vm, vcpu); + SDT_PROBE4(vmm, vmx, exit, apicwrite, + vmx, vcpu, vmexit, vlapic); + handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); + break; + case EXIT_REASON_XSETBV: + SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); + handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); + break; + case EXIT_REASON_MONITOR: + SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_MONITOR; + break; + case EXIT_REASON_MWAIT: + SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_MWAIT; + break; + case EXIT_REASON_VMCALL: + case EXIT_REASON_VMCLEAR: + case EXIT_REASON_VMLAUNCH: + case EXIT_REASON_VMPTRLD: + case EXIT_REASON_VMPTRST: + case EXIT_REASON_VMREAD: + case EXIT_REASON_VMRESUME: + case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMXOFF: + case EXIT_REASON_VMXON: + SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_VMINSN; break; default: + SDT_PROBE4(vmm, vmx, exit, unknown, + vmx, vcpu, vmexit, reason); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } @@ -2055,17 +3072,9 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) * the one we just processed. Therefore we update the * guest rip in the VMCS and in 'vmexit'. */ - vm_exit_update_rip(vmexit); vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; - - /* - * Special case for spinning up an AP - exit to userspace to - * give the controlling process a chance to intercept and - * spin up a thread for the AP. 
- */ - if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP) - handled = 0; + vmcs_write(VMCS_GUEST_RIP, vmexit->rip); } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* @@ -2083,91 +3092,340 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) */ } } - return (handled); + + SDT_PROBE4(vmm, vmx, exit, return, + vmx, vcpu, vmexit, handled); + return (handled); +} + +static void +vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) +{ + + KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, + ("vmx_exit_inst_error: invalid inst_fail_status %d", + vmxctx->inst_fail_status)); + + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = vmxctx->inst_fail_status; + vmexit->u.vmx.inst_error = vmcs_instruction_error(); + vmexit->u.vmx.exit_reason = ~0; + vmexit->u.vmx.exit_qualification = ~0; + + switch (rc) { + case VMX_VMRESUME_ERROR: + case VMX_VMLAUNCH_ERROR: + case VMX_INVEPT_ERROR: +#ifndef __FreeBSD__ + case VMX_VMWRITE_ERROR: +#endif + vmexit->u.vmx.inst_type = rc; + break; + default: + panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); + } +} + +/* + * If the NMI-exiting VM execution control is set to '1' then an NMI in + * non-root operation causes a VM-exit. NMI blocking is in effect so it is + * sufficient to simply vector to the NMI handler via a software interrupt. + * However, this must be done before maskable interrupts are enabled + * otherwise the "iret" issued by an interrupt handler will incorrectly + * clear NMI blocking. + */ +static __inline void +vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) +{ + uint32_t intr_info; + + KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); + + if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) + return; + + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + KASSERT((intr_info & VMCS_INTR_VALID) != 0, + ("VM exit interruption info invalid: %#x", intr_info)); + + if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { + KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " + "to NMI has invalid vector: %#x", intr_info)); + VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); +#ifdef __FreeBSD__ + __asm __volatile("int $2"); +#else + vmm_call_trap(T_NMIFLT); +#endif + } +} + +static __inline void +vmx_dr_enter_guest(struct vmxctx *vmxctx) +{ + register_t rflags; + + /* Save host control debug registers. */ + vmxctx->host_dr7 = rdr7(); + vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); + + /* + * Disable debugging in DR7 and DEBUGCTL to avoid triggering + * exceptions in the host based on the guest DRx values. The + * guest DR7 and DEBUGCTL are saved/restored in the VMCS. + */ + load_dr7(0); + wrmsr(MSR_DEBUGCTLMSR, 0); + + /* + * Disable single stepping the kernel to avoid corrupting the + * guest DR6. A debugger might still be able to corrupt the + * guest DR6 by setting a breakpoint after this point and then + * single stepping. + */ + rflags = read_rflags(); + vmxctx->host_tf = rflags & PSL_T; + write_rflags(rflags & ~PSL_T); + + /* Save host debug registers. */ + vmxctx->host_dr0 = rdr0(); + vmxctx->host_dr1 = rdr1(); + vmxctx->host_dr2 = rdr2(); + vmxctx->host_dr3 = rdr3(); + vmxctx->host_dr6 = rdr6(); + + /* Restore guest debug registers. */ + load_dr0(vmxctx->guest_dr0); + load_dr1(vmxctx->guest_dr1); + load_dr2(vmxctx->guest_dr2); + load_dr3(vmxctx->guest_dr3); + load_dr6(vmxctx->guest_dr6); +} + +static __inline void +vmx_dr_leave_guest(struct vmxctx *vmxctx) +{ + + /* Save guest debug registers. 
*/ + vmxctx->guest_dr0 = rdr0(); + vmxctx->guest_dr1 = rdr1(); + vmxctx->guest_dr2 = rdr2(); + vmxctx->guest_dr3 = rdr3(); + vmxctx->guest_dr6 = rdr6(); + + /* + * Restore host debug registers. Restore DR7, DEBUGCTL, and + * PSL_T last. + */ + load_dr0(vmxctx->host_dr0); + load_dr1(vmxctx->host_dr1); + load_dr2(vmxctx->host_dr2); + load_dr3(vmxctx->host_dr3); + load_dr6(vmxctx->host_dr6); + wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); + load_dr7(vmxctx->host_dr7); + write_rflags(read_rflags() | vmxctx->host_tf); } static int -vmx_run(void *arg, int vcpu, register_t rip) +vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, + struct vm_eventinfo *evinfo) { - int error, vie, rc, handled, astpending; - uint32_t exit_reason; + int rc, handled, launched; struct vmx *vmx; struct vm *vm; struct vmxctx *vmxctx; struct vmcs *vmcs; struct vm_exit *vmexit; struct vlapic *vlapic; - + uint32_t exit_reason; +#ifdef __FreeBSD__ + struct region_descriptor gdtr, idtr; + uint16_t ldt_sel; +#endif + vmx = arg; vm = vmx->vm; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; vlapic = vm_lapic(vm, vcpu); - vmxctx->launched = 0; + vmexit = vm_exitinfo(vm, vcpu); + launched = 0; - astpending = 0; - vmexit = vm_exitinfo(vmx->vm, vcpu); + KASSERT(vmxctx->pmap == pmap, + ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); vmx_msr_guest_enter(vmx, vcpu); VMPTRLD(vmcs); +#ifndef __FreeBSD__ + VERIFY(vmx->vmcs_state[vcpu] == VS_NONE && curthread->t_preempt != 0); + vmx->vmcs_state[vcpu] = VS_LOADED; +#endif + /* * XXX * We do this every time because we may setup the virtual machine * from a different process than the one that actually runs it. * * If the life of a virtual machine was spent entirely in the context - * of a single process we could do this once in vmcs_set_defaults(). + * of a single process we could do this once in vmx_vminit(). */ vmcs_write(VMCS_HOST_CR3, rcr3()); vmcs_write(VMCS_GUEST_RIP, rip); - vmx_set_pcpu_defaults(vmx, vcpu); + vmx_set_pcpu_defaults(vmx, vcpu, pmap); do { - vmx_inject_interrupts(vmx, vcpu, vlapic); - vmx_run_trace(vmx, vcpu); - rc = vmx_setjmp(vmxctx); -#ifdef SETJMP_TRACE - vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); -#endif - switch (rc) { - case VMX_RETURN_DIRECT: - if (vmxctx->launched == 0) { - vmxctx->launched = 1; - vmx_launch(vmxctx); - } else - vmx_resume(vmxctx); - panic("vmx_launch/resume should not return"); + KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " + "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); + + handled = UNHANDLED; + /* + * Interrupts are disabled from this point on until the + * guest starts executing. This is done for the following + * reasons: + * + * If an AST is asserted on this thread after the check below, + * then the IPI_AST notification will not be lost, because it + * will cause a VM exit due to external interrupt as soon as + * the guest state is loaded. + * + * A posted interrupt after 'vmx_inject_interrupts()' will + * not be "lost" because it will be held pending in the host + * APIC because interrupts are disabled. The pending interrupt + * will be recognized as soon as the guest state is loaded. + * + * The same reasoning applies to the IPI generated by + * pmap_invalidate_ept(). + */ +#ifdef __FreeBSD__ + disable_intr(); + vmx_inject_interrupts(vmx, vcpu, vlapic, rip); +#else + /* + * The bulk of guest interrupt injection is done without + * interrupts disabled on the host CPU. This is necessary + * since contended mutexes might force the thread to sleep. 
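+		 * Posted interrupts are the exception: any bits pending in
+		 * the PIR are only folded in, via vmx_inject_pir() below,
+		 * once host interrupts have been disabled.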
+ */ + vmx_inject_interrupts(vmx, vcpu, vlapic, rip); + disable_intr(); + if (virtual_interrupt_delivery) { + vmx_inject_pir(vlapic); + } +#endif /* __FreeBSD__ */ + + /* + * Check for vcpu suspension after injecting events because + * vmx_inject_interrupts() can suspend the vcpu due to a + * triple fault. + */ + if (vcpu_suspended(evinfo)) { + enable_intr(); + vm_exit_suspended(vmx->vm, vcpu, rip); + break; + } + + if (vcpu_runblocked(evinfo)) { + enable_intr(); + vm_exit_runblock(vmx->vm, vcpu, rip); + break; + } + + if (vcpu_reqidle(evinfo)) { + enable_intr(); + vm_exit_reqidle(vmx->vm, vcpu, rip); break; - case VMX_RETURN_LONGJMP: - break; /* vm exit */ - case VMX_RETURN_AST: - astpending = 1; + } + + if (vcpu_should_yield(vm, vcpu)) { + enable_intr(); + vm_exit_astpending(vmx->vm, vcpu, rip); + vmx_astpending_trace(vmx, vcpu, rip); + handled = HANDLED; break; - case VMX_RETURN_VMRESUME: - vie = vmcs_instruction_error(); - if (vmxctx->launch_error == VM_FAIL_INVALID || - vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { - printf("vmresume error %d vmcs inst error %d\n", - vmxctx->launch_error, vie); - goto err_exit; + } + + if (vcpu_debugged(vm, vcpu)) { + enable_intr(); + vm_exit_debug(vmx->vm, vcpu, rip); + break; + } + +#ifndef __FreeBSD__ + if ((rc = smt_acquire()) != 1) { + enable_intr(); + vmexit->rip = rip; + vmexit->inst_length = 0; + if (rc == -1) { + vmexit->exitcode = VM_EXITCODE_HT; + } else { + vmexit->exitcode = VM_EXITCODE_BOGUS; + handled = HANDLED; } - vmx_launch(vmxctx); /* try to launch the guest */ - panic("vmx_launch should not return"); break; - case VMX_RETURN_VMLAUNCH: - vie = vmcs_instruction_error(); -#if 1 - printf("vmlaunch error %d vmcs inst error %d\n", - vmxctx->launch_error, vie); -#endif - goto err_exit; - default: - panic("vmx_setjmp returned %d", rc); } - - /* collect some basic information for VM exit processing */ + + /* + * If this thread has gone off-cpu due to mutex operations + * during vmx_run, the VMCS will have been unloaded, forcing a + * re-VMLAUNCH as opposed to VMRESUME. + */ + launched = (vmx->vmcs_state[vcpu] & VS_LAUNCHED) != 0; + /* + * Restoration of the GDT limit is taken care of by + * vmx_savectx(). Since the maximum practical index for the + * IDT is 255, restoring its limits from the post-VMX-exit + * default of 0xffff is not a concern. + * + * Only 64-bit hypervisor callers are allowed, which forgoes + * the need to restore any LDT descriptor. Toss an error to + * anyone attempting to break that rule. + */ + if (curproc->p_model != DATAMODEL_LP64) { + smt_release(); + enable_intr(); + bzero(vmexit, sizeof (*vmexit)); + vmexit->rip = rip; + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = VM_FAIL_INVALID; + handled = UNHANDLED; + break; + } +#else + /* + * VM exits restore the base address but not the + * limits of GDTR and IDTR. The VMCS only stores the + * base address, so VM exits set the limits to 0xffff. + * Save and restore the full GDTR and IDTR to restore + * the limits. + * + * The VMCS does not save the LDTR at all, and VM + * exits clear LDTR as if a NULL selector were loaded. + * The userspace hypervisor probably doesn't use a + * LDT, but save and restore it to be safe. 
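+		 * They are captured with sgdt/sidt/sldt below and reloaded
+		 * as soon as vmx_enter_guest() returns.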
+ */ + sgdt(&gdtr); + sidt(&idtr); + ldt_sel = sldt(); +#endif + + vmx_run_trace(vmx, vcpu); + vmx_dr_enter_guest(vmxctx); + rc = vmx_enter_guest(vmxctx, vmx, launched); + vmx_dr_leave_guest(vmxctx); + +#ifndef __FreeBSD__ + vmx->vmcs_state[vcpu] |= VS_LAUNCHED; + smt_release(); +#else + bare_lgdt(&gdtr); + lidt(&idtr); + lldt(ldt_sel); +#endif + + /* Collect some information for VM exit processing */ vmexit->rip = rip = vmcs_guest_rip(); vmexit->inst_length = vmexit_instruction_length(); vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); @@ -2176,21 +3434,19 @@ vmx_run(void *arg, int vcpu, register_t rip) /* Update 'nextrip' */ vmx->state[vcpu].nextrip = rip; - /* enable interrupts */ - enable_intr(); - - if (astpending) { - handled = 1; - vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_BOGUS; - vmx_astpending_trace(vmx, vcpu, rip); - vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1); - break; + if (rc == VMX_GUEST_VMEXIT) { + vmx_exit_handle_nmi(vmx, vcpu, vmexit); + enable_intr(); + handled = vmx_exit_process(vmx, vcpu, vmexit); + } else { + enable_intr(); + vmx_exit_inst_error(vmxctx, rc, vmexit); } - - handled = vmx_exit_process(vmx, vcpu, vmexit); +#ifdef __FreeBSD__ + launched = 1; +#endif vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); - + rip = vmexit->rip; } while (handled); /* @@ -2204,44 +3460,36 @@ vmx_run(void *arg, int vcpu, register_t rip) } if (!handled) - vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1); + vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); - VCPU_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", + VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", vmexit->exitcode); VMCLEAR(vmcs); vmx_msr_guest_exit(vmx, vcpu); - return (0); - -err_exit: - vmexit->exitcode = VM_EXITCODE_VMX; - vmexit->u.vmx.exit_reason = (uint32_t)-1; - vmexit->u.vmx.exit_qualification = (uint32_t)-1; - vmexit->u.vmx.status = ~0; - VMCLEAR(vmcs); - vmx_msr_guest_exit(vmx, vcpu); +#ifndef __FreeBSD__ + VERIFY(vmx->vmcs_state != VS_NONE && curthread->t_preempt != 0); + vmx->vmcs_state[vcpu] = VS_NONE; +#endif - return (ENOEXEC); + return (0); } static void vmx_vmcleanup(void *arg) { - int i, error; + int i; struct vmx *vmx = arg; + uint16_t maxcpus; - for (i = 0; i < VM_MAXCPU; i++) - vpid_free(vmx->state[i].vpid); + if (apic_access_virtualization(vmx, 0)) + vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); - /* - * XXXSMP we also need to clear the VMCS active on the other vcpus. - */ - error = vmclear(&vmx->vmcs[0]); - if (error != 0) - panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); + maxcpus = vm_get_maxcpus(vmx->vm); + for (i = 0; i < maxcpus; i++) + vpid_free(vmx->state[i].vpid); - ept_vmcleanup(vmx); free(vmx, M_VMX); return; @@ -2284,6 +3532,16 @@ vmxctx_regptr(struct vmxctx *vmxctx, int reg) return (&vmxctx->guest_r15); case VM_REG_GUEST_CR2: return (&vmxctx->guest_cr2); + case VM_REG_GUEST_DR0: + return (&vmxctx->guest_dr0); + case VM_REG_GUEST_DR1: + return (&vmxctx->guest_dr1); + case VM_REG_GUEST_DR2: + return (&vmxctx->guest_dr2); + case VM_REG_GUEST_DR3: + return (&vmxctx->guest_dr3); + case VM_REG_GUEST_DR6: + return (&vmxctx->guest_dr6); default: break; } @@ -2314,6 +3572,46 @@ vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) return (EINVAL); } +static int +vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) +{ + uint64_t gi; + int error; + + error = vmcs_getreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); + *retval = (gi & HWINTR_BLOCKING) ? 
1 : 0; + return (error); +} + +static int +vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) +{ + struct vmcs *vmcs; + uint64_t gi; + int error, ident; + + /* + * Forcing the vcpu into an interrupt shadow is not supported. + */ + if (val) { + error = EINVAL; + goto done; + } + + vmcs = &vmx->vmcs[vcpu]; + ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); + error = vmcs_getreg(vmcs, running, ident, &gi); + if (error == 0) { + gi &= ~HWINTR_BLOCKING; + error = vmcs_setreg(vmcs, running, ident, gi); + } +done: + VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, + error ? "failed" : "succeeded"); + return (error); +} + static int vmx_shadow_reg(int reg) { @@ -2324,8 +3622,8 @@ vmx_shadow_reg(int reg) switch (reg) { case VM_REG_GUEST_CR0: shreg = VMCS_CR0_SHADOW; - break; - case VM_REG_GUEST_CR4: + break; + case VM_REG_GUEST_CR4: shreg = VMCS_CR4_SHADOW; break; default: @@ -2345,6 +3643,9 @@ vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) if (running && hostcpu != curcpu) panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); + if (reg == VM_REG_GUEST_INTR_SHADOW) + return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); + if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) return (0); @@ -2356,12 +3657,16 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) { int error, hostcpu, running, shadow; uint64_t ctls; + pmap_t pmap; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); + if (reg == VM_REG_GUEST_INTR_SHADOW) + return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); + if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) return (0); @@ -2389,10 +3694,22 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) if (shadow > 0) { /* * Store the unmodified value in the shadow - */ + */ error = vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(shadow), val); } + + if (reg == VM_REG_GUEST_CR3) { + /* + * Invalidate the guest vcpu's TLB mappings to emulate + * the behavior of updating %cr3. + * + * XXX the processor retains global mappings when %cr3 + * is updated but vmx_invvpid() does not. 
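+			 * Invalidating more than is strictly required is
+			 * harmless; at worst the guest refills those TLB
+			 * entries on its next accesses.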
+ */ + pmap = vmx->ctx[vcpu].pmap; + vmx_invvpid(vmx, vcpu, pmap, running); + } } return (error); @@ -2452,6 +3769,10 @@ vmx_getcap(void *arg, int vcpu, int type, int *retval) if (cap_unrestricted_guest) ret = 0; break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) + ret = 0; + break; default: break; } @@ -2508,11 +3829,21 @@ vmx_setcap(void *arg, int vcpu, int type, int val) case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) { retval = 0; - baseval = procbased_ctls2; + pptr = &vmx->cap[vcpu].proc_ctls2; + baseval = *pptr; flag = PROCBASED2_UNRESTRICTED_GUEST; reg = VMCS_SEC_PROC_BASED_CTLS; } break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls2; + baseval = *pptr; + flag = PROCBASED2_ENABLE_INVPCID; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; default: break; } @@ -2546,15 +3877,18 @@ vmx_setcap(void *arg, int vcpu, int type, int val) } } - return (retval); + return (retval); } struct vlapic_vtx { struct vlapic vlapic; struct pir_desc *pir_desc; struct vmx *vmx; + u_int pending_prio; }; +#define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) + #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ do { \ VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ @@ -2576,7 +3910,7 @@ vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; uint64_t mask; - int idx, notify; + int idx, notify = 0; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; @@ -2589,7 +3923,37 @@ vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) idx = vector / 64; mask = 1UL << (vector % 64); atomic_set_long(&pir_desc->pir[idx], mask); - notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); + + /* + * A notification is required whenever the 'pending' bit makes a + * transition from 0->1. + * + * Even if the 'pending' bit is already asserted, notification about + * the incoming interrupt may still be necessary. For example, if a + * vCPU is HLTed with a high PPR, a low priority interrupt would cause + * the 0->1 'pending' transition with a notification, but the vCPU + * would ignore the interrupt for the time being. The same vCPU would + * need to then be notified if a high-priority interrupt arrived which + * satisfied the PPR. + * + * The priorities of interrupts injected while 'pending' is asserted + * are tracked in a custom bitfield 'pending_prio'. Should the + * to-be-injected interrupt exceed the priorities already present, the + * notification is sent. The priorities recorded in 'pending_prio' are + * cleared whenever the 'pending' bit makes another 0->1 transition. + */ + if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { + notify = 1; + vlapic_vtx->pending_prio = 0; + } else { + const u_int old_prio = vlapic_vtx->pending_prio; + const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); + + if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { + atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); + notify = 1; + } + } VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, level, "vmx_set_intr_ready"); @@ -2616,8 +3980,27 @@ vmx_pending_intr(struct vlapic *vlapic, int *vecptr) pir_desc = vlapic_vtx->pir_desc; pending = atomic_load_acq_long(&pir_desc->pending); - if (!pending) - return (0); /* common case */ + if (!pending) { + /* + * While a virtual interrupt may have already been + * processed the actual delivery maybe pending the + * interruptibility of the guest. 
Recognize a pending + * interrupt by reevaluating virtual interrupts + * following Section 29.2.1 in the Intel SDM Volume 3. + */ + struct vm_exit *vmexit; + uint8_t rvi, ppr; + + vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); + rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; + lapic = vlapic->apic_page; + ppr = lapic->ppr & APIC_TPR_INT; + if (rvi > ppr) { + return (1); + } + + return (0); + } /* * If there is an interrupt pending then it will be recognized only @@ -2627,21 +4010,38 @@ vmx_pending_intr(struct vlapic *vlapic, int *vecptr) * interrupt will be recognized. */ lapic = vlapic->apic_page; - ppr = lapic->ppr & 0xf0; + ppr = lapic->ppr & APIC_TPR_INT; if (ppr == 0) return (1); VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", lapic->ppr); + vpr = 0; for (i = 3; i >= 0; i--) { pirval = pir_desc->pir[i]; if (pirval != 0) { - vpr = (i * 64 + flsl(pirval) - 1) & 0xf0; - return (vpr > ppr); + vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; + break; } } - return (0); + + /* + * If the highest-priority pending interrupt falls short of the + * processor priority of this vCPU, ensure that 'pending_prio' does not + * have any stale bits which would preclude a higher-priority interrupt + * from incurring a notification later. + */ + if (vpr <= ppr) { + const u_int prio_bit = VPR_PRIO_BIT(vpr); + const u_int old = vlapic_vtx->pending_prio; + + if (old > prio_bit && (old & prio_bit) == 0) { + vlapic_vtx->pending_prio = prio_bit; + } + return (0); + } + return (1); } static void @@ -2652,37 +4052,65 @@ vmx_intr_accepted(struct vlapic *vlapic, int vector) } static void -vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) +vmx_set_tmr(struct vlapic *vlapic, const uint32_t *masks) +{ + vmcs_write(VMCS_EOI_EXIT0, ((uint64_t)masks[1] << 32) | masks[0]); + vmcs_write(VMCS_EOI_EXIT1, ((uint64_t)masks[3] << 32) | masks[2]); + vmcs_write(VMCS_EOI_EXIT2, ((uint64_t)masks[5] << 32) | masks[4]); + vmcs_write(VMCS_EOI_EXIT3, ((uint64_t)masks[7] << 32) | masks[6]); +} + +static void +vmx_enable_x2apic_mode(struct vlapic *vlapic) { - struct vlapic_vtx *vlapic_vtx; struct vmx *vmx; struct vmcs *vmcs; - uint64_t mask, val; + uint32_t proc_ctls2; + int vcpuid, error; - KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); - KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), - ("vmx_set_tmr: vcpu cannot be running")); + vcpuid = vlapic->vcpuid; + vmx = ((struct vlapic_vtx *)vlapic)->vmx; + vmcs = &vmx->vmcs[vcpuid]; - vlapic_vtx = (struct vlapic_vtx *)vlapic; - vmx = vlapic_vtx->vmx; - vmcs = &vmx->vmcs[vlapic->vcpuid]; - mask = 1UL << (vector % 64); + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, + ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); + + proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; + proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; + vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; VMPTRLD(vmcs); - val = vmcs_read(VMCS_EOI_EXIT(vector)); - if (level) - val |= mask; - else - val &= ~mask; - vmcs_write(VMCS_EOI_EXIT(vector), val); + vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); VMCLEAR(vmcs); + + if (vlapic->vcpuid == 0) { + /* + * The nested page table mappings are shared by all vcpus + * so unmap the APIC access page just once. + */ + error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); + KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", + __func__, error)); + + /* + * The MSR bitmap is shared by all vcpus so modify it only + * once in the context of vcpu 0. 
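+		 * vmx_allow_x2apic_msrs() opens the x2APIC MSR range for
+		 * direct guest access in that shared bitmap.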
+ */ + error = vmx_allow_x2apic_msrs(vmx); + KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", + __func__, error)); + } } static void vmx_post_intr(struct vlapic *vlapic, int hostcpu) { - +#ifdef __FreeBSD__ ipi_cpu(hostcpu, pirvec); +#else + psm_send_pir_ipi(hostcpu); +#endif } /* @@ -2785,7 +4213,7 @@ vmx_vlapic_init(void *arg, int vcpuid) struct vmx *vmx; struct vlapic *vlapic; struct vlapic_vtx *vlapic_vtx; - + vmx = arg; vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); @@ -2802,9 +4230,7 @@ vmx_vlapic_init(void *arg, int vcpuid) vlapic->ops.pending_intr = vmx_pending_intr; vlapic->ops.intr_accepted = vmx_intr_accepted; vlapic->ops.set_tmr = vmx_set_tmr; -#ifdef __FreeBSD__ vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; -#endif } if (posted_interrupts) @@ -2823,20 +4249,129 @@ vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) free(vlapic, M_VLAPIC); } +#ifndef __FreeBSD__ +static void +vmx_savectx(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + + if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) { + VERIFY3U(vmclear(vmcs), ==, 0); + vmx_msr_guest_exit(vmx, vcpu); + /* + * Having VMCLEARed the VMCS, it can no longer be re-entered + * with VMRESUME, but must be VMLAUNCHed again. + */ + vmx->vmcs_state[vcpu] &= ~VS_LAUNCHED; + } + + reset_gdtr_limit(); +} + +static void +vmx_restorectx(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + + ASSERT0(vmx->vmcs_state[vcpu] & VS_LAUNCHED); + + if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) { + vmx_msr_guest_enter(vmx, vcpu); + VERIFY3U(vmptrld(vmcs), ==, 0); + } +} +#endif /* __FreeBSD__ */ + struct vmm_ops vmm_ops_intel = { vmx_init, vmx_cleanup, + vmx_restore, vmx_vminit, vmx_run, vmx_vmcleanup, - ept_vmmmap_set, - ept_vmmmap_get, vmx_getreg, vmx_setreg, vmx_getdesc, vmx_setdesc, vmx_getcap, vmx_setcap, + ept_vmspace_alloc, + ept_vmspace_free, vmx_vlapic_init, vmx_vlapic_cleanup, + +#ifndef __FreeBSD__ + vmx_savectx, + vmx_restorectx, +#endif }; + +#ifndef __FreeBSD__ +/* Side-effect free HW validation derived from checks in vmx_init. 
*/ +int +vmx_x86_supported(const char **msg) +{ + int error; + uint32_t tmp; + + ASSERT(msg != NULL); + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired primary " + "processor-based controls"; + return (error); + } + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired secondary " + "processor-based controls"; + return (error); + } + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired pin-based controls"; + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired exit controls"; + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired entry controls"; + return (error); + } + + /* Unrestricted guest is nominally optional, but not for us. */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp); + if (error) { + *msg = "processor does not support desired unrestricted guest " + "controls"; + return (error); + } + + return (0); +} +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.h b/usr/src/uts/i86pc/io/vmm/intel/vmx.h index 50ca62b371..2d16799bdd 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx.h 284174 2015-06-09 00:14:47Z tychon $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _VMX_H_ @@ -31,15 +37,9 @@ #include "vmcs.h" -#ifndef __FreeBSD__ -#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */ -#define HOST_MSR_MAX_ENTRIES 64 /* arbitrary */ -#endif +struct pmap; struct vmxctx { - register_t tmpstk[32]; /* vmx_return() stack */ - register_t tmpstktop; - register_t guest_rdi; /* Guest state */ register_t guest_rsi; register_t guest_rdx; @@ -56,7 +56,13 @@ struct vmxctx { register_t guest_r14; register_t guest_r15; register_t guest_cr2; + register_t guest_dr0; + register_t guest_dr1; + register_t guest_dr2; + register_t guest_dr3; + register_t guest_dr6; +#ifdef __FreeBSD__ register_t host_r15; /* Host state */ register_t host_r14; register_t host_r13; @@ -64,13 +70,24 @@ struct vmxctx { register_t host_rbp; register_t host_rsp; register_t host_rbx; - register_t host_rip; +#endif /* __FreeBSD__ */ + + register_t host_dr0; + register_t host_dr1; + register_t host_dr2; + register_t host_dr3; + register_t host_dr6; + register_t host_dr7; + uint64_t host_debugctl; + int host_tf; + + int inst_fail_status; + /* - * XXX todo debug registers and fpu state + * The pmap needs to be deactivated in vmx_enter_guest() + * so keep a copy of the 'pmap' in each vmxctx. */ - - int launched; /* vmcs launch state */ - int launch_error; + struct pmap *pmap; }; struct vmxcap { @@ -105,52 +122,55 @@ enum { IDX_MSR_STAR, IDX_MSR_SF_MASK, IDX_MSR_KGSBASE, + IDX_MSR_PAT, GUEST_MSR_NUM /* must be the last enumeration */ }; +#ifndef __FreeBSD__ +typedef enum { + VS_NONE = 0x0, + VS_LAUNCHED = 0x1, + VS_LOADED = 0x2 +} vmcs_state_t; +#endif /* __FreeBSD__ */ + /* virtual machine softc */ struct vmx { - pml4_entry_t pml4ept[NPML4EPG]; struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */ char msr_bitmap[PAGE_SIZE]; struct pir_desc pir_desc[VM_MAXCPU]; -#ifdef __FreeBSD__ uint64_t guest_msrs[VM_MAXCPU][GUEST_MSR_NUM]; -#else - struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES]; - struct msr_entry host_msrs[VM_MAXCPU][HOST_MSR_MAX_ENTRIES]; +#ifndef __FreeBSD__ + uint64_t host_msrs[VM_MAXCPU][GUEST_MSR_NUM]; + uint64_t tsc_offset_active[VM_MAXCPU]; + vmcs_state_t vmcs_state[VM_MAXCPU]; #endif struct vmxctx ctx[VM_MAXCPU]; struct vmxcap cap[VM_MAXCPU]; struct vmxstate state[VM_MAXCPU]; + uint64_t eptp; struct vm *vm; + long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */ }; -CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0); -#define VMX_RETURN_DIRECT 0 -#define VMX_RETURN_LONGJMP 1 -#define VMX_RETURN_VMRESUME 2 -#define VMX_RETURN_VMLAUNCH 3 -#define VMX_RETURN_AST 4 -/* - * vmx_setjmp() returns: - * - 0 when it returns directly - * - 1 when it returns from vmx_longjmp - * - 2 when it returns from vmx_resume (which would only be in the error case) - * - 3 when it returns from vmx_launch (which would only be in the error case) - * - 4 when it returns from vmx_resume or vmx_launch because of AST pending - */ -int vmx_setjmp(struct vmxctx *ctx); -void vmx_longjmp(void); /* returns via vmx_setjmp */ -void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ -void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ +#define VMX_GUEST_VMEXIT 0 +#define VMX_VMRESUME_ERROR 1 +#define VMX_VMLAUNCH_ERROR 2 +#define VMX_INVEPT_ERROR 3 +#define VMX_VMWRITE_ERROR 4 +int vmx_enter_guest(struct 
vmxctx *ctx, struct vmx *vmx, int launched); +void vmx_call_isr(uintptr_t entry); u_long vmx_fix_cr0(u_long cr0); u_long vmx_fix_cr4(u_long cr4); int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset); +extern char vmx_exit_guest[]; +extern char vmx_exit_guest_flush_rsb[]; + #endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h index 08b1469f19..5408d129ad 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_controls.h 260410 2014-01-07 21:04:49Z neel $ + * $FreeBSD$ */ #ifndef _VMX_CONTROLS_H_ diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h index 9513f6c70b..f0c5ba7691 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_cpufunc.h 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,6 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _VMX_CPUFUNC_H_ @@ -71,7 +74,12 @@ vmxon(char *region) int error; uint64_t addr; +#ifdef __FreeBSD__ addr = vtophys(region); +#else + /* This is pre-translated in illumos */ + addr = (uint64_t)region; +#endif __asm __volatile("vmxon %[addr];" VMX_SET_ERROR_CODE : [error] "=r" (error) @@ -81,21 +89,7 @@ vmxon(char *region) return (error); } -/* returns 0 on success and non-zero on failure */ -static __inline int -vmxon_pa(vm_paddr_t addr) -{ - int error; - - __asm __volatile("vmxon %[addr];" - VMX_SET_ERROR_CODE - : [error] "=r" (error) - : [addr] "m" (*(uint64_t *)&addr) - : "memory"); - - return (error); -} - +#ifdef __FreeBSD__ /* returns 0 on success and non-zero on failure */ static __inline int vmclear(struct vmcs *vmcs) @@ -111,6 +105,7 @@ vmclear(struct vmcs *vmcs) : "memory"); return (error); } +#endif /* __FreeBSD__ */ static __inline void vmxoff(void) @@ -126,6 +121,7 @@ vmptrst(uint64_t *addr) __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory"); } +#ifdef __FreeBSD__ static __inline int vmptrld(struct vmcs *vmcs) { @@ -140,6 +136,7 @@ vmptrld(struct vmcs *vmcs) : "memory"); return (error); } +#endif /* __FreeBSD__ */ static __inline int vmwrite(uint64_t reg, uint64_t val) @@ -169,7 +166,8 @@ vmread(uint64_t r, uint64_t *addr) return (error); } -static void __inline +#ifdef __FreeBSD__ +static __inline void VMCLEAR(struct vmcs *vmcs) { int err; @@ -181,7 +179,7 @@ VMCLEAR(struct vmcs *vmcs) critical_exit(); } -static void __inline +static __inline void VMPTRLD(struct vmcs *vmcs) { int err; @@ -192,6 +190,7 @@ VMPTRLD(struct vmcs *vmcs) if (err != 0) panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); } +#endif /* __FreeBSD__ */ #define INVVPID_TYPE_ADDRESS 0UL #define INVVPID_TYPE_SINGLE_CONTEXT 1UL @@ -205,7 +204,7 @@ struct invvpid_desc { }; CTASSERT(sizeof(struct invvpid_desc) 
== 16); -static void __inline +static __inline void invvpid(uint64_t type, struct invvpid_desc desc) { int error; @@ -228,7 +227,7 @@ struct invept_desc { }; CTASSERT(sizeof(struct invept_desc) == 16); -static void __inline +static __inline void invept(uint64_t type, struct invept_desc desc) { int error; diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c index 1ced311ca8..4a1a2cd358 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,26 +25,26 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.c 284174 2015-06-09 00:14:47Z tychon $ + * $FreeBSD$ + */ +/* + * Copyright 2017 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.c 284174 2015-06-09 00:14:47Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include -#include +#include #include #include #include +#include #include #include -#ifndef __FreeBSD__ -#include -#endif - #include "vmx.h" #include "vmx_msr.h" @@ -184,7 +186,9 @@ msr_bitmap_change_access(char *bitmap, u_int msr, int access) static uint64_t misc_enable; static uint64_t platform_info; static uint64_t turbo_ratio_limit; +#ifdef __FreeBSD__ static uint64_t host_msrs[GUEST_MSR_NUM]; +#endif /* __FreeBSD__ */ static bool nehalem_cpu(void) @@ -234,13 +238,33 @@ westmere_cpu(void) return (false); } +static bool +pat_valid(uint64_t val) +{ + int i, pa; + + /* + * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT" + * + * Extract PA0 through PA7 and validate that each one encodes a + * valid memory type. + */ + for (i = 0; i < 8; i++) { + pa = (val >> (i * 8)) & 0xff; + if (pa == 2 || pa == 3 || pa >= 8) + return (false); + } + return (true); +} + void vmx_msr_init(void) { uint64_t bus_freq, ratio; int i; -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ + /* XXXJOY: Do we want to do this caching? */ /* * It is safe to cache the values of the following MSRs because * they don't change based on curcpu, curproc or curthread. @@ -249,7 +273,7 @@ vmx_msr_init(void) host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); -#endif +#endif /* __FreeBSD__ */ /* * Initialize emulated MSRs @@ -308,6 +332,10 @@ vmx_msr_init(void) void vmx_msr_guest_init(struct vmx *vmx, int vcpuid) { + uint64_t *guest_msrs; + + guest_msrs = vmx->guest_msrs[vcpuid]; + /* * The permissions bitmap is shared between all vcpus so initialize it * once when initializing the vBSP. @@ -319,29 +347,55 @@ vmx_msr_guest_init(struct vmx *vmx, int vcpuid) guest_msr_rw(vmx, MSR_SF_MASK); guest_msr_rw(vmx, MSR_KGSBASE); } + + /* + * Initialize guest IA32_PAT MSR with default value after reset. 
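+	 * This is the architected power-on value 0x0007040600070406:
+	 * PA0-PA3 are WB, WT, UC- and UC, mirrored again in PA4-PA7.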
+ */ + guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + return; } void vmx_msr_guest_enter(struct vmx *vmx, int vcpuid) { -#ifdef __FreeBSD__ uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; - /* Save host MSRs (if any) and restore guest MSRs */ +#ifndef __FreeBSD__ + uint64_t *host_msrs = vmx->host_msrs[vcpuid]; + + /* Save host MSRs */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +#endif /* __FreeBSD__ */ + + /* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */ +#ifdef __FreeBSD__ + update_pcb_bases(curpcb); +#endif wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]); wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]); wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]); wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]); wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]); -#endif } void vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) { -#ifdef __FreeBSD__ uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; +#ifndef __FreeBSD__ + uint64_t *host_msrs = vmx->host_msrs[vcpuid]; +#endif /* Save guest MSRs */ guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); @@ -357,13 +411,16 @@ vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); /* MSR_KGSBASE will be restored on the way back to userspace */ -#endif } int vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) { - int error = 0; + const uint64_t *guest_msrs; + int error; + + guest_msrs = vmx->guest_msrs[vcpuid]; + error = 0; switch (num) { case MSR_MCG_CAP: @@ -387,6 +444,9 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) case MSR_TURBO_RATIO_LIMIT1: *val = turbo_ratio_limit; break; + case MSR_PAT: + *val = guest_msrs[IDX_MSR_PAT]; + break; default: error = EINVAL; break; @@ -397,10 +457,13 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) int vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) { + uint64_t *guest_msrs; uint64_t changed; int error; + guest_msrs = vmx->guest_msrs[vcpuid]; error = 0; + switch (num) { case MSR_MCG_CAP: case MSR_MCG_STATUS: @@ -433,9 +496,17 @@ vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) error = EINVAL; break; + case MSR_PAT: + if (pat_valid(val)) + guest_msrs[IDX_MSR_PAT] = val; + else + vm_inject_gp(vmx->vm, vcpuid); + break; +#ifdef __FreeBSD__ case MSR_TSC: error = vmx_set_tsc_offset(vmx, vcpuid, val - rdtsc()); break; +#endif /* __FreeBSD__ */ default: error = EINVAL; break; diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h index 5300d14d9b..ac2adb0dd1 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.h 271888 2014-09-20 02:35:21Z neel $ + * $FreeBSD$ */ #ifndef _VMX_MSR_H_ diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s index d57dde1093..0130f88dd6 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s @@ -1,5 +1,6 @@ /*- * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2013 Neel Natu * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -23,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_support.S 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,50 +37,41 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ -#include +#include +#include -#include "vmx_assym.s" +/* Porting note: This is named 'vmx_support.S' upstream. */ -/* - * Disable interrupts before updating %rsp in VMX_CHECK_AST or - * VMX_GUEST_RESTORE. - * - * The location that %rsp points to is a 'vmxctx' and not a - * real stack so we don't want an interrupt handler to trash it - */ -#define VMX_DISABLE_INTERRUPTS cli -/* - * If the thread hosting the vcpu has an ast pending then take care of it - * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST. - * - * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts - * are disabled. - */ -#ifdef __FreeBSD__ -#define VMX_CHECK_AST \ - movq PCPU(CURTHREAD),%rax; \ - testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \ - je 9f; \ - movq $VMX_RETURN_AST,%rsi; \ - movq %rdi,%rsp; \ - addq $VMXCTX_TMPSTKTOP,%rsp; \ - callq vmx_return; \ -9: -#else -#define VMX_CHECK_AST \ - movq %gs:CPU_THREAD,%rax; \ - movl T_ASTFLAG(%rax),%eax; \ - test %al,%al; \ - je 9f; \ - movq $VMX_RETURN_AST,%rsi; \ - movq %rdi,%rsp; \ - addq $VMXCTX_TMPSTKTOP,%rsp; \ - callq vmx_return; \ -9: -#endif + +#if defined(lint) + +struct vmxctx; +struct vmx; + +/*ARGSUSED*/ +void +vmx_launch(struct vmxctx *ctx) +{} + +void +vmx_exit_guest() +{} + +/*ARGSUSED*/ +int +vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched) +{ + return (0); +} + +#else /* lint */ + +#include "vmx_assym.h" +#include "vmcs.h" /* * Assumes that %rdi holds a pointer to the 'vmxctx'. @@ -92,7 +84,6 @@ * host context in case of an error with 'vmlaunch' or 'vmresume'. */ #define VMX_GUEST_RESTORE \ - movq %rdi,%rsp; \ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ movq %rsi,%cr2; \ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ @@ -111,161 +102,283 @@ movq VMXCTX_GUEST_R15(%rdi),%r15; \ movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ -#define VM_INSTRUCTION_ERROR(reg) \ - jnc 1f; \ - movl $VM_FAIL_INVALID,reg; /* CF is set */ \ - jmp 3f; \ -1: jnz 2f; \ - movl $VM_FAIL_VALID,reg; /* ZF is set */ \ - jmp 3f; \ -2: movl $VM_SUCCESS,reg; \ -3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp) - - .text -/* - * int vmx_setjmp(ctxp) - * %rdi = ctxp - * - * Return value is '0' when it returns directly from here. - * Return value is '1' when it returns after a vm exit through vmx_longjmp. 
- */ -ENTRY(vmx_setjmp) - movq (%rsp),%rax /* return address */ - movq %r15,VMXCTX_HOST_R15(%rdi) - movq %r14,VMXCTX_HOST_R14(%rdi) - movq %r13,VMXCTX_HOST_R13(%rdi) - movq %r12,VMXCTX_HOST_R12(%rdi) - movq %rbp,VMXCTX_HOST_RBP(%rdi) - movq %rsp,VMXCTX_HOST_RSP(%rdi) - movq %rbx,VMXCTX_HOST_RBX(%rdi) - movq %rax,VMXCTX_HOST_RIP(%rdi) +#define VMX_GUEST_SAVE \ + movq %rdi, VMXSTK_TMPRDI(%rsp); \ + movq VMXSTK_RDI(%rsp), %rdi; \ + movq %rbp, VMXCTX_GUEST_RBP(%rdi); \ + leaq VMXSTK_FP(%rsp), %rbp; \ + movq %rsi, VMXCTX_GUEST_RSI(%rdi); \ + movq %rdx, VMXCTX_GUEST_RDX(%rdi); \ + movq %rcx, VMXCTX_GUEST_RCX(%rdi); \ + movq %r8, VMXCTX_GUEST_R8(%rdi); \ + movq %r9, VMXCTX_GUEST_R9(%rdi); \ + movq %rax, VMXCTX_GUEST_RAX(%rdi); \ + movq %rbx, VMXCTX_GUEST_RBX(%rdi); \ + movq %r10, VMXCTX_GUEST_R10(%rdi); \ + movq %r11, VMXCTX_GUEST_R11(%rdi); \ + movq %r12, VMXCTX_GUEST_R12(%rdi); \ + movq %r13, VMXCTX_GUEST_R13(%rdi); \ + movq %r14, VMXCTX_GUEST_R14(%rdi); \ + movq %r15, VMXCTX_GUEST_R15(%rdi); \ + movq %cr2, %rbx; \ + movq %rbx, VMXCTX_GUEST_CR2(%rdi); \ + movq VMXSTK_TMPRDI(%rsp), %rdx; \ + movq %rdx, VMXCTX_GUEST_RDI(%rdi); - /* - * XXX save host debug registers - */ - movl $VMX_RETURN_DIRECT,%eax - ret -END(vmx_setjmp) /* - * void vmx_return(struct vmxctx *ctxp, int retval) - * %rdi = ctxp - * %rsi = retval - * Return to vmm context through vmx_setjmp() with a value of 'retval'. + * Flush scratch registers to avoid lingering guest state being used for + * Spectre v1 attacks when returning from guest entry. */ -ENTRY(vmx_return) - /* Restore host context. */ - movq VMXCTX_HOST_R15(%rdi),%r15 - movq VMXCTX_HOST_R14(%rdi),%r14 - movq VMXCTX_HOST_R13(%rdi),%r13 - movq VMXCTX_HOST_R12(%rdi),%r12 - movq VMXCTX_HOST_RBP(%rdi),%rbp - movq VMXCTX_HOST_RSP(%rdi),%rsp - movq VMXCTX_HOST_RBX(%rdi),%rbx - movq VMXCTX_HOST_RIP(%rdi),%rax - movq %rax,(%rsp) /* return address */ +#define VMX_GUEST_FLUSH_SCRATCH \ + xorl %edi, %edi; \ + xorl %esi, %esi; \ + xorl %edx, %edx; \ + xorl %ecx, %ecx; \ + xorl %r8d, %r8d; \ + xorl %r9d, %r9d; \ + xorl %r10d, %r10d; \ + xorl %r11d, %r11d; - /* - * XXX restore host debug registers - */ - movl %esi,%eax - ret -END(vmx_return) -/* - * void vmx_longjmp(void) - * %rsp points to the struct vmxctx - */ -ENTRY(vmx_longjmp) - /* - * Save guest state that is not automatically saved in the vmcs. 
- */ - movq %rdi,VMXCTX_GUEST_RDI(%rsp) - movq %rsi,VMXCTX_GUEST_RSI(%rsp) - movq %rdx,VMXCTX_GUEST_RDX(%rsp) - movq %rcx,VMXCTX_GUEST_RCX(%rsp) - movq %r8,VMXCTX_GUEST_R8(%rsp) - movq %r9,VMXCTX_GUEST_R9(%rsp) - movq %rax,VMXCTX_GUEST_RAX(%rsp) - movq %rbx,VMXCTX_GUEST_RBX(%rsp) - movq %rbp,VMXCTX_GUEST_RBP(%rsp) - movq %r10,VMXCTX_GUEST_R10(%rsp) - movq %r11,VMXCTX_GUEST_R11(%rsp) - movq %r12,VMXCTX_GUEST_R12(%rsp) - movq %r13,VMXCTX_GUEST_R13(%rsp) - movq %r14,VMXCTX_GUEST_R14(%rsp) - movq %r15,VMXCTX_GUEST_R15(%rsp) - - movq %cr2,%rdi - movq %rdi,VMXCTX_GUEST_CR2(%rsp) - - movq %rsp,%rdi - movq $VMX_RETURN_LONGJMP,%rsi - - addq $VMXCTX_TMPSTKTOP,%rsp - callq vmx_return -END(vmx_longjmp) +/* Stack layout (offset from %rsp) for vmx_enter_guest */ +#define VMXSTK_TMPRDI 0x00 /* temp store %rdi on vmexit */ +#define VMXSTK_R15 0x08 /* callee saved %r15 */ +#define VMXSTK_R14 0x10 /* callee saved %r14 */ +#define VMXSTK_R13 0x18 /* callee saved %r13 */ +#define VMXSTK_R12 0x20 /* callee saved %r12 */ +#define VMXSTK_RBX 0x28 /* callee saved %rbx */ +#define VMXSTK_RDX 0x30 /* save-args %rdx (int launched) */ +#define VMXSTK_RSI 0x38 /* save-args %rsi (struct vmx *vmx) */ +#define VMXSTK_RDI 0x40 /* save-args %rdi (struct vmxctx *ctx) */ +#define VMXSTK_FP 0x48 /* frame pointer %rbp */ +#define VMXSTKSIZE VMXSTK_FP /* - * void vmx_resume(struct vmxctx *ctxp) - * %rdi = ctxp - * - * Although the return type is a 'void' this function may return indirectly - * through vmx_setjmp() with a return value of 2. + * vmx_enter_guest(struct vmxctx *vmxctx, int launched) + * Interrupts must be disabled on entry. */ -ENTRY(vmx_resume) - VMX_DISABLE_INTERRUPTS +ENTRY_NP(vmx_enter_guest) + pushq %rbp + movq %rsp, %rbp + subq $VMXSTKSIZE, %rsp + movq %r15, VMXSTK_R15(%rsp) + movq %r14, VMXSTK_R14(%rsp) + movq %r13, VMXSTK_R13(%rsp) + movq %r12, VMXSTK_R12(%rsp) + movq %rbx, VMXSTK_RBX(%rsp) + movq %rdx, VMXSTK_RDX(%rsp) + movq %rsi, VMXSTK_RSI(%rsp) + movq %rdi, VMXSTK_RDI(%rsp) + + movq %rdi, %r12 /* vmxctx */ + movq %rsi, %r13 /* vmx */ + movl %edx, %r14d /* launch state */ + movq VMXCTX_PMAP(%rdi), %rbx - VMX_CHECK_AST + /* Activate guest pmap on this cpu. */ + leaq PM_ACTIVE(%rbx), %rdi + movl %gs:CPU_ID, %esi + call cpuset_atomic_add + movq %r12, %rdi /* - * Restore guest state that is not automatically loaded from the vmcs. + * If 'vmx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen' + * then we must invalidate all mappings associated with this EPTP. */ - VMX_GUEST_RESTORE + movq PM_EPTGEN(%rbx), %r10 + movl %gs:CPU_ID, %eax + cmpq %r10, VMX_EPTGEN(%r13, %rax, 8) + je guest_restore + + /* Refresh 'vmx->eptgen[curcpu]' */ + movq %r10, VMX_EPTGEN(%r13, %rax, 8) + + /* Setup the invept descriptor on the host stack */ + pushq $0x0 + pushq VMX_EPTP(%r13) + movl $0x1, %eax /* Single context invalidate */ + invept (%rsp), %rax + leaq 0x10(%rsp), %rsp + jbe invept_error /* Check invept instruction error */ +guest_restore: + /* Write the current %rsp into the VMCS to be restored on vmexit */ + movl $VMCS_HOST_RSP, %eax + vmwrite %rsp, %rax + jbe vmwrite_error + + /* Check if vmresume is adequate or a full vmlaunch is required */ + cmpl $0, %r14d + je do_launch + + VMX_GUEST_RESTORE vmresume + /* + * In the common case, 'vmresume' returns back to the host through + * 'vmx_exit_guest'. If there is an error we return VMX_VMRESUME_ERROR + * to the caller. 
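The decode_inst_error path below sorts out the flags a failed vmlaunch/vmresume/vmwrite/invept leaves behind; per the SDM's VMX conventions, ZF=1 means VMfailValid (an error number is readable from the VM-instruction error field) and CF=1 means VMfailInvalid (no current VMCS, so no error number). A C restatement of that decode, with the flag values as inputs and illustrative constants:

#include <stdbool.h>

#define SK_FAIL_VALID	1	/* consult the VMCS error field */
#define SK_FAIL_INVALID	2	/* no current VMCS */

static int
vmx_inst_fail_sketch(bool zf, bool cf)
{
	if (zf)
		return (SK_FAIL_VALID);
	if (cf)
		return (SK_FAIL_INVALID);
	return (0);		/* VMsucceed */
}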
+ */ + leaq VMXSTK_FP(%rsp), %rbp + movq VMXSTK_RDI(%rsp), %rdi + movl $VMX_VMRESUME_ERROR, %eax + jmp decode_inst_error +do_launch: + VMX_GUEST_RESTORE + vmlaunch /* - * Capture the reason why vmresume failed. + * In the common case, 'vmlaunch' returns back to the host through + * 'vmx_exit_guest'. If there is an error we return VMX_VMLAUNCH_ERROR + * to the caller. */ - VM_INSTRUCTION_ERROR(%eax) + leaq VMXSTK_FP(%rsp), %rbp + movq VMXSTK_RDI(%rsp), %rdi + movl $VMX_VMLAUNCH_ERROR, %eax + jmp decode_inst_error + +vmwrite_error: + movl $VMX_VMWRITE_ERROR, %eax + jmp decode_inst_error +invept_error: + movl $VMX_INVEPT_ERROR, %eax + jmp decode_inst_error +decode_inst_error: + movl $VM_FAIL_VALID, %r11d + jz inst_error + movl $VM_FAIL_INVALID, %r11d +inst_error: + movl %r11d, VMXCTX_INST_FAIL_STATUS(%rdi) - /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */ - movq %rsp,%rdi - movq $VMX_RETURN_VMRESUME,%rsi + movq VMXCTX_PMAP(%rdi), %rdi + leaq PM_ACTIVE(%rdi), %rdi + movl %gs:CPU_ID, %esi + movq %rax, %r12 + call cpuset_atomic_del + movq %r12, %rax - addq $VMXCTX_TMPSTKTOP,%rsp - callq vmx_return -END(vmx_resume) + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 + + VMX_GUEST_FLUSH_SCRATCH + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret /* - * void vmx_launch(struct vmxctx *ctxp) - * %rdi = ctxp - * - * Although the return type is a 'void' this function may return indirectly - * through vmx_setjmp() with a return value of 3. + * Non-error VM-exit from the guest. Make this a label so it can + * be used by C code when setting up the VMCS. + * The VMCS-restored %rsp points to the struct vmxctx */ -ENTRY(vmx_launch) - VMX_DISABLE_INTERRUPTS +.align ASM_ENTRY_ALIGN; +ALTENTRY(vmx_exit_guest) + /* Save guest state that is not automatically saved in the vmcs. */ + VMX_GUEST_SAVE - VMX_CHECK_AST + /* Deactivate guest pmap on this cpu. */ + movq VMXCTX_PMAP(%rdi), %rdi + leaq PM_ACTIVE(%rdi), %rdi + movl %gs:CPU_ID, %esi + call cpuset_atomic_del /* - * Restore guest state that is not automatically loaded from the vmcs. + * This will return to the caller of 'vmx_enter_guest()' with a return + * value of VMX_GUEST_VMEXIT. */ - VMX_GUEST_RESTORE + movl $VMX_GUEST_VMEXIT, %eax + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 - vmlaunch + VMX_GUEST_FLUSH_SCRATCH + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(vmx_enter_guest) + + + +.align ASM_ENTRY_ALIGN; +ALTENTRY(vmx_exit_guest_flush_rsb) + /* Save guest state that is not automatically saved in the vmcs. */ + VMX_GUEST_SAVE + + /* Deactivate guest pmap on this cpu. */ + movq VMXCTX_PMAP(%rdi), %rdi + leaq PM_ACTIVE(%rdi), %rdi + movl %gs:CPU_ID, %esi + call cpuset_atomic_del + + VMX_GUEST_FLUSH_SCRATCH /* - * Capture the reason why vmlaunch failed. + * To prevent malicious branch target predictions from affecting the + * host, overwrite all entries in the RSB upon exiting a guest. */ - VM_INSTRUCTION_ERROR(%eax) + movl $16, %ecx /* 16 iterations, two calls per loop */ + movq %rsp, %rax +loop: + call 2f /* create an RSB entry. */ +1: + pause + call 1b /* capture rogue speculation. */ +2: + call 2f /* create an RSB entry. */ +1: + pause + call 1b /* capture rogue speculation. 
*/ +2: + subl $1, %ecx + jnz loop + movq %rax, %rsp - /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */ - movq %rsp,%rdi - movq $VMX_RETURN_VMLAUNCH,%rsi + /* + * This will return to the caller of 'vmx_enter_guest()' with a return + * value of VMX_GUEST_VMEXIT. + */ + movl $VMX_GUEST_VMEXIT, %eax + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(vmx_exit_guest_flush_rsb) + +/* + * %rdi = trapno + * + * We need to do enough to convince cmnint - and its iretting tail - that we're + * a legit interrupt stack frame. + */ +ENTRY_NP(vmx_call_isr) + pushq %rbp + movq %rsp, %rbp + movq %rsp, %r11 + andq $~0xf, %rsp /* align stack */ + pushq $KDS_SEL /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KCS_SEL /* %cs */ + leaq .iret_dest(%rip), %rcx + pushq %rcx /* %rip */ + pushq $0 /* err */ + pushq %rdi /* trapno */ + cli + jmp cmnint /* %rip (and call) */ +.iret_dest: + popq %rbp + ret +SET_SIZE(vmx_call_isr) - addq $VMXCTX_TMPSTKTOP,%rsp - callq vmx_return -END(vmx_launch) +#endif /* lint */ diff --git a/usr/src/uts/i86pc/io/vmm/intel/vtd.c b/usr/src/uts/i86pc/io/vmm/intel/vtd.c new file mode 100644 index 0000000000..9474b30fc6 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vtd.c @@ -0,0 +1,690 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include + +#include + +#include +#include + +#include "io/iommu.h" + +/* + * Documented in the "Intel Virtualization Technology for Directed I/O", + * Architecture Spec, September 2008. 
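One more note on vmx_support.s: vmx_call_isr() above builds, by hand, exactly the frame a real interrupt would have pushed before entering cmnint. Read upward from the final %rsp, the image looks like this hypothetical struct (shown only to document the layout):

#include <stdint.h>

struct fake_intr_frame {
	uint64_t trapno;	/* pushed last, consumed by cmnint */
	uint64_t err;		/* always 0 here */
	uint64_t rip;		/* .iret_dest, where iretq resumes */
	uint64_t cs;		/* KCS_SEL */
	uint64_t rflags;
	uint64_t rsp;		/* pre-alignment stack pointer */
	uint64_t ss;		/* KDS_SEL */
};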
+ */ + +/* Section 10.4 "Register Descriptions" */ +struct vtdmap { + volatile uint32_t version; + volatile uint32_t res0; + volatile uint64_t cap; + volatile uint64_t ext_cap; + volatile uint32_t gcr; + volatile uint32_t gsr; + volatile uint64_t rta; + volatile uint64_t ccr; +}; + +#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) +#define VTD_CAP_ND(cap) ((cap) & 0x7) +#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) +#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) +#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) + +#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) +#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) +#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) + +#define VTD_GCR_WBF (1 << 27) +#define VTD_GCR_SRTP (1 << 30) +#define VTD_GCR_TE (1U << 31) + +#define VTD_GSR_WBFS (1 << 27) +#define VTD_GSR_RTPS (1 << 30) +#define VTD_GSR_TES (1U << 31) + +#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ +#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ + +#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ +#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ +#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ +#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ +#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ +#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ +#define VTD_IIR_DOMAIN_P 32 + +#define VTD_ROOT_PRESENT 0x1 +#define VTD_CTX_PRESENT 0x1 +#define VTD_CTX_TT_ALL (1UL << 2) + +#define VTD_PTE_RD (1UL << 0) +#define VTD_PTE_WR (1UL << 1) +#define VTD_PTE_SUPERPAGE (1UL << 7) +#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) + +#define VTD_RID2IDX(rid) (((rid) & 0xff) * 2) + +struct domain { + uint64_t *ptp; /* first level page table page */ + int pt_levels; /* number of page table levels */ + int addrwidth; /* 'AW' field in context entry */ + int spsmask; /* supported super page sizes */ + u_int id; /* domain id */ + vm_paddr_t maxaddr; /* highest address to be mapped */ + SLIST_ENTRY(domain) next; +}; + +static SLIST_HEAD(, domain) domhead; + +#define DRHD_MAX_UNITS 8 +static int drhd_num; +static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); + +static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); +static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); + +static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); + +static int +vtd_max_domains(struct vtdmap *vtdmap) +{ + int nd; + + nd = VTD_CAP_ND(vtdmap->cap); + + switch (nd) { + case 0: + return (16); + case 1: + return (64); + case 2: + return (256); + case 3: + return (1024); + case 4: + return (4 * 1024); + case 5: + return (16 * 1024); + case 6: + return (64 * 1024); + default: + panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); + } +} + +static u_int +domain_id(void) +{ + u_int id; + struct domain *dom; + + /* Skip domain id 0 - it is reserved when Caching Mode field is set */ + for (id = 1; id < max_domains; id++) { + SLIST_FOREACH(dom, &domhead, next) { + if (dom->id == id) + break; + } + if (dom == NULL) + break; /* found it */ + } + + if (id >= max_domains) + panic("domain ids exhausted"); + + return (id); +} + +static void +vtd_wbflush(struct vtdmap *vtdmap) +{ + + if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) + pmap_invalidate_cache(); + + if (VTD_CAP_RWBF(vtdmap->cap)) { + vtdmap->gcr = VTD_GCR_WBF; + while ((vtdmap->gsr & VTD_GSR_WBFS) != 0) + ; + } +} + +static void 
+vtd_ctx_global_invalidate(struct vtdmap *vtdmap) +{ + + vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; + while ((vtdmap->ccr & VTD_CCR_ICC) != 0) + ; +} + +static void +vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) +{ + int offset; + volatile uint64_t *iotlb_reg, val; + + vtd_wbflush(vtdmap); + + offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; + iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); + + *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL | + VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; + + while (1) { + val = *iotlb_reg; + if ((val & VTD_IIR_IVT) == 0) + break; + } +} + +static void +vtd_translation_enable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = VTD_GCR_TE; + while ((vtdmap->gsr & VTD_GSR_TES) == 0) + ; +} + +static void +vtd_translation_disable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = 0; + while ((vtdmap->gsr & VTD_GSR_TES) != 0) + ; +} + +static int +vtd_init(void) +{ + int i, units, remaining; + struct vtdmap *vtdmap; + vm_paddr_t ctx_paddr; + char *end, envname[32]; + unsigned long mapaddr; + ACPI_STATUS status; + ACPI_TABLE_DMAR *dmar; + ACPI_DMAR_HEADER *hdr; + ACPI_DMAR_HARDWARE_UNIT *drhd; + + /* + * Allow the user to override the ACPI DMAR table by specifying the + * physical address of each remapping unit. + * + * The following example specifies two remapping units at + * physical addresses 0xfed90000 and 0xfeda0000 respectively. + * set vtd.regmap.0.addr=0xfed90000 + * set vtd.regmap.1.addr=0xfeda0000 + */ + for (units = 0; units < DRHD_MAX_UNITS; units++) { + snprintf(envname, sizeof(envname), "vtd.regmap.%d.addr", units); + if (getenv_ulong(envname, &mapaddr) == 0) + break; + vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr); + } + + if (units > 0) + goto skip_dmar; + + /* Search for DMAR table. */ + status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar); + if (ACPI_FAILURE(status)) + return (ENXIO); + + end = (char *)dmar + dmar->Header.Length; + remaining = dmar->Header.Length - sizeof(ACPI_TABLE_DMAR); + while (remaining > sizeof(ACPI_DMAR_HEADER)) { + hdr = (ACPI_DMAR_HEADER *)(end - remaining); + if (hdr->Length > remaining) + break; + /* + * From Intel VT-d arch spec, version 1.3: + * BIOS implementations must report mapping structures + * in numerical order, i.e. All remapping structures of + * type 0 (DRHD) enumerated before remapping structures of + * type 1 (RMRR) and so forth. 
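Back in vtd_max_domains() earlier in this file: the ND capability field encodes the number of supported domain ids as 16 * 4^nd, which is what the case-by-case switch spells out. A compact equivalent (nd == 7 is reserved, hence the panic in the original):

#include <stdint.h>

static int
vtd_max_domains_sketch(uint64_t cap)
{
	int nd = cap & 0x7;	/* VTD_CAP_ND; caller rejects nd == 7 */

	return (16 << (2 * nd));	/* 16, 64, 256, ... 64K */
}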
+ */ + if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT) + break; + + drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr; + vtdmaps[units++] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address); + if (units >= DRHD_MAX_UNITS) + break; + remaining -= hdr->Length; + } + + if (units <= 0) + return (ENXIO); + +skip_dmar: + drhd_num = units; + vtdmap = vtdmaps[0]; + + if (VTD_CAP_CM(vtdmap->cap) != 0) + panic("vtd_init: invalid caching mode"); + + max_domains = vtd_max_domains(vtdmap); + + /* + * Set up the root-table to point to the context-entry tables + */ + for (i = 0; i < 256; i++) { + ctx_paddr = vtophys(ctx_tables[i]); + if (ctx_paddr & PAGE_MASK) + panic("ctx table (0x%0lx) not page aligned", ctx_paddr); + + root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT; + } + + return (0); +} + +static void +vtd_cleanup(void) +{ +} + +static void +vtd_enable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_wbflush(vtdmap); + + /* Update the root table address */ + vtdmap->rta = vtophys(root_table); + vtdmap->gcr = VTD_GCR_SRTP; + while ((vtdmap->gsr & VTD_GSR_RTPS) == 0) + ; + + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + + vtd_translation_enable(vtdmap); + } +} + +static void +vtd_disable(void) +{ + int i; + struct vtdmap *vtdmap; + + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_translation_disable(vtdmap); + } +} + +static void +vtd_add_device(void *arg, uint16_t rid) +{ + int idx; + uint64_t *ctxp; + struct domain *dom = arg; + vm_paddr_t pt_paddr; + struct vtdmap *vtdmap; + uint8_t bus; + + vtdmap = vtdmaps[0]; + bus = PCI_RID2BUS(rid); + ctxp = ctx_tables[bus]; + pt_paddr = vtophys(dom->ptp); + idx = VTD_RID2IDX(rid); + + if (ctxp[idx] & VTD_CTX_PRESENT) { + panic("vtd_add_device: device %x is already owned by " + "domain %d", rid, + (uint16_t)(ctxp[idx + 1] >> 8)); + } + + /* + * Order is important. The 'present' bit is set only after all fields + * of the context pointer are initialized. + */ + ctxp[idx + 1] = dom->addrwidth | (dom->id << 8); + + if (VTD_ECAP_DI(vtdmap->ext_cap)) + ctxp[idx] = VTD_CTX_TT_ALL; + else + ctxp[idx] = 0; + + ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT; + + /* + * 'Not Present' entries are not cached in either the Context Cache + * or in the IOTLB, so there is no need to invalidate either of them. + */ +} + +static void +vtd_remove_device(void *arg, uint16_t rid) +{ + int i, idx; + uint64_t *ctxp; + struct vtdmap *vtdmap; + uint8_t bus; + + bus = PCI_RID2BUS(rid); + ctxp = ctx_tables[bus]; + idx = VTD_RID2IDX(rid); + + /* + * Order is important. The 'present' bit must be cleared first. + */ + ctxp[idx] = 0; + ctxp[idx + 1] = 0; + + /* + * Invalidate the Context Cache and the IOTLB.
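The two "order is important" comments above are the crux of vtd_add_device() and vtd_remove_device(): the hardware may walk a context entry the instant its present bit becomes visible, so the entry is fully populated before that bit is set, and the bit is cleared before the rest is torn down. The add-side ordering, sketched with mirrored constants:

#include <stdbool.h>
#include <stdint.h>

#define SK_CTX_PRESENT	0x1
#define SK_CTX_TT_ALL	(1UL << 2)

static void
ctx_entry_set_sketch(volatile uint64_t *ctxp, int idx, uint64_t pt_paddr,
    int addrwidth, uint16_t domid, bool pass_all)
{
	/* High quadword first: address width in bits 0-2, domain id at 8. */
	ctxp[idx + 1] = (uint64_t)addrwidth | ((uint64_t)domid << 8);

	/* The present bit lands in the same store as the PTP and TT bits. */
	ctxp[idx] = (pass_all ? SK_CTX_TT_ALL : 0) | pt_paddr | SK_CTX_PRESENT;
}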
+ * + * XXX use device-selective invalidation for Context Cache + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + } +} + +#define CREATE_MAPPING 0 +#define REMOVE_MAPPING 1 + +static uint64_t +vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, + int remove) +{ + struct domain *dom; + int i, spshift, ptpshift, ptpindex, nlevels; + uint64_t spsize, *ptp; + + dom = arg; + ptpindex = 0; + ptpshift = 0; + + KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__, + gpa, len)); + KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond " + "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr)); + + if (gpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); + + if (hpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); + + if (len & PAGE_MASK) + panic("vtd_create_mapping: unaligned len 0x%0lx", len); + + /* + * Compute the size of the mapping that we can accommodate. + * + * This is based on three factors: + * - supported super page size + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = 48; + for (i = 3; i >= 0; i--) { + spsize = 1UL << spshift; + if ((dom->spsmask & (1 << i)) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + (len >= spsize)) { + break; + } + spshift -= 9; + } + + ptp = dom->ptp; + nlevels = dom->pt_levels; + while (--nlevels >= 0) { + ptpshift = 12 + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) { + break; + } + + /* + * We are working on a non-leaf page table page. + * + * Create a downstream page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; + } + + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) + panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); + + /* + * Update the 'gpa' -> 'hpa' mapping + */ + if (remove) { + ptp[ptpindex] = 0; + } else { + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + } + + return (1UL << ptpshift); +} + +static uint64_t +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING)); +} + +static uint64_t +vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING)); +} + +static void +vtd_invalidate_tlb(void *dom) +{ + int i; + struct vtdmap *vtdmap; + + /* + * Invalidate the IOTLB. + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_iotlb_global_invalidate(vtdmap); + } +} + +static void * +vtd_create_domain(vm_paddr_t maxaddr) +{ + struct domain *dom; + vm_paddr_t addr; + int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; + struct vtdmap *vtdmap; + + if (drhd_num <= 0) + panic("vtd_create_domain: no dma remapping hardware available"); + + vtdmap = vtdmaps[0]; + + /* + * Calculate AGAW. + * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. 
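A worked example for the AGAW rounding that follows: the rounding pushes the guest address width up to the next 12 + 9k boundary so it maps onto whole 9-bit page-table levels (30 bits = 2 levels, 39 = 3, 48 = 4). A 4 GiB guest leaves the loop with gaw 33 (the post-increment exits one past the satisfying shift), and the rounding itself is:

static int
agaw_round_sketch(int gaw)
{
	int res = (gaw - 12) % 9;

	return (res == 0 ? gaw : gaw + 9 - res);
}

agaw_round_sketch(33) == 39, so such a guest gets a 3-level table.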
+ */ + addr = 0; + for (gaw = 0; addr < maxaddr; gaw++) + addr = 1ULL << gaw; + + res = (gaw - 12) % 9; + if (res == 0) + agaw = gaw; + else + agaw = gaw + 9 - res; + + if (agaw > 64) + agaw = 64; + + /* + * Select the smallest Supported AGAW and the corresponding number + * of page table levels. + */ + pt_levels = 2; + sagaw = 30; + addrwidth = 0; + tmp = VTD_CAP_SAGAW(vtdmap->cap); + for (i = 0; i < 5; i++) { + if ((tmp & (1 << i)) != 0 && sagaw >= agaw) + break; + pt_levels++; + addrwidth++; + sagaw += 9; + if (sagaw > 64) + sagaw = 64; + } + + if (i >= 5) { + panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d", + VTD_CAP_SAGAW(vtdmap->cap), agaw); + } + + dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); + dom->pt_levels = pt_levels; + dom->addrwidth = addrwidth; + dom->id = domain_id(); + dom->maxaddr = maxaddr; + dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); + if ((uintptr_t)dom->ptp & PAGE_MASK) + panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); + +#ifdef notyet + /* + * XXX superpage mappings for the iommu do not work correctly. + * + * By default all physical memory is mapped into the host_domain. + * When a VM is allocated wired memory the pages belonging to it + * are removed from the host_domain and added to the vm's domain. + * + * If the page being removed was mapped using a superpage mapping + * in the host_domain then we need to demote the mapping before + * removing the page. + * + * There is not any code to deal with the demotion at the moment + * so we disable superpage mappings altogether. + */ + dom->spsmask = VTD_CAP_SPS(vtdmap->cap); +#endif + + SLIST_INSERT_HEAD(&domhead, dom, next); + + return (dom); +} + +static void +vtd_free_ptp(uint64_t *ptp, int level) +{ + int i; + uint64_t *nlp; + + if (level > 1) { + for (i = 0; i < 512; i++) { + if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) + continue; + if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) + continue; + nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); + vtd_free_ptp(nlp, level - 1); + } + } + + bzero(ptp, PAGE_SIZE); + free(ptp, M_VTD); +} + +static void +vtd_destroy_domain(void *arg) +{ + struct domain *dom; + + dom = arg; + + SLIST_REMOVE(&domhead, dom, domain, next); + vtd_free_ptp(dom->ptp, dom->pt_levels); + free(dom, M_VTD); +} + +struct iommu_ops iommu_ops_intel = { + vtd_init, + vtd_cleanup, + vtd_enable, + vtd_disable, + vtd_create_domain, + vtd_destroy_domain, + vtd_create_mapping, + vtd_remove_mapping, + vtd_add_device, + vtd_remove_device, + vtd_invalidate_tlb, +}; diff --git a/usr/src/uts/i86pc/io/vmm/io/iommu.h b/usr/src/uts/i86pc/io/vmm/io/iommu.h new file mode 100644 index 0000000000..f8003a5d45 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/iommu.h @@ -0,0 +1,76 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_IOMMU_H_ +#define _IO_IOMMU_H_ + +typedef int (*iommu_init_func_t)(void); +typedef void (*iommu_cleanup_func_t)(void); +typedef void (*iommu_enable_func_t)(void); +typedef void (*iommu_disable_func_t)(void); +typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); +typedef void (*iommu_destroy_domain_t)(void *domain); +typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t len); +typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa, + uint64_t len); +typedef void (*iommu_add_device_t)(void *domain, uint16_t rid); +typedef void (*iommu_remove_device_t)(void *dom, uint16_t rid); +typedef void (*iommu_invalidate_tlb_t)(void *dom); + +struct iommu_ops { + iommu_init_func_t init; /* module wide */ + iommu_cleanup_func_t cleanup; + iommu_enable_func_t enable; + iommu_disable_func_t disable; + + iommu_create_domain_t create_domain; /* domain-specific */ + iommu_destroy_domain_t destroy_domain; + iommu_create_mapping_t create_mapping; + iommu_remove_mapping_t remove_mapping; + iommu_add_device_t add_device; + iommu_remove_device_t remove_device; + iommu_invalidate_tlb_t invalidate_tlb; +}; + +extern struct iommu_ops iommu_ops_intel; +extern struct iommu_ops iommu_ops_amd; + +void iommu_cleanup(void); +void *iommu_host_domain(void); +void *iommu_create_domain(vm_paddr_t maxaddr); +void iommu_destroy_domain(void *dom); +void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, + size_t len); +void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len); +void iommu_add_device(void *dom, uint16_t rid); +void iommu_remove_device(void *dom, uint16_t rid); +void iommu_invalidate_tlb(void *domain); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.h b/usr/src/uts/i86pc/io/vmm/io/ppt.h new file mode 100644 index 0000000000..686b15db49 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.h @@ -0,0 +1,56 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
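The iommu_ops structure above is a plain operations vtable: a shared iommu layer (not part of this hunk) is assumed to pick iommu_ops_intel or iommu_ops_amd once at attach and funnel every per-domain call through the pointer, roughly:

static struct iommu_ops *ops = &iommu_ops_intel;	/* assumed selection */

void *
iommu_create_domain_sketch(vm_paddr_t maxaddr)
{
	return ((*ops->create_domain)(maxaddr));
}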
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_PPT_H_ +#define _IO_PPT_H_ + +int ppt_unassign_all(struct vm *vm); +int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + uint64_t addr, uint64_t msg, int numvec); +int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, + int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int ppt_assigned_devices(struct vm *vm); +boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); + +/* + * Returns the number of devices sequestered by the ppt driver for assignment + * to virtual machines. + */ +int ppt_avail_devices(void); + +/* + * The following functions should never be called directly. + * Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead. + */ +int ppt_assign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/sol_iommu.c b/usr/src/uts/i86pc/io/vmm/io/sol_iommu.c new file mode 100644 index 0000000000..989e88e17b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/sol_iommu.c @@ -0,0 +1,86 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include +#include +#include + +/* + * IOMMU Stub + * + * Until proper iommu support can be wired into bhyve, stub out all the + * functions to either fail, if reasonable, or panic. 
+ */ + +void +iommu_cleanup(void) +{ +} + +void * +iommu_host_domain(void) +{ + return (NULL); +} + +/*ARGSUSED*/ +void * +iommu_create_domain(vm_paddr_t maxaddr) +{ + return (NULL); +} + +/*ARGSUSED*/ +void +iommu_destroy_domain(void *dom) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_add_device(void *dom, uint16_t rid) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_remove_device(void *dom, uint16_t rid) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_invalidate_tlb(void *domain) +{ + panic("unimplemented"); +} + diff --git a/usr/src/uts/i86pc/io/vmm/io/sol_ppt.c b/usr/src/uts/i86pc/io/vmm/io/sol_ppt.c new file mode 100644 index 0000000000..9d5b1f5cdc --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/sol_ppt.c @@ -0,0 +1,92 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include +#include +#include + +#include + +/* + * PCI Pass-Through Stub + * + * Until proper passthrough support can be wired into bhyve, stub out all the + * functions to either fail or no-op. + */ + +int +ppt_unassign_all(struct vm *vm) +{ + return (0); +} + +/*ARGSUSED*/ +int +ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, + size_t len, vm_paddr_t hpa) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +int +ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + uint64_t addr, uint64_t msg, int numvec) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +int +ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, int idx, + uint64_t addr, uint64_t msg, uint32_t vector_control) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +int +ppt_assigned_devices(struct vm *vm) +{ + return (0); +} + +/*ARGSUSED*/ +boolean_t +ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) +{ + return (B_FALSE); +} + +/*ARGSUSED*/ +int +ppt_avail_devices(void) +{ + return (0); +} + +/*ARGSUSED*/ +int +ppt_assign_device(struct vm *vm, int bus, int slot, int func) +{ + return (ENOENT); +} + +/*ARGSUSED*/ +int +ppt_unassign_device(struct vm *vm, int bus, int slot, int func) +{ + return (ENXIO); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.c b/usr/src/uts/i86pc/io/vmm/io/vatpic.c index a93b252c91..ba4cd7785e 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. 
* @@ -25,12 +27,11 @@ */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vatpic.c 279683 2015-03-06 02:05:45Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.h b/usr/src/uts/i86pc/io/vmm/io/vatpic.h index ef5e51b158..d4a1be1820 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpic.h +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.h @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vatpic.h 273706 2014-10-26 19:03:06Z neel $ + * $FreeBSD$ */ #ifndef _VATPIC_H_ diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.c b/usr/src/uts/i86pc/io/vmm/io/vatpit.c index ce17bdc92c..9b3e7376d5 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpit.c +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.c @@ -1,4 +1,5 @@ /*- + * Copyright (c) 2018 Joyent, Inc. * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2011 NetApp, Inc. * All rights reserved. @@ -26,12 +27,11 @@ */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vatpit.c 273706 2014-10-26 19:03:06Z neel $"); +__FBSDID("$FreeBSD$"); #include #include #include -#include #include #include #include @@ -79,7 +79,7 @@ struct vatpit_callout_arg { struct channel { int mode; uint16_t initial; /* initial counter value */ - sbintime_t now_sbt; /* uptime when counter was loaded */ + struct bintime now_bt; /* uptime when counter was loaded */ uint8_t cr[2]; uint8_t ol[2]; bool slatched; /* status latched */ @@ -88,7 +88,7 @@ struct channel { int olbyte; int frbyte; struct callout callout; - sbintime_t callout_sbt; /* target time */ + struct bintime callout_bt; /* target time */ struct vatpit_callout_arg callout_arg; }; @@ -96,26 +96,41 @@ struct vatpit { struct vm *vm; struct mtx mtx; - sbintime_t freq_sbt; + struct bintime freq_bt; struct channel channel[3]; }; static void pit_timer_start_cntr0(struct vatpit *vatpit); +static uint64_t +vatpit_delta_ticks(struct vatpit *vatpit, struct channel *c) +{ + struct bintime delta; + uint64_t result; + + binuptime(&delta); + bintime_sub(&delta, &c->now_bt); + + result = delta.sec * PIT_8254_FREQ; + result += delta.frac / vatpit->freq_bt.frac; + + return (result); +} + static int vatpit_get_out(struct vatpit *vatpit, int channel) { struct channel *c; - sbintime_t delta_ticks; + uint64_t delta_ticks; int out; c = &vatpit->channel[channel]; switch (c->mode) { case TIMER_INTTC: - delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; - out = ((c->initial - delta_ticks) <= 0); + delta_ticks = vatpit_delta_ticks(vatpit, c); + out = (delta_ticks >= c->initial); break; default: out = 0; @@ -165,24 +180,28 @@ static void pit_timer_start_cntr0(struct vatpit *vatpit) { struct channel *c; - sbintime_t now, delta, precision; c = &vatpit->channel[0]; if (c->initial != 0) { - delta = c->initial * vatpit->freq_sbt; - precision = delta >> tc_precexp; - c->callout_sbt = c->callout_sbt + delta; + sbintime_t precision; + struct bintime now, delta; + + delta.sec = 0; + delta.frac = vatpit->freq_bt.frac * c->initial; + bintime_add(&c->callout_bt, &delta); + precision = bttosbt(delta) >> tc_precexp; /* - * Reset 'callout_sbt' if the time that the callout - * was supposed to fire is more than 'c->initial' - * ticks in the past. + * Reset 'callout_bt' if the time that the callout was supposed + * to fire is more than 'c->initial' ticks in the past. 
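vatpit_delta_ticks() above carries the sbintime_t-to-bintime conversion this hunk is about: FREQ2BT(PIT_8254_FREQ, &freq_bt) leaves one PIT period in freq_bt.frac (units of 2^-64 s), so each whole elapsed second contributes PIT_8254_FREQ ticks and the fractional part divides out directly. Restated with the constant written in (assuming the conventional 1193182 Hz i8254 clock):

#include <stdint.h>

struct bintime_sk {
	uint64_t sec;
	uint64_t frac;	/* units of 2^-64 seconds */
};

static uint64_t
pit_ticks_sketch(const struct bintime_sk *delta, const struct bintime_sk *freq)
{
	/* freq->frac holds 2^64 / 1193182, i.e. one PIT period. */
	return (delta->sec * 1193182ULL + delta->frac / freq->frac);
}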
*/ - now = sbinuptime(); - if (c->callout_sbt < now) - c->callout_sbt = now + delta; + binuptime(&now); + if (bintime_cmp(&c->callout_bt, &now, <)) { + c->callout_bt = now; + bintime_add(&c->callout_bt, &delta); + } - callout_reset_sbt(&c->callout, c->callout_sbt, + callout_reset_sbt(&c->callout, bttosbt(c->callout_bt), precision, vatpit_callout_handler, &c->callout_arg, C_ABSOLUTE); } @@ -192,7 +211,7 @@ static uint16_t pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) { uint16_t lval; - sbintime_t delta_ticks; + uint64_t delta_ticks; /* cannot latch a new value until the old one has been consumed */ if (latch && c->olbyte != 0) @@ -208,12 +227,11 @@ pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) * here. */ c->initial = TIMER_DIV(PIT_8254_FREQ, 100); - c->now_sbt = sbinuptime(); + binuptime(&c->now_bt); c->status &= ~TIMER_STS_NULLCNT; } - delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; - + delta_ticks = vatpit_delta_ticks(vatpit, c); lval = c->initial - delta_ticks % c->initial; if (latch) { @@ -384,10 +402,10 @@ vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, c->frbyte = 0; c->crbyte = 0; c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8; - c->now_sbt = sbinuptime(); + binuptime(&c->now_bt); /* Start an interval timer for channel 0 */ if (port == TIMER_CNTR0) { - c->callout_sbt = c->now_sbt; + c->callout_bt = c->now_bt; pit_timer_start_cntr0(vatpit); } if (c->initial == 0) @@ -424,7 +442,6 @@ struct vatpit * vatpit_init(struct vm *vm) { struct vatpit *vatpit; - struct bintime bt; struct vatpit_callout_arg *arg; int i; @@ -433,11 +450,10 @@ vatpit_init(struct vm *vm) mtx_init(&vatpit->mtx, "vatpit lock", NULL, MTX_SPIN); - FREQ2BT(PIT_8254_FREQ, &bt); - vatpit->freq_sbt = bttosbt(bt); + FREQ2BT(PIT_8254_FREQ, &vatpit->freq_bt); for (i = 0; i < 3; i++) { - callout_init(&vatpit->channel[i].callout, true); + callout_init(&vatpit->channel[i].callout, 1); arg = &vatpit->channel[i].callout_arg; arg->vatpit = vatpit; arg->channel_num = i; @@ -456,3 +472,16 @@ vatpit_cleanup(struct vatpit *vatpit) free(vatpit, M_VATPIT); } + +#ifndef __FreeBSD__ +void +vatpit_localize_resources(struct vatpit *vatpit) +{ + for (uint_t i = 0; i < 3; i++) { + /* Only localize channels which might be running */ + if (vatpit->channel[i].mode != 0) { + vmm_glue_callout_localize(&vatpit->channel[i].callout); + } + } +} +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.h b/usr/src/uts/i86pc/io/vmm/io/vatpit.h index f20ad73e47..4bf9fe048d 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpit.h +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * Copyright (c) 2011 NetApp, Inc. * All rights reserved. @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/sys/amd64/vmm/io/vatpit.h 273706 2014-10-26 19:03:06Z neel $ + * $FreeBSD$ */ #ifndef _VATPIT_H_ @@ -42,4 +44,8 @@ int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); +#ifndef __FreeBSD__ +void vatpit_localize_resources(struct vatpit *); +#endif + #endif /* _VATPIT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.c b/usr/src/uts/i86pc/io/vmm/io/vhpet.c index 25f6013da0..c82b4626bd 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vhpet.c +++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. @@ -24,11 +26,15 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z tychon $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include @@ -36,7 +42,6 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z ty #include #include #include -#include #include @@ -52,7 +57,7 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z ty static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet"); -#define HPET_FREQ 10000000 /* 10.0 Mhz */ +#define HPET_FREQ 16777216 /* 16.7 (2^24) Mhz */ #define FS_PER_S 1000000000000000ul /* Timer N Configuration and Capabilities Register */ @@ -104,7 +109,6 @@ vhpet_capabilities(void) uint64_t cap = 0; cap |= 0x8086 << 16; /* vendor id */ - cap |= HPET_CAP_LEG_RT; /* legacy routing capable */ cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */ cap |= 1; /* revision */ cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */ @@ -127,15 +131,6 @@ vhpet_timer_msi_enabled(struct vhpet *vhpet, int n) { const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN; - /* - * LegacyReplacement Route configuration takes precedence over MSI - * for timers 0 and 1. - */ - if (n == 0 || n == 1) { - if (vhpet->config & HPET_CNF_LEG_RT) - return (false); - } - if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable) return (true); else @@ -152,41 +147,9 @@ vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n) if (vhpet_timer_msi_enabled(vhpet, n)) return (0); - if (vhpet->config & HPET_CNF_LEG_RT) { - /* - * In "legacy routing" timers 0 and 1 are connected to - * ioapic pins 2 and 8 respectively. - */ - switch (n) { - case 0: - return (2); - case 1: - return (8); - } - } - return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9); } -static __inline int -vhpet_timer_atpic_pin(struct vhpet *vhpet, int n) -{ - if (vhpet->config & HPET_CNF_LEG_RT) { - /* - * In "legacy routing" timers 0 and 1 are connected to - * 8259 master pin 0 and slave pin 0 respectively. - */ - switch (n) { - case 0: - return (0); - case 1: - return (8); - } - } - - return (-1); -} - static uint32_t vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) { @@ -211,7 +174,7 @@ vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) /* * The sbinuptime corresponding to the 'countbase' is * meaningless when the counter is disabled. Make sure - * that the the caller doesn't want to use it. + * that the caller doesn't want to use it. 
*/ KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL")); } @@ -221,17 +184,12 @@ vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) static void vhpet_timer_clear_isr(struct vhpet *vhpet, int n) { - int pin, legacy_pin; + int pin; if (vhpet->isr & (1 << n)) { pin = vhpet_timer_ioapic_pin(vhpet, n); KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n)); vioapic_deassert_irq(vhpet->vm, pin); - - legacy_pin = vhpet_timer_atpic_pin(vhpet, n); - if (legacy_pin != -1) - vatpic_deassert_irq(vhpet->vm, legacy_pin); - vhpet->isr &= ~(1 << n); } } @@ -257,12 +215,6 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n) KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: " "timer %d is using MSI", n)); - /* The legacy replacement interrupts are always edge triggered */ - if (vhpet->config & HPET_CNF_LEG_RT) { - if (n == 0 || n == 1) - return (true); - } - if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0) return (true); else @@ -272,7 +224,7 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n) static void vhpet_timer_interrupt(struct vhpet *vhpet, int n) { - int pin, legacy_pin; + int pin; /* If interrupts are not enabled for this timer then just return. */ if (!vhpet_timer_interrupt_enabled(vhpet, n)) @@ -298,17 +250,11 @@ vhpet_timer_interrupt(struct vhpet *vhpet, int n) return; } - legacy_pin = vhpet_timer_atpic_pin(vhpet, n); - if (vhpet_timer_edge_trig(vhpet, n)) { vioapic_pulse_irq(vhpet->vm, pin); - if (legacy_pin != -1) - vatpic_pulse_irq(vhpet->vm, legacy_pin); } else { vhpet->isr |= 1 << n; vioapic_assert_irq(vhpet->vm, pin); - if (legacy_pin != -1) - vatpic_assert_irq(vhpet->vm, legacy_pin); } } @@ -402,10 +348,6 @@ vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now) { sbintime_t delta, precision; - /* If interrupts are not enabled for this timer then just return. */ - if (!vhpet_timer_interrupt_enabled(vhpet, n)) - return; - if (vhpet->timer[n].comprate != 0) vhpet_adjust_compval(vhpet, n, counter); else { @@ -588,6 +530,13 @@ vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size, counter = vhpet_counter(vhpet, nowptr); oldval = vhpet->config; update_register(&vhpet->config, data, mask); + + /* + * LegacyReplacement Routing is not supported so clear the + * bit explicitly. + */ + vhpet->config &= ~HPET_CNF_LEG_RT; + if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { if (vhpet_counter_enabled(vhpet)) { vhpet_start_counting(vhpet); @@ -777,8 +726,10 @@ vhpet_init(struct vm *vm) vhpet->freq_sbt = bttosbt(bt); pincount = vioapic_pincount(vm); - if (pincount >= 24) - allowed_irqs = 0x00f00000; /* irqs 20, 21, 22 and 23 */ + if (pincount >= 32) + allowed_irqs = 0xff000000; /* irqs 24-31 */ + else if (pincount >= 20) + allowed_irqs = 0xf << (pincount - 4); /* 4 upper irqs */ else allowed_irqs = 0; @@ -819,3 +770,12 @@ vhpet_getcap(struct vm_hpet_cap *cap) cap->capabilities = vhpet_capabilities(); return (0); } +#ifndef __FreeBSD__ +void +vhpet_localize_resources(struct vhpet *vhpet) +{ + for (uint_t i = 0; i < VHPET_NUM_TIMERS; i++) { + vmm_glue_callout_localize(&vhpet->timer[i].callout); + } +} +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.h b/usr/src/uts/i86pc/io/vmm/io/vhpet.h index 868809d166..8e28241b32 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vhpet.h +++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. 
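On the allowed_irqs change in vhpet.c above: with the vIOAPIC grown to 32 pins, the HPET timers may now route to irqs 24-31, while the fallback arm reproduces the old behavior for smaller pin counts (a 24-pin vIOAPIC gives 0xf << 20 = 0x00f00000, exactly the irq 20-23 mask the deleted line hardcoded). As a standalone check:

#include <stdint.h>

static uint32_t
vhpet_allowed_irqs_sketch(int pincount)
{
	if (pincount >= 32)
		return (0xff000000u);			/* irqs 24-31 */
	if (pincount >= 20)
		return (0xfu << (pincount - 4));	/* top 4 irqs */
	return (0);					/* no routing */
}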
@@ -24,7 +26,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vhpet.h 258579 2013-11-25 19:04:51Z neel $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. */ #ifndef _VHPET_H_ @@ -41,4 +47,8 @@ int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val, int size, void *arg); int vhpet_getcap(struct vm_hpet_cap *cap); +#ifndef __FreeBSD__ +void vhpet_localize_resources(struct vhpet *vhpet); +#endif + #endif /* _VHPET_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.c b/usr/src/uts/i86pc/io/vmm/io/vioapic.c index 5adf5de16d..dbd3420420 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vioapic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -37,19 +39,20 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z neel $"); +__FBSDID("$FreeBSD$"); #include #include -#include #include #include #include #include #include +#include #include #include @@ -62,7 +65,7 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z #define IOREGSEL 0x00 #define IOWIN 0x10 -#define REDIR_ENTRIES 24 +#define REDIR_ENTRIES 32 #define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS)) struct vioapic { @@ -234,48 +237,139 @@ vioapic_pulse_irq(struct vm *vm, int irq) return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } +#define REDIR_IS_PHYS(reg) (((reg) & IOART_DESTMOD) == IOART_DESTPHY) +#define REDIR_IS_LOWPRIO(reg) (((reg) & IOART_DELMOD) == IOART_DELLOPRI) +/* Level-triggered interrupts only valid in fixed and low-priority modes */ +#define REDIR_IS_LVLTRIG(reg) \ + (((reg) & IOART_TRGRLVL) != 0 && \ + (((reg) & IOART_DELMOD) == IOART_DELFIXED || REDIR_IS_LOWPRIO(reg))) +#define REDIR_DEST(reg) ((reg) >> (32 + APIC_ID_SHIFT)) +#define REDIR_VECTOR(reg) ((reg) & IOART_INTVEC) + /* - * Reset the vlapic's trigger-mode register to reflect the ioapic pin - * configuration. + * Given a redirection entry, determine which vCPUs would be targeted. */ static void -vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg) +vioapic_calcdest(struct vioapic *vioapic, uint64_t redir_ent, cpuset_t *dmask) { - struct vioapic *vioapic; - struct vlapic *vlapic; - uint32_t low, high, dest; - int delmode, pin, vector; - bool level, phys; - vlapic = vm_lapic(vm, vcpuid); - vioapic = vm_ioapic(vm); + /* + * When calculating interrupt destinations with vlapic_calcdest(), the + * legacy xAPIC format is assumed, since the system lacks interrupt + * redirection hardware. + * See vlapic_deliver_intr() for more details. + */ + vlapic_calcdest(vioapic->vm, dmask, REDIR_DEST(redir_ent), + REDIR_IS_PHYS(redir_ent), REDIR_IS_LOWPRIO(redir_ent), false); +} + +/* + * Across all redirection entries utilizing a specified vector, determine the + * set of vCPUs which would be targeted by a level-triggered interrupt. 
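The REDIR_* macros above slice a 64-bit redirection table entry. For orientation, the fields they touch, with bit positions taken from the x86 I/O APIC definitions and APIC_ID_SHIFT assumed to be 24:

#include <stdbool.h>
#include <stdint.h>

static void
rte_decode_sketch(uint64_t rte, uint8_t *vec, bool *level, uint8_t *dest)
{
	*vec = rte & 0xff;		/* IOART_INTVEC */
	*level = (rte & 0x8000) != 0;	/* IOART_TRGRLVL, bit 15 */
	*dest = rte >> 56;		/* REDIR_DEST: bit 32 + 24 and up */
}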
+ */ +static void +vioapic_tmr_active(struct vioapic *vioapic, uint8_t vec, cpuset_t *result) +{ + u_int i; + + CPU_ZERO(result); + if (vec == 0) { + return; + } + + for (i = 0; i < REDIR_ENTRIES; i++) { + cpuset_t dest; + const uint64_t val = vioapic->rtbl[i].reg; + + if (!REDIR_IS_LVLTRIG(val) || REDIR_VECTOR(val) != vec) { + continue; + } + + CPU_ZERO(&dest); + vioapic_calcdest(vioapic, val, &dest); + CPU_OR(result, &dest); + } +} + +/* + * Update TMR state in vLAPICs after changes to vIOAPIC pin configuration + */ +static void +vioapic_update_tmrs(struct vioapic *vioapic, int vcpuid, uint64_t oldval, + uint64_t newval) +{ + cpuset_t active, allset, newset, oldset; + struct vm *vm; + uint8_t newvec, oldvec; + + vm = vioapic->vm; + CPU_ZERO(&allset); + CPU_ZERO(&newset); + CPU_ZERO(&oldset); + newvec = oldvec = 0; + + if (REDIR_IS_LVLTRIG(oldval)) { + vioapic_calcdest(vioapic, oldval, &oldset); + CPU_OR(&allset, &oldset); + oldvec = REDIR_VECTOR(oldval); + } + + if (REDIR_IS_LVLTRIG(newval)) { + vioapic_calcdest(vioapic, newval, &newset); + CPU_OR(&allset, &newset); + newvec = REDIR_VECTOR(newval); + } + + if (CPU_EMPTY(&allset) || + (CPU_CMP(&oldset, &newset) == 0 && oldvec == newvec)) { + return; + } - VIOAPIC_LOCK(vioapic); /* - * Reset all vectors to be edge-triggered. + * Since the write to the redirection table has already occurred, a + * scan of level-triggered entries referencing the old vector will find + * only entries which are now currently valid. */ - vlapic_reset_tmr(vlapic); - for (pin = 0; pin < REDIR_ENTRIES; pin++) { - low = vioapic->rtbl[pin].reg; - high = vioapic->rtbl[pin].reg >> 32; + vioapic_tmr_active(vioapic, oldvec, &active); - level = low & IOART_TRGRLVL ? true : false; - if (!level) + while (!CPU_EMPTY(&allset)) { + struct vlapic *vlapic; + u_int i; + + i = CPU_FFS(&allset) - 1; + CPU_CLR(i, &allset); + + if (oldvec == newvec && + CPU_ISSET(i, &oldset) && CPU_ISSET(i, &newset)) { continue; + } - /* - * For a level-triggered 'pin' let the vlapic figure out if - * an assertion on this 'pin' would result in an interrupt - * being delivered to it. If yes, then it will modify the - * TMR bit associated with this vector to level-triggered. - */ - phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); - delmode = low & IOART_DELMOD; - vector = low & IOART_INTVEC; - dest = high >> APIC_ID_SHIFT; - vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector); + if (i != vcpuid) { + vcpu_block_run(vm, i); + } + + vlapic = vm_lapic(vm, i); + if (CPU_ISSET(i, &oldset)) { + /* + * Perform the deassertion if no other level-triggered + * IOAPIC entries target this vCPU with the old vector + * + * Note: Sharing of vectors like that should be + * extremely rare in modern operating systems and was + * previously unsupported by the bhyve vIOAPIC. 
+ */ + if (!CPU_ISSET(i, &active)) { + vlapic_tmr_set(vlapic, oldvec, false); + } + } + if (CPU_ISSET(i, &newset)) { + vlapic_tmr_set(vlapic, newvec, true); + } + + if (i != vcpuid) { + vcpu_unblock_run(vm, i); + } } - VIOAPIC_UNLOCK(vioapic); } static uint32_t @@ -319,7 +413,6 @@ vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) uint64_t data64, mask64; uint64_t last, changed; int regnum, pin, lshift; - cpuset_t allvcpus; regnum = addr & 0xff; switch (regnum) { @@ -355,20 +448,15 @@ vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) /* * If any fields in the redirection table entry (except mask - * or polarity) have changed then rendezvous all the vcpus - * to update their vlapic trigger-mode registers. + * or polarity) have changed then update the trigger-mode + * registers on all the vlapics. */ changed = last ^ vioapic->rtbl[pin].reg; if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " "vlapic trigger-mode register", pin); - VIOAPIC_UNLOCK(vioapic); -#if 0 /* XXX */ - allvcpus = vm_active_cpus(vioapic->vm); - vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus, - vioapic_update_tmr, NULL); -#endif - VIOAPIC_LOCK(vioapic); + vioapic_update_tmrs(vioapic, vcpuid, last, + vioapic->rtbl[pin].reg); } /* diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.h b/usr/src/uts/i86pc/io/vmm/io/vioapic.h index 9479ebb10e..6bf3e80e05 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vioapic.h +++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Tycho Nightingale * Copyright (c) 2013 Neel Natu * All rights reserved. @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vioapic.h 258699 2013-11-27 22:18:08Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -45,10 +47,6 @@ #define VIOAPIC_BASE 0xFEC00000 #define VIOAPIC_SIZE 4096 -#include "vdev.h" - -struct vm; - struct vioapic *vioapic_init(struct vm *vm); void vioapic_cleanup(struct vioapic *vioapic); diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c index 9a0a3058ea..4e58249c8d 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,10 +39,11 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z neel $"); +__FBSDID("$FreeBSD$"); #include #include @@ -57,7 +61,6 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z n #include -#include "vmm_ipi.h" #include "vmm_lapic.h" #include "vmm_ktr.h" #include "vmm_stat.h" @@ -82,7 +85,15 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z n #define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) -#define VLAPIC_BUS_FREQ tsc_freq +/* + * APIC timer frequency: + * - arbitrary but chosen to be in the ballpark of contemporary hardware. + * - power-of-two to avoid loss of precision when converted to a bintime. + */ +#define VLAPIC_BUS_FREQ (128 * 1024 * 1024) + +static void vlapic_set_error(struct vlapic *, uint32_t, bool); +static void vlapic_tmr_reset(struct vlapic *); static __inline uint32_t vlapic_get_id(struct vlapic *vlapic) @@ -259,7 +270,6 @@ vlapic_dcr_write_handler(struct vlapic *vlapic) VLAPIC_TIMER_UNLOCK(vlapic); } - void vlapic_esr_write_handler(struct vlapic *vlapic) { @@ -287,7 +297,8 @@ vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) } if (vector < 16) { - vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR); + vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR, + false); VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", vector); return (1); @@ -449,20 +460,22 @@ vlapic_mask_lvts(struct vlapic *vlapic) } static int -vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt) +vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) { - uint32_t vec, mode; + uint32_t mode, reg, vec; - if (lvt & APIC_LVT_M) - return (0); + reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]); - vec = lvt & APIC_LVT_VECTOR; - mode = lvt & APIC_LVT_DM; + if (reg & APIC_LVT_M) + return (0); + vec = reg & APIC_LVT_VECTOR; + mode = reg & APIC_LVT_DM; switch (mode) { case APIC_LVT_DM_FIXED: if (vec < 16) { - vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, + lvt == APIC_LVT_ERROR); return (0); } if (vlapic_set_intr_ready(vlapic, vec, false)) @@ -566,6 +579,8 @@ vlapic_update_ppr(struct vlapic *vlapic) VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } +static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); + static void vlapic_process_eoi(struct vlapic *vlapic) { @@ -576,11 +591,7 @@ vlapic_process_eoi(struct vlapic *vlapic) isrptr = &lapic->isr0; tmrptr = &lapic->tmr0; - /* - * The x86 architecture reserves the the first 32 vectors for use - * by the processor. 
- */ - for (i = 7; i > 0; i--) { + for (i = 7; i >= 0; i--) { idx = i * 4; bitpos = fls(isrptr[idx]); if (bitpos-- != 0) { @@ -589,17 +600,21 @@ vlapic_process_eoi(struct vlapic *vlapic) vlapic->isrvec_stk_top); } isrptr[idx] &= ~(1 << bitpos); + vector = i * 32 + bitpos; + VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d", + vector); VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); vlapic->isrvec_stk_top--; vlapic_update_ppr(vlapic); if ((tmrptr[idx] & (1 << bitpos)) != 0) { - vector = i * 32 + bitpos; vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, vector); } return; } } + VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI"); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1); } static __inline int @@ -621,22 +636,22 @@ vlapic_periodic_timer(struct vlapic *vlapic) static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); -void -vlapic_set_error(struct vlapic *vlapic, uint32_t mask) +static void +vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error) { - uint32_t lvt; vlapic->esr_pending |= mask; - if (vlapic->esr_firing) + + /* + * Avoid infinite recursion if the error LVT itself is configured with + * an illegal vector. + */ + if (lvt_error) return; - vlapic->esr_firing = 1; - // The error LVT always uses the fixed delivery mode. - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); - if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { + if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1); } - vlapic->esr_firing = 0; } static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); @@ -644,13 +659,10 @@ static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); static void vlapic_fire_timer(struct vlapic *vlapic) { - uint32_t lvt; KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); - - // The timer LVT always uses the fixed delivery mode. 
- lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); - if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { + + if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) { VLAPIC_CTR0(vlapic, "vlapic timer fired"); vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); } @@ -662,10 +674,8 @@ static VMM_STAT(VLAPIC_INTR_CMC, void vlapic_fire_cmci(struct vlapic *vlapic) { - uint32_t lvt; - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); - if (vlapic_fire_lvt(vlapic, lvt)) { + if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1); } } @@ -676,7 +686,6 @@ static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, int vlapic_trigger_lvt(struct vlapic *vlapic, int vector) { - uint32_t lvt; if (vlapic_enabled(vlapic) == false) { /* @@ -699,35 +708,20 @@ vlapic_trigger_lvt(struct vlapic *vlapic, int vector) switch (vector) { case APIC_LVT_LINT0: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT0_LVT); - break; case APIC_LVT_LINT1: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT1_LVT); - break; case APIC_LVT_TIMER: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); - lvt |= APIC_LVT_DM_FIXED; - break; case APIC_LVT_ERROR: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); - lvt |= APIC_LVT_DM_FIXED; - break; case APIC_LVT_PMC: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_PERF_LVT); - break; case APIC_LVT_THERMAL: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_THERM_LVT); - break; case APIC_LVT_CMCI: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); + if (vlapic_fire_lvt(vlapic, vector)) { + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, + LVTS_TRIGGERRED, vector, 1); + } break; default: return (EINVAL); } - if (vlapic_fire_lvt(vlapic, lvt)) { - vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, - LVTS_TRIGGERRED, vector, 1); - } return (0); } @@ -831,11 +825,11 @@ vlapic_icrtmr_write_handler(struct vlapic *vlapic) /* * This function populates 'dmask' with the set of vcpus that match the * addressing specified by the (dest, phys, lowprio) tuple. - * + * * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) * or xAPIC (8-bit) destination field. */ -static void +void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, bool lowprio, bool x2apic_dest) { @@ -860,12 +854,12 @@ vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, */ CPU_ZERO(dmask); vcpuid = vm_apicid2vcpuid(vm, dest); - if (vcpuid < VM_MAXCPU) + if (vcpuid < vm_get_maxcpus(vm)) CPU_SET(vcpuid, dmask); } else { /* * In the "Flat Model" the MDA is interpreted as an 8-bit wide - * bitmask. This model is only avilable in the xAPIC mode. + * bitmask. This model is only available in the xAPIC mode. */ mda_flat_ldest = dest & 0xff; @@ -883,7 +877,7 @@ vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, /* * Logical mode: match each APIC that has a bit set - * in it's LDR that matches a bit in the ldest. + * in its LDR that matches a bit in the ldest. 
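+		 *
+		 * For example (flat model, matching semantics assumed
+		 * from the loop below): an MDA of 0x3 selects every vCPU
+		 * whose LDR has bit 0 or bit 1 set, so a single logical
+		 * destination may resolve to several vCPUs at once.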
*/ CPU_ZERO(dmask); amask = vm_active_cpus(vm); @@ -987,6 +981,7 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) struct vlapic *vlapic2; struct vm_exit *vmexit; struct LAPIC *lapic; + uint16_t maxcpus; lapic = vlapic->apic_page; lapic->icr_lo &= ~APIC_DELSTAT_PEND; @@ -1000,7 +995,7 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) mode = icrval & APIC_DELMODE_MASK; if (mode == APIC_DELMODE_FIXED && vec < 16) { - vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false); VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); return (0); } @@ -1048,11 +1043,12 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) return (0); /* handled completely in the kernel */ } + maxcpus = vm_get_maxcpus(vlapic->vm); if (mode == APIC_DELMODE_INIT) { if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) return (0); - if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { vlapic2 = vm_lapic(vlapic->vm, dest); /* move from INIT to waiting-for-SIPI state */ @@ -1065,7 +1061,7 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) } if (mode == APIC_DELMODE_STARTUP) { - if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { vlapic2 = vm_lapic(vlapic->vm, dest); /* @@ -1118,11 +1114,7 @@ vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) irrptr = &lapic->irr0; - /* - * The x86 architecture reserves the the first 32 vectors for use - * by the processor. - */ - for (i = 7; i > 0; i--) { + for (i = 7; i >= 0; i--) { idx = i * 4; val = atomic_load_acq_int(&irrptr[idx]); bitpos = fls(val); @@ -1461,7 +1453,7 @@ vlapic_reset(struct vlapic *vlapic) lapic->dfr = 0xffffffff; lapic->svr = APIC_SVR_VECTOR; vlapic_mask_lvts(vlapic); - vlapic_reset_tmr(vlapic); + vlapic_tmr_reset(vlapic); lapic->dcr_timer = 0; vlapic_dcr_write_handler(vlapic); @@ -1478,7 +1470,8 @@ void vlapic_init(struct vlapic *vlapic) { KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); - KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < VM_MAXCPU, + KASSERT(vlapic->vcpuid >= 0 && + vlapic->vcpuid < vm_get_maxcpus(vlapic->vm), ("vlapic_init: vcpuid is not initialized")); KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " "initialized")); @@ -1628,60 +1621,85 @@ vlapic_enabled(struct vlapic *vlapic) } static void -vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) +vlapic_tmr_reset(struct vlapic *vlapic) { struct LAPIC *lapic; - uint32_t *tmrptr, mask; - int idx; lapic = vlapic->apic_page; - tmrptr = &lapic->tmr0; - idx = (vector / 32) * 4; - mask = 1 << (vector % 32); - if (level) - tmrptr[idx] |= mask; - else - tmrptr[idx] &= ~mask; - - if (vlapic->ops.set_tmr != NULL) - (*vlapic->ops.set_tmr)(vlapic, vector, level); + lapic->tmr0 = lapic->tmr1 = lapic->tmr2 = lapic->tmr3 = 0; + lapic->tmr4 = lapic->tmr5 = lapic->tmr6 = lapic->tmr7 = 0; + vlapic->tmr_pending = 1; } +/* + * Synchronize TMR designations into the LAPIC state. + * The vCPU must be in the VCPU_RUNNING state. 
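+ *
+ * A sketch of the intended call site (vcpuid handling hypothetical):
+ * the vCPU applies its own pending TMR changes on the way back into
+ * the guest, e.g.
+ *
+ *	vlapic_tmr_update(vm_lapic(vm, vcpuid));
+ *
+ * immediately prior to interrupt injection.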
+ */ void -vlapic_reset_tmr(struct vlapic *vlapic) +vlapic_tmr_update(struct vlapic *vlapic) { - int vector; + struct LAPIC *lapic; + uint32_t *tmrptr; + uint32_t result[VLAPIC_TMR_CNT]; + u_int i, tmr_idx; + + if (vlapic->tmr_pending == 0) { + return; + } + + lapic = vlapic->apic_page; + tmrptr = &lapic->tmr0; + + VLAPIC_CTR0(vlapic, "synchronizing TMR"); + for (i = 0; i < VLAPIC_TMR_CNT; i++) { + tmr_idx = i * 4; - VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); + tmrptr[tmr_idx] &= ~vlapic->tmr_vec_deassert[i]; + tmrptr[tmr_idx] |= vlapic->tmr_vec_assert[i]; + vlapic->tmr_vec_deassert[i] = 0; + vlapic->tmr_vec_assert[i] = 0; + result[i] = tmrptr[tmr_idx]; + } + vlapic->tmr_pending = 0; - for (vector = 0; vector <= 255; vector++) - vlapic_set_tmr(vlapic, vector, false); + if (vlapic->ops.set_tmr != NULL) { + (*vlapic->ops.set_tmr)(vlapic, result); + } } +/* + * Designate the TMR state for a given interrupt vector. + * The caller must hold the vIOAPIC lock and prevent the vCPU corresponding to + * this vLAPIC instance from being-in or entering the VCPU_RUNNING state. + */ void -vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, - int delmode, int vector) +vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active) { - cpuset_t dmask; - bool lowprio; - - KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + const uint32_t idx = vector / 32; + const uint32_t mask = 1 << (vector % 32); + + VLAPIC_CTR2(vlapic, "TMR for vector %u %sasserted", vector, + active ? "" : "de"); + if (active) { + vlapic->tmr_vec_assert[idx] |= mask; + vlapic->tmr_vec_deassert[idx] &= ~mask; + } else { + vlapic->tmr_vec_deassert[idx] |= mask; + vlapic->tmr_vec_assert[idx] &= ~mask; + } /* - * A level trigger is valid only for fixed and lowprio delivery modes. + * Track the number of TMR changes between calls to vlapic_tmr_update. + * While a simple boolean would suffice, this count may be useful when + * tracing or debugging, and is cheap to calculate. */ - if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { - VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " - "delivery-mode %d", delmode); - return; - } - - lowprio = (delmode == APIC_DELMODE_LOWPRIO); - vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); - - if (!CPU_ISSET(vlapic->vcpuid, &dmask)) - return; + vlapic->tmr_pending = MIN(UINT32_MAX - 1, vlapic->tmr_pending) + 1; +} - VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); - vlapic_set_tmr(vlapic, vector, true); +#ifndef __FreeBSD__ +void +vlapic_localize_resources(struct vlapic *vlapic) +{ + vmm_glue_callout_localize(&vlapic->callout); } +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.h b/usr/src/uts/i86pc/io/vmm/io/vlapic.h index 3fa705d818..e1a52551a9 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.h +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vlapic.h 262281 2014-02-21 06:03:54Z neel $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _VLAPIC_H_ @@ -69,7 +75,6 @@ int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); */ void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); -void vlapic_set_error(struct vlapic *vlapic, uint32_t mask); void vlapic_fire_cmci(struct vlapic *vlapic); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); @@ -81,16 +86,11 @@ bool vlapic_enabled(struct vlapic *vlapic); void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec); -/* Reset the trigger-mode bits for all vectors to be edge-triggered */ -void vlapic_reset_tmr(struct vlapic *vlapic); +void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, + bool lowprio, bool x2apic_dest); -/* - * Set the trigger-mode bit associated with 'vector' to level-triggered if - * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to - * this 'vlapic'. - */ -void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, - int delmode, int vector); +void vlapic_tmr_update(struct vlapic *vlapic); +void vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active); void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); uint64_t vlapic_get_cr8(struct vlapic *vlapic); @@ -106,4 +106,9 @@ void vlapic_icrtmr_write_handler(struct vlapic *vlapic); void vlapic_dcr_write_handler(struct vlapic *vlapic); void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val); + +#ifndef __FreeBSD__ +void vlapic_localize_resources(struct vlapic *vlapic); +#endif + #endif /* _VLAPIC_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h index f9bd2e0e8b..5795d48d52 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Neel Natu * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vlapic_priv.h 263211 2014-03-15 23:09:34Z tychon $ + * $FreeBSD$ */ #ifndef _VLAPIC_PRIV_H_ @@ -136,6 +138,8 @@ enum boot_state { #define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI +#define VLAPIC_TMR_CNT 8 + struct vlapic; struct vlapic_ops { @@ -143,7 +147,7 @@ struct vlapic_ops { int (*pending_intr)(struct vlapic *vlapic, int *vecptr); void (*intr_accepted)(struct vlapic *vlapic, int vector); void (*post_intr)(struct vlapic *vlapic, int hostcpu); - void (*set_tmr)(struct vlapic *vlapic, int vector, bool level); + void (*set_tmr)(struct vlapic *vlapic, const uint32_t *result); void (*enable_x2apic_mode)(struct vlapic *vlapic); }; @@ -154,7 +158,7 @@ struct vlapic { struct vlapic_ops ops; uint32_t esr_pending; - int esr_firing; + uint32_t tmr_pending; struct callout callout; /* vlapic timer */ struct bintime timer_fire_bt; /* callout expiry time */ @@ -182,6 +186,19 @@ struct vlapic { */ uint32_t svr_last; uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; + + /* + * Store intended modifications to the trigger-mode register state. + * Along with the tmr_pending counter above, these are protected by the + * vIOAPIC lock and can only be modified under specific conditions: + * + * 1. When holding the vIOAPIC lock, and the vCPU to which the vLAPIC + * belongs is prevented from entering the VCPU_RUNNING state. + * 2. 
When the owning vCPU is in the VCPU_RUNNING state, and is + * applying the TMR modifications prior to interrupt injection. + */ + uint32_t tmr_vec_deassert[VLAPIC_TMR_CNT]; + uint32_t tmr_vec_assert[VLAPIC_TMR_CNT]; }; void vlapic_init(struct vlapic *vlapic); diff --git a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c new file mode 100644 index 0000000000..4df909777d --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c @@ -0,0 +1,105 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +#include + +#include "vpmtmr.h" + +/* + * The ACPI Power Management timer is a free-running 24- or 32-bit + * timer with a frequency of 3.579545MHz + * + * This implementation will be 32-bits + */ + +#define PMTMR_FREQ 3579545 /* 3.579545MHz */ + +struct vpmtmr { + sbintime_t freq_sbt; + sbintime_t baseuptime; + uint32_t baseval; +}; + +static MALLOC_DEFINE(M_VPMTMR, "vpmtmr", "bhyve virtual acpi timer"); + +struct vpmtmr * +vpmtmr_init(struct vm *vm) +{ + struct vpmtmr *vpmtmr; + struct bintime bt; + + vpmtmr = malloc(sizeof(struct vpmtmr), M_VPMTMR, M_WAITOK | M_ZERO); + vpmtmr->baseuptime = sbinuptime(); + vpmtmr->baseval = 0; + + FREQ2BT(PMTMR_FREQ, &bt); + vpmtmr->freq_sbt = bttosbt(bt); + + return (vpmtmr); +} + +void +vpmtmr_cleanup(struct vpmtmr *vpmtmr) +{ + + free(vpmtmr, M_VPMTMR); +} + +int +vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vpmtmr *vpmtmr; + sbintime_t now, delta; + + if (!in || bytes != 4) + return (-1); + + vpmtmr = vm_pmtmr(vm); + + /* + * No locking needed because 'baseuptime' and 'baseval' are + * written only during initialization. 
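+ *
+ * The division below converts elapsed sbintime into PM timer ticks:
+ * 'freq_sbt' is the duration of one 3.579545MHz tick, so after one
+ * second of uptime delta / freq_sbt is roughly 3579545 and the
+ * 32-bit result wraps naturally modulo 2^32.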
+ */ + now = sbinuptime(); + delta = now - vpmtmr->baseuptime; + KASSERT(delta >= 0, ("vpmtmr_handler: uptime went backwards: " + "%#lx to %#lx", vpmtmr->baseuptime, now)); + *val = vpmtmr->baseval + delta / vpmtmr->freq_sbt; + + return (0); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h new file mode 100644 index 0000000000..e6562da5c0 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h @@ -0,0 +1,44 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VPMTMR_H_ +#define _VPMTMR_H_ + +#define IO_PMTMR 0x408 + +struct vpmtmr; + +struct vpmtmr *vpmtmr_init(struct vm *vm); +void vpmtmr_cleanup(struct vpmtmr *pmtmr); + +int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/vrtc.c b/usr/src/uts/i86pc/io/vmm/io/vrtc.c new file mode 100644 index 0000000000..f12d22fc26 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vrtc.c @@ -0,0 +1,1061 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include "vmm_ktr.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vrtc.h" + +/* Register layout of the RTC */ +struct rtcdev { + uint8_t sec; + uint8_t alarm_sec; + uint8_t min; + uint8_t alarm_min; + uint8_t hour; + uint8_t alarm_hour; + uint8_t day_of_week; + uint8_t day_of_month; + uint8_t month; + uint8_t year; + uint8_t reg_a; + uint8_t reg_b; + uint8_t reg_c; + uint8_t reg_d; + uint8_t nvram[36]; + uint8_t century; + uint8_t nvram2[128 - 51]; +} __packed; +CTASSERT(sizeof(struct rtcdev) == 128); +CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); + +struct vrtc { + struct vm *vm; + struct mtx mtx; + struct callout callout; + u_int addr; /* RTC register to read or write */ + sbintime_t base_uptime; + time_t base_rtctime; + struct rtcdev rtcdev; +}; + +#define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx)) +#define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx)) +#define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx)) + +/* + * RTC time is considered "broken" if: + * - RTC updates are halted by the guest + * - RTC date/time fields have invalid values + */ +#define VRTC_BROKEN_TIME ((time_t)-1) + +#define RTC_IRQ 8 +#define RTCSB_BIN 0x04 +#define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) +#define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) +#define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) +#define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) +#define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) + +static void vrtc_callout_handler(void *arg); +static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); + +static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc"); + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW, NULL, NULL); + +static int rtc_flag_broken_time = 1; +SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN, + &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected"); + +static __inline bool +divider_enabled(int reg_a) +{ + /* + * The RTC is counting only when dividers are not held in reset. 
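+ *
+ * Register A bits 6:4 select the divider chain: 010b (0x20) is the
+ * normal 32.768kHz time base and 11xb holds the dividers in reset;
+ * this implementation treats every selection other than 010b as
+ * "not counting".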
+ */ + return ((reg_a & 0x70) == 0x20); +} + +static __inline bool +update_enabled(struct vrtc *vrtc) +{ + /* + * RTC date/time can be updated only if: + * - divider is not held in reset + * - guest has not disabled updates + * - the date/time fields have valid contents + */ + if (!divider_enabled(vrtc->rtcdev.reg_a)) + return (false); + + if (rtc_halted(vrtc)) + return (false); + + if (vrtc->base_rtctime == VRTC_BROKEN_TIME) + return (false); + + return (true); +} + +static time_t +vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime) +{ + sbintime_t now, delta; + time_t t, secs; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + t = vrtc->base_rtctime; + *basetime = vrtc->base_uptime; + if (update_enabled(vrtc)) { + now = sbinuptime(); + delta = now - vrtc->base_uptime; + KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " + "%#lx to %#lx", vrtc->base_uptime, now)); + secs = delta / SBT_1S; + t += secs; + *basetime += secs * SBT_1S; + } + return (t); +} + +static __inline uint8_t +rtcset(struct rtcdev *rtc, int val) +{ + + KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", + __func__, val)); + + return ((rtc->reg_b & RTCSB_BIN) ? val : bin2bcd_data[val]); +} + +static void +secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; + int hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (rtctime < 0) { + KASSERT(rtctime == VRTC_BROKEN_TIME, + ("%s: invalid vrtc time %#lx", __func__, rtctime)); + return; + } + + /* + * If the RTC is halted then the guest has "ownership" of the + * date/time fields. Don't update the RTC date/time fields in + * this case (unless forced). + */ + if (rtc_halted(vrtc) && !force_update) + return; + + ts.tv_sec = rtctime; + ts.tv_nsec = 0; + clock_ts_to_ct(&ts, &ct); + + KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", + ct.sec)); + KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", + ct.min)); + KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", + ct.hour)); + KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", + ct.dow)); + KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", + ct.day)); + KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", + ct.mon)); + KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d", + ct.year)); + + rtc = &vrtc->rtcdev; + rtc->sec = rtcset(rtc, ct.sec); + rtc->min = rtcset(rtc, ct.min); + + if (rtc->reg_b & RTCSB_24HR) { + hour = ct.hour; + } else { + /* + * Convert to the 12-hour format. 
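+		 *
+		 * The complete mapping implemented by this switch plus
+		 * the PM bit set below:
+		 *
+		 *	ct.hour  0     -> 12 AM      ct.hour 12    -> 12 PM
+		 *	ct.hour  1-11  ->  1-11 AM   ct.hour 13-23 ->  1-11 PM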
+ */ + switch (ct.hour) { + case 0: /* 12 AM */ + case 12: /* 12 PM */ + hour = 12; + break; + default: + /* + * The remaining 'ct.hour' values are interpreted as: + * [1 - 11] -> 1 - 11 AM + * [13 - 23] -> 1 - 11 PM + */ + hour = ct.hour % 12; + break; + } + } + + rtc->hour = rtcset(rtc, hour); + + if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) + rtc->hour |= 0x80; /* set MSB to indicate PM */ + + rtc->day_of_week = rtcset(rtc, ct.dow + 1); + rtc->day_of_month = rtcset(rtc, ct.day); + rtc->month = rtcset(rtc, ct.mon); + rtc->year = rtcset(rtc, ct.year % 100); + rtc->century = rtcset(rtc, ct.year / 100); +} + +static int +rtcget(struct rtcdev *rtc, int val, int *retval) +{ + uint8_t upper, lower; + + if (rtc->reg_b & RTCSB_BIN) { + *retval = val; + return (0); + } + + lower = val & 0xf; + upper = (val >> 4) & 0xf; + + if (lower > 9 || upper > 9) + return (-1); + + *retval = upper * 10 + lower; + return (0); +} + +static time_t +rtc_to_secs(struct vrtc *vrtc) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; +#ifdef __FreeBSD__ + struct vm *vm; +#endif + int century, error, hour, pm, year; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + +#ifdef __FreeBSD__ + vm = vrtc->vm; +#endif + rtc = &vrtc->rtcdev; + + bzero(&ct, sizeof(struct clocktime)); + + error = rtcget(rtc, rtc->sec, &ct.sec); + if (error || ct.sec < 0 || ct.sec > 59) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->min, &ct.min); + if (error || ct.min < 0 || ct.min > 59) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min); +#endif + goto fail; + } + + pm = 0; + hour = rtc->hour; + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (hour & 0x80) { + hour &= ~0x80; + pm = 1; + } + } + error = rtcget(rtc, hour, &ct.hour); + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (ct.hour >= 1 && ct.hour <= 12) { + /* + * Convert from 12-hour format to internal 24-hour + * representation as follows: + * + * 12-hour format ct.hour + * 12 AM 0 + * 1 - 11 AM 1 - 11 + * 12 PM 12 + * 1 - 11 PM 13 - 23 + */ + if (ct.hour == 12) + ct.hour = 0; + if (pm) + ct.hour += 12; + } else { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d", + rtc->hour, ct.hour); +#endif + goto fail; + } + } + + if (error || ct.hour < 0 || ct.hour > 23) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour); +#endif + goto fail; + } + + /* + * Ignore 'rtc->dow' because some guests like Linux don't bother + * setting it at all while others like OpenBSD/i386 set it incorrectly. + * + * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it. 
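+ *
+ * As with the other fields, the day/month/year bytes below go
+ * through rtcget(): in BCD mode (RTCSB_BIN clear) a raw 0x31
+ * decodes to 31, while any nibble above 9 (e.g. 0x3a) is rejected
+ * and takes the 'fail' path.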
+ */ + ct.dow = -1; + + error = rtcget(rtc, rtc->day_of_month, &ct.day); + if (error || ct.day < 1 || ct.day > 31) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month, + ct.day); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->month, &ct.mon); + if (error || ct.mon < 1 || ct.mon > 12) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->year, &year); + if (error || year < 0 || year > 99) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->century, ¢ury); + ct.year = century * 100 + year; + if (error || ct.year < POSIX_BASE_YEAR) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, + ct.year); +#endif + goto fail; + } + + error = clock_ct_to_ts(&ct, &ts); + if (error || ts.tv_sec < 0) { +#ifdef __FreeBSD__ + VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d", + ct.year, ct.mon, ct.day); + VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d", + ct.hour, ct.min, ct.sec); +#endif + goto fail; + } + return (ts.tv_sec); /* success */ +fail: + /* + * Stop updating the RTC if the date/time fields programmed by + * the guest are invalid. + */ +#ifdef __FreeBSD__ + VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); +#endif + return (VRTC_BROKEN_TIME); +} + +static int +vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase) +{ + struct rtcdev *rtc; +#ifdef __FreeBSD__ + sbintime_t oldbase; +#endif + time_t oldtime; + uint8_t alarm_sec, alarm_min, alarm_hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + alarm_sec = rtc->alarm_sec; + alarm_min = rtc->alarm_min; + alarm_hour = rtc->alarm_hour; + + oldtime = vrtc->base_rtctime; + VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx", + oldtime, newtime); + +#ifdef __FreeBSD__ + oldbase = vrtc->base_uptime; + VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx", + oldbase, newbase); +#endif + vrtc->base_uptime = newbase; + + if (newtime == oldtime) + return (0); + + /* + * If 'newtime' indicates that RTC updates are disabled then just + * record that and return. There is no need to do alarm interrupt + * processing in this case. + */ + if (newtime == VRTC_BROKEN_TIME) { + vrtc->base_rtctime = VRTC_BROKEN_TIME; + return (0); + } + + /* + * Return an error if RTC updates are halted by the guest. + */ + if (rtc_halted(vrtc)) { + VM_CTR0(vrtc->vm, "RTC update halted by guest"); + return (EBUSY); + } + + do { + /* + * If the alarm interrupt is enabled and 'oldtime' is valid + * then visit all the seconds between 'oldtime' and 'newtime' + * to check for the alarm condition. + * + * Otherwise move the RTC time forward directly to 'newtime'. + */ + if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) + vrtc->base_rtctime++; + else + vrtc->base_rtctime = newtime; + + if (aintr_enabled(vrtc)) { + /* + * Update the RTC date/time fields before checking + * if the alarm conditions are satisfied. 
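+		 *
+		 * Alarm bytes with the two high-order bits set (0xC0
+		 * and up) act as "don't care" wildcards, which is what
+		 * the '>= 0xC0' comparisons below implement; e.g. an
+		 * alarm_sec of 0xFF matches on every second.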
+ */ + secs_to_rtc(vrtc->base_rtctime, vrtc, 0); + + if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && + (alarm_min >= 0xC0 || alarm_min == rtc->min) && + (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); + } + } + } while (vrtc->base_rtctime != newtime); + + if (uintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); + + return (0); +} + +static sbintime_t +vrtc_freq(struct vrtc *vrtc) +{ + int ratesel; + + static sbintime_t pf[16] = { + 0, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 8192, + SBT_1S / 4096, + SBT_1S / 2048, + SBT_1S / 1024, + SBT_1S / 512, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 64, + SBT_1S / 32, + SBT_1S / 16, + SBT_1S / 8, + SBT_1S / 4, + SBT_1S / 2, + }; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + /* + * If both periodic and alarm interrupts are enabled then use the + * periodic frequency to drive the callout. The minimum periodic + * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so + * piggyback the alarm on top of it. The same argument applies to + * the update interrupt. + */ + if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { + ratesel = vrtc->rtcdev.reg_a & 0xf; + return (pf[ratesel]); + } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else { + return (0); + } +} + +static void +vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt) +{ + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (freqsbt == 0) { + if (callout_active(&vrtc->callout)) { + VM_CTR0(vrtc->vm, "RTC callout stopped"); + callout_stop(&vrtc->callout); + } + return; + } + VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt); + callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler, + vrtc, 0); +} + +static void +vrtc_callout_handler(void *arg) +{ + struct vrtc *vrtc = arg; + sbintime_t freqsbt, basetime; + time_t rtctime; + int error; + + VM_CTR0(vrtc->vm, "vrtc callout fired"); + + VRTC_LOCK(vrtc); + if (callout_pending(&vrtc->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vrtc->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vrtc->callout); + + KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, + ("gratuitous vrtc callout")); + + if (pintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); + + if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { + rtctime = vrtc_curtime(vrtc, &basetime); + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("%s: vrtc_time_update error %d", + __func__, error)); + } + + freqsbt = vrtc_freq(vrtc); + KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__)); + vrtc_callout_reset(vrtc, freqsbt); +done: + VRTC_UNLOCK(vrtc); +} + +static __inline void +vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq) +{ + int active; + + active = callout_active(&vrtc->callout) ? 1 : 0; + KASSERT((freq == 0 && !active) || (freq != 0 && active), + ("vrtc callout %s with frequency %#lx", + active ? 
"active" : "inactive", freq)); +} + +static void +vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + int oldirqf, newirqf; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; + + oldirqf = rtc->reg_c & RTCIR_INT; + if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || + (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || + (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { + newirqf = RTCIR_INT; + } else { + newirqf = 0; + } + + oldval = rtc->reg_c; + rtc->reg_c = newirqf | newval; + changed = oldval ^ rtc->reg_c; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x", + oldval, rtc->reg_c); + } + + if (!oldirqf && newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); + vatpic_pulse_irq(vrtc->vm, RTC_IRQ); + vioapic_pulse_irq(vrtc->vm, RTC_IRQ); + } else if (oldirqf && !newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); + } +} + +static int +vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + sbintime_t oldfreq, newfreq, basetime; + time_t curtime, rtctime; + int error; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + oldval = rtc->reg_b; + oldfreq = vrtc_freq(vrtc); + + rtc->reg_b = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x", + oldval, newval); + } + + if (changed & RTCSB_HALT) { + if ((newval & RTCSB_HALT) == 0) { + rtctime = rtc_to_secs(vrtc); + basetime = sbinuptime(); + if (rtctime == VRTC_BROKEN_TIME) { + if (rtc_flag_broken_time) + return (-1); + } + } else { + curtime = vrtc_curtime(vrtc, &basetime); + KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " + "between vrtc basetime (%#lx) and curtime (%#lx)", + __func__, vrtc->base_rtctime, curtime)); + + /* + * Force a refresh of the RTC date/time fields so + * they reflect the time right before the guest set + * the HALT bit. + */ + secs_to_rtc(curtime, vrtc, 1); + + /* + * Updates are halted so mark 'base_rtctime' to denote + * that the RTC date/time is in flux. + */ + rtctime = VRTC_BROKEN_TIME; + rtc->reg_b &= ~RTCSB_UINTR; + } + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("vrtc_time_update error %d", error)); + } + + /* + * Side effect of changes to the interrupt enable bits. + */ + if (changed & RTCSB_ALL_INTRS) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); + + /* + * Change the callout frequency if it has changed. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); + + /* + * The side effect of bits that control the RTC date/time format + * is handled lazily when those fields are actually read. + */ + return (0); +} + +static void +vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) +{ + sbintime_t oldfreq, newfreq; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + newval &= ~RTCSA_TUP; + oldval = vrtc->rtcdev.reg_a; + oldfreq = vrtc_freq(vrtc); + + if (divider_enabled(oldval) && !divider_enabled(newval)) { + VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else if (!divider_enabled(oldval) && divider_enabled(newval)) { + /* + * If the dividers are coming out of reset then update + * 'base_uptime' before this happens. 
This is done to + * maintain the illusion that the RTC date/time was frozen + * while the dividers were disabled. + */ + vrtc->base_uptime = sbinuptime(); + VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else { + /* NOTHING */ + } + + vrtc->rtcdev.reg_a = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x", + oldval, newval); + } + + /* + * Side effect of changes to rate select and divider enable bits. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); +} + +int +vrtc_set_time(struct vm *vm, time_t secs) +{ + struct vrtc *vrtc; + int error; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + error = vrtc_time_update(vrtc, secs, sbinuptime()); + VRTC_UNLOCK(vrtc); + + if (error) { + VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error, + secs); + } else { + VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs); + } + + return (error); +} + +time_t +vrtc_get_time(struct vm *vm) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t t; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + t = vrtc_curtime(vrtc, &basetime); + VRTC_UNLOCK(vrtc); + + return (t); +} + +int +vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) +{ + struct vrtc *vrtc; + uint8_t *ptr; + + vrtc = vm_rtc(vm); + + /* + * Don't allow writes to RTC control registers or the date/time fields. + */ + if (offset < offsetof(struct rtcdev, nvram[0]) || + offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { + VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", + offset); + return (EINVAL); + } + + VRTC_LOCK(vrtc); + ptr = (uint8_t *)(&vrtc->rtcdev); + ptr[offset] = value; + VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset); + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t curtime; + uint8_t *ptr; + + /* + * Allow all offsets in the RTC to be read. + */ + if (offset < 0 || offset >= sizeof(struct rtcdev)) + return (EINVAL); + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + + /* + * Update RTC date/time fields if necessary. + */ + if (offset < 10 || offset == RTC_CENTURY) { + curtime = vrtc_curtime(vrtc, &basetime); + secs_to_rtc(curtime, vrtc, 0); + } + + ptr = (uint8_t *)(&vrtc->rtcdev); + *retval = ptr[offset]; + + VRTC_UNLOCK(vrtc); + return (0); +} + +int +vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vrtc *vrtc; + + vrtc = vm_rtc(vm); + + if (bytes != 1) + return (-1); + + if (in) { + *val = 0xff; + return (0); + } + + VRTC_LOCK(vrtc); + vrtc->addr = *val & 0x7f; + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + sbintime_t basetime; + time_t curtime; + int error, offset; + + vrtc = vm_rtc(vm); + rtc = &vrtc->rtcdev; + + if (bytes != 1) + return (-1); + + VRTC_LOCK(vrtc); + offset = vrtc->addr; + if (offset >= sizeof(struct rtcdev)) { + VRTC_UNLOCK(vrtc); + return (-1); + } + + error = 0; + curtime = vrtc_curtime(vrtc, &basetime); + vrtc_time_update(vrtc, curtime, basetime); + + /* + * Update RTC date/time fields if necessary. + * + * This is not just for reads of the RTC. The side-effect of writing + * the century byte requires other RTC date/time fields (e.g. sec) + * to be updated here. 
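+ *
+ * For instance, a guest that writes only the century byte still
+ * expects the rtc_to_secs() call further down to see consistent
+ * sec/min/hour values, hence the refresh happens on the write path
+ * as well.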
+ */ + if (offset < 10 || offset == RTC_CENTURY) + secs_to_rtc(curtime, vrtc, 0); + + if (in) { + if (offset == 12) { + /* + * XXX + * reg_c interrupt flags are updated only if the + * corresponding interrupt enable bit in reg_b is set. + */ + *val = vrtc->rtcdev.reg_c; + vrtc_set_reg_c(vrtc, 0); + } else { + *val = *((uint8_t *)rtc + offset); + } + VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x", + *val, offset); + } else { + switch (offset) { + case 10: + VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val); + vrtc_set_reg_a(vrtc, *val); + break; + case 11: + VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val); + error = vrtc_set_reg_b(vrtc, *val); + break; + case 12: + VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)", + *val); + break; + case 13: + VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)", + *val); + break; + case 0: + /* + * High order bit of 'seconds' is readonly. + */ + *val &= 0x7f; + /* FALLTHRU */ + default: + VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x", + offset, *val); + *((uint8_t *)rtc + offset) = *val; + break; + } + + /* + * XXX some guests (e.g. OpenBSD) write the century byte + * outside of RTCSB_HALT so re-calculate the RTC date/time. + */ + if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { + curtime = rtc_to_secs(vrtc); + error = vrtc_time_update(vrtc, curtime, sbinuptime()); + KASSERT(!error, ("vrtc_time_update error %d", error)); + if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) + error = -1; + } + } + VRTC_UNLOCK(vrtc); + return (error); +} + +void +vrtc_reset(struct vrtc *vrtc) +{ + struct rtcdev *rtc; + + VRTC_LOCK(vrtc); + + rtc = &vrtc->rtcdev; + vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); + vrtc_set_reg_c(vrtc, 0); + KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); + + VRTC_UNLOCK(vrtc); +} + +struct vrtc * +vrtc_init(struct vm *vm) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + time_t curtime; + + vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO); + vrtc->vm = vm; + mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF); + callout_init(&vrtc->callout, 1); + + /* Allow dividers to keep time but disable everything else */ + rtc = &vrtc->rtcdev; + rtc->reg_a = 0x20; + rtc->reg_b = RTCSB_24HR; + rtc->reg_c = 0; + rtc->reg_d = RTCSD_PWR; + + /* Reset the index register to a safe value. */ + vrtc->addr = RTC_STATUSD; + + /* + * Initialize RTC time to 00:00:00 Jan 1, 1970. + */ + curtime = 0; + + VRTC_LOCK(vrtc); + vrtc->base_rtctime = VRTC_BROKEN_TIME; + vrtc_time_update(vrtc, curtime, sbinuptime()); + secs_to_rtc(curtime, vrtc, 0); + VRTC_UNLOCK(vrtc); + + return (vrtc); +} + +void +vrtc_cleanup(struct vrtc *vrtc) +{ + + callout_drain(&vrtc->callout); + free(vrtc, M_VRTC); +} + +#ifndef __FreeBSD__ +void +vrtc_localize_resources(struct vrtc *vrtc) +{ + vmm_glue_callout_localize(&vrtc->callout); +} +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vrtc.h b/usr/src/uts/i86pc/io/vmm/io/vrtc.h new file mode 100644 index 0000000000..13abbedeb9 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vrtc.h @@ -0,0 +1,60 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _VRTC_H_ +#define _VRTC_H_ + +#include + +struct vrtc; + +struct vrtc *vrtc_init(struct vm *vm); +void vrtc_cleanup(struct vrtc *vrtc); +void vrtc_reset(struct vrtc *vrtc); + +time_t vrtc_get_time(struct vm *vm); +int vrtc_set_time(struct vm *vm, time_t secs); +int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value); +int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval); + +int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); +int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); + +#ifndef __FreeBSD__ +void vrtc_localize_resources(struct vrtc *); +#endif + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/offsets.in b/usr/src/uts/i86pc/io/vmm/offsets.in deleted file mode 100644 index 4b1fe1d6b6..0000000000 --- a/usr/src/uts/i86pc/io/vmm/offsets.in +++ /dev/null @@ -1,72 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2014 Pluribus Networks Inc. 
- */ - -#include -#include -#include -#include - -#include - -#include -#include "intel/vmx_cpufunc.h" -#include "intel/vmx.h" - -vmxctx - tmpstktop VMXCTX_TMPSTKTOP - guest_rdi VMXCTX_GUEST_RDI - guest_rsi VMXCTX_GUEST_RSI - guest_rdx VMXCTX_GUEST_RDX - guest_rcx VMXCTX_GUEST_RCX - guest_r8 VMXCTX_GUEST_R8 - guest_r9 VMXCTX_GUEST_R9 - guest_rax VMXCTX_GUEST_RAX - guest_rbx VMXCTX_GUEST_RBX - guest_rbp VMXCTX_GUEST_RBP - guest_r10 VMXCTX_GUEST_R10 - guest_r11 VMXCTX_GUEST_R11 - guest_r12 VMXCTX_GUEST_R12 - guest_r13 VMXCTX_GUEST_R13 - guest_r14 VMXCTX_GUEST_R14 - guest_r15 VMXCTX_GUEST_R15 - guest_cr2 VMXCTX_GUEST_CR2 - host_r15 VMXCTX_HOST_R15 - host_r14 VMXCTX_HOST_R14 - host_r13 VMXCTX_HOST_R13 - host_r12 VMXCTX_HOST_R12 - host_rbp VMXCTX_HOST_RBP - host_rsp VMXCTX_HOST_RSP - host_rbx VMXCTX_HOST_RBX - host_rip VMXCTX_HOST_RIP - launch_error VMXCTX_LAUNCH_ERROR - -vmx VMX_SIZE - -\#define VM_SUCCESS 0 -\#define VM_FAIL_INVALID 1 -\#define VM_FAIL_VALID 2 - -\#define VMX_RETURN_DIRECT 0 -\#define VMX_RETURN_LONGJMP 1 -\#define VMX_RETURN_VMRESUME 2 -\#define VMX_RETURN_VMLAUNCH 3 -\#define VMX_RETURN_AST 4 - -cpu - cpu_thread - -_kthread - t_lwp - _tu._ts._t_astflag T_ASTFLAG diff --git a/usr/src/uts/i86pc/io/vmm/vm/pmap.h b/usr/src/uts/i86pc/io/vmm/vm/pmap.h new file mode 100644 index 0000000000..512fc4acee --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/pmap.h @@ -0,0 +1,27 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _PMAP_VM_ +#define _PMAP_VM_ + +#include +#include "vm_glue.h" + +void pmap_invalidate_cache(void); +void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); +int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); +long pmap_wired_count(pmap_t pmap); + +#endif /* _PMAP_VM_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_extern.h b/usr/src/uts/i86pc/io/vmm/vm/vm_extern.h new file mode 100644 index 0000000000..92a959960a --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_extern.h @@ -0,0 +1,35 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + +#ifndef _VM_EXTERN_H_ +#define _VM_EXTERN_H_ + +#include +#include + +struct vmspace; +struct pmap; + +typedef int (*pmap_pinit_t)(struct pmap *pmap); + +struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t); +void vmspace_free(struct vmspace *); + +int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); +int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, + vm_prot_t prot, vm_page_t *ma, int max_count); + + +#endif /* _VM_EXTERN_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h b/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h new file mode 100644 index 0000000000..600872c321 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h @@ -0,0 +1,99 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VM_GLUE_ +#define _VM_GLUE_ + +#include +#include +#include + +struct vmspace; +struct vm_map; +struct pmap; +struct vm_object; +struct vmm_pt_ops; + +struct vm_map { + struct vmspace *vmm_space; +}; + +struct pmap { + void *pm_pml4; + cpuset_t pm_active; + long pm_eptgen; + + /* Implementation private */ + enum pmap_type pm_type; + struct vmm_pt_ops *pm_ops; + void *pm_impl; +}; + +struct vmspace { + struct vm_map vm_map; + + /* Implementation private */ + kmutex_t vms_lock; + boolean_t vms_map_changing; + struct pmap vms_pmap; + uintptr_t vms_size; /* fixed after creation */ + + list_t vms_maplist; +}; + +typedef pfn_t (*vm_pager_fn_t)(vm_object_t, uintptr_t, pfn_t *, uint_t *); + +struct vm_object { + uint_t vmo_refcnt; /* manipulated with atomic ops */ + + /* This group of fields are fixed at creation time */ + objtype_t vmo_type; + size_t vmo_size; + vm_pager_fn_t vmo_pager; + void *vmo_data; + + kmutex_t vmo_lock; /* protects fields below */ + vm_memattr_t vmo_attr; +}; + +struct vm_page { + kmutex_t vmp_lock; + pfn_t vmp_pfn; + struct vm_object *vmp_obj_held; +}; + +/* Illumos-specific functions for setup and operation */ +int vm_segmap_obj(struct vmspace *, vm_object_t, struct as *, caddr_t *, + uint_t, uint_t, uint_t); +int vm_segmap_space(struct vmspace *, off_t, struct as *, caddr_t *, off_t, + uint_t, uint_t, uint_t); +void *vmspace_find_kva(struct vmspace *, uintptr_t, size_t); +void vmm_arena_init(void); +void vmm_arena_fini(void); + +struct vmm_pt_ops { + void * (*vpo_init)(uint64_t *); + void (*vpo_free)(void *); + uint64_t (*vpo_wired_cnt)(void *); + int (*vpo_is_wired)(void *, uint64_t, uint_t *); + int (*vpo_map)(void *, uint64_t, pfn_t, uint_t, uint_t, uint8_t); + uint64_t (*vpo_unmap)(void *, uint64_t, uint64_t); +}; + +extern struct vmm_pt_ops ept_ops; +extern struct vmm_pt_ops rvi_ops; + + +#endif /* _VM_GLUE_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_map.h b/usr/src/uts/i86pc/io/vmm/vm/vm_map.h new file mode 100644 index 0000000000..70826ac8f1 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_map.h @@ -0,0 +1,63 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VM_MAP_ +#define _VM_MAP_ + +#include "vm_glue.h" + +/* + * vm_map_wire and vm_map_unwire option flags + */ +#define VM_MAP_WIRE_SYSTEM 0 /* wiring in a kernel map */ +#define VM_MAP_WIRE_USER 1 /* wiring in a user map */ + +#define VM_MAP_WIRE_NOHOLES 0 /* region must not have holes */ +#define VM_MAP_WIRE_HOLESOK 2 /* region may have holes */ + +#define VM_MAP_WIRE_WRITE 4 /* Validate writable. */ + +/* + * The following "find_space" options are supported by vm_map_find(). + * + * For VMFS_ALIGNED_SPACE, the desired alignment is specified to + * the macro argument as log base 2 of the desired alignment. + */ +#define VMFS_NO_SPACE 0 /* don't find; use the given range */ +#define VMFS_ANY_SPACE 1 /* find range with any alignment */ +#define VMFS_OPTIMAL_SPACE 2 /* find range with optimal alignment */ +#define VMFS_SUPER_SPACE 3 /* find superpage-aligned range */ +#define VMFS_ALIGNED_SPACE(x) ((x) << 8) /* find range with fixed alignment */ + +/* + * vm_fault option flags + */ +#define VM_FAULT_NORMAL 0 /* Nothing special */ +#define VM_FAULT_WIRE 1 /* Wire the mapped page */ +#define VM_FAULT_DIRTY 2 /* Dirty the page; use w/VM_PROT_COPY */ + + + +pmap_t vmspace_pmap(struct vmspace *); + +int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, + vm_offset_t, int, vm_prot_t, vm_prot_t, int); +int vm_map_remove(vm_map_t, vm_offset_t, vm_offset_t); +int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); + +long vmspace_resident_count(struct vmspace *vmspace); + + +#endif /* _VM_MAP_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_object.h b/usr/src/uts/i86pc/io/vmm/vm/vm_object.h new file mode 100644 index 0000000000..1f16fa9b83 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_object.h @@ -0,0 +1,31 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VM_OBJECT_ +#define _VM_OBJECT_ + +#include "vm_glue.h" + +vm_object_t vm_object_allocate(objtype_t, vm_pindex_t); +void vm_object_deallocate(vm_object_t); +void vm_object_reference(vm_object_t); +int vm_object_set_memattr(vm_object_t, vm_memattr_t); +void vm_object_clear(vm_object_t); + + +#define VM_OBJECT_WLOCK(vmo) mutex_enter(&(vmo)->vmo_lock) +#define VM_OBJECT_WUNLOCK(vmo) mutex_exit(&(vmo)->vmo_lock) + +#endif /* _VM_OBJECT_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_page.h b/usr/src/uts/i86pc/io/vmm/vm/vm_page.h new file mode 100644 index 0000000000..4559fe6d4c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_page.h @@ -0,0 +1,28 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + + +#ifndef _VM_PAGE_ +#define _VM_PAGE_ + +#include "vm_glue.h" + +void vm_page_lock(vm_page_t); +void vm_page_unhold(vm_page_t); +void vm_page_unlock(vm_page_t); + +#define VM_PAGE_TO_PHYS(page) (mmu_ptob((uintptr_t)((page)->vmp_pfn))) + +#endif /* _VM_PAGE_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_pager.h b/usr/src/uts/i86pc/io/vmm/vm/vm_pager.h new file mode 100644 index 0000000000..11aa344f61 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_pager.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VM_PAGER_ +#define _VM_PAGER_ + +vm_object_t vm_pager_allocate(objtype_t, void *, vm_ooffset_t, vm_prot_t, + vm_ooffset_t, void *); + + +#endif /* _VM_PAGER_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index 7081368f4a..6df094b50e 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,10 +38,11 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include @@ -51,16 +54,26 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon #include #include #include +#include #include #include -#include #include #include - -#include +#include +#include +#include +#include +#include +#include + +#ifdef __FreeBSD__ +#include +#endif #include #include +#include +#include #include #include @@ -77,83 +90,132 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" -#include "vmm_ipi.h" +#include "vpmtmr.h" +#include "vrtc.h" #include "vmm_stat.h" #include "vmm_lapic.h" -#ifdef __FreeBSD__ #include "io/ppt.h" #include "io/iommu.h" -#endif -struct vhpet; -struct vioapic; struct vlapic; +/* + * Initialization: + * (a) allocated when vcpu is created + * (i) initialized when vcpu is created and when it is reinitialized + * (o) initialized the first time the vcpu is created + * (x) initialized before use + */ struct vcpu { - int flags; - enum vcpu_state state; - struct mtx mtx; - int hostcpu; /* host cpuid this vcpu last ran on */ - struct vlapic *vlapic; - int vcpuid; - struct savefpu *guestfpu; /* guest fpu state */ - void *stats; - struct vm_exit exitinfo; + struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ + enum vcpu_state state; /* (o) vcpu state */ +#ifndef __FreeBSD__ + kcondvar_t vcpu_cv; /* (o) cpu waiter cv */ + kcondvar_t state_cv; /* (o) IDLE-transition cv */ +#endif /* __FreeBSD__ */ + int hostcpu; /* (o) vcpu's current host cpu */ +#ifndef __FreeBSD__ + int lastloccpu; /* (o) last host cpu localized to */ +#endif + u_int runblock; /* (i) block vcpu from run state */ + int reqidle; /* (i) request vcpu to idle */ + struct vlapic *vlapic; /* (i) APIC device model */ + enum x2apic_state x2apic_state; /* (i) APIC mode */ + uint64_t exitintinfo; /* (i) events pending at VM exit */ + int nmi_pending; /* (i) NMI pending */ + int extint_pending; /* (i) INTR pending */ + int exception_pending; /* (i) exception pending */ + int exc_vector; /* (x) exception collateral */ + int exc_errcode_valid; + uint32_t exc_errcode; + struct savefpu *guestfpu; /* (a,i) guest fpu state */ + uint64_t guest_xcr0; /* (i) guest %xcr0 register */ + void *stats; /* (a,i) statistics */ + struct vm_exit exitinfo; /* (x) exit reason and collateral */ uint64_t nextrip; /* (x) next instruction to execute */ - enum x2apic_state x2apic_state; - uint64_t exitintinfo; - int nmi_pending; - int extint_pending; - struct vm_exception exception; - int exception_pending; +#ifndef __FreeBSD__ + uint64_t tsc_offset; /* (x) offset from host TSC */ +#endif }; +#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) -#define VM_MAX_MEMORY_SEGMENTS 8 - -struct vm { - void *cookie; /* processor-specific data */ - void *iommu; /* iommu-specific data */ - struct vcpu vcpu[VM_MAXCPU]; - struct vhpet *vhpet; - struct vioapic *vioapic; /* virtual ioapic */ - struct vatpic *vatpic; /* virtual atpic */ - struct vatpit *vatpit; /* virtual atpit */ - int num_mem_segs; - struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; - char name[VM_MAX_NAMELEN]; +struct mem_seg { + size_t len; + bool sysmem; + struct 
vm_object *object; +}; +#ifdef __FreeBSD__ +#define VM_MAX_MEMSEGS 3 +#else +#define VM_MAX_MEMSEGS 4 +#endif - /* - * Set of active vcpus. - * An active vcpu is one that has been started implicitly (BSP) or - * explicitly (AP) by sending it a startup ipi. - */ - cpuset_t active_cpus; +struct mem_map { + vm_paddr_t gpa; + size_t len; + vm_ooffset_t segoff; + int segid; + int prot; + int flags; +}; +#define VM_MAX_MEMMAPS 4 - vm_rendezvous_func_t rendezvous_func; +/* + * Initialization: + * (o) initialized the first time the VM is created + * (i) initialized when VM is created and when it is reinitialized + * (x) initialized before use + */ +struct vm { + void *cookie; /* (i) cpu-specific data */ + void *iommu; /* (x) iommu-specific data */ + struct vhpet *vhpet; /* (i) virtual HPET */ + struct vioapic *vioapic; /* (i) virtual ioapic */ + struct vatpic *vatpic; /* (i) virtual atpic */ + struct vatpit *vatpit; /* (i) virtual atpit */ + struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ + struct vrtc *vrtc; /* (o) virtual RTC */ + volatile cpuset_t active_cpus; /* (i) active vcpus */ + volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ + int suspend; /* (i) stop VM execution */ + volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ + volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ + struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ + struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ + struct vmspace *vmspace; /* (o) guest's address space */ + char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ + struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ + /* The following describe the vm cpu topology */ + uint16_t sockets; /* (o) num of sockets */ + uint16_t cores; /* (o) num of cores/socket */ + uint16_t threads; /* (o) num of threads/core */ + uint16_t maxcpus; /* (o) max pluggable cpus */ +#ifndef __FreeBSD__ + list_t ioport_hooks; +#endif /* __FreeBSD__ */ }; static int vmm_initialized; static struct vmm_ops *ops; -#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) +#define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) +#define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) -#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) -#define VMRUN(vmi, vcpu, rip) \ - (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO) +#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) +#define VMRUN(vmi, vcpu, rip, pmap, evinfo) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) -#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \ - (ops != NULL ? \ - (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \ - ENXIO) -#define VMMMAP_GET(vmi, gpa) \ - (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO) +#define VMSPACE_ALLOC(min, max) \ + (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) +#define VMSPACE_FREE(vmspace) \ + (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? 
(*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ @@ -174,45 +236,134 @@ static struct vmm_ops *ops; #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() +SDT_PROVIDER_DEFINE(vmm); + static MALLOC_DEFINE(M_VM, "vm", "vm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +/* + * Halt the guest if all vcpus are executing a HLT instruction with + * interrupts disabled. + */ +static int halt_detection_enabled = 1; +SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, + &halt_detection_enabled, 0, + "Halt VM if all vcpus execute HLT with interrupts disabled"); + static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); +static int trace_guest_exceptions; +SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, + &trace_guest_exceptions, 0, + "Trap into hypervisor on all guest exceptions and reflect them back"); + +static void vm_free_memmap(struct vm *vm, int ident); +static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); +static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); + +#ifndef __FreeBSD__ +static void vm_clear_memseg(struct vm *, int); + +typedef struct vm_ioport_hook { + list_node_t vmih_node; + uint_t vmih_ioport; + void *vmih_arg; + vmm_rmem_cb_t vmih_rmem_cb; + vmm_wmem_cb_t vmih_wmem_cb; +} vm_ioport_hook_t; + +/* Flags for vtc_status */ +#define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */ +#define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */ + +typedef struct vm_thread_ctx { + struct vm *vtc_vm; + int vtc_vcpuid; + uint_t vtc_status; +} vm_thread_ctx_t; +#endif /* __FreeBSD__ */ + +#ifdef KTR +static const char * +vcpu_state2str(enum vcpu_state state) +{ + + switch (state) { + case VCPU_IDLE: + return ("idle"); + case VCPU_FROZEN: + return ("frozen"); + case VCPU_RUNNING: + return ("running"); + case VCPU_SLEEPING: + return ("sleeping"); + default: + return ("unknown"); + } +} +#endif + static void -vcpu_cleanup(struct vm *vm, int i) +vcpu_cleanup(struct vm *vm, int i, bool destroy) { struct vcpu *vcpu = &vm->vcpu[i]; VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); -#ifdef __FreeBSD__ - vmm_stat_free(vcpu->stats); -#endif - fpu_save_area_free(vcpu->guestfpu); + if (destroy) { + vmm_stat_free(vcpu->stats); + fpu_save_area_free(vcpu->guestfpu); + } } static void -vcpu_init(struct vm *vm, uint32_t vcpu_id) +vcpu_init(struct vm *vm, int vcpu_id, bool create) { struct vcpu *vcpu; - + + KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, + ("vcpu_init: invalid vcpu %d", vcpu_id)); + vcpu = &vm->vcpu[vcpu_id]; - vcpu_lock_init(vcpu); - vcpu->hostcpu = NOCPU; - vcpu->vcpuid = vcpu_id; + if (create) { +#ifdef __FreeBSD__ + KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " + "initialized", vcpu_id)); +#endif + vcpu_lock_init(vcpu); + vcpu->state = VCPU_IDLE; + vcpu->hostcpu = NOCPU; +#ifndef __FreeBSD__ + vcpu->lastloccpu = NOCPU; +#endif + vcpu->guestfpu = fpu_save_area_alloc(); + vcpu->stats = vmm_stat_alloc(); + } + vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); + vcpu->runblock = 0; + vcpu->reqidle = 0; vcpu->exitintinfo = 0; - vcpu->guestfpu = fpu_save_area_alloc(); + vcpu->nmi_pending = 0; + vcpu->extint_pending = 0; + vcpu->exception_pending = 0; + vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 
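+	/*
+	 * XFEATURE_ENABLED_X87 above (bit 0 of %xcr0) matches the
+	 * architectural reset state: only x87 state is enabled until the
+	 * guest turns on additional XSAVE components itself via xsetbv.
+	 */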
fpu_save_area_reset(vcpu->guestfpu); -#ifdef __FreeBSD__ - vcpu->stats = vmm_stat_alloc(); -#endif + vmm_stat_init(vcpu->stats); +} + +int +vcpu_trace_exceptions(struct vm *vm, int vcpuid) +{ + + return (trace_guest_exceptions); } struct vm_exit * @@ -220,7 +371,7 @@ vm_exitinfo(struct vm *vm, int cpuid) { struct vcpu *vcpu; - if (cpuid < 0 || cpuid >= VM_MAXCPU) + if (cpuid < 0 || cpuid >= vm->maxcpus) panic("vm_exitinfo: invalid cpuid %d", cpuid); vcpu = &vm->vcpu[cpuid]; @@ -228,24 +379,35 @@ vm_exitinfo(struct vm *vm, int cpuid) return (&vcpu->exitinfo); } +#ifdef __FreeBSD__ +static void +vmm_resume(void) +{ + VMM_RESUME(); +} +#endif + static int vmm_init(void) { int error; -#ifndef __FreeBSD__ - vmm_sol_glue_init(); -#endif - vmm_host_state_init(); -#ifdef __FreeBSD__ - vmm_ipi_init(); + +#ifdef __FreeBSD__ + vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); + if (vmm_ipinum < 0) + vmm_ipinum = IPI_AST; +#else + /* We use cpu_poke() for IPIs */ + vmm_ipinum = 0; #endif error = vmm_mem_init(); if (error) return (error); - + if (vmm_is_intel()) ops = &vmm_ops_intel; else if (vmm_is_amd()) @@ -253,10 +415,15 @@ vmm_init(void) else return (ENXIO); - return (VMM_INIT()); +#ifdef __FreeBSD__ + vmm_resume_p = vmm_resume; +#endif + + return (VMM_INIT(vmm_ipinum)); } -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ + static int vmm_handler(module_t mod, int what, void *arg) { @@ -265,8 +432,6 @@ vmm_handler(module_t mod, int what, void *arg) switch (what) { case MOD_LOAD: vmmdev_init(); - if (ppt_num_devices() > 0) - iommu_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; @@ -274,11 +439,12 @@ vmm_handler(module_t mod, int what, void *arg) case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0) { -#ifndef __FreeBSD__ - vmm_sol_glue_cleanup(); -#endif + vmm_resume_p = NULL; iommu_cleanup(); - vmm_ipi_cleanup(); +#ifdef __FreeBSD__ + if (vmm_ipinum != IPI_AST) + lapic_ipi_free(vmm_ipinum); +#endif error = VMM_CLEANUP(); /* * Something bad happened - prevent new @@ -304,23 +470,19 @@ static moduledata_t vmm_kmod = { /* * vmm initialization has the following dependencies: * - * - iommu initialization must happen after the pci passthru driver has had - * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE). - * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). 
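+ * Hence the DECLARE_MODULE() below registers vmm at SI_SUB_SMP + 1.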
*/ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); -SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); -#else +#else /* __FreeBSD__ */ + int vmm_mod_load() { int error; - vmmdev_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; @@ -333,9 +495,6 @@ vmm_mod_unload() { int error; - error = vmmdev_cleanup(); - if (error) - return (error); error = VMM_CLEANUP(); if (error) return (error); @@ -343,16 +502,63 @@ vmm_mod_unload() return (0); } + +#endif /* __FreeBSD__ */ + +static void +vm_init(struct vm *vm, bool create) +{ + int i; +#ifndef __FreeBSD__ + uint64_t tsc_off; #endif + vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); + vm->iommu = NULL; + vm->vioapic = vioapic_init(vm); + vm->vhpet = vhpet_init(vm); + vm->vatpic = vatpic_init(vm); + vm->vatpit = vatpit_init(vm); + vm->vpmtmr = vpmtmr_init(vm); + if (create) + vm->vrtc = vrtc_init(vm); +#ifndef __FreeBSD__ + if (create) { + list_create(&vm->ioport_hooks, sizeof (vm_ioport_hook_t), + offsetof (vm_ioport_hook_t, vmih_node)); + } else { + VERIFY(list_is_empty(&vm->ioport_hooks)); + } +#endif /* __FreeBSD__ */ + + CPU_ZERO(&vm->active_cpus); + CPU_ZERO(&vm->debug_cpus); + + vm->suspend = 0; + CPU_ZERO(&vm->suspended_cpus); + + for (i = 0; i < vm->maxcpus; i++) + vcpu_init(vm, i, create); + +#ifndef __FreeBSD__ + tsc_off = (uint64_t)(-(int64_t)rdtsc()); + for (i = 0; i < vm->maxcpus; i++) { + vm->vcpu[i].tsc_offset = tsc_off; + } +#endif /* __FreeBSD__ */ +} + +/* + * The default CPU topology is a single thread per package. + */ +u_int cores_per_package = 1; +u_int threads_per_core = 1; + int vm_create(const char *name, struct vm **retvm) { - int i; struct vm *vm; - vm_paddr_t maxaddr; - - const int BSP = 0; + struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt @@ -364,269 +570,587 @@ vm_create(const char *name, struct vm **retvm) if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); + vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS); + if (vmspace == NULL) + return (ENOMEM); + vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); - vm->cookie = VMINIT(vm); - - vm->vioapic = vioapic_init(vm); - vm->vhpet = vhpet_init(vm); - vm->vatpic = vatpic_init(vm); - vm->vatpit = vatpit_init(vm); + vm->vmspace = vmspace; - for (i = 0; i < VM_MAXCPU; i++) { - vcpu_init(vm, i); - } + vm->sockets = 1; + vm->cores = cores_per_package; /* XXX backwards compatibility */ + vm->threads = threads_per_core; /* XXX backwards compatibility */ + vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ -#ifdef __FreeBSD__ - maxaddr = vmm_mem_maxaddr(); - vm->iommu = iommu_create_domain(maxaddr); -#endif + vm_init(vm, true); *retvm = vm; return (0); } -static void -vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) +void +vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus) { - size_t len; - vm_paddr_t hpa; - void *host_domain; - -#ifdef __FreeBSD__ - host_domain = iommu_host_domain(); -#endif - - len = 0; - while (len < seg->len) { - hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE); - if (hpa == (vm_paddr_t)-1) { - panic("vm_free_mem_segs: cannot free hpa " - "associated with gpa 0x%016lx", seg->gpa + len); - } - -#ifdef __FreeBSD__ - /* - * Remove the 'gpa' to 'hpa' mapping in VMs domain. - * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'. 
- */ - iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE); - iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE); -#endif - - vmm_mem_free(hpa, PAGE_SIZE); - - len += PAGE_SIZE; - } + *sockets = vm->sockets; + *cores = vm->cores; + *threads = vm->threads; + *maxcpus = vm->maxcpus; +} -#ifdef __FreeBSD__ - /* - * Invalidate cached translations associated with 'vm->iommu' since - * we have now moved some pages from it. - */ - iommu_invalidate_tlb(vm->iommu); -#endif +uint16_t +vm_get_maxcpus(struct vm *vm) +{ + return (vm->maxcpus); +} - bzero(seg, sizeof(struct vm_memory_segment)); +int +vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus) +{ + if (maxcpus != 0) + return (EINVAL); /* XXX remove when supported */ + if ((sockets * cores * threads) > vm->maxcpus) + return (EINVAL); + /* XXX need to check sockets * cores * threads == vCPU, how? */ + vm->sockets = sockets; + vm->cores = cores; + vm->threads = threads; + vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ + return(0); } -void -vm_destroy(struct vm *vm) +static void +vm_cleanup(struct vm *vm, bool destroy) { + struct mem_map *mm; int i; -#ifdef __FreeBSD__ ppt_unassign_all(vm); -#endif - - for (i = 0; i < vm->num_mem_segs; i++) - vm_free_mem_seg(vm, &vm->mem_segs[i]); - vm->num_mem_segs = 0; - - for (i = 0; i < VM_MAXCPU; i++) - vcpu_cleanup(vm, i); + if (vm->iommu != NULL) + iommu_destroy_domain(vm->iommu); + if (destroy) + vrtc_cleanup(vm->vrtc); + else + vrtc_reset(vm->vrtc); + vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); -#ifdef __FreeBSD__ - iommu_destroy_domain(vm->iommu); -#endif + for (i = 0; i < vm->maxcpus; i++) + vcpu_cleanup(vm, i, destroy); VMCLEANUP(vm->cookie); + /* + * System memory is removed from the guest address space only when + * the VM is destroyed. This is because the mapping remains the same + * across VM reset. + * + * Device memory can be relocated by the guest (e.g. using PCI BARs) + * so those mappings are removed on a VM reset. + */ + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (destroy || !sysmem_mapping(vm, mm)) + vm_free_memmap(vm, i); +#ifndef __FreeBSD__ + else { + /* + * We need to reset the IOMMU flag so this mapping can + * be reused when a VM is rebooted. Since the IOMMU + * domain has already been destroyed we can just reset + * the flag here. + */ + mm->flags &= ~VM_MEMMAP_F_IOMMU; + } +#endif + } + + if (destroy) { + for (i = 0; i < VM_MAX_MEMSEGS; i++) + vm_free_memseg(vm, i); + + VMSPACE_FREE(vm->vmspace); + vm->vmspace = NULL; + } +#ifndef __FreeBSD__ + else { + /* + * Clear the first memory segment (low mem), old memory contents + * could confuse the UEFI firmware. + */ + vm_clear_memseg(vm, 0); + } +#endif +} + +void +vm_destroy(struct vm *vm) +{ + vm_cleanup(vm, true); free(vm, M_VM); } +int +vm_reinit(struct vm *vm) +{ + int error; + + /* + * A virtual machine can be reset only if all vcpus are suspended. 
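+	 * CPU_CMP() returns 0 only when the two cpusets are identical, so
+	 * the check below succeeds once every active vcpu has entered the
+	 * suspended set.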
+ */ + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + vm_cleanup(vm, false); + vm_init(vm, false); + error = 0; + } else { + error = EBUSY; + } + + return (error); +} + const char * vm_name(struct vm *vm) { return (vm->name); } -#ifdef __FreeBSD__ int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { - const boolean_t spok = TRUE; /* superpage mappings are ok */ + vm_object_t obj; - return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, - VM_PROT_RW, spok)); + if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) + return (ENOMEM); + else + return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { - const boolean_t spok = TRUE; /* superpage mappings are ok */ - return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0, - VM_PROT_NONE, spok)); + vmm_mmio_free(vm->vmspace, gpa, len); + return (0); } -#endif /* - * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise + * Return 'true' if 'gpa' is allocated in the guest address space. + * + * This function is called in the context of a running vcpu which acts as + * an implicit lock on 'vm->mem_maps[]'. */ -static boolean_t -vm_gpa_available(struct vm *vm, vm_paddr_t gpa) +bool +vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) { + struct mem_map *mm; int i; - vm_paddr_t gpabase, gpalimit; - if (gpa & PAGE_MASK) - panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa); +#ifdef INVARIANTS + int hostcpu, state; + state = vcpu_get_state(vm, vcpuid, &hostcpu); + KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, + ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); +#endif - for (i = 0; i < vm->num_mem_segs; i++) { - gpabase = vm->mem_segs[i].gpa; - gpalimit = gpabase + vm->mem_segs[i].len; - if (gpa >= gpabase && gpa < gpalimit) - return (FALSE); + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) + return (true); /* 'gpa' is sysmem or devmem */ } - return (TRUE); + if (ppt_is_mmio(vm, gpa)) + return (true); /* 'gpa' is pci passthru mmio */ + + return (false); } int -vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) +vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) { - int error, available, allocated; - struct vm_memory_segment *seg; - vm_paddr_t g, hpa; - void *host_domain; + struct mem_seg *seg; + vm_object_t obj; - const boolean_t spok = TRUE; /* superpage mappings are ok */ +#ifndef __FreeBSD__ + extern pgcnt_t get_max_page_get(void); +#endif - if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) + if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); - - available = allocated = 0; - g = gpa; - while (g < gpa + len) { - if (vm_gpa_available(vm, g)) - available++; - else - allocated++; - - g += PAGE_SIZE; - } - /* - * If there are some allocated and some available pages in the address - * range then it is an error. - */ - if (allocated && available) + if (len == 0 || (len & PAGE_MASK)) return (EINVAL); - /* - * If the entire address range being requested has already been - * allocated then there isn't anything more to do. 
- */ - if (allocated && available == 0) - return (0); - - if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) - return (E2BIG); - -#ifdef __FreeBSD__ - host_domain = iommu_host_domain(); +#ifndef __FreeBSD__ + if (len > ptob(get_max_page_get())) + return (EINVAL); #endif - seg = &vm->mem_segs[vm->num_mem_segs]; + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + if (seg->len == len && seg->sysmem == sysmem) + return (EEXIST); + else + return (EINVAL); + } - error = 0; - seg->gpa = gpa; - seg->len = 0; - while (seg->len < len) { - hpa = vmm_mem_alloc(PAGE_SIZE); - if (hpa == 0) { - error = ENOMEM; - break; - } + obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); + if (obj == NULL) + return (ENOMEM); - error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE, - VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok); - if (error) - break; + seg->len = len; + seg->object = obj; + seg->sysmem = sysmem; + return (0); +} -#ifdef __FreeBSD__ - /* - * Remove the 1:1 mapping for 'hpa' from the 'host_domain'. - * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain. - */ - iommu_remove_mapping(host_domain, hpa, PAGE_SIZE); - iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE); -#endif +int +vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + vm_object_t *objptr) +{ + struct mem_seg *seg; - seg->len += PAGE_SIZE; - } + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); - if (error) { - vm_free_mem_seg(vm, seg); - return (error); - } + seg = &vm->mem_segs[ident]; + if (len) + *len = seg->len; + if (sysmem) + *sysmem = seg->sysmem; + if (objptr) + *objptr = seg->object; + return (0); +} -#ifdef __FreeBSD__ - /* - * Invalidate cached translations associated with 'host_domain' since - * we have now moved some pages from it. 
- */ - iommu_invalidate_tlb(host_domain); -#endif +#ifndef __FreeBSD__ +static void +vm_clear_memseg(struct vm *vm, int ident) +{ + struct mem_seg *seg; - vm->num_mem_segs++; + KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, + ("%s: invalid memseg ident %d", __func__, ident)); - return (0); + seg = &vm->mem_segs[ident]; + + if (seg->object != NULL) + vm_object_clear(seg->object); } +#endif -vm_paddr_t -vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) +void +vm_free_memseg(struct vm *vm, int ident) { - vm_paddr_t nextpage; + struct mem_seg *seg; - nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE); - if (len > nextpage - gpa) - panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len); + KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, + ("%s: invalid memseg ident %d", __func__, ident)); - return (VMMMAP_GET(vm->cookie, gpa)); + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + vm_object_deallocate(seg->object); + bzero(seg, sizeof(struct mem_seg)); + } } -void * -vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, - void **cookie) +int +vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, + size_t len, int prot, int flags) { -#ifdef __FreeBSD__ - int count, pageoff; - vm_page_t m; + struct mem_seg *seg; + struct mem_map *m, *map; + vm_ooffset_t last; + int i, error; + if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) + return (EINVAL); + + if (flags & ~VM_MEMMAP_F_WIRED) + return (EINVAL); + + if (segid < 0 || segid >= VM_MAX_MEMSEGS) + return (EINVAL); + + seg = &vm->mem_segs[segid]; + if (seg->object == NULL) + return (EINVAL); + + last = first + len; + if (first < 0 || first >= last || last > seg->len) + return (EINVAL); + + if ((gpa | first | last) & PAGE_MASK) + return (EINVAL); + + map = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + m = &vm->mem_maps[i]; + if (m->len == 0) { + map = m; + break; + } + } + + if (map == NULL) + return (ENOSPC); + + error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, + len, 0, VMFS_NO_SPACE, prot, prot, 0); + if (error != KERN_SUCCESS) + return (EFAULT); + + vm_object_reference(seg->object); + + if ((flags & VM_MEMMAP_F_WIRED) != 0) { + error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + if (error != KERN_SUCCESS) { + vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); + return (error == KERN_RESOURCE_SHORTAGE ? 
ENOMEM : + EFAULT); + } + } + + map->gpa = gpa; + map->len = len; + map->segoff = first; + map->segid = segid; + map->prot = prot; + map->flags = flags; + return (0); +} + +int +vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) +{ + struct mem_map *mm, *mmnext; + int i; + + mmnext = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len == 0 || mm->gpa < *gpa) + continue; + if (mmnext == NULL || mm->gpa < mmnext->gpa) + mmnext = mm; + } + + if (mmnext != NULL) { + *gpa = mmnext->gpa; + if (segid) + *segid = mmnext->segid; + if (segoff) + *segoff = mmnext->segoff; + if (len) + *len = mmnext->len; + if (prot) + *prot = mmnext->prot; + if (flags) + *flags = mmnext->flags; + return (0); + } else { + return (ENOENT); + } +} + +static void +vm_free_memmap(struct vm *vm, int ident) +{ + struct mem_map *mm; + int error; + + mm = &vm->mem_maps[ident]; + if (mm->len) { + error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, + mm->gpa + mm->len); + KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", + __func__, error)); + bzero(mm, sizeof(struct mem_map)); + } +} + +static __inline bool +sysmem_mapping(struct vm *vm, struct mem_map *mm) +{ + + if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) + return (true); + else + return (false); +} + +vm_paddr_t +vmm_sysmem_maxaddr(struct vm *vm) +{ + struct mem_map *mm; + vm_paddr_t maxaddr; + int i; + + maxaddr = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm)) { + if (maxaddr < mm->gpa + mm->len) + maxaddr = mm->gpa + mm->len; + } + } + return (maxaddr); +} + +static void +vm_iommu_modify(struct vm *vm, boolean_t map) +{ + int i, sz; + vm_paddr_t gpa, hpa; + struct mem_map *mm; +#ifdef __FreeBSD__ + void *vp, *cookie, *host_domain; +#else + void *vp, *cookie, *host_domain __unused; +#endif + + sz = PAGE_SIZE; + host_domain = iommu_host_domain(); + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (!sysmem_mapping(vm, mm)) + continue; + + if (map) { + KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, + ("iommu map found invalid memmap %#lx/%#lx/%#x", + mm->gpa, mm->len, mm->flags)); + if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) + continue; + mm->flags |= VM_MEMMAP_F_IOMMU; + } else { + if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) + continue; + mm->flags &= ~VM_MEMMAP_F_IOMMU; + KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, + ("iommu unmap found invalid memmap %#lx/%#lx/%#x", + mm->gpa, mm->len, mm->flags)); + } + + gpa = mm->gpa; + while (gpa < mm->gpa + mm->len) { + vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE, + &cookie); + KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", + vm_name(vm), gpa)); + + vm_gpa_release(cookie); + + hpa = DMAP_TO_PHYS((uintptr_t)vp); + if (map) { + iommu_create_mapping(vm->iommu, gpa, hpa, sz); + iommu_remove_mapping(host_domain, hpa, sz); + } else { + iommu_remove_mapping(vm->iommu, gpa, sz); + iommu_create_mapping(host_domain, hpa, hpa, sz); + } + + gpa += PAGE_SIZE; + } + } + + /* + * Invalidate the cached translations associated with the domain + * from which pages were removed. 
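+	 * Note the inversion below: mapping into the guest domain removed
+	 * the 1:1 entries from the host domain (and vice versa), so it is
+	 * the other domain whose TLB must be flushed.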
+ */ + if (map) + iommu_invalidate_tlb(host_domain); + else + iommu_invalidate_tlb(vm->iommu); +} + +#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) +#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) + +int +vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) +{ + int error; + + error = ppt_unassign_device(vm, bus, slot, func); + if (error) + return (error); + + if (ppt_assigned_devices(vm) == 0) + vm_iommu_unmap(vm); + + return (0); +} + +int +vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) +{ + int error; + vm_paddr_t maxaddr; + + /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ + if (ppt_assigned_devices(vm) == 0) { + KASSERT(vm->iommu == NULL, + ("vm_assign_pptdev: iommu must be NULL")); + maxaddr = vmm_sysmem_maxaddr(vm); + vm->iommu = iommu_create_domain(maxaddr); + if (vm->iommu == NULL) + return (ENXIO); + vm_iommu_map(vm); + } + + error = ppt_assign_device(vm, bus, slot, func); + return (error); +} + +void * +vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot, + void **cookie) +{ + int i, count, pageoff; + struct mem_map *mm; + vm_page_t m; +#ifdef INVARIANTS + /* + * All vcpus are frozen by ioctls that modify the memory map + * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is + * guaranteed if at least one vcpu is in the VCPU_FROZEN state. + */ + int state; + KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d", + __func__, vcpuid)); + for (i = 0; i < vm->maxcpus; i++) { + if (vcpuid != -1 && vcpuid != i) + continue; + state = vcpu_get_state(vm, i, NULL); + KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", + __func__, state)); + } +#endif pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); - count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, - trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + count = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm) && gpa >= mm->gpa && + gpa < mm->gpa + mm->len) { + count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, + trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + break; + } + } if (count == 1) { *cookie = m; @@ -635,54 +1159,23 @@ vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, *cookie = NULL; return (NULL); } -#else - int pageoff; - vm_paddr_t hpa; - - pageoff = gpa & PAGE_MASK; - if (len > PAGE_SIZE - pageoff) - panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); - - hpa = vm_gpa2hpa(vm, gpa, len); - if (hpa == (vm_paddr_t)-1) - return (NULL); - - return (hat_kpm_pfn2va(btop(hpa)) + pageoff); -#endif } void vm_gpa_release(void *cookie) { -#ifdef __FreeBSD__ vm_page_t m = cookie; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); -#endif -} - -int -vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, - struct vm_memory_segment *seg) -{ - int i; - - for (i = 0; i < vm->num_mem_segs; i++) { - if (gpabase == vm->mem_segs[i].gpa) { - *seg = vm->mem_segs[i]; - return (0); - } - } - return (-1); } int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) @@ -697,7 +1190,7 @@ vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) struct vcpu *vcpu; int error; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) @@ -751,7 +1244,7 @@ vm_get_seg_desc(struct vm *vm, int 
vcpu, int reg, struct seg_desc *desc) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) @@ -764,7 +1257,7 @@ int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) @@ -784,6 +1277,10 @@ restore_guest_fpustate(struct vcpu *vcpu) fpu_stop_emulating(); fpurestore(vcpu->guestfpu); + /* restore guest XCR0 if XSAVE is enabled in the host */ + if (rcr4() & CR4_XSAVE) + load_xcr(0, vcpu->guest_xcr0); + /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. @@ -798,20 +1295,35 @@ save_guest_fpustate(struct vcpu *vcpu) if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); + /* save guest XCR0 and restore host XCR0 */ + if (rcr4() & CR4_XSAVE) { + vcpu->guest_xcr0 = rxcr(0); + load_xcr(0, vmm_get_host_xcr0()); + } + /* save guest FPU state */ fpu_stop_emulating(); fpusave(vcpu->guestfpu); +#ifdef __FreeBSD__ fpu_start_emulating(); +#else + /* + * When the host state has been restored, we should not re-enable + * CR0.TS on illumos for eager FPU. + */ +#endif } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int -vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, +vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { + struct vcpu *vcpu; int error; + vcpu = &vm->vcpu[vcpuid]; vcpu_assert_locked(vcpu); /* @@ -820,8 +1332,17 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, * ioctl() operating on a vcpu at any point. 
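+	 * Setting 'reqidle' and notifying the vcpu below kicks a running
+	 * vcpu out of the guest so it can observe the requested transition
+	 * to the IDLE state.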
*/ if (from_idle) { - while (vcpu->state != VCPU_IDLE) + while (vcpu->state != VCPU_IDLE) { + vcpu->reqidle = 1; + vcpu_notify_event_locked(vcpu, false); + VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " + "idle requested", vcpu_state2str(vcpu->state)); +#ifdef __FreeBSD__ msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); +#else + cv_wait(&vcpu->state_cv, &vcpu->mtx.m); +#endif + } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); @@ -855,17 +1376,36 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, break; } + if (newstate == VCPU_RUNNING) { + while (vcpu->runblock != 0) { +#ifdef __FreeBSD__ + msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); +#else + cv_wait(&vcpu->state_cv, &vcpu->mtx.m); +#endif + } + } + if (error) return (EBUSY); + VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", + vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); + vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; - if (newstate == VCPU_IDLE) + if (newstate == VCPU_IDLE || + (newstate == VCPU_FROZEN && vcpu->runblock != 0)) { +#ifdef __FreeBSD__ wakeup(&vcpu->state); +#else + cv_broadcast(&vcpu->state_cv); +#endif + } return (0); } @@ -880,11 +1420,11 @@ vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) } static void -vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) +vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; - if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) + if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } @@ -894,60 +1434,139 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) static int vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) { - struct vm_exit *vmexit; struct vcpu *vcpu; - int t, timo, spindown; +#ifdef __FreeBSD__ + const char *wmesg; +#else + const char *wmesg __unused; +#endif + int t, vcpu_halted, vm_halted; + + KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); vcpu = &vm->vcpu[vcpuid]; - spindown = 0; + vcpu_halted = 0; + vm_halted = 0; vcpu_lock(vcpu); + while (1) { + /* + * Do a final check for pending NMI or interrupts before + * really putting this thread to sleep. Also check for + * software events that would cause this vcpu to wakeup. + * + * These interrupts/events could have happened after the + * vcpu returned from VMRUN() and before it acquired the + * vcpu lock above. + */ + if (vm->suspend || vcpu->reqidle) + break; + if (vm_nmi_pending(vm, vcpuid)) + break; + if (!intr_disabled) { + if (vm_extint_pending(vm, vcpuid) || + vlapic_pending_intr(vcpu->vlapic, NULL)) { + break; + } + } - /* - * Do a final check for pending NMI or interrupts before - * really putting this thread to sleep. - * - * These interrupts could have happened any time after we - * returned from VMRUN() and before we grabbed the vcpu lock. - */ - if (vm->rendezvous_func == NULL && - !vm_nmi_pending(vm, vcpuid) && - (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) { - t = ticks; - vcpu_require_state_locked(vcpu, VCPU_SLEEPING); - if (vlapic_enabled(vcpu->vlapic)) { - /* - * XXX msleep_spin() is not interruptible so use the - * 'timo' to put an upper bound on the sleep time. 
- */ - timo = hz; - msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo); + /* Don't go to sleep if the vcpu thread needs to yield */ + if (vcpu_should_yield(vm, vcpuid)) + break; + + if (vcpu_debugged(vm, vcpuid)) + break; + + /* + * Some Linux guests implement "halt" by having all vcpus + * execute HLT with interrupts disabled. 'halted_cpus' keeps + * track of the vcpus that have entered this state. When all + * vcpus enter the halted state the virtual machine is halted. + */ + if (intr_disabled) { + wmesg = "vmhalt"; + VCPU_CTR0(vm, vcpuid, "Halted"); + if (!vcpu_halted && halt_detection_enabled) { + vcpu_halted = 1; + CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); + } + if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { + vm_halted = 1; + break; + } } else { - /* - * Spindown the vcpu if the apic is disabled and it - * had entered the halted state. - */ - spindown = 1; + wmesg = "vmidle"; } - vcpu_require_state_locked(vcpu, VCPU_FROZEN); + + t = ticks; + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); +#ifdef __FreeBSD__ + /* + * XXX msleep_spin() cannot be interrupted by signals so + * wake up periodically to check pending signals. + */ + msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); +#else + /* + * Fortunately, cv_wait_sig can be interrupted by signals, so + * there is no need to periodically wake up. + */ + (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m); +#endif + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); } + + if (vcpu_halted) + CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); + vcpu_unlock(vcpu); -#ifdef __FreeBSD__ - /* - * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it - * outside the confines of the vcpu spinlock. - */ - if (spindown) { - *retu = true; - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU; - vm_deactivate_cpu(vm, vcpuid); - VCPU_CTR0(vm, vcpuid, "spinning down cpu"); + if (vm_halted) + vm_suspend(vm, VM_SUSPEND_HALT); + + return (0); +} + +static int +vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) +{ + int rv, ftype; + struct vm_map *map; + struct vcpu *vcpu; + struct vm_exit *vme; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + + ftype = vme->u.paging.fault_type; + KASSERT(ftype == VM_PROT_READ || + ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, + ("vm_handle_paging: invalid fault_type %d", ftype)); + + if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { + rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), + vme->u.paging.gpa, ftype); + if (rv == 0) { + VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", + ftype == VM_PROT_READ ? 
"accessed" : "dirty", + vme->u.paging.gpa); + goto done; + } } -#endif + map = &vm->vmspace->vm_map; + rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); + + VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " + "ftype = %d", rv, vme->u.paging.gpa, ftype); + + if (rv != KERN_SUCCESS) + return (EFAULT); +done: return (0); } @@ -962,11 +1581,14 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; - int cs_d, error, length; + int cs_d, error, fault; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cs_base = vme->u.inst_emul.cs_base; @@ -979,37 +1601,31 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) /* Fetch, decode and emulate the faulting instruction */ if (vie->num_valid == 0) { - /* - * If the instruction length is not known then assume a - * maximum size instruction. - */ - length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE; error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + - cs_base, length, vie); + cs_base, VIE_INST_SIZE, vie, &fault); } else { /* * The instruction bytes have already been copied into 'vie' */ - error = 0; + error = fault = 0; } - if (error == 1) - return (0); /* Resume guest to handle page fault */ - else if (error == -1) - return (EFAULT); - else if (error != 0) - panic("%s: vmm_fetch_instruction error %d", __func__, error); + if (error || fault) + return (error); - if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) - return (EFAULT); + if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { + VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", + vme->rip + cs_base); + *retu = true; /* dump instruction bytes in userspace */ + return (0); + } /* - * If the instruction length was not specified then update it now - * along with 'nextrip'. + * Update 'nextrip' based on the length of the emulated instruction. */ - if (vme->inst_length == 0) { - vme->inst_length = vie->num_processed; - vcpu->nextrip += vie->num_processed; - } + vme->inst_length = vie->num_processed; + vcpu->nextrip += vie->num_processed; + VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction " + "decoding", vcpu->nextrip); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { @@ -1032,47 +1648,394 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) return (error); } +static int +vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) +{ +#ifdef __FreeBSD__ + int i, done; + struct vcpu *vcpu; + + done = 0; +#else + int i; + struct vcpu *vcpu; +#endif + vcpu = &vm->vcpu[vcpuid]; + + CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); + + /* + * Wait until all 'active_cpus' have suspended themselves. + */ + vcpu_lock(vcpu); + while (1) { + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); + break; + } + + VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); +#ifdef __FreeBSD__ + msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); +#else + /* + * To prevent vm_handle_suspend from becoming stuck in the + * kernel if the bhyve process driving its vCPUs is killed, + * offer a bail-out, even though not all the vCPUs have reached + * the suspended state. 
+ */ + if (cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, + hz, TR_CLOCK_TICK) <= 0) { + if ((curproc->p_flag & SEXITING) != 0) { + vcpu_require_state_locked(vm, vcpuid, + VCPU_FROZEN); + break; + } + } +#endif + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); + } + vcpu_unlock(vcpu); + + /* + * Wakeup the other sleeping vcpus and return to userspace. + */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->suspended_cpus)) { + vcpu_notify_event(vm, i, false); + } + } + + *retu = true; + return (0); +} + +static int +vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); + vcpu->reqidle = 0; + vcpu_unlock(vcpu); + *retu = true; + return (0); +} + +#ifndef __FreeBSD__ +static int +vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) +{ + struct vcpu *cpu = &vm->vcpu[vcpuid]; + const uint32_t code = vme->u.msr.code; + const uint64_t val = vme->u.msr.wval; + + switch (code) { + case MSR_TSC: + cpu->tsc_offset = val - rdtsc(); + return (0); + } + + return (-1); +} +#endif /* __FreeBSD__ */ + +int +vm_suspend(struct vm *vm, enum vm_suspend_how how) +{ + int i; + + if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) + return (EINVAL); + + if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) { + VM_CTR2(vm, "virtual machine already suspended %d/%d", + vm->suspend, how); + return (EALREADY); + } + + VM_CTR1(vm, "virtual machine successfully suspended %d", how); + + /* + * Notify all active vcpus that they are now suspended. + */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm, i, false); + } + + return (0); +} + +void +vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, + ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_SUSPENDED; + vmexit->u.suspended.how = vm->suspend; +} + +void +vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_DEBUG; +} + +void +vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_RUNBLOCK; + vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1); +} + +void +vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_REQIDLE; + vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); +} + +void +vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); +} + +#ifndef __FreeBSD__ +/* + * Some vmm resources, such as the lapic, may have CPU-specific resources + * allocated to them which would benefit from migration onto the host CPU which + * is processing the vcpu state. 
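+ * Localization runs from vm_run() once thread affinity has been set, so
+ * the chosen host CPU stays stable across the VMRUN critical section.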
+ */ +static void +vm_localize_resources(struct vm *vm, struct vcpu *vcpu) +{ + /* + * Localizing cyclic resources requires acquisition of cpu_lock, and + * doing so with kpreempt disabled is a recipe for deadlock disaster. + */ + VERIFY(curthread->t_preempt == 0); + + /* + * Do not bother with localization if this vCPU is about to return to + * the host CPU it was last localized to. + */ + if (vcpu->lastloccpu == curcpu) + return; + + /* + * Localize system-wide resources to the primary boot vCPU. While any + * of the other vCPUs may access them, it keeps the potential interrupt + * footprint constrained to CPUs involved with this instance. + */ + if (vcpu == &vm->vcpu[0]) { + vhpet_localize_resources(vm->vhpet); + vrtc_localize_resources(vm->vrtc); + vatpit_localize_resources(vm->vatpit); + } + + vlapic_localize_resources(vcpu->vlapic); + + vcpu->lastloccpu = curcpu; +} + +static void +vmm_savectx(void *arg) +{ + vm_thread_ctx_t *vtc = arg; + struct vm *vm = vtc->vtc_vm; + const int vcpuid = vtc->vtc_vcpuid; + + if (ops->vmsavectx != NULL) { + ops->vmsavectx(vm->cookie, vcpuid); + } + + /* + * If the CPU holds the restored guest FPU state, save it and restore + * the host FPU state before this thread goes off-cpu. + */ + if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + save_guest_fpustate(vcpu); + vtc->vtc_status &= ~VTCS_FPU_RESTORED; + } +} + +static void +vmm_restorectx(void *arg) +{ + vm_thread_ctx_t *vtc = arg; + struct vm *vm = vtc->vtc_vm; + const int vcpuid = vtc->vtc_vcpuid; + + /* + * When coming back on-cpu, only restore the guest FPU status if the + * thread is in a context marked as requiring it. This should be rare, + * occurring only when a future logic error results in a voluntary + * sleep during the VMRUN critical section. + * + * The common case will result in elision of the guest FPU state + * restoration, deferring that action until it is clearly necessary + * during vm_run. + */ + VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); + if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + restore_guest_fpustate(vcpu); + vtc->vtc_status |= VTCS_FPU_RESTORED; + } + + if (ops->vmrestorectx != NULL) { + ops->vmrestorectx(vm->cookie, vcpuid); + } + +} + +/* + * If we're in removectx(), we might still have state to tidy up. 
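To make the flag protocol shared by vmm_savectx(), vmm_restorectx(), and vm_run() easier to follow, here is an illustrative summary (not part of the original source) of the two vtc_status bits used in this hunk:

        /*
         * VTCS_FPU_RESTORED:     the guest FPU state currently lives in the
         *                        CPU; set when restore_guest_fpustate() runs,
         *                        cleared when vmm_savectx() writes it back
         *                        and restores the host FPU state.
         * VTCS_FPU_CTX_CRITICAL: the thread is inside the VMRUN critical
         *                        window, so coming back on-cpu must reload
         *                        the guest FPU state immediately.
         */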
+ */ +static void +vmm_freectx(void *arg, int isexec) +{ + vmm_savectx(arg); +} + +#endif /* __FreeBSD__ */ + int vm_run(struct vm *vm, struct vm_run *vmrun) { + struct vm_eventinfo evinfo; int error, vcpuid; struct vcpu *vcpu; +#ifdef __FreeBSD__ struct pcb *pcb; +#endif uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; + pmap_t pmap; +#ifndef __FreeBSD__ + vm_thread_ctx_t vtc; + int affinity_type = CPU_CURRENT; +#endif vcpuid = vmrun->cpuid; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); + pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; + evinfo.rptr = &vcpu->runblock; + evinfo.sptr = &vm->suspend; + evinfo.iptr = &vcpu->reqidle; + +#ifndef __FreeBSD__ + vtc.vtc_vm = vm; + vtc.vtc_vcpuid = vcpuid; + vtc.vtc_status = 0; + + installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL, + NULL, vmm_freectx); +#endif + restart: +#ifndef __FreeBSD__ + thread_affinity_set(curthread, affinity_type); + /* + * Resource localization should happen after the CPU affinity for the + * thread has been set to ensure that access from restricted contexts, + * such as VMX-accelerated APIC operations, can occur without inducing + * cyclic cross-calls. + * + * This must be done prior to disabling kpreempt via critical_enter(). + */ + vm_localize_resources(vm, vcpu); + + affinity_type = CPU_CURRENT; +#endif + critical_enter(); + KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), + ("vm_run: absurd pm_active")); + tscval = rdtsc(); #ifdef __FreeBSD__ pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); +#else + /* Force a trip through update_sregs to reload %fs/%gs and friends */ + PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); #endif -#ifndef __FreeBSD__ - installctx(curthread, vcpu, save_guest_fpustate, - restore_guest_fpustate, NULL, NULL, NULL, NULL); -#endif +#ifdef __FreeBSD__ restore_guest_fpustate(vcpu); +#else + if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { + restore_guest_fpustate(vcpu); + vtc.vtc_status |= VTCS_FPU_RESTORED; + } + vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; +#endif vcpu_require_state(vm, vcpuid, VCPU_RUNNING); - error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); + error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo); vcpu_require_state(vm, vcpuid, VCPU_FROZEN); +#ifdef __FreeBSD__ save_guest_fpustate(vcpu); +#else + vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; +#endif + #ifndef __FreeBSD__ - removectx(curthread, vcpu, save_guest_fpustate, - restore_guest_fpustate, NULL, NULL, NULL, NULL); + /* + * Once clear of the delicate contexts comprising the VM_RUN handler, + * thread CPU affinity can be loosened while other processing occurs.
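For context, vm_run() is the kernel half of the exit loop; the bhyve process drives it from userland roughly as sketched here (the VM_RUN ioctl and struct vm_run fields appear in this patch; `vmfd` and `handle_exit()` are hypothetical):

        struct vm_run vmrun = { .cpuid = vcpuid };
        int error;

        do {
                error = ioctl(vmfd, VM_RUN, &vmrun);    /* enters vm_run() */
                /* exits with retu == true arrive in vmrun.vm_exit */
        } while (error == 0 && handle_exit(&vmrun.vm_exit) == 0);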
+ */ + thread_affinity_clear(curthread); #endif vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); @@ -1083,10 +2046,25 @@ restart: retu = false; vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { + case VM_EXITCODE_REQIDLE: + error = vm_handle_reqidle(vm, vcpuid, &retu); + break; + case VM_EXITCODE_SUSPENDED: + error = vm_handle_suspend(vm, vcpuid, &retu); + break; + case VM_EXITCODE_IOAPIC_EOI: + vioapic_process_eoi(vm, vcpuid, + vme->u.ioapic_eoi.vector); + break; + case VM_EXITCODE_RUNBLOCK: + break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); break; + case VM_EXITCODE_PAGING: + error = vm_handle_paging(vm, vcpuid, &retu); + break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vm, vcpuid, &retu); break; @@ -1094,18 +2072,42 @@ restart: case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vm, vcpuid, vme, &retu); break; + case VM_EXITCODE_MONITOR: + case VM_EXITCODE_MWAIT: + case VM_EXITCODE_VMINSN: + vm_inject_ud(vm, vcpuid); + break; +#ifndef __FreeBSD__ + case VM_EXITCODE_WRMSR: + if (vm_handle_wrmsr(vm, vcpuid, vme) != 0) { + retu = true; + } + break; + + case VM_EXITCODE_HT: { + affinity_type = CPU_BEST; + break; + } + +#endif default: retu = true; /* handled in userland */ break; } } - if (error == 0 && retu == false) { + if (error == 0 && retu == false) goto restart; - } + +#ifndef __FreeBSD__ + removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL, + NULL, vmm_freectx); +#endif + + VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); /* copy the exit information */ - bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); + bcopy(vme, &vmrun->vm_exit, sizeof (struct vm_exit)); return (error); } @@ -1119,7 +2121,7 @@ vm_restart_instruction(void *arg, int vcpuid) int error; vm = arg; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; @@ -1158,7 +2160,7 @@ vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) struct vcpu *vcpu; int type, vector; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; @@ -1262,9 +2264,7 @@ nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); -#ifdef __FreeBSD__ vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); -#endif *retinfo = 0; return (0); } @@ -1293,11 +2293,11 @@ vcpu_exception_intinfo(struct vcpu *vcpu) uint64_t info = 0; if (vcpu->exception_pending) { - info = vcpu->exception.vector & 0xff; + info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; - if (vcpu->exception.error_code_valid) { + if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; - info |= (uint64_t)vcpu->exception.error_code << 32; + info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); @@ -1310,7 +2310,8 @@ vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) uint64_t info1, info2; int valid; - KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + KASSERT(vcpuid >= 0 && + vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid)); vcpu = &vm->vcpu[vcpuid]; @@ -1322,7 +2323,7 @@ vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", - 
vcpu->exception.vector, info2); + vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { @@ -1346,76 +2347,93 @@ vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) } int -vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception) +vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + vcpu = &vm->vcpu[vcpuid]; + *info1 = vcpu->exitintinfo; + *info2 = vcpu_exception_intinfo(vcpu); + return (0); +} + +int +vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) +{ + struct vcpu *vcpu; + uint64_t regval; + int error; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vector < 0 || vector >= 32) return (EINVAL); - if (exception->vector < 0 || exception->vector >= 32) + /* + * A double fault exception should never be injected directly into + * the guest. It is a derived exception that results from specific + * combinations of nested faults. + */ + if (vector == IDT_DF) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (vcpu->exception_pending) { VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " - "pending exception %d", exception->vector, - vcpu->exception.vector); + "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } - vcpu->exception_pending = 1; - vcpu->exception = *exception; - VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector); - return (0); -} + if (errcode_valid) { + /* + * Exceptions don't deliver an error code in real mode. + */ + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, ®val); + KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); + if (!(regval & CR0_PE)) + errcode_valid = 0; + } -int -vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception) -{ - struct vcpu *vcpu; - int pending; + /* + * From section 26.6.1 "Interruptibility State" in Intel SDM: + * + * Event blocking by "STI" or "MOV SS" is cleared after guest executes + * one instruction or incurs an exception. 
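As a worked example of the intinfo encoding produced by vcpu_exception_intinfo() earlier in this hunk (hypothetical values, assuming a pending #GP with a valid error code of 0):

        uint64_t info;

        info = IDT_GP & 0xff;                           /* exc_vector = 13 */
        info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
        info |= VM_INTINFO_DEL_ERRCODE;                 /* exc_errcode_valid */
        info |= (uint64_t)0 << 32;                      /* exc_errcode = 0 */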
+ */ + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); + KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", + __func__, error)); - KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + if (restart_instruction) + vm_restart_instruction(vm, vcpuid); - vcpu = &vm->vcpu[vcpuid]; - pending = vcpu->exception_pending; - if (pending) { - vcpu->exception_pending = 0; - *exception = vcpu->exception; - VCPU_CTR1(vm, vcpuid, "Exception %d delivered", - exception->vector); - } - return (pending); + vcpu->exception_pending = 1; + vcpu->exc_vector = vector; + vcpu->exc_errcode = errcode; + vcpu->exc_errcode_valid = errcode_valid; + VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); + return (0); } void vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, int errcode) { - struct vm_exception exception; - struct vm_exit *vmexit; struct vm *vm; - int error; + int error, restart_instruction; vm = vmarg; + restart_instruction = 1; - exception.vector = vector; - exception.error_code = errcode; - exception.error_code_valid = errcode_valid; - error = vm_inject_exception(vm, vcpuid, &exception); + error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, + errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); - - /* - * A fault-like exception allows the instruction to be restarted - * after the exception handler returns. - * - * By setting the inst_length to 0 we ensure that the instruction - * pointer remains at the faulting instruction. - */ - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->inst_length = 0; } void @@ -1441,14 +2459,13 @@ vm_inject_nmi(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; vcpu_notify_event(vm, vcpuid, false); - return (0); } @@ -1457,7 +2474,7 @@ vm_nmi_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1470,7 +2487,7 @@ vm_nmi_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1489,14 +2506,13 @@ vm_inject_extint(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->extint_pending = 1; vcpu_notify_event(vm, vcpuid, false); - return (0); } @@ -1505,7 +2521,7 @@ vm_extint_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1518,7 +2534,7 @@ vm_extint_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1533,7 +2549,7 @@ vm_extint_clear(struct vm *vm, int vcpuid) int vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) @@ -1545,7 +2561,7 @@ vm_get_capability(struct vm *vm, int vcpu, int type, 
int *retval) int vm_set_capability(struct vm *vm, int vcpu, int type, int val) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) @@ -1554,22 +2570,24 @@ vm_set_capability(struct vm *vm, int vcpu, int type, int val) return (VMSETCAP(vm->cookie, vcpu, type, val)); } -struct vhpet * -vm_hpet(struct vm *vm) +struct vlapic * +vm_lapic(struct vm *vm, int cpu) { - return (vm->vhpet); + return (vm->vcpu[cpu].vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { + return (vm->vioapic); } -struct vlapic * -vm_lapic(struct vm *vm, int cpu) +struct vhpet * +vm_hpet(struct vm *vm) { - return (vm->vcpu[cpu].vlapic); + + return (vm->vhpet); } #ifdef __FreeBSD__ @@ -1594,7 +2612,7 @@ vmm_is_pptdev(int bus, int slot, int func) /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = 0; for (i = 0; names[i] != NULL && !found; i++) { - cp = val = getenv(names[i]); + cp = val = kern_getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; @@ -1630,13 +2648,13 @@ vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, int error; struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_set_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); - error = vcpu_set_state_locked(vcpu, newstate, from_idle); + error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); vcpu_unlock(vcpu); return (error); @@ -1648,7 +2666,7 @@ vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) struct vcpu *vcpu; enum vcpu_state state; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1662,11 +2680,67 @@ vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) return (state); } +void +vcpu_block_run(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vcpu_block_run: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu->runblock++; + if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) { + vcpu_notify_event_locked(vcpu, false); + } + while (vcpu->state == VCPU_RUNNING) { +#ifdef __FreeBSD__ + msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); +#else + cv_wait(&vcpu->state_cv, &vcpu->mtx.m); +#endif + } + vcpu_unlock(vcpu); +} + +void +vcpu_unblock_run(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vcpu_block_run: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + KASSERT(vcpu->runblock != 0, ("expected non-zero runblock")); + vcpu->runblock--; + if (vcpu->runblock == 0) { +#ifdef __FreeBSD__ + wakeup(&vcpu->state); +#else + cv_broadcast(&vcpu->state_cv); +#endif + } + vcpu_unlock(vcpu); +} + +#ifndef __FreeBSD__ +uint64_t +vcpu_tsc_offset(struct vm *vm, int vcpuid) +{ + return (vm->vcpu[vcpuid].tsc_offset); +} +#endif /* __FreeBSD__ */ + int vm_activate_cpu(struct vm *vm, int vcpuid) { - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->active_cpus)) @@ -1677,6 +2751,55 @@ vm_activate_cpu(struct vm *vm, int vcpuid) return (0); } +int +vm_suspend_cpu(struct vm *vm, int vcpuid) +{ + int i; + + if (vcpuid < -1 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vcpuid == -1) { + vm->debug_cpus = vm->active_cpus; + for (i = 0; i < vm->maxcpus; i++) { 
+ if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm, i, false); + } + } else { + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); + vcpu_notify_event(vm, vcpuid, false); + } + return (0); +} + +int +vm_resume_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < -1 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vcpuid == -1) { + CPU_ZERO(&vm->debug_cpus); + } else { + if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) + return (EINVAL); + + CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); + } + return (0); +} + +int +vcpu_debugged(struct vm *vm, int vcpuid) +{ + + return (CPU_ISSET(vcpuid, &vm->debug_cpus)); +} + cpuset_t vm_active_cpus(struct vm *vm) { @@ -1684,6 +2807,20 @@ vm_active_cpus(struct vm *vm) return (vm->active_cpus); } +cpuset_t +vm_debug_cpus(struct vm *vm) +{ + + return (vm->debug_cpus); +} + +cpuset_t +vm_suspended_cpus(struct vm *vm) +{ + + return (vm->suspended_cpus); +} + void * vcpu_stats(struct vm *vm, int vcpuid) { @@ -1694,7 +2831,7 @@ vcpu_stats(struct vm *vm, int vcpuid) int vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) { - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); *state = vm->vcpu[vcpuid].x2apic_state; @@ -1705,7 +2842,7 @@ vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) int vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (state >= X2APIC_STATE_LAST) @@ -1725,15 +2862,11 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. 
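A hedged usage sketch of the notification rules described above: a device thread that has queued an interrupt kicks the target vCPU so that a running vCPU traps out and a sleeping one wakes (vlapic_set_intr_ready() is the existing vlapic entry point; `vec` and `level` are hypothetical):

        if (vlapic_set_intr_ready(vlapic, vec, level))
                vcpu_notify_event(vm, vcpuid, true);    /* true: lapic_intr */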
*/ -void -vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) +static void +vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) { int hostcpu; - struct vcpu *vcpu; - vcpu = &vm->vcpu[vcpuid]; - - vcpu_lock(vcpu); hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); @@ -1755,12 +2888,33 @@ vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); - if (vcpu->state == VCPU_SLEEPING) + if (vcpu->state == VCPU_SLEEPING) { +#ifdef __FreeBSD__ wakeup_one(vcpu); +#else + cv_signal(&vcpu->vcpu_cv); +#endif + } } +} + +void +vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu_notify_event_locked(vcpu, lapic_intr); vcpu_unlock(vcpu); } +struct vmspace * +vm_get_vmspace(struct vm *vm) +{ + + return (vm->vmspace); +} + int vm_apicid2vcpuid(struct vm *vm, int apicid) { @@ -1782,6 +2936,20 @@ vm_atpit(struct vm *vm) return (vm->vatpit); } +struct vpmtmr * +vm_pmtmr(struct vm *vm) +{ + + return (vm->vpmtmr); +} + +struct vrtc * +vm_rtc(struct vm *vm) +{ + + return (vm->vrtc); +} + enum vm_reg_name vm_segment_name(int seg) { @@ -1805,19 +2973,17 @@ vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, { int idx; -#ifdef __FreeBSD__ for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } -#endif bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, - int num_copyinfo) + int num_copyinfo, int *fault) { int error, idx, nused; size_t n, off, remaining; @@ -1830,8 +2996,8 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, remaining = len; while (remaining > 0) { KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); - error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); - if (error) + error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); + if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); @@ -1843,8 +3009,8 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, } for (idx = 0; idx < nused; idx++) { - hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len, - prot, &cookie); + hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa, + copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; @@ -1853,8 +3019,9 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, if (idx != nused) { vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); - return (-1); + return (EFAULT); } else { + *fault = 0; return (0); } } @@ -1892,3 +3059,125 @@ vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, idx++; } } + +/* + * Return the amount of in-use and wired memory for the VM. 
Since + * these are global stats, only return the values with for vCPU 0 + */ +VMM_STAT_DECLARE(VMM_MEM_RESIDENT); +VMM_STAT_DECLARE(VMM_MEM_WIRED); + +static void +vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) +{ + + if (vcpu == 0) { + vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, + PAGE_SIZE * vmspace_resident_count(vm->vmspace)); + } +} + +static void +vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) +{ + + if (vcpu == 0) { + vmm_stat_set(vm, vcpu, VMM_MEM_WIRED, + PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace))); + } +} + +VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); +VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); + +#ifndef __FreeBSD__ +int +vm_ioport_hook(struct vm *vm, uint_t ioport, vmm_rmem_cb_t rfunc, + vmm_wmem_cb_t wfunc, void *arg, void **cookie) +{ + list_t *ih = &vm->ioport_hooks; + vm_ioport_hook_t *hook, *node; + + if (ioport == 0) { + return (EINVAL); + } + + /* + * Find the node position in the list which this region should be + * inserted behind to maintain sorted order. + */ + for (node = list_tail(ih); node != NULL; node = list_prev(ih, node)) { + if (ioport == node->vmih_ioport) { + /* Reject duplicate port hook */ + return (EEXIST); + } else if (ioport > node->vmih_ioport) { + break; + } + } + + hook = kmem_alloc(sizeof (*hook), KM_SLEEP); + hook->vmih_ioport = ioport; + hook->vmih_arg = arg; + hook->vmih_rmem_cb = rfunc; + hook->vmih_wmem_cb = wfunc; + if (node == NULL) { + list_insert_head(ih, hook); + } else { + list_insert_after(ih, node, hook); + } + + *cookie = (void *)hook; + return (0); +} + +void +vm_ioport_unhook(struct vm *vm, void **cookie) +{ + vm_ioport_hook_t *hook; + list_t *ih = &vm->ioport_hooks; + + hook = *cookie; + list_remove(ih, hook); + kmem_free(hook, sizeof (*hook)); + *cookie = NULL; +} + +int +vm_ioport_handle_hook(struct vm *vm, int cpuid, bool in, int port, int bytes, + uint32_t *val) +{ + vm_ioport_hook_t *hook; + list_t *ih = &vm->ioport_hooks; + int err = 0; + + for (hook = list_head(ih); hook != NULL; hook = list_next(ih, hook)) { + if (hook->vmih_ioport == port) { + break; + } + } + if (hook == NULL) { + return (ENOENT); + } + + if (in) { + uint64_t tval; + + if (hook->vmih_rmem_cb == NULL) { + return (ENOENT); + } + err = hook->vmih_rmem_cb(hook->vmih_arg, (uintptr_t)port, + (uint_t)bytes, &tval); + *val = (uint32_t)tval; + } else { + if (hook->vmih_wmem_cb == NULL) { + return (ENOENT); + } + err = hook->vmih_wmem_cb(hook->vmih_arg, (uintptr_t)port, + (uint_t)bytes, (uint64_t)*val); + } + + return (err); +} + + +#endif /* __FreeBSD__ */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm.mapfile b/usr/src/uts/i86pc/io/vmm/vmm.mapfile new file mode 100644 index 0000000000..83c14de895 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm.mapfile @@ -0,0 +1,62 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. 
+# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # bhyve driver API + vmm_drv_hold; + vmm_drv_rele; + vmm_drv_release_reqd; + vmm_drv_lease_sign; + vmm_drv_lease_break; + vmm_drv_lease_expired; + vmm_drv_gpa2kva; + vmm_drv_ioport_hook; + vmm_drv_ioport_unhook; + vmm_drv_msi; + + # IOMMU API for PCI pass-thru + iommu_add_device; + iommu_host_domain; + iommu_remove_device; + lapic_intr_msi; + vm_iommu_domain; + vm_map_mmio; + vm_unmap_mmio; + + local: + *; +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.c b/usr/src/uts/i86pc/io/vmm/vmm_host.c index b94caf4009..9e390c93dd 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_host.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_host.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -39,7 +41,7 @@ */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z neel $"); +__FBSDID("$FreeBSD$"); #include #include @@ -50,11 +52,14 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z ne #include "vmm_host.h" -static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4; +static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4, + vmm_host_xcr0; +static struct xsave_limits vmm_xsave_limits; void vmm_host_state_init(void) { + unsigned int regs[4]; vmm_host_efer = rdmsr(MSR_EFER); vmm_host_pat = rdmsr(MSR_PAT); @@ -68,7 +73,36 @@ vmm_host_state_init(void) */ vmm_host_cr0 = rcr0() | CR0_TS; - vmm_host_cr4 = rcr4(); + /* + * On non-PCID or PCID but without INVPCID support machines, + * we flush kernel i.e. global TLB entries, by temporary + * clearing the CR4.PGE bit, see invltlb_glob(). If + * preemption occurs at the wrong time, cached vmm_host_cr4 + * might store the value with CR4.PGE cleared. Since FreeBSD + * requires support for PG_G on amd64, just set it + * unconditionally. + */ + vmm_host_cr4 = rcr4() | CR4_PGE; + + /* + * Only permit a guest to use XSAVE if the host is using + * XSAVE. Only permit a guest to use XSAVE features supported + * by the host. This ensures that the FPU state used by the + * guest is always a subset of the saved guest FPU state. + * + * In addition, only permit known XSAVE features where the + * rules for which features depend on other features is known + * to properly emulate xsetbv. 
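A sketch of how these limits would gate a guest's xsetbv (the real check lives in the per-vendor exit handler, which is not part of this hunk; `newval` is the hypothetical guest-requested XCR0 value):

        const struct xsave_limits *limits = vmm_get_xsave_limits();

        if (!limits->xsave_enabled ||
            (newval & ~limits->xcr0_allowed) != 0) {
                vm_inject_gp(vm, vcpuid);       /* refuse unsupported bits */
                return (0);
        }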
+ */ + if (vmm_host_cr4 & CR4_XSAVE) { + vmm_xsave_limits.xsave_enabled = 1; + vmm_host_xcr0 = rxcr(0); + vmm_xsave_limits.xcr0_allowed = vmm_host_xcr0 & + (XFEATURE_AVX | XFEATURE_MPX | XFEATURE_AVX512); + + cpuid_count(0xd, 0x0, regs); + vmm_xsave_limits.xsave_max_size = regs[1]; + } } uint64_t @@ -99,6 +133,13 @@ vmm_get_host_cr4(void) return (vmm_host_cr4); } +uint64_t +vmm_get_host_xcr0(void) +{ + + return (vmm_host_xcr0); +} + uint64_t vmm_get_host_datasel(void) { @@ -122,7 +163,6 @@ vmm_get_host_codesel(void) #endif } - uint64_t vmm_get_host_tsssel(void) { @@ -158,3 +198,10 @@ vmm_get_host_idtrbase(void) return (idtr.dtr_base); #endif } + +const struct xsave_limits * +vmm_get_xsave_limits(void) +{ + + return (&vmm_xsave_limits); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.h b/usr/src/uts/i86pc/io/vmm/vmm_host.h index 5de015a228..f12047819d 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_host.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_host.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_host.h 242275 2012-10-29 01:51:24Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,6 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _VMM_HOST_H_ @@ -46,20 +49,28 @@ #endif #ifndef _KERNEL -#error "no user-servicable parts inside" +#error "no user-serviceable parts inside" #endif +struct xsave_limits { + int xsave_enabled; + uint64_t xcr0_allowed; + uint32_t xsave_max_size; +}; + void vmm_host_state_init(void); uint64_t vmm_get_host_pat(void); uint64_t vmm_get_host_efer(void); uint64_t vmm_get_host_cr0(void); uint64_t vmm_get_host_cr4(void); +uint64_t vmm_get_host_xcr0(void); uint64_t vmm_get_host_datasel(void); uint64_t vmm_get_host_codesel(void); uint64_t vmm_get_host_tsssel(void); uint64_t vmm_get_host_fsbase(void); uint64_t vmm_get_host_idtrbase(void); +const struct xsave_limits *vmm_get_xsave_limits(void); /* * Inline access to host state that is used on every VM entry @@ -89,8 +100,10 @@ vmm_get_host_gdtrbase(void) #endif } +#ifdef __FreeBSD__ struct pcpu; extern struct pcpu __pcpu[]; +#endif static __inline uint64_t vmm_get_host_gsbase(void) diff --git a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c index 72c7056e26..ea96cd8db0 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Sandvine, Inc. * Copyright (c) 2012 NetApp, Inc. * All rights reserved. @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_instruction_emul.c 281987 2015-04-25 19:02:06Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -37,15 +39,17 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_instruction_emul.c 281987 2015-04-25 19:02:06Z tychon $"); +__FBSDID("$FreeBSD$"); #ifdef _KERNEL #include #include #include +#include #include #include @@ -84,6 +88,9 @@ enum { VIE_OP_TYPE_MOVS, VIE_OP_TYPE_GROUP1, VIE_OP_TYPE_STOS, + VIE_OP_TYPE_BITTEST, + VIE_OP_TYPE_TWOB_GRP15, + VIE_OP_TYPE_ADD, VIE_OP_TYPE_LAST }; @@ -94,7 +101,12 @@ enum { #define VIE_OP_F_NO_MODRM (1 << 3) #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) +#ifdef _KERNEL static const struct vie_op two_byte_opcodes[256] = { + [0xAE] = { + .op_byte = 0xAE, + .op_type = VIE_OP_TYPE_TWOB_GRP15, + }, [0xB6] = { .op_byte = 0xB6, .op_type = VIE_OP_TYPE_MOVZX, @@ -103,6 +115,11 @@ static const struct vie_op two_byte_opcodes[256] = { .op_byte = 0xB7, .op_type = VIE_OP_TYPE_MOVZX, }, + [0xBA] = { + .op_byte = 0xBA, + .op_type = VIE_OP_TYPE_BITTEST, + .op_flags = VIE_OP_F_IMM8, + }, [0xBE] = { .op_byte = 0xBE, .op_type = VIE_OP_TYPE_MOVSX, @@ -110,14 +127,26 @@ static const struct vie_op two_byte_opcodes[256] = { }; static const struct vie_op one_byte_opcodes[256] = { + [0x03] = { + .op_byte = 0x03, + .op_type = VIE_OP_TYPE_ADD, + }, [0x0F] = { .op_byte = 0x0F, .op_type = VIE_OP_TYPE_TWO_BYTE }, + [0x0B] = { + .op_byte = 0x0B, + .op_type = VIE_OP_TYPE_OR, + }, [0x2B] = { .op_byte = 0x2B, .op_type = VIE_OP_TYPE_SUB, }, + [0x39] = { + .op_byte = 0x39, + .op_type = VIE_OP_TYPE_CMP, + }, [0x3B] = { .op_byte = 0x3B, .op_type = VIE_OP_TYPE_CMP, @@ -183,14 +212,20 @@ static const struct vie_op one_byte_opcodes[256] = { .op_byte = 0x23, .op_type = VIE_OP_TYPE_AND, }, + [0x80] = { + /* Group 1 extended opcode */ + .op_byte = 0x80, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, [0x81] = { - /* XXX Group 1 extended opcode */ + /* Group 1 extended opcode */ .op_byte = 0x81, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM, }, [0x83] = { - /* XXX Group 1 extended opcode */ + /* Group 1 extended opcode */ .op_byte = 0x83, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM8, @@ -206,6 +241,7 @@ static const struct vie_op one_byte_opcodes[256] = { .op_type = VIE_OP_TYPE_PUSH, } }; +#endif /* struct vie.mod */ #define VIE_MOD_INDIRECT 0 @@ -394,6 +430,41 @@ getcc(int opsize, uint64_t x, uint64_t y) return (getcc64(x, y)); } +/* + * Macro creation of functions getaddflags{8,16,32,64} + */ +#define GETADDFLAGS(sz) \ +static u_long \ +getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + u_long rflags; \ + \ + __asm __volatile("add %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack + +GETADDFLAGS(8); +GETADDFLAGS(16); +GETADDFLAGS(32); +GETADDFLAGS(64); + +static u_long +getaddflags(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getaddflags: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getaddflags8(x, y)); + else if (opsize == 2) + return (getaddflags16(x, y)); + else if (opsize == 4) + return (getaddflags32(x, y)); + else + return (getaddflags64(x, y)); +} + static int emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) @@ -596,13 +667,11 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, /* * Helper function to calculate and validate a linear address. - * - * Returns 0 on success and 1 if an exception was injected into the guest. 
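Note the convention change rippling through this file: in place of the tri-state return dropped above, callers now receive host-side errors (e.g. EFAULT) via the return value and injected guest exceptions via an out-parameter, as the movs emulation below does:

        int error, fault;

        error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
            PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault);
        if (error || fault)
                goto done;      /* EFAULT fails the call; fault resumes guest */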
*/ static int get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, int opsize, int addrsize, int prot, enum vm_reg_name seg, - enum vm_reg_name gpr, uint64_t *gla) + enum vm_reg_name gpr, uint64_t *gla, int *fault) { struct seg_desc desc; uint64_t cr0, val, rflags; @@ -628,7 +697,7 @@ get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); - return (1); + goto guest_fault; } if (vie_canonical_check(paging->cpu_mode, *gla)) { @@ -636,14 +705,19 @@ get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); - return (1); + goto guest_fault; } if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { vm_inject_ac(vm, vcpuid, 0); - return (1); + goto guest_fault; } + *fault = 0; + return (0); + +guest_fault: + *fault = 1; return (0); } @@ -659,7 +733,7 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, #endif uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; uint64_t rcx, rdi, rsi, rflags; - int error, opsize, seg, repeat; + int error, fault, opsize, seg, repeat; opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; val = 0; @@ -682,8 +756,10 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * The count register is %rcx, %ecx or %cx depending on the * address size of the instruction. */ - if ((rcx & vie_size2mask(vie->addrsize)) == 0) - return (0); + if ((rcx & vie_size2mask(vie->addrsize)) == 0) { + error = 0; + goto done; + } } /* @@ -704,13 +780,16 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, - PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr); - if (error) + PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault); + if (error || fault) goto done; error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, - copyinfo, nitems(copyinfo)); + copyinfo, nitems(copyinfo), &fault); if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + /* * case (2): read from system memory and write to mmio. */ @@ -719,11 +798,6 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = memwrite(vm, vcpuid, gpa, val, opsize, arg); if (error) goto done; - } else if (error > 0) { - /* - * Resume guest execution to handle fault. - */ - goto done; } else { /* * 'vm_copy_setup()' is expected to fail for cases (3) and (4) @@ -731,13 +805,17 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, */ error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, - PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr); - if (error) + PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr, + &fault); + if (error || fault) goto done; error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, - PROT_WRITE, copyinfo, nitems(copyinfo)); + PROT_WRITE, copyinfo, nitems(copyinfo), &fault); if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + /* * case (3): read from MMIO and write to system memory. * @@ -753,27 +831,29 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, vm_copyout(vm, vcpuid, &val, copyinfo, opsize); vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); - } else if (error > 0) { - /* - * Resume guest execution to handle fault. - */ - goto done; } else { /* * Case (4): read from and write to mmio. 
+ * + * Commit to the MMIO read/write (with potential + * side-effects) only after we are sure that the + * instruction is not going to be restarted due + * to address translation faults. */ error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, - PROT_READ, &srcgpa); - if (error) - goto done; - error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); - if (error) + PROT_READ, &srcgpa, &fault); + if (error || fault) goto done; error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, - PROT_WRITE, &dstgpa); + PROT_WRITE, &dstgpa, &fault); + if (error || fault) + goto done; + + error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); if (error) goto done; + error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); if (error) goto done; @@ -818,10 +898,9 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, vm_restart_instruction(vm, vcpuid); } done: - if (error < 0) - return (EFAULT); - else - return (0); + KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d", + __func__, error)); + return (error); } static int @@ -979,12 +1058,38 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; - uint64_t val1, result, rflags, rflags2; + enum vm_reg_name reg; + uint64_t result, rflags, rflags2, val1, val2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { + case 0x0B: + /* + * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. + * + * 0b/r or r16, r/m16 + * 0b/r or r32, r/m32 + * REX.W + 0b/r or r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + result = val1 | val2; + error = vie_update_register(vm, vcpuid, reg, result, size); + break; case 0x81: case 0x83: /* @@ -1041,39 +1146,55 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; - uint64_t op1, op2, rflags, rflags2; + uint64_t regop, memop, op1, op2, rflags, rflags2; enum vm_reg_name reg; size = vie->opsize; switch (vie->op.op_byte) { + case 0x39: case 0x3B: /* + * 39/r CMP r/m16, r16 + * 39/r CMP r/m32, r32 + * REX.W 39/r CMP r/m64, r64 + * * 3B/r CMP r16, r/m16 * 3B/r CMP r32, r/m32 * REX.W + 3B/r CMP r64, r/m64 * - * Compare first operand (reg) with second operand (r/m) and + * Compare the first operand with the second operand and * set status flags in EFLAGS register. The comparison is * performed by subtracting the second operand from the first * operand and then setting the status flags. 
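A concrete instance of the rule above, for illustration: a 16-bit CMP with op1 = 1 and op2 = 2 computes 1 - 2, so getcc() returns flags with CF and SF set and ZF clear, and only the status bits are merged back into the guest's %rflags:

        rflags2 = getcc(size, op1, op2);        /* flags from op1 - op2 */
        rflags &= ~RFLAGS_STATUS_BITS;
        rflags |= rflags2 & RFLAGS_STATUS_BITS;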
*/ - /* Get the first operand */ + /* Get the register operand */ reg = gpr_map[vie->reg]; - error = vie_read_register(vm, vcpuid, reg, &op1); + error = vie_read_register(vm, vcpuid, reg, ®op); if (error) return (error); - /* Get the second operand */ - error = memread(vm, vcpuid, gpa, &op2, size, arg); + /* Get the memory operand */ + error = memread(vm, vcpuid, gpa, &memop, size, arg); if (error) return (error); + if (vie->op.op_byte == 0x3B) { + op1 = regop; + op2 = memop; + } else { + op1 = memop; + op2 = regop; + } rflags2 = getcc(size, op1, op2); break; + case 0x80: case 0x81: case 0x83: /* + * 80 /7 cmp r/m8, imm8 + * REX + 80 /7 cmp r/m8, imm8 + * * 81 /7 cmp r/m16, imm16 * 81 /7 cmp r/m32, imm32 * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 @@ -1089,6 +1210,8 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * the status flags. * */ + if (vie->op.op_byte == 0x80) + size = 1; /* get the first operand */ error = memread(vm, vcpuid, gpa, &op1, size, arg); @@ -1110,6 +1233,62 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (error); } +static int +emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t nval, rflags, rflags2, val1, val2; + enum vm_reg_name reg; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x03: + /* + * ADD r/m to r and store the result in r + * + * 03/r ADD r16, r/m16 + * 03/r ADD r32, r/m32 + * REX.W + 03/r ADD r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + nval = val1 + val2; + error = vie_update_register(vm, vcpuid, reg, nval, size); + break; + default: + break; + } + + if (!error) { + rflags2 = getaddflags(size, val1, val2); + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + &rflags); + if (error) + return (error); + + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + rflags, 8); + } + + return (error); +} + static int emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) @@ -1178,7 +1357,7 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, #endif struct seg_desc ss_desc; uint64_t cr0, rflags, rsp, stack_gla, val; - int error, size, stackaddrsize, pushop; + int error, fault, size, stackaddrsize, pushop; val = 0; size = vie->opsize; @@ -1201,7 +1380,7 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, size = vie->opsize_override ? 2 : 8; } else { /* - * In protected or compability mode the 'B' flag in the + * In protected or compatibility mode the 'B' flag in the * stack-segment descriptor determines the size of the * stack pointer. */ @@ -1244,18 +1423,10 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, } error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, - pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo)); - if (error == -1) { - /* - * XXX cannot return a negative error value here because it - * ends up being the return value of the VM_RUN() ioctl and - * is interpreted as a pseudo-error (for e.g. ERESTART). 
- */ - return (EFAULT); - } else if (error == 1) { - /* Resume guest execution to handle page fault */ - return (0); - } + pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), + &fault); + if (error || fault) + return (error); if (pushop) { error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); @@ -1346,6 +1517,79 @@ emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (error); } +static int +emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) +{ + uint64_t val, rflags; + int error, bitmask, bitoff; + + /* + * 0F BA is a Group 8 extended opcode. + * + * Currently we only emulate the 'Bit Test' instruction which is + * identified by a ModR/M:reg encoding of 100b. + */ + if ((vie->reg & 7) != 4) + return (EINVAL); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg); + if (error) + return (error); + + /* + * Intel SDM, Vol 2, Table 3-2: + * "Range of Bit Positions Specified by Bit Offset Operands" + */ + bitmask = vie->opsize * 8 - 1; + bitoff = vie->immediate & bitmask; + + /* Copy the bit into the Carry flag in %rflags */ + if (val & (1UL << bitoff)) + rflags |= PSL_C; + else + rflags &= ~PSL_C; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); + + return (0); +} + +static int +emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) +{ + int error; + uint64_t buf; + + switch (vie->reg & 7) { + case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ + if (vie->mod == 0x3) { + /* + * SFENCE. Ignore it, VM exit provides enough + * barriers on its own. + */ + error = 0; + } else { + /* + * CLFLUSH, CLFLUSHOPT. Only check for access + * rights. 
+ */ + error = memread(vm, vcpuid, gpa, &buf, 1, memarg); + } + break; + default: + error = EINVAL; + break; + } + + return (error); +} + int vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, @@ -1402,6 +1646,18 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = emulate_sub(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; + case VIE_OP_TYPE_BITTEST: + error = emulate_bittest(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_TWOB_GRP15: + error = emulate_twob_group15(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_ADD: + error = emulate_add(vm, vcpuid, gpa, vie, memread, + memwrite, memarg); + break; default: error = EINVAL; break; @@ -1623,27 +1879,31 @@ ptp_release(void **cookie) } static void * -ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) +ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) { void *ptr; ptp_release(cookie); - ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie); + ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie); return (ptr); } -int -vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa) +static int +_vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) { - int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; + int nlevels, pfcode, retval, usermode, writable; + int ptpshift = 0, ptpindex = 0; + uint64_t ptpphys; + uint64_t *ptpbase = NULL, pte = 0, pgsize = 0; #ifdef __FreeBSD__ -#endif u_int retries; - uint64_t *ptpbase, ptpphys, pte, pgsize; +#endif uint32_t *ptpbase32, pte32; void *cookie; + *guest_fault = 0; + usermode = (paging->cpl == 3 ? 1 : 0); writable = prot & VM_PROT_WRITE; cookie = NULL; @@ -1664,7 +1924,8 @@ restart: * XXX assuming a non-stack reference otherwise a stack fault * should be generated. */ - vm_inject_gp(vm, vcpuid); + if (!check_only) + vm_inject_gp(vm, vcpuid); goto fault; } @@ -1679,7 +1940,8 @@ restart: /* Zero out the lower 12 bits. */ ptpphys &= ~0xfff; - ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, + &cookie); if (ptpbase32 == NULL) goto error; @@ -1693,9 +1955,11 @@ restart: if ((pte32 & PG_V) == 0 || (usermode && (pte32 & PG_U) == 0) || (writable && (pte32 & PG_RW) == 0)) { - pfcode = pf_error_code(usermode, prot, 0, - pte32); - vm_inject_pf(vm, vcpuid, pfcode, gla); + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, + pte32); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } goto fault; } @@ -1706,7 +1970,7 @@ restart: * is only set at the last level providing the guest * physical address. 
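For intuition, a worked (hypothetical) numeric example of the large-page case in this walk: when the translation stops at a 2 MB leaf (PG_PS set), pgsize is 1 << 21 and the low 21 bits of the linear address become the page offset, so after the attribute bits are stripped from the PTE:

        /* pte frame 0x40000000, gla 0x7f123456 -> gpa 0x40123456 */
        *gpa = pte | (gla & (pgsize - 1));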
*/ - if ((pte32 & PG_A) == 0) { + if (!check_only && (pte32 & PG_A) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_A) == 0) { goto restart; @@ -1721,7 +1985,7 @@ restart: } /* Set the dirty bit in the page table entry if necessary */ - if (writable && (pte32 & PG_M) == 0) { + if (!check_only && writable && (pte32 & PG_M) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_M) == 0) { goto restart; @@ -1738,7 +2002,8 @@ restart: /* Zero out the lower 5 bits and the upper 32 bits */ ptpphys &= 0xffffffe0UL; - ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie); + ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4, + &cookie); if (ptpbase == NULL) goto error; @@ -1747,8 +2012,10 @@ restart: pte = ptpbase[ptpindex]; if ((pte & PG_V) == 0) { - pfcode = pf_error_code(usermode, prot, 0, pte); - vm_inject_pf(vm, vcpuid, pfcode, gla); + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } goto fault; } @@ -1761,7 +2028,7 @@ restart: /* Zero out the lower 12 bits and the upper 12 bits */ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; - ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); if (ptpbase == NULL) goto error; @@ -1774,13 +2041,15 @@ restart: if ((pte & PG_V) == 0 || (usermode && (pte & PG_U) == 0) || (writable && (pte & PG_RW) == 0)) { - pfcode = pf_error_code(usermode, prot, 0, pte); - vm_inject_pf(vm, vcpuid, pfcode, gla); + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } goto fault; } /* Set the accessed bit in the page table entry */ - if ((pte & PG_A) == 0) { + if (!check_only && (pte & PG_A) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_A) == 0) { goto restart; @@ -1789,8 +2058,11 @@ restart: if (nlevels > 0 && (pte & PG_PS) != 0) { if (pgsize > 1 * GB) { - pfcode = pf_error_code(usermode, prot, 1, pte); - vm_inject_pf(vm, vcpuid, pfcode, gla); + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 1, + pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } goto fault; } break; @@ -1800,7 +2072,7 @@ restart: } /* Set the dirty bit in the page table entry if necessary */ - if (writable && (pte & PG_M) == 0) { + if (!check_only && writable && (pte & PG_M) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) goto restart; } @@ -1810,18 +2082,38 @@ restart: *gpa = pte | (gla & (pgsize - 1)); done: ptp_release(&cookie); + KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d", + __func__, retval)); return (retval); error: - retval = -1; + retval = EFAULT; goto done; fault: - retval = 1; + *guest_fault = 1; goto done; } +int +vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + + return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, + false)); +} + +int +vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + + return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, + true)); +} + int vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, - uint64_t rip, int inst_length, struct vie *vie) + uint64_t rip, int inst_length, struct vie *vie, int *faultptr) { struct vm_copyinfo copyinfo[2]; int error, prot; @@ -1831,13 +2123,14 @@ vmm_fetch_instruction(struct vm *vm, int vcpuid, 
struct vm_guest_paging *paging, prot = PROT_READ | PROT_EXEC; error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, - copyinfo, nitems(copyinfo)); - if (error == 0) { - vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); - vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); - vie->num_valid = inst_length; - } - return (error); + copyinfo, nitems(copyinfo), faultptr); + if (error || *faultptr) + return (error); + + vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vie->num_valid = inst_length; + return (0); } static int @@ -2261,28 +2554,18 @@ decode_moffset(struct vie *vie) return (0); } -/* - * Verify that all the bytes in the instruction buffer were consumed. - */ -static int -verify_inst_length(struct vie *vie) -{ - - if (vie->num_processed) - return (0); - else - return (-1); -} - /* * Verify that the 'guest linear address' provided as collateral of the nested * page table fault matches with our instruction decoding. */ static int -verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) +verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie, + enum vm_cpu_mode cpu_mode) { int error; - uint64_t base, idx, gla2; + uint64_t base, segbase, idx, gla2; + enum vm_reg_name seg; + struct seg_desc desc; /* Skip 'gla' verification */ if (gla == VIE_INVALID_GLA) @@ -2302,7 +2585,7 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) * instruction */ if (vie->base_register == VM_REG_GUEST_RIP) - base += vie->num_valid; + base += vie->num_processed; } idx = 0; @@ -2315,14 +2598,48 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) } } - /* XXX assuming that the base address of the segment is 0 */ - gla2 = base + vie->scale * idx + vie->displacement; + /* + * From "Specifying a Segment Selector", Intel SDM, Vol 1 + * + * In 64-bit mode, segmentation is generally (but not + * completely) disabled. The exceptions are the FS and GS + * segments. + * + * In legacy IA-32 mode, when the ESP or EBP register is used + * as the base, the SS segment is the default segment. For + * other data references, except when relative to stack or + * string destination the DS segment is the default. These + * can be overridden to allow other segments to be accessed. 
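A small worked example of the recomputation below (hypothetical register values): with a flat segment (segbase 0), base 0x1000, scale 4, index 2, displacement 8, and 64-bit addressing:

        gla2 = segbase + base + vie->scale * idx + vie->displacement;
        /* 0 + 0x1000 + 4 * 2 + 8 == 0x1010, then masked by size2mask[8] */
        gla2 &= size2mask[vie->addrsize];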
+ */ + if (vie->segment_override) + seg = vie->segment_register; + else if (vie->base_register == VM_REG_GUEST_RSP || + vie->base_register == VM_REG_GUEST_RBP) + seg = VM_REG_GUEST_SS; + else + seg = VM_REG_GUEST_DS; + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + error = vm_get_seg_desc(vm, cpuid, seg, &desc); + if (error) { + printf("verify_gla: error %d getting segment" + " descriptor %d", error, + vie->segment_register); + return (-1); + } + segbase = desc.base; + } + + gla2 = segbase + base + vie->scale * idx + vie->displacement; gla2 &= size2mask[vie->addrsize]; if (gla != gla2) { - printf("verify_gla mismatch: " + printf("verify_gla mismatch: segbase(0x%0lx)" "base(0x%0lx), scale(%d), index(0x%0lx), " "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", - base, vie->scale, idx, vie->displacement, gla, gla2); + segbase, base, vie->scale, idx, vie->displacement, + gla, gla2); return (-1); } @@ -2355,11 +2672,8 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, if (decode_moffset(vie)) return (-1); - if (verify_inst_length(vie)) - return (-1); - if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { - if (verify_gla(vm, cpuid, gla, vie)) + if (verify_gla(vm, cpuid, gla, vie, cpu_mode)) return (-1); } diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c index bea750f162..3d08fd5e85 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * @@ -25,12 +27,9 @@ */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_ioport.c 277168 2015-01-14 07:18:51Z neel $"); +__FBSDID("$FreeBSD$"); #include -#include -#include -#include #include #include @@ -38,6 +37,8 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_ioport.c 277168 2015-01-14 07:18:51Z #include "vatpic.h" #include "vatpit.h" +#include "vpmtmr.h" +#include "vrtc.h" #include "vmm_ioport.h" #include "vmm_ktr.h" @@ -55,6 +56,9 @@ ioport_handler_func_t ioport_handler[MAX_IOPORTS] = { [IO_ICU2 + ICU_IMR_OFFSET] = vatpic_slave_handler, [IO_ELCR1] = vatpic_elc_handler, [IO_ELCR2] = vatpic_elc_handler, + [IO_PMTMR] = vpmtmr_handler, + [IO_RTC] = vrtc_addr_handler, + [IO_RTC + 1] = vrtc_data_handler, }; #ifdef KTR @@ -103,6 +107,7 @@ emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, uint32_t mask, val; int error; +#ifdef __FreeBSD__ /* * If there is no handler for the I/O port then punt to userspace. 
*/ @@ -111,6 +116,28 @@ emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, *retu = true; return (0); } +#else /* __FreeBSD__ */ + handler = NULL; + if (vmexit->u.inout.port < MAX_IOPORTS) { + handler = ioport_handler[vmexit->u.inout.port]; + } + /* Look for hooks, if a standard handler is not present */ + if (handler == NULL) { + mask = vie_size2mask(vmexit->u.inout.bytes); + if (!vmexit->u.inout.in) { + val = vmexit->u.inout.eax & mask; + } + error = vm_ioport_handle_hook(vm, vcpuid, vmexit->u.inout.in, + vmexit->u.inout.port, vmexit->u.inout.bytes, &val); + if (error == 0) { + goto finish; + } + + *retu = true; + return (0); + } + +#endif /* __FreeBSD__ */ mask = vie_size2mask(vmexit->u.inout.bytes); @@ -131,6 +158,9 @@ emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, return (EIO); } +#ifndef __FreeBSD__ +finish: +#endif /* __FreeBSD__ */ if (vmexit->u.inout.in) { vmexit->u.inout.eax &= ~mask; vmexit->u.inout.eax |= val & mask; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h index 624dd8f1d8..14e315f400 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_ioport.h 273706 2014-10-26 19:03:06Z neel $ + * $FreeBSD$ */ #ifndef _VMM_IOPORT_H_ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ipi.h b/usr/src/uts/i86pc/io/vmm/vmm_ipi.h deleted file mode 100644 index 4dff03ba1f..0000000000 --- a/usr/src/uts/i86pc/io/vmm/vmm_ipi.h +++ /dev/null @@ -1,37 +0,0 @@ -/*- - * Copyright (c) 2011 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * $FreeBSD: head/sys/amd64/vmm/vmm_ipi.h 260466 2014-01-09 03:25:54Z neel $ - */ - -#ifndef _VMM_IPI_H_ -#define _VMM_IPI_H_ - -#ifdef __FreeBSD__ -int vmm_ipi_alloc(void); -void vmm_ipi_free(int num); -#endif - -#endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ktr.h b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h index 917c7f83a4..414d0341cc 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_ktr.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_ktr.h 258699 2013-11-27 22:18:08Z neel $ + * $FreeBSD$ */ #ifndef _VMM_KTR_H_ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c index 3215c74a44..43b2bebe97 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -39,7 +41,7 @@ */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z tychon $"); +__FBSDID("$FreeBSD$"); #include #include @@ -49,7 +51,6 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z t #include #include -#include "vmm_ipi.h" #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vlapic.h" @@ -67,10 +68,14 @@ lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) { struct vlapic *vlapic; - if (cpu < 0 || cpu >= VM_MAXCPU) + if (cpu < 0 || cpu >= vm_get_maxcpus(vm)) return (EINVAL); - if (vector < 32 || vector > 255) + /* + * According to section "Maskable Hardware Interrupts" in Intel SDM + * vectors 16 through 255 can be delivered through the local APIC. + */ + if (vector < 16 || vector > 255) return (EINVAL); vlapic = vm_lapic(vm, cpu); @@ -86,7 +91,7 @@ lapic_set_local_intr(struct vm *vm, int cpu, int vector) cpuset_t dmask; int error; - if (cpu < -1 || cpu >= VM_MAXCPU) + if (cpu < -1 || cpu >= vm_get_maxcpus(vm)) return (EINVAL); if (cpu == -1) diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.h b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h index ee47ee7783..da3b0ff660 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_lapic.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_lapic.h 259863 2013-12-25 06:46:31Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.c b/usr/src/uts/i86pc/io/vmm/vmm_mem.c new file mode 100644 index 0000000000..a736d94bba --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_mem.c @@ -0,0 +1,124 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "vmm_mem.h" + +int +vmm_mem_init(void) +{ + + return (0); +} + +vm_object_t +vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa) +{ + int error; + vm_object_t obj; + struct sglist *sg; + + sg = sglist_alloc(1, M_WAITOK); + error = sglist_append_phys(sg, hpa, len); + KASSERT(error == 0, ("error %d appending physaddr to sglist", error)); + + obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL); + if (obj != NULL) { + /* + * VT-x ignores the MTRR settings when figuring out the + * memory type for translations obtained through EPT. + * + * Therefore we explicitly force the pages provided by + * this object to be mapped as uncacheable. + */ + VM_OBJECT_WLOCK(obj); + error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE); + VM_OBJECT_WUNLOCK(obj); + if (error != KERN_SUCCESS) { + panic("vmm_mmio_alloc: vm_object_set_memattr error %d", + error); + } + error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0, + VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0); + if (error != KERN_SUCCESS) { + vm_object_deallocate(obj); + obj = NULL; + } + } + + /* + * Drop the reference on the sglist. + * + * If the scatter/gather object was successfully allocated then it + * has incremented the reference count on the sglist. Dropping the + * initial reference count ensures that the sglist will be freed + * when the object is deallocated. + * + * If the object could not be allocated then we end up freeing the + * sglist. + */ + sglist_free(sg); + + return (obj); +} + +void +vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) +{ + + vm_map_remove(&vmspace->vm_map, gpa, gpa + len); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (ptoa(Maxmem)); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.h b/usr/src/uts/i86pc/io/vmm/vmm_mem.h index 05dc37fb9a..e6f88fb222 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_mem.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_mem.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. 
* @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_mem.h 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -41,9 +43,13 @@ #ifndef _VMM_MEM_H_ #define _VMM_MEM_H_ +struct vmspace; +struct vm_object; + int vmm_mem_init(void); -vm_paddr_t vmm_mem_alloc(size_t size); -void vmm_mem_free(vm_paddr_t start, size_t size); +struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa); +void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size); vm_paddr_t vmm_mem_maxaddr(void); #endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c index 79e1cb1a44..2b612b20e9 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -11,6 +11,7 @@ /* * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #include @@ -20,246 +21,401 @@ #include #include #include -/* - * struct modctl in contains "void *__unused". - * Do this ugly workaround to avoid it. - */ -#undef __unused +#include #include #include +#include +#include +#include +#include + +#include +#include +#include #include #include #include #include +#include #include #include #include "io/vatpic.h" #include "io/vioapic.h" +#include "io/vrtc.h" +#include "io/vhpet.h" #include "vmm_lapic.h" +#include "vmm_stat.h" +#include "vmm_util.h" +#include "vm/vm_glue.h" -static dev_info_t *vmm_dip; -static void *vmm_statep; +/* + * Locking details: + * + * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is + * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data + * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire + * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to + * acquire vmmdev_mtx, as they could deadlock with plugin unregistration. + */ -static SLIST_HEAD(, vmm_softc) head; +static kmutex_t vmmdev_mtx; +static dev_info_t *vmmdev_dip; +static hma_reg_t *vmmdev_hma_reg; +static sdev_plugin_hdl_t vmmdev_sdev_hdl; -static kmutex_t vmmdev_mtx; +static kmutex_t vmm_mtx; +static list_t vmm_list; +static list_t vmm_destroy_list; +static id_space_t *vmm_minors; +static void *vmm_statep; -/* - * vmm trace ring - */ -int vmm_dmsg_ring_size = VMM_DMSG_RING_SIZE; -static vmm_trace_rbuf_t *vmm_debug_rbuf; -static vmm_trace_dmsg_t *vmm_trace_dmsg_alloc(void); -static void vmm_trace_dmsg_free(void); -static void vmm_trace_rbuf_alloc(void); -static void vmm_trace_rbuf_free(void); +static const char *vmmdev_hvm_name = "bhyve"; -/* - * This routine is used to manage debug messages - * on ring buffer. 
- */ -static vmm_trace_dmsg_t * -vmm_trace_dmsg_alloc(void) +/* For sdev plugin (/dev) */ +#define VMM_SDEV_ROOT "/dev/vmm" + +/* From uts/i86pc/io/vmm/intel/vmx.c */ +extern int vmx_x86_supported(const char **); + +/* Holds and hooks from drivers external to vmm */ +struct vmm_hold { + list_node_t vmh_node; + vmm_softc_t *vmh_sc; + boolean_t vmh_release_req; + uint_t vmh_ioport_hook_cnt; +}; + +struct vmm_lease { + list_node_t vml_node; + struct vm *vml_vm; + boolean_t vml_expired; + boolean_t (*vml_expire_func)(void *); + void *vml_expire_arg; + list_node_t vml_expire_node; + struct vmm_hold *vml_hold; +}; + +static int vmm_drv_block_hook(vmm_softc_t *, boolean_t); +static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *); + +static int +vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) { - vmm_trace_dmsg_t *dmsg_alloc, *dmsg = vmm_debug_rbuf->dmsgp; + int error; + bool sysmem; - if (vmm_debug_rbuf->looped == TRUE) { - vmm_debug_rbuf->dmsgp = dmsg->next; - return (vmm_debug_rbuf->dmsgp); - } + error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, + NULL); + if (error || mseg->len == 0) + return (error); - /* - * If we're looping for the first time, - * connect the ring. - */ - if (((vmm_debug_rbuf->size + (sizeof (vmm_trace_dmsg_t))) > - vmm_debug_rbuf->maxsize) && (vmm_debug_rbuf->dmsgh != NULL)) { - dmsg->next = vmm_debug_rbuf->dmsgh; - vmm_debug_rbuf->dmsgp = vmm_debug_rbuf->dmsgh; - vmm_debug_rbuf->looped = TRUE; - return (vmm_debug_rbuf->dmsgp); - } - - /* If we've gotten this far then memory allocation is needed */ - dmsg_alloc = kmem_zalloc(sizeof (vmm_trace_dmsg_t), KM_NOSLEEP); - if (dmsg_alloc == NULL) { - vmm_debug_rbuf->allocfailed++; - return (dmsg_alloc); - } else { - vmm_debug_rbuf->size += sizeof (vmm_trace_dmsg_t); - } + if (!sysmem) { + vmm_devmem_entry_t *de; + list_t *dl = &sc->vmm_devmem_list; - if (vmm_debug_rbuf->dmsgp != NULL) { - dmsg->next = dmsg_alloc; - vmm_debug_rbuf->dmsgp = dmsg->next; - return (vmm_debug_rbuf->dmsgp); - } else { - /* - * We should only be here if we're initializing - * the ring buffer. - */ - if (vmm_debug_rbuf->dmsgh == NULL) { - vmm_debug_rbuf->dmsgh = dmsg_alloc; - } else { - /* Something is wrong */ - kmem_free(dmsg_alloc, sizeof (vmm_trace_dmsg_t)); - return (NULL); + for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { + if (de->vde_segid == mseg->segid) { + break; + } } - - vmm_debug_rbuf->dmsgp = dmsg_alloc; - return (vmm_debug_rbuf->dmsgp); + if (de != NULL) { + (void) strlcpy(mseg->name, de->vde_name, + sizeof (mseg->name)); + } + } else { + bzero(mseg->name, sizeof (mseg->name)); } + + return (error); } /* - * Free all messages on debug ring buffer. + * The 'devmem' hack: + * + * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments + * in the vm which appear with their own name related to the vm under /dev. + * Since this would be a hassle from an sdev perspective and would require a + * new cdev interface (or complicate the existing one), we choose to implement + * this in a different manner. When 'devmem' mappings are created, an + * identifying off_t is communicated back out to userspace. That off_t, + * residing above the normal guest memory space, can be used to mmap the + * 'devmem' mapping from the already-open vm device. 
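+ *
+ * A hypothetical userspace sequence (the ioctls and structs are those
+ * introduced here; names and values are purely illustrative):
+ *
+ *	struct vm_memseg seg = { .segid = segid, .len = len };
+ *	(void) strlcpy(seg.name, "bootrom", sizeof (seg.name));
+ *	(void) ioctl(vmfd, VM_ALLOC_MEMSEG, &seg);
+ *
+ *	struct vm_devmem_offset vdo = { .segid = segid };
+ *	(void) ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo);
+ *	addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
+ *	    vmfd, vdo.offset);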
*/ -static void -vmm_trace_dmsg_free(void) -{ - vmm_trace_dmsg_t *dmsg_next, *dmsg = vmm_debug_rbuf->dmsgh; - while (dmsg != NULL) { - dmsg_next = dmsg->next; - kmem_free(dmsg, sizeof (vmm_trace_dmsg_t)); +static int +vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) +{ + off_t map_offset; + vmm_devmem_entry_t *entry; + if (list_is_empty(&sc->vmm_devmem_list)) { + map_offset = VM_DEVMEM_START; + } else { + entry = list_tail(&sc->vmm_devmem_list); + map_offset = entry->vde_off + entry->vde_len; + if (map_offset < entry->vde_off) { + /* Do not tolerate overflow */ + return (ERANGE); + } /* - * If we've looped around the ring than we're done. + * XXXJOY: We could choose to search the list for duplicate + * names and toss an error. Since we're using the offset + * method for now, it does not make much of a difference. */ - if (dmsg_next == vmm_debug_rbuf->dmsgh) { - break; - } else { - dmsg = dmsg_next; - } } + + entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); + entry->vde_segid = mseg->segid; + entry->vde_len = mseg->len; + entry->vde_off = map_offset; + (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); + list_insert_tail(&sc->vmm_devmem_list, entry); + + return (0); } -static void -vmm_trace_rbuf_alloc(void) +static boolean_t +vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp) { - vmm_debug_rbuf = kmem_zalloc(sizeof (vmm_trace_rbuf_t), KM_SLEEP); + list_t *dl = &sc->vmm_devmem_list; + vmm_devmem_entry_t *de = NULL; - mutex_init(&vmm_debug_rbuf->lock, NULL, MUTEX_DRIVER, NULL); + VERIFY(off >= VM_DEVMEM_START); - if (vmm_dmsg_ring_size > 0) { - vmm_debug_rbuf->maxsize = vmm_dmsg_ring_size; + for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { + /* XXX: Only hit on direct offset/length matches for now */ + if (de->vde_off == off && de->vde_len == len) { + break; + } + } + if (de == NULL) { + return (B_FALSE); } -} + *segidp = de->vde_segid; + return (B_TRUE); +} static void -vmm_trace_rbuf_free(void) +vmmdev_devmem_purge(vmm_softc_t *sc) { - vmm_trace_dmsg_free(); - mutex_destroy(&vmm_debug_rbuf->lock); - kmem_free(vmm_debug_rbuf, sizeof (vmm_trace_rbuf_t)); + vmm_devmem_entry_t *entry; + + while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { + kmem_free(entry, sizeof (*entry)); + } } -static void -vmm_vtrace_log(const char *fmt, va_list ap) +static int +vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) { - vmm_trace_dmsg_t *dmsg; + int error; + bool sysmem = true; - if (vmm_debug_rbuf == NULL) { - return; + if (VM_MEMSEG_NAME(mseg)) { + sysmem = false; } + error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); - /* - * If max size of ring buffer is smaller than size - * required for one debug message then just return - * since we have no room for the debug message. - */ - if (vmm_debug_rbuf->maxsize < (sizeof (vmm_trace_dmsg_t))) { - return; + if (error == 0 && VM_MEMSEG_NAME(mseg)) { + /* + * Rather than create a whole fresh device from which userspace + * can mmap this segment, instead make it available at an + * offset above where the main guest memory resides. 
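+ * (Offsets are handed out sequentially starting at VM_DEVMEM_START by
+ * vmmdev_devmem_create() above, so each 'devmem' segment lands directly
+ * after its predecessor in that mmap space.)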
+ */ + error = vmmdev_devmem_create(sc, mseg, mseg->name); + if (error != 0) { + vm_free_memseg(sc->vmm_vm, mseg->segid); + } } + return (error); +} - mutex_enter(&vmm_debug_rbuf->lock); - - /* alloc or reuse on ring buffer */ - dmsg = vmm_trace_dmsg_alloc(); +/* + * Resource Locking and Exclusion + * + * Much of bhyve depends on key portions of VM state, such as the guest memory + * map, to remain unchanged while the guest is running. As ported from + * FreeBSD, the initial strategy for this resource exclusion hinged on gating + * access to the instance vCPUs. Threads acting on a single vCPU, like those + * performing the work of actually running the guest in VMX/SVM, would lock + * only that vCPU during ioctl() entry. For ioctls which would change VM-wide + * state, all of the vCPUs would be first locked, ensuring that the + * operation(s) could complete without any other threads stumbling into + * intermediate states. + * + * This approach is largely effective for bhyve. Common operations, such as + * running the vCPUs, steer clear of lock contention. The model begins to + * break down for operations which do not occur in the context of a specific + * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker + * thread in the bhyve process. In order to properly protect those vCPU-less + * operations from encountering invalid states, additional locking is required. + * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. + * It does mean that class of operations will be serialized on locking the + * specific vCPU and that instances sized at VM_MAXCPU will potentially see + * undue contention on the VM_MAXCPU-1 vCPU. + * + * In order to address the shortcomings of this model, the concept of a + * read/write lock has been added to bhyve. Operations which change + * fundamental aspects of a VM (such as the memory map) must acquire the write + * lock, which also implies locking all of the vCPUs and waiting for all read + * lock holders to release. While it increases the cost and waiting time for + * those few operations, it allows most hot-path operations on the VM (which + * depend on its configuration remaining stable) to occur with minimal locking. + * + * Consumers of the Driver API (see below) are a special case when it comes to + * this locking, since they may hold a read lock via the drv_lease mechanism + * for an extended period of time. Rather than forcing those consumers to + * continuously poll for a write lock attempt, the lease system forces them to + * provide a release callback to trigger their clean-up (and potential later + * reacquisition) of the read lock. + */ - if (dmsg == NULL) { - /* resource allocation failed */ - mutex_exit(&vmm_debug_rbuf->lock); - return; - } +static void +vcpu_lock_one(vmm_softc_t *sc, int vcpu) +{ + ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); - gethrestime(&dmsg->timestamp); + /* + * Since this state transition is utilizing from_idle=true, it should + * not fail, but rather block until it can be successful. + */ + VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); +} - (void) vsnprintf(dmsg->buf, sizeof (dmsg->buf), fmt, ap); +static void +vcpu_unlock_one(vmm_softc_t *sc, int vcpu) +{ + ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); - mutex_exit(&vmm_debug_rbuf->lock); + VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); + vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false); } -void -vmm_trace_log(const char *fmt, ...) 
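+/*
+ * Acquire the VM read lock: holders may assume that the fundamental
+ * configuration of the instance (such as its memory map) remains stable
+ * until the lock is dropped.
+ */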
+static void +vmm_read_lock(vmm_softc_t *sc) { - va_list ap; - - va_start(ap, fmt); - vmm_vtrace_log(fmt, ap); - va_end(ap); + rw_enter(&sc->vmm_rwlock, RW_READER); } -void -vmmdev_init(void) +static void +vmm_read_unlock(vmm_softc_t *sc) { - vmm_trace_rbuf_alloc(); + rw_exit(&sc->vmm_rwlock); } -int -vmmdev_cleanup(void) +static void +vmm_write_lock(vmm_softc_t *sc) { - int error; + int maxcpus; - if (SLIST_EMPTY(&head)) - error = 0; - else - error = EBUSY; + /* First lock all the vCPUs */ + maxcpus = vm_get_maxcpus(sc->vmm_vm); + for (int vcpu = 0; vcpu < maxcpus; vcpu++) { + vcpu_lock_one(sc, vcpu); + } - if (error == 0) - vmm_trace_dmsg_free(); + mutex_enter(&sc->vmm_lease_lock); + VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); + sc->vmm_lease_blocker++; + if (sc->vmm_lease_blocker == 1) { + list_t *list = &sc->vmm_lease_list; + vmm_lease_t *lease = list_head(list); - return (error); + while (lease != NULL) { + boolean_t sync_break = B_FALSE; + + if (!lease->vml_expired) { + void *arg = lease->vml_expire_arg; + lease->vml_expired = B_TRUE; + sync_break = lease->vml_expire_func(arg); + } + + if (sync_break) { + vmm_lease_t *next; + + /* + * These leases which are synchronously broken + * result in vmm_read_unlock() calls from a + * different thread than the corresponding + * vmm_read_lock(). This is acceptable, given + * that the rwlock underpinning the whole + * mechanism tolerates the behavior. This + * flexibility is _only_ afforded to VM read + * lock (RW_READER) holders. + */ + next = list_next(list, lease); + vmm_lease_break_locked(sc, lease); + lease = next; + } else { + lease = list_next(list, lease); + } + } + } + mutex_exit(&sc->vmm_lease_lock); + + rw_enter(&sc->vmm_rwlock, RW_WRITER); + /* + * For now, the 'maxcpus' value for an instance is fixed at the + * compile-time constant of VM_MAXCPU at creation. If this changes in + * the future, allowing for dynamic vCPU resource sizing, acquisition + * of the write lock will need to be wary of such changes. + */ + VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); } -int -vmmdev_do_ioctl(struct vmm_softc *sc, int cmd, intptr_t arg, int mode, - cred_t *credp, int *rvalp) +static void +vmm_write_unlock(vmm_softc_t *sc) { - int error, vcpu, state_changed; - struct vm_memory_segment seg; - struct vm_register vmreg; - struct vm_seg_desc vmsegdesc; - struct vm_run vmrun; - struct vm_exception vmexc; - struct vm_lapic_irq vmirq; - struct vm_lapic_msi vmmsi; - struct vm_ioapic_irq ioapic_irq; - struct vm_isa_irq isa_irq; - struct vm_capability vmcap; - struct vm_nmi vmnmi; - struct vm_x2apic x2apic; - struct vm_gla2gpa gg; - struct vm_activate_cpu vac; - int pincount; - int i; - - vcpu = -1; - state_changed = 0; + int maxcpus; + + mutex_enter(&sc->vmm_lease_lock); + VERIFY3U(sc->vmm_lease_blocker, !=, 0); + sc->vmm_lease_blocker--; + if (sc->vmm_lease_blocker == 0) { + cv_broadcast(&sc->vmm_lease_cv); + } + mutex_exit(&sc->vmm_lease_lock); /* - * Some VMM ioctls can operate only on vcpus that are not running. + * The VM write lock _must_ be released from the same thread it was + * acquired in, unlike the read lock. 
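+ * As noted in vmm_write_lock() above, the underlying rwlock affords
+ * cross-thread release flexibility only to its RW_READER holders.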
*/ + VERIFY(rw_write_held(&sc->vmm_rwlock)); + rw_exit(&sc->vmm_rwlock); + + /* Unlock all the vCPUs */ + maxcpus = vm_get_maxcpus(sc->vmm_vm); + for (int vcpu = 0; vcpu < maxcpus; vcpu++) { + vcpu_unlock_one(sc, vcpu); + } +} + +static int +vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, + cred_t *credp, int *rvalp) +{ + int error = 0, vcpu = -1; + void *datap = (void *)arg; + enum vm_lock_type { + LOCK_NONE = 0, + LOCK_VCPU, + LOCK_READ_HOLD, + LOCK_WRITE_HOLD + } lock_type = LOCK_NONE; + + /* Acquire any exclusion resources needed for the operation. */ switch (cmd) { case VM_RUN: case VM_GET_REGISTER: case VM_SET_REGISTER: case VM_GET_SEGMENT_DESCRIPTOR: case VM_SET_SEGMENT_DESCRIPTOR: + case VM_GET_REGISTER_SET: + case VM_SET_REGISTER_SET: case VM_INJECT_EXCEPTION: case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: @@ -267,494 +423,1320 @@ vmmdev_do_ioctl(struct vmm_softc *sc, int cmd, intptr_t arg, int mode, case VM_PPTDEV_MSIX: case VM_SET_X2APIC_STATE: case VM_GLA2GPA: + case VM_GLA2GPA_NOFAULT: case VM_ACTIVATE_CPU: + case VM_SET_INTINFO: + case VM_GET_INTINFO: case VM_RESTART_INSTRUCTION: /* - * XXX fragile, handle with care - * Assumes that the first field of the ioctl data is the vcpu. + * Copy in the ID of the vCPU chosen for this operation. + * Since a nefarious caller could update their struct between + * this locking and when the rest of the ioctl data is copied + * in, it is _critical_ that this local 'vcpu' variable be used + * rather than the in-struct one when performing the ioctl. */ - if (ddi_copyin((void *)arg, &vcpu, sizeof (vcpu), mode)) { + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { return (EFAULT); } - if (vcpu < 0 || vcpu >= VM_MAXCPU) { - error = EINVAL; - goto done; + if (vcpu < 0 || vcpu > vm_get_maxcpus(sc->vmm_vm)) { + return (EINVAL); } - - error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); - if (error) - goto done; - - state_changed = 1; + vcpu_lock_one(sc, vcpu); + lock_type = LOCK_VCPU; break; - case VM_MAP_MEMORY: - /* - * ioctls that operate on the entire virtual machine must - * prevent all vcpus from running. - */ - error = 0; - for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) { - error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true); - if (error) - break; - } - if (error) { - while (--vcpu >= 0) - vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); - goto done; - } + case VM_REINIT: + case VM_BIND_PPTDEV: + case VM_UNBIND_PPTDEV: + case VM_MAP_PPTDEV_MMIO: + case VM_ALLOC_MEMSEG: + case VM_MMAP_MEMSEG: + case VM_WRLOCK_CYCLE: + vmm_write_lock(sc); + lock_type = LOCK_WRITE_HOLD; + break; - state_changed = 2; + case VM_GET_GPA_PMAP: + case VM_GET_MEMSEG: + case VM_MMAP_GETNEXT: + case VM_LAPIC_IRQ: + case VM_INJECT_NMI: + case VM_IOAPIC_ASSERT_IRQ: + case VM_IOAPIC_DEASSERT_IRQ: + case VM_IOAPIC_PULSE_IRQ: + case VM_LAPIC_MSI: + case VM_LAPIC_LOCAL_IRQ: + case VM_GET_X2APIC_STATE: + case VM_RTC_READ: + case VM_RTC_WRITE: + case VM_RTC_SETTIME: + case VM_RTC_GETTIME: +#ifndef __FreeBSD__ + case VM_DEVMEM_GETOFFSET: +#endif + vmm_read_lock(sc); + lock_type = LOCK_READ_HOLD; break; + case VM_IOAPIC_PINCOUNT: default: break; } - switch(cmd) { - case VM_RUN: - if (ddi_copyin((void *)arg, &vmrun, - sizeof (struct vm_run), mode)) { - return (EFAULT); + /* Execute the primary logic for the ioctl. 
*/ + switch (cmd) { + case VM_RUN: { + struct vm_run vmrun; + + if (ddi_copyin(datap, &vmrun, sizeof (vmrun), md)) { + error = EFAULT; + break; } - error = vm_run(sc->vm, &vmrun); - if (ddi_copyout(&vmrun, (void *)arg, - sizeof (struct vm_run), mode)) { - return (EFAULT); + vmrun.cpuid = vcpu; + + if (!(curthread->t_schedflag & TS_VCPU)) + smt_mark_as_vcpu(); + + error = vm_run(sc->vmm_vm, &vmrun); + /* + * XXXJOY: I think it's necessary to do copyout, even in the + * face of errors, since the exit state is communicated out. + */ + if (ddi_copyout(&vmrun, datap, sizeof (vmrun), md)) { + error = EFAULT; + break; } break; - case VM_LAPIC_IRQ: - if (ddi_copyin((void *)arg, &vmirq, - sizeof (struct vm_lapic_irq), mode)) { - return (EFAULT); - } - error = lapic_intr_edge(sc->vm, vmirq.cpuid, vmirq.vector); - if (ddi_copyout(&vmirq, (void *)arg, - sizeof (struct vm_lapic_irq), mode)) { - return (EFAULT); + } + case VM_SUSPEND: { + struct vm_suspend vmsuspend; + + if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) { + error = EFAULT; + break; } + error = vm_suspend(sc->vmm_vm, vmsuspend.how); break; - case VM_LAPIC_LOCAL_IRQ: - if (ddi_copyin((void *)arg, &vmirq, - sizeof (struct vm_lapic_irq), mode)) { - return (EFAULT); - } - error = lapic_set_local_intr(sc->vm, vmirq.cpuid, - vmirq.vector); - if (ddi_copyout(&vmirq, (void *)arg, - sizeof (struct vm_lapic_irq), mode)) { - return (EFAULT); + } + case VM_REINIT: + if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) { + /* + * The VM instance should be free of driver-attached + * hooks during the reinitialization process. + */ + break; } + error = vm_reinit(sc->vmm_vm); + (void) vmm_drv_block_hook(sc, B_FALSE); break; - case VM_LAPIC_MSI: - if (ddi_copyin((void *)arg, &vmmsi, - sizeof (struct vm_lapic_msi), mode)) { - return (EFAULT); - } - error = lapic_intr_msi(sc->vm, vmmsi.addr, vmmsi.msg); - if (ddi_copyout(&vmmsi, (void *)arg, - sizeof (struct vm_lapic_msi), mode)) { - return (EFAULT); - } - case VM_IOAPIC_ASSERT_IRQ: - if (ddi_copyin((void *)arg, &ioapic_irq, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + case VM_STAT_DESC: { + struct vm_stat_desc statdesc; + + if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) { + error = EFAULT; + break; } - error = vioapic_assert_irq(sc->vm, ioapic_irq.irq);; - if (ddi_copyout(&ioapic_irq, (void *)arg, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + error = vmm_stat_desc_copy(statdesc.index, statdesc.desc, + sizeof (statdesc.desc)); + if (error == 0 && + ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) { + error = EFAULT; + break; } break; - case VM_IOAPIC_DEASSERT_IRQ: - if (ddi_copyin((void *)arg, &ioapic_irq, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + } + case VM_STATS_IOC: { + struct vm_stats vmstats; + + CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS); + if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) { + error = EFAULT; + break; } - error = vioapic_deassert_irq(sc->vm, ioapic_irq.irq); - if (ddi_copyout(&ioapic_irq, (void *)arg, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + hrt2tv(gethrtime(), &vmstats.tv); + error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid, + &vmstats.num_entries, vmstats.statbuf); + if (error == 0 && + ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) { + error = EFAULT; + break; } break; - case VM_IOAPIC_PULSE_IRQ: - if (ddi_copyin((void *)arg, &ioapic_irq, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + } + + /* XXXJOY: punt on these for now */ + case VM_PPTDEV_MSI: { + 
struct vm_pptdev_msi pptmsi; + + if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) { + error = EFAULT; + break; } - error = vioapic_pulse_irq(sc->vm, ioapic_irq.irq); - if (ddi_copyout(&ioapic_irq, (void *)arg, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + return (ENOTTY); + } + case VM_PPTDEV_MSIX: { + struct vm_pptdev_msix pptmsix; + + if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) { + error = EFAULT; + break; } - break; - case VM_IOAPIC_PINCOUNT: - error = 0; - pincount = vioapic_pincount(sc->vm); - if (ddi_copyout(&pincount, (void *)arg, sizeof (int), mode)) { - return (EFAULT); + return (ENOTTY); + } + case VM_MAP_PPTDEV_MMIO: { + struct vm_pptdev_mmio pptmmio; + + if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) { + error = EFAULT; + break; } - break; - case VM_ISA_ASSERT_IRQ: - if (ddi_copyin((void *)arg, &isa_irq, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); + return (ENOTTY); + } + case VM_BIND_PPTDEV: + case VM_UNBIND_PPTDEV: { + struct vm_pptdev pptdev; + + if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) { + error = EFAULT; + break; } - error = vatpic_assert_irq(sc->vm, isa_irq.atpic_irq); - if (error == 0 && isa_irq.ioapic_irq != -1) - error = vioapic_assert_irq(sc->vm, - isa_irq.ioapic_irq); - if (ddi_copyout(&isa_irq, (void *)arg, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); - + return (ENOTTY); + } + + case VM_INJECT_EXCEPTION: { + struct vm_exception vmexc; + + if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) { + error = EFAULT; + break; } + error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector, + vmexc.error_code_valid, vmexc.error_code, + vmexc.restart_instruction); break; - case VM_ISA_DEASSERT_IRQ: - if (ddi_copyin((void *)arg, &isa_irq, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); - } - error = vatpic_deassert_irq(sc->vm, isa_irq.atpic_irq); - if (error == 0 && isa_irq.ioapic_irq != -1) - error = vioapic_deassert_irq(sc->vm, - isa_irq.ioapic_irq); - if (ddi_copyout(&isa_irq, (void *)arg, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); - + } + case VM_INJECT_NMI: { + struct vm_nmi vmnmi; + + if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) { + error = EFAULT; + break; } + error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid); break; - case VM_ISA_PULSE_IRQ: - if (ddi_copyin((void *)arg, &isa_irq, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); - } - error = vatpic_pulse_irq(sc->vm, isa_irq.atpic_irq); - if (error == 0 && isa_irq.ioapic_irq != -1) - error = vioapic_pulse_irq(sc->vm, isa_irq.ioapic_irq); - if (ddi_copyout(&isa_irq, (void *)arg, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); - + } + case VM_LAPIC_IRQ: { + struct vm_lapic_irq vmirq; + + if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { + error = EFAULT; + break; } + error = lapic_intr_edge(sc->vmm_vm, vmirq.cpuid, vmirq.vector); break; - case VM_MAP_MEMORY: - if (ddi_copyin((void *)arg, &seg, - sizeof (struct vm_memory_segment), mode)) { - return (EFAULT); + } + case VM_LAPIC_LOCAL_IRQ: { + struct vm_lapic_irq vmirq; + + if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { + error = EFAULT; + break; } - error = vm_malloc(sc->vm, seg.gpa, seg.len); + error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, + vmirq.vector); break; - case VM_GET_MEMORY_SEG: - if (ddi_copyin((void *)arg, &seg, - sizeof (struct vm_memory_segment), mode)) { - return (EFAULT); - } - seg.len = 0; - (void)vm_gpabase2memseg(sc->vm, seg.gpa, &seg); - if (ddi_copyout(&seg, (void *)arg, - sizeof (struct vm_memory_segment), 
mode)) { - return (EFAULT); + } + case VM_LAPIC_MSI: { + struct vm_lapic_msi vmmsi; + + if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { + error = EFAULT; + break; } - error = 0; + error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); break; - case VM_GET_REGISTER: - if (ddi_copyin((void *)arg, &vmreg, - sizeof (struct vm_register), mode)) { - return (EFAULT); - } - error = vm_get_register(sc->vm, vmreg.cpuid, vmreg.regnum, - &vmreg.regval); - if (!error) { - if (ddi_copyout(&vmreg, (void *)arg, - sizeof (struct vm_register), mode)) { - return (EFAULT); - } + } + + case VM_IOAPIC_ASSERT_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; } + error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); break; - case VM_SET_REGISTER: - if (ddi_copyin((void *)arg, &vmreg, - sizeof (struct vm_register), mode)) { - return (EFAULT); + } + case VM_IOAPIC_DEASSERT_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; } - error = vm_set_register(sc->vm, vmreg.cpuid, vmreg.regnum, - vmreg.regval); + error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); break; - case VM_SET_SEGMENT_DESCRIPTOR: - if (ddi_copyin((void *)arg, &vmsegdesc, - sizeof (struct vm_seg_desc), mode)) { - return (EFAULT); + } + case VM_IOAPIC_PULSE_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; } - error = vm_set_seg_desc(sc->vm, vmsegdesc.cpuid, - vmsegdesc.regnum, - &vmsegdesc.desc); + error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); break; - case VM_GET_SEGMENT_DESCRIPTOR: - if (ddi_copyin((void *)arg, &vmsegdesc, - sizeof (struct vm_seg_desc), mode)) { - return (EFAULT); + } + case VM_IOAPIC_PINCOUNT: { + int pincount; + + pincount = vioapic_pincount(sc->vmm_vm); + if (ddi_copyout(&pincount, datap, sizeof (int), md)) { + error = EFAULT; + break; } - error = vm_get_seg_desc(sc->vm, vmsegdesc.cpuid, - vmsegdesc.regnum, - &vmsegdesc.desc); - if (!error) { - if (ddi_copyout(&vmsegdesc, (void *)arg, - sizeof (struct vm_seg_desc), mode)) { - return (EFAULT); + break; + } + + case VM_ISA_ASSERT_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; + } + error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_assert_irq(sc->vmm_vm, + isa_irq.ioapic_irq); + } + break; + } + case VM_ISA_DEASSERT_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; + } + error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_deassert_irq(sc->vmm_vm, + isa_irq.ioapic_irq); + } + break; + } + case VM_ISA_PULSE_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; + } + error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_pulse_irq(sc->vmm_vm, + isa_irq.ioapic_irq); + } + break; + } + case VM_ISA_SET_IRQ_TRIGGER: { + struct vm_isa_irq_trigger isa_irq_trigger; + + if (ddi_copyin(datap, &isa_irq_trigger, + sizeof (isa_irq_trigger), md)) { + error = EFAULT; + break; + } + error = vatpic_set_irq_trigger(sc->vmm_vm, + isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); + break; + } + 
+ case VM_MMAP_GETNEXT: { + struct vm_memmap mm; + + if (ddi_copyin(datap, &mm, sizeof (mm), md)) { + error = EFAULT; + break; + } + error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, + &mm.segoff, &mm.len, &mm.prot, &mm.flags); + if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { + error = EFAULT; + break; + } + break; + } + case VM_MMAP_MEMSEG: { + struct vm_memmap mm; + + if (ddi_copyin(datap, &mm, sizeof (mm), md)) { + error = EFAULT; + break; + } + error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, + mm.len, mm.prot, mm.flags); + break; + } + case VM_ALLOC_MEMSEG: { + struct vm_memseg vmseg; + + if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { + error = EFAULT; + break; + } + error = vmmdev_alloc_memseg(sc, &vmseg); + break; + } + case VM_GET_MEMSEG: { + struct vm_memseg vmseg; + + if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { + error = EFAULT; + break; + } + error = vmmdev_get_memseg(sc, &vmseg); + if (error == 0 && + ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_REGISTER: { + struct vm_register vmreg; + + if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { + error = EFAULT; + break; + } + error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum, + &vmreg.regval); + if (error == 0 && + ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) { + error = EFAULT; + break; + } + break; + } + case VM_SET_REGISTER: { + struct vm_register vmreg; + + if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { + error = EFAULT; + break; + } + error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum, + vmreg.regval); + break; + } + case VM_SET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc vmsegd; + + if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { + error = EFAULT; + break; + } + error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, + &vmsegd.desc); + break; + } + case VM_GET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc vmsegd; + + if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { + error = EFAULT; + break; + } + error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, + &vmsegd.desc); + if (error == 0 && + ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_REGISTER_SET: { + struct vm_register_set vrs; + int regnums[VM_REG_LAST]; + uint64_t regvals[VM_REG_LAST]; + + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { + error = EFAULT; + break; + } + if (vrs.count > VM_REG_LAST || vrs.count == 0) { + error = EINVAL; + break; + } + if (ddi_copyin(vrs.regnums, regnums, + sizeof (int) * vrs.count, md)) { + error = EFAULT; + break; + } + + error = 0; + for (uint_t i = 0; i < vrs.count && error == 0; i++) { + if (regnums[i] < 0) { + error = EINVAL; + break; } + error = vm_get_register(sc->vmm_vm, vcpu, regnums[i], + ®vals[i]); + } + if (error == 0 && ddi_copyout(regvals, vrs.regvals, + sizeof (uint64_t) * vrs.count, md)) { + error = EFAULT; } break; - case VM_GET_CAPABILITY: - if (ddi_copyin((void *)arg, &vmcap, - sizeof (struct vm_capability), mode)) { - return (EFAULT); + } + case VM_SET_REGISTER_SET: { + struct vm_register_set vrs; + int regnums[VM_REG_LAST]; + uint64_t regvals[VM_REG_LAST]; + + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { + error = EFAULT; + break; } - error = vm_get_capability(sc->vm, vmcap.cpuid, - vmcap.captype, - &vmcap.capval); - if (!error) { - if (ddi_copyout(&vmcap, (void *)arg, - sizeof (struct vm_capability), mode)) { - return (EFAULT); + if (vrs.count > VM_REG_LAST || vrs.count == 0) { + error = EINVAL; + break; + } 
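+ /*
+ * With vrs.count bounded to VM_REG_LAST, the copyin below fits
+ * within the fixed-size regnums/regvals arrays declared above.
+ */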
+ if (ddi_copyin(vrs.regnums, regnums, + sizeof (int) * vrs.count, md)) { + error = EFAULT; + break; + } + if (ddi_copyin(vrs.regvals, regvals, + sizeof (uint64_t) * vrs.count, md)) { + error = EFAULT; + break; + } + + error = 0; + for (uint_t i = 0; i < vrs.count && error == 0; i++) { + /* + * Setting registers in a set is not atomic, since a + * failure in the middle of the set will cause a + * bail-out and inconsistent register state. Callers + * should be wary of this. + */ + if (regnums[i] < 0) { + error = EINVAL; + break; } + error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], + regvals[i]); } break; - case VM_SET_CAPABILITY: - if (ddi_copyin((void *)arg, &vmcap, - sizeof (struct vm_capability), mode)) { - return (EFAULT); + } + + case VM_GET_CAPABILITY: { + struct vm_capability vmcap; + + if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { + error = EFAULT; + break; + } + error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, + &vmcap.capval); + if (error == 0 && + ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { + error = EFAULT; + break; } - error = vm_set_capability(sc->vm, vmcap.cpuid, - vmcap.captype, - vmcap.capval); break; - case VM_SET_X2APIC_STATE: - if (ddi_copyin((void *)arg, &x2apic, - sizeof (struct vm_x2apic), mode)) { - return (EFAULT); + } + case VM_SET_CAPABILITY: { + struct vm_capability vmcap; + + if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { + error = EFAULT; + break; } - error = vm_set_x2apic_state(sc->vm, - x2apic.cpuid, x2apic.state); + error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, + vmcap.capval); break; - case VM_GET_X2APIC_STATE: - if (ddi_copyin((void *)arg, &x2apic, - sizeof (struct vm_x2apic), mode)) { - return (EFAULT); + } + case VM_SET_X2APIC_STATE: { + struct vm_x2apic x2apic; + + if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { + error = EFAULT; + break; } - error = vm_get_x2apic_state(sc->vm, - x2apic.cpuid, &x2apic.state); - if (!error) { - if (ddi_copyout(&x2apic, (void *)arg, - sizeof (struct vm_x2apic), mode)) { - return (EFAULT); - } + error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); + break; + } + case VM_GET_X2APIC_STATE: { + struct vm_x2apic x2apic; + + if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { + error = EFAULT; + break; + } + error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, + &x2apic.state); + if (error == 0 && + ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_GPA_PMAP: { + struct vm_gpa_pte gpapte; + + if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) { + error = EFAULT; + break; + } +#ifdef __FreeBSD__ + /* XXXJOY: add function? 
*/ + pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)), + gpapte.gpa, gpapte.pte, &gpapte.ptenum); +#endif + error = 0; + break; + } + case VM_GET_HPET_CAPABILITIES: { + struct vm_hpet_cap hpetcap; + + error = vhpet_getcap(&hpetcap); + if (error == 0 && + ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { + error = EFAULT; + break; } break; + } case VM_GLA2GPA: { + struct vm_gla2gpa gg; + CTASSERT(PROT_READ == VM_PROT_READ); CTASSERT(PROT_WRITE == VM_PROT_WRITE); CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); - if (ddi_copyin((void *)arg, &gg, - sizeof (struct vm_gla2gpa), mode)) { - return (EFAULT); + + if (ddi_copyin(datap, &gg, sizeof (gg), md)) { + error = EFAULT; + break; } - error = vm_gla2gpa(sc->vm, gg.vcpuid, &gg.paging, gg.gla, - gg.prot, &gg.gpa); - KASSERT(error == 0 || error == 1 || error == -1, - ("%s: vm_gla2gpa unknown error %d", __func__, error)); - if (error >= 0) { - /* - * error = 0: the translation was successful - * error = 1: a fault was injected into the guest - */ - gg.fault = error; - error = 0; - if (ddi_copyout(&gg, (void *)arg, - sizeof (struct vm_gla2gpa), mode)) { - return (EFAULT); - } + gg.vcpuid = vcpu; + error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, + gg.prot, &gg.gpa, &gg.fault); + if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GLA2GPA_NOFAULT: { + struct vm_gla2gpa gg; + + CTASSERT(PROT_READ == VM_PROT_READ); + CTASSERT(PROT_WRITE == VM_PROT_WRITE); + CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); + + if (ddi_copyin(datap, &gg, sizeof (gg), md)) { + error = EFAULT; + break; + } + gg.vcpuid = vcpu; + error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, + gg.gla, gg.prot, &gg.gpa, &gg.fault); + if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { + error = EFAULT; + break; + } + break; + } + + case VM_ACTIVATE_CPU: + error = vm_activate_cpu(sc->vmm_vm, vcpu); + break; + + case VM_SUSPEND_CPU: + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { + error = EFAULT; + } else { + error = vm_suspend_cpu(sc->vmm_vm, vcpu); + } + break; + + case VM_RESUME_CPU: + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { + error = EFAULT; + } else { + error = vm_resume_cpu(sc->vmm_vm, vcpu); + } + break; + + case VM_GET_CPUS: { + struct vm_cpuset vm_cpuset; + cpuset_t tempset; + void *srcp = &tempset; + int size; + + if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { + error = EFAULT; + break; + } + + /* Be more generous about sizing since our cpuset_t is large. */ + size = vm_cpuset.cpusetsize; + if (size <= 0 || size > sizeof (cpuset_t)) { + error = ERANGE; + } + /* + * If they want a ulong_t or less, make sure they receive the + * low bits with all the useful information. 
+ */ + if (size <= sizeof (tempset.cpub[0])) { + srcp = &tempset.cpub[0]; + } + + if (vm_cpuset.which == VM_ACTIVE_CPUS) { + tempset = vm_active_cpus(sc->vmm_vm); + } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { + tempset = vm_suspended_cpus(sc->vmm_vm); + } else if (vm_cpuset.which == VM_DEBUG_CPUS) { + tempset = vm_debug_cpus(sc->vmm_vm); } else { + error = EINVAL; + } + + ASSERT(size > 0 && size <= sizeof (tempset)); + if (error == 0 && + ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { + error = EFAULT; + break; + } + break; + } + case VM_SET_INTINFO: { + struct vm_intinfo vmii; + + if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { + error = EFAULT; + break; + } + error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); + break; + } + case VM_GET_INTINFO: { + struct vm_intinfo vmii; + + vmii.vcpuid = vcpu; + error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, + &vmii.info2); + if (error == 0 && + ddi_copyout(&vmii, datap, sizeof (vmii), md)) { + error = EFAULT; + break; + } + break; + } + case VM_RTC_WRITE: { + struct vm_rtc_data rtcdata; + + if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, + rtcdata.value); + break; + } + case VM_RTC_READ: { + struct vm_rtc_data rtcdata; + + if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, + &rtcdata.value); + if (error == 0 && + ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + break; + } + case VM_RTC_SETTIME: { + struct vm_rtc_time rtctime; + + if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) { + error = EFAULT; + break; + } + error = vrtc_set_time(sc->vmm_vm, rtctime.secs); + break; + } + case VM_RTC_GETTIME: { + struct vm_rtc_time rtctime; + + rtctime.secs = vrtc_get_time(sc->vmm_vm); + if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) { + error = EFAULT; + break; + } + break; + } + + case VM_RESTART_INSTRUCTION: + error = vm_restart_instruction(sc->vmm_vm, vcpu); + break; + + case VM_SET_TOPOLOGY: { + struct vm_cpu_topology topo; + + if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { + error = EFAULT; + break; + } + error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, + topo.threads, topo.maxcpus); + break; + } + case VM_GET_TOPOLOGY: { + struct vm_cpu_topology topo; + + vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, + &topo.threads, &topo.maxcpus); + if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { + error = EFAULT; + break; + } + break; + } + +#ifndef __FreeBSD__ + case VM_DEVMEM_GETOFFSET: { + struct vm_devmem_offset vdo; + list_t *dl = &sc->vmm_devmem_list; + vmm_devmem_entry_t *de = NULL; + + if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { error = EFAULT; + break; + } + + for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { + if (de->vde_segid == vdo.segid) { + break; + } + } + if (de != NULL) { + vdo.offset = de->vde_off; + if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { + error = EFAULT; + } + } else { + error = ENOENT; } break; } - case VM_ACTIVATE_CPU: - if (ddi_copyin((void *)arg, &vac, - sizeof (struct vm_activate_cpu), mode)) { - return (EFAULT); - } - error = vm_activate_cpu(sc->vm, vac.vcpuid); - break; - case VM_RESTART_INSTRUCTION: - error = vm_restart_instruction(sc->vm, vcpu); - break; - default: - error = ENOTTY; - break; + case VM_WRLOCK_CYCLE: { + /* + * Present a test mechanism to acquire/release the write lock + * on the VM without any 
+ other effects.
+ */
+ break;
+ }
+#endif
+ default:
+ error = ENOTTY;
+ break;
+ }
+
+ /* Release exclusion resources */
+ switch (lock_type) {
+ case LOCK_NONE:
+ break;
+ case LOCK_VCPU:
+ vcpu_unlock_one(sc, vcpu);
+ break;
+ case LOCK_READ_HOLD:
+ vmm_read_unlock(sc);
+ break;
+ case LOCK_WRITE_HOLD:
+ vmm_write_unlock(sc);
+ break;
+ default:
+ panic("unexpected lock type");
+ break;
+ }
+
+ return (error);
+}
+
+static vmm_softc_t *
+vmm_lookup(const char *name)
+{
+ list_t *vml = &vmm_list;
+ vmm_softc_t *sc;
+
+ ASSERT(MUTEX_HELD(&vmm_mtx));
+
+ for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
+ if (strcmp(sc->vmm_name, name) == 0) {
+ break;
+ }
+ }
+
+ return (sc);
+}
+
+static int
+vmmdev_do_vm_create(char *name, cred_t *cr)
+{
+ vmm_softc_t *sc = NULL;
+ minor_t minor;
+ int error = ENOMEM;
+
+ if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
+ return (EINVAL);
+ }
+
+ mutex_enter(&vmm_mtx);
+
+ /* Look for duplicate names */
+ if (vmm_lookup(name) != NULL) {
+ mutex_exit(&vmm_mtx);
+ return (EEXIST);
+ }
+
+ /* Allow only one instance per non-global zone. */
+ if (!INGLOBALZONE(curproc)) {
+ for (sc = list_head(&vmm_list); sc != NULL;
+ sc = list_next(&vmm_list, sc)) {
+ if (sc->vmm_zone == curzone) {
+ mutex_exit(&vmm_mtx);
+ return (EINVAL);
+ }
+ }
+ }
+
+ minor = id_alloc(vmm_minors);
+ if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
+ goto fail;
+ } else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
+ ddi_soft_state_free(vmm_statep, minor);
+ goto fail;
+ } else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
+ DDI_PSEUDO, 0) != DDI_SUCCESS) {
+ goto fail;
+ }
+
+ error = vm_create(name, &sc->vmm_vm);
+ if (error == 0) {
+ /* Complete VM initialization and report success. */
+ (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name));
+ sc->vmm_minor = minor;
+ list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t),
+ offsetof(vmm_devmem_entry_t, vde_node));
+
+ list_create(&sc->vmm_holds, sizeof (vmm_hold_t),
+ offsetof(vmm_hold_t, vmh_node));
+ cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL);
+
+ mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL);
+ list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t),
+ offsetof(vmm_lease_t, vml_node));
+ cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL);
+ rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL);
+
+ sc->vmm_zone = crgetzone(cr);
+ zone_hold(sc->vmm_zone);
+ vmm_zsd_add_vm(sc);
+
+ list_insert_tail(&vmm_list, sc);
+ mutex_exit(&vmm_mtx);
+ return (0);
+ }
+
+ ddi_remove_minor_node(vmmdev_dip, name);
+fail:
+ id_free(vmm_minors, minor);
+ if (sc != NULL) {
+ ddi_soft_state_free(vmm_statep, minor);
+ }
+ mutex_exit(&vmm_mtx);
+
+ return (error);
+}
+
+/*
+ * Bhyve 'Driver' Interface
+ *
+ * While many devices are emulated in the bhyve userspace process, there are
+ * others with performance constraints which require that they run mostly or
+ * entirely in-kernel. For those not integrated directly into bhyve, an API is
+ * needed so they can query/manipulate the portions of VM state needed to
+ * fulfill their purpose.
+ *
+ * This includes:
+ * - Translating guest-physical addresses to host-virtual pointers
+ * - Injecting MSIs
+ * - Hooking IO port addresses
+ *
+ * The vmm_drv interface exists to provide that functionality to its consumers.
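+ *
+ * A consumer is expected to bracket its use of a VM with a hold, and to
+ * touch VM state only under an active lease; abbreviated and purely
+ * illustrative (expire_cb, cb_arg, gpa and sz are placeholders):
+ *
+ *	vmm_hold_t *hold;
+ *	vmm_lease_t *lease;
+ *
+ *	if (vmm_drv_hold(fp, cr, &hold) == 0) {
+ *		lease = vmm_drv_lease_sign(hold, expire_cb, cb_arg);
+ *		kva = vmm_drv_gpa2kva(lease, gpa, sz);
+ *		...
+ *		vmm_drv_lease_break(hold, lease);
+ *		vmm_drv_rele(hold);
+ *	}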
+ * (At this time, 'viona' is the only user) + */ +int +vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) +{ + vnode_t *vp = fp->f_vnode; + const dev_t dev = vp->v_rdev; + vmm_softc_t *sc; + vmm_hold_t *hold; + int err = 0; + + if (vp->v_type != VCHR) { + return (ENXIO); + } + const major_t major = getmajor(dev); + const minor_t minor = getminor(dev); + + mutex_enter(&vmmdev_mtx); + if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { + mutex_exit(&vmmdev_mtx); + return (ENOENT); } + mutex_enter(&vmm_mtx); + mutex_exit(&vmmdev_mtx); - if (state_changed == 1) { - vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); - } else if (state_changed == 2) { - for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) - vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false); + if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { + err = ENOENT; + goto out; } + /* XXXJOY: check cred permissions against instance */ -done: - /* Make sure that no handler returns a bogus value like ERESTART */ - KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error)); - return (error); + if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) { + err = EBUSY; + goto out; + } + + hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); + hold->vmh_sc = sc; + hold->vmh_release_req = B_FALSE; + + list_insert_tail(&sc->vmm_holds, hold); + sc->vmm_flags |= VMM_HELD; + *holdp = hold; + +out: + mutex_exit(&vmm_mtx); + return (err); } -static -minor_t vmm_find_free_minor(void) +void +vmm_drv_rele(vmm_hold_t *hold) { - minor_t minor; - - for (minor = 1; ; minor++) { - if (ddi_get_soft_state(vmm_statep, minor) == NULL) - break; + vmm_softc_t *sc; + + ASSERT(hold != NULL); + ASSERT(hold->vmh_sc != NULL); + VERIFY(hold->vmh_ioport_hook_cnt == 0); + + mutex_enter(&vmm_mtx); + sc = hold->vmh_sc; + list_remove(&sc->vmm_holds, hold); + if (list_is_empty(&sc->vmm_holds)) { + sc->vmm_flags &= ~VMM_HELD; + cv_broadcast(&sc->vmm_cv); } + mutex_exit(&vmm_mtx); + kmem_free(hold, sizeof (*hold)); +} + +boolean_t +vmm_drv_release_reqd(vmm_hold_t *hold) +{ + ASSERT(hold != NULL); - return (minor); + return (hold->vmh_release_req); } -int -vmmdev_do_vm_create(dev_info_t *dip, char *name) +vmm_lease_t * +vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t (*expiref)(void *), void *arg) { - struct vmm_softc *sc; - minor_t minor; - int error; + vmm_softc_t *sc = hold->vmh_sc; + vmm_lease_t *lease; - mutex_enter(&vmmdev_mtx); + ASSERT3P(expiref, !=, NULL); - if (strlen(name) >= VM_MAX_NAMELEN) { - mutex_exit(&vmmdev_mtx); - return (EINVAL); + if (hold->vmh_release_req) { + return (NULL); } - minor = vmm_find_free_minor(); - if (ddi_soft_state_zalloc(vmm_statep, minor) == DDI_FAILURE) { - mutex_exit(&vmmdev_mtx); - return (DDI_FAILURE); + lease = kmem_alloc(sizeof (*lease), KM_SLEEP); + list_link_init(&lease->vml_node); + lease->vml_expire_func = expiref; + lease->vml_expire_arg = arg; + lease->vml_expired = B_FALSE; + lease->vml_hold = hold; + /* cache the VM pointer for one less pointer chase */ + lease->vml_vm = sc->vmm_vm; + + mutex_enter(&sc->vmm_lease_lock); + while (sc->vmm_lease_blocker != 0) { + cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); } + list_insert_tail(&sc->vmm_lease_list, lease); + vmm_read_lock(sc); + mutex_exit(&sc->vmm_lease_lock); - if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { - ddi_soft_state_free(vmm_statep, minor); - mutex_exit(&vmmdev_mtx); - return (DDI_FAILURE); - } - strcpy(sc->name, name); - sc->minor = minor; + return (lease); +} - if (ddi_create_minor_node(dip, name, S_IFCHR, minor, - DDI_PSEUDO, 
0) == DDI_FAILURE) { - ddi_soft_state_free(vmm_statep, minor); - mutex_exit(&vmmdev_mtx); - return (DDI_FAILURE); - } +static void +vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) +{ + ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); - error = vm_create(name, &sc->vm); - if (error != 0) { - ddi_soft_state_free(vmm_statep, minor); - ddi_remove_minor_node(dip, name); - mutex_exit(&vmmdev_mtx); - return (error); + list_remove(&sc->vmm_lease_list, lease); + vmm_read_unlock(sc); + kmem_free(lease, sizeof (*lease)); +} + +void +vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) +{ + vmm_softc_t *sc = hold->vmh_sc; + + VERIFY3P(hold, ==, lease->vml_hold); + + mutex_enter(&sc->vmm_lease_lock); + vmm_lease_break_locked(sc, lease); + mutex_exit(&sc->vmm_lease_lock); +} + +boolean_t +vmm_drv_lease_expired(vmm_lease_t *lease) +{ + return (lease->vml_expired); +} + +void * +vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz) +{ + ASSERT(lease != NULL); + + return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz)); +} + +int +vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) +{ + ASSERT(lease != NULL); + + return (lapic_intr_msi(lease->vml_vm, addr, msg)); +} + +int +vmm_drv_ioport_hook(vmm_hold_t *hold, uint_t ioport, vmm_drv_rmem_cb_t rfunc, + vmm_drv_wmem_cb_t wfunc, void *arg, void **cookie) +{ + vmm_softc_t *sc; + int err; + + ASSERT(hold != NULL); + ASSERT(cookie != NULL); + + sc = hold->vmh_sc; + mutex_enter(&vmm_mtx); + /* Confirm that hook installation is not blocked */ + if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { + mutex_exit(&vmm_mtx); + return (EBUSY); + } + /* + * Optimistically record an installed hook which will prevent a block + * from being asserted while the mutex is dropped. + */ + hold->vmh_ioport_hook_cnt++; + mutex_exit(&vmm_mtx); + + vmm_write_lock(sc); + err = vm_ioport_hook(sc->vmm_vm, ioport, (vmm_rmem_cb_t)rfunc, + (vmm_wmem_cb_t)wfunc, arg, cookie); + vmm_write_unlock(sc); + + if (err != 0) { + mutex_enter(&vmm_mtx); + /* Walk back optimism about the hook installation */ + hold->vmh_ioport_hook_cnt--; + mutex_exit(&vmm_mtx); } - SLIST_INSERT_HEAD(&head, sc, link); + return (err); +} - mutex_exit(&vmmdev_mtx); +void +vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) +{ + vmm_softc_t *sc; - return (0); + ASSERT(hold != NULL); + ASSERT(cookie != NULL); + ASSERT(hold->vmh_ioport_hook_cnt != 0); + + sc = hold->vmh_sc; + vmm_write_lock(sc); + vm_ioport_unhook(sc->vmm_vm, cookie); + vmm_write_unlock(sc); + + mutex_enter(&vmm_mtx); + hold->vmh_ioport_hook_cnt--; + mutex_exit(&vmm_mtx); } -static struct vmm_softc * -vmm_lookup(char *name) +static int +vmm_drv_purge(vmm_softc_t *sc) { - struct vmm_softc *sc; + ASSERT(MUTEX_HELD(&vmm_mtx)); - SLIST_FOREACH(sc, &head, link) { - if (strcmp(sc->name, name) == 0) { - break; + if ((sc->vmm_flags & VMM_HELD) != 0) { + vmm_hold_t *hold; + + sc->vmm_flags |= VMM_CLEANUP; + for (hold = list_head(&sc->vmm_holds); hold != NULL; + hold = list_next(&sc->vmm_holds, hold)) { + hold->vmh_release_req = B_TRUE; } + while ((sc->vmm_flags & VMM_HELD) != 0) { + if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { + return (EINTR); + } + } + sc->vmm_flags &= ~VMM_CLEANUP; } - return (sc); - + VERIFY(list_is_empty(&sc->vmm_holds)); + sc->vmm_flags |= VMM_PURGED; + return (0); } -struct vm * -vm_lookup_by_name(char *name) +static int +vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block) { - struct vmm_softc *sc; + int err = 0; - mutex_enter(&vmmdev_mtx); + mutex_enter(&vmm_mtx); + if (!enable_block) { + 
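		/*
		 * A disable request must pair with a successful prior
		 * enable; tripping the VERIFY below means a consumer's
		 * block/unblock calls are unbalanced.
		 */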
VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0); - if ((sc = vmm_lookup(name)) == NULL) { - mutex_exit(&vmmdev_mtx); - return (NULL); + sc->vmm_flags &= ~VMM_BLOCK_HOOK; + goto done; } - mutex_exit(&vmmdev_mtx); + /* If any holds have hooks installed, the block is a failure */ + if (!list_is_empty(&sc->vmm_holds)) { + vmm_hold_t *hold; + + for (hold = list_head(&sc->vmm_holds); hold != NULL; + hold = list_next(&sc->vmm_holds, hold)) { + if (hold->vmh_ioport_hook_cnt != 0) { + err = EBUSY; + goto done; + } + } + } + sc->vmm_flags |= VMM_BLOCK_HOOK; - return (sc->vm); +done: + mutex_exit(&vmm_mtx); + return (err); } -int -vmmdev_do_vm_destroy(dev_info_t *dip, char *name) +static int +vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd) { - struct vmm_softc *sc; - dev_info_t *pdip = ddi_get_parent(dip); + dev_info_t *pdip = ddi_get_parent(vmmdev_dip); + minor_t minor; - mutex_enter(&vmmdev_mtx); + ASSERT(MUTEX_HELD(&vmm_mtx)); - if ((sc = vmm_lookup(name)) == NULL) { - mutex_exit(&vmmdev_mtx); - return (ENOENT); + if (clean_zsd) { + vmm_zsd_rem_vm(sc); } - if (sc->open) { - mutex_exit(&vmmdev_mtx); - return (EBUSY); + if (vmm_drv_purge(sc) != 0) { + return (EINTR); } - vm_destroy(sc->vm); - SLIST_REMOVE(&head, sc, vmm_softc, link); - ddi_remove_minor_node(dip, name); - ddi_soft_state_free(vmm_statep, sc->minor); - (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE); + /* Clean up devmem entries */ + vmmdev_devmem_purge(sc); - mutex_exit(&vmmdev_mtx); + list_remove(&vmm_list, sc); + ddi_remove_minor_node(vmmdev_dip, sc->vmm_name); + minor = sc->vmm_minor; + zone_rele(sc->vmm_zone); + if (sc->vmm_is_open) { + list_insert_tail(&vmm_destroy_list, sc); + sc->vmm_flags |= VMM_DESTROY; + } else { + vm_destroy(sc->vmm_vm); + ddi_soft_state_free(vmm_statep, minor); + id_free(vmm_minors, minor); + } + (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE); return (0); } int -vmmdev_do_vm_mmap(struct vmm_softc *vmm_sc, off_t off, int nprot) +vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd) { - vm_paddr_t paddr; + int err; - mutex_enter(&vmmdev_mtx); + mutex_enter(&vmm_mtx); + err = vmm_do_vm_destroy_locked(sc, clean_zsd); + mutex_exit(&vmm_mtx); - paddr = vm_gpa2hpa(vmm_sc->vm, (vm_paddr_t)off, PAGE_SIZE); - if (paddr == -1) { - return (-1); - } + return (err); +} - mutex_exit(&vmmdev_mtx); +/* ARGSUSED */ +static int +vmmdev_do_vm_destroy(const char *name, cred_t *cr) +{ + vmm_softc_t *sc; + int err; + + if (crgetuid(cr) != 0) + return (EPERM); + + mutex_enter(&vmm_mtx); + + if ((sc = vmm_lookup(name)) == NULL) { + mutex_exit(&vmm_mtx); + return (ENOENT); + } + /* + * We don't check this in vmm_lookup() since that function is also used + * for validation during create and currently vmm names must be unique. 
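+	 * A caller in a non-global zone may therefore only destroy VMs
+	 * belonging to that zone; the global zone may destroy any instance.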
+ */ + if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) { + mutex_exit(&vmm_mtx); + return (EPERM); + } + err = vmm_do_vm_destroy_locked(sc, B_TRUE); + mutex_exit(&vmm_mtx); - return (btop(paddr)); + return (err); } static int vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) { - minor_t minor; - struct vmm_softc *sc; + minor_t minor; + vmm_softc_t *sc; minor = getminor(*devp); if (minor == VMM_CTL_MINOR) { @@ -768,19 +1750,15 @@ vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) return (0); } - mutex_enter(&vmmdev_mtx); + mutex_enter(&vmm_mtx); sc = ddi_get_soft_state(vmm_statep, minor); if (sc == NULL) { - mutex_exit(&vmmdev_mtx); + mutex_exit(&vmm_mtx); return (ENXIO); } - if (sc->open) { - mutex_exit(&vmmdev_mtx); - return (EBUSY); - } - sc->open = B_TRUE; - mutex_exit(&vmmdev_mtx); + sc->vmm_is_open = B_TRUE; + mutex_exit(&vmm_mtx); return (0); } @@ -788,170 +1766,360 @@ vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) static int vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) { - minor_t minor; - struct vmm_softc *sc; + minor_t minor; + vmm_softc_t *sc; minor = getminor(dev); if (minor == VMM_CTL_MINOR) return (0); - mutex_enter(&vmmdev_mtx); + mutex_enter(&vmm_mtx); sc = ddi_get_soft_state(vmm_statep, minor); if (sc == NULL) { - mutex_exit(&vmmdev_mtx); + mutex_exit(&vmm_mtx); return (ENXIO); } - sc->open = B_FALSE; - mutex_exit(&vmmdev_mtx); + VERIFY(sc->vmm_is_open); + sc->vmm_is_open = B_FALSE; + + if (sc->vmm_flags & VMM_DESTROY) { + list_remove(&vmm_destroy_list, sc); + vm_destroy(sc->vmm_vm); + ddi_soft_state_free(vmm_statep, minor); + id_free(vmm_minors, minor); + } + mutex_exit(&vmm_mtx); return (0); } +static int +vmm_is_supported(intptr_t arg) +{ + int r; + const char *msg; + + if (vmm_is_intel()) { + r = vmx_x86_supported(&msg); + } else if (vmm_is_amd()) { + /* + * HMA already ensured that the features necessary for SVM + * operation were present and online during vmm_attach(). 
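+	 * The successful hma_register() performed at attach time is thus
+	 * taken as sufficient evidence of SVM support, and no further
+	 * probing is needed here.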
+ */ + r = 0; + } else { + r = ENXIO; + msg = "Unsupported CPU vendor"; + } + + if (r != 0 && arg != (intptr_t)NULL) { + if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0) + return (EFAULT); + } + return (r); +} + static int vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) { - struct vmm_softc *sc; - struct vmm_ioctl kvi; - minor_t minor; + vmm_softc_t *sc; + minor_t minor; minor = getminor(dev); if (minor == VMM_CTL_MINOR) { - if (ddi_copyin((void *)arg, &kvi, sizeof (struct vmm_ioctl), - mode)) { - return (EFAULT); + void *argp = (void *)arg; + char name[VM_MAX_NAMELEN] = { 0 }; + size_t len = 0; + + if ((mode & FKIOCTL) != 0) { + len = strlcpy(name, argp, sizeof (name)); + } else { + if (copyinstr(argp, name, sizeof (name), &len) != 0) { + return (EFAULT); + } } + if (len >= VM_MAX_NAMELEN) { + return (ENAMETOOLONG); + } + switch (cmd) { case VMM_CREATE_VM: if ((mode & FWRITE) == 0) return (EPERM); - return (vmmdev_do_vm_create(vmm_dip, kvi.vmm_name)); + return (vmmdev_do_vm_create(name, credp)); case VMM_DESTROY_VM: if ((mode & FWRITE) == 0) return (EPERM); - return (vmmdev_do_vm_destroy(vmm_dip, kvi.vmm_name)); + return (vmmdev_do_vm_destroy(name, credp)); + case VMM_VM_SUPPORTED: + return (vmm_is_supported(arg)); default: - break; + /* No other actions are legal on ctl device */ + return (ENOTTY); } } sc = ddi_get_soft_state(vmm_statep, minor); ASSERT(sc); + if (sc->vmm_flags & VMM_DESTROY) + return (ENXIO); + return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); } static int -vmm_mmap(dev_t dev, off_t off, int prot) +vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, + unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) { - struct vmm_softc *sc; + vmm_softc_t *sc; + const minor_t minor = getminor(dev); + struct vm *vm; + int err; + vm_object_t vmo = NULL; + struct vmspace *vms; + + if (minor == VMM_CTL_MINOR) { + return (ENODEV); + } + if (off < 0 || (off + len) <= 0) { + return (EINVAL); + } + if ((prot & PROT_USER) == 0) { + return (EACCES); + } - sc = ddi_get_soft_state(vmm_statep, getminor(dev)); + sc = ddi_get_soft_state(vmm_statep, minor); ASSERT(sc); - return (vmmdev_do_vm_mmap(sc, off, prot)); -} + if (sc->vmm_flags & VMM_DESTROY) + return (ENXIO); -static int -vmm_segmap(dev_t dev, off_t off, struct as *as, - caddr_t *addrp, off_t len, unsigned int prot, - unsigned int maxprot, unsigned int flags, cred_t *credp) -{ - struct segdev_crargs dev_a; - int error; + /* Grab read lock on the VM to prevent any changes to the memory map */ + vmm_read_lock(sc); - as_rangelock(as); + vm = sc->vmm_vm; + vms = vm_get_vmspace(vm); + if (off >= VM_DEVMEM_START) { + int segid; - error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); - if (error != 0) { - as_rangeunlock(as); - return (error); + /* Mapping a devmem "device" */ + if (!vmmdev_devmem_segid(sc, off, len, &segid)) { + err = ENODEV; + goto out; + } + err = vm_get_memseg(vm, segid, NULL, NULL, &vmo); + if (err != 0) { + goto out; + } + err = vm_segmap_obj(vms, vmo, as, addrp, prot, maxprot, flags); + } else { + /* Mapping a part of the guest physical space */ + err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot, + flags); } - dev_a.mapfunc = vmm_mmap; - dev_a.dev = dev; - dev_a.offset = off; - dev_a.type = (flags & MAP_TYPE); - dev_a.prot = (uchar_t)prot; - dev_a.maxprot = (uchar_t)maxprot; - dev_a.hat_attr = 0; - dev_a.hat_flags = HAT_LOAD_NOCONSIST; - dev_a.devmap_data = NULL; - - error = as_map(as, *addrp, len, 
segdev_create, &dev_a); - - as_rangeunlock(as); - return (error); +out: + vmm_read_unlock(sc); + return (err); } -static int -vmm_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +static sdev_plugin_validate_t +vmm_sdev_validate(sdev_ctx_t ctx) { - return (0); + const char *name = sdev_ctx_name(ctx); + vmm_softc_t *sc; + sdev_plugin_validate_t ret; + minor_t minor; + + if (sdev_ctx_vtype(ctx) != VCHR) + return (SDEV_VTOR_INVALID); + + VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); + + mutex_enter(&vmm_mtx); + if ((sc = vmm_lookup(name)) == NULL) + ret = SDEV_VTOR_INVALID; + else if (sc->vmm_minor != minor) + ret = SDEV_VTOR_STALE; + else + ret = SDEV_VTOR_VALID; + mutex_exit(&vmm_mtx); + + return (ret); } static int -vmm_probe(dev_info_t *dip) +vmm_sdev_filldir(sdev_ctx_t ctx) { - if (driver_installed(ddi_name_to_major("kvm"))) { - cmn_err(CE_WARN, "kvm is installed\n"); - return (DDI_PROBE_FAILURE); + vmm_softc_t *sc; + int ret; + + if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { + cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, + sdev_ctx_path(ctx), VMM_SDEV_ROOT); + return (EINVAL); + } + + mutex_enter(&vmm_mtx); + ASSERT(vmmdev_dip != NULL); + for (sc = list_head(&vmm_list); sc != NULL; + sc = list_next(&vmm_list, sc)) { + if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { + ret = sdev_plugin_mknod(ctx, sc->vmm_name, + S_IFCHR | 0600, + makedevice(ddi_driver_major(vmmdev_dip), + sc->vmm_minor)); + } else { + continue; + } + if (ret != 0 && ret != EEXIST) + goto out; } - return (DDI_PROBE_SUCCESS); + ret = 0; + +out: + mutex_exit(&vmm_mtx); + return (ret); +} + +/* ARGSUSED */ +static void +vmm_sdev_inactive(sdev_ctx_t ctx) +{ } +static sdev_plugin_ops_t vmm_sdev_ops = { + .spo_version = SDEV_PLUGIN_VERSION, + .spo_flags = SDEV_PLUGIN_SUBDIR, + .spo_validate = vmm_sdev_validate, + .spo_filldir = vmm_sdev_filldir, + .spo_inactive = vmm_sdev_inactive +}; + +/* ARGSUSED */ static int -vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) { + int error; + switch (cmd) { - case DDI_ATTACH: + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)vmmdev_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; break; default: + error = DDI_FAILURE; + break; + } + return (error); +} + +static int +vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + sdev_plugin_hdl_t sph; + hma_reg_t *reg = NULL; + boolean_t vmm_loaded = B_FALSE; + + if (cmd != DDI_ATTACH) { return (DDI_FAILURE); } - if (vmm_mod_load()) { + mutex_enter(&vmmdev_mtx); + /* Ensure we are not already attached. */ + if (vmmdev_dip != NULL) { + mutex_exit(&vmmdev_mtx); return (DDI_FAILURE); } - vmm_dip = dip; + vmm_sol_glue_init(); + vmm_arena_init(); - /* - * Create control node. Other nodes will be created on demand. - */ - if (ddi_create_minor_node(dip, VMM_CTL_MINOR_NODE, S_IFCHR, + if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { + goto fail; + } else if (vmm_mod_load() != 0) { + goto fail; + } + vmm_loaded = B_TRUE; + + /* Create control node. Other nodes will be created on demand. 
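+	 * Per-VM minors are surfaced on demand under /dev/vmm by the sdev
+	 * plugin registered below.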
*/ + if (ddi_create_minor_node(dip, "ctl", S_IFCHR, VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { - return (DDI_FAILURE); + goto fail; } - ddi_report_dev(dip); + if ((sph = sdev_plugin_register("vmm", &vmm_sdev_ops, NULL)) == + (sdev_plugin_hdl_t)NULL) { + ddi_remove_minor_node(dip, NULL); + goto fail; + } + ddi_report_dev(dip); + vmmdev_hma_reg = reg; + vmmdev_sdev_hdl = sph; + vmmdev_dip = dip; + mutex_exit(&vmmdev_mtx); return (DDI_SUCCESS); + +fail: + if (vmm_loaded) { + VERIFY0(vmm_mod_unload()); + } + if (reg != NULL) { + hma_unregister(reg); + } + vmm_arena_fini(); + vmm_sol_glue_cleanup(); + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); } static int vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { - switch (cmd) { - case DDI_DETACH: - break; - default: + if (cmd != DDI_DETACH) { + return (DDI_FAILURE); + } + + /* Ensure that all resources have been cleaned up */ + mutex_enter(&vmmdev_mtx); + + mutex_enter(&vmm_mtx); + if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) { + mutex_exit(&vmm_mtx); + mutex_exit(&vmmdev_mtx); return (DDI_FAILURE); } + mutex_exit(&vmm_mtx); - if (vmm_mod_unload()) {; + VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); + if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { + mutex_exit(&vmmdev_mtx); return (DDI_FAILURE); } + vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; - /* - * Remove the control node. - */ - ddi_remove_minor_node(dip, VMM_CTL_MINOR_NODE); - vmm_dip = NULL; + /* Remove the control node. */ + ddi_remove_minor_node(dip, "ctl"); + vmmdev_dip = NULL; + + VERIFY0(vmm_mod_unload()); + hma_unregister(vmmdev_hma_reg); + vmmdev_hma_reg = NULL; + vmm_arena_fini(); + vmm_sol_glue_cleanup(); + + mutex_exit(&vmmdev_mtx); return (DDI_SUCCESS); } @@ -966,7 +2134,7 @@ static struct cb_ops vmm_cb_ops = { nodev, /* write */ vmm_ioctl, nodev, /* devmap */ - vmm_mmap, + nodev, /* mmap */ vmm_segmap, nochpoll, /* poll */ ddi_prop_op, @@ -977,9 +2145,9 @@ static struct cb_ops vmm_cb_ops = { static struct dev_ops vmm_ops = { DEVO_REV, 0, - ddi_no_info, + vmm_info, nulldev, /* identify */ - vmm_probe, + nulldev, /* probe */ vmm_attach, vmm_detach, nodev, /* reset */ @@ -989,7 +2157,7 @@ static struct dev_ops vmm_ops = { static struct modldrv modldrv = { &mod_driverops, - "vmm", + "bhyve vmm", &vmm_ops }; @@ -1004,16 +2172,27 @@ _init(void) { int error; - mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); + sysinit(); - error = ddi_soft_state_init(&vmm_statep, sizeof (struct vmm_softc), 0); + mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); + list_create(&vmm_list, sizeof (vmm_softc_t), + offsetof(vmm_softc_t, vmm_node)); + list_create(&vmm_destroy_list, sizeof (vmm_softc_t), + offsetof(vmm_softc_t, vmm_node)); + vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); + + error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); if (error) { return (error); } + vmm_zsd_init(); + error = mod_install(&modlinkage); if (error) { ddi_soft_state_fini(&vmm_statep); + vmm_zsd_fini(); } return (error); @@ -1028,6 +2207,9 @@ _fini(void) if (error) { return (error); } + + vmm_zsd_fini(); + ddi_soft_state_fini(&vmm_statep); return (0); diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c new file mode 100644 index 0000000000..c26e763805 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c @@ -0,0 +1,268 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 
1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include +#include +#include +#include + +#include +#include + + +struct ept_map { + gipt_map_t em_gipt; + uint64_t em_wired_page_count; +}; +typedef struct ept_map ept_map_t; + +#define EPT_LOCK(m) (&(m)->em_gipt.giptm_lock) + +#define EPT_MAX_LEVELS 4 + +CTASSERT(EPT_MAX_LEVELS <= GIPT_MAX_LEVELS); + +#define EPT_R (0x1 << 0) +#define EPT_W (0x1 << 1) +#define EPT_X (0x1 << 2) +#define EPT_RWX (EPT_R | EPT_W | EPT_X) +#define EPT_LGPG (0x1 << 7) + +#define EPT_PA_MASK (0x000ffffffffff000ull) + +CTASSERT(EPT_R == PROT_READ); +CTASSERT(EPT_W == PROT_WRITE); +CTASSERT(EPT_X == PROT_EXEC); + + +#define EPT_PAT(attr) (((attr) & 0x7) << 3) +#define EPT_PADDR(addr) ((addr) & EPT_PA_MASK) + +#define EPT_IS_ABSENT(pte) (((pte) & EPT_RWX) == 0) +#define EPT_PTE_PFN(pte) mmu_btop(EPT_PADDR(pte)) +#define EPT_PTE_PROT(pte) ((pte) & EPT_RWX) +#define EPT_MAPS_PAGE(pte, lvl) \ + (EPT_PTE_PROT(pte) != 0 && (((pte) & EPT_LGPG) != 0 || (lvl) == 0)) + +/* + * Only assign EPT_LGPG for levels higher than 0. Although this bit is defined + * as being ignored at level 0, some versions of VMWare fail to honor this and + * report such a PTE as an EPT mis-configuration. + */ +#define EPT_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr) \ + (EPT_PADDR(pfn_to_pa(pfn)) | \ + (((lvl) != 0) ? EPT_LGPG : 0) | \ + EPT_PAT(attr) | ((prot) & EPT_RWX)) +#define EPT_PTE_ASSIGN_TABLE(pfn) (EPT_PADDR(pfn_to_pa(pfn)) | EPT_RWX) + + +static gipt_pte_type_t +ept_pte_type(uint64_t pte, uint_t level) +{ + if (EPT_IS_ABSENT(pte)) { + return (PTET_EMPTY); + } else if (EPT_MAPS_PAGE(pte, level)) { + return (PTET_PAGE); + } else { + return (PTET_LINK); + } +} + +static uint64_t +ept_pte_map(uint64_t pfn) +{ + return (EPT_PTE_ASSIGN_TABLE(pfn)); +} + +static void * +ept_create(uintptr_t *pml4_kaddr) +{ + ept_map_t *emap; + gipt_map_t *map; + gipt_t *root; + struct gipt_cbs cbs = { + .giptc_pte_type = ept_pte_type, + .giptc_pte_map = ept_pte_map, + }; + + emap = kmem_zalloc(sizeof (*emap), KM_SLEEP); + map = &emap->em_gipt; + root = gipt_alloc(); + root->gipt_level = EPT_MAX_LEVELS - 1; + gipt_map_init(map, EPT_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root); + + *pml4_kaddr = (uintptr_t)root->gipt_kva; + return (emap); +} + +static void +ept_destroy(void *arg) +{ + ept_map_t *emap = arg; + + if (emap != NULL) { + gipt_map_t *map = &emap->em_gipt; + + gipt_map_fini(map); + kmem_free(emap, sizeof (*emap)); + } +} + +static uint64_t +ept_wired_count(void *arg) +{ + ept_map_t *emap = arg; + uint64_t res; + + mutex_enter(EPT_LOCK(emap)); + res = emap->em_wired_page_count; + mutex_exit(EPT_LOCK(emap)); + + return (res); +} + +static int +ept_is_wired(void *arg, uint64_t va, uint_t *protp) +{ + ept_map_t *emap = arg; + gipt_t *pt; + int rv = -1; + + mutex_enter(EPT_LOCK(emap)); + pt = gipt_map_lookup_deepest(&emap->em_gipt, va); + if (pt != NULL) { + const uint64_t pte = GIPT_VA2PTE(pt, va); + + if (EPT_MAPS_PAGE(pte, pt->gipt_level)) { + *protp = EPT_PTE_PROT(pte); + rv = 0; + } + } + mutex_exit(EPT_LOCK(emap)); + + return (rv); +} + +static int +ept_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot, + uint8_t attr) +{ + ept_map_t *emap = arg; + gipt_map_t *map = &emap->em_gipt; + gipt_t *pt; + uint64_t *ptep, pte; 
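+
+	/*
+	 * The caller must request at least one of the R/W/X bits and
+	 * nothing outside them, and the target level must fit within the
+	 * four-level EPT tree; both are asserted before the PTE is built.
+	 */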
+ + ASSERT((prot & EPT_RWX) != 0 && (prot & ~EPT_RWX) == 0); + ASSERT3U(lvl, <, EPT_MAX_LEVELS); + + mutex_enter(EPT_LOCK(emap)); + pt = gipt_map_lookup(map, va, lvl); + if (pt == NULL) { + /* + * A table at the appropriate VA/level that would house this + * mapping does not currently exist. Try to walk down to that + * point, creating any necessary parent(s). + */ + pt = gipt_map_create_parents(map, va, lvl); + + /* + * There was a large page mapping in the way of creating the + * necessary parent table(s). + */ + if (pt == NULL) { + panic("unexpected large page @ %08lx", va); + } + } + ptep = GIPT_VA2PTEP(pt, va); + + pte = *ptep; + if (!EPT_IS_ABSENT(pte)) { + if (!EPT_MAPS_PAGE(pte, lvl)) { + panic("unexpected PT link @ %08lx in %p", va, pt); + } else { + panic("unexpected page mapped @ %08lx in %p", va, pt); + } + } + + pte = EPT_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr); + *ptep = pte; + pt->gipt_valid_cnt++; + emap->em_wired_page_count += gipt_level_count[lvl]; + + mutex_exit(EPT_LOCK(emap)); + return (0); +} + +static uint64_t +ept_unmap(void *arg, uint64_t va, uint64_t end_va) +{ + ept_map_t *emap = arg; + gipt_map_t *map = &emap->em_gipt; + gipt_t *pt; + uint64_t cur_va = va; + uint64_t unmapped = 0; + + mutex_enter(EPT_LOCK(emap)); + + pt = gipt_map_lookup_deepest(map, cur_va); + if (pt == NULL) { + mutex_exit(EPT_LOCK(emap)); + return (0); + } + if (!EPT_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) { + cur_va = gipt_map_next_page(map, cur_va, end_va, &pt); + if (cur_va == 0) { + mutex_exit(EPT_LOCK(emap)); + return (0); + } + } + + while (cur_va < end_va) { + uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va); + const uint_t lvl = pt->gipt_level; + + ASSERT(EPT_MAPS_PAGE(*ptep, lvl)); + *ptep = 0; + pt->gipt_valid_cnt--; + unmapped += gipt_level_count[pt->gipt_level]; + + gipt_t *next_pt = pt; + uint64_t next_va; + next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt); + + if (pt->gipt_valid_cnt == 0) { + gipt_map_clean_parents(map, pt); + } + if (next_va == 0) { + break; + } + pt = next_pt; + cur_va = next_va; + } + emap->em_wired_page_count -= unmapped; + + mutex_exit(EPT_LOCK(emap)); + + return (unmapped); +} + +struct vmm_pt_ops ept_ops = { + .vpo_init = ept_create, + .vpo_free = ept_destroy, + .vpo_wired_cnt = ept_wired_count, + .vpo_is_wired = ept_is_wired, + .vpo_map = ept_map, + .vpo_unmap = ept_unmap, +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c index 6588f5a46d..a8d94ea024 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c @@ -1,31 +1,4 @@ /* - * Copyright (c) 2004 John Baldwin - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: head/sys/kern/subr_sleepqueue.c 261520 2014-02-05 18:13:27Z jhb $ - */ -/*- * Copyright (c) 2004 Poul-Henning Kamp * All rights reserved. * @@ -63,6 +36,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #include @@ -73,22 +47,62 @@ #include #include #include +#include +#include +#include +#include +#include +#include #include #include #include +#include #include #include #include +#include #include #include +SET_DECLARE(sysinit_set, struct sysinit); + +void +sysinit(void) +{ + struct sysinit **si; + + SET_FOREACH(si, sysinit_set) + (*si)->func((*si)->data); +} + +u_char const bin2bcd_data[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 +}; + vm_paddr_t pmap_kextract(vm_offset_t va) { pfn_t pfn; + /* + * Since hat_getpfnum() may block on an htable mutex, this is not at + * all safe to run from a critical_enter/kpreempt_disable context. + * The FreeBSD analog does not have the same locking constraints, so + * close attention must be paid wherever this is called. + */ + ASSERT(curthread->t_preempt == 0); + pfn = hat_getpfnum(kas.a_hat, (caddr_t)va); ASSERT(pfn != PFN_INVALID); return (pfn << PAGE_SHIFT) | ((uintptr_t)va & PAGE_MASK); @@ -97,45 +111,72 @@ pmap_kextract(vm_offset_t va) int cpusetobj_ffs(const cpuset_t *set) { -#if CPUSET_WORDS > 1 - int i, cbit; + uint_t large, small; - cbit = 0; - for (i = 0; i < CPUSET_WORDS; i++) { - if (set->cpub[i] != 0) { - cbit = ffsl(set->cpub[i]); - cbit += i * sizeof (set->cpub[0]); - break; - } - } - return (cbit); -#else - return(ffsl(*set)); -#endif -} + /* + * Rather than reaching into the cpuset_t ourselves, leave that task to + * cpuset_bounds(). The simplicity is worth the extra wasted work to + * find the upper bound. 
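+	 *
+	 * For example, a hypothetical caller converts back to a CPU id as:
+	 *
+	 *	uint_t lo = cpusetobj_ffs(&set);
+	 *	if (lo != 0)
+	 *		cpu = lo - 1;
+	 *
+	 * where a return of 0 signals an empty set.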
+ */ + cpuset_bounds(set, &small, &large); -void -smp_rendezvous(void (* setup_func)(void *), - void (* action_func)(void *), - void (* teardown_func)(void *), - void *arg) -{ - cpuset_t cpuset; + if (small == CPUSET_NOTINSET) { + /* The FreeBSD version returns 0 if it finds nothing */ + return (0); + } - ASSERT(setup_func == NULL); - ASSERT(teardown_func == NULL); + ASSERT3U(small, <=, INT_MAX); - CPUSET_ALL(cpuset); - xc_sync((xc_arg_t)arg, 0, 0, CPUSET2BV(cpuset), (xc_func_t)action_func); + /* Least significant bit index starts at 1 for valid results */ + return (small + 1); } struct kmem_item { void *addr; size_t size; - LIST_ENTRY(kmem_item) next; }; static kmutex_t kmem_items_lock; -static LIST_HEAD(, kmem_item) kmem_items; + +static mod_hash_t *vmm_alloc_hash; +uint_t vmm_alloc_hash_nchains = 16381; +uint_t vmm_alloc_hash_size = PAGESIZE; + +static void +vmm_alloc_hash_valdtor(mod_hash_val_t val) +{ + struct kmem_item *i = (struct kmem_item *)val; + + kmem_free(i->addr, i->size); + kmem_free(i, sizeof (struct kmem_item)); +} + +static void +vmm_alloc_init(void) +{ + vmm_alloc_hash = mod_hash_create_ptrhash("vmm_alloc_hash", + vmm_alloc_hash_nchains, vmm_alloc_hash_valdtor, + vmm_alloc_hash_size); + + VERIFY(vmm_alloc_hash != NULL); +} + +static uint_t +vmm_alloc_check(mod_hash_key_t key, mod_hash_val_t *val, void *unused) +{ + struct kmem_item *i = (struct kmem_item *)val; + + cmn_err(CE_PANIC, "!vmm_alloc_check: hash not empty: %p, %d", i->addr, + i->size); + + return (MH_WALK_TERMINATE); +} + +static void +vmm_alloc_cleanup(void) +{ + mod_hash_walk(vmm_alloc_hash, vmm_alloc_check, NULL); + mod_hash_destroy_ptrhash(vmm_alloc_hash); +} void * malloc(unsigned long size, struct malloc_type *mtp, int flags) @@ -148,17 +189,28 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags) kmem_flag = KM_NOSLEEP; if (flags & M_ZERO) { - p = kmem_zalloc(size + sizeof(struct kmem_item), kmem_flag); + p = kmem_zalloc(size, kmem_flag); } else { - p = kmem_alloc(size + sizeof(struct kmem_item), kmem_flag); + p = kmem_alloc(size, kmem_flag); + } + + if (p == NULL) + return (NULL); + + i = kmem_zalloc(sizeof (struct kmem_item), kmem_flag); + + if (i == NULL) { + kmem_free(p, size); + return (NULL); } mutex_enter(&kmem_items_lock); - i = p + size; i->addr = p; i->size = size; - LIST_INSERT_HEAD(&kmem_items, i, next); + VERIFY(mod_hash_insert(vmm_alloc_hash, + (mod_hash_key_t)PHYS_TO_DMAP(vtophys(p)), (mod_hash_val_t)i) == 0); + mutex_exit(&kmem_items_lock); return (p); @@ -167,29 +219,66 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags) void free(void *addr, struct malloc_type *mtp) { - struct kmem_item *i; - mutex_enter(&kmem_items_lock); - LIST_FOREACH(i, &kmem_items, next) { - if (i->addr == addr) - break; - } - ASSERT(i != NULL); - LIST_REMOVE(i, next); + VERIFY(mod_hash_destroy(vmm_alloc_hash, + (mod_hash_key_t)PHYS_TO_DMAP(vtophys(addr))) == 0); mutex_exit(&kmem_items_lock); +} + +extern void *contig_alloc(size_t, ddi_dma_attr_t *, uintptr_t, int); +extern void contig_free(void *, size_t); - kmem_free(addr, i->size + sizeof(struct kmem_item)); +void * +contigmalloc(unsigned long size, struct malloc_type *type, int flags, + vm_paddr_t low, vm_paddr_t high, unsigned long alignment, + vm_paddr_t boundary) +{ + ddi_dma_attr_t attr = { + /* Using fastboot_dma_attr as a guide...
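+	 * for a single-segment (dma_attr_sgllen == 1), page-aligned
+	 * allocation constrained to the caller's [low, high] range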
*/ + DMA_ATTR_V0, + low, /* dma_attr_addr_lo */ + high, /* dma_attr_addr_hi */ + 0x00000000FFFFFFFFULL, /* dma_attr_count_max */ + alignment, /* dma_attr_align */ + 1, /* dma_attr_burstsize */ + 1, /* dma_attr_minxfer */ + 0x00000000FFFFFFFFULL, /* dma_attr_maxxfer */ + 0x00000000FFFFFFFFULL, /* dma_attr_seg: any */ + 1, /* dma_attr_sgllen */ + alignment, /* dma_attr_granular */ + 0, /* dma_attr_flags */ + }; + int cansleep = (flags & M_WAITOK); + void *result; + + ASSERT(alignment == PAGESIZE); + + result = contig_alloc((size_t)size, &attr, alignment, cansleep); + + if (result != NULL && (flags & M_ZERO) != 0) { + bzero(result, size); + } + return (result); +} + +void +contigfree(void *addr, unsigned long size, struct malloc_type *type) +{ + contig_free(addr, size); } void mtx_init(struct mtx *mtx, char *name, const char *type_name, int opts) { - if (opts & MTX_SPIN) { - mutex_init(&mtx->m, name, MUTEX_SPIN, - (ddi_iblock_cookie_t)ipltospl(DISP_LEVEL)); - } else { - mutex_init(&mtx->m, name, MUTEX_DRIVER, NULL); - } + /* + * Requests that a mutex be initialized to the MTX_SPIN type are + * ignored. The limitations which may have required spinlocks on + * FreeBSD do not apply to how bhyve has been structured here. + * + * Adaptive mutexes are required to avoid deadlocks when certain + * cyclics behavior interacts with interrupts and contended locks. + */ + mutex_init(&mtx->m, name, MUTEX_ADAPTIVE, NULL); } void @@ -202,130 +291,14 @@ void critical_enter(void) { kpreempt_disable(); - thread_affinity_set(curthread, CPU_CURRENT); } void critical_exit(void) { - thread_affinity_clear(curthread); kpreempt_enable(); } -struct unr { - u_int item; - struct unr *link; -}; - -#define UNR_HASHSIZE 8 - -struct unrhdr { - struct mtx *mtx; - struct unr *hash[UNR_HASHSIZE]; - u_int min; - u_int max; - u_int next; -}; - -#define HASH_UNR(uh, i) ((uh)->hash[(i) & ((UNR_HASHSIZE) - 1)]) - -static struct mtx unr_mtx; - -/* - * Allocate a new unrheader set. - * - * Highest and lowest valid values given as parameters. 
- */ -struct unrhdr * -new_unrhdr(int low, int high, struct mtx *mtx) -{ - struct unrhdr *uh; - - uh = kmem_zalloc(sizeof (struct unrhdr), KM_SLEEP); - if (mtx) { - uh->mtx = mtx; - } else { - uh->mtx = &unr_mtx; - } - uh->min = low; - uh->max = high; - uh->next = uh->min; - - return (uh); -} - -void -delete_unrhdr(struct unrhdr *uh) -{ - kmem_free(uh, sizeof (struct unrhdr)); -} - -static struct unr * -unr_lookup(struct unrhdr *uh, int item) -{ - struct unr *unr; - - ASSERT(MUTEX_HELD(&uh->mtx->m)); - - for (unr = HASH_UNR(uh, item); unr != NULL; unr = unr->link) { - if (unr->item == item) - break; - } - - return (unr); -} - -int -alloc_unr(struct unrhdr *uh) -{ - struct unr *unr; - int item, start; - - mutex_enter(&uh->mtx->m); - start = uh->next; - for (;;) { - item = uh->next; - if (++uh->next == uh->max) { - uh->next = uh->min; - } - - if (unr_lookup(uh, item) == NULL) { - unr = kmem_zalloc(sizeof (struct unr), KM_SLEEP); - unr->item = item; - unr->link = HASH_UNR(uh, item); - HASH_UNR(uh, item) = unr; - break; - } - - if (item == start) { - item = -1; - break; - } - } - mutex_exit(&uh->mtx->m); - - return (item); -} - -void -free_unr(struct unrhdr *uh, u_int item) -{ - struct unr *unr, **unrp; - - mutex_enter(&uh->mtx->m); - unrp = &HASH_UNR(uh, item); - for (;;) { - ASSERT(*unrp != NULL); - if ((*unrp)->item == item) - break; - unrp = &(*unrp)->link; - } - unr = *unrp; - *unrp = unr->link; - mutex_exit(&uh->mtx->m); - kmem_free(unr, sizeof(struct unr)); -} - static void vmm_glue_callout_handler(void *arg) @@ -351,25 +324,43 @@ vmm_glue_callout_init(struct callout *c, int mpsafe) when.cyt_interval = CY_INFINITY; mutex_enter(&cpu_lock); - c->c_cyc_id = cyclic_add(&hdlr, &when); +#if 0 + /* + * XXXJOY: according to the freebsd sources, callouts do not begin + * their life in the ACTIVE state. + */ c->c_flags |= CALLOUT_ACTIVE; +#else + bzero(c, sizeof (*c)); +#endif + c->c_cyc_id = cyclic_add(&hdlr, &when); mutex_exit(&cpu_lock); } +static __inline hrtime_t +sbttohrtime(sbintime_t sbt) +{ + return (((sbt >> 32) * NANOSEC) + + (((uint64_t)NANOSEC * (uint32_t)sbt) >> 32)); +} + int vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, sbintime_t pr, void (*func)(void *), void *arg, int flags) { + hrtime_t target = sbttohrtime(sbt); + ASSERT(c->c_cyc_id != CYCLIC_NONE); c->c_func = func; c->c_arg = arg; c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); - if (flags & C_ABSOLUTE) - cyclic_reprogram(c->c_cyc_id, sbt); - else - cyclic_reprogram(c->c_cyc_id, sbt + gethrtime()); + if (flags & C_ABSOLUTE) { + cyclic_reprogram(c->c_cyc_id, target); + } else { + cyclic_reprogram(c->c_cyc_id, target + gethrtime()); + } return (0); } @@ -397,201 +388,24 @@ vmm_glue_callout_drain(struct callout *c) return (0); } -static int -ipi_cpu_justreturn(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3) -{ - return (0); -} - -void -ipi_cpu(int cpu, u_int ipi) -{ - cpuset_t set; - - CPUSET_ONLY(set, cpu); - xc_call_nowait(NULL, NULL, NULL, CPUSET2BV(set), - ipi_cpu_justreturn); -} - -#define SC_TABLESIZE 256 /* Must be power of 2. */ -#define SC_MASK (SC_TABLESIZE - 1) -#define SC_SHIFT 8 -#define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \ - SC_MASK) -#define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)] - -struct sleepqueue { - u_int sq_blockedcnt; /* Num. of blocked threads. */ - LIST_ENTRY(sleepqueue) sq_hash; /* Chain. */ - void *sq_wchan; /* Wait channel. */ - kcondvar_t sq_cv; -}; - -struct sleepqueue_chain { - LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. 
*/ - struct mtx sc_lock; /* Spin lock for this chain. */ -}; - -static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE]; - -#define SLEEPQ_CACHE_SZ (64) -static kmem_cache_t *vmm_sleepq_cache; - -static int -vmm_sleepq_cache_init(void *buf, void *user_arg, int kmflags) -{ - struct sleepqueue *sq = (struct sleepqueue *)buf; - - bzero(sq, sizeof (struct sleepqueue)); - cv_init(&sq->sq_cv, NULL, CV_DRIVER, NULL); - - return (0); -} - -static void -vmm_sleepq_cache_fini(void *buf, void *user_arg) -{ - struct sleepqueue *sq = (struct sleepqueue *)buf; - cv_destroy(&sq->sq_cv); -} - -static void -init_sleepqueues(void) -{ - int i; - - for (i = 0; i < SC_TABLESIZE; i++) { - LIST_INIT(&sleepq_chains[i].sc_queues); - mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL, - MTX_SPIN); - } - - vmm_sleepq_cache = kmem_cache_create("vmm_sleepq_cache", - sizeof (struct sleepqueue), SLEEPQ_CACHE_SZ, vmm_sleepq_cache_init, - vmm_sleepq_cache_fini, NULL, NULL, NULL, 0); - -} - -/* - * Lock the sleep queue chain associated with the specified wait channel. - */ -static void -sleepq_lock(void *wchan) -{ - struct sleepqueue_chain *sc; - - sc = SC_LOOKUP(wchan); - mtx_lock_spin(&sc->sc_lock); -} - -/* - * Look up the sleep queue associated with a given wait channel in the hash - * table locking the associated sleep queue chain. If no queue is found in - * the table, NULL is returned. - */ -static struct sleepqueue * -sleepq_lookup(void *wchan) -{ - struct sleepqueue_chain *sc; - struct sleepqueue *sq; - - KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); - sc = SC_LOOKUP(wchan); - mtx_assert(&sc->sc_lock, MA_OWNED); - LIST_FOREACH(sq, &sc->sc_queues, sq_hash) - if (sq->sq_wchan == wchan) - return (sq); - return (NULL); -} - -/* - * Unlock the sleep queue chain associated with a given wait channel. - */ -static void -sleepq_release(void *wchan) -{ - struct sleepqueue_chain *sc; - - sc = SC_LOOKUP(wchan); - mtx_unlock_spin(&sc->sc_lock); -} - -struct sleepqueue * -sleepq_add(void *wchan) -{ - struct sleepqueue_chain *sc; - struct sleepqueue *sq; - - sc = SC_LOOKUP(wchan); - - /* Look up the sleep queue associated with the wait channel 'wchan'. 
*/ - sq = sleepq_lookup(wchan); - - if (sq == NULL) { - sq = kmem_cache_alloc(vmm_sleepq_cache, KM_SLEEP); - LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash); - sq->sq_wchan = wchan; - } - - sq->sq_blockedcnt++; - - return (sq); -} - -void -sleepq_remove(struct sleepqueue *sq) -{ - sq->sq_blockedcnt--; - - if (sq->sq_blockedcnt == 0) { - LIST_REMOVE(sq, sq_hash); - kmem_cache_free(vmm_sleepq_cache, sq); - } -} - -int -msleep_spin(void *chan, struct mtx *mtx, const char *wmesg, int ticks) -{ - struct sleepqueue *sq; - int error; - - sleepq_lock(chan); - sq = sleepq_add(chan); - sleepq_release(chan); - - cv_reltimedwait(&sq->sq_cv, &mtx->m, ticks, TR_CLOCK_TICK); - - sleepq_lock(chan); - sleepq_remove(sq); - sleepq_release(chan); - - return (error); -} - void -wakeup(void *chan) +vmm_glue_callout_localize(struct callout *c) { - struct sleepqueue *sq; - - sleepq_lock(chan); - sq = sleepq_lookup(chan); - if (sq != NULL) { - cv_broadcast(&sq->sq_cv); - } - sleepq_release(chan); + mutex_enter(&cpu_lock); + cyclic_move_here(c->c_cyc_id); + mutex_exit(&cpu_lock); } void -wakeup_one(void *chan) +ipi_cpu(int cpu, u_int ipi) { - struct sleepqueue *sq; - - sleepq_lock(chan); - sq = sleepq_lookup(chan); - if (sq != NULL) { - cv_signal(&sq->sq_cv); - } - sleepq_release(chan); + /* + * This was previously implemented as an invocation of asynchronous + * no-op crosscalls to interrupt the target CPU. Since even nowait + * crosscalls can block in certain circumstances, a direct poke_cpu() + * is safer when called from delicate contexts. + */ + poke_cpu(cpu); } u_int cpu_high; /* Highest arg to CPUID */ @@ -618,162 +432,257 @@ vmm_cpuid_init(void) cpu_exthigh = regs[0]; } -struct savefpu { - fpu_ctx_t fsa_fp_ctx; -}; - -static vmem_t *fpu_save_area_arena; - -static void -fpu_save_area_init(void) -{ - fpu_save_area_arena = vmem_create("fpu_save_area", - NULL, 0, XSAVE_AREA_ALIGN, - segkmem_alloc, segkmem_free, heap_arena, 0, VM_BESTFIT | VM_SLEEP); -} - -static void -fpu_save_area_cleanup(void) -{ - vmem_destroy(fpu_save_area_arena); -} - +/* + * FreeBSD uses the struct savefpu for managing the FPU state. That is mimicked + * by our hypervisor multiplexor framework structure. + */ struct savefpu * fpu_save_area_alloc(void) { - return (vmem_alloc(fpu_save_area_arena, sizeof (struct savefpu), - VM_SLEEP)); + return ((struct savefpu *)hma_fpu_alloc(KM_SLEEP)); } void fpu_save_area_free(struct savefpu *fsa) { - vmem_free(fpu_save_area_arena, fsa, sizeof (struct savefpu)); + hma_fpu_t *fpu = (hma_fpu_t *)fsa; + hma_fpu_free(fpu); } void fpu_save_area_reset(struct savefpu *fsa) { - extern const struct fxsave_state sse_initial; - extern const struct xsave_state avx_initial; - struct fpu_ctx *fp; - struct fxsave_state *fx; - struct xsave_state *xs; - - fp = &fsa->fsa_fp_ctx; - - fp->fpu_regs.kfpu_status = 0; - fp->fpu_regs.kfpu_xstatus = 0; - - switch (fp_save_mech) { - case FP_FXSAVE: - fx = &fp->fpu_regs.kfpu_u.kfpu_fx; - bcopy(&sse_initial, fx, sizeof (*fx)); - break; - case FP_XSAVE: - fp->fpu_xsave_mask = (XFEATURE_ENABLED_X87 | - XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX); - xs = &fp->fpu_regs.kfpu_u.kfpu_xs; - bcopy(&avx_initial, xs, sizeof (*xs)); - break; - default: - panic("Invalid fp_save_mech"); - /*NOTREACHED*/ - } + hma_fpu_t *fpu = (hma_fpu_t *)fsa; + hma_fpu_init(fpu); } +/* + * This glue function is supposed to save the host's FPU state. This is always + * paired in the general bhyve code with a call to fpusave. 
Therefore, we treat + * this as a nop and do all the work in fpusave(), which will have the context + * argument that we want anyways. + */ void fpuexit(kthread_t *td) { - fp_save(&curthread->t_lwp->lwp_pcb.pcb_fpu); } -static __inline void -vmm_fxrstor(struct fxsave_state *addr) +/* + * This glue function is supposed to restore the guest's FPU state from the save + * area back to the host. In FreeBSD, it is assumed that the host state has + * already been saved by a call to fpuexit(); however, we do both here. + */ +void +fpurestore(void *arg) { - __asm __volatile("fxrstor %0" : : "m" (*(addr))); -} + hma_fpu_t *fpu = arg; -static __inline void -vmm_fxsave(struct fxsave_state *addr) -{ - __asm __volatile("fxsave %0" : "=m" (*(addr))); + hma_fpu_start_guest(fpu); } -static __inline void -vmm_xrstor(struct xsave_state *addr, uint64_t mask) +/* + * This glue function is supposed to save the guest's FPU state. The host's FPU + * state is not expected to be restored necessarily due to the use of FPU + * emulation through CR0.TS. However, we can and do restore it here. + */ +void +fpusave(void *arg) { - uint32_t low, hi; + hma_fpu_t *fpu = arg; - low = mask; - hi = mask >> 32; - __asm __volatile("xrstor %0" : : "m" (*addr), "a" (low), "d" (hi)); + hma_fpu_stop_guest(fpu); } -static __inline void -vmm_xsave(struct xsave_state *addr, uint64_t mask) +void +vmm_sol_glue_init(void) { - uint32_t low, hi; - - low = mask; - hi = mask >> 32; - __asm __volatile("xsave %0" : "=m" (*addr) : "a" (low), "d" (hi) : - "memory"); + vmm_alloc_init(); + vmm_cpuid_init(); } void -fpurestore(void *arg) +vmm_sol_glue_cleanup(void) { - struct savefpu *fsa = (struct savefpu *)arg; - struct fpu_ctx *fp; - - fp = &fsa->fsa_fp_ctx; - - switch (fp_save_mech) { - case FP_FXSAVE: - vmm_fxrstor(&fp->fpu_regs.kfpu_u.kfpu_fx); - break; - case FP_XSAVE: - vmm_xrstor(&fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); - break; - default: - panic("Invalid fp_save_mech"); - /*NOTREACHED*/ - } + vmm_alloc_cleanup(); } -void -fpusave(void *arg) + +/* From FreeBSD's sys/kern/subr_clock.c */ + +/*- + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1982, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah $Hdr: clock.c 1.18 91/01/21$ + * from: @(#)clock.c 8.2 (Berkeley) 1/12/94 + * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp + * and + * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04 + */ + +#include + +/*--------------------------------------------------------------------* + * Generic routines to convert between a POSIX date + * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec + * Derived from NetBSD arch/hp300/hp300/clock.c + */ + +#define FEBRUARY 2 +#define days_in_year(y) (leapyear(y) ? 366 : 365) +#define days_in_month(y, m) \ + (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0)) +/* Day of week. Days are counted from 1/1/1970, which was a Thursday */ +#define day_of_week(days) (((days) + 4) % 7) + +static const int month_days[12] = { + 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 +}; + + +/* + * This inline avoids some unnecessary modulo operations + * as compared with the usual macro: + * ( ((year % 4) == 0 && + * (year % 100) != 0) || + * ((year % 400) == 0) ) + * It is otherwise equivalent. + */ +static int +leapyear(int year) { - struct savefpu *fsa = (struct savefpu *)arg; - struct fpu_ctx *fp; - - fp = &fsa->fsa_fp_ctx; - - switch (fp_save_mech) { - case FP_FXSAVE: - vmm_fxsave(&fp->fpu_regs.kfpu_u.kfpu_fx); - break; - case FP_XSAVE: - vmm_xsave(&fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); - break; - default: - panic("Invalid fp_save_mech"); - /*NOTREACHED*/ + int rv = 0; + + if ((year & 3) == 0) { + rv = 1; + if ((year % 100) == 0) { + rv = 0; + if ((year % 400) == 0) + rv = 1; + } } + return (rv); } -void -vmm_sol_glue_init(void) +int +clock_ct_to_ts(struct clocktime *ct, struct timespec *ts) { - vmm_cpuid_init(); - fpu_save_area_init(); - init_sleepqueues(); + int i, year, days; + + year = ct->year; + +#ifdef __FreeBSD__ + if (ct_debug) { + printf("ct_to_ts("); + print_ct(ct); + printf(")"); + } +#endif + + /* Sanity checks. */ + if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 || + ct->day > days_in_month(year, ct->mon) || + ct->hour > 23 || ct->min > 59 || ct->sec > 59 || + (sizeof(time_t) == 4 && year > 2037)) { /* time_t overflow */ +#ifdef __FreeBSD__ + if (ct_debug) + printf(" = EINVAL\n"); +#endif + return (EINVAL); + } + + /* + * Compute days since start of time + * First from years, then from months. 
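+	 *
+	 * For example, a ct of 1972-03-02 (a hypothetical input)
+	 * accumulates 365 + 365 days for 1970 and 1971, then 31 + 29 for
+	 * January and February of leap year 1972, then (day - 1) = 1:
+	 * 791 whole days before the hour/minute/second terms are added.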
+ */ + days = 0; + for (i = POSIX_BASE_YEAR; i < year; i++) + days += days_in_year(i); + + /* Months */ + for (i = 1; i < ct->mon; i++) + days += days_in_month(year, i); + days += (ct->day - 1); + + ts->tv_sec = (((time_t)days * 24 + ct->hour) * 60 + ct->min) * 60 + + ct->sec; + ts->tv_nsec = ct->nsec; + +#ifdef __FreeBSD__ + if (ct_debug) + printf(" = %ld.%09ld\n", (long)ts->tv_sec, (long)ts->tv_nsec); +#endif + return (0); } void -vmm_sol_glue_cleanup(void) -{ - fpu_save_area_cleanup(); - kmem_cache_destroy(vmm_sleepq_cache); +clock_ts_to_ct(struct timespec *ts, struct clocktime *ct) +{ + int i, year, days; + time_t rsec; /* remainder seconds */ + time_t secs; + + secs = ts->tv_sec; + days = secs / SECDAY; + rsec = secs % SECDAY; + + ct->dow = day_of_week(days); + + /* Subtract out whole years, counting them in year. */ + for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++) + days -= days_in_year(year); + ct->year = year; + + /* Subtract out whole months, counting them in i. */ + for (i = 1; days >= days_in_month(year, i); i++) + days -= days_in_month(year, i); + ct->mon = i; + + /* Days are what is left over (+1) from all that. */ + ct->day = days + 1; + + /* Hours, minutes, seconds are easy */ + ct->hour = rsec / 3600; + rsec = rsec % 3600; + ct->min = rsec / 60; + rsec = rsec % 60; + ct->sec = rsec; + ct->nsec = ts->tv_nsec; +#ifdef __FreeBSD__ + if (ct_debug) { + printf("ts_to_ct(%ld.%09ld) = ", + (long)ts->tv_sec, (long)ts->tv_nsec); + print_ct(ct); + printf("\n"); + } +#endif } diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c deleted file mode 100644 index 3bb5412d16..0000000000 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c +++ /dev/null @@ -1,111 +0,0 @@ -/*- - * Copyright (c) 2011 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: head/sys/amd64/vmm/vmm_mem.c 245678 2013-01-20 03:42:49Z neel $ - */ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source.
A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * Copyright 2013 Pluribus Networks Inc. - */ - -#include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_mem.c 245678 2013-01-20 03:42:49Z neel $"); - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include - -#include "vmm_util.h" -#include "vmm_mem.h" - -int -vmm_mem_init(void) -{ - return (0); -} - -vm_paddr_t -vmm_mem_alloc(size_t size) -{ - clock_t usec = 2 * 1000000; - vm_paddr_t pa; - caddr_t addr; - - if (size != PAGE_SIZE) - panic("vmm_mem_alloc: invalid allocation size %lu", size); - - while (usec > 0) { - if ((addr = kmem_zalloc(PAGE_SIZE, KM_NOSLEEP)) != NULL) { - ASSERT(((uintptr_t)addr & PAGE_MASK) == 0); - pa = vtophys((vm_offset_t)addr); - return (pa); - } - delay(drv_usectohz((clock_t)500000)); - usec -= 500000; - } - - return (NULL); -} - -void -vmm_mem_free(vm_paddr_t base, size_t length) -{ - page_t *pp; - - if (base & PAGE_MASK) { - panic("vmm_mem_free: base 0x%0lx must be aligned on a " - "0x%0x boundary\n", base, PAGE_SIZE); - } - - if (length != PAGE_SIZE) { - panic("vmm_mem_free: invalid length %lu", length); - } - - pp = page_numtopp_nolock(btop(base)); - kmem_free((void *)pp->p_offset, PAGE_SIZE); -} - -vm_paddr_t -vmm_mem_maxaddr(void) -{ - - return (ptob(physmax + 1)); -} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c new file mode 100644 index 0000000000..d630d32630 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c @@ -0,0 +1,297 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include +#include +#include +#include +#include + +#include +#include + + +struct rvi_map { + gipt_map_t rm_gipt; + uint64_t rm_wired_page_count; +}; +typedef struct rvi_map rvi_map_t; + +#define RVI_LOCK(m) (&(m)->rm_gipt.giptm_lock) + +#define RVI_MAX_LEVELS 4 + +CTASSERT(RVI_MAX_LEVELS <= GIPT_MAX_LEVELS); + +#define RVI_PRESENT PT_VALID +#define RVI_WRITABLE PT_WRITABLE +#define RVI_ACCESSED PT_REF +#define RVI_DIRTY PT_MOD +#define RVI_LGPG PT_PAGESIZE +#define RVI_NX PT_NX +#define RVI_USER PT_USER +#define RVI_PWT PT_WRITETHRU +#define RVI_PCD PT_NOCACHE + +#define RVI_PA_MASK PT_PADDR + +#define RVI_PAT(attr) rvi_attr_to_pat(attr) +#define RVI_PADDR(addr) ((addr) & RVI_PA_MASK) +#define RVI_PROT(prot) \ + ((((prot) & PROT_WRITE) != 0 ? RVI_WRITABLE : 0) | \ + (((prot) & PROT_EXEC) == 0 ? RVI_NX : 0)) + +#define RVI_IS_ABSENT(pte) (((pte) & RVI_PRESENT) == 0) +#define RVI_PTE_PFN(pte) mmu_btop(RVI_PADDR(pte)) +#define RVI_MAPS_PAGE(pte, lvl) \ + (!RVI_IS_ABSENT(pte) && (((pte) & RVI_LGPG) != 0 || (lvl) == 0)) +#define RVI_PTE_PROT(pte) \ + (RVI_IS_ABSENT(pte) ? 0 : ( \ + PROT_READ | \ + (((pte) & RVI_NX) == 0 ? PROT_EXEC : 0) | \ + (((pte) & RVI_WRITABLE) != 0 ? PROT_WRITE : 0))) + +#define RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr) \ + (RVI_PADDR(pfn_to_pa(pfn)) | \ + (((lvl) != 0) ? 
RVI_LGPG : 0) | \ + RVI_USER | RVI_ACCESSED | RVI_PRESENT | \ + RVI_PAT(attr) | \ + RVI_PROT(prot)) + +#define RVI_PTE_ASSIGN_TABLE(pfn) \ + (RVI_PADDR(pfn_to_pa(pfn)) | \ + RVI_USER | RVI_ACCESSED | RVI_PRESENT | \ + RVI_PAT(MTRR_TYPE_WB) | \ + RVI_PROT(PROT_READ | PROT_WRITE | PROT_EXEC)) + + +/* Make sure that PAT indexes line up as expected */ +CTASSERT((PAT_DEFAULT_ATTRIBUTE & 0xf) == MTRR_TYPE_WB); +CTASSERT(((PAT_DEFAULT_ATTRIBUTE >> 24) & 0xf) == MTRR_TYPE_UC); + +static inline uint64_t +rvi_attr_to_pat(const uint8_t attr) +{ + if (attr == MTRR_TYPE_UC) { + /* !PAT + PCD + PWT -> PAT3 -> MTRR_TYPE_UC */ + return (RVI_PCD|RVI_PWT); + } else if (attr == MTRR_TYPE_WB) { + /* !PAT + !PCD + !PWT -> PAT0 -> MTRR_TYPE_WB */ + return (0); + } + + panic("unexpected memattr %x", attr); + return (0); +} + +static gipt_pte_type_t +rvi_pte_type(uint64_t pte, uint_t level) +{ + if (RVI_IS_ABSENT(pte)) { + return (PTET_EMPTY); + } else if (RVI_MAPS_PAGE(pte, level)) { + return (PTET_PAGE); + } else { + return (PTET_LINK); + } +} + +static uint64_t +rvi_pte_map(uint64_t pfn) +{ + return (RVI_PTE_ASSIGN_TABLE(pfn)); +} + +static void * +rvi_create(uintptr_t *pml4_kaddr) +{ + rvi_map_t *rmap; + gipt_map_t *map; + gipt_t *root; + struct gipt_cbs cbs = { + .giptc_pte_type = rvi_pte_type, + .giptc_pte_map = rvi_pte_map, + }; + + rmap = kmem_zalloc(sizeof (*rmap), KM_SLEEP); + map = &rmap->rm_gipt; + root = gipt_alloc(); + root->gipt_level = RVI_MAX_LEVELS - 1; + gipt_map_init(map, RVI_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root); + + *pml4_kaddr = (uintptr_t)root->gipt_kva; + return (rmap); +} + +static void +rvi_destroy(void *arg) +{ + rvi_map_t *rmap = arg; + + if (rmap != NULL) { + gipt_map_t *map = &rmap->rm_gipt; + + gipt_map_fini(map); + kmem_free(rmap, sizeof (*rmap)); + } +} + +static uint64_t +rvi_wired_count(void *arg) +{ + rvi_map_t *rmap = arg; + uint64_t res; + + mutex_enter(RVI_LOCK(rmap)); + res = rmap->rm_wired_page_count; + mutex_exit(RVI_LOCK(rmap)); + + return (res); +} + +static int +rvi_is_wired(void *arg, uint64_t va, uint_t *protp) +{ + rvi_map_t *rmap = arg; + gipt_t *pt; + int rv = -1; + + mutex_enter(RVI_LOCK(rmap)); + pt = gipt_map_lookup_deepest(&rmap->rm_gipt, va); + if (pt != NULL) { + const uint64_t pte = GIPT_VA2PTE(pt, va); + + if (RVI_MAPS_PAGE(pte, pt->gipt_level)) { + *protp = RVI_PTE_PROT(pte); + rv = 0; + } + } + mutex_exit(RVI_LOCK(rmap)); + + return (rv); +} + +static int +rvi_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot, + uint8_t attr) +{ + rvi_map_t *rmap = arg; + gipt_map_t *map = &rmap->rm_gipt; + gipt_t *pt; + uint64_t *ptep, pte; + + ASSERT((prot & PROT_READ) != 0); + ASSERT3U((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)), ==, 0); + ASSERT3U(lvl, <, RVI_MAX_LEVELS); + + mutex_enter(RVI_LOCK(rmap)); + pt = gipt_map_lookup(map, va, lvl); + if (pt == NULL) { + /* + * A table at the appropriate VA/level that would house this + * mapping does not currently exist. Try to walk down to that + * point, creating any necessary parent(s). + */ + pt = gipt_map_create_parents(map, va, lvl); + + /* + * There was a large page mapping in the way of creating the + * necessary parent table(s). 
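+	 * (The EPT flavor of this path treats the same collision as fatal
+	 * too; the two page-table implementations differ mainly in PTE
+	 * encoding.)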
+ */ + if (pt == NULL) { + panic("unexpected large page @ %08lx", va); + } + } + ptep = GIPT_VA2PTEP(pt, va); + + pte = *ptep; + if (!RVI_IS_ABSENT(pte)) { + if (!RVI_MAPS_PAGE(pte, lvl)) { + panic("unexpected PT link @ %08lx in %p", va, pt); + } else { + panic("unexpected page mapped @ %08lx in %p", va, pt); + } + } + + pte = RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr); + *ptep = pte; + pt->gipt_valid_cnt++; + rmap->rm_wired_page_count += gipt_level_count[lvl]; + + mutex_exit(RVI_LOCK(rmap)); + return (0); +} + +static uint64_t +rvi_unmap(void *arg, uint64_t va, uint64_t end_va) +{ + rvi_map_t *rmap = arg; + gipt_map_t *map = &rmap->rm_gipt; + gipt_t *pt; + uint64_t cur_va = va; + uint64_t unmapped = 0; + + mutex_enter(RVI_LOCK(rmap)); + + pt = gipt_map_lookup_deepest(map, cur_va); + if (pt == NULL) { + mutex_exit(RVI_LOCK(rmap)); + return (0); + } + if (!RVI_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) { + cur_va = gipt_map_next_page(map, cur_va, end_va, &pt); + if (cur_va == 0) { + mutex_exit(RVI_LOCK(rmap)); + return (0); + } + } + + while (cur_va < end_va) { + uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va); + const uint_t lvl = pt->gipt_level; + + ASSERT(RVI_MAPS_PAGE(*ptep, lvl)); + *ptep = 0; + pt->gipt_valid_cnt--; + unmapped += gipt_level_count[pt->gipt_level]; + + gipt_t *next_pt = pt; + uint64_t next_va; + next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt); + + if (pt->gipt_valid_cnt == 0) { + gipt_map_clean_parents(map, pt); + } + if (next_va == 0) { + break; + } + pt = next_pt; + cur_va = next_va; + } + rmap->rm_wired_page_count -= unmapped; + + mutex_exit(RVI_LOCK(rmap)); + + return (unmapped); +} + +struct vmm_pt_ops rvi_ops = { + .vpo_init = rvi_create, + .vpo_free = rvi_destroy, + .vpo_wired_cnt = rvi_wired_count, + .vpo_is_wired = rvi_is_wired, + .vpo_map = rvi_map, + .vpo_unmap = rvi_unmap, +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c new file mode 100644 index 0000000000..66a67d9529 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c @@ -0,0 +1,1016 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include "vm/vm_glue.h" + +#define PMAP_TO_VMMAP(pm) ((vm_map_t) \ + ((caddr_t)(pm) - offsetof(struct vmspace, vms_pmap))) +#define VMMAP_TO_VMSPACE(vmmap) ((struct vmspace *) \ + ((caddr_t)(vmmap) - offsetof(struct vmspace, vm_map))) + + +struct vmspace_mapping { + list_node_t vmsm_node; + vm_object_t vmsm_object; + uintptr_t vmsm_addr; + size_t vmsm_len; + off_t vmsm_offset; + uint_t vmsm_prot; +}; +typedef struct vmspace_mapping vmspace_mapping_t; + +#define VMSM_OFFSET(vmsm, addr) ( \ + (vmsm)->vmsm_offset + \ + ((addr) - (uintptr_t)(vmsm)->vmsm_addr)) + + +/* Private glue interfaces */ +static void pmap_free(pmap_t); +static vmspace_mapping_t *vm_mapping_find(struct vmspace *, uintptr_t, size_t, + boolean_t); +static void vm_mapping_remove(struct vmspace *, vmspace_mapping_t *); + +static vmem_t *vmm_alloc_arena = NULL; + +static void * +vmm_arena_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + return (segkmem_xalloc(vmp, NULL, size, vmflag, 0, + segkmem_page_create, &kvps[KV_VVP])); +} + +static void +vmm_arena_free(vmem_t *vmp, void *inaddr, size_t size) +{ + segkmem_xfree(vmp, inaddr, size, &kvps[KV_VVP], NULL); +} + +void +vmm_arena_init(void) +{ + vmm_alloc_arena = vmem_create("vmm_alloc_arena", NULL, 0, 1024 * 1024, + vmm_arena_alloc, vmm_arena_free, kvmm_arena, 0, VM_SLEEP); + + ASSERT(vmm_alloc_arena != NULL); +} + +void +vmm_arena_fini(void) +{ + VERIFY(vmem_size(vmm_alloc_arena, VMEM_ALLOC) == 0); + vmem_destroy(vmm_alloc_arena); + vmm_alloc_arena = NULL; +} + +struct vmspace * +vmspace_alloc(vm_offset_t start, vm_offset_t end, pmap_pinit_t pinit) +{ + struct vmspace *vms; + const uintptr_t size = end + 1; + + /* + * This whole mess is built on the assumption that a 64-bit address + * space is available to work with for the various pagetable tricks. + */ + VERIFY(ttoproc(curthread)->p_model == DATAMODEL_LP64); + VERIFY(start == 0 && size > 0 && (size & PAGEOFFSET) == 0 && + size <= (uintptr_t)USERLIMIT); + + vms = kmem_zalloc(sizeof (*vms), KM_SLEEP); + vms->vms_size = size; + list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t), + offsetof(vmspace_mapping_t, vmsm_node)); + + if (pinit(&vms->vms_pmap) == 0) { + kmem_free(vms, sizeof (*vms)); + return (NULL); + } + + return (vms); +} + +void +vmspace_free(struct vmspace *vms) +{ + VERIFY(list_is_empty(&vms->vms_maplist)); + + pmap_free(&vms->vms_pmap); + kmem_free(vms, sizeof (*vms)); +} + +pmap_t +vmspace_pmap(struct vmspace *vms) +{ + return (&vms->vms_pmap); +} + +long +vmspace_resident_count(struct vmspace *vms) +{ + /* XXXJOY: finish */ + return (0); +} + +void * +vmspace_find_kva(struct vmspace *vms, uintptr_t addr, size_t size) +{ + vmspace_mapping_t *vmsm; + void *result = NULL; + + /* + * Since vmspace_find_kva is provided so that vmm_drv consumers can do + * GPA2KVA translations, it is expected to be called when there is a + * read lock preventing vmspace alterations. As such, it can do the + * lockless vm_mapping_find() lookup. 
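+	 *
+	 * As an illustrative sketch only (the consumer, buffer, and
+	 * page-aligned gpa are hypothetical), such a GPA-to-KVA
+	 * translation might look like:
+	 *
+	 *	void *kva = vmspace_find_kva(vms, gpa, PAGESIZE);
+	 *	if (kva != NULL)
+	 *		bcopy(kva, buf, PAGESIZE);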
+ */ + vmsm = vm_mapping_find(vms, addr, size, B_TRUE); + if (vmsm != NULL) { + struct vm_object *vmo = vmsm->vmsm_object; + + switch (vmo->vmo_type) { + case OBJT_DEFAULT: + result = (void *)((uintptr_t)vmo->vmo_data + + VMSM_OFFSET(vmsm, addr)); + break; + default: + break; + } + } + + return (result); +} + +static int +vmspace_pmap_iswired(struct vmspace *vms, uintptr_t addr, uint_t *prot) +{ + pmap_t pmap = &vms->vms_pmap; + int rv; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + + rv = pmap->pm_ops->vpo_is_wired(pmap->pm_impl, addr, prot); + return (rv); +} + +static void +pmap_free(pmap_t pmap) +{ + void *pmi = pmap->pm_impl; + struct vmm_pt_ops *ops = pmap->pm_ops; + + pmap->pm_pml4 = NULL; + pmap->pm_impl = NULL; + pmap->pm_ops = NULL; + + ops->vpo_free(pmi); +} + +int +pmap_pinit_type(pmap_t pmap, enum pmap_type type, int flags) +{ + /* For use in vmm only */ + pmap->pm_type = type; + switch (type) { + case PT_EPT: { + struct vmm_pt_ops *ops = &ept_ops; + void *pml4, *pmi; + + pmi = ops->vpo_init((uintptr_t *)&pml4); + + pmap->pm_ops = ops; + pmap->pm_impl = pmi; + pmap->pm_pml4 = pml4; + return (1); + } + case PT_RVI: { + struct vmm_pt_ops *ops = &rvi_ops; + void *pml4, *pmi; + + pmi = ops->vpo_init((uintptr_t *)&pml4); + + pmap->pm_ops = ops; + pmap->pm_impl = pmi; + pmap->pm_pml4 = pml4; + return (1); + } + default: + panic("unsupported pmap type: %x", type); + break; + } + + return (1); +} + +long +pmap_wired_count(pmap_t pmap) +{ + long val; + + val = pmap->pm_ops->vpo_wired_cnt(pmap->pm_impl); + VERIFY3S(val, >=, 0); + + return (val); +} + +int +pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) +{ + /* Allow the fallback to vm_fault to handle this */ + return (-1); +} + + + +struct sglist_ent { + vm_paddr_t sge_pa; + size_t sge_len; +}; +struct sglist { + kmutex_t sg_lock; + uint_t sg_refcnt; + uint_t sg_len; + uint_t sg_next; + struct sglist_ent sg_entries[]; +}; + +#define SG_SIZE(cnt) (sizeof (struct sglist) + \ + (sizeof (struct sglist_ent) * (cnt))) + +struct sglist * +sglist_alloc(int nseg, int flags) +{ + const size_t sz = SG_SIZE(nseg); + const int flag = (flags & M_WAITOK) ? 
KM_SLEEP : KM_NOSLEEP; + struct sglist *sg; + + ASSERT(nseg > 0); + + sg = kmem_zalloc(sz, flag); + if (sg != NULL) { + sg->sg_len = nseg; + sg->sg_refcnt = 1; + } + return (sg); +} + +void +sglist_free(struct sglist *sg) +{ + size_t sz; + + mutex_enter(&sg->sg_lock); + if (sg->sg_refcnt > 1) { + sg->sg_refcnt--; + mutex_exit(&sg->sg_lock); + return; + } + + VERIFY(sg->sg_refcnt == 1); + sg->sg_refcnt = 0; + sz = SG_SIZE(sg->sg_len); + mutex_exit(&sg->sg_lock); + kmem_free(sg, sz); +} + +int +sglist_append_phys(struct sglist *sg, vm_paddr_t pa, size_t len) +{ + uint_t idx; + struct sglist_ent *ent; + + /* Restrict to page-aligned entries */ + if ((pa & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0 || len == 0) { + return (EINVAL); + } + + mutex_enter(&sg->sg_lock); + idx = sg->sg_next; + if (idx >= sg->sg_len) { + mutex_exit(&sg->sg_lock); + return (ENOSPC); + } + + ent = &sg->sg_entries[idx]; + ASSERT(ent->sge_pa == 0 && ent->sge_len == 0); + ent->sge_pa = pa; + ent->sge_len = len; + sg->sg_next++; + + mutex_exit(&sg->sg_lock); + return (0); +} + + +static pfn_t +vm_object_pager_none(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) +{ + panic("bad vm_object pager"); + return (PFN_INVALID); +} + +static pfn_t +vm_object_pager_heap(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) +{ + const uintptr_t kaddr = ALIGN2PAGE((uintptr_t)vmo->vmo_data + off); + uint_t idx, level; + htable_t *ht; + x86pte_t pte; + pfn_t top_pfn, pfn; + + ASSERT(vmo->vmo_type == OBJT_DEFAULT); + ASSERT(off < vmo->vmo_size); + + ht = htable_getpage(kas.a_hat, kaddr, &idx); + if (ht == NULL) { + return (PFN_INVALID); + } + pte = x86pte_get(ht, idx); + if (!PTE_ISPAGE(pte, ht->ht_level)) { + htable_release(ht); + return (PFN_INVALID); + } + + pfn = top_pfn = PTE2PFN(pte, ht->ht_level); + level = ht->ht_level; + if (ht->ht_level > 0) { + pfn += mmu_btop(kaddr & LEVEL_OFFSET((uint_t)ht->ht_level)); + } + htable_release(ht); + + if (lpfn != NULL) { + *lpfn = top_pfn; + } + if (lvl != NULL) { + *lvl = level; + } + return (pfn); +} + +static pfn_t +vm_object_pager_sg(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) +{ + const uintptr_t aoff = ALIGN2PAGE(off); + uint_t level = 0; + uintptr_t pos = 0; + struct sglist *sg; + struct sglist_ent *ent; + pfn_t pfn = PFN_INVALID; + + ASSERT(vmo->vmo_type == OBJT_SG); + ASSERT(off < vmo->vmo_size); + + sg = vmo->vmo_data; + if (sg == NULL) { + return (PFN_INVALID); + } + + ent = &sg->sg_entries[0]; + for (uint_t i = 0; i < sg->sg_next; i++, ent++) { + if (aoff >= pos && aoff < (pos + ent->sge_len)) { + /* XXXJOY: Punt on large pages for now */ + level = 0; + pfn = mmu_btop(ent->sge_pa + (aoff - pos)); + break; + } + pos += ent->sge_len; + } + + if (lpfn != 0) { + *lpfn = pfn; + } + if (lvl != 0) { + *lvl = level; + } + return (pfn); +} + +static void +vm_reserve_pages(size_t npages) +{ + uint_t retries = 60; + int rc; + + mutex_enter(&freemem_lock); + if (availrmem < npages) { + mutex_exit(&freemem_lock); + + /* + * Set needfree and wait for the ZFS ARC reap thread to free up + * some memory. 
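+		 * The loop below re-checks availrmem once per second,
+		 * giving up after roughly a minute (or earlier if the
+		 * waiting thread is signalled), so a persistent shortfall
+		 * becomes an ordinary allocation failure for the caller
+		 * rather than an indefinite hang.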
+		 */
+		page_needfree(npages);
+
+		mutex_enter(&freemem_lock);
+		while ((availrmem < npages) && retries-- > 0) {
+			mutex_exit(&freemem_lock);
+			rc = delay_sig(drv_usectohz(1 * MICROSEC));
+			mutex_enter(&freemem_lock);
+
+			if (rc == EINTR)
+				break;
+		}
+		mutex_exit(&freemem_lock);
+
+		page_needfree(-npages);
+	} else {
+		mutex_exit(&freemem_lock);
+	}
+}
+
+void
+vm_object_clear(vm_object_t vmo)
+{
+	ASSERT(vmo->vmo_type == OBJT_DEFAULT);
+
+	/* XXXJOY: Better zeroing approach? */
+	bzero(vmo->vmo_data, vmo->vmo_size);
+}
+
+vm_object_t
+vm_object_allocate(objtype_t type, vm_pindex_t psize)
+{
+	vm_object_t vmo;
+	const size_t size = ptob((size_t)psize);
+
+	vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP);
+	mutex_init(&vmo->vmo_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/* For now, these are to stay fixed after allocation */
+	vmo->vmo_type = type;
+	vmo->vmo_size = size;
+	vmo->vmo_attr = VM_MEMATTR_DEFAULT;
+
+	switch (type) {
+	case OBJT_DEFAULT: {
+		vm_reserve_pages(psize);
+
+		/* XXXJOY: opt-in to larger pages? */
+		vmo->vmo_data = vmem_alloc(vmm_alloc_arena, size, KM_NOSLEEP);
+		if (vmo->vmo_data == NULL) {
+			mutex_destroy(&vmo->vmo_lock);
+			kmem_free(vmo, sizeof (*vmo));
+			return (NULL);
+		}
+		vm_object_clear(vmo);
+		vmo->vmo_pager = vm_object_pager_heap;
+	}
+		break;
+	case OBJT_SG:
+		vmo->vmo_data = NULL;
+		vmo->vmo_pager = vm_object_pager_sg;
+		break;
+	default:
+		panic("Unsupported vm_object type");
+		break;
+	}
+
+	vmo->vmo_refcnt = 1;
+	return (vmo);
+}
+
+vm_object_t
+vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size,
+    vm_prot_t prot, vm_ooffset_t off, void *cred)
+{
+	struct vm_object *vmo;
+	struct sglist *sg = (struct sglist *)handle;
+
+	/* XXXJOY: be very restrictive for now */
+	VERIFY(type == OBJT_SG);
+	VERIFY(off == 0);
+
+	vmo = vm_object_allocate(type, size);
+	vmo->vmo_data = sg;
+
+	mutex_enter(&sg->sg_lock);
+	VERIFY(sg->sg_refcnt++ >= 1);
+	mutex_exit(&sg->sg_lock);
+
+	return (vmo);
+}
+
+void
+vm_object_deallocate(vm_object_t vmo)
+{
+	ASSERT(vmo != NULL);
+
+	uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt);
+	/* underflow would be a deadly serious mistake */
+	VERIFY3U(ref, !=, UINT_MAX);
+	if (ref != 0) {
+		return;
+	}
+
+	switch (vmo->vmo_type) {
+	case OBJT_DEFAULT:
+		vmem_free(vmm_alloc_arena, vmo->vmo_data, vmo->vmo_size);
+		break;
+	case OBJT_SG:
+		sglist_free((struct sglist *)vmo->vmo_data);
+		break;
+	default:
+		panic("Unsupported vm_object type");
+		break;
+	}
+
+	vmo->vmo_pager = vm_object_pager_none;
+	vmo->vmo_data = NULL;
+	vmo->vmo_size = 0;
+	mutex_destroy(&vmo->vmo_lock);
+	kmem_free(vmo, sizeof (*vmo));
+}
+
+CTASSERT(VM_MEMATTR_UNCACHEABLE == MTRR_TYPE_UC);
+CTASSERT(VM_MEMATTR_WRITE_BACK == MTRR_TYPE_WB);
+int
+vm_object_set_memattr(vm_object_t vmo, vm_memattr_t attr)
+{
+	ASSERT(MUTEX_HELD(&vmo->vmo_lock));
+
+	switch (attr) {
+	case VM_MEMATTR_UNCACHEABLE:
+	case VM_MEMATTR_WRITE_BACK:
+		vmo->vmo_attr = attr;
+		return (0);
+	default:
+		break;
+	}
+	return (EINVAL);
+}
+
+void
+vm_object_reference(vm_object_t vmo)
+{
+	ASSERT(vmo != NULL);
+
+	uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt);
+	/* overflow would be a deadly serious mistake */
+	VERIFY3U(ref, !=, 0);
+}
+
+static vmspace_mapping_t *
+vm_mapping_find(struct vmspace *vms, uintptr_t addr, size_t size,
+    boolean_t no_lock)
+{
+	vmspace_mapping_t *vmsm;
+	list_t *ml = &vms->vms_maplist;
+	const uintptr_t range_end = addr + size;
+
+	ASSERT(addr <= range_end);
+
+	if (no_lock) {
+		/*
+		 * This check should be superfluous with the protections
+		 * promised by the bhyve logic
which calls into the VM shim. + * All the same, it is cheap to be paranoid. + */ + VERIFY(!vms->vms_map_changing); + } else { + VERIFY(MUTEX_HELD(&vms->vms_lock)); + } + + if (addr >= vms->vms_size) { + return (NULL); + } + for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { + const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len; + + if (addr >= vmsm->vmsm_addr && addr < seg_end) { + if (range_end <= seg_end) { + return (vmsm); + } else { + return (NULL); + } + } + } + return (NULL); +} + +static boolean_t +vm_mapping_gap(struct vmspace *vms, uintptr_t addr, size_t size) +{ + vmspace_mapping_t *vmsm; + list_t *ml = &vms->vms_maplist; + const uintptr_t range_end = addr + size; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + + for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { + const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len; + + if ((vmsm->vmsm_addr >= addr && vmsm->vmsm_addr < range_end) || + (seg_end > addr && seg_end < range_end)) { + return (B_FALSE); + } + } + return (B_TRUE); +} + +static void +vm_mapping_remove(struct vmspace *vms, vmspace_mapping_t *vmsm) +{ + list_t *ml = &vms->vms_maplist; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + ASSERT(vms->vms_map_changing); + + list_remove(ml, vmsm); + vm_object_deallocate(vmsm->vmsm_object); + kmem_free(vmsm, sizeof (*vmsm)); +} + +int +vm_fault(vm_map_t map, vm_offset_t off, vm_prot_t type, int flag) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + pmap_t pmap = &vms->vms_pmap; + void *pmi = pmap->pm_impl; + const uintptr_t addr = off; + vmspace_mapping_t *vmsm; + struct vm_object *vmo; + uint_t prot, map_lvl; + pfn_t pfn; + uintptr_t map_addr; + + mutex_enter(&vms->vms_lock); + if (vmspace_pmap_iswired(vms, addr, &prot) == 0) { + int err = 0; + + /* + * It is possible that multiple vCPUs will race to fault-in a + * given address. In such cases, the race loser(s) will + * encounter the already-mapped page, needing to do nothing + * more than consider it a success. + * + * If the fault exceeds protection, it is an obvious error. + */ + if ((prot & type) != type) { + err = FC_PROT; + } + + mutex_exit(&vms->vms_lock); + return (err); + } + + /* Try to wire up the address */ + if ((vmsm = vm_mapping_find(vms, addr, 0, B_FALSE)) == NULL) { + mutex_exit(&vms->vms_lock); + return (FC_NOMAP); + } + vmo = vmsm->vmsm_object; + prot = vmsm->vmsm_prot; + + /* XXXJOY: punt on large pages for now */ + pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, addr), NULL, NULL); + map_lvl = 0; + map_addr = P2ALIGN((uintptr_t)addr, LEVEL_SIZE(map_lvl)); + VERIFY(pfn != PFN_INVALID); + + /* + * If pmap failure is to be handled, the previously acquired page locks + * would need to be released. + */ + VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl, prot, + vmo->vmo_attr)); + pmap->pm_eptgen++; + + mutex_exit(&vms->vms_lock); + return (0); +} + +int +vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, + vm_prot_t prot, vm_page_t *ma, int max_count) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + const uintptr_t vaddr = addr; + vmspace_mapping_t *vmsm; + struct vm_object *vmo; + vm_page_t vmp; + + ASSERT0(addr & PAGEOFFSET); + ASSERT(len == PAGESIZE); + ASSERT(max_count == 1); + + /* + * Unlike practically all of the other logic that queries or + * manipulates vmspace objects, vm_fault_quick_hold_pages() does so + * without holding vms_lock. This is safe because bhyve ensures that + * changes to the vmspace map occur only when all other threads have + * been excluded from running. 
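+	 *
+	 * A sketch of the expected caller pattern (names hypothetical),
+	 * using the vm_page helpers defined at the bottom of this file:
+	 *
+	 *	vm_page_t vmp;
+	 *	if (vm_fault_quick_hold_pages(map, gpa, PAGESIZE,
+	 *	    PROT_READ, &vmp, 1) == 1) {
+	 *		(... use vmp->vmp_pfn ...)
+	 *		vm_page_lock(vmp);
+	 *		vm_page_unhold(vmp);
+	 *		vm_page_unlock(vmp);
+	 *	}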
+ * + * Since this task can count on vms_maplist remaining static and does + * not need to modify the pmap (like vm_fault might), it can proceed + * without the lock. The vm_object has independent refcount and lock + * protection, while the vmo_pager methods do not rely on vms_lock for + * safety. + * + * Performing this work without locks is critical in cases where + * multiple vCPUs require simultaneous instruction emulation, such as + * for frequent guest APIC accesses on a host that lacks hardware + * acceleration for that behavior. + */ + if ((vmsm = vm_mapping_find(vms, vaddr, PAGESIZE, B_TRUE)) == NULL || + (prot & ~vmsm->vmsm_prot) != 0) { + return (-1); + } + + vmp = kmem_zalloc(sizeof (struct vm_page), KM_SLEEP); + + vmo = vmsm->vmsm_object; + vm_object_reference(vmo); + vmp->vmp_obj_held = vmo; + vmp->vmp_pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, vaddr), NULL, + NULL); + + *ma = vmp; + return (1); +} + +/* + * Find a suitable location for a mapping (and install it). + */ +int +vm_map_find(vm_map_t map, vm_object_t vmo, vm_ooffset_t off, vm_offset_t *addr, + vm_size_t len, vm_offset_t max_addr, int find_flags, vm_prot_t prot, + vm_prot_t prot_max, int cow) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + const size_t size = (size_t)len; + const uintptr_t uoff = (uintptr_t)off; + uintptr_t base = *addr; + vmspace_mapping_t *vmsm; + int res = 0; + + /* For use in vmm only */ + VERIFY(find_flags == VMFS_NO_SPACE); /* essentially MAP_FIXED */ + VERIFY(max_addr == 0); + + if (size == 0 || off < 0 || + uoff >= (uoff + size) || vmo->vmo_size < (uoff + size)) { + return (EINVAL); + } + + if (*addr >= vms->vms_size) { + return (ENOMEM); + } + + vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP); + + mutex_enter(&vms->vms_lock); + vms->vms_map_changing = B_TRUE; + if (!vm_mapping_gap(vms, base, size)) { + res = ENOMEM; + goto out; + } + + if (res == 0) { + vmsm->vmsm_object = vmo; + vmsm->vmsm_addr = base; + vmsm->vmsm_len = len; + vmsm->vmsm_offset = (off_t)uoff; + vmsm->vmsm_prot = prot; + list_insert_tail(&vms->vms_maplist, vmsm); + + /* Communicate out the chosen address. 
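+		 * With VMFS_NO_SPACE this simply echoes back the fixed
+		 * base the caller requested; a hypothetical allocator
+		 * that searched for free space would report its choice
+		 * here instead.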
*/ + *addr = (vm_offset_t)base; + } +out: + vms->vms_map_changing = B_FALSE; + mutex_exit(&vms->vms_lock); + if (res != 0) { + kmem_free(vmsm, sizeof (*vmsm)); + } + return (res); +} + +int +vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + pmap_t pmap = &vms->vms_pmap; + void *pmi = pmap->pm_impl; + const uintptr_t addr = start; + const size_t size = (size_t)(end - start); + vmspace_mapping_t *vmsm; + + ASSERT(start < end); + + mutex_enter(&vms->vms_lock); + vms->vms_map_changing = B_TRUE; + /* expect to match existing mapping exactly */ + if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL || + vmsm->vmsm_addr != addr || vmsm->vmsm_len != size) { + vms->vms_map_changing = B_FALSE; + mutex_exit(&vms->vms_lock); + return (ENOENT); + } + + (void) pmap->pm_ops->vpo_unmap(pmi, addr, end); + pmap->pm_eptgen++; + + vm_mapping_remove(vms, vmsm); + vms->vms_map_changing = B_FALSE; + mutex_exit(&vms->vms_lock); + return (0); +} + +int +vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + pmap_t pmap = &vms->vms_pmap; + void *pmi = pmap->pm_impl; + const uintptr_t addr = start; + const size_t size = end - start; + vmspace_mapping_t *vmsm; + struct vm_object *vmo; + uint_t prot; + + mutex_enter(&vms->vms_lock); + + /* For the time being, only exact-match mappings are expected */ + if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) { + mutex_exit(&vms->vms_lock); + return (FC_NOMAP); + } + vmo = vmsm->vmsm_object; + prot = vmsm->vmsm_prot; + + for (uintptr_t pos = addr; pos < end; ) { + pfn_t pfn; + uintptr_t pg_size, map_addr; + uint_t map_lvl = 0; + + /* XXXJOY: punt on large pages for now */ + pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, pos), NULL, NULL); + pg_size = LEVEL_SIZE(map_lvl); + map_addr = P2ALIGN(pos, pg_size); + VERIFY(pfn != PFN_INVALID); + + VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl, + prot, vmo->vmo_attr)); + vms->vms_pmap.pm_eptgen++; + + pos += pg_size; + } + + mutex_exit(&vms->vms_lock); + + return (0); +} + +/* Provided custom for bhyve 'devmem' segment mapping */ +int +vm_segmap_obj(struct vmspace *vms, vm_object_t vmo, struct as *as, + caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags) +{ + const size_t size = vmo->vmo_size; + int err; + + if (vmo->vmo_type != OBJT_DEFAULT) { + /* Only support default objects for now */ + return (ENOTSUP); + } + + as_rangelock(as); + + err = choose_addr(as, addrp, size, 0, ADDR_VACALIGN, flags); + if (err == 0) { + segvmm_crargs_t svma; + + svma.kaddr = vmo->vmo_data; + svma.prot = prot; + svma.cookie = vmo; + svma.hold = (segvmm_holdfn_t)vm_object_reference; + svma.rele = (segvmm_relefn_t)vm_object_deallocate; + + err = as_map(as, *addrp, size, segvmm_create, &svma); + } + + as_rangeunlock(as); + return (err); +} + +int +vm_segmap_space(struct vmspace *vms, off_t off, struct as *as, caddr_t *addrp, + off_t len, uint_t prot, uint_t maxprot, uint_t flags) +{ + const uintptr_t addr = (uintptr_t)off; + const size_t size = (uintptr_t)len; + vmspace_mapping_t *vmsm; + vm_object_t vmo; + int err; + + if (off < 0 || len <= 0 || + (addr & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) { + return (EINVAL); + } + + mutex_enter(&vms->vms_lock); + if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) { + mutex_exit(&vms->vms_lock); + return (ENXIO); + } + if ((prot & ~(vmsm->vmsm_prot | PROT_USER)) != 0) { + mutex_exit(&vms->vms_lock); + return (EACCES); + } + 
vmo = vmsm->vmsm_object; + if (vmo->vmo_type != OBJT_DEFAULT) { + /* Only support default objects for now */ + mutex_exit(&vms->vms_lock); + return (ENOTSUP); + } + + as_rangelock(as); + + err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags); + if (err == 0) { + segvmm_crargs_t svma; + const uintptr_t addroff = addr - vmsm->vmsm_addr; + const uintptr_t mapoff = addroff + vmsm->vmsm_offset; + + VERIFY(addroff < vmsm->vmsm_len); + VERIFY((vmsm->vmsm_len - addroff) >= size); + VERIFY(mapoff < vmo->vmo_size); + VERIFY((mapoff + size) <= vmo->vmo_size); + + svma.kaddr = (void *)((uintptr_t)vmo->vmo_data + mapoff); + svma.prot = prot; + svma.cookie = vmo; + svma.hold = (segvmm_holdfn_t)vm_object_reference; + svma.rele = (segvmm_relefn_t)vm_object_deallocate; + + err = as_map(as, *addrp, len, segvmm_create, &svma); + } + + as_rangeunlock(as); + mutex_exit(&vms->vms_lock); + return (err); +} + +void +vm_page_lock(vm_page_t vmp) +{ + ASSERT(!MUTEX_HELD(&vmp->vmp_lock)); + + mutex_enter(&vmp->vmp_lock); +} + +void +vm_page_unlock(vm_page_t vmp) +{ + boolean_t purge = (vmp->vmp_pfn == PFN_INVALID); + + ASSERT(MUTEX_HELD(&vmp->vmp_lock)); + + mutex_exit(&vmp->vmp_lock); + + if (purge) { + mutex_destroy(&vmp->vmp_lock); + kmem_free(vmp, sizeof (*vmp)); + } +} + +void +vm_page_unhold(vm_page_t vmp) +{ + ASSERT(MUTEX_HELD(&vmp->vmp_lock)); + VERIFY(vmp->vmp_pfn != PFN_INVALID); + + vm_object_deallocate(vmp->vmp_obj_held); + vmp->vmp_obj_held = NULL; + vmp->vmp_pfn = PFN_INVALID; +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.c b/usr/src/uts/i86pc/io/vmm/vmm_stat.c new file mode 100644 index 0000000000..2cbcce9590 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.c @@ -0,0 +1,172 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include "vmm_util.h" +#include "vmm_stat.h" + +/* + * 'vst_num_elems' is the total number of addressable statistic elements + * 'vst_num_types' is the number of unique statistic types + * + * It is always true that 'vst_num_elems' is greater than or equal to + * 'vst_num_types'. 
This is because a stat type may represent more than + * one element (for e.g. VMM_STAT_ARRAY). + */ +static int vst_num_elems, vst_num_types; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +#define vst_size ((size_t)vst_num_elems * sizeof(uint64_t)) + +void +vmm_stat_register(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vst->scope == VMM_STAT_SCOPE_INTEL && !vmm_is_intel()) + return; + + if (vst->scope == VMM_STAT_SCOPE_AMD && !vmm_is_amd()) + return; + + if (vst_num_elems + vst->nelems >= MAX_VMM_STAT_ELEMS) { + printf("Cannot accommodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vst_num_elems; + vst_num_elems += vst->nelems; + + vsttab[vst_num_types++] = vst; +} + +int +vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf) +{ + struct vmm_stat_type *vst; + uint64_t *stats; + int i; + + if (vcpu < 0 || vcpu >= vm_get_maxcpus(vm)) + return (EINVAL); + + /* Let stats functions update their counters */ + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (vst->func != NULL) + (*vst->func)(vm, vcpu, vst); + } + + /* Copy over the stats */ + stats = vcpu_stats(vm, vcpu); + for (i = 0; i < vst_num_elems; i++) + buf[i] = stats[i]; + *num_stats = vst_num_elems; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + + return (malloc(vst_size, M_VMM_STAT, M_WAITOK)); +} + +void +vmm_stat_init(void *vp) +{ + + bzero(vp, vst_size); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +int +vmm_stat_desc_copy(int index, char *buf, int bufsize) +{ + int i; + struct vmm_stat_type *vst; + + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (index >= vst->index && index < vst->index + vst->nelems) { + if (vst->nelems > 1) { + snprintf(buf, bufsize, "%s[%d]", + vst->desc, index - vst->index); + } else { + strlcpy(buf, vst->desc, bufsize); + } + return (0); /* found it */ + } + } + + return (EINVAL); +} + +/* global statistics */ +VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); +VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt"); +VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted"); +VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted"); +VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted"); +VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted"); +VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits"); +VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted"); +VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); +VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); +VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); +VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); +VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); +VMM_STAT(VMEXIT_INST_EMUL, "vm exits for instruction emulation"); +VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); +VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); +VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit"); +VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); +VMM_STAT(VMEXIT_RUNBLOCK, "number of times runblock at exit"); +VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); diff --git 
a/usr/src/uts/i86pc/io/vmm/vmm_stat.h b/usr/src/uts/i86pc/io/vmm/vmm_stat.h index 9bf7a60e0b..3232e23888 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_stat.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -10,7 +12,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -26,15 +28,24 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_stat.h 250427 2013-05-10 02:59:49Z neel $ + * $FreeBSD$ + */ +/* + * Copyright 2018 Joyent, Inc. */ #ifndef _VMM_STAT_H_ #define _VMM_STAT_H_ +#include + struct vm; -#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ +#ifdef __FreeBSD__ +#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ +#else +#define MAX_VMM_STAT_ELEMS (64 + VM_MAXCPU) /* arbitrary */ +#endif enum vmm_stat_scope { VMM_STAT_SCOPE_ANY, @@ -42,20 +53,28 @@ enum vmm_stat_scope { VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */ }; +struct vmm_stat_type; +typedef void (*vmm_stat_func_t)(struct vm *vm, int vcpu, + struct vmm_stat_type *stat); + struct vmm_stat_type { int index; /* position in the stats buffer */ int nelems; /* standalone or array */ const char *desc; /* description of statistic */ + vmm_stat_func_t func; enum vmm_stat_scope scope; }; -void vmm_stat_init(void *arg); +void vmm_stat_register(void *arg); -#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ +#define VMM_STAT_FDEFINE(type, nelems, desc, func, scope) \ struct vmm_stat_type type[1] = { \ - { -1, nelems, desc, scope } \ + { -1, nelems, desc, func, scope } \ }; \ - SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type) + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type) + +#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ + VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope) #define VMM_STAT_DECLARE(type) \ extern struct vmm_stat_type type[1] @@ -67,10 +86,14 @@ void vmm_stat_init(void *arg); #define VMM_STAT_AMD(type, desc) \ VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD) +#define VMM_STAT_FUNC(type, desc, func) \ + VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY) + #define VMM_STAT_ARRAY(type, nelems, desc) \ VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) void *vmm_stat_alloc(void); +void vmm_stat_init(void *vp); void vmm_stat_free(void *vp); /* @@ -79,7 +102,7 @@ void vmm_stat_free(void *vp); int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); int vmm_stat_desc_copy(int index, char *buf, int buflen); -static void __inline +static __inline void vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, int statidx, uint64_t x) { @@ -92,9 +115,22 @@ vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, stats[vst->index + statidx] += x; #endif } - -static void __inline +static __inline void +vmm_stat_array_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t val) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && 
statidx < vst->nelems) + stats[vst->index + statidx] = val; +#endif +} + +static __inline void vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) { @@ -103,6 +139,15 @@ vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) #endif } +static __inline void +vmm_stat_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t val) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_set(vm, vcpu, vst, 0, val); +#endif +} + VMM_STAT_DECLARE(VCPU_MIGRATIONS); VMM_STAT_DECLARE(VMEXIT_COUNT); VMM_STAT_DECLARE(VMEXIT_EXTINT); @@ -121,7 +166,7 @@ VMM_STAT_DECLARE(VMEXIT_INST_EMUL); VMM_STAT_DECLARE(VMEXIT_UNKNOWN); VMM_STAT_DECLARE(VMEXIT_ASTPENDING); VMM_STAT_DECLARE(VMEXIT_USERSPACE); -VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS); -VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_RUNBLOCK); VMM_STAT_DECLARE(VMEXIT_EXCEPTION); +VMM_STAT_DECLARE(VMEXIT_REQIDLE); #endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_support.s b/usr/src/uts/i86pc/io/vmm/vmm_support.s new file mode 100644 index 0000000000..5777d46959 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_support.s @@ -0,0 +1,54 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2019 Joyent, Inc. + */ + +#include +#include + +/* + * %rdi = trapno + * + * This variant is for any explicit exception injection that we need: in this + * case, we can't just, for example, do a direct "int $2", as that will then + * trash our %cr3 via tr_nmiint due to KPTI, so we have to fake a trap frame. + * Both NMIs and MCEs don't push an 'err' into the frame. + */ +ENTRY_NP(vmm_call_trap) + pushq %rbp + movq %rsp, %rbp + movq %rsp, %r11 + andq $~0xf, %rsp /* align stack */ + pushq $KDS_SEL /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KCS_SEL /* %cs */ + leaq .trap_iret_dest(%rip), %rcx + pushq %rcx /* %rip */ + cli + cmpq $T_NMIFLT, %rdi + je nmiint + cmpq $T_MCE, %rdi + je mcetrap + + pushq %rdi /* save our bad trapno... */ + leaq __vmm_call_bad_trap(%rip), %rdi + xorl %eax, %eax + call panic + /*NOTREACHED*/ + +.trap_iret_dest: + popq %rbp + ret +SET_SIZE(vmm_call_trap) + +__vmm_call_bad_trap: + .string "bad trapno for vmm_call_trap()" diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.c b/usr/src/uts/i86pc/io/vmm/vmm_util.c index fabd42e13c..3eadfe57e5 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_util.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_util.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
 *
- * $FreeBSD: head/sys/amd64/vmm/vmm_util.c 245678 2013-01-20 03:42:49Z neel $
+ * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
@@ -39,7 +41,7 @@
 */

 #include
-__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_util.c 245678 2013-01-20 03:42:49Z neel $");
+__FBSDID("$FreeBSD$");

 #include
 #include
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.h b/usr/src/uts/i86pc/io/vmm/vmm_util.h
index fe1c1c9449..fc7e7364c7 100644
--- a/usr/src/uts/i86pc/io/vmm/vmm_util.h
+++ b/usr/src/uts/i86pc/io/vmm/vmm_util.h
@@ -1,4 +1,6 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
  * Copyright (c) 2011 NetApp, Inc.
  * All rights reserved.
  *
@@ -23,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/amd64/vmm/vmm_util.h 245678 2013-01-20 03:42:49Z neel $
+ * $FreeBSD$
 */

 #ifndef _VMM_UTIL_H_
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_zsd.c b/usr/src/uts/i86pc/io/vmm/vmm_zsd.c
new file mode 100644
index 0000000000..0271cc339e
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_zsd.c
@@ -0,0 +1,218 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2018, Joyent, Inc.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * zone specific data
+ *
+ * Zone specific data is used to keep an association between zones and the vmm
+ * instances that may be running in them. This is used to ensure that vmm
+ * instances do not outlive their parent zone.
+ *
+ * Locking strategy
+ *
+ * The global vmm_zsd_lock is held while modifying vmm_zsd_list.
+ *
+ * The per zone vz_lock in vmm_zsd_t is held while reading or writing anything
+ * within a vmm_zsd_t instance. This is important to ensure that there's not
+ * an accidental VM creation as a zone is going down.
+ */
+
+/*
+ * One of these per zone.
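+ *
+ * Instances are created and destroyed via the ZSD callbacks registered
+ * in vmm_zsd_init() below:
+ *
+ *	zone_key_create(&vmm_zsd_key, vmm_zsd_create, vmm_zsd_shutdown,
+ *	    vmm_zsd_destroy);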
+ */
+struct vmm_zsd {
+	list_t		vz_vmms;	/* vmm instances in the zone */
+	list_node_t	vz_linkage;	/* link to other zones */
+	boolean_t	vz_active;	/* B_FALSE early in shutdown callback */
+	zoneid_t	vz_zoneid;
+	kmutex_t	vz_lock;
+};
+
+static kmutex_t vmm_zsd_lock;	/* Protects vmm_zsd_list */
+static list_t vmm_zsd_list;	/* Linkage between all zsd instances */
+
+static zone_key_t vmm_zsd_key;
+
+int
+vmm_zsd_add_vm(vmm_softc_t *sc)
+{
+	vmm_zsd_t *zsd;
+
+	ASSERT(sc->vmm_zone != NULL);
+
+	mutex_enter(&vmm_zsd_lock);
+
+	for (zsd = list_head(&vmm_zsd_list); zsd != NULL;
+	    zsd = list_next(&vmm_zsd_list, zsd)) {
+		if (zsd->vz_zoneid == sc->vmm_zone->zone_id) {
+			break;
+		}
+	}
+
+	VERIFY(zsd != NULL);
+	mutex_exit(&vmm_zsd_lock);
+
+	mutex_enter(&zsd->vz_lock);
+	if (!zsd->vz_active) {
+		mutex_exit(&zsd->vz_lock);
+		return (ENOSYS);
+	}
+
+	sc->vmm_zsd = zsd;
+	list_insert_tail(&zsd->vz_vmms, sc);
+
+	mutex_exit(&zsd->vz_lock);
+
+	return (0);
+}
+
+void
+vmm_zsd_rem_vm(vmm_softc_t *sc)
+{
+	vmm_zsd_t *zsd = sc->vmm_zsd;
+
+	mutex_enter(&zsd->vz_lock);
+
+	list_remove(&zsd->vz_vmms, sc);
+	sc->vmm_zsd = NULL;
+
+	mutex_exit(&zsd->vz_lock);
+}
+
+static void *
+vmm_zsd_create(zoneid_t zid)
+{
+	vmm_zsd_t *zsd;
+	zone_t *zone;
+
+	zsd = kmem_zalloc(sizeof (*zsd), KM_SLEEP);
+
+	list_create(&zsd->vz_vmms, sizeof (vmm_softc_t),
+	    offsetof(vmm_softc_t, vmm_zsd_linkage));
+
+	zsd->vz_zoneid = zid;
+
+	mutex_init(&zsd->vz_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	/*
+	 * If the vmm module is loaded while this zone is in the midst of
+	 * shutting down, vmm_zsd_destroy() may be called without
+	 * vmm_zsd_shutdown() ever being called. If it is shutting down, there
+	 * is no sense in letting any in-flight VM creation succeed so set
+	 * vz_active accordingly.
+	 *
+	 * zone_find_by_id_nolock() is used rather than zone_find_by_id()
+	 * so that the zone is returned regardless of state.
+	 */
+	zone = zone_find_by_id_nolock(zid);
+	VERIFY(zone != NULL);
+	zsd->vz_active = zone_status_get(zone) < ZONE_IS_SHUTTING_DOWN;
+
+	mutex_enter(&vmm_zsd_lock);
+	list_insert_tail(&vmm_zsd_list, zsd);
+	mutex_exit(&vmm_zsd_lock);
+
+	return (zsd);
+}
+
+/*
+ * Tells all running VMs in the zone to power off. This does not reclaim guest
+ * resources (memory, etc.).
+ */
+static void
+vmm_zsd_shutdown(zoneid_t zid, void *data)
+{
+	vmm_zsd_t *zsd = data;
+	vmm_softc_t *sc;
+
+	mutex_enter(&zsd->vz_lock);
+
+	/*
+	 * This may already be B_FALSE. See comment in vmm_zsd_create(). If it
+	 * is already B_FALSE we will take a quick trip through the empty list.
+	 */
+	zsd->vz_active = B_FALSE;
+
+	for (sc = list_head(&zsd->vz_vmms); sc != NULL;
+	    sc = list_next(&zsd->vz_vmms, sc)) {
+		/* Send a poweroff to the VM, whether running or not. */
+		(void) vm_suspend(sc->vmm_vm, VM_SUSPEND_POWEROFF);
+	}
+	mutex_exit(&zsd->vz_lock);
+}
+
+/*
+ * Reap all VMs that remain and free up guest resources.
+ */
+static void
+vmm_zsd_destroy(zoneid_t zid, void *data)
+{
+	vmm_zsd_t *zsd = data;
+	vmm_softc_t *sc;
+
+	mutex_enter(&vmm_zsd_lock);
+	list_remove(&vmm_zsd_list, zsd);
+	mutex_exit(&vmm_zsd_lock);
+
+	mutex_enter(&zsd->vz_lock);
+	ASSERT(!zsd->vz_active);
+
+	while ((sc = list_remove_head(&zsd->vz_vmms)) != NULL) {
+		int err;
+
+		/*
+		 * This frees all resources associated with the vm, including
+		 * sc.
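+		 * Since sc itself is freed, it must not be touched once
+		 * vmm_do_vm_destroy() returns; list_remove_head() above
+		 * has already unlinked it from the zone's list.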
+ */ + err = vmm_do_vm_destroy(sc, B_FALSE); + ASSERT3S(err, ==, 0); + } + + mutex_exit(&zsd->vz_lock); + mutex_destroy(&zsd->vz_lock); + + kmem_free(zsd, sizeof (*zsd)); +} + +void +vmm_zsd_init(void) +{ + mutex_init(&vmm_zsd_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&vmm_zsd_list, sizeof (vmm_zsd_t), + offsetof(vmm_zsd_t, vz_linkage)); + zone_key_create(&vmm_zsd_key, vmm_zsd_create, vmm_zsd_shutdown, + vmm_zsd_destroy); +} + +void +vmm_zsd_fini(void) +{ + /* Calls vmm_zsd_destroy() on all zones. */ + zone_key_delete(vmm_zsd_key); + ASSERT(list_is_empty(&vmm_zsd_list)); + + list_destroy(&vmm_zsd_list); + mutex_destroy(&vmm_zsd_lock); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmx_assym.s b/usr/src/uts/i86pc/io/vmm/vmx_assym.s deleted file mode 100644 index d84ca30275..0000000000 --- a/usr/src/uts/i86pc/io/vmm/vmx_assym.s +++ /dev/null @@ -1 +0,0 @@ -#include "vmx_assym.h" diff --git a/usr/src/uts/i86pc/io/vmm/x86.c b/usr/src/uts/i86pc/io/vmm/x86.c index 02222ef5e7..d74f866013 100644 --- a/usr/src/uts/i86pc/io/vmm/x86.c +++ b/usr/src/uts/i86pc/io/vmm/x86.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/x86.c 255645 2013-09-17 17:56:53Z grehan $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,38 +38,69 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #include -__FBSDID("$FreeBSD: head/sys/amd64/vmm/x86.c 255645 2013-09-17 17:56:53Z grehan $"); +__FBSDID("$FreeBSD$"); #include -#include +#include #include -#include +#include +#include #include #include #include +#include #include #include +#include "vmm_host.h" +#include "vmm_ktr.h" +#include "vmm_util.h" #include "x86.h" +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL); + #define CPUID_VM_HIGH 0x40000000 static const char bhyve_id[12] = "bhyve bhyve "; static uint64_t bhyve_xcpuids; +SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0, + "Number of times an unknown cpuid leaf was accessed"); + +static int cpuid_leaf_b = 1; +SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN, + &cpuid_leaf_b, 0, NULL); + +/* + * Round up to the next power of two, if necessary, and then take log2. + * Returns -1 if argument is zero. + */ +static __inline int +log2(u_int x) +{ + + return (fls(x << (1 - powerof2(x))) - 1); +} int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { - int error; - unsigned int func, regs[4]; + const struct xsave_limits *limits; + uint64_t cr4; + int error, enable_invpcid, level, width = 0, x2apic_id = 0; + unsigned int func, regs[4], logical_cpus = 0; enum x2apic_state x2apic_state; + uint16_t cores, maxcpus, sockets, threads; + + VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx); /* * Requests for invalid CPUID levels should map to the highest @@ -102,26 +135,108 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, case CPUID_8000_0003: case CPUID_8000_0004: case CPUID_8000_0006: + cpuid_count(*eax, *ecx, regs); + break; case CPUID_8000_0008: cpuid_count(*eax, *ecx, regs); + if (vmm_is_amd()) { + /* + * As on Intel (0000_0007:0, EDX), mask out + * unsupported or unsafe AMD extended features + * (8000_0008 EBX). 
+ */ + regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF | + AMDFEID_XSAVEERPTR); + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + /* + * Here, width is ApicIdCoreIdSize, present on + * at least Family 15h and newer. It + * represents the "number of bits in the + * initial apicid that indicate thread id + * within a package." + * + * Our topo_probe_amd() uses it for + * pkg_id_shift and other OSes may rely on it. + */ + width = MIN(0xF, log2(threads * cores)); + if (width < 0x4) + width = 0; + logical_cpus = MIN(0xFF, threads * cores - 1); + regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus; + } break; case CPUID_8000_0001: + cpuid_count(*eax, *ecx, regs); + + /* + * Hide SVM from guest. + */ + regs[2] &= ~AMDID2_SVM; + + /* + * Don't advertise extended performance counter MSRs + * to the guest. + */ + regs[2] &= ~AMDID2_PCXC; + regs[2] &= ~AMDID2_PNXC; + regs[2] &= ~AMDID2_PTSCEL2I; + + /* + * Don't advertise Instruction Based Sampling feature. + */ + regs[2] &= ~AMDID2_IBS; + + /* NodeID MSR not available */ + regs[2] &= ~AMDID2_NODE_ID; + + /* Don't advertise the OS visible workaround feature */ + regs[2] &= ~AMDID2_OSVW; + + /* Hide mwaitx/monitorx capability from the guest */ + regs[2] &= ~AMDID2_MWAITX; + +#ifndef __FreeBSD__ + /* + * Detection routines for TCE and FFXSR are missing + * from our vm_cpuid_capability() detection logic + * today. Mask them out until that is remedied. + * They do not appear to be in common usage, so their + * absence should not cause undue trouble. + */ + regs[2] &= ~AMDID2_TCE; + regs[3] &= ~AMDID_FFXSR; +#endif + /* * Hide rdtscp/ia32_tsc_aux until we know how * to deal with them. */ - cpuid_count(*eax, *ecx, regs); regs[3] &= ~AMDID_RDTSCP; break; case CPUID_8000_0007: - cpuid_count(*eax, *ecx, regs); -#ifdef __FreeBSD__ /* - * If the host TSCs are not synchronized across - * physical cpus then we cannot advertise an - * invariant tsc to a vcpu. + * AMD uses this leaf to advertise the processor's + * power monitoring and RAS capabilities. These + * features are hardware-specific and exposing + * them to a guest doesn't make a lot of sense. + * + * Intel uses this leaf only to advertise the + * "Invariant TSC" feature with all other bits + * being reserved (set to zero). + */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* + * "Invariant TSC" can be advertised to the guest if: + * - host TSC frequency is invariant + * - host TSCs are synchronized across physical cpus * * XXX This still falls short because the vcpu * can observe the TSC moving backwards as it @@ -129,9 +244,73 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, * it should discourage the guest from using the * TSC to keep track of time. */ - if (!smp_tsc) - regs[3] &= ~AMDPM_TSC_INVARIANT; -#endif +#ifdef __FreeBSD__ + /* XXXJOY: Wire up with our own TSC logic */ + if (tsc_is_invariant && smp_tsc) + regs[3] |= AMDPM_TSC_INVARIANT; +#endif /* __FreeBSD__ */ + break; + + case CPUID_8000_001D: + /* AMD Cache topology, like 0000_0004 for Intel. */ + if (!vmm_is_amd()) + goto default_leaf; + + /* + * Similar to Intel, generate a ficticious cache + * topology for the guest with L3 shared by the + * package, and L1 and L2 local to a core. 
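+		 *
+		 * The EAX value assembled below encodes (an informal
+		 * sketch of the layout used here):
+		 *	bits  4:0  - cache type (func)
+		 *	bits  7:5  - cache level
+		 *	bit   8    - self-initializing
+		 *	bits 25:14 - sharing logical CPUs, minus one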
+ */ + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + switch (*ecx) { + case 0: + logical_cpus = threads; + level = 1; + func = 1; /* data cache */ + break; + case 1: + logical_cpus = threads; + level = 2; + func = 3; /* unified cache */ + break; + case 2: + logical_cpus = threads * cores; + level = 3; + func = 3; /* unified cache */ + break; + default: + logical_cpus = 0; + level = 0; + func = 0; + break; + } + + logical_cpus = MIN(0xfff, logical_cpus - 1); + regs[0] = (logical_cpus << 14) | (1 << 8) | + (level << 5) | func; + regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_8000_001E: + /* AMD Family 16h+ additional identifiers */ + if (!vmm_is_amd() || CPUID_TO_FAMILY(cpu_id) < 0x16) + goto default_leaf; + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] = vcpu_id; + threads = MIN(0xFF, threads - 1); + regs[1] = (threads << 8) | + (vcpu_id >> log2(threads + 1)); + /* + * XXX Bhyve topology cannot yet represent >1 node per + * processor. + */ + regs[2] = 0; + regs[3] = 0; break; case CPUID_0000_0001: @@ -150,22 +329,41 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); /* - * Don't expose VMX, SpeedStep or TME capability. + * Don't expose VMX, SpeedStep, TME or SMX capability. * Advertise x2APIC capability and Hypervisor guest. */ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); + regs[2] &= ~(CPUID2_SMX); regs[2] |= CPUID2_HV; if (x2apic_state != X2APIC_DISABLED) regs[2] |= CPUID2_X2APIC; + else + regs[2] &= ~CPUID2_X2APIC; /* - * Hide xsave/osxsave/avx until the FPU save/restore - * issues are resolved + * Only advertise CPUID2_XSAVE in the guest if + * the host is using XSAVE. */ - regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE | - CPUID2_AVX); + if (!(regs[2] & CPUID2_OSXSAVE)) + regs[2] &= ~CPUID2_XSAVE; + + /* + * If CPUID2_XSAVE is being advertised and the + * guest has set CR4_XSAVE, set + * CPUID2_OSXSAVE. + */ + regs[2] &= ~CPUID2_OSXSAVE; + if (regs[2] & CPUID2_XSAVE) { + error = vm_get_register(vm, vcpu_id, + VM_REG_GUEST_CR4, &cr4); + if (error) + panic("x86_emulate_cpuid: error %d " + "fetching %%cr4", error); + if (cr4 & CR4_XSAVE) + regs[2] |= CPUID2_OSXSAVE; + } /* * Hide monitor/mwait until we know how to deal with @@ -177,7 +375,7 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, * Hide the performance and debug features. */ regs[2] &= ~CPUID2_PDCM; - + /* * No TSC deadline support in the APIC yet */ @@ -187,48 +385,95 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, * Hide thermal monitoring */ regs[3] &= ~(CPUID_ACPI | CPUID_TM); - + /* - * Machine check handling is done in the host. + * Hide the debug store capability. */ - regs[3] &= ~(CPUID_MCA | CPUID_MCE); - - /* - * Hide the debug store capability. - */ regs[3] &= ~CPUID_DS; /* - * Disable multi-core. + * Advertise the Machine Check and MTRR capability. + * + * Some guest OSes (e.g. Windows) will not boot if + * these features are absent. */ + regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + logical_cpus = threads * cores; regs[1] &= ~CPUID_HTT_CORES; - regs[3] &= ~CPUID_HTT; + regs[1] |= (logical_cpus & 0xff) << 16; + regs[3] |= CPUID_HTT; break; case CPUID_0000_0004: - do_cpuid(4, regs); + cpuid_count(*eax, *ecx, regs); - /* - * Do not expose topology. 
- */ - regs[0] &= 0xffff8000; - /* - * The maximum number of processor cores in - * this physical processor package and the - * maximum number of threads sharing this - * cache are encoded with "plus 1" encoding. - * Adding one to the value in this register - * field to obtains the actual value. - * - * Therefore 0 for both indicates 1 core - * per package and no cache sharing. - */ + if (regs[0] || regs[1] || regs[2] || regs[3]) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] &= 0x3ff; + regs[0] |= (cores - 1) << 26; + /* + * Cache topology: + * - L1 and L2 are shared only by the logical + * processors in a single core. + * - L3 and above are shared by all logical + * processors in the package. + */ + logical_cpus = threads; + level = (regs[0] >> 5) & 0x7; + if (level >= 3) + logical_cpus *= cores; + regs[0] |= (logical_cpus - 1) << 14; + } break; - case CPUID_0000_0006: case CPUID_0000_0007: + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* leaf 0 */ + if (*ecx == 0) { + cpuid_count(*eax, *ecx, regs); + + /* Only leaf 0 is supported */ + regs[0] = 0; + + /* + * Expose known-safe features. + */ + regs[1] &= (CPUID_STDEXT_FSGSBASE | + CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | + CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 | + CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | + CPUID_STDEXT_AVX512F | + CPUID_STDEXT_RDSEED | + CPUID_STDEXT_AVX512PF | + CPUID_STDEXT_AVX512ER | + CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA); + regs[2] = 0; + regs[3] &= CPUID_STDEXT3_MD_CLEAR; + + /* Advertise INVPCID if it is enabled. */ + error = vm_get_capability(vm, vcpu_id, + VM_CAP_ENABLE_INVPCID, &enable_invpcid); + if (error == 0 && enable_invpcid) + regs[1] |= CPUID_STDEXT_INVPCID; + } + break; + + case CPUID_0000_0006: + regs[0] = CPUTPM1_ARAT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + case CPUID_0000_000A: - case CPUID_0000_000D: /* * Handle the access, but report 0 for * all options @@ -241,12 +486,93 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, case CPUID_0000_000B: /* - * Processor topology enumeration + * Intel processor topology enumeration */ - regs[0] = 0; - regs[1] = 0; - regs[2] = *ecx & 0xff; - regs[3] = vcpu_id; + if (vmm_is_intel()) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + if (*ecx == 0) { + logical_cpus = threads; + width = log2(logical_cpus); + level = CPUID_TYPE_SMT; + x2apic_id = vcpu_id; + } + + if (*ecx == 1) { + logical_cpus = threads * cores; + width = log2(logical_cpus); + level = CPUID_TYPE_CORE; + x2apic_id = vcpu_id; + } + + if (!cpuid_leaf_b || *ecx >= 2) { + width = 0; + logical_cpus = 0; + level = 0; + x2apic_id = 0; + } + + regs[0] = width & 0x1f; + regs[1] = logical_cpus & 0xffff; + regs[2] = (level << 8) | (*ecx & 0xff); + regs[3] = x2apic_id; + } else { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + + case CPUID_0000_000D: + limits = vmm_get_xsave_limits(); + if (!limits->xsave_enabled) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + } + + cpuid_count(*eax, *ecx, regs); + switch (*ecx) { + case 0: + /* + * Only permit the guest to use bits + * that are active in the host in + * %xcr0. Also, claim that the + * maximum save area size is + * equivalent to the host's current + * save area size. Since this runs + * "inside" of vmrun(), it runs with + * the guest's xcr0, so the current + * save area size is correct as-is. 
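+			 *
+			 * Illustrative example (hypothetical host): with
+			 * x87, SSE and AVX permitted in %xcr0, only those
+			 * three bits survive in EAX/EDX, and ECX reports
+			 * the host's in-use save area size.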
+ */ + regs[0] &= limits->xcr0_allowed; + regs[2] = limits->xsave_max_size; + regs[3] &= (limits->xcr0_allowed >> 32); + break; + case 1: + /* Only permit XSAVEOPT. */ + regs[0] &= CPUID_EXTSTATE_XSAVEOPT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + default: + /* + * If the leaf is for a permitted feature, + * pass through as-is, otherwise return + * all zeroes. + */ + if (!(limits->xcr0_allowed & (1ul << *ecx))) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + } break; case 0x40000000: @@ -257,6 +583,7 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, break; default: +default_leaf: /* * The leaf value has already been clamped so * simply pass this through, keeping count of @@ -274,3 +601,45 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, return (1); } + +bool +vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap) +{ + bool rv; + + KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d", + __func__, cap)); + + /* + * Simply passthrough the capabilities of the host cpu for now. + */ + rv = false; + switch (cap) { +#ifdef __FreeBSD__ + case VCC_NO_EXECUTE: + if (amd_feature & AMDID_NX) + rv = true; + break; + case VCC_FFXSR: + if (amd_feature & AMDID_FFXSR) + rv = true; + break; + case VCC_TCE: + if (amd_feature2 & AMDID2_TCE) + rv = true; + break; +#else + case VCC_NO_EXECUTE: + if (is_x86_feature(x86_featureset, X86FSET_NX)) + rv = true; + break; + /* XXXJOY: No kernel detection for FFXR or TCE at present, so ignore */ + case VCC_FFXSR: + case VCC_TCE: + break; +#endif + default: + panic("%s: unknown vm_cpu_capability %d", __func__, cap); + } + return (rv); +} diff --git a/usr/src/uts/i86pc/io/vmm/x86.h b/usr/src/uts/i86pc/io/vmm/x86.h index db2340b37b..0d70c04fd8 100644 --- a/usr/src/uts/i86pc/io/vmm/x86.h +++ b/usr/src/uts/i86pc/io/vmm/x86.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/x86.h 255287 2013-09-06 05:16:10Z grehan $ + * $FreeBSD$ */ #ifndef _X86_H_ @@ -47,6 +49,8 @@ #define CPUID_8000_0006 (0x80000006) #define CPUID_8000_0007 (0x80000007) #define CPUID_8000_0008 (0x80000008) +#define CPUID_8000_001D (0x8000001D) +#define CPUID_8000_001E (0x8000001E) /* * CPUID instruction Fn0000_0001: @@ -62,4 +66,17 @@ int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx); +enum vm_cpuid_capability { + VCC_NONE, + VCC_NO_EXECUTE, + VCC_FFXSR, + VCC_TCE, + VCC_LAST +}; + +/* + * Return 'true' if the capability 'cap' is enabled in this virtual cpu + * and 'false' otherwise. + */ +bool vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability); #endif diff --git a/usr/src/uts/i86pc/os/gipt.c b/usr/src/uts/i86pc/os/gipt.c new file mode 100644 index 0000000000..ace7e03438 --- /dev/null +++ b/usr/src/uts/i86pc/os/gipt.c @@ -0,0 +1,566 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+/*
+ * Generic Indexed Page Table
+ *
+ * There are several applications, such as hardware virtualization or IOMMU
+ * control, which require construction of a page table tree to represent a
+ * virtual address space. Many features of the existing htable system would be
+ * convenient for this, but its tight coupling to the VM system makes it
+ * undesirable for independent consumers. The GIPT interface exists to provide
+ * page table allocation and indexing on top of which a table hierarchy
+ * (EPT, VT-d, etc) can be built by upstack logic.
+ *
+ * Types:
+ *
+ * gipt_t - Represents a single page table with a physical backing page and
+ * associated metadata.
+ * gipt_map_t - The workhorse of this facility, it contains a hash table to
+ * index all of the gipt_t entries which make up the page table tree.
+ * struct gipt_cbs - Callbacks used by the gipt_map_t:
+ * gipt_pte_type_cb_t - Given a PTE, emit the type (empty/page/table)
+ * gipt_pte_map_cb_t - Given a PFN, emit a (child) table mapping
+ */
+
+/*
+ * For now, the level shifts are hard-coded to match with standard 4-level
+ * 64-bit paging structures.
+ */
+
+#define GIPT_HASH(map, va, lvl) \
+ ((((va) >> 12) + ((va) >> 28) + (lvl)) & ((map)->giptm_table_cnt - 1))
+
+const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1] = {
+ 12, /* 4K */
+ 21, /* 2M */
+ 30, /* 1G */
+ 39, /* 512G */
+ 48 /* MAX */
+};
+const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1] = {
+ 0xfffffffffffff000ull, /* 4K */
+ 0xffffffffffe00000ull, /* 2M */
+ 0xffffffffc0000000ull, /* 1G */
+ 0xffffff8000000000ull, /* 512G */
+ 0xffff000000000000ull /* MAX */
+};
+const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1] = {
+ 0x0000000000001000ull, /* 4K */
+ 0x0000000000200000ull, /* 2M */
+ 0x0000000040000000ull, /* 1G */
+ 0x0000008000000000ull, /* 512G */
+ 0x0001000000000000ull /* MAX */
+};
+const uint64_t gipt_level_count[GIPT_MAX_LEVELS+1] = {
+ 0x0000000000000001ull, /* 4K */
+ 0x0000000000000200ull, /* 2M */
+ 0x0000000000040000ull, /* 1G */
+ 0x0000000008000000ull, /* 512G */
+ 0x0000001000000000ull /* MAX */
+};
+
+/*
+ * Allocate a gipt_t structure with corresponding page of memory to hold the
+ * PTEs which it contains.
+ */
+gipt_t *
+gipt_alloc(void)
+{
+ gipt_t *pt;
+ void *page;
+
+ pt = kmem_zalloc(sizeof (*pt), KM_SLEEP);
+ page = kmem_zalloc(PAGESIZE, KM_SLEEP);
+ pt->gipt_kva = page;
+ pt->gipt_pfn = hat_getpfnum(kas.a_hat, page);
+
+ return (pt);
+}
+
+/*
+ * Free a gipt_t structure along with its page of PTE storage.
+ */
+void
+gipt_free(gipt_t *pt)
+{
+ void *page = pt->gipt_kva;
+
+ ASSERT(pt->gipt_pfn != PFN_INVALID);
+ ASSERT(pt->gipt_kva != NULL);
+
+ pt->gipt_pfn = PFN_INVALID;
+ pt->gipt_kva = NULL;
+
+ kmem_free(page, PAGESIZE);
+ kmem_free(pt, sizeof (*pt));
+}
+
+/*
+ * Initialize a gipt_map_t with a max level (must be >= 1), allocating its
+ * hash table based on a provided size (must be a power of 2). 
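+ *
+ * A consumer might wire this up roughly as follows (sketch only;
+ * the 'my_pte_type'/'my_pte_map' callbacks are hypothetical, not
+ * part of this change):
+ *
+ * struct gipt_cbs cbs = {
+ * .giptc_pte_type = my_pte_type, (classify a PTE)
+ * .giptc_pte_map = my_pte_map, (PFN -> link PTE)
+ * };
+ * gipt_t *root = gipt_alloc();
+ * root->gipt_level = 3; (top of a 4-level tree)
+ * gipt_map_init(&map, 4, GIPT_HASH_SIZE_DEFAULT, &cbs, root);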
+ */ +void +gipt_map_init(gipt_map_t *map, uint_t levels, uint_t hash_table_size, + const struct gipt_cbs *cbs, gipt_t *root) +{ + VERIFY(map->giptm_root == NULL); + VERIFY(map->giptm_hash == NULL); + VERIFY3U(levels, >, 0); + VERIFY3U(levels, <=, GIPT_MAX_LEVELS); + VERIFY(ISP2(hash_table_size)); + VERIFY(root != NULL); + + mutex_init(&map->giptm_lock, NULL, MUTEX_DEFAULT, NULL); + map->giptm_table_cnt = hash_table_size; + bcopy(cbs, &map->giptm_cbs, sizeof (*cbs)); + map->giptm_hash = kmem_alloc(sizeof (list_t) * map->giptm_table_cnt, + KM_SLEEP); + for (uint_t i = 0; i < hash_table_size; i++) { + list_create(&map->giptm_hash[i], sizeof (gipt_t), + offsetof(gipt_t, gipt_node)); + } + map->giptm_levels = levels; + + /* + * Insert the table root into the hash. It will be held in existence + * with an extra "valid" reference. This will prevent its clean-up + * during gipt_map_clean_parents() calls, even if it has no children. + */ + mutex_enter(&map->giptm_lock); + gipt_map_insert(map, root); + map->giptm_root = root; + root->gipt_valid_cnt++; + mutex_exit(&map->giptm_lock); +} + +/* + * Clean up a gipt_map_t by removing any lingering gipt_t entries referenced by + * it, and freeing its hash table. + */ +void +gipt_map_fini(gipt_map_t *map) +{ + const uint_t cnt = map->giptm_table_cnt; + const size_t sz = sizeof (list_t) * cnt; + + mutex_enter(&map->giptm_lock); + /* Clean up any lingering tables */ + for (uint_t i = 0; i < cnt; i++) { + list_t *list = &map->giptm_hash[i]; + gipt_t *pt; + + while ((pt = list_remove_head(list)) != NULL) { + gipt_free(pt); + } + ASSERT(list_is_empty(list)); + } + + kmem_free(map->giptm_hash, sz); + map->giptm_hash = NULL; + map->giptm_root = NULL; + map->giptm_levels = 0; + mutex_exit(&map->giptm_lock); + mutex_destroy(&map->giptm_lock); +} + +/* + * Look in the map for a gipt_t containing a given VA which is located at a + * specified level. + */ +gipt_t * +gipt_map_lookup(gipt_map_t *map, uint64_t va, uint_t lvl) +{ + gipt_t *pt; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + ASSERT3U(lvl, <=, GIPT_MAX_LEVELS); + + /* + * Lookup gipt_t at the VA aligned to the next level up. For example, + * level 0 corresponds to a page table containing 512 PTEs which cover + * 4k each, spanning a total 2MB. As such, the base VA of that table + * must be aligned to the same 2MB. + */ + const uint64_t masked_va = va & gipt_level_mask[lvl + 1]; + const uint_t hash = GIPT_HASH(map, masked_va, lvl); + + /* Only the root is expected to be at the top level. */ + if (lvl == (map->giptm_levels - 1) && map->giptm_root != NULL) { + pt = map->giptm_root; + + ASSERT3U(pt->gipt_level, ==, lvl); + + /* + * It may be so that the VA in question is not covered by the + * range of the table root. + */ + if (pt->gipt_vaddr != masked_va) { + return (NULL); + } + + return (pt); + } + + list_t *list = &map->giptm_hash[hash]; + for (pt = list_head(list); pt != NULL; pt = list_next(list, pt)) { + if (pt->gipt_vaddr == masked_va && pt->gipt_level == lvl) + break; + } + return (pt); +} + +/* + * Look in the map for the deepest (lowest level) gipt_t which contains a given + * VA. This could still fail if the VA is outside the range of the table root. 
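+ *
+ * For example (illustrative): a VA covered by a 2M large-page PTE
+ * resolves to the level-1 table holding that PTE, while a VA with a
+ * full chain of tables down to 4K granularity resolves to the
+ * level-0 table.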
+ */ +gipt_t * +gipt_map_lookup_deepest(gipt_map_t *map, uint64_t va) +{ + gipt_t *pt = NULL; + uint_t lvl; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + + for (lvl = 0; lvl < map->giptm_levels; lvl++) { + pt = gipt_map_lookup(map, va, lvl); + if (pt != NULL) { + break; + } + } + return (pt); +} + +/* + * Given a VA inside a gipt_t, calculate (based on the level of that PT) the VA + * corresponding to the next entry in the table. It returns 0 if that VA would + * fall beyond the bounds of the table. + */ +static __inline__ uint64_t +gipt_next_va(gipt_t *pt, uint64_t va) +{ + const uint_t lvl = pt->gipt_level; + const uint64_t masked = va & gipt_level_mask[lvl]; + const uint64_t max = pt->gipt_vaddr + gipt_level_size[lvl+1]; + const uint64_t next = masked + gipt_level_size[lvl]; + + ASSERT3U(masked, >=, pt->gipt_vaddr); + ASSERT3U(masked, <, max); + + /* + * If the "next" VA would be outside this table, including cases where + * it overflowed, indicate an error result. + */ + if (next >= max || next <= masked) { + return (0); + } + return (next); +} + +/* + * For a given VA, find the next VA which corresponds to a valid page mapping. + * The gipt_t containing that VA will be indicated via 'ptp'. (The gipt_t of + * the starting VA can be passed in via 'ptp' for a minor optimization). If + * there is no valid mapping higher than 'va' but contained within 'max_va', + * then this will indicate failure with 0 returned. + */ +uint64_t +gipt_map_next_page(gipt_map_t *map, uint64_t va, uint64_t max_va, gipt_t **ptp) +{ + gipt_t *pt = *ptp; + uint64_t cur_va = va; + gipt_pte_type_cb_t pte_type = map->giptm_cbs.giptc_pte_type; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + ASSERT3U(max_va, !=, 0); + ASSERT3U(ptp, !=, NULL); + + /* + * If a starting table is not provided, search the map for the deepest + * table which contains the VA. If for some reason that VA is beyond + * coverage of the map root, indicate failure. + */ + if (pt == NULL) { + pt = gipt_map_lookup_deepest(map, cur_va); + if (pt == NULL) { + goto fail; + } + } + + /* + * From the starting table (at whatever level that may reside), walk + * forward through the PTEs looking for a valid page mapping. + */ + while (cur_va < max_va) { + const uint64_t next_va = gipt_next_va(pt, cur_va); + if (next_va == 0) { + /* + * The end of this table has been reached. Ascend one + * level to continue the walk if possible. If already + * at the root, the end of the table means failure. + */ + if (pt->gipt_level >= map->giptm_levels) { + goto fail; + } + pt = gipt_map_lookup(map, cur_va, pt->gipt_level + 1); + if (pt == NULL) { + goto fail; + } + continue; + } else if (next_va >= max_va) { + /* + * Terminate the walk with a failure if the VA + * corresponding to the next PTE is beyond the max. + */ + goto fail; + } + cur_va = next_va; + + const uint64_t pte = GIPT_VA2PTE(pt, cur_va); + const gipt_pte_type_t ptet = pte_type(pte, pt->gipt_level); + if (ptet == PTET_EMPTY) { + continue; + } else if (ptet == PTET_PAGE) { + /* A valid page mapping: success. */ + *ptp = pt; + return (cur_va); + } else if (ptet == PTET_LINK) { + /* + * A child page table is present at this PTE. Look it + * up from the map. + */ + ASSERT3U(pt->gipt_level, >, 0); + pt = gipt_map_lookup(map, cur_va, pt->gipt_level - 1); + ASSERT3P(pt, !=, NULL); + break; + } else { + panic("unexpected PTE type %x @ va %p", ptet, cur_va); + } + } + + /* + * By this point, the above loop has located a table structure to + * descend into in order to find the next page. 
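+ *
+ * E.g. (illustrative) if the loop above exhausted a level-0 table,
+ * ascended to its level-1 parent, and found a link PTE there, the
+ * loop below re-descends through that link (and any deeper ones)
+ * while scanning for a PTET_PAGE entry.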
+ */ + while (cur_va < max_va) { + const uint64_t pte = GIPT_VA2PTE(pt, cur_va); + const gipt_pte_type_t ptet = pte_type(pte, pt->gipt_level); + + if (ptet == PTET_EMPTY) { + const uint64_t next_va = gipt_next_va(pt, cur_va); + if (next_va == 0 || next_va >= max_va) { + goto fail; + } + cur_va = next_va; + continue; + } else if (ptet == PTET_PAGE) { + /* A valid page mapping: success. */ + *ptp = pt; + return (cur_va); + } else if (ptet == PTET_LINK) { + /* + * A child page table is present at this PTE. Look it + * up from the map. + */ + ASSERT3U(pt->gipt_level, >, 0); + pt = gipt_map_lookup(map, cur_va, pt->gipt_level - 1); + ASSERT3P(pt, !=, NULL); + } else { + panic("unexpected PTE type %x @ va %p", ptet, cur_va); + } + } + +fail: + *ptp = NULL; + return (0); +} + +/* + * Insert a gipt_t into the map based on its VA and level. It is up to the + * caller to ensure that a duplicate entry does not already exist in the map. + */ +void +gipt_map_insert(gipt_map_t *map, gipt_t *pt) +{ + const uint_t hash = GIPT_HASH(map, pt->gipt_vaddr, pt->gipt_level); + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + ASSERT(gipt_map_lookup(map, pt->gipt_vaddr, pt->gipt_level) == NULL); + VERIFY3U(pt->gipt_level, <, map->giptm_levels); + + list_insert_head(&map->giptm_hash[hash], pt); +} + +/* + * Remove a gipt_t from the map. + */ +void +gipt_map_remove(gipt_map_t *map, gipt_t *pt) +{ + const uint_t hash = GIPT_HASH(map, pt->gipt_vaddr, pt->gipt_level); + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + + list_remove(&map->giptm_hash[hash], pt); +} + +/* + * Given a VA, create any missing gipt_t entries from the specified level all + * the way up to (but not including) the root. This is done from lowest level + * to highest, and stops when an existing table covering that VA is found. + * References to any created gipt_t tables, plus the final "found" gipt_t are + * stored in 'pts'. The number of gipt_t pointers stored to 'pts' serves as + * the return value (1 <= val <= root level). It is up to the caller to + * populate linking PTEs to the newly created empty tables. + */ +static uint_t +gipt_map_ensure_chain(gipt_map_t *map, uint64_t va, uint_t lvl, gipt_t **pts) +{ + const uint_t root_lvl = map->giptm_root->gipt_level; + uint_t clvl = lvl, count = 0; + gipt_t *child_pt = NULL; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + ASSERT3U(lvl, <, root_lvl); + ASSERT3P(map->giptm_root, !=, NULL); + + do { + const uint64_t pva = (va & gipt_level_mask[clvl + 1]); + gipt_t *pt; + + pt = gipt_map_lookup(map, pva, clvl); + if (pt != NULL) { + ASSERT3U(pva, ==, pt->gipt_vaddr); + + if (child_pt != NULL) { + child_pt->gipt_parent = pt; + } + pts[count++] = pt; + return (count); + } + + pt = gipt_alloc(); + pt->gipt_vaddr = pva; + pt->gipt_level = clvl; + if (child_pt != NULL) { + child_pt->gipt_parent = pt; + } + + gipt_map_insert(map, pt); + child_pt = pt; + pts[count++] = pt; + clvl++; + } while (clvl <= root_lvl); + + return (count); +} + +/* + * Ensure that a page table covering a VA at a specified level exists. This + * will create any necessary tables chaining up to the root as well. 
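+ *
+ * For instance, installing a 4K mapping might look like this
+ * (sketch only; 'my_page_pte' is a hypothetical helper that
+ * builds a terminal PTE):
+ *
+ * gipt_t *pt = gipt_map_create_parents(map, va, 0);
+ * if (pt != NULL) {
+ * *GIPT_VA2PTEP(pt, va) = my_page_pte(pfn, prot);
+ * pt->gipt_valid_cnt++;
+ * }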
+ */
+gipt_t *
+gipt_map_create_parents(gipt_map_t *map, uint64_t va, uint_t lvl)
+{
+ gipt_t *pt, *pts[GIPT_MAX_LEVELS] = { 0 };
+ gipt_pte_type_cb_t pte_type = map->giptm_cbs.giptc_pte_type;
+ gipt_pte_map_cb_t pte_map = map->giptm_cbs.giptc_pte_map;
+ uint64_t *ptep;
+ uint_t i, count;
+
+ ASSERT(MUTEX_HELD(&map->giptm_lock));
+
+ count = gipt_map_ensure_chain(map, va, lvl, pts);
+ if (count == 1) {
+ /* Table already exists in the hierarchy */
+ return (pts[0]);
+ }
+ ASSERT3U(count, >, 1);
+
+ /* Make sure there is not already a large page mapping at the top */
+ pt = pts[count - 1];
+ if (pte_type(GIPT_VA2PTE(pt, va), pt->gipt_level) == PTET_PAGE) {
+ const uint_t end = count - 1;
+
+ /*
+ * Nuke those gipt_t entries which were optimistically created
+ * for what was found to be a conflicted mapping.
+ */
+ for (i = 0; i < end; i++) {
+ gipt_map_remove(map, pts[i]);
+ gipt_free(pts[i]);
+ }
+ return (NULL);
+ }
+
+ /* Initialize the appropriate tables from bottom to top */
+ for (i = 1; i < count; i++) {
+ pt = pts[i];
+ ptep = GIPT_VA2PTEP(pt, va);
+
+ /*
+ * Since gipt_map_ensure_chain() creates missing tables until
+ * it finds a valid one, and that existing table has been
+ * checked for the existence of a large page, nothing should
+ * occupy this PTE.
+ */
+ ASSERT3U(pte_type(*ptep, pt->gipt_level), ==, PTET_EMPTY);
+
+ *ptep = pte_map(pts[i - 1]->gipt_pfn);
+ pt->gipt_valid_cnt++;
+ }
+
+ return (pts[0]);
+}
+
+/*
+ * If a page table is empty, free it from the map, as well as any parent tables
+ * that would subsequently become empty as part of the clean-up. As noted in
+ * gipt_map_init(), the table root is a special case and will remain in the
+ * map, even when empty.
+ */
+void
+gipt_map_clean_parents(gipt_map_t *map, gipt_t *pt)
+{
+ ASSERT(MUTEX_HELD(&map->giptm_lock));
+
+ while (pt->gipt_valid_cnt == 0) {
+ gipt_t *parent = pt->gipt_parent;
+ uint64_t *ptep = GIPT_VA2PTEP(parent, pt->gipt_vaddr);
+
+ ASSERT3S(map->giptm_cbs.giptc_pte_type(*ptep,
+ parent->gipt_level), ==, PTET_LINK);
+
+ /*
+ * For now, it is assumed that all gipt consumers consider PTE
+ * zeroing as an adequate action for table unmap.
+ */
+ *ptep = 0;
+
+ parent->gipt_valid_cnt--;
+ gipt_map_remove(map, pt);
+ gipt_free(pt);
+ pt = parent;
+ }
+}
diff --git a/usr/src/uts/i86pc/sys/gipt.h b/usr/src/uts/i86pc/sys/gipt.h
new file mode 100644
index 0000000000..4d7d523726
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/gipt.h
@@ -0,0 +1,92 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc. 
+ */ + +#ifndef _GIPT_H_ +#define _GIPT_H_ + +#include +#include +#include +#include + +struct gipt { + list_node_t gipt_node; + uint64_t gipt_vaddr; + uint64_t gipt_pfn; + uint16_t gipt_level; + uint16_t gipt_valid_cnt; + uint32_t _gipt_pad; + struct gipt *gipt_parent; + uint64_t *gipt_kva; + uint64_t _gipt_pad2; +}; +typedef struct gipt gipt_t; + +typedef enum { + PTET_EMPTY = 0, + PTET_PAGE = 1, + PTET_LINK = 2, +} gipt_pte_type_t; + +/* Given a PTE and its level, determine the type of that PTE */ +typedef gipt_pte_type_t (*gipt_pte_type_cb_t)(uint64_t, uint_t); +/* Given the PFN of a child table, emit a PTE that references it */ +typedef uint64_t (*gipt_pte_map_cb_t)(uint64_t); + +struct gipt_cbs { + gipt_pte_type_cb_t giptc_pte_type; + gipt_pte_map_cb_t giptc_pte_map; +}; + +struct gipt_map { + kmutex_t giptm_lock; + gipt_t *giptm_root; + list_t *giptm_hash; + struct gipt_cbs giptm_cbs; + size_t giptm_table_cnt; + uint_t giptm_levels; +}; +typedef struct gipt_map gipt_map_t; + +#define GIPT_HASH_SIZE_DEFAULT 0x2000 +#define GIPT_MAX_LEVELS 4 + +#define GIPT_VA2IDX(pt, va) \ + (((va) - (pt)->gipt_vaddr) >> \ + gipt_level_shift[(pt)->gipt_level]) + +#define GIPT_VA2PTE(pt, va) ((pt)->gipt_kva[GIPT_VA2IDX(pt, va)]) +#define GIPT_VA2PTEP(pt, va) (&(pt)->gipt_kva[GIPT_VA2IDX(pt, va)]) + +extern const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1]; +extern const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1]; +extern const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1]; +extern const uint64_t gipt_level_count[GIPT_MAX_LEVELS+1]; + +extern gipt_t *gipt_alloc(void); +extern void gipt_free(gipt_t *); +extern void gipt_map_init(gipt_map_t *, uint_t, uint_t, + const struct gipt_cbs *, gipt_t *); +extern void gipt_map_fini(gipt_map_t *); +extern gipt_t *gipt_map_lookup(gipt_map_t *, uint64_t, uint_t); +extern gipt_t *gipt_map_lookup_deepest(gipt_map_t *, uint64_t); +extern uint64_t gipt_map_next_page(gipt_map_t *, uint64_t, uint64_t, + gipt_t **); +extern void gipt_map_insert(gipt_map_t *, gipt_t *); +extern void gipt_map_remove(gipt_map_t *, gipt_t *); +extern gipt_t *gipt_map_create_parents(gipt_map_t *, uint64_t, uint_t); +extern void gipt_map_clean_parents(gipt_map_t *, gipt_t *); + +#endif /* _GIPT_H_ */ diff --git a/usr/src/uts/i86pc/sys/viona_io.h b/usr/src/uts/i86pc/sys/viona_io.h index a4fb0f2527..a26cc00a55 100644 --- a/usr/src/uts/i86pc/sys/viona_io.h +++ b/usr/src/uts/i86pc/sys/viona_io.h @@ -11,6 +11,7 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _VIONA_IO_H_ @@ -27,8 +28,8 @@ #define VNA_IOC_TX_RING_KICK (VNA_IOC | 8) #define VNA_IOC_RX_INTR_CLR (VNA_IOC | 9) #define VNA_IOC_TX_INTR_CLR (VNA_IOC | 10) -#define VNA_IOC_SET_FEATURES (VNA_IOC | 11) -#define VNA_IOC_GET_FEATURES (VNA_IOC | 12) +#define VNA_IOC_SET_FEATURES (VNA_IOC | 11) +#define VNA_IOC_GET_FEATURES (VNA_IOC | 12) typedef struct vioc_create { datalink_id_t c_linkid; diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h index e876ce748f..8a35d123c7 100644 --- a/usr/src/uts/i86pc/sys/vmm.h +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/sys/amd64/include/vmm.h 273375 2014-10-21 07:10:43Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,18 +38,25 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _VMM_H_ #define _VMM_H_ +#include #include +#ifdef _KERNEL +SDT_PROVIDER_DECLARE(vmm); +#endif + enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, + VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; @@ -89,6 +98,16 @@ enum vm_reg_name { VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, + VM_REG_GUEST_PDPTE0, + VM_REG_GUEST_PDPTE1, + VM_REG_GUEST_PDPTE2, + VM_REG_GUEST_PDPTE3, + VM_REG_GUEST_INTR_SHADOW, + VM_REG_GUEST_DR0, + VM_REG_GUEST_DR1, + VM_REG_GUEST_DR2, + VM_REG_GUEST_DR3, + VM_REG_GUEST_DR6, VM_REG_LAST }; @@ -108,31 +127,37 @@ enum x2apic_state { #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) + #define VM_MAX_NAMELEN 32 #ifdef _KERNEL struct vm; struct vm_exception; -struct vm_memory_segment; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; +struct vmspace; +struct vm_object; struct vm_guest_paging; +struct pmap; -typedef int (*vmm_init_func_t)(void); +struct vm_eventinfo { + u_int *rptr; /* runblock cookie */ + int *sptr; /* suspend cookie */ + int *iptr; /* reqidle cookie */ +}; + +typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); -typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ -typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip); +typedef void (*vmm_resume_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct pmap *pmap, struct vm_eventinfo *info); typedef void (*vmi_cleanup_func_t)(void *vmi); -typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa, - vm_paddr_t hpa, size_t length, - vm_memattr_t attr, int prot, - boolean_t superpages_ok); -typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, @@ -143,26 +168,38 @@ typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); +typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); +#ifndef __FreeBSD__ +typedef void (*vmi_savectx)(void *vmi, int vcpu); +typedef void (*vmi_restorectx)(void *vmi, int vcpu); +#endif struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ vmm_cleanup_func_t cleanup; + vmm_resume_func_t resume; vmi_init_func_t vminit; /* vm-specific initialization */ vmi_run_func_t vmrun; vmi_cleanup_func_t vmcleanup; - vmi_mmap_set_func_t vmmmap_set; - vmi_mmap_get_func_t vmmmap_get; vmi_get_register_t vmgetreg; vmi_set_register_t vmsetreg; vmi_get_desc_t vmgetdesc; vmi_set_desc_t vmsetdesc; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; + vmi_vmspace_alloc vmspace_alloc; + vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; 
vmi_vlapic_cleanup vlapic_cleanup; + +#ifndef __FreeBSD__ + vmi_savectx vmsavectx; + vmi_restorectx vmrestorectx; +#endif }; extern struct vmm_ops vmm_ops_intel; @@ -170,20 +207,41 @@ extern struct vmm_ops vmm_ops_amd; int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); +int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); -int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len); -#ifdef __FreeBSD__ +uint16_t vm_get_maxcpus(struct vm *vm); +void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus); +int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus); + +/* + * APIs that modify the guest memory map require all vcpus to be frozen. + */ +int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot, int flags); +int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); +void vm_free_memseg(struct vm *vm, int ident); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); -#endif int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); -#ifndef __FreeBSD__ -vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size); -#endif -void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot, - void **cookie); +int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); +int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); + +/* + * APIs that inspect the guest memory map require only a *single* vcpu to + * be frozen. This acts like a read lock on the guest memory map since any + * modification requires *all* vcpus to be frozen. + */ +int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); +int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + struct vm_object **objptr); +vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); +void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, + int prot, void **cookie); void vm_gpa_release(void *cookie); -int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, - struct vm_memory_segment *seg); +bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); + int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, @@ -191,6 +249,7 @@ int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc); int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); int vm_nmi_pending(struct vm *vm, int vcpuid); void vm_nmi_clear(struct vm *vm, int vcpuid); @@ -206,10 +265,43 @@ int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vm *vm, int vcpu); -cpuset_t vm_active_cpus(struct vm *vm); +int vm_suspend_cpu(struct vm *vm, int vcpu); +int vm_resume_cpu(struct vm *vm, int vcpu); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); +void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip); +void 
vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); + +#ifdef _SYS__CPUSET_H_ +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_debug_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); +#endif /* _SYS__CPUSET_H_ */ + +static __inline int +vcpu_runblocked(struct vm_eventinfo *info) +{ -typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); + return (*info->rptr != 0); +} + +static __inline int +vcpu_suspended(struct vm_eventinfo *info) +{ + + return (*info->sptr); +} + +static __inline int +vcpu_reqidle(struct vm_eventinfo *info) +{ + + return (*info->iptr); +} + +int vcpu_debugged(struct vm *vm, int vcpuid); /* * Return 1 if device indicated by bus/slot/func is supposed to be a @@ -231,21 +323,43 @@ enum vcpu_state { int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); +void vcpu_block_run(struct vm *, int); +void vcpu_unblock_run(struct vm *, int); + +#ifndef __FreeBSD__ +uint64_t vcpu_tsc_offset(struct vm *vm, int vcpuid); +#endif -static int __inline +static __inline int vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) { return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); } +#ifdef _SYS_THREAD_H +static __inline int +vcpu_should_yield(struct vm *vm, int vcpu) +{ + + if (curthread->t_astflag) + return (1); + else if (CPU->cpu_runrun) + return (1); + else + return (0); +} +#endif /* _SYS_THREAD_H */ + void *vcpu_stats(struct vm *vm, int vcpu); -void vm_interrupt_hostcpu(struct vm *vm, int vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); +struct vmspace *vm_get_vmspace(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); +struct vpmtmr *vm_pmtmr(struct vm *vm); +struct vrtc *vm_rtc(struct vm *vm); /* - * Inject exception 'vme' into the guest vcpu. This function returns 0 on + * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling @@ -255,7 +369,8 @@ struct vatpit *vm_atpit(struct vm *vm); * This function should only be called in the context of the thread that is * executing this vcpu. */ -int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme); +int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, + uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or @@ -298,9 +413,10 @@ struct vm_copyinfo { * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * - * Returns 0 on success. - * Returns 1 if an exception was injected into the guest. - * Returns -1 otherwise. + * retval is_fault Interpretation + * 0 0 Success + * 0 1 An exception was injected into the guest + * EFAULT N/A Unrecoverable error * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. 
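 *
 * A typical caller might look like this (sketch only, not part of
 * this change):
 *
 *	int error, fault;
 *	error = vm_copy_setup(vm, vcpuid, &paging, gla, len,
 *	    PROT_READ, copyinfo, nitems(copyinfo), &fault);
 *	if (error != 0 || fault != 0)
 *		return;	(failed, or exception already injected)
 *	vm_copyin(vm, vcpuid, copyinfo, buf, len);
 *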
The 'copyinfo[]' resources should be freed by calling @@ -308,16 +424,18 @@ struct vm_copyinfo { */ int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, - int num_copyinfo); + int num_copyinfo, int *is_fault); void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); + +int vcpu_trace_exceptions(struct vm *vm, int vcpuid); #endif /* KERNEL */ -#define VM_MAXCPU 16 /* maximum virtual cpus */ +#define VM_MAXCPU 32 /* maximum virtual cpus */ /* * Identifiers for optional vmm capabilities @@ -348,7 +466,6 @@ struct seg_desc { uint32_t limit; uint32_t access; }; - #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0) @@ -443,7 +560,20 @@ enum vm_exitcode { VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ + VM_EXITCODE_RUNBLOCK, + VM_EXITCODE_IOAPIC_EOI, + VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, + VM_EXITCODE_TASK_SWITCH, + VM_EXITCODE_MONITOR, + VM_EXITCODE_MWAIT, + VM_EXITCODE_SVM, + VM_EXITCODE_REQIDLE, + VM_EXITCODE_DEBUG, + VM_EXITCODE_VMINSN, +#ifndef __FreeBSD__ + VM_EXITCODE_HT, +#endif VM_EXITCODE_MAX }; @@ -468,6 +598,22 @@ struct vm_inout_str { struct seg_desc seg_desc; }; +enum task_switch_reason { + TSR_CALL, + TSR_IRET, + TSR_JMP, + TSR_IDT_GATE, /* task gate in IDT */ +}; + +struct vm_task_switch { + uint16_t tsssel; /* new TSS selector */ + int ext; /* task switch due to external event */ + uint32_t errcode; + int errcode_valid; /* push 'errcode' on the new stack */ + enum task_switch_reason reason; + struct vm_guest_paging paging; +}; + struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ @@ -506,6 +652,14 @@ struct vm_exit { int inst_type; int inst_error; } vmx; + /* + * SVM specific payload. + */ + struct { + uint64_t exitcode; + uint64_t exitinfo1; + uint64_t exitinfo2; + } svm; struct { uint32_t code; /* ecx value */ uint64_t wval; @@ -516,7 +670,15 @@ struct vm_exit { } spinup_ap; struct { uint64_t rflags; + uint64_t intr_status; } hlt; + struct { + int vector; + } ioapic_eoi; + struct { + enum vm_suspend_how how; + } suspended; + struct vm_task_switch task_switch; } u; }; @@ -554,12 +716,28 @@ int vm_restart_instruction(void *vm, int vcpuid); #ifndef __FreeBSD__ #ifdef _KERNEL -extern void vmm_sol_glue_init(void); -extern void vmm_sol_glue_cleanup(void); -extern int vmm_mod_load(void); -extern int vmm_mod_unload(void); -#endif -#endif +void vmm_sol_glue_init(void); +void vmm_sol_glue_cleanup(void); + +int vmm_mod_load(void); +int vmm_mod_unload(void); + +void vmm_call_trap(uint64_t); + +/* + * Because of tangled headers, these are mirrored by vmm_drv.h to present the + * interface to driver consumers. 
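+ *
+ * A kernel consumer might hook an I/O port as follows (sketch
+ * only; 'my_rmem'/'my_wmem' are hypothetical callbacks and the
+ * port number is arbitrary):
+ *
+ * void *cookie;
+ * if (vm_ioport_hook(vm, 0x510, my_rmem, my_wmem, arg,
+ *     &cookie) == 0) {
+ * ... service guest port accesses via the callbacks ...
+ * vm_ioport_unhook(vm, &cookie);
+ * }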
+ */ +typedef int (*vmm_rmem_cb_t)(void *, uintptr_t, uint_t, uint64_t *); +typedef int (*vmm_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t); + +int vm_ioport_hook(struct vm *, uint_t, vmm_rmem_cb_t, vmm_wmem_cb_t, void *, + void **); +void vm_ioport_unhook(struct vm *, void **); +int vm_ioport_handle_hook(struct vm *, int, bool, int, int, uint32_t *); + +#endif /* _KERNEL */ +#endif /* __FreeBSD */ #endif /* _VMM_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h index 3e74eb8786..58e581a60d 100644 --- a/usr/src/uts/i86pc/sys/vmm_dev.h +++ b/usr/src/uts/i86pc/sys/vmm_dev.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/include/vmm_dev.h 268889 2014-07-19 20:59:08Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,20 +38,30 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ -#ifdef _KERNEL -void vmmdev_init(void); -int vmmdev_cleanup(void); -#endif +#include + +struct vm_memmap { + vm_paddr_t gpa; + int segid; /* memory segment */ + vm_ooffset_t segoff; /* offset into memory segment */ + size_t len; /* mmap length */ + int prot; /* RWX */ + int flags; +}; +#define VM_MEMMAP_F_WIRED 0x01 +#define VM_MEMMAP_F_IOMMU 0x02 -struct vm_memory_segment { - vm_paddr_t gpa; /* in */ +#define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? (m)->name : NULL) +struct vm_memseg { + int segid; size_t len; - int wired; + char name[SPECNAMELEN + 1]; }; struct vm_register { @@ -64,6 +76,13 @@ struct vm_seg_desc { /* data or code segment */ struct seg_desc desc; }; +struct vm_register_set { + int cpuid; + unsigned int count; + const int *regnums; /* enum vm_reg_name */ + uint64_t *regvals; +}; + struct vm_run { int cpuid; struct vm_exit vm_exit; @@ -129,7 +148,7 @@ struct vm_pptdev_msi { int slot; int func; int numvec; /* 0 means disabled */ - uint32_t msg; + uint64_t msg; uint64_t addr; }; @@ -139,7 +158,7 @@ struct vm_pptdev_msix { int slot; int func; int idx; - uint32_t msg; + uint64_t msg; uint32_t vector_control; uint64_t addr; }; @@ -148,7 +167,12 @@ struct vm_nmi { int cpuid; }; +#ifdef __FreeBSD__ #define MAX_VM_STATS 64 +#else +#define MAX_VM_STATS (64 + VM_MAXCPU) +#endif + struct vm_stats { int cpuid; /* in */ int num_entries; /* out */ @@ -176,8 +200,8 @@ struct vm_hpet_cap { uint32_t capabilities; /* lower 32 bits of HPET capabilities */ }; -struct vm_activate_cpu { - int vcpuid; +struct vm_suspend { + enum vm_suspend_how how; }; struct vm_gla2gpa { @@ -189,13 +213,51 @@ struct vm_gla2gpa { uint64_t gpa; }; +struct vm_activate_cpu { + int vcpuid; +}; + struct vm_cpuset { int which; int cpusetsize; +#ifndef _KERNEL cpuset_t *cpus; +#else + void *cpus; +#endif }; #define VM_ACTIVE_CPUS 0 #define VM_SUSPENDED_CPUS 1 +#define VM_DEBUG_CPUS 2 + +struct vm_intinfo { + int vcpuid; + uint64_t info1; + uint64_t info2; +}; + +struct vm_rtc_time { + time_t secs; +}; + +struct vm_rtc_data { + int offset; + uint8_t value; +}; + +#ifndef __FreeBSD__ +struct vm_devmem_offset { + int segid; + off_t offset; +}; +#endif + +struct vm_cpu_topology { + uint16_t sockets; + uint16_t cores; + uint16_t threads; + uint16_t maxcpus; +}; enum { /* general routines */ @@ -203,20 +265,31 @@ enum { IOCNUM_RUN = 1, 
IOCNUM_SET_CAPABILITY = 2, IOCNUM_GET_CAPABILITY = 3, + IOCNUM_SUSPEND = 4, + IOCNUM_REINIT = 5, /* memory apis */ - IOCNUM_MAP_MEMORY = 10, - IOCNUM_GET_MEMORY_SEG = 11, + IOCNUM_MAP_MEMORY = 10, /* deprecated */ + IOCNUM_GET_MEMORY_SEG = 11, /* deprecated */ IOCNUM_GET_GPA_PMAP = 12, IOCNUM_GLA2GPA = 13, + IOCNUM_ALLOC_MEMSEG = 14, + IOCNUM_GET_MEMSEG = 15, + IOCNUM_MMAP_MEMSEG = 16, + IOCNUM_MMAP_GETNEXT = 17, + IOCNUM_GLA2GPA_NOFAULT = 18, /* register/state accessors */ IOCNUM_SET_REGISTER = 20, IOCNUM_GET_REGISTER = 21, IOCNUM_SET_SEGMENT_DESCRIPTOR = 22, IOCNUM_GET_SEGMENT_DESCRIPTOR = 23, + IOCNUM_SET_REGISTER_SET = 24, + IOCNUM_GET_REGISTER_SET = 25, /* interrupt injection */ + IOCNUM_GET_INTINFO = 28, + IOCNUM_SET_INTINFO = 29, IOCNUM_INJECT_EXCEPTION = 30, IOCNUM_LAPIC_IRQ = 31, IOCNUM_INJECT_NMI = 32, @@ -244,6 +317,10 @@ enum { IOCNUM_GET_X2APIC_STATE = 61, IOCNUM_GET_HPET_CAPABILITIES = 62, + /* CPU Topology */ + IOCNUM_SET_TOPOLOGY = 63, + IOCNUM_GET_TOPOLOGY = 64, + /* legacy interrupt injection */ IOCNUM_ISA_ASSERT_IRQ = 80, IOCNUM_ISA_DEASSERT_IRQ = 81, @@ -253,14 +330,36 @@ enum { /* vm_cpuset */ IOCNUM_ACTIVATE_CPU = 90, IOCNUM_GET_CPUSET = 91, + IOCNUM_SUSPEND_CPU = 92, + IOCNUM_RESUME_CPU = 93, + + /* RTC */ + IOCNUM_RTC_READ = 100, + IOCNUM_RTC_WRITE = 101, + IOCNUM_RTC_SETTIME = 102, + IOCNUM_RTC_GETTIME = 103, + +#ifndef __FreeBSD__ + /* illumos-custom ioctls */ + IOCNUM_DEVMEM_GETOFFSET = 256, + IOCNUM_WRLOCK_CYCLE = 257, +#endif }; #define VM_RUN \ _IOWR('v', IOCNUM_RUN, struct vm_run) -#define VM_MAP_MEMORY \ - _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) -#define VM_GET_MEMORY_SEG \ - _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_SUSPEND \ + _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) +#define VM_REINIT \ + _IO('v', IOCNUM_REINIT) +#define VM_ALLOC_MEMSEG \ + _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg) +#define VM_GET_MEMSEG \ + _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg) +#define VM_MMAP_MEMSEG \ + _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap) +#define VM_MMAP_GETNEXT \ + _IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap) #define VM_SET_REGISTER \ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) #define VM_GET_REGISTER \ @@ -269,6 +368,10 @@ enum { _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) #define VM_GET_SEGMENT_DESCRIPTOR \ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_SET_REGISTER_SET \ + _IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set) +#define VM_GET_REGISTER_SET \ + _IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set) #define VM_INJECT_EXCEPTION \ _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception) #define VM_LAPIC_IRQ \ @@ -309,10 +412,8 @@ enum { _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) #define VM_INJECT_NMI \ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) -#ifdef __FreeBSD__ -#define VM_STATS \ +#define VM_STATS_IOC \ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) -#endif #define VM_STAT_DESC \ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) #define VM_SET_X2APIC_STATE \ @@ -321,14 +422,52 @@ enum { _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) #define VM_GET_HPET_CAPABILITIES \ _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap) +#define VM_SET_TOPOLOGY \ + _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GET_TOPOLOGY \ + _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology) #define VM_GET_GPA_PMAP \ _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) #define VM_GLA2GPA \ _IOWR('v', 
IOCNUM_GLA2GPA, struct vm_gla2gpa) +#define VM_GLA2GPA_NOFAULT \ + _IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa) #define VM_ACTIVATE_CPU \ _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) #define VM_GET_CPUS \ _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_SUSPEND_CPU \ + _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu) +#define VM_RESUME_CPU \ + _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu) +#define VM_SET_INTINFO \ + _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo) +#define VM_GET_INTINFO \ + _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo) +#define VM_RTC_WRITE \ + _IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data) +#define VM_RTC_READ \ + _IOWR('v', IOCNUM_RTC_READ, struct vm_rtc_data) +#define VM_RTC_SETTIME \ + _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time) +#define VM_RTC_GETTIME \ + _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time) #define VM_RESTART_INSTRUCTION \ _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) + +#ifndef __FreeBSD__ +#define VM_DEVMEM_GETOFFSET \ + _IOW('v', IOCNUM_DEVMEM_GETOFFSET, struct vm_devmem_offset) +#define VM_WRLOCK_CYCLE _IO('v', IOCNUM_WRLOCK_CYCLE) + +/* ioctls used against ctl device for vm create/destroy */ +#define VMM_IOC_BASE (('V' << 16) | ('M' << 8)) +#define VMM_CREATE_VM (VMM_IOC_BASE | 0x01) +#define VMM_DESTROY_VM (VMM_IOC_BASE | 0x02) +#define VMM_VM_SUPPORTED (VMM_IOC_BASE | 0x03) + +#define VMM_CTL_DEV "/dev/vmmctl" + +#endif + #endif diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h new file mode 100644 index 0000000000..33fefc10ea --- /dev/null +++ b/usr/src/uts/i86pc/sys/vmm_drv.h @@ -0,0 +1,50 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VMM_DRV_H_ +#define _VMM_DRV_H_ + +#ifdef _KERNEL +struct vmm_hold; +typedef struct vmm_hold vmm_hold_t; + +struct vmm_lease; +typedef struct vmm_lease vmm_lease_t; + +/* + * Because of tangled headers, these definitions mirror their vmm_[rw]mem_cb_t + * counterparts in vmm.h. + */ +typedef int (*vmm_drv_rmem_cb_t)(void *, uintptr_t, uint_t, uint64_t *); +typedef int (*vmm_drv_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t); + +extern int vmm_drv_hold(file_t *, cred_t *, vmm_hold_t **); +extern void vmm_drv_rele(vmm_hold_t *); +extern boolean_t vmm_drv_release_reqd(vmm_hold_t *); + +extern vmm_lease_t *vmm_drv_lease_sign(vmm_hold_t *, boolean_t (*)(void *), + void *); +extern void vmm_drv_lease_break(vmm_hold_t *, vmm_lease_t *); +extern boolean_t vmm_drv_lease_expired(vmm_lease_t *); + +extern void *vmm_drv_gpa2kva(vmm_lease_t *, uintptr_t, size_t); +extern int vmm_drv_msi(vmm_lease_t *, uint64_t, uint64_t); + +extern int vmm_drv_ioport_hook(vmm_hold_t *, uint_t, vmm_drv_rmem_cb_t, + vmm_drv_wmem_cb_t, void *, void **); +extern void vmm_drv_ioport_unhook(vmm_hold_t *, void **); +#endif /* _KERNEL */ + +#endif /* _VMM_DRV_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_impl.h b/usr/src/uts/i86pc/sys/vmm_impl.h index 1602fa286d..cdc56cc464 100644 --- a/usr/src/uts/i86pc/sys/vmm_impl.h +++ b/usr/src/uts/i86pc/sys/vmm_impl.h @@ -11,76 +11,79 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. 
 */

 #ifndef _VMM_IMPL_H_
-#define _VMM_IMPL_H_
+#define _VMM_IMPL_H_

 #include 
 #include 
 #include 
+#include 
+
+#ifdef _KERNEL
+
+#define VMM_CTL_MINOR 0

 /*
- * /dev names:
- * /dev/vmmctl - control device
- * /dev/vmm/ - vm devices
+ * Rather than creating whole character devices for devmem mappings, they are
+ * available by mmap(2)ing the vmm handle at a specific offset. These offsets
+ * begin just above the maximum allowed guest physical address.
 */
-#define VMM_DRIVER_NAME "vmm"
+#include 
+#define VM_DEVMEM_START (VM_MAXUSER_ADDRESS + 1)

-#define VMM_CTL_MINOR_NODE "ctl"
-#define VMM_CTL_MINOR_NAME VMM_DRIVER_NAME VMM_CTL_NODE
-#define VMM_CTL_MINOR 0
-
-#define VMM_IOC_BASE (('V' << 16) | ('M' << 8))
+struct vmm_devmem_entry {
+ list_node_t vde_node;
+ int vde_segid;
+ char vde_name[SPECNAMELEN + 1];
+ size_t vde_len;
+ off_t vde_off;
+};
+typedef struct vmm_devmem_entry vmm_devmem_entry_t;

-#define VMM_CREATE_VM (VMM_IOC_BASE | 0x01)
-#define VMM_DESTROY_VM (VMM_IOC_BASE | 0x02)
+typedef struct vmm_zsd vmm_zsd_t;

-struct vmm_ioctl {
- char vmm_name[VM_MAX_NAMELEN];
+enum vmm_softc_state {
+ VMM_HELD = 1, /* external driver(s) possess hold on the VM */
+ VMM_CLEANUP = 2, /* request that holds are released */
+ VMM_PURGED = 4, /* all holds have been released */
+ VMM_BLOCK_HOOK = 8, /* mem hook install temporarily blocked */
+ VMM_DESTROY = 16 /* VM is destroyed, softc still around */
 };

-#ifdef _KERNEL
 struct vmm_softc {
- boolean_t open;
- minor_t minor;
- struct vm *vm;
- char name[VM_MAX_NAMELEN];
- SLIST_ENTRY(vmm_softc) link;
-};
-#endif
+ list_node_t vmm_node;
+ struct vm *vmm_vm;
+ minor_t vmm_minor;
+ char vmm_name[VM_MAX_NAMELEN];
+ list_t vmm_devmem_list;

-/*
- * VMM trace ring buffer constants
- */
-#define VMM_DMSG_RING_SIZE 0x100000 /* 1MB */
-#define VMM_DMSG_BUF_SIZE 256
+ kcondvar_t vmm_cv;
+ list_t vmm_holds;
+ uint_t vmm_flags;
+ boolean_t vmm_is_open;

-/*
- * VMM trace ring buffer content
- */
-typedef struct vmm_trace_dmsg {
- timespec_t timestamp;
- char buf[VMM_DMSG_BUF_SIZE];
- struct vmm_trace_dmsg *next;
-} vmm_trace_dmsg_t;
+ kmutex_t vmm_lease_lock;
+ list_t vmm_lease_list;
+ uint_t vmm_lease_blocker;
+ kcondvar_t vmm_lease_cv;
+ krwlock_t vmm_rwlock;

-/*
- * VMM trace ring buffer header
- */
-typedef struct vmm_trace_rbuf {
- kmutex_t lock; /* lock to avoid clutter */
- int looped; /* completed ring */
- int allocfailed; /* dmsg mem alloc failed */
- size_t size; /* current size */
- size_t maxsize; /* max size */
- vmm_trace_dmsg_t *dmsgh; /* messages head */
- vmm_trace_dmsg_t *dmsgp; /* ptr to last message */
-} vmm_trace_rbuf_t;
+ /* For zone specific data */
+ list_node_t vmm_zsd_linkage;
+ zone_t *vmm_zone;
+ vmm_zsd_t *vmm_zsd;
+};
+typedef struct vmm_softc vmm_softc_t;

-/*
- * VMM trace ring buffer interfaces
- */
-void vmm_trace_log(const char *fmt, ...);
+void vmm_zsd_init(void);
+void vmm_zsd_fini(void);
+int vmm_zsd_add_vm(vmm_softc_t *sc);
+void vmm_zsd_rem_vm(vmm_softc_t *sc);
+int vmm_do_vm_destroy(vmm_softc_t *, boolean_t);
+
+#endif /* _KERNEL */

 #endif /* _VMM_IMPL_H_ */
diff --git a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h
index 8138890a2c..f10f407164 100644
--- a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h
+++ b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h
@@ -1,4 +1,6 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
  * Copyright (c) 2012 NetApp, Inc.
  * All rights reserved.
  *
@@ -23,7 +25,7 @@
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE. 
* - * $FreeBSD: head/sys/amd64/include/vmm_instruction_emul.h 276479 2014-12-31 20:31:32Z dim $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -93,17 +95,26 @@ int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, */ int vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *guest_paging, - uint64_t rip, int inst_length, struct vie *vie); + uint64_t rip, int inst_length, struct vie *vie, + int *is_fault); /* * Translate the guest linear address 'gla' to a guest physical address. * - * Returns 0 on success and '*gpa' contains the result of the translation. - * Returns 1 if an exception was injected into the guest. - * Returns -1 otherwise. + * retval is_fault Interpretation + * 0 0 'gpa' contains result of the translation + * 0 1 An exception was injected into the guest + * EFAULT N/A An unrecoverable hypervisor error occurred */ int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa); + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); + +/* + * Like vm_gla2gpa, but no exceptions are injected into the guest and + * PTEs are not changed. + */ +int vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); void vie_init(struct vie *vie, const char *inst_bytes, int inst_length); diff --git a/usr/src/uts/i86pc/viona/Makefile b/usr/src/uts/i86pc/viona/Makefile index c2b8bd8dcf..4ede5bbd84 100644 --- a/usr/src/uts/i86pc/viona/Makefile +++ b/usr/src/uts/i86pc/viona/Makefile @@ -11,6 +11,7 @@ # # Copyright 2013 Pluribus Networks Inc. +# Copyright 2017 Joyent, Inc. # # @@ -22,7 +23,7 @@ UTSBASE = ../.. # Define the module and object file sets. # MODULE = viona -OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%) +OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%) LINTS = $(VIONA_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/i86pc/io/viona @@ -42,6 +43,12 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # # Overrides # +LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV +LINTTAGS += -erroff=E_FUNC_ARG_UNUSED +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2 +LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2 + CFLAGS += $(CCVERBOSE) LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm diff --git a/usr/src/uts/i86pc/vmm/Makefile b/usr/src/uts/i86pc/vmm/Makefile index b3ab735781..5b93db987b 100644 --- a/usr/src/uts/i86pc/vmm/Makefile +++ b/usr/src/uts/i86pc/vmm/Makefile @@ -11,6 +11,7 @@ # # Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. # # @@ -26,6 +27,7 @@ OBJECTS = $(VMM_OBJS:%=$(OBJS_DIR)/%) LINTS = $(VMM_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/i86pc/io/vmm +MAPFILE = $(UTSBASE)/i86pc/io/vmm/vmm.mapfile # # Include common rules. 
@@ -42,11 +44,52 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # # Overrides and additions # +LINTTAGS += -erroff=E_EMPTY_DECLARATION +LINTTAGS += -erroff=E_OPERANDS_INCOMPATIBLE_TYPES +LINTTAGS += -erroff=E_VOID_CANT_RETURN_VALUE +LINTTAGS += -erroff=E_YACC_ERROR +LINTTAGS += -erroff=E_STATIC_UNUSED +LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2 +LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2 +LINTTAGS += -erroff=E_BAD_FORMAT_ARG_TYPE2 +LINTTAGS += -erroff=E_FUNC_ARG_UNUSED +LINTTAGS += -erroff=E_FUNC_SET_NOT_USED +LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_CONSTANT_CONDITION +LINTTAGS += -erroff=E_PTR_TO_VOID_IN_ARITHMETIC +LINTTAGS += -erroff=E_CONST_TRUNCATED_BY_ASSIGN +LINTTAGS += -erroff=E_NOP_ELSE_STMT +LINTTAGS += -erroff=E_FUNC_NO_RET_VAL +LINTTAGS += -erroff=E_OLD_STYLE_DECL_OR_BAD_TYPE +LINTTAGS += -erroff=E_VAR_USED_BEFORE_SET +LINTTAGS += -erroff=E_INTEGER_OVERFLOW_DETECTED +LINTTAGS += -erroff=E_STMT_NOT_REACHED +LINTTAGS += -erroff=E_FUNC_NO_RET_VAL +LINTTAGS += -erroff=E_USELESS_DECLARATION +LINTTAGS += -erroff=E_EXPR_NULL_EFFECT +LINTTAGS += -erroff=E_CASE_FALLTHRU +LINTTAGS += -erroff=E_FUNC_DECL_VAR_ARG2 +LINTTAGS += -erroff=E_ASM_IMPOSSIBLE_CONSTRAINT +LINTTAGS += -erroff=E_ASM_UNUSED_PARAM +LINTTAGS += -erroff=E_NOP_IF_STMT +LINTTAGS += -erroff=E_ZERO_OR_NEGATIVE_SUBSCRIPT + +CERRWARN += -_gcc=-Wno-empty-body + +# 3rd party code +SMOFF += all_func_returns + +# needs work +$(OBJS_DIR)/vmm_sol_dev.o := SMOFF += signed_integer_overflow_check + +# a can't happen: vmx_setcap() warn: variable dereferenced before check 'pptr' +$(OBJS_DIR)/vmx.o := SMOFF += deref_check # These sources only compile with gcc. Workaround a confluence of cruft # regarding dmake and shadow compilation by neutering the sun compiler. 
-amd64_CC = $(ONBLD_TOOLS)/bin/$(MACH)/cw -_gcc -CFLAGS += -_cc=-xdryrun +#amd64_CC = $(ONBLD_TOOLS)/bin/$(MACH)/cw -_gcc +#CFLAGS += -_cc=-xdryrun ALL_BUILDS = $(ALL_BUILDSONLY64) DEF_BUILDS = $(DEF_BUILDSONLY64) @@ -56,9 +99,23 @@ INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(UTSBASE)/i86pc/io/vmm/io AS_INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(OBJS_DIR) CFLAGS += -_gcc=-Wimplicit-function-declaration +# The FreeBSD %# notation makes gcc gripe +CFLAGS += -_gcc=-Wno-format +# enable collection of VMM statistics +CFLAGS += -DVMM_KEEP_STATS -OFFSETS_SRC = $(CONF_SRCDIR)/offsets.in -ASSYM_H = $(OBJS_DIR)/vmx_assym.h +LDFLAGS += -Nfs/dev + +$(OBJS_DIR)/vmm.o := CERRWARN += -_gcc=-Wno-pointer-sign -_gcc=-Wno-type-limits +$(OBJS_DIR)/svm.o := CERRWARN += -_gcc=-Wno-pointer-sign -_gcc=-Wno-type-limits + +LDFLAGS += -z type=kmod -M $(MAPFILE) + +OFFSETS_VMX = $(CONF_SRCDIR)/intel/offsets.in +OFFSETS_SVM = $(CONF_SRCDIR)/amd/offsets.in +ASSYM_VMX = $(OBJS_DIR)/vmx_assym.h +ASSYM_SVM = $(OBJS_DIR)/svm_assym.h +ASSYM_H = $(ASSYM_VMX) $(ASSYM_SVM) CLEANFILES += $(ASSYM_H) @@ -88,7 +145,10 @@ install: $(INSTALL_DEPS) # include $(UTSBASE)/i86pc/Makefile.targ -$(OBJECTS): $(ASSYM_H) +$(ASSYM_VMX): $(OFFSETS_VMX) $(GENASSYM) + $(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_VMX) >$@ +$(ASSYM_SVM): $(OFFSETS_SVM) $(GENASSYM) + $(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_SVM) >$@ -$(ASSYM_H): $(OFFSETS_SRC) $(GENASSYM) - $(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_SRC) >$@ +$(OBJS_DIR)/vmx_support.o: $(ASSYM_VMX) +$(OBJS_DIR)/svm_support.o: $(ASSYM_SVM) diff --git a/usr/src/uts/req.flg b/usr/src/uts/req.flg index ffbaa3f643..15085a486d 100644 --- a/usr/src/uts/req.flg +++ b/usr/src/uts/req.flg @@ -22,6 +22,7 @@ # # Copyright 2007 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2018 Joyent, Inc. # # @@ -37,6 +38,7 @@ echo_file usr/src/Makefile # For full builds (open and closed), we want both etc/certs and # etc/keys. For an open source build, there's no etc/keys directory. +find_files "s.*" usr/contrib/freebsd find_files "s.*" usr/src/cmd/cmd-crypto/etc find_files "s.*" usr/src/common/acl find_files "s.*" usr/src/common/atomic @@ -56,4 +58,5 @@ find_files "s.*" usr/src/common/smbios find_files "s.*" usr/src/common/tsol find_files "s.*" usr/src/common/util find_files "s.*" usr/src/common/zfs +find_files "s.*" usr/src/compat/freebsd find_files "s.*" usr/src/psm/promif -- cgit v1.2.3 From bd025ac2b6fbfbc59dfc0bd50a0e56dd56de9229 Mon Sep 17 00:00:00 2001 From: Robert Mustacchi Date: Fri, 15 May 2020 07:33:03 -0700 Subject: 12739 gld module missing dep on mac after 12671 Reviewed by: Patrick Mooney Reviewed by: Aurélien Larcher Approved by: Dan McDonald MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- usr/src/uts/intel/gld/Makefile | 4 +++- usr/src/uts/sparc/gld/Makefile | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'usr/src') diff --git a/usr/src/uts/intel/gld/Makefile b/usr/src/uts/intel/gld/Makefile index 3b88451670..7a5d77368e 100644 --- a/usr/src/uts/intel/gld/Makefile +++ b/usr/src/uts/intel/gld/Makefile @@ -26,7 +26,7 @@ # # -# This makefile drives the production of the gld driver +# This makefile drives the production of the gld driver # kernel module. 
# # intel architecture dependent @@ -57,6 +57,8 @@ ALL_TARGET = $(BINARY) LINT_TARGET = $(MODULE).lint INSTALL_TARGET = $(BINARY) $(ROOTMODULE) +LDFLAGS += -dy -Nmisc/mac + # # For now, disable these lint checks; maintainers should endeavor # to investigate and remove these for maximum lint coverage. diff --git a/usr/src/uts/sparc/gld/Makefile b/usr/src/uts/sparc/gld/Makefile index d56548e81b..ad3519c146 100644 --- a/usr/src/uts/sparc/gld/Makefile +++ b/usr/src/uts/sparc/gld/Makefile @@ -25,7 +25,7 @@ # Use is subject to license terms. # # -# This makefile drives the production of the gld driver +# This makefile drives the production of the gld driver # kernel module. # # sparc architecture dependent @@ -56,6 +56,8 @@ ALL_TARGET = $(BINARY) LINT_TARGET = $(MODULE).lint INSTALL_TARGET = $(BINARY) $(ROOTMODULE) +LDFLAGS += -dy -Nmisc/mac + # # Overrides. # -- cgit v1.2.3 From 7e95e8d94bf4ccccc742a5e7a1370afe0d890ae3 Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Wed, 13 May 2020 10:34:42 +0300 Subject: 12728 emlxs: symbol 'emlxs_prog_type_t' is multiply-defined Reviewed by: Gergő Doma Reviewed by: Andy Fiddaman Reviewed by: Yuri Pankov Reviewed by: Aurélien Larcher Approved by: Garrett D'Amore MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- usr/src/uts/common/sys/fibre-channel/fca/emlxs/emlxs_hw.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'usr/src') diff --git a/usr/src/uts/common/sys/fibre-channel/fca/emlxs/emlxs_hw.h b/usr/src/uts/common/sys/fibre-channel/fca/emlxs/emlxs_hw.h index ab4b4b4e6b..406c90303b 100644 --- a/usr/src/uts/common/sys/fibre-channel/fca/emlxs/emlxs_hw.h +++ b/usr/src/uts/common/sys/fibre-channel/fca/emlxs/emlxs_hw.h @@ -2140,7 +2140,7 @@ typedef struct #define SLI_FW_TYPE_101 SLI_FW_TYPE_SHIFT(0xb) /* LP101 */ -enum emlxs_prog_type +typedef enum emlxs_prog_type { TEST_PROGRAM, /* 0 */ UTIL_PROGRAM, /* 1 */ -- cgit v1.2.3 From d0b89bad7e1fdc02b67434ccc5d1c0e983e25583 Mon Sep 17 00:00:00 2001 From: Peter Tribble Date: Sat, 16 May 2020 15:24:14 +0100 Subject: 12743 man page spelling mistakes Reviewed by: Yuri Pankov Reviewed by: Toomas Soome Approved by: Garrett D'Amore --- usr/src/man/man1m/picld.1m | 18 ++-------- usr/src/man/man3ext/efi_alloc_and_init.3ext | 4 +-- usr/src/man/man3picl/picl_get_propinfo.3picl | 14 ++------ usr/src/man/man3picltree/libpicltree.3picltree | 17 ++------- usr/src/man/man3pool/pool_get_property.3pool | 15 ++------ usr/src/man/man3proc/Pclearsig.3proc | 4 +-- usr/src/man/man4/ippool.4 | 9 ++--- usr/src/man/man4/krb5.conf.4 | 18 ++-------- usr/src/man/man4/securenets.4 | 13 ++----- usr/src/man/man4/smb.4 | 7 ++-- usr/src/man/man4/smhba.conf.4 | 13 ++----- usr/src/man/man4/tnf_kernel_probes.4 | 41 +++------------------- usr/src/man/man5/condition.5 | 20 ++--------- usr/src/man/man5/epoll.5 | 6 ++-- usr/src/man/man5/locale.5 | 48 +++++++++----------------- 15 files changed, 49 insertions(+), 198 deletions(-) (limited to 'usr/src') diff --git a/usr/src/man/man1m/picld.1m b/usr/src/man/man1m/picld.1m index debb8bf9af..d233fb7444 100644 --- a/usr/src/man/man1m/picld.1m +++ b/usr/src/man/man1m/picld.1m @@ -3,18 +3,15 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. 
See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH PICLD 1M "Aug 10, 2004" +.TH PICLD 1M "May 16, 2020" .SH NAME picld \- PICL daemon .SH SYNOPSIS -.LP .nf \fB/usr/lib/picl/picld\fR .fi .SH DESCRIPTION -.sp -.LP The Platform Information and Control Library (\fBPICL\fR) provides a mechanism to publish platform-specific information for clients to access in a platform-independent way. \fBpicld\fR maintains and controls access to the @@ -29,8 +26,6 @@ information. After the plug-in modules are initialized, the daemon opens the \fBPICL\fR daemon door to service client requests to access information in the \fBPICL\fR tree. .SS "PICL Tree" -.sp -.LP The \fBPICL\fR tree is the repository of all the nodes and properties created by the plug-in modules to represent the platform configuration. Every node in the \fBPICL\fR tree is an instance of a well-defined \fBPICL\fR class. The name @@ -42,15 +37,13 @@ nodes in the \fBPICL\fR tree have well-known names. For example, the name of the root node of the \fBPICL\fR tree is \fB/\fR and the name of the root node of the sub-tree containing platform device nodes is \fBplatform\fR. .SS "PICL plug-in Modules" -.sp -.LP The \fBPICL\fR plug-in modules are shared objects that publish platform-specific data in the \fBPICL\fR tree. They are located in well-known directories so that the daemon can locate and load them. .sp .LP Plug-in modules are located in one of the following plug-in directories -depending on the plaform-specific nature of the data that they collect and +depending on the platform-specific nature of the data that they collect and publish: .sp .in +2 @@ -92,11 +85,8 @@ environmental information in the \fBPICL\fR tree so clients can access them. Clients use the \fBlibpicl\fR(3PICL) interface to send requests to \fBpicld\fR for accessing the \fBPICL\fR tree. .SH EXIT STATUS -.sp -.LP \fBpicld\fR does not return an exit status. .SH FILES -.sp .ne 2 .na \fB\fB/var/run/picld_door\fR\fR @@ -115,14 +105,10 @@ for accessing the \fBPICL\fR tree. .RE .SH SEE ALSO -.sp -.LP \fBsvcs\fR(1), \fBsvcadm\fR(1M), \fBdlopen\fR(3C), \fBlibpicl\fR(3PICL), \fBlibpicltree\fR(3PICLTREE), \fBpicld_log\fR(3PICLTREE), \fBpicld_plugin_register\fR(3PICLTREE), \fBattributes\fR(5), \fBsmf\fR(5) .SH NOTES -.sp -.LP The \fBpicld\fR service is managed by the service management facility, \fBsmf\fR(5), under the service identifier: .sp diff --git a/usr/src/man/man3ext/efi_alloc_and_init.3ext b/usr/src/man/man3ext/efi_alloc_and_init.3ext index bd7bb22af6..ad4c21a543 100644 --- a/usr/src/man/man3ext/efi_alloc_and_init.3ext +++ b/usr/src/man/man3ext/efi_alloc_and_init.3ext @@ -4,7 +4,7 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. 
.\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH EFI_ALLOC_AND_INIT 3EXT "November 20, 2019" +.TH EFI_ALLOC_AND_INIT 3EXT "May 16, 2020" .SH NAME efi_alloc_and_init, efi_alloc_and_read, efi_free, efi_write, efi_use_whole_disk, efi_reserved_sectors \- manipulate a disk's EFI Partition Table @@ -100,7 +100,7 @@ struct dk_part efi_parts[]; /* array of partitions */ .SS "Protective Master Boot Record" When a disk receives an EFI label, a protective MBR (\fBPMBR\fR) is also -written containing a single partiton of type \fBEEh\fR and spanning the +written containing a single partition of type \fBEEh\fR and spanning the entire disk (up to the limit of what can be represented in an MBR). By default that partition is placed in slot 0 of the PMBR and not marked as active. Some BIOS implementations contain bugs that require the entry to be diff --git a/usr/src/man/man3picl/picl_get_propinfo.3picl b/usr/src/man/man3picl/picl_get_propinfo.3picl index 1581d57e5a..ddf18cbb4b 100644 --- a/usr/src/man/man3picl/picl_get_propinfo.3picl +++ b/usr/src/man/man3picl/picl_get_propinfo.3picl @@ -3,11 +3,10 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH PICL_GET_PROPINFO 3PICL "Mar 28, 2000" +.TH PICL_GET_PROPINFO 3PICL "May 16, 2020" .SH NAME picl_get_propinfo \- get the information about a property .SH SYNOPSIS -.LP .nf \fBcc\fR [ \fIflag\fR... ] \fIfile\fR... \fB-lpicl\fR [ \fIlibrary\fR... ] #include @@ -17,8 +16,6 @@ picl_get_propinfo \- get the information about a property .fi .SH DESCRIPTION -.sp -.LP The \fBpicl_get_propinfo()\fR function gets the information about the property specified by handle \fIproph\fR and copies it into the location specified by \fIpinfo\fR. The property information includes the property type, access mode, @@ -29,8 +26,6 @@ page. The maximum size of a property value is specified by \fBPICL_PROPSIZE_MAX\fR. It is currently set to 512KB. .SH RETURN VALUES -.sp -.LP Upon successful completion, \fB0\fR is returned. On failure, a non-negative integer is returned to indicate an error. .sp @@ -44,7 +39,6 @@ This error may be returned for a previously valid handle if the daemon was brought down and restarted. When this occurs a client must revalidate any saved handles. 
.SH ERRORS -.sp .ne 2 .na \fB\fBPICL_NOTINITIALIZED\fR\fR @@ -86,7 +80,7 @@ Invalid handle specified \fB\fBPICL_STALEHANDLE\fR\fR .ad .RS 23n -Stale handle specifie +Stale handle specified .RE .sp @@ -99,8 +93,6 @@ General system failure .RE .SH ATTRIBUTES -.sp -.LP See \fBattributes\fR(5) for descriptions of the following attributes: .sp @@ -115,7 +107,5 @@ MT-Level MT-Safe .TE .SH SEE ALSO -.sp -.LP \fBlibpicl\fR(3PICL), \fBpicl_get_propval\fR(3PICL), \fBpicl_get_propval_by_name\fR(3PICL), \fBattributes\fR(5) diff --git a/usr/src/man/man3picltree/libpicltree.3picltree b/usr/src/man/man3picltree/libpicltree.3picltree index 2651d0d63d..3732c1a259 100644 --- a/usr/src/man/man3picltree/libpicltree.3picltree +++ b/usr/src/man/man3picltree/libpicltree.3picltree @@ -3,19 +3,16 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH LIBPICLTREE 3PICLTREE "Mar 1, 2004" +.TH LIBPICLTREE 3PICLTREE "May 16, 2020" .SH NAME libpicltree \- PTree and Plug-in Registration interface library .SH SYNOPSIS -.LP .nf \fBcc\fR [\fIflag \&.\|.\|.\fR] \fIfile \fR\&.\|.\|. \fB-lpicltree\fR [\fIlibrary \&.\|.\|.\fR] #include .fi .SH DESCRIPTION -.sp -.LP The PTree interface is the set of functions and data structures to access and manipulate the PICL tree. The daemon and the plug-in modules use the PTree interface. @@ -53,8 +50,6 @@ properties. The maximum size of a property value cannot exceed \fBPICL_PROPSIZE_MAX\fR. It is currently set to 512KB. .SS "Volatile Properties" -.sp -.LP In addition to \fBPICL_READ\fR and \fBPICL_WRITE\fR property access modes, the plug-in modules specify whether a property is volatile or not by setting the bit \fBPICL_VOLATILE\fR. @@ -142,11 +137,9 @@ the maximum possible size of the value. The maximum size of the value cannot exceed \fBPICL_PROPSIZE_MAX\fR. This allows a client to allocate a sufficiently large buffer before retrieving a volatile property's value .SS "Plug-in Modules" -.sp -.LP Plug-in modules are shared objects that are located in well-known directories -for the daemon to locate and load them. Plug-in module's are located in the one -of the following plug-in directories depending on the plaform-specific nature +for the daemon to locate and load them. Plug-in modules are located in the one +of the following plug-in directories depending on the platform-specific nature of the data they collect and publish. .sp .in +2 @@ -171,8 +164,6 @@ registration. The plug-in modules may use the \fBpicld_log\fR(3PICLTREE) function to log their messages to the system log file. 
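
The registration handshake described above is small enough to sketch. Assuming
the picld_plugin_reg_t structure and registration entry points declared in
<picltree.h>, a minimal plug-in might look like the following; the plug-in
name, criticality value, and empty callbacks are placeholders:

    #include <picltree.h>

    static void example_init(void);
    static void example_fini(void);

    /* runs when picld dlopen()s this shared object */
    #pragma init(example_register)

    static picld_plugin_reg_t example_reg = {
            PICLD_PLUGIN_VERSION_1,
            PICLD_PLUGIN_NON_CRITICAL,
            "example_plugin",
            example_init,
            example_fini
    };

    static void
    example_register(void)
    {
            if (picld_plugin_register(&example_reg) != PICL_SUCCESS)
                    picld_log("example_plugin: registration failed");
    }

    static void
    example_init(void)
    {
            /* create nodes and properties with the ptree_* interfaces here */
    }

    static void
    example_fini(void)
    {
    }

Built as a shared object and installed in one of the plug-in directories listed
above, a plug-in like this would be located and loaded by the daemon at
startup.
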
.SH ATTRIBUTES -.sp -.LP See \fBattributes\fR(5) for descriptions of the following attributes: .sp @@ -187,7 +178,5 @@ MT-Level MT-Safe .TE .SH SEE ALSO -.sp -.LP \fBlibpicl\fR(3PICL), \fBlibpicltree\fR(3LIB), \fBpicld_log\fR(3PICLTREE), \fBpicld_plugin_register\fR(3PICLTREE), \fBattributes\fR(5) diff --git a/usr/src/man/man3pool/pool_get_property.3pool b/usr/src/man/man3pool/pool_get_property.3pool index 410c40ded2..39ac2fb084 100644 --- a/usr/src/man/man3pool/pool_get_property.3pool +++ b/usr/src/man/man3pool/pool_get_property.3pool @@ -3,12 +3,11 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH POOL_GET_PROPERTY 3POOL "Sep 23, 2003" +.TH POOL_GET_PROPERTY 3POOL "May 16, 2020" .SH NAME pool_get_property, pool_put_property, pool_rm_property, pool_walk_properties \- resource pool element property manipulation .SH SYNOPSIS -.LP .nf cc [ \fIflag\fR\&.\|.\|. ] \fIfile\fR\&.\|.\|. \fB-lpool\fR [ \fIlibrary\fR\&.\|.\|. ] #include @@ -38,8 +37,6 @@ cc [ \fIflag\fR\&.\|.\|. ] \fIfile\fR\&.\|.\|. \fB-lpool\fR [ \fIlibrary\fR\&.\| .fi .SH DESCRIPTION -.sp -.LP The various pool types are converted to the common pool element type (\fBpool_elem_t\fR) before property manipulation. A \fBpool_value_t\fR is an opaque type that contains a property value of one of the following types: @@ -110,7 +107,7 @@ error. .LP The \fBpool_rm_property()\fR function attempts to remove the named property from the element. If the property does not exist or is not removable, -1 is -returned and \fBpool_error\fR(3POOL) reporst an error of \fBPOE_PUTPROP\fR. +returned and \fBpool_error\fR(3POOL) reports an error of \fBPOE_PUTPROP\fR. .sp .LP The \fBpool_walk_properties()\fR function invokes \fIcallback\fR on all @@ -123,8 +120,6 @@ A number of special properties are reserved for internal use and cannot be set or removed. Attempting to do so will fail. These properties are documented on the \fBlibpool\fR(3LIB) manual page. .SH RETURN VALUES -.sp -.LP Upon successful completion, \fBpool_get_property()\fR returns the type of the property. Otherwise it returns \fBPOC_INVAL\fR and \fBpool_error()\fR returns the pool-specific error value. @@ -134,8 +129,6 @@ Upon successful completion, \fBpool_put_property()\fR, \fBpool_rm_property()\fR, and \fBpool_walk_properties()\fR return 0. Otherwise they return \(mi1 and \fBpool_error()\fR returns the pool-specific error value. .SH ERRORS -.sp -.LP The \fBpool_get_property()\fR function will fail if: .sp .ne 2 @@ -253,8 +246,6 @@ A system error has occurred. Check the system error code for more details. 
.RE .SH ATTRIBUTES -.sp -.LP See \fBattributes\fR(5) for descriptions of the following attributes: .sp @@ -273,6 +264,4 @@ MT-Level Safe .TE .SH SEE ALSO -.sp -.LP \fBlibpool\fR(3LIB), \fBpool_error\fR(3POOL), \fBattributes\fR(5) diff --git a/usr/src/man/man3proc/Pclearsig.3proc b/usr/src/man/man3proc/Pclearsig.3proc index 23f81df918..116e23c391 100644 --- a/usr/src/man/man3proc/Pclearsig.3proc +++ b/usr/src/man/man3proc/Pclearsig.3proc @@ -11,7 +11,7 @@ .\" .\" Copyright 2015 Joyent, Inc. .\" -.Dd May 11, 2016 +.Dd May 16, 2020 .Dt PCLEARSIG 3PROC .Os .Sh NAME @@ -50,7 +50,7 @@ The function is identical to the .Fn Pclearsig function, except rather than operating on the process and its -representive thread, it instead operates on the thread handle +representative thread, it instead operates on the thread handle .Fa L . .Pp The diff --git a/usr/src/man/man4/ippool.4 b/usr/src/man/man4/ippool.4 index d70b5cad9a..1b6586a4b0 100644 --- a/usr/src/man/man4/ippool.4 +++ b/usr/src/man/man4/ippool.4 @@ -1,10 +1,10 @@ '\" te .\" To view license terms, attribution, and copyright for IP Filter, the -.\" default path is /usr/lib/ipf/IPFILTER.LICENCE. If the Illumos operating +.\" default path is /usr/lib/ipf/IPFILTER.LICENCE. If the illumos operating .\" environment has been installed anywhere other than the default, modify the .\" given path to access the file at the installed location. .\" Portions Copyright (c) 2015, Joyent, Inc. -.TH IPPOOL 4 "April 9, 2016" +.TH IPPOOL 4 "May 16, 2020" .SH NAME ippool, ippool.conf \- IP Pool file format .SH DESCRIPTION @@ -62,7 +62,6 @@ of a mixture of netmask sizes, from 0 to 32. .PP At this point in time, only IPv4 addressing is supported. .SH OVERVIEW -.PP The IP pool configuration file provides for defining two different mechanisms for improving speed in matching IP addresses with rules. The first, @@ -86,7 +85,6 @@ or respectively, for determining which filter group to jump to next for continuation of filter packet processing. .SH POOL TYPES -.PP Two storage formats are provided: hash tables and tree structure. The hash table is intended for use with objects all containing the same netmask or a few different sized netmasks of non-overlapping address space and the tree @@ -96,8 +94,7 @@ to use the tree data storage type with .B group-map configuration entries. .SH POOL ROLES -.PP -When a pool is defined in the configruation file, it must have an associated +When a pool is defined in the configuration file, it must have an associated role. At present the only supported role is .B ipf. Future development will see further expansion of their use by other sections diff --git a/usr/src/man/man4/krb5.conf.4 b/usr/src/man/man4/krb5.conf.4 index c97d2a7eae..4d18ab5362 100644 --- a/usr/src/man/man4/krb5.conf.4 +++ b/usr/src/man/man4/krb5.conf.4 @@ -3,17 +3,15 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH KRB5.CONF 4 "Nov 26, 2017" +.TH KRB5.CONF 4 "May 16, 2020" .SH NAME krb5.conf \- Kerberos configuration file .SH SYNOPSIS -.LP .nf /etc/krb5/krb5.conf .fi .SH DESCRIPTION -.LP The \fBkrb5.conf\fR file contains Kerberos configuration information, including the locations of \fBKDC\fRs and administration daemons for the Kerberos realms of interest, defaults for the current realm and for Kerberos applications, and @@ -141,7 +139,6 @@ For a Key Distribution Center (\fBKDC\fR), can contain the location of the .RE .SS "The \fB[libdefaults]\fR Section" -.LP The \fB[libdefaults]\fR section can contain any of the following relations: .sp .ne 2 @@ -346,7 +343,7 @@ above. Indicates whether DNS SRV records need to be used to locate the KDCs and the other servers for a realm, if they have not already been listed in the \fB[realms]\fR section. This option makes the machine vulnerable to a certain -type of DoS attack if somone spoofs the DNS records and does a redirect to +type of DoS attack if someone spoofs the DNS records and does a redirect to another server. This is, however, no worse than a DoS, since the bogus KDC is unable to decode anything sent (excepting the initial ticket request, which has no encrypted data). Also, anything the fake KDC sends out isl not trusted @@ -407,7 +404,6 @@ to make it a network-wide setting for all realms. .RE .SS "The \fB[appdefaults]\fR Section" -.LP This section contains subsections for Kerberos V5 applications, where \fIrelation-subsection\fR is the name of an application. Each subsection contains relations that define the default behaviors for that application. @@ -589,7 +585,6 @@ In the following example, \fBkinit\fR gets forwardable tickets by default and The application defaults specified here are overridden by those specified in the \fB[realms]\fR section. .SS "The \fB[realms]\fR Section" -.LP This section contains subsections for Kerberos realms, where \fIrelation-subsection\fR is the name of a realm. Each subsection contains relations that define the properties for that particular realm. The following @@ -869,7 +864,6 @@ parameters. Most often, you need to specify them only when using a non-Solaris-based Kerberos server. Otherwise, the change request is sent over \fBRPCSEC_GSS\fR to the Solaris Kerberos administration server. .SS "The \fB[domain_realm]\fR Section" -.LP This section provides a translation from a domain name or hostname to a Kerberos realm name. The \fIrelation\fR can be a host name, or a domain name, where domain names are indicated by a period (`\fB\&.\fR') prefix. @@ -902,7 +896,6 @@ default into the \fBFUBAR.ORG\fR realm. The entries for the hosts \fBmit.edu\fR and \fBfubar.org\fR. Without these entries, these hosts would be mapped into the Kerberos realms \fBEDU\fR and \fBORG\fR, respectively. .SS "The \fB[logging]\fR Section" -.LP This section indicates how Kerberos programs are to perform logging. There are two types of relations for this section: relations to specify how to log and a relation to specify how to rotate \fBkdc\fR log files. @@ -1111,7 +1104,6 @@ with a specified time interval of a day. .sp .SS "The \fB[capaths]\fR Section" -.LP In order to perform direct (non-hierarchical) cross-realm authentication, a database is needed to construct the authentication paths between the realms. This section defines that database. 
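
As a sketch of the syntax (the realm names here are hypothetical; the page's
own examples are more complete): each client realm names every destination
realm together with either an intermediate realm or ".", which denotes a
direct path.

    [capaths]
        ANL.GOV = {
            TEST.ANL.GOV = .
            PNL.GOV = ES.NET
            NERSC.GOV = ES.NET
            ES.NET = .
        }

Here clients in ANL.GOV reach TEST.ANL.GOV and ES.NET directly, while paths to
PNL.GOV and NERSC.GOV pass through the intermediate realm ES.NET.
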
@@ -1215,7 +1207,6 @@ In the above examples, the ordering is not important, except when the same relation is used more than once. The client uses this to determine the path. (It is not important to the server, since the transited field is not sorted.) .SS "PKINIT-specific Options" -.LP The following are \fBpkinit-specific\fR options. These values can be specified in \fB[libdefaults]\fR as global defaults, or within a realm-specific subsection of \fB[libdefaults]\fR, or can be specified as realm-specific values @@ -1646,7 +1637,6 @@ been set to \fBFILE:/tmp/my_proxy.pem\fR. .RE .SS "The \fB[dbmodules]\fR Section" -.LP This section consists of relations that provide configuration information for plug-in modules. In particular, the relations describe the configuration for LDAP KDB plug-in. Use of the \fBdb2\fR KDB plug-in is the default behavior and @@ -1761,7 +1751,6 @@ Port number for SSL connection with directory server. The default is \fB389\fR. .RE .SH EXAMPLES -.LP \fBExample 1 \fRSample File .sp .LP @@ -1840,7 +1829,6 @@ a Kerberos configuration file when the KDC is using the LDAP KDB plug-in. .RE .SH ATTRIBUTES -.LP See \fBattributes\fR(5) for descriptions of the following attributes: .sp @@ -1859,12 +1847,10 @@ Interface Stability See below. All of the keywords are Committed, except for the \fBPKINIT\fR keywords, which are Volatile. .SH SEE ALSO -.LP \fBkinit\fR(1), \fBrcp\fR(1), \fBrdist\fR(1), \fBrlogin\fR(1), \fBrsh\fR(1), \fBtelnet\fR(1), \fBsyslog\fR(3C), \fBattributes\fR(5), \fBkerberos\fR(5), \fBregex\fR(5) .SH NOTES -.LP If the \fBkrb5.conf\fR file is not formatted properly, the \fBtelnet\fR command fails. However, the \fBdtlogin\fR and \fBlogin\fR commands still succeed, even if the \fBkrb5.conf\fR file is specified as required for the commands. If this diff --git a/usr/src/man/man4/securenets.4 b/usr/src/man/man4/securenets.4 index 5500c61fa6..825b72f835 100644 --- a/usr/src/man/man4/securenets.4 +++ b/usr/src/man/man4/securenets.4 @@ -4,18 +4,15 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH SECURENETS 4 "Apr 26, 1999" +.TH SECURENETS 4 "May 16, 2020" .SH NAME securenets \- configuration file for NIS security .SH SYNOPSIS -.LP .nf \fB/var/yp/securenets\fR .fi .SH DESCRIPTION -.sp -.LP The \fB/var/yp/securenets\fR file defines the networks or hosts which are allowed access to information by the Network Information Service ("\fBNIS\fR"). 
.sp @@ -66,7 +63,6 @@ hosts 127.0.0.1 .sp .SH EXAMPLES -.LP \fBExample 1 \fRAccess for Individual Entries .sp .LP @@ -121,7 +117,7 @@ The entry for access to a class B network could be: .sp .LP -\fBExample 4 \fRAccess for an Invidual IPv6 Address +\fBExample 4 \fRAccess for an Individual IPv6 Address .sp .LP Similarly, to allow access for an individual IPv6 address: @@ -161,7 +157,6 @@ ffff:: fe80:: .sp .SH FILES -.sp .ne 2 .na \fB\fB/var/yp/securenets\fR\fR @@ -171,12 +166,8 @@ Configuration file for \fBNIS\fR security. .RE .SH SEE ALSO -.sp -.LP \fBypserv\fR(1M), \fBypstart\fR(1M), \fBypstop\fR(1M), \fBypxfrd\fR(1M) .SH NOTES -.sp -.LP The Network Information Service (NIS) was formerly known as Sun Yellow Pages (YP). The functionality of the two remains the same; only the name has changed. The name Yellow Pages is a registered trademark in the United Kingdom diff --git a/usr/src/man/man4/smb.4 b/usr/src/man/man4/smb.4 index c593a52178..b800c3a7b9 100644 --- a/usr/src/man/man4/smb.4 +++ b/usr/src/man/man4/smb.4 @@ -16,11 +16,10 @@ .\" fields enclosed by brackets "[]" replaced with your own identifying .\" information: Portions Copyright [yyyy] [name of copyright owner] .\" -.TH SMB 4 "Apr 23, 2015" +.TH SMB 4 "May 16, 2020" .SH NAME smb \- configuration properties for Solaris CIFS server .SH DESCRIPTION -.LP Behavior of the Solaris CIFS server is defined by property values that are stored in the Service Management Facility, \fBsmf\fR(5). .sp @@ -419,7 +418,7 @@ Controls whether "oplocks" may be granted by the SMB server. The term "oplock" is short for "opportunistic lock", which is the legacy name for cache delegations in SMB. By default, oplocks are enabled. -Note that if oplocks are disabled, file I/O perfrormance may be +Note that if oplocks are disabled, file I/O performance may be severely reduced. .RE @@ -549,7 +548,6 @@ set. .RE .SH ATTRIBUTES -.LP See the \fBattributes\fR(5) man page for descriptions of the following attributes: .sp @@ -565,6 +563,5 @@ Interface Stability Uncommitted .TE .SH SEE ALSO -.LP \fBsharectl\fR(1M), \fBsmbadm\fR(1M), \fBsmbd\fR(1M), \fBsmbstat\fR(1M), \fBattributes\fR(5), \fBsmf\fR(5) diff --git a/usr/src/man/man4/smhba.conf.4 b/usr/src/man/man4/smhba.conf.4 index 91ed33f378..a960c3526f 100644 --- a/usr/src/man/man4/smhba.conf.4 +++ b/usr/src/man/man4/smhba.conf.4 @@ -3,12 +3,10 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. .\" See the License for the specific language governing permissions and limitations under the License. When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with .\" the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH SMHBA.CONF 4 "Sep 28, 2009" +.TH SMHBA.CONF 4 "May 16, 2020" .SH NAME smhba.conf \- configuration file for the SMHBAAPI library .SH DESCRIPTION -.sp -.LP The \fB/etc/smhba.conf\fR file is used to specify the Vendor-Specific Libraries that are installed on the system. This file is used by the Common Library to load the individual VSLs when \fBHBA_LoadLibrary\fR(3HBAAPI) is called. 
If @@ -49,7 +47,6 @@ is the absolute path to the shared object library file. .RE .SH EXAMPLES -.LP \fBExample 1 \fRContents of \fB/etc/smhba.conf\fR .sp .in +2 @@ -69,8 +66,6 @@ com.sun.sashba64 /usr/lib/64/libsun_sas.so.1 .in -2 .SH ATTRIBUTES -.sp -.LP See \fBattributes\fR(5) for descriptions of the following attributes: .sp @@ -84,17 +79,13 @@ _ Interface Stability Committed _ Standard T{ -ANSI INCITS 428 Storage Management Host Bus Adapter Application Programming Ingerface(SM-HBA) +ANSI INCITS 428 Storage Management Host Bus Adapter Application Programming Interface(SM-HBA) T} .TE .SH SEE ALSO -.sp -.LP \fBHBA_LoadLibrary\fR(3HBAAPI), \fBlibSMHBAAPI\fR(3LIB), \fBattributes\fR(5) .SH NOTES -.sp -.LP The SMHBAAPI library is provided in both 32-and 64-bit versions, but only one configuration file is specified. As a result, both 32- and 64-bit VSL libraries must be specified within the same file. When using the 32-bit Common Library, diff --git a/usr/src/man/man4/tnf_kernel_probes.4 b/usr/src/man/man4/tnf_kernel_probes.4 index 7d5c3c1ee0..b7c84e1e41 100644 --- a/usr/src/man/man4/tnf_kernel_probes.4 +++ b/usr/src/man/man4/tnf_kernel_probes.4 @@ -3,12 +3,10 @@ .\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. .\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. See the License for the specific language governing permissions and limitations under the License. .\" When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] -.TH TNF_KERNEL_PROBES 4 "Nov 8, 1999" +.TH TNF_KERNEL_PROBES 4 "May 16, 2020" .SH NAME tnf_kernel_probes \- TNF kernel probes .SH DESCRIPTION -.sp -.LP The set of probes (trace instrumentation points) available in the standard kernel. The probes log trace data to a kernel trace buffer in Trace Normal Form (TNF). Kernel probes are controlled by \fBprex\fR(1). A snapshot of the @@ -22,7 +20,6 @@ from \fBprex\fR(1). A probe that is enabled for tracing generates a \fBTNF\fR record, called an \fIevent record\fR. An event record contains two common members and may contain other probe-specific data members. .SS "Common Members" -.sp .in +2 .nf \fBtnf_probe_event\fR \fItag\fR @@ -70,7 +67,6 @@ time of the event. .SS "Threads" .SS "\fBthread_create\fR" -.sp .in +2 .nf \fBtnf_kthread_id\fR \fItid\fR @@ -110,7 +106,6 @@ The kernel address of its start routine. .RE .SS "\fBthread_state\fR" -.sp .in +2 .nf \fBtnf_kthread_id\fR \fItid\fR @@ -181,16 +176,11 @@ is implicit in the system call entry and exit events. .RE .SS "thread_exit" -.sp -.LP Thread termination event for writing thread. This probe has no data members other than the common members. .SS "Scheduling" -.sp -.LP \fB\fR .SS "thread_queue" -.sp .in +2 .nf \fBtnf_kthread_id\fR \fItid\fR @@ -233,7 +223,6 @@ The current length of the cpu's dispatch queue. .SS "Blocking" .SS "\fBthread_block\fR" -.sp .in +2 .nf \fBtnf_opaque\fR \fIreason\fR @@ -266,7 +255,6 @@ stack at the time the thread blocks. .SS "System Calls" .SS "\fBsyscall_start\fR" -.sp .in +2 .nf \fBtnf_sysnum\fR \fIsysnum\fR @@ -287,7 +275,6 @@ microstate with this event. 
.RE .SS "\fBsyscall_end\fR" -.sp .in +2 .nf \fBtnf_long\fR \fIrval1\fR @@ -322,7 +309,6 @@ The error return. The writing thread implicitly enters the \fIuser\fR microstate with this event. .SS "Page Faults" .SS "\fBaddress_fault\fR" -.sp .in +2 .nf \fBtnf_opaque\fR \fIaddress\fR @@ -364,7 +350,6 @@ for these two members are defined in <\fBvm/seg_enum.h\fR>. .RE .SS "\fBmajor_fault\fR" -.sp .in +2 .nf \fBtnf_opaque\fR \fIvnode\fR @@ -379,7 +364,6 @@ Major page fault event. The faulting page is mapped to the file given by the virtual address is in the most recent \fBaddress_fault\fR event for the writing thread.) .SS "\fBanon_private\fR" -.sp .in +2 .nf \fBtnf_opaque\fR \fIaddress\fR @@ -399,7 +383,6 @@ The virtual address at which the new page is mapped. .RE .SS "\fBanon_zero\fR" -.sp .in +2 .nf \fBtnf_opaque\fR \fIaddress\fR @@ -419,7 +402,6 @@ The virtual address at which the new page is mapped. .RE .SS "\fBpage_unmap\fR" -.sp .in +2 .nf \fBtnf_opaque\fR \fIvnode\fR @@ -442,7 +424,6 @@ Identifies the file and offset of the page being unmapped. .SS "Pageins and Pageouts" .SS "\fBpagein\fR" -.sp .in +2 .nf \fBtnf_opaque\fR \fIvnode\fR @@ -460,7 +441,7 @@ Pagein start event. This event signals the initiation of pagein I/O. \fB\fIvnode\fRand\fIoffset\fR\fR .ad .RS 18n -Identifyies the file and offset to be paged in. +Identifies the file and offset to be paged in. .RE .sp @@ -473,7 +454,6 @@ Specifies the number of bytes to be paged in. .RE .SS "\fBpageout\fR" -.sp .in +2 .nf \fBtnf_opaque\fR \fIvnode\fR @@ -524,7 +504,6 @@ The number of pages reclaimed after being written out. .SS "Page Daemon (Page Stealer)" .SS "\fBpageout_scan_start\fR" -.sp .in +2 .nf \fBtnf_ulong\fR \fIpages_free\fR @@ -555,7 +534,6 @@ The number of pages desired free. .RE .SS "\fBpageout_scan_end\fR" -.sp .in +2 .nf \fBtnf_ulong\fR \fIpages_free\fR @@ -588,7 +566,6 @@ be freed when any queued pageout requests complete.) .SS "Swapper" .SS "\fBswapout_process\fR" -.sp .in +2 .nf \fBtnf_pid\fR \fIpid\fR @@ -619,7 +596,6 @@ Reports the number of pages either freed or queued for pageout. .RE .SS "\fBswapout_lwp\fR" -.sp .in +2 .nf \fBtnf_pid\fR \fIpid\fR @@ -670,7 +646,6 @@ The number of pages swapped out. .RE .SS "\fBswapin_lwp\fR" -.sp .in +2 .nf \fBtnf_pid\fR \fIpid\fR @@ -722,7 +697,6 @@ The number of pages swapped in. .SS "Local I/O" .SS "\fBstrategy\fR" -.sp .in +2 .nf \fBtnf_device\fR \fIdevice\fR @@ -783,7 +757,6 @@ The \fBbuf\fR(9S) flags associated with the transfer. .RE .SS "\fBbiodone\fR" -.sp .in +2 .nf \fBtnf_device\fR \fIdevice\fR @@ -824,7 +797,6 @@ The kernel address of the \fBbuf\fR(9S) structure associated with the transfer. .RE .SS "\fBphysio_start\fR" -.sp .in +2 .nf \fBtnf_device\fR \fIdevice\fR @@ -837,7 +809,7 @@ The kernel address of the \fBbuf\fR(9S) structure associated with the transfer. .sp .LP Raw I/O start event. This event marks entry into the \fBphysio\fR(9F) -fufnction which performs unbuffered I/O. +function which performs unbuffered I/O. .sp .ne 2 .na @@ -875,7 +847,6 @@ The direction of the transfer: read or write (see \fBbuf\fR(9S)). .RE .SS "\fBphysio_end\fR" -.sp .in +2 .nf \fBtnf_device\fR \fIdevice\fR @@ -884,7 +855,7 @@ The direction of the transfer: read or write (see \fBbuf\fR(9S)). .sp .LP -Raw I/O end event. This event marks exit from the \fBphysio\fR(9F) fufnction. +Raw I/O end event. This event marks exit from the \fBphysio\fR(9F) function. .sp .ne 2 .na @@ -895,8 +866,6 @@ The major and minor numbers of the device of the transfer. 
.RE .SH USAGE -.sp -.LP Use the \fBprex\fR utility to control kernel probes. The standard \fBprex\fR commands to list and manipulate probes are available to you, along with commands to set up and manage kernel tracing. @@ -968,8 +937,6 @@ A convenient way to follow these steps is to use two shell windows; run an interactive \fBprex\fR session in one, and run your application and \fBtnfxtract\fR in the other. .SH SEE ALSO -.sp -.LP \fBprex\fR(1), \fBtnfdump\fR(1), \fBtnfxtract\fR(1), \fBlibtnfctl\fR(3TNF), \fBTNF_PROBE\fR(3TNF), \fBtracing\fR(3TNF), \fBstrategy\fR(9E), \fBbiodone\fR(9F), \fBphysio\fR(9F), \fBbuf\fR(9S) diff --git a/usr/src/man/man5/condition.5 b/usr/src/man/man5/condition.5 index 5ebaf3ef50..1282ea26e8 100644 --- a/usr/src/man/man5/condition.5 +++ b/usr/src/man/man5/condition.5 @@ -44,12 +44,10 @@ .\" Copyright (c) 1998 Sun Microsystems, Inc. All Rights Reserved. .\" Copyright (c) 2001, The IEEE and The Open Group. All Rights Reserved. .\" -.TH CONDITION 5 "Jul 20, 1998" +.TH CONDITION 5 "May 16, 2020" .SH NAME condition \- concepts related to condition variables .SH DESCRIPTION -.sp -.LP Occasionally, a thread running within a mutex needs to wait for an event, in which case it blocks or sleeps. When a thread is waiting for another thread to communicate its disposition, it uses a condition variable in conjunction with a @@ -68,8 +66,6 @@ If another thread changes the condition, it may wake up waiting threads by signaling the associated condition variable. The waiting threads, upon awakening, reacquire the mutex and re-evaluate the condition. .SS "Initialize" -.sp -.LP Condition variables and mutexes should be global. Condition variables that are allocated in writable memory can synchronize threads among processes if they are shared by the cooperating processes (see \fBmmap\fR(2)) and are initialized @@ -99,26 +95,18 @@ initialization of the condition variable. Solaris condition variables also implement as the default, intra-process; however, they set this attribute according to the argument, \fItype\fR, passed to their initialization function. .SS "Condition Wait" -.sp -.LP The condition wait interface allows a thread to wait for a condition and atomically release the associated mutex that it needs to hold to check the condition. The thread waits for another thread to make the condition true and that thread's resulting call to signal and wakeup the waiting thread. .SS "Condition Signaling" -.sp -.LP A condition signal allows a thread to unblock the next thread waiting on the condition variable, whereas, a condition broadcast allows a thread to unblock all threads waiting on the condition variable. .SS "Destroy" -.sp -.LP The condition destroy functions destroy any state, but not the space, associated with the condition variable. .SH ATTRIBUTES -.sp -.LP See \fBattributes\fR(5) for descriptions of the following attributes: .sp @@ -133,8 +121,6 @@ MT-Level MT-Safe .TE .SH SEE ALSO -.sp -.LP \fBfork\fR(2), \fBmmap\fR(2), \fBsetitimer\fR(2), \fBshmop\fR(2), \fBcond_broadcast\fR(3C), \fBcond_destroy\fR(3C), \fBcond_init\fR(3C), \fBcond_signal\fR(3C), \fBcond_timedwait\fR(3C), \fBcond_wait\fR(3C), @@ -144,13 +130,11 @@ MT-Level MT-Safe \fBpthread_condattr_init\fR(3C), \fBsignal\fR(3C), \fBattributes\fR(5), \fBmutex\fR(5), \fBstandards\fR(5) .SH NOTES -.sp -.LP If more than one thread is blocked on a condition variable, the order in which threads are unblocked is determined by the scheduling policy. 
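
The wait protocol this page describes (block, wake, reacquire the mutex,
re-evaluate the condition) is conventionally written as a retest loop. A
minimal sketch using the POSIX interfaces cross-referenced below; the
predicate "ready" is a placeholder:

    #include <pthread.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
    static int ready = 0;

    void *
    waiter(void *arg)
    {
            (void) pthread_mutex_lock(&lock);
            while (!ready)          /* re-evaluate the condition on wakeup */
                    (void) pthread_cond_wait(&cv, &lock);
            /* the condition holds and the mutex is held again */
            (void) pthread_mutex_unlock(&lock);
            return (arg);
    }

    void
    make_ready(void)
    {
            (void) pthread_mutex_lock(&lock);
            ready = 1;
            /* pthread_cond_broadcast() would unblock all waiters instead */
            (void) pthread_cond_signal(&cv);
            (void) pthread_mutex_unlock(&lock);
    }
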
.sp .LP -\fBUSYNC_THREAD\fR does not support multiple mapplings to the same logical +\fBUSYNC_THREAD\fR does not support multiple mappings to the same logical synch object. If you need to \fBmmap()\fR a synch object to different locations within the same address space, then the synch object should be initialized as a shared object \fBUSYNC_PROCESS\fR for Solaris, and diff --git a/usr/src/man/man5/epoll.5 b/usr/src/man/man5/epoll.5 index 1cf5d39947..2e64991cbf 100644 --- a/usr/src/man/man5/epoll.5 +++ b/usr/src/man/man5/epoll.5 @@ -8,7 +8,7 @@ .\" A full copy of the text of the CDDL should have accompanied this .\" source. A copy of the CDDL is also available via the Internet at .\" http://www.illumos.org/license/CDDL. -.TH EPOLL 5 "Apr 17, 2014" +.TH EPOLL 5 "May 16, 2020" .SH NAME epoll \- Linux-compatible I/O event notification facility .SH SYNOPSIS @@ -28,7 +28,7 @@ the Linux facility, including the following interfaces: .ie t \(bu .el o \fBepoll_create\fR(3C) creates an \fBepoll\fR instance, returning a file -descriptor. It contains a size arugment which is meaningful only in as +descriptor. It contains a size argument which is meaningful only in as much as it cannot be 0. .RE .RS +4 @@ -59,7 +59,7 @@ via \fBepoll_ctl\fR(3C), blocking the caller if no such events are pending. .TP .ie t \(bu .el o -\fBepoll_pwait\fR(3C) opeates in a similar manner to \fBepoll_wait\fR(3C), but +\fBepoll_pwait\fR(3C) operates in a similar manner to \fBepoll_wait\fR(3C), but allows the caller to specify a signal mask to be set atomically with respect to waiting for events. .RE diff --git a/usr/src/man/man5/locale.5 b/usr/src/man/man5/locale.5 index dd8d3ba858..e6d63073aa 100644 --- a/usr/src/man/man5/locale.5 +++ b/usr/src/man/man5/locale.5 @@ -43,12 +43,11 @@ .\" Copyright (c) 1992, X/Open Company Limited. All Rights Reserved. .\" Portions Copyright (c) 2003, Sun Microsystems, Inc. All Rights Reserved. .\" -.TH LOCALE 5 "April 9, 2016" +.TH LOCALE 5 "May 16, 2020" .SH NAME locale \- subset of a user's environment that depends on language and cultural conventions .SH DESCRIPTION -.LP A \fBlocale\fR is the definition of the subset of a user's environment that depends on language and cultural conventions. It is made up from one or more categories. Each category is identified by its name and controls specific @@ -139,7 +138,6 @@ the value of the corresponding environment variable is used. If the environment variable is unset or is set to the empty string, the \fBsetlocale()\fR function sets the appropriate environment. .SS "Locale Definition" -.LP Locales can be described with the file format accepted by the \fBlocaledef\fR utility. .sp @@ -341,7 +339,6 @@ decimal or hexadecimal constants. Symbolic names not present in the charmap file can be specified and will be ignored, as specified under item 1 above. .RE .SS "LC_CTYPE" -.LP The \fBLC_CTYPE\fR category defines character classification, case conversion and other character attributes. In addition, a series of characters can be represented by three adjacent periods representing an ellipsis symbol @@ -719,7 +716,6 @@ the mapping will be the reverse mapping of the one specified for \fBtoupper\fR. .RE .SS "LC_COLLATE" -.LP The \fBLC_COLLATE\fR category provides a collation sequence definition for numerous utilities (such as \fBsort\fR(1), \fBuniq\fR(1), and so forth), regular expression matching (see \fBregex\fR(5)), and the \fBstrcoll\fR(3C), @@ -744,7 +740,7 @@ as an entity). \fBUser-defined ordering of collating elements\fR. 
Each collating element is assigned a collation value defining its order in the character (or basic) collation sequence. This ordering is used by regular expressions and pattern -matching and, unless collation weights are explicity specified, also as the +matching and, unless collation weights are explicitly specified, also as the collation weight to be used in sorting. .RE .RS +4 @@ -833,7 +829,6 @@ Specify the end of the collation-order statements. .RE .SS "collating-element \fIkeyword\fR" -.LP In addition to the collating elements in the character set, the \fBcollating-element\fR keyword is used to define multi-character collating elements. The syntax is: @@ -868,7 +863,6 @@ Example: \fBcollating-element\fR <\fBll\fR> from "\fBll\fR" .in -2 .SS "collating-symbol \fIkeyword\fR" -.LP This keyword will be used to define symbols for use in collation sequence statements; that is, between the \fBorder_start\fR and the \fBorder_end\fR keywords. The syntax is: @@ -907,7 +901,6 @@ associated with a relative position in the character order sequence. While such a symbolic name does not represent any collating element, it can be used as a weight. .SS "order_start \fIkeyword\fR" -.LP The \fBorder_start\fR keyword must precede collation order entries and also defines the number of weights for this collation sequence definition and other collation rules. @@ -987,7 +980,6 @@ order_start forward;backward .LP If no operands are specified, a single \fBforward\fR operand is assumed. .SS "Collation Order" -.LP The \fBorder_start\fR keyword is followed by collating identifier entries. The syntax for the collating element entries is: .sp @@ -1139,7 +1131,7 @@ l l l l . \fBorder_start\fR \fBforward;backward\fR \fBUNDEFINED\fR \fBIGNORE;IGNORE\fR -\fB\fR +\fB\fR \fB\fR \fB;\fR \fB\&.\|.\|.\fR \fB;.\|.\|.\fR \fB\fR \fB;\fR @@ -1152,7 +1144,7 @@ l l . \fB\fR \fB;\fR \fB\fR \fB;\fR \fB\fR \fB"";""\fR -\fBorder_end\fR +\fBorder_end\fR .TE .sp @@ -1186,10 +1178,8 @@ collating symbol <\fBch\fR> and belongs to the same primary equivalence class as the multi-character collating element <\fBCh\fR>. .RE .SS "order_end \fIkeyword\fR" -.LP The collating order entries must be terminated with an \fBorder_end\fR keyword. .SS "LC_MONETARY" -.LP The \fBLC_MONETARY\fR category defines the rules and symbols that are used to format monetary numeric information. This information is available through the \fBlocaleconv\fR(3C) function @@ -1637,18 +1627,18 @@ The following table shows the result of various combinations: .TS l l l l l l l l l l l l . 
- \fBp_sep_by_space\fR - 2 1 0 -\fBp_cs_precedes\fR= 1 \fBp_sign_posn\fR= 0 \fB($1.25)\fR \fB($1.25)\fR \fB($1.25)\fR - \fBp_sign_posn\fR= 1 \fB+$1.25\fR \fB+$1.25\fR \fB+$1.25\fR - \fBp_sign_posn\fR= 2 \fB$1.25+\fR \fB$1.25+\fR \fB$1.25+\fR - \fBp_sign_posn\fR= 3 \fB+$1.25\fR \fB+$1.25\fR \fB+$1.25\fR - \fBp_sign_posn\fR= 4 \fB$+1.25\fR \fB$+1.25\fR \fB$+1.25\fR -\fBp_cs_precedes\fR= 0 \fBp_sign_posn\fR= 0 \fB(1.25 $)\fR \fB(1.25 $)\fR \fB(1.25$)\fR - \fBp_sign_posn\fR= 1 \fB+1.25 $\fR \fB+1.25 $\fR \fB+1.25$\fR - \fBp_sign_posn\fR= 2 \fB1.25$ +\fR \fB1.25 $+\fR \fB1.25$+\fR - \fBp_sign_posn\fR= 3 \fB1.25+ $\fR \fB1.25 +$\fR \fB1.25+$\fR - \fBp_sign_posn\fR= 4 \fB1.25$ +\fR \fB1.25 $+\fR \fB1.25$+\fR + \fBp_sep_by_space\fR + 2 1 0 +\fBp_cs_precedes\fR= 1 \fBp_sign_posn\fR= 0 \fB($1.25)\fR \fB($1.25)\fR \fB($1.25)\fR + \fBp_sign_posn\fR= 1 \fB+$1.25\fR \fB+$1.25\fR \fB+$1.25\fR + \fBp_sign_posn\fR= 2 \fB$1.25+\fR \fB$1.25+\fR \fB$1.25+\fR + \fBp_sign_posn\fR= 3 \fB+$1.25\fR \fB+$1.25\fR \fB+$1.25\fR + \fBp_sign_posn\fR= 4 \fB$+1.25\fR \fB$+1.25\fR \fB$+1.25\fR +\fBp_cs_precedes\fR= 0 \fBp_sign_posn\fR= 0 \fB(1.25 $)\fR \fB(1.25 $)\fR \fB(1.25$)\fR + \fBp_sign_posn\fR= 1 \fB+1.25 $\fR \fB+1.25 $\fR \fB+1.25$\fR + \fBp_sign_posn\fR= 2 \fB1.25$ +\fR \fB1.25 $+\fR \fB1.25$+\fR + \fBp_sign_posn\fR= 3 \fB1.25+ $\fR \fB1.25 +$\fR \fB1.25+$\fR + \fBp_sign_posn\fR= 4 \fB1.25$ +\fR \fB1.25 $+\fR \fB1.25$+\fR .TE .sp @@ -1696,7 +1686,6 @@ END LC_MONETARY The entry \fBn/a\fR indicates that the value is not available in the POSIX locale. .SS "LC_NUMERIC" -.LP The \fBLC_NUMERIC\fR category defines the rules and symbols that will be used to format non-monetary numeric information. This information is available through the \fBlocaleconv\fR(3C) function. @@ -1795,7 +1784,6 @@ _ The entry \fBn/a\fR indicates that the value is not available in the POSIX locale. .SS "LC_TIME" -.LP The \fBLC_TIME\fR category defines the interpretation of the field descriptors supported by \fBdate\fR(1) and affects the behavior of the \fBstrftime\fR(3C), \fBwcsftime\fR(3C), \fBstrptime\fR(3C), and \fBnl_langinfo\fR(3C) functions. @@ -2062,7 +2050,6 @@ field descriptor will be used instead of the value. .RE .SS "LC_TIME \fIC-language\fR Access" -.LP The following information can be accessed. These correspond to constants defined in <\fBlanginfo.h\fR> and used as arguments to the \fBnl_langinfo\fR(3C) function. @@ -2324,7 +2311,6 @@ _ .TE .SS "LC_TIME \fIGeneral\fR Information" -.LP Although certain of the field descriptors in the POSIX locale (such as the name of the month) are shown with initial capital letters, this need not be the case in other locales. Programs using these fields may need to adjust the @@ -2362,7 +2348,6 @@ in 1776" while 7/14/1789 would come out as "The 14 day of July in 1789" The above example is for illustrative purposes only. The \fB%O\fR modifier is primarily intended to provide for Kanji or Hindi digits in \fBdate\fR formats. .SS "LC_MESSAGES" -.LP The \fBLC_MESSAGES\fR category defines the format and values for affirmative and negative responses. .sp @@ -2455,7 +2440,6 @@ l | l | l . In an application conforming to the SUSv3 standard, the information on \fByesstr\fR and \fBnostr\fR is not available. .SH SEE ALSO -.LP \fBdate\fR(1), \fBlocale\fR(1), \fBlocaledef\fR(1), \fBsort\fR(1), \fBtr\fR(1), \fBuniq\fR(1), \fBlocaleconv\fR(3C), \fBnl_langinfo\fR(3C), \fBsetlocale\fR(3C), \fBstrcoll\fR(3C), \fBstrftime\fR(3C), \fBstrptime\fR(3C), -- cgit v1.2.3