author     Patrick Mooney <pmooney@pfmooney.com>    2017-10-10 12:37:29 +0200
committer  Patrick Mooney <pmooney@pfmooney.com>    2020-05-15 18:37:56 +0000
commit     4c87aefe8930bd07275b8dd2e96ea5f24d93a52e
tree       8763eb97971828355c999f954f5fd7b14218dfed
parent     a68aefedafdc78f1f25e9c888f61357d59c87311
download   illumos-joyent-4c87aefe8930bd07275b8dd2e96ea5f24d93a52e.tar.gz
12665 want modern bhyve
Portions contributed by: Hans Rosenfeld <hans.rosenfeld@joyent.com>
Portions contributed by: John Levon <john.levon@joyent.com>
Portions contributed by: Mike Gerdts <mike.gerdts@joyent.com>
Portions contributed by: Andy Fiddaman <omnios@citrus-it.co.uk>
Portions contributed by: Dominik Hassler <hadfl@omniosce.org>
Portions contributed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Portions contributed by: Robert Mustacchi <rm@joyent.com>
Portions contributed by: Mike Zeller <mike.zeller@joyent.com>
Reviewed by: Andy Fiddaman <omnios@citrus-it.co.uk>
Approved by: Dan McDonald <danmcd@joyent.com>
297 files changed, 56133 insertions(+), 8958 deletions(-)
diff --git a/exception_lists/check_rtime b/exception_lists/check_rtime index 01bb189dca..42964957d4 100644 --- a/exception_lists/check_rtime +++ b/exception_lists/check_rtime @@ -24,6 +24,7 @@ # Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. # Copyright 2018 OmniOS Community Edition (OmniOSce) Association. # Copyright 2019 Peter Tribble. +# Copyright 2018 Joyent, Inc. # Copyright 2020 Oxide Computer Company # @@ -83,6 +84,7 @@ EXEC_STACK ^opt/os-tests/tests/secflags/stacky$ # Objects for which we allow relocations to the text segment TEXTREL ^platform/.*/MACH(kernel)/unix$ +TEXTREL ^usr/sbin/amd64/bhyve$ # Directories and files that are allowed to have no direct bound symbols NODIRECT ^platform/.*/MACH(kernel)/unix$ diff --git a/exception_lists/copyright b/exception_lists/copyright index 647bc46b60..c62835e304 100644 --- a/exception_lists/copyright +++ b/exception_lists/copyright @@ -466,3 +466,87 @@ usr/src/uts/common/sys/THIRDPARTYLICENSE.firmload usr/src/uts/common/sys/THIRDPARTYLICENSE.firmload.descrip usr/src/uts/common/sys/scsi/adapters/mpt_sas/mpi/* usr/src/uts/sparc/nsmb/ioc_check.ref + +# bhyve sources +usr/src/cmd/bhyve/acpi.[ch] +usr/src/cmd/bhyve/ahci.h +usr/src/cmd/bhyve/atkbdc.[ch] +usr/src/cmd/bhyve/bhyvegc.[ch] +usr/src/cmd/bhyve/bhyverun.[ch] +usr/src/cmd/bhyve/block_if.[ch] +usr/src/cmd/bhyve/bootrom.[ch] +usr/src/cmd/bhyve/console.[ch] +usr/src/cmd/bhyve/consport.c +usr/src/cmd/bhyve/dbgport.[ch] +usr/src/cmd/bhyve/fwctl.[ch] +usr/src/cmd/bhyve/gdb.[ch] +usr/src/cmd/bhyve/inout.[ch] +usr/src/cmd/bhyve/ioapic.[ch] +usr/src/cmd/bhyve/mem.[ch] +usr/src/cmd/bhyve/mevent.[ch] +usr/src/cmd/bhyve/mevent_test.c +usr/src/cmd/bhyve/mptbl.[ch] +usr/src/cmd/bhyve/pci_ahci.c +usr/src/cmd/bhyve/pci_e82545.c +usr/src/cmd/bhyve/pci_emul.[ch] +usr/src/cmd/bhyve/pci_fbuf.c +usr/src/cmd/bhyve/pci_hostbridge.c +usr/src/cmd/bhyve/pci_irq.[ch] +usr/src/cmd/bhyve/pci_lpc.[ch] +usr/src/cmd/bhyve/pci_nvme.c +usr/src/cmd/bhyve/pci_passthru.c +usr/src/cmd/bhyve/pci_uart.c +usr/src/cmd/bhyve/pci_virtio_block.c +usr/src/cmd/bhyve/pci_virtio_net.c +usr/src/cmd/bhyve/pci_virtio_rnd.c +usr/src/cmd/bhyve/pci_virtio_scsi.c +usr/src/cmd/bhyve/pci_xhci.[ch] +usr/src/cmd/bhyve/pm.c +usr/src/cmd/bhyve/pmtmr.c +usr/src/cmd/bhyve/post.c +usr/src/cmd/bhyve/ps2kbd.[ch] +usr/src/cmd/bhyve/ps2mouse.[ch] +usr/src/cmd/bhyve/rfb.[ch] +usr/src/cmd/bhyve/rtc.[ch] +usr/src/cmd/bhyve/smbiostbl.[ch] +usr/src/cmd/bhyve/sockstream.[ch] +usr/src/cmd/bhyve/spinup_ap.[ch] +usr/src/cmd/bhyve/task_switch.c +usr/src/cmd/bhyve/uart_emul.[ch] +usr/src/cmd/bhyve/usb_emul.[ch] +usr/src/cmd/bhyve/usb_mouse.c +usr/src/cmd/bhyve/vga.[ch] +usr/src/cmd/bhyve/virtio.[ch] +usr/src/cmd/bhyve/xmsr.[ch] +usr/src/cmd/bhyvectl/bhyvectl.c +usr/src/compat/freebsd/*.h +usr/src/compat/freebsd/*/*.h +usr/src/compat/freebsd/amd64/machine/*.h +usr/contrib/freebsd/*/*.h +usr/contrib/freebsd/*/*/*.h +usr/contrib/freebsd/lib/libutil/*.c +usr/src/lib/libvmmapi/common/vmmapi.[ch] +usr/src/tools/scripts/gensetdefs.pl +usr/src/uts/i86pc/io/vmm/amd/*.[chs] +usr/src/uts/i86pc/io/vmm/intel/*.[chs] +usr/src/uts/i86pc/io/vmm/intel/offsets.in +usr/src/uts/i86pc/io/vmm/io/*.[ch] +usr/src/uts/i86pc/io/vmm/README.sync +usr/src/uts/i86pc/io/vmm/vmm.c +usr/src/uts/i86pc/io/vmm/vmm_host.[ch] +usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +usr/src/uts/i86pc/io/vmm/vmm_ioport.[ch] +usr/src/uts/i86pc/io/vmm/vmm_ipi.h +usr/src/uts/i86pc/io/vmm/vmm_ktr.h +usr/src/uts/i86pc/io/vmm/vmm_lapic.[ch] +usr/src/uts/i86pc/io/vmm/vmm_mem.[ch] 
+usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c +usr/src/uts/i86pc/io/vmm/vmm_stat.[ch] +usr/src/uts/i86pc/io/vmm/vmm_util.[ch] +usr/src/uts/i86pc/io/vmm/vmx_assym.s +usr/src/uts/i86pc/io/vmm/x86.[ch] +usr/src/uts/i86pc/sys/vmm.h +usr/src/uts/i86pc/sys/vmm_dev.h +usr/src/uts/i86pc/sys/vmm_instruction_emul.h diff --git a/exception_lists/cstyle b/exception_lists/cstyle index d320dcfacc..73edc10e88 100644 --- a/exception_lists/cstyle +++ b/exception_lists/cstyle @@ -1326,3 +1326,86 @@ usr/src/uts/intel/sys/acpi/platform/acos2.h usr/src/uts/intel/sys/acpi/platform/acsolaris.h usr/src/uts/intel/sys/acpi/platform/acwin.h usr/src/uts/intel/sys/acpi/platform/acwin64.h + +# bhyve sources +syntax: glob +usr/src/cmd/bhyve/acpi.[ch] +usr/src/cmd/bhyve/ahci.h +usr/src/cmd/bhyve/atkbdc.[ch] +usr/src/cmd/bhyve/bhyvegc.[ch] +usr/src/cmd/bhyve/bhyverun.[ch] +usr/src/cmd/bhyve/block_if.[ch] +usr/src/cmd/bhyve/bootrom.[ch] +usr/src/cmd/bhyve/console.[ch] +usr/src/cmd/bhyve/consport.c +usr/src/cmd/bhyve/dbgport.[ch] +usr/src/cmd/bhyve/fwctl.[ch] +usr/src/cmd/bhyve/gdb.[ch] +usr/src/cmd/bhyve/inout.[ch] +usr/src/cmd/bhyve/ioapic.[ch] +usr/src/cmd/bhyve/iov.[ch] +usr/src/cmd/bhyve/mem.[ch] +usr/src/cmd/bhyve/mevent.[ch] +usr/src/cmd/bhyve/mevent_test.c +usr/src/cmd/bhyve/mptbl.[ch] +usr/src/cmd/bhyve/pci_ahci.c +usr/src/cmd/bhyve/pci_e82545.c +usr/src/cmd/bhyve/pci_emul.[ch] +usr/src/cmd/bhyve/pci_fbuf.c +usr/src/cmd/bhyve/pci_hostbridge.c +usr/src/cmd/bhyve/pci_irq.[ch] +usr/src/cmd/bhyve/pci_lpc.[ch] +usr/src/cmd/bhyve/pci_nvme.c +usr/src/cmd/bhyve/pci_passthru.c +usr/src/cmd/bhyve/pci_uart.c +usr/src/cmd/bhyve/pci_virtio_block.c +usr/src/cmd/bhyve/pci_virtio_console.c +usr/src/cmd/bhyve/pci_virtio_net.c +usr/src/cmd/bhyve/pci_virtio_rnd.c +usr/src/cmd/bhyve/pci_virtio_scsi.c +usr/src/cmd/bhyve/pci_xhci.[ch] +usr/src/cmd/bhyve/pm.c +usr/src/cmd/bhyve/pmtmr.c +usr/src/cmd/bhyve/post.c +usr/src/cmd/bhyve/ps2kbd.[ch] +usr/src/cmd/bhyve/ps2mouse.[ch] +usr/src/cmd/bhyve/rfb.[ch] +usr/src/cmd/bhyve/rtc.[ch] +usr/src/cmd/bhyve/smbiostbl.[ch] +usr/src/cmd/bhyve/sockstream.[ch] +usr/src/cmd/bhyve/spinup_ap.[ch] +usr/src/cmd/bhyve/task_switch.c +usr/src/cmd/bhyve/uart_emul.[ch] +usr/src/cmd/bhyve/usb_emul.[ch] +usr/src/cmd/bhyve/usb_mouse.c +usr/src/cmd/bhyve/vga.[ch] +usr/src/cmd/bhyve/virtio.[ch] +usr/src/cmd/bhyve/xmsr.[ch] +usr/src/cmd/bhyveconsole/bhyveconsole.c +usr/src/cmd/bhyvectl/bhyvectl.c +usr/src/compat/freebsd/*.h +usr/src/compat/freebsd/*/*.h +usr/src/compat/freebsd/amd64/machine/*.h +usr/contrib/freebsd/*/*.h +usr/contrib/freebsd/*/*/*.h +usr/contrib/freebsd/lib/libutil/*.c +usr/src/lib/libvmmapi/common/vmmapi.[ch] +usr/src/uts/i86pc/io/vmm/amd/*.[ch] +usr/src/uts/i86pc/io/vmm/intel/*.[chs] +usr/src/uts/i86pc/io/vmm/io/*.[ch] +usr/src/uts/i86pc/io/vmm/vmm.c +usr/src/uts/i86pc/io/vmm/vmm_host.[ch] +usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +usr/src/uts/i86pc/io/vmm/vmm_ioport.[ch] +usr/src/uts/i86pc/io/vmm/vmm_ipi.h +usr/src/uts/i86pc/io/vmm/vmm_ktr.h +usr/src/uts/i86pc/io/vmm/vmm_lapic.[ch] +usr/src/uts/i86pc/io/vmm/vmm_mem.[ch] +usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +usr/src/uts/i86pc/io/vmm/vmm_stat.[ch] +usr/src/uts/i86pc/io/vmm/vmm_util.[ch] +usr/src/uts/i86pc/io/vmm/vmx_assym.s +usr/src/uts/i86pc/io/vmm/x86.[ch] +usr/src/uts/i86pc/sys/vmm.h +usr/src/uts/i86pc/sys/vmm_dev.h +usr/src/uts/i86pc/sys/vmm_instruction_emul.h diff --git a/exception_lists/hdrchk b/exception_lists/hdrchk index c8edb3e5ae..7fa467f735 100644 --- 
a/exception_lists/hdrchk +++ b/exception_lists/hdrchk @@ -374,3 +374,53 @@ usr/src/uts/intel/sys/acpi/acresrc.h usr/src/uts/intel/sys/acpi/acstruct.h usr/src/uts/intel/sys/acpi/amlresrc.h usr/src/uts/intel/sys/acpi/platform/acwin64.h + +# bhyve sources +syntax: glob +usr/src/cmd/bhyve/acpi.h +usr/src/cmd/bhyve/ahci.h +usr/src/cmd/bhyve/atkbdc.h +usr/src/cmd/bhyve/bhyvegc.h +usr/src/cmd/bhyve/bhyverun.h +usr/src/cmd/bhyve/block_if.h +usr/src/cmd/bhyve/bootrom.h +usr/src/cmd/bhyve/console.h +usr/src/cmd/bhyve/dbgport.h +usr/src/cmd/bhyve/inout.h +usr/src/cmd/bhyve/ioapic.h +usr/src/cmd/bhyve/mem.h +usr/src/cmd/bhyve/mptbl.h +usr/src/cmd/bhyve/pci_emul.h +usr/src/cmd/bhyve/pci_irq.h +usr/src/cmd/bhyve/pci_lpc.h +usr/src/cmd/bhyve/ps2kbd.h +usr/src/cmd/bhyve/ps2mouse.h +usr/src/cmd/bhyve/rfb.h +usr/src/cmd/bhyve/rtc.h +usr/src/cmd/bhyve/smbiostbl.h +usr/src/cmd/bhyve/sockstream.h +usr/src/cmd/bhyve/spinup_ap.h +usr/src/cmd/bhyve/uart_emul.h +usr/src/cmd/bhyve/vga.h +usr/src/cmd/bhyve/virtio.h +usr/src/cmd/bhyve/xmsr.h +usr/src/compat/freebsd/*.h +usr/src/compat/freebsd/*/*.h +usr/src/compat/freebsd/amd64/machine/*.h +usr/contrib/freebsd/*/*.h +usr/contrib/freebsd/*/*/*.h +usr/src/lib/libvmmapi/common/vmmapi.h +usr/src/uts/i86pc/io/vmm/intel/*.h +usr/src/uts/i86pc/io/vmm/io/*.h +usr/src/uts/i86pc/io/vmm/vmm_host.h +usr/src/uts/i86pc/io/vmm/vmm_ioport.h +usr/src/uts/i86pc/io/vmm/vmm_ipi.h +usr/src/uts/i86pc/io/vmm/vmm_ktr.h +usr/src/uts/i86pc/io/vmm/vmm_lapic.h +usr/src/uts/i86pc/io/vmm/vmm_mem.h +usr/src/uts/i86pc/io/vmm/vmm_stat.h +usr/src/uts/i86pc/io/vmm/vmm_util.h +usr/src/uts/i86pc/io/vmm/x86.h +usr/src/uts/i86pc/sys/vmm.h +usr/src/uts/i86pc/sys/vmm_dev.h +usr/src/uts/i86pc/sys/vmm_instruction_emul.h diff --git a/exception_lists/packaging b/exception_lists/packaging index 41ca551cc2..cd1e8ed230 100644 --- a/exception_lists/packaging +++ b/exception_lists/packaging @@ -817,6 +817,12 @@ usr/lib/sparcv9/libsff.so sparc usr/lib/libsff.so # +# private bhyve files +# +lib/amd64/libvmmapi.so i386 +usr/include/vmmapi.h i386 + +# # libcustr is private # usr/include/libcustr.h diff --git a/exception_lists/wscheck b/exception_lists/wscheck index 489668a350..ac16cc54b2 100644 --- a/exception_lists/wscheck +++ b/exception_lists/wscheck @@ -26,3 +26,84 @@ usr/src/uts/common/io/e1000api/* usr/src/uts/common/io/qede/* usr/src/uts/common/io/i40e/core/* usr/src/uts/common/io/ixgbe/core/* + +# bhyve sources +usr/src/cmd/bhyve/acpi.[ch] +usr/src/cmd/bhyve/ahci.h +usr/src/cmd/bhyve/atkbdc.[ch] +usr/src/cmd/bhyve/bhyvegc.[ch] +usr/src/cmd/bhyve/bhyverun.[ch] +usr/src/cmd/bhyve/block_if.[ch] +usr/src/cmd/bhyve/bootrom.[ch] +usr/src/cmd/bhyve/console.[ch] +usr/src/cmd/bhyve/consport.c +usr/src/cmd/bhyve/dbgport.[ch] +usr/src/cmd/bhyve/fwctl.[ch] +usr/src/cmd/bhyve/gdb.[ch] +usr/src/cmd/bhyve/inout.[ch] +usr/src/cmd/bhyve/ioapic.[ch] +usr/src/cmd/bhyve/mem.[ch] +usr/src/cmd/bhyve/mevent.[ch] +usr/src/cmd/bhyve/mevent_test.c +usr/src/cmd/bhyve/mptbl.[ch] +usr/src/cmd/bhyve/pci_ahci.c +usr/src/cmd/bhyve/pci_e82545.c +usr/src/cmd/bhyve/pci_emul.[ch] +usr/src/cmd/bhyve/pci_fbuf.c +usr/src/cmd/bhyve/pci_hostbridge.c +usr/src/cmd/bhyve/pci_irq.[ch] +usr/src/cmd/bhyve/pci_lpc.[ch] +usr/src/cmd/bhyve/pci_nvme.c +usr/src/cmd/bhyve/pci_passthru.c +usr/src/cmd/bhyve/pci_uart.c +usr/src/cmd/bhyve/pci_virtio_block.c +usr/src/cmd/bhyve/pci_virtio_console.c +usr/src/cmd/bhyve/pci_virtio_net.c +usr/src/cmd/bhyve/pci_virtio_rnd.c +usr/src/cmd/bhyve/pci_virtio_scsi.c +usr/src/cmd/bhyve/pci_xhci.[ch] +usr/src/cmd/bhyve/pm.c 
+usr/src/cmd/bhyve/pmtmr.c +usr/src/cmd/bhyve/post.c +usr/src/cmd/bhyve/ps2kbd.[ch] +usr/src/cmd/bhyve/ps2mouse.[ch] +usr/src/cmd/bhyve/rfb.[ch] +usr/src/cmd/bhyve/rtc.[ch] +usr/src/cmd/bhyve/smbiostbl.[ch] +usr/src/cmd/bhyve/sockstream.[ch] +usr/src/cmd/bhyve/spinup_ap.[ch] +usr/src/cmd/bhyve/task_switch.c +usr/src/cmd/bhyve/uart_emul.[ch] +usr/src/cmd/bhyve/usb_emul.[ch] +usr/src/cmd/bhyve/usb_mouse.c +usr/src/cmd/bhyve/vga.[ch] +usr/src/cmd/bhyve/virtio.[ch] +usr/src/cmd/bhyve/xmsr.[ch] +usr/src/cmd/bhyveconsole/bhyveconsole.c +usr/src/cmd/bhyvectl/bhyvectl.c +usr/src/compat/freebsd/*.h +usr/src/compat/freebsd/*/*.h +usr/src/compat/freebsd/amd64/machine/*.h +usr/contrib/freebsd/*/*.h +usr/contrib/freebsd/*/*/*.h +usr/contrib/freebsd/lib/libutil/*.c +usr/src/lib/libvmmapi/common/vmmapi.[ch] +usr/src/uts/i86pc/io/vmm/amd/*.[ch] +usr/src/uts/i86pc/io/vmm/intel/*.[chs] +usr/src/uts/i86pc/io/vmm/io/*.[ch] +usr/src/uts/i86pc/io/vmm/vmm.c +usr/src/uts/i86pc/io/vmm/vmm_host.[ch] +usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +usr/src/uts/i86pc/io/vmm/vmm_ioport.[ch] +usr/src/uts/i86pc/io/vmm/vmm_ipi.h +usr/src/uts/i86pc/io/vmm/vmm_ktr.h +usr/src/uts/i86pc/io/vmm/vmm_lapic.[ch] +usr/src/uts/i86pc/io/vmm/vmm_mem.[ch] +usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +usr/src/uts/i86pc/io/vmm/vmm_stat.[ch] +usr/src/uts/i86pc/io/vmm/vmm_util.[ch] +usr/src/uts/i86pc/io/vmm/vmx_assym.s +usr/src/uts/i86pc/io/vmm/x86.[ch] +usr/src/uts/i86pc/sys/vmm.h +usr/src/uts/i86pc/sys/vmm_dev.h +usr/src/uts/i86pc/sys/vmm_instruction_emul.h diff --git a/usr/contrib/freebsd/amd64/machine/pmap.h b/usr/contrib/freebsd/amd64/machine/pmap.h new file mode 100644 index 0000000000..a0b8ee37f2 --- /dev/null +++ b/usr/contrib/freebsd/amd64/machine/pmap.h @@ -0,0 +1,455 @@ +/*- + * Copyright (c) 2003 Peter Wemm. + * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Derived from hp300 version by Mike Hibler, this version by William + * Jolitz uses a recursive map [a pde points to the page directory] to + * map the page tables using the pagetables themselves. This is done to + * reduce the impact on kernel virtual memory for lots of sparse address + * space, and to reduce the cost of memory to each process. + * + * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 + * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 + * $FreeBSD$ + */ + +#ifndef _MACHINE_PMAP_H_ +#define _MACHINE_PMAP_H_ + +/* + * Page-directory and page-table entries follow this format, with a few + * of the fields not present here and there, depending on a lot of things. + */ + /* ---- Intel Nomenclature ---- */ +#define X86_PG_V 0x001 /* P Valid */ +#define X86_PG_RW 0x002 /* R/W Read/Write */ +#define X86_PG_U 0x004 /* U/S User/Supervisor */ +#define X86_PG_NC_PWT 0x008 /* PWT Write through */ +#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */ +#define X86_PG_A 0x020 /* A Accessed */ +#define X86_PG_M 0x040 /* D Dirty */ +#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ +#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */ +#define X86_PG_G 0x100 /* G Global */ +#define X86_PG_AVAIL1 0x200 /* / Available for system */ +#define X86_PG_AVAIL2 0x400 /* < programmers use */ +#define X86_PG_AVAIL3 0x800 /* \ */ +#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */ +#define X86_PG_NX (1ul<<63) /* No-execute */ +#define X86_PG_AVAIL(x) (1ul << (x)) + +/* Page level cache control fields used to determine the PAT type */ +#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) +#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) + +/* + * Intel extended page table (EPT) bit definitions. + */ +#define EPT_PG_READ 0x001 /* R Read */ +#define EPT_PG_WRITE 0x002 /* W Write */ +#define EPT_PG_EXECUTE 0x004 /* X Execute */ +#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */ +#define EPT_PG_PS 0x080 /* PS Page size */ +#define EPT_PG_A 0x100 /* A Accessed */ +#define EPT_PG_M 0x200 /* D Dirty */ +#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */ + +/* + * Define the PG_xx macros in terms of the bits on x86 PTEs. 
+ */ +#define PG_V X86_PG_V +#define PG_RW X86_PG_RW +#define PG_U X86_PG_U +#define PG_NC_PWT X86_PG_NC_PWT +#define PG_NC_PCD X86_PG_NC_PCD +#define PG_A X86_PG_A +#define PG_M X86_PG_M +#define PG_PS X86_PG_PS +#define PG_PTE_PAT X86_PG_PTE_PAT +#define PG_G X86_PG_G +#define PG_AVAIL1 X86_PG_AVAIL1 +#define PG_AVAIL2 X86_PG_AVAIL2 +#define PG_AVAIL3 X86_PG_AVAIL3 +#define PG_PDE_PAT X86_PG_PDE_PAT +#define PG_NX X86_PG_NX +#define PG_PDE_CACHE X86_PG_PDE_CACHE +#define PG_PTE_CACHE X86_PG_PTE_CACHE + +/* Our various interpretations of the above */ +#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */ +#define PG_MANAGED X86_PG_AVAIL2 +#define EPT_PG_EMUL_V X86_PG_AVAIL(52) +#define EPT_PG_EMUL_RW X86_PG_AVAIL(53) +#define PG_PROMOTED X86_PG_AVAIL(54) /* PDE only */ +#define PG_FRAME (0x000ffffffffff000ul) +#define PG_PS_FRAME (0x000fffffffe00000ul) + +/* + * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB + * (PTE) page mappings have identical settings for the following fields: + */ +#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \ + PG_M | PG_A | PG_U | PG_RW | PG_V) + +/* + * Page Protection Exception bits + */ + +#define PGEX_P 0x01 /* Protection violation vs. not present */ +#define PGEX_W 0x02 /* during a Write cycle */ +#define PGEX_U 0x04 /* access from User mode (UPL) */ +#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */ +#define PGEX_I 0x10 /* during an instruction fetch */ + +/* + * undef the PG_xx macros that define bits in the regular x86 PTEs that + * have a different position in nested PTEs. This is done when compiling + * code that needs to be aware of the differences between regular x86 and + * nested PTEs. + * + * The appropriate bitmask will be calculated at runtime based on the pmap + * type. + */ +#ifdef AMD64_NPT_AWARE +#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */ +#undef PG_G +#undef PG_A +#undef PG_M +#undef PG_PDE_PAT +#undef PG_PDE_CACHE +#undef PG_PTE_PAT +#undef PG_PTE_CACHE +#undef PG_RW +#undef PG_V +#endif + +/* + * Pte related macros. This is complicated by having to deal with + * the sign extension of the 48th bit. + */ +#define KVADDR(l4, l3, l2, l1) ( \ + ((unsigned long)-1 << 47) | \ + ((unsigned long)(l4) << PML4SHIFT) | \ + ((unsigned long)(l3) << PDPSHIFT) | \ + ((unsigned long)(l2) << PDRSHIFT) | \ + ((unsigned long)(l1) << PAGE_SHIFT)) + +#define UVADDR(l4, l3, l2, l1) ( \ + ((unsigned long)(l4) << PML4SHIFT) | \ + ((unsigned long)(l3) << PDPSHIFT) | \ + ((unsigned long)(l2) << PDRSHIFT) | \ + ((unsigned long)(l1) << PAGE_SHIFT)) + +/* + * Number of kernel PML4 slots. Can be anywhere from 1 to 64 or so, + * but setting it larger than NDMPML4E makes no sense. + * + * Each slot provides .5 TB of kernel virtual space. + */ +#define NKPML4E 4 + +#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */ +#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */ +#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */ + +/* + * NDMPML4E is the maximum number of PML4 entries that will be + * used to implement the direct map. It must be a power of two, + * and should generally exceed NKPML4E. The maximum possible + * value is 64; using 128 will make the direct map intrude into + * the recursive page table map. + */ +#define NDMPML4E 8 + +/* + * These values control the layout of virtual memory. The starting address + * of the direct map, which is controlled by DMPML4I, must be a multiple of + * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.) 
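
/*
 * Illustrative sketch (not part of this change): how the PG_* bits defined
 * above are conventionally combined and tested.  The helper names are
 * hypothetical; only macros from this header are assumed.
 */
static __inline int
pte_is_writable(u_int64_t pte)
{
	/* A mapping is usable for writes only when both Valid and R/W are set. */
	return ((pte & (PG_V | PG_RW)) == (PG_V | PG_RW));
}

static __inline u_int64_t
pte_frame(u_int64_t pte)
{
	/* Mask off the flag bits to recover the 4 KiB-aligned frame address. */
	return (pte & PG_FRAME);
}
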
+ * + * Note: KPML4I is the index of the (single) level 4 page that maps + * the KVA that holds KERNBASE, while KPML4BASE is the index of the + * first level 4 page that maps VM_MIN_KERNEL_ADDRESS. If NKPML4E + * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra + * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to + * KERNBASE. + * + * (KPML4I combines with KPDPI to choose where KERNBASE starts. + * Or, in other words, KPML4I provides bits 39..47 of KERNBASE, + * and KPDPI provides bits 30..38.) + */ +#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */ + +#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */ +#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */ + +#define KPML4I (NPML4EPG-1) +#define KPDPI (NPDPEPG-2) /* kernbase at -2GB */ + +/* + * XXX doesn't really belong here I guess... + */ +#define ISA_HOLE_START 0xa0000 +#define ISA_HOLE_LENGTH (0x100000-ISA_HOLE_START) + +#define PMAP_PCID_NONE 0xffffffff +#define PMAP_PCID_KERN 0 +#define PMAP_PCID_OVERMAX 0x1000 + +#ifndef LOCORE + +#include <sys/queue.h> +#include <sys/_cpuset.h> +#include <sys/_lock.h> +#include <sys/_mutex.h> + +#include <vm/_vm_radix.h> + +typedef u_int64_t pd_entry_t; +typedef u_int64_t pt_entry_t; +typedef u_int64_t pdp_entry_t; +typedef u_int64_t pml4_entry_t; + +/* + * Address of current address space page table maps and directories. + */ +#ifdef _KERNEL +#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0)) +#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0)) +#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0)) +#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)) +#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t))) +#define PTmap ((pt_entry_t *)(addr_PTmap)) +#define PDmap ((pd_entry_t *)(addr_PDmap)) +#define PDPmap ((pd_entry_t *)(addr_PDPmap)) +#define PML4map ((pd_entry_t *)(addr_PML4map)) +#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e)) + +extern int nkpt; /* Initial number of kernel page tables */ +extern u_int64_t KPDPphys; /* physical address of kernel level 3 */ +extern u_int64_t KPML4phys; /* physical address of kernel level 4 */ + +/* + * virtual address to page table entry and + * to physical address. + * Note: these work recursively, thus vtopte of a pte will give + * the corresponding pde that in turn maps it. + */ +pt_entry_t *vtopte(vm_offset_t); +#define vtophys(va) pmap_kextract(((vm_offset_t) (va))) + +#define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) +#define pte_load_clear(ptep) atomic_swap_long(ptep, 0) +#define pte_store(ptep, pte) do { \ + *(u_long *)(ptep) = (u_long)(pte); \ +} while (0) +#define pte_clear(ptep) pte_store(ptep, 0) + +#define pde_store(pdep, pde) pte_store(pdep, pde) + +extern pt_entry_t pg_nx; + +#endif /* _KERNEL */ + +/* + * Pmap stuff + */ +struct pv_entry; +struct pv_chunk; + +/* + * Locks + * (p) PV list lock + */ +struct md_page { + TAILQ_HEAD(, pv_entry) pv_list; /* (p) */ + int pv_gen; /* (p) */ + int pat_mode; +}; + +enum pmap_type { + PT_X86, /* regular x86 page tables */ + PT_EPT, /* Intel's nested page tables */ + PT_RVI, /* AMD's nested page tables */ +}; + +struct pmap_pcids { + uint32_t pm_pcid; + uint32_t pm_gen; +}; + +/* + * The kernel virtual address (KVA) of the level 4 page table page is always + * within the direct map (DMAP) region. 
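
/*
 * Illustrative sketch (not part of this change): how vtopte(), declared
 * above, is conventionally built on the recursive PTmap window.  The
 * NP*PGSHIFT and PAGE_SHIFT constants are assumed from <machine/param.h>.
 */
pt_entry_t *
vtopte(vm_offset_t va)
{
	u_int64_t mask = ((1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
	    NPML4EPGSHIFT)) - 1);

	/* The recursive PML4 slot makes each PTE addressable through PTmap. */
	return (PTmap + ((va >> PAGE_SHIFT) & mask));
}
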
+ */ +struct pmap { + struct mtx pm_mtx; + pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ + uint64_t pm_cr3; + TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ + cpuset_t pm_active; /* active on cpus */ + enum pmap_type pm_type; /* regular or nested tables */ + struct pmap_statistics pm_stats; /* pmap statistics */ + struct vm_radix pm_root; /* spare page table pages */ + long pm_eptgen; /* EPT pmap generation id */ + int pm_flags; + struct pmap_pcids pm_pcids[MAXCPU]; +}; + +/* flags */ +#define PMAP_NESTED_IPIMASK 0xff +#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ +#define PMAP_EMULATE_AD_BITS (1 << 9) /* needs A/D bits emulation */ +#define PMAP_SUPPORTS_EXEC_ONLY (1 << 10) /* execute only mappings ok */ + +typedef struct pmap *pmap_t; + +#ifdef _KERNEL +extern struct pmap kernel_pmap_store; +#define kernel_pmap (&kernel_pmap_store) + +#define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx) +#define PMAP_LOCK_ASSERT(pmap, type) \ + mtx_assert(&(pmap)->pm_mtx, (type)) +#define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) +#define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \ + NULL, MTX_DEF | MTX_DUPOK) +#define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) +#define PMAP_MTX(pmap) (&(pmap)->pm_mtx) +#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) +#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) + +int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags); +int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); +#endif + +/* + * For each vm_page_t, there is a list of all currently valid virtual + * mappings of that page. An entry is a pv_entry_t, the list is pv_list. + */ +typedef struct pv_entry { + vm_offset_t pv_va; /* virtual address for mapping */ + TAILQ_ENTRY(pv_entry) pv_next; +} *pv_entry_t; + +/* + * pv_entries are allocated in chunks per-process. This avoids the + * need to track per-pmap assignments. 
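
/*
 * Illustrative sketch (not part of this change): the PMAP_LOCK macros above
 * bracket any inspection or mutation of a pmap.  pmap_touch() is a
 * hypothetical caller; MA_OWNED comes from <sys/lock.h>.
 */
static void
pmap_touch(pmap_t pmap)
{
	PMAP_LOCK(pmap);
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	/* ... read or update pm_stats, pm_pvchunk, etc. while held ... */
	PMAP_UNLOCK(pmap);
}
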
+ */ +#define _NPCM 3 +#define _NPCPV 168 +struct pv_chunk { + pmap_t pc_pmap; + TAILQ_ENTRY(pv_chunk) pc_list; + uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ + TAILQ_ENTRY(pv_chunk) pc_lru; + struct pv_entry pc_pventry[_NPCPV]; +}; + +#ifdef _KERNEL + +extern caddr_t CADDR1; +extern pt_entry_t *CMAP1; +extern vm_paddr_t phys_avail[]; +extern vm_paddr_t dump_avail[]; +extern vm_offset_t virtual_avail; +extern vm_offset_t virtual_end; +extern vm_paddr_t dmaplimit; +extern int pmap_pcid_enabled; +extern int invpcid_works; + +#define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) +#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) +#define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) + +struct thread; + +void pmap_activate_sw(struct thread *); +void pmap_bootstrap(vm_paddr_t *); +int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde); +int pmap_change_attr(vm_offset_t, vm_size_t, int); +void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate); +void pmap_init_pat(void); +void pmap_kenter(vm_offset_t va, vm_paddr_t pa); +void *pmap_kenter_temporary(vm_paddr_t pa, int i); +vm_paddr_t pmap_kextract(vm_offset_t); +void pmap_kremove(vm_offset_t); +void *pmap_mapbios(vm_paddr_t, vm_size_t); +void *pmap_mapdev(vm_paddr_t, vm_size_t); +void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); +boolean_t pmap_page_is_mapped(vm_page_t m); +void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma); +void pmap_pinit_pml4(vm_page_t); +void pmap_unmapdev(vm_offset_t, vm_size_t); +void pmap_invalidate_page(pmap_t, vm_offset_t); +void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); +void pmap_invalidate_all(pmap_t); +void pmap_invalidate_cache(void); +void pmap_invalidate_cache_pages(vm_page_t *pages, int count); +void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, + boolean_t force); +void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); +boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +#endif /* _KERNEL */ + +/* Return various clipped indexes for a given VA */ +static __inline vm_pindex_t +pmap_pte_index(vm_offset_t va) +{ + + return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pde_index(vm_offset_t va) +{ + + return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pdpe_index(vm_offset_t va) +{ + + return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pml4e_index(vm_offset_t va) +{ + + return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); +} + +#endif /* !LOCORE */ + +#endif /* !_MACHINE_PMAP_H_ */ diff --git a/usr/contrib/freebsd/amd64/machine/specialreg.h b/usr/contrib/freebsd/amd64/machine/specialreg.h deleted file mode 100644 index 41d4125cb9..0000000000 --- a/usr/contrib/freebsd/amd64/machine/specialreg.h +++ /dev/null @@ -1,6 +0,0 @@ -/*- - * This file is in the public domain. - */ -/* $FreeBSD: head/sys/amd64/include/specialreg.h 233207 2012-03-19 21:34:11Z tijl $ */ - -#include <x86/specialreg.h> diff --git a/usr/contrib/freebsd/dev/io/iodev.h b/usr/contrib/freebsd/dev/io/iodev.h new file mode 100644 index 0000000000..d040fcccf4 --- /dev/null +++ b/usr/contrib/freebsd/dev/io/iodev.h @@ -0,0 +1,44 @@ +/*- + * Copyright (c) 2010 Marcel Moolenaar + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _DEV_IODEV_H_ +#define _DEV_IODEV_H_ + +#define IODEV_PIO_READ 0 +#define IODEV_PIO_WRITE 1 + +struct iodev_pio_req { + u_int access; + u_int port; + u_int width; + u_int val; +}; + +#define IODEV_PIO _IOWR('I', 0, struct iodev_pio_req) + +#endif /* _DEV_IODEV_H_ */ diff --git a/usr/contrib/freebsd/dev/mii/mii.h b/usr/contrib/freebsd/dev/mii/mii.h new file mode 100644 index 0000000000..fa1ec84eaa --- /dev/null +++ b/usr/contrib/freebsd/dev/mii/mii.h @@ -0,0 +1,239 @@ +/* $NetBSD: mii.h,v 1.18 2014/06/16 14:43:22 msaitoh Exp $ */ + +/*- + * Copyright (c) 1997 Manuel Bouyer. All rights reserved. + * + * Modification to match BSD/OS 3.0 MII interface by Jason R. Thorpe, + * Numerical Aerospace Simulation Facility, NASA Ames Research Center. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _DEV_MII_MII_H_ +#define _DEV_MII_MII_H_ + +/* + * Registers common to all PHYs. 
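
/*
 * Illustrative sketch (not part of this change): driving the IODEV_PIO
 * ioctl defined above from userland.  The /dev/io device path follows
 * FreeBSD convention and is an assumption, as is read_port_byte().
 */
#include <sys/types.h>
#include <sys/ioctl.h>
#include <fcntl.h>
#include <unistd.h>

int
read_port_byte(u_int port)
{
	struct iodev_pio_req req = {
		.access = IODEV_PIO_READ,
		.port = port,
		.width = 1,		/* one byte */
		.val = 0,
	};
	int fd, rv;

	fd = open("/dev/io", O_RDWR);
	if (fd < 0)
		return (-1);
	rv = ioctl(fd, IODEV_PIO, &req);
	(void) close(fd);
	return (rv == 0 ? (int)req.val : -1);
}
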
+ */ + +#define MII_NPHY 32 /* max # of PHYs per MII */ + +/* + * MII commands, used if a device must drive the MII lines + * manually. + */ +#define MII_COMMAND_START 0x01 +#define MII_COMMAND_READ 0x02 +#define MII_COMMAND_WRITE 0x01 +#define MII_COMMAND_ACK 0x02 + +#define MII_BMCR 0x00 /* Basic mode control register (rw) */ +#define BMCR_RESET 0x8000 /* reset */ +#define BMCR_LOOP 0x4000 /* loopback */ +#define BMCR_SPEED0 0x2000 /* speed selection (LSB) */ +#define BMCR_AUTOEN 0x1000 /* autonegotiation enable */ +#define BMCR_PDOWN 0x0800 /* power down */ +#define BMCR_ISO 0x0400 /* isolate */ +#define BMCR_STARTNEG 0x0200 /* restart autonegotiation */ +#define BMCR_FDX 0x0100 /* Set duplex mode */ +#define BMCR_CTEST 0x0080 /* collision test */ +#define BMCR_SPEED1 0x0040 /* speed selection (MSB) */ + +#define BMCR_S10 0x0000 /* 10 Mb/s */ +#define BMCR_S100 BMCR_SPEED0 /* 100 Mb/s */ +#define BMCR_S1000 BMCR_SPEED1 /* 1000 Mb/s */ + +#define BMCR_SPEED(x) ((x) & (BMCR_SPEED0|BMCR_SPEED1)) + +#define MII_BMSR 0x01 /* Basic mode status register (ro) */ +#define BMSR_100T4 0x8000 /* 100 base T4 capable */ +#define BMSR_100TXFDX 0x4000 /* 100 base Tx full duplex capable */ +#define BMSR_100TXHDX 0x2000 /* 100 base Tx half duplex capable */ +#define BMSR_10TFDX 0x1000 /* 10 base T full duplex capable */ +#define BMSR_10THDX 0x0800 /* 10 base T half duplex capable */ +#define BMSR_100T2FDX 0x0400 /* 100 base T2 full duplex capable */ +#define BMSR_100T2HDX 0x0200 /* 100 base T2 half duplex capable */ +#define BMSR_EXTSTAT 0x0100 /* Extended status in register 15 */ +#define BMSR_MFPS 0x0040 /* MII Frame Preamble Suppression */ +#define BMSR_ACOMP 0x0020 /* Autonegotiation complete */ +#define BMSR_RFAULT 0x0010 /* Link partner fault */ +#define BMSR_ANEG 0x0008 /* Autonegotiation capable */ +#define BMSR_LINK 0x0004 /* Link status */ +#define BMSR_JABBER 0x0002 /* Jabber detected */ +#define BMSR_EXTCAP 0x0001 /* Extended capability */ + +#define BMSR_DEFCAPMASK 0xffffffff + +/* + * Note that the EXTSTAT bit indicates that there is extended status + * info available in register 15, but 802.3 section 22.2.4.3 also + * states that all 1000 Mb/s capable PHYs will set this bit to 1. + */ + +#define BMSR_MEDIAMASK (BMSR_100T4|BMSR_100TXFDX|BMSR_100TXHDX| \ + BMSR_10TFDX|BMSR_10THDX|BMSR_100T2FDX|BMSR_100T2HDX) + +/* + * Convert BMSR media capabilities to ANAR bits for autonegotiation. + * Note the shift chopps off the BMSR_ANEG bit. 
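
/*
 * Illustrative sketch (not part of this change): decoding link state from a
 * BMSR value read via the MII_BMSR register defined above.  A caller would
 * pass in the result of its own PHY register accessor.
 */
static int
phy_link_ok(uint16_t bmsr)
{
	/* The link bit is only meaningful once autonegotiation completes. */
	return ((bmsr & (BMSR_LINK | BMSR_ACOMP)) == (BMSR_LINK | BMSR_ACOMP));
}
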
+ */ +#define BMSR_MEDIA_TO_ANAR(x) (((x) & BMSR_MEDIAMASK) >> 6) + +#define MII_PHYIDR1 0x02 /* ID register 1 (ro) */ + +#define MII_PHYIDR2 0x03 /* ID register 2 (ro) */ +#define IDR2_OUILSB 0xfc00 /* OUI LSB */ +#define IDR2_MODEL 0x03f0 /* vendor model */ +#define IDR2_REV 0x000f /* vendor revision */ + +#define MII_ANAR 0x04 /* Autonegotiation advertisement (rw) */ + /* section 28.2.4.1 and 37.2.6.1 */ +#define ANAR_NP 0x8000 /* Next page (ro) */ +#define ANAR_ACK 0x4000 /* link partner abilities acknowledged (ro) */ +#define ANAR_RF 0x2000 /* remote fault (ro) */ + /* Annex 28B.2 */ +#define ANAR_FC 0x0400 /* local device supports PAUSE */ +#define ANAR_T4 0x0200 /* local device supports 100bT4 */ +#define ANAR_TX_FD 0x0100 /* local device supports 100bTx FD */ +#define ANAR_TX 0x0080 /* local device supports 100bTx */ +#define ANAR_10_FD 0x0040 /* local device supports 10bT FD */ +#define ANAR_10 0x0020 /* local device supports 10bT */ +#define ANAR_CSMA 0x0001 /* protocol selector CSMA/CD */ +#define ANAR_PAUSE_NONE (0 << 10) +#define ANAR_PAUSE_SYM (1 << 10) +#define ANAR_PAUSE_ASYM (2 << 10) +#define ANAR_PAUSE_TOWARDS (3 << 10) + + /* Annex 28D */ +#define ANAR_X_FD 0x0020 /* local device supports 1000BASE-X FD */ +#define ANAR_X_HD 0x0040 /* local device supports 1000BASE-X HD */ +#define ANAR_X_PAUSE_NONE (0 << 7) +#define ANAR_X_PAUSE_SYM (1 << 7) +#define ANAR_X_PAUSE_ASYM (2 << 7) +#define ANAR_X_PAUSE_TOWARDS (3 << 7) + +#define MII_ANLPAR 0x05 /* Autonegotiation lnk partner abilities (rw) */ + /* section 28.2.4.1 and 37.2.6.1 */ +#define ANLPAR_NP 0x8000 /* Next page (ro) */ +#define ANLPAR_ACK 0x4000 /* link partner accepted ACK (ro) */ +#define ANLPAR_RF 0x2000 /* remote fault (ro) */ +#define ANLPAR_FC 0x0400 /* link partner supports PAUSE */ +#define ANLPAR_T4 0x0200 /* link partner supports 100bT4 */ +#define ANLPAR_TX_FD 0x0100 /* link partner supports 100bTx FD */ +#define ANLPAR_TX 0x0080 /* link partner supports 100bTx */ +#define ANLPAR_10_FD 0x0040 /* link partner supports 10bT FD */ +#define ANLPAR_10 0x0020 /* link partner supports 10bT */ +#define ANLPAR_CSMA 0x0001 /* protocol selector CSMA/CD */ +#define ANLPAR_PAUSE_MASK (3 << 10) +#define ANLPAR_PAUSE_NONE (0 << 10) +#define ANLPAR_PAUSE_SYM (1 << 10) +#define ANLPAR_PAUSE_ASYM (2 << 10) +#define ANLPAR_PAUSE_TOWARDS (3 << 10) + +#define ANLPAR_X_FD 0x0020 /* local device supports 1000BASE-X FD */ +#define ANLPAR_X_HD 0x0040 /* local device supports 1000BASE-X HD */ +#define ANLPAR_X_PAUSE_MASK (3 << 7) +#define ANLPAR_X_PAUSE_NONE (0 << 7) +#define ANLPAR_X_PAUSE_SYM (1 << 7) +#define ANLPAR_X_PAUSE_ASYM (2 << 7) +#define ANLPAR_X_PAUSE_TOWARDS (3 << 7) + +#define MII_ANER 0x06 /* Autonegotiation expansion (ro) */ + /* section 28.2.4.1 and 37.2.6.1 */ +#define ANER_MLF 0x0010 /* multiple link detection fault */ +#define ANER_LPNP 0x0008 /* link parter next page-able */ +#define ANER_NP 0x0004 /* next page-able */ +#define ANER_PAGE_RX 0x0002 /* Page received */ +#define ANER_LPAN 0x0001 /* link parter autoneg-able */ + +#define MII_ANNP 0x07 /* Autonegotiation next page */ + /* section 28.2.4.1 and 37.2.6.1 */ + +#define MII_ANLPRNP 0x08 /* Autonegotiation link partner rx next page */ + /* section 32.5.1 and 37.2.6.1 */ + + /* This is also the 1000baseT control register */ +#define MII_100T2CR 0x09 /* 100base-T2 control register */ +#define GTCR_TEST_MASK 0xe000 /* see 802.3ab ss. 40.6.1.1.2 */ +#define GTCR_MAN_MS 0x1000 /* enable manual master/slave control */ +#define GTCR_ADV_MS 0x0800 /* 1 = adv. 
master, 0 = adv. slave */ +#define GTCR_PORT_TYPE 0x0400 /* 1 = DCE, 0 = DTE (NIC) */ +#define GTCR_ADV_1000TFDX 0x0200 /* adv. 1000baseT FDX */ +#define GTCR_ADV_1000THDX 0x0100 /* adv. 1000baseT HDX */ + + /* This is also the 1000baseT status register */ +#define MII_100T2SR 0x0a /* 100base-T2 status register */ +#define GTSR_MAN_MS_FLT 0x8000 /* master/slave config fault */ +#define GTSR_MS_RES 0x4000 /* result: 1 = master, 0 = slave */ +#define GTSR_LRS 0x2000 /* local rx status, 1 = ok */ +#define GTSR_RRS 0x1000 /* remote rx status, 1 = ok */ +#define GTSR_LP_1000TFDX 0x0800 /* link partner 1000baseT FDX capable */ +#define GTSR_LP_1000THDX 0x0400 /* link partner 1000baseT HDX capable */ +#define GTSR_LP_ASM_DIR 0x0200 /* link partner asym. pause dir. capable */ +#define GTSR_IDLE_ERR 0x00ff /* IDLE error count */ + +#define MII_PSECR 0x0b /* PSE control register */ +#define PSECR_PACTLMASK 0x000c /* pair control mask */ +#define PSECR_PSEENMASK 0x0003 /* PSE enable mask */ +#define PSECR_PINOUTB 0x0008 /* PSE pinout Alternative B */ +#define PSECR_PINOUTA 0x0004 /* PSE pinout Alternative A */ +#define PSECR_FOPOWTST 0x0002 /* Force Power Test Mode */ +#define PSECR_PSEEN 0x0001 /* PSE Enabled */ +#define PSECR_PSEDIS 0x0000 /* PSE Disabled */ + +#define MII_PSESR 0x0c /* PSE status register */ +#define PSESR_PWRDENIED 0x1000 /* Power Denied */ +#define PSESR_VALSIG 0x0800 /* Valid PD signature detected */ +#define PSESR_INVALSIG 0x0400 /* Invalid PD signature detected */ +#define PSESR_SHORTCIRC 0x0200 /* Short circuit condition detected */ +#define PSESR_OVERLOAD 0x0100 /* Overload condition detected */ +#define PSESR_MPSABSENT 0x0080 /* MPS absent condition detected */ +#define PSESR_PDCLMASK 0x0070 /* PD Class mask */ +#define PSESR_STATMASK 0x000e /* PSE Status mask */ +#define PSESR_PAIRCTABL 0x0001 /* PAIR Control Ability */ +#define PSESR_PDCL_4 (4 << 4) /* Class 4 */ +#define PSESR_PDCL_3 (3 << 4) /* Class 3 */ +#define PSESR_PDCL_2 (2 << 4) /* Class 2 */ +#define PSESR_PDCL_1 (1 << 4) /* Class 1 */ +#define PSESR_PDCL_0 (0 << 4) /* Class 0 */ + +#define MII_MMDACR 0x0d /* MMD access control register */ +#define MMDACR_FUNCMASK 0xc000 /* function */ +#define MMDACR_DADDRMASK 0x001f /* device address */ +#define MMDACR_FN_ADDRESS (0 << 14) /* address */ +#define MMDACR_FN_DATANPI (1 << 14) /* data, no post increment */ +#define MMDACR_FN_DATAPIRW (2 << 14) /* data, post increment on r/w */ +#define MMDACR_FN_DATAPIW (3 << 14) /* data, post increment on wr only */ + +#define MII_MMDAADR 0x0e /* MMD access address data register */ + +#define MII_EXTSR 0x0f /* Extended status register */ +#define EXTSR_1000XFDX 0x8000 /* 1000X full-duplex capable */ +#define EXTSR_1000XHDX 0x4000 /* 1000X half-duplex capable */ +#define EXTSR_1000TFDX 0x2000 /* 1000T full-duplex capable */ +#define EXTSR_1000THDX 0x1000 /* 1000T half-duplex capable */ + +#define EXTSR_MEDIAMASK (EXTSR_1000XFDX|EXTSR_1000XHDX| \ + EXTSR_1000TFDX|EXTSR_1000THDX) + +#endif /* _DEV_MII_MII_H_ */ diff --git a/usr/contrib/freebsd/dev/nvme/nvme.h b/usr/contrib/freebsd/dev/nvme/nvme.h new file mode 100644 index 0000000000..c7f6496426 --- /dev/null +++ b/usr/contrib/freebsd/dev/nvme/nvme.h @@ -0,0 +1,1511 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (C) 2012-2013 Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + * + * Copyright 2019 Joyent, Inc. + */ + +#ifndef __NVME_H__ +#define __NVME_H__ + +#ifdef _KERNEL +#include <sys/types.h> +#endif + +#include <sys/param.h> +#include <sys/endian.h> + +#define NVME_PASSTHROUGH_CMD _IOWR('n', 0, struct nvme_pt_command) +#define NVME_RESET_CONTROLLER _IO('n', 1) + +#define NVME_IO_TEST _IOWR('n', 100, struct nvme_io_test) +#define NVME_BIO_TEST _IOWR('n', 101, struct nvme_io_test) + +/* + * Macros to deal with NVME revisions, as defined VS register + */ +#define NVME_REV(x, y) (((x) << 16) | ((y) << 8)) +#define NVME_MAJOR(r) (((r) >> 16) & 0xffff) +#define NVME_MINOR(r) (((r) >> 8) & 0xff) + +/* + * Use to mark a command to apply to all namespaces, or to retrieve global + * log pages. + */ +#define NVME_GLOBAL_NAMESPACE_TAG ((uint32_t)0xFFFFFFFF) + +/* Cap nvme to 1MB transfers driver explodes with larger sizes */ +#define NVME_MAX_XFER_SIZE (MAXPHYS < (1<<20) ? 
MAXPHYS : (1<<20)) + +/* Register field definitions */ +#define NVME_CAP_LO_REG_MQES_SHIFT (0) +#define NVME_CAP_LO_REG_MQES_MASK (0xFFFF) +#define NVME_CAP_LO_REG_CQR_SHIFT (16) +#define NVME_CAP_LO_REG_CQR_MASK (0x1) +#define NVME_CAP_LO_REG_AMS_SHIFT (17) +#define NVME_CAP_LO_REG_AMS_MASK (0x3) +#define NVME_CAP_LO_REG_TO_SHIFT (24) +#define NVME_CAP_LO_REG_TO_MASK (0xFF) + +#define NVME_CAP_HI_REG_DSTRD_SHIFT (0) +#define NVME_CAP_HI_REG_DSTRD_MASK (0xF) +#define NVME_CAP_HI_REG_CSS_NVM_SHIFT (5) +#define NVME_CAP_HI_REG_CSS_NVM_MASK (0x1) +#define NVME_CAP_HI_REG_MPSMIN_SHIFT (16) +#define NVME_CAP_HI_REG_MPSMIN_MASK (0xF) +#define NVME_CAP_HI_REG_MPSMAX_SHIFT (20) +#define NVME_CAP_HI_REG_MPSMAX_MASK (0xF) + +#define NVME_CC_REG_EN_SHIFT (0) +#define NVME_CC_REG_EN_MASK (0x1) +#define NVME_CC_REG_CSS_SHIFT (4) +#define NVME_CC_REG_CSS_MASK (0x7) +#define NVME_CC_REG_MPS_SHIFT (7) +#define NVME_CC_REG_MPS_MASK (0xF) +#define NVME_CC_REG_AMS_SHIFT (11) +#define NVME_CC_REG_AMS_MASK (0x7) +#define NVME_CC_REG_SHN_SHIFT (14) +#define NVME_CC_REG_SHN_MASK (0x3) +#define NVME_CC_REG_IOSQES_SHIFT (16) +#define NVME_CC_REG_IOSQES_MASK (0xF) +#define NVME_CC_REG_IOCQES_SHIFT (20) +#define NVME_CC_REG_IOCQES_MASK (0xF) + +#define NVME_CSTS_REG_RDY_SHIFT (0) +#define NVME_CSTS_REG_RDY_MASK (0x1) +#define NVME_CSTS_REG_CFS_SHIFT (1) +#define NVME_CSTS_REG_CFS_MASK (0x1) +#define NVME_CSTS_REG_SHST_SHIFT (2) +#define NVME_CSTS_REG_SHST_MASK (0x3) + +#define NVME_CSTS_GET_SHST(csts) (((csts) >> NVME_CSTS_REG_SHST_SHIFT) & NVME_CSTS_REG_SHST_MASK) + +#define NVME_AQA_REG_ASQS_SHIFT (0) +#define NVME_AQA_REG_ASQS_MASK (0xFFF) +#define NVME_AQA_REG_ACQS_SHIFT (16) +#define NVME_AQA_REG_ACQS_MASK (0xFFF) + +/* Command field definitions */ + +#define NVME_CMD_FUSE_SHIFT (8) +#define NVME_CMD_FUSE_MASK (0x3) + +#define NVME_STATUS_P_SHIFT (0) +#define NVME_STATUS_P_MASK (0x1) +#define NVME_STATUS_SC_SHIFT (1) +#define NVME_STATUS_SC_MASK (0xFF) +#define NVME_STATUS_SCT_SHIFT (9) +#define NVME_STATUS_SCT_MASK (0x7) +#define NVME_STATUS_M_SHIFT (14) +#define NVME_STATUS_M_MASK (0x1) +#define NVME_STATUS_DNR_SHIFT (15) +#define NVME_STATUS_DNR_MASK (0x1) + +#define NVME_STATUS_GET_P(st) (((st) >> NVME_STATUS_P_SHIFT) & NVME_STATUS_P_MASK) +#define NVME_STATUS_GET_SC(st) (((st) >> NVME_STATUS_SC_SHIFT) & NVME_STATUS_SC_MASK) +#define NVME_STATUS_GET_SCT(st) (((st) >> NVME_STATUS_SCT_SHIFT) & NVME_STATUS_SCT_MASK) +#define NVME_STATUS_GET_M(st) (((st) >> NVME_STATUS_M_SHIFT) & NVME_STATUS_M_MASK) +#define NVME_STATUS_GET_DNR(st) (((st) >> NVME_STATUS_DNR_SHIFT) & NVME_STATUS_DNR_MASK) + +#define NVME_PWR_ST_MPS_SHIFT (0) +#define NVME_PWR_ST_MPS_MASK (0x1) +#define NVME_PWR_ST_NOPS_SHIFT (1) +#define NVME_PWR_ST_NOPS_MASK (0x1) +#define NVME_PWR_ST_RRT_SHIFT (0) +#define NVME_PWR_ST_RRT_MASK (0x1F) +#define NVME_PWR_ST_RRL_SHIFT (0) +#define NVME_PWR_ST_RRL_MASK (0x1F) +#define NVME_PWR_ST_RWT_SHIFT (0) +#define NVME_PWR_ST_RWT_MASK (0x1F) +#define NVME_PWR_ST_RWL_SHIFT (0) +#define NVME_PWR_ST_RWL_MASK (0x1F) +#define NVME_PWR_ST_IPS_SHIFT (6) +#define NVME_PWR_ST_IPS_MASK (0x3) +#define NVME_PWR_ST_APW_SHIFT (0) +#define NVME_PWR_ST_APW_MASK (0x7) +#define NVME_PWR_ST_APS_SHIFT (6) +#define NVME_PWR_ST_APS_MASK (0x3) + +/** Controller Multi-path I/O and Namespace Sharing Capabilities */ +/* More then one port */ +#define NVME_CTRLR_DATA_MIC_MPORTS_SHIFT (0) +#define NVME_CTRLR_DATA_MIC_MPORTS_MASK (0x1) +/* More then one controller */ +#define NVME_CTRLR_DATA_MIC_MCTRLRS_SHIFT (1) +#define 
NVME_CTRLR_DATA_MIC_MCTRLRS_MASK (0x1) +/* SR-IOV Virtual Function */ +#define NVME_CTRLR_DATA_MIC_SRIOVVF_SHIFT (2) +#define NVME_CTRLR_DATA_MIC_SRIOVVF_MASK (0x1) + +/** OACS - optional admin command support */ +/* supports security send/receive commands */ +#define NVME_CTRLR_DATA_OACS_SECURITY_SHIFT (0) +#define NVME_CTRLR_DATA_OACS_SECURITY_MASK (0x1) +/* supports format nvm command */ +#define NVME_CTRLR_DATA_OACS_FORMAT_SHIFT (1) +#define NVME_CTRLR_DATA_OACS_FORMAT_MASK (0x1) +/* supports firmware activate/download commands */ +#define NVME_CTRLR_DATA_OACS_FIRMWARE_SHIFT (2) +#define NVME_CTRLR_DATA_OACS_FIRMWARE_MASK (0x1) +/* supports namespace management commands */ +#define NVME_CTRLR_DATA_OACS_NSMGMT_SHIFT (3) +#define NVME_CTRLR_DATA_OACS_NSMGMT_MASK (0x1) +/* supports Device Self-test command */ +#define NVME_CTRLR_DATA_OACS_SELFTEST_SHIFT (4) +#define NVME_CTRLR_DATA_OACS_SELFTEST_MASK (0x1) +/* supports Directives */ +#define NVME_CTRLR_DATA_OACS_DIRECTIVES_SHIFT (5) +#define NVME_CTRLR_DATA_OACS_DIRECTIVES_MASK (0x1) +/* supports NVMe-MI Send/Receive */ +#define NVME_CTRLR_DATA_OACS_NVMEMI_SHIFT (6) +#define NVME_CTRLR_DATA_OACS_NVMEMI_MASK (0x1) +/* supports Virtualization Management */ +#define NVME_CTRLR_DATA_OACS_VM_SHIFT (7) +#define NVME_CTRLR_DATA_OACS_VM_MASK (0x1) +/* supports Doorbell Buffer Config */ +#define NVME_CTRLR_DATA_OACS_DBBUFFER_SHIFT (8) +#define NVME_CTRLR_DATA_OACS_DBBUFFER_MASK (0x1) + +/** firmware updates */ +/* first slot is read-only */ +#define NVME_CTRLR_DATA_FRMW_SLOT1_RO_SHIFT (0) +#define NVME_CTRLR_DATA_FRMW_SLOT1_RO_MASK (0x1) +/* number of firmware slots */ +#define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_SHIFT (1) +#define NVME_CTRLR_DATA_FRMW_NUM_SLOTS_MASK (0x7) + +/** log page attributes */ +/* per namespace smart/health log page */ +#define NVME_CTRLR_DATA_LPA_NS_SMART_SHIFT (0) +#define NVME_CTRLR_DATA_LPA_NS_SMART_MASK (0x1) + +/** AVSCC - admin vendor specific command configuration */ +/* admin vendor specific commands use spec format */ +#define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_SHIFT (0) +#define NVME_CTRLR_DATA_AVSCC_SPEC_FORMAT_MASK (0x1) + +/** Autonomous Power State Transition Attributes */ +/* Autonomous Power State Transitions supported */ +#define NVME_CTRLR_DATA_APSTA_APST_SUPP_SHIFT (0) +#define NVME_CTRLR_DATA_APSTA_APST_SUPP_MASK (0x1) + +/** submission queue entry size */ +#define NVME_CTRLR_DATA_SQES_MIN_SHIFT (0) +#define NVME_CTRLR_DATA_SQES_MIN_MASK (0xF) +#define NVME_CTRLR_DATA_SQES_MAX_SHIFT (4) +#define NVME_CTRLR_DATA_SQES_MAX_MASK (0xF) + +/** completion queue entry size */ +#define NVME_CTRLR_DATA_CQES_MIN_SHIFT (0) +#define NVME_CTRLR_DATA_CQES_MIN_MASK (0xF) +#define NVME_CTRLR_DATA_CQES_MAX_SHIFT (4) +#define NVME_CTRLR_DATA_CQES_MAX_MASK (0xF) + +/** optional nvm command support */ +#define NVME_CTRLR_DATA_ONCS_COMPARE_SHIFT (0) +#define NVME_CTRLR_DATA_ONCS_COMPARE_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_WRITE_UNC_SHIFT (1) +#define NVME_CTRLR_DATA_ONCS_WRITE_UNC_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_DSM_SHIFT (2) +#define NVME_CTRLR_DATA_ONCS_DSM_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_WRZERO_SHIFT (3) +#define NVME_CTRLR_DATA_ONCS_WRZERO_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_SAVEFEAT_SHIFT (4) +#define NVME_CTRLR_DATA_ONCS_SAVEFEAT_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_RESERV_SHIFT (5) +#define NVME_CTRLR_DATA_ONCS_RESERV_MASK (0x1) +#define NVME_CTRLR_DATA_ONCS_TIMESTAMP_SHIFT (6) +#define NVME_CTRLR_DATA_ONCS_TIMESTAMP_MASK (0x1) + +/** Fused Operation Support */ +#define 
NVME_CTRLR_DATA_FUSES_CNW_SHIFT (0) +#define NVME_CTRLR_DATA_FUSES_CNW_MASK (0x1) + +/** Format NVM Attributes */ +#define NVME_CTRLR_DATA_FNA_FORMAT_ALL_SHIFT (0) +#define NVME_CTRLR_DATA_FNA_FORMAT_ALL_MASK (0x1) +#define NVME_CTRLR_DATA_FNA_ERASE_ALL_SHIFT (1) +#define NVME_CTRLR_DATA_FNA_ERASE_ALL_MASK (0x1) +#define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_SHIFT (2) +#define NVME_CTRLR_DATA_FNA_CRYPTO_ERASE_MASK (0x1) + +/** volatile write cache */ +#define NVME_CTRLR_DATA_VWC_PRESENT_SHIFT (0) +#define NVME_CTRLR_DATA_VWC_PRESENT_MASK (0x1) + +/** namespace features */ +/* thin provisioning */ +#define NVME_NS_DATA_NSFEAT_THIN_PROV_SHIFT (0) +#define NVME_NS_DATA_NSFEAT_THIN_PROV_MASK (0x1) +/* NAWUN, NAWUPF, and NACWU fields are valid */ +#define NVME_NS_DATA_NSFEAT_NA_FIELDS_SHIFT (1) +#define NVME_NS_DATA_NSFEAT_NA_FIELDS_MASK (0x1) +/* Deallocated or Unwritten Logical Block errors supported */ +#define NVME_NS_DATA_NSFEAT_DEALLOC_SHIFT (2) +#define NVME_NS_DATA_NSFEAT_DEALLOC_MASK (0x1) +/* NGUID and EUI64 fields are not reusable */ +#define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_SHIFT (3) +#define NVME_NS_DATA_NSFEAT_NO_ID_REUSE_MASK (0x1) + +/** formatted lba size */ +#define NVME_NS_DATA_FLBAS_FORMAT_SHIFT (0) +#define NVME_NS_DATA_FLBAS_FORMAT_MASK (0xF) +#define NVME_NS_DATA_FLBAS_EXTENDED_SHIFT (4) +#define NVME_NS_DATA_FLBAS_EXTENDED_MASK (0x1) + +/** metadata capabilities */ +/* metadata can be transferred as part of data prp list */ +#define NVME_NS_DATA_MC_EXTENDED_SHIFT (0) +#define NVME_NS_DATA_MC_EXTENDED_MASK (0x1) +/* metadata can be transferred with separate metadata pointer */ +#define NVME_NS_DATA_MC_POINTER_SHIFT (1) +#define NVME_NS_DATA_MC_POINTER_MASK (0x1) + +/** end-to-end data protection capabilities */ +/* protection information type 1 */ +#define NVME_NS_DATA_DPC_PIT1_SHIFT (0) +#define NVME_NS_DATA_DPC_PIT1_MASK (0x1) +/* protection information type 2 */ +#define NVME_NS_DATA_DPC_PIT2_SHIFT (1) +#define NVME_NS_DATA_DPC_PIT2_MASK (0x1) +/* protection information type 3 */ +#define NVME_NS_DATA_DPC_PIT3_SHIFT (2) +#define NVME_NS_DATA_DPC_PIT3_MASK (0x1) +/* first eight bytes of metadata */ +#define NVME_NS_DATA_DPC_MD_START_SHIFT (3) +#define NVME_NS_DATA_DPC_MD_START_MASK (0x1) +/* last eight bytes of metadata */ +#define NVME_NS_DATA_DPC_MD_END_SHIFT (4) +#define NVME_NS_DATA_DPC_MD_END_MASK (0x1) + +/** end-to-end data protection type settings */ +/* protection information type */ +#define NVME_NS_DATA_DPS_PIT_SHIFT (0) +#define NVME_NS_DATA_DPS_PIT_MASK (0x7) +/* 1 == protection info transferred at start of metadata */ +/* 0 == protection info transferred at end of metadata */ +#define NVME_NS_DATA_DPS_MD_START_SHIFT (3) +#define NVME_NS_DATA_DPS_MD_START_MASK (0x1) + +/** Namespace Multi-path I/O and Namespace Sharing Capabilities */ +/* the namespace may be attached to two or more controllers */ +#define NVME_NS_DATA_NMIC_MAY_BE_SHARED_SHIFT (0) +#define NVME_NS_DATA_NMIC_MAY_BE_SHARED_MASK (0x1) + +/** Reservation Capabilities */ +/* Persist Through Power Loss */ +#define NVME_NS_DATA_RESCAP_PTPL_SHIFT (0) +#define NVME_NS_DATA_RESCAP_PTPL_MASK (0x1) +/* supports the Write Exclusive */ +#define NVME_NS_DATA_RESCAP_WR_EX_SHIFT (1) +#define NVME_NS_DATA_RESCAP_WR_EX_MASK (0x1) +/* supports the Exclusive Access */ +#define NVME_NS_DATA_RESCAP_EX_AC_SHIFT (2) +#define NVME_NS_DATA_RESCAP_EX_AC_MASK (0x1) +/* supports the Write Exclusive – Registrants Only */ +#define NVME_NS_DATA_RESCAP_WR_EX_RO_SHIFT (3) +#define NVME_NS_DATA_RESCAP_WR_EX_RO_MASK (0x1) +/* 
supports the Exclusive Access - Registrants Only */ +#define NVME_NS_DATA_RESCAP_EX_AC_RO_SHIFT (4) +#define NVME_NS_DATA_RESCAP_EX_AC_RO_MASK (0x1) +/* supports the Write Exclusive – All Registrants */ +#define NVME_NS_DATA_RESCAP_WR_EX_AR_SHIFT (5) +#define NVME_NS_DATA_RESCAP_WR_EX_AR_MASK (0x1) +/* supports the Exclusive Access - All Registrants */ +#define NVME_NS_DATA_RESCAP_EX_AC_AR_SHIFT (6) +#define NVME_NS_DATA_RESCAP_EX_AC_AR_MASK (0x1) +/* Ignore Existing Key is used as defined in revision 1.3 or later */ +#define NVME_NS_DATA_RESCAP_IEKEY13_SHIFT (7) +#define NVME_NS_DATA_RESCAP_IEKEY13_MASK (0x1) + +/** Format Progress Indicator */ +/* percentage of the Format NVM command that remains to be completed */ +#define NVME_NS_DATA_FPI_PERC_SHIFT (0) +#define NVME_NS_DATA_FPI_PERC_MASK (0x7f) +/* namespace supports the Format Progress Indicator */ +#define NVME_NS_DATA_FPI_SUPP_SHIFT (7) +#define NVME_NS_DATA_FPI_SUPP_MASK (0x1) + +/** lba format support */ +/* metadata size */ +#define NVME_NS_DATA_LBAF_MS_SHIFT (0) +#define NVME_NS_DATA_LBAF_MS_MASK (0xFFFF) +/* lba data size */ +#define NVME_NS_DATA_LBAF_LBADS_SHIFT (16) +#define NVME_NS_DATA_LBAF_LBADS_MASK (0xFF) +/* relative performance */ +#define NVME_NS_DATA_LBAF_RP_SHIFT (24) +#define NVME_NS_DATA_LBAF_RP_MASK (0x3) + +enum nvme_critical_warning_state { + NVME_CRIT_WARN_ST_AVAILABLE_SPARE = 0x1, + NVME_CRIT_WARN_ST_TEMPERATURE = 0x2, + NVME_CRIT_WARN_ST_DEVICE_RELIABILITY = 0x4, + NVME_CRIT_WARN_ST_READ_ONLY = 0x8, + NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP = 0x10, +}; +#define NVME_CRIT_WARN_ST_RESERVED_MASK (0xE0) + +/* slot for current FW */ +#define NVME_FIRMWARE_PAGE_AFI_SLOT_SHIFT (0) +#define NVME_FIRMWARE_PAGE_AFI_SLOT_MASK (0x7) + +/* CC register SHN field values */ +enum shn_value { + NVME_SHN_NORMAL = 0x1, + NVME_SHN_ABRUPT = 0x2, +}; + +/* CSTS register SHST field values */ +enum shst_value { + NVME_SHST_NORMAL = 0x0, + NVME_SHST_OCCURRING = 0x1, + NVME_SHST_COMPLETE = 0x2, +}; + +struct nvme_registers +{ + /** controller capabilities */ + uint32_t cap_lo; + uint32_t cap_hi; + + uint32_t vs; /* version */ + uint32_t intms; /* interrupt mask set */ + uint32_t intmc; /* interrupt mask clear */ + + /** controller configuration */ + uint32_t cc; + + uint32_t reserved1; + + /** controller status */ + uint32_t csts; + + uint32_t reserved2; + + /** admin queue attributes */ + uint32_t aqa; + + uint64_t asq; /* admin submission queue base addr */ + uint64_t acq; /* admin completion queue base addr */ + uint32_t reserved3[0x3f2]; + + struct { + uint32_t sq_tdbl; /* submission queue tail doorbell */ + uint32_t cq_hdbl; /* completion queue head doorbell */ + } doorbell[1] __packed; +} __packed; + +_Static_assert(sizeof(struct nvme_registers) == 0x1008, "bad size for nvme_registers"); + +struct nvme_command +{ + /* dword 0 */ + uint8_t opc; /* opcode */ + uint8_t fuse; /* fused operation */ + uint16_t cid; /* command identifier */ + + /* dword 1 */ + uint32_t nsid; /* namespace identifier */ + + /* dword 2-3 */ + uint32_t rsvd2; + uint32_t rsvd3; + + /* dword 4-5 */ + uint64_t mptr; /* metadata pointer */ + + /* dword 6-7 */ + uint64_t prp1; /* prp entry 1 */ + + /* dword 8-9 */ + uint64_t prp2; /* prp entry 2 */ + + /* dword 10-15 */ + uint32_t cdw10; /* command-specific */ + uint32_t cdw11; /* command-specific */ + uint32_t cdw12; /* command-specific */ + uint32_t cdw13; /* command-specific */ + uint32_t cdw14; /* command-specific */ + uint32_t cdw15; /* command-specific */ +} __packed; + +_Static_assert(sizeof(struct 
nvme_command) == 16 * 4, "bad size for nvme_command");
+
+struct nvme_completion {
+
+ /* dword 0 */
+ uint32_t cdw0; /* command-specific */
+
+ /* dword 1 */
+ uint32_t rsvd1;
+
+ /* dword 2 */
+ uint16_t sqhd; /* submission queue head pointer */
+ uint16_t sqid; /* submission queue identifier */
+
+ /* dword 3 */
+ uint16_t cid; /* command identifier */
+ uint16_t status;
+} __packed;
+
+_Static_assert(sizeof(struct nvme_completion) == 4 * 4, "bad size for nvme_completion");
+
+struct nvme_dsm_range {
+ uint32_t attributes;
+ uint32_t length;
+ uint64_t starting_lba;
+} __packed;
+
+/* Largest DSM Trim that can be done */
+#define NVME_MAX_DSM_TRIM 4096
+
+_Static_assert(sizeof(struct nvme_dsm_range) == 16, "bad size for nvme_dsm_range");
+
+/* status code types */
+enum nvme_status_code_type {
+ NVME_SCT_GENERIC = 0x0,
+ NVME_SCT_COMMAND_SPECIFIC = 0x1,
+ NVME_SCT_MEDIA_ERROR = 0x2,
+ /* 0x3-0x6 - reserved */
+ NVME_SCT_VENDOR_SPECIFIC = 0x7,
+};
+
+/* generic command status codes */
+enum nvme_generic_command_status_code {
+ NVME_SC_SUCCESS = 0x00,
+ NVME_SC_INVALID_OPCODE = 0x01,
+ NVME_SC_INVALID_FIELD = 0x02,
+ NVME_SC_COMMAND_ID_CONFLICT = 0x03,
+ NVME_SC_DATA_TRANSFER_ERROR = 0x04,
+ NVME_SC_ABORTED_POWER_LOSS = 0x05,
+ NVME_SC_INTERNAL_DEVICE_ERROR = 0x06,
+ NVME_SC_ABORTED_BY_REQUEST = 0x07,
+ NVME_SC_ABORTED_SQ_DELETION = 0x08,
+ NVME_SC_ABORTED_FAILED_FUSED = 0x09,
+ NVME_SC_ABORTED_MISSING_FUSED = 0x0a,
+ NVME_SC_INVALID_NAMESPACE_OR_FORMAT = 0x0b,
+ NVME_SC_COMMAND_SEQUENCE_ERROR = 0x0c,
+ NVME_SC_INVALID_SGL_SEGMENT_DESCR = 0x0d,
+ NVME_SC_INVALID_NUMBER_OF_SGL_DESCR = 0x0e,
+ NVME_SC_DATA_SGL_LENGTH_INVALID = 0x0f,
+ NVME_SC_METADATA_SGL_LENGTH_INVALID = 0x10,
+ NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID = 0x11,
+ NVME_SC_INVALID_USE_OF_CMB = 0x12,
+ NVME_SC_PRP_OFFET_INVALID = 0x13,
+ NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED = 0x14,
+ NVME_SC_OPERATION_DENIED = 0x15,
+ NVME_SC_SGL_OFFSET_INVALID = 0x16,
+ /* 0x17 - reserved */
+ NVME_SC_HOST_ID_INCONSISTENT_FORMAT = 0x18,
+ NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED = 0x19,
+ NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID = 0x1a,
+ NVME_SC_ABORTED_DUE_TO_PREEMPT = 0x1b,
+ NVME_SC_SANITIZE_FAILED = 0x1c,
+ NVME_SC_SANITIZE_IN_PROGRESS = 0x1d,
+ NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID = 0x1e,
+ NVME_SC_NOT_SUPPORTED_IN_CMB = 0x1f,
+
+ NVME_SC_LBA_OUT_OF_RANGE = 0x80,
+ NVME_SC_CAPACITY_EXCEEDED = 0x81,
+ NVME_SC_NAMESPACE_NOT_READY = 0x82,
+ NVME_SC_RESERVATION_CONFLICT = 0x83,
+ NVME_SC_FORMAT_IN_PROGRESS = 0x84,
+};
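/*
 * Illustrative sketch (an editor's example, not part of the diff): the
 * 16-bit completion status packs the status code type (SCT) and status
 * code (SC) together; the NVME_STATUS_GET_SCT()/NVME_STATUS_GET_SC()
 * accessors used by nvme_completion_is_error() later in this header
 * pull them apart for comparison against these enums. Assumes
 * cpl->status is already in host byte order (see
 * nvme_completion_swapbytes() further down).
 */
static inline int
example_cpl_ns_not_ready(const struct nvme_completion *cpl)
{
	return (NVME_STATUS_GET_SCT(cpl->status) == NVME_SCT_GENERIC &&
	    NVME_STATUS_GET_SC(cpl->status) == NVME_SC_NAMESPACE_NOT_READY);
}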
+
+/* command specific status codes */
+enum nvme_command_specific_status_code {
+ NVME_SC_COMPLETION_QUEUE_INVALID = 0x00,
+ NVME_SC_INVALID_QUEUE_IDENTIFIER = 0x01,
+ NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED = 0x02,
+ NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED = 0x03,
+ /* 0x04 - reserved */
+ NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED = 0x05,
+ NVME_SC_INVALID_FIRMWARE_SLOT = 0x06,
+ NVME_SC_INVALID_FIRMWARE_IMAGE = 0x07,
+ NVME_SC_INVALID_INTERRUPT_VECTOR = 0x08,
+ NVME_SC_INVALID_LOG_PAGE = 0x09,
+ NVME_SC_INVALID_FORMAT = 0x0a,
+ NVME_SC_FIRMWARE_REQUIRES_RESET = 0x0b,
+ NVME_SC_INVALID_QUEUE_DELETION = 0x0c,
+ NVME_SC_FEATURE_NOT_SAVEABLE = 0x0d,
+ NVME_SC_FEATURE_NOT_CHANGEABLE = 0x0e,
+ NVME_SC_FEATURE_NOT_NS_SPECIFIC = 0x0f,
+ NVME_SC_FW_ACT_REQUIRES_NVMS_RESET = 0x10,
+ NVME_SC_FW_ACT_REQUIRES_RESET = 0x11,
+ NVME_SC_FW_ACT_REQUIRES_TIME = 0x12,
+ NVME_SC_FW_ACT_PROHIBITED = 0x13,
+ NVME_SC_OVERLAPPING_RANGE = 0x14,
+ NVME_SC_NS_INSUFFICIENT_CAPACITY = 0x15,
+ NVME_SC_NS_ID_UNAVAILABLE = 0x16,
+ /* 0x17 - reserved */
+ NVME_SC_NS_ALREADY_ATTACHED = 0x18,
+ NVME_SC_NS_IS_PRIVATE = 0x19,
+ NVME_SC_NS_NOT_ATTACHED = 0x1a,
+ NVME_SC_THIN_PROV_NOT_SUPPORTED = 0x1b,
+ NVME_SC_CTRLR_LIST_INVALID = 0x1c,
+ NVME_SC_SELT_TEST_IN_PROGRESS = 0x1d,
+ NVME_SC_BOOT_PART_WRITE_PROHIB = 0x1e,
+ NVME_SC_INVALID_CTRLR_ID = 0x1f,
+ NVME_SC_INVALID_SEC_CTRLR_STATE = 0x20,
+ NVME_SC_INVALID_NUM_OF_CTRLR_RESRC = 0x21,
+ NVME_SC_INVALID_RESOURCE_ID = 0x22,
+
+ NVME_SC_CONFLICTING_ATTRIBUTES = 0x80,
+ NVME_SC_INVALID_PROTECTION_INFO = 0x81,
+ NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE = 0x82,
+};
+
+/* media error status codes */
+enum nvme_media_error_status_code {
+ NVME_SC_WRITE_FAULTS = 0x80,
+ NVME_SC_UNRECOVERED_READ_ERROR = 0x81,
+ NVME_SC_GUARD_CHECK_ERROR = 0x82,
+ NVME_SC_APPLICATION_TAG_CHECK_ERROR = 0x83,
+ NVME_SC_REFERENCE_TAG_CHECK_ERROR = 0x84,
+ NVME_SC_COMPARE_FAILURE = 0x85,
+ NVME_SC_ACCESS_DENIED = 0x86,
+ NVME_SC_DEALLOCATED_OR_UNWRITTEN = 0x87,
+};
+
+/* admin opcodes */
+enum nvme_admin_opcode {
+ NVME_OPC_DELETE_IO_SQ = 0x00,
+ NVME_OPC_CREATE_IO_SQ = 0x01,
+ NVME_OPC_GET_LOG_PAGE = 0x02,
+ /* 0x03 - reserved */
+ NVME_OPC_DELETE_IO_CQ = 0x04,
+ NVME_OPC_CREATE_IO_CQ = 0x05,
+ NVME_OPC_IDENTIFY = 0x06,
+ /* 0x07 - reserved */
+ NVME_OPC_ABORT = 0x08,
+ NVME_OPC_SET_FEATURES = 0x09,
+ NVME_OPC_GET_FEATURES = 0x0a,
+ /* 0x0b - reserved */
+ NVME_OPC_ASYNC_EVENT_REQUEST = 0x0c,
+ NVME_OPC_NAMESPACE_MANAGEMENT = 0x0d,
+ /* 0x0e-0x0f - reserved */
+ NVME_OPC_FIRMWARE_ACTIVATE = 0x10,
+ NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD = 0x11,
+ NVME_OPC_DEVICE_SELF_TEST = 0x14,
+ NVME_OPC_NAMESPACE_ATTACHMENT = 0x15,
+ NVME_OPC_KEEP_ALIVE = 0x18,
+ NVME_OPC_DIRECTIVE_SEND = 0x19,
+ NVME_OPC_DIRECTIVE_RECEIVE = 0x1a,
+ NVME_OPC_VIRTUALIZATION_MANAGEMENT = 0x1c,
+ NVME_OPC_NVME_MI_SEND = 0x1d,
+ NVME_OPC_NVME_MI_RECEIVE = 0x1e,
+ NVME_OPC_DOORBELL_BUFFER_CONFIG = 0x7c,
+
+ NVME_OPC_FORMAT_NVM = 0x80,
+ NVME_OPC_SECURITY_SEND = 0x81,
+ NVME_OPC_SECURITY_RECEIVE = 0x82,
+ NVME_OPC_SANITIZE = 0x84,
+};
+
+/* nvme nvm opcodes */
+enum nvme_nvm_opcode {
+ NVME_OPC_FLUSH = 0x00,
+ NVME_OPC_WRITE = 0x01,
+ NVME_OPC_READ = 0x02,
+ /* 0x03 - reserved */
+ NVME_OPC_WRITE_UNCORRECTABLE = 0x04,
+ NVME_OPC_COMPARE = 0x05,
+ /* 0x06-0x07 - reserved */
+ NVME_OPC_WRITE_ZEROES = 0x08,
+ NVME_OPC_DATASET_MANAGEMENT = 0x09,
+ /* 0x0a-0x0c - reserved */
+ NVME_OPC_RESERVATION_REGISTER = 0x0d,
+ NVME_OPC_RESERVATION_REPORT = 0x0e,
+ /* 0x0f-0x10 - reserved */
+ NVME_OPC_RESERVATION_ACQUIRE = 0x11,
+ /* 0x12-0x14 - reserved */
+ NVME_OPC_RESERVATION_RELEASE = 0x15,
+};
+
+enum nvme_feature {
+ /* 0x00 - reserved */
+ NVME_FEAT_ARBITRATION = 0x01,
+ NVME_FEAT_POWER_MANAGEMENT = 0x02,
+ NVME_FEAT_LBA_RANGE_TYPE = 0x03,
+ NVME_FEAT_TEMPERATURE_THRESHOLD = 0x04,
+ NVME_FEAT_ERROR_RECOVERY = 0x05,
+ NVME_FEAT_VOLATILE_WRITE_CACHE = 0x06,
+ NVME_FEAT_NUMBER_OF_QUEUES = 0x07,
+ NVME_FEAT_INTERRUPT_COALESCING = 0x08,
+ NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION = 0x09,
+ NVME_FEAT_WRITE_ATOMICITY = 0x0A,
+ NVME_FEAT_ASYNC_EVENT_CONFIGURATION = 0x0B,
+ NVME_FEAT_AUTONOMOUS_POWER_STATE_TRANSITION = 0x0C,
+ NVME_FEAT_HOST_MEMORY_BUFFER = 0x0D,
+ NVME_FEAT_TIMESTAMP = 0x0E,
+ NVME_FEAT_KEEP_ALIVE_TIMER = 0x0F,
+ NVME_FEAT_HOST_CONTROLLED_THERMAL_MGMT = 0x10,
+ NVME_FEAT_NON_OP_POWER_STATE_CONFIG = 0x11,
+ /* 0x12-0x77 - reserved */
+ /* 0x78-0x7f - NVMe Management Interface */
+ NVME_FEAT_SOFTWARE_PROGRESS_MARKER = 0x80,
+ /* 0x81-0xBF - command set specific (reserved) */
+ /* 0xC0-0xFF - vendor specific */
+};
+
+enum nvme_dsm_attribute {
NVME_DSM_ATTR_INTEGRAL_READ = 0x1, + NVME_DSM_ATTR_INTEGRAL_WRITE = 0x2, + NVME_DSM_ATTR_DEALLOCATE = 0x4, +}; + +enum nvme_activate_action { + NVME_AA_REPLACE_NO_ACTIVATE = 0x0, + NVME_AA_REPLACE_ACTIVATE = 0x1, + NVME_AA_ACTIVATE = 0x2, +}; + +struct nvme_power_state { + /** Maximum Power */ + uint16_t mp; /* Maximum Power */ + uint8_t ps_rsvd1; + uint8_t mps_nops; /* Max Power Scale, Non-Operational State */ + + uint32_t enlat; /* Entry Latency */ + uint32_t exlat; /* Exit Latency */ + + uint8_t rrt; /* Relative Read Throughput */ + uint8_t rrl; /* Relative Read Latency */ + uint8_t rwt; /* Relative Write Throughput */ + uint8_t rwl; /* Relative Write Latency */ + + uint16_t idlp; /* Idle Power */ + uint8_t ips; /* Idle Power Scale */ + uint8_t ps_rsvd8; + + uint16_t actp; /* Active Power */ + uint8_t apw_aps; /* Active Power Workload, Active Power Scale */ + uint8_t ps_rsvd10[9]; +} __packed; + +_Static_assert(sizeof(struct nvme_power_state) == 32, "bad size for nvme_power_state"); + +#define NVME_SERIAL_NUMBER_LENGTH 20 +#define NVME_MODEL_NUMBER_LENGTH 40 +#define NVME_FIRMWARE_REVISION_LENGTH 8 + +struct nvme_controller_data { + + /* bytes 0-255: controller capabilities and features */ + + /** pci vendor id */ + uint16_t vid; + + /** pci subsystem vendor id */ + uint16_t ssvid; + + /** serial number */ + uint8_t sn[NVME_SERIAL_NUMBER_LENGTH]; + + /** model number */ + uint8_t mn[NVME_MODEL_NUMBER_LENGTH]; + + /** firmware revision */ + uint8_t fr[NVME_FIRMWARE_REVISION_LENGTH]; + + /** recommended arbitration burst */ + uint8_t rab; + + /** ieee oui identifier */ + uint8_t ieee[3]; + + /** multi-interface capabilities */ + uint8_t mic; + + /** maximum data transfer size */ + uint8_t mdts; + + /** Controller ID */ + uint16_t ctrlr_id; + + /** Version */ + uint32_t ver; + + /** RTD3 Resume Latency */ + uint32_t rtd3r; + + /** RTD3 Enter Latency */ + uint32_t rtd3e; + + /** Optional Asynchronous Events Supported */ + uint32_t oaes; /* bitfield really */ + + /** Controller Attributes */ + uint32_t ctratt; /* bitfield really */ + + uint8_t reserved1[12]; + + /** FRU Globally Unique Identifier */ + uint8_t fguid[16]; + + uint8_t reserved2[128]; + + /* bytes 256-511: admin command set attributes */ + + /** optional admin command support */ + uint16_t oacs; + + /** abort command limit */ + uint8_t acl; + + /** asynchronous event request limit */ + uint8_t aerl; + + /** firmware updates */ + uint8_t frmw; + + /** log page attributes */ + uint8_t lpa; + + /** error log page entries */ + uint8_t elpe; + + /** number of power states supported */ + uint8_t npss; + + /** admin vendor specific command configuration */ + uint8_t avscc; + + /** Autonomous Power State Transition Attributes */ + uint8_t apsta; + + /** Warning Composite Temperature Threshold */ + uint16_t wctemp; + + /** Critical Composite Temperature Threshold */ + uint16_t cctemp; + + /** Maximum Time for Firmware Activation */ + uint16_t mtfa; + + /** Host Memory Buffer Preferred Size */ + uint32_t hmpre; + + /** Host Memory Buffer Minimum Size */ + uint32_t hmmin; + + /** Name space capabilities */ + struct { + /* if nsmgmt, report tnvmcap and unvmcap */ + uint8_t tnvmcap[16]; + uint8_t unvmcap[16]; + } __packed untncap; + + /** Replay Protected Memory Block Support */ + uint32_t rpmbs; /* Really a bitfield */ + + /** Extended Device Self-test Time */ + uint16_t edstt; + + /** Device Self-test Options */ + uint8_t dsto; /* Really a bitfield */ + + /** Firmware Update Granularity */ + uint8_t fwug; + + /** Keep Alive Support */ + 
uint16_t kas;
+
+ /** Host Controlled Thermal Management Attributes */
+ uint16_t hctma; /* Really a bitfield */
+
+ /** Minimum Thermal Management Temperature */
+ uint16_t mntmt;
+
+ /** Maximum Thermal Management Temperature */
+ uint16_t mxtmt;
+
+ /** Sanitize Capabilities */
+ uint32_t sanicap; /* Really a bitfield */
+
+ uint8_t reserved3[180];
+ /* bytes 512-703: nvm command set attributes */
+
+ /** submission queue entry size */
+ uint8_t sqes;
+
+ /** completion queue entry size */
+ uint8_t cqes;
+
+ /** Maximum Outstanding Commands */
+ uint16_t maxcmd;
+
+ /** number of namespaces */
+ uint32_t nn;
+
+ /** optional nvm command support */
+ uint16_t oncs;
+
+ /** fused operation support */
+ uint16_t fuses;
+
+ /** format nvm attributes */
+ uint8_t fna;
+
+ /** volatile write cache */
+ uint8_t vwc;
+
+ /** Atomic Write Unit Normal */
+ uint16_t awun;
+
+ /** Atomic Write Unit Power Fail */
+ uint16_t awupf;
+
+ /** NVM Vendor Specific Command Configuration */
+ uint8_t nvscc;
+ uint8_t reserved5;
+
+ /** Atomic Compare & Write Unit */
+ uint16_t acwu;
+ uint16_t reserved6;
+
+ /** SGL Support */
+ uint32_t sgls;
+
+ /* bytes 540-767: Reserved */
+ uint8_t reserved7[228];
+
+ /** NVM Subsystem NVMe Qualified Name */
+ uint8_t subnqn[256];
+
+ /* bytes 1024-1791: Reserved */
+ uint8_t reserved8[768];
+
+ /* bytes 1792-2047: NVMe over Fabrics specification */
+ uint8_t reserved9[256];
+
+ /* bytes 2048-3071: power state descriptors */
+ struct nvme_power_state power_state[32];
+
+ /* bytes 3072-4095: vendor specific */
+ uint8_t vs[1024];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_controller_data) == 4096, "bad size for nvme_controller_data");
+
+struct nvme_namespace_data {
+
+ /** namespace size */
+ uint64_t nsze;
+
+ /** namespace capacity */
+ uint64_t ncap;
+
+ /** namespace utilization */
+ uint64_t nuse;
+
+ /** namespace features */
+ uint8_t nsfeat;
+
+ /** number of lba formats */
+ uint8_t nlbaf;
+
+ /** formatted lba size */
+ uint8_t flbas;
+
+ /** metadata capabilities */
+ uint8_t mc;
+
+ /** end-to-end data protection capabilities */
+ uint8_t dpc;
+
+ /** end-to-end data protection type settings */
+ uint8_t dps;
+
+ /** Namespace Multi-path I/O and Namespace Sharing Capabilities */
+ uint8_t nmic;
+
+ /** Reservation Capabilities */
+ uint8_t rescap;
+
+ /** Format Progress Indicator */
+ uint8_t fpi;
+
+ /** Deallocate Logical Block Features */
+ uint8_t dlfeat;
+
+ /** Namespace Atomic Write Unit Normal */
+ uint16_t nawun;
+
+ /** Namespace Atomic Write Unit Power Fail */
+ uint16_t nawupf;
+
+ /** Namespace Atomic Compare & Write Unit */
+ uint16_t nacwu;
+
+ /** Namespace Atomic Boundary Size Normal */
+ uint16_t nabsn;
+
+ /** Namespace Atomic Boundary Offset */
+ uint16_t nabo;
+
+ /** Namespace Atomic Boundary Size Power Fail */
+ uint16_t nabspf;
+
+ /** Namespace Optimal IO Boundary */
+ uint16_t noiob;
+
+ /** NVM Capacity */
+ uint8_t nvmcap[16];
+
+ /* bytes 64-103: Reserved */
+ uint8_t reserved5[40];
+
+ /** Namespace Globally Unique Identifier */
+ uint8_t nguid[16];
+
+ /** IEEE Extended Unique Identifier */
+ uint8_t eui64[8];
+
+ /** lba format support */
+ uint32_t lbaf[16];
+
+ uint8_t reserved6[192];
+
+ uint8_t vendor_specific[3712];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_namespace_data) == 4096, "bad size for nvme_namespace_data");
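/*
 * Illustrative sketch (an editor's example, not part of the diff): how
 * the FLBAS/LBAF shift-and-mask accessors defined above combine with
 * the identify data to yield a namespace's LBA size. Assumes "nsd" is
 * already in host byte order (see nvme_namespace_data_swapbytes()
 * later in this header).
 */
static inline uint32_t
example_ns_lba_size(const struct nvme_namespace_data *nsd)
{
	uint8_t fmt, lbads;

	/* FLBAS bits 3:0 select the active entry in lbaf[] */
	fmt = (nsd->flbas >> NVME_NS_DATA_FLBAS_FORMAT_SHIFT) &
	    NVME_NS_DATA_FLBAS_FORMAT_MASK;
	/* LBADS is the log2 of the LBA data size in bytes */
	lbads = (nsd->lbaf[fmt] >> NVME_NS_DATA_LBAF_LBADS_SHIFT) &
	    NVME_NS_DATA_LBAF_LBADS_MASK;
	return (1U << lbads);
}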
+
+enum nvme_log_page {
+
+ /* 0x00 - reserved */
+ NVME_LOG_ERROR = 0x01,
+ NVME_LOG_HEALTH_INFORMATION = 0x02,
+ NVME_LOG_FIRMWARE_SLOT = 0x03,
+ NVME_LOG_CHANGED_NAMESPACE = 0x04,
+ NVME_LOG_COMMAND_EFFECT = 0x05,
+ /* 0x06-0x7F - reserved */
+ /* 0x80-0xBF - I/O command set specific */
+ NVME_LOG_RES_NOTIFICATION = 0x80,
+ /* 0xC0-0xFF - vendor specific */
+
+ /*
+ * The following are Intel Specific log pages, but they seem
+ * to be widely implemented.
+ */
+ INTEL_LOG_READ_LAT_LOG = 0xc1,
+ INTEL_LOG_WRITE_LAT_LOG = 0xc2,
+ INTEL_LOG_TEMP_STATS = 0xc5,
+ INTEL_LOG_ADD_SMART = 0xca,
+ INTEL_LOG_DRIVE_MKT_NAME = 0xdd,
+
+ /*
+ * HGST log page, with lots of sub pages.
+ */
+ HGST_INFO_LOG = 0xc1,
+};
+
+struct nvme_error_information_entry {
+
+ uint64_t error_count;
+ uint16_t sqid;
+ uint16_t cid;
+ uint16_t status;
+ uint16_t error_location;
+ uint64_t lba;
+ uint32_t nsid;
+ uint8_t vendor_specific;
+ uint8_t reserved[35];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_error_information_entry) == 64, "bad size for nvme_error_information_entry");
+
+struct nvme_health_information_page {
+
+ uint8_t critical_warning;
+ uint16_t temperature;
+ uint8_t available_spare;
+ uint8_t available_spare_threshold;
+ uint8_t percentage_used;
+
+ uint8_t reserved[26];
+
+ /*
+ * Note that the following are 128-bit values, but are
+ * defined as an array of 2 64-bit values.
+ */
+ /* Data Units Read is always in 512-byte units. */
+ uint64_t data_units_read[2];
+ /* Data Units Written is always in 512-byte units. */
+ uint64_t data_units_written[2];
+ /* For NVM command set, this includes Compare commands. */
+ uint64_t host_read_commands[2];
+ uint64_t host_write_commands[2];
+ /* Controller Busy Time is reported in minutes. */
+ uint64_t controller_busy_time[2];
+ uint64_t power_cycles[2];
+ uint64_t power_on_hours[2];
+ uint64_t unsafe_shutdowns[2];
+ uint64_t media_errors[2];
+ uint64_t num_error_info_log_entries[2];
+ uint32_t warning_temp_time;
+ uint32_t error_temp_time;
+ uint16_t temp_sensor[8];
+
+ uint8_t reserved2[296];
+} __packed __aligned(4);
+
+/* Currently sparse/smatch incorrectly packs this struct in some situations. */
+#ifndef __CHECKER__
+_Static_assert(sizeof(struct nvme_health_information_page) == 512, "bad size for nvme_health_information_page");
+#endif
+
+struct nvme_firmware_page {
+
+ uint8_t afi;
+ uint8_t reserved[7];
+ uint64_t revision[7]; /* revisions for 7 slots */
+ uint8_t reserved2[448];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_firmware_page) == 512, "bad size for nvme_firmware_page");
+
+struct nvme_ns_list {
+ uint32_t ns[1024];
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct nvme_ns_list) == 4096, "bad size for nvme_ns_list");
+
+struct intel_log_temp_stats
+{
+ uint64_t current;
+ uint64_t overtemp_flag_last;
+ uint64_t overtemp_flag_life;
+ uint64_t max_temp;
+ uint64_t min_temp;
+ uint64_t _rsvd[5];
+ uint64_t max_oper_temp;
+ uint64_t min_oper_temp;
+ uint64_t est_offset;
+} __packed __aligned(4);
+
+_Static_assert(sizeof(struct intel_log_temp_stats) == 13 * 8, "bad size for intel_log_temp_stats");
+
+#define NVME_TEST_MAX_THREADS 128
+
+struct nvme_io_test {
+
+ enum nvme_nvm_opcode opc;
+ uint32_t size;
+ uint32_t time; /* in seconds */
+ uint32_t num_threads;
+ uint32_t flags;
+ uint64_t io_completed[NVME_TEST_MAX_THREADS];
+};
+
+enum nvme_io_test_flags {
+
+ /*
+ * Specifies whether dev_refthread/dev_relthread should be
+ * called during NVME_BIO_TEST. Ignored for other test
+ * types.
+ */
+ NVME_TEST_FLAG_REFTHREAD = 0x1,
+};
+
+struct nvme_pt_command {
+
+ /*
+ * cmd is used to specify a passthrough command to a controller or
+ * namespace.
+ * + * The following fields from cmd may be specified by the caller: + * * opc (opcode) + * * nsid (namespace id) - for admin commands only + * * cdw10-cdw15 + * + * Remaining fields must be set to 0 by the caller. + */ + struct nvme_command cmd; + + /* + * cpl returns completion status for the passthrough command + * specified by cmd. + * + * The following fields will be filled out by the driver, for + * consumption by the caller: + * * cdw0 + * * status (except for phase) + * + * Remaining fields will be set to 0 by the driver. + */ + struct nvme_completion cpl; + + /* buf is the data buffer associated with this passthrough command. */ + void * buf; + + /* + * len is the length of the data buffer associated with this + * passthrough command. + */ + uint32_t len; + + /* + * is_read = 1 if the passthrough command will read data into the + * supplied buffer from the controller. + * + * is_read = 0 if the passthrough command will write data from the + * supplied buffer to the controller. + */ + uint32_t is_read; + + /* + * driver_lock is used by the driver only. It must be set to 0 + * by the caller. + */ + struct mtx * driver_lock; +}; + +#define nvme_completion_is_error(cpl) \ + (NVME_STATUS_GET_SC((cpl)->status) != 0 || NVME_STATUS_GET_SCT((cpl)->status) != 0) + +void nvme_strvis(uint8_t *dst, const uint8_t *src, int dstlen, int srclen); + +#ifdef _KERNEL + +struct bio; + +struct nvme_namespace; +struct nvme_controller; +struct nvme_consumer; + +typedef void (*nvme_cb_fn_t)(void *, const struct nvme_completion *); + +typedef void *(*nvme_cons_ns_fn_t)(struct nvme_namespace *, void *); +typedef void *(*nvme_cons_ctrlr_fn_t)(struct nvme_controller *); +typedef void (*nvme_cons_async_fn_t)(void *, const struct nvme_completion *, + uint32_t, void *, uint32_t); +typedef void (*nvme_cons_fail_fn_t)(void *); + +enum nvme_namespace_flags { + NVME_NS_DEALLOCATE_SUPPORTED = 0x1, + NVME_NS_FLUSH_SUPPORTED = 0x2, +}; + +int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, + struct nvme_pt_command *pt, + uint32_t nsid, int is_user_buffer, + int is_admin_cmd); + +/* Admin functions */ +void nvme_ctrlr_cmd_set_feature(struct nvme_controller *ctrlr, + uint8_t feature, uint32_t cdw11, + void *payload, uint32_t payload_size, + nvme_cb_fn_t cb_fn, void *cb_arg); +void nvme_ctrlr_cmd_get_feature(struct nvme_controller *ctrlr, + uint8_t feature, uint32_t cdw11, + void *payload, uint32_t payload_size, + nvme_cb_fn_t cb_fn, void *cb_arg); +void nvme_ctrlr_cmd_get_log_page(struct nvme_controller *ctrlr, + uint8_t log_page, uint32_t nsid, + void *payload, uint32_t payload_size, + nvme_cb_fn_t cb_fn, void *cb_arg); + +/* NVM I/O functions */ +int nvme_ns_cmd_write(struct nvme_namespace *ns, void *payload, + uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, + void *cb_arg); +int nvme_ns_cmd_write_bio(struct nvme_namespace *ns, struct bio *bp, + nvme_cb_fn_t cb_fn, void *cb_arg); +int nvme_ns_cmd_read(struct nvme_namespace *ns, void *payload, + uint64_t lba, uint32_t lba_count, nvme_cb_fn_t cb_fn, + void *cb_arg); +int nvme_ns_cmd_read_bio(struct nvme_namespace *ns, struct bio *bp, + nvme_cb_fn_t cb_fn, void *cb_arg); +int nvme_ns_cmd_deallocate(struct nvme_namespace *ns, void *payload, + uint8_t num_ranges, nvme_cb_fn_t cb_fn, + void *cb_arg); +int nvme_ns_cmd_flush(struct nvme_namespace *ns, nvme_cb_fn_t cb_fn, + void *cb_arg); +int nvme_ns_dump(struct nvme_namespace *ns, void *virt, off_t offset, + size_t len); + +/* Registration functions */ +struct nvme_consumer * 
nvme_register_consumer(nvme_cons_ns_fn_t ns_fn,
+ nvme_cons_ctrlr_fn_t ctrlr_fn,
+ nvme_cons_async_fn_t async_fn,
+ nvme_cons_fail_fn_t fail_fn);
+void nvme_unregister_consumer(struct nvme_consumer *consumer);
+
+/* Controller helper functions */
+device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr);
+const struct nvme_controller_data *
+ nvme_ctrlr_get_data(struct nvme_controller *ctrlr);
+
+/* Namespace helper functions */
+uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_sector_size(struct nvme_namespace *ns);
+uint64_t nvme_ns_get_num_sectors(struct nvme_namespace *ns);
+uint64_t nvme_ns_get_size(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_flags(struct nvme_namespace *ns);
+const char * nvme_ns_get_serial_number(struct nvme_namespace *ns);
+const char * nvme_ns_get_model_number(struct nvme_namespace *ns);
+const struct nvme_namespace_data *
+ nvme_ns_get_data(struct nvme_namespace *ns);
+uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns);
+
+int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp,
+ nvme_cb_fn_t cb_fn);
+
+/*
+ * Command building helper functions -- shared with CAM
+ * These functions assume allocator zeros out cmd structure
+ * CAM's xpt_get_ccb and the request allocator for nvme both
+ * do zero'd allocations.
+ */
+static inline
+void nvme_ns_flush_cmd(struct nvme_command *cmd, uint32_t nsid)
+{
+
+ cmd->opc = NVME_OPC_FLUSH;
+ cmd->nsid = htole32(nsid);
+}
+
+static inline
+void nvme_ns_rw_cmd(struct nvme_command *cmd, uint32_t rwcmd, uint32_t nsid,
+ uint64_t lba, uint32_t count)
+{
+ cmd->opc = rwcmd;
+ cmd->nsid = htole32(nsid);
+ cmd->cdw10 = htole32(lba & 0xffffffffu);
+ cmd->cdw11 = htole32(lba >> 32);
+ cmd->cdw12 = htole32(count-1);
+}
+
+static inline
+void nvme_ns_write_cmd(struct nvme_command *cmd, uint32_t nsid,
+ uint64_t lba, uint32_t count)
+{
+ nvme_ns_rw_cmd(cmd, NVME_OPC_WRITE, nsid, lba, count);
+}
+
+static inline
+void nvme_ns_read_cmd(struct nvme_command *cmd, uint32_t nsid,
+ uint64_t lba, uint32_t count)
+{
+ nvme_ns_rw_cmd(cmd, NVME_OPC_READ, nsid, lba, count);
+}
+
+static inline
+void nvme_ns_trim_cmd(struct nvme_command *cmd, uint32_t nsid,
+ uint32_t num_ranges)
+{
+ cmd->opc = NVME_OPC_DATASET_MANAGEMENT;
+ cmd->nsid = htole32(nsid);
+ cmd->cdw10 = htole32(num_ranges - 1);
+ cmd->cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE);
+}
+
+extern int nvme_use_nvd;
+
+#endif /* _KERNEL */
+
+/* Endianness conversion functions for NVMe structs */
+static inline
+void nvme_completion_swapbytes(struct nvme_completion *s)
+{
+
+ s->cdw0 = le32toh(s->cdw0);
+ /* omit rsvd1 */
+ s->sqhd = le16toh(s->sqhd);
+ s->sqid = le16toh(s->sqid);
+ /* omit cid */
+ s->status = le16toh(s->status);
+}
+
+static inline
+void nvme_power_state_swapbytes(struct nvme_power_state *s)
+{
+
+ s->mp = le16toh(s->mp);
+ s->enlat = le32toh(s->enlat);
+ s->exlat = le32toh(s->exlat);
+ s->idlp = le16toh(s->idlp);
+ s->actp = le16toh(s->actp);
+}
+
+static inline
+void nvme_controller_data_swapbytes(struct nvme_controller_data *s)
+{
+ int i;
+
+ s->vid = le16toh(s->vid);
+ s->ssvid = le16toh(s->ssvid);
+ s->ctrlr_id = le16toh(s->ctrlr_id);
+ s->ver = le32toh(s->ver);
+ s->rtd3r = le32toh(s->rtd3r);
+ s->rtd3e = le32toh(s->rtd3e);
+ s->oaes = le32toh(s->oaes);
+ s->ctratt = le32toh(s->ctratt);
+ s->oacs = le16toh(s->oacs);
+ s->wctemp = le16toh(s->wctemp);
+ s->cctemp = le16toh(s->cctemp);
+ s->mtfa = le16toh(s->mtfa);
+ s->hmpre = le32toh(s->hmpre);
+ s->hmmin = le32toh(s->hmmin);
+ s->rpmbs =
le32toh(s->rpmbs); + s->edstt = le16toh(s->edstt); + s->kas = le16toh(s->kas); + s->hctma = le16toh(s->hctma); + s->mntmt = le16toh(s->mntmt); + s->mxtmt = le16toh(s->mxtmt); + s->sanicap = le32toh(s->sanicap); + s->maxcmd = le16toh(s->maxcmd); + s->nn = le32toh(s->nn); + s->oncs = le16toh(s->oncs); + s->fuses = le16toh(s->fuses); + s->awun = le16toh(s->awun); + s->awupf = le16toh(s->awupf); + s->acwu = le16toh(s->acwu); + s->sgls = le32toh(s->sgls); + for (i = 0; i < 32; i++) + nvme_power_state_swapbytes(&s->power_state[i]); +} + +static inline +void nvme_namespace_data_swapbytes(struct nvme_namespace_data *s) +{ + int i; + + s->nsze = le64toh(s->nsze); + s->ncap = le64toh(s->ncap); + s->nuse = le64toh(s->nuse); + s->nawun = le16toh(s->nawun); + s->nawupf = le16toh(s->nawupf); + s->nacwu = le16toh(s->nacwu); + s->nabsn = le16toh(s->nabsn); + s->nabo = le16toh(s->nabo); + s->nabspf = le16toh(s->nabspf); + s->noiob = le16toh(s->noiob); + for (i = 0; i < 16; i++) + s->lbaf[i] = le32toh(s->lbaf[i]); +} + +static inline +void nvme_error_information_entry_swapbytes(struct nvme_error_information_entry *s) +{ + + s->error_count = le64toh(s->error_count); + s->sqid = le16toh(s->sqid); + s->cid = le16toh(s->cid); + s->status = le16toh(s->status); + s->error_location = le16toh(s->error_location); + s->lba = le64toh(s->lba); + s->nsid = le32toh(s->nsid); +} + +static inline +void nvme_le128toh(void *p) +{ + /* + * Upstream, this uses the following comparison: + * #if _BYTE_ORDER != _LITTLE_ENDIAN + * + * Rather than keep this file in compat with only that little bit + * changed, we'll just float a little patch here for now. + */ +#ifndef _LITTLE_ENDIAN + /* Swap 16 bytes in place */ + char *tmp = (char*)p; + char b; + int i; + for (i = 0; i < 8; i++) { + b = tmp[i]; + tmp[i] = tmp[15-i]; + tmp[15-i] = b; + } +#else + (void)p; +#endif +} + +static inline +void nvme_health_information_page_swapbytes(struct nvme_health_information_page *s) +{ + int i; + + s->temperature = le16toh(s->temperature); + nvme_le128toh((void *)s->data_units_read); + nvme_le128toh((void *)s->data_units_written); + nvme_le128toh((void *)s->host_read_commands); + nvme_le128toh((void *)s->host_write_commands); + nvme_le128toh((void *)s->controller_busy_time); + nvme_le128toh((void *)s->power_cycles); + nvme_le128toh((void *)s->power_on_hours); + nvme_le128toh((void *)s->unsafe_shutdowns); + nvme_le128toh((void *)s->media_errors); + nvme_le128toh((void *)s->num_error_info_log_entries); + s->warning_temp_time = le32toh(s->warning_temp_time); + s->error_temp_time = le32toh(s->error_temp_time); + for (i = 0; i < 8; i++) + s->temp_sensor[i] = le16toh(s->temp_sensor[i]); +} + + +static inline +void nvme_firmware_page_swapbytes(struct nvme_firmware_page *s) +{ + int i; + + for (i = 0; i < 7; i++) + s->revision[i] = le64toh(s->revision[i]); +} + +static inline +void nvme_ns_list_swapbytes(struct nvme_ns_list *s) +{ + int i; + + for (i = 0; i < 1024; i++) + s->ns[i] = le32toh(s->ns[i]); +} + +static inline +void intel_log_temp_stats_swapbytes(struct intel_log_temp_stats *s) +{ + + s->current = le64toh(s->current); + s->overtemp_flag_last = le64toh(s->overtemp_flag_last); + s->overtemp_flag_life = le64toh(s->overtemp_flag_life); + s->max_temp = le64toh(s->max_temp); + s->min_temp = le64toh(s->min_temp); + /* omit _rsvd[] */ + s->max_oper_temp = le64toh(s->max_oper_temp); + s->min_oper_temp = le64toh(s->min_oper_temp); + s->est_offset = le64toh(s->est_offset); +} + +#endif /* __NVME_H__ */ diff --git 
a/usr/contrib/freebsd/dev/usb/controller/xhcireg.h b/usr/contrib/freebsd/dev/usb/controller/xhcireg.h new file mode 100644 index 0000000000..0e588ecba3 --- /dev/null +++ b/usr/contrib/freebsd/dev/usb/controller/xhcireg.h @@ -0,0 +1,224 @@ +/* $FreeBSD$ */ + +/*- + * Copyright (c) 2010 Hans Petter Selasky. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#ifndef _XHCIREG_H_ +#define _XHCIREG_H_ + +/* XHCI PCI config registers */ +#define PCI_XHCI_CBMEM 0x10 /* configuration base MEM */ +#define PCI_XHCI_USBREV 0x60 /* RO USB protocol revision */ +#define PCI_USB_REV_3_0 0x30 /* USB 3.0 */ +#define PCI_XHCI_FLADJ 0x61 /* RW frame length adjust */ + +#define PCI_XHCI_INTEL_XUSB2PR 0xD0 /* Intel USB2 Port Routing */ +#define PCI_XHCI_INTEL_USB2PRM 0xD4 /* Intel USB2 Port Routing Mask */ +#define PCI_XHCI_INTEL_USB3_PSSEN 0xD8 /* Intel USB3 Port SuperSpeed Enable */ +#define PCI_XHCI_INTEL_USB3PRM 0xDC /* Intel USB3 Port Routing Mask */ + +/* XHCI capability registers */ +#define XHCI_CAPLENGTH 0x00 /* RO capability */ +#define XHCI_RESERVED 0x01 /* Reserved */ +#define XHCI_HCIVERSION 0x02 /* RO Interface version number */ +#define XHCI_HCIVERSION_0_9 0x0090 /* xHCI version 0.9 */ +#define XHCI_HCIVERSION_1_0 0x0100 /* xHCI version 1.0 */ +#define XHCI_HCSPARAMS1 0x04 /* RO structural parameters 1 */ +#define XHCI_HCS1_DEVSLOT_MAX(x)((x) & 0xFF) +#define XHCI_HCS1_IRQ_MAX(x) (((x) >> 8) & 0x3FF) +#define XHCI_HCS1_N_PORTS(x) (((x) >> 24) & 0xFF) +#define XHCI_HCSPARAMS2 0x08 /* RO structural parameters 2 */ +#define XHCI_HCS2_IST(x) ((x) & 0xF) +#define XHCI_HCS2_ERST_MAX(x) (((x) >> 4) & 0xF) +#define XHCI_HCS2_SPR(x) (((x) >> 26) & 0x1) +#define XHCI_HCS2_SPB_MAX(x) ((((x) >> 16) & 0x3E0) | (((x) >> 27) & 0x1F)) +#define XHCI_HCSPARAMS3 0x0C /* RO structural parameters 3 */ +#define XHCI_HCS3_U1_DEL(x) ((x) & 0xFF) +#define XHCI_HCS3_U2_DEL(x) (((x) >> 16) & 0xFFFF) +#define XHCI_HCSPARAMS0 0x10 /* RO capability parameters */ +#define XHCI_HCS0_AC64(x) ((x) & 0x1) /* 64-bit capable */ +#define XHCI_HCS0_BNC(x) (((x) >> 1) & 0x1) /* BW negotiation */ +#define XHCI_HCS0_CSZ(x) (((x) >> 2) & 0x1) /* context size */ +#define XHCI_HCS0_PPC(x) (((x) >> 3) & 0x1) /* port power control */ +#define XHCI_HCS0_PIND(x) (((x) >> 4) & 0x1) /* port 
indicators */ +#define XHCI_HCS0_LHRC(x) (((x) >> 5) & 0x1) /* light HC reset */ +#define XHCI_HCS0_LTC(x) (((x) >> 6) & 0x1) /* latency tolerance msg */ +#define XHCI_HCS0_NSS(x) (((x) >> 7) & 0x1) /* no secondary sid */ +#define XHCI_HCS0_PSA_SZ_MAX(x) (((x) >> 12) & 0xF) /* max pri. stream array size */ +#define XHCI_HCS0_XECP(x) (((x) >> 16) & 0xFFFF) /* extended capabilities pointer */ +#define XHCI_DBOFF 0x14 /* RO doorbell offset */ +#define XHCI_RTSOFF 0x18 /* RO runtime register space offset */ + +/* XHCI operational registers. Offset given by XHCI_CAPLENGTH register */ +#define XHCI_USBCMD 0x00 /* XHCI command */ +#define XHCI_CMD_RS 0x00000001 /* RW Run/Stop */ +#define XHCI_CMD_HCRST 0x00000002 /* RW Host Controller Reset */ +#define XHCI_CMD_INTE 0x00000004 /* RW Interrupter Enable */ +#define XHCI_CMD_HSEE 0x00000008 /* RW Host System Error Enable */ +#define XHCI_CMD_LHCRST 0x00000080 /* RO/RW Light Host Controller Reset */ +#define XHCI_CMD_CSS 0x00000100 /* RW Controller Save State */ +#define XHCI_CMD_CRS 0x00000200 /* RW Controller Restore State */ +#define XHCI_CMD_EWE 0x00000400 /* RW Enable Wrap Event */ +#define XHCI_CMD_EU3S 0x00000800 /* RW Enable U3 MFINDEX Stop */ +#define XHCI_USBSTS 0x04 /* XHCI status */ +#define XHCI_STS_HCH 0x00000001 /* RO - Host Controller Halted */ +#define XHCI_STS_HSE 0x00000004 /* RW - Host System Error */ +#define XHCI_STS_EINT 0x00000008 /* RW - Event Interrupt */ +#define XHCI_STS_PCD 0x00000010 /* RW - Port Change Detect */ +#define XHCI_STS_SSS 0x00000100 /* RO - Save State Status */ +#define XHCI_STS_RSS 0x00000200 /* RO - Restore State Status */ +#define XHCI_STS_SRE 0x00000400 /* RW - Save/Restore Error */ +#define XHCI_STS_CNR 0x00000800 /* RO - Controller Not Ready */ +#define XHCI_STS_HCE 0x00001000 /* RO - Host Controller Error */ +#define XHCI_PAGESIZE 0x08 /* XHCI page size mask */ +#define XHCI_PAGESIZE_4K 0x00000001 /* 4K Page Size */ +#define XHCI_PAGESIZE_8K 0x00000002 /* 8K Page Size */ +#define XHCI_PAGESIZE_16K 0x00000004 /* 16K Page Size */ +#define XHCI_PAGESIZE_32K 0x00000008 /* 32K Page Size */ +#define XHCI_PAGESIZE_64K 0x00000010 /* 64K Page Size */ +#define XHCI_DNCTRL 0x14 /* XHCI device notification control */ +#define XHCI_DNCTRL_MASK(n) (1U << (n)) +#define XHCI_CRCR_LO 0x18 /* XHCI command ring control */ +#define XHCI_CRCR_LO_RCS 0x00000001 /* RW - consumer cycle state */ +#define XHCI_CRCR_LO_CS 0x00000002 /* RW - command stop */ +#define XHCI_CRCR_LO_CA 0x00000004 /* RW - command abort */ +#define XHCI_CRCR_LO_CRR 0x00000008 /* RW - command ring running */ +#define XHCI_CRCR_LO_MASK 0x0000000F +#define XHCI_CRCR_HI 0x1C /* XHCI command ring control */ +#define XHCI_DCBAAP_LO 0x30 /* XHCI dev context BA pointer */ +#define XHCI_DCBAAP_HI 0x34 /* XHCI dev context BA pointer */ +#define XHCI_CONFIG 0x38 +#define XHCI_CONFIG_SLOTS_MASK 0x000000FF /* RW - number of device slots enabled */ + +/* XHCI port status registers */ +#define XHCI_PORTSC(n) (0x3F0 + (0x10 * (n))) /* XHCI port status */ +#define XHCI_PS_CCS 0x00000001 /* RO - current connect status */ +#define XHCI_PS_PED 0x00000002 /* RW - port enabled / disabled */ +#define XHCI_PS_OCA 0x00000008 /* RO - over current active */ +#define XHCI_PS_PR 0x00000010 /* RW - port reset */ +#define XHCI_PS_PLS_GET(x) (((x) >> 5) & 0xF) /* RW - port link state */ +#define XHCI_PS_PLS_SET(x) (((x) & 0xF) << 5) /* RW - port link state */ +#define XHCI_PS_PP 0x00000200 /* RW - port power */ +#define XHCI_PS_SPEED_GET(x) (((x) >> 10) & 0xF) /* RO - port speed */ 
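/*
 * Illustrative sketch (an editor's example, not part of the diff): the
 * single-bit PORTSC flags above are tested directly and the multi-bit
 * fields extracted with their GET macros; "portsc" is assumed to have
 * been read from the XHCI_PORTSC(n) register. A PLS (port link state)
 * value of 0 is U0, per the link-state encoding in the xHCI
 * specification.
 */
static inline int
example_port_active(uint32_t portsc)
{
	/* connected, enabled, and link in U0 */
	return ((portsc & XHCI_PS_CCS) != 0 && (portsc & XHCI_PS_PED) != 0 &&
	    XHCI_PS_PLS_GET(portsc) == 0);
}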
+#define XHCI_PS_PIC_GET(x) (((x) >> 14) & 0x3) /* RW - port indicator */
+#define XHCI_PS_PIC_SET(x) (((x) & 0x3) << 14) /* RW - port indicator */
+#define XHCI_PS_LWS 0x00010000 /* RW - port link state write strobe */
+#define XHCI_PS_CSC 0x00020000 /* RW - connect status change */
+#define XHCI_PS_PEC 0x00040000 /* RW - port enable/disable change */
+#define XHCI_PS_WRC 0x00080000 /* RW - warm port reset change */
+#define XHCI_PS_OCC 0x00100000 /* RW - over-current change */
+#define XHCI_PS_PRC 0x00200000 /* RW - port reset change */
+#define XHCI_PS_PLC 0x00400000 /* RW - port link state change */
+#define XHCI_PS_CEC 0x00800000 /* RW - config error change */
+#define XHCI_PS_CAS 0x01000000 /* RO - cold attach status */
+#define XHCI_PS_WCE 0x02000000 /* RW - wake on connect enable */
+#define XHCI_PS_WDE 0x04000000 /* RW - wake on disconnect enable */
+#define XHCI_PS_WOE 0x08000000 /* RW - wake on over-current enable */
+#define XHCI_PS_DR 0x40000000 /* RO - device removable */
+#define XHCI_PS_WPR 0x80000000U /* RW - warm port reset */
+#define XHCI_PS_CLEAR 0x80FF01FFU /* command bits */
+
+#define XHCI_PORTPMSC(n) (0x3F4 + (0x10 * (n))) /* XHCI status and control */
+#define XHCI_PM3_U1TO_GET(x) (((x) >> 0) & 0xFF) /* RW - U1 timeout */
+#define XHCI_PM3_U1TO_SET(x) (((x) & 0xFF) << 0) /* RW - U1 timeout */
+#define XHCI_PM3_U2TO_GET(x) (((x) >> 8) & 0xFF) /* RW - U2 timeout */
+#define XHCI_PM3_U2TO_SET(x) (((x) & 0xFF) << 8) /* RW - U2 timeout */
+#define XHCI_PM3_FLA 0x00010000 /* RW - Force Link PM Accept */
+#define XHCI_PM2_L1S_GET(x) (((x) >> 0) & 0x7) /* RO - L1 status */
+#define XHCI_PM2_RWE 0x00000008 /* RW - remote wakeup enable */
+#define XHCI_PM2_HIRD_GET(x) (((x) >> 4) & 0xF) /* RW - host initiated resume duration */
+#define XHCI_PM2_HIRD_SET(x) (((x) & 0xF) << 4) /* RW - host initiated resume duration */
+#define XHCI_PM2_L1SLOT_GET(x) (((x) >> 8) & 0xFF) /* RW - L1 device slot */
+#define XHCI_PM2_L1SLOT_SET(x) (((x) & 0xFF) << 8) /* RW - L1 device slot */
+#define XHCI_PM2_HLE 0x00010000 /* RW - hardware LPM enable */
+#define XHCI_PORTLI(n) (0x3F8 + (0x10 * (n))) /* XHCI port link info */
+#define XHCI_PLI3_ERR_GET(x) (((x) >> 0) & 0xFFFF) /* RO - port link errors */
+#define XHCI_PORTRSV(n) (0x3FC + (0x10 * (n))) /* XHCI port reserved */
+
+/* XHCI runtime registers.
Offset given by XHCI_CAPLENGTH + XHCI_RTSOFF registers */ +#define XHCI_MFINDEX 0x0000 /* RO - microframe index */ +#define XHCI_MFINDEX_GET(x) ((x) & 0x3FFF) +#define XHCI_IMAN(n) (0x0020 + (0x20 * (n))) /* XHCI interrupt management */ +#define XHCI_IMAN_INTR_PEND 0x00000001 /* RW - interrupt pending */ +#define XHCI_IMAN_INTR_ENA 0x00000002 /* RW - interrupt enable */ +#define XHCI_IMOD(n) (0x0024 + (0x20 * (n))) /* XHCI interrupt moderation */ +#define XHCI_IMOD_IVAL_GET(x) (((x) >> 0) & 0xFFFF) /* 250ns unit */ +#define XHCI_IMOD_IVAL_SET(x) (((x) & 0xFFFF) << 0) /* 250ns unit */ +#define XHCI_IMOD_ICNT_GET(x) (((x) >> 16) & 0xFFFF) /* 250ns unit */ +#define XHCI_IMOD_ICNT_SET(x) (((x) & 0xFFFF) << 16) /* 250ns unit */ +#define XHCI_IMOD_DEFAULT 0x000001F4U /* 8000 IRQs/second */ +#define XHCI_IMOD_DEFAULT_LP 0x000003F8U /* 4000 IRQs/second - LynxPoint */ +#define XHCI_ERSTSZ(n) (0x0028 + (0x20 * (n))) /* XHCI event ring segment table size */ +#define XHCI_ERSTS_GET(x) ((x) & 0xFFFF) +#define XHCI_ERSTS_SET(x) ((x) & 0xFFFF) +#define XHCI_ERSTBA_LO(n) (0x0030 + (0x20 * (n))) /* XHCI event ring segment table BA */ +#define XHCI_ERSTBA_HI(n) (0x0034 + (0x20 * (n))) /* XHCI event ring segment table BA */ +#define XHCI_ERDP_LO(n) (0x0038 + (0x20 * (n))) /* XHCI event ring dequeue pointer */ +#define XHCI_ERDP_LO_SINDEX(x) ((x) & 0x7) /* RO - dequeue segment index */ +#define XHCI_ERDP_LO_BUSY 0x00000008 /* RW - event handler busy */ +#define XHCI_ERDP_HI(n) (0x003C + (0x20 * (n))) /* XHCI event ring dequeue pointer */ + +/* XHCI doorbell registers. Offset given by XHCI_CAPLENGTH + XHCI_DBOFF registers */ +#define XHCI_DOORBELL(n) (0x0000 + (4 * (n))) +#define XHCI_DB_TARGET_GET(x) ((x) & 0xFF) /* RW - doorbell target */ +#define XHCI_DB_TARGET_SET(x) ((x) & 0xFF) /* RW - doorbell target */ +#define XHCI_DB_SID_GET(x) (((x) >> 16) & 0xFFFF) /* RW - doorbell stream ID */ +#define XHCI_DB_SID_SET(x) (((x) & 0xFFFF) << 16) /* RW - doorbell stream ID */ + +/* XHCI legacy support */ +#define XHCI_XECP_ID(x) ((x) & 0xFF) +#define XHCI_XECP_NEXT(x) (((x) >> 8) & 0xFF) +#define XHCI_XECP_BIOS_SEM 0x0002 +#define XHCI_XECP_OS_SEM 0x0003 + +/* XHCI capability ID's */ +#define XHCI_ID_USB_LEGACY 0x0001 +#define XHCI_ID_PROTOCOLS 0x0002 +#define XHCI_ID_POWER_MGMT 0x0003 +#define XHCI_ID_VIRTUALIZATION 0x0004 +#define XHCI_ID_MSG_IRQ 0x0005 +#define XHCI_ID_USB_LOCAL_MEM 0x0006 + +/* XHCI register R/W wrappers */ +#define XREAD1(sc, what, a) \ + bus_space_read_1((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off) +#define XREAD2(sc, what, a) \ + bus_space_read_2((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off) +#define XREAD4(sc, what, a) \ + bus_space_read_4((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off) +#define XWRITE1(sc, what, a, x) \ + bus_space_write_1((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off, (x)) +#define XWRITE2(sc, what, a, x) \ + bus_space_write_2((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off, (x)) +#define XWRITE4(sc, what, a, x) \ + bus_space_write_4((sc)->sc_io_tag, (sc)->sc_io_hdl, \ + (a) + (sc)->sc_##what##_off, (x)) + +#endif /* _XHCIREG_H_ */ diff --git a/usr/contrib/freebsd/dev/usb/usb.h b/usr/contrib/freebsd/dev/usb/usb.h new file mode 100644 index 0000000000..bcea2ac8bd --- /dev/null +++ b/usr/contrib/freebsd/dev/usb/usb.h @@ -0,0 +1,801 @@ +/* $FreeBSD$ */ +/*- + * Copyright (c) 2008 Hans Petter Selasky. All rights reserved. + * Copyright (c) 1998 The NetBSD Foundation, Inc. 
All rights reserved. + * Copyright (c) 1998 Lennart Augustsson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file contains standard definitions for the following USB + * protocol versions: + * + * USB v1.0 + * USB v1.1 + * USB v2.0 + * USB v3.0 + */ + +#ifndef _USB_STANDARD_H_ +#define _USB_STANDARD_H_ + +#if defined(_KERNEL) +#ifndef USB_GLOBAL_INCLUDE_FILE +#include "opt_usb.h" +#endif + +/* Declare parent SYSCTL USB node. */ +#ifdef SYSCTL_DECL +SYSCTL_DECL(_hw_usb); +#endif + +#ifndef USB_GLOBAL_INCLUDE_FILE +#include <sys/malloc.h> +#endif + +MALLOC_DECLARE(M_USB); +MALLOC_DECLARE(M_USBDEV); +#endif /* _KERNEL */ + +#ifndef USB_GLOBAL_INCLUDE_FILE +#include <dev/usb/usb_endian.h> +#include <dev/usb/usb_freebsd.h> +#endif + +#define USB_STACK_VERSION 2000 /* 2.0 */ + +/* Definition of some hardcoded USB constants. */ + +#define USB_MAX_IPACKET 8 /* initial USB packet size */ +#define USB_EP_MAX (2*16) /* hardcoded */ +#define USB_ROOT_HUB_ADDR 1 /* index */ +#define USB_MIN_DEVICES 2 /* unused + root HUB */ +#define USB_UNCONFIG_INDEX 0xFF /* internal use only */ +#define USB_IFACE_INDEX_ANY 0xFF /* internal use only */ +#define USB_START_ADDR 0 /* default USB device BUS address + * after USB bus reset */ +#define USB_CONTROL_ENDPOINT 0 /* default control endpoint */ + +#define USB_FRAMES_PER_SECOND_FS 1000 /* full speed */ +#define USB_FRAMES_PER_SECOND_HS 8000 /* high speed */ + +#define USB_FS_BYTES_PER_HS_UFRAME 188 /* bytes */ +#define USB_HS_MICRO_FRAMES_MAX 8 /* units */ + +#define USB_ISOC_TIME_MAX 128 /* ms */ + +/* + * Minimum time a device needs to be powered down to go through a + * power cycle. These values are not in the USB specification. + */ +#define USB_POWER_DOWN_TIME 200 /* ms */ +#define USB_PORT_POWER_DOWN_TIME 100 /* ms */ + +/* Definition of software USB power modes */ +#define USB_POWER_MODE_OFF 0 /* turn off device */ +#define USB_POWER_MODE_ON 1 /* always on */ +#define USB_POWER_MODE_SAVE 2 /* automatic suspend and resume */ +#define USB_POWER_MODE_SUSPEND 3 /* force suspend */ +#define USB_POWER_MODE_RESUME 4 /* force resume */ + +/* These are the values from the USB specification. 
*/ +#define USB_PORT_RESET_DELAY_SPEC 10 /* ms */ +#define USB_PORT_ROOT_RESET_DELAY_SPEC 50 /* ms */ +#define USB_PORT_RESET_RECOVERY_SPEC 10 /* ms */ +#define USB_PORT_POWERUP_DELAY_SPEC 100 /* ms */ +#define USB_PORT_RESUME_DELAY_SPEC 20 /* ms */ +#define USB_SET_ADDRESS_SETTLE_SPEC 2 /* ms */ +#define USB_RESUME_DELAY_SPEC (20*5) /* ms */ +#define USB_RESUME_WAIT_SPEC 10 /* ms */ +#define USB_RESUME_RECOVERY_SPEC 10 /* ms */ +#define USB_EXTRA_POWER_UP_TIME_SPEC 0 /* ms */ + +/* Allow for marginal and non-conforming devices. */ +#define USB_PORT_RESET_DELAY 50 /* ms */ +#define USB_PORT_ROOT_RESET_DELAY 200 /* ms */ +#define USB_PORT_RESET_RECOVERY 250 /* ms */ +#define USB_PORT_POWERUP_DELAY 300 /* ms */ +#define USB_PORT_RESUME_DELAY (20*2) /* ms */ +#define USB_SET_ADDRESS_SETTLE 10 /* ms */ +#define USB_RESUME_DELAY (50*5) /* ms */ +#define USB_RESUME_WAIT 50 /* ms */ +#define USB_RESUME_RECOVERY 50 /* ms */ +#define USB_EXTRA_POWER_UP_TIME 20 /* ms */ + +#define USB_MIN_POWER 100 /* mA */ +#define USB_MAX_POWER 500 /* mA */ + +#define USB_BUS_RESET_DELAY 100 /* ms */ + +/* + * USB record layout in memory: + * + * - USB config 0 + * - USB interfaces + * - USB alternative interfaces + * - USB endpoints + * + * - USB config 1 + * - USB interfaces + * - USB alternative interfaces + * - USB endpoints + */ + +/* Declaration of USB records */ + +struct usb_device_request { + uByte bmRequestType; + uByte bRequest; + uWord wValue; + uWord wIndex; + uWord wLength; +} __packed; +typedef struct usb_device_request usb_device_request_t; + +#define UT_WRITE 0x00 +#define UT_READ 0x80 +#define UT_STANDARD 0x00 +#define UT_CLASS 0x20 +#define UT_VENDOR 0x40 +#define UT_DEVICE 0x00 +#define UT_INTERFACE 0x01 +#define UT_ENDPOINT 0x02 +#define UT_OTHER 0x03 + +#define UT_READ_DEVICE (UT_READ | UT_STANDARD | UT_DEVICE) +#define UT_READ_INTERFACE (UT_READ | UT_STANDARD | UT_INTERFACE) +#define UT_READ_ENDPOINT (UT_READ | UT_STANDARD | UT_ENDPOINT) +#define UT_WRITE_DEVICE (UT_WRITE | UT_STANDARD | UT_DEVICE) +#define UT_WRITE_INTERFACE (UT_WRITE | UT_STANDARD | UT_INTERFACE) +#define UT_WRITE_ENDPOINT (UT_WRITE | UT_STANDARD | UT_ENDPOINT) +#define UT_READ_CLASS_DEVICE (UT_READ | UT_CLASS | UT_DEVICE) +#define UT_READ_CLASS_INTERFACE (UT_READ | UT_CLASS | UT_INTERFACE) +#define UT_READ_CLASS_OTHER (UT_READ | UT_CLASS | UT_OTHER) +#define UT_READ_CLASS_ENDPOINT (UT_READ | UT_CLASS | UT_ENDPOINT) +#define UT_WRITE_CLASS_DEVICE (UT_WRITE | UT_CLASS | UT_DEVICE) +#define UT_WRITE_CLASS_INTERFACE (UT_WRITE | UT_CLASS | UT_INTERFACE) +#define UT_WRITE_CLASS_OTHER (UT_WRITE | UT_CLASS | UT_OTHER) +#define UT_WRITE_CLASS_ENDPOINT (UT_WRITE | UT_CLASS | UT_ENDPOINT) +#define UT_READ_VENDOR_DEVICE (UT_READ | UT_VENDOR | UT_DEVICE) +#define UT_READ_VENDOR_INTERFACE (UT_READ | UT_VENDOR | UT_INTERFACE) +#define UT_READ_VENDOR_OTHER (UT_READ | UT_VENDOR | UT_OTHER) +#define UT_READ_VENDOR_ENDPOINT (UT_READ | UT_VENDOR | UT_ENDPOINT) +#define UT_WRITE_VENDOR_DEVICE (UT_WRITE | UT_VENDOR | UT_DEVICE) +#define UT_WRITE_VENDOR_INTERFACE (UT_WRITE | UT_VENDOR | UT_INTERFACE) +#define UT_WRITE_VENDOR_OTHER (UT_WRITE | UT_VENDOR | UT_OTHER) +#define UT_WRITE_VENDOR_ENDPOINT (UT_WRITE | UT_VENDOR | UT_ENDPOINT) + +/* Requests */ +#define UR_GET_STATUS 0x00 +#define UR_CLEAR_FEATURE 0x01 +#define UR_SET_FEATURE 0x03 +#define UR_SET_ADDRESS 0x05 +#define UR_GET_DESCRIPTOR 0x06 +#define UDESC_DEVICE 0x01 +#define UDESC_CONFIG 0x02 +#define UDESC_STRING 0x03 +#define USB_LANGUAGE_TABLE 0x00 /* language ID string index */ 
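/*
 * Illustrative sketch (an editor's example, not part of the diff):
 * filling in a standard GET_DESCRIPTOR(DEVICE) setup packet from the
 * constants above. Assumes the USETW()/USETW2() word-setter helpers
 * that come from <dev/usb/usb_endian.h>, included near the top of
 * this header.
 */
static inline void
example_get_device_desc_req(struct usb_device_request *req)
{
	req->bmRequestType = UT_READ_DEVICE;
	req->bRequest = UR_GET_DESCRIPTOR;
	/* descriptor type in the high byte, descriptor index in the low */
	USETW2(req->wValue, UDESC_DEVICE, 0);
	USETW(req->wIndex, 0);
	/* ask for just the initial 8 bytes first */
	USETW(req->wLength, USB_MAX_IPACKET);
}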
+#define UDESC_INTERFACE 0x04 +#define UDESC_ENDPOINT 0x05 +#define UDESC_DEVICE_QUALIFIER 0x06 +#define UDESC_OTHER_SPEED_CONFIGURATION 0x07 +#define UDESC_INTERFACE_POWER 0x08 +#define UDESC_OTG 0x09 +#define UDESC_DEBUG 0x0A +#define UDESC_IFACE_ASSOC 0x0B /* interface association */ +#define UDESC_BOS 0x0F /* binary object store */ +#define UDESC_DEVICE_CAPABILITY 0x10 +#define UDESC_CS_DEVICE 0x21 /* class specific */ +#define UDESC_CS_CONFIG 0x22 +#define UDESC_CS_STRING 0x23 +#define UDESC_CS_INTERFACE 0x24 +#define UDESC_CS_ENDPOINT 0x25 +#define UDESC_HUB 0x29 +#define UDESC_SS_HUB 0x2A /* super speed */ +#define UDESC_ENDPOINT_SS_COMP 0x30 /* super speed */ +#define UR_SET_DESCRIPTOR 0x07 +#define UR_GET_CONFIG 0x08 +#define UR_SET_CONFIG 0x09 +#define UR_GET_INTERFACE 0x0a +#define UR_SET_INTERFACE 0x0b +#define UR_SYNCH_FRAME 0x0c +#define UR_SET_SEL 0x30 +#define UR_ISOCH_DELAY 0x31 + +/* HUB specific request */ +#define UR_GET_BUS_STATE 0x02 +#define UR_CLEAR_TT_BUFFER 0x08 +#define UR_RESET_TT 0x09 +#define UR_GET_TT_STATE 0x0a +#define UR_STOP_TT 0x0b +#define UR_SET_AND_TEST 0x0c /* USB 2.0 only */ +#define UR_SET_HUB_DEPTH 0x0c /* USB 3.0 only */ +#define USB_SS_HUB_DEPTH_MAX 5 +#define UR_GET_PORT_ERR_COUNT 0x0d + +/* Feature numbers */ +#define UF_ENDPOINT_HALT 0 +#define UF_DEVICE_REMOTE_WAKEUP 1 +#define UF_TEST_MODE 2 +#define UF_U1_ENABLE 0x30 +#define UF_U2_ENABLE 0x31 +#define UF_LTM_ENABLE 0x32 + +/* HUB specific features */ +#define UHF_C_HUB_LOCAL_POWER 0 +#define UHF_C_HUB_OVER_CURRENT 1 +#define UHF_PORT_CONNECTION 0 +#define UHF_PORT_ENABLE 1 +#define UHF_PORT_SUSPEND 2 +#define UHF_PORT_OVER_CURRENT 3 +#define UHF_PORT_RESET 4 +#define UHF_PORT_LINK_STATE 5 +#define UHF_PORT_POWER 8 +#define UHF_PORT_LOW_SPEED 9 +#define UHF_PORT_L1 10 +#define UHF_C_PORT_CONNECTION 16 +#define UHF_C_PORT_ENABLE 17 +#define UHF_C_PORT_SUSPEND 18 +#define UHF_C_PORT_OVER_CURRENT 19 +#define UHF_C_PORT_RESET 20 +#define UHF_PORT_TEST 21 +#define UHF_PORT_INDICATOR 22 +#define UHF_C_PORT_L1 23 + +/* SuperSpeed HUB specific features */ +#define UHF_PORT_U1_TIMEOUT 23 +#define UHF_PORT_U2_TIMEOUT 24 +#define UHF_C_PORT_LINK_STATE 25 +#define UHF_C_PORT_CONFIG_ERROR 26 +#define UHF_PORT_REMOTE_WAKE_MASK 27 +#define UHF_BH_PORT_RESET 28 +#define UHF_C_BH_PORT_RESET 29 +#define UHF_FORCE_LINKPM_ACCEPT 30 + +struct usb_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDescriptorSubtype; +} __packed; +typedef struct usb_descriptor usb_descriptor_t; + +struct usb_device_descriptor { + uByte bLength; + uByte bDescriptorType; + uWord bcdUSB; +#define UD_USB_2_0 0x0200 +#define UD_USB_3_0 0x0300 +#define UD_IS_USB2(d) ((d)->bcdUSB[1] == 0x02) +#define UD_IS_USB3(d) ((d)->bcdUSB[1] == 0x03) + uByte bDeviceClass; + uByte bDeviceSubClass; + uByte bDeviceProtocol; + uByte bMaxPacketSize; + /* The fields below are not part of the initial descriptor. 
*/ + uWord idVendor; + uWord idProduct; + uWord bcdDevice; + uByte iManufacturer; + uByte iProduct; + uByte iSerialNumber; + uByte bNumConfigurations; +} __packed; +typedef struct usb_device_descriptor usb_device_descriptor_t; + +/* Binary Device Object Store (BOS) */ +struct usb_bos_descriptor { + uByte bLength; + uByte bDescriptorType; + uWord wTotalLength; + uByte bNumDeviceCaps; +} __packed; +typedef struct usb_bos_descriptor usb_bos_descriptor_t; + +/* Binary Device Object Store Capability */ +struct usb_bos_cap_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDevCapabilityType; +#define USB_DEVCAP_RESERVED 0x00 +#define USB_DEVCAP_WUSB 0x01 +#define USB_DEVCAP_USB2EXT 0x02 +#define USB_DEVCAP_SUPER_SPEED 0x03 +#define USB_DEVCAP_CONTAINER_ID 0x04 + /* data ... */ +} __packed; +typedef struct usb_bos_cap_descriptor usb_bos_cap_descriptor_t; + +struct usb_devcap_usb2ext_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDevCapabilityType; + uDWord bmAttributes; +#define USB_V2EXT_LPM (1U << 1) +#define USB_V2EXT_BESL_SUPPORTED (1U << 2) +#define USB_V2EXT_BESL_BASELINE_VALID (1U << 3) +#define USB_V2EXT_BESL_DEEP_VALID (1U << 4) +#define USB_V2EXT_BESL_BASELINE_GET(x) (((x) >> 8) & 0xF) +#define USB_V2EXT_BESL_DEEP_GET(x) (((x) >> 12) & 0xF) +} __packed; +typedef struct usb_devcap_usb2ext_descriptor usb_devcap_usb2ext_descriptor_t; + +struct usb_devcap_ss_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDevCapabilityType; + uByte bmAttributes; + uWord wSpeedsSupported; + uByte bFunctionalitySupport; + uByte bU1DevExitLat; + uWord wU2DevExitLat; +} __packed; +typedef struct usb_devcap_ss_descriptor usb_devcap_ss_descriptor_t; + +struct usb_devcap_container_id_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bDevCapabilityType; + uByte bReserved; + uByte bContainerID; +} __packed; +typedef struct usb_devcap_container_id_descriptor + usb_devcap_container_id_descriptor_t; + +/* Device class codes */ +#define UDCLASS_IN_INTERFACE 0x00 +#define UDCLASS_COMM 0x02 +#define UDCLASS_HUB 0x09 +#define UDSUBCLASS_HUB 0x00 +#define UDPROTO_FSHUB 0x00 +#define UDPROTO_HSHUBSTT 0x01 +#define UDPROTO_HSHUBMTT 0x02 +#define UDPROTO_SSHUB 0x03 +#define UDCLASS_DIAGNOSTIC 0xdc +#define UDCLASS_WIRELESS 0xe0 +#define UDSUBCLASS_RF 0x01 +#define UDPROTO_BLUETOOTH 0x01 +#define UDCLASS_VENDOR 0xff + +struct usb_config_descriptor { + uByte bLength; + uByte bDescriptorType; + uWord wTotalLength; + uByte bNumInterface; + uByte bConfigurationValue; +#define USB_UNCONFIG_NO 0 + uByte iConfiguration; + uByte bmAttributes; +#define UC_BUS_POWERED 0x80 +#define UC_SELF_POWERED 0x40 +#define UC_REMOTE_WAKEUP 0x20 + uByte bMaxPower; /* max current in 2 mA units */ +#define UC_POWER_FACTOR 2 +} __packed; +typedef struct usb_config_descriptor usb_config_descriptor_t; + +struct usb_interface_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bInterfaceNumber; + uByte bAlternateSetting; + uByte bNumEndpoints; + uByte bInterfaceClass; + uByte bInterfaceSubClass; + uByte bInterfaceProtocol; + uByte iInterface; +} __packed; +typedef struct usb_interface_descriptor usb_interface_descriptor_t; + +struct usb_interface_assoc_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bFirstInterface; + uByte bInterfaceCount; + uByte bFunctionClass; + uByte bFunctionSubClass; + uByte bFunctionProtocol; + uByte iFunction; +} __packed; +typedef struct usb_interface_assoc_descriptor usb_interface_assoc_descriptor_t; + +/* Interface class codes */ +#define 
UICLASS_UNSPEC 0x00 +#define UICLASS_AUDIO 0x01 /* audio */ +#define UISUBCLASS_AUDIOCONTROL 1 +#define UISUBCLASS_AUDIOSTREAM 2 +#define UISUBCLASS_MIDISTREAM 3 + +#define UICLASS_CDC 0x02 /* communication */ +#define UISUBCLASS_DIRECT_LINE_CONTROL_MODEL 1 +#define UISUBCLASS_ABSTRACT_CONTROL_MODEL 2 +#define UISUBCLASS_TELEPHONE_CONTROL_MODEL 3 +#define UISUBCLASS_MULTICHANNEL_CONTROL_MODEL 4 +#define UISUBCLASS_CAPI_CONTROLMODEL 5 +#define UISUBCLASS_ETHERNET_NETWORKING_CONTROL_MODEL 6 +#define UISUBCLASS_ATM_NETWORKING_CONTROL_MODEL 7 +#define UISUBCLASS_WIRELESS_HANDSET_CM 8 +#define UISUBCLASS_DEVICE_MGMT 9 +#define UISUBCLASS_MOBILE_DIRECT_LINE_MODEL 10 +#define UISUBCLASS_OBEX 11 +#define UISUBCLASS_ETHERNET_EMULATION_MODEL 12 +#define UISUBCLASS_NETWORK_CONTROL_MODEL 13 + +#define UIPROTO_CDC_NONE 0 +#define UIPROTO_CDC_AT 1 + +#define UICLASS_HID 0x03 +#define UISUBCLASS_BOOT 1 +#define UIPROTO_BOOT_KEYBOARD 1 +#define UIPROTO_MOUSE 2 + +#define UICLASS_PHYSICAL 0x05 +#define UICLASS_IMAGE 0x06 +#define UISUBCLASS_SIC 1 /* still image class */ +#define UICLASS_PRINTER 0x07 +#define UISUBCLASS_PRINTER 1 +#define UIPROTO_PRINTER_UNI 1 +#define UIPROTO_PRINTER_BI 2 +#define UIPROTO_PRINTER_1284 3 + +#define UICLASS_MASS 0x08 +#define UISUBCLASS_RBC 1 +#define UISUBCLASS_SFF8020I 2 +#define UISUBCLASS_QIC157 3 +#define UISUBCLASS_UFI 4 +#define UISUBCLASS_SFF8070I 5 +#define UISUBCLASS_SCSI 6 +#define UIPROTO_MASS_CBI_I 0 +#define UIPROTO_MASS_CBI 1 +#define UIPROTO_MASS_BBB_OLD 2 /* Not in the spec anymore */ +#define UIPROTO_MASS_BBB 80 /* 'P' for the Iomega Zip drive */ + +#define UICLASS_HUB 0x09 +#define UISUBCLASS_HUB 0 +#define UIPROTO_FSHUB 0 +#define UIPROTO_HSHUBSTT 0 /* Yes, same as previous */ +#define UIPROTO_HSHUBMTT 1 + +#define UICLASS_CDC_DATA 0x0a +#define UISUBCLASS_DATA 0x00 +#define UIPROTO_DATA_ISDNBRI 0x30 /* Physical iface */ +#define UIPROTO_DATA_HDLC 0x31 /* HDLC */ +#define UIPROTO_DATA_TRANSPARENT 0x32 /* Transparent */ +#define UIPROTO_DATA_Q921M 0x50 /* Management for Q921 */ +#define UIPROTO_DATA_Q921 0x51 /* Data for Q921 */ +#define UIPROTO_DATA_Q921TM 0x52 /* TEI multiplexer for Q921 */ +#define UIPROTO_DATA_V42BIS 0x90 /* Data compression */ +#define UIPROTO_DATA_Q931 0x91 /* Euro-ISDN */ +#define UIPROTO_DATA_V120 0x92 /* V.24 rate adaption */ +#define UIPROTO_DATA_CAPI 0x93 /* CAPI 2.0 commands */ +#define UIPROTO_DATA_HOST_BASED 0xfd /* Host based driver */ +#define UIPROTO_DATA_PUF 0xfe /* see Prot. Unit Func. Desc. 
*/ +#define UIPROTO_DATA_VENDOR 0xff /* Vendor specific */ +#define UIPROTO_DATA_NCM 0x01 /* Network Control Model */ + +#define UICLASS_SMARTCARD 0x0b +#define UICLASS_FIRM_UPD 0x0c +#define UICLASS_SECURITY 0x0d +#define UICLASS_DIAGNOSTIC 0xdc +#define UICLASS_WIRELESS 0xe0 +#define UISUBCLASS_RF 0x01 +#define UIPROTO_BLUETOOTH 0x01 +#define UIPROTO_RNDIS 0x03 + +#define UICLASS_IAD 0xEF /* Interface Association Descriptor */ +#define UISUBCLASS_SYNC 0x01 +#define UIPROTO_ACTIVESYNC 0x01 + +#define UICLASS_APPL_SPEC 0xfe +#define UISUBCLASS_FIRMWARE_DOWNLOAD 1 +#define UISUBCLASS_IRDA 2 +#define UIPROTO_IRDA 0 + +#define UICLASS_VENDOR 0xff +#define UISUBCLASS_XBOX360_CONTROLLER 0x5d +#define UIPROTO_XBOX360_GAMEPAD 0x01 + +struct usb_endpoint_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bEndpointAddress; +#define UE_GET_DIR(a) ((a) & 0x80) +#define UE_SET_DIR(a,d) ((a) | (((d)&1) << 7)) +#define UE_DIR_IN 0x80 /* IN-token endpoint, fixed */ +#define UE_DIR_OUT 0x00 /* OUT-token endpoint, fixed */ +#define UE_DIR_RX 0xfd /* for internal use only! */ +#define UE_DIR_TX 0xfe /* for internal use only! */ +#define UE_DIR_ANY 0xff /* for internal use only! */ +#define UE_ADDR 0x0f +#define UE_ADDR_ANY 0xff /* for internal use only! */ +#define UE_GET_ADDR(a) ((a) & UE_ADDR) + uByte bmAttributes; +#define UE_XFERTYPE 0x03 +#define UE_CONTROL 0x00 +#define UE_ISOCHRONOUS 0x01 +#define UE_BULK 0x02 +#define UE_INTERRUPT 0x03 +#define UE_BULK_INTR 0xfe /* for internal use only! */ +#define UE_TYPE_ANY 0xff /* for internal use only! */ +#define UE_GET_XFERTYPE(a) ((a) & UE_XFERTYPE) +#define UE_ISO_TYPE 0x0c +#define UE_ISO_ASYNC 0x04 +#define UE_ISO_ADAPT 0x08 +#define UE_ISO_SYNC 0x0c +#define UE_GET_ISO_TYPE(a) ((a) & UE_ISO_TYPE) +#define UE_ISO_USAGE 0x30 +#define UE_ISO_USAGE_DATA 0x00 +#define UE_ISO_USAGE_FEEDBACK 0x10 +#define UE_ISO_USAGE_IMPLICT_FB 0x20 +#define UE_GET_ISO_USAGE(a) ((a) & UE_ISO_USAGE) + uWord wMaxPacketSize; +#define UE_ZERO_MPS 0xFFFF /* for internal use only */ + uByte bInterval; +} __packed; +typedef struct usb_endpoint_descriptor usb_endpoint_descriptor_t; + +struct usb_endpoint_ss_comp_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bMaxBurst; + uByte bmAttributes; +#define UE_GET_BULK_STREAMS(x) ((x) & 0x0F) +#define UE_GET_SS_ISO_MULT(x) ((x) & 0x03) + uWord wBytesPerInterval; +} __packed; +typedef struct usb_endpoint_ss_comp_descriptor + usb_endpoint_ss_comp_descriptor_t; + +struct usb_string_descriptor { + uByte bLength; + uByte bDescriptorType; + uWord bString[126]; + uByte bUnused; +} __packed; +typedef struct usb_string_descriptor usb_string_descriptor_t; + +#define USB_MAKE_STRING_DESC(m,name) \ +static const struct { \ + uByte bLength; \ + uByte bDescriptorType; \ + uByte bData[sizeof((uint8_t []){m})]; \ +} __packed name = { \ + .bLength = sizeof(name), \ + .bDescriptorType = UDESC_STRING, \ + .bData = { m }, \ +} + +struct usb_string_lang { + uByte bLength; + uByte bDescriptorType; + uByte bData[2]; +} __packed; +typedef struct usb_string_lang usb_string_lang_t; + +struct usb_hub_descriptor { + uByte bDescLength; + uByte bDescriptorType; + uByte bNbrPorts; + uWord wHubCharacteristics; +#define UHD_PWR 0x0003 +#define UHD_PWR_GANGED 0x0000 +#define UHD_PWR_INDIVIDUAL 0x0001 +#define UHD_PWR_NO_SWITCH 0x0002 +#define UHD_COMPOUND 0x0004 +#define UHD_OC 0x0018 +#define UHD_OC_GLOBAL 0x0000 +#define UHD_OC_INDIVIDUAL 0x0008 +#define UHD_OC_NONE 0x0010 +#define UHD_TT_THINK 0x0060 +#define UHD_TT_THINK_8 0x0000 +#define 
UHD_TT_THINK_16 0x0020 +#define UHD_TT_THINK_24 0x0040 +#define UHD_TT_THINK_32 0x0060 +#define UHD_PORT_IND 0x0080 + uByte bPwrOn2PwrGood; /* delay in 2 ms units */ +#define UHD_PWRON_FACTOR 2 + uByte bHubContrCurrent; + uByte DeviceRemovable[32]; /* max 255 ports */ +#define UHD_NOT_REMOV(desc, i) \ + (((desc)->DeviceRemovable[(i)/8] >> ((i) % 8)) & 1) + uByte PortPowerCtrlMask[1]; /* deprecated */ +} __packed; +typedef struct usb_hub_descriptor usb_hub_descriptor_t; + +struct usb_hub_ss_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bNbrPorts; + uWord wHubCharacteristics; + uByte bPwrOn2PwrGood; /* delay in 2 ms units */ + uByte bHubContrCurrent; + uByte bHubHdrDecLat; + uWord wHubDelay; + uByte DeviceRemovable[32]; /* max 255 ports */ +} __packed; +typedef struct usb_hub_ss_descriptor usb_hub_ss_descriptor_t; + +/* minimum HUB descriptor (8-ports maximum) */ +struct usb_hub_descriptor_min { + uByte bDescLength; + uByte bDescriptorType; + uByte bNbrPorts; + uWord wHubCharacteristics; + uByte bPwrOn2PwrGood; + uByte bHubContrCurrent; + uByte DeviceRemovable[1]; + uByte PortPowerCtrlMask[1]; +} __packed; +typedef struct usb_hub_descriptor_min usb_hub_descriptor_min_t; + +struct usb_device_qualifier { + uByte bLength; + uByte bDescriptorType; + uWord bcdUSB; + uByte bDeviceClass; + uByte bDeviceSubClass; + uByte bDeviceProtocol; + uByte bMaxPacketSize0; + uByte bNumConfigurations; + uByte bReserved; +} __packed; +typedef struct usb_device_qualifier usb_device_qualifier_t; + +struct usb_otg_descriptor { + uByte bLength; + uByte bDescriptorType; + uByte bmAttributes; +#define UOTG_SRP 0x01 +#define UOTG_HNP 0x02 +} __packed; +typedef struct usb_otg_descriptor usb_otg_descriptor_t; + +/* OTG feature selectors */ +#define UOTG_B_HNP_ENABLE 3 +#define UOTG_A_HNP_SUPPORT 4 +#define UOTG_A_ALT_HNP_SUPPORT 5 + +struct usb_status { + uWord wStatus; +/* Device status flags */ +#define UDS_SELF_POWERED 0x0001 +#define UDS_REMOTE_WAKEUP 0x0002 +/* Endpoint status flags */ +#define UES_HALT 0x0001 +} __packed; +typedef struct usb_status usb_status_t; + +struct usb_hub_status { + uWord wHubStatus; +#define UHS_LOCAL_POWER 0x0001 +#define UHS_OVER_CURRENT 0x0002 + uWord wHubChange; +} __packed; +typedef struct usb_hub_status usb_hub_status_t; + +struct usb_port_status { + uWord wPortStatus; +#define UPS_CURRENT_CONNECT_STATUS 0x0001 +#define UPS_PORT_ENABLED 0x0002 +#define UPS_SUSPEND 0x0004 +#define UPS_OVERCURRENT_INDICATOR 0x0008 +#define UPS_RESET 0x0010 +#define UPS_PORT_L1 0x0020 /* USB 2.0 only */ +/* The link-state bits are valid for Super-Speed USB HUBs */ +#define UPS_PORT_LINK_STATE_GET(x) (((x) >> 5) & 0xF) +#define UPS_PORT_LINK_STATE_SET(x) (((x) & 0xF) << 5) +#define UPS_PORT_LS_U0 0x00 +#define UPS_PORT_LS_U1 0x01 +#define UPS_PORT_LS_U2 0x02 +#define UPS_PORT_LS_U3 0x03 +#define UPS_PORT_LS_SS_DIS 0x04 +#define UPS_PORT_LS_RX_DET 0x05 +#define UPS_PORT_LS_SS_INA 0x06 +#define UPS_PORT_LS_POLL 0x07 +#define UPS_PORT_LS_RECOVER 0x08 +#define UPS_PORT_LS_HOT_RST 0x09 +#define UPS_PORT_LS_COMP_MODE 0x0A +#define UPS_PORT_LS_LOOPBACK 0x0B +#define UPS_PORT_LS_RESUME 0x0F +#define UPS_PORT_POWER 0x0100 +#define UPS_PORT_POWER_SS 0x0200 /* super-speed only */ +#define UPS_LOW_SPEED 0x0200 +#define UPS_HIGH_SPEED 0x0400 +#define UPS_OTHER_SPEED 0x0600 /* currently FreeBSD specific */ +#define UPS_PORT_TEST 0x0800 +#define UPS_PORT_INDICATOR 0x1000 +#define UPS_PORT_MODE_DEVICE 0x8000 /* currently FreeBSD specific */ + uWord wPortChange; +#define UPS_C_CONNECT_STATUS 0x0001 
+#define UPS_C_PORT_ENABLED 0x0002
+#define UPS_C_SUSPEND 0x0004
+#define UPS_C_OVERCURRENT_INDICATOR 0x0008
+#define UPS_C_PORT_RESET 0x0010
+#define UPS_C_PORT_L1 0x0020 /* USB 2.0 only */
+#define UPS_C_BH_PORT_RESET 0x0020 /* USB 3.0 only */
+#define UPS_C_PORT_LINK_STATE 0x0040
+#define UPS_C_PORT_CONFIG_ERROR 0x0080
+} __packed;
+typedef struct usb_port_status usb_port_status_t;
+
+/*
+ * The "USB_SPEED" enum defines all the supported USB speeds.
+ */
+enum usb_dev_speed {
+ USB_SPEED_VARIABLE,
+ USB_SPEED_LOW,
+ USB_SPEED_FULL,
+ USB_SPEED_HIGH,
+ USB_SPEED_SUPER,
+};
+#define USB_SPEED_MAX (USB_SPEED_SUPER+1)
+
+/*
+ * The "USB_REV" enum defines all the supported USB revisions.
+ */
+enum usb_revision {
+ USB_REV_UNKNOWN,
+ USB_REV_PRE_1_0,
+ USB_REV_1_0,
+ USB_REV_1_1,
+ USB_REV_2_0,
+ USB_REV_2_5,
+ USB_REV_3_0
+};
+#define USB_REV_MAX (USB_REV_3_0+1)
+
+/*
+ * Supported host controller modes.
+ */
+enum usb_hc_mode {
+ USB_MODE_HOST, /* initiates transfers */
+ USB_MODE_DEVICE, /* bus transfer target */
+ USB_MODE_DUAL /* can be host or device */
+};
+#define USB_MODE_MAX (USB_MODE_DUAL+1)
+
+/*
+ * The "USB_STATE" enums define all the supported device states.
+ */
+enum usb_dev_state {
+ USB_STATE_DETACHED,
+ USB_STATE_ATTACHED,
+ USB_STATE_POWERED,
+ USB_STATE_ADDRESSED,
+ USB_STATE_CONFIGURED,
+};
+#define USB_STATE_MAX (USB_STATE_CONFIGURED+1)
+
+/*
+ * The "USB_EP_MODE" enum defines all the currently supported
+ * endpoint modes.
+ */
+enum usb_ep_mode {
+ USB_EP_MODE_DEFAULT,
+ USB_EP_MODE_STREAMS, /* USB3.0 specific */
+ USB_EP_MODE_HW_MASS_STORAGE,
+ USB_EP_MODE_HW_SERIAL,
+ USB_EP_MODE_HW_ETHERNET_CDC,
+ USB_EP_MODE_HW_ETHERNET_NCM,
+ USB_EP_MODE_MAX
+};
+#endif /* _USB_STANDARD_H_ */
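As an illustrative aside (not part of the commit), the wPortStatus word above also encodes the attached device's speed; this standalone sketch shows how a consumer of a USB 2.0 hub's port status might recover the usb_dev_speed enumeration just defined. The helper name and its fallback behavior are invented; UPS_OTHER_SPEED (0x0600) is FreeBSD-specific and is treated as full speed here for simplicity.

#include <stdint.h>

/* Local copies of the usb.h bits above, so the sketch stands alone. */
#define UPS_LOW_SPEED 0x0200
#define UPS_HIGH_SPEED 0x0400
#define UPS_OTHER_SPEED 0x0600

enum usb_dev_speed {
	USB_SPEED_VARIABLE, USB_SPEED_LOW, USB_SPEED_FULL,
	USB_SPEED_HIGH, USB_SPEED_SUPER,
};

/* Classify a USB 2.0 hub port status word; full speed is the default. */
static enum usb_dev_speed
port_speed(uint16_t wPortStatus)
{
	switch (wPortStatus & UPS_OTHER_SPEED) {
	case UPS_LOW_SPEED:
		return (USB_SPEED_LOW);
	case UPS_HIGH_SPEED:
		return (USB_SPEED_HIGH);
	default:
		return (USB_SPEED_FULL);
	}
}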
diff --git a/usr/contrib/freebsd/dev/usb/usb_endian.h b/usr/contrib/freebsd/dev/usb/usb_endian.h
new file mode 100644
index 0000000000..0bbcb9bf82
--- /dev/null
+++ b/usr/contrib/freebsd/dev/usb/usb_endian.h
@@ -0,0 +1,121 @@
+/* $FreeBSD$ */
+/*
+ * Copyright (c) 2008 Hans Petter Selasky. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#ifndef _USB_ENDIAN_H_
+#define _USB_ENDIAN_H_
+
+#ifndef USB_GLOBAL_INCLUDE_FILE
+#include <sys/stdint.h>
+#include <sys/endian.h>
+#endif
+
+/*
+ * Declare the basic USB record types. USB records have an alignment
+ * of 1 byte and are always packed.
+ */
+typedef uint8_t uByte;
+typedef uint8_t uWord[2];
+typedef uint8_t uDWord[4];
+typedef uint8_t uQWord[8];
+
+/*
+ * Define a set of macros that can get and set data independent of
+ * CPU endianness and CPU alignment requirements:
+ */
+#define UGETB(w) \
+ ((w)[0])
+
+#define UGETW(w) \
+ ((w)[0] | \
+ (((uint16_t)((w)[1])) << 8))
+
+#define UGETDW(w) \
+ ((w)[0] | \
+ (((uint16_t)((w)[1])) << 8) | \
+ (((uint32_t)((w)[2])) << 16) | \
+ (((uint32_t)((w)[3])) << 24))
+
+#define UGETQW(w) \
+ ((w)[0] | \
+ (((uint16_t)((w)[1])) << 8) | \
+ (((uint32_t)((w)[2])) << 16) | \
+ (((uint32_t)((w)[3])) << 24) | \
+ (((uint64_t)((w)[4])) << 32) | \
+ (((uint64_t)((w)[5])) << 40) | \
+ (((uint64_t)((w)[6])) << 48) | \
+ (((uint64_t)((w)[7])) << 56))
+
+#define USETB(w,v) do { \
+ (w)[0] = (uint8_t)(v); \
+} while (0)
+
+#define USETW(w,v) do { \
+ (w)[0] = (uint8_t)(v); \
+ (w)[1] = (uint8_t)((v) >> 8); \
+} while (0)
+
+#define USETDW(w,v) do { \
+ (w)[0] = (uint8_t)(v); \
+ (w)[1] = (uint8_t)((v) >> 8); \
+ (w)[2] = (uint8_t)((v) >> 16); \
+ (w)[3] = (uint8_t)((v) >> 24); \
+} while (0)
+
+#define USETQW(w,v) do { \
+ (w)[0] = (uint8_t)(v); \
+ (w)[1] = (uint8_t)((v) >> 8); \
+ (w)[2] = (uint8_t)((v) >> 16); \
+ (w)[3] = (uint8_t)((v) >> 24); \
+ (w)[4] = (uint8_t)((v) >> 32); \
+ (w)[5] = (uint8_t)((v) >> 40); \
+ (w)[6] = (uint8_t)((v) >> 48); \
+ (w)[7] = (uint8_t)((v) >> 56); \
+} while (0)
+
+#define USETW2(w,b1,b0) do { \
+ (w)[0] = (uint8_t)(b0); \
+ (w)[1] = (uint8_t)(b1); \
+} while (0)
+
+#define USETW4(w,b3,b2,b1,b0) do { \
+ (w)[0] = (uint8_t)(b0); \
+ (w)[1] = (uint8_t)(b1); \
+ (w)[2] = (uint8_t)(b2); \
+ (w)[3] = (uint8_t)(b3); \
+} while (0)
+
+#define USETW8(w,b7,b6,b5,b4,b3,b2,b1,b0) do { \
+ (w)[0] = (uint8_t)(b0); \
+ (w)[1] = (uint8_t)(b1); \
+ (w)[2] = (uint8_t)(b2); \
+ (w)[3] = (uint8_t)(b3); \
+ (w)[4] = (uint8_t)(b4); \
+ (w)[5] = (uint8_t)(b5); \
+ (w)[6] = (uint8_t)(b6); \
+ (w)[7] = (uint8_t)(b7); \
+} while (0)
+
+#endif /* _USB_ENDIAN_H_ */
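The byte-array approach above sidesteps both byte order and alignment: a uWord is literally two bytes assembled with explicit shifts, so the same code works on any CPU. A standalone round-trip check (not part of the commit), with local copies of the UGETW()/USETW() definitions:

#include <assert.h>
#include <stdint.h>

typedef uint8_t uWord[2];	/* mirrors the usb_endian.h typedef above */

#define UGETW(w) ((w)[0] | (((uint16_t)((w)[1])) << 8))
#define USETW(w,v) do { \
	(w)[0] = (uint8_t)(v); \
	(w)[1] = (uint8_t)((v) >> 8); \
} while (0)

int
main(void)
{
	uWord w;

	/* Round-trip a 16-bit value through the little-endian byte pair. */
	USETW(w, 0x1234);
	assert(w[0] == 0x34 && w[1] == 0x12);
	assert(UGETW(w) == 0x1234);
	return (0);
}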
diff --git a/usr/contrib/freebsd/dev/usb/usb_freebsd.h b/usr/contrib/freebsd/dev/usb/usb_freebsd.h
new file mode 100644
index 0000000000..3bc9d2c1eb
--- /dev/null
+++ b/usr/contrib/freebsd/dev/usb/usb_freebsd.h
@@ -0,0 +1,101 @@
+/* $FreeBSD$ */
+/*-
+ * Copyright (c) 2008 Hans Petter Selasky. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Including this file is mandatory for all USB related c-files in the kernel.
+ */
+
+#ifndef _USB_FREEBSD_H_
+#define _USB_FREEBSD_H_
+
+/* Default USB configuration */
+#define USB_HAVE_UGEN 1
+#define USB_HAVE_DEVCTL 1
+#define USB_HAVE_BUSDMA 1
+#define USB_HAVE_COMPAT_LINUX 1
+#define USB_HAVE_USER_IO 1
+#define USB_HAVE_MBUF 1
+#define USB_HAVE_TT_SUPPORT 1
+#define USB_HAVE_POWERD 1
+#define USB_HAVE_MSCTEST 1
+#define USB_HAVE_MSCTEST_DETACH 1
+#define USB_HAVE_PF 1
+#define USB_HAVE_ROOT_MOUNT_HOLD 1
+#define USB_HAVE_ID_SECTION 1
+#define USB_HAVE_PER_BUS_PROCESS 1
+#define USB_HAVE_FIXED_ENDPOINT 0
+#define USB_HAVE_FIXED_IFACE 0
+#define USB_HAVE_FIXED_CONFIG 0
+#define USB_HAVE_FIXED_PORT 0
+#define USB_HAVE_DISABLE_ENUM 1
+
+/* define zero ticks callout value */
+#define USB_CALLOUT_ZERO_TICKS 1
+
+#define USB_TD_GET_PROC(td) (td)->td_proc
+#define USB_PROC_GET_GID(td) (td)->p_pgid
+
+#if (!defined(USB_HOST_ALIGN)) || (USB_HOST_ALIGN <= 0)
+/* Use default value. */
+#undef USB_HOST_ALIGN
+#if defined(__arm__) || defined(__mips__) || defined(__powerpc__)
+#define USB_HOST_ALIGN 32 /* Arm and MIPS need at least this much, if not more */
+#else
+#define USB_HOST_ALIGN 8 /* bytes, must be power of two */
+#endif
+#endif
+/* Sanity check for USB_HOST_ALIGN: Verify power of two. */
+#if ((-USB_HOST_ALIGN) & USB_HOST_ALIGN) != USB_HOST_ALIGN
+#error "USB_HOST_ALIGN is not a power of two."
+#endif
+#define USB_FS_ISOC_UFRAME_MAX 4 /* exclusive unit */
+#define USB_BUS_MAX 256 /* units */
+#define USB_MAX_DEVICES 128 /* units */
+#define USB_CONFIG_MAX 65535 /* bytes */
+#define USB_IFACE_MAX 32 /* units */
+#define USB_FIFO_MAX 128 /* units */
+#define USB_MAX_EP_STREAMS 8 /* units */
+#define USB_MAX_EP_UNITS 32 /* units */
+#define USB_MAX_PORTS 255 /* units */
+
+#define USB_MAX_FS_ISOC_FRAMES_PER_XFER (120) /* units */
+#define USB_MAX_HS_ISOC_FRAMES_PER_XFER (8*120) /* units */
+
+#define USB_HUB_MAX_DEPTH 5
+#define USB_EP0_BUFSIZE 1024 /* bytes */
+#define USB_CS_RESET_LIMIT 20 /* failures = 20 * 50 ms = 1sec */
+
+#define USB_MAX_AUTO_QUIRK 8 /* maximum number of dynamic quirks */
+
+typedef uint32_t usb_timeout_t; /* milliseconds */
+typedef uint32_t usb_frlength_t; /* bytes */
+typedef uint32_t usb_frcount_t; /* units */
+typedef uint32_t usb_size_t; /* bytes */
+typedef uint32_t usb_ticks_t; /* system defined */
+typedef uint16_t usb_power_mask_t; /* see "USB_HW_POWER_XXX" */
+typedef uint16_t usb_stream_t; /* stream ID */
+
+#endif /* _USB_FREEBSD_H_ */
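The preprocessor sanity check above relies on a two's-complement identity: (-x) & x isolates the lowest set bit of x, so it equals x only when x has exactly one bit set. (x == 0 would also pass, but the USB_HOST_ALIGN <= 0 case is already excluded just before.) A standalone restatement, not from the commit:

#include <assert.h>

/* (-x) & x keeps only the lowest set bit of x. */
#define IS_POW2(x) (((-(x)) & (x)) == (x))

int
main(void)
{
	assert(IS_POW2(8));	/* -8 & 8 == 8: exactly one bit set */
	assert(!IS_POW2(12));	/* -12 & 12 == 4: only the lowest bit survives */
	return (0);
}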
diff --git a/usr/contrib/freebsd/dev/usb/usbdi.h b/usr/contrib/freebsd/dev/usb/usbdi.h
new file mode 100644
index 0000000000..202ad89fa7
--- /dev/null
+++ b/usr/contrib/freebsd/dev/usb/usbdi.h
@@ -0,0 +1,657 @@
+/*-
+ * Copyright (c) 2009 Andrew Thompson
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+#ifndef _USB_USBDI_H_
+#define _USB_USBDI_H_
+
+struct usb_fifo;
+struct usb_xfer;
+struct usb_device;
+struct usb_attach_arg;
+struct usb_interface;
+struct usb_endpoint;
+struct usb_page_cache;
+struct usb_page_search;
+struct usb_process;
+struct usb_proc_msg;
+struct usb_mbuf;
+struct usb_fs_privdata;
+struct mbuf;
+
+typedef enum { /* keep in sync with usb_errstr_table */
+ USB_ERR_NORMAL_COMPLETION = 0,
+ USB_ERR_PENDING_REQUESTS, /* 1 */
+ USB_ERR_NOT_STARTED, /* 2 */
+ USB_ERR_INVAL, /* 3 */
+ USB_ERR_NOMEM, /* 4 */
+ USB_ERR_CANCELLED, /* 5 */
+ USB_ERR_BAD_ADDRESS, /* 6 */
+ USB_ERR_BAD_BUFSIZE, /* 7 */
+ USB_ERR_BAD_FLAG, /* 8 */
+ USB_ERR_NO_CALLBACK, /* 9 */
+ USB_ERR_IN_USE, /* 10 */
+ USB_ERR_NO_ADDR, /* 11 */
+ USB_ERR_NO_PIPE, /* 12 */
+ USB_ERR_ZERO_NFRAMES, /* 13 */
+ USB_ERR_ZERO_MAXP, /* 14 */
+ USB_ERR_SET_ADDR_FAILED, /* 15 */
+ USB_ERR_NO_POWER, /* 16 */
+ USB_ERR_TOO_DEEP, /* 17 */
+ USB_ERR_IOERROR, /* 18 */
+ USB_ERR_NOT_CONFIGURED, /* 19 */
+ USB_ERR_TIMEOUT, /* 20 */
+ USB_ERR_SHORT_XFER, /* 21 */
+ USB_ERR_STALLED, /* 22 */
+ USB_ERR_INTERRUPTED, /* 23 */
+ USB_ERR_DMA_LOAD_FAILED, /* 24 */
+ USB_ERR_BAD_CONTEXT, /* 25 */
+ USB_ERR_NO_ROOT_HUB, /* 26 */
+ USB_ERR_NO_INTR_THREAD, /* 27 */
+ USB_ERR_NOT_LOCKED, /* 28 */
+ USB_ERR_MAX
+} usb_error_t;
+
+/*
+ * Flags for transfers
+ */
+#define USB_FORCE_SHORT_XFER 0x0001 /* force a short transmit last */
+#define USB_SHORT_XFER_OK 0x0004 /* allow short reads */
+#define USB_DELAY_STATUS_STAGE 0x0010 /* insert delay before STATUS stage */
+#define USB_USER_DATA_PTR 0x0020 /* internal flag */
+#define USB_MULTI_SHORT_OK 0x0040 /* allow multiple short frames */
+#define USB_MANUAL_STATUS 0x0080 /* manual ctrl status */
+
+#define USB_NO_TIMEOUT 0
+#define USB_DEFAULT_TIMEOUT 5000 /* 5000 ms = 5 seconds */
+
+#if defined(_KERNEL)
+/* typedefs */
+
+typedef void (usb_callback_t)(struct usb_xfer *, usb_error_t);
+typedef void (usb_proc_callback_t)(struct usb_proc_msg *);
+typedef usb_error_t (usb_handle_req_t)(struct usb_device *,
+ struct usb_device_request *, const void **, uint16_t *);
+
+typedef int (usb_fifo_open_t)(struct usb_fifo *fifo, int fflags);
+typedef void (usb_fifo_close_t)(struct usb_fifo *fifo, int fflags);
+typedef int (usb_fifo_ioctl_t)(struct usb_fifo *fifo, u_long cmd, void *addr, int fflags);
+typedef void (usb_fifo_cmd_t)(struct usb_fifo *fifo);
+typedef void (usb_fifo_filter_t)(struct usb_fifo *fifo, struct usb_mbuf *m);
+
+
+/* USB events */
+#ifndef USB_GLOBAL_INCLUDE_FILE
+#include <sys/eventhandler.h>
+#endif
+typedef void (*usb_dev_configured_t)(void *, struct usb_device *,
+ struct usb_attach_arg *);
+EVENTHANDLER_DECLARE(usb_dev_configured, usb_dev_configured_t);
+
+/*
+ * The following macros are used to convert milliseconds into
+ * HZ. We use 1024 instead of 1000 milliseconds per second to save a
+ * full division.
+ */
+#define USB_MS_HZ 1024
+
+#define USB_MS_TO_TICKS(ms) \
+ (((uint32_t)((((uint32_t)(ms)) * ((uint32_t)(hz))) + USB_MS_HZ - 1)) / USB_MS_HZ)
+
+/*
+ * Common queue structure for USB transfers.
+ */
+struct usb_xfer_queue {
+ TAILQ_HEAD(, usb_xfer) head;
+ struct usb_xfer *curr; /* current USB transfer processed */
+ void (*command) (struct usb_xfer_queue *pq);
+ uint8_t recurse_1:1;
+ uint8_t recurse_2:1;
+ uint8_t recurse_3:1;
+ uint8_t reserved:5;
+};
+
+/*
+ * The following structure defines an USB endpoint.
+ */
+struct usb_endpoint {
+ /* queue of USB transfers */
+ struct usb_xfer_queue endpoint_q[USB_MAX_EP_STREAMS];
+
+ struct usb_endpoint_descriptor *edesc;
+ struct usb_endpoint_ss_comp_descriptor *ecomp;
+ const struct usb_pipe_methods *methods; /* set by HC driver */
+
+ uint16_t isoc_next;
+
+ uint8_t toggle_next:1; /* next data toggle value */
+ uint8_t is_stalled:1; /* set if endpoint is stalled */
+ uint8_t is_synced:1; /* set if we are synchronised */
+ uint8_t unused:5;
+ uint8_t iface_index; /* not used by "default endpoint" */
+
+ uint8_t refcount_alloc; /* allocation refcount */
+ uint8_t refcount_bw; /* bandwidth refcount */
+#define USB_EP_REF_MAX 0x3f
+
+ /* High-Speed resource allocation (valid if "refcount_bw" > 0) */
+
+ uint8_t usb_smask; /* USB start mask */
+ uint8_t usb_cmask; /* USB complete mask */
+ uint8_t usb_uframe; /* USB microframe */
+
+ /* USB endpoint mode, see USB_EP_MODE_XXX */
+
+ uint8_t ep_mode;
+};
+
+/*
+ * The following structure defines an USB interface.
+ */
+struct usb_interface {
+ struct usb_interface_descriptor *idesc;
+ device_t subdev;
+ uint8_t alt_index;
+ uint8_t parent_iface_index;
+
+ /* Linux compat */
+ struct usb_host_interface *altsetting;
+ struct usb_host_interface *cur_altsetting;
+ struct usb_device *linux_udev;
+ void *bsd_priv_sc; /* device specific information */
+ char *pnpinfo; /* additional PnP-info for this interface */
+ uint8_t num_altsetting; /* number of alternate settings */
+ uint8_t bsd_iface_index;
+};
+
+/*
+ * The following structure defines a set of USB transfer flags.
+ */
+struct usb_xfer_flags {
+ uint8_t force_short_xfer:1; /* force a short transmit transfer last */
+ uint8_t short_xfer_ok:1; /* allow short receive transfers */
+ uint8_t short_frames_ok:1; /* allow short frames */
+ uint8_t pipe_bof:1; /* block pipe on failure */
+ uint8_t proxy_buffer:1; /* makes buffer size a factor of "max_frame_size" */
+ uint8_t ext_buffer:1; /* uses external DMA buffer */
+ uint8_t manual_status:1; /* non automatic status stage on control transfers */
+ uint8_t no_pipe_ok:1; /* set if "USB_ERR_NO_PIPE" error can be ignored */
+ uint8_t stall_pipe:1; /* set if the endpoint belonging to this USB
+ * transfer should be stalled before starting
+ * this transfer! */
+ uint8_t pre_scale_frames:1; /* "usb_config->frames" is assumed to give
+ * the buffering time in milliseconds and is
+ * converted into the nearest number of frames
+ * when the USB transfer is setup. This option
+ * only has effect for ISOCHRONOUS transfers.
+ */
+};
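To see what the rounding-up division by 1024 in USB_MS_TO_TICKS() above costs in practice, here is a standalone check (not part of the commit) with the kernel tick rate hz pinned to 1000 for the sake of the example:

#include <assert.h>
#include <stdint.h>

static const uint32_t hz = 1000;	/* assumed tick rate for the sketch */

#define USB_MS_HZ 1024
#define USB_MS_TO_TICKS(ms) \
	(((uint32_t)((((uint32_t)(ms)) * ((uint32_t)(hz))) + USB_MS_HZ - 1)) / USB_MS_HZ)

int
main(void)
{
	/*
	 * Dividing by 1024 instead of 1000 under-estimates by ~2.4%,
	 * which is fine for timeouts and lets the divide become a shift.
	 */
	assert(USB_MS_TO_TICKS(250) == 245);	/* exact answer would be 250 */
	assert(USB_MS_TO_TICKS(1) == 1);	/* rounds up, never zero */
	return (0);
}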
+
+/*
+ * The following structure defines an USB configuration, which is
+ * basically used when setting up an USB transfer.
+ */
+struct usb_config {
+ usb_callback_t *callback; /* USB transfer callback */
+ usb_frlength_t bufsize; /* total pipe buffer size in bytes */
+ usb_frcount_t frames; /* maximum number of USB frames */
+ usb_timeout_t interval; /* interval in milliseconds */
+#define USB_DEFAULT_INTERVAL 0
+ usb_timeout_t timeout; /* transfer timeout in milliseconds */
+ struct usb_xfer_flags flags; /* transfer flags */
+ usb_stream_t stream_id; /* USB3.0 specific */
+ enum usb_hc_mode usb_mode; /* host or device mode */
+ uint8_t type; /* pipe type */
+ uint8_t endpoint; /* pipe number */
+ uint8_t direction; /* pipe direction */
+ uint8_t ep_index; /* pipe index match to use */
+ uint8_t if_index; /* "ifaces" index to use */
+};
+
+/*
+ * Use these macros when defining USB device ID arrays if you want to
+ * have your driver module automatically loaded in host, device or
+ * both modes respectively:
+ */
+#if USB_HAVE_ID_SECTION
+#define STRUCT_USB_HOST_ID \
+ struct usb_device_id __section("usb_host_id")
+#define STRUCT_USB_DEVICE_ID \
+ struct usb_device_id __section("usb_device_id")
+#define STRUCT_USB_DUAL_ID \
+ struct usb_device_id __section("usb_dual_id")
+#else
+#define STRUCT_USB_HOST_ID \
+ struct usb_device_id
+#define STRUCT_USB_DEVICE_ID \
+ struct usb_device_id
+#define STRUCT_USB_DUAL_ID \
+ struct usb_device_id
+#endif /* USB_HAVE_ID_SECTION */
+
+/*
+ * The following structure is used when looking up an USB driver for
+ * an USB device. It is inspired by the Linux structure called
+ * "usb_device_id".
+ */
+struct usb_device_id {
+
+ /* Select which fields to match against */
+#if BYTE_ORDER == LITTLE_ENDIAN
+ uint16_t
+ match_flag_vendor:1,
+ match_flag_product:1,
+ match_flag_dev_lo:1,
+ match_flag_dev_hi:1,
+
+ match_flag_dev_class:1,
+ match_flag_dev_subclass:1,
+ match_flag_dev_protocol:1,
+ match_flag_int_class:1,
+
+ match_flag_int_subclass:1,
+ match_flag_int_protocol:1,
+ match_flag_unused:6;
+#else
+ uint16_t
+ match_flag_unused:6,
+ match_flag_int_protocol:1,
+ match_flag_int_subclass:1,
+
+ match_flag_int_class:1,
+ match_flag_dev_protocol:1,
+ match_flag_dev_subclass:1,
+ match_flag_dev_class:1,
+
+ match_flag_dev_hi:1,
+ match_flag_dev_lo:1,
+ match_flag_product:1,
+ match_flag_vendor:1;
+#endif
+
+ /* Used for product specific matches; the BCD range is inclusive */
+ uint16_t idVendor;
+ uint16_t idProduct;
+ uint16_t bcdDevice_lo;
+ uint16_t bcdDevice_hi;
+
+ /* Used for device class matches */
+ uint8_t bDeviceClass;
+ uint8_t bDeviceSubClass;
+ uint8_t bDeviceProtocol;
+
+ /* Used for interface class matches */
+ uint8_t bInterfaceClass;
+ uint8_t bInterfaceSubClass;
+ uint8_t bInterfaceProtocol;
+
+#if USB_HAVE_COMPAT_LINUX
+ /* which fields to match against */
+ uint16_t match_flags;
+#define USB_DEVICE_ID_MATCH_VENDOR 0x0001
+#define USB_DEVICE_ID_MATCH_PRODUCT 0x0002
+#define USB_DEVICE_ID_MATCH_DEV_LO 0x0004
+#define USB_DEVICE_ID_MATCH_DEV_HI 0x0008
+#define USB_DEVICE_ID_MATCH_DEV_CLASS 0x0010
+#define USB_DEVICE_ID_MATCH_DEV_SUBCLASS 0x0020
+#define USB_DEVICE_ID_MATCH_DEV_PROTOCOL 0x0040
+#define USB_DEVICE_ID_MATCH_INT_CLASS 0x0080
+#define USB_DEVICE_ID_MATCH_INT_SUBCLASS 0x0100
+#define USB_DEVICE_ID_MATCH_INT_PROTOCOL 0x0200
+#endif
+
+ /* Hook for driver specific information */
+ unsigned long driver_info;
+} __aligned(32);
"U8:devclass;U8:devsubclass;U8:devprotocol;" \ + "U8:intclass;U8:intsubclass;U8:intprotocol;" +#define USB_STD_PNP_HOST_INFO USB_STD_PNP_INFO "T:mode=host;" +#define USB_STD_PNP_DEVICE_INFO USB_STD_PNP_INFO "T:mode=device;" +#define USB_PNP_HOST_INFO(table) \ + MODULE_PNP_INFO(USB_STD_PNP_HOST_INFO, usb, table, table, sizeof(table[0]), \ + sizeof(table) / sizeof(table[0])) +#define USB_PNP_DEVICE_INFO(table) \ + MODULE_PNP_INFO(USB_STD_PNP_DEVICE_INFO, usb, table, table, sizeof(table[0]), \ + sizeof(table) / sizeof(table[0])) +#define USB_PNP_DUAL_INFO(table) \ + MODULE_PNP_INFO(USB_STD_PNP_INFO, usb, table, table, sizeof(table[0]), \ + sizeof(table) / sizeof(table[0])) + +/* check that the size of the structure above is correct */ +extern char usb_device_id_assert[(sizeof(struct usb_device_id) == 32) ? 1 : -1]; + +#define USB_VENDOR(vend) \ + .match_flag_vendor = 1, .idVendor = (vend) + +#define USB_PRODUCT(prod) \ + .match_flag_product = 1, .idProduct = (prod) + +#define USB_VP(vend,prod) \ + USB_VENDOR(vend), USB_PRODUCT(prod) + +#define USB_VPI(vend,prod,info) \ + USB_VENDOR(vend), USB_PRODUCT(prod), USB_DRIVER_INFO(info) + +#define USB_DEV_BCD_GTEQ(lo) /* greater than or equal */ \ + .match_flag_dev_lo = 1, .bcdDevice_lo = (lo) + +#define USB_DEV_BCD_LTEQ(hi) /* less than or equal */ \ + .match_flag_dev_hi = 1, .bcdDevice_hi = (hi) + +#define USB_DEV_CLASS(dc) \ + .match_flag_dev_class = 1, .bDeviceClass = (dc) + +#define USB_DEV_SUBCLASS(dsc) \ + .match_flag_dev_subclass = 1, .bDeviceSubClass = (dsc) + +#define USB_DEV_PROTOCOL(dp) \ + .match_flag_dev_protocol = 1, .bDeviceProtocol = (dp) + +#define USB_IFACE_CLASS(ic) \ + .match_flag_int_class = 1, .bInterfaceClass = (ic) + +#define USB_IFACE_SUBCLASS(isc) \ + .match_flag_int_subclass = 1, .bInterfaceSubClass = (isc) + +#define USB_IFACE_PROTOCOL(ip) \ + .match_flag_int_protocol = 1, .bInterfaceProtocol = (ip) + +#define USB_IF_CSI(class,subclass,info) \ + USB_IFACE_CLASS(class), USB_IFACE_SUBCLASS(subclass), USB_DRIVER_INFO(info) + +#define USB_DRIVER_INFO(n) \ + .driver_info = (n) + +#define USB_GET_DRIVER_INFO(did) \ + (did)->driver_info + +/* + * The following structure keeps information that is used to match + * against an array of "usb_device_id" elements. + */ +struct usbd_lookup_info { + uint16_t idVendor; + uint16_t idProduct; + uint16_t bcdDevice; + uint8_t bDeviceClass; + uint8_t bDeviceSubClass; + uint8_t bDeviceProtocol; + uint8_t bInterfaceClass; + uint8_t bInterfaceSubClass; + uint8_t bInterfaceProtocol; + uint8_t bIfaceIndex; + uint8_t bIfaceNum; + uint8_t bConfigIndex; + uint8_t bConfigNum; +}; + +/* Structure used by probe and attach */ + +struct usb_attach_arg { + struct usbd_lookup_info info; + device_t temp_dev; /* for internal use */ + unsigned long driver_info; /* for internal use */ + void *driver_ivar; + struct usb_device *device; /* current device */ + struct usb_interface *iface; /* current interface */ + enum usb_hc_mode usb_mode; /* host or device mode */ + uint8_t port; + uint8_t dev_state; +#define UAA_DEV_READY 0 +#define UAA_DEV_DISABLED 1 +#define UAA_DEV_EJECTING 2 +}; + +/* + * The following is a wrapper for the callout structure to ease + * porting the code to other platforms. 
+
+/*
+ * The following structure keeps information that is used to match
+ * against an array of "usb_device_id" elements.
+ */
+struct usbd_lookup_info {
+ uint16_t idVendor;
+ uint16_t idProduct;
+ uint16_t bcdDevice;
+ uint8_t bDeviceClass;
+ uint8_t bDeviceSubClass;
+ uint8_t bDeviceProtocol;
+ uint8_t bInterfaceClass;
+ uint8_t bInterfaceSubClass;
+ uint8_t bInterfaceProtocol;
+ uint8_t bIfaceIndex;
+ uint8_t bIfaceNum;
+ uint8_t bConfigIndex;
+ uint8_t bConfigNum;
+};
+
+/* Structure used by probe and attach */
+
+struct usb_attach_arg {
+ struct usbd_lookup_info info;
+ device_t temp_dev; /* for internal use */
+ unsigned long driver_info; /* for internal use */
+ void *driver_ivar;
+ struct usb_device *device; /* current device */
+ struct usb_interface *iface; /* current interface */
+ enum usb_hc_mode usb_mode; /* host or device mode */
+ uint8_t port;
+ uint8_t dev_state;
+#define UAA_DEV_READY 0
+#define UAA_DEV_DISABLED 1
+#define UAA_DEV_EJECTING 2
+};
+
+/*
+ * The following is a wrapper for the callout structure to ease
+ * porting the code to other platforms.
+ */
+struct usb_callout {
+ struct callout co;
+};
+#define usb_callout_init_mtx(c,m,f) callout_init_mtx(&(c)->co,m,f)
+#define usb_callout_reset(c,t,f,d) callout_reset(&(c)->co,t,f,d)
+#define usb_callout_stop(c) callout_stop(&(c)->co)
+#define usb_callout_drain(c) callout_drain(&(c)->co)
+#define usb_callout_pending(c) callout_pending(&(c)->co)
+
+/* USB transfer states */
+
+#define USB_ST_SETUP 0
+#define USB_ST_TRANSFERRED 1
+#define USB_ST_ERROR 2
+
+/* USB handle request states */
+#define USB_HR_NOT_COMPLETE 0
+#define USB_HR_COMPLETE_OK 1
+#define USB_HR_COMPLETE_ERR 2
+
+/*
+ * The following macro will return the current state of an USB
+ * transfer as defined by the "USB_ST_XXX" enums.
+ */
+#define USB_GET_STATE(xfer) (usbd_xfer_state(xfer))
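The USB_ST_* states above drive the usual three-way switch at the heart of every FreeBSD USB transfer callback. A hedged sketch of that pattern for a bulk-read pipe follows (the function name is hypothetical; the usbd_* helpers it calls are all declared further down in this header):

static void
example_bulk_read_callback(struct usb_xfer *xfer, usb_error_t error)
{
	int actlen;

	usbd_xfer_status(xfer, &actlen, NULL, NULL, NULL);

	switch (USB_GET_STATE(xfer)) {
	case USB_ST_TRANSFERRED:
		/* ... consume "actlen" bytes of received data here ... */
		/* FALLTHROUGH */
	case USB_ST_SETUP:
		/* (Re)arm the transfer with a full-sized frame. */
		usbd_xfer_set_frame_len(xfer, 0, usbd_xfer_max_len(xfer));
		usbd_transfer_submit(xfer);
		break;
	default:		/* USB_ST_ERROR */
		if (error != USB_ERR_CANCELLED) {
			/* Request a clear-stall and retry. */
			usbd_xfer_set_stall(xfer);
			usbd_transfer_submit(xfer);
		}
		break;
	}
}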
+
+/*
+ * The following structure defines the USB process message header.
+ */
+struct usb_proc_msg {
+ TAILQ_ENTRY(usb_proc_msg) pm_qentry;
+ usb_proc_callback_t *pm_callback;
+ usb_size_t pm_num;
+};
+
+#define USB_FIFO_TX 0
+#define USB_FIFO_RX 1
+
+/*
+ * Locking note for the following functions. All the
+ * "usb_fifo_cmd_t" and "usb_fifo_filter_t" functions are called
+ * locked. The others are called unlocked.
+ */
+struct usb_fifo_methods {
+ usb_fifo_open_t *f_open;
+ usb_fifo_close_t *f_close;
+ usb_fifo_ioctl_t *f_ioctl;
+ /*
+ * NOTE: The post-ioctl callback is called after the USB reference
+ * gets locked in the IOCTL handler:
+ */
+ usb_fifo_ioctl_t *f_ioctl_post;
+ usb_fifo_cmd_t *f_start_read;
+ usb_fifo_cmd_t *f_stop_read;
+ usb_fifo_cmd_t *f_start_write;
+ usb_fifo_cmd_t *f_stop_write;
+ usb_fifo_filter_t *f_filter_read;
+ usb_fifo_filter_t *f_filter_write;
+ const char *basename[4];
+ const char *postfix[4];
+};
+
+struct usb_fifo_sc {
+ struct usb_fifo *fp[2];
+ struct usb_fs_privdata *dev;
+};
+
+const char *usbd_errstr(usb_error_t error);
+void *usbd_find_descriptor(struct usb_device *udev, void *id,
+ uint8_t iface_index, uint8_t type, uint8_t type_mask,
+ uint8_t subtype, uint8_t subtype_mask);
+struct usb_config_descriptor *usbd_get_config_descriptor(
+ struct usb_device *udev);
+struct usb_device_descriptor *usbd_get_device_descriptor(
+ struct usb_device *udev);
+struct usb_interface *usbd_get_iface(struct usb_device *udev,
+ uint8_t iface_index);
+struct usb_interface_descriptor *usbd_get_interface_descriptor(
+ struct usb_interface *iface);
+struct usb_endpoint *usbd_get_endpoint(struct usb_device *udev, uint8_t iface_index,
+ const struct usb_config *setup);
+struct usb_endpoint *usbd_get_ep_by_addr(struct usb_device *udev, uint8_t ea_val);
+usb_error_t usbd_interface_count(struct usb_device *udev, uint8_t *count);
+enum usb_hc_mode usbd_get_mode(struct usb_device *udev);
+enum usb_dev_speed usbd_get_speed(struct usb_device *udev);
+void device_set_usb_desc(device_t dev);
+void usb_pause_mtx(struct mtx *mtx, int _ticks);
+usb_error_t usbd_set_pnpinfo(struct usb_device *udev,
+ uint8_t iface_index, const char *pnpinfo);
+usb_error_t usbd_add_dynamic_quirk(struct usb_device *udev,
+ uint16_t quirk);
+usb_error_t usbd_set_endpoint_mode(struct usb_device *udev,
+ struct usb_endpoint *ep, uint8_t ep_mode);
+uint8_t usbd_get_endpoint_mode(struct usb_device *udev,
+ struct usb_endpoint *ep);
+
+const struct usb_device_id *usbd_lookup_id_by_info(
+ const struct usb_device_id *id, usb_size_t sizeof_id,
+ const struct usbd_lookup_info *info);
+int usbd_lookup_id_by_uaa(const struct usb_device_id *id,
+ usb_size_t sizeof_id, struct usb_attach_arg *uaa);
+
+usb_error_t usbd_do_request_flags(struct usb_device *udev, struct mtx *mtx,
+ struct usb_device_request *req, void *data, uint16_t flags,
+ uint16_t *actlen, usb_timeout_t timeout);
+#define usbd_do_request(u,m,r,d) \
+ usbd_do_request_flags(u,m,r,d,0,NULL,USB_DEFAULT_TIMEOUT)
+
+uint8_t usbd_clear_stall_callback(struct usb_xfer *xfer1,
+ struct usb_xfer *xfer2);
+uint8_t usbd_get_interface_altindex(struct usb_interface *iface);
+usb_error_t usbd_set_alt_interface_index(struct usb_device *udev,
+ uint8_t iface_index, uint8_t alt_index);
+uint32_t usbd_get_isoc_fps(struct usb_device *udev);
+usb_error_t usbd_transfer_setup(struct usb_device *udev,
+ const uint8_t *ifaces, struct usb_xfer **pxfer,
+ const struct usb_config *setup_start, uint16_t n_setup,
+ void *priv_sc, struct mtx *priv_mtx);
+void usbd_transfer_submit(struct usb_xfer *xfer);
+void usbd_transfer_clear_stall(struct usb_xfer *xfer);
+void usbd_transfer_drain(struct usb_xfer *xfer);
+uint8_t usbd_transfer_pending(struct usb_xfer *xfer);
+void usbd_transfer_start(struct usb_xfer *xfer);
+void usbd_transfer_stop(struct usb_xfer *xfer);
+void usbd_transfer_unsetup(struct usb_xfer **pxfer, uint16_t n_setup);
+void usbd_transfer_poll(struct usb_xfer **ppxfer, uint16_t max);
+void usbd_set_parent_iface(struct usb_device *udev, uint8_t iface_index,
+ uint8_t parent_index);
+uint8_t usbd_get_bus_index(struct usb_device *udev);
+uint8_t usbd_get_device_index(struct usb_device *udev);
+void usbd_set_power_mode(struct usb_device *udev, uint8_t power_mode);
+uint8_t usbd_filter_power_mode(struct usb_device *udev, uint8_t power_mode);
+uint8_t usbd_device_attached(struct usb_device *udev);
+
+usb_frlength_t
+ usbd_xfer_old_frame_length(struct usb_xfer *xfer, usb_frcount_t frindex);
+void usbd_xfer_status(struct usb_xfer *xfer, int *actlen, int *sumlen,
+ int *aframes, int *nframes);
+struct usb_page_cache *usbd_xfer_get_frame(struct usb_xfer *, usb_frcount_t);
+void *usbd_xfer_get_frame_buffer(struct usb_xfer *, usb_frcount_t);
+void *usbd_xfer_softc(struct usb_xfer *xfer);
+void *usbd_xfer_get_priv(struct usb_xfer *xfer);
+void usbd_xfer_set_priv(struct usb_xfer *xfer, void *);
+void usbd_xfer_set_interval(struct usb_xfer *xfer, int);
+uint8_t usbd_xfer_state(struct usb_xfer *xfer);
+void usbd_xfer_set_frame_data(struct usb_xfer *xfer, usb_frcount_t frindex,
+ void *ptr, usb_frlength_t len);
+void usbd_xfer_frame_data(struct usb_xfer *xfer, usb_frcount_t frindex,
+ void **ptr, int *len);
+void usbd_xfer_set_frame_offset(struct usb_xfer *xfer, usb_frlength_t offset,
+ usb_frcount_t frindex);
+usb_frlength_t usbd_xfer_max_len(struct usb_xfer *xfer);
+usb_frlength_t usbd_xfer_max_framelen(struct usb_xfer *xfer);
+usb_frcount_t usbd_xfer_max_frames(struct usb_xfer *xfer);
+uint8_t usbd_xfer_get_fps_shift(struct usb_xfer *xfer);
+usb_frlength_t usbd_xfer_frame_len(struct usb_xfer *xfer,
+ usb_frcount_t frindex);
+void usbd_xfer_set_frame_len(struct usb_xfer *xfer, usb_frcount_t frindex,
+ usb_frlength_t len);
+void usbd_xfer_set_timeout(struct usb_xfer *xfer, int timeout);
+void usbd_xfer_set_frames(struct usb_xfer *xfer, usb_frcount_t n);
+void usbd_xfer_set_stall(struct usb_xfer *xfer);
+int usbd_xfer_is_stalled(struct usb_xfer *xfer);
+void usbd_xfer_set_flag(struct usb_xfer *xfer, int flag);
+void usbd_xfer_clr_flag(struct usb_xfer *xfer, int flag);
+uint16_t usbd_xfer_get_timestamp(struct usb_xfer *xfer);
+uint8_t usbd_xfer_maxp_was_clamped(struct usb_xfer *xfer);
+
+void usbd_copy_in(struct usb_page_cache *cache, usb_frlength_t offset,
+ const void *ptr, usb_frlength_t len);
+int
usbd_copy_in_user(struct usb_page_cache *cache, usb_frlength_t offset, + const void *ptr, usb_frlength_t len); +void usbd_copy_out(struct usb_page_cache *cache, usb_frlength_t offset, + void *ptr, usb_frlength_t len); +int usbd_copy_out_user(struct usb_page_cache *cache, usb_frlength_t offset, + void *ptr, usb_frlength_t len); +void usbd_get_page(struct usb_page_cache *pc, usb_frlength_t offset, + struct usb_page_search *res); +void usbd_m_copy_in(struct usb_page_cache *cache, usb_frlength_t dst_offset, + struct mbuf *m, usb_size_t src_offset, usb_frlength_t src_len); +void usbd_frame_zero(struct usb_page_cache *cache, usb_frlength_t offset, + usb_frlength_t len); +void usbd_start_re_enumerate(struct usb_device *udev); +usb_error_t + usbd_start_set_config(struct usb_device *, uint8_t); + +int usb_fifo_attach(struct usb_device *udev, void *priv_sc, + struct mtx *priv_mtx, struct usb_fifo_methods *pm, + struct usb_fifo_sc *f_sc, uint16_t unit, int16_t subunit, + uint8_t iface_index, uid_t uid, gid_t gid, int mode); +void usb_fifo_detach(struct usb_fifo_sc *f_sc); +int usb_fifo_alloc_buffer(struct usb_fifo *f, uint32_t bufsize, + uint16_t nbuf); +void usb_fifo_free_buffer(struct usb_fifo *f); +uint32_t usb_fifo_put_bytes_max(struct usb_fifo *fifo); +void usb_fifo_put_data(struct usb_fifo *fifo, struct usb_page_cache *pc, + usb_frlength_t offset, usb_frlength_t len, uint8_t what); +void usb_fifo_put_data_linear(struct usb_fifo *fifo, void *ptr, + usb_size_t len, uint8_t what); +uint8_t usb_fifo_put_data_buffer(struct usb_fifo *f, void *ptr, usb_size_t len); +void usb_fifo_put_data_error(struct usb_fifo *fifo); +uint8_t usb_fifo_get_data(struct usb_fifo *fifo, struct usb_page_cache *pc, + usb_frlength_t offset, usb_frlength_t len, usb_frlength_t *actlen, + uint8_t what); +uint8_t usb_fifo_get_data_linear(struct usb_fifo *fifo, void *ptr, + usb_size_t len, usb_size_t *actlen, uint8_t what); +uint8_t usb_fifo_get_data_buffer(struct usb_fifo *f, void **pptr, + usb_size_t *plen); +void usb_fifo_reset(struct usb_fifo *f); +void usb_fifo_wakeup(struct usb_fifo *f); +void usb_fifo_get_data_error(struct usb_fifo *fifo); +void *usb_fifo_softc(struct usb_fifo *fifo); +void usb_fifo_set_close_zlp(struct usb_fifo *, uint8_t); +void usb_fifo_set_write_defrag(struct usb_fifo *, uint8_t); +void usb_fifo_free(struct usb_fifo *f); +#endif /* _KERNEL */ +#endif /* _USB_USBDI_H_ */ diff --git a/usr/contrib/freebsd/isa/rtc.h b/usr/contrib/freebsd/isa/rtc.h new file mode 100644 index 0000000000..bb964ddf6a --- /dev/null +++ b/usr/contrib/freebsd/isa/rtc.h @@ -0,0 +1,125 @@ +/*- + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)rtc.h 7.1 (Berkeley) 5/12/91
+ * $FreeBSD$
+ */
+
+#ifndef _I386_ISA_RTC_H_
+#define _I386_ISA_RTC_H_ 1
+
+/*
+ * MC146818 RTC Register locations
+ */
+
+#define RTC_SEC 0x00 /* seconds */
+#define RTC_SECALRM 0x01 /* seconds alarm */
+#define RTC_MIN 0x02 /* minutes */
+#define RTC_MINALRM 0x03 /* minutes alarm */
+#define RTC_HRS 0x04 /* hours */
+#define RTC_HRSALRM 0x05 /* hours alarm */
+#define RTC_WDAY 0x06 /* week day */
+#define RTC_DAY 0x07 /* day of month */
+#define RTC_MONTH 0x08 /* month of year */
+#define RTC_YEAR 0x09 /* year */
+
+#define RTC_STATUSA 0x0a /* status register A */
+#define RTCSA_TUP 0x80 /* time update, don't look now */
+#define RTCSA_RESET 0x70 /* reset divider */
+#define RTCSA_DIVIDER 0x20 /* divider correct for 32768 Hz */
+#define RTCSA_8192 0x03 /* 8192 Hz interrupt */
+#define RTCSA_4096 0x04
+#define RTCSA_2048 0x05
+#define RTCSA_1024 0x06 /* default for profiling */
+#define RTCSA_PROF RTCSA_1024
+#define RTC_PROFRATE 1024
+#define RTCSA_512 0x07
+#define RTCSA_256 0x08
+#define RTCSA_128 0x09
+#define RTCSA_NOPROF RTCSA_128
+#define RTC_NOPROFRATE 128
+#define RTCSA_64 0x0a
+#define RTCSA_32 0x0b /* 32 Hz interrupt */
+
+#define RTC_STATUSB 0x0b /* status register B */
+#define RTCSB_DST 0x01 /* USA Daylight Savings Time enable */
+#define RTCSB_24HR 0x02 /* 0 = 12 hours, 1 = 24 hours */
+#define RTCSB_BCD 0x04 /* 0 = BCD, 1 = Binary coded time */
+#define RTCSB_SQWE 0x08 /* 1 = output square wave at SQW pin */
+#define RTCSB_UINTR 0x10 /* 1 = enable update-ended interrupt */
+#define RTCSB_AINTR 0x20 /* 1 = enable alarm interrupt */
+#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */
+#define RTCSB_HALT 0x80 /* stop clock updates */
+
+#define RTC_INTR 0x0c /* status register C (R) interrupt source */
+#define RTCIR_UPDATE 0x10 /* update intr */
+#define RTCIR_ALARM 0x20 /* alarm intr */
+#define RTCIR_PERIOD 0x40 /* periodic intr */
+#define RTCIR_INT 0x80 /* interrupt output signal */
+
+#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */
+#define RTCSD_PWR 0x80 /* clock power OK */
+
+#define RTC_DIAG 0x0e /* status register E - bios diagnostic */
+#define RTCDG_BITS "\020\010clock_battery\007ROM_cksum\006config_unit\005memory_size\004fixed_disk\003invalid_time"
+
+#define RTC_RESET 0x0f /* status register F - reset code byte */
+#define RTCRS_RST 0x00 /* normal reset */
+#define RTCRS_LOAD 0x04 /* load system */
+
+#define RTC_FDISKETTE 0x10 /* diskette drive type in upper/lower nibble */
+#define RTCFDT_NONE 0 /* none present */
+#define RTCFDT_360K 0x10 /* 360K */
+#define RTCFDT_12M 0x20 /* 1.2M */
+#define RTCFDT_720K 0x30 /* 720K */
+#define RTCFDT_144M 0x40 /* 1.44M */
+#define RTCFDT_288M_1 0x50 /* 2.88M, some BIOSes */
+#define RTCFDT_288M 0x60 /* 2.88M */
+
+#define RTC_BASELO 0x15 /* low byte of basemem size */
+#define RTC_BASEHI 0x16 /* high byte of basemem size */
+#define RTC_EXTLO 0x17 /* low byte of extended mem size */
+#define RTC_EXTHI 0x18 /* high byte of extended mem size */
+
+#define RTC_CENTURY 0x32 /* current century */
+
+#ifdef __FreeBSD__
+#ifdef _KERNEL
+extern struct mtx clock_lock;
+extern int atrtcclock_disable;
+int rtcin(int reg);
+void atrtc_restore(void);
+void writertc(int reg, u_char val);
+void atrtc_set(struct timespec *ts);
+#endif
+#endif
+
+#endif /* _I386_ISA_RTC_H_ */
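Per the RTCSB_BCD comment above, the MC146818 returns time registers in BCD unless that bit is set in status register B. A standalone sketch (not part of the commit; the helper is invented) of decoding one such register value:

#include <assert.h>
#include <stdint.h>

#define RTCSB_BCD 0x04	/* mirrors rtc.h above: 0 = BCD, 1 = binary */

/* Convert one BCD-encoded RTC register value (e.g. RTC_SEC) to binary. */
static uint8_t
bcd2bin(uint8_t bcd)
{
	return ((bcd >> 4) * 10 + (bcd & 0x0f));
}

int
main(void)
{
	uint8_t statusb = 0;	/* pretend status register B read back 0 */
	uint8_t sec = 0x59;	/* raw RTC_SEC value */

	if ((statusb & RTCSB_BCD) == 0)
		sec = bcd2bin(sec);
	assert(sec == 59);
	return (0);
}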
diff --git a/usr/contrib/freebsd/lib/libutil/humanize_number.c b/usr/contrib/freebsd/lib/libutil/humanize_number.c
new file mode 100644
index 0000000000..675a969aaa
--- /dev/null
+++ b/usr/contrib/freebsd/lib/libutil/humanize_number.c
@@ -0,0 +1,179 @@
+/* $NetBSD: humanize_number.c,v 1.14 2008/04/28 20:22:59 martin Exp $ */
+
+/*
+ * Copyright (c) 1997, 1998, 1999, 2002 The NetBSD Foundation, Inc.
+ * Copyright 2013 John-Mark Gurney <jmg@FreeBSD.org>
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to The NetBSD Foundation
+ * by Jason R. Thorpe of the Numerical Aerospace Simulation Facility,
+ * NASA Ames Research Center, by Luke Mewburn and by Tomas Svensson.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+#include <assert.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <locale.h>
+#include <libutil.h>
+
+static const int maxscale = 6;
+
+int
+humanize_number(char *buf, size_t len, int64_t quotient,
+ const char *suffix, int scale, int flags)
+{
+ const char *prefixes, *sep;
+ int i, r, remainder, s1, s2, sign;
+ int divisordeccut;
+ int64_t divisor, max;
+ size_t baselen;
+
+ /* Since so many callers don't check -1, NUL terminate the buffer */
+ if (len > 0)
+ buf[0] = '\0';
+
+ /* validate args */
+ if (buf == NULL || suffix == NULL)
+ return (-1);
+ if (scale < 0)
+ return (-1);
+ else if (scale > maxscale &&
+ ((scale & ~(HN_AUTOSCALE|HN_GETSCALE)) != 0))
+ return (-1);
+ if ((flags & HN_DIVISOR_1000) && (flags & HN_IEC_PREFIXES))
+ return (-1);
+
+ /* setup parameters */
+ remainder = 0;
+
+ if (flags & HN_IEC_PREFIXES) {
+ baselen = 2;
+ /*
+ * Use the prefixes for power of two recommended by
+ * the International Electrotechnical Commission
+ * (IEC) in IEC 80000-3 (i.e. Ki, Mi, Gi...).
+ *
+ * HN_IEC_PREFIXES implies a divisor of 1024 here
+ * (use of HN_DIVISOR_1000 would have triggered
+ * an assertion earlier).
+ */
+ divisor = 1024;
+ divisordeccut = 973; /* ceil(.95 * 1024) */
+ if (flags & HN_B)
+ prefixes = "B\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei";
+ else
+ prefixes = "\0\0\0Ki\0Mi\0Gi\0Ti\0Pi\0Ei";
+ } else {
+ baselen = 1;
+ if (flags & HN_DIVISOR_1000) {
+ divisor = 1000;
+ divisordeccut = 950;
+ if (flags & HN_B)
+ prefixes = "B\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E";
+ else
+ prefixes = "\0\0\0k\0\0M\0\0G\0\0T\0\0P\0\0E";
+ } else {
+ divisor = 1024;
+ divisordeccut = 973; /* ceil(.95 * 1024) */
+ if (flags & HN_B)
+ prefixes = "B\0\0K\0\0M\0\0G\0\0T\0\0P\0\0E";
+ else
+ prefixes = "\0\0\0K\0\0M\0\0G\0\0T\0\0P\0\0E";
+ }
+ }
+
+#define SCALE2PREFIX(scale) (&prefixes[(scale) * 3])
+
+ if (quotient < 0) {
+ sign = -1;
+ quotient = -quotient;
+ baselen += 2; /* sign, digit */
+ } else {
+ sign = 1;
+ baselen += 1; /* digit */
+ }
+ if (flags & HN_NOSPACE)
+ sep = "";
+ else {
+ sep = " ";
+ baselen++;
+ }
+ baselen += strlen(suffix);
+
+ /* Check if enough room for `x y' + suffix + `\0' */
+ if (len < baselen + 1)
+ return (-1);
+
+ if (scale & (HN_AUTOSCALE | HN_GETSCALE)) {
+ /* See if there are additional columns that can be used. */
+ for (max = 1, i = len - baselen; i-- > 0;)
+ max *= 10;
+
+ /*
+ * Divide the number until it fits the given column.
+ * If there will be an overflow by the rounding below,
+ * divide once more.
+ */
+ for (i = 0;
+ (quotient >= max || (quotient == max - 1 &&
+ remainder >= divisordeccut)) && i < maxscale; i++) {
+ remainder = quotient % divisor;
+ quotient /= divisor;
+ }
+
+ if (scale & HN_GETSCALE)
+ return (i);
+ } else {
+ for (i = 0; i < scale && i < maxscale; i++) {
+ remainder = quotient % divisor;
+ quotient /= divisor;
+ }
+ }
+
+ /* If a value <= 9.9 after rounding and ... */
+ /*
+ * XXX - should we make sure there is enough space for the decimal
+ * place and if not, don't do HN_DECIMAL?
+ */
+ if (((quotient == 9 && remainder < divisordeccut) || quotient < 9) &&
+ i > 0 && flags & HN_DECIMAL) {
+ s1 = (int)quotient + ((remainder * 10 + divisor / 2) /
+ divisor / 10);
+ s2 = ((remainder * 10 + divisor / 2) / divisor) % 10;
+ r = snprintf(buf, len, "%d%s%d%s%s%s",
+ sign * s1, localeconv()->decimal_point, s2,
+ sep, SCALE2PREFIX(i), suffix);
+ } else
+ r = snprintf(buf, len, "%" PRId64 "%s%s%s",
+ sign * (quotient + (remainder + divisor / 2) / divisor),
+ sep, SCALE2PREFIX(i), suffix);
+
+ return (r);
+}
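humanize_number() is easiest to understand from the caller's side. A small userland sketch (not part of the commit; it assumes the HN_* flag values from FreeBSD's <libutil.h> and linking with -lutil):

#include <stdio.h>
#include <libutil.h>

int
main(void)
{
	char buf[5];

	/*
	 * 1536 bytes into a 5-byte buffer: autoscaling divides by 1024
	 * once so the value fits the available columns, and HN_DECIMAL
	 * keeps one fractional digit. Prints "1.5K".
	 */
	if (humanize_number(buf, sizeof(buf), 1536, "", HN_AUTOSCALE,
	    HN_DECIMAL | HN_NOSPACE) == -1)
		return (1);
	printf("%s\n", buf);
	return (0);
}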
diff --git a/usr/contrib/freebsd/sys/ata.h b/usr/contrib/freebsd/sys/ata.h
index 705460355f..223bd7b3eb 100644
--- a/usr/contrib/freebsd/sys/ata.h
+++ b/usr/contrib/freebsd/sys/ata.h
@@ -23,7 +23,7 @@
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * $FreeBSD: head/sys/sys/ata.h 264853 2014-04-24 01:28:14Z smh $
+ * $FreeBSD$
  */
 
 #ifndef _SYS_ATA_H_
@@ -105,6 +105,10 @@ struct ata_params {
 /*069*/ u_int16_t support3;
 #define ATA_SUPPORT_RZAT 0x0020
 #define ATA_SUPPORT_DRAT 0x4000
+#define ATA_SUPPORT_ZONE_MASK 0x0003
+#define ATA_SUPPORT_ZONE_NR 0x0000
+#define ATA_SUPPORT_ZONE_HOST_AWARE 0x0001
+#define ATA_SUPPORT_ZONE_DEV_MANAGED 0x0002
 u_int16_t reserved70;
 /*071*/ u_int16_t rlsovlap; /* rel time (us) for overlap */
 /*072*/ u_int16_t rlsservice; /* rel time (us) for service */
@@ -228,7 +232,14 @@ struct ata_params {
 #define ATA_SUPPORT_RWLOGDMAEXT 0x0008
 #define ATA_SUPPORT_MICROCODE3 0x0010
 #define ATA_SUPPORT_FREEFALL 0x0020
+#define ATA_SUPPORT_SENSE_REPORT 0x0040
+#define ATA_SUPPORT_EPC 0x0080
 /*120*/ u_int16_t enabled2;
+#define ATA_ENABLED_WRITEREADVERIFY 0x0002
+#define ATA_ENABLED_WRITEUNCORREXT 0x0004
+#define ATA_ENABLED_FREEFALL 0x0020
+#define ATA_ENABLED_SENSE_REPORT 0x0040
+#define ATA_ENABLED_EPC 0x0080
 u_int16_t reserved121[6];
 /*127*/ u_int16_t removable_status;
 /*128*/ u_int16_t security_status;
@@ -252,7 +263,7 @@ struct ata_params {
 u_int16_t reserved170[6];
 /*176*/ u_int8_t media_serial[60];
 /*206*/ u_int16_t sct;
- u_int16_t reserved206[2];
+ u_int16_t reserved207[2];
 /*209*/ u_int16_t lsalign;
 /*210*/ u_int16_t wrv_sectors_m3_1;
 u_int16_t wrv_sectors_m3_2;
@@ -298,8 +309,14 @@ struct ata_params {
 #define ATA_MAX_28BIT_LBA 268435455UL
 
 /* ATA Status Register */
-#define ATA_STATUS_ERROR 0x01
-#define ATA_STATUS_DEVICE_FAULT 0x20
+#define ATA_STATUS_ERROR 0x01
+#define ATA_STATUS_SENSE_AVAIL 0x02
+#define ATA_STATUS_ALIGN_ERR 0x04
+#define ATA_STATUS_DATA_REQ 0x08
+#define ATA_STATUS_DEF_WRITE_ERR 0x10
+#define ATA_STATUS_DEVICE_FAULT 0x20
+#define ATA_STATUS_DEVICE_READY 0x40
+#define ATA_STATUS_BUSY 0x80
 
 /* ATA Error Register */
 #define ATA_ERROR_ABORT 0x04
@@ -335,6 +352,7 @@ struct ata_params {
 #define ATA_UDMA6 0x46
 #define ATA_SA150 0x47
 #define ATA_SA300 0x48
+#define ATA_SA600 0x49
 
 #define ATA_DMA_MAX 0x4f
@@ -367,13 +385,36 @@ struct ata_params {
 #define ATA_WRITE_LOG_EXT 0x3f
 #define ATA_READ_VERIFY 0x40
 #define ATA_READ_VERIFY48 0x42
+#define ATA_WRITE_UNCORRECTABLE48 0x45 /* write uncorrectable 48bit LBA */
+#define ATA_WU_PSEUDO 0x55 /* pseudo-uncorrectable error */
+#define ATA_WU_FLAGGED 0xaa /* flagged-uncorrectable error */
 #define ATA_READ_LOG_DMA_EXT 0x47 /* read log DMA ext - PIO Data-In */
+#define ATA_ZAC_MANAGEMENT_IN 0x4a /* ZAC management in */
+#define ATA_ZM_REPORT_ZONES 0x00 /* report zones */
 #define ATA_READ_FPDMA_QUEUED 0x60 /* read DMA NCQ */
 #define ATA_WRITE_FPDMA_QUEUED 0x61 /* write DMA NCQ */
+#define
ATA_NCQ_NON_DATA 0x63 /* NCQ non-data command */ +#define ATA_ABORT_NCQ_QUEUE 0x00 /* abort NCQ queue */ +#define ATA_DEADLINE_HANDLING 0x01 /* deadline handling */ +#define ATA_SET_FEATURES 0x05 /* set features */ +#define ATA_ZERO_EXT 0x06 /* zero ext */ +#define ATA_NCQ_ZAC_MGMT_OUT 0x07 /* NCQ ZAC mgmt out no data */ #define ATA_SEND_FPDMA_QUEUED 0x64 /* send DMA NCQ */ -#define ATA_RECV_FPDMA_QUEUED 0x65 /* recieve DMA NCQ */ +#define ATA_SFPDMA_DSM 0x00 /* Data set management */ +#define ATA_SFPDMA_DSM_TRIM 0x01 /* Set trim bit in auxiliary */ +#define ATA_SFPDMA_HYBRID_EVICT 0x01 /* Hybrid Evict */ +#define ATA_SFPDMA_WLDMA 0x02 /* Write Log DMA EXT */ +#define ATA_SFPDMA_ZAC_MGMT_OUT 0x03 /* NCQ ZAC mgmt out w/data */ +#define ATA_RECV_FPDMA_QUEUED 0x65 /* receive DMA NCQ */ +#define ATA_RFPDMA_RL_DMA_EXT 0x00 /* Read Log DMA EXT */ +#define ATA_RFPDMA_ZAC_MGMT_IN 0x02 /* NCQ ZAC mgmt in w/data */ #define ATA_SEP_ATTN 0x67 /* SEP request */ #define ATA_SEEK 0x70 /* seek */ +#define ATA_ZAC_MANAGEMENT_OUT 0x9f /* ZAC management out */ +#define ATA_ZM_CLOSE_ZONE 0x01 /* close zone */ +#define ATA_ZM_FINISH_ZONE 0x02 /* finish zone */ +#define ATA_ZM_OPEN_ZONE 0x03 /* open zone */ +#define ATA_ZM_RWP 0x04 /* reset write pointer */ #define ATA_PACKET_CMD 0xa0 /* packet command */ #define ATA_ATAPI_IDENTIFY 0xa1 /* get ATAPI params*/ #define ATA_SERVICE 0xa2 /* service command */ @@ -393,24 +434,36 @@ struct ata_params { #define ATA_IDLE_CMD 0xe3 /* idle */ #define ATA_READ_BUFFER 0xe4 /* read buffer */ #define ATA_READ_PM 0xe4 /* read portmultiplier */ +#define ATA_CHECK_POWER_MODE 0xe5 /* device power mode */ #define ATA_SLEEP 0xe6 /* sleep */ #define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */ #define ATA_WRITE_PM 0xe8 /* write portmultiplier */ #define ATA_FLUSHCACHE48 0xea /* flush cache to disk */ #define ATA_ATA_IDENTIFY 0xec /* get ATA params */ #define ATA_SETFEATURES 0xef /* features command */ -#define ATA_SF_SETXFER 0x03 /* set transfer mode */ #define ATA_SF_ENAB_WCACHE 0x02 /* enable write cache */ #define ATA_SF_DIS_WCACHE 0x82 /* disable write cache */ +#define ATA_SF_SETXFER 0x03 /* set transfer mode */ +#define ATA_SF_APM 0x05 /* Enable APM feature set */ #define ATA_SF_ENAB_PUIS 0x06 /* enable PUIS */ #define ATA_SF_DIS_PUIS 0x86 /* disable PUIS */ #define ATA_SF_PUIS_SPINUP 0x07 /* PUIS spin-up */ +#define ATA_SF_WRV 0x0b /* Enable Write-Read-Verify */ +#define ATA_SF_DLC 0x0c /* Enable device life control */ +#define ATA_SF_SATA 0x10 /* Enable use of SATA feature */ +#define ATA_SF_FFC 0x41 /* Free-fall Control */ +#define ATA_SF_MHIST 0x43 /* Set Max Host Sect. 
Times */
+#define ATA_SF_RATE 0x45 /* Set Rate Basis */
+#define ATA_SF_EPC 0x4A /* Extended Power Conditions */
 #define ATA_SF_ENAB_RCACHE 0xaa /* enable readahead cache */
 #define ATA_SF_DIS_RCACHE 0x55 /* disable readahead cache */
 #define ATA_SF_ENAB_RELIRQ 0x5d /* enable release interrupt */
 #define ATA_SF_DIS_RELIRQ 0xdd /* disable release interrupt */
 #define ATA_SF_ENAB_SRVIRQ 0x5e /* enable service interrupt */
 #define ATA_SF_DIS_SRVIRQ 0xde /* disable service interrupt */
+#define ATA_SF_LPSAERC 0x62 /* Long Phys Sect Align ErrRep*/
+#define ATA_SF_DSN 0x63 /* Device Stats Notification */
+#define ATA_CHECK_POWER_MODE 0xe5 /* Check Power Mode */
 #define ATA_SECURITY_SET_PASSWORD 0xf1 /* set drive password */
 #define ATA_SECURITY_UNLOCK 0xf2 /* unlock drive using passwd */
 #define ATA_SECURITY_ERASE_PREPARE 0xf3 /* prepare to erase drive */
@@ -537,6 +590,333 @@ struct atapi_sense {
 	u_int8_t specific2; /* sense key specific */
 } __packed;
+/*
+ * SET FEATURES subcommands
+ */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * These values go in the LBA 3:0.
+ */
+#define ATA_SF_EPC_RESTORE 0x00 /* Restore Power Condition Settings */
+#define ATA_SF_EPC_GOTO 0x01 /* Go To Power Condition */
+#define ATA_SF_EPC_SET_TIMER 0x02 /* Set Power Condition Timer */
+#define ATA_SF_EPC_SET_STATE 0x03 /* Set Power Condition State */
+#define ATA_SF_EPC_ENABLE 0x04 /* Enable the EPC feature set */
+#define ATA_SF_EPC_DISABLE 0x05 /* Disable the EPC feature set */
+#define ATA_SF_EPC_SET_SOURCE 0x06 /* Set EPC Power Source */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * Power Condition ID field
+ * These values go in the count register.
+ */
+#define ATA_EPC_STANDBY_Z 0x00 /* Substate of PM2:Standby */
+#define ATA_EPC_STANDBY_Y 0x01 /* Substate of PM2:Standby */
+#define ATA_EPC_IDLE_A 0x81 /* Substate of PM1:Idle */
+#define ATA_EPC_IDLE_B 0x82 /* Substate of PM1:Idle */
+#define ATA_EPC_IDLE_C 0x83 /* Substate of PM1:Idle */
+#define ATA_EPC_ALL 0xff /* All supported power conditions */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * Restore Power Condition Settings subcommand
+ * These values go in the LBA register.
+ */
+#define ATA_SF_EPC_RST_DFLT 0x40 /* 1=Rst from Default, 0= from Saved */
+#define ATA_SF_EPC_RST_SAVE 0x10 /* 1=Save on completion */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * Go To Power Condition subcommand
+ * These values go in the LBA register.
+ */
+#define ATA_SF_EPC_GOTO_DELAY 0x02000000 /* Delayed entry bit */
+#define ATA_SF_EPC_GOTO_HOLD 0x01000000 /* Hold Power Cond bit */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * Set Power Condition Timer subcommand
+ * These values go in the LBA register.
+ */
+#define ATA_SF_EPC_TIMER_MASK 0x00ffff00 /* Timer field */
+#define ATA_SF_EPC_TIMER_SHIFT 8
+#define ATA_SF_EPC_TIMER_SEC 0x00000080 /* Timer units, 1=sec, 0=.1s */
+#define ATA_SF_EPC_TIMER_EN 0x00000020 /* Enable/disable cond. */
+#define ATA_SF_EPC_TIMER_SAVE 0x00000010 /* Save settings on comp. */
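The timer masks above all describe one 32-bit LBA payload: the EPC subcommand occupies LBA bits 3:0 and the timer value and control bits sit above it, while the power-condition ID (e.g. ATA_EPC_IDLE_B) travels separately in the count register. The following hedged sketch shows the packing; the helper name and its parameters are editorial inventions, and actually delivering the SET FEATURES command to a device is driver-specific and omitted.

#include <stdint.h>

/* Compose the LBA value for SET FEATURES / EPC "Set Power Condition Timer". */
static uint32_t
epc_set_timer_lba(uint32_t timer, int in_seconds, int save)
{
	uint32_t lba = ATA_SF_EPC_SET_TIMER;	/* subcommand, LBA bits 3:0 */

	lba |= (timer << ATA_SF_EPC_TIMER_SHIFT) & ATA_SF_EPC_TIMER_MASK;
	lba |= ATA_SF_EPC_TIMER_EN;		/* enable the power condition */
	if (in_seconds)
		lba |= ATA_SF_EPC_TIMER_SEC;	/* units of 1 s instead of 0.1 s */
	if (save)
		lba |= ATA_SF_EPC_TIMER_SAVE;	/* save settings on completion */
	return (lba);
}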
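+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * Set Power Condition State subcommand
+ * These values go in the LBA register.
+ */
+#define ATA_SF_EPC_SETCON_EN 0x00000020 /* Enable power cond. */
+#define ATA_SF_EPC_SETCON_SAVE 0x00000010 /* Save settings on comp */
+
+/*
+ * SET FEATURES command
+ * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
+ * Set EPC Power Source subcommand
+ * These values go in the count register.
+ */
+#define ATA_SF_EPC_SRC_UNKNOWN 0x0000 /* Unknown source */
+#define ATA_SF_EPC_SRC_BAT 0x0001 /* battery source */
+#define ATA_SF_EPC_SRC_NOT_BAT 0x0002 /* not battery source */
+
+#define ATA_LOG_DIRECTORY 0x00 /* Directory of all logs */
+#define ATA_POWER_COND_LOG 0x08 /* Power Conditions Log */
+#define ATA_PCL_IDLE 0x00 /* Idle Power Conditions Page */
+#define ATA_PCL_STANDBY 0x01 /* Standby Power Conditions Page */
+#define ATA_IDENTIFY_DATA_LOG 0x30 /* Identify Device Data Log */
+#define ATA_IDL_PAGE_LIST 0x00 /* List of supported pages */
+#define ATA_IDL_IDENTIFY_DATA 0x01 /* Copy of Identify Device data */
+#define ATA_IDL_CAPACITY 0x02 /* Capacity */
+#define ATA_IDL_SUP_CAP 0x03 /* Supported Capabilities */
+#define ATA_IDL_CUR_SETTINGS 0x04 /* Current Settings */
+#define ATA_IDL_ATA_STRINGS 0x05 /* ATA Strings */
+#define ATA_IDL_SECURITY 0x06 /* Security */
+#define ATA_IDL_PARALLEL_ATA 0x07 /* Parallel ATA */
+#define ATA_IDL_SERIAL_ATA 0x08 /* Serial ATA */
+#define ATA_IDL_ZDI 0x09 /* Zoned Device Information */
+
+struct ata_gp_log_dir {
+	uint8_t header[2];
+#define ATA_GP_LOG_DIR_VERSION 0x0001
+	uint8_t num_pages[255*2]; /* Number of log pages at address */
+};
+
+/*
+ * ATA Power Conditions log descriptor
+ */
+struct ata_power_cond_log_desc {
+	uint8_t reserved1;
+	uint8_t flags;
+#define ATA_PCL_COND_SUPPORTED 0x80
+#define ATA_PCL_COND_SAVEABLE 0x40
+#define ATA_PCL_COND_CHANGEABLE 0x20
+#define ATA_PCL_DEFAULT_TIMER_EN 0x10
+#define ATA_PCL_SAVED_TIMER_EN 0x08
+#define ATA_PCL_CURRENT_TIMER_EN 0x04
+#define ATA_PCL_HOLD_PC_NOT_SUP 0x02
+	uint8_t reserved2[2];
+	uint8_t default_timer[4];
+	uint8_t saved_timer[4];
+	uint8_t current_timer[4];
+	uint8_t nom_time_to_active[4];
+	uint8_t min_timer[4];
+	uint8_t max_timer[4];
+	uint8_t num_transitions_to_pc[4];
+	uint8_t hours_in_pc[4];
+	uint8_t reserved3[28];
+};
+
+/*
+ * ATA Power Conditions Log (0x08), Idle power conditions page (0x00)
+ */
+struct ata_power_cond_log_idle {
+	struct ata_power_cond_log_desc idle_a_desc;
+	struct ata_power_cond_log_desc idle_b_desc;
+	struct ata_power_cond_log_desc idle_c_desc;
+	uint8_t reserved[320];
+};
+
+/*
+ * ATA Power Conditions Log (0x08), Standby power conditions page (0x01)
+ */
+struct ata_power_cond_log_standby {
+	uint8_t reserved[384];
+	struct ata_power_cond_log_desc standby_y_desc;
+	struct ata_power_cond_log_desc standby_z_desc;
+};
+
+/*
+ * ATA IDENTIFY DEVICE data log (0x30) page 0x00
+ * List of Supported IDENTIFY DEVICE data pages.
+ */
+struct ata_identify_log_pages {
+	uint8_t header[8];
+#define ATA_IDLOG_REVISION 0x0000000000000001
+	uint8_t entry_count;
+	uint8_t entries[503];
+};
+
+/*
+ * ATA IDENTIFY DEVICE data log (0x30)
+ * Capacity (Page 0x02).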
+ */ +struct ata_identify_log_capacity { + uint8_t header[8]; +#define ATA_CAP_HEADER_VALID 0x8000000000000000 +#define ATA_CAP_PAGE_NUM_MASK 0x0000000000ff0000 +#define ATA_CAP_PAGE_NUM_SHIFT 16 +#define ATA_CAP_REV_MASK 0x00000000000000ff + uint8_t capacity[8]; +#define ATA_CAP_CAPACITY_VALID 0x8000000000000000 +#define ATA_CAP_ACCESSIBLE_CAP 0x0000ffffffffffff + uint8_t phys_logical_sect_size[8]; +#define ATA_CAP_PL_VALID 0x8000000000000000 +#define ATA_CAP_LTOP_REL_SUP 0x4000000000000000 +#define ATA_CAP_LOG_SECT_SUP 0x2000000000000000 +#define ATA_CAP_ALIGN_ERR_MASK 0x0000000000300000 +#define ATA_CAP_LTOP_MASK 0x00000000000f0000 +#define ATA_CAP_LOG_SECT_OFF 0x000000000000ffff + uint8_t logical_sect_size[8]; +#define ATA_CAP_LOG_SECT_VALID 0x8000000000000000 +#define ATA_CAP_LOG_SECT_SIZE 0x00000000ffffffff + uint8_t nominal_buffer_size[8]; +#define ATA_CAP_NOM_BUF_VALID 0x8000000000000000 +#define ATA_CAP_NOM_BUF_SIZE 0x7fffffffffffffff + uint8_t reserved[472]; +}; + +/* + * ATA IDENTIFY DEVICE data log (0x30) + * Supported Capabilities (Page 0x03). + */ + +struct ata_identify_log_sup_cap { + uint8_t header[8]; +#define ATA_SUP_CAP_HEADER_VALID 0x8000000000000000 +#define ATA_SUP_CAP_PAGE_NUM_MASK 0x0000000000ff0000 +#define ATA_SUP_CAP_PAGE_NUM_SHIFT 16 +#define ATA_SUP_CAP_REV_MASK 0x00000000000000ff + uint8_t sup_cap[8]; +#define ATA_SUP_CAP_VALID 0x8000000000000000 +#define ATA_SC_SET_SECT_CONFIG_SUP 0x0002000000000000 /* Set Sect Conf*/ +#define ATA_SC_ZERO_EXT_SUP 0x0001000000000000 /* Zero EXT */ +#define ATA_SC_SUCC_NCQ_SENSE_SUP 0x0000800000000000 /* Succ. NCQ Sns */ +#define ATA_SC_DLC_SUP 0x0000400000000000 /* DLC */ +#define ATA_SC_RQSN_DEV_FAULT_SUP 0x0000200000000000 /* Req Sns Dev Flt*/ +#define ATA_SC_DSN_SUP 0x0000100000000000 /* DSN */ +#define ATA_SC_LP_STANDBY_SUP 0x0000080000000000 /* LP Standby */ +#define ATA_SC_SET_EPC_PS_SUP 0x0000040000000000 /* Set EPC PS */ +#define ATA_SC_AMAX_ADDR_SUP 0x0000020000000000 /* AMAX Addr */ +#define ATA_SC_DRAT_SUP 0x0000008000000000 /* DRAT */ +#define ATA_SC_LPS_MISALGN_SUP 0x0000004000000000 /* LPS Misalign */ +#define ATA_SC_RB_DMA_SUP 0x0000001000000000 /* Read Buf DMA */ +#define ATA_SC_WB_DMA_SUP 0x0000000800000000 /* Write Buf DMA */ +#define ATA_SC_DNLD_MC_DMA_SUP 0x0000000200000000 /* DL MCode DMA */ +#define ATA_SC_28BIT_SUP 0x0000000100000000 /* 28-bit */ +#define ATA_SC_RZAT_SUP 0x0000000080000000 /* RZAT */ +#define ATA_SC_NOP_SUP 0x0000000020000000 /* NOP */ +#define ATA_SC_READ_BUFFER_SUP 0x0000000010000000 /* Read Buffer */ +#define ATA_SC_WRITE_BUFFER_SUP 0x0000000008000000 /* Write Buffer */ +#define ATA_SC_READ_LOOK_AHEAD_SUP 0x0000000002000000 /* Read Look-Ahead*/ +#define ATA_SC_VOLATILE_WC_SUP 0x0000000001000000 /* Volatile WC */ +#define ATA_SC_SMART_SUP 0x0000000000800000 /* SMART */ +#define ATA_SC_FLUSH_CACHE_EXT_SUP 0x0000000000400000 /* Flush Cache Ext */ +#define ATA_SC_48BIT_SUP 0x0000000000100000 /* 48-Bit */ +#define ATA_SC_SPINUP_SUP 0x0000000000040000 /* Spin-Up */ +#define ATA_SC_PUIS_SUP 0x0000000000020000 /* PUIS */ +#define ATA_SC_APM_SUP 0x0000000000010000 /* APM */ +#define ATA_SC_DL_MICROCODE_SUP 0x0000000000004000 /* DL Microcode */ +#define ATA_SC_UNLOAD_SUP 0x0000000000002000 /* Unload */ +#define ATA_SC_WRITE_FUA_EXT_SUP 0x0000000000001000 /* Write FUA EXT */ +#define ATA_SC_GPL_SUP 0x0000000000000800 /* GPL */ +#define ATA_SC_STREAMING_SUP 0x0000000000000400 /* Streaming */ +#define ATA_SC_SMART_SELFTEST_SUP 0x0000000000000100 /* SMART self-test */ +#define 
ATA_SC_SMART_ERR_LOG_SUP 0x0000000000000080 /* SMART Err Log */ +#define ATA_SC_EPC_SUP 0x0000000000000040 /* EPC */ +#define ATA_SC_SENSE_SUP 0x0000000000000020 /* Sense data */ +#define ATA_SC_FREEFALL_SUP 0x0000000000000010 /* Free-Fall */ +#define ATA_SC_DM_MODE3_SUP 0x0000000000000008 /* DM Mode 3 */ +#define ATA_SC_GPL_DMA_SUP 0x0000000000000004 /* GPL DMA */ +#define ATA_SC_WRITE_UNCOR_SUP 0x0000000000000002 /* Write uncorr. */ +#define ATA_SC_WRV_SUP 0x0000000000000001 /* WRV */ + uint8_t download_code_cap[8]; +#define ATA_DL_CODE_VALID 0x8000000000000000 +#define ATA_DLC_DM_OFFSETS_DEFER_SUP 0x0000000400000000 +#define ATA_DLC_DM_IMMED_SUP 0x0000000200000000 +#define ATA_DLC_DM_OFF_IMMED_SUP 0x0000000100000000 +#define ATA_DLC_DM_MAX_XFER_SIZE_MASK 0x00000000ffff0000 +#define ATA_DLC_DM_MAX_XFER_SIZE_SHIFT 16 +#define ATA_DLC_DM_MIN_XFER_SIZE_MASK 0x000000000000ffff + uint8_t nom_media_rotation_rate[8]; +#define ATA_NOM_MEDIA_ROTATION_VALID 0x8000000000000000 +#define ATA_ROTATION_MASK 0x000000000000ffff + uint8_t form_factor[8]; +#define ATA_FORM_FACTOR_VALID 0x8000000000000000 +#define ATA_FF_MASK 0x000000000000000f +#define ATA_FF_NOT_REPORTED 0x0000000000000000 /* Not reported */ +#define ATA_FF_525_IN 0x0000000000000001 /* 5.25 inch */ +#define ATA_FF_35_IN 0x0000000000000002 /* 3.5 inch */ +#define ATA_FF_25_IN 0x0000000000000003 /* 2.5 inch */ +#define ATA_FF_18_IN 0x0000000000000004 /* 1.8 inch */ +#define ATA_FF_LT_18_IN 0x0000000000000005 /* < 1.8 inch */ +#define ATA_FF_MSATA 0x0000000000000006 /* mSATA */ +#define ATA_FF_M2 0x0000000000000007 /* M.2 */ +#define ATA_FF_MICROSSD 0x0000000000000008 /* MicroSSD */ +#define ATA_FF_CFAST 0x0000000000000009 /* CFast */ + uint8_t wrv_sec_cnt_mode3[8]; +#define ATA_WRV_MODE3_VALID 0x8000000000000000 +#define ATA_WRV_MODE3_COUNT 0x00000000ffffffff + uint8_t wrv_sec_cnt_mode2[8]; +#define ATA_WRV_MODE2_VALID 0x8000000000000000 +#define ATA_WRV_MODE2_COUNT 0x00000000ffffffff + uint8_t wwn[16]; + /* XXX KDM need to figure out how to handle 128-bit fields */ + uint8_t dsm[8]; +#define ATA_DSM_VALID 0x8000000000000000 +#define ATA_LB_MARKUP_SUP 0x000000000000ff00 +#define ATA_TRIM_SUP 0x0000000000000001 + uint8_t util_per_unit_time[16]; + /* XXX KDM need to figure out how to handle 128-bit fields */ + uint8_t util_usage_rate_sup[8]; +#define ATA_UTIL_USAGE_RATE_VALID 0x8000000000000000 +#define ATA_SETTING_RATE_SUP 0x0000000000800000 +#define ATA_SINCE_POWERON_SUP 0x0000000000000100 +#define ATA_POH_RATE_SUP 0x0000000000000010 +#define ATA_DATE_TIME_RATE_SUP 0x0000000000000001 + uint8_t zoned_cap[8]; +#define ATA_ZONED_VALID 0x8000000000000000 +#define ATA_ZONED_MASK 0x0000000000000003 + uint8_t sup_zac_cap[8]; +#define ATA_SUP_ZAC_CAP_VALID 0x8000000000000000 +#define ATA_ND_RWP_SUP 0x0000000000000010 /* Reset Write Ptr*/ +#define ATA_ND_FINISH_ZONE_SUP 0x0000000000000008 /* Finish Zone */ +#define ATA_ND_CLOSE_ZONE_SUP 0x0000000000000004 /* Close Zone */ +#define ATA_ND_OPEN_ZONE_SUP 0x0000000000000002 /* Open Zone */ +#define ATA_REPORT_ZONES_SUP 0x0000000000000001 /* Report Zones */ + uint8_t reserved[392]; +}; + +/* + * ATA Identify Device Data Log Zoned Device Information Page (0x09). + * Current as of ZAC r04a, August 25, 2015. 
+ */ +struct ata_zoned_info_log { + uint8_t header[8]; +#define ATA_ZDI_HEADER_VALID 0x8000000000000000 +#define ATA_ZDI_PAGE_NUM_MASK 0x0000000000ff0000 +#define ATA_ZDI_PAGE_NUM_SHIFT 16 +#define ATA_ZDI_REV_MASK 0x00000000000000ff + uint8_t zoned_cap[8]; +#define ATA_ZDI_CAP_VALID 0x8000000000000000 +#define ATA_ZDI_CAP_URSWRZ 0x0000000000000001 + uint8_t zoned_settings[8]; +#define ATA_ZDI_SETTINGS_VALID 0x8000000000000000 + uint8_t optimal_seq_zones[8]; +#define ATA_ZDI_OPT_SEQ_VALID 0x8000000000000000 +#define ATA_ZDI_OPT_SEQ_MASK 0x00000000ffffffff + uint8_t optimal_nonseq_zones[8]; +#define ATA_ZDI_OPT_NS_VALID 0x8000000000000000 +#define ATA_ZDI_OPT_NS_MASK 0x00000000ffffffff + uint8_t max_seq_req_zones[8]; +#define ATA_ZDI_MAX_SEQ_VALID 0x8000000000000000 +#define ATA_ZDI_MAX_SEQ_MASK 0x00000000ffffffff + uint8_t version_info[8]; +#define ATA_ZDI_VER_VALID 0x8000000000000000 +#define ATA_ZDI_VER_ZAC_SUP 0x0100000000000000 +#define ATA_ZDI_VER_ZAC_MASK 0x00000000000000ff + uint8_t reserved[456]; +}; + struct ata_ioc_request { union { struct { diff --git a/usr/contrib/freebsd/sys/linker_set.h b/usr/contrib/freebsd/sys/linker_set.h deleted file mode 100644 index 393dfbc131..0000000000 --- a/usr/contrib/freebsd/sys/linker_set.h +++ /dev/null @@ -1,119 +0,0 @@ -/*- - * Copyright (c) 1999 John D. Polstra - * Copyright (c) 1999,2001 Peter Wemm <peter@FreeBSD.org> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: head/sys/sys/linker_set.h 215701 2010-11-22 19:32:54Z dim $ - */ - -#ifndef _SYS_LINKER_SET_H_ -#define _SYS_LINKER_SET_H_ - -#ifdef __FreeBSD__ -#ifndef _SYS_CDEFS_H_ -#error this file needs sys/cdefs.h as a prerequisite -#endif -#else -#ifndef _COMPAT_FREEBSD_SYS_CDEFS_H_ -#error this file needs sys/cdefs.h as a prerequisite -#endif -#endif - -/* - * The following macros are used to declare global sets of objects, which - * are collected by the linker into a `linker_set' as defined below. - * For ELF, this is done by constructing a separate segment for each set. - */ - -/* - * Private macros, not to be used outside this header file. 
- */ -#ifdef __GNUCLIKE___SECTION -#ifdef __FreeBSD__ -#define __MAKE_SET(set, sym) \ - __GLOBL(__CONCAT(__start_set_,set)); \ - __GLOBL(__CONCAT(__stop_set_,set)); \ - static void const * const __set_##set##_sym_##sym \ - __section("set_" #set) __used = &sym -#else -#define __MAKE_SET(set, sym) \ - static void const * const __set_##set##_sym_##sym \ - __section("set_" #set) __used = &sym -#endif -#else /* !__GNUCLIKE___SECTION */ -#ifndef lint -#error this file needs to be ported to your compiler -#endif /* lint */ -#define __MAKE_SET(set, sym) extern void const * const (__set_##set##_sym_##sym) -#endif /* __GNUCLIKE___SECTION */ - -/* - * Public macros. - */ -#define TEXT_SET(set, sym) __MAKE_SET(set, sym) -#define DATA_SET(set, sym) __MAKE_SET(set, sym) -#define BSS_SET(set, sym) __MAKE_SET(set, sym) -#define ABS_SET(set, sym) __MAKE_SET(set, sym) -#define SET_ENTRY(set, sym) __MAKE_SET(set, sym) - -/* - * Initialize before referring to a given linker set. - */ -#ifdef __FreeBSD__ -#define SET_DECLARE(set, ptype) \ - extern ptype *__CONCAT(__start_set_,set); \ - extern ptype *__CONCAT(__stop_set_,set) -#else -#define SET_DECLARE(set, ptype) \ - _Pragma(__XSTRING(weak __CONCAT(__start_set_,set))) \ - _Pragma(__XSTRING(weak __CONCAT(__stop_set_,set))) \ - extern ptype *__CONCAT(__start_set_,set); \ - extern ptype *__CONCAT(__stop_set_,set) -#endif - -#define SET_BEGIN(set) \ - (&__CONCAT(__start_set_,set)) -#define SET_LIMIT(set) \ - (&__CONCAT(__stop_set_,set)) - -/* - * Iterate over all the elements of a set. - * - * Sets always contain addresses of things, and "pvar" points to words - * containing those addresses. Thus is must be declared as "type **pvar", - * and the address of each set item is obtained inside the loop by "*pvar". - */ -#define SET_FOREACH(pvar, set) \ - for (pvar = SET_BEGIN(set); pvar < SET_LIMIT(set); pvar++) - -#define SET_ITEM(set, i) \ - ((SET_BEGIN(set))[i]) - -/* - * Provide a count of the items in a set. - */ -#define SET_COUNT(set) \ - (SET_LIMIT(set) - SET_BEGIN(set)) - -#endif /* _SYS_LINKER_SET_H_ */ diff --git a/usr/contrib/freebsd/sys/pciio.h b/usr/contrib/freebsd/sys/pciio.h new file mode 100644 index 0000000000..d70bfbcf6f --- /dev/null +++ b/usr/contrib/freebsd/sys/pciio.h @@ -0,0 +1,146 @@ +/*- + * Copyright (c) 1997, Stefan Esser <se@FreeBSD.ORG> + * Copyright (c) 1997, 1998, 1999, Kenneth D. Merry <ken@FreeBSD.ORG> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef _SYS_PCIIO_H_
+#define _SYS_PCIIO_H_
+
+#include <sys/ioccom.h>
+
+#define PCI_MAXNAMELEN 16
+
+typedef enum {
+	PCI_GETCONF_LAST_DEVICE,
+	PCI_GETCONF_LIST_CHANGED,
+	PCI_GETCONF_MORE_DEVS,
+	PCI_GETCONF_ERROR
+} pci_getconf_status;
+
+typedef enum {
+	PCI_GETCONF_NO_MATCH = 0x0000,
+	PCI_GETCONF_MATCH_DOMAIN = 0x0001,
+	PCI_GETCONF_MATCH_BUS = 0x0002,
+	PCI_GETCONF_MATCH_DEV = 0x0004,
+	PCI_GETCONF_MATCH_FUNC = 0x0008,
+	PCI_GETCONF_MATCH_NAME = 0x0010,
+	PCI_GETCONF_MATCH_UNIT = 0x0020,
+	PCI_GETCONF_MATCH_VENDOR = 0x0040,
+	PCI_GETCONF_MATCH_DEVICE = 0x0080,
+	PCI_GETCONF_MATCH_CLASS = 0x0100
+} pci_getconf_flags;
+
+struct pcisel {
+	u_int32_t pc_domain; /* domain number */
+	u_int8_t pc_bus; /* bus number */
+	u_int8_t pc_dev; /* device on this bus */
+	u_int8_t pc_func; /* function on this device */
+};
+
+struct pci_conf {
+	struct pcisel pc_sel; /* domain+bus+slot+function */
+	u_int8_t pc_hdr; /* PCI header type */
+	u_int16_t pc_subvendor; /* card vendor ID */
+	u_int16_t pc_subdevice; /* card device ID, assigned by card vendor */
+	u_int16_t pc_vendor; /* chip vendor ID */
+	u_int16_t pc_device; /* chip device ID, assigned by chip vendor */
+	u_int8_t pc_class; /* chip PCI class */
+	u_int8_t pc_subclass; /* chip PCI subclass */
+	u_int8_t pc_progif; /* chip PCI programming interface */
+	u_int8_t pc_revid; /* chip revision ID */
+	char pd_name[PCI_MAXNAMELEN + 1]; /* device name */
+	u_long pd_unit; /* device unit number */
+};
+
+struct pci_match_conf {
+	struct pcisel pc_sel; /* domain+bus+slot+function */
+	char pd_name[PCI_MAXNAMELEN + 1]; /* device name */
+	u_long pd_unit; /* Unit number */
+	u_int16_t pc_vendor; /* PCI Vendor ID */
+	u_int16_t pc_device; /* PCI Device ID */
+	u_int8_t pc_class; /* PCI class */
+	pci_getconf_flags flags; /* Matching expression */
+};
+
+struct pci_conf_io {
+	u_int32_t pat_buf_len; /* pattern buffer length */
+	u_int32_t num_patterns; /* number of patterns */
+	struct pci_match_conf *patterns; /* pattern buffer */
+	u_int32_t match_buf_len; /* match buffer length */
+	u_int32_t num_matches; /* number of matches returned */
+	struct pci_conf *matches; /* match buffer */
+	u_int32_t offset; /* offset into device list */
+	u_int32_t generation; /* device list generation */
+	pci_getconf_status status; /* request status */
+};
+
+struct pci_io {
+	struct pcisel pi_sel; /* device to operate on */
+	int pi_reg; /* configuration register to examine */
+	int pi_width; /* width (in bytes) of read or write */
+	u_int32_t pi_data; /* data to write or result of read */
+};
+
+struct pci_bar_io {
+	struct pcisel pbi_sel; /* device to operate on */
+	int pbi_reg; /* starting address of BAR */
+	int pbi_enabled; /* decoding enabled */
+	uint64_t pbi_base; /* current value of BAR */
+	uint64_t pbi_length; /* length of BAR */
+};
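The structures above pair with the ioctls defined at the end of this header; struct pci_io, for instance, drives PCIOCREAD and PCIOCWRITE. A minimal, hedged sketch of a config-space read through FreeBSD's /dev/pci device follows (editorial example, not part of the diff; the chosen device address and the bare-bones error handling are illustrative only).

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/pciio.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct pci_io io;
	int fd;

	if ((fd = open("/dev/pci", O_RDONLY)) < 0)
		return (1);
	memset(&io, 0, sizeof (io));
	io.pi_sel.pc_domain = 0;	/* bus 0, device 0, function 0 */
	io.pi_sel.pc_bus = 0;
	io.pi_sel.pc_dev = 0;
	io.pi_sel.pc_func = 0;
	io.pi_reg = 0;			/* config offset 0: vendor/device ID */
	io.pi_width = 4;		/* 32-bit access */
	if (ioctl(fd, PCIOCREAD, &io) == 0)
		printf("vendor/device: 0x%08x\n", io.pi_data);
	(void) close(fd);
	return (0);
}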
+
+struct pci_vpd_element {
+	char pve_keyword[2];
+	uint8_t pve_flags;
+	uint8_t pve_datalen;
+	uint8_t pve_data[0];
+};
+
+#define PVE_FLAG_IDENT 0x01 /* Element is the string identifier */
+#define PVE_FLAG_RW 0x02 /* Element is read/write */
+
+#define PVE_NEXT(pve) \
+	((struct pci_vpd_element *)((char *)(pve) + \
+	sizeof(struct pci_vpd_element) + (pve)->pve_datalen))
+
+struct pci_list_vpd_io {
+	struct pcisel plvi_sel; /* device to operate on */
+	size_t plvi_len; /* size of the data area */
+	struct pci_vpd_element *plvi_data;
+};
+
+#define PCIOCGETCONF _IOWR('p', 5, struct pci_conf_io)
+#define PCIOCREAD _IOWR('p', 2, struct pci_io)
+#define PCIOCWRITE _IOWR('p', 3, struct pci_io)
+#define PCIOCATTACHED _IOWR('p', 4, struct pci_io)
+#define PCIOCGETBAR _IOWR('p', 6, struct pci_bar_io)
+#define PCIOCLISTVPD _IOWR('p', 7, struct pci_list_vpd_io)
+
+#endif /* !_SYS_PCIIO_H_ */
diff --git a/usr/contrib/freebsd/sys/queue.h b/usr/contrib/freebsd/sys/queue.h
new file mode 100644
index 0000000000..f26c492af1
--- /dev/null
+++ b/usr/contrib/freebsd/sys/queue.h
@@ -0,0 +1,787 @@
+/*-
+ * Copyright (c) 1991, 1993
+ *	The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ *    may be used to endorse or promote products derived from this software
+ *    without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)queue.h	8.5 (Berkeley) 8/20/94
+ * $FreeBSD$
+ */
+
+#ifndef _SYS_QUEUE_H_
+#define _SYS_QUEUE_H_
+
+#include <sys/cdefs.h>
+
+/*
+ * This file defines four types of data structures: singly-linked lists,
+ * singly-linked tail queues, lists and tail queues.
+ *
+ * A singly-linked list is headed by a single forward pointer. The elements
+ * are singly linked for minimum space and pointer manipulation overhead at
+ * the expense of O(n) removal for arbitrary elements. New elements can be
+ * added to the list after an existing element or at the head of the list.
+ * Elements being removed from the head of the list should use the explicit
+ * macro for this purpose for optimum efficiency. A singly-linked list may
+ * only be traversed in the forward direction. Singly-linked lists are ideal
+ * for applications with large datasets and few or no removals or for
+ * implementing a LIFO queue.
+ *
+ * A singly-linked tail queue is headed by a pair of pointers, one to the
+ * head of the list and the other to the tail of the list. The elements are
+ * singly linked for minimum space and pointer manipulation overhead at the
+ * expense of O(n) removal for arbitrary elements. New elements can be added
+ * to the list after an existing element, at the head of the list, or at the
+ * end of the list. Elements being removed from the head of the tail queue
+ * should use the explicit macro for this purpose for optimum efficiency.
+ * A singly-linked tail queue may only be traversed in the forward direction.
+ * Singly-linked tail queues are ideal for applications with large datasets
+ * and few or no removals or for implementing a FIFO queue.
+ *
+ * A list is headed by a single forward pointer (or an array of forward
+ * pointers for a hash table header). The elements are doubly linked
+ * so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before
+ * or after an existing element or at the head of the list. A list
+ * may be traversed in either direction.
+ *
+ * A tail queue is headed by a pair of pointers, one to the head of the
+ * list and the other to the tail of the list. The elements are doubly
+ * linked so that an arbitrary element can be removed without a need to
+ * traverse the list. New elements can be added to the list before or
+ * after an existing element, at the head of the list, or at the end of
+ * the list. A tail queue may be traversed in either direction.
+ *
+ * For details on the use of these macros, see the queue(3) manual page.
+ *
+ * Below is a summary of implemented functions where:
+ *  +  means the macro is available
+ *  -  means the macro is not available
+ *  s  means the macro is available but is slow (runs in O(n) time)
+ *
+ *				SLIST	LIST	STAILQ	TAILQ
+ * _HEAD			+	+	+	+
+ * _CLASS_HEAD			+	+	+	+
+ * _HEAD_INITIALIZER		+	+	+	+
+ * _ENTRY			+	+	+	+
+ * _CLASS_ENTRY			+	+	+	+
+ * _INIT			+	+	+	+
+ * _EMPTY			+	+	+	+
+ * _FIRST			+	+	+	+
+ * _NEXT			+	+	+	+
+ * _PREV			-	+	-	+
+ * _LAST			-	-	+	+
+ * _FOREACH			+	+	+	+
+ * _FOREACH_FROM		+	+	+	+
+ * _FOREACH_SAFE		+	+	+	+
+ * _FOREACH_FROM_SAFE		+	+	+	+
+ * _FOREACH_REVERSE		-	-	-	+
+ * _FOREACH_REVERSE_FROM	-	-	-	+
+ * _FOREACH_REVERSE_SAFE	-	-	-	+
+ * _FOREACH_REVERSE_FROM_SAFE	-	-	-	+
+ * _INSERT_HEAD			+	+	+	+
+ * _INSERT_BEFORE		-	+	-	+
+ * _INSERT_AFTER		+	+	+	+
+ * _INSERT_TAIL			-	-	+	+
+ * _CONCAT			s	s	+	+
+ * _REMOVE_AFTER		+	-	+	-
+ * _REMOVE_HEAD			+	-	+	-
+ * _REMOVE			s	+	s	+
+ * _SWAP			+	+	+	+
+ *
+ */
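To tie the summary table to real code before the macro definitions begin, here is a small, self-contained usage sketch (an editorial example, not part of the header being added). It exercises the TAILQ flavor, including the _SAFE iterator that permits freeing the current element during traversal.

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct node {
	int value;
	TAILQ_ENTRY(node) link;		/* embedded next/prev linkage */
};

TAILQ_HEAD(node_list, node);

int
main(void)
{
	struct node_list head = TAILQ_HEAD_INITIALIZER(head);
	struct node *np, *tmp;
	int i;

	for (i = 0; i < 3; i++) {
		if ((np = malloc(sizeof (*np))) == NULL)
			break;
		np->value = i;
		TAILQ_INSERT_TAIL(&head, np, link);
	}
	TAILQ_FOREACH(np, &head, link)
		printf("%d\n", np->value);
	/* The _SAFE form tolerates removal of the current element. */
	TAILQ_FOREACH_SAFE(np, &head, link, tmp) {
		TAILQ_REMOVE(&head, np, link);
		free(np);
	}
	return (0);
}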
+#ifdef QUEUE_MACRO_DEBUG
+/* Store the last 2 places the queue element or head was altered */
+struct qm_trace {
+	unsigned long lastline;
+	unsigned long prevline;
+	const char *lastfile;
+	const char *prevfile;
+};
+
+#define TRACEBUF struct qm_trace trace;
+#define TRACEBUF_INITIALIZER { __LINE__, 0, __FILE__, NULL } ,
+#define TRASHIT(x) do {(x) = (void *)-1;} while (0)
+#define QMD_SAVELINK(name, link) void **name = (void *)&(link)
+
+#define QMD_TRACE_HEAD(head) do { \
+	(head)->trace.prevline = (head)->trace.lastline; \
+	(head)->trace.prevfile = (head)->trace.lastfile; \
+	(head)->trace.lastline = __LINE__; \
+	(head)->trace.lastfile = __FILE__; \
+} while (0)
+
+#define QMD_TRACE_ELEM(elem) do { \
+	(elem)->trace.prevline = (elem)->trace.lastline; \
+	(elem)->trace.prevfile = (elem)->trace.lastfile; \
+	(elem)->trace.lastline = __LINE__; \
+	(elem)->trace.lastfile = __FILE__; \
+} while (0)
+
+#else
+#define QMD_TRACE_ELEM(elem)
+#define QMD_TRACE_HEAD(head)
+#define QMD_SAVELINK(name, link)
+#define TRACEBUF
+#define TRACEBUF_INITIALIZER
+#define TRASHIT(x)
+#endif /* QUEUE_MACRO_DEBUG */
+
+#ifdef __cplusplus
+/*
+ * In C++ there can be structure lists and class lists:
+ */
+#define QUEUE_TYPEOF(type) type
+#else
+#define QUEUE_TYPEOF(type) struct type
+#endif
+
+/*
+ * Singly-linked List declarations.
+ */
+#define SLIST_HEAD(name, type) \
+struct name { \
+	struct type *slh_first; /* first element */ \
+}
+
+#define SLIST_CLASS_HEAD(name, type) \
+struct name { \
+	class type *slh_first; /* first element */ \
+}
+
+#define SLIST_HEAD_INITIALIZER(head) \
+	{ NULL }
+
+#define SLIST_ENTRY(type) \
+struct { \
+	struct type *sle_next; /* next element */ \
+}
+
+#define SLIST_CLASS_ENTRY(type) \
+struct { \
+	class type *sle_next; /* next element */ \
+}
+
+/*
+ * Singly-linked List functions.
+ */
+#define SLIST_CONCAT(head1, head2, type, field) do { \
+	QUEUE_TYPEOF(type) *curelm = SLIST_FIRST(head1); \
+	if (curelm == NULL) { \
+		if ((SLIST_FIRST(head1) = SLIST_FIRST(head2)) != NULL) \
+			SLIST_INIT(head2); \
+	} else if (SLIST_FIRST(head2) != NULL) { \
+		while (SLIST_NEXT(curelm, field) != NULL) \
+			curelm = SLIST_NEXT(curelm, field); \
+		SLIST_NEXT(curelm, field) = SLIST_FIRST(head2); \
+		SLIST_INIT(head2); \
+	} \
+} while (0)
+
+#define SLIST_EMPTY(head) ((head)->slh_first == NULL)
+
+#define SLIST_FIRST(head) ((head)->slh_first)
+
+#define SLIST_FOREACH(var, head, field) \
+	for ((var) = SLIST_FIRST((head)); \
+	    (var); \
+	    (var) = SLIST_NEXT((var), field))
+
+#define SLIST_FOREACH_FROM(var, head, field) \
+	for ((var) = ((var) ? (var) : SLIST_FIRST((head))); \
+	    (var); \
+	    (var) = SLIST_NEXT((var), field))
+
+#define SLIST_FOREACH_SAFE(var, head, field, tvar) \
+	for ((var) = SLIST_FIRST((head)); \
+	    (var) && ((tvar) = SLIST_NEXT((var), field), 1); \
+	    (var) = (tvar))
+
+#define SLIST_FOREACH_FROM_SAFE(var, head, field, tvar) \
+	for ((var) = ((var) ? 
(var) : SLIST_FIRST((head))); \ + (var) && ((tvar) = SLIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define SLIST_FOREACH_PREVPTR(var, varp, head, field) \ + for ((varp) = &SLIST_FIRST((head)); \ + ((var) = *(varp)) != NULL; \ + (varp) = &SLIST_NEXT((var), field)) + +#define SLIST_INIT(head) do { \ + SLIST_FIRST((head)) = NULL; \ +} while (0) + +#define SLIST_INSERT_AFTER(slistelm, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_NEXT((slistelm), field); \ + SLIST_NEXT((slistelm), field) = (elm); \ +} while (0) + +#define SLIST_INSERT_HEAD(head, elm, field) do { \ + SLIST_NEXT((elm), field) = SLIST_FIRST((head)); \ + SLIST_FIRST((head)) = (elm); \ +} while (0) + +#define SLIST_NEXT(elm, field) ((elm)->field.sle_next) + +#define SLIST_REMOVE(head, elm, type, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.sle_next); \ + if (SLIST_FIRST((head)) == (elm)) { \ + SLIST_REMOVE_HEAD((head), field); \ + } \ + else { \ + QUEUE_TYPEOF(type) *curelm = SLIST_FIRST(head); \ + while (SLIST_NEXT(curelm, field) != (elm)) \ + curelm = SLIST_NEXT(curelm, field); \ + SLIST_REMOVE_AFTER(curelm, field); \ + } \ + TRASHIT(*oldnext); \ +} while (0) + +#define SLIST_REMOVE_AFTER(elm, field) do { \ + SLIST_NEXT(elm, field) = \ + SLIST_NEXT(SLIST_NEXT(elm, field), field); \ +} while (0) + +#define SLIST_REMOVE_HEAD(head, field) do { \ + SLIST_FIRST((head)) = SLIST_NEXT(SLIST_FIRST((head)), field); \ +} while (0) + +#define SLIST_SWAP(head1, head2, type) do { \ + QUEUE_TYPEOF(type) *swap_first = SLIST_FIRST(head1); \ + SLIST_FIRST(head1) = SLIST_FIRST(head2); \ + SLIST_FIRST(head2) = swap_first; \ +} while (0) + +/* + * Singly-linked Tail queue declarations. + */ +#define STAILQ_HEAD(name, type) \ +struct name { \ + struct type *stqh_first;/* first element */ \ + struct type **stqh_last;/* addr of last next element */ \ +} + +#define STAILQ_CLASS_HEAD(name, type) \ +struct name { \ + class type *stqh_first; /* first element */ \ + class type **stqh_last; /* addr of last next element */ \ +} + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).stqh_first } + +#define STAILQ_ENTRY(type) \ +struct { \ + struct type *stqe_next; /* next element */ \ +} + +#define STAILQ_CLASS_ENTRY(type) \ +struct { \ + class type *stqe_next; /* next element */ \ +} + +/* + * Singly-linked Tail queue functions. + */ +#define STAILQ_CONCAT(head1, head2) do { \ + if (!STAILQ_EMPTY((head2))) { \ + *(head1)->stqh_last = (head2)->stqh_first; \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_INIT((head2)); \ + } \ +} while (0) + +#define STAILQ_EMPTY(head) ((head)->stqh_first == NULL) + +#define STAILQ_FIRST(head) ((head)->stqh_first) + +#define STAILQ_FOREACH(var, head, field) \ + for((var) = STAILQ_FIRST((head)); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_FOREACH_FROM(var, head, field) \ + for ((var) = ((var) ? (var) : STAILQ_FIRST((head))); \ + (var); \ + (var) = STAILQ_NEXT((var), field)) + +#define STAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = STAILQ_FIRST((head)); \ + (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define STAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \ + for ((var) = ((var) ? 
(var) : STAILQ_FIRST((head))); \ + (var) && ((tvar) = STAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define STAILQ_INIT(head) do { \ + STAILQ_FIRST((head)) = NULL; \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_INSERT_AFTER(head, tqelm, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_NEXT((tqelm), field)) == NULL)\ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_NEXT((tqelm), field) = (elm); \ +} while (0) + +#define STAILQ_INSERT_HEAD(head, elm, field) do { \ + if ((STAILQ_NEXT((elm), field) = STAILQ_FIRST((head))) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ + STAILQ_FIRST((head)) = (elm); \ +} while (0) + +#define STAILQ_INSERT_TAIL(head, elm, field) do { \ + STAILQ_NEXT((elm), field) = NULL; \ + *(head)->stqh_last = (elm); \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY((head)) ? NULL : \ + __containerof((head)->stqh_last, \ + QUEUE_TYPEOF(type), field.stqe_next)) + +#define STAILQ_NEXT(elm, field) ((elm)->field.stqe_next) + +#define STAILQ_REMOVE(head, elm, type, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.stqe_next); \ + if (STAILQ_FIRST((head)) == (elm)) { \ + STAILQ_REMOVE_HEAD((head), field); \ + } \ + else { \ + QUEUE_TYPEOF(type) *curelm = STAILQ_FIRST(head); \ + while (STAILQ_NEXT(curelm, field) != (elm)) \ + curelm = STAILQ_NEXT(curelm, field); \ + STAILQ_REMOVE_AFTER(head, curelm, field); \ + } \ + TRASHIT(*oldnext); \ +} while (0) + +#define STAILQ_REMOVE_AFTER(head, elm, field) do { \ + if ((STAILQ_NEXT(elm, field) = \ + STAILQ_NEXT(STAILQ_NEXT(elm, field), field)) == NULL) \ + (head)->stqh_last = &STAILQ_NEXT((elm), field); \ +} while (0) + +#define STAILQ_REMOVE_HEAD(head, field) do { \ + if ((STAILQ_FIRST((head)) = \ + STAILQ_NEXT(STAILQ_FIRST((head)), field)) == NULL) \ + (head)->stqh_last = &STAILQ_FIRST((head)); \ +} while (0) + +#define STAILQ_SWAP(head1, head2, type) do { \ + QUEUE_TYPEOF(type) *swap_first = STAILQ_FIRST(head1); \ + QUEUE_TYPEOF(type) **swap_last = (head1)->stqh_last; \ + STAILQ_FIRST(head1) = STAILQ_FIRST(head2); \ + (head1)->stqh_last = (head2)->stqh_last; \ + STAILQ_FIRST(head2) = swap_first; \ + (head2)->stqh_last = swap_last; \ + if (STAILQ_EMPTY(head1)) \ + (head1)->stqh_last = &STAILQ_FIRST(head1); \ + if (STAILQ_EMPTY(head2)) \ + (head2)->stqh_last = &STAILQ_FIRST(head2); \ +} while (0) + + +/* + * List declarations. + */ +#define LIST_HEAD(name, type) \ +struct name { \ + struct type *lh_first; /* first element */ \ +} + +#define LIST_CLASS_HEAD(name, type) \ +struct name { \ + class type *lh_first; /* first element */ \ +} + +#define LIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define LIST_ENTRY(type) \ +struct { \ + struct type *le_next; /* next element */ \ + struct type **le_prev; /* address of previous next element */ \ +} + +#define LIST_CLASS_ENTRY(type) \ +struct { \ + class type *le_next; /* next element */ \ + class type **le_prev; /* address of previous next element */ \ +} + +/* + * List functions. 
+ */ + +#if (defined(_KERNEL) && defined(INVARIANTS)) +#define QMD_LIST_CHECK_HEAD(head, field) do { \ + if (LIST_FIRST((head)) != NULL && \ + LIST_FIRST((head))->field.le_prev != \ + &LIST_FIRST((head))) \ + panic("Bad list head %p first->prev != head", (head)); \ +} while (0) + +#define QMD_LIST_CHECK_NEXT(elm, field) do { \ + if (LIST_NEXT((elm), field) != NULL && \ + LIST_NEXT((elm), field)->field.le_prev != \ + &((elm)->field.le_next)) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ +} while (0) + +#define QMD_LIST_CHECK_PREV(elm, field) do { \ + if (*(elm)->field.le_prev != (elm)) \ + panic("Bad link elm %p prev->next != elm", (elm)); \ +} while (0) +#else +#define QMD_LIST_CHECK_HEAD(head, field) +#define QMD_LIST_CHECK_NEXT(elm, field) +#define QMD_LIST_CHECK_PREV(elm, field) +#endif /* (_KERNEL && INVARIANTS) */ + +#define LIST_CONCAT(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) *curelm = LIST_FIRST(head1); \ + if (curelm == NULL) { \ + if ((LIST_FIRST(head1) = LIST_FIRST(head2)) != NULL) { \ + LIST_FIRST(head2)->field.le_prev = \ + &LIST_FIRST((head1)); \ + LIST_INIT(head2); \ + } \ + } else if (LIST_FIRST(head2) != NULL) { \ + while (LIST_NEXT(curelm, field) != NULL) \ + curelm = LIST_NEXT(curelm, field); \ + LIST_NEXT(curelm, field) = LIST_FIRST(head2); \ + LIST_FIRST(head2)->field.le_prev = &LIST_NEXT(curelm, field); \ + LIST_INIT(head2); \ + } \ +} while (0) + +#define LIST_EMPTY(head) ((head)->lh_first == NULL) + +#define LIST_FIRST(head) ((head)->lh_first) + +#define LIST_FOREACH(var, head, field) \ + for ((var) = LIST_FIRST((head)); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_FOREACH_FROM(var, head, field) \ + for ((var) = ((var) ? (var) : LIST_FIRST((head))); \ + (var); \ + (var) = LIST_NEXT((var), field)) + +#define LIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = LIST_FIRST((head)); \ + (var) && ((tvar) = LIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define LIST_FOREACH_FROM_SAFE(var, head, field, tvar) \ + for ((var) = ((var) ? (var) : LIST_FIRST((head))); \ + (var) && ((tvar) = LIST_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define LIST_INIT(head) do { \ + LIST_FIRST((head)) = NULL; \ +} while (0) + +#define LIST_INSERT_AFTER(listelm, elm, field) do { \ + QMD_LIST_CHECK_NEXT(listelm, field); \ + if ((LIST_NEXT((elm), field) = LIST_NEXT((listelm), field)) != NULL)\ + LIST_NEXT((listelm), field)->field.le_prev = \ + &LIST_NEXT((elm), field); \ + LIST_NEXT((listelm), field) = (elm); \ + (elm)->field.le_prev = &LIST_NEXT((listelm), field); \ +} while (0) + +#define LIST_INSERT_BEFORE(listelm, elm, field) do { \ + QMD_LIST_CHECK_PREV(listelm, field); \ + (elm)->field.le_prev = (listelm)->field.le_prev; \ + LIST_NEXT((elm), field) = (listelm); \ + *(listelm)->field.le_prev = (elm); \ + (listelm)->field.le_prev = &LIST_NEXT((elm), field); \ +} while (0) + +#define LIST_INSERT_HEAD(head, elm, field) do { \ + QMD_LIST_CHECK_HEAD((head), field); \ + if ((LIST_NEXT((elm), field) = LIST_FIRST((head))) != NULL) \ + LIST_FIRST((head))->field.le_prev = &LIST_NEXT((elm), field);\ + LIST_FIRST((head)) = (elm); \ + (elm)->field.le_prev = &LIST_FIRST((head)); \ +} while (0) + +#define LIST_NEXT(elm, field) ((elm)->field.le_next) + +#define LIST_PREV(elm, head, type, field) \ + ((elm)->field.le_prev == &LIST_FIRST((head)) ? 
NULL : \ + __containerof((elm)->field.le_prev, \ + QUEUE_TYPEOF(type), field.le_next)) + +#define LIST_REMOVE(elm, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.le_next); \ + QMD_SAVELINK(oldprev, (elm)->field.le_prev); \ + QMD_LIST_CHECK_NEXT(elm, field); \ + QMD_LIST_CHECK_PREV(elm, field); \ + if (LIST_NEXT((elm), field) != NULL) \ + LIST_NEXT((elm), field)->field.le_prev = \ + (elm)->field.le_prev; \ + *(elm)->field.le_prev = LIST_NEXT((elm), field); \ + TRASHIT(*oldnext); \ + TRASHIT(*oldprev); \ +} while (0) + +#define LIST_SWAP(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) *swap_tmp = LIST_FIRST(head1); \ + LIST_FIRST((head1)) = LIST_FIRST((head2)); \ + LIST_FIRST((head2)) = swap_tmp; \ + if ((swap_tmp = LIST_FIRST((head1))) != NULL) \ + swap_tmp->field.le_prev = &LIST_FIRST((head1)); \ + if ((swap_tmp = LIST_FIRST((head2))) != NULL) \ + swap_tmp->field.le_prev = &LIST_FIRST((head2)); \ +} while (0) + +/* + * Tail queue declarations. + */ +#define TAILQ_HEAD(name, type) \ +struct name { \ + struct type *tqh_first; /* first element */ \ + struct type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_CLASS_HEAD(name, type) \ +struct name { \ + class type *tqh_first; /* first element */ \ + class type **tqh_last; /* addr of last next element */ \ + TRACEBUF \ +} + +#define TAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).tqh_first, TRACEBUF_INITIALIZER } + +#define TAILQ_ENTRY(type) \ +struct { \ + struct type *tqe_next; /* next element */ \ + struct type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +#define TAILQ_CLASS_ENTRY(type) \ +struct { \ + class type *tqe_next; /* next element */ \ + class type **tqe_prev; /* address of previous next element */ \ + TRACEBUF \ +} + +/* + * Tail queue functions. + */ +#if (defined(_KERNEL) && defined(INVARIANTS)) +#define QMD_TAILQ_CHECK_HEAD(head, field) do { \ + if (!TAILQ_EMPTY(head) && \ + TAILQ_FIRST((head))->field.tqe_prev != \ + &TAILQ_FIRST((head))) \ + panic("Bad tailq head %p first->prev != head", (head)); \ +} while (0) + +#define QMD_TAILQ_CHECK_TAIL(head, field) do { \ + if (*(head)->tqh_last != NULL) \ + panic("Bad tailq NEXT(%p->tqh_last) != NULL", (head)); \ +} while (0) + +#define QMD_TAILQ_CHECK_NEXT(elm, field) do { \ + if (TAILQ_NEXT((elm), field) != NULL && \ + TAILQ_NEXT((elm), field)->field.tqe_prev != \ + &((elm)->field.tqe_next)) \ + panic("Bad link elm %p next->prev != elm", (elm)); \ +} while (0) + +#define QMD_TAILQ_CHECK_PREV(elm, field) do { \ + if (*(elm)->field.tqe_prev != (elm)) \ + panic("Bad link elm %p prev->next != elm", (elm)); \ +} while (0) +#else +#define QMD_TAILQ_CHECK_HEAD(head, field) +#define QMD_TAILQ_CHECK_TAIL(head, headname) +#define QMD_TAILQ_CHECK_NEXT(elm, field) +#define QMD_TAILQ_CHECK_PREV(elm, field) +#endif /* (_KERNEL && INVARIANTS) */ + +#define TAILQ_CONCAT(head1, head2, field) do { \ + if (!TAILQ_EMPTY(head2)) { \ + *(head1)->tqh_last = (head2)->tqh_first; \ + (head2)->tqh_first->field.tqe_prev = (head1)->tqh_last; \ + (head1)->tqh_last = (head2)->tqh_last; \ + TAILQ_INIT((head2)); \ + QMD_TRACE_HEAD(head1); \ + QMD_TRACE_HEAD(head2); \ + } \ +} while (0) + +#define TAILQ_EMPTY(head) ((head)->tqh_first == NULL) + +#define TAILQ_FIRST(head) ((head)->tqh_first) + +#define TAILQ_FOREACH(var, head, field) \ + for ((var) = TAILQ_FIRST((head)); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_FROM(var, head, field) \ + for ((var) = ((var) ? 
(var) : TAILQ_FIRST((head))); \ + (var); \ + (var) = TAILQ_NEXT((var), field)) + +#define TAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = TAILQ_FIRST((head)); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \ + for ((var) = ((var) ? (var) : TAILQ_FIRST((head))); \ + (var) && ((tvar) = TAILQ_NEXT((var), field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_REVERSE(var, head, headname, field) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_FOREACH_REVERSE_FROM(var, head, headname, field) \ + for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \ + (var); \ + (var) = TAILQ_PREV((var), headname, field)) + +#define TAILQ_FOREACH_REVERSE_SAFE(var, head, headname, field, tvar) \ + for ((var) = TAILQ_LAST((head), headname); \ + (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ + (var) = (tvar)) + +#define TAILQ_FOREACH_REVERSE_FROM_SAFE(var, head, headname, field, tvar) \ + for ((var) = ((var) ? (var) : TAILQ_LAST((head), headname)); \ + (var) && ((tvar) = TAILQ_PREV((var), headname, field), 1); \ + (var) = (tvar)) + +#define TAILQ_INIT(head) do { \ + TAILQ_FIRST((head)) = NULL; \ + (head)->tqh_last = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ +} while (0) + +#define TAILQ_INSERT_AFTER(head, listelm, elm, field) do { \ + QMD_TAILQ_CHECK_NEXT(listelm, field); \ + if ((TAILQ_NEXT((elm), field) = TAILQ_NEXT((listelm), field)) != NULL)\ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else { \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + } \ + TAILQ_NEXT((listelm), field) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_NEXT((listelm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&(listelm)->field); \ +} while (0) + +#define TAILQ_INSERT_BEFORE(listelm, elm, field) do { \ + QMD_TAILQ_CHECK_PREV(listelm, field); \ + (elm)->field.tqe_prev = (listelm)->field.tqe_prev; \ + TAILQ_NEXT((elm), field) = (listelm); \ + *(listelm)->field.tqe_prev = (elm); \ + (listelm)->field.tqe_prev = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_ELEM(&(elm)->field); \ + QMD_TRACE_ELEM(&(listelm)->field); \ +} while (0) + +#define TAILQ_INSERT_HEAD(head, elm, field) do { \ + QMD_TAILQ_CHECK_HEAD(head, field); \ + if ((TAILQ_NEXT((elm), field) = TAILQ_FIRST((head))) != NULL) \ + TAILQ_FIRST((head))->field.tqe_prev = \ + &TAILQ_NEXT((elm), field); \ + else \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + TAILQ_FIRST((head)) = (elm); \ + (elm)->field.tqe_prev = &TAILQ_FIRST((head)); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_INSERT_TAIL(head, elm, field) do { \ + QMD_TAILQ_CHECK_TAIL(head, field); \ + TAILQ_NEXT((elm), field) = NULL; \ + (elm)->field.tqe_prev = (head)->tqh_last; \ + *(head)->tqh_last = (elm); \ + (head)->tqh_last = &TAILQ_NEXT((elm), field); \ + QMD_TRACE_HEAD(head); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_LAST(head, headname) \ + (*(((struct headname *)((head)->tqh_last))->tqh_last)) + +#define TAILQ_NEXT(elm, field) ((elm)->field.tqe_next) + +#define TAILQ_PREV(elm, headname, field) \ + (*(((struct headname *)((elm)->field.tqe_prev))->tqh_last)) + +#define TAILQ_REMOVE(head, elm, field) do { \ + QMD_SAVELINK(oldnext, (elm)->field.tqe_next); \ + QMD_SAVELINK(oldprev, (elm)->field.tqe_prev); \ + QMD_TAILQ_CHECK_NEXT(elm, field); \ + QMD_TAILQ_CHECK_PREV(elm, field); 
\ + if ((TAILQ_NEXT((elm), field)) != NULL) \ + TAILQ_NEXT((elm), field)->field.tqe_prev = \ + (elm)->field.tqe_prev; \ + else { \ + (head)->tqh_last = (elm)->field.tqe_prev; \ + QMD_TRACE_HEAD(head); \ + } \ + *(elm)->field.tqe_prev = TAILQ_NEXT((elm), field); \ + TRASHIT(*oldnext); \ + TRASHIT(*oldprev); \ + QMD_TRACE_ELEM(&(elm)->field); \ +} while (0) + +#define TAILQ_SWAP(head1, head2, type, field) do { \ + QUEUE_TYPEOF(type) *swap_first = (head1)->tqh_first; \ + QUEUE_TYPEOF(type) **swap_last = (head1)->tqh_last; \ + (head1)->tqh_first = (head2)->tqh_first; \ + (head1)->tqh_last = (head2)->tqh_last; \ + (head2)->tqh_first = swap_first; \ + (head2)->tqh_last = swap_last; \ + if ((swap_first = (head1)->tqh_first) != NULL) \ + swap_first->field.tqe_prev = &(head1)->tqh_first; \ + else \ + (head1)->tqh_last = &(head1)->tqh_first; \ + if ((swap_first = (head2)->tqh_first) != NULL) \ + swap_first->field.tqe_prev = &(head2)->tqh_first; \ + else \ + (head2)->tqh_last = &(head2)->tqh_first; \ +} while (0) + +#endif /* !_SYS_QUEUE_H_ */ diff --git a/usr/contrib/freebsd/x86/segments.h b/usr/contrib/freebsd/x86/segments.h new file mode 100644 index 0000000000..1b8c4a3c1c --- /dev/null +++ b/usr/contrib/freebsd/x86/segments.h @@ -0,0 +1,274 @@ +/*- + * Copyright (c) 1989, 1990 William F. Jolitz + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)segments.h 7.1 (Berkeley) 5/9/91 + * $FreeBSD$ + */ + +#ifndef _X86_SEGMENTS_H_ +#define _X86_SEGMENTS_H_ + +/* + * X86 Segmentation Data Structures and definitions + */ + +/* + * Selectors + */ +#define SEL_RPL_MASK 3 /* requester priv level */ +#define ISPL(s) ((s)&3) /* priority level of a selector */ +#define SEL_KPL 0 /* kernel priority level */ +#define SEL_UPL 3 /* user priority level */ +#define ISLDT(s) ((s)&SEL_LDT) /* is it local or global */ +#define SEL_LDT 4 /* local descriptor table */ +#define IDXSEL(s) (((s)>>3) & 0x1fff) /* index of selector */ +#define LSEL(s,r) (((s)<<3) | SEL_LDT | r) /* a local selector */ +#define GSEL(s,r) (((s)<<3) | r) /* a global selector */ + +/* + * User segment descriptors (%cs, %ds etc for i386 apps. 64 bit wide) + * For long-mode apps, %cs only has the conforming bit in sd_type, the sd_dpl, + * sd_p, sd_l and sd_def32 which must be zero). %ds only has sd_p. + */ +struct segment_descriptor { + unsigned sd_lolimit:16; /* segment extent (lsb) */ + unsigned sd_lobase:24; /* segment base address (lsb) */ + unsigned sd_type:5; /* segment type */ + unsigned sd_dpl:2; /* segment descriptor priority level */ + unsigned sd_p:1; /* segment descriptor present */ + unsigned sd_hilimit:4; /* segment extent (msb) */ + unsigned sd_xx:2; /* unused */ + unsigned sd_def32:1; /* default 32 vs 16 bit size */ + unsigned sd_gran:1; /* limit granularity (byte/page units)*/ + unsigned sd_hibase:8; /* segment base address (msb) */ +} __packed; + +struct user_segment_descriptor { + unsigned sd_lolimit:16; /* segment extent (lsb) */ + unsigned sd_lobase:24; /* segment base address (lsb) */ + unsigned sd_type:5; /* segment type */ + unsigned sd_dpl:2; /* segment descriptor priority level */ + unsigned sd_p:1; /* segment descriptor present */ + unsigned sd_hilimit:4; /* segment extent (msb) */ + unsigned sd_xx:1; /* unused */ + unsigned sd_long:1; /* long mode (cs only) */ + unsigned sd_def32:1; /* default 32 vs 16 bit size */ + unsigned sd_gran:1; /* limit granularity (byte/page units)*/ + unsigned sd_hibase:8; /* segment base address (msb) */ +} __packed; + +#define USD_GETBASE(sd) (((sd)->sd_lobase) | (sd)->sd_hibase << 24) +#define USD_SETBASE(sd, b) (sd)->sd_lobase = (b); \ + (sd)->sd_hibase = ((b) >> 24); +#define USD_GETLIMIT(sd) (((sd)->sd_lolimit) | (sd)->sd_hilimit << 16) +#define USD_SETLIMIT(sd, l) (sd)->sd_lolimit = (l); \ + (sd)->sd_hilimit = ((l) >> 16); + +#ifdef __i386__ +/* + * Gate descriptors (e.g. indirect descriptors) + */ +struct gate_descriptor { + unsigned gd_looffset:16; /* gate offset (lsb) */ + unsigned gd_selector:16; /* gate segment selector */ + unsigned gd_stkcpy:5; /* number of stack wds to cpy */ + unsigned gd_xx:3; /* unused */ + unsigned gd_type:5; /* segment type */ + unsigned gd_dpl:2; /* segment descriptor priority level */ + unsigned gd_p:1; /* segment descriptor present */ + unsigned gd_hioffset:16; /* gate offset (msb) */ +} __packed; + +/* + * Generic descriptor + */ +union descriptor { + struct segment_descriptor sd; + struct gate_descriptor gd; +}; +#else +/* + * Gate descriptors (e.g. indirect descriptors, trap, interrupt etc. 128 bit) + * Only interrupt and trap gates have gd_ist. 
+ */ +struct gate_descriptor { + uint64_t gd_looffset:16; /* gate offset (lsb) */ + uint64_t gd_selector:16; /* gate segment selector */ + uint64_t gd_ist:3; /* IST table index */ + uint64_t gd_xx:5; /* unused */ + uint64_t gd_type:5; /* segment type */ + uint64_t gd_dpl:2; /* segment descriptor priority level */ + uint64_t gd_p:1; /* segment descriptor present */ + uint64_t gd_hioffset:48; /* gate offset (msb) */ + uint64_t sd_xx1:32; +} __packed; + +/* + * Generic descriptor + */ +union descriptor { + struct user_segment_descriptor sd; + struct gate_descriptor gd; +}; +#endif + + /* system segments and gate types */ +#define SDT_SYSNULL 0 /* system null */ +#define SDT_SYS286TSS 1 /* system 286 TSS available */ +#define SDT_SYSLDT 2 /* system local descriptor table */ +#define SDT_SYS286BSY 3 /* system 286 TSS busy */ +#define SDT_SYS286CGT 4 /* system 286 call gate */ +#define SDT_SYSTASKGT 5 /* system task gate */ +#define SDT_SYS286IGT 6 /* system 286 interrupt gate */ +#define SDT_SYS286TGT 7 /* system 286 trap gate */ +#define SDT_SYSNULL2 8 /* system null again */ +#define SDT_SYS386TSS 9 /* system 386 TSS available */ +#define SDT_SYSTSS 9 /* system available 64 bit TSS */ +#define SDT_SYSNULL3 10 /* system null again */ +#define SDT_SYS386BSY 11 /* system 386 TSS busy */ +#define SDT_SYSBSY 11 /* system busy 64 bit TSS */ +#define SDT_SYS386CGT 12 /* system 386 call gate */ +#define SDT_SYSCGT 12 /* system 64 bit call gate */ +#define SDT_SYSNULL4 13 /* system null again */ +#define SDT_SYS386IGT 14 /* system 386 interrupt gate */ +#define SDT_SYSIGT 14 /* system 64 bit interrupt gate */ +#define SDT_SYS386TGT 15 /* system 386 trap gate */ +#define SDT_SYSTGT 15 /* system 64 bit trap gate */ + + /* memory segment types */ +#define SDT_MEMRO 16 /* memory read only */ +#define SDT_MEMROA 17 /* memory read only accessed */ +#define SDT_MEMRW 18 /* memory read write */ +#define SDT_MEMRWA 19 /* memory read write accessed */ +#define SDT_MEMROD 20 /* memory read only expand dwn limit */ +#define SDT_MEMRODA 21 /* memory read only expand dwn limit accessed */ +#define SDT_MEMRWD 22 /* memory read write expand dwn limit */ +#define SDT_MEMRWDA 23 /* memory read write expand dwn limit accessed*/ +#define SDT_MEME 24 /* memory execute only */ +#define SDT_MEMEA 25 /* memory execute only accessed */ +#define SDT_MEMER 26 /* memory execute read */ +#define SDT_MEMERA 27 /* memory execute read accessed */ +#define SDT_MEMEC 28 /* memory execute only conforming */ +#define SDT_MEMEAC 29 /* memory execute only accessed conforming */ +#define SDT_MEMERC 30 /* memory execute read conforming */ +#define SDT_MEMERAC 31 /* memory execute read accessed conforming */ + +/* + * Size of IDT table + */ +#define NIDT 256 /* 32 reserved, 0x80 syscall, most are h/w */ +#define NRSVIDT 32 /* reserved entries for cpu exceptions */ + +/* + * Entries in the Interrupt Descriptor Table (IDT) + */ +#define IDT_DE 0 /* #DE: Divide Error */ +#define IDT_DB 1 /* #DB: Debug */ +#define IDT_NMI 2 /* Nonmaskable External Interrupt */ +#define IDT_BP 3 /* #BP: Breakpoint */ +#define IDT_OF 4 /* #OF: Overflow */ +#define IDT_BR 5 /* #BR: Bound Range Exceeded */ +#define IDT_UD 6 /* #UD: Undefined/Invalid Opcode */ +#define IDT_NM 7 /* #NM: No Math Coprocessor */ +#define IDT_DF 8 /* #DF: Double Fault */ +#define IDT_FPUGP 9 /* Coprocessor Segment Overrun */ +#define IDT_TS 10 /* #TS: Invalid TSS */ +#define IDT_NP 11 /* #NP: Segment Not Present */ +#define IDT_SS 12 /* #SS: Stack Segment Fault */ +#define IDT_GP 13 /* 
#GP: General Protection Fault */ +#define IDT_PF 14 /* #PF: Page Fault */ +#define IDT_MF 16 /* #MF: FPU Floating-Point Error */ +#define IDT_AC 17 /* #AC: Alignment Check */ +#define IDT_MC 18 /* #MC: Machine Check */ +#define IDT_XF 19 /* #XF: SIMD Floating-Point Exception */ +#define IDT_IO_INTS NRSVIDT /* Base of IDT entries for I/O interrupts. */ +#define IDT_SYSCALL 0x80 /* System Call Interrupt Vector */ +#define IDT_DTRACE_RET 0x92 /* DTrace pid provider Interrupt Vector */ +#define IDT_EVTCHN 0x93 /* Xen HVM Event Channel Interrupt Vector */ + +#if defined(__i386__) +/* + * Entries in the Global Descriptor Table (GDT) + * Note that each 4 entries share a single 32 byte L1 cache line. + * Some of the fast syscall instructions require a specific order here. + */ +#define GNULL_SEL 0 /* Null Descriptor */ +#define GPRIV_SEL 1 /* SMP Per-Processor Private Data */ +#define GUFS_SEL 2 /* User %fs Descriptor (order critical: 1) */ +#define GUGS_SEL 3 /* User %gs Descriptor (order critical: 2) */ +#define GCODE_SEL 4 /* Kernel Code Descriptor (order critical: 1) */ +#define GDATA_SEL 5 /* Kernel Data Descriptor (order critical: 2) */ +#define GUCODE_SEL 6 /* User Code Descriptor (order critical: 3) */ +#define GUDATA_SEL 7 /* User Data Descriptor (order critical: 4) */ +#define GBIOSLOWMEM_SEL 8 /* BIOS low memory access (must be entry 8) */ +#define GPROC0_SEL 9 /* Task state process slot zero and up */ +#define GLDT_SEL 10 /* Default User LDT */ +#define GUSERLDT_SEL 11 /* User LDT */ +#define GPANIC_SEL 12 /* Task state to consider panic from */ +#define GBIOSCODE32_SEL 13 /* BIOS interface (32bit Code) */ +#define GBIOSCODE16_SEL 14 /* BIOS interface (16bit Code) */ +#define GBIOSDATA_SEL 15 /* BIOS interface (Data) */ +#define GBIOSUTIL_SEL 16 /* BIOS interface (Utility) */ +#define GBIOSARGS_SEL 17 /* BIOS interface (Arguments) */ +#define GNDIS_SEL 18 /* For the NDIS layer */ +#define NGDT 19 + +/* + * Entries in the Local Descriptor Table (LDT) + */ +#define LSYS5CALLS_SEL 0 /* forced by intel BCS */ +#define LSYS5SIGR_SEL 1 +#define LUCODE_SEL 3 +#define LUDATA_SEL 5 +#define NLDT (LUDATA_SEL + 1) + +#else /* !__i386__ */ +/* + * Entries in the Global Descriptor Table (GDT) + */ +#define GNULL_SEL 0 /* Null Descriptor */ +#define GNULL2_SEL 1 /* Null Descriptor */ +#define GUFS32_SEL 2 /* User 32 bit %fs Descriptor */ +#define GUGS32_SEL 3 /* User 32 bit %gs Descriptor */ +#define GCODE_SEL 4 /* Kernel Code Descriptor */ +#define GDATA_SEL 5 /* Kernel Data Descriptor */ +#define GUCODE32_SEL 6 /* User 32 bit code Descriptor */ +#define GUDATA_SEL 7 /* User 32/64 bit Data Descriptor */ +#define GUCODE_SEL 8 /* User 64 bit Code Descriptor */ +#define GPROC0_SEL 9 /* TSS for entering kernel etc */ +/* slot 10 is second half of GPROC0_SEL */ +#define GUSERLDT_SEL 11 /* LDT */ +/* slot 12 is second half of GUSERLDT_SEL */ +#define NGDT 13 +#endif /* __i386__ */ + +#endif /* !_X86_SEGMENTS_H_ */ diff --git a/usr/contrib/freebsd/x86/specialreg.h b/usr/contrib/freebsd/x86/specialreg.h index bea3122423..f528bad55c 100644 --- a/usr/contrib/freebsd/x86/specialreg.h +++ b/usr/contrib/freebsd/x86/specialreg.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 1991 The Regents of the University of California. * All rights reserved. * @@ -10,7 +12,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. 
- * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -27,7 +29,7 @@ * SUCH DAMAGE. * * from: @(#)specialreg.h 7.1 (Berkeley) 5/9/91 - * $FreeBSD: head/sys/x86/include/specialreg.h 273338 2014-10-20 18:09:33Z neel $ + * $FreeBSD$ */ #ifndef _MACHINE_SPECIALREG_H_ @@ -53,6 +55,7 @@ #define CR0_CD 0x40000000 /* Cache Disable */ #define CR3_PCID_SAVE 0x8000000000000000 +#define CR3_PCID_MASK 0xfff /* * Bits in PPro special registers @@ -73,6 +76,8 @@ #define CR4_PCIDE 0x00020000 /* Enable Context ID */ #define CR4_XSAVE 0x00040000 /* XSETBV/XGETBV */ #define CR4_SMEP 0x00100000 /* Supervisor-Mode Execution Prevention */ +#define CR4_SMAP 0x00200000 /* Supervisor-Mode Access Prevention */ +#define CR4_PKE 0x00400000 /* Protection Keys Enable */ /* * Bits in AMD64 special registers. EFER is 64 bits wide. @@ -82,6 +87,9 @@ #define EFER_LMA 0x000000400 /* Long mode active (R) */ #define EFER_NXE 0x000000800 /* PTE No-Execute bit enable (R/W) */ #define EFER_SVM 0x000001000 /* SVM enable bit for AMD, reserved for Intel */ +#define EFER_LMSLE 0x000002000 /* Long Mode Segment Limit Enable */ +#define EFER_FFXSR 0x000004000 /* Fast FXSAVE/FSRSTOR */ +#define EFER_TCE 0x000008000 /* Translation Cache Extension */ /* * Intel Extended Features registers @@ -154,6 +162,7 @@ #define CPUID2_TM2 0x00000100 #define CPUID2_SSSE3 0x00000200 #define CPUID2_CNXTID 0x00000400 +#define CPUID2_SDBG 0x00000800 #define CPUID2_FMA 0x00001000 #define CPUID2_CX16 0x00002000 #define CPUID2_XTPR 0x00004000 @@ -181,8 +190,43 @@ #define CPUTPM1_SENSOR 0x00000001 #define CPUTPM1_TURBO 0x00000002 #define CPUTPM1_ARAT 0x00000004 +#define CPUTPM1_HWP 0x00000080 +#define CPUTPM1_HWP_NOTIFICATION 0x00000100 +#define CPUTPM1_HWP_ACTIVITY_WINDOW 0x00000200 +#define CPUTPM1_HWP_PERF_PREF 0x00000400 +#define CPUTPM1_HWP_PKG 0x00000800 +#define CPUTPM1_HWP_FLEXIBLE 0x00020000 #define CPUTPM2_EFFREQ 0x00000001 +/* Intel Processor Trace CPUID. */ + +/* Leaf 0 ebx. */ +#define CPUPT_CR3 (1 << 0) /* CR3 Filtering Support */ +#define CPUPT_PSB (1 << 1) /* Configurable PSB and Cycle-Accurate Mode Supported */ +#define CPUPT_IPF (1 << 2) /* IP Filtering and TraceStop supported */ +#define CPUPT_MTC (1 << 3) /* MTC Supported */ +#define CPUPT_PRW (1 << 4) /* PTWRITE Supported */ +#define CPUPT_PWR (1 << 5) /* Power Event Trace Supported */ + +/* Leaf 0 ecx. */ +#define CPUPT_TOPA (1 << 0) /* ToPA Output Supported */ +#define CPUPT_TOPA_MULTI (1 << 1) /* ToPA Tables Allow Multiple Output Entries */ +#define CPUPT_SINGLE (1 << 2) /* Single-Range Output Supported */ +#define CPUPT_TT_OUT (1 << 3) /* Output to Trace Transport Subsystem Supported */ +#define CPUPT_LINEAR_IP (1 << 31) /* IP Payloads are Linear IP, otherwise IP is effective */ + +/* Leaf 1 eax. */ +#define CPUPT_NADDR_S 0 /* Number of Address Ranges */ +#define CPUPT_NADDR_M (0x7 << CPUPT_NADDR_S) +#define CPUPT_MTC_BITMAP_S 16 /* Bitmap of supported MTC Period Encodings */ +#define CPUPT_MTC_BITMAP_M (0xffff << CPUPT_MTC_BITMAP_S) + +/* Leaf 1 ebx. 
*/ +#define CPUPT_CT_BITMAP_S 0 /* Bitmap of supported Cycle Threshold values */ +#define CPUPT_CT_BITMAP_M (0xffff << CPUPT_CT_BITMAP_S) +#define CPUPT_PFE_BITMAP_S 16 /* Bitmap of supported Configurable PSB Frequency encoding */ +#define CPUPT_PFE_BITMAP_M (0xffff << CPUPT_PFE_BITMAP_S) + /* * Important bits in the AMD extended cpuid flags */ @@ -190,7 +234,7 @@ #define AMDID_MP 0x00080000 #define AMDID_NX 0x00100000 #define AMDID_EXT_MMX 0x00400000 -#define AMDID_FFXSR 0x01000000 +#define AMDID_FFXSR 0x02000000 #define AMDID_PAGE1GB 0x04000000 #define AMDID_RDTSCP 0x08000000 #define AMDID_LM 0x20000000 @@ -222,6 +266,7 @@ #define AMDID2_DBE 0x04000000 #define AMDID2_PTSC 0x08000000 #define AMDID2_PTSCEL2I 0x10000000 +#define AMDID2_MWAITX 0x20000000 /* * CPUID instruction 1 eax info @@ -302,6 +347,15 @@ #define CPUID_EXTSTATE_XSAVES 0x00000008 /* + * AMD extended function 8000_0007h ebx info + */ +#define AMDRAS_MCA_OF_RECOV 0x00000001 +#define AMDRAS_SUCCOR 0x00000002 +#define AMDRAS_HW_ASSERT 0x00000004 +#define AMDRAS_SCALABLE_MCA 0x00000008 +#define AMDRAS_PFEH_SUPPORT 0x00000010 + +/* * AMD extended function 8000_0007h edx info */ #define AMDPM_TS 0x00000001 @@ -316,6 +370,24 @@ #define AMDPM_CPB 0x00000200 /* + * AMD extended function 8000_0008h ebx info (amd_extended_feature_extensions) + */ +#define AMDFEID_CLZERO 0x00000001 +#define AMDFEID_IRPERF 0x00000002 +#define AMDFEID_XSAVEERPTR 0x00000004 +#define AMDFEID_IBPB 0x00001000 +#define AMDFEID_IBRS 0x00004000 +#define AMDFEID_STIBP 0x00008000 +/* The below are only defined if the corresponding base feature above exists. */ +#define AMDFEID_IBRS_ALWAYSON 0x00010000 +#define AMDFEID_STIBP_ALWAYSON 0x00020000 +#define AMDFEID_PREFER_IBRS 0x00040000 +#define AMDFEID_SSBD 0x01000000 +/* SSBD via MSRC001_011F instead of MSR 0x48: */ +#define AMDFEID_VIRT_SSBD 0x02000000 +#define AMDFEID_SSB_NO 0x04000000 + +/* * AMD extended function 8000_0008h ecx info */ #define AMDID_CMP_CORES 0x000000ff @@ -327,25 +399,83 @@ */ #define CPUID_STDEXT_FSGSBASE 0x00000001 #define CPUID_STDEXT_TSC_ADJUST 0x00000002 +#define CPUID_STDEXT_SGX 0x00000004 #define CPUID_STDEXT_BMI1 0x00000008 #define CPUID_STDEXT_HLE 0x00000010 #define CPUID_STDEXT_AVX2 0x00000020 +#define CPUID_STDEXT_FDP_EXC 0x00000040 #define CPUID_STDEXT_SMEP 0x00000080 #define CPUID_STDEXT_BMI2 0x00000100 #define CPUID_STDEXT_ERMS 0x00000200 #define CPUID_STDEXT_INVPCID 0x00000400 #define CPUID_STDEXT_RTM 0x00000800 +#define CPUID_STDEXT_PQM 0x00001000 +#define CPUID_STDEXT_NFPUSG 0x00002000 #define CPUID_STDEXT_MPX 0x00004000 +#define CPUID_STDEXT_PQE 0x00008000 #define CPUID_STDEXT_AVX512F 0x00010000 +#define CPUID_STDEXT_AVX512DQ 0x00020000 #define CPUID_STDEXT_RDSEED 0x00040000 #define CPUID_STDEXT_ADX 0x00080000 #define CPUID_STDEXT_SMAP 0x00100000 +#define CPUID_STDEXT_AVX512IFMA 0x00200000 +#define CPUID_STDEXT_PCOMMIT 0x00400000 #define CPUID_STDEXT_CLFLUSHOPT 0x00800000 +#define CPUID_STDEXT_CLWB 0x01000000 #define CPUID_STDEXT_PROCTRACE 0x02000000 #define CPUID_STDEXT_AVX512PF 0x04000000 #define CPUID_STDEXT_AVX512ER 0x08000000 #define CPUID_STDEXT_AVX512CD 0x10000000 #define CPUID_STDEXT_SHA 0x20000000 +#define CPUID_STDEXT_AVX512BW 0x40000000 +#define CPUID_STDEXT_AVX512VL 0x80000000 + +/* + * CPUID instruction 7 Structured Extended Features, leaf 0 ecx info + */ +#define CPUID_STDEXT2_PREFETCHWT1 0x00000001 +#define CPUID_STDEXT2_AVX512VBMI 0x00000002 +#define CPUID_STDEXT2_UMIP 0x00000004 +#define CPUID_STDEXT2_PKU 0x00000008 +#define CPUID_STDEXT2_OSPKE 0x00000010 
+#define CPUID_STDEXT2_WAITPKG 0x00000020 +#define CPUID_STDEXT2_AVX512VBMI2 0x00000040 +#define CPUID_STDEXT2_GFNI 0x00000100 +#define CPUID_STDEXT2_VAES 0x00000200 +#define CPUID_STDEXT2_VPCLMULQDQ 0x00000400 +#define CPUID_STDEXT2_AVX512VNNI 0x00000800 +#define CPUID_STDEXT2_AVX512BITALG 0x00001000 +#define CPUID_STDEXT2_AVX512VPOPCNTDQ 0x00004000 +#define CPUID_STDEXT2_RDPID 0x00400000 +#define CPUID_STDEXT2_CLDEMOTE 0x02000000 +#define CPUID_STDEXT2_MOVDIRI 0x08000000 +#define CPUID_STDEXT2_MOVDIRI64B 0x10000000 +#define CPUID_STDEXT2_ENQCMD 0x20000000 +#define CPUID_STDEXT2_SGXLC 0x40000000 + +/* + * CPUID instruction 7 Structured Extended Features, leaf 0 edx info + */ +#define CPUID_STDEXT3_AVX5124VNNIW 0x00000004 +#define CPUID_STDEXT3_AVX5124FMAPS 0x00000008 +#define CPUID_STDEXT3_AVX512VP2INTERSECT 0x00000100 +#define CPUID_STDEXT3_MD_CLEAR 0x00000400 +#define CPUID_STDEXT3_TSXFA 0x00002000 +#define CPUID_STDEXT3_PCONFIG 0x00040000 +#define CPUID_STDEXT3_IBPB 0x04000000 +#define CPUID_STDEXT3_STIBP 0x08000000 +#define CPUID_STDEXT3_L1D_FLUSH 0x10000000 +#define CPUID_STDEXT3_ARCH_CAP 0x20000000 +#define CPUID_STDEXT3_CORE_CAP 0x40000000 +#define CPUID_STDEXT3_SSBD 0x80000000 + +/* MSR IA32_ARCH_CAP(ABILITIES) bits */ +#define IA32_ARCH_CAP_RDCL_NO 0x00000001 +#define IA32_ARCH_CAP_IBRS_ALL 0x00000002 +#define IA32_ARCH_CAP_RSBA 0x00000004 +#define IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY 0x00000008 +#define IA32_ARCH_CAP_SSB_NO 0x00000010 +#define IA32_ARCH_CAP_MDS_NO 0x00000020 /* * CPUID manufacturers identifiers @@ -375,6 +505,8 @@ #define MSR_EBL_CR_POWERON 0x02a #define MSR_TEST_CTL 0x033 #define MSR_IA32_FEATURE_CONTROL 0x03a +#define MSR_IA32_SPEC_CTRL 0x048 +#define MSR_IA32_PRED_CMD 0x049 #define MSR_BIOS_UPDT_TRIG 0x079 #define MSR_BBL_CR_D0 0x088 #define MSR_BBL_CR_D1 0x089 @@ -387,6 +519,9 @@ #define MSR_APERF 0x0e8 #define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */ #define MSR_MTRRcap 0x0fe +#define MSR_IA32_ARCH_CAP 0x10a +#define MSR_IA32_FLUSH_CMD 0x10b +#define MSR_TSX_FORCE_ABORT 0x10f #define MSR_BBL_CR_ADDR 0x116 #define MSR_BBL_CR_DECC 0x118 #define MSR_BBL_CR_CTL 0x119 @@ -446,6 +581,14 @@ #define MSR_DRAM_ENERGY_STATUS 0x619 #define MSR_PP0_ENERGY_STATUS 0x639 #define MSR_PP1_ENERGY_STATUS 0x641 +#define MSR_PPERF 0x64e +#define MSR_TSC_DEADLINE 0x6e0 /* Writes are not serializing */ +#define MSR_IA32_PM_ENABLE 0x770 +#define MSR_IA32_HWP_CAPABILITIES 0x771 +#define MSR_IA32_HWP_REQUEST_PKG 0x772 +#define MSR_IA32_HWP_INTERRUPT 0x773 +#define MSR_IA32_HWP_REQUEST 0x774 +#define MSR_IA32_HWP_STATUS 0x777 /* * VMX MSRs @@ -467,8 +610,10 @@ #define MSR_VMX_TRUE_ENTRY_CTLS 0x490 /* - * X2APIC MSRs + * X2APIC MSRs. + * Writes are not serializing. */ +#define MSR_APIC_000 0x800 #define MSR_APIC_ID 0x802 #define MSR_APIC_VERSION 0x803 #define MSR_APIC_TPR 0x808 @@ -502,6 +647,85 @@ #define MSR_IA32_XSS 0xda0 /* + * Intel Processor Trace (PT) MSRs. 
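+ *
+ * The field macros below follow the _S (shift) / _M (mask) convention
+ * used elsewhere in this file; e.g. an MTC frequency of n is encoded as
+ * RTIT_CTL_MTC_FREQ(n) and decoded back as
+ * (ctl & RTIT_CTL_MTC_FREQ_M) >> RTIT_CTL_MTC_FREQ_S.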
+ */ +#define MSR_IA32_RTIT_OUTPUT_BASE 0x560 /* Trace Output Base Register (R/W) */ +#define MSR_IA32_RTIT_OUTPUT_MASK_PTRS 0x561 /* Trace Output Mask Pointers Register (R/W) */ +#define MSR_IA32_RTIT_CTL 0x570 /* Trace Control Register (R/W) */ +#define RTIT_CTL_TRACEEN (1 << 0) +#define RTIT_CTL_CYCEN (1 << 1) +#define RTIT_CTL_OS (1 << 2) +#define RTIT_CTL_USER (1 << 3) +#define RTIT_CTL_PWREVTEN (1 << 4) +#define RTIT_CTL_FUPONPTW (1 << 5) +#define RTIT_CTL_FABRICEN (1 << 6) +#define RTIT_CTL_CR3FILTER (1 << 7) +#define RTIT_CTL_TOPA (1 << 8) +#define RTIT_CTL_MTCEN (1 << 9) +#define RTIT_CTL_TSCEN (1 << 10) +#define RTIT_CTL_DISRETC (1 << 11) +#define RTIT_CTL_PTWEN (1 << 12) +#define RTIT_CTL_BRANCHEN (1 << 13) +#define RTIT_CTL_MTC_FREQ_S 14 +#define RTIT_CTL_MTC_FREQ(n) ((n) << RTIT_CTL_MTC_FREQ_S) +#define RTIT_CTL_MTC_FREQ_M (0xf << RTIT_CTL_MTC_FREQ_S) +#define RTIT_CTL_CYC_THRESH_S 19 +#define RTIT_CTL_CYC_THRESH_M (0xf << RTIT_CTL_CYC_THRESH_S) +#define RTIT_CTL_PSB_FREQ_S 24 +#define RTIT_CTL_PSB_FREQ_M (0xf << RTIT_CTL_PSB_FREQ_S) +#define RTIT_CTL_ADDR_CFG_S(n) (32 + (n) * 4) +#define RTIT_CTL_ADDR0_CFG_S 32 +#define RTIT_CTL_ADDR0_CFG_M (0xfULL << RTIT_CTL_ADDR0_CFG_S) +#define RTIT_CTL_ADDR1_CFG_S 36 +#define RTIT_CTL_ADDR1_CFG_M (0xfULL << RTIT_CTL_ADDR1_CFG_S) +#define RTIT_CTL_ADDR2_CFG_S 40 +#define RTIT_CTL_ADDR2_CFG_M (0xfULL << RTIT_CTL_ADDR2_CFG_S) +#define RTIT_CTL_ADDR3_CFG_S 44 +#define RTIT_CTL_ADDR3_CFG_M (0xfULL << RTIT_CTL_ADDR3_CFG_S) +#define MSR_IA32_RTIT_STATUS 0x571 /* Tracing Status Register (R/W) */ +#define RTIT_STATUS_FILTEREN (1 << 0) +#define RTIT_STATUS_CONTEXTEN (1 << 1) +#define RTIT_STATUS_TRIGGEREN (1 << 2) +#define RTIT_STATUS_ERROR (1 << 4) +#define RTIT_STATUS_STOPPED (1 << 5) +#define RTIT_STATUS_PACKETBYTECNT_S 32 +#define RTIT_STATUS_PACKETBYTECNT_M (0x1ffffULL << RTIT_STATUS_PACKETBYTECNT_S) +#define MSR_IA32_RTIT_CR3_MATCH 0x572 /* Trace Filter CR3 Match Register (R/W) */ +#define MSR_IA32_RTIT_ADDR_A(n) (0x580 + (n) * 2) +#define MSR_IA32_RTIT_ADDR_B(n) (0x581 + (n) * 2) +#define MSR_IA32_RTIT_ADDR0_A 0x580 /* Region 0 Start Address (R/W) */ +#define MSR_IA32_RTIT_ADDR0_B 0x581 /* Region 0 End Address (R/W) */ +#define MSR_IA32_RTIT_ADDR1_A 0x582 /* Region 1 Start Address (R/W) */ +#define MSR_IA32_RTIT_ADDR1_B 0x583 /* Region 1 End Address (R/W) */ +#define MSR_IA32_RTIT_ADDR2_A 0x584 /* Region 2 Start Address (R/W) */ +#define MSR_IA32_RTIT_ADDR2_B 0x585 /* Region 2 End Address (R/W) */ +#define MSR_IA32_RTIT_ADDR3_A 0x586 /* Region 3 Start Address (R/W) */ +#define MSR_IA32_RTIT_ADDR3_B 0x587 /* Region 3 End Address (R/W) */ + +/* Intel Processor Trace Table of Physical Addresses (ToPA). 
*/ +#define TOPA_SIZE_S 6 +#define TOPA_SIZE_M (0xf << TOPA_SIZE_S) +#define TOPA_SIZE_4K (0 << TOPA_SIZE_S) +#define TOPA_SIZE_8K (1 << TOPA_SIZE_S) +#define TOPA_SIZE_16K (2 << TOPA_SIZE_S) +#define TOPA_SIZE_32K (3 << TOPA_SIZE_S) +#define TOPA_SIZE_64K (4 << TOPA_SIZE_S) +#define TOPA_SIZE_128K (5 << TOPA_SIZE_S) +#define TOPA_SIZE_256K (6 << TOPA_SIZE_S) +#define TOPA_SIZE_512K (7 << TOPA_SIZE_S) +#define TOPA_SIZE_1M (8 << TOPA_SIZE_S) +#define TOPA_SIZE_2M (9 << TOPA_SIZE_S) +#define TOPA_SIZE_4M (10 << TOPA_SIZE_S) +#define TOPA_SIZE_8M (11 << TOPA_SIZE_S) +#define TOPA_SIZE_16M (12 << TOPA_SIZE_S) +#define TOPA_SIZE_32M (13 << TOPA_SIZE_S) +#define TOPA_SIZE_64M (14 << TOPA_SIZE_S) +#define TOPA_SIZE_128M (15 << TOPA_SIZE_S) +#define TOPA_STOP (1 << 4) +#define TOPA_INT (1 << 2) +#define TOPA_END (1 << 0) + +/* * Constants related to MSR's. */ #define APICBASE_RESERVED 0x000002ff @@ -515,6 +739,55 @@ #define IA32_FEATURE_CONTROL_SMX_EN 0x02 /* enable VMX inside SMX */ #define IA32_FEATURE_CONTROL_VMX_EN 0x04 /* enable VMX outside SMX */ +/* MSR IA32_MISC_ENABLE */ +#define IA32_MISC_EN_FASTSTR 0x0000000000000001ULL +#define IA32_MISC_EN_ATCCE 0x0000000000000008ULL +#define IA32_MISC_EN_PERFMON 0x0000000000000080ULL +#define IA32_MISC_EN_PEBSU 0x0000000000001000ULL +#define IA32_MISC_EN_ESSTE 0x0000000000010000ULL +#define IA32_MISC_EN_MONE 0x0000000000040000ULL +#define IA32_MISC_EN_LIMCPUID 0x0000000000400000ULL +#define IA32_MISC_EN_xTPRD 0x0000000000800000ULL +#define IA32_MISC_EN_XDD 0x0000000400000000ULL + +/* + * IA32_SPEC_CTRL and IA32_PRED_CMD MSRs are described in the Intel' + * document 336996-001 Speculative Execution Side Channel Mitigations. + * + * AMD uses the same MSRs and bit definitions, as described in 111006-B + * "Indirect Branch Control Extension" and 124441 "Speculative Store Bypass + * Disable." + */ +/* MSR IA32_SPEC_CTRL */ +#define IA32_SPEC_CTRL_IBRS 0x00000001 +#define IA32_SPEC_CTRL_STIBP 0x00000002 +#define IA32_SPEC_CTRL_SSBD 0x00000004 + +/* MSR IA32_PRED_CMD */ +#define IA32_PRED_CMD_IBPB_BARRIER 0x0000000000000001ULL + +/* MSR IA32_FLUSH_CMD */ +#define IA32_FLUSH_CMD_L1D 0x00000001 + +/* MSR IA32_HWP_CAPABILITIES */ +#define IA32_HWP_CAPABILITIES_HIGHEST_PERFORMANCE(x) (((x) >> 0) & 0xff) +#define IA32_HWP_CAPABILITIES_GUARANTEED_PERFORMANCE(x) (((x) >> 8) & 0xff) +#define IA32_HWP_CAPABILITIES_EFFICIENT_PERFORMANCE(x) (((x) >> 16) & 0xff) +#define IA32_HWP_CAPABILITIES_LOWEST_PERFORMANCE(x) (((x) >> 24) & 0xff) + +/* MSR IA32_HWP_REQUEST */ +#define IA32_HWP_REQUEST_MINIMUM_VALID (1ULL << 63) +#define IA32_HWP_REQUEST_MAXIMUM_VALID (1ULL << 62) +#define IA32_HWP_REQUEST_DESIRED_VALID (1ULL << 61) +#define IA32_HWP_REQUEST_EPP_VALID (1ULL << 60) +#define IA32_HWP_REQUEST_ACTIVITY_WINDOW_VALID (1ULL << 59) +#define IA32_HWP_REQUEST_PACKAGE_CONTROL (1ULL << 42) +#define IA32_HWP_ACTIVITY_WINDOW (0x3ffULL << 32) +#define IA32_HWP_REQUEST_ENERGY_PERFORMANCE_PREFERENCE (0xffULL << 24) +#define IA32_HWP_DESIRED_PERFORMANCE (0xffULL << 16) +#define IA32_HWP_REQUEST_MAXIMUM_PERFORMANCE (0xffULL << 8) +#define IA32_HWP_MINIMUM_PERFORMANCE (0xffULL << 0) + /* * PAT modes. 
*/ @@ -665,6 +938,33 @@ #define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */ #define MC_CTL2_THRESHOLD 0x0000000000007fff #define MC_CTL2_CMCI_EN 0x0000000040000000 +#define MC_AMDNB_BANK 4 +#define MC_MISC_AMD_VAL 0x8000000000000000 /* Counter presence valid */ +#define MC_MISC_AMD_CNTP 0x4000000000000000 /* Counter present */ +#define MC_MISC_AMD_LOCK 0x2000000000000000 /* Register locked */ +#define MC_MISC_AMD_INTP 0x1000000000000000 /* Int. type can generate interrupts */ +#define MC_MISC_AMD_LVT_MASK 0x00f0000000000000 /* Extended LVT offset */ +#define MC_MISC_AMD_LVT_SHIFT 52 +#define MC_MISC_AMD_CNTEN 0x0008000000000000 /* Counter enabled */ +#define MC_MISC_AMD_INT_MASK 0x0006000000000000 /* Interrupt type */ +#define MC_MISC_AMD_INT_LVT 0x0002000000000000 /* Interrupt via Extended LVT */ +#define MC_MISC_AMD_INT_SMI 0x0004000000000000 /* SMI */ +#define MC_MISC_AMD_OVERFLOW 0x0001000000000000 /* Counter overflow */ +#define MC_MISC_AMD_CNT_MASK 0x00000fff00000000 /* Counter value */ +#define MC_MISC_AMD_CNT_SHIFT 32 +#define MC_MISC_AMD_CNT_MAX 0xfff +#define MC_MISC_AMD_PTR_MASK 0x00000000ff000000 /* Pointer to additional registers */ +#define MC_MISC_AMD_PTR_SHIFT 24 + +/* AMD Scalable MCA */ +#define MSR_SMCA_MC0_CTL 0xc0002000 +#define MSR_SMCA_MC0_STATUS 0xc0002001 +#define MSR_SMCA_MC0_ADDR 0xc0002002 +#define MSR_SMCA_MC0_MISC0 0xc0002003 +#define MSR_SMCA_MC_CTL(x) (MSR_SMCA_MC0_CTL + 0x10 * (x)) +#define MSR_SMCA_MC_STATUS(x) (MSR_SMCA_MC0_STATUS + 0x10 * (x)) +#define MSR_SMCA_MC_ADDR(x) (MSR_SMCA_MC0_ADDR + 0x10 * (x)) +#define MSR_SMCA_MC_MISC(x) (MSR_SMCA_MC0_MISC0 + 0x10 * (x)) /* * The following four 3-byte registers control the non-cacheable regions. @@ -768,6 +1068,7 @@ #define MSR_FSBASE 0xc0000100 /* base address of the %fs "segment" */ #define MSR_GSBASE 0xc0000101 /* base address of the %gs "segment" */ #define MSR_KGSBASE 0xc0000102 /* base address of the kernel %gs */ +#define MSR_TSC_AUX 0xc0000103 #define MSR_PERFEVSEL0 0xc0010000 #define MSR_PERFEVSEL1 0xc0010001 #define MSR_PERFEVSEL2 0xc0010002 @@ -785,17 +1086,20 @@ #define MSR_TOP_MEM 0xc001001a /* boundary for ram below 4G */ #define MSR_TOP_MEM2 0xc001001d /* boundary for ram above 4G */ #define MSR_NB_CFG1 0xc001001f /* NB configuration 1 */ +#define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ +#define MSR_MC0_CTL_MASK 0xc0010044 #define MSR_P_STATE_LIMIT 0xc0010061 /* P-state Current Limit Register */ #define MSR_P_STATE_CONTROL 0xc0010062 /* P-state Control Register */ #define MSR_P_STATE_STATUS 0xc0010063 /* P-state Status Register */ #define MSR_P_STATE_CONFIG(n) (0xc0010064 + (n)) /* P-state Config */ #define MSR_SMM_ADDR 0xc0010112 /* SMM TSEG base address */ #define MSR_SMM_MASK 0xc0010113 /* SMM TSEG address mask */ +#define MSR_VM_CR 0xc0010114 /* SVM: feature control */ +#define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */ +#define MSR_AMD_CPUID07 0xc0011002 /* CPUID 07 %ebx override */ +#define MSR_EXTFEATURES 0xc0011005 /* Extended CPUID Features override */ +#define MSR_LS_CFG 0xc0011020 #define MSR_IC_CFG 0xc0011021 /* Instruction Cache Configuration */ -#define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */ -#define MSR_MC0_CTL_MASK 0xc0010044 -#define MSR_VM_CR 0xc0010114 /* SVM: feature control */ -#define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */ /* MSR_VM_CR related */ #define VM_CR_SVMDIS 0x10 /* SVM: disabled by BIOS */ diff --git a/usr/src/Makefile.master b/usr/src/Makefile.master index 
a6d4d763d6..aa7bd524bd 100644 --- a/usr/src/Makefile.master +++ b/usr/src/Makefile.master @@ -58,6 +58,12 @@ $(NO_ADJUNCT_PROTO)HAVE_ADJUNCT_PROTO=$(POUND_SIGN) NATIVE_ADJUNCT= /usr # +# Compatibility code for FreeBSD etc. +# +COMPAT= $(SRC)/compat +CONTRIB= $(SRC)/../contrib + +# # RELEASE_BUILD should be cleared for final release builds. # NOT_RELEASE_BUILD is exactly what the name implies. # diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile index a4a744fe95..f20274bd35 100644 --- a/usr/src/cmd/Makefile +++ b/usr/src/cmd/Makefile @@ -488,6 +488,8 @@ i386_SUBDIRS= \ acpihpd \ addbadsec \ ahciem \ + bhyve \ + bhyvectl \ biosdev \ cxgbetool \ diskscan \ diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile index f47daead31..e96868e006 100644 --- a/usr/src/cmd/bhyve/Makefile +++ b/usr/src/cmd/bhyve/Makefile @@ -11,13 +11,16 @@ # # Copyright 2014 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. # PROG = bhyve include ../Makefile.cmd +include ../Makefile.cmd.64 +include ../Makefile.ctf -$(BUILD64)SUBDIRS += $(MACH64) +SUBDIRS = test all := TARGET = all install := TARGET = install @@ -25,17 +28,127 @@ clean := TARGET = clean clobber := TARGET = clobber lint := TARGET = lint +SRCS = acpi.c \ + atkbdc.c \ + bhyvegc.c \ + bhyverun.c \ + block_if.c \ + bootrom.c \ + console.c \ + consport.c \ + dbgport.c \ + fwctl.c \ + gdb.c \ + inout.c \ + ioapic.c \ + mem.c \ + mevent.c \ + mptbl.c \ + pci_ahci.c \ + pci_e82545.c \ + pci_emul.c \ + pci_fbuf.c \ + pci_hostbridge.c \ + pci_irq.c \ + pci_lpc.c \ + pci_nvme.c \ + pci_passthru.c \ + pci_uart.c \ + pci_virtio_block.c \ + pci_virtio_console.c \ + pci_virtio_net.c \ + pci_virtio_rnd.c \ + pci_xhci.c \ + pm.c \ + post.c \ + ps2kbd.c \ + ps2mouse.c \ + rfb.c \ + rtc.c \ + smbiostbl.c \ + sockstream.c \ + task_switch.c \ + uart_emul.c \ + usb_emul.c \ + usb_mouse.c \ + vga.c \ + virtio.c \ + vmm_instruction_emul.c \ + xmsr.c \ + spinup_ap.c \ + iov.c \ + bhyve_sol_glue.c + +# The virtio-scsi driver appears to include a slew of materials from FreeBSD's +# native SCSI implementation. We will omit that complexity for now. 
+ #ctl_util.c \ + #ctl_scsi_all.c \ + #pci_virtio_scsi.c \ + + +OBJS = $(SRCS:.c=.o) + +CLOBBERFILES = $(ROOTUSRSBINPROG) + +MEVENT_TEST_PROG = mevent_test +MEVENT_TEST_SRCS = mevent.c mevent_test.c +MEVENT_TEST_OBJS = $(MEVENT_TEST_SRCS:.c=.o) + +CLEANFILES = $(PROG) $(MEVENT_TEST_PROG) $(MEVENT_TEST_OBJS) + +CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration -_gcc=-Wno-parentheses +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 \ + -I$(CONTRIB)/freebsd/dev/usb/controller \ + -I$(CONTRIB)/freebsd/dev/mii \ + -I$(SRC)/uts/common/io/e1000api \ + $(CPPFLAGS.master) \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/common \ + -I$(SRC)/uts/i86pc \ + -DWITHOUT_CAPSICUM + +# Disable the crypto code until it is wired up +CPPFLAGS += -DNO_OPENSSL + +pci_nvme.o := CERRWARN += -_gcc=-Wno-pointer-sign + +SMOFF += all_func_returns,leaks,no_if_block + +# Force c99 for everything +CSTD= $(CSTD_GNU99) +C99MODE= -xc99=%all +C99LMODE= -Xc99=%all + +$(PROG) := LDLIBS += -lsocket -lnsl -ldlpi -lmd -luuid -lvmmapi -lz +$(MEVENT_TEST_PROG) := LDLIBS += -lsocket + .KEEP_STATE: -all clean clobber lint: $(SUBDIRS) +all: $(PROG) $(MEVENT_TEST_PROG) $(SUBDIRS) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +$(MEVENT_TEST_PROG): $(MEVENT_TEST_OBJS) + $(LINK.c) -o $@ $(MEVENT_TEST_OBJS) $(LDFLAGS) $(LDLIBS) + +install: all $(ROOTUSRSBINPROG) $(SUBDIRS) + +clean: $(SUBDIRS) + $(RM) $(OBJS) $(CLEANFILES) + +clobber: clean $(SUBDIRS) + $(RM) $(CLOBBERFILES) -install: $(SUBDIRS) - -$(RM) $(ROOTUSRSBINPROG) - -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) +lint: lint_SRCS $(SUBDIRS) -$(SUBDIRS): FRC - @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) FRC: -include ../Makefile.targ +%.o: $(SRC)/uts/i86pc/io/vmm/%.c + $(COMPILE.c) $< + $(POST_PROCESS_O) diff --git a/usr/src/cmd/bhyve/Makefile.com b/usr/src/cmd/bhyve/Makefile.com deleted file mode 100644 index 4a92b622ab..0000000000 --- a/usr/src/cmd/bhyve/Makefile.com +++ /dev/null @@ -1,94 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2015 Pluribus Networks Inc. 
-#
-
-PROG= bhyve
-
-SRCS =	atkbdc.c \
-	bhyvegc.c \
-	bhyverun.c \
-	block_if.c \
-	console.c \
-	consport.c \
-	inout.c \
-	ioapic.c \
-	mem.c \
-	mptbl.c \
-	pci_ahci.c \
-	pci_emul.c \
-	pci_hostbridge.c \
-	pci_irq.c \
-	pci_lpc.c \
-	pci_virtio_block.c \
-	pci_virtio_net.c \
-	pci_virtio_viona.c \
-	pm.c \
-	pmtmr.c \
-	post.c \
-	ps2kbd.c \
-	ps2mouse.c \
-	rfb.c \
-	rtc.c \
-	smbiostbl.c \
-	uart_emul.c \
-	vga.c \
-	virtio.c \
-	vmm_instruction_emul.c \
-	xmsr.c \
-	spinup_ap.c \
-	bhyve_sol_glue.c
-
-OBJS = $(SRCS:.c=.o)
-
-include ../../Makefile.cmd
-
-.KEEP_STATE:
-
-CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration
-CFLAGS64 += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration
-CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \
-	-I$(ROOT)/usr/platform/i86pc/include \
-	-I$(SRC)/uts/i86pc/io/vmm \
-	-I$(SRC)/uts/common \
-	-I$(SRC)/uts/i86pc \
-	-I$(SRC)/lib/libdladm/common
-LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lkstat -lmd -luuid -lvmmapi
-
-POST_PROCESS += ; $(GENSETDEFS) $@
-
-all: $(PROG)
-
-$(PROG): $(OBJS)
-	$(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS)
-	$(POST_PROCESS)
-
-install: all $(ROOTUSRSBINPROG)
-
-clean:
-	$(RM) $(OBJS)
-
-lint: lint_SRCS
-
-include ../../Makefile.targ
-
-%.o: ../%.c
-	$(COMPILE.c) $<
-	$(POST_PROCESS_O)
-
-%.o: $(SRC)/uts/i86pc/io/vmm/%.c
-	$(COMPILE.c) $<
-	$(POST_PROCESS_O)
-
-%.o: ../%.s
-	$(COMPILE.s) $<
diff --git a/usr/src/cmd/bhyve/acpi.c b/usr/src/cmd/bhyve/acpi.c
new file mode 100644
index 0000000000..862f4512f8
--- /dev/null
+++ b/usr/src/cmd/bhyve/acpi.c
@@ -0,0 +1,1007 @@
+/*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+/*
+ * bhyve ACPI table generator.
+ *
+ * Create the minimal set of ACPI tables required to boot FreeBSD (and
+ * hopefully other o/s's) by writing out ASL template files for each of
+ * the tables and then compiling them to AML with the Intel iasl compiler.
+ * The AML files are then read into guest memory.
+ *
+ * The tables are placed in the guest's ROM area just below 1MB physical,
+ * above the MPTable.
+ *
+ * Layout (No longer correct at FADT and beyond: the MADT size is now
+ * computed from VM_MAXCPU, and values above 21 overflow this fixed
+ * layout.)
+ * ------
+ * RSDP -> 0xf2400 (36 bytes fixed)
+ * RSDT -> 0xf2440 (36 bytes + 4*7 table addrs, 4 used)
+ * XSDT -> 0xf2480 (36 bytes + 8*7 table addrs, 4 used)
+ * MADT -> 0xf2500 (depends on #CPUs)
+ * FADT -> 0xf2600 (268 bytes)
+ * HPET -> 0xf2740 (56 bytes)
+ * MCFG -> 0xf2780 (60 bytes)
+ * FACS -> 0xf27C0 (64 bytes)
+ * DSDT -> 0xf2800 (variable - can go up to 0x100000)
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+
+#include <paths.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "acpi.h"
+#include "pci_emul.h"
+
+/*
+ * Define the base address of the ACPI tables, the sizes of some tables,
+ * and the offsets to the individual tables.
+ */
+#define BHYVE_ACPI_BASE 0xf2400
+#define RSDT_OFFSET 0x040
+#define XSDT_OFFSET 0x080
+#define MADT_OFFSET 0x100
+/*
+ * The MADT consists of:
+ *	44		Fixed Header
+ *	8 * maxcpu	Processor Local APIC entries
+ *	12		I/O APIC entry
+ *	2 * 10		Interrupt Source Override entries
+ *	6		Local APIC NMI entry
+ */
+#define MADT_SIZE (44 + VM_MAXCPU*8 + 12 + 2*10 + 6)
+#define FADT_OFFSET (MADT_OFFSET + MADT_SIZE)
+#define FADT_SIZE 0x140
+#define HPET_OFFSET (FADT_OFFSET + FADT_SIZE)
+#define HPET_SIZE 0x40
+#define MCFG_OFFSET (HPET_OFFSET + HPET_SIZE)
+#define MCFG_SIZE 0x40
+#define FACS_OFFSET (MCFG_OFFSET + MCFG_SIZE)
+#define FACS_SIZE 0x40
+#define DSDT_OFFSET (FACS_OFFSET + FACS_SIZE)
+
+#define BHYVE_ASL_TEMPLATE "bhyve.XXXXXXX"
+#define BHYVE_ASL_SUFFIX ".aml"
+#define BHYVE_ASL_COMPILER "/usr/sbin/iasl"
+
+static int basl_keep_temps;
+static int basl_verbose_iasl;
+static int basl_ncpu;
+static uint32_t basl_acpi_base = BHYVE_ACPI_BASE;
+static uint32_t hpet_capabilities;
+
+/*
+ * Contains the full pathname of the template to be passed
+ * to mkstemp/mkstemps(3)
+ */
+static char basl_template[MAXPATHLEN];
+static char basl_stemplate[MAXPATHLEN];
+
+/*
+ * State for dsdt_line(), dsdt_indent(), and dsdt_unindent().
+ */
+static FILE *dsdt_fp;
+static int dsdt_indent_level;
+static int dsdt_error;
+
+struct basl_fio {
+	int fd;
+	FILE *fp;
+	char f_name[MAXPATHLEN];
+};
+
+#define EFPRINTF(...)
\ + if (fprintf(__VA_ARGS__) < 0) goto err_exit; + +#define EFFLUSH(x) \ + if (fflush(x) != 0) goto err_exit; + +static int +basl_fwrite_rsdp(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve RSDP template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0008]\t\tSignature : \"RSD PTR \"\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 43\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 02\n"); + EFPRINTF(fp, "[0004]\t\tRSDT Address : %08X\n", + basl_acpi_base + RSDT_OFFSET); + EFPRINTF(fp, "[0004]\t\tLength : 00000024\n"); + EFPRINTF(fp, "[0008]\t\tXSDT Address : 00000000%08X\n", + basl_acpi_base + XSDT_OFFSET); + EFPRINTF(fp, "[0001]\t\tExtended Checksum : 00\n"); + EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_rsdt(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve RSDT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"RSDT\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVRSDT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Add in pointers to the MADT, FADT and HPET */ + EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : %08X\n", + basl_acpi_base + MADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : %08X\n", + basl_acpi_base + FADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : %08X\n", + basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : %08X\n", + basl_acpi_base + MCFG_OFFSET); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_xsdt(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve XSDT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"XSDT\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVXSDT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Add in pointers to the MADT, FADT and HPET */ + EFPRINTF(fp, "[0004]\t\tACPI Table Address 0 : 00000000%08X\n", + basl_acpi_base + MADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 1 : 00000000%08X\n", + basl_acpi_base + FADT_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 2 : 00000000%08X\n", + basl_acpi_base + HPET_OFFSET); + EFPRINTF(fp, "[0004]\t\tACPI Table Address 3 : 00000000%08X\n", + basl_acpi_base + MCFG_OFFSET); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_madt(FILE *fp) +{ + int i; + + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve MADT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"APIC\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + 
EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMADT \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tLocal Apic Address : FEE00000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\tPC-AT Compatibility : 1\n"); + EFPRINTF(fp, "\n"); + + /* Add a Processor Local APIC entry for each CPU */ + for (i = 0; i < basl_ncpu; i++) { + EFPRINTF(fp, "[0001]\t\tSubtable Type : 00\n"); + EFPRINTF(fp, "[0001]\t\tLength : 08\n"); + /* iasl expects hex values for the proc and apic id's */ + EFPRINTF(fp, "[0001]\t\tProcessor ID : %02x\n", i); + EFPRINTF(fp, "[0001]\t\tLocal Apic ID : %02x\n", i); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\tProcessor Enabled : 1\n"); +#ifdef __FreeBSD__ + EFPRINTF(fp, "\t\t\tRuntime Online Capable : 0\n"); +#else + /* + * Until iasl is updated to support the "Runtime Online + * Capable" entry, it must be omitted. This should be + * re-checked when illumos receives an acpica update. + */ +#endif /* __FreeBSD__ */ + EFPRINTF(fp, "\n"); + } + + /* Always a single IOAPIC entry, with ID 0 */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 01\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0C\n"); + /* iasl expects a hex value for the i/o apic id */ + EFPRINTF(fp, "[0001]\t\tI/O Apic ID : %02x\n", 0); + EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); + EFPRINTF(fp, "[0004]\t\tAddress : fec00000\n"); + EFPRINTF(fp, "[0004]\t\tInterrupt : 00000000\n"); + EFPRINTF(fp, "\n"); + + /* Legacy IRQ0 is connected to pin 2 of the IOAPIC */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); + EFPRINTF(fp, "[0001]\t\tBus : 00\n"); + EFPRINTF(fp, "[0001]\t\tSource : 00\n"); + EFPRINTF(fp, "[0004]\t\tInterrupt : 00000002\n"); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); + EFPRINTF(fp, "\t\t\tPolarity : 1\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tSubtable Type : 02\n"); + EFPRINTF(fp, "[0001]\t\tLength : 0A\n"); + EFPRINTF(fp, "[0001]\t\tBus : 00\n"); + EFPRINTF(fp, "[0001]\t\tSource : %02X\n", SCI_INT); + EFPRINTF(fp, "[0004]\t\tInterrupt : %08X\n", SCI_INT); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0000\n"); + EFPRINTF(fp, "\t\t\tPolarity : 3\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 3\n"); + EFPRINTF(fp, "\n"); + + /* Local APIC NMI is connected to LINT 1 on all CPUs */ + EFPRINTF(fp, "[0001]\t\tSubtable Type : 04\n"); + EFPRINTF(fp, "[0001]\t\tLength : 06\n"); + EFPRINTF(fp, "[0001]\t\tProcessorId : FF\n"); + EFPRINTF(fp, "[0002]\t\tFlags (decoded below) : 0005\n"); + EFPRINTF(fp, "\t\t\tPolarity : 1\n"); + EFPRINTF(fp, "\t\t\tTrigger Mode : 1\n"); + EFPRINTF(fp, "[0001]\t\tInterrupt : 01\n"); + EFPRINTF(fp, "\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_fadt(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve FADT template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"FACP\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 0000010C\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 05\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, 
"[0008]\t\tOem Table ID : \"BVFACP \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tFACS Address : %08X\n", + basl_acpi_base + FACS_OFFSET); + EFPRINTF(fp, "[0004]\t\tDSDT Address : %08X\n", + basl_acpi_base + DSDT_OFFSET); + EFPRINTF(fp, "[0001]\t\tModel : 01\n"); + EFPRINTF(fp, "[0001]\t\tPM Profile : 00 [Unspecified]\n"); + EFPRINTF(fp, "[0002]\t\tSCI Interrupt : %04X\n", + SCI_INT); + EFPRINTF(fp, "[0004]\t\tSMI Command Port : %08X\n", + SMI_CMD); + EFPRINTF(fp, "[0001]\t\tACPI Enable Value : %02X\n", + BHYVE_ACPI_ENABLE); + EFPRINTF(fp, "[0001]\t\tACPI Disable Value : %02X\n", + BHYVE_ACPI_DISABLE); + EFPRINTF(fp, "[0001]\t\tS4BIOS Command : 00\n"); + EFPRINTF(fp, "[0001]\t\tP-State Control : 00\n"); + EFPRINTF(fp, "[0004]\t\tPM1A Event Block Address : %08X\n", + PM1A_EVT_ADDR); + EFPRINTF(fp, "[0004]\t\tPM1B Event Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM1A Control Block Address : %08X\n", + PM1A_CNT_ADDR); + EFPRINTF(fp, "[0004]\t\tPM1B Control Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM2 Control Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tPM Timer Block Address : %08X\n", + IO_PMTMR); + EFPRINTF(fp, "[0004]\t\tGPE0 Block Address : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tGPE1 Block Address : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tPM1 Event Block Length : 04\n"); + EFPRINTF(fp, "[0001]\t\tPM1 Control Block Length : 02\n"); + EFPRINTF(fp, "[0001]\t\tPM2 Control Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tPM Timer Block Length : 04\n"); + EFPRINTF(fp, "[0001]\t\tGPE0 Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tGPE1 Block Length : 00\n"); + EFPRINTF(fp, "[0001]\t\tGPE1 Base Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\t_CST Support : 00\n"); + EFPRINTF(fp, "[0002]\t\tC2 Latency : 0000\n"); + EFPRINTF(fp, "[0002]\t\tC3 Latency : 0000\n"); + EFPRINTF(fp, "[0002]\t\tCPU Cache Size : 0000\n"); + EFPRINTF(fp, "[0002]\t\tCache Flush Stride : 0000\n"); + EFPRINTF(fp, "[0001]\t\tDuty Cycle Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tDuty Cycle Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Day Alarm Index : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Month Alarm Index : 00\n"); + EFPRINTF(fp, "[0001]\t\tRTC Century Index : 32\n"); + EFPRINTF(fp, "[0002]\t\tBoot Flags (decoded below) : 0000\n"); + EFPRINTF(fp, "\t\t\tLegacy Devices Supported (V2) : 0\n"); + EFPRINTF(fp, "\t\t\t8042 Present on ports 60/64 (V2) : 0\n"); + EFPRINTF(fp, "\t\t\tVGA Not Present (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tMSI Not Supported (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tPCIe ASPM Not Supported (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tCMOS RTC Not Present (V5) : 0\n"); + EFPRINTF(fp, "[0001]\t\tReserved : 00\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\tWBINVD instruction is operational (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tWBINVD flushes all caches (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tAll CPUs support C1 (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tC2 works on MP system (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tControl Method Power Button (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tControl Method Sleep Button (V1) : 1\n"); + EFPRINTF(fp, "\t\t\tRTC wake not in fixed reg space (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tRTC can wake system from S4 (V1) : 0\n"); + EFPRINTF(fp, "\t\t\t32-bit PM Timer (V1) : 1\n"); + EFPRINTF(fp, 
"\t\t\tDocking Supported (V1) : 0\n"); + EFPRINTF(fp, "\t\t\tReset Register Supported (V2) : 1\n"); + EFPRINTF(fp, "\t\t\tSealed Case (V3) : 0\n"); + EFPRINTF(fp, "\t\t\tHeadless - No Video (V3) : 1\n"); + EFPRINTF(fp, "\t\t\tUse native instr after SLP_TYPx (V3) : 0\n"); + EFPRINTF(fp, "\t\t\tPCIEXP_WAK Bits Supported (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse Platform Timer (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tRTC_STS valid on S4 wake (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tRemote Power-on capable (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse APIC Cluster Model (V4) : 0\n"); + EFPRINTF(fp, "\t\t\tUse APIC Physical Destination Mode (V4) : 1\n"); + EFPRINTF(fp, "\t\t\tHardware Reduced (V5) : 0\n"); + EFPRINTF(fp, "\t\t\tLow Power S0 Idle (V5) : 0\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tReset Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000CF9\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tValue to cause reset : 06\n"); + EFPRINTF(fp, "[0002]\t\tARM Flags (decoded below): 0000\n"); + EFPRINTF(fp, "\t\t\tPSCI Compliant : 0\n"); + EFPRINTF(fp, "\t\t\tMust use HVC for PSCI : 0\n"); + EFPRINTF(fp, "[0001]\t\tFADT Minor Revision : 01\n"); + EFPRINTF(fp, "[0008]\t\tFACS Address : 00000000%08X\n", + basl_acpi_base + FACS_OFFSET); + EFPRINTF(fp, "[0008]\t\tDSDT Address : 00000000%08X\n", + basl_acpi_base + DSDT_OFFSET); + EFPRINTF(fp, + "[0012]\t\tPM1A Event Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 20\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + PM1A_EVT_ADDR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1B Event Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1A Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 10\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 02 [Word Access:16]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + PM1A_CNT_ADDR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM1B Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tPM2 Control Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); 
+ EFPRINTF(fp, "\n"); + + /* Valid for bhyve */ + EFPRINTF(fp, + "[0012]\t\tPM Timer Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 20\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 03 [DWord Access:32]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000%08X\n", + IO_PMTMR); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0012]\t\tGPE0 Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0012]\t\tGPE1 Block : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tSleep Control Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, + "[0012]\t\tSleep Status Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 01 [SystemIO]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 08\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, "[0001]\t\tEncoded Access Width : 01 [Byte Access:8]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 0000000000000000\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_fwrite_hpet(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve HPET template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"HPET\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVHPET \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0004]\t\tTimer Block ID : %08X\n", hpet_capabilities); + EFPRINTF(fp, + "[0012]\t\tTimer Block Register : [Generic Address Structure]\n"); + EFPRINTF(fp, "[0001]\t\tSpace ID : 00 [SystemMemory]\n"); + EFPRINTF(fp, "[0001]\t\tBit Width : 00\n"); + EFPRINTF(fp, "[0001]\t\tBit Offset : 00\n"); + EFPRINTF(fp, + "[0001]\t\tEncoded Access Width : 00 [Undefined/Legacy]\n"); + EFPRINTF(fp, "[0008]\t\tAddress : 00000000FED00000\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0001]\t\tHPET Number : 00\n"); + EFPRINTF(fp, "[0002]\t\tMinimum Clock Ticks : 0000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000001\n"); + EFPRINTF(fp, "\t\t\t4K Page Protect : 1\n"); + EFPRINTF(fp, "\t\t\t64K Page Protect : 0\n"); + EFPRINTF(fp, "\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + 
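+/*
+ * Each basl_fwrite_*() routine in this file follows the same shape: emit
+ * an iasl template for one table with EFPRINTF(), then EFFLUSH() and
+ * return 0, with any stdio failure surfacing as errno.  In outline:
+ *
+ *	EFPRINTF(fp, "[0004]\t\tSignature : \"XXXX\"\n");
+ *	...
+ *	EFFLUSH(fp);
+ *	return (0);
+ * err_exit:
+ *	return (errno);
+ */
+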
+static int +basl_fwrite_mcfg(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve MCFG template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"MCFG\"\n"); + EFPRINTF(fp, "[0004]\t\tTable Length : 00000000\n"); + EFPRINTF(fp, "[0001]\t\tRevision : 01\n"); + EFPRINTF(fp, "[0001]\t\tChecksum : 00\n"); + EFPRINTF(fp, "[0006]\t\tOem ID : \"BHYVE \"\n"); + EFPRINTF(fp, "[0008]\t\tOem Table ID : \"BVMCFG \"\n"); + EFPRINTF(fp, "[0004]\t\tOem Revision : 00000001\n"); + + /* iasl will fill in the compiler ID/revision fields */ + EFPRINTF(fp, "[0004]\t\tAsl Compiler ID : \"xxxx\"\n"); + EFPRINTF(fp, "[0004]\t\tAsl Compiler Revision : 00000000\n"); + EFPRINTF(fp, "[0008]\t\tReserved : 0\n"); + EFPRINTF(fp, "\n"); + + EFPRINTF(fp, "[0008]\t\tBase Address : %016lX\n", pci_ecfg_base()); + EFPRINTF(fp, "[0002]\t\tSegment Group: 0000\n"); + EFPRINTF(fp, "[0001]\t\tStart Bus: 00\n"); + EFPRINTF(fp, "[0001]\t\tEnd Bus: FF\n"); + EFPRINTF(fp, "[0004]\t\tReserved : 0\n"); + EFFLUSH(fp); + return (0); +err_exit: + return (errno); +} + +static int +basl_fwrite_facs(FILE *fp) +{ + EFPRINTF(fp, "/*\n"); + EFPRINTF(fp, " * bhyve FACS template\n"); + EFPRINTF(fp, " */\n"); + EFPRINTF(fp, "[0004]\t\tSignature : \"FACS\"\n"); + EFPRINTF(fp, "[0004]\t\tLength : 00000040\n"); + EFPRINTF(fp, "[0004]\t\tHardware Signature : 00000000\n"); + EFPRINTF(fp, "[0004]\t\t32 Firmware Waking Vector : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tGlobal Lock : 00000000\n"); + EFPRINTF(fp, "[0004]\t\tFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\tS4BIOS Support Present : 0\n"); + EFPRINTF(fp, "\t\t\t64-bit Wake Supported (V2) : 0\n"); + EFPRINTF(fp, + "[0008]\t\t64 Firmware Waking Vector : 0000000000000000\n"); + EFPRINTF(fp, "[0001]\t\tVersion : 02\n"); + EFPRINTF(fp, "[0003]\t\tReserved : 000000\n"); + EFPRINTF(fp, "[0004]\t\tOspmFlags (decoded below) : 00000000\n"); + EFPRINTF(fp, "\t\t\t64-bit Wake Env Required (V2) : 0\n"); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +/* + * Helper routines for writing to the DSDT from other modules. + */ +void +dsdt_line(const char *fmt, ...) 
+{ + va_list ap; + + if (dsdt_error != 0) + return; + + if (strcmp(fmt, "") != 0) { + if (dsdt_indent_level != 0) + EFPRINTF(dsdt_fp, "%*c", dsdt_indent_level * 2, ' '); + va_start(ap, fmt); + if (vfprintf(dsdt_fp, fmt, ap) < 0) { + va_end(ap); + goto err_exit; + } + va_end(ap); + } + EFPRINTF(dsdt_fp, "\n"); + return; + +err_exit: + dsdt_error = errno; +} + +void +dsdt_indent(int levels) +{ + + dsdt_indent_level += levels; + assert(dsdt_indent_level >= 0); +} + +void +dsdt_unindent(int levels) +{ + + assert(dsdt_indent_level >= levels); + dsdt_indent_level -= levels; +} + +void +dsdt_fixed_ioport(uint16_t iobase, uint16_t length) +{ + + dsdt_line("IO (Decode16,"); + dsdt_line(" 0x%04X, // Range Minimum", iobase); + dsdt_line(" 0x%04X, // Range Maximum", iobase); + dsdt_line(" 0x01, // Alignment"); + dsdt_line(" 0x%02X, // Length", length); + dsdt_line(" )"); +} + +void +dsdt_fixed_irq(uint8_t irq) +{ + + dsdt_line("IRQNoFlags ()"); + dsdt_line(" {%d}", irq); +} + +void +dsdt_fixed_mem32(uint32_t base, uint32_t length) +{ + + dsdt_line("Memory32Fixed (ReadWrite,"); + dsdt_line(" 0x%08X, // Address Base", base); + dsdt_line(" 0x%08X, // Address Length", length); + dsdt_line(" )"); +} + +static int +basl_fwrite_dsdt(FILE *fp) +{ + dsdt_fp = fp; + dsdt_error = 0; + dsdt_indent_level = 0; + + dsdt_line("/*"); + dsdt_line(" * bhyve DSDT template"); + dsdt_line(" */"); + dsdt_line("DefinitionBlock (\"bhyve_dsdt.aml\", \"DSDT\", 2," + "\"BHYVE \", \"BVDSDT \", 0x00000001)"); + dsdt_line("{"); + dsdt_line(" Name (_S5, Package ()"); + dsdt_line(" {"); + dsdt_line(" 0x05,"); + dsdt_line(" Zero,"); + dsdt_line(" })"); + + pci_write_dsdt(); + + dsdt_line(""); + dsdt_line(" Scope (_SB.PC00)"); + dsdt_line(" {"); + dsdt_line(" Device (HPET)"); + dsdt_line(" {"); + dsdt_line(" Name (_HID, EISAID(\"PNP0103\"))"); + dsdt_line(" Name (_UID, 0)"); + dsdt_line(" Name (_CRS, ResourceTemplate ()"); + dsdt_line(" {"); + dsdt_indent(4); + dsdt_fixed_mem32(0xFED00000, 0x400); + dsdt_unindent(4); + dsdt_line(" })"); + dsdt_line(" }"); + dsdt_line(" }"); + dsdt_line("}"); + + if (dsdt_error != 0) + return (dsdt_error); + + EFFLUSH(fp); + + return (0); + +err_exit: + return (errno); +} + +static int +basl_open(struct basl_fio *bf, int suffix) +{ + int err; + + err = 0; + + if (suffix) { + strlcpy(bf->f_name, basl_stemplate, MAXPATHLEN); + bf->fd = mkstemps(bf->f_name, strlen(BHYVE_ASL_SUFFIX)); + } else { + strlcpy(bf->f_name, basl_template, MAXPATHLEN); + bf->fd = mkstemp(bf->f_name); + } + + if (bf->fd > 0) { + bf->fp = fdopen(bf->fd, "w+"); + if (bf->fp == NULL) { + unlink(bf->f_name); + close(bf->fd); + } + } else { + err = 1; + } + + return (err); +} + +static void +basl_close(struct basl_fio *bf) +{ + + if (!basl_keep_temps) + unlink(bf->f_name); + fclose(bf->fp); +} + +static int +basl_start(struct basl_fio *in, struct basl_fio *out) +{ + int err; + + err = basl_open(in, 0); + if (!err) { + err = basl_open(out, 1); + if (err) { + basl_close(in); + } + } + + return (err); +} + +static void +basl_end(struct basl_fio *in, struct basl_fio *out) +{ + + basl_close(in); + basl_close(out); +} + +static int +basl_load(struct vmctx *ctx, int fd, uint64_t off) +{ + struct stat sb; + void *gaddr; + + if (fstat(fd, &sb) < 0) + return (errno); + + gaddr = paddr_guest2host(ctx, basl_acpi_base + off, sb.st_size); + if (gaddr == NULL) + return (EFAULT); + + if (read(fd, gaddr, sb.st_size) < 0) + return (errno); + + return (0); +} + +static int +basl_compile(struct vmctx *ctx, int (*fwrite_section)(FILE *), uint64_t offset) +{ 
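+	/*
+	 * In outline: write the section to a temporary template file, run
+	 * the iasl compiler on it, and copy the resulting AML into guest
+	 * memory at basl_acpi_base + offset.
+	 */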
+	struct basl_fio io[2];
+	static char iaslbuf[3*MAXPATHLEN + 10];
+	char *fmt;
+	int err;
+
+	err = basl_start(&io[0], &io[1]);
+	if (!err) {
+		err = (*fwrite_section)(io[0].fp);
+
+		if (!err) {
+			/*
+			 * iasl sends the results of the compilation to
+			 * stdout. Shut this down by using the shell to
+			 * redirect stdout to /dev/null, unless the user
+			 * has requested verbose output for debugging
+			 * purposes
+			 */
+			fmt = basl_verbose_iasl ?
+			    "%s -p %s %s" :
+			    "/bin/sh -c \"%s -p %s %s\" 1> /dev/null";
+
+			snprintf(iaslbuf, sizeof(iaslbuf),
+			    fmt,
+			    BHYVE_ASL_COMPILER,
+			    io[1].f_name, io[0].f_name);
+			err = system(iaslbuf);
+
+			if (!err) {
+				/*
+				 * Copy the aml output file into guest
+				 * memory at the specified location
+				 */
+				err = basl_load(ctx, io[1].fd, offset);
+			}
+		}
+		basl_end(&io[0], &io[1]);
+	}
+
+	return (err);
+}
+
+static int
+basl_make_templates(void)
+{
+	const char *tmpdir;
+	int err;
+	int len;
+
+	err = 0;
+
+	/*
+	 * Locate the directory for the temporary template files: prefer
+	 * $BHYVE_TMPDIR, then $TMPDIR, falling back to _PATH_TMP.
+	 */
+	if ((tmpdir = getenv("BHYVE_TMPDIR")) == NULL || *tmpdir == '\0' ||
+	    (tmpdir = getenv("TMPDIR")) == NULL || *tmpdir == '\0') {
+		tmpdir = _PATH_TMP;
+	}
+
+	len = strlen(tmpdir);
+
+	if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1) < MAXPATHLEN) {
+		strcpy(basl_template, tmpdir);
+		while (len > 0 && basl_template[len - 1] == '/')
+			len--;
+		basl_template[len] = '/';
+		strcpy(&basl_template[len + 1], BHYVE_ASL_TEMPLATE);
+	} else
+		err = E2BIG;
+
+	if (!err) {
+		/*
+		 * len has been initialized (and maybe adjusted) above
+		 */
+		if ((len + sizeof(BHYVE_ASL_TEMPLATE) + 1 +
+		    sizeof(BHYVE_ASL_SUFFIX)) < MAXPATHLEN) {
+			strcpy(basl_stemplate, tmpdir);
+			basl_stemplate[len] = '/';
+			strcpy(&basl_stemplate[len + 1], BHYVE_ASL_TEMPLATE);
+			len = strlen(basl_stemplate);
+			strcpy(&basl_stemplate[len], BHYVE_ASL_SUFFIX);
+		} else
+			err = E2BIG;
+	}
+
+	return (err);
+}
+
+static struct {
+	int (*wsect)(FILE *fp);
+	uint64_t offset;
+} basl_ftables[] =
+{
+	{ basl_fwrite_rsdp, 0},
+	{ basl_fwrite_rsdt, RSDT_OFFSET },
+	{ basl_fwrite_xsdt, XSDT_OFFSET },
+	{ basl_fwrite_madt, MADT_OFFSET },
+	{ basl_fwrite_fadt, FADT_OFFSET },
+	{ basl_fwrite_hpet, HPET_OFFSET },
+	{ basl_fwrite_mcfg, MCFG_OFFSET },
+	{ basl_fwrite_facs, FACS_OFFSET },
+	{ basl_fwrite_dsdt, DSDT_OFFSET },
+	{ NULL }
+};
+
+int
+acpi_build(struct vmctx *ctx, int ncpu)
+{
+	int err;
+	int i;
+
+	basl_ncpu = ncpu;
+
+	err = vm_get_hpet_capabilities(ctx, &hpet_capabilities);
+	if (err != 0)
+		return (err);
+
+	/*
+	 * For debug, allow the user to have iasl compiler output sent
+	 * to stdout rather than /dev/null
+	 */
+	if (getenv("BHYVE_ACPI_VERBOSE_IASL"))
+		basl_verbose_iasl = 1;
+
+	/*
+	 * Allow the user to keep the generated ASL files for debugging
+	 * instead of deleting them following use
+	 */
+	if (getenv("BHYVE_ACPI_KEEPTMPS"))
+		basl_keep_temps = 1;
+
+	i = 0;
+	err = basl_make_templates();
+
+	/*
+	 * Run through all the ASL files, compiling them and
+	 * copying them into guest memory
+	 */
+	while (!err && basl_ftables[i].wsect != NULL) {
+		err = basl_compile(ctx, basl_ftables[i].wsect,
+		    basl_ftables[i].offset);
+		i++;
+	}
+
+	return (err);
+}
diff --git a/usr/src/cmd/bhyve/acpi.h b/usr/src/cmd/bhyve/acpi.h
index 477f827286..4c6d86d091 100644
--- a/usr/src/cmd/bhyve/acpi.h
+++ b/usr/src/cmd/bhyve/acpi.h
@@ -1,4 +1,6 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
  * Copyright (c) 2012 NetApp, Inc.
  * All rights reserved.
  *
@@ -23,7 +25,7 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* - * $FreeBSD: head/usr.sbin/bhyve/acpi.h 266125 2014-05-15 14:16:55Z jhb $ + * $FreeBSD$ */ #ifndef _ACPI_H_ diff --git a/usr/src/cmd/bhyve/ahci.h b/usr/src/cmd/bhyve/ahci.h index 1cf09adcbf..691d4bd438 100644 --- a/usr/src/cmd/bhyve/ahci.h +++ b/usr/src/cmd/bhyve/ahci.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org> * Copyright (c) 2009-2012 Alexander Motin <mav@FreeBSD.org> * All rights reserved. @@ -24,281 +26,299 @@ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/ahci.h 256056 2013-10-04 18:31:38Z grehan $ + * $FreeBSD$ */ #ifndef _AHCI_H_ #define _AHCI_H_ /* ATA register defines */ -#define ATA_DATA 0 /* (RW) data */ - -#define ATA_FEATURE 1 /* (W) feature */ -#define ATA_F_DMA 0x01 /* enable DMA */ -#define ATA_F_OVL 0x02 /* enable overlap */ - -#define ATA_COUNT 2 /* (W) sector count */ - -#define ATA_SECTOR 3 /* (RW) sector # */ -#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */ -#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */ -#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */ -#define ATA_D_LBA 0x40 /* use LBA addressing */ -#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */ - -#define ATA_COMMAND 7 /* (W) command */ - -#define ATA_ERROR 8 /* (R) error */ -#define ATA_E_ILI 0x01 /* illegal length */ -#define ATA_E_NM 0x02 /* no media */ -#define ATA_E_ABORT 0x04 /* command aborted */ -#define ATA_E_MCR 0x08 /* media change request */ -#define ATA_E_IDNF 0x10 /* ID not found */ -#define ATA_E_MC 0x20 /* media changed */ -#define ATA_E_UNC 0x40 /* uncorrectable data */ -#define ATA_E_ICRC 0x80 /* UDMA crc error */ -#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */ - -#define ATA_IREASON 9 /* (R) interrupt reason */ -#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */ -#define ATA_I_IN 0x02 /* read (1) | write (0) */ -#define ATA_I_RELEASE 0x04 /* released bus (1) */ -#define ATA_I_TAGMASK 0xf8 /* tag mask */ - -#define ATA_STATUS 10 /* (R) status */ -#define ATA_ALTSTAT 11 /* (R) alternate status */ -#define ATA_S_ERROR 0x01 /* error */ -#define ATA_S_INDEX 0x02 /* index */ -#define ATA_S_CORR 0x04 /* data corrected */ -#define ATA_S_DRQ 0x08 /* data request */ -#define ATA_S_DSC 0x10 /* drive seek completed */ -#define ATA_S_SERVICE 0x10 /* drive needs service */ -#define ATA_S_DWF 0x20 /* drive write fault */ -#define ATA_S_DMA 0x20 /* DMA ready */ -#define ATA_S_READY 0x40 /* drive ready */ -#define ATA_S_BUSY 0x80 /* busy */ - -#define ATA_CONTROL 12 /* (W) control */ -#define ATA_A_IDS 0x02 /* disable interrupts */ -#define ATA_A_RESET 0x04 /* RESET controller */ -#define ATA_A_4BIT 0x08 /* 4 head bits */ -#define ATA_A_HOB 0x80 /* High Order Byte enable */ +#define ATA_DATA 0 /* (RW) data */ + +#define ATA_FEATURE 1 /* (W) feature */ +#define ATA_F_DMA 0x01 /* enable DMA */ +#define ATA_F_OVL 0x02 /* enable overlap */ + +#define ATA_COUNT 2 /* (W) sector count */ + +#define ATA_SECTOR 3 /* (RW) sector # */ +#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */ +#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */ +#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */ +#define ATA_D_LBA 0x40 /* use LBA addressing */ +#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */ + +#define ATA_COMMAND 7 /* (W) command */ + +#define ATA_ERROR 8 /* (R) error */ +#define ATA_E_ILI 0x01 /* illegal length */ +#define ATA_E_NM 0x02 /* no media */ +#define ATA_E_ABORT 0x04 /* command aborted */ 
+#define ATA_E_MCR 0x08 /* media change request */ +#define ATA_E_IDNF 0x10 /* ID not found */ +#define ATA_E_MC 0x20 /* media changed */ +#define ATA_E_UNC 0x40 /* uncorrectable data */ +#define ATA_E_ICRC 0x80 /* UDMA crc error */ +#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */ + +#define ATA_IREASON 9 /* (R) interrupt reason */ +#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */ +#define ATA_I_IN 0x02 /* read (1) | write (0) */ +#define ATA_I_RELEASE 0x04 /* released bus (1) */ +#define ATA_I_TAGMASK 0xf8 /* tag mask */ + +#define ATA_STATUS 10 /* (R) status */ +#define ATA_ALTSTAT 11 /* (R) alternate status */ +#define ATA_S_ERROR 0x01 /* error */ +#define ATA_S_INDEX 0x02 /* index */ +#define ATA_S_CORR 0x04 /* data corrected */ +#define ATA_S_DRQ 0x08 /* data request */ +#define ATA_S_DSC 0x10 /* drive seek completed */ +#define ATA_S_SERVICE 0x10 /* drive needs service */ +#define ATA_S_DWF 0x20 /* drive write fault */ +#define ATA_S_DMA 0x20 /* DMA ready */ +#define ATA_S_READY 0x40 /* drive ready */ +#define ATA_S_BUSY 0x80 /* busy */ + +#define ATA_CONTROL 12 /* (W) control */ +#define ATA_A_IDS 0x02 /* disable interrupts */ +#define ATA_A_RESET 0x04 /* RESET controller */ +#define ATA_A_4BIT 0x08 /* 4 head bits */ +#define ATA_A_HOB 0x80 /* High Order Byte enable */ /* SATA register defines */ -#define ATA_SSTATUS 13 -#define ATA_SS_DET_MASK 0x0000000f -#define ATA_SS_DET_NO_DEVICE 0x00000000 -#define ATA_SS_DET_DEV_PRESENT 0x00000001 -#define ATA_SS_DET_PHY_ONLINE 0x00000003 -#define ATA_SS_DET_PHY_OFFLINE 0x00000004 - -#define ATA_SS_SPD_MASK 0x000000f0 -#define ATA_SS_SPD_NO_SPEED 0x00000000 -#define ATA_SS_SPD_GEN1 0x00000010 -#define ATA_SS_SPD_GEN2 0x00000020 -#define ATA_SS_SPD_GEN3 0x00000040 - -#define ATA_SS_IPM_MASK 0x00000f00 -#define ATA_SS_IPM_NO_DEVICE 0x00000000 -#define ATA_SS_IPM_ACTIVE 0x00000100 -#define ATA_SS_IPM_PARTIAL 0x00000200 -#define ATA_SS_IPM_SLUMBER 0x00000600 - -#define ATA_SERROR 14 -#define ATA_SE_DATA_CORRECTED 0x00000001 -#define ATA_SE_COMM_CORRECTED 0x00000002 -#define ATA_SE_DATA_ERR 0x00000100 -#define ATA_SE_COMM_ERR 0x00000200 -#define ATA_SE_PROT_ERR 0x00000400 -#define ATA_SE_HOST_ERR 0x00000800 -#define ATA_SE_PHY_CHANGED 0x00010000 -#define ATA_SE_PHY_IERROR 0x00020000 -#define ATA_SE_COMM_WAKE 0x00040000 -#define ATA_SE_DECODE_ERR 0x00080000 -#define ATA_SE_PARITY_ERR 0x00100000 -#define ATA_SE_CRC_ERR 0x00200000 -#define ATA_SE_HANDSHAKE_ERR 0x00400000 -#define ATA_SE_LINKSEQ_ERR 0x00800000 -#define ATA_SE_TRANSPORT_ERR 0x01000000 -#define ATA_SE_UNKNOWN_FIS 0x02000000 -#define ATA_SE_EXCHANGED 0x04000000 - -#define ATA_SCONTROL 15 -#define ATA_SC_DET_MASK 0x0000000f -#define ATA_SC_DET_IDLE 0x00000000 -#define ATA_SC_DET_RESET 0x00000001 -#define ATA_SC_DET_DISABLE 0x00000004 - -#define ATA_SC_SPD_MASK 0x000000f0 -#define ATA_SC_SPD_NO_SPEED 0x00000000 -#define ATA_SC_SPD_SPEED_GEN1 0x00000010 -#define ATA_SC_SPD_SPEED_GEN2 0x00000020 -#define ATA_SC_SPD_SPEED_GEN3 0x00000040 - -#define ATA_SC_IPM_MASK 0x00000f00 -#define ATA_SC_IPM_NONE 0x00000000 -#define ATA_SC_IPM_DIS_PARTIAL 0x00000100 -#define ATA_SC_IPM_DIS_SLUMBER 0x00000200 - -#define ATA_SACTIVE 16 - -#define AHCI_MAX_PORTS 32 -#define AHCI_MAX_SLOTS 32 +#define ATA_SSTATUS 13 +#define ATA_SS_DET_MASK 0x0000000f +#define ATA_SS_DET_NO_DEVICE 0x00000000 +#define ATA_SS_DET_DEV_PRESENT 0x00000001 +#define ATA_SS_DET_PHY_ONLINE 0x00000003 +#define ATA_SS_DET_PHY_OFFLINE 0x00000004 + +#define ATA_SS_SPD_MASK 0x000000f0 +#define ATA_SS_SPD_NO_SPEED 0x00000000 
+#define ATA_SS_SPD_GEN1 0x00000010 +#define ATA_SS_SPD_GEN2 0x00000020 +#define ATA_SS_SPD_GEN3 0x00000030 + +#define ATA_SS_IPM_MASK 0x00000f00 +#define ATA_SS_IPM_NO_DEVICE 0x00000000 +#define ATA_SS_IPM_ACTIVE 0x00000100 +#define ATA_SS_IPM_PARTIAL 0x00000200 +#define ATA_SS_IPM_SLUMBER 0x00000600 +#define ATA_SS_IPM_DEVSLEEP 0x00000800 + +#define ATA_SERROR 14 +#define ATA_SE_DATA_CORRECTED 0x00000001 +#define ATA_SE_COMM_CORRECTED 0x00000002 +#define ATA_SE_DATA_ERR 0x00000100 +#define ATA_SE_COMM_ERR 0x00000200 +#define ATA_SE_PROT_ERR 0x00000400 +#define ATA_SE_HOST_ERR 0x00000800 +#define ATA_SE_PHY_CHANGED 0x00010000 +#define ATA_SE_PHY_IERROR 0x00020000 +#define ATA_SE_COMM_WAKE 0x00040000 +#define ATA_SE_DECODE_ERR 0x00080000 +#define ATA_SE_PARITY_ERR 0x00100000 +#define ATA_SE_CRC_ERR 0x00200000 +#define ATA_SE_HANDSHAKE_ERR 0x00400000 +#define ATA_SE_LINKSEQ_ERR 0x00800000 +#define ATA_SE_TRANSPORT_ERR 0x01000000 +#define ATA_SE_UNKNOWN_FIS 0x02000000 +#define ATA_SE_EXCHANGED 0x04000000 + +#define ATA_SCONTROL 15 +#define ATA_SC_DET_MASK 0x0000000f +#define ATA_SC_DET_IDLE 0x00000000 +#define ATA_SC_DET_RESET 0x00000001 +#define ATA_SC_DET_DISABLE 0x00000004 + +#define ATA_SC_SPD_MASK 0x000000f0 +#define ATA_SC_SPD_NO_SPEED 0x00000000 +#define ATA_SC_SPD_SPEED_GEN1 0x00000010 +#define ATA_SC_SPD_SPEED_GEN2 0x00000020 +#define ATA_SC_SPD_SPEED_GEN3 0x00000030 + +#define ATA_SC_IPM_MASK 0x00000f00 +#define ATA_SC_IPM_NONE 0x00000000 +#define ATA_SC_IPM_DIS_PARTIAL 0x00000100 +#define ATA_SC_IPM_DIS_SLUMBER 0x00000200 +#define ATA_SC_IPM_DIS_DEVSLEEP 0x00000400 + +#define ATA_SACTIVE 16 + +#define AHCI_MAX_PORTS 32 +#define AHCI_MAX_SLOTS 32 +#define AHCI_MAX_IRQS 16 /* SATA AHCI v1.0 register defines */ -#define AHCI_CAP 0x00 -#define AHCI_CAP_NPMASK 0x0000001f -#define AHCI_CAP_SXS 0x00000020 -#define AHCI_CAP_EMS 0x00000040 -#define AHCI_CAP_CCCS 0x00000080 -#define AHCI_CAP_NCS 0x00001F00 -#define AHCI_CAP_NCS_SHIFT 8 -#define AHCI_CAP_PSC 0x00002000 -#define AHCI_CAP_SSC 0x00004000 -#define AHCI_CAP_PMD 0x00008000 -#define AHCI_CAP_FBSS 0x00010000 -#define AHCI_CAP_SPM 0x00020000 -#define AHCI_CAP_SAM 0x00080000 -#define AHCI_CAP_ISS 0x00F00000 -#define AHCI_CAP_ISS_SHIFT 20 -#define AHCI_CAP_SCLO 0x01000000 -#define AHCI_CAP_SAL 0x02000000 -#define AHCI_CAP_SALP 0x04000000 -#define AHCI_CAP_SSS 0x08000000 -#define AHCI_CAP_SMPS 0x10000000 -#define AHCI_CAP_SSNTF 0x20000000 -#define AHCI_CAP_SNCQ 0x40000000 -#define AHCI_CAP_64BIT 0x80000000 - -#define AHCI_GHC 0x04 -#define AHCI_GHC_AE 0x80000000 -#define AHCI_GHC_MRSM 0x00000004 -#define AHCI_GHC_IE 0x00000002 -#define AHCI_GHC_HR 0x00000001 - -#define AHCI_IS 0x08 -#define AHCI_PI 0x0c -#define AHCI_VS 0x10 - -#define AHCI_CCCC 0x14 -#define AHCI_CCCC_TV_MASK 0xffff0000 -#define AHCI_CCCC_TV_SHIFT 16 -#define AHCI_CCCC_CC_MASK 0x0000ff00 -#define AHCI_CCCC_CC_SHIFT 8 -#define AHCI_CCCC_INT_MASK 0x000000f8 -#define AHCI_CCCC_INT_SHIFT 3 -#define AHCI_CCCC_EN 0x00000001 -#define AHCI_CCCP 0x18 - -#define AHCI_EM_LOC 0x1C -#define AHCI_EM_CTL 0x20 -#define AHCI_EM_MR 0x00000001 -#define AHCI_EM_TM 0x00000100 -#define AHCI_EM_RST 0x00000200 -#define AHCI_EM_LED 0x00010000 -#define AHCI_EM_SAFTE 0x00020000 -#define AHCI_EM_SES2 0x00040000 -#define AHCI_EM_SGPIO 0x00080000 -#define AHCI_EM_SMB 0x01000000 -#define AHCI_EM_XMT 0x02000000 -#define AHCI_EM_ALHD 0x04000000 -#define AHCI_EM_PM 0x08000000 - -#define AHCI_CAP2 0x24 -#define AHCI_CAP2_BOH 0x00000001 -#define AHCI_CAP2_NVMP 0x00000002 -#define AHCI_CAP2_APST 
0x00000004 - -#define AHCI_OFFSET 0x100 -#define AHCI_STEP 0x80 - -#define AHCI_P_CLB 0x00 -#define AHCI_P_CLBU 0x04 -#define AHCI_P_FB 0x08 -#define AHCI_P_FBU 0x0c -#define AHCI_P_IS 0x10 -#define AHCI_P_IE 0x14 -#define AHCI_P_IX_DHR 0x00000001 -#define AHCI_P_IX_PS 0x00000002 -#define AHCI_P_IX_DS 0x00000004 -#define AHCI_P_IX_SDB 0x00000008 -#define AHCI_P_IX_UF 0x00000010 -#define AHCI_P_IX_DP 0x00000020 -#define AHCI_P_IX_PC 0x00000040 -#define AHCI_P_IX_MP 0x00000080 - -#define AHCI_P_IX_PRC 0x00400000 -#define AHCI_P_IX_IPM 0x00800000 -#define AHCI_P_IX_OF 0x01000000 -#define AHCI_P_IX_INF 0x04000000 -#define AHCI_P_IX_IF 0x08000000 -#define AHCI_P_IX_HBD 0x10000000 -#define AHCI_P_IX_HBF 0x20000000 -#define AHCI_P_IX_TFE 0x40000000 -#define AHCI_P_IX_CPD 0x80000000 - -#define AHCI_P_CMD 0x18 -#define AHCI_P_CMD_ST 0x00000001 -#define AHCI_P_CMD_SUD 0x00000002 -#define AHCI_P_CMD_POD 0x00000004 -#define AHCI_P_CMD_CLO 0x00000008 -#define AHCI_P_CMD_FRE 0x00000010 -#define AHCI_P_CMD_CCS_MASK 0x00001f00 -#define AHCI_P_CMD_CCS_SHIFT 8 -#define AHCI_P_CMD_ISS 0x00002000 -#define AHCI_P_CMD_FR 0x00004000 -#define AHCI_P_CMD_CR 0x00008000 -#define AHCI_P_CMD_CPS 0x00010000 -#define AHCI_P_CMD_PMA 0x00020000 -#define AHCI_P_CMD_HPCP 0x00040000 -#define AHCI_P_CMD_MPSP 0x00080000 -#define AHCI_P_CMD_CPD 0x00100000 -#define AHCI_P_CMD_ESP 0x00200000 -#define AHCI_P_CMD_FBSCP 0x00400000 -#define AHCI_P_CMD_APSTE 0x00800000 -#define AHCI_P_CMD_ATAPI 0x01000000 -#define AHCI_P_CMD_DLAE 0x02000000 -#define AHCI_P_CMD_ALPE 0x04000000 -#define AHCI_P_CMD_ASP 0x08000000 -#define AHCI_P_CMD_ICC_MASK 0xf0000000 -#define AHCI_P_CMD_NOOP 0x00000000 -#define AHCI_P_CMD_ACTIVE 0x10000000 -#define AHCI_P_CMD_PARTIAL 0x20000000 -#define AHCI_P_CMD_SLUMBER 0x60000000 - -#define AHCI_P_TFD 0x20 -#define AHCI_P_SIG 0x24 -#define AHCI_P_SSTS 0x28 -#define AHCI_P_SCTL 0x2c -#define AHCI_P_SERR 0x30 -#define AHCI_P_SACT 0x34 -#define AHCI_P_CI 0x38 -#define AHCI_P_SNTF 0x3C -#define AHCI_P_FBS 0x40 -#define AHCI_P_FBS_EN 0x00000001 -#define AHCI_P_FBS_DEC 0x00000002 -#define AHCI_P_FBS_SDE 0x00000004 -#define AHCI_P_FBS_DEV 0x00000f00 -#define AHCI_P_FBS_DEV_SHIFT 8 -#define AHCI_P_FBS_ADO 0x0000f000 -#define AHCI_P_FBS_ADO_SHIFT 12 -#define AHCI_P_FBS_DWE 0x000f0000 -#define AHCI_P_FBS_DWE_SHIFT 16 +#define AHCI_CAP 0x00 +#define AHCI_CAP_NPMASK 0x0000001f +#define AHCI_CAP_SXS 0x00000020 +#define AHCI_CAP_EMS 0x00000040 +#define AHCI_CAP_CCCS 0x00000080 +#define AHCI_CAP_NCS 0x00001F00 +#define AHCI_CAP_NCS_SHIFT 8 +#define AHCI_CAP_PSC 0x00002000 +#define AHCI_CAP_SSC 0x00004000 +#define AHCI_CAP_PMD 0x00008000 +#define AHCI_CAP_FBSS 0x00010000 +#define AHCI_CAP_SPM 0x00020000 +#define AHCI_CAP_SAM 0x00080000 +#define AHCI_CAP_ISS 0x00F00000 +#define AHCI_CAP_ISS_SHIFT 20 +#define AHCI_CAP_SCLO 0x01000000 +#define AHCI_CAP_SAL 0x02000000 +#define AHCI_CAP_SALP 0x04000000 +#define AHCI_CAP_SSS 0x08000000 +#define AHCI_CAP_SMPS 0x10000000 +#define AHCI_CAP_SSNTF 0x20000000 +#define AHCI_CAP_SNCQ 0x40000000 +#define AHCI_CAP_64BIT 0x80000000 + +#define AHCI_GHC 0x04 +#define AHCI_GHC_AE 0x80000000 +#define AHCI_GHC_MRSM 0x00000004 +#define AHCI_GHC_IE 0x00000002 +#define AHCI_GHC_HR 0x00000001 + +#define AHCI_IS 0x08 +#define AHCI_PI 0x0c +#define AHCI_VS 0x10 + +#define AHCI_CCCC 0x14 +#define AHCI_CCCC_TV_MASK 0xffff0000 +#define AHCI_CCCC_TV_SHIFT 16 +#define AHCI_CCCC_CC_MASK 0x0000ff00 +#define AHCI_CCCC_CC_SHIFT 8 +#define AHCI_CCCC_INT_MASK 0x000000f8 +#define AHCI_CCCC_INT_SHIFT 3 +#define 
AHCI_CCCC_EN 0x00000001 +#define AHCI_CCCP 0x18 + +#define AHCI_EM_LOC 0x1C +#define AHCI_EM_CTL 0x20 +#define AHCI_EM_MR 0x00000001 +#define AHCI_EM_TM 0x00000100 +#define AHCI_EM_RST 0x00000200 +#define AHCI_EM_LED 0x00010000 +#define AHCI_EM_SAFTE 0x00020000 +#define AHCI_EM_SES2 0x00040000 +#define AHCI_EM_SGPIO 0x00080000 +#define AHCI_EM_SMB 0x01000000 +#define AHCI_EM_XMT 0x02000000 +#define AHCI_EM_ALHD 0x04000000 +#define AHCI_EM_PM 0x08000000 + +#define AHCI_CAP2 0x24 +#define AHCI_CAP2_BOH 0x00000001 +#define AHCI_CAP2_NVMP 0x00000002 +#define AHCI_CAP2_APST 0x00000004 +#define AHCI_CAP2_SDS 0x00000008 +#define AHCI_CAP2_SADM 0x00000010 +#define AHCI_CAP2_DESO 0x00000020 + +#define AHCI_OFFSET 0x100 +#define AHCI_STEP 0x80 + +#define AHCI_P_CLB 0x00 +#define AHCI_P_CLBU 0x04 +#define AHCI_P_FB 0x08 +#define AHCI_P_FBU 0x0c +#define AHCI_P_IS 0x10 +#define AHCI_P_IE 0x14 +#define AHCI_P_IX_DHR 0x00000001 +#define AHCI_P_IX_PS 0x00000002 +#define AHCI_P_IX_DS 0x00000004 +#define AHCI_P_IX_SDB 0x00000008 +#define AHCI_P_IX_UF 0x00000010 +#define AHCI_P_IX_DP 0x00000020 +#define AHCI_P_IX_PC 0x00000040 +#define AHCI_P_IX_MP 0x00000080 + +#define AHCI_P_IX_PRC 0x00400000 +#define AHCI_P_IX_IPM 0x00800000 +#define AHCI_P_IX_OF 0x01000000 +#define AHCI_P_IX_INF 0x04000000 +#define AHCI_P_IX_IF 0x08000000 +#define AHCI_P_IX_HBD 0x10000000 +#define AHCI_P_IX_HBF 0x20000000 +#define AHCI_P_IX_TFE 0x40000000 +#define AHCI_P_IX_CPD 0x80000000 + +#define AHCI_P_CMD 0x18 +#define AHCI_P_CMD_ST 0x00000001 +#define AHCI_P_CMD_SUD 0x00000002 +#define AHCI_P_CMD_POD 0x00000004 +#define AHCI_P_CMD_CLO 0x00000008 +#define AHCI_P_CMD_FRE 0x00000010 +#define AHCI_P_CMD_CCS_MASK 0x00001f00 +#define AHCI_P_CMD_CCS_SHIFT 8 +#define AHCI_P_CMD_ISS 0x00002000 +#define AHCI_P_CMD_FR 0x00004000 +#define AHCI_P_CMD_CR 0x00008000 +#define AHCI_P_CMD_CPS 0x00010000 +#define AHCI_P_CMD_PMA 0x00020000 +#define AHCI_P_CMD_HPCP 0x00040000 +#define AHCI_P_CMD_MPSP 0x00080000 +#define AHCI_P_CMD_CPD 0x00100000 +#define AHCI_P_CMD_ESP 0x00200000 +#define AHCI_P_CMD_FBSCP 0x00400000 +#define AHCI_P_CMD_APSTE 0x00800000 +#define AHCI_P_CMD_ATAPI 0x01000000 +#define AHCI_P_CMD_DLAE 0x02000000 +#define AHCI_P_CMD_ALPE 0x04000000 +#define AHCI_P_CMD_ASP 0x08000000 +#define AHCI_P_CMD_ICC_MASK 0xf0000000 +#define AHCI_P_CMD_NOOP 0x00000000 +#define AHCI_P_CMD_ACTIVE 0x10000000 +#define AHCI_P_CMD_PARTIAL 0x20000000 +#define AHCI_P_CMD_SLUMBER 0x60000000 +#define AHCI_P_CMD_DEVSLEEP 0x80000000 + +#define AHCI_P_TFD 0x20 +#define AHCI_P_SIG 0x24 +#define AHCI_P_SSTS 0x28 +#define AHCI_P_SCTL 0x2c +#define AHCI_P_SERR 0x30 +#define AHCI_P_SACT 0x34 +#define AHCI_P_CI 0x38 +#define AHCI_P_SNTF 0x3C +#define AHCI_P_FBS 0x40 +#define AHCI_P_FBS_EN 0x00000001 +#define AHCI_P_FBS_DEC 0x00000002 +#define AHCI_P_FBS_SDE 0x00000004 +#define AHCI_P_FBS_DEV 0x00000f00 +#define AHCI_P_FBS_DEV_SHIFT 8 +#define AHCI_P_FBS_ADO 0x0000f000 +#define AHCI_P_FBS_ADO_SHIFT 12 +#define AHCI_P_FBS_DWE 0x000f0000 +#define AHCI_P_FBS_DWE_SHIFT 16 +#define AHCI_P_DEVSLP 0x44 +#define AHCI_P_DEVSLP_ADSE 0x00000001 +#define AHCI_P_DEVSLP_DSP 0x00000002 +#define AHCI_P_DEVSLP_DETO 0x000003fc +#define AHCI_P_DEVSLP_DETO_SHIFT 2 +#define AHCI_P_DEVSLP_MDAT 0x00007c00 +#define AHCI_P_DEVSLP_MDAT_SHIFT 10 +#define AHCI_P_DEVSLP_DITO 0x01ff8000 +#define AHCI_P_DEVSLP_DITO_SHIFT 15 +#define AHCI_P_DEVSLP_DM 0x0e000000 +#define AHCI_P_DEVSLP_DM_SHIFT 25 /* Just to be sure, if building as module. 
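The DEVSLP additions in the ahci.h hunk above pack several timing fields into one 32-bit port register; each field is described by a mask plus a shift, and decodes as (reg & MASK) >> SHIFT. A standalone sketch of that idiom, with two of the macro values duplicated so the snippet compiles on its own:

/* Sketch: decoding packed DEVSLP fields with mask/shift pairs. */
#include <stdint.h>
#include <stdio.h>

#define	P_DEVSLP_DETO		0x000003fc	/* mirrors AHCI_P_DEVSLP_DETO */
#define	P_DEVSLP_DETO_SHIFT	2
#define	P_DEVSLP_MDAT		0x00007c00	/* mirrors AHCI_P_DEVSLP_MDAT */
#define	P_DEVSLP_MDAT_SHIFT	10

int
main(void)
{
	uint32_t devslp = 0x00001badU;	/* example register value */

	printf("DETO %u, MDAT %u\n",
	    (unsigned)((devslp & P_DEVSLP_DETO) >> P_DEVSLP_DETO_SHIFT),
	    (unsigned)((devslp & P_DEVSLP_MDAT) >> P_DEVSLP_MDAT_SHIFT));
	return (0);
}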
*/ #if MAXPHYS < 512 * 1024 #undef MAXPHYS -#define MAXPHYS 512 * 1024 +#define MAXPHYS 512 * 1024 #endif /* Pessimistic prognosis on number of required S/G entries */ -#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8)) +#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8)) /* Command list. 32 commands. First, 1Kbyte aligned. */ -#define AHCI_CL_OFFSET 0 -#define AHCI_CL_SIZE 32 +#define AHCI_CL_OFFSET 0 +#define AHCI_CL_SIZE 32 /* Command tables. Up to 32 commands, Each, 128byte aligned. */ -#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS) -#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16) +#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS) +#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16) /* Total main work area. */ -#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots) +#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots) #endif /* _AHCI_H_ */ diff --git a/usr/src/cmd/bhyve/atkbdc.c b/usr/src/cmd/bhyve/atkbdc.c index 4d09d88266..1c1838c2e8 100644 --- a/usr/src/cmd/bhyve/atkbdc.c +++ b/usr/src/cmd/bhyve/atkbdc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. @@ -26,7 +28,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/atkbdc.c 267611 2014-06-18 17:20:02Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/types.h> @@ -45,6 +47,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/atkbdc.c 267611 2014-06-18 17:20:02Z nee #include <pthread_np.h> #include "acpi.h" +#include "atkbdc.h" #include "inout.h" #include "pci_emul.h" #include "pci_irq.h" @@ -99,19 +102,21 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/atkbdc.c 267611 2014-06-18 17:20:02Z nee #define KBDO_AUX_OUTFULL 0x20 #define RAMSZ 32 +#define FIFOSZ 15 +#define CTRL_CMD_FLAG 0x8000 struct kbd_dev { bool irq_active; int irq; - uint8_t buffer; + uint8_t buffer[FIFOSZ]; + int brd, bwr; + int bcnt; }; struct aux_dev { bool irq_active; int irq; - - uint8_t buffer; }; struct atkbdc_softc { @@ -126,6 +131,7 @@ struct atkbdc_softc { uint8_t ram[RAMSZ]; /* byte0 = controller config */ uint32_t curcmd; /* current command for next byte */ + uint32_t ctrlbyte; struct kbd_dev kbd; struct aux_dev aux; @@ -134,72 +140,37 @@ struct atkbdc_softc { static void atkbdc_assert_kbd_intr(struct atkbdc_softc *sc) { - if (!sc->kbd.irq_active && - (sc->ram[0] & KBD_ENABLE_KBD_INT) != 0) { + if ((sc->ram[0] & KBD_ENABLE_KBD_INT) != 0) { sc->kbd.irq_active = true; - vm_isa_assert_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); - } -} - -static void -atkbdc_deassert_kbd_intr(struct atkbdc_softc *sc) -{ - if (sc->kbd.irq_active) { - vm_isa_deassert_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); - sc->kbd.irq_active = false; + vm_isa_pulse_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq); } } static void atkbdc_assert_aux_intr(struct atkbdc_softc *sc) { - if (!sc->aux.irq_active && - (sc->ram[0] & KBD_ENABLE_AUX_INT) != 0) { + if ((sc->ram[0] & KBD_ENABLE_AUX_INT) != 0) { sc->aux.irq_active = true; - vm_isa_assert_irq(sc->ctx, sc->aux.irq, sc->aux.irq); + vm_isa_pulse_irq(sc->ctx, sc->aux.irq, sc->aux.irq); } } -static void -atkbdc_deassert_aux_intr(struct atkbdc_softc *sc) -{ - if (sc->aux.irq_active) { - vm_isa_deassert_irq(sc->ctx, sc->aux.irq, sc->aux.irq); - sc->aux.irq_active = false; - } -} - -static void -atkbdc_aux_queue_data(struct atkbdc_softc *sc, uint8_t val) -{ - assert(pthread_mutex_isowned_np(&sc->mtx)); - - 
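The atkbdc.c hunk beginning above replaces the keyboard's single buffered byte with a 15-entry ring: buffer[FIFOSZ] plus read/write cursors brd/bwr and a count bcnt, matching the 15-byte output buffer of a real 8042. A self-contained sketch of that discipline, with hypothetical names:

/* Sketch of the brd/bwr/bcnt ring discipline adopted by atkbdc. */
#include <stdint.h>
#include <stdbool.h>

#define	FIFOSZ	15

struct fifo {
	uint8_t	buf[FIFOSZ];
	int	rd, wr, cnt;
};

static bool
fifo_put(struct fifo *f, uint8_t v)
{
	if (f->cnt == FIFOSZ)
		return (false);		/* full: the byte is dropped, as atkbdc does */
	f->buf[f->wr] = v;
	f->wr = (f->wr + 1) % FIFOSZ;
	f->cnt++;
	return (true);
}

static bool
fifo_get(struct fifo *f, uint8_t *v)
{
	if (f->cnt == 0)
		return (false);
	*v = f->buf[f->rd];
	f->rd = (f->rd + 1) % FIFOSZ;
	f->cnt--;
	return (true);
}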
sc->aux.buffer = val; - sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); - sc->outport |= KBDO_AUX_OUTFULL; - atkbdc_assert_aux_intr(sc); -} - -static void +static int atkbdc_kbd_queue_data(struct atkbdc_softc *sc, uint8_t val) { assert(pthread_mutex_isowned_np(&sc->mtx)); - sc->kbd.buffer = val; - sc->status |= KBDS_KBD_BUFFER_FULL; - sc->outport |= KBDO_KBD_OUTFULL; - atkbdc_assert_kbd_intr(sc); -} - -static void -atkbdc_aux_read(struct atkbdc_softc *sc) -{ - uint8_t val; - - assert(pthread_mutex_isowned_np(&sc->mtx)); + if (sc->kbd.bcnt < FIFOSZ) { + sc->kbd.buffer[sc->kbd.bwr] = val; + sc->kbd.bwr = (sc->kbd.bwr + 1) % FIFOSZ; + sc->kbd.bcnt++; + sc->status |= KBDS_KBD_BUFFER_FULL; + sc->outport |= KBDO_KBD_OUTFULL; + } else { + printf("atkbd data buffer full\n"); + } - if (ps2mouse_read(sc->ps2mouse_sc, &val) != -1) - atkbdc_aux_queue_data(sc, val); + return (sc->kbd.bcnt < FIFOSZ); } static void @@ -252,21 +223,31 @@ atkbdc_kbd_read(struct atkbdc_softc *sc) } else { val = translation[val] | release; } - atkbdc_kbd_queue_data(sc, val); break; } } else { - if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) - atkbdc_kbd_queue_data(sc, val); + while (sc->kbd.bcnt < FIFOSZ) { + if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) + atkbdc_kbd_queue_data(sc, val); + else + break; + } } + + if (((sc->ram[0] & KBD_DISABLE_AUX_PORT) || + ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) && sc->kbd.bcnt > 0) + atkbdc_assert_kbd_intr(sc); } static void atkbdc_aux_poll(struct atkbdc_softc *sc) { - if ((sc->outport & KBDO_AUX_OUTFULL) == 0) - atkbdc_aux_read(sc); + if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0) { + sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; + sc->outport |= KBDO_AUX_OUTFULL; + atkbdc_assert_aux_intr(sc); + } } static void @@ -274,8 +255,7 @@ atkbdc_kbd_poll(struct atkbdc_softc *sc) { assert(pthread_mutex_isowned_np(&sc->mtx)); - if ((sc->outport & KBDO_KBD_OUTFULL) == 0) - atkbdc_kbd_read(sc); + atkbdc_kbd_read(sc); } static void @@ -290,22 +270,35 @@ atkbdc_dequeue_data(struct atkbdc_softc *sc, uint8_t *buf) { assert(pthread_mutex_isowned_np(&sc->mtx)); - if (sc->outport & KBDO_AUX_OUTFULL) { - *buf = sc->aux.buffer; - sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); - sc->outport &= ~KBDO_AUX_OUTFULL; - atkbdc_deassert_aux_intr(sc); + if (ps2mouse_read(sc->ps2mouse_sc, buf) == 0) { + if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0) { + if (sc->kbd.bcnt == 0) + sc->status &= ~(KBDS_AUX_BUFFER_FULL | + KBDS_KBD_BUFFER_FULL); + else + sc->status &= ~(KBDS_AUX_BUFFER_FULL); + sc->outport &= ~KBDO_AUX_OUTFULL; + } atkbdc_poll(sc); return; } - *buf = sc->kbd.buffer; - sc->status &= ~KBDS_KBD_BUFFER_FULL; - sc->outport &= ~KBDO_KBD_OUTFULL; - atkbdc_deassert_kbd_intr(sc); + if (sc->kbd.bcnt > 0) { + *buf = sc->kbd.buffer[sc->kbd.brd]; + sc->kbd.brd = (sc->kbd.brd + 1) % FIFOSZ; + sc->kbd.bcnt--; + if (sc->kbd.bcnt == 0) { + sc->status &= ~KBDS_KBD_BUFFER_FULL; + sc->outport &= ~KBDO_KBD_OUTFULL; + } - atkbdc_poll(sc); + atkbdc_poll(sc); + } + + if (ps2mouse_fifocnt(sc->ps2mouse_sc) == 0 && sc->kbd.bcnt == 0) { + sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + } } static int @@ -318,19 +311,22 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, if (bytes != 1) return (-1); - sc = arg; retval = 0; pthread_mutex_lock(&sc->mtx); if (in) { sc->curcmd = 0; - sc->status &= ~KBDS_CTRL_FLAG; - - /* read device buffer; includes kbd cmd responses */ - atkbdc_dequeue_data(sc, &buf); - *eax = buf; + if (sc->ctrlbyte != 0) { + *eax = sc->ctrlbyte & 
0xff; + sc->ctrlbyte = 0; + } else { + /* read device buffer; includes kbd cmd responses */ + atkbdc_dequeue_data(sc, &buf); + *eax = buf; + } + sc->status &= ~KBDS_CTRL_FLAG; pthread_mutex_unlock(&sc->mtx); return (retval); } @@ -345,29 +341,22 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, if (sc->ram[0] & KBD_SYS_FLAG_BIT) sc->status |= KBDS_SYS_FLAG; else - sc->status &= KBDS_SYS_FLAG; - if (sc->outport & KBDO_AUX_OUTFULL) - atkbdc_assert_aux_intr(sc); - else if (sc->outport & KBDO_KBD_OUTFULL) - atkbdc_assert_kbd_intr(sc); + sc->status &= ~KBDS_SYS_FLAG; break; case KBDC_WRITE_OUTPORT: sc->outport = *eax; - if (sc->outport & KBDO_AUX_OUTFULL) - sc->status |= (KBDS_AUX_BUFFER_FULL | - KBDS_KBD_BUFFER_FULL); - if (sc->outport & KBDO_KBD_OUTFULL) - sc->status |= KBDS_KBD_BUFFER_FULL; break; case KBDC_WRITE_TO_AUX: - ps2mouse_write(sc->ps2mouse_sc, *eax); + ps2mouse_write(sc->ps2mouse_sc, *eax, 0); atkbdc_poll(sc); break; case KBDC_WRITE_KBD_OUTBUF: atkbdc_kbd_queue_data(sc, *eax); break; case KBDC_WRITE_AUX_OUTBUF: - atkbdc_aux_queue_data(sc, *eax); + ps2mouse_write(sc->ps2mouse_sc, *eax, 1); + sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + atkbdc_aux_poll(sc); break; default: /* write to particular RAM byte */ @@ -398,7 +387,6 @@ atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, return (retval); } - static int atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) @@ -421,25 +409,27 @@ atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, return (retval); } + sc->curcmd = 0; sc->status |= KBDS_CTRL_FLAG; + sc->ctrlbyte = 0; switch (*eax) { case KBDC_GET_COMMAND_BYTE: - atkbdc_kbd_queue_data(sc, sc->ram[0]); + sc->ctrlbyte = CTRL_CMD_FLAG | sc->ram[0]; break; case KBDC_TEST_CTRL: - atkbdc_kbd_queue_data(sc, 0x55); + sc->ctrlbyte = CTRL_CMD_FLAG | 0x55; break; case KBDC_TEST_AUX_PORT: case KBDC_TEST_KBD_PORT: - atkbdc_kbd_queue_data(sc, 0); + sc->ctrlbyte = CTRL_CMD_FLAG | 0; break; case KBDC_READ_INPORT: - atkbdc_kbd_queue_data(sc, 0); + sc->ctrlbyte = CTRL_CMD_FLAG | 0; break; case KBDC_READ_OUTPORT: - atkbdc_kbd_queue_data(sc, sc->outport); + sc->ctrlbyte = CTRL_CMD_FLAG | sc->outport; break; case KBDC_SET_COMMAND_BYTE: case KBDC_WRITE_OUTPORT: @@ -452,6 +442,8 @@ atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, break; case KBDC_ENABLE_KBD_PORT: sc->ram[0] &= ~KBD_DISABLE_KBD_PORT; + if (sc->kbd.bcnt > 0) + sc->status |= KBDS_KBD_BUFFER_FULL; atkbdc_poll(sc); break; case KBDC_WRITE_TO_AUX: @@ -459,17 +451,19 @@ atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, break; case KBDC_DISABLE_AUX_PORT: sc->ram[0] |= KBD_DISABLE_AUX_PORT; + ps2mouse_toggle(sc->ps2mouse_sc, 0); + sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL); + sc->outport &= ~KBDS_AUX_BUFFER_FULL; break; case KBDC_ENABLE_AUX_PORT: sc->ram[0] &= ~KBD_DISABLE_AUX_PORT; + ps2mouse_toggle(sc->ps2mouse_sc, 1); + if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0) + sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; break; case KBDC_RESET: /* Pulse "reset" line */ -#ifdef __FreeBSD__ error = vm_suspend(ctx, VM_SUSPEND_RESET); assert(error == 0 || errno == EALREADY); -#else - exit(0); -#endif break; default: if (*eax >= 0x21 && *eax <= 0x3f) { @@ -477,21 +471,38 @@ atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int byten; byten = (*eax - 0x20) & 0x1f; - atkbdc_kbd_queue_data(sc, sc->ram[byten]); + sc->ctrlbyte = 
CTRL_CMD_FLAG | sc->ram[byten]; } break; } pthread_mutex_unlock(&sc->mtx); + if (sc->ctrlbyte != 0) { + sc->status |= KBDS_KBD_BUFFER_FULL; + sc->status &= ~KBDS_AUX_BUFFER_FULL; + atkbdc_assert_kbd_intr(sc); + } else if (ps2mouse_fifocnt(sc->ps2mouse_sc) > 0 && + (sc->ram[0] & KBD_DISABLE_AUX_PORT) == 0) { + sc->status |= KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL; + atkbdc_assert_aux_intr(sc); + } else if (sc->kbd.bcnt > 0 && (sc->ram[0] & KBD_DISABLE_KBD_PORT) == 0) { + sc->status |= KBDS_KBD_BUFFER_FULL; + atkbdc_assert_kbd_intr(sc); + } + return (retval); } void -atkbdc_event(struct atkbdc_softc *sc) +atkbdc_event(struct atkbdc_softc *sc, int iskbd) { pthread_mutex_lock(&sc->mtx); - atkbdc_poll(sc); + + if (iskbd) + atkbdc_kbd_poll(sc); + else + atkbdc_aux_poll(sc); pthread_mutex_unlock(&sc->mtx); } @@ -539,7 +550,6 @@ atkbdc_init(struct vmctx *ctx) sc->ps2mouse_sc = ps2mouse_init(sc); } -#ifdef __FreeBSD__ static void atkbdc_dsdt(void) { @@ -573,4 +583,4 @@ atkbdc_dsdt(void) dsdt_line("}"); } LPC_DSDT(atkbdc_dsdt); -#endif + diff --git a/usr/src/cmd/bhyve/atkbdc.h b/usr/src/cmd/bhyve/atkbdc.h index 48b3a8b00c..85c8a7141e 100644 --- a/usr/src/cmd/bhyve/atkbdc.h +++ b/usr/src/cmd/bhyve/atkbdc.h @@ -33,6 +33,6 @@ struct atkbdc_softc; struct vmctx; void atkbdc_init(struct vmctx *ctx); -void atkbdc_event(struct atkbdc_softc *sc); +void atkbdc_event(struct atkbdc_softc *sc, int iskbd); #endif /* _ATKBDC_H_ */ diff --git a/usr/src/cmd/bhyve/bhyve_sol_glue.c b/usr/src/cmd/bhyve/bhyve_sol_glue.c index 633faacc5f..7b24ea7f5d 100644 --- a/usr/src/cmd/bhyve/bhyve_sol_glue.c +++ b/usr/src/cmd/bhyve/bhyve_sol_glue.c @@ -11,6 +11,7 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include <sys/uio.h> @@ -25,62 +26,14 @@ void cfmakeraw(struct termios *t) { - t->c_iflag &= ~(IMAXBEL|IXOFF|INPCK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR|ICRNL|IXON|IGNPAR); + t->c_iflag &= ~(IMAXBEL|IXOFF|INPCK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR| + ICRNL|IXON|IGNPAR); t->c_iflag |= IGNBRK; t->c_oflag &= ~OPOST; - t->c_lflag &= ~(ECHO|ECHOE|ECHOK|ECHONL|ICANON|ISIG|IEXTEN|NOFLSH|TOSTOP |PENDIN); + t->c_lflag &= ~(ECHO|ECHOE|ECHOK|ECHONL|ICANON|ISIG|IEXTEN|NOFLSH| + TOSTOP|PENDIN); t->c_cflag &= ~(CSIZE|PARENB); t->c_cflag |= CS8|CREAD; t->c_cc[VMIN] = 1; t->c_cc[VTIME] = 0; } - -ssize_t -preadv(int d, const struct iovec *iov, int iovcnt, off_t offset) -{ - off_t old_offset; - ssize_t n; - - old_offset = lseek(d, (off_t)0, SEEK_CUR); - if (old_offset == -1) - return (-1); - - offset = lseek(d, offset, SEEK_SET); - if (offset == -1) - return (-1); - - n = readv(d, iov, iovcnt); - if (n == -1) - return (-1); - - offset = lseek(d, old_offset, SEEK_SET); - if (offset == -1) - return (-1); - - return (n); -} - -ssize_t -pwritev(int d, const struct iovec *iov, int iovcnt, off_t offset) -{ - off_t old_offset; - ssize_t n; - - old_offset = lseek(d, (off_t)0, SEEK_CUR); - if (old_offset == -1) - return (-1); - - offset = lseek(d, offset, SEEK_SET); - if (offset == -1) - return (-1); - - n = writev(d, iov, iovcnt); - if (n == -1) - return (-1); - - offset = lseek(d, old_offset, SEEK_SET); - if (offset == -1) - return (-1); - - return (n); -} diff --git a/usr/src/cmd/bhyve/bhyvegc.c b/usr/src/cmd/bhyve/bhyvegc.c index 7a13c4c83f..4bd49ded79 100644 --- a/usr/src/cmd/bhyve/bhyvegc.c +++ b/usr/src/cmd/bhyve/bhyvegc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. 
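After this change, bhyve_sol_glue.c carries only cfmakeraw(); the preadv()/pwritev() emulations were dropped, presumably because native implementations became available on the host. A typical cfmakeraw() usage sketch (standard termios calls, not code from the patch): save the terminal state, flip to raw mode, and restore on exit.

#include <termios.h>
#include <unistd.h>
#include <stdlib.h>

static struct termios saved;

static void
restore_tty(void)
{
	(void) tcsetattr(STDIN_FILENO, TCSAFLUSH, &saved);
}

int
tty_raw(void)
{
	struct termios t;

	if (tcgetattr(STDIN_FILENO, &saved) != 0)
		return (-1);
	t = saved;
	cfmakeraw(&t);		/* no echo, no canonical processing, 8-bit chars */
	atexit(restore_tty);
	return (tcsetattr(STDIN_FILENO, TCSAFLUSH, &t));
}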
* @@ -37,10 +39,11 @@ __FBSDID("$FreeBSD$"); struct bhyvegc { struct bhyvegc_image *gc_image; + int raw; }; struct bhyvegc * -bhyvegc_init(int width, int height) +bhyvegc_init(int width, int height, void *fbaddr) { struct bhyvegc *gc; struct bhyvegc_image *gc_image; @@ -50,7 +53,13 @@ bhyvegc_init(int width, int height) gc_image = calloc(1, sizeof(struct bhyvegc_image)); gc_image->width = width; gc_image->height = height; - gc_image->data = calloc(width * height, sizeof (uint32_t)); + if (fbaddr) { + gc_image->data = fbaddr; + gc->raw = 1; + } else { + gc_image->data = calloc(width * height, sizeof (uint32_t)); + gc->raw = 0; + } gc->gc_image = gc_image; @@ -58,6 +67,15 @@ bhyvegc_init(int width, int height) } void +bhyvegc_set_fbaddr(struct bhyvegc *gc, void *fbaddr) +{ + gc->raw = 1; + if (gc->gc_image->data && gc->gc_image->data != fbaddr) + free(gc->gc_image->data); + gc->gc_image->data = fbaddr; +} + +void bhyvegc_resize(struct bhyvegc *gc, int width, int height) { struct bhyvegc_image *gc_image; @@ -66,13 +84,20 @@ bhyvegc_resize(struct bhyvegc *gc, int width, int height) gc_image->width = width; gc_image->height = height; - gc_image->data = realloc(gc_image->data, - sizeof (uint32_t) * width * height); - memset(gc_image->data, 0, width * height * sizeof (uint32_t)); + if (!gc->raw) { + gc_image->data = reallocarray(gc_image->data, width * height, + sizeof (uint32_t)); + if (gc_image->data != NULL) + memset(gc_image->data, 0, width * height * + sizeof (uint32_t)); + } } struct bhyvegc_image * bhyvegc_get_image(struct bhyvegc *gc) { + if (gc == NULL) + return (NULL); + return (gc->gc_image); } diff --git a/usr/src/cmd/bhyve/bhyvegc.h b/usr/src/cmd/bhyve/bhyvegc.h index 19648f98af..11323586df 100644 --- a/usr/src/cmd/bhyve/bhyvegc.h +++ b/usr/src/cmd/bhyve/bhyvegc.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * @@ -32,12 +34,14 @@ struct bhyvegc; struct bhyvegc_image { + int vgamode; int width; int height; uint32_t *data; }; -struct bhyvegc *bhyvegc_init(int width, int height); +struct bhyvegc *bhyvegc_init(int width, int height, void *fbaddr); +void bhyvegc_set_fbaddr(struct bhyvegc *gc, void *fbaddr); void bhyvegc_resize(struct bhyvegc *gc, int width, int height); struct bhyvegc_image *bhyvegc_get_image(struct bhyvegc *gc); diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c index b985a2286e..928d2dc811 100644 --- a/usr/src/cmd/bhyve/bhyverun.c +++ b/usr/src/cmd/bhyve/bhyverun.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,30 +38,50 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
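The bhyvegc changes above introduce a raw mode: when a framebuffer address is supplied, gc_image->data aliases guest memory and must be neither freed nor resized; otherwise the console owns a calloc()'d buffer that bhyvegc_resize() grows with reallocarray(), the BSD/illumos libc extension the patch itself uses. A sketch of that ownership rule, with illustrative names:

/* Sketch of the raw-vs-owned buffer rule in bhyvegc. */
#include <stdlib.h>
#include <string.h>
#include <stdint.h>

struct surface {
	uint32_t	*data;
	int		w, h;
	int		raw;	/* 1: data aliases guest memory; never freed or resized */
};

static void
surface_resize(struct surface *s, int w, int h)
{
	s->w = w;
	s->h = h;
	if (!s->raw) {
		/* reallocarray() checks the w * h multiplication for overflow */
		s->data = reallocarray(s->data, (size_t)w * h,
		    sizeof (uint32_t));
		if (s->data != NULL)
			memset(s->data, 0,
			    (size_t)w * h * sizeof (uint32_t));
	}
}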
*/ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/types.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif #include <sys/mman.h> #include <sys/time.h> +#include <sys/cpuset.h> + +#ifdef __FreeBSD__ +#include <amd64/vmm/intel/vmcs.h> +#else +#include <intel/vmcs.h> +#endif +#include <machine/atomic.h> #include <machine/segments.h> +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif #include <stdio.h> #include <stdlib.h> #include <string.h> #include <err.h> +#include <errno.h> #include <libgen.h> #include <unistd.h> #include <assert.h> -#include <errno.h> #include <pthread.h> #include <pthread_np.h> #include <sysexits.h> +#include <stdbool.h> +#include <stdint.h> #include <machine/vmm.h> +#ifndef WITHOUT_CAPSICUM +#include <machine/vmm_dev.h> +#endif #include <vmmapi.h> #include "bhyverun.h" @@ -68,11 +90,11 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z n #include "console.h" #include "inout.h" #include "dbgport.h" +#include "fwctl.h" +#include "gdb.h" #include "ioapic.h" #include "mem.h" -#ifdef __FreeBSD__ #include "mevent.h" -#endif #include "mptbl.h" #include "pci_emul.h" #include "pci_irq.h" @@ -89,11 +111,81 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z n #define MB (1024UL * 1024) #define GB (1024UL * MB) +static const char * const vmx_exit_reason_desc[] = { + [EXIT_REASON_EXCEPTION] = "Exception or non-maskable interrupt (NMI)", + [EXIT_REASON_EXT_INTR] = "External interrupt", + [EXIT_REASON_TRIPLE_FAULT] = "Triple fault", + [EXIT_REASON_INIT] = "INIT signal", + [EXIT_REASON_SIPI] = "Start-up IPI (SIPI)", + [EXIT_REASON_IO_SMI] = "I/O system-management interrupt (SMI)", + [EXIT_REASON_SMI] = "Other SMI", + [EXIT_REASON_INTR_WINDOW] = "Interrupt window", + [EXIT_REASON_NMI_WINDOW] = "NMI window", + [EXIT_REASON_TASK_SWITCH] = "Task switch", + [EXIT_REASON_CPUID] = "CPUID", + [EXIT_REASON_GETSEC] = "GETSEC", + [EXIT_REASON_HLT] = "HLT", + [EXIT_REASON_INVD] = "INVD", + [EXIT_REASON_INVLPG] = "INVLPG", + [EXIT_REASON_RDPMC] = "RDPMC", + [EXIT_REASON_RDTSC] = "RDTSC", + [EXIT_REASON_RSM] = "RSM", + [EXIT_REASON_VMCALL] = "VMCALL", + [EXIT_REASON_VMCLEAR] = "VMCLEAR", + [EXIT_REASON_VMLAUNCH] = "VMLAUNCH", + [EXIT_REASON_VMPTRLD] = "VMPTRLD", + [EXIT_REASON_VMPTRST] = "VMPTRST", + [EXIT_REASON_VMREAD] = "VMREAD", + [EXIT_REASON_VMRESUME] = "VMRESUME", + [EXIT_REASON_VMWRITE] = "VMWRITE", + [EXIT_REASON_VMXOFF] = "VMXOFF", + [EXIT_REASON_VMXON] = "VMXON", + [EXIT_REASON_CR_ACCESS] = "Control-register accesses", + [EXIT_REASON_DR_ACCESS] = "MOV DR", + [EXIT_REASON_INOUT] = "I/O instruction", + [EXIT_REASON_RDMSR] = "RDMSR", + [EXIT_REASON_WRMSR] = "WRMSR", + [EXIT_REASON_INVAL_VMCS] = + "VM-entry failure due to invalid guest state", + [EXIT_REASON_INVAL_MSR] = "VM-entry failure due to MSR loading", + [EXIT_REASON_MWAIT] = "MWAIT", + [EXIT_REASON_MTF] = "Monitor trap flag", + [EXIT_REASON_MONITOR] = "MONITOR", + [EXIT_REASON_PAUSE] = "PAUSE", + [EXIT_REASON_MCE_DURING_ENTRY] = + "VM-entry failure due to machine-check event", + [EXIT_REASON_TPR] = "TPR below threshold", + [EXIT_REASON_APIC_ACCESS] = "APIC access", + [EXIT_REASON_VIRTUALIZED_EOI] = "Virtualized EOI", + [EXIT_REASON_GDTR_IDTR] = "Access to GDTR or IDTR", + [EXIT_REASON_LDTR_TR] = "Access to LDTR or TR", + [EXIT_REASON_EPT_FAULT] = "EPT violation", + [EXIT_REASON_EPT_MISCONFIG] = "EPT misconfiguration", + [EXIT_REASON_INVEPT] = 
"INVEPT", + [EXIT_REASON_RDTSCP] = "RDTSCP", + [EXIT_REASON_VMX_PREEMPT] = "VMX-preemption timer expired", + [EXIT_REASON_INVVPID] = "INVVPID", + [EXIT_REASON_WBINVD] = "WBINVD", + [EXIT_REASON_XSETBV] = "XSETBV", + [EXIT_REASON_APIC_WRITE] = "APIC write", + [EXIT_REASON_RDRAND] = "RDRAND", + [EXIT_REASON_INVPCID] = "INVPCID", + [EXIT_REASON_VMFUNC] = "VMFUNC", + [EXIT_REASON_ENCLS] = "ENCLS", + [EXIT_REASON_RDSEED] = "RDSEED", + [EXIT_REASON_PM_LOG_FULL] = "Page-modification log full", + [EXIT_REASON_XSAVES] = "XSAVES", + [EXIT_REASON_XRSTORS] = "XRSTORS" +}; + typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu); +extern int vmexit_task_switch(struct vmctx *, struct vm_exit *, int *vcpu); char *vmname; int guest_ncpus; +uint16_t cores, maxcpus, sockets, threads; + char *guest_uuid_str; static int guest_vmexit_on_hlt, guest_vmexit_on_pause; @@ -103,9 +195,7 @@ static int x2apic_mode = 0; /* default is xAPIC */ static int strictio; static int strictmsr = 1; -#ifdef __FreeBSD__ static int acpi; -#endif static char *progname; static const int BSP = 0; @@ -124,15 +214,14 @@ static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip); static struct vm_exit vmexit[VM_MAXCPU]; struct bhyvestats { - uint64_t vmexit_bogus; - uint64_t vmexit_bogus_switch; - uint64_t vmexit_hlt; - uint64_t vmexit_pause; - uint64_t vmexit_mtrap; - uint64_t vmexit_inst_emul; - uint64_t cpu_switch_rotate; - uint64_t cpu_switch_direct; - int io_reset; + uint64_t vmexit_bogus; + uint64_t vmexit_reqidle; + uint64_t vmexit_hlt; + uint64_t vmexit_pause; + uint64_t vmexit_mtrap; + uint64_t vmexit_inst_emul; + uint64_t cpu_switch_rotate; + uint64_t cpu_switch_direct; } stats; struct mt_vmm_info { @@ -141,55 +230,211 @@ struct mt_vmm_info { int mt_vcpu; } mt_vmm_info[VM_MAXCPU]; +#ifdef __FreeBSD__ +static cpuset_t *vcpumap[VM_MAXCPU] = { NULL }; +#endif + static void usage(int code) { -#ifdef __FreeBSD__ fprintf(stderr, - "Usage: %s [-aehwAHIPW] [-g <gdb port>] [-s <pci>] [-c vcpus]\n" - " %*s [-p vcpu:hostcpu] [-m mem] [-l <lpc>] <vm>\n" + "Usage: %s [-abehuwxACHPSWY]\n" + " %*s [-c [[cpus=]numcpus][,sockets=n][,cores=n][,threads=n]]\n" + " %*s [-g <gdb port>] [-l <lpc>]\n" +#ifdef __FreeBSD__ + " %*s [-m mem] [-p vcpu:hostcpu] [-s <pci>] [-U uuid] <vm>\n" +#else + " %*s [-m mem] [-s <pci>] [-U uuid] <vm>\n" +#endif " -a: local apic is in xAPIC mode (deprecated)\n" - " -A: create an ACPI table\n" - " -g: gdb port\n" - " -c: # cpus (default 1)\n" + " -A: create ACPI tables\n" + " -c: number of cpus and/or topology specification\n" " -C: include guest memory in core file\n" - " -p: pin 'vcpu' to 'hostcpu'\n" - " -H: vmexit from the guest on hlt\n" - " -P: vmexit from the guest on pause\n" - " -W: force virtio to use single-vector MSI\n" " -e: exit on unhandled I/O access\n" + " -g: gdb port\n" " -h: help\n" - " -s: <slot,driver,configinfo> PCI slot config\n" - " -l: LPC device configuration\n" - " -m: memory size in MB\n" - " -w: ignore unimplemented MSRs\n" - " -x: local apic is in x2APIC mode\n" - " -Y: disable MPtable generation\n" - " -U: uuid\n", - progname, (int)strlen(progname), ""); -#else - fprintf(stderr, - "Usage: %s [-ehwHPW] [-s <pci>] [-c vcpus]\n" - " %*s [-p vcpu:hostcpu] [-m mem] [-l <lpc>] <vm>\n" - " -c: # cpus (default 1)\n" " -H: vmexit from the guest on hlt\n" + " -l: LPC device configuration\n" + " -m: memory size\n" +#ifdef __FreeBSD__ + " -p: pin 'vcpu' to 'hostcpu'\n" +#endif " -P: vmexit from the guest on pause\n" - " -W: force virtio to use single-vector MSI\n" - " -e: 
exit on unhandled I/O access\n" - " -h: help\n" " -s: <slot,driver,configinfo> PCI slot config\n" - " -l: LPC device configuration\n" - " -m: memory size in MB\n" + " -S: guest memory cannot be swapped\n" + " -u: RTC keeps UTC time\n" + " -U: uuid\n" " -w: ignore unimplemented MSRs\n" - " -Y: disable MPtable generation\n" - " -U: uuid\n", - progname, (int)strlen(progname), ""); -#endif + " -W: force virtio to use single-vector MSI\n" + " -x: local apic is in x2APIC mode\n" + " -Y: disable MPtable generation\n", + progname, (int)strlen(progname), "", (int)strlen(progname), "", + (int)strlen(progname), ""); exit(code); } +/* + * XXX This parser is known to have the following issues: + * 1. It accepts null key=value tokens ",,". + * 2. It accepts whitespace after = and before value. + * 3. Values out of range of INT are silently wrapped. + * 4. It doesn't check non-final values. + * 5. The apparently bogus limits of UINT16_MAX are for future expansion. + * + * The acceptance of a null specification ('-c ""') is by design to match the + * manual page syntax specification, this results in a topology of 1 vCPU. + */ +static int +topology_parse(const char *opt) +{ + uint64_t ncpus; + int c, chk, n, s, t, tmp; + char *cp, *str; + bool ns, scts; + + c = 1, n = 1, s = 1, t = 1; + ns = false, scts = false; + str = strdup(opt); + if (str == NULL) + goto out; + + while ((cp = strsep(&str, ",")) != NULL) { + if (sscanf(cp, "%i%n", &tmp, &chk) == 1) { + n = tmp; + ns = true; + } else if (sscanf(cp, "cpus=%i%n", &tmp, &chk) == 1) { + n = tmp; + ns = true; + } else if (sscanf(cp, "sockets=%i%n", &tmp, &chk) == 1) { + s = tmp; + scts = true; + } else if (sscanf(cp, "cores=%i%n", &tmp, &chk) == 1) { + c = tmp; + scts = true; + } else if (sscanf(cp, "threads=%i%n", &tmp, &chk) == 1) { + t = tmp; + scts = true; +#ifdef notyet /* Do not expose this until vmm.ko implements it */ + } else if (sscanf(cp, "maxcpus=%i%n", &tmp, &chk) == 1) { + m = tmp; +#endif + /* Skip the empty argument case from -c "" */ + } else if (cp[0] == '\0') + continue; + else + goto out; + /* Any trailing garbage causes an error */ + if (cp[chk] != '\0') + goto out; + } + free(str); + str = NULL; + + /* + * Range check 1 <= n <= UINT16_MAX all values + */ + if (n < 1 || s < 1 || c < 1 || t < 1 || + n > UINT16_MAX || s > UINT16_MAX || c > UINT16_MAX || + t > UINT16_MAX) + return (-1); + + /* If only the cpus was specified, use that as sockets */ + if (!scts) + s = n; + /* + * Compute sockets * cores * threads avoiding overflow + * The range check above insures these are 16 bit values + * If n was specified check it against computed ncpus + */ + ncpus = (uint64_t)s * c * t; + if (ncpus > UINT16_MAX || (ns && n != ncpus)) + return (-1); + + guest_ncpus = ncpus; + sockets = s; + cores = c; + threads = t; + return(0); + +out: + free(str); + return (-1); +} + +#ifndef WITHOUT_CAPSICUM +/* + * 11-stable capsicum helpers + */ +static void +bhyve_caph_cache_catpages(void) +{ + + (void)catopen("libc", NL_CAT_LOCALE); +} + +static int +bhyve_caph_limit_stdoe(void) +{ + cap_rights_t rights; + unsigned long cmds[] = { TIOCGETA, TIOCGWINSZ }; + int i, fds[] = { STDOUT_FILENO, STDERR_FILENO }; + + cap_rights_init(&rights, CAP_FCNTL, CAP_FSTAT, CAP_IOCTL); + cap_rights_set(&rights, CAP_WRITE); + + for (i = 0; i < nitems(fds); i++) { + if (cap_rights_limit(fds[i], &rights) < 0 && errno != ENOSYS) + return (-1); + + if (cap_ioctls_limit(fds[i], cmds, nitems(cmds)) < 0 && errno != ENOSYS) + return (-1); + + if (cap_fcntls_limit(fds[i], CAP_FCNTL_GETFL) < 0 
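topology_parse() above leans on sscanf's %n conversion: %n stores how many input characters matched so far without counting as a conversion, so a token with trailing garbage can be rejected by checking that the next character is the terminator. A standalone demo of the idiom (hypothetical helper, not the patch's code):

/* Sketch of the sscanf("%i%n") trailing-garbage check. */
#include <stdio.h>

static int
parse_sockets(const char *tok, int *val)
{
	int n, chk;

	if (sscanf(tok, "sockets=%i%n", &n, &chk) == 1 &&
	    tok[chk] == '\0') {	/* %n rejects e.g. "sockets=2x" */
		*val = n;
		return (0);
	}
	return (-1);
}

int
main(void)
{
	int s;

	printf("%d\n", parse_sockets("sockets=2", &s));		/* 0, s == 2 */
	printf("%d\n", parse_sockets("sockets=2x", &s));	/* -1 */
	return (0);
}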
&& errno != ENOSYS) + return (-1); + } + + return (0); +} + +#endif + +#ifdef __FreeBSD__ +static int +pincpu_parse(const char *opt) +{ + int vcpu, pcpu; + + if (sscanf(opt, "%d:%d", &vcpu, &pcpu) != 2) { + fprintf(stderr, "invalid format: %s\n", opt); + return (-1); + } + + if (vcpu < 0 || vcpu >= VM_MAXCPU) { + fprintf(stderr, "vcpu '%d' outside valid range from 0 to %d\n", + vcpu, VM_MAXCPU - 1); + return (-1); + } + + if (pcpu < 0 || pcpu >= CPU_SETSIZE) { + fprintf(stderr, "hostcpu '%d' outside valid range from " + "0 to %d\n", pcpu, CPU_SETSIZE - 1); + return (-1); + } + + if (vcpumap[vcpu] == NULL) { + if ((vcpumap[vcpu] = malloc(sizeof(cpuset_t))) == NULL) { + perror("malloc"); + return (-1); + } + CPU_ZERO(vcpumap[vcpu]); + } + CPU_SET(pcpu, vcpumap[vcpu]); + return (0); +} +#endif + void vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid, int errcode) @@ -246,6 +491,8 @@ fbsdrun_start_thread(void *param) snprintf(tname, sizeof(tname), "vcpu %d", vcpu); pthread_set_name_np(mtp->mt_thr, tname); + gdb_cpu_add(vcpu); + vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip); /* not reached */ @@ -267,7 +514,8 @@ fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) * with vm_suspend(). */ error = vm_activate_cpu(ctx, newcpu); - assert(error == 0); + if (error != 0) + err(EX_OSERR, "could not activate CPU %d", newcpu); CPU_SET_ATOMIC(newcpu, &cpumask); @@ -287,6 +535,19 @@ fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip) } static int +fbsdrun_deletecpu(struct vmctx *ctx, int vcpu) +{ + + if (!CPU_ISSET(vcpu, &cpumask)) { + fprintf(stderr, "Attempting to delete unknown cpu %d\n", vcpu); + exit(4); + } + + CPU_CLR_ATOMIC(vcpu, &cpumask); + return (CPU_EMPTY(&cpumask)); +} + +static int vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, uint32_t eax) { @@ -295,21 +556,20 @@ vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu, * put guest-driven debug here */ #endif - return (VMEXIT_CONTINUE); + return (VMEXIT_CONTINUE); } static int vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { int error; - int bytes, port, in, out, string; + int bytes, port, in, out; int vcpu; vcpu = *pvcpu; port = vme->u.inout.port; bytes = vme->u.inout.bytes; - string = vme->u.inout.string; in = vme->u.inout.in; out = !in; @@ -380,13 +640,29 @@ vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) static int vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu) { - int newcpu; - int retval = VMEXIT_CONTINUE; - newcpu = spinup_ap(ctx, *pvcpu, - vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); + (void)spinup_ap(ctx, *pvcpu, + vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip); + + return (VMEXIT_CONTINUE); +} + +#define DEBUG_EPT_MISCONFIG +#ifdef DEBUG_EPT_MISCONFIG +#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400 + +static uint64_t ept_misconfig_gpa, ept_misconfig_pte[4]; +static int ept_misconfig_ptenum; +#endif + +static const char * +vmexit_vmx_desc(uint32_t exit_reason) +{ - return (retval); + if (exit_reason >= nitems(vmx_exit_reason_desc) || + vmx_exit_reason_desc[exit_reason] == NULL) + return ("Unknown"); + return (vmx_exit_reason_desc[exit_reason]); } static int @@ -398,12 +674,41 @@ vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status); - fprintf(stderr, "\texit_reason\t%u\n", 
vmexit->u.vmx.exit_reason); + fprintf(stderr, "\texit_reason\t%u (%s)\n", vmexit->u.vmx.exit_reason, + vmexit_vmx_desc(vmexit->u.vmx.exit_reason)); fprintf(stderr, "\tqualification\t0x%016lx\n", vmexit->u.vmx.exit_qualification); fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); +#ifdef DEBUG_EPT_MISCONFIG + if (vmexit->u.vmx.exit_reason == EXIT_REASON_EPT_MISCONFIG) { + vm_get_register(ctx, *pvcpu, + VMCS_IDENT(VMCS_GUEST_PHYSICAL_ADDRESS), + &ept_misconfig_gpa); + vm_get_gpa_pmap(ctx, ept_misconfig_gpa, ept_misconfig_pte, + &ept_misconfig_ptenum); + fprintf(stderr, "\tEPT misconfiguration:\n"); + fprintf(stderr, "\t\tGPA: %#lx\n", ept_misconfig_gpa); + fprintf(stderr, "\t\tPTE(%d): %#lx %#lx %#lx %#lx\n", + ept_misconfig_ptenum, ept_misconfig_pte[0], + ept_misconfig_pte[1], ept_misconfig_pte[2], + ept_misconfig_pte[3]); + } +#endif /* DEBUG_EPT_MISCONFIG */ + return (VMEXIT_ABORT); +} +static int +vmexit_svm(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + fprintf(stderr, "vm exit[%d]\n", *pvcpu); + fprintf(stderr, "\treason\t\tSVM\n"); + fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip); + fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length); + fprintf(stderr, "\texitcode\t%#lx\n", vmexit->u.svm.exitcode); + fprintf(stderr, "\texitinfo1\t%#lx\n", vmexit->u.svm.exitinfo1); + fprintf(stderr, "\texitinfo2\t%#lx\n", vmexit->u.svm.exitinfo2); return (VMEXIT_ABORT); } @@ -411,12 +716,25 @@ static int vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { + assert(vmexit->inst_length == 0); + stats.vmexit_bogus++; return (VMEXIT_CONTINUE); } static int +vmexit_reqidle(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + assert(vmexit->inst_length == 0); + + stats.vmexit_reqidle++; + + return (VMEXIT_CONTINUE); +} + +static int vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { @@ -443,8 +761,12 @@ static int vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) { + assert(vmexit->inst_length == 0); + stats.vmexit_mtrap++; + gdb_cpu_mtrap(*pvcpu); + return (VMEXIT_CONTINUE); } @@ -478,35 +800,88 @@ vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) return (VMEXIT_CONTINUE); } +static pthread_mutex_t resetcpu_mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t resetcpu_cond = PTHREAD_COND_INITIALIZER; + +static int +vmexit_suspend(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + enum vm_suspend_how how; + + how = vmexit->u.suspended.how; + + fbsdrun_deletecpu(ctx, *pvcpu); + + if (*pvcpu != BSP) { + pthread_mutex_lock(&resetcpu_mtx); + pthread_cond_signal(&resetcpu_cond); + pthread_mutex_unlock(&resetcpu_mtx); + pthread_exit(NULL); + } + + pthread_mutex_lock(&resetcpu_mtx); + while (!CPU_EMPTY(&cpumask)) { + pthread_cond_wait(&resetcpu_cond, &resetcpu_mtx); + } + pthread_mutex_unlock(&resetcpu_mtx); + + switch (how) { + case VM_SUSPEND_RESET: + exit(0); + case VM_SUSPEND_POWEROFF: + exit(1); + case VM_SUSPEND_HALT: + exit(2); + case VM_SUSPEND_TRIPLEFAULT: + exit(3); + default: + fprintf(stderr, "vmexit_suspend: invalid reason %d\n", how); + exit(100); + } + return (0); /* NOTREACHED */ +} + +static int +vmexit_debug(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + + gdb_cpu_suspend(*pvcpu); + return (VMEXIT_CONTINUE); +} + static vmexit_handler_t handler[VM_EXITCODE_MAX] = { [VM_EXITCODE_INOUT] = vmexit_inout, + [VM_EXITCODE_INOUT_STR] = vmexit_inout, [VM_EXITCODE_VMX] = vmexit_vmx, + 
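In vmexit_suspend() above, every non-BSP vCPU thread signals a condition variable and exits, while the BSP waits for the active-CPU mask to drain and then turns the suspend reason into a process exit status. A reduced pthread sketch of that rendezvous; a simple counter stands in for the cpumask bookkeeping, and the real code maps each VM_SUSPEND_* reason to a distinct status:

/* Sketch of the BSP/AP shutdown rendezvous. */
#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
static int ncpus;		/* set to the vCPU count at startup */

void
cpu_suspend(int vcpu, int bsp, int status)
{
	pthread_mutex_lock(&mtx);
	ncpus--;
	if (vcpu != bsp) {
		pthread_cond_signal(&cv);
		pthread_mutex_unlock(&mtx);
		pthread_exit(NULL);	/* APs just leave */
	}
	while (ncpus > 0)		/* BSP waits for the APs */
		pthread_cond_wait(&cv, &mtx);
	pthread_mutex_unlock(&mtx);
	exit(status);			/* suspend reason becomes the exit code */
}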
[VM_EXITCODE_SVM] = vmexit_svm, [VM_EXITCODE_BOGUS] = vmexit_bogus, + [VM_EXITCODE_REQIDLE] = vmexit_reqidle, [VM_EXITCODE_RDMSR] = vmexit_rdmsr, [VM_EXITCODE_WRMSR] = vmexit_wrmsr, [VM_EXITCODE_MTRAP] = vmexit_mtrap, [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul, [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap, + [VM_EXITCODE_SUSPENDED] = vmexit_suspend, + [VM_EXITCODE_TASK_SWITCH] = vmexit_task_switch, + [VM_EXITCODE_DEBUG] = vmexit_debug, }; static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) { -#ifdef __FreeBSD__ - cpuset_t mask; -#endif - int error, rc, prevcpu; + int error, rc; enum vm_exitcode exitcode; + cpuset_t active_cpus; #ifdef __FreeBSD__ - if (pincpu >= 0) { - CPU_ZERO(&mask); - CPU_SET(pincpu + vcpu, &mask); + if (vcpumap[vcpu] != NULL) { error = pthread_setaffinity_np(pthread_self(), - sizeof(mask), &mask); + sizeof(cpuset_t), vcpumap[vcpu]); assert(error == 0); } #endif + error = vm_active_cpus(ctx, &active_cpus); + assert(CPU_ISSET(vcpu, &active_cpus)); error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip); assert(error == 0); @@ -516,16 +891,14 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) if (error != 0) break; - prevcpu = vcpu; - exitcode = vmexit[vcpu].exitcode; if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) { fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n", exitcode); - exit(1); + exit(4); } - rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); + rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu); switch (rc) { case VMEXIT_CONTINUE: @@ -533,7 +906,7 @@ vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip) case VMEXIT_ABORT: abort(); default: - exit(1); + exit(4); } } fprintf(stderr, "vm_run error %d, errno %d\n", error, errno); @@ -565,7 +938,7 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp); if (err < 0) { fprintf(stderr, "VM exit on HLT not supported\n"); - exit(1); + exit(4); } vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1); if (cpu == BSP) @@ -580,7 +953,7 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) if (err < 0) { fprintf(stderr, "SMP mux requested, no pause support\n"); - exit(1); + exit(4); } vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1); if (cpu == BSP) @@ -594,7 +967,7 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) if (err) { fprintf(stderr, "Unable to set x2apic state (%d)\n", err); - exit(1); + exit(4); } #ifdef __FreeBSD__ @@ -602,70 +975,175 @@ fbsdrun_set_capabilities(struct vmctx *ctx, int cpu) #endif } +static struct vmctx * +do_open(const char *vmname) +{ + struct vmctx *ctx; + int error; + bool reinit, romboot; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + const cap_ioctl_t *cmds; + size_t ncmds; +#endif + + reinit = romboot = false; + + if (lpc_bootrom()) + romboot = true; + + error = vm_create(vmname); + if (error) { + if (errno == EEXIST) { + if (romboot) { + reinit = true; + } else { + /* + * The virtual machine has been setup by the + * userspace bootloader. + */ + } + } else { + perror("vm_create"); + exit(4); + } + } else { + if (!romboot) { + /* + * If the virtual machine was just created then a + * bootrom must be configured to boot it. 
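The handler[] table above maps exit codes to functions through designated initializers, so unhandled codes are left NULL, and vm_loop() treats an out-of-range or NULL slot as fatal. The same pattern in miniature, with illustrative names:

/* Sketch of the designated-initializer dispatch used by vm_loop(). */
#include <stdio.h>
#include <stdlib.h>

enum code { C_INOUT, C_HLT, C_MAX };
typedef int (*handler_t)(void);

static int h_inout(void) { return (0); }

static const handler_t handlers[C_MAX] = {
	[C_INOUT] = h_inout,	/* C_HLT stays NULL */
};

static int
dispatch(enum code c)
{
	if (c >= C_MAX || handlers[c] == NULL) {
		fprintf(stderr, "unexpected exitcode 0x%x\n", c);
		exit(4);
	}
	return (handlers[c]());
}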
+ */ + fprintf(stderr, "virtual machine cannot be booted\n"); + exit(4); + } + } + + ctx = vm_open(vmname); + if (ctx == NULL) { + perror("vm_open"); + exit(4); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_IOCTL, CAP_MMAP_RW); + if (caph_rights_limit(vm_get_device_fd(ctx), &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + vm_get_ioctls(&ncmds); + cmds = vm_get_ioctls(NULL); + if (cmds == NULL) + errx(EX_OSERR, "out of memory"); + if (caph_ioctls_limit(vm_get_device_fd(ctx), cmds, ncmds) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + free((cap_ioctl_t *)cmds); +#endif + + if (reinit) { + error = vm_reinit(ctx); + if (error) { + perror("vm_reinit"); + exit(4); + } + } + error = vm_set_topology(ctx, sockets, cores, threads, maxcpus); + if (error) + errx(EX_OSERR, "vm_set_topology"); + return (ctx); +} + int main(int argc, char *argv[]) { - int c, error, gdb_port, rfb_port, err, bvmcons; - int max_vcpus; + int c, error, dbg_port, gdb_port, err, bvmcons; + int max_vcpus, mptgen, memflags; + int rtc_localtime; + bool gdb_stop; struct vmctx *ctx; uint64_t rip; size_t memsize; + char *optstr; bvmcons = 0; progname = basename(argv[0]); + dbg_port = 0; gdb_port = 0; - rfb_port = -1; + gdb_stop = false; guest_ncpus = 1; + sockets = cores = threads = 1; + maxcpus = 0; memsize = 256 * MB; - + mptgen = 1; + rtc_localtime = 1; + memflags = 0; #ifdef __FreeBSD__ - while ((c = getopt(argc, argv, "abehwxACHIPWYp:r:g:c:s:m:l:U:")) != -1) { + optstr = "abehuwxACHIPSWYp:g:G:c:s:m:l:B:U:"; #else - while ((c = getopt(argc, argv, "abehwxHIPWYr:c:s:m:l:U:")) != -1) { + optstr = "abehuwxACHIPSWYg:G:c:s:m:l:B:U:"; #endif + while ((c = getopt(argc, argv, optstr)) != -1) { switch (c) { case 'a': x2apic_mode = 0; break; -#ifdef __FreeBSD__ case 'A': acpi = 1; break; -#endif case 'b': bvmcons = 1; break; + case 'B': + if (smbios_parse(optarg) != 0) { + errx(EX_USAGE, "invalid SMBIOS " + "configuration '%s'", optarg); + } + break; #ifdef __FreeBSD__ case 'p': - pincpu = atoi(optarg); + if (pincpu_parse(optarg) != 0) { + errx(EX_USAGE, "invalid vcpu pinning " + "configuration '%s'", optarg); + } break; #endif - case 'r': - if (optarg[0] == ':') - rfb_port = atoi(optarg + 1) + RFB_PORT; - else - rfb_port = atoi(optarg); - break; case 'c': - guest_ncpus = atoi(optarg); + if (topology_parse(optarg) != 0) { + errx(EX_USAGE, "invalid cpu topology " + "'%s'", optarg); + } + break; + case 'C': + memflags |= VM_MEM_F_INCORE; break; -#ifdef __FreeBSD__ case 'g': + dbg_port = atoi(optarg); + break; + case 'G': + if (optarg[0] == 'w') { + gdb_stop = true; + optarg++; + } gdb_port = atoi(optarg); break; -#endif case 'l': - if (lpc_device_parse(optarg) != 0) { + if (strncmp(optarg, "help", strlen(optarg)) == 0) { + lpc_print_supported_devices(); + exit(0); + } else if (lpc_device_parse(optarg) != 0) { errx(EX_USAGE, "invalid lpc device " "configuration '%s'", optarg); } break; case 's': - if (pci_parse_slot(optarg) != 0) - exit(1); + if (strncmp(optarg, "help", strlen(optarg)) == 0) { + pci_print_supported_devices(); + exit(0); + } else if (pci_parse_slot(optarg) != 0) + exit(4); else break; + case 'S': + memflags |= VM_MEM_F_WIRED; + break; case 'm': error = vm_parse_memsize(optarg, &memsize); if (error) @@ -689,15 +1167,24 @@ main(int argc, char *argv[]) case 'e': strictio = 1; break; + case 'u': + rtc_localtime = 0; + break; case 'U': guest_uuid_str = optarg; break; + case 'w': + strictmsr = 0; + break; case 'W': virtio_msix = 0; break; case 'x': x2apic_mode = 1; break; + 
case 'Y': + mptgen = 0; + break; case 'h': usage(0); default: @@ -711,32 +1198,41 @@ main(int argc, char *argv[]) usage(1); vmname = argv[0]; - - ctx = vm_open(vmname); - if (ctx == NULL) { - perror("vm_open"); - exit(1); - } + ctx = do_open(vmname); max_vcpus = num_vcpus_allowed(ctx); if (guest_ncpus > max_vcpus) { fprintf(stderr, "%d vCPUs requested but only %d available\n", guest_ncpus, max_vcpus); - exit(1); + exit(4); } fbsdrun_set_capabilities(ctx, BSP); + vm_set_memflags(ctx, memflags); +#ifdef __FreeBSD__ err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); +#else + do { + errno = 0; + err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); + error = errno; + if (err != 0 && error == ENOMEM) { + (void) fprintf(stderr, "Unable to allocate memory " + "(%llu), retrying in 1 second\n", memsize); + sleep(1); + } + } while (error == ENOMEM); +#endif if (err) { - fprintf(stderr, "Unable to setup memory (%d)\n", err); - exit(1); + fprintf(stderr, "Unable to setup memory (%d)\n", errno); + exit(4); } error = init_msr(); if (error) { fprintf(stderr, "init_msr error %d", error); - exit(1); + exit(4); } init_mem(); @@ -745,26 +1241,37 @@ main(int argc, char *argv[]) pci_irq_init(ctx); ioapic_init(ctx); - rtc_init(ctx); + rtc_init(ctx, rtc_localtime); + sci_init(ctx); /* - * Exit if a device emulation finds an error in it's initilization + * Exit if a device emulation finds an error in its initialization */ - if (init_pci(ctx) != 0) - exit(1); + if (init_pci(ctx) != 0) { + perror("device emulation initialization error"); + exit(4); + } + + if (dbg_port != 0) + init_dbgport(dbg_port); -#ifdef __FreeBSD__ if (gdb_port != 0) - init_dbgport(gdb_port); -#endif + init_gdb(ctx, gdb_port, gdb_stop); if (bvmcons) init_bvmcons(); - console_init(); - vga_init(); - if (rfb_port != -1) - rfb_init(rfb_port); + vga_init(1); + + if (lpc_bootrom()) { + if (vm_set_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, 1)) { + fprintf(stderr, "ROM boot failed: unrestricted guest " + "capability not available\n"); + exit(4); + } + error = vcpu_reset(ctx, BSP); + assert(error == 0); + } error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip); assert(error == 0); @@ -772,22 +1279,41 @@ main(int argc, char *argv[]) /* * build the guest tables, MP etc. */ - mptable_build(ctx, guest_ncpus); + if (mptgen) { + error = mptable_build(ctx, guest_ncpus); + if (error) { + perror("failed to build the guest tables"); + exit(4); + } + } error = smbios_build(ctx); assert(error == 0); -#ifdef __FreeBSD__ if (acpi) { error = acpi_build(ctx, guest_ncpus); assert(error == 0); } + if (lpc_bootrom()) + fwctl_init(); + /* * Change the proc title to include the VM name. */ - setproctitle("%s", vmname); -#else + setproctitle("%s", vmname); + +#ifndef WITHOUT_CAPSICUM + caph_cache_catpages(); + + if (caph_limit_stdout() == -1 || caph_limit_stderr() == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + + if (caph_enter() == -1) + errx(EX_OSERR, "cap_enter() failed"); +#endif + +#ifndef __FreeBSD__ /* * If applicable, wait for bhyveconsole */ @@ -810,11 +1336,7 @@ main(int argc, char *argv[]) /* * Head off to the main event dispatch loop */ -#ifdef __FreeBSD__ mevent_dispatch(); -#else - pthread_exit(NULL); -#endif - exit(1); + exit(4); } diff --git a/usr/src/cmd/bhyve/bhyverun.h b/usr/src/cmd/bhyve/bhyverun.h index be89314c09..78b3f1111f 100644 --- a/usr/src/cmd/bhyve/bhyverun.h +++ b/usr/src/cmd/bhyve/bhyverun.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved.
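The illumos-only loop above retries vm_setup_memory() for as long as it fails with ENOMEM, since reservoir memory may become available as other workloads shrink. The same pattern as a standalone sketch (hypothetical helper; note it prints the size_t argument with %zu, where the hunk above passes memsize to %llu):

#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <vmmapi.h>

/* Hypothetical helper: keep retrying guest memory setup while the
 * failure is transient (ENOMEM), sleeping between attempts. */
static int
setup_memory_retry(struct vmctx *ctx, size_t memsize)
{
	int err;

	for (;;) {
		errno = 0;
		err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
		if (err == 0 || errno != ENOMEM)
			return (err);
		(void) fprintf(stderr, "Unable to allocate memory "
		    "(%zu), retrying in 1 second\n", memsize);
		sleep(1);
	}
}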
* @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/bhyverun.h 277310 2015-01-18 03:08:30Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -41,17 +43,12 @@ #ifndef _FBSDRUN_H_ #define _FBSDRUN_H_ -#ifndef CTASSERT /* Allow lint to override */ -#define CTASSERT(x) _CTASSERT(x, __LINE__) -#define _CTASSERT(x, y) __CTASSERT(x, y) -#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1] -#endif - #define VMEXIT_CONTINUE (0) #define VMEXIT_ABORT (-1) struct vmctx; extern int guest_ncpus; +extern uint16_t cores, sockets, threads; extern char *guest_uuid_str; extern char *vmname; #ifndef __FreeBSD__ diff --git a/usr/src/cmd/bhyve/block_if.c b/usr/src/cmd/bhyve/block_if.c index 2da946d420..72c5b02a0d 100644 --- a/usr/src/cmd/bhyve/block_if.c +++ b/usr/src/cmd/bhyve/block_if.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> * All rights reserved. * @@ -23,20 +25,36 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z tychon $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z tychon $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif #include <sys/queue.h> #include <sys/errno.h> #include <sys/stat.h> #include <sys/ioctl.h> #include <sys/disk.h> +#include <sys/limits.h> +#include <sys/uio.h> +#ifndef __FreeBSD__ +#include <sys/dkio.h> +#endif #include <assert.h> +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> #include <fcntl.h> #include <stdio.h> #include <stdlib.h> @@ -44,6 +62,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z t #include <pthread.h> #include <pthread_np.h> #include <signal.h> +#include <sysexits.h> #include <unistd.h> #include <machine/atomic.h> @@ -56,16 +75,27 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z t #define BLOCKIF_SIG 0xb109b109 -#define BLOCKIF_MAXREQ 33 +#ifdef __FreeBSD__ +#define BLOCKIF_NUMTHR 8 +#else +/* Enlarge to keep pace with the virtio-block ring size */ +#define BLOCKIF_NUMTHR 16 +#endif +#define BLOCKIF_MAXREQ (BLOCKIF_RING_MAX + BLOCKIF_NUMTHR) enum blockop { BOP_READ, BOP_WRITE, - BOP_FLUSH +#ifndef __FreeBSD__ + BOP_WRITE_SYNC, +#endif + BOP_FLUSH, + BOP_DELETE }; enum blockstat { BST_FREE, + BST_BLOCK, BST_PEND, BST_BUSY, BST_DONE @@ -77,24 +107,40 @@ struct blockif_elem { enum blockop be_op; enum blockstat be_status; pthread_t be_tid; + off_t be_block; }; +#ifndef __FreeBSD__ +enum blockif_wce { + WCE_NONE = 0, + WCE_IOCTL, + WCE_FCNTL +}; +#endif + struct blockif_ctxt { int bc_magic; int bc_fd; + int bc_ischr; + int bc_isgeom; + int bc_candelete; +#ifndef __FreeBSD__ + enum blockif_wce bc_wce; +#endif int bc_rdonly; off_t bc_size; int bc_sectsz; - pthread_t bc_btid; - pthread_mutex_t bc_mtx; - pthread_cond_t bc_cond; + int bc_psectsz; + int bc_psectoff; int bc_closing; + pthread_t bc_btid[BLOCKIF_NUMTHR]; + pthread_mutex_t bc_mtx; + pthread_cond_t bc_cond; /* Request elements and free/pending/busy queues */ TAILQ_HEAD(, blockif_elem) bc_freeq; TAILQ_HEAD(, blockif_elem) bc_pendq; TAILQ_HEAD(, blockif_elem) bc_busyq; - u_int bc_req_count; 
struct blockif_elem bc_reqs[BLOCKIF_MAXREQ]; }; @@ -113,83 +159,214 @@ static int blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq, enum blockop op) { - struct blockif_elem *be; - - assert(bc->bc_req_count < BLOCKIF_MAXREQ); + struct blockif_elem *be, *tbe; + off_t off; + int i; be = TAILQ_FIRST(&bc->bc_freeq); assert(be != NULL); assert(be->be_status == BST_FREE); - TAILQ_REMOVE(&bc->bc_freeq, be, be_link); - be->be_status = BST_PEND; be->be_req = breq; be->be_op = op; + switch (op) { + case BOP_READ: + case BOP_WRITE: +#ifndef __FreeBSD__ + case BOP_WRITE_SYNC: +#endif + case BOP_DELETE: + off = breq->br_offset; + for (i = 0; i < breq->br_iovcnt; i++) + off += breq->br_iov[i].iov_len; + break; + default: + off = OFF_MAX; + } + be->be_block = off; + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + if (tbe == NULL) { + TAILQ_FOREACH(tbe, &bc->bc_busyq, be_link) { + if (tbe->be_block == breq->br_offset) + break; + } + } + if (tbe == NULL) + be->be_status = BST_PEND; + else + be->be_status = BST_BLOCK; TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link); - - bc->bc_req_count++; - - return (0); + return (be->be_status == BST_PEND); } static int -blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep) +blockif_dequeue(struct blockif_ctxt *bc, pthread_t t, struct blockif_elem **bep) { struct blockif_elem *be; - if (bc->bc_req_count == 0) - return (ENOENT); - - be = TAILQ_FIRST(&bc->bc_pendq); - assert(be != NULL); - assert(be->be_status == BST_PEND); + TAILQ_FOREACH(be, &bc->bc_pendq, be_link) { + if (be->be_status == BST_PEND) + break; + assert(be->be_status == BST_BLOCK); + } + if (be == NULL) + return (0); TAILQ_REMOVE(&bc->bc_pendq, be, be_link); be->be_status = BST_BUSY; - be->be_tid = bc->bc_btid; + be->be_tid = t; TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link); - *bep = be; - - return (0); + return (1); } static void blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be) { - assert(be->be_status == BST_DONE); + struct blockif_elem *tbe; - TAILQ_REMOVE(&bc->bc_busyq, be, be_link); + if (be->be_status == BST_DONE || be->be_status == BST_BUSY) + TAILQ_REMOVE(&bc->bc_busyq, be, be_link); + else + TAILQ_REMOVE(&bc->bc_pendq, be, be_link); + TAILQ_FOREACH(tbe, &bc->bc_pendq, be_link) { + if (tbe->be_req->br_offset == be->be_block) + tbe->be_status = BST_PEND; + } be->be_tid = 0; be->be_status = BST_FREE; be->be_req = NULL; TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); - - bc->bc_req_count--; } static void -blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be) +blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be, uint8_t *buf) { struct blockif_req *br; - int err; +#ifdef __FreeBSD__ + off_t arg[2]; +#endif + ssize_t clen, len, off, boff, voff; + int i, err; br = be->be_req; + if (br->br_iovcnt <= 1) + buf = NULL; err = 0; - switch (be->be_op) { case BOP_READ: - if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, - br->br_offset) < 0) - err = errno; + if (buf == NULL) { + if ((len = preadv(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + if (pread(bc->bc_fd, buf, len, br->br_offset + + off) < 0) { + err = errno; + break; + } + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy(br->br_iov[i].iov_base + voff, + buf + boff, clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; 
+ voff = 0; + } + boff += clen; + } while (boff < len); + off += len; + br->br_resid -= len; + } break; case BOP_WRITE: - if (bc->bc_rdonly) + if (bc->bc_rdonly) { err = EROFS; - else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, - br->br_offset) < 0) - err = errno; + break; + } + if (buf == NULL) { + if ((len = pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt, + br->br_offset)) < 0) + err = errno; + else + br->br_resid -= len; + break; + } + i = 0; + off = voff = 0; + while (br->br_resid > 0) { + len = MIN(br->br_resid, MAXPHYS); + boff = 0; + do { + clen = MIN(len - boff, br->br_iov[i].iov_len - + voff); + memcpy(buf + boff, + br->br_iov[i].iov_base + voff, clen); + if (clen < br->br_iov[i].iov_len - voff) + voff += clen; + else { + i++; + voff = 0; + } + boff += clen; + } while (boff < len); + if (pwrite(bc->bc_fd, buf, len, br->br_offset + + off) < 0) { + err = errno; + break; + } + off += len; + br->br_resid -= len; + } break; case BOP_FLUSH: +#ifdef __FreeBSD__ + if (bc->bc_ischr) { + if (ioctl(bc->bc_fd, DIOCGFLUSH)) + err = errno; + } else if (fsync(bc->bc_fd)) + err = errno; +#else + /* + * This fsync() should be adequate to flush the cache of a file + * or device. In VFS, the VOP_SYNC operation is converted to + * the appropriate ioctl in both sdev (for real devices) and + * zfs (for zvols). + */ + if (fsync(bc->bc_fd)) + err = errno; +#endif + break; + case BOP_DELETE: + if (!bc->bc_candelete) + err = EOPNOTSUPP; + else if (bc->bc_rdonly) + err = EROFS; +#ifdef __FreeBSD__ + else if (bc->bc_ischr) { + arg[0] = br->br_offset; + arg[1] = br->br_resid; + if (ioctl(bc->bc_fd, DIOCGDELETE, arg)) + err = errno; + else + br->br_resid = 0; + } +#endif + else + err = EOPNOTSUPP; break; default: err = EINVAL; @@ -206,28 +383,34 @@ blockif_thr(void *arg) { struct blockif_ctxt *bc; struct blockif_elem *be; + pthread_t t; + uint8_t *buf; bc = arg; + if (bc->bc_isgeom) + buf = malloc(MAXPHYS); + else + buf = NULL; + t = pthread_self(); + pthread_mutex_lock(&bc->bc_mtx); for (;;) { - pthread_mutex_lock(&bc->bc_mtx); - while (!blockif_dequeue(bc, &be)) { + while (blockif_dequeue(bc, t, &be)) { pthread_mutex_unlock(&bc->bc_mtx); - blockif_proc(bc, be); + blockif_proc(bc, be, buf); pthread_mutex_lock(&bc->bc_mtx); blockif_complete(bc, be); } - pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); - pthread_mutex_unlock(&bc->bc_mtx); - - /* - * Check ctxt status here to see if exit requested - */ + /* Check ctxt status here to see if exit requested */ if (bc->bc_closing) - pthread_exit(NULL); + break; + pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx); } + pthread_mutex_unlock(&bc->bc_mtx); - /* Not reached */ + if (buf) + free(buf); + pthread_exit(NULL); return (NULL); } @@ -276,15 +459,31 @@ struct blockif_ctxt * blockif_open(const char *optstr, const char *ident) { char tname[MAXCOMLEN + 1]; - char *nopt, *xopts; +#ifdef __FreeBSD__ + char name[MAXPATHLEN]; + char *nopt, *xopts, *cp; +#else + char *nopt, *xopts, *cp = NULL; +#endif struct blockif_ctxt *bc; struct stat sbuf; - off_t size; +#ifdef __FreeBSD__ + struct diocgattr_arg arg; +#else + enum blockif_wce wce = WCE_NONE; +#endif + off_t size, psectsz, psectoff; int extra, fd, i, sectsz; - int nocache, sync, ro; + int nocache, sync, ro, candelete, geom, ssopt, pssopt; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { DIOCGFLUSH, DIOCGDELETE }; +#endif pthread_once(&blockif_once, blockif_init); + fd = -1; + ssopt = 0; nocache = 0; sync = 0; ro = 0; @@ -293,16 +492,25 @@ blockif_open(const char *optstr, const char *ident) * The 
first element in the optstring is always a pathname. * Optional elements follow */ - nopt = strdup(optstr); - for (xopts = strtok(nopt, ","); - xopts != NULL; - xopts = strtok(NULL, ",")) { - if (!strcmp(xopts, "nocache")) + nopt = xopts = strdup(optstr); + while (xopts != NULL) { + cp = strsep(&xopts, ","); + if (cp == nopt) /* file or device pathname */ + continue; + else if (!strcmp(cp, "nocache")) nocache = 1; - else if (!strcmp(xopts, "sync")) + else if (!strcmp(cp, "sync") || !strcmp(cp, "direct")) sync = 1; - else if (!strcmp(xopts, "ro")) + else if (!strcmp(cp, "ro")) ro = 1; + else if (sscanf(cp, "sectorsize=%d/%d", &ssopt, &pssopt) == 2) + ; + else if (sscanf(cp, "sectorsize=%d", &ssopt) == 1) + pssopt = ssopt; + else { + fprintf(stderr, "Invalid device option \"%s\"\n", cp); + goto err; + } } extra = 0; @@ -319,62 +527,185 @@ } if (fd < 0) { - perror("Could not open backing file"); - return (NULL); + warn("Could not open backing file: %s", nopt); + goto err; } if (fstat(fd, &sbuf) < 0) { - perror("Could not stat backing file"); - close(fd); - return (NULL); + warn("Could not stat backing file %s", nopt); + goto err; } +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_FSYNC, CAP_IOCTL, CAP_READ, CAP_SEEK, + CAP_WRITE); + if (ro) + cap_rights_clear(&rights, CAP_FSYNC, CAP_WRITE); + + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + /* * Deal with raw devices */ size = sbuf.st_size; sectsz = DEV_BSIZE; + psectsz = psectoff = 0; + candelete = geom = 0; #ifdef __FreeBSD__ if (S_ISCHR(sbuf.st_mode)) { if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || ioctl(fd, DIOCGSECTORSIZE, &sectsz)) { perror("Could not fetch dev blk/sector size"); - close(fd); - return (NULL); + goto err; } assert(size != 0); assert(sectsz != 0); + if (ioctl(fd, DIOCGSTRIPESIZE, &psectsz) == 0 && psectsz > 0) + ioctl(fd, DIOCGSTRIPEOFFSET, &psectoff); + strlcpy(arg.name, "GEOM::candelete", sizeof(arg.name)); + arg.len = sizeof(arg.value.i); + if (ioctl(fd, DIOCGATTR, &arg) == 0) + candelete = arg.value.i; + if (ioctl(fd, DIOCGPROVIDERNAME, name) == 0) + geom = 1; + } else { + psectsz = sbuf.st_blksize; } +#else + psectsz = sbuf.st_blksize; + if (S_ISCHR(sbuf.st_mode)) { + struct dk_minfo_ext dkmext; + int wce_val; + + /* Look for a more accurate physical blocksize */ + if (ioctl(fd, DKIOCGMEDIAINFOEXT, &dkmext) == 0) { + psectsz = dkmext.dki_pbsize; + } + /* See if a configurable write cache is present and working */ + if (ioctl(fd, DKIOCGETWCE, &wce_val) == 0) { + /* + * If WCE is already active, disable it until the + * specific device driver calls for its return. If it + * is not active, toggle it on and off to verify that + * such actions are possible. + */ + if (wce_val != 0) { + wce_val = 0; + /* + * Inability to disable the cache is a threat + * to data durability. + */ + assert(ioctl(fd, DKIOCSETWCE, &wce_val) == 0); + wce = WCE_IOCTL; + } else { + int r1, r2; + + wce_val = 1; + r1 = ioctl(fd, DKIOCSETWCE, &wce_val); + wce_val = 0; + r2 = ioctl(fd, DKIOCSETWCE, &wce_val); + + if (r1 == 0 && r2 == 0) { + wce = WCE_IOCTL; + } else { + /* + * If the cache toggle was not + * successful, ensure that the cache + * was not left enabled.
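blockif_enqueue() earlier in this hunk is uncommented, so its ordering rule is easy to miss: be_block records where a data request ends (OFF_MAX for flushes), and a new request whose start offset equals the end offset of any pending or busy request is parked as BST_BLOCK; blockif_complete() later flips matching requests back to BST_PEND. The effect is that a request beginning exactly where an earlier one ends cannot be reordered around it by the worker pool. A sketch of the end-offset computation that drives this (derived from the code above):

#include <sys/types.h>

#include "block_if.h"

/* End offset of a data request: start plus the total iovec length. */
static off_t
request_end(const struct blockif_req *br)
{
	off_t end = br->br_offset;
	int i;

	for (i = 0; i < br->br_iovcnt; i++)
		end += br->br_iov[i].iov_len;
	return (end);
}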
+ */ + assert(r1 != 0); + } + } + } + } else { + int flags; + + if ((flags = fcntl(fd, F_GETFL)) >= 0) { + flags |= O_DSYNC; + if (fcntl(fd, F_SETFL, flags) != -1) { + wce = WCE_FCNTL; + } + } + } +#endif + +#ifndef WITHOUT_CAPSICUM + if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); #endif + if (ssopt != 0) { + if (!powerof2(ssopt) || !powerof2(pssopt) || ssopt < 512 || + ssopt > pssopt) { + fprintf(stderr, "Invalid sector size %d/%d\n", + ssopt, pssopt); + goto err; + } + + /* + * Some backend drivers (e.g. cd0, ada0) require that the I/O + * size be a multiple of the device's sector size. + * + * Validate that the emulated sector size complies with this + * requirement. + */ + if (S_ISCHR(sbuf.st_mode)) { + if (ssopt < sectsz || (ssopt % sectsz) != 0) { + fprintf(stderr, "Sector size %d incompatible " + "with underlying device sector size %d\n", + ssopt, sectsz); + goto err; + } + } + + sectsz = ssopt; + psectsz = pssopt; + psectoff = 0; + } + bc = calloc(1, sizeof(struct blockif_ctxt)); if (bc == NULL) { - close(fd); - return (NULL); + perror("calloc"); + goto err; } bc->bc_magic = BLOCKIF_SIG; bc->bc_fd = fd; + bc->bc_ischr = S_ISCHR(sbuf.st_mode); + bc->bc_isgeom = geom; + bc->bc_candelete = candelete; +#ifndef __FreeBSD__ + bc->bc_wce = wce; +#endif bc->bc_rdonly = ro; bc->bc_size = size; bc->bc_sectsz = sectsz; + bc->bc_psectsz = psectsz; + bc->bc_psectoff = psectoff; pthread_mutex_init(&bc->bc_mtx, NULL); pthread_cond_init(&bc->bc_cond, NULL); TAILQ_INIT(&bc->bc_freeq); TAILQ_INIT(&bc->bc_pendq); TAILQ_INIT(&bc->bc_busyq); - bc->bc_req_count = 0; for (i = 0; i < BLOCKIF_MAXREQ; i++) { bc->bc_reqs[i].be_status = BST_FREE; TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link); } - pthread_create(&bc->bc_btid, NULL, blockif_thr, bc); - - snprintf(tname, sizeof(tname), "blk-%s", ident); - pthread_set_name_np(bc->bc_btid, tname); + for (i = 0; i < BLOCKIF_NUMTHR; i++) { + pthread_create(&bc->bc_btid[i], NULL, blockif_thr, bc); + snprintf(tname, sizeof(tname), "blk-%s-%d", ident, i); + pthread_set_name_np(bc->bc_btid[i], tname); + } return (bc); +err: + if (fd >= 0) + close(fd); + free(nopt); + return (NULL); } static int @@ -386,13 +717,13 @@ blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq, err = 0; pthread_mutex_lock(&bc->bc_mtx); - if (bc->bc_req_count < BLOCKIF_MAXREQ) { + if (!TAILQ_EMPTY(&bc->bc_freeq)) { /* * Enqueue and inform the block i/o thread * that there is work available */ - blockif_enqueue(bc, breq, op); - pthread_cond_signal(&bc->bc_cond); + if (blockif_enqueue(bc, breq, op)) + pthread_cond_signal(&bc->bc_cond); } else { /* * Callers are not allowed to enqueue more than @@ -432,6 +763,14 @@ blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq) } int +blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (blockif_request(bc, breq, BOP_DELETE)); +} + +int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) { struct blockif_elem *be; @@ -450,11 +789,7 @@ blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq) /* * Found it. 
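The sectorsize=logical[/physical] handling above enforces a few invariants before overriding the probed geometry. Condensed into a hedged standalone check (hypothetical helper; the rules are the ones applied above):

#include <sys/param.h>	/* powerof2() */
#include <stdbool.h>

/* Logical (ssopt) and physical (pssopt) sizes must be powers of two,
 * the logical size at least 512 bytes and no larger than the physical
 * size, and on a character device the emulated logical size must be a
 * multiple of the native sector size so I/O stays aligned. */
static bool
sectsz_ok(int ssopt, int pssopt, int native_sectsz, bool ischr)
{
	if (!powerof2(ssopt) || !powerof2(pssopt) ||
	    ssopt < 512 || ssopt > pssopt)
		return (false);
	if (ischr && (ssopt < native_sectsz || (ssopt % native_sectsz) != 0))
		return (false);
	return (true);
}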
*/ - TAILQ_REMOVE(&bc->bc_pendq, be, be_link); - be->be_status = BST_FREE; - be->be_req = NULL; - TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link); - bc->bc_req_count--; + blockif_complete(bc, be); pthread_mutex_unlock(&bc->bc_mtx); return (0); @@ -515,18 +850,19 @@ int blockif_close(struct blockif_ctxt *bc) { void *jval; - int err; - - err = 0; + int i; assert(bc->bc_magic == BLOCKIF_SIG); /* * Stop the block i/o thread */ + pthread_mutex_lock(&bc->bc_mtx); bc->bc_closing = 1; - pthread_cond_signal(&bc->bc_cond); - pthread_join(bc->bc_btid, &jval); + pthread_mutex_unlock(&bc->bc_mtx); + pthread_cond_broadcast(&bc->bc_cond); + for (i = 0; i < BLOCKIF_NUMTHR; i++) + pthread_join(bc->bc_btid[i], &jval); /* XXX Cancel queued i/o's ??? */ @@ -608,6 +944,15 @@ blockif_sectsz(struct blockif_ctxt *bc) return (bc->bc_sectsz); } +void +blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + *size = bc->bc_psectsz; + *off = bc->bc_psectoff; +} + int blockif_queuesz(struct blockif_ctxt *bc) { @@ -623,3 +968,54 @@ blockif_is_ro(struct blockif_ctxt *bc) assert(bc->bc_magic == BLOCKIF_SIG); return (bc->bc_rdonly); } + +int +blockif_candelete(struct blockif_ctxt *bc) +{ + + assert(bc->bc_magic == BLOCKIF_SIG); + return (bc->bc_candelete); +} + +#ifndef __FreeBSD__ +int +blockif_set_wce(struct blockif_ctxt *bc, int wc_enable) +{ + int res = 0, flags; + int clean_val = (wc_enable != 0) ? 1 : 0; + + (void) pthread_mutex_lock(&bc->bc_mtx); + switch (bc->bc_wce) { + case WCE_IOCTL: + res = ioctl(bc->bc_fd, DKIOCSETWCE, &clean_val); + break; + case WCE_FCNTL: + if ((flags = fcntl(bc->bc_fd, F_GETFL)) >= 0) { + if (wc_enable == 0) { + flags |= O_DSYNC; + } else { + flags &= ~O_DSYNC; + } + if (fcntl(bc->bc_fd, F_SETFL, flags) == -1) { + res = -1; + } + } else { + res = -1; + } + break; + default: + break; + } + + /* + * After a successful disable of the write cache, ensure that any + * lingering data in the cache is synced out. + */ + if (res == 0 && wc_enable == 0) { + res = fsync(bc->bc_fd); + } + (void) pthread_mutex_unlock(&bc->bc_mtx); + + return (res); +} +#endif /* __FreeBSD__ */ diff --git a/usr/src/cmd/bhyve/block_if.h b/usr/src/cmd/bhyve/block_if.h index 5ef120933c..bff2b42768 100644 --- a/usr/src/cmd/bhyve/block_if.h +++ b/usr/src/cmd/bhyve/block_if.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/block_if.h 268638 2014-07-15 00:25:54Z grehan $ + * $FreeBSD$ */ /* @@ -39,18 +41,21 @@ #include <sys/uio.h> #include <sys/unistd.h> -#ifdef __FreeBSD__ -#define BLOCKIF_IOV_MAX 32 /* not practical to be IOV_MAX */ -#else -#define BLOCKIF_IOV_MAX 16 /* not practical to be IOV_MAX */ -#endif +/* + * BLOCKIF_IOV_MAX is the maximum number of scatter/gather entries in + * a single request. BLOCKIF_RING_MAX is the maximum number of + * pending requests that can be queued.
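blockif_set_wce() above lets an illumos device model mirror the guest's write-cache policy onto the backend opened with blockif_open(). A hypothetical caller (the trigger, such as a virtio-block write-cache-enable toggle, and the function name are illustrative, not from the source):

#include <stdio.h>

#include "block_if.h"

/* Propagate a guest-requested cache policy to the backend; a failed
 * disable is the dangerous case, since writes may remain volatile. */
static void
update_cache_policy(struct blockif_ctxt *bc, int guest_wce)
{
	if (blockif_set_wce(bc, guest_wce) != 0) {
		fprintf(stderr, "unable to %s write cache\n",
		    guest_wce ? "enable" : "disable");
	}
}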
+ */ +#define BLOCKIF_IOV_MAX 128 /* not practical to be IOV_MAX */ +#define BLOCKIF_RING_MAX 128 struct blockif_req { - struct iovec br_iov[BLOCKIF_IOV_MAX]; int br_iovcnt; off_t br_offset; + ssize_t br_resid; void (*br_callback)(struct blockif_req *req, int err); void *br_param; + struct iovec br_iov[BLOCKIF_IOV_MAX]; }; struct blockif_ctxt; @@ -59,11 +64,17 @@ off_t blockif_size(struct blockif_ctxt *bc); void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s); int blockif_sectsz(struct blockif_ctxt *bc); +void blockif_psectsz(struct blockif_ctxt *bc, int *size, int *off); int blockif_queuesz(struct blockif_ctxt *bc); int blockif_is_ro(struct blockif_ctxt *bc); +int blockif_candelete(struct blockif_ctxt *bc); +#ifndef __FreeBSD__ +int blockif_set_wce(struct blockif_ctxt *bc, int enable); +#endif int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq); +int blockif_delete(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq); int blockif_close(struct blockif_ctxt *bc); diff --git a/usr/src/cmd/bhyve/bootrom.c b/usr/src/cmd/bhyve/bootrom.c new file mode 100644 index 0000000000..b8c63828c8 --- /dev/null +++ b/usr/src/cmd/bhyve/bootrom.c @@ -0,0 +1,113 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
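With the queue now drained by BLOCKIF_NUMTHR worker threads, BLOCKIF_MAXREQ is sized BLOCKIF_RING_MAX + BLOCKIF_NUMTHR, presumably so that a full guest ring plus one in-flight element per worker can never exhaust the pool. The asynchronous interface declared here is used roughly as follows (a hedged sketch; the callback and field values are illustrative):

#include <stdio.h>
#include <string.h>

#include "block_if.h"

static void
sector_done(struct blockif_req *br, int err)
{
	/* Runs on a blockif worker thread once the I/O completes. */
	if (err != 0)
		fprintf(stderr, "read failed: %d\n", err);
}

static void
read_first_sector(struct blockif_ctxt *bc, void *buf, size_t sectsz)
{
	/* static: the request must stay alive until the callback fires */
	static struct blockif_req br;

	memset(&br, 0, sizeof(br));
	br.br_iov[0].iov_base = buf;
	br.br_iov[0].iov_len = sectsz;
	br.br_iovcnt = 1;
	br.br_offset = 0;		/* byte offset into the backend */
	br.br_resid = sectsz;		/* decremented as data moves */
	br.br_callback = sector_done;
	br.br_param = NULL;
	if (blockif_read(bc, &br) != 0)
		fprintf(stderr, "enqueue failed\n");
}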
+ */ + +#include <sys/param.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/stat.h> + +#include <machine/vmm.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <stdbool.h> + +#include <vmmapi.h> +#include "bhyverun.h" +#include "bootrom.h" + +#define MAX_BOOTROM_SIZE (16 * 1024 * 1024) /* 16 MB */ + +int +bootrom_init(struct vmctx *ctx, const char *romfile) +{ + struct stat sbuf; + vm_paddr_t gpa; + ssize_t rlen; + char *ptr; + int fd, i, rv, prot; + + rv = -1; + fd = open(romfile, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "Error opening bootrom \"%s\": %s\n", + romfile, strerror(errno)); + goto done; + } + + if (fstat(fd, &sbuf) < 0) { + fprintf(stderr, "Could not fstat bootrom file \"%s\": %s\n", + romfile, strerror(errno)); + goto done; + } + + /* + * Limit bootrom size to 16MB so it doesn't encroach into reserved + * MMIO space (e.g. APIC, HPET, MSI). + */ + if (sbuf.st_size > MAX_BOOTROM_SIZE || sbuf.st_size < PAGE_SIZE) { + fprintf(stderr, "Invalid bootrom size %ld\n", sbuf.st_size); + goto done; + } + + if (sbuf.st_size & PAGE_MASK) { + fprintf(stderr, "Bootrom size %ld is not a multiple of the " + "page size\n", sbuf.st_size); + goto done; + } + + ptr = vm_create_devmem(ctx, VM_BOOTROM, "bootrom", sbuf.st_size); + if (ptr == MAP_FAILED) + goto done; + + /* Map the bootrom into the guest address space */ + prot = PROT_READ | PROT_EXEC; + gpa = (1ULL << 32) - sbuf.st_size; + if (vm_mmap_memseg(ctx, gpa, VM_BOOTROM, 0, sbuf.st_size, prot) != 0) + goto done; + + /* Read 'romfile' into the guest address space */ + for (i = 0; i < sbuf.st_size / PAGE_SIZE; i++) { + rlen = read(fd, ptr + i * PAGE_SIZE, PAGE_SIZE); + if (rlen != PAGE_SIZE) { + fprintf(stderr, "Incomplete read of page %d of bootrom " + "file %s: %ld bytes\n", i, romfile, rlen); + goto done; + } + } + rv = 0; +done: + if (fd >= 0) + close(fd); + return (rv); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ipi.h b/usr/src/cmd/bhyve/bootrom.h index 4dff03ba1f..7fb12181dd 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_ipi.h +++ b/usr/src/cmd/bhyve/bootrom.h @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2011 NetApp, Inc. + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Neel Natu <neel@freebsd.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -11,10 +13,10 @@ * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -23,15 +25,16 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
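bootrom_init() above maps the ROM read/execute so that it ends exactly at the 4 GiB boundary, which places the x86 reset vector (physical address 0xFFFFFFF0) inside the image. The placement math, restated as a small hedged helper (assumes the 4 KiB page size and the 16 MB cap used above):

#include <stddef.h>
#include <stdint.h>

#define BOOTROM_PAGE	4096ULL
#define BOOTROM_MAX	(16ULL * 1024 * 1024)	/* MAX_BOOTROM_SIZE above */

/* Guest-physical load address for a ROM of 'size' bytes, or 0 if the
 * size is unmappable under the rules enforced above. */
static uint64_t
bootrom_gpa(size_t size)
{
	if (size == 0 || size > BOOTROM_MAX || (size % BOOTROM_PAGE) != 0)
		return (0);
	return ((1ULL << 32) - size);	/* image ends at exactly 4 GiB */
}

A 1 MiB ROM, for example, loads at 0xFFF00000 and runs up to the 4 GiB boundary.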
* - * $FreeBSD: head/sys/amd64/vmm/vmm_ipi.h 260466 2014-01-09 03:25:54Z neel $ + * $FreeBSD$ */ -#ifndef _VMM_IPI_H_ -#define _VMM_IPI_H_ +#ifndef _BOOTROM_H_ +#define _BOOTROM_H_ -#ifdef __FreeBSD__ -int vmm_ipi_alloc(void); -void vmm_ipi_free(int num); -#endif +#include <stdbool.h> + +struct vmctx; + +int bootrom_init(struct vmctx *ctx, const char *romfile); #endif diff --git a/usr/src/cmd/bhyve/console.c b/usr/src/cmd/bhyve/console.c index a8d07709be..2567f69959 100644 --- a/usr/src/cmd/bhyve/console.c +++ b/usr/src/cmd/bhyve/console.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * @@ -40,15 +42,23 @@ static struct { kbd_event_func_t kbd_event_cb; void *kbd_arg; + int kbd_priority; ptr_event_func_t ptr_event_cb; void *ptr_arg; + int ptr_priority; } console; void -console_init(void) +console_init(int w, int h, void *fbaddr) +{ + console.gc = bhyvegc_init(w, h, fbaddr); +} + +void +console_set_fbaddr(void *fbaddr) { - console.gc = bhyvegc_init(640, 400); + bhyvegc_set_fbaddr(console.gc, fbaddr); } struct bhyvegc_image * @@ -71,31 +81,40 @@ console_fb_register(fb_render_func_t render_cb, void *arg) void console_refresh(void) { - (*console.fb_render_cb)(console.gc, console.fb_arg); + if (console.fb_render_cb) + (*console.fb_render_cb)(console.gc, console.fb_arg); } void -console_kbd_register(kbd_event_func_t event_cb, void *arg) +console_kbd_register(kbd_event_func_t event_cb, void *arg, int pri) { - console.kbd_event_cb = event_cb; - console.kbd_arg = arg; + if (pri > console.kbd_priority) { + console.kbd_event_cb = event_cb; + console.kbd_arg = arg; + console.kbd_priority = pri; + } } void -console_ptr_register(ptr_event_func_t event_cb, void *arg) +console_ptr_register(ptr_event_func_t event_cb, void *arg, int pri) { - console.ptr_event_cb = event_cb; - console.ptr_arg = arg; + if (pri > console.ptr_priority) { + console.ptr_event_cb = event_cb; + console.ptr_arg = arg; + console.ptr_priority = pri; + } } void console_key_event(int down, uint32_t keysym) { - (*console.kbd_event_cb)(down, keysym, console.kbd_arg); + if (console.kbd_event_cb) + (*console.kbd_event_cb)(down, keysym, console.kbd_arg); } void console_ptr_event(uint8_t button, int x, int y) { - (*console.ptr_event_cb)(button, x, y, console.ptr_arg); + if (console.ptr_event_cb) + (*console.ptr_event_cb)(button, x, y, console.ptr_arg); } diff --git a/usr/src/cmd/bhyve/console.h b/usr/src/cmd/bhyve/console.h index bffb7c2456..0d0a854866 100644 --- a/usr/src/cmd/bhyve/console.h +++ b/usr/src/cmd/bhyve/console.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. 
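console.c above moves to prioritized callback registration so that, when two input sources are present, the higher-priority one wins without the loser's registration failing. A hypothetical registration sequence (priority values and handler names are illustrative, not from the source):

#include <stdint.h>

#include "console.h"

#define PRI_PS2_MOUSE	1	/* hypothetical priority values */
#define PRI_USB_TABLET	10

static void
ps2_ptr(uint8_t mask, int x, int y, void *arg) { /* relative mouse */ }

static void
tablet_ptr(uint8_t mask, int x, int y, void *arg) { /* absolute tablet */ }

static void
register_pointer_sources(void)
{
	/* The PS/2 mouse registers first... */
	console_ptr_register(ps2_ptr, NULL, PRI_PS2_MOUSE);
	/* ...and the USB tablet, registering later with a higher
	 * priority, displaces it; lower-priority calls are ignored. */
	console_ptr_register(tablet_ptr, NULL, PRI_USB_TABLET);
}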
* @@ -35,16 +37,19 @@ typedef void (*fb_render_func_t)(struct bhyvegc *gc, void *arg); typedef void (*kbd_event_func_t)(int down, uint32_t keysym, void *arg); typedef void (*ptr_event_func_t)(uint8_t mask, int x, int y, void *arg); -void console_init(void); +void console_init(int w, int h, void *fbaddr); + +void console_set_fbaddr(void *fbaddr); + struct bhyvegc_image *console_get_image(void); -void console_fb_register(fb_render_func_t render_cb, void *arg); -void console_refresh(void); +void console_fb_register(fb_render_func_t render_cb, void *arg); +void console_refresh(void); -void console_kbd_register(kbd_event_func_t event_cb, void *arg); -void console_key_event(int down, uint32_t keysym); +void console_kbd_register(kbd_event_func_t event_cb, void *arg, int pri); +void console_key_event(int down, uint32_t keysym); -void console_ptr_register(ptr_event_func_t event_cb, void *arg); -void console_ptr_event(uint8_t button, int x, int y); +void console_ptr_register(ptr_event_func_t event_cb, void *arg, int pri); +void console_ptr_event(uint8_t button, int x, int y); #endif /* _CONSOLE_H_ */ diff --git a/usr/src/cmd/bhyve/consport.c b/usr/src/cmd/bhyve/consport.c index 69b6dfddf1..cda2df2414 100644 --- a/usr/src/cmd/bhyve/consport.c +++ b/usr/src/cmd/bhyve/consport.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,20 +25,29 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z jhb $ + * $FreeBSD$ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z jhb $"); +__FBSDID("$FreeBSD$"); #include <sys/types.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif #include <sys/select.h> +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> #include <stdio.h> #include <stdlib.h> #include <termios.h> #include <unistd.h> #include <stdbool.h> +#include <sysexits.h> #include "inout.h" #include "pci_lpc.h" @@ -44,6 +55,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z j #define BVM_CONSOLE_PORT 0x220 #define BVM_CONS_SIG ('b' << 8 | 'v') +#ifdef __FreeBSD__ static struct termios tio_orig, tio_new; static void @@ -51,6 +63,7 @@ ttyclose(void) { tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); } +#endif static void ttyopen(void) @@ -68,14 +81,14 @@ ttyopen(void) static bool tty_char_available(void) { - fd_set rfds; - struct timeval tv; - - FD_ZERO(&rfds); - FD_SET(STDIN_FILENO, &rfds); - tv.tv_sec = 0; - tv.tv_usec = 0; - if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { + fd_set rfds; + struct timeval tv; + + FD_ZERO(&rfds); + FD_SET(STDIN_FILENO, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 0; + if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) { return (true); } else { return (false); @@ -106,6 +119,10 @@ console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { static int opened; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; +#endif if (bytes == 2 && in) { *eax = BVM_CONS_SIG; @@ -125,6 +142,14 @@ console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, return (-1); if (!opened) { +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, + CAP_WRITE); + if (caph_rights_limit(STDIN_FILENO, &rights) == -1) 
+ errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(STDIN_FILENO, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif ttyopen(); opened = 1; } diff --git a/usr/src/cmd/bhyve/dbgport.c b/usr/src/cmd/bhyve/dbgport.c new file mode 100644 index 0000000000..88a616b50d --- /dev/null +++ b/usr/src/cmd/bhyve/dbgport.c @@ -0,0 +1,180 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/socket.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#include <sys/uio.h> + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <stdio.h> +#include <stdlib.h> +#include <sysexits.h> +#include <fcntl.h> +#include <unistd.h> +#include <errno.h> + +#include "inout.h" +#include "dbgport.h" +#include "pci_lpc.h" + +#define BVM_DBG_PORT 0x224 +#define BVM_DBG_SIG ('B' << 8 | 'V') + +static int listen_fd, conn_fd; + +static struct sockaddr_in sin; + +static int +dbg_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + int nwritten, nread, printonce; + int on = 1; + char ch; + + if (bytes == 2 && in) { + *eax = BVM_DBG_SIG; + return (0); + } + + if (bytes != 4) + return (-1); + +again: + printonce = 0; + while (conn_fd < 0) { + if (!printonce) { + printf("Waiting for connection from gdb\r\n"); + printonce = 1; + } + conn_fd = accept4(listen_fd, NULL, NULL, SOCK_NONBLOCK); + if (conn_fd >= 0) { + /* Avoid EPIPE after the client drops off. */ + (void)setsockopt(conn_fd, SOL_SOCKET, SO_NOSIGPIPE, + &on, sizeof(on)); + /* Improve latency for one byte at a time transfers.
*/ + (void)setsockopt(conn_fd, IPPROTO_TCP, TCP_NODELAY, + &on, sizeof(on)); + } else if (errno != EINTR) { + perror("accept"); + } + } + + if (in) { + nread = read(conn_fd, &ch, 1); + if (nread == -1 && errno == EAGAIN) + *eax = -1; + else if (nread == 1) + *eax = ch; + else { + close(conn_fd); + conn_fd = -1; + goto again; + } + } else { + ch = *eax; + nwritten = write(conn_fd, &ch, 1); + if (nwritten != 1) { + close(conn_fd); + conn_fd = -1; + goto again; + } + } + return (0); +} + +static struct inout_port dbgport = { + "bvmdbg", + BVM_DBG_PORT, + 1, + IOPORT_F_INOUT, + dbg_handler +}; + +SYSRES_IO(BVM_DBG_PORT, 4); + +void +init_dbgport(int sport) +{ + int reuse; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + conn_fd = -1; + + if ((listen_fd = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("cannot create socket"); + exit(4); + } + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(sport); + + reuse = 1; + if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, + sizeof(reuse)) < 0) { + perror("cannot set socket options"); + exit(4); + } + + if (bind(listen_fd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("cannot bind socket"); + exit(4); + } + + if (listen(listen_fd, 1) < 0) { + perror("cannot listen socket"); + exit(4); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(listen_fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + register_inout(&dbgport); +} diff --git a/usr/src/cmd/bhyve/dbgport.h b/usr/src/cmd/bhyve/dbgport.h index b95df0bd31..407ff3ffbf 100644 --- a/usr/src/cmd/bhyve/dbgport.h +++ b/usr/src/cmd/bhyve/dbgport.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/dbgport.h 256156 2013-10-08 16:36:17Z neel $ + * $FreeBSD$ */ #ifndef _DBGPORT_H_ diff --git a/usr/src/cmd/bhyve/fwctl.c b/usr/src/cmd/bhyve/fwctl.c new file mode 100644 index 0000000000..0640bc28ba --- /dev/null +++ b/usr/src/cmd/bhyve/fwctl.c @@ -0,0 +1,552 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Peter Grehan <grehan@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
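dbg_handler() above spins the vCPU in a polling accept loop until a client connects, then tunes the socket for the one-byte-per-guest-I/O traffic the bvmdbg protocol generates. The accept/tune step as a standalone sketch (the helper name is hypothetical; SO_NOSIGPIPE is the FreeBSD spelling used above):

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

static int
accept_debug_client(int lfd)
{
	int fd, on = 1;

	fd = accept4(lfd, NULL, NULL, SOCK_NONBLOCK);
	if (fd < 0)
		return (-1);
	/* Avoid being killed by SIGPIPE if the client drops off. */
	(void) setsockopt(fd, SOL_SOCKET, SO_NOSIGPIPE, &on, sizeof(on));
	/* Disable Nagle: the port moves one byte per guest access. */
	(void) setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, &on, sizeof(on));
	return (fd);
}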
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Guest firmware interface. Uses i/o ports x510/x511 as Qemu does, + * but with a request/response messaging protocol. + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/uio.h> + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "bhyverun.h" +#include "inout.h" +#include "fwctl.h" + +/* + * Messaging protocol base operations + */ +#define OP_NULL 1 +#define OP_ECHO 2 +#define OP_GET 3 +#define OP_GET_LEN 4 +#define OP_SET 5 +#define OP_MAX OP_SET + +/* I/O ports */ +#define FWCTL_OUT 0x510 +#define FWCTL_IN 0x511 + +/* + * Back-end state-machine + */ +enum state { + DORMANT, + IDENT_WAIT, + IDENT_SEND, + REQ, + RESP +} be_state = DORMANT; + +static uint8_t sig[] = { 'B', 'H', 'Y', 'V' }; +static u_int ident_idx; + +struct op_info { + int op; + int (*op_start)(uint32_t len); + void (*op_data)(uint32_t data, uint32_t len); + int (*op_result)(struct iovec **data); + void (*op_done)(struct iovec *data); +}; +static struct op_info *ops[OP_MAX+1]; + +/* Return 0-padded uint32_t */ +static uint32_t +fwctl_send_rest(uint32_t *data, size_t len) +{ + union { + uint8_t c[4]; + uint32_t w; + } u; + uint8_t *cdata; + int i; + + cdata = (uint8_t *) data; + u.w = 0; + + for (i = 0, u.w = 0; i < len; i++) + u.c[i] = *cdata++; + + return (u.w); +} + +/* + * error op dummy proto - drop all data sent and return an error +*/ +static int errop_code; + +static void +errop_set(int err) +{ + + errop_code = err; +} + +static int +errop_start(uint32_t len) +{ + errop_code = ENOENT; + + /* accept any length */ + return (errop_code); +} + +static void +errop_data(uint32_t data, uint32_t len) +{ + + /* ignore */ +} + +static int +errop_result(struct iovec **data) +{ + + /* no data to send back; always successful */ + *data = NULL; + return (errop_code); +} + +static void +errop_done(struct iovec *data) +{ + + /* assert data is NULL */ +} + +static struct op_info errop_info = { + .op_start = errop_start, + .op_data = errop_data, + .op_result = errop_result, + .op_done = errop_done +}; + +/* OID search */ +SET_DECLARE(ctl_set, struct ctl); + +CTL_NODE("hw.ncpu", &guest_ncpus, sizeof(guest_ncpus)); + +static struct ctl * +ctl_locate(const char *str, int maxlen) +{ + struct ctl *cp, **cpp; + + SET_FOREACH(cpp, ctl_set) { + cp = *cpp; + if (!strncmp(str, cp->c_oid, maxlen)) + return (cp); + } + + return (NULL); +} + +/* uefi-sysctl get-len */ +#define FGET_STRSZ 80 +static struct iovec fget_biov[2]; +static char fget_str[FGET_STRSZ]; +static struct { + size_t f_sz; + uint32_t f_data[1024]; +} fget_buf; +static int fget_cnt; +static size_t fget_size; + +static int +fget_start(uint32_t len) +{ + + if (len > FGET_STRSZ) + return(E2BIG); + + fget_cnt = 0; + + return (0); +} + +static void +fget_data(uint32_t data, uint32_t len) +{ + + *((uint32_t *) &fget_str[fget_cnt]) = data; + fget_cnt += sizeof(uint32_t); +} + +static int 
+fget_result(struct iovec **data, int val) +{ + struct ctl *cp; + int err; + + err = 0; + + /* Locate the OID */ + cp = ctl_locate(fget_str, fget_cnt); + if (cp == NULL) { + *data = NULL; + err = ENOENT; + } else { + if (val) { + /* For now, copy the len/data into a buffer */ + memset(&fget_buf, 0, sizeof(fget_buf)); + fget_buf.f_sz = cp->c_len; + memcpy(fget_buf.f_data, cp->c_data, cp->c_len); + fget_biov[0].iov_base = (char *)&fget_buf; + fget_biov[0].iov_len = sizeof(fget_buf.f_sz) + + cp->c_len; + } else { + fget_size = cp->c_len; + fget_biov[0].iov_base = (char *)&fget_size; + fget_biov[0].iov_len = sizeof(fget_size); + } + + fget_biov[1].iov_base = NULL; + fget_biov[1].iov_len = 0; + *data = fget_biov; + } + + return (err); +} + +static void +fget_done(struct iovec *data) +{ + + /* nothing needs to be freed */ +} + +static int +fget_len_result(struct iovec **data) +{ + return (fget_result(data, 0)); +} + +static int +fget_val_result(struct iovec **data) +{ + return (fget_result(data, 1)); +} + +static struct op_info fgetlen_info = { + .op_start = fget_start, + .op_data = fget_data, + .op_result = fget_len_result, + .op_done = fget_done +}; + +static struct op_info fgetval_info = { + .op_start = fget_start, + .op_data = fget_data, + .op_result = fget_val_result, + .op_done = fget_done +}; + +static struct req_info { + int req_error; + u_int req_count; + uint32_t req_size; + uint32_t req_type; + uint32_t req_txid; + struct op_info *req_op; + int resp_error; + int resp_count; + size_t resp_size; + size_t resp_off; + struct iovec *resp_biov; +} rinfo; + +static void +fwctl_response_done(void) +{ + + (*rinfo.req_op->op_done)(rinfo.resp_biov); + + /* reinit the req data struct */ + memset(&rinfo, 0, sizeof(rinfo)); +} + +static void +fwctl_request_done(void) +{ + + rinfo.resp_error = (*rinfo.req_op->op_result)(&rinfo.resp_biov); + + /* XXX only a single vector supported at the moment */ + rinfo.resp_off = 0; + if (rinfo.resp_biov == NULL) { + rinfo.resp_size = 0; + } else { + rinfo.resp_size = rinfo.resp_biov[0].iov_len; + } +} + +static int +fwctl_request_start(void) +{ + int err; + + /* Data size doesn't include header */ + rinfo.req_size -= 12; + + rinfo.req_op = &errop_info; + if (rinfo.req_type <= OP_MAX && ops[rinfo.req_type] != NULL) + rinfo.req_op = ops[rinfo.req_type]; + + err = (*rinfo.req_op->op_start)(rinfo.req_size); + + if (err) { + errop_set(err); + rinfo.req_op = &errop_info; + } + + /* Catch case of zero-length message here */ + if (rinfo.req_size == 0) { + fwctl_request_done(); + return (1); + } + + return (0); +} + +static int +fwctl_request_data(uint32_t value) +{ + + /* Make sure remaining size is >= 0 */ + if (rinfo.req_size <= sizeof(uint32_t)) + rinfo.req_size = 0; + else + rinfo.req_size -= sizeof(uint32_t); + + (*rinfo.req_op->op_data)(value, rinfo.req_size); + + if (rinfo.req_size < sizeof(uint32_t)) { + fwctl_request_done(); + return (1); + } + + return (0); +} + +static int +fwctl_request(uint32_t value) +{ + + int ret; + + ret = 0; + + switch (rinfo.req_count) { + case 0: + /* Verify size */ + if (value < 12) { + printf("msg size error"); + exit(4); + } + rinfo.req_size = value; + rinfo.req_count = 1; + break; + case 1: + rinfo.req_type = value; + rinfo.req_count++; + break; + case 2: + rinfo.req_txid = value; + rinfo.req_count++; + ret = fwctl_request_start(); + break; + default: + ret = fwctl_request_data(value); + break; + } + + return (ret); +} + +static int +fwctl_response(uint32_t *retval) +{ + uint32_t *dp; + ssize_t remlen; + + switch(rinfo.resp_count) 
{ + case 0: + /* 4 x u32 header len + data */ + *retval = 4*sizeof(uint32_t) + + roundup(rinfo.resp_size, sizeof(uint32_t)); + rinfo.resp_count++; + break; + case 1: + *retval = rinfo.req_type; + rinfo.resp_count++; + break; + case 2: + *retval = rinfo.req_txid; + rinfo.resp_count++; + break; + case 3: + *retval = rinfo.resp_error; + rinfo.resp_count++; + break; + default: + remlen = rinfo.resp_size - rinfo.resp_off; + dp = (uint32_t *) + ((uint8_t *)rinfo.resp_biov->iov_base + rinfo.resp_off); + if (remlen >= sizeof(uint32_t)) { + *retval = *dp; + } else if (remlen > 0) { + *retval = fwctl_send_rest(dp, remlen); + } + rinfo.resp_off += sizeof(uint32_t); + break; + } + + if (rinfo.resp_count > 3 && + rinfo.resp_off >= rinfo.resp_size) { + fwctl_response_done(); + return (1); + } + + return (0); +} + + +/* + * i/o port handling. + */ +static uint8_t +fwctl_inb(void) +{ + uint8_t retval; + + retval = 0xff; + + switch (be_state) { + case IDENT_SEND: + retval = sig[ident_idx++]; + if (ident_idx >= sizeof(sig)) + be_state = REQ; + break; + default: + break; + } + + return (retval); +} + +static void +fwctl_outw(uint16_t val) +{ + switch (be_state) { + case IDENT_WAIT: + if (val == 0) { + be_state = IDENT_SEND; + ident_idx = 0; + } + break; + default: + /* ignore */ + break; + } +} + +static uint32_t +fwctl_inl(void) +{ + uint32_t retval; + + switch (be_state) { + case RESP: + if (fwctl_response(&retval)) + be_state = REQ; + break; + default: + retval = 0xffffffff; + break; + } + + return (retval); +} + +static void +fwctl_outl(uint32_t val) +{ + + switch (be_state) { + case REQ: + if (fwctl_request(val)) + be_state = RESP; + default: + break; + } + +} + +static int +fwctl_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, + uint32_t *eax, void *arg) +{ + + if (in) { + if (bytes == 1) + *eax = fwctl_inb(); + else if (bytes == 4) + *eax = fwctl_inl(); + else + *eax = 0xffff; + } else { + if (bytes == 2) + fwctl_outw(*eax); + else if (bytes == 4) + fwctl_outl(*eax); + } + + return (0); +} +INOUT_PORT(fwctl_wreg, FWCTL_OUT, IOPORT_F_INOUT, fwctl_handler); +INOUT_PORT(fwctl_rreg, FWCTL_IN, IOPORT_F_IN, fwctl_handler); + +void +fwctl_init(void) +{ + + ops[OP_GET_LEN] = &fgetlen_info; + ops[OP_GET] = &fgetval_info; + + be_state = IDENT_WAIT; +} diff --git a/usr/src/cmd/bhyve/fwctl.h b/usr/src/cmd/bhyve/fwctl.h new file mode 100644 index 0000000000..6dad244811 --- /dev/null +++ b/usr/src/cmd/bhyve/fwctl.h @@ -0,0 +1,56 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Peter Grehan <grehan@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
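Read together, fwctl_request() and fwctl_response() above define a simple word-oriented wire format over ports 0x510/0x511. The headers, restated as illustrative structs (the real code streams raw 32-bit words through the port pair and never defines these types):

#include <stdint.h>

struct fwctl_req_hdr {		/* guest -> host request header */
	uint32_t size;		/* total message size in bytes, >= 12 */
	uint32_t type;		/* OP_NULL .. OP_SET */
	uint32_t txid;		/* echoed back in the response */
	/* followed by (size - 12) bytes of payload, as 32-bit words */
};

struct fwctl_resp_hdr {		/* host -> guest response header */
	uint32_t size;		/* 16 + roundup(payload_len, 4) */
	uint32_t type;		/* request type, echoed */
	uint32_t txid;		/* transaction id, echoed */
	uint32_t error;		/* 0 on success, else errno-style (ENOENT) */
	/* followed by the payload, zero-padded to a word boundary */
};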
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _FWCTL_H_ +#define _FWCTL_H_ + +#include <sys/linker_set.h> + +/* + * Linker set api for export of information to guest firmware via + * a sysctl-like OID interface + */ +struct ctl { + const char *c_oid; + const void *c_data; + const int c_len; +}; + +#define CTL_NODE(oid, data, len) \ + static struct ctl __CONCAT(__ctl, __LINE__) = { \ + oid, \ + (data), \ + (len), \ + }; \ + DATA_SET(ctl_set, __CONCAT(__ctl, __LINE__)) + +void fwctl_init(void); + +#endif /* _FWCTL_H_ */ diff --git a/usr/src/cmd/bhyve/gdb.c b/usr/src/cmd/bhyve/gdb.c new file mode 100644 index 0000000000..20c2de1dec --- /dev/null +++ b/usr/src/cmd/bhyve/gdb.c @@ -0,0 +1,1523 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017-2018 John H. Baldwin <jhb@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/ioctl.h> +#include <sys/mman.h> +#include <sys/socket.h> +#include <machine/atomic.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> +#include <netinet/in.h> +#include <assert.h> +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <pthread_np.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> +#include <vmmapi.h> + +#include "bhyverun.h" +#include "mem.h" +#include "mevent.h" + +/* + * GDB_SIGNAL_* numbers are part of the GDB remote protocol. Most stops + * use SIGTRAP. 
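The CTL_NODE() linker set declared in fwctl.h above is the export mechanism behind this interface; the hw.ncpu node registered in fwctl.c earlier in this diff is the in-tree user. Adding another OID would look like this (hypothetical name and variable, shown only to illustrate the macro):

#include "fwctl.h"

/* Hypothetical export: guest firmware could fetch this with an
 * OP_GET_LEN / OP_GET pair on the "hw.boot_verbose" OID. */
static int boot_verbose = 1;
CTL_NODE("hw.boot_verbose", &boot_verbose, sizeof(boot_verbose));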
+ */ +#define GDB_SIGNAL_TRAP 5 + +static void gdb_resume_vcpus(void); +static void check_command(int fd); + +static struct mevent *read_event, *write_event; + +static cpuset_t vcpus_active, vcpus_suspended, vcpus_waiting; +static pthread_mutex_t gdb_lock; +static pthread_cond_t idle_vcpus; +static bool stop_pending, first_stop; +static int stepping_vcpu, stopped_vcpu; + +/* + * An I/O buffer contains 'capacity' bytes of room at 'data'. For a + * read buffer, 'start' is unused and 'len' contains the number of + * valid bytes in the buffer. For a write buffer, 'start' is set to + * the index of the next byte in 'data' to send, and 'len' contains + * the remaining number of valid bytes to send. + */ +struct io_buffer { + uint8_t *data; + size_t capacity; + size_t start; + size_t len; +}; + +static struct io_buffer cur_comm, cur_resp; +static uint8_t cur_csum; +static int cur_vcpu; +static struct vmctx *ctx; +static int cur_fd = -1; + +const int gdb_regset[] = { + VM_REG_GUEST_RAX, + VM_REG_GUEST_RBX, + VM_REG_GUEST_RCX, + VM_REG_GUEST_RDX, + VM_REG_GUEST_RSI, + VM_REG_GUEST_RDI, + VM_REG_GUEST_RBP, + VM_REG_GUEST_RSP, + VM_REG_GUEST_R8, + VM_REG_GUEST_R9, + VM_REG_GUEST_R10, + VM_REG_GUEST_R11, + VM_REG_GUEST_R12, + VM_REG_GUEST_R13, + VM_REG_GUEST_R14, + VM_REG_GUEST_R15, + VM_REG_GUEST_RIP, + VM_REG_GUEST_RFLAGS, + VM_REG_GUEST_CS, + VM_REG_GUEST_SS, + VM_REG_GUEST_DS, + VM_REG_GUEST_ES, + VM_REG_GUEST_FS, + VM_REG_GUEST_GS +}; + +const int gdb_regsize[] = { + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 8, + 4, + 4, + 4, + 4, + 4, + 4, + 4 +}; + +#ifdef GDB_LOG +#include <stdarg.h> +#include <stdio.h> + +static void __printflike(1, 2) +debug(const char *fmt, ...) +{ + static FILE *logfile; + va_list ap; + + if (logfile == NULL) { + logfile = fopen("/tmp/bhyve_gdb.log", "w"); + if (logfile == NULL) + return; +#ifndef WITHOUT_CAPSICUM + if (caph_limit_stream(fileno(logfile), CAPH_WRITE) == -1) { + fclose(logfile); + logfile = NULL; + return; + } +#endif + setlinebuf(logfile); + } + va_start(ap, fmt); + vfprintf(logfile, fmt, ap); + va_end(ap); +} +#else +#define debug(...) +#endif + +static int +guest_paging_info(int vcpu, struct vm_guest_paging *paging) +{ + uint64_t regs[4]; + const int regset[4] = { + VM_REG_GUEST_CR0, + VM_REG_GUEST_CR3, + VM_REG_GUEST_CR4, + VM_REG_GUEST_EFER + }; + + if (vm_get_register_set(ctx, vcpu, nitems(regset), regset, regs) == -1) + return (-1); + + /* + * For the debugger, always pretend to be the kernel (CPL 0), + * and if long-mode is enabled, always parse addresses as if + * in 64-bit mode. + */ + paging->cr3 = regs[1]; + paging->cpl = 0; + if (regs[3] & EFER_LMA) + paging->cpu_mode = CPU_MODE_64BIT; + else if (regs[0] & CR0_PE) + paging->cpu_mode = CPU_MODE_PROTECTED; + else + paging->cpu_mode = CPU_MODE_REAL; + if (!(regs[0] & CR0_PG)) + paging->paging_mode = PAGING_MODE_FLAT; + else if (!(regs[2] & CR4_PAE)) + paging->paging_mode = PAGING_MODE_32; + else if (regs[3] & EFER_LME) + paging->paging_mode = PAGING_MODE_64; + else + paging->paging_mode = PAGING_MODE_PAE; + return (0); +} + +/* + * Map a guest virtual address to a physical address (for a given vcpu). + * If a guest virtual address is valid, return 1. If the address is + * not valid, return 0. If an error occurs obtaining the mapping, + * return -1. 
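guest_paging_info() above collapses the architectural state into one of four paging modes, and the order of the checks matters (CR0.PG first, then CR4.PAE, then EFER.LME). As a truth table:

    CR0.PG   CR4.PAE   EFER.LME   paging_mode
    0        -         -          PAGING_MODE_FLAT
    1        0         -          PAGING_MODE_32
    1        1         0          PAGING_MODE_PAE
    1        1         1          PAGING_MODE_64

    cpu_mode: EFER.LMA ? CPU_MODE_64BIT : (CR0.PE ? CPU_MODE_PROTECTED : CPU_MODE_REAL)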
+ */ +static int +guest_vaddr2paddr(int vcpu, uint64_t vaddr, uint64_t *paddr) +{ + struct vm_guest_paging paging; + int fault; + + if (guest_paging_info(vcpu, &paging) == -1) + return (-1); + + /* + * Always use PROT_READ. We really care if the VA is + * accessible, not if the current vCPU can write. + */ + if (vm_gla2gpa_nofault(ctx, vcpu, &paging, vaddr, PROT_READ, paddr, + &fault) == -1) + return (-1); + if (fault) + return (0); + return (1); +} + +static void +io_buffer_reset(struct io_buffer *io) +{ + + io->start = 0; + io->len = 0; +} + +/* Available room for adding data. */ +static size_t +io_buffer_avail(struct io_buffer *io) +{ + + return (io->capacity - (io->start + io->len)); +} + +static uint8_t * +io_buffer_head(struct io_buffer *io) +{ + + return (io->data + io->start); +} + +static uint8_t * +io_buffer_tail(struct io_buffer *io) +{ + + return (io->data + io->start + io->len); +} + +static void +io_buffer_advance(struct io_buffer *io, size_t amount) +{ + + assert(amount <= io->len); + io->start += amount; + io->len -= amount; +} + +static void +io_buffer_consume(struct io_buffer *io, size_t amount) +{ + + io_buffer_advance(io, amount); + if (io->len == 0) { + io->start = 0; + return; + } + + /* + * XXX: Consider making this move optional and compacting on a + * future read() before realloc(). + */ + memmove(io->data, io_buffer_head(io), io->len); + io->start = 0; +} + +static void +io_buffer_grow(struct io_buffer *io, size_t newsize) +{ + uint8_t *new_data; + size_t avail, new_cap; + + avail = io_buffer_avail(io); + if (newsize <= avail) + return; + + new_cap = io->capacity + (newsize - avail); + new_data = realloc(io->data, new_cap); + if (new_data == NULL) + err(1, "Failed to grow GDB I/O buffer"); + io->data = new_data; + io->capacity = new_cap; +} + +static bool +response_pending(void) +{ + + if (cur_resp.start == 0 && cur_resp.len == 0) + return (false); + if (cur_resp.start + cur_resp.len == 1 && cur_resp.data[0] == '+') + return (false); + return (true); +} + +static void +close_connection(void) +{ + + /* + * XXX: This triggers a warning because mevent does the close + * before the EV_DELETE. + */ + pthread_mutex_lock(&gdb_lock); + mevent_delete(write_event); + mevent_delete_close(read_event); + write_event = NULL; + read_event = NULL; + io_buffer_reset(&cur_comm); + io_buffer_reset(&cur_resp); + cur_fd = -1; + + /* Resume any stopped vCPUs. */ + gdb_resume_vcpus(); + pthread_mutex_unlock(&gdb_lock); +} + +static uint8_t +hex_digit(uint8_t nibble) +{ + + if (nibble <= 9) + return (nibble + '0'); + else + return (nibble + 'a' - 10); +} + +static uint8_t +parse_digit(uint8_t v) +{ + + if (v >= '0' && v <= '9') + return (v - '0'); + if (v >= 'a' && v <= 'f') + return (v - 'a' + 10); + if (v >= 'A' && v <= 'F') + return (v - 'A' + 10); + return (0xF); +} + +/* Parses big-endian hexadecimal. 
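+ * For instance, parse_integer((const uint8_t *)"1f40", 4) yields
+ * 0x1f40 and parse_byte((const uint8_t *)"4f") yields 0x4f.  Note
+ * that parse_digit() maps any non-hex-digit byte to 0xF, so malformed
+ * input degrades quietly instead of faulting.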
*/ +static uintmax_t +parse_integer(const uint8_t *p, size_t len) +{ + uintmax_t v; + + v = 0; + while (len > 0) { + v <<= 4; + v |= parse_digit(*p); + p++; + len--; + } + return (v); +} + +static uint8_t +parse_byte(const uint8_t *p) +{ + + return (parse_digit(p[0]) << 4 | parse_digit(p[1])); +} + +static void +send_pending_data(int fd) +{ + ssize_t nwritten; + + if (cur_resp.len == 0) { + mevent_disable(write_event); + return; + } + nwritten = write(fd, io_buffer_head(&cur_resp), cur_resp.len); + if (nwritten == -1) { + warn("Write to GDB socket failed"); + close_connection(); + } else { + io_buffer_advance(&cur_resp, nwritten); + if (cur_resp.len == 0) + mevent_disable(write_event); + else + mevent_enable(write_event); + } +} + +/* Append a single character to the output buffer. */ +static void +send_char(uint8_t data) +{ + io_buffer_grow(&cur_resp, 1); + *io_buffer_tail(&cur_resp) = data; + cur_resp.len++; +} + +/* Append an array of bytes to the output buffer. */ +static void +send_data(const uint8_t *data, size_t len) +{ + + io_buffer_grow(&cur_resp, len); + memcpy(io_buffer_tail(&cur_resp), data, len); + cur_resp.len += len; +} + +static void +format_byte(uint8_t v, uint8_t *buf) +{ + + buf[0] = hex_digit(v >> 4); + buf[1] = hex_digit(v & 0xf); +} + +/* + * Append a single byte (formatted as two hex characters) to the + * output buffer. + */ +static void +send_byte(uint8_t v) +{ + uint8_t buf[2]; + + format_byte(v, buf); + send_data(buf, sizeof(buf)); +} + +static void +start_packet(void) +{ + + send_char('$'); + cur_csum = 0; +} + +static void +finish_packet(void) +{ + + send_char('#'); + send_byte(cur_csum); + debug("-> %.*s\n", (int)cur_resp.len, io_buffer_head(&cur_resp)); +} + +/* + * Append a single character (for the packet payload) and update the + * checksum. + */ +static void +append_char(uint8_t v) +{ + + send_char(v); + cur_csum += v; +} + +/* + * Append an array of bytes (for the packet payload) and update the + * checksum. 
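+ * Taken together, these helpers make a response such as send_ok()
+ * emit the literal frame "$OK#9a" ('O' + 'K' = 0x4f + 0x4b = 0x9a).
+ * One caveat in this revision: append_integer() below sizes its
+ * output as "fls(value) + 7 / 8", which C operator precedence reduces
+ * to plain fls(value); the conventional bits-to-bytes rounding would
+ * be "(fls(value) + 7) / 8".  The practical effect is only extra
+ * leading zero bytes in the hex encoding, which GDB accepts.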
+ */ +static void +append_packet_data(const uint8_t *data, size_t len) +{ + + send_data(data, len); + while (len > 0) { + cur_csum += *data; + data++; + len--; + } +} + +static void +append_string(const char *str) +{ + +#ifdef __FreeBSD__ + append_packet_data(str, strlen(str)); +#else + append_packet_data((const uint8_t *)str, strlen(str)); +#endif +} + +static void +append_byte(uint8_t v) +{ + uint8_t buf[2]; + + format_byte(v, buf); + append_packet_data(buf, sizeof(buf)); +} + +static void +append_unsigned_native(uintmax_t value, size_t len) +{ + size_t i; + + for (i = 0; i < len; i++) { + append_byte(value); + value >>= 8; + } +} + +static void +append_unsigned_be(uintmax_t value, size_t len) +{ + char buf[len * 2]; + size_t i; + + for (i = 0; i < len; i++) { +#ifdef __FreeBSD__ + format_byte(value, buf + (len - i - 1) * 2); +#else + format_byte(value, (uint8_t *)(buf + (len - i - 1) * 2)); +#endif + value >>= 8; + } +#ifdef __FreeBSD__ + append_packet_data(buf, sizeof(buf)); +#else + append_packet_data((const uint8_t *)buf, sizeof(buf)); +#endif +} + +static void +append_integer(unsigned int value) +{ + + if (value == 0) + append_char('0'); + else + append_unsigned_be(value, fls(value) + 7 / 8); +} + +static void +append_asciihex(const char *str) +{ + + while (*str != '\0') { + append_byte(*str); + str++; + } +} + +static void +send_empty_response(void) +{ + + start_packet(); + finish_packet(); +} + +static void +send_error(int error) +{ + + start_packet(); + append_char('E'); + append_byte(error); + finish_packet(); +} + +static void +send_ok(void) +{ + + start_packet(); + append_string("OK"); + finish_packet(); +} + +static int +parse_threadid(const uint8_t *data, size_t len) +{ + + if (len == 1 && *data == '0') + return (0); + if (len == 2 && memcmp(data, "-1", 2) == 0) + return (-1); + if (len == 0) + return (-2); + return (parse_integer(data, len)); +} + +static void +report_stop(void) +{ + + start_packet(); + if (stopped_vcpu == -1) + append_char('S'); + else + append_char('T'); + append_byte(GDB_SIGNAL_TRAP); + if (stopped_vcpu != -1) { + append_string("thread:"); + append_integer(stopped_vcpu + 1); + append_char(';'); + } + stopped_vcpu = -1; + finish_packet(); +} + +static void +gdb_finish_suspend_vcpus(void) +{ + + if (first_stop) { + first_stop = false; + stopped_vcpu = -1; + } else if (response_pending()) + stop_pending = true; + else { + report_stop(); + send_pending_data(cur_fd); + } +} + +static void +_gdb_cpu_suspend(int vcpu, bool report_stop) +{ + + debug("$vCPU %d suspending\n", vcpu); + CPU_SET(vcpu, &vcpus_waiting); + if (report_stop && CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) + gdb_finish_suspend_vcpus(); + while (CPU_ISSET(vcpu, &vcpus_suspended) && vcpu != stepping_vcpu) + pthread_cond_wait(&idle_vcpus, &gdb_lock); + CPU_CLR(vcpu, &vcpus_waiting); + debug("$vCPU %d resuming\n", vcpu); +} + +void +gdb_cpu_add(int vcpu) +{ + + debug("$vCPU %d starting\n", vcpu); + pthread_mutex_lock(&gdb_lock); + CPU_SET(vcpu, &vcpus_active); + + /* + * If a vcpu is added while vcpus are stopped, suspend the new + * vcpu so that it will pop back out with a debug exit before + * executing the first instruction. 
+ */ + if (!CPU_EMPTY(&vcpus_suspended)) { + CPU_SET(vcpu, &vcpus_suspended); + _gdb_cpu_suspend(vcpu, false); + } + pthread_mutex_unlock(&gdb_lock); +} + +void +gdb_cpu_suspend(int vcpu) +{ + + pthread_mutex_lock(&gdb_lock); + _gdb_cpu_suspend(vcpu, true); + pthread_mutex_unlock(&gdb_lock); +} + +void +gdb_cpu_mtrap(int vcpu) +{ + + debug("$vCPU %d MTRAP\n", vcpu); + pthread_mutex_lock(&gdb_lock); + if (vcpu == stepping_vcpu) { + stepping_vcpu = -1; + vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 0); + vm_suspend_cpu(ctx, vcpu); + assert(stopped_vcpu == -1); + stopped_vcpu = vcpu; + _gdb_cpu_suspend(vcpu, true); + } + pthread_mutex_unlock(&gdb_lock); +} + +static void +gdb_suspend_vcpus(void) +{ + + assert(pthread_mutex_isowned_np(&gdb_lock)); + debug("suspending all CPUs\n"); + vcpus_suspended = vcpus_active; + vm_suspend_cpu(ctx, -1); + if (CPU_CMP(&vcpus_waiting, &vcpus_suspended) == 0) + gdb_finish_suspend_vcpus(); +} + +static bool +gdb_step_vcpu(int vcpu) +{ + int error, val; + + debug("$vCPU %d step\n", vcpu); + error = vm_get_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, &val); + if (error < 0) + return (false); + error = vm_set_capability(ctx, vcpu, VM_CAP_MTRAP_EXIT, 1); + vm_resume_cpu(ctx, vcpu); + stepping_vcpu = vcpu; + pthread_cond_broadcast(&idle_vcpus); + return (true); +} + +static void +gdb_resume_vcpus(void) +{ + + assert(pthread_mutex_isowned_np(&gdb_lock)); + vm_resume_cpu(ctx, -1); + debug("resuming all CPUs\n"); + CPU_ZERO(&vcpus_suspended); + pthread_cond_broadcast(&idle_vcpus); +} + +static void +gdb_read_regs(void) +{ + uint64_t regvals[nitems(gdb_regset)]; + int i; + + if (vm_get_register_set(ctx, cur_vcpu, nitems(gdb_regset), + gdb_regset, regvals) == -1) { + send_error(errno); + return; + } + start_packet(); + for (i = 0; i < nitems(regvals); i++) + append_unsigned_native(regvals[i], gdb_regsize[i]); + finish_packet(); +} + +static void +gdb_read_mem(const uint8_t *data, size_t len) +{ + uint64_t gpa, gva, val; + uint8_t *cp; + size_t resid, todo, bytes; + bool started; + int error; + + /* Skip 'm' */ + data += 1; + len -= 1; + + /* Parse and consume address. */ + cp = memchr(data, ',', len); + if (cp == NULL || cp == data) { + send_error(EINVAL); + return; + } + gva = parse_integer(data, cp - data); + len -= (cp - data) + 1; + data += (cp - data) + 1; + + /* Parse length. */ + resid = parse_integer(data, len); + + started = false; + while (resid > 0) { + error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); + if (error == -1) { + if (started) + finish_packet(); + else + send_error(errno); + return; + } + if (error == 0) { + if (started) + finish_packet(); + else + send_error(EFAULT); + return; + } + + /* Read bytes from current page. */ + todo = getpagesize() - gpa % getpagesize(); + if (todo > resid) + todo = resid; + + cp = paddr_guest2host(ctx, gpa, todo); + if (cp != NULL) { + /* + * If this page is guest RAM, read it a byte + * at a time. + */ + if (!started) { + start_packet(); + started = true; + } + while (todo > 0) { + append_byte(*cp); + cp++; + gpa++; + gva++; + resid--; + todo--; + } + } else { + /* + * If this page isn't guest RAM, try to handle + * it via MMIO. For MMIO requests, use + * aligned reads of words when possible. 
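+ * The chunking below is easiest seen by example: starting at
+ * gpa 0x1003 with 7 bytes remaining, the loop issues a 1-byte read at
+ * 0x1003, a 4-byte read at 0x1004, then a 2-byte read at 0x1008:
+ * first restoring alignment, then using word-sized accesses.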
+ */ + while (todo > 0) { + if (gpa & 1 || todo == 1) + bytes = 1; + else if (gpa & 2 || todo == 2) + bytes = 2; + else + bytes = 4; + error = read_mem(ctx, cur_vcpu, gpa, &val, + bytes); + if (error == 0) { + if (!started) { + start_packet(); + started = true; + } + gpa += bytes; + gva += bytes; + resid -= bytes; + todo -= bytes; + while (bytes > 0) { + append_byte(val); + val >>= 8; + bytes--; + } + } else { + if (started) + finish_packet(); + else + send_error(EFAULT); + return; + } + } + } + assert(resid == 0 || gpa % getpagesize() == 0); + } + if (!started) + start_packet(); + finish_packet(); +} + +static void +gdb_write_mem(const uint8_t *data, size_t len) +{ + uint64_t gpa, gva, val; + uint8_t *cp; + size_t resid, todo, bytes; + int error; + + /* Skip 'M' */ + data += 1; + len -= 1; + + /* Parse and consume address. */ + cp = memchr(data, ',', len); + if (cp == NULL || cp == data) { + send_error(EINVAL); + return; + } + gva = parse_integer(data, cp - data); + len -= (cp - data) + 1; + data += (cp - data) + 1; + + /* Parse and consume length. */ + cp = memchr(data, ':', len); + if (cp == NULL || cp == data) { + send_error(EINVAL); + return; + } + resid = parse_integer(data, cp - data); + len -= (cp - data) + 1; + data += (cp - data) + 1; + + /* Verify the available bytes match the length. */ + if (len != resid * 2) { + send_error(EINVAL); + return; + } + + while (resid > 0) { + error = guest_vaddr2paddr(cur_vcpu, gva, &gpa); + if (error == -1) { + send_error(errno); + return; + } + if (error == 0) { + send_error(EFAULT); + return; + } + + /* Write bytes to current page. */ + todo = getpagesize() - gpa % getpagesize(); + if (todo > resid) + todo = resid; + + cp = paddr_guest2host(ctx, gpa, todo); + if (cp != NULL) { + /* + * If this page is guest RAM, write it a byte + * at a time. + */ + while (todo > 0) { + assert(len >= 2); + *cp = parse_byte(data); + data += 2; + len -= 2; + cp++; + gpa++; + gva++; + resid--; + todo--; + } + } else { + /* + * If this page isn't guest RAM, try to handle + * it via MMIO. For MMIO requests, use + * aligned writes of words when possible. + */ + while (todo > 0) { + if (gpa & 1 || todo == 1) { + bytes = 1; + val = parse_byte(data); + } else if (gpa & 2 || todo == 2) { + bytes = 2; + val = parse_byte(data) | + (parse_byte(data + 2) << 8); + } else { + bytes = 4; + val = parse_byte(data) | + (parse_byte(data + 2) << 8) | + (parse_byte(data + 4) << 16) | + (parse_byte(data + 6) << 24); + } + error = write_mem(ctx, cur_vcpu, gpa, val, + bytes); + if (error == 0) { + gpa += bytes; + gva += bytes; + resid -= bytes; + todo -= bytes; + data += 2 * bytes; + len -= 2 * bytes; + } else { + send_error(EFAULT); + return; + } + } + } + assert(resid == 0 || gpa % getpagesize() == 0); + } + assert(len == 0); + send_ok(); +} + +static bool +command_equals(const uint8_t *data, size_t len, const char *cmd) +{ + + if (strlen(cmd) > len) + return (false); + return (memcmp(data, cmd, strlen(cmd)) == 0); +} + +static void +check_features(const uint8_t *data, size_t len) +{ + char *feature, *next_feature, *str, *value; + bool supported; + + str = malloc(len + 1); + memcpy(str, data, len); + str[len] = '\0'; + next_feature = str; + + while ((feature = strsep(&next_feature, ";")) != NULL) { + /* + * Null features shouldn't exist, but skip if they + * do. + */ + if (strcmp(feature, "") == 0) + continue; + + /* + * Look for the value or supported / not supported + * flag. 
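+ * For example, a client offer of "swbreak+;hwbreak-;xmlRegisters=i386"
+ * splits into "swbreak" (supported), "hwbreak" (not supported), and
+ * "xmlRegisters" carrying the value "i386"; only the trailing '+'/'-'
+ * or the '=' separator distinguishes the three shapes.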
+ */ + value = strchr(feature, '='); + if (value != NULL) { + *value = '\0'; + value++; + supported = true; + } else { + value = feature + strlen(feature) - 1; + switch (*value) { + case '+': + supported = true; + break; + case '-': + supported = false; + break; + default: + /* + * This is really a protocol error, + * but we just ignore malformed + * features for ease of + * implementation. + */ + continue; + } + value = NULL; + } + + /* No currently supported features. */ +#ifndef __FreeBSD__ + /* + * The compiler dislikes 'supported' being set but never used. + * Make it happy here. + */ + if (supported) { + debug("feature '%s' supported\n", feature); + } +#endif /* __FreeBSD__ */ + } + free(str); + + start_packet(); + + /* This is an arbitrary limit. */ + append_string("PacketSize=4096"); + finish_packet(); +} + +static void +gdb_query(const uint8_t *data, size_t len) +{ + + /* + * TODO: + * - qSearch + */ + if (command_equals(data, len, "qAttached")) { + start_packet(); + append_char('1'); + finish_packet(); + } else if (command_equals(data, len, "qC")) { + start_packet(); + append_string("QC"); + append_integer(cur_vcpu + 1); + finish_packet(); + } else if (command_equals(data, len, "qfThreadInfo")) { + cpuset_t mask; + bool first; + int vcpu; + + if (CPU_EMPTY(&vcpus_active)) { + send_error(EINVAL); + return; + } + mask = vcpus_active; + start_packet(); + append_char('m'); + first = true; + while (!CPU_EMPTY(&mask)) { + vcpu = CPU_FFS(&mask) - 1; + CPU_CLR(vcpu, &mask); + if (first) + first = false; + else + append_char(','); + append_integer(vcpu + 1); + } + finish_packet(); + } else if (command_equals(data, len, "qsThreadInfo")) { + start_packet(); + append_char('l'); + finish_packet(); + } else if (command_equals(data, len, "qSupported")) { + data += strlen("qSupported"); + len -= strlen("qSupported"); + check_features(data, len); + } else if (command_equals(data, len, "qThreadExtraInfo")) { + char buf[16]; + int tid; + + data += strlen("qThreadExtraInfo"); + len -= strlen("qThreadExtraInfo"); + if (*data != ',') { + send_error(EINVAL); + return; + } + tid = parse_threadid(data + 1, len - 1); + if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { + send_error(EINVAL); + return; + } + + snprintf(buf, sizeof(buf), "vCPU %d", tid - 1); + start_packet(); + append_asciihex(buf); + finish_packet(); + } else + send_empty_response(); +} + +static void +handle_command(const uint8_t *data, size_t len) +{ + + /* Reject packets with a sequence-id. */ + if (len >= 3 && data[0] >= '0' && data[0] <= '9' && + data[0] >= '0' && data[0] <= '9' && data[2] == ':') { + send_empty_response(); + return; + } + + switch (*data) { + case 'c': + if (len != 1) { + send_error(EINVAL); + break; + } + + /* Don't send a reply until a stop occurs. */ + gdb_resume_vcpus(); + break; + case 'D': + send_ok(); + + /* TODO: Resume any stopped CPUs. 
*/ + break; + case 'g': { + gdb_read_regs(); + break; + } + case 'H': { + int tid; + + if (data[1] != 'g' && data[1] != 'c') { + send_error(EINVAL); + break; + } + tid = parse_threadid(data + 2, len - 2); + if (tid == -2) { + send_error(EINVAL); + break; + } + + if (CPU_EMPTY(&vcpus_active)) { + send_error(EINVAL); + break; + } + if (tid == -1 || tid == 0) + cur_vcpu = CPU_FFS(&vcpus_active) - 1; + else if (CPU_ISSET(tid - 1, &vcpus_active)) + cur_vcpu = tid - 1; + else { + send_error(EINVAL); + break; + } + send_ok(); + break; + } + case 'm': + gdb_read_mem(data, len); + break; + case 'M': + gdb_write_mem(data, len); + break; + case 'T': { + int tid; + + tid = parse_threadid(data + 1, len - 1); + if (tid <= 0 || !CPU_ISSET(tid - 1, &vcpus_active)) { + send_error(EINVAL); + return; + } + send_ok(); + break; + } + case 'q': + gdb_query(data, len); + break; + case 's': + if (len != 1) { + send_error(EINVAL); + break; + } + + /* Don't send a reply until a stop occurs. */ + if (!gdb_step_vcpu(cur_vcpu)) { + send_error(EOPNOTSUPP); + break; + } + break; + case '?': + /* XXX: Only if stopped? */ + /* For now, just report that we are always stopped. */ + start_packet(); + append_char('S'); + append_byte(GDB_SIGNAL_TRAP); + finish_packet(); + break; + case 'G': /* TODO */ + case 'v': + /* Handle 'vCont' */ + /* 'vCtrlC' */ + case 'p': /* TODO */ + case 'P': /* TODO */ + case 'Q': /* TODO */ + case 't': /* TODO */ + case 'X': /* TODO */ + case 'z': /* TODO */ + case 'Z': /* TODO */ + default: + send_empty_response(); + } +} + +/* Check for a valid packet in the command buffer. */ +static void +check_command(int fd) +{ + uint8_t *head, *hash, *p, sum; + size_t avail, plen; + + for (;;) { + avail = cur_comm.len; + if (avail == 0) + return; + head = io_buffer_head(&cur_comm); + switch (*head) { + case 0x03: + debug("<- Ctrl-C\n"); + io_buffer_consume(&cur_comm, 1); + + gdb_suspend_vcpus(); + break; + case '+': + /* ACK of previous response. */ + debug("<- +\n"); + if (response_pending()) + io_buffer_reset(&cur_resp); + io_buffer_consume(&cur_comm, 1); + if (stop_pending) { + stop_pending = false; + report_stop(); + send_pending_data(fd); + } + break; + case '-': + /* NACK of previous response. */ + debug("<- -\n"); + if (response_pending()) { + cur_resp.len += cur_resp.start; + cur_resp.start = 0; + if (cur_resp.data[0] == '+') + io_buffer_advance(&cur_resp, 1); + debug("-> %.*s\n", (int)cur_resp.len, + io_buffer_head(&cur_resp)); + } + io_buffer_consume(&cur_comm, 1); + send_pending_data(fd); + break; + case '$': + /* Packet. */ + + if (response_pending()) { + warnx("New GDB command while response in " + "progress"); + io_buffer_reset(&cur_resp); + } + + /* Is packet complete? */ + hash = memchr(head, '#', avail); + if (hash == NULL) + return; + plen = (hash - head + 1) + 2; + if (avail < plen) + return; + debug("<- %.*s\n", (int)plen, head); + + /* Verify checksum. */ + for (sum = 0, p = head + 1; p < hash; p++) + sum += *p; + if (sum != parse_byte(hash + 1)) { + io_buffer_consume(&cur_comm, plen); + debug("-> -\n"); + send_char('-'); + send_pending_data(fd); + break; + } + send_char('+'); + + handle_command(head + 1, hash - (head + 1)); + io_buffer_consume(&cur_comm, plen); + if (!response_pending()) { + debug("-> +\n"); + } + send_pending_data(fd); + break; + default: + /* XXX: Possibly drop connection instead. 
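+ * For reference, a healthy exchange through this loop looks like the
+ * following (addresses illustrative):
+ *
+ *	gdb  -> stub:	$m400000,4#f1	read 4 bytes at guest VA 0x400000
+ *	stub -> gdb:	+		checksum accepted
+ *	stub -> gdb:	$90909090#a4	the 4 bytes, hex-encoded
+ *
+ * Bytes arriving outside that framing fall through to here and are
+ * discarded one at a time.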
*/ + debug("-> %02x\n", *head); + io_buffer_consume(&cur_comm, 1); + break; + } + } +} + +static void +gdb_readable(int fd, enum ev_type event, void *arg) +{ + ssize_t nread; + int pending; + + if (ioctl(fd, FIONREAD, &pending) == -1) { + warn("FIONREAD on GDB socket"); + return; + } + + /* + * 'pending' might be zero due to EOF. We need to call read + * with a non-zero length to detect EOF. + */ + if (pending == 0) + pending = 1; + + /* Ensure there is room in the command buffer. */ + io_buffer_grow(&cur_comm, pending); + assert(io_buffer_avail(&cur_comm) >= pending); + + nread = read(fd, io_buffer_tail(&cur_comm), io_buffer_avail(&cur_comm)); + if (nread == 0) { + close_connection(); + } else if (nread == -1) { + if (errno == EAGAIN) + return; + + warn("Read from GDB socket"); + close_connection(); + } else { + cur_comm.len += nread; + pthread_mutex_lock(&gdb_lock); + check_command(fd); + pthread_mutex_unlock(&gdb_lock); + } +} + +static void +gdb_writable(int fd, enum ev_type event, void *arg) +{ + + send_pending_data(fd); +} + +static void +new_connection(int fd, enum ev_type event, void *arg) +{ + int optval, s; + + s = accept4(fd, NULL, NULL, SOCK_NONBLOCK); + if (s == -1) { + if (arg != NULL) + err(1, "Failed accepting initial GDB connection"); + + /* Silently ignore errors post-startup. */ + return; + } + + optval = 1; + if (setsockopt(s, SOL_SOCKET, SO_NOSIGPIPE, &optval, sizeof(optval)) == + -1) { + warn("Failed to disable SIGPIPE for GDB connection"); + close(s); + return; + } + + pthread_mutex_lock(&gdb_lock); + if (cur_fd != -1) { + close(s); + warnx("Ignoring additional GDB connection."); + } + + read_event = mevent_add(s, EVF_READ, gdb_readable, NULL); + if (read_event == NULL) { + if (arg != NULL) + err(1, "Failed to setup initial GDB connection"); + pthread_mutex_unlock(&gdb_lock); + return; + } + write_event = mevent_add(s, EVF_WRITE, gdb_writable, NULL); + if (write_event == NULL) { + if (arg != NULL) + err(1, "Failed to setup initial GDB connection"); + mevent_delete_close(read_event); + read_event = NULL; + } + + cur_fd = s; + cur_vcpu = 0; + stepping_vcpu = -1; + stopped_vcpu = -1; + stop_pending = false; + + /* Break on attach. */ + first_stop = true; + gdb_suspend_vcpus(); + pthread_mutex_unlock(&gdb_lock); +} + +#ifndef WITHOUT_CAPSICUM +void +limit_gdb_socket(int s) +{ + cap_rights_t rights; + unsigned long ioctls[] = { FIONREAD }; + + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE, + CAP_SETSOCKOPT, CAP_IOCTL); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(s, ioctls, nitems(ioctls)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +} +#endif + +void +init_gdb(struct vmctx *_ctx, int sport, bool wait) +{ + struct sockaddr_in sin; + int error, flags, s; + + debug("==> starting on %d, %swaiting\n", sport, wait ? 
"" : "not "); + + error = pthread_mutex_init(&gdb_lock, NULL); + if (error != 0) + errc(1, error, "gdb mutex init"); + error = pthread_cond_init(&idle_vcpus, NULL); + if (error != 0) + errc(1, error, "gdb cv init"); + + ctx = _ctx; + s = socket(PF_INET, SOCK_STREAM, 0); + if (s < 0) + err(1, "gdb socket create"); + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = htons(sport); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) + err(1, "gdb socket bind"); + + if (listen(s, 1) < 0) + err(1, "gdb socket listen"); + + if (wait) { + /* + * Set vcpu 0 in vcpus_suspended. This will trigger the + * logic in gdb_cpu_add() to suspend the first vcpu before + * it starts execution. The vcpu will remain suspended + * until a debugger connects. + */ + stepping_vcpu = -1; + stopped_vcpu = -1; + CPU_SET(0, &vcpus_suspended); + } + + flags = fcntl(s, F_GETFL); + if (fcntl(s, F_SETFL, flags | O_NONBLOCK) == -1) + err(1, "Failed to mark gdb socket non-blocking"); + +#ifndef WITHOUT_CAPSICUM + limit_gdb_socket(s); +#endif + mevent_add(s, EVF_READ, new_connection, NULL); +} diff --git a/usr/src/cmd/bhyve/gdb.h b/usr/src/cmd/bhyve/gdb.h new file mode 100644 index 0000000000..09ebc34f24 --- /dev/null +++ b/usr/src/cmd/bhyve/gdb.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017 John H. Baldwin <jhb@FreeBSD.org> + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef __GDB_H__ +#define __GDB_H__ + +void gdb_cpu_add(int vcpu); +void gdb_cpu_mtrap(int vcpu); +void gdb_cpu_suspend(int vcpu); +void init_gdb(struct vmctx *ctx, int sport, bool wait); + +#endif /* !__GDB_H__ */ diff --git a/usr/src/cmd/bhyve/inout.c b/usr/src/cmd/bhyve/inout.c index 510649893a..b460ee2988 100644 --- a/usr/src/cmd/bhyve/inout.c +++ b/usr/src/cmd/bhyve/inout.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/inout.c 277310 2015-01-18 03:08:30Z neel $ + * $FreeBSD$ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/inout.c 277310 2015-01-18 03:08:30Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/linker_set.h> @@ -66,21 +68,21 @@ static int default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { - if (in) { - switch (bytes) { - case 4: - *eax = 0xffffffff; - break; - case 2: - *eax = 0xffff; - break; - case 1: - *eax = 0xff; - break; - } - } - - return (0); + if (in) { + switch (bytes) { + case 4: + *eax = 0xffffffff; + break; + case 2: + *eax = 0xffff; + break; + case 1: + *eax = 0xff; + break; + } + } + + return (0); } static void @@ -107,7 +109,7 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) uint32_t eax, val; inout_func_t handler; void *arg; - int error, retval; + int error, fault, retval; enum vm_reg_name idxreg; uint64_t gla, index, iterations, count; struct vm_inout_str *vis; @@ -163,11 +165,11 @@ emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict) } error = vm_copy_setup(ctx, vcpu, &vis->paging, gla, - bytes, prot, iov, nitems(iov)); - if (error == -1) { + bytes, prot, iov, nitems(iov), &fault); + if (error) { retval = -1; /* Unrecoverable error */ break; - } else if (error == 1) { + } else if (fault) { retval = 0; /* Resume guest to handle fault */ break; } diff --git a/usr/src/cmd/bhyve/inout.h b/usr/src/cmd/bhyve/inout.h index 0d4046bd61..b72ee5d93e 100644 --- a/usr/src/cmd/bhyve/inout.h +++ b/usr/src/cmd/bhyve/inout.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/inout.h 269094 2014-07-25 20:18:35Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the diff --git a/usr/src/cmd/bhyve/ioapic.c b/usr/src/cmd/bhyve/ioapic.c index 86ff5c6580..acdbb5111b 100644 --- a/usr/src/cmd/bhyve/ioapic.c +++ b/usr/src/cmd/bhyve/ioapic.c @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2014 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * @@ -26,14 +28,17 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/ioapic.c 261268 2014-01-29 14:56:48Z jhb $"); +__FBSDID("$FreeBSD$"); #include <sys/types.h> +#include <stdio.h> #include <machine/vmm.h> #include <vmmapi.h> #include "ioapic.h" +#include "pci_emul.h" +#include "pci_lpc.h" /* * Assign PCI INTx interrupts to I/O APIC pins in a round-robin @@ -64,11 +69,15 @@ ioapic_init(struct vmctx *ctx) } int -ioapic_pci_alloc_irq(void) +ioapic_pci_alloc_irq(struct pci_devinst *pi) { static int last_pin; if (pci_pins == 0) return (-1); + if (lpc_bootrom()) { + /* For external bootrom use fixed mapping. 
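+ * For example, a device in slot 3 whose INTA pin is encoded as 1
+ * (pci_emul uses 1-based lintr pins) gets I/O APIC pin
+ * 16 + (4 + 3 + 1) % 8 = 16, and the same pin in slot 4 gets 17:
+ * a deterministic assignment, unlike the round-robin fallback below.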
*/ + return (16 + (4 + pi->pi_slot + pi->pi_lintr.pin) % 8); + } return (16 + (last_pin++ % pci_pins)); } diff --git a/usr/src/cmd/bhyve/ioapic.h b/usr/src/cmd/bhyve/ioapic.h index 789f90fea9..3a7fa76192 100644 --- a/usr/src/cmd/bhyve/ioapic.h +++ b/usr/src/cmd/bhyve/ioapic.h @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2014 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * @@ -24,16 +26,18 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/ioapic.h 261268 2014-01-29 14:56:48Z jhb $ + * $FreeBSD$ */ #ifndef _IOAPIC_H_ #define _IOAPIC_H_ +struct pci_devinst; + /* * Allocate a PCI IRQ from the I/O APIC. */ void ioapic_init(struct vmctx *ctx); -int ioapic_pci_alloc_irq(void); +int ioapic_pci_alloc_irq(struct pci_devinst *pi); #endif diff --git a/usr/src/cmd/bhyve/iov.c b/usr/src/cmd/bhyve/iov.c new file mode 100644 index 0000000000..54ea22aa94 --- /dev/null +++ b/usr/src/cmd/bhyve/iov.c @@ -0,0 +1,148 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>. + * Copyright (c) 2018 Alexander Motin <mav@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/uio.h> + +#include <stdlib.h> +#include <string.h> +#include "iov.h" + +void +seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, int *niov2, + size_t seek) +{ + size_t remainder = 0; + size_t left = seek; + int i, j; + + for (i = 0; i < niov1; i++) { + size_t toseek = MIN(left, iov1[i].iov_len); + left -= toseek; + + if (toseek == iov1[i].iov_len) + continue; + + if (left == 0) { + remainder = toseek; + break; + } + } + + for (j = i; j < niov1; j++) { + iov2[j - i].iov_base = (char *)iov1[j].iov_base + remainder; + iov2[j - i].iov_len = iov1[j].iov_len - remainder; + remainder = 0; + } + + *niov2 = j - i; +} + +size_t +count_iov(const struct iovec *iov, int niov) +{ + size_t total = 0; + int i; + + for (i = 0; i < niov; i++) + total += iov[i].iov_len; + + return (total); +} + +void +truncate_iov(struct iovec *iov, int *niov, size_t length) +{ + size_t done = 0; + int i; + + for (i = 0; i < *niov; i++) { + size_t toseek = MIN(length - done, iov[i].iov_len); + done += toseek; + + if (toseek <= iov[i].iov_len) { + iov[i].iov_len = toseek; + *niov = i + 1; + return; + } + } +} + +ssize_t +iov_to_buf(const struct iovec *iov, int niov, void **buf) +{ + size_t ptr, total; + int i; + + total = count_iov(iov, niov); + *buf = realloc(*buf, total); + if (*buf == NULL) + return (-1); + + for (i = 0, ptr = 0; i < niov; i++) { + memcpy(*buf + ptr, iov[i].iov_base, iov[i].iov_len); + ptr += iov[i].iov_len; + } + + return (total); +} + +ssize_t +buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov, + size_t seek) +{ + struct iovec *diov; + int ndiov, i; + size_t off = 0, len; + + if (seek > 0) { + diov = malloc(sizeof(struct iovec) * niov); + seek_iov(iov, niov, diov, &ndiov, seek); + } else { + diov = iov; + ndiov = niov; + } + + for (i = 0; i < ndiov && off < buflen; i++) { + len = MIN(diov[i].iov_len, buflen - off); + memcpy(diov[i].iov_base, buf + off, len); + off += len; + } + + if (seek > 0) + free(diov); + + return ((ssize_t)off); +} + diff --git a/usr/src/cmd/bhyve/iov.h b/usr/src/cmd/bhyve/iov.h new file mode 100644 index 0000000000..e3b5916edb --- /dev/null +++ b/usr/src/cmd/bhyve/iov.h @@ -0,0 +1,44 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>. + * Copyright (c) 2018 Alexander Motin <mav@FreeBSD.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IOV_H_ +#define _IOV_H_ + +void seek_iov(const struct iovec *iov1, int niov1, struct iovec *iov2, + int *niov2, size_t seek); +void truncate_iov(struct iovec *iov, int *niov, size_t length); +size_t count_iov(const struct iovec *iov, int niov); +ssize_t iov_to_buf(const struct iovec *iov, int niov, void **buf); +ssize_t buf_to_iov(const void *buf, size_t buflen, struct iovec *iov, int niov, + size_t seek); + +#endif /* _IOV_H_ */ diff --git a/usr/src/cmd/bhyve/mem.c b/usr/src/cmd/bhyve/mem.c index a153a8e960..90aefe45c8 100644 --- a/usr/src/cmd/bhyve/mem.c +++ b/usr/src/cmd/bhyve/mem.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/mem.c 269700 2014-08-08 03:49:01Z neel $ + * $FreeBSD$ */ /* @@ -33,18 +35,19 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/mem.c 269700 2014-08-08 03:49:01Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/types.h> -#include <sys/tree.h> #include <sys/errno.h> +#include <sys/tree.h> #include <machine/vmm.h> #include <machine/vmm_instruction_emul.h> -#include <stdio.h> -#include <stdlib.h> #include <assert.h> +#include <err.h> #include <pthread.h> +#include <stdio.h> +#include <stdlib.h> #include "mem.h" @@ -121,6 +124,7 @@ mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new) static void mmio_rb_dump(struct mmio_rb_tree *rbt) { + int perror; struct mmio_rb_range *np; pthread_rwlock_rdlock(&mmio_rwlock); @@ -128,12 +132,16 @@ mmio_rb_dump(struct mmio_rb_tree *rbt) printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end, np->mr_param.name); } - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); } #endif RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare); +typedef int (mem_cb_t)(struct vmctx *ctx, int vcpu, uint64_t gpa, + struct mem_range *mr, void *arg); + static int mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg) { @@ -156,13 +164,12 @@ mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg) return (error); } -int -emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, - struct vm_guest_paging *paging) - +static int +access_memory(struct vmctx *ctx, int vcpu, uint64_t paddr, mem_cb_t *cb, + void *arg) { struct mmio_rb_range *entry; - int err, immutable; + int err, perror, immutable; pthread_rwlock_rdlock(&mmio_rwlock); /* @@ -180,7 +187,8 @@ emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, /* Update the per-vCPU cache */ mmio_hint[vcpu] = entry; } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) { - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); return (ESRCH); } } @@ -199,40 +207,114 @@ emulate_mem(struct 
vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, * config space window as 'immutable' the deadlock can be avoided. */ immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE); - if (immutable) - pthread_rwlock_unlock(&mmio_rwlock); + if (immutable) { + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); + } - err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging, - mem_read, mem_write, &entry->mr_param); + err = cb(ctx, vcpu, paddr, &entry->mr_param, arg); + + if (!immutable) { + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); + } - if (!immutable) - pthread_rwlock_unlock(&mmio_rwlock); return (err); } +struct emulate_mem_args { + struct vie *vie; + struct vm_guest_paging *paging; +}; + +static int +emulate_mem_cb(struct vmctx *ctx, int vcpu, uint64_t paddr, struct mem_range *mr, + void *arg) +{ + struct emulate_mem_args *ema; + + ema = arg; + return (vmm_emulate_instruction(ctx, vcpu, paddr, ema->vie, ema->paging, + mem_read, mem_write, mr)); +} + +int +emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie, + struct vm_guest_paging *paging) + +{ + struct emulate_mem_args ema; + + ema.vie = vie; + ema.paging = paging; + return (access_memory(ctx, vcpu, paddr, emulate_mem_cb, &ema)); +} + +struct rw_mem_args { + uint64_t *val; + int size; + int operation; +}; + +static int +rw_mem_cb(struct vmctx *ctx, int vcpu, uint64_t paddr, struct mem_range *mr, + void *arg) +{ + struct rw_mem_args *rma; + + rma = arg; + return (mr->handler(ctx, vcpu, rma->operation, paddr, rma->size, + rma->val, mr->arg1, mr->arg2)); +} + +int +read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size) +{ + struct rw_mem_args rma; + + rma.val = rval; + rma.size = size; + rma.operation = MEM_F_READ; + return (access_memory(ctx, vcpu, gpa, rw_mem_cb, &rma)); +} + +int +write_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size) +{ + struct rw_mem_args rma; + + rma.val = &wval; + rma.size = size; + rma.operation = MEM_F_WRITE; + return (access_memory(ctx, vcpu, gpa, rw_mem_cb, &rma)); +} + static int register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp) { struct mmio_rb_range *entry, *mrp; - int err; + int err, perror; err = 0; mrp = malloc(sizeof(struct mmio_rb_range)); - - if (mrp != NULL) { + if (mrp == NULL) { + warn("%s: couldn't allocate memory for mrp\n", + __func__); + err = ENOMEM; + } else { mrp->mr_param = *memp; mrp->mr_base = memp->base; mrp->mr_end = memp->base + memp->size - 1; pthread_rwlock_wrlock(&mmio_rwlock); if (mmio_rb_lookup(rbt, memp->base, &entry) != 0) err = mmio_rb_add(rbt, mrp); - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); if (err) free(mrp); - } else - err = ENOMEM; + } return (err); } @@ -256,7 +338,7 @@ unregister_mem(struct mem_range *memp) { struct mem_range *mr; struct mmio_rb_range *entry = NULL; - int err, i; + int err, perror, i; pthread_rwlock_wrlock(&mmio_rwlock); err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry); @@ -273,7 +355,8 @@ unregister_mem(struct mem_range *memp) mmio_hint[i] = NULL; } } - pthread_rwlock_unlock(&mmio_rwlock); + perror = pthread_rwlock_unlock(&mmio_rwlock); + assert(perror == 0); if (entry) free(entry); diff --git a/usr/src/cmd/bhyve/mem.h b/usr/src/cmd/bhyve/mem.h index 09cf56b72e..38d773c43f 100644 --- a/usr/src/cmd/bhyve/mem.h +++ b/usr/src/cmd/bhyve/mem.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. 
* All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/mem.h 269700 2014-08-08 03:49:01Z neel $ + * $FreeBSD$ */ #ifndef _MEM_H_ @@ -53,9 +55,13 @@ struct mem_range { void init_mem(void); int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie, struct vm_guest_paging *paging); - + +int read_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t *rval, + int size); int register_mem(struct mem_range *memp); int register_mem_fallback(struct mem_range *memp); int unregister_mem(struct mem_range *memp); +int write_mem(struct vmctx *ctx, int vcpu, uint64_t gpa, uint64_t wval, + int size); #endif /* _MEM_H_ */ diff --git a/usr/src/cmd/bhyve/mevent.c b/usr/src/cmd/bhyve/mevent.c new file mode 100644 index 0000000000..a258fd3047 --- /dev/null +++ b/usr/src/cmd/bhyve/mevent.c @@ -0,0 +1,680 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Micro event library for FreeBSD, designed for a single i/o thread + * using kqueue, and having events be persistent by default. 
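+ * A minimal, hypothetical consumer registers a callback and then
+ * donates the calling thread to the dispatch loop; because events are
+ * persistent, a single registration keeps firing until deleted:
+ *
+ *	static void cb(int fd, enum ev_type type, void *arg) { ... }
+ *
+ *	if (mevent_add(fd, EVF_READ, cb, NULL) == NULL)
+ *		err(1, "mevent_add");
+ *	mevent_dispatch();	(does not return)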
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <assert.h> +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> + +#include <sys/types.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#ifdef __FreeBSD__ +#include <sys/event.h> +#else +#include <port.h> +#include <sys/poll.h> +#include <sys/siginfo.h> +#include <sys/queue.h> +#endif +#include <sys/time.h> + +#include <pthread.h> +#include <pthread_np.h> + +#include "mevent.h" + +#define MEVENT_MAX 64 + +#define MEV_ADD 1 +#define MEV_ENABLE 2 +#define MEV_DISABLE 3 +#define MEV_DEL_PENDING 4 + +extern char *vmname; + +static pthread_t mevent_tid; +static int mevent_timid = 43; +static int mevent_pipefd[2]; +static pthread_mutex_t mevent_lmutex = PTHREAD_MUTEX_INITIALIZER; + +struct mevent { + void (*me_func)(int, enum ev_type, void *); +#define me_msecs me_fd + int me_fd; +#ifdef __FreeBSD__ + int me_timid; +#else + timer_t me_timid; +#endif + enum ev_type me_type; + void *me_param; + int me_cq; + int me_state; + int me_closefd; +#ifndef __FreeBSD__ + port_notify_t me_notify; + struct sigevent me_sigev; + boolean_t me_auto_requeue; +#endif + LIST_ENTRY(mevent) me_list; +}; + +static LIST_HEAD(listhead, mevent) global_head, change_head; + +static void +mevent_qlock(void) +{ + pthread_mutex_lock(&mevent_lmutex); +} + +static void +mevent_qunlock(void) +{ + pthread_mutex_unlock(&mevent_lmutex); +} + +static void +mevent_pipe_read(int fd, enum ev_type type, void *param) +{ + char buf[MEVENT_MAX]; + int status; + + /* + * Drain the pipe read side. The fd is non-blocking so this is + * safe to do. + */ + do { + status = read(fd, buf, sizeof(buf)); + } while (status == MEVENT_MAX); +} + +static void +mevent_notify(void) +{ + char c; + + /* + * If calling from outside the i/o thread, write a byte on the + * pipe to force the i/o thread to exit the blocking kevent call. + */ + if (mevent_pipefd[1] != 0 && pthread_self() != mevent_tid) { + write(mevent_pipefd[1], &c, 1); + } +} +#ifdef __FreeBSD__ +static int +mevent_kq_filter(struct mevent *mevp) +{ + int retval; + + retval = 0; + + if (mevp->me_type == EVF_READ) + retval = EVFILT_READ; + + if (mevp->me_type == EVF_WRITE) + retval = EVFILT_WRITE; + + if (mevp->me_type == EVF_TIMER) + retval = EVFILT_TIMER; + + if (mevp->me_type == EVF_SIGNAL) + retval = EVFILT_SIGNAL; + + return (retval); +} + +static int +mevent_kq_flags(struct mevent *mevp) +{ + int ret; + + switch (mevp->me_state) { + case MEV_ADD: + ret = EV_ADD; /* implicitly enabled */ + break; + case MEV_ENABLE: + ret = EV_ENABLE; + break; + case MEV_DISABLE: + ret = EV_DISABLE; + break; + case MEV_DEL_PENDING: + ret = EV_DELETE; + break; + default: + assert(0); + break; + } + + return (ret); +} + +static int +mevent_kq_fflags(struct mevent *mevp) +{ + /* XXX nothing yet, perhaps EV_EOF for reads ? 
*/ + return (0); +} + +static int +mevent_build(int mfd, struct kevent *kev) +{ + struct mevent *mevp, *tmpp; + int i; + + i = 0; + + mevent_qlock(); + + LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { + if (mevp->me_closefd) { + /* + * A close of the file descriptor will remove the + * event + */ + close(mevp->me_fd); + } else { + if (mevp->me_type == EVF_TIMER) { + kev[i].ident = mevp->me_timid; + kev[i].data = mevp->me_msecs; + } else { + kev[i].ident = mevp->me_fd; + kev[i].data = 0; + } + kev[i].filter = mevent_kq_filter(mevp); + kev[i].flags = mevent_kq_flags(mevp); + kev[i].fflags = mevent_kq_fflags(mevp); + kev[i].udata = mevp; + i++; + } + + mevp->me_cq = 0; + LIST_REMOVE(mevp, me_list); + + if (mevp->me_state == MEV_DEL_PENDING) { + free(mevp); + } else { + LIST_INSERT_HEAD(&global_head, mevp, me_list); + } + + assert(i < MEVENT_MAX); + } + + mevent_qunlock(); + + return (i); +} + +static void +mevent_handle(struct kevent *kev, int numev) +{ + struct mevent *mevp; + int i; + + for (i = 0; i < numev; i++) { + mevp = kev[i].udata; + + /* XXX check for EV_ERROR ? */ + + (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); + } +} + +#else /* __FreeBSD__ */ + +static void +mevent_update_one(struct mevent *mevp) +{ + int portfd = mevp->me_notify.portnfy_port; + + switch (mevp->me_type) { + case EVF_READ: + case EVF_WRITE: + mevp->me_auto_requeue = B_FALSE; + + switch (mevp->me_state) { + case MEV_ADD: + case MEV_ENABLE: + { + int events; + + events = (mevp->me_type == EVF_READ) ? POLLIN : POLLOUT; + + if (port_associate(portfd, PORT_SOURCE_FD, mevp->me_fd, + events, mevp) != 0) { + (void) fprintf(stderr, + "port_associate fd %d %p failed: %s\n", + mevp->me_fd, mevp, strerror(errno)); + } + return; + } + case MEV_DISABLE: + case MEV_DEL_PENDING: + /* + * A disable that comes in while an event is being + * handled will result in an ENOENT. + */ + if (port_dissociate(portfd, PORT_SOURCE_FD, + mevp->me_fd) != 0 && errno != ENOENT) { + (void) fprintf(stderr, "port_dissociate " + "portfd %d fd %d mevp %p failed: %s\n", + portfd, mevp->me_fd, mevp, strerror(errno)); + } + return; + default: + goto abort; + } + + case EVF_TIMER: + mevp->me_auto_requeue = B_TRUE; + + switch (mevp->me_state) { + case MEV_ADD: + case MEV_ENABLE: + { + struct itimerspec it = { 0 }; + + mevp->me_sigev.sigev_notify = SIGEV_PORT; + mevp->me_sigev.sigev_value.sival_ptr = &mevp->me_notify; + + if (timer_create(CLOCK_REALTIME, &mevp->me_sigev, + &mevp->me_timid) != 0) { + (void) fprintf(stderr, + "timer_create failed: %s", strerror(errno)); + return; + } + + /* The first timeout */ + it.it_value.tv_sec = mevp->me_msecs / MILLISEC; + it.it_value.tv_nsec = + MSEC2NSEC(mevp->me_msecs % MILLISEC); + /* Repeat at the same interval */ + it.it_interval = it.it_value; + + if (timer_settime(mevp->me_timid, 0, &it, NULL) != 0) { + (void) fprintf(stderr, "timer_settime failed: " + "%s", strerror(errno)); + } + return; + } + case MEV_DISABLE: + case MEV_DEL_PENDING: + if (timer_delete(mevp->me_timid) != 0) { + (void) fprintf(stderr, "timer_delete failed: " + "%s", strerror(errno)); + } + return; + default: + goto abort; + } + default: + /* EVF_SIGNAL not yet implemented. 
*/ + goto abort; + } + +abort: + (void) fprintf(stderr, "%s: unhandled type %d state %d\n", __func__, + mevp->me_type, mevp->me_state); + abort(); +} + +static void +mevent_update_pending(int portfd) +{ + struct mevent *mevp, *tmpp; + + mevent_qlock(); + + LIST_FOREACH_SAFE(mevp, &change_head, me_list, tmpp) { + mevp->me_notify.portnfy_port = portfd; + mevp->me_notify.portnfy_user = mevp; + if (mevp->me_closefd) { + /* + * A close of the file descriptor will remove the + * event + */ + (void) close(mevp->me_fd); + mevp->me_fd = -1; + } else { + mevent_update_one(mevp); + } + + mevp->me_cq = 0; + LIST_REMOVE(mevp, me_list); + + if (mevp->me_state == MEV_DEL_PENDING) { + free(mevp); + } else { + LIST_INSERT_HEAD(&global_head, mevp, me_list); + } + } + + mevent_qunlock(); +} + +static void +mevent_handle_pe(port_event_t *pe) +{ + struct mevent *mevp = pe->portev_user; + + mevent_qunlock(); + + (*mevp->me_func)(mevp->me_fd, mevp->me_type, mevp->me_param); + + mevent_qlock(); + if (!mevp->me_cq && !mevp->me_auto_requeue) { + mevent_update_one(mevp); + } + mevent_qunlock(); +} +#endif + +struct mevent * +mevent_add(int tfd, enum ev_type type, + void (*func)(int, enum ev_type, void *), void *param) +{ + struct mevent *lp, *mevp; + + if (tfd < 0 || func == NULL) { + return (NULL); + } + + mevp = NULL; + + mevent_qlock(); + + /* + * Verify that the fd/type tuple is not present in any list + */ + LIST_FOREACH(lp, &global_head, me_list) { + if (type != EVF_TIMER && lp->me_fd == tfd && + lp->me_type == type) { + goto exit; + } + } + + LIST_FOREACH(lp, &change_head, me_list) { + if (type != EVF_TIMER && lp->me_fd == tfd && + lp->me_type == type) { + goto exit; + } + } + + /* + * Allocate an entry, populate it, and add it to the change list. + */ + mevp = calloc(1, sizeof(struct mevent)); + if (mevp == NULL) { + goto exit; + } + + if (type == EVF_TIMER) { + mevp->me_msecs = tfd; + mevp->me_timid = mevent_timid++; + } else + mevp->me_fd = tfd; + mevp->me_type = type; + mevp->me_func = func; + mevp->me_param = param; + + LIST_INSERT_HEAD(&change_head, mevp, me_list); + mevp->me_cq = 1; + mevp->me_state = MEV_ADD; + mevent_notify(); + +exit: + mevent_qunlock(); + + return (mevp); +} + +static int +mevent_update(struct mevent *evp, int newstate) +{ + /* + * It's not possible to enable/disable a deleted event + */ + if (evp->me_state == MEV_DEL_PENDING) + return (EINVAL); + + /* + * No update needed if state isn't changing + */ + if (evp->me_state == newstate) + return (0); + + mevent_qlock(); + + evp->me_state = newstate; + + /* + * Place the entry onto the changed list if not already there. + */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + + mevent_qunlock(); + + return (0); +} + +int +mevent_enable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_ENABLE)); +} + +int +mevent_disable(struct mevent *evp) +{ + + return (mevent_update(evp, MEV_DISABLE)); +} + +static int +mevent_delete_event(struct mevent *evp, int closefd) +{ + mevent_qlock(); + + /* + * Place the entry onto the changed list if not already there, and + * mark as to be deleted. 
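+ * Note the closefd distinction exposed by the two wrappers below:
+ * mevent_delete() leaves the descriptor open (the caller still owns
+ * it), while mevent_delete_close() also closes it once the change
+ * list is processed.  close_connection() in gdb.c shows the intended
+ * pairing for two events sharing one socket: mevent_delete() on the
+ * write event, then mevent_delete_close() on the read event, so the
+ * fd is closed exactly once.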
+ */ + if (evp->me_cq == 0) { + evp->me_cq = 1; + LIST_REMOVE(evp, me_list); + LIST_INSERT_HEAD(&change_head, evp, me_list); + mevent_notify(); + } + evp->me_state = MEV_DEL_PENDING; + + if (closefd) + evp->me_closefd = 1; + + mevent_qunlock(); + + return (0); +} + +int +mevent_delete(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 0)); +} + +int +mevent_delete_close(struct mevent *evp) +{ + + return (mevent_delete_event(evp, 1)); +} + +static void +mevent_set_name(void) +{ + + pthread_set_name_np(mevent_tid, "mevent"); +} + +void +mevent_dispatch(void) +{ +#ifdef __FreeBSD__ + struct kevent changelist[MEVENT_MAX]; + struct kevent eventlist[MEVENT_MAX]; + struct mevent *pipev; + int mfd; + int numev; +#else + struct mevent *pipev; + int portfd; +#endif + int ret; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + mevent_tid = pthread_self(); + mevent_set_name(); + +#ifdef __FreeBSD__ + mfd = kqueue(); + assert(mfd > 0); +#else + portfd = port_create(); + assert(portfd >= 0); +#endif + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_KQUEUE); + if (caph_rights_limit(mfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Open the pipe that will be used for other threads to force + * the blocking kqueue call to exit by writing to it. Set the + * descriptor to non-blocking. + */ + ret = pipe(mevent_pipefd); + if (ret < 0) { + perror("pipe"); + exit(0); + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(mevent_pipefd[0], &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_rights_limit(mevent_pipefd[1], &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Add internal event handler for the pipe write fd + */ + pipev = mevent_add(mevent_pipefd[0], EVF_READ, mevent_pipe_read, NULL); + assert(pipev != NULL); + + for (;;) { +#ifdef __FreeBSD__ + /* + * Build changelist if required. + * XXX the changelist can be put into the blocking call + * to eliminate the extra syscall. Currently better for + * debug. + */ + numev = mevent_build(mfd, changelist); + if (numev) { + ret = kevent(mfd, changelist, numev, NULL, 0, NULL); + if (ret == -1) { + perror("Error return from kevent change"); + } + } + + /* + * Block awaiting events + */ + ret = kevent(mfd, NULL, 0, eventlist, MEVENT_MAX, NULL); + if (ret == -1 && errno != EINTR) { + perror("Error return from kevent monitor"); + } + + /* + * Handle reported events + */ + mevent_handle(eventlist, ret); + +#else /* __FreeBSD__ */ + port_event_t pev; + + /* Handle any pending updates */ + mevent_update_pending(portfd); + + /* Block awaiting events */ + ret = port_get(portfd, &pev, NULL); + if (ret != 0 && errno != EINTR) { + perror("Error return from port_get"); + continue; + } + + /* Handle reported event */ + mevent_handle_pe(&pev); +#endif /* __FreeBSD__ */ + } +} diff --git a/usr/src/cmd/bhyve/mevent.h b/usr/src/cmd/bhyve/mevent.h new file mode 100644 index 0000000000..e6b96f0a7c --- /dev/null +++ b/usr/src/cmd/bhyve/mevent.h @@ -0,0 +1,53 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _MEVENT_H_ +#define _MEVENT_H_ + +enum ev_type { + EVF_READ, + EVF_WRITE, + EVF_TIMER, + EVF_SIGNAL +}; + +struct mevent; + +struct mevent *mevent_add(int fd, enum ev_type type, + void (*func)(int, enum ev_type, void *), + void *param); +int mevent_enable(struct mevent *evp); +int mevent_disable(struct mevent *evp); +int mevent_delete(struct mevent *evp); +int mevent_delete_close(struct mevent *evp); + +void mevent_dispatch(void); + +#endif /* _MEVENT_H_ */ diff --git a/usr/src/cmd/bhyve/mevent_test.c b/usr/src/cmd/bhyve/mevent_test.c new file mode 100644 index 0000000000..4da3adb5ae --- /dev/null +++ b/usr/src/cmd/bhyve/mevent_test.c @@ -0,0 +1,282 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Test program for the micro event library. Set up a simple TCP echo + * service. 
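+ * The service listens on TEST_PORT (4321) and can be exercised with
+ * any TCP client, e.g. nc(1).  Build with: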
+ * + * cc mevent_test.c mevent.c -lpthread + */ + +#include <sys/types.h> +#include <sys/stdint.h> +#ifdef __FreeBSD__ +#include <sys/sysctl.h> +#endif +#include <sys/socket.h> +#include <netinet/in.h> +#ifdef __FreeBSD__ +#include <machine/cpufunc.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <pthread.h> +#include <unistd.h> + +#include "mevent.h" + +#define TEST_PORT 4321 + +static pthread_mutex_t accept_mutex = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t accept_condvar = PTHREAD_COND_INITIALIZER; + +static struct mevent *tevp; + +char *vmname = "test vm"; + + +#define MEVENT_ECHO + +/* Number of timer events to capture */ +#define TEVSZ 4096 +uint64_t tevbuf[TEVSZ]; + +static void +timer_print(void) +{ + uint64_t min, max, diff, sum; +#ifdef __FreeBSD__ + uint64_t tsc_freq; + size_t len; +#endif + int j; + + min = UINT64_MAX; + max = 0; + sum = 0; + +#ifdef __FreeBSD__ + len = sizeof(tsc_freq); + sysctlbyname("machdep.tsc_freq", &tsc_freq, &len, NULL, 0); +#endif + + for (j = 1; j < TEVSZ; j++) { +#ifdef __FreeBSD__ + /* Convert a tsc diff into microseconds */ + diff = (tevbuf[j] - tevbuf[j-1]) * 1000000 / tsc_freq; +#else + diff = (tevbuf[j] - tevbuf[j-1]) / 1000; +#endif + sum += diff; + if (min > diff) + min = diff; + if (max < diff) + max = diff; + } + + printf("timers done: usecs, min %ld, max %ld, mean %ld\n", min, max, + sum/(TEVSZ - 1)); +} + +static void +timer_callback(int fd, enum ev_type type, void *param) +{ + static int i; + + if (i >= TEVSZ) + abort(); + +#ifdef __FreeBSD__ + tevbuf[i++] = rdtsc(); +#else + tevbuf[i++] = gethrtime(); +#endif + + if (i == TEVSZ) { + mevent_delete(tevp); + timer_print(); + } +} + + +#ifdef MEVENT_ECHO +struct esync { + pthread_mutex_t e_mt; + pthread_cond_t e_cond; +}; + +static void +echoer_callback(int fd, enum ev_type type, void *param) +{ + struct esync *sync = param; + + pthread_mutex_lock(&sync->e_mt); + pthread_cond_signal(&sync->e_cond); + pthread_mutex_unlock(&sync->e_mt); +} + +static void * +echoer(void *param) +{ + struct esync sync; + struct mevent *mev; + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + pthread_mutex_init(&sync.e_mt, NULL); + pthread_cond_init(&sync.e_cond, NULL); + + pthread_mutex_lock(&sync.e_mt); + + mev = mevent_add(fd, EVF_READ, echoer_callback, &sync); + if (mev == NULL) { + printf("Could not allocate echoer event\n"); + exit(4); + } + + while (!pthread_cond_wait(&sync.e_cond, &sync.e_mt)) { + len = read(fd, buf, sizeof(buf)); + if (len > 0) { + write(fd, buf, len); + write(0, buf, len); + } else { + break; + } + } + + mevent_delete_close(mev); + + pthread_mutex_unlock(&sync.e_mt); + pthread_mutex_destroy(&sync.e_mt); + pthread_cond_destroy(&sync.e_cond); + + return (NULL); +} + +#else + +static void * +echoer(void *param) +{ + char buf[128]; + int fd = (int)(uintptr_t) param; + int len; + + while ((len = read(fd, buf, sizeof(buf))) > 0) { + write(1, buf, len); + } + + return (NULL); +} +#endif /* MEVENT_ECHO */ + +static void +acceptor_callback(int fd, enum ev_type type, void *param) +{ + pthread_mutex_lock(&accept_mutex); + pthread_cond_signal(&accept_condvar); + pthread_mutex_unlock(&accept_mutex); +} + +static void * +acceptor(void *param) +{ + struct sockaddr_in sin; + pthread_t tid; + int news; + int s; + + if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + perror("cannot create socket"); + exit(4); + } + +#ifdef __FreeBSD__ + sin.sin_len = sizeof(sin); +#endif + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = htonl(INADDR_ANY); + sin.sin_port = 
htons(TEST_PORT); + + if (bind(s, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + perror("cannot bind socket"); + exit(4); + } + + if (listen(s, 1) < 0) { + perror("cannot listen socket"); + exit(4); + } + + (void) mevent_add(s, EVF_READ, acceptor_callback, NULL); + + pthread_mutex_lock(&accept_mutex); + + while (!pthread_cond_wait(&accept_condvar, &accept_mutex)) { + news = accept(s, NULL, NULL); + if (news < 0) { + perror("accept error"); + } else { + static int first = 1; + + if (first) { + /* + * Start a timer + */ + first = 0; + tevp = mevent_add(1, EVF_TIMER, timer_callback, + NULL); + } + + printf("incoming connection, spawning thread\n"); + pthread_create(&tid, NULL, echoer, + (void *)(uintptr_t)news); + } + } + + return (NULL); +} + +int +main() +{ + pthread_t tid; + + pthread_create(&tid, NULL, acceptor, NULL); + + mevent_dispatch(); + return (0); +} diff --git a/usr/src/cmd/bhyve/mptbl.c b/usr/src/cmd/bhyve/mptbl.c index 9d03765c7a..e78f88f074 100644 --- a/usr/src/cmd/bhyve/mptbl.c +++ b/usr/src/cmd/bhyve/mptbl.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/mptbl.c 266125 2014-05-15 14:16:55Z jhb $ + * $FreeBSD$ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/mptbl.c 266125 2014-05-15 14:16:55Z jhb $"); +__FBSDID("$FreeBSD$"); #include <sys/types.h> #include <sys/errno.h> diff --git a/usr/src/cmd/bhyve/mptbl.h b/usr/src/cmd/bhyve/mptbl.h index d78ea6da09..ebc8d85ea8 100644 --- a/usr/src/cmd/bhyve/mptbl.h +++ b/usr/src/cmd/bhyve/mptbl.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/mptbl.h 257423 2013-10-31 05:44:45Z neel $ + * $FreeBSD$ */ #ifndef _MPTBL_H_ diff --git a/usr/src/cmd/bhyve/pci_ahci.c b/usr/src/cmd/bhyve/pci_ahci.c index b68c977c1f..1e3feffcc2 100644 --- a/usr/src/cmd/bhyve/pci_ahci.c +++ b/usr/src/cmd/bhyve/pci_ahci.c @@ -1,5 +1,8 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Zhixiang Yu <zcore@freebsd.org> + * Copyright (c) 2015-2016 Alexander Motin <mav@FreeBSD.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -23,11 +26,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z tychon $ + * $FreeBSD$ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z tychon $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/linker_set.h> @@ -50,13 +53,15 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z t #include <pthread.h> #include <pthread_np.h> #include <inttypes.h> +#include <md5.h> #include "bhyverun.h" #include "pci_emul.h" #include "ahci.h" #include "block_if.h" -#define MAX_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ +#define DEF_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */ +#define MAX_PORTS 32 /* AHCI supports 32 ports */ #define PxSIG_ATA 0x00000101 /* ATA drive */ #define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */ @@ -86,6 +91,7 @@ enum sata_fis_type { #define READ_TOC 0x43 #define GET_EVENT_STATUS_NOTIFICATION 0x4A #define MODE_SENSE_10 0x5A +#define REPORT_LUNS 0xA0 #define READ_12 0xA8 #define READ_CD 0xBE @@ -99,7 +105,7 @@ enum sata_fis_type { * ATA commands */ #define ATA_SF_ENAB_SATA_SF 0x10 -#define ATA_SATA_SF_AN 0x05 +#define ATA_SATA_SF_AN 0x05 #define ATA_SF_DIS_SATA_SF 0x90 /* @@ -113,6 +119,8 @@ static FILE *dbg; #endif #define WPRINTF(format, arg...) printf(format, ##arg) +#define AHCI_PORT_IDENT 20 + 1 + struct ahci_ioreq { struct blockif_req io_req; struct ahci_port *io_pr; @@ -122,7 +130,7 @@ struct ahci_ioreq { uint32_t len; uint32_t done; int slot; - int prdtl; + int more; }; struct ahci_port { @@ -130,12 +138,17 @@ struct ahci_port { struct pci_ahci_softc *pr_sc; uint8_t *cmd_lst; uint8_t *rfis; + char ident[AHCI_PORT_IDENT]; + int port; int atapi; int reset; + int waitforclear; int mult_sectors; uint8_t xfermode; + uint8_t err_cfis[20]; uint8_t sense_key; uint8_t asc; + u_int ccs; uint32_t pending; uint32_t clb; @@ -200,6 +213,8 @@ struct pci_ahci_softc { }; #define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx) +static void ahci_handle_port(struct ahci_port *p); + static inline void lba_to_msf(uint8_t *buf, int lba) { lba += 150; @@ -209,47 +224,95 @@ static inline void lba_to_msf(uint8_t *buf, int lba) } /* - * generate HBA intr depending on whether or not ports within - * the controller have an interrupt pending. + * Generate HBA interrupts on global IS register write. */ static void -ahci_generate_intr(struct pci_ahci_softc *sc) +ahci_generate_intr(struct pci_ahci_softc *sc, uint32_t mask) { - struct pci_devinst *pi; - int i; - - pi = sc->asc_pi; + struct pci_devinst *pi = sc->asc_pi; + struct ahci_port *p; + int i, nmsg; + uint32_t mmask; + /* Update global IS from PxIS/PxIE. */ for (i = 0; i < sc->ports; i++) { - struct ahci_port *pr; - pr = &sc->port[i]; - if (pr->is & pr->ie) + p = &sc->port[i]; + if (p->is & p->ie) sc->is |= (1 << i); } + DPRINTF("%s(%08x) %08x\n", __func__, mask, sc->is); + + /* If there is nothing enabled -- clear legacy interrupt and exit. */ + if (sc->is == 0 || (sc->ghc & AHCI_GHC_IE) == 0) { + if (sc->lintr) { + pci_lintr_deassert(pi); + sc->lintr = 0; + } + return; + } - DPRINTF("%s %x\n", __func__, sc->is); - - if (sc->is && (sc->ghc & AHCI_GHC_IE)) { - if (pci_msi_enabled(pi)) { - /* - * Generate an MSI interrupt on every edge - */ - pci_generate_msi(pi, 0); - } else if (!sc->lintr) { - /* - * Only generate a pin-based interrupt if one wasn't - * in progress - */ + /* If there is anything and no MSI -- assert legacy interrupt. 
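+ * pci_msi_maxmsgnum() returns 0 when the guest has not enabled MSI,
+ * leaving the INTx pin below as the only signalling path.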
*/ + nmsg = pci_msi_maxmsgnum(pi); + if (nmsg == 0) { + if (!sc->lintr) { sc->lintr = 1; pci_lintr_assert(pi); } - } else if (sc->lintr) { - /* - * No interrupts: deassert pin-based signal if it had - * been asserted - */ - pci_lintr_deassert(pi); - sc->lintr = 0; + return; + } + + /* Assert respective MSIs for ports that were touched. */ + for (i = 0; i < nmsg; i++) { + if (sc->ports <= nmsg || i < nmsg - 1) + mmask = 1 << i; + else + mmask = 0xffffffff << i; + if (sc->is & mask && mmask & mask) + pci_generate_msi(pi, i); + } +} + +/* + * Generate HBA interrupt on specific port event. + */ +static void +ahci_port_intr(struct ahci_port *p) +{ + struct pci_ahci_softc *sc = p->pr_sc; + struct pci_devinst *pi = sc->asc_pi; + int nmsg; + + DPRINTF("%s(%d) %08x/%08x %08x\n", __func__, + p->port, p->is, p->ie, sc->is); + + /* If there is nothing enabled -- we are done. */ + if ((p->is & p->ie) == 0) + return; + + /* In case of non-shared MSI always generate interrupt. */ + nmsg = pci_msi_maxmsgnum(pi); + if (sc->ports <= nmsg || p->port < nmsg - 1) { + sc->is |= (1 << p->port); + if ((sc->ghc & AHCI_GHC_IE) == 0) + return; + pci_generate_msi(pi, p->port); + return; + } + + /* If IS for this port is already set -- do nothing. */ + if (sc->is & (1 << p->port)) + return; + + sc->is |= (1 << p->port); + + /* If interrupts are enabled -- generate one. */ + if ((sc->ghc & AHCI_GHC_IE) == 0) + return; + if (nmsg > 0) { + pci_generate_msi(pi, nmsg - 1); + } else if (!sc->lintr) { + sc->lintr = 1; + pci_lintr_assert(pi); } } @@ -265,26 +328,32 @@ ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis) case FIS_TYPE_REGD2H: offset = 0x40; len = 20; - irq = AHCI_P_IX_DHR; + irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_DHR : 0; break; case FIS_TYPE_SETDEVBITS: offset = 0x58; len = 8; - irq = AHCI_P_IX_SDB; + irq = (fis[1] & (1 << 6)) ? AHCI_P_IX_SDB : 0; break; case FIS_TYPE_PIOSETUP: offset = 0x20; len = 20; - irq = 0; + irq = (fis[1] & (1 << 6)) ? 
AHCI_P_IX_PS : 0; break; default: WPRINTF("unsupported fis type %d\n", ft); return; } + if (fis[2] & ATA_S_ERROR) { + p->waitforclear = 1; + irq |= AHCI_P_IX_TFE; + } memcpy(p->rfis + offset, fis, len); if (irq) { - p->is |= irq; - ahci_generate_intr(p->pr_sc); + if (~p->is & irq) { + p->is |= irq; + ahci_port_intr(p); + } } } @@ -299,19 +368,29 @@ ahci_write_fis_piosetup(struct ahci_port *p) } static void -ahci_write_fis_sdb(struct ahci_port *p, int slot, uint32_t tfd) +ahci_write_fis_sdb(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) { uint8_t fis[8]; uint8_t error; error = (tfd >> 8) & 0xff; + tfd &= 0x77; memset(fis, 0, sizeof(fis)); - fis[0] = error; - fis[2] = tfd & 0x77; - *(uint32_t *)(fis + 4) = (1 << slot); - if (fis[2] & ATA_S_ERROR) - p->is |= AHCI_P_IX_TFE; - p->tfd = tfd; + fis[0] = FIS_TYPE_SETDEVBITS; + fis[1] = (1 << 6); + fis[2] = tfd; + fis[3] = error; + if (fis[2] & ATA_S_ERROR) { + p->err_cfis[0] = slot; + p->err_cfis[2] = tfd; + p->err_cfis[3] = error; + memcpy(&p->err_cfis[4], cfis + 4, 16); + } else { + *(uint32_t *)(fis + 4) = (1 << slot); + p->sact &= ~(1 << slot); + } + p->tfd &= ~0x77; + p->tfd |= tfd; ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis); } @@ -337,15 +416,33 @@ ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd) fis[11] = cfis[11]; fis[12] = cfis[12]; fis[13] = cfis[13]; - if (fis[2] & ATA_S_ERROR) - p->is |= AHCI_P_IX_TFE; - else + if (fis[2] & ATA_S_ERROR) { + p->err_cfis[0] = 0x80; + p->err_cfis[2] = tfd & 0xff; + p->err_cfis[3] = error; + memcpy(&p->err_cfis[4], cfis + 4, 16); + } else p->ci &= ~(1 << slot); p->tfd = tfd; ahci_write_fis(p, FIS_TYPE_REGD2H, fis); } static void +ahci_write_fis_d2h_ncq(struct ahci_port *p, int slot) +{ + uint8_t fis[20]; + + p->tfd = ATA_S_READY | ATA_S_DSC; + memset(fis, 0, sizeof(fis)); + fis[0] = FIS_TYPE_REGD2H; + fis[1] = 0; /* No interrupt */ + fis[2] = p->tfd; /* Status */ + fis[3] = 0; /* No error */ + p->ci &= ~(1 << slot); + ahci_write_fis(p, FIS_TYPE_REGD2H, fis); +} + +static void ahci_write_reset_fis_d2h(struct ahci_port *p) { uint8_t fis[20]; @@ -372,9 +469,11 @@ ahci_check_stopped(struct ahci_port *p) */ if (!(p->cmd & AHCI_P_CMD_ST)) { if (p->pending == 0) { + p->ccs = 0; p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK); p->ci = 0; p->sact = 0; + p->waitforclear = 0; } } } @@ -385,7 +484,6 @@ ahci_port_stop(struct ahci_port *p) struct ahci_ioreq *aior; uint8_t *cfis; int slot; - int ncq; int error; assert(pthread_mutex_isowned_np(&p->pr_sc->mtx)); @@ -401,11 +499,9 @@ ahci_port_stop(struct ahci_port *p) slot = aior->slot; cfis = aior->cfis; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || - cfis[2] == ATA_READ_FPDMA_QUEUED) - ncq = 1; - - if (ncq) - p->sact &= ~(1 << slot); + cfis[2] == ATA_READ_FPDMA_QUEUED || + cfis[2] == ATA_SEND_FPDMA_QUEUED) + p->sact &= ~(1 << slot); /* NCQ */ else p->ci &= ~(1 << slot); @@ -431,7 +527,6 @@ ahci_port_stop(struct ahci_port *p) static void ahci_port_reset(struct ahci_port *pr) { - pr->sctl = 0; pr->serr = 0; pr->sact = 0; pr->xfermode = ATA_UDMA6; @@ -443,8 +538,11 @@ ahci_port_reset(struct ahci_port *pr) pr->tfd = 0x7F; return; } - pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_SPD_GEN2 | - ATA_SS_IPM_ACTIVE; + pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_IPM_ACTIVE; + if (pr->sctl & ATA_SC_SPD_MASK) + pr->ssts |= (pr->sctl & ATA_SC_SPD_MASK); + else + pr->ssts |= ATA_SS_SPD_GEN3; pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA; if (!pr->atapi) { pr->sig = PxSIG_ATA; @@ -470,6 +568,10 @@ ahci_reset(struct pci_ahci_softc *sc) for (i = 0; i < 
sc->ports; i++) { sc->port[i].ie = 0; sc->port[i].is = 0; + sc->port[i].cmd = (AHCI_P_CMD_SUD | AHCI_P_CMD_POD); + if (sc->port[i].bctx) + sc->port[i].cmd |= AHCI_P_CMD_CPS; + sc->port[i].sctl = 0; ahci_port_reset(&sc->port[i]); } } @@ -500,32 +602,87 @@ atapi_string(uint8_t *dest, const char *src, int len) } } +/* + * Build up the iovec based on the PRDT, 'done' and 'len'. + */ static void -ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done, - int seek) +ahci_build_iov(struct ahci_port *p, struct ahci_ioreq *aior, + struct ahci_prdt_entry *prdt, uint16_t prdtl) +{ + struct blockif_req *breq = &aior->io_req; + int i, j, skip, todo, left, extra; + uint32_t dbcsz; + + /* Copy part of PRDT between 'done' and 'len' bytes into the iov. */ + skip = aior->done; + left = aior->len - aior->done; + todo = 0; + for (i = 0, j = 0; i < prdtl && j < BLOCKIF_IOV_MAX && left > 0; + i++, prdt++) { + dbcsz = (prdt->dbc & DBCMASK) + 1; + /* Skip already done part of the PRDT */ + if (dbcsz <= skip) { + skip -= dbcsz; + continue; + } + dbcsz -= skip; + if (dbcsz > left) + dbcsz = left; + breq->br_iov[j].iov_base = paddr_guest2host(ahci_ctx(p->pr_sc), + prdt->dba + skip, dbcsz); + breq->br_iov[j].iov_len = dbcsz; + todo += dbcsz; + left -= dbcsz; + skip = 0; + j++; + } + + /* If we got limited by IOV length, round I/O down to sector size. */ + if (j == BLOCKIF_IOV_MAX) { + extra = todo % blockif_sectsz(p->bctx); + todo -= extra; + assert(todo > 0); + while (extra > 0) { + if (breq->br_iov[j - 1].iov_len > extra) { + breq->br_iov[j - 1].iov_len -= extra; + break; + } + extra -= breq->br_iov[j - 1].iov_len; + j--; + } + } + + breq->br_iovcnt = j; + breq->br_resid = todo; + aior->done += todo; + aior->more = (aior->done < aior->len && i < prdtl); +} + +static void +ahci_handle_rw(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct blockif_req *breq; - struct pci_ahci_softc *sc; struct ahci_prdt_entry *prdt; struct ahci_cmd_hdr *hdr; uint64_t lba; uint32_t len; - int i, err, iovcnt, ncq, readop; + int err, first, ncq, readop; - sc = p->pr_sc; prdt = (struct ahci_prdt_entry *)(cfis + 0x80); hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); ncq = 0; readop = 1; + first = (done == 0); - prdt += seek; - if (cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || - cfis[2] == ATA_WRITE_FPDMA_QUEUED) + if (cfis[2] == ATA_WRITE || cfis[2] == ATA_WRITE48 || + cfis[2] == ATA_WRITE_MUL || cfis[2] == ATA_WRITE_MUL48 || + cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 || + cfis[2] == ATA_WRITE_FPDMA_QUEUED) readop = 0; if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || - cfis[2] == ATA_READ_FPDMA_QUEUED) { + cfis[2] == ATA_READ_FPDMA_QUEUED) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | @@ -536,7 +693,9 @@ ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done, if (!len) len = 65536; ncq = 1; - } else if (cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { + } else if (cfis[2] == ATA_READ48 || cfis[2] == ATA_WRITE48 || + cfis[2] == ATA_READ_MUL48 || cfis[2] == ATA_WRITE_MUL48 || + cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) { lba = ((uint64_t)cfis[10] << 40) | ((uint64_t)cfis[9] << 32) | ((uint64_t)cfis[8] << 24) | @@ -556,57 +715,33 @@ ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done, lba *= blockif_sectsz(p->bctx); len *= blockif_sectsz(p->bctx); - /* - * Pull request off free list - */ + /* Pull request off free list */ aior = 
STAILQ_FIRST(&p->iofhd); assert(aior != NULL); STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; aior->slot = slot; aior->len = len; aior->done = done; breq = &aior->io_req; breq->br_offset = lba + done; - iovcnt = hdr->prdtl - seek; - if (iovcnt > BLOCKIF_IOV_MAX) { - aior->prdtl = iovcnt - BLOCKIF_IOV_MAX; - iovcnt = BLOCKIF_IOV_MAX; - } else - aior->prdtl = 0; - breq->br_iovcnt = iovcnt; + ahci_build_iov(p, aior, prdt, hdr->prdtl); - /* - * Mark this command in-flight. - */ + /* Mark this command in-flight. */ p->pending |= 1 << slot; - /* - * Stuff request onto busy list - */ + /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); - /* - * Build up the iovec based on the prdt - */ - for (i = 0; i < iovcnt; i++) { - uint32_t dbcsz; + if (ncq && first) + ahci_write_fis_d2h_ncq(p, slot); - dbcsz = (prdt->dbc & DBCMASK) + 1; - breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc), - prdt->dba, dbcsz); - breq->br_iov[i].iov_len = dbcsz; - aior->done += dbcsz; - prdt++; - } if (readop) err = blockif_read(p->bctx, breq); else err = blockif_write(p->bctx, breq); assert(err == 0); - - if (ncq) - p->ci &= ~(1 << slot); } static void @@ -626,7 +761,7 @@ ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) aior->slot = slot; aior->len = 0; aior->done = 0; - aior->prdtl = 0; + aior->more = 0; breq = &aior->io_req; /* @@ -644,6 +779,120 @@ ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis) } static inline void +read_prdt(struct ahci_port *p, int slot, uint8_t *cfis, + void *buf, int size) +{ + struct ahci_cmd_hdr *hdr; + struct ahci_prdt_entry *prdt; + void *to; + int i, len; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + len = size; + to = buf; + prdt = (struct ahci_prdt_entry *)(cfis + 0x80); + for (i = 0; i < hdr->prdtl && len; i++) { + uint8_t *ptr; + uint32_t dbcsz; + int sublen; + + dbcsz = (prdt->dbc & DBCMASK) + 1; + ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); + sublen = MIN(len, dbcsz); + memcpy(to, ptr, sublen); + len -= sublen; + to += sublen; + prdt++; + } +} + +static void +ahci_handle_dsm_trim(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) +{ + struct ahci_ioreq *aior; + struct blockif_req *breq; + uint8_t *entry; + uint64_t elba; + uint32_t len, elen; + int err, first, ncq; + uint8_t buf[512]; + + first = (done == 0); + if (cfis[2] == ATA_DATA_SET_MANAGEMENT) { + len = (uint16_t)cfis[13] << 8 | cfis[12]; + len *= 512; + ncq = 0; + } else { /* ATA_SEND_FPDMA_QUEUED */ + len = (uint16_t)cfis[11] << 8 | cfis[3]; + len *= 512; + ncq = 1; + } + read_prdt(p, slot, cfis, buf, sizeof(buf)); + +next: + entry = &buf[done]; + elba = ((uint64_t)entry[5] << 40) | + ((uint64_t)entry[4] << 32) | + ((uint64_t)entry[3] << 24) | + ((uint64_t)entry[2] << 16) | + ((uint64_t)entry[1] << 8) | + entry[0]; + elen = (uint16_t)entry[7] << 8 | entry[6]; + done += 8; + if (elen == 0) { + if (done >= len) { + if (ncq) { + if (first) + ahci_write_fis_d2h_ncq(p, slot); + ahci_write_fis_sdb(p, slot, cfis, + ATA_S_READY | ATA_S_DSC); + } else { + ahci_write_fis_d2h(p, slot, cfis, + ATA_S_READY | ATA_S_DSC); + } + p->pending &= ~(1 << slot); + ahci_check_stopped(p); + if (!first) + ahci_handle_port(p); + return; + } + goto next; + } + + /* + * Pull request off free list + */ + aior = STAILQ_FIRST(&p->iofhd); + assert(aior != NULL); + STAILQ_REMOVE_HEAD(&p->iofhd, io_flist); + aior->cfis = cfis; + aior->slot = slot; + aior->len = len; + aior->done = done; + aior->more = (len != done); + + breq = 
&aior->io_req; + breq->br_offset = elba * blockif_sectsz(p->bctx); + breq->br_resid = elen * blockif_sectsz(p->bctx); + + /* + * Mark this command in-flight. + */ + p->pending |= 1 << slot; + + /* + * Stuff request onto busy list + */ + TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); + + if (ncq && first) + ahci_write_fis_d2h_ncq(p, slot); + + err = blockif_delete(p->bctx, breq); + assert(err == 0); +} + +static inline void write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, void *buf, int size) { @@ -663,7 +912,7 @@ write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, dbcsz = (prdt->dbc & DBCMASK) + 1; ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz); - sublen = len < dbcsz ? len : dbcsz; + sublen = MIN(len, dbcsz); memcpy(ptr, from, sublen); len -= sublen; from += sublen; @@ -673,88 +922,174 @@ write_prdt(struct ahci_port *p, int slot, uint8_t *cfis, } static void +ahci_checksum(uint8_t *buf, int size) +{ + int i; + uint8_t sum = 0; + + for (i = 0; i < size - 1; i++) + sum += buf[i]; + buf[size - 1] = 0x100 - sum; +} + +static void +ahci_handle_read_log(struct ahci_port *p, int slot, uint8_t *cfis) +{ + struct ahci_cmd_hdr *hdr; + uint32_t buf[128]; + uint8_t *buf8 = (uint8_t *)buf; + uint16_t *buf16 = (uint16_t *)buf; + + hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); + if (p->atapi || hdr->prdtl == 0 || cfis[5] != 0 || + cfis[9] != 0 || cfis[12] != 1 || cfis[13] != 0) { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + return; + } + + memset(buf, 0, sizeof(buf)); + if (cfis[4] == 0x00) { /* Log directory */ + buf16[0x00] = 1; /* Version -- 1 */ + buf16[0x10] = 1; /* NCQ Command Error Log -- 1 page */ + buf16[0x13] = 1; /* SATA NCQ Send and Receive Log -- 1 page */ + } else if (cfis[4] == 0x10) { /* NCQ Command Error Log */ + memcpy(buf8, p->err_cfis, sizeof(p->err_cfis)); + ahci_checksum(buf8, sizeof(buf)); + } else if (cfis[4] == 0x13) { /* SATA NCQ Send and Receive Log */ + if (blockif_candelete(p->bctx) && !blockif_is_ro(p->bctx)) { + buf[0x00] = 1; /* SFQ DSM supported */ + buf[0x01] = 1; /* SFQ DSM TRIM supported */ + } + } else { + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + return; + } + + if (cfis[2] == ATA_READ_LOG_EXT) + ahci_write_fis_piosetup(p); + write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); +} + +static void handle_identify(struct ahci_port *p, int slot, uint8_t *cfis) { struct ahci_cmd_hdr *hdr; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (p->atapi || hdr->prdtl == 0) { - p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - p->is |= AHCI_P_IX_TFE; + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; uint64_t sectors; + int sectsz, psectsz, psectoff, candelete, ro; uint16_t cyl; uint8_t sech, heads; - sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx); + ro = blockif_is_ro(p->bctx); + candelete = blockif_candelete(p->bctx); + sectsz = blockif_sectsz(p->bctx); + sectors = blockif_size(p->bctx) / sectsz; blockif_chs(p->bctx, &cyl, &heads, &sech); + blockif_psectsz(p->bctx, &psectsz, &psectoff); memset(buf, 0, sizeof(buf)); buf[0] = 0x0040; buf[1] = cyl; buf[3] = heads; buf[6] = sech; - /* TODO emulate different serial? 
*/ - ata_string((uint8_t *)(buf+10), "123456", 20); + ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40); buf[47] = (0x8000 | 128); - buf[48] = 0x1; + buf[48] = 0; buf[49] = (1 << 8 | 1 << 9 | 1 << 11); buf[50] = (1 << 14); buf[53] = (1 << 1 | 1 << 2); if (p->mult_sectors) buf[59] = (0x100 | p->mult_sectors); - buf[60] = sectors; - buf[61] = (sectors >> 16); + if (sectors <= 0x0fffffff) { + buf[60] = sectors; + buf[61] = (sectors >> 16); + } else { + buf[60] = 0xffff; + buf[61] = 0x0fff; + } buf[63] = 0x7; if (p->xfermode & ATA_WDMA0) buf[63] |= (1 << ((p->xfermode & 7) + 8)); buf[64] = 0x3; - buf[65] = 100; - buf[66] = 100; - buf[67] = 100; - buf[68] = 100; + buf[65] = 120; + buf[66] = 120; + buf[67] = 120; + buf[68] = 120; + buf[69] = 0; buf[75] = 31; - buf[76] = (1 << 8 | 1 << 2); - buf[80] = 0x1f0; + buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3 | + ATA_SUPPORT_NCQ); + buf[77] = (ATA_SUPPORT_RCVSND_FPDMA_QUEUED | + (p->ssts & ATA_SS_SPD_MASK) >> 3); + buf[80] = 0x3f0; buf[81] = 0x28; - buf[82] = (1 << 5 | 1 << 14); - buf[83] = (1 << 10 | 1 << 12 | 1 << 13 | 1 << 14); + buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| + ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); + buf[83] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | + ATA_SUPPORT_FLUSHCACHE48 | 1 << 14); buf[84] = (1 << 14); - buf[85] = (1 << 5 | 1 << 14); - buf[86] = (1 << 10 | 1 << 12 | 1 << 13); + buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_WRITECACHE| + ATA_SUPPORT_LOOKAHEAD | ATA_SUPPORT_NOP); + buf[86] = (ATA_SUPPORT_ADDRESS48 | ATA_SUPPORT_FLUSHCACHE | + ATA_SUPPORT_FLUSHCACHE48 | 1 << 15); buf[87] = (1 << 14); buf[88] = 0x7f; if (p->xfermode & ATA_UDMA0) buf[88] |= (1 << ((p->xfermode & 7) + 8)); - buf[93] = (1 | 1 <<14); buf[100] = sectors; buf[101] = (sectors >> 16); buf[102] = (sectors >> 32); buf[103] = (sectors >> 48); + if (candelete && !ro) { + buf[69] |= ATA_SUPPORT_RZAT | ATA_SUPPORT_DRAT; + buf[105] = 1; + buf[169] = ATA_SUPPORT_DSM_TRIM; + } + buf[106] = 0x4000; + buf[209] = 0x4000; + if (psectsz > sectsz) { + buf[106] |= 0x2000; + buf[106] |= ffsl(psectsz / sectsz) - 1; + buf[209] |= (psectoff / sectsz); + } + if (sectsz > 512) { + buf[106] |= 0x1000; + buf[117] = sectsz / 2; + buf[118] = ((sectsz / 2) >> 16); + } + buf[119] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); + buf[120] = (ATA_SUPPORT_RWLOGDMAEXT | 1 << 14); + buf[222] = 0x1020; + buf[255] = 0x00a5; + ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); - p->tfd = ATA_S_DSC | ATA_S_READY; - p->is |= AHCI_P_IX_DP; - p->ci &= ~(1 << slot); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } - ahci_generate_intr(p->pr_sc); } static void handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) { if (!p->atapi) { - p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - p->is |= AHCI_P_IX_TFE; + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else { uint16_t buf[256]; memset(buf, 0, sizeof(buf)); buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5); - /* TODO emulate different serial? 
*/ - ata_string((uint8_t *)(buf+10), "123456", 20); + ata_string((uint8_t *)(buf+10), p->ident, 20); ata_string((uint8_t *)(buf+23), "001", 8); ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40); buf[49] = (1 << 9 | 1 << 8); @@ -762,27 +1097,34 @@ handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis) buf[53] = (1 << 2 | 1 << 1); buf[62] = 0x3f; buf[63] = 7; + if (p->xfermode & ATA_WDMA0) + buf[63] |= (1 << ((p->xfermode & 7) + 8)); buf[64] = 3; - buf[65] = 100; - buf[66] = 100; - buf[67] = 100; - buf[68] = 100; - buf[76] = (1 << 2 | 1 << 1); + buf[65] = 120; + buf[66] = 120; + buf[67] = 120; + buf[68] = 120; + buf[76] = (ATA_SATA_GEN1 | ATA_SATA_GEN2 | ATA_SATA_GEN3); + buf[77] = ((p->ssts & ATA_SS_SPD_MASK) >> 3); buf[78] = (1 << 5); - buf[80] = (0x1f << 4); - buf[82] = (1 << 4); + buf[80] = 0x3f0; + buf[82] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | + ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[83] = (1 << 14); buf[84] = (1 << 14); - buf[85] = (1 << 4); + buf[85] = (ATA_SUPPORT_POWERMGT | ATA_SUPPORT_PACKET | + ATA_SUPPORT_RESET | ATA_SUPPORT_NOP); buf[87] = (1 << 14); - buf[88] = (1 << 14 | 0x7f); + buf[88] = 0x7f; + if (p->xfermode & ATA_UDMA0) + buf[88] |= (1 << ((p->xfermode & 7) + 8)); + buf[222] = 0x1020; + buf[255] = 0x00a5; + ahci_checksum((uint8_t *)buf, sizeof(buf)); ahci_write_fis_piosetup(p); write_prdt(p, slot, cfis, (void *)buf, sizeof(buf)); - p->tfd = ATA_S_DSC | ATA_S_READY; - p->is |= AHCI_P_IX_DHR; - p->ci &= ~(1 << slot); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_DSC | ATA_S_READY); } - ahci_generate_intr(p->pr_sc); } static void @@ -791,22 +1133,41 @@ atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis) uint8_t buf[36]; uint8_t *acmd; int len; + uint32_t tfd; acmd = cfis + 0x40; - buf[0] = 0x05; - buf[1] = 0x80; - buf[2] = 0x00; - buf[3] = 0x21; - buf[4] = 31; - buf[5] = 0; - buf[6] = 0; - buf[7] = 0; - atapi_string(buf + 8, "BHYVE", 8); - atapi_string(buf + 16, "BHYVE DVD-ROM", 16); - atapi_string(buf + 32, "001", 4); - - len = sizeof(buf); + if (acmd[1] & 1) { /* VPD */ + if (acmd[2] == 0) { /* Supported VPD pages */ + buf[0] = 0x05; + buf[1] = 0; + buf[2] = 0; + buf[3] = 1; + buf[4] = 0; + len = 4 + buf[3]; + } else { + p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; + p->asc = 0x24; + tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + ahci_write_fis_d2h(p, slot, cfis, tfd); + return; + } + } else { + buf[0] = 0x05; + buf[1] = 0x80; + buf[2] = 0x00; + buf[3] = 0x21; + buf[4] = 31; + buf[5] = 0; + buf[6] = 0; + buf[7] = 0; + atapi_string(buf + 8, "BHYVE", 8); + atapi_string(buf + 16, "BHYVE DVD-ROM", 16); + atapi_string(buf + 32, "001", 4); + len = sizeof(buf); + } + if (len > acmd[4]) len = acmd[4]; cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; @@ -918,10 +1279,9 @@ atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) { int msf, size; uint64_t sectors; - uint8_t start_track, *bp, buf[50]; + uint8_t *bp, buf[50]; msf = (acmd[1] >> 1) & 1; - start_track = acmd[6]; bp = buf + 2; *bp++ = 1; *bp++ = 1; @@ -1010,25 +1370,34 @@ atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis) } static void -atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, - uint32_t done, int seek) +atapi_report_luns(struct ahci_port *p, int slot, uint8_t *cfis) +{ + uint8_t buf[16]; + + memset(buf, 0, sizeof(buf)); + buf[3] = 8; + + cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; + write_prdt(p, slot, cfis, buf, sizeof(buf)); + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); +} + 
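+/*
+ * A minimal illustrative sketch (not referenced by the emulation) of
+ * the CDB decode that atapi_read() below performs: READ(10) carries a
+ * 32-bit big-endian LBA at bytes 2-5 with a 16-bit block count at
+ * bytes 7-8, while READ(12) widens the count to 32 bits at bytes 6-9.
+ * The helper name is hypothetical.
+ */
+static inline void
+atapi_cdb_decode(const uint8_t *acmd, uint64_t *lba, uint32_t *len)
+{
+	*lba = be32dec(acmd + 2);		/* LBA, big-endian */
+	if (acmd[0] == READ_10)
+		*len = be16dec(acmd + 7);	/* 16-bit block count */
+	else
+		*len = be32dec(acmd + 6);	/* READ_12: 32-bit count */
+}
+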
+static void +atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done) { struct ahci_ioreq *aior; struct ahci_cmd_hdr *hdr; struct ahci_prdt_entry *prdt; struct blockif_req *breq; - struct pci_ahci_softc *sc; uint8_t *acmd; uint64_t lba; uint32_t len; - int i, err, iovcnt; + int err; - sc = p->pr_sc; acmd = cfis + 0x40; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); prdt = (struct ahci_prdt_entry *)(cfis + 0x80); - prdt += seek; lba = be32dec(acmd + 2); if (acmd[0] == READ_10) len = be16dec(acmd + 7); @@ -1053,37 +1422,14 @@ atapi_read(struct ahci_port *p, int slot, uint8_t *cfis, aior->done = done; breq = &aior->io_req; breq->br_offset = lba + done; - iovcnt = hdr->prdtl - seek; - if (iovcnt > BLOCKIF_IOV_MAX) { - aior->prdtl = iovcnt - BLOCKIF_IOV_MAX; - iovcnt = BLOCKIF_IOV_MAX; - } else - aior->prdtl = 0; - breq->br_iovcnt = iovcnt; + ahci_build_iov(p, aior, prdt, hdr->prdtl); - /* - * Mark this command in-flight. - */ + /* Mark this command in-flight. */ p->pending |= 1 << slot; - /* - * Stuff request onto busy list - */ + /* Stuff request onto busy list. */ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist); - /* - * Build up the iovec based on the prdt - */ - for (i = 0; i < iovcnt; i++) { - uint32_t dbcsz; - - dbcsz = (prdt->dbc & DBCMASK) + 1; - breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc), - prdt->dba, dbcsz); - breq->br_iov[i].iov_len = dbcsz; - aior->done += dbcsz; - prdt++; - } err = blockif_read(p->bctx, breq); assert(err == 0); } @@ -1137,7 +1483,7 @@ static void atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis) { uint8_t *acmd; - uint32_t tfd; + uint32_t tfd = 0; uint8_t pc, code; int len; @@ -1278,9 +1624,12 @@ handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis) case READ_TOC: atapi_read_toc(p, slot, cfis); break; + case REPORT_LUNS: + atapi_report_luns(p, slot, cfis); + break; case READ_10: case READ_12: - atapi_read(p, slot, cfis, 0, 0); + atapi_read(p, slot, cfis, 0); break; case REQUEST_SENSE: atapi_request_sense(p, slot, cfis); @@ -1308,6 +1657,7 @@ static void ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) { + p->tfd |= ATA_S_BUSY; switch (cfis[2]) { case ATA_ATA_IDENTIFY: handle_identify(p, slot, cfis); @@ -1363,28 +1713,68 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) p->mult_sectors = cfis[12]; p->tfd = ATA_S_DSC | ATA_S_READY; } - p->is |= AHCI_P_IX_DP; - p->ci &= ~(1 << slot); - ahci_generate_intr(p->pr_sc); + ahci_write_fis_d2h(p, slot, cfis, p->tfd); break; + case ATA_READ: + case ATA_WRITE: + case ATA_READ48: + case ATA_WRITE48: + case ATA_READ_MUL: + case ATA_WRITE_MUL: + case ATA_READ_MUL48: + case ATA_WRITE_MUL48: case ATA_READ_DMA: case ATA_WRITE_DMA: case ATA_READ_DMA48: case ATA_WRITE_DMA48: case ATA_READ_FPDMA_QUEUED: case ATA_WRITE_FPDMA_QUEUED: - ahci_handle_dma(p, slot, cfis, 0, 0); + ahci_handle_rw(p, slot, cfis, 0); break; case ATA_FLUSHCACHE: case ATA_FLUSHCACHE48: ahci_handle_flush(p, slot, cfis); break; - case ATA_STANDBY_CMD: + case ATA_DATA_SET_MANAGEMENT: + if (cfis[11] == 0 && cfis[3] == ATA_DSM_TRIM && + cfis[13] == 0 && cfis[12] == 1) { + ahci_handle_dsm_trim(p, slot, cfis, 0); + break; + } + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_SEND_FPDMA_QUEUED: + if ((cfis[13] & 0x1f) == ATA_SFPDMA_DSM && + cfis[17] == 0 && cfis[16] == ATA_DSM_TRIM && + cfis[11] == 0 && cfis[3] == 1) { + ahci_handle_dsm_trim(p, slot, cfis, 0); + break; + } + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT 
<< 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_READ_LOG_EXT: + case ATA_READ_LOG_DMA_EXT: + ahci_handle_read_log(p, slot, cfis); break; + case ATA_SECURITY_FREEZE_LOCK: + case ATA_SMART_CMD: case ATA_NOP: + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); + break; + case ATA_CHECK_POWER_MODE: + cfis[12] = 0xff; /* always on */ + ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); + break; + case ATA_STANDBY_CMD: case ATA_STANDBY_IMMEDIATE: + case ATA_IDLE_CMD: case ATA_IDLE_IMMEDIATE: case ATA_SLEEP: + case ATA_READ_VERIFY: + case ATA_READ_VERIFY48: ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC); break; case ATA_ATAPI_IDENTIFY: @@ -1392,17 +1782,15 @@ ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis) break; case ATA_PACKET_CMD: if (!p->atapi) { - p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - p->is |= AHCI_P_IX_TFE; - ahci_generate_intr(p->pr_sc); + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); } else handle_packet_cmd(p, slot, cfis); break; default: WPRINTF("Unsupported cmd:%02x\n", cfis[2]); - p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - p->is |= AHCI_P_IX_TFE; - ahci_generate_intr(p->pr_sc); + ahci_write_fis_d2h(p, slot, cfis, + (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR); break; } } @@ -1411,19 +1799,25 @@ static void ahci_handle_slot(struct ahci_port *p, int slot) { struct ahci_cmd_hdr *hdr; +#ifdef AHCI_DEBUG struct ahci_prdt_entry *prdt; +#endif struct pci_ahci_softc *sc; uint8_t *cfis; - int cfl; +#ifdef AHCI_DEBUG + int cfl, i; +#endif sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); +#ifdef AHCI_DEBUG cfl = (hdr->flags & 0x1f) * 4; +#endif cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba, 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry)); +#ifdef AHCI_DEBUG prdt = (struct ahci_prdt_entry *)(cfis + 0x80); -#ifdef AHCI_DEBUG DPRINTF("\ncfis:"); for (i = 0; i < cfl; i++) { if (i % 10 == 0) @@ -1459,20 +1853,23 @@ ahci_handle_slot(struct ahci_port *p, int slot) static void ahci_handle_port(struct ahci_port *p) { - int i; if (!(p->cmd & AHCI_P_CMD_ST)) return; /* * Search for any new commands to issue ignoring those that - * are already in-flight. + * are already in-flight. Stop if device is busy or in error. 
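+ * The scan is round-robin: it resumes from the saved command slot
+ * (p->ccs) rather than always restarting at slot 0.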
*/ - for (i = 0; (i < 32) && p->ci; i++) { - if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) { + for (; (p->ci & ~p->pending) != 0; p->ccs = ((p->ccs + 1) & 31)) { + if ((p->tfd & (ATA_S_BUSY | ATA_S_DRQ)) != 0) + break; + if (p->waitforclear) + break; + if ((p->ci & ~p->pending & (1 << p->ccs)) != 0) { p->cmd &= ~AHCI_P_CMD_CCS_MASK; - p->cmd |= i << AHCI_P_CMD_CCS_SHIFT; - ahci_handle_slot(p, i); + p->cmd |= p->ccs << AHCI_P_CMD_CCS_SHIFT; + ahci_handle_slot(p, p->ccs); } } } @@ -1490,22 +1887,26 @@ ata_ioreq_cb(struct blockif_req *br, int err) struct pci_ahci_softc *sc; uint32_t tfd; uint8_t *cfis; - int pending, slot, ncq; + int slot, ncq, dsm; DPRINTF("%s %d\n", __func__, err); - ncq = 0; + ncq = dsm = 0; aior = br->br_param; p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; - pending = aior->prdtl; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE); if (cfis[2] == ATA_WRITE_FPDMA_QUEUED || - cfis[2] == ATA_READ_FPDMA_QUEUED) + cfis[2] == ATA_READ_FPDMA_QUEUED || + cfis[2] == ATA_SEND_FPDMA_QUEUED) ncq = 1; + if (cfis[2] == ATA_DATA_SET_MANAGEMENT || + (cfis[2] == ATA_SEND_FPDMA_QUEUED && + (cfis[13] & 0x1f) == ATA_SFPDMA_DSM)) + dsm = 1; pthread_mutex_lock(&sc->mtx); @@ -1519,29 +1920,24 @@ ata_ioreq_cb(struct blockif_req *br, int err) */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); - if (pending && !err) { - ahci_handle_dma(p, slot, cfis, aior->done, - hdr->prdtl - pending); + if (!err) + hdr->prdbc = aior->done; + + if (!err && aior->more) { + if (dsm) + ahci_handle_dsm_trim(p, slot, cfis, aior->done); + else + ahci_handle_rw(p, slot, cfis, aior->done); goto out; } - if (!err && aior->done == aior->len) { + if (!err) tfd = ATA_S_READY | ATA_S_DSC; - if (ncq) - hdr->prdbc = 0; - else - hdr->prdbc = aior->len; - } else { + else tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR; - hdr->prdbc = 0; - if (ncq) - p->serr |= (1 << slot); - } - - if (ncq) { - p->sact &= ~(1 << slot); - ahci_write_fis_sdb(p, slot, tfd); - } else + if (ncq) + ahci_write_fis_sdb(p, slot, cfis, tfd); + else ahci_write_fis_d2h(p, slot, cfis, tfd); /* @@ -1550,6 +1946,7 @@ ata_ioreq_cb(struct blockif_req *br, int err) p->pending &= ~(1 << slot); ahci_check_stopped(p); + ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit\n", __func__); @@ -1564,7 +1961,7 @@ atapi_ioreq_cb(struct blockif_req *br, int err) struct pci_ahci_softc *sc; uint8_t *cfis; uint32_t tfd; - int pending, slot; + int slot; DPRINTF("%s %d\n", __func__, err); @@ -1572,7 +1969,6 @@ atapi_ioreq_cb(struct blockif_req *br, int err) p = aior->io_pr; cfis = aior->cfis; slot = aior->slot; - pending = aior->prdtl; sc = p->pr_sc; hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE); @@ -1588,21 +1984,21 @@ atapi_ioreq_cb(struct blockif_req *br, int err) */ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist); - if (pending && !err) { - atapi_read(p, slot, cfis, aior->done, hdr->prdtl - pending); + if (!err) + hdr->prdbc = aior->done; + + if (!err && aior->more) { + atapi_read(p, slot, cfis, aior->done); goto out; } - if (!err && aior->done == aior->len) { + if (!err) { tfd = ATA_S_READY | ATA_S_DSC; - hdr->prdbc = aior->len; } else { p->sense_key = ATA_SENSE_ILLEGAL_REQUEST; p->asc = 0x21; tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR; - hdr->prdbc = 0; } - cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN; ahci_write_fis_d2h(p, slot, cfis, tfd); @@ -1612,6 +2008,7 @@ atapi_ioreq_cb(struct blockif_req *br, int err) p->pending &= ~(1 << slot); ahci_check_stopped(p); + 
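/*
+	 * Re-run the port state machine: completing this request may have
+	 * cleared the busy status that was blocking further command issue.
+	 */
+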
ahci_handle_port(p); out: pthread_mutex_unlock(&sc->mtx); DPRINTF("%s exit\n", __func__); @@ -1669,15 +2066,23 @@ pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) break; case AHCI_P_IS: p->is &= ~value; + ahci_port_intr(p); break; case AHCI_P_IE: p->ie = value & 0xFDC000FF; - ahci_generate_intr(sc); + ahci_port_intr(p); break; case AHCI_P_CMD: { - p->cmd = value; - + p->cmd &= ~(AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | + AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | + AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | + AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK); + p->cmd |= (AHCI_P_CMD_ST | AHCI_P_CMD_SUD | AHCI_P_CMD_POD | + AHCI_P_CMD_CLO | AHCI_P_CMD_FRE | AHCI_P_CMD_APSTE | + AHCI_P_CMD_ATAPI | AHCI_P_CMD_DLAE | AHCI_P_CMD_ALPE | + AHCI_P_CMD_ASP | AHCI_P_CMD_ICC_MASK) & value; + if (!(value & AHCI_P_CMD_ST)) { ahci_port_stop(p); } else { @@ -1701,10 +2106,14 @@ pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) } if (value & AHCI_P_CMD_CLO) { - p->tfd = 0; + p->tfd &= ~(ATA_S_BUSY | ATA_S_DRQ); p->cmd &= ~AHCI_P_CMD_CLO; } + if (value & AHCI_P_CMD_ICC_MASK) { + p->cmd &= ~AHCI_P_CMD_ICC_MASK; + } + ahci_handle_port(p); break; } @@ -1714,10 +2123,10 @@ pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"\n", offset); break; case AHCI_P_SCTL: + p->sctl = value; if (!(p->cmd & AHCI_P_CMD_ST)) { if (value & ATA_SC_DET_RESET) ahci_port_reset(p); - p->sctl = value; } break; case AHCI_P_SERR: @@ -1751,16 +2160,19 @@ pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value) DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n", offset); break; case AHCI_GHC: - if (value & AHCI_GHC_HR) + if (value & AHCI_GHC_HR) { ahci_reset(sc); - else if (value & AHCI_GHC_IE) { - sc->ghc |= AHCI_GHC_IE; - ahci_generate_intr(sc); + break; } + if (value & AHCI_GHC_IE) + sc->ghc |= AHCI_GHC_IE; + else + sc->ghc &= ~AHCI_GHC_IE; + ahci_generate_intr(sc, 0xffffffff); break; case AHCI_IS: sc->is &= ~value; - ahci_generate_intr(sc); + ahci_generate_intr(sc, value); break; default: break; @@ -1774,7 +2186,7 @@ pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, struct pci_ahci_softc *sc = pi->pi_arg; assert(baridx == 5); - assert(size == 4); + assert((offset % 4) == 0 && size == 4); pthread_mutex_lock(&sc->mtx); @@ -1863,24 +2275,29 @@ pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset) static uint64_t pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, - uint64_t offset, int size) + uint64_t regoff, int size) { struct pci_ahci_softc *sc = pi->pi_arg; + uint64_t offset; uint32_t value; assert(baridx == 5); - assert(size == 4); + assert(size == 1 || size == 2 || size == 4); + assert((regoff & (size - 1)) == 0); pthread_mutex_lock(&sc->mtx); + offset = regoff & ~0x3; /* round down to a multiple of 4 bytes */ if (offset < AHCI_OFFSET) value = pci_ahci_host_read(sc, offset); else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP) value = pci_ahci_port_read(sc, offset); else { value = 0; - WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", offset); + WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", + regoff); } + value >>= 8 * (regoff & 0x3); pthread_mutex_unlock(&sc->mtx); @@ -1890,18 +2307,16 @@ pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, static int pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) { - 
char bident[sizeof("XX:X:X")]; + char bident[sizeof("XX:XX:XX")]; struct blockif_ctxt *bctxt; struct pci_ahci_softc *sc; - int ret, slots; + int ret, slots, p; + MD5_CTX mdctx; + u_char digest[16]; + char *next, *next2; ret = 0; - if (opts == NULL) { - fprintf(stderr, "pci_ahci: backing device required\n"); - return (1); - } - #ifdef AHCI_DEBUG dbg = fopen("/tmp/log", "w+"); #endif @@ -1909,48 +2324,96 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) sc = calloc(1, sizeof(struct pci_ahci_softc)); pi->pi_arg = sc; sc->asc_pi = pi; - sc->ports = MAX_PORTS; + pthread_mutex_init(&sc->mtx, NULL); + sc->ports = 0; + sc->pi = 0; + slots = 32; + + for (p = 0; p < MAX_PORTS && opts != NULL; p++, opts = next) { + /* Identify and cut off type of present port. */ + if (strncmp(opts, "hd:", 3) == 0) { + atapi = 0; + opts += 3; + } else if (strncmp(opts, "cd:", 3) == 0) { + atapi = 1; + opts += 3; + } - /* - * Only use port 0 for a backing device. All other ports will be - * marked as unused - */ - sc->port[0].atapi = atapi; + /* Find and cut off the next port options. */ + next = strstr(opts, ",hd:"); + next2 = strstr(opts, ",cd:"); + if (next == NULL || (next2 != NULL && next2 < next)) + next = next2; + if (next != NULL) { + next[0] = 0; + next++; + } - /* - * Attempt to open the backing image. Use the PCI - * slot/func for the identifier string. - */ - snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); - bctxt = blockif_open(opts, bident); - if (bctxt == NULL) { - ret = 1; - goto open_fail; - } - sc->port[0].bctx = bctxt; - sc->port[0].pr_sc = sc; + if (opts[0] == 0) + continue; - /* - * Allocate blockif request structures and add them - * to the free list - */ - pci_ahci_ioreq_init(&sc->port[0]); + /* + * Attempt to open the backing image. Use the PCI slot/func + * and the port number for the identifier string. + */ + snprintf(bident, sizeof(bident), "%d:%d:%d", pi->pi_slot, + pi->pi_func, p); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { + sc->ports = p; + ret = 1; + goto open_fail; + } + sc->port[p].bctx = bctxt; + sc->port[p].pr_sc = sc; + sc->port[p].port = p; + sc->port[p].atapi = atapi; + +#ifndef __FreeBSD__ + /* + * Attempt to enable the write cache for this device, as the + * guest will issue FLUSH commands when it requires durability. + * + * Failure here is fine, since an always-sync device will not + * have an impact on correctness. + */ + (void) blockif_set_wce(bctxt, 1); +#endif - pthread_mutex_init(&sc->mtx, NULL); + /* + * Create an identifier for the backing file. 
+ * Use parts of the md5 sum of the filename + */ + MD5Init(&mdctx); + MD5Update(&mdctx, opts, strlen(opts)); + MD5Final(digest, &mdctx); + snprintf(sc->port[p].ident, AHCI_PORT_IDENT, + "BHYVE-%02X%02X-%02X%02X-%02X%02X", + digest[0], digest[1], digest[2], digest[3], digest[4], + digest[5]); + + /* + * Allocate blockif request structures and add them + * to the free list + */ + pci_ahci_ioreq_init(&sc->port[p]); + + sc->pi |= (1 << p); + if (sc->port[p].ioqsz < slots) + slots = sc->port[p].ioqsz; + } + sc->ports = p; /* Intel ICH8 AHCI */ - slots = sc->port[0].ioqsz; - if (slots > 32) - slots = 32; --slots; + if (sc->ports < DEF_PORTS) + sc->ports = DEF_PORTS; sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF | AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP | AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)| AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC | (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1); - /* Only port 0 implemented */ - sc->pi = 1; sc->vs = 0x10300; sc->cap2 = AHCI_CAP2_APST; ahci_reset(sc); @@ -1960,7 +2423,9 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA); pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0); - pci_emul_add_msicap(pi, 1); + p = MIN(sc->ports, 16); + p = flsl(p) - ((p & (p - 1)) ? 0 : 1); + pci_emul_add_msicap(pi, 1 << p); pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32, AHCI_OFFSET + sc->ports * AHCI_STEP); @@ -1968,7 +2433,10 @@ pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi) open_fail: if (ret) { - blockif_close(sc->port[0].bctx); + for (p = 0; p < sc->ports; p++) { + if (sc->port[p].bctx != NULL) + blockif_close(sc->port[p].bctx); + } free(sc); } @@ -1992,6 +2460,14 @@ pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) /* * Use separate emulation names to distinguish drive and atapi devices */ +struct pci_devemu pci_de_ahci = { + .pe_emu = "ahci", + .pe_init = pci_ahci_hd_init, + .pe_barwrite = pci_ahci_write, + .pe_barread = pci_ahci_read +}; +PCI_EMUL_SET(pci_de_ahci); + struct pci_devemu pci_de_ahci_hd = { .pe_emu = "ahci-hd", .pe_init = pci_ahci_hd_init, diff --git a/usr/src/cmd/bhyve/pci_e82545.c b/usr/src/cmd/bhyve/pci_e82545.c new file mode 100644 index 0000000000..e211b5cf9c --- /dev/null +++ b/usr/src/cmd/bhyve/pci_e82545.c @@ -0,0 +1,2418 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Alexander Motin <mav@FreeBSD.org> + * Copyright (c) 2015 Peter Grehan <grehan@freebsd.org> + * Copyright (c) 2013 Jeremiah Lott, Avere Systems + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/limits.h> +#include <sys/ioctl.h> +#include <sys/uio.h> +#include <net/ethernet.h> +#include <netinet/in.h> +#include <netinet/tcp.h> +#ifndef __FreeBSD__ +#include <sys/filio.h> +#endif + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <md5.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sysexits.h> +#include <unistd.h> +#include <pthread.h> +#include <pthread_np.h> + +#include "e1000_regs.h" +#include "e1000_defines.h" +#include "mii.h" + +#include "bhyverun.h" +#include "pci_emul.h" +#include "mevent.h" + +/* Hardware/register definitions XXX: move some to common code. */ +#define E82545_VENDOR_ID_INTEL 0x8086 +#define E82545_DEV_ID_82545EM_COPPER 0x100F +#define E82545_SUBDEV_ID 0x1008 + +#define E82545_REVISION_4 4 + +#define E82545_MDIC_DATA_MASK 0x0000FFFF +#define E82545_MDIC_OP_MASK 0x0c000000 +#define E82545_MDIC_IE 0x20000000 + +#define E82545_EECD_FWE_DIS 0x00000010 /* Flash writes disabled */ +#define E82545_EECD_FWE_EN 0x00000020 /* Flash writes enabled */ +#define E82545_EECD_FWE_MASK 0x00000030 /* Flash writes mask */ + +#define E82545_BAR_REGISTER 0 +#define E82545_BAR_REGISTER_LEN (128*1024) +#define E82545_BAR_FLASH 1 +#define E82545_BAR_FLASH_LEN (64*1024) +#define E82545_BAR_IO 2 +#define E82545_BAR_IO_LEN 8 + +#define E82545_IOADDR 0x00000000 +#define E82545_IODATA 0x00000004 +#define E82545_IO_REGISTER_MAX 0x0001FFFF +#define E82545_IO_FLASH_BASE 0x00080000 +#define E82545_IO_FLASH_MAX 0x000FFFFF + +#define E82545_ARRAY_ENTRY(reg, offset) (reg + (offset<<2)) +#define E82545_RAR_MAX 15 +#define E82545_MTA_MAX 127 +#define E82545_VFTA_MAX 127 + +/* Slightly modified from the driver versions, hardcoded for 3 opcode bits, + * followed by 6 address bits. + * TODO: make opcode bits and addr bits configurable? + * NVM Commands - Microwire */ +#define E82545_NVM_OPCODE_BITS 3 +#define E82545_NVM_ADDR_BITS 6 +#define E82545_NVM_DATA_BITS 16 +#define E82545_NVM_OPADDR_BITS (E82545_NVM_OPCODE_BITS + E82545_NVM_ADDR_BITS) +#define E82545_NVM_ADDR_MASK ((1 << E82545_NVM_ADDR_BITS)-1) +#define E82545_NVM_OPCODE_MASK \ + (((1 << E82545_NVM_OPCODE_BITS) - 1) << E82545_NVM_ADDR_BITS) +#define E82545_NVM_OPCODE_READ (0x6 << E82545_NVM_ADDR_BITS) /* read */ +#define E82545_NVM_OPCODE_WRITE (0x5 << E82545_NVM_ADDR_BITS) /* write */ +#define E82545_NVM_OPCODE_ERASE (0x7 << E82545_NVM_ADDR_BITS) /* erase */ +#define E82545_NVM_OPCODE_EWEN (0x4 << E82545_NVM_ADDR_BITS) /* wr-enable */ + +#define E82545_NVM_EEPROM_SIZE 64 /* 64 * 16-bit values == 128K */ + +#define E1000_ICR_SRPD 0x00010000 + +/* This is an arbitrary number. There is no hard limit on the chip. 
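+ * It only bounds the scatter-gather list assembled for a single
+ * transmitted frame.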
*/
+#define I82545_MAX_TXSEGS	64
+
+/* Legacy receive descriptor */
+struct e1000_rx_desc {
+	uint64_t buffer_addr;	/* Address of the descriptor's data buffer */
+	uint16_t length;	/* Length of data DMAed into data buffer */
+	uint16_t csum;		/* Packet checksum */
+	uint8_t	status;		/* Descriptor status */
+	uint8_t	errors;		/* Descriptor Errors */
+	uint16_t special;
+};
+
+/* Transmit descriptor types */
+#define E1000_TXD_MASK		(E1000_TXD_CMD_DEXT | 0x00F00000)
+#define E1000_TXD_TYP_L		(0)
+#define E1000_TXD_TYP_C		(E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_C)
+#define E1000_TXD_TYP_D		(E1000_TXD_CMD_DEXT | E1000_TXD_DTYP_D)
+
+/* Legacy transmit descriptor */
+struct e1000_tx_desc {
+	uint64_t buffer_addr;	/* Address of the descriptor's data buffer */
+	union {
+		uint32_t data;
+		struct {
+			uint16_t length;	/* Data buffer length */
+			uint8_t cso;		/* Checksum offset */
+			uint8_t cmd;		/* Descriptor control */
+		} flags;
+	} lower;
+	union {
+		uint32_t data;
+		struct {
+			uint8_t status;		/* Descriptor status */
+			uint8_t css;		/* Checksum start */
+			uint16_t special;
+		} fields;
+	} upper;
+};
+
+/* Context descriptor */
+struct e1000_context_desc {
+	union {
+		uint32_t ip_config;
+		struct {
+			uint8_t ipcss;		/* IP checksum start */
+			uint8_t ipcso;		/* IP checksum offset */
+			uint16_t ipcse;		/* IP checksum end */
+		} ip_fields;
+	} lower_setup;
+	union {
+		uint32_t tcp_config;
+		struct {
+			uint8_t tucss;		/* TCP checksum start */
+			uint8_t tucso;		/* TCP checksum offset */
+			uint16_t tucse;		/* TCP checksum end */
+		} tcp_fields;
+	} upper_setup;
+	uint32_t cmd_and_length;
+	union {
+		uint32_t data;
+		struct {
+			uint8_t status;		/* Descriptor status */
+			uint8_t hdr_len;	/* Header length */
+			uint16_t mss;		/* Maximum segment size */
+		} fields;
+	} tcp_seg_setup;
+};
+
+/* Data descriptor */
+struct e1000_data_desc {
+	uint64_t buffer_addr;	/* Address of the descriptor's buffer address */
+	union {
+		uint32_t data;
+		struct {
+			uint16_t length;	/* Data buffer length */
+			uint8_t typ_len_ext;
+			uint8_t cmd;
+		} flags;
+	} lower;
+	union {
+		uint32_t data;
+		struct {
+			uint8_t status;		/* Descriptor status */
+			uint8_t popts;		/* Packet Options */
+			uint16_t special;
+		} fields;
+	} upper;
+};
+
+union e1000_tx_udesc {
+	struct e1000_tx_desc td;
+	struct e1000_context_desc cd;
+	struct e1000_data_desc dd;
+};
+
+/* Tx checksum info for a packet. */
+struct ck_info {
+	int	ck_valid;	/* ck_info is valid */
+	uint8_t	ck_start;	/* start byte of cksum calculation */
+	uint8_t	ck_off;		/* offset of cksum insertion */
+	uint16_t ck_len;	/* length of cksum calc: 0 is to packet-end */
+};
+
+/*
+ * Debug printf
+ */
+static int e82545_debug = 0;
+#define DPRINTF(msg,params...) if (e82545_debug) fprintf(stderr, "e82545: " msg, params)
+#define WPRINTF(msg,params...) \
fprintf(stderr, "e82545: " msg, params) + +#define MIN(a,b) (((a)<(b))?(a):(b)) +#define MAX(a,b) (((a)>(b))?(a):(b)) + +/* s/w representation of the RAL/RAH regs */ +struct eth_uni { + int eu_valid; + int eu_addrsel; + struct ether_addr eu_eth; +}; + + +struct e82545_softc { + struct pci_devinst *esc_pi; + struct vmctx *esc_ctx; + struct mevent *esc_mevp; + struct mevent *esc_mevpitr; + pthread_mutex_t esc_mtx; + struct ether_addr esc_mac; + int esc_tapfd; + + /* General */ + uint32_t esc_CTRL; /* x0000 device ctl */ + uint32_t esc_FCAL; /* x0028 flow ctl addr lo */ + uint32_t esc_FCAH; /* x002C flow ctl addr hi */ + uint32_t esc_FCT; /* x0030 flow ctl type */ + uint32_t esc_VET; /* x0038 VLAN eth type */ + uint32_t esc_FCTTV; /* x0170 flow ctl tx timer */ + uint32_t esc_LEDCTL; /* x0E00 LED control */ + uint32_t esc_PBA; /* x1000 pkt buffer allocation */ + + /* Interrupt control */ + int esc_irq_asserted; + uint32_t esc_ICR; /* x00C0 cause read/clear */ + uint32_t esc_ITR; /* x00C4 intr throttling */ + uint32_t esc_ICS; /* x00C8 cause set */ + uint32_t esc_IMS; /* x00D0 mask set/read */ + uint32_t esc_IMC; /* x00D8 mask clear */ + + /* Transmit */ + union e1000_tx_udesc *esc_txdesc; + struct e1000_context_desc esc_txctx; + pthread_t esc_tx_tid; + pthread_cond_t esc_tx_cond; + int esc_tx_enabled; + int esc_tx_active; + uint32_t esc_TXCW; /* x0178 transmit config */ + uint32_t esc_TCTL; /* x0400 transmit ctl */ + uint32_t esc_TIPG; /* x0410 inter-packet gap */ + uint16_t esc_AIT; /* x0458 Adaptive Interframe Throttle */ + uint64_t esc_tdba; /* verified 64-bit desc table addr */ + uint32_t esc_TDBAL; /* x3800 desc table addr, low bits */ + uint32_t esc_TDBAH; /* x3804 desc table addr, hi 32-bits */ + uint32_t esc_TDLEN; /* x3808 # descriptors in bytes */ + uint16_t esc_TDH; /* x3810 desc table head idx */ + uint16_t esc_TDHr; /* internal read version of TDH */ + uint16_t esc_TDT; /* x3818 desc table tail idx */ + uint32_t esc_TIDV; /* x3820 intr delay */ + uint32_t esc_TXDCTL; /* x3828 desc control */ + uint32_t esc_TADV; /* x382C intr absolute delay */ + + /* L2 frame acceptance */ + struct eth_uni esc_uni[16]; /* 16 x unicast MAC addresses */ + uint32_t esc_fmcast[128]; /* Multicast filter bit-match */ + uint32_t esc_fvlan[128]; /* VLAN 4096-bit filter */ + + /* Receive */ + struct e1000_rx_desc *esc_rxdesc; + pthread_cond_t esc_rx_cond; + int esc_rx_enabled; + int esc_rx_active; + int esc_rx_loopback; + uint32_t esc_RCTL; /* x0100 receive ctl */ + uint32_t esc_FCRTL; /* x2160 flow cntl thresh, low */ + uint32_t esc_FCRTH; /* x2168 flow cntl thresh, hi */ + uint64_t esc_rdba; /* verified 64-bit desc table addr */ + uint32_t esc_RDBAL; /* x2800 desc table addr, low bits */ + uint32_t esc_RDBAH; /* x2804 desc table addr, hi 32-bits*/ + uint32_t esc_RDLEN; /* x2808 #descriptors */ + uint16_t esc_RDH; /* x2810 desc table head idx */ + uint16_t esc_RDT; /* x2818 desc table tail idx */ + uint32_t esc_RDTR; /* x2820 intr delay */ + uint32_t esc_RXDCTL; /* x2828 desc control */ + uint32_t esc_RADV; /* x282C intr absolute delay */ + uint32_t esc_RSRPD; /* x2C00 recv small packet detect */ + uint32_t esc_RXCSUM; /* x5000 receive cksum ctl */ + + /* IO Port register access */ + uint32_t io_addr; + + /* Shadow copy of MDIC */ + uint32_t mdi_control; + /* Shadow copy of EECD */ + uint32_t eeprom_control; + /* Latest NVM in/out */ + uint16_t nvm_data; + uint16_t nvm_opaddr; + /* stats */ + uint32_t missed_pkt_count; /* dropped for no room in rx queue */ + uint32_t pkt_rx_by_size[6]; + uint32_t 
pkt_tx_by_size[6]; + uint32_t good_pkt_rx_count; + uint32_t bcast_pkt_rx_count; + uint32_t mcast_pkt_rx_count; + uint32_t good_pkt_tx_count; + uint32_t bcast_pkt_tx_count; + uint32_t mcast_pkt_tx_count; + uint32_t oversize_rx_count; + uint32_t tso_tx_count; + uint64_t good_octets_rx; + uint64_t good_octets_tx; + uint64_t missed_octets; /* counts missed and oversized */ + + uint8_t nvm_bits:6; /* number of bits remaining in/out */ + uint8_t nvm_mode:2; +#define E82545_NVM_MODE_OPADDR 0x0 +#define E82545_NVM_MODE_DATAIN 0x1 +#define E82545_NVM_MODE_DATAOUT 0x2 + /* EEPROM data */ + uint16_t eeprom_data[E82545_NVM_EEPROM_SIZE]; +}; + +static void e82545_reset(struct e82545_softc *sc, int dev); +static void e82545_rx_enable(struct e82545_softc *sc); +static void e82545_rx_disable(struct e82545_softc *sc); +#ifdef __FreeBSD__ +static void e82545_tap_callback(int fd, enum ev_type type, void *param); +#endif +static void e82545_tx_start(struct e82545_softc *sc); +static void e82545_tx_enable(struct e82545_softc *sc); +static void e82545_tx_disable(struct e82545_softc *sc); + +static inline int +e82545_size_stat_index(uint32_t size) +{ + if (size <= 64) { + return 0; + } else if (size >= 1024) { + return 5; + } else { + /* should be 1-4 */ + return (ffs(size) - 6); + } +} + +static void +e82545_init_eeprom(struct e82545_softc *sc) +{ + uint16_t checksum, i; + + /* mac addr */ + sc->eeprom_data[NVM_MAC_ADDR] = ((uint16_t)sc->esc_mac.octet[0]) | + (((uint16_t)sc->esc_mac.octet[1]) << 8); + sc->eeprom_data[NVM_MAC_ADDR+1] = ((uint16_t)sc->esc_mac.octet[2]) | + (((uint16_t)sc->esc_mac.octet[3]) << 8); + sc->eeprom_data[NVM_MAC_ADDR+2] = ((uint16_t)sc->esc_mac.octet[4]) | + (((uint16_t)sc->esc_mac.octet[5]) << 8); + + /* pci ids */ + sc->eeprom_data[NVM_SUB_DEV_ID] = E82545_SUBDEV_ID; + sc->eeprom_data[NVM_SUB_VEN_ID] = E82545_VENDOR_ID_INTEL; + sc->eeprom_data[NVM_DEV_ID] = E82545_DEV_ID_82545EM_COPPER; + sc->eeprom_data[NVM_VEN_ID] = E82545_VENDOR_ID_INTEL; + + /* fill in the checksum */ + checksum = 0; + for (i = 0; i < NVM_CHECKSUM_REG; i++) { + checksum += sc->eeprom_data[i]; + } + checksum = NVM_SUM - checksum; + sc->eeprom_data[NVM_CHECKSUM_REG] = checksum; + DPRINTF("eeprom checksum: 0x%x\r\n", checksum); +} + +static void +e82545_write_mdi(struct e82545_softc *sc, uint8_t reg_addr, + uint8_t phy_addr, uint32_t data) +{ + DPRINTF("Write mdi reg:0x%x phy:0x%x data: 0x%x\r\n", reg_addr, phy_addr, data); +} + +static uint32_t +e82545_read_mdi(struct e82545_softc *sc, uint8_t reg_addr, + uint8_t phy_addr) +{ + //DPRINTF("Read mdi reg:0x%x phy:0x%x\r\n", reg_addr, phy_addr); + switch (reg_addr) { + case PHY_STATUS: + return (MII_SR_LINK_STATUS | MII_SR_AUTONEG_CAPS | + MII_SR_AUTONEG_COMPLETE); + case PHY_AUTONEG_ADV: + return NWAY_AR_SELECTOR_FIELD; + case PHY_LP_ABILITY: + return 0; + case PHY_1000T_STATUS: + return (SR_1000T_LP_FD_CAPS | SR_1000T_REMOTE_RX_STATUS | + SR_1000T_LOCAL_RX_STATUS); + case PHY_ID1: + return (M88E1011_I_PHY_ID >> 16) & 0xFFFF; + case PHY_ID2: + return (M88E1011_I_PHY_ID | E82545_REVISION_4) & 0xFFFF; + default: + DPRINTF("Unknown mdi read reg:0x%x phy:0x%x\r\n", reg_addr, phy_addr); + return 0; + } + /* not reached */ +} + +static void +e82545_eecd_strobe(struct e82545_softc *sc) +{ + /* Microwire state machine */ + /* + DPRINTF("eeprom state machine srtobe " + "0x%x 0x%x 0x%x 0x%x\r\n", + sc->nvm_mode, sc->nvm_bits, + sc->nvm_opaddr, sc->nvm_data);*/ + + if (sc->nvm_bits == 0) { + DPRINTF("eeprom state machine not expecting data! 
" + "0x%x 0x%x 0x%x 0x%x\r\n", + sc->nvm_mode, sc->nvm_bits, + sc->nvm_opaddr, sc->nvm_data); + return; + } + sc->nvm_bits--; + if (sc->nvm_mode == E82545_NVM_MODE_DATAOUT) { + /* shifting out */ + if (sc->nvm_data & 0x8000) { + sc->eeprom_control |= E1000_EECD_DO; + } else { + sc->eeprom_control &= ~E1000_EECD_DO; + } + sc->nvm_data <<= 1; + if (sc->nvm_bits == 0) { + /* read done, back to opcode mode. */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + } + } else if (sc->nvm_mode == E82545_NVM_MODE_DATAIN) { + /* shifting in */ + sc->nvm_data <<= 1; + if (sc->eeprom_control & E1000_EECD_DI) { + sc->nvm_data |= 1; + } + if (sc->nvm_bits == 0) { + /* eeprom write */ + uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; + uint16_t addr = sc->nvm_opaddr & E82545_NVM_ADDR_MASK; + if (op != E82545_NVM_OPCODE_WRITE) { + DPRINTF("Illegal eeprom write op 0x%x\r\n", + sc->nvm_opaddr); + } else if (addr >= E82545_NVM_EEPROM_SIZE) { + DPRINTF("Illegal eeprom write addr 0x%x\r\n", + sc->nvm_opaddr); + } else { + DPRINTF("eeprom write eeprom[0x%x] = 0x%x\r\n", + addr, sc->nvm_data); + sc->eeprom_data[addr] = sc->nvm_data; + } + /* back to opcode mode */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + } + } else if (sc->nvm_mode == E82545_NVM_MODE_OPADDR) { + sc->nvm_opaddr <<= 1; + if (sc->eeprom_control & E1000_EECD_DI) { + sc->nvm_opaddr |= 1; + } + if (sc->nvm_bits == 0) { + uint16_t op = sc->nvm_opaddr & E82545_NVM_OPCODE_MASK; + switch (op) { + case E82545_NVM_OPCODE_EWEN: + DPRINTF("eeprom write enable: 0x%x\r\n", + sc->nvm_opaddr); + /* back to opcode mode */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + break; + case E82545_NVM_OPCODE_READ: + { + uint16_t addr = sc->nvm_opaddr & + E82545_NVM_ADDR_MASK; + sc->nvm_mode = E82545_NVM_MODE_DATAOUT; + sc->nvm_bits = E82545_NVM_DATA_BITS; + if (addr < E82545_NVM_EEPROM_SIZE) { + sc->nvm_data = sc->eeprom_data[addr]; + DPRINTF("eeprom read: eeprom[0x%x] = 0x%x\r\n", + addr, sc->nvm_data); + } else { + DPRINTF("eeprom illegal read: 0x%x\r\n", + sc->nvm_opaddr); + sc->nvm_data = 0; + } + break; + } + case E82545_NVM_OPCODE_WRITE: + sc->nvm_mode = E82545_NVM_MODE_DATAIN; + sc->nvm_bits = E82545_NVM_DATA_BITS; + sc->nvm_data = 0; + break; + default: + DPRINTF("eeprom unknown op: 0x%x\r\r", + sc->nvm_opaddr); + /* back to opcode mode */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + } + } + } else { + DPRINTF("eeprom state machine wrong state! 
" + "0x%x 0x%x 0x%x 0x%x\r\n", + sc->nvm_mode, sc->nvm_bits, + sc->nvm_opaddr, sc->nvm_data); + } +} + +#ifdef __FreeBSD__ +static void +e82545_itr_callback(int fd, enum ev_type type, void *param) +{ + uint32_t new; + struct e82545_softc *sc = param; + + pthread_mutex_lock(&sc->esc_mtx); + new = sc->esc_ICR & sc->esc_IMS; + if (new && !sc->esc_irq_asserted) { + DPRINTF("itr callback: lintr assert %x\r\n", new); + sc->esc_irq_asserted = 1; + pci_lintr_assert(sc->esc_pi); + } else { + mevent_delete(sc->esc_mevpitr); + sc->esc_mevpitr = NULL; + } + pthread_mutex_unlock(&sc->esc_mtx); +} +#endif + +static void +e82545_icr_assert(struct e82545_softc *sc, uint32_t bits) +{ + uint32_t new; + + DPRINTF("icr assert: 0x%x\r\n", bits); + + /* + * An interrupt is only generated if bits are set that + * aren't already in the ICR, these bits are unmasked, + * and there isn't an interrupt already pending. + */ + new = bits & ~sc->esc_ICR & sc->esc_IMS; + sc->esc_ICR |= bits; + + if (new == 0) { + DPRINTF("icr assert: masked %x, ims %x\r\n", new, sc->esc_IMS); + } else if (sc->esc_mevpitr != NULL) { + DPRINTF("icr assert: throttled %x, ims %x\r\n", new, sc->esc_IMS); + } else if (!sc->esc_irq_asserted) { + DPRINTF("icr assert: lintr assert %x\r\n", new); + sc->esc_irq_asserted = 1; + pci_lintr_assert(sc->esc_pi); + if (sc->esc_ITR != 0) { +#ifdef __FreeBSD__ + sc->esc_mevpitr = mevent_add( + (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ + EVF_TIMER, e82545_itr_callback, sc); +#endif + } + } +} + +static void +e82545_ims_change(struct e82545_softc *sc, uint32_t bits) +{ + uint32_t new; + + /* + * Changing the mask may allow previously asserted + * but masked interrupt requests to generate an interrupt. + */ + new = bits & sc->esc_ICR & ~sc->esc_IMS; + sc->esc_IMS |= bits; + + if (new == 0) { + DPRINTF("ims change: masked %x, ims %x\r\n", new, sc->esc_IMS); + } else if (sc->esc_mevpitr != NULL) { + DPRINTF("ims change: throttled %x, ims %x\r\n", new, sc->esc_IMS); + } else if (!sc->esc_irq_asserted) { + DPRINTF("ims change: lintr assert %x\n\r", new); + sc->esc_irq_asserted = 1; + pci_lintr_assert(sc->esc_pi); + if (sc->esc_ITR != 0) { +#ifdef __FreeBSD__ + sc->esc_mevpitr = mevent_add( + (sc->esc_ITR + 3905) / 3906, /* 256ns -> 1ms */ + EVF_TIMER, e82545_itr_callback, sc); +#endif + } + } +} + +static void +e82545_icr_deassert(struct e82545_softc *sc, uint32_t bits) +{ + + DPRINTF("icr deassert: 0x%x\r\n", bits); + sc->esc_ICR &= ~bits; + + /* + * If there are no longer any interrupt sources and there + * was an asserted interrupt, clear it + */ + if (sc->esc_irq_asserted && !(sc->esc_ICR & sc->esc_IMS)) { + DPRINTF("icr deassert: lintr deassert %x\r\n", bits); + pci_lintr_deassert(sc->esc_pi); + sc->esc_irq_asserted = 0; + } +} + +static void +e82545_intr_write(struct e82545_softc *sc, uint32_t offset, uint32_t value) +{ + + DPRINTF("intr_write: off %x, val %x\n\r", offset, value); + + switch (offset) { + case E1000_ICR: + e82545_icr_deassert(sc, value); + break; + case E1000_ITR: + sc->esc_ITR = value; + break; + case E1000_ICS: + sc->esc_ICS = value; /* not used: store for debug */ + e82545_icr_assert(sc, value); + break; + case E1000_IMS: + e82545_ims_change(sc, value); + break; + case E1000_IMC: + sc->esc_IMC = value; /* for debug */ + sc->esc_IMS &= ~value; + // XXX clear interrupts if all ICR bits now masked + // and interrupt was pending ? 
+ break; + default: + break; + } +} + +static uint32_t +e82545_intr_read(struct e82545_softc *sc, uint32_t offset) +{ + uint32_t retval; + + retval = 0; + + DPRINTF("intr_read: off %x\n\r", offset); + + switch (offset) { + case E1000_ICR: + retval = sc->esc_ICR; + sc->esc_ICR = 0; + e82545_icr_deassert(sc, ~0); + break; + case E1000_ITR: + retval = sc->esc_ITR; + break; + case E1000_ICS: + /* write-only register */ + break; + case E1000_IMS: + retval = sc->esc_IMS; + break; + case E1000_IMC: + /* write-only register */ + break; + default: + break; + } + + return (retval); +} + +static void +e82545_devctl(struct e82545_softc *sc, uint32_t val) +{ + + sc->esc_CTRL = val & ~E1000_CTRL_RST; + + if (val & E1000_CTRL_RST) { + DPRINTF("e1k: s/w reset, ctl %x\n", val); + e82545_reset(sc, 1); + } + /* XXX check for phy reset ? */ +} + +static void +e82545_rx_update_rdba(struct e82545_softc *sc) +{ + + /* XXX verify desc base/len within phys mem range */ + sc->esc_rdba = (uint64_t)sc->esc_RDBAH << 32 | + sc->esc_RDBAL; + + /* Cache host mapping of guest descriptor array */ + sc->esc_rxdesc = paddr_guest2host(sc->esc_ctx, + sc->esc_rdba, sc->esc_RDLEN); +} + +static void +e82545_rx_ctl(struct e82545_softc *sc, uint32_t val) +{ + int on; + + on = ((val & E1000_RCTL_EN) == E1000_RCTL_EN); + + /* Save RCTL after stripping reserved bits 31:27,24,21,14,11:10,0 */ + sc->esc_RCTL = val & ~0xF9204c01; + + DPRINTF("rx_ctl - %s RCTL %x, val %x\n", + on ? "on" : "off", sc->esc_RCTL, val); + + /* state change requested */ + if (on != sc->esc_rx_enabled) { + if (on) { + /* Catch disallowed/unimplemented settings */ + //assert(!(val & E1000_RCTL_LBM_TCVR)); + + if (sc->esc_RCTL & E1000_RCTL_LBM_TCVR) { + sc->esc_rx_loopback = 1; + } else { + sc->esc_rx_loopback = 0; + } + + e82545_rx_update_rdba(sc); + e82545_rx_enable(sc); + } else { + e82545_rx_disable(sc); + sc->esc_rx_loopback = 0; + sc->esc_rdba = 0; + sc->esc_rxdesc = NULL; + } + } +} + +static void +e82545_tx_update_tdba(struct e82545_softc *sc) +{ + + /* XXX verify desc base/len within phys mem range */ + sc->esc_tdba = (uint64_t)sc->esc_TDBAH << 32 | sc->esc_TDBAL; + + /* Cache host mapping of guest descriptor array */ + sc->esc_txdesc = paddr_guest2host(sc->esc_ctx, sc->esc_tdba, + sc->esc_TDLEN); +} + +static void +e82545_tx_ctl(struct e82545_softc *sc, uint32_t val) +{ + int on; + + on = ((val & E1000_TCTL_EN) == E1000_TCTL_EN); + + /* ignore TCTL_EN settings that don't change state */ + if (on == sc->esc_tx_enabled) + return; + + if (on) { + e82545_tx_update_tdba(sc); + e82545_tx_enable(sc); + } else { + e82545_tx_disable(sc); + sc->esc_tdba = 0; + sc->esc_txdesc = NULL; + } + + /* Save TCTL value after stripping reserved bits 31:25,23,2,0 */ + sc->esc_TCTL = val & ~0xFE800005; +} + +int +e82545_bufsz(uint32_t rctl) +{ + + switch (rctl & (E1000_RCTL_BSEX | E1000_RCTL_SZ_256)) { + case (E1000_RCTL_SZ_2048): return (2048); + case (E1000_RCTL_SZ_1024): return (1024); + case (E1000_RCTL_SZ_512): return (512); + case (E1000_RCTL_SZ_256): return (256); + case (E1000_RCTL_BSEX|E1000_RCTL_SZ_16384): return (16384); + case (E1000_RCTL_BSEX|E1000_RCTL_SZ_8192): return (8192); + case (E1000_RCTL_BSEX|E1000_RCTL_SZ_4096): return (4096); + } + return (256); /* Forbidden value. 
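+	 * (Editor's note: the one encoding not handled above is BSEX set
+	 * together with the BSIZE bits for 2048, which the 8254x
+	 * documentation reserves; returning the smallest legal size here
+	 * is a defensive fallback, not behaviour taken from real hardware.)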
*/ +} + +#ifdef __FreeBSD__ +static uint8_t dummybuf[2048]; + +/* XXX one packet at a time until this is debugged */ +static void +e82545_tap_callback(int fd, enum ev_type type, void *param) +{ + struct e82545_softc *sc = param; + struct e1000_rx_desc *rxd; + struct iovec vec[64]; + int left, len, lim, maxpktsz, maxpktdesc, bufsz, i, n, size; + uint32_t cause = 0; + uint16_t *tp, tag, head; + + pthread_mutex_lock(&sc->esc_mtx); + DPRINTF("rx_run: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT); + + if (!sc->esc_rx_enabled || sc->esc_rx_loopback) { + DPRINTF("rx disabled (!%d || %d) -- packet(s) dropped\r\n", + sc->esc_rx_enabled, sc->esc_rx_loopback); + while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) { + } + goto done1; + } + bufsz = e82545_bufsz(sc->esc_RCTL); + maxpktsz = (sc->esc_RCTL & E1000_RCTL_LPE) ? 16384 : 1522; + maxpktdesc = (maxpktsz + bufsz - 1) / bufsz; + size = sc->esc_RDLEN / 16; + head = sc->esc_RDH; + left = (size + sc->esc_RDT - head) % size; + if (left < maxpktdesc) { + DPRINTF("rx overflow (%d < %d) -- packet(s) dropped\r\n", + left, maxpktdesc); + while (read(sc->esc_tapfd, dummybuf, sizeof(dummybuf)) > 0) { + } + goto done1; + } + + sc->esc_rx_active = 1; + pthread_mutex_unlock(&sc->esc_mtx); + + for (lim = size / 4; lim > 0 && left >= maxpktdesc; lim -= n) { + + /* Grab rx descriptor pointed to by the head pointer */ + for (i = 0; i < maxpktdesc; i++) { + rxd = &sc->esc_rxdesc[(head + i) % size]; + vec[i].iov_base = paddr_guest2host(sc->esc_ctx, + rxd->buffer_addr, bufsz); + vec[i].iov_len = bufsz; + } + len = readv(sc->esc_tapfd, vec, maxpktdesc); + if (len <= 0) { + DPRINTF("tap: readv() returned %d\n", len); + goto done; + } + + /* + * Adjust the packet length based on whether the CRC needs + * to be stripped or if the packet is less than the minimum + * eth packet size. + */ + if (len < ETHER_MIN_LEN - ETHER_CRC_LEN) + len = ETHER_MIN_LEN - ETHER_CRC_LEN; + if (!(sc->esc_RCTL & E1000_RCTL_SECRC)) + len += ETHER_CRC_LEN; + n = (len + bufsz - 1) / bufsz; + + DPRINTF("packet read %d bytes, %d segs, head %d\r\n", + len, n, head); + + /* Apply VLAN filter. */ + tp = (uint16_t *)vec[0].iov_base + 6; + if ((sc->esc_RCTL & E1000_RCTL_VFE) && + (ntohs(tp[0]) == sc->esc_VET)) { + tag = ntohs(tp[1]) & 0x0fff; + if ((sc->esc_fvlan[tag >> 5] & + (1 << (tag & 0x1f))) != 0) { + DPRINTF("known VLAN %d\r\n", tag); + } else { + DPRINTF("unknown VLAN %d\r\n", tag); + n = 0; + continue; + } + } + + /* Update all consumed descriptors. */ + for (i = 0; i < n - 1; i++) { + rxd = &sc->esc_rxdesc[(head + i) % size]; + rxd->length = bufsz; + rxd->csum = 0; + rxd->errors = 0; + rxd->special = 0; + rxd->status = E1000_RXD_STAT_DD; + } + rxd = &sc->esc_rxdesc[(head + i) % size]; + rxd->length = len % bufsz; + rxd->csum = 0; + rxd->errors = 0; + rxd->special = 0; + /* XXX signal no checksum for now */ + rxd->status = E1000_RXD_STAT_PIF | E1000_RXD_STAT_IXSM | + E1000_RXD_STAT_EOP | E1000_RXD_STAT_DD; + + /* Schedule receive interrupts. */ + if (len <= sc->esc_RSRPD) { + cause |= E1000_ICR_SRPD | E1000_ICR_RXT0; + } else { + /* XXX: RDRT and RADV timers should be here. 
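+			 * (Editor's note: on real hardware the RDTR timer,
+			 * presumably what "RDRT" refers to, delays RXT0 by a
+			 * programmable number of 1.024 usec units after a
+			 * packet lands, with RADV capping the total delay;
+			 * asserting RXT0 immediately trades that interrupt
+			 * coalescing for simplicity.)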
*/ + cause |= E1000_ICR_RXT0; + } + + head = (head + n) % size; + left -= n; + } + +done: + pthread_mutex_lock(&sc->esc_mtx); + sc->esc_rx_active = 0; + if (sc->esc_rx_enabled == 0) + pthread_cond_signal(&sc->esc_rx_cond); + + sc->esc_RDH = head; + /* Respect E1000_RCTL_RDMTS */ + left = (size + sc->esc_RDT - head) % size; + if (left < (size >> (((sc->esc_RCTL >> 8) & 3) + 1))) + cause |= E1000_ICR_RXDMT0; + /* Assert all accumulated interrupts. */ + if (cause != 0) + e82545_icr_assert(sc, cause); +done1: + DPRINTF("rx_run done: head %x, tail %x\r\n", sc->esc_RDH, sc->esc_RDT); + pthread_mutex_unlock(&sc->esc_mtx); +} +#endif + +static uint16_t +e82545_carry(uint32_t sum) +{ + + sum = (sum & 0xFFFF) + (sum >> 16); + if (sum > 0xFFFF) + sum -= 0xFFFF; + return (sum); +} + +static uint16_t +#ifdef __FreeBSD__ +e82545_buf_checksum(uint8_t *buf, int len) +#else +e82545_buf_checksum(caddr_t buf, int len) +#endif +{ + int i; + uint32_t sum = 0; + + /* Checksum all the pairs of bytes first... */ + for (i = 0; i < (len & ~1U); i += 2) + sum += *((u_int16_t *)(buf + i)); + + /* + * If there's a single byte left over, checksum it, too. + * Network byte order is big-endian, so the remaining byte is + * the high byte. + */ + if (i < len) + sum += htons(buf[i] << 8); + + return (e82545_carry(sum)); +} + +static uint16_t +e82545_iov_checksum(struct iovec *iov, int iovcnt, int off, int len) +{ + int now, odd; + uint32_t sum = 0, s; + + /* Skip completely unneeded vectors. */ + while (iovcnt > 0 && iov->iov_len <= off && off > 0) { + off -= iov->iov_len; + iov++; + iovcnt--; + } + + /* Calculate checksum of requested range. */ + odd = 0; + while (len > 0 && iovcnt > 0) { + now = MIN(len, iov->iov_len - off); + s = e82545_buf_checksum(iov->iov_base + off, now); + sum += odd ? (s << 8) : s; + odd ^= (now & 1); + len -= now; + off = 0; + iov++; + iovcnt--; + } + + return (e82545_carry(sum)); +} + +/* + * Return the transmit descriptor type. + */ +int +e82545_txdesc_type(uint32_t lower) +{ + int type; + + type = 0; + + if (lower & E1000_TXD_CMD_DEXT) + type = lower & E1000_TXD_MASK; + + return (type); +} + +static void +e82545_transmit_checksum(struct iovec *iov, int iovcnt, struct ck_info *ck) +{ + uint16_t cksum; + int cklen; + + DPRINTF("tx cksum: iovcnt/s/off/len %d/%d/%d/%d\r\n", + iovcnt, ck->ck_start, ck->ck_off, ck->ck_len); + cklen = ck->ck_len ? 
ck->ck_len - ck->ck_start + 1 : INT_MAX; + cksum = e82545_iov_checksum(iov, iovcnt, ck->ck_start, cklen); + *(uint16_t *)((uint8_t *)iov[0].iov_base + ck->ck_off) = ~cksum; +} + +static void +e82545_transmit_backend(struct e82545_softc *sc, struct iovec *iov, int iovcnt) +{ + + if (sc->esc_tapfd == -1) + return; + + (void) writev(sc->esc_tapfd, iov, iovcnt); +} + +static void +e82545_transmit_done(struct e82545_softc *sc, uint16_t head, uint16_t tail, + uint16_t dsize, int *tdwb) +{ + union e1000_tx_udesc *dsc; + + for ( ; head != tail; head = (head + 1) % dsize) { + dsc = &sc->esc_txdesc[head]; + if (dsc->td.lower.data & E1000_TXD_CMD_RS) { + dsc->td.upper.data |= E1000_TXD_STAT_DD; + *tdwb = 1; + } + } +} + +static int +e82545_transmit(struct e82545_softc *sc, uint16_t head, uint16_t tail, + uint16_t dsize, uint16_t *rhead, int *tdwb) +{ +#ifdef __FreeBSD__ + uint8_t *hdr, *hdrp; +#else + caddr_t hdr, hdrp; +#endif + struct iovec iovb[I82545_MAX_TXSEGS + 2]; + struct iovec tiov[I82545_MAX_TXSEGS + 2]; + struct e1000_context_desc *cd; + struct ck_info ckinfo[2]; + struct iovec *iov; + union e1000_tx_udesc *dsc; + int desc, dtype, len, ntype, iovcnt, tlen, hdrlen, vlen, tcp, tso; + int mss, paylen, seg, tiovcnt, left, now, nleft, nnow, pv, pvoff; + uint32_t tcpsum, tcpseq; + uint16_t ipcs, tcpcs, ipid, ohead; + + ckinfo[0].ck_valid = ckinfo[1].ck_valid = 0; + iovcnt = 0; + tlen = 0; + ntype = 0; + tso = 0; + ohead = head; + hdr = NULL; + + /* iovb[0/1] may be used for writable copy of headers. */ + iov = &iovb[2]; + + for (desc = 0; ; desc++, head = (head + 1) % dsize) { + if (head == tail) { + *rhead = head; + return (0); + } + dsc = &sc->esc_txdesc[head]; + dtype = e82545_txdesc_type(dsc->td.lower.data); + + if (desc == 0) { + switch (dtype) { + case E1000_TXD_TYP_C: + DPRINTF("tx ctxt desc idx %d: %016jx " + "%08x%08x\r\n", + head, dsc->td.buffer_addr, + dsc->td.upper.data, dsc->td.lower.data); + /* Save context and return */ + sc->esc_txctx = dsc->cd; + goto done; + case E1000_TXD_TYP_L: + DPRINTF("tx legacy desc idx %d: %08x%08x\r\n", + head, dsc->td.upper.data, dsc->td.lower.data); + /* + * legacy cksum start valid in first descriptor + */ + ntype = dtype; + ckinfo[0].ck_start = dsc->td.upper.fields.css; + break; + case E1000_TXD_TYP_D: + DPRINTF("tx data desc idx %d: %08x%08x\r\n", + head, dsc->td.upper.data, dsc->td.lower.data); + ntype = dtype; + break; + default: + break; + } + } else { + /* Descriptor type must be consistent */ + assert(dtype == ntype); + DPRINTF("tx next desc idx %d: %08x%08x\r\n", + head, dsc->td.upper.data, dsc->td.lower.data); + } + + len = (dtype == E1000_TXD_TYP_L) ? dsc->td.lower.flags.length : + dsc->dd.lower.data & 0xFFFFF; + + if (len > 0) { + /* Strip checksum supplied by guest. */ + if ((dsc->td.lower.data & E1000_TXD_CMD_EOP) != 0 && + (dsc->td.lower.data & E1000_TXD_CMD_IFCS) == 0) + len -= 2; + tlen += len; + if (iovcnt < I82545_MAX_TXSEGS) { + iov[iovcnt].iov_base = paddr_guest2host( + sc->esc_ctx, dsc->td.buffer_addr, len); + iov[iovcnt].iov_len = len; + } + iovcnt++; + } + + /* + * Pull out info that is valid in the final descriptor + * and exit descriptor loop. 
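+	 * (Editor's note: EOP marks the final descriptor of a frame; for
+	 * extended descriptors the checksum start/offset/end values below
+	 * are taken from the context descriptor cached in sc->esc_txctx,
+	 * not from the data descriptor itself.)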
+ */ + if (dsc->td.lower.data & E1000_TXD_CMD_EOP) { + if (dtype == E1000_TXD_TYP_L) { + if (dsc->td.lower.data & E1000_TXD_CMD_IC) { + ckinfo[0].ck_valid = 1; + ckinfo[0].ck_off = + dsc->td.lower.flags.cso; + ckinfo[0].ck_len = 0; + } + } else { + cd = &sc->esc_txctx; + if (dsc->dd.lower.data & E1000_TXD_CMD_TSE) + tso = 1; + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_IXSM) + ckinfo[0].ck_valid = 1; + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_IXSM || tso) { + ckinfo[0].ck_start = + cd->lower_setup.ip_fields.ipcss; + ckinfo[0].ck_off = + cd->lower_setup.ip_fields.ipcso; + ckinfo[0].ck_len = + cd->lower_setup.ip_fields.ipcse; + } + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_TXSM) + ckinfo[1].ck_valid = 1; + if (dsc->dd.upper.fields.popts & + E1000_TXD_POPTS_TXSM || tso) { + ckinfo[1].ck_start = + cd->upper_setup.tcp_fields.tucss; + ckinfo[1].ck_off = + cd->upper_setup.tcp_fields.tucso; + ckinfo[1].ck_len = + cd->upper_setup.tcp_fields.tucse; + } + } + break; + } + } + + if (iovcnt > I82545_MAX_TXSEGS) { + WPRINTF("tx too many descriptors (%d > %d) -- dropped\r\n", + iovcnt, I82545_MAX_TXSEGS); + goto done; + } + + hdrlen = vlen = 0; + /* Estimate writable space for VLAN header insertion. */ + if ((sc->esc_CTRL & E1000_CTRL_VME) && + (dsc->td.lower.data & E1000_TXD_CMD_VLE)) { + hdrlen = ETHER_ADDR_LEN*2; + vlen = ETHER_VLAN_ENCAP_LEN; + } + if (!tso) { + /* Estimate required writable space for checksums. */ + if (ckinfo[0].ck_valid) + hdrlen = MAX(hdrlen, ckinfo[0].ck_off + 2); + if (ckinfo[1].ck_valid) + hdrlen = MAX(hdrlen, ckinfo[1].ck_off + 2); + /* Round up writable space to the first vector. */ + if (hdrlen != 0 && iov[0].iov_len > hdrlen && + iov[0].iov_len < hdrlen + 100) + hdrlen = iov[0].iov_len; + } else { + /* In case of TSO header length provided by software. */ + hdrlen = sc->esc_txctx.tcp_seg_setup.fields.hdr_len; + } + + /* Allocate, fill and prepend writable header vector. */ + if (hdrlen != 0) { + hdr = __builtin_alloca(hdrlen + vlen); + hdr += vlen; + for (left = hdrlen, hdrp = hdr; left > 0; + left -= now, hdrp += now) { + now = MIN(left, iov->iov_len); + memcpy(hdrp, iov->iov_base, now); + iov->iov_base += now; + iov->iov_len -= now; + if (iov->iov_len == 0) { + iov++; + iovcnt--; + } + } + iov--; + iovcnt++; + iov->iov_base = hdr; + iov->iov_len = hdrlen; + } + + /* Insert VLAN tag. */ + if (vlen != 0) { + hdr -= ETHER_VLAN_ENCAP_LEN; + memmove(hdr, hdr + ETHER_VLAN_ENCAP_LEN, ETHER_ADDR_LEN*2); + hdrlen += ETHER_VLAN_ENCAP_LEN; + hdr[ETHER_ADDR_LEN*2 + 0] = sc->esc_VET >> 8; + hdr[ETHER_ADDR_LEN*2 + 1] = sc->esc_VET & 0xff; + hdr[ETHER_ADDR_LEN*2 + 2] = dsc->td.upper.fields.special >> 8; + hdr[ETHER_ADDR_LEN*2 + 3] = dsc->td.upper.fields.special & 0xff; + iov->iov_base = hdr; + iov->iov_len += ETHER_VLAN_ENCAP_LEN; + /* Correct checksum offsets after VLAN tag insertion. */ + ckinfo[0].ck_start += ETHER_VLAN_ENCAP_LEN; + ckinfo[0].ck_off += ETHER_VLAN_ENCAP_LEN; + if (ckinfo[0].ck_len != 0) + ckinfo[0].ck_len += ETHER_VLAN_ENCAP_LEN; + ckinfo[1].ck_start += ETHER_VLAN_ENCAP_LEN; + ckinfo[1].ck_off += ETHER_VLAN_ENCAP_LEN; + if (ckinfo[1].ck_len != 0) + ckinfo[1].ck_len += ETHER_VLAN_ENCAP_LEN; + } + + /* Simple non-TSO case. */ + if (!tso) { + /* Calculate checksums and transmit. */ + if (ckinfo[0].ck_valid) + e82545_transmit_checksum(iov, iovcnt, &ckinfo[0]); + if (ckinfo[1].ck_valid) + e82545_transmit_checksum(iov, iovcnt, &ckinfo[1]); + e82545_transmit_backend(sc, iov, iovcnt); + goto done; + } + + /* Doing TSO. 
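+	 * (Editor's worked example, with illustrative numbers: for a TCP
+	 * payload of 4096 bytes, an MSS of 1460 and a 54-byte header, the
+	 * loop below emits three segments carrying 1460, 1460 and 1176
+	 * payload bytes, each behind a copy of the header whose IP total
+	 * length, IP ID and TCP sequence number are advanced per segment.)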
*/ + tcp = (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_TCP) != 0; + mss = sc->esc_txctx.tcp_seg_setup.fields.mss; + paylen = (sc->esc_txctx.cmd_and_length & 0x000fffff); + DPRINTF("tx %s segmentation offload %d+%d/%d bytes %d iovs\r\n", + tcp ? "TCP" : "UDP", hdrlen, paylen, mss, iovcnt); + ipid = ntohs(*(uint16_t *)&hdr[ckinfo[0].ck_start + 4]); + tcpseq = ntohl(*(uint32_t *)&hdr[ckinfo[1].ck_start + 4]); + ipcs = *(uint16_t *)&hdr[ckinfo[0].ck_off]; + tcpcs = 0; + if (ckinfo[1].ck_valid) /* Save partial pseudo-header checksum. */ + tcpcs = *(uint16_t *)&hdr[ckinfo[1].ck_off]; + pv = 1; + pvoff = 0; + for (seg = 0, left = paylen; left > 0; seg++, left -= now) { + now = MIN(left, mss); + + /* Construct IOVs for the segment. */ + /* Include whole original header. */ + tiov[0].iov_base = hdr; + tiov[0].iov_len = hdrlen; + tiovcnt = 1; + /* Include respective part of payload IOV. */ + for (nleft = now; pv < iovcnt && nleft > 0; nleft -= nnow) { + nnow = MIN(nleft, iov[pv].iov_len - pvoff); + tiov[tiovcnt].iov_base = iov[pv].iov_base + pvoff; + tiov[tiovcnt++].iov_len = nnow; + if (pvoff + nnow == iov[pv].iov_len) { + pv++; + pvoff = 0; + } else + pvoff += nnow; + } + DPRINTF("tx segment %d %d+%d bytes %d iovs\r\n", + seg, hdrlen, now, tiovcnt); + + /* Update IP header. */ + if (sc->esc_txctx.cmd_and_length & E1000_TXD_CMD_IP) { + /* IPv4 -- set length and ID */ + *(uint16_t *)&hdr[ckinfo[0].ck_start + 2] = + htons(hdrlen - ckinfo[0].ck_start + now); + *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = + htons(ipid + seg); + } else { + /* IPv6 -- set length */ + *(uint16_t *)&hdr[ckinfo[0].ck_start + 4] = + htons(hdrlen - ckinfo[0].ck_start - 40 + + now); + } + + /* Update pseudo-header checksum. */ + tcpsum = tcpcs; + tcpsum += htons(hdrlen - ckinfo[1].ck_start + now); + + /* Update TCP/UDP headers. */ + if (tcp) { + /* Update sequence number and FIN/PUSH flags. */ + *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = + htonl(tcpseq + paylen - left); + if (now < left) { + hdr[ckinfo[1].ck_start + 13] &= + ~(TH_FIN | TH_PUSH); + } + } else { + /* Update payload length. */ + *(uint32_t *)&hdr[ckinfo[1].ck_start + 4] = + hdrlen - ckinfo[1].ck_start + now; + } + + /* Calculate checksums and transmit. 
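+		 * (Editor's note: ipcs and tcpcs hold the partial checksums
+		 * saved from the original header, so each pass restores them
+		 * first and then folds in the per-segment length through
+		 * e82545_carry() before writing the final values.)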
*/ + if (ckinfo[0].ck_valid) { + *(uint16_t *)&hdr[ckinfo[0].ck_off] = ipcs; + e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[0]); + } + if (ckinfo[1].ck_valid) { + *(uint16_t *)&hdr[ckinfo[1].ck_off] = + e82545_carry(tcpsum); + e82545_transmit_checksum(tiov, tiovcnt, &ckinfo[1]); + } + e82545_transmit_backend(sc, tiov, tiovcnt); + } + +done: + head = (head + 1) % dsize; + e82545_transmit_done(sc, ohead, head, dsize, tdwb); + + *rhead = head; + return (desc + 1); +} + +static void +e82545_tx_run(struct e82545_softc *sc) +{ + uint32_t cause; + uint16_t head, rhead, tail, size; + int lim, tdwb, sent; + + head = sc->esc_TDH; + tail = sc->esc_TDT; + size = sc->esc_TDLEN / 16; + DPRINTF("tx_run: head %x, rhead %x, tail %x\r\n", + sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); + + pthread_mutex_unlock(&sc->esc_mtx); + rhead = head; + tdwb = 0; + for (lim = size / 4; sc->esc_tx_enabled && lim > 0; lim -= sent) { + sent = e82545_transmit(sc, head, tail, size, &rhead, &tdwb); + if (sent == 0) + break; + head = rhead; + } + pthread_mutex_lock(&sc->esc_mtx); + + sc->esc_TDH = head; + sc->esc_TDHr = rhead; + cause = 0; + if (tdwb) + cause |= E1000_ICR_TXDW; + if (lim != size / 4 && sc->esc_TDH == sc->esc_TDT) + cause |= E1000_ICR_TXQE; + if (cause) + e82545_icr_assert(sc, cause); + + DPRINTF("tx_run done: head %x, rhead %x, tail %x\r\n", + sc->esc_TDH, sc->esc_TDHr, sc->esc_TDT); +} + +static void * +e82545_tx_thread(void *param) +{ + struct e82545_softc *sc = param; + + pthread_mutex_lock(&sc->esc_mtx); + for (;;) { + while (!sc->esc_tx_enabled || sc->esc_TDHr == sc->esc_TDT) { + if (sc->esc_tx_enabled && sc->esc_TDHr != sc->esc_TDT) + break; + sc->esc_tx_active = 0; + if (sc->esc_tx_enabled == 0) + pthread_cond_signal(&sc->esc_tx_cond); + pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); + } + sc->esc_tx_active = 1; + + /* Process some tx descriptors. Lock dropped inside. 
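+		 * (Editor's note: e82545_tx_run() drops esc_mtx around the
+		 * descriptor walk; the esc_tx_active flag and esc_tx_cond
+		 * are what let e82545_tx_disable() wait for this thread to
+		 * go idle before transmit teardown proceeds.)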
*/ + e82545_tx_run(sc); + } +#ifndef __FreeBSD__ + return (NULL); +#endif +} + +static void +e82545_tx_start(struct e82545_softc *sc) +{ + + if (sc->esc_tx_active == 0) + pthread_cond_signal(&sc->esc_tx_cond); +} + +static void +e82545_tx_enable(struct e82545_softc *sc) +{ + + sc->esc_tx_enabled = 1; +} + +static void +e82545_tx_disable(struct e82545_softc *sc) +{ + + sc->esc_tx_enabled = 0; + while (sc->esc_tx_active) + pthread_cond_wait(&sc->esc_tx_cond, &sc->esc_mtx); +} + +static void +e82545_rx_enable(struct e82545_softc *sc) +{ + + sc->esc_rx_enabled = 1; +} + +static void +e82545_rx_disable(struct e82545_softc *sc) +{ + + sc->esc_rx_enabled = 0; + while (sc->esc_rx_active) + pthread_cond_wait(&sc->esc_rx_cond, &sc->esc_mtx); +} + +static void +e82545_write_ra(struct e82545_softc *sc, int reg, uint32_t wval) +{ + struct eth_uni *eu; + int idx; + + idx = reg >> 1; + assert(idx < 15); + + eu = &sc->esc_uni[idx]; + + if (reg & 0x1) { + /* RAH */ + eu->eu_valid = ((wval & E1000_RAH_AV) == E1000_RAH_AV); + eu->eu_addrsel = (wval >> 16) & 0x3; + eu->eu_eth.octet[5] = wval >> 8; + eu->eu_eth.octet[4] = wval; + } else { + /* RAL */ + eu->eu_eth.octet[3] = wval >> 24; + eu->eu_eth.octet[2] = wval >> 16; + eu->eu_eth.octet[1] = wval >> 8; + eu->eu_eth.octet[0] = wval; + } +} + +static uint32_t +e82545_read_ra(struct e82545_softc *sc, int reg) +{ + struct eth_uni *eu; + uint32_t retval; + int idx; + + idx = reg >> 1; + assert(idx < 15); + + eu = &sc->esc_uni[idx]; + + if (reg & 0x1) { + /* RAH */ + retval = (eu->eu_valid << 31) | + (eu->eu_addrsel << 16) | + (eu->eu_eth.octet[5] << 8) | + eu->eu_eth.octet[4]; + } else { + /* RAL */ + retval = (eu->eu_eth.octet[3] << 24) | + (eu->eu_eth.octet[2] << 16) | + (eu->eu_eth.octet[1] << 8) | + eu->eu_eth.octet[0]; + } + + return (retval); +} + +static void +e82545_write_register(struct e82545_softc *sc, uint32_t offset, uint32_t value) +{ + int ridx; + + if (offset & 0x3) { + DPRINTF("Unaligned register write offset:0x%x value:0x%x\r\n", offset, value); + return; + } + DPRINTF("Register write: 0x%x value: 0x%x\r\n", offset, value); + + switch (offset) { + case E1000_CTRL: + case E1000_CTRL_DUP: + e82545_devctl(sc, value); + break; + case E1000_FCAL: + sc->esc_FCAL = value; + break; + case E1000_FCAH: + sc->esc_FCAH = value & ~0xFFFF0000; + break; + case E1000_FCT: + sc->esc_FCT = value & ~0xFFFF0000; + break; + case E1000_VET: + sc->esc_VET = value & ~0xFFFF0000; + break; + case E1000_FCTTV: + sc->esc_FCTTV = value & ~0xFFFF0000; + break; + case E1000_LEDCTL: + sc->esc_LEDCTL = value & ~0x30303000; + break; + case E1000_PBA: + sc->esc_PBA = value & 0x0000FF80; + break; + case E1000_ICR: + case E1000_ITR: + case E1000_ICS: + case E1000_IMS: + case E1000_IMC: + e82545_intr_write(sc, offset, value); + break; + case E1000_RCTL: + e82545_rx_ctl(sc, value); + break; + case E1000_FCRTL: + sc->esc_FCRTL = value & ~0xFFFF0007; + break; + case E1000_FCRTH: + sc->esc_FCRTH = value & ~0xFFFF0007; + break; + case E1000_RDBAL(0): + sc->esc_RDBAL = value & ~0xF; + if (sc->esc_rx_enabled) { + /* Apparently legal: update cached address */ + e82545_rx_update_rdba(sc); + } + break; + case E1000_RDBAH(0): + assert(!sc->esc_rx_enabled); + sc->esc_RDBAH = value; + break; + case E1000_RDLEN(0): + assert(!sc->esc_rx_enabled); + sc->esc_RDLEN = value & ~0xFFF0007F; + break; + case E1000_RDH(0): + /* XXX should only ever be zero ? Range check ? */ + sc->esc_RDH = value; + break; + case E1000_RDT(0): + /* XXX if this opens up the rx ring, do something ? 
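+		 * (Editor's note: for the tap backend a kick appears
+		 * unnecessary, since the next readable event re-enters
+		 * e82545_tap_callback() and re-reads RDT; packets that
+		 * arrived while the ring was full have already been drained
+		 * into dummybuf and dropped, as on real hardware.)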
*/ + sc->esc_RDT = value; + break; + case E1000_RDTR: + /* ignore FPD bit 31 */ + sc->esc_RDTR = value & ~0xFFFF0000; + break; + case E1000_RXDCTL(0): + sc->esc_RXDCTL = value & ~0xFEC0C0C0; + break; + case E1000_RADV: + sc->esc_RADV = value & ~0xFFFF0000; + break; + case E1000_RSRPD: + sc->esc_RSRPD = value & ~0xFFFFF000; + break; + case E1000_RXCSUM: + sc->esc_RXCSUM = value & ~0xFFFFF800; + break; + case E1000_TXCW: + sc->esc_TXCW = value & ~0x3FFF0000; + break; + case E1000_TCTL: + e82545_tx_ctl(sc, value); + break; + case E1000_TIPG: + sc->esc_TIPG = value; + break; + case E1000_AIT: + sc->esc_AIT = value; + break; + case E1000_TDBAL(0): + sc->esc_TDBAL = value & ~0xF; + if (sc->esc_tx_enabled) { + /* Apparently legal */ + e82545_tx_update_tdba(sc); + } + break; + case E1000_TDBAH(0): + //assert(!sc->esc_tx_enabled); + sc->esc_TDBAH = value; + break; + case E1000_TDLEN(0): + //assert(!sc->esc_tx_enabled); + sc->esc_TDLEN = value & ~0xFFF0007F; + break; + case E1000_TDH(0): + //assert(!sc->esc_tx_enabled); + /* XXX should only ever be zero ? Range check ? */ + sc->esc_TDHr = sc->esc_TDH = value; + break; + case E1000_TDT(0): + /* XXX range check ? */ + sc->esc_TDT = value; + if (sc->esc_tx_enabled) + e82545_tx_start(sc); + break; + case E1000_TIDV: + sc->esc_TIDV = value & ~0xFFFF0000; + break; + case E1000_TXDCTL(0): + //assert(!sc->esc_tx_enabled); + sc->esc_TXDCTL = value & ~0xC0C0C0; + break; + case E1000_TADV: + sc->esc_TADV = value & ~0xFFFF0000; + break; + case E1000_RAL(0) ... E1000_RAH(15): + /* convert to u32 offset */ + ridx = (offset - E1000_RAL(0)) >> 2; + e82545_write_ra(sc, ridx, value); + break; + case E1000_MTA ... (E1000_MTA + (127*4)): + sc->esc_fmcast[(offset - E1000_MTA) >> 2] = value; + break; + case E1000_VFTA ... (E1000_VFTA + (127*4)): + sc->esc_fvlan[(offset - E1000_VFTA) >> 2] = value; + break; + case E1000_EECD: + { + //DPRINTF("EECD write 0x%x -> 0x%x\r\n", sc->eeprom_control, value); + /* edge triggered low->high */ + uint32_t eecd_strobe = ((sc->eeprom_control & E1000_EECD_SK) ? + 0 : (value & E1000_EECD_SK)); + uint32_t eecd_mask = (E1000_EECD_SK|E1000_EECD_CS| + E1000_EECD_DI|E1000_EECD_REQ); + sc->eeprom_control &= ~eecd_mask; + sc->eeprom_control |= (value & eecd_mask); + /* grant/revoke immediately */ + if (value & E1000_EECD_REQ) { + sc->eeprom_control |= E1000_EECD_GNT; + } else { + sc->eeprom_control &= ~E1000_EECD_GNT; + } + if (eecd_strobe && (sc->eeprom_control & E1000_EECD_CS)) { + e82545_eecd_strobe(sc); + } + return; + } + case E1000_MDIC: + { + uint8_t reg_addr = (uint8_t)((value & E1000_MDIC_REG_MASK) >> + E1000_MDIC_REG_SHIFT); + uint8_t phy_addr = (uint8_t)((value & E1000_MDIC_PHY_MASK) >> + E1000_MDIC_PHY_SHIFT); + sc->mdi_control = + (value & ~(E1000_MDIC_ERROR|E1000_MDIC_DEST)); + if ((value & E1000_MDIC_READY) != 0) { + DPRINTF("Incorrect MDIC ready bit: 0x%x\r\n", value); + return; + } + switch (value & E82545_MDIC_OP_MASK) { + case E1000_MDIC_OP_READ: + sc->mdi_control &= ~E82545_MDIC_DATA_MASK; + sc->mdi_control |= e82545_read_mdi(sc, reg_addr, phy_addr); + break; + case E1000_MDIC_OP_WRITE: + e82545_write_mdi(sc, reg_addr, phy_addr, + value & E82545_MDIC_DATA_MASK); + break; + default: + DPRINTF("Unknown MDIC op: 0x%x\r\n", value); + return; + } + /* TODO: barrier? 
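+		 * (Editor's note: the MDI operation above completes
+		 * synchronously while esc_mtx is held, so setting READY
+		 * immediately looks safe; a barrier would only matter if
+		 * mdi_control were polled without taking the lock.)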
*/ + sc->mdi_control |= E1000_MDIC_READY; + if (value & E82545_MDIC_IE) { + // TODO: generate interrupt + } + return; + } + case E1000_MANC: + case E1000_STATUS: + return; + default: + DPRINTF("Unknown write register: 0x%x value:%x\r\n", offset, value); + return; + } +} + +static uint32_t +e82545_read_register(struct e82545_softc *sc, uint32_t offset) +{ + uint32_t retval; + int ridx; + + if (offset & 0x3) { + DPRINTF("Unaligned register read offset:0x%x\r\n", offset); + return 0; + } + + DPRINTF("Register read: 0x%x\r\n", offset); + + switch (offset) { + case E1000_CTRL: + retval = sc->esc_CTRL; + break; + case E1000_STATUS: + retval = E1000_STATUS_FD | E1000_STATUS_LU | + E1000_STATUS_SPEED_1000; + break; + case E1000_FCAL: + retval = sc->esc_FCAL; + break; + case E1000_FCAH: + retval = sc->esc_FCAH; + break; + case E1000_FCT: + retval = sc->esc_FCT; + break; + case E1000_VET: + retval = sc->esc_VET; + break; + case E1000_FCTTV: + retval = sc->esc_FCTTV; + break; + case E1000_LEDCTL: + retval = sc->esc_LEDCTL; + break; + case E1000_PBA: + retval = sc->esc_PBA; + break; + case E1000_ICR: + case E1000_ITR: + case E1000_ICS: + case E1000_IMS: + case E1000_IMC: + retval = e82545_intr_read(sc, offset); + break; + case E1000_RCTL: + retval = sc->esc_RCTL; + break; + case E1000_FCRTL: + retval = sc->esc_FCRTL; + break; + case E1000_FCRTH: + retval = sc->esc_FCRTH; + break; + case E1000_RDBAL(0): + retval = sc->esc_RDBAL; + break; + case E1000_RDBAH(0): + retval = sc->esc_RDBAH; + break; + case E1000_RDLEN(0): + retval = sc->esc_RDLEN; + break; + case E1000_RDH(0): + retval = sc->esc_RDH; + break; + case E1000_RDT(0): + retval = sc->esc_RDT; + break; + case E1000_RDTR: + retval = sc->esc_RDTR; + break; + case E1000_RXDCTL(0): + retval = sc->esc_RXDCTL; + break; + case E1000_RADV: + retval = sc->esc_RADV; + break; + case E1000_RSRPD: + retval = sc->esc_RSRPD; + break; + case E1000_RXCSUM: + retval = sc->esc_RXCSUM; + break; + case E1000_TXCW: + retval = sc->esc_TXCW; + break; + case E1000_TCTL: + retval = sc->esc_TCTL; + break; + case E1000_TIPG: + retval = sc->esc_TIPG; + break; + case E1000_AIT: + retval = sc->esc_AIT; + break; + case E1000_TDBAL(0): + retval = sc->esc_TDBAL; + break; + case E1000_TDBAH(0): + retval = sc->esc_TDBAH; + break; + case E1000_TDLEN(0): + retval = sc->esc_TDLEN; + break; + case E1000_TDH(0): + retval = sc->esc_TDH; + break; + case E1000_TDT(0): + retval = sc->esc_TDT; + break; + case E1000_TIDV: + retval = sc->esc_TIDV; + break; + case E1000_TXDCTL(0): + retval = sc->esc_TXDCTL; + break; + case E1000_TADV: + retval = sc->esc_TADV; + break; + case E1000_RAL(0) ... E1000_RAH(15): + /* convert to u32 offset */ + ridx = (offset - E1000_RAL(0)) >> 2; + retval = e82545_read_ra(sc, ridx); + break; + case E1000_MTA ... (E1000_MTA + (127*4)): + retval = sc->esc_fmcast[(offset - E1000_MTA) >> 2]; + break; + case E1000_VFTA ... (E1000_VFTA + (127*4)): + retval = sc->esc_fvlan[(offset - E1000_VFTA) >> 2]; + break; + case E1000_EECD: + //DPRINTF("EECD read %x\r\n", sc->eeprom_control); + retval = sc->eeprom_control; + break; + case E1000_MDIC: + retval = sc->mdi_control; + break; + case E1000_MANC: + retval = 0; + break; + /* stats that we emulate. 
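+	 * (Editor's note: each register below is backed by a softc counter,
+	 * e.g. MPC by missed_pkt_count, and the PRCxx/PTCxx buckets by the
+	 * pkt_rx_by_size[]/pkt_tx_by_size[] arrays indexed through
+	 * e82545_size_stat_index().)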
*/ + case E1000_MPC: + retval = sc->missed_pkt_count; + break; + case E1000_PRC64: + retval = sc->pkt_rx_by_size[0]; + break; + case E1000_PRC127: + retval = sc->pkt_rx_by_size[1]; + break; + case E1000_PRC255: + retval = sc->pkt_rx_by_size[2]; + break; + case E1000_PRC511: + retval = sc->pkt_rx_by_size[3]; + break; + case E1000_PRC1023: + retval = sc->pkt_rx_by_size[4]; + break; + case E1000_PRC1522: + retval = sc->pkt_rx_by_size[5]; + break; + case E1000_GPRC: + retval = sc->good_pkt_rx_count; + break; + case E1000_BPRC: + retval = sc->bcast_pkt_rx_count; + break; + case E1000_MPRC: + retval = sc->mcast_pkt_rx_count; + break; + case E1000_GPTC: + case E1000_TPT: + retval = sc->good_pkt_tx_count; + break; + case E1000_GORCL: + retval = (uint32_t)sc->good_octets_rx; + break; + case E1000_GORCH: + retval = (uint32_t)(sc->good_octets_rx >> 32); + break; + case E1000_TOTL: + case E1000_GOTCL: + retval = (uint32_t)sc->good_octets_tx; + break; + case E1000_TOTH: + case E1000_GOTCH: + retval = (uint32_t)(sc->good_octets_tx >> 32); + break; + case E1000_ROC: + retval = sc->oversize_rx_count; + break; + case E1000_TORL: + retval = (uint32_t)(sc->good_octets_rx + sc->missed_octets); + break; + case E1000_TORH: + retval = (uint32_t)((sc->good_octets_rx + + sc->missed_octets) >> 32); + break; + case E1000_TPR: + retval = sc->good_pkt_rx_count + sc->missed_pkt_count + + sc->oversize_rx_count; + break; + case E1000_PTC64: + retval = sc->pkt_tx_by_size[0]; + break; + case E1000_PTC127: + retval = sc->pkt_tx_by_size[1]; + break; + case E1000_PTC255: + retval = sc->pkt_tx_by_size[2]; + break; + case E1000_PTC511: + retval = sc->pkt_tx_by_size[3]; + break; + case E1000_PTC1023: + retval = sc->pkt_tx_by_size[4]; + break; + case E1000_PTC1522: + retval = sc->pkt_tx_by_size[5]; + break; + case E1000_MPTC: + retval = sc->mcast_pkt_tx_count; + break; + case E1000_BPTC: + retval = sc->bcast_pkt_tx_count; + break; + case E1000_TSCTC: + retval = sc->tso_tx_count; + break; + /* stats that are always 0. 
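+	 * (Editor's note: these are collision, symbol-error, flow-control
+	 * and management counters with no equivalent on a virtual link, so
+	 * the emulation reports them as permanently zero.)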
*/ + case E1000_CRCERRS: + case E1000_ALGNERRC: + case E1000_SYMERRS: + case E1000_RXERRC: + case E1000_SCC: + case E1000_ECOL: + case E1000_MCC: + case E1000_LATECOL: + case E1000_COLC: + case E1000_DC: + case E1000_TNCRS: + case E1000_SEC: + case E1000_CEXTERR: + case E1000_RLEC: + case E1000_XONRXC: + case E1000_XONTXC: + case E1000_XOFFRXC: + case E1000_XOFFTXC: + case E1000_FCRUC: + case E1000_RNBC: + case E1000_RUC: + case E1000_RFC: + case E1000_RJC: + case E1000_MGTPRC: + case E1000_MGTPDC: + case E1000_MGTPTC: + case E1000_TSCTFC: + retval = 0; + break; + default: + DPRINTF("Unknown read register: 0x%x\r\n", offset); + retval = 0; + break; + } + + return (retval); +} + +static void +e82545_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + struct e82545_softc *sc; + + //DPRINTF("Write bar:%d offset:0x%lx value:0x%lx size:%d\r\n", baridx, offset, value, size); + + sc = pi->pi_arg; + + pthread_mutex_lock(&sc->esc_mtx); + + switch (baridx) { + case E82545_BAR_IO: + switch (offset) { + case E82545_IOADDR: + if (size != 4) { + DPRINTF("Wrong io addr write sz:%d value:0x%lx\r\n", size, value); + } else + sc->io_addr = (uint32_t)value; + break; + case E82545_IODATA: + if (size != 4) { + DPRINTF("Wrong io data write size:%d value:0x%lx\r\n", size, value); + } else if (sc->io_addr > E82545_IO_REGISTER_MAX) { + DPRINTF("Non-register io write addr:0x%x value:0x%lx\r\n", sc->io_addr, value); + } else + e82545_write_register(sc, sc->io_addr, + (uint32_t)value); + break; + default: + DPRINTF("Unknown io bar write offset:0x%lx value:0x%lx size:%d\r\n", offset, value, size); + break; + } + break; + case E82545_BAR_REGISTER: + if (size != 4) { + DPRINTF("Wrong register write size:%d offset:0x%lx value:0x%lx\r\n", size, offset, value); + } else + e82545_write_register(sc, (uint32_t)offset, + (uint32_t)value); + break; + default: + DPRINTF("Unknown write bar:%d off:0x%lx val:0x%lx size:%d\r\n", + baridx, offset, value, size); + } + + pthread_mutex_unlock(&sc->esc_mtx); +} + +static uint64_t +e82545_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct e82545_softc *sc; + uint64_t retval; + + //DPRINTF("Read bar:%d offset:0x%lx size:%d\r\n", baridx, offset, size); + sc = pi->pi_arg; + retval = 0; + + pthread_mutex_lock(&sc->esc_mtx); + + switch (baridx) { + case E82545_BAR_IO: + switch (offset) { + case E82545_IOADDR: + if (size != 4) { + DPRINTF("Wrong io addr read sz:%d\r\n", size); + } else + retval = sc->io_addr; + break; + case E82545_IODATA: + if (size != 4) { + DPRINTF("Wrong io data read sz:%d\r\n", size); + } + if (sc->io_addr > E82545_IO_REGISTER_MAX) { + DPRINTF("Non-register io read addr:0x%x\r\n", + sc->io_addr); + } else + retval = e82545_read_register(sc, sc->io_addr); + break; + default: + DPRINTF("Unknown io bar read offset:0x%lx size:%d\r\n", + offset, size); + break; + } + break; + case E82545_BAR_REGISTER: + if (size != 4) { + DPRINTF("Wrong register read size:%d offset:0x%lx\r\n", + size, offset); + } else + retval = e82545_read_register(sc, (uint32_t)offset); + break; + default: + DPRINTF("Unknown read bar:%d offset:0x%lx size:%d\r\n", + baridx, offset, size); + break; + } + + pthread_mutex_unlock(&sc->esc_mtx); + + return (retval); +} + +static void +e82545_reset(struct e82545_softc *sc, int drvr) +{ + int i; + + e82545_rx_disable(sc); + e82545_tx_disable(sc); + + /* clear outstanding interrupts */ + if (sc->esc_irq_asserted) + 
pci_lintr_deassert(sc->esc_pi); + + /* misc */ + if (!drvr) { + sc->esc_FCAL = 0; + sc->esc_FCAH = 0; + sc->esc_FCT = 0; + sc->esc_VET = 0; + sc->esc_FCTTV = 0; + } + sc->esc_LEDCTL = 0x07061302; + sc->esc_PBA = 0x00100030; + + /* start nvm in opcode mode. */ + sc->nvm_opaddr = 0; + sc->nvm_mode = E82545_NVM_MODE_OPADDR; + sc->nvm_bits = E82545_NVM_OPADDR_BITS; + sc->eeprom_control = E1000_EECD_PRES | E82545_EECD_FWE_EN; + e82545_init_eeprom(sc); + + /* interrupt */ + sc->esc_ICR = 0; + sc->esc_ITR = 250; + sc->esc_ICS = 0; + sc->esc_IMS = 0; + sc->esc_IMC = 0; + + /* L2 filters */ + if (!drvr) { + memset(sc->esc_fvlan, 0, sizeof(sc->esc_fvlan)); + memset(sc->esc_fmcast, 0, sizeof(sc->esc_fmcast)); + memset(sc->esc_uni, 0, sizeof(sc->esc_uni)); + + /* XXX not necessary on 82545 ?? */ + sc->esc_uni[0].eu_valid = 1; + memcpy(sc->esc_uni[0].eu_eth.octet, sc->esc_mac.octet, + ETHER_ADDR_LEN); + } else { + /* Clear RAH valid bits */ + for (i = 0; i < 16; i++) + sc->esc_uni[i].eu_valid = 0; + } + + /* receive */ + if (!drvr) { + sc->esc_RDBAL = 0; + sc->esc_RDBAH = 0; + } + sc->esc_RCTL = 0; + sc->esc_FCRTL = 0; + sc->esc_FCRTH = 0; + sc->esc_RDLEN = 0; + sc->esc_RDH = 0; + sc->esc_RDT = 0; + sc->esc_RDTR = 0; + sc->esc_RXDCTL = (1 << 24) | (1 << 16); /* default GRAN/WTHRESH */ + sc->esc_RADV = 0; + sc->esc_RXCSUM = 0; + + /* transmit */ + if (!drvr) { + sc->esc_TDBAL = 0; + sc->esc_TDBAH = 0; + sc->esc_TIPG = 0; + sc->esc_AIT = 0; + sc->esc_TIDV = 0; + sc->esc_TADV = 0; + } + sc->esc_tdba = 0; + sc->esc_txdesc = NULL; + sc->esc_TXCW = 0; + sc->esc_TCTL = 0; + sc->esc_TDLEN = 0; + sc->esc_TDT = 0; + sc->esc_TDHr = sc->esc_TDH = 0; + sc->esc_TXDCTL = 0; +} + +static void +e82545_open_tap(struct e82545_softc *sc, char *opts) +{ + char tbuf[80]; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + if (opts == NULL) { + sc->esc_tapfd = -1; + return; + } + + strcpy(tbuf, "/dev/"); + strlcat(tbuf, opts, sizeof(tbuf)); + + sc->esc_tapfd = open(tbuf, O_RDWR); + if (sc->esc_tapfd == -1) { + DPRINTF("unable to open tap device %s\n", opts); + exit(4); + } + + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + int opt = 1; + if (ioctl(sc->esc_tapfd, FIONBIO, &opt) < 0) { + WPRINTF("tap device O_NONBLOCK failed: %d\n", errno); + close(sc->esc_tapfd); + sc->esc_tapfd = -1; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(sc->esc_tapfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + +#ifdef __FreeBSD__ + sc->esc_mevp = mevent_add(sc->esc_tapfd, + EVF_READ, + e82545_tap_callback, + sc); + if (sc->esc_mevp == NULL) { + DPRINTF("Could not register mevent %d\n", EVF_READ); + close(sc->esc_tapfd); + sc->esc_tapfd = -1; + } +#endif +} + +static int +e82545_parsemac(char *mac_str, uint8_t *mac_addr) +{ + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + + tmpstr = strsep(&mac_str,"="); + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + fprintf(stderr, "Invalid MAC %s\n", mac_str); + return (1); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } + return (0); +} + +static int +e82545_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + DPRINTF("Loading with options: %s\r\n", opts); + + MD5_CTX mdctx; + unsigned char digest[16]; + char nstr[80]; + struct 
e82545_softc *sc; + char *devname; + char *vtopts; + int mac_provided; + + /* Setup our softc */ + sc = calloc(1, sizeof(*sc)); + + pi->pi_arg = sc; + sc->esc_pi = pi; + sc->esc_ctx = ctx; + + pthread_mutex_init(&sc->esc_mtx, NULL); + pthread_cond_init(&sc->esc_rx_cond, NULL); + pthread_cond_init(&sc->esc_tx_cond, NULL); + pthread_create(&sc->esc_tx_tid, NULL, e82545_tx_thread, sc); + snprintf(nstr, sizeof(nstr), "e82545-%d:%d tx", pi->pi_slot, + pi->pi_func); + pthread_set_name_np(sc->esc_tx_tid, nstr); + + pci_set_cfgdata16(pi, PCIR_DEVICE, E82545_DEV_ID_82545EM_COPPER); + pci_set_cfgdata16(pi, PCIR_VENDOR, E82545_VENDOR_ID_INTEL); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_NETWORK_ETHERNET); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, E82545_SUBDEV_ID); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, E82545_VENDOR_ID_INTEL); + + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); + pci_set_cfgdata8(pi, PCIR_INTPIN, 0x1); + + /* TODO: this card also supports msi, but the freebsd driver for it + * does not, so I have not implemented it. */ + pci_lintr_request(pi); + + pci_emul_alloc_bar(pi, E82545_BAR_REGISTER, PCIBAR_MEM32, + E82545_BAR_REGISTER_LEN); + pci_emul_alloc_bar(pi, E82545_BAR_FLASH, PCIBAR_MEM32, + E82545_BAR_FLASH_LEN); + pci_emul_alloc_bar(pi, E82545_BAR_IO, PCIBAR_IO, + E82545_BAR_IO_LEN); + + /* + * Attempt to open the tap device and read the MAC address + * if specified. Copied from virtio-net, slightly modified. + */ + mac_provided = 0; + sc->esc_tapfd = -1; + if (opts != NULL) { + int err; + + devname = vtopts = strdup(opts); + (void) strsep(&vtopts, ","); + + if (vtopts != NULL) { + err = e82545_parsemac(vtopts, sc->esc_mac.octet); + if (err != 0) { + free(devname); + return (err); + } + mac_provided = 1; + } + + if (strncmp(devname, "tap", 3) == 0 || + strncmp(devname, "vmnet", 5) == 0) + e82545_open_tap(sc, devname); + + free(devname); + } + + /* + * The default MAC address is the standard NetApp OUI of 00-a0-98, + * followed by an MD5 of the PCI slot/func number and dev name + */ + if (!mac_provided) { + snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot, + pi->pi_func, vmname); + + MD5Init(&mdctx); + MD5Update(&mdctx, nstr, strlen(nstr)); + MD5Final(digest, &mdctx); + + sc->esc_mac.octet[0] = 0x00; + sc->esc_mac.octet[1] = 0xa0; + sc->esc_mac.octet[2] = 0x98; + sc->esc_mac.octet[3] = digest[0]; + sc->esc_mac.octet[4] = digest[1]; + sc->esc_mac.octet[5] = digest[2]; + } + + /* H/w initiated reset */ + e82545_reset(sc, 0); + + return (0); +} + +struct pci_devemu pci_de_e82545 = { + .pe_emu = "e1000", + .pe_init = e82545_init, + .pe_barwrite = e82545_write, + .pe_barread = e82545_read +}; +PCI_EMUL_SET(pci_de_e82545); + diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c index 3b4ca805cc..5118b31534 100644 --- a/usr/src/cmd/bhyve/pci_emul.c +++ b/usr/src/cmd/bhyve/pci_emul.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,16 +38,17 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/linker_set.h> -#include <sys/errno.h> #include <ctype.h> +#include <errno.h> #include <pthread.h> #include <stdio.h> #include <stdlib.h> @@ -66,22 +69,11 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z n #include "pci_irq.h" #include "pci_lpc.h" -#define CONF1_ADDR_PORT 0x0cf8 -#define CONF1_DATA_PORT 0x0cfc +#define CONF1_ADDR_PORT 0x0cf8 +#define CONF1_DATA_PORT 0x0cfc #define CONF1_ENABLE 0x80000000ul -#define CFGWRITE(pi,off,val,b) \ -do { \ - if ((b) == 1) { \ - pci_set_cfgdata8((pi),(off),(val)); \ - } else if ((b) == 2) { \ - pci_set_cfgdata16((pi),(off),(val)); \ - } else { \ - pci_set_cfgdata32((pi),(off),(val)); \ - } \ -} while (0) - #define MAXBUSES (PCI_BUSMAX + 1) #define MAXSLOTS (PCI_SLOTMAX + 1) #define MAXFUNCS (PCI_FUNCMAX + 1) @@ -136,6 +128,30 @@ static void pci_lintr_update(struct pci_devinst *pi); static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, int coff, int bytes, uint32_t *val); +static __inline void +CFGWRITE(struct pci_devinst *pi, int coff, uint32_t val, int bytes) +{ + + if (bytes == 1) + pci_set_cfgdata8(pi, coff, val); + else if (bytes == 2) + pci_set_cfgdata16(pi, coff, val); + else + pci_set_cfgdata32(pi, coff, val); +} + +static __inline uint32_t +CFGREAD(struct pci_devinst *pi, int coff, int bytes) +{ + + if (bytes == 1) + return (pci_get_cfgdata8(pi, coff)); + else if (bytes == 2) + return (pci_get_cfgdata16(pi, coff)); + else + return (pci_get_cfgdata32(pi, coff)); +} + /* * I/O access */ @@ -234,6 +250,17 @@ done: return (error); } +void +pci_print_supported_devices() +{ + struct pci_devemu **pdpp, *pdp; + + SET_FOREACH(pdpp, pci_devemu_set) { + pdp = *pdpp; + printf("%s\n", pdp->pe_emu); + } +} + static int pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset) { @@ -294,7 +321,7 @@ pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size) /* * The PCI standard only allows 4 and 8 byte accesses to the MSI-X - * table but we also allow 1 byte access to accomodate reads from + * table but we also allow 1 byte access to accommodate reads from * ddb. */ if (size != 1 && size != 4 && size != 8) @@ -465,7 +492,7 @@ modify_bar_registration(struct pci_devinst *pi, int idx, int registration) iop.handler = pci_emul_io_handler; iop.arg = pi; error = register_inout(&iop); - } else + } else error = unregister_inout(&iop); break; case PCIBAR_MEM32: @@ -533,7 +560,7 @@ memen(struct pci_devinst *pi) * the address range decoded by the BAR register. 
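 * A brief note on behavior (a sketch of intent, based on the function
 * body below): when the guest reprograms a BAR, the existing
 * registration is removed before the address is updated and re-added
 * afterwards, so a stale range is never left registered.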
*/ static void -update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) +update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type) { int decode; @@ -570,8 +597,10 @@ int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, enum pcibar_type type, uint64_t size) { + uint64_t *baseptr = NULL; + uint64_t limit = 0, lobits = 0; + uint64_t addr, mask, bar; int error; - uint64_t *baseptr, limit, addr, mask, lobits, bar; assert(idx >= 0 && idx <= PCI_BARMAX); @@ -634,7 +663,11 @@ pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, break; default: printf("pci_emul_alloc_base: invalid bar type %d\n", type); +#ifdef FreeBSD assert(0); +#else + abort(); +#endif } if (baseptr != NULL) { @@ -656,7 +689,7 @@ pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase, pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64; pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32); } - + register_bar(pdi, idx); return (0); @@ -759,8 +792,6 @@ pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr) { int mmc; - CTASSERT(sizeof(struct msicap) == 14); - /* Number of msi messages must be a power of 2 between 1 and 32 */ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32); mmc = ffs(msgnum) - 1; @@ -785,7 +816,6 @@ static void pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum, uint32_t msix_tab_size) { - CTASSERT(sizeof(struct msixcap) == 12); assert(msix_tab_size % 4096 == 0); @@ -832,7 +862,7 @@ pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum) assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES); assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0); - + tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE; /* Align table size to nearest 4K */ @@ -862,10 +892,9 @@ msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, int bytes, uint32_t val) { uint16_t msgctrl, rwmask; - int off, table_bar; - + int off; + off = offset - capoff; - table_bar = pi->pi_msix.table_bar; /* Message Control Register */ if (off == 2 && bytes == 2) { rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK; @@ -877,8 +906,8 @@ msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset, pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE; pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK; pci_lintr_update(pi); - } - + } + CFGWRITE(pi, offset, val, bytes); } @@ -937,8 +966,6 @@ pci_emul_add_pciecap(struct pci_devinst *pi, int type) int err; struct pciecap pciecap; - CTASSERT(sizeof(struct pciecap) == 60); - if (type != PCIEM_TYPE_ROOT_PORT) return (-1); @@ -1085,7 +1112,7 @@ init_pci(struct vmctx *ctx) for (bus = 0; bus < MAXBUSES; bus++) { if ((bi = pci_businfo[bus]) == NULL) continue; - /* + /* * Keep track of the i/o and memory resources allocated to * this bus. 
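 * (presumably the running i/o and memory allocation bases are
 * recorded per bus here, before its slots and functions are set up)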
*/ @@ -1186,7 +1213,6 @@ init_pci(struct vmctx *ctx) return (0); } -#ifdef __FreeBSD__ static void pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq, void *arg) @@ -1340,11 +1366,11 @@ pci_bus_write_dsdt(int bus) dsdt_line("Name (PPRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_pirq_prt_entry, NULL); - dsdt_line("})"); + dsdt_line("})"); dsdt_line("Name (APRT, Package ()"); dsdt_line("{"); pci_walk_lintr(bus, pci_apic_prt_entry, NULL); - dsdt_line("})"); + dsdt_line("})"); dsdt_line("Method (_PRT, 0, NotSerialized)"); dsdt_line("{"); dsdt_line(" If (PICM)"); @@ -1392,7 +1418,6 @@ pci_write_dsdt(void) dsdt_line("}"); dsdt_unindent(1); } -#endif int pci_bus_configured(int bus) @@ -1511,7 +1536,7 @@ pci_lintr_route(struct pci_devinst *pi) * is not yet assigned. */ if (ii->ii_ioapic_irq == 0) - ii->ii_ioapic_irq = ioapic_pci_alloc_irq(); + ii->ii_ioapic_irq = ioapic_pci_alloc_irq(pi); assert(ii->ii_ioapic_irq > 0); /* @@ -1519,7 +1544,7 @@ pci_lintr_route(struct pci_devinst *pi) * not yet assigned. */ if (ii->ii_pirq_pin == 0) - ii->ii_pirq_pin = pirq_alloc_pin(pi->pi_vmctx); + ii->ii_pirq_pin = pirq_alloc_pin(pi); assert(ii->ii_pirq_pin > 0); pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq; @@ -1667,27 +1692,31 @@ pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv) } } -static uint32_t -bits_changed(uint32_t old, uint32_t new, uint32_t mask) -{ - - return ((old ^ new) & mask); -} - static void -pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) +pci_emul_cmdsts_write(struct pci_devinst *pi, int coff, uint32_t new, int bytes) { - int i; - uint16_t old; + int i, rshift; + uint32_t cmd, cmd2, changed, old, readonly; + + cmd = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ /* - * The command register is at an offset of 4 bytes and thus the - * guest could write 1, 2 or 4 bytes starting at this offset. + * From PCI Local Bus Specification 3.0 sections 6.2.2 and 6.2.3. + * + * XXX Bits 8, 11, 12, 13, 14 and 15 in the status register are + * 'write 1 to clear'. However these bits are not set to '1' by + * any device emulation so it is simpler to treat them as readonly. */ + rshift = (coff & 0x3) * 8; + readonly = 0xFFFFF880 >> rshift; + + old = CFGREAD(pi, coff, bytes); + new &= ~readonly; + new |= (old & readonly); + CFGWRITE(pi, coff, new, bytes); /* update config */ - old = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */ - CFGWRITE(pi, PCIR_COMMAND, new, bytes); /* update config */ - new = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ + cmd2 = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */ + changed = cmd ^ cmd2; /* * If the MMIO or I/O address space decoding has changed then @@ -1700,7 +1729,7 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) break; case PCIBAR_IO: /* I/O address space decoding changed? */ - if (bits_changed(old, new, PCIM_CMD_PORTEN)) { + if (changed & PCIM_CMD_PORTEN) { if (porten(pi)) register_bar(pi, i); else @@ -1710,15 +1739,15 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) case PCIBAR_MEM32: case PCIBAR_MEM64: /* MMIO address space decoding changed? */ - if (bits_changed(old, new, PCIM_CMD_MEMEN)) { + if (changed & PCIM_CMD_MEMEN) { if (memen(pi)) register_bar(pi, i); else unregister_bar(pi, i); } - break; + break; default: - assert(0); + assert(0); } } @@ -1727,7 +1756,7 @@ pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes) * interrupt. 
*/ pci_lintr_update(pi); -} +} static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, @@ -1738,7 +1767,8 @@ pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, struct pci_devinst *pi; struct pci_devemu *pe; int idx, needcfg; - uint64_t addr, bar, mask; + uint64_t addr, mask; + uint64_t bar = 0; if ((bi = pci_businfo[bus]) != NULL) { si = &bi->slotinfo[slot]; @@ -1790,14 +1820,8 @@ pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, needcfg = 1; } - if (needcfg) { - if (bytes == 1) - *eax = pci_get_cfgdata8(pi, coff); - else if (bytes == 2) - *eax = pci_get_cfgdata16(pi, coff); - else - *eax = pci_get_cfgdata32(pi, coff); - } + if (needcfg) + *eax = CFGREAD(pi, coff, bytes); pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax); } else { @@ -1867,8 +1891,8 @@ pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func, } else if (pci_emul_iscap(pi, coff)) { pci_emul_capwrite(pi, coff, bytes, *eax); - } else if (coff == PCIR_COMMAND) { - pci_emul_cmdwrite(pi, *eax, bytes); + } else if (coff >= PCIR_COMMAND && coff < PCIR_REVID) { + pci_emul_cmdsts_write(pi, coff, *eax, bytes); } else { CFGWRITE(pi, coff, *eax, bytes); } @@ -1940,8 +1964,8 @@ INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata); #define DIOSZ 8 #define DMEMSZ 4096 struct pci_emul_dsoftc { - uint8_t ioregs[DIOSZ]; - uint8_t memregs[DMEMSZ]; + uint8_t ioregs[DIOSZ]; + uint8_t memregs[2][DMEMSZ]; }; #define PCI_EMUL_MSI_MSGS 4 @@ -1970,6 +1994,9 @@ pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts) error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ); assert(error == 0); + error = pci_emul_alloc_bar(pi, 2, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + return (0); } @@ -2009,31 +2036,33 @@ pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, } } - if (baridx == 1) { + if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("diow: memw too large, offset %ld size %d\n", offset, size); return; } + i = baridx - 1; /* 'memregs' index */ + if (size == 1) { - sc->memregs[offset] = value; + sc->memregs[i][offset] = value; } else if (size == 2) { - *(uint16_t *)&sc->memregs[offset] = value; + *(uint16_t *)&sc->memregs[i][offset] = value; } else if (size == 4) { - *(uint32_t *)&sc->memregs[offset] = value; + *(uint32_t *)&sc->memregs[i][offset] = value; } else if (size == 8) { - *(uint64_t *)&sc->memregs[offset] = value; + *(uint64_t *)&sc->memregs[i][offset] = value; } else { printf("diow: memw unknown size %d\n", size); } - + /* * magic interrupt ?? 
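 * (a write to a designated offset could be used to raise a test
 * interrupt, but none is implemented)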
*/ } - if (baridx > 1) { + if (baridx > 2 || baridx < 0) { printf("diow: unknown bar idx %d\n", baridx); } } @@ -2044,14 +2073,17 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, { struct pci_emul_dsoftc *sc = pi->pi_arg; uint32_t value; + int i; + value = 0; if (baridx == 0) { if (offset + size > DIOSZ) { printf("dior: ior too large, offset %ld size %d\n", offset, size); return (0); } - + + value = 0; if (size == 1) { value = sc->ioregs[offset]; } else if (size == 2) { @@ -2062,29 +2094,31 @@ pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, printf("dior: ior unknown size %d\n", size); } } - - if (baridx == 1) { + + if (baridx == 1 || baridx == 2) { if (offset + size > DMEMSZ) { printf("dior: memr too large, offset %ld size %d\n", offset, size); return (0); } - + + i = baridx - 1; /* 'memregs' index */ + if (size == 1) { - value = sc->memregs[offset]; + value = sc->memregs[i][offset]; } else if (size == 2) { - value = *(uint16_t *) &sc->memregs[offset]; + value = *(uint16_t *) &sc->memregs[i][offset]; } else if (size == 4) { - value = *(uint32_t *) &sc->memregs[offset]; + value = *(uint32_t *) &sc->memregs[i][offset]; } else if (size == 8) { - value = *(uint64_t *) &sc->memregs[offset]; + value = *(uint64_t *) &sc->memregs[i][offset]; } else { printf("dior: ior unknown size %d\n", size); } } - if (baridx > 1) { + if (baridx > 2 || baridx < 0) { printf("dior: unknown bar idx %d\n", baridx); return (0); } diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h index 6af01c4c3c..853badaadb 100644 --- a/usr/src/cmd/bhyve/pci_emul.h +++ b/usr/src/cmd/bhyve/pci_emul.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/pci_emul.h 269700 2014-08-08 03:49:01Z neel $ + * $FreeBSD$ */ #ifndef _PCI_EMUL_H_ @@ -142,6 +144,8 @@ struct pci_devinst { int pba_size; int function_mask; struct msix_table_entry *table; /* allocated at runtime */ + void *pba_page; + int pba_page_offset; } pi_msix; void *pi_arg; /* devemu-private data */ @@ -158,6 +162,7 @@ struct msicap { uint32_t addrhi; uint16_t msgdata; } __packed; +static_assert(sizeof(struct msicap) == 14, "compile-time assertion failed"); struct msixcap { uint8_t capid; @@ -166,6 +171,7 @@ struct msixcap { uint32_t table_info; /* bar index and offset within it */ uint32_t pba_info; /* bar index and offset within it */ } __packed; +static_assert(sizeof(struct msixcap) == 12, "compile-time assertion failed"); struct pciecap { uint8_t capid; @@ -200,6 +206,7 @@ struct pciecap { uint16_t slot_control2; uint16_t slot_status2; } __packed; +static_assert(sizeof(struct pciecap) == 60, "compile-time assertion failed"); typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin, int ioapic_irq, void *arg); @@ -225,8 +232,9 @@ int pci_msi_enabled(struct pci_devinst *pi); int pci_msix_enabled(struct pci_devinst *pi); int pci_msix_table_bar(struct pci_devinst *pi); int pci_msix_pba_bar(struct pci_devinst *pi); -int pci_msi_msgnum(struct pci_devinst *pi); +int pci_msi_maxmsgnum(struct pci_devinst *pi); int pci_parse_slot(char *opt); +void pci_print_supported_devices(); void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr); int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum); int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size, diff --git a/usr/src/cmd/bhyve/pci_fbuf.c b/usr/src/cmd/bhyve/pci_fbuf.c new file mode 100644 index 0000000000..8d24dde9da --- /dev/null +++ b/usr/src/cmd/bhyve/pci_fbuf.c @@ -0,0 +1,467 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Nahanni Systems, Inc. + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/mman.h> + +#include <machine/vmm.h> +#include <vmmapi.h> + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <errno.h> +#include <unistd.h> + +#include "bhyvegc.h" +#include "bhyverun.h" +#include "console.h" +#include "inout.h" +#include "pci_emul.h" +#include "rfb.h" +#include "vga.h" + +/* + * bhyve Framebuffer device emulation. + * BAR0 points to the current mode information. + * BAR1 is the 32-bit framebuffer address. + * + * -s <b>,fbuf,wait,vga=on|io|off,rfb=<ip>:port,w=width,h=height + */ + +static int fbuf_debug = 1; +#define DEBUG_INFO 1 +#define DEBUG_VERBOSE 4 +#define DPRINTF(level, params) if (level <= fbuf_debug) printf params + + +#define KB (1024UL) +#define MB (1024 * 1024UL) + +#define DMEMSZ 128 + +#define FB_SIZE (16*MB) + +#define COLS_MAX 1920 +#define ROWS_MAX 1200 + +#define COLS_DEFAULT 1024 +#define ROWS_DEFAULT 768 + +#define COLS_MIN 640 +#define ROWS_MIN 480 + +struct pci_fbuf_softc { + struct pci_devinst *fsc_pi; + struct { + uint32_t fbsize; + uint16_t width; + uint16_t height; + uint16_t depth; + uint16_t refreshrate; + uint8_t reserved[116]; + } __packed memregs; + + /* rfb server */ + char *rfb_host; + char *rfb_password; + int rfb_port; +#ifndef __FreeBSD__ + char *rfb_unix; +#endif + int rfb_wait; + int vga_enabled; + int vga_full; + + uint32_t fbaddr; + char *fb_base; + uint16_t gc_width; + uint16_t gc_height; + void *vgasc; + struct bhyvegc_image *gc_image; +}; + +static struct pci_fbuf_softc *fbuf_sc; + +#define PCI_FBUF_MSI_MSGS 4 + +static void +pci_fbuf_usage(char *opt) +{ + + fprintf(stderr, "Invalid fbuf emulation option \"%s\"\r\n", opt); + fprintf(stderr, "fbuf: {wait,}{vga=on|io|off,}rfb=<ip>:port" + "{,w=width}{,h=height}\r\n"); +} + +static void +pci_fbuf_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_fbuf_softc *sc; + uint8_t *p; + + assert(baridx == 0); + + sc = pi->pi_arg; + + DPRINTF(DEBUG_VERBOSE, + ("fbuf wr: offset 0x%lx, size: %d, value: 0x%lx\n", + offset, size, value)); + + if (offset + size > DMEMSZ) { + printf("fbuf: write too large, offset %ld size %d\n", + offset, size); + return; + } + + p = (uint8_t *)&sc->memregs + offset; + + switch (size) { + case 1: + *p = value; + break; + case 2: + *(uint16_t *)p = value; + break; + case 4: + *(uint32_t *)p = value; + break; + case 8: + *(uint64_t *)p = value; + break; + default: + printf("fbuf: write unknown size %d\n", size); + break; + } + + if (!sc->gc_image->vgamode && sc->memregs.width == 0 && + sc->memregs.height == 0) { + DPRINTF(DEBUG_INFO, ("switching to VGA mode\r\n")); + sc->gc_image->vgamode = 1; + sc->gc_width = 0; + sc->gc_height = 0; + } else if (sc->gc_image->vgamode && sc->memregs.width != 0 && + sc->memregs.height != 0) { + DPRINTF(DEBUG_INFO, ("switching to VESA mode\r\n")); + sc->gc_image->vgamode = 0; + } +} + +uint64_t +pci_fbuf_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + struct pci_fbuf_softc *sc; + uint8_t *p; + uint64_t value; + + assert(baridx == 0); + + sc = pi->pi_arg; + + + if (offset + size > DMEMSZ) { + printf("fbuf: read too large, offset %ld size %d\n", + offset, size); + return (0); + } + + p = (uint8_t *)&sc->memregs + offset; + value = 0; + switch (size) { + case 1: + value = *p; + break; + case 2: + value = *(uint16_t *)p; + break; + case 4: + value = *(uint32_t *)p; + 
break; + case 8: + value = *(uint64_t *)p; + break; + default: + printf("fbuf: read unknown size %d\n", size); + break; + } + + DPRINTF(DEBUG_VERBOSE, + ("fbuf rd: offset 0x%lx, size: %d, value: 0x%lx\n", + offset, size, value)); + + return (value); +} + +static int +pci_fbuf_parse_opts(struct pci_fbuf_softc *sc, char *opts) +{ + char *uopts, *xopts, *config; + char *tmpstr; + int ret; + + ret = 0; + uopts = strdup(opts); + for (xopts = strtok(uopts, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + if (strcmp(xopts, "wait") == 0) { + sc->rfb_wait = 1; + continue; + } + + if ((config = strchr(xopts, '=')) == NULL) { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + + *config++ = '\0'; + + DPRINTF(DEBUG_VERBOSE, ("pci_fbuf option %s = %s\r\n", + xopts, config)); + + if (!strcmp(xopts, "tcp") || !strcmp(xopts, "rfb")) { + /* + * IPv4 -- host-ip:port + * IPv6 -- [host-ip%zone]:port + * XXX for now port is mandatory. + */ + tmpstr = strsep(&config, "]"); + if (config) { + if (tmpstr[0] == '[') + tmpstr++; + sc->rfb_host = tmpstr; + if (config[0] == ':') + config++; + else { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + sc->rfb_port = atoi(config); + } else { + config = tmpstr; + tmpstr = strsep(&config, ":"); + if (!config) + sc->rfb_port = atoi(tmpstr); + else { + sc->rfb_port = atoi(config); + sc->rfb_host = tmpstr; + } + } +#ifndef __FreeBSD__ + } else if (!strcmp(xopts, "unix")) { + sc->rfb_unix = config; +#endif + } else if (!strcmp(xopts, "vga")) { + if (!strcmp(config, "off")) { + sc->vga_enabled = 0; + } else if (!strcmp(config, "io")) { + sc->vga_enabled = 1; + sc->vga_full = 0; + } else if (!strcmp(config, "on")) { + sc->vga_enabled = 1; + sc->vga_full = 1; + } else { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + } else if (!strcmp(xopts, "w")) { + sc->memregs.width = atoi(config); + if (sc->memregs.width > COLS_MAX) { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } else if (sc->memregs.width == 0) + sc->memregs.width = 1920; + } else if (!strcmp(xopts, "h")) { + sc->memregs.height = atoi(config); + if (sc->memregs.height > ROWS_MAX) { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } else if (sc->memregs.height == 0) + sc->memregs.height = 1080; + } else if (!strcmp(xopts, "password")) { + sc->rfb_password = config; + } else { + pci_fbuf_usage(xopts); + ret = -1; + goto done; + } + } + +done: + return (ret); +} + + +extern void vga_render(struct bhyvegc *gc, void *arg); + +void +pci_fbuf_render(struct bhyvegc *gc, void *arg) +{ + struct pci_fbuf_softc *sc; + + sc = arg; + + if (sc->vga_full && sc->gc_image->vgamode) { + /* TODO: mode switching to vga and vesa should use the special + * EFI-bhyve protocol port. 
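+	 * Until then, the vgamode flag toggled in pci_fbuf_write() from
+	 * the width/height registers selects which renderer runs.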
+ */ + vga_render(gc, sc->vgasc); + return; + } + if (sc->gc_width != sc->memregs.width || + sc->gc_height != sc->memregs.height) { + bhyvegc_resize(gc, sc->memregs.width, sc->memregs.height); + sc->gc_width = sc->memregs.width; + sc->gc_height = sc->memregs.height; + } + + return; +} + +static int +pci_fbuf_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int error, prot; + struct pci_fbuf_softc *sc; + + if (fbuf_sc != NULL) { + fprintf(stderr, "Only one frame buffer device is allowed.\n"); + return (-1); + } + + sc = calloc(1, sizeof(struct pci_fbuf_softc)); + + pi->pi_arg = sc; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x40FB); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_DISPLAY); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_DISPLAY_VGA); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, DMEMSZ); + assert(error == 0); + + error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, FB_SIZE); + assert(error == 0); + + error = pci_emul_add_msicap(pi, PCI_FBUF_MSI_MSGS); + assert(error == 0); + + sc->fbaddr = pi->pi_bar[1].addr; + sc->memregs.fbsize = FB_SIZE; + sc->memregs.width = COLS_DEFAULT; + sc->memregs.height = ROWS_DEFAULT; + sc->memregs.depth = 32; + + sc->vga_enabled = 1; + sc->vga_full = 0; + + sc->fsc_pi = pi; + + error = pci_fbuf_parse_opts(sc, opts); + if (error != 0) + goto done; + + /* XXX until VGA rendering is enabled */ + if (sc->vga_full != 0) { + fprintf(stderr, "pci_fbuf: VGA rendering not enabled"); + goto done; + } + + sc->fb_base = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuffer", FB_SIZE); + if (sc->fb_base == MAP_FAILED) { + error = -1; + goto done; + } + DPRINTF(DEBUG_INFO, ("fbuf frame buffer base: %p [sz %lu]\r\n", + sc->fb_base, FB_SIZE)); + + /* + * Map the framebuffer into the guest address space. + * XXX This may fail if the BAR is different than a prior + * run. In this case flag the error. This will be fixed + * when a change_memseg api is available. + */ + prot = PROT_READ | PROT_WRITE; + if (vm_mmap_memseg(ctx, sc->fbaddr, VM_FRAMEBUFFER, 0, FB_SIZE, prot) != 0) { + fprintf(stderr, "pci_fbuf: mapseg failed - try deleting VM and restarting\n"); + error = -1; + goto done; + } + + console_init(sc->memregs.width, sc->memregs.height, sc->fb_base); + console_fb_register(pci_fbuf_render, sc); + + if (sc->vga_enabled) + sc->vgasc = vga_init(!sc->vga_full); + sc->gc_image = console_get_image(); + + fbuf_sc = sc; + + memset((void *)sc->fb_base, 0, FB_SIZE); + +#ifdef __FreeBSD__ + error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, sc->rfb_password); +#else + if (sc->rfb_unix != NULL) { + error = rfb_init_unix(sc->rfb_unix, sc->rfb_wait, + sc->rfb_password); + } else { + error = rfb_init(sc->rfb_host, sc->rfb_port, sc->rfb_wait, + sc->rfb_password); + } +#endif +done: + if (error) + free(sc); + + return (error); +} + +struct pci_devemu pci_fbuf = { + .pe_emu = "fbuf", + .pe_init = pci_fbuf_init, + .pe_barwrite = pci_fbuf_write, + .pe_barread = pci_fbuf_read +}; +PCI_EMUL_SET(pci_fbuf); diff --git a/usr/src/cmd/bhyve/pci_hostbridge.c b/usr/src/cmd/bhyve/pci_hostbridge.c index 08956d082e..b926c7817e 100644 --- a/usr/src/cmd/bhyve/pci_hostbridge.c +++ b/usr/src/cmd/bhyve/pci_hostbridge.c @@ -1,5 +1,8 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2018 Joyent, Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -23,14 +26,21 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_hostbridge.c 283264 2015-05-21 20:11:52Z tychon $ + * $FreeBSD$ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_hostbridge.c 283264 2015-05-21 20:11:52Z tychon $"); +#ifndef __FreeBSD__ +#include <errno.h> +#include <stdlib.h> +#include <stdio.h> +#include <strings.h> +#endif +__FBSDID("$FreeBSD$"); #include "pci_emul.h" +#ifdef __FreeBSD__ static int pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { @@ -56,6 +66,162 @@ pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) return (0); } +#else +static void +pci_hostbridge_setup(struct pci_devinst *pi, uint16_t vendor, uint16_t device) +{ + /* config space */ + pci_set_cfgdata16(pi, PCIR_VENDOR, vendor); + pci_set_cfgdata16(pi, PCIR_DEVICE, device); + pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST); + + pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT); +} + + +static int +pci_hostbridge_parse_pci_val(const char *in, uint16_t *val) +{ + long num; + char *endp = NULL; + + errno = 0; + num = strtol(in, &endp, 0); + if (errno != 0 || endp == NULL || *endp != '\0') { + fprintf(stderr, "pci_hostbridge: invalid num '%s'", in); + return (-1); + } else if (num < 1 || num > UINT16_MAX) { + fprintf(stderr, "pci_hostbridge: 0x%04lx out of range", num); + return (-1); + } + *val = num; + return (0); + } + +static struct pci_hostbridge_model { + const char *phm_model; + uint16_t phm_vendor; + uint16_t phm_device; +} pci_hb_models[] = { + { "amd", 0x1022, 0x7432 }, /* AMD/made-up */ + { "netapp", 0x1275, 0x1275 }, /* NetApp/NetApp */ + { "i440fx", 0x8086, 0x1237 }, /* Intel/82441 */ + { "q35", 0x8086, 0x29b0 }, /* Intel/Q35 HB */ +}; + +#define NUM_HB_MODELS (sizeof (pci_hb_models) / sizeof (pci_hb_models[0])) + +static int +pci_hostbridge_parse_args(char *opts, uint16_t *vendorp, uint16_t *devicep) +{ + const char *model = NULL; + char *next; + uint16_t vendor = 0, device = 0; + int err = 0; + + for (; opts != NULL && *opts != '\0'; opts = next) { + char *val, *cp; + + if ((cp = strchr(opts, ',')) != NULL) { + *cp = '\0'; + next = cp + 1; + } else { + next = NULL; + } + + if ((cp = strchr(opts, '=')) == NULL) { + fprintf(stderr, + "pci_hostbridge: expected value for param" + " (%s=VAL)", opts); + err = -1; + continue; + } + + /* <param>=<value> handling */ + val = cp + 1; + *cp = '\0'; + if (strcmp(opts, "model") == 0) { + model = val; + } else if (strcmp(opts, "vendor") == 0) { + if (pci_hostbridge_parse_pci_val(val, &vendor) != 0) { + err = -1; + continue; + } + } else if (strcmp(opts, "device") == 0) { + if (pci_hostbridge_parse_pci_val(val, &device) != 0) { + err = -1; + continue; + } + } else { + fprintf(stderr, + "pci_hostbridge: unrecognized option '%s'", opts); + err = -1; + continue; + } + } + if (err != 0) { + return (err); + } + + if (model != NULL && (vendor != 0 || device != 0)) { + fprintf(stderr, "pci_hostbridge: cannot specify model " + "and vendor/device"); + return (-1); + } else if ((vendor != 0 && device == 0) || + (vendor == 0 && device != 0)) { + fprintf(stderr, "pci_hostbridge: must specify both vendor and " + "device for custom hostbridge"); + return (-1); + } + if (model != NULL) { + uint_t i; + + for (i = 0; i < NUM_HB_MODELS; i++) 
{ + if (strcmp(model, pci_hb_models[i].phm_model) != 0) + continue; + + /* found a model match */ + *vendorp = pci_hb_models[i].phm_vendor; + *devicep = pci_hb_models[i].phm_device; + return (0); + } + fprintf(stderr, "pci_hostbridge: invalid model '%s'", model); + return (-1); + } + + /* custom hostbridge ID was specified */ + *vendorp = vendor; + *devicep = device; + return (0); +} + +static int +pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + uint16_t vendor, device; + + if (opts == NULL) { + /* Fall back to NetApp default if no options are specified */ + vendor = 0x1275; + device = 0x1275; + } else if (pci_hostbridge_parse_args(opts, &vendor, &device) != 0) { + return (-1); + } + + pci_hostbridge_setup(pi, vendor, device); + return (0); +} + +static int +pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + pci_hostbridge_setup(pi, 0x1022, 0x7432); + return (0); +} + +#endif /* __FreeBSD__ */ struct pci_devemu pci_de_amd_hostbridge = { .pe_emu = "amd_hostbridge", diff --git a/usr/src/cmd/bhyve/pci_irq.c b/usr/src/cmd/bhyve/pci_irq.c index 97ee330c65..4ecb3eddb0 100644 --- a/usr/src/cmd/bhyve/pci_irq.c +++ b/usr/src/cmd/bhyve/pci_irq.c @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2014 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * @@ -27,7 +29,7 @@ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_irq.c 266125 2014-05-15 14:16:55Z jhb $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <machine/vmm.h> @@ -115,7 +117,7 @@ void pci_irq_reserve(int irq) { - assert(irq < nitems(irq_counts)); + assert(irq >= 0 && irq < nitems(irq_counts)); assert(pirq_cold); assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED); irq_counts[irq] = IRQ_DISABLED; @@ -125,10 +127,10 @@ void pci_irq_use(int irq) { - assert(irq < nitems(irq_counts)); + assert(irq >= 0 && irq < nitems(irq_counts)); assert(pirq_cold); - if (irq_counts[irq] != IRQ_DISABLED) - irq_counts[irq]++; + assert(irq_counts[irq] != IRQ_DISABLED); + irq_counts[irq]++; } void @@ -193,19 +195,25 @@ pci_irq_deassert(struct pci_devinst *pi) } int -pirq_alloc_pin(struct vmctx *ctx) +pirq_alloc_pin(struct pci_devinst *pi) { + struct vmctx *ctx = pi->pi_vmctx; int best_count, best_irq, best_pin, irq, pin; - pirq_cold = 1; - - /* First, find the least-used PIRQ pin. */ - best_pin = 0; - best_count = pirqs[0].use_count; - for (pin = 1; pin < nitems(pirqs); pin++) { - if (pirqs[pin].use_count < best_count) { - best_pin = pin; - best_count = pirqs[pin].use_count; + pirq_cold = 0; + + if (lpc_bootrom()) { + /* For external bootrom use fixed mapping. */ + best_pin = (4 + pi->pi_slot + pi->pi_lintr.pin) % 8; + } else { + /* Find the least-used PIRQ pin. 
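+		 * The strict less-than comparison breaks ties in favor
+		 * of the lowest-numbered pin, keeping allocations
+		 * deterministic.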
*/ + best_pin = 0; + best_count = pirqs[0].use_count; + for (pin = 1; pin < nitems(pirqs); pin++) { + if (pirqs[pin].use_count < best_count) { + best_pin = pin; + best_count = pirqs[pin].use_count; + } } } pirqs[best_pin].use_count++; @@ -222,7 +230,7 @@ pirq_alloc_pin(struct vmctx *ctx) best_count = irq_counts[irq]; } } - assert(best_irq != 0); + assert(best_irq >= 0); irq_counts[best_irq]++; pirqs[best_pin].reg = best_irq; vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER); @@ -234,16 +242,12 @@ pirq_alloc_pin(struct vmctx *ctx) int pirq_irq(int pin) { - - if (pin == -1) - return (255); assert(pin > 0 && pin <= nitems(pirqs)); return (pirqs[pin - 1].reg & PIRQ_IRQ); } /* XXX: Generate $PIR table. */ -#ifdef __FreeBSD__ static void pirq_dsdt(void) { @@ -348,4 +352,3 @@ pirq_dsdt(void) free(irq_prs); } LPC_DSDT(pirq_dsdt); -#endif diff --git a/usr/src/cmd/bhyve/pci_irq.h b/usr/src/cmd/bhyve/pci_irq.h index 483f12b61e..1ae56efc8f 100644 --- a/usr/src/cmd/bhyve/pci_irq.h +++ b/usr/src/cmd/bhyve/pci_irq.h @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2014 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_irq.h 266125 2014-05-15 14:16:55Z jhb $ + * $FreeBSD$ */ #ifndef __PCI_IRQ_H__ @@ -37,7 +39,7 @@ void pci_irq_deassert(struct pci_devinst *pi); void pci_irq_init(struct vmctx *ctx); void pci_irq_reserve(int irq); void pci_irq_use(int irq); -int pirq_alloc_pin(struct vmctx *ctx); +int pirq_alloc_pin(struct pci_devinst *pi); int pirq_irq(int pin); uint8_t pirq_read(int pin); void pirq_write(struct vmctx *ctx, int pin, uint8_t val); diff --git a/usr/src/cmd/bhyve/pci_lpc.c b/usr/src/cmd/bhyve/pci_lpc.c index 8c060150dc..b7ddb772a1 100644 --- a/usr/src/cmd/bhyve/pci_lpc.c +++ b/usr/src/cmd/bhyve/pci_lpc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Neel Natu <neel@freebsd.org> * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. @@ -24,11 +26,15 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z neel $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/types.h> #include <machine/vmm.h> @@ -40,6 +46,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z ne #include <vmmapi.h> #include "acpi.h" +#include "bootrom.h" #include "inout.h" #include "pci_emul.h" #include "pci_irq.h" @@ -62,6 +69,8 @@ SYSRES_IO(NMISC_PORT, 1); static struct pci_devinst *lpc_bridge; +static const char *romfile; + #define LPC_UART_NUM 2 static struct lpc_uart_softc { struct uart_softc *uart_softc; @@ -76,7 +85,7 @@ static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" }; /* * LPC device configuration is in the following form: * <lpc_device_name>[,<options>] - * For e.g. "com1,stdio" + * For e.g. 
"com1,stdio" or "bootrom,/var/romfile" */ int lpc_device_parse(const char *opts) @@ -88,6 +97,11 @@ lpc_device_parse(const char *opts) str = cpy = strdup(opts); lpcdev = strsep(&str, ","); if (lpcdev != NULL) { + if (strcasecmp(lpcdev, "bootrom") == 0) { + romfile = str; + error = 0; + goto done; + } for (unit = 0; unit < LPC_UART_NUM; unit++) { if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) { lpc_uart_softc[unit].opts = str; @@ -104,6 +118,23 @@ done: return (error); } +void +lpc_print_supported_devices() +{ + size_t i; + + printf("bootrom\n"); + for (i = 0; i < LPC_UART_NUM; i++) + printf("%s\n", lpc_uart_names[i]); +} + +const char * +lpc_bootrom(void) +{ + + return (romfile); +} + static void lpc_uart_intr_assert(void *arg) { @@ -148,6 +179,21 @@ lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uart_write(sc->uart_softc, offset + 1, *eax >> 8); } break; +#ifndef __FreeBSD__ + case 4: + if (in) { + *eax = uart_read(sc->uart_softc, offset); + *eax |= uart_read(sc->uart_softc, offset + 1) << 8; + *eax |= uart_read(sc->uart_softc, offset + 2) << 16; + *eax |= uart_read(sc->uart_softc, offset + 3) << 24; + } else { + uart_write(sc->uart_softc, offset, *eax); + uart_write(sc->uart_softc, offset + 1, *eax >> 8); + uart_write(sc->uart_softc, offset + 2, *eax >> 16); + uart_write(sc->uart_softc, offset + 3, *eax >> 24); + } + break; +#endif default: return (-1); } @@ -156,13 +202,19 @@ lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, } static int -lpc_init(void) +lpc_init(struct vmctx *ctx) { struct lpc_uart_softc *sc; struct inout_port iop; const char *name; int unit, error; + if (romfile != NULL) { + error = bootrom_init(ctx, romfile); + if (error) + return (error); + } + /* COM1 and COM2 */ for (unit = 0; unit < LPC_UART_NUM; unit++) { sc = &lpc_uart_softc[unit]; @@ -200,7 +252,6 @@ lpc_init(void) return (0); } -#ifdef __FreeBSD__ static void pci_lpc_write_dsdt(struct pci_devinst *pi) { @@ -320,7 +371,6 @@ pci_lpc_uart_dsdt(void) } } LPC_DSDT(pci_lpc_uart_dsdt); -#endif static int pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, @@ -381,7 +431,7 @@ pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) return (-1); } - if (lpc_init() != 0) + if (lpc_init(ctx) != 0) return (-1); /* initialize config space */ @@ -423,9 +473,7 @@ lpc_pirq_routed(void) struct pci_devemu pci_de_lpc = { .pe_emu = "lpc", .pe_init = pci_lpc_init, -#ifdef __FreeBSD__ .pe_write_dsdt = pci_lpc_write_dsdt, -#endif .pe_cfgwrite = pci_lpc_cfgwrite, .pe_barwrite = pci_lpc_write, .pe_barread = pci_lpc_read diff --git a/usr/src/cmd/bhyve/pci_lpc.h b/usr/src/cmd/bhyve/pci_lpc.h index 4f725b1dd3..9041f79c50 100644 --- a/usr/src/cmd/bhyve/pci_lpc.h +++ b/usr/src/cmd/bhyve/pci_lpc.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Neel Natu <neel@freebsd.org> * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/pci_lpc.h 266125 2014-05-15 14:16:55Z jhb $ + * $FreeBSD$ */ #ifndef _LPC_H_ @@ -66,7 +68,9 @@ struct lpc_sysres { #define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length) int lpc_device_parse(const char *opt); +void lpc_print_supported_devices(); char *lpc_pirq_name(int pin); void lpc_pirq_routed(void); +const char *lpc_bootrom(void); #endif diff --git a/usr/src/cmd/bhyve/pci_nvme.c b/usr/src/cmd/bhyve/pci_nvme.c new file mode 100644 index 0000000000..a56c1d6959 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_nvme.c @@ -0,0 +1,1953 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2017 Shunsuke Mie + * Copyright (c) 2018 Leon Dang + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * bhyve PCIe-NVMe device emulation. 
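+ *
+ * The emulated controller reports NVMe 1.3 with a single namespace;
+ * admin and I/O queues are mapped directly from guest memory, and
+ * storage is backed either by blockif or by an in-memory RAM disk.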
+ * + * options: + * -s <n>,nvme,devpath,maxq=#,qsz=#,ioslots=#,sectsz=#,ser=A-Z + * + * accepted devpath: + * /dev/blockdev + * /path/to/image + * ram=size_in_MiB + * + * maxq = max number of queues + * qsz = max elements in each queue + * ioslots = max number of concurrent io requests + * sectsz = sector size (defaults to blockif sector size) + * ser = serial number (20-chars max) + * + */ + +/* TODO: + - create async event for smart and log + - intr coalesce + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <assert.h> +#include <pthread.h> +#include <semaphore.h> +#include <stdbool.h> +#include <stddef.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <machine/atomic.h> +#include <machine/vmm.h> +#include <vmmapi.h> + +#include <dev/nvme/nvme.h> + +#include "bhyverun.h" +#include "block_if.h" +#include "pci_emul.h" + + +static int nvme_debug = 0; +#define DPRINTF(params) if (nvme_debug) printf params +#define WPRINTF(params) printf params + +/* defaults; can be overridden */ +#define NVME_MSIX_BAR 4 + +#define NVME_IOSLOTS 8 + +/* The NVMe spec defines bits 13:4 in BAR0 as reserved */ +#define NVME_MMIO_SPACE_MIN (1 << 14) + +#define NVME_QUEUES 16 +#define NVME_MAX_QENTRIES 2048 + +#define NVME_PRP2_ITEMS (PAGE_SIZE/sizeof(uint64_t)) +#define NVME_MAX_BLOCKIOVS 512 + +/* helpers */ + +/* Convert a zero-based value into a one-based value */ +#define ONE_BASED(zero) ((zero) + 1) +/* Convert a one-based value into a zero-based value */ +#define ZERO_BASED(one) ((one) - 1) + +/* Encode number of SQ's and CQ's for Set/Get Features */ +#define NVME_FEATURE_NUM_QUEUES(sc) \ + (ZERO_BASED((sc)->num_squeues) & 0xffff) | \ + (ZERO_BASED((sc)->num_cqueues) & 0xffff) << 16; + +#define NVME_DOORBELL_OFFSET offsetof(struct nvme_registers, doorbell) + +enum nvme_controller_register_offsets { + NVME_CR_CAP_LOW = 0x00, + NVME_CR_CAP_HI = 0x04, + NVME_CR_VS = 0x08, + NVME_CR_INTMS = 0x0c, + NVME_CR_INTMC = 0x10, + NVME_CR_CC = 0x14, + NVME_CR_CSTS = 0x1c, + NVME_CR_NSSR = 0x20, + NVME_CR_AQA = 0x24, + NVME_CR_ASQ_LOW = 0x28, + NVME_CR_ASQ_HI = 0x2c, + NVME_CR_ACQ_LOW = 0x30, + NVME_CR_ACQ_HI = 0x34, +}; + +enum nvme_cmd_cdw11 { + NVME_CMD_CDW11_PC = 0x0001, + NVME_CMD_CDW11_IEN = 0x0002, + NVME_CMD_CDW11_IV = 0xFFFF0000, +}; + +#define NVME_CQ_INTEN 0x01 +#define NVME_CQ_INTCOAL 0x02 + +struct nvme_completion_queue { + struct nvme_completion *qbase; + uint32_t size; + uint16_t tail; /* nvme progress */ + uint16_t head; /* guest progress */ + uint16_t intr_vec; + uint32_t intr_en; + pthread_mutex_t mtx; +}; + +struct nvme_submission_queue { + struct nvme_command *qbase; + uint32_t size; + uint16_t head; /* nvme progress */ + uint16_t tail; /* guest progress */ + uint16_t cqid; /* completion queue id */ + int busy; /* queue is being processed */ + int qpriority; +}; + +enum nvme_storage_type { + NVME_STOR_BLOCKIF = 0, + NVME_STOR_RAM = 1, +}; + +struct pci_nvme_blockstore { + enum nvme_storage_type type; + void *ctx; + uint64_t size; + uint32_t sectsz; + uint32_t sectsz_bits; +}; + +struct pci_nvme_ioreq { + struct pci_nvme_softc *sc; + struct pci_nvme_ioreq *next; + struct nvme_submission_queue *nvme_sq; + uint16_t sqid; + + /* command information */ + uint16_t opc; + uint16_t cid; + uint32_t nsid; + + uint64_t prev_gpaddr; + size_t prev_size; + + /* + * lock if all iovs consumed (big IO); + * complete transaction before continuing + */ + pthread_mutex_t mtx; + pthread_cond_t cv; + + struct blockif_req io_req; + + 
/* pad to fit up to 512 page descriptors from guest IO request */ + struct iovec iovpadding[NVME_MAX_BLOCKIOVS-BLOCKIF_IOV_MAX]; +}; + +struct pci_nvme_softc { + struct pci_devinst *nsc_pi; + + pthread_mutex_t mtx; + + struct nvme_registers regs; + + struct nvme_namespace_data nsdata; + struct nvme_controller_data ctrldata; + struct nvme_error_information_entry err_log; + struct nvme_health_information_page health_log; + struct nvme_firmware_page fw_log; + + struct pci_nvme_blockstore nvstore; + + uint16_t max_qentries; /* max entries per queue */ + uint32_t max_queues; /* max number of IO SQ's or CQ's */ + uint32_t num_cqueues; + uint32_t num_squeues; + + struct pci_nvme_ioreq *ioreqs; + struct pci_nvme_ioreq *ioreqs_free; /* free list of ioreqs */ + uint32_t pending_ios; + uint32_t ioslots; + sem_t iosemlock; + + /* + * Memory mapped Submission and Completion queues + * Each array includes both Admin and IO queues + */ + struct nvme_completion_queue *compl_queues; + struct nvme_submission_queue *submit_queues; + + /* controller features */ + uint32_t intr_coales_aggr_time; /* 0x08: uS to delay intr */ + uint32_t intr_coales_aggr_thresh; /* 0x08: compl-Q entries */ + uint32_t async_ev_config; /* 0x0B: async event config */ +}; + + +static void pci_nvme_io_partial(struct blockif_req *br, int err); + +/* Controller Configuration utils */ +#define NVME_CC_GET_EN(cc) \ + ((cc) >> NVME_CC_REG_EN_SHIFT & NVME_CC_REG_EN_MASK) +#define NVME_CC_GET_CSS(cc) \ + ((cc) >> NVME_CC_REG_CSS_SHIFT & NVME_CC_REG_CSS_MASK) +#define NVME_CC_GET_SHN(cc) \ + ((cc) >> NVME_CC_REG_SHN_SHIFT & NVME_CC_REG_SHN_MASK) +#define NVME_CC_GET_IOSQES(cc) \ + ((cc) >> NVME_CC_REG_IOSQES_SHIFT & NVME_CC_REG_IOSQES_MASK) +#define NVME_CC_GET_IOCQES(cc) \ + ((cc) >> NVME_CC_REG_IOCQES_SHIFT & NVME_CC_REG_IOCQES_MASK) + +#define NVME_CC_WRITE_MASK \ + ((NVME_CC_REG_EN_MASK << NVME_CC_REG_EN_SHIFT) | \ + (NVME_CC_REG_IOSQES_MASK << NVME_CC_REG_IOSQES_SHIFT) | \ + (NVME_CC_REG_IOCQES_MASK << NVME_CC_REG_IOCQES_SHIFT)) + +#define NVME_CC_NEN_WRITE_MASK \ + ((NVME_CC_REG_CSS_MASK << NVME_CC_REG_CSS_SHIFT) | \ + (NVME_CC_REG_MPS_MASK << NVME_CC_REG_MPS_SHIFT) | \ + (NVME_CC_REG_AMS_MASK << NVME_CC_REG_AMS_SHIFT)) + +/* Controller Status utils */ +#define NVME_CSTS_GET_RDY(sts) \ + ((sts) >> NVME_CSTS_REG_RDY_SHIFT & NVME_CSTS_REG_RDY_MASK) + +#define NVME_CSTS_RDY (1 << NVME_CSTS_REG_RDY_SHIFT) + +/* Completion Queue status word utils */ +#define NVME_STATUS_P (1 << NVME_STATUS_P_SHIFT) +#define NVME_STATUS_MASK \ + ((NVME_STATUS_SCT_MASK << NVME_STATUS_SCT_SHIFT) |\ + (NVME_STATUS_SC_MASK << NVME_STATUS_SC_SHIFT)) + +static __inline void +cpywithpad(char *dst, size_t dst_size, const char *src, char pad) +{ + size_t len; + + len = strnlen(src, dst_size); + memset(dst, pad, dst_size); + memcpy(dst, src, len); +} + +static __inline void +pci_nvme_status_tc(uint16_t *status, uint16_t type, uint16_t code) +{ + + *status &= ~NVME_STATUS_MASK; + *status |= (type & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT | + (code & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; +} + +static __inline void +pci_nvme_status_genc(uint16_t *status, uint16_t code) +{ + + pci_nvme_status_tc(status, NVME_SCT_GENERIC, code); +} + +static __inline void +pci_nvme_toggle_phase(uint16_t *status, int prev) +{ + + if (prev) + *status &= ~NVME_STATUS_P; + else + *status |= NVME_STATUS_P; +} + +static void +pci_nvme_init_ctrldata(struct pci_nvme_softc *sc) +{ + struct nvme_controller_data *cd = &sc->ctrldata; + + cd->vid = 0xFB5D; + cd->ssvid = 0x0000; + + 
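+	/* ASCII identify fields are fixed width, padded with spaces */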
cpywithpad((char *)cd->mn, sizeof(cd->mn), "bhyve-NVMe", ' '); + cpywithpad((char *)cd->fr, sizeof(cd->fr), "1.0", ' '); + + /* Num of submission commands that we can handle at a time (2^rab) */ + cd->rab = 4; + + /* FreeBSD OUI */ + cd->ieee[0] = 0x58; + cd->ieee[1] = 0x9c; + cd->ieee[2] = 0xfc; + + cd->mic = 0; + + cd->mdts = 9; /* max data transfer size (2^mdts * CAP.MPSMIN) */ + + cd->ver = 0x00010300; + + cd->oacs = 1 << NVME_CTRLR_DATA_OACS_FORMAT_SHIFT; + cd->acl = 2; + cd->aerl = 4; + + cd->lpa = 0; /* TODO: support some simple things like SMART */ + cd->elpe = 0; /* max error log page entries */ + cd->npss = 1; /* number of power states supported */ + + /* Warning Composite Temperature Threshold */ + cd->wctemp = 0x0157; + + cd->sqes = (6 << NVME_CTRLR_DATA_SQES_MAX_SHIFT) | + (6 << NVME_CTRLR_DATA_SQES_MIN_SHIFT); + cd->cqes = (4 << NVME_CTRLR_DATA_CQES_MAX_SHIFT) | + (4 << NVME_CTRLR_DATA_CQES_MIN_SHIFT); + cd->nn = 1; /* number of namespaces */ + + cd->fna = 0x03; + + cd->power_state[0].mp = 10; +} + +static void +pci_nvme_init_nsdata(struct pci_nvme_softc *sc) +{ + struct nvme_namespace_data *nd; + + nd = &sc->nsdata; + + nd->nsze = sc->nvstore.size / sc->nvstore.sectsz; + nd->ncap = nd->nsze; + nd->nuse = nd->nsze; + + /* Get LBA and backstore information from backing store */ + nd->nlbaf = 0; /* NLBAF is a 0's based value (i.e. 1 LBA Format) */ + /* LBA data-sz = 2^lbads */ + nd->lbaf[0] = sc->nvstore.sectsz_bits << NVME_NS_DATA_LBAF_LBADS_SHIFT; + + nd->flbas = 0; +} + +static void +pci_nvme_init_logpages(struct pci_nvme_softc *sc) +{ + + memset(&sc->err_log, 0, sizeof(sc->err_log)); + memset(&sc->health_log, 0, sizeof(sc->health_log)); + memset(&sc->fw_log, 0, sizeof(sc->fw_log)); +} + +static void +pci_nvme_reset_locked(struct pci_nvme_softc *sc) +{ + DPRINTF(("%s\r\n", __func__)); + + sc->regs.cap_lo = (ZERO_BASED(sc->max_qentries) & NVME_CAP_LO_REG_MQES_MASK) | + (1 << NVME_CAP_LO_REG_CQR_SHIFT) | + (60 << NVME_CAP_LO_REG_TO_SHIFT); + + sc->regs.cap_hi = 1 << NVME_CAP_HI_REG_CSS_NVM_SHIFT; + + sc->regs.vs = 0x00010300; /* NVMe v1.3 */ + + sc->regs.cc = 0; + sc->regs.csts = 0; + + sc->num_cqueues = sc->num_squeues = sc->max_queues; + if (sc->submit_queues != NULL) { + for (int i = 0; i < sc->num_squeues + 1; i++) { + /* + * The Admin Submission Queue is at index 0. + * It must not be changed at reset otherwise the + * emulation will be out of sync with the guest. 
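+			 * Only its head, tail and busy markers are
+			 * reset below.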
+ */ + if (i != 0) { + sc->submit_queues[i].qbase = NULL; + sc->submit_queues[i].size = 0; + sc->submit_queues[i].cqid = 0; + } + sc->submit_queues[i].tail = 0; + sc->submit_queues[i].head = 0; + sc->submit_queues[i].busy = 0; + } + } else + sc->submit_queues = calloc(sc->num_squeues + 1, + sizeof(struct nvme_submission_queue)); + + if (sc->compl_queues != NULL) { + for (int i = 0; i < sc->num_cqueues + 1; i++) { + /* See Admin Submission Queue note above */ + if (i != 0) { + sc->compl_queues[i].qbase = NULL; + sc->compl_queues[i].size = 0; + } + + sc->compl_queues[i].tail = 0; + sc->compl_queues[i].head = 0; + } + } else { + sc->compl_queues = calloc(sc->num_cqueues + 1, + sizeof(struct nvme_completion_queue)); + + for (int i = 0; i < sc->num_cqueues + 1; i++) + pthread_mutex_init(&sc->compl_queues[i].mtx, NULL); + } +} + +static void +pci_nvme_reset(struct pci_nvme_softc *sc) +{ + pthread_mutex_lock(&sc->mtx); + pci_nvme_reset_locked(sc); + pthread_mutex_unlock(&sc->mtx); +} + +static void +pci_nvme_init_controller(struct vmctx *ctx, struct pci_nvme_softc *sc) +{ + uint16_t acqs, asqs; + + DPRINTF(("%s\r\n", __func__)); + + asqs = (sc->regs.aqa & NVME_AQA_REG_ASQS_MASK) + 1; + sc->submit_queues[0].size = asqs; + sc->submit_queues[0].qbase = vm_map_gpa(ctx, sc->regs.asq, + sizeof(struct nvme_command) * asqs); + + DPRINTF(("%s mapping Admin-SQ guest 0x%lx, host: %p\r\n", + __func__, sc->regs.asq, sc->submit_queues[0].qbase)); + + acqs = ((sc->regs.aqa >> NVME_AQA_REG_ACQS_SHIFT) & + NVME_AQA_REG_ACQS_MASK) + 1; + sc->compl_queues[0].size = acqs; + sc->compl_queues[0].qbase = vm_map_gpa(ctx, sc->regs.acq, + sizeof(struct nvme_completion) * acqs); + DPRINTF(("%s mapping Admin-CQ guest 0x%lx, host: %p\r\n", + __func__, sc->regs.acq, sc->compl_queues[0].qbase)); +} + +static int +nvme_prp_memcpy(struct vmctx *ctx, uint64_t prp1, uint64_t prp2, uint8_t *src, + size_t len) +{ + uint8_t *dst; + size_t bytes; + + if (len > (8 * 1024)) { + return (-1); + } + + /* Copy from the start of prp1 to the end of the physical page */ + bytes = PAGE_SIZE - (prp1 & PAGE_MASK); + bytes = MIN(bytes, len); + + dst = vm_map_gpa(ctx, prp1, bytes); + if (dst == NULL) { + return (-1); + } + + memcpy(dst, src, bytes); + + src += bytes; + + len -= bytes; + if (len == 0) { + return (0); + } + + len = MIN(len, PAGE_SIZE); + + dst = vm_map_gpa(ctx, prp2, len); + if (dst == NULL) { + return (-1); + } + + memcpy(dst, src, len); + + return (0); +} + +static int +nvme_opc_delete_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t qid = command->cdw10 & 0xffff; + + DPRINTF(("%s DELETE_IO_SQ %u\r\n", __func__, qid)); + if (qid == 0 || qid > sc->num_squeues) { + WPRINTF(("%s NOT PERMITTED queue id %u / num_squeues %u\r\n", + __func__, qid, sc->num_squeues)); + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + sc->submit_queues[qid].qbase = NULL; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_create_io_sq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + if (command->cdw11 & NVME_CMD_CDW11_PC) { + uint16_t qid = command->cdw10 & 0xffff; + struct nvme_submission_queue *nsq; + + if ((qid == 0) || (qid > sc->num_squeues)) { + WPRINTF(("%s queue index %u > num_squeues %u\r\n", + __func__, qid, sc->num_squeues)); + pci_nvme_status_tc(&compl->status, + NVME_SCT_COMMAND_SPECIFIC, + 
NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + nsq = &sc->submit_queues[qid]; + nsq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); + + nsq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(struct nvme_command) * (size_t)nsq->size); + nsq->cqid = (command->cdw11 >> 16) & 0xffff; + nsq->qpriority = (command->cdw11 >> 1) & 0x03; + + DPRINTF(("%s sq %u size %u gaddr %p cqid %u\r\n", __func__, + qid, nsq->size, nsq->qbase, nsq->cqid)); + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + + DPRINTF(("%s completed creating IOSQ qid %u\r\n", + __func__, qid)); + } else { + /* + * Guest sent non-cont submission queue request. + * This setting is unsupported by this emulation. + */ + WPRINTF(("%s unsupported non-contig (list-based) " + "create i/o submission queue\r\n", __func__)); + + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + } + return (1); +} + +static int +nvme_opc_delete_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t qid = command->cdw10 & 0xffff; + + DPRINTF(("%s DELETE_IO_CQ %u\r\n", __func__, qid)); + if (qid == 0 || qid > sc->num_cqueues) { + WPRINTF(("%s queue index %u / num_cqueues %u\r\n", + __func__, qid, sc->num_cqueues)); + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + sc->compl_queues[qid].qbase = NULL; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_create_io_cq(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + if (command->cdw11 & NVME_CMD_CDW11_PC) { + uint16_t qid = command->cdw10 & 0xffff; + struct nvme_completion_queue *ncq; + + if ((qid == 0) || (qid > sc->num_cqueues)) { + WPRINTF(("%s queue index %u > num_cqueues %u\r\n", + __func__, qid, sc->num_cqueues)); + pci_nvme_status_tc(&compl->status, + NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_QUEUE_IDENTIFIER); + return (1); + } + + ncq = &sc->compl_queues[qid]; + ncq->intr_en = (command->cdw11 & NVME_CMD_CDW11_IEN) >> 1; + ncq->intr_vec = (command->cdw11 >> 16) & 0xffff; + ncq->size = ONE_BASED((command->cdw10 >> 16) & 0xffff); + + ncq->qbase = vm_map_gpa(sc->nsc_pi->pi_vmctx, + command->prp1, + sizeof(struct nvme_command) * (size_t)ncq->size); + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + } else { + /* + * Non-contig completion queue unsupported. 
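+		 * CAP.CQR is reported as set, so a compliant guest
+		 * should only request physically contiguous queues.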
+ */ + WPRINTF(("%s unsupported non-contig (list-based) " + "create i/o completion queue\r\n", + __func__)); + + /* 0x12 = Invalid Use of Controller Memory Buffer */ + pci_nvme_status_genc(&compl->status, 0x12); + } + + return (1); +} + +static int +nvme_opc_get_log_page(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint32_t logsize = (1 + ((command->cdw10 >> 16) & 0xFFF)) * 2; + uint8_t logpage = command->cdw10 & 0xFF; + + DPRINTF(("%s log page %u len %u\r\n", __func__, logpage, logsize)); + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + + switch (logpage) { + case NVME_LOG_ERROR: + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->err_log, logsize); + break; + case NVME_LOG_HEALTH_INFORMATION: + /* TODO: present some smart info */ + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->health_log, logsize); + break; + case NVME_LOG_FIRMWARE_SLOT: + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->fw_log, logsize); + break; + default: + WPRINTF(("%s get log page %x command not supported\r\n", + __func__, logpage)); + + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_INVALID_LOG_PAGE); + } + + return (1); +} + +static int +nvme_opc_identify(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + void *dest; + + DPRINTF(("%s identify 0x%x nsid 0x%x\r\n", __func__, + command->cdw10 & 0xFF, command->nsid)); + + switch (command->cdw10 & 0xFF) { + case 0x00: /* return Identify Namespace data structure */ + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->nsdata, sizeof(sc->nsdata)); + break; + case 0x01: /* return Identify Controller data structure */ + nvme_prp_memcpy(sc->nsc_pi->pi_vmctx, command->prp1, + command->prp2, (uint8_t *)&sc->ctrldata, + sizeof(sc->ctrldata)); + break; + case 0x02: /* list of 1024 active NSIDs > CDW1.NSID */ + dest = vm_map_gpa(sc->nsc_pi->pi_vmctx, command->prp1, + sizeof(uint32_t) * 1024); + ((uint32_t *)dest)[0] = 1; + ((uint32_t *)dest)[1] = 0; + break; + case 0x11: + pci_nvme_status_genc(&compl->status, + NVME_SC_INVALID_NAMESPACE_OR_FORMAT); + return (1); + case 0x03: /* list of NSID structures in CDW1.NSID, 4096 bytes */ + case 0x10: + case 0x12: + case 0x13: + case 0x14: + case 0x15: + default: + DPRINTF(("%s unsupported identify command requested 0x%x\r\n", + __func__, command->cdw10 & 0xFF)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_set_feature_queues(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + uint16_t nqr; /* Number of Queues Requested */ + + nqr = command->cdw11 & 0xFFFF; + if (nqr == 0xffff) { + WPRINTF(("%s: Illegal NSQR value %#x\n", __func__, nqr)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (-1); + } + + sc->num_squeues = ONE_BASED(nqr); + if (sc->num_squeues > sc->max_queues) { + DPRINTF(("NSQR=%u is greater than max %u\n", sc->num_squeues, + sc->max_queues)); + sc->num_squeues = sc->max_queues; + } + + nqr = (command->cdw11 >> 16) & 0xFFFF; + if (nqr == 0xffff) { + WPRINTF(("%s: Illegal NCQR value %#x\n", __func__, nqr)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (-1); + } + + sc->num_cqueues = ONE_BASED(nqr); + if (sc->num_cqueues > 
sc->max_queues) { + DPRINTF(("NCQR=%u is greater than max %u\n", sc->num_cqueues, + sc->max_queues)); + sc->num_cqueues = sc->max_queues; + } + + compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); + + return (0); +} + +static int +nvme_opc_set_features(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + int feature = command->cdw10 & 0xFF; + uint32_t iv; + + DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); + compl->cdw0 = 0; + + switch (feature) { + case NVME_FEAT_ARBITRATION: + DPRINTF((" arbitration 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_POWER_MANAGEMENT: + DPRINTF((" power management 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_LBA_RANGE_TYPE: + DPRINTF((" lba range 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_TEMPERATURE_THRESHOLD: + DPRINTF((" temperature threshold 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_ERROR_RECOVERY: + DPRINTF((" error recovery 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_VOLATILE_WRITE_CACHE: + DPRINTF((" volatile write cache 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_NUMBER_OF_QUEUES: + nvme_set_feature_queues(sc, command, compl); + break; + case NVME_FEAT_INTERRUPT_COALESCING: + DPRINTF((" interrupt coalescing 0x%x\r\n", command->cdw11)); + + /* in uS */ + sc->intr_coales_aggr_time = ((command->cdw11 >> 8) & 0xFF)*100; + + sc->intr_coales_aggr_thresh = command->cdw11 & 0xFF; + break; + case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: + iv = command->cdw11 & 0xFFFF; + + DPRINTF((" interrupt vector configuration 0x%x\r\n", + command->cdw11)); + + for (uint32_t i = 0; i < sc->num_cqueues + 1; i++) { + if (sc->compl_queues[i].intr_vec == iv) { + if (command->cdw11 & (1 << 16)) + sc->compl_queues[i].intr_en |= + NVME_CQ_INTCOAL; + else + sc->compl_queues[i].intr_en &= + ~NVME_CQ_INTCOAL; + } + } + break; + case NVME_FEAT_WRITE_ATOMICITY: + DPRINTF((" write atomicity 0x%x\r\n", command->cdw11)); + break; + case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + DPRINTF((" async event configuration 0x%x\r\n", + command->cdw11)); + sc->async_ev_config = command->cdw11; + break; + case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: + DPRINTF((" software progress marker 0x%x\r\n", + command->cdw11)); + break; + case 0x0C: + DPRINTF((" autonomous power state transition 0x%x\r\n", + command->cdw11)); + break; + default: + WPRINTF(("%s invalid feature\r\n", __func__)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_get_features(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + int feature = command->cdw10 & 0xFF; + + DPRINTF(("%s feature 0x%x\r\n", __func__, feature)); + + compl->cdw0 = 0; + + switch (feature) { + case NVME_FEAT_ARBITRATION: + DPRINTF((" arbitration\r\n")); + break; + case NVME_FEAT_POWER_MANAGEMENT: + DPRINTF((" power management\r\n")); + break; + case NVME_FEAT_LBA_RANGE_TYPE: + DPRINTF((" lba range\r\n")); + break; + case NVME_FEAT_TEMPERATURE_THRESHOLD: + DPRINTF((" temperature threshold\r\n")); + switch ((command->cdw11 >> 20) & 0x3) { + case 0: + /* Over temp threshold */ + compl->cdw0 = 0xFFFF; + break; + case 1: + /* Under temp threshold */ + compl->cdw0 = 0; + break; + default: + WPRINTF((" invalid threshold type select\r\n")); + pci_nvme_status_genc(&compl->status, + NVME_SC_INVALID_FIELD); + return (1); + } + break; + case NVME_FEAT_ERROR_RECOVERY: + DPRINTF((" error recovery\r\n")); + 
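NVMe encodes most counts as 0's based values (a field value of N means N + 1 objects). nvme_set_feature_queues above converts NSQR/NCQR with ONE_BASED, rejects 0xffff (which the spec disallows), and reports the granted counts back through cdw0 as 0's based fields again. A sketch of that packing, assuming ONE_BASED() and NVME_FEATURE_NUM_QUEUES() are defined earlier in this file roughly as follows:

#include <stdint.h>

/* minimal equivalents of the macros assumed above */
#define	ONE_BASED_EX(x)		((x) + 1)
#define	ZERO_BASED_EX(x)	((x) - 1)

/*
 * cdw0 for Number of Queues: granted CQ count in the high 16 bits,
 * granted SQ count in the low 16 bits, both 0's based.
 */
static uint32_t
num_queues_cdw0(uint16_t granted_sq, uint16_t granted_cq)
{
	return (((uint32_t)ZERO_BASED_EX(granted_cq) << 16) |
	    ZERO_BASED_EX(granted_sq));
}

So NSQR = 0 legitimately requests one I/O submission queue, while NSQR = 0xffff is rejected before ONE_BASED() can produce a count that no longer fits a 16-bit 0's based reply.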
break; + case NVME_FEAT_VOLATILE_WRITE_CACHE: + DPRINTF((" volatile write cache\r\n")); + break; + case NVME_FEAT_NUMBER_OF_QUEUES: + compl->cdw0 = NVME_FEATURE_NUM_QUEUES(sc); + + DPRINTF((" number of queues (submit %u, completion %u)\r\n", + compl->cdw0 & 0xFFFF, + (compl->cdw0 >> 16) & 0xFFFF)); + + break; + case NVME_FEAT_INTERRUPT_COALESCING: + DPRINTF((" interrupt coalescing\r\n")); + break; + case NVME_FEAT_INTERRUPT_VECTOR_CONFIGURATION: + DPRINTF((" interrupt vector configuration\r\n")); + break; + case NVME_FEAT_WRITE_ATOMICITY: + DPRINTF((" write atomicity\r\n")); + break; + case NVME_FEAT_ASYNC_EVENT_CONFIGURATION: + DPRINTF((" async event configuration\r\n")); + sc->async_ev_config = command->cdw11; + break; + case NVME_FEAT_SOFTWARE_PROGRESS_MARKER: + DPRINTF((" software progress marker\r\n")); + break; + case 0x0C: + DPRINTF((" autonomous power state transition\r\n")); + break; + default: + WPRINTF(("%s invalid feature 0x%x\r\n", __func__, feature)); + pci_nvme_status_genc(&compl->status, NVME_SC_INVALID_FIELD); + return (1); + } + + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +static int +nvme_opc_abort(struct pci_nvme_softc* sc, struct nvme_command* command, + struct nvme_completion* compl) +{ + DPRINTF(("%s submission queue %u, command ID 0x%x\r\n", __func__, + command->cdw10 & 0xFFFF, (command->cdw10 >> 16) & 0xFFFF)); + + /* TODO: search for the command ID and abort it */ + + compl->cdw0 = 1; + pci_nvme_status_genc(&compl->status, NVME_SC_SUCCESS); + return (1); +} + +#ifdef __FreeBSD__ +static int +nvme_opc_async_event_req(struct pci_nvme_softc* sc, + struct nvme_command* command, struct nvme_completion* compl) +{ + DPRINTF(("%s async event request 0x%x\r\n", __func__, command->cdw11)); + + /* + * TODO: raise events when they happen based on the Set Features cmd. + * These events happen async, so only set completion successful if + * there is an event reflective of the request to get event. + */ + pci_nvme_status_tc(&compl->status, NVME_SCT_COMMAND_SPECIFIC, + NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED); + return (0); +} +#else +/* This is kept behind an ifdef while it's unused to appease the compiler. 
*/ +#endif /* __FreeBSD__ */ + +static void +pci_nvme_handle_admin_cmd(struct pci_nvme_softc* sc, uint64_t value) +{ + struct nvme_completion compl; + struct nvme_command *cmd; + struct nvme_submission_queue *sq; + struct nvme_completion_queue *cq; + int do_intr = 0; + uint16_t sqhead; + + DPRINTF(("%s index %u\r\n", __func__, (uint32_t)value)); + + sq = &sc->submit_queues[0]; + + sqhead = atomic_load_acq_short(&sq->head); + + if (atomic_testandset_int(&sq->busy, 1)) { + DPRINTF(("%s SQ busy, head %u, tail %u\r\n", + __func__, sqhead, sq->tail)); + return; + } + + DPRINTF(("sqhead %u, tail %u\r\n", sqhead, sq->tail)); + + while (sqhead != atomic_load_acq_short(&sq->tail)) { + cmd = &(sq->qbase)[sqhead]; + compl.status = 0; + + switch (cmd->opc) { + case NVME_OPC_DELETE_IO_SQ: + DPRINTF(("%s command DELETE_IO_SQ\r\n", __func__)); + do_intr |= nvme_opc_delete_io_sq(sc, cmd, &compl); + break; + case NVME_OPC_CREATE_IO_SQ: + DPRINTF(("%s command CREATE_IO_SQ\r\n", __func__)); + do_intr |= nvme_opc_create_io_sq(sc, cmd, &compl); + break; + case NVME_OPC_DELETE_IO_CQ: + DPRINTF(("%s command DELETE_IO_CQ\r\n", __func__)); + do_intr |= nvme_opc_delete_io_cq(sc, cmd, &compl); + break; + case NVME_OPC_CREATE_IO_CQ: + DPRINTF(("%s command CREATE_IO_CQ\r\n", __func__)); + do_intr |= nvme_opc_create_io_cq(sc, cmd, &compl); + break; + case NVME_OPC_GET_LOG_PAGE: + DPRINTF(("%s command GET_LOG_PAGE\r\n", __func__)); + do_intr |= nvme_opc_get_log_page(sc, cmd, &compl); + break; + case NVME_OPC_IDENTIFY: + DPRINTF(("%s command IDENTIFY\r\n", __func__)); + do_intr |= nvme_opc_identify(sc, cmd, &compl); + break; + case NVME_OPC_ABORT: + DPRINTF(("%s command ABORT\r\n", __func__)); + do_intr |= nvme_opc_abort(sc, cmd, &compl); + break; + case NVME_OPC_SET_FEATURES: + DPRINTF(("%s command SET_FEATURES\r\n", __func__)); + do_intr |= nvme_opc_set_features(sc, cmd, &compl); + break; + case NVME_OPC_GET_FEATURES: + DPRINTF(("%s command GET_FEATURES\r\n", __func__)); + do_intr |= nvme_opc_get_features(sc, cmd, &compl); + break; + case NVME_OPC_ASYNC_EVENT_REQUEST: + DPRINTF(("%s command ASYNC_EVENT_REQ\r\n", __func__)); + /* XXX dont care, unhandled for now + do_intr |= nvme_opc_async_event_req(sc, cmd, &compl); + */ + break; + default: + WPRINTF(("0x%x command is not implemented\r\n", + cmd->opc)); + } + + /* for now skip async event generation */ + if (cmd->opc != NVME_OPC_ASYNC_EVENT_REQUEST) { + struct nvme_completion *cp; + int phase; + + cq = &sc->compl_queues[0]; + + cp = &(cq->qbase)[cq->tail]; + cp->cdw0 = compl.cdw0; + cp->sqid = 0; + cp->sqhd = sqhead; + cp->cid = cmd->cid; + + phase = NVME_STATUS_GET_P(cp->status); + cp->status = compl.status; + pci_nvme_toggle_phase(&cp->status, phase); + + cq->tail = (cq->tail + 1) % cq->size; + } + sqhead = (sqhead + 1) % sq->size; + } + + DPRINTF(("setting sqhead %u\r\n", sqhead)); + atomic_store_short(&sq->head, sqhead); + atomic_store_int(&sq->busy, 0); + + if (do_intr) + pci_generate_msix(sc->nsc_pi, 0); + +} + +static int +pci_nvme_append_iov_req(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req, + uint64_t gpaddr, size_t size, int do_write, uint64_t lba) +{ + int iovidx; + + if (req != NULL) { + /* concatenate contig block-iovs to minimize number of iovs */ + if ((req->prev_gpaddr + req->prev_size) == gpaddr) { + iovidx = req->io_req.br_iovcnt - 1; + + req->io_req.br_iov[iovidx].iov_base = + paddr_guest2host(req->sc->nsc_pi->pi_vmctx, + req->prev_gpaddr, size); + + req->prev_size += size; + req->io_req.br_resid += size; + + 
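pci_nvme_handle_admin_cmd above is a classic single-consumer ring walk: the guest publishes a new tail via the doorbell, the handler drains entries from head to tail, posts one completion per command into admin CQ 0, and only then stores the new head and raises MSI-X vector 0 if any handler asked for an interrupt. The shape of the loop, reduced to its ring mechanics (illustrative types, not the source's):

#include <stdint.h>

struct ring {
	uint16_t head;	/* consumer index, advanced by the device */
	uint16_t tail;	/* producer index, written via the doorbell */
	uint16_t size;
};

static void
drain_ring(struct ring *r, void (*process)(uint16_t idx))
{
	uint16_t head = r->head;

	while (head != r->tail) {
		process(head);
		head = (head + 1) % r->size;
	}
	r->head = head;	/* the code above publishes via atomic_store_short() */
}

The sq->busy flag above acts as a try-lock: a second doorbell write arriving while the queue is being drained simply returns instead of processing entries twice.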
req->io_req.br_iov[iovidx].iov_len = req->prev_size;
+		} else {
+			pthread_mutex_lock(&req->mtx);
+
+			iovidx = req->io_req.br_iovcnt;
+			if (iovidx == NVME_MAX_BLOCKIOVS) {
+				int err = 0;
+
+				DPRINTF(("large I/O, doing partial req\r\n"));
+
+				iovidx = 0;
+				req->io_req.br_iovcnt = 0;
+
+				req->io_req.br_callback = pci_nvme_io_partial;
+
+				if (!do_write)
+					err = blockif_read(sc->nvstore.ctx,
+					    &req->io_req);
+				else
+					err = blockif_write(sc->nvstore.ctx,
+					    &req->io_req);
+
+				/* wait until req completes before continuing */
+				if (err == 0)
+					pthread_cond_wait(&req->cv, &req->mtx);
+			}
+			if (iovidx == 0) {
+				req->io_req.br_offset = lba;
+				req->io_req.br_resid = 0;
+				req->io_req.br_param = req;
+			}
+
+			req->io_req.br_iov[iovidx].iov_base =
+			    paddr_guest2host(req->sc->nsc_pi->pi_vmctx,
+			    gpaddr, size);
+
+			req->io_req.br_iov[iovidx].iov_len = size;
+
+			req->prev_gpaddr = gpaddr;
+			req->prev_size = size;
+			req->io_req.br_resid += size;
+
+			req->io_req.br_iovcnt++;
+
+			pthread_mutex_unlock(&req->mtx);
+		}
+	} else {
+		/* RAM buffer: read/write directly */
+		void *p = sc->nvstore.ctx;
+		void *gptr;
+
+		if ((lba + size) > sc->nvstore.size) {
+			WPRINTF(("%s I/O would overflow RAM backing\r\n",
+			    __func__));
+			return (-1);
+		}
+
+		p = (void *)((uintptr_t)p + (uintptr_t)lba);
+		gptr = paddr_guest2host(sc->nsc_pi->pi_vmctx, gpaddr, size);
+		if (do_write)
+			memcpy(p, gptr, size);
+		else
+			memcpy(gptr, p, size);
+	}
+	return (0);
+}
+
+static void
+pci_nvme_set_completion(struct pci_nvme_softc *sc,
+	struct nvme_submission_queue *sq, int sqid, uint16_t cid,
+	uint32_t cdw0, uint16_t status, int ignore_busy)
+{
+	struct nvme_completion_queue *cq = &sc->compl_queues[sq->cqid];
+	struct nvme_completion *compl;
+	int do_intr = 0;
+	int phase;
+
+	DPRINTF(("%s sqid %d cqid %u cid %u status: 0x%x 0x%x\r\n",
+	    __func__, sqid, sq->cqid, cid, NVME_STATUS_GET_SCT(status),
+	    NVME_STATUS_GET_SC(status)));
+
+	pthread_mutex_lock(&cq->mtx);
+
+	assert(cq->qbase != NULL);
+
+	compl = &cq->qbase[cq->tail];
+
+	compl->sqhd = atomic_load_acq_short(&sq->head);
+	compl->sqid = sqid;
+	compl->cid = cid;
+
+	/* toggle the phase tag relative to the slot's previous contents */
+	phase = NVME_STATUS_GET_P(compl->status);
+	compl->status = status;
+	pci_nvme_toggle_phase(&compl->status, phase);
+
+	cq->tail = (cq->tail + 1) % cq->size;
+
+	if (cq->intr_en & NVME_CQ_INTEN)
+		do_intr = 1;
+
+	pthread_mutex_unlock(&cq->mtx);
+
+	if (ignore_busy || !atomic_load_acq_int(&sq->busy))
+		if (do_intr)
+			pci_generate_msix(sc->nsc_pi, cq->intr_vec);
+}
+
+static void
+pci_nvme_release_ioreq(struct pci_nvme_softc *sc, struct pci_nvme_ioreq *req)
+{
+	req->sc = NULL;
+	req->nvme_sq = NULL;
+	req->sqid = 0;
+
+	pthread_mutex_lock(&sc->mtx);
+
+	req->next = sc->ioreqs_free;
+	sc->ioreqs_free = req;
+	sc->pending_ios--;
+
+	/* once no more I/O is pending, set RDY if the device is enabled */
+	if (sc->pending_ios == 0 &&
+	    NVME_CC_GET_EN(sc->regs.cc) && !(NVME_CSTS_GET_RDY(sc->regs.csts)))
+		sc->regs.csts |= NVME_CSTS_RDY;
+
+	pthread_mutex_unlock(&sc->mtx);
+
+	sem_post(&sc->iosemlock);
+}
+
+static struct pci_nvme_ioreq *
+pci_nvme_get_ioreq(struct pci_nvme_softc *sc)
+{
+	struct pci_nvme_ioreq *req = NULL;
+
+	sem_wait(&sc->iosemlock);
+	pthread_mutex_lock(&sc->mtx);
+
+	req = sc->ioreqs_free;
+	assert(req != NULL);
+
+	sc->ioreqs_free = req->next;
+
+	req->next = NULL;
+	req->sc = sc;
+
+	sc->pending_ios++;
+
+	pthread_mutex_unlock(&sc->mtx);
+
+	req->io_req.br_iovcnt = 0;
+	req->io_req.br_offset = 0;
+	req->io_req.br_resid = 0;
+	req->io_req.br_param = req;
+	req->prev_gpaddr = 0;
+	req->prev_size = 0;
+
+
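pci_nvme_set_completion above signals new completions with the Phase Tag (bit 0 of the STATUS field) rather than a device-published head pointer: each lap around the CQ flips P, so the guest knows a slot is fresh when its phase differs from the last value it observed there. A sketch of the publish step, assuming pci_nvme_toggle_phase() (defined earlier in this file) sets P to the inverse of the slot's previous value:

#include <stdint.h>

#define	STATUS_P	0x0001	/* P is bit 0 of the completion STATUS */

static void
publish_status(uint16_t *slot, uint16_t status)
{
	uint16_t prev = *slot & STATUS_P;

	/* keep everything from the new status except P, then flip P */
	*slot = (status & ~STATUS_P) | (prev ^ STATUS_P);
}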
return req; +} + +static void +pci_nvme_io_done(struct blockif_req *br, int err) +{ + struct pci_nvme_ioreq *req = br->br_param; + struct nvme_submission_queue *sq = req->nvme_sq; + uint16_t code, status = 0; + + DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err))); + + /* TODO return correct error */ + code = err ? NVME_SC_DATA_TRANSFER_ERROR : NVME_SC_SUCCESS; + pci_nvme_status_genc(&status, code); + + pci_nvme_set_completion(req->sc, sq, req->sqid, req->cid, 0, status, 0); + pci_nvme_release_ioreq(req->sc, req); +} + +static void +pci_nvme_io_partial(struct blockif_req *br, int err) +{ + struct pci_nvme_ioreq *req = br->br_param; + + DPRINTF(("%s error %d %s\r\n", __func__, err, strerror(err))); + + pthread_cond_signal(&req->cv); +} + + +static void +pci_nvme_handle_io_cmd(struct pci_nvme_softc* sc, uint16_t idx) +{ + struct nvme_submission_queue *sq; + uint16_t status = 0; + uint16_t sqhead; + int err; + + /* handle all submissions up to sq->tail index */ + sq = &sc->submit_queues[idx]; + + if (atomic_testandset_int(&sq->busy, 1)) { + DPRINTF(("%s sqid %u busy\r\n", __func__, idx)); + return; + } + + sqhead = atomic_load_acq_short(&sq->head); + + DPRINTF(("nvme_handle_io qid %u head %u tail %u cmdlist %p\r\n", + idx, sqhead, sq->tail, sq->qbase)); + + while (sqhead != atomic_load_acq_short(&sq->tail)) { + struct nvme_command *cmd; + struct pci_nvme_ioreq *req = NULL; + uint64_t lba; + uint64_t nblocks, bytes, size, cpsz; + + /* TODO: support scatter gather list handling */ + + cmd = &sq->qbase[sqhead]; + sqhead = (sqhead + 1) % sq->size; + + lba = ((uint64_t)cmd->cdw11 << 32) | cmd->cdw10; + + if (cmd->opc == NVME_OPC_FLUSH) { + pci_nvme_status_genc(&status, NVME_SC_SUCCESS); + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } else if (cmd->opc == 0x08) { + /* TODO: write zeroes */ + WPRINTF(("%s write zeroes lba 0x%lx blocks %u\r\n", + __func__, lba, cmd->cdw12 & 0xFFFF)); + pci_nvme_status_genc(&status, NVME_SC_SUCCESS); + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } + + nblocks = (cmd->cdw12 & 0xFFFF) + 1; + + bytes = nblocks * sc->nvstore.sectsz; + + if (sc->nvstore.type == NVME_STOR_BLOCKIF) { + req = pci_nvme_get_ioreq(sc); + req->nvme_sq = sq; + req->sqid = idx; + } + + /* + * If data starts mid-page and flows into the next page, then + * increase page count + */ + + DPRINTF(("[h%u:t%u:n%u] %s starting LBA 0x%lx blocks %lu " + "(%lu-bytes)\r\n", + sqhead==0 ? sq->size-1 : sqhead-1, sq->tail, sq->size, + cmd->opc == NVME_OPC_WRITE ? 
+ "WRITE" : "READ", + lba, nblocks, bytes)); + + cmd->prp1 &= ~(0x03UL); + cmd->prp2 &= ~(0x03UL); + + DPRINTF((" prp1 0x%lx prp2 0x%lx\r\n", cmd->prp1, cmd->prp2)); + + size = bytes; + lba *= sc->nvstore.sectsz; + + cpsz = PAGE_SIZE - (cmd->prp1 % PAGE_SIZE); + + if (cpsz > bytes) + cpsz = bytes; + + if (req != NULL) { + req->io_req.br_offset = ((uint64_t)cmd->cdw11 << 32) | + cmd->cdw10; + req->opc = cmd->opc; + req->cid = cmd->cid; + req->nsid = cmd->nsid; + } + + err = pci_nvme_append_iov_req(sc, req, cmd->prp1, cpsz, + cmd->opc == NVME_OPC_WRITE, lba); + lba += cpsz; + size -= cpsz; + + if (size == 0) + goto iodone; + + if (size <= PAGE_SIZE) { + /* prp2 is second (and final) page in transfer */ + + err = pci_nvme_append_iov_req(sc, req, cmd->prp2, + size, + cmd->opc == NVME_OPC_WRITE, + lba); + } else { + uint64_t *prp_list; + int i; + + /* prp2 is pointer to a physical region page list */ + prp_list = paddr_guest2host(sc->nsc_pi->pi_vmctx, + cmd->prp2, PAGE_SIZE); + + i = 0; + while (size != 0) { + cpsz = MIN(size, PAGE_SIZE); + + /* + * Move to linked physical region page list + * in last item. + */ + if (i == (NVME_PRP2_ITEMS-1) && + size > PAGE_SIZE) { + assert((prp_list[i] & (PAGE_SIZE-1)) == 0); + prp_list = paddr_guest2host( + sc->nsc_pi->pi_vmctx, + prp_list[i], PAGE_SIZE); + i = 0; + } + if (prp_list[i] == 0) { + WPRINTF(("PRP2[%d] = 0 !!!\r\n", i)); + err = 1; + break; + } + + err = pci_nvme_append_iov_req(sc, req, + prp_list[i], cpsz, + cmd->opc == NVME_OPC_WRITE, lba); + if (err) + break; + + lba += cpsz; + size -= cpsz; + i++; + } + } + +iodone: + if (sc->nvstore.type == NVME_STOR_RAM) { + uint16_t code, status = 0; + + code = err ? NVME_SC_LBA_OUT_OF_RANGE : + NVME_SC_SUCCESS; + pci_nvme_status_genc(&status, code); + + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + + continue; + } + + + if (err) + goto do_error; + + req->io_req.br_callback = pci_nvme_io_done; + + err = 0; + switch (cmd->opc) { + case NVME_OPC_READ: + err = blockif_read(sc->nvstore.ctx, &req->io_req); + break; + case NVME_OPC_WRITE: + err = blockif_write(sc->nvstore.ctx, &req->io_req); + break; + default: + WPRINTF(("%s unhandled io command 0x%x\r\n", + __func__, cmd->opc)); + err = 1; + } + +do_error: + if (err) { + uint16_t status = 0; + + pci_nvme_status_genc(&status, + NVME_SC_DATA_TRANSFER_ERROR); + + pci_nvme_set_completion(sc, sq, idx, cmd->cid, 0, + status, 1); + pci_nvme_release_ioreq(sc, req); + } + } + + atomic_store_short(&sq->head, sqhead); + atomic_store_int(&sq->busy, 0); +} + +static void +pci_nvme_handle_doorbell(struct vmctx *ctx, struct pci_nvme_softc* sc, + uint64_t idx, int is_sq, uint64_t value) +{ + DPRINTF(("nvme doorbell %lu, %s, val 0x%lx\r\n", + idx, is_sq ? "SQ" : "CQ", value & 0xFFFF)); + + if (is_sq) { + atomic_store_short(&sc->submit_queues[idx].tail, + (uint16_t)value); + + if (idx == 0) { + pci_nvme_handle_admin_cmd(sc, value); + } else { + /* submission queue; handle new entries in SQ */ + if (idx > sc->num_squeues) { + WPRINTF(("%s SQ index %lu overflow from " + "guest (max %u)\r\n", + __func__, idx, sc->num_squeues)); + return; + } + pci_nvme_handle_io_cmd(sc, (uint16_t)idx); + } + } else { + if (idx > sc->num_cqueues) { + WPRINTF(("%s queue index %lu overflow from " + "guest (max %u)\r\n", + __func__, idx, sc->num_cqueues)); + return; + } + + sc->compl_queues[idx].head = (uint16_t)value; + } +} + +static void +pci_nvme_bar0_reg_dumps(const char *func, uint64_t offset, int iswrite) +{ + const char *s = iswrite ? 
"WRITE" : "READ"; + + switch (offset) { + case NVME_CR_CAP_LOW: + DPRINTF(("%s %s NVME_CR_CAP_LOW\r\n", func, s)); + break; + case NVME_CR_CAP_HI: + DPRINTF(("%s %s NVME_CR_CAP_HI\r\n", func, s)); + break; + case NVME_CR_VS: + DPRINTF(("%s %s NVME_CR_VS\r\n", func, s)); + break; + case NVME_CR_INTMS: + DPRINTF(("%s %s NVME_CR_INTMS\r\n", func, s)); + break; + case NVME_CR_INTMC: + DPRINTF(("%s %s NVME_CR_INTMC\r\n", func, s)); + break; + case NVME_CR_CC: + DPRINTF(("%s %s NVME_CR_CC\r\n", func, s)); + break; + case NVME_CR_CSTS: + DPRINTF(("%s %s NVME_CR_CSTS\r\n", func, s)); + break; + case NVME_CR_NSSR: + DPRINTF(("%s %s NVME_CR_NSSR\r\n", func, s)); + break; + case NVME_CR_AQA: + DPRINTF(("%s %s NVME_CR_AQA\r\n", func, s)); + break; + case NVME_CR_ASQ_LOW: + DPRINTF(("%s %s NVME_CR_ASQ_LOW\r\n", func, s)); + break; + case NVME_CR_ASQ_HI: + DPRINTF(("%s %s NVME_CR_ASQ_HI\r\n", func, s)); + break; + case NVME_CR_ACQ_LOW: + DPRINTF(("%s %s NVME_CR_ACQ_LOW\r\n", func, s)); + break; + case NVME_CR_ACQ_HI: + DPRINTF(("%s %s NVME_CR_ACQ_HI\r\n", func, s)); + break; + default: + DPRINTF(("unknown nvme bar-0 offset 0x%lx\r\n", offset)); + } + +} + +static void +pci_nvme_write_bar_0(struct vmctx *ctx, struct pci_nvme_softc* sc, + uint64_t offset, int size, uint64_t value) +{ + uint32_t ccreg; + + if (offset >= NVME_DOORBELL_OFFSET) { + uint64_t belloffset = offset - NVME_DOORBELL_OFFSET; + uint64_t idx = belloffset / 8; /* door bell size = 2*int */ + int is_sq = (belloffset % 8) < 4; + + if (belloffset > ((sc->max_queues+1) * 8 - 4)) { + WPRINTF(("guest attempted an overflow write offset " + "0x%lx, val 0x%lx in %s", + offset, value, __func__)); + return; + } + + pci_nvme_handle_doorbell(ctx, sc, idx, is_sq, value); + return; + } + + DPRINTF(("nvme-write offset 0x%lx, size %d, value 0x%lx\r\n", + offset, size, value)); + + if (size != 4) { + WPRINTF(("guest wrote invalid size %d (offset 0x%lx, " + "val 0x%lx) to bar0 in %s", + size, offset, value, __func__)); + /* TODO: shutdown device */ + return; + } + + pci_nvme_bar0_reg_dumps(__func__, offset, 1); + + pthread_mutex_lock(&sc->mtx); + + switch (offset) { + case NVME_CR_CAP_LOW: + case NVME_CR_CAP_HI: + /* readonly */ + break; + case NVME_CR_VS: + /* readonly */ + break; + case NVME_CR_INTMS: + /* MSI-X, so ignore */ + break; + case NVME_CR_INTMC: + /* MSI-X, so ignore */ + break; + case NVME_CR_CC: + ccreg = (uint32_t)value; + + DPRINTF(("%s NVME_CR_CC en %x css %x shn %x iosqes %u " + "iocqes %u\r\n", + __func__, + NVME_CC_GET_EN(ccreg), NVME_CC_GET_CSS(ccreg), + NVME_CC_GET_SHN(ccreg), NVME_CC_GET_IOSQES(ccreg), + NVME_CC_GET_IOCQES(ccreg))); + + if (NVME_CC_GET_SHN(ccreg)) { + /* perform shutdown - flush out data to backend */ + sc->regs.csts &= ~(NVME_CSTS_REG_SHST_MASK << + NVME_CSTS_REG_SHST_SHIFT); + sc->regs.csts |= NVME_SHST_COMPLETE << + NVME_CSTS_REG_SHST_SHIFT; + } + if (NVME_CC_GET_EN(ccreg) != NVME_CC_GET_EN(sc->regs.cc)) { + if (NVME_CC_GET_EN(ccreg) == 0) + /* transition 1-> causes controller reset */ + pci_nvme_reset_locked(sc); + else + pci_nvme_init_controller(ctx, sc); + } + + /* Insert the iocqes, iosqes and en bits from the write */ + sc->regs.cc &= ~NVME_CC_WRITE_MASK; + sc->regs.cc |= ccreg & NVME_CC_WRITE_MASK; + if (NVME_CC_GET_EN(ccreg) == 0) { + /* Insert the ams, mps and css bit fields */ + sc->regs.cc &= ~NVME_CC_NEN_WRITE_MASK; + sc->regs.cc |= ccreg & NVME_CC_NEN_WRITE_MASK; + sc->regs.csts &= ~NVME_CSTS_RDY; + } else if (sc->pending_ios == 0) { + sc->regs.csts |= NVME_CSTS_RDY; + } + break; + case NVME_CR_CSTS: 
+ break; + case NVME_CR_NSSR: + /* ignore writes; don't support subsystem reset */ + break; + case NVME_CR_AQA: + sc->regs.aqa = (uint32_t)value; + break; + case NVME_CR_ASQ_LOW: + sc->regs.asq = (sc->regs.asq & (0xFFFFFFFF00000000)) | + (0xFFFFF000 & value); + break; + case NVME_CR_ASQ_HI: + sc->regs.asq = (sc->regs.asq & (0x00000000FFFFFFFF)) | + (value << 32); + break; + case NVME_CR_ACQ_LOW: + sc->regs.acq = (sc->regs.acq & (0xFFFFFFFF00000000)) | + (0xFFFFF000 & value); + break; + case NVME_CR_ACQ_HI: + sc->regs.acq = (sc->regs.acq & (0x00000000FFFFFFFF)) | + (value << 32); + break; + default: + DPRINTF(("%s unknown offset 0x%lx, value 0x%lx size %d\r\n", + __func__, offset, value, size)); + } + pthread_mutex_unlock(&sc->mtx); +} + +static void +pci_nvme_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_nvme_softc* sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + DPRINTF(("nvme-write baridx %d, msix: off 0x%lx, size %d, " + " value 0x%lx\r\n", baridx, offset, size, value)); + + pci_emul_msix_twrite(pi, offset, size, value); + return; + } + + switch (baridx) { + case 0: + pci_nvme_write_bar_0(ctx, sc, offset, size, value); + break; + + default: + DPRINTF(("%s unknown baridx %d, val 0x%lx\r\n", + __func__, baridx, value)); + } +} + +static uint64_t pci_nvme_read_bar_0(struct pci_nvme_softc* sc, + uint64_t offset, int size) +{ + uint64_t value; + + pci_nvme_bar0_reg_dumps(__func__, offset, 0); + + if (offset < NVME_DOORBELL_OFFSET) { + void *p = &(sc->regs); + pthread_mutex_lock(&sc->mtx); + memcpy(&value, (void *)((uintptr_t)p + offset), size); + pthread_mutex_unlock(&sc->mtx); + } else { + value = 0; + WPRINTF(("pci_nvme: read invalid offset %ld\r\n", offset)); + } + + switch (size) { + case 1: + value &= 0xFF; + break; + case 2: + value &= 0xFFFF; + break; + case 4: + value &= 0xFFFFFFFF; + break; + } + + DPRINTF((" nvme-read offset 0x%lx, size %d -> value 0x%x\r\n", + offset, size, (uint32_t)value)); + + return (value); +} + + + +static uint64_t +pci_nvme_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_nvme_softc* sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi) || + baridx == pci_msix_pba_bar(pi)) { + DPRINTF(("nvme-read bar: %d, msix: regoff 0x%lx, size %d\r\n", + baridx, offset, size)); + + return pci_emul_msix_tread(pi, offset, size); + } + + switch (baridx) { + case 0: + return pci_nvme_read_bar_0(sc, offset, size); + + default: + DPRINTF(("unknown bar %d, 0x%lx\r\n", baridx, offset)); + } + + return (0); +} + + +static int +pci_nvme_parse_opts(struct pci_nvme_softc *sc, char *opts) +{ + char bident[sizeof("XX:X:X")]; + char *uopt, *xopts, *config; + uint32_t sectsz; + int optidx; + + sc->max_queues = NVME_QUEUES; + sc->max_qentries = NVME_MAX_QENTRIES; + sc->ioslots = NVME_IOSLOTS; + sc->num_squeues = sc->max_queues; + sc->num_cqueues = sc->max_queues; + sectsz = 0; + + uopt = strdup(opts); + optidx = 0; + snprintf(sc->ctrldata.sn, sizeof(sc->ctrldata.sn), + "NVME-%d-%d", sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); + for (xopts = strtok(uopt, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + + if ((config = strchr(xopts, '=')) != NULL) + *config++ = '\0'; + + if (!strcmp("maxq", xopts)) { + sc->max_queues = atoi(config); + } else if (!strcmp("qsz", xopts)) { + sc->max_qentries = atoi(config); + } else if (!strcmp("ioslots", xopts)) { + sc->ioslots = atoi(config); + } 
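In pci_nvme_write_bar_0 above, only the edges of CC.EN matter (the truncated comment "transition 1->" reads "transition 1->0" in full): 0->1 latches AQA/ASQ/ACQ via pci_nvme_init_controller and, once no I/O is pending, sets CSTS.RDY, while 1->0 resets the controller; rewriting the same value is a no-op. Distilled to the edge logic:

static void
cc_en_edge(int old_en, int new_en, void (*reset)(void), void (*init)(void))
{
	if (old_en == new_en)
		return;		/* no transition, nothing to do */
	if (new_en == 0)
		reset();	/* 1 -> 0: controller reset */
	else
		init();		/* 0 -> 1: map admin queues, go ready */
}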
else if (!strcmp("sectsz", xopts)) { + sectsz = atoi(config); + } else if (!strcmp("ser", xopts)) { + /* + * This field indicates the Product Serial Number in + * 7-bit ASCII, unused bytes should be space characters. + * Ref: NVMe v1.3c. + */ + cpywithpad((char *)sc->ctrldata.sn, + sizeof(sc->ctrldata.sn), config, ' '); + } else if (!strcmp("ram", xopts)) { + uint64_t sz = strtoull(&xopts[4], NULL, 10); + + sc->nvstore.type = NVME_STOR_RAM; + sc->nvstore.size = sz * 1024 * 1024; + sc->nvstore.ctx = calloc(1, sc->nvstore.size); + sc->nvstore.sectsz = 4096; + sc->nvstore.sectsz_bits = 12; + if (sc->nvstore.ctx == NULL) { + perror("Unable to allocate RAM"); + free(uopt); + return (-1); + } + } else if (optidx == 0) { + snprintf(bident, sizeof(bident), "%d:%d", + sc->nsc_pi->pi_slot, sc->nsc_pi->pi_func); + sc->nvstore.ctx = blockif_open(xopts, bident); + if (sc->nvstore.ctx == NULL) { + perror("Could not open backing file"); + free(uopt); + return (-1); + } + sc->nvstore.type = NVME_STOR_BLOCKIF; + sc->nvstore.size = blockif_size(sc->nvstore.ctx); + } else { + fprintf(stderr, "Invalid option %s\n", xopts); + free(uopt); + return (-1); + } + + optidx++; + } + free(uopt); + + if (sc->nvstore.ctx == NULL || sc->nvstore.size == 0) { + fprintf(stderr, "backing store not specified\n"); + return (-1); + } + if (sectsz == 512 || sectsz == 4096 || sectsz == 8192) + sc->nvstore.sectsz = sectsz; + else if (sc->nvstore.type != NVME_STOR_RAM) + sc->nvstore.sectsz = blockif_sectsz(sc->nvstore.ctx); + for (sc->nvstore.sectsz_bits = 9; + (1 << sc->nvstore.sectsz_bits) < sc->nvstore.sectsz; + sc->nvstore.sectsz_bits++); + + if (sc->max_queues <= 0 || sc->max_queues > NVME_QUEUES) + sc->max_queues = NVME_QUEUES; + + if (sc->max_qentries <= 0) { + fprintf(stderr, "Invalid qsz option\n"); + return (-1); + } + if (sc->ioslots <= 0) { + fprintf(stderr, "Invalid ioslots option\n"); + return (-1); + } + + return (0); +} + +static int +pci_nvme_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_nvme_softc *sc; + uint32_t pci_membar_sz; + int error; + + error = 0; + + sc = calloc(1, sizeof(struct pci_nvme_softc)); + pi->pi_arg = sc; + sc->nsc_pi = pi; + + error = pci_nvme_parse_opts(sc, opts); + if (error < 0) + goto done; + else + error = 0; + + sc->ioreqs = calloc(sc->ioslots, sizeof(struct pci_nvme_ioreq)); + for (int i = 0; i < sc->ioslots; i++) { + if (i < (sc->ioslots-1)) + sc->ioreqs[i].next = &sc->ioreqs[i+1]; + pthread_mutex_init(&sc->ioreqs[i].mtx, NULL); + pthread_cond_init(&sc->ioreqs[i].cv, NULL); + } + sc->ioreqs_free = sc->ioreqs; + sc->intr_coales_aggr_thresh = 1; + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0A0A); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0xFB5D); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_NVM); + pci_set_cfgdata8(pi, PCIR_PROGIF, + PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0); + + /* + * Allocate size of NVMe registers + doorbell space for all queues. + * + * The specification requires a minimum memory I/O window size of 16K. + * The Windows driver will refuse to start a device with a smaller + * window. 
+ */ + pci_membar_sz = sizeof(struct nvme_registers) + + 2 * sizeof(uint32_t) * (sc->max_queues + 1); + pci_membar_sz = MAX(pci_membar_sz, NVME_MMIO_SPACE_MIN); + + DPRINTF(("nvme membar size: %u\r\n", pci_membar_sz)); + + error = pci_emul_alloc_bar(pi, 0, PCIBAR_MEM64, pci_membar_sz); + if (error) { + WPRINTF(("%s pci alloc mem bar failed\r\n", __func__)); + goto done; + } + + error = pci_emul_add_msixcap(pi, sc->max_queues + 1, NVME_MSIX_BAR); + if (error) { + WPRINTF(("%s pci add msixcap failed\r\n", __func__)); + goto done; + } + + pthread_mutex_init(&sc->mtx, NULL); + sem_init(&sc->iosemlock, 0, sc->ioslots); + + pci_nvme_reset(sc); + pci_nvme_init_ctrldata(sc); + pci_nvme_init_nsdata(sc); + pci_nvme_init_logpages(sc); + + pci_lintr_request(pi); + +done: + return (error); +} + + +struct pci_devemu pci_de_nvme = { + .pe_emu = "nvme", + .pe_init = pci_nvme_init, + .pe_barwrite = pci_nvme_write, + .pe_barread = pci_nvme_read +}; +PCI_EMUL_SET(pci_de_nvme); diff --git a/usr/src/cmd/bhyve/pci_passthru.c b/usr/src/cmd/bhyve/pci_passthru.c new file mode 100644 index 0000000000..d2c69e795c --- /dev/null +++ b/usr/src/cmd/bhyve/pci_passthru.c @@ -0,0 +1,937 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/types.h> +#include <sys/mman.h> +#include <sys/pciio.h> +#include <sys/ioctl.h> + +#include <dev/io/iodev.h> +#include <dev/pci/pcireg.h> + +#include <machine/iodev.h> + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <sysexits.h> +#include <unistd.h> + +#include <machine/vmm.h> +#include <vmmapi.h> +#include "pci_emul.h" +#include "mem.h" + +#ifndef _PATH_DEVPCI +#define _PATH_DEVPCI "/dev/pci" +#endif + +#ifndef _PATH_DEVIO +#define _PATH_DEVIO "/dev/io" +#endif + +#ifndef _PATH_MEM +#define _PATH_MEM "/dev/mem" +#endif + +#define LEGACY_SUPPORT 1 + +#define MSIX_TABLE_COUNT(ctrl) (((ctrl) & PCIM_MSIXCTRL_TABLE_SIZE) + 1) +#define MSIX_CAPLEN 12 + +static int pcifd = -1; +static int iofd = -1; +static int memfd = -1; + +struct passthru_softc { + struct pci_devinst *psc_pi; + struct pcibar psc_bar[PCI_BARMAX + 1]; + struct { + int capoff; + int msgctrl; + int emulated; + } psc_msi; + struct { + int capoff; + } psc_msix; + struct pcisel psc_sel; +}; + +static int +msi_caplen(int msgctrl) +{ + int len; + + len = 10; /* minimum length of msi capability */ + + if (msgctrl & PCIM_MSICTRL_64BIT) + len += 4; + +#if 0 + /* + * Ignore the 'mask' and 'pending' bits in the MSI capability. + * We'll let the guest manipulate them directly. + */ + if (msgctrl & PCIM_MSICTRL_VECTOR) + len += 10; +#endif + + return (len); +} + +static uint32_t +read_config(const struct pcisel *sel, long reg, int width) +{ + struct pci_io pi; + + bzero(&pi, sizeof(pi)); + pi.pi_sel = *sel; + pi.pi_reg = reg; + pi.pi_width = width; + + if (ioctl(pcifd, PCIOCREAD, &pi) < 0) + return (0); /* XXX */ + else + return (pi.pi_data); +} + +static void +write_config(const struct pcisel *sel, long reg, int width, uint32_t data) +{ + struct pci_io pi; + + bzero(&pi, sizeof(pi)); + pi.pi_sel = *sel; + pi.pi_reg = reg; + pi.pi_width = width; + pi.pi_data = data; + + (void)ioctl(pcifd, PCIOCWRITE, &pi); /* XXX */ +} + +#ifdef LEGACY_SUPPORT +static int +passthru_add_msicap(struct pci_devinst *pi, int msgnum, int nextptr) +{ + int capoff, i; + struct msicap msicap; + u_char *capdata; + + pci_populate_msicap(&msicap, msgnum, nextptr); + + /* + * XXX + * Copy the msi capability structure in the last 16 bytes of the + * config space. This is wrong because it could shadow something + * useful to the device. + */ + capoff = 256 - roundup(sizeof(msicap), 4); + capdata = (u_char *)&msicap; + for (i = 0; i < sizeof(msicap); i++) + pci_set_cfgdata8(pi, capoff + i, capdata[i]); + + return (capoff); +} +#endif /* LEGACY_SUPPORT */ + +static int +cfginitmsi(struct passthru_softc *sc) +{ + int i, ptr, capptr, cap, sts, caplen, table_size; + uint32_t u32; + struct pcisel sel; + struct pci_devinst *pi; + struct msixcap msixcap; + uint32_t *msixcap_ptr; + + pi = sc->psc_pi; + sel = sc->psc_sel; + + /* + * Parse the capabilities and cache the location of the MSI + * and MSI-X capabilities. 
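read_config/write_config above funnel every physical config-space access through /dev/pci's PCIOCREAD/PCIOCWRITE ioctls, keyed by a struct pcisel naming bus/device/function. For instance, fetching the passthrough device's vendor/device dword with the helper above would look like this (PCIR_DEVVENDOR is config offset 0x0, from the pcireg.h already included here):

static uint32_t
pt_devvendor(const struct pcisel *sel)
{
	return (read_config(sel, PCIR_DEVVENDOR, 4));
}

Note that read_config deliberately returns 0 on ioctl failure (hence the XXX), so callers cannot distinguish an absent register from a read error.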
+ */ + sts = read_config(&sel, PCIR_STATUS, 2); + if (sts & PCIM_STATUS_CAPPRESENT) { + ptr = read_config(&sel, PCIR_CAP_PTR, 1); + while (ptr != 0 && ptr != 0xff) { + cap = read_config(&sel, ptr + PCICAP_ID, 1); + if (cap == PCIY_MSI) { + /* + * Copy the MSI capability into the config + * space of the emulated pci device + */ + sc->psc_msi.capoff = ptr; + sc->psc_msi.msgctrl = read_config(&sel, + ptr + 2, 2); + sc->psc_msi.emulated = 0; + caplen = msi_caplen(sc->psc_msi.msgctrl); + capptr = ptr; + while (caplen > 0) { + u32 = read_config(&sel, capptr, 4); + pci_set_cfgdata32(pi, capptr, u32); + caplen -= 4; + capptr += 4; + } + } else if (cap == PCIY_MSIX) { + /* + * Copy the MSI-X capability + */ + sc->psc_msix.capoff = ptr; + caplen = 12; + msixcap_ptr = (uint32_t*) &msixcap; + capptr = ptr; + while (caplen > 0) { + u32 = read_config(&sel, capptr, 4); + *msixcap_ptr = u32; + pci_set_cfgdata32(pi, capptr, u32); + caplen -= 4; + capptr += 4; + msixcap_ptr++; + } + } + ptr = read_config(&sel, ptr + PCICAP_NEXTPTR, 1); + } + } + + if (sc->psc_msix.capoff != 0) { + pi->pi_msix.pba_bar = + msixcap.pba_info & PCIM_MSIX_BIR_MASK; + pi->pi_msix.pba_offset = + msixcap.pba_info & ~PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_bar = + msixcap.table_info & PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_offset = + msixcap.table_info & ~PCIM_MSIX_BIR_MASK; + pi->pi_msix.table_count = MSIX_TABLE_COUNT(msixcap.msgctrl); + pi->pi_msix.pba_size = PBA_SIZE(pi->pi_msix.table_count); + + /* Allocate the emulated MSI-X table array */ + table_size = pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; + pi->pi_msix.table = calloc(1, table_size); + + /* Mask all table entries */ + for (i = 0; i < pi->pi_msix.table_count; i++) { + pi->pi_msix.table[i].vector_control |= + PCIM_MSIX_VCTRL_MASK; + } + } + +#ifdef LEGACY_SUPPORT + /* + * If the passthrough device does not support MSI then craft a + * MSI capability for it. We link the new MSI capability at the + * head of the list of capabilities. 
+ */ + if ((sts & PCIM_STATUS_CAPPRESENT) != 0 && sc->psc_msi.capoff == 0) { + int origptr, msiptr; + origptr = read_config(&sel, PCIR_CAP_PTR, 1); + msiptr = passthru_add_msicap(pi, 1, origptr); + sc->psc_msi.capoff = msiptr; + sc->psc_msi.msgctrl = pci_get_cfgdata16(pi, msiptr + 2); + sc->psc_msi.emulated = 1; + pci_set_cfgdata8(pi, PCIR_CAP_PTR, msiptr); + } +#endif + + /* Make sure one of the capabilities is present */ + if (sc->psc_msi.capoff == 0 && sc->psc_msix.capoff == 0) + return (-1); + else + return (0); +} + +static uint64_t +msix_table_read(struct passthru_softc *sc, uint64_t offset, int size) +{ + struct pci_devinst *pi; + struct msix_table_entry *entry; + uint8_t *src8; + uint16_t *src16; + uint32_t *src32; + uint64_t *src64; + uint64_t data; + size_t entry_offset; + int index; + + pi = sc->psc_pi; + if (offset >= pi->pi_msix.pba_offset && + offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + switch(size) { + case 1: + src8 = (uint8_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src8; + break; + case 2: + src16 = (uint16_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src16; + break; + case 4: + src32 = (uint32_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src32; + break; + case 8: + src64 = (uint64_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + data = *src64; + break; + default: + return (-1); + } + return (data); + } + + if (offset < pi->pi_msix.table_offset) + return (-1); + + offset -= pi->pi_msix.table_offset; + index = offset / MSIX_TABLE_ENTRY_SIZE; + if (index >= pi->pi_msix.table_count) + return (-1); + + entry = &pi->pi_msix.table[index]; + entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + switch(size) { + case 1: + src8 = (uint8_t *)((void *)entry + entry_offset); + data = *src8; + break; + case 2: + src16 = (uint16_t *)((void *)entry + entry_offset); + data = *src16; + break; + case 4: + src32 = (uint32_t *)((void *)entry + entry_offset); + data = *src32; + break; + case 8: + src64 = (uint64_t *)((void *)entry + entry_offset); + data = *src64; + break; + default: + return (-1); + } + + return (data); +} + +static void +msix_table_write(struct vmctx *ctx, int vcpu, struct passthru_softc *sc, + uint64_t offset, int size, uint64_t data) +{ + struct pci_devinst *pi; + struct msix_table_entry *entry; + uint8_t *dest8; + uint16_t *dest16; + uint32_t *dest32; + uint64_t *dest64; + size_t entry_offset; + uint32_t vector_control; + int index; + + pi = sc->psc_pi; + if (offset >= pi->pi_msix.pba_offset && + offset < pi->pi_msix.pba_offset + pi->pi_msix.pba_size) { + switch(size) { + case 1: + dest8 = (uint8_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest8 = data; + break; + case 2: + dest16 = (uint16_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest16 = data; + break; + case 4: + dest32 = (uint32_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest32 = data; + break; + case 8: + dest64 = (uint64_t *)(pi->pi_msix.pba_page + offset - + pi->pi_msix.pba_page_offset); + *dest64 = data; + break; + default: + break; + } + return; + } + + if (offset < pi->pi_msix.table_offset) + return; + + offset -= pi->pi_msix.table_offset; + index = offset / MSIX_TABLE_ENTRY_SIZE; + if (index >= pi->pi_msix.table_count) + return; + + entry = &pi->pi_msix.table[index]; + entry_offset = offset % MSIX_TABLE_ENTRY_SIZE; + + /* Only 4 byte naturally-aligned writes are supported */ + 
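cfginitmsi above walks the standard PCI capability list to find the MSI and MSI-X structures, shadowing each into the emulated config space as it goes. The traversal pattern, isolated (reusing read_config and the pcireg.h constants from this file):

static int
find_cap(const struct pcisel *sel, int cap_id)
{
	int ptr;

	if ((read_config(sel, PCIR_STATUS, 2) & PCIM_STATUS_CAPPRESENT) == 0)
		return (0);

	ptr = read_config(sel, PCIR_CAP_PTR, 1);
	while (ptr != 0 && ptr != 0xff) {
		if (read_config(sel, ptr + PCICAP_ID, 1) == cap_id)
			return (ptr);
		ptr = read_config(sel, ptr + PCICAP_NEXTPTR, 1);
	}
	return (0);	/* capability not present */
}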
assert(size == 4); + assert(entry_offset % 4 == 0); + + vector_control = entry->vector_control; + dest32 = (uint32_t *)((void *)entry + entry_offset); + *dest32 = data; + /* If MSI-X hasn't been enabled, do nothing */ + if (pi->pi_msix.enabled) { + /* If the entry is masked, don't set it up */ + if ((entry->vector_control & PCIM_MSIX_VCTRL_MASK) == 0 || + (vector_control & PCIM_MSIX_VCTRL_MASK) == 0) { + (void)vm_setup_pptdev_msix(ctx, vcpu, + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, index, entry->addr, + entry->msg_data, entry->vector_control); + } + } +} + +static int +init_msix_table(struct vmctx *ctx, struct passthru_softc *sc, uint64_t base) +{ + int b, s, f; + int error, idx; + size_t len, remaining; + uint32_t table_size, table_offset; + uint32_t pba_size, pba_offset; + vm_paddr_t start; + struct pci_devinst *pi = sc->psc_pi; + + assert(pci_msix_table_bar(pi) >= 0 && pci_msix_pba_bar(pi) >= 0); + + b = sc->psc_sel.pc_bus; + s = sc->psc_sel.pc_dev; + f = sc->psc_sel.pc_func; + + /* + * If the MSI-X table BAR maps memory intended for + * other uses, it is at least assured that the table + * either resides in its own page within the region, + * or it resides in a page shared with only the PBA. + */ + table_offset = rounddown2(pi->pi_msix.table_offset, 4096); + + table_size = pi->pi_msix.table_offset - table_offset; + table_size += pi->pi_msix.table_count * MSIX_TABLE_ENTRY_SIZE; + table_size = roundup2(table_size, 4096); + + idx = pi->pi_msix.table_bar; + start = pi->pi_bar[idx].addr; + remaining = pi->pi_bar[idx].size; + + if (pi->pi_msix.pba_bar == pi->pi_msix.table_bar) { + pba_offset = pi->pi_msix.pba_offset; + pba_size = pi->pi_msix.pba_size; + if (pba_offset >= table_offset + table_size || + table_offset >= pba_offset + pba_size) { + /* + * If the PBA does not share a page with the MSI-x + * tables, no PBA emulation is required. + */ + pi->pi_msix.pba_page = NULL; + pi->pi_msix.pba_page_offset = 0; + } else { + /* + * The PBA overlaps with either the first or last + * page of the MSI-X table region. Map the + * appropriate page. 
+ */ + if (pba_offset <= table_offset) + pi->pi_msix.pba_page_offset = table_offset; + else + pi->pi_msix.pba_page_offset = table_offset + + table_size - 4096; + pi->pi_msix.pba_page = mmap(NULL, 4096, PROT_READ | + PROT_WRITE, MAP_SHARED, memfd, start + + pi->pi_msix.pba_page_offset); + if (pi->pi_msix.pba_page == MAP_FAILED) { + warn( + "Failed to map PBA page for MSI-X on %d/%d/%d", + b, s, f); + return (-1); + } + } + } + + /* Map everything before the MSI-X table */ + if (table_offset > 0) { + len = table_offset; + error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); + if (error) + return (error); + + base += len; + start += len; + remaining -= len; + } + + /* Skip the MSI-X table */ + base += table_size; + start += table_size; + remaining -= table_size; + + /* Map everything beyond the end of the MSI-X table */ + if (remaining > 0) { + len = remaining; + error = vm_map_pptdev_mmio(ctx, b, s, f, start, len, base); + if (error) + return (error); + } + + return (0); +} + +static int +cfginitbar(struct vmctx *ctx, struct passthru_softc *sc) +{ + int i, error; + struct pci_devinst *pi; + struct pci_bar_io bar; + enum pcibar_type bartype; + uint64_t base, size; + + pi = sc->psc_pi; + + /* + * Initialize BAR registers + */ + for (i = 0; i <= PCI_BARMAX; i++) { + bzero(&bar, sizeof(bar)); + bar.pbi_sel = sc->psc_sel; + bar.pbi_reg = PCIR_BAR(i); + + if (ioctl(pcifd, PCIOCGETBAR, &bar) < 0) + continue; + + if (PCI_BAR_IO(bar.pbi_base)) { + bartype = PCIBAR_IO; + base = bar.pbi_base & PCIM_BAR_IO_BASE; + } else { + switch (bar.pbi_base & PCIM_BAR_MEM_TYPE) { + case PCIM_BAR_MEM_64: + bartype = PCIBAR_MEM64; + break; + default: + bartype = PCIBAR_MEM32; + break; + } + base = bar.pbi_base & PCIM_BAR_MEM_BASE; + } + size = bar.pbi_length; + + if (bartype != PCIBAR_IO) { + if (((base | size) & PAGE_MASK) != 0) { + warnx("passthru device %d/%d/%d BAR %d: " + "base %#lx or size %#lx not page aligned\n", + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, i, base, size); + return (-1); + } + } + + /* Cache information about the "real" BAR */ + sc->psc_bar[i].type = bartype; + sc->psc_bar[i].size = size; + sc->psc_bar[i].addr = base; + + /* Allocate the BAR in the guest I/O or MMIO space */ + error = pci_emul_alloc_pbar(pi, i, base, bartype, size); + if (error) + return (-1); + + /* The MSI-X table needs special handling */ + if (i == pci_msix_table_bar(pi)) { + error = init_msix_table(ctx, sc, base); + if (error) + return (-1); + } else if (bartype != PCIBAR_IO) { + /* Map the physical BAR in the guest MMIO space */ + error = vm_map_pptdev_mmio(ctx, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, + pi->pi_bar[i].addr, pi->pi_bar[i].size, base); + if (error) + return (-1); + } + + /* + * 64-bit BAR takes up two slots so skip the next one. 
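init_msix_table above maps the passthrough BAR into the guest in up to two pieces, leaving a page-aligned hole over the MSI-X table so those accesses trap into msix_table_read/msix_table_write, with the PBA page handled specially when it shares a page with the table. The hole arithmetic, isolated (MSIX_TABLE_ENTRY_SIZE is 16 bytes):

#include <stdint.h>

static void
msix_hole(uint32_t raw_off, uint32_t nentries, uint32_t *off, uint32_t *len)
{
	*off = raw_off & ~(uint32_t)4095;		/* rounddown2(x, 4096) */
	*len = (raw_off - *off) + nentries * 16;	/* in-page slack + table */
	*len = (*len + 4095) & ~(uint32_t)4095;		/* roundup2(x, 4096) */
}

Everything below *off and everything at or beyond *off + *len is passed straight through with vm_map_pptdev_mmio().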
+ */ + if (bartype == PCIBAR_MEM64) { + i++; + assert(i <= PCI_BARMAX); + sc->psc_bar[i].type = PCIBAR_MEMHI64; + } + } + return (0); +} + +static int +cfginit(struct vmctx *ctx, struct pci_devinst *pi, int bus, int slot, int func) +{ + int error; + struct passthru_softc *sc; + + error = 1; + sc = pi->pi_arg; + + bzero(&sc->psc_sel, sizeof(struct pcisel)); + sc->psc_sel.pc_bus = bus; + sc->psc_sel.pc_dev = slot; + sc->psc_sel.pc_func = func; + + if (cfginitmsi(sc) != 0) { + warnx("failed to initialize MSI for PCI %d/%d/%d", + bus, slot, func); + goto done; + } + + if (cfginitbar(ctx, sc) != 0) { + warnx("failed to initialize BARs for PCI %d/%d/%d", + bus, slot, func); + goto done; + } + + error = 0; /* success */ +done: + return (error); +} + +static int +passthru_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + int bus, slot, func, error, memflags; + struct passthru_softc *sc; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t pci_ioctls[] = { PCIOCREAD, PCIOCWRITE, PCIOCGETBAR }; + cap_ioctl_t io_ioctls[] = { IODEV_PIO }; +#endif + + sc = NULL; + error = 1; + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_IOCTL, CAP_READ, CAP_WRITE); +#endif + + memflags = vm_get_memflags(ctx); + if (!(memflags & VM_MEM_F_WIRED)) { + warnx("passthru requires guest memory to be wired"); + goto done; + } + + if (pcifd < 0) { + pcifd = open(_PATH_DEVPCI, O_RDWR, 0); + if (pcifd < 0) { + warn("failed to open %s", _PATH_DEVPCI); + goto done; + } + } + +#ifndef WITHOUT_CAPSICUM + if (cap_rights_limit(pcifd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (cap_ioctls_limit(pcifd, pci_ioctls, nitems(pci_ioctls)) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + if (iofd < 0) { + iofd = open(_PATH_DEVIO, O_RDWR, 0); + if (iofd < 0) { + warn("failed to open %s", _PATH_DEVIO); + goto done; + } + } + +#ifndef WITHOUT_CAPSICUM + if (cap_rights_limit(iofd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (cap_ioctls_limit(iofd, io_ioctls, nitems(io_ioctls)) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + if (memfd < 0) { + memfd = open(_PATH_MEM, O_RDWR, 0); + if (memfd < 0) { + warn("failed to open %s", _PATH_MEM); + goto done; + } + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_clear(&rights, CAP_IOCTL); + cap_rights_set(&rights, CAP_MMAP_RW); + if (cap_rights_limit(memfd, &rights) == -1 && errno != ENOSYS) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + if (opts == NULL || + sscanf(opts, "%d/%d/%d", &bus, &slot, &func) != 3) { + warnx("invalid passthru options"); + goto done; + } + + if (vm_assign_pptdev(ctx, bus, slot, func) != 0) { + warnx("PCI device at %d/%d/%d is not using the ppt(4) driver", + bus, slot, func); + goto done; + } + + sc = calloc(1, sizeof(struct passthru_softc)); + + pi->pi_arg = sc; + sc->psc_pi = pi; + + /* initialize config space */ + if ((error = cfginit(ctx, pi, bus, slot, func)) != 0) + goto done; + + error = 0; /* success */ +done: + if (error) { + free(sc); + vm_unassign_pptdev(ctx, bus, slot, func); + } + return (error); +} + +static int +bar_access(int coff) +{ + if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) + return (1); + else + return (0); +} + +static int +msicap_access(struct passthru_softc *sc, int coff) +{ + int caplen; + + if (sc->psc_msi.capoff == 0) + return (0); + + caplen = msi_caplen(sc->psc_msi.msgctrl); + + if (coff >= 
sc->psc_msi.capoff && coff < sc->psc_msi.capoff + caplen) + return (1); + else + return (0); +} + +static int +msixcap_access(struct passthru_softc *sc, int coff) +{ + if (sc->psc_msix.capoff == 0) + return (0); + + return (coff >= sc->psc_msix.capoff && + coff < sc->psc_msix.capoff + MSIX_CAPLEN); +} + +static int +passthru_cfgread(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t *rv) +{ + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs and MSI capability is emulated. + */ + if (bar_access(coff) || msicap_access(sc, coff)) + return (-1); + +#ifdef LEGACY_SUPPORT + /* + * Emulate PCIR_CAP_PTR if this device does not support MSI capability + * natively. + */ + if (sc->psc_msi.emulated) { + if (coff >= PCIR_CAP_PTR && coff < PCIR_CAP_PTR + 4) + return (-1); + } +#endif + + /* Everything else just read from the device's config space */ + *rv = read_config(&sc->psc_sel, coff, bytes); + + return (0); +} + +static int +passthru_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int coff, int bytes, uint32_t val) +{ + int error, msix_table_entries, i; + struct passthru_softc *sc; + + sc = pi->pi_arg; + + /* + * PCI BARs are emulated + */ + if (bar_access(coff)) + return (-1); + + /* + * MSI capability is emulated + */ + if (msicap_access(sc, coff)) { + msicap_cfgwrite(pi, sc->psc_msi.capoff, coff, bytes, val); + + error = vm_setup_pptdev_msi(ctx, vcpu, sc->psc_sel.pc_bus, + sc->psc_sel.pc_dev, sc->psc_sel.pc_func, + pi->pi_msi.addr, pi->pi_msi.msg_data, + pi->pi_msi.maxmsgnum); + if (error != 0) + err(1, "vm_setup_pptdev_msi"); + return (0); + } + + if (msixcap_access(sc, coff)) { + msixcap_cfgwrite(pi, sc->psc_msix.capoff, coff, bytes, val); + if (pi->pi_msix.enabled) { + msix_table_entries = pi->pi_msix.table_count; + for (i = 0; i < msix_table_entries; i++) { + error = vm_setup_pptdev_msix(ctx, vcpu, + sc->psc_sel.pc_bus, sc->psc_sel.pc_dev, + sc->psc_sel.pc_func, i, + pi->pi_msix.table[i].addr, + pi->pi_msix.table[i].msg_data, + pi->pi_msix.table[i].vector_control); + + if (error) + err(1, "vm_setup_pptdev_msix"); + } + } + return (0); + } + +#ifdef LEGACY_SUPPORT + /* + * If this device does not support MSI natively then we cannot let + * the guest disable legacy interrupts from the device. It is the + * legacy interrupt that is triggering the virtual MSI to the guest. 
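passthru_cfgread above (and the cfgwrite path it pairs with) uses the pci_emul return convention: -1 means "emulated here, let the generic PCI code satisfy the access from its shadow config space", 0 means "completed against the real device". BARs and the MSI/MSI-X capabilities therefore stay virtual while everything else reaches hardware. Distilled:

static int
cfgread_sketch(struct passthru_softc *sc, int coff, int bytes, uint32_t *rv)
{
	if (bar_access(coff) || msicap_access(sc, coff))
		return (-1);	/* emulated: pci_emul answers */

	*rv = read_config(&sc->psc_sel, coff, bytes);
	return (0);		/* satisfied from the physical device */
}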
+ */ + if (sc->psc_msi.emulated && pci_msi_enabled(pi)) { + if (coff == PCIR_COMMAND && bytes == 2) + val &= ~PCIM_CMD_INTxDIS; + } +#endif + + write_config(&sc->psc_sel, coff, bytes, val); + + return (0); +} + +static void +passthru_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size, uint64_t value) +{ + struct passthru_softc *sc; + struct iodev_pio_req pio; + + sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi)) { + msix_table_write(ctx, vcpu, sc, offset, size, value); + } else { + assert(pi->pi_bar[baridx].type == PCIBAR_IO); + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_WRITE; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = value; + + (void)ioctl(iofd, IODEV_PIO, &pio); + } +} + +static uint64_t +passthru_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct passthru_softc *sc; + struct iodev_pio_req pio; + uint64_t val; + + sc = pi->pi_arg; + + if (baridx == pci_msix_table_bar(pi)) { + val = msix_table_read(sc, offset, size); + } else { + assert(pi->pi_bar[baridx].type == PCIBAR_IO); + bzero(&pio, sizeof(struct iodev_pio_req)); + pio.access = IODEV_PIO_READ; + pio.port = sc->psc_bar[baridx].addr + offset; + pio.width = size; + pio.val = 0; + + (void)ioctl(iofd, IODEV_PIO, &pio); + + val = pio.val; + } + + return (val); +} + +struct pci_devemu passthru = { + .pe_emu = "passthru", + .pe_init = passthru_init, + .pe_cfgwrite = passthru_cfgwrite, + .pe_cfgread = passthru_cfgread, + .pe_barwrite = passthru_write, + .pe_barread = passthru_read, +}; +PCI_EMUL_SET(passthru); diff --git a/usr/src/cmd/bhyve/pci_uart.c b/usr/src/cmd/bhyve/pci_uart.c new file mode 100644 index 0000000000..093d0cb361 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_uart.c @@ -0,0 +1,121 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2012 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> + +#include <stdio.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "uart_emul.h" + +/* + * Pick a PCI vid/did of a chip with a single uart at + * BAR0, that most versions of FreeBSD can understand: + * Siig CyberSerial 1-port. + */ +#define COM_VENDOR 0x131f +#define COM_DEV 0x2000 + +static void +pci_uart_intr_assert(void *arg) +{ + struct pci_devinst *pi = arg; + + pci_lintr_assert(pi); +} + +static void +pci_uart_intr_deassert(void *arg) +{ + struct pci_devinst *pi = arg; + + pci_lintr_deassert(pi); +} + +static void +pci_uart_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + + assert(baridx == 0); + assert(size == 1); + + uart_write(pi->pi_arg, offset, value); +} + +uint64_t +pci_uart_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size) +{ + uint8_t val; + + assert(baridx == 0); + assert(size == 1); + + val = uart_read(pi->pi_arg, offset); + return (val); +} + +static int +pci_uart_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct uart_softc *sc; + + pci_emul_alloc_bar(pi, 0, PCIBAR_IO, UART_IO_BAR_SIZE); + pci_lintr_request(pi); + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, COM_DEV); + pci_set_cfgdata16(pi, PCIR_VENDOR, COM_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); + + sc = uart_init(pci_uart_intr_assert, pci_uart_intr_deassert, pi); + pi->pi_arg = sc; + + if (uart_set_backend(sc, opts) != 0) { + fprintf(stderr, "Unable to initialize backend '%s' for " + "pci uart at %d:%d\n", opts, pi->pi_slot, pi->pi_func); + return (-1); + } + + return (0); +} + +struct pci_devemu pci_de_com = { + .pe_emu = "uart", + .pe_init = pci_uart_init, + .pe_barwrite = pci_uart_write, + .pe_barread = pci_uart_read +}; +PCI_EMUL_SET(pci_de_com); diff --git a/usr/src/cmd/bhyve/pci_virtio_block.c b/usr/src/cmd/bhyve/pci_virtio_block.c index 65e2d9c57d..5a7ecbfe9e 100644 --- a/usr/src/cmd/bhyve/pci_virtio_block.c +++ b/usr/src/cmd/bhyve/pci_virtio_block.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02:47:09Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,10 +39,11 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02:47:09Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/linker_set.h> @@ -63,24 +67,23 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02: #include "bhyverun.h" #include "pci_emul.h" #include "virtio.h" +#include "block_if.h" -#define VTBLK_RINGSZ 64 +#define VTBLK_RINGSZ 128 -#ifdef __FreeBSD__ -#define VTBLK_MAXSEGS 32 -#else -#define VTBLK_MAXSEGS 16 -#endif +_Static_assert(VTBLK_RINGSZ <= BLOCKIF_RING_MAX, "Each ring entry must be able to queue a request"); #define VTBLK_S_OK 0 #define VTBLK_S_IOERR 1 #define VTBLK_S_UNSUPP 2 -#define VTBLK_BLK_ID_BYTES 20 +#define VTBLK_BLK_ID_BYTES 20 + 1 /* Capability bits */ #define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */ -#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ +#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */ +#define VTBLK_F_FLUSH (1 << 9) /* Cache flush support */ +#define VTBLK_F_TOPOLOGY (1 << 10) /* Optimal I/O alignment */ /* * Host capabilities @@ -88,6 +91,8 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02: #define VTBLK_S_HOSTCAPS \ ( VTBLK_F_SEG_MAX | \ VTBLK_F_BLK_SIZE | \ + VTBLK_F_FLUSH | \ + VTBLK_F_TOPOLOGY | \ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */ /* @@ -97,11 +102,19 @@ struct vtblk_config { uint64_t vbc_capacity; uint32_t vbc_size_max; uint32_t vbc_seg_max; - uint16_t vbc_geom_c; - uint8_t vbc_geom_h; - uint8_t vbc_geom_s; + struct { + uint16_t cylinders; + uint8_t heads; + uint8_t sectors; + } vbc_geometry; uint32_t vbc_blk_size; - uint32_t vbc_sectors_max; + struct { + uint8_t physical_block_exp; + uint8_t alignment_offset; + uint16_t min_io_size; + uint32_t opt_io_size; + } vbc_topology; + uint8_t vbc_writeback; } __packed; /* @@ -110,9 +123,11 @@ struct vtblk_config { struct virtio_blk_hdr { #define VBH_OP_READ 0 #define VBH_OP_WRITE 1 -#define VBH_OP_IDENT 8 +#define VBH_OP_FLUSH 4 +#define VBH_OP_FLUSH_OUT 5 +#define VBH_OP_IDENT 8 #define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */ - uint32_t vbh_type; + uint32_t vbh_type; uint32_t vbh_ioprio; uint64_t vbh_sector; } __packed; @@ -124,6 +139,13 @@ static int pci_vtblk_debug; #define DPRINTF(params) if (pci_vtblk_debug) printf params #define WPRINTF(params) printf params +struct pci_vtblk_ioreq { + struct blockif_req io_req; + struct pci_vtblk_softc *io_sc; + uint8_t *io_status; + uint16_t io_idx; +}; + /* * Per-device softc */ @@ -131,24 +153,36 @@ struct pci_vtblk_softc { struct virtio_softc vbsc_vs; pthread_mutex_t vsc_mtx; struct vqueue_info vbsc_vq; - int vbsc_fd; - struct vtblk_config vbsc_cfg; + struct vtblk_config vbsc_cfg; + struct blockif_ctxt *bc; +#ifndef __FreeBSD__ + int vbsc_wce; +#endif char vbsc_ident[VTBLK_BLK_ID_BYTES]; + struct pci_vtblk_ioreq vbsc_ios[VTBLK_RINGSZ]; }; static void pci_vtblk_reset(void *); static void pci_vtblk_notify(void *, struct vqueue_info *); static int pci_vtblk_cfgread(void *, int, int, uint32_t *); static int pci_vtblk_cfgwrite(void *, int, int, uint32_t); +#ifndef __FreeBSD__ +static void pci_vtblk_apply_feats(void *, uint64_t); +#endif static struct virtio_consts vtblk_vi_consts = { "vtblk", /* our name */ 1, /* we support 1 virtqueue */ - sizeof(struct vtblk_config), /* config reg size */ + sizeof(struct vtblk_config), /* config reg size */ pci_vtblk_reset, /* reset */ pci_vtblk_notify, /* device-wide qnotify */ pci_vtblk_cfgread, /* read PCI config */ 
pci_vtblk_cfgwrite, /* write PCI config */ +#ifndef __FreeBSD__ + pci_vtblk_apply_feats, /* apply negotiated features */ +#else + NULL, /* apply negotiated features */ +#endif VTBLK_S_HOSTCAPS, /* our capabilities */ }; @@ -159,22 +193,58 @@ pci_vtblk_reset(void *vsc) DPRINTF(("vtblk: device reset requested !\n")); vi_reset_dev(&sc->vbsc_vs); +#ifndef __FreeBSD__ + /* Disable write cache until FLUSH feature is negotiated */ + (void) blockif_set_wce(sc->bc, 0); + sc->vbsc_wce = 0; +#endif +} + +static void +pci_vtblk_done_locked(struct pci_vtblk_ioreq *io, int err) +{ + struct pci_vtblk_softc *sc = io->io_sc; + + /* convert errno into a virtio block error return */ + if (err == EOPNOTSUPP || err == ENOSYS) + *io->io_status = VTBLK_S_UNSUPP; + else if (err != 0) + *io->io_status = VTBLK_S_IOERR; + else + *io->io_status = VTBLK_S_OK; + + /* + * Return the descriptor back to the host. + * We wrote 1 byte (our status) to host. + */ + vq_relchain(&sc->vbsc_vq, io->io_idx, 1); + vq_endchains(&sc->vbsc_vq, 0); +} + +static void +pci_vtblk_done(struct blockif_req *br, int err) +{ + struct pci_vtblk_ioreq *io = br->br_param; + struct pci_vtblk_softc *sc = io->io_sc; + + pthread_mutex_lock(&sc->vsc_mtx); + pci_vtblk_done_locked(io, err); + pthread_mutex_unlock(&sc->vsc_mtx); } static void pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) { struct virtio_blk_hdr *vbh; - uint8_t *status; + struct pci_vtblk_ioreq *io; int i, n; int err; - int iolen; + ssize_t iolen; int writeop, type; - off_t offset; - struct iovec iov[VTBLK_MAXSEGS + 2]; - uint16_t flags[VTBLK_MAXSEGS + 2]; + struct iovec iov[BLOCKIF_IOV_MAX + 2]; + uint16_t idx, flags[BLOCKIF_IOV_MAX + 2]; - n = vq_getchain(vq, iov, VTBLK_MAXSEGS + 2, flags); + n = vq_getchain(vq, &idx, iov, BLOCKIF_IOV_MAX + 2, flags); /* * The first descriptor will be the read-only fixed header, @@ -184,13 +254,16 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) * XXX - note - this fails on crash dump, which does a * VIRTIO_BLK_T_FLUSH with a zero transfer length */ - assert(n >= 2 && n <= VTBLK_MAXSEGS + 2); + assert(n >= 2 && n <= BLOCKIF_IOV_MAX + 2); + io = &sc->vbsc_ios[idx]; assert((flags[0] & VRING_DESC_F_WRITE) == 0); assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr)); - vbh = (struct virtio_block_hdr *)iov[0].iov_base; - - status = iov[--n].iov_base; + vbh = (struct virtio_blk_hdr *)iov[0].iov_base; + memcpy(&io->io_req.br_iov, &iov[1], sizeof(struct iovec) * (n - 2)); + io->io_req.br_iovcnt = n - 2; + io->io_req.br_offset = vbh->vbh_sector * DEV_BSIZE; + io->io_status = (uint8_t *)iov[--n].iov_base; assert(iov[n].iov_len == 1); assert(flags[n] & VRING_DESC_F_WRITE); @@ -202,8 +275,6 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) type = vbh->vbh_type & ~VBH_FLAG_BARRIER; writeop = (type == VBH_OP_WRITE); - offset = vbh->vbh_sector * DEV_BSIZE; - iolen = 0; for (i = 1; i < n; i++) { /* @@ -215,42 +286,36 @@ pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq) assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop); iolen += iov[i].iov_len; } + io->io_req.br_resid = iolen; - DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r", - writeop ? "write" : "read/ident", iolen, i - 1, offset)); + DPRINTF(("virtio-block: %s op, %zd bytes, %d segs, offset %ld\n\r", + writeop ? 
"write" : "read/ident", iolen, i - 1, + io->io_req.br_offset)); switch (type) { + case VBH_OP_READ: + err = blockif_read(sc->bc, &io->io_req); + break; case VBH_OP_WRITE: - err = pwritev(sc->vbsc_fd, iov + 1, i - 1, offset); + err = blockif_write(sc->bc, &io->io_req); break; - case VBH_OP_READ: - err = preadv(sc->vbsc_fd, iov + 1, i - 1, offset); + case VBH_OP_FLUSH: + case VBH_OP_FLUSH_OUT: + err = blockif_flush(sc->bc, &io->io_req); break; case VBH_OP_IDENT: /* Assume a single buffer */ - strlcpy(iov[1].iov_base, sc->vbsc_ident, + /* S/n equal to buffer is not zero-terminated. */ + memset(iov[1].iov_base, 0, iov[1].iov_len); + strncpy(iov[1].iov_base, sc->vbsc_ident, MIN(iov[1].iov_len, sizeof(sc->vbsc_ident))); - err = 0; - break; + pci_vtblk_done_locked(io, 0); + return; default: - err = -ENOSYS; - break; + pci_vtblk_done_locked(io, EOPNOTSUPP); + return; } - - /* convert errno into a virtio block error return */ - if (err < 0) { - if (err == -ENOSYS) - *status = VTBLK_S_UNSUPP; - else - *status = VTBLK_S_IOERR; - } else - *status = VTBLK_S_OK; - - /* - * Return the descriptor back to the host. - * We wrote 1 byte (our status) to host. - */ - vq_relchain(vq, 1); + assert(err == 0); } static void @@ -258,22 +323,20 @@ pci_vtblk_notify(void *vsc, struct vqueue_info *vq) { struct pci_vtblk_softc *sc = vsc; - vq_startchains(vq); while (vq_has_descs(vq)) pci_vtblk_proc(sc, vq); - vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ } static int pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) { - struct stat sbuf; + char bident[sizeof("XX:X:X")]; + struct blockif_ctxt *bctxt; MD5_CTX mdctx; u_char digest[16]; struct pci_vtblk_softc *sc; - off_t size; - int fd; - int sectsz; + off_t size; + int i, sectsz, sts, sto; if (opts == NULL) { printf("virtio-block: backing device required\n"); @@ -283,40 +346,32 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) /* * The supplied backing file has to exist */ - fd = open(opts, O_RDWR); - if (fd < 0) { + snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func); + bctxt = blockif_open(opts, bident); + if (bctxt == NULL) { perror("Could not open backing file"); return (1); } - if (fstat(fd, &sbuf) < 0) { - perror("Could not stat backing file"); - close(fd); - return (1); - } - - /* - * Deal with raw devices - */ - size = sbuf.st_size; - sectsz = DEV_BSIZE; -#ifdef __FreeBSD__ - if (S_ISCHR(sbuf.st_mode)) { - if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 || - ioctl(fd, DIOCGSECTORSIZE, §sz)) { - perror("Could not fetch dev blk/sector size"); - close(fd); - return (1); - } - assert(size != 0); - assert(sectsz != 0); - } -#endif + size = blockif_size(bctxt); + sectsz = blockif_sectsz(bctxt); + blockif_psectsz(bctxt, &sts, &sto); sc = calloc(1, sizeof(struct pci_vtblk_softc)); + sc->bc = bctxt; + for (i = 0; i < VTBLK_RINGSZ; i++) { + struct pci_vtblk_ioreq *io = &sc->vbsc_ios[i]; + io->io_req.br_callback = pci_vtblk_done; + io->io_req.br_param = io; + io->io_sc = sc; + io->io_idx = i; + } - /* record fd of storage device/file */ - sc->vbsc_fd = fd; +#ifndef __FreeBSD__ + /* Disable write cache until FLUSH feature is negotiated */ + (void) blockif_set_wce(sc->bc, 0); + sc->vbsc_wce = 0; +#endif pthread_mutex_init(&sc->vsc_mtx, NULL); @@ -333,19 +388,34 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) */ MD5Init(&mdctx); MD5Update(&mdctx, opts, strlen(opts)); - MD5Final(digest, &mdctx); - sprintf(sc->vbsc_ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X", + MD5Final(digest, &mdctx); + 
snprintf(sc->vbsc_ident, VTBLK_BLK_ID_BYTES, + "BHYVE-%02X%02X-%02X%02X-%02X%02X", digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]); /* setup virtio block config space */ sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */ - sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS; - sc->vbsc_cfg.vbc_blk_size = sectsz; sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */ - sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */ - sc->vbsc_cfg.vbc_geom_h = 0; - sc->vbsc_cfg.vbc_geom_s = 0; - sc->vbsc_cfg.vbc_sectors_max = 0; + + /* + * If Linux is presented with a seg_max greater than the virtio queue + * size, it can stumble into situations where it violates its own + * invariants and panics. For safety, we keep seg_max clamped, paying + * heed to the two extra descriptors needed for the header and status + * of a request. + */ + sc->vbsc_cfg.vbc_seg_max = MIN(VTBLK_RINGSZ - 2, BLOCKIF_IOV_MAX); + sc->vbsc_cfg.vbc_geometry.cylinders = 0; /* no geometry */ + sc->vbsc_cfg.vbc_geometry.heads = 0; + sc->vbsc_cfg.vbc_geometry.sectors = 0; + sc->vbsc_cfg.vbc_blk_size = sectsz; + sc->vbsc_cfg.vbc_topology.physical_block_exp = + (sts > sectsz) ? (ffsll(sts / sectsz) - 1) : 0; + sc->vbsc_cfg.vbc_topology.alignment_offset = + (sto != 0) ? ((sts - sto) / sectsz) : 0; + sc->vbsc_cfg.vbc_topology.min_io_size = 0; + sc->vbsc_cfg.vbc_topology.opt_io_size = 0; + sc->vbsc_cfg.vbc_writeback = 0; /* * Should we move some of this into virtio.c? Could @@ -356,9 +426,13 @@ pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); - if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) + if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix())) { + blockif_close(sc->bc); + free(sc); return (1); + } vi_set_io_bar(&sc->vbsc_vs, 0); return (0); } @@ -383,6 +457,20 @@ pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval) return (0); } +#ifndef __FreeBSD__ +void +pci_vtblk_apply_feats(void *vsc, uint64_t caps) +{ + struct pci_vtblk_softc *sc = vsc; + const int wce_next = ((caps & VTBLK_F_FLUSH) != 0) ? 1 : 0; + + if (sc->vbsc_wce != wce_next) { + (void) blockif_set_wce(sc->bc, wce_next); + sc->vbsc_wce = wce_next; + } +} +#endif /* __FreeBSD__ */ + struct pci_devemu pci_de_vblk = { .pe_emu = "virtio-blk", .pe_init = pci_vtblk_init, diff --git a/usr/src/cmd/bhyve/pci_virtio_console.c b/usr/src/cmd/bhyve/pci_virtio_console.c new file mode 100644 index 0000000000..90437662df --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_console.c @@ -0,0 +1,701 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 iXsystems Inc. + * All rights reserved. + * + * This software was developed by Jakub Klama <jceel@FreeBSD.org> + * under sponsorship from iXsystems Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/linker_set.h> +#include <sys/uio.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <libgen.h> +#include <sysexits.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" +#include "mevent.h" +#include "sockstream.h" + +#define VTCON_RINGSZ 64 +#define VTCON_MAXPORTS 16 +#define VTCON_MAXQ (VTCON_MAXPORTS * 2 + 2) + +#define VTCON_DEVICE_READY 0 +#define VTCON_DEVICE_ADD 1 +#define VTCON_DEVICE_REMOVE 2 +#define VTCON_PORT_READY 3 +#define VTCON_CONSOLE_PORT 4 +#define VTCON_CONSOLE_RESIZE 5 +#define VTCON_PORT_OPEN 6 +#define VTCON_PORT_NAME 7 + +#define VTCON_F_SIZE 0 +#define VTCON_F_MULTIPORT 1 +#define VTCON_F_EMERG_WRITE 2 +#define VTCON_S_HOSTCAPS \ + (VTCON_F_SIZE | VTCON_F_MULTIPORT | VTCON_F_EMERG_WRITE) + +static int pci_vtcon_debug; +#define DPRINTF(params) if (pci_vtcon_debug) printf params +#define WPRINTF(params) printf params + +struct pci_vtcon_softc; +struct pci_vtcon_port; +struct pci_vtcon_config; +typedef void (pci_vtcon_cb_t)(struct pci_vtcon_port *, void *, struct iovec *, + int); + +struct pci_vtcon_port { + struct pci_vtcon_softc * vsp_sc; + int vsp_id; + const char * vsp_name; + bool vsp_enabled; + bool vsp_console; + bool vsp_rx_ready; + bool vsp_open; + int vsp_rxq; + int vsp_txq; + void * vsp_arg; + pci_vtcon_cb_t * vsp_cb; +}; + +struct pci_vtcon_sock +{ + struct pci_vtcon_port * vss_port; + const char * vss_path; + struct mevent * vss_server_evp; + struct mevent * vss_conn_evp; + int vss_server_fd; + int vss_conn_fd; + bool vss_open; +}; + +struct pci_vtcon_softc { + struct virtio_softc vsc_vs; + struct vqueue_info vsc_queues[VTCON_MAXQ]; + pthread_mutex_t vsc_mtx; + uint64_t vsc_cfg; + uint64_t vsc_features; + char * vsc_rootdir; + int vsc_kq; + int vsc_nports; + bool vsc_ready; + struct pci_vtcon_port vsc_control_port; + struct pci_vtcon_port vsc_ports[VTCON_MAXPORTS]; + struct pci_vtcon_config *vsc_config; +}; + +struct pci_vtcon_config { + uint16_t cols; + uint16_t rows; + uint32_t max_nr_ports; + uint32_t emerg_wr; +} __attribute__((packed)); + +struct pci_vtcon_control { + uint32_t id; + uint16_t event; + uint16_t value; +} __attribute__((packed)); + +struct pci_vtcon_console_resize { + uint16_t cols; + uint16_t rows; +} __attribute__((packed)); + +static void 
pci_vtcon_reset(void *); +static void pci_vtcon_notify_rx(void *, struct vqueue_info *); +static void pci_vtcon_notify_tx(void *, struct vqueue_info *); +static int pci_vtcon_cfgread(void *, int, int, uint32_t *); +static int pci_vtcon_cfgwrite(void *, int, int, uint32_t); +static void pci_vtcon_neg_features(void *, uint64_t); +static void pci_vtcon_sock_accept(int, enum ev_type, void *); +static void pci_vtcon_sock_rx(int, enum ev_type, void *); +static void pci_vtcon_sock_tx(struct pci_vtcon_port *, void *, struct iovec *, + int); +static void pci_vtcon_control_send(struct pci_vtcon_softc *, + struct pci_vtcon_control *, const void *, size_t); +static void pci_vtcon_announce_port(struct pci_vtcon_port *); +static void pci_vtcon_open_port(struct pci_vtcon_port *, bool); + +static struct virtio_consts vtcon_vi_consts = { + "vtcon", /* our name */ + VTCON_MAXQ, /* we support VTCON_MAXQ virtqueues */ + sizeof(struct pci_vtcon_config), /* config reg size */ + pci_vtcon_reset, /* reset */ + NULL, /* device-wide qnotify */ + pci_vtcon_cfgread, /* read virtio config */ + pci_vtcon_cfgwrite, /* write virtio config */ + pci_vtcon_neg_features, /* apply negotiated features */ + VTCON_S_HOSTCAPS, /* our capabilities */ +}; + + +static void +pci_vtcon_reset(void *vsc) +{ + struct pci_vtcon_softc *sc; + + sc = vsc; + + DPRINTF(("vtcon: device reset requested!\n")); + vi_reset_dev(&sc->vsc_vs); +} + +static void +pci_vtcon_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtcon_softc *sc = vsc; + + sc->vsc_features = negotiated_features; +} + +static int +pci_vtcon_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtcon_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)sc->vsc_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static int +pci_vtcon_cfgwrite(void *vsc, int offset, int size, uint32_t val) +{ + + return (0); +} + +static inline struct pci_vtcon_port * +pci_vtcon_vq_to_port(struct pci_vtcon_softc *sc, struct vqueue_info *vq) +{ + uint16_t num = vq->vq_num; + + if (num == 0 || num == 1) + return (&sc->vsc_ports[0]); + + if (num == 2 || num == 3) + return (&sc->vsc_control_port); + + return (&sc->vsc_ports[(num / 2) - 1]); +} + +static inline struct vqueue_info * +pci_vtcon_port_to_vq(struct pci_vtcon_port *port, bool tx_queue) +{ + int qnum; + + qnum = tx_queue ? port->vsp_txq : port->vsp_rxq; + return (&port->vsp_sc->vsc_queues[qnum]); +} + +static struct pci_vtcon_port * +pci_vtcon_port_add(struct pci_vtcon_softc *sc, const char *name, + pci_vtcon_cb_t *cb, void *arg) +{ + struct pci_vtcon_port *port; + + if (sc->vsc_nports == VTCON_MAXPORTS) { + errno = EBUSY; + return (NULL); + } + + port = &sc->vsc_ports[sc->vsc_nports++]; + port->vsp_id = sc->vsc_nports - 1; + port->vsp_sc = sc; + port->vsp_name = name; + port->vsp_cb = cb; + port->vsp_arg = arg; + + if (port->vsp_id == 0) { + /* port0 */ + port->vsp_txq = 0; + port->vsp_rxq = 1; + } else { + port->vsp_txq = sc->vsc_nports * 2; + port->vsp_rxq = port->vsp_txq + 1; + } + + port->vsp_enabled = true; + return (port); +} + +static int +pci_vtcon_sock_add(struct pci_vtcon_softc *sc, const char *name, + const char *path) +{ + struct pci_vtcon_sock *sock; +#ifdef __FreeBSD__ + struct sockaddr_un sun; + char *pathcopy; +#else + /* Our compiler #defines 'sun' as '1'. Awesome. 
*/ + struct sockaddr_un addr; +#endif + int s = -1, fd = -1, error = 0; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + sock = calloc(1, sizeof(struct pci_vtcon_sock)); + if (sock == NULL) { + error = -1; + goto out; + } + + s = socket(AF_UNIX, SOCK_STREAM, 0); + if (s < 0) { + error = -1; + goto out; + } + +#ifdef __FreeBSD__ + pathcopy = strdup(path); + if (pathcopy == NULL) { + error = -1; + goto out; + } + + fd = open(dirname(pathcopy), O_RDONLY | O_DIRECTORY); + if (fd < 0) { + free(pathcopy); + error = -1; + goto out; + } + + sun.sun_family = AF_UNIX; + sun.sun_len = sizeof(struct sockaddr_un); + strcpy(pathcopy, path); + strlcpy(sun.sun_path, basename(pathcopy), sizeof(sun.sun_path)); + free(pathcopy); + + if (bindat(fd, s, (struct sockaddr *)&sun, sun.sun_len) < 0) { + error = -1; + goto out; + } +#else /* __FreeBSD__ */ + /* Do a simple bind rather than the FreeBSD bindat() */ + addr.sun_family = AF_UNIX; + (void) strlcpy(addr.sun_path, path, sizeof (addr.sun_path)); + if (bind(s, (struct sockaddr *)&addr, sizeof (addr)) < 0) { + error = -1; + goto out; + } +#endif /* __FreeBSD__ */ + + if (fcntl(s, F_SETFL, O_NONBLOCK) < 0) { + error = -1; + goto out; + } + + if (listen(s, 1) < 0) { + error = -1; + goto out; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(s, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + sock->vss_port = pci_vtcon_port_add(sc, name, pci_vtcon_sock_tx, sock); + if (sock->vss_port == NULL) { + error = -1; + goto out; + } + + sock->vss_open = false; + sock->vss_conn_fd = -1; + sock->vss_server_fd = s; + sock->vss_server_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_accept, + sock); + + if (sock->vss_server_evp == NULL) { + error = -1; + goto out; + } + +out: + if (fd != -1) + close(fd); + + if (error != 0 && s != -1) + close(s); + + return (error); +} + +static void +pci_vtcon_sock_accept(int fd __unused, enum ev_type t __unused, void *arg) +{ + struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; + int s; + + s = accept(sock->vss_server_fd, NULL, NULL); + if (s < 0) + return; + + if (sock->vss_open) { + close(s); + return; + } + + sock->vss_open = true; + sock->vss_conn_fd = s; + sock->vss_conn_evp = mevent_add(s, EVF_READ, pci_vtcon_sock_rx, sock); + + pci_vtcon_open_port(sock->vss_port, true); +} + +static void +pci_vtcon_sock_rx(int fd __unused, enum ev_type t __unused, void *arg) +{ + struct pci_vtcon_port *port; + struct pci_vtcon_sock *sock = (struct pci_vtcon_sock *)arg; + struct vqueue_info *vq; + struct iovec iov; + static char dummybuf[2048]; + int len, n; + uint16_t idx; + + port = sock->vss_port; + vq = pci_vtcon_port_to_vq(port, true); + + if (!sock->vss_open || !port->vsp_rx_ready) { + len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); + if (len == 0) + goto close; + + return; + } + + if (!vq_has_descs(vq)) { + len = read(sock->vss_conn_fd, dummybuf, sizeof(dummybuf)); + vq_endchains(vq, 1); + if (len == 0) + goto close; + + return; + } + + do { + n = vq_getchain(vq, &idx, &iov, 1, NULL); + len = readv(sock->vss_conn_fd, &iov, n); + + if (len == 0 || (len < 0 && errno == EWOULDBLOCK)) { + vq_retchain(vq); + vq_endchains(vq, 0); + if (len == 0) + goto close; + + return; + } + + vq_relchain(vq, idx, len); + } while (vq_has_descs(vq)); + + vq_endchains(vq, 1); + +close: + mevent_delete_close(sock->vss_conn_evp); + sock->vss_conn_fd = -1; + sock->vss_open = false; +} + +static void +pci_vtcon_sock_tx(struct
pci_vtcon_port *port, void *arg, struct iovec *iov, + int niov) +{ + struct pci_vtcon_sock *sock; +#ifdef __FreeBSD__ + int i, ret; +#else + int i, ret = 0; +#endif + + sock = (struct pci_vtcon_sock *)arg; + + if (sock->vss_conn_fd == -1) + return; + + for (i = 0; i < niov; i++) { + ret = stream_write(sock->vss_conn_fd, iov[i].iov_base, + iov[i].iov_len); + if (ret <= 0) + break; + } + + if (ret <= 0) { + mevent_delete_close(sock->vss_conn_evp); + sock->vss_conn_fd = -1; + sock->vss_open = false; + } +} + +static void +pci_vtcon_control_tx(struct pci_vtcon_port *port, void *arg, struct iovec *iov, + int niov) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *tmp; + struct pci_vtcon_control resp, *ctrl; + int i; + + assert(niov == 1); + + sc = port->vsp_sc; + ctrl = (struct pci_vtcon_control *)iov->iov_base; + + switch (ctrl->event) { + case VTCON_DEVICE_READY: + sc->vsc_ready = true; + /* set port ready events for registered ports */ + for (i = 0; i < VTCON_MAXPORTS; i++) { + tmp = &sc->vsc_ports[i]; + if (tmp->vsp_enabled) + pci_vtcon_announce_port(tmp); + + if (tmp->vsp_open) + pci_vtcon_open_port(tmp, true); + } + break; + + case VTCON_PORT_READY: + if (ctrl->id >= sc->vsc_nports) { + WPRINTF(("VTCON_PORT_READY event for unknown port %d\n", + ctrl->id)); + return; + } + + tmp = &sc->vsc_ports[ctrl->id]; + if (tmp->vsp_console) { + resp.event = VTCON_CONSOLE_PORT; + resp.id = ctrl->id; + resp.value = 1; + pci_vtcon_control_send(sc, &resp, NULL, 0); + } + break; + } +} + +static void +pci_vtcon_announce_port(struct pci_vtcon_port *port) +{ + struct pci_vtcon_control event; + + event.id = port->vsp_id; + event.event = VTCON_DEVICE_ADD; + event.value = 1; + pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); + + event.event = VTCON_PORT_NAME; + pci_vtcon_control_send(port->vsp_sc, &event, port->vsp_name, + strlen(port->vsp_name)); +} + +static void +pci_vtcon_open_port(struct pci_vtcon_port *port, bool open) +{ + struct pci_vtcon_control event; + + if (!port->vsp_sc->vsc_ready) { + port->vsp_open = true; + return; + } + + event.id = port->vsp_id; + event.event = VTCON_PORT_OPEN; + event.value = (int)open; + pci_vtcon_control_send(port->vsp_sc, &event, NULL, 0); +} + +static void +pci_vtcon_control_send(struct pci_vtcon_softc *sc, + struct pci_vtcon_control *ctrl, const void *payload, size_t len) +{ + struct vqueue_info *vq; + struct iovec iov; + uint16_t idx; + int n; + + vq = pci_vtcon_port_to_vq(&sc->vsc_control_port, true); + + if (!vq_has_descs(vq)) + return; + + n = vq_getchain(vq, &idx, &iov, 1, NULL); + + assert(n == 1); + + memcpy(iov.iov_base, ctrl, sizeof(struct pci_vtcon_control)); + if (payload != NULL && len > 0) + memcpy(iov.iov_base + sizeof(struct pci_vtcon_control), + payload, len); + + vq_relchain(vq, idx, sizeof(struct pci_vtcon_control) + len); + vq_endchains(vq, 1); +} + + +static void +pci_vtcon_notify_tx(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *port; + struct iovec iov[1]; + uint16_t idx, n; + uint16_t flags[8]; + + sc = vsc; + port = pci_vtcon_vq_to_port(sc, vq); + + while (vq_has_descs(vq)) { + n = vq_getchain(vq, &idx, iov, 1, flags); + assert(n >= 1); + if (port != NULL) + port->vsp_cb(port, port->vsp_arg, iov, 1); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, 0); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. 
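Control-channel traffic is framed as a fixed header optionally followed by a payload, exactly as pci_vtcon_control_send() copies it into a single descriptor, and ports map pairwise onto queues (q0/q1 for port 0, q2/q3 for the control port, q4/q5 for port 1, and so on, per pci_vtcon_vq_to_port() above). A sketch of assembling one such message in a scratch buffer; the helper is hypothetical:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

struct ctrl_hdr {               /* mirrors struct pci_vtcon_control */
        uint32_t id;            /* port id */
        uint16_t event;         /* e.g. VTCON_PORT_NAME */
        uint16_t value;
} __attribute__((packed));

/* Returns the total length: header plus optional payload. */
static size_t
build_ctrl(uint8_t *buf, uint32_t id, uint16_t event,
    const void *payload, size_t len)
{
        struct ctrl_hdr h = { .id = id, .event = event, .value = 1 };

        memcpy(buf, &h, sizeof (h));
        if (payload != NULL && len > 0)
                memcpy(buf + sizeof (h), payload, len);
        return (sizeof (h) + len);
}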
*/ +} + +static void +pci_vtcon_notify_rx(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtcon_softc *sc; + struct pci_vtcon_port *port; + + sc = vsc; + port = pci_vtcon_vq_to_port(sc, vq); + + if (!port->vsp_rx_ready) { + port->vsp_rx_ready = 1; + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; + } +} + +static int +pci_vtcon_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtcon_softc *sc; + char *portname = NULL; + char *portpath = NULL; + char *opt; + int i; + + sc = calloc(1, sizeof(struct pci_vtcon_softc)); + sc->vsc_config = calloc(1, sizeof(struct pci_vtcon_config)); + sc->vsc_config->max_nr_ports = VTCON_MAXPORTS; + sc->vsc_config->cols = 80; + sc->vsc_config->rows = 25; + + vi_softc_linkup(&sc->vsc_vs, &vtcon_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + + for (i = 0; i < VTCON_MAXQ; i++) { + sc->vsc_queues[i].vq_qsize = VTCON_RINGSZ; + sc->vsc_queues[i].vq_notify = i % 2 == 0 + ? pci_vtcon_notify_rx + : pci_vtcon_notify_tx; + } + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_CONSOLE); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SIMPLECOMM); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_CONSOLE); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vsc_vs, 0); + + /* create control port */ + sc->vsc_control_port.vsp_sc = sc; + sc->vsc_control_port.vsp_txq = 2; + sc->vsc_control_port.vsp_rxq = 3; + sc->vsc_control_port.vsp_cb = pci_vtcon_control_tx; + sc->vsc_control_port.vsp_enabled = true; + + while ((opt = strsep(&opts, ",")) != NULL) { + portname = strsep(&opt, "="); + portpath = opt; + + /* create port */ + if (pci_vtcon_sock_add(sc, portname, portpath) < 0) { + fprintf(stderr, "cannot create port %s: %s\n", + portname, strerror(errno)); + return (1); + } + } + + return (0); +} + +struct pci_devemu pci_de_vcon = { + .pe_emu = "virtio-console", + .pe_init = pci_vtcon_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vcon); diff --git a/usr/src/cmd/bhyve/pci_virtio_net.c b/usr/src/cmd/bhyve/pci_virtio_net.c index e58bdd0115..aa188a3e59 100644 --- a/usr/src/cmd/bhyve/pci_virtio_net.c +++ b/usr/src/cmd/bhyve/pci_virtio_net.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37:33Z grehan $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,18 +38,33 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37:33Z grehan $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif #include <sys/linker_set.h> #include <sys/select.h> #include <sys/uio.h> #include <sys/ioctl.h> +#include <machine/atomic.h> #include <net/ethernet.h> +#ifdef __FreeBSD__ +#ifndef NETMAP_WITH_LIBS +#define NETMAP_WITH_LIBS +#endif +#include <net/netmap_user.h> +#endif +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> #include <errno.h> #include <fcntl.h> #include <stdio.h> @@ -60,21 +77,22 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37 #include <md5.h> #include <pthread.h> #include <pthread_np.h> -#ifndef __FreeBSD__ +#include <sysexits.h> +#ifndef __FreeBSD__ #include <poll.h> #include <libdlpi.h> #endif #include "bhyverun.h" #include "pci_emul.h" -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ #include "mevent.h" #endif #include "virtio.h" #define VTNET_RINGSZ 1024 -#define VTNET_MAXSEGS 32 +#define VTNET_MAXSEGS 256 /* * Host capabilities. Note that we only offer a few of these. @@ -101,7 +119,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37 #define VTNET_S_HOSTCAPS \ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \ - VIRTIO_F_NOTIFY_ON_EMPTY) + VIRTIO_F_NOTIFY_ON_EMPTY | VIRTIO_RING_F_INDIRECT_DESC) /* * PCI config-space "registers" @@ -155,25 +173,35 @@ struct pci_vtnet_softc { dlpi_handle_t vsc_dhp; int vsc_dlpifd; #endif + struct nm_desc *vsc_nmd; + int vsc_rx_ready; volatile int resetting; /* set and checked outside lock */ - uint32_t vsc_features; + uint64_t vsc_features; /* negotiated features */ + struct virtio_net_config vsc_config; pthread_mutex_t rx_mtx; int rx_in_progress; + int rx_vhdrlen; + int rx_merge; /* merged rx bufs in use */ pthread_t tx_tid; pthread_mutex_t tx_mtx; pthread_cond_t tx_cond; int tx_in_progress; + + void (*pci_vtnet_rx)(struct pci_vtnet_softc *sc); + void (*pci_vtnet_tx)(struct pci_vtnet_softc *sc, struct iovec *iov, + int iovcnt, int len); }; static void pci_vtnet_reset(void *); /* static void pci_vtnet_notify(void *, struct vqueue_info *); */ static int pci_vtnet_cfgread(void *, int, int, uint32_t *); static int pci_vtnet_cfgwrite(void *, int, int, uint32_t); +static void pci_vtnet_neg_features(void *, uint64_t); static struct virtio_consts vtnet_vi_consts = { "vtnet", /* our name */ @@ -183,6 +211,7 @@ static struct virtio_consts vtnet_vi_consts = { NULL, /* device-wide qnotify -- not used */ pci_vtnet_cfgread, /* read PCI config */ pci_vtnet_cfgwrite, /* write PCI config */ + pci_vtnet_neg_features, /* apply negotiated features */ VTNET_S_HOSTCAPS, /* our capabilities */ }; @@ -235,6 +264,8 @@ pci_vtnet_reset(void *vsc) pci_vtnet_rxwait(sc); sc->vsc_rx_ready = 0; + sc->rx_merge = 1; + sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); /* now reset rings, MSI-X vectors, and negotiated capabilities */ vi_reset_dev(&sc->vsc_vs); @@ -245,7 +276,7 @@ pci_vtnet_reset(void *vsc) /* * Called to send a buffer chain out to the tap device */ -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ static void pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, int len) @@ -275,13 +306,13 @@ pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, int i; for (i = 0; i < iovcnt; i++) { - (void) dlpi_send(sc->vsc_dhp, NULL, NULL, - iov[i].iov_base, iov[i].iov_len, NULL); + (void) 
dlpi_send(sc->vsc_dhp, NULL, 0, + iov[i].iov_base, iov[i].iov_len, NULL); } } -#endif +#endif /* __FreeBSD__ */ -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ /* * Called when there is read activity on the tap file descriptor. * Each buffer posted by the guest is assumed to be able to contain @@ -290,23 +321,43 @@ pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, * is no need for it to be per-vtnet or locked. */ static uint8_t dummybuf[2048]; -#endif +#endif /* __FreeBSD__ */ + +static __inline struct iovec * +rx_iov_trim(struct iovec *iov, int *niov, int tlen) +{ + struct iovec *riov; + + /* XXX short-cut: assume first segment is >= tlen */ + assert(iov[0].iov_len >= tlen); + + iov[0].iov_len -= tlen; + if (iov[0].iov_len == 0) { + assert(*niov > 1); + *niov -= 1; + riov = &iov[1]; + } else { + iov[0].iov_base = (void *)((uintptr_t)iov[0].iov_base + tlen); + riov = &iov[0]; + } + + return (riov); +} static void pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) { + struct iovec iov[VTNET_MAXSEGS], *riov; struct vqueue_info *vq; - struct virtio_net_rxhdr *vrx; - uint8_t *buf; + void *vrx; + int n; #ifdef __FreeBSD__ int len; -#endif - struct iovec iov[VTNET_MAXSEGS]; -#ifndef __FreeBSD__ +#else size_t len; int ret; #endif - int total_len = 0; + uint16_t idx; /* * Should never be called without a valid tap fd @@ -335,7 +386,6 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) * Check for available rx buffers */ vq = &sc->vsc_queues[VTNET_RXQ]; - vq_startchains(vq); if (!vq_has_descs(vq)) { /* * Drop the packet and try later. Interrupt on @@ -352,109 +402,267 @@ pci_vtnet_tap_rx(struct pci_vtnet_softc *sc) /* * Get descriptor chain */ - if (sc->vsc_vs.vs_negotiated_caps & VIRTIO_NET_F_MRG_RXBUF) { - assert(vq_getchain(vq, iov, 1, NULL) == 1); + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); - /* - * Get a pointer to the rx header, and use the - * data immediately following it for the packet buffer. - */ - vrx = (struct virtio_net_rxhdr *)iov[0].iov_base; - buf = (uint8_t *)(vrx + 1); - total_len = iov[0].iov_len; + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vrx = iov[0].iov_base; + riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); #ifdef __FreeBSD__ - len = read(sc->vsc_tapfd, buf, - iov[0].iov_len - sizeof(struct virtio_net_rxhdr)); - - if (len < 0 && errno == EWOULDBLOCK) { - /* - * No more packets, but still some avail ring - * entries. Interrupt if needed/appropriate. - */ - vq_endchains(vq, 0); - return; - } + len = readv(sc->vsc_tapfd, riov, n); #else - len = iov[0].iov_len - sizeof(struct virtio_net_rxhdr); - ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, buf, - &len, 0, NULL); - if (ret != DLPI_SUCCESS) { - /* - * No more packets, but still some avail ring - * entries. Interrupt if needed/appropriate. - */ - vq_endchains(vq, 0); - return; - } + len = riov[0].iov_len; + ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, + (uint8_t *)riov[0].iov_base, &len, 0, NULL); + if (ret != DLPI_SUCCESS) { + errno = EWOULDBLOCK; + len = 0; + } #endif - } else { - int i; - int num_segs; - num_segs = vq_getchain(vq, iov, - VTNET_MAXSEGS, NULL); - vrx = (struct virtio_net_rxhrd *)iov[0].iov_base; - total_len = iov[0].iov_len; - for (i = 1; i < num_segs; i++) { - buf = (uint8_t *)iov[i].iov_base; - total_len += iov[i].iov_len; + if (len <= 0 && errno == EWOULDBLOCK) { + /* + * No more packets, but still some avail ring + * entries. Interrupt if needed/appropriate. 
+ */ + vq_retchain(vq); + vq_endchains(vq, 0); + return; + } + + /* + * The only valid field in the rx packet header is the + * number of buffers if merged rx bufs were negotiated. + */ + memset(vrx, 0, sc->rx_vhdrlen); + + if (sc->rx_merge) { + struct virtio_net_rxhdr *vrxh; + + vrxh = vrx; + vrxh->vrh_bufs = 1; + } + + /* + * Release this chain and handle more chains. + */ + vq_relchain(vq, idx, len + sc->rx_vhdrlen); + } while (vq_has_descs(vq)); + + /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ + vq_endchains(vq, 1); +} + #ifdef __FreeBSD__ - len = read(sc->vsc_tapfd, buf, iov[i].iov_len); - if (len < 0 && errno == EWOULDBLOCK) { - /* - * No more packets, - * but still some avail ring entries. - * Interrupt if needed/appropriate. - */ - break; - } -#else - len = iov[i].iov_len; - ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, buf, - &len, 0, NULL); - if (ret != DLPI_SUCCESS) { - /* - * No more packets, - * but still some avail ring entries. - * Interrupt if needed/appropriate. - */ - total_len = 0; - break; - } -#endif - } - if (total_len == 0) { - vq_endchains(vq, 0); - return; - } +static __inline int +pci_vtnet_netmap_writev(struct nm_desc *nmd, struct iovec *iov, int iovcnt) +{ + int r, i; + int len = 0; + + for (r = nmd->cur_tx_ring; ; ) { + struct netmap_ring *ring = NETMAP_TXRING(nmd->nifp, r); + uint32_t cur, idx; + char *buf; + + if (nm_ring_empty(ring)) { + r++; + if (r > nmd->last_tx_ring) + r = nmd->first_tx_ring; + if (r == nmd->cur_tx_ring) + break; + continue; + } + cur = ring->cur; + idx = ring->slot[cur].buf_idx; + buf = NETMAP_BUF(ring, idx); + + for (i = 0; i < iovcnt; i++) { + if (len + iov[i].iov_len > 2048) + break; + memcpy(&buf[len], iov[i].iov_base, iov[i].iov_len); + len += iov[i].iov_len; + } + ring->slot[cur].len = len; + ring->head = ring->cur = nm_ring_next(ring, cur); + nmd->cur_tx_ring = r; + ioctl(nmd->fd, NIOCTXSYNC, NULL); + break; + } + + return (len); +} + +static __inline int +pci_vtnet_netmap_readv(struct nm_desc *nmd, struct iovec *iov, int iovcnt) +{ + int len = 0; + int i = 0; + int r; + + for (r = nmd->cur_rx_ring; ; ) { + struct netmap_ring *ring = NETMAP_RXRING(nmd->nifp, r); + uint32_t cur, idx; + char *buf; + size_t left; + + if (nm_ring_empty(ring)) { + r++; + if (r > nmd->last_rx_ring) + r = nmd->first_rx_ring; + if (r == nmd->cur_rx_ring) + break; + continue; + } + cur = ring->cur; + idx = ring->slot[cur].buf_idx; + buf = NETMAP_BUF(ring, idx); + left = ring->slot[cur].len; + + for (i = 0; i < iovcnt && left > 0; i++) { + if (iov[i].iov_len > left) + iov[i].iov_len = left; + memcpy(iov[i].iov_base, &buf[len], iov[i].iov_len); + len += iov[i].iov_len; + left -= iov[i].iov_len; + } + ring->head = ring->cur = nm_ring_next(ring, cur); + nmd->cur_rx_ring = r; + ioctl(nmd->fd, NIOCRXSYNC, NULL); + break; + } + for (; i < iovcnt; i++) + iov[i].iov_len = 0; + + return (len); +} + +/* + * Called to send a buffer chain out to the vale port + */ +static void +pci_vtnet_netmap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt, + int len) +{ + static char pad[60]; /* all zero bytes */ + + if (sc->vsc_nmd == NULL) + return; + + /* + * If the length is < 60, pad out to that and add the + * extra zero'd segment to the iov. It is guaranteed that + * there is always an extra iov available by the caller. 
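The 60-byte threshold is the Ethernet minimum frame size (64 octets) less the 4-octet FCS appended by the hardware, so anything shorter must be zero-padded by the sender. Restated with the <net/ethernet.h> constants, as a sketch equivalent to the check below:

#include <net/ethernet.h>       /* ETHER_MIN_LEN == 64, ETHER_CRC_LEN == 4 */

if (len < ETHER_MIN_LEN - ETHER_CRC_LEN) {      /* i.e. len < 60 */
        iov[iovcnt].iov_base = pad;             /* static zero bytes */
        iov[iovcnt].iov_len = ETHER_MIN_LEN - ETHER_CRC_LEN - len;
        iovcnt++;
}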
+ */ + if (len < 60) { + iov[iovcnt].iov_base = pad; + iov[iovcnt].iov_len = 60 - len; + iovcnt++; + } + (void) pci_vtnet_netmap_writev(sc->vsc_nmd, iov, iovcnt); +} + +static void +pci_vtnet_netmap_rx(struct pci_vtnet_softc *sc) +{ + struct iovec iov[VTNET_MAXSEGS], *riov; + struct vqueue_info *vq; + void *vrx; + int len, n; + uint16_t idx; + + /* + * Should never be called without a valid netmap descriptor + */ + assert(sc->vsc_nmd != NULL); + + /* + * But, will be called when the rx ring hasn't yet + * been set up or the guest is resetting the device. + */ + if (!sc->vsc_rx_ready || sc->resetting) { + /* + * Drop the packet and try later. + */ + (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); + return; + } + + /* + * Check for available rx buffers + */ + vq = &sc->vsc_queues[VTNET_RXQ]; + if (!vq_has_descs(vq)) { + /* + * Drop the packet and try later. Interrupt on + * empty, if that's negotiated. + */ + (void) nm_nextpkt(sc->vsc_nmd, (void *)dummybuf); + vq_endchains(vq, 1); + return; + } + + do { + /* + * Get descriptor chain. + */ + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); + assert(n >= 1 && n <= VTNET_MAXSEGS); + + /* + * Get a pointer to the rx header, and use the + * data immediately following it for the packet buffer. + */ + vrx = iov[0].iov_base; + riov = rx_iov_trim(iov, &n, sc->rx_vhdrlen); + + len = pci_vtnet_netmap_readv(sc->vsc_nmd, riov, n); + + if (len == 0) { + /* + * No more packets, but still some avail ring + * entries. Interrupt if needed/appropriate. + */ + vq_retchain(vq); + vq_endchains(vq, 0); + return; } /* * The only valid field in the rx packet header is the - * number of buffers, which is always 1 without TSO - * support. + * number of buffers if merged rx bufs were negotiated. */ - memset(vrx, 0, sizeof(struct virtio_net_rxhdr)); - vrx->vrh_bufs = 1; + memset(vrx, 0, sc->rx_vhdrlen); + + if (sc->rx_merge) { + struct virtio_net_rxhdr *vrxh; + + vrxh = vrx; + vrxh->vrh_bufs = 1; + } /* * Release this chain and handle more chains. */ - vq_relchain(vq, total_len); + vq_relchain(vq, idx, len + sc->rx_vhdrlen); } while (vq_has_descs(vq)); /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */ vq_endchains(vq, 1); } +#endif /* __FreeBSD__ */ -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ static void -pci_vtnet_tap_callback(int fd, enum ev_type type, void *param) +pci_vtnet_rx_callback(int fd, enum ev_type type, void *param) { struct pci_vtnet_softc *sc = param; pthread_mutex_lock(&sc->rx_mtx); sc->rx_in_progress = 1; - pci_vtnet_tap_rx(sc); + sc->pci_vtnet_rx(sc); sc->rx_in_progress = 0; pthread_mutex_unlock(&sc->rx_mtx); @@ -477,11 +685,15 @@ pci_vtnet_poll_thread(void *param) continue; } pthread_mutex_lock(&sc->vsc_mtx); + sc->rx_in_progress = 1; pci_vtnet_tap_rx(sc); + sc->rx_in_progress = 0; pthread_mutex_unlock(&sc->vsc_mtx); } + + return (NULL); } -#endif +#endif /* __FreeBSD__ */ static void pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) @@ -493,6 +705,7 @@ pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq) */ if (sc->vsc_rx_ready == 0) { sc->vsc_rx_ready = 1; + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; } } @@ -502,13 +715,14 @@ pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) struct iovec iov[VTNET_MAXSEGS + 1]; int i, n; int plen, tlen; + uint16_t idx; /* * Obtain chain of descriptors. The first one is * really the header descriptor, so we need to sum * up two lengths: packet length and transfer length. 
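Every device model in this change services its rings with the same call sequence, visible throughout the rx and tx paths here: vq_getchain() claims the next descriptor chain and returns its ring index, vq_relchain() returns the chain to the used ring with the byte count written to the guest, vq_retchain() backs out a chain that could not be serviced, and vq_endchains() raises the interrupt when conditions warrant. A condensed sketch of the loop, assuming the virtio.h declarations used elsewhere in this diff:

/* Canonical ring-service loop (sketch; sizing and actual I/O elided). */
static void
service_queue(struct vqueue_info *vq)
{
        struct iovec iov[8];
        uint16_t idx;
        int n;

        while (vq_has_descs(vq)) {
                n = vq_getchain(vq, &idx, iov, 8, NULL);
                /* ... consume or fill iov[0..n-1] ... */
                vq_relchain(vq, idx, 0);        /* 0 bytes written to guest */
        }
        vq_endchains(vq, 1);    /* interrupt if needed/appropriate */
}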
*/ - n = vq_getchain(vq, iov, VTNET_MAXSEGS, NULL); + n = vq_getchain(vq, &idx, iov, VTNET_MAXSEGS, NULL); assert(n >= 1 && n <= VTNET_MAXSEGS); plen = 0; tlen = iov[0].iov_len; @@ -518,10 +732,10 @@ pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq) } DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n)); - pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen); + sc->pci_vtnet_tx(sc, &iov[1], n - 1, plen); /* chain is processed, release it and set tlen */ - vq_relchain(vq, tlen); + vq_relchain(vq, idx, tlen); } static void @@ -537,6 +751,7 @@ pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq) /* Signal the tx thread for processing */ pthread_mutex_lock(&sc->tx_mtx); + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; if (sc->tx_in_progress == 0) pthread_cond_signal(&sc->tx_cond); pthread_mutex_unlock(&sc->tx_mtx); @@ -550,7 +765,7 @@ pci_vtnet_tx_thread(void *param) { struct pci_vtnet_softc *sc = param; struct vqueue_info *vq; - int have_work, error; + int error; vq = &sc->vsc_queues[VTNET_TXQ]; @@ -564,23 +779,20 @@ pci_vtnet_tx_thread(void *param) for (;;) { /* note - tx mutex is locked here */ - do { - if (sc->resetting) - have_work = 0; - else - have_work = vq_has_descs(vq); - - if (!have_work) { - sc->tx_in_progress = 0; - error = pthread_cond_wait(&sc->tx_cond, - &sc->tx_mtx); - assert(error == 0); - } - } while (!have_work); + while (sc->resetting || !vq_has_descs(vq)) { + vq->vq_used->vu_flags &= ~VRING_USED_F_NO_NOTIFY; + mb(); + if (!sc->resetting && vq_has_descs(vq)) + break; + + sc->tx_in_progress = 0; + error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx); + assert(error == 0); + } + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; sc->tx_in_progress = 1; pthread_mutex_unlock(&sc->tx_mtx); - vq_startchains(vq); do { /* * Run through entries, placing them into @@ -597,42 +809,161 @@ pci_vtnet_tx_thread(void *param) pthread_mutex_lock(&sc->tx_mtx); } + return (NULL); } -#ifdef notyet +#ifdef __FreeBSD__ static void pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq) { DPRINTF(("vtnet: control qnotify!\n\r")); } -#endif +#endif /* __FreeBSD__ */ -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ static int pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr) { - struct ether_addr *ea; - char *tmpstr; - char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + struct ether_addr *ea; + char *tmpstr; + char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 }; + + tmpstr = strsep(&mac_str,"="); - tmpstr = strsep(&mac_str,"="); - - if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { - ea = ether_aton(mac_str); + if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) { + ea = ether_aton(mac_str); - if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || - memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { + if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) || + memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) { fprintf(stderr, "Invalid MAC %s\n", mac_str); - return (EINVAL); - } else - memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); - } + return (EINVAL); + } else + memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN); + } - return (0); + return (0); } +#endif /* __FreeBSD__ */ + +static void +pci_vtnet_tap_setup(struct pci_vtnet_softc *sc, char *devname) +{ + char tbuf[80]; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif +#ifndef __FreeBSD__ + uchar_t physaddr[DLPI_PHYSADDR_MAX]; + size_t physaddrlen = DLPI_PHYSADDR_MAX; + int error; +#endif + + strcpy(tbuf, "/dev/"); + strlcat(tbuf, devname, sizeof(tbuf)); + + sc->pci_vtnet_rx = pci_vtnet_tap_rx; + sc->pci_vtnet_tx = pci_vtnet_tap_tx; 
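Routing rx and tx through per-softc function pointers keeps the event loop and the tx thread backend-agnostic: pci_vtnet_rx_callback() and pci_vtnet_proctx() only ever invoke sc->pci_vtnet_rx / sc->pci_vtnet_tx, and init binds them once to the tap/DLPI or netmap routines. A stand-alone reduction of the idea, with hypothetical names:

struct vnet;
typedef void (*rx_fn)(struct vnet *);

struct vnet {
        rx_fn   rx;     /* bound once at init: tap, DLPI or netmap */
};

static void tap_rx(struct vnet *v) { (void) v; /* drain the tap fd */ }

static void
rx_event(struct vnet *v)
{
        v->rx(v);       /* caller never knows which backend runs */
}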
+#ifdef __FreeBSD__ + sc->vsc_tapfd = open(tbuf, O_RDWR); + if (sc->vsc_tapfd == -1) { + WPRINTF(("open of tap device %s failed\n", tbuf)); + return; + } + + /* + * Set non-blocking and register for read + * notifications with the event loop + */ + int opt = 1; + if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { + WPRINTF(("tap device O_NONBLOCK failed\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(sc->vsc_tapfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + sc->vsc_mevp = mevent_add(sc->vsc_tapfd, + EVF_READ, + pci_vtnet_rx_callback, + sc); + if (sc->vsc_mevp == NULL) { + WPRINTF(("Could not register event\n")); + close(sc->vsc_tapfd); + sc->vsc_tapfd = -1; + } +#else + if (dlpi_open(devname, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) { + WPRINTF(("open of vnic device %s failed\n", devname)); + } + + if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr, + &physaddrlen) != DLPI_SUCCESS) { + WPRINTF(("read MAC address of vnic device %s failed\n", + devname)); + } + if (physaddrlen != ETHERADDRL) { + WPRINTF(("bad MAC address len %d on vnic device %s\n", + physaddrlen, devname)); + } + memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL); + + if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) { + WPRINTF(("bind of vnic device %s failed\n", devname)); + } + + if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) { + WPRINTF(("enable promiscous mode(physical) of vnic device %s " + "failed\n", devname)); + } + if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) { + WPRINTF(("enable promiscous mode(SAP) of vnic device %s " + "failed\n", devname)); + } + + sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp); + + if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) { + WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n", + devname)); + dlpi_close(sc->vsc_dhp); + sc->vsc_dlpifd = -1; + } + + error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc); + assert(error == 0); #endif +} + +#ifdef __FreeBSD__ +static void +pci_vtnet_netmap_setup(struct pci_vtnet_softc *sc, char *ifname) +{ + sc->pci_vtnet_rx = pci_vtnet_netmap_rx; + sc->pci_vtnet_tx = pci_vtnet_netmap_tx; + + sc->vsc_nmd = nm_open(ifname, NULL, 0, 0); + if (sc->vsc_nmd == NULL) { + WPRINTF(("open of netmap device %s failed\n", ifname)); + return; + } + sc->vsc_mevp = mevent_add(sc->vsc_nmd->fd, + EVF_READ, + pci_vtnet_rx_callback, + sc); + if (sc->vsc_mevp == NULL) { + WPRINTF(("Could not register event\n")); + nm_close(sc->vsc_nmd); + sc->vsc_nmd = NULL; + } +} +#endif /* __FreeBSD__ */ static int pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) @@ -640,31 +971,30 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) #ifdef __FreeBSD__ MD5_CTX mdctx; unsigned char digest[16]; -#else - uchar_t physaddr[DLPI_PHYSADDR_MAX]; - size_t physaddrlen = DLPI_PHYSADDR_MAX; - int error; -#endif char nstr[80]; +#endif char tname[MAXCOMLEN + 1]; struct pci_vtnet_softc *sc; const char *env_msi; char *devname; char *vtopts; +#ifdef __FreeBSD__ int mac_provided; +#endif int use_msix; - sc = malloc(sizeof(struct pci_vtnet_softc)); - memset(sc, 0, sizeof(struct pci_vtnet_softc)); + sc = calloc(1, sizeof(struct pci_vtnet_softc)); pthread_mutex_init(&sc->vsc_mtx, NULL); vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues); + sc->vsc_vs.vs_mtx = &sc->vsc_mtx; + sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ; 
sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq; sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq; -#ifdef notyet +#ifdef __FreeBSD__ sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ; sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq; #endif @@ -682,13 +1012,15 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) * Attempt to open the tap device and read the MAC address * if specified */ - mac_provided = 0; #ifdef __FreeBSD__ + mac_provided = 0; sc->vsc_tapfd = -1; #endif + sc->vsc_nmd = NULL; if (opts != NULL) { - char tbuf[80]; +#ifdef __FreeBSD__ int err; +#endif devname = vtopts = strdup(opts); (void) strsep(&vtopts, ","); @@ -704,72 +1036,15 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) } #endif - strcpy(tbuf, "/dev/"); - strlcat(tbuf, devname, sizeof(tbuf)); +#ifdef __FreeBSD__ + if (strncmp(devname, "vale", 4) == 0) + pci_vtnet_netmap_setup(sc, devname); +#endif + if (strncmp(devname, "tap", 3) == 0 || + strncmp(devname, "vmnet", 5) == 0) + pci_vtnet_tap_setup(sc, devname); free(devname); - -#ifdef __FreeBSD__ - sc->vsc_tapfd = open(tbuf, O_RDWR); - if (sc->vsc_tapfd == -1) { - WPRINTF(("open of tap device %s failed\n", tbuf)); - } else { - /* - * Set non-blocking and register for read - * notifications with the event loop - */ - int opt = 1; - if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) { - WPRINTF(("tap device O_NONBLOCK failed\n")); - close(sc->vsc_tapfd); - sc->vsc_tapfd = -1; - } - - sc->vsc_mevp = mevent_add(sc->vsc_tapfd, - EVF_READ, - pci_vtnet_tap_callback, - sc); - if (sc->vsc_mevp == NULL) { - WPRINTF(("Could not register event\n")); - close(sc->vsc_tapfd); - sc->vsc_tapfd = -1; - } - } -#else - if (dlpi_open(opts, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) { - WPRINTF(("open of vnic device %s failed\n", opts)); - } - - if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr, &physaddrlen) != DLPI_SUCCESS) { - WPRINTF(("read MAC address of vnic device %s failed\n", opts)); - } - if (physaddrlen != ETHERADDRL) { - WPRINTF(("bad MAC address len %d on vnic device %s\n", physaddrlen, opts)); - } - memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL); - - if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) { - WPRINTF(("bind of vnic device %s failed\n", opts)); - } - - if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) { - WPRINTF(("enable promiscous mode(physical) of vnic device %s failed\n", opts)); - } - if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) { - WPRINTF(("enable promiscous mode(SAP) of vnic device %s failed\n", opts)); - } - - sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp); - - if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) { - WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n", opts)); - dlpi_close(sc->vsc_dhp); - sc->vsc_dlpifd = -1; - } - - error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc); - assert(error == 0); -#endif } #ifdef __FreeBSD__ @@ -799,9 +1074,15 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK); pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); - /* link always up */ - sc->vsc_config.status = 1; + /* Link is up if we managed to open tap device or vale port. 
*/ +#ifdef __FreeBSD__ + sc->vsc_config.status = (opts == NULL || sc->vsc_tapfd >= 0 || +#else + sc->vsc_config.status = (opts == NULL || sc->vsc_dlpifd >= 0 || +#endif + sc->vsc_nmd != NULL); /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */ if (vi_intr_init(&sc->vsc_vs, 1, use_msix)) @@ -812,6 +1093,8 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) sc->resetting = 0; + sc->rx_merge = 1; + sc->rx_vhdrlen = sizeof(struct virtio_net_rxhdr); sc->rx_in_progress = 0; pthread_mutex_init(&sc->rx_mtx, NULL); @@ -824,8 +1107,9 @@ pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) pthread_mutex_init(&sc->tx_mtx, NULL); pthread_cond_init(&sc->tx_cond, NULL); pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc); - snprintf(tname, sizeof(tname), "%s vtnet%d tx", vmname, pi->pi_slot); - pthread_set_name_np(sc->tx_tid, tname); + snprintf(tname, sizeof(tname), "vtnet-%d:%d tx", pi->pi_slot, + pi->pi_func); + pthread_set_name_np(sc->tx_tid, tname); return (0); } @@ -844,9 +1128,10 @@ pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value) ptr = &sc->vsc_config.mac[offset]; memcpy(ptr, &value, size); } else { + /* silently ignore other writes */ DPRINTF(("vtnet: write to readonly reg %d\n\r", offset)); - return (1); } + return (0); } @@ -861,6 +1146,20 @@ pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval) return (0); } +static void +pci_vtnet_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtnet_softc *sc = vsc; + + sc->vsc_features = negotiated_features; + + if (!(sc->vsc_features & VIRTIO_NET_F_MRG_RXBUF)) { + sc->rx_merge = 0; + /* non-merge rx header is 2 bytes shorter */ + sc->rx_vhdrlen -= 2; + } +} + struct pci_devemu pci_de_vnet = { .pe_emu = "virtio-net", .pe_init = pci_vtnet_init, diff --git a/usr/src/cmd/bhyve/pci_virtio_rnd.c b/usr/src/cmd/bhyve/pci_virtio_rnd.c new file mode 100644 index 0000000000..5f470c03a6 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_rnd.c @@ -0,0 +1,209 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Nahanni Systems Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * virtio entropy device emulation. 
+ * Randomness is sourced from /dev/random which does not block + * once it has been seeded at bootup. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/linker_set.h> +#include <sys/uio.h> + +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <sysexits.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" + +#define VTRND_RINGSZ 64 + + +static int pci_vtrnd_debug; +#define DPRINTF(params) if (pci_vtrnd_debug) printf params +#define WPRINTF(params) printf params + +/* + * Per-device softc + */ +struct pci_vtrnd_softc { + struct virtio_softc vrsc_vs; + struct vqueue_info vrsc_vq; + pthread_mutex_t vrsc_mtx; + uint64_t vrsc_cfg; + int vrsc_fd; +}; + +static void pci_vtrnd_reset(void *); +static void pci_vtrnd_notify(void *, struct vqueue_info *); + +static struct virtio_consts vtrnd_vi_consts = { + "vtrnd", /* our name */ + 1, /* we support 1 virtqueue */ + 0, /* config reg size */ + pci_vtrnd_reset, /* reset */ + pci_vtrnd_notify, /* device-wide qnotify */ + NULL, /* read virtio config */ + NULL, /* write virtio config */ + NULL, /* apply negotiated features */ + 0, /* our capabilities */ +}; + + +static void +pci_vtrnd_reset(void *vsc) +{ + struct pci_vtrnd_softc *sc; + + sc = vsc; + + DPRINTF(("vtrnd: device reset requested !\n")); + vi_reset_dev(&sc->vrsc_vs); +} + + +static void +pci_vtrnd_notify(void *vsc, struct vqueue_info *vq) +{ + struct iovec iov; + struct pci_vtrnd_softc *sc; + int len; + uint16_t idx; + + sc = vsc; + + if (sc->vrsc_fd < 0) { + vq_endchains(vq, 0); + return; + } + + while (vq_has_descs(vq)) { + vq_getchain(vq, &idx, &iov, 1, NULL); + + len = read(sc->vrsc_fd, iov.iov_base, iov.iov_len); + + DPRINTF(("vtrnd: vtrnd_notify(): %d\r\n", len)); + + /* Catastrophe if unable to read from /dev/random */ + assert(len > 0); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, len); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ +} + + +static int +pci_vtrnd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtrnd_softc *sc; + int fd; + int len; + uint8_t v; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif + + /* + * Should always be able to open /dev/random. + */ + fd = open("/dev/random", O_RDONLY | O_NONBLOCK); + + assert(fd >= 0); + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_READ); + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + /* + * Check that device is seeded and non-blocking. 
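+ * A single one-byte non-blocking read serves as the probe: it can
+ * only succeed once the kernel entropy pool has been seeded, so a
+ * short or failed read here means the device is not yet usable.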
+ */ + len = read(fd, &v, sizeof(v)); + if (len <= 0) { + WPRINTF(("vtrnd: /dev/random not ready, read(): %d", len)); + close(fd); + return (1); + } + + sc = calloc(1, sizeof(struct pci_vtrnd_softc)); + + vi_softc_linkup(&sc->vrsc_vs, &vtrnd_vi_consts, sc, pi, &sc->vrsc_vq); + sc->vrsc_vs.vs_mtx = &sc->vrsc_mtx; + + sc->vrsc_vq.vq_qsize = VTRND_RINGSZ; + + /* keep /dev/random opened while emulating */ + sc->vrsc_fd = fd; + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_RANDOM); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_CRYPTO); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_ENTROPY); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vrsc_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vrsc_vs, 0); + + return (0); +} + + +struct pci_devemu pci_de_vrnd = { + .pe_emu = "virtio-rnd", + .pe_init = pci_vtrnd_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vrnd); diff --git a/usr/src/cmd/bhyve/pci_virtio_scsi.c b/usr/src/cmd/bhyve/pci_virtio_scsi.c new file mode 100644 index 0000000000..38e7d918a0 --- /dev/null +++ b/usr/src/cmd/bhyve/pci_virtio_scsi.c @@ -0,0 +1,737 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Jakub Klama <jceel@FreeBSD.org>. + * Copyright (c) 2018 Marcelo Araujo <araujo@FreeBSD.org>. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/linker_set.h> +#include <sys/types.h> +#include <sys/uio.h> +#include <sys/time.h> +#include <sys/queue.h> +#include <sys/sbuf.h> + +#include <errno.h> +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#include <stdbool.h> +#include <string.h> +#include <unistd.h> +#include <assert.h> +#include <pthread.h> +#include <pthread_np.h> + +#include <cam/scsi/scsi_all.h> +#include <cam/scsi/scsi_message.h> +#include <cam/ctl/ctl.h> +#include <cam/ctl/ctl_io.h> +#include <cam/ctl/ctl_backend.h> +#include <cam/ctl/ctl_ioctl.h> +#include <cam/ctl/ctl_util.h> +#include <cam/ctl/ctl_scsi_all.h> +#include <camlib.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "virtio.h" +#include "iov.h" + +#define VTSCSI_RINGSZ 64 +#define VTSCSI_REQUESTQ 1 +#define VTSCSI_THR_PER_Q 16 +#define VTSCSI_MAXQ (VTSCSI_REQUESTQ + 2) +#define VTSCSI_MAXSEG 64 + +#define VTSCSI_IN_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_rd) + _sc->vss_config.cdb_size) + +#define VTSCSI_OUT_HEADER_LEN(_sc) \ + (sizeof(struct pci_vtscsi_req_cmd_wr) + _sc->vss_config.sense_size) + +#define VIRTIO_SCSI_MAX_CHANNEL 0 +#define VIRTIO_SCSI_MAX_TARGET 0 +#define VIRTIO_SCSI_MAX_LUN 16383 + +#define VIRTIO_SCSI_F_INOUT (1 << 0) +#define VIRTIO_SCSI_F_HOTPLUG (1 << 1) +#define VIRTIO_SCSI_F_CHANGE (1 << 2) + +static int pci_vtscsi_debug = 0; +#define DPRINTF(params) if (pci_vtscsi_debug) printf params +#define WPRINTF(params) printf params + +struct pci_vtscsi_config { + uint32_t num_queues; + uint32_t seg_max; + uint32_t max_sectors; + uint32_t cmd_per_lun; + uint32_t event_info_size; + uint32_t sense_size; + uint32_t cdb_size; + uint16_t max_channel; + uint16_t max_target; + uint32_t max_lun; +} __attribute__((packed)); + +struct pci_vtscsi_queue { + struct pci_vtscsi_softc * vsq_sc; + struct vqueue_info * vsq_vq; + pthread_mutex_t vsq_mtx; + pthread_mutex_t vsq_qmtx; + pthread_cond_t vsq_cv; + STAILQ_HEAD(, pci_vtscsi_request) vsq_requests; + LIST_HEAD(, pci_vtscsi_worker) vsq_workers; +}; + +struct pci_vtscsi_worker { + struct pci_vtscsi_queue * vsw_queue; + pthread_t vsw_thread; + bool vsw_exiting; + LIST_ENTRY(pci_vtscsi_worker) vsw_link; +}; + +struct pci_vtscsi_request { + struct pci_vtscsi_queue * vsr_queue; + struct iovec vsr_iov_in[VTSCSI_MAXSEG]; + int vsr_niov_in; + struct iovec vsr_iov_out[VTSCSI_MAXSEG]; + int vsr_niov_out; + uint32_t vsr_idx; + STAILQ_ENTRY(pci_vtscsi_request) vsr_link; +}; + +/* + * Per-device softc + */ +struct pci_vtscsi_softc { + struct virtio_softc vss_vs; + struct vqueue_info vss_vq[VTSCSI_MAXQ]; + struct pci_vtscsi_queue vss_queues[VTSCSI_REQUESTQ]; + pthread_mutex_t vss_mtx; + int vss_iid; + int vss_ctl_fd; + uint32_t vss_features; + struct pci_vtscsi_config vss_config; +}; + +#define VIRTIO_SCSI_T_TMF 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK 0 +#define VIRTIO_SCSI_T_TMF_ABORT_TASK_SET 1 +#define VIRTIO_SCSI_T_TMF_CLEAR_ACA 2 +#define VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET 3 +#define VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET 4 +#define VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET 5 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK 6 +#define VIRTIO_SCSI_T_TMF_QUERY_TASK_SET 7 + +/* command-specific response values */ +#define VIRTIO_SCSI_S_FUNCTION_COMPLETE 0 +#define VIRTIO_SCSI_S_FUNCTION_SUCCEEDED 10 +#define VIRTIO_SCSI_S_FUNCTION_REJECTED 11 + +struct pci_vtscsi_ctrl_tmf { + uint32_t type; + uint32_t subtype; + uint8_t lun[8]; + uint64_t id; + uint8_t response; +} __attribute__((packed)); + +#define 
VIRTIO_SCSI_T_AN_QUERY 1 +#define VIRTIO_SCSI_EVT_ASYNC_OPERATIONAL_CHANGE 2 +#define VIRTIO_SCSI_EVT_ASYNC_POWER_MGMT 4 +#define VIRTIO_SCSI_EVT_ASYNC_EXTERNAL_REQUEST 8 +#define VIRTIO_SCSI_EVT_ASYNC_MEDIA_CHANGE 16 +#define VIRTIO_SCSI_EVT_ASYNC_MULTI_HOST 32 +#define VIRTIO_SCSI_EVT_ASYNC_DEVICE_BUSY 64 + +struct pci_vtscsi_ctrl_an { + uint32_t type; + uint8_t lun[8]; + uint32_t event_requested; + uint32_t event_actual; + uint8_t response; +} __attribute__((packed)); + +/* command-specific response values */ +#define VIRTIO_SCSI_S_OK 0 +#define VIRTIO_SCSI_S_OVERRUN 1 +#define VIRTIO_SCSI_S_ABORTED 2 +#define VIRTIO_SCSI_S_BAD_TARGET 3 +#define VIRTIO_SCSI_S_RESET 4 +#define VIRTIO_SCSI_S_BUSY 5 +#define VIRTIO_SCSI_S_TRANSPORT_FAILURE 6 +#define VIRTIO_SCSI_S_TARGET_FAILURE 7 +#define VIRTIO_SCSI_S_NEXUS_FAILURE 8 +#define VIRTIO_SCSI_S_FAILURE 9 +#define VIRTIO_SCSI_S_INCORRECT_LUN 12 + +/* task_attr */ +#define VIRTIO_SCSI_S_SIMPLE 0 +#define VIRTIO_SCSI_S_ORDERED 1 +#define VIRTIO_SCSI_S_HEAD 2 +#define VIRTIO_SCSI_S_ACA 3 + +struct pci_vtscsi_event { + uint32_t event; + uint8_t lun[8]; + uint32_t reason; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_rd { + uint8_t lun[8]; + uint64_t id; + uint8_t task_attr; + uint8_t prio; + uint8_t crn; + uint8_t cdb[]; +} __attribute__((packed)); + +struct pci_vtscsi_req_cmd_wr { + uint32_t sense_len; + uint32_t residual; + uint16_t status_qualifier; + uint8_t status; + uint8_t response; + uint8_t sense[]; +} __attribute__((packed)); + +static void *pci_vtscsi_proc(void *); +static void pci_vtscsi_reset(void *); +static void pci_vtscsi_neg_features(void *, uint64_t); +static int pci_vtscsi_cfgread(void *, int, int, uint32_t *); +static int pci_vtscsi_cfgwrite(void *, int, int, uint32_t); +static inline int pci_vtscsi_get_lun(uint8_t *); +static int pci_vtscsi_control_handle(struct pci_vtscsi_softc *, void *, size_t); +static int pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_tmf *); +static int pci_vtscsi_an_handle(struct pci_vtscsi_softc *, + struct pci_vtscsi_ctrl_an *); +static int pci_vtscsi_request_handle(struct pci_vtscsi_queue *, struct iovec *, + int, struct iovec *, int); +static void pci_vtscsi_controlq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_eventq_notify(void *, struct vqueue_info *); +static void pci_vtscsi_requestq_notify(void *, struct vqueue_info *); +static int pci_vtscsi_init_queue(struct pci_vtscsi_softc *, + struct pci_vtscsi_queue *, int); +static int pci_vtscsi_init(struct vmctx *, struct pci_devinst *, char *); + +static struct virtio_consts vtscsi_vi_consts = { + "vtscsi", /* our name */ + VTSCSI_MAXQ, /* we support 2+n virtqueues */ + sizeof(struct pci_vtscsi_config), /* config reg size */ + pci_vtscsi_reset, /* reset */ + NULL, /* device-wide qnotify */ + pci_vtscsi_cfgread, /* read virtio config */ + pci_vtscsi_cfgwrite, /* write virtio config */ + pci_vtscsi_neg_features, /* apply negotiated features */ + 0, /* our capabilities */ +}; + +static void * +pci_vtscsi_proc(void *arg) +{ + struct pci_vtscsi_worker *worker = (struct pci_vtscsi_worker *)arg; + struct pci_vtscsi_queue *q = worker->vsw_queue; + struct pci_vtscsi_request *req; + int iolen; + + for (;;) { + pthread_mutex_lock(&q->vsq_mtx); + + while (STAILQ_EMPTY(&q->vsq_requests) + && !worker->vsw_exiting) + pthread_cond_wait(&q->vsq_cv, &q->vsq_mtx); + + if (worker->vsw_exiting) + break; + + req = STAILQ_FIRST(&q->vsq_requests); + STAILQ_REMOVE_HEAD(&q->vsq_requests, vsr_link); + + 
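+		/*
+		 * The request is now unlinked from vsq_requests, so
+		 * vsq_mtx can be dropped across the potentially slow
+		 * CTL round trip below; used-ring completion is
+		 * serialized separately under vsq_qmtx.
+		 */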
pthread_mutex_unlock(&q->vsq_mtx); + iolen = pci_vtscsi_request_handle(q, req->vsr_iov_in, + req->vsr_niov_in, req->vsr_iov_out, req->vsr_niov_out); + + pthread_mutex_lock(&q->vsq_qmtx); + vq_relchain(q->vsq_vq, req->vsr_idx, iolen); + vq_endchains(q->vsq_vq, 0); + pthread_mutex_unlock(&q->vsq_qmtx); + + DPRINTF(("virtio-scsi: request <idx=%d> completed\n", + req->vsr_idx)); + free(req); + } + + pthread_mutex_unlock(&q->vsq_mtx); + return (NULL); +} + +static void +pci_vtscsi_reset(void *vsc) +{ + struct pci_vtscsi_softc *sc; + + sc = vsc; + + DPRINTF(("vtscsi: device reset requested\n")); + vi_reset_dev(&sc->vss_vs); + + /* initialize config structure */ + sc->vss_config = (struct pci_vtscsi_config){ + .num_queues = VTSCSI_REQUESTQ, + .seg_max = VTSCSI_MAXSEG, + .max_sectors = 2, + .cmd_per_lun = 1, + .event_info_size = sizeof(struct pci_vtscsi_event), + .sense_size = 96, + .cdb_size = 32, + .max_channel = VIRTIO_SCSI_MAX_CHANNEL, + .max_target = VIRTIO_SCSI_MAX_TARGET, + .max_lun = VIRTIO_SCSI_MAX_LUN + }; +} + +static void +pci_vtscsi_neg_features(void *vsc, uint64_t negotiated_features) +{ + struct pci_vtscsi_softc *sc = vsc; + + sc->vss_features = negotiated_features; +} + +static int +pci_vtscsi_cfgread(void *vsc, int offset, int size, uint32_t *retval) +{ + struct pci_vtscsi_softc *sc = vsc; + void *ptr; + + ptr = (uint8_t *)&sc->vss_config + offset; + memcpy(retval, ptr, size); + return (0); +} + +static int +pci_vtscsi_cfgwrite(void *vsc, int offset, int size, uint32_t val) +{ + + return (0); +} + +static inline int +pci_vtscsi_get_lun(uint8_t *lun) +{ + + return (((lun[2] << 8) | lun[3]) & 0x3fff); +} + +static int +pci_vtscsi_control_handle(struct pci_vtscsi_softc *sc, void *buf, + size_t bufsize) +{ + struct pci_vtscsi_ctrl_tmf *tmf; + struct pci_vtscsi_ctrl_an *an; + uint32_t type; + + type = *(uint32_t *)buf; + + if (type == VIRTIO_SCSI_T_TMF) { + tmf = (struct pci_vtscsi_ctrl_tmf *)buf; + return (pci_vtscsi_tmf_handle(sc, tmf)); + } + + if (type == VIRTIO_SCSI_T_AN_QUERY) { + an = (struct pci_vtscsi_ctrl_an *)buf; + return (pci_vtscsi_an_handle(sc, an)); + } + + return (0); +} + +static int +pci_vtscsi_tmf_handle(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_ctrl_tmf *tmf) +{ + union ctl_io *io; + int err; + + io = ctl_scsi_alloc_io(sc->vss_iid); + ctl_scsi_zero_io(io); + + io->io_hdr.io_type = CTL_IO_TASK; + io->io_hdr.nexus.initid = sc->vss_iid; + io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(tmf->lun); + io->taskio.tag_type = CTL_TAG_SIMPLE; + io->taskio.tag_num = (uint32_t)tmf->id; + + switch (tmf->subtype) { + case VIRTIO_SCSI_T_TMF_ABORT_TASK: + io->taskio.task_action = CTL_TASK_ABORT_TASK; + break; + + case VIRTIO_SCSI_T_TMF_ABORT_TASK_SET: + io->taskio.task_action = CTL_TASK_ABORT_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_ACA: + io->taskio.task_action = CTL_TASK_CLEAR_ACA; + break; + + case VIRTIO_SCSI_T_TMF_CLEAR_TASK_SET: + io->taskio.task_action = CTL_TASK_CLEAR_TASK_SET; + break; + + case VIRTIO_SCSI_T_TMF_I_T_NEXUS_RESET: + io->taskio.task_action = CTL_TASK_I_T_NEXUS_RESET; + break; + + case VIRTIO_SCSI_T_TMF_LOGICAL_UNIT_RESET: + io->taskio.task_action = CTL_TASK_LUN_RESET; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK: + io->taskio.task_action = CTL_TASK_QUERY_TASK; + break; + + case VIRTIO_SCSI_T_TMF_QUERY_TASK_SET: + io->taskio.task_action = CTL_TASK_QUERY_TASK_SET; + break; + } + + if (pci_vtscsi_debug) { + struct sbuf *sb = sbuf_new_auto(); + ctl_io_sbuf(io, sb); + sbuf_finish(sb); + DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); + 
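+		/*
+		 * sbuf_data() is only valid once sbuf_finish() has been
+		 * called; the buffer is released as soon as the formatted
+		 * CTL request has been logged.
+		 */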
sbuf_delete(sb); + } + + err = ioctl(sc->vss_ctl_fd, CTL_IO, io); + if (err != 0) + WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno))); + + tmf->response = io->taskio.task_status; + ctl_scsi_free_io(io); + return (1); +} + +static int +pci_vtscsi_an_handle(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_ctrl_an *an) +{ + + return (0); +} + +static int +pci_vtscsi_request_handle(struct pci_vtscsi_queue *q, struct iovec *iov_in, + int niov_in, struct iovec *iov_out, int niov_out) +{ + struct pci_vtscsi_softc *sc = q->vsq_sc; + struct pci_vtscsi_req_cmd_rd *cmd_rd = NULL; + struct pci_vtscsi_req_cmd_wr *cmd_wr; + struct iovec data_iov_in[VTSCSI_MAXSEG], data_iov_out[VTSCSI_MAXSEG]; + union ctl_io *io; + int data_niov_in, data_niov_out; + void *ext_data_ptr = NULL; + uint32_t ext_data_len = 0, ext_sg_entries = 0; + int err; + + seek_iov(iov_in, niov_in, data_iov_in, &data_niov_in, + VTSCSI_IN_HEADER_LEN(sc)); + seek_iov(iov_out, niov_out, data_iov_out, &data_niov_out, + VTSCSI_OUT_HEADER_LEN(sc)); + + truncate_iov(iov_in, &niov_in, VTSCSI_IN_HEADER_LEN(sc)); + truncate_iov(iov_out, &niov_out, VTSCSI_OUT_HEADER_LEN(sc)); + iov_to_buf(iov_in, niov_in, (void **)&cmd_rd); + + cmd_wr = malloc(VTSCSI_OUT_HEADER_LEN(sc)); + io = ctl_scsi_alloc_io(sc->vss_iid); + ctl_scsi_zero_io(io); + + io->io_hdr.nexus.initid = sc->vss_iid; + io->io_hdr.nexus.targ_lun = pci_vtscsi_get_lun(cmd_rd->lun); + + io->io_hdr.io_type = CTL_IO_SCSI; + + if (data_niov_in > 0) { + ext_data_ptr = (void *)data_iov_in; + ext_sg_entries = data_niov_in; + ext_data_len = count_iov(data_iov_in, data_niov_in); + io->io_hdr.flags |= CTL_FLAG_DATA_OUT; + } else if (data_niov_out > 0) { + ext_data_ptr = (void *)data_iov_out; + ext_sg_entries = data_niov_out; + ext_data_len = count_iov(data_iov_out, data_niov_out); + io->io_hdr.flags |= CTL_FLAG_DATA_IN; + } + + io->scsiio.sense_len = sc->vss_config.sense_size; + io->scsiio.tag_num = (uint32_t)cmd_rd->id; + switch (cmd_rd->task_attr) { + case VIRTIO_SCSI_S_ORDERED: + io->scsiio.tag_type = CTL_TAG_ORDERED; + break; + case VIRTIO_SCSI_S_HEAD: + io->scsiio.tag_type = CTL_TAG_HEAD_OF_QUEUE; + break; + case VIRTIO_SCSI_S_ACA: + io->scsiio.tag_type = CTL_TAG_ACA; + break; + case VIRTIO_SCSI_S_SIMPLE: + default: + io->scsiio.tag_type = CTL_TAG_SIMPLE; + break; + } + io->scsiio.ext_sg_entries = ext_sg_entries; + io->scsiio.ext_data_ptr = ext_data_ptr; + io->scsiio.ext_data_len = ext_data_len; + io->scsiio.ext_data_filled = 0; + io->scsiio.cdb_len = sc->vss_config.cdb_size; + memcpy(io->scsiio.cdb, cmd_rd->cdb, sc->vss_config.cdb_size); + + if (pci_vtscsi_debug) { + struct sbuf *sb = sbuf_new_auto(); + ctl_io_sbuf(io, sb); + sbuf_finish(sb); + DPRINTF(("pci_virtio_scsi: %s", sbuf_data(sb))); + sbuf_delete(sb); + } + + err = ioctl(sc->vss_ctl_fd, CTL_IO, io); + if (err != 0) { + WPRINTF(("CTL_IO: err=%d (%s)\n", errno, strerror(errno))); + cmd_wr->response = VIRTIO_SCSI_S_FAILURE; + } else { + cmd_wr->sense_len = MIN(io->scsiio.sense_len, + sc->vss_config.sense_size); + cmd_wr->residual = io->scsiio.residual; + cmd_wr->status = io->scsiio.scsi_status; + cmd_wr->response = VIRTIO_SCSI_S_OK; + memcpy(&cmd_wr->sense, &io->scsiio.sense_data, + cmd_wr->sense_len); + } + + buf_to_iov(cmd_wr, VTSCSI_OUT_HEADER_LEN(sc), iov_out, niov_out, 0); + free(cmd_rd); + free(cmd_wr); + ctl_scsi_free_io(io); + return (VTSCSI_OUT_HEADER_LEN(sc) + io->scsiio.ext_data_filled); +} + +static void +pci_vtscsi_controlq_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtscsi_softc *sc; + struct iovec 
iov[VTSCSI_MAXSEG]; + uint16_t idx, n; + void *buf = NULL; + size_t bufsize; + int iolen; + + sc = vsc; + + while (vq_has_descs(vq)) { + n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, NULL); + bufsize = iov_to_buf(iov, n, &buf); + iolen = pci_vtscsi_control_handle(sc, buf, bufsize); + buf_to_iov(buf + bufsize - iolen, iolen, iov, n, + bufsize - iolen); + + /* + * Release this chain and handle more + */ + vq_relchain(vq, idx, iolen); + } + vq_endchains(vq, 1); /* Generate interrupt if appropriate. */ + free(buf); +} + +static void +pci_vtscsi_eventq_notify(void *vsc, struct vqueue_info *vq) +{ + + vq->vq_used->vu_flags |= VRING_USED_F_NO_NOTIFY; +} + +static void +pci_vtscsi_requestq_notify(void *vsc, struct vqueue_info *vq) +{ + struct pci_vtscsi_softc *sc; + struct pci_vtscsi_queue *q; + struct pci_vtscsi_request *req; + struct iovec iov[VTSCSI_MAXSEG]; + uint16_t flags[VTSCSI_MAXSEG]; + uint16_t idx, n, i; + int readable; + + sc = vsc; + q = &sc->vss_queues[vq->vq_num - 2]; + + while (vq_has_descs(vq)) { + readable = 0; + n = vq_getchain(vq, &idx, iov, VTSCSI_MAXSEG, flags); + + /* Count readable descriptors */ + for (i = 0; i < n; i++) { + if (flags[i] & VRING_DESC_F_WRITE) + break; + + readable++; + } + + req = calloc(1, sizeof(struct pci_vtscsi_request)); + req->vsr_idx = idx; + req->vsr_queue = q; + req->vsr_niov_in = readable; + req->vsr_niov_out = n - readable; + memcpy(req->vsr_iov_in, iov, + req->vsr_niov_in * sizeof(struct iovec)); + memcpy(req->vsr_iov_out, iov + readable, + req->vsr_niov_out * sizeof(struct iovec)); + + pthread_mutex_lock(&q->vsq_mtx); + STAILQ_INSERT_TAIL(&q->vsq_requests, req, vsr_link); + pthread_cond_signal(&q->vsq_cv); + pthread_mutex_unlock(&q->vsq_mtx); + + DPRINTF(("virtio-scsi: request <idx=%d> enqueued\n", idx)); + } +} + +static int +pci_vtscsi_init_queue(struct pci_vtscsi_softc *sc, + struct pci_vtscsi_queue *queue, int num) +{ + struct pci_vtscsi_worker *worker; + char tname[MAXCOMLEN + 1]; + int i; + + queue->vsq_sc = sc; + queue->vsq_vq = &sc->vss_vq[num + 2]; + + pthread_mutex_init(&queue->vsq_mtx, NULL); + pthread_mutex_init(&queue->vsq_qmtx, NULL); + pthread_cond_init(&queue->vsq_cv, NULL); + STAILQ_INIT(&queue->vsq_requests); + LIST_INIT(&queue->vsq_workers); + + for (i = 0; i < VTSCSI_THR_PER_Q; i++) { + worker = calloc(1, sizeof(struct pci_vtscsi_worker)); + worker->vsw_queue = queue; + + pthread_create(&worker->vsw_thread, NULL, &pci_vtscsi_proc, + (void *)worker); + + snprintf(tname, sizeof(tname), "vtscsi:%d-%d", num, i); + pthread_set_name_np(worker->vsw_thread, tname); + LIST_INSERT_HEAD(&queue->vsq_workers, worker, vsw_link); + } + + return (0); +} + +static int +pci_vtscsi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_vtscsi_softc *sc; + char *opt, *optname; + const char *devname; + int i, optidx = 0; + + sc = calloc(1, sizeof(struct pci_vtscsi_softc)); + devname = "/dev/cam/ctl"; + while ((opt = strsep(&opts, ",")) != NULL) { + optname = strsep(&opt, "="); + if (opt == NULL && optidx == 0) { + if (optname[0] != 0) + devname = optname; + } else if (strcmp(optname, "dev") == 0 && opt != NULL) { + devname = opt; + } else if (strcmp(optname, "iid") == 0 && opt != NULL) { + sc->vss_iid = strtoul(opt, NULL, 10); + } else { + fprintf(stderr, "Invalid option %s\n", optname); + free(sc); + return (1); + } + optidx++; + } + + sc->vss_ctl_fd = open(devname, O_RDWR); + if (sc->vss_ctl_fd < 0) { + WPRINTF(("cannot open %s: %s\n", devname, strerror(errno))); + free(sc); + return (1); + } + + 
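+	/*
+	 * For reference, the option string parsed above admits forms
+	 * such as (illustrative only):
+	 *
+	 *   -s 4,virtio-scsi,/dev/cam/ctl
+	 *   -s 4,virtio-scsi,dev=/dev/cam/ctl,iid=3
+	 *
+	 * A bare first token names the CTL device node; "dev=" and
+	 * "iid=" set the device path and initiator ID explicitly.
+	 */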
vi_softc_linkup(&sc->vss_vs, &vtscsi_vi_consts, sc, pi, sc->vss_vq); + sc->vss_vs.vs_mtx = &sc->vss_mtx; + + /* controlq */ + sc->vss_vq[0].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[0].vq_notify = pci_vtscsi_controlq_notify; + + /* eventq */ + sc->vss_vq[1].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[1].vq_notify = pci_vtscsi_eventq_notify; + + /* request queues */ + for (i = 2; i < VTSCSI_MAXQ; i++) { + sc->vss_vq[i].vq_qsize = VTSCSI_RINGSZ; + sc->vss_vq[i].vq_notify = pci_vtscsi_requestq_notify; + pci_vtscsi_init_queue(sc, &sc->vss_queues[i - 2], i - 2); + } + + /* initialize config space */ + pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_SCSI); + pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE); + pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_SCSI); + pci_set_cfgdata16(pi, PCIR_SUBVEND_0, VIRTIO_VENDOR); + + if (vi_intr_init(&sc->vss_vs, 1, fbsdrun_virtio_msix())) + return (1); + vi_set_io_bar(&sc->vss_vs, 0); + + return (0); +} + + +struct pci_devemu pci_de_vscsi = { + .pe_emu = "virtio-scsi", + .pe_init = pci_vtscsi_init, + .pe_barwrite = vi_pci_write, + .pe_barread = vi_pci_read +}; +PCI_EMUL_SET(pci_de_vscsi); diff --git a/usr/src/cmd/bhyve/pci_virtio_viona.c b/usr/src/cmd/bhyve/pci_virtio_viona.c index f4d5d528be..e5a5cb584f 100644 --- a/usr/src/cmd/bhyve/pci_virtio_viona.c +++ b/usr/src/cmd/bhyve/pci_virtio_viona.c @@ -34,6 +34,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include <sys/cdefs.h> @@ -289,6 +290,8 @@ pci_viona_tx_thread(void *param) sc->vsc_tx_kick_lock_held = B_FALSE; } pthread_mutex_unlock(&sc->tx_mtx); + + return (NULL); } static void @@ -347,8 +350,10 @@ static int pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc) { vioc_create_t vna_create; +#if notyet char devname[MAXNAMELEN]; int ctlfd; +#endif int error; sc->vsc_vnafd = open("/devices/pseudo/viona@0:ctl", O_RDWR | O_EXCL); @@ -360,10 +365,12 @@ pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc) vna_create.c_linkid = sc->vsc_linkid; strlcpy(vna_create.c_vmname, vmname, sizeof (vna_create.c_vmname)); +#if notyet vm_get_memory_seg(ctx, 1 * (1024 * 1024UL), &vna_create.c_lomem_size, NULL); vm_get_memory_seg(ctx, 4 * (1024 * 1024 * 1024UL), &vna_create.c_himem_size, NULL); +#endif error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create); if (error != 0) { WPRINTF(("ioctl viona create failed %d\n", error)); @@ -495,7 +502,7 @@ viona_adjust_offset(struct pci_devinst *pi, uint64_t offset) static void pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, - int baridx, uint64_t offset, int size, uint64_t value) + int baridx, uint64_t offset, int size, uint64_t value) { struct pci_viona_softc *sc = pi->pi_arg; void *ptr; diff --git a/usr/src/cmd/bhyve/pci_xhci.c b/usr/src/cmd/bhyve/pci_xhci.c new file mode 100644 index 0000000000..29d56ec32c --- /dev/null +++ b/usr/src/cmd/bhyve/pci_xhci.c @@ -0,0 +1,2855 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com> + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ +/* + XHCI options: + -s <n>,xhci,{devices} + + devices: + tablet USB tablet mouse + */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/uio.h> +#include <sys/types.h> +#include <sys/queue.h> + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <string.h> +#include <errno.h> +#include <pthread.h> +#include <unistd.h> + +#include <dev/usb/usbdi.h> +#include <dev/usb/usb.h> +#include <dev/usb/usb_freebsd.h> +#include <xhcireg.h> + +#include "bhyverun.h" +#include "pci_emul.h" +#include "pci_xhci.h" +#include "usb_emul.h" + + +static int xhci_debug = 0; +#define DPRINTF(params) if (xhci_debug) printf params +#define WPRINTF(params) printf params + + +#define XHCI_NAME "xhci" +#define XHCI_MAX_DEVS 8 /* 4 USB3 + 4 USB2 devs */ + +#define XHCI_MAX_SLOTS 64 /* min allowed by Windows drivers */ + +/* + * XHCI data structures can be up to 64k, but limit paddr_guest2host mapping + * to 4k to avoid going over the guest physical memory barrier. 
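+ * (The XHCI_GADDR() accessor below enforces this by clamping each
+ * mapping to what remains of the containing 4k page.)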
+ */ +#define XHCI_PADDR_SZ 4096 /* paddr_guest2host max size */ + +#define XHCI_ERST_MAX 0 /* max 2^entries event ring seg tbl */ + +#define XHCI_CAPLEN (4*8) /* offset of op register space */ +#define XHCI_HCCPRAMS2 0x1C /* offset of HCCPARAMS2 register */ +#define XHCI_PORTREGS_START 0x400 +#define XHCI_DOORBELL_MAX 256 + +#define XHCI_STREAMS_MAX 1 /* 4-15 in XHCI spec */ + +/* caplength and hci-version registers */ +#define XHCI_SET_CAPLEN(x) ((x) & 0xFF) +#define XHCI_SET_HCIVERSION(x) (((x) & 0xFFFF) << 16) +#define XHCI_GET_HCIVERSION(x) (((x) >> 16) & 0xFFFF) + +/* hcsparams1 register */ +#define XHCI_SET_HCSP1_MAXSLOTS(x) ((x) & 0xFF) +#define XHCI_SET_HCSP1_MAXINTR(x) (((x) & 0x7FF) << 8) +#define XHCI_SET_HCSP1_MAXPORTS(x) (((x) & 0xFF) << 24) + +/* hcsparams2 register */ +#define XHCI_SET_HCSP2_IST(x) ((x) & 0x0F) +#define XHCI_SET_HCSP2_ERSTMAX(x) (((x) & 0x0F) << 4) +#define XHCI_SET_HCSP2_MAXSCRATCH_HI(x) (((x) & 0x1F) << 21) +#define XHCI_SET_HCSP2_MAXSCRATCH_LO(x) (((x) & 0x1F) << 27) + +/* hcsparams3 register */ +#define XHCI_SET_HCSP3_U1EXITLATENCY(x) ((x) & 0xFF) +#define XHCI_SET_HCSP3_U2EXITLATENCY(x) (((x) & 0xFFFF) << 16) + +/* hccparams1 register */ +#define XHCI_SET_HCCP1_AC64(x) ((x) & 0x01) +#define XHCI_SET_HCCP1_BNC(x) (((x) & 0x01) << 1) +#define XHCI_SET_HCCP1_CSZ(x) (((x) & 0x01) << 2) +#define XHCI_SET_HCCP1_PPC(x) (((x) & 0x01) << 3) +#define XHCI_SET_HCCP1_PIND(x) (((x) & 0x01) << 4) +#define XHCI_SET_HCCP1_LHRC(x) (((x) & 0x01) << 5) +#define XHCI_SET_HCCP1_LTC(x) (((x) & 0x01) << 6) +#define XHCI_SET_HCCP1_NSS(x) (((x) & 0x01) << 7) +#define XHCI_SET_HCCP1_PAE(x) (((x) & 0x01) << 8) +#define XHCI_SET_HCCP1_SPC(x) (((x) & 0x01) << 9) +#define XHCI_SET_HCCP1_SEC(x) (((x) & 0x01) << 10) +#define XHCI_SET_HCCP1_CFC(x) (((x) & 0x01) << 11) +#define XHCI_SET_HCCP1_MAXPSA(x) (((x) & 0x0F) << 12) +#define XHCI_SET_HCCP1_XECP(x) (((x) & 0xFFFF) << 16) + +/* hccparams2 register */ +#define XHCI_SET_HCCP2_U3C(x) ((x) & 0x01) +#define XHCI_SET_HCCP2_CMC(x) (((x) & 0x01) << 1) +#define XHCI_SET_HCCP2_FSC(x) (((x) & 0x01) << 2) +#define XHCI_SET_HCCP2_CTC(x) (((x) & 0x01) << 3) +#define XHCI_SET_HCCP2_LEC(x) (((x) & 0x01) << 4) +#define XHCI_SET_HCCP2_CIC(x) (((x) & 0x01) << 5) + +/* other registers */ +#define XHCI_SET_DOORBELL(x) ((x) & ~0x03) +#define XHCI_SET_RTSOFFSET(x) ((x) & ~0x0F) + +/* register masks */ +#define XHCI_PS_PLS_MASK (0xF << 5) /* port link state */ +#define XHCI_PS_SPEED_MASK (0xF << 10) /* port speed */ +#define XHCI_PS_PIC_MASK (0x3 << 14) /* port indicator */ + +/* port register set */ +#define XHCI_PORTREGS_BASE 0x400 /* base offset */ +#define XHCI_PORTREGS_PORT0 0x3F0 +#define XHCI_PORTREGS_SETSZ 0x10 /* size of a set */ + +#define MASK_64_HI(x) ((x) & ~0xFFFFFFFFULL) +#define MASK_64_LO(x) ((x) & 0xFFFFFFFFULL) + +#define FIELD_REPLACE(a,b,m,s) (((a) & ~((m) << (s))) | \ + (((b) & (m)) << (s))) +#define FIELD_COPY(a,b,m,s) (((a) & ~((m) << (s))) | \ + (((b) & ((m) << (s))))) + +struct pci_xhci_trb_ring { + uint64_t ringaddr; /* current dequeue guest address */ + uint32_t ccs; /* consumer cycle state */ +}; + +/* device endpoint transfer/stream rings */ +struct pci_xhci_dev_ep { + union { + struct xhci_trb *_epu_tr; + struct xhci_stream_ctx *_epu_sctx; + } _ep_trbsctx; +#define ep_tr _ep_trbsctx._epu_tr +#define ep_sctx _ep_trbsctx._epu_sctx + + union { + struct pci_xhci_trb_ring _epu_trb; + struct pci_xhci_trb_ring *_epu_sctx_trbs; + } _ep_trb_rings; +#define ep_ringaddr _ep_trb_rings._epu_trb.ringaddr +#define ep_ccs 
_ep_trb_rings._epu_trb.ccs +#define ep_sctx_trbs _ep_trb_rings._epu_sctx_trbs + + struct usb_data_xfer *ep_xfer; /* transfer chain */ +}; + +/* device context base address array: maps slot->device context */ +struct xhci_dcbaa { + uint64_t dcba[USB_MAX_DEVICES+1]; /* xhci_dev_ctx ptrs */ +}; + +/* port status registers */ +struct pci_xhci_portregs { + uint32_t portsc; /* port status and control */ + uint32_t portpmsc; /* port pwr mgmt status & control */ + uint32_t portli; /* port link info */ + uint32_t porthlpmc; /* port hardware LPM control */ +} __packed; +#define XHCI_PS_SPEED_SET(x) (((x) & 0xF) << 10) + +/* xHC operational registers */ +struct pci_xhci_opregs { + uint32_t usbcmd; /* usb command */ + uint32_t usbsts; /* usb status */ + uint32_t pgsz; /* page size */ + uint32_t dnctrl; /* device notification control */ + uint64_t crcr; /* command ring control */ + uint64_t dcbaap; /* device ctx base addr array ptr */ + uint32_t config; /* configure */ + + /* guest mapped addresses: */ + struct xhci_trb *cr_p; /* crcr dequeue */ + struct xhci_dcbaa *dcbaa_p; /* dev ctx array ptr */ +}; + +/* xHC runtime registers */ +struct pci_xhci_rtsregs { + uint32_t mfindex; /* microframe index */ + struct { /* interrupter register set */ + uint32_t iman; /* interrupter management */ + uint32_t imod; /* interrupter moderation */ + uint32_t erstsz; /* event ring segment table size */ + uint32_t rsvd; + uint64_t erstba; /* event ring seg-tbl base addr */ + uint64_t erdp; /* event ring dequeue ptr */ + } intrreg __packed; + + /* guest mapped addresses */ + struct xhci_event_ring_seg *erstba_p; + struct xhci_trb *erst_p; /* event ring segment tbl */ + int er_deq_seg; /* event ring dequeue segment */ + int er_enq_idx; /* event ring enqueue index - xHCI */ + int er_enq_seg; /* event ring enqueue segment */ + uint32_t er_events_cnt; /* number of events in ER */ + uint32_t event_pcs; /* producer cycle state flag */ +}; + + +struct pci_xhci_softc; + + +/* + * USB device emulation container. + * This is referenced from usb_hci->hci_sc; 1 pci_xhci_dev_emu for each + * emulated device instance. 
+ */ +struct pci_xhci_dev_emu { + struct pci_xhci_softc *xsc; + + /* XHCI contexts */ + struct xhci_dev_ctx *dev_ctx; + struct pci_xhci_dev_ep eps[XHCI_MAX_ENDPOINTS]; + int dev_slotstate; + + struct usb_devemu *dev_ue; /* USB emulated dev */ + void *dev_sc; /* device's softc */ + + struct usb_hci hci; +}; + +struct pci_xhci_softc { + struct pci_devinst *xsc_pi; + + pthread_mutex_t mtx; + + uint32_t caplength; /* caplen & hciversion */ + uint32_t hcsparams1; /* structural parameters 1 */ + uint32_t hcsparams2; /* structural parameters 2 */ + uint32_t hcsparams3; /* structural parameters 3 */ + uint32_t hccparams1; /* capability parameters 1 */ + uint32_t dboff; /* doorbell offset */ + uint32_t rtsoff; /* runtime register space offset */ + uint32_t hccparams2; /* capability parameters 2 */ + + uint32_t regsend; /* end of configuration registers */ + + struct pci_xhci_opregs opregs; + struct pci_xhci_rtsregs rtsregs; + + struct pci_xhci_portregs *portregs; + struct pci_xhci_dev_emu **devices; /* XHCI[port] = device */ + struct pci_xhci_dev_emu **slots; /* slots assigned from 1 */ + int ndevices; + + int usb2_port_start; + int usb3_port_start; +}; + + +/* portregs and devices arrays are set up to start from idx=1 */ +#define XHCI_PORTREG_PTR(x,n) &(x)->portregs[(n)] +#define XHCI_DEVINST_PTR(x,n) (x)->devices[(n)] +#define XHCI_SLOTDEV_PTR(x,n) (x)->slots[(n)] + +#define XHCI_HALTED(sc) ((sc)->opregs.usbsts & XHCI_STS_HCH) + +#define XHCI_GADDR(sc,a) paddr_guest2host((sc)->xsc_pi->pi_vmctx, \ + (a), \ + XHCI_PADDR_SZ - ((a) & (XHCI_PADDR_SZ-1))) + +static int xhci_in_use; + +/* map USB errors to XHCI */ +static const int xhci_usb_errors[USB_ERR_MAX] = { + [USB_ERR_NORMAL_COMPLETION] = XHCI_TRB_ERROR_SUCCESS, + [USB_ERR_PENDING_REQUESTS] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NOT_STARTED] = XHCI_TRB_ERROR_ENDP_NOT_ON, + [USB_ERR_INVAL] = XHCI_TRB_ERROR_INVALID, + [USB_ERR_NOMEM] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_CANCELLED] = XHCI_TRB_ERROR_STOPPED, + [USB_ERR_BAD_ADDRESS] = XHCI_TRB_ERROR_PARAMETER, + [USB_ERR_BAD_BUFSIZE] = XHCI_TRB_ERROR_PARAMETER, + [USB_ERR_BAD_FLAG] = XHCI_TRB_ERROR_PARAMETER, + [USB_ERR_NO_CALLBACK] = XHCI_TRB_ERROR_STALL, + [USB_ERR_IN_USE] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NO_ADDR] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NO_PIPE] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_ZERO_NFRAMES] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_ZERO_MAXP] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_SET_ADDR_FAILED] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_NO_POWER] = XHCI_TRB_ERROR_ENDP_NOT_ON, + [USB_ERR_TOO_DEEP] = XHCI_TRB_ERROR_RESOURCE, + [USB_ERR_IOERROR] = XHCI_TRB_ERROR_TRB, + [USB_ERR_NOT_CONFIGURED] = XHCI_TRB_ERROR_ENDP_NOT_ON, + [USB_ERR_TIMEOUT] = XHCI_TRB_ERROR_CMD_ABORTED, + [USB_ERR_SHORT_XFER] = XHCI_TRB_ERROR_SHORT_PKT, + [USB_ERR_STALLED] = XHCI_TRB_ERROR_STALL, + [USB_ERR_INTERRUPTED] = XHCI_TRB_ERROR_CMD_ABORTED, + [USB_ERR_DMA_LOAD_FAILED] = XHCI_TRB_ERROR_DATA_BUF, + [USB_ERR_BAD_CONTEXT] = XHCI_TRB_ERROR_TRB, + [USB_ERR_NO_ROOT_HUB] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_NO_INTR_THREAD] = XHCI_TRB_ERROR_UNDEFINED, + [USB_ERR_NOT_LOCKED] = XHCI_TRB_ERROR_UNDEFINED, +}; +#define USB_TO_XHCI_ERR(e) ((e) < USB_ERR_MAX ? 
xhci_usb_errors[(e)] : \ + XHCI_TRB_ERROR_INVALID) + +static int pci_xhci_insert_event(struct pci_xhci_softc *sc, + struct xhci_trb *evtrb, int do_intr); +static void pci_xhci_dump_trb(struct xhci_trb *trb); +static void pci_xhci_assert_interrupt(struct pci_xhci_softc *sc); +static void pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot); +static void pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm); +static void pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, + struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, + struct xhci_endp_ctx *ep_ctx, uint32_t streamid, + uint64_t ringaddr, int ccs); + +static void +pci_xhci_set_evtrb(struct xhci_trb *evtrb, uint64_t port, uint32_t errcode, + uint32_t evtype) +{ + evtrb->qwTrb0 = port << 24; + evtrb->dwTrb2 = XHCI_TRB_2_ERROR_SET(errcode); + evtrb->dwTrb3 = XHCI_TRB_3_TYPE_SET(evtype); +} + + +/* controller reset */ +static void +pci_xhci_reset(struct pci_xhci_softc *sc) +{ + int i; + + sc->rtsregs.er_enq_idx = 0; + sc->rtsregs.er_events_cnt = 0; + sc->rtsregs.event_pcs = 1; + + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + pci_xhci_reset_slot(sc, i); + } +} + +static uint32_t +pci_xhci_usbcmd_write(struct pci_xhci_softc *sc, uint32_t cmd) +{ + int do_intr = 0; + int i; + + if (cmd & XHCI_CMD_RS) { + do_intr = (sc->opregs.usbcmd & XHCI_CMD_RS) == 0; + + sc->opregs.usbcmd |= XHCI_CMD_RS; + sc->opregs.usbsts &= ~XHCI_STS_HCH; + sc->opregs.usbsts |= XHCI_STS_PCD; + + /* Queue port change event on controller run from stop */ + if (do_intr) + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + struct pci_xhci_dev_emu *dev; + struct pci_xhci_portregs *port; + struct xhci_trb evtrb; + + if ((dev = XHCI_DEVINST_PTR(sc, i)) == NULL) + continue; + + port = XHCI_PORTREG_PTR(sc, i); + port->portsc |= XHCI_PS_CSC | XHCI_PS_CCS; + port->portsc &= ~XHCI_PS_PLS_MASK; + + /* + * XHCI 4.19.3 USB2 RxDetect->Polling, + * USB3 Polling->U0 + */ + if (dev->dev_ue->ue_usbver == 2) + port->portsc |= + XHCI_PS_PLS_SET(UPS_PORT_LS_POLL); + else + port->portsc |= + XHCI_PS_PLS_SET(UPS_PORT_LS_U0); + + pci_xhci_set_evtrb(&evtrb, i, + XHCI_TRB_ERROR_SUCCESS, + XHCI_TRB_EVENT_PORT_STS_CHANGE); + + if (pci_xhci_insert_event(sc, &evtrb, 0) != + XHCI_TRB_ERROR_SUCCESS) + break; + } + } else { + sc->opregs.usbcmd &= ~XHCI_CMD_RS; + sc->opregs.usbsts |= XHCI_STS_HCH; + sc->opregs.usbsts &= ~XHCI_STS_PCD; + } + + /* start execution of schedule; stop when set to 0 */ + cmd |= sc->opregs.usbcmd & XHCI_CMD_RS; + + if (cmd & XHCI_CMD_HCRST) { + /* reset controller */ + pci_xhci_reset(sc); + cmd &= ~XHCI_CMD_HCRST; + } + + cmd &= ~(XHCI_CMD_CSS | XHCI_CMD_CRS); + + if (do_intr) + pci_xhci_assert_interrupt(sc); + + return (cmd); +} + +static void +pci_xhci_portregs_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + struct xhci_trb evtrb; + struct pci_xhci_portregs *p; + int port; + uint32_t oldpls, newpls; + + if (sc->portregs == NULL) + return; + + port = (offset - XHCI_PORTREGS_PORT0) / XHCI_PORTREGS_SETSZ; + offset = (offset - XHCI_PORTREGS_PORT0) % XHCI_PORTREGS_SETSZ; + + DPRINTF(("pci_xhci: portregs wr offset 0x%lx, port %u: 0x%lx\r\n", + offset, port, value)); + + assert(port >= 0); + + if (port > XHCI_MAX_DEVS) { + DPRINTF(("pci_xhci: portregs_write port %d > ndevices\r\n", + port)); + return; + } + + if (XHCI_DEVINST_PTR(sc, port) == NULL) { + DPRINTF(("pci_xhci: portregs_write to unattached port %d\r\n", + port)); + } + + p = XHCI_PORTREG_PTR(sc, port); + switch (offset) { + case 0: + /* port reset or warm reset */ + if (value & 
(XHCI_PS_PR | XHCI_PS_WPR)) { + pci_xhci_reset_port(sc, port, value & XHCI_PS_WPR); + break; + } + + if ((p->portsc & XHCI_PS_PP) == 0) { + WPRINTF(("pci_xhci: portregs_write to unpowered " + "port %d\r\n", port)); + break; + } + + /* Port status and control register */ + oldpls = XHCI_PS_PLS_GET(p->portsc); + newpls = XHCI_PS_PLS_GET(value); + + p->portsc &= XHCI_PS_PED | XHCI_PS_PLS_MASK | + XHCI_PS_SPEED_MASK | XHCI_PS_PIC_MASK; + + if (XHCI_DEVINST_PTR(sc, port)) + p->portsc |= XHCI_PS_CCS; + + p->portsc |= (value & + ~(XHCI_PS_OCA | + XHCI_PS_PR | + XHCI_PS_PED | + XHCI_PS_PLS_MASK | /* link state */ + XHCI_PS_SPEED_MASK | + XHCI_PS_PIC_MASK | /* port indicator */ + XHCI_PS_LWS | XHCI_PS_DR | XHCI_PS_WPR)); + + /* clear control bits */ + p->portsc &= ~(value & + (XHCI_PS_CSC | + XHCI_PS_PEC | + XHCI_PS_WRC | + XHCI_PS_OCC | + XHCI_PS_PRC | + XHCI_PS_PLC | + XHCI_PS_CEC | + XHCI_PS_CAS)); + + /* port disable request; for USB3, don't care */ + if (value & XHCI_PS_PED) + DPRINTF(("Disable port %d request\r\n", port)); + + if (!(value & XHCI_PS_LWS)) + break; + + DPRINTF(("Port new PLS: %d\r\n", newpls)); + switch (newpls) { + case 0: /* U0 */ + case 3: /* U3 */ + if (oldpls != newpls) { + p->portsc &= ~XHCI_PS_PLS_MASK; + p->portsc |= XHCI_PS_PLS_SET(newpls) | + XHCI_PS_PLC; + + if (oldpls != 0 && newpls == 0) { + pci_xhci_set_evtrb(&evtrb, port, + XHCI_TRB_ERROR_SUCCESS, + XHCI_TRB_EVENT_PORT_STS_CHANGE); + + pci_xhci_insert_event(sc, &evtrb, 1); + } + } + break; + + default: + DPRINTF(("Unhandled change port %d PLS %u\r\n", + port, newpls)); + break; + } + break; + case 4: + /* Port power management status and control register */ + p->portpmsc = value; + break; + case 8: + /* Port link information register */ + DPRINTF(("pci_xhci attempted write to PORTLI, port %d\r\n", + port)); + break; + case 12: + /* + * Port hardware LPM control register. + * For USB3, this register is reserved. 
+ */ + p->porthlpmc = value; + break; + } +} + +struct xhci_dev_ctx * +pci_xhci_get_dev_ctx(struct pci_xhci_softc *sc, uint32_t slot) +{ + uint64_t devctx_addr; + struct xhci_dev_ctx *devctx; + + assert(slot > 0 && slot <= sc->ndevices); + assert(sc->opregs.dcbaa_p != NULL); + + devctx_addr = sc->opregs.dcbaa_p->dcba[slot]; + + if (devctx_addr == 0) { + DPRINTF(("get_dev_ctx devctx_addr == 0\r\n")); + return (NULL); + } + + DPRINTF(("pci_xhci: get dev ctx, slot %u devctx addr %016lx\r\n", + slot, devctx_addr)); + devctx = XHCI_GADDR(sc, devctx_addr & ~0x3FUL); + + return (devctx); +} + +struct xhci_trb * +pci_xhci_trb_next(struct pci_xhci_softc *sc, struct xhci_trb *curtrb, + uint64_t *guestaddr) +{ + struct xhci_trb *next; + + assert(curtrb != NULL); + + if (XHCI_TRB_3_TYPE_GET(curtrb->dwTrb3) == XHCI_TRB_TYPE_LINK) { + if (guestaddr) + *guestaddr = curtrb->qwTrb0 & ~0xFUL; + + next = XHCI_GADDR(sc, curtrb->qwTrb0 & ~0xFUL); + } else { + if (guestaddr) + *guestaddr += sizeof(struct xhci_trb) & ~0xFUL; + + next = curtrb + 1; + } + + return (next); +} + +static void +pci_xhci_assert_interrupt(struct pci_xhci_softc *sc) +{ + + sc->rtsregs.intrreg.erdp |= XHCI_ERDP_LO_BUSY; + sc->rtsregs.intrreg.iman |= XHCI_IMAN_INTR_PEND; + sc->opregs.usbsts |= XHCI_STS_EINT; + + /* only trigger interrupt if permitted */ + if ((sc->opregs.usbcmd & XHCI_CMD_INTE) && + (sc->rtsregs.intrreg.iman & XHCI_IMAN_INTR_ENA)) { + if (pci_msi_enabled(sc->xsc_pi)) + pci_generate_msi(sc->xsc_pi, 0); + else + pci_lintr_assert(sc->xsc_pi); + } +} + +static void +pci_xhci_deassert_interrupt(struct pci_xhci_softc *sc) +{ + + if (!pci_msi_enabled(sc->xsc_pi)) + pci_lintr_assert(sc->xsc_pi); +} + +static void +pci_xhci_init_ep(struct pci_xhci_dev_emu *dev, int epid) +{ + struct xhci_dev_ctx *dev_ctx; + struct pci_xhci_dev_ep *devep; + struct xhci_endp_ctx *ep_ctx; + uint32_t pstreams; + int i; + + dev_ctx = dev->dev_ctx; + ep_ctx = &dev_ctx->ctx_ep[epid]; + devep = &dev->eps[epid]; + pstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0); + if (pstreams > 0) { + DPRINTF(("init_ep %d with pstreams %d\r\n", epid, pstreams)); + assert(devep->ep_sctx_trbs == NULL); + + devep->ep_sctx = XHCI_GADDR(dev->xsc, ep_ctx->qwEpCtx2 & + XHCI_EPCTX_2_TR_DQ_PTR_MASK); + devep->ep_sctx_trbs = calloc(pstreams, + sizeof(struct pci_xhci_trb_ring)); + for (i = 0; i < pstreams; i++) { + devep->ep_sctx_trbs[i].ringaddr = + devep->ep_sctx[i].qwSctx0 & + XHCI_SCTX_0_TR_DQ_PTR_MASK; + devep->ep_sctx_trbs[i].ccs = + XHCI_SCTX_0_DCS_GET(devep->ep_sctx[i].qwSctx0); + } + } else { + DPRINTF(("init_ep %d with no pstreams\r\n", epid)); + devep->ep_ringaddr = ep_ctx->qwEpCtx2 & + XHCI_EPCTX_2_TR_DQ_PTR_MASK; + devep->ep_ccs = XHCI_EPCTX_2_DCS_GET(ep_ctx->qwEpCtx2); + devep->ep_tr = XHCI_GADDR(dev->xsc, devep->ep_ringaddr); + DPRINTF(("init_ep tr DCS %x\r\n", devep->ep_ccs)); + } + + if (devep->ep_xfer == NULL) { + devep->ep_xfer = malloc(sizeof(struct usb_data_xfer)); + USB_DATA_XFER_INIT(devep->ep_xfer); + } +} + +static void +pci_xhci_disable_ep(struct pci_xhci_dev_emu *dev, int epid) +{ + struct xhci_dev_ctx *dev_ctx; + struct pci_xhci_dev_ep *devep; + struct xhci_endp_ctx *ep_ctx; + + DPRINTF(("pci_xhci disable_ep %d\r\n", epid)); + + dev_ctx = dev->dev_ctx; + ep_ctx = &dev_ctx->ctx_ep[epid]; + ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_DISABLED; + + devep = &dev->eps[epid]; + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0 && + devep->ep_sctx_trbs != NULL) + free(devep->ep_sctx_trbs); + + if (devep->ep_xfer != NULL) { + 
free(devep->ep_xfer); + devep->ep_xfer = NULL; + } + + memset(devep, 0, sizeof(struct pci_xhci_dev_ep)); +} + + +/* reset device at slot and data structures related to it */ +static void +pci_xhci_reset_slot(struct pci_xhci_softc *sc, int slot) +{ + struct pci_xhci_dev_emu *dev; + + dev = XHCI_SLOTDEV_PTR(sc, slot); + + if (!dev) { + DPRINTF(("xhci reset unassigned slot (%d)?\r\n", slot)); + } else { + dev->dev_slotstate = XHCI_ST_DISABLED; + } + + /* TODO: reset ring buffer pointers */ +} + +static int +pci_xhci_insert_event(struct pci_xhci_softc *sc, struct xhci_trb *evtrb, + int do_intr) +{ + struct pci_xhci_rtsregs *rts; + uint64_t erdp; + int erdp_idx; + int err; + struct xhci_trb *evtrbptr; + + err = XHCI_TRB_ERROR_SUCCESS; + + rts = &sc->rtsregs; + + erdp = rts->intrreg.erdp & ~0xF; + erdp_idx = (erdp - rts->erstba_p[rts->er_deq_seg].qwEvrsTablePtr) / + sizeof(struct xhci_trb); + + DPRINTF(("pci_xhci: insert event 0[%lx] 2[%x] 3[%x]\r\n" + "\terdp idx %d/seg %d, enq idx %d/seg %d, pcs %u\r\n" + "\t(erdp=0x%lx, erst=0x%lx, tblsz=%u, do_intr %d)\r\n", + evtrb->qwTrb0, evtrb->dwTrb2, evtrb->dwTrb3, + erdp_idx, rts->er_deq_seg, rts->er_enq_idx, + rts->er_enq_seg, + rts->event_pcs, erdp, rts->erstba_p->qwEvrsTablePtr, + rts->erstba_p->dwEvrsTableSize, do_intr)); + + evtrbptr = &rts->erst_p[rts->er_enq_idx]; + + /* TODO: multi-segment table */ + if (rts->er_events_cnt >= rts->erstba_p->dwEvrsTableSize) { + DPRINTF(("pci_xhci[%d] cannot insert event; ring full\r\n", + __LINE__)); + err = XHCI_TRB_ERROR_EV_RING_FULL; + goto done; + } + + if (rts->er_events_cnt == rts->erstba_p->dwEvrsTableSize - 1) { + struct xhci_trb errev; + + if ((evtrbptr->dwTrb3 & 0x1) == (rts->event_pcs & 0x1)) { + + DPRINTF(("pci_xhci[%d] insert evt err: ring full\r\n", + __LINE__)); + + errev.qwTrb0 = 0; + errev.dwTrb2 = XHCI_TRB_2_ERROR_SET( + XHCI_TRB_ERROR_EV_RING_FULL); + errev.dwTrb3 = XHCI_TRB_3_TYPE_SET( + XHCI_TRB_EVENT_HOST_CTRL) | + rts->event_pcs; + rts->er_events_cnt++; + memcpy(&rts->erst_p[rts->er_enq_idx], &errev, + sizeof(struct xhci_trb)); + rts->er_enq_idx = (rts->er_enq_idx + 1) % + rts->erstba_p->dwEvrsTableSize; + err = XHCI_TRB_ERROR_EV_RING_FULL; + do_intr = 1; + + goto done; + } + } else { + rts->er_events_cnt++; + } + + evtrb->dwTrb3 &= ~XHCI_TRB_3_CYCLE_BIT; + evtrb->dwTrb3 |= rts->event_pcs; + + memcpy(&rts->erst_p[rts->er_enq_idx], evtrb, sizeof(struct xhci_trb)); + rts->er_enq_idx = (rts->er_enq_idx + 1) % + rts->erstba_p->dwEvrsTableSize; + + if (rts->er_enq_idx == 0) + rts->event_pcs ^= 1; + +done: + if (do_intr) + pci_xhci_assert_interrupt(sc); + + return (err); +} + +static uint32_t +pci_xhci_cmd_enable_slot(struct pci_xhci_softc *sc, uint32_t *slot) +{ + struct pci_xhci_dev_emu *dev; + uint32_t cmderr; + int i; + + cmderr = XHCI_TRB_ERROR_NO_SLOTS; + if (sc->portregs != NULL) + for (i = 1; i <= XHCI_MAX_SLOTS; i++) { + dev = XHCI_SLOTDEV_PTR(sc, i); + if (dev && dev->dev_slotstate == XHCI_ST_DISABLED) { + *slot = i; + dev->dev_slotstate = XHCI_ST_ENABLED; + cmderr = XHCI_TRB_ERROR_SUCCESS; + dev->hci.hci_address = i; + break; + } + } + + DPRINTF(("pci_xhci enable slot (error=%d) slot %u\r\n", + cmderr != XHCI_TRB_ERROR_SUCCESS, *slot)); + + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_disable_slot(struct pci_xhci_softc *sc, uint32_t slot) +{ + struct pci_xhci_dev_emu *dev; + uint32_t cmderr; + + DPRINTF(("pci_xhci disable slot %u\r\n", slot)); + + cmderr = XHCI_TRB_ERROR_NO_SLOTS; + if (sc->portregs == NULL) + goto done; + + if (slot > sc->ndevices) { + cmderr = 
XHCI_TRB_ERROR_SLOT_NOT_ON; + goto done; + } + + dev = XHCI_SLOTDEV_PTR(sc, slot); + if (dev) { + if (dev->dev_slotstate == XHCI_ST_DISABLED) { + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; + } else { + dev->dev_slotstate = XHCI_ST_DISABLED; + cmderr = XHCI_TRB_ERROR_SUCCESS; + /* TODO: reset events and endpoints */ + } + } + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_reset_device(struct pci_xhci_softc *sc, uint32_t slot) +{ + struct pci_xhci_dev_emu *dev; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + uint32_t cmderr; + int i; + + cmderr = XHCI_TRB_ERROR_NO_SLOTS; + if (sc->portregs == NULL) + goto done; + + DPRINTF(("pci_xhci reset device slot %u\r\n", slot)); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + if (!dev || dev->dev_slotstate == XHCI_ST_DISABLED) + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; + else { + dev->dev_slotstate = XHCI_ST_DEFAULT; + + dev->hci.hci_address = 0; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + /* slot state */ + dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_DEFAULT, + 0x1F, 27); + + /* number of contexts */ + dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); + + /* reset all eps other than ep-0 */ + for (i = 2; i <= 31; i++) { + ep_ctx = &dev_ctx->ctx_ep[i]; + ep_ctx->dwEpCtx0 = FIELD_REPLACE( ep_ctx->dwEpCtx0, + XHCI_ST_EPCTX_DISABLED, 0x7, 0); + } + + cmderr = XHCI_TRB_ERROR_SUCCESS; + } + + pci_xhci_reset_slot(sc, slot); + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_address_device(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct pci_xhci_dev_emu *dev; + struct xhci_input_dev_ctx *input_ctx; + struct xhci_slot_ctx *islot_ctx; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep0_ctx; + uint32_t cmderr; + + input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); + islot_ctx = &input_ctx->ctx_slot; + ep0_ctx = &input_ctx->ctx_ep[1]; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + DPRINTF(("pci_xhci: address device, input ctl: D 0x%08x A 0x%08x,\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, + islot_ctx->dwSctx0, islot_ctx->dwSctx1, + islot_ctx->dwSctx2, islot_ctx->dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + + /* when setting address: drop-ctx=0, add-ctx=slot+ep0 */ + if ((input_ctx->ctx_input.dwInCtx0 != 0) || + (input_ctx->ctx_input.dwInCtx1 & 0x03) != 0x03) { + DPRINTF(("pci_xhci: address device, input ctl invalid\r\n")); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + /* assign address to slot */ + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + DPRINTF(("pci_xhci: address device, dev ctx\r\n" + " slot %08x %08x %08x %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + dev->hci.hci_address = slot; + dev->dev_ctx = dev_ctx; + + if (dev->dev_ue->ue_reset == NULL || + dev->dev_ue->ue_reset(dev->dev_sc) < 0) { + cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; + goto done; + } + + memcpy(&dev_ctx->ctx_slot, islot_ctx, sizeof(struct xhci_slot_ctx)); + + dev_ctx->ctx_slot.dwSctx3 = + XHCI_SCTX_3_SLOT_STATE_SET(XHCI_ST_SLCTX_ADDRESSED) | + XHCI_SCTX_3_DEV_ADDR_SET(slot); + + memcpy(&dev_ctx->ctx_ep[1], ep0_ctx, sizeof(struct xhci_endp_ctx)); + ep0_ctx = &dev_ctx->ctx_ep[1]; + ep0_ctx->dwEpCtx0 = (ep0_ctx->dwEpCtx0 & ~0x7) | + 
XHCI_EPCTX_0_EPSTATE_SET(XHCI_ST_EPCTX_RUNNING); + + pci_xhci_init_ep(dev, 1); + + dev->dev_slotstate = XHCI_ST_ADDRESSED; + + DPRINTF(("pci_xhci: address device, output ctx\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_config_ep(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct xhci_input_dev_ctx *input_ctx; + struct pci_xhci_dev_emu *dev; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx, *iep_ctx; + uint32_t cmderr; + int i; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + DPRINTF(("pci_xhci config_ep slot %u\r\n", slot)); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + if ((trb->dwTrb3 & XHCI_TRB_3_DCEP_BIT) != 0) { + DPRINTF(("pci_xhci config_ep - deconfigure ep slot %u\r\n", + slot)); + if (dev->dev_ue->ue_stop != NULL) + dev->dev_ue->ue_stop(dev->dev_sc); + + dev->dev_slotstate = XHCI_ST_ADDRESSED; + + dev->hci.hci_address = 0; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + /* number of contexts */ + dev_ctx->ctx_slot.dwSctx0 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx0, 1, 0x1F, 27); + + /* slot state */ + dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_ADDRESSED, + 0x1F, 27); + + /* disable endpoints */ + for (i = 2; i < 32; i++) + pci_xhci_disable_ep(dev, i); + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + goto done; + } + + if (dev->dev_slotstate < XHCI_ST_ADDRESSED) { + DPRINTF(("pci_xhci: config_ep slotstate x%x != addressed\r\n", + dev->dev_slotstate)); + cmderr = XHCI_TRB_ERROR_SLOT_NOT_ON; + goto done; + } + + /* In addressed/configured state; + * for each drop endpoint ctx flag: + * ep->state = DISABLED + * for each add endpoint ctx flag: + * cp(ep-in, ep-out) + * ep->state = RUNNING + * for each drop+add endpoint flag: + * reset ep resources + * cp(ep-in, ep-out) + * ep->state = RUNNING + * if input->DisabledCtx[2-31] < 30: (at least 1 ep not disabled) + * slot->state = configured + */ + + input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); + dev_ctx = dev->dev_ctx; + DPRINTF(("pci_xhci: config_ep inputctx: D:x%08x A:x%08x 7:x%08x\r\n", + input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, + input_ctx->ctx_input.dwInCtx7)); + + for (i = 2; i <= 31; i++) { + ep_ctx = &dev_ctx->ctx_ep[i]; + + if (input_ctx->ctx_input.dwInCtx0 & + XHCI_INCTX_0_DROP_MASK(i)) { + DPRINTF((" config ep - dropping ep %d\r\n", i)); + pci_xhci_disable_ep(dev, i); + } + + if (input_ctx->ctx_input.dwInCtx1 & + XHCI_INCTX_1_ADD_MASK(i)) { + iep_ctx = &input_ctx->ctx_ep[i]; + + DPRINTF((" enable ep[%d] %08x %08x %016lx %08x\r\n", + i, iep_ctx->dwEpCtx0, iep_ctx->dwEpCtx1, + iep_ctx->qwEpCtx2, iep_ctx->dwEpCtx4)); + + memcpy(ep_ctx, iep_ctx, sizeof(struct xhci_endp_ctx)); + + pci_xhci_init_ep(dev, i); + + /* ep state */ + ep_ctx->dwEpCtx0 = FIELD_REPLACE( + ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); + } + } + + /* slot state to configured */ + dev_ctx->ctx_slot.dwSctx3 = FIELD_REPLACE( + dev_ctx->ctx_slot.dwSctx3, XHCI_ST_SLCTX_CONFIGURED, 0x1F, 27); + dev_ctx->ctx_slot.dwSctx0 = FIELD_COPY( + dev_ctx->ctx_slot.dwSctx0, input_ctx->ctx_slot.dwSctx0, 0x1F, 27); + dev->dev_slotstate = XHCI_ST_CONFIGURED; + + DPRINTF(("EP configured; slot %u [0]=0x%08x [1]=0x%08x [2]=0x%08x " + "[3]=0x%08x\r\n", + slot, dev_ctx->ctx_slot.dwSctx0, 
dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_reset_ep(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + uint32_t cmderr, epid; + uint32_t type; + + epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); + + DPRINTF(("pci_xhci: reset ep %u: slot %u\r\n", epid, slot)); + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + if (type == XHCI_TRB_TYPE_STOP_EP && + (trb->dwTrb3 & XHCI_TRB_3_SUSP_EP_BIT) != 0) { + /* XXX suspend endpoint for 10ms */ + } + + if (epid < 1 || epid > 31) { + DPRINTF(("pci_xhci: reset ep: invalid epid %u\r\n", epid)); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + devep = &dev->eps[epid]; + if (devep->ep_xfer != NULL) + USB_DATA_XFER_RESET(devep->ep_xfer); + + dev_ctx = dev->dev_ctx; + assert(dev_ctx != NULL); + + ep_ctx = &dev_ctx->ctx_ep[epid]; + + ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; + + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) == 0) + ep_ctx->qwEpCtx2 = devep->ep_ringaddr | devep->ep_ccs; + + DPRINTF(("pci_xhci: reset ep[%u] %08x %08x %016lx %08x\r\n", + epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, + ep_ctx->dwEpCtx4)); + + if (type == XHCI_TRB_TYPE_RESET_EP && + (dev->dev_ue->ue_reset == NULL || + dev->dev_ue->ue_reset(dev->dev_sc) < 0)) { + cmderr = XHCI_TRB_ERROR_ENDP_NOT_ON; + goto done; + } + +done: + return (cmderr); +} + + +static uint32_t +pci_xhci_find_stream(struct pci_xhci_softc *sc, struct xhci_endp_ctx *ep, + uint32_t streamid, struct xhci_stream_ctx **osctx) +{ + struct xhci_stream_ctx *sctx; + uint32_t maxpstreams; + + maxpstreams = XHCI_EPCTX_0_MAXP_STREAMS_GET(ep->dwEpCtx0); + if (maxpstreams == 0) + return (XHCI_TRB_ERROR_TRB); + + if (maxpstreams > XHCI_STREAMS_MAX) + return (XHCI_TRB_ERROR_INVALID_SID); + + if (XHCI_EPCTX_0_LSA_GET(ep->dwEpCtx0) == 0) { + DPRINTF(("pci_xhci: find_stream; LSA bit not set\r\n")); + return (XHCI_TRB_ERROR_INVALID_SID); + } + + /* only support primary stream */ + if (streamid > maxpstreams) + return (XHCI_TRB_ERROR_STREAM_TYPE); + + sctx = XHCI_GADDR(sc, ep->qwEpCtx2 & ~0xFUL) + streamid; + if (!XHCI_SCTX_0_SCT_GET(sctx->qwSctx0)) + return (XHCI_TRB_ERROR_STREAM_TYPE); + + *osctx = sctx; + + return (XHCI_TRB_ERROR_SUCCESS); +} + + +static uint32_t +pci_xhci_cmd_set_tr(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + uint32_t cmderr, epid; + uint32_t streamid; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + + dev = XHCI_SLOTDEV_PTR(sc, slot); + assert(dev != NULL); + + DPRINTF(("pci_xhci set_tr: new-tr x%016lx, SCT %u DCS %u\r\n" + " stream-id %u, slot %u, epid %u, C %u\r\n", + (trb->qwTrb0 & ~0xF), (uint32_t)((trb->qwTrb0 >> 1) & 0x7), + (uint32_t)(trb->qwTrb0 & 0x1), (trb->dwTrb2 >> 16) & 0xFFFF, + XHCI_TRB_3_SLOT_GET(trb->dwTrb3), + XHCI_TRB_3_EP_GET(trb->dwTrb3), trb->dwTrb3 & 0x1)); + + epid = XHCI_TRB_3_EP_GET(trb->dwTrb3); + if (epid < 1 || epid > 31) { + DPRINTF(("pci_xhci: set_tr_deq: invalid epid %u\r\n", epid)); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + dev_ctx = dev->dev_ctx; + assert(dev_ctx != NULL); + + ep_ctx = &dev_ctx->ctx_ep[epid]; + devep = &dev->eps[epid]; + + 
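	/*
	 * Editor's note -- annotation added for review, not part of the
	 * original commit: Set TR Dequeue Pointer is only legal while the
	 * endpoint is Stopped or in the Error state (xHCI 1.x sec. 4.6.10);
	 * the switch below enforces that and completes the command with a
	 * Context State Error otherwise.  The command TRB packs its payload
	 * into qwTrb0 as
	 *
	 *   bit  0     DCS - dequeue cycle state
	 *   bits 3:1   SCT - stream context type
	 *   bits 63:4  new dequeue pointer (16-byte aligned)
	 *
	 * so a hypothetical qwTrb0 of 0x100001 names a transfer ring at
	 * guest-physical 0x100000 with a starting cycle state of 1.
	 */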
switch (XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0)) { + case XHCI_ST_EPCTX_STOPPED: + case XHCI_ST_EPCTX_ERROR: + break; + default: + DPRINTF(("pci_xhci cmd set_tr invalid state %x\r\n", + XHCI_EPCTX_0_EPSTATE_GET(ep_ctx->dwEpCtx0))); + cmderr = XHCI_TRB_ERROR_CONTEXT_STATE; + goto done; + } + + streamid = XHCI_TRB_2_STREAM_GET(trb->dwTrb2); + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) > 0) { + struct xhci_stream_ctx *sctx; + + sctx = NULL; + cmderr = pci_xhci_find_stream(sc, ep_ctx, streamid, &sctx); + if (sctx != NULL) { + assert(devep->ep_sctx != NULL); + + devep->ep_sctx[streamid].qwSctx0 = trb->qwTrb0; + devep->ep_sctx_trbs[streamid].ringaddr = + trb->qwTrb0 & ~0xF; + devep->ep_sctx_trbs[streamid].ccs = + XHCI_EPCTX_2_DCS_GET(trb->qwTrb0); + } + } else { + if (streamid != 0) { + DPRINTF(("pci_xhci cmd set_tr streamid %x != 0\r\n", + streamid)); + } + ep_ctx->qwEpCtx2 = trb->qwTrb0 & ~0xFUL; + devep->ep_ringaddr = ep_ctx->qwEpCtx2 & ~0xFUL; + devep->ep_ccs = trb->qwTrb0 & 0x1; + devep->ep_tr = XHCI_GADDR(sc, devep->ep_ringaddr); + + DPRINTF(("pci_xhci set_tr first TRB:\r\n")); + pci_xhci_dump_trb(devep->ep_tr); + } + ep_ctx->dwEpCtx0 = (ep_ctx->dwEpCtx0 & ~0x7) | XHCI_ST_EPCTX_STOPPED; + +done: + return (cmderr); +} + +static uint32_t +pci_xhci_cmd_eval_ctx(struct pci_xhci_softc *sc, uint32_t slot, + struct xhci_trb *trb) +{ + struct xhci_input_dev_ctx *input_ctx; + struct xhci_slot_ctx *islot_ctx; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep0_ctx; + uint32_t cmderr; + + input_ctx = XHCI_GADDR(sc, trb->qwTrb0 & ~0xFUL); + islot_ctx = &input_ctx->ctx_slot; + ep0_ctx = &input_ctx->ctx_ep[1]; + + cmderr = XHCI_TRB_ERROR_SUCCESS; + DPRINTF(("pci_xhci: eval ctx, input ctl: D 0x%08x A 0x%08x,\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + input_ctx->ctx_input.dwInCtx0, input_ctx->ctx_input.dwInCtx1, + islot_ctx->dwSctx0, islot_ctx->dwSctx1, + islot_ctx->dwSctx2, islot_ctx->dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + + /* this command expects drop-ctx=0 & add-ctx=slot+ep0 */ + if ((input_ctx->ctx_input.dwInCtx0 != 0) || + (input_ctx->ctx_input.dwInCtx1 & 0x03) == 0) { + DPRINTF(("pci_xhci: eval ctx, input ctl invalid\r\n")); + cmderr = XHCI_TRB_ERROR_TRB; + goto done; + } + + /* assign address to slot; in this emulation, slot_id = address */ + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + DPRINTF(("pci_xhci: eval ctx, dev ctx\r\n" + " slot %08x %08x %08x %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3)); + + if (input_ctx->ctx_input.dwInCtx1 & 0x01) { /* slot ctx */ + /* set max exit latency */ + dev_ctx->ctx_slot.dwSctx1 = FIELD_COPY( + dev_ctx->ctx_slot.dwSctx1, input_ctx->ctx_slot.dwSctx1, + 0xFFFF, 0); + + /* set interrupter target */ + dev_ctx->ctx_slot.dwSctx2 = FIELD_COPY( + dev_ctx->ctx_slot.dwSctx2, input_ctx->ctx_slot.dwSctx2, + 0x3FF, 22); + } + if (input_ctx->ctx_input.dwInCtx1 & 0x02) { /* control ctx */ + /* set max packet size */ + dev_ctx->ctx_ep[1].dwEpCtx1 = FIELD_COPY( + dev_ctx->ctx_ep[1].dwEpCtx1, ep0_ctx->dwEpCtx1, + 0xFFFF, 16); + + ep0_ctx = &dev_ctx->ctx_ep[1]; + } + + DPRINTF(("pci_xhci: eval ctx, output ctx\r\n" + " slot %08x %08x %08x %08x\r\n" + " ep0 %08x %08x %016lx %08x\r\n", + dev_ctx->ctx_slot.dwSctx0, dev_ctx->ctx_slot.dwSctx1, + dev_ctx->ctx_slot.dwSctx2, dev_ctx->ctx_slot.dwSctx3, + ep0_ctx->dwEpCtx0, ep0_ctx->dwEpCtx1, ep0_ctx->qwEpCtx2, + ep0_ctx->dwEpCtx4)); + +done: + return 
(cmderr); +} + +static int +pci_xhci_complete_commands(struct pci_xhci_softc *sc) +{ + struct xhci_trb evtrb; + struct xhci_trb *trb; + uint64_t crcr; + uint32_t ccs; /* cycle state (XHCI 4.9.2) */ + uint32_t type; + uint32_t slot; + uint32_t cmderr; + int error; + + error = 0; + sc->opregs.crcr |= XHCI_CRCR_LO_CRR; + + trb = sc->opregs.cr_p; + ccs = sc->opregs.crcr & XHCI_CRCR_LO_RCS; + crcr = sc->opregs.crcr & ~0xF; + + while (1) { + sc->opregs.cr_p = trb; + + type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); + + if ((trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT) != + (ccs & XHCI_TRB_3_CYCLE_BIT)) + break; + + DPRINTF(("pci_xhci: cmd type 0x%x, Trb0 x%016lx dwTrb2 x%08x" + " dwTrb3 x%08x, TRB_CYCLE %u/ccs %u\r\n", + type, trb->qwTrb0, trb->dwTrb2, trb->dwTrb3, + trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT, ccs)); + + cmderr = XHCI_TRB_ERROR_SUCCESS; + evtrb.dwTrb2 = 0; + evtrb.dwTrb3 = (ccs & XHCI_TRB_3_CYCLE_BIT) | + XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_CMD_COMPLETE); + slot = 0; + + switch (type) { + case XHCI_TRB_TYPE_LINK: /* 0x06 */ + if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) + ccs ^= XHCI_CRCR_LO_RCS; + break; + + case XHCI_TRB_TYPE_ENABLE_SLOT: /* 0x09 */ + cmderr = pci_xhci_cmd_enable_slot(sc, &slot); + break; + + case XHCI_TRB_TYPE_DISABLE_SLOT: /* 0x0A */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_disable_slot(sc, slot); + break; + + case XHCI_TRB_TYPE_ADDRESS_DEVICE: /* 0x0B */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_address_device(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_CONFIGURE_EP: /* 0x0C */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_config_ep(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_EVALUATE_CTX: /* 0x0D */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_eval_ctx(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_RESET_EP: /* 0x0E */ + DPRINTF(("Reset Endpoint on slot %d\r\n", slot)); + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_STOP_EP: /* 0x0F */ + DPRINTF(("Stop Endpoint on slot %d\r\n", slot)); + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_reset_ep(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_SET_TR_DEQUEUE: /* 0x10 */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_set_tr(sc, slot, trb); + break; + + case XHCI_TRB_TYPE_RESET_DEVICE: /* 0x11 */ + slot = XHCI_TRB_3_SLOT_GET(trb->dwTrb3); + cmderr = pci_xhci_cmd_reset_device(sc, slot); + break; + + case XHCI_TRB_TYPE_FORCE_EVENT: /* 0x12 */ + /* TODO: */ + break; + + case XHCI_TRB_TYPE_NEGOTIATE_BW: /* 0x13 */ + break; + + case XHCI_TRB_TYPE_SET_LATENCY_TOL: /* 0x14 */ + break; + + case XHCI_TRB_TYPE_GET_PORT_BW: /* 0x15 */ + break; + + case XHCI_TRB_TYPE_FORCE_HEADER: /* 0x16 */ + break; + + case XHCI_TRB_TYPE_NOOP_CMD: /* 0x17 */ + break; + + default: + DPRINTF(("pci_xhci: unsupported cmd %x\r\n", type)); + break; + } + + if (type != XHCI_TRB_TYPE_LINK) { + /* + * insert command completion event and assert intr + */ + evtrb.qwTrb0 = crcr; + evtrb.dwTrb2 |= XHCI_TRB_2_ERROR_SET(cmderr); + evtrb.dwTrb3 |= XHCI_TRB_3_SLOT_SET(slot); + DPRINTF(("pci_xhci: command 0x%x result: 0x%x\r\n", + type, cmderr)); + pci_xhci_insert_event(sc, &evtrb, 1); + } + + trb = pci_xhci_trb_next(sc, trb, &crcr); + } + + sc->opregs.crcr = crcr | (sc->opregs.crcr & XHCI_CRCR_LO_CA) | ccs; + sc->opregs.crcr &= ~XHCI_CRCR_LO_CRR; + return (error); +} + +static void +pci_xhci_dump_trb(struct xhci_trb *trb) +{ + static const char *trbtypes[] = { + "RESERVED", 
+ "NORMAL", + "SETUP_STAGE", + "DATA_STAGE", + "STATUS_STAGE", + "ISOCH", + "LINK", + "EVENT_DATA", + "NOOP", + "ENABLE_SLOT", + "DISABLE_SLOT", + "ADDRESS_DEVICE", + "CONFIGURE_EP", + "EVALUATE_CTX", + "RESET_EP", + "STOP_EP", + "SET_TR_DEQUEUE", + "RESET_DEVICE", + "FORCE_EVENT", + "NEGOTIATE_BW", + "SET_LATENCY_TOL", + "GET_PORT_BW", + "FORCE_HEADER", + "NOOP_CMD" + }; + uint32_t type; + + type = XHCI_TRB_3_TYPE_GET(trb->dwTrb3); + DPRINTF(("pci_xhci: trb[@%p] type x%02x %s 0:x%016lx 2:x%08x 3:x%08x\r\n", + trb, type, + type <= XHCI_TRB_TYPE_NOOP_CMD ? trbtypes[type] : "INVALID", + trb->qwTrb0, trb->dwTrb2, trb->dwTrb3)); +} + +static int +pci_xhci_xfer_complete(struct pci_xhci_softc *sc, struct usb_data_xfer *xfer, + uint32_t slot, uint32_t epid, int *do_intr) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + struct xhci_trb *trb; + struct xhci_trb evtrb; + uint32_t trbflags; + uint32_t edtla; + int i, err; + + dev = XHCI_SLOTDEV_PTR(sc, slot); + devep = &dev->eps[epid]; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + + assert(dev_ctx != NULL); + + ep_ctx = &dev_ctx->ctx_ep[epid]; + + err = XHCI_TRB_ERROR_SUCCESS; + *do_intr = 0; + edtla = 0; + + /* go through list of TRBs and insert event(s) */ + for (i = xfer->head; xfer->ndata > 0; ) { + evtrb.qwTrb0 = (uint64_t)xfer->data[i].hci_data; + trb = XHCI_GADDR(sc, evtrb.qwTrb0); + trbflags = trb->dwTrb3; + + DPRINTF(("pci_xhci: xfer[%d] done?%u:%d trb %x %016lx %x " + "(err %d) IOC?%d\r\n", + i, xfer->data[i].processed, xfer->data[i].blen, + XHCI_TRB_3_TYPE_GET(trbflags), evtrb.qwTrb0, + trbflags, err, + trb->dwTrb3 & XHCI_TRB_3_IOC_BIT ? 1 : 0)); + + if (!xfer->data[i].processed) { + xfer->head = i; + break; + } + + xfer->ndata--; + edtla += xfer->data[i].bdone; + + trb->dwTrb3 = (trb->dwTrb3 & ~0x1) | (xfer->data[i].ccs); + + pci_xhci_update_ep_ring(sc, dev, devep, ep_ctx, + xfer->data[i].streamid, xfer->data[i].trbnext, + xfer->data[i].ccs); + + /* Only interrupt if IOC or short packet */ + if (!(trb->dwTrb3 & XHCI_TRB_3_IOC_BIT) && + !((err == XHCI_TRB_ERROR_SHORT_PKT) && + (trb->dwTrb3 & XHCI_TRB_3_ISP_BIT))) { + + i = (i + 1) % USB_MAX_XFER_BLOCKS; + continue; + } + + evtrb.dwTrb2 = XHCI_TRB_2_ERROR_SET(err) | + XHCI_TRB_2_REM_SET(xfer->data[i].blen); + + evtrb.dwTrb3 = XHCI_TRB_3_TYPE_SET(XHCI_TRB_EVENT_TRANSFER) | + XHCI_TRB_3_SLOT_SET(slot) | XHCI_TRB_3_EP_SET(epid); + + if (XHCI_TRB_3_TYPE_GET(trbflags) == XHCI_TRB_TYPE_EVENT_DATA) { + DPRINTF(("pci_xhci EVENT_DATA edtla %u\r\n", edtla)); + evtrb.qwTrb0 = trb->qwTrb0; + evtrb.dwTrb2 = (edtla & 0xFFFFF) | + XHCI_TRB_2_ERROR_SET(err); + evtrb.dwTrb3 |= XHCI_TRB_3_ED_BIT; + edtla = 0; + } + + *do_intr = 1; + + err = pci_xhci_insert_event(sc, &evtrb, 0); + if (err != XHCI_TRB_ERROR_SUCCESS) { + break; + } + + i = (i + 1) % USB_MAX_XFER_BLOCKS; + } + + return (err); +} + +static void +pci_xhci_update_ep_ring(struct pci_xhci_softc *sc, struct pci_xhci_dev_emu *dev, + struct pci_xhci_dev_ep *devep, struct xhci_endp_ctx *ep_ctx, + uint32_t streamid, uint64_t ringaddr, int ccs) +{ + + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { + devep->ep_sctx[streamid].qwSctx0 = (ringaddr & ~0xFUL) | + (ccs & 0x1); + + devep->ep_sctx_trbs[streamid].ringaddr = ringaddr & ~0xFUL; + devep->ep_sctx_trbs[streamid].ccs = ccs & 0x1; + ep_ctx->qwEpCtx2 = (ep_ctx->qwEpCtx2 & ~0x1) | (ccs & 0x1); + + DPRINTF(("xhci update ep-ring stream %d, addr %lx\r\n", + streamid, devep->ep_sctx[streamid].qwSctx0)); + } else { + 
devep->ep_ringaddr = ringaddr & ~0xFUL; + devep->ep_ccs = ccs & 0x1; + devep->ep_tr = XHCI_GADDR(sc, ringaddr & ~0xFUL); + ep_ctx->qwEpCtx2 = (ringaddr & ~0xFUL) | (ccs & 0x1); + + DPRINTF(("xhci update ep-ring, addr %lx\r\n", + (devep->ep_ringaddr | devep->ep_ccs))); + } +} + +/* + * Outstanding transfer still in progress (device NAK'd earlier) so retry + * the transfer again to see if it succeeds. + */ +static int +pci_xhci_try_usb_xfer(struct pci_xhci_softc *sc, + struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, + struct xhci_endp_ctx *ep_ctx, uint32_t slot, uint32_t epid) +{ + struct usb_data_xfer *xfer; + int err; + int do_intr; + + ep_ctx->dwEpCtx0 = FIELD_REPLACE( + ep_ctx->dwEpCtx0, XHCI_ST_EPCTX_RUNNING, 0x7, 0); + + err = 0; + do_intr = 0; + + xfer = devep->ep_xfer; +#ifdef __FreeBSD__ + USB_DATA_XFER_LOCK(xfer); +#else + /* + * At least one caller needs to hold this lock across the call to this + * function and other code. To avoid deadlock from a recursive mutex + * enter, we ensure that all callers hold this lock. + */ + assert(USB_DATA_XFER_LOCK_HELD(xfer)); +#endif + + /* outstanding requests queued up */ + if (dev->dev_ue->ue_data != NULL) { + err = dev->dev_ue->ue_data(dev->dev_sc, xfer, + epid & 0x1 ? USB_XFER_IN : USB_XFER_OUT, epid/2); + if (err == USB_ERR_CANCELLED) { + if (USB_DATA_GET_ERRCODE(&xfer->data[xfer->head]) == + USB_NAK) + err = XHCI_TRB_ERROR_SUCCESS; + } else { + err = pci_xhci_xfer_complete(sc, xfer, slot, epid, + &do_intr); + if (err == XHCI_TRB_ERROR_SUCCESS && do_intr) { + pci_xhci_assert_interrupt(sc); + } + + + /* XXX should not do it if error? */ + USB_DATA_XFER_RESET(xfer); + } + } + +#ifdef __FreeBSD__ + USB_DATA_XFER_UNLOCK(xfer); +#endif + + return (err); +} + + +static int +pci_xhci_handle_transfer(struct pci_xhci_softc *sc, + struct pci_xhci_dev_emu *dev, struct pci_xhci_dev_ep *devep, + struct xhci_endp_ctx *ep_ctx, struct xhci_trb *trb, uint32_t slot, + uint32_t epid, uint64_t addr, uint32_t ccs, uint32_t streamid) +{ + struct xhci_trb *setup_trb; + struct usb_data_xfer *xfer; + struct usb_data_xfer_block *xfer_block; + uint64_t val; + uint32_t trbflags; + int do_intr, err; + int do_retry; + + ep_ctx->dwEpCtx0 = FIELD_REPLACE(ep_ctx->dwEpCtx0, + XHCI_ST_EPCTX_RUNNING, 0x7, 0); + + xfer = devep->ep_xfer; + USB_DATA_XFER_LOCK(xfer); + + DPRINTF(("pci_xhci handle_transfer slot %u\r\n", slot)); + +retry: + err = 0; + do_retry = 0; + do_intr = 0; + setup_trb = NULL; + + while (1) { + pci_xhci_dump_trb(trb); + + trbflags = trb->dwTrb3; + + if (XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK && + (trbflags & XHCI_TRB_3_CYCLE_BIT) != + (ccs & XHCI_TRB_3_CYCLE_BIT)) { + DPRINTF(("Cycle-bit changed trbflags %x, ccs %x\r\n", + trbflags & XHCI_TRB_3_CYCLE_BIT, ccs)); + break; + } + + xfer_block = NULL; + + switch (XHCI_TRB_3_TYPE_GET(trbflags)) { + case XHCI_TRB_TYPE_LINK: + if (trb->dwTrb3 & XHCI_TRB_3_TC_BIT) + ccs ^= 0x1; + + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + xfer_block->processed = 1; + break; + + case XHCI_TRB_TYPE_SETUP_STAGE: + if ((trbflags & XHCI_TRB_3_IDT_BIT) == 0 || + XHCI_TRB_2_BYTES_GET(trb->dwTrb2) != 8) { + DPRINTF(("pci_xhci: invalid setup trb\r\n")); + err = XHCI_TRB_ERROR_TRB; + goto errout; + } + setup_trb = trb; + + val = trb->qwTrb0; + if (!xfer->ureq) + xfer->ureq = malloc( + sizeof(struct usb_device_request)); + memcpy(xfer->ureq, &val, + sizeof(struct usb_device_request)); + + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + xfer_block->processed = 
1; + break; + + case XHCI_TRB_TYPE_NORMAL: + case XHCI_TRB_TYPE_ISOCH: + if (setup_trb != NULL) { + DPRINTF(("pci_xhci: trb not supposed to be in " + "ctl scope\r\n")); + err = XHCI_TRB_ERROR_TRB; + goto errout; + } + /* fall through */ + + case XHCI_TRB_TYPE_DATA_STAGE: + xfer_block = usb_data_xfer_append(xfer, + (void *)(trbflags & XHCI_TRB_3_IDT_BIT ? + &trb->qwTrb0 : XHCI_GADDR(sc, trb->qwTrb0)), + trb->dwTrb2 & 0x1FFFF, (void *)addr, ccs); + break; + + case XHCI_TRB_TYPE_STATUS_STAGE: + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + break; + + case XHCI_TRB_TYPE_NOOP: + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + xfer_block->processed = 1; + break; + + case XHCI_TRB_TYPE_EVENT_DATA: + xfer_block = usb_data_xfer_append(xfer, NULL, 0, + (void *)addr, ccs); + if ((epid > 1) && (trbflags & XHCI_TRB_3_IOC_BIT)) { + xfer_block->processed = 1; + } + break; + + default: + DPRINTF(("pci_xhci: handle xfer unexpected trb type " + "0x%x\r\n", + XHCI_TRB_3_TYPE_GET(trbflags))); + err = XHCI_TRB_ERROR_TRB; + goto errout; + } + + trb = pci_xhci_trb_next(sc, trb, &addr); + + DPRINTF(("pci_xhci: next trb: 0x%lx\r\n", (uint64_t)trb)); + + if (xfer_block) { + xfer_block->trbnext = addr; + xfer_block->streamid = streamid; + } + + if (!setup_trb && !(trbflags & XHCI_TRB_3_CHAIN_BIT) && + XHCI_TRB_3_TYPE_GET(trbflags) != XHCI_TRB_TYPE_LINK) { + break; + } + + /* handle current batch that requires interrupt on complete */ + if (trbflags & XHCI_TRB_3_IOC_BIT) { + DPRINTF(("pci_xhci: trb IOC bit set\r\n")); + if (epid == 1) + do_retry = 1; + break; + } + } + + DPRINTF(("pci_xhci[%d]: xfer->ndata %u\r\n", __LINE__, xfer->ndata)); + + if (epid == 1) { + err = USB_ERR_NOT_STARTED; + if (dev->dev_ue->ue_request != NULL) + err = dev->dev_ue->ue_request(dev->dev_sc, xfer); + setup_trb = NULL; + } else { + /* handle data transfer */ + pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); + err = XHCI_TRB_ERROR_SUCCESS; + goto errout; + } + + err = USB_TO_XHCI_ERR(err); + if ((err == XHCI_TRB_ERROR_SUCCESS) || + (err == XHCI_TRB_ERROR_SHORT_PKT)) { + err = pci_xhci_xfer_complete(sc, xfer, slot, epid, &do_intr); + if (err != XHCI_TRB_ERROR_SUCCESS) + do_retry = 0; + } + +errout: + if (err == XHCI_TRB_ERROR_EV_RING_FULL) + DPRINTF(("pci_xhci[%d]: event ring full\r\n", __LINE__)); + + if (!do_retry) + USB_DATA_XFER_UNLOCK(xfer); + + if (do_intr) + pci_xhci_assert_interrupt(sc); + + if (do_retry) { + USB_DATA_XFER_RESET(xfer); + DPRINTF(("pci_xhci[%d]: retry:continuing with next TRBs\r\n", + __LINE__)); + goto retry; + } + + if (epid == 1) + USB_DATA_XFER_RESET(xfer); + + return (err); +} + +static void +pci_xhci_device_doorbell(struct pci_xhci_softc *sc, uint32_t slot, + uint32_t epid, uint32_t streamid) +{ + struct pci_xhci_dev_emu *dev; + struct pci_xhci_dev_ep *devep; + struct xhci_dev_ctx *dev_ctx; + struct xhci_endp_ctx *ep_ctx; + struct pci_xhci_trb_ring *sctx_tr; + struct xhci_trb *trb; + uint64_t ringaddr; + uint32_t ccs; + + DPRINTF(("pci_xhci doorbell slot %u epid %u stream %u\r\n", + slot, epid, streamid)); + + if (slot == 0 || slot > sc->ndevices) { + DPRINTF(("pci_xhci: invalid doorbell slot %u\r\n", slot)); + return; + } + + dev = XHCI_SLOTDEV_PTR(sc, slot); + devep = &dev->eps[epid]; + dev_ctx = pci_xhci_get_dev_ctx(sc, slot); + if (!dev_ctx) { + return; + } + ep_ctx = &dev_ctx->ctx_ep[epid]; + + sctx_tr = NULL; + + DPRINTF(("pci_xhci: device doorbell ep[%u] %08x %08x %016lx %08x\r\n", + epid, ep_ctx->dwEpCtx0, ep_ctx->dwEpCtx1, ep_ctx->qwEpCtx2, 
+ ep_ctx->dwEpCtx4)); + + if (ep_ctx->qwEpCtx2 == 0) + return; + + /* handle pending transfers */ + if (devep->ep_xfer->ndata > 0) { +#ifndef __FreeBSD__ + USB_DATA_XFER_LOCK(devep->ep_xfer); +#endif + pci_xhci_try_usb_xfer(sc, dev, devep, ep_ctx, slot, epid); +#ifndef __FreeBSD__ + USB_DATA_XFER_UNLOCK(devep->ep_xfer); +#endif + return; + } + + /* get next trb work item */ + if (XHCI_EPCTX_0_MAXP_STREAMS_GET(ep_ctx->dwEpCtx0) != 0) { + sctx_tr = &devep->ep_sctx_trbs[streamid]; + ringaddr = sctx_tr->ringaddr; + ccs = sctx_tr->ccs; + trb = XHCI_GADDR(sc, sctx_tr->ringaddr & ~0xFUL); + DPRINTF(("doorbell, stream %u, ccs %lx, trb ccs %x\r\n", + streamid, ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, + trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); + } else { + ringaddr = devep->ep_ringaddr; + ccs = devep->ep_ccs; + trb = devep->ep_tr; + DPRINTF(("doorbell, ccs %lx, trb ccs %x\r\n", + ep_ctx->qwEpCtx2 & XHCI_TRB_3_CYCLE_BIT, + trb->dwTrb3 & XHCI_TRB_3_CYCLE_BIT)); + } + + if (XHCI_TRB_3_TYPE_GET(trb->dwTrb3) == 0) { + DPRINTF(("pci_xhci: ring %lx trb[%lx] EP %u is RESERVED?\r\n", + ep_ctx->qwEpCtx2, devep->ep_ringaddr, epid)); + return; + } + + pci_xhci_handle_transfer(sc, dev, devep, ep_ctx, trb, slot, epid, + ringaddr, ccs, streamid); +} + +static void +pci_xhci_dbregs_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + + offset = (offset - sc->dboff) / sizeof(uint32_t); + + DPRINTF(("pci_xhci: doorbell write offset 0x%lx: 0x%lx\r\n", + offset, value)); + + if (XHCI_HALTED(sc)) { + DPRINTF(("pci_xhci: controller halted\r\n")); + return; + } + + if (offset == 0) + pci_xhci_complete_commands(sc); + else if (sc->portregs != NULL) + pci_xhci_device_doorbell(sc, offset, + XHCI_DB_TARGET_GET(value), XHCI_DB_SID_GET(value)); +} + +static void +pci_xhci_rtsregs_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + struct pci_xhci_rtsregs *rts; + + offset -= sc->rtsoff; + + if (offset == 0) { + DPRINTF(("pci_xhci attempted write to MFINDEX\r\n")); + return; + } + + DPRINTF(("pci_xhci: runtime regs write offset 0x%lx: 0x%lx\r\n", + offset, value)); + + offset -= 0x20; /* start of intrreg */ + + rts = &sc->rtsregs; + + switch (offset) { + case 0x00: + if (value & XHCI_IMAN_INTR_PEND) + rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; + rts->intrreg.iman = (value & XHCI_IMAN_INTR_ENA) | + (rts->intrreg.iman & XHCI_IMAN_INTR_PEND); + + if (!(value & XHCI_IMAN_INTR_ENA)) + pci_xhci_deassert_interrupt(sc); + + break; + + case 0x04: + rts->intrreg.imod = value; + break; + + case 0x08: + rts->intrreg.erstsz = value & 0xFFFF; + break; + + case 0x10: + /* ERSTBA low bits */ + rts->intrreg.erstba = MASK_64_HI(sc->rtsregs.intrreg.erstba) | + (value & ~0x3F); + break; + + case 0x14: + /* ERSTBA high bits */ + rts->intrreg.erstba = (value << 32) | + MASK_64_LO(sc->rtsregs.intrreg.erstba); + + rts->erstba_p = XHCI_GADDR(sc, + sc->rtsregs.intrreg.erstba & ~0x3FUL); + + rts->erst_p = XHCI_GADDR(sc, + sc->rtsregs.erstba_p->qwEvrsTablePtr & ~0x3FUL); + + rts->er_enq_idx = 0; + rts->er_events_cnt = 0; + + DPRINTF(("pci_xhci: wr erstba erst (%p) ptr 0x%lx, sz %u\r\n", + rts->erstba_p, + rts->erstba_p->qwEvrsTablePtr, + rts->erstba_p->dwEvrsTableSize)); + break; + + case 0x18: + /* ERDP low bits */ + rts->intrreg.erdp = + MASK_64_HI(sc->rtsregs.intrreg.erdp) | + (rts->intrreg.erdp & XHCI_ERDP_LO_BUSY) | + (value & ~0xF); + if (value & XHCI_ERDP_LO_BUSY) { + rts->intrreg.erdp &= ~XHCI_ERDP_LO_BUSY; + rts->intrreg.iman &= ~XHCI_IMAN_INTR_PEND; + } + + rts->er_deq_seg = XHCI_ERDP_LO_SINDEX(value); + + 
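		/*
		 * Editor's note -- annotation, not part of the original
		 * commit: in the low ERDP dword, bit 3 is EHB (Event
		 * Handler Busy, write-1-to-clear) and bits 2:0 are the
		 * dequeue ERST segment index; clearing EHB above also drops
		 * the pending bit in IMAN.  The outstanding-event count is
		 * recomputed only when the guest writes the high dword (the
		 * 0x1C case below), once the 64-bit pointer is complete.
		 */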
break; + + case 0x1C: + /* ERDP high bits */ + rts->intrreg.erdp = (value << 32) | + MASK_64_LO(sc->rtsregs.intrreg.erdp); + + if (rts->er_events_cnt > 0) { + uint64_t erdp; + uint32_t erdp_i; + + erdp = rts->intrreg.erdp & ~0xF; + erdp_i = (erdp - rts->erstba_p->qwEvrsTablePtr) / + sizeof(struct xhci_trb); + + if (erdp_i <= rts->er_enq_idx) + rts->er_events_cnt = rts->er_enq_idx - erdp_i; + else + rts->er_events_cnt = + rts->erstba_p->dwEvrsTableSize - + (erdp_i - rts->er_enq_idx); + + DPRINTF(("pci_xhci: erdp 0x%lx, events cnt %u\r\n", + erdp, rts->er_events_cnt)); + } + + break; + + default: + DPRINTF(("pci_xhci attempted write to RTS offset 0x%lx\r\n", + offset)); + break; + } +} + +static uint64_t +pci_xhci_portregs_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + int port; + uint32_t *p; + + if (sc->portregs == NULL) + return (0); + + port = (offset - 0x3F0) / 0x10; + + if (port > XHCI_MAX_DEVS) { + DPRINTF(("pci_xhci: portregs_read port %d >= XHCI_MAX_DEVS\r\n", + port)); + + /* return default value for unused port */ + return (XHCI_PS_SPEED_SET(3)); + } + + offset = (offset - 0x3F0) % 0x10; + + p = &sc->portregs[port].portsc; + p += offset / sizeof(uint32_t); + + DPRINTF(("pci_xhci: portregs read offset 0x%lx port %u -> 0x%x\r\n", + offset, port, *p)); + + return (*p); +} + +static void +pci_xhci_hostop_write(struct pci_xhci_softc *sc, uint64_t offset, + uint64_t value) +{ + offset -= XHCI_CAPLEN; + + if (offset < 0x400) + DPRINTF(("pci_xhci: hostop write offset 0x%lx: 0x%lx\r\n", + offset, value)); + + switch (offset) { + case XHCI_USBCMD: + sc->opregs.usbcmd = pci_xhci_usbcmd_write(sc, value & 0x3F0F); + break; + + case XHCI_USBSTS: + /* clear bits on write */ + sc->opregs.usbsts &= ~(value & + (XHCI_STS_HSE|XHCI_STS_EINT|XHCI_STS_PCD|XHCI_STS_SSS| + XHCI_STS_RSS|XHCI_STS_SRE|XHCI_STS_CNR)); + break; + + case XHCI_PAGESIZE: + /* read only */ + break; + + case XHCI_DNCTRL: + sc->opregs.dnctrl = value & 0xFFFF; + break; + + case XHCI_CRCR_LO: + if (sc->opregs.crcr & XHCI_CRCR_LO_CRR) { + sc->opregs.crcr &= ~(XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); + sc->opregs.crcr |= value & + (XHCI_CRCR_LO_CS|XHCI_CRCR_LO_CA); + } else { + sc->opregs.crcr = MASK_64_HI(sc->opregs.crcr) | + (value & (0xFFFFFFC0 | XHCI_CRCR_LO_RCS)); + } + break; + + case XHCI_CRCR_HI: + if (!(sc->opregs.crcr & XHCI_CRCR_LO_CRR)) { + sc->opregs.crcr = MASK_64_LO(sc->opregs.crcr) | + (value << 32); + + sc->opregs.cr_p = XHCI_GADDR(sc, + sc->opregs.crcr & ~0xF); + } + + if (sc->opregs.crcr & XHCI_CRCR_LO_CS) { + /* Stop operation of Command Ring */ + } + + if (sc->opregs.crcr & XHCI_CRCR_LO_CA) { + /* Abort command */ + } + + break; + + case XHCI_DCBAAP_LO: + sc->opregs.dcbaap = MASK_64_HI(sc->opregs.dcbaap) | + (value & 0xFFFFFFC0); + break; + + case XHCI_DCBAAP_HI: + sc->opregs.dcbaap = MASK_64_LO(sc->opregs.dcbaap) | + (value << 32); + sc->opregs.dcbaa_p = XHCI_GADDR(sc, sc->opregs.dcbaap & ~0x3FUL); + + DPRINTF(("pci_xhci: opregs dcbaap = 0x%lx (vaddr 0x%lx)\r\n", + sc->opregs.dcbaap, (uint64_t)sc->opregs.dcbaa_p)); + break; + + case XHCI_CONFIG: + sc->opregs.config = value & 0x03FF; + break; + + default: + if (offset >= 0x400) + pci_xhci_portregs_write(sc, offset, value); + + break; + } +} + + +static void +pci_xhci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, + int baridx, uint64_t offset, int size, uint64_t value) +{ + struct pci_xhci_softc *sc; + + sc = pi->pi_arg; + + assert(baridx == 0); + + + pthread_mutex_lock(&sc->mtx); + if (offset < XHCI_CAPLEN) /* read only registers */ + 
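	/*
	 * Editor's note -- annotation, not part of the original commit:
	 * BAR0 is laid out as capability registers in [0, XHCI_CAPLEN),
	 * operational registers in [XHCI_CAPLEN, dboff), doorbells in
	 * [dboff, rtsoff), runtime registers in [rtsoff, regsend), and the
	 * read-only extended capability tables after regsend.  The offset
	 * comparisons here and in pci_xhci_read() dispatch accesses
	 * accordingly; writes to read-only regions are logged and dropped.
	 */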
WPRINTF(("pci_xhci: write RO-CAPs offset %ld\r\n", offset)); + else if (offset < sc->dboff) + pci_xhci_hostop_write(sc, offset, value); + else if (offset < sc->rtsoff) + pci_xhci_dbregs_write(sc, offset, value); + else if (offset < sc->regsend) + pci_xhci_rtsregs_write(sc, offset, value); + else + WPRINTF(("pci_xhci: write invalid offset %ld\r\n", offset)); + + pthread_mutex_unlock(&sc->mtx); +} + +static uint64_t +pci_xhci_hostcap_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint64_t value; + + switch (offset) { + case XHCI_CAPLENGTH: /* 0x00 */ + value = sc->caplength; + break; + + case XHCI_HCSPARAMS1: /* 0x04 */ + value = sc->hcsparams1; + break; + + case XHCI_HCSPARAMS2: /* 0x08 */ + value = sc->hcsparams2; + break; + + case XHCI_HCSPARAMS3: /* 0x0C */ + value = sc->hcsparams3; + break; + + case XHCI_HCSPARAMS0: /* 0x10 */ + value = sc->hccparams1; + break; + + case XHCI_DBOFF: /* 0x14 */ + value = sc->dboff; + break; + + case XHCI_RTSOFF: /* 0x18 */ + value = sc->rtsoff; + break; + + case XHCI_HCCPRAMS2: /* 0x1C */ + value = sc->hccparams2; + break; + + default: + value = 0; + break; + } + + DPRINTF(("pci_xhci: hostcap read offset 0x%lx -> 0x%lx\r\n", + offset, value)); + + return (value); +} + +static uint64_t +pci_xhci_hostop_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint64_t value; + + offset = (offset - XHCI_CAPLEN); + + switch (offset) { + case XHCI_USBCMD: /* 0x00 */ + value = sc->opregs.usbcmd; + break; + + case XHCI_USBSTS: /* 0x04 */ + value = sc->opregs.usbsts; + break; + + case XHCI_PAGESIZE: /* 0x08 */ + value = sc->opregs.pgsz; + break; + + case XHCI_DNCTRL: /* 0x14 */ + value = sc->opregs.dnctrl; + break; + + case XHCI_CRCR_LO: /* 0x18 */ + value = sc->opregs.crcr & XHCI_CRCR_LO_CRR; + break; + + case XHCI_CRCR_HI: /* 0x1C */ + value = 0; + break; + + case XHCI_DCBAAP_LO: /* 0x30 */ + value = sc->opregs.dcbaap & 0xFFFFFFFF; + break; + + case XHCI_DCBAAP_HI: /* 0x34 */ + value = (sc->opregs.dcbaap >> 32) & 0xFFFFFFFF; + break; + + case XHCI_CONFIG: /* 0x38 */ + value = sc->opregs.config; + break; + + default: + if (offset >= 0x400) + value = pci_xhci_portregs_read(sc, offset); + else + value = 0; + + break; + } + + if (offset < 0x400) + DPRINTF(("pci_xhci: hostop read offset 0x%lx -> 0x%lx\r\n", + offset, value)); + + return (value); +} + +static uint64_t +pci_xhci_dbregs_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + + /* read doorbell always returns 0 */ + return (0); +} + +static uint64_t +pci_xhci_rtsregs_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint32_t value; + + offset -= sc->rtsoff; + value = 0; + + if (offset == XHCI_MFINDEX) { + value = sc->rtsregs.mfindex; + } else if (offset >= 0x20) { + int item; + uint32_t *p; + + offset -= 0x20; + item = offset % 32; + + assert(offset < sizeof(sc->rtsregs.intrreg)); + + p = &sc->rtsregs.intrreg.iman; + p += item / sizeof(uint32_t); + value = *p; + } + + DPRINTF(("pci_xhci: rtsregs read offset 0x%lx -> 0x%x\r\n", + offset, value)); + + return (value); +} + +static uint64_t +pci_xhci_xecp_read(struct pci_xhci_softc *sc, uint64_t offset) +{ + uint32_t value; + + offset -= sc->regsend; + value = 0; + + switch (offset) { + case 0: + /* rev major | rev minor | next-cap | cap-id */ + value = (0x02 << 24) | (4 << 8) | XHCI_ID_PROTOCOLS; + break; + case 4: + /* name string = "USB" */ + value = 0x20425355; + break; + case 8: + /* psic | proto-defined | compat # | compat offset */ + value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb2_port_start; + break; + case 12: + break; + case 16: + /* rev major | 
rev minor | next-cap | cap-id */ + value = (0x03 << 24) | XHCI_ID_PROTOCOLS; + break; + case 20: + /* name string = "USB" */ + value = 0x20425355; + break; + case 24: + /* psic | proto-defined | compat # | compat offset */ + value = ((XHCI_MAX_DEVS/2) << 8) | sc->usb3_port_start; + break; + case 28: + break; + default: + DPRINTF(("pci_xhci: xecp invalid offset 0x%lx\r\n", offset)); + break; + } + + DPRINTF(("pci_xhci: xecp read offset 0x%lx -> 0x%x\r\n", + offset, value)); + + return (value); +} + + +static uint64_t +pci_xhci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx, + uint64_t offset, int size) +{ + struct pci_xhci_softc *sc; + uint32_t value; + + sc = pi->pi_arg; + + assert(baridx == 0); + + pthread_mutex_lock(&sc->mtx); + if (offset < XHCI_CAPLEN) + value = pci_xhci_hostcap_read(sc, offset); + else if (offset < sc->dboff) + value = pci_xhci_hostop_read(sc, offset); + else if (offset < sc->rtsoff) + value = pci_xhci_dbregs_read(sc, offset); + else if (offset < sc->regsend) + value = pci_xhci_rtsregs_read(sc, offset); + else if (offset < (sc->regsend + 4*32)) + value = pci_xhci_xecp_read(sc, offset); + else { + value = 0; + WPRINTF(("pci_xhci: read invalid offset %ld\r\n", offset)); + } + + pthread_mutex_unlock(&sc->mtx); + + switch (size) { + case 1: + value &= 0xFF; + break; + case 2: + value &= 0xFFFF; + break; + case 4: + value &= 0xFFFFFFFF; + break; + } + + return (value); +} + +static void +pci_xhci_reset_port(struct pci_xhci_softc *sc, int portn, int warm) +{ + struct pci_xhci_portregs *port; + struct pci_xhci_dev_emu *dev; + struct xhci_trb evtrb; + int error; + + assert(portn <= XHCI_MAX_DEVS); + + DPRINTF(("xhci reset port %d\r\n", portn)); + + port = XHCI_PORTREG_PTR(sc, portn); + dev = XHCI_DEVINST_PTR(sc, portn); + if (dev) { + port->portsc &= ~(XHCI_PS_PLS_MASK | XHCI_PS_PR | XHCI_PS_PRC); + port->portsc |= XHCI_PS_PED | + XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); + + if (warm && dev->dev_ue->ue_usbver == 3) { + port->portsc |= XHCI_PS_WRC; + } + + if ((port->portsc & XHCI_PS_PRC) == 0) { + port->portsc |= XHCI_PS_PRC; + + pci_xhci_set_evtrb(&evtrb, portn, + XHCI_TRB_ERROR_SUCCESS, + XHCI_TRB_EVENT_PORT_STS_CHANGE); + error = pci_xhci_insert_event(sc, &evtrb, 1); + if (error != XHCI_TRB_ERROR_SUCCESS) + DPRINTF(("xhci reset port insert event " + "failed\r\n")); + } + } +} + +static void +pci_xhci_init_port(struct pci_xhci_softc *sc, int portn) +{ + struct pci_xhci_portregs *port; + struct pci_xhci_dev_emu *dev; + + port = XHCI_PORTREG_PTR(sc, portn); + dev = XHCI_DEVINST_PTR(sc, portn); + if (dev) { + port->portsc = XHCI_PS_CCS | /* connected */ + XHCI_PS_PP; /* port power */ + + if (dev->dev_ue->ue_usbver == 2) { + port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_POLL) | + XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); + } else { + port->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_U0) | + XHCI_PS_PED | /* enabled */ + XHCI_PS_SPEED_SET(dev->dev_ue->ue_usbspeed); + } + + DPRINTF(("Init port %d 0x%x\n", portn, port->portsc)); + } else { + port->portsc = XHCI_PS_PLS_SET(UPS_PORT_LS_RX_DET) | XHCI_PS_PP; + DPRINTF(("Init empty port %d 0x%x\n", portn, port->portsc)); + } +} + +static int +pci_xhci_dev_intr(struct usb_hci *hci, int epctx) +{ + struct pci_xhci_dev_emu *dev; + struct xhci_dev_ctx *dev_ctx; + struct xhci_trb evtrb; + struct pci_xhci_softc *sc; + struct pci_xhci_portregs *p; + struct xhci_endp_ctx *ep_ctx; + int error; + int dir_in; + int epid; + + dir_in = epctx & 0x80; + epid = epctx & ~0x80; + + /* HW endpoint contexts are 0-15; convert to epid 
based on dir */ + epid = (epid * 2) + (dir_in ? 1 : 0); + + assert(epid >= 1 && epid <= 31); + + dev = hci->hci_sc; + sc = dev->xsc; + + /* check if device is ready; OS has to initialise it */ + if (sc->rtsregs.erstba_p == NULL || + (sc->opregs.usbcmd & XHCI_CMD_RS) == 0 || + dev->dev_ctx == NULL) + return (0); + + p = XHCI_PORTREG_PTR(sc, hci->hci_port); + + /* raise event if link U3 (suspended) state */ + if (XHCI_PS_PLS_GET(p->portsc) == 3) { + p->portsc &= ~XHCI_PS_PLS_MASK; + p->portsc |= XHCI_PS_PLS_SET(UPS_PORT_LS_RESUME); + if ((p->portsc & XHCI_PS_PLC) != 0) + return (0); + + p->portsc |= XHCI_PS_PLC; + + pci_xhci_set_evtrb(&evtrb, hci->hci_port, + XHCI_TRB_ERROR_SUCCESS, XHCI_TRB_EVENT_PORT_STS_CHANGE); + error = pci_xhci_insert_event(sc, &evtrb, 0); + if (error != XHCI_TRB_ERROR_SUCCESS) + goto done; + } + + dev_ctx = dev->dev_ctx; + ep_ctx = &dev_ctx->ctx_ep[epid]; + if ((ep_ctx->dwEpCtx0 & 0x7) == XHCI_ST_EPCTX_DISABLED) { + DPRINTF(("xhci device interrupt on disabled endpoint %d\r\n", + epid)); + return (0); + } + + DPRINTF(("xhci device interrupt on endpoint %d\r\n", epid)); + + pci_xhci_device_doorbell(sc, hci->hci_port, epid, 0); + +done: + return (error); +} + +static int +pci_xhci_dev_event(struct usb_hci *hci, enum hci_usbev evid, void *param) +{ + + DPRINTF(("xhci device event port %d\r\n", hci->hci_port)); + return (0); +} + + + +static void +pci_xhci_device_usage(char *opt) +{ + + fprintf(stderr, "Invalid USB emulation \"%s\"\r\n", opt); +} + +static int +pci_xhci_parse_opts(struct pci_xhci_softc *sc, char *opts) +{ + struct pci_xhci_dev_emu **devices; + struct pci_xhci_dev_emu *dev; + struct usb_devemu *ue; + void *devsc; + char *uopt, *xopts, *config; + int usb3_port, usb2_port, i; + + uopt = NULL; + usb3_port = sc->usb3_port_start - 1; + usb2_port = sc->usb2_port_start - 1; + devices = NULL; + + if (opts == NULL) + goto portsfinal; + + devices = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_dev_emu *)); + + sc->slots = calloc(XHCI_MAX_SLOTS, sizeof(struct pci_xhci_dev_emu *)); + sc->devices = devices; + sc->ndevices = 0; + + uopt = strdup(opts); + for (xopts = strtok(uopt, ","); + xopts != NULL; + xopts = strtok(NULL, ",")) { + if (usb2_port == ((sc->usb2_port_start-1) + XHCI_MAX_DEVS/2) || + usb3_port == ((sc->usb3_port_start-1) + XHCI_MAX_DEVS/2)) { + WPRINTF(("pci_xhci max number of USB 2 or 3 " + "devices reached, max %d\r\n", XHCI_MAX_DEVS/2)); + usb2_port = usb3_port = -1; + goto done; + } + + /* device[=<config>] */ + if ((config = strchr(xopts, '=')) == NULL) + config = ""; /* no config */ + else + *config++ = '\0'; + + ue = usb_emu_finddev(xopts); + if (ue == NULL) { + pci_xhci_device_usage(xopts); + DPRINTF(("pci_xhci device not found %s\r\n", xopts)); + usb2_port = usb3_port = -1; + goto done; + } + + DPRINTF(("pci_xhci adding device %s, opts \"%s\"\r\n", + xopts, config)); + + dev = calloc(1, sizeof(struct pci_xhci_dev_emu)); + dev->xsc = sc; + dev->hci.hci_sc = dev; + dev->hci.hci_intr = pci_xhci_dev_intr; + dev->hci.hci_event = pci_xhci_dev_event; + + if (ue->ue_usbver == 2) { + dev->hci.hci_port = usb2_port + 1; + devices[usb2_port] = dev; + usb2_port++; + } else { + dev->hci.hci_port = usb3_port + 1; + devices[usb3_port] = dev; + usb3_port++; + } + + dev->hci.hci_address = 0; + devsc = ue->ue_init(&dev->hci, config); + if (devsc == NULL) { + pci_xhci_device_usage(xopts); + usb2_port = usb3_port = -1; + goto done; + } + + dev->dev_ue = ue; + dev->dev_sc = devsc; + + /* assign slot number to device */ + sc->slots[sc->ndevices] = dev; + + 
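		/*
		 * Editor's note -- annotation, not part of the original
		 * commit: slots are handed out in option order, so the Nth
		 * device named in the comma-separated option string becomes
		 * slot N+1.  The arrays are filled 0-based here; once
		 * parsing finishes, the portsfinal code below decrements
		 * the devices, portregs and slots pointers by one element
		 * so that the 1-based port and slot numbers used by the
		 * register model can index them directly.
		 */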
sc->ndevices++; + } + +portsfinal: + sc->portregs = calloc(XHCI_MAX_DEVS, sizeof(struct pci_xhci_portregs)); + + if (sc->ndevices > 0) { + /* port and slot numbering start from 1 */ + sc->devices--; + sc->portregs--; + sc->slots--; + + for (i = 1; i <= XHCI_MAX_DEVS; i++) { + pci_xhci_init_port(sc, i); + } + } else { + WPRINTF(("pci_xhci no USB devices configured\r\n")); + sc->ndevices = 1; + } + +done: + if (devices != NULL) { + if (usb2_port <= 0 && usb3_port <= 0) { + sc->devices = NULL; + for (i = 0; devices[i] != NULL; i++) + free(devices[i]); + sc->ndevices = -1; + + free(devices); + } + } + free(uopt); + return (sc->ndevices); +} + +static int +pci_xhci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts) +{ + struct pci_xhci_softc *sc; + int error; + + if (xhci_in_use) { + WPRINTF(("pci_xhci controller already defined\r\n")); + return (-1); + } + xhci_in_use = 1; + + sc = calloc(1, sizeof(struct pci_xhci_softc)); + pi->pi_arg = sc; + sc->xsc_pi = pi; + + sc->usb2_port_start = (XHCI_MAX_DEVS/2) + 1; + sc->usb3_port_start = 1; + + /* discover devices */ + error = pci_xhci_parse_opts(sc, opts); + if (error < 0) + goto done; + else + error = 0; + + sc->caplength = XHCI_SET_CAPLEN(XHCI_CAPLEN) | + XHCI_SET_HCIVERSION(0x0100); + sc->hcsparams1 = XHCI_SET_HCSP1_MAXPORTS(XHCI_MAX_DEVS) | + XHCI_SET_HCSP1_MAXINTR(1) | /* interrupters */ + XHCI_SET_HCSP1_MAXSLOTS(XHCI_MAX_SLOTS); + sc->hcsparams2 = XHCI_SET_HCSP2_ERSTMAX(XHCI_ERST_MAX) | + XHCI_SET_HCSP2_IST(0x04); + sc->hcsparams3 = 0; /* no latency */ + sc->hccparams1 = XHCI_SET_HCCP1_NSS(1) | /* no 2nd-streams */ + XHCI_SET_HCCP1_SPC(1) | /* short packet */ + XHCI_SET_HCCP1_MAXPSA(XHCI_STREAMS_MAX); + sc->hccparams2 = XHCI_SET_HCCP2_LEC(1) | + XHCI_SET_HCCP2_U3C(1); + sc->dboff = XHCI_SET_DOORBELL(XHCI_CAPLEN + XHCI_PORTREGS_START + + XHCI_MAX_DEVS * sizeof(struct pci_xhci_portregs)); + + /* dboff must be 32-bit aligned */ + if (sc->dboff & 0x3) + sc->dboff = (sc->dboff + 0x3) & ~0x3; + + /* rtsoff must be 32-bytes aligned */ + sc->rtsoff = XHCI_SET_RTSOFFSET(sc->dboff + (XHCI_MAX_SLOTS+1) * 32); + if (sc->rtsoff & 0x1F) + sc->rtsoff = (sc->rtsoff + 0x1F) & ~0x1F; + + DPRINTF(("pci_xhci dboff: 0x%x, rtsoff: 0x%x\r\n", sc->dboff, + sc->rtsoff)); + + sc->opregs.usbsts = XHCI_STS_HCH; + sc->opregs.pgsz = XHCI_PAGESIZE_4K; + + pci_xhci_reset(sc); + + sc->regsend = sc->rtsoff + 0x20 + 32; /* only 1 intrpter */ + + /* + * Set extended capabilities pointer to be after regsend; + * value of xecp field is 32-bit offset. 
+ */ + sc->hccparams1 |= XHCI_SET_HCCP1_XECP(sc->regsend/4); + + pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1E31); + pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086); + pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_SERIALBUS); + pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_SERIALBUS_USB); + pci_set_cfgdata8(pi, PCIR_PROGIF,PCIP_SERIALBUS_USB_XHCI); + pci_set_cfgdata8(pi, PCI_USBREV, PCI_USB_REV_3_0); + + pci_emul_add_msicap(pi, 1); + + /* regsend + xecp registers */ + pci_emul_alloc_bar(pi, 0, PCIBAR_MEM32, sc->regsend + 4*32); + DPRINTF(("pci_xhci pci_emu_alloc: %d\r\n", sc->regsend + 4*32)); + + + pci_lintr_request(pi); + + pthread_mutex_init(&sc->mtx, NULL); + +done: + if (error) { + free(sc); + } + + return (error); +} + + + +struct pci_devemu pci_de_xhci = { + .pe_emu = "xhci", + .pe_init = pci_xhci_init, + .pe_barwrite = pci_xhci_write, + .pe_barread = pci_xhci_read +}; +PCI_EMUL_SET(pci_de_xhci); diff --git a/usr/src/cmd/bhyve/pci_xhci.h b/usr/src/cmd/bhyve/pci_xhci.h new file mode 100644 index 0000000000..7502f9396a --- /dev/null +++ b/usr/src/cmd/bhyve/pci_xhci.h @@ -0,0 +1,355 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _PCI_XHCI_H_ +#define _PCI_XHCI_H_ + +#define PCI_USBREV 0x60 /* USB protocol revision */ + + +enum { /* dsc_slotstate */ + XHCI_ST_DISABLED, + XHCI_ST_ENABLED, + XHCI_ST_DEFAULT, + XHCI_ST_ADDRESSED, + XHCI_ST_CONFIGURED, + XHCI_ST_MAX +}; + +enum { + XHCI_ST_SLCTX_DISABLED, + XHCI_ST_SLCTX_DEFAULT, + XHCI_ST_SLCTX_ADDRESSED, + XHCI_ST_SLCTX_CONFIGURED +}; + +enum { + XHCI_ST_EPCTX_DISABLED, + XHCI_ST_EPCTX_RUNNING, + XHCI_ST_EPCTX_HALTED, + XHCI_ST_EPCTX_STOPPED, + XHCI_ST_EPCTX_ERROR +}; + +#define XHCI_MAX_DEVICES MIN(USB_MAX_DEVICES, 128) +#define XHCI_MAX_ENDPOINTS 32 /* hardcoded - do not change */ +#define XHCI_MAX_SCRATCHPADS 32 +#define XHCI_MAX_EVENTS (16 * 13) +#define XHCI_MAX_COMMANDS (16 * 1) +#define XHCI_MAX_RSEG 1 +#define XHCI_MAX_TRANSFERS 4 +#if USB_MAX_EP_STREAMS == 8 +#define XHCI_MAX_STREAMS 8 +#define XHCI_MAX_STREAMS_LOG 3 +#elif USB_MAX_EP_STREAMS == 1 +#define XHCI_MAX_STREAMS 1 +#define XHCI_MAX_STREAMS_LOG 0 +#else +#error "The USB_MAX_EP_STREAMS value is not supported." +#endif +#define XHCI_DEV_CTX_ADDR_ALIGN 64 /* bytes */ +#define XHCI_DEV_CTX_ALIGN 64 /* bytes */ +#define XHCI_INPUT_CTX_ALIGN 64 /* bytes */ +#define XHCI_SLOT_CTX_ALIGN 32 /* bytes */ +#define XHCI_ENDP_CTX_ALIGN 32 /* bytes */ +#define XHCI_STREAM_CTX_ALIGN 16 /* bytes */ +#define XHCI_TRANS_RING_SEG_ALIGN 16 /* bytes */ +#define XHCI_CMD_RING_SEG_ALIGN 64 /* bytes */ +#define XHCI_EVENT_RING_SEG_ALIGN 64 /* bytes */ +#define XHCI_SCRATCH_BUF_ARRAY_ALIGN 64 /* bytes */ +#define XHCI_SCRATCH_BUFFER_ALIGN USB_PAGE_SIZE +#define XHCI_TRB_ALIGN 16 /* bytes */ +#define XHCI_TD_ALIGN 64 /* bytes */ +#define XHCI_PAGE_SIZE 4096 /* bytes */ + +struct xhci_slot_ctx { + volatile uint32_t dwSctx0; +#define XHCI_SCTX_0_ROUTE_SET(x) ((x) & 0xFFFFF) +#define XHCI_SCTX_0_ROUTE_GET(x) ((x) & 0xFFFFF) +#define XHCI_SCTX_0_SPEED_SET(x) (((x) & 0xF) << 20) +#define XHCI_SCTX_0_SPEED_GET(x) (((x) >> 20) & 0xF) +#define XHCI_SCTX_0_MTT_SET(x) (((x) & 0x1) << 25) +#define XHCI_SCTX_0_MTT_GET(x) (((x) >> 25) & 0x1) +#define XHCI_SCTX_0_HUB_SET(x) (((x) & 0x1) << 26) +#define XHCI_SCTX_0_HUB_GET(x) (((x) >> 26) & 0x1) +#define XHCI_SCTX_0_CTX_NUM_SET(x) (((x) & 0x1F) << 27) +#define XHCI_SCTX_0_CTX_NUM_GET(x) (((x) >> 27) & 0x1F) + volatile uint32_t dwSctx1; +#define XHCI_SCTX_1_MAX_EL_SET(x) ((x) & 0xFFFF) +#define XHCI_SCTX_1_MAX_EL_GET(x) ((x) & 0xFFFF) +#define XHCI_SCTX_1_RH_PORT_SET(x) (((x) & 0xFF) << 16) +#define XHCI_SCTX_1_RH_PORT_GET(x) (((x) >> 16) & 0xFF) +#define XHCI_SCTX_1_NUM_PORTS_SET(x) (((x) & 0xFF) << 24) +#define XHCI_SCTX_1_NUM_PORTS_GET(x) (((x) >> 24) & 0xFF) + volatile uint32_t dwSctx2; +#define XHCI_SCTX_2_TT_HUB_SID_SET(x) ((x) & 0xFF) +#define XHCI_SCTX_2_TT_HUB_SID_GET(x) ((x) & 0xFF) +#define XHCI_SCTX_2_TT_PORT_NUM_SET(x) (((x) & 0xFF) << 8) +#define XHCI_SCTX_2_TT_PORT_NUM_GET(x) (((x) >> 8) & 0xFF) +#define XHCI_SCTX_2_TT_THINK_TIME_SET(x) (((x) & 0x3) << 16) +#define XHCI_SCTX_2_TT_THINK_TIME_GET(x) (((x) >> 16) & 0x3) +#define XHCI_SCTX_2_IRQ_TARGET_SET(x) (((x) & 0x3FF) << 22) +#define XHCI_SCTX_2_IRQ_TARGET_GET(x) (((x) >> 22) & 0x3FF) + volatile uint32_t dwSctx3; +#define XHCI_SCTX_3_DEV_ADDR_SET(x) ((x) & 0xFF) +#define XHCI_SCTX_3_DEV_ADDR_GET(x) ((x) & 0xFF) +#define XHCI_SCTX_3_SLOT_STATE_SET(x) (((x) & 0x1F) << 27) +#define XHCI_SCTX_3_SLOT_STATE_GET(x) (((x) >> 27) & 0x1F) + volatile uint32_t dwSctx4; + volatile uint32_t dwSctx5; + volatile uint32_t dwSctx6; + volatile uint32_t dwSctx7; +}; + +struct xhci_endp_ctx { + volatile 
uint32_t dwEpCtx0; +#define XHCI_EPCTX_0_EPSTATE_SET(x) ((x) & 0x7) +#define XHCI_EPCTX_0_EPSTATE_GET(x) ((x) & 0x7) +#define XHCI_EPCTX_0_MULT_SET(x) (((x) & 0x3) << 8) +#define XHCI_EPCTX_0_MULT_GET(x) (((x) >> 8) & 0x3) +#define XHCI_EPCTX_0_MAXP_STREAMS_SET(x) (((x) & 0x1F) << 10) +#define XHCI_EPCTX_0_MAXP_STREAMS_GET(x) (((x) >> 10) & 0x1F) +#define XHCI_EPCTX_0_LSA_SET(x) (((x) & 0x1) << 15) +#define XHCI_EPCTX_0_LSA_GET(x) (((x) >> 15) & 0x1) +#define XHCI_EPCTX_0_IVAL_SET(x) (((x) & 0xFF) << 16) +#define XHCI_EPCTX_0_IVAL_GET(x) (((x) >> 16) & 0xFF) + volatile uint32_t dwEpCtx1; +#define XHCI_EPCTX_1_CERR_SET(x) (((x) & 0x3) << 1) +#define XHCI_EPCTX_1_CERR_GET(x) (((x) >> 1) & 0x3) +#define XHCI_EPCTX_1_EPTYPE_SET(x) (((x) & 0x7) << 3) +#define XHCI_EPCTX_1_EPTYPE_GET(x) (((x) >> 3) & 0x7) +#define XHCI_EPCTX_1_HID_SET(x) (((x) & 0x1) << 7) +#define XHCI_EPCTX_1_HID_GET(x) (((x) >> 7) & 0x1) +#define XHCI_EPCTX_1_MAXB_SET(x) (((x) & 0xFF) << 8) +#define XHCI_EPCTX_1_MAXB_GET(x) (((x) >> 8) & 0xFF) +#define XHCI_EPCTX_1_MAXP_SIZE_SET(x) (((x) & 0xFFFF) << 16) +#define XHCI_EPCTX_1_MAXP_SIZE_GET(x) (((x) >> 16) & 0xFFFF) + volatile uint64_t qwEpCtx2; +#define XHCI_EPCTX_2_DCS_SET(x) ((x) & 0x1) +#define XHCI_EPCTX_2_DCS_GET(x) ((x) & 0x1) +#define XHCI_EPCTX_2_TR_DQ_PTR_MASK 0xFFFFFFFFFFFFFFF0U + volatile uint32_t dwEpCtx4; +#define XHCI_EPCTX_4_AVG_TRB_LEN_SET(x) ((x) & 0xFFFF) +#define XHCI_EPCTX_4_AVG_TRB_LEN_GET(x) ((x) & 0xFFFF) +#define XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_SET(x) (((x) & 0xFFFF) << 16) +#define XHCI_EPCTX_4_MAX_ESIT_PAYLOAD_GET(x) (((x) >> 16) & 0xFFFF) + volatile uint32_t dwEpCtx5; + volatile uint32_t dwEpCtx6; + volatile uint32_t dwEpCtx7; +}; + +struct xhci_input_ctx { +#define XHCI_INCTX_NON_CTRL_MASK 0xFFFFFFFCU + volatile uint32_t dwInCtx0; +#define XHCI_INCTX_0_DROP_MASK(n) (1U << (n)) + volatile uint32_t dwInCtx1; +#define XHCI_INCTX_1_ADD_MASK(n) (1U << (n)) + volatile uint32_t dwInCtx2; + volatile uint32_t dwInCtx3; + volatile uint32_t dwInCtx4; + volatile uint32_t dwInCtx5; + volatile uint32_t dwInCtx6; + volatile uint32_t dwInCtx7; +}; + +struct xhci_input_dev_ctx { + struct xhci_input_ctx ctx_input; + union { + struct xhci_slot_ctx u_slot; + struct xhci_endp_ctx u_ep[XHCI_MAX_ENDPOINTS]; + } ctx_dev_slep; +}; + +struct xhci_dev_ctx { + union { + struct xhci_slot_ctx u_slot; + struct xhci_endp_ctx u_ep[XHCI_MAX_ENDPOINTS]; + } ctx_dev_slep; +} __aligned(XHCI_DEV_CTX_ALIGN); +#define ctx_slot ctx_dev_slep.u_slot +#define ctx_ep ctx_dev_slep.u_ep + +struct xhci_stream_ctx { + volatile uint64_t qwSctx0; +#define XHCI_SCTX_0_DCS_GET(x) ((x) & 0x1) +#define XHCI_SCTX_0_DCS_SET(x) ((x) & 0x1) +#define XHCI_SCTX_0_SCT_SET(x) (((x) & 0x7) << 1) +#define XHCI_SCTX_0_SCT_GET(x) (((x) >> 1) & 0x7) +#define XHCI_SCTX_0_SCT_SEC_TR_RING 0x0 +#define XHCI_SCTX_0_SCT_PRIM_TR_RING 0x1 +#define XHCI_SCTX_0_SCT_PRIM_SSA_8 0x2 +#define XHCI_SCTX_0_SCT_PRIM_SSA_16 0x3 +#define XHCI_SCTX_0_SCT_PRIM_SSA_32 0x4 +#define XHCI_SCTX_0_SCT_PRIM_SSA_64 0x5 +#define XHCI_SCTX_0_SCT_PRIM_SSA_128 0x6 +#define XHCI_SCTX_0_SCT_PRIM_SSA_256 0x7 +#define XHCI_SCTX_0_TR_DQ_PTR_MASK 0xFFFFFFFFFFFFFFF0U + volatile uint32_t dwSctx2; + volatile uint32_t dwSctx3; +}; + +struct xhci_trb { + volatile uint64_t qwTrb0; +#define XHCI_TRB_0_DIR_IN_MASK (0x80ULL << 0) +#define XHCI_TRB_0_WLENGTH_MASK (0xFFFFULL << 48) + volatile uint32_t dwTrb2; +#define XHCI_TRB_2_ERROR_GET(x) (((x) >> 24) & 0xFF) +#define XHCI_TRB_2_ERROR_SET(x) (((x) & 0xFF) << 24) +#define XHCI_TRB_2_TDSZ_GET(x) (((x) >> 17) & 
0x1F) +#define XHCI_TRB_2_TDSZ_SET(x) (((x) & 0x1F) << 17) +#define XHCI_TRB_2_REM_GET(x) ((x) & 0xFFFFFF) +#define XHCI_TRB_2_REM_SET(x) ((x) & 0xFFFFFF) +#define XHCI_TRB_2_BYTES_GET(x) ((x) & 0x1FFFF) +#define XHCI_TRB_2_BYTES_SET(x) ((x) & 0x1FFFF) +#define XHCI_TRB_2_IRQ_GET(x) (((x) >> 22) & 0x3FF) +#define XHCI_TRB_2_IRQ_SET(x) (((x) & 0x3FF) << 22) +#define XHCI_TRB_2_STREAM_GET(x) (((x) >> 16) & 0xFFFF) +#define XHCI_TRB_2_STREAM_SET(x) (((x) & 0xFFFF) << 16) + + volatile uint32_t dwTrb3; +#define XHCI_TRB_3_TYPE_GET(x) (((x) >> 10) & 0x3F) +#define XHCI_TRB_3_TYPE_SET(x) (((x) & 0x3F) << 10) +#define XHCI_TRB_3_CYCLE_BIT (1U << 0) +#define XHCI_TRB_3_TC_BIT (1U << 1) /* command ring only */ +#define XHCI_TRB_3_ENT_BIT (1U << 1) /* transfer ring only */ +#define XHCI_TRB_3_ISP_BIT (1U << 2) +#define XHCI_TRB_3_ED_BIT (1U << 2) +#define XHCI_TRB_3_NSNOOP_BIT (1U << 3) +#define XHCI_TRB_3_CHAIN_BIT (1U << 4) +#define XHCI_TRB_3_IOC_BIT (1U << 5) +#define XHCI_TRB_3_IDT_BIT (1U << 6) +#define XHCI_TRB_3_TBC_GET(x) (((x) >> 7) & 3) +#define XHCI_TRB_3_TBC_SET(x) (((x) & 3) << 7) +#define XHCI_TRB_3_BEI_BIT (1U << 9) +#define XHCI_TRB_3_DCEP_BIT (1U << 9) +#define XHCI_TRB_3_PRSV_BIT (1U << 9) +#define XHCI_TRB_3_BSR_BIT (1U << 9) +#define XHCI_TRB_3_TRT_MASK (3U << 16) +#define XHCI_TRB_3_TRT_NONE (0U << 16) +#define XHCI_TRB_3_TRT_OUT (2U << 16) +#define XHCI_TRB_3_TRT_IN (3U << 16) +#define XHCI_TRB_3_DIR_IN (1U << 16) +#define XHCI_TRB_3_TLBPC_GET(x) (((x) >> 16) & 0xF) +#define XHCI_TRB_3_TLBPC_SET(x) (((x) & 0xF) << 16) +#define XHCI_TRB_3_EP_GET(x) (((x) >> 16) & 0x1F) +#define XHCI_TRB_3_EP_SET(x) (((x) & 0x1F) << 16) +#define XHCI_TRB_3_FRID_GET(x) (((x) >> 20) & 0x7FF) +#define XHCI_TRB_3_FRID_SET(x) (((x) & 0x7FF) << 20) +#define XHCI_TRB_3_ISO_SIA_BIT (1U << 31) +#define XHCI_TRB_3_SUSP_EP_BIT (1U << 23) +#define XHCI_TRB_3_SLOT_GET(x) (((x) >> 24) & 0xFF) +#define XHCI_TRB_3_SLOT_SET(x) (((x) & 0xFF) << 24) + +/* Commands */ +#define XHCI_TRB_TYPE_RESERVED 0x00 +#define XHCI_TRB_TYPE_NORMAL 0x01 +#define XHCI_TRB_TYPE_SETUP_STAGE 0x02 +#define XHCI_TRB_TYPE_DATA_STAGE 0x03 +#define XHCI_TRB_TYPE_STATUS_STAGE 0x04 +#define XHCI_TRB_TYPE_ISOCH 0x05 +#define XHCI_TRB_TYPE_LINK 0x06 +#define XHCI_TRB_TYPE_EVENT_DATA 0x07 +#define XHCI_TRB_TYPE_NOOP 0x08 +#define XHCI_TRB_TYPE_ENABLE_SLOT 0x09 +#define XHCI_TRB_TYPE_DISABLE_SLOT 0x0A +#define XHCI_TRB_TYPE_ADDRESS_DEVICE 0x0B +#define XHCI_TRB_TYPE_CONFIGURE_EP 0x0C +#define XHCI_TRB_TYPE_EVALUATE_CTX 0x0D +#define XHCI_TRB_TYPE_RESET_EP 0x0E +#define XHCI_TRB_TYPE_STOP_EP 0x0F +#define XHCI_TRB_TYPE_SET_TR_DEQUEUE 0x10 +#define XHCI_TRB_TYPE_RESET_DEVICE 0x11 +#define XHCI_TRB_TYPE_FORCE_EVENT 0x12 +#define XHCI_TRB_TYPE_NEGOTIATE_BW 0x13 +#define XHCI_TRB_TYPE_SET_LATENCY_TOL 0x14 +#define XHCI_TRB_TYPE_GET_PORT_BW 0x15 +#define XHCI_TRB_TYPE_FORCE_HEADER 0x16 +#define XHCI_TRB_TYPE_NOOP_CMD 0x17 + +/* Events */ +#define XHCI_TRB_EVENT_TRANSFER 0x20 +#define XHCI_TRB_EVENT_CMD_COMPLETE 0x21 +#define XHCI_TRB_EVENT_PORT_STS_CHANGE 0x22 +#define XHCI_TRB_EVENT_BW_REQUEST 0x23 +#define XHCI_TRB_EVENT_DOORBELL 0x24 +#define XHCI_TRB_EVENT_HOST_CTRL 0x25 +#define XHCI_TRB_EVENT_DEVICE_NOTIFY 0x26 +#define XHCI_TRB_EVENT_MFINDEX_WRAP 0x27 + +/* Error codes */ +#define XHCI_TRB_ERROR_INVALID 0x00 +#define XHCI_TRB_ERROR_SUCCESS 0x01 +#define XHCI_TRB_ERROR_DATA_BUF 0x02 +#define XHCI_TRB_ERROR_BABBLE 0x03 +#define XHCI_TRB_ERROR_XACT 0x04 +#define XHCI_TRB_ERROR_TRB 0x05 +#define XHCI_TRB_ERROR_STALL 0x06 +#define 
XHCI_TRB_ERROR_RESOURCE 0x07 +#define XHCI_TRB_ERROR_BANDWIDTH 0x08 +#define XHCI_TRB_ERROR_NO_SLOTS 0x09 +#define XHCI_TRB_ERROR_STREAM_TYPE 0x0A +#define XHCI_TRB_ERROR_SLOT_NOT_ON 0x0B +#define XHCI_TRB_ERROR_ENDP_NOT_ON 0x0C +#define XHCI_TRB_ERROR_SHORT_PKT 0x0D +#define XHCI_TRB_ERROR_RING_UNDERRUN 0x0E +#define XHCI_TRB_ERROR_RING_OVERRUN 0x0F +#define XHCI_TRB_ERROR_VF_RING_FULL 0x10 +#define XHCI_TRB_ERROR_PARAMETER 0x11 +#define XHCI_TRB_ERROR_BW_OVERRUN 0x12 +#define XHCI_TRB_ERROR_CONTEXT_STATE 0x13 +#define XHCI_TRB_ERROR_NO_PING_RESP 0x14 +#define XHCI_TRB_ERROR_EV_RING_FULL 0x15 +#define XHCI_TRB_ERROR_INCOMPAT_DEV 0x16 +#define XHCI_TRB_ERROR_MISSED_SERVICE 0x17 +#define XHCI_TRB_ERROR_CMD_RING_STOP 0x18 +#define XHCI_TRB_ERROR_CMD_ABORTED 0x19 +#define XHCI_TRB_ERROR_STOPPED 0x1A +#define XHCI_TRB_ERROR_LENGTH 0x1B +#define XHCI_TRB_ERROR_BAD_MELAT 0x1D +#define XHCI_TRB_ERROR_ISOC_OVERRUN 0x1F +#define XHCI_TRB_ERROR_EVENT_LOST 0x20 +#define XHCI_TRB_ERROR_UNDEFINED 0x21 +#define XHCI_TRB_ERROR_INVALID_SID 0x22 +#define XHCI_TRB_ERROR_SEC_BW 0x23 +#define XHCI_TRB_ERROR_SPLIT_XACT 0x24 +} __aligned(4); + +struct xhci_dev_endpoint_trbs { + struct xhci_trb trb[(XHCI_MAX_STREAMS * + XHCI_MAX_TRANSFERS) + XHCI_MAX_STREAMS]; +}; + +struct xhci_event_ring_seg { + volatile uint64_t qwEvrsTablePtr; + volatile uint32_t dwEvrsTableSize; + volatile uint32_t dwEvrsReserved; +}; + +#endif /* _PCI_XHCI_H_ */ diff --git a/usr/src/cmd/bhyve/pm.c b/usr/src/cmd/bhyve/pm.c index 70c4f1fae8..be188b79f2 100644 --- a/usr/src/cmd/bhyve/pm.c +++ b/usr/src/cmd/bhyve/pm.c @@ -1,5 +1,7 @@ /*- - * Copyright (c) 2013 Advanced Computing Technologies LLC + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Hudson River Trading LLC * Written by: John H. Baldwin <jhb@FreeBSD.org> * All rights reserved. * @@ -24,14 +26,18 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ +/* + * Copyright 2018 Joyent, Inc. + */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pm.c 266125 2014-05-15 14:16:55Z jhb $"); +__FBSDID("$FreeBSD$"); #include <sys/types.h> #include <machine/vmm.h> #include <assert.h> +#include <errno.h> #include <pthread.h> #ifndef __FreeBSD__ #include <stdlib.h> @@ -51,6 +57,8 @@ static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER; #ifdef __FreeBSD__ static struct mevent *power_button; static sig_t old_power_handler; +#else +struct vmctx *pwr_ctx; #endif /* @@ -63,6 +71,8 @@ static int reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { + int error; + static uint8_t reset_control; if (bytes != 1) @@ -74,12 +84,8 @@ reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, /* Treat hard and soft resets the same. */ if (reset_control & 0x4) { -#ifdef __FreeBSD__ error = vm_suspend(ctx, VM_SUSPEND_RESET); assert(error == 0 || errno == EALREADY); -#else - exit(0); -#endif } } return (0); @@ -220,6 +226,34 @@ power_button_handler(int signal, enum ev_type type, void *arg) } pthread_mutex_unlock(&pm_lock); } + +#else +/* + * Initiate graceful power off. + */ +/*ARGSUSED*/ +static void +power_button_handler(int signal, siginfo_t *type, void *cp) +{ + /* + * In theory, taking the 'pm_lock' mutex from within this signal + * handler could lead to deadlock if the main thread already held this + * mutex. 
In reality, this mutex is local to this file and all of the + * other usage in this file only occurs in functions which are FreeBSD + * specific (and thus currently not used). Thus, for consistency with + * the other code in this file, we take the mutex, but in the future, + * if these other functions are ever enabled for use on non-FreeBSD + * systems and these functions could be called directly by a thread + * (which would then hold the mutex), then we need to revisit the use + * of this mutex in this signal handler. + */ + pthread_mutex_lock(&pm_lock); + if (!(pm1_status & PM1_PWRBTN_STS)) { + pm1_status |= PM1_PWRBTN_STS; + sci_update(pwr_ctx); + } + pthread_mutex_unlock(&pm_lock); +} #endif /* @@ -239,6 +273,7 @@ static int pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, uint32_t *eax, void *arg) { + int error; if (bytes != 2) return (-1); @@ -259,12 +294,8 @@ pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, */ if (*eax & PM1_SLP_EN) { if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) { -#ifdef __FreeBSD__ error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); assert(error == 0 || errno == EALREADY); -#else - exit(0); -#endif } } } @@ -330,4 +361,18 @@ sci_init(struct vmctx *ctx) */ pci_irq_use(SCI_INT); vm_isa_set_irq_trigger(ctx, SCI_INT, LEVEL_TRIGGER); + +#ifndef __FreeBSD__ + { + /* + * Install SIGTERM signal handler for graceful power off. + */ + struct sigaction act; + + pwr_ctx = ctx; + act.sa_flags = 0; + act.sa_sigaction = power_button_handler; + (void) sigaction(SIGTERM, &act, NULL); + } +#endif } diff --git a/usr/src/cmd/bhyve/pmtmr.c b/usr/src/cmd/bhyve/pmtmr.c deleted file mode 100644 index 92ab24be57..0000000000 --- a/usr/src/cmd/bhyve/pmtmr.c +++ /dev/null @@ -1,212 +0,0 @@ -/*- - * Copyright (c) 2012 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: head/usr.sbin/bhyve/pmtmr.c 259998 2013-12-28 04:01:05Z jhb $ - */ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. 
A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * Copyright 2014 Pluribus Networks Inc. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pmtmr.c 259998 2013-12-28 04:01:05Z jhb $"); - -#include <sys/types.h> -#include <sys/sysctl.h> -#include <sys/time.h> -#include <machine/cpufunc.h> - -#include <stdio.h> -#include <stdlib.h> -#include <time.h> -#include <assert.h> -#include <pthread.h> -#ifndef __FreeBSD__ -#include <kstat.h> -#endif - -#include "acpi.h" -#include "inout.h" - -/* - * The ACPI Power Management timer is a free-running 24- or 32-bit - * timer with a frequency of 3.579545MHz - * - * This implementation will be 32-bits - */ - -#define PMTMR_FREQ 3579545 /* 3.579545MHz */ - -static pthread_mutex_t pmtmr_mtx; -static pthread_once_t pmtmr_once = PTHREAD_ONCE_INIT; - -static uint64_t pmtmr_old; - -static uint64_t pmtmr_tscf; -static uint64_t pmtmr_tsc_old; - -#ifdef __FreeBSD__ -static clockid_t clockid = CLOCK_UPTIME_FAST; -static struct timespec pmtmr_uptime_old; - -#define timespecsub(vvp, uvp) \ - do { \ - (vvp)->tv_sec -= (uvp)->tv_sec; \ - (vvp)->tv_nsec -= (uvp)->tv_nsec; \ - if ((vvp)->tv_nsec < 0) { \ - (vvp)->tv_sec--; \ - (vvp)->tv_nsec += 1000000000; \ - } \ - } while (0) - -static uint64_t -timespec_to_pmtmr(const struct timespec *tsnew, const struct timespec *tsold) -{ - struct timespec tsdiff; - int64_t nsecs; - - tsdiff = *tsnew; - timespecsub(&tsdiff, tsold); - nsecs = tsdiff.tv_sec * 1000000000 + tsdiff.tv_nsec; - assert(nsecs >= 0); - - return (nsecs * PMTMR_FREQ / 1000000000 + pmtmr_old); -} -#endif - -static uint64_t -tsc_to_pmtmr(uint64_t tsc_new, uint64_t tsc_old) -{ - - return ((tsc_new - tsc_old) * PMTMR_FREQ / pmtmr_tscf + pmtmr_old); -} - -static void -pmtmr_init(void) -{ -#ifdef __FreeBSD__ - size_t len; - int smp_tsc, err; - struct timespec tsnew, tsold = { 0 }; - - len = sizeof(smp_tsc); - err = sysctlbyname("kern.timecounter.smp_tsc", &smp_tsc, &len, NULL, 0); - assert(err == 0); - - if (smp_tsc) { - len = sizeof(pmtmr_tscf); - err = sysctlbyname("machdep.tsc_freq", &pmtmr_tscf, &len, - NULL, 0); - assert(err == 0); - - pmtmr_tsc_old = rdtsc(); - pmtmr_old = tsc_to_pmtmr(pmtmr_tsc_old, 0); - } else { - if (getenv("BHYVE_PMTMR_PRECISE") != NULL) - clockid = CLOCK_UPTIME; - - err = clock_gettime(clockid, &tsnew); - assert(err == 0); - - pmtmr_uptime_old = tsnew; - pmtmr_old = timespec_to_pmtmr(&tsnew, &tsold); - } -#else - kstat_ctl_t *kstat_ctl; - kstat_t *kstat; - kstat_named_t *kstat_cpu_freq; - - kstat_ctl = kstat_open(); - kstat = kstat_lookup(kstat_ctl, "cpu_info", 0, NULL); - kstat_read(kstat_ctl, kstat, NULL); - kstat_cpu_freq = kstat_data_lookup(kstat, "current_clock_Hz"); - pmtmr_tscf = kstat_cpu_freq->value.ul; - kstat_close(kstat_ctl); - - pmtmr_tsc_old = rdtsc(); - pmtmr_old = tsc_to_pmtmr(pmtmr_tsc_old, 0); -#endif - pthread_mutex_init(&pmtmr_mtx, NULL); -} - -static uint32_t -pmtmr_val(void) -{ - struct timespec tsnew; - uint64_t pmtmr_tsc_new; - uint64_t pmtmr_new; - int error; - - pthread_once(&pmtmr_once, pmtmr_init); - - pthread_mutex_lock(&pmtmr_mtx); - -#ifdef __FreeBSD__ - if (pmtmr_tscf) { - pmtmr_tsc_new = rdtsc(); - pmtmr_new = tsc_to_pmtmr(pmtmr_tsc_new, pmtmr_tsc_old); - pmtmr_tsc_old = pmtmr_tsc_new; - } else { - error = clock_gettime(clockid, &tsnew); - assert(error == 0); - - pmtmr_new = timespec_to_pmtmr(&tsnew, &pmtmr_uptime_old); - pmtmr_uptime_old = tsnew; - } -#else - pmtmr_tsc_new = rdtsc(); - pmtmr_new = tsc_to_pmtmr(pmtmr_tsc_new, 
pmtmr_tsc_old); - pmtmr_tsc_old = pmtmr_tsc_new; -#endif - pmtmr_old = pmtmr_new; - - pthread_mutex_unlock(&pmtmr_mtx); - - return (pmtmr_new); -} - -static int -pmtmr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) -{ - assert(in == 1); - - if (bytes != 4) - return (-1); - - *eax = pmtmr_val(); - - return (0); -} - -INOUT_PORT(pmtmr, IO_PMTMR, IOPORT_F_IN, pmtmr_handler); diff --git a/usr/src/cmd/bhyve/post.c b/usr/src/cmd/bhyve/post.c index dcb481aac4..d3040a8df7 100644 --- a/usr/src/cmd/bhyve/post.c +++ b/usr/src/cmd/bhyve/post.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/post.c 260206 2014-01-02 21:26:59Z jhb $ + * $FreeBSD$ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/post.c 260206 2014-01-02 21:26:59Z jhb $"); +__FBSDID("$FreeBSD$"); #include <sys/types.h> diff --git a/usr/src/cmd/bhyve/ps2kbd.c b/usr/src/cmd/bhyve/ps2kbd.c index 22e566ac21..5453a26949 100644 --- a/usr/src/cmd/bhyve/ps2kbd.c +++ b/usr/src/cmd/bhyve/ps2kbd.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. @@ -74,6 +76,107 @@ struct ps2kbd_softc { uint8_t curcmd; /* current command for next byte */ }; +#define SCANCODE_E0_PREFIX 1 +struct extended_translation { + uint32_t keysym; + uint8_t scancode; + int flags; +}; + +/* + * FIXME: Pause/break and Print Screen/SysRq require special handling. + */ +static const struct extended_translation extended_translations[] = { + {0xff08, 0x66}, /* Back space */ + {0xff09, 0x0d}, /* Tab */ + {0xff0d, 0x5a}, /* Return */ + {0xff1b, 0x76}, /* Escape */ + {0xff50, 0x6c, SCANCODE_E0_PREFIX}, /* Home */ + {0xff51, 0x6b, SCANCODE_E0_PREFIX}, /* Left arrow */ + {0xff52, 0x75, SCANCODE_E0_PREFIX}, /* Up arrow */ + {0xff53, 0x74, SCANCODE_E0_PREFIX}, /* Right arrow */ + {0xff54, 0x72, SCANCODE_E0_PREFIX}, /* Down arrow */ + {0xff55, 0x7d, SCANCODE_E0_PREFIX}, /* PgUp */ + {0xff56, 0x7a, SCANCODE_E0_PREFIX}, /* PgDown */ + {0xff57, 0x69, SCANCODE_E0_PREFIX}, /* End */ + {0xff63, 0x70, SCANCODE_E0_PREFIX}, /* Ins */ + {0xff8d, 0x5a, SCANCODE_E0_PREFIX}, /* Keypad Enter */ + {0xffe1, 0x12}, /* Left shift */ + {0xffe2, 0x59}, /* Right shift */ + {0xffe3, 0x14}, /* Left control */ + {0xffe4, 0x14, SCANCODE_E0_PREFIX}, /* Right control */ + /* {0xffe7, XXX}, Left meta */ + /* {0xffe8, XXX}, Right meta */ + {0xffe9, 0x11}, /* Left alt */ + {0xfe03, 0x11, SCANCODE_E0_PREFIX}, /* AltGr */ + {0xffea, 0x11, SCANCODE_E0_PREFIX}, /* Right alt */ + {0xffeb, 0x1f, SCANCODE_E0_PREFIX}, /* Left Windows */ + {0xffec, 0x27, SCANCODE_E0_PREFIX}, /* Right Windows */ + {0xffbe, 0x05}, /* F1 */ + {0xffbf, 0x06}, /* F2 */ + {0xffc0, 0x04}, /* F3 */ + {0xffc1, 0x0c}, /* F4 */ + {0xffc2, 0x03}, /* F5 */ + {0xffc3, 0x0b}, /* F6 */ + {0xffc4, 0x83}, /* F7 */ + {0xffc5, 0x0a}, /* F8 */ + {0xffc6, 0x01}, /* F9 */ + {0xffc7, 0x09}, /* F10 */ + {0xffc8, 0x78}, /* F11 */ + {0xffc9, 0x07}, /* F12 */ + {0xffff, 0x71, SCANCODE_E0_PREFIX}, /* Del */ + {0xff14, 0x7e}, /* ScrollLock */ + /* NumLock and Keypads*/ + {0xff7f, 0x77}, /* NumLock */ + {0xffaf, 0x4a, SCANCODE_E0_PREFIX}, /* Keypad slash */ + {0xffaa, 0x7c}, /* Keypad asterisk */ + {0xffad, 0x7b}, /* 
Keypad minus */ + {0xffab, 0x79}, /* Keypad plus */ + {0xffb7, 0x6c}, /* Keypad 7 */ + {0xff95, 0x6c}, /* Keypad home */ + {0xffb8, 0x75}, /* Keypad 8 */ + {0xff97, 0x75}, /* Keypad up arrow */ + {0xffb9, 0x7d}, /* Keypad 9 */ + {0xff9a, 0x7d}, /* Keypad PgUp */ + {0xffb4, 0x6b}, /* Keypad 4 */ + {0xff96, 0x6b}, /* Keypad left arrow */ + {0xffb5, 0x73}, /* Keypad 5 */ + {0xff9d, 0x73}, /* Keypad empty */ + {0xffb6, 0x74}, /* Keypad 6 */ + {0xff98, 0x74}, /* Keypad right arrow */ + {0xffb1, 0x69}, /* Keypad 1 */ + {0xff9c, 0x69}, /* Keypad end */ + {0xffb2, 0x72}, /* Keypad 2 */ + {0xff99, 0x72}, /* Keypad down arrow */ + {0xffb3, 0x7a}, /* Keypad 3 */ + {0xff9b, 0x7a}, /* Keypad PgDown */ + {0xffb0, 0x70}, /* Keypad 0 */ + {0xff9e, 0x70}, /* Keypad ins */ + {0xffae, 0x71}, /* Keypad . */ + {0xff9f, 0x71}, /* Keypad del */ + {0, 0, 0} /* Terminator */ +}; + +/* ASCII to type 2 scancode lookup table */ +static const uint8_t ascii_translations[128] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x29, 0x16, 0x52, 0x26, 0x25, 0x2e, 0x3d, 0x52, + 0x46, 0x45, 0x3e, 0x55, 0x41, 0x4e, 0x49, 0x4a, + 0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d, + 0x3e, 0x46, 0x4c, 0x4c, 0x41, 0x55, 0x49, 0x4a, + 0x1e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, + 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, + 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x36, 0x4e, + 0x0e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, + 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, + 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, + 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x0e, 0x00, +}; + static void fifo_init(struct ps2kbd_softc *sc) { @@ -93,15 +196,6 @@ fifo_reset(struct ps2kbd_softc *sc) fifo->size = sizeof(((struct fifo *)0)->buf); } -static int -fifo_available(struct ps2kbd_softc *sc) -{ - struct fifo *fifo; - - fifo = &sc->fifo; - return (fifo->num < fifo->size); -} - static void fifo_put(struct ps2kbd_softc *sc, uint8_t val) { @@ -166,6 +260,9 @@ ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val) sc->curcmd = 0; } else { switch (val) { + case 0x00: + fifo_put(sc, PS2KC_ACK); + break; case PS2KC_RESET_DEV: fifo_reset(sc); fifo_put(sc, PS2KC_ACK); @@ -216,190 +313,57 @@ static void ps2kbd_keysym_queue(struct ps2kbd_softc *sc, int down, uint32_t keysym) { - /* ASCII to type 2 scancode lookup table */ - const uint8_t translation[128] = { - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x29, 0x16, 0x52, 0x26, 0x25, 0x2e, 0x3d, 0x52, - 0x46, 0x45, 0x3e, 0x55, 0x41, 0x4e, 0x49, 0x4a, - 0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d, - 0x3e, 0x46, 0x4c, 0x4c, 0x41, 0x55, 0x49, 0x4a, - 0x1e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, - 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, - 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, - 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x36, 0x4e, - 0x0e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34, - 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44, - 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d, - 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x0e, 0x00, - }; - assert(pthread_mutex_isowned_np(&sc->mtx)); + int e0_prefix, found; + uint8_t code; + const struct extended_translation *trans; + + found = 0; + if (keysym < 0x80) { + code = ascii_translations[keysym]; + 
e0_prefix = 0; + found = 1; + } else { + for (trans = &(extended_translations[0]); trans->keysym != 0; + trans++) { + if (keysym == trans->keysym) { + code = trans->scancode; + e0_prefix = trans->flags & SCANCODE_E0_PREFIX; + found = 1; + break; + } + } + } - switch (keysym) { - case 0x0 ... 0x7f: - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, translation[keysym]); - break; - case 0xff08: /* Back space */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x66); - break; - case 0xff09: /* Tab */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x0d); - break; - case 0xff0d: /* Return */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x5a); - break; - case 0xff1b: /* Escape */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x76); - break; - case 0xff51: /* Left arrow */ - fifo_put(sc, 0xe0); - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x6b); - break; - case 0xff52: /* Up arrow */ - fifo_put(sc, 0xe0); - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x75); - break; - case 0xff53: /* Right arrow */ - fifo_put(sc, 0xe0); - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x74); - break; - case 0xff54: /* Down arrow */ - fifo_put(sc, 0xe0); - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x72); - break; - case 0xffbe: /* F1 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x05); - break; - case 0xffbf: /* F2 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x06); - break; - case 0xffc0: /* F3 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x04); - break; - case 0xffc1: /* F4 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x0c); - break; - case 0xffc2: /* F5 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x03); - break; - case 0xffc3: /* F6 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x0b); - break; - case 0xffc4: /* F7 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x83); - break; - case 0xffc5: /* F8 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x0a); - break; - case 0xffc6: /* F9 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x01); - break; - case 0xffc7: /* F10 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x09); - break; - case 0xffc8: /* F11 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x78); - break; - case 0xffc9: /* F12 */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x07); - break; - case 0xffe1: /* Left shift */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x12); - break; - case 0xffe2: /* Right shift */ - /* XXX */ - break; - case 0xffe3: /* Left control */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x14); - break; - case 0xffe4: /* Right control */ - /* XXX */ - break; - case 0xffe7: /* Left meta */ - /* XXX */ - break; - case 0xffe8: /* Right meta */ - /* XXX */ - break; - case 0xffe9: /* Left alt */ - if (!down) - fifo_put(sc, 0xf0); - fifo_put(sc, 0x11); - break; - case 0xffea: /* Right alt */ - /* XXX */ - break; - default: - fprintf(stderr, "Unhandled ps2 keyboard keysym 0x%x\n", - keysym); - break; + if (!found) { + fprintf(stderr, "Unhandled ps2 keyboard keysym 0x%x\n", keysym); + return; } + + if (e0_prefix) + fifo_put(sc, 0xe0); + if (!down) + fifo_put(sc, 0xf0); + fifo_put(sc, code); } static void ps2kbd_event(int down, uint32_t keysym, void *arg) { struct ps2kbd_softc *sc = arg; + int fifo_full; pthread_mutex_lock(&sc->mtx); if (!sc->enabled) { pthread_mutex_unlock(&sc->mtx); return; } - + fifo_full = sc->fifo.num == PS2KBD_FIFOSZ; ps2kbd_keysym_queue(sc, down, keysym); pthread_mutex_unlock(&sc->mtx); - atkbdc_event(sc->atkbdc_sc); + if (!fifo_full) + 
atkbdc_event(sc->atkbdc_sc, 1); } struct ps2kbd_softc * @@ -412,7 +376,8 @@ ps2kbd_init(struct atkbdc_softc *atkbdc_sc) fifo_init(sc); sc->atkbdc_sc = atkbdc_sc; - console_kbd_register(ps2kbd_event, sc); + console_kbd_register(ps2kbd_event, sc, 1); return (sc); } + diff --git a/usr/src/cmd/bhyve/ps2kbd.h b/usr/src/cmd/bhyve/ps2kbd.h index 34c31b1ea8..17be6d0466 100644 --- a/usr/src/cmd/bhyve/ps2kbd.h +++ b/usr/src/cmd/bhyve/ps2kbd.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * diff --git a/usr/src/cmd/bhyve/ps2mouse.c b/usr/src/cmd/bhyve/ps2mouse.c index e96fbbf411..b2e08262b1 100644 --- a/usr/src/cmd/bhyve/ps2mouse.c +++ b/usr/src/cmd/bhyve/ps2mouse.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2015 Nahanni Systems Inc. * All rights reserved. @@ -62,6 +64,16 @@ __FBSDID("$FreeBSD$"); /* mouse device id */ #define PS2MOUSE_DEV_ID 0x0 +/* mouse data bits */ +#define PS2M_DATA_Y_OFLOW 0x80 +#define PS2M_DATA_X_OFLOW 0x40 +#define PS2M_DATA_Y_SIGN 0x20 +#define PS2M_DATA_X_SIGN 0x10 +#define PS2M_DATA_AONE 0x08 +#define PS2M_DATA_MID_BUTTON 0x04 +#define PS2M_DATA_RIGHT_BUTTON 0x02 +#define PS2M_DATA_LEFT_BUTTON 0x01 + /* mouse status bits */ #define PS2M_STS_REMOTE_MODE 0x40 #define PS2M_STS_ENABLE_DEV 0x20 @@ -87,6 +99,7 @@ struct ps2mouse_softc { uint8_t status; uint8_t resolution; uint8_t sampling_rate; + int ctrlenable; struct fifo fifo; uint8_t curcmd; /* current command for next byte */ @@ -168,19 +181,20 @@ movement_get(struct ps2mouse_softc *sc) assert(pthread_mutex_isowned_np(&sc->mtx)); - val0 = sc->status & (PS2M_STS_LEFT_BUTTON | - PS2M_STS_RIGHT_BUTTON | PS2M_STS_MID_BUTTON); + val0 = PS2M_DATA_AONE; + val0 |= sc->status & (PS2M_DATA_LEFT_BUTTON | + PS2M_DATA_RIGHT_BUTTON | PS2M_DATA_MID_BUTTON); if (sc->delta_x >= 0) { if (sc->delta_x > 255) { - val0 |= (1 << 6); + val0 |= PS2M_DATA_X_OFLOW; val1 = 255; } else val1 = sc->delta_x; } else { - val0 |= (1 << 4); + val0 |= PS2M_DATA_X_SIGN; if (sc->delta_x < -255) { - val0 |= (1 << 6); + val0 |= PS2M_DATA_X_OFLOW; val1 = 255; } else val1 = sc->delta_x; @@ -189,23 +203,25 @@ movement_get(struct ps2mouse_softc *sc) if (sc->delta_y >= 0) { if (sc->delta_y > 255) { - val0 |= (1 << 7); + val0 |= PS2M_DATA_Y_OFLOW; val2 = 255; } else val2 = sc->delta_y; } else { - val0 |= (1 << 5); + val0 |= PS2M_DATA_Y_SIGN; if (sc->delta_y < -255) { - val0 |= (1 << 7); + val0 |= PS2M_DATA_Y_OFLOW; val2 = 255; } else val2 = sc->delta_y; } sc->delta_y = 0; - fifo_put(sc, val0); - fifo_put(sc, val1); - fifo_put(sc, val2); + if (sc->fifo.num < (sc->fifo.size - 3)) { + fifo_put(sc, val0); + fifo_put(sc, val1); + fifo_put(sc, val2); + } } static void @@ -214,7 +230,7 @@ ps2mouse_reset(struct ps2mouse_softc *sc) assert(pthread_mutex_isowned_np(&sc->mtx)); fifo_reset(sc); movement_reset(sc); - sc->status = 0x8; + sc->status = PS2M_STS_ENABLE_DEV; sc->resolution = 4; sc->sampling_rate = 100; @@ -236,10 +252,32 @@ ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val) return (retval); } +int +ps2mouse_fifocnt(struct ps2mouse_softc *sc) +{ + return (sc->fifo.num); +} + void -ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val) +ps2mouse_toggle(struct ps2mouse_softc *sc, int enable) { pthread_mutex_lock(&sc->mtx); + if (enable) + sc->ctrlenable = 1; + else { + sc->ctrlenable = 0; + sc->fifo.rindex = 0; + 
sc->fifo.windex = 0; + sc->fifo.num = 0; + } + pthread_mutex_unlock(&sc->mtx); +} + +void +ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert) +{ + pthread_mutex_lock(&sc->mtx); + fifo_reset(sc); if (sc->curcmd) { switch (sc->curcmd) { case PS2MC_SET_SAMPLING_RATE: @@ -256,8 +294,14 @@ ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val) break; } sc->curcmd = 0; + + } else if (insert) { + fifo_put(sc, val); } else { switch (val) { + case 0x00: + fifo_put(sc, PS2MC_ACK); + break; case PS2MC_RESET_DEV: ps2mouse_reset(sc); fifo_put(sc, PS2MC_ACK); @@ -313,6 +357,7 @@ ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val) fifo_put(sc, PS2MC_ACK); break; default: + fifo_put(sc, PS2MC_ACK); fprintf(stderr, "Unhandled ps2 mouse command " "0x%02x\n", val); break; @@ -338,7 +383,7 @@ ps2mouse_event(uint8_t button, int x, int y, void *arg) if (button & (1 << 2)) sc->status |= PS2M_STS_RIGHT_BUTTON; - if ((sc->status & PS2M_STS_ENABLE_DEV) == 0) { + if ((sc->status & PS2M_STS_ENABLE_DEV) == 0 || !sc->ctrlenable) { /* no data reporting */ pthread_mutex_unlock(&sc->mtx); return; @@ -347,7 +392,8 @@ ps2mouse_event(uint8_t button, int x, int y, void *arg) movement_get(sc); pthread_mutex_unlock(&sc->mtx); - atkbdc_event(sc->atkbdc_sc); + if (sc->fifo.num > 0) + atkbdc_event(sc->atkbdc_sc, 0); } struct ps2mouse_softc * @@ -364,8 +410,9 @@ ps2mouse_init(struct atkbdc_softc *atkbdc_sc) ps2mouse_reset(sc); pthread_mutex_unlock(&sc->mtx); - console_ptr_register(ps2mouse_event, sc); + console_ptr_register(ps2mouse_event, sc, 1); return (sc); } + diff --git a/usr/src/cmd/bhyve/ps2mouse.h b/usr/src/cmd/bhyve/ps2mouse.h index 1a78934b98..59430b01e2 100644 --- a/usr/src/cmd/bhyve/ps2mouse.h +++ b/usr/src/cmd/bhyve/ps2mouse.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * @@ -34,6 +36,8 @@ struct atkbdc_softc; struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc); int ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val); -void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val); +void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val, int insert); +void ps2mouse_toggle(struct ps2mouse_softc *sc, int enable); +int ps2mouse_fifocnt(struct ps2mouse_softc *sc); #endif /* _PS2MOUSE_H_ */ diff --git a/usr/src/cmd/bhyve/rfb.c b/usr/src/cmd/bhyve/rfb.c index 0846316378..39ea1611f9 100644 --- a/usr/src/cmd/bhyve/rfb.c +++ b/usr/src/cmd/bhyve/rfb.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> - * Copyright (c) 2015 Nahanni Systems Inc. + * Copyright (c) 2015 Leon Dang + * Copyright 2018 Joyent, Inc. * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without @@ -28,30 +31,91 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include <sys/param.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#endif +#include <sys/endian.h> #include <sys/socket.h> +#include <sys/select.h> +#include <sys/time.h> +#include <arpa/inet.h> +#include <machine/cpufunc.h> +#include <machine/specialreg.h> #include <netinet/in.h> +#include <netdb.h> #include <assert.h> +#ifndef WITHOUT_CAPSICUM +#include <capsicum_helpers.h> +#endif +#include <err.h> +#include <errno.h> #include <pthread.h> +#include <pthread_np.h> #include <signal.h> #include <stdbool.h> #include <stdlib.h> #include <stdio.h> #include <string.h> +#include <sysexits.h> #include <unistd.h> +#include <zlib.h> + +#ifndef __FreeBSD__ +#include <sys/debug.h> +#endif + #include "bhyvegc.h" #include "console.h" #include "rfb.h" +#include "sockstream.h" + +#ifndef NO_OPENSSL +#include <openssl/des.h> +#endif + +static int rfb_debug = 0; +#define DPRINTF(params) if (rfb_debug) printf params +#define WPRINTF(params) printf params + +#define AUTH_LENGTH 16 +#define PASSWD_LENGTH 8 + +#define SECURITY_TYPE_NONE 1 +#define SECURITY_TYPE_VNC_AUTH 2 + +#define AUTH_FAILED_UNAUTH 1 +#define AUTH_FAILED_ERROR 2 struct rfb_softc { int sfd; pthread_t tid; + int cfd; + int width, height; - bool enc_raw_ok; - bool enc_resize_ok; + char *password; + + bool enc_raw_ok; + bool enc_zlib_ok; + bool enc_resize_ok; + + z_stream zstream; + uint8_t *zbuf; + int zbuflen; + + int conn_wait; + int sending; + pthread_mutex_t mtx; + pthread_cond_t cond; + + int hw_crc; + uint32_t *crc; /* WxH crc cells */ + uint32_t *crc_tmp; /* buffer to store single crc row */ + int crc_width, crc_height; }; struct rfb_pixfmt { @@ -82,8 +146,16 @@ struct rfb_pixfmt_msg { }; #define RFB_ENCODING_RAW 0 +#define RFB_ENCODING_ZLIB 6 #define RFB_ENCODING_RESIZE -223 +#define RFB_MAX_WIDTH 2000 +#define RFB_MAX_HEIGHT 1200 +#define RFB_ZLIB_BUFSZ RFB_MAX_WIDTH*RFB_MAX_HEIGHT*4 + +/* percentage changes to screen before sending the entire screen */ +#define RFB_SEND_ALL_THRESH 25 + struct rfb_enc_msg { uint8_t type; uint8_t pad; @@ -127,60 +199,65 @@ struct rfb_srvr_rect_hdr { uint32_t encoding; }; +struct rfb_cuttext_msg { + uint8_t type; + uint8_t padding[3]; + uint32_t length; +}; + + static void rfb_send_server_init_msg(int cfd) { struct bhyvegc_image *gc_image; struct rfb_srvr_info sinfo; - int len; gc_image = console_get_image(); - sinfo.width = ntohs(gc_image->width); - sinfo.height = ntohs(gc_image->height); + sinfo.width = htons(gc_image->width); + sinfo.height = htons(gc_image->height); sinfo.pixfmt.bpp = 32; sinfo.pixfmt.depth = 32; sinfo.pixfmt.bigendian = 0; sinfo.pixfmt.truecolor = 1; - sinfo.pixfmt.red_max = ntohs(255); - sinfo.pixfmt.green_max = ntohs(255); - sinfo.pixfmt.blue_max = ntohs(255); + sinfo.pixfmt.red_max = htons(255); + sinfo.pixfmt.green_max = htons(255); + sinfo.pixfmt.blue_max = htons(255); sinfo.pixfmt.red_shift = 16; sinfo.pixfmt.green_shift = 8; sinfo.pixfmt.blue_shift = 0; - sinfo.namelen = ntohl(strlen("bhyve")); - len = write(cfd, &sinfo, sizeof(sinfo)); - len = write(cfd, "bhyve", strlen("bhyve")); + sinfo.namelen = htonl(strlen("bhyve")); + (void)stream_write(cfd, &sinfo, sizeof(sinfo)); + (void)stream_write(cfd, "bhyve", strlen("bhyve")); } static void rfb_send_resize_update_msg(struct rfb_softc *rc, int cfd) { struct rfb_srvr_updt_msg supdt_msg; - struct rfb_srvr_rect_hdr srect_hdr; + struct rfb_srvr_rect_hdr srect_hdr; /* Number of 
rectangles: 1 */ supdt_msg.type = 0; supdt_msg.pad = 0; - supdt_msg.numrects = ntohs(1); - write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); + supdt_msg.numrects = htons(1); + stream_write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); /* Rectangle header */ - srect_hdr.x = ntohs(0); - srect_hdr.y = ntohs(0); - srect_hdr.width = ntohs(rc->width); - srect_hdr.height = ntohs(rc->height); - srect_hdr.encoding = ntohl(RFB_ENCODING_RESIZE); - write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); + srect_hdr.x = htons(0); + srect_hdr.y = htons(0); + srect_hdr.width = htons(rc->width); + srect_hdr.height = htons(rc->height); + srect_hdr.encoding = htonl(RFB_ENCODING_RESIZE); + stream_write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); } static void rfb_recv_set_pixfmt_msg(struct rfb_softc *rc, int cfd) { struct rfb_pixfmt_msg pixfmt_msg; - int len; - len = read(cfd, ((void *)&pixfmt_msg) + 1, sizeof(pixfmt_msg) - 1); + (void)stream_read(cfd, ((void *)&pixfmt_msg)+1, sizeof(pixfmt_msg)-1); } @@ -188,18 +265,22 @@ static void rfb_recv_set_encodings_msg(struct rfb_softc *rc, int cfd) { struct rfb_enc_msg enc_msg; - int len, i; + int i; uint32_t encoding; assert((sizeof(enc_msg) - 1) == 3); - len = read(cfd, ((void *)&enc_msg) + 1, sizeof(enc_msg) - 1); + (void)stream_read(cfd, ((void *)&enc_msg)+1, sizeof(enc_msg)-1); - for (i = 0; i < ntohs(enc_msg.numencs); i++) { - len = read(cfd, &encoding, sizeof(encoding)); - switch (ntohl(encoding)) { + for (i = 0; i < htons(enc_msg.numencs); i++) { + (void)stream_read(cfd, &encoding, sizeof(encoding)); + switch (htonl(encoding)) { case RFB_ENCODING_RAW: rc->enc_raw_ok = true; break; + case RFB_ENCODING_ZLIB: + rc->enc_zlib_ok = true; + deflateInit(&rc->zstream, Z_BEST_SPEED); + break; case RFB_ENCODING_RESIZE: rc->enc_resize_ok = true; break; @@ -207,88 +288,460 @@ rfb_recv_set_encodings_msg(struct rfb_softc *rc, int cfd) } } -static void -rfb_resize_update(struct rfb_softc *rc, int fd) +/* + * Calculate CRC32 using SSE4.2; Intel or AMD Bulldozer+ CPUs only + */ +static __inline uint32_t +fast_crc32(void *buf, int len, uint32_t crcval) +{ + uint32_t q = len / sizeof(uint32_t); + uint32_t *p = (uint32_t *)buf; + + while (q--) { + asm volatile ( + ".byte 0xf2, 0xf, 0x38, 0xf1, 0xf1;" + :"=S" (crcval) + :"0" (crcval), "c" (*p) + ); + p++; + } + + return (crcval); +} + + +static int +rfb_send_rect(struct rfb_softc *rc, int cfd, struct bhyvegc_image *gc, + int x, int y, int w, int h) +{ + struct rfb_srvr_updt_msg supdt_msg; + struct rfb_srvr_rect_hdr srect_hdr; + unsigned long zlen; + ssize_t nwrite, total; + int err; + uint32_t *p; + uint8_t *zbufp; + + /* + * Send a single rectangle of the given x, y, w h dimensions. 
+ */ + + /* Number of rectangles: 1 */ + supdt_msg.type = 0; + supdt_msg.pad = 0; + supdt_msg.numrects = htons(1); + nwrite = stream_write(cfd, &supdt_msg, + sizeof(struct rfb_srvr_updt_msg)); + if (nwrite <= 0) + return (nwrite); + + + /* Rectangle header */ + srect_hdr.x = htons(x); + srect_hdr.y = htons(y); + srect_hdr.width = htons(w); + srect_hdr.height = htons(h); + + h = y + h; + w *= sizeof(uint32_t); + if (rc->enc_zlib_ok) { + zbufp = rc->zbuf; + rc->zstream.total_in = 0; + rc->zstream.total_out = 0; + for (p = &gc->data[y * gc->width + x]; y < h; y++) { + rc->zstream.next_in = (Bytef *)p; + rc->zstream.avail_in = w; + rc->zstream.next_out = (Bytef *)zbufp; + rc->zstream.avail_out = RFB_ZLIB_BUFSZ + 16 - + rc->zstream.total_out; + rc->zstream.data_type = Z_BINARY; + + /* Compress with zlib */ + err = deflate(&rc->zstream, Z_SYNC_FLUSH); + if (err != Z_OK) { + WPRINTF(("zlib[rect] deflate err: %d\n", err)); + rc->enc_zlib_ok = false; + deflateEnd(&rc->zstream); + goto doraw; + } + zbufp = rc->zbuf + rc->zstream.total_out; + p += gc->width; + } + srect_hdr.encoding = htonl(RFB_ENCODING_ZLIB); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + zlen = htonl(rc->zstream.total_out); + nwrite = stream_write(cfd, &zlen, sizeof(uint32_t)); + if (nwrite <= 0) + return (nwrite); + return (stream_write(cfd, rc->zbuf, rc->zstream.total_out)); + } + +doraw: + + total = 0; + zbufp = rc->zbuf; + for (p = &gc->data[y * gc->width + x]; y < h; y++) { + memcpy(zbufp, p, w); + zbufp += w; + total += w; + p += gc->width; + } + + srect_hdr.encoding = htonl(RFB_ENCODING_RAW); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + total = stream_write(cfd, rc->zbuf, total); + + return (total); +} + +static int +rfb_send_all(struct rfb_softc *rc, int cfd, struct bhyvegc_image *gc) { struct rfb_srvr_updt_msg supdt_msg; struct rfb_srvr_rect_hdr srect_hdr; + ssize_t nwrite; + unsigned long zlen; + int err; + + /* + * Send the whole thing + */ /* Number of rectangles: 1 */ supdt_msg.type = 0; supdt_msg.pad = 0; - supdt_msg.numrects = ntohs(1); - write(fd, &supdt_msg, sizeof (struct rfb_srvr_updt_msg)); + supdt_msg.numrects = htons(1); + nwrite = stream_write(cfd, &supdt_msg, + sizeof(struct rfb_srvr_updt_msg)); + if (nwrite <= 0) + return (nwrite); /* Rectangle header */ - srect_hdr.x = ntohs(0); - srect_hdr.y = ntohs(0); - srect_hdr.width = ntohs(rc->width); - srect_hdr.height = ntohs(rc->height); - srect_hdr.encoding = ntohl(RFB_ENCODING_RESIZE); - write(fd, &srect_hdr, sizeof (struct rfb_srvr_rect_hdr)); + srect_hdr.x = 0; + srect_hdr.y = 0; + srect_hdr.width = htons(gc->width); + srect_hdr.height = htons(gc->height); + if (rc->enc_zlib_ok) { + rc->zstream.next_in = (Bytef *)gc->data; + rc->zstream.avail_in = gc->width * gc->height * + sizeof(uint32_t); + rc->zstream.next_out = (Bytef *)rc->zbuf; + rc->zstream.avail_out = RFB_ZLIB_BUFSZ + 16; + rc->zstream.data_type = Z_BINARY; + + rc->zstream.total_in = 0; + rc->zstream.total_out = 0; + + /* Compress with zlib */ + err = deflate(&rc->zstream, Z_SYNC_FLUSH); + if (err != Z_OK) { + WPRINTF(("zlib deflate err: %d\n", err)); + rc->enc_zlib_ok = false; + deflateEnd(&rc->zstream); + goto doraw; + } + + srect_hdr.encoding = htonl(RFB_ENCODING_ZLIB); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + zlen = htonl(rc->zstream.total_out); + nwrite = stream_write(cfd, 
&zlen, sizeof(uint32_t)); + if (nwrite <= 0) + return (nwrite); + return (stream_write(cfd, rc->zbuf, rc->zstream.total_out)); + } + +doraw: + srect_hdr.encoding = htonl(RFB_ENCODING_RAW); + nwrite = stream_write(cfd, &srect_hdr, + sizeof(struct rfb_srvr_rect_hdr)); + if (nwrite <= 0) + return (nwrite); + + nwrite = stream_write(cfd, gc->data, + gc->width * gc->height * sizeof(uint32_t)); + + return (nwrite); } +#define PIX_PER_CELL 32 +#define PIXCELL_SHIFT 5 +#define PIXCELL_MASK 0x1F + +static int +rfb_send_screen(struct rfb_softc *rc, int cfd, int all) +{ + struct bhyvegc_image *gc_image; + ssize_t nwrite; + int x, y; + int celly, cellwidth; + int xcells, ycells; + int w, h; + uint32_t *p; + int rem_x, rem_y; /* remainder for resolutions not x32 pixels ratio */ + int retval; + uint32_t *crc_p, *orig_crc; + int changes; + + console_refresh(); + gc_image = console_get_image(); + + pthread_mutex_lock(&rc->mtx); + if (rc->sending) { + pthread_mutex_unlock(&rc->mtx); + return (1); + } + rc->sending = 1; + pthread_mutex_unlock(&rc->mtx); + + retval = 0; + + if (all) { + retval = rfb_send_all(rc, cfd, gc_image); + goto done; + } + + /* + * Calculate the checksum for each 32x32 cell. Send each that + * has changed since the last scan. + */ + + /* Resolution changed */ + + rc->crc_width = gc_image->width; + rc->crc_height = gc_image->height; + + w = rc->crc_width; + h = rc->crc_height; + xcells = howmany(rc->crc_width, PIX_PER_CELL); + ycells = howmany(rc->crc_height, PIX_PER_CELL); + + rem_x = w & PIXCELL_MASK; + + rem_y = h & PIXCELL_MASK; + if (!rem_y) + rem_y = PIX_PER_CELL; + + p = gc_image->data; + + /* + * Go through all cells and calculate crc. If significant number + * of changes, then send entire screen. + * crc_tmp is dual purpose: to store the new crc and to flag as + * a cell that has changed. + */ + crc_p = rc->crc_tmp - xcells; + orig_crc = rc->crc - xcells; + changes = 0; + memset(rc->crc_tmp, 0, sizeof(uint32_t) * xcells * ycells); + for (y = 0; y < h; y++) { + if ((y & PIXCELL_MASK) == 0) { + crc_p += xcells; + orig_crc += xcells; + } + + for (x = 0; x < xcells; x++) { + if (x == (xcells - 1) && rem_x > 0) + cellwidth = rem_x; + else + cellwidth = PIX_PER_CELL; + + if (rc->hw_crc) + crc_p[x] = fast_crc32(p, + cellwidth * sizeof(uint32_t), + crc_p[x]); + else + crc_p[x] = (uint32_t)crc32(crc_p[x], + (Bytef *)p, + cellwidth * sizeof(uint32_t)); + + p += cellwidth; + + /* check for crc delta if last row in cell */ + if ((y & PIXCELL_MASK) == PIXCELL_MASK || y == (h-1)) { + if (orig_crc[x] != crc_p[x]) { + orig_crc[x] = crc_p[x]; + crc_p[x] = 1; + changes++; + } else { + crc_p[x] = 0; + } + } + } + } + + /* If number of changes is > THRESH percent, send the whole screen */ + if (((changes * 100) / (xcells * ycells)) >= RFB_SEND_ALL_THRESH) { + retval = rfb_send_all(rc, cfd, gc_image); + goto done; + } + + /* Go through all cells, and send only changed ones */ + crc_p = rc->crc_tmp; + for (y = 0; y < h; y += PIX_PER_CELL) { + /* previous cell's row */ + celly = (y >> PIXCELL_SHIFT); + + /* Delta check crc to previous set */ + for (x = 0; x < xcells; x++) { + if (*crc_p++ == 0) + continue; + + if (x == (xcells - 1) && rem_x > 0) + cellwidth = rem_x; + else + cellwidth = PIX_PER_CELL; + nwrite = rfb_send_rect(rc, cfd, + gc_image, + x * PIX_PER_CELL, + celly * PIX_PER_CELL, + cellwidth, + y + PIX_PER_CELL >= h ? 
rem_y : PIX_PER_CELL); + if (nwrite <= 0) { + retval = nwrite; + goto done; + } + } + } + retval = 1; + +done: + pthread_mutex_lock(&rc->mtx); + rc->sending = 0; + pthread_mutex_unlock(&rc->mtx); + + return (retval); +} + + static void -rfb_recv_update_msg(struct rfb_softc *rc, int cfd) +rfb_recv_update_msg(struct rfb_softc *rc, int cfd, int discardonly) { struct rfb_updt_msg updt_msg; - struct rfb_srvr_updt_msg supdt_msg; - struct rfb_srvr_rect_hdr srect_hdr; struct bhyvegc_image *gc_image; - int len; - len = read(cfd, ((void *)&updt_msg) + 1 , sizeof(updt_msg) - 1); + (void)stream_read(cfd, ((void *)&updt_msg) + 1 , sizeof(updt_msg) - 1); console_refresh(); gc_image = console_get_image(); - if (rc->width != gc_image->width || rc->height != gc_image->height) { + updt_msg.x = htons(updt_msg.x); + updt_msg.y = htons(updt_msg.y); + updt_msg.width = htons(updt_msg.width); + updt_msg.height = htons(updt_msg.height); + + if (updt_msg.width != gc_image->width || + updt_msg.height != gc_image->height) { rc->width = gc_image->width; rc->height = gc_image->height; - rfb_send_resize_update_msg(rc, cfd); + if (rc->enc_resize_ok) + rfb_send_resize_update_msg(rc, cfd); } - /* - * Send the whole thing - */ - /* Number of rectangles: 1 */ - supdt_msg.type = 0; - supdt_msg.pad = 0; - supdt_msg.numrects = ntohs(1); - write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg)); + if (discardonly) + return; - /* Rectangle header */ - srect_hdr.x = ntohs(0); - srect_hdr.y = ntohs(0); - srect_hdr.width = ntohs(gc_image->width); - srect_hdr.height = ntohs(gc_image->height); - srect_hdr.encoding = ntohl(0); /* raw */ - write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr)); - - write(cfd, gc_image->data, gc_image->width * gc_image->height * - sizeof(uint32_t)); + rfb_send_screen(rc, cfd, 1); } static void rfb_recv_key_msg(struct rfb_softc *rc, int cfd) { struct rfb_key_msg key_msg; - int len; - len = read(cfd, ((void *)&key_msg) + 1, sizeof(key_msg) - 1); + (void)stream_read(cfd, ((void *)&key_msg) + 1, sizeof(key_msg) - 1); - console_key_event(key_msg.down, ntohl(key_msg.code)); + console_key_event(key_msg.down, htonl(key_msg.code)); } static void rfb_recv_ptr_msg(struct rfb_softc *rc, int cfd) { struct rfb_ptr_msg ptr_msg; + + (void)stream_read(cfd, ((void *)&ptr_msg) + 1, sizeof(ptr_msg) - 1); + + console_ptr_event(ptr_msg.button, htons(ptr_msg.x), htons(ptr_msg.y)); +} + +static void +rfb_recv_cuttext_msg(struct rfb_softc *rc, int cfd) +{ + struct rfb_cuttext_msg ct_msg; + unsigned char buf[32]; int len; - len = read(cfd, ((void *)&ptr_msg) + 1, sizeof(ptr_msg) - 1); + len = stream_read(cfd, ((void *)&ct_msg) + 1, sizeof(ct_msg) - 1); + ct_msg.length = htonl(ct_msg.length); + while (ct_msg.length > 0) { + len = stream_read(cfd, buf, ct_msg.length > sizeof(buf) ? 
+ sizeof(buf) : ct_msg.length); + ct_msg.length -= len; + } +} - console_ptr_event(ptr_msg.button, ntohs(ptr_msg.x), ntohs(ptr_msg.y)); +static int64_t +timeval_delta(struct timeval *prev, struct timeval *now) +{ + int64_t n1, n2; + n1 = now->tv_sec * 1000000 + now->tv_usec; + n2 = prev->tv_sec * 1000000 + prev->tv_usec; + return (n1 - n2); +} + +static void * +rfb_wr_thr(void *arg) +{ + struct rfb_softc *rc; + fd_set rfds; + struct timeval tv; + struct timeval prev_tv; + int64_t tdiff; + int cfd; + int err; + + rc = arg; + cfd = rc->cfd; + + prev_tv.tv_sec = 0; + prev_tv.tv_usec = 0; + while (rc->cfd >= 0) { + FD_ZERO(&rfds); + FD_SET(cfd, &rfds); + tv.tv_sec = 0; + tv.tv_usec = 10000; + + err = select(cfd+1, &rfds, NULL, NULL, &tv); + if (err < 0) + return (NULL); + + /* Determine if it's time to push screen; ~24hz */ + gettimeofday(&tv, NULL); + tdiff = timeval_delta(&prev_tv, &tv); + if (tdiff > 40000) { + prev_tv.tv_sec = tv.tv_sec; + prev_tv.tv_usec = tv.tv_usec; + if (rfb_send_screen(rc, cfd, 0) <= 0) { + return (NULL); + } + } else { + /* sleep */ + usleep(40000 - tdiff); + } + } + + return (NULL); +} void @@ -296,39 +749,145 @@ rfb_handle(struct rfb_softc *rc, int cfd) { const char *vbuf = "RFB 003.008\n"; unsigned char buf[80]; + unsigned char *message = NULL; + +#ifndef NO_OPENSSL + unsigned char challenge[AUTH_LENGTH]; + unsigned char keystr[PASSWD_LENGTH]; + unsigned char crypt_expected[AUTH_LENGTH]; + + DES_key_schedule ks; + int i; +#endif + + pthread_t tid; + uint32_t sres = 0; int len - uint32_t sres; + int perror = 1; + + rc->cfd = cfd; /* 1a. Send server version */ - printf("server vers write: (%s), %d bytes\n", vbuf, (int) strlen(vbuf)); - write(cfd, vbuf, strlen(vbuf)); + stream_write(cfd, vbuf, strlen(vbuf)); /* 1b. Read client version */ len = read(cfd, buf, sizeof(buf)); - /* 2a. Send security type 'none' */ + /* 2a. Send security type */ buf[0] = 1; - buf[1] = 1; /* none */ - write(cfd, buf, 2); +#ifndef NO_OPENSSL + if (rc->password) + buf[1] = SECURITY_TYPE_VNC_AUTH; + else + buf[1] = SECURITY_TYPE_NONE; +#else + buf[1] = SECURITY_TYPE_NONE; +#endif + + stream_write(cfd, buf, 2); /* 2b. Read agreed security type */ - len = read(cfd, buf, 1); + len = stream_read(cfd, buf, 1); + + /* 2c. Do VNC authentication */ + switch (buf[0]) { + case SECURITY_TYPE_NONE: + sres = 0; + break; + case SECURITY_TYPE_VNC_AUTH: + /* + * The client encrypts the challenge with DES, using a password + * supplied by the user as the key. + * To form the key, the password is truncated to + * eight characters, or padded with null bytes on the right. + * The client then sends the resulting 16-byte response. + */ +#ifndef NO_OPENSSL + strncpy(keystr, rc->password, PASSWD_LENGTH); + + /* VNC clients encrypt the challenge with all the bit fields + * in each byte of the password mirrored. + * Here we flip each byte of the keystr.
+ */ + for (i = 0; i < PASSWD_LENGTH; i++) { + keystr[i] = (keystr[i] & 0xF0) >> 4 + | (keystr[i] & 0x0F) << 4; + keystr[i] = (keystr[i] & 0xCC) >> 2 + | (keystr[i] & 0x33) << 2; + keystr[i] = (keystr[i] & 0xAA) >> 1 + | (keystr[i] & 0x55) << 1; + } + + /* Initialize a 16-byte random challenge */ + arc4random_buf(challenge, sizeof(challenge)); + stream_write(cfd, challenge, AUTH_LENGTH); + + /* Receive the 16-byte challenge response */ + stream_read(cfd, buf, AUTH_LENGTH); + + memcpy(crypt_expected, challenge, AUTH_LENGTH); + + /* Encrypt the Challenge with DES */ + DES_set_key((const_DES_cblock *)keystr, &ks); + DES_ecb_encrypt((const_DES_cblock *)challenge, + (const_DES_cblock *)crypt_expected, + &ks, DES_ENCRYPT); + DES_ecb_encrypt((const_DES_cblock *)(challenge + PASSWD_LENGTH), + (const_DES_cblock *)(crypt_expected + + PASSWD_LENGTH), + &ks, DES_ENCRYPT); + + if (memcmp(crypt_expected, buf, AUTH_LENGTH) != 0) { + message = "Auth Failed: Invalid Password."; + sres = htonl(1); + } else + sres = 0; +#else + sres = 0; + WPRINTF(("Auth not supported, no OpenSSL in your system")); +#endif - /* 2c. Write back a status of 0 */ - sres = 0; - write(cfd, &sres, 4); + break; + } + + /* 2d. Write back a status */ + stream_write(cfd, &sres, 4); + + if (sres) { +#ifdef __FreeBSD__ + be32enc(buf, strlen(message)); + stream_write(cfd, buf, 4); + stream_write(cfd, message, strlen(message)); +#else + be32enc(buf, strlen((char *)message)); + stream_write(cfd, buf, 4); + stream_write(cfd, message, strlen((char *)message)); +#endif + goto done; + } /* 3a. Read client shared-flag byte */ - len = read(cfd, buf, 1); + len = stream_read(cfd, buf, 1); /* 4a. Write server-init info */ rfb_send_server_init_msg(cfd); + if (!rc->zbuf) { + rc->zbuf = malloc(RFB_ZLIB_BUFSZ + 16); + assert(rc->zbuf != NULL); + } + + rfb_send_screen(rc, cfd, 1); + + perror = pthread_create(&tid, NULL, rfb_wr_thr, rc); + if (perror == 0) + pthread_set_name_np(tid, "rfbout"); + /* Now read in client requests. 
1st byte identifies type */ for (;;) { len = read(cfd, buf, 1); if (len <= 0) { - printf("exiting\n"); + DPRINTF(("rfb client exiting\r\n")); break; } @@ -340,7 +899,7 @@ rfb_handle(struct rfb_softc *rc, int cfd) rfb_recv_set_encodings_msg(rc, cfd); break; case 3: - rfb_recv_update_msg(rc, cfd); + rfb_recv_update_msg(rc, cfd, 1); break; case 4: rfb_recv_key_msg(rc, cfd); @@ -348,11 +907,20 @@ rfb_handle(struct rfb_softc *rc, int cfd) case 5: rfb_recv_ptr_msg(rc, cfd); break; + case 6: + rfb_recv_cuttext_msg(rc, cfd); + break; default: - printf("unknown client code!\n"); - exit(1); + WPRINTF(("rfb unknown cli-code %d!\n", buf[0] & 0xff)); + goto done; } } +done: + rc->cfd = -1; + if (perror == 0) + pthread_join(tid, NULL); + if (rc->enc_zlib_ok) + deflateEnd(&rc->zstream); } static void * @@ -373,48 +941,208 @@ rfb_thr(void *arg) } for (;;) { + rc->enc_raw_ok = false; + rc->enc_zlib_ok = false; + rc->enc_resize_ok = false; + cfd = accept(rc->sfd, NULL, NULL); + if (rc->conn_wait) { + pthread_mutex_lock(&rc->mtx); + pthread_cond_signal(&rc->cond); + pthread_mutex_unlock(&rc->mtx); + rc->conn_wait = 0; + } rfb_handle(rc, cfd); + close(cfd); } /* NOTREACHED */ return (NULL); } +static int +sse42_supported(void) +{ + u_int cpu_registers[4], ecx; + + do_cpuid(1, cpu_registers); + + ecx = cpu_registers[2]; + + return ((ecx & CPUID2_SSE42) != 0); +} + int -rfb_init(int port) +rfb_init(char *hostname, int port, int wait, char *password) { + int e; + char servname[6]; struct rfb_softc *rc; - struct sockaddr_in sin; + struct addrinfo *ai; + struct addrinfo hints; int on = 1; +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; +#endif rc = calloc(1, sizeof(struct rfb_softc)); - rc->sfd = socket(AF_INET, SOCK_STREAM, 0); + rc->crc = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof(uint32_t)); + rc->crc_tmp = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof(uint32_t)); + rc->crc_width = RFB_MAX_WIDTH; + rc->crc_height = RFB_MAX_HEIGHT; + + rc->password = password; + + snprintf(servname, sizeof(servname), "%d", port ? 
port : 5900); + + if (!hostname || strlen(hostname) == 0) +#if defined(INET) + hostname = "127.0.0.1"; +#elif defined(INET6) + hostname = "[::1]"; +#endif + + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_NUMERICHOST | AI_NUMERICSERV | AI_PASSIVE; + + if ((e = getaddrinfo(hostname, servname, &hints, &ai)) != 0) { + fprintf(stderr, "getaddrinfo: %s\n", gai_strerror(e)); + return(-1); + } + + rc->sfd = socket(ai->ai_family, ai->ai_socktype, 0); if (rc->sfd < 0) { perror("socket"); + freeaddrinfo(ai); return (-1); } setsockopt(rc->sfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); -#ifdef __FreeBSD__ - sin.sin_len = sizeof(sin); -#endif - sin.sin_family = AF_INET; - sin.sin_addr.s_addr = htonl(INADDR_ANY); - sin.sin_port = htons(port); - if (bind(rc->sfd, (struct sockaddr *)&sin, sizeof(sin)) < 0) { + if (bind(rc->sfd, ai->ai_addr, ai->ai_addrlen) < 0) { perror("bind"); + freeaddrinfo(ai); return (-1); } if (listen(rc->sfd, 1) < 0) { perror("listen"); + freeaddrinfo(ai); return (-1); } +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_ACCEPT, CAP_EVENT, CAP_READ, CAP_WRITE); + if (caph_rights_limit(rc->sfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + rc->hw_crc = sse42_supported(); + + rc->conn_wait = wait; + if (wait) { + pthread_mutex_init(&rc->mtx, NULL); + pthread_cond_init(&rc->cond, NULL); + } + pthread_create(&rc->tid, NULL, rfb_thr, rc); + pthread_set_name_np(rc->tid, "rfb"); + + if (wait) { + DPRINTF(("Waiting for rfb client...\n")); + pthread_mutex_lock(&rc->mtx); + pthread_cond_wait(&rc->cond, &rc->mtx); + pthread_mutex_unlock(&rc->mtx); + } + freeaddrinfo(ai); return (0); } + +#ifndef __FreeBSD__ +int +rfb_init_unix(char *path, int wait, char *password) +{ + struct rfb_softc *rc; + struct sockaddr_un sock; + + if ((rc = calloc(1, sizeof (struct rfb_softc))) == NULL) { + perror("calloc"); + return (-1); + } + rc->sfd = -1; + + if ((rc->crc = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof (uint32_t))) == NULL) { + perror("calloc"); + goto fail; + } + if ((rc->crc_tmp = calloc(howmany(RFB_MAX_WIDTH * RFB_MAX_HEIGHT, 32), + sizeof (uint32_t))) == NULL) { + perror("calloc"); + goto fail; + } + rc->crc_width = RFB_MAX_WIDTH; + rc->crc_height = RFB_MAX_HEIGHT; + + rc->password = password; + + rc->sfd = socket(PF_UNIX, SOCK_STREAM, 0); + if (rc->sfd < 0) { + perror("socket"); + goto fail; + } + + sock.sun_family = AF_UNIX; + if (strlcpy(sock.sun_path, path, sizeof (sock.sun_path)) >= + sizeof (sock.sun_path)) { + (void) fprintf(stderr, "socket path '%s' too long\n", path); + goto fail; + } + + (void) unlink(path); + if (bind(rc->sfd, (struct sockaddr *)&sock, sizeof (sock)) < 0) { + perror("bind"); + goto fail; + } + + if (listen(rc->sfd, 1) < 0) { + perror("listen"); + goto fail; + } + + rc->hw_crc = sse42_supported(); + + rc->conn_wait = wait; + if (wait) { + VERIFY3S(pthread_mutex_init(&rc->mtx, NULL), ==, 0); + VERIFY3S(pthread_cond_init(&rc->cond, NULL), ==, 0); + } + + VERIFY3S(pthread_create(&rc->tid, NULL, rfb_thr, rc), ==, 0); + pthread_set_name_np(rc->tid, "rfb"); + + if (wait) { + DPRINTF(("Waiting for rfb client...\n")); + VERIFY3S(pthread_mutex_lock(&rc->mtx), ==, 0); + VERIFY3S(pthread_cond_wait(&rc->cond, &rc->mtx), ==, 0); + VERIFY3S(pthread_mutex_unlock(&rc->mtx), ==, 0); + } + + return (0); + +fail: + if (rc->sfd != -1) { + VERIFY3S(close(rc->sfd), ==, 0); + } + free(rc->crc); + free(rc->crc_tmp); + free(rc); + return (-1); +} +#endif 
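The VNC password authentication added to rfb.c above hinges on one non-obvious detail: RFB clients bit-reverse each byte of the password before using it as the DES key, so rfb_handle() must mirror its copy of rc->password the same way before encrypting the challenge for comparison. Below is a minimal standalone sketch of that per-byte reversal; the main() and the test password are hypothetical, and only the three swap steps are taken from the patch.

#include <stdio.h>
#include <string.h>

#define	PASSWD_LENGTH	8

/*
 * Reverse the bit order of one byte in three swap steps, as
 * rfb_handle() does: nibbles, then bit pairs, then adjacent bits.
 */
static unsigned char
mirror_byte(unsigned char b)
{
	b = (b & 0xF0) >> 4 | (b & 0x0F) << 4;
	b = (b & 0xCC) >> 2 | (b & 0x33) << 2;
	b = (b & 0xAA) >> 1 | (b & 0x55) << 1;
	return (b);
}

int
main(void)
{
	/* Hypothetical password; the real code copies rc->password. */
	unsigned char keystr[PASSWD_LENGTH];
	int i;

	/* Truncate to eight bytes, padding with NULs on the right. */
	(void) strncpy((char *)keystr, "secret", PASSWD_LENGTH);

	for (i = 0; i < PASSWD_LENGTH; i++)
		keystr[i] = mirror_byte(keystr[i]);

	/* 's' (0x73) mirrors to 0xce, 'e' (0x65) to 0xa6, and so on. */
	for (i = 0; i < PASSWD_LENGTH; i++)
		(void) printf("%02x ", keystr[i]);
	(void) printf("\n");
	return (0);
}

With the key mirrored, DES-encrypting the 16-byte challenge on the server yields the same bytes the client sends back, which is all the memcmp() in rfb_handle() checks.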
diff --git a/usr/src/cmd/bhyve/rfb.h b/usr/src/cmd/bhyve/rfb.h index 5504c333ab..990e2075ac 100644 --- a/usr/src/cmd/bhyve/rfb.h +++ b/usr/src/cmd/bhyve/rfb.h @@ -1,5 +1,8 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> + * Copyright 2018 Joyent, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -31,6 +34,9 @@ #define RFB_PORT 5900 -int rfb_init(int port); +int rfb_init(char *hostname, int port, int wait, char *password); +#ifndef __FreeBSD__ +int rfb_init_unix(char *path, int wait, char *password); +#endif #endif /* _RFB_H_ */ diff --git a/usr/src/cmd/bhyve/rtc.c b/usr/src/cmd/bhyve/rtc.c index 5ab78e060f..09ca3f61ae 100644 --- a/usr/src/cmd/bhyve/rtc.c +++ b/usr/src/cmd/bhyve/rtc.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,17 +25,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $ + * $FreeBSD$ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $"); +__FBSDID("$FreeBSD$"); #include <sys/types.h> -#include <sys/time.h> -#include <stdio.h> -#include <string.h> #include <time.h> #include <assert.h> @@ -41,300 +40,45 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $" #include <vmmapi.h> #include "acpi.h" -#include "inout.h" #include "pci_lpc.h" #include "rtc.h" -#define IO_RTC 0x70 - -#define RTC_SEC 0x00 /* seconds */ -#define RTC_SEC_ALARM 0x01 -#define RTC_MIN 0x02 -#define RTC_MIN_ALARM 0x03 -#define RTC_HRS 0x04 -#define RTC_HRS_ALARM 0x05 -#define RTC_WDAY 0x06 -#define RTC_DAY 0x07 -#define RTC_MONTH 0x08 -#define RTC_YEAR 0x09 -#define RTC_CENTURY 0x32 /* current century */ - -#define RTC_STATUSA 0xA -#define RTCSA_TUP 0x80 /* time update, don't look now */ - -#define RTC_STATUSB 0xB -#define RTCSB_DST 0x01 -#define RTCSB_24HR 0x02 -#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */ -#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */ -#define RTCSB_HALT 0x80 /* stop clock updates */ +#define IO_RTC 0x70 -#define RTC_INTR 0x0c /* status register C (R) interrupt source */ - -#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */ -#define RTCSD_PWR 0x80 /* clock power OK */ - -#define RTC_NVRAM_START 0x0e -#define RTC_NVRAM_END 0x7f -#define RTC_NVRAM_SZ (128 - RTC_NVRAM_START) -#define nvoff(x) ((x) - RTC_NVRAM_START) - -#define RTC_DIAG 0x0e -#define RTC_RSTCODE 0x0f -#define RTC_EQUIPMENT 0x14 #define RTC_LMEM_LSB 0x34 #define RTC_LMEM_MSB 0x35 #define RTC_HMEM_LSB 0x5b #define RTC_HMEM_SB 0x5c #define RTC_HMEM_MSB 0x5d -#define m_64KB (64*1024) +#define m_64KB (64*1024) #define m_16MB (16*1024*1024) #define m_4GB (4ULL*1024*1024*1024) -static int addr; - -static uint8_t rtc_nvram[RTC_NVRAM_SZ]; - -/* XXX initialize these to default values as they would be from BIOS */ -static uint8_t status_a, status_b; - -static struct { - uint8_t hours; - uint8_t mins; - uint8_t secs; -} rtc_alarm; - -static u_char const bin2bcd_data[] = { - 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, - 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, - 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, - 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, - 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 
0x48, 0x49, - 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, - 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, - 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, - 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, - 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 -}; -#define bin2bcd(bin) (bin2bcd_data[bin]) - -#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val))) - -static void -timevalfix(struct timeval *t1) -{ - - if (t1->tv_usec < 0) { - t1->tv_sec--; - t1->tv_usec += 1000000; - } - if (t1->tv_usec >= 1000000) { - t1->tv_sec++; - t1->tv_usec -= 1000000; - } -} - -static void -timevalsub(struct timeval *t1, const struct timeval *t2) -{ - - t1->tv_sec -= t2->tv_sec; - t1->tv_usec -= t2->tv_usec; - timevalfix(t1); -} - -static int -rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) -{ - if (bytes != 1) - return (-1); - - if (in) { - /* straight read of this register will return 0xFF */ - *eax = 0xff; - return (0); - } - - switch (*eax & 0x7f) { - case RTC_SEC: - case RTC_SEC_ALARM: - case RTC_MIN: - case RTC_MIN_ALARM: - case RTC_HRS: - case RTC_HRS_ALARM: - case RTC_WDAY: - case RTC_DAY: - case RTC_MONTH: - case RTC_YEAR: - case RTC_STATUSA: - case RTC_STATUSB: - case RTC_INTR: - case RTC_STATUSD: - case RTC_NVRAM_START ... RTC_NVRAM_END: - break; - default: - return (-1); - } - - addr = *eax & 0x7f; - return (0); -} - -static int -rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, - uint32_t *eax, void *arg) +/* + * Returns the current RTC time as number of seconds since 00:00:00 Jan 1, 1970 + */ +static time_t +rtc_time(struct vmctx *ctx, int use_localtime) { - int hour; + struct tm tm; time_t t; - struct timeval cur, delta; - - static struct timeval last; - static struct tm tm; - - if (bytes != 1) - return (-1); - - gettimeofday(&cur, NULL); - /* - * Increment the cached time only once per second so we can guarantee - * that the guest has at least one second to read the hour:min:sec - * separately and still get a coherent view of the time. - */ - delta = cur; - timevalsub(&delta, &last); - if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) { - t = cur.tv_sec; + time(&t); + if (use_localtime) { localtime_r(&t, &tm); - last = cur; - } - - if (in) { - switch (addr) { - case RTC_SEC_ALARM: - *eax = rtc_alarm.secs; - break; - case RTC_MIN_ALARM: - *eax = rtc_alarm.mins; - break; - case RTC_HRS_ALARM: - *eax = rtc_alarm.hours; - break; - case RTC_SEC: - *eax = rtcout(tm.tm_sec); - return (0); - case RTC_MIN: - *eax = rtcout(tm.tm_min); - return (0); - case RTC_HRS: - if (status_b & RTCSB_24HR) - hour = tm.tm_hour; - else - hour = (tm.tm_hour % 12) + 1; - - *eax = rtcout(hour); - - /* - * If we are representing time in the 12-hour format - * then set the MSB to indicate PM. - */ - if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12) - *eax |= 0x80; - - return (0); - case RTC_WDAY: - *eax = rtcout(tm.tm_wday + 1); - return (0); - case RTC_DAY: - *eax = rtcout(tm.tm_mday); - return (0); - case RTC_MONTH: - *eax = rtcout(tm.tm_mon + 1); - return (0); - case RTC_YEAR: - *eax = rtcout(tm.tm_year % 100); - return (0); - case RTC_STATUSA: - *eax = status_a; - return (0); - case RTC_STATUSB: - *eax = status_b; - return (0); - case RTC_INTR: - *eax = 0; - return (0); - case RTC_STATUSD: - *eax = RTCSD_PWR; - return (0); - case RTC_NVRAM_START ... 
RTC_NVRAM_END: - *eax = rtc_nvram[addr - RTC_NVRAM_START]; - return (0); - default: - return (-1); - } - } - - switch (addr) { - case RTC_STATUSA: - status_a = *eax & ~RTCSA_TUP; - break; - case RTC_STATUSB: - /* XXX not implemented yet XXX */ - if (*eax & RTCSB_PINTR) - return (-1); - status_b = *eax; - break; - case RTC_STATUSD: - /* ignore write */ - break; - case RTC_SEC_ALARM: - rtc_alarm.secs = *eax; - break; - case RTC_MIN_ALARM: - rtc_alarm.mins = *eax; - break; - case RTC_HRS_ALARM: - rtc_alarm.hours = *eax; - break; - case RTC_SEC: - case RTC_MIN: - case RTC_HRS: - case RTC_WDAY: - case RTC_DAY: - case RTC_MONTH: - case RTC_YEAR: - /* - * Ignore writes to the time of day registers - */ - break; - case RTC_NVRAM_START ... RTC_NVRAM_END: - rtc_nvram[addr - RTC_NVRAM_START] = *eax; - break; - default: - return (-1); + t = timegm(&tm); } - return (0); + return (t); } void -rtc_init(struct vmctx *ctx) +rtc_init(struct vmctx *ctx, int use_localtime) { - struct timeval cur; - struct tm tm; size_t himem; size_t lomem; int err; - err = gettimeofday(&cur, NULL); - assert(err == 0); - (void) localtime_r(&cur.tv_sec, &tm); - - memset(rtc_nvram, 0, sizeof(rtc_nvram)); - - rtc_nvram[nvoff(RTC_CENTURY)] = bin2bcd((tm.tm_year + 1900) / 100); - /* XXX init diag/reset code/equipment/checksum ? */ /* @@ -344,19 +88,23 @@ rtc_init(struct vmctx *ctx) * 0x5b/0x5c/0x5d - 64KB chunks above 4GB */ lomem = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB; - rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem; - rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8; + err = vm_rtc_write(ctx, RTC_LMEM_LSB, lomem); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_LMEM_MSB, lomem >> 8); + assert(err == 0); himem = vm_get_highmem_size(ctx) / m_64KB; - rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem; - rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8; - rtc_nvram[nvoff(RTC_HMEM_MSB)] = himem >> 16; -} + err = vm_rtc_write(ctx, RTC_HMEM_LSB, himem); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_HMEM_SB, himem >> 8); + assert(err == 0); + err = vm_rtc_write(ctx, RTC_HMEM_MSB, himem >> 16); + assert(err == 0); -INOUT_PORT(rtc, IO_RTC, IOPORT_F_INOUT, rtc_addr_handler); -INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler); + err = vm_rtc_settime(ctx, rtc_time(ctx, use_localtime)); + assert(err == 0); +} -#ifdef __FreeBSD__ static void rtc_dsdt(void) { @@ -375,6 +123,9 @@ rtc_dsdt(void) dsdt_line("}"); } LPC_DSDT(rtc_dsdt); -#endif +/* + * Reserve the extended RTC I/O ports although they are not emulated at this + * time. + */ SYSRES_IO(0x72, 6); diff --git a/usr/src/cmd/bhyve/rtc.h b/usr/src/cmd/bhyve/rtc.h index 6406d24c37..1c108eed99 100644 --- a/usr/src/cmd/bhyve/rtc.h +++ b/usr/src/cmd/bhyve/rtc.h @@ -1,4 +1,6 @@ -/* +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org> * All rights reserved. * @@ -23,12 +25,12 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/rtc.h 253181 2013-07-11 03:54:35Z grehan $ + * $FreeBSD$ */ #ifndef _RTC_H_ #define _RTC_H_ -void rtc_init(struct vmctx *ctx); +void rtc_init(struct vmctx *ctx, int use_localtime); #endif /* _RTC_H_ */ diff --git a/usr/src/cmd/bhyve/smbiostbl.c b/usr/src/cmd/bhyve/smbiostbl.c index 7ba0f0dfa0..da227f813a 100644 --- a/usr/src/cmd/bhyve/smbiostbl.c +++ b/usr/src/cmd/bhyve/smbiostbl.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * @@ -25,7 +27,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/smbiostbl.c 272007 2014-09-23 01:17:22Z grehan $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> @@ -33,6 +35,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/smbiostbl.c 272007 2014-09-23 01:17:22Z #include <errno.h> #include <md5.h> #include <stdio.h> +#include <stdlib.h> #include <string.h> #include <unistd.h> #include <uuid.h> @@ -321,8 +324,8 @@ struct smbios_table_type0 smbios_type0_template = { const char *smbios_type0_strings[] = { "BHYVE", /* vendor string */ - __TIME__, /* bios version string */ - __DATE__, /* bios release date string */ + "1.00", /* bios version string */ + "03/14/2014", /* bios release date string */ NULL }; @@ -634,7 +637,7 @@ smbios_type4_initializer(struct smbios_structure *template_entry, { int i; - for (i = 0; i < guest_ncpus; i++) { + for (i = 0; i < sockets; i++) { struct smbios_table_type4 *type4; char *p; int nstrings, len; @@ -653,6 +656,16 @@ smbios_type4_initializer(struct smbios_structure *template_entry, *(*endaddr) = '\0'; (*endaddr)++; type4->socket = nstrings + 1; + /* Revise cores and threads after update to smbios 3.0 */ + if (cores > 254) + type4->cores = 0; + else + type4->cores = cores; + /* This is the total number of threads in a socket. */ + if ((cores * threads) > 254) + type4->threads = 0; + else + type4->threads = (cores * threads); curaddr = *endaddr; } @@ -825,3 +838,80 @@ smbios_build(struct vmctx *ctx) return (0); } + +int +smbios_parse(const char *opts) +{ + char *buf; + char *lasts; + char *token; + char *end; + long type; + struct { + const char *key; + const char **targetp; + } type1_map[] = { + { "manufacturer", &smbios_type1_strings[0] }, + { "product", &smbios_type1_strings[1] }, + { "version", &smbios_type1_strings[2] }, + { "serial", &smbios_type1_strings[3] }, + { "sku", &smbios_type1_strings[4] }, + { "family", &smbios_type1_strings[5] }, + { "uuid", (const char **)&guest_uuid_str }, + { 0 } + }; + + if ((buf = strdup(opts)) == NULL) { + (void) fprintf(stderr, "out of memory\n"); + return (-1); + } + + if ((token = strtok_r(buf, ",", &lasts)) == NULL) { + (void) fprintf(stderr, "too few fields\n"); + goto fail; + } + + errno = 0; + type = strtol(token, &end, 10); + if (errno != 0 || *end != '\0') { + (void) fprintf(stderr, "first token '%s' is not an integer\n", + token); + goto fail; + } + + /* For now, only type 1 is supported. 
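+	 * (Editor's note: the option string is a leading integer type followed by comma-separated key=value pairs, e.g., as an illustrative invocation, "1,manufacturer=Acme,product=Rocket,serial=42".)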
*/ + if (type != 1) { + (void) fprintf(stderr, "unsupported type %ld\n", type); + goto fail; + } + + while ((token = strtok_r(NULL, ",", &lasts)) != NULL) { + char *val; + int i; + + if ((val = strchr(token, '=')) == NULL) { + (void) fprintf(stderr, "invalid key=value: '%s'\n", + token); + goto fail; + } + *val = '\0'; + val++; + + for (i = 0; type1_map[i].key != NULL; i++) { + if (strcmp(token, type1_map[i].key) == 0) { + break; + } + } + if (type1_map[i].key == NULL) { + (void) fprintf(stderr, "invalid key '%s'\n", token); + goto fail; + } + *type1_map[i].targetp = val; + } + + return (0); + +fail: + free(buf); + return (-1); +} diff --git a/usr/src/cmd/bhyve/smbiostbl.h b/usr/src/cmd/bhyve/smbiostbl.h index fd7f86be80..81e26309e5 100644 --- a/usr/src/cmd/bhyve/smbiostbl.h +++ b/usr/src/cmd/bhyve/smbiostbl.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * @@ -23,7 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/smbiostbl.h 262744 2014-03-04 17:12:06Z tychon $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. */ #ifndef _SMBIOSTBL_H_ @@ -32,5 +38,6 @@ struct vmctx; int smbios_build(struct vmctx *ctx); +int smbios_parse(const char *opts); #endif /* _SMBIOSTBL_H_ */ diff --git a/usr/src/cmd/bhyve/sockstream.c b/usr/src/cmd/bhyve/sockstream.c new file mode 100644 index 0000000000..b592bce9aa --- /dev/null +++ b/usr/src/cmd/bhyve/sockstream.c @@ -0,0 +1,86 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Nahanni Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <unistd.h> + +#include <errno.h> + +#include "sockstream.h" + +ssize_t +stream_read(int fd, void *buf, ssize_t nbytes) +{ + uint8_t *p; + ssize_t len = 0; + ssize_t n; + + p = buf; + + while (len < nbytes) { + n = read(fd, p + len, nbytes - len); + if (n == 0) + break; + + if (n < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + return (n); + } + len += n; + } + return (len); +} + +ssize_t +stream_write(int fd, const void *buf, ssize_t nbytes) +{ + const uint8_t *p; + ssize_t len = 0; + ssize_t n; + + p = buf; + + while (len < nbytes) { + n = write(fd, p + len, nbytes - len); + if (n == 0) + break; + if (n < 0) { + if (errno == EINTR || errno == EAGAIN) + continue; + return (n); + } + len += n; + } + return (len); +} diff --git a/usr/src/cmd/bhyve/sockstream.h b/usr/src/cmd/bhyve/sockstream.h new file mode 100644 index 0000000000..ecea849471 --- /dev/null +++ b/usr/src/cmd/bhyve/sockstream.h @@ -0,0 +1,35 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2015 Nahanni Systems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/types.h> +#include <unistd.h> + +ssize_t stream_read(int fd, void *buf, ssize_t nbytes); +ssize_t stream_write(int fd, const void *buf, ssize_t nbytes); diff --git a/usr/src/cmd/bhyve/spinup_ap.c b/usr/src/cmd/bhyve/spinup_ap.c index e1dd562d3f..7c4186f5ed 100644 --- a/usr/src/cmd/bhyve/spinup_ap.c +++ b/usr/src/cmd/bhyve/spinup_ap.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/usr.sbin/bhyve/spinup_ap.c 263432 2014-03-20 18:15:37Z neel $ + * $FreeBSD$ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/spinup_ap.c 263432 2014-03-20 18:15:37Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/types.h> diff --git a/usr/src/cmd/bhyve/spinup_ap.h b/usr/src/cmd/bhyve/spinup_ap.h index 090de091ba..226542f6c3 100644 --- a/usr/src/cmd/bhyve/spinup_ap.h +++ b/usr/src/cmd/bhyve/spinup_ap.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/spinup_ap.h 240912 2012-09-25 02:33:25Z neel $ + * $FreeBSD$ */ #ifndef _SPINUP_AP_H_ diff --git a/usr/src/cmd/bhyve/task_switch.c b/usr/src/cmd/bhyve/task_switch.c new file mode 100644 index 0000000000..b5950a19d8 --- /dev/null +++ b/usr/src/cmd/bhyve/task_switch.c @@ -0,0 +1,941 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu <neel@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/_iovec.h> +#include <sys/mman.h> + +#include <x86/psl.h> +#include <x86/segments.h> +#include <x86/specialreg.h> +#include <machine/vmm.h> +#include <machine/vmm_instruction_emul.h> + +#include <assert.h> +#include <errno.h> +#include <stdbool.h> +#include <stdio.h> +#include <stdlib.h> + +#include <vmmapi.h> + +#include "bhyverun.h" + +/* + * Using 'struct i386tss' is tempting but causes myriad sign extension + * issues because all of its fields are defined as signed integers. 
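+	 * (Editor's note: for example, an ESP of 0x80000000 held in a signed 32-bit field would sign-extend to 0xffffffff80000000 when widened for the uint64_t register-set API.)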
+ */ +struct tss32 { + uint16_t tss_link; + uint16_t rsvd1; + uint32_t tss_esp0; + uint16_t tss_ss0; + uint16_t rsvd2; + uint32_t tss_esp1; + uint16_t tss_ss1; + uint16_t rsvd3; + uint32_t tss_esp2; + uint16_t tss_ss2; + uint16_t rsvd4; + uint32_t tss_cr3; + uint32_t tss_eip; + uint32_t tss_eflags; + uint32_t tss_eax; + uint32_t tss_ecx; + uint32_t tss_edx; + uint32_t tss_ebx; + uint32_t tss_esp; + uint32_t tss_ebp; + uint32_t tss_esi; + uint32_t tss_edi; + uint16_t tss_es; + uint16_t rsvd5; + uint16_t tss_cs; + uint16_t rsvd6; + uint16_t tss_ss; + uint16_t rsvd7; + uint16_t tss_ds; + uint16_t rsvd8; + uint16_t tss_fs; + uint16_t rsvd9; + uint16_t tss_gs; + uint16_t rsvd10; + uint16_t tss_ldt; + uint16_t rsvd11; + uint16_t tss_trap; + uint16_t tss_iomap; +}; +static_assert(sizeof(struct tss32) == 104, "compile-time assertion failed"); + +#define SEL_START(sel) (((sel) & ~0x7)) +#define SEL_LIMIT(sel) (((sel) | 0x7)) +#define TSS_BUSY(type) (((type) & 0x2) != 0) + +static uint64_t +GETREG(struct vmctx *ctx, int vcpu, int reg) +{ + uint64_t val; + int error; + + error = vm_get_register(ctx, vcpu, reg, &val); + assert(error == 0); + return (val); +} + +static void +SETREG(struct vmctx *ctx, int vcpu, int reg, uint64_t val) +{ + int error; + + error = vm_set_register(ctx, vcpu, reg, val); + assert(error == 0); +} + +static struct seg_desc +usd_to_seg_desc(struct user_segment_descriptor *usd) +{ + struct seg_desc seg_desc; + + seg_desc.base = (u_int)USD_GETBASE(usd); + if (usd->sd_gran) + seg_desc.limit = (u_int)(USD_GETLIMIT(usd) << 12) | 0xfff; + else + seg_desc.limit = (u_int)USD_GETLIMIT(usd); + seg_desc.access = usd->sd_type | usd->sd_dpl << 5 | usd->sd_p << 7; + seg_desc.access |= usd->sd_xx << 12; + seg_desc.access |= usd->sd_def32 << 14; + seg_desc.access |= usd->sd_gran << 15; + + return (seg_desc); +} + +/* + * Inject an exception with an error code that is a segment selector. + * The format of the error code is described in section 6.13, "Error Code", + * Intel SDM volume 3. + * + * Bit 0 (EXT) denotes whether the exception occurred during delivery + * of an external event like an interrupt. + * + * Bit 1 (IDT) indicates whether the selector points to a gate descriptor + * in the IDT. + * + * Bit 2 (GDT/LDT) has the usual interpretation of Table Indicator (TI). + */ +static void +sel_exception(struct vmctx *ctx, int vcpu, int vector, uint16_t sel, int ext) +{ + /* + * Bit 2 from the selector is retained as-is in the error code. + * + * Bit 1 can be safely cleared because none of the selectors + * encountered during task switch emulation refer to a task + * gate in the IDT. + * + * Bit 0 is set depending on the value of 'ext'. + */ + sel &= ~0x3; + if (ext) + sel |= 0x1; + vm_inject_fault(ctx, vcpu, vector, 1, sel); +} + +/* + * Return 0 if the selector 'sel' is within the limits of the GDT/LDT + * and non-zero otherwise. + */ +static int +desc_table_limit_check(struct vmctx *ctx, int vcpu, uint16_t sel) +{ + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + assert(error == 0); + + if (reg == VM_REG_GUEST_LDTR) { + if (SEG_DESC_UNUSABLE(access) || !SEG_DESC_PRESENT(access)) + return (-1); + } + + if (limit < SEL_LIMIT(sel)) + return (-1); + else + return (0); +} + +/* + * Read/write the segment descriptor 'desc' into the GDT/LDT slot referenced + * by the selector 'sel'. + * + * Returns 0 on success. 
+ * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +desc_table_rw(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, bool doread, + int *faultptr) +{ + struct iovec iov[2]; + uint64_t base; + uint32_t limit, access; + int error, reg; + + reg = ISLDT(sel) ? VM_REG_GUEST_LDTR : VM_REG_GUEST_GDTR; + error = vm_get_desc(ctx, vcpu, reg, &base, &limit, &access); + assert(error == 0); + assert(limit >= SEL_LIMIT(sel)); + + error = vm_copy_setup(ctx, vcpu, paging, base + SEL_START(sel), + sizeof(*desc), doread ? PROT_READ : PROT_WRITE, iov, nitems(iov), + faultptr); + if (error || *faultptr) + return (error); + + if (doread) + vm_copyin(ctx, vcpu, iov, desc, sizeof(*desc)); + else + vm_copyout(ctx, vcpu, desc, iov, sizeof(*desc)); + return (0); +} + +static int +desc_table_read(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + return (desc_table_rw(ctx, vcpu, paging, sel, desc, true, faultptr)); +} + +static int +desc_table_write(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + return (desc_table_rw(ctx, vcpu, paging, sel, desc, false, faultptr)); +} + +/* + * Read the TSS descriptor referenced by 'sel' into 'desc'. + * + * Returns 0 on success. + * Returns 1 if an exception was injected into the guest. + * Returns -1 otherwise. + */ +static int +read_tss_descriptor(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + uint16_t sel, struct user_segment_descriptor *desc, int *faultptr) +{ + struct vm_guest_paging sup_paging; + int error; + + assert(!ISLDT(sel)); + assert(IDXSEL(sel) != 0); + + /* Fetch the new TSS descriptor */ + if (desc_table_limit_check(ctx, vcpu, sel)) { + if (ts->reason == TSR_IRET) + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + else + sel_exception(ctx, vcpu, IDT_GP, sel, ts->ext); + return (1); + } + + sup_paging = ts->paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + error = desc_table_read(ctx, vcpu, &sup_paging, sel, desc, faultptr); + return (error); +} + +static bool +code_desc(int sd_type) +{ + /* code descriptor */ + return ((sd_type & 0x18) == 0x18); +} + +static bool +stack_desc(int sd_type) +{ + /* writable data descriptor */ + return ((sd_type & 0x1A) == 0x12); +} + +static bool +data_desc(int sd_type) +{ + /* data descriptor or a readable code descriptor */ + return ((sd_type & 0x18) == 0x10 || (sd_type & 0x1A) == 0x1A); +} + +static bool +ldt_desc(int sd_type) +{ + + return (sd_type == SDT_SYSLDT); +} + +/* + * Validate the descriptor 'seg_desc' associated with 'segment'. 
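+	 * (Editor's note: same return contract as read_tss_descriptor() above: 0 on success, 1 if an exception was injected into the guest, with faults reported through 'faultptr'.)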
+ */ +static int +validate_seg_desc(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + int segment, struct seg_desc *seg_desc, int *faultptr) +{ + struct vm_guest_paging sup_paging; + struct user_segment_descriptor usd; + int error, idtvec; + int cpl, dpl, rpl; + uint16_t sel, cs; + bool ldtseg, codeseg, stackseg, dataseg, conforming; + + ldtseg = codeseg = stackseg = dataseg = false; + switch (segment) { + case VM_REG_GUEST_LDTR: + ldtseg = true; + break; + case VM_REG_GUEST_CS: + codeseg = true; + break; + case VM_REG_GUEST_SS: + stackseg = true; + break; + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + dataseg = true; + break; + default: + assert(0); + } + + /* Get the segment selector */ + sel = GETREG(ctx, vcpu, segment); + + /* LDT selector must point into the GDT */ + if (ldtseg && ISLDT(sel)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* Descriptor table limit check */ + if (desc_table_limit_check(ctx, vcpu, sel)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* NULL selector */ + if (IDXSEL(sel) == 0) { + /* Code and stack segment selectors cannot be NULL */ + if (codeseg || stackseg) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + seg_desc->base = 0; + seg_desc->limit = 0; + seg_desc->access = 0x10000; /* unusable */ + return (0); + } + + /* Read the descriptor from the GDT/LDT */ + sup_paging = ts->paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + error = desc_table_read(ctx, vcpu, &sup_paging, sel, &usd, faultptr); + if (error || *faultptr) + return (error); + + /* Verify that the descriptor type is compatible with the segment */ + if ((ldtseg && !ldt_desc(usd.sd_type)) || + (codeseg && !code_desc(usd.sd_type)) || + (dataseg && !data_desc(usd.sd_type)) || + (stackseg && !stack_desc(usd.sd_type))) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + /* Segment must be marked present */ + if (!usd.sd_p) { + if (ldtseg) + idtvec = IDT_TS; + else if (stackseg) + idtvec = IDT_SS; + else + idtvec = IDT_NP; + sel_exception(ctx, vcpu, idtvec, sel, ts->ext); + return (1); + } + + cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); + cpl = cs & SEL_RPL_MASK; + rpl = sel & SEL_RPL_MASK; + dpl = usd.sd_dpl; + + if (stackseg && (rpl != cpl || dpl != cpl)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + + if (codeseg) { + conforming = (usd.sd_type & 0x4) ? true : false; + if ((conforming && (cpl < dpl)) || + (!conforming && (cpl != dpl))) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + } + + if (dataseg) { + /* + * A data segment is always non-conforming except when its + * descriptor is a readable, conforming code segment. 
+ */ + if (code_desc(usd.sd_type) && (usd.sd_type & 0x4) != 0) + conforming = true; + else + conforming = false; + + if (!conforming && (rpl > dpl || cpl > dpl)) { + sel_exception(ctx, vcpu, IDT_TS, sel, ts->ext); + return (1); + } + } + *seg_desc = usd_to_seg_desc(&usd); + return (0); +} + +static void +tss32_save(struct vmctx *ctx, int vcpu, struct vm_task_switch *task_switch, + uint32_t eip, struct tss32 *tss, struct iovec *iov) +{ + + /* General purpose registers */ + tss->tss_eax = GETREG(ctx, vcpu, VM_REG_GUEST_RAX); + tss->tss_ecx = GETREG(ctx, vcpu, VM_REG_GUEST_RCX); + tss->tss_edx = GETREG(ctx, vcpu, VM_REG_GUEST_RDX); + tss->tss_ebx = GETREG(ctx, vcpu, VM_REG_GUEST_RBX); + tss->tss_esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); + tss->tss_ebp = GETREG(ctx, vcpu, VM_REG_GUEST_RBP); + tss->tss_esi = GETREG(ctx, vcpu, VM_REG_GUEST_RSI); + tss->tss_edi = GETREG(ctx, vcpu, VM_REG_GUEST_RDI); + + /* Segment selectors */ + tss->tss_es = GETREG(ctx, vcpu, VM_REG_GUEST_ES); + tss->tss_cs = GETREG(ctx, vcpu, VM_REG_GUEST_CS); + tss->tss_ss = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + tss->tss_ds = GETREG(ctx, vcpu, VM_REG_GUEST_DS); + tss->tss_fs = GETREG(ctx, vcpu, VM_REG_GUEST_FS); + tss->tss_gs = GETREG(ctx, vcpu, VM_REG_GUEST_GS); + + /* eflags and eip */ + tss->tss_eflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); + if (task_switch->reason == TSR_IRET) + tss->tss_eflags &= ~PSL_NT; + tss->tss_eip = eip; + + /* Copy updated old TSS into guest memory */ + vm_copyout(ctx, vcpu, tss, iov, sizeof(struct tss32)); +} + +static void +update_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *sd) +{ + int error; + + error = vm_set_desc(ctx, vcpu, reg, sd->base, sd->limit, sd->access); + assert(error == 0); +} + +/* + * Update the vcpu registers to reflect the state of the new task. + */ +static int +tss32_restore(struct vmctx *ctx, int vcpu, struct vm_task_switch *ts, + uint16_t ot_sel, struct tss32 *tss, struct iovec *iov, int *faultptr) +{ + struct seg_desc seg_desc, seg_desc2; + uint64_t *pdpte, maxphyaddr, reserved; + uint32_t eflags; + int error, i; + bool nested; + + nested = false; + if (ts->reason != TSR_IRET && ts->reason != TSR_JMP) { + tss->tss_link = ot_sel; + nested = true; + } + + eflags = tss->tss_eflags; + if (nested) + eflags |= PSL_NT; + + /* LDTR */ + SETREG(ctx, vcpu, VM_REG_GUEST_LDTR, tss->tss_ldt); + + /* PDBR */ + if (ts->paging.paging_mode != PAGING_MODE_FLAT) { + if (ts->paging.paging_mode == PAGING_MODE_PAE) { + /* + * XXX Assuming 36-bit MAXPHYADDR. + */ + maxphyaddr = (1UL << 36) - 1; + pdpte = paddr_guest2host(ctx, tss->tss_cr3 & ~0x1f, 32); + for (i = 0; i < 4; i++) { + /* Check reserved bits if the PDPTE is valid */ + if (!(pdpte[i] & 0x1)) + continue; + /* + * Bits 2:1, 8:5 and bits above the processor's + * maximum physical address are reserved. 
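+	 * (Editor's note: bits 2:1 are 0x006 and bits 8:5 are 0x1E0, which is where the 0x1E6 constant below comes from; ~maxphyaddr covers every bit above MAXPHYADDR.)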
+ */ + reserved = ~maxphyaddr | 0x1E6; + if (pdpte[i] & reserved) { + vm_inject_gp(ctx, vcpu); + return (1); + } + } + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE0, pdpte[0]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE1, pdpte[1]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE2, pdpte[2]); + SETREG(ctx, vcpu, VM_REG_GUEST_PDPTE3, pdpte[3]); + } + SETREG(ctx, vcpu, VM_REG_GUEST_CR3, tss->tss_cr3); + ts->paging.cr3 = tss->tss_cr3; + } + + /* eflags and eip */ + SETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS, eflags); + SETREG(ctx, vcpu, VM_REG_GUEST_RIP, tss->tss_eip); + + /* General purpose registers */ + SETREG(ctx, vcpu, VM_REG_GUEST_RAX, tss->tss_eax); + SETREG(ctx, vcpu, VM_REG_GUEST_RCX, tss->tss_ecx); + SETREG(ctx, vcpu, VM_REG_GUEST_RDX, tss->tss_edx); + SETREG(ctx, vcpu, VM_REG_GUEST_RBX, tss->tss_ebx); + SETREG(ctx, vcpu, VM_REG_GUEST_RSP, tss->tss_esp); + SETREG(ctx, vcpu, VM_REG_GUEST_RBP, tss->tss_ebp); + SETREG(ctx, vcpu, VM_REG_GUEST_RSI, tss->tss_esi); + SETREG(ctx, vcpu, VM_REG_GUEST_RDI, tss->tss_edi); + + /* Segment selectors */ + SETREG(ctx, vcpu, VM_REG_GUEST_ES, tss->tss_es); + SETREG(ctx, vcpu, VM_REG_GUEST_CS, tss->tss_cs); + SETREG(ctx, vcpu, VM_REG_GUEST_SS, tss->tss_ss); + SETREG(ctx, vcpu, VM_REG_GUEST_DS, tss->tss_ds); + SETREG(ctx, vcpu, VM_REG_GUEST_FS, tss->tss_fs); + SETREG(ctx, vcpu, VM_REG_GUEST_GS, tss->tss_gs); + + /* + * If this is a nested task then write out the new TSS to update + * the previous link field. + */ + if (nested) + vm_copyout(ctx, vcpu, tss, iov, sizeof(*tss)); + + /* Validate segment descriptors */ + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_LDTR, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_LDTR, &seg_desc); + + /* + * Section "Checks on Guest Segment Registers", Intel SDM, Vol 3. + * + * The SS and CS attribute checks on VM-entry are inter-dependent so + * we need to make sure that both segments are valid before updating + * either of them. This ensures that the VMCS state can pass the + * VM-entry checks so the guest can handle any exception injected + * during task switch emulation. + */ + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_CS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_SS, &seg_desc2, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_CS, &seg_desc); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc2); + ts->paging.cpl = tss->tss_cs & SEL_RPL_MASK; + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_DS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_DS, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_ES, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_ES, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_FS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_FS, &seg_desc); + + error = validate_seg_desc(ctx, vcpu, ts, VM_REG_GUEST_GS, &seg_desc, + faultptr); + if (error || *faultptr) + return (error); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_GS, &seg_desc); + + return (0); +} + +/* + * Push an error code on the stack of the new task. This is needed if the + * task switch was triggered by a hardware exception that causes an error + * code to be saved (e.g. #PF). 
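+	 * (Editor's note: the x86 exceptions that push an error code are #DF, #TS, #NP, #SS, #GP, #PF and #AC.)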
+ */ +static int +push_errcode(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + int task_type, uint32_t errcode, int *faultptr) +{ + struct iovec iov[2]; + struct seg_desc seg_desc; + int stacksize, bytes, error; + uint64_t gla, cr0, rflags; + uint32_t esp; + uint16_t stacksel; + + *faultptr = 0; + + cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); + rflags = GETREG(ctx, vcpu, VM_REG_GUEST_RFLAGS); + stacksel = GETREG(ctx, vcpu, VM_REG_GUEST_SS); + + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS, &seg_desc.base, + &seg_desc.limit, &seg_desc.access); + assert(error == 0); + + /* + * Section "Error Code" in the Intel SDM vol 3: the error code is + * pushed on the stack as a doubleword or word (depending on the + * default interrupt, trap or task gate size). + */ + if (task_type == SDT_SYS386BSY || task_type == SDT_SYS386TSS) + bytes = 4; + else + bytes = 2; + + /* + * PUSH instruction from Intel SDM vol 2: the 'B' flag in the + * stack-segment descriptor determines the size of the stack + * pointer outside of 64-bit mode. + */ + if (SEG_DESC_DEF32(seg_desc.access)) + stacksize = 4; + else + stacksize = 2; + + esp = GETREG(ctx, vcpu, VM_REG_GUEST_RSP); + esp -= bytes; + + if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, + &seg_desc, esp, bytes, stacksize, PROT_WRITE, &gla)) { + sel_exception(ctx, vcpu, IDT_SS, stacksel, 1); + *faultptr = 1; + return (0); + } + + if (vie_alignment_check(paging->cpl, bytes, cr0, rflags, gla)) { + vm_inject_ac(ctx, vcpu, 1); + *faultptr = 1; + return (0); + } + + error = vm_copy_setup(ctx, vcpu, paging, gla, bytes, PROT_WRITE, + iov, nitems(iov), faultptr); + if (error || *faultptr) + return (error); + + vm_copyout(ctx, vcpu, &errcode, iov, bytes); + SETREG(ctx, vcpu, VM_REG_GUEST_RSP, esp); + return (0); +} + +/* + * Evaluate return value from helper functions and potentially return to + * the VM run loop. + */ +#define CHKERR(error,fault) \ + do { \ + assert((error == 0) || (error == EFAULT)); \ + if (error) \ + return (VMEXIT_ABORT); \ + else if (fault) \ + return (VMEXIT_CONTINUE); \ + } while (0) + +int +vmexit_task_switch(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu) +{ + struct seg_desc nt; + struct tss32 oldtss, newtss; + struct vm_task_switch *task_switch; + struct vm_guest_paging *paging, sup_paging; + struct user_segment_descriptor nt_desc, ot_desc; + struct iovec nt_iov[2], ot_iov[2]; + uint64_t cr0, ot_base; + uint32_t eip, ot_lim, access; + int error, ext, fault, minlimit, nt_type, ot_type, vcpu; + enum task_switch_reason reason; + uint16_t nt_sel, ot_sel; + + task_switch = &vmexit->u.task_switch; + nt_sel = task_switch->tsssel; + ext = vmexit->u.task_switch.ext; + reason = vmexit->u.task_switch.reason; + paging = &vmexit->u.task_switch.paging; + vcpu = *pvcpu; + + assert(paging->cpu_mode == CPU_MODE_PROTECTED); + + /* + * Calculate the instruction pointer to store in the old TSS. + */ + eip = vmexit->rip + vmexit->inst_length; + + /* + * Section 4.6, "Access Rights" in Intel SDM Vol 3. 
+ * The following page table accesses are implicitly supervisor mode: + * - accesses to GDT or LDT to load segment descriptors + * - accesses to the task state segment during task switch + */ + sup_paging = *paging; + sup_paging.cpl = 0; /* implicit supervisor mode */ + + /* Fetch the new TSS descriptor */ + error = read_tss_descriptor(ctx, vcpu, task_switch, nt_sel, &nt_desc, + &fault); + CHKERR(error, fault); + + nt = usd_to_seg_desc(&nt_desc); + + /* Verify the type of the new TSS */ + nt_type = SEG_DESC_TYPE(nt.access); + if (nt_type != SDT_SYS386BSY && nt_type != SDT_SYS386TSS && + nt_type != SDT_SYS286BSY && nt_type != SDT_SYS286TSS) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* TSS descriptor must have present bit set */ + if (!SEG_DESC_PRESENT(nt.access)) { + sel_exception(ctx, vcpu, IDT_NP, nt_sel, ext); + goto done; + } + + /* + * TSS must have a minimum length of 104 bytes for a 32-bit TSS and + * 44 bytes for a 16-bit TSS. + */ + if (nt_type == SDT_SYS386BSY || nt_type == SDT_SYS386TSS) + minlimit = 104 - 1; + else if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) + minlimit = 44 - 1; + else + minlimit = 0; + + assert(minlimit > 0); + if (nt.limit < minlimit) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* TSS must be busy if task switch is due to IRET */ + if (reason == TSR_IRET && !TSS_BUSY(nt_type)) { + sel_exception(ctx, vcpu, IDT_TS, nt_sel, ext); + goto done; + } + + /* + * TSS must be available (not busy) if task switch reason is + * CALL, JMP, exception or interrupt. + */ + if (reason != TSR_IRET && TSS_BUSY(nt_type)) { + sel_exception(ctx, vcpu, IDT_GP, nt_sel, ext); + goto done; + } + + /* Fetch the new TSS */ + error = vm_copy_setup(ctx, vcpu, &sup_paging, nt.base, minlimit + 1, + PROT_READ | PROT_WRITE, nt_iov, nitems(nt_iov), &fault); + CHKERR(error, fault); + vm_copyin(ctx, vcpu, nt_iov, &newtss, minlimit + 1); + + /* Get the old TSS selector from the guest's task register */ + ot_sel = GETREG(ctx, vcpu, VM_REG_GUEST_TR); + if (ISLDT(ot_sel) || IDXSEL(ot_sel) == 0) { + /* + * This might happen if a task switch was attempted without + * ever loading the task register with LTR. In this case the + * TR would contain the values from power-on: + * (sel = 0, base = 0, limit = 0xffff). + */ + sel_exception(ctx, vcpu, IDT_TS, ot_sel, task_switch->ext); + goto done; + } + + /* Get the old TSS base and limit from the guest's task register */ + error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR, &ot_base, &ot_lim, + &access); + assert(error == 0); + assert(!SEG_DESC_UNUSABLE(access) && SEG_DESC_PRESENT(access)); + ot_type = SEG_DESC_TYPE(access); + assert(ot_type == SDT_SYS386BSY || ot_type == SDT_SYS286BSY); + + /* Fetch the old TSS descriptor */ + error = read_tss_descriptor(ctx, vcpu, task_switch, ot_sel, &ot_desc, + &fault); + CHKERR(error, fault); + + /* Get the old TSS */ + error = vm_copy_setup(ctx, vcpu, &sup_paging, ot_base, minlimit + 1, + PROT_READ | PROT_WRITE, ot_iov, nitems(ot_iov), &fault); + CHKERR(error, fault); + vm_copyin(ctx, vcpu, ot_iov, &oldtss, minlimit + 1); + + /* + * Clear the busy bit in the old TSS descriptor if the task switch was + * due to an IRET or JMP instruction. 
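+	 * (Editor's note: CALL, interrupt and exception task switches intentionally leave the old TSS marked busy, so that the nested task's IRET can switch back into it.)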
+ */ + if (reason == TSR_IRET || reason == TSR_JMP) { + ot_desc.sd_type &= ~0x2; + error = desc_table_write(ctx, vcpu, &sup_paging, ot_sel, + &ot_desc, &fault); + CHKERR(error, fault); + } + + if (nt_type == SDT_SYS286BSY || nt_type == SDT_SYS286TSS) { + fprintf(stderr, "Task switch to 16-bit TSS not supported\n"); + return (VMEXIT_ABORT); + } + + /* Save processor state in old TSS */ + tss32_save(ctx, vcpu, task_switch, eip, &oldtss, ot_iov); + + /* + * If the task switch was triggered for any reason other than IRET + * then set the busy bit in the new TSS descriptor. + */ + if (reason != TSR_IRET) { + nt_desc.sd_type |= 0x2; + error = desc_table_write(ctx, vcpu, &sup_paging, nt_sel, + &nt_desc, &fault); + CHKERR(error, fault); + } + + /* Update task register to point at the new TSS */ + SETREG(ctx, vcpu, VM_REG_GUEST_TR, nt_sel); + + /* Update the hidden descriptor state of the task register */ + nt = usd_to_seg_desc(&nt_desc); + update_seg_desc(ctx, vcpu, VM_REG_GUEST_TR, &nt); + + /* Set CR0.TS */ + cr0 = GETREG(ctx, vcpu, VM_REG_GUEST_CR0); + SETREG(ctx, vcpu, VM_REG_GUEST_CR0, cr0 | CR0_TS); + + /* + * We are now committed to the task switch. Any exceptions encountered + * after this point will be handled in the context of the new task and + * the saved instruction pointer will belong to the new task. + */ + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, newtss.tss_eip); + assert(error == 0); + + /* Load processor state from new TSS */ + error = tss32_restore(ctx, vcpu, task_switch, ot_sel, &newtss, nt_iov, + &fault); + CHKERR(error, fault); + + /* + * Section "Interrupt Tasks" in Intel SDM, Vol 3: if an exception + * caused an error code to be generated, this error code is copied + * to the stack of the new task. + */ + if (task_switch->errcode_valid) { + assert(task_switch->ext); + assert(task_switch->reason == TSR_IDT_GATE); + error = push_errcode(ctx, vcpu, &task_switch->paging, nt_type, + task_switch->errcode, &fault); + CHKERR(error, fault); + } + + /* + * Treatment of virtual-NMI blocking if NMI is delivered through + * a task gate. + * + * Section "Architectural State Before A VM Exit", Intel SDM, Vol3: + * If the virtual NMIs VM-execution control is 1, VM entry injects + * an NMI, and delivery of the NMI causes a task switch that causes + * a VM exit, virtual-NMI blocking is in effect before the VM exit + * commences. + * + * Thus, virtual-NMI blocking is in effect at the time of the task + * switch VM exit. + */ + + /* + * Treatment of virtual-NMI unblocking on IRET from NMI handler task. + * + * Section "Changes to Instruction Behavior in VMX Non-Root Operation" + * If "virtual NMIs" control is 1 IRET removes any virtual-NMI blocking. + * This unblocking of virtual-NMI occurs even if IRET causes a fault. + * + * Thus, virtual-NMI blocking is cleared at the time of the task switch + * VM exit. + */ + + /* + * If the task switch was triggered by an event delivered through + * the IDT then extinguish the pending event from the vcpu's + * exitintinfo. + */ + if (task_switch->reason == TSR_IDT_GATE) { + error = vm_set_intinfo(ctx, vcpu, 0); + assert(error == 0); + } + + /* + * XXX should inject debug exception if 'T' bit is 1 + */ +done: + return (VMEXIT_CONTINUE); +} diff --git a/usr/src/cmd/bhyveload-uefi/i386/Makefile b/usr/src/cmd/bhyve/test/Makefile index f5b7bb6915..7dbee0c5f3 100644 --- a/usr/src/cmd/bhyveload-uefi/i386/Makefile +++ b/usr/src/cmd/bhyve/test/Makefile @@ -10,9 +10,9 @@ # # -# Copyright 2013 Pluribus Networks Inc. +# Copyright 2018 Joyent, Inc. 
# -include ../Makefile.com +SUBDIRS = scripts tst -install: all $(ROOTUSRSBINPROG32) +include Makefile.subdirs diff --git a/usr/src/cmd/bhyve/test/Makefile.com b/usr/src/cmd/bhyve/test/Makefile.com new file mode 100644 index 0000000000..f5efacc510 --- /dev/null +++ b/usr/src/cmd/bhyve/test/Makefile.com @@ -0,0 +1,61 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +include $(SRC)/Makefile.master +include $(SRC)/cmd/Makefile.cmd +include $(SRC)/cmd/Makefile.cmd.64 + +# +# Force c99 for everything +# +CSTD= $(CSTD_GNU99) +C99MODE= -xc99=%all +C99LMODE= -Xc99=%all + +CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration \ + -_gcc=-Wno-parentheses +CFLAGS64 += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration \ + -_gcc=-Wno-parentheses +CPPFLAGS = -I$(SRC)/cmd/bhyve \ + -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + -I$(CONTRIB)/freebsd/dev/usb/controller \ + -I$(CONTRIB)/freebsd/dev/mii \ + $(CPPFLAGS.master) \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/common \ + -I$(SRC)/uts/i86pc \ + -I$(SRC)/lib/libdladm/common \ + -DWITHOUT_CAPSICUM +CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 + +SMOFF += all_func_returns + +CLEANFILES += $(EXETESTS) +CLOBBERFILES += $(ROOTTESTS) + +# +# Install related definitions +# +ROOTOPTPKG = $(ROOT)/opt/bhyvetest +ROOTBIN = $(ROOTOPTPKG)/bin +ROOTTST = $(ROOTOPTPKG)/tst +ROOTTSTDIR = $(ROOTTST)/$(TSTDIR) +ROOTTSTEXES = $(EXETESTS:%=$(ROOTTSTDIR)/%) +ROOTTSTSH = $(SHTESTS:%=$(ROOTTSTDIR)/%) +ROOTOUT = $(OUTFILES:%=$(ROOTTSTDIR)/%) +ROOTTESTS = $(ROOTTSTEXES) $(ROOTTSTSH) $(ROOTOUT) +FILEMODE = 0555 +LDLIBS = $(LDLIBS.cmd) +LINTEXE = $(EXETESTS:%.exe=%.exe.ln) diff --git a/usr/src/cmd/bhyve/amd64/Makefile b/usr/src/cmd/bhyve/test/Makefile.subdirs index 13cdae6663..45f0aa67fa 100644 --- a/usr/src/cmd/bhyve/amd64/Makefile +++ b/usr/src/cmd/bhyve/test/Makefile.subdirs @@ -10,12 +10,20 @@ # # -# Copyright 2015 Pluribus Networks Inc. +# Copyright 2018 Joyent, Inc. # -include ../Makefile.com -include ../../Makefile.cmd.64 +.KEEP_STATE: -CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 +all := TARGET += all +clean := TARGET += clean +clobber := TARGET += clobber +install := TARGET += install +lint := TARGET += lint -install: all $(ROOTUSRSBINPROG64) +all clean clobber install lint: $(SUBDIRS) + +$(SUBDIRS): FRC + @cd $@; pwd; $(MAKE) $(TARGET) + +FRC: diff --git a/usr/src/cmd/bhyve/test/Makefile.targ b/usr/src/cmd/bhyve/test/Makefile.targ new file mode 100644 index 0000000000..e3ec55cfdb --- /dev/null +++ b/usr/src/cmd/bhyve/test/Makefile.targ @@ -0,0 +1,55 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. 
+# + +$(ROOTOPTPKG): + $(INS.dir) + +$(ROOTBIN): $(ROOTOPTPKG) + $(INS.dir) + +$(ROOTBIN)/%: %.ksh $(ROOTBIN) + $(INS.rename) + +$(ROOTTST): $(ROOTOPTPKG) + $(INS.dir) + +$(ROOTTSTDIR): $(ROOTTST) + $(INS.dir) + +$(ROOTTSTDIR)/%.ksh: %.ksh $(ROOTTSTDIR) + $(INS.file) + +$(ROOTTSTDIR)/%.out: %.out $(ROOTTSTDIR) + $(INS.file) + +%.exe: %.o $(SUPOBJS) + $(LINK.c) -o $@ $< $(SUPOBJS) $(LDLIBS) + $(POST_PROCESS) + +$(ROOTTSTDIR)/%.exe: %.exe $(ROOTTSTDIR) + $(INS.file) + +all: install + +%.exe.ln: %.c $(SUPOBJS) + $(LINT.c) $< $(LDLIBS) + +lint: $(LINTEXE) + +clean: + -$(RM) *.o $(CLEANFILES) + +clobber: clean + -$(RM) $(CLOBBERFILES) diff --git a/usr/src/cmd/bhyveload-uefi/amd64/Makefile b/usr/src/cmd/bhyve/test/scripts/Makefile index b602c50d05..d28a5edb8f 100644 --- a/usr/src/cmd/bhyveload-uefi/amd64/Makefile +++ b/usr/src/cmd/bhyve/test/scripts/Makefile @@ -10,12 +10,19 @@ # # -# Copyright 2013 Pluribus Networks Inc. +# Copyright 2018 Joyent, Inc. # include ../Makefile.com -include ../../Makefile.cmd.64 -CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 +SRCS = bhyvetest +SCRIPTS = $(SRCS:%=$(ROOTBIN)/%) -install: all $(ROOTUSRSBINPROG64) +SCRIPTS := FILEMODE = 0555 +CLOBBERFILES = $(SCRIPTS) + +install: $(SCRIPTS) + +lint: + +include ../Makefile.targ diff --git a/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh b/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh new file mode 100644 index 0000000000..95b7743417 --- /dev/null +++ b/usr/src/cmd/bhyve/test/scripts/bhyvetest.ksh @@ -0,0 +1,231 @@ +#!/bin/ksh +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2018 Joyent, Inc. +# + +# +# bhyve test suite driver +# +unalias -a + +bt_arg0=$(basename $0) +bt_root="$(cd $(dirname $0)/..; pwd -P)" +bt_ksh="/usr/bin/ksh" +bt_outdir= +bt_keep= +bt_all= +bt_tnum=0 +bt_tfail=0 +bt_tsuc=0 + +function usage +{ + typeset msg="$*" + [[ -z "$msg" ]] || echo "$msg" >&2 + cat <<USAGE >&2 +Usage: $bt_arg0 [ -o dir ] [ -k ] [ -a | test ... ] + + -o dir Sets 'dir' as the output directory + -a Runs all tests, ignores tests passed in + -k Keep output from all tests, not just failures +USAGE + exit 2 +} + +function fatal +{ + typeset msg="$*" + [[ -z "$msg" ]] && msg="failed" + echo "$bt_arg0: $msg" >&2 + exit 1 +} + +function setup_outdir +{ + bt_outdir="$bt_outdir/$bt_arg0.$$" + mkdir -p $bt_outdir || fatal "failed to make output dir $bt_outdir" +} + +function run_single +{ + typeset name=$1 + typeset expect base ext exe command odir res reason + typeset iserr + + [[ -z "$name" ]] && fatal "missing test to run" + base=${name##*/} + ext=${base##*.} + expect=${base%%.*} + odir="$bt_outdir/current" + [[ -z "$ext" ]] && fatal "found test without ext: $name" + [[ -z "$expect" ]] && fatal "found test without prefix: $name" + + if [[ "$expect" == "err" || "$expect" == "ecreate" ]]; then + iserr="yup" + else + iserr="" + fi + + case "$ext" in + "ksh") + command="$bt_ksh ./$base" + ;; + "exe") + command="./$base" + ;; + "out") + # + # This is the file format for checking output against. + # + return 0 + ;; + *) + echo "skipping test $name (unknown extension)" + return 0 + ;; + esac + + echo "Executing test $name ... 
\c" + mkdir -p "$odir" >/dev/null || fatal "can't make output directory" + cd $(dirname $name) || fatal "failed to enter test directory" + $command > "$odir/stdout" 2>"$odir/stderr" + res=$? + cd - > /dev/null || fatal "failed to leave test directory" + + if [[ -f "$name.out" ]] && \ + ! diff "$name.out" "$odir/stdout" >/dev/null; then + cp $name.out $odir/$base.out + reason="stdout mismatch" + elif [[ -n "$iserr" && $res -eq 0 ]]; then + reason="test exited $res, not non-zero" + elif [[ -z "$iserr" && $res -ne 0 ]]; then + reason="test exited $res, not zero" + fi + + if [[ -n "$reason" ]]; then + echo "$reason" + ((bt_tfail++)) + mv "$odir" "$bt_outdir/failure.$bt_tfail" || fatal \ + "failed to move test output directory" + cp "$name" "$bt_outdir/failure.$bt_tfail/$(basename $name)" || \ + fatal "failed to copy test into output directory" + else + echo "passed" + ((bt_tsuc++)) + mv "$odir" "$bt_outdir/success.$bt_tsuc" || fatal \ + "failed to move test directory" + fi + + ((bt_tnum++)) +} + +function run_all +{ + typeset tests t dir + + tests=$(ls -1 $bt_root/tst/*/*.@(ksh|exe)) + for t in $tests; do + run_single $t + done +} + +function welcome +{ + cat <<WELCOME +Starting tests... +output directory: $bt_outdir +WELCOME +} + +function cleanup +{ + [[ -n "$bt_keep" ]] && return + rm -rf "$bt_outdir"/success.* || fatal \ + "failed to remove successful test cases" + if [[ $bt_tfail -eq 0 ]]; then + rmdir "$bt_outdir" || fatal \ + "failed to remove test output directory" + fi +} + +function goodbye +{ + cat <<EOF + +------------- +Results +------------- + +Tests passed: $bt_tsuc +Tests failed: $bt_tfail +Tests ran: $bt_tnum + +EOF + if [[ $bt_tfail -eq 0 ]]; then + echo "Congrats, some tiny parts of bhyve aren't completely" \ + "broken, the tests pass". + else + echo "Some tests failed, you have some work to do." + fi +} + +while getopts ":ahko:m:" c $@; do + case "$c" in + a) + bt_all="y" + ;; + k) + bt_keep="y" + ;; + o) + bt_outdir="$OPTARG" + ;; + h) + usage + ;; + :) + usage "option requires an argument -- $OPTARG" + ;; + *) + usage "invalid option -- $OPTARG" + ;; + esac +done + +shift $((OPTIND-1)) + +[[ -z "$bt_all" && $# == 0 ]] && usage "no tests to run" + +[[ -z "$bt_outdir" ]] && bt_outdir="$PWD" + +setup_outdir +welcome + +if [[ ! -z "$bt_all" ]]; then + run_all +else + for t in $@; do + [[ -f $t ]] || fatal "cannot find test $t" + run_single $t + done +fi + +goodbye +cleanup + +# +# Exit 1 if we have tests that return non-zero +# +[[ $bt_tfai -eq 0 ]] diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/Makefile b/usr/src/cmd/bhyve/test/tst/Makefile index bf9219b435..f6a6ec96fc 100644 --- a/usr/src/cmd/mdb/intel/amd64/vmm/Makefile +++ b/usr/src/cmd/bhyve/test/tst/Makefile @@ -10,11 +10,9 @@ # # -# Copyright 2014 Pluribus Networks Inc. +# Copyright 2018 Joyent, Inc. # -MAKEVARS = CW_NO_SHADOW=true __GNUC= +SUBDIRS = mevent -include $(SRC)/Makefile.master -$(BUILD64)SUBDIRS += $(MACH64) -include ../../../Makefile.subdirs +include ../Makefile.subdirs diff --git a/usr/src/cmd/bhyvectl/amd64/Makefile b/usr/src/cmd/bhyve/test/tst/mevent/Makefile index b602c50d05..047886bc6a 100644 --- a/usr/src/cmd/bhyvectl/amd64/Makefile +++ b/usr/src/cmd/bhyve/test/tst/mevent/Makefile @@ -10,12 +10,21 @@ # # -# Copyright 2013 Pluribus Networks Inc. +# Copyright 2018 Joyent, Inc. 
# -include ../Makefile.com -include ../../Makefile.cmd.64 +TSTDIR = mevent +EXETESTS = \ + lists.delete.exe \ + read.disable.exe \ + read.pause.exe \ + read.requeue.exe \ -CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 +SHTESTS = +SUPOBJS = mevent.o testlib.o -install: all $(ROOTUSRSBINPROG64) +include ../../Makefile.com + +install: $(ROOTTESTS) + +include ../../Makefile.targ diff --git a/usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c b/usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c new file mode 100644 index 0000000000..c5ed91a790 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/lists.delete.c @@ -0,0 +1,172 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Test: lists.delete + * Assertion: mevent_delete() causes the total number of events to decrease + * + * Strategy: 1. Create a pipe. + * 2. Call mevent_add() to be notified of writes to the pipe. The + * callback will do nothing other than generate an error if it + * is called. + * 3. Create another pipe and add a read event watcher to it. The + * callback will signal a cv when called. A write to the pipe + * followed by a wait on the cv will ensure that async + * operations in mevent.c are complete. See flush_and_wait(). + * 4. Call flush_and_wait(), then get event count. + * 5. Delete the event created in step 2. + * 6. Call flush_and_wait(), then get event count. + * 7. Verify result in step 6 is one less than result in step 4. + */ + +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <unistd.h> + +#include <sys/types.h> +#include <sys/stat.h> + +#include "testlib.h" +#include "mevent.h" + +static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; + +static int +get_count(void) +{ + int global = -1, change = -1, del_pending = -1; + int total; + + test_mevent_count_lists(&global, &change, &del_pending); + ASSERT_INT_NEQ(("count not set"), global, -1); + ASSERT_INT_NEQ(("count not set"), change, -1); + ASSERT_INT_NEQ(("count not set"), del_pending, -1); + ASSERT_INT_EQ(("pending delete not processed"), del_pending, 0); + + total = global + change + del_pending; + + VERBOSE(("count = %d (%d + %d + %d)", total, global, change, + del_pending)); + + return (total); +} + +static void +not_called_cb(int fd, enum ev_type ev, void *arg) +{ + FAIL(("this callback should never be called")); +} + +static void +flush_cb(int fd, enum ev_type ev, void *arg) +{ + char buf[32]; + + /* Drain the pipe */ + while (read(fd, buf, sizeof (buf)) > 0) + ; + + pthread_mutex_lock(&mtx); + pthread_cond_signal(&cv); + pthread_mutex_unlock(&mtx); +} + +void +flush_and_wait(int fd) +{ + uint8_t msg = 42; + + /* + * Lock taken ahead of waking flush_cb so this thread doesn't race + * with the event thread. 
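+	 * (Editor's note: if the write happened outside the lock, flush_cb() could drain the pipe and signal the cv before this thread reached pthread_cond_wait(), and the wakeup would be lost.)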
+ */ + pthread_mutex_lock(&mtx); + if (write(fd, &msg, sizeof (msg)) != sizeof (msg)) { + FAIL(("bad write")); + } + + /* Wait for it to be read */ + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); +} + +int +main(int argc, const char *argv[]) +{ + int unused_pipe[2]; + int flush_pipe[2]; + struct mevent *unused_evp, *flush_evp; + int count1, count2; + + start_test(argv[0], 5); + start_event_thread(); + + /* + * Create first pipe and related event + */ + if (pipe(unused_pipe) != 0) { + FAIL_ERRNO("pipe"); + } + VERBOSE(("unused_pipe[] = { %d, %d }", unused_pipe[0], unused_pipe[1])); + if (fcntl(unused_pipe[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + unused_evp = mevent_add(unused_pipe[0], EVF_READ, not_called_cb, NULL); + ASSERT_PTR_NEQ(("mevent_add"), unused_evp, NULL); + + /* + * Create flush pipe and related event + */ + if (pipe(flush_pipe) != 0) { + FAIL_ERRNO("pipe"); + } + VERBOSE(("flush_pipe[] = { %d, %d }", flush_pipe[0], + flush_pipe[1])); + if (fcntl(flush_pipe[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + flush_evp = mevent_add(flush_pipe[0], EVF_READ, flush_cb, NULL); + ASSERT_PTR_NEQ(("mevent_add"), flush_evp, NULL); + + /* Get count before delete. */ + flush_and_wait(flush_pipe[1]); + count1 = get_count(); + + /* + * Delete the first event and flush a read after the delete is + * complete. + */ + if (mevent_delete(unused_evp) != 0) { + FAIL_ERRNO("mevent_delete"); + } + + /* + * Verify count decreased. + */ + flush_and_wait(flush_pipe[1]); + count2 = get_count(); + if (count1 - 1 != count2) { + FAIL(("mevent_delete() did not decrease count by 1: " + "was %d, now %d", count1, count2)); + } + + PASS(); +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/mevent.c b/usr/src/cmd/bhyve/test/tst/mevent/mevent.c new file mode 100644 index 0000000000..17b6546847 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/mevent.c @@ -0,0 +1,57 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include "../../../mevent.c" +#include "testlib.h" + +/* + * Returns by reference the number of events on the global and change lists. + * + * Used by tests that wish to ensure that the event count changes as suggested + * by mevent_add() and mevent_delete(). Note that a delete does not immediately + * delete an event. Events that are pending delete are included in the change + * list until the next pass through the change list to process pending changes. 
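+ *
+ * A sketch of the intended call pattern (mirroring lists.delete.c; the
+ * variable names here are illustrative only):
+ *
+ *	int global, change, del_pending, total;
+ *
+ *	test_mevent_count_lists(&global, &change, &del_pending);
+ *	total = global + change + del_pending;
+ *
+ * A successful mevent_delete() should lower this total by one once the
+ * event thread has made a pass over the change list.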
+ */
+void
+test_mevent_count_lists(int *ret_global, int *ret_change, int *ret_del_pending)
+{
+	struct mevent *mevp;
+	int global = 0;
+	int change = 0;
+	int del_pending = 0;
+
+	mevent_qlock();
+
+	LIST_FOREACH(mevp, &global_head, me_list) {
+		global++;
+		VERBOSE(("on global: type %d fd %d state %d", mevp->me_type,
+		    mevp->me_fd, mevp->me_state));
+	}
+
+	LIST_FOREACH(mevp, &change_head, me_list) {
+		change++;
+		if (mevp->me_state == MEV_DEL_PENDING) {
+			del_pending++;
+		}
+		VERBOSE(("on change: type %d fd %d state %d", mevp->me_type,
+		    mevp->me_fd, mevp->me_state));
+	}
+
+	mevent_qunlock();
+
+	*ret_global = global;
+	*ret_change = change;
+	*ret_del_pending = del_pending;
+}
diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c b/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c
new file mode 100644
index 0000000000..d23b1af96c
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/mevent/read.disable.c
@@ -0,0 +1,163 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * Test: read.disable
+ * Assertion: A read is not requeued if mevent_disable() is called while it is
+ *	      being handled.
+ *
+ * Strategy: 1. Create a pipe
+ *	     2. Call mevent_add() to be notified of writes to the pipe. The
+ *		callback will signal a cv.
+ *	     3. Write to the pipe then wait for a wakeup.
+ *	     4. From the read event callback, disable the event then awaken
+ *		the main thread.
+ *	     5. In the main thread, add a timer event that will awaken the
+ *		main thread after a short delay.
+ *	     6. Write to the pipe and wait to be awoken. The wakeup should
+ *		come from the timer event, not the read event.
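+ *
+ * In sketch form, the pattern exercised (munch() below is the actual
+ * callback) is:
+ *
+ *	static void
+ *	cb(int fd, enum ev_type ev, void *arg)
+ *	{
+ *		(void) read(fd, buf, sizeof (buf));
+ *		(void) mevent_disable(read_event);
+ *	}
+ *
+ * After the disable, further writes to the pipe must not invoke the
+ * callback again.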
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "testlib.h"
+#include "mevent.h"
+
+typedef enum {
+	CB_NONE,
+	CB_READ,
+	CB_TIMER,
+} lastwake_t;
+
+static lastwake_t lastwake = CB_NONE;
+
+static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
+
+static struct mevent *read_event;
+
+static void
+munch(int fd, enum ev_type ev, void *arg)
+{
+	ssize_t nbytes;
+	char buf[32] = { 0 };
+	int err;
+
+	if ((nbytes = read(fd, buf, sizeof (buf))) < 0) {
+		FAIL_ERRNO("bad read");
+	}
+	VERBOSE(("read %ld bytes '%s'", nbytes, buf));
+
+	err = mevent_disable(read_event);
+	ASSERT_INT_EQ(("mevent_disable: %s", strerror(err)), err, 0);
+
+	pthread_mutex_lock(&mtx);
+
+	ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_NONE);
+	lastwake = CB_READ;
+
+	pthread_cond_signal(&cv);
+	VERBOSE(("wakeup"));
+
+	pthread_mutex_unlock(&mtx);
+}
+
+static void
+tick(int ms, enum ev_type ev, void *arg)
+{
+	pthread_mutex_lock(&mtx);
+
+	ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_READ);
+	lastwake = CB_TIMER;
+
+	pthread_cond_signal(&cv);
+	VERBOSE(("wakeup"));
+
+	pthread_mutex_unlock(&mtx);
+}
+
+int
+main(int argc, const char *argv[])
+{
+	int pipefds[2];
+	struct mevent *timer;
+	ssize_t written;
+	char *msgs[] = { "first", "second" };
+	char *msg;
+
+	start_test(argv[0], 5);
+	start_event_thread();
+
+	if (pipe(pipefds) != 0) {
+		FAIL_ERRNO("pipe");
+	}
+	if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) {
+		FAIL_ERRNO("set pipe nonblocking");
+	}
+
+	/*
+	 * First write
+	 */
+	msg = msgs[0];
+	read_event = mevent_add(pipefds[0], EVF_READ, munch, msg);
+	ASSERT_PTR_NEQ(("mevent_add pipefd"), read_event, NULL);
+
+	pthread_mutex_lock(&mtx);
+	written = write(pipefds[1], msg, strlen(msg));
+	if (written < 0) {
+		FAIL_ERRNO("bad write");
+	}
+	ASSERT_INT64_EQ(("write '%s' failed", msg), written, strlen(msg));
+
+	/*
+	 * Wait for it to be read
+	 */
+	pthread_cond_wait(&cv, &mtx);
+	ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_READ);
+	pthread_mutex_unlock(&mtx);
+
+	/*
+	 * Add timer, second write.
+	 */
+	msg = msgs[1];
+	timer = mevent_add(50, EVF_TIMER, tick, msg);
+	ASSERT_PTR_NEQ(("mevent_add timer"), timer, NULL);
+
+	pthread_mutex_lock(&mtx);
+	written = write(pipefds[1], msg, strlen(msg));
+	if (written < 0) {
+		FAIL_ERRNO("bad write");
+	}
+	ASSERT_INT64_EQ(("write '%s' failed", msg), written, strlen(msg));
+
+	/*
+	 * Wait for timer to expire
+	 */
+	pthread_cond_wait(&cv, &mtx);
+	ASSERT_INT_EQ(("wrong lastwake"), lastwake, CB_TIMER);
+	pthread_mutex_unlock(&mtx);
+
+	PASS();
+}
diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c b/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c
new file mode 100644
index 0000000000..c877f014f6
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/mevent/read.pause.c
@@ -0,0 +1,152 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2018 Joyent, Inc.
+ */
+
+/*
+ * Test: read.pause
+ * Assertion: mevent_disable() can be used to pause reads.
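+ *
+ * In sketch form, the pause/resume pairing exercised below is:
+ *
+ *	evp = mevent_add(fd, EVF_READ, cb, arg);
+ *	...
+ *	mevent_disable(evp);	<- reads queue up in the pipe
+ *	...
+ *	mevent_enable(evp);	<- a single callback drains the backlog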
+ *
+ * Strategy: 1. Create a pipe
+ *	     2. Call mevent_add() to be notified of writes to the pipe. The
+ *		callback will signal a cv.
+ *	     3. In a loop, write to the pipe then wait on the cv.
+ */
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+
+#include "testlib.h"
+#include "mevent.h"
+
+static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
+static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
+
+static char cookie[] = "Chocolate chip with fudge stripes";
+
+/*
+ * After this many bytes are sent, writes will get batched up; progress is
+ * then made on the write side via an interval timer.
+ */
+const int pauseat = 8;
+
+static void
+munch(int fd, enum ev_type ev, void *arg)
+{
+	static int i = 0;
+	char buf[sizeof (cookie)] = { 0 };
+	ssize_t nbytes;
+	ssize_t expected;
+
+	ASSERT_INT_EQ(("bad event"), ev, EVF_READ);
+	ASSERT_PTR_EQ(("bad cookie"), arg, cookie);
+
+	/*
+	 * For the first while, expect data to come a byte at a time. After
+	 * the pause, we should get a burst with the rest of the data.
+	 */
+	if (i > pauseat) {
+		expected = strlen(cookie) - pauseat - 1;
+	} else {
+		expected = 1;
+	}
+
+	if ((nbytes = read(fd, buf, sizeof (buf))) < 0) {
+		FAIL_ERRNO("bad read");
+	}
+	VERBOSE(("read %ld bytes '%s'", nbytes, buf));
+
+	ASSERT_INT64_EQ(("wanted a byte of cookie"), nbytes, expected);
+
+	if (expected == 1) {
+		ASSERT_CHAR_EQ(("bad byte %d of cookie", i), buf[0], cookie[i]);
+	} else {
+		ASSERT_STR_EQ(("bad last half of cookie"), buf, &cookie[i]);
+	}
+
+	pthread_mutex_lock(&mtx);
+	pthread_cond_signal(&cv);
+	VERBOSE(("wakeup"));
+	pthread_mutex_unlock(&mtx);
+
+	i++;
+}
+
+static void
+tick(int ms, enum ev_type ev, void *arg)
+{
+	pthread_mutex_lock(&mtx);
+	pthread_cond_signal(&cv);
+	VERBOSE(("wakeup"));
+	pthread_mutex_unlock(&mtx);
+}
+
+int
+main(int argc, const char *argv[])
+{
+	int pipefds[2];
+	struct mevent *evp, *timer;
+	ssize_t written;
+
+	start_test(argv[0], 5);
+	start_event_thread();
+
+	if (pipe(pipefds) != 0) {
+		FAIL_ERRNO("pipe");
+	}
+	if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) {
+		FAIL_ERRNO("set pipe nonblocking");
+	}
+
+	evp = mevent_add(pipefds[0], EVF_READ, munch, cookie);
+	ASSERT_PTR_NEQ(("mevent_add pipefd"), evp, NULL);
+
+	for (int i = 0; cookie[i] != 0; i++) {
+		pthread_mutex_lock(&mtx);
+		written = write(pipefds[1], cookie + i, 1);
+		if (written < 0) {
+			FAIL_ERRNO("bad write");
+		}
+		ASSERT_INT64_EQ(("write byte %d of cookie", i), written, 1);
+
+		/* Wait for it to be read */
+		pthread_cond_wait(&cv, &mtx);
+		pthread_mutex_unlock(&mtx);
+
+		if (i == pauseat) {
+			timer = mevent_add(10, EVF_TIMER, tick,
+			    &cookie[pauseat]);
+			ASSERT_PTR_NEQ(("mevent_add timer"), timer, NULL);
+			VERBOSE(("disable munch"));
+			mevent_disable(evp);
+		}
+	}
+
+	pthread_mutex_lock(&mtx);
+
+	mevent_enable(evp);
+
+	pthread_cond_wait(&cv, &mtx);
+	pthread_mutex_unlock(&mtx);
+
+	PASS();
+}
diff --git a/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c b/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c
new file mode 100644
index 0000000000..ddc3e27235
--- /dev/null
+++ b/usr/src/cmd/bhyve/test/tst/mevent/read.requeue.c
@@ -0,0 +1,108 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +/* + * Test: read.requeue + * Assertion: A sequence of writes turns into a sequence of events. + * + * Strategy: 1. Create a pipe + * 2. Call mevent_add() to be notified of writes to the pipe. The + * callback will signal a cv. + * 3. In a loop, write to the pipe then wait on the cv. + */ + +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> +#include <unistd.h> + +#include <sys/types.h> +#include <sys/stat.h> + +#include "testlib.h" +#include "mevent.h" + +static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t cv = PTHREAD_COND_INITIALIZER; + +static char *cookie = "Chocolate chip with fudge stripes"; + +static void +munch(int fd, enum ev_type ev, void *arg) +{ + static int i = 0; + char buf[8] = { 0 }; + ssize_t nbytes; + + ASSERT_INT_EQ(("bad event"), ev, EVF_READ); + ASSERT_PTR_EQ(("bad cookie"), arg, cookie); + + if ((nbytes = read(fd, buf, sizeof (buf))) < 0) { + ASSERT_INT64_EQ(("bad read: %s", strerror(errno)), nbytes, 1); + } + VERBOSE(("read %ld bytes '%s'", nbytes, buf)); + + ASSERT_INT64_EQ(("wanted a byte of cookie"), nbytes, 1); + + ASSERT_CHAR_EQ(("bad byte %d of cookie", i), buf[0], cookie[i]); + + pthread_mutex_lock(&mtx); + pthread_cond_signal(&cv); + VERBOSE(("wakeup")); + pthread_mutex_unlock(&mtx); + + i++; +} + +int +main(int argc, const char *argv[]) +{ + int pipefds[2]; + struct mevent *evp; + + start_test(argv[0], 5); + start_event_thread(); + + if (pipe(pipefds) != 0) { + FAIL_ERRNO("pipe"); + } + if (fcntl(pipefds[0], F_SETFL, O_NONBLOCK) != 0) { + FAIL_ERRNO("set pipe nonblocking"); + } + + evp = mevent_add(pipefds[0], EVF_READ, munch, cookie); + ASSERT_PTR_NEQ(("mevent_add"), evp, NULL); + + for (int i = 0; cookie[i] != '\0'; i++) { + ssize_t written; + + pthread_mutex_lock(&mtx); + written = write(pipefds[1], cookie + i, 1); + if (written < 0) { + FAIL_ERRNO("bad write"); + } + ASSERT_INT64_EQ(("write byte %d of cookie", i), written, 1); + + /* Wait for it to be read */ + pthread_cond_wait(&cv, &mtx); + pthread_mutex_unlock(&mtx); + } + + PASS(); +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/testlib.c b/usr/src/cmd/bhyve/test/tst/mevent/testlib.c new file mode 100644 index 0000000000..67261b9a31 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/testlib.c @@ -0,0 +1,70 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. 
+ */ + +#include <pthread.h> +#include <signal.h> +#include <strings.h> +#include <unistd.h> + +#include "testlib.h" +#include "mevent.h" + +const char *testlib_prog; +boolean_t testlib_verbose; + +static void +timed_out(int signo) +{ + ASSERT_INT_EQ(("timeout signal"), signo, SIGALRM); + + FAIL(("Timed out")); +} + +void +start_test(const char *argv0, uint32_t timeout) +{ + char *val; + + testlib_prog = strrchr(argv0, '/'); + if (testlib_prog == NULL) { + testlib_prog = argv0; + } else { + testlib_prog++; + } + + testlib_verbose = ((val = getenv("TEST_VERBOSE")) != NULL) && + val[0] != '\0'; + + signal(SIGALRM, timed_out); + alarm(timeout); +} + +/* ARGSUSED */ +static void * +event_thread(void *arg) +{ + mevent_dispatch(); + return (NULL); +} + +void +start_event_thread(void) +{ + pthread_t tid; + + if (pthread_create(&tid, NULL, event_thread, NULL) != 0) { + FAIL_ERRNO("pthread_create"); + } +} diff --git a/usr/src/cmd/bhyve/test/tst/mevent/testlib.h b/usr/src/cmd/bhyve/test/tst/mevent/testlib.h new file mode 100644 index 0000000000..7e5ca2e9c9 --- /dev/null +++ b/usr/src/cmd/bhyve/test/tst/mevent/testlib.h @@ -0,0 +1,93 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _TESTLIB_H_ +#define _TESTLIB_H_ + +#include <assert.h> +#include <errno.h> +#include <signal.h> +#include <stdio.h> +#include <stdlib.h> +#include <strings.h> + +#include <sys/types.h> +#include <sys/stat.h> + +#include "mevent.h" + +#define EXIT_PASS 0 +#define EXIT_FAIL 1 + +#define VERBOSE(msg) \ + if (testlib_verbose) { \ + (void) printf("VERBOSE %s: %s:%d %s: ", testlib_prog, \ + __FILE__, __LINE__, __func__); \ + (void) printf msg; \ + (void) printf("\n"); \ + } + +#define FAIL_PROLOGUE() \ + (void) printf("FAIL %s: %s:%d: ", testlib_prog, __FILE__, __LINE__) + +#define FAIL(msg) \ + { \ + FAIL_PROLOGUE(); \ + (void) printf msg; \ + (void) printf("\n"); \ + exit(EXIT_FAIL); \ + } + +#define FAIL_ERRNO(msg) FAIL((msg ": %s", strerror(errno))) + +#define PASS() \ + { \ + (void) printf("PASS %s\n", testlib_prog); \ + exit(EXIT_PASS); \ + } + +#define ASSERT_CMP(msg, got, cmp, exp, nfmt) \ + if (!(got cmp exp)) { \ + FAIL_PROLOGUE(); \ + (void) printf msg; \ + (void) printf(": %s=" nfmt " %s %s=" nfmt "\n", \ + #got, got, #cmp, #exp, exp); \ + exit(EXIT_FAIL); \ + } + +#define ASSERT_CHAR_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%c") +#define ASSERT_INT_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%d") +#define ASSERT_INT_NEQ(msg, got, exp) ASSERT_CMP(msg, got, !=, exp, "%d") +#define ASSERT_INT64_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%ld") +#define ASSERT_PTR_EQ(msg, got, exp) ASSERT_CMP(msg, got, ==, exp, "%p") +#define ASSERT_PTR_NEQ(msg, got, exp) ASSERT_CMP(msg, got, !=, exp, "%p") + +#define ASSERT_STR_EQ(msg, got, exp) \ + if (strcmp(got, exp) != 0) { \ + FAIL_PROLOGUE(); \ + (void) printf msg; \ + (void) printf(": %s='%s' != %s='%s'\n", \ + #got, got, #exp, exp); \ + exit(EXIT_FAIL); \ + } + +extern const char *testlib_prog; +extern boolean_t testlib_verbose; + +extern void start_test(const char *, uint32_t); +extern void start_event_thread(void); +extern void 
test_mevent_count_lists(int *, int *, int *); + +#endif /* _TESTLIB_H_ */ diff --git a/usr/src/cmd/bhyve/uart_emul.c b/usr/src/cmd/bhyve/uart_emul.c index a8b5d40356..c0fff61d00 100644 --- a/usr/src/cmd/bhyve/uart_emul.c +++ b/usr/src/cmd/bhyve/uart_emul.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * Copyright (c) 2013 Neel Natu <neel@freebsd.org> * All rights reserved. @@ -24,7 +26,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z neel $ + * $FreeBSD$ + * */ /* * This file and its contents are supplied under the terms of the @@ -37,46 +40,42 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/types.h> #include <dev/ic/ns16550.h> - -#ifndef __FreeBSD__ -#include <sys/socket.h> -#include <sys/stat.h> +#ifndef WITHOUT_CAPSICUM +#include <sys/capsicum.h> +#include <capsicum_helpers.h> #endif + #include <stdio.h> #include <stdlib.h> #include <assert.h> +#include <err.h> +#include <errno.h> +#include <fcntl.h> #include <termios.h> #include <unistd.h> #include <stdbool.h> #include <string.h> #include <pthread.h> +#include <sysexits.h> #ifndef __FreeBSD__ -#include <errno.h> -#include <fcntl.h> -#include <poll.h> +#include <sys/socket.h> #endif -#ifndef __FreeBSD__ -#include <bhyve.h> - -#include "bhyverun.h" -#endif -#ifdef __FreeBSD__ #include "mevent.h" -#endif #include "uart_emul.h" #define COM1_BASE 0x3F8 #define COM1_IRQ 4 -#define COM2_BASE 0x2F8 -#define COM2_IRQ 3 +#define COM2_BASE 0x2F8 +#define COM2_IRQ 3 #define DEFAULT_RCLK 1843200 #define DEFAULT_BAUD 9600 @@ -89,15 +88,13 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z #define MSR_DELTA_MASK 0x0f #ifndef REG_SCR -#define REG_SCR com_scr +#define REG_SCR com_scr #endif #define FIFOSZ 16 static bool uart_stdio; /* stdio in use for i/o */ -#ifndef __FreeBSD__ -static bool uart_bcons; /* bhyveconsole in use for i/o */ -#endif +static struct termios tio_stdio_orig; static struct { int baseaddr; @@ -118,9 +115,15 @@ struct fifo { int size; /* size of the fifo */ }; +struct ttyfd { + bool opened; + int rfd; /* fd for reading */ + int wfd; /* fd for writing, may be == rfd */ +}; + struct uart_softc { pthread_mutex_t mtx; /* protects all softc elements */ - uint8_t data; /* Data register (R/W) */ + uint8_t data; /* Data register (R/W) */ uint8_t ier; /* Interrupt enable register (R/W) */ uint8_t lcr; /* Line control register (R/W) */ uint8_t mcr; /* Modem control register (R/W) */ @@ -133,16 +136,16 @@ struct uart_softc { uint8_t dlh; /* Baudrate divisor latch MSB */ struct fifo rxfifo; + struct mevent *mev; - bool opened; - bool stdio; + struct ttyfd tty; #ifndef __FreeBSD__ - bool bcons; + bool sock; struct { - pid_t clipid; int clifd; /* console client unix domain socket */ int servfd; /* console server unix domain socket */ - } usc_bcons; + struct mevent *servmev; /* mevent for server socket */ + } usc_sock; #endif bool thre_int_pending; /* THRE interrupt pending */ @@ -152,140 +155,222 @@ struct uart_softc { uart_intr_func_t intr_deassert; }; -#ifdef __FreeBSD__ static void uart_drain(int fd, enum ev_type ev, void *arg); -#else -static void uart_tty_drain(struct uart_softc *sc); -static int 
uart_bcons_drain(struct uart_softc *sc); -#endif - -static struct termios tio_orig, tio_new; /* I/O Terminals */ static void ttyclose(void) { - tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig); + tcsetattr(STDIN_FILENO, TCSANOW, &tio_stdio_orig); } static void -ttyopen(void) +ttyopen(struct ttyfd *tf) { - - tcgetattr(STDIN_FILENO, &tio_orig); - - tio_new = tio_orig; - cfmakeraw(&tio_new); - tcsetattr(STDIN_FILENO, TCSANOW, &tio_new); - - atexit(ttyclose); -} - -static bool -tty_char_available(void) -{ - fd_set rfds; - struct timeval tv; - - FD_ZERO(&rfds); - FD_SET(STDIN_FILENO, &rfds); - tv.tv_sec = 0; - tv.tv_usec = 0; - if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0 ) { - return (true); - } else { - return (false); + struct termios orig, new; + + tcgetattr(tf->rfd, &orig); + new = orig; + cfmakeraw(&new); + new.c_cflag |= CLOCAL; + tcsetattr(tf->rfd, TCSANOW, &new); + if (uart_stdio) { + tio_stdio_orig = orig; + atexit(ttyclose); } } static int -ttyread(void) +ttyread(struct ttyfd *tf) { - char rb; + unsigned char rb; - if (tty_char_available()) { - read(STDIN_FILENO, &rb, 1); - return (rb & 0xff); - } else { + if (read(tf->rfd, &rb, 1) == 1) + return (rb); + else return (-1); - } } static void -ttywrite(unsigned char wb) +ttywrite(struct ttyfd *tf, unsigned char wb) { - (void)write(STDIN_FILENO, &wb, 1); + (void)write(tf->wfd, &wb, 1); } #ifndef __FreeBSD__ static void -bconswrite(struct uart_softc *sc, unsigned char wb) +sockwrite(struct uart_softc *sc, unsigned char wb) { - (void) write(sc->usc_bcons.clifd, &wb, 1); + (void) write(sc->usc_sock.clifd, &wb, 1); } #endif static void -fifo_reset(struct fifo *fifo, int size) +rxfifo_reset(struct uart_softc *sc, int size) { + char flushbuf[32]; + struct fifo *fifo; + ssize_t nread; + int error; + + fifo = &sc->rxfifo; bzero(fifo, sizeof(struct fifo)); fifo->size = size; + + if (sc->tty.opened) { + /* + * Flush any unread input from the tty buffer. + */ + while (1) { + nread = read(sc->tty.rfd, flushbuf, sizeof(flushbuf)); + if (nread != sizeof(flushbuf)) + break; + } + + /* + * Enable mevent to trigger when new characters are available + * on the tty fd. + */ + error = mevent_enable(sc->mev); + assert(error == 0); + } +#ifndef __FreeBSD__ + if (sc->sock && sc->usc_sock.clifd != -1) { + /* Flush any unread input from the socket buffer. */ + do { + nread = read(sc->usc_sock.clifd, flushbuf, + sizeof (flushbuf)); + } while (nread == sizeof (flushbuf)); + + /* Enable mevent to trigger when new data available on sock */ + error = mevent_enable(sc->mev); + assert(error == 0); + } +#endif /* __FreeBSD__ */ +} + +static int +rxfifo_available(struct uart_softc *sc) +{ + struct fifo *fifo; + + fifo = &sc->rxfifo; + return (fifo->num < fifo->size); } static int -fifo_putchar(struct fifo *fifo, uint8_t ch) +rxfifo_putchar(struct uart_softc *sc, uint8_t ch) { + struct fifo *fifo; + int error; + + fifo = &sc->rxfifo; if (fifo->num < fifo->size) { fifo->buf[fifo->windex] = ch; fifo->windex = (fifo->windex + 1) % fifo->size; fifo->num++; + if (!rxfifo_available(sc)) { + if (sc->tty.opened) { + /* + * Disable mevent callback if the FIFO is full. + */ + error = mevent_disable(sc->mev); + assert(error == 0); + } +#ifndef __FreeBSD__ + if (sc->sock && sc->usc_sock.clifd != -1) { + /* + * Disable mevent callback if the FIFO is full. 
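+			 * Reads are re-enabled from rxfifo_getchar() once
+			 * the guest consumes data and space opens up in the
+			 * FIFO.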
+ */ + error = mevent_disable(sc->mev); + assert(error == 0); + } +#endif /* __FreeBSD__ */ + } return (0); } else return (-1); } static int -fifo_getchar(struct fifo *fifo) +rxfifo_getchar(struct uart_softc *sc) { - int c; + struct fifo *fifo; + int c, error, wasfull; + wasfull = 0; + fifo = &sc->rxfifo; if (fifo->num > 0) { + if (!rxfifo_available(sc)) + wasfull = 1; c = fifo->buf[fifo->rindex]; fifo->rindex = (fifo->rindex + 1) % fifo->size; fifo->num--; + if (wasfull) { + if (sc->tty.opened) { + error = mevent_enable(sc->mev); + assert(error == 0); + } +#ifndef __FreeBSD__ + if (sc->sock && sc->usc_sock.clifd != -1) { + error = mevent_enable(sc->mev); + assert(error == 0); + } +#endif /* __FreeBSD__ */ + } return (c); } else return (-1); } static int -fifo_numchars(struct fifo *fifo) +rxfifo_numchars(struct uart_softc *sc) { + struct fifo *fifo = &sc->rxfifo; return (fifo->num); } -static int -fifo_available(struct fifo *fifo) +static void +uart_opentty(struct uart_softc *sc) { - return (fifo->num < fifo->size); + ttyopen(&sc->tty); + sc->mev = mevent_add(sc->tty.rfd, EVF_READ, uart_drain, sc); + assert(sc->mev != NULL); } -static void -uart_opentty(struct uart_softc *sc) +static uint8_t +modem_status(uint8_t mcr) { - struct mevent *mev; + uint8_t msr; - assert(!sc->opened && sc->stdio); + if (mcr & MCR_LOOPBACK) { + /* + * In the loopback mode certain bits from the MCR are + * reflected back into MSR. + */ + msr = 0; + if (mcr & MCR_RTS) + msr |= MSR_CTS; + if (mcr & MCR_DTR) + msr |= MSR_DSR; + if (mcr & MCR_OUT1) + msr |= MSR_RI; + if (mcr & MCR_OUT2) + msr |= MSR_DCD; + } else { + /* + * Always assert DCD and DSR so tty open doesn't block + * even if CLOCAL is turned off. + */ + msr = MSR_DCD | MSR_DSR; + } + assert((msr & MSR_DELTA_MASK) == 0); - ttyopen(); -#ifdef __FreeBSD__ - mev = mevent_add(STDIN_FILENO, EVF_READ, uart_drain, sc); -#endif - assert(mev); + return (msr); } /* @@ -302,7 +387,7 @@ uart_intr_reason(struct uart_softc *sc) if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0) return (IIR_RLS); - else if (fifo_numchars(&sc->rxfifo) > 0 && (sc->ier & IER_ERXRDY) != 0) + else if (rxfifo_numchars(sc) > 0 && (sc->ier & IER_ERXRDY) != 0) return (IIR_RXTOUT); else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0) return (IIR_TXRDY); @@ -319,9 +404,14 @@ uart_reset(struct uart_softc *sc) divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16; sc->dll = divisor; +#ifndef __FreeBSD__ + sc->dlh = 0; +#else sc->dlh = divisor >> 16; +#endif + sc->msr = modem_status(sc->mcr); - fifo_reset(&sc->rxfifo, 1); /* no fifo until enabled by software */ + rxfifo_reset(sc, 1); /* no fifo until enabled by software */ } /* @@ -341,7 +431,6 @@ uart_toggle_intr(struct uart_softc *sc) (*sc->intr_assert)(sc->arg); } -#ifdef __FreeBSD__ static void uart_drain(int fd, enum ev_type ev, void *arg) { @@ -350,7 +439,7 @@ uart_drain(int fd, enum ev_type ev, void *arg) sc = arg; - assert(fd == STDIN_FILENO); + assert(fd == sc->tty.rfd); assert(ev == EVF_READ); /* @@ -361,35 +450,11 @@ uart_drain(int fd, enum ev_type ev, void *arg) pthread_mutex_lock(&sc->mtx); if ((sc->mcr & MCR_LOOPBACK) != 0) { - (void) ttyread(); - } else { - while (fifo_available(&sc->rxfifo) && - ((ch = ttyread()) != -1)) { - fifo_putchar(&sc->rxfifo, ch); - } - uart_toggle_intr(sc); - } - - pthread_mutex_unlock(&sc->mtx); -} -#else -static void -uart_tty_drain(struct uart_softc *sc) -{ - int ch; - - /* - * Take the softc lock to protect against concurrent - * access from a vCPU i/o exit - */ - pthread_mutex_lock(&sc->mtx); - - if 
((sc->mcr & MCR_LOOPBACK) != 0) { - (void) ttyread(); + (void) ttyread(&sc->tty); } else { - while (fifo_available(&sc->rxfifo) && - ((ch = ttyread()) != -1)) { - fifo_putchar(&sc->rxfifo, ch); + while (rxfifo_available(sc) && + ((ch = ttyread(&sc->tty)) != -1)) { + rxfifo_putchar(sc, ch); } uart_toggle_intr(sc); } @@ -397,50 +462,6 @@ uart_tty_drain(struct uart_softc *sc) pthread_mutex_unlock(&sc->mtx); } -static int -uart_bcons_drain(struct uart_softc *sc) -{ - char ch; - int nbytes; - int ret = 0; - - /* - * Take the softc lock to protect against concurrent - * access from a vCPU i/o exit - */ - pthread_mutex_lock(&sc->mtx); - - if ((sc->mcr & MCR_LOOPBACK) != 0) { - (void) read(sc->usc_bcons.clifd, &ch, 1); - } else { - for (;;) { - nbytes = read(sc->usc_bcons.clifd, &ch, 1); - if (nbytes == 0) { - ret = 1; - break; - } - if (nbytes == -1 && - errno != EINTR && errno != EAGAIN) { - ret = -1; - break; - } - if (nbytes == -1) { - break; - } - - if (fifo_available(&sc->rxfifo)) { - fifo_putchar(&sc->rxfifo, ch); - } - } - uart_toggle_intr(sc); - } - - pthread_mutex_unlock(&sc->mtx); - - return (ret); -} -#endif - void uart_write(struct uart_softc *sc, int offset, uint8_t value) { @@ -449,12 +470,6 @@ uart_write(struct uart_softc *sc, int offset, uint8_t value) pthread_mutex_lock(&sc->mtx); - /* Open terminal */ - if (!sc->opened && sc->stdio) { - uart_opentty(sc); - sc->opened = true; - } - /* * Take care of the special case DLAB accesses first */ @@ -473,108 +488,96 @@ uart_write(struct uart_softc *sc, int offset, uint8_t value) switch (offset) { case REG_DATA: if (sc->mcr & MCR_LOOPBACK) { - if (fifo_putchar(&sc->rxfifo, value) != 0) + if (rxfifo_putchar(sc, value) != 0) sc->lsr |= LSR_OE; - } else if (sc->stdio) { - ttywrite(value); + } else if (sc->tty.opened) { + ttywrite(&sc->tty, value); #ifndef __FreeBSD__ - } else if (sc->bcons) { - bconswrite(sc, value); + } else if (sc->sock) { + sockwrite(sc, value); #endif } /* else drop on floor */ sc->thre_int_pending = true; break; case REG_IER: + /* Set pending when IER_ETXRDY is raised (edge-triggered). */ + if ((sc->ier & IER_ETXRDY) == 0 && (value & IER_ETXRDY) != 0) + sc->thre_int_pending = true; /* * Apply mask so that bits 4-7 are 0 * Also enables bits 0-3 only if they're 1 */ sc->ier = value & 0x0F; break; - case REG_FCR: - /* - * When moving from FIFO and 16450 mode and vice versa, - * the FIFO contents are reset. - */ - if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { - fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; - fifo_reset(&sc->rxfifo, fifosz); - } + case REG_FCR: + /* + * When moving from FIFO and 16450 mode and vice versa, + * the FIFO contents are reset. + */ + if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) { + fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1; + rxfifo_reset(sc, fifosz); + } - /* - * The FCR_ENABLE bit must be '1' for the programming - * of other FCR bits to be effective. 
- */ - if ((value & FCR_ENABLE) == 0) { - sc->fcr = 0; - } else { - if ((value & FCR_RCV_RST) != 0) - fifo_reset(&sc->rxfifo, FIFOSZ); - - sc->fcr = value & - (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); - } - break; - case REG_LCR: - sc->lcr = value; - break; - case REG_MCR: - /* Apply mask so that bits 5-7 are 0 */ - sc->mcr = value & 0x1F; - - msr = 0; - if (sc->mcr & MCR_LOOPBACK) { - /* - * In the loopback mode certain bits from the - * MCR are reflected back into MSR - */ - if (sc->mcr & MCR_RTS) - msr |= MSR_CTS; - if (sc->mcr & MCR_DTR) - msr |= MSR_DSR; - if (sc->mcr & MCR_OUT1) - msr |= MSR_RI; - if (sc->mcr & MCR_OUT2) - msr |= MSR_DCD; - } + /* + * The FCR_ENABLE bit must be '1' for the programming + * of other FCR bits to be effective. + */ + if ((value & FCR_ENABLE) == 0) { + sc->fcr = 0; + } else { + if ((value & FCR_RCV_RST) != 0) + rxfifo_reset(sc, FIFOSZ); + + sc->fcr = value & + (FCR_ENABLE | FCR_DMA | FCR_RX_MASK); + } + break; + case REG_LCR: + sc->lcr = value; + break; + case REG_MCR: + /* Apply mask so that bits 5-7 are 0 */ + sc->mcr = value & 0x1F; + msr = modem_status(sc->mcr); - /* - * Detect if there has been any change between the - * previous and the new value of MSR. If there is - * then assert the appropriate MSR delta bit. - */ - if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) - sc->msr |= MSR_DCTS; - if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) - sc->msr |= MSR_DDSR; - if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) - sc->msr |= MSR_DDCD; - if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) - sc->msr |= MSR_TERI; - - /* - * Update the value of MSR while retaining the delta - * bits. - */ - sc->msr &= MSR_DELTA_MASK; - sc->msr |= msr; - break; - case REG_LSR: - /* - * Line status register is not meant to be written to - * during normal operation. - */ - break; - case REG_MSR: - /* - * As far as I can tell MSR is a read-only register. - */ - break; - case REG_SCR: - sc->scr = value; - break; - default: - break; + /* + * Detect if there has been any change between the + * previous and the new value of MSR. If there is + * then assert the appropriate MSR delta bit. + */ + if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS)) + sc->msr |= MSR_DCTS; + if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR)) + sc->msr |= MSR_DDSR; + if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD)) + sc->msr |= MSR_DDCD; + if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0) + sc->msr |= MSR_TERI; + + /* + * Update the value of MSR while retaining the delta + * bits. + */ + sc->msr &= MSR_DELTA_MASK; + sc->msr |= msr; + break; + case REG_LSR: + /* + * Line status register is not meant to be written to + * during normal operation. + */ + break; + case REG_MSR: + /* + * As far as I can tell MSR is a read-only register. 
+ */ + break; + case REG_SCR: + sc->scr = value; + break; + default: + break; } done: @@ -589,12 +592,6 @@ uart_read(struct uart_softc *sc, int offset) pthread_mutex_lock(&sc->mtx); - /* Open terminal */ - if (!sc->opened && sc->stdio) { - uart_opentty(sc); - sc->opened = true; - } - /* * Take care of the special case DLAB accesses first */ @@ -612,7 +609,7 @@ uart_read(struct uart_softc *sc, int offset) switch (offset) { case REG_DATA: - reg = fifo_getchar(&sc->rxfifo); + reg = rxfifo_getchar(sc); break; case REG_IER: reg = sc->ier; @@ -643,7 +640,7 @@ uart_read(struct uart_softc *sc, int offset) sc->lsr |= LSR_TEMT | LSR_THRE; /* Check for new receive data */ - if (fifo_numchars(&sc->rxfifo) > 0) + if (rxfifo_numchars(sc) > 0) sc->lsr |= LSR_RXRDY; else sc->lsr &= ~LSR_RXRDY; @@ -676,277 +673,123 @@ done: } #ifndef __FreeBSD__ -static void * -uart_tty_thread(void *param) -{ - struct uart_softc *sc = param; - pollfd_t pollset; - - pollset.fd = STDIN_FILENO; - pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND; - - for (;;) { - if (poll(&pollset, 1, -1) < 0) { - if (errno != EINTR) { - perror("poll failed"); - break; - } - continue; - } - uart_tty_drain(sc); - } - - return (NULL); -} - -/* - * Read the "ident" string from the client's descriptor; this routine also - * tolerates being called with pid=NULL, for times when you want to "eat" - * the ident string from a client without saving it. - */ -static int -get_client_ident(int clifd, pid_t *pid) +static void +uart_sock_drain(int fd, enum ev_type ev, void *arg) { - char buf[BUFSIZ], *bufp; - size_t buflen = sizeof (buf); - char c = '\0'; - int i = 0, r; - - /* "eat up the ident string" case, for simplicity */ - if (pid == NULL) { - while (read(clifd, &c, 1) == 1) { - if (c == '\n') - return (0); - } - } - - bzero(buf, sizeof (buf)); - while ((buflen > 1) && (r = read(clifd, &c, 1)) == 1) { - buflen--; - if (c == '\n') - break; - - buf[i] = c; - i++; - } - if (r == -1) - return (-1); - - /* - * We've filled the buffer, but still haven't seen \n. Keep eating - * until we find it; we don't expect this to happen, but this is - * defensive. 
- */ - if (c != '\n') { - while ((r = read(clifd, &c, sizeof (c))) > 0) - if (c == '\n') - break; - } + struct uart_softc *sc = arg; + char ch; /* - * Parse buffer for message of the form: IDENT <pid> + * Take the softc lock to protect against concurrent + * access from a vCPU i/o exit */ - bufp = buf; - if (strncmp(bufp, "IDENT ", 6) != 0) - return (-1); - bufp += 6; - errno = 0; - *pid = strtoll(bufp, &bufp, 10); - if (errno != 0) - return (-1); + pthread_mutex_lock(&sc->mtx); - return (0); -} + if ((sc->mcr & MCR_LOOPBACK) != 0) { + (void) read(sc->usc_sock.clifd, &ch, 1); + } else { + bool err_close = false; -static int -uart_bcons_accept_client(struct uart_softc *sc) -{ - int connfd; - struct sockaddr_un cliaddr; - socklen_t clilen; - pid_t pid; - - clilen = sizeof (cliaddr); - connfd = accept(sc->usc_bcons.servfd, - (struct sockaddr *)&cliaddr, &clilen); - if (connfd == -1) - return (-1); - if (get_client_ident(connfd, &pid) == -1) { - (void) shutdown(connfd, SHUT_RDWR); - (void) close(connfd); - return (-1); - } + while (rxfifo_available(sc)) { + int res; - if (fcntl(connfd, F_SETFL, O_NONBLOCK) < 0) { - (void) shutdown(connfd, SHUT_RDWR); - (void) close(connfd); - return (-1); - } - (void) write(connfd, "OK\n", 3); + res = read(sc->usc_sock.clifd, &ch, 1); + if (res == 0) { + err_close = true; + break; + } else if (res == -1) { + if (errno != EAGAIN && errno != EINTR) { + err_close = true; + } + break; + } - sc->usc_bcons.clipid = pid; - sc->usc_bcons.clifd = connfd; + rxfifo_putchar(sc, ch); + } + uart_toggle_intr(sc); - printf("Connection from process ID %lu.\n", pid); + if (err_close) { + (void) fprintf(stderr, "uart: closing client conn\n"); + (void) shutdown(sc->usc_sock.clifd, SHUT_RDWR); + mevent_delete_close(sc->mev); + sc->mev = NULL; + sc->usc_sock.clifd = -1; + } + } - return (0); + pthread_mutex_unlock(&sc->mtx); } static void -uart_bcons_reject_client(struct uart_softc *sc) +uart_sock_accept(int fd, enum ev_type ev, void *arg) { + struct uart_softc *sc = arg; int connfd; - struct sockaddr_un cliaddr; - socklen_t clilen; - char nak[MAXPATHLEN]; - clilen = sizeof (cliaddr); - connfd = accept(sc->usc_bcons.servfd, - (struct sockaddr *)&cliaddr, &clilen); + connfd = accept(sc->usc_sock.servfd, NULL, NULL); + if (connfd == -1) { + return; + } /* - * After hear its ident string, tell client to get lost. + * Do client connection management under protection of the softc lock + * to avoid racing with concurrent UART events. 
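+	 * Only one client is served at a time; if a connection is already
+	 * active, the new one is rejected and shut down just below.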
*/ - if (get_client_ident(connfd, NULL) == 0) { - (void) snprintf(nak, sizeof (nak), "%lu\n", - sc->usc_bcons.clipid); - (void) write(connfd, nak, strlen(nak)); - } - (void) shutdown(connfd, SHUT_RDWR); - (void) close(connfd); -} - -static int -uart_bcons_client_event(struct uart_softc *sc) -{ - int res; - - res = uart_bcons_drain(sc); - if (res < 0) - return (-1); - - if (res > 0) { - fprintf(stderr, "Closing connection with bhyve console\n"); - (void) shutdown(sc->usc_bcons.clifd, SHUT_RDWR); - (void) close(sc->usc_bcons.clifd); - sc->usc_bcons.clifd = -1; - } - - return (0); -} - -static void -uart_bcons_server_event(struct uart_softc *sc) -{ - int clifd; + pthread_mutex_lock(&sc->mtx); - if (sc->usc_bcons.clifd != -1) { + if (sc->usc_sock.clifd != -1) { /* we're already handling a client */ - uart_bcons_reject_client(sc); - return; - } - - if (uart_bcons_accept_client(sc) == 0) { - pthread_mutex_lock(&bcons_wait_lock); - bcons_connected = B_TRUE; - pthread_cond_signal(&bcons_wait_done); - pthread_mutex_unlock(&bcons_wait_lock); - } -} - -static void * -uart_bcons_thread(void *param) -{ - struct uart_softc *sc = param; - struct pollfd pollfds[2]; - int res; - - /* read from client and write to vm */ - pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND | - POLLPRI | POLLERR | POLLHUP; - - /* the server socket; watch for events (new connections) */ - pollfds[1].events = pollfds[0].events; - - for (;;) { - pollfds[0].fd = sc->usc_bcons.clifd; - pollfds[1].fd = sc->usc_bcons.servfd; - pollfds[0].revents = pollfds[1].revents = 0; - - res = poll(pollfds, - sizeof (pollfds) / sizeof (struct pollfd), -1); - - if (res == -1 && errno != EINTR) { - perror("poll failed"); - /* we are hosed, close connection */ - break; - } - - /* event from client side */ - if (pollfds[0].revents) { - if (pollfds[0].revents & - (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { - if (uart_bcons_client_event(sc) < 0) - break; - } else { - break; - } - } - - /* event from server socket */ - if (pollfds[1].revents) { - if (pollfds[1].revents & (POLLIN | POLLRDNORM)) { - uart_bcons_server_event(sc); - } else { - break; - } + (void) fprintf(stderr, "uart: unexpected client conn\n"); + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); + } else { + if (fcntl(connfd, F_SETFL, O_NONBLOCK) < 0) { + perror("uart: fcntl(O_NONBLOCK)"); + (void) shutdown(connfd, SHUT_RDWR); + (void) close(connfd); + } else { + sc->usc_sock.clifd = connfd; + sc->mev = mevent_add(sc->usc_sock.clifd, EVF_READ, + uart_sock_drain, sc); } } - if (sc->usc_bcons.clifd != -1) { - fprintf(stderr, "Closing connection with bhyve console\n"); - (void) shutdown(sc->usc_bcons.clifd, SHUT_RDWR); - (void) close(sc->usc_bcons.clifd); - sc->usc_bcons.clifd = -1; - } - - return (NULL); + pthread_mutex_unlock(&sc->mtx); } static int -init_bcons_sock(void) +init_sock(const char *path) { int servfd; struct sockaddr_un servaddr; - if (mkdir(BHYVE_TMPDIR, S_IRWXU) < 0 && errno != EEXIST) { - fprintf(stderr, "bhyve console setup: " - "could not mkdir %s", BHYVE_TMPDIR, strerror(errno)); - return (-1); - } - bzero(&servaddr, sizeof (servaddr)); servaddr.sun_family = AF_UNIX; - (void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path), - BHYVE_CONS_SOCKPATH, vmname); + + if (strlcpy(servaddr.sun_path, path, sizeof (servaddr.sun_path)) >= + sizeof (servaddr.sun_path)) { + (void) fprintf(stderr, "uart: path '%s' too long\n", + path); + return (-1); + } if ((servfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { - fprintf(stderr, "bhyve console setup: " - "could not 
create socket\n"); + (void) fprintf(stderr, "uart: socket() error - %s\n", + strerror(errno)); return (-1); } (void) unlink(servaddr.sun_path); if (bind(servfd, (struct sockaddr *)&servaddr, sizeof (servaddr)) == -1) { - fprintf(stderr, "bhyve console setup: " - "could not bind to socket\n"); + (void) fprintf(stderr, "uart: bind() error - %s\n", + strerror(errno)); goto out; } - if (listen(servfd, 4) == -1) { - fprintf(stderr, "bhyve console setup: " - "could not listen on socket"); + if (listen(servfd, 1) == -1) { + (void) fprintf(stderr, "uart: listen() error - %s\n", + strerror(errno)); goto out; } return (servfd); @@ -956,7 +799,7 @@ out: (void) close(servfd); return (-1); } -#endif +#endif /* not __FreeBSD__ */ int uart_legacy_alloc(int which, int *baseaddr, int *irq) @@ -978,8 +821,7 @@ uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, { struct uart_softc *sc; - sc = malloc(sizeof(struct uart_softc)); - bzero(sc, sizeof(struct uart_softc)); + sc = calloc(1, sizeof(struct uart_softc)); sc->arg = arg; sc->intr_assert = intr_assert; @@ -992,51 +834,130 @@ uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert, return (sc); } -int -uart_set_backend(struct uart_softc *sc, const char *opts) +#ifndef __FreeBSD__ +static int +uart_sock_backend(struct uart_softc *sc, const char *inopts) { -#ifndef __FreeBSD__ - int error; + char *opts; + char *opt; + char *nextopt; + char *path = NULL; + + if (strncmp(inopts, "socket,", 7) != 0) { + return (-1); + } + if ((opts = strdup(inopts + 7)) == NULL) { + return (-1); + } + + nextopt = opts; + for (opt = strsep(&nextopt, ","); opt != NULL; + opt = strsep(&nextopt, ",")) { + if (path == NULL && *opt == '/') { + path = opt; + continue; + } + /* + * XXX check for server and client options here. For now, + * everything is a server + */ + free(opts); + return (-1); + } + + sc->usc_sock.clifd = -1; + if ((sc->usc_sock.servfd = init_sock(path)) == -1) { + free(opts); + return (-1); + } + sc->sock = true; + sc->tty.rfd = sc->tty.wfd = -1; + sc->usc_sock.servmev = mevent_add(sc->usc_sock.servfd, EVF_READ, + uart_sock_accept, sc); + assert(sc->usc_sock.servmev != NULL); + + return (0); +} +#endif /* not __FreeBSD__ */ + +static int +uart_stdio_backend(struct uart_softc *sc) +{ +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; #endif - /* - * XXX one stdio backend supported at this time. 
- */ - if (opts == NULL) - return (0); -#ifdef __FreeBSD__ - if (strcmp("stdio", opts) == 0 && !uart_stdio) { - sc->stdio = true; - uart_stdio = true; - return (0); -#else - if (strcmp("stdio", opts) == 0 && !uart_stdio && !uart_bcons) { - sc->stdio = true; - uart_stdio = true; + if (uart_stdio) + return (-1); - error = pthread_create(NULL, NULL, uart_tty_thread, sc); - assert(error == 0); + sc->tty.rfd = STDIN_FILENO; + sc->tty.wfd = STDOUT_FILENO; + sc->tty.opened = true; - return (0); - } else if (strstr(opts, "bcons") != 0 && !uart_stdio && !uart_bcons) { - sc->bcons = true; - uart_bcons= true; + if (fcntl(sc->tty.rfd, F_SETFL, O_NONBLOCK) != 0) + return (-1); + if (fcntl(sc->tty.wfd, F_SETFL, O_NONBLOCK) != 0) + return (-1); - if (strstr(opts, "bcons,wait") != 0) { - bcons_wait = true; - } +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ); + if (caph_rights_limit(sc->tty.rfd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(sc->tty.rfd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif - sc->usc_bcons.clifd = -1; - if ((sc->usc_bcons.servfd = init_bcons_sock()) == -1) { - fprintf(stderr, "bhyve console setup: " - "socket initialization failed\n"); - return (-1); - } - error = pthread_create(NULL, NULL, uart_bcons_thread, sc); - assert(error == 0); + uart_stdio = true; - return (0); + return (0); +} + +static int +uart_tty_backend(struct uart_softc *sc, const char *opts) +{ +#ifndef WITHOUT_CAPSICUM + cap_rights_t rights; + cap_ioctl_t cmds[] = { TIOCGETA, TIOCSETA, TIOCGWINSZ }; #endif - } else + int fd; + + fd = open(opts, O_RDWR | O_NONBLOCK); + if (fd < 0 || !isatty(fd)) return (-1); + + sc->tty.rfd = sc->tty.wfd = fd; + sc->tty.opened = true; + +#ifndef WITHOUT_CAPSICUM + cap_rights_init(&rights, CAP_EVENT, CAP_IOCTL, CAP_READ, CAP_WRITE); + if (caph_rights_limit(fd, &rights) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); + if (caph_ioctls_limit(fd, cmds, nitems(cmds)) == -1) + errx(EX_OSERR, "Unable to apply rights for sandbox"); +#endif + + return (0); +} + +int +uart_set_backend(struct uart_softc *sc, const char *opts) +{ + int retval; + + if (opts == NULL) + return (0); + +#ifndef __FreeBSD__ + if (strncmp("socket,", opts, 7) == 0) + return (uart_sock_backend(sc, opts)); +#endif + if (strcmp("stdio", opts) == 0) + retval = uart_stdio_backend(sc); + else + retval = uart_tty_backend(sc, opts); + if (retval == 0) + uart_opentty(sc); + + return (retval); } diff --git a/usr/src/cmd/bhyve/uart_emul.h b/usr/src/cmd/bhyve/uart_emul.h index ecff957991..a87202df1f 100644 --- a/usr/src/cmd/bhyve/uart_emul.h +++ b/usr/src/cmd/bhyve/uart_emul.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Neel Natu <neel@freebsd.org> * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/uart_emul.h 257293 2013-10-29 00:18:11Z neel $ + * $FreeBSD$ */ #ifndef _UART_EMUL_H_ diff --git a/usr/src/cmd/bhyve/usb_emul.c b/usr/src/cmd/bhyve/usb_emul.c new file mode 100644 index 0000000000..6ecdd9530e --- /dev/null +++ b/usr/src/cmd/bhyve/usb_emul.c @@ -0,0 +1,78 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Nahanni Systems Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/queue.h> + +#include <assert.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <pthread.h> + +#include "usb_emul.h" + +SET_DECLARE(usb_emu_set, struct usb_devemu); + +struct usb_devemu * +usb_emu_finddev(char *name) +{ + struct usb_devemu **udpp, *udp; + + SET_FOREACH(udpp, usb_emu_set) { + udp = *udpp; + if (!strcmp(udp->ue_emu, name)) + return (udp); + } + + return (NULL); +} + +struct usb_data_xfer_block * +usb_data_xfer_append(struct usb_data_xfer *xfer, void *buf, int blen, + void *hci_data, int ccs) +{ + struct usb_data_xfer_block *xb; + + if (xfer->ndata >= USB_MAX_XFER_BLOCKS) + return (NULL); + + xb = &xfer->data[xfer->tail]; + xb->buf = buf; + xb->blen = blen; + xb->hci_data = hci_data; + xb->ccs = ccs; + xb->processed = 0; + xb->bdone = 0; + xfer->ndata++; + xfer->tail = (xfer->tail + 1) % USB_MAX_XFER_BLOCKS; + return (xb); +} diff --git a/usr/src/cmd/bhyve/usb_emul.h b/usr/src/cmd/bhyve/usb_emul.h new file mode 100644 index 0000000000..e55a421b6f --- /dev/null +++ b/usr/src/cmd/bhyve/usb_emul.h @@ -0,0 +1,164 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com> + * Copyright 2018 Joyent, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _USB_EMUL_H_
+#define _USB_EMUL_H_
+
+#include <stdlib.h>
+#include <sys/linker_set.h>
+#include <pthread.h>
+#ifndef __FreeBSD__
+#include <synch.h>
+#endif
+
+#define	USB_MAX_XFER_BLOCKS	8
+
+#define	USB_XFER_OUT		0
+#define	USB_XFER_IN		1
+
+
+struct usb_hci;
+struct usb_device_request;
+struct usb_data_xfer;
+
+/* Device emulation handlers */
+struct usb_devemu {
+	char	*ue_emu;	/* name of device emulation */
+	int	ue_usbver;	/* usb version: 2 or 3 */
+	int	ue_usbspeed;	/* usb device speed */
+
+	/* instance creation */
+	void	*(*ue_init)(struct usb_hci *hci, char *opt);
+
+	/* handlers */
+	int	(*ue_request)(void *sc, struct usb_data_xfer *xfer);
+	int	(*ue_data)(void *sc, struct usb_data_xfer *xfer, int dir,
+	    int epctx);
+	int	(*ue_reset)(void *sc);
+	int	(*ue_remove)(void *sc);
+	int	(*ue_stop)(void *sc);
+};
+#define	USB_EMUL_SET(x)	DATA_SET(usb_emu_set, x);
+
+/*
+ * USB device events to notify HCI when state changes
+ */
+enum hci_usbev {
+	USBDEV_ATTACH,
+	USBDEV_RESET,
+	USBDEV_STOP,
+	USBDEV_REMOVE,
+};
+
+/* usb controller, ie xhci, ehci */
+struct usb_hci {
+	int	(*hci_intr)(struct usb_hci *hci, int epctx);
+	int	(*hci_event)(struct usb_hci *hci, enum hci_usbev evid,
+	    void *param);
+	void	*hci_sc;	/* private softc for hci */
+
+	/* controller managed fields */
+	int	hci_address;
+	int	hci_port;
+};
+
+/*
+ * Each xfer block is mapped to the hci transfer block.
+ * On input into the device handler, blen is set to the length of buf.
+ * The device handler is to update blen to reflect the residual size
+ * of the buffer, i.e. len(buf) - len(consumed).
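+ *
+ * For example (an illustrative sketch only), a handler that consumed
+ * "n" bytes of a block would account for them as:
+ *
+ *	xb->blen -= n;
+ *	xb->bdone += n;
+ *
+ * leaving blen as the residual and bdone as the running total for the
+ * block.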
+ */ +struct usb_data_xfer_block { + void *buf; /* IN or OUT pointer */ + int blen; /* in:len(buf), out:len(remaining) */ + int bdone; /* bytes transferred */ + uint32_t processed; /* device processed this + errcode */ + void *hci_data; /* HCI private reference */ + int ccs; + uint32_t streamid; + uint64_t trbnext; /* next TRB guest address */ +}; + +struct usb_data_xfer { + struct usb_data_xfer_block data[USB_MAX_XFER_BLOCKS]; + struct usb_device_request *ureq; /* setup ctl request */ + int ndata; /* # of data items */ + int head; + int tail; + pthread_mutex_t mtx; +}; + +enum USB_ERRCODE { + USB_ACK, + USB_NAK, + USB_STALL, + USB_NYET, + USB_ERR, + USB_SHORT +}; + +#define USB_DATA_GET_ERRCODE(x) (x)->processed >> 8 +#define USB_DATA_SET_ERRCODE(x,e) do { \ + (x)->processed = ((x)->processed & 0xFF) | (e << 8); \ + } while (0) + +#define USB_DATA_OK(x,i) ((x)->data[(i)].buf != NULL) + +#define USB_DATA_XFER_INIT(x) do { \ + memset((x), 0, sizeof(*(x))); \ + pthread_mutex_init(&((x)->mtx), NULL); \ + } while (0) + +#define USB_DATA_XFER_RESET(x) do { \ + memset((x)->data, 0, sizeof((x)->data)); \ + (x)->ndata = 0; \ + (x)->head = (x)->tail = 0; \ + } while (0) + +#define USB_DATA_XFER_LOCK(x) do { \ + pthread_mutex_lock(&((x)->mtx)); \ + } while (0) + +#define USB_DATA_XFER_UNLOCK(x) do { \ + pthread_mutex_unlock(&((x)->mtx)); \ + } while (0) +#ifndef __FreeBSD__ +#define USB_DATA_XFER_LOCK_HELD(x) MUTEX_HELD(&((x)->mtx)) +#endif + +struct usb_devemu *usb_emu_finddev(char *name); + +struct usb_data_xfer_block *usb_data_xfer_append(struct usb_data_xfer *xfer, + void *buf, int blen, void *hci_data, int ccs); + + +#endif /* _USB_EMUL_H_ */ diff --git a/usr/src/cmd/bhyve/usb_mouse.c b/usr/src/cmd/bhyve/usb_mouse.c new file mode 100644 index 0000000000..921fce5db9 --- /dev/null +++ b/usr/src/cmd/bhyve/usb_mouse.c @@ -0,0 +1,809 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Leon Dang <ldang@nahannisys.com> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/time.h> + +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <dev/usb/usb.h> +#include <dev/usb/usbdi.h> + +#include "usb_emul.h" +#include "console.h" +#include "bhyvegc.h" + +static int umouse_debug = 0; +#define DPRINTF(params) if (umouse_debug) printf params +#define WPRINTF(params) printf params + +/* USB endpoint context (1-15) for reporting mouse data events*/ +#define UMOUSE_INTR_ENDPT 1 + +#define UMOUSE_REPORT_DESC_TYPE 0x22 + +#define UMOUSE_GET_REPORT 0x01 +#define UMOUSE_GET_IDLE 0x02 +#define UMOUSE_GET_PROTOCOL 0x03 +#define UMOUSE_SET_REPORT 0x09 +#define UMOUSE_SET_IDLE 0x0A +#define UMOUSE_SET_PROTOCOL 0x0B + +#define HSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } + +enum { + UMSTR_LANG, + UMSTR_MANUFACTURER, + UMSTR_PRODUCT, + UMSTR_SERIAL, + UMSTR_CONFIG, + UMSTR_MAX +}; + +static const char *umouse_desc_strings[] = { + "\x04\x09", + "BHYVE", + "HID Tablet", + "01", + "HID Tablet Device", +}; + +struct umouse_hid_descriptor { + uint8_t bLength; + uint8_t bDescriptorType; + uint8_t bcdHID[2]; + uint8_t bCountryCode; + uint8_t bNumDescriptors; + uint8_t bReportDescriptorType; + uint8_t wItemLength[2]; +} __packed; + +struct umouse_config_desc { + struct usb_config_descriptor confd; + struct usb_interface_descriptor ifcd; + struct umouse_hid_descriptor hidd; + struct usb_endpoint_descriptor endpd; + struct usb_endpoint_ss_comp_descriptor sscompd; +} __packed; + +#define MOUSE_MAX_X 0x8000 +#define MOUSE_MAX_Y 0x8000 + +static const uint8_t umouse_report_desc[] = { + 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ + 0x09, 0x02, /* USAGE (Mouse) */ + 0xa1, 0x01, /* COLLECTION (Application) */ + 0x09, 0x01, /* USAGE (Pointer) */ + 0xa1, 0x00, /* COLLECTION (Physical) */ + 0x05, 0x09, /* USAGE_PAGE (Button) */ + 0x19, 0x01, /* USAGE_MINIMUM (Button 1) */ + 0x29, 0x03, /* USAGE_MAXIMUM (Button 3) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x25, 0x01, /* LOGICAL_MAXIMUM (1) */ + 0x75, 0x01, /* REPORT_SIZE (1) */ + 0x95, 0x03, /* REPORT_COUNT (3) */ + 0x81, 0x02, /* INPUT (Data,Var,Abs); 3 buttons */ + 0x75, 0x05, /* REPORT_SIZE (5) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x81, 0x03, /* INPUT (Cnst,Var,Abs); padding */ + 0x05, 0x01, /* USAGE_PAGE (Generic Desktop) */ + 0x09, 0x30, /* USAGE (X) */ + 0x09, 0x31, /* USAGE (Y) */ + 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ + 0x46, 0xff, 0x7f, /* PHYSICAL_MAXIMUM (0x7fff) */ + 0x15, 0x00, /* LOGICAL_MINIMUM (0) */ + 0x26, 0xff, 0x7f, /* LOGICAL_MAXIMUM (0x7fff) */ + 0x75, 0x10, /* REPORT_SIZE (16) */ + 0x95, 0x02, /* REPORT_COUNT (2) */ + 0x81, 0x02, /* INPUT (Data,Var,Abs) */ + 0x05, 0x01, /* USAGE Page (Generic Desktop) */ + 0x09, 0x38, /* USAGE (Wheel) */ + 0x35, 0x00, /* PHYSICAL_MINIMUM (0) */ + 0x45, 0x00, /* PHYSICAL_MAXIMUM (0) */ + 0x15, 0x81, /* LOGICAL_MINIMUM (-127) */ + 0x25, 0x7f, /* LOGICAL_MAXIMUM (127) */ + 0x75, 0x08, /* REPORT_SIZE (8) */ + 0x95, 0x01, /* REPORT_COUNT (1) */ + 0x81, 0x06, /* INPUT (Data,Var,Rel) */ + 0xc0, /* END_COLLECTION */ + 0xc0 /* END_COLLECTION */ +}; + +struct umouse_report { + uint8_t buttons; /* bits: 0 left, 1 right, 2 middle */ + int16_t x; /* x position */ + int16_t y; /* y position */ + int8_t z; /* z wheel position */ +} __packed; + + +#define MSETW(ptr, val) ptr = { (uint8_t)(val), (uint8_t)((val) >> 8) } + +static struct usb_device_descriptor umouse_dev_desc = { + .bLength = sizeof(umouse_dev_desc), + .bDescriptorType = UDESC_DEVICE, + MSETW(.bcdUSB, 
UD_USB_3_0), + .bMaxPacketSize = 8, /* max packet size */ + MSETW(.idVendor, 0xFB5D), /* vendor */ + MSETW(.idProduct, 0x0001), /* product */ + MSETW(.bcdDevice, 0), /* device version */ + .iManufacturer = UMSTR_MANUFACTURER, + .iProduct = UMSTR_PRODUCT, + .iSerialNumber = UMSTR_SERIAL, + .bNumConfigurations = 1, +}; + +static struct umouse_config_desc umouse_confd = { + .confd = { + .bLength = sizeof(umouse_confd.confd), + .bDescriptorType = UDESC_CONFIG, + .wTotalLength[0] = sizeof(umouse_confd), + .bNumInterface = 1, + .bConfigurationValue = 1, + .iConfiguration = UMSTR_CONFIG, + .bmAttributes = UC_BUS_POWERED | UC_REMOTE_WAKEUP, + .bMaxPower = 0, + }, + .ifcd = { + .bLength = sizeof(umouse_confd.ifcd), + .bDescriptorType = UDESC_INTERFACE, + .bNumEndpoints = 1, + .bInterfaceClass = UICLASS_HID, + .bInterfaceSubClass = UISUBCLASS_BOOT, + .bInterfaceProtocol = UIPROTO_MOUSE, + }, + .hidd = { + .bLength = sizeof(umouse_confd.hidd), + .bDescriptorType = 0x21, + .bcdHID = { 0x01, 0x10 }, + .bCountryCode = 0, + .bNumDescriptors = 1, + .bReportDescriptorType = UMOUSE_REPORT_DESC_TYPE, + .wItemLength = { sizeof(umouse_report_desc), 0 }, + }, + .endpd = { + .bLength = sizeof(umouse_confd.endpd), + .bDescriptorType = UDESC_ENDPOINT, + .bEndpointAddress = UE_DIR_IN | UMOUSE_INTR_ENDPT, + .bmAttributes = UE_INTERRUPT, + .wMaxPacketSize[0] = 8, + .bInterval = 0xA, + }, + .sscompd = { + .bLength = sizeof(umouse_confd.sscompd), + .bDescriptorType = UDESC_ENDPOINT_SS_COMP, + .bMaxBurst = 0, + .bmAttributes = 0, + MSETW(.wBytesPerInterval, 0), + }, +}; + + +struct umouse_bos_desc { + struct usb_bos_descriptor bosd; + struct usb_devcap_ss_descriptor usbssd; +} __packed; + + +struct umouse_bos_desc umouse_bosd = { + .bosd = { + .bLength = sizeof(umouse_bosd.bosd), + .bDescriptorType = UDESC_BOS, + HSETW(.wTotalLength, sizeof(umouse_bosd)), + .bNumDeviceCaps = 1, + }, + .usbssd = { + .bLength = sizeof(umouse_bosd.usbssd), + .bDescriptorType = UDESC_DEVICE_CAPABILITY, + .bDevCapabilityType = 3, + .bmAttributes = 0, + HSETW(.wSpeedsSupported, 0x08), + .bFunctionalitySupport = 3, + .bU1DevExitLat = 0xa, /* dummy - not used */ + .wU2DevExitLat = { 0x20, 0x00 }, + } +}; + + +struct umouse_softc { + struct usb_hci *hci; + + char *opt; + + struct umouse_report um_report; + int newdata; + struct { + uint8_t idle; + uint8_t protocol; + uint8_t feature; + } hid; + + pthread_mutex_t mtx; + pthread_mutex_t ev_mtx; + int polling; + struct timeval prev_evt; +}; + +static void +umouse_event(uint8_t button, int x, int y, void *arg) +{ + struct umouse_softc *sc; + struct bhyvegc_image *gc; + + gc = console_get_image(); + if (gc == NULL) { + /* not ready */ + return; + } + + sc = arg; + + pthread_mutex_lock(&sc->mtx); + + sc->um_report.buttons = 0; + sc->um_report.z = 0; + + if (button & 0x01) + sc->um_report.buttons |= 0x01; /* left */ + if (button & 0x02) + sc->um_report.buttons |= 0x04; /* middle */ + if (button & 0x04) + sc->um_report.buttons |= 0x02; /* right */ + if (button & 0x8) + sc->um_report.z = 1; + if (button & 0x10) + sc->um_report.z = -1; + + /* scale coords to mouse resolution */ + sc->um_report.x = MOUSE_MAX_X * x / gc->width; + sc->um_report.y = MOUSE_MAX_Y * y / gc->height; + sc->newdata = 1; + pthread_mutex_unlock(&sc->mtx); + + pthread_mutex_lock(&sc->ev_mtx); + sc->hci->hci_intr(sc->hci, UE_DIR_IN | UMOUSE_INTR_ENDPT); + pthread_mutex_unlock(&sc->ev_mtx); +} + +static void * +umouse_init(struct usb_hci *hci, char *opt) +{ + struct umouse_softc *sc; + + sc = calloc(1, sizeof(struct umouse_softc)); + 
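+	/*
+	 * Editorial note, not part of the original patch: calloc() can
+	 * return NULL, so a defensive variant of this function would
+	 * check the allocation before the stores below, e.g.:
+	 *
+	 *	if (sc == NULL)
+	 *		return (NULL);
+	 */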
sc->hci = hci; + + sc->hid.protocol = 1; /* REPORT protocol */ + sc->opt = strdup(opt); + pthread_mutex_init(&sc->mtx, NULL); + pthread_mutex_init(&sc->ev_mtx, NULL); + + console_ptr_register(umouse_event, sc, 10); + + return (sc); +} + +#define UREQ(x,y) ((x) | ((y) << 8)) + +static int +umouse_request(void *scarg, struct usb_data_xfer *xfer) +{ + struct umouse_softc *sc; + struct usb_data_xfer_block *data; + const char *str; + uint16_t value; + uint16_t index; + uint16_t len; + uint16_t slen; + uint8_t *udata; + int err; + int i, idx; + int eshort; + + sc = scarg; + + data = NULL; + udata = NULL; + idx = xfer->head; + for (i = 0; i < xfer->ndata; i++) { + xfer->data[idx].bdone = 0; + if (data == NULL && USB_DATA_OK(xfer,i)) { + data = &xfer->data[idx]; + udata = data->buf; + } + + xfer->data[idx].processed = 1; + idx = (idx + 1) % USB_MAX_XFER_BLOCKS; + } + + err = USB_ERR_NORMAL_COMPLETION; + eshort = 0; + + if (!xfer->ureq) { + DPRINTF(("umouse_request: port %d\r\n", sc->hci->hci_port)); + goto done; + } + + value = UGETW(xfer->ureq->wValue); + index = UGETW(xfer->ureq->wIndex); + len = UGETW(xfer->ureq->wLength); + + DPRINTF(("umouse_request: port %d, type 0x%x, req 0x%x, val 0x%x, " + "idx 0x%x, len %u\r\n", + sc->hci->hci_port, xfer->ureq->bmRequestType, + xfer->ureq->bRequest, value, index, len)); + + switch (UREQ(xfer->ureq->bRequest, xfer->ureq->bmRequestType)) { + case UREQ(UR_GET_CONFIG, UT_READ_DEVICE): + DPRINTF(("umouse: (UR_GET_CONFIG, UT_READ_DEVICE)\r\n")); + if (!data) + break; + + *udata = umouse_confd.confd.bConfigurationValue; + data->blen = len > 0 ? len - 1 : 0; + eshort = data->blen > 0; + data->bdone += 1; + break; + + case UREQ(UR_GET_DESCRIPTOR, UT_READ_DEVICE): + DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_DEVICE) val %x\r\n", + value >> 8)); + if (!data) + break; + + switch (value >> 8) { + case UDESC_DEVICE: + DPRINTF(("umouse: (->UDESC_DEVICE) len %u ?= " + "sizeof(umouse_dev_desc) %lu\r\n", + len, sizeof(umouse_dev_desc))); + if ((value & 0xFF) != 0) { + err = USB_ERR_IOERROR; + goto done; + } + if (len > sizeof(umouse_dev_desc)) { + data->blen = len - sizeof(umouse_dev_desc); + len = sizeof(umouse_dev_desc); + } else + data->blen = 0; + memcpy(data->buf, &umouse_dev_desc, len); + data->bdone += len; + break; + + case UDESC_CONFIG: + DPRINTF(("umouse: (->UDESC_CONFIG)\r\n")); + if ((value & 0xFF) != 0) { + err = USB_ERR_IOERROR; + goto done; + } + if (len > sizeof(umouse_confd)) { + data->blen = len - sizeof(umouse_confd); + len = sizeof(umouse_confd); + } else + data->blen = 0; + + memcpy(data->buf, &umouse_confd, len); + data->bdone += len; + break; + + case UDESC_STRING: + DPRINTF(("umouse: (->UDESC_STRING)\r\n")); + str = NULL; + if ((value & 0xFF) < UMSTR_MAX) + str = umouse_desc_strings[value & 0xFF]; + else + goto done; + + if ((value & 0xFF) == UMSTR_LANG) { + udata[0] = 4; + udata[1] = UDESC_STRING; + data->blen = len - 2; + len -= 2; + data->bdone += 2; + + if (len >= 2) { + udata[2] = str[0]; + udata[3] = str[1]; + data->blen -= 2; + data->bdone += 2; + } else + data->blen = 0; + + goto done; + } + + slen = 2 + strlen(str) * 2; + udata[0] = slen; + udata[1] = UDESC_STRING; + + if (len > slen) { + data->blen = len - slen; + len = slen; + } else + data->blen = 0; + for (i = 2; i < len; i += 2) { + udata[i] = *str++; + udata[i+1] = '\0'; + } + data->bdone += slen; + + break; + + case UDESC_BOS: + DPRINTF(("umouse: USB3 BOS\r\n")); + if (len > sizeof(umouse_bosd)) { + data->blen = len - sizeof(umouse_bosd); + len = sizeof(umouse_bosd); + } else + 
data->blen = 0; + memcpy(udata, &umouse_bosd, len); + data->bdone += len; + break; + + default: + DPRINTF(("umouse: unknown(%d)->ERROR\r\n", value >> 8)); + err = USB_ERR_IOERROR; + goto done; + } + eshort = data->blen > 0; + break; + + case UREQ(UR_GET_DESCRIPTOR, UT_READ_INTERFACE): + DPRINTF(("umouse: (UR_GET_DESCRIPTOR, UT_READ_INTERFACE) " + "0x%x\r\n", (value >> 8))); + if (!data) + break; + + switch (value >> 8) { + case UMOUSE_REPORT_DESC_TYPE: + if (len > sizeof(umouse_report_desc)) { + data->blen = len - sizeof(umouse_report_desc); + len = sizeof(umouse_report_desc); + } else + data->blen = 0; + memcpy(data->buf, umouse_report_desc, len); + data->bdone += len; + break; + default: + DPRINTF(("umouse: IO ERROR\r\n")); + err = USB_ERR_IOERROR; + goto done; + } + eshort = data->blen > 0; + break; + + case UREQ(UR_GET_INTERFACE, UT_READ_INTERFACE): + DPRINTF(("umouse: (UR_GET_INTERFACE, UT_READ_INTERFACE)\r\n")); + if (index != 0) { + DPRINTF(("umouse get_interface, invalid index %d\r\n", + index)); + err = USB_ERR_IOERROR; + goto done; + } + + if (!data) + break; + + if (len > 0) { + *udata = 0; + data->blen = len - 1; + } + eshort = data->blen > 0; + data->bdone += 1; + break; + + case UREQ(UR_GET_STATUS, UT_READ_DEVICE): + DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_DEVICE)\r\n")); + if (data != NULL && len > 1) { + if (sc->hid.feature == UF_DEVICE_REMOTE_WAKEUP) + USETW(udata, UDS_REMOTE_WAKEUP); + else + USETW(udata, 0); + data->blen = len - 2; + data->bdone += 2; + } + + eshort = data->blen > 0; + break; + + case UREQ(UR_GET_STATUS, UT_READ_INTERFACE): + case UREQ(UR_GET_STATUS, UT_READ_ENDPOINT): + DPRINTF(("umouse: (UR_GET_STATUS, UT_READ_INTERFACE)\r\n")); + if (data != NULL && len > 1) { + USETW(udata, 0); + data->blen = len - 2; + data->bdone += 2; + } + eshort = data->blen > 0; + break; + + case UREQ(UR_SET_ADDRESS, UT_WRITE_DEVICE): + /* XXX Controller should've handled this */ + DPRINTF(("umouse set address %u\r\n", value)); + break; + + case UREQ(UR_SET_CONFIG, UT_WRITE_DEVICE): + DPRINTF(("umouse set config %u\r\n", value)); + break; + + case UREQ(UR_SET_DESCRIPTOR, UT_WRITE_DEVICE): + DPRINTF(("umouse set descriptor %u\r\n", value)); + break; + + + case UREQ(UR_CLEAR_FEATURE, UT_WRITE_DEVICE): + DPRINTF(("umouse: (UR_SET_FEATURE, UT_WRITE_DEVICE) %x\r\n", value)); + if (value == UF_DEVICE_REMOTE_WAKEUP) + sc->hid.feature = 0; + break; + + case UREQ(UR_SET_FEATURE, UT_WRITE_DEVICE): + DPRINTF(("umouse: (UR_SET_FEATURE, UT_WRITE_DEVICE) %x\r\n", value)); + if (value == UF_DEVICE_REMOTE_WAKEUP) + sc->hid.feature = UF_DEVICE_REMOTE_WAKEUP; + break; + + case UREQ(UR_CLEAR_FEATURE, UT_WRITE_INTERFACE): + case UREQ(UR_CLEAR_FEATURE, UT_WRITE_ENDPOINT): + case UREQ(UR_SET_FEATURE, UT_WRITE_INTERFACE): + case UREQ(UR_SET_FEATURE, UT_WRITE_ENDPOINT): + DPRINTF(("umouse: (UR_CLEAR_FEATURE, UT_WRITE_INTERFACE)\r\n")); + err = USB_ERR_IOERROR; + goto done; + + case UREQ(UR_SET_INTERFACE, UT_WRITE_INTERFACE): + DPRINTF(("umouse set interface %u\r\n", value)); + break; + + case UREQ(UR_ISOCH_DELAY, UT_WRITE_DEVICE): + DPRINTF(("umouse set isoch delay %u\r\n", value)); + break; + + case UREQ(UR_SET_SEL, 0): + DPRINTF(("umouse set sel\r\n")); + break; + + case UREQ(UR_SYNCH_FRAME, UT_WRITE_ENDPOINT): + DPRINTF(("umouse synch frame\r\n")); + break; + + /* HID device requests */ + + case UREQ(UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE): + DPRINTF(("umouse: (UMOUSE_GET_REPORT, UT_READ_CLASS_INTERFACE) " + "0x%x\r\n", (value >> 8))); + if (!data) + break; + + if ((value >> 8) == 0x01 && len 
>= sizeof(sc->um_report)) { + /* TODO read from backend */ + + if (len > sizeof(sc->um_report)) { + data->blen = len - sizeof(sc->um_report); + len = sizeof(sc->um_report); + } else + data->blen = 0; + + memcpy(data->buf, &sc->um_report, len); + data->bdone += len; + } else { + err = USB_ERR_IOERROR; + goto done; + } + eshort = data->blen > 0; + break; + + case UREQ(UMOUSE_GET_IDLE, UT_READ_CLASS_INTERFACE): + if (data != NULL && len > 0) { + *udata = sc->hid.idle; + data->blen = len - 1; + data->bdone += 1; + } + eshort = data->blen > 0; + break; + + case UREQ(UMOUSE_GET_PROTOCOL, UT_READ_CLASS_INTERFACE): + if (data != NULL && len > 0) { + *udata = sc->hid.protocol; + data->blen = len - 1; + data->bdone += 1; + } + eshort = data->blen > 0; + break; + + case UREQ(UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE): + DPRINTF(("umouse: (UMOUSE_SET_REPORT, UT_WRITE_CLASS_INTERFACE) ignored\r\n")); + break; + + case UREQ(UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE): + sc->hid.idle = UGETW(xfer->ureq->wValue) >> 8; + DPRINTF(("umouse: (UMOUSE_SET_IDLE, UT_WRITE_CLASS_INTERFACE) %x\r\n", + sc->hid.idle)); + break; + + case UREQ(UMOUSE_SET_PROTOCOL, UT_WRITE_CLASS_INTERFACE): + sc->hid.protocol = UGETW(xfer->ureq->wValue) >> 8; + DPRINTF(("umouse: (UMOUSE_SET_PROTOCOL, UT_WRITE_CLASS_INTERFACE) %x\r\n", + sc->hid.protocol)); + break; + + default: + DPRINTF(("**** umouse request unhandled\r\n")); + err = USB_ERR_IOERROR; + break; + } + +done: +/* UT_WRITE is 0, so this condition is never true. */ +#ifdef __FreeBSD__ + if (xfer->ureq && (xfer->ureq->bmRequestType & UT_WRITE) && + (err == USB_ERR_NORMAL_COMPLETION) && (data != NULL)) + data->blen = 0; + else if (eshort) + err = USB_ERR_SHORT_XFER; +#else + if (eshort) + err = USB_ERR_SHORT_XFER; +#endif + + + DPRINTF(("umouse request error code %d (0=ok), blen %u txlen %u\r\n", + err, (data ? data->blen : 0), (data ? data->bdone : 0))); + + return (err); +} + +static int +umouse_data_handler(void *scarg, struct usb_data_xfer *xfer, int dir, + int epctx) +{ + struct umouse_softc *sc; + struct usb_data_xfer_block *data; + uint8_t *udata; + int len, i, idx; + int err; + + DPRINTF(("umouse handle data - DIR=%s|EP=%d, blen %d\r\n", + dir ?
"IN" : "OUT", epctx, xfer->data[0].blen)); + + + /* find buffer to add data */ + udata = NULL; + err = USB_ERR_NORMAL_COMPLETION; + + /* handle xfer at first unprocessed item with buffer */ + data = NULL; + idx = xfer->head; + for (i = 0; i < xfer->ndata; i++) { + data = &xfer->data[idx]; + if (data->buf != NULL && data->blen != 0) { + break; + } else { + data->processed = 1; + data = NULL; + } + idx = (idx + 1) % USB_MAX_XFER_BLOCKS; + } + if (!data) + goto done; + + udata = data->buf; + len = data->blen; + + if (udata == NULL) { + DPRINTF(("umouse no buffer provided for input\r\n")); + err = USB_ERR_NOMEM; + goto done; + } + + sc = scarg; + + if (dir) { + + pthread_mutex_lock(&sc->mtx); + + if (!sc->newdata) { + err = USB_ERR_CANCELLED; + USB_DATA_SET_ERRCODE(&xfer->data[xfer->head], USB_NAK); + pthread_mutex_unlock(&sc->mtx); + goto done; + } + + if (sc->polling) { + err = USB_ERR_STALLED; + USB_DATA_SET_ERRCODE(data, USB_STALL); + pthread_mutex_unlock(&sc->mtx); + goto done; + } + sc->polling = 1; + + if (len > 0) { + sc->newdata = 0; + + data->processed = 1; + data->bdone += 6; + memcpy(udata, &sc->um_report, 6); + data->blen = len - 6; + if (data->blen > 0) + err = USB_ERR_SHORT_XFER; + } + + sc->polling = 0; + pthread_mutex_unlock(&sc->mtx); + } else { + USB_DATA_SET_ERRCODE(data, USB_STALL); + err = USB_ERR_STALLED; + } + +done: + return (err); +} + +static int +umouse_reset(void *scarg) +{ + struct umouse_softc *sc; + + sc = scarg; + + sc->newdata = 0; + + return (0); +} + +static int +umouse_remove(void *scarg) +{ + + return (0); +} + +static int +umouse_stop(void *scarg) +{ + + return (0); +} + + +struct usb_devemu ue_mouse = { + .ue_emu = "tablet", + .ue_usbver = 3, + .ue_usbspeed = USB_SPEED_HIGH, + .ue_init = umouse_init, + .ue_request = umouse_request, + .ue_data = umouse_data_handler, + .ue_reset = umouse_reset, + .ue_remove = umouse_remove, + .ue_stop = umouse_stop +}; +USB_EMUL_SET(ue_mouse); diff --git a/usr/src/cmd/bhyve/vga.c b/usr/src/cmd/bhyve/vga.c index 4330741042..314ddeb1e8 100644 --- a/usr/src/cmd/bhyve/vga.c +++ b/usr/src/cmd/bhyve/vga.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * @@ -24,6 +26,10 @@ * SUCH DAMAGE. */ +/* + * Copyright 2018 Joyent, Inc. + */ + #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); @@ -161,10 +167,10 @@ struct vga_softc { */ struct { uint8_t dac_state; - int dac_rd_index; - int dac_rd_subindex; - int dac_wr_index; - int dac_wr_subindex; + uint8_t dac_rd_index; + uint8_t dac_rd_subindex; + uint8_t dac_wr_index; + uint8_t dac_wr_subindex; uint8_t dac_palette[3 * 256]; uint32_t dac_palette_rgb[256]; } vga_dac; @@ -187,8 +193,10 @@ vga_check_size(struct bhyvegc *gc, struct vga_softc *sc) if (vga_in_reset(sc)) return; - old_width = sc->gc_width; - old_height = sc->gc_height; + //old_width = sc->gc_width; + //old_height = sc->gc_height; + old_width = sc->gc_image->width; + old_height = sc->gc_image->height; /* * Horizontal Display End: For text modes this is the number @@ -263,7 +271,7 @@ vga_get_text_pixel(struct vga_softc *sc, int x, int y) offset = 2 * sc->vga_crtc.crtc_start_addr; offset += (y / 16 * sc->gc_width / dots) * 2 + (x / dots) * 2; - bit = 7 - (x % dots); + bit = 7 - (x % dots > 7 ? 
7 : x % dots); ch = sc->vga_ram[offset + 0 * 64*KB]; attr = sc->vga_ram[offset + 1 * 64*KB]; @@ -291,7 +299,7 @@ vga_get_text_pixel(struct vga_softc *sc, int x, int y) font = sc->vga_ram[font_offset + 2 * 64*KB]; - if ((bit > 0) && (font & (1 << bit))) + if (font & (1 << bit)) idx = sc->vga_atc.atc_palette[attr & 0xf]; else idx = sc->vga_atc.atc_palette[attr >> 4]; @@ -314,7 +322,7 @@ vga_render_text(struct vga_softc *sc) } } -static void +void vga_render(struct bhyvegc *gc, void *arg) { struct vga_softc *sc = arg; @@ -361,7 +369,11 @@ vga_mem_rd_handler(struct vmctx *ctx, uint64_t addr, void *arg1) /* * monochrome text mode: base 0xb0000 size 32kb */ +#ifdef __FreeBSD__ assert(0); +#else + abort(); +#endif case 0x3: /* * color text mode and CGA: base 0xb8000 size 32kb @@ -425,7 +437,11 @@ vga_mem_wr_handler(struct vmctx *ctx, uint64_t addr, uint8_t val, void *arg1) /* * monochrome text mode: base 0xb0000 size 32kb */ +#ifdef __FreeBSD__ assert(0); +#else + abort(); +#endif case 0x3: /* * color text mode and CGA: base 0xb8000 size 32kb @@ -858,6 +874,7 @@ vga_port_in_handler(struct vmctx *ctx, int in, int port, int bytes, assert(0); break; } + break; case DAC_DATA_PORT: *val = sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_rd_index + sc->vga_dac.dac_rd_subindex]; @@ -914,15 +931,33 @@ vga_port_in_handler(struct vmctx *ctx, int in, int port, int bytes, case GEN_INPUT_STS1_MONO_PORT: case GEN_INPUT_STS1_COLOR_PORT: sc->vga_atc.atc_flipflop = 0; +#ifdef __FreeBSD__ + sc->vga_sts1 = GEN_IS1_VR | GEN_IS1_DE; + //sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE); +#else + /* + * During the bhyve bring-up process, a guest image was failing + * to successfully boot. It appeared to be spinning, waiting + * for this value to be toggled. Until it can be ruled out + * that this is unnecessary (and documentation seems to + * indicate that it should be present), the toggle should + * remain. 
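+	 *
+	 * As an editorial illustration (not from the original commit),
+	 * a guest that synchronizes with vertical retrace typically
+	 * waits for the bit to change state:
+	 *
+	 *	while ((inb(0x3da) & GEN_IS1_VR) != 0)
+	 *		;	-- wait for retrace to end
+	 *	while ((inb(0x3da) & GEN_IS1_VR) == 0)
+	 *		;	-- wait for retrace to begin
+	 *
+	 * A status value that never changes can hang one of those
+	 * loops, while toggling on every read guarantees progress.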
+ */ sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE); +#endif *val = sc->vga_sts1; break; case GEN_FEATURE_CTRL_PORT: - assert(0); + // OpenBSD calls this with bytes = 1 + //assert(0); + *val = 0; + break; + case 0x3c3: + *val = 0; break; default: printf("XXX vga_port_in_handler() unhandled port 0x%x\n", port); - assert(0); + //assert(0); return (-1); } @@ -1060,7 +1095,7 @@ vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, sc->vga_atc.atc_color_select_45 = (val & ATC_CS_C45) << 4; sc->vga_atc.atc_color_select_67 = - (val & ATC_CS_C67) << 6; + ((val & ATC_CS_C67) >> 2) << 6; break; default: //printf("XXX VGA ATC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_atc.atc_index); @@ -1095,7 +1130,8 @@ vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, break; case SEQ_MEMORY_MODE: sc->vga_seq.seq_mm = val; - assert((sc->vga_seq.seq_mm & SEQ_MM_C4) == 0); + /* Windows queries Chain4 */ + //assert((sc->vga_seq.seq_mm & SEQ_MM_C4) == 0); break; default: //printf("XXX VGA SEQ: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_seq.seq_index); @@ -1161,6 +1197,9 @@ vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, sc->vga_gc.gc_mode_oe = (val & GC_MODE_OE) != 0; sc->vga_gc.gc_mode_rm = (val >> 3) & 0x1; sc->vga_gc.gc_mode_wm = val & 0x3; + + if (sc->gc_image) + sc->gc_image->vgamode = 1; break; case GC_MISCELLANEOUS: sc->vga_gc.gc_misc = val; @@ -1188,8 +1227,10 @@ vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes, case GEN_INPUT_STS1_COLOR_PORT: /* write to Feature Control Register */ break; +// case 0x3c3: +// break; default: - printf("XXX vga_port_out_handler() unhandled port 0x%x\n", port); + printf("XXX vga_port_out_handler() unhandled port 0x%x, val 0x%x\n", port, val); //assert(0); return (-1); } @@ -1248,8 +1289,8 @@ vga_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes, return (error); } -int -vga_init(void) +void * +vga_init(int io_only) { struct inout_port iop; struct vga_softc *sc; @@ -1270,6 +1311,12 @@ vga_init(void) assert(error == 0); } + sc->gc_image = console_get_image(); + + /* only handle io ports; vga graphics is disabled */ + if (io_only) + return(sc); + sc->mr.name = "VGA memory"; sc->mr.flags = MEM_F_RW; sc->mr.base = 640 * KB; @@ -1282,8 +1329,29 @@ vga_init(void) sc->vga_ram = malloc(256 * KB); memset(sc->vga_ram, 0, 256 * KB); - sc->gc_image = console_get_image(); - console_fb_register(vga_render, sc); + { + static uint8_t palette[] = { + 0x00,0x00,0x00, 0x00,0x00,0x2a, 0x00,0x2a,0x00, 0x00,0x2a,0x2a, + 0x2a,0x00,0x00, 0x2a,0x00,0x2a, 0x2a,0x2a,0x00, 0x2a,0x2a,0x2a, + 0x00,0x00,0x15, 0x00,0x00,0x3f, 0x00,0x2a,0x15, 0x00,0x2a,0x3f, + 0x2a,0x00,0x15, 0x2a,0x00,0x3f, 0x2a,0x2a,0x15, 0x2a,0x2a,0x3f, + }; + int i; + + memcpy(sc->vga_dac.dac_palette, palette, 16 * 3 * sizeof (uint8_t)); + for (i = 0; i < 16; i++) { + sc->vga_dac.dac_palette_rgb[i] = + ((((sc->vga_dac.dac_palette[3*i + 0] << 2) | + ((sc->vga_dac.dac_palette[3*i + 0] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*i + 0] & 0x1)) << 16) | + (((sc->vga_dac.dac_palette[3*i + 1] << 2) | + ((sc->vga_dac.dac_palette[3*i + 1] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*i + 1] & 0x1)) << 8) | + (((sc->vga_dac.dac_palette[3*i + 2] << 2) | + ((sc->vga_dac.dac_palette[3*i + 2] & 0x1) << 1) | + (sc->vga_dac.dac_palette[3*i + 2] & 0x1)) << 0)); + } + } - return (0); + return (sc); } diff --git a/usr/src/cmd/bhyve/vga.h b/usr/src/cmd/bhyve/vga.h index 14637b12b3..36c6dc15fa 100644 --- a/usr/src/cmd/bhyve/vga.h +++ 
b/usr/src/cmd/bhyve/vga.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * @@ -38,8 +40,8 @@ #define GEN_MISC_OUTPUT_PORT 0x3cc #define GEN_INPUT_STS1_MONO_PORT 0x3ba #define GEN_INPUT_STS1_COLOR_PORT 0x3da -#define GEN_IS1_VR 0x08 /* Vertical retrace */ -#define GEN_IS1_DE 0x01 /* Display enable not */ +#define GEN_IS1_VR 0x08 /* Vertical retrace */ +#define GEN_IS1_DE 0x01 /* Display enable not */ /* Attribute controller registers. */ #define ATC_IDX_PORT 0x3c0 @@ -49,14 +51,14 @@ #define ATC_PALETTE0 0 #define ATC_PALETTE15 15 #define ATC_MODE_CONTROL 16 -#define ATC_MC_IPS 0x80 /* Internal palette size */ -#define ATC_MC_GA 0x01 /* Graphics/alphanumeric */ +#define ATC_MC_IPS 0x80 /* Internal palette size */ +#define ATC_MC_GA 0x01 /* Graphics/alphanumeric */ #define ATC_OVERSCAN_COLOR 17 #define ATC_COLOR_PLANE_ENABLE 18 #define ATC_HORIZ_PIXEL_PANNING 19 #define ATC_COLOR_SELECT 20 -#define ATC_CS_C67 0x0c /* Color select bits 6+7 */ -#define ATC_CS_C45 0x03 /* Color select bits 4+5 */ +#define ATC_CS_C67 0x0c /* Color select bits 6+7 */ +#define ATC_CS_C45 0x03 /* Color select bits 4+5 */ /* Sequencer registers. */ #define SEQ_IDX_PORT 0x3c4 @@ -66,22 +68,22 @@ #define SEQ_RESET_ASYNC 0x1 #define SEQ_RESET_SYNC 0x2 #define SEQ_CLOCKING_MODE 1 -#define SEQ_CM_SO 0x20 /* Screen off */ -#define SEQ_CM_89 0x01 /* 8/9 dot clock */ +#define SEQ_CM_SO 0x20 /* Screen off */ +#define SEQ_CM_89 0x01 /* 8/9 dot clock */ #define SEQ_MAP_MASK 2 #define SEQ_CHAR_MAP_SELECT 3 -#define SEQ_CMS_SAH 0x20 /* Char map A bit 2 */ -#define SEQ_CMS_SAH_SHIFT 5 -#define SEQ_CMS_SA 0x0c /* Char map A bits 0+1 */ -#define SEQ_CMS_SA_SHIFT 2 -#define SEQ_CMS_SBH 0x10 /* Char map B bit 2 */ -#define SEQ_CMS_SBH_SHIFT 4 -#define SEQ_CMS_SB 0x03 /* Char map B bits 0+1 */ -#define SEQ_CMS_SB_SHIFT 0 +#define SEQ_CMS_SAH 0x20 /* Char map A bit 2 */ +#define SEQ_CMS_SAH_SHIFT 5 +#define SEQ_CMS_SA 0x0c /* Char map A bits 0+1 */ +#define SEQ_CMS_SA_SHIFT 2 +#define SEQ_CMS_SBH 0x10 /* Char map B bit 2 */ +#define SEQ_CMS_SBH_SHIFT 4 +#define SEQ_CMS_SB 0x03 /* Char map B bits 0+1 */ +#define SEQ_CMS_SB_SHIFT 0 #define SEQ_MEMORY_MODE 4 -#define SEQ_MM_C4 0x08 /* Chain 4 */ -#define SEQ_MM_OE 0x04 /* Odd/even */ -#define SEQ_MM_EM 0x02 /* Extended memory */ +#define SEQ_MM_C4 0x08 /* Chain 4 */ +#define SEQ_MM_OE 0x04 /* Odd/even */ +#define SEQ_MM_EM 0x02 /* Extended memory */ /* Graphics controller registers. 
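 *
 * (Editorial note) These follow the standard VGA index/data pattern:
 * software selects a register by writing its index to GC_IDX_PORT and
 * then accesses the selected register through GC_DATA_PORT, e.g.:
 *
 *	outb(GC_IDX_PORT, GC_MODE);	-- select the mode register
 *	outb(GC_DATA_PORT, val);	-- then write it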
*/ #define GC_IDX_PORT 0x3ce @@ -93,13 +95,13 @@ #define GC_DATA_ROTATE 3 #define GC_READ_MAP_SELECT 4 #define GC_MODE 5 -#define GC_MODE_OE 0x10 /* Odd/even */ -#define GC_MODE_C4 0x04 /* Chain 4 */ +#define GC_MODE_OE 0x10 /* Odd/even */ +#define GC_MODE_C4 0x04 /* Chain 4 */ #define GC_MISCELLANEOUS 6 -#define GC_MISC_GM 0x01 /* Graphics/alphanumeric */ -#define GC_MISC_MM 0x0c /* memory map */ -#define GC_MISC_MM_SHIFT 2 +#define GC_MISC_GM 0x01 /* Graphics/alphanumeric */ +#define GC_MISC_MM 0x0c /* memory map */ +#define GC_MISC_MM_SHIFT 2 #define GC_COLOR_DONT_CARE 7 #define GC_BIT_MASK 8 @@ -117,36 +119,36 @@ #define CRTC_END_HORIZ_RETRACE 5 #define CRTC_VERT_TOTAL 6 #define CRTC_OVERFLOW 7 -#define CRTC_OF_VRS9 0x80 /* VRS bit 9 */ -#define CRTC_OF_VRS9_SHIFT 7 -#define CRTC_OF_VDE9 0x40 /* VDE bit 9 */ -#define CRTC_OF_VDE9_SHIFT 6 -#define CRTC_OF_VRS8 0x04 /* VRS bit 8 */ -#define CRTC_OF_VRS8_SHIFT 2 -#define CRTC_OF_VDE8 0x02 /* VDE bit 8 */ -#define CRTC_OF_VDE8_SHIFT 1 +#define CRTC_OF_VRS9 0x80 /* VRS bit 9 */ +#define CRTC_OF_VRS9_SHIFT 7 +#define CRTC_OF_VDE9 0x40 /* VDE bit 9 */ +#define CRTC_OF_VDE9_SHIFT 6 +#define CRTC_OF_VRS8 0x04 /* VRS bit 8 */ +#define CRTC_OF_VRS8_SHIFT 2 +#define CRTC_OF_VDE8 0x02 /* VDE bit 8 */ +#define CRTC_OF_VDE8_SHIFT 1 #define CRTC_PRESET_ROW_SCAN 8 #define CRTC_MAX_SCAN_LINE 9 -#define CRTC_MSL_MSL 0x1f +#define CRTC_MSL_MSL 0x1f #define CRTC_CURSOR_START 10 -#define CRTC_CS_CO 0x20 /* Cursor off */ -#define CRTC_CS_CS 0x1f /* Cursor start */ +#define CRTC_CS_CO 0x20 /* Cursor off */ +#define CRTC_CS_CS 0x1f /* Cursor start */ #define CRTC_CURSOR_END 11 -#define CRTC_CE_CE 0x1f /* Cursor end */ +#define CRTC_CE_CE 0x1f /* Cursor end */ #define CRTC_START_ADDR_HIGH 12 #define CRTC_START_ADDR_LOW 13 #define CRTC_CURSOR_LOC_HIGH 14 #define CRTC_CURSOR_LOC_LOW 15 #define CRTC_VERT_RETRACE_START 16 #define CRTC_VERT_RETRACE_END 17 -#define CRTC_VRE_MASK 0xf +#define CRTC_VRE_MASK 0xf #define CRTC_VERT_DISP_END 18 #define CRTC_OFFSET 19 #define CRTC_UNDERLINE_LOC 20 #define CRTC_START_VERT_BLANK 21 #define CRTC_END_VERT_BLANK 22 #define CRTC_MODE_CONTROL 23 -#define CRTC_MC_TE 0x80 /* Timing enable */ +#define CRTC_MC_TE 0x80 /* Timing enable */ #define CRTC_LINE_COMPARE 24 /* DAC registers */ @@ -155,6 +157,6 @@ #define DAC_IDX_WR_PORT 0x3c8 #define DAC_DATA_PORT 0x3c9 -int vga_init(void); +void *vga_init(int io_only); #endif /* _VGA_H_ */ diff --git a/usr/src/cmd/bhyve/virtio.c b/usr/src/cmd/bhyve/virtio.c index c3b11dc439..47a3ed29ba 100644 --- a/usr/src/cmd/bhyve/virtio.c +++ b/usr/src/cmd/bhyve/virtio.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Chris Torek <torek @ torek net> * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -25,11 +28,13 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/virtio.c 270326 2014-08-22 13:01:22Z tychon $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/uio.h> +#include <machine/atomic.h> + #include <stdio.h> #include <stdint.h> #include <pthread.h> @@ -49,7 +54,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyve/virtio.c 270326 2014-08-22 13:01:22Z tyc * front of virtio-based device softc" constraint, let's use * this to convert. 
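 *
 * (Editorial illustration) each device lays out its softc with the
 * generic state first, along the lines of this hypothetical example:
 *
 *	struct pci_vtfoo_softc {
 *		struct virtio_softc vsc_vs;	-- must be first
 *		...device-specific state...
 *	};
 *
 * which is what makes the pointer cast below safe.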
*/ -#define DEV_SOFTC(vs) ((void *)(vs)) +#define DEV_SOFTC(vs) ((void *)(vs)) /* * Link a virtio_softc to its constants, the device softc, and @@ -97,6 +102,7 @@ vi_reset_dev(struct virtio_softc *vs) for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) { vq->vq_flags = 0; vq->vq_last_avail = 0; + vq->vq_save_used = 0; vq->vq_pfn = 0; vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR; } @@ -147,8 +153,13 @@ vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix) return (1); } else vs->vs_flags &= ~VIRTIO_USE_MSIX; + /* Only 1 MSI vector for bhyve */ pci_emul_add_msicap(vs->vs_pi, 1); + + /* Legacy interrupts are mandatory for virtio devices */ + pci_lintr_request(vs->vs_pi); + return (0); } @@ -188,6 +199,7 @@ vi_vq_init(struct virtio_softc *vs, uint32_t pfn) /* Mark queue as allocated, and start at 0 when we use it. */ vq->vq_flags = VQ_ALLOC; vq->vq_last_avail = 0; + vq->vq_save_used = 0; } /* @@ -247,12 +259,12 @@ _vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx, * that vq_has_descs() does one). */ int -vq_getchain(struct vqueue_info *vq, +vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, int n_iov, uint16_t *flags) { int i; u_int ndesc, n_indir; - u_int idx, head, next; + u_int idx, next; volatile struct virtio_desc *vdir, *vindir, *vp; struct vmctx *ctx; struct virtio_softc *vs; @@ -295,8 +307,8 @@ vq_getchain(struct vqueue_info *vq, * index, but we just abort if the count gets excessive. */ ctx = vs->vs_pi->pi_vmctx; - head = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; - next = head; + *pidx = next = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)]; + vq->vq_last_avail++; for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) { if (next >= vq->vq_qsize) { fprintf(stderr, @@ -309,7 +321,7 @@ vq_getchain(struct vqueue_info *vq, if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) { _vq_record(i, vdir, ctx, iov, n_iov, flags); i++; - } else if ((vs->vs_negotiated_caps & + } else if ((vs->vs_vc->vc_hv_caps & VIRTIO_RING_F_INDIRECT_DESC) == 0) { fprintf(stderr, "%s: descriptor has forbidden INDIRECT flag, " @@ -370,16 +382,29 @@ loopy: } /* - * Return the currently-first request chain to the guest, setting - * its I/O length to the provided value. + * Return the currently-first request chain back to the available queue. * * (This chain is the one you handled when you called vq_getchain() * and used its positive return value.) */ void -vq_relchain(struct vqueue_info *vq, uint32_t iolen) +vq_retchain(struct vqueue_info *vq) { - uint16_t head, uidx, mask; + + vq->vq_last_avail--; +} + +/* + * Return specified request chain to the guest, setting its I/O length + * to the provided value. + * + * (This chain is the one you handled when you called vq_getchain() + * and used its positive return value.) + */ +void +vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen) +{ + uint16_t uidx, mask; volatile struct vring_used *vuh; volatile struct virtio_used *vue; @@ -395,12 +420,17 @@ vq_relchain(struct vqueue_info *vq, uint32_t iolen) */ mask = vq->vq_qsize - 1; vuh = vq->vq_used; - head = vq->vq_avail->va_ring[vq->vq_last_avail++ & mask]; uidx = vuh->vu_idx; vue = &vuh->vu_ring[uidx++ & mask]; - vue->vu_idx = head; /* ie, vue->id = head */ + vue->vu_idx = idx; vue->vu_tlen = iolen; + + /* + * Ensure the used descriptor is visible before updating the index. + * This is necessary on ISAs with memory ordering less strict than x86. 
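+	 *
+	 * (Editorial note) The release fence orders the vu_idx/vu_tlen
+	 * stores above before the vuh->vu_idx store below; the guest
+	 * driver pairs it with an acquire on its side, conceptually:
+	 *
+	 *	host (here)			guest driver
+	 *	fill used-ring entry		read vu_idx (acquire)
+	 *	fence (release)			read used-ring entry
+	 *	publish vu_idx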
+ */ + atomic_thread_fence_rel(); vuh->vu_idx = uidx; } @@ -436,8 +466,15 @@ vq_endchains(struct vqueue_info *vq, int used_all_avail) * entire avail was processed, we need to interrupt always. */ vs = vq->vq_vs; - new_idx = vq->vq_used->vu_idx; old_idx = vq->vq_save_used; + vq->vq_save_used = new_idx = vq->vq_used->vu_idx; + + /* + * Use full memory barrier between vu_idx store from preceding + * vq_relchain() call and the loads from VQ_USED_EVENT_IDX() or + * va_flags below. + */ + atomic_thread_fence_seq_cst(); if (used_all_avail && (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY)) intr = 1; @@ -698,6 +735,9 @@ bad: switch (offset) { case VTCFG_R_GUESTCAP: vs->vs_negotiated_caps = value & vc->vc_hv_caps; + if (vc->vc_apply_features) + (*vc->vc_apply_features)(DEV_SOFTC(vs), + vs->vs_negotiated_caps); break; case VTCFG_R_PFN: if (vs->vs_curq >= vc->vc_nvq) diff --git a/usr/src/cmd/bhyve/virtio.h b/usr/src/cmd/bhyve/virtio.h index 1a2ebe8118..a2c3362ec2 100644 --- a/usr/src/cmd/bhyve/virtio.h +++ b/usr/src/cmd/bhyve/virtio.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Chris Torek <torek @ torek net> * All rights reserved. * @@ -23,12 +25,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/virtio.h 268276 2014-07-05 02:38:53Z grehan $ + * $FreeBSD$ */ #ifndef _VIRTIO_H_ #define _VIRTIO_H_ +#include <pthread_np.h> + /* * These are derived from several virtio specifications. * @@ -184,7 +188,7 @@ struct vring_used { /* * PFN register shift amount */ -#define VRING_PFN 12 +#define VRING_PFN 12 /* * Virtio device types @@ -209,7 +213,9 @@ struct vring_used { #define VIRTIO_VENDOR 0x1AF4 #define VIRTIO_DEV_NET 0x1000 #define VIRTIO_DEV_BLOCK 0x1001 -#define VIRTIO_DEV_RANDOM 0x1002 +#define VIRTIO_DEV_CONSOLE 0x1003 +#define VIRTIO_DEV_RANDOM 0x1005 +#define VIRTIO_DEV_SCSI 0x1008 /* * PCI config space constants. @@ -220,19 +226,19 @@ struct vring_used { * If MSI-X is not enabled, those two registers disappear and * the remaining configuration registers start at offset 20. */ -#define VTCFG_R_HOSTCAP 0 -#define VTCFG_R_GUESTCAP 4 -#define VTCFG_R_PFN 8 -#define VTCFG_R_QNUM 12 -#define VTCFG_R_QSEL 14 -#define VTCFG_R_QNOTIFY 16 -#define VTCFG_R_STATUS 18 -#define VTCFG_R_ISR 19 -#define VTCFG_R_CFGVEC 20 -#define VTCFG_R_QVEC 22 -#define VTCFG_R_CFG0 20 /* No MSI-X */ -#define VTCFG_R_CFG1 24 /* With MSI-X */ -#define VTCFG_R_MSIX 20 +#define VTCFG_R_HOSTCAP 0 +#define VTCFG_R_GUESTCAP 4 +#define VTCFG_R_PFN 8 +#define VTCFG_R_QNUM 12 +#define VTCFG_R_QSEL 14 +#define VTCFG_R_QNOTIFY 16 +#define VTCFG_R_STATUS 18 +#define VTCFG_R_ISR 19 +#define VTCFG_R_CFGVEC 20 +#define VTCFG_R_QVEC 22 +#define VTCFG_R_CFG0 20 /* No MSI-X */ +#define VTCFG_R_CFG1 24 /* With MSI-X */ +#define VTCFG_R_MSIX 20 /* * Bits in VTCFG_R_STATUS. Guests need not actually set any of these, @@ -251,7 +257,7 @@ struct vring_used { #define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */ #define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */ -#define VIRTIO_MSI_NO_VECTOR 0xFFFF +#define VIRTIO_MSI_NO_VECTOR 0xFFFF /* * Feature flags. 
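 *
 * (Editorial sketch) Devices that need to react to the outcome of
 * feature negotiation can supply the vc_apply_features hook added in
 * the hunk below; a hypothetical implementation would be:
 *
 *	static void
 *	pci_vtfoo_apply_features(void *vsc, uint64_t negotiated_caps)
 *	{
 *		struct pci_vtfoo_softc *sc = vsc;
 *
 *		sc->vsc_caps = negotiated_caps;
 *	}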
@@ -352,6 +358,8 @@ struct virtio_consts { /* called to read config regs */ int (*vc_cfgwrite)(void *, int, int, uint32_t); /* called to write config regs */ + void (*vc_apply_features)(void *, uint64_t); + /* called to apply negotiated features */ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */ }; @@ -423,20 +431,6 @@ vq_has_descs(struct vqueue_info *vq) } /* - * Called by virtio driver as it starts processing chains. Each - * completed chain (obtained from vq_getchain()) is released by - * calling vq_relchain(), then when all are done, vq_endchains() - * can tell if / how-many chains were processed and know whether - * and how to generate an interrupt. - */ -static inline void -vq_startchains(struct vqueue_info *vq) -{ - - vq->vq_save_used = vq->vq_used->vu_idx; -} - -/* * Deliver an interrupt to guest on the given virtual queue * (if possible, or a generic MSI interrupt if not using MSI-X). */ @@ -447,11 +441,25 @@ vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq) if (pci_msix_enabled(vs->vs_pi)) pci_generate_msix(vs->vs_pi, vq->vq_msix_idx); else { +#ifndef __FreeBSD__ + boolean_t unlock = B_FALSE; + + if (vs->vs_mtx && !pthread_mutex_isowned_np(vs->vs_mtx)) { + unlock = B_TRUE; + pthread_mutex_lock(vs->vs_mtx); + } +#else VS_LOCK(vs); +#endif vs->vs_isr |= VTCFG_ISR_QUEUES; pci_generate_msi(vs->vs_pi, 0); pci_lintr_assert(vs->vs_pi); +#ifndef __FreeBSD__ + if (unlock) + pthread_mutex_unlock(vs->vs_mtx); +#else VS_UNLOCK(vs); +#endif } } @@ -463,9 +471,10 @@ int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix); void vi_reset_dev(struct virtio_softc *); void vi_set_io_bar(struct virtio_softc *, int); -int vq_getchain(struct vqueue_info *vq, +int vq_getchain(struct vqueue_info *vq, uint16_t *pidx, struct iovec *iov, int n_iov, uint16_t *flags); -void vq_relchain(struct vqueue_info *vq, uint32_t iolen); +void vq_retchain(struct vqueue_info *vq); +void vq_relchain(struct vqueue_info *vq, uint16_t idx, uint32_t iolen); void vq_endchains(struct vqueue_info *vq, int used_all_avail); uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, diff --git a/usr/src/cmd/bhyve/xmsr.c b/usr/src/cmd/bhyve/xmsr.c index 0c097251e0..994445b3e3 100644 --- a/usr/src/cmd/bhyve/xmsr.c +++ b/usr/src/cmd/bhyve/xmsr.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,11 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/xmsr.c 279227 2015-02-24 05:15:40Z neel $ + * $FreeBSD$ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyve/xmsr.c 279227 2015-02-24 05:15:40Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/types.h> @@ -77,6 +79,7 @@ emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t val) return (0); case MSR_NB_CFG1: + case MSR_LS_CFG: case MSR_IC_CFG: return (0); /* Ignore writes */ @@ -146,6 +149,7 @@ emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val) break; case MSR_NB_CFG1: + case MSR_LS_CFG: case MSR_IC_CFG: /* * The reset value is processor family dependent so @@ -195,12 +199,23 @@ emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val) /* * OpenBSD guests test bit 0 of this MSR to detect if the * workaround for erratum 721 is already applied. 
- * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf + * https://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf */ case 0xC0011029: *val = 1; break; +#ifndef __FreeBSD__ + case MSR_VM_CR: + /* + * We currently don't support nested virt. + * Windows seems to ignore the cpuid bits and reads this + * MSR anyways. + */ + *val = VM_CR_SVMDIS; + break; +#endif + default: error = -1; break; diff --git a/usr/src/cmd/bhyve/xmsr.h b/usr/src/cmd/bhyve/xmsr.h index ac3c147442..1fb47c3ae2 100644 --- a/usr/src/cmd/bhyve/xmsr.h +++ b/usr/src/cmd/bhyve/xmsr.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyve/xmsr.h 271888 2014-09-20 02:35:21Z neel $ + * $FreeBSD$ */ #ifndef _XMSR_H_ diff --git a/usr/src/cmd/bhyveconsole/Makefile b/usr/src/cmd/bhyveconsole/Makefile deleted file mode 100644 index 11d34e6599..0000000000 --- a/usr/src/cmd/bhyveconsole/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -include ../Makefile.cmd - -SUBDIRS= $(MACH) - -all := TARGET = all -install := TARGET = install -clean := TARGET = clean -clobber := TARGET = clobber -lint := TARGET = lint - -.KEEP_STATE: - -all: $(SUBDIRS) - -clean clobber lint: $(SUBDIRS) - -install: $(SUBDIRS) - -$(RM) $(ROOTUSRSBINPROG) - -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) - -$(SUBDIRS): FRC - @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) - -FRC: - -include ../Makefile.targ diff --git a/usr/src/cmd/bhyveconsole/bhyveconsole.c b/usr/src/cmd/bhyveconsole/bhyveconsole.c deleted file mode 100644 index 7f237a72f6..0000000000 --- a/usr/src/cmd/bhyveconsole/bhyveconsole.c +++ /dev/null @@ -1,360 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2013 Pluribus Networks Inc. - */ - -#include <sys/param.h> -#include <sys/signal.h> -#include <sys/socket.h> -#include <sys/termios.h> -#include <assert.h> -#include <errno.h> -#include <libgen.h> -#include <stdarg.h> -#include <stdio.h> -#include <strings.h> -#include <unistd.h> - -#include <bhyve.h> - -static int masterfd; -static struct termios save_termios; -static int save_fd; - -static int nocmdchar = 0; -static char cmdchar = '~'; - -static const char *pname; - -#define BCONS_BUFSIZ 8192 - -static void -usage(void) -{ - (void) fprintf(stderr, "usage: %s vmname\n", pname); - exit(2); -} - -static void -bcons_error(const char *fmt, ...) 
-{ - va_list alist; - - (void) fprintf(stderr, "%s: ", pname); - va_start(alist, fmt); - (void) vfprintf(stderr, fmt, alist); - va_end(alist); - (void) fprintf(stderr, "\n"); -} - -static void -bcons_perror(const char *str) -{ - const char *estr; - - if ((estr = strerror(errno)) != NULL) - (void) fprintf(stderr, "%s: %s: %s\n", pname, str, estr); - else - (void) fprintf(stderr, "%s: %s: errno %d\n", pname, str, errno); -} - -/* - * Create the unix domain socket and call bhyve; handshake - * with it to determine whether it will allow us to connect. - */ -static int -get_console(const char *vmname) -{ - int sockfd = -1; - struct sockaddr_un servaddr; - char clientid[MAXPATHLEN]; - char handshake[MAXPATHLEN], c; - int msglen; - int i = 0, err = 0; - - if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) { - bcons_perror("could not create socket"); - return (-1); - } - - bzero(&servaddr, sizeof (servaddr)); - servaddr.sun_family = AF_UNIX; - (void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path), - BHYVE_CONS_SOCKPATH, vmname); - - if (connect(sockfd, (struct sockaddr *)&servaddr, - sizeof (servaddr)) == -1) { - bcons_perror("Could not connect to console server"); - goto bad; - } - masterfd = sockfd; - - msglen = snprintf(clientid, sizeof (clientid), "IDENT %lu\n", - getpid()); - assert(msglen > 0 && msglen < sizeof (clientid)); - - if (write(masterfd, clientid, msglen) != msglen) { - bcons_error("protocol error"); - goto bad; - } - - /* - * Take care not to accumulate more than our fill, and leave room for - * the NUL at the end. - */ - while ((err = read(masterfd, &c, 1)) == 1) { - if (i >= (sizeof (handshake) - 1)) - break; - if (c == '\n') - break; - handshake[i] = c; - i++; - } - handshake[i] = '\0'; - - /* - * If something went wrong during the handshake we bail; perhaps - * the server died off. - */ - if (err == -1) { - bcons_perror("Could not connect to console server"); - goto bad; - } - - if (strncmp(handshake, "OK", sizeof (handshake)) == 0) - return (0); - - bcons_error("Console is already in use by process ID %s.", - handshake); -bad: - (void) close(sockfd); - masterfd = -1; - return (-1); -} - -/* - * Place terminal into raw mode. - */ -static int -set_tty_rawmode(int fd) -{ - struct termios term; - if (tcgetattr(fd, &term) < 0) { - bcons_perror("failed to get user terminal settings"); - return (-1); - } - - /* Stash for later, so we can revert back to previous mode */ - save_termios = term; - save_fd = fd; - - /* disable 8->7 bit strip, start/stop, enable any char to restart */ - term.c_iflag &= ~(ISTRIP|IXON|IXANY); - /* disable NL->CR, CR->NL, ignore CR, UPPER->lower */ - term.c_iflag &= ~(INLCR|ICRNL|IGNCR|IUCLC); - /* disable output post-processing */ - term.c_oflag &= ~OPOST; - /* disable canonical mode, signal chars, echo & extended functions */ - term.c_lflag &= ~(ICANON|ISIG|ECHO|IEXTEN); - - term.c_cc[VMIN] = 1; /* byte-at-a-time */ - term.c_cc[VTIME] = 0; - - if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &term)) { - bcons_perror("failed to set user terminal to raw mode"); - return (-1); - } - - return (0); -} - -/* - * reset terminal settings for global environment - */ -static void -reset_tty(void) -{ - (void) tcsetattr(save_fd, TCSADRAIN, &save_termios); -} - -/* - * process_user_input watches the input stream for the escape sequence for - * 'quit' (by default, tilde-period). Because we might be fed just one - * keystroke at a time, state associated with the user input (are we at the - * beginning of the line? are we locally echoing the next character?) 
is - * maintained by beginning_of_line and local_echo across calls to the routine. - * - * This routine returns -1 when the 'quit' escape sequence has been issued, - * or an error is encountered and 0 otherwise. - */ -static int -process_user_input(int out_fd, int in_fd) -{ - static boolean_t beginning_of_line = B_TRUE; - static boolean_t local_echo = B_FALSE; - char ibuf[BCONS_BUFSIZ]; - int nbytes; - char *buf = ibuf; - char c; - - nbytes = read(in_fd, ibuf, sizeof (ibuf)); - if (nbytes == -1 && errno != EINTR) - return (-1); - - if (nbytes == -1) /* The read was interrupted. */ - return (0); - - for (c = *buf; nbytes > 0; c = *buf, --nbytes) { - buf++; - if (beginning_of_line && !nocmdchar) { - beginning_of_line = B_FALSE; - if (c == cmdchar) { - local_echo = B_TRUE; - continue; - } - } else if (local_echo) { - local_echo = B_FALSE; - if (c == '.') { - (void) write(STDOUT_FILENO, &cmdchar, 1); - (void) write(STDOUT_FILENO, &c, 1); - return (-1); - } - } - - (void) write(out_fd, &c, 1); - - beginning_of_line = (c == '\r' || c == '\n'); - } - - return (0); -} - -static int -process_output(int in_fd, int out_fd) -{ - int wrote = 0; - int cc; - char ibuf[BCONS_BUFSIZ]; - - cc = read(in_fd, ibuf, sizeof (ibuf)); - if (cc == -1 && errno != EINTR) - return (-1); - if (cc == 0) /* EOF */ - return (-1); - if (cc == -1) /* The read was interrupted. */ - return (0); - - do { - int len; - - len = write(out_fd, ibuf + wrote, cc - wrote); - if (len == -1 && errno != EINTR) - return (-1); - if (len != -1) - wrote += len; - } while (wrote < cc); - - return (0); -} - -/* - * This is the main I/O loop. - */ -static void -doio(void) -{ - struct pollfd pollfds[2]; - int res; - - /* read from vm and write to stdout */ - pollfds[0].fd = masterfd; - pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI; - - /* read from stdin and write to vm */ - pollfds[1].fd = STDIN_FILENO; - pollfds[1].events = pollfds[0].events; - - for (;;) { - pollfds[0].revents = pollfds[1].revents = 0; - - res = poll(pollfds, - sizeof (pollfds) / sizeof (struct pollfd), -1); - - if (res == -1 && errno != EINTR) { - bcons_perror("poll failed"); - /* we are hosed, close connection */ - break; - } - - /* event from master side stdout */ - if (pollfds[0].revents) { - if (pollfds[0].revents & - (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { - if (process_output(masterfd, STDOUT_FILENO) - != 0) - break; - } else { - break; - } - } - - /* event from user stdin side */ - if (pollfds[1].revents) { - if (pollfds[1].revents & - (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) { - if (process_user_input(masterfd, STDIN_FILENO) - != 0) - break; - } else { - break; - } - } - } -} - -int -main(int argc, char **argv) -{ - char *vmname; - - pname = basename(argv[0]); - - if (argc == 2) { - vmname = argv[1]; - } else { - usage(); - } - - /* - * Make contact with bhyve - */ - if (get_console(vmname) == -1) - return (1); - - (void) printf("[Connected to vm '%s' console]\n", vmname); - - if (set_tty_rawmode(STDIN_FILENO) == -1) { - reset_tty(); - bcons_perror("failed to set stdin pty to raw mode"); - return (1); - } - - /* - * Run the I/O loop until we get disconnected. 
- */ - doio(); - reset_tty(); - (void) printf("\n[Connection to vm '%s' console closed]\n", vmname); - - return (0); -} diff --git a/usr/src/cmd/bhyveconsole/i386/Makefile b/usr/src/cmd/bhyveconsole/i386/Makefile deleted file mode 100644 index c4f317a9fa..0000000000 --- a/usr/src/cmd/bhyveconsole/i386/Makefile +++ /dev/null @@ -1,43 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -PROG= bhyveconsole - -OBJS= bhyveconsole.o - -SRCS= $(OBJS:%.o=../%.c) - -include ../../Makefile.cmd - -CFLAGS += $(CCVERBOSE) -LDLIBS += -lsocket - -.KEEP_STATE: - -%.o: ../%.c - $(COMPILE.c) $< - -all: $(PROG) - -$(PROG): $(OBJS) - $(LINK.c) $(OBJS) -o $@ $(LDLIBS) - $(POST_PROCESS) - -install: all $(ROOTUSRSBINPROG32) - -clean: - $(RM) $(OBJS) - -include ../../Makefile.targ diff --git a/usr/src/cmd/bhyvectl/Makefile b/usr/src/cmd/bhyvectl/Makefile index fe98204056..0a8a96cfc9 100644 --- a/usr/src/cmd/bhyvectl/Makefile +++ b/usr/src/cmd/bhyvectl/Makefile @@ -11,31 +11,50 @@ # # Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. # PROG = bhyvectl include ../Makefile.cmd +include ../Makefile.cmd.64 -$(BUILD64)SUBDIRS += $(MACH64) +SRCS = bhyvectl.c +OBJS = $(SRCS:.c=.o) humanize_number.o -all := TARGET = all -install := TARGET = install -clean := TARGET = clean -clobber := TARGET = clobber -lint := TARGET = lint +CLEANFILES = $(PROG) +CLOBBERFILES += $(ROOTUSRSBINPROG) .KEEP_STATE: -all clean clobber lint: $(SUBDIRS) +CFLAGS += $(CCVERBOSE) +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ + -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64 \ + $(CPPFLAGS.master) \ + -I$(SRC)/uts/i86pc/io/vmm \ + -I$(SRC)/uts/i86pc +LDLIBS += -lvmmapi -install: $(SUBDIRS) - -$(RM) $(ROOTUSRSBINPROG) - -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) +CERRWARN += -_gcc=-Wno-uninitialized -$(SUBDIRS): FRC - @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) +# main() is too hairy for smatch +bhyvectl.o := SMATCH=off -FRC: +all: $(PROG) + +$(PROG): $(OBJS) + $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) + $(POST_PROCESS) + +install: all $(ROOTUSRSBINPROG) + +clean: + $(RM) $(OBJS) $(CLEANFILES) + +lint: lint_SRCS include ../Makefile.targ + +%.o: $(CONTRIB)/freebsd/lib/libutil/%.c + $(COMPILE.c) -o $@ $< + $(POST_PROCESS_O) diff --git a/usr/src/cmd/bhyvectl/Makefile.com b/usr/src/cmd/bhyvectl/Makefile.com deleted file mode 100644 index 03ca34792c..0000000000 --- a/usr/src/cmd/bhyvectl/Makefile.com +++ /dev/null @@ -1,48 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. 
-# - -PROG= bhyvectl - -SRCS = bhyvectl.c -OBJS = $(SRCS:.c=.o) - -include ../../Makefile.cmd - -.KEEP_STATE: - -CFLAGS += $(CCVERBOSE) -CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ - -I$(ROOT)/usr/platform/i86pc/include \ - -I$(SRC)/uts/i86pc/io/vmm -LDLIBS += -lvmmapi - -all: $(PROG) - -$(PROG): $(OBJS) - $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) - $(POST_PROCESS) - -install: all $(ROOTUSRSBINPROG) - -clean: - $(RM) $(OBJS) - -lint: lint_SRCS - -include ../../Makefile.targ - -%.o: ../%.c - $(COMPILE.c) -I$(SRC)/common $< - $(POST_PROCESS_O) diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c index 07d0a83df5..b8bdf524a9 100644 --- a/usr/src/cmd/bhyvectl/bhyvectl.c +++ b/usr/src/cmd/bhyvectl/bhyvectl.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -38,30 +40,39 @@ /* * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/types.h> #include <sys/sysctl.h> #include <sys/errno.h> #include <sys/mman.h> +#include <sys/cpuset.h> #include <stdio.h> #include <stdlib.h> +#include <stdbool.h> +#include <string.h> #include <unistd.h> #include <libgen.h> #include <libutil.h> #include <fcntl.h> -#include <string.h> #include <getopt.h> +#include <time.h> #include <assert.h> +#include <libutil.h> +#include <machine/cpufunc.h> +#include <machine/specialreg.h> #include <machine/vmm.h> +#include <machine/vmm_dev.h> #include <vmmapi.h> +#include "amd/vmcb.h" #include "intel/vmcs.h" #define MB (1UL << 20) @@ -74,7 +85,7 @@ __FBSDID("$FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43 static const char *progname; static void -usage(void) +usage(bool cpu_intel) { (void)fprintf(stderr, @@ -82,6 +93,9 @@ usage(void) " [--cpu=<vcpu_number>]\n" " [--create]\n" " [--destroy]\n" +#ifndef __FreeBSD__ + " [--wrlock-cycle]\n" +#endif " [--get-all]\n" " [--get-stats]\n" " [--set-desc-ds]\n" @@ -113,10 +127,22 @@ usage(void) " [--desc-access=<ACCESS>]\n" " [--set-cr0=<CR0>]\n" " [--get-cr0]\n" + " [--set-cr2=<CR2>]\n" + " [--get-cr2]\n" " [--set-cr3=<CR3>]\n" " [--get-cr3]\n" " [--set-cr4=<CR4>]\n" " [--get-cr4]\n" + " [--set-dr0=<DR0>]\n" + " [--get-dr0]\n" + " [--set-dr1=<DR1>]\n" + " [--get-dr1]\n" + " [--set-dr2=<DR2>]\n" + " [--get-dr2]\n" + " [--set-dr3=<DR3>]\n" + " [--get-dr3]\n" + " [--set-dr6=<DR6>]\n" + " [--get-dr6]\n" " [--set-dr7=<DR7>]\n" " [--get-dr7]\n" " [--set-rsp=<RSP>]\n" @@ -155,64 +181,108 @@ usage(void) " [--get-ss]\n" " [--get-tr]\n" " [--get-ldtr]\n" - " [--get-vmcs-pinbased-ctls]\n" - " [--get-vmcs-procbased-ctls]\n" - " [--get-vmcs-procbased-ctls2]\n" - " [--get-vmcs-entry-interruption-info]\n" - " [--set-vmcs-entry-interruption-info=<info>]\n" - " [--get-vmcs-eptp]\n" - " [--get-vmcs-guest-physical-address\n" - " [--get-vmcs-guest-linear-address\n" - " [--set-vmcs-exception-bitmap]\n" - " [--get-vmcs-exception-bitmap]\n" - " [--get-vmcs-io-bitmap-address]\n" - " [--get-vmcs-tsc-offset]\n" - " [--get-vmcs-guest-pat]\n" - " [--get-vmcs-host-pat]\n" - " 
[--get-vmcs-host-cr0]\n" - " [--get-vmcs-host-cr3]\n" - " [--get-vmcs-host-cr4]\n" - " [--get-vmcs-host-rip]\n" - " [--get-vmcs-host-rsp]\n" - " [--get-vmcs-cr0-mask]\n" - " [--get-vmcs-cr0-shadow]\n" - " [--get-vmcs-cr4-mask]\n" - " [--get-vmcs-cr4-shadow]\n" - " [--get-vmcs-cr3-targets]\n" - " [--get-vmcs-apic-access-address]\n" - " [--get-vmcs-virtual-apic-address]\n" - " [--get-vmcs-tpr-threshold]\n" - " [--get-vmcs-msr-bitmap]\n" - " [--get-vmcs-msr-bitmap-address]\n" - " [--get-vmcs-vpid]\n" - " [--get-vmcs-ple-gap]\n" - " [--get-vmcs-ple-window]\n" - " [--get-vmcs-instruction-error]\n" - " [--get-vmcs-exit-ctls]\n" - " [--get-vmcs-entry-ctls]\n" - " [--get-vmcs-guest-sysenter]\n" - " [--get-vmcs-link]\n" - " [--get-vmcs-exit-reason]\n" - " [--get-vmcs-exit-qualification]\n" - " [--get-vmcs-exit-interruption-info]\n" - " [--get-vmcs-exit-interruption-error]\n" - " [--get-vmcs-interruptibility]\n" " [--set-x2apic-state=<state>]\n" " [--get-x2apic-state]\n" " [--unassign-pptdev=<bus/slot/func>]\n" " [--set-mem=<memory in units of MB>]\n" " [--get-lowmem]\n" - " [--get-highmem]\n", + " [--get-highmem]\n" + " [--get-gpa-pmap]\n" + " [--assert-lapic-lvt=<pin>]\n" + " [--inject-nmi]\n" + " [--force-reset]\n" + " [--force-poweroff]\n" + " [--get-rtc-time]\n" + " [--set-rtc-time=<secs>]\n" + " [--get-rtc-nvram]\n" + " [--set-rtc-nvram=<val>]\n" + " [--rtc-nvram-offset=<offset>]\n" + " [--get-active-cpus]\n" + " [--get-suspended-cpus]\n" + " [--get-intinfo]\n" + " [--get-eptp]\n" + " [--set-exception-bitmap]\n" + " [--get-exception-bitmap]\n" + " [--get-tsc-offset]\n" + " [--get-guest-pat]\n" + " [--get-io-bitmap-address]\n" + " [--get-msr-bitmap]\n" + " [--get-msr-bitmap-address]\n" + " [--get-guest-sysenter]\n" + " [--get-exit-reason]\n" + " [--get-cpu-topology]\n", progname); + + if (cpu_intel) { + (void)fprintf(stderr, + " [--get-vmcs-pinbased-ctls]\n" + " [--get-vmcs-procbased-ctls]\n" + " [--get-vmcs-procbased-ctls2]\n" + " [--get-vmcs-entry-interruption-info]\n" + " [--set-vmcs-entry-interruption-info=<info>]\n" + " [--get-vmcs-guest-physical-address\n" + " [--get-vmcs-guest-linear-address\n" + " [--get-vmcs-host-pat]\n" + " [--get-vmcs-host-cr0]\n" + " [--get-vmcs-host-cr3]\n" + " [--get-vmcs-host-cr4]\n" + " [--get-vmcs-host-rip]\n" + " [--get-vmcs-host-rsp]\n" + " [--get-vmcs-cr0-mask]\n" + " [--get-vmcs-cr0-shadow]\n" + " [--get-vmcs-cr4-mask]\n" + " [--get-vmcs-cr4-shadow]\n" + " [--get-vmcs-cr3-targets]\n" + " [--get-vmcs-apic-access-address]\n" + " [--get-vmcs-virtual-apic-address]\n" + " [--get-vmcs-tpr-threshold]\n" + " [--get-vmcs-vpid]\n" + " [--get-vmcs-instruction-error]\n" + " [--get-vmcs-exit-ctls]\n" + " [--get-vmcs-entry-ctls]\n" + " [--get-vmcs-link]\n" + " [--get-vmcs-exit-qualification]\n" + " [--get-vmcs-exit-interruption-info]\n" + " [--get-vmcs-exit-interruption-error]\n" + " [--get-vmcs-interruptibility]\n" + ); + } else { + (void)fprintf(stderr, + " [--get-vmcb-intercepts]\n" + " [--get-vmcb-asid]\n" + " [--get-vmcb-exit-details]\n" + " [--get-vmcb-tlb-ctrl]\n" + " [--get-vmcb-virq]\n" + " [--get-avic-apic-bar]\n" + " [--get-avic-backing-page]\n" + " [--get-avic-table]\n" + ); + } exit(1); } -static int get_stats, getcap, setcap, capval; +static int get_rtc_time, set_rtc_time; +static int get_rtc_nvram, set_rtc_nvram; +static int rtc_nvram_offset; +static uint8_t rtc_nvram_value; +static time_t rtc_secs; + +static int get_stats, getcap, setcap, capval, get_gpa_pmap; +static int inject_nmi, assert_lapic_lvt; +static int force_reset, force_poweroff; static const 
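/*
 * A minimal sketch (assuming the cpu_intel flag threaded through above)
 * of the vendor split usage() now performs: the common flags print
 * unconditionally, then either the Intel VMCS set or the AMD VMCB set
 * is appended before exiting.
 */
#include <stdbool.h>
#include <stdio.h>

static void
usage_sketch(bool cpu_intel)
{
        /* flags shared by both vendors */
        (void) fprintf(stderr, "       [--get-all]\n");
        if (cpu_intel)
                (void) fprintf(stderr, "       [--get-vmcs-vpid]\n");
        else
                (void) fprintf(stderr, "       [--get-vmcb-asid]\n");
}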
char *capname; -static int create, destroy, get_lowmem, get_highmem; +static int create, destroy, get_memmap, get_memseg; +static int get_intinfo; +static int get_active_cpus, get_suspended_cpus; static uint64_t memsize; -static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4; +static int set_cr0, get_cr0, set_cr2, get_cr2, set_cr3, get_cr3; +static int set_cr4, get_cr4; static int set_efer, get_efer; +static int set_dr0, get_dr0; +static int set_dr1, get_dr1; +static int set_dr2, get_dr2; +static int set_dr3, get_dr3; +static int set_dr6, get_dr6; static int set_dr7, get_dr7; static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags; static int set_rax, get_rax; @@ -234,6 +304,16 @@ static int set_x2apic_state, get_x2apic_state; enum x2apic_state x2apic_state; static int unassign_pptdev, bus, slot, func; static int run; +static int get_cpu_topology; +#ifndef __FreeBSD__ +static int wrlock_cycle; +#endif + +/* + * VMCB specific. + */ +static int get_vmcb_intercept, get_vmcb_exit_details, get_vmcb_tlb_ctrl; +static int get_vmcb_virq, get_avic_table; /* * VMCS-specific fields @@ -250,14 +330,15 @@ static int get_cr4_mask, get_cr4_shadow; static int get_cr3_targets; static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold; static int get_msr_bitmap, get_msr_bitmap_address; -static int get_vpid, get_ple_gap, get_ple_window; +static int get_vpid_asid; static int get_inst_err, get_exit_ctls, get_entry_ctls; static int get_host_cr0, get_host_cr3, get_host_cr4; static int get_host_rip, get_host_rsp; static int get_guest_pat, get_host_pat; static int get_guest_sysenter, get_vmcs_link; -static int get_vmcs_exit_reason, get_vmcs_exit_qualification; +static int get_exit_reason, get_vmcs_exit_qualification; static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error; +static int get_vmcs_exit_inst_length; static uint64_t desc_base; static uint32_t desc_limit, desc_access; @@ -291,29 +372,115 @@ dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu) printf("\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type); printf("\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error); break; + case VM_EXITCODE_SVM: + printf("\treason\t\tSVM\n"); + printf("\texit_reason\t\t%#lx\n", vmexit->u.svm.exitcode); + printf("\texitinfo1\t\t%#lx\n", vmexit->u.svm.exitinfo1); + printf("\texitinfo2\t\t%#lx\n", vmexit->u.svm.exitinfo2); + break; default: printf("*** unknown vm run exitcode %d\n", vmexit->exitcode); break; } } -static int -dump_vmcs_msr_bitmap(int vcpu, u_long addr) +/* AMD 6th generation and Intel compatible MSRs */ +#define MSR_AMD6TH_START 0xC0000000 +#define MSR_AMD6TH_END 0xC0001FFF +/* AMD 7th and 8th generation compatible MSRs */ +#define MSR_AMD7TH_START 0xC0010000 +#define MSR_AMD7TH_END 0xC0011FFF + +static const char * +msr_name(uint32_t msr) { - int error, fd, byte, bit, readable, writeable; - u_int msr; - const char *bitmap; + static char buf[32]; + + switch(msr) { + case MSR_TSC: + return ("MSR_TSC"); + case MSR_EFER: + return ("MSR_EFER"); + case MSR_STAR: + return ("MSR_STAR"); + case MSR_LSTAR: + return ("MSR_LSTAR"); + case MSR_CSTAR: + return ("MSR_CSTAR"); + case MSR_SF_MASK: + return ("MSR_SF_MASK"); + case MSR_FSBASE: + return ("MSR_FSBASE"); + case MSR_GSBASE: + return ("MSR_GSBASE"); + case MSR_KGSBASE: + return ("MSR_KGSBASE"); + case MSR_SYSENTER_CS_MSR: + return ("MSR_SYSENTER_CS_MSR"); + case MSR_SYSENTER_ESP_MSR: + return ("MSR_SYSENTER_ESP_MSR"); + case MSR_SYSENTER_EIP_MSR: + return ("MSR_SYSENTER_EIP_MSR"); + case MSR_PAT: + return 
("MSR_PAT"); + } + snprintf(buf, sizeof(buf), "MSR %#08x", msr); + + return (buf); +} - error = -1; - bitmap = MAP_FAILED; +static inline void +print_msr_pm(uint64_t msr, int vcpu, int readable, int writeable) +{ - fd = open("/dev/mem", O_RDONLY, 0); - if (fd < 0) - goto done; + if (readable || writeable) { + printf("%-20s[%d]\t\t%c%c\n", msr_name(msr), vcpu, + readable ? 'R' : '-', writeable ? 'W' : '-'); + } +} - bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, 0, fd, addr); - if (bitmap == MAP_FAILED) - goto done; +/* + * Reference APM vol2, section 15.11 MSR Intercepts. + */ +static void +dump_amd_msr_pm(const char *bitmap, int vcpu) +{ + int byte, bit, readable, writeable; + uint32_t msr; + + for (msr = 0; msr < 0x2000; msr++) { + byte = msr / 4; + bit = (msr % 4) * 2; + + /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; + print_msr_pm(msr, vcpu, readable, writeable); + + /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ + byte += 2048; + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; + print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, + writeable); + + /* MSR 0xC0010000 to 0xC0011FF is only for AMD */ + byte += 4096; + readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; + writeable = (bitmap[byte] & (2 << bit)) ? 0 : 1; + print_msr_pm(msr + MSR_AMD7TH_START, vcpu, readable, + writeable); + } +} + +/* + * Reference Intel SDM Vol3 Section 24.6.9 MSR-Bitmap Address + */ +static void +dump_intel_msr_pm(const char *bitmap, int vcpu) +{ + int byte, bit, readable, writeable; + uint32_t msr; for (msr = 0; msr < 0x2000; msr++) { byte = msr / 8; @@ -321,31 +488,56 @@ dump_vmcs_msr_bitmap(int vcpu, u_long addr) /* Look at MSRs in the range 0x00000000 to 0x00001FFF */ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; - writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; - if (readable || writeable) { - printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu, - readable ? 'R' : '-', - writeable ? 'W' : '-'); - } + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; + print_msr_pm(msr, vcpu, readable, writeable); /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */ byte += 1024; readable = (bitmap[byte] & (1 << bit)) ? 0 : 1; - writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1; - if (readable || writeable) { - printf("msr 0x%08x[%d]\t\t%c%c\n", - 0xc0000000 + msr, vcpu, - readable ? 'R' : '-', - writeable ? 'W' : '-'); - } + writeable = (bitmap[2048 + byte] & (1 << bit)) ? 
0 : 1; + print_msr_pm(msr + MSR_AMD6TH_START, vcpu, readable, + writeable); + } +} + +static int +dump_msr_bitmap(int vcpu, uint64_t addr, bool cpu_intel) +{ + int error, fd, map_size; + const char *bitmap; + + error = -1; + bitmap = MAP_FAILED; + + fd = open("/dev/mem", O_RDONLY, 0); + if (fd < 0) { + perror("Couldn't open /dev/mem"); + goto done; } + if (cpu_intel) + map_size = PAGE_SIZE; + else + map_size = 2 * PAGE_SIZE; + + bitmap = mmap(NULL, map_size, PROT_READ, MAP_SHARED, fd, addr); + if (bitmap == MAP_FAILED) { + perror("mmap failed"); + goto done; + } + + if (cpu_intel) + dump_intel_msr_pm(bitmap, vcpu); + else + dump_amd_msr_pm(bitmap, vcpu); + error = 0; done: if (bitmap != MAP_FAILED) - munmap((void *)bitmap, PAGE_SIZE); + munmap((void *)bitmap, map_size); if (fd >= 0) close(fd); + return (error); } @@ -363,14 +555,36 @@ vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val) return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val)); } +static int +vm_get_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, + uint64_t *ret_val) +{ + + return (vm_get_register(ctx, vcpu, VMCB_ACCESS(off, bytes), ret_val)); +} + +static int +vm_set_vmcb_field(struct vmctx *ctx, int vcpu, int off, int bytes, + uint64_t val) +{ + + return (vm_set_register(ctx, vcpu, VMCB_ACCESS(off, bytes), val)); +} + enum { VMNAME = 1000, /* avoid collision with return values from getopt */ VCPU, SET_MEM, SET_EFER, SET_CR0, + SET_CR2, SET_CR3, SET_CR4, + SET_DR0, + SET_DR1, + SET_DR2, + SET_DR3, + SET_DR6, SET_DR7, SET_RSP, SET_RIP, @@ -388,492 +602,158 @@ enum { SET_TR, SET_LDTR, SET_X2APIC_STATE, - SET_VMCS_EXCEPTION_BITMAP, + SET_EXCEPTION_BITMAP, SET_VMCS_ENTRY_INTERRUPTION_INFO, SET_CAP, CAPNAME, UNASSIGN_PPTDEV, + GET_GPA_PMAP, + ASSERT_LAPIC_LVT, + SET_RTC_TIME, + SET_RTC_NVRAM, + RTC_NVRAM_OFFSET, }; -int -main(int argc, char *argv[]) +static void +print_cpus(const char *banner, const cpuset_t *cpus) { - char *vmname; - int error, ch, vcpu; - vm_paddr_t gpa; - size_t len; - struct vm_exit vmexit; - uint64_t ctl, eptp, bm, addr, u64; - struct vmctx *ctx; - int wired; - - uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat; - uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; - uint64_t r8, r9, r10, r11, r12, r13, r14, r15; - uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; - - struct option opts[] = { - { "vm", REQ_ARG, 0, VMNAME }, - { "cpu", REQ_ARG, 0, VCPU }, - { "set-mem", REQ_ARG, 0, SET_MEM }, - { "set-efer", REQ_ARG, 0, SET_EFER }, - { "set-cr0", REQ_ARG, 0, SET_CR0 }, - { "set-cr3", REQ_ARG, 0, SET_CR3 }, - { "set-cr4", REQ_ARG, 0, SET_CR4 }, - { "set-dr7", REQ_ARG, 0, SET_DR7 }, - { "set-rsp", REQ_ARG, 0, SET_RSP }, - { "set-rip", REQ_ARG, 0, SET_RIP }, - { "set-rax", REQ_ARG, 0, SET_RAX }, - { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, - { "desc-base", REQ_ARG, 0, DESC_BASE }, - { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, - { "desc-access",REQ_ARG, 0, DESC_ACCESS }, - { "set-cs", REQ_ARG, 0, SET_CS }, - { "set-ds", REQ_ARG, 0, SET_DS }, - { "set-es", REQ_ARG, 0, SET_ES }, - { "set-fs", REQ_ARG, 0, SET_FS }, - { "set-gs", REQ_ARG, 0, SET_GS }, - { "set-ss", REQ_ARG, 0, SET_SS }, - { "set-tr", REQ_ARG, 0, SET_TR }, - { "set-ldtr", REQ_ARG, 0, SET_LDTR }, - { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, - { "set-vmcs-exception-bitmap", - REQ_ARG, 0, SET_VMCS_EXCEPTION_BITMAP }, - { "set-vmcs-entry-interruption-info", - REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, - { "capname", REQ_ARG, 0, CAPNAME }, - { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, - { "setcap", REQ_ARG, 
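/*
 * The VMCB accessors just defined reuse the ordinary register ioctls by
 * encoding a (byte offset, width) pair into the register identifier via
 * VMCB_ACCESS(), the same way the VMCS helpers use VMCS_IDENT().  A short
 * sketch of a consumer, reading the guest ASID out of the control area;
 * it assumes this file's includes (VMCB_OFF_ASID comes from amd/vmcb.h):
 */
static int
show_asid(struct vmctx *ctx, int vcpu)
{
        uint64_t asid;
        int error;

        error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_ASID, 4, &asid);
        if (error == 0)
                printf("asid[%d]\t\t0x%04lx\n", vcpu, asid);
        return (error);
}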
0, SET_CAP }, - { "getcap", NO_ARG, &getcap, 1 }, - { "get-stats", NO_ARG, &get_stats, 1 }, - { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, - { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, - { "get-desc-es",NO_ARG, &get_desc_es, 1 }, - { "set-desc-es",NO_ARG, &set_desc_es, 1 }, - { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, - { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, - { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, - { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, - { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, - { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, - { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, - { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, - { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, - { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, - { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, - { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, - { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, - { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, - { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, - { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, - { "get-lowmem", NO_ARG, &get_lowmem, 1 }, - { "get-highmem",NO_ARG, &get_highmem, 1 }, - { "get-efer", NO_ARG, &get_efer, 1 }, - { "get-cr0", NO_ARG, &get_cr0, 1 }, - { "get-cr3", NO_ARG, &get_cr3, 1 }, - { "get-cr4", NO_ARG, &get_cr4, 1 }, - { "get-dr7", NO_ARG, &get_dr7, 1 }, - { "get-rsp", NO_ARG, &get_rsp, 1 }, - { "get-rip", NO_ARG, &get_rip, 1 }, - { "get-rax", NO_ARG, &get_rax, 1 }, - { "get-rbx", NO_ARG, &get_rbx, 1 }, - { "get-rcx", NO_ARG, &get_rcx, 1 }, - { "get-rdx", NO_ARG, &get_rdx, 1 }, - { "get-rsi", NO_ARG, &get_rsi, 1 }, - { "get-rdi", NO_ARG, &get_rdi, 1 }, - { "get-rbp", NO_ARG, &get_rbp, 1 }, - { "get-r8", NO_ARG, &get_r8, 1 }, - { "get-r9", NO_ARG, &get_r9, 1 }, - { "get-r10", NO_ARG, &get_r10, 1 }, - { "get-r11", NO_ARG, &get_r11, 1 }, - { "get-r12", NO_ARG, &get_r12, 1 }, - { "get-r13", NO_ARG, &get_r13, 1 }, - { "get-r14", NO_ARG, &get_r14, 1 }, - { "get-r15", NO_ARG, &get_r15, 1 }, - { "get-rflags", NO_ARG, &get_rflags, 1 }, - { "get-cs", NO_ARG, &get_cs, 1 }, - { "get-ds", NO_ARG, &get_ds, 1 }, - { "get-es", NO_ARG, &get_es, 1 }, - { "get-fs", NO_ARG, &get_fs, 1 }, - { "get-gs", NO_ARG, &get_gs, 1 }, - { "get-ss", NO_ARG, &get_ss, 1 }, - { "get-tr", NO_ARG, &get_tr, 1 }, - { "get-ldtr", NO_ARG, &get_ldtr, 1 }, - { "get-vmcs-pinbased-ctls", - NO_ARG, &get_pinbased_ctls, 1 }, - { "get-vmcs-procbased-ctls", - NO_ARG, &get_procbased_ctls, 1 }, - { "get-vmcs-procbased-ctls2", - NO_ARG, &get_procbased_ctls2, 1 }, - { "get-vmcs-guest-linear-address", - NO_ARG, &get_vmcs_gla, 1 }, - { "get-vmcs-guest-physical-address", - NO_ARG, &get_vmcs_gpa, 1 }, - { "get-vmcs-entry-interruption-info", - NO_ARG, &get_vmcs_entry_interruption_info, 1}, - { "get-vmcs-eptp", NO_ARG, &get_eptp, 1 }, - { "get-vmcs-exception-bitmap", - NO_ARG, &get_exception_bitmap, 1 }, - { "get-vmcs-io-bitmap-address", - NO_ARG, &get_io_bitmap, 1 }, - { "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 }, - { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, - { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, - { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, - { "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 }, - { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1}, - { "get-vmcs-apic-access-address", - NO_ARG, &get_apic_access_addr, 1}, - { "get-vmcs-virtual-apic-address", - NO_ARG, &get_virtual_apic_addr, 1}, - { "get-vmcs-tpr-threshold", - NO_ARG, &get_tpr_threshold, 1 }, - { "get-vmcs-msr-bitmap", - NO_ARG, &get_msr_bitmap, 1 }, - { "get-vmcs-msr-bitmap-address", - NO_ARG, 
&get_msr_bitmap_address, 1 }, - { "get-vmcs-vpid", NO_ARG, &get_vpid, 1 }, - { "get-vmcs-ple-gap", NO_ARG, &get_ple_gap, 1 }, - { "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 }, - { "get-vmcs-instruction-error", - NO_ARG, &get_inst_err, 1 }, - { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, - { "get-vmcs-entry-ctls", - NO_ARG, &get_entry_ctls, 1 }, - { "get-vmcs-guest-pat", NO_ARG, &get_guest_pat, 1 }, - { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, - { "get-vmcs-host-cr0", - NO_ARG, &get_host_cr0, 1 }, - { "get-vmcs-host-cr3", - NO_ARG, &get_host_cr3, 1 }, - { "get-vmcs-host-cr4", - NO_ARG, &get_host_cr4, 1 }, - { "get-vmcs-host-rip", - NO_ARG, &get_host_rip, 1 }, - { "get-vmcs-host-rsp", - NO_ARG, &get_host_rsp, 1 }, - { "get-vmcs-guest-sysenter", - NO_ARG, &get_guest_sysenter, 1 }, - { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, - { "get-vmcs-exit-reason", - NO_ARG, &get_vmcs_exit_reason, 1 }, - { "get-vmcs-exit-qualification", - NO_ARG, &get_vmcs_exit_qualification, 1 }, - { "get-vmcs-exit-interruption-info", - NO_ARG, &get_vmcs_exit_interruption_info, 1}, - { "get-vmcs-exit-interruption-error", - NO_ARG, &get_vmcs_exit_interruption_error, 1}, - { "get-vmcs-interruptibility", - NO_ARG, &get_vmcs_interruptibility, 1 }, - { "get-x2apic-state",NO_ARG, &get_x2apic_state, 1 }, - { "get-all", NO_ARG, &get_all, 1 }, - { "run", NO_ARG, &run, 1 }, - { "create", NO_ARG, &create, 1 }, - { "destroy", NO_ARG, &destroy, 1 }, - { NULL, 0, NULL, 0 } - }; - - vcpu = 0; - progname = basename(argv[0]); + int i; + int first; + + first = 1; + printf("%s:\t", banner); + if (!CPU_EMPTY(cpus)) { + for (i = 0; i < CPU_SETSIZE; i++) { + if (CPU_ISSET(i, cpus)) { + printf("%s%d", first ? " " : ", ", i); + first = 0; + } + } + } else + printf(" (none)"); + printf("\n"); +} - while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { - switch (ch) { - case 0: - break; - case VMNAME: - vmname = optarg; - break; - case VCPU: - vcpu = atoi(optarg); - break; - case SET_MEM: - memsize = atoi(optarg) * MB; - memsize = roundup(memsize, 2 * MB); - break; - case SET_EFER: - efer = strtoul(optarg, NULL, 0); - set_efer = 1; - break; - case SET_CR0: - cr0 = strtoul(optarg, NULL, 0); - set_cr0 = 1; - break; - case SET_CR3: - cr3 = strtoul(optarg, NULL, 0); - set_cr3 = 1; - break; - case SET_CR4: - cr4 = strtoul(optarg, NULL, 0); - set_cr4 = 1; - break; - case SET_DR7: - dr7 = strtoul(optarg, NULL, 0); - set_dr7 = 1; - break; - case SET_RSP: - rsp = strtoul(optarg, NULL, 0); - set_rsp = 1; - break; - case SET_RIP: - rip = strtoul(optarg, NULL, 0); - set_rip = 1; - break; - case SET_RAX: - rax = strtoul(optarg, NULL, 0); - set_rax = 1; - break; - case SET_RFLAGS: - rflags = strtoul(optarg, NULL, 0); - set_rflags = 1; - break; - case DESC_BASE: - desc_base = strtoul(optarg, NULL, 0); - break; - case DESC_LIMIT: - desc_limit = strtoul(optarg, NULL, 0); - break; - case DESC_ACCESS: - desc_access = strtoul(optarg, NULL, 0); - break; - case SET_CS: - cs = strtoul(optarg, NULL, 0); - set_cs = 1; - break; - case SET_DS: - ds = strtoul(optarg, NULL, 0); - set_ds = 1; - break; - case SET_ES: - es = strtoul(optarg, NULL, 0); - set_es = 1; - break; - case SET_FS: - fs = strtoul(optarg, NULL, 0); - set_fs = 1; - break; - case SET_GS: - gs = strtoul(optarg, NULL, 0); - set_gs = 1; - break; - case SET_SS: - ss = strtoul(optarg, NULL, 0); - set_ss = 1; - break; - case SET_TR: - tr = strtoul(optarg, NULL, 0); - set_tr = 1; - break; - case SET_LDTR: - ldtr = strtoul(optarg, NULL, 0); - set_ldtr = 1; - break; - case 
SET_X2APIC_STATE: - x2apic_state = strtol(optarg, NULL, 0); - set_x2apic_state = 1; - break; - case SET_VMCS_EXCEPTION_BITMAP: - exception_bitmap = strtoul(optarg, NULL, 0); - set_exception_bitmap = 1; - break; - case SET_VMCS_ENTRY_INTERRUPTION_INFO: - vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); - set_vmcs_entry_interruption_info = 1; - break; - case SET_CAP: - capval = strtoul(optarg, NULL, 0); - setcap = 1; +static void +print_intinfo(const char *banner, uint64_t info) +{ + int type; + + printf("%s:\t", banner); + if (info & VM_INTINFO_VALID) { + type = info & VM_INTINFO_TYPE; + switch (type) { + case VM_INTINFO_HWINTR: + printf("extint"); break; - case CAPNAME: - capname = optarg; + case VM_INTINFO_NMI: + printf("nmi"); break; - case UNASSIGN_PPTDEV: - unassign_pptdev = 1; - if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) - usage(); + case VM_INTINFO_SWINTR: + printf("swint"); break; default: - usage(); + printf("exception"); + break; } + printf(" vector %d", (int)VM_INTINFO_VECTOR(info)); + if (info & VM_INTINFO_DEL_ERRCODE) + printf(" errcode %#x", (u_int)(info >> 32)); + } else { + printf("n/a"); } - argc -= optind; - argv += optind; - - if (vmname == NULL) - usage(); - - error = 0; - - if (!error && create) - error = vm_create(vmname); - - if (!error) { - ctx = vm_open(vmname); - if (ctx == NULL) - error = -1; - } - - if (!error && memsize) - error = vm_setup_memory(ctx, memsize, VM_MMAP_NONE); - - if (!error && set_efer) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); - - if (!error && set_cr0) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); - - if (!error && set_cr3) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); - - if (!error && set_cr4) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); - - if (!error && set_dr7) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); - - if (!error && set_rsp) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); - - if (!error && set_rip) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); - - if (!error && set_rax) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); - - if (!error && set_rflags) { - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, - rflags); - } - - if (!error && set_desc_ds) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_es) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_ss) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_cs) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_fs) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, - desc_base, desc_limit, desc_access); - } - - if (!error && set_desc_gs) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, - desc_base, desc_limit, desc_access); - } + printf("\n"); +} - if (!error && set_desc_tr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, - desc_base, desc_limit, desc_access); +static bool +cpu_vendor_intel(void) +{ + u_int regs[4]; + char cpu_vendor[13]; + + do_cpuid(0, regs); + ((u_int *)&cpu_vendor)[0] = regs[1]; + ((u_int *)&cpu_vendor)[1] = regs[3]; + ((u_int *)&cpu_vendor)[2] = regs[2]; + cpu_vendor[12] = '\0'; + + if (strcmp(cpu_vendor, "AuthenticAMD") == 0) { + return (false); + } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) { + return (true); + } else { + 
fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor); + exit(1); } +} - if (!error && set_desc_ldtr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, - desc_base, desc_limit, desc_access); - } +static int +get_all_registers(struct vmctx *ctx, int vcpu) +{ + uint64_t cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; + uint64_t rsp, rip, rflags, efer; + uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp; + uint64_t r8, r9, r10, r11, r12, r13, r14, r15; + int error = 0; - if (!error && set_desc_gdtr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, - desc_base, desc_limit, 0); + if (!error && (get_efer || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); + if (error == 0) + printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); } - if (!error && set_desc_idtr) { - error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, - desc_base, desc_limit, 0); + if (!error && (get_cr0 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); + if (error == 0) + printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); } - if (!error && set_cs) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); - - if (!error && set_ds) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); - - if (!error && set_es) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); - - if (!error && set_fs) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); - - if (!error && set_gs) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); - - if (!error && set_ss) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); - - if (!error && set_tr) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); - - if (!error && set_ldtr) - error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); - - if (!error && set_x2apic_state) - error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); - -#ifdef __FreeBSD__ - if (!error && unassign_pptdev) - error = vm_unassign_pptdev(ctx, bus, slot, func); -#endif - - if (!error && set_exception_bitmap) { - error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, - exception_bitmap); + if (!error && (get_cr2 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR2, &cr2); + if (error == 0) + printf("cr2[%d]\t\t0x%016lx\n", vcpu, cr2); } - if (!error && set_vmcs_entry_interruption_info) { - error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, - vmcs_entry_interruption_info); + if (!error && (get_cr3 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); + if (error == 0) + printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); } - if (!error && (get_lowmem || get_all)) { - gpa = 0; - error = vm_get_memory_seg(ctx, gpa, &len, &wired); + if (!error && (get_cr4 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); if (error == 0) - printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len, - wired ? " wired" : ""); + printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); } - if (!error && (get_highmem || get_all)) { - gpa = 4 * GB; - error = vm_get_memory_seg(ctx, gpa, &len, &wired); + if (!error && (get_dr0 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR0, &dr0); if (error == 0) - printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len, - wired ? 
" wired" : ""); + printf("dr0[%d]\t\t0x%016lx\n", vcpu, dr0); } - if (!error && (get_efer || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer); + if (!error && (get_dr1 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR1, &dr1); if (error == 0) - printf("efer[%d]\t\t0x%016lx\n", vcpu, efer); + printf("dr1[%d]\t\t0x%016lx\n", vcpu, dr1); } - if (!error && (get_cr0 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0); + if (!error && (get_dr2 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR2, &dr2); if (error == 0) - printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0); + printf("dr2[%d]\t\t0x%016lx\n", vcpu, dr2); } - if (!error && (get_cr3 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3); + if (!error && (get_dr3 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR3, &dr3); if (error == 0) - printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3); + printf("dr3[%d]\t\t0x%016lx\n", vcpu, dr3); } - if (!error && (get_cr4 || get_all)) { - error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4); + if (!error && (get_dr6 || get_all)) { + error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR6, &dr6); if (error == 0) - printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4); + printf("dr6[%d]\t\t0x%016lx\n", vcpu, dr6); } if (!error && (get_dr7 || get_all)) { @@ -991,30 +871,21 @@ main(int argc, char *argv[]) printf("rflags[%d]\t0x%016lx\n", vcpu, rflags); } -#ifdef __FreeBSD__ - if (!error && (get_stats || get_all)) { - int i, num_stats; - uint64_t *stats; - struct timeval tv; - const char *desc; + return (error); +} - stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); - if (stats != NULL) { - printf("vcpu%d\n", vcpu); - for (i = 0; i < num_stats; i++) { - desc = vm_get_stat_desc(ctx, i); - printf("%-40s\t%ld\n", desc, stats[i]); - } - } - } -#endif +static int +get_all_segments(struct vmctx *ctx, int vcpu) +{ + uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + int error = 0; if (!error && (get_desc_ds || get_all)) { error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS, - &desc_base, &desc_limit, &desc_access); + &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1023,7 +894,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1032,7 +903,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1041,7 +912,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1050,7 +921,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1059,7 +930,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, 
desc_access); } } @@ -1068,7 +939,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1077,7 +948,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n", - vcpu, desc_base, desc_limit, desc_access); + vcpu, desc_base, desc_limit, desc_access); } } @@ -1086,7 +957,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("gdtr[%d]\t\t0x%016lx/0x%08x\n", - vcpu, desc_base, desc_limit); + vcpu, desc_base, desc_limit); } } @@ -1095,7 +966,7 @@ main(int argc, char *argv[]) &desc_base, &desc_limit, &desc_access); if (error == 0) { printf("idtr[%d]\t\t0x%016lx/0x%08x\n", - vcpu, desc_base, desc_limit); + vcpu, desc_base, desc_limit); } } @@ -1147,82 +1018,14 @@ main(int argc, char *argv[]) printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr); } - if (!error && (get_x2apic_state || get_all)) { - error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); - if (error == 0) - printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state); - } - - if (!error && (get_pinbased_ctls || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); - if (error == 0) - printf("pinbased_ctls[%d]\t0x%08lx\n", vcpu, ctl); - } - - if (!error && (get_procbased_ctls || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_PRI_PROC_BASED_CTLS, &ctl); - if (error == 0) - printf("procbased_ctls[%d]\t0x%08lx\n", vcpu, ctl); - } - - if (!error && (get_procbased_ctls2 || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_SEC_PROC_BASED_CTLS, &ctl); - if (error == 0) - printf("procbased_ctls2[%d]\t0x%08lx\n", vcpu, ctl); - } - - if (!error && (get_vmcs_gla || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_LINEAR_ADDRESS, &u64); - if (error == 0) - printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); - } - - if (!error && (get_vmcs_gpa || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_PHYSICAL_ADDRESS, &u64); - if (error == 0) - printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); - } - - if (!error && (get_vmcs_entry_interruption_info || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); - if (error == 0) { - printf("entry_interruption_info[%d]\t0x%08lx\n", - vcpu, u64); - } - } - - if (!error && (get_eptp || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); - if (error == 0) - printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp); - } - - if (!error && (get_exception_bitmap || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP, - &bm); - if (error == 0) - printf("exception_bitmap[%d]\t0x%08lx\n", vcpu, bm); - } - - if (!error && (get_io_bitmap || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm); - if (error == 0) - printf("io_bitmap_a[%d]\t0x%08lx\n", vcpu, bm); - error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm); - if (error == 0) - printf("io_bitmap_b[%d]\t0x%08lx\n", vcpu, bm); - } + return (error); +} - if (!error && (get_tsc_offset || get_all)) { - uint64_t tscoff; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff); - if (error == 0) - printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); - } +static int +get_misc_vmcs(struct vmctx *ctx, int vcpu) +{ + uint64_t ctl, cr0, cr3, cr4, rsp, rip, pat, addr, u64; + int error = 0; if (!error && (get_cr0_mask || get_all)) { uint64_t 
cr0mask; @@ -1259,7 +1062,7 @@ main(int argc, char *argv[]) error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT, &target_count); if (error == 0) { - printf("cr3_target_count[%d]\t0x%08lx\n", + printf("cr3_target_count[%d]\t0x%016lx\n", vcpu, target_count); } @@ -1292,57 +1095,55 @@ main(int argc, char *argv[]) } } - if (!error && (get_apic_access_addr || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr); + if (!error && (get_pinbased_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl); if (error == 0) - printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr); + printf("pinbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); } - if (!error && (get_virtual_apic_addr || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr); + if (!error && (get_procbased_ctls || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_PRI_PROC_BASED_CTLS, &ctl); if (error == 0) - printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr); + printf("procbased_ctls[%d]\t0x%016lx\n", vcpu, ctl); } - if (!error && (get_tpr_threshold || get_all)) { - uint64_t threshold; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, - &threshold); + if (!error && (get_procbased_ctls2 || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_SEC_PROC_BASED_CTLS, &ctl); if (error == 0) - printf("tpr_threshold[%d]\t0x%08lx\n", vcpu, threshold); + printf("procbased_ctls2[%d]\t0x%016lx\n", vcpu, ctl); } - if (!error && (get_msr_bitmap_address || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); + if (!error && (get_vmcs_gla || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_LINEAR_ADDRESS, &u64); if (error == 0) - printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr); + printf("gla[%d]\t\t0x%016lx\n", vcpu, u64); } - if (!error && (get_msr_bitmap || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr); + if (!error && (get_vmcs_gpa || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_PHYSICAL_ADDRESS, &u64); if (error == 0) - error = dump_vmcs_msr_bitmap(vcpu, addr); + printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64); } - if (!error && (get_vpid || get_all)) { - uint64_t vpid; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); - if (error == 0) - printf("vpid[%d]\t\t0x%04lx\n", vcpu, vpid); - } - - if (!error && (get_ple_window || get_all)) { - uint64_t window; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window); - if (error == 0) - printf("ple_window[%d]\t\t0x%08lx\n", vcpu, window); + if (!error && (get_vmcs_entry_interruption_info || + get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64); + if (error == 0) { + printf("entry_interruption_info[%d]\t0x%016lx\n", + vcpu, u64); + } } - if (!error && (get_ple_gap || get_all)) { - uint64_t gap; - error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap); + if (!error && (get_tpr_threshold || get_all)) { + uint64_t threshold; + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD, + &threshold); if (error == 0) - printf("ple_gap[%d]\t\t0x%08lx\n", vcpu, gap); + printf("tpr_threshold[%d]\t0x%016lx\n", vcpu, threshold); } if (!error && (get_inst_err || get_all)) { @@ -1350,7 +1151,7 @@ main(int argc, char *argv[]) error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR, &insterr); if (error == 0) { - printf("instruction_error[%d]\t0x%08lx\n", + printf("instruction_error[%d]\t0x%016lx\n", vcpu, insterr); } } @@ -1358,13 +1159,13 @@ main(int argc, char *argv[]) if (!error && 
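/*
 * A sketch of how one of these relocated VMCS reads bottoms out: the
 * 32-bit field encoding from intel/vmcs.h is wrapped by VMCS_IDENT()
 * into a pseudo-register id that the generic get-register ioctl
 * understands.  This mirrors the removed get-vmcs-vpid path, now routed
 * through the shared get_vpid_asid flag:
 */
static int
show_vpid(struct vmctx *ctx, int vcpu)
{
        uint64_t vpid;
        int error;

        error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid);
        if (error == 0)
                printf("vpid[%d]\t\t0x%04lx\n", vcpu, vpid);
        return (error);
}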
(get_exit_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl); if (error == 0) - printf("exit_ctls[%d]\t\t0x%08lx\n", vcpu, ctl); + printf("exit_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); } if (!error && (get_entry_ctls || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl); if (error == 0) - printf("entry_ctls[%d]\t\t0x%08lx\n", vcpu, ctl); + printf("entry_ctls[%d]\t\t0x%016lx\n", vcpu, ctl); } if (!error && (get_host_pat || get_all)) { @@ -1373,12 +1174,6 @@ main(int argc, char *argv[]) printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat); } - if (!error && (get_guest_pat || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat); - if (error == 0) - printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); - } - if (!error && (get_host_cr0 || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0); if (error == 0) @@ -1409,55 +1204,25 @@ main(int argc, char *argv[]) printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp); } - if (!error && (get_guest_sysenter || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_IA32_SYSENTER_CS, &cs); - if (error == 0) - printf("guest_sysenter_cs[%d]\t0x%08lx\n", vcpu, cs); - - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_IA32_SYSENTER_ESP, &rsp); - if (error == 0) - printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp); - error = vm_get_vmcs_field(ctx, vcpu, - VMCS_GUEST_IA32_SYSENTER_EIP, &rip); - if (error == 0) - printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip); - } - if (!error && (get_vmcs_link || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr); if (error == 0) printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr); } - if (!error && (get_vmcs_exit_reason || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64); - if (error == 0) - printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64); - } - - if (!error && (get_vmcs_exit_qualification || get_all)) { - error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, - &u64); - if (error == 0) - printf("vmcs_exit_qualification[%d]\t0x%016lx\n", - vcpu, u64); - } - if (!error && (get_vmcs_exit_interruption_info || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_INFO, &u64); if (error == 0) { - printf("vmcs_exit_interruption_info[%d]\t0x%08lx\n", + printf("vmcs_exit_interruption_info[%d]\t0x%016lx\n", vcpu, u64); } } if (!error && (get_vmcs_exit_interruption_error || get_all)) { error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_ERRCODE, - &u64); + &u64); if (error == 0) { - printf("vmcs_exit_interruption_error[%d]\t0x%08lx\n", + printf("vmcs_exit_interruption_error[%d]\t0x%016lx\n", vcpu, u64); } } @@ -1466,9 +1231,1014 @@ main(int argc, char *argv[]) error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_INTERRUPTIBILITY, &u64); if (error == 0) { - printf("vmcs_guest_interruptibility[%d]\t0x%08lx\n", + printf("vmcs_guest_interruptibility[%d]\t0x%016lx\n", + vcpu, u64); + } + } + + if (!error && (get_vmcs_exit_inst_length || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_EXIT_INSTRUCTION_LENGTH, &u64); + if (error == 0) + printf("vmcs_exit_inst_length[%d]\t0x%08x\n", vcpu, + (uint32_t)u64); + } + + if (!error && (get_vmcs_exit_qualification || get_all)) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION, + &u64); + if (error == 0) + printf("vmcs_exit_qualification[%d]\t0x%016lx\n", vcpu, u64); + } + + return (error); +} + +static int +get_misc_vmcb(struct vmctx *ctx, int vcpu) +{ + uint64_t ctl, addr; + int error = 0; + + if (!error && 
(get_vmcb_intercept || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_CR_INTERCEPT, 4, + &ctl); + if (error == 0) + printf("cr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_DR_INTERCEPT, 4, + &ctl); + if (error == 0) + printf("dr_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXC_INTERCEPT, 4, + &ctl); + if (error == 0) + printf("exc_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST1_INTERCEPT, + 4, &ctl); + if (error == 0) + printf("inst1_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_INST2_INTERCEPT, + 4, &ctl); + if (error == 0) + printf("inst2_intercept[%d]\t0x%08x\n", vcpu, (int)ctl); + } + + if (!error && (get_vmcb_tlb_ctrl || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_TLB_CTRL, + 4, &ctl); + if (error == 0) + printf("TLB ctrl[%d]\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_vmcb_exit_details || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO1, + 8, &ctl); + if (error == 0) + printf("exitinfo1[%d]\t0x%016lx\n", vcpu, ctl); + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINFO2, + 8, &ctl); + if (error == 0) + printf("exitinfo2[%d]\t0x%016lx\n", vcpu, ctl); + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_EXITINTINFO, + 8, &ctl); + if (error == 0) + printf("exitintinfo[%d]\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_vmcb_virq || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_VIRQ, + 8, &ctl); + if (error == 0) + printf("v_irq/tpr[%d]\t0x%016lx\n", vcpu, ctl); + } + + if (!error && (get_apic_access_addr || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_BAR, 8, + &addr); + if (error == 0) + printf("AVIC apic_bar[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_virtual_apic_addr || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PAGE, 8, + &addr); + if (error == 0) + printf("AVIC backing page[%d]\t0x%016lx\n", vcpu, addr); + } + + if (!error && (get_avic_table || get_all)) { + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_LT, 8, + &addr); + if (error == 0) + printf("AVIC logical table[%d]\t0x%016lx\n", + vcpu, addr); + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_AVIC_PT, 8, + &addr); + if (error == 0) + printf("AVIC physical table[%d]\t0x%016lx\n", + vcpu, addr); + } + + return (error); +} + +static struct option * +setup_options(bool cpu_intel) +{ + const struct option common_opts[] = { + { "vm", REQ_ARG, 0, VMNAME }, + { "cpu", REQ_ARG, 0, VCPU }, + { "set-mem", REQ_ARG, 0, SET_MEM }, + { "set-efer", REQ_ARG, 0, SET_EFER }, + { "set-cr0", REQ_ARG, 0, SET_CR0 }, + { "set-cr2", REQ_ARG, 0, SET_CR2 }, + { "set-cr3", REQ_ARG, 0, SET_CR3 }, + { "set-cr4", REQ_ARG, 0, SET_CR4 }, + { "set-dr0", REQ_ARG, 0, SET_DR0 }, + { "set-dr1", REQ_ARG, 0, SET_DR1 }, + { "set-dr2", REQ_ARG, 0, SET_DR2 }, + { "set-dr3", REQ_ARG, 0, SET_DR3 }, + { "set-dr6", REQ_ARG, 0, SET_DR6 }, + { "set-dr7", REQ_ARG, 0, SET_DR7 }, + { "set-rsp", REQ_ARG, 0, SET_RSP }, + { "set-rip", REQ_ARG, 0, SET_RIP }, + { "set-rax", REQ_ARG, 0, SET_RAX }, + { "set-rflags", REQ_ARG, 0, SET_RFLAGS }, + { "desc-base", REQ_ARG, 0, DESC_BASE }, + { "desc-limit", REQ_ARG, 0, DESC_LIMIT }, + { "desc-access",REQ_ARG, 0, DESC_ACCESS }, + { "set-cs", REQ_ARG, 0, SET_CS }, + { "set-ds", REQ_ARG, 0, SET_DS }, + { "set-es", REQ_ARG, 0, SET_ES }, + { "set-fs", REQ_ARG, 0, SET_FS }, + { "set-gs", REQ_ARG, 0, SET_GS }, + { "set-ss", 
REQ_ARG, 0, SET_SS }, + { "set-tr", REQ_ARG, 0, SET_TR }, + { "set-ldtr", REQ_ARG, 0, SET_LDTR }, + { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE }, + { "set-exception-bitmap", + REQ_ARG, 0, SET_EXCEPTION_BITMAP }, + { "capname", REQ_ARG, 0, CAPNAME }, + { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV }, + { "setcap", REQ_ARG, 0, SET_CAP }, + { "get-gpa-pmap", REQ_ARG, 0, GET_GPA_PMAP }, + { "assert-lapic-lvt", REQ_ARG, 0, ASSERT_LAPIC_LVT }, + { "get-rtc-time", NO_ARG, &get_rtc_time, 1 }, + { "set-rtc-time", REQ_ARG, 0, SET_RTC_TIME }, + { "rtc-nvram-offset", REQ_ARG, 0, RTC_NVRAM_OFFSET }, + { "get-rtc-nvram", NO_ARG, &get_rtc_nvram, 1 }, + { "set-rtc-nvram", REQ_ARG, 0, SET_RTC_NVRAM }, + { "getcap", NO_ARG, &getcap, 1 }, + { "get-stats", NO_ARG, &get_stats, 1 }, + { "get-desc-ds",NO_ARG, &get_desc_ds, 1 }, + { "set-desc-ds",NO_ARG, &set_desc_ds, 1 }, + { "get-desc-es",NO_ARG, &get_desc_es, 1 }, + { "set-desc-es",NO_ARG, &set_desc_es, 1 }, + { "get-desc-ss",NO_ARG, &get_desc_ss, 1 }, + { "set-desc-ss",NO_ARG, &set_desc_ss, 1 }, + { "get-desc-cs",NO_ARG, &get_desc_cs, 1 }, + { "set-desc-cs",NO_ARG, &set_desc_cs, 1 }, + { "get-desc-fs",NO_ARG, &get_desc_fs, 1 }, + { "set-desc-fs",NO_ARG, &set_desc_fs, 1 }, + { "get-desc-gs",NO_ARG, &get_desc_gs, 1 }, + { "set-desc-gs",NO_ARG, &set_desc_gs, 1 }, + { "get-desc-tr",NO_ARG, &get_desc_tr, 1 }, + { "set-desc-tr",NO_ARG, &set_desc_tr, 1 }, + { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 }, + { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 }, + { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 }, + { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 }, + { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 }, + { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 }, + { "get-memmap", NO_ARG, &get_memmap, 1 }, + { "get-memseg", NO_ARG, &get_memseg, 1 }, + { "get-efer", NO_ARG, &get_efer, 1 }, + { "get-cr0", NO_ARG, &get_cr0, 1 }, + { "get-cr2", NO_ARG, &get_cr2, 1 }, + { "get-cr3", NO_ARG, &get_cr3, 1 }, + { "get-cr4", NO_ARG, &get_cr4, 1 }, + { "get-dr0", NO_ARG, &get_dr0, 1 }, + { "get-dr1", NO_ARG, &get_dr1, 1 }, + { "get-dr2", NO_ARG, &get_dr2, 1 }, + { "get-dr3", NO_ARG, &get_dr3, 1 }, + { "get-dr6", NO_ARG, &get_dr6, 1 }, + { "get-dr7", NO_ARG, &get_dr7, 1 }, + { "get-rsp", NO_ARG, &get_rsp, 1 }, + { "get-rip", NO_ARG, &get_rip, 1 }, + { "get-rax", NO_ARG, &get_rax, 1 }, + { "get-rbx", NO_ARG, &get_rbx, 1 }, + { "get-rcx", NO_ARG, &get_rcx, 1 }, + { "get-rdx", NO_ARG, &get_rdx, 1 }, + { "get-rsi", NO_ARG, &get_rsi, 1 }, + { "get-rdi", NO_ARG, &get_rdi, 1 }, + { "get-rbp", NO_ARG, &get_rbp, 1 }, + { "get-r8", NO_ARG, &get_r8, 1 }, + { "get-r9", NO_ARG, &get_r9, 1 }, + { "get-r10", NO_ARG, &get_r10, 1 }, + { "get-r11", NO_ARG, &get_r11, 1 }, + { "get-r12", NO_ARG, &get_r12, 1 }, + { "get-r13", NO_ARG, &get_r13, 1 }, + { "get-r14", NO_ARG, &get_r14, 1 }, + { "get-r15", NO_ARG, &get_r15, 1 }, + { "get-rflags", NO_ARG, &get_rflags, 1 }, + { "get-cs", NO_ARG, &get_cs, 1 }, + { "get-ds", NO_ARG, &get_ds, 1 }, + { "get-es", NO_ARG, &get_es, 1 }, + { "get-fs", NO_ARG, &get_fs, 1 }, + { "get-gs", NO_ARG, &get_gs, 1 }, + { "get-ss", NO_ARG, &get_ss, 1 }, + { "get-tr", NO_ARG, &get_tr, 1 }, + { "get-ldtr", NO_ARG, &get_ldtr, 1 }, + { "get-eptp", NO_ARG, &get_eptp, 1 }, + { "get-exception-bitmap", + NO_ARG, &get_exception_bitmap, 1 }, + { "get-io-bitmap-address", + NO_ARG, &get_io_bitmap, 1 }, + { "get-tsc-offset", NO_ARG, &get_tsc_offset, 1 }, + { "get-msr-bitmap", + NO_ARG, &get_msr_bitmap, 1 }, + { "get-msr-bitmap-address", + NO_ARG, &get_msr_bitmap_address, 1 }, + { 
"get-guest-pat", NO_ARG, &get_guest_pat, 1 }, + { "get-guest-sysenter", + NO_ARG, &get_guest_sysenter, 1 }, + { "get-exit-reason", + NO_ARG, &get_exit_reason, 1 }, + { "get-x2apic-state", NO_ARG, &get_x2apic_state, 1 }, + { "get-all", NO_ARG, &get_all, 1 }, + { "run", NO_ARG, &run, 1 }, + { "create", NO_ARG, &create, 1 }, + { "destroy", NO_ARG, &destroy, 1 }, + { "inject-nmi", NO_ARG, &inject_nmi, 1 }, + { "force-reset", NO_ARG, &force_reset, 1 }, + { "force-poweroff", NO_ARG, &force_poweroff, 1 }, + { "get-active-cpus", NO_ARG, &get_active_cpus, 1 }, + { "get-suspended-cpus", NO_ARG, &get_suspended_cpus, 1 }, + { "get-intinfo", NO_ARG, &get_intinfo, 1 }, + { "get-cpu-topology", NO_ARG, &get_cpu_topology, 1 }, +#ifndef __FreeBSD__ + { "wrlock-cycle", NO_ARG, &wrlock_cycle, 1 }, +#endif + }; + + const struct option intel_opts[] = { + { "get-vmcs-pinbased-ctls", + NO_ARG, &get_pinbased_ctls, 1 }, + { "get-vmcs-procbased-ctls", + NO_ARG, &get_procbased_ctls, 1 }, + { "get-vmcs-procbased-ctls2", + NO_ARG, &get_procbased_ctls2, 1 }, + { "get-vmcs-guest-linear-address", + NO_ARG, &get_vmcs_gla, 1 }, + { "get-vmcs-guest-physical-address", + NO_ARG, &get_vmcs_gpa, 1 }, + { "get-vmcs-entry-interruption-info", + NO_ARG, &get_vmcs_entry_interruption_info, 1}, + { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 }, + { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 }, + { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 }, + { "get-vmcs-cr4-shadow", NO_ARG, &get_cr4_shadow, 1 }, + { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1 }, + { "get-vmcs-tpr-threshold", + NO_ARG, &get_tpr_threshold, 1 }, + { "get-vmcs-vpid", NO_ARG, &get_vpid_asid, 1 }, + { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 }, + { "get-vmcs-entry-ctls", + NO_ARG, &get_entry_ctls, 1 }, + { "get-vmcs-instruction-error", + NO_ARG, &get_inst_err, 1 }, + { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 }, + { "get-vmcs-host-cr0", + NO_ARG, &get_host_cr0, 1 }, + { "set-vmcs-entry-interruption-info", + REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO }, + { "get-vmcs-exit-qualification", + NO_ARG, &get_vmcs_exit_qualification, 1 }, + { "get-vmcs-exit-inst-length", + NO_ARG, &get_vmcs_exit_inst_length, 1 }, + { "get-vmcs-interruptibility", + NO_ARG, &get_vmcs_interruptibility, 1 }, + { "get-vmcs-exit-interruption-error", + NO_ARG, &get_vmcs_exit_interruption_error, 1 }, + { "get-vmcs-exit-interruption-info", + NO_ARG, &get_vmcs_exit_interruption_info, 1 }, + { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 }, + { "get-vmcs-host-cr3", + NO_ARG, &get_host_cr3, 1 }, + { "get-vmcs-host-cr4", + NO_ARG, &get_host_cr4, 1 }, + { "get-vmcs-host-rip", + NO_ARG, &get_host_rip, 1 }, + { "get-vmcs-host-rsp", + NO_ARG, &get_host_rsp, 1 }, + { "get-apic-access-address", + NO_ARG, &get_apic_access_addr, 1}, + { "get-virtual-apic-address", + NO_ARG, &get_virtual_apic_addr, 1} + }; + + const struct option amd_opts[] = { + { "get-vmcb-intercepts", + NO_ARG, &get_vmcb_intercept, 1 }, + { "get-vmcb-asid", + NO_ARG, &get_vpid_asid, 1 }, + { "get-vmcb-exit-details", + NO_ARG, &get_vmcb_exit_details, 1 }, + { "get-vmcb-tlb-ctrl", + NO_ARG, &get_vmcb_tlb_ctrl, 1 }, + { "get-vmcb-virq", + NO_ARG, &get_vmcb_virq, 1 }, + { "get-avic-apic-bar", + NO_ARG, &get_apic_access_addr, 1 }, + { "get-avic-backing-page", + NO_ARG, &get_virtual_apic_addr, 1 }, + { "get-avic-table", + NO_ARG, &get_avic_table, 1 } + }; + + const struct option null_opt = { + NULL, 0, NULL, 0 + }; + + struct option *all_opts; + char *cp; + int optlen; + + optlen = sizeof(common_opts); + + if (cpu_intel) + 
optlen += sizeof(intel_opts); + else + optlen += sizeof(amd_opts); + + optlen += sizeof(null_opt); + + all_opts = malloc(optlen); + + cp = (char *)all_opts; + memcpy(cp, common_opts, sizeof(common_opts)); + cp += sizeof(common_opts); + + if (cpu_intel) { + memcpy(cp, intel_opts, sizeof(intel_opts)); + cp += sizeof(intel_opts); + } else { + memcpy(cp, amd_opts, sizeof(amd_opts)); + cp += sizeof(amd_opts); + } + + memcpy(cp, &null_opt, sizeof(null_opt)); + cp += sizeof(null_opt); + + return (all_opts); +} + +static const char * +wday_str(int idx) +{ + static const char *weekdays[] = { + "Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat" + }; + + if (idx >= 0 && idx < 7) + return (weekdays[idx]); + else + return ("UNK"); +} + +static const char * +mon_str(int idx) +{ + static const char *months[] = { + "Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec" + }; + + if (idx >= 0 && idx < 12) + return (months[idx]); + else + return ("UNK"); +} + +static int +show_memmap(struct vmctx *ctx) +{ + char name[SPECNAMELEN + 1], numbuf[8]; + vm_ooffset_t segoff; + vm_paddr_t gpa; + size_t maplen, seglen; + int error, flags, prot, segid, delim; + + printf("Address Length Segment Offset "); + printf("Prot Flags\n"); + + gpa = 0; + while (1) { + error = vm_mmap_getnext(ctx, &gpa, &segid, &segoff, &maplen, + &prot, &flags); + if (error) + return (errno == ENOENT ? 0 : error); + + error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); + if (error) + return (error); + + printf("%-12lX", gpa); + humanize_number(numbuf, sizeof(numbuf), maplen, "B", + HN_AUTOSCALE, HN_NOSPACE); + printf("%-12s", numbuf); + + printf("%-12s", name[0] ? name : "sysmem"); + printf("%-12lX", segoff); + printf("%c%c%c ", prot & PROT_READ ? 'R' : '-', + prot & PROT_WRITE ? 'W' : '-', + prot & PROT_EXEC ? 'X' : '-'); + + delim = '\0'; + if (flags & VM_MEMMAP_F_WIRED) { + printf("%cwired", delim); + delim = '/'; + } + if (flags & VM_MEMMAP_F_IOMMU) { + printf("%ciommu", delim); + delim = '/'; } + printf("\n"); + + gpa += maplen; + } +} + +static int +show_memseg(struct vmctx *ctx) +{ + char name[SPECNAMELEN + 1], numbuf[8]; + size_t seglen; + int error, segid; + + printf("ID Length Name\n"); + + segid = 0; + while (1) { + error = vm_get_memseg(ctx, segid, &seglen, name, sizeof(name)); + if (error) + return (errno == EINVAL ? 0 : error); + + if (seglen) { + printf("%-4d", segid); + humanize_number(numbuf, sizeof(numbuf), seglen, "B", + HN_AUTOSCALE, HN_NOSPACE); + printf("%-12s", numbuf); + printf("%s", name[0] ? 
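/*
 * setup_options() above sizes one allocation for the common table, the
 * vendor-specific table, and the all-zero entry getopt_long(3) uses as
 * its terminator, then memcpy()s the pieces in order.  The same layout
 * in a condensed, self-contained form (caller frees the result):
 */
#include <getopt.h>
#include <stdlib.h>
#include <string.h>

static struct option *
concat_opts(const struct option *a, size_t na,
    const struct option *b, size_t nb)
{
        /* +1 leaves room for the zeroed terminator entry */
        struct option *all = calloc(na + nb + 1, sizeof (struct option));

        if (all != NULL) {
                memcpy(all, a, na * sizeof (struct option));
                memcpy(all + na, b, nb * sizeof (struct option));
        }
        return (all);   /* calloc already zeroed the final entry */
}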
name : "sysmem"); + printf("\n"); + } + segid++; + } +} + +int +main(int argc, char *argv[]) +{ + char *vmname; + int error, ch, vcpu, ptenum; + vm_paddr_t gpa_pmap; + struct vm_exit vmexit; + uint64_t rax, cr0, cr2, cr3, cr4, dr0, dr1, dr2, dr3, dr6, dr7; + uint64_t rsp, rip, rflags, efer, pat; + uint64_t eptp, bm, addr, u64, pteval[4], *pte, info[2]; + struct vmctx *ctx; + cpuset_t cpus; + bool cpu_intel; + uint64_t cs, ds, es, fs, gs, ss, tr, ldtr; + struct tm tm; + struct option *opts; + + cpu_intel = cpu_vendor_intel(); + opts = setup_options(cpu_intel); + + vcpu = 0; + vmname = NULL; + assert_lapic_lvt = -1; + progname = basename(argv[0]); + + while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) { + switch (ch) { + case 0: + break; + case VMNAME: + vmname = optarg; + break; + case VCPU: + vcpu = atoi(optarg); + break; + case SET_MEM: + memsize = atoi(optarg) * MB; + memsize = roundup(memsize, 2 * MB); + break; + case SET_EFER: + efer = strtoul(optarg, NULL, 0); + set_efer = 1; + break; + case SET_CR0: + cr0 = strtoul(optarg, NULL, 0); + set_cr0 = 1; + break; + case SET_CR2: + cr2 = strtoul(optarg, NULL, 0); + set_cr2 = 1; + break; + case SET_CR3: + cr3 = strtoul(optarg, NULL, 0); + set_cr3 = 1; + break; + case SET_CR4: + cr4 = strtoul(optarg, NULL, 0); + set_cr4 = 1; + break; + case SET_DR0: + dr0 = strtoul(optarg, NULL, 0); + set_dr0 = 1; + break; + case SET_DR1: + dr1 = strtoul(optarg, NULL, 0); + set_dr1 = 1; + break; + case SET_DR2: + dr2 = strtoul(optarg, NULL, 0); + set_dr2 = 1; + break; + case SET_DR3: + dr3 = strtoul(optarg, NULL, 0); + set_dr3 = 1; + break; + case SET_DR6: + dr6 = strtoul(optarg, NULL, 0); + set_dr6 = 1; + break; + case SET_DR7: + dr7 = strtoul(optarg, NULL, 0); + set_dr7 = 1; + break; + case SET_RSP: + rsp = strtoul(optarg, NULL, 0); + set_rsp = 1; + break; + case SET_RIP: + rip = strtoul(optarg, NULL, 0); + set_rip = 1; + break; + case SET_RAX: + rax = strtoul(optarg, NULL, 0); + set_rax = 1; + break; + case SET_RFLAGS: + rflags = strtoul(optarg, NULL, 0); + set_rflags = 1; + break; + case DESC_BASE: + desc_base = strtoul(optarg, NULL, 0); + break; + case DESC_LIMIT: + desc_limit = strtoul(optarg, NULL, 0); + break; + case DESC_ACCESS: + desc_access = strtoul(optarg, NULL, 0); + break; + case SET_CS: + cs = strtoul(optarg, NULL, 0); + set_cs = 1; + break; + case SET_DS: + ds = strtoul(optarg, NULL, 0); + set_ds = 1; + break; + case SET_ES: + es = strtoul(optarg, NULL, 0); + set_es = 1; + break; + case SET_FS: + fs = strtoul(optarg, NULL, 0); + set_fs = 1; + break; + case SET_GS: + gs = strtoul(optarg, NULL, 0); + set_gs = 1; + break; + case SET_SS: + ss = strtoul(optarg, NULL, 0); + set_ss = 1; + break; + case SET_TR: + tr = strtoul(optarg, NULL, 0); + set_tr = 1; + break; + case SET_LDTR: + ldtr = strtoul(optarg, NULL, 0); + set_ldtr = 1; + break; + case SET_X2APIC_STATE: + x2apic_state = strtol(optarg, NULL, 0); + set_x2apic_state = 1; + break; + case SET_EXCEPTION_BITMAP: + exception_bitmap = strtoul(optarg, NULL, 0); + set_exception_bitmap = 1; + break; + case SET_VMCS_ENTRY_INTERRUPTION_INFO: + vmcs_entry_interruption_info = strtoul(optarg, NULL, 0); + set_vmcs_entry_interruption_info = 1; + break; + case SET_CAP: + capval = strtoul(optarg, NULL, 0); + setcap = 1; + break; + case SET_RTC_TIME: + rtc_secs = strtoul(optarg, NULL, 0); + set_rtc_time = 1; + break; + case SET_RTC_NVRAM: + rtc_nvram_value = (uint8_t)strtoul(optarg, NULL, 0); + set_rtc_nvram = 1; + break; + case RTC_NVRAM_OFFSET: + rtc_nvram_offset = strtoul(optarg, NULL, 0); + 
break; + case GET_GPA_PMAP: + gpa_pmap = strtoul(optarg, NULL, 0); + get_gpa_pmap = 1; + break; + case CAPNAME: + capname = optarg; + break; + case UNASSIGN_PPTDEV: + unassign_pptdev = 1; + if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3) + usage(cpu_intel); + break; + case ASSERT_LAPIC_LVT: + assert_lapic_lvt = atoi(optarg); + break; + default: + usage(cpu_intel); + } + } + argc -= optind; + argv += optind; + + if (vmname == NULL) + usage(cpu_intel); + + error = 0; + + if (!error && create) + error = vm_create(vmname); + + if (!error) { + ctx = vm_open(vmname); + if (ctx == NULL) { + printf("VM:%s is not created.\n", vmname); + exit (1); + } + } + +#ifndef __FreeBSD__ + if (!error && wrlock_cycle) { + error = vm_wrlock_cycle(ctx); + exit(error); + } +#endif /* __FreeBSD__ */ + + if (!error && memsize) + error = vm_setup_memory(ctx, memsize, VM_MMAP_ALL); + + if (!error && set_efer) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer); + + if (!error && set_cr0) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0); + + if (!error && set_cr2) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR2, cr2); + + if (!error && set_cr3) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3); + + if (!error && set_cr4) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4); + + if (!error && set_dr0) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR0, dr0); + + if (!error && set_dr1) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR1, dr1); + + if (!error && set_dr2) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR2, dr2); + + if (!error && set_dr3) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR3, dr3); + + if (!error && set_dr6) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR6, dr6); + + if (!error && set_dr7) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7); + + if (!error && set_rsp) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp); + + if (!error && set_rip) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip); + + if (!error && set_rax) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax); + + if (!error && set_rflags) { + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS, + rflags); + } + + if (!error && set_desc_ds) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_es) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ss) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_cs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_fs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gs) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_tr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_ldtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR, + desc_base, desc_limit, desc_access); + } + + if (!error && set_desc_gdtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_desc_idtr) { + error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR, + desc_base, desc_limit, 0); + } + + if (!error && set_cs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs); + 
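The long run of guarded calls here is bhyvectl's error-chaining idiom: every optional operation is wrapped in if (!error && ...), so the first libvmmapi call that fails latches a nonzero error and all later operations are skipped without any extra control flow. A minimal standalone sketch of the same idiom follows; it is illustrative only, the VM name "demo" is hypothetical, and it relies solely on interfaces visible in this patch (vm_open() and vm_set_register()).

#include <errno.h>
#include <stdio.h>
#include <machine/vmm.h>
#include <vmmapi.h>

int
main(void)
{
	struct vmctx *ctx;
	int error = 0;

	/* "demo" is a hypothetical, already-created VM. */
	ctx = vm_open("demo");
	if (ctx == NULL) {
		perror("vm_open");
		return (1);
	}

	/* Chain exactly as bhyvectl does: any failure skips the rest. */
	if (!error)
		error = vm_set_register(ctx, 0, VM_REG_GUEST_RIP, 0xfff0);
	if (!error)
		error = vm_set_register(ctx, 0, VM_REG_GUEST_RSP, 0x8000);

	if (error)
		printf("errno = %d\n", errno);
	return (error);
}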
+ if (!error && set_ds) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds); + + if (!error && set_es) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es); + + if (!error && set_fs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs); + + if (!error && set_gs) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs); + + if (!error && set_ss) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss); + + if (!error && set_tr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr); + + if (!error && set_ldtr) + error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr); + + if (!error && set_x2apic_state) + error = vm_set_x2apic_state(ctx, vcpu, x2apic_state); + + if (!error && unassign_pptdev) + error = vm_unassign_pptdev(ctx, bus, slot, func); + + if (!error && set_exception_bitmap) { + if (cpu_intel) + error = vm_set_vmcs_field(ctx, vcpu, + VMCS_EXCEPTION_BITMAP, + exception_bitmap); + else + error = vm_set_vmcb_field(ctx, vcpu, + VMCB_OFF_EXC_INTERCEPT, + 4, exception_bitmap); + } + + if (!error && cpu_intel && set_vmcs_entry_interruption_info) { + error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO, + vmcs_entry_interruption_info); + } + + if (!error && inject_nmi) { + error = vm_inject_nmi(ctx, vcpu); + } + + if (!error && assert_lapic_lvt != -1) { + error = vm_lapic_local_irq(ctx, vcpu, assert_lapic_lvt); + } + + if (!error && (get_memseg || get_all)) + error = show_memseg(ctx); + + if (!error && (get_memmap || get_all)) + error = show_memmap(ctx); + + if (!error) + error = get_all_registers(ctx, vcpu); + + if (!error) + error = get_all_segments(ctx, vcpu); + + if (!error) { + if (cpu_intel) + error = get_misc_vmcs(ctx, vcpu); + else + error = get_misc_vmcb(ctx, vcpu); + } + + if (!error && (get_x2apic_state || get_all)) { + error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state); + if (error == 0) + printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state); + } + + if (!error && (get_eptp || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp); + else + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_NPT_BASE, + 8, &eptp); + if (error == 0) + printf("%s[%d]\t\t0x%016lx\n", + cpu_intel ? 
"eptp" : "rvi/npt", vcpu, eptp); + } + + if (!error && (get_exception_bitmap || get_all)) { + if(cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_EXCEPTION_BITMAP, &bm); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_EXC_INTERCEPT, + 4, &bm); + if (error == 0) + printf("exception_bitmap[%d]\t%#lx\n", vcpu, bm); + } + + if (!error && (get_io_bitmap || get_all)) { + if (cpu_intel) { + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, + &bm); + if (error == 0) + printf("io_bitmap_a[%d]\t%#lx\n", vcpu, bm); + error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, + &bm); + if (error == 0) + printf("io_bitmap_b[%d]\t%#lx\n", vcpu, bm); + } else { + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_IO_PERM, 8, &bm); + if (error == 0) + printf("io_bitmap[%d]\t%#lx\n", vcpu, bm); + } + } + + if (!error && (get_tsc_offset || get_all)) { + uint64_t tscoff; + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, + &tscoff); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_TSC_OFFSET, + 8, &tscoff); + if (error == 0) + printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff); + } + + if (!error && (get_msr_bitmap_address || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, + &addr); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_MSR_PERM, 8, &addr); + if (error == 0) + printf("msr_bitmap[%d]\t\t%#lx\n", vcpu, addr); + } + + if (!error && (get_msr_bitmap || get_all)) { + if (cpu_intel) { + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_MSR_BITMAP, &addr); + } else { + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_MSR_PERM, 8, + &addr); + } + + if (error == 0) + error = dump_msr_bitmap(vcpu, addr, cpu_intel); + } + + if (!error && (get_vpid_asid || get_all)) { + uint64_t vpid; + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid); + else + error = vm_get_vmcb_field(ctx, vcpu, VMCB_OFF_ASID, + 4, &vpid); + if (error == 0) + printf("%s[%d]\t\t0x%04lx\n", + cpu_intel ? 
"vpid" : "asid", vcpu, vpid); + } + + if (!error && (get_guest_pat || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_PAT, &pat); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_GUEST_PAT, 8, &pat); + if (error == 0) + printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat); + } + + if (!error && (get_guest_sysenter || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_CS, + &cs); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_SYSENTER_CS, 8, + &cs); + + if (error == 0) + printf("guest_sysenter_cs[%d]\t%#lx\n", vcpu, cs); + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_ESP, + &rsp); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_SYSENTER_ESP, 8, + &rsp); + + if (error == 0) + printf("guest_sysenter_sp[%d]\t%#lx\n", vcpu, rsp); + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, + VMCS_GUEST_IA32_SYSENTER_EIP, + &rip); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_SYSENTER_EIP, 8, + &rip); + if (error == 0) + printf("guest_sysenter_ip[%d]\t%#lx\n", vcpu, rip); + } + + if (!error && (get_exit_reason || get_all)) { + if (cpu_intel) + error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, + &u64); + else + error = vm_get_vmcb_field(ctx, vcpu, + VMCB_OFF_EXIT_REASON, 8, + &u64); + if (error == 0) + printf("exit_reason[%d]\t%#lx\n", vcpu, u64); } if (!error && setcap) { @@ -1479,6 +2249,42 @@ main(int argc, char *argv[]) printf("Capability \"%s\" is not available\n", capname); } + if (!error && get_gpa_pmap) { + error = vm_get_gpa_pmap(ctx, gpa_pmap, pteval, &ptenum); + if (error == 0) { + printf("gpa %#lx:", gpa_pmap); + pte = &pteval[0]; + while (ptenum-- > 0) + printf(" %#lx", *pte++); + printf("\n"); + } + } + + if (!error && set_rtc_nvram) + error = vm_rtc_write(ctx, rtc_nvram_offset, rtc_nvram_value); + + if (!error && (get_rtc_nvram || get_all)) { + error = vm_rtc_read(ctx, rtc_nvram_offset, &rtc_nvram_value); + if (error == 0) { + printf("rtc nvram[%03d]: 0x%02x\n", rtc_nvram_offset, + rtc_nvram_value); + } + } + + if (!error && set_rtc_time) + error = vm_rtc_settime(ctx, rtc_secs); + + if (!error && (get_rtc_time || get_all)) { + error = vm_rtc_gettime(ctx, &rtc_secs); + if (error == 0) { + gmtime_r(&rtc_secs, &tm); + printf("rtc time %#lx: %s %s %02d %02d:%02d:%02d %d\n", + rtc_secs, wday_str(tm.tm_wday), mon_str(tm.tm_mon), + tm.tm_mday, tm.tm_hour, tm.tm_min, tm.tm_sec, + 1900 + tm.tm_year); + } + } + if (!error && (getcap || get_all)) { int captype, val, getcaptype; @@ -1505,6 +2311,50 @@ main(int argc, char *argv[]) } } + if (!error && (get_active_cpus || get_all)) { + error = vm_active_cpus(ctx, &cpus); + if (!error) + print_cpus("active cpus", &cpus); + } + + if (!error && (get_suspended_cpus || get_all)) { + error = vm_suspended_cpus(ctx, &cpus); + if (!error) + print_cpus("suspended cpus", &cpus); + } + + if (!error && (get_intinfo || get_all)) { + error = vm_get_intinfo(ctx, vcpu, &info[0], &info[1]); + if (!error) { + print_intinfo("pending", info[0]); + print_intinfo("current", info[1]); + } + } + + if (!error && (get_stats || get_all)) { + int i, num_stats; + uint64_t *stats; + struct timeval tv; + const char *desc; + + stats = vm_get_stats(ctx, vcpu, &tv, &num_stats); + if (stats != NULL) { + printf("vcpu%d stats:\n", vcpu); + for (i = 0; i < num_stats; i++) { + desc = vm_get_stat_desc(ctx, i); + printf("%-40s\t%ld\n", desc, stats[i]); + } + } + } + + if (!error && (get_cpu_topology || get_all)) { + uint16_t sockets, 
cores, threads, maxcpus; + + vm_get_topology(ctx, &sockets, &cores, &threads, &maxcpus); + printf("cpu_topology:\tsockets=%hu, cores=%hu, threads=%hu, " + "maxcpus=%hu\n", sockets, cores, threads, maxcpus); + } + if (!error && run) { error = vm_run(ctx, vcpu, &vmexit); if (error == 0) @@ -1513,11 +2363,18 @@ main(int argc, char *argv[]) printf("vm_run error %d\n", error); } + if (!error && force_reset) + error = vm_suspend(ctx, VM_SUSPEND_RESET); + + if (!error && force_poweroff) + error = vm_suspend(ctx, VM_SUSPEND_POWEROFF); + if (error) printf("errno = %d\n", errno); if (!error && destroy) - error = vm_destroy(ctx); + vm_destroy(ctx); + free (opts); exit(error); } diff --git a/usr/src/cmd/bhyveload-uefi/Makefile b/usr/src/cmd/bhyveload-uefi/Makefile deleted file mode 100644 index bbcbacf32f..0000000000 --- a/usr/src/cmd/bhyveload-uefi/Makefile +++ /dev/null @@ -1,41 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -PROG = bhyveload-uefi - -include ../Makefile.cmd - -$(BUILD64)SUBDIRS += $(MACH64) - -all := TARGET = all -install := TARGET = install -clean := TARGET = clean -clobber := TARGET = clobber -lint := TARGET = lint - -.KEEP_STATE: - -all clean clobber lint: $(SUBDIRS) - -install: $(SUBDIRS) - -$(RM) $(ROOTUSRSBINPROG) - -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG) - -$(SUBDIRS): FRC - @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET) - -FRC: - -include ../Makefile.targ diff --git a/usr/src/cmd/bhyveload-uefi/Makefile.com b/usr/src/cmd/bhyveload-uefi/Makefile.com deleted file mode 100644 index 7865cca8d8..0000000000 --- a/usr/src/cmd/bhyveload-uefi/Makefile.com +++ /dev/null @@ -1,52 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -PROG= bhyveload-uefi - -SRCS = ../bhyveload-uefi.c expand_number.c -OBJS = bhyveload-uefi.o expand_number.o - -include ../../Makefile.cmd - -.KEEP_STATE: - -CFLAGS += $(CCVERBOSE) -CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \ - -I$(ROOT)/usr/platform/i86pc/include -LDLIBS += -lvmmapi - -all: $(PROG) - -$(PROG): $(OBJS) - $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS) - $(POST_PROCESS) - -install: all $(ROOTUSRSBINPROG) - -clean: - $(RM) $(OBJS) - -lint: lint_SRCS - -include ../../Makefile.targ - -%.o: ../%.c - $(COMPILE.c) $< - $(POST_PROCESS_O) - -%.o: $(CONTRIB)/freebsd/lib/libutil/%.c - $(COMPILE.c) $< - $(POST_PROCESS_O) - diff --git a/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c b/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c deleted file mode 100644 index 62a7ca5d0f..0000000000 --- a/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c +++ /dev/null @@ -1,190 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. 
- * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2013 Pluribus Networks Inc. - */ - -#include <sys/types.h> - -#include <machine/vmm.h> - -#include <errno.h> -#include <err.h> -#include <fcntl.h> -#include <stdio.h> -#include <stdlib.h> -#include <sysexits.h> -#include <unistd.h> - -#include <vmmapi.h> - -#define KB (1024UL) -#define MB (1024 * 1024UL) -#define GB (1024 * 1024 * 1024UL) - -#define UEFI_ROM_ADDR 0xFFE00000 -#define UEFI_ROM_SIZE (2 * MB) -/* - * N.B. the UEFI code zeros the first page in memory so use the second. - */ -#define BHYVE_HOB_ADDR 0x00002000 -#define BHYVE_BO_HOB_ADDR 0x00002080 - -#define UEFI_ROM_PATH "/usr/share/bhyve/uefi-rom.bin" - -struct platform_info { - uint32_t ncpus; -}; - -/* - * Boot order code: - * 0 - EFI_CD_HD - * 1 - EFI_CD - * 2 - EFI_HD_CD - * 3 - EFI_HD - * 4 - EFI_NET - * 5 - EFI_NET_CD_HD - * 6 - EFI_HD_HD_CD - * 7 - LEGACY_CD_HD - * 8 - LEGACY_CD - * 9 - LEGACY_HD_CD - * 10 - LEGACY_HD - * 11 - EFI_SHELL - */ - -struct bootorder_info { - uint32_t guestbootorder; -}; - -static char *vmname, *progname; -static struct vmctx *ctx; - -static void -usage(void) -{ - printf("usage: %s " - "[-c vcpus] [-m mem-size] [-b bootorder]" - "<vmname>\n", progname); - exit(1); -} - -int -main(int argc, char** argv) -{ - int opt, error, fd; - int guest_ncpus; - int guest_bootorder = 0; - uint64_t mem_size; - char *membase, *rombase; - struct platform_info *pi; - struct bootorder_info *bi; - - progname = argv[0]; - - guest_ncpus = 1; - mem_size = 256 * MB; - - while ((opt = getopt(argc, argv, "c:m:b:")) != -1) { - switch (opt) { - case 'c': - guest_ncpus = atoi(optarg); - break; - case 'm': - error = vm_parse_memsize(optarg, &mem_size); - if (error != 0 || mem_size == 0) - errx(EX_USAGE, "Invalid memsize '%s'", optarg); - break; - case 'b': - guest_bootorder = atoi(optarg); - if (guest_bootorder < 0 || guest_bootorder > 11) { - errx(EX_USAGE, "Invalid bootoption: %d\n" - "\tBoot order code:\n" - "\t0 - EFI_CD_HD\n" - "\t1 - EFI_CD\n" - "\t2 - EFI_HD_CD\n" - "\t3 - EFI_HD\n" - "\t4 - EFI_NET\n" - "\t5 - EFI_NET_CD_HD\n" - "\t6 - EFI_HD_HD_CD\n" - "\t7 - LEGACY_CD_HD\n" - "\t8 - LEGACY_CD\n" - "\t9 - LEGACY_HD_CD\n" - "\t10 - LEGACY_HD\n" - "\t11 - EFI_SHELL\n", guest_bootorder); - exit(1); - } - break; - case '?': - usage(); - } - } - - argc -= optind; - argv += optind; - - if (argc != 1) - usage(); - - vmname = argv[0]; - error = vm_create(vmname); - if (error != 0 && errno != EEXIST) { - perror("vm_create"); - exit(1); - - } - - ctx = vm_open(vmname); - if (ctx == NULL) { - perror("vm_open"); - exit(1); - } - - error = vm_set_capability(ctx, 0, VM_CAP_UNRESTRICTED_GUEST, 1); - if (error) { - perror("vm_set_capability(VM_CAP_UNRESTRICTED_GUEST)"); - } - - error = vm_setup_memory(ctx, mem_size, VM_MMAP_ALL); - if (error) { - perror("vm_setup_memory"); - exit(1); - } - membase = vm_map_gpa(ctx, 0, 8 * KB); - - error = vm_setup_rom(ctx, UEFI_ROM_ADDR, UEFI_ROM_SIZE); - if (error) { - perror("vm_setup_rom"); - exit(1); - } - rombase = vm_map_gpa(ctx, UEFI_ROM_ADDR, UEFI_ROM_SIZE); - - fd = open(UEFI_ROM_PATH, O_RDONLY); - if (fd == -1) { - perror("open"); - exit(1); - } - read(fd, rombase, UEFI_ROM_SIZE); - close(fd); - - pi = (struct platform_info *)(membase + BHYVE_HOB_ADDR); - pi->ncpus = guest_ncpus; - bi = 
(struct bootorder_info *)(membase + BHYVE_BO_HOB_ADDR); - bi->guestbootorder = guest_bootorder; - - error = vcpu_reset(ctx, 0); - if (error) { - perror("vcpu_reset"); - exit(1); - } - - return (0); -} diff --git a/usr/src/cmd/devfsadm/i386/misc_link_i386.c b/usr/src/cmd/devfsadm/i386/misc_link_i386.c index 2b678df527..4aeea7d294 100644 --- a/usr/src/cmd/devfsadm/i386/misc_link_i386.c +++ b/usr/src/cmd/devfsadm/i386/misc_link_i386.c @@ -45,6 +45,7 @@ static int vt00(di_minor_t minor, di_node_t node); static int kdmouse(di_minor_t minor, di_node_t node); static int ipmi(di_minor_t minor, di_node_t node); static int mc_node(di_minor_t minor, di_node_t node); +static int vmmctl(di_minor_t minor, di_node_t node); static devfsadm_create_t misc_cbt[] = { { "vt00", "ddi_display", NULL, @@ -84,6 +85,9 @@ static devfsadm_create_t misc_cbt[] = { { "pseudo", "ddi_pseudo", "ucode", TYPE_EXACT | DRV_EXACT, ILEVEL_0, ln_minor_name, }, + { "pseudo", "ddi_pseudo", "vmm", + TYPE_EXACT | DRV_EXACT, ILEVEL_0, vmmctl, + } }; DEVFSADM_CREATE_INIT_V0(misc_cbt); @@ -109,6 +113,9 @@ static devfsadm_remove_t misc_remove_cbt[] = { }, { "serial", "^tty[a-z]$", RM_ALWAYS | RM_PRE, ILEVEL_1, devfsadm_rm_all + }, + { "pseudo", "^vmmctl$", RM_ALWAYS | RM_PRE | RM_HOT, + ILEVEL_0, devfsadm_rm_all } }; @@ -345,3 +352,14 @@ mc_node(di_minor_t minor, di_node_t node) (void) devfsadm_mklink(linkpath, node, minor, 0); return (DEVFSADM_CONTINUE); } + +/* + * /dev/vmmctl -> /devices/pseudo/vmm@0:ctl + */ +static int +vmmctl(di_minor_t minor, di_node_t node) +{ + if (strcmp(di_minor_name(minor), "ctl") == 0) + (void) devfsadm_mklink("vmmctl", node, minor, 0); + return (DEVFSADM_CONTINUE); +} diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile b/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile deleted file mode 100644 index 49ca0c5eb3..0000000000 --- a/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile +++ /dev/null @@ -1,32 +0,0 @@ -# -# This file and its contents are supplied under the terms of the -# Common Development and Distribution License ("CDDL"), version 1.0. -# You may only use this file in accordance with the terms of version -# 1.0 of the CDDL. -# -# A full copy of the text of the CDDL should have accompanied this -# source. A copy of the CDDL is also available via the Internet at -# http://www.illumos.org/license/CDDL. -# - -# -# Copyright 2013 Pluribus Networks Inc. -# - -MODULE = vmm.so -MDBTGT = kvm - -MODSRCS = vmm.c - -include ../../../../../Makefile.cmd -include ../../../../../Makefile.cmd.64 -include ../../../Makefile.amd64 -include ../../../../Makefile.module - -CPPFLAGS = -D_KERNEL -D_MACHDEP -CPPFLAGS += -I$(COMPAT)/freebsd -I$(COMPAT)/freebsd/amd64 -CPPFLAGS += -I$(CONTRIB)/freebsd -I$(CONTRIB)/freebsd/amd64 -CPPFLAGS += -I$(SRC)/uts/common -I$(SRC)/uts/i86pc -CPPFLAGS += -I$(SRC)/cmd/mdb/common - -CPPFLAGS += -_cc=-xdryrun diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c b/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c deleted file mode 100644 index 9e29d8662a..0000000000 --- a/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c +++ /dev/null @@ -1,238 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2014 Pluribus Networks Inc. 
- */ - -#include <sys/param.h> - -#include <mdb/mdb_modapi.h> -#include <sys/cpuvar.h> -#include <sys/varargs.h> -#include <sys/vmm.h> -#include <sys/vmm_impl.h> - -/* - * VMM trace debug walker/dcmd code - */ - -/* - * Initialize the vmm_trace_dmsg_t walker by either using the given starting - * address, or reading the value of the kernel's vmm_debug_rbuf pointer. - * We also allocate a vmm_trace_dmsg_t for storage, and save this using the - * walk_data pointer. - */ -static int -vmm_dmsg_walk_i(mdb_walk_state_t *wsp) -{ - uintptr_t rbuf_addr; - vmm_trace_rbuf_t rbuf; - - if (wsp->walk_addr == NULL) { - if (mdb_readvar(&rbuf_addr, "vmm_debug_rbuf") == -1) { - mdb_warn("failed to read 'vmm_debug_rbuf'"); - return (WALK_ERR); - } - - if (mdb_vread(&rbuf, sizeof (vmm_trace_rbuf_t), rbuf_addr) - == -1) { - mdb_warn("failed to read vmm_trace_rbuf_t at %p", - rbuf_addr); - return (WALK_ERR); - } - - wsp->walk_addr = (uintptr_t)(vmm_trace_dmsg_t *)rbuf.dmsgh; - } - - /* - * Save ptr to head of ring buffer to prevent looping. - */ - wsp->walk_arg = (void *)wsp->walk_addr; - wsp->walk_data = mdb_alloc(sizeof (vmm_trace_dmsg_t), UM_SLEEP); - return (WALK_NEXT); -} - -/* - * At each step, read a vmm_trace_dmsg_t into our private storage, and then - * invoke the callback function. We terminate when we reach a NULL next - * pointer. - */ -static int -vmm_dmsg_walk_s(mdb_walk_state_t *wsp) -{ - int status; - - if (wsp->walk_addr == NULL) - return (WALK_DONE); - - if (mdb_vread(wsp->walk_data, sizeof (vmm_trace_dmsg_t), - wsp->walk_addr) == -1) { - mdb_warn("failed to read vmm_trace_dmsg_t at %p", - wsp->walk_addr); - return (WALK_ERR); - } - - status = wsp->walk_callback(wsp->walk_addr, wsp->walk_data, - wsp->walk_cbdata); - - wsp->walk_addr = - (uintptr_t)(((vmm_trace_dmsg_t *)wsp->walk_data)->next); - - /* - * If we've looped then we're done. - */ - if (wsp->walk_addr == (uintptr_t)wsp->walk_arg) - wsp->walk_addr = NULL; - - return (status); -} - -/* - * The walker's fini function is invoked at the end of each walk. Since we - * dynamically allocated a vmm_trace_dmsg_t in vmm_dmsg_walk_i, we must - * free it now. - */ -static void -vmm_dmsg_walk_f(mdb_walk_state_t *wsp) -{ - mdb_free(wsp->walk_data, sizeof (vmm_trace_dmsg_t)); -} - -/* - * This routine is used by the vmm_dmsg_dump dcmd to dump content of - * VMM trace ring buffer. - */ -int -vmm_dmsg_dump(vmm_trace_dmsg_t *addr, int print_pathname, uint_t *printed) -{ - vmm_trace_dmsg_t dmsg, *dmsgh = addr; - char pathname[MAXPATHLEN]; - char merge[1024]; - - while (addr != NULL) { - if (mdb_vread(&dmsg, sizeof (dmsg), (uintptr_t)addr) != - sizeof (dmsg)) { - mdb_warn("failed to read message pointer in kernel"); - return (DCMD_ERR); - } - - (void) mdb_snprintf(merge, sizeof (merge), - "[%Y:%03d:%03d:%03d] : %s", - dmsg.timestamp.tv_sec, - (int)dmsg.timestamp.tv_nsec/1000000, - (int)(dmsg.timestamp.tv_nsec/1000)%1000, - (int)dmsg.timestamp.tv_nsec%1000, - dmsg.buf); - - mdb_printf("%s", merge); - - if (printed != NULL) { - (*printed)++; - } - - if (((addr = dmsg.next) == NULL) || (dmsg.next == dmsgh)) { - break; - } - } - - return (DCMD_OK); -} - -/* - * 1. Process flag passed to vmm_dmsg_dump dcmd. - * 2. Obtain VMM trace ring buffer pointer. - * 3. Pass VMM trace ring buffer pointer to vmm_dmsg_dump() - * to dump content of VMM trace ring buffer. - */ -int -vmm_rbuf_dump(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv) -{ - vmm_trace_rbuf_t rbuf; - uint_t printed = 0; /* have we printed anything? 
*/ - int print_pathname = FALSE; - int rval = DCMD_OK; - - if (argc > 1) { - return (DCMD_USAGE); - } - - if (mdb_getopts(argc, argv, - 'a', MDB_OPT_SETBITS, TRUE, &print_pathname) != argc) { - return (DCMD_USAGE); - } - - /* - * If ring buffer address not provided try to obtain - * it using vmm_debug_rbuf global. - */ - if ((addr == NULL) || !(flags & DCMD_ADDRSPEC)) { - if (mdb_readvar(&addr, "vmm_debug_rbuf") == -1) { - mdb_warn("Failed to read 'vmm_debug_rbuf'."); - return (DCMD_ERR); - } - } - - if (mdb_vread(&rbuf, sizeof (rbuf), addr) != sizeof (rbuf)) { - mdb_warn("Failed to read ring buffer in kernel."); - return (DCMD_ERR); - } - - if (rbuf.dmsgh == NULL) { - mdb_printf("The vmm trace ring buffer is empty.\n"); - return (DCMD_OK); - } - - rval = vmm_dmsg_dump((vmm_trace_dmsg_t *)rbuf.dmsgh, - print_pathname, &printed); - - if (rval != DCMD_OK) { - return (rval); - } - - if (printed == 0) { - mdb_warn("Failed to read vmm trace ring buffer."); - return (DCMD_ERR); - } - - return (rval); -} - -/* - * MDB module linkage information: - * - * We declare a list of structures describing our dcmds, a list of structures - * describing our walkers, and a function named _mdb_init to return a pointer - * to our module information. - */ - -static const mdb_dcmd_t dcmds[] = { - { "vmm_dmsg_dump", "[-a]", "Dump vmm trace debug messages", - vmm_rbuf_dump }, - { NULL } -}; - -static const mdb_walker_t walkers[] = { - { "vmm_dmsg", - "walk ring buffer containing vmm trace debug messages", - vmm_dmsg_walk_i, vmm_dmsg_walk_s, vmm_dmsg_walk_f }, - { NULL } -}; - -static const mdb_modinfo_t modinfo = { - MDB_API_VERSION, dcmds, walkers -}; - -const mdb_modinfo_t * -_mdb_init(void) -{ - return (&modinfo); -} diff --git a/usr/src/compat/freebsd/amd64/machine/asmacros.h b/usr/src/compat/freebsd/amd64/machine/asmacros.h index fcf35a7b78..1f6955130b 100644 --- a/usr/src/compat/freebsd/amd64/machine/asmacros.h +++ b/usr/src/compat/freebsd/amd64/machine/asmacros.h @@ -25,4 +25,7 @@ x: #define END(x) \ .size x, [.-x] +#define ALIGN_TEXT \ + .p2align 4,0x90; /* 16-byte alignment, nop filled */ + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_ASMACROS_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/atomic.h b/usr/src/compat/freebsd/amd64/machine/atomic.h index 5b78143d21..1da9724b7d 100644 --- a/usr/src/compat/freebsd/amd64/machine/atomic.h +++ b/usr/src/compat/freebsd/amd64/machine/atomic.h @@ -11,31 +11,20 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ -static __inline u_char -atomic_load_acq_char(volatile u_char *p) -{ - u_char res; - - __asm volatile("lock ; " "cmpxchgb %b0,%1" - : "=a" (res), "=m" (*p) - : "m" (*p) : "memory", "cc"); - return (res); -} - static __inline u_short atomic_load_acq_short(volatile u_short *p) { u_short res; - __asm volatile("lock ; " "cmpxchgw %w0,%1" - : "=a" (res), "=m" (*p) - : "m" (*p) - : "memory", "cc"); + res = *p; + __asm volatile("" : : : "memory"); + return (res); } @@ -44,10 +33,9 @@ atomic_load_acq_int(volatile u_int *p) { u_int res; - __asm volatile("lock ; " "cmpxchgl %0,%1" - : "=a" (res), "=m" (*p) - : "m" (*p) - : "memory", "cc"); + res = *p; + __asm volatile("" : : : "memory"); + return (res); } @@ -56,25 +44,10 @@ atomic_load_acq_long(volatile u_long *p) { u_long res; - __asm volatile("lock ; " "cmpxchgq %0,%1" - : "=a" (res), "=m" (*p) - : "m" (*p) - : "memory", "cc"); - return (res); -} - -static __inline void -atomic_store_rel_char(volatile u_char *p, u_char v) -{ + res = *p; __asm volatile("" : : : "memory"); - *p = v; -} -static __inline void -atomic_store_rel_short(volatile u_short *p, u_short v) -{ - __asm volatile("" : : : "memory"); - *p = v; + return (res); } static __inline void @@ -134,6 +107,23 @@ atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src) return (res); } +static __inline int +atomic_testandset_int(volatile u_int *p, u_int v) +{ + u_char res; + + __asm __volatile( + " lock ; " + " btsl %2,%1 ; " + " setc %0 ; " + "# atomic_testandset_int" + : "=q" (res), /* 0 */ + "+m" (*p) /* 1 */ + : "Ir" (v & 0x1f) /* 2 */ + : "cc"); + return (res); +} + /* * Atomically add the value of v to the integer pointed to by p and return * the previous value of *p. @@ -226,6 +216,13 @@ atomic_swap_long(volatile u_long *p, u_long v) return (v); } + +#define atomic_store_short(p, v) \ + (*(volatile u_short *)(p) = (u_short)(v)) +#define atomic_store_int(p, v) \ + (*(volatile u_int *)(p) = (u_int)(v)) + + #define atomic_readandclear_int(p) atomic_swap_int(p, 0) #define atomic_readandclear_long(p) atomic_swap_long(p, 0) @@ -241,4 +238,25 @@ atomic_swap_long(volatile u_long *p, u_long v) /* Operations on pointers. 
*/ #define atomic_cmpset_ptr atomic_cmpset_long +/* Needed for the membar functions */ +#include_next <sys/atomic.h> + +static __inline void +atomic_thread_fence_rel(void) +{ + /* Equivalent to their __compiler_membar() */ + __asm __volatile(" " : : : "memory"); +} + +static __inline void +atomic_thread_fence_seq_cst(void) +{ + /* Equivalent to their !KERNEL storeload_barrer() */ + __asm __volatile("lock; addl $0,-8(%%rsp)" : : : "memory", "cc"); +} + +#define mb() membar_enter() +#define rmb() membar_consumer() +#define wmb() membar_producer() + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/cpufunc.h b/usr/src/compat/freebsd/amd64/machine/cpufunc.h index cf485e947c..0b7bcdaa59 100644 --- a/usr/src/compat/freebsd/amd64/machine/cpufunc.h +++ b/usr/src/compat/freebsd/amd64/machine/cpufunc.h @@ -16,6 +16,8 @@ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ +#include <sys/types.h> + static __inline u_long bsfq(u_long mask) { @@ -66,6 +68,12 @@ cpuid_count(u_int ax, u_int cx, u_int *p) } static __inline void +disable_intr(void) +{ + __asm __volatile("cli"); +} + +static __inline void enable_intr(void) { __asm __volatile("sti"); @@ -95,6 +103,15 @@ flsll(long long mask) return (flsl((long)mask)); } +static __inline u_long +read_rflags(void) +{ + u_long rf; + + __asm __volatile("pushfq; popq %0" : "=r" (rf)); + return (rf); +} + static __inline uint64_t rdmsr(u_int msr) { @@ -107,10 +124,10 @@ rdmsr(u_int msr) static __inline uint64_t rdtsc(void) { - uint32_t low, high; - - __asm __volatile("rdtsc" : "=a" (low), "=d" (high)); - return (low | ((uint64_t)high << 32)); + extern hrtime_t tsc_gethrtimeunscaled_delta(void); + + /* Get the TSC reading with any needed synch offset applied */ + return ((uint64_t)tsc_gethrtimeunscaled_delta()); } static __inline void @@ -162,4 +179,133 @@ rcr4(void) return (data); } +static __inline u_long +rxcr(u_int reg) +{ + u_int low, high; + + __asm __volatile("xgetbv" : "=a" (low), "=d" (high) : "c" (reg)); + return (low | ((uint64_t)high << 32)); +} + +static __inline void +load_xcr(u_int reg, u_long val) +{ + u_int low, high; + + low = val; + high = val >> 32; + __asm __volatile("xsetbv" : : "c" (reg), "a" (low), "d" (high)); +} + +static __inline void +write_rflags(u_long rf) +{ + __asm __volatile("pushq %0; popfq" : : "r" (rf)); +} + +static __inline uint64_t +rdr0(void) +{ + uint64_t data; + __asm __volatile("movq %%dr0,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr0(uint64_t dr0) +{ + __asm __volatile("movq %0,%%dr0" : : "r" (dr0)); +} + +static __inline uint64_t +rdr1(void) +{ + uint64_t data; + __asm __volatile("movq %%dr1,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr1(uint64_t dr1) +{ + __asm __volatile("movq %0,%%dr1" : : "r" (dr1)); +} + +static __inline uint64_t +rdr2(void) +{ + uint64_t data; + __asm __volatile("movq %%dr2,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr2(uint64_t dr2) +{ + __asm __volatile("movq %0,%%dr2" : : "r" (dr2)); +} + +static __inline uint64_t +rdr3(void) +{ + uint64_t data; + __asm __volatile("movq %%dr3,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr3(uint64_t dr3) +{ + __asm __volatile("movq %0,%%dr3" : : "r" (dr3)); +} + +static __inline uint64_t +rdr6(void) +{ + uint64_t data; + __asm __volatile("movq %%dr6,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr6(uint64_t dr6) +{ + __asm 
__volatile("movq %0,%%dr6" : : "r" (dr6)); +} + +static __inline uint64_t +rdr7(void) +{ + uint64_t data; + __asm __volatile("movq %%dr7,%0" : "=r" (data)); + return (data); +} + +static __inline void +load_dr7(uint64_t dr7) +{ + __asm __volatile("movq %0,%%dr7" : : "r" (dr7)); +} + +#ifdef _KERNEL +/* + * Including the native sys/segments.h in userspace seriously conflicts with + * the FreeBSD compat/contrib headers. + */ +#include <sys/segments.h> + +static __inline void +lldt(u_short sel) +{ + wr_ldtr(sel); +} + +static __inline u_short +sldt() +{ + return (rd_ldtr()); +} +#endif /* _KERNEL */ + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/fpu.h b/usr/src/compat/freebsd/amd64/machine/fpu.h index 48e686780c..6bc651d996 100644 --- a/usr/src/compat/freebsd/amd64/machine/fpu.h +++ b/usr/src/compat/freebsd/amd64/machine/fpu.h @@ -11,13 +11,12 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright (c) 2018, Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ -#define XSAVE_AREA_ALIGN 64 - void fpuexit(kthread_t *td); void fpurestore(void *); void fpusave(void *); diff --git a/usr/src/compat/freebsd/vm/pmap.h b/usr/src/compat/freebsd/amd64/machine/iodev.h index 5958c4b101..c7cdddc817 100644 --- a/usr/src/compat/freebsd/vm/pmap.h +++ b/usr/src/compat/freebsd/amd64/machine/iodev.h @@ -10,12 +10,10 @@ */ /* - * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ -#ifndef _COMPAT_FREEBSD_VM_PMAP_H_ -#define _COMPAT_FREEBSD_VM_PMAP_H_ +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_IODEV_H +#define _COMPAT_FREEBSD_AMD64_MACHINE_IODEV_H -#include <machine/pmap.h> - -#endif /* _COMPAT_FREEBSD_VM_PMAP_H_ */ +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_IODEV_H */ diff --git a/usr/src/compat/freebsd/amd64/machine/md_var.h b/usr/src/compat/freebsd/amd64/machine/md_var.h index 60fdd566e5..ed57a8bebc 100644 --- a/usr/src/compat/freebsd/amd64/machine/md_var.h +++ b/usr/src/compat/freebsd/amd64/machine/md_var.h @@ -21,4 +21,8 @@ extern u_int cpu_exthigh; /* Highest arg to extended CPUID */ extern u_int cpu_id; /* Stepping ID */ extern char cpu_vendor[]; /* CPU Origin code */ +#include <sys/systm.h> + +#define Maxmem (physmax + 1) + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/param.h b/usr/src/compat/freebsd/amd64/machine/param.h index eaca5ab8d7..b152f4d526 100644 --- a/usr/src/compat/freebsd/amd64/machine/param.h +++ b/usr/src/compat/freebsd/amd64/machine/param.h @@ -36,4 +36,6 @@ /* Size of the level 4 page-map level-4 table units */ #define NPML4EPG (PAGE_SIZE/(sizeof (pml4_entry_t))) +#define CACHE_LINE_SIZE 64 + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PARAM_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/pmap.h b/usr/src/compat/freebsd/amd64/machine/pmap.h index d0303bdd56..ce3185629b 100644 --- a/usr/src/compat/freebsd/amd64/machine/pmap.h +++ b/usr/src/compat/freebsd/amd64/machine/pmap.h @@ -1,4 +1,55 @@ /* + * All rights reserved. This copyright notice is Copyright Management + * Information under 17 USC 1202 and is included to protect this work and + * deter copyright infringement. Removal or alteration of this Copyright + * Management Information without the express written permission from + * Pluribus Networks Inc is prohibited, and any such unauthorized removal + * or alteration will be a violation of federal law. + * + * Copyright (c) 2003 Peter Wemm. 
+ * Copyright (c) 1991 Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department and William Jolitz of UUNET Technologies Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Derived from hp300 version by Mike Hibler, this version by William + * Jolitz uses a recursive map [a pde points to the page directory] to + * map the page tables using the pagetables themselves. This is done to + * reduce the impact on kernel virtual memory for lots of sparse address + * space, and to reduce the cost of memory to each process. + * + * from: hp300: @(#)pmap.h 7.2 (Berkeley) 12/16/90 + * from: @(#)pmap.h 7.4 (Berkeley) 5/12/91 + * $FreeBSD$ + */ + +/* * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version @@ -13,32 +64,426 @@ * Copyright 2014 Pluribus Networks Inc. */ + #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ +/* + * Page-directory and page-table entries follow this format, with a few + * of the fields not present here and there, depending on a lot of things. 
+ */ /* ---- Intel Nomenclature ---- */ -#define PG_V 0x001 /* P Valid */ -#define PG_RW 0x002 /* R/W Read/Write */ -#define PG_U 0x004 /* U/S User/Supervisor */ -#define PG_A 0x020 /* A Accessed */ -#define PG_M 0x040 /* D Dirty */ -#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ +#define X86_PG_V 0x001 /* P Valid */ +#define X86_PG_RW 0x002 /* R/W Read/Write */ +#define X86_PG_U 0x004 /* U/S User/Supervisor */ +#define X86_PG_NC_PWT 0x008 /* PWT Write through */ +#define X86_PG_NC_PCD 0x010 /* PCD Cache disable */ +#define X86_PG_A 0x020 /* A Accessed */ +#define X86_PG_M 0x040 /* D Dirty */ +#define X86_PG_PS 0x080 /* PS Page size (0=4k,1=2M) */ +#define X86_PG_PTE_PAT 0x080 /* PAT PAT index */ +#define X86_PG_G 0x100 /* G Global */ +#define X86_PG_AVAIL1 0x200 /* / Available for system */ +#define X86_PG_AVAIL2 0x400 /* < programmers use */ +#define X86_PG_AVAIL3 0x800 /* \ */ +#define X86_PG_PDE_PAT 0x1000 /* PAT PAT index */ +#define X86_PG_NX (1ul<<63) /* No-execute */ +#define X86_PG_AVAIL(x) (1ul << (x)) + +/* Page level cache control fields used to determine the PAT type */ +#define X86_PG_PDE_CACHE (X86_PG_PDE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) +#define X86_PG_PTE_CACHE (X86_PG_PTE_PAT | X86_PG_NC_PWT | X86_PG_NC_PCD) + +/* + * Intel extended page table (EPT) bit definitions. + */ +#define EPT_PG_READ 0x001 /* R Read */ +#define EPT_PG_WRITE 0x002 /* W Write */ +#define EPT_PG_EXECUTE 0x004 /* X Execute */ +#define EPT_PG_IGNORE_PAT 0x040 /* IPAT Ignore PAT */ +#define EPT_PG_PS 0x080 /* PS Page size */ +#define EPT_PG_A 0x100 /* A Accessed */ +#define EPT_PG_M 0x200 /* D Dirty */ +#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) /* MT Memory Type */ + +/* + * Define the PG_xx macros in terms of the bits on x86 PTEs. + */ +#define PG_V X86_PG_V +#define PG_RW X86_PG_RW +#define PG_U X86_PG_U +#define PG_NC_PWT X86_PG_NC_PWT +#define PG_NC_PCD X86_PG_NC_PCD +#define PG_A X86_PG_A +#define PG_M X86_PG_M +#define PG_PS X86_PG_PS +#define PG_PTE_PAT X86_PG_PTE_PAT +#define PG_G X86_PG_G +#define PG_AVAIL1 X86_PG_AVAIL1 +#define PG_AVAIL2 X86_PG_AVAIL2 +#define PG_AVAIL3 X86_PG_AVAIL3 +#define PG_PDE_PAT X86_PG_PDE_PAT +#define PG_NX X86_PG_NX +#define PG_PDE_CACHE X86_PG_PDE_CACHE +#define PG_PTE_CACHE X86_PG_PTE_CACHE + +/* Our various interpretations of the above */ +#define PG_W X86_PG_AVAIL3 /* "Wired" pseudoflag */ +#define PG_MANAGED X86_PG_AVAIL2 +#define EPT_PG_EMUL_V X86_PG_AVAIL(52) +#define EPT_PG_EMUL_RW X86_PG_AVAIL(53) +#define PG_PROMOTED X86_PG_AVAIL(54) /* PDE only */ +#define PG_FRAME (0x000ffffffffff000ul) +#define PG_PS_FRAME (0x000fffffffe00000ul) + +/* + * Promotion to a 2MB (PDE) page mapping requires that the corresponding 4KB + * (PTE) page mappings have identical settings for the following fields: + */ +#define PG_PTE_PROMOTE (PG_NX | PG_MANAGED | PG_W | PG_G | PG_PTE_CACHE | \ + PG_M | PG_A | PG_U | PG_RW | PG_V) /* * Page Protection Exception bits */ + #define PGEX_P 0x01 /* Protection violation vs. not present */ #define PGEX_W 0x02 /* during a Write cycle */ #define PGEX_U 0x04 /* access from User mode (UPL) */ #define PGEX_RSV 0x08 /* reserved PTE field is non-zero */ #define PGEX_I 0x10 /* during an instruction fetch */ +/* + * undef the PG_xx macros that define bits in the regular x86 PTEs that + * have a different position in nested PTEs. This is done when compiling + * code that needs to be aware of the differences between regular x86 and + * nested PTEs. + * + * The appropriate bitmask will be calculated at runtime based on the pmap + * type. 
+ */ +#ifdef AMD64_NPT_AWARE +#undef PG_AVAIL1 /* X86_PG_AVAIL1 aliases with EPT_PG_M */ +#undef PG_G +#undef PG_A +#undef PG_M +#undef PG_PDE_PAT +#undef PG_PDE_CACHE +#undef PG_PTE_PAT +#undef PG_PTE_CACHE +#undef PG_RW +#undef PG_V +#endif + +/* + * Pte related macros. This is complicated by having to deal with + * the sign extension of the 48th bit. + */ +#define KVADDR(l4, l3, l2, l1) ( \ + ((unsigned long)-1 << 47) | \ + ((unsigned long)(l4) << PML4SHIFT) | \ + ((unsigned long)(l3) << PDPSHIFT) | \ + ((unsigned long)(l2) << PDRSHIFT) | \ + ((unsigned long)(l1) << PAGE_SHIFT)) + +#define UVADDR(l4, l3, l2, l1) ( \ + ((unsigned long)(l4) << PML4SHIFT) | \ + ((unsigned long)(l3) << PDPSHIFT) | \ + ((unsigned long)(l2) << PDRSHIFT) | \ + ((unsigned long)(l1) << PAGE_SHIFT)) + +/* + * Number of kernel PML4 slots. Can be anywhere from 1 to 64 or so, + * but setting it larger than NDMPML4E makes no sense. + * + * Each slot provides .5 TB of kernel virtual space. + */ +#define NKPML4E 4 + +#define NUPML4E (NPML4EPG/2) /* number of userland PML4 pages */ +#define NUPDPE (NUPML4E*NPDPEPG)/* number of userland PDP pages */ +#define NUPDE (NUPDPE*NPDEPG) /* number of userland PD entries */ + +/* + * NDMPML4E is the maximum number of PML4 entries that will be + * used to implement the direct map. It must be a power of two, + * and should generally exceed NKPML4E. The maximum possible + * value is 64; using 128 will make the direct map intrude into + * the recursive page table map. + */ +#define NDMPML4E 8 + +/* + * These values control the layout of virtual memory. The starting address + * of the direct map, which is controlled by DMPML4I, must be a multiple of + * its size. (See the PHYS_TO_DMAP() and DMAP_TO_PHYS() macros.) + * + * Note: KPML4I is the index of the (single) level 4 page that maps + * the KVA that holds KERNBASE, while KPML4BASE is the index of the + * first level 4 page that maps VM_MIN_KERNEL_ADDRESS. If NKPML4E + * is 1, these are the same, otherwise KPML4BASE < KPML4I and extra + * level 4 PDEs are needed to map from VM_MIN_KERNEL_ADDRESS up to + * KERNBASE. + * + * (KPML4I combines with KPDPI to choose where KERNBASE starts. + * Or, in other words, KPML4I provides bits 39..47 of KERNBASE, + * and KPDPI provides bits 30..38.) + */ +#define PML4PML4I (NPML4EPG/2) /* Index of recursive pml4 mapping */ + +#define KPML4BASE (NPML4EPG-NKPML4E) /* KVM at highest addresses */ +#define DMPML4I rounddown(KPML4BASE-NDMPML4E, NDMPML4E) /* Below KVM */ + +#define KPML4I (NPML4EPG-1) +#define KPDPI (NPDPEPG-2) /* kernbase at -2GB */ + +/* + * XXX doesn't really belong here I guess... + */ +#define ISA_HOLE_START 0xa0000 +#define ISA_HOLE_LENGTH (0x100000-ISA_HOLE_START) + +#define PMAP_PCID_NONE 0xffffffff +#define PMAP_PCID_KERN 0 +#define PMAP_PCID_OVERMAX 0x1000 + +#ifndef LOCORE + +#ifdef __FreeBSD__ +#include <sys/queue.h> +#include <sys/_cpuset.h> +#include <sys/_lock.h> +#include <sys/_mutex.h> + +#include <vm/_vm_radix.h> +#endif /* __FreeBSD__ */ + typedef u_int64_t pd_entry_t; typedef u_int64_t pt_entry_t; typedef u_int64_t pdp_entry_t; typedef u_int64_t pml4_entry_t; +/* + * Address of current address space page table maps and directories. 
+ */ +#ifdef _KERNEL +#define addr_PTmap (KVADDR(PML4PML4I, 0, 0, 0)) +#define addr_PDmap (KVADDR(PML4PML4I, PML4PML4I, 0, 0)) +#define addr_PDPmap (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, 0)) +#define addr_PML4map (KVADDR(PML4PML4I, PML4PML4I, PML4PML4I, PML4PML4I)) +#define addr_PML4pml4e (addr_PML4map + (PML4PML4I * sizeof(pml4_entry_t))) +#define PTmap ((pt_entry_t *)(addr_PTmap)) +#define PDmap ((pd_entry_t *)(addr_PDmap)) +#define PDPmap ((pd_entry_t *)(addr_PDPmap)) +#define PML4map ((pd_entry_t *)(addr_PML4map)) +#define PML4pml4e ((pd_entry_t *)(addr_PML4pml4e)) + +extern int nkpt; /* Initial number of kernel page tables */ +extern u_int64_t KPDPphys; /* physical address of kernel level 3 */ +extern u_int64_t KPML4phys; /* physical address of kernel level 4 */ + +/* + * virtual address to page table entry and + * to physical address. + * Note: these work recursively, thus vtopte of a pte will give + * the corresponding pde that in turn maps it. + */ +pt_entry_t *vtopte(vm_offset_t); #define vtophys(va) pmap_kextract(((vm_offset_t) (va))) -vm_paddr_t pmap_kextract(vm_offset_t va); +#ifndef __FreeBSD__ +extern vm_paddr_t pmap_kextract(vm_offset_t); +#endif + +#define pte_load_store(ptep, pte) atomic_swap_long(ptep, pte) +#define pte_load_clear(ptep) atomic_swap_long(ptep, 0) +#define pte_store(ptep, pte) do { \ + *(u_long *)(ptep) = (u_long)(pte); \ +} while (0) +#define pte_clear(ptep) pte_store(ptep, 0) + +#define pde_store(pdep, pde) pte_store(pdep, pde) + +extern pt_entry_t pg_nx; + +#endif /* _KERNEL */ + +#ifdef __FreeBSD__ +/* + * Pmap stuff + */ +struct pv_entry; +struct pv_chunk; + +/* + * Locks + * (p) PV list lock + */ +struct md_page { + TAILQ_HEAD(, pv_entry) pv_list; /* (p) */ + int pv_gen; /* (p) */ + int pat_mode; +}; +#endif /* __FreeBSD__ */ + +enum pmap_type { + PT_X86, /* regular x86 page tables */ + PT_EPT, /* Intel's nested page tables */ + PT_RVI, /* AMD's nested page tables */ +}; + +#ifdef __FreeBSD__ +struct pmap_pcids { + uint32_t pm_pcid; + uint32_t pm_gen; +}; + +/* + * The kernel virtual address (KVA) of the level 4 page table page is always + * within the direct map (DMAP) region. 
+ */ +struct pmap { + struct mtx pm_mtx; + pml4_entry_t *pm_pml4; /* KVA of level 4 page table */ + uint64_t pm_cr3; + TAILQ_HEAD(,pv_chunk) pm_pvchunk; /* list of mappings in pmap */ + cpuset_t pm_active; /* active on cpus */ + enum pmap_type pm_type; /* regular or nested tables */ + struct pmap_statistics pm_stats; /* pmap statistics */ + struct vm_radix pm_root; /* spare page table pages */ + long pm_eptgen; /* EPT pmap generation id */ + int pm_flags; + struct pmap_pcids pm_pcids[MAXCPU]; +}; +#endif /* __FreeBSD__ */ + +/* flags */ +#define PMAP_NESTED_IPIMASK 0xff +#define PMAP_PDE_SUPERPAGE (1 << 8) /* supports 2MB superpages */ +#define PMAP_EMULATE_AD_BITS (1 << 9) /* needs A/D bits emulation */ +#define PMAP_SUPPORTS_EXEC_ONLY (1 << 10) /* execute only mappings ok */ + +typedef struct pmap *pmap_t; + +#ifdef _KERNEL +extern struct pmap kernel_pmap_store; +#define kernel_pmap (&kernel_pmap_store) + +#define PMAP_LOCK(pmap) mtx_lock(&(pmap)->pm_mtx) +#define PMAP_LOCK_ASSERT(pmap, type) \ + mtx_assert(&(pmap)->pm_mtx, (type)) +#define PMAP_LOCK_DESTROY(pmap) mtx_destroy(&(pmap)->pm_mtx) +#define PMAP_LOCK_INIT(pmap) mtx_init(&(pmap)->pm_mtx, "pmap", \ + NULL, MTX_DEF | MTX_DUPOK) +#define PMAP_LOCKED(pmap) mtx_owned(&(pmap)->pm_mtx) +#define PMAP_MTX(pmap) (&(pmap)->pm_mtx) +#define PMAP_TRYLOCK(pmap) mtx_trylock(&(pmap)->pm_mtx) +#define PMAP_UNLOCK(pmap) mtx_unlock(&(pmap)->pm_mtx) + +int pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags); +int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); +#endif + +#ifdef __FreeBSD__ +/* + * For each vm_page_t, there is a list of all currently valid virtual + * mappings of that page. An entry is a pv_entry_t, the list is pv_list. + */ +typedef struct pv_entry { + vm_offset_t pv_va; /* virtual address for mapping */ + TAILQ_ENTRY(pv_entry) pv_next; +} *pv_entry_t; + +/* + * pv_entries are allocated in chunks per-process. This avoids the + * need to track per-pmap assignments. 
+ */ +#define _NPCM 3 +#define _NPCPV 168 +struct pv_chunk { + pmap_t pc_pmap; + TAILQ_ENTRY(pv_chunk) pc_list; + uint64_t pc_map[_NPCM]; /* bitmap; 1 = free */ + TAILQ_ENTRY(pv_chunk) pc_lru; + struct pv_entry pc_pventry[_NPCPV]; +}; + +#ifdef _KERNEL + +extern caddr_t CADDR1; +extern pt_entry_t *CMAP1; +extern vm_paddr_t phys_avail[]; +extern vm_paddr_t dump_avail[]; +extern vm_offset_t virtual_avail; +extern vm_offset_t virtual_end; +extern vm_paddr_t dmaplimit; +extern int pmap_pcid_enabled; +extern int invpcid_works; + +#define pmap_page_get_memattr(m) ((vm_memattr_t)(m)->md.pat_mode) +#define pmap_page_is_write_mapped(m) (((m)->aflags & PGA_WRITEABLE) != 0) +#define pmap_unmapbios(va, sz) pmap_unmapdev((va), (sz)) + +struct thread; + +void pmap_activate_sw(struct thread *); +void pmap_bootstrap(vm_paddr_t *); +int pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde); +int pmap_change_attr(vm_offset_t, vm_size_t, int); +void pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate); +void pmap_init_pat(void); +void pmap_kenter(vm_offset_t va, vm_paddr_t pa); +void *pmap_kenter_temporary(vm_paddr_t pa, int i); +vm_paddr_t pmap_kextract(vm_offset_t); +void pmap_kremove(vm_offset_t); +void *pmap_mapbios(vm_paddr_t, vm_size_t); +void *pmap_mapdev(vm_paddr_t, vm_size_t); +void *pmap_mapdev_attr(vm_paddr_t, vm_size_t, int); +boolean_t pmap_page_is_mapped(vm_page_t m); +void pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma); +void pmap_pinit_pml4(vm_page_t); +void pmap_unmapdev(vm_offset_t, vm_size_t); +void pmap_invalidate_page(pmap_t, vm_offset_t); +void pmap_invalidate_range(pmap_t, vm_offset_t, vm_offset_t); +void pmap_invalidate_all(pmap_t); +void pmap_invalidate_cache(void); +void pmap_invalidate_cache_pages(vm_page_t *pages, int count); +void pmap_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva, + boolean_t force); +void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); +boolean_t pmap_map_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +void pmap_unmap_io_transient(vm_page_t *, vm_offset_t *, int, boolean_t); +#endif /* _KERNEL */ + +/* Return various clipped indexes for a given VA */ +static __inline vm_pindex_t +pmap_pte_index(vm_offset_t va) +{ + + return ((va >> PAGE_SHIFT) & ((1ul << NPTEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pde_index(vm_offset_t va) +{ + + return ((va >> PDRSHIFT) & ((1ul << NPDEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pdpe_index(vm_offset_t va) +{ + + return ((va >> PDPSHIFT) & ((1ul << NPDPEPGSHIFT) - 1)); +} + +static __inline vm_pindex_t +pmap_pml4e_index(vm_offset_t va) +{ + + return ((va >> PML4SHIFT) & ((1ul << NPML4EPGSHIFT) - 1)); +} + +#endif /* __FreeBSD__ */ +#endif /* !LOCORE */ -#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ */ +#endif /* !_COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/reg.h b/usr/src/compat/freebsd/amd64/machine/reg.h new file mode 100644 index 0000000000..4a73463603 --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/reg.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_REG_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_REG_H_ + +#define DBREG_DR6_RESERVED1 0xffff0ff0 +#define DBREG_DR7_RESERVED1 0x0400 + + +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_REG_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/smp.h b/usr/src/compat/freebsd/amd64/machine/smp.h index ef719b9684..9c4f2d111b 100644 --- a/usr/src/compat/freebsd/amd64/machine/smp.h +++ b/usr/src/compat/freebsd/amd64/machine/smp.h @@ -11,9 +11,20 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ +#ifdef _KERNEL + +/* + * APIC-related functions are replaced with native calls rather than shims + * which attempt to replicate the FreeBSD interfaces. This is empty, but will + * remain present to appease sources which wish to include the path. + */ + +#endif /* _KERNEL */ + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/specialreg.h b/usr/src/compat/freebsd/amd64/machine/specialreg.h new file mode 100644 index 0000000000..871573ea6b --- /dev/null +++ b/usr/src/compat/freebsd/amd64/machine/specialreg.h @@ -0,0 +1,61 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SPECIALREG_H_ +#define _COMPAT_FREEBSD_AMD64_MACHINE_SPECIALREG_H_ + +#ifdef _SYS_X86_ARCHEXT_H +/* Our x86_archext conflicts with BSD header for the XFEATURE_ defines */ +#undef XFEATURE_AVX +#undef XFEATURE_MPX +#undef XFEATURE_AVX512 +#endif + +#ifdef _SYS_CONTROLREGS_H +/* Our CR4 defines conflict with BSD header */ +#undef CR4_VME +#undef CR4_PVI +#undef CR4_TSD +#undef CR4_DE +#undef CR4_PSE +#undef CR4_PAE +#undef CR4_MCE +#undef CR4_PGE +#undef CR4_PCE +#undef CR4_VMXE +#undef CR4_SMEP +#undef CR4_SMAP +#undef CR4_PKE +#undef CR4_PCIDE +#endif /* _SYS_CONTROLREGS_H */ + +#ifdef _SYS_X86_ARCHEXT_H +/* Our IA32 speculation-related defines conflict with BSD header */ +#undef IA32_ARCH_CAP_RDCL_NO +#undef IA32_ARCH_CAP_IBRS_ALL +#undef IA32_ARCH_CAP_RSBA +#undef IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY +#undef IA32_ARCH_CAP_SSB_NO +#undef IA32_ARCH_CAP_MDS_NO +#undef IA32_SPEC_CTRL_IBRS +#undef IA32_SPEC_CTRL_STIBP +#undef IA32_SPEC_CTRL_SSBD +#undef IA32_FLUSH_CMD_L1D +#undef MSR_IA32_SPEC_CTRL +#undef MSR_IA32_PRED_CMD +#endif /* _SYS_X86_ARCHEXT_H */ + +#include <x86/specialreg.h> +#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SPECIALREG_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmm.h b/usr/src/compat/freebsd/amd64/machine/vmm.h index 79c3ec959e..1c54c0830d 100644 --- a/usr/src/compat/freebsd/amd64/machine/vmm.h +++ b/usr/src/compat/freebsd/amd64/machine/vmm.h @@ -11,11 +11,14 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. 
*/ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ +#include <sys/_cpuset.h> + #include <sys/vmm.h> #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ */ diff --git a/usr/src/compat/freebsd/amd64/machine/vmparam.h b/usr/src/compat/freebsd/amd64/machine/vmparam.h index c80c2af545..c76a3259f3 100644 --- a/usr/src/compat/freebsd/amd64/machine/vmparam.h +++ b/usr/src/compat/freebsd/amd64/machine/vmparam.h @@ -11,9 +11,35 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ #define _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ +extern caddr_t kpm_vbase; +extern size_t kpm_size; + +static inline uintptr_t +phys_to_dmap(uintptr_t pa) +{ + ASSERT3U(pa, <, kpm_size); + return ((uintptr_t)kpm_vbase + pa); +} + +static inline uintptr_t +dmap_to_phys(uintptr_t kva) +{ + const uintptr_t base = (uintptr_t)kpm_vbase; + + ASSERT3U(kva, >=, base); + ASSERT3U(kva, <, base + kpm_size); + + return (kva - base); +} + +#define PHYS_TO_DMAP(x) phys_to_dmap(x) +#define DMAP_TO_PHYS(x) dmap_to_phys(x) + + #endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ */ diff --git a/usr/src/compat/freebsd/err.h b/usr/src/compat/freebsd/err.h new file mode 100644 index 0000000000..40d144e025 --- /dev/null +++ b/usr/src/compat/freebsd/err.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_ERR_H_ +#define _COMPAT_FREEBSD_ERR_H_ + +#define errc(code, num, ...) err(code, __VA_ARGS__) + +#include_next <err.h> + +#endif /* _COMPAT_FREEBSD_ERR_H_ */ diff --git a/usr/src/compat/freebsd/libutil.h b/usr/src/compat/freebsd/libutil.h index e22ffc0551..f899d4425e 100644 --- a/usr/src/compat/freebsd/libutil.h +++ b/usr/src/compat/freebsd/libutil.h @@ -17,5 +17,19 @@ #define _COMPAT_FREEBSD_LIBUTIL_H_ int expand_number(const char *_buf, uint64_t *_num); +int humanize_number(char *_buf, size_t _len, int64_t _number, + const char *_suffix, int _scale, int _flags); + +/* Values for humanize_number(3)'s flags parameter. */ +#define HN_DECIMAL 0x01 +#define HN_NOSPACE 0x02 +#define HN_B 0x04 +#define HN_DIVISOR_1000 0x08 +#define HN_IEC_PREFIXES 0x10 + +/* Values for humanize_number(3)'s scale parameter. */ +#define HN_GETSCALE 0x10 +#define HN_AUTOSCALE 0x20 + #endif /* _COMPAT_FREEBSD_LIBUTIL_H_ */ diff --git a/usr/src/compat/freebsd/net/ethernet.h b/usr/src/compat/freebsd/net/ethernet.h index a0d5a828c6..dcd3a58925 100644 --- a/usr/src/compat/freebsd/net/ethernet.h +++ b/usr/src/compat/freebsd/net/ethernet.h @@ -11,11 +11,25 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ #define _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ +#define ether_addr_octet octet + #include <sys/ethernet.h> +/* + * Some basic Ethernet constants. 
+ */ +#define ETHER_ADDR_LEN 6 /* length of an Ethernet address */ +#define ETHER_CRC_LEN 4 /* length of the Ethernet CRC */ +#define ETHER_MIN_LEN 64 /* minimum frame len, including CRC */ + +#define ETHER_VLAN_ENCAP_LEN 4 /* len of 802.1Q VLAN encapsulation */ + +#define ETHER_IS_MULTICAST(addr) (*(addr) & 0x01) /* is address mcast/bcast? */ + #endif /* _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ */ diff --git a/usr/src/compat/freebsd/pthread_np.h b/usr/src/compat/freebsd/pthread_np.h index 641c58f406..c4f76b259c 100644 --- a/usr/src/compat/freebsd/pthread_np.h +++ b/usr/src/compat/freebsd/pthread_np.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_PTHREAD_NP_H_ @@ -20,8 +21,9 @@ #include <sys/cpuset.h> #include <synch.h> +#include <pthread.h> -#define pthread_set_name_np(thread, name) +#define pthread_set_name_np pthread_setname_np #define pthread_mutex_isowned_np(x) _mutex_held(x) diff --git a/usr/src/compat/freebsd/sys/_cpuset.h b/usr/src/compat/freebsd/sys/_cpuset.h new file mode 100644 index 0000000000..286d26fc00 --- /dev/null +++ b/usr/src/compat/freebsd/sys/_cpuset.h @@ -0,0 +1,33 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS__CPUSET_H_ +#define _COMPAT_FREEBSD_SYS__CPUSET_H_ + +#ifdef _KERNEL +/* + * The sys/_cpuset.h header is used to communicate the layout of cpuset_t while + * sys/cpuset.h contains the manipulation routines. + * + * The explicit guard definition below is necessary as other contrib headers + * change their behavior based on its presence. + */ +#define _SYS__CPUSET_H_ + +#include <sys/cpuvar.h> + +#endif /* _KERNEL */ + +#endif /* _COMPAT_FREEBSD_SYS__CPUSET_H_ */ diff --git a/usr/src/compat/freebsd/sys/callout.h b/usr/src/compat/freebsd/sys/callout.h index 17b6e31507..6087a09f54 100644 --- a/usr/src/compat/freebsd/sys/callout.h +++ b/usr/src/compat/freebsd/sys/callout.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_CALLOUT_H_ @@ -41,6 +42,9 @@ int vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, int vmm_glue_callout_stop(struct callout *c); int vmm_glue_callout_drain(struct callout *c); +/* illumos-custom function for resource locality optimization */ +void vmm_glue_callout_localize(struct callout *c); + static __inline void callout_init(struct callout *c, int mpsafe) { diff --git a/usr/src/compat/freebsd/sys/cdefs.h b/usr/src/compat/freebsd/sys/cdefs.h index 974e323dbe..0b857437e3 100644 --- a/usr/src/compat/freebsd/sys/cdefs.h +++ b/usr/src/compat/freebsd/sys/cdefs.h @@ -11,48 +11,67 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_CDEFS_H_ #define _COMPAT_FREEBSD_SYS_CDEFS_H_ +/* + * Testing against Clang-specific extensions. + */ +#ifndef __has_extension +#define __has_extension __has_feature +#endif +#ifndef __has_feature +#define __has_feature(x) 0 +#endif + +/* + * Macro to test if we're using a specific version of gcc or later. 
+ */ +#if defined(__GNUC__) && !defined(__INTEL_COMPILER) +#define __GNUC_PREREQ__(ma, mi) \ + (__GNUC__ > (ma) || __GNUC__ == (ma) && __GNUC_MINOR__ >= (mi)) +#else +#define __GNUC_PREREQ__(ma, mi) 0 +#endif + #define __FBSDID(s) #ifdef __GNUC__ +#define asm __asm #define inline __inline #define __GNUCLIKE___SECTION 1 #define __dead2 __attribute__((__noreturn__)) -#define __unused __attribute__((__unused__)) #define __used __attribute__((__used__)) #define __packed __attribute__((__packed__)) #define __aligned(x) __attribute__((__aligned__(x))) #define __section(x) __attribute__((__section__(x))) +#define __weak_symbol __attribute__((__weak__)) #endif -/* - * The __CONCAT macro is used to concatenate parts of symbol names, e.g. - * with "#define OLD(foo) __CONCAT(old,foo)", OLD(foo) produces oldfoo. - * The __CONCAT macro is a bit tricky to use if it must work in non-ANSI - * mode -- there must be no spaces between its arguments, and for nested - * __CONCAT's, all the __CONCAT's must be at the left. __CONCAT can also - * concatenate double-quoted strings produced by the __STRING macro, but - * this only works with ANSI C. - * - * __XSTRING is like __STRING, but it expands any macros in its argument - * first. It is only available with ANSI C. - */ -#if defined(__STDC__) || defined(__cplusplus) -#define __P(protos) protos /* full-blown ANSI C */ -#define __CONCAT1(x,y) x ## y -#define __CONCAT(x,y) __CONCAT1(x,y) -#define __STRING(x) #x /* stringify without expanding x */ -#define __XSTRING(x) __STRING(x) /* expand x, then stringify */ -#else /* !(__STDC__ || __cplusplus) */ -#define __P(protos) () /* traditional C preprocessor */ -#define __CONCAT(x,y) x/**/y -#define __STRING(x) "x" -#endif /* !(__STDC__ || __cplusplus) */ +#if !defined(__STDC_VERSION__) || __STDC_VERSION__ < 201112L || defined(lint) + +#if !__has_extension(c_static_assert) +#if (defined(__cplusplus) && __cplusplus >= 201103L) || \ + __has_extension(cxx_static_assert) +#define _Static_assert(x, y) static_assert(x, y) +#elif __GNUC_PREREQ__(4,6) +/* Nothing, gcc 4.6 and higher has _Static_assert built-in */ +#elif defined(__COUNTER__) +#define _Static_assert(x, y) __Static_assert(x, __COUNTER__) +#define __Static_assert(x, y) ___Static_assert(x, y) +#define ___Static_assert(x, y) typedef char __assert_ ## y[(x) ? 1 : -1] \ + __unused +#else +#define _Static_assert(x, y) struct __hack +#endif +#endif +#define static_assert(x, y) _Static_assert(x, y) + +#endif /* __STDC_VERSION__ || __STDC_VERSION__ < 201112L */ #endif /* _COMPAT_FREEBSD_SYS_CDEFS_H_ */ diff --git a/usr/src/compat/freebsd/sys/clock.h b/usr/src/compat/freebsd/sys/clock.h new file mode 100644 index 0000000000..ebf7f171a3 --- /dev/null +++ b/usr/src/compat/freebsd/sys/clock.h @@ -0,0 +1,110 @@ +/*- + * Copyright (c) 1996 The NetBSD Foundation, Inc. + * All rights reserved. + * + * This code is derived from software contributed to The NetBSD Foundation + * by Gordon W. Ross + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. 
AND CONTRIBUTORS
+ * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
+ * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+ * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
+ * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $NetBSD: clock_subr.h,v 1.7 2000/10/03 13:41:07 tsutsui Exp $
+ *
+ *
+ * This file is the central clearing-house for calendrical issues.
+ *
+ * In general the kernel does not know about minutes, hours, days, timezones,
+ * daylight savings time, leap-years and such. All that is theoretically a
+ * matter for userland only.
+ *
+ * Parts of kernel code do however care: badly designed filesystems store
+ * timestamps in local time and RTC chips sometimes track time in a local
+ * timezone instead of UTC and so on.
+ *
+ * All that code should go here for service.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_CLOCK_H_
+#define _COMPAT_FREEBSD_SYS_CLOCK_H_
+
+#include_next <sys/clock.h>
+
+#ifdef _KERNEL /* No user serviceable parts */
+
+#ifdef __FreeBSD__
+/*
+ * Timezone info from settimeofday(2), usually not used
+ */
+extern int tz_minuteswest;
+extern int tz_dsttime;
+extern struct mtx resettodr_lock;
+
+int utc_offset(void);
+#endif /* __FreeBSD__ */
+
+/*
+ * Structure to hold the values typically reported by time-of-day clocks.
+ * This can be passed to the generic conversion functions to be converted
+ * to a struct timespec.
+ */
+struct clocktime {
+	int year;	/* year (4 digit year) */
+	int mon;	/* month (1 - 12) */
+	int day;	/* day (1 - 31) */
+	int hour;	/* hour (0 - 23) */
+	int min;	/* minute (0 - 59) */
+	int sec;	/* second (0 - 59) */
+	int dow;	/* day of week (0 - 6; 0 = Sunday) */
+	long nsec;	/* nanoseconds */
+};
+
+int clock_ct_to_ts(struct clocktime *, struct timespec *);
+void clock_ts_to_ct(struct timespec *, struct clocktime *);
+#ifdef __FreeBSD__
+void clock_register(device_t, long);
+#endif
+
+#ifndef __FreeBSD__
+extern u_char const bin2bcd_data[];
+#define bin2bcd(x) (bin2bcd_data[x])
+#endif
+
+/*
+ * BCD to decimal and decimal to BCD.
+ */
+#define FROMBCD(x) bcd2bin(x)
+#define TOBCD(x) bin2bcd(x)
+
+/* Some handy constants. */
+#define SECDAY (24 * 60 * 60)
+#define SECYR (SECDAY * 365)
+
+/* Traditional POSIX base year */
+#define POSIX_BASE_YEAR 1970
+
+void timespec2fattime(struct timespec *tsp, int utc, u_int16_t *ddp, u_int16_t *dtp, u_int8_t *dhp);
+void fattime2timespec(unsigned dd, unsigned dt, unsigned dh, int utc, struct timespec *tsp);
+
+#endif /* _KERNEL */
+
+#endif /* _COMPAT_FREEBSD_SYS_CLOCK_H_ */
diff --git a/usr/src/compat/freebsd/sys/cpuset.h b/usr/src/compat/freebsd/sys/cpuset.h
index 8527624b5e..626b323d7d 100644
--- a/usr/src/compat/freebsd/sys/cpuset.h
+++ b/usr/src/compat/freebsd/sys/cpuset.h
@@ -11,6 +11,7 @@

 /*
  * Copyright 2014 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
 */

 #ifndef _COMPAT_FREEBSD_SYS_CPUSET_H_
@@ -19,26 +20,115 @@
 #define NOCPU -1

 #ifdef _KERNEL
-#define CPU_SET(cpu, set) CPUSET_ADD(*(set), cpu)
-#define CPU_SETOF(cpu, set) CPUSET_ONLY(*(set), cpu)
-#define CPU_ZERO(set) CPUSET_ZERO(*(set))
-#define CPU_CLR(cpu, set) CPUSET_DEL(*(set), cpu)
+
+#include <sys/_cpuset.h>
+
+#define CPU_SET(cpu, set) cpuset_add((set), (cpu))
+#define CPU_SETOF(cpu, set) cpuset_only((set), (cpu))
+#define CPU_ZERO(set) cpuset_zero((cpuset_t *)(set))
+#define CPU_CLR(cpu, set) cpuset_del((set), (cpu))
+#define CPU_EMPTY(set) cpuset_isnull((set))
 #define CPU_FFS(set) cpusetobj_ffs(set)
-#define CPU_ISSET(cpu, set) CPU_IN_SET(*(set), cpu)
-#define CPU_CMP(set1, set2) CPUSET_ISEQUAL(*(set1), *(set2))
-#define CPU_SET_ATOMIC(cpu, set) CPUSET_ATOMIC_ADD(*(set), cpu)
+#define CPU_ISSET(cpu, set) cpu_in_set((cpuset_t *)(set), (cpu))
+#define CPU_AND(dst, src) cpuset_and( \
+			(cpuset_t *)(dst), \
+			(cpuset_t *)(src))
+#define CPU_OR(dst, src) cpuset_or( \
+			(cpuset_t *)(dst), \
+			(cpuset_t *)(src))
+#define CPU_CMP(set1, set2) (cpuset_isequal( \
+			(cpuset_t *)(set1), \
+			(cpuset_t *)(set2)) == 0)
+#define CPU_SET_ATOMIC(cpu, set) cpuset_atomic_add( \
+			(cpuset_t *)(set), \
+			(cpu))
+#define CPU_CLR_ATOMIC(cpu, set) cpuset_atomic_del( \
+			(cpuset_t *)(set), \
+			(cpu))
+
+#define CPU_SET_ATOMIC_ACQ(cpu, set) cpuset_atomic_add((set), (cpu))

-#include <sys/cpuvar.h>
 int cpusetobj_ffs(const cpuset_t *set);
+
 #else
+
+#include <sys/bitmap.h>
 #include <machine/atomic.h>
+#include <machine/cpufunc.h>
+
+/* For now, assume NCPU of 256 */
+#define CPU_SETSIZE (256)
+
+typedef struct {
+	ulong_t _bits[BT_BITOUL(CPU_SETSIZE)];
+} cpuset_t;
+
+static __inline int
+cpuset_isempty(const cpuset_t *set)
+{
+	uint_t i;

-typedef int cpuset_t;
+	for (i = 0; i < BT_BITOUL(CPU_SETSIZE); i++) {
+		if (set->_bits[i] != 0)
+			return (0);
+	}
+	return (1);
+}

-#define CPUSET(cpu) (1UL << (cpu))
+static __inline void
+cpuset_zero(cpuset_t *dst)
+{
+	uint_t i;
+
+	for (i = 0; i < BT_BITOUL(CPU_SETSIZE); i++) {
+		dst->_bits[i] = 0;
+	}
+}
+
+static __inline int
+cpuset_isequal(cpuset_t *s1, cpuset_t *s2)
+{
+	uint_t i;
+
+	for (i = 0; i < BT_BITOUL(CPU_SETSIZE); i++) {
+		if (s1->_bits[i] != s2->_bits[i])
+			return (0);
+	}
+	return (1);
+}
+
+static __inline uint_t
+cpusetobj_ffs(const cpuset_t *set)
+{
+	uint_t i, cbit;
+
+	cbit = 0;
+	for (i = 0; i < BT_BITOUL(CPU_SETSIZE); i++) {
+		if (set->_bits[i] != 0) {
+			cbit = ffsl(set->_bits[i]);
+			cbit += i * BT_NBIPUL;
+			break;
+		}
+	}
+	return (cbit);
+}
+
+
+#define CPU_SET(cpu, setp) BT_SET((setp)->_bits, cpu)
+#define CPU_CLR(cpu, setp) BT_CLEAR((setp)->_bits, cpu)
+#define CPU_ZERO(setp) cpuset_zero((setp))
+#define CPU_CMP(set1, set2) (cpuset_isequal( \
+			(cpuset_t *)(set1), \
+			(cpuset_t *)(set2)) == 0)
+#define CPU_FFS(set) cpusetobj_ffs(set)
+#define CPU_ISSET(cpu, setp) BT_TEST((setp)->_bits, cpu)
+#define CPU_EMPTY(setp) cpuset_isempty((setp))
+#define CPU_SET_ATOMIC(cpu, setp) \
+	atomic_set_long(&(BT_WIM((setp)->_bits, cpu)), BT_BIW(cpu))
+#define CPU_CLR_ATOMIC(cpu, setp) \
+	atomic_clear_long(&(BT_WIM((setp)->_bits, cpu)), BT_BIW(cpu))

-#define CPU_SET_ATOMIC(cpu, set) atomic_set_int((set), CPUSET(cpu))
 #endif

 #endif /* _COMPAT_FREEBSD_SYS_CPUSET_H_ */
diff --git a/usr/src/compat/freebsd/sys/endian.h b/usr/src/compat/freebsd/sys/endian.h
index a31bff55d6..24ea02d251 100644
--- a/usr/src/compat/freebsd/sys/endian.h
+++ b/usr/src/compat/freebsd/sys/endian.h
@@ -11,6 +11,7 @@

 /*
  * Copyright 2014 Pluribus Networks Inc.
+ * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_ENDIAN_H_ @@ -122,4 +123,14 @@ le64enc(void *pp, uint64_t u) le32enc(p + 4, (uint32_t)(u >> 32)); } +#ifdef _LITTLE_ENDIAN +#define htole16(x) ((uint16_t)(x)) +#define htole32(x) ((uint32_t)(x)) +#define htole64(x) ((uint64_t)(x)) + +#define le16toh(x) ((uint16_t)(x)) +#define le32toh(x) ((uint32_t)(x)) +#define le64toh(x) ((uint64_t)(x)) +#endif + #endif /* _COMPAT_FREEBSD_SYS_ENDIAN_H_ */ diff --git a/usr/src/compat/freebsd/sys/eventhandler.h b/usr/src/compat/freebsd/sys/eventhandler.h new file mode 100644 index 0000000000..133aa664f0 --- /dev/null +++ b/usr/src/compat/freebsd/sys/eventhandler.h @@ -0,0 +1,19 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_EVENTHANDLER_H_ +#define _COMPAT_FREEBSD_SYS_EVENTHANDLER_H_ + +#endif /* _COMPAT_FREEBSD_SYS_EVENTHANDLER_H_ */ diff --git a/usr/src/compat/freebsd/sys/ioctl.h b/usr/src/compat/freebsd/sys/ioctl.h index e223e1e4c7..72a46b8085 100644 --- a/usr/src/compat/freebsd/sys/ioctl.h +++ b/usr/src/compat/freebsd/sys/ioctl.h @@ -17,6 +17,8 @@ #define _COMPAT_FREEBSD_SYS_IOCTL_H_ #include <sys/ioccom.h> +/* Get BSD compatibility from the ioctl header */ +#define BSD_COMP #include_next <sys/ioctl.h> #endif /* _COMPAT_FREEBSD_SYS_IOCTL_H_ */ diff --git a/usr/src/compat/freebsd/sys/kernel.h b/usr/src/compat/freebsd/sys/kernel.h index b1c07674e4..adf96f40fc 100644 --- a/usr/src/compat/freebsd/sys/kernel.h +++ b/usr/src/compat/freebsd/sys/kernel.h @@ -11,15 +11,32 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_KERNEL_H_ #define _COMPAT_FREEBSD_SYS_KERNEL_H_ -#define SYSINIT(uniquifier, subsystem, order, func, ident) +#define TUNABLE_INT_FETCH(path, var) #include <sys/linker_set.h> +typedef void (*sysinit_func_t)(const void *); + +struct sysinit { + const sysinit_func_t func; + const void *data; +}; + +#define SYSINIT(uniquifier, subsystem, order, func, ident) \ + static struct sysinit uniquifier ## _sys_init = { \ + (const sysinit_func_t)func, \ + (const void *)&(ident) \ + }; \ + DATA_SET(sysinit_set, uniquifier ## _sys_init); + +extern void sysinit(void); + #define ticks ddi_get_lbolt() #endif /* _COMPAT_FREEBSD_SYS_KERNEL_H_ */ diff --git a/usr/src/compat/freebsd/sys/limits.h b/usr/src/compat/freebsd/sys/limits.h index 99ae0f4d64..0e66319791 100644 --- a/usr/src/compat/freebsd/sys/limits.h +++ b/usr/src/compat/freebsd/sys/limits.h @@ -11,9 +11,14 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_LIMITS_H_ #define _COMPAT_FREEBSD_SYS_LIMITS_H_ +#include_next <limits.h> + +#define OFF_MAX ((off_t)-1) + #endif /* _COMPAT_FREEBSD_SYS_LIMITS_H_ */ diff --git a/usr/src/compat/freebsd/sys/lock.h b/usr/src/compat/freebsd/sys/lock.h new file mode 100644 index 0000000000..fd6021a87e --- /dev/null +++ b/usr/src/compat/freebsd/sys/lock.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_LOCK_H_ +#define _COMPAT_FREEBSD_SYS_LOCK_H_ + +#include_next <sys/lock.h> + +#define WITNESS_WARN(...) + +#endif /* _COMPAT_FREEBSD_SYS_LOCK_H_ */ diff --git a/usr/src/compat/freebsd/sys/malloc.h b/usr/src/compat/freebsd/sys/malloc.h index 579df44533..341d57b807 100644 --- a/usr/src/compat/freebsd/sys/malloc.h +++ b/usr/src/compat/freebsd/sys/malloc.h @@ -39,6 +39,11 @@ struct malloc_type { void free(void *addr, struct malloc_type *type); void *malloc(unsigned long size, struct malloc_type *type, int flags); void *old_malloc(unsigned long size, struct malloc_type *type , int flags); +void *contigmalloc(unsigned long, struct malloc_type *, int, vm_paddr_t, + vm_paddr_t, unsigned long, vm_paddr_t); +void contigfree(void *, unsigned long, struct malloc_type *); + + #endif /* _KERNEL */ #endif /* _COMPAT_FREEBSD_SYS_MALLOC_H_ */ diff --git a/usr/src/compat/freebsd/sys/mutex.h b/usr/src/compat/freebsd/sys/mutex.h index b99884b652..9e588cb98a 100644 --- a/usr/src/compat/freebsd/sys/mutex.h +++ b/usr/src/compat/freebsd/sys/mutex.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_MUTEX_H_ @@ -28,15 +29,11 @@ struct mtx; void mtx_init(struct mtx *, char *name, const char *type_name, int opts); void mtx_destroy(struct mtx *); -int mtx_sleep(void *chan, struct mtx *mtx, int priority, const char *wmesg, - int timo); - #endif /* KERNEL */ #include_next <sys/mutex.h> #ifdef _KERNEL struct mtx { - kmutex_type_t t; kmutex_t m; }; diff --git a/usr/src/compat/freebsd/sys/param.h b/usr/src/compat/freebsd/sys/param.h index f09e9183f6..b125f9014f 100644 --- a/usr/src/compat/freebsd/sys/param.h +++ b/usr/src/compat/freebsd/sys/param.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_PARAM_H_ @@ -18,8 +19,11 @@ #ifndef _KERNEL #define MAXCOMLEN 16 +/* default value of the kernel tunable 'maxphys' in i86pc */ +#define MAXPHYS (56 * 1024) #endif #define MAXHOSTNAMELEN 256 +#define SPECNAMELEN 63 #ifdef _KERNEL #include <sys/time.h> @@ -36,13 +40,18 @@ #define nitems(x) (sizeof((x)) / sizeof((x)[0])) #define rounddown(x,y) (((x)/(y))*(y)) +#define rounddown2(x, y) ((x)&(~((y)-1))) /* if y is power of two */ #define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */ #define roundup2(x,y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */ +#define powerof2(x) ((((x)-1)&(x))==0) /* Macros for min/max. */ #define MIN(a,b) (((a)<(b))?(a):(b)) #define MAX(a,b) (((a)>(b))?(a):(b)) +#define trunc_page(x) ((unsigned long)(x) & ~(PAGE_MASK)) +#define ptoa(x) ((unsigned long)(x) << PAGE_SHIFT) + #include_next <sys/param.h> #endif /* _COMPAT_FREEBSD_SYS_PARAM_H_ */ diff --git a/usr/src/compat/freebsd/sys/sdt.h b/usr/src/compat/freebsd/sys/sdt.h new file mode 100644 index 0000000000..32d887c0d8 --- /dev/null +++ b/usr/src/compat/freebsd/sys/sdt.h @@ -0,0 +1,37 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_SDT_H_ +#define _COMPAT_FREEBSD_SYS_SDT_H_ + +/* Empty macros to cover FreeBSD's SDT linker tricks */ + +#define SDT_PROVIDER_DECLARE(mod) +#define SDT_PROVIDER_DEFINE(mod) + +#define SDT_PROBE_DEFINE1(...) +#define SDT_PROBE_DEFINE2(...) +#define SDT_PROBE_DEFINE3(...) +#define SDT_PROBE_DEFINE4(...) +#define SDT_PROBE_DEFINE5(...) +#define SDT_PROBE1(...) +#define SDT_PROBE2(...) +#define SDT_PROBE3(...) +#define SDT_PROBE4(...) +#define SDT_PROBE5(...) + +#include_next <sys/sdt.h> + +#endif /* _COMPAT_FREEBSD_SYS_SDT_H_ */ diff --git a/usr/src/compat/freebsd/sys/sglist.h b/usr/src/compat/freebsd/sys/sglist.h new file mode 100644 index 0000000000..519c67915f --- /dev/null +++ b/usr/src/compat/freebsd/sys/sglist.h @@ -0,0 +1,29 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_SYS_SGLIST_H_ +#define _COMPAT_FREEBSD_SYS_SGLIST_H_ + +#ifdef _KERNEL + +struct sglist; + +struct sglist *sglist_alloc(int, int); +void sglist_free(struct sglist *); +int sglist_append_phys(struct sglist *, vm_paddr_t, size_t); + +#endif /* _KERNEL */ + +#endif /* _COMPAT_FREEBSD_SYS_SGLIST_H_ */ diff --git a/usr/src/compat/freebsd/sys/smp.h b/usr/src/compat/freebsd/sys/smp.h index 46183e8677..3d6413ce16 100644 --- a/usr/src/compat/freebsd/sys/smp.h +++ b/usr/src/compat/freebsd/sys/smp.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _COMPAT_FREEBSD_SYS_SMP_H_ @@ -18,10 +19,7 @@ #include <sys/cpuset.h> -void smp_rendezvous(void (*)(void *), - void (*)(void *), - void (*)(void *), - void *arg); +#define IPI_AST 0 void ipi_cpu(int cpu, u_int ipi); diff --git a/usr/src/compat/freebsd/sys/socket.h b/usr/src/compat/freebsd/sys/socket.h new file mode 100644 index 0000000000..3bf7a8f440 --- /dev/null +++ b/usr/src/compat/freebsd/sys/socket.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + +#ifndef _COMPAT_FREEBSD_SYS_SOCKET_H +#define _COMPAT_FREEBSD_SYS_SOCKET_H + +#include_next <sys/socket.h> + +#define SO_NOSIGPIPE 0 + +#endif /* _COMPAT_FREEBSD_SYS_SOCKET_H */ diff --git a/usr/src/compat/freebsd/sys/systm.h b/usr/src/compat/freebsd/sys/systm.h index e25acc0e4a..43fa16d450 100644 --- a/usr/src/compat/freebsd/sys/systm.h +++ b/usr/src/compat/freebsd/sys/systm.h @@ -28,18 +28,9 @@ struct mtx; panic msg; \ } while (0) -#define CTASSERT(x) _CTASSERT(x, __LINE__) -#define _CTASSERT(x,y) __CTASSERT(x,y) -#define __CTASSERT(x,y) typedef char __assert ## y[(x) ? 1 : -1] - void critical_enter(void); void critical_exit(void); -int msleep_spin(void *chan, struct mtx *mutex, const char *wmesg, - int ticks); -void wakeup(void *chan); -void wakeup_one(void *chan); - struct unrhdr *new_unrhdr(int low, int high, struct mtx *mutex); void delete_unrhdr(struct unrhdr *uh); int alloc_unr(struct unrhdr *uh); diff --git a/usr/src/compat/freebsd/sys/time.h b/usr/src/compat/freebsd/sys/time.h index f8f9da5cdf..4e0fbfc02c 100644 --- a/usr/src/compat/freebsd/sys/time.h +++ b/usr/src/compat/freebsd/sys/time.h @@ -50,7 +50,13 @@ binuptime(struct bintime *bt) ((a)->frac cmp (b)->frac) : \ ((a)->sec cmp (b)->sec)) -#define SBT_1US (1000) +#define SBT_1S ((sbintime_t)1 << 32) +#define SBT_1M (SBT_1S * 60) +#define SBT_1MS (SBT_1S / 1000) +#define SBT_1US (SBT_1S / 1000000) +#define SBT_1NS (SBT_1S / 1000000000) +#define SBT_MAX 0x7fffffffffffffffLL + static __inline void bintime_add(struct bintime *bt, const struct bintime *bt2) @@ -91,14 +97,28 @@ bintime_mul(struct bintime *bt, u_int x) static __inline sbintime_t bttosbt(const struct bintime bt) { - return ((bt.sec * 1000000000) + - (((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32)); + return (((sbintime_t)bt.sec << 32) + (bt.frac >> 32)); +} + +static __inline struct bintime +sbttobt(sbintime_t _sbt) +{ + struct bintime _bt; + + _bt.sec = _sbt >> 32; + _bt.frac = _sbt << 32; + return (_bt); } static __inline sbintime_t sbinuptime(void) { - return (gethrtime()); + hrtime_t hrt = gethrtime(); + uint64_t sec = hrt / NANOSEC; + uint64_t nsec = hrt % NANOSEC; + + return (((sbintime_t)sec << 32) + + (nsec * (((uint64_t)1 << 63) / 500000000) >> 32)); } #endif /* _COMPAT_FREEBSD_SYS_TIME_H_ */ diff --git a/usr/src/compat/freebsd/sys/types.h b/usr/src/compat/freebsd/sys/types.h index 6fc8179f2e..63731da42e 100644 --- a/usr/src/compat/freebsd/sys/types.h +++ b/usr/src/compat/freebsd/sys/types.h @@ -11,6 +11,7 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _COMPAT_FREEBSD_SYS_TYPES_H_ @@ -53,6 +54,16 @@ typedef __vm_ooffset_t vm_ooffset_t; typedef __vm_paddr_t vm_paddr_t; #endif +#ifndef __VM_PINDEX_T_DEFINED +#define __VM_PINDEX_T_DEFINED +typedef __uint64_t vm_pindex_t; +#endif + +#ifndef __VM_SIZE_T_DEFINED +#define __VM_SIZE_T_DEFINED +typedef __vm_size_t vm_size_t; +#endif + #ifndef __VM_MEMATTR_T_DEFINED #define __VM_MEMATTR_T_DEFINED typedef char vm_memattr_t; @@ -65,8 +76,8 @@ typedef char vm_memattr_t; typedef _Bool bool; #endif -#if defined(_KERNEL) && !defined(offsetof) -#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#if defined(_KERNEL) +typedef struct __dev_info **device_t; #endif #include_next <sys/types.h> diff --git a/usr/src/compat/freebsd/unistd.h b/usr/src/compat/freebsd/unistd.h new file mode 100644 index 0000000000..b4357e1da5 --- /dev/null +++ b/usr/src/compat/freebsd/unistd.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _COMPAT_FREEBSD_UNISTD_H +#define _COMPAT_FREEBSD_UNISTD_H + +#define setproctitle(fmt, ...) + +#include_next <unistd.h> + +#endif /* _COMPAT_FREEBSD_UNISTD_H */ diff --git a/usr/src/compat/freebsd/vm/vm.h b/usr/src/compat/freebsd/vm/vm.h index 7da22099b6..f5bb7b6eb8 100644 --- a/usr/src/compat/freebsd/vm/vm.h +++ b/usr/src/compat/freebsd/vm/vm.h @@ -11,23 +11,48 @@ /* * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _FREEBSD_VM_VM_H_ #define _FREEBSD_VM_VM_H_ #include <machine/vm.h> +#include <sys/mman.h> typedef u_char vm_prot_t; +/* + * Even though the FreeBSD VM_PROT defines happen to match illumos, this + * references the native values directly so there's no risk of breakage. + */ #define VM_PROT_NONE ((vm_prot_t) 0x00) -#define VM_PROT_READ ((vm_prot_t) 0x01) -#define VM_PROT_WRITE ((vm_prot_t) 0x02) -#define VM_PROT_EXECUTE ((vm_prot_t) 0x04) +#define VM_PROT_READ ((vm_prot_t) PROT_READ) +#define VM_PROT_WRITE ((vm_prot_t) PROT_WRITE) +#define VM_PROT_EXECUTE ((vm_prot_t) PROT_EXEC) #define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE) #define VM_PROT_RW (VM_PROT_READ|VM_PROT_WRITE) +struct vm_page; +typedef struct vm_page *vm_page_t; + +enum obj_type { OBJT_DEFAULT, OBJT_SWAP, OBJT_VNODE, OBJT_DEVICE, OBJT_PHYS, + OBJT_DEAD, OBJT_SG, OBJT_MGTDEVICE }; +typedef u_char objtype_t; + +union vm_map_object; +typedef union vm_map_object vm_map_object_t; + +struct vm_map_entry; +typedef struct vm_map_entry *vm_map_entry_t; + +struct vm_map; +typedef struct vm_map *vm_map_t; + +struct vm_object; +typedef struct vm_object *vm_object_t; + /* * <sys/promif.h> contains a troublesome preprocessor define for BYTE. * Do this ugly workaround to avoid it. 
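
The vm.h hunk above deliberately redefines the FreeBSD VM_PROT_* bits in terms of the native PROT_* constants, on the strength of the comment's observation that the two sets of values coincide. As a minimal illustrative sketch (not part of the patch), that assumption could be pinned down at compile time with the C11 _Static_assert that the sys/cdefs.h shim earlier in this change also provides for pre-C11 compilers:

/*
 * Illustrative only -- not from this commit.  On illumos, <sys/mman.h>
 * defines PROT_READ/PROT_WRITE/PROT_EXEC as 0x1/0x2/0x4, which is exactly
 * what the compat VM_PROT_* defines above rely on.
 */
#include <sys/mman.h>

_Static_assert(PROT_READ == 0x01, "PROT_READ must match VM_PROT_READ");
_Static_assert(PROT_WRITE == 0x02, "PROT_WRITE must match VM_PROT_WRITE");
_Static_assert(PROT_EXEC == 0x04, "PROT_EXEC must match VM_PROT_EXECUTE");

If the constants ever diverged, such a check would turn a silent protection-bit mismatch into a build failure.
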
diff --git a/usr/src/compat/freebsd/vm/vm_param.h b/usr/src/compat/freebsd/vm/vm_param.h
new file mode 100644
index 0000000000..fd76b62a37
--- /dev/null
+++ b/usr/src/compat/freebsd/vm/vm_param.h
@@ -0,0 +1,21 @@
+#ifndef _COMPAT_FREEBSD_VM_VM_PARAM_H_
+#define _COMPAT_FREEBSD_VM_VM_PARAM_H_
+
+#include <machine/vmparam.h>
+
+#define KERN_SUCCESS 0
+
+/* Not a direct correlation, but the primary necessity is being non-zero */
+#define KERN_RESOURCE_SHORTAGE ENOMEM
+
+/*
+ * The VM_MAXUSER_ADDRESS is used to determine the upper size limit of a
+ * vmspace, their 'struct as' equivalent. The compat value is sized well below
+ * our native userlimit, even halving the available space below the VA hole.
+ * This is to avoid Intel EPT limits and leave room available in the usable VA
+ * range for other mmap tricks.
+ */
+#define VM_MAXUSER_ADDRESS 0x00003ffffffffffful
+
+
+#endif /* _COMPAT_FREEBSD_VM_VM_PARAM_H_ */
diff --git a/usr/src/compat/freebsd/x86/_types.h b/usr/src/compat/freebsd/x86/_types.h
index a07fc017ad..8bbae549d8 100644
--- a/usr/src/compat/freebsd/x86/_types.h
+++ b/usr/src/compat/freebsd/x86/_types.h
@@ -41,9 +41,11 @@
 typedef __int64_t __register_t;
 typedef __uint64_t __vm_offset_t;
 typedef __uint64_t __vm_paddr_t;
 typedef __int64_t __vm_ooffset_t;
+typedef __uint64_t __vm_size_t;
 #else
 typedef __int32_t __register_t;
 typedef __uint32_t __vm_paddr_t;
+typedef __uint32_t __vm_size_t;
 #endif

 #endif /* _FREEBSD_X86__TYPES_H_ */
diff --git a/usr/src/compat/freebsd/x86/segments.h b/usr/src/compat/freebsd/x86/segments.h
index bc6ba976b8..11edc582b5 100644
--- a/usr/src/compat/freebsd/x86/segments.h
+++ b/usr/src/compat/freebsd/x86/segments.h
@@ -11,18 +11,19 @@

 /*
  * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2017 Joyent, Inc.
 */

-#ifndef _COMPAT_FREEBSD_X86_SEGMENTS_H_
-#define _COMPAT_FREEBSD_X86_SEGMENTS_H_
+#ifndef _COMPAT_FREEBSD_X86_SEGMENTS_H
+#define _COMPAT_FREEBSD_X86_SEGMENTS_H

-/*
- * Entries in the Interrupt Descriptor Table (IDT)
- */
-#define IDT_BP 3 /* #BP: Breakpoint */
+#if defined(_COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_) || defined(_KERNEL)
 #define IDT_UD 6 /* #UD: Undefined/Invalid Opcode */
 #define IDT_SS 12 /* #SS: Stack Segment Fault */
 #define IDT_GP 13 /* #GP: General Protection Fault */
 #define IDT_AC 17 /* #AC: Alignment Check */
+#else
+#include_next <x86/segments.h>
+#endif

-#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_ */
+#endif /* _COMPAT_FREEBSD_X86_SEGMENTS_H */
diff --git a/usr/src/head/bhyve.h b/usr/src/head/bhyve.h
deleted file mode 100644
index 8c79ca1ccc..0000000000
--- a/usr/src/head/bhyve.h
+++ /dev/null
@@ -1,25 +0,0 @@
-/*
- * COPYRIGHT 2013 Pluribus Networks Inc.
- *
- * All rights reserved. This copyright notice is Copyright Management
- * Information under 17 USC 1202 and is included to protect this work and
- * deter copyright infringement. Removal or alteration of this Copyright
- * Management Information without the express written permission from
- * Pluribus Networks Inc is prohibited, and any such unauthorized removal
- * or alteration will be a violation of federal law.
- */ -#ifndef _BHYVE_H -#define _BHYVE_H - -#ifdef __cplusplus -extern "C" { -#endif - -#define BHYVE_TMPDIR "/var/run/bhyve" -#define BHYVE_CONS_SOCKPATH BHYVE_TMPDIR "/%s.console_sock" - -#ifdef __cplusplus -} -#endif - -#endif /* _BHYVE_H */ diff --git a/usr/src/lib/Makefile b/usr/src/lib/Makefile index e2bbd9a8c0..b64d4c2bc1 100644 --- a/usr/src/lib/Makefile +++ b/usr/src/lib/Makefile @@ -280,7 +280,8 @@ SUBDIRS += \ i386_SUBDIRS= \ libfdisk \ - libsaveargs + libsaveargs \ + libvmmapi sparc_SUBDIRS= \ efcode \ @@ -504,7 +505,8 @@ HDRSUBDIRS= \ i386_HDRSUBDIRS= \ libfdisk \ - libsaveargs + libsaveargs \ + libvmmapi sparc_HDRSUBDIRS= \ libds \ diff --git a/usr/src/lib/libvmmapi/Makefile b/usr/src/lib/libvmmapi/Makefile index 60621fcb75..233fcd5edb 100644 --- a/usr/src/lib/libvmmapi/Makefile +++ b/usr/src/lib/libvmmapi/Makefile @@ -19,11 +19,13 @@ HDRS = vmmapi.h HDRDIR = common +CHECKHDRS = + $(BUILD64)SUBDIRS += $(MACH64) all:= TARGET= all install:= TARGET= install -clean:= TARGET= clean +clean:= TARGET= clean clobber:= TARGET= clobber lint:= TARGET= lint _msg:= TARGET= _msg diff --git a/usr/src/lib/libvmmapi/Makefile.com b/usr/src/lib/libvmmapi/Makefile.com index e41a82f9a2..34240f4331 100644 --- a/usr/src/lib/libvmmapi/Makefile.com +++ b/usr/src/lib/libvmmapi/Makefile.com @@ -12,11 +12,12 @@ # # Copyright 2013 Pluribus Networks Inc. # +# Copyright 2019 Joyent, Inc. -LIBRARY = libvmmapi.a +LIBRARY = libvmmapi.a VERS = .1 -OBJECTS = vmmapi.o expand_number.o +OBJECTS = vmmapi.o expand_number.o # include library definitions include ../../Makefile.lib @@ -24,16 +25,19 @@ include ../../Makefile.lib # install this library in the root filesystem include ../../Makefile.rootfs -SRCDIR = ../common +SRCDIR = ../common -LIBS = $(DYNLIB) $(LINTLIB) +LIBS = $(DYNLIB) $(LINTLIB) -CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ +CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \ $(CPPFLAGS.master) -I$(SRC)/uts/i86pc +# not linted +SMATCH=off + $(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC) -LDLIBS += -lc +LDLIBS += -lc .KEEP_STATE: diff --git a/usr/src/lib/libvmmapi/common/mapfile-vers b/usr/src/lib/libvmmapi/common/mapfile-vers index 7a8443a2b8..a64231ad1c 100644 --- a/usr/src/lib/libvmmapi/common/mapfile-vers +++ b/usr/src/lib/libvmmapi/common/mapfile-vers @@ -11,6 +11,7 @@ # # Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. 
#
#
@@ -27,51 +28,89 @@
 # MAPFILE HEADER END
 #

-SUNWprivate_1.0 {
-    global:
-	vcpu_reset;
-	vm_activate_cpu;
-	vm_apicid2vcpu;
-	vm_capability_name2type;
-	vm_capability_type2name;
-	vm_copy_setup;
-	vm_copy_teardown;
-	vm_copyin;
-	vm_copyout;
-	vm_create;
-	vm_destroy;
-	vm_get_capability;
-	vm_get_desc;
-	vm_get_highmem_size;
-	vm_get_lowmem_limit;
-	vm_get_lowmem_size;
-	vm_get_memory_seg;
-	vm_get_register;
-	vm_get_seg_desc;
-	vm_get_x2apic_state;
-	vm_gla2gpa;
-	vm_inject_exception;
-	vm_isa_assert_irq;
-	vm_isa_deassert_irq;
-	vm_isa_pulse_irq;
-	vm_isa_set_irq_trigger;
-	vm_ioapic_assert_irq;
-	vm_ioapic_deassert_irq;
-	vm_ioapic_pincount;
-	vm_ioapic_pulse_irq;
-	vm_lapic_irq;
-	vm_lapic_msi;
-	vm_map_gpa;
-	vm_open;
-	vm_parse_memsize;
-	vm_restart_instruction;
-	vm_run;
-	vm_set_capability;
-	vm_set_desc;
-	vm_set_register;
-	vm_set_x2apic_state;
-	vm_setup_memory;
-	vm_setup_rom;
-    local:
-	*;
+$mapfile_version 2
+
+SYMBOL_VERSION ILLUMOSprivate {
+    global:
+	vcpu_reset;
+	vm_activate_cpu;
+	vm_active_cpus;
+	vm_apicid2vcpu;
+	vm_assign_pptdev;
+	vm_capability_name2type;
+	vm_capability_type2name;
+	vm_copy_setup;
+	vm_copy_teardown;
+	vm_copyin;
+	vm_copyout;
+	vm_create;
+	vm_create_devmem;
+	vm_debug_cpus;
+	vm_destroy;
+	vm_get_capability;
+	vm_get_desc;
+	vm_get_device_fd;
+	vm_get_gpa_pmap;
+	vm_get_hpet_capabilities;
+	vm_get_highmem_size;
+	vm_get_intinfo;
+	vm_get_lowmem_limit;
+	vm_get_lowmem_size;
+	vm_get_memflags;
+	vm_get_memseg;
+	vm_get_register;
+	vm_get_register_set;
+	vm_get_seg_desc;
+	vm_get_stat_desc;
+	vm_get_stats;
+	vm_get_topology;
+	vm_get_x2apic_state;
+	vm_gla2gpa;
+	vm_gla2gpa_nofault;
+	vm_inject_exception;
+	vm_inject_nmi;
+	vm_ioapic_assert_irq;
+	vm_ioapic_deassert_irq;
+	vm_ioapic_pincount;
+	vm_ioapic_pulse_irq;
+	vm_isa_assert_irq;
+	vm_isa_deassert_irq;
+	vm_isa_pulse_irq;
+	vm_isa_set_irq_trigger;
+	vm_lapic_irq;
+	vm_lapic_local_irq;
+	vm_lapic_msi;
+	vm_map_gpa;
+	vm_map_pptdev_mmio;
+	vm_mmap_getnext;
+	vm_mmap_memseg;
+	vm_open;
+	vm_parse_memsize;
+	vm_reinit;
+	vm_restart_instruction;
+	vm_rtc_gettime;
+	vm_rtc_read;
+	vm_rtc_settime;
+	vm_rtc_write;
+	vm_run;
+	vm_set_capability;
+	vm_set_desc;
+	vm_set_intinfo;
+	vm_set_memflags;
+	vm_set_register;
+	vm_set_register_set;
+	vm_set_topology;
+	vm_set_x2apic_state;
+	vm_setup_memory;
+	vm_setup_pptdev_msi;
+	vm_setup_pptdev_msix;
+	vm_suspend;
+	vm_suspend_cpu;
+	vm_suspended_cpus;
+	vm_resume_cpu;
+	vm_unassign_pptdev;
+	vm_wrlock_cycle;
+
+    local:
+	*;
 };
diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c
index bbab3961a9..0b9b871081 100644
--- a/usr/src/lib/libvmmapi/common/vmmapi.c
+++ b/usr/src/lib/libvmmapi/common/vmmapi.c
@@ -1,4 +1,6 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
 * Copyright (c) 2011 NetApp, Inc.
 * All rights reserved.
 *
@@ -23,7 +25,7 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
- * $FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tychon $
+ * $FreeBSD$
 */
/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source. A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 *
 * Copyright 2015 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
*/ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tychon $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/sysctl.h> @@ -48,11 +51,10 @@ __FBSDID("$FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tych #include <sys/_iovec.h> #include <sys/cpuset.h> +#include <x86/segments.h> #include <machine/specialreg.h> -#ifndef __FreeBSD__ #include <errno.h> -#endif #include <stdio.h> #include <stdlib.h> #include <assert.h> @@ -70,23 +72,35 @@ __FBSDID("$FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tych #include "vmmapi.h" -#define KB (1024UL) #define MB (1024 * 1024UL) #define GB (1024 * 1024 * 1024UL) +#ifndef __FreeBSD__ +/* shim to no-op for now */ +#define MAP_NOCORE 0 +#define MAP_ALIGNED_SUPER 0 + +/* Rely on PROT_NONE for guard purposes */ +#define MAP_GUARD (MAP_PRIVATE | MAP_ANON | MAP_NORESERVE) +#endif + +/* + * Size of the guard region before and after the virtual address space + * mapping the guest physical memory. This must be a multiple of the + * superpage size for performance reasons. + */ +#define VM_MMAP_GUARD_SIZE (4 * MB) + +#define PROT_RW (PROT_READ | PROT_WRITE) +#define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC) + struct vmctx { int fd; uint32_t lowmem_limit; - enum vm_mmap_style vms; - char *lowermem_addr; - char *biosmem_addr; + int memflags; size_t lowmem; - char *lowmem_addr; size_t highmem; - char *highmem_addr; - uint64_t rombase; - uint64_t romlimit; - char *rom_addr; + char *baseaddr; char *name; }; @@ -94,68 +108,50 @@ struct vmctx { #define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x))) #define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x))) #else -#define CREATE(x) vmm_vm_create(x) -#define DESTROY(x) vmm_vm_destroy(x) -#endif +#define CREATE(x) vm_do_ctl(VMM_CREATE_VM, (x)) +#define DESTROY(x) vm_do_ctl(VMM_DESTROY_VM, (x)) static int -vm_device_open(const char *name) +vm_do_ctl(int cmd, const char *name) { - int fd, len; - char *vmfile; + int ctl_fd; -#ifdef __FreeBSD__ - len = strlen("/dev/vmm/") + strlen(name) + 1; -#else - len = strlen("/devices/pseudo/vmm@0:") + strlen(name) + 1; -#endif - vmfile = malloc(len); - assert(vmfile != NULL); -#ifdef __FreeBSD__ - snprintf(vmfile, len, "/dev/vmm/%s", name); -#else - snprintf(vmfile, len, "/devices/pseudo/vmm@0:%s", name); -#endif + ctl_fd = open(VMM_CTL_DEV, O_EXCL | O_RDWR); + if (ctl_fd < 0) { + return (-1); + } - /* Open the device file */ - fd = open(vmfile, O_RDWR, 0); + if (ioctl(ctl_fd, cmd, name) == -1) { + int err = errno; - free(vmfile); - return (fd); + /* Do not lose ioctl errno through the close(2) */ + (void) close(ctl_fd); + errno = err; + return (-1); + } + (void) close(ctl_fd); + + return (0); } +#endif -#ifndef __FreeBSD__ static int -vmm_vm_create(const char *name) +vm_device_open(const char *name) { - const char vmm_ctl[] = "/devices/pseudo/vmm@0:ctl"; - struct vmm_ioctl vi; - int err = 0; - int ctl_fd; + int fd, len; + char *vmfile; - (void) strlcpy(vi.vmm_name, name, sizeof (vi.vmm_name) - 1); + len = strlen("/dev/vmm/") + strlen(name) + 1; + vmfile = malloc(len); + assert(vmfile != NULL); + snprintf(vmfile, len, "/dev/vmm/%s", name); - ctl_fd = open(vmm_ctl, O_EXCL | O_RDWR); - if (ctl_fd == -1) { - err = errno; - if ((errno == EPERM) || (errno == EACCES)) { - fprintf(stderr, "you do not have permission to " - "perform that operation.\n"); - } else { - fprintf(stderr, "open: %s: %s\n", vmm_ctl, - strerror(errno)); - } - return (err); - } - 
if (ioctl(ctl_fd, VMM_CREATE_VM, &vi) == -1) { - err = errno; - fprintf(stderr, "couldn't create vm \"%s\"", name); - } - close (ctl_fd); + /* Open the device file */ + fd = open(vmfile, O_RDWR, 0); - return (err); + free(vmfile); + return (fd); } -#endif int vm_create(const char *name) @@ -173,6 +169,7 @@ vm_open(const char *name) assert(vm != NULL); vm->fd = -1; + vm->memflags = 0; vm->lowmem_limit = 3 * GB; vm->name = (char *)(vm + 1); strcpy(vm->name, name); @@ -182,54 +179,20 @@ vm_open(const char *name) return (vm); err: - (void) vm_destroy(vm); + vm_destroy(vm); return (NULL); } -#ifndef __FreeBSD__ -static int -vmm_vm_destroy(const char *name) -{ - const char vmm_ctl[] = "/devices/pseudo/vmm@0:ctl"; - struct vmm_ioctl vi; - int ctl_fd; - int err = 0; - - (void) strlcpy(vi.vmm_name, name, sizeof (vi.vmm_name) - 1); - - ctl_fd = open(vmm_ctl, O_EXCL | O_RDWR); - if (ctl_fd == -1) { - err = errno; - if ((errno == EPERM) || (errno == EACCES)) { - fprintf(stderr, "you do not have permission to " - "perform that operation.\n"); - } else { - fprintf(stderr, "open: %s: %s\n", vmm_ctl, - strerror(errno)); - } - return (err); - } - if (ioctl(ctl_fd, VMM_DESTROY_VM, &vi) == -1) { - err = errno; - fprintf(stderr, "couldn't destroy vm \"%s\"", name); - } - close (ctl_fd); - return (err); -} -#endif - -int +void vm_destroy(struct vmctx *vm) { - int err; assert(vm != NULL); if (vm->fd >= 0) close(vm->fd); - err = DESTROY(vm->name); + DESTROY(vm->name); free(vm); - return (err); } int @@ -256,92 +219,218 @@ vm_parse_memsize(const char *optarg, size_t *ret_memsize) return (error); } -#ifdef __FreeBSD__ -size_t -vmm_get_mem_total(void) +uint32_t +vm_get_lowmem_limit(struct vmctx *ctx) { - size_t mem_total = 0; - size_t oldlen = sizeof(mem_total); - int error; - error = sysctlbyname("hw.vmm.mem_total", &mem_total, &oldlen, NULL, 0); - if (error) - return -1; - return mem_total; + + return (ctx->lowmem_limit); } -size_t -vmm_get_mem_free(void) +void +vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) { - size_t mem_free = 0; - size_t oldlen = sizeof(mem_free); - int error; - error = sysctlbyname("hw.vmm.mem_free", &mem_free, &oldlen, NULL, 0); - if (error) - return -1; - return mem_free; + + ctx->lowmem_limit = limit; +} + +void +vm_set_memflags(struct vmctx *ctx, int flags) +{ + + ctx->memflags = flags; } -#endif int -vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, - int *wired) +vm_get_memflags(struct vmctx *ctx) { - int error; - struct vm_memory_segment seg; - - bzero(&seg, sizeof(seg)); - seg.gpa = gpa; - error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg); - *ret_len = seg.len; - if (wired != NULL) - *wired = seg.wired; + + return (ctx->memflags); +} + +/* + * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len). + */ +int +vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot) +{ + struct vm_memmap memmap; + int error, flags; + + memmap.gpa = gpa; + memmap.segid = segid; + memmap.segoff = off; + memmap.len = len; + memmap.prot = prot; + memmap.flags = 0; + + if (ctx->memflags & VM_MEM_F_WIRED) + memmap.flags |= VM_MEMMAP_F_WIRED; + + /* + * If this mapping already exists then don't create it again. This + * is the common case for SYSMEM mappings created by bhyveload(8). 
+ */ + error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags); + if (error == 0 && gpa == memmap.gpa) { + if (segid != memmap.segid || off != memmap.segoff || + prot != memmap.prot || flags != memmap.flags) { + errno = EEXIST; + return (-1); + } else { + return (0); + } + } + + error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap); return (error); } -uint32_t -vm_get_lowmem_limit(struct vmctx *ctx) +int +vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) { + struct vm_memmap memmap; + int error; - return (ctx->lowmem_limit); + bzero(&memmap, sizeof(struct vm_memmap)); + memmap.gpa = *gpa; + error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap); + if (error == 0) { + *gpa = memmap.gpa; + *segid = memmap.segid; + *segoff = memmap.segoff; + *len = memmap.len; + *prot = memmap.prot; + *flags = memmap.flags; + } + return (error); } -void -vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit) +/* + * Return 0 if the segments are identical and non-zero otherwise. + * + * This is slightly complicated by the fact that only device memory segments + * are named. + */ +static int +cmpseg(size_t len, const char *str, size_t len2, const char *str2) { - ctx->lowmem_limit = limit; + if (len == len2) { + if ((!str && !str2) || (str && str2 && !strcmp(str, str2))) + return (0); + } + return (-1); } static int -setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr) +vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name) { + struct vm_memseg memseg; + size_t n; int error; - struct vm_memory_segment seg; /* - * Create and optionally map 'len' bytes of memory at guest - * physical address 'gpa' + * If the memory segment has already been created then just return. + * This is the usual case for the SYSMEM segment created by userspace + * loaders like bhyveload(8). 
*/ - bzero(&seg, sizeof(seg)); - seg.gpa = gpa; - seg.len = len; - error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg); - if (error == 0 && addr != NULL) { - *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, - ctx->fd, gpa); + error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name, + sizeof(memseg.name)); + if (error) + return (error); + + if (memseg.len != 0) { + if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) { + errno = EINVAL; + return (-1); + } else { + return (0); + } + } + + bzero(&memseg, sizeof(struct vm_memseg)); + memseg.segid = segid; + memseg.len = len; + if (name != NULL) { + n = strlcpy(memseg.name, name, sizeof(memseg.name)); + if (n >= sizeof(memseg.name)) { + errno = ENAMETOOLONG; + return (-1); + } + } + + error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg); + return (error); +} + +int +vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf, + size_t bufsize) +{ + struct vm_memseg memseg; + size_t n; + int error; + + memseg.segid = segid; + error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg); + if (error == 0) { + *lenp = memseg.len; + n = strlcpy(namebuf, memseg.name, bufsize); + if (n >= bufsize) { + errno = ENAMETOOLONG; + error = -1; + } } return (error); } +static int +#ifdef __FreeBSD__ +setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base) +#else +setup_memory_segment(struct vmctx *ctx, int segid, vm_paddr_t gpa, size_t len, + char *base) +#endif +{ + char *ptr; + int error, flags; + + /* Map 'len' bytes starting at 'gpa' in the guest address space */ +#ifdef __FreeBSD__ + error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL); +#else + /* + * As we use two segments for lowmem/highmem the offset within the + * segment is 0 on illumos. + */ + error = vm_mmap_memseg(ctx, gpa, segid, 0, len, PROT_ALL); +#endif + if (error) + return (error); + + flags = MAP_SHARED | MAP_FIXED; + if ((ctx->memflags & VM_MEM_F_INCORE) == 0) + flags |= MAP_NOCORE; + + /* mmap into the process address space on the host */ + ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa); + if (ptr == MAP_FAILED) + return (-1); + + return (0); +} + int vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) { - char **addr; + size_t objsize, len; + vm_paddr_t gpa; + char *baseaddr, *ptr; int error; - /* XXX VM_MMAP_SPARSE not implemented yet */ - assert(vms == VM_MMAP_NONE || vms == VM_MMAP_ALL); - ctx->vms = vms; + assert(vms == VM_MMAP_ALL); /* * If 'memsize' cannot fit entirely in the 'lowmem' segment then @@ -349,81 +438,100 @@ vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms) */ if (memsize > ctx->lowmem_limit) { ctx->lowmem = ctx->lowmem_limit; - ctx->highmem = memsize - ctx->lowmem; + ctx->highmem = memsize - ctx->lowmem_limit; + objsize = 4*GB + ctx->highmem; } else { ctx->lowmem = memsize; ctx->highmem = 0; + objsize = ctx->lowmem; } - if (ctx->lowmem > 0) { - addr = (vms == VM_MMAP_ALL) ? &ctx->lowermem_addr : NULL; - error = setup_memory_segment(ctx, 0, 640*KB, addr); - if (error) - return (error); +#ifdef __FreeBSD__ + error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL); + if (error) + return (error); +#endif + + /* + * Stake out a contiguous region covering the guest physical memory + * and the adjoining guard regions. + */ + len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE; + ptr = mmap(NULL, len, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, 0); + if (ptr == MAP_FAILED) + return (-1); + + baseaddr = ptr + VM_MMAP_GUARD_SIZE; - addr = (vms == VM_MMAP_ALL) ? 
&ctx->biosmem_addr : NULL; - error = setup_memory_segment(ctx, 768*KB, 256*KB, addr); +#ifdef __FreeBSD__ + if (ctx->highmem > 0) { + gpa = 4*GB; + len = ctx->highmem; + error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); + } - addr = (vms == VM_MMAP_ALL) ? &ctx->lowmem_addr : NULL; - error = setup_memory_segment(ctx, 1*MB, ctx->lowmem - 1*MB, addr); + if (ctx->lowmem > 0) { + gpa = 0; + len = ctx->lowmem; + error = setup_memory_segment(ctx, gpa, len, baseaddr); if (error) return (error); } - +#else if (ctx->highmem > 0) { - addr = (vms == VM_MMAP_ALL) ? &ctx->highmem_addr : NULL; - error = setup_memory_segment(ctx, 4*GB, ctx->highmem, addr); + error = vm_alloc_memseg(ctx, VM_HIGHMEM, ctx->highmem, NULL); + if (error) + return (error); + gpa = 4*GB; + len = ctx->highmem; + error = setup_memory_segment(ctx, VM_HIGHMEM, gpa, len, baseaddr); if (error) return (error); } - return (0); -} + if (ctx->lowmem > 0) { + error = vm_alloc_memseg(ctx, VM_LOWMEM, ctx->lowmem, NULL); + if (error) + return (error); + gpa = 0; + len = ctx->lowmem; + error = setup_memory_segment(ctx, VM_LOWMEM, gpa, len, baseaddr); + if (error) + return (error); + } +#endif -int -vm_setup_rom(struct vmctx *ctx, vm_paddr_t gpa, size_t len) -{ - ctx->rombase = gpa; - ctx->romlimit = gpa + len; + ctx->baseaddr = baseaddr; - return (setup_memory_segment(ctx, gpa, len, &ctx->rom_addr)); + return (0); } +/* + * Returns a non-NULL pointer if [gaddr, gaddr+len) is entirely contained in + * the lowmem or highmem regions. + * + * In particular return NULL if [gaddr, gaddr+len) falls in guest MMIO region. + * The instruction emulation code depends on this behavior. + */ void * vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len) { - /* XXX VM_MMAP_SPARSE not implemented yet */ - assert(ctx->vms == VM_MMAP_ALL); - - if (gaddr + len <= 1*MB) { - if (gaddr + len <= 640*KB) - return ((void *)(ctx->lowermem_addr + gaddr)); - - if (768*KB <= gaddr && gaddr + len <= 1*MB) { - gaddr -= 768*KB; - return ((void *)(ctx->biosmem_addr + gaddr)); - } - - return (NULL); - } - - if (gaddr < ctx->lowmem && gaddr + len <= ctx->lowmem) { - gaddr -= 1*MB; - return ((void *)(ctx->lowmem_addr + gaddr)); - } - - if (ctx->rombase <= gaddr && gaddr + len <= ctx->romlimit) { - gaddr -= ctx->rombase; - return ((void *)(ctx->rom_addr + gaddr)); + if (ctx->lowmem > 0) { + if (gaddr < ctx->lowmem && len <= ctx->lowmem && + gaddr + len <= ctx->lowmem) + return (ctx->baseaddr + gaddr); } - if (gaddr >= 4*GB) { - gaddr -= 4*GB; - if (gaddr < ctx->highmem && gaddr + len <= ctx->highmem) - return ((void *)(ctx->highmem_addr + gaddr)); + if (ctx->highmem > 0) { + if (gaddr >= 4*GB) { + if (gaddr < 4*GB + ctx->highmem && + len <= ctx->highmem && + gaddr + len <= 4*GB + ctx->highmem) + return (ctx->baseaddr + gaddr); + } } return (NULL); @@ -443,6 +551,79 @@ vm_get_highmem_size(struct vmctx *ctx) return (ctx->highmem); } +void * +vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len) +{ +#ifdef __FreeBSD__ + char pathname[MAXPATHLEN]; +#endif + size_t len2; + char *base, *ptr; + int fd, error, flags; + off_t mapoff; + + fd = -1; + ptr = MAP_FAILED; + if (name == NULL || strlen(name) == 0) { + errno = EINVAL; + goto done; + } + + error = vm_alloc_memseg(ctx, segid, len, name); + if (error) + goto done; + +#ifdef __FreeBSD__ + strlcpy(pathname, "/dev/vmm.io/", sizeof(pathname)); + strlcat(pathname, ctx->name, sizeof(pathname)); + strlcat(pathname, ".", sizeof(pathname)); + strlcat(pathname, name, sizeof(pathname)); 
+ + fd = open(pathname, O_RDWR); + if (fd < 0) + goto done; +#else + { + struct vm_devmem_offset vdo; + + vdo.segid = segid; + error = ioctl(ctx->fd, VM_DEVMEM_GETOFFSET, &vdo); + if (error == 0) { + mapoff = vdo.offset; + } else { + goto done; + } + } +#endif + + /* + * Stake out a contiguous region covering the device memory and the + * adjoining guard regions. + */ + len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE; + base = mmap(NULL, len2, PROT_NONE, MAP_GUARD | MAP_ALIGNED_SUPER, -1, + 0); + if (base == MAP_FAILED) + goto done; + + flags = MAP_SHARED | MAP_FIXED; + if ((ctx->memflags & VM_MEM_F_INCORE) == 0) + flags |= MAP_NOCORE; + +#ifdef __FreeBSD__ + /* mmap the devmem region in the host address space */ + ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0); +#else + /* mmap the devmem region in the host address space */ + ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, ctx->fd, + mapoff); +#endif +done: + if (fd >= 0) + close(fd); + return (ptr); +} + int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, uint64_t base, uint32_t limit, uint32_t access) @@ -522,6 +703,40 @@ vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val) } int +vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, + const int *regnums, uint64_t *regvals) +{ + int error; + struct vm_register_set vmregset; + + bzero(&vmregset, sizeof(vmregset)); + vmregset.cpuid = vcpu; + vmregset.count = count; + vmregset.regnums = regnums; + vmregset.regvals = regvals; + + error = ioctl(ctx->fd, VM_SET_REGISTER_SET, &vmregset); + return (error); +} + +int +vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, + const int *regnums, uint64_t *regvals) +{ + int error; + struct vm_register_set vmregset; + + bzero(&vmregset, sizeof(vmregset)); + vmregset.cpuid = vcpu; + vmregset.count = count; + vmregset.regnums = regnums; + vmregset.regvals = regvals; + + error = ioctl(ctx->fd, VM_GET_REGISTER_SET, &vmregset); + return (error); +} + +int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) { int error; @@ -535,19 +750,21 @@ vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit) return (error); } -static int -vm_inject_exception_real(struct vmctx *ctx, int vcpu, int vector, - int error_code, int error_code_valid) +int +vm_suspend(struct vmctx *ctx, enum vm_suspend_how how) { - struct vm_exception exc; + struct vm_suspend vmsuspend; - bzero(&exc, sizeof(exc)); - exc.cpuid = vcpu; - exc.vector = vector; - exc.error_code = error_code; - exc.error_code_valid = error_code_valid; + bzero(&vmsuspend, sizeof(vmsuspend)); + vmsuspend.how = how; + return (ioctl(ctx->fd, VM_SUSPEND, &vmsuspend)); +} - return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc)); +int +vm_reinit(struct vmctx *ctx) +{ + + return (ioctl(ctx->fd, VM_REINIT, 0)); } int @@ -774,7 +991,7 @@ vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val) vmcap.cpuid = vcpu; vmcap.captype = cap; vmcap.capval = val; - + return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap)); } @@ -858,7 +1075,6 @@ vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix); } -#ifdef __FreeBSD__ uint64_t * vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, int *ret_entries) @@ -869,7 +1085,7 @@ vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv, vmstats.cpuid = vcpu; - error = ioctl(ctx->fd, VM_STATS, &vmstats); + error = ioctl(ctx->fd, VM_STATS_IOC, &vmstats); if (error == 0) { if (ret_entries) 
*ret_entries = vmstats.num_entries; @@ -891,7 +1107,6 @@ vm_get_stat_desc(struct vmctx *ctx, int index) else return (NULL); } -#endif int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state) @@ -1112,9 +1327,9 @@ vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities) return (error); } -static int -gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, - uint64_t gla, int prot, int *fault, uint64_t *gpa) +int +vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *fault) { struct vm_gla2gpa gg; int error; @@ -1134,14 +1349,23 @@ gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, } int -vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa) +vm_gla2gpa_nofault(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *fault) { - int error, fault; + struct vm_gla2gpa gg; + int error; + + bzero(&gg, sizeof(struct vm_gla2gpa)); + gg.vcpuid = vcpu; + gg.prot = prot; + gg.gla = gla; + gg.paging = *paging; - error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, gpa); - if (fault) - error = fault; + error = ioctl(ctx->fd, VM_GLA2GPA_NOFAULT, &gg); + if (error == 0) { + *fault = gg.fault; + *gpa = gg.gpa; + } return (error); } @@ -1151,11 +1375,12 @@ vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, - uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt) + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, + int *fault) { void *va; uint64_t gpa; - int error, fault, i, n, off; + int error, i, n, off; for (i = 0; i < iovcnt; i++) { iov[i].iov_base = 0; @@ -1164,18 +1389,16 @@ vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging, while (len) { assert(iovcnt > 0); - error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, &gpa); - if (error) - return (-1); - if (fault) - return (1); + error = vm_gla2gpa(ctx, vcpu, paging, gla, prot, &gpa, fault); + if (error || *fault) + return (error); off = gpa & PAGE_MASK; n = min(len, PAGE_SIZE - off); va = vm_map_gpa(ctx, gpa, n); if (va == NULL) - return (-1); + return (EFAULT); iov->iov_base = va; iov->iov_len = n; @@ -1236,6 +1459,42 @@ vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov, } } +static int +vm_get_cpus(struct vmctx *ctx, int which, cpuset_t *cpus) +{ + struct vm_cpuset vm_cpuset; + int error; + + bzero(&vm_cpuset, sizeof(struct vm_cpuset)); + vm_cpuset.which = which; + vm_cpuset.cpusetsize = sizeof(cpuset_t); + vm_cpuset.cpus = cpus; + + error = ioctl(ctx->fd, VM_GET_CPUS, &vm_cpuset); + return (error); +} + +int +vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, VM_ACTIVE_CPUS, cpus)); +} + +int +vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, VM_SUSPENDED_CPUS, cpus)); +} + +int +vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus) +{ + + return (vm_get_cpus(ctx, VM_DEBUG_CPUS, cpus)); +} + int vm_activate_cpu(struct vmctx *ctx, int vcpu) { @@ -1249,9 +1508,203 @@ vm_activate_cpu(struct vmctx *ctx, int vcpu) } int +vm_suspend_cpu(struct vmctx *ctx, int vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + ac.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_SUSPEND_CPU, &ac); + return (error); +} + +int +vm_resume_cpu(struct vmctx *ctx, int 
vcpu) +{ + struct vm_activate_cpu ac; + int error; + + bzero(&ac, sizeof(struct vm_activate_cpu)); + ac.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_RESUME_CPU, &ac); + return (error); +} + +int +vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *info1, uint64_t *info2) +{ + struct vm_intinfo vmii; + int error; + + bzero(&vmii, sizeof(struct vm_intinfo)); + vmii.vcpuid = vcpu; + error = ioctl(ctx->fd, VM_GET_INTINFO, &vmii); + if (error == 0) { + *info1 = vmii.info1; + *info2 = vmii.info2; + } + return (error); +} + +int +vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t info1) +{ + struct vm_intinfo vmii; + int error; + + bzero(&vmii, sizeof(struct vm_intinfo)); + vmii.vcpuid = vcpu; + vmii.info1 = info1; + error = ioctl(ctx->fd, VM_SET_INTINFO, &vmii); + return (error); +} + +int +vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value) +{ + struct vm_rtc_data rtcdata; + int error; + + bzero(&rtcdata, sizeof(struct vm_rtc_data)); + rtcdata.offset = offset; + rtcdata.value = value; + error = ioctl(ctx->fd, VM_RTC_WRITE, &rtcdata); + return (error); +} + +int +vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval) +{ + struct vm_rtc_data rtcdata; + int error; + + bzero(&rtcdata, sizeof(struct vm_rtc_data)); + rtcdata.offset = offset; + error = ioctl(ctx->fd, VM_RTC_READ, &rtcdata); + if (error == 0) + *retval = rtcdata.value; + return (error); +} + +int +vm_rtc_settime(struct vmctx *ctx, time_t secs) +{ + struct vm_rtc_time rtctime; + int error; + + bzero(&rtctime, sizeof(struct vm_rtc_time)); + rtctime.secs = secs; + error = ioctl(ctx->fd, VM_RTC_SETTIME, &rtctime); + return (error); +} + +int +vm_rtc_gettime(struct vmctx *ctx, time_t *secs) +{ + struct vm_rtc_time rtctime; + int error; + + bzero(&rtctime, sizeof(struct vm_rtc_time)); + error = ioctl(ctx->fd, VM_RTC_GETTIME, &rtctime); + if (error == 0) + *secs = rtctime.secs; + return (error); +} + +int vm_restart_instruction(void *arg, int vcpu) { struct vmctx *ctx = arg; return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu)); } + +int +vm_set_topology(struct vmctx *ctx, + uint16_t sockets, uint16_t cores, uint16_t threads, uint16_t maxcpus) +{ + struct vm_cpu_topology topology; + + bzero(&topology, sizeof (struct vm_cpu_topology)); + topology.sockets = sockets; + topology.cores = cores; + topology.threads = threads; + topology.maxcpus = maxcpus; + return (ioctl(ctx->fd, VM_SET_TOPOLOGY, &topology)); +} + +int +vm_get_topology(struct vmctx *ctx, + uint16_t *sockets, uint16_t *cores, uint16_t *threads, uint16_t *maxcpus) +{ + struct vm_cpu_topology topology; + int error; + + bzero(&topology, sizeof (struct vm_cpu_topology)); + error = ioctl(ctx->fd, VM_GET_TOPOLOGY, &topology); + if (error == 0) { + *sockets = topology.sockets; + *cores = topology.cores; + *threads = topology.threads; + *maxcpus = topology.maxcpus; + } + return (error); +} + +int +vm_get_device_fd(struct vmctx *ctx) +{ + + return (ctx->fd); +} + +#ifndef __FreeBSD__ +int +vm_wrlock_cycle(struct vmctx *ctx) +{ + if (ioctl(ctx->fd, VM_WRLOCK_CYCLE, 0) != 0) { + return (errno); + } + return (0); +} +#endif /* __FreeBSD__ */ + +#ifdef __FreeBSD__ +const cap_ioctl_t * +vm_get_ioctls(size_t *len) +{ + cap_ioctl_t *cmds; + /* keep in sync with machine/vmm_dev.h */ + static const cap_ioctl_t vm_ioctl_cmds[] = { VM_RUN, VM_SUSPEND, VM_REINIT, + VM_ALLOC_MEMSEG, VM_GET_MEMSEG, VM_MMAP_MEMSEG, VM_MMAP_MEMSEG, + VM_MMAP_GETNEXT, VM_SET_REGISTER, VM_GET_REGISTER, + VM_SET_SEGMENT_DESCRIPTOR, VM_GET_SEGMENT_DESCRIPTOR, + VM_SET_REGISTER_SET, VM_GET_REGISTER_SET, + 
VM_INJECT_EXCEPTION, VM_LAPIC_IRQ, VM_LAPIC_LOCAL_IRQ, + VM_LAPIC_MSI, VM_IOAPIC_ASSERT_IRQ, VM_IOAPIC_DEASSERT_IRQ, + VM_IOAPIC_PULSE_IRQ, VM_IOAPIC_PINCOUNT, VM_ISA_ASSERT_IRQ, + VM_ISA_DEASSERT_IRQ, VM_ISA_PULSE_IRQ, VM_ISA_SET_IRQ_TRIGGER, + VM_SET_CAPABILITY, VM_GET_CAPABILITY, VM_BIND_PPTDEV, + VM_UNBIND_PPTDEV, VM_MAP_PPTDEV_MMIO, VM_PPTDEV_MSI, + VM_PPTDEV_MSIX, VM_INJECT_NMI, VM_STATS, VM_STAT_DESC, + VM_SET_X2APIC_STATE, VM_GET_X2APIC_STATE, + VM_GET_HPET_CAPABILITIES, VM_GET_GPA_PMAP, VM_GLA2GPA, + VM_GLA2GPA_NOFAULT, + VM_ACTIVATE_CPU, VM_GET_CPUS, VM_SUSPEND_CPU, VM_RESUME_CPU, + VM_SET_INTINFO, VM_GET_INTINFO, + VM_RTC_WRITE, VM_RTC_READ, VM_RTC_SETTIME, VM_RTC_GETTIME, + VM_RESTART_INSTRUCTION, VM_SET_TOPOLOGY, VM_GET_TOPOLOGY }; + + if (len == NULL) { + cmds = malloc(sizeof(vm_ioctl_cmds)); + if (cmds == NULL) + return (NULL); + bcopy(vm_ioctl_cmds, cmds, sizeof(vm_ioctl_cmds)); + return (cmds); + } + + *len = nitems(vm_ioctl_cmds); + return (NULL); +} +#endif /* __FreeBSD__ */ diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h index d7eb67aa58..a1507255cb 100644 --- a/usr/src/lib/libvmmapi/common/vmmapi.h +++ b/usr/src/lib/libvmmapi/common/vmmapi.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/lib/libvmmapi/vmmapi.h 280929 2015-04-01 00:15:31Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,12 +38,20 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _VMMAPI_H_ #define _VMMAPI_H_ #include <sys/param.h> +#include <sys/cpuset.h> + +/* + * API version for out-of-tree consumers like grub-bhyve for making compile + * time decisions. + */ +#define VMMAPI_VERSION 0103 /* 2 digit major followed by 2 digit minor */ struct iovec; struct vmctx; @@ -57,19 +67,77 @@ enum vm_mmap_style { VM_MMAP_SPARSE, /* mappings created on-demand */ }; +/* + * 'flags' value passed to 'vm_set_memflags()'. + */ +#define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */ +#define VM_MEM_F_WIRED 0x02 /* guest memory is wired */ + +/* + * Identifiers for memory segments: + * - vm_setup_memory() uses VM_SYSMEM for the system memory segment. + * - the remaining identifiers can be used to create devmem segments. + */ +enum { +#ifdef __FreeBSD__ + VM_SYSMEM, +#else + VM_LOWMEM, + VM_HIGHMEM, +#endif + VM_BOOTROM, + VM_FRAMEBUFFER, +}; + +/* + * Get the length and name of the memory segment identified by 'segid'. + * Note that system memory segments are identified with a nul name. + * + * Returns 0 on success and non-zero otherwise. + */ +int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name, + size_t namesiz); + +/* + * Iterate over the guest address space. This function finds an address range + * that starts at an address >= *gpa. + * + * Returns 0 if the next address range was found and non-zero otherwise. + */ +int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); +/* + * Create a device memory segment identified by 'segid'. + * + * Returns a pointer to the memory segment on success and MAP_FAILED otherwise. 
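+ *
+ * A minimal usage sketch (illustrative only; the segment id, name and
+ * size are example values, not requirements of this interface):
+ *
+ *	void *fb = vm_create_devmem(ctx, VM_FRAMEBUFFER, "framebuf",
+ *	    16 * 1024 * 1024);
+ *	if (fb == MAP_FAILED)
+ *		err(1, "vm_create_devmem");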
+ */ +void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name, + size_t len); + +/* + * Map the memory segment identified by 'segid' into the guest address space + * at [gpa,gpa+len) with protection 'prot'. + */ +int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, + vm_ooffset_t segoff, size_t len, int prot); + int vm_create(const char *name); +int vm_get_device_fd(struct vmctx *ctx); struct vmctx *vm_open(const char *name); -int vm_destroy(struct vmctx *ctx); +void vm_destroy(struct vmctx *ctx); int vm_parse_memsize(const char *optarg, size_t *memsize); -int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len, - int *wired); int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s); -int vm_setup_rom(struct vmctx *ctx, vm_paddr_t gpa, size_t len); void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len); +int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num); int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa); + uint64_t gla, int prot, uint64_t *gpa, int *fault); +int vm_gla2gpa_nofault(struct vmctx *, int vcpuid, + struct vm_guest_paging *paging, uint64_t gla, int prot, + uint64_t *gpa, int *fault); uint32_t vm_get_lowmem_limit(struct vmctx *ctx); void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit); +void vm_set_memflags(struct vmctx *ctx, int flags); +int vm_get_memflags(struct vmctx *ctx); size_t vm_get_lowmem_size(struct vmctx *ctx); size_t vm_get_highmem_size(struct vmctx *ctx); int vm_set_desc(struct vmctx *ctx, int vcpu, int reg, @@ -80,7 +148,13 @@ int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc); int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val); int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval); +int vm_set_register_set(struct vmctx *ctx, int vcpu, unsigned int count, + const int *regnums, uint64_t *regvals); +int vm_get_register_set(struct vmctx *ctx, int vcpu, unsigned int count, + const int *regnums, uint64_t *regvals); int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit); +int vm_suspend(struct vmctx *ctx, enum vm_suspend_how how); +int vm_reinit(struct vmctx *ctx); int vm_apicid2vcpu(struct vmctx *ctx, int apicid); int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid, uint32_t errcode, int restart_instruction); @@ -113,6 +187,13 @@ int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func, int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int vm_get_intinfo(struct vmctx *ctx, int vcpu, uint64_t *i1, uint64_t *i2); +int vm_set_intinfo(struct vmctx *ctx, int vcpu, uint64_t exit_intinfo); + +#ifdef __FreeBSD__ +const cap_ioctl_t *vm_get_ioctls(size_t *len); +#endif + /* * Return a pointer to the statistics buffer. Note that this is not MT-safe. */ @@ -127,11 +208,16 @@ int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities); /* * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'. - * The 'iovcnt' should be big enough to accomodate all GPA segments. - * Returns 0 on success, 1 on a guest fault condition and -1 otherwise. + * The 'iovcnt' should be big enough to accommodate all GPA segments. 
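+ *
+ * A typical caller pattern (sketch only; assumes 'paging' was captured
+ * from the current vm_exit and that two iovec entries cover 'len'):
+ *
+ *	struct iovec iov[2];
+ *	int error, fault;
+ *
+ *	error = vm_copy_setup(ctx, vcpu, &paging, gla, len, PROT_READ,
+ *	    iov, 2, &fault);
+ *	if (error == 0 && !fault)
+ *		vm_copyin(ctx, vcpu, iov, buf, len);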
+ * + * retval fault Interpretation + * 0 0 Success + * 0 1 An exception was injected into the guest + * EFAULT N/A Error */ int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg, - uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt); + uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt, + int *fault); void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov, void *host_dst, size_t len); void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, @@ -139,10 +225,32 @@ void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src, void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt); +/* RTC */ +int vm_rtc_write(struct vmctx *ctx, int offset, uint8_t value); +int vm_rtc_read(struct vmctx *ctx, int offset, uint8_t *retval); +int vm_rtc_settime(struct vmctx *ctx, time_t secs); +int vm_rtc_gettime(struct vmctx *ctx, time_t *secs); + /* Reset vcpu register state */ int vcpu_reset(struct vmctx *ctx, int vcpu); +int vm_active_cpus(struct vmctx *ctx, cpuset_t *cpus); +int vm_suspended_cpus(struct vmctx *ctx, cpuset_t *cpus); +int vm_debug_cpus(struct vmctx *ctx, cpuset_t *cpus); int vm_activate_cpu(struct vmctx *ctx, int vcpu); +int vm_suspend_cpu(struct vmctx *ctx, int vcpu); +int vm_resume_cpu(struct vmctx *ctx, int vcpu); + +/* CPU topology */ +int vm_set_topology(struct vmctx *ctx, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus); +int vm_get_topology(struct vmctx *ctx, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus); + +#ifndef __FreeBSD__ +/* illumos-specific APIs */ +int vm_wrlock_cycle(struct vmctx *ctx); +#endif /* __FreeBSD__ */ #ifdef __FreeBSD__ /* diff --git a/usr/src/pkg/manifests/system-bhyve-tests.mf b/usr/src/pkg/manifests/system-bhyve-tests.mf new file mode 100644 index 0000000000..14586b5177 --- /dev/null +++ b/usr/src/pkg/manifests/system-bhyve-tests.mf @@ -0,0 +1,35 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2018 OmniOS Community Edition (OmniOSce) Association. 
+# + +set name=pkg.fmri value=pkg:/system/bhyve/tests@$(PKGVERS) +set name=pkg.description value="BSD hypervisor tests" +set name=pkg.summary value="BSD hypervisor tests" +set name=info.classification \ + value=org.opensolaris.category.2008:System/Virtualization +set name=variant.arch value=i386 +dir path=opt/bhyvetest +dir path=opt/bhyvetest/bin +dir path=opt/bhyvetest/tst +dir path=opt/bhyvetest/tst/mevent +file path=opt/bhyvetest/bin/bhyvetest mode=0555 +file path=opt/bhyvetest/tst/mevent/lists.delete.exe mode=0555 +file path=opt/bhyvetest/tst/mevent/read.disable.exe mode=0555 +file path=opt/bhyvetest/tst/mevent/read.pause.exe mode=0555 +file path=opt/bhyvetest/tst/mevent/read.requeue.exe mode=0555 +license lic_CDDL license=lic_CDDL diff --git a/usr/src/pkg/manifests/system-bhyve.mf b/usr/src/pkg/manifests/system-bhyve.mf new file mode 100644 index 0000000000..2a51d4fc22 --- /dev/null +++ b/usr/src/pkg/manifests/system-bhyve.mf @@ -0,0 +1,46 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2018 Joyent, Inc. +# Copyright 2019 OmniOS Community Edition (OmniOSce) Association. +# + +# +# The default for payload-bearing actions in this package is to appear in the +# global zone only. See the include file for greater detail, as well as +# information about overriding the defaults. +# +<include global_zone_only_component> +set name=pkg.fmri value=pkg:/system/bhyve@$(PKGVERS) +set name=pkg.description value="BSD hypervisor" +set name=pkg.summary value="BSD hypervisor" +set name=info.classification \ + value=org.opensolaris.category.2008:System/Virtualization +set name=variant.arch value=i386 +dir path=kernel group=sys +dir path=usr group=sys +dir path=usr/kernel/drv group=sys +dir path=usr/kernel/drv/$(ARCH64) group=sys +dir path=usr/sbin +driver name=vmm +file path=usr/kernel/drv/$(ARCH64)/vmm +file path=usr/kernel/drv/vmm.conf +file path=usr/sbin/bhyve mode=0555 +file path=usr/sbin/bhyvectl mode=0555 +license lic_CDDL license=lic_CDDL +depend fmri=developer/acpi type=require +depend fmri=system/bhyve/firmware type=require +depend fmri=system/library/bhyve type=require diff --git a/usr/src/pkg/manifests/system-library-bhyve.mf b/usr/src/pkg/manifests/system-library-bhyve.mf new file mode 100644 index 0000000000..d9a15e1b37 --- /dev/null +++ b/usr/src/pkg/manifests/system-library-bhyve.mf @@ -0,0 +1,31 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright 2019 OmniOS Community Edition (OmniOSce) Association. 
+# + +set name=pkg.fmri value=pkg:/system/library/bhyve@$(PKGVERS) +set name=pkg.description value="BSD hypervisor (libraries)" +set name=pkg.summary value="BSD hypervisor (libraries)" +set name=info.classification \ + value=org.opensolaris.category.2008:System/Virtualization +set name=variant.arch value=i386 +dir path=lib group=bin +dir path=lib/$(ARCH64) group=bin +dir path=usr group=sys +dir path=usr/lib group=bin +file path=lib/$(ARCH64)/libvmmapi.so.1 +license lic_CDDL license=lic_CDDL diff --git a/usr/src/req.flg b/usr/src/req.flg index 9c992b1120..26415fa51f 100644 --- a/usr/src/req.flg +++ b/usr/src/req.flg @@ -33,3 +33,5 @@ echo_file usr/src/Makefile.master.64 echo_file usr/src/Makefile.msg.targ echo_file usr/src/Makefile.psm echo_file usr/src/Makefile.psm.targ + +find_files "s.*" usr/contrib/freebsd diff --git a/usr/src/tools/scripts/build_cscope.conf b/usr/src/tools/scripts/build_cscope.conf index 859b5137d6..298db1281b 100644 --- a/usr/src/tools/scripts/build_cscope.conf +++ b/usr/src/tools/scripts/build_cscope.conf @@ -22,8 +22,8 @@ # # Copyright 2005 Sun Microsystems, Inc. All rights reserved. # Use is subject to license terms. +# Copyright 2018 Joyent, Inc. # -# ident "%Z%%M% %I% %E% SMI" # # This file configures the set of cross-references built by build_cscope. # The format is: @@ -35,6 +35,6 @@ # directories. # -complete -f . +complete "" . uts "" uts uts/sun4u uts/sun4v uts/i86pc psm "" psm/stand psm/stand/boot psm/stand/boot/sparcv9/sun4u psm/stand/boot/sparcv9/sun4v diff --git a/usr/src/tools/scripts/gensetdefs.pl b/usr/src/tools/scripts/gensetdefs.pl deleted file mode 100644 index 8ca5782feb..0000000000 --- a/usr/src/tools/scripts/gensetdefs.pl +++ /dev/null @@ -1,31 +0,0 @@ -#!/usr/bin/perl -w -# -# COPYRIGHT 2013 Pluribus Networks Inc. -# -# All rights reserved. This copyright notice is Copyright Management -# Information under 17 USC 1202 and is included to protect this work and -# deter copyright infringement. Removal or alteration of this Copyright -# Management Information without the express written permission from -# Pluribus Networks Inc is prohibited, and any such unauthorized removal -# or alteration will be a violation of federal law. - -use strict; - -my @Sections = split(/\n/, `elfedit -r -e \'shdr:sh_name -osimple\' $ARGV[0] 2>&1`); - -foreach my $Section (@Sections) { - if ($Section =~ "^set_") { - print "\tfixing $Section\n"; - - chomp(my $SectionAddr = `elfedit -r -e \'shdr:sh_addr -onum $Section\' $ARGV[0] 2>&1`); - chomp(my $SectionSize = `elfedit -r -e \'shdr:sh_size -onum $Section\' $ARGV[0] 2>&1`); - my $SectionEnd = hex($SectionAddr) + hex($SectionSize); - - `elfedit -e \'sym:st_bind __start_$Section global\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_value __start_$Section $SectionAddr\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_shndx __start_$Section $Section\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_bind __stop_$Section global\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_value __stop_$Section $SectionEnd\' $ARGV[0] 2>&1`; - `elfedit -e \'sym:st_shndx __stop_$Section $Section\' $ARGV[0] 2>&1`; - } -} diff --git a/usr/src/uts/Makefile.targ b/usr/src/uts/Makefile.targ index c5c32caa19..c42f458948 100644 --- a/usr/src/uts/Makefile.targ +++ b/usr/src/uts/Makefile.targ @@ -23,6 +23,7 @@ # Copyright 2014 Garrett D'Amore <garrett@damore.org> # Copyright 2016 Hans Rosenfeld <rosenfeld@grumpf.hope-2000.org> # Copyright (c) 2017 by Delphix. All rights reserved. +# Copyright 2019 Joyent, Inc. 
# # This Makefiles contains the common targets and definitions for # all kernels. It is to be included in the Makefiles for specific @@ -51,7 +52,7 @@ $(OBJECTS): $(INLINES) # Partially link .o files to generate the kmod. The fake dependency # on modstubs simplifies things... # -$(BINARY): $(OBJECTS) $(DTRACE_MAPFILE) +$(BINARY): $(OBJECTS) $(DTRACE_MAPFILE) $(MAPFILE) $(LD) -r $(LDFLAGS) -o $@ $(OBJECTS) $(CTFMERGE_UNIQUIFY_AGAINST_GENUNIX) $(POST_PROCESS) diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files index dbed5ea9cc..63f314ca93 100644 --- a/usr/src/uts/common/Makefile.files +++ b/usr/src/uts/common/Makefile.files @@ -1924,9 +1924,9 @@ LINT_DEFS += -Dunix # It is a bug in the current compilation system that the assember # can't process the -Y I, flag. # -NATIVE_INC_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common -AS_INC_PATH += $(INC_PATH) -I$(UTSBASE)/common -INCLUDE_PATH += $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common +NATIVE_INC_PATH += $(PRE_INC_PATH) $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common +AS_INC_PATH += $(PRE_INC_PATH) $(INC_PATH) -I$(UTSBASE)/common +INCLUDE_PATH += $(PRE_INC_PATH) $(INC_PATH) $(CCYFLAG)$(UTSBASE)/common PCIEB_OBJS += pcieb.o diff --git a/usr/src/uts/i86pc/Makefile.files b/usr/src/uts/i86pc/Makefile.files index 879b8d86cb..ca4ae0cd65 100644 --- a/usr/src/uts/i86pc/Makefile.files +++ b/usr/src/uts/i86pc/Makefile.files @@ -23,8 +23,8 @@ # Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. # # Copyright (c) 2010, Intel Corporation. -# Copyright 2018 Joyent, Inc. # Copyright 2019 OmniOS Community Edition (OmniOSce) Association. +# Copyright 2019 Joyent, Inc. # # This Makefile defines file modules in the directory uts/i86pc # and its children. These are the source files which are i86pc @@ -237,6 +237,46 @@ UPPC_OBJS += uppc.o psm_common.o XSVC_OBJS += xsvc.o AMD_IOMMU_OBJS += amd_iommu.o amd_iommu_impl.o amd_iommu_acpi.o \ amd_iommu_cmd.o amd_iommu_log.o amd_iommu_page_tables.o +VMM_OBJS += vmm.o \ + vmm_sol_dev.o \ + vmm_host.o \ + vmm_instruction_emul.o \ + vmm_ioport.o \ + vmm_lapic.o \ + vmm_mem.o \ + vmm_stat.o \ + vmm_util.o \ + x86.o \ + vdev.o \ + vatpic.o \ + vatpit.o \ + vhpet.o \ + vioapic.o \ + vlapic.o \ + vrtc.o \ + vpmtmr.o \ + ept.o \ + vmcs.o \ + vmx_msr.o \ + vmx.o \ + vmx_support.o \ + svm.o \ + svm_msr.o \ + npt.o \ + vmcb.o \ + svm_support.o \ + amdv.o \ + sol_iommu.o \ + sol_ppt.o \ + gipt.o \ + vmm_sol_vm.o \ + vmm_sol_glue.o \ + vmm_sol_ept.o \ + vmm_sol_rvi.o \ + vmm_support.o \ + vmm_zsd.o + +VIONA_OBJS += viona.o # # Build up defines and paths. 
diff --git a/usr/src/uts/i86pc/Makefile.i86pc b/usr/src/uts/i86pc/Makefile.i86pc index f5021ec738..b66b0ca2da 100644 --- a/usr/src/uts/i86pc/Makefile.i86pc +++ b/usr/src/uts/i86pc/Makefile.i86pc @@ -246,6 +246,7 @@ DRV_KMODS += dr DRV_KMODS += ioat DRV_KMODS += fipe DRV_KMODS += imc imcstub +DRV_KMODS += vmm DRV_KMODS += cpudrv diff --git a/usr/src/uts/i86pc/Makefile.rules b/usr/src/uts/i86pc/Makefile.rules index e4f2fee0a0..3d3c8131c1 100644 --- a/usr/src/uts/i86pc/Makefile.rules +++ b/usr/src/uts/i86pc/Makefile.rules @@ -225,6 +225,35 @@ $(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/dboot/%.c $(COMPILE.c) -o $@ $< $(CTFCONVERT_O) +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/amd/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/intel/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/io/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/intel/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/vmm/amd/%.s + $(COMPILE.s) -o $@ $< + +$(OBJS_DIR)/%.o: $(UTSBASE)/i86pc/io/viona/%.c + $(COMPILE.c) -o $@ $< + $(CTFCONVERT_O) + # # dboot stuff is always 32 bit, linked to run with phys_addr == virt_addr # diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c index 40bdd80a6e..2371a2f3ae 100644 --- a/usr/src/uts/i86pc/io/viona/viona.c +++ b/usr/src/uts/i86pc/io/viona/viona.c @@ -34,6 +34,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include <sys/conf.h> @@ -194,8 +195,8 @@ static void *viona_state; static dev_info_t *viona_dip; static id_space_t *viona_minor_ids; /* - * copy tx mbufs from virtio ring to avoid necessitating a wait - * for packet transmission to free resources. + * copy tx mbufs from virtio ring to avoid necessitating a wait for packet + * transmission to free resources. */ static boolean_t copy_tx_mblks = B_TRUE; @@ -914,7 +915,7 @@ viona_ioc_tx_intr_clear(viona_link_t *link) static int vq_popchain(viona_link_t *link, viona_vring_hqueue_t *hq, struct iovec *iov, -int n_iov, uint16_t *cookie) + int n_iov, uint16_t *cookie) { int i; int ndesc, nindir; @@ -1139,10 +1140,12 @@ viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp, size_t mblklen; int n, i = 0; uint16_t cookie; - struct virtio_net_hdr *vrx; - struct virtio_net_mrgrxhdr *vmrgrx; + struct virtio_net_hdr *vrx = NULL; + struct virtio_net_mrgrxhdr *vmrgrx = NULL; +#if notyet mblk_t *ml; - caddr_t buf; +#endif + caddr_t buf = NULL; int total_len = 0; int copied_buf = 0; int num_bufs = 0; @@ -1312,8 +1315,10 @@ viona_desb_free(viona_desb_t *dp) { viona_link_t *link; viona_vring_hqueue_t *hq; +#if notyet struct virtio_used *vu; int uidx; +#endif uint_t ref; ref = atomic_dec_uint_nv(&dp->d_ref); diff --git a/usr/src/uts/i86pc/io/vmm/README.sync b/usr/src/uts/i86pc/io/vmm/README.sync new file mode 100644 index 0000000000..1cddfd829e --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/README.sync @@ -0,0 +1,18 @@ +The bhyve kernel module and its associated userland consumers have been updated +to the latest upstream FreeBSD sources as of: + + +commit 3b9cb80b242682690203709aaff4eafae41c138f +Author: jhb <jhb@FreeBSD.org> +Date: Mon Jun 3 23:17:35 2019 +0000 + + Emulate the AMD MSR_LS_CFG MSR used for various Ryzen errata. 
+ + Writes are ignored and reads always return zero. + + Submitted by: José Albornoz <jojo@eljojo.net> (write-only version) + Reviewed by: Patrick Mooney, cem + MFC after: 2 weeks + Differential Revision: https://reviews.freebsd.org/D19506 + +Which corresponds to SVN revision: 348592 diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdv.c b/usr/src/uts/i86pc/io/vmm/amd/amdv.c index 6b62daae6c..c34a1e897b 100644 --- a/usr/src/uts/i86pc/io/vmm/amd/amdv.c +++ b/usr/src/uts/i86pc/io/vmm/amd/amdv.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/amd/amdv.c 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,141 +38,18 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/amd/amdv.c 245678 2013-01-20 03:42:49Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> #include <sys/errno.h> -#include <sys/smp.h> #include <machine/vmm.h> -#ifdef __FreeBSD__ #include "io/iommu.h" -#endif - -static int -amdv_init(void) -{ - - printf("amdv_init: not implemented\n"); - return (ENXIO); -} - -static int -amdv_cleanup(void) -{ - - printf("amdv_cleanup: not implemented\n"); - return (ENXIO); -} - -static void * -amdv_vminit(struct vm *vm) -{ - - printf("amdv_vminit: not implemented\n"); - return (NULL); -} - -static int -amdv_vmrun(void *arg, int vcpu, register_t rip) -{ - - printf("amdv_vmrun: not implemented\n"); - return (ENXIO); -} - -static void -amdv_vmcleanup(void *arg) -{ - - printf("amdv_vmcleanup: not implemented\n"); - return; -} - -static int -amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, - vm_memattr_t attr, int prot, boolean_t spok) -{ - - printf("amdv_vmmmap_set: not implemented\n"); - return (EINVAL); -} - -static vm_paddr_t -amdv_vmmmap_get(void *arg, vm_paddr_t gpa) -{ - - printf("amdv_vmmmap_get: not implemented\n"); - return (EINVAL); -} - -static int -amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval) -{ - - printf("amdv_getreg: not implemented\n"); - return (EINVAL); -} - -static int -amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val) -{ - - printf("amdv_setreg: not implemented\n"); - return (EINVAL); -} - -static int -amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) -{ - - printf("amdv_get_desc: not implemented\n"); - return (EINVAL); -} - -static int -amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc) -{ - - printf("amdv_get_desc: not implemented\n"); - return (EINVAL); -} - -static int -amdv_getcap(void *arg, int vcpu, int type, int *retval) -{ - - printf("amdv_getcap: not implemented\n"); - return (EINVAL); -} - -static int -amdv_setcap(void *arg, int vcpu, int type, int val) -{ - - printf("amdv_setcap: not implemented\n"); - return (EINVAL); -} - -struct vmm_ops vmm_ops_amd = { - amdv_init, - amdv_cleanup, - amdv_vminit, - amdv_vmrun, - amdv_vmcleanup, - amdv_vmmmap_set, - amdv_vmmmap_get, - amdv_getreg, - amdv_setreg, - amdv_getdesc, - amdv_setdesc, - amdv_getcap, - amdv_setcap -}; static int amd_iommu_init(void) @@ -234,14 +113,14 @@ amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len) } static void -amd_iommu_add_device(void *domain, int bus, int 
slot, int func) +amd_iommu_add_device(void *domain, uint16_t rid) { printf("amd_iommu_add_device: not implemented\n"); } static void -amd_iommu_remove_device(void *domain, int bus, int slot, int func) +amd_iommu_remove_device(void *domain, uint16_t rid) { printf("amd_iommu_remove_device: not implemented\n"); @@ -254,7 +133,6 @@ amd_iommu_invalidate_tlb(void *domain) printf("amd_iommu_invalidate_tlb: not implemented\n"); } -#ifdef __FreeBSD__ struct iommu_ops iommu_ops_amd = { amd_iommu_init, amd_iommu_cleanup, @@ -268,4 +146,3 @@ struct iommu_ops iommu_ops_amd = { amd_iommu_remove_device, amd_iommu_invalidate_tlb, }; -#endif diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c new file mode 100644 index 0000000000..f6b6e60363 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_hw.c @@ -0,0 +1,1461 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/rman.h> +#include <sys/smp.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <dev/pci/pcivar.h> +#include <dev/pci/pcireg.h> + +#include <machine/resource.h> +#include <machine/vmm.h> +#include <machine/pmap.h> +#include <machine/vmparam.h> +#include <machine/pci_cfgreg.h> + +#include "pcib_if.h" + +#include "io/iommu.h" +#include "amdvi_priv.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, amdvi, CTLFLAG_RW, NULL, NULL); + +#define MOD_INC(a, s, m) (((a) + (s)) % ((m) * (s))) +#define MOD_DEC(a, s, m) (((a) - (s)) % ((m) * (s))) + +/* Print RID or device ID in PCI string format. 
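+ * (A RID packs bus[15:8], device[7:3] and function[2:0], so e.g. RID
+ * 0x0210 prints as 2.2.0.)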
*/ +#define RID2PCI_STR(d) PCI_RID2BUS(d), PCI_RID2SLOT(d), PCI_RID2FUNC(d) + +static void amdvi_dump_cmds(struct amdvi_softc *softc); +static void amdvi_print_dev_cap(struct amdvi_softc *softc); + +MALLOC_DEFINE(M_AMDVI, "amdvi", "amdvi"); + +extern device_t *ivhd_devs; + +extern int ivhd_count; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, count, CTLFLAG_RDTUN, &ivhd_count, + 0, NULL); + +static int amdvi_enable_user = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, enable, CTLFLAG_RDTUN, + &amdvi_enable_user, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi_enable", &amdvi_enable_user); + +#ifdef AMDVI_ATS_ENABLE +/* XXX: ATS is not tested. */ +static int amdvi_enable_iotlb = 1; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, iotlb_enabled, CTLFLAG_RDTUN, + &amdvi_enable_iotlb, 0, NULL); +TUNABLE_INT("hw.vmm.enable_iotlb", &amdvi_enable_iotlb); +#endif + +static int amdvi_host_ptp = 1; /* Use page tables for host. */ +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, host_ptp, CTLFLAG_RDTUN, + &amdvi_host_ptp, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.host_ptp", &amdvi_host_ptp); + +/* Page table level used <= supported by h/w[v1=7]. */ +static int amdvi_ptp_level = 4; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, ptp_level, CTLFLAG_RDTUN, + &amdvi_ptp_level, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.ptp_level", &amdvi_ptp_level); + +/* Disable fault event reporting. */ +static int amdvi_disable_io_fault = 0; +SYSCTL_INT(_hw_vmm_amdvi, OID_AUTO, disable_io_fault, CTLFLAG_RDTUN, + &amdvi_disable_io_fault, 0, NULL); +TUNABLE_INT("hw.vmm.amdvi.disable_io_fault", &amdvi_disable_io_fault); + +static uint32_t amdvi_dom_id = 0; /* 0 is reserved for host. */ +SYSCTL_UINT(_hw_vmm_amdvi, OID_AUTO, domain_id, CTLFLAG_RD, + &amdvi_dom_id, 0, NULL); +/* + * Device table entry. + * Bus(256) x Dev(32) x Fun(8) x DTE(256 bits or 32 bytes). + * = 256 * 2 * PAGE_SIZE. + */ +static struct amdvi_dte amdvi_dte[PCI_NUM_DEV_MAX] __aligned(PAGE_SIZE); +CTASSERT(PCI_NUM_DEV_MAX == 0x10000); +CTASSERT(sizeof(amdvi_dte) == 0x200000); + +static SLIST_HEAD (, amdvi_domain) dom_head; + +static inline uint32_t +amdvi_pci_read(struct amdvi_softc *softc, int off) +{ + + return (pci_cfgregread(PCI_RID2BUS(softc->pci_rid), + PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid), + off, 4)); +} + +#ifdef AMDVI_ATS_ENABLE +/* XXX: Should be in pci.c */ +/* + * Check if device has ATS capability and its enabled. + * If ATS is absent or disabled, return (-1), otherwise ATS + * queue length. + */ +static int +amdvi_find_ats_qlen(uint16_t devid) +{ + device_t dev; + uint32_t off, cap; + int qlen = -1; + + dev = pci_find_bsf(PCI_RID2BUS(devid), PCI_RID2SLOT(devid), + PCI_RID2FUNC(devid)); + + if (!dev) { + return (-1); + } +#define PCIM_ATS_EN BIT(31) + + if (pci_find_extcap(dev, PCIZ_ATS, &off) == 0) { + cap = pci_read_config(dev, off + 4, 4); + qlen = (cap & 0x1F); + qlen = qlen ? qlen : 32; + printf("AMD-Vi: PCI device %d.%d.%d ATS %s qlen=%d\n", + RID2PCI_STR(devid), + (cap & PCIM_ATS_EN) ? "enabled" : "Disabled", + qlen); + qlen = (cap & PCIM_ATS_EN) ? qlen : -1; + } + + return (qlen); +} + +/* + * Check if an endpoint device support device IOTLB or ATS. 
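+ * (An ATS-capable endpoint caches DMA translations locally, so the
+ * IOMMU must be able to invalidate the device IOTLB explicitly; the
+ * PCI capability is cross-checked against the IVHD configuration, and
+ * the PCI setting wins on a mismatch.)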
+ */ +static inline bool +amdvi_dev_support_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct ivhd_dev_cfg *cfg; + int qlen, i; + bool pci_ats, ivhd_ats; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) + return (false); + + KASSERT(softc, ("softc is NULL")); + cfg = softc->dev_cfg; + + ivhd_ats = false; + for (i = 0; i < softc->dev_cfg_cnt; i++) { + if ((cfg->start_id <= devid) && (cfg->end_id >= devid)) { + ivhd_ats = cfg->enable_ats; + break; + } + cfg++; + } + + pci_ats = (qlen < 0) ? false : true; + if (pci_ats != ivhd_ats) + device_printf(softc->dev, + "BIOS bug: mismatch in ATS setting for %d.%d.%d," + "ATS inv qlen = %d\n", RID2PCI_STR(devid), qlen); + + /* Ignore IVRS setting and respect PCI setting. */ + return (pci_ats); +} +#endif + +/* Enable IOTLB support for IOMMU if its supported. */ +static inline void +amdvi_hw_enable_iotlb(struct amdvi_softc *softc) +{ +#ifndef AMDVI_ATS_ENABLE + softc->iotlb = false; +#else + bool supported; + + supported = (softc->ivhd_flag & IVHD_FLAG_IOTLB) ? true : false; + + if (softc->pci_cap & AMDVI_PCI_CAP_IOTLB) { + if (!supported) + device_printf(softc->dev, "IOTLB disabled by BIOS.\n"); + + if (supported && !amdvi_enable_iotlb) { + device_printf(softc->dev, "IOTLB disabled by user.\n"); + supported = false; + } + } else + supported = false; + + softc->iotlb = supported; + +#endif +} + +static int +amdvi_init_cmd(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl = softc->ctrl; + + ctrl->cmd.len = 8; /* Use 256 command buffer entries. */ + softc->cmd_max = 1 << ctrl->cmd.len; + + softc->cmd = malloc(sizeof(struct amdvi_cmd) * + softc->cmd_max, M_AMDVI, M_WAITOK | M_ZERO); + + if ((uintptr_t)softc->cmd & PAGE_MASK) + panic("AMDVi: Command buffer not aligned on page boundary."); + + ctrl->cmd.base = vtophys(softc->cmd) / PAGE_SIZE; + /* + * XXX: Reset the h/w pointers in case IOMMU is restarting, + * h/w doesn't clear these pointers based on empirical data. + */ + ctrl->cmd_tail = 0; + ctrl->cmd_head = 0; + + return (0); +} + +/* + * Note: Update tail pointer after we have written the command since tail + * pointer update cause h/w to execute new commands, see section 3.3 + * of AMD IOMMU spec ver 2.0. + */ +/* Get the command tail pointer w/o updating it. */ +static struct amdvi_cmd * +amdvi_get_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *tail; + + KASSERT(softc, ("softc is NULL")); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + tail = (struct amdvi_cmd *)((uint8_t *)softc->cmd + + ctrl->cmd_tail); + + return (tail); +} + +/* + * Update the command tail pointer which will start command execution. + */ +static void +amdvi_update_cmd_tail(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int size; + + size = sizeof(struct amdvi_cmd); + KASSERT(softc->cmd != NULL, ("cmd is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + + ctrl->cmd_tail = MOD_INC(ctrl->cmd_tail, size, softc->cmd_max); + softc->total_cmd++; + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "cmd_tail: %s Tail:0x%x, Head:0x%x.\n", + ctrl->cmd_tail, + ctrl->cmd_head); +#endif + +} + +/* + * Various commands supported by IOMMU. + */ + +/* Completion wait command. 
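+ * (COMPLETION_WAIT asks the IOMMU to store a caller-supplied 64-bit
+ * value to a given system address once all prior commands have
+ * completed, so software can poll that location as a fence; see
+ * amdvi_cmp_wait() below.)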
*/ +static void +amdvi_cmd_cmp(struct amdvi_softc *softc, const uint64_t data) +{ + struct amdvi_cmd *cmd; + uint64_t pa; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + pa = vtophys(&softc->cmp_data); + cmd->opcode = AMDVI_CMP_WAIT_OPCODE; + cmd->word0 = (pa & 0xFFFFFFF8) | + (AMDVI_CMP_WAIT_STORE); + //(AMDVI_CMP_WAIT_FLUSH | AMDVI_CMP_WAIT_STORE); + cmd->word1 = (pa >> 32) & 0xFFFFF; + cmd->addr = data; + + amdvi_update_cmd_tail(softc); +} + +/* Invalidate device table entry. */ +static void +amdvi_cmd_inv_dte(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_DTE_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidated DTE:0x%x\n", devid); +#endif +} + +/* Invalidate IOMMU page, use for invalidation of domain. */ +static void +amdvi_cmd_inv_iommu_pages(struct amdvi_softc *softc, uint16_t domain_id, + uint64_t addr, bool guest_nested, + bool pde, bool page) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + + cmd->opcode = AMDVI_INVD_PAGE_OPCODE; + cmd->word1 = domain_id; + /* + * Invalidate all addresses for this domain. + */ + cmd->addr = addr; + cmd->addr |= pde ? AMDVI_INVD_PAGE_PDE : 0; + cmd->addr |= page ? AMDVI_INVD_PAGE_S : 0; + + amdvi_update_cmd_tail(softc); +} + +#ifdef AMDVI_ATS_ENABLE +/* Invalidate device IOTLB. */ +static void +amdvi_cmd_inv_iotlb(struct amdvi_softc *softc, uint16_t devid) +{ + struct amdvi_cmd *cmd; + int qlen; + + if (!softc->iotlb) + return; + + qlen = amdvi_find_ats_qlen(devid); + if (qlen < 0) { + panic("AMDVI: Invalid ATS qlen(%d) for device %d.%d.%d\n", + qlen, RID2PCI_STR(devid)); + } + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate IOTLB devID 0x%x" + " Qlen:%d\n", devid, qlen); +#endif + cmd->opcode = AMDVI_INVD_IOTLB_OPCODE; + cmd->word0 = devid; + cmd->word1 = qlen; + cmd->addr = AMDVI_INVD_IOTLB_ALL_ADDR | + AMDVI_INVD_IOTLB_S; + amdvi_update_cmd_tail(softc); +} +#endif + +#ifdef notyet /* For Interrupt Remap. */ +static void +amdvi_cmd_inv_intr_map(struct amdvi_softc *softc, + uint16_t devid) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + cmd->opcode = AMDVI_INVD_INTR_OPCODE; + cmd->word0 = devid; + amdvi_update_cmd_tail(softc); +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate INTR map of devID 0x%x\n", devid); +#endif +} +#endif + +/* Invalidate domain using INVALIDATE_IOMMU_PAGES command. */ +static void +amdvi_inv_domain(struct amdvi_softc *softc, uint16_t domain_id) +{ + struct amdvi_cmd *cmd; + + cmd = amdvi_get_cmd_tail(softc); + KASSERT(cmd != NULL, ("Cmd is NULL")); + + /* + * See section 3.3.3 of IOMMU spec rev 2.0, software note + * for invalidating domain. + */ + amdvi_cmd_inv_iommu_pages(softc, domain_id, AMDVI_INVD_PAGE_ALL_ADDR, + false, true, true); + +#ifdef AMDVI_DEBUG_CMD + device_printf(softc->dev, "Invalidate domain:0x%x\n", domain_id); + +#endif +} + +static bool +amdvi_cmp_wait(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + const uint64_t VERIFY = 0xA5A5; + volatile uint64_t *read; + int i; + bool status; + + ctrl = softc->ctrl; + read = &softc->cmp_data; + *read = 0; + amdvi_cmd_cmp(softc, VERIFY); + /* Wait for h/w to update completion data. 
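+	 * (Bounded poll: up to 100 iterations of DELAY(1000), i.e. ~100 ms.)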
*/ + for (i = 0; i < 100 && (*read != VERIFY); i++) { + DELAY(1000); /* 1 ms */ + } + status = (VERIFY == softc->cmp_data) ? true : false; + +#ifdef AMDVI_DEBUG_CMD + if (status) + device_printf(softc->dev, "CMD completion DONE Tail:0x%x, " + "Head:0x%x, loop:%d.\n", ctrl->cmd_tail, + ctrl->cmd_head, loop); +#endif + return (status); +} + +static void +amdvi_wait(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + int i; + + KASSERT(softc, ("softc is NULL")); + + ctrl = softc->ctrl; + KASSERT(ctrl != NULL, ("ctrl is NULL")); + /* Don't wait if h/w is not enabled. */ + if ((ctrl->control & AMDVI_CTRL_EN) == 0) + return; + + for (i = 0; i < 10; i++) { + if (amdvi_cmp_wait(softc)) + return; + } + + device_printf(softc->dev, "Error: completion failed" + " tail:0x%x, head:0x%x.\n", + ctrl->cmd_tail, ctrl->cmd_head); + amdvi_dump_cmds(softc); +} + +static void +amdvi_dump_cmds(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_cmd *cmd; + int off, i; + + ctrl = softc->ctrl; + device_printf(softc->dev, "Dump all the commands:\n"); + /* + * If h/w is stuck in completion, it is the previous command, + * start dumping from previous command onward. + */ + off = MOD_DEC(ctrl->cmd_head, sizeof(struct amdvi_cmd), + softc->cmd_max); + for (i = 0; off != ctrl->cmd_tail && + i < softc->cmd_max; i++) { + cmd = (struct amdvi_cmd *)((uint8_t *)softc->cmd + off); + printf(" [CMD%d, off:0x%x] opcode= 0x%x 0x%x" + " 0x%x 0x%lx\n", i, off, cmd->opcode, + cmd->word0, cmd->word1, cmd->addr); + off = (off + sizeof(struct amdvi_cmd)) % + (softc->cmd_max * sizeof(struct amdvi_cmd)); + } +} + +static int +amdvi_init_event(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + + ctrl = softc->ctrl; + ctrl->event.len = 8; + softc->event_max = 1 << ctrl->event.len; + softc->event = malloc(sizeof(struct amdvi_event) * + softc->event_max, M_AMDVI, M_WAITOK | M_ZERO); + if ((uintptr_t)softc->event & PAGE_MASK) { + device_printf(softc->dev, "Event buffer not aligned on page."); + return (false); + } + ctrl->event.base = vtophys(softc->event) / PAGE_SIZE; + + /* Reset the pointers. 
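+	 * (Same empirical caveat as the command ring above: the hardware
+	 * does not reliably clear these on an IOMMU restart.)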
*/ + ctrl->evt_head = 0; + ctrl->evt_tail = 0; + + return (0); +} + +static inline void +amdvi_decode_evt_flag(uint16_t flag) +{ + + flag &= AMDVI_EVENT_FLAG_MASK; + printf(" 0x%b]\n", flag, + "\020" + "\001GN" + "\002NX" + "\003US" + "\004I" + "\005PR" + "\006RW" + "\007PE" + "\010RZ" + "\011TR" + ); +} + +/* See section 2.5.4 of AMD IOMMU spec ver 2.62.*/ +static inline void +amdvi_decode_evt_flag_type(uint8_t type) +{ + + switch (AMDVI_EVENT_FLAG_TYPE(type)) { + case 0: + printf("RSVD\n"); + break; + case 1: + printf("Master Abort\n"); + break; + case 2: + printf("Target Abort\n"); + break; + case 3: + printf("Data Err\n"); + break; + default: + break; + } +} + +static void +amdvi_decode_inv_dte_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", + devid, domid, addr); + amdvi_decode_evt_flag(flag); +} + +static void +amdvi_decode_pf_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[IO_PAGE_FAULT EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", + devid, domid, addr); + amdvi_decode_evt_flag(flag); +} + +static void +amdvi_decode_dte_hwerr_evt(uint16_t devid, uint16_t domid, + uint64_t addr, uint16_t flag) +{ + + printf("\t[DEV_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", devid, domid, addr); + amdvi_decode_evt_flag(flag); + amdvi_decode_evt_flag_type(flag); +} + +static void +amdvi_decode_page_hwerr_evt(uint16_t devid, uint16_t domid, uint64_t addr, + uint16_t flag) +{ + + printf("\t[PAGE_TAB_HW_ERR EVT: devId:0x%x DomId:0x%x" + " Addr:0x%lx", devid, domid, addr); + amdvi_decode_evt_flag(flag); + amdvi_decode_evt_flag_type(AMDVI_EVENT_FLAG_TYPE(flag)); +} + +static void +amdvi_decode_evt(struct amdvi_event *evt) +{ + struct amdvi_cmd *cmd; + + switch (evt->opcode) { + case AMDVI_EVENT_INVALID_DTE: + amdvi_decode_inv_dte_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_PFAULT: + amdvi_decode_pf_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_DTE_HW_ERROR: + amdvi_decode_dte_hwerr_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_PAGE_HW_ERROR: + amdvi_decode_page_hwerr_evt(evt->devid, evt->pasid_domid, + evt->addr, evt->flag); + break; + + case AMDVI_EVENT_ILLEGAL_CMD: + /* FALL THROUGH */ + case AMDVI_EVENT_CMD_HW_ERROR: + printf("\t[%s EVT]\n", (evt->opcode == AMDVI_EVENT_ILLEGAL_CMD) ? 
+ "ILLEGAL CMD" : "CMD HW ERR"); + cmd = (struct amdvi_cmd *)PHYS_TO_DMAP(evt->addr); + printf("\tCMD opcode= 0x%x 0x%x 0x%x 0x%lx\n", + cmd->opcode, cmd->word0, cmd->word1, cmd->addr); + break; + + case AMDVI_EVENT_IOTLB_TIMEOUT: + printf("\t[IOTLB_INV_TIMEOUT devid:0x%x addr:0x%lx]\n", + evt->devid, evt->addr); + break; + + case AMDVI_EVENT_INVALID_DTE_REQ: + printf("\t[INV_DTE devid:0x%x addr:0x%lx type:0x%x tr:%d]\n", + evt->devid, evt->addr, evt->flag >> 9, + (evt->flag >> 8) & 1); + break; + + case AMDVI_EVENT_INVALID_PPR_REQ: + case AMDVI_EVENT_COUNTER_ZERO: + printf("AMD-Vi: v2 events.\n"); + break; + + default: + printf("Unsupported AMD-Vi event:%d\n", evt->opcode); + } +} + +static void +amdvi_print_events(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_event *event; + int i, size; + + ctrl = softc->ctrl; + size = sizeof(struct amdvi_event); + for (i = 0; i < softc->event_max; i++) { + event = &softc->event[ctrl->evt_head / size]; + if (!event->opcode) + break; + device_printf(softc->dev, "\t[Event%d: Head:0x%x Tail:0x%x]\n", + i, ctrl->evt_head, ctrl->evt_tail); + amdvi_decode_evt(event); + ctrl->evt_head = MOD_INC(ctrl->evt_head, size, + softc->event_max); + } +} + +static int +amdvi_init_dte(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + + ctrl = softc->ctrl; + ctrl->dte.base = vtophys(amdvi_dte) / PAGE_SIZE; + ctrl->dte.size = 0x1FF; /* 2MB device table. */ + + return (0); +} + +/* + * Not all capabilities of IOMMU are available in ACPI IVHD flag + * or EFR entry, read directly from device. + */ +static int +amdvi_print_pci_cap(device_t dev) +{ + struct amdvi_softc *softc; + uint32_t off, cap; + + + softc = device_get_softc(dev); + off = softc->cap_off; + + /* + * Section 3.7.1 of IOMMU sepc rev 2.0. + * Read capability from device. + */ + cap = amdvi_pci_read(softc, off); + + /* Make sure capability type[18:16] is 3. 
*/ + KASSERT((((cap >> 16) & 0x7) == 0x3), + ("Not a IOMMU capability 0x%x@0x%x", cap, off)); + + softc->pci_cap = cap >> 24; + device_printf(softc->dev, "PCI cap 0x%x@0x%x feature:%b\n", + cap, off, softc->pci_cap, + "\20\1IOTLB\2HT\3NPCache\4EFR\5CapExt"); + + return (0); +} + +static void +amdvi_event_intr(void *arg) +{ + struct amdvi_softc *softc; + struct amdvi_ctrl *ctrl; + + softc = (struct amdvi_softc *)arg; + ctrl = softc->ctrl; + device_printf(softc->dev, "EVT INTR %ld Status:0x%x" + " EVT Head:0x%x Tail:0x%x]\n", softc->event_intr_cnt++, + ctrl->status, ctrl->evt_head, ctrl->evt_tail); + printf(" [CMD Total 0x%lx] Tail:0x%x, Head:0x%x.\n", + softc->total_cmd, ctrl->cmd_tail, ctrl->cmd_head); + + amdvi_print_events(softc); + ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; +} + +static void +amdvi_free_evt_intr_res(device_t dev) +{ + + struct amdvi_softc *softc; + + softc = device_get_softc(dev); + if (softc->event_tag != NULL) { + bus_teardown_intr(dev, softc->event_res, softc->event_tag); + } + if (softc->event_res != NULL) { + bus_release_resource(dev, SYS_RES_IRQ, softc->event_rid, + softc->event_res); + } + bus_delete_resource(dev, SYS_RES_IRQ, softc->event_rid); + PCIB_RELEASE_MSI(device_get_parent(device_get_parent(dev)), + dev, 1, &softc->event_irq); +} + +static bool +amdvi_alloc_intr_resources(struct amdvi_softc *softc) +{ + struct amdvi_ctrl *ctrl; + device_t dev, pcib; + device_t mmio_dev; + uint64_t msi_addr; + uint32_t msi_data; + int err; + + dev = softc->dev; + pcib = device_get_parent(device_get_parent(dev)); + mmio_dev = pci_find_bsf(PCI_RID2BUS(softc->pci_rid), + PCI_RID2SLOT(softc->pci_rid), PCI_RID2FUNC(softc->pci_rid)); + if (device_is_attached(mmio_dev)) { + device_printf(dev, + "warning: IOMMU device is claimed by another driver %s\n", + device_get_driver(mmio_dev)->name); + } + + softc->event_irq = -1; + softc->event_rid = 0; + + /* + * Section 3.7.1 of IOMMU rev 2.0. With MSI, there is only one + * interrupt. XXX: Enable MSI/X support. + */ + err = PCIB_ALLOC_MSI(pcib, dev, 1, 1, &softc->event_irq); + if (err) { + device_printf(dev, + "Couldn't find event MSI IRQ resource.\n"); + return (ENOENT); + } + + err = bus_set_resource(dev, SYS_RES_IRQ, softc->event_rid, + softc->event_irq, 1); + if (err) { + device_printf(dev, "Couldn't set event MSI resource.\n"); + return (ENXIO); + } + + softc->event_res = bus_alloc_resource_any(dev, SYS_RES_IRQ, + &softc->event_rid, RF_ACTIVE); + if (!softc->event_res) { + device_printf(dev, + "Unable to allocate event INTR resource.\n"); + return (ENOMEM); + } + + if (bus_setup_intr(dev, softc->event_res, + INTR_TYPE_MISC | INTR_MPSAFE, NULL, amdvi_event_intr, + softc, &softc->event_tag)) { + device_printf(dev, "Fail to setup event intr\n"); + bus_release_resource(softc->dev, SYS_RES_IRQ, + softc->event_rid, softc->event_res); + softc->event_res = NULL; + return (ENXIO); + } + + bus_describe_intr(dev, softc->event_res, softc->event_tag, + "fault"); + + err = PCIB_MAP_MSI(pcib, dev, softc->event_irq, &msi_addr, + &msi_data); + if (err) { + device_printf(dev, + "Event interrupt config failed, err=%d.\n", + err); + amdvi_free_evt_intr_res(softc->dev); + return (err); + } + + /* Clear interrupt status bits. */ + ctrl = softc->ctrl; + ctrl->status &= AMDVI_STATUS_EV_OF | AMDVI_STATUS_EV_INTR; + + /* Now enable MSI interrupt. 
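+	 * (Programs the device with the address/data pair obtained from
+	 * PCIB_MAP_MSI() above.)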
*/ + pci_enable_msi(mmio_dev, msi_addr, msi_data); + return (0); +} + + +static void +amdvi_print_dev_cap(struct amdvi_softc *softc) +{ + struct ivhd_dev_cfg *cfg; + int i; + + cfg = softc->dev_cfg; + for (i = 0; i < softc->dev_cfg_cnt; i++) { + device_printf(softc->dev, "device [0x%x - 0x%x]" + "config:%b%s\n", cfg->start_id, cfg->end_id, + cfg->data, + "\020\001INIT\002ExtInt\003NMI" + "\007LINT0\008LINT1", + cfg->enable_ats ? "ATS enabled" : ""); + cfg++; + } +} + +static int +amdvi_handle_sysctl(SYSCTL_HANDLER_ARGS) +{ + struct amdvi_softc *softc; + int result, type, error = 0; + + softc = (struct amdvi_softc *)arg1; + type = arg2; + + switch (type) { + case 0: + result = softc->ctrl->cmd_head; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 1: + result = softc->ctrl->cmd_tail; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 2: + result = softc->ctrl->evt_head; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + case 3: + result = softc->ctrl->evt_tail; + error = sysctl_handle_int(oidp, &result, 0, + req); + break; + + default: + device_printf(softc->dev, "Unknown sysctl:%d\n", type); + } + + return (error); +} + +static void +amdvi_add_sysctl(struct amdvi_softc *softc) +{ + struct sysctl_oid_list *child; + struct sysctl_ctx_list *ctx; + device_t dev; + + dev = softc->dev; + ctx = device_get_sysctl_ctx(dev); + child = SYSCTL_CHILDREN(device_get_sysctl_tree(dev)); + + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "event_intr_count", CTLFLAG_RD, + &softc->event_intr_cnt, "Event interrupt count"); + SYSCTL_ADD_ULONG(ctx, child, OID_AUTO, "command_count", CTLFLAG_RD, + &softc->total_cmd, "Command submitted count"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "pci_rid", CTLFLAG_RD, + &softc->pci_rid, 0, "IOMMU RID"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "start_dev_rid", CTLFLAG_RD, + &softc->start_dev_rid, 0, "Start of device under this IOMMU"); + SYSCTL_ADD_U16(ctx, child, OID_AUTO, "end_dev_rid", CTLFLAG_RD, + &softc->end_dev_rid, 0, "End of device under this IOMMU"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_head", + CTLTYPE_UINT | CTLFLAG_RD, softc, 0, + amdvi_handle_sysctl, "IU", "Command head"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "command_tail", + CTLTYPE_UINT | CTLFLAG_RD, softc, 1, + amdvi_handle_sysctl, "IU", "Command tail"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_head", + CTLTYPE_UINT | CTLFLAG_RD, softc, 2, + amdvi_handle_sysctl, "IU", "Command head"); + SYSCTL_ADD_PROC(ctx, child, OID_AUTO, "event_tail", + CTLTYPE_UINT | CTLFLAG_RD, softc, 3, + amdvi_handle_sysctl, "IU", "Command tail"); +} + +int +amdvi_setup_hw(struct amdvi_softc *softc) +{ + device_t dev; + int status; + + dev = softc->dev; + + amdvi_hw_enable_iotlb(softc); + + amdvi_print_dev_cap(softc); + + if ((status = amdvi_print_pci_cap(dev)) != 0) { + device_printf(dev, "PCI capability.\n"); + return (status); + } + if ((status = amdvi_init_cmd(softc)) != 0) { + device_printf(dev, "Couldn't configure command buffer.\n"); + return (status); + } + if ((status = amdvi_init_event(softc)) != 0) { + device_printf(dev, "Couldn't configure event buffer.\n"); + return (status); + } + if ((status = amdvi_init_dte(softc)) != 0) { + device_printf(dev, "Couldn't configure device table.\n"); + return (status); + } + if ((status = amdvi_alloc_intr_resources(softc)) != 0) { + return (status); + } + amdvi_add_sysctl(softc); + return (0); +} + +int +amdvi_teardown_hw(struct amdvi_softc *softc) +{ + device_t dev; + + dev = softc->dev; + + /* + * Called after disable, h/w 
is stopped by now, free all the resources. + */ + amdvi_free_evt_intr_res(dev); + + if (softc->cmd) + free(softc->cmd, M_AMDVI); + + if (softc->event) + free(softc->event, M_AMDVI); + + return (0); +} + +/*********** bhyve interfaces *********************/ +static int +amdvi_init(void) +{ + if (!ivhd_count) { + return (EIO); + } + if (!amdvi_enable_user && ivhd_count) { + printf("bhyve: Found %d AMD-Vi/IOMMU device(s), " + "use hw.vmm.amdvi.enable=1 to enable pass-through.\n", + ivhd_count); + return (EINVAL); + } + return (0); +} + +static void +amdvi_cleanup(void) +{ + /* Nothing. */ +} + +static uint16_t +amdvi_domainId(void) +{ + + /* + * If we hit maximum domain limit, rollover leaving host + * domain(0). + * XXX: make sure that this domain is not used. + */ + if (amdvi_dom_id == AMDVI_MAX_DOMAIN) + amdvi_dom_id = 1; + + return ((uint16_t)amdvi_dom_id++); +} + +static void +amdvi_do_inv_domain(uint16_t domain_id, bool create) +{ + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL")); + /* + * If not present pages are cached, invalidate page after + * creating domain. + */ +#if 0 + if (create && ((softc->pci_cap & AMDVI_PCI_CAP_NPCACHE) == 0)) + continue; +#endif + amdvi_inv_domain(softc, domain_id); + amdvi_wait(softc); + } +} + +static void * +amdvi_create_domain(vm_paddr_t maxaddr) +{ + struct amdvi_domain *dom; + + dom = malloc(sizeof(struct amdvi_domain), M_AMDVI, M_ZERO | M_WAITOK); + dom->id = amdvi_domainId(); + //dom->maxaddr = maxaddr; +#ifdef AMDVI_DEBUG_CMD + printf("Created domain #%d\n", dom->id); +#endif + /* + * Host domain(#0) don't create translation table. + */ + if (dom->id || amdvi_host_ptp) + dom->ptp = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); + + dom->ptp_level = amdvi_ptp_level; + + amdvi_do_inv_domain(dom->id, true); + SLIST_INSERT_HEAD(&dom_head, dom, next); + + return (dom); +} + +static void +amdvi_free_ptp(uint64_t *ptp, int level) +{ + int i; + + if (level < 1) + return; + + for (i = 0; i < NPTEPG ; i++) { + if ((ptp[i] & AMDVI_PT_PRESENT) == 0) + continue; + /* XXX: Add super-page or PTE mapping > 4KB. */ +#ifdef notyet + /* Super-page mapping. 
*/ + if (AMDVI_PD_SUPER(ptp[i])) + continue; +#endif + + amdvi_free_ptp((uint64_t *)PHYS_TO_DMAP(ptp[i] + & AMDVI_PT_MASK), level - 1); + + } + + free(ptp, M_AMDVI); +} + +static void +amdvi_destroy_domain(void *arg) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain, ("domain is NULL")); +#ifdef AMDVI_DEBUG_CMD + printf("Destroying domain %d\n", domain->id); +#endif + if (domain->ptp) + amdvi_free_ptp(domain->ptp, domain->ptp_level); + + amdvi_do_inv_domain(domain->id, false); + SLIST_REMOVE(&dom_head, domain, amdvi_domain, next); + free(domain, M_AMDVI); +} + +static uint64_t +amdvi_set_pt(uint64_t *pt, int level, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t pg_size, bool create) +{ + uint64_t *page, pa; + int shift, index; + const int PT_SHIFT = 9; + const int PT_INDEX_MASK = (1 << PT_SHIFT) - 1; /* Based on PT_SHIFT */ + + if (!pg_size) + return (0); + + if (hpa & (pg_size - 1)) { + printf("HPA is not size aligned.\n"); + return (0); + } + if (gpa & (pg_size - 1)) { + printf("HPA is not size aligned.\n"); + return (0); + } + shift = PML4SHIFT; + while ((shift > PAGE_SHIFT) && (pg_size < (1UL << shift))) { + index = (gpa >> shift) & PT_INDEX_MASK; + + if ((pt[index] == 0) && create) { + page = malloc(PAGE_SIZE, M_AMDVI, M_WAITOK | M_ZERO); + pa = vtophys(page); + pt[index] = pa | AMDVI_PT_PRESENT | AMDVI_PT_RW | + ((level - 1) << AMDVI_PD_LEVEL_SHIFT); + } +#ifdef AMDVI_DEBUG_PTE + if ((gpa % 0x1000000) == 0) + printf("[level%d, shift = %d]PTE:0x%lx\n", + level, shift, pt[index]); +#endif +#define PTE2PA(x) ((uint64_t)(x) & AMDVI_PT_MASK) + pa = PTE2PA(pt[index]); + pt = (uint64_t *)PHYS_TO_DMAP(pa); + shift -= PT_SHIFT; + level--; + } + + /* Leaf entry. */ + index = (gpa >> shift) & PT_INDEX_MASK; + + if (create) { + pt[index] = hpa | AMDVI_PT_RW | AMDVI_PT_PRESENT; + } else + pt[index] = 0; + +#ifdef AMDVI_DEBUG_PTE + if ((gpa % 0x1000000) == 0) + printf("[Last level%d, shift = %d]PTE:0x%lx\n", + level, shift, pt[index]); +#endif + return (1ULL << shift); +} + +static uint64_t +amdvi_update_mapping(struct amdvi_domain *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t size, bool create) +{ + uint64_t mapped, *ptp, len; + int level; + + KASSERT(domain, ("domain is NULL")); + level = domain->ptp_level; + KASSERT(level, ("Page table level is 0")); + + ptp = domain->ptp; + KASSERT(ptp, ("PTP is NULL")); + mapped = 0; + while (mapped < size) { + len = amdvi_set_pt(ptp, level, gpa + mapped, hpa + mapped, + PAGE_SIZE, create); + if (!len) { + printf("Error: Couldn't map HPA:0x%lx GPA:0x%lx\n", + hpa, gpa); + return (0); + } + mapped += len; + } + + return (mapped); +} + +static uint64_t +amdvi_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, + uint64_t len) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + + if (domain->id && !domain->ptp) { + printf("ptp is NULL"); + return (-1); + } + + /* + * If host domain is created w/o page table, skip IOMMU page + * table set-up. + */ + if (domain->ptp) + return (amdvi_update_mapping(domain, gpa, hpa, len, true)); + else + return (len); +} + +static uint64_t +amdvi_destroy_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + /* + * If host domain is created w/o page table, skip IOMMU page + * table set-up. 
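+ * (The host domain, id 0, is only given a PTP when the
+ * amdvi_host_ptp tunable is set in amdvi_create_domain(); with
+ * no PTP its DMA runs untranslated and map/unmap simply report
+ * the full length as done.)
+ *
+ * For a translated domain, amdvi_update_mapping() above walks
+ * in PAGE_SIZE steps. A worked example with illustrative
+ * addresses: for gpa = 0x1234000 and pg_size = 4KB,
+ * amdvi_set_pt() computes index = (gpa >> shift) & 0x1ff at
+ * shift = 39, 30 and 21 (allocating missing tables when
+ * creating), then installs the leaf PTE at shift = 12 and
+ * returns 1UL << 12, so the caller advances one page per
+ * iteration.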
+ */ + if (domain->ptp) + return (amdvi_update_mapping(domain, gpa, 0, len, false)); + return + (len); +} + +static struct amdvi_softc * +amdvi_find_iommu(uint16_t devid) +{ + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + if ((devid >= softc->start_dev_rid) && + (devid <= softc->end_dev_rid)) + return (softc); + } + + /* + * XXX: BIOS bug, device not in IVRS table, assume its from first IOMMU. + */ + printf("BIOS bug device(%d.%d.%d) doesn't have IVHD entry.\n", + RID2PCI_STR(devid)); + + return (device_get_softc(ivhd_devs[0])); +} + +/* + * Set-up device table entry. + * IOMMU spec Rev 2.0, section 3.2.2.2, some of the fields must + * be set concurrently, e.g. read and write bits. + */ +static void +amdvi_set_dte(struct amdvi_domain *domain, uint16_t devid, bool enable) +{ + struct amdvi_softc *softc; + struct amdvi_dte* temp; + + KASSERT(domain, ("domain is NULL for pci_rid:0x%x\n", devid)); + + softc = amdvi_find_iommu(devid); + KASSERT(softc, ("softc is NULL for pci_rid:0x%x\n", devid)); + + temp = &amdvi_dte[devid]; + +#ifdef AMDVI_ATS_ENABLE + /* If IOMMU and device support IOTLB, enable it. */ + if (amdvi_dev_support_iotlb(softc, devid) && softc->iotlb) + temp->iotlb_enable = 1; +#endif + + /* Avoid duplicate I/O faults. */ + temp->sup_second_io_fault = 1; + temp->sup_all_io_fault = amdvi_disable_io_fault; + + temp->dt_valid = 1; + temp->domain_id = domain->id; + + if (enable) { + if (domain->ptp) { + temp->pt_base = vtophys(domain->ptp) >> 12; + temp->pt_level = amdvi_ptp_level; + } + /* + * XXX: Page table valid[TV] bit must be set even if host domain + * page tables are not enabled. + */ + temp->pt_valid = 1; + temp->read_allow = 1; + temp->write_allow = 1; + } +} + +static void +amdvi_inv_device(uint16_t devid) +{ + struct amdvi_softc *softc; + + softc = amdvi_find_iommu(devid); + KASSERT(softc, ("softc is NULL")); + + amdvi_cmd_inv_dte(softc, devid); +#ifdef AMDVI_ATS_ENABLE + if (amdvi_dev_support_iotlb(softc, devid)) + amdvi_cmd_inv_iotlb(softc, devid); +#endif + amdvi_wait(softc); +} + +static void +amdvi_add_device(void *arg, uint16_t devid) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain != NULL, ("domain is NULL")); +#ifdef AMDVI_DEBUG_CMD + printf("Assigning device(%d.%d.%d) to domain:%d\n", + RID2PCI_STR(devid), domain->id); +#endif + amdvi_set_dte(domain, devid, true); + amdvi_inv_device(devid); +} + +static void +amdvi_remove_device(void *arg, uint16_t devid) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; +#ifdef AMDVI_DEBUG_CMD + printf("Remove device(0x%x) from domain:%d\n", + devid, domain->id); +#endif + amdvi_set_dte(domain, devid, false); + amdvi_inv_device(devid); +} + +static void +amdvi_enable(void) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_softc *softc; + uint64_t val; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL\n")); + ctrl = softc->ctrl; + KASSERT(ctrl, ("ctrl is NULL\n")); + + val = ( AMDVI_CTRL_EN | + AMDVI_CTRL_CMD | + AMDVI_CTRL_ELOG | + AMDVI_CTRL_ELOGINT | + AMDVI_CTRL_INV_TO_1S); + + if (softc->ivhd_flag & IVHD_FLAG_COH) + val |= AMDVI_CTRL_COH; + if (softc->ivhd_flag & IVHD_FLAG_HTT) + val |= AMDVI_CTRL_HTT; + if (softc->ivhd_flag & IVHD_FLAG_RPPW) + val |= AMDVI_CTRL_RPPW; + if (softc->ivhd_flag & IVHD_FLAG_PPW) + val |= AMDVI_CTRL_PPW; + if (softc->ivhd_flag & IVHD_FLAG_ISOC) + val |= AMDVI_CTRL_ISOC; + + ctrl->control = 
val; + } +} + +static void +amdvi_disable(void) +{ + struct amdvi_ctrl *ctrl; + struct amdvi_softc *softc; + int i; + + for (i = 0; i < ivhd_count; i++) { + softc = device_get_softc(ivhd_devs[i]); + KASSERT(softc, ("softc is NULL\n")); + ctrl = softc->ctrl; + KASSERT(ctrl, ("ctrl is NULL\n")); + + ctrl->control = 0; + } +} + +static void +amdvi_inv_tlb(void *arg) +{ + struct amdvi_domain *domain; + + domain = (struct amdvi_domain *)arg; + KASSERT(domain, ("domain is NULL")); + amdvi_do_inv_domain(domain->id, false); +} + +struct iommu_ops iommu_ops_amd = { + amdvi_init, + amdvi_cleanup, + amdvi_enable, + amdvi_disable, + amdvi_create_domain, + amdvi_destroy_domain, + amdvi_create_mapping, + amdvi_destroy_mapping, + amdvi_add_device, + amdvi_remove_device, + amdvi_inv_tlb +}; diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h b/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h new file mode 100644 index 0000000000..6ee6c36632 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/amdvi_priv.h @@ -0,0 +1,431 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016 Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _AMDVI_PRIV_H_ +#define _AMDVI_PRIV_H_ + +#include <contrib/dev/acpica/include/acpi.h> + +#define BIT(n) (1ULL << (n)) +/* Return value of bits[n:m] where n and (n >= ) m are bit positions. */ +#define REG_BITS(x, n, m) (((x) >> (m)) & \ + ((1 << (((n) - (m)) + 1)) - 1)) + +/* + * IOMMU PCI capability. + */ +#define AMDVI_PCI_CAP_IOTLB BIT(0) /* IOTLB is supported. */ +#define AMDVI_PCI_CAP_HT BIT(1) /* HyperTransport tunnel support. */ +#define AMDVI_PCI_CAP_NPCACHE BIT(2) /* Not present page cached. */ +#define AMDVI_PCI_CAP_EFR BIT(3) /* Extended features. */ +#define AMDVI_PCI_CAP_EXT BIT(4) /* Miscellaneous information reg. */ + +/* + * IOMMU extended features. + */ +#define AMDVI_EX_FEA_PREFSUP BIT(0) /* Prefetch command support. */ +#define AMDVI_EX_FEA_PPRSUP BIT(1) /* PPR support */ +#define AMDVI_EX_FEA_XTSUP BIT(2) /* Reserved */ +#define AMDVI_EX_FEA_NXSUP BIT(3) /* No-execute. */ +#define AMDVI_EX_FEA_GTSUP BIT(4) /* Guest translation support. */ +#define AMDVI_EX_FEA_EFRW BIT(5) /* Reserved */ +#define AMDVI_EX_FEA_IASUP BIT(6) /* Invalidate all command supp. 
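+ * (i.e. the INVALIDATE_IOMMU_ALL
+ * command, AMDVI_INV_ALL_OPCODE 0x8 below).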
*/ +#define AMDVI_EX_FEA_GASUP BIT(7) /* Guest APIC or AVIC support. */ +#define AMDVI_EX_FEA_HESUP BIT(8) /* Hardware Error. */ +#define AMDVI_EX_FEA_PCSUP BIT(9) /* Performance counters support. */ +/* XXX: add more EFER bits. */ + +/* + * Device table entry or DTE + * NOTE: Must be 256-bits/32 bytes aligned. + */ +struct amdvi_dte { + uint32_t dt_valid:1; /* Device Table valid. */ + uint32_t pt_valid:1; /* Page translation valid. */ + uint16_t :7; /* Reserved[8:2] */ + uint8_t pt_level:3; /* Paging level, 0 to disable. */ + uint64_t pt_base:40; /* Page table root pointer. */ + uint8_t :3; /* Reserved[54:52] */ + uint8_t gv_valid:1; /* Revision 2, GVA to SPA. */ + uint8_t gv_level:2; /* Revision 2, GLX level. */ + uint8_t gv_cr3_lsb:3; /* Revision 2, GCR3[14:12] */ + uint8_t read_allow:1; /* I/O read enabled. */ + uint8_t write_allow:1; /* I/O write enabled. */ + uint8_t :1; /* Reserved[63] */ + uint16_t domain_id:16; /* Domain ID */ + uint16_t gv_cr3_lsb2:16; /* Revision 2, GCR3[30:15] */ + uint8_t iotlb_enable:1; /* Device support IOTLB */ + uint8_t sup_second_io_fault:1; /* Suppress subsequent I/O faults. */ + uint8_t sup_all_io_fault:1; /* Suppress all I/O page faults. */ + uint8_t IOctl:2; /* Port I/O control. */ + uint8_t iotlb_cache_disable:1; /* IOTLB cache hints. */ + uint8_t snoop_disable:1; /* Snoop disable. */ + uint8_t allow_ex:1; /* Allow exclusion. */ + uint8_t sysmgmt:2; /* System management message.*/ + uint8_t :1; /* Reserved[106] */ + uint32_t gv_cr3_msb:21; /* Revision 2, GCR3[51:31] */ + uint8_t intmap_valid:1; /* Interrupt map valid. */ + uint8_t intmap_len:4; /* Interrupt map table length. */ + uint8_t intmap_ign:1; /* Ignore unmapped interrupts. */ + uint64_t intmap_base:46; /* IntMap base. */ + uint8_t :4; /* Reserved[183:180] */ + uint8_t init_pass:1; /* INIT pass through or PT */ + uint8_t extintr_pass:1; /* External Interrupt PT */ + uint8_t nmi_pass:1; /* NMI PT */ + uint8_t :1; /* Reserved[187] */ + uint8_t intr_ctrl:2; /* Interrupt control */ + uint8_t lint0_pass:1; /* LINT0 PT */ + uint8_t lint1_pass:1; /* LINT1 PT */ + uint64_t :64; /* Reserved[255:192] */ +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_dte) == 32); + +/* + * IOMMU command entry. + */ +struct amdvi_cmd { + uint32_t word0; + uint32_t word1:28; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); + +/* Command opcodes. */ +#define AMDVI_CMP_WAIT_OPCODE 0x1 /* Completion wait. */ +#define AMDVI_INVD_DTE_OPCODE 0x2 /* Invalidate device table entry. */ +#define AMDVI_INVD_PAGE_OPCODE 0x3 /* Invalidate pages. */ +#define AMDVI_INVD_IOTLB_OPCODE 0x4 /* Invalidate IOTLB pages. */ +#define AMDVI_INVD_INTR_OPCODE 0x5 /* Invalidate Interrupt table. */ +#define AMDVI_PREFETCH_PAGES_OPCODE 0x6 /* Prefetch IOMMU pages. */ +#define AMDVI_COMP_PPR_OPCODE 0x7 /* Complete PPR request. */ +#define AMDVI_INV_ALL_OPCODE 0x8 /* Invalidate all. */ + +/* Completion wait attributes. */ +#define AMDVI_CMP_WAIT_STORE BIT(0) /* Write back data. */ +#define AMDVI_CMP_WAIT_INTR BIT(1) /* Completion wait interrupt. */ +#define AMDVI_CMP_WAIT_FLUSH BIT(2) /* Flush queue. */ + +/* Invalidate page. */ +#define AMDVI_INVD_PAGE_S BIT(0) /* Invalidation size. */ +#define AMDVI_INVD_PAGE_PDE BIT(1) /* Invalidate PDE. */ +#define AMDVI_INVD_PAGE_GN_GVA BIT(2) /* GPA or GVA. */ + +#define AMDVI_INVD_PAGE_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) + +/* Invalidate IOTLB. */ +#define AMDVI_INVD_IOTLB_S BIT(0) /* Invalidation size 4k or addr */ +#define AMDVI_INVD_IOTLB_GN_GVA BIT(2) /* GPA or GVA. 
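+ * Per the IOMMU spec, GN set selects a
+ * guest-virtual (PASID-tagged) address;
+ * clear means guest-physical.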
*/ + +#define AMDVI_INVD_IOTLB_ALL_ADDR (0x7FFFFFFFFFFFFULL << 12) +/* XXX: add more command entries. */ + +/* + * IOMMU event entry. + */ +struct amdvi_event { + uint16_t devid; + uint16_t pasid_hi; + uint16_t pasid_domid; /* PASID low or DomainID */ + uint16_t flag:12; + uint8_t opcode:4; + uint64_t addr; +} __attribute__((__packed__)); +CTASSERT(sizeof(struct amdvi_event) == 16); + +/* Various event types. */ +#define AMDVI_EVENT_INVALID_DTE 0x1 +#define AMDVI_EVENT_PFAULT 0x2 +#define AMDVI_EVENT_DTE_HW_ERROR 0x3 +#define AMDVI_EVENT_PAGE_HW_ERROR 0x4 +#define AMDVI_EVENT_ILLEGAL_CMD 0x5 +#define AMDVI_EVENT_CMD_HW_ERROR 0x6 +#define AMDVI_EVENT_IOTLB_TIMEOUT 0x7 +#define AMDVI_EVENT_INVALID_DTE_REQ 0x8 +#define AMDVI_EVENT_INVALID_PPR_REQ 0x9 +#define AMDVI_EVENT_COUNTER_ZERO 0xA + +#define AMDVI_EVENT_FLAG_MASK 0x1FF /* Mask for event flags. */ +#define AMDVI_EVENT_FLAG_TYPE(x) (((x) >> 9) & 0x3) + +/* + * IOMMU control block. + */ +struct amdvi_ctrl { + struct { + uint16_t size:9; + uint16_t :3; + uint64_t base:40; /* Devtable register base. */ + uint16_t :12; + } dte; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } cmd; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } event; + uint16_t control :13; + uint64_t :51; + struct { + uint8_t enable:1; + uint8_t allow:1; + uint16_t :10; + uint64_t base:40; + uint16_t :12; + uint16_t :12; + uint64_t limit:40; + uint16_t :12; + } excl; + /* + * Revision 2 only. + */ + uint64_t ex_feature; + struct { + uint16_t :12; + uint64_t base:40; + uint8_t :4; + uint8_t len:4; + uint8_t :4; + } ppr; + uint64_t first_event; + uint64_t second_event; + uint64_t event_status; + /* Revision 2 only, end. */ + uint8_t pad1[0x1FA8]; /* Padding. */ + uint32_t cmd_head:19; + uint64_t :45; + uint32_t cmd_tail:19; + uint64_t :45; + uint32_t evt_head:19; + uint64_t :45; + uint32_t evt_tail:19; + uint64_t :45; + uint32_t status:19; + uint64_t :45; + uint64_t pad2; + uint8_t :4; + uint16_t ppr_head:15; + uint64_t :45; + uint8_t :4; + uint16_t ppr_tail:15; + uint64_t :45; + uint8_t pad3[0x1FC0]; /* Padding. */ + + /* XXX: More for rev2. */ +} __attribute__((__packed__)); +CTASSERT(offsetof(struct amdvi_ctrl, pad1)== 0x58); +CTASSERT(offsetof(struct amdvi_ctrl, pad2)== 0x2028); +CTASSERT(offsetof(struct amdvi_ctrl, pad3)== 0x2040); + +#define AMDVI_MMIO_V1_SIZE (4 * PAGE_SIZE) /* v1 size */ +/* + * AMF IOMMU v2 size including event counters + */ +#define AMDVI_MMIO_V2_SIZE (8 * PAGE_SIZE) + +CTASSERT(sizeof(struct amdvi_ctrl) == 0x4000); +CTASSERT(sizeof(struct amdvi_ctrl) == AMDVI_MMIO_V1_SIZE); + +/* IVHD flag */ +#define IVHD_FLAG_HTT BIT(0) /* Hypertransport Tunnel. */ +#define IVHD_FLAG_PPW BIT(1) /* Pass posted write. */ +#define IVHD_FLAG_RPPW BIT(2) /* Response pass posted write. */ +#define IVHD_FLAG_ISOC BIT(3) /* Isoc support. */ +#define IVHD_FLAG_IOTLB BIT(4) /* IOTLB support. */ +#define IVHD_FLAG_COH BIT(5) /* Coherent control, default 1 */ +#define IVHD_FLAG_PFS BIT(6) /* Prefetch IOMMU pages. */ +#define IVHD_FLAG_PPRS BIT(7) /* Peripheral page support. */ + +/* IVHD device entry data setting. */ +#define IVHD_DEV_LINT0_PASS BIT(6) /* LINT0 interrupts. */ +#define IVHD_DEV_LINT1_PASS BIT(7) /* LINT1 interrupts. */ + +/* Bit[5:4] for System Mgmt. Bit3 is reserved. */ +#define IVHD_DEV_INIT_PASS BIT(0) /* INIT */ +#define IVHD_DEV_EXTINTR_PASS BIT(1) /* ExtInt */ +#define IVHD_DEV_NMI_PASS BIT(2) /* NMI */ + +/* IVHD 8-byte extended data settings. 
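+ * Only bit 31 of the extended data is consumed here;
+ * ivhd_dev_parse() in ivrs_drv.c inverts it into the per-range
+ * enable_ats flag.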
*/ +#define IVHD_DEV_EXT_ATS_DISABLE BIT(31) /* Disable ATS */ + +/* IOMMU control register. */ +#define AMDVI_CTRL_EN BIT(0) /* IOMMU enable. */ +#define AMDVI_CTRL_HTT BIT(1) /* Hypertransport tunnel enable. */ +#define AMDVI_CTRL_ELOG BIT(2) /* Event log enable. */ +#define AMDVI_CTRL_ELOGINT BIT(3) /* Event log interrupt. */ +#define AMDVI_CTRL_COMINT BIT(4) /* Completion wait interrupt. */ +#define AMDVI_CTRL_PPW BIT(8) +#define AMDVI_CTRL_RPPW BIT(9) +#define AMDVI_CTRL_COH BIT(10) +#define AMDVI_CTRL_ISOC BIT(11) +#define AMDVI_CTRL_CMD BIT(12) /* Command buffer enable. */ +#define AMDVI_CTRL_PPRLOG BIT(13) +#define AMDVI_CTRL_PPRINT BIT(14) +#define AMDVI_CTRL_PPREN BIT(15) +#define AMDVI_CTRL_GTE BIT(16) /* Guest translation enable. */ +#define AMDVI_CTRL_GAE BIT(17) /* Guest APIC enable. */ + +/* Invalidation timeout. */ +#define AMDVI_CTRL_INV_NO_TO 0 /* No timeout. */ +#define AMDVI_CTRL_INV_TO_1ms 1 /* 1 ms */ +#define AMDVI_CTRL_INV_TO_10ms 2 /* 10 ms */ +#define AMDVI_CTRL_INV_TO_100ms 3 /* 100 ms */ +#define AMDVI_CTRL_INV_TO_1S 4 /* 1 second */ +#define AMDVI_CTRL_INV_TO_10S 5 /* 10 second */ +#define AMDVI_CTRL_INV_TO_100S 6 /* 100 second */ + +/* + * Max number of PCI devices. + * 256 bus x 32 slot/devices x 8 functions. + */ +#define PCI_NUM_DEV_MAX 0x10000 + +/* Maximum number of domains supported by IOMMU. */ +#define AMDVI_MAX_DOMAIN (BIT(16) - 1) + +/* + * IOMMU Page Table attributes. + */ +#define AMDVI_PT_PRESENT BIT(0) +#define AMDVI_PT_COHERENT BIT(60) +#define AMDVI_PT_READ BIT(61) +#define AMDVI_PT_WRITE BIT(62) + +#define AMDVI_PT_RW (AMDVI_PT_READ | AMDVI_PT_WRITE) +#define AMDVI_PT_MASK 0xFFFFFFFFFF000UL /* Only [51:12] for PA */ + +#define AMDVI_PD_LEVEL_SHIFT 9 +#define AMDVI_PD_SUPER(x) (((x) >> AMDVI_PD_LEVEL_SHIFT) == 7) +/* + * IOMMU Status, offset 0x2020 + */ +#define AMDVI_STATUS_EV_OF BIT(0) /* Event overflow. */ +#define AMDVI_STATUS_EV_INTR BIT(1) /* Event interrupt. */ +/* Completion wait command completed. */ +#define AMDVI_STATUS_CMP BIT(2) + +#define IVRS_CTRL_RID 1 /* MMIO RID */ + +/* ACPI IVHD */ +struct ivhd_dev_cfg { + uint32_t start_id; + uint32_t end_id; + uint8_t data; /* Device configuration. */ + bool enable_ats; /* ATS enabled for the device. */ + int ats_qlen; /* ATS invalidation queue depth. */ +}; + +struct amdvi_domain { + uint64_t *ptp; /* Highest level page table */ + int ptp_level; /* Level of page tables */ + u_int id; /* Domain id */ + SLIST_ENTRY (amdvi_domain) next; +}; + +/* + * I/O Virtualization Hardware Definition Block (IVHD) type 0x10 (legacy) + * uses ACPI_IVRS_HARDWARE define in contrib/dev/acpica/include/actbl2.h + * New IVHD types 0x11 and 0x40 as defined in AMD IOMMU spec[48882] are missing in + * ACPI code. These new types add extra field EFR(Extended Feature Register). + * XXX : Use definition from ACPI when it is available. + */ +typedef struct acpi_ivrs_hardware_efr_sup +{ + ACPI_IVRS_HEADER Header; + UINT16 CapabilityOffset; /* Offset for IOMMU control fields */ + UINT64 BaseAddress; /* IOMMU control registers */ + UINT16 PciSegmentGroup; + UINT16 Info; /* MSI number and unit ID */ + UINT32 Attr; /* IOMMU Feature */ + UINT64 ExtFR; /* IOMMU Extended Feature */ + UINT64 Reserved; /* v1 feature or v2 attribute */ +} __attribute__ ((__packed__)) ACPI_IVRS_HARDWARE_EFRSUP; +CTASSERT(sizeof(ACPI_IVRS_HARDWARE_EFRSUP) == 40); + +/* + * Different type of IVHD. + * XXX: Use AcpiIvrsType once new IVHD types are available. +*/ +enum IvrsType +{ + IVRS_TYPE_HARDWARE_LEGACY = 0x10, /* Legacy without EFRi support. 
*/ + IVRS_TYPE_HARDWARE_EFR = 0x11, /* With EFR support. */ + IVRS_TYPE_HARDWARE_MIXED = 0x40, /* Mixed with EFR support. */ +}; + +/* + * AMD IOMMU softc. + */ +struct amdvi_softc { + struct amdvi_ctrl *ctrl; /* Control area. */ + device_t dev; /* IOMMU device. */ + enum IvrsType ivhd_type; /* IOMMU IVHD type. */ + bool iotlb; /* IOTLB supported by IOMMU */ + struct amdvi_cmd *cmd; /* Command descriptor area. */ + int cmd_max; /* Max number of commands. */ + uint64_t cmp_data; /* Command completion write back. */ + struct amdvi_event *event; /* Event descriptor area. */ + struct resource *event_res; /* Event interrupt resource. */ + void *event_tag; /* Event interrupt tag. */ + int event_max; /* Max number of events. */ + int event_irq; + int event_rid; + /* ACPI various flags. */ + uint32_t ivhd_flag; /* ACPI IVHD flag. */ + uint32_t ivhd_feature; /* ACPI v1 Reserved or v2 attribute. */ + uint64_t ext_feature; /* IVHD EFR */ + /* PCI related. */ + uint16_t cap_off; /* PCI Capability offset. */ + uint8_t pci_cap; /* PCI capability. */ + uint16_t pci_seg; /* IOMMU PCI domain/segment. */ + uint16_t pci_rid; /* PCI BDF of IOMMU */ + /* Device range under this IOMMU. */ + uint16_t start_dev_rid; /* First device under this IOMMU. */ + uint16_t end_dev_rid; /* Last device under this IOMMU. */ + + /* BIOS provided device configuration for end points. */ + struct ivhd_dev_cfg dev_cfg[10]; + int dev_cfg_cnt; + + /* Software statistics. */ + uint64_t event_intr_cnt; /* Total event INTR count. */ + uint64_t total_cmd; /* Total number of commands. */ +}; + +int amdvi_setup_hw(struct amdvi_softc *softc); +int amdvi_teardown_hw(struct amdvi_softc *softc); +#endif /* _AMDVI_PRIV_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c new file mode 100644 index 0000000000..370c20fb01 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/ivrs_drv.c @@ -0,0 +1,735 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2016, Anish Gupta (anish@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include "opt_acpi.h" +#include <sys/param.h> +#include <sys/bus.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/malloc.h> + +#include <machine/vmparam.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <contrib/dev/acpica/include/acpi.h> +#include <contrib/dev/acpica/include/accommon.h> +#include <dev/acpica/acpivar.h> + +#include "io/iommu.h" +#include "amdvi_priv.h" + +device_t *ivhd_devs; /* IVHD or AMD-Vi device list. */ +int ivhd_count; /* Number of IVHD header. */ +/* + * Cached IVHD header list. + * Single entry for each IVHD, filtered the legacy one. + */ +ACPI_IVRS_HARDWARE *ivhd_hdrs[10]; + +extern int amdvi_ptp_level; /* Page table levels. */ + +typedef int (*ivhd_iter_t)(ACPI_IVRS_HEADER *ptr, void *arg); +/* + * Iterate IVRS table for IVHD and IVMD device type. + */ +static void +ivrs_hdr_iterate_tbl(ivhd_iter_t iter, void *arg) +{ + ACPI_TABLE_IVRS *ivrs; + ACPI_IVRS_HEADER *ivrs_hdr, *end; + ACPI_STATUS status; + + status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); + if (ACPI_FAILURE(status)) + return; + + if (ivrs->Header.Length == 0) { + return; + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)(ivrs + 1); + end = (ACPI_IVRS_HEADER *)((char *)ivrs + ivrs->Header.Length); + + while (ivrs_hdr < end) { + if ((uint8_t *)ivrs_hdr + ivrs_hdr->Length > (uint8_t *)end) { + printf("AMD-Vi:IVHD/IVMD is corrupted, length : %d\n", + ivrs_hdr->Length); + break; + } + + switch (ivrs_hdr->Type) { + case IVRS_TYPE_HARDWARE_LEGACY: /* Legacy */ + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + if (!iter(ivrs_hdr, arg)) + return; + break; + + case ACPI_IVRS_TYPE_MEMORY1: + case ACPI_IVRS_TYPE_MEMORY2: + case ACPI_IVRS_TYPE_MEMORY3: + if (!iter(ivrs_hdr, arg)) + return; + + break; + + default: + printf("AMD-Vi:Not IVHD/IVMD type(%d)", ivrs_hdr->Type); + + } + + ivrs_hdr = (ACPI_IVRS_HEADER *)((uint8_t *)ivrs_hdr + + ivrs_hdr->Length); + } +} + +static bool +ivrs_is_ivhd(UINT8 type) +{ + + switch(type) { + case IVRS_TYPE_HARDWARE_LEGACY: + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + return (true); + + default: + return (false); + } +} + +/* Count the number of AMD-Vi devices in the system. */ +static int +ivhd_count_iter(ACPI_IVRS_HEADER * ivrs_he, void *arg) +{ + + if (ivrs_is_ivhd(ivrs_he->Type)) + ivhd_count++; + + return (1); +} + +struct find_ivrs_hdr_args { + int i; + ACPI_IVRS_HEADER *ptr; +}; + +static int +ivrs_hdr_find_iter(ACPI_IVRS_HEADER * ivrs_hdr, void *args) +{ + struct find_ivrs_hdr_args *fi; + + fi = (struct find_ivrs_hdr_args *)args; + if (ivrs_is_ivhd(ivrs_hdr->Type)) { + if (fi->i == 0) { + fi->ptr = ivrs_hdr; + return (0); + } + fi->i--; + } + + return (1); +} + +static ACPI_IVRS_HARDWARE * +ivhd_find_by_index(int idx) +{ + struct find_ivrs_hdr_args fi; + + fi.i = idx; + fi.ptr = NULL; + + ivrs_hdr_iterate_tbl(ivrs_hdr_find_iter, &fi); + + return ((ACPI_IVRS_HARDWARE *)fi.ptr); +} + +static void +ivhd_dev_add_entry(struct amdvi_softc *softc, uint32_t start_id, + uint32_t end_id, uint8_t cfg, bool ats) +{ + struct ivhd_dev_cfg *dev_cfg; + + /* If device doesn't have special data, don't add it. */ + if (!cfg) + return; + + dev_cfg = &softc->dev_cfg[softc->dev_cfg_cnt++]; + dev_cfg->start_id = start_id; + dev_cfg->end_id = end_id; + dev_cfg->data = cfg; + dev_cfg->enable_ats = ats; +} + +/* + * Record device attributes as suggested by BIOS. 
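+ * IVHD device entries are fixed-size: types below 0x40 are
+ * 4-byte entries and types below 0x80 are 8-byte entries.
+ * SELECT-style entries configure a single device id, while a
+ * START entry opens a range that the matching END entry
+ * commits; the ALL setting is OR-ed into every entry recorded.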
+ */ +static int +ivhd_dev_parse(ACPI_IVRS_HARDWARE* ivhd, struct amdvi_softc *softc) +{ + ACPI_IVRS_DE_HEADER *de; + uint8_t *p, *end; + int range_start_id = 0, range_end_id = 0; + uint32_t *extended; + uint8_t all_data = 0, range_data = 0; + bool range_enable_ats = false, enable_ats; + + softc->start_dev_rid = ~0; + softc->end_dev_rid = 0; + + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_LEGACY: + p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE); + break; + + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + p = (uint8_t *)ivhd + sizeof(ACPI_IVRS_HARDWARE_EFRSUP); + break; + + default: + device_printf(softc->dev, + "unknown type: 0x%x\n", ivhd->Header.Type); + return (-1); + } + + end = (uint8_t *)ivhd + ivhd->Header.Length; + + while (p < end) { + de = (ACPI_IVRS_DE_HEADER *)p; + softc->start_dev_rid = MIN(softc->start_dev_rid, de->Id); + softc->end_dev_rid = MAX(softc->end_dev_rid, de->Id); + switch (de->Type) { + case ACPI_IVRS_TYPE_ALL: + all_data = de->DataSetting; + break; + + case ACPI_IVRS_TYPE_SELECT: + case ACPI_IVRS_TYPE_ALIAS_SELECT: + case ACPI_IVRS_TYPE_EXT_SELECT: + enable_ats = false; + if (de->Type == ACPI_IVRS_TYPE_EXT_SELECT) { + extended = (uint32_t *)(de + 1); + enable_ats = + (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? + false : true; + } + ivhd_dev_add_entry(softc, de->Id, de->Id, + de->DataSetting | all_data, enable_ats); + break; + + case ACPI_IVRS_TYPE_START: + case ACPI_IVRS_TYPE_ALIAS_START: + case ACPI_IVRS_TYPE_EXT_START: + range_start_id = de->Id; + range_data = de->DataSetting; + if (de->Type == ACPI_IVRS_TYPE_EXT_START) { + extended = (uint32_t *)(de + 1); + range_enable_ats = + (*extended & IVHD_DEV_EXT_ATS_DISABLE) ? + false : true; + } + break; + + case ACPI_IVRS_TYPE_END: + range_end_id = de->Id; + ivhd_dev_add_entry(softc, range_start_id, range_end_id, + range_data | all_data, range_enable_ats); + range_start_id = range_end_id = 0; + range_data = 0; + all_data = 0; + break; + + case ACPI_IVRS_TYPE_PAD4: + break; + + case ACPI_IVRS_TYPE_SPECIAL: + /* HPET or IOAPIC */ + break; + default: + if ((de->Type < 5) || + (de->Type >= ACPI_IVRS_TYPE_PAD8)) + device_printf(softc->dev, + "Unknown dev entry:0x%x\n", de->Type); + } + + if (softc->dev_cfg_cnt > + (sizeof(softc->dev_cfg) / sizeof(softc->dev_cfg[0]))) { + device_printf(softc->dev, + "WARN Too many device entries.\n"); + return (EINVAL); + } + if (de->Type < 0x40) + p += sizeof(ACPI_IVRS_DEVICE4); + else if (de->Type < 0x80) + p += sizeof(ACPI_IVRS_DEVICE8A); + else { + printf("Variable size IVHD type 0x%x not supported\n", + de->Type); + break; + } + } + + KASSERT((softc->end_dev_rid >= softc->start_dev_rid), + ("Device end[0x%x] < start[0x%x.\n", + softc->end_dev_rid, softc->start_dev_rid)); + + return (0); +} + +static bool +ivhd_is_newer(ACPI_IVRS_HEADER *old, ACPI_IVRS_HEADER *new) +{ + /* + * Newer IVRS header type take precedence. 
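+ * That is, when the same DeviceId is described both by a
+ * legacy type 0x10 header and by an EFR-aware type 0x11/0x40
+ * header, the legacy entry is discarded in ivhd_identify() in
+ * favor of the newer one.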
+ */ + if ((old->DeviceId == new->DeviceId) && + (old->Type == IVRS_TYPE_HARDWARE_LEGACY) && + ((new->Type == IVRS_TYPE_HARDWARE_EFR) || + (new->Type == IVRS_TYPE_HARDWARE_MIXED))) { + return (true); + } + + return (false); +} + +static void +ivhd_identify(driver_t *driver, device_t parent) +{ + ACPI_TABLE_IVRS *ivrs; + ACPI_IVRS_HARDWARE *ivhd; + ACPI_STATUS status; + int i, count = 0; + uint32_t ivrs_ivinfo; + + if (acpi_disabled("ivhd")) + return; + + status = AcpiGetTable(ACPI_SIG_IVRS, 1, (ACPI_TABLE_HEADER **)&ivrs); + if (ACPI_FAILURE(status)) + return; + + if (ivrs->Header.Length == 0) { + return; + } + + ivrs_ivinfo = ivrs->Info; + printf("AMD-Vi: IVRS Info VAsize = %d PAsize = %d GVAsize = %d" + " flags:%b\n", + REG_BITS(ivrs_ivinfo, 21, 15), REG_BITS(ivrs_ivinfo, 14, 8), + REG_BITS(ivrs_ivinfo, 7, 5), REG_BITS(ivrs_ivinfo, 22, 22), + "\020\001EFRSup"); + + ivrs_hdr_iterate_tbl(ivhd_count_iter, NULL); + if (!ivhd_count) + return; + + for (i = 0; i < ivhd_count; i++) { + ivhd = ivhd_find_by_index(i); + KASSERT(ivhd, ("ivhd%d is NULL\n", i)); + ivhd_hdrs[i] = ivhd; + } + + /* + * Scan for presence of legacy and non-legacy device type + * for same AMD-Vi device and override the old one. + */ + for (i = ivhd_count - 1 ; i > 0 ; i--){ + if (ivhd_is_newer(&ivhd_hdrs[i-1]->Header, + &ivhd_hdrs[i]->Header)) { + ivhd_hdrs[i-1] = ivhd_hdrs[i]; + ivhd_count--; + } + } + + ivhd_devs = malloc(sizeof(device_t) * ivhd_count, M_DEVBUF, + M_WAITOK | M_ZERO); + for (i = 0; i < ivhd_count; i++) { + ivhd = ivhd_hdrs[i]; + KASSERT(ivhd, ("ivhd%d is NULL\n", i)); + + /* + * Use a high order to ensure that this driver is probed after + * the Host-PCI bridge and the root PCI bus. + */ + ivhd_devs[i] = BUS_ADD_CHILD(parent, + ACPI_DEV_BASE_ORDER + 10 * 10, "ivhd", i); + + /* + * XXX: In case device was not destroyed before, add will fail. + * locate the old device instance. + */ + if (ivhd_devs[i] == NULL) { + ivhd_devs[i] = device_find_child(parent, "ivhd", i); + if (ivhd_devs[i] == NULL) { + printf("AMD-Vi: cant find ivhd%d\n", i); + break; + } + } + count++; + } + + /* + * Update device count in case failed to attach. + */ + ivhd_count = count; +} + +static int +ivhd_probe(device_t dev) +{ + ACPI_IVRS_HARDWARE *ivhd; + int unit; + + if (acpi_get_handle(dev) != NULL) + return (ENXIO); + + unit = device_get_unit(dev); + KASSERT((unit < ivhd_count), + ("ivhd unit %d > count %d", unit, ivhd_count)); + ivhd = ivhd_hdrs[unit]; + KASSERT(ivhd, ("ivhd is NULL")); + + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_EFR: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd with EFR"); + break; + + case IVRS_TYPE_HARDWARE_MIXED: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd in mixed format"); + break; + + case IVRS_TYPE_HARDWARE_LEGACY: + default: + device_set_desc(dev, "AMD-Vi/IOMMU ivhd"); + break; + } + + return (BUS_PROBE_NOWILDCARD); +} + +static void +ivhd_print_flag(device_t dev, enum IvrsType ivhd_type, uint8_t flag) +{ + /* + * IVHD lgeacy type has two extra high bits in flag which has + * been moved to EFR for non-legacy device. 
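+ * (Those are the PreFSup and PPRSup bits decoded below for the
+ * legacy format; the 0x11/0x40 formats report them through the
+ * extended feature register instead.)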
+ */ + switch (ivhd_type) { + case IVRS_TYPE_HARDWARE_LEGACY: + device_printf(dev, "Flag:%b\n", flag, + "\020" + "\001HtTunEn" + "\002PassPW" + "\003ResPassPW" + "\004Isoc" + "\005IotlbSup" + "\006Coherent" + "\007PreFSup" + "\008PPRSup"); + break; + + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + device_printf(dev, "Flag:%b\n", flag, + "\020" + "\001HtTunEn" + "\002PassPW" + "\003ResPassPW" + "\004Isoc" + "\005IotlbSup" + "\006Coherent"); + break; + + default: + device_printf(dev, "Can't decode flag of ivhd type :0x%x\n", + ivhd_type); + break; + } +} + +/* + * Feature in legacy IVHD type(0x10) and attribute in newer type(0x11 and 0x40). + */ +static void +ivhd_print_feature(device_t dev, enum IvrsType ivhd_type, uint32_t feature) +{ + switch (ivhd_type) { + case IVRS_TYPE_HARDWARE_LEGACY: + device_printf(dev, "Features(type:0x%x) HATS = %d GATS = %d" + " MsiNumPPR = %d PNBanks= %d PNCounters= %d\n", + ivhd_type, + REG_BITS(feature, 31, 30), + REG_BITS(feature, 29, 28), + REG_BITS(feature, 27, 23), + REG_BITS(feature, 22, 17), + REG_BITS(feature, 16, 13)); + device_printf(dev, "max PASID = %d GLXSup = %d Feature:%b\n", + REG_BITS(feature, 12, 8), + REG_BITS(feature, 4, 3), + feature, + "\020" + "\002NXSup" + "\003GTSup" + "\004<b4>" + "\005IASup" + "\006GASup" + "\007HESup"); + break; + + /* Fewer features or attributes are reported in non-legacy type. */ + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + device_printf(dev, "Features(type:0x%x) MsiNumPPR = %d" + " PNBanks= %d PNCounters= %d\n", + ivhd_type, + REG_BITS(feature, 27, 23), + REG_BITS(feature, 22, 17), + REG_BITS(feature, 16, 13)); + break; + + default: /* Other ivhd type features are not decoded. */ + device_printf(dev, "Can't decode ivhd type :0x%x\n", ivhd_type); + } +} + +/* Print extended features of IOMMU. */ +static void +ivhd_print_ext_feature(device_t dev, uint64_t ext_feature) +{ + uint32_t ext_low, ext_high; + + if (!ext_feature) + return; + + ext_low = ext_feature; + device_printf(dev, "Extended features[31:0]:%b " + "HATS = 0x%x GATS = 0x%x " + "GLXSup = 0x%x SmiFSup = 0x%x SmiFRC = 0x%x " + "GAMSup = 0x%x DualPortLogSup = 0x%x DualEventLogSup = 0x%x\n", + (int)ext_low, + "\020" + "\001PreFSup" + "\002PPRSup" + "\003<b2>" + "\004NXSup" + "\005GTSup" + "\006<b5>" + "\007IASup" + "\008GASup" + "\009HESup" + "\010PCSup", + REG_BITS(ext_low, 11, 10), + REG_BITS(ext_low, 13, 12), + REG_BITS(ext_low, 15, 14), + REG_BITS(ext_low, 17, 16), + REG_BITS(ext_low, 20, 18), + REG_BITS(ext_low, 23, 21), + REG_BITS(ext_low, 25, 24), + REG_BITS(ext_low, 29, 28)); + + ext_high = ext_feature >> 32; + device_printf(dev, "Extended features[62:32]:%b " + "Max PASID: 0x%x DevTblSegSup = 0x%x " + "MarcSup = 0x%x\n", + (int)(ext_high), + "\020" + "\006USSup" + "\009PprOvrflwEarlySup" + "\010PPRAutoRspSup" + "\013BlKStopMrkSup" + "\014PerfOptSup" + "\015MsiCapMmioSup" + "\017GIOSup" + "\018HASup" + "\019EPHSup" + "\020AttrFWSup" + "\021HDSup" + "\023InvIotlbSup", + REG_BITS(ext_high, 5, 0), + REG_BITS(ext_high, 8, 7), + REG_BITS(ext_high, 11, 10)); +} + +static int +ivhd_print_cap(struct amdvi_softc *softc, ACPI_IVRS_HARDWARE * ivhd) +{ + device_t dev; + int max_ptp_level; + + dev = softc->dev; + + ivhd_print_flag(dev, softc->ivhd_type, softc->ivhd_flag); + ivhd_print_feature(dev, softc->ivhd_type, softc->ivhd_feature); + ivhd_print_ext_feature(dev, softc->ext_feature); + max_ptp_level = 7; + /* Make sure device support minimum page level as requested by user. 
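+ * Note that max_ptp_level is currently hard-wired to 7 rather
+ * than derived from the HATS/GATS fields decoded above, so this
+ * check can only fail if the amdvi_ptp_level tunable is raised
+ * past 7.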
*/ + if (max_ptp_level < amdvi_ptp_level) { + device_printf(dev, "insufficient PTP level:%d\n", + max_ptp_level); + return (EINVAL); + } else { + device_printf(softc->dev, "supported paging level:%d, will use only: %d\n", + max_ptp_level, amdvi_ptp_level); + } + + device_printf(softc->dev, "device range: 0x%x - 0x%x\n", + softc->start_dev_rid, softc->end_dev_rid); + + return (0); +} + +static int +ivhd_attach(device_t dev) +{ + ACPI_IVRS_HARDWARE *ivhd; + ACPI_IVRS_HARDWARE_EFRSUP *ivhd_efr; + struct amdvi_softc *softc; + int status, unit; + + unit = device_get_unit(dev); + KASSERT((unit < ivhd_count), + ("ivhd unit %d > count %d", unit, ivhd_count)); + /* Make sure its same device for which attach is called. */ + KASSERT((ivhd_devs[unit] == dev), + ("Not same device old %p new %p", ivhd_devs[unit], dev)); + + softc = device_get_softc(dev); + softc->dev = dev; + ivhd = ivhd_hdrs[unit]; + KASSERT(ivhd, ("ivhd is NULL")); + + softc->ivhd_type = ivhd->Header.Type; + softc->pci_seg = ivhd->PciSegmentGroup; + softc->pci_rid = ivhd->Header.DeviceId; + softc->ivhd_flag = ivhd->Header.Flags; + /* + * On lgeacy IVHD type(0x10), it is documented as feature + * but in newer type it is attribute. + */ + softc->ivhd_feature = ivhd->Reserved; + /* + * PCI capability has more capabilities that are not part of IVRS. + */ + softc->cap_off = ivhd->CapabilityOffset; + +#ifdef notyet + /* IVHD Info bit[4:0] is event MSI/X number. */ + softc->event_msix = ivhd->Info & 0x1F; +#endif + switch (ivhd->Header.Type) { + case IVRS_TYPE_HARDWARE_EFR: + case IVRS_TYPE_HARDWARE_MIXED: + ivhd_efr = (ACPI_IVRS_HARDWARE_EFRSUP *)ivhd; + softc->ext_feature = ivhd_efr->ExtFR; + break; + + } + + softc->ctrl = (struct amdvi_ctrl *) PHYS_TO_DMAP(ivhd->BaseAddress); + status = ivhd_dev_parse(ivhd, softc); + if (status != 0) { + device_printf(dev, + "endpoint device parsing error=%d\n", status); + } + + status = ivhd_print_cap(softc, ivhd); + if (status != 0) { + return (status); + } + + status = amdvi_setup_hw(softc); + if (status != 0) { + device_printf(dev, "couldn't be initialised, error=%d\n", + status); + return (status); + } + + return (0); +} + +static int +ivhd_detach(device_t dev) +{ + struct amdvi_softc *softc; + + softc = device_get_softc(dev); + + amdvi_teardown_hw(softc); + + /* + * XXX: delete the device. + * don't allow detach, return EBUSY. + */ + return (0); +} + +static int +ivhd_suspend(device_t dev) +{ + + return (0); +} + +static int +ivhd_resume(device_t dev) +{ + + return (0); +} + +static device_method_t ivhd_methods[] = { + DEVMETHOD(device_identify, ivhd_identify), + DEVMETHOD(device_probe, ivhd_probe), + DEVMETHOD(device_attach, ivhd_attach), + DEVMETHOD(device_detach, ivhd_detach), + DEVMETHOD(device_suspend, ivhd_suspend), + DEVMETHOD(device_resume, ivhd_resume), + DEVMETHOD_END +}; + +static driver_t ivhd_driver = { + "ivhd", + ivhd_methods, + sizeof(struct amdvi_softc), +}; + +static devclass_t ivhd_devclass; + +/* + * Load this module at the end after PCI re-probing to configure interrupt. + */ +DRIVER_MODULE_ORDERED(ivhd, acpi, ivhd_driver, ivhd_devclass, 0, 0, + SI_ORDER_ANY); +MODULE_DEPEND(ivhd, acpi, 1, 1, 1); +MODULE_DEPEND(ivhd, pci, 1, 1, 1); diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.c b/usr/src/uts/i86pc/io/vmm/amd/npt.c new file mode 100644 index 0000000000..e61464a964 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/npt.c @@ -0,0 +1,87 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> + +#include "npt.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, npt, CTLFLAG_RW, NULL, NULL); + +static int npt_flags; +SYSCTL_INT(_hw_vmm_npt, OID_AUTO, pmap_flags, CTLFLAG_RD, + &npt_flags, 0, NULL); + +#define NPT_IPIMASK 0xFF + +/* + * AMD nested page table init. + */ +int +svm_npt_init(int ipinum) +{ + int enable_superpage = 1; + + npt_flags = ipinum & NPT_IPIMASK; + TUNABLE_INT_FETCH("hw.vmm.npt.enable_superpage", &enable_superpage); + if (enable_superpage) + npt_flags |= PMAP_PDE_SUPERPAGE; + + return (0); +} + +static int +npt_pinit(pmap_t pmap) +{ + + return (pmap_pinit_type(pmap, PT_RVI, npt_flags)); +} + +struct vmspace * +svm_npt_alloc(vm_offset_t min, vm_offset_t max) +{ + + return (vmspace_alloc(min, max, npt_pinit)); +} + +void +svm_npt_free(struct vmspace *vmspace) +{ + + vmspace_free(vmspace); +} diff --git a/usr/src/uts/i86pc/io/vmm/amd/npt.h b/usr/src/uts/i86pc/io/vmm/amd/npt.h new file mode 100644 index 0000000000..35530d7833 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/npt.h @@ -0,0 +1,38 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_NPT_H_ +#define _SVM_NPT_H_ + +int svm_npt_init(int ipinum); +struct vmspace *svm_npt_alloc(vm_offset_t min, vm_offset_t max); +void svm_npt_free(struct vmspace *vmspace); + +#endif /* _SVM_NPT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/offsets.in b/usr/src/uts/i86pc/io/vmm/amd/offsets.in new file mode 100644 index 0000000000..f8d2a716d7 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/offsets.in @@ -0,0 +1,36 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ +#include <sys/types.h> + +#include "amd/svm.h" + +svm_regctx + sctx_rbx SCTX_RBX + sctx_rcx SCTX_RCX + sctx_rbp SCTX_RBP + sctx_rdx SCTX_RDX + sctx_rdi SCTX_RDI + sctx_rsi SCTX_RSI + sctx_r8 SCTX_R8 + sctx_r9 SCTX_R9 + sctx_r10 SCTX_R10 + sctx_r11 SCTX_R11 + sctx_r12 SCTX_R12 + sctx_r13 SCTX_R13 + sctx_r14 SCTX_R14 + sctx_r15 SCTX_R15 + +/* Pull in definition for MSR_GSBASE */ +\#include <machine/specialreg.h> diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.c b/usr/src/uts/i86pc/io/vmm/amd/svm.c new file mode 100644 index 0000000000..25dc3a63fa --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.c @@ -0,0 +1,2446 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/smp.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/pcpu.h> +#include <sys/proc.h> +#include <sys/sysctl.h> + +#ifndef __FreeBSD__ +#include <sys/x86_archext.h> +#include <sys/trap.h> +#endif + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <machine/cpufunc.h> +#include <machine/psl.h> +#include <machine/md_var.h> +#include <machine/reg.h> +#include <machine/specialreg.h> +#include <machine/smp.h> +#include <machine/vmm.h> +#include <machine/vmm_dev.h> +#include <machine/vmm_instruction_emul.h> + +#include "vmm_lapic.h" +#include "vmm_stat.h" +#include "vmm_ktr.h" +#include "vmm_ioport.h" +#include "vatpic.h" +#include "vlapic.h" +#include "vlapic_priv.h" + +#include "x86.h" +#include "vmcb.h" +#include "svm.h" +#include "svm_softc.h" +#include "svm_msr.h" +#include "npt.h" + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, svm, CTLFLAG_RW, NULL, NULL); + +/* + * SVM CPUID function 0x8000_000A, edx bit decoding. + */ +#define AMD_CPUID_SVM_NP BIT(0) /* Nested paging or RVI */ +#define AMD_CPUID_SVM_LBR BIT(1) /* Last branch virtualization */ +#define AMD_CPUID_SVM_SVML BIT(2) /* SVM lock */ +#define AMD_CPUID_SVM_NRIP_SAVE BIT(3) /* Next RIP is saved */ +#define AMD_CPUID_SVM_TSC_RATE BIT(4) /* TSC rate control. */ +#define AMD_CPUID_SVM_VMCB_CLEAN BIT(5) /* VMCB state caching */ +#define AMD_CPUID_SVM_FLUSH_BY_ASID BIT(6) /* Flush by ASID */ +#define AMD_CPUID_SVM_DECODE_ASSIST BIT(7) /* Decode assist */ +#define AMD_CPUID_SVM_PAUSE_INC BIT(10) /* Pause intercept filter. */ +#define AMD_CPUID_SVM_PAUSE_FTH BIT(12) /* Pause filter threshold */ +#define AMD_CPUID_SVM_AVIC BIT(13) /* AVIC present */ + +#define VMCB_CACHE_DEFAULT (VMCB_CACHE_ASID | \ + VMCB_CACHE_IOPM | \ + VMCB_CACHE_I | \ + VMCB_CACHE_TPR | \ + VMCB_CACHE_CR2 | \ + VMCB_CACHE_CR | \ + VMCB_CACHE_DR | \ + VMCB_CACHE_DT | \ + VMCB_CACHE_SEG | \ + VMCB_CACHE_NP) + +static uint32_t vmcb_clean = VMCB_CACHE_DEFAULT; +SYSCTL_INT(_hw_vmm_svm, OID_AUTO, vmcb_clean, CTLFLAG_RDTUN, &vmcb_clean, + 0, NULL); + +static MALLOC_DEFINE(M_SVM, "svm", "svm"); +static MALLOC_DEFINE(M_SVM_VLAPIC, "svm-vlapic", "svm-vlapic"); + +#ifdef __FreeBSD__ +/* Per-CPU context area. */ +extern struct pcpu __pcpu[]; +#endif + +static uint32_t svm_feature = ~0U; /* AMD SVM features. */ +SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, features, CTLFLAG_RDTUN, &svm_feature, 0, + "SVM features advertised by CPUID.8000000AH:EDX"); + +static int disable_npf_assist; +SYSCTL_INT(_hw_vmm_svm, OID_AUTO, disable_npf_assist, CTLFLAG_RWTUN, + &disable_npf_assist, 0, NULL); + +#ifdef __FreeBSD__ +/* Maximum ASIDs supported by the processor */ +static uint32_t nasid; +SYSCTL_UINT(_hw_vmm_svm, OID_AUTO, num_asids, CTLFLAG_RDTUN, &nasid, 0, + "Number of ASIDs supported by this processor"); + +/* Current ASID generation for each host cpu */ +static struct asid asid[MAXCPU]; + +/* + * SVM host state saved area of size 4KB for each core. 
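+ * svm_enable() below points each CPU's VM_HSAVE_PA MSR at its
+ * page:
+ *
+ *	wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu]));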
+ */ +static uint8_t hsave[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); +#endif /* __FreeBSD__ */ + +static VMM_STAT_AMD(VCPU_EXITINTINFO, "VM exits during event delivery"); +static VMM_STAT_AMD(VCPU_INTINFO_INJECTED, "Events pending at VM entry"); +static VMM_STAT_AMD(VMEXIT_VINTR, "VM exits due to interrupt window"); + +static int svm_setreg(void *arg, int vcpu, int ident, uint64_t val); + +static __inline int +flush_by_asid(void) +{ + + return (svm_feature & AMD_CPUID_SVM_FLUSH_BY_ASID); +} + +static __inline int +decode_assist(void) +{ + + return (svm_feature & AMD_CPUID_SVM_DECODE_ASSIST); +} + +#ifdef __FreeBSD__ +static void +svm_disable(void *arg __unused) +{ + uint64_t efer; + + efer = rdmsr(MSR_EFER); + efer &= ~EFER_SVM; + wrmsr(MSR_EFER, efer); +} + +/* + * Disable SVM on all CPUs. + */ +static int +svm_cleanup(void) +{ + + smp_rendezvous(NULL, svm_disable, NULL, NULL); + return (0); +} + +/* + * Verify that all the features required by bhyve are available. + */ +static int +check_svm_features(void) +{ + u_int regs[4]; + + /* CPUID Fn8000_000A is for SVM */ + do_cpuid(0x8000000A, regs); + svm_feature &= regs[3]; + + /* + * The number of ASIDs can be configured to be less than what is + * supported by the hardware but not more. + */ + if (nasid == 0 || nasid > regs[1]) + nasid = regs[1]; + KASSERT(nasid > 1, ("Insufficient ASIDs for guests: %#x", nasid)); + + /* bhyve requires the Nested Paging feature */ + if (!(svm_feature & AMD_CPUID_SVM_NP)) { + printf("SVM: Nested Paging feature not available.\n"); + return (ENXIO); + } + + /* bhyve requires the NRIP Save feature */ + if (!(svm_feature & AMD_CPUID_SVM_NRIP_SAVE)) { + printf("SVM: NRIP Save feature not available.\n"); + return (ENXIO); + } + + return (0); +} + +static void +svm_enable(void *arg __unused) +{ + uint64_t efer; + + efer = rdmsr(MSR_EFER); + efer |= EFER_SVM; + wrmsr(MSR_EFER, efer); + + wrmsr(MSR_VM_HSAVE_PA, vtophys(hsave[curcpu])); +} + +/* + * Return 1 if SVM is enabled on this processor and 0 otherwise. + */ +static int +svm_available(void) +{ + uint64_t msr; + +#ifdef __FreeBSD__ + /* Section 15.4 Enabling SVM from APM2. */ + if ((amd_feature2 & AMDID2_SVM) == 0) { + printf("SVM: not available.\n"); + return (0); + } +#else + if (!is_x86_feature(x86_featureset, X86FSET_SVM)) { + cmn_err(CE_WARN, "processor does not support SVM operation\n"); + return (0); + } +#endif + + msr = rdmsr(MSR_VM_CR); + if ((msr & VM_CR_SVMDIS) != 0) { +#ifdef __FreeBSD__ + printf("SVM: disabled by BIOS.\n"); +#else + cmn_err(CE_WARN, "SVM disabled by BIOS.\n"); +#endif + return (0); + } + + return (1); +} + +static int +svm_init(int ipinum) +{ + int error, cpu; + + if (!svm_available()) + return (ENXIO); + + error = check_svm_features(); + if (error) + return (error); + + vmcb_clean &= VMCB_CACHE_DEFAULT; + + for (cpu = 0; cpu < MAXCPU; cpu++) { + /* + * Initialize the host ASIDs to their "highest" valid values. + * + * The next ASID allocation will rollover both 'gen' and 'num' + * and start off the sequence at {1,1}. 
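+ * Concretely, gen = ~0UL and num = nasid - 1 below mean the
+ * very first guest ASID request on each host CPU overflows
+ * 'num' and wraps 'gen', forcing a fresh generation before any
+ * ASID is handed out.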
+ */ + asid[cpu].gen = ~0UL; + asid[cpu].num = nasid - 1; + } + + svm_msr_init(); + svm_npt_init(ipinum); + + /* Enable SVM on all CPUs */ + smp_rendezvous(NULL, svm_enable, NULL, NULL); + + return (0); +} + +static void +svm_restore(void) +{ + + svm_enable(NULL); +} +#else /* __FreeBSD__ */ +static int +svm_cleanup(void) +{ + /* This is taken care of by the hma registration */ + return (0); +} + +static int +svm_init(int ipinum) +{ + vmcb_clean &= VMCB_CACHE_DEFAULT; + + svm_msr_init(); + svm_npt_init(ipinum); + + return (0); +} + +static void +svm_restore(void) +{ + /* No-op on illumos */ +} +#endif /* __FreeBSD__ */ + +/* Pentium compatible MSRs */ +#define MSR_PENTIUM_START 0 +#define MSR_PENTIUM_END 0x1FFF +/* AMD 6th generation and Intel compatible MSRs */ +#define MSR_AMD6TH_START 0xC0000000UL +#define MSR_AMD6TH_END 0xC0001FFFUL +/* AMD 7th and 8th generation compatible MSRs */ +#define MSR_AMD7TH_START 0xC0010000UL +#define MSR_AMD7TH_END 0xC0011FFFUL + +/* + * Get the index and bit position for a MSR in permission bitmap. + * Two bits are used for each MSR: lower bit for read and higher bit for write. + */ +static int +svm_msr_index(uint64_t msr, int *index, int *bit) +{ + uint32_t base, off; + + *index = -1; + *bit = (msr % 4) * 2; + base = 0; + + if (msr <= MSR_PENTIUM_END) { + *index = msr / 4; + return (0); + } + + base += (MSR_PENTIUM_END - MSR_PENTIUM_START + 1); + if (msr >= MSR_AMD6TH_START && msr <= MSR_AMD6TH_END) { + off = (msr - MSR_AMD6TH_START); + *index = (off + base) / 4; + return (0); + } + + base += (MSR_AMD6TH_END - MSR_AMD6TH_START + 1); + if (msr >= MSR_AMD7TH_START && msr <= MSR_AMD7TH_END) { + off = (msr - MSR_AMD7TH_START); + *index = (off + base) / 4; + return (0); + } + + return (EINVAL); +} + +/* + * Allow vcpu to read or write the 'msr' without trapping into the hypervisor. + */ +static void +svm_msr_perm(uint8_t *perm_bitmap, uint64_t msr, bool read, bool write) +{ + int index, bit, error; + + error = svm_msr_index(msr, &index, &bit); + KASSERT(error == 0, ("%s: invalid msr %#lx", __func__, msr)); + KASSERT(index >= 0 && index < SVM_MSR_BITMAP_SIZE, + ("%s: invalid index %d for msr %#lx", __func__, index, msr)); + KASSERT(bit >= 0 && bit <= 6, ("%s: invalid bit position %d " + "msr %#lx", __func__, bit, msr)); + + if (read) + perm_bitmap[index] &= ~(1UL << bit); + + if (write) + perm_bitmap[index] &= ~(2UL << bit); +} + +static void +svm_msr_rw_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, true); +} + +static void +svm_msr_rd_ok(uint8_t *perm_bitmap, uint64_t msr) +{ + + svm_msr_perm(perm_bitmap, msr, true, false); +} + +static __inline int +svm_get_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask) +{ + struct vmcb_ctrl *ctrl; + + KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + return (ctrl->intercept[idx] & bitmask ? 
1 : 0); +} + +static __inline void +svm_set_intercept(struct svm_softc *sc, int vcpu, int idx, uint32_t bitmask, + int enabled) +{ + struct vmcb_ctrl *ctrl; + uint32_t oldval; + + KASSERT(idx >=0 && idx < 5, ("invalid intercept index %d", idx)); + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + oldval = ctrl->intercept[idx]; + + if (enabled) + ctrl->intercept[idx] |= bitmask; + else + ctrl->intercept[idx] &= ~bitmask; + + if (ctrl->intercept[idx] != oldval) { + svm_set_dirty(sc, vcpu, VMCB_CACHE_I); + VCPU_CTR3(sc->vm, vcpu, "intercept[%d] modified " + "from %#x to %#x", idx, oldval, ctrl->intercept[idx]); + } +} + +static __inline void +svm_disable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) +{ + + svm_set_intercept(sc, vcpu, off, bitmask, 0); +} + +static __inline void +svm_enable_intercept(struct svm_softc *sc, int vcpu, int off, uint32_t bitmask) +{ + + svm_set_intercept(sc, vcpu, off, bitmask, 1); +} + +static void +vmcb_init(struct svm_softc *sc, int vcpu, uint64_t iopm_base_pa, + uint64_t msrpm_base_pa, uint64_t np_pml4) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + uint32_t mask; + int n; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + state = svm_get_vmcb_state(sc, vcpu); + + ctrl->iopm_base_pa = iopm_base_pa; + ctrl->msrpm_base_pa = msrpm_base_pa; + + /* Enable nested paging */ + ctrl->np_enable = 1; + ctrl->n_cr3 = np_pml4; + + /* + * Intercept accesses to the control registers that are not shadowed + * in the VMCB - i.e. all except cr0, cr2, cr3, cr4 and cr8. + */ + for (n = 0; n < 16; n++) { + mask = (BIT(n) << 16) | BIT(n); + if (n == 0 || n == 2 || n == 3 || n == 4 || n == 8) + svm_disable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); + else + svm_enable_intercept(sc, vcpu, VMCB_CR_INTCPT, mask); + } + + + /* + * Intercept everything when tracing guest exceptions otherwise + * just intercept machine check exception. + */ + if (vcpu_trace_exceptions(sc->vm, vcpu)) { + for (n = 0; n < 32; n++) { + /* + * Skip unimplemented vectors in the exception bitmap. + */ + if (n == 2 || n == 9) { + continue; + } + svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(n)); + } + } else { + svm_enable_intercept(sc, vcpu, VMCB_EXC_INTCPT, BIT(IDT_MC)); + } + + /* Intercept various events (for e.g. I/O, MSR and CPUID accesses) */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IO); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_MSR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_CPUID); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INTR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_INIT); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_NMI); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SMI); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_SHUTDOWN); + svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_FERR_FREEZE); + + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MONITOR); + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_MWAIT); + + /* + * From section "Canonicalization and Consistency Checks" in APMv2 + * the VMRUN intercept bit must be set to pass the consistency check. + */ + svm_enable_intercept(sc, vcpu, VMCB_CTRL2_INTCPT, VMCB_INTCPT_VMRUN); + + /* + * The ASID will be set to a non-zero value just before VMRUN. 
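+ * ASID 0 names the host, and a VMCB carrying a zero ASID fails the
+ * VMRUN consistency checks, so check_asid() is relied upon to install
+ * a non-zero value before every entry.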
+ */ + ctrl->asid = 0; + + /* + * Section 15.21.1, Interrupt Masking in EFLAGS + * Section 15.21.2, Virtualizing APIC.TPR + * + * This must be set for %rflag and %cr8 isolation of guest and host. + */ + ctrl->v_intr_masking = 1; + + /* Enable Last Branch Record aka LBR for debugging */ + ctrl->lbr_virt_en = 1; + state->dbgctl = BIT(0); + + /* EFER_SVM must always be set when the guest is executing */ + state->efer = EFER_SVM; + + /* Set up the PAT to power-on state */ + state->g_pat = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + + /* Set up DR6/7 to power-on state */ + state->dr6 = DBREG_DR6_RESERVED1; + state->dr7 = DBREG_DR7_RESERVED1; +} + +/* + * Initialize a virtual machine. + */ +static void * +svm_vminit(struct vm *vm, pmap_t pmap) +{ + struct svm_softc *svm_sc; + struct svm_vcpu *vcpu; + vm_paddr_t msrpm_pa, iopm_pa, pml4_pa; + int i; + uint16_t maxcpus; + + svm_sc = malloc(sizeof (*svm_sc), M_SVM, M_WAITOK | M_ZERO); + if (((uintptr_t)svm_sc & PAGE_MASK) != 0) + panic("malloc of svm_softc not aligned on page boundary"); + + svm_sc->msr_bitmap = contigmalloc(SVM_MSR_BITMAP_SIZE, M_SVM, + M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); + if (svm_sc->msr_bitmap == NULL) + panic("contigmalloc of SVM MSR bitmap failed"); + svm_sc->iopm_bitmap = contigmalloc(SVM_IO_BITMAP_SIZE, M_SVM, + M_WAITOK, 0, ~(vm_paddr_t)0, PAGE_SIZE, 0); + if (svm_sc->iopm_bitmap == NULL) + panic("contigmalloc of SVM IO bitmap failed"); + + svm_sc->vm = vm; + svm_sc->nptp = (vm_offset_t)vtophys(pmap->pm_pml4); + + /* + * Intercept read and write accesses to all MSRs. + */ + memset(svm_sc->msr_bitmap, 0xFF, SVM_MSR_BITMAP_SIZE); + + /* + * Access to the following MSRs is redirected to the VMCB when the + * guest is executing. Therefore it is safe to allow the guest to + * read/write these MSRs directly without hypervisor involvement. + */ + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_GSBASE); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_FSBASE); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_KGSBASE); + + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_STAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_LSTAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_CSTAR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SF_MASK); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_CS_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_ESP_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_SYSENTER_EIP_MSR); + svm_msr_rw_ok(svm_sc->msr_bitmap, MSR_PAT); + + svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_TSC); + + /* + * Intercept writes to make sure that the EFER_SVM bit is not cleared. + */ + svm_msr_rd_ok(svm_sc->msr_bitmap, MSR_EFER); + + /* Intercept access to all I/O ports. */ + memset(svm_sc->iopm_bitmap, 0xFF, SVM_IO_BITMAP_SIZE); + + iopm_pa = vtophys(svm_sc->iopm_bitmap); + msrpm_pa = vtophys(svm_sc->msr_bitmap); + pml4_pa = svm_sc->nptp; + maxcpus = vm_get_maxcpus(svm_sc->vm); + for (i = 0; i < maxcpus; i++) { + vcpu = svm_get_vcpu(svm_sc, i); + vcpu->nextrip = ~0; + vcpu->lastcpu = NOCPU; + vcpu->vmcb_pa = vtophys(&vcpu->vmcb); + vmcb_init(svm_sc, i, iopm_pa, msrpm_pa, pml4_pa); + svm_msr_guest_init(svm_sc, i); + } + return (svm_sc); +} + +/* + * Collateral for a generic SVM VM-exit. 
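+ *
+ * Any #VMEXIT that no handler in svm_vmexit() claims is surfaced to
+ * userland this way, with the raw exit code and both exit-info words
+ * preserved.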
+ */ +static void +vm_exit_svm(struct vm_exit *vme, uint64_t code, uint64_t info1, uint64_t info2) +{ + + vme->exitcode = VM_EXITCODE_SVM; + vme->u.svm.exitcode = code; + vme->u.svm.exitinfo1 = info1; + vme->u.svm.exitinfo2 = info2; +} + +static int +svm_cpl(struct vmcb_state *state) +{ + + /* + * From APMv2: + * "Retrieve the CPL from the CPL field in the VMCB, not + * from any segment DPL" + */ + return (state->cpl); +} + +static enum vm_cpu_mode +svm_vcpu_mode(struct vmcb *vmcb) +{ + struct vmcb_segment seg; + struct vmcb_state *state; + int error; + + state = &vmcb->state; + + if (state->efer & EFER_LMA) { + error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); + KASSERT(error == 0, ("%s: vmcb_seg(cs) error %d", __func__, + error)); + + /* + * Section 4.8.1 for APM2, check if Code Segment has + * Long attribute set in descriptor. + */ + if (seg.attrib & VMCB_CS_ATTRIB_L) + return (CPU_MODE_64BIT); + else + return (CPU_MODE_COMPATIBILITY); + } else if (state->cr0 & CR0_PE) { + return (CPU_MODE_PROTECTED); + } else { + return (CPU_MODE_REAL); + } +} + +static enum vm_paging_mode +svm_paging_mode(uint64_t cr0, uint64_t cr4, uint64_t efer) +{ + + if ((cr0 & CR0_PG) == 0) + return (PAGING_MODE_FLAT); + if ((cr4 & CR4_PAE) == 0) + return (PAGING_MODE_32); + if (efer & EFER_LME) + return (PAGING_MODE_64); + else + return (PAGING_MODE_PAE); +} + +/* + * ins/outs utility routines + */ +static uint64_t +svm_inout_str_index(struct svm_regctx *regs, int in) +{ + uint64_t val; + + val = in ? regs->sctx_rdi : regs->sctx_rsi; + + return (val); +} + +static uint64_t +svm_inout_str_count(struct svm_regctx *regs, int rep) +{ + uint64_t val; + + val = rep ? regs->sctx_rcx : 1; + + return (val); +} + +static void +svm_inout_str_seginfo(struct svm_softc *svm_sc, int vcpu, int64_t info1, + int in, struct vm_inout_str *vis) +{ + int error, s; + + if (in) { + vis->seg_name = VM_REG_GUEST_ES; + } else { + /* The segment field has standard encoding */ + s = (info1 >> 10) & 0x7; + vis->seg_name = vm_segment_name(s); + } + + error = vmcb_getdesc(svm_sc, vcpu, vis->seg_name, &vis->seg_desc); + KASSERT(error == 0, ("%s: svm_getdesc error %d", __func__, error)); +} + +static int +svm_inout_str_addrsize(uint64_t info1) +{ + uint32_t size; + + size = (info1 >> 7) & 0x7; + switch (size) { + case 1: + return (2); /* 16 bit */ + case 2: + return (4); /* 32 bit */ + case 4: + return (8); /* 64 bit */ + default: + panic("%s: invalid size encoding %d", __func__, size); + } +} + +static void +svm_paging_info(struct vmcb *vmcb, struct vm_guest_paging *paging) +{ + struct vmcb_state *state; + + state = &vmcb->state; + paging->cr3 = state->cr3; + paging->cpl = svm_cpl(state); + paging->cpu_mode = svm_vcpu_mode(vmcb); + paging->paging_mode = svm_paging_mode(state->cr0, state->cr4, + state->efer); +} + +#define UNHANDLED 0 + +/* + * Handle guest I/O intercept. + */ +static int +svm_handle_io(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + struct svm_regctx *regs; + struct vm_inout_str *vis; + uint64_t info1; + int inout_string; + + state = svm_get_vmcb_state(svm_sc, vcpu); + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + regs = svm_get_guest_regctx(svm_sc, vcpu); + + info1 = ctrl->exitinfo1; + inout_string = info1 & BIT(2) ? 1 : 0; + + /* + * The effective segment number in EXITINFO1[12:10] is populated + * only if the processor has the DecodeAssist capability. + * + * XXX this is not specified explicitly in APMv2 but can be verified + * empirically. 
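+ *
+ * For reference, the EXITINFO1 bits decoded below are, taking a
+ * hypothetical "rep outsw" to port 0x3f8 as an example:
+ *
+ *	bit 0       direction (1 = in)                  -> 0
+ *	bit 2       string instruction                  -> 1
+ *	bit 3       REP prefix                          -> 1
+ *	bits 6:4    operand size in bytes (1, 2 or 4)   -> 2
+ *	bits 9:7    address size (1/2/4 = 16/32/64-bit)
+ *	bits 12:10  effective segment (decode assist only)
+ *	bits 31:16  port number                         -> 0x3f8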
+ */ + if (inout_string && !decode_assist()) + return (UNHANDLED); + + vmexit->exitcode = VM_EXITCODE_INOUT; + vmexit->u.inout.in = (info1 & BIT(0)) ? 1 : 0; + vmexit->u.inout.string = inout_string; + vmexit->u.inout.rep = (info1 & BIT(3)) ? 1 : 0; + vmexit->u.inout.bytes = (info1 >> 4) & 0x7; + vmexit->u.inout.port = (uint16_t)(info1 >> 16); + vmexit->u.inout.eax = (uint32_t)(state->rax); + + if (inout_string) { + vmexit->exitcode = VM_EXITCODE_INOUT_STR; + vis = &vmexit->u.inout_str; + svm_paging_info(svm_get_vmcb(svm_sc, vcpu), &vis->paging); + vis->rflags = state->rflags; + vis->cr0 = state->cr0; + vis->index = svm_inout_str_index(regs, vmexit->u.inout.in); + vis->count = svm_inout_str_count(regs, vmexit->u.inout.rep); + vis->addrsize = svm_inout_str_addrsize(info1); + svm_inout_str_seginfo(svm_sc, vcpu, info1, + vmexit->u.inout.in, vis); + } + + return (UNHANDLED); +} + +static int +npf_fault_type(uint64_t exitinfo1) +{ + + if (exitinfo1 & VMCB_NPF_INFO1_W) + return (VM_PROT_WRITE); + else if (exitinfo1 & VMCB_NPF_INFO1_ID) + return (VM_PROT_EXECUTE); + else + return (VM_PROT_READ); +} + +static bool +svm_npf_emul_fault(uint64_t exitinfo1) +{ + + if (exitinfo1 & VMCB_NPF_INFO1_ID) { + return (false); + } + + if (exitinfo1 & VMCB_NPF_INFO1_GPT) { + return (false); + } + + if ((exitinfo1 & VMCB_NPF_INFO1_GPA) == 0) { + return (false); + } + + return (true); +} + +static void +svm_handle_inst_emul(struct vmcb *vmcb, uint64_t gpa, struct vm_exit *vmexit) +{ + struct vm_guest_paging *paging; + struct vmcb_segment seg; + struct vmcb_ctrl *ctrl; + char *inst_bytes; + int error, inst_len; + + ctrl = &vmcb->ctrl; + paging = &vmexit->u.inst_emul.paging; + + vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->u.inst_emul.gpa = gpa; + vmexit->u.inst_emul.gla = VIE_INVALID_GLA; + svm_paging_info(vmcb, paging); + + error = vmcb_seg(vmcb, VM_REG_GUEST_CS, &seg); + KASSERT(error == 0, ("%s: vmcb_seg(CS) error %d", __func__, error)); + + switch(paging->cpu_mode) { + case CPU_MODE_REAL: + vmexit->u.inst_emul.cs_base = seg.base; + vmexit->u.inst_emul.cs_d = 0; + break; + case CPU_MODE_PROTECTED: + case CPU_MODE_COMPATIBILITY: + vmexit->u.inst_emul.cs_base = seg.base; + + /* + * Section 4.8.1 of APM2, Default Operand Size or D bit. + */ + vmexit->u.inst_emul.cs_d = (seg.attrib & VMCB_CS_ATTRIB_D) ? + 1 : 0; + break; + default: + vmexit->u.inst_emul.cs_base = 0; + vmexit->u.inst_emul.cs_d = 0; + break; + } + + /* + * Copy the instruction bytes into 'vie' if available. + */ + if (decode_assist() && !disable_npf_assist) { + inst_len = ctrl->inst_len; + inst_bytes = (char *)ctrl->inst_bytes; + } else { + inst_len = 0; + inst_bytes = NULL; + } + vie_init(&vmexit->u.inst_emul.vie, inst_bytes, inst_len); +} + +#ifdef KTR +static const char * +intrtype_to_str(int intr_type) +{ + switch (intr_type) { + case VMCB_EVENTINJ_TYPE_INTR: + return ("hwintr"); + case VMCB_EVENTINJ_TYPE_NMI: + return ("nmi"); + case VMCB_EVENTINJ_TYPE_INTn: + return ("swintr"); + case VMCB_EVENTINJ_TYPE_EXCEPTION: + return ("exception"); + default: + panic("%s: unknown intr_type %d", __func__, intr_type); + } +} +#endif + +/* + * Inject an event to vcpu as described in section 15.20, "Event injection". 
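+ *
+ * The EVENTINJ value assembled below packs, e.g. for a hypothetical #GP
+ * (vector 13) carrying a zero error code:
+ *
+ *	bits 7:0    vector            -> 13
+ *	bits 10:8   type              -> 3 (exception)
+ *	bit 11      error code valid  -> 1
+ *	bit 31      valid             -> 1
+ *	bits 63:32  error code        -> 0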
+ */ +static void +svm_eventinject(struct svm_softc *sc, int vcpu, int intr_type, int vector, + uint32_t error, bool ec_valid) +{ + struct vmcb_ctrl *ctrl; + + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, + ("%s: event already pending %#lx", __func__, ctrl->eventinj)); + + KASSERT(vector >=0 && vector <= 255, ("%s: invalid vector %d", + __func__, vector)); + + switch (intr_type) { + case VMCB_EVENTINJ_TYPE_INTR: + case VMCB_EVENTINJ_TYPE_NMI: + case VMCB_EVENTINJ_TYPE_INTn: + break; + case VMCB_EVENTINJ_TYPE_EXCEPTION: + if (vector >= 0 && vector <= 31 && vector != 2) + break; + /* FALLTHROUGH */ + default: + panic("%s: invalid intr_type/vector: %d/%d", __func__, + intr_type, vector); + } + ctrl->eventinj = vector | (intr_type << 8) | VMCB_EVENTINJ_VALID; + if (ec_valid) { + ctrl->eventinj |= VMCB_EVENTINJ_EC_VALID; + ctrl->eventinj |= (uint64_t)error << 32; + VCPU_CTR3(sc->vm, vcpu, "Injecting %s at vector %d errcode %#x", + intrtype_to_str(intr_type), vector, error); + } else { + VCPU_CTR2(sc->vm, vcpu, "Injecting %s at vector %d", + intrtype_to_str(intr_type), vector); + } +} + +static void +svm_update_virqinfo(struct svm_softc *sc, int vcpu) +{ + struct vm *vm; + struct vlapic *vlapic; + struct vmcb_ctrl *ctrl; + + vm = sc->vm; + vlapic = vm_lapic(vm, vcpu); + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + + /* Update %cr8 in the emulated vlapic */ + vlapic_set_cr8(vlapic, ctrl->v_tpr); + + /* Virtual interrupt injection is not used. */ + KASSERT(ctrl->v_intr_vector == 0, ("%s: invalid " + "v_intr_vector %d", __func__, ctrl->v_intr_vector)); +} + +static void +svm_save_intinfo(struct svm_softc *svm_sc, int vcpu) +{ + struct vmcb_ctrl *ctrl; + uint64_t intinfo; + + ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu); + intinfo = ctrl->exitintinfo; + if (!VMCB_EXITINTINFO_VALID(intinfo)) + return; + + /* + * From APMv2, Section "Intercepts during IDT interrupt delivery" + * + * If a #VMEXIT happened during event delivery then record the event + * that was being delivered. 
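+ *
+ * EXITINTINFO shares its layout with EVENTINJ, so the value saved here
+ * can later be unpacked field-for-field by svm_inj_intinfo() and
+ * re-injected on a subsequent VM entry.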
+ */
+	VCPU_CTR2(svm_sc->vm, vcpu, "SVM:Pending INTINFO(0x%lx), vector=%d.\n",
+	    intinfo, VMCB_EXITINTINFO_VECTOR(intinfo));
+	vmm_stat_incr(svm_sc->vm, vcpu, VCPU_EXITINTINFO, 1);
+	vm_exit_intinfo(svm_sc->vm, vcpu, intinfo);
+}
+
+static __inline int
+vintr_intercept_enabled(struct svm_softc *sc, int vcpu)
+{
+
+	return (svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+	    VMCB_INTCPT_VINTR));
+}
+
+static __inline void
+enable_intr_window_exiting(struct svm_softc *sc, int vcpu)
+{
+	struct vmcb_ctrl *ctrl;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+
+	if (ctrl->v_irq && ctrl->v_intr_vector == 0) {
+		KASSERT(ctrl->v_ign_tpr, ("%s: invalid v_ign_tpr", __func__));
+		KASSERT(vintr_intercept_enabled(sc, vcpu),
+		    ("%s: vintr intercept should be enabled", __func__));
+		return;
+	}
+
+	VCPU_CTR0(sc->vm, vcpu, "Enable intr window exiting");
+	ctrl->v_irq = 1;
+	ctrl->v_ign_tpr = 1;
+	ctrl->v_intr_vector = 0;
+	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
+}
+
+static __inline void
+disable_intr_window_exiting(struct svm_softc *sc, int vcpu)
+{
+	struct vmcb_ctrl *ctrl;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+
+	if (!ctrl->v_irq && ctrl->v_intr_vector == 0) {
+		KASSERT(!vintr_intercept_enabled(sc, vcpu),
+		    ("%s: vintr intercept should be disabled", __func__));
+		return;
+	}
+
+	VCPU_CTR0(sc->vm, vcpu, "Disable intr window exiting");
+	ctrl->v_irq = 0;
+	ctrl->v_intr_vector = 0;
+	svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR);
+	svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_VINTR);
+}
+
+static int
+svm_modify_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t val)
+{
+	struct vmcb_ctrl *ctrl;
+	int oldval, newval;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	oldval = ctrl->intr_shadow;
+	newval = val ? 1 : 0;
+	if (newval != oldval) {
+		ctrl->intr_shadow = newval;
+		VCPU_CTR1(sc->vm, vcpu, "Setting intr_shadow to %d", newval);
+	}
+	return (0);
+}
+
+static int
+svm_get_intr_shadow(struct svm_softc *sc, int vcpu, uint64_t *val)
+{
+	struct vmcb_ctrl *ctrl;
+
+	ctrl = svm_get_vmcb_ctrl(sc, vcpu);
+	*val = ctrl->intr_shadow;
+	return (0);
+}
+
+/*
+ * Once an NMI is injected, it blocks delivery of further NMIs until the
+ * handler executes an IRET. The IRET intercept is enabled when an NMI is
+ * injected to track when the vcpu is done handling the NMI.
+ */
+static int
+nmi_blocked(struct svm_softc *sc, int vcpu)
+{
+	int blocked;
+
+	blocked = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT,
+	    VMCB_INTCPT_IRET);
+	return (blocked);
+}
+
+static void
+enable_nmi_blocking(struct svm_softc *sc, int vcpu)
+{
+
+	KASSERT(!nmi_blocked(sc, vcpu), ("vNMI already blocked"));
+	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking enabled");
+	svm_enable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET);
+}
+
+static void
+clear_nmi_blocking(struct svm_softc *sc, int vcpu)
+{
+	int error;
+
+	KASSERT(nmi_blocked(sc, vcpu), ("vNMI already unblocked"));
+	VCPU_CTR0(sc->vm, vcpu, "vNMI blocking cleared");
+	/*
+	 * When the IRET intercept is cleared the vcpu will attempt to execute
+	 * the "iret" when it runs next. However, it is possible to inject
+	 * another NMI into the vcpu before the "iret" has actually executed.
+	 *
+	 * For example, if the "iret" encounters a #NPF when accessing the
+	 * stack it will trap back into the hypervisor. If an NMI is pending
+	 * for the vcpu it will be injected into the guest.
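+	 * That would break the architectural guarantee that a second NMI is
+	 * held pending until the first handler's "iret" has retired.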
+ * + * XXX this needs to be fixed + */ + svm_disable_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, VMCB_INTCPT_IRET); + + /* + * Set 'intr_shadow' to prevent an NMI from being injected on the + * immediate VMRUN. + */ + error = svm_modify_intr_shadow(sc, vcpu, 1); + KASSERT(!error, ("%s: error %d setting intr_shadow", __func__, error)); +} + +#define EFER_MBZ_BITS 0xFFFFFFFFFFFF0200UL + +static int +svm_write_efer(struct svm_softc *sc, int vcpu, uint64_t newval, bool *retu) +{ + struct vm_exit *vme; + struct vmcb_state *state; + uint64_t changed, lma, oldval; + int error; + + state = svm_get_vmcb_state(sc, vcpu); + + oldval = state->efer; + VCPU_CTR2(sc->vm, vcpu, "wrmsr(efer) %#lx/%#lx", oldval, newval); + + newval &= ~0xFE; /* clear the Read-As-Zero (RAZ) bits */ + changed = oldval ^ newval; + + if (newval & EFER_MBZ_BITS) + goto gpf; + + /* APMv2 Table 14-5 "Long-Mode Consistency Checks" */ + if (changed & EFER_LME) { + if (state->cr0 & CR0_PG) + goto gpf; + } + + /* EFER.LMA = EFER.LME & CR0.PG */ + if ((newval & EFER_LME) != 0 && (state->cr0 & CR0_PG) != 0) + lma = EFER_LMA; + else + lma = 0; + + if ((newval & EFER_LMA) != lma) + goto gpf; + + if (newval & EFER_NXE) { + if (!vm_cpuid_capability(sc->vm, vcpu, VCC_NO_EXECUTE)) + goto gpf; + } + + /* + * XXX bhyve does not enforce segment limits in 64-bit mode. Until + * this is fixed flag guest attempt to set EFER_LMSLE as an error. + */ + if (newval & EFER_LMSLE) { + vme = vm_exitinfo(sc->vm, vcpu); + vm_exit_svm(vme, VMCB_EXIT_MSR, 1, 0); + *retu = true; + return (0); + } + + if (newval & EFER_FFXSR) { + if (!vm_cpuid_capability(sc->vm, vcpu, VCC_FFXSR)) + goto gpf; + } + + if (newval & EFER_TCE) { + if (!vm_cpuid_capability(sc->vm, vcpu, VCC_TCE)) + goto gpf; + } + + error = svm_setreg(sc, vcpu, VM_REG_GUEST_EFER, newval); + KASSERT(error == 0, ("%s: error %d updating efer", __func__, error)); + return (0); +gpf: + vm_inject_gp(sc->vm, vcpu); + return (0); +} + +static int +emulate_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, + bool *retu) +{ + int error; + + if (lapic_msr(num)) + error = lapic_wrmsr(sc->vm, vcpu, num, val, retu); + else if (num == MSR_EFER) + error = svm_write_efer(sc, vcpu, val, retu); + else + error = svm_wrmsr(sc, vcpu, num, val, retu); + + return (error); +} + +static int +emulate_rdmsr(struct svm_softc *sc, int vcpu, u_int num, bool *retu) +{ + struct vmcb_state *state; + struct svm_regctx *ctx; + uint64_t result; + int error; + + if (lapic_msr(num)) + error = lapic_rdmsr(sc->vm, vcpu, num, &result, retu); + else + error = svm_rdmsr(sc, vcpu, num, &result, retu); + + if (error == 0) { + state = svm_get_vmcb_state(sc, vcpu); + ctx = svm_get_guest_regctx(sc, vcpu); + state->rax = result & 0xffffffff; + ctx->sctx_rdx = result >> 32; + } + + return (error); +} + +#ifdef KTR +static const char * +exit_reason_to_str(uint64_t reason) +{ + static char reasonbuf[32]; + + switch (reason) { + case VMCB_EXIT_INVALID: + return ("invalvmcb"); + case VMCB_EXIT_SHUTDOWN: + return ("shutdown"); + case VMCB_EXIT_NPF: + return ("nptfault"); + case VMCB_EXIT_PAUSE: + return ("pause"); + case VMCB_EXIT_HLT: + return ("hlt"); + case VMCB_EXIT_CPUID: + return ("cpuid"); + case VMCB_EXIT_IO: + return ("inout"); + case VMCB_EXIT_MC: + return ("mchk"); + case VMCB_EXIT_INTR: + return ("extintr"); + case VMCB_EXIT_NMI: + return ("nmi"); + case VMCB_EXIT_VINTR: + return ("vintr"); + case VMCB_EXIT_MSR: + return ("msr"); + case VMCB_EXIT_IRET: + return ("iret"); + case VMCB_EXIT_MONITOR: + return ("monitor"); + case 
VMCB_EXIT_MWAIT: + return ("mwait"); + default: + snprintf(reasonbuf, sizeof(reasonbuf), "%#lx", reason); + return (reasonbuf); + } +} +#endif /* KTR */ + +/* + * From section "State Saved on Exit" in APMv2: nRIP is saved for all #VMEXITs + * that are due to instruction intercepts as well as MSR and IOIO intercepts + * and exceptions caused by INT3, INTO and BOUND instructions. + * + * Return 1 if the nRIP is valid and 0 otherwise. + */ +static int +nrip_valid(uint64_t exitcode) +{ + switch (exitcode) { + case 0x00 ... 0x0F: /* read of CR0 through CR15 */ + case 0x10 ... 0x1F: /* write of CR0 through CR15 */ + case 0x20 ... 0x2F: /* read of DR0 through DR15 */ + case 0x30 ... 0x3F: /* write of DR0 through DR15 */ + case 0x43: /* INT3 */ + case 0x44: /* INTO */ + case 0x45: /* BOUND */ + case 0x65 ... 0x7C: /* VMEXIT_CR0_SEL_WRITE ... VMEXIT_MSR */ + case 0x80 ... 0x8D: /* VMEXIT_VMRUN ... VMEXIT_XSETBV */ + return (1); + default: + return (0); + } +} + +static int +svm_vmexit(struct svm_softc *svm_sc, int vcpu, struct vm_exit *vmexit) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_ctrl *ctrl; + struct svm_regctx *ctx; + uint64_t code, info1, info2, val; + uint32_t eax, ecx, edx; +#ifdef __FreeBSD__ + int error, errcode_valid, handled, idtvec, reflect; +#else + int error, errcode_valid = 0, handled, idtvec, reflect; +#endif + bool retu; + + ctx = svm_get_guest_regctx(svm_sc, vcpu); + vmcb = svm_get_vmcb(svm_sc, vcpu); + state = &vmcb->state; + ctrl = &vmcb->ctrl; + + handled = 0; + code = ctrl->exitcode; + info1 = ctrl->exitinfo1; + info2 = ctrl->exitinfo2; + + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmexit->rip = state->rip; + vmexit->inst_length = nrip_valid(code) ? ctrl->nrip - state->rip : 0; + + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_COUNT, 1); + + /* + * #VMEXIT(INVALID) needs to be handled early because the VMCB is + * in an inconsistent state and can trigger assertions that would + * never happen otherwise. + */ + if (code == VMCB_EXIT_INVALID) { + vm_exit_svm(vmexit, code, info1, info2); + return (0); + } + + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) == 0, ("%s: event " + "injection valid bit is set %#lx", __func__, ctrl->eventinj)); + + KASSERT(vmexit->inst_length >= 0 && vmexit->inst_length <= 15, + ("invalid inst_length %d: code (%#lx), info1 (%#lx), info2 (%#lx)", + vmexit->inst_length, code, info1, info2)); + + svm_update_virqinfo(svm_sc, vcpu); + svm_save_intinfo(svm_sc, vcpu); + + switch (code) { + case VMCB_EXIT_IRET: + /* + * Restart execution at "iret" but with the intercept cleared. + */ + vmexit->inst_length = 0; + clear_nmi_blocking(svm_sc, vcpu); + handled = 1; + break; + case VMCB_EXIT_VINTR: /* interrupt window exiting */ + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_VINTR, 1); + handled = 1; + break; + case VMCB_EXIT_INTR: /* external interrupt */ + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXTINT, 1); + handled = 1; + break; + case VMCB_EXIT_NMI: /* external NMI */ + handled = 1; + break; + case 0x40 ... 0x5F: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_EXCEPTION, 1); + reflect = 1; + idtvec = code - 0x40; + switch (idtvec) { + case IDT_MC: + /* + * Call the machine check handler by hand. Also don't + * reflect the machine check back into the guest. 
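+ * (Vector 18 is #MC; the illumos build reaches the host handler via
+ * vmm_call_trap(T_MCE) rather than a literal "int $18".)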
+ */ + reflect = 0; + VCPU_CTR0(svm_sc->vm, vcpu, "Vectoring to MCE handler"); +#ifdef __FreeBSD__ + __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif + break; + case IDT_PF: + error = svm_setreg(svm_sc, vcpu, VM_REG_GUEST_CR2, + info2); + KASSERT(error == 0, ("%s: error %d updating cr2", + __func__, error)); + /* fallthru */ + case IDT_NP: + case IDT_SS: + case IDT_GP: + case IDT_AC: + case IDT_TS: + errcode_valid = 1; + break; + + case IDT_DF: + errcode_valid = 1; + info1 = 0; + break; + + case IDT_BP: + case IDT_OF: + case IDT_BR: + /* + * The 'nrip' field is populated for INT3, INTO and + * BOUND exceptions and this also implies that + * 'inst_length' is non-zero. + * + * Reset 'inst_length' to zero so the guest %rip at + * event injection is identical to what it was when + * the exception originally happened. + */ + VCPU_CTR2(svm_sc->vm, vcpu, "Reset inst_length from %d " + "to zero before injecting exception %d", + vmexit->inst_length, idtvec); + vmexit->inst_length = 0; + /* fallthru */ + default: + errcode_valid = 0; + info1 = 0; + break; + } + KASSERT(vmexit->inst_length == 0, ("invalid inst_length (%d) " + "when reflecting exception %d into guest", + vmexit->inst_length, idtvec)); + + if (reflect) { + /* Reflect the exception back into the guest */ + VCPU_CTR2(svm_sc->vm, vcpu, "Reflecting exception " + "%d/%#x into the guest", idtvec, (int)info1); + error = vm_inject_exception(svm_sc->vm, vcpu, idtvec, + errcode_valid, info1, 0); + KASSERT(error == 0, ("%s: vm_inject_exception error %d", + __func__, error)); + } + handled = 1; + break; + case VMCB_EXIT_MSR: /* MSR access. */ + eax = state->rax; + ecx = ctx->sctx_rcx; + edx = ctx->sctx_rdx; + retu = false; + + if (info1) { + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_WRMSR, 1); + val = (uint64_t)edx << 32 | eax; + VCPU_CTR2(svm_sc->vm, vcpu, "wrmsr %#x val %#lx", + ecx, val); + if (emulate_wrmsr(svm_sc, vcpu, ecx, val, &retu)) { + vmexit->exitcode = VM_EXITCODE_WRMSR; + vmexit->u.msr.code = ecx; + vmexit->u.msr.wval = val; + } else if (!retu) { + handled = 1; + } else { + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_wrmsr retu with bogus exitcode")); + } + } else { + VCPU_CTR1(svm_sc->vm, vcpu, "rdmsr %#x", ecx); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_RDMSR, 1); + if (emulate_rdmsr(svm_sc, vcpu, ecx, &retu)) { + vmexit->exitcode = VM_EXITCODE_RDMSR; + vmexit->u.msr.code = ecx; + } else if (!retu) { + handled = 1; + } else { + KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS, + ("emulate_rdmsr retu with bogus exitcode")); + } + } + break; + case VMCB_EXIT_IO: + handled = svm_handle_io(svm_sc, vcpu, vmexit); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INOUT, 1); + break; + case VMCB_EXIT_CPUID: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_CPUID, 1); + handled = x86_emulate_cpuid(svm_sc->vm, vcpu, + (uint32_t *)&state->rax, + (uint32_t *)&ctx->sctx_rbx, + (uint32_t *)&ctx->sctx_rcx, + (uint32_t *)&ctx->sctx_rdx); + break; + case VMCB_EXIT_HLT: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_HLT, 1); + vmexit->exitcode = VM_EXITCODE_HLT; + vmexit->u.hlt.rflags = state->rflags; + break; + case VMCB_EXIT_PAUSE: + vmexit->exitcode = VM_EXITCODE_PAUSE; + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_PAUSE, 1); + break; + case VMCB_EXIT_NPF: + /* EXITINFO2 contains the faulting guest physical address */ + if (info1 & VMCB_NPF_INFO1_RSV) { + VCPU_CTR2(svm_sc->vm, vcpu, "nested page fault with " + "reserved bits set: info1(%#lx) info2(%#lx)", + info1, info2); + } else if (vm_mem_allocated(svm_sc->vm, vcpu, info2)) { + vmexit->exitcode 
= VM_EXITCODE_PAGING; + vmexit->u.paging.gpa = info2; + vmexit->u.paging.fault_type = npf_fault_type(info1); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_NESTED_FAULT, 1); + VCPU_CTR3(svm_sc->vm, vcpu, "nested page fault " + "on gpa %#lx/%#lx at rip %#lx", + info2, info1, state->rip); + } else if (svm_npf_emul_fault(info1)) { + svm_handle_inst_emul(vmcb, info2, vmexit); + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_INST_EMUL, 1); + VCPU_CTR3(svm_sc->vm, vcpu, "inst_emul fault " + "for gpa %#lx/%#lx at rip %#lx", + info2, info1, state->rip); + } + break; + case VMCB_EXIT_MONITOR: + vmexit->exitcode = VM_EXITCODE_MONITOR; + break; + case VMCB_EXIT_MWAIT: + vmexit->exitcode = VM_EXITCODE_MWAIT; + break; + default: + vmm_stat_incr(svm_sc->vm, vcpu, VMEXIT_UNKNOWN, 1); + break; + } + + VCPU_CTR4(svm_sc->vm, vcpu, "%s %s vmexit at %#lx/%d", + handled ? "handled" : "unhandled", exit_reason_to_str(code), + vmexit->rip, vmexit->inst_length); + + if (handled) { + vmexit->rip += vmexit->inst_length; + vmexit->inst_length = 0; + state->rip = vmexit->rip; + } else { + if (vmexit->exitcode == VM_EXITCODE_BOGUS) { + /* + * If this VM exit was not claimed by anybody then + * treat it as a generic SVM exit. + */ + vm_exit_svm(vmexit, code, info1, info2); + } else { + /* + * The exitcode and collateral have been populated. + * The VM exit will be processed further in userland. + */ + } + } + return (handled); +} + +static void +svm_inj_intinfo(struct svm_softc *svm_sc, int vcpu) +{ + uint64_t intinfo; + + if (!vm_entry_intinfo(svm_sc->vm, vcpu, &intinfo)) + return; + + KASSERT(VMCB_EXITINTINFO_VALID(intinfo), ("%s: entry intinfo is not " + "valid: %#lx", __func__, intinfo)); + + svm_eventinject(svm_sc, vcpu, VMCB_EXITINTINFO_TYPE(intinfo), + VMCB_EXITINTINFO_VECTOR(intinfo), + VMCB_EXITINTINFO_EC(intinfo), + VMCB_EXITINTINFO_EC_VALID(intinfo)); + vmm_stat_incr(svm_sc->vm, vcpu, VCPU_INTINFO_INJECTED, 1); + VCPU_CTR1(svm_sc->vm, vcpu, "Injected entry intinfo: %#lx", intinfo); +} + +/* + * Inject event to virtual cpu. + */ +static void +svm_inj_interrupts(struct svm_softc *sc, int vcpu, struct vlapic *vlapic) +{ + struct vmcb_ctrl *ctrl; + struct vmcb_state *state; + struct svm_vcpu *vcpustate; + uint8_t v_tpr; + int vector, need_intr_window; + int extint_pending; + + state = svm_get_vmcb_state(sc, vcpu); + ctrl = svm_get_vmcb_ctrl(sc, vcpu); + vcpustate = svm_get_vcpu(sc, vcpu); + + need_intr_window = 0; + + vlapic_tmr_update(vlapic); + + if (vcpustate->nextrip != state->rip) { + ctrl->intr_shadow = 0; + VCPU_CTR2(sc->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vcpustate->nextrip, state->rip); + } + + /* + * Inject pending events or exceptions for this vcpu. + * + * An event might be pending because the previous #VMEXIT happened + * during event delivery (i.e. ctrl->exitintinfo). + * + * An event might also be pending because an exception was injected + * by the hypervisor (e.g. #PF during instruction emulation). + */ + svm_inj_intinfo(sc, vcpu); + + /* NMI event has priority over interrupts. */ + if (vm_nmi_pending(sc->vm, vcpu)) { + if (nmi_blocked(sc, vcpu)) { + /* + * Can't inject another NMI if the guest has not + * yet executed an "iret" after the last NMI. + */ + VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due " + "to NMI-blocking"); + } else if (ctrl->intr_shadow) { + /* + * Can't inject an NMI if the vcpu is in an intr_shadow. 
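+ * The shadow covers the one-instruction window after STI or MOV SS, so
+ * an interrupt window exit is requested below and the injection is
+ * retried once the shadow has retired.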
+ */ + VCPU_CTR0(sc->vm, vcpu, "Cannot inject NMI due to " + "interrupt shadow"); + need_intr_window = 1; + goto done; + } else if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + /* + * If there is already an exception/interrupt pending + * then defer the NMI until after that. + */ + VCPU_CTR1(sc->vm, vcpu, "Cannot inject NMI due to " + "eventinj %#lx", ctrl->eventinj); + + /* + * Use self-IPI to trigger a VM-exit as soon as + * possible after the event injection is completed. + * + * This works only if the external interrupt exiting + * is at a lower priority than the event injection. + * + * Although not explicitly specified in APMv2 the + * relative priorities were verified empirically. + */ + ipi_cpu(curcpu, IPI_AST); /* XXX vmm_ipinum? */ + } else { + vm_nmi_clear(sc->vm, vcpu); + + /* Inject NMI, vector number is not used */ + svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_NMI, + IDT_NMI, 0, false); + + /* virtual NMI blocking is now in effect */ + enable_nmi_blocking(sc, vcpu); + + VCPU_CTR0(sc->vm, vcpu, "Injecting vNMI"); + } + } + + extint_pending = vm_extint_pending(sc->vm, vcpu); + if (!extint_pending) { + if (!vlapic_pending_intr(vlapic, &vector)) + goto done; + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(sc->vm, &vector); + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } + + /* + * If the guest has disabled interrupts or is in an interrupt shadow + * then we cannot inject the pending interrupt. + */ + if ((state->rflags & PSL_I) == 0) { + VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, state->rflags); + need_intr_window = 1; + goto done; + } + + if (ctrl->intr_shadow) { + VCPU_CTR1(sc->vm, vcpu, "Cannot inject vector %d due to " + "interrupt shadow", vector); + need_intr_window = 1; + goto done; + } + + if (ctrl->eventinj & VMCB_EVENTINJ_VALID) { + VCPU_CTR2(sc->vm, vcpu, "Cannot inject vector %d due to " + "eventinj %#lx", vector, ctrl->eventinj); + need_intr_window = 1; + goto done; + } + + svm_eventinject(sc, vcpu, VMCB_EVENTINJ_TYPE_INTR, vector, 0, false); + + if (!extint_pending) { + vlapic_intr_accepted(vlapic, vector); + } else { + vm_extint_clear(sc->vm, vcpu); + vatpic_intr_accepted(sc->vm, vector); + } + + /* + * Force a VM-exit as soon as the vcpu is ready to accept another + * interrupt. This is done because the PIC might have another vector + * that it wants to inject. Also, if the APIC has a pending interrupt + * that was preempted by the ExtInt then it allows us to inject the + * APIC vector as soon as possible. + */ + need_intr_window = 1; +done: + /* + * The guest can modify the TPR by writing to %CR8. In guest mode + * the processor reflects this write to V_TPR without hypervisor + * intervention. + * + * The guest can also modify the TPR by writing to it via the memory + * mapped APIC page. In this case, the write will be emulated by the + * hypervisor. For this reason V_TPR must be updated before every + * VMRUN. + */ + v_tpr = vlapic_get_cr8(vlapic); + KASSERT(v_tpr <= 15, ("invalid v_tpr %#x", v_tpr)); + if (ctrl->v_tpr != v_tpr) { + VCPU_CTR2(sc->vm, vcpu, "VMCB V_TPR changed from %#x to %#x", + ctrl->v_tpr, v_tpr); + ctrl->v_tpr = v_tpr; + svm_set_dirty(sc, vcpu, VMCB_CACHE_TPR); + } + + if (need_intr_window) { + /* + * We use V_IRQ in conjunction with the VINTR intercept to + * trap into the hypervisor as soon as a virtual interrupt + * can be delivered. 
+ * + * Since injected events are not subject to intercept checks + * we need to ensure that the V_IRQ is not actually going to + * be delivered on VM entry. The KASSERT below enforces this. + */ + KASSERT((ctrl->eventinj & VMCB_EVENTINJ_VALID) != 0 || + (state->rflags & PSL_I) == 0 || ctrl->intr_shadow, + ("Bogus intr_window_exiting: eventinj (%#lx), " + "intr_shadow (%u), rflags (%#lx)", + ctrl->eventinj, ctrl->intr_shadow, state->rflags)); + enable_intr_window_exiting(sc, vcpu); + } else { + disable_intr_window_exiting(sc, vcpu); + } +} + +static __inline void +restore_host_tss(void) +{ +#ifdef __FreeBSD__ + struct system_segment_descriptor *tss_sd; + + /* + * The TSS descriptor was in use prior to launching the guest so it + * has been marked busy. + * + * 'ltr' requires the descriptor to be marked available so change the + * type to "64-bit available TSS". + */ + tss_sd = PCPU_GET(tss); + tss_sd->sd_type = SDT_SYSTSS; + ltr(GSEL(GPROC0_SEL, SEL_KPL)); +#else + system_desc_t *tss = (system_desc_t *)&CPU->cpu_gdt[GDT_KTSS]; + + tss->ssd_type = SDT_SYSTSS; + wr_tsr(KTSS_SEL); +#endif +} + +#ifdef __FreeBSD__ +static void +check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) +{ + struct svm_vcpu *vcpustate; + struct vmcb_ctrl *ctrl; + long eptgen; + bool alloc_asid; + + KASSERT(CPU_ISSET(thiscpu, &pmap->pm_active), ("%s: nested pmap not " + "active on cpu %u", __func__, thiscpu)); + + vcpustate = svm_get_vcpu(sc, vcpuid); + ctrl = svm_get_vmcb_ctrl(sc, vcpuid); + + /* + * The TLB entries associated with the vcpu's ASID are not valid + * if either of the following conditions is true: + * + * 1. The vcpu's ASID generation is different than the host cpu's + * ASID generation. This happens when the vcpu migrates to a new + * host cpu. It can also happen when the number of vcpus executing + * on a host cpu is greater than the number of ASIDs available. + * + * 2. The pmap generation number is different than the value cached in + * the 'vcpustate'. This happens when the host invalidates pages + * belonging to the guest. + * + * asidgen eptgen Action + * mismatch mismatch + * 0 0 (a) + * 0 1 (b1) or (b2) + * 1 0 (c) + * 1 1 (d) + * + * (a) There is no mismatch in eptgen or ASID generation and therefore + * no further action is needed. + * + * (b1) If the cpu supports FlushByAsid then the vcpu's ASID is + * retained and the TLB entries associated with this ASID + * are flushed by VMRUN. + * + * (b2) If the cpu does not support FlushByAsid then a new ASID is + * allocated. + * + * (c) A new ASID is allocated. + * + * (d) A new ASID is allocated. + */ + + alloc_asid = false; + eptgen = pmap->pm_eptgen; + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_NOTHING; + + if (vcpustate->asid.gen != asid[thiscpu].gen) { + alloc_asid = true; /* (c) and (d) */ + } else if (vcpustate->eptgen != eptgen) { + if (flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; /* (b1) */ + else + alloc_asid = true; /* (b2) */ + } else { + /* + * This is the common case (a). + */ + KASSERT(!alloc_asid, ("ASID allocation not necessary")); + KASSERT(ctrl->tlb_ctrl == VMCB_TLB_FLUSH_NOTHING, + ("Invalid VMCB tlb_ctrl: %#x", ctrl->tlb_ctrl)); + } + + if (alloc_asid) { + if (++asid[thiscpu].num >= nasid) { + asid[thiscpu].num = 1; + if (++asid[thiscpu].gen == 0) + asid[thiscpu].gen = 1; + /* + * If this cpu does not support "flush-by-asid" + * then flush the entire TLB on a generation + * bump. Subsequent ASID allocation in this + * generation can be done without a TLB flush. 
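+ *
+ * For example, with a hypothetical nasid of 8 the per-cpu allocator
+ * hands out {gen 1: ASIDs 1..7}, rolls over to {gen 2: ASIDs 1..7},
+ * and so on; ASID 0 is never allocated since it names the host.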
+ */ + if (!flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_ALL; + } + vcpustate->asid.gen = asid[thiscpu].gen; + vcpustate->asid.num = asid[thiscpu].num; + + ctrl->asid = vcpustate->asid.num; + svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); + /* + * If this cpu supports "flush-by-asid" then the TLB + * was not flushed after the generation bump. The TLB + * is flushed selectively after every new ASID allocation. + */ + if (flush_by_asid()) + ctrl->tlb_ctrl = VMCB_TLB_FLUSH_GUEST; + } + vcpustate->eptgen = eptgen; + + KASSERT(ctrl->asid != 0, ("Guest ASID must be non-zero")); + KASSERT(ctrl->asid == vcpustate->asid.num, + ("ASID mismatch: %u/%u", ctrl->asid, vcpustate->asid.num)); +} +#else /* __FreeBSD__ */ +static void +check_asid(struct svm_softc *sc, int vcpuid, pmap_t pmap, u_int thiscpu) +{ + struct svm_vcpu *vcpustate = svm_get_vcpu(sc, vcpuid); + struct vmcb_ctrl *ctrl = svm_get_vmcb_ctrl(sc, vcpuid); + long eptgen; + uint8_t flush; + + eptgen = pmap->pm_eptgen; + flush = hma_svm_asid_update(&vcpustate->hma_asid, flush_by_asid(), + vcpustate->eptgen == eptgen); + + if (flush != VMCB_TLB_FLUSH_NOTHING) { + ctrl->asid = vcpustate->hma_asid.hsa_asid; + svm_set_dirty(sc, vcpuid, VMCB_CACHE_ASID); + } + ctrl->tlb_ctrl = flush; + vcpustate->eptgen = eptgen; +} +#endif /* __FreeBSD__ */ + +static __inline void +disable_gintr(void) +{ + + __asm __volatile("clgi"); +} + +static __inline void +enable_gintr(void) +{ + + __asm __volatile("stgi"); +} + +static __inline void +svm_dr_enter_guest(struct svm_regctx *gctx) +{ + + /* Save host control debug registers. */ + gctx->host_dr7 = rdr7(); + gctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); + + /* + * Disable debugging in DR7 and DEBUGCTL to avoid triggering + * exceptions in the host based on the guest DRx values. The + * guest DR6, DR7, and DEBUGCTL are saved/restored in the + * VMCB. + */ + load_dr7(0); + wrmsr(MSR_DEBUGCTLMSR, 0); + + /* Save host debug registers. */ + gctx->host_dr0 = rdr0(); + gctx->host_dr1 = rdr1(); + gctx->host_dr2 = rdr2(); + gctx->host_dr3 = rdr3(); + gctx->host_dr6 = rdr6(); + + /* Restore guest debug registers. */ + load_dr0(gctx->sctx_dr0); + load_dr1(gctx->sctx_dr1); + load_dr2(gctx->sctx_dr2); + load_dr3(gctx->sctx_dr3); +} + +static __inline void +svm_dr_leave_guest(struct svm_regctx *gctx) +{ + + /* Save guest debug registers. */ + gctx->sctx_dr0 = rdr0(); + gctx->sctx_dr1 = rdr1(); + gctx->sctx_dr2 = rdr2(); + gctx->sctx_dr3 = rdr3(); + + /* + * Restore host debug registers. Restore DR7 and DEBUGCTL + * last. + */ + load_dr0(gctx->host_dr0); + load_dr1(gctx->host_dr1); + load_dr2(gctx->host_dr2); + load_dr3(gctx->host_dr3); + load_dr6(gctx->host_dr6); + wrmsr(MSR_DEBUGCTLMSR, gctx->host_debugctl); + load_dr7(gctx->host_dr7); +} + +/* + * Start vcpu with specified RIP. 
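+ *
+ * In outline (illumos ordering shown): inject pending events, disable
+ * global interrupts with CLGI, bail out on any pending suspend, yield
+ * or debug request, refresh the ASID and VMCB clean bits, svm_launch(),
+ * restore the host TSS and LDT, re-enable interrupts with STGI, and
+ * dispatch the #VMEXIT; the loop ends once an exit must be completed
+ * in userland.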
+ */
+static int
+svm_vmrun(void *arg, int vcpu, register_t rip, pmap_t pmap,
+    struct vm_eventinfo *evinfo)
+{
+	struct svm_regctx *gctx;
+	struct svm_softc *svm_sc;
+	struct svm_vcpu *vcpustate;
+	struct vmcb_state *state;
+	struct vmcb_ctrl *ctrl;
+	struct vm_exit *vmexit;
+	struct vlapic *vlapic;
+	struct vm *vm;
+	uint64_t vmcb_pa;
+	int handled;
+	uint16_t ldt_sel;
+
+	svm_sc = arg;
+	vm = svm_sc->vm;
+
+	vcpustate = svm_get_vcpu(svm_sc, vcpu);
+	state = svm_get_vmcb_state(svm_sc, vcpu);
+	ctrl = svm_get_vmcb_ctrl(svm_sc, vcpu);
+	vmexit = vm_exitinfo(vm, vcpu);
+	vlapic = vm_lapic(vm, vcpu);
+
+	gctx = svm_get_guest_regctx(svm_sc, vcpu);
+	vmcb_pa = svm_sc->vcpu[vcpu].vmcb_pa;
+
+	if (vcpustate->lastcpu != curcpu) {
+		/*
+		 * Force new ASID allocation by invalidating the generation.
+		 */
+#ifdef __FreeBSD__
+		vcpustate->asid.gen = 0;
+#else
+		vcpustate->hma_asid.hsa_gen = 0;
+#endif
+
+		/*
+		 * Invalidate the VMCB state cache by marking all fields dirty.
+		 */
+		svm_set_dirty(svm_sc, vcpu, 0xffffffff);
+
+		/*
+		 * XXX
+		 * Setting 'vcpustate->lastcpu' here is a bit premature because
+		 * we may return from this function without actually executing
+		 * the VMRUN instruction. This could happen if an AST or yield
+		 * condition is pending on the first time through the loop.
+		 *
+		 * This works for now but any new side-effects of vcpu
+		 * migration should take this case into account.
+		 */
+		vcpustate->lastcpu = curcpu;
+		vmm_stat_incr(vm, vcpu, VCPU_MIGRATIONS, 1);
+	}
+
+	svm_msr_guest_enter(svm_sc, vcpu);
+
+#ifndef __FreeBSD__
+	VERIFY(!vcpustate->loaded && curthread->t_preempt != 0);
+	vcpustate->loaded = B_TRUE;
+#endif
+
+	/* Update Guest RIP */
+	state->rip = rip;
+
+	do {
+#ifndef __FreeBSD__
+		/*
+		 * Interrupt injection may involve taking mutexes which, on
+		 * illumos bhyve, are blocking rather than spinning. Doing so
+		 * with global interrupts disabled is a recipe for deadlock,
+		 * so it is performed here.
+		 */
+		svm_inj_interrupts(svm_sc, vcpu, vlapic);
+#endif
+
+		/*
+		 * Disable global interrupts to guarantee atomicity during
+		 * loading of guest state. This includes not only the state
+		 * loaded by the "vmrun" instruction but also software state
+		 * maintained by the hypervisor: suspended and rendezvous
+		 * state, NPT generation number, vlapic interrupts etc.
+		 */
+		disable_gintr();
+
+		if (vcpu_suspended(evinfo)) {
+			enable_gintr();
+			vm_exit_suspended(vm, vcpu, state->rip);
+			break;
+		}
+
+		if (vcpu_runblocked(evinfo)) {
+			enable_gintr();
+			vm_exit_runblock(vm, vcpu, state->rip);
+			break;
+		}
+
+		if (vcpu_reqidle(evinfo)) {
+			enable_gintr();
+			vm_exit_reqidle(vm, vcpu, state->rip);
+			break;
+		}
+
+		/* The scheduler has asked us to give up the cpu. */
+		if (vcpu_should_yield(vm, vcpu)) {
+			enable_gintr();
+			vm_exit_astpending(vm, vcpu, state->rip);
+			break;
+		}
+
+		if (vcpu_debugged(vm, vcpu)) {
+			enable_gintr();
+			vm_exit_debug(vm, vcpu, state->rip);
+			break;
+		}
+
+		/*
+		 * #VMEXIT resumes the host with the guest LDTR, so
+		 * save the current LDT selector so it can be restored
+		 * after an exit. The userspace hypervisor probably
+		 * doesn't use an LDT, but save and restore it to be
+		 * safe.
+		 */
+		ldt_sel = sldt();
+
+#ifdef __FreeBSD__
+		svm_inj_interrupts(svm_sc, vcpu, vlapic);
+#endif
+
+		/* Activate the nested pmap on 'curcpu' */
+		CPU_SET_ATOMIC_ACQ(curcpu, &pmap->pm_active);
+
+		/*
+		 * Check the pmap generation and the ASID generation to
+		 * ensure that the vcpu does not use stale TLB mappings.
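+		 * The clean-bits computation just below is the flip side of
+		 * svm_set_dirty(): any VMCB field group modified since the
+		 * last VMRUN must have its clean bit cleared, otherwise the
+		 * CPU is free to keep using its cached copy of that state.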
+ */ + check_asid(svm_sc, vcpu, pmap, curcpu); + + ctrl->vmcb_clean = vmcb_clean & ~vcpustate->dirty; + vcpustate->dirty = 0; + VCPU_CTR1(vm, vcpu, "vmcb clean %#x", ctrl->vmcb_clean); + + /* Launch Virtual Machine. */ + VCPU_CTR1(vm, vcpu, "Resume execution at %#lx", state->rip); + svm_dr_enter_guest(gctx); +#ifdef __FreeBSD__ + svm_launch(vmcb_pa, gctx, &__pcpu[curcpu]); +#else + svm_launch(vmcb_pa, gctx, CPU); +#endif + svm_dr_leave_guest(gctx); + + CPU_CLR_ATOMIC(curcpu, &pmap->pm_active); + + /* + * The host GDTR and IDTR is saved by VMRUN and restored + * automatically on #VMEXIT. However, the host TSS needs + * to be restored explicitly. + */ + restore_host_tss(); + + /* Restore host LDTR. */ + lldt(ldt_sel); + + /* #VMEXIT disables interrupts so re-enable them here. */ + enable_gintr(); + + /* Update 'nextrip' */ + vcpustate->nextrip = state->rip; + + /* Handle #VMEXIT and if required return to user space. */ + handled = svm_vmexit(svm_sc, vcpu, vmexit); + } while (handled); + + svm_msr_guest_exit(svm_sc, vcpu); + +#ifndef __FreeBSD__ + VERIFY(vcpustate->loaded && curthread->t_preempt != 0); + vcpustate->loaded = B_FALSE; +#endif + + return (0); +} + +static void +svm_vmcleanup(void *arg) +{ + struct svm_softc *sc = arg; + + contigfree(sc->iopm_bitmap, SVM_IO_BITMAP_SIZE, M_SVM); + contigfree(sc->msr_bitmap, SVM_MSR_BITMAP_SIZE, M_SVM); + free(sc, M_SVM); +} + +static register_t * +swctx_regptr(struct svm_regctx *regctx, int reg) +{ + + switch (reg) { + case VM_REG_GUEST_RBX: + return (®ctx->sctx_rbx); + case VM_REG_GUEST_RCX: + return (®ctx->sctx_rcx); + case VM_REG_GUEST_RDX: + return (®ctx->sctx_rdx); + case VM_REG_GUEST_RDI: + return (®ctx->sctx_rdi); + case VM_REG_GUEST_RSI: + return (®ctx->sctx_rsi); + case VM_REG_GUEST_RBP: + return (®ctx->sctx_rbp); + case VM_REG_GUEST_R8: + return (®ctx->sctx_r8); + case VM_REG_GUEST_R9: + return (®ctx->sctx_r9); + case VM_REG_GUEST_R10: + return (®ctx->sctx_r10); + case VM_REG_GUEST_R11: + return (®ctx->sctx_r11); + case VM_REG_GUEST_R12: + return (®ctx->sctx_r12); + case VM_REG_GUEST_R13: + return (®ctx->sctx_r13); + case VM_REG_GUEST_R14: + return (®ctx->sctx_r14); + case VM_REG_GUEST_R15: + return (®ctx->sctx_r15); + case VM_REG_GUEST_DR0: + return (®ctx->sctx_dr0); + case VM_REG_GUEST_DR1: + return (®ctx->sctx_dr1); + case VM_REG_GUEST_DR2: + return (®ctx->sctx_dr2); + case VM_REG_GUEST_DR3: + return (®ctx->sctx_dr3); + default: + return (NULL); + } +} + +static int +svm_getreg(void *arg, int vcpu, int ident, uint64_t *val) +{ + struct svm_softc *svm_sc; + register_t *reg; + + svm_sc = arg; + + if (ident == VM_REG_GUEST_INTR_SHADOW) { + return (svm_get_intr_shadow(svm_sc, vcpu, val)); + } + + if (vmcb_read(svm_sc, vcpu, ident, val) == 0) { + return (0); + } + + reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); + + if (reg != NULL) { + *val = *reg; + return (0); + } + + VCPU_CTR1(svm_sc->vm, vcpu, "svm_getreg: unknown register %#x", ident); + return (EINVAL); +} + +static int +svm_setreg(void *arg, int vcpu, int ident, uint64_t val) +{ + struct svm_softc *svm_sc; + register_t *reg; + + svm_sc = arg; + + if (ident == VM_REG_GUEST_INTR_SHADOW) { + return (svm_modify_intr_shadow(svm_sc, vcpu, val)); + } + + if (vmcb_write(svm_sc, vcpu, ident, val) == 0) { + return (0); + } + + reg = swctx_regptr(svm_get_guest_regctx(svm_sc, vcpu), ident); + + if (reg != NULL) { + *reg = val; + return (0); + } + + /* + * XXX deal with CR3 and invalidate TLB entries tagged with the + * vcpu's ASID. 
This needs to be treated differently depending on + * whether 'running' is true/false. + */ + + VCPU_CTR1(svm_sc->vm, vcpu, "svm_setreg: unknown register %#x", ident); + return (EINVAL); +} + +static int +svm_setcap(void *arg, int vcpu, int type, int val) +{ + struct svm_softc *sc; + int error; + + sc = arg; + error = 0; + switch (type) { + case VM_CAP_HALT_EXIT: + svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_HLT, val); + break; + case VM_CAP_PAUSE_EXIT: + svm_set_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_PAUSE, val); + break; + case VM_CAP_UNRESTRICTED_GUEST: + /* Unrestricted guest execution cannot be disabled in SVM */ + if (val == 0) + error = EINVAL; + break; + default: + error = ENOENT; + break; + } + return (error); +} + +static int +svm_getcap(void *arg, int vcpu, int type, int *retval) +{ + struct svm_softc *sc; + int error; + + sc = arg; + error = 0; + + switch (type) { + case VM_CAP_HALT_EXIT: + *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_HLT); + break; + case VM_CAP_PAUSE_EXIT: + *retval = svm_get_intercept(sc, vcpu, VMCB_CTRL1_INTCPT, + VMCB_INTCPT_PAUSE); + break; + case VM_CAP_UNRESTRICTED_GUEST: + *retval = 1; /* unrestricted guest is always enabled */ + break; + default: + error = ENOENT; + break; + } + return (error); +} + +static struct vlapic * +svm_vlapic_init(void *arg, int vcpuid) +{ + struct svm_softc *svm_sc; + struct vlapic *vlapic; + + svm_sc = arg; + vlapic = malloc(sizeof(struct vlapic), M_SVM_VLAPIC, M_WAITOK | M_ZERO); + vlapic->vm = svm_sc->vm; + vlapic->vcpuid = vcpuid; + vlapic->apic_page = (struct LAPIC *)&svm_sc->apic_page[vcpuid]; + + vlapic_init(vlapic); + + return (vlapic); +} + +static void +svm_vlapic_cleanup(void *arg, struct vlapic *vlapic) +{ + + vlapic_cleanup(vlapic); + free(vlapic, M_SVM_VLAPIC); +} + +#ifndef __FreeBSD__ +static void +svm_savectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_exit(sc, vcpu); + } +} + +static void +svm_restorectx(void *arg, int vcpu) +{ + struct svm_softc *sc = arg; + + if (sc->vcpu[vcpu].loaded) { + svm_msr_guest_enter(sc, vcpu); + } +} +#endif /* __FreeBSD__ */ + +struct vmm_ops vmm_ops_amd = { + svm_init, + svm_cleanup, + svm_restore, + svm_vminit, + svm_vmrun, + svm_vmcleanup, + svm_getreg, + svm_setreg, + vmcb_getdesc, + vmcb_setdesc, + svm_getcap, + svm_setcap, + svm_npt_alloc, + svm_npt_free, + svm_vlapic_init, + svm_vlapic_cleanup, + +#ifndef __FreeBSD__ + svm_savectx, + svm_restorectx, +#endif +}; diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm.h b/usr/src/uts/i86pc/io/vmm/amd/svm.h new file mode 100644 index 0000000000..c78f7eb067 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm.h @@ -0,0 +1,74 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_H_ +#define _SVM_H_ + +/* + * Guest register state that is saved outside the VMCB. + */ +struct svm_regctx { + register_t sctx_rbp; + register_t sctx_rbx; + register_t sctx_rcx; + register_t sctx_rdx; + register_t sctx_rdi; + register_t sctx_rsi; + register_t sctx_r8; + register_t sctx_r9; + register_t sctx_r10; + register_t sctx_r11; + register_t sctx_r12; + register_t sctx_r13; + register_t sctx_r14; + register_t sctx_r15; + register_t sctx_dr0; + register_t sctx_dr1; + register_t sctx_dr2; + register_t sctx_dr3; + + register_t host_dr0; + register_t host_dr1; + register_t host_dr2; + register_t host_dr3; + register_t host_dr6; + register_t host_dr7; + uint64_t host_debugctl; +}; + +#ifdef __FreeBSD__ +struct pcpu; +void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct pcpu *pcpu); +#else +struct cpu; +void svm_launch(uint64_t pa, struct svm_regctx *gctx, struct cpu *pcpu); +#endif + +#endif /* _SVM_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c new file mode 100644 index 0000000000..0c1ce0e4e0 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.c @@ -0,0 +1,199 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/errno.h> +#include <sys/systm.h> + +#include <machine/cpufunc.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> + +#include "svm.h" +#include "vmcb.h" +#include "svm_softc.h" +#include "svm_msr.h" + +#ifndef MSR_AMDK8_IPM +#define MSR_AMDK8_IPM 0xc0010055 +#endif + +enum { + IDX_MSR_LSTAR, + IDX_MSR_CSTAR, + IDX_MSR_STAR, + IDX_MSR_SF_MASK, + HOST_MSR_NUM /* must be the last enumeration */ +}; + +#ifdef __FreeBSD__ +static uint64_t host_msrs[HOST_MSR_NUM]; + +void +svm_msr_init(void) +{ + /* + * It is safe to cache the values of the following MSRs because they + * don't change based on curcpu, curproc or curthread. + */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +} +#else + +CTASSERT(HOST_MSR_NUM == SVM_HOST_MSR_NUM); + +void +svm_msr_init(void) +{ + /* + * These MSRs do vary between CPUs on illumos, so saving system-wide + * values for them serves no purpose. + */ +} +#endif /* __FreeBSD__ */ + +void +svm_msr_guest_init(struct svm_softc *sc, int vcpu) +{ + /* + * All the MSRs accessible to the guest are either saved/restored by + * hardware on every #VMEXIT/VMRUN (e.g., G_PAT) or are saved/restored + * by VMSAVE/VMLOAD (e.g., MSR_GSBASE). + * + * There are no guest MSRs that are saved/restored "by hand" so nothing + * more to do here. + */ + return; +} + +void +svm_msr_guest_enter(struct svm_softc *sc, int vcpu) +{ + /* + * Save host MSRs (if any) and restore guest MSRs (if any). + */ +#ifndef __FreeBSD__ + uint64_t *host_msrs = sc->host_msrs[vcpu]; + + /* Save host MSRs */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +#endif /* __FreeBSD__ */ +} + +void +svm_msr_guest_exit(struct svm_softc *sc, int vcpu) +{ +#ifndef __FreeBSD__ + uint64_t *host_msrs = sc->host_msrs[vcpu]; +#endif + /* + * Save guest MSRs (if any) and restore host MSRs. + */ + wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]); + wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]); + wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]); + wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); + + /* MSR_KGSBASE will be restored on the way back to userspace */ +} + +int +svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, + bool *retu) +{ + int error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + *result = 0; + break; + case MSR_MTRRcap: + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_SYSCFG: + case MSR_AMDK8_IPM: + case MSR_EXTFEATURES: + *result = 0; + break; + default: + error = EINVAL; + break; + } + + return (error); +} + +int +svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, bool *retu) +{ + int error = 0; + + switch (num) { + case MSR_MCG_CAP: + case MSR_MCG_STATUS: + break; /* ignore writes */ + case MSR_MTRRcap: + vm_inject_gp(sc->vm, vcpu); + break; + case MSR_MTRRdefType: + case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8: + case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1: + case MSR_MTRR64kBase: + case MSR_SYSCFG: + break; /* Ignore writes */ + case MSR_AMDK8_IPM: + /* + * Ignore writes to the "Interrupt Pending Message" MSR. 
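+	 * On real hardware IPM is a power-management hint register; the
+	 * virtual CPU carries no equivalent state, so the write is
+	 * simply dropped rather than injecting #GP.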
+ */ + break; + case MSR_K8_UCODE_UPDATE: + /* + * Ignore writes to microcode update register. + */ + break; + case MSR_EXTFEATURES: + break; + default: + error = EINVAL; + break; + } + + return (error); +} diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h new file mode 100644 index 0000000000..1dba8101ab --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_msr.h @@ -0,0 +1,46 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_MSR_H_ +#define _SVM_MSR_H_ + +struct svm_softc; + +void svm_msr_init(void); +void svm_msr_guest_init(struct svm_softc *sc, int vcpu); +void svm_msr_guest_enter(struct svm_softc *sc, int vcpu); +void svm_msr_guest_exit(struct svm_softc *sc, int vcpu); + +int svm_wrmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t val, + bool *retu); +int svm_rdmsr(struct svm_softc *sc, int vcpu, u_int num, uint64_t *result, + bool *retu); + +#endif /* _SVM_MSR_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h new file mode 100644 index 0000000000..b5ac1903e7 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_softc.h @@ -0,0 +1,131 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SVM_SOFTC_H_ +#define _SVM_SOFTC_H_ + +#define SVM_IO_BITMAP_SIZE (3 * PAGE_SIZE) +#define SVM_MSR_BITMAP_SIZE (2 * PAGE_SIZE) + +#ifdef __FreeBSD__ +struct asid { + uint64_t gen; /* range is [1, ~0UL] */ + uint32_t num; /* range is [1, nasid - 1] */ +}; +#else +#include <sys/hma.h> + +/* This must match HOST_MSR_NUM in svm_msr.c (where it is CTASSERTed) */ +#define SVM_HOST_MSR_NUM 4 +#endif /* __FreeBSD__ */ + +/* + * XXX separate out 'struct vmcb' from 'svm_vcpu' to avoid wasting space + * due to VMCB alignment requirements. + */ +struct svm_vcpu { + struct vmcb vmcb; /* hardware saved vcpu context */ + struct svm_regctx swctx; /* software saved vcpu context */ + uint64_t vmcb_pa; /* VMCB physical address */ + uint64_t nextrip; /* next instruction to be executed by guest */ + int lastcpu; /* host cpu that the vcpu last ran on */ + uint32_t dirty; /* state cache bits that must be cleared */ + long eptgen; /* pmap->pm_eptgen when the vcpu last ran */ +#ifdef __FreeBSD__ + struct asid asid; +#else + hma_svm_asid_t hma_asid; + boolean_t loaded; +#endif +} __aligned(PAGE_SIZE); + +/* + * SVM softc, one per virtual machine. + */ +struct svm_softc { + uint8_t apic_page[VM_MAXCPU][PAGE_SIZE]; + struct svm_vcpu vcpu[VM_MAXCPU]; + vm_offset_t nptp; /* nested page table */ + uint8_t *iopm_bitmap; /* shared by all vcpus */ + uint8_t *msr_bitmap; /* shared by all vcpus */ + struct vm *vm; +#ifndef __FreeBSD__ + uint64_t host_msrs[VM_MAXCPU][SVM_HOST_MSR_NUM]; +#endif +}; + +CTASSERT((offsetof(struct svm_softc, nptp) & PAGE_MASK) == 0); + +static __inline struct svm_vcpu * +svm_get_vcpu(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu])); +} + +static __inline struct vmcb * +svm_get_vmcb(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb)); +} + +static __inline struct vmcb_state * +svm_get_vmcb_state(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb.state)); +} + +static __inline struct vmcb_ctrl * +svm_get_vmcb_ctrl(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].vmcb.ctrl)); +} + +static __inline struct svm_regctx * +svm_get_guest_regctx(struct svm_softc *sc, int vcpu) +{ + + return (&(sc->vcpu[vcpu].swctx)); +} + +static __inline void +svm_set_dirty(struct svm_softc *sc, int vcpu, uint32_t dirtybits) +{ + struct svm_vcpu *vcpustate; + + vcpustate = svm_get_vcpu(sc, vcpu); + + vcpustate->dirty |= dirtybits; +} + +#endif /* _SVM_SOFTC_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/svm_support.s b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s new file mode 100644 index 0000000000..fad994b09c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/svm_support.s @@ -0,0 +1,164 @@ +/*- + * Copyright (c) 2013, Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> + +#include "svm_assym.h" + +/* Porting note: This is named 'svm_support.S' upstream. */ + +#if defined(lint) + +struct svm_regctx; +struct cpu; + +/*ARGSUSED*/ +void +svm_launch(uint64_t pa, struct svm_regctx *gctx, struct cpu *cpu) +{} + +#else /* lint */ + +#define VMLOAD .byte 0x0f, 0x01, 0xda +#define VMRUN .byte 0x0f, 0x01, 0xd8 +#define VMSAVE .byte 0x0f, 0x01, 0xdb + + +/* + * Flush scratch registers to avoid lingering guest state being used for + * Spectre v1 attacks when returning from guest entry. + */ +#define SVM_GUEST_FLUSH_SCRATCH \ + xorl %edi, %edi; \ + xorl %esi, %esi; \ + xorl %edx, %edx; \ + xorl %ecx, %ecx; \ + xorl %r8d, %r8d; \ + xorl %r9d, %r9d; \ + xorl %r10d, %r10d; \ + xorl %r11d, %r11d; + +/* Stack layout (offset from %rsp) for svm_launch */ +#define SVMSTK_R15 0x00 /* callee saved %r15 */ +#define SVMSTK_R14 0x08 /* callee saved %r14 */ +#define SVMSTK_R13 0x10 /* callee saved %r13 */ +#define SVMSTK_R12 0x18 /* callee saved %r12 */ +#define SVMSTK_RBX 0x20 /* callee saved %rbx */ +#define SVMSTK_RDX 0x28 /* save-args %rdx (struct cpu *) */ +#define SVMSTK_RSI 0x30 /* save-args %rsi (struct svm_regctx *) */ +#define SVMSTK_RDI 0x38 /* save-args %rdi (uint64_t vmcb_pa) */ +#define SVMSTK_FP 0x40 /* frame pointer %rbp */ +#define SVMSTKSIZE SVMSTK_FP + +/* + * svm_launch(uint64_t vmcb, struct svm_regctx *gctx, struct pcpu *pcpu) + * %rdi: physical address of VMCB + * %rsi: pointer to guest context + * %rdx: pointer to the pcpu data + */ +ENTRY_NP(svm_launch) + pushq %rbp + movq %rsp, %rbp + subq $SVMSTKSIZE, %rsp + movq %r15, SVMSTK_R15(%rsp) + movq %r14, SVMSTK_R14(%rsp) + movq %r13, SVMSTK_R13(%rsp) + movq %r12, SVMSTK_R12(%rsp) + movq %rbx, SVMSTK_RBX(%rsp) + movq %rdx, SVMSTK_RDX(%rsp) + movq %rsi, SVMSTK_RSI(%rsp) + movq %rdi, SVMSTK_RDI(%rsp) + + /* VMLOAD and VMRUN expect the VMCB physaddr in %rax */ + movq %rdi, %rax + + /* Restore guest state. 
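+	 * Guest GPRs come from the svm_regctx captured at the last
+	 * #VMEXIT; guest %rax, %rsp and %rip live in the VMCB state and
+	 * are loaded by VMRUN itself.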
*/ + movq SCTX_R8(%rsi), %r8 + movq SCTX_R9(%rsi), %r9 + movq SCTX_R10(%rsi), %r10 + movq SCTX_R11(%rsi), %r11 + movq SCTX_R12(%rsi), %r12 + movq SCTX_R13(%rsi), %r13 + movq SCTX_R14(%rsi), %r14 + movq SCTX_R15(%rsi), %r15 + movq SCTX_RBP(%rsi), %rbp + movq SCTX_RBX(%rsi), %rbx + movq SCTX_RCX(%rsi), %rcx + movq SCTX_RDX(%rsi), %rdx + movq SCTX_RDI(%rsi), %rdi + movq SCTX_RSI(%rsi), %rsi /* %rsi must be restored last */ + + VMLOAD + VMRUN + VMSAVE + + /* Grab the svm_regctx pointer */ + movq SVMSTK_RSI(%rsp), %rax + + /* Save guest state. */ + movq %r8, SCTX_R8(%rax) + movq %r9, SCTX_R9(%rax) + movq %r10, SCTX_R10(%rax) + movq %r11, SCTX_R11(%rax) + movq %r12, SCTX_R12(%rax) + movq %r13, SCTX_R13(%rax) + movq %r14, SCTX_R14(%rax) + movq %r15, SCTX_R15(%rax) + movq %rbp, SCTX_RBP(%rax) + movq %rbx, SCTX_RBX(%rax) + movq %rcx, SCTX_RCX(%rax) + movq %rdx, SCTX_RDX(%rax) + movq %rdi, SCTX_RDI(%rax) + movq %rsi, SCTX_RSI(%rax) + + /* Restore callee-saved registers */ + movq SVMSTK_R15(%rsp), %r15 + movq SVMSTK_R14(%rsp), %r14 + movq SVMSTK_R13(%rsp), %r13 + movq SVMSTK_R12(%rsp), %r12 + movq SVMSTK_RBX(%rsp), %rbx + + /* Fix %gsbase to point back to the correct 'struct cpu *' */ + movq SVMSTK_RDX(%rsp), %rdx + movl %edx, %eax + shrq $32, %rdx + movl $MSR_GSBASE, %ecx + wrmsr + + SVM_GUEST_FLUSH_SCRATCH + + addq $SVMSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(svm_launch) + +#endif /* lint */ diff --git a/usr/src/uts/i86pc/io/vmm/amd/vmcb.c b/usr/src/uts/i86pc/io/vmm/amd/vmcb.c new file mode 100644 index 0000000000..5075b69867 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/vmcb.c @@ -0,0 +1,454 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> + +#include <machine/segments.h> +#include <machine/specialreg.h> +#include <machine/vmm.h> + +#include "vmm_ktr.h" + +#include "vmcb.h" +#include "svm.h" +#include "svm_softc.h" + +/* + * The VMCB aka Virtual Machine Control Block is a 4KB aligned page + * in memory that describes the virtual machine. 
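+ * Its physical address, not a virtual mapping, is what the VMRUN,
+ * VMLOAD and VMSAVE instructions consume (passed in %rax).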
+ * + * The VMCB contains: + * - instructions or events in the guest to intercept + * - control bits that modify execution environment of the guest + * - guest processor state (e.g. general purpose registers) + */ + +/* + * Return VMCB segment area. + */ +static struct vmcb_segment * +vmcb_segptr(struct vmcb *vmcb, int type) +{ + struct vmcb_state *state; + struct vmcb_segment *seg; + + state = &vmcb->state; + + switch (type) { + case VM_REG_GUEST_CS: + seg = &state->cs; + break; + + case VM_REG_GUEST_DS: + seg = &state->ds; + break; + + case VM_REG_GUEST_ES: + seg = &state->es; + break; + + case VM_REG_GUEST_FS: + seg = &state->fs; + break; + + case VM_REG_GUEST_GS: + seg = &state->gs; + break; + + case VM_REG_GUEST_SS: + seg = &state->ss; + break; + + case VM_REG_GUEST_GDTR: + seg = &state->gdt; + break; + + case VM_REG_GUEST_IDTR: + seg = &state->idt; + break; + + case VM_REG_GUEST_LDTR: + seg = &state->ldt; + break; + + case VM_REG_GUEST_TR: + seg = &state->tr; + break; + + default: + seg = NULL; + break; + } + + return (seg); +} + +static int +vmcb_access(struct svm_softc *softc, int vcpu, int write, int ident, + uint64_t *val) +{ + struct vmcb *vmcb; + int off, bytes; + char *ptr; + + vmcb = svm_get_vmcb(softc, vcpu); + off = VMCB_ACCESS_OFFSET(ident); + bytes = VMCB_ACCESS_BYTES(ident); + + if ((off + bytes) >= sizeof (struct vmcb)) + return (EINVAL); + + ptr = (char *)vmcb; + + if (!write) + *val = 0; + + switch (bytes) { + case 8: + case 4: + case 2: + if (write) + memcpy(ptr + off, val, bytes); + else + memcpy(val, ptr + off, bytes); + break; + default: + VCPU_CTR1(softc->vm, vcpu, + "Invalid size %d for VMCB access: %d", bytes); + return (EINVAL); + } + + /* Invalidate all VMCB state cached by h/w. */ + if (write) + svm_set_dirty(softc, vcpu, 0xffffffff); + + return (0); +} + +/* + * Read from segment selector, control and general purpose register of VMCB. + */ +int +vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_segment *seg; + int err; + + vmcb = svm_get_vmcb(sc, vcpu); + state = &vmcb->state; + err = 0; + + if (VMCB_ACCESS_OK(ident)) + return (vmcb_access(sc, vcpu, 0, ident, retval)); + + switch (ident) { + case VM_REG_GUEST_CR0: + *retval = state->cr0; + break; + + case VM_REG_GUEST_CR2: + *retval = state->cr2; + break; + + case VM_REG_GUEST_CR3: + *retval = state->cr3; + break; + + case VM_REG_GUEST_CR4: + *retval = state->cr4; + break; + + case VM_REG_GUEST_DR6: + *retval = state->dr6; + break; + + case VM_REG_GUEST_DR7: + *retval = state->dr7; + break; + + case VM_REG_GUEST_EFER: + *retval = state->efer; + break; + + case VM_REG_GUEST_RAX: + *retval = state->rax; + break; + + case VM_REG_GUEST_RFLAGS: + *retval = state->rflags; + break; + + case VM_REG_GUEST_RIP: + *retval = state->rip; + break; + + case VM_REG_GUEST_RSP: + *retval = state->rsp; + break; + + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_SS: + case VM_REG_GUEST_LDTR: + case VM_REG_GUEST_TR: + seg = vmcb_segptr(vmcb, ident); + KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", + __func__, ident)); + *retval = seg->selector; + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + /* GDTR and IDTR don't have segment selectors */ + err = EINVAL; + break; + default: + err = EINVAL; + break; + } + + return (err); +} + +/* + * Write to segment selector, control and general purpose register of VMCB. 
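+ * Writes to hardware-cacheable fields also mark the matching VMCB
+ * clean-bits (VMCB_CACHE_*) dirty so the processor reloads that state
+ * on the next VMRUN.  A typical call (illustrative only; 'newcr3' is a
+ * hypothetical value):
+ *
+ *	error = vmcb_write(sc, vcpu, VM_REG_GUEST_CR3, newcr3);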
+ */ +int +vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val) +{ + struct vmcb *vmcb; + struct vmcb_state *state; + struct vmcb_segment *seg; + int err, dirtyseg; + + vmcb = svm_get_vmcb(sc, vcpu); + state = &vmcb->state; + dirtyseg = 0; + err = 0; + + if (VMCB_ACCESS_OK(ident)) + return (vmcb_access(sc, vcpu, 1, ident, &val)); + + switch (ident) { + case VM_REG_GUEST_CR0: + state->cr0 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_CR2: + state->cr2 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR2); + break; + + case VM_REG_GUEST_CR3: + state->cr3 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_CR4: + state->cr4 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_DR6: + state->dr6 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); + break; + + case VM_REG_GUEST_DR7: + state->dr7 = val; + svm_set_dirty(sc, vcpu, VMCB_CACHE_DR); + break; + + case VM_REG_GUEST_EFER: + /* EFER_SVM must always be set when the guest is executing */ + state->efer = val | EFER_SVM; + svm_set_dirty(sc, vcpu, VMCB_CACHE_CR); + break; + + case VM_REG_GUEST_RAX: + state->rax = val; + break; + + case VM_REG_GUEST_RFLAGS: + state->rflags = val; + break; + + case VM_REG_GUEST_RIP: + state->rip = val; + break; + + case VM_REG_GUEST_RSP: + state->rsp = val; + break; + + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_SS: + dirtyseg = 1; /* FALLTHROUGH */ + case VM_REG_GUEST_FS: + case VM_REG_GUEST_GS: + case VM_REG_GUEST_LDTR: + case VM_REG_GUEST_TR: + seg = vmcb_segptr(vmcb, ident); + KASSERT(seg != NULL, ("%s: unable to get segment %d from VMCB", + __func__, ident)); + seg->selector = val; + if (dirtyseg) + svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); + break; + + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + /* GDTR and IDTR don't have segment selectors */ + err = EINVAL; + break; + default: + err = EINVAL; + break; + } + + return (err); +} + +int +vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg2) +{ + struct vmcb_segment *seg; + + seg = vmcb_segptr(vmcb, ident); + if (seg != NULL) { + bcopy(seg, seg2, sizeof(struct vmcb_segment)); + return (0); + } else { + return (EINVAL); + } +} + +int +vmcb_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + struct vmcb_segment *seg; + uint16_t attrib; + + sc = arg; + vmcb = svm_get_vmcb(sc, vcpu); + + seg = vmcb_segptr(vmcb, reg); + KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", + __func__, reg)); + + seg->base = desc->base; + seg->limit = desc->limit; + if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { + /* + * Map seg_desc access to VMCB attribute format. + * + * SVM uses the 'P' bit in the segment attributes to indicate a + * NULL segment so clear it if the segment is marked unusable. 
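+	 * For example, the common 64-bit code-segment access value
+	 * 0xa09b (P, S, type 0xb, L, G) maps to VMCB attrib 0xa9b, while
+	 * an unusable segment ends up with P clear.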
+ */ + attrib = ((desc->access & 0xF000) >> 4) | (desc->access & 0xFF); + if (SEG_DESC_UNUSABLE(desc->access)) { + attrib &= ~0x80; + } + seg->attrib = attrib; + } + + VCPU_CTR4(sc->vm, vcpu, "Setting desc %d: base (%#lx), limit (%#x), " + "attrib (%#x)", reg, seg->base, seg->limit, seg->attrib); + + switch (reg) { + case VM_REG_GUEST_CS: + case VM_REG_GUEST_DS: + case VM_REG_GUEST_ES: + case VM_REG_GUEST_SS: + svm_set_dirty(sc, vcpu, VMCB_CACHE_SEG); + break; + case VM_REG_GUEST_GDTR: + case VM_REG_GUEST_IDTR: + svm_set_dirty(sc, vcpu, VMCB_CACHE_DT); + break; + default: + break; + } + + return (0); +} + +int +vmcb_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc) +{ + struct vmcb *vmcb; + struct svm_softc *sc; + struct vmcb_segment *seg; + + sc = arg; + vmcb = svm_get_vmcb(sc, vcpu); + seg = vmcb_segptr(vmcb, reg); + KASSERT(seg != NULL, ("%s: invalid segment descriptor %d", + __func__, reg)); + + desc->base = seg->base; + desc->limit = seg->limit; + desc->access = 0; + + if (reg != VM_REG_GUEST_GDTR && reg != VM_REG_GUEST_IDTR) { + /* Map seg_desc access to VMCB attribute format */ + desc->access = ((seg->attrib & 0xF00) << 4) | + (seg->attrib & 0xFF); + + /* + * VT-x uses bit 16 to indicate a segment that has been loaded + * with a NULL selector (aka unusable). The 'desc->access' + * field is interpreted in the VT-x format by the + * processor-independent code. + * + * SVM uses the 'P' bit to convey the same information so + * convert it into the VT-x format. For more details refer to + * section "Segment State in the VMCB" in APMv2. + */ + if (reg != VM_REG_GUEST_CS && reg != VM_REG_GUEST_TR) { + if ((desc->access & 0x80) == 0) + desc->access |= 0x10000; /* Unusable segment */ + } + } + + return (0); +} diff --git a/usr/src/uts/i86pc/io/vmm/amd/vmcb.h b/usr/src/uts/i86pc/io/vmm/amd/vmcb.h new file mode 100644 index 0000000000..ec7caa91f9 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/amd/vmcb.h @@ -0,0 +1,336 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2013 Anish Gupta (akgupt3@gmail.com) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _VMCB_H_ +#define _VMCB_H_ + +struct svm_softc; + +#define BIT(n) (1ULL << n) + +/* + * Secure Virtual Machine: AMD64 Programmer's Manual Vol2, Chapter 15 + * Layout of VMCB: AMD64 Programmer's Manual Vol2, Appendix B + */ + +/* vmcb_ctrl->intercept[] array indices */ +#define VMCB_CR_INTCPT 0 +#define VMCB_DR_INTCPT 1 +#define VMCB_EXC_INTCPT 2 +#define VMCB_CTRL1_INTCPT 3 +#define VMCB_CTRL2_INTCPT 4 + +/* intercept[VMCB_CTRL1_INTCPT] fields */ +#define VMCB_INTCPT_INTR BIT(0) +#define VMCB_INTCPT_NMI BIT(1) +#define VMCB_INTCPT_SMI BIT(2) +#define VMCB_INTCPT_INIT BIT(3) +#define VMCB_INTCPT_VINTR BIT(4) +#define VMCB_INTCPT_CR0_WRITE BIT(5) +#define VMCB_INTCPT_IDTR_READ BIT(6) +#define VMCB_INTCPT_GDTR_READ BIT(7) +#define VMCB_INTCPT_LDTR_READ BIT(8) +#define VMCB_INTCPT_TR_READ BIT(9) +#define VMCB_INTCPT_IDTR_WRITE BIT(10) +#define VMCB_INTCPT_GDTR_WRITE BIT(11) +#define VMCB_INTCPT_LDTR_WRITE BIT(12) +#define VMCB_INTCPT_TR_WRITE BIT(13) +#define VMCB_INTCPT_RDTSC BIT(14) +#define VMCB_INTCPT_RDPMC BIT(15) +#define VMCB_INTCPT_PUSHF BIT(16) +#define VMCB_INTCPT_POPF BIT(17) +#define VMCB_INTCPT_CPUID BIT(18) +#define VMCB_INTCPT_RSM BIT(19) +#define VMCB_INTCPT_IRET BIT(20) +#define VMCB_INTCPT_INTn BIT(21) +#define VMCB_INTCPT_INVD BIT(22) +#define VMCB_INTCPT_PAUSE BIT(23) +#define VMCB_INTCPT_HLT BIT(24) +#define VMCB_INTCPT_INVPG BIT(25) +#define VMCB_INTCPT_INVPGA BIT(26) +#define VMCB_INTCPT_IO BIT(27) +#define VMCB_INTCPT_MSR BIT(28) +#define VMCB_INTCPT_TASK_SWITCH BIT(29) +#define VMCB_INTCPT_FERR_FREEZE BIT(30) +#define VMCB_INTCPT_SHUTDOWN BIT(31) + +/* intercept[VMCB_CTRL2_INTCPT] fields */ +#define VMCB_INTCPT_VMRUN BIT(0) +#define VMCB_INTCPT_VMMCALL BIT(1) +#define VMCB_INTCPT_VMLOAD BIT(2) +#define VMCB_INTCPT_VMSAVE BIT(3) +#define VMCB_INTCPT_STGI BIT(4) +#define VMCB_INTCPT_CLGI BIT(5) +#define VMCB_INTCPT_SKINIT BIT(6) +#define VMCB_INTCPT_RDTSCP BIT(7) +#define VMCB_INTCPT_ICEBP BIT(8) +#define VMCB_INTCPT_WBINVD BIT(9) +#define VMCB_INTCPT_MONITOR BIT(10) +#define VMCB_INTCPT_MWAIT BIT(11) +#define VMCB_INTCPT_MWAIT_ARMED BIT(12) +#define VMCB_INTCPT_XSETBV BIT(13) + +/* VMCB TLB control */ +#define VMCB_TLB_FLUSH_NOTHING 0 /* Flush nothing */ +#define VMCB_TLB_FLUSH_ALL 1 /* Flush entire TLB */ +#define VMCB_TLB_FLUSH_GUEST 3 /* Flush all guest entries */ +#define VMCB_TLB_FLUSH_GUEST_NONGLOBAL 7 /* Flush guest non-PG entries */ + +/* VMCB state caching */ +#define VMCB_CACHE_NONE 0 /* No caching */ +#define VMCB_CACHE_I BIT(0) /* Intercept, TSC off, Pause filter */ +#define VMCB_CACHE_IOPM BIT(1) /* I/O and MSR permission */ +#define VMCB_CACHE_ASID BIT(2) /* ASID */ +#define VMCB_CACHE_TPR BIT(3) /* V_TPR to V_INTR_VECTOR */ +#define VMCB_CACHE_NP BIT(4) /* Nested Paging */ +#define VMCB_CACHE_CR BIT(5) /* CR0, CR3, CR4 & EFER */ +#define VMCB_CACHE_DR BIT(6) /* Debug registers */ +#define VMCB_CACHE_DT BIT(7) /* GDT/IDT */ +#define VMCB_CACHE_SEG BIT(8) /* User segments, CPL */ +#define VMCB_CACHE_CR2 BIT(9) /* page fault address */ +#define VMCB_CACHE_LBR BIT(10) /* Last branch */ + +/* VMCB control event injection */ +#define VMCB_EVENTINJ_EC_VALID BIT(11) /* Error Code valid */ +#define VMCB_EVENTINJ_VALID BIT(31) /* Event valid */ + +/* Event types that can be injected */ +#define VMCB_EVENTINJ_TYPE_INTR 0 +#define VMCB_EVENTINJ_TYPE_NMI 2 +#define VMCB_EVENTINJ_TYPE_EXCEPTION 3 +#define VMCB_EVENTINJ_TYPE_INTn 4 + +/* VMCB exit code, APM vol2 Appendix C */ +#define VMCB_EXIT_MC 0x52 +#define VMCB_EXIT_INTR 0x60 
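+/*
+ * The exit codes track the intercept layout: 0x40 + vector for
+ * intercepted exceptions (0x52 is #MC, vector 18), 0x60 + bit number
+ * for intercept word VMCB_CTRL1_INTCPT, and 0x80 + bit number for
+ * VMCB_CTRL2_INTCPT.
+ */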
+#define VMCB_EXIT_NMI 0x61
+#define VMCB_EXIT_VINTR 0x64
+#define VMCB_EXIT_PUSHF 0x70
+#define VMCB_EXIT_POPF 0x71
+#define VMCB_EXIT_CPUID 0x72
+#define VMCB_EXIT_IRET 0x74
+#define VMCB_EXIT_PAUSE 0x77
+#define VMCB_EXIT_HLT 0x78
+#define VMCB_EXIT_IO 0x7B
+#define VMCB_EXIT_MSR 0x7C
+#define VMCB_EXIT_SHUTDOWN 0x7F
+#define VMCB_EXIT_VMSAVE 0x83
+#define VMCB_EXIT_MONITOR 0x8A
+#define VMCB_EXIT_MWAIT 0x8B
+#define VMCB_EXIT_NPF 0x400
+#define VMCB_EXIT_INVALID -1
+
+/*
+ * Nested page fault.
+ * Bit definitions to decode EXITINFO1.
+ */
+#define VMCB_NPF_INFO1_P BIT(0) /* Nested page present. */
+#define VMCB_NPF_INFO1_W BIT(1) /* Access was write. */
+#define VMCB_NPF_INFO1_U BIT(2) /* Access was user access. */
+#define VMCB_NPF_INFO1_RSV BIT(3) /* Reserved bits present. */
+#define VMCB_NPF_INFO1_ID BIT(4) /* Code read. */
+
+#define VMCB_NPF_INFO1_GPA BIT(32) /* Guest physical address. */
+#define VMCB_NPF_INFO1_GPT BIT(33) /* Guest page table. */
+
+/*
+ * EXITINTINFO, Interrupt exit info for all intercepts.
+ * Section 15.7.2, Intercepts during IDT Interrupt Delivery.
+ */
+#define VMCB_EXITINTINFO_VECTOR(x) ((x) & 0xFF)
+#define VMCB_EXITINTINFO_TYPE(x) (((x) >> 8) & 0x7)
+#define VMCB_EXITINTINFO_EC_VALID(x) (((x) & BIT(11)) ? 1 : 0)
+#define VMCB_EXITINTINFO_VALID(x) (((x) & BIT(31)) ? 1 : 0)
+#define VMCB_EXITINTINFO_EC(x) (((x) >> 32) & 0xFFFFFFFF)
+
+/* Offset of various VMCB fields. */
+#define VMCB_OFF_CTRL(x) (x)
+#define VMCB_OFF_STATE(x) ((x) + 0x400)
+
+#define VMCB_OFF_CR_INTERCEPT VMCB_OFF_CTRL(0x0)
+#define VMCB_OFF_DR_INTERCEPT VMCB_OFF_CTRL(0x4)
+#define VMCB_OFF_EXC_INTERCEPT VMCB_OFF_CTRL(0x8)
+#define VMCB_OFF_INST1_INTERCEPT VMCB_OFF_CTRL(0xC)
+#define VMCB_OFF_INST2_INTERCEPT VMCB_OFF_CTRL(0x10)
+#define VMCB_OFF_IO_PERM VMCB_OFF_CTRL(0x40)
+#define VMCB_OFF_MSR_PERM VMCB_OFF_CTRL(0x48)
+#define VMCB_OFF_TSC_OFFSET VMCB_OFF_CTRL(0x50)
+#define VMCB_OFF_ASID VMCB_OFF_CTRL(0x58)
+#define VMCB_OFF_TLB_CTRL VMCB_OFF_CTRL(0x5C)
+#define VMCB_OFF_VIRQ VMCB_OFF_CTRL(0x60)
+#define VMCB_OFF_EXIT_REASON VMCB_OFF_CTRL(0x70)
+#define VMCB_OFF_EXITINFO1 VMCB_OFF_CTRL(0x78)
+#define VMCB_OFF_EXITINFO2 VMCB_OFF_CTRL(0x80)
+#define VMCB_OFF_EXITINTINFO VMCB_OFF_CTRL(0x88)
+#define VMCB_OFF_AVIC_BAR VMCB_OFF_CTRL(0x98)
+#define VMCB_OFF_NPT_BASE VMCB_OFF_CTRL(0xB0)
+#define VMCB_OFF_AVIC_PAGE VMCB_OFF_CTRL(0xE0)
+#define VMCB_OFF_AVIC_LT VMCB_OFF_CTRL(0xF0)
+#define VMCB_OFF_AVIC_PT VMCB_OFF_CTRL(0xF8)
+#define VMCB_OFF_SYSENTER_CS VMCB_OFF_STATE(0x228)
+#define VMCB_OFF_SYSENTER_ESP VMCB_OFF_STATE(0x230)
+#define VMCB_OFF_SYSENTER_EIP VMCB_OFF_STATE(0x238)
+#define VMCB_OFF_GUEST_PAT VMCB_OFF_STATE(0x268)
+
+/*
+ * Encode the VMCB offset and bytes that we want to read from VMCB.
+ */
+#define VMCB_ACCESS(o, w) (0x80000000 | (((w) & 0xF) << 16) | \
+ ((o) & 0xFFF))
+#define VMCB_ACCESS_OK(v) ((v) & 0x80000000)
+#define VMCB_ACCESS_BYTES(v) (((v) >> 16) & 0xF)
+#define VMCB_ACCESS_OFFSET(v) ((v) & 0xFFF)
+
+#ifdef _KERNEL
+/* VMCB save state area segment format */
+struct vmcb_segment {
+ uint16_t selector;
+ uint16_t attrib;
+ uint32_t limit;
+ uint64_t base;
+} __attribute__ ((__packed__));
+CTASSERT(sizeof(struct vmcb_segment) == 16);
+
+/* Code segment descriptor attribute in 12 bit format as saved by VMCB. */
+#define VMCB_CS_ATTRIB_L BIT(9) /* Long mode. */
+#define VMCB_CS_ATTRIB_D BIT(10) /* Operand size bit.
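+ * (when L is set, D must be clear; L=1 with D=1 is reserved)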
*/ + +/* + * The VMCB is divided into two areas - the first one contains various + * control bits including the intercept vector and the second one contains + * the guest state. + */ + +/* VMCB control area - padded up to 1024 bytes */ +struct vmcb_ctrl { + uint32_t intercept[5]; /* all intercepts */ + uint8_t pad1[0x28]; /* Offsets 0x14-0x3B are reserved. */ + uint16_t pause_filthresh; /* Offset 0x3C, PAUSE filter threshold */ + uint16_t pause_filcnt; /* Offset 0x3E, PAUSE filter count */ + uint64_t iopm_base_pa; /* 0x40: IOPM_BASE_PA */ + uint64_t msrpm_base_pa; /* 0x48: MSRPM_BASE_PA */ + uint64_t tsc_offset; /* 0x50: TSC_OFFSET */ + uint32_t asid; /* 0x58: Guest ASID */ + uint8_t tlb_ctrl; /* 0x5C: TLB_CONTROL */ + uint8_t pad2[3]; /* 0x5D-0x5F: Reserved. */ + uint8_t v_tpr; /* 0x60: V_TPR, guest CR8 */ + uint8_t v_irq:1; /* Is virtual interrupt pending? */ + uint8_t :7; /* Padding */ + uint8_t v_intr_prio:4; /* 0x62: Priority for virtual interrupt. */ + uint8_t v_ign_tpr:1; + uint8_t :3; + uint8_t v_intr_masking:1; /* Guest and host sharing of RFLAGS. */ + uint8_t :7; + uint8_t v_intr_vector; /* 0x64: Vector for virtual interrupt. */ + uint8_t pad3[3]; /* 0x65-0x67 Reserved. */ + uint64_t intr_shadow:1; /* 0x68: Interrupt shadow, section15.2.1 APM2 */ + uint64_t :63; + uint64_t exitcode; /* 0x70, Exitcode */ + uint64_t exitinfo1; /* 0x78, EXITINFO1 */ + uint64_t exitinfo2; /* 0x80, EXITINFO2 */ + uint64_t exitintinfo; /* 0x88, Interrupt exit value. */ + uint64_t np_enable:1; /* 0x90, Nested paging enable. */ + uint64_t :63; + uint8_t pad4[0x10]; /* 0x98-0xA7 reserved. */ + uint64_t eventinj; /* 0xA8, Event injection. */ + uint64_t n_cr3; /* B0, Nested page table. */ + uint64_t lbr_virt_en:1; /* Enable LBR virtualization. */ + uint64_t :63; + uint32_t vmcb_clean; /* 0xC0: VMCB clean bits for caching */ + uint32_t :32; /* 0xC4: Reserved */ + uint64_t nrip; /* 0xC8: Guest next nRIP. 
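+	 * (next sequential instruction pointer; used to advance %rip
+	 * past an intercepted instruction without decoding it)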
*/ + uint8_t inst_len; /* 0xD0: #NPF decode assist */ + uint8_t inst_bytes[15]; + uint8_t padd6[0x320]; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_ctrl) == 1024); + +struct vmcb_state { + struct vmcb_segment es; + struct vmcb_segment cs; + struct vmcb_segment ss; + struct vmcb_segment ds; + struct vmcb_segment fs; + struct vmcb_segment gs; + struct vmcb_segment gdt; + struct vmcb_segment ldt; + struct vmcb_segment idt; + struct vmcb_segment tr; + uint8_t pad1[0x2b]; /* Reserved: 0xA0-0xCA */ + uint8_t cpl; + uint8_t pad2[4]; + uint64_t efer; + uint8_t pad3[0x70]; /* Reserved: 0xd8-0x147 */ + uint64_t cr4; + uint64_t cr3; /* Guest CR3 */ + uint64_t cr0; + uint64_t dr7; + uint64_t dr6; + uint64_t rflags; + uint64_t rip; + uint8_t pad4[0x58]; /* Reserved: 0x180-0x1D7 */ + uint64_t rsp; + uint8_t pad5[0x18]; /* Reserved 0x1E0-0x1F7 */ + uint64_t rax; + uint64_t star; + uint64_t lstar; + uint64_t cstar; + uint64_t sfmask; + uint64_t kernelgsbase; + uint64_t sysenter_cs; + uint64_t sysenter_esp; + uint64_t sysenter_eip; + uint64_t cr2; + uint8_t pad6[0x20]; + uint64_t g_pat; + uint64_t dbgctl; + uint64_t br_from; + uint64_t br_to; + uint64_t int_from; + uint64_t int_to; + uint8_t pad7[0x968]; /* Reserved up to end of VMCB */ +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb_state) == 0xC00); + +struct vmcb { + struct vmcb_ctrl ctrl; + struct vmcb_state state; +} __attribute__ ((__packed__)); +CTASSERT(sizeof(struct vmcb) == PAGE_SIZE); +CTASSERT(offsetof(struct vmcb, state) == 0x400); + +int vmcb_read(struct svm_softc *sc, int vcpu, int ident, uint64_t *retval); +int vmcb_write(struct svm_softc *sc, int vcpu, int ident, uint64_t val); +int vmcb_setdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); +int vmcb_getdesc(void *arg, int vcpu, int ident, struct seg_desc *desc); +int vmcb_seg(struct vmcb *vmcb, int ident, struct vmcb_segment *seg); + +#endif /* _KERNEL */ +#endif /* _VMCB_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.c b/usr/src/uts/i86pc/io/vmm/intel/ept.c index 5ae9ed2f6a..4915537b0a 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/ept.c +++ b/usr/src/uts/i86pc/io/vmm/intel/ept.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z grehan $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -39,33 +41,35 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z grehan $"); +__FBSDID("$FreeBSD$"); +#include <sys/param.h> +#include <sys/kernel.h> #include <sys/types.h> -#include <sys/errno.h> #include <sys/systm.h> -#include <sys/malloc.h> #include <sys/smp.h> +#include <sys/sysctl.h> +#ifndef __FreeBSD__ +#include <sys/hma.h> +#endif #include <vm/vm.h> #include <vm/pmap.h> - -#include <machine/param.h> -#include <machine/cpufunc.h> -#include <machine/pmap.h> -#include <machine/vmparam.h> +#include <vm/vm_extern.h> #include <machine/vmm.h> + #include "vmx_cpufunc.h" -#include "vmx.h" #include "ept.h" +#define EPT_SUPPORTS_EXEC_ONLY(cap) ((cap) & (1UL << 0)) #define EPT_PWL4(cap) ((cap) & (1UL << 6)) #define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14)) #define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */ #define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */ -#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) #define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20)) +#define AD_BITS_SUPPORTED(cap) ((cap) & (1UL << 21)) +#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32)) #define INVVPID_ALL_TYPES_MASK 0xF0000000000UL #define INVVPID_ALL_TYPES_SUPPORTED(cap) \ @@ -75,28 +79,22 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z g #define INVEPT_ALL_TYPES_SUPPORTED(cap) \ (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK) -#define EPT_PG_RD (1 << 0) -#define EPT_PG_WR (1 << 1) -#define EPT_PG_EX (1 << 2) -#define EPT_PG_MEMORY_TYPE(x) ((x) << 3) -#define EPT_PG_IGNORE_PAT (1 << 6) -#define EPT_PG_SUPERPAGE (1 << 7) +#define EPT_PWLEVELS 4 /* page walk levels */ +#define EPT_ENABLE_AD_BITS (1 << 6) -#define EPT_ADDR_MASK ((uint64_t)-1 << 12) +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, ept, CTLFLAG_RW, NULL, NULL); -MALLOC_DECLARE(M_VMX); +static int ept_enable_ad_bits; -static uint64_t page_sizes_mask; - -/* - * Set this to 1 to have the EPT tables respect the guest PAT settings - */ -static int ept_pat_passthru; +static int ept_pmap_flags; +SYSCTL_INT(_hw_vmm_ept, OID_AUTO, pmap_flags, CTLFLAG_RD, + &ept_pmap_flags, 0, NULL); int -ept_init(void) +ept_init(int ipinum) { - int page_shift; + int use_hw_ad_bits, use_superpages, use_exec_only; uint64_t cap; cap = rdmsr(MSR_VMX_EPT_VPID_CAP); @@ -116,17 +114,24 @@ ept_init(void) !INVEPT_ALL_TYPES_SUPPORTED(cap)) return (EINVAL); - /* Set bits in 'page_sizes_mask' for each valid page size */ - page_shift = PAGE_SHIFT; - page_sizes_mask = 1UL << page_shift; /* 4KB page */ + ept_pmap_flags = ipinum & PMAP_NESTED_IPIMASK; - page_shift += 9; - if (EPT_PDE_SUPERPAGE(cap)) - page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */ + use_superpages = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_superpages", &use_superpages); + if (use_superpages && EPT_PDE_SUPERPAGE(cap)) + ept_pmap_flags |= PMAP_PDE_SUPERPAGE; /* 2MB superpage */ - page_shift += 9; - if (EPT_PDPTE_SUPERPAGE(cap)) - page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */ + use_hw_ad_bits = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_hw_ad_bits", &use_hw_ad_bits); + if (use_hw_ad_bits && AD_BITS_SUPPORTED(cap)) + ept_enable_ad_bits = 1; + else + ept_pmap_flags |= PMAP_EMULATE_AD_BITS; + + use_exec_only = 1; + TUNABLE_INT_FETCH("hw.vmm.ept.use_exec_only", &use_exec_only); + if (use_exec_only 
&& EPT_SUPPORTS_EXEC_ONLY(cap)) + ept_pmap_flags |= PMAP_SUPPORTS_EXEC_ONLY; return (0); } @@ -165,288 +170,61 @@ ept_dump(uint64_t *ptp, int nlevels) } #endif -static size_t -ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, - vm_memattr_t attr, vm_prot_t prot, boolean_t spok) -{ - int spshift, ptpshift, ptpindex, nlevels; - - /* - * Compute the size of the mapping that we can accomodate. - * - * This is based on three factors: - * - super page sizes supported by the processor - * - alignment of the region starting at 'gpa' and 'hpa' - * - length of the region 'len' - */ - spshift = PAGE_SHIFT; - if (spok) - spshift += (EPT_PWLEVELS - 1) * 9; - while (spshift >= PAGE_SHIFT) { - uint64_t spsize = 1UL << spshift; - if ((page_sizes_mask & spsize) != 0 && - (gpa & (spsize - 1)) == 0 && - (hpa & (spsize - 1)) == 0 && - length >= spsize) { - break; - } - spshift -= 9; - } - - if (spshift < PAGE_SHIFT) { - panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, " - "length 0x%016lx, page_sizes_mask 0x%016lx", - gpa, hpa, length, page_sizes_mask); - } - - nlevels = EPT_PWLEVELS; - while (--nlevels >= 0) { - ptpshift = PAGE_SHIFT + nlevels * 9; - ptpindex = (gpa >> ptpshift) & 0x1FF; - - /* We have reached the leaf mapping */ - if (spshift >= ptpshift) - break; - - /* - * We are working on a non-leaf page table page. - * - * Create the next level page table page if necessary and point - * to it from the current page table. - */ - if (ptp[ptpindex] == 0) { -#ifdef __FreeBSD__ - void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO); -#else - void *nlp = kmem_zalloc(PAGE_SIZE, KM_SLEEP); - ASSERT((((uintptr_t)nlp) & PAGE_MASK) == 0); -#endif - ptp[ptpindex] = vtophys(nlp); - ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX; - } - - /* Work our way down to the next level page table page */ -#ifdef __FreeBSD__ - ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK); -#else - ptp = (uint64_t *)hat_kpm_pfn2va(btop(ptp[ptpindex] & EPT_ADDR_MASK)); -#endif - } - - if ((gpa & ((1UL << ptpshift) - 1)) != 0) { - panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d " - "mismatch\n", gpa, ptpshift); - } - - if (prot != VM_PROT_NONE) { - /* Do the mapping */ - ptp[ptpindex] = hpa; - - /* Apply the access controls */ - if (prot & VM_PROT_READ) - ptp[ptpindex] |= EPT_PG_RD; - if (prot & VM_PROT_WRITE) - ptp[ptpindex] |= EPT_PG_WR; - if (prot & VM_PROT_EXECUTE) - ptp[ptpindex] |= EPT_PG_EX; - - /* - * By default the PAT type is ignored - this appears to - * be how other hypervisors handle EPT. Allow this to be - * overridden. 
- */ - ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr); - if (!ept_pat_passthru) - ptp[ptpindex] |= EPT_PG_IGNORE_PAT; - - if (nlevels > 0) - ptp[ptpindex] |= EPT_PG_SUPERPAGE; - } else { - /* Remove the mapping */ - ptp[ptpindex] = 0; - } - - return (1UL << ptpshift); -} - -static vm_paddr_t -ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa) -{ - int nlevels, ptpshift, ptpindex; - uint64_t ptpval, hpabase, pgmask; - - nlevels = EPT_PWLEVELS; - while (--nlevels >= 0) { - ptpshift = PAGE_SHIFT + nlevels * 9; - ptpindex = (gpa >> ptpshift) & 0x1FF; - - ptpval = ptp[ptpindex]; - - /* Cannot make progress beyond this point */ - if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0) - break; - - if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) { - pgmask = (1UL << ptpshift) - 1; - hpabase = ptpval & ~pgmask; - return (hpabase | (gpa & pgmask)); - } - - /* Work our way down to the next level page table page */ -#ifdef __FreBSD__ - ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK); -#else - ptp = (uint64_t *)hat_kpm_pfn2va(btop(ptpval & EPT_ADDR_MASK)); -#endif - } - - return ((vm_paddr_t)-1); -} - -static void -ept_free_pt_entry(pt_entry_t pte) -{ - if (pte == 0) - return; - - /* sanity check */ - if ((pte & EPT_PG_SUPERPAGE) != 0) - panic("ept_free_pt_entry: pte cannot have superpage bit"); - - return; -} - -static void -ept_free_pd_entry(pd_entry_t pde) -{ - pt_entry_t *pt; - int i; - - if (pde == 0) - return; - - if ((pde & EPT_PG_SUPERPAGE) == 0) { -#ifdef __FreeBSD__ - pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK); - for (i = 0; i < NPTEPG; i++) - ept_free_pt_entry(pt[i]); - free(pt, M_VMX); /* free the page table page */ -#else - page_t *pp; - pt = (pt_entry_t *)hat_kpm_pfn2va(btop(pde & EPT_ADDR_MASK)); - for (i = 0; i < NPTEPG; i++) - ept_free_pt_entry(pt[i]); - pp = page_numtopp_nolock(btop(pde & EPT_ADDR_MASK)); - kmem_free((void *)pp->p_offset, PAGE_SIZE); -#endif - } -} - +#ifdef __FreeBSD__ static void -ept_free_pdp_entry(pdp_entry_t pdpe) +invept_single_context(void *arg) { - pd_entry_t *pd; - int i; - - if (pdpe == 0) - return; + struct invept_desc desc = *(struct invept_desc *)arg; - if ((pdpe & EPT_PG_SUPERPAGE) == 0) { -#ifdef __FreeBSD__ - pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK); - for (i = 0; i < NPDEPG; i++) - ept_free_pd_entry(pd[i]); - free(pd, M_VMX); /* free the page directory page */ -#else - page_t *pp; - pd = (pd_entry_t *)hat_kpm_pfn2va(btop(pdpe & EPT_ADDR_MASK)); - for (i = 0; i < NPDEPG; i++) - ept_free_pd_entry(pd[i]); - pp = page_numtopp_nolock(btop(pdpe & EPT_ADDR_MASK)); - kmem_free((void *)pp->p_offset, PAGE_SIZE); -#endif - } + invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); } -static void -ept_free_pml4_entry(pml4_entry_t pml4e) +void +ept_invalidate_mappings(u_long eptp) { - pdp_entry_t *pdp; - int i; + struct invept_desc invept_desc = { 0 }; - if (pml4e == 0) - return; + invept_desc.eptp = eptp; - if ((pml4e & EPT_PG_SUPERPAGE) == 0) { -#ifdef __FreeBSD__ - pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK); - for (i = 0; i < NPDPEPG; i++) - ept_free_pdp_entry(pdp[i]); - free(pdp, M_VMX); /* free the page directory ptr page */ -#else - page_t *pp; - pdp = (pdp_entry_t *)hat_kpm_pfn2va(btop(pml4e - & EPT_ADDR_MASK)); - for (i = 0; i < NPDPEPG; i++) - ept_free_pdp_entry(pdp[i]); - pp = page_numtopp_nolock(btop(pml4e & EPT_ADDR_MASK)); - kmem_free((void *)pp->p_offset, PAGE_SIZE); -#endif - } + smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); } - +#else /* __FreeBSD__ */ void -ept_vmcleanup(struct vmx *vmx) 
+ept_invalidate_mappings(u_long eptp) { - int i; - - for (i = 0; i < NPML4EPG; i++) - ept_free_pml4_entry(vmx->pml4ept[i]); + hma_vmx_invept_allcpus((uintptr_t)eptp); } +#endif /* __FreeBSD__ */ -int -ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len, - vm_memattr_t attr, int prot, boolean_t spok) +static int +ept_pinit(pmap_t pmap) { - size_t n; - struct vmx *vmx = arg; - - while (len > 0) { - n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr, - prot, spok); - len -= n; - gpa += n; - hpa += n; - } - return (0); + return (pmap_pinit_type(pmap, PT_EPT, ept_pmap_flags)); } -vm_paddr_t -ept_vmmmap_get(void *arg, vm_paddr_t gpa) +struct vmspace * +ept_vmspace_alloc(vm_offset_t min, vm_offset_t max) { - vm_paddr_t hpa; - struct vmx *vmx; - vmx = arg; - hpa = ept_lookup_mapping(vmx->pml4ept, gpa); - return (hpa); + return (vmspace_alloc(min, max, ept_pinit)); } -static void -invept_single_context(void *arg) +void +ept_vmspace_free(struct vmspace *vmspace) { - struct invept_desc desc = *(struct invept_desc *)arg; - invept(INVEPT_TYPE_SINGLE_CONTEXT, desc); + vmspace_free(vmspace); } -void -ept_invalidate_mappings(u_long pml4ept) +uint64_t +eptp(uint64_t pml4) { - struct invept_desc invept_desc = { 0 }; + uint64_t eptp_val; - invept_desc.eptp = EPTP(pml4ept); + eptp_val = pml4 | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK; + if (ept_enable_ad_bits) + eptp_val |= EPT_ENABLE_AD_BITS; - smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc); + return (eptp_val); } diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.h b/usr/src/uts/i86pc/io/vmm/intel/ept.h index d0bcce7ec3..4a029e8b22 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/ept.h +++ b/usr/src/uts/i86pc/io/vmm/intel/ept.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/ept.h 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ #ifndef _EPT_H_ @@ -31,13 +33,9 @@ struct vmx; -#define EPT_PWLEVELS 4 /* page walk levels */ -#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK) - -int ept_init(void); -int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length, - vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings); -vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa); -void ept_invalidate_mappings(u_long ept_pml4); -void ept_vmcleanup(struct vmx *vmx); +int ept_init(int ipinum); +void ept_invalidate_mappings(u_long eptp); +struct vmspace *ept_vmspace_alloc(vm_offset_t min, vm_offset_t max); +void ept_vmspace_free(struct vmspace *vmspace); +uint64_t eptp(uint64_t pml4); #endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/offsets.in b/usr/src/uts/i86pc/io/vmm/intel/offsets.in new file mode 100644 index 0000000000..d60a2d8f5f --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/offsets.in @@ -0,0 +1,62 @@ +/* + * COPYRIGHT 2014 Pluribus Networks Inc. + * + * All rights reserved. This copyright notice is Copyright Management + * Information under 17 USC 1202 and is included to protect this work and + * deter copyright infringement. Removal or alteration of this Copyright + * Management Information without the express written permission from + * Pluribus Networks Inc is prohibited, and any such unauthorized removal + * or alteration will be a violation of federal law. 
+ */ +#include <sys/types.h> +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/cpuvar.h> + +#include <machine/pmap.h> +#include <machine/vmm.h> + +#include "intel/vmx_cpufunc.h" +#include "intel/vmx.h" +#include "vm/vm_glue.h" + +vmxctx + guest_rdi VMXCTX_GUEST_RDI + guest_rsi VMXCTX_GUEST_RSI + guest_rdx VMXCTX_GUEST_RDX + guest_rcx VMXCTX_GUEST_RCX + guest_r8 VMXCTX_GUEST_R8 + guest_r9 VMXCTX_GUEST_R9 + guest_rax VMXCTX_GUEST_RAX + guest_rbx VMXCTX_GUEST_RBX + guest_rbp VMXCTX_GUEST_RBP + guest_r10 VMXCTX_GUEST_R10 + guest_r11 VMXCTX_GUEST_R11 + guest_r12 VMXCTX_GUEST_R12 + guest_r13 VMXCTX_GUEST_R13 + guest_r14 VMXCTX_GUEST_R14 + guest_r15 VMXCTX_GUEST_R15 + guest_cr2 VMXCTX_GUEST_CR2 + inst_fail_status VMXCTX_INST_FAIL_STATUS + pmap VMXCTX_PMAP + +vmx + eptgen VMX_EPTGEN + eptp VMX_EPTP + +pmap + pm_active PM_ACTIVE + pm_eptgen PM_EPTGEN + +cpu + cpu_id + +\#define VM_SUCCESS 0 +\#define VM_FAIL_INVALID 1 +\#define VM_FAIL_VALID 2 + +\#define VMX_GUEST_VMEXIT 0 +\#define VMX_VMRESUME_ERROR 1 +\#define VMX_VMLAUNCH_ERROR 2 +\#define VMX_INVEPT_ERROR 3 +\#define VMX_VMWRITE_ERROR 4 diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.c b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c index bbd2da2a34..d19f6bc262 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmcs.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,6 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifdef __FreeBSD__ @@ -43,9 +46,10 @@ #endif #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> +#include <sys/sysctl.h> #include <sys/systm.h> #include <sys/pcpu.h> @@ -64,6 +68,12 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z #include <ddb/ddb.h> #endif +SYSCTL_DECL(_hw_vmm_vmx); + +static int no_flush_rsb; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, no_flush_rsb, CTLFLAG_RW, + &no_flush_rsb, 0, "Do not flush RSB upon vmexit"); + static uint64_t vmcs_fix_regval(uint32_t encoding, uint64_t val) { @@ -117,6 +127,14 @@ vmcs_field_encoding(int ident) return (VMCS_GUEST_LDTR_SELECTOR); case VM_REG_GUEST_EFER: return (VMCS_GUEST_IA32_EFER); + case VM_REG_GUEST_PDPTE0: + return (VMCS_GUEST_PDPTE0); + case VM_REG_GUEST_PDPTE1: + return (VMCS_GUEST_PDPTE1); + case VM_REG_GUEST_PDPTE2: + return (VMCS_GUEST_PDPTE2); + case VM_REG_GUEST_PDPTE3: + return (VMCS_GUEST_PDPTE3); default: return (-1); } @@ -332,40 +350,15 @@ done: return (error); } -#ifndef __FreeBSD__ -int -vmcs_set_host_msr_save(struct vmcs *vmcs, u_long h_area, u_int h_count) -{ - int error; - - VMPTRLD(vmcs); - - /* - * Host MSRs are loaded from the VM-exit MSR-load area. 
- */ - if ((error = vmwrite(VMCS_EXIT_MSR_LOAD, h_area)) != 0) - goto done; - if ((error = vmwrite(VMCS_EXIT_MSR_LOAD_COUNT, h_count)) != 0) - goto done; - - error = 0; -done: - VMCLEAR(vmcs); - return (error); -} -#endif - int -vmcs_set_defaults(struct vmcs *vmcs, - u_long host_rip, u_long host_rsp, u_long ept_pml4, - uint32_t pinbased_ctls, uint32_t procbased_ctls, - uint32_t procbased_ctls2, uint32_t exit_ctls, - uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid) +vmcs_init(struct vmcs *vmcs) { int error, codesel, datasel, tsssel; u_long cr0, cr4, efer; - uint64_t eptp, pat, fsbase, idtrbase; - uint32_t exc_bitmap; + uint64_t pat; +#ifdef __FreeBSD__ + uint64_t fsbase, idtrbase; +#endif codesel = vmm_get_host_codesel(); datasel = vmm_get_host_datasel(); @@ -376,34 +369,6 @@ vmcs_set_defaults(struct vmcs *vmcs, */ VMPTRLD(vmcs); - /* - * Load the VMX controls - */ - if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0) - goto done; - if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0) - goto done; - if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0) - goto done; - if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0) - goto done; - if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0) - goto done; - - /* Guest state */ - - /* Initialize guest IA32_PAT MSR with the default value */ - pat = PAT_VALUE(0, PAT_WRITE_BACK) | - PAT_VALUE(1, PAT_WRITE_THROUGH) | - PAT_VALUE(2, PAT_UNCACHED) | - PAT_VALUE(3, PAT_UNCACHEABLE) | - PAT_VALUE(4, PAT_WRITE_BACK) | - PAT_VALUE(5, PAT_WRITE_THROUGH) | - PAT_VALUE(6, PAT_UNCACHED) | - PAT_VALUE(7, PAT_UNCACHEABLE); - if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0) - goto done; - /* Host state */ /* Initialize host IA32_PAT MSR */ @@ -466,37 +431,35 @@ vmcs_set_defaults(struct vmcs *vmcs, fsbase = vmm_get_host_fsbase(); if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0) goto done; -#endif idtrbase = vmm_get_host_idtrbase(); if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0) goto done; - /* instruction pointer */ - if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0) - goto done; - - /* stack pointer */ - if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0) - goto done; - - /* eptp */ - eptp = EPTP(ept_pml4); - if ((error = vmwrite(VMCS_EPTP, eptp)) != 0) +#else /* __FreeBSD__ */ + /* + * Configure host sysenter MSRs to be restored on VM exit. + * The thread-specific MSR_INTC_SEP_ESP value is loaded in vmx_run. 
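+	 * The sysenter %esp points at the running thread's kernel stack,
+	 * so it cannot be written once here the way the constant CS
+	 * selector and EIP entry point can.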
+ */ + if ((error = vmwrite(VMCS_HOST_IA32_SYSENTER_CS, KCS_SEL)) != 0) goto done; - - /* vpid */ - if ((error = vmwrite(VMCS_VPID, vpid)) != 0) + /* Natively defined as MSR_INTC_SEP_EIP */ + if ((error = vmwrite(VMCS_HOST_IA32_SYSENTER_EIP, + rdmsr(MSR_SYSENTER_EIP_MSR))) != 0) goto done; - /* msr bitmap */ - if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0) - goto done; +#endif /* __FreeBSD__ */ - /* exception bitmap */ - exc_bitmap = 1 << IDT_MC; - if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0) - goto done; + /* instruction pointer */ + if (no_flush_rsb) { + if ((error = vmwrite(VMCS_HOST_RIP, + (u_long)vmx_exit_guest)) != 0) + goto done; + } else { + if ((error = vmwrite(VMCS_HOST_RIP, + (u_long)vmx_exit_guest_flush_rsb)) != 0) + goto done; + } /* link pointer */ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0) diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.h b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h index 20e99e8184..edde5c6dd5 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmcs.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,19 +25,40 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmcs.h 276098 2014-12-23 02:14:49Z neel $ + * $FreeBSD$ + */ + +/* + * Copyright 2017 Joyent, Inc. */ #ifndef _VMCS_H_ #define _VMCS_H_ #ifdef _KERNEL +#ifndef _ASM struct vmcs { uint32_t identifier; uint32_t abort_code; char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2]; +#ifndef __FreeBSD__ + /* + * Keep the physical address of the VMCS cached adjacent for the + * structure so it can be referenced in contexts which are too delicate + * for a call into the HAT. For the moment it means wasting a whole + * page on padding for the PA value to maintain alignment, but it + * allows the consumers of 'struct vmcs *' to easily access the value + * without a significant change to the interface. + */ + uint64_t vmcs_pa; + char _pa_pad[PAGE_SIZE - sizeof (vm_paddr_t)]; +#endif }; +#ifdef __FreeBSD__ CTASSERT(sizeof(struct vmcs) == PAGE_SIZE); +#else +CTASSERT(sizeof(struct vmcs) == (2*PAGE_SIZE)); +#endif /* MSR save region is composed of an array of 'struct msr_entry' */ struct msr_entry { @@ -47,15 +70,6 @@ struct msr_entry { int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count); int vmcs_init(struct vmcs *vmcs); -#ifndef __FreeBSD__ -int vmcs_set_host_msr_save(struct vmcs *vmcs, u_long h_area, u_int h_count); -#endif -int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp, - u_long ept_pml4, - uint32_t pinbased_ctls, uint32_t procbased_ctls, - uint32_t procbased_ctls2, uint32_t exit_ctls, - uint32_t entry_ctls, u_long msr_bitmap, - uint16_t vpid); int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv); int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val); int vmcs_getdesc(struct vmcs *vmcs, int running, int ident, @@ -86,6 +100,65 @@ vmcs_write(uint32_t encoding, uint64_t val) error = vmwrite(encoding, val); KASSERT(error == 0, ("vmcs_write(%u) error %d", encoding, error)); } + +#ifndef __FreeBSD__ +/* + * Due to header complexity combined with the need to cache the physical + * address for the VMCS, these must be defined here rather than vmx_cpufunc.h. 
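+ * Both operate on the cached vmcs_pa, keeping the VMPTRLD/VMCLEAR paths
+ * free of HAT calls even within their critical sections.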
+ */ +static __inline int +vmclear(struct vmcs *vmcs) +{ + int error; + uint64_t addr = vmcs->vmcs_pa; + + __asm __volatile("vmclear %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline int +vmptrld(struct vmcs *vmcs) +{ + int error; + uint64_t addr = vmcs->vmcs_pa; + + __asm __volatile("vmptrld %[addr];" + VMX_SET_ERROR_CODE + : [error] "=r" (error) + : [addr] "m" (*(uint64_t *)&addr) + : "memory"); + return (error); +} + +static __inline void +VMCLEAR(struct vmcs *vmcs) +{ + int err; + + err = vmclear(vmcs); + if (err != 0) + panic("%s: vmclear(%p) error %d", __func__, vmcs, err); + + critical_exit(); +} + +static __inline void +VMPTRLD(struct vmcs *vmcs) +{ + int err; + + critical_enter(); + + err = vmptrld(vmcs); + if (err != 0) + panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); +} +#endif /* __FreeBSD__ */ + #endif /* _VMX_CPUFUNC_H_ */ #define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH) @@ -99,6 +172,7 @@ vmcs_write(uint32_t encoding, uint64_t val) #define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO) #define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR) +#endif /* _ASM */ #endif /* _KERNEL */ #define VMCS_INITIAL 0xffffffffffffffff @@ -345,6 +419,14 @@ vmcs_write(uint32_t encoding, uint64_t val) #define EXIT_REASON_WBINVD 54 #define EXIT_REASON_XSETBV 55 #define EXIT_REASON_APIC_WRITE 56 +#define EXIT_REASON_RDRAND 57 +#define EXIT_REASON_INVPCID 58 +#define EXIT_REASON_VMFUNC 59 +#define EXIT_REASON_ENCLS 60 +#define EXIT_REASON_RDSEED 61 +#define EXIT_REASON_PM_LOG_FULL 62 +#define EXIT_REASON_XSAVES 63 +#define EXIT_REASON_XRSTORS 64 /* * NMI unblocking due to IRET. diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c index 7ddf4e2a46..ce42ff8c9c 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. + * Copyright (c) 2018 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,10 +39,11 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z tychon $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> @@ -50,12 +54,21 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z t #include <sys/proc.h> #include <sys/sysctl.h> +#ifndef __FreeBSD__ +#include <sys/x86_archext.h> +#include <sys/smp_impldefs.h> +#include <sys/smt.h> +#include <sys/hma.h> +#include <sys/trap.h> +#endif + #include <vm/vm.h> #include <vm/pmap.h> #include <machine/psl.h> #include <machine/cpufunc.h> #include <machine/md_var.h> +#include <machine/reg.h> #include <machine/segments.h> #include <machine/smp.h> #include <machine/specialreg.h> @@ -75,6 +88,7 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z t #include "ept.h" #include "vmx_cpufunc.h" +#include "vmcs.h" #include "vmx.h" #include "vmx_msr.h" #include "x86.h" @@ -90,13 +104,30 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z t (PROCBASED_INT_WINDOW_EXITING | \ PROCBASED_NMI_WINDOW_EXITING) +#ifdef __FreeBSD__ +#define PROCBASED_CTLS_ONE_SETTING \ + (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_MWAIT_EXITING | \ + PROCBASED_MONITOR_EXITING | \ + PROCBASED_IO_EXITING | \ + PROCBASED_MSR_BITMAPS | \ + PROCBASED_CTLS_WINDOW_SETTING | \ + PROCBASED_CR8_LOAD_EXITING | \ + PROCBASED_CR8_STORE_EXITING) +#else +/* We consider TSC offset a necessity for unsynched TSC handling */ #define PROCBASED_CTLS_ONE_SETTING \ (PROCBASED_SECONDARY_CONTROLS | \ + PROCBASED_TSC_OFFSET | \ + PROCBASED_MWAIT_EXITING | \ + PROCBASED_MONITOR_EXITING | \ PROCBASED_IO_EXITING | \ PROCBASED_MSR_BITMAPS | \ PROCBASED_CTLS_WINDOW_SETTING | \ PROCBASED_CR8_LOAD_EXITING | \ PROCBASED_CR8_STORE_EXITING) +#endif /* __FreeBSD__ */ + #define PROCBASED_CTLS_ZERO_SETTING \ (PROCBASED_CR3_LOAD_EXITING | \ PROCBASED_CR3_STORE_EXITING | \ @@ -106,20 +137,21 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z t #define PROCBASED_CTLS2_ZERO_SETTING 0 #define VM_EXIT_CTLS_ONE_SETTING \ - (VM_EXIT_HOST_LMA | \ + (VM_EXIT_SAVE_DEBUG_CONTROLS | \ + VM_EXIT_HOST_LMA | \ + VM_EXIT_LOAD_PAT | \ VM_EXIT_SAVE_EFER | \ VM_EXIT_LOAD_EFER | \ - VM_EXIT_LOAD_PAT | \ - VM_EXIT_SAVE_PAT | \ - VM_EXIT_LOAD_PAT) + VM_EXIT_ACKNOWLEDGE_INTERRUPT) -#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS +#define VM_EXIT_CTLS_ZERO_SETTING 0 -#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER | VM_ENTRY_LOAD_PAT) +#define VM_ENTRY_CTLS_ONE_SETTING \ + (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ + VM_ENTRY_LOAD_EFER) #define VM_ENTRY_CTLS_ZERO_SETTING \ - (VM_ENTRY_LOAD_DEBUG_CONTROLS | \ - VM_ENTRY_INTO_SMM | \ + (VM_ENTRY_INTO_SMM | \ VM_ENTRY_DEACTIVATE_DUAL_MONITOR) #define HANDLED 1 @@ -131,11 +163,10 @@ static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic"); SYSCTL_DECL(_hw_vmm); SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL); +#ifdef __FreeBSD__ int vmxon_enabled[MAXCPU]; static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE); -#ifndef __FreeBSD__ -static vm_paddr_t vmxon_region_pa[MAXCPU]; -#endif +#endif /*__FreeBSD__ */ static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2; static uint32_t exit_ctls, entry_ctls; @@ -159,29 +190,135 @@ SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD, /* * Optional capabilities */ +SYSCTL_NODE(_hw_vmm_vmx, OID_AUTO, cap, CTLFLAG_RW, NULL, NULL); + static int cap_halt_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, halt_exit, CTLFLAG_RD, 
&cap_halt_exit, 0, + "HLT triggers a VM-exit"); + static int cap_pause_exit; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, pause_exit, CTLFLAG_RD, &cap_pause_exit, + 0, "PAUSE triggers a VM-exit"); + static int cap_unrestricted_guest; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, unrestricted_guest, CTLFLAG_RD, + &cap_unrestricted_guest, 0, "Unrestricted guests"); + static int cap_monitor_trap; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, monitor_trap, CTLFLAG_RD, + &cap_monitor_trap, 0, "Monitor trap flag"); + static int cap_invpcid; +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, invpcid, CTLFLAG_RD, &cap_invpcid, + 0, "Guests are allowed to use INVPCID"); static int virtual_interrupt_delivery; -SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD, &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support"); static int posted_interrupts; -SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD, +SYSCTL_INT(_hw_vmm_vmx_cap, OID_AUTO, posted_interrupts, CTLFLAG_RD, &posted_interrupts, 0, "APICv posted interrupt support"); -static int pirvec; +static int pirvec = -1; SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD, &pirvec, 0, "APICv posted interrupt vector"); +#ifdef __FreeBSD__ static struct unrhdr *vpid_unr; +#endif /* __FreeBSD__ */ static u_int vpid_alloc_failed; SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD, &vpid_alloc_failed, 0, NULL); +static int guest_l1d_flush; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush, CTLFLAG_RD, + &guest_l1d_flush, 0, NULL); +static int guest_l1d_flush_sw; +SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, l1d_flush_sw, CTLFLAG_RD, + &guest_l1d_flush_sw, 0, NULL); + +static struct msr_entry msr_load_list[1] __aligned(16); + +/* + * The definitions of SDT probes for VMX. 
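Each SDT_PROBE_DEFINEn below only declares a probe site; the matching SDT_PROBEn call fires it from vmx_exit_process(), as later hunks in this file show. A representative pair, gathered here for clarity:

    /* declaration: provider vmm, module vmx, function exit, name halt */
    SDT_PROBE_DEFINE3(vmm, vmx, exit, halt,
        "struct vmx *", "int", "struct vm_exit *");

    /* fire site, from the EXIT_REASON_HLT case of vmx_exit_process() */
    SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit);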
+ */ + +SDT_PROBE_DEFINE3(vmm, vmx, exit, entry, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, taskswitch, + "struct vmx *", "int", "struct vm_exit *", "struct vm_task_switch *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, craccess, + "struct vmx *", "int", "struct vm_exit *", "uint64_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, rdmsr, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, wrmsr, + "struct vmx *", "int", "struct vm_exit *", "uint32_t", "uint64_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, halt, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, mtrap, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, pause, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, intrwindow, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, interrupt, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, nmiwindow, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, inout, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, cpuid, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, exception, + "struct vmx *", "int", "struct vm_exit *", "uint32_t", "int"); + +SDT_PROBE_DEFINE5(vmm, vmx, exit, nestedfault, + "struct vmx *", "int", "struct vm_exit *", "uint64_t", "uint64_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, mmiofault, + "struct vmx *", "int", "struct vm_exit *", "uint64_t"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, eoi, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, apicaccess, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, apicwrite, + "struct vmx *", "int", "struct vm_exit *", "struct vlapic *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, xsetbv, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, monitor, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, mwait, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE3(vmm, vmx, exit, vminsn, + "struct vmx *", "int", "struct vm_exit *"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, unknown, + "struct vmx *", "int", "struct vm_exit *", "uint32_t"); + +SDT_PROBE_DEFINE4(vmm, vmx, exit, return, + "struct vmx *", "int", "struct vm_exit *", "int"); + /* * Use the last page below 4GB as the APIC access address. 
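The page described here is the APIC_ACCESS_ADDRESS constant used throughout this file; the customary definition is the last page below 4GB (0xFFFFF000), though that value comes from vmx.h rather than this hunk, so treat it as an assumption. It is wired up per-VM in vmx_vminit(), further down:

    /* Back the guest's default APIC window with the access page so
     * APIC accesses VM-exit with a useful exit qualification. */
    error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE,
        APIC_ACCESS_ADDRESS);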
This address is * occupied by the boot firmware so it is guaranteed that it will not conflict @@ -193,6 +330,9 @@ static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc); static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval); static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val); static void vmx_inject_pir(struct vlapic *vlapic); +#ifndef __FreeBSD__ +static int vmx_apply_tsc_adjust(struct vmx *, int); +#endif /* __FreeBSD__ */ #ifdef KTR static const char * @@ -279,8 +419,8 @@ exit_reason_to_str(int reason) return "monitor"; case EXIT_REASON_PAUSE: return "pause"; - case EXIT_REASON_MCE: - return "mce"; + case EXIT_REASON_MCE_DURING_ENTRY: + return "mce-during-entry"; case EXIT_REASON_TPR: return "tpr"; case EXIT_REASON_APIC_ACCESS: @@ -312,83 +452,6 @@ exit_reason_to_str(int reason) return (reasonbuf); } } - -#ifdef SETJMP_TRACE -static const char * -vmx_setjmp_rc2str(int rc) -{ - switch (rc) { - case VMX_RETURN_DIRECT: - return "direct"; - case VMX_RETURN_LONGJMP: - return "longjmp"; - case VMX_RETURN_VMRESUME: - return "vmresume"; - case VMX_RETURN_VMLAUNCH: - return "vmlaunch"; - case VMX_RETURN_AST: - return "ast"; - default: - return "unknown"; - } -} - -#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \ - VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \ - (vmxctx)->regname) - -static void -vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) -{ - uint64_t host_rip, host_rsp; - - if (vmxctx != &vmx->ctx[vcpu]) - panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p", - vmxctx, &vmx->ctx[vcpu]); - - VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx); - VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)", - vmx_setjmp_rc2str(rc), rc); - - host_rsp = host_rip = ~0; - vmread(VMCS_HOST_RIP, &host_rip); - vmread(VMCS_HOST_RSP, &host_rsp); - VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx", - host_rip, host_rsp); - - SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx); - SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip); - - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15); - SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2); -} -#endif -#else -static void __inline -vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc) -{ - return; -} #endif /* KTR */ static int @@ -411,7 +474,7 @@ vmx_allow_x2apic_msrs(struct vmx *vmx) for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i); - + for (i = 0; i < 8; i++) error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i); @@ -465,7 +528,11 @@ vpid_free(int vpid) */ if (vpid > VM_MAXCPU) +#ifdef __FreeBSD__ free_unr(vpid_unr, vpid); +#else + 
hma_vmx_vpid_free((uint16_t)vpid); +#endif } static void @@ -490,7 +557,14 @@ vpid_alloc(uint16_t *vpid, int num) * Allocate a unique VPID for each vcpu from the unit number allocator. */ for (i = 0; i < num; i++) { +#ifdef __FreeBSD__ x = alloc_unr(vpid_unr); +#else + uint16_t tmp; + + tmp = hma_vmx_vpid_alloc(); + x = (tmp == 0) ? -1 : tmp; +#endif if (x == -1) break; else @@ -519,6 +593,7 @@ vpid_alloc(uint16_t *vpid, int num) } } +#ifdef __FreeBSD__ static void vpid_init(void) { @@ -535,50 +610,6 @@ vpid_init(void) vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL); } -#ifndef __FreeBSD__ -static void -msr_save_area_init(struct msr_entry *g_area, int *g_count) -{ - int cnt; - - static struct msr_entry guest_msrs[] = { - { MSR_KGSBASE, 0, 0 }, - { MSR_LSTAR, 0, 0 }, - { MSR_CSTAR, 0, 0 }, - { MSR_STAR, 0, 0 }, - { MSR_SF_MASK, 0, 0 }, - }; - - cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]); - if (cnt > GUEST_MSR_MAX_ENTRIES) - panic("guest msr save area overrun"); - bcopy(guest_msrs, g_area, sizeof(guest_msrs)); - *g_count = cnt; -} - -static void -host_msr_save_area_init(struct msr_entry *h_area, int *h_count) -{ - int i, cnt; - - static struct msr_entry host_msrs[] = { - { MSR_LSTAR, 0, 0 }, - { MSR_CSTAR, 0, 0 }, - { MSR_STAR, 0, 0 }, - { MSR_SF_MASK, 0, 0 }, - }; - - cnt = sizeof(host_msrs) / sizeof(host_msrs[0]); - if (cnt > HOST_MSR_MAX_ENTRIES) - panic("host msr save area overrun"); - for (i = 0; i < cnt; i++) { - host_msrs[i].val = rdmsr(host_msrs[i].index); - } - bcopy(host_msrs, h_area, sizeof(host_msrs)); - *h_count = cnt; -} -#endif - static void vmx_disable(void *arg __unused) { @@ -603,17 +634,18 @@ vmx_disable(void *arg __unused) static int vmx_cleanup(void) { - -#ifdef __FreeBSD__ - if (pirvec != 0) - vmm_ipi_free(pirvec); -#endif + + if (pirvec >= 0) + lapic_ipi_free(pirvec); if (vpid_unr != NULL) { delete_unrhdr(vpid_unr); vpid_unr = NULL; } + if (nmi_flush_l1d_sw == 1) + nmi_flush_l1d_sw = 0; + smp_rendezvous(NULL, vmx_disable, NULL, NULL); return (0); @@ -636,40 +668,50 @@ vmx_enable(void *arg __unused) load_cr4(rcr4() | CR4_VMXE); *(uint32_t *)vmxon_region[curcpu] = vmx_revision(); -#ifdef __FreeBSD__ error = vmxon(vmxon_region[curcpu]); -#else - error = vmxon_pa(vmxon_region_pa[curcpu]); - ASSERT(error == 0); -#endif if (error == 0) vmxon_enabled[curcpu] = 1; } +static void +vmx_restore(void) +{ + + if (vmxon_enabled[curcpu]) + vmxon(vmxon_region[curcpu]); +} +#else /* __FreeBSD__ */ static int -vmx_init(void) +vmx_cleanup(void) { -#define X86FSET_VMX 35 - extern uchar_t x86_featureset[]; - extern boolean_t is_x86_feature(void *featureset, uint_t feature); - int error; - uint64_t fixed0, fixed1, feature_control; - uint32_t tmp; -#ifndef __FreeBSD__ - int i; + /* This is taken care of by the hma registration */ + return (0); +} + +static void +vmx_restore(void) +{ + /* No-op on illumos */ +} +#endif /* __FreeBSD__ */ + +static int +vmx_init(int ipinum) +{ + int error, use_tpr_shadow; +#ifdef __FreeBSD__ + uint64_t basic, fixed0, fixed1, feature_control; +#else + uint64_t fixed0, fixed1; #endif + uint32_t tmp, procbased2_vid_bits; +#ifdef __FreeBSD__ /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */ -#ifdef __FreeBSD__ if (!(cpu_feature2 & CPUID2_VMX)) { printf("vmx_init: processor does not support VMX operation\n"); return (ENXIO); } -#else - if (!is_x86_feature(x86_featureset, X86FSET_VMX)) { - cmn_err(CE_WARN, "vmx_init: processor does not support VMX operation\n"); - } -#endif /* * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits @@ 
-682,6 +724,18 @@ vmx_init(void) return (ENXIO); } + /* + * Verify capabilities MSR_VMX_BASIC: + * - bit 54 indicates support for INS/OUTS decoding + */ + basic = rdmsr(MSR_VMX_BASIC); + if ((basic & (1UL << 54)) == 0) { + printf("vmx_init: processor does not support desired basic " + "capabilities\n"); + return (EINVAL); + } +#endif /* __FreeBSD__ */ + /* Check support for primary processor-based VM-execution controls */ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, MSR_VMX_TRUE_PROCBASED_CTLS, @@ -769,13 +823,119 @@ vmx_init(void) PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp) == 0); + cap_invpcid = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, PROCBASED2_ENABLE_INVPCID, 0, + &tmp) == 0); + + /* + * Check support for virtual interrupt delivery. + */ + procbased2_vid_bits = (PROCBASED2_VIRTUALIZE_APIC_ACCESSES | + PROCBASED2_VIRTUALIZE_X2APIC_MODE | + PROCBASED2_APIC_REGISTER_VIRTUALIZATION | + PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY); + + use_tpr_shadow = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_USE_TPR_SHADOW, 0, + &tmp) == 0); + + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + procbased2_vid_bits, 0, &tmp); + if (error == 0 && use_tpr_shadow) { + virtual_interrupt_delivery = 1; + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_vid", + &virtual_interrupt_delivery); + } + + if (virtual_interrupt_delivery) { + procbased_ctls |= PROCBASED_USE_TPR_SHADOW; + procbased_ctls2 |= procbased2_vid_bits; + procbased_ctls2 &= ~PROCBASED2_VIRTUALIZE_X2APIC_MODE; + + /* + * No need to emulate accesses to %CR8 if virtual + * interrupt delivery is enabled. + */ + procbased_ctls &= ~PROCBASED_CR8_LOAD_EXITING; + procbased_ctls &= ~PROCBASED_CR8_STORE_EXITING; + + /* + * Check for Posted Interrupts only if Virtual Interrupt + * Delivery is enabled. + */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_POSTED_INTERRUPT, 0, + &tmp); + if (error == 0) { +#ifdef __FreeBSD__ + pirvec = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); + if (pirvec < 0) { + if (bootverbose) { + printf("vmx_init: unable to allocate " + "posted interrupt vector\n"); + } + } else { + posted_interrupts = 1; + TUNABLE_INT_FETCH("hw.vmm.vmx.use_apic_pir", + &posted_interrupts); + } +#else + /* + * If the PSM-provided interfaces for requesting and + * using a PIR IPI vector are present, use them for + * posted interrupts. + */ + if (psm_get_pir_ipivect != NULL && + psm_send_pir_ipi != NULL) { + pirvec = psm_get_pir_ipivect(); + posted_interrupts = 1; + } +#endif + } + } + + if (posted_interrupts) + pinbased_ctls |= PINBASED_POSTED_INTERRUPT; + /* Initialize EPT */ - error = ept_init(); + error = ept_init(ipinum); if (error) { printf("vmx_init: ept initialization failed (%d)\n", error); return (error); } +#ifdef __FreeBSD__ + guest_l1d_flush = (cpu_ia32_arch_caps & + IA32_ARCH_CAP_SKIP_L1DFL_VMENTRY) == 0; + TUNABLE_INT_FETCH("hw.vmm.l1d_flush", &guest_l1d_flush); + + /* + * L1D cache flush is enabled. Use IA32_FLUSH_CMD MSR when + * available. Otherwise fall back to the software flush + * method which loads enough data from the kernel text to + * flush existing L1D content, both on VMX entry and on NMI + * return. 
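On illumos the posted-interrupt plumbing defers to the PSM hooks probed above. The notification side is not part of this hunk; a minimal sketch of how a sender would nudge a running vCPU, assuming psm_send_pir_ipi() takes the target host CPU id (signature assumed, not shown in this diff):

    /* After setting the vector's bit in the vCPU's PIR descriptor,
     * kick the host CPU it is running on; hardware then merges PIR
     * into the virtual IRR without forcing a full VM exit. */
    static void
    vmx_pir_notify(processorid_t hostcpu)
    {
            psm_send_pir_ipi(hostcpu);
    }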
+ */ + if (guest_l1d_flush) { + if ((cpu_stdext_feature3 & CPUID_STDEXT3_L1D_FLUSH) == 0) { + guest_l1d_flush_sw = 1; + TUNABLE_INT_FETCH("hw.vmm.l1d_flush_sw", + &guest_l1d_flush_sw); + } + if (guest_l1d_flush_sw) { + if (nmi_flush_l1d_sw <= 1) + nmi_flush_l1d_sw = 1; + } else { + msr_load_list[0].index = MSR_IA32_FLUSH_CMD; + msr_load_list[0].val = IA32_FLUSH_CMD_L1D; + } + } +#else + /* L1D flushing is taken care of by smt_acquire() and friends */ + guest_l1d_flush = 0; +#endif /* __FreeBSD__ */ + /* * Stash the cr0 and cr4 bits that must be fixed to 0 or 1 */ @@ -801,24 +961,52 @@ vmx_init(void) cr4_ones_mask = fixed0 & fixed1; cr4_zeros_mask = ~fixed0 & ~fixed1; -#ifndef __FreeBSD__ - for (i = 0; i < MAXCPU; i++) { - vmxon_region_pa[i] = vtophys(&vmxon_region[i]); - } -#endif - +#ifdef __FreeBSD__ vpid_init(); +#endif vmx_msr_init(); +#ifdef __FreeBSD__ /* enable VMX operation */ smp_rendezvous(NULL, vmx_enable, NULL, NULL); +#endif vmx_initialized = 1; return (0); } +static void +vmx_trigger_hostintr(int vector) +{ +#ifdef __FreeBSD__ + uintptr_t func; + struct gate_descriptor *gd; + + gd = &idt[vector]; + + KASSERT(vector >= 32 && vector <= 255, ("vmx_trigger_hostintr: " + "invalid vector %d", vector)); + KASSERT(gd->gd_p == 1, ("gate descriptor for vector %d not present", + vector)); + KASSERT(gd->gd_type == SDT_SYSIGT, ("gate descriptor for vector %d " + "has invalid type %d", vector, gd->gd_type)); + KASSERT(gd->gd_dpl == SEL_KPL, ("gate descriptor for vector %d " + "has invalid dpl %d", vector, gd->gd_dpl)); + KASSERT(gd->gd_selector == GSEL(GCODE_SEL, SEL_KPL), ("gate descriptor " + "for vector %d has invalid selector %d", vector, gd->gd_selector)); + KASSERT(gd->gd_ist == 0, ("gate descriptor for vector %d has invalid " + "IST %d", vector, gd->gd_ist)); + + func = ((long)gd->gd_hioffset << 16 | gd->gd_looffset); + vmx_call_isr(func); +#else + VERIFY(vector >= 32 && vector <= 255); + vmx_call_isr(vector - 32); +#endif /* __FreeBSD__ */ +} + static int vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) { @@ -852,15 +1040,14 @@ vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial) #define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init)) static void * -vmx_vminit(struct vm *vm) +vmx_vminit(struct vm *vm, pmap_t pmap) { uint16_t vpid[VM_MAXCPU]; - int i, error, guest_msr_count; -#ifndef __FreeBSD__ - int host_msr_count; -#endif + int i, error; struct vmx *vmx; struct vmcs *vmcs; + uint32_t exc_bitmap; + uint16_t maxcpus; vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO); if ((uintptr_t)vmx & PAGE_MASK) { @@ -869,6 +1056,8 @@ vmx_vminit(struct vm *vm) } vmx->vm = vm; + vmx->eptp = eptp(vtophys((vm_offset_t)pmap->pm_pml4)); + /* * Clean up EPTP-tagged guest physical and combined mappings * @@ -878,7 +1067,7 @@ vmx_vminit(struct vm *vm) * * Combined mappings for this EP4TA are also invalidated for all VPIDs. */ - ept_invalidate_mappings(vtophys(vmx->pml4ept)); + ept_invalidate_mappings(vmx->eptp); msr_bitmap_initialize(vmx->msr_bitmap); @@ -896,10 +1085,6 @@ vmx_vminit(struct vm *vm) * VM exit and entry respectively. It is also restored from the * host VMCS area on a VM exit. * - * MSR_PAT is saved and restored in the guest VMCS are on a VM exit - * and entry respectively. It is also restored from the host VMCS - * area on a VM exit. - * * The TSC MSR is exposed read-only. Writes are disallowed as * that will impact the host TSC. 
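The MSR-bitmap helpers used in the next hunk differ in one bit: guest_msr_rw() lets the guest both read and write an MSR without exiting, while guest_msr_ro() passes reads through but keeps the write intercept. That is the point of the TSC note above; in compressed form:

    /* syscall/sysenter bookkeeping MSRs: no exits either way */
    guest_msr_rw(vmx, MSR_EFER);
    /* TSC: reads pass through, but a guest WRMSR still VM-exits and
     * is folded into VMCS_TSC_OFFSET instead of moving the host TSC */
    guest_msr_ro(vmx, MSR_TSC);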
If the guest does a write * the "use TSC offsetting" execution control is enabled and the @@ -912,15 +1097,36 @@ vmx_vminit(struct vm *vm) guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) || guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) || guest_msr_rw(vmx, MSR_EFER) || - guest_msr_rw(vmx, MSR_PAT) || guest_msr_ro(vmx, MSR_TSC)) panic("vmx_vminit: error setting guest msr access"); vpid_alloc(vpid, VM_MAXCPU); - for (i = 0; i < VM_MAXCPU; i++) { + if (virtual_interrupt_delivery) { + error = vm_map_mmio(vm, DEFAULT_APIC_BASE, PAGE_SIZE, + APIC_ACCESS_ADDRESS); + /* XXX this should really return an error to the caller */ + KASSERT(error == 0, ("vm_map_mmio(apicbase) error %d", error)); + } + + maxcpus = vm_get_maxcpus(vm); + for (i = 0; i < maxcpus; i++) { +#ifndef __FreeBSD__ + /* + * Cache physical address lookups for various components which + * may be required inside the critical_enter() section implied + * by VMPTRLD() below. + */ + vm_paddr_t msr_bitmap_pa = vtophys(vmx->msr_bitmap); + vm_paddr_t apic_page_pa = vtophys(&vmx->apic_page[i]); + vm_paddr_t pir_desc_pa = vtophys(&vmx->pir_desc[i]); +#endif /* __FreeBSD__ */ + vmcs = &vmx->vmcs[i]; vmcs->identifier = vmx_revision(); +#ifndef __FreeBSD__ + vmcs->vmcs_pa = (uint64_t)vtophys(vmcs); +#endif error = vmclear(vmcs); if (error != 0) { panic("vmx_vminit: vmclear error %d on vcpu %d\n", @@ -929,42 +1135,83 @@ vmx_vminit(struct vm *vm) vmx_msr_guest_init(vmx, i); - error = vmcs_set_defaults(vmcs, - (u_long)vmx_longjmp, - (u_long)&vmx->ctx[i], - vtophys(vmx->pml4ept), - pinbased_ctls, - procbased_ctls, - procbased_ctls2, - exit_ctls, entry_ctls, - vtophys(vmx->msr_bitmap), - vpid[i]); - - if (error != 0) - panic("vmx_vminit: vmcs_set_defaults error %d", error); + error = vmcs_init(vmcs); + KASSERT(error == 0, ("vmcs_init error %d", error)); - vmx->cap[i].set = 0; - vmx->cap[i].proc_ctls = procbased_ctls; + VMPTRLD(vmcs); + error = 0; +#ifdef __FreeBSD__ + /* + * The illumos vmx_enter_guest implementation avoids some of + * the %rsp-manipulation games which are present in the stock + * one from FreeBSD. 
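The "error += vmwrite(...)" style in this block is a deliberate replacement for the old goto-per-call shape of vmcs_set_defaults(): vmwrite() returns 0 on success, so the writes run straight-line and a single KASSERT audits the sum. The pattern, reduced to its skeleton:

    int err = 0;

    /* straight-line VMCS programming; one check at the end */
    err += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls);
    err += vmwrite(VMCS_EXIT_CTLS, exit_ctls);
    err += vmwrite(VMCS_ENTRY_CTLS, entry_ctls);
    KASSERT(err == 0, ("vmcs programming failed"));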
+ */ + error += vmwrite(VMCS_HOST_RSP, (u_long)&vmx->ctx[i]); +#endif + error += vmwrite(VMCS_EPTP, vmx->eptp); + error += vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls); + error += vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls); + error += vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2); + error += vmwrite(VMCS_EXIT_CTLS, exit_ctls); + error += vmwrite(VMCS_ENTRY_CTLS, entry_ctls); +#ifdef __FreeBSD__ + error += vmwrite(VMCS_MSR_BITMAP, vtophys(vmx->msr_bitmap)); +#else + error += vmwrite(VMCS_MSR_BITMAP, msr_bitmap_pa); +#endif + error += vmwrite(VMCS_VPID, vpid[i]); + + if (guest_l1d_flush && !guest_l1d_flush_sw) { + vmcs_write(VMCS_ENTRY_MSR_LOAD, pmap_kextract( + (vm_offset_t)&msr_load_list[0])); + vmcs_write(VMCS_ENTRY_MSR_LOAD_COUNT, + nitems(msr_load_list)); + vmcs_write(VMCS_EXIT_MSR_STORE, 0); + vmcs_write(VMCS_EXIT_MSR_STORE_COUNT, 0); + } - vmx->state[i].lastcpu = -1; - vmx->state[i].vpid = vpid[i]; + /* exception bitmap */ + if (vcpu_trace_exceptions(vm, i)) + exc_bitmap = 0xffffffff; + else + exc_bitmap = 1 << IDT_MC; + error += vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap); -#ifndef __FreeBSD__ - msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count); + vmx->ctx[i].guest_dr6 = DBREG_DR6_RESERVED1; + error += vmwrite(VMCS_GUEST_DR7, DBREG_DR7_RESERVED1); - error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]), - guest_msr_count); - if (error != 0) - panic("vmcs_set_msr_save error %d", error); + if (virtual_interrupt_delivery) { + error += vmwrite(VMCS_APIC_ACCESS, APIC_ACCESS_ADDRESS); +#ifdef __FreeBSD__ + error += vmwrite(VMCS_VIRTUAL_APIC, + vtophys(&vmx->apic_page[i])); +#else + error += vmwrite(VMCS_VIRTUAL_APIC, apic_page_pa); +#endif + error += vmwrite(VMCS_EOI_EXIT0, 0); + error += vmwrite(VMCS_EOI_EXIT1, 0); + error += vmwrite(VMCS_EOI_EXIT2, 0); + error += vmwrite(VMCS_EOI_EXIT3, 0); + } + if (posted_interrupts) { + error += vmwrite(VMCS_PIR_VECTOR, pirvec); +#ifdef __FreeBSD__ + error += vmwrite(VMCS_PIR_DESC, + vtophys(&vmx->pir_desc[i])); +#else + error += vmwrite(VMCS_PIR_DESC, pir_desc_pa); +#endif + } + VMCLEAR(vmcs); + KASSERT(error == 0, ("vmx_vminit: error customizing the vmcs")); - host_msr_save_area_init(vmx->host_msrs[i], &host_msr_count); + vmx->cap[i].set = 0; + vmx->cap[i].proc_ctls = procbased_ctls; + vmx->cap[i].proc_ctls2 = procbased_ctls2; - error = vmcs_set_host_msr_save(&vmx->vmcs[i], - vtophys(vmx->host_msrs[i]), - host_msr_count); - if (error != 0) - panic("vmcs_set_msr_save error %d", error); -#endif + vmx->state[i].nextrip = ~0; + vmx->state[i].lastcpu = NOCPU; + vmx->state[i].vpid = vpid[i]; /* * Set up the CR0/4 shadows, and init the read shadow @@ -979,6 +1226,8 @@ vmx_vminit(struct vm *vm) error = vmx_setup_cr4_shadow(vmcs, 0); if (error != 0) panic("vmx_setup_cr4_shadow %d", error); + + vmx->ctx[i].pmap = pmap; } return (vmx); @@ -987,9 +1236,13 @@ vmx_vminit(struct vm *vm) static int vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx) { +#ifdef __FreeBSD__ int handled, func; - + func = vmxctx->guest_rax; +#else + int handled; +#endif handled = x86_emulate_cpuid(vm, vcpu, (uint32_t*)(&vmxctx->guest_rax), @@ -1016,6 +1269,8 @@ vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason, handled ? 
"handled" : "unhandled", exit_reason_to_str(exit_reason), rip); #endif + DTRACE_PROBE3(vmm__vexit, int, vcpu, uint64_t, rip, + uint32_t, exit_reason); } static __inline void @@ -1026,36 +1281,40 @@ vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip) #endif } -static void -vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) +static VMM_STAT_INTEL(VCPU_INVVPID_SAVED, "Number of vpid invalidations saved"); +static VMM_STAT_INTEL(VCPU_INVVPID_DONE, "Number of vpid invalidations done"); + +/* + * Invalidate guest mappings identified by its vpid from the TLB. + */ +static __inline void +vmx_invvpid(struct vmx *vmx, int vcpu, pmap_t pmap, int running) { struct vmxstate *vmxstate; - struct invvpid_desc invvpid_desc = { 0 }; -#ifndef __FreeBSD__ - desctbr_t idtr, gdtr; -#endif + struct invvpid_desc invvpid_desc; vmxstate = &vmx->state[vcpu]; - vmcs_write(VMCS_HOST_FS_BASE, vmm_get_host_fsbase()); - if (vmxstate->lastcpu == curcpu) + if (vmxstate->vpid == 0) return; - vmxstate->lastcpu = curcpu; - - vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); - - vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); - vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); - vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + if (!running) { + /* + * Set the 'lastcpu' to an invalid host cpu. + * + * This will invalidate TLB entries tagged with the vcpu's + * vpid the next time it runs via vmx_set_pcpu_defaults(). + */ + vmxstate->lastcpu = NOCPU; + return; + } -#ifndef __FreeBSD__ - vmcs_write(VMCS_HOST_IA32_SYSENTER_CS, rdmsr(MSR_SYSENTER_CS_MSR)); - vmcs_write(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(MSR_SYSENTER_ESP_MSR)); - vmcs_write(VMCS_HOST_IA32_SYSENTER_EIP, rdmsr(MSR_SYSENTER_EIP_MSR)); +#ifdef __FreeBSD__ + KASSERT(curthread->td_critnest > 0, ("%s: vcpu %d running outside " + "critical section", __func__, vcpu)); #endif /* - * If we are using VPIDs then invalidate all mappings tagged with 'vpid' + * Invalidate all mappings tagged with 'vpid' * * We do this because this vcpu was executing on a different host * cpu when it last ran. We do not track whether it invalidated @@ -1069,20 +1328,61 @@ vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu) * Note also that this will invalidate mappings tagged with 'vpid' * for "all" EP4TAs. */ - if (vmxstate->vpid != 0) { + if (pmap->pm_eptgen == vmx->eptgen[curcpu]) { + invvpid_desc._res1 = 0; + invvpid_desc._res2 = 0; invvpid_desc.vpid = vmxstate->vpid; + invvpid_desc.linear_addr = 0; invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc); + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_DONE, 1); + } else { + /* + * The invvpid can be skipped if an invept is going to + * be performed before entering the guest. The invept + * will invalidate combined mappings tagged with + * 'vmx->eptp' for all vpids. + */ + vmm_stat_incr(vmx->vm, vcpu, VCPU_INVVPID_SAVED, 1); } } -static void -vm_exit_update_rip(struct vm_exit *vmexit) +static void +vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu, pmap_t pmap) { - int error; + struct vmxstate *vmxstate; - error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length); - if (error) - panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error); +#ifndef __FreeBSD__ + /* + * Regardless of whether the VM appears to have migrated between CPUs, + * save the host sysenter stack pointer. As it points to the kernel + * stack of each thread, the correct value must be maintained for every + * trip into the critical section. 
+ */ + vmcs_write(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(MSR_SYSENTER_ESP_MSR)); + + /* + * Perform any needed TSC_OFFSET adjustment based on TSC_MSR writes or + * migration between host CPUs with differing TSC values. + */ + VERIFY0(vmx_apply_tsc_adjust(vmx, vcpu)); +#endif + + vmxstate = &vmx->state[vcpu]; + if (vmxstate->lastcpu == curcpu) + return; + + vmxstate->lastcpu = curcpu; + + vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1); + +#ifndef __FreeBSD__ + /* Load the per-CPU IDT address */ + vmcs_write(VMCS_HOST_IDTR_BASE, vmm_get_host_idtrbase()); +#endif + vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase()); + vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase()); + vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase()); + vmx_invvpid(vmx, vcpu, pmap, 1); } /* @@ -1090,7 +1390,7 @@ vm_exit_update_rip(struct vm_exit *vmexit) */ CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0); -static void __inline +static __inline void vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) { @@ -1101,23 +1401,18 @@ vmx_set_int_window_exiting(struct vmx *vmx, int vcpu) } } -static void __inline +static __inline void vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu) { -#ifdef __FreeBSD__ KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls)); -#else - KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0, - ("intr_window_exiting not set: %x", vmx->cap[vcpu].proc_ctls)); -#endif vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting"); } -static void __inline +static __inline void vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) { @@ -1128,22 +1423,18 @@ vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu) } } -static void __inline +static __inline void vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu) { -#ifdef __FreeBSD__ KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls)); -#else - KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0, - ("nmi_window_exiting not set %x", vmx->cap[vcpu].proc_ctls)); -#endif vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING; vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls); VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting"); } +#ifdef __FreeBSD__ int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) { @@ -1159,34 +1450,55 @@ vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset) return (error); } +#else /* __FreeBSD__ */ +/* + * Set the TSC adjustment, taking into account the offsets measured between + * host physical CPUs. This is required even if the guest has not set a TSC + * offset since vCPUs inherit the TSC offset of whatever physical CPU it has + * migrated onto. Without this mitigation, un-synched host TSCs will convey + * the appearance of TSC time-travel to the guest as its vCPUs migrate. 
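The offset arithmetic in vmx_apply_tsc_adjust() below deserves one worked example. The guest observes host_tsc + VMCS_TSC_OFFSET, and the offset written is the guest's own offset plus the per-CPU skew reported by tsc_gethrtime_tick_delta():

    /*
     * Say host CPU0's TSC reads 1000 while CPU1's reads 400, so the
     * recorded delta on CPU1 is +600, and the guest offset is -100.
     *
     *   on CPU0: guest TSC = 1000 + (-100 +   0) = 900
     *   on CPU1: guest TSC =  400 + (-100 + 600) = 900
     *
     * The delta cancels the host skew, so a migrating vCPU sees one
     * monotonic timeline rather than the "time-travel" noted above.
     */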
+ */ +static int +vmx_apply_tsc_adjust(struct vmx *vmx, int vcpu) +{ + extern hrtime_t tsc_gethrtime_tick_delta(void); + const uint64_t target_offset = (vcpu_tsc_offset(vmx->vm, vcpu) + + (uint64_t)tsc_gethrtime_tick_delta()); + int error = 0; + + ASSERT(vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET); + + if (vmx->tsc_offset_active[vcpu] != target_offset) { + error = vmwrite(VMCS_TSC_OFFSET, target_offset); + vmx->tsc_offset_active[vcpu] = target_offset; + } + + return (error); +} +#endif /* __FreeBSD__ */ #define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) #define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING) +#ifndef __FreeBSD__ +static uint32_t +vmx_inject_nmi(struct vmx *vmx, int vcpu) +#else static void vmx_inject_nmi(struct vmx *vmx, int vcpu) +#endif { uint32_t gi, info; gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); -#ifdef __FreeBSD__ KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " "interruptibility-state %#x", gi)); -#else - KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest " - "interruptibility-state %x", gi)); -#endif info = vmcs_read(VMCS_ENTRY_INTR_INFO); -#ifdef __FreeBSD__ KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " "VM-entry interruption information %#x", info)); -#else - KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid " - "VM-entry interruption information %x", info)); -#endif /* * Inject the virtual NMI. The vector must be the NMI IDT entry @@ -1199,32 +1511,220 @@ vmx_inject_nmi(struct vmx *vmx, int vcpu) /* Clear the request */ vm_nmi_clear(vmx->vm, vcpu); + +#ifndef __FreeBSD__ + return (info); +#endif } +#ifndef __FreeBSD__ static void -vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, + uint64_t guestrip) { - int vector, need_nmi_exiting, extint_pending; - uint64_t rflags, entryinfo; + uint64_t entryinfo, rflags; uint32_t gi, info; + int vector; + boolean_t extint_pending = B_FALSE; + + vlapic_tmr_update(vlapic); + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + info = vmcs_read(VMCS_ENTRY_INTR_INFO); + + if (vmx->state[vcpu].nextrip != guestrip && + (gi & HWINTR_BLOCKING) != 0) { + VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vmx->state[vcpu].nextrip, guestrip); + gi &= ~HWINTR_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); + } + + /* + * It could be that an interrupt is already pending for injection from + * the VMCS. This would be the case if the vCPU exited for conditions + * such as an AST before a vm-entry delivered the injection. + */ + if ((info & VMCS_INTR_VALID) != 0) { + goto cantinject; + } if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { -#ifdef __FreeBSD__ KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " "intinfo is not valid: %#lx", __func__, entryinfo)); + + KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " + "pending exception: %#lx/%#x", __func__, entryinfo, info)); + + info = entryinfo; + vector = info & 0xff; + if (vector == IDT_BP || vector == IDT_OF) { + /* + * VT-x requires #BP and #OF to be injected as software + * exceptions. 
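Two related rules show up around the #BP/#OF re-typing just below: VT-x requires those vectors to be entered as software exceptions, and software exceptions are trap-like, so VM entry also needs a valid instruction length for the guest's saved %rip to land past the INT3/INTO. Both pieces appear in this file; side by side:

    if (vector == IDT_BP || vector == IDT_OF) {
            /* re-type as a software exception for VM entry */
            info &= ~VMCS_INTR_T_MASK;
            info |= VMCS_INTR_T_SWEXCEPTION;
    }

    /* later, when reflecting such an event (vmx_exit_process()): */
    vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length);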
+ */ + info &= ~VMCS_INTR_T_MASK; + info |= VMCS_INTR_T_SWEXCEPTION; + } + + if (info & VMCS_INTR_DEL_ERRCODE) + vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32); + + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + } + + if (vm_nmi_pending(vmx->vm, vcpu)) { + int need_nmi_exiting = 1; + + /* + * If there are no conditions blocking NMI injection then + * inject it directly here otherwise enable "NMI window + * exiting" to inject it as soon as we can. + * + * We also check for STI_BLOCKING because some implementations + * don't allow NMI injection in this case. If we are running + * on a processor that doesn't have this restriction it will + * immediately exit and the NMI will be injected in the + * "NMI window exiting" handler. + */ + if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) { + if ((info & VMCS_INTR_VALID) == 0) { + info = vmx_inject_nmi(vmx, vcpu); + need_nmi_exiting = 0; + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI " + "due to VM-entry intr info %#x", info); + } + } else { + VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to " + "Guest Interruptibility-state %#x", gi); + } + + if (need_nmi_exiting) { + vmx_set_nmi_window_exiting(vmx, vcpu); + return; + } + } + + /* Check the AT-PIC and APIC for interrupts. */ + if (vm_extint_pending(vmx->vm, vcpu)) { + /* Ask the legacy pic for a vector to inject */ + vatpic_pending_intr(vmx->vm, &vector); + extint_pending = B_TRUE; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [0,255] can be delivered + * through the INTR pin. + */ + KASSERT(vector >= 0 && vector <= 255, + ("invalid vector %d from INTR", vector)); + } else if (!virtual_interrupt_delivery) { + /* Ask the local apic for a vector to inject */ + if (!vlapic_pending_intr(vlapic, &vector)) + return; + + /* + * From the Intel SDM, Volume 3, Section "Maskable + * Hardware Interrupts": + * - maskable interrupt vectors [16,255] can be delivered + * through the local APIC. + */ + KASSERT(vector >= 16 && vector <= 255, + ("invalid vector %d from local APIC", vector)); + } else { + /* No futher injection needed */ + return; + } + + /* + * Verify that the guest is interruptable and the above logic has not + * already queued an event for injection. + */ + if ((gi & HWINTR_BLOCKING) != 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "Guest Interruptibility-state %#x", vector, gi); + goto cantinject; + } + if ((info & VMCS_INTR_VALID) != 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "VM-entry intr info %#x", vector, info); + goto cantinject; + } + rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if ((rflags & PSL_I) == 0) { + VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to " + "rflags %#lx", vector, rflags); + goto cantinject; + } + + /* Inject the interrupt */ + info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID; + info |= vector; + vmcs_write(VMCS_ENTRY_INTR_INFO, info); + + if (extint_pending) { + vm_extint_clear(vmx->vm, vcpu); + vatpic_intr_accepted(vmx->vm, vector); + + /* + * After we accepted the current ExtINT the PIC may + * have posted another one. If that is the case, set + * the Interrupt Window Exiting execution control so + * we can inject that one too. + * + * Also, interrupt window exiting allows us to inject any + * pending APIC vector that was preempted by the ExtINT + * as soon as possible. This applies both for the software + * emulated vlapic and the hardware assisted virtual APIC. 
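The rewritten illumos injection path above checks three independent gates before committing a vector to VMCS_ENTRY_INTR_INFO. The function keeps them separate so each failure can be logged and fall through to the interrupt-window path, but collapsed into one predicate (illustrative only) the logic is:

    /* the three gates guarding injection of 'vector' */
    boolean_t can_inject =
        (gi & HWINTR_BLOCKING) == 0 &&      /* no STI / MOV-SS shadow */
        (info & VMCS_INTR_VALID) == 0 &&    /* nothing already queued */
        (rflags & PSL_I) != 0;              /* guest IF is set */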
+ */ + vmx_set_int_window_exiting(vmx, vcpu); + } else { + /* Update the Local APIC ISR */ + vlapic_intr_accepted(vlapic, vector); + } + + VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector); + return; + +cantinject: + /* + * Set the Interrupt Window Exiting execution control so we can inject + * the interrupt as soon as blocking condition goes away. + */ + vmx_set_int_window_exiting(vmx, vcpu); +} #else +static void +vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic, + uint64_t guestrip) +{ + int vector, need_nmi_exiting, extint_pending; + uint64_t rflags, entryinfo; + uint32_t gi, info; + + vlapic_tmr_update(vlapic); + + if (vmx->state[vcpu].nextrip != guestrip) { + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + if (gi & HWINTR_BLOCKING) { + VCPU_CTR2(vmx->vm, vcpu, "Guest interrupt blocking " + "cleared due to rip change: %#lx/%#lx", + vmx->state[vcpu].nextrip, guestrip); + gi &= ~HWINTR_BLOCKING; + vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); + } + } + + if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) { KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry " - "intinfo is not valid: %lx", __func__, entryinfo)); -#endif + "intinfo is not valid: %#lx", __func__, entryinfo)); info = vmcs_read(VMCS_ENTRY_INTR_INFO); -#ifdef __FreeBSD__ KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " "pending exception: %#lx/%#x", __func__, entryinfo, info)); -#else - KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject " - "pending exception: %lx/%x", __func__, entryinfo, info)); -#endif info = entryinfo; vector = info & 0xff; @@ -1277,12 +1777,10 @@ vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic) extint_pending = vm_extint_pending(vmx->vm, vcpu); -#ifdef __FreeBSD__ if (!extint_pending && virtual_interrupt_delivery) { vmx_inject_pir(vlapic); return; } -#endif /* * If interrupt-window exiting is already in effect then don't bother @@ -1388,6 +1886,7 @@ cantinject: */ vmx_set_int_window_exiting(vmx, vcpu); } +#endif /* __FreeBSD__ */ /* * If the Virtual NMIs execution control is '1' then the logical processor @@ -1420,6 +1919,92 @@ vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid) vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi); } +static void +vmx_assert_nmi_blocking(struct vmx *vmx, int vcpuid) +{ + uint32_t gi; + + gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY); + KASSERT(gi & VMCS_INTERRUPTIBILITY_NMI_BLOCKING, + ("NMI blocking is not in effect %#x", gi)); +} + +static int +vmx_emulate_xsetbv(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) +{ + struct vmxctx *vmxctx; + uint64_t xcrval; + const struct xsave_limits *limits; + + vmxctx = &vmx->ctx[vcpu]; + limits = vmm_get_xsave_limits(); + + /* + * Note that the processor raises a GP# fault on its own if + * xsetbv is executed for CPL != 0, so we do not have to + * emulate that fault here. + */ + + /* Only xcr0 is supported. */ + if (vmxctx->guest_rcx != 0) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* We only handle xcr0 if both the host and guest have XSAVE enabled. */ + if (!limits->xsave_enabled || !(vmcs_read(VMCS_GUEST_CR4) & CR4_XSAVE)) { + vm_inject_ud(vmx->vm, vcpu); + return (HANDLED); + } + + xcrval = vmxctx->guest_rdx << 32 | (vmxctx->guest_rax & 0xffffffff); + if ((xcrval & ~limits->xcr0_allowed) != 0) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + if (!(xcrval & XFEATURE_ENABLED_X87)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* AVX (YMM_Hi128) requires SSE. 
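The xsetbv checks that follow encode the architectural XCR0 dependency rules. Summarized below (assuming, per the usual x86 headers rather than this hunk, that XFEATURE_AVX is the SSE|AVX pair and XFEATURE_AVX512 covers opmask, zmm_hi256 and hi16_zmm):

    /* invariants enforced, one per check in vmx_emulate_xsetbv():
     *   %rcx == 0                   only XCR0 is virtualized
     *   (xcrval & ~allowed) == 0    host must permit every bit
     *   x87 bit set                 architecturally mandatory
     *   AVX  -> SSE                 YMM state builds on XMM state
     *   AVX512 -> AVX, all 3 bits   no partial AVX-512 enables
     *   BNDREGS <-> BNDCSR          MPX flags move in lockstep */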
*/ + if (xcrval & XFEATURE_ENABLED_AVX && + (xcrval & XFEATURE_AVX) != XFEATURE_AVX) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * AVX512 requires base AVX (YMM_Hi128) as well as OpMask, + * ZMM_Hi256, and Hi16_ZMM. + */ + if (xcrval & XFEATURE_AVX512 && + (xcrval & (XFEATURE_AVX512 | XFEATURE_AVX)) != + (XFEATURE_AVX512 | XFEATURE_AVX)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * Intel MPX requires both bound register state flags to be + * set. + */ + if (((xcrval & XFEATURE_ENABLED_BNDREGS) != 0) != + ((xcrval & XFEATURE_ENABLED_BNDCSR) != 0)) { + vm_inject_gp(vmx->vm, vcpu); + return (HANDLED); + } + + /* + * This runs "inside" vmrun() with the guest's FPU state, so + * modifying xcr0 directly modifies the guest's xcr0, not the + * host's. + */ + load_xcr(0, xcrval); + return (HANDLED); +} + static uint64_t vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident) { @@ -1734,6 +2319,7 @@ vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla) paging = &vmexit->u.inst_emul.paging; vmexit->exitcode = VM_EXITCODE_INST_EMUL; + vmexit->inst_length = 0; vmexit->u.inst_emul.gpa = gpa; vmexit->u.inst_emul.gla = gla; vmx_paging_info(paging); @@ -1799,6 +2385,189 @@ ept_emulation_fault(uint64_t ept_qual) return (TRUE); } +static __inline int +apic_access_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) ? 1 : 0); +} + +static __inline int +x2apic_virtualization(struct vmx *vmx, int vcpuid) +{ + uint32_t proc_ctls2; + + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + return ((proc_ctls2 & PROCBASED2_VIRTUALIZE_X2APIC_MODE) ? 1 : 0); +} + +static int +vmx_handle_apic_write(struct vmx *vmx, int vcpuid, struct vlapic *vlapic, + uint64_t qual) +{ + int error, handled, offset; + uint32_t *apic_regs, vector; + bool retu; + + handled = HANDLED; + offset = APIC_WRITE_OFFSET(qual); + + if (!apic_access_virtualization(vmx, vcpuid)) { + /* + * In general there should not be any APIC write VM-exits + * unless APIC-access virtualization is enabled. + * + * However self-IPI virtualization can legitimately trigger + * an APIC-write VM-exit so treat it specially. + */ + if (x2apic_virtualization(vmx, vcpuid) && + offset == APIC_OFFSET_SELF_IPI) { + apic_regs = (uint32_t *)(vlapic->apic_page); + vector = apic_regs[APIC_OFFSET_SELF_IPI / 4]; + vlapic_self_ipi_handler(vlapic, vector); + return (HANDLED); + } else + return (UNHANDLED); + } + + switch (offset) { + case APIC_OFFSET_ID: + vlapic_id_write_handler(vlapic); + break; + case APIC_OFFSET_LDR: + vlapic_ldr_write_handler(vlapic); + break; + case APIC_OFFSET_DFR: + vlapic_dfr_write_handler(vlapic); + break; + case APIC_OFFSET_SVR: + vlapic_svr_write_handler(vlapic); + break; + case APIC_OFFSET_ESR: + vlapic_esr_write_handler(vlapic); + break; + case APIC_OFFSET_ICR_LOW: + retu = false; + error = vlapic_icrlo_write_handler(vlapic, &retu); + if (error != 0 || retu) + handled = UNHANDLED; + break; + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_LVT ... 
APIC_OFFSET_ERROR_LVT: + vlapic_lvt_write_handler(vlapic, offset); + break; + case APIC_OFFSET_TIMER_ICR: + vlapic_icrtmr_write_handler(vlapic); + break; + case APIC_OFFSET_TIMER_DCR: + vlapic_dcr_write_handler(vlapic); + break; + default: + handled = UNHANDLED; + break; + } + return (handled); +} + +static bool +apic_access_fault(struct vmx *vmx, int vcpuid, uint64_t gpa) +{ + + if (apic_access_virtualization(vmx, vcpuid) && + (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE)) + return (true); + else + return (false); +} + +static int +vmx_handle_apic_access(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) +{ + uint64_t qual; + int access_type, offset, allowed; + + if (!apic_access_virtualization(vmx, vcpuid)) + return (UNHANDLED); + + qual = vmexit->u.vmx.exit_qualification; + access_type = APIC_ACCESS_TYPE(qual); + offset = APIC_ACCESS_OFFSET(qual); + + allowed = 0; + if (access_type == 0) { + /* + * Read data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } else if (access_type == 1) { + /* + * Write data access to the following registers is expected. + */ + switch (offset) { + case APIC_OFFSET_VER: + case APIC_OFFSET_APR: + case APIC_OFFSET_PPR: + case APIC_OFFSET_RRR: + case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7: + case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7: + case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7: + case APIC_OFFSET_CMCI_LVT: + case APIC_OFFSET_TIMER_CCR: + allowed = 1; + break; + default: + break; + } + } + + if (allowed) { + vmexit_inst_emul(vmexit, DEFAULT_APIC_BASE + offset, + VIE_INVALID_GLA); + } + + /* + * Regardless of whether the APIC-access is allowed this handler + * always returns UNHANDLED: + * - if the access is allowed then it is handled by emulating the + * instruction that caused the VM-exit (outside the critical section) + * - if the access is not allowed then it will be converted to an + * exitcode of VM_EXITCODE_VMX and will be dealt with in userland. 
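Both APIC handlers above start from the exit qualification, which packs the access type and the page offset; the dispatch reduces to two macro extractions:

    uint64_t qual = vmexit->u.vmx.exit_qualification;
    int type = APIC_ACCESS_TYPE(qual);      /* 0 = data read, 1 = data write */
    int off  = APIC_ACCESS_OFFSET(qual);    /* byte offset into the APIC page */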
+ */ + return (UNHANDLED); +} + +static enum task_switch_reason +vmx_task_switch_reason(uint64_t qual) +{ + int reason; + + reason = (qual >> 30) & 0x3; + switch (reason) { + case 0: + return (TSR_CALL); + case 1: + return (TSR_IRET); + case 2: + return (TSR_JMP); + case 3: + return (TSR_IDT_GATE); + default: + panic("%s: invalid reason %d", __func__, reason); + } +} + static int emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) { @@ -1839,31 +2608,150 @@ emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu) return (error); } +#ifndef __FreeBSD__ +#define __predict_false(x) (x) +#endif + static int vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) { - int error, handled, in; - struct vmcs *vmcs; + int error, errcode, errcode_valid, handled, in; struct vmxctx *vmxctx; + struct vlapic *vlapic; struct vm_inout_str *vis; - uint32_t eax, ecx, edx, idtvec_info, intr_info, inst_info; - uint64_t qual, gla, gpa, cr3; + struct vm_task_switch *ts; + uint32_t eax, ecx, edx, idtvec_info, idtvec_err, intr_info, inst_info; + uint32_t intr_type, intr_vec, reason; + uint64_t exitintinfo, qual, gpa; bool retu; CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0); CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0); handled = UNHANDLED; - vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; + qual = vmexit->u.vmx.exit_qualification; + reason = vmexit->u.vmx.exit_reason; vmexit->exitcode = VM_EXITCODE_BOGUS; vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1); + SDT_PROBE3(vmm, vmx, exit, entry, vmx, vcpu, vmexit); + + /* + * VM-entry failures during or after loading guest state. + * + * These VM-exits are uncommon but must be handled specially + * as most VM-exit fields are not populated as usual. + */ + if (__predict_false(reason == EXIT_REASON_MCE_DURING_ENTRY)) { + VCPU_CTR0(vmx->vm, vcpu, "Handling MCE during VM-entry"); +#ifdef __FreeBSD__ + __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif + return (1); + } + + /* + * VM exits that can be triggered during event delivery need to + * be handled specially by re-injecting the event if the IDT + * vectoring information field's valid bit is set. + * + * See "Information for VM Exits During Event Delivery" in Intel SDM + * for details. + */ + idtvec_info = vmcs_idt_vectoring_info(); + if (idtvec_info & VMCS_IDT_VEC_VALID) { + idtvec_info &= ~(1 << 12); /* clear undefined bit */ + exitintinfo = idtvec_info; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + idtvec_err = vmcs_idt_vectoring_err(); + exitintinfo |= (uint64_t)idtvec_err << 32; + } + error = vm_exit_intinfo(vmx->vm, vcpu, exitintinfo); + KASSERT(error == 0, ("%s: vm_set_intinfo error %d", + __func__, error)); + + /* + * If 'virtual NMIs' are being used and the VM-exit + * happened while injecting an NMI during the previous + * VM-entry, then clear "blocking by NMI" in the + * Guest Interruptibility-State so the NMI can be + * reinjected on the subsequent VM-entry. + * + * However, if the NMI was being delivered through a task + * gate, then the new task must start execution with NMIs + * blocked so don't clear NMI blocking in this case. + */ + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type == VMCS_INTR_T_NMI) { + if (reason != EXIT_REASON_TASK_SWITCH) + vmx_clear_nmi_blocking(vmx, vcpu); + else + vmx_assert_nmi_blocking(vmx, vcpu); + } + + /* + * Update VM-entry instruction length if the event being + * delivered was a software interrupt or software exception. 
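A concrete failure mode motivates the IDT-vectoring handling above:

    /*
     * Example: the CPU begins delivering a hardware interrupt to the
     * guest, and the stack push during IDT delivery takes an EPT
     * fault. The VM exits mid-delivery: the vector sits in the
     * IDT-vectoring info field, not yet in the guest. vm_exit_intinfo()
     * stashes it so the next VM entry re-injects it once the fault is
     * resolved; without this, the interrupt would simply be lost.
     */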
+ */ + if (intr_type == VMCS_INTR_T_SWINTR || + intr_type == VMCS_INTR_T_PRIV_SWEXCEPTION || + intr_type == VMCS_INTR_T_SWEXCEPTION) { + vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); + } + } - switch (vmexit->u.vmx.exit_reason) { + switch (reason) { + case EXIT_REASON_TASK_SWITCH: + ts = &vmexit->u.task_switch; + ts->tsssel = qual & 0xffff; + ts->reason = vmx_task_switch_reason(qual); + ts->ext = 0; + ts->errcode_valid = 0; + vmx_paging_info(&ts->paging); + /* + * If the task switch was due to a CALL, JMP, IRET, software + * interrupt (INT n) or software exception (INT3, INTO), + * then the saved %rip references the instruction that caused + * the task switch. The instruction length field in the VMCS + * is valid in this case. + * + * In all other cases (e.g., NMI, hardware exception) the + * saved %rip is one that would have been saved in the old TSS + * had the task switch completed normally so the instruction + * length field is not needed in this case and is explicitly + * set to 0. + */ + if (ts->reason == TSR_IDT_GATE) { + KASSERT(idtvec_info & VMCS_IDT_VEC_VALID, + ("invalid idtvec_info %#x for IDT task switch", + idtvec_info)); + intr_type = idtvec_info & VMCS_INTR_T_MASK; + if (intr_type != VMCS_INTR_T_SWINTR && + intr_type != VMCS_INTR_T_SWEXCEPTION && + intr_type != VMCS_INTR_T_PRIV_SWEXCEPTION) { + /* Task switch triggered by external event */ + ts->ext = 1; + vmexit->inst_length = 0; + if (idtvec_info & VMCS_IDT_VEC_ERRCODE_VALID) { + ts->errcode_valid = 1; + ts->errcode = vmcs_idt_vectoring_err(); + } + } + } + vmexit->exitcode = VM_EXITCODE_TASK_SWITCH; + SDT_PROBE4(vmm, vmx, exit, taskswitch, vmx, vcpu, vmexit, ts); + VCPU_CTR4(vmx->vm, vcpu, "task switch reason %d, tss 0x%04x, " + "%s errcode 0x%016lx", ts->reason, ts->tsssel, + ts->ext ? 
"external" : "internal", + ((uint64_t)ts->errcode << 32) | ts->errcode_valid); + break; case EXIT_REASON_CR_ACCESS: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1); + SDT_PROBE4(vmm, vmx, exit, craccess, vmx, vcpu, vmexit, qual); switch (qual & 0xf) { case 0: handled = vmx_emulate_cr0_access(vmx, vcpu, qual); @@ -1881,6 +2769,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) retu = false; ecx = vmxctx->guest_rcx; VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx); + SDT_PROBE4(vmm, vmx, exit, rdmsr, vmx, vcpu, vmexit, ecx); error = emulate_rdmsr(vmx, vcpu, ecx, &retu); if (error) { vmexit->exitcode = VM_EXITCODE_RDMSR; @@ -1901,6 +2790,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) edx = vmxctx->guest_rdx; VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx", ecx, (uint64_t)edx << 32 | eax); + SDT_PROBE5(vmm, vmx, exit, wrmsr, vmx, vmexit, vcpu, ecx, + (uint64_t)edx << 32 | eax); error = emulate_wrmsr(vmx, vcpu, ecx, (uint64_t)edx << 32 | eax, &retu); if (error) { @@ -1917,19 +2808,29 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) break; case EXIT_REASON_HLT: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1); + SDT_PROBE3(vmm, vmx, exit, halt, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_HLT; vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS); + if (virtual_interrupt_delivery) + vmexit->u.hlt.intr_status = + vmcs_read(VMCS_GUEST_INTR_STATUS); + else + vmexit->u.hlt.intr_status = 0; break; case EXIT_REASON_MTF: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1); + SDT_PROBE3(vmm, vmx, exit, mtrap, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_MTRAP; + vmexit->inst_length = 0; break; case EXIT_REASON_PAUSE: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1); + SDT_PROBE3(vmm, vmx, exit, pause, vmx, vcpu, vmexit); vmexit->exitcode = VM_EXITCODE_PAUSE; break; case EXIT_REASON_INTR_WINDOW: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1); + SDT_PROBE3(vmm, vmx, exit, intrwindow, vmx, vcpu, vmexit); vmx_clear_int_window_exiting(vmx, vcpu); return (1); case EXIT_REASON_EXT_INTR: @@ -1943,6 +2844,8 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) * this virtual interrupt during the subsequent VM enter. */ intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + SDT_PROBE4(vmm, vmx, exit, interrupt, + vmx, vcpu, vmexit, intr_info); /* * XXX: Ignore this exit if VMCS_INTR_VALID is not set. @@ -1950,18 +2853,10 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) */ if (!(intr_info & VMCS_INTR_VALID)) return (1); -#ifdef __FreeBSD__ KASSERT((intr_info & VMCS_INTR_VALID) != 0 && (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, ("VM exit interruption info invalid: %#x", intr_info)); -#else - KASSERT((intr_info & VMCS_INTR_VALID) != 0 && - (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR, - ("VM exit interruption info invalid: %x", intr_info)); -#endif -#if 0 /* XXX */ vmx_trigger_hostintr(intr_info & 0xff); -#endif /* * This is special. 
We want to treat this as a 'handled' @@ -1970,6 +2865,7 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1); return (1); case EXIT_REASON_NMI_WINDOW: + SDT_PROBE3(vmm, vmx, exit, nmiwindow, vmx, vcpu, vmexit); /* Exit to allow the pending virtual NMI to be injected */ if (vm_nmi_pending(vmx->vm, vcpu)) vmx_inject_nmi(vmx, vcpu); @@ -1997,21 +2893,21 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) vis->addrsize = inout_str_addrsize(inst_info); inout_str_seginfo(vmx, vcpu, inst_info, in, vis); } + SDT_PROBE3(vmm, vmx, exit, inout, vmx, vcpu, vmexit); break; case EXIT_REASON_CPUID: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1); + SDT_PROBE3(vmm, vmx, exit, cpuid, vmx, vcpu, vmexit); handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx); break; case EXIT_REASON_EXCEPTION: vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1); intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); -#ifdef __FreeBSD__ KASSERT((intr_info & VMCS_INTR_VALID) != 0, ("VM exit interruption info invalid: %#x", intr_info)); -#else - KASSERT((intr_info & VMCS_INTR_VALID) != 0, - ("VM exit interruption info invalid: %x", intr_info)); -#endif + + intr_vec = intr_info & 0xff; + intr_type = intr_info & VMCS_INTR_T_MASK; /* * If Virtual NMIs control is 1 and the VM-exit is due to a @@ -2020,26 +2916,147 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) * the guest. * * See "Resuming Guest Software after Handling an Exception". + * See "Information for VM Exits Due to Vectored Events". */ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && - (intr_info & 0xff) != IDT_DF && + (intr_vec != IDT_DF) && (intr_info & EXIT_QUAL_NMIUDTI) != 0) vmx_restore_nmi_blocking(vmx, vcpu); /* * The NMI has already been handled in vmx_exit_handle_nmi(). */ - if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) + if (intr_type == VMCS_INTR_T_NMI) return (1); - break; + + /* + * Call the machine check handler by hand. Also don't reflect + * the machine check back into the guest. + */ + if (intr_vec == IDT_MC) { + VCPU_CTR0(vmx->vm, vcpu, "Vectoring to MCE handler"); +#ifdef __FreeBSD__ + __asm __volatile("int $18"); +#else + vmm_call_trap(T_MCE); +#endif + return (1); + } + + if (intr_vec == IDT_PF) { + error = vmxctx_setreg(vmxctx, VM_REG_GUEST_CR2, qual); + KASSERT(error == 0, ("%s: vmxctx_setreg(cr2) error %d", + __func__, error)); + } + + /* + * Software exceptions exhibit trap-like behavior. This in + * turn requires populating the VM-entry instruction length + * so that the %rip in the trap frame is past the INT3/INTO + * instruction. + */ + if (intr_type == VMCS_INTR_T_SWEXCEPTION) + vmcs_write(VMCS_ENTRY_INST_LENGTH, vmexit->inst_length); + + /* Reflect all other exceptions back into the guest */ + errcode_valid = errcode = 0; + if (intr_info & VMCS_INTR_DEL_ERRCODE) { + errcode_valid = 1; + errcode = vmcs_read(VMCS_EXIT_INTR_ERRCODE); + } + VCPU_CTR2(vmx->vm, vcpu, "Reflecting exception %d/%#x into " + "the guest", intr_vec, errcode); + SDT_PROBE5(vmm, vmx, exit, exception, + vmx, vcpu, vmexit, intr_vec, errcode); + error = vm_inject_exception(vmx->vm, vcpu, intr_vec, + errcode_valid, errcode, 0); + KASSERT(error == 0, ("%s: vm_inject_exception error %d", + __func__, error)); + return (1); + case EXIT_REASON_EPT_FAULT: + /* + * If 'gpa' lies within the address space allocated to + * memory, then this must be a nested page fault; otherwise + * this must be an instruction that accesses MMIO space.
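 * A minimal sketch of decoding such an exit qualification,
 * assuming only the architectural bit layout from the Intel SDM
 * (hypothetical helper name; the real ept_fault_type() lives
 * elsewhere in vmm):
 */
#if 0
static int
sketch_ept_fault_type(uint64_t qual)
{
	if (qual & (1UL << 1))		/* bit 1: write access */
		return (VM_PROT_WRITE);
	if (qual & (1UL << 2))		/* bit 2: instruction fetch */
		return (VM_PROT_EXECUTE);
	return (VM_PROT_READ);
}
#endif
/*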
+ */ gpa = vmcs_gpa(); - if (ept_emulation_fault(qual)) { + if (vm_mem_allocated(vmx->vm, vcpu, gpa) || + apic_access_fault(vmx, vcpu, gpa)) { + vmexit->exitcode = VM_EXITCODE_PAGING; + vmexit->inst_length = 0; + vmexit->u.paging.gpa = gpa; + vmexit->u.paging.fault_type = ept_fault_type(qual); + vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NESTED_FAULT, 1); + SDT_PROBE5(vmm, vmx, exit, nestedfault, + vmx, vcpu, vmexit, gpa, qual); + } else if (ept_emulation_fault(qual)) { vmexit_inst_emul(vmexit, gpa, vmcs_gla()); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1); + SDT_PROBE4(vmm, vmx, exit, mmiofault, + vmx, vcpu, vmexit, gpa); } + /* + * If Virtual NMIs control is 1 and the VM-exit is due to an + * EPT fault during the execution of IRET then we must restore + * the state of "virtual-NMI blocking" before resuming. + * + * See description of "NMI unblocking due to IRET" in + * "Exit Qualification for EPT Violations". + */ + if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 && + (qual & EXIT_QUAL_NMIUDTI) != 0) + vmx_restore_nmi_blocking(vmx, vcpu); + break; + case EXIT_REASON_VIRTUALIZED_EOI: + vmexit->exitcode = VM_EXITCODE_IOAPIC_EOI; + vmexit->u.ioapic_eoi.vector = qual & 0xFF; + SDT_PROBE3(vmm, vmx, exit, eoi, vmx, vcpu, vmexit); + vmexit->inst_length = 0; /* trap-like */ + break; + case EXIT_REASON_APIC_ACCESS: + SDT_PROBE3(vmm, vmx, exit, apicaccess, vmx, vcpu, vmexit); + handled = vmx_handle_apic_access(vmx, vcpu, vmexit); + break; + case EXIT_REASON_APIC_WRITE: + /* + * APIC-write VM exit is trap-like so the %rip is already + * pointing to the next instruction. + */ + vmexit->inst_length = 0; + vlapic = vm_lapic(vmx->vm, vcpu); + SDT_PROBE4(vmm, vmx, exit, apicwrite, + vmx, vcpu, vmexit, vlapic); + handled = vmx_handle_apic_write(vmx, vcpu, vlapic, qual); + break; + case EXIT_REASON_XSETBV: + SDT_PROBE3(vmm, vmx, exit, xsetbv, vmx, vcpu, vmexit); + handled = vmx_emulate_xsetbv(vmx, vcpu, vmexit); + break; + case EXIT_REASON_MONITOR: + SDT_PROBE3(vmm, vmx, exit, monitor, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_MONITOR; + break; + case EXIT_REASON_MWAIT: + SDT_PROBE3(vmm, vmx, exit, mwait, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_MWAIT; + break; + case EXIT_REASON_VMCALL: + case EXIT_REASON_VMCLEAR: + case EXIT_REASON_VMLAUNCH: + case EXIT_REASON_VMPTRLD: + case EXIT_REASON_VMPTRST: + case EXIT_REASON_VMREAD: + case EXIT_REASON_VMRESUME: + case EXIT_REASON_VMWRITE: + case EXIT_REASON_VMXOFF: + case EXIT_REASON_VMXON: + SDT_PROBE3(vmm, vmx, exit, vminsn, vmx, vcpu, vmexit); + vmexit->exitcode = VM_EXITCODE_VMINSN; break; default: + SDT_PROBE4(vmm, vmx, exit, unknown, + vmx, vcpu, vmexit, reason); vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1); break; } @@ -2055,17 +3072,9 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) * the one we just processed. Therefore we update the * guest rip in the VMCS and in 'vmexit'. */ - vm_exit_update_rip(vmexit); vmexit->rip += vmexit->inst_length; vmexit->inst_length = 0; - - /* - * Special case for spinning up an AP - exit to userspace to - * give the controlling process a chance to intercept and - * spin up a thread for the AP. 
- */ - if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP) - handled = 0; + vmcs_write(VMCS_GUEST_RIP, vmexit->rip); } else { if (vmexit->exitcode == VM_EXITCODE_BOGUS) { /* @@ -2083,91 +3092,340 @@ vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit) */ } } + + SDT_PROBE4(vmm, vmx, exit, return, + vmx, vcpu, vmexit, handled); return (handled); } +static void +vmx_exit_inst_error(struct vmxctx *vmxctx, int rc, struct vm_exit *vmexit) +{ + + KASSERT(vmxctx->inst_fail_status != VM_SUCCESS, + ("vmx_exit_inst_error: invalid inst_fail_status %d", + vmxctx->inst_fail_status)); + + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = vmxctx->inst_fail_status; + vmexit->u.vmx.inst_error = vmcs_instruction_error(); + vmexit->u.vmx.exit_reason = ~0; + vmexit->u.vmx.exit_qualification = ~0; + + switch (rc) { + case VMX_VMRESUME_ERROR: + case VMX_VMLAUNCH_ERROR: + case VMX_INVEPT_ERROR: +#ifndef __FreeBSD__ + case VMX_VMWRITE_ERROR: +#endif + vmexit->u.vmx.inst_type = rc; + break; + default: + panic("vm_exit_inst_error: vmx_enter_guest returned %d", rc); + } +} + +/* + * If the NMI-exiting VM execution control is set to '1' then an NMI in + * non-root operation causes a VM-exit. NMI blocking is in effect so it is + * sufficient to simply vector to the NMI handler via a software interrupt. + * However, this must be done before maskable interrupts are enabled + * otherwise the "iret" issued by an interrupt handler will incorrectly + * clear NMI blocking. + */ +static __inline void +vmx_exit_handle_nmi(struct vmx *vmx, int vcpuid, struct vm_exit *vmexit) +{ + uint32_t intr_info; + + KASSERT((read_rflags() & PSL_I) == 0, ("interrupts enabled")); + + if (vmexit->u.vmx.exit_reason != EXIT_REASON_EXCEPTION) + return; + + intr_info = vmcs_read(VMCS_EXIT_INTR_INFO); + KASSERT((intr_info & VMCS_INTR_VALID) != 0, + ("VM exit interruption info invalid: %#x", intr_info)); + + if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI) { + KASSERT((intr_info & 0xff) == IDT_NMI, ("VM exit due " + "to NMI has invalid vector: %#x", intr_info)); + VCPU_CTR0(vmx->vm, vcpuid, "Vectoring to NMI handler"); +#ifdef __FreeBSD__ + __asm __volatile("int $2"); +#else + vmm_call_trap(T_NMIFLT); +#endif + } +} + +static __inline void +vmx_dr_enter_guest(struct vmxctx *vmxctx) +{ + register_t rflags; + + /* Save host control debug registers. */ + vmxctx->host_dr7 = rdr7(); + vmxctx->host_debugctl = rdmsr(MSR_DEBUGCTLMSR); + + /* + * Disable debugging in DR7 and DEBUGCTL to avoid triggering + * exceptions in the host based on the guest DRx values. The + * guest DR7 and DEBUGCTL are saved/restored in the VMCS. + */ + load_dr7(0); + wrmsr(MSR_DEBUGCTLMSR, 0); + + /* + * Disable single stepping the kernel to avoid corrupting the + * guest DR6. A debugger might still be able to corrupt the + * guest DR6 by setting a breakpoint after this point and then + * single stepping. + */ + rflags = read_rflags(); + vmxctx->host_tf = rflags & PSL_T; + write_rflags(rflags & ~PSL_T); + + /* Save host debug registers. */ + vmxctx->host_dr0 = rdr0(); + vmxctx->host_dr1 = rdr1(); + vmxctx->host_dr2 = rdr2(); + vmxctx->host_dr3 = rdr3(); + vmxctx->host_dr6 = rdr6(); + + /* Restore guest debug registers. */ + load_dr0(vmxctx->guest_dr0); + load_dr1(vmxctx->guest_dr1); + load_dr2(vmxctx->guest_dr2); + load_dr3(vmxctx->guest_dr3); + load_dr6(vmxctx->guest_dr6); +} + +static __inline void +vmx_dr_leave_guest(struct vmxctx *vmxctx) +{ + + /* Save guest debug registers. 
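 * The guest's DR0-DR3 and DR6 values are still live in hardware
 * at this point, so they must be read back before the host's
 * values are reloaded below.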
*/ + vmxctx->guest_dr0 = rdr0(); + vmxctx->guest_dr1 = rdr1(); + vmxctx->guest_dr2 = rdr2(); + vmxctx->guest_dr3 = rdr3(); + vmxctx->guest_dr6 = rdr6(); + + /* + * Restore host debug registers. Restore DR7, DEBUGCTL, and + * PSL_T last. + */ + load_dr0(vmxctx->host_dr0); + load_dr1(vmxctx->host_dr1); + load_dr2(vmxctx->host_dr2); + load_dr3(vmxctx->host_dr3); + load_dr6(vmxctx->host_dr6); + wrmsr(MSR_DEBUGCTLMSR, vmxctx->host_debugctl); + load_dr7(vmxctx->host_dr7); + write_rflags(read_rflags() | vmxctx->host_tf); +} + static int -vmx_run(void *arg, int vcpu, register_t rip) +vmx_run(void *arg, int vcpu, register_t rip, pmap_t pmap, + struct vm_eventinfo *evinfo) { - int error, vie, rc, handled, astpending; - uint32_t exit_reason; + int rc, handled, launched; struct vmx *vmx; struct vm *vm; struct vmxctx *vmxctx; struct vmcs *vmcs; struct vm_exit *vmexit; struct vlapic *vlapic; - + uint32_t exit_reason; +#ifdef __FreeBSD__ + struct region_descriptor gdtr, idtr; + uint16_t ldt_sel; +#endif + vmx = arg; vm = vmx->vm; vmcs = &vmx->vmcs[vcpu]; vmxctx = &vmx->ctx[vcpu]; vlapic = vm_lapic(vm, vcpu); - vmxctx->launched = 0; + vmexit = vm_exitinfo(vm, vcpu); + launched = 0; - astpending = 0; - vmexit = vm_exitinfo(vmx->vm, vcpu); + KASSERT(vmxctx->pmap == pmap, + ("pmap %p different than ctx pmap %p", pmap, vmxctx->pmap)); vmx_msr_guest_enter(vmx, vcpu); VMPTRLD(vmcs); +#ifndef __FreeBSD__ + VERIFY(vmx->vmcs_state[vcpu] == VS_NONE && curthread->t_preempt != 0); + vmx->vmcs_state[vcpu] = VS_LOADED; +#endif + /* * XXX * We do this every time because we may setup the virtual machine * from a different process than the one that actually runs it. * * If the life of a virtual machine was spent entirely in the context - * of a single process we could do this once in vmcs_set_defaults(). + * of a single process we could do this once in vmx_vminit(). */ vmcs_write(VMCS_HOST_CR3, rcr3()); vmcs_write(VMCS_GUEST_RIP, rip); - vmx_set_pcpu_defaults(vmx, vcpu); + vmx_set_pcpu_defaults(vmx, vcpu, pmap); do { - vmx_inject_interrupts(vmx, vcpu, vlapic); - vmx_run_trace(vmx, vcpu); - rc = vmx_setjmp(vmxctx); -#ifdef SETJMP_TRACE - vmx_setjmp_trace(vmx, vcpu, vmxctx, rc); -#endif - switch (rc) { - case VMX_RETURN_DIRECT: - if (vmxctx->launched == 0) { - vmxctx->launched = 1; - vmx_launch(vmxctx); - } else - vmx_resume(vmxctx); - panic("vmx_launch/resume should not return"); + KASSERT(vmcs_guest_rip() == rip, ("%s: vmcs guest rip mismatch " + "%#lx/%#lx", __func__, vmcs_guest_rip(), rip)); + + handled = UNHANDLED; + /* + * Interrupts are disabled from this point on until the + * guest starts executing. This is done for the following + * reasons: + * + * If an AST is asserted on this thread after the check below, + * then the IPI_AST notification will not be lost, because it + * will cause a VM exit due to external interrupt as soon as + * the guest state is loaded. + * + * A posted interrupt after 'vmx_inject_interrupts()' will + * not be "lost" because it will be held pending in the host + * APIC because interrupts are disabled. The pending interrupt + * will be recognized as soon as the guest state is loaded. + * + * The same reasoning applies to the IPI generated by + * pmap_invalidate_ept(). + */ +#ifdef __FreeBSD__ + disable_intr(); + vmx_inject_interrupts(vmx, vcpu, vlapic, rip); +#else + /* + * The bulk of guest interrupt injection is done without + * interrupts disabled on the host CPU. This is necessary + * since contended mutexes might force the thread to sleep. 
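 * Only the final wakeup checks and the guest entry itself run
 * with interrupts disabled; an interrupt arriving in that window
 * is held pending by the CPU and forces a VM exit as soon as the
 * guest starts executing, so nothing is lost.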
+ */ + vmx_inject_interrupts(vmx, vcpu, vlapic, rip); + disable_intr(); + if (virtual_interrupt_delivery) { + vmx_inject_pir(vlapic); + } +#endif /* __FreeBSD__ */ + + /* + * Check for vcpu suspension after injecting events because + * vmx_inject_interrupts() can suspend the vcpu due to a + * triple fault. + */ + if (vcpu_suspended(evinfo)) { + enable_intr(); + vm_exit_suspended(vmx->vm, vcpu, rip); break; - case VMX_RETURN_LONGJMP: - break; /* vm exit */ - case VMX_RETURN_AST: - astpending = 1; + } + + if (vcpu_runblocked(evinfo)) { + enable_intr(); + vm_exit_runblock(vmx->vm, vcpu, rip); break; - case VMX_RETURN_VMRESUME: - vie = vmcs_instruction_error(); - if (vmxctx->launch_error == VM_FAIL_INVALID || - vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) { - printf("vmresume error %d vmcs inst error %d\n", - vmxctx->launch_error, vie); - goto err_exit; + } + + if (vcpu_reqidle(evinfo)) { + enable_intr(); + vm_exit_reqidle(vmx->vm, vcpu, rip); + break; + } + + if (vcpu_should_yield(vm, vcpu)) { + enable_intr(); + vm_exit_astpending(vmx->vm, vcpu, rip); + vmx_astpending_trace(vmx, vcpu, rip); + handled = HANDLED; + break; + } + + if (vcpu_debugged(vm, vcpu)) { + enable_intr(); + vm_exit_debug(vmx->vm, vcpu, rip); + break; + } + +#ifndef __FreeBSD__ + if ((rc = smt_acquire()) != 1) { + enable_intr(); + vmexit->rip = rip; + vmexit->inst_length = 0; + if (rc == -1) { + vmexit->exitcode = VM_EXITCODE_HT; + } else { + vmexit->exitcode = VM_EXITCODE_BOGUS; + handled = HANDLED; } - vmx_launch(vmxctx); /* try to launch the guest */ - panic("vmx_launch should not return"); break; - case VMX_RETURN_VMLAUNCH: - vie = vmcs_instruction_error(); -#if 1 - printf("vmlaunch error %d vmcs inst error %d\n", - vmxctx->launch_error, vie); -#endif - goto err_exit; - default: - panic("vmx_setjmp returned %d", rc); } - - /* collect some basic information for VM exit processing */ + + /* + * If this thread has gone off-cpu due to mutex operations + * during vmx_run, the VMCS will have been unloaded, forcing a + * re-VMLAUNCH as opposed to VMRESUME. + */ + launched = (vmx->vmcs_state[vcpu] & VS_LAUNCHED) != 0; + /* + * Restoration of the GDT limit is taken care of by + * vmx_savectx(). Since the maximum practical index for the + * IDT is 255, restoring its limits from the post-VMX-exit + * default of 0xffff is not a concern. + * + * Only 64-bit hypervisor callers are allowed, which forgoes + * the need to restore any LDT descriptor. Toss an error to + * anyone attempting to break that rule. + */ + if (curproc->p_model != DATAMODEL_LP64) { + smt_release(); + enable_intr(); + bzero(vmexit, sizeof (*vmexit)); + vmexit->rip = rip; + vmexit->exitcode = VM_EXITCODE_VMX; + vmexit->u.vmx.status = VM_FAIL_INVALID; + handled = UNHANDLED; + break; + } +#else + /* + * VM exits restore the base address but not the + * limits of GDTR and IDTR. The VMCS only stores the + * base address, so VM exits set the limits to 0xffff. + * Save and restore the full GDTR and IDTR to restore + * the limits. + * + * The VMCS does not save the LDTR at all, and VM + * exits clear LDTR as if a NULL selector were loaded. + * The userspace hypervisor probably doesn't use a + * LDT, but save and restore it to be safe. 
+ */ + sgdt(&gdtr); + sidt(&idtr); + ldt_sel = sldt(); +#endif + + vmx_run_trace(vmx, vcpu); + vmx_dr_enter_guest(vmxctx); + rc = vmx_enter_guest(vmxctx, vmx, launched); + vmx_dr_leave_guest(vmxctx); + +#ifndef __FreeBSD__ + vmx->vmcs_state[vcpu] |= VS_LAUNCHED; + smt_release(); +#else + bare_lgdt(&gdtr); + lidt(&idtr); + lldt(ldt_sel); +#endif + + /* Collect some information for VM exit processing */ vmexit->rip = rip = vmcs_guest_rip(); vmexit->inst_length = vmexit_instruction_length(); vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason(); @@ -2176,21 +3434,19 @@ vmx_run(void *arg, int vcpu, register_t rip) /* Update 'nextrip' */ vmx->state[vcpu].nextrip = rip; - /* enable interrupts */ - enable_intr(); - - if (astpending) { - handled = 1; - vmexit->inst_length = 0; - vmexit->exitcode = VM_EXITCODE_BOGUS; - vmx_astpending_trace(vmx, vcpu, rip); - vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1); - break; + if (rc == VMX_GUEST_VMEXIT) { + vmx_exit_handle_nmi(vmx, vcpu, vmexit); + enable_intr(); + handled = vmx_exit_process(vmx, vcpu, vmexit); + } else { + enable_intr(); + vmx_exit_inst_error(vmxctx, rc, vmexit); } - - handled = vmx_exit_process(vmx, vcpu, vmexit); +#ifdef __FreeBSD__ + launched = 1; +#endif vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled); - + rip = vmexit->rip; } while (handled); /* @@ -2204,44 +3460,36 @@ vmx_run(void *arg, int vcpu, register_t rip) } if (!handled) - vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1); + vmm_stat_incr(vm, vcpu, VMEXIT_USERSPACE, 1); - VCPU_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d", + VCPU_CTR1(vm, vcpu, "returning from vmx_run: exitcode %d", vmexit->exitcode); VMCLEAR(vmcs); vmx_msr_guest_exit(vmx, vcpu); - return (0); - -err_exit: - vmexit->exitcode = VM_EXITCODE_VMX; - vmexit->u.vmx.exit_reason = (uint32_t)-1; - vmexit->u.vmx.exit_qualification = (uint32_t)-1; - vmexit->u.vmx.status = ~0; - VMCLEAR(vmcs); - vmx_msr_guest_exit(vmx, vcpu); +#ifndef __FreeBSD__ + VERIFY(vmx->vmcs_state[vcpu] != VS_NONE && curthread->t_preempt != 0); + vmx->vmcs_state[vcpu] = VS_NONE; +#endif - return (ENOEXEC); + return (0); } static void vmx_vmcleanup(void *arg) { - int i, error; + int i; struct vmx *vmx = arg; + uint16_t maxcpus; - for (i = 0; i < VM_MAXCPU; i++) - vpid_free(vmx->state[i].vpid); + if (apic_access_virtualization(vmx, 0)) + vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); - /* - * XXXSMP we also need to clear the VMCS active on the other vcpus. - */ - error = vmclear(&vmx->vmcs[0]); - if (error != 0) - panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error); + maxcpus = vm_get_maxcpus(vmx->vm); + for (i = 0; i < maxcpus; i++) + vpid_free(vmx->state[i].vpid); - ept_vmcleanup(vmx); free(vmx, M_VMX); return; @@ -2284,6 +3532,16 @@ vmxctx_regptr(struct vmxctx *vmxctx, int reg) return (&vmxctx->guest_r15); case VM_REG_GUEST_CR2: return (&vmxctx->guest_cr2); + case VM_REG_GUEST_DR0: + return (&vmxctx->guest_dr0); + case VM_REG_GUEST_DR1: + return (&vmxctx->guest_dr1); + case VM_REG_GUEST_DR2: + return (&vmxctx->guest_dr2); + case VM_REG_GUEST_DR3: + return (&vmxctx->guest_dr3); + case VM_REG_GUEST_DR6: + return (&vmxctx->guest_dr6); default: break; } @@ -2315,6 +3573,46 @@ vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val) } static int +vmx_get_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t *retval) +{ + uint64_t gi; + int error; + + error = vmcs_getreg(&vmx->vmcs[vcpu], running, + VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY), &gi); + *retval = (gi & HWINTR_BLOCKING) ?
1 : 0; + return (error); +} + +static int +vmx_modify_intr_shadow(struct vmx *vmx, int vcpu, int running, uint64_t val) +{ + struct vmcs *vmcs; + uint64_t gi; + int error, ident; + + /* + * Forcing the vcpu into an interrupt shadow is not supported. + */ + if (val) { + error = EINVAL; + goto done; + } + + vmcs = &vmx->vmcs[vcpu]; + ident = VMCS_IDENT(VMCS_GUEST_INTERRUPTIBILITY); + error = vmcs_getreg(vmcs, running, ident, &gi); + if (error == 0) { + gi &= ~HWINTR_BLOCKING; + error = vmcs_setreg(vmcs, running, ident, gi); + } +done: + VCPU_CTR2(vmx->vm, vcpu, "Setting intr_shadow to %#lx %s", val, + error ? "failed" : "succeeded"); + return (error); +} + +static int vmx_shadow_reg(int reg) { int shreg; @@ -2324,8 +3622,8 @@ vmx_shadow_reg(int reg) switch (reg) { case VM_REG_GUEST_CR0: shreg = VMCS_CR0_SHADOW; - break; - case VM_REG_GUEST_CR4: + break; + case VM_REG_GUEST_CR4: shreg = VMCS_CR4_SHADOW; break; default: @@ -2345,6 +3643,9 @@ vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval) if (running && hostcpu != curcpu) panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu); + if (reg == VM_REG_GUEST_INTR_SHADOW) + return (vmx_get_intr_shadow(vmx, vcpu, running, retval)); + if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0) return (0); @@ -2356,12 +3657,16 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) { int error, hostcpu, running, shadow; uint64_t ctls; + pmap_t pmap; struct vmx *vmx = arg; running = vcpu_is_running(vmx->vm, vcpu, &hostcpu); if (running && hostcpu != curcpu) panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu); + if (reg == VM_REG_GUEST_INTR_SHADOW) + return (vmx_modify_intr_shadow(vmx, vcpu, running, val)); + if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0) return (0); @@ -2389,10 +3694,22 @@ vmx_setreg(void *arg, int vcpu, int reg, uint64_t val) if (shadow > 0) { /* * Store the unmodified value in the shadow - */ + */ error = vmcs_setreg(&vmx->vmcs[vcpu], running, VMCS_IDENT(shadow), val); } + + if (reg == VM_REG_GUEST_CR3) { + /* + * Invalidate the guest vcpu's TLB mappings to emulate + * the behavior of updating %cr3. + * + * XXX the processor retains global mappings when %cr3 + * is updated but vmx_invvpid() does not. 
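 * Over-invalidation of this sort costs only performance, never
 * correctness, since the guest simply refills the TLB on demand.
 *
 * A sketch of the single-context flush this boils down to, using
 * the invvpid_desc layout from vmx_cpufunc.h ('vpid' being the
 * id allocated for this vcpu; helper name is hypothetical):
 */
#if 0
static void
sketch_flush_vpid(uint16_t vpid)
{
	struct invvpid_desc d = { 0 };

	d.vpid = vpid;
	invvpid(INVVPID_TYPE_SINGLE_CONTEXT, d);
}
#endif
/*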
+ */ + pmap = vmx->ctx[vcpu].pmap; + vmx_invvpid(vmx, vcpu, pmap, running); + } } return (error); @@ -2452,6 +3769,10 @@ vmx_getcap(void *arg, int vcpu, int type, int *retval) if (cap_unrestricted_guest) ret = 0; break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) + ret = 0; + break; default: break; } @@ -2508,11 +3829,21 @@ vmx_setcap(void *arg, int vcpu, int type, int val) case VM_CAP_UNRESTRICTED_GUEST: if (cap_unrestricted_guest) { retval = 0; - baseval = procbased_ctls2; + pptr = &vmx->cap[vcpu].proc_ctls2; + baseval = *pptr; flag = PROCBASED2_UNRESTRICTED_GUEST; reg = VMCS_SEC_PROC_BASED_CTLS; } break; + case VM_CAP_ENABLE_INVPCID: + if (cap_invpcid) { + retval = 0; + pptr = &vmx->cap[vcpu].proc_ctls2; + baseval = *pptr; + flag = PROCBASED2_ENABLE_INVPCID; + reg = VMCS_SEC_PROC_BASED_CTLS; + } + break; default: break; } @@ -2546,15 +3877,18 @@ vmx_setcap(void *arg, int vcpu, int type, int val) } } - return (retval); + return (retval); } struct vlapic_vtx { struct vlapic vlapic; struct pir_desc *pir_desc; struct vmx *vmx; + u_int pending_prio; }; +#define VPR_PRIO_BIT(vpr) (1 << ((vpr) >> 4)) + #define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \ do { \ VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \ @@ -2576,7 +3910,7 @@ vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) struct vlapic_vtx *vlapic_vtx; struct pir_desc *pir_desc; uint64_t mask; - int idx, notify; + int idx, notify = 0; vlapic_vtx = (struct vlapic_vtx *)vlapic; pir_desc = vlapic_vtx->pir_desc; @@ -2589,7 +3923,37 @@ vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level) idx = vector / 64; mask = 1UL << (vector % 64); atomic_set_long(&pir_desc->pir[idx], mask); - notify = atomic_cmpset_long(&pir_desc->pending, 0, 1); + + /* + * A notification is required whenever the 'pending' bit makes a + * transition from 0->1. + * + * Even if the 'pending' bit is already asserted, notification about + * the incoming interrupt may still be necessary. For example, if a + * vCPU is HLTed with a high PPR, a low priority interrupt would cause + * the 0->1 'pending' transition with a notification, but the vCPU + * would ignore the interrupt for the time being. The same vCPU would + * need to then be notified if a high-priority interrupt arrived which + * satisfied the PPR. + * + * The priorities of interrupts injected while 'pending' is asserted + * are tracked in a custom bitfield 'pending_prio'. Should the + * to-be-injected interrupt exceed the priorities already present, the + * notification is sent. The priorities recorded in 'pending_prio' are + * cleared whenever the 'pending' bit makes another 0->1 transition. + */ + if (atomic_cmpset_long(&pir_desc->pending, 0, 1) != 0) { + notify = 1; + vlapic_vtx->pending_prio = 0; + } else { + const u_int old_prio = vlapic_vtx->pending_prio; + const u_int prio_bit = VPR_PRIO_BIT(vector & APIC_TPR_INT); + + if ((old_prio & prio_bit) == 0 && prio_bit > old_prio) { + atomic_set_int(&vlapic_vtx->pending_prio, prio_bit); + notify = 1; + } + } VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector, level, "vmx_set_intr_ready"); @@ -2616,8 +3980,27 @@ vmx_pending_intr(struct vlapic *vlapic, int *vecptr) pir_desc = vlapic_vtx->pir_desc; pending = atomic_load_acq_long(&pir_desc->pending); - if (!pending) - return (0); /* common case */ + if (!pending) { + /* + * While a virtual interrupt may have already been + * processed, the actual delivery may be pending due to the + * interruptibility of the guest.
Recognize a pending + * interrupt by reevaluating virtual interrupts + * following Section 29.2.1 in the Intel SDM Volume 3. + */ + struct vm_exit *vmexit; + uint8_t rvi, ppr; + + vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid); + rvi = vmexit->u.hlt.intr_status & APIC_TPR_INT; + lapic = vlapic->apic_page; + ppr = lapic->ppr & APIC_TPR_INT; + if (rvi > ppr) { + return (1); + } + + return (0); + } /* * If there is an interrupt pending then it will be recognized only @@ -2627,21 +4010,38 @@ vmx_pending_intr(struct vlapic *vlapic, int *vecptr) * interrupt will be recognized. */ lapic = vlapic->apic_page; - ppr = lapic->ppr & 0xf0; + ppr = lapic->ppr & APIC_TPR_INT; if (ppr == 0) return (1); VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d", lapic->ppr); + vpr = 0; for (i = 3; i >= 0; i--) { pirval = pir_desc->pir[i]; if (pirval != 0) { - vpr = (i * 64 + flsl(pirval) - 1) & 0xf0; - return (vpr > ppr); + vpr = (i * 64 + flsl(pirval) - 1) & APIC_TPR_INT; + break; } } - return (0); + + /* + * If the highest-priority pending interrupt falls short of the + * processor priority of this vCPU, ensure that 'pending_prio' does not + * have any stale bits which would preclude a higher-priority interrupt + * from incurring a notification later. + */ + if (vpr <= ppr) { + const u_int prio_bit = VPR_PRIO_BIT(vpr); + const u_int old = vlapic_vtx->pending_prio; + + if (old > prio_bit && (old & prio_bit) == 0) { + vlapic_vtx->pending_prio = prio_bit; + } + return (0); + } + return (1); } static void @@ -2652,37 +4052,65 @@ vmx_intr_accepted(struct vlapic *vlapic, int vector) } static void -vmx_set_tmr(struct vlapic *vlapic, int vector, bool level) +vmx_set_tmr(struct vlapic *vlapic, const uint32_t *masks) +{ + vmcs_write(VMCS_EOI_EXIT0, ((uint64_t)masks[1] << 32) | masks[0]); + vmcs_write(VMCS_EOI_EXIT1, ((uint64_t)masks[3] << 32) | masks[2]); + vmcs_write(VMCS_EOI_EXIT2, ((uint64_t)masks[5] << 32) | masks[4]); + vmcs_write(VMCS_EOI_EXIT3, ((uint64_t)masks[7] << 32) | masks[6]); +} + +static void +vmx_enable_x2apic_mode(struct vlapic *vlapic) { - struct vlapic_vtx *vlapic_vtx; struct vmx *vmx; struct vmcs *vmcs; - uint64_t mask, val; + uint32_t proc_ctls2; + int vcpuid, error; - KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); - KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL), - ("vmx_set_tmr: vcpu cannot be running")); + vcpuid = vlapic->vcpuid; + vmx = ((struct vlapic_vtx *)vlapic)->vmx; + vmcs = &vmx->vmcs[vcpuid]; - vlapic_vtx = (struct vlapic_vtx *)vlapic; - vmx = vlapic_vtx->vmx; - vmcs = &vmx->vmcs[vlapic->vcpuid]; - mask = 1UL << (vector % 64); + proc_ctls2 = vmx->cap[vcpuid].proc_ctls2; + KASSERT((proc_ctls2 & PROCBASED2_VIRTUALIZE_APIC_ACCESSES) != 0, + ("%s: invalid proc_ctls2 %#x", __func__, proc_ctls2)); + + proc_ctls2 &= ~PROCBASED2_VIRTUALIZE_APIC_ACCESSES; + proc_ctls2 |= PROCBASED2_VIRTUALIZE_X2APIC_MODE; + vmx->cap[vcpuid].proc_ctls2 = proc_ctls2; VMPTRLD(vmcs); - val = vmcs_read(VMCS_EOI_EXIT(vector)); - if (level) - val |= mask; - else - val &= ~mask; - vmcs_write(VMCS_EOI_EXIT(vector), val); + vmcs_write(VMCS_SEC_PROC_BASED_CTLS, proc_ctls2); VMCLEAR(vmcs); + + if (vlapic->vcpuid == 0) { + /* + * The nested page table mappings are shared by all vcpus + * so unmap the APIC access page just once. + */ + error = vm_unmap_mmio(vmx->vm, DEFAULT_APIC_BASE, PAGE_SIZE); + KASSERT(error == 0, ("%s: vm_unmap_mmio error %d", + __func__, error)); + + /* + * The MSR bitmap is shared by all vcpus so modify it only + * once in the context of vcpu 0. 
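 * (The vcpuid check simply guarantees that this one-time,
 * VM-wide setup is performed exactly once.)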
+ */ + error = vmx_allow_x2apic_msrs(vmx); + KASSERT(error == 0, ("%s: vmx_allow_x2apic_msrs error %d", + __func__, error)); + } } static void vmx_post_intr(struct vlapic *vlapic, int hostcpu) { - +#ifdef __FreeBSD__ ipi_cpu(hostcpu, pirvec); +#else + psm_send_pir_ipi(hostcpu); +#endif } /* @@ -2785,7 +4213,7 @@ vmx_vlapic_init(void *arg, int vcpuid) struct vmx *vmx; struct vlapic *vlapic; struct vlapic_vtx *vlapic_vtx; - + vmx = arg; vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO); @@ -2802,9 +4230,7 @@ vmx_vlapic_init(void *arg, int vcpuid) vlapic->ops.pending_intr = vmx_pending_intr; vlapic->ops.intr_accepted = vmx_intr_accepted; vlapic->ops.set_tmr = vmx_set_tmr; -#ifdef __FreeBSD__ vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode; -#endif } if (posted_interrupts) @@ -2823,20 +4249,129 @@ vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic) free(vlapic, M_VLAPIC); } +#ifndef __FreeBSD__ +static void +vmx_savectx(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + + if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) { + VERIFY3U(vmclear(vmcs), ==, 0); + vmx_msr_guest_exit(vmx, vcpu); + /* + * Having VMCLEARed the VMCS, it can no longer be re-entered + * with VMRESUME, but must be VMLAUNCHed again. + */ + vmx->vmcs_state[vcpu] &= ~VS_LAUNCHED; + } + + reset_gdtr_limit(); +} + +static void +vmx_restorectx(void *arg, int vcpu) +{ + struct vmx *vmx = arg; + struct vmcs *vmcs = &vmx->vmcs[vcpu]; + + ASSERT0(vmx->vmcs_state[vcpu] & VS_LAUNCHED); + + if ((vmx->vmcs_state[vcpu] & VS_LOADED) != 0) { + vmx_msr_guest_enter(vmx, vcpu); + VERIFY3U(vmptrld(vmcs), ==, 0); + } +} +#endif /* __FreeBSD__ */ + struct vmm_ops vmm_ops_intel = { vmx_init, vmx_cleanup, + vmx_restore, vmx_vminit, vmx_run, vmx_vmcleanup, - ept_vmmmap_set, - ept_vmmmap_get, vmx_getreg, vmx_setreg, vmx_getdesc, vmx_setdesc, vmx_getcap, vmx_setcap, + ept_vmspace_alloc, + ept_vmspace_free, vmx_vlapic_init, vmx_vlapic_cleanup, + +#ifndef __FreeBSD__ + vmx_savectx, + vmx_restorectx, +#endif }; + +#ifndef __FreeBSD__ +/* Side-effect free HW validation derived from checks in vmx_init. 
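 * A caller might use it along these lines (sketch only; assumes
 * the standard illumos cmn_err(9F) interface):
 */
#if 0
const char *msg;

if (vmx_x86_supported(&msg) != 0)
	cmn_err(CE_WARN, "VMX not usable: %s", msg);
#endif
/*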
*/ +int +vmx_x86_supported(const char **msg) +{ + int error; + uint32_t tmp; + + ASSERT(msg != NULL); + + /* Check support for primary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS, + MSR_VMX_TRUE_PROCBASED_CTLS, PROCBASED_CTLS_ONE_SETTING, + PROCBASED_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired primary " + "processor-based controls"; + return (error); + } + + /* Check support for secondary processor-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, + MSR_VMX_PROCBASED_CTLS2, PROCBASED_CTLS2_ONE_SETTING, + PROCBASED_CTLS2_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired secondary " + "processor-based controls"; + return (error); + } + + /* Check support for pin-based VM-execution controls */ + error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS, + MSR_VMX_TRUE_PINBASED_CTLS, PINBASED_CTLS_ONE_SETTING, + PINBASED_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired pin-based controls"; + return (error); + } + + /* Check support for VM-exit controls */ + error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS, + VM_EXIT_CTLS_ONE_SETTING, VM_EXIT_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired exit controls"; + return (error); + } + + /* Check support for VM-entry controls */ + error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS, + VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING, &tmp); + if (error) { + *msg = "processor does not support desired entry controls"; + return (error); + } + + /* Unrestricted guest is nominally optional, but not for us. */ + error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2, + PROCBASED2_UNRESTRICTED_GUEST, 0, &tmp); + if (error) { + *msg = "processor does not support desired unrestricted guest " + "controls"; + return (error); + } + + return (0); +} +#endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.h b/usr/src/uts/i86pc/io/vmm/intel/vmx.h index 50ca62b371..2d16799bdd 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx.h 284174 2015-06-09 00:14:47Z tychon $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _VMX_H_ @@ -31,15 +37,9 @@ #include "vmcs.h" -#ifndef __FreeBSD__ -#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */ -#define HOST_MSR_MAX_ENTRIES 64 /* arbitrary */ -#endif +struct pmap; struct vmxctx { - register_t tmpstk[32]; /* vmx_return() stack */ - register_t tmpstktop; - register_t guest_rdi; /* Guest state */ register_t guest_rsi; register_t guest_rdx; @@ -56,7 +56,13 @@ struct vmxctx { register_t guest_r14; register_t guest_r15; register_t guest_cr2; + register_t guest_dr0; + register_t guest_dr1; + register_t guest_dr2; + register_t guest_dr3; + register_t guest_dr6; +#ifdef __FreeBSD__ register_t host_r15; /* Host state */ register_t host_r14; register_t host_r13; @@ -64,13 +70,24 @@ struct vmxctx { register_t host_rbp; register_t host_rsp; register_t host_rbx; - register_t host_rip; +#endif /* __FreeBSD__ */ + + register_t host_dr0; + register_t host_dr1; + register_t host_dr2; + register_t host_dr3; + register_t host_dr6; + register_t host_dr7; + uint64_t host_debugctl; + int host_tf; + + int inst_fail_status; + /* - * XXX todo debug registers and fpu state + * The pmap needs to be deactivated in vmx_enter_guest() + * so keep a copy of the 'pmap' in each vmxctx. */ - - int launched; /* vmcs launch state */ - int launch_error; + struct pmap *pmap; }; struct vmxcap { @@ -105,52 +122,55 @@ enum { IDX_MSR_STAR, IDX_MSR_SF_MASK, IDX_MSR_KGSBASE, + IDX_MSR_PAT, GUEST_MSR_NUM /* must be the last enumeration */ }; +#ifndef __FreeBSD__ +typedef enum { + VS_NONE = 0x0, + VS_LAUNCHED = 0x1, + VS_LOADED = 0x2 +} vmcs_state_t; +#endif /* __FreeBSD__ */ + /* virtual machine softc */ struct vmx { - pml4_entry_t pml4ept[NPML4EPG]; struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */ struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */ char msr_bitmap[PAGE_SIZE]; struct pir_desc pir_desc[VM_MAXCPU]; -#ifdef __FreeBSD__ uint64_t guest_msrs[VM_MAXCPU][GUEST_MSR_NUM]; -#else - struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES]; - struct msr_entry host_msrs[VM_MAXCPU][HOST_MSR_MAX_ENTRIES]; +#ifndef __FreeBSD__ + uint64_t host_msrs[VM_MAXCPU][GUEST_MSR_NUM]; + uint64_t tsc_offset_active[VM_MAXCPU]; + vmcs_state_t vmcs_state[VM_MAXCPU]; #endif struct vmxctx ctx[VM_MAXCPU]; struct vmxcap cap[VM_MAXCPU]; struct vmxstate state[VM_MAXCPU]; + uint64_t eptp; struct vm *vm; + long eptgen[MAXCPU]; /* cached pmap->pm_eptgen */ }; -CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0); CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0); +CTASSERT((offsetof(struct vmx, pir_desc[0]) & 63) == 0); -#define VMX_RETURN_DIRECT 0 -#define VMX_RETURN_LONGJMP 1 -#define VMX_RETURN_VMRESUME 2 -#define VMX_RETURN_VMLAUNCH 3 -#define VMX_RETURN_AST 4 -/* - * vmx_setjmp() returns: - * - 0 when it returns directly - * - 1 when it returns from vmx_longjmp - * - 2 when it returns from vmx_resume (which would only be in the error case) - * - 3 when it returns from vmx_launch (which would only be in the error case) - * - 4 when it returns from vmx_resume or vmx_launch because of AST pending - */ -int vmx_setjmp(struct vmxctx *ctx); -void vmx_longjmp(void); /* returns via vmx_setjmp */ -void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ -void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */ +#define VMX_GUEST_VMEXIT 0 +#define VMX_VMRESUME_ERROR 1 +#define VMX_VMLAUNCH_ERROR 2 +#define VMX_INVEPT_ERROR 3 +#define VMX_VMWRITE_ERROR 4 +int vmx_enter_guest(struct 
vmxctx *ctx, struct vmx *vmx, int launched); +void vmx_call_isr(uintptr_t entry); u_long vmx_fix_cr0(u_long cr0); u_long vmx_fix_cr4(u_long cr4); int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset); +extern char vmx_exit_guest[]; +extern char vmx_exit_guest_flush_rsb[]; + #endif diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h index 08b1469f19..5408d129ad 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_controls.h 260410 2014-01-07 21:04:49Z neel $ + * $FreeBSD$ */ #ifndef _VMX_CONTROLS_H_ diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h index 9513f6c70b..f0c5ba7691 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_cpufunc.h 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,6 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _VMX_CPUFUNC_H_ @@ -71,7 +74,12 @@ vmxon(char *region) int error; uint64_t addr; +#ifdef __FreeBSD__ addr = vtophys(region); +#else + /* This is pre-translated in illumos */ + addr = (uint64_t)region; +#endif __asm __volatile("vmxon %[addr];" VMX_SET_ERROR_CODE : [error] "=r" (error) @@ -81,21 +89,7 @@ vmxon(char *region) return (error); } -/* returns 0 on success and non-zero on failure */ -static __inline int -vmxon_pa(vm_paddr_t addr) -{ - int error; - - __asm __volatile("vmxon %[addr];" - VMX_SET_ERROR_CODE - : [error] "=r" (error) - : [addr] "m" (*(uint64_t *)&addr) - : "memory"); - - return (error); -} - +#ifdef __FreeBSD__ /* returns 0 on success and non-zero on failure */ static __inline int vmclear(struct vmcs *vmcs) @@ -111,6 +105,7 @@ vmclear(struct vmcs *vmcs) : "memory"); return (error); } +#endif /* __FreeBSD__ */ static __inline void vmxoff(void) @@ -126,6 +121,7 @@ vmptrst(uint64_t *addr) __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory"); } +#ifdef __FreeBSD__ static __inline int vmptrld(struct vmcs *vmcs) { @@ -140,6 +136,7 @@ vmptrld(struct vmcs *vmcs) : "memory"); return (error); } +#endif /* __FreeBSD__ */ static __inline int vmwrite(uint64_t reg, uint64_t val) @@ -169,7 +166,8 @@ vmread(uint64_t r, uint64_t *addr) return (error); } -static void __inline +#ifdef __FreeBSD__ +static __inline void VMCLEAR(struct vmcs *vmcs) { int err; @@ -181,7 +179,7 @@ VMCLEAR(struct vmcs *vmcs) critical_exit(); } -static void __inline +static __inline void VMPTRLD(struct vmcs *vmcs) { int err; @@ -192,6 +190,7 @@ VMPTRLD(struct vmcs *vmcs) if (err != 0) panic("%s: vmptrld(%p) error %d", __func__, vmcs, err); } +#endif /* __FreeBSD__ */ #define INVVPID_TYPE_ADDRESS 0UL #define INVVPID_TYPE_SINGLE_CONTEXT 1UL @@ -205,7 +204,7 @@ struct invvpid_desc { }; CTASSERT(sizeof(struct invvpid_desc) 
== 16); -static void __inline +static __inline void invvpid(uint64_t type, struct invvpid_desc desc) { int error; @@ -228,7 +227,7 @@ struct invept_desc { }; CTASSERT(sizeof(struct invept_desc) == 16); -static void __inline +static __inline void invept(uint64_t type, struct invept_desc desc) { int error; diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c index 1ced311ca8..4a1a2cd358 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,26 +25,26 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.c 284174 2015-06-09 00:14:47Z tychon $ + * $FreeBSD$ + */ +/* + * Copyright 2017 Joyent, Inc. */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.c 284174 2015-06-09 00:14:47Z tychon $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> -#include <sys/cpuset.h> +#include <sys/proc.h> #include <machine/clock.h> #include <machine/cpufunc.h> #include <machine/md_var.h> +#include <machine/pcb.h> #include <machine/specialreg.h> #include <machine/vmm.h> -#ifndef __FreeBSD__ -#include <vm/pmap.h> -#endif - #include "vmx.h" #include "vmx_msr.h" @@ -184,7 +186,9 @@ msr_bitmap_change_access(char *bitmap, u_int msr, int access) static uint64_t misc_enable; static uint64_t platform_info; static uint64_t turbo_ratio_limit; +#ifdef __FreeBSD__ static uint64_t host_msrs[GUEST_MSR_NUM]; +#endif /* __FreeBSD__ */ static bool nehalem_cpu(void) @@ -234,13 +238,33 @@ westmere_cpu(void) return (false); } +static bool +pat_valid(uint64_t val) +{ + int i, pa; + + /* + * From Intel SDM: Table "Memory Types That Can Be Encoded With PAT" + * + * Extract PA0 through PA7 and validate that each one encodes a + * valid memory type. + */ + for (i = 0; i < 8; i++) { + pa = (val >> (i * 8)) & 0xff; + if (pa == 2 || pa == 3 || pa >= 8) + return (false); + } + return (true); +} + void vmx_msr_init(void) { uint64_t bus_freq, ratio; int i; -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ + /* XXXJOY: Do we want to do this caching? */ /* * It is safe to cache the values of the following MSRs because * they don't change based on curcpu, curproc or curthread. @@ -249,7 +273,7 @@ vmx_msr_init(void) host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); -#endif +#endif /* __FreeBSD__ */ /* * Initialize emulated MSRs @@ -308,6 +332,10 @@ vmx_msr_init(void) void vmx_msr_guest_init(struct vmx *vmx, int vcpuid) { + uint64_t *guest_msrs; + + guest_msrs = vmx->guest_msrs[vcpuid]; + /* * The permissions bitmap is shared between all vcpus so initialize it * once when initializing the vBSP. @@ -319,29 +347,55 @@ vmx_msr_guest_init(struct vmx *vmx, int vcpuid) guest_msr_rw(vmx, MSR_SF_MASK); guest_msr_rw(vmx, MSR_KGSBASE); } + + /* + * Initialize guest IA32_PAT MSR with default value after reset. 
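 * PAT_VALUE(i, type) places 'type' in byte i of the 64-bit MSR,
 * one byte per page-attribute slot; pat_valid() above rejects
 * reserved encodings in any slot. A sketch of the same per-slot
 * check (hypothetical helper name):
 */
#if 0
static bool
sketch_pat_slot_ok(uint64_t pat, int i)
{
	int pa = (pat >> (i * 8)) & 0xff;

	/* encodings 2, 3 and anything >= 8 are reserved */
	return (pa != 2 && pa != 3 && pa < 8);
}
#endif
/*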
+ */ + guest_msrs[IDX_MSR_PAT] = PAT_VALUE(0, PAT_WRITE_BACK) | + PAT_VALUE(1, PAT_WRITE_THROUGH) | + PAT_VALUE(2, PAT_UNCACHED) | + PAT_VALUE(3, PAT_UNCACHEABLE) | + PAT_VALUE(4, PAT_WRITE_BACK) | + PAT_VALUE(5, PAT_WRITE_THROUGH) | + PAT_VALUE(6, PAT_UNCACHED) | + PAT_VALUE(7, PAT_UNCACHEABLE); + return; } void vmx_msr_guest_enter(struct vmx *vmx, int vcpuid) { -#ifdef __FreeBSD__ uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; - /* Save host MSRs (if any) and restore guest MSRs */ +#ifndef __FreeBSD__ + uint64_t *host_msrs = vmx->host_msrs[vcpuid]; + + /* Save host MSRs */ + host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); + host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR); + host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR); + host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK); +#endif /* __FreeBSD__ */ + + /* Save host MSRs (in particular, KGSBASE) and restore guest MSRs */ +#ifdef __FreeBSD__ + update_pcb_bases(curpcb); +#endif wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]); wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]); wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]); wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]); wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]); -#endif } void vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) { -#ifdef __FreeBSD__ uint64_t *guest_msrs = vmx->guest_msrs[vcpuid]; +#ifndef __FreeBSD__ + uint64_t *host_msrs = vmx->host_msrs[vcpuid]; +#endif /* Save guest MSRs */ guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR); @@ -357,13 +411,16 @@ vmx_msr_guest_exit(struct vmx *vmx, int vcpuid) wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]); /* MSR_KGSBASE will be restored on the way back to userspace */ -#endif } int vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) { - int error = 0; + const uint64_t *guest_msrs; + int error; + + guest_msrs = vmx->guest_msrs[vcpuid]; + error = 0; switch (num) { case MSR_MCG_CAP: @@ -387,6 +444,9 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) case MSR_TURBO_RATIO_LIMIT1: *val = turbo_ratio_limit; break; + case MSR_PAT: + *val = guest_msrs[IDX_MSR_PAT]; + break; default: error = EINVAL; break; @@ -397,10 +457,13 @@ vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu) int vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) { + uint64_t *guest_msrs; uint64_t changed; int error; + guest_msrs = vmx->guest_msrs[vcpuid]; error = 0; + switch (num) { case MSR_MCG_CAP: case MSR_MCG_STATUS: @@ -433,9 +496,17 @@ vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu) error = EINVAL; break; + case MSR_PAT: + if (pat_valid(val)) + guest_msrs[IDX_MSR_PAT] = val; + else + vm_inject_gp(vmx->vm, vcpuid); + break; +#ifdef __FreeBSD__ case MSR_TSC: error = vmx_set_tsc_offset(vmx, vcpuid, val - rdtsc()); break; +#endif /* __FreeBSD__ */ default: error = EINVAL; break; diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h index 5300d14d9b..ac2adb0dd1 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.h 271888 2014-09-20 02:35:21Z neel $ + * $FreeBSD$ */ #ifndef _VMX_MSR_H_ diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s index d57dde1093..0130f88dd6 100644 --- a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s +++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s @@ -1,5 +1,6 @@ /*- * Copyright (c) 2011 NetApp, Inc. + * Copyright (c) 2013 Neel Natu <neel@freebsd.org> * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -23,7 +24,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/intel/vmx_support.S 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,50 +37,41 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2013 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ -#include <machine/asmacros.h> +#include <sys/asm_linkage.h> +#include <sys/segments.h> -#include "vmx_assym.s" +/* Porting note: This is named 'vmx_support.S' upstream. */ -/* - * Disable interrupts before updating %rsp in VMX_CHECK_AST or - * VMX_GUEST_RESTORE. - * - * The location that %rsp points to is a 'vmxctx' and not a - * real stack so we don't want an interrupt handler to trash it - */ -#define VMX_DISABLE_INTERRUPTS cli -/* - * If the thread hosting the vcpu has an ast pending then take care of it - * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST. - * - * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts - * are disabled. - */ -#ifdef __FreeBSD__ -#define VMX_CHECK_AST \ - movq PCPU(CURTHREAD),%rax; \ - testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \ - je 9f; \ - movq $VMX_RETURN_AST,%rsi; \ - movq %rdi,%rsp; \ - addq $VMXCTX_TMPSTKTOP,%rsp; \ - callq vmx_return; \ -9: -#else -#define VMX_CHECK_AST \ - movq %gs:CPU_THREAD,%rax; \ - movl T_ASTFLAG(%rax),%eax; \ - test %al,%al; \ - je 9f; \ - movq $VMX_RETURN_AST,%rsi; \ - movq %rdi,%rsp; \ - addq $VMXCTX_TMPSTKTOP,%rsp; \ - callq vmx_return; \ -9: -#endif + +#if defined(lint) + +struct vmxctx; +struct vmx; + +/*ARGSUSED*/ +void +vmx_launch(struct vmxctx *ctx) +{} + +void +vmx_exit_guest() +{} + +/*ARGSUSED*/ +int +vmx_enter_guest(struct vmxctx *ctx, struct vmx *vmx, int launched) +{ + return (0); +} + +#else /* lint */ + +#include "vmx_assym.h" +#include "vmcs.h" /* * Assumes that %rdi holds a pointer to the 'vmxctx'. @@ -92,7 +84,6 @@ * host context in case of an error with 'vmlaunch' or 'vmresume'. */ #define VMX_GUEST_RESTORE \ - movq %rdi,%rsp; \ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \ movq %rsi,%cr2; \ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \ @@ -111,161 +102,283 @@ movq VMXCTX_GUEST_R15(%rdi),%r15; \ movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */ -#define VM_INSTRUCTION_ERROR(reg) \ - jnc 1f; \ - movl $VM_FAIL_INVALID,reg; /* CF is set */ \ - jmp 3f; \ -1: jnz 2f; \ - movl $VM_FAIL_VALID,reg; /* ZF is set */ \ - jmp 3f; \ -2: movl $VM_SUCCESS,reg; \ -3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp) - - .text -/* - * int vmx_setjmp(ctxp) - * %rdi = ctxp - * - * Return value is '0' when it returns directly from here. - * Return value is '1' when it returns after a vm exit through vmx_longjmp. 
- */ -ENTRY(vmx_setjmp) - movq (%rsp),%rax /* return address */ - movq %r15,VMXCTX_HOST_R15(%rdi) - movq %r14,VMXCTX_HOST_R14(%rdi) - movq %r13,VMXCTX_HOST_R13(%rdi) - movq %r12,VMXCTX_HOST_R12(%rdi) - movq %rbp,VMXCTX_HOST_RBP(%rdi) - movq %rsp,VMXCTX_HOST_RSP(%rdi) - movq %rbx,VMXCTX_HOST_RBX(%rdi) - movq %rax,VMXCTX_HOST_RIP(%rdi) +#define VMX_GUEST_SAVE \ + movq %rdi, VMXSTK_TMPRDI(%rsp); \ + movq VMXSTK_RDI(%rsp), %rdi; \ + movq %rbp, VMXCTX_GUEST_RBP(%rdi); \ + leaq VMXSTK_FP(%rsp), %rbp; \ + movq %rsi, VMXCTX_GUEST_RSI(%rdi); \ + movq %rdx, VMXCTX_GUEST_RDX(%rdi); \ + movq %rcx, VMXCTX_GUEST_RCX(%rdi); \ + movq %r8, VMXCTX_GUEST_R8(%rdi); \ + movq %r9, VMXCTX_GUEST_R9(%rdi); \ + movq %rax, VMXCTX_GUEST_RAX(%rdi); \ + movq %rbx, VMXCTX_GUEST_RBX(%rdi); \ + movq %r10, VMXCTX_GUEST_R10(%rdi); \ + movq %r11, VMXCTX_GUEST_R11(%rdi); \ + movq %r12, VMXCTX_GUEST_R12(%rdi); \ + movq %r13, VMXCTX_GUEST_R13(%rdi); \ + movq %r14, VMXCTX_GUEST_R14(%rdi); \ + movq %r15, VMXCTX_GUEST_R15(%rdi); \ + movq %cr2, %rbx; \ + movq %rbx, VMXCTX_GUEST_CR2(%rdi); \ + movq VMXSTK_TMPRDI(%rsp), %rdx; \ + movq %rdx, VMXCTX_GUEST_RDI(%rdi); - /* - * XXX save host debug registers - */ - movl $VMX_RETURN_DIRECT,%eax - ret -END(vmx_setjmp) /* - * void vmx_return(struct vmxctx *ctxp, int retval) - * %rdi = ctxp - * %rsi = retval - * Return to vmm context through vmx_setjmp() with a value of 'retval'. + * Flush scratch registers to avoid lingering guest state being used for + * Spectre v1 attacks when returning from guest entry. */ -ENTRY(vmx_return) - /* Restore host context. */ - movq VMXCTX_HOST_R15(%rdi),%r15 - movq VMXCTX_HOST_R14(%rdi),%r14 - movq VMXCTX_HOST_R13(%rdi),%r13 - movq VMXCTX_HOST_R12(%rdi),%r12 - movq VMXCTX_HOST_RBP(%rdi),%rbp - movq VMXCTX_HOST_RSP(%rdi),%rsp - movq VMXCTX_HOST_RBX(%rdi),%rbx - movq VMXCTX_HOST_RIP(%rdi),%rax - movq %rax,(%rsp) /* return address */ +#define VMX_GUEST_FLUSH_SCRATCH \ + xorl %edi, %edi; \ + xorl %esi, %esi; \ + xorl %edx, %edx; \ + xorl %ecx, %ecx; \ + xorl %r8d, %r8d; \ + xorl %r9d, %r9d; \ + xorl %r10d, %r10d; \ + xorl %r11d, %r11d; - /* - * XXX restore host debug registers - */ - movl %esi,%eax - ret -END(vmx_return) -/* - * void vmx_longjmp(void) - * %rsp points to the struct vmxctx - */ -ENTRY(vmx_longjmp) - /* - * Save guest state that is not automatically saved in the vmcs. 
- */ - movq %rdi,VMXCTX_GUEST_RDI(%rsp) - movq %rsi,VMXCTX_GUEST_RSI(%rsp) - movq %rdx,VMXCTX_GUEST_RDX(%rsp) - movq %rcx,VMXCTX_GUEST_RCX(%rsp) - movq %r8,VMXCTX_GUEST_R8(%rsp) - movq %r9,VMXCTX_GUEST_R9(%rsp) - movq %rax,VMXCTX_GUEST_RAX(%rsp) - movq %rbx,VMXCTX_GUEST_RBX(%rsp) - movq %rbp,VMXCTX_GUEST_RBP(%rsp) - movq %r10,VMXCTX_GUEST_R10(%rsp) - movq %r11,VMXCTX_GUEST_R11(%rsp) - movq %r12,VMXCTX_GUEST_R12(%rsp) - movq %r13,VMXCTX_GUEST_R13(%rsp) - movq %r14,VMXCTX_GUEST_R14(%rsp) - movq %r15,VMXCTX_GUEST_R15(%rsp) - - movq %cr2,%rdi - movq %rdi,VMXCTX_GUEST_CR2(%rsp) - - movq %rsp,%rdi - movq $VMX_RETURN_LONGJMP,%rsi - - addq $VMXCTX_TMPSTKTOP,%rsp - callq vmx_return -END(vmx_longjmp) +/* Stack layout (offset from %rsp) for vmx_enter_guest */ +#define VMXSTK_TMPRDI 0x00 /* temp store %rdi on vmexit */ +#define VMXSTK_R15 0x08 /* callee saved %r15 */ +#define VMXSTK_R14 0x10 /* callee saved %r14 */ +#define VMXSTK_R13 0x18 /* callee saved %r13 */ +#define VMXSTK_R12 0x20 /* callee saved %r12 */ +#define VMXSTK_RBX 0x28 /* callee saved %rbx */ +#define VMXSTK_RDX 0x30 /* save-args %rdx (int launched) */ +#define VMXSTK_RSI 0x38 /* save-args %rsi (struct vmx *vmx) */ +#define VMXSTK_RDI 0x40 /* save-args %rdi (struct vmxctx *ctx) */ +#define VMXSTK_FP 0x48 /* frame pointer %rbp */ +#define VMXSTKSIZE VMXSTK_FP /* - * void vmx_resume(struct vmxctx *ctxp) - * %rdi = ctxp - * - * Although the return type is a 'void' this function may return indirectly - * through vmx_setjmp() with a return value of 2. + * vmx_enter_guest(struct vmxctx *vmxctx, int launched) + * Interrupts must be disabled on entry. */ -ENTRY(vmx_resume) - VMX_DISABLE_INTERRUPTS +ENTRY_NP(vmx_enter_guest) + pushq %rbp + movq %rsp, %rbp + subq $VMXSTKSIZE, %rsp + movq %r15, VMXSTK_R15(%rsp) + movq %r14, VMXSTK_R14(%rsp) + movq %r13, VMXSTK_R13(%rsp) + movq %r12, VMXSTK_R12(%rsp) + movq %rbx, VMXSTK_RBX(%rsp) + movq %rdx, VMXSTK_RDX(%rsp) + movq %rsi, VMXSTK_RSI(%rsp) + movq %rdi, VMXSTK_RDI(%rsp) + + movq %rdi, %r12 /* vmxctx */ + movq %rsi, %r13 /* vmx */ + movl %edx, %r14d /* launch state */ + movq VMXCTX_PMAP(%rdi), %rbx - VMX_CHECK_AST + /* Activate guest pmap on this cpu. */ + leaq PM_ACTIVE(%rbx), %rdi + movl %gs:CPU_ID, %esi + call cpuset_atomic_add + movq %r12, %rdi /* - * Restore guest state that is not automatically loaded from the vmcs. + * If 'vmx->eptgen[curcpu]' is not identical to 'pmap->pm_eptgen' + * then we must invalidate all mappings associated with this EPTP. */ - VMX_GUEST_RESTORE + movq PM_EPTGEN(%rbx), %r10 + movl %gs:CPU_ID, %eax + cmpq %r10, VMX_EPTGEN(%r13, %rax, 8) + je guest_restore + + /* Refresh 'vmx->eptgen[curcpu]' */ + movq %r10, VMX_EPTGEN(%r13, %rax, 8) + + /* Setup the invept descriptor on the host stack */ + pushq $0x0 + pushq VMX_EPTP(%r13) + movl $0x1, %eax /* Single context invalidate */ + invept (%rsp), %rax + leaq 0x10(%rsp), %rsp + jbe invept_error /* Check invept instruction error */ +guest_restore: + /* Write the current %rsp into the VMCS to be restored on vmexit */ + movl $VMCS_HOST_RSP, %eax + vmwrite %rsp, %rax + jbe vmwrite_error + + /* Check if vmresume is adequate or a full vmlaunch is required */ + cmpl $0, %r14d + je do_launch + + VMX_GUEST_RESTORE vmresume + /* + * In the common case, 'vmresume' returns back to the host through + * 'vmx_exit_guest'. If there is an error we return VMX_VMRESUME_ERROR + * to the caller. 
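 * A failed vmresume falls through with RFLAGS.CF set
 * (VMfailInvalid) or RFLAGS.ZF set (VMfailValid); the moves
 * below leave the flags untouched, so decode_inst_error can
 * still test them to choose VM_FAIL_INVALID or VM_FAIL_VALID.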
+ */ + leaq VMXSTK_FP(%rsp), %rbp + movq VMXSTK_RDI(%rsp), %rdi + movl $VMX_VMRESUME_ERROR, %eax + jmp decode_inst_error +do_launch: + VMX_GUEST_RESTORE + vmlaunch /* - * Capture the reason why vmresume failed. + * In the common case, 'vmlaunch' returns back to the host through + * 'vmx_exit_guest'. If there is an error we return VMX_VMLAUNCH_ERROR + * to the caller. */ - VM_INSTRUCTION_ERROR(%eax) + leaq VMXSTK_FP(%rsp), %rbp + movq VMXSTK_RDI(%rsp), %rdi + movl $VMX_VMLAUNCH_ERROR, %eax + jmp decode_inst_error + +vmwrite_error: + movl $VMX_VMWRITE_ERROR, %eax + jmp decode_inst_error +invept_error: + movl $VMX_INVEPT_ERROR, %eax + jmp decode_inst_error +decode_inst_error: + movl $VM_FAIL_VALID, %r11d + jz inst_error + movl $VM_FAIL_INVALID, %r11d +inst_error: + movl %r11d, VMXCTX_INST_FAIL_STATUS(%rdi) - /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */ - movq %rsp,%rdi - movq $VMX_RETURN_VMRESUME,%rsi + movq VMXCTX_PMAP(%rdi), %rdi + leaq PM_ACTIVE(%rdi), %rdi + movl %gs:CPU_ID, %esi + movq %rax, %r12 + call cpuset_atomic_del + movq %r12, %rax - addq $VMXCTX_TMPSTKTOP,%rsp - callq vmx_return -END(vmx_resume) + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 + + VMX_GUEST_FLUSH_SCRATCH + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret /* - * void vmx_launch(struct vmxctx *ctxp) - * %rdi = ctxp - * - * Although the return type is a 'void' this function may return indirectly - * through vmx_setjmp() with a return value of 3. + * Non-error VM-exit from the guest. Make this a label so it can + * be used by C code when setting up the VMCS. + * The VMCS-restored %rsp points to the struct vmxctx */ -ENTRY(vmx_launch) - VMX_DISABLE_INTERRUPTS +.align ASM_ENTRY_ALIGN; +ALTENTRY(vmx_exit_guest) + /* Save guest state that is not automatically saved in the vmcs. */ + VMX_GUEST_SAVE - VMX_CHECK_AST + /* Deactivate guest pmap on this cpu. */ + movq VMXCTX_PMAP(%rdi), %rdi + leaq PM_ACTIVE(%rdi), %rdi + movl %gs:CPU_ID, %esi + call cpuset_atomic_del /* - * Restore guest state that is not automatically loaded from the vmcs. + * This will return to the caller of 'vmx_enter_guest()' with a return + * value of VMX_GUEST_VMEXIT. */ - VMX_GUEST_RESTORE + movl $VMX_GUEST_VMEXIT, %eax + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 - vmlaunch + VMX_GUEST_FLUSH_SCRATCH + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(vmx_enter_guest) + + + +.align ASM_ENTRY_ALIGN; +ALTENTRY(vmx_exit_guest_flush_rsb) + /* Save guest state that is not automatically saved in the vmcs. */ + VMX_GUEST_SAVE + + /* Deactivate guest pmap on this cpu. */ + movq VMXCTX_PMAP(%rdi), %rdi + leaq PM_ACTIVE(%rdi), %rdi + movl %gs:CPU_ID, %esi + call cpuset_atomic_del + + VMX_GUEST_FLUSH_SCRATCH /* - * Capture the reason why vmlaunch failed. + * To prevent malicious branch target predictions from affecting the + * host, overwrite all entries in the RSB upon exiting a guest. */ - VM_INSTRUCTION_ERROR(%eax) + movl $16, %ecx /* 16 iterations, two calls per loop */ + movq %rsp, %rax +loop: + call 2f /* create an RSB entry. */ +1: + pause + call 1b /* capture rogue speculation. */ +2: + call 2f /* create an RSB entry. */ +1: + pause + call 1b /* capture rogue speculation. 
*/ +2: + subl $1, %ecx + jnz loop + movq %rax, %rsp - /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */ - movq %rsp,%rdi - movq $VMX_RETURN_VMLAUNCH,%rsi + /* + * This will return to the caller of 'vmx_enter_guest()' with a return + * value of VMX_GUEST_VMEXIT. + */ + movl $VMX_GUEST_VMEXIT, %eax + movq VMXSTK_RBX(%rsp), %rbx + movq VMXSTK_R12(%rsp), %r12 + movq VMXSTK_R13(%rsp), %r13 + movq VMXSTK_R14(%rsp), %r14 + movq VMXSTK_R15(%rsp), %r15 + + addq $VMXSTKSIZE, %rsp + popq %rbp + ret +SET_SIZE(vmx_exit_guest_flush_rsb) + +/* + * %rdi = trapno + * + * We need to do enough to convince cmnint - and its iretting tail - that we're + * a legit interrupt stack frame. + */ +ENTRY_NP(vmx_call_isr) + pushq %rbp + movq %rsp, %rbp + movq %rsp, %r11 + andq $~0xf, %rsp /* align stack */ + pushq $KDS_SEL /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KCS_SEL /* %cs */ + leaq .iret_dest(%rip), %rcx + pushq %rcx /* %rip */ + pushq $0 /* err */ + pushq %rdi /* trapno */ + cli + jmp cmnint /* %rip (and call) */ +.iret_dest: + popq %rbp + ret +SET_SIZE(vmx_call_isr) - addq $VMXCTX_TMPSTKTOP,%rsp - callq vmx_return -END(vmx_launch) +#endif /* lint */ diff --git a/usr/src/uts/i86pc/io/vmm/intel/vtd.c b/usr/src/uts/i86pc/io/vmm/intel/vtd.c new file mode 100644 index 0000000000..9474b30fc6 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/intel/vtd.c @@ -0,0 +1,690 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <vm/vm.h> +#include <vm/pmap.h> + +#include <dev/pci/pcireg.h> + +#include <machine/vmparam.h> +#include <contrib/dev/acpica/include/acpi.h> + +#include "io/iommu.h" + +/* + * Documented in the "Intel Virtualization Technology for Directed I/O", + * Architecture Spec, September 2008. 
+ */ + +/* Section 10.4 "Register Descriptions" */ +struct vtdmap { + volatile uint32_t version; + volatile uint32_t res0; + volatile uint64_t cap; + volatile uint64_t ext_cap; + volatile uint32_t gcr; + volatile uint32_t gsr; + volatile uint64_t rta; + volatile uint64_t ccr; +}; + +#define VTD_CAP_SAGAW(cap) (((cap) >> 8) & 0x1F) +#define VTD_CAP_ND(cap) ((cap) & 0x7) +#define VTD_CAP_CM(cap) (((cap) >> 7) & 0x1) +#define VTD_CAP_SPS(cap) (((cap) >> 34) & 0xF) +#define VTD_CAP_RWBF(cap) (((cap) >> 4) & 0x1) + +#define VTD_ECAP_DI(ecap) (((ecap) >> 2) & 0x1) +#define VTD_ECAP_COHERENCY(ecap) ((ecap) & 0x1) +#define VTD_ECAP_IRO(ecap) (((ecap) >> 8) & 0x3FF) + +#define VTD_GCR_WBF (1 << 27) +#define VTD_GCR_SRTP (1 << 30) +#define VTD_GCR_TE (1U << 31) + +#define VTD_GSR_WBFS (1 << 27) +#define VTD_GSR_RTPS (1 << 30) +#define VTD_GSR_TES (1U << 31) + +#define VTD_CCR_ICC (1UL << 63) /* invalidate context cache */ +#define VTD_CCR_CIRG_GLOBAL (1UL << 61) /* global invalidation */ + +#define VTD_IIR_IVT (1UL << 63) /* invalidation IOTLB */ +#define VTD_IIR_IIRG_GLOBAL (1ULL << 60) /* global IOTLB invalidation */ +#define VTD_IIR_IIRG_DOMAIN (2ULL << 60) /* domain IOTLB invalidation */ +#define VTD_IIR_IIRG_PAGE (3ULL << 60) /* page IOTLB invalidation */ +#define VTD_IIR_DRAIN_READS (1ULL << 49) /* drain pending DMA reads */ +#define VTD_IIR_DRAIN_WRITES (1ULL << 48) /* drain pending DMA writes */ +#define VTD_IIR_DOMAIN_P 32 + +#define VTD_ROOT_PRESENT 0x1 +#define VTD_CTX_PRESENT 0x1 +#define VTD_CTX_TT_ALL (1UL << 2) + +#define VTD_PTE_RD (1UL << 0) +#define VTD_PTE_WR (1UL << 1) +#define VTD_PTE_SUPERPAGE (1UL << 7) +#define VTD_PTE_ADDR_M (0x000FFFFFFFFFF000UL) + +#define VTD_RID2IDX(rid) (((rid) & 0xff) * 2) + +struct domain { + uint64_t *ptp; /* first level page table page */ + int pt_levels; /* number of page table levels */ + int addrwidth; /* 'AW' field in context entry */ + int spsmask; /* supported super page sizes */ + u_int id; /* domain id */ + vm_paddr_t maxaddr; /* highest address to be mapped */ + SLIST_ENTRY(domain) next; +}; + +static SLIST_HEAD(, domain) domhead; + +#define DRHD_MAX_UNITS 8 +static int drhd_num; +static struct vtdmap *vtdmaps[DRHD_MAX_UNITS]; +static int max_domains; +typedef int (*drhd_ident_func_t)(void); + +static uint64_t root_table[PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); +static uint64_t ctx_tables[256][PAGE_SIZE / sizeof(uint64_t)] __aligned(4096); + +static MALLOC_DEFINE(M_VTD, "vtd", "vtd"); + +static int +vtd_max_domains(struct vtdmap *vtdmap) +{ + int nd; + + nd = VTD_CAP_ND(vtdmap->cap); + + switch (nd) { + case 0: + return (16); + case 1: + return (64); + case 2: + return (256); + case 3: + return (1024); + case 4: + return (4 * 1024); + case 5: + return (16 * 1024); + case 6: + return (64 * 1024); + default: + panic("vtd_max_domains: invalid value of nd (0x%0x)", nd); + } +} + +static u_int +domain_id(void) +{ + u_int id; + struct domain *dom; + + /* Skip domain id 0 - it is reserved when Caching Mode field is set */ + for (id = 1; id < max_domains; id++) { + SLIST_FOREACH(dom, &domhead, next) { + if (dom->id == id) + break; + } + if (dom == NULL) + break; /* found it */ + } + + if (id >= max_domains) + panic("domain ids exhausted"); + + return (id); +} + +static void +vtd_wbflush(struct vtdmap *vtdmap) +{ + + if (VTD_ECAP_COHERENCY(vtdmap->ext_cap) == 0) + pmap_invalidate_cache(); + + if (VTD_CAP_RWBF(vtdmap->cap)) { + vtdmap->gcr = VTD_GCR_WBF; + while ((vtdmap->gsr & VTD_GSR_WBFS) != 0) + ; + } +} + +static void 
+vtd_ctx_global_invalidate(struct vtdmap *vtdmap) +{ + + vtdmap->ccr = VTD_CCR_ICC | VTD_CCR_CIRG_GLOBAL; + while ((vtdmap->ccr & VTD_CCR_ICC) != 0) + ; +} + +static void +vtd_iotlb_global_invalidate(struct vtdmap *vtdmap) +{ + int offset; + volatile uint64_t *iotlb_reg, val; + + vtd_wbflush(vtdmap); + + offset = VTD_ECAP_IRO(vtdmap->ext_cap) * 16; + iotlb_reg = (volatile uint64_t *)((caddr_t)vtdmap + offset + 8); + + *iotlb_reg = VTD_IIR_IVT | VTD_IIR_IIRG_GLOBAL | + VTD_IIR_DRAIN_READS | VTD_IIR_DRAIN_WRITES; + + while (1) { + val = *iotlb_reg; + if ((val & VTD_IIR_IVT) == 0) + break; + } +} + +static void +vtd_translation_enable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = VTD_GCR_TE; + while ((vtdmap->gsr & VTD_GSR_TES) == 0) + ; +} + +static void +vtd_translation_disable(struct vtdmap *vtdmap) +{ + + vtdmap->gcr = 0; + while ((vtdmap->gsr & VTD_GSR_TES) != 0) + ; +} + +static int +vtd_init(void) +{ + int i, units, remaining; + struct vtdmap *vtdmap; + vm_paddr_t ctx_paddr; + char *end, envname[32]; + unsigned long mapaddr; + ACPI_STATUS status; + ACPI_TABLE_DMAR *dmar; + ACPI_DMAR_HEADER *hdr; + ACPI_DMAR_HARDWARE_UNIT *drhd; + + /* + * Allow the user to override the ACPI DMAR table by specifying the + * physical address of each remapping unit. + * + * The following example specifies two remapping units at + * physical addresses 0xfed90000 and 0xfeda0000 respectively. + * set vtd.regmap.0.addr=0xfed90000 + * set vtd.regmap.1.addr=0xfeda0000 + */ + for (units = 0; units < DRHD_MAX_UNITS; units++) { + snprintf(envname, sizeof(envname), "vtd.regmap.%d.addr", units); + if (getenv_ulong(envname, &mapaddr) == 0) + break; + vtdmaps[units] = (struct vtdmap *)PHYS_TO_DMAP(mapaddr); + } + + if (units > 0) + goto skip_dmar; + + /* Search for DMAR table. */ + status = AcpiGetTable(ACPI_SIG_DMAR, 0, (ACPI_TABLE_HEADER **)&dmar); + if (ACPI_FAILURE(status)) + return (ENXIO); + + end = (char *)dmar + dmar->Header.Length; + remaining = dmar->Header.Length - sizeof(ACPI_TABLE_DMAR); + while (remaining > sizeof(ACPI_DMAR_HEADER)) { + hdr = (ACPI_DMAR_HEADER *)(end - remaining); + if (hdr->Length > remaining) + break; + /* + * From Intel VT-d arch spec, version 1.3: + * BIOS implementations must report mapping structures + * in numerical order, i.e. All remapping structures of + * type 0 (DRHD) enumerated before remapping structures of + * type 1 (RMRR) and so forth. 
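+	 *
+	 * The scan below relies on that ordering and stops at the first
+	 * structure that is not a DRHD.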
+	 */
+	if (hdr->Type != ACPI_DMAR_TYPE_HARDWARE_UNIT)
+		break;
+
+	drhd = (ACPI_DMAR_HARDWARE_UNIT *)hdr;
+	vtdmaps[units++] = (struct vtdmap *)PHYS_TO_DMAP(drhd->Address);
+	if (units >= DRHD_MAX_UNITS)
+		break;
+	remaining -= hdr->Length;
+	}
+
+	if (units <= 0)
+		return (ENXIO);
+
+skip_dmar:
+	drhd_num = units;
+	vtdmap = vtdmaps[0];
+
+	if (VTD_CAP_CM(vtdmap->cap) != 0)
+		panic("vtd_init: invalid caching mode");
+
+	max_domains = vtd_max_domains(vtdmap);
+
+	/*
+	 * Set up the root-table to point to the context-entry tables
+	 */
+	for (i = 0; i < 256; i++) {
+		ctx_paddr = vtophys(ctx_tables[i]);
+		if (ctx_paddr & PAGE_MASK)
+			panic("ctx table (0x%0lx) not page aligned", ctx_paddr);
+
+		root_table[i * 2] = ctx_paddr | VTD_ROOT_PRESENT;
+	}
+
+	return (0);
+}
+
+static void
+vtd_cleanup(void)
+{
+}
+
+static void
+vtd_enable(void)
+{
+	int i;
+	struct vtdmap *vtdmap;
+
+	for (i = 0; i < drhd_num; i++) {
+		vtdmap = vtdmaps[i];
+		vtd_wbflush(vtdmap);
+
+		/* Update the root table address */
+		vtdmap->rta = vtophys(root_table);
+		vtdmap->gcr = VTD_GCR_SRTP;
+		while ((vtdmap->gsr & VTD_GSR_RTPS) == 0)
+			;
+
+		vtd_ctx_global_invalidate(vtdmap);
+		vtd_iotlb_global_invalidate(vtdmap);
+
+		vtd_translation_enable(vtdmap);
+	}
+}
+
+static void
+vtd_disable(void)
+{
+	int i;
+	struct vtdmap *vtdmap;
+
+	for (i = 0; i < drhd_num; i++) {
+		vtdmap = vtdmaps[i];
+		vtd_translation_disable(vtdmap);
+	}
+}
+
+static void
+vtd_add_device(void *arg, uint16_t rid)
+{
+	int idx;
+	uint64_t *ctxp;
+	struct domain *dom = arg;
+	vm_paddr_t pt_paddr;
+	struct vtdmap *vtdmap;
+	uint8_t bus;
+
+	vtdmap = vtdmaps[0];
+	bus = PCI_RID2BUS(rid);
+	ctxp = ctx_tables[bus];
+	pt_paddr = vtophys(dom->ptp);
+	idx = VTD_RID2IDX(rid);
+
+	if (ctxp[idx] & VTD_CTX_PRESENT) {
+		panic("vtd_add_device: device %x is already owned by "
+		    "domain %d", rid,
+		    (uint16_t)(ctxp[idx + 1] >> 8));
+	}
+
+	/*
+	 * Order is important. The 'present' bit is set only after all fields
+	 * of the context pointer are initialized.
+	 */
+	ctxp[idx + 1] = dom->addrwidth | (dom->id << 8);
+
+	if (VTD_ECAP_DI(vtdmap->ext_cap))
+		ctxp[idx] = VTD_CTX_TT_ALL;
+	else
+		ctxp[idx] = 0;
+
+	ctxp[idx] |= pt_paddr | VTD_CTX_PRESENT;
+
+	/*
+	 * 'Not Present' entries are not cached in either the Context Cache
+	 * or in the IOTLB, so there is no need to invalidate either of them.
+	 */
+}
+
+static void
+vtd_remove_device(void *arg, uint16_t rid)
+{
+	int i, idx;
+	uint64_t *ctxp;
+	struct vtdmap *vtdmap;
+	uint8_t bus;
+
+	bus = PCI_RID2BUS(rid);
+	ctxp = ctx_tables[bus];
+	idx = VTD_RID2IDX(rid);
+
+	/*
+	 * Order is important. The 'present' bit must be cleared first.
+	 */
+	ctxp[idx] = 0;
+	ctxp[idx + 1] = 0;
+
+	/*
+	 * Invalidate the Context Cache and the IOTLB.
+ * + * XXX use device-selective invalidation for Context Cache + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_ctx_global_invalidate(vtdmap); + vtd_iotlb_global_invalidate(vtdmap); + } +} + +#define CREATE_MAPPING 0 +#define REMOVE_MAPPING 1 + +static uint64_t +vtd_update_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len, + int remove) +{ + struct domain *dom; + int i, spshift, ptpshift, ptpindex, nlevels; + uint64_t spsize, *ptp; + + dom = arg; + ptpindex = 0; + ptpshift = 0; + + KASSERT(gpa + len > gpa, ("%s: invalid gpa range %#lx/%#lx", __func__, + gpa, len)); + KASSERT(gpa + len <= dom->maxaddr, ("%s: gpa range %#lx/%#lx beyond " + "domain maxaddr %#lx", __func__, gpa, len, dom->maxaddr)); + + if (gpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned gpa 0x%0lx", gpa); + + if (hpa & PAGE_MASK) + panic("vtd_create_mapping: unaligned hpa 0x%0lx", hpa); + + if (len & PAGE_MASK) + panic("vtd_create_mapping: unaligned len 0x%0lx", len); + + /* + * Compute the size of the mapping that we can accommodate. + * + * This is based on three factors: + * - supported super page size + * - alignment of the region starting at 'gpa' and 'hpa' + * - length of the region 'len' + */ + spshift = 48; + for (i = 3; i >= 0; i--) { + spsize = 1UL << spshift; + if ((dom->spsmask & (1 << i)) != 0 && + (gpa & (spsize - 1)) == 0 && + (hpa & (spsize - 1)) == 0 && + (len >= spsize)) { + break; + } + spshift -= 9; + } + + ptp = dom->ptp; + nlevels = dom->pt_levels; + while (--nlevels >= 0) { + ptpshift = 12 + nlevels * 9; + ptpindex = (gpa >> ptpshift) & 0x1FF; + + /* We have reached the leaf mapping */ + if (spshift >= ptpshift) { + break; + } + + /* + * We are working on a non-leaf page table page. + * + * Create a downstream page table page if necessary and point + * to it from the current page table. + */ + if (ptp[ptpindex] == 0) { + void *nlp = malloc(PAGE_SIZE, M_VTD, M_WAITOK | M_ZERO); + ptp[ptpindex] = vtophys(nlp)| VTD_PTE_RD | VTD_PTE_WR; + } + + ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & VTD_PTE_ADDR_M); + } + + if ((gpa & ((1UL << ptpshift) - 1)) != 0) + panic("gpa 0x%lx and ptpshift %d mismatch", gpa, ptpshift); + + /* + * Update the 'gpa' -> 'hpa' mapping + */ + if (remove) { + ptp[ptpindex] = 0; + } else { + ptp[ptpindex] = hpa | VTD_PTE_RD | VTD_PTE_WR; + + if (nlevels > 0) + ptp[ptpindex] |= VTD_PTE_SUPERPAGE; + } + + return (1UL << ptpshift); +} + +static uint64_t +vtd_create_mapping(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, hpa, len, CREATE_MAPPING)); +} + +static uint64_t +vtd_remove_mapping(void *arg, vm_paddr_t gpa, uint64_t len) +{ + + return (vtd_update_mapping(arg, gpa, 0, len, REMOVE_MAPPING)); +} + +static void +vtd_invalidate_tlb(void *dom) +{ + int i; + struct vtdmap *vtdmap; + + /* + * Invalidate the IOTLB. + * XXX use domain-selective invalidation for IOTLB + */ + for (i = 0; i < drhd_num; i++) { + vtdmap = vtdmaps[i]; + vtd_iotlb_global_invalidate(vtdmap); + } +} + +static void * +vtd_create_domain(vm_paddr_t maxaddr) +{ + struct domain *dom; + vm_paddr_t addr; + int tmp, i, gaw, agaw, sagaw, res, pt_levels, addrwidth; + struct vtdmap *vtdmap; + + if (drhd_num <= 0) + panic("vtd_create_domain: no dma remapping hardware available"); + + vtdmap = vtdmaps[0]; + + /* + * Calculate AGAW. + * Section 3.4.2 "Adjusted Guest Address Width", Architecture Spec. 
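+	 *
+	 * For example, maxaddr = 4 GiB makes the loop below exit with
+	 * gaw = 33 (1 << 32 is the first power of two >= maxaddr), giving
+	 * res = (33 - 12) % 9 = 3 and agaw = 33 + 9 - 3 = 39, i.e. a
+	 * 3-level page-table walk (12 + 3 * 9 = 39 bits).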
+ */ + addr = 0; + for (gaw = 0; addr < maxaddr; gaw++) + addr = 1ULL << gaw; + + res = (gaw - 12) % 9; + if (res == 0) + agaw = gaw; + else + agaw = gaw + 9 - res; + + if (agaw > 64) + agaw = 64; + + /* + * Select the smallest Supported AGAW and the corresponding number + * of page table levels. + */ + pt_levels = 2; + sagaw = 30; + addrwidth = 0; + tmp = VTD_CAP_SAGAW(vtdmap->cap); + for (i = 0; i < 5; i++) { + if ((tmp & (1 << i)) != 0 && sagaw >= agaw) + break; + pt_levels++; + addrwidth++; + sagaw += 9; + if (sagaw > 64) + sagaw = 64; + } + + if (i >= 5) { + panic("vtd_create_domain: SAGAW 0x%lx does not support AGAW %d", + VTD_CAP_SAGAW(vtdmap->cap), agaw); + } + + dom = malloc(sizeof(struct domain), M_VTD, M_ZERO | M_WAITOK); + dom->pt_levels = pt_levels; + dom->addrwidth = addrwidth; + dom->id = domain_id(); + dom->maxaddr = maxaddr; + dom->ptp = malloc(PAGE_SIZE, M_VTD, M_ZERO | M_WAITOK); + if ((uintptr_t)dom->ptp & PAGE_MASK) + panic("vtd_create_domain: ptp (%p) not page aligned", dom->ptp); + +#ifdef notyet + /* + * XXX superpage mappings for the iommu do not work correctly. + * + * By default all physical memory is mapped into the host_domain. + * When a VM is allocated wired memory the pages belonging to it + * are removed from the host_domain and added to the vm's domain. + * + * If the page being removed was mapped using a superpage mapping + * in the host_domain then we need to demote the mapping before + * removing the page. + * + * There is not any code to deal with the demotion at the moment + * so we disable superpage mappings altogether. + */ + dom->spsmask = VTD_CAP_SPS(vtdmap->cap); +#endif + + SLIST_INSERT_HEAD(&domhead, dom, next); + + return (dom); +} + +static void +vtd_free_ptp(uint64_t *ptp, int level) +{ + int i; + uint64_t *nlp; + + if (level > 1) { + for (i = 0; i < 512; i++) { + if ((ptp[i] & (VTD_PTE_RD | VTD_PTE_WR)) == 0) + continue; + if ((ptp[i] & VTD_PTE_SUPERPAGE) != 0) + continue; + nlp = (uint64_t *)PHYS_TO_DMAP(ptp[i] & VTD_PTE_ADDR_M); + vtd_free_ptp(nlp, level - 1); + } + } + + bzero(ptp, PAGE_SIZE); + free(ptp, M_VTD); +} + +static void +vtd_destroy_domain(void *arg) +{ + struct domain *dom; + + dom = arg; + + SLIST_REMOVE(&domhead, dom, domain, next); + vtd_free_ptp(dom->ptp, dom->pt_levels); + free(dom, M_VTD); +} + +struct iommu_ops iommu_ops_intel = { + vtd_init, + vtd_cleanup, + vtd_enable, + vtd_disable, + vtd_create_domain, + vtd_destroy_domain, + vtd_create_mapping, + vtd_remove_mapping, + vtd_add_device, + vtd_remove_device, + vtd_invalidate_tlb, +}; diff --git a/usr/src/uts/i86pc/io/vmm/io/iommu.h b/usr/src/uts/i86pc/io/vmm/io/iommu.h new file mode 100644 index 0000000000..f8003a5d45 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/iommu.h @@ -0,0 +1,76 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_IOMMU_H_ +#define _IO_IOMMU_H_ + +typedef int (*iommu_init_func_t)(void); +typedef void (*iommu_cleanup_func_t)(void); +typedef void (*iommu_enable_func_t)(void); +typedef void (*iommu_disable_func_t)(void); +typedef void *(*iommu_create_domain_t)(vm_paddr_t maxaddr); +typedef void (*iommu_destroy_domain_t)(void *domain); +typedef uint64_t (*iommu_create_mapping_t)(void *domain, vm_paddr_t gpa, + vm_paddr_t hpa, uint64_t len); +typedef uint64_t (*iommu_remove_mapping_t)(void *domain, vm_paddr_t gpa, + uint64_t len); +typedef void (*iommu_add_device_t)(void *domain, uint16_t rid); +typedef void (*iommu_remove_device_t)(void *dom, uint16_t rid); +typedef void (*iommu_invalidate_tlb_t)(void *dom); + +struct iommu_ops { + iommu_init_func_t init; /* module wide */ + iommu_cleanup_func_t cleanup; + iommu_enable_func_t enable; + iommu_disable_func_t disable; + + iommu_create_domain_t create_domain; /* domain-specific */ + iommu_destroy_domain_t destroy_domain; + iommu_create_mapping_t create_mapping; + iommu_remove_mapping_t remove_mapping; + iommu_add_device_t add_device; + iommu_remove_device_t remove_device; + iommu_invalidate_tlb_t invalidate_tlb; +}; + +extern struct iommu_ops iommu_ops_intel; +extern struct iommu_ops iommu_ops_amd; + +void iommu_cleanup(void); +void *iommu_host_domain(void); +void *iommu_create_domain(vm_paddr_t maxaddr); +void iommu_destroy_domain(void *dom); +void iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, + size_t len); +void iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len); +void iommu_add_device(void *dom, uint16_t rid); +void iommu_remove_device(void *dom, uint16_t rid); +void iommu_invalidate_tlb(void *domain); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/ppt.h b/usr/src/uts/i86pc/io/vmm/io/ppt.h new file mode 100644 index 0000000000..686b15db49 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/ppt.h @@ -0,0 +1,56 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _IO_PPT_H_ +#define _IO_PPT_H_ + +int ppt_unassign_all(struct vm *vm); +int ppt_map_mmio(struct vm *vm, int bus, int slot, int func, + vm_paddr_t gpa, size_t len, vm_paddr_t hpa); +int ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + uint64_t addr, uint64_t msg, int numvec); +int ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, + int idx, uint64_t addr, uint64_t msg, uint32_t vector_control); +int ppt_assigned_devices(struct vm *vm); +boolean_t ppt_is_mmio(struct vm *vm, vm_paddr_t gpa); + +/* + * Returns the number of devices sequestered by the ppt driver for assignment + * to virtual machines. + */ +int ppt_avail_devices(void); + +/* + * The following functions should never be called directly. + * Use 'vm_assign_pptdev()' and 'vm_unassign_pptdev()' instead. + */ +int ppt_assign_device(struct vm *vm, int bus, int slot, int func); +int ppt_unassign_device(struct vm *vm, int bus, int slot, int func); +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/sol_iommu.c b/usr/src/uts/i86pc/io/vmm/io/sol_iommu.c new file mode 100644 index 0000000000..989e88e17b --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/sol_iommu.c @@ -0,0 +1,86 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/cmn_err.h> + +/* + * IOMMU Stub + * + * Until proper iommu support can be wired into bhyve, stub out all the + * functions to either fail, if reasonable, or panic. 
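+ *
+ * Concretely: the domain create/lookup routines fail softly by
+ * returning NULL, cleanup is a no-op, and the remaining operations
+ * panic, since reaching them implies a passthrough configuration
+ * this stub cannot support.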
+ */ + +void +iommu_cleanup(void) +{ +} + +void * +iommu_host_domain(void) +{ + return (NULL); +} + +/*ARGSUSED*/ +void * +iommu_create_domain(vm_paddr_t maxaddr) +{ + return (NULL); +} + +/*ARGSUSED*/ +void +iommu_destroy_domain(void *dom) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_create_mapping(void *dom, vm_paddr_t gpa, vm_paddr_t hpa, size_t len) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_remove_mapping(void *dom, vm_paddr_t gpa, size_t len) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_add_device(void *dom, uint16_t rid) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_remove_device(void *dom, uint16_t rid) +{ + panic("unimplemented"); +} + +/*ARGSUSED*/ +void +iommu_invalidate_tlb(void *domain) +{ + panic("unimplemented"); +} + diff --git a/usr/src/uts/i86pc/io/vmm/io/sol_ppt.c b/usr/src/uts/i86pc/io/vmm/io/sol_ppt.c new file mode 100644 index 0000000000..9d5b1f5cdc --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/sol_ppt.c @@ -0,0 +1,92 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/errno.h> +#include <sys/cmn_err.h> + +#include <sys/vmm.h> + +/* + * PCI Pass-Through Stub + * + * Until proper passthrough support can be wired into bhyve, stub out all the + * functions to either fail or no-op. + */ + +int +ppt_unassign_all(struct vm *vm) +{ + return (0); +} + +/*ARGSUSED*/ +int +ppt_map_mmio(struct vm *vm, int bus, int slot, int func, vm_paddr_t gpa, + size_t len, vm_paddr_t hpa) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +int +ppt_setup_msi(struct vm *vm, int vcpu, int bus, int slot, int func, + uint64_t addr, uint64_t msg, int numvec) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +int +ppt_setup_msix(struct vm *vm, int vcpu, int bus, int slot, int func, int idx, + uint64_t addr, uint64_t msg, uint32_t vector_control) +{ + return (ENXIO); +} + +/*ARGSUSED*/ +int +ppt_assigned_devices(struct vm *vm) +{ + return (0); +} + +/*ARGSUSED*/ +boolean_t +ppt_is_mmio(struct vm *vm, vm_paddr_t gpa) +{ + return (B_FALSE); +} + +/*ARGSUSED*/ +int +ppt_avail_devices(void) +{ + return (0); +} + +/*ARGSUSED*/ +int +ppt_assign_device(struct vm *vm, int bus, int slot, int func) +{ + return (ENOENT); +} + +/*ARGSUSED*/ +int +ppt_unassign_device(struct vm *vm, int bus, int slot, int func) +{ + return (ENXIO); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.c b/usr/src/uts/i86pc/io/vmm/io/vatpic.c index a93b252c91..ba4cd7785e 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. 
* @@ -25,12 +27,11 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vatpic.c 279683 2015-03-06 02:05:45Z tychon $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/types.h> #include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.h b/usr/src/uts/i86pc/io/vmm/io/vatpic.h index ef5e51b158..d4a1be1820 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpic.h +++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.h @@ -23,7 +23,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vatpic.h 273706 2014-10-26 19:03:06Z neel $ + * $FreeBSD$ */ #ifndef _VATPIC_H_ diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.c b/usr/src/uts/i86pc/io/vmm/io/vatpit.c index ce17bdc92c..9b3e7376d5 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpit.c +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.c @@ -1,4 +1,5 @@ /*- + * Copyright (c) 2018 Joyent, Inc. * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2011 NetApp, Inc. * All rights reserved. @@ -26,12 +27,11 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vatpit.c 273706 2014-10-26 19:03:06Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/types.h> #include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> @@ -79,7 +79,7 @@ struct vatpit_callout_arg { struct channel { int mode; uint16_t initial; /* initial counter value */ - sbintime_t now_sbt; /* uptime when counter was loaded */ + struct bintime now_bt; /* uptime when counter was loaded */ uint8_t cr[2]; uint8_t ol[2]; bool slatched; /* status latched */ @@ -88,7 +88,7 @@ struct channel { int olbyte; int frbyte; struct callout callout; - sbintime_t callout_sbt; /* target time */ + struct bintime callout_bt; /* target time */ struct vatpit_callout_arg callout_arg; }; @@ -96,26 +96,41 @@ struct vatpit { struct vm *vm; struct mtx mtx; - sbintime_t freq_sbt; + struct bintime freq_bt; struct channel channel[3]; }; static void pit_timer_start_cntr0(struct vatpit *vatpit); +static uint64_t +vatpit_delta_ticks(struct vatpit *vatpit, struct channel *c) +{ + struct bintime delta; + uint64_t result; + + binuptime(&delta); + bintime_sub(&delta, &c->now_bt); + + result = delta.sec * PIT_8254_FREQ; + result += delta.frac / vatpit->freq_bt.frac; + + return (result); +} + static int vatpit_get_out(struct vatpit *vatpit, int channel) { struct channel *c; - sbintime_t delta_ticks; + uint64_t delta_ticks; int out; c = &vatpit->channel[channel]; switch (c->mode) { case TIMER_INTTC: - delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; - out = ((c->initial - delta_ticks) <= 0); + delta_ticks = vatpit_delta_ticks(vatpit, c); + out = (delta_ticks >= c->initial); break; default: out = 0; @@ -165,24 +180,28 @@ static void pit_timer_start_cntr0(struct vatpit *vatpit) { struct channel *c; - sbintime_t now, delta, precision; c = &vatpit->channel[0]; if (c->initial != 0) { - delta = c->initial * vatpit->freq_sbt; - precision = delta >> tc_precexp; - c->callout_sbt = c->callout_sbt + delta; + sbintime_t precision; + struct bintime now, delta; + + delta.sec = 0; + delta.frac = vatpit->freq_bt.frac * c->initial; + bintime_add(&c->callout_bt, &delta); + precision = bttosbt(delta) >> tc_precexp; /* - * Reset 'callout_sbt' if the time that the callout - * was supposed to fire is more than 'c->initial' 
- * ticks in the past. + * Reset 'callout_bt' if the time that the callout was supposed + * to fire is more than 'c->initial' ticks in the past. */ - now = sbinuptime(); - if (c->callout_sbt < now) - c->callout_sbt = now + delta; + binuptime(&now); + if (bintime_cmp(&c->callout_bt, &now, <)) { + c->callout_bt = now; + bintime_add(&c->callout_bt, &delta); + } - callout_reset_sbt(&c->callout, c->callout_sbt, + callout_reset_sbt(&c->callout, bttosbt(c->callout_bt), precision, vatpit_callout_handler, &c->callout_arg, C_ABSOLUTE); } @@ -192,7 +211,7 @@ static uint16_t pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) { uint16_t lval; - sbintime_t delta_ticks; + uint64_t delta_ticks; /* cannot latch a new value until the old one has been consumed */ if (latch && c->olbyte != 0) @@ -208,12 +227,11 @@ pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch) * here. */ c->initial = TIMER_DIV(PIT_8254_FREQ, 100); - c->now_sbt = sbinuptime(); + binuptime(&c->now_bt); c->status &= ~TIMER_STS_NULLCNT; } - delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt; - + delta_ticks = vatpit_delta_ticks(vatpit, c); lval = c->initial - delta_ticks % c->initial; if (latch) { @@ -384,10 +402,10 @@ vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, c->frbyte = 0; c->crbyte = 0; c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8; - c->now_sbt = sbinuptime(); + binuptime(&c->now_bt); /* Start an interval timer for channel 0 */ if (port == TIMER_CNTR0) { - c->callout_sbt = c->now_sbt; + c->callout_bt = c->now_bt; pit_timer_start_cntr0(vatpit); } if (c->initial == 0) @@ -424,7 +442,6 @@ struct vatpit * vatpit_init(struct vm *vm) { struct vatpit *vatpit; - struct bintime bt; struct vatpit_callout_arg *arg; int i; @@ -433,11 +450,10 @@ vatpit_init(struct vm *vm) mtx_init(&vatpit->mtx, "vatpit lock", NULL, MTX_SPIN); - FREQ2BT(PIT_8254_FREQ, &bt); - vatpit->freq_sbt = bttosbt(bt); + FREQ2BT(PIT_8254_FREQ, &vatpit->freq_bt); for (i = 0; i < 3; i++) { - callout_init(&vatpit->channel[i].callout, true); + callout_init(&vatpit->channel[i].callout, 1); arg = &vatpit->channel[i].callout_arg; arg->vatpit = vatpit; arg->channel_num = i; @@ -456,3 +472,16 @@ vatpit_cleanup(struct vatpit *vatpit) free(vatpit, M_VATPIT); } + +#ifndef __FreeBSD__ +void +vatpit_localize_resources(struct vatpit *vatpit) +{ + for (uint_t i = 0; i < 3; i++) { + /* Only localize channels which might be running */ + if (vatpit->channel[i].mode != 0) { + vmm_glue_callout_localize(&vatpit->channel[i].callout); + } + } +} +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.h b/usr/src/uts/i86pc/io/vmm/io/vatpit.h index f20ad73e47..4bf9fe048d 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vatpit.h +++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2011 NetApp, Inc. * All rights reserved. @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/sys/amd64/vmm/io/vatpit.h 273706 2014-10-26 19:03:06Z neel $ + * $FreeBSD$ */ #ifndef _VATPIT_H_ @@ -42,4 +44,8 @@ int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, uint32_t *eax); +#ifndef __FreeBSD__ +void vatpit_localize_resources(struct vatpit *); +#endif + #endif /* _VATPIT_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.c b/usr/src/uts/i86pc/io/vmm/io/vhpet.c index 25f6013da0..c82b4626bd 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vhpet.c +++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2013 Neel Natu <neel@freebsd.org> * All rights reserved. @@ -24,11 +26,15 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z tychon $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z tychon $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/lock.h> @@ -36,7 +42,6 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z ty #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/systm.h> -#include <sys/cpuset.h> #include <dev/acpica/acpi_hpet.h> @@ -52,7 +57,7 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z ty static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet"); -#define HPET_FREQ 10000000 /* 10.0 Mhz */ +#define HPET_FREQ 16777216 /* 16.7 (2^24) Mhz */ #define FS_PER_S 1000000000000000ul /* Timer N Configuration and Capabilities Register */ @@ -104,7 +109,6 @@ vhpet_capabilities(void) uint64_t cap = 0; cap |= 0x8086 << 16; /* vendor id */ - cap |= HPET_CAP_LEG_RT; /* legacy routing capable */ cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */ cap |= 1; /* revision */ cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */ @@ -127,15 +131,6 @@ vhpet_timer_msi_enabled(struct vhpet *vhpet, int n) { const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN; - /* - * LegacyReplacement Route configuration takes precedence over MSI - * for timers 0 and 1. - */ - if (n == 0 || n == 1) { - if (vhpet->config & HPET_CNF_LEG_RT) - return (false); - } - if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable) return (true); else @@ -152,41 +147,9 @@ vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n) if (vhpet_timer_msi_enabled(vhpet, n)) return (0); - if (vhpet->config & HPET_CNF_LEG_RT) { - /* - * In "legacy routing" timers 0 and 1 are connected to - * ioapic pins 2 and 8 respectively. - */ - switch (n) { - case 0: - return (2); - case 1: - return (8); - } - } - return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9); } -static __inline int -vhpet_timer_atpic_pin(struct vhpet *vhpet, int n) -{ - if (vhpet->config & HPET_CNF_LEG_RT) { - /* - * In "legacy routing" timers 0 and 1 are connected to - * 8259 master pin 0 and slave pin 0 respectively. - */ - switch (n) { - case 0: - return (0); - case 1: - return (8); - } - } - - return (-1); -} - static uint32_t vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) { @@ -211,7 +174,7 @@ vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) /* * The sbinuptime corresponding to the 'countbase' is * meaningless when the counter is disabled. 
Make sure - * that the the caller doesn't want to use it. + * that the caller doesn't want to use it. */ KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL")); } @@ -221,17 +184,12 @@ vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr) static void vhpet_timer_clear_isr(struct vhpet *vhpet, int n) { - int pin, legacy_pin; + int pin; if (vhpet->isr & (1 << n)) { pin = vhpet_timer_ioapic_pin(vhpet, n); KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n)); vioapic_deassert_irq(vhpet->vm, pin); - - legacy_pin = vhpet_timer_atpic_pin(vhpet, n); - if (legacy_pin != -1) - vatpic_deassert_irq(vhpet->vm, legacy_pin); - vhpet->isr &= ~(1 << n); } } @@ -257,12 +215,6 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n) KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: " "timer %d is using MSI", n)); - /* The legacy replacement interrupts are always edge triggered */ - if (vhpet->config & HPET_CNF_LEG_RT) { - if (n == 0 || n == 1) - return (true); - } - if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0) return (true); else @@ -272,7 +224,7 @@ vhpet_timer_edge_trig(struct vhpet *vhpet, int n) static void vhpet_timer_interrupt(struct vhpet *vhpet, int n) { - int pin, legacy_pin; + int pin; /* If interrupts are not enabled for this timer then just return. */ if (!vhpet_timer_interrupt_enabled(vhpet, n)) @@ -298,17 +250,11 @@ vhpet_timer_interrupt(struct vhpet *vhpet, int n) return; } - legacy_pin = vhpet_timer_atpic_pin(vhpet, n); - if (vhpet_timer_edge_trig(vhpet, n)) { vioapic_pulse_irq(vhpet->vm, pin); - if (legacy_pin != -1) - vatpic_pulse_irq(vhpet->vm, legacy_pin); } else { vhpet->isr |= 1 << n; vioapic_assert_irq(vhpet->vm, pin); - if (legacy_pin != -1) - vatpic_assert_irq(vhpet->vm, legacy_pin); } } @@ -402,10 +348,6 @@ vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now) { sbintime_t delta, precision; - /* If interrupts are not enabled for this timer then just return. */ - if (!vhpet_timer_interrupt_enabled(vhpet, n)) - return; - if (vhpet->timer[n].comprate != 0) vhpet_adjust_compval(vhpet, n, counter); else { @@ -588,6 +530,13 @@ vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size, counter = vhpet_counter(vhpet, nowptr); oldval = vhpet->config; update_register(&vhpet->config, data, mask); + + /* + * LegacyReplacement Routing is not supported so clear the + * bit explicitly. 
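+		 *
+		 * This matches vhpet_capabilities() above, which no
+		 * longer advertises HPET_CAP_LEG_RT to the guest.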
+ */ + vhpet->config &= ~HPET_CNF_LEG_RT; + if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) { if (vhpet_counter_enabled(vhpet)) { vhpet_start_counting(vhpet); @@ -777,8 +726,10 @@ vhpet_init(struct vm *vm) vhpet->freq_sbt = bttosbt(bt); pincount = vioapic_pincount(vm); - if (pincount >= 24) - allowed_irqs = 0x00f00000; /* irqs 20, 21, 22 and 23 */ + if (pincount >= 32) + allowed_irqs = 0xff000000; /* irqs 24-31 */ + else if (pincount >= 20) + allowed_irqs = 0xf << (pincount - 4); /* 4 upper irqs */ else allowed_irqs = 0; @@ -819,3 +770,12 @@ vhpet_getcap(struct vm_hpet_cap *cap) cap->capabilities = vhpet_capabilities(); return (0); } +#ifndef __FreeBSD__ +void +vhpet_localize_resources(struct vhpet *vhpet) +{ + for (uint_t i = 0; i < VHPET_NUM_TIMERS; i++) { + vmm_glue_callout_localize(&vhpet->timer[i].callout); + } +} +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.h b/usr/src/uts/i86pc/io/vmm/io/vhpet.h index 868809d166..8e28241b32 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vhpet.h +++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2013 Neel Natu <neel@freebsd.org> * All rights reserved. @@ -24,7 +26,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vhpet.h 258579 2013-11-25 19:04:51Z neel $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. */ #ifndef _VHPET_H_ @@ -41,4 +47,8 @@ int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val, int size, void *arg); int vhpet_getcap(struct vm_hpet_cap *cap); +#ifndef __FreeBSD__ +void vhpet_localize_resources(struct vhpet *vhpet); +#endif + #endif /* _VHPET_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.c b/usr/src/uts/i86pc/io/vmm/io/vioapic.c index 5adf5de16d..dbd3420420 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vioapic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2013 Neel Natu <neel@freebsd.org> * All rights reserved. @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -37,19 +39,20 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. 
*/ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/lock.h> #include <sys/mutex.h> #include <sys/systm.h> #include <sys/kernel.h> #include <sys/malloc.h> +#include <sys/cpuset.h> #include <x86/apicreg.h> #include <machine/vmm.h> @@ -62,7 +65,7 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z #define IOREGSEL 0x00 #define IOWIN 0x10 -#define REDIR_ENTRIES 24 +#define REDIR_ENTRIES 32 #define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS)) struct vioapic { @@ -234,48 +237,139 @@ vioapic_pulse_irq(struct vm *vm, int irq) return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE)); } +#define REDIR_IS_PHYS(reg) (((reg) & IOART_DESTMOD) == IOART_DESTPHY) +#define REDIR_IS_LOWPRIO(reg) (((reg) & IOART_DELMOD) == IOART_DELLOPRI) +/* Level-triggered interrupts only valid in fixed and low-priority modes */ +#define REDIR_IS_LVLTRIG(reg) \ + (((reg) & IOART_TRGRLVL) != 0 && \ + (((reg) & IOART_DELMOD) == IOART_DELFIXED || REDIR_IS_LOWPRIO(reg))) +#define REDIR_DEST(reg) ((reg) >> (32 + APIC_ID_SHIFT)) +#define REDIR_VECTOR(reg) ((reg) & IOART_INTVEC) + /* - * Reset the vlapic's trigger-mode register to reflect the ioapic pin - * configuration. + * Given a redirection entry, determine which vCPUs would be targeted. */ static void -vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg) +vioapic_calcdest(struct vioapic *vioapic, uint64_t redir_ent, cpuset_t *dmask) { - struct vioapic *vioapic; - struct vlapic *vlapic; - uint32_t low, high, dest; - int delmode, pin, vector; - bool level, phys; - vlapic = vm_lapic(vm, vcpuid); - vioapic = vm_ioapic(vm); + /* + * When calculating interrupt destinations with vlapic_calcdest(), the + * legacy xAPIC format is assumed, since the system lacks interrupt + * redirection hardware. + * See vlapic_deliver_intr() for more details. + */ + vlapic_calcdest(vioapic->vm, dmask, REDIR_DEST(redir_ent), + REDIR_IS_PHYS(redir_ent), REDIR_IS_LOWPRIO(redir_ent), false); +} + +/* + * Across all redirection entries utilizing a specified vector, determine the + * set of vCPUs which would be targeted by a level-triggered interrupt. 
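+ *
+ * The result lets vioapic_update_tmrs() below avoid clearing a TMR
+ * bit that some other redirection entry still requires.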
+ */ +static void +vioapic_tmr_active(struct vioapic *vioapic, uint8_t vec, cpuset_t *result) +{ + u_int i; + + CPU_ZERO(result); + if (vec == 0) { + return; + } + + for (i = 0; i < REDIR_ENTRIES; i++) { + cpuset_t dest; + const uint64_t val = vioapic->rtbl[i].reg; + + if (!REDIR_IS_LVLTRIG(val) || REDIR_VECTOR(val) != vec) { + continue; + } + + CPU_ZERO(&dest); + vioapic_calcdest(vioapic, val, &dest); + CPU_OR(result, &dest); + } +} + +/* + * Update TMR state in vLAPICs after changes to vIOAPIC pin configuration + */ +static void +vioapic_update_tmrs(struct vioapic *vioapic, int vcpuid, uint64_t oldval, + uint64_t newval) +{ + cpuset_t active, allset, newset, oldset; + struct vm *vm; + uint8_t newvec, oldvec; + + vm = vioapic->vm; + CPU_ZERO(&allset); + CPU_ZERO(&newset); + CPU_ZERO(&oldset); + newvec = oldvec = 0; + + if (REDIR_IS_LVLTRIG(oldval)) { + vioapic_calcdest(vioapic, oldval, &oldset); + CPU_OR(&allset, &oldset); + oldvec = REDIR_VECTOR(oldval); + } + + if (REDIR_IS_LVLTRIG(newval)) { + vioapic_calcdest(vioapic, newval, &newset); + CPU_OR(&allset, &newset); + newvec = REDIR_VECTOR(newval); + } + + if (CPU_EMPTY(&allset) || + (CPU_CMP(&oldset, &newset) == 0 && oldvec == newvec)) { + return; + } - VIOAPIC_LOCK(vioapic); /* - * Reset all vectors to be edge-triggered. + * Since the write to the redirection table has already occurred, a + * scan of level-triggered entries referencing the old vector will find + * only entries which are now currently valid. */ - vlapic_reset_tmr(vlapic); - for (pin = 0; pin < REDIR_ENTRIES; pin++) { - low = vioapic->rtbl[pin].reg; - high = vioapic->rtbl[pin].reg >> 32; + vioapic_tmr_active(vioapic, oldvec, &active); - level = low & IOART_TRGRLVL ? true : false; - if (!level) + while (!CPU_EMPTY(&allset)) { + struct vlapic *vlapic; + u_int i; + + i = CPU_FFS(&allset) - 1; + CPU_CLR(i, &allset); + + if (oldvec == newvec && + CPU_ISSET(i, &oldset) && CPU_ISSET(i, &newset)) { continue; + } - /* - * For a level-triggered 'pin' let the vlapic figure out if - * an assertion on this 'pin' would result in an interrupt - * being delivered to it. If yes, then it will modify the - * TMR bit associated with this vector to level-triggered. - */ - phys = ((low & IOART_DESTMOD) == IOART_DESTPHY); - delmode = low & IOART_DELMOD; - vector = low & IOART_INTVEC; - dest = high >> APIC_ID_SHIFT; - vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector); + if (i != vcpuid) { + vcpu_block_run(vm, i); + } + + vlapic = vm_lapic(vm, i); + if (CPU_ISSET(i, &oldset)) { + /* + * Perform the deassertion if no other level-triggered + * IOAPIC entries target this vCPU with the old vector + * + * Note: Sharing of vectors like that should be + * extremely rare in modern operating systems and was + * previously unsupported by the bhyve vIOAPIC. 
+ */ + if (!CPU_ISSET(i, &active)) { + vlapic_tmr_set(vlapic, oldvec, false); + } + } + if (CPU_ISSET(i, &newset)) { + vlapic_tmr_set(vlapic, newvec, true); + } + + if (i != vcpuid) { + vcpu_unblock_run(vm, i); + } } - VIOAPIC_UNLOCK(vioapic); } static uint32_t @@ -319,7 +413,6 @@ vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) uint64_t data64, mask64; uint64_t last, changed; int regnum, pin, lshift; - cpuset_t allvcpus; regnum = addr & 0xff; switch (regnum) { @@ -355,20 +448,15 @@ vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data) /* * If any fields in the redirection table entry (except mask - * or polarity) have changed then rendezvous all the vcpus - * to update their vlapic trigger-mode registers. + * or polarity) have changed then update the trigger-mode + * registers on all the vlapics. */ changed = last ^ vioapic->rtbl[pin].reg; if (changed & ~(IOART_INTMASK | IOART_INTPOL)) { VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate " "vlapic trigger-mode register", pin); - VIOAPIC_UNLOCK(vioapic); -#if 0 /* XXX */ - allvcpus = vm_active_cpus(vioapic->vm); - vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus, - vioapic_update_tmr, NULL); -#endif - VIOAPIC_LOCK(vioapic); + vioapic_update_tmrs(vioapic, vcpuid, last, + vioapic->rtbl[pin].reg); } /* diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.h b/usr/src/uts/i86pc/io/vmm/io/vioapic.h index 9479ebb10e..6bf3e80e05 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vioapic.h +++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * Copyright (c) 2013 Neel Natu <neel@freebsd.org> * All rights reserved. @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vioapic.h 258699 2013-11-27 22:18:08Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -45,10 +47,6 @@ #define VIOAPIC_BASE 0xFEC00000 #define VIOAPIC_SIZE 4096 -#include "vdev.h" - -struct vm; - struct vioapic *vioapic_init(struct vm *vm); void vioapic_cleanup(struct vioapic *vioapic); diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c index 9a0a3058ea..4e58249c8d 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.c +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c @@ -1,6 +1,9 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. + * Copyright (c) 2019 Joyent, Inc. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -23,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,10 +39,11 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/lock.h> @@ -57,7 +61,6 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z n #include <machine/vmm.h> -#include "vmm_ipi.h" #include "vmm_lapic.h" #include "vmm_ktr.h" #include "vmm_stat.h" @@ -82,7 +85,15 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z n #define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx)) #define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx)) -#define VLAPIC_BUS_FREQ tsc_freq +/* + * APIC timer frequency: + * - arbitrary but chosen to be in the ballpark of contemporary hardware. + * - power-of-two to avoid loss of precision when converted to a bintime. + */ +#define VLAPIC_BUS_FREQ (128 * 1024 * 1024) + +static void vlapic_set_error(struct vlapic *, uint32_t, bool); +static void vlapic_tmr_reset(struct vlapic *); static __inline uint32_t vlapic_get_id(struct vlapic *vlapic) @@ -259,7 +270,6 @@ vlapic_dcr_write_handler(struct vlapic *vlapic) VLAPIC_TIMER_UNLOCK(vlapic); } - void vlapic_esr_write_handler(struct vlapic *vlapic) { @@ -287,7 +297,8 @@ vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level) } if (vector < 16) { - vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR); + vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR, + false); VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d", vector); return (1); @@ -449,20 +460,22 @@ vlapic_mask_lvts(struct vlapic *vlapic) } static int -vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt) +vlapic_fire_lvt(struct vlapic *vlapic, u_int lvt) { - uint32_t vec, mode; + uint32_t mode, reg, vec; - if (lvt & APIC_LVT_M) - return (0); + reg = atomic_load_acq_32(&vlapic->lvt_last[lvt]); - vec = lvt & APIC_LVT_VECTOR; - mode = lvt & APIC_LVT_DM; + if (reg & APIC_LVT_M) + return (0); + vec = reg & APIC_LVT_VECTOR; + mode = reg & APIC_LVT_DM; switch (mode) { case APIC_LVT_DM_FIXED: if (vec < 16) { - vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, + lvt == APIC_LVT_ERROR); return (0); } if (vlapic_set_intr_ready(vlapic, vec, false)) @@ -566,6 +579,8 @@ vlapic_update_ppr(struct vlapic *vlapic) VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr); } +static VMM_STAT(VLAPIC_GRATUITOUS_EOI, "EOI without any in-service interrupt"); + static void vlapic_process_eoi(struct vlapic *vlapic) { @@ -576,11 +591,7 @@ vlapic_process_eoi(struct vlapic *vlapic) isrptr = &lapic->isr0; tmrptr = &lapic->tmr0; - /* - * The x86 architecture reserves the the first 32 vectors for use - * by the processor. 
- */ - for (i = 7; i > 0; i--) { + for (i = 7; i >= 0; i--) { idx = i * 4; bitpos = fls(isrptr[idx]); if (bitpos-- != 0) { @@ -589,17 +600,21 @@ vlapic_process_eoi(struct vlapic *vlapic) vlapic->isrvec_stk_top); } isrptr[idx] &= ~(1 << bitpos); + vector = i * 32 + bitpos; + VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "EOI vector %d", + vector); VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi"); vlapic->isrvec_stk_top--; vlapic_update_ppr(vlapic); if ((tmrptr[idx] & (1 << bitpos)) != 0) { - vector = i * 32 + bitpos; vioapic_process_eoi(vlapic->vm, vlapic->vcpuid, vector); } return; } } + VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "Gratuitous EOI"); + vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_GRATUITOUS_EOI, 1); } static __inline int @@ -621,22 +636,22 @@ vlapic_periodic_timer(struct vlapic *vlapic) static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic"); -void -vlapic_set_error(struct vlapic *vlapic, uint32_t mask) +static void +vlapic_set_error(struct vlapic *vlapic, uint32_t mask, bool lvt_error) { - uint32_t lvt; vlapic->esr_pending |= mask; - if (vlapic->esr_firing) + + /* + * Avoid infinite recursion if the error LVT itself is configured with + * an illegal vector. + */ + if (lvt_error) return; - vlapic->esr_firing = 1; - // The error LVT always uses the fixed delivery mode. - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); - if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { + if (vlapic_fire_lvt(vlapic, APIC_LVT_ERROR)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1); } - vlapic->esr_firing = 0; } static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); @@ -644,13 +659,10 @@ static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic"); static void vlapic_fire_timer(struct vlapic *vlapic) { - uint32_t lvt; KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked")); - - // The timer LVT always uses the fixed delivery mode. 
- lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); - if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) { + + if (vlapic_fire_lvt(vlapic, APIC_LVT_TIMER)) { VLAPIC_CTR0(vlapic, "vlapic timer fired"); vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1); } @@ -662,10 +674,8 @@ static VMM_STAT(VLAPIC_INTR_CMC, void vlapic_fire_cmci(struct vlapic *vlapic) { - uint32_t lvt; - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); - if (vlapic_fire_lvt(vlapic, lvt)) { + if (vlapic_fire_lvt(vlapic, APIC_LVT_CMCI)) { vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1); } } @@ -676,7 +686,6 @@ static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1, int vlapic_trigger_lvt(struct vlapic *vlapic, int vector) { - uint32_t lvt; if (vlapic_enabled(vlapic) == false) { /* @@ -699,35 +708,20 @@ vlapic_trigger_lvt(struct vlapic *vlapic, int vector) switch (vector) { case APIC_LVT_LINT0: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT0_LVT); - break; case APIC_LVT_LINT1: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT1_LVT); - break; case APIC_LVT_TIMER: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT); - lvt |= APIC_LVT_DM_FIXED; - break; case APIC_LVT_ERROR: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT); - lvt |= APIC_LVT_DM_FIXED; - break; case APIC_LVT_PMC: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_PERF_LVT); - break; case APIC_LVT_THERMAL: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_THERM_LVT); - break; case APIC_LVT_CMCI: - lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT); + if (vlapic_fire_lvt(vlapic, vector)) { + vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, + LVTS_TRIGGERRED, vector, 1); + } break; default: return (EINVAL); } - if (vlapic_fire_lvt(vlapic, lvt)) { - vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, - LVTS_TRIGGERRED, vector, 1); - } return (0); } @@ -831,11 +825,11 @@ vlapic_icrtmr_write_handler(struct vlapic *vlapic) /* * This function populates 'dmask' with the set of vcpus that match the * addressing specified by the (dest, phys, lowprio) tuple. - * + * * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit) * or xAPIC (8-bit) destination field. */ -static void +void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, bool lowprio, bool x2apic_dest) { @@ -860,12 +854,12 @@ vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, */ CPU_ZERO(dmask); vcpuid = vm_apicid2vcpuid(vm, dest); - if (vcpuid < VM_MAXCPU) + if (vcpuid < vm_get_maxcpus(vm)) CPU_SET(vcpuid, dmask); } else { /* * In the "Flat Model" the MDA is interpreted as an 8-bit wide - * bitmask. This model is only avilable in the xAPIC mode. + * bitmask. This model is only available in the xAPIC mode. */ mda_flat_ldest = dest & 0xff; @@ -883,7 +877,7 @@ vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, /* * Logical mode: match each APIC that has a bit set - * in it's LDR that matches a bit in the ldest. + * in its LDR that matches a bit in the ldest. 
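/*
 * [Editor's note; an illustration, not part of the commit.] In the
 * xAPIC "Flat Model" handled here, the 8-bit MDA is a bitmask matched
 * against the logical ID held in LDR bits 31:24. For example, a
 * logical IPI with dest = 0x03 targets every vCPU whose logical ID has
 * bit 0 or bit 1 set; with the common one-bit-per-CPU LDR assignment
 * that is vCPU 0 and vCPU 1.
 */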
*/ CPU_ZERO(dmask); amask = vm_active_cpus(vm); @@ -987,6 +981,7 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) struct vlapic *vlapic2; struct vm_exit *vmexit; struct LAPIC *lapic; + uint16_t maxcpus; lapic = vlapic->apic_page; lapic->icr_lo &= ~APIC_DELSTAT_PEND; @@ -1000,7 +995,7 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) mode = icrval & APIC_DELMODE_MASK; if (mode == APIC_DELMODE_FIXED && vec < 16) { - vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR); + vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR, false); VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec); return (0); } @@ -1048,11 +1043,12 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) return (0); /* handled completely in the kernel */ } + maxcpus = vm_get_maxcpus(vlapic->vm); if (mode == APIC_DELMODE_INIT) { if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT) return (0); - if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { vlapic2 = vm_lapic(vlapic->vm, dest); /* move from INIT to waiting-for-SIPI state */ @@ -1065,7 +1061,7 @@ vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu) } if (mode == APIC_DELMODE_STARTUP) { - if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) { + if (vlapic->vcpuid == 0 && dest != 0 && dest < maxcpus) { vlapic2 = vm_lapic(vlapic->vm, dest); /* @@ -1118,11 +1114,7 @@ vlapic_pending_intr(struct vlapic *vlapic, int *vecptr) irrptr = &lapic->irr0; - /* - * The x86 architecture reserves the the first 32 vectors for use - * by the processor. - */ - for (i = 7; i > 0; i--) { + for (i = 7; i >= 0; i--) { idx = i * 4; val = atomic_load_acq_int(&irrptr[idx]); bitpos = fls(val); @@ -1461,7 +1453,7 @@ vlapic_reset(struct vlapic *vlapic) lapic->dfr = 0xffffffff; lapic->svr = APIC_SVR_VECTOR; vlapic_mask_lvts(vlapic); - vlapic_reset_tmr(vlapic); + vlapic_tmr_reset(vlapic); lapic->dcr_timer = 0; vlapic_dcr_write_handler(vlapic); @@ -1478,7 +1470,8 @@ void vlapic_init(struct vlapic *vlapic) { KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized")); - KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < VM_MAXCPU, + KASSERT(vlapic->vcpuid >= 0 && + vlapic->vcpuid < vm_get_maxcpus(vlapic->vm), ("vlapic_init: vcpuid is not initialized")); KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not " "initialized")); @@ -1628,60 +1621,85 @@ vlapic_enabled(struct vlapic *vlapic) } static void -vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level) +vlapic_tmr_reset(struct vlapic *vlapic) { struct LAPIC *lapic; - uint32_t *tmrptr, mask; - int idx; lapic = vlapic->apic_page; - tmrptr = &lapic->tmr0; - idx = (vector / 32) * 4; - mask = 1 << (vector % 32); - if (level) - tmrptr[idx] |= mask; - else - tmrptr[idx] &= ~mask; - - if (vlapic->ops.set_tmr != NULL) - (*vlapic->ops.set_tmr)(vlapic, vector, level); + lapic->tmr0 = lapic->tmr1 = lapic->tmr2 = lapic->tmr3 = 0; + lapic->tmr4 = lapic->tmr5 = lapic->tmr6 = lapic->tmr7 = 0; + vlapic->tmr_pending = 1; } +/* + * Synchronize TMR designations into the LAPIC state. + * The vCPU must be in the VCPU_RUNNING state. 
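/*
 * [Editor's note; a worked example, not part of the commit.] In
 * vlapic_tmr_update() and vlapic_tmr_set() below, a vector selects a
 * 32-bit TMR word and a bit within it; e.g. for vector 80:
 *
 *	idx  = 80 / 32 = 2
 *	mask = 1 << (80 % 32) = 1 << 16
 *
 * Staging changes in separate assert/deassert masks (kept mutually
 * exclusive by vlapic_tmr_set) lets the vIOAPIC record updates while
 * the vCPU is held out of VCPU_RUNNING; the deasserts are applied
 * before the asserts at the next vlapic_tmr_update(), so the most
 * recent designation for a vector wins.
 */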
+ */ void -vlapic_reset_tmr(struct vlapic *vlapic) +vlapic_tmr_update(struct vlapic *vlapic) { - int vector; + struct LAPIC *lapic; + uint32_t *tmrptr; + uint32_t result[VLAPIC_TMR_CNT]; + u_int i, tmr_idx; + + if (vlapic->tmr_pending == 0) { + return; + } + + lapic = vlapic->apic_page; + tmrptr = &lapic->tmr0; + + VLAPIC_CTR0(vlapic, "synchronizing TMR"); + for (i = 0; i < VLAPIC_TMR_CNT; i++) { + tmr_idx = i * 4; - VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered"); + tmrptr[tmr_idx] &= ~vlapic->tmr_vec_deassert[i]; + tmrptr[tmr_idx] |= vlapic->tmr_vec_assert[i]; + vlapic->tmr_vec_deassert[i] = 0; + vlapic->tmr_vec_assert[i] = 0; + result[i] = tmrptr[tmr_idx]; + } + vlapic->tmr_pending = 0; - for (vector = 0; vector <= 255; vector++) - vlapic_set_tmr(vlapic, vector, false); + if (vlapic->ops.set_tmr != NULL) { + (*vlapic->ops.set_tmr)(vlapic, result); + } } +/* + * Designate the TMR state for a given interrupt vector. + * The caller must hold the vIOAPIC lock and prevent the vCPU corresponding to + * this vLAPIC instance from being-in or entering the VCPU_RUNNING state. + */ void -vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, - int delmode, int vector) +vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active) { - cpuset_t dmask; - bool lowprio; - - KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector)); + const uint32_t idx = vector / 32; + const uint32_t mask = 1 << (vector % 32); + + VLAPIC_CTR2(vlapic, "TMR for vector %u %sasserted", vector, + active ? "" : "de"); + if (active) { + vlapic->tmr_vec_assert[idx] |= mask; + vlapic->tmr_vec_deassert[idx] &= ~mask; + } else { + vlapic->tmr_vec_deassert[idx] |= mask; + vlapic->tmr_vec_assert[idx] &= ~mask; + } /* - * A level trigger is valid only for fixed and lowprio delivery modes. + * Track the number of TMR changes between calls to vlapic_tmr_update. + * While a simple boolean would suffice, this count may be useful when + * tracing or debugging, and is cheap to calculate. */ - if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) { - VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for " - "delivery-mode %d", delmode); - return; - } - - lowprio = (delmode == APIC_DELMODE_LOWPRIO); - vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false); - - if (!CPU_ISSET(vlapic->vcpuid, &dmask)) - return; + vlapic->tmr_pending = MIN(UINT32_MAX - 1, vlapic->tmr_pending) + 1; +} - VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector); - vlapic_set_tmr(vlapic, vector, true); +#ifndef __FreeBSD__ +void +vlapic_localize_resources(struct vlapic *vlapic) +{ + vmm_glue_callout_localize(&vlapic->callout); } +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.h b/usr/src/uts/i86pc/io/vmm/io/vlapic.h index 3fa705d818..e1a52551a9 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic.h +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,11 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vlapic.h 262281 2014-02-21 06:03:54Z neel $ + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. 
*/ #ifndef _VLAPIC_H_ @@ -69,7 +75,6 @@ int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level); */ void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum); -void vlapic_set_error(struct vlapic *vlapic, uint32_t mask); void vlapic_fire_cmci(struct vlapic *vlapic); int vlapic_trigger_lvt(struct vlapic *vlapic, int vector); @@ -81,16 +86,11 @@ bool vlapic_enabled(struct vlapic *vlapic); void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys, int delmode, int vec); -/* Reset the trigger-mode bits for all vectors to be edge-triggered */ -void vlapic_reset_tmr(struct vlapic *vlapic); +void vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys, + bool lowprio, bool x2apic_dest); -/* - * Set the trigger-mode bit associated with 'vector' to level-triggered if - * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to - * this 'vlapic'. - */ -void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys, - int delmode, int vector); +void vlapic_tmr_update(struct vlapic *vlapic); +void vlapic_tmr_set(struct vlapic *vlapic, uint8_t vector, bool active); void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val); uint64_t vlapic_get_cr8(struct vlapic *vlapic); @@ -106,4 +106,9 @@ void vlapic_icrtmr_write_handler(struct vlapic *vlapic); void vlapic_dcr_write_handler(struct vlapic *vlapic); void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset); void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val); + +#ifndef __FreeBSD__ +void vlapic_localize_resources(struct vlapic *vlapic); +#endif + #endif /* _VLAPIC_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h index f9bd2e0e8b..5795d48d52 100644 --- a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h +++ b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2013 Neel Natu <neel@freebsd.org> * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/io/vlapic_priv.h 263211 2014-03-15 23:09:34Z tychon $ + * $FreeBSD$ */ #ifndef _VLAPIC_PRIV_H_ @@ -136,6 +138,8 @@ enum boot_state { #define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI +#define VLAPIC_TMR_CNT 8 + struct vlapic; struct vlapic_ops { @@ -143,7 +147,7 @@ struct vlapic_ops { int (*pending_intr)(struct vlapic *vlapic, int *vecptr); void (*intr_accepted)(struct vlapic *vlapic, int vector); void (*post_intr)(struct vlapic *vlapic, int hostcpu); - void (*set_tmr)(struct vlapic *vlapic, int vector, bool level); + void (*set_tmr)(struct vlapic *vlapic, const uint32_t *result); void (*enable_x2apic_mode)(struct vlapic *vlapic); }; @@ -154,7 +158,7 @@ struct vlapic { struct vlapic_ops ops; uint32_t esr_pending; - int esr_firing; + uint32_t tmr_pending; struct callout callout; /* vlapic timer */ struct bintime timer_fire_bt; /* callout expiry time */ @@ -182,6 +186,19 @@ struct vlapic { */ uint32_t svr_last; uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1]; + + /* + * Store intended modifications to the trigger-mode register state. + * Along with the tmr_pending counter above, these are protected by the + * vIOAPIC lock and can only be modified under specific conditions: + * + * 1. When holding the vIOAPIC lock, and the vCPU to which the vLAPIC + * belongs is prevented from entering the VCPU_RUNNING state. + * 2. 
When the owning vCPU is in the VCPU_RUNNING state, and is + * applying the TMR modifications prior to interrupt injection. + */ + uint32_t tmr_vec_deassert[VLAPIC_TMR_CNT]; + uint32_t tmr_vec_assert[VLAPIC_TMR_CNT]; }; void vlapic_init(struct vlapic *vlapic); diff --git a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c new file mode 100644 index 0000000000..4df909777d --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.c @@ -0,0 +1,105 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/systm.h> + +#include <machine/vmm.h> + +#include "vpmtmr.h" + +/* + * The ACPI Power Management timer is a free-running 24- or 32-bit + * timer with a frequency of 3.579545MHz + * + * This implementation will be 32-bits + */ + +#define PMTMR_FREQ 3579545 /* 3.579545MHz */ + +struct vpmtmr { + sbintime_t freq_sbt; + sbintime_t baseuptime; + uint32_t baseval; +}; + +static MALLOC_DEFINE(M_VPMTMR, "vpmtmr", "bhyve virtual acpi timer"); + +struct vpmtmr * +vpmtmr_init(struct vm *vm) +{ + struct vpmtmr *vpmtmr; + struct bintime bt; + + vpmtmr = malloc(sizeof(struct vpmtmr), M_VPMTMR, M_WAITOK | M_ZERO); + vpmtmr->baseuptime = sbinuptime(); + vpmtmr->baseval = 0; + + FREQ2BT(PMTMR_FREQ, &bt); + vpmtmr->freq_sbt = bttosbt(bt); + + return (vpmtmr); +} + +void +vpmtmr_cleanup(struct vpmtmr *vpmtmr) +{ + + free(vpmtmr, M_VPMTMR); +} + +int +vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vpmtmr *vpmtmr; + sbintime_t now, delta; + + if (!in || bytes != 4) + return (-1); + + vpmtmr = vm_pmtmr(vm); + + /* + * No locking needed because 'baseuptime' and 'baseval' are + * written only during initialization. 
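/*
 * [Editor's note; a worked example, not part of the commit.]
 * 'freq_sbt' is the sbintime length of one 3.579545 MHz tick, so the
 * handler below computes elapsed ticks as delta / freq_sbt. One
 * second after 'baseuptime':
 *
 *	delta = SBT_1S = 2^32
 *	delta / freq_sbt ~= 3579545 ticks
 *
 * i.e. the counter advances by PMTMR_FREQ per second, wrapping modulo
 * 2^32 through the uint32_t store into *val.
 */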
+ */ + now = sbinuptime(); + delta = now - vpmtmr->baseuptime; + KASSERT(delta >= 0, ("vpmtmr_handler: uptime went backwards: " + "%#lx to %#lx", vpmtmr->baseuptime, now)); + *val = vpmtmr->baseval + delta / vpmtmr->freq_sbt; + + return (0); +} diff --git a/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h new file mode 100644 index 0000000000..e6562da5c0 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vpmtmr.h @@ -0,0 +1,44 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _VPMTMR_H_ +#define _VPMTMR_H_ + +#define IO_PMTMR 0x408 + +struct vpmtmr; + +struct vpmtmr *vpmtmr_init(struct vm *vm); +void vpmtmr_cleanup(struct vpmtmr *pmtmr); + +int vpmtmr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/io/vrtc.c b/usr/src/uts/i86pc/io/vmm/io/vrtc.c new file mode 100644 index 0000000000..f12d22fc26 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vrtc.c @@ -0,0 +1,1061 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014, Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/queue.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/mutex.h> +#include <sys/clock.h> +#include <sys/sysctl.h> + +#include <machine/vmm.h> + +#include <isa/rtc.h> + +#include "vmm_ktr.h" +#include "vatpic.h" +#include "vioapic.h" +#include "vrtc.h" + +/* Register layout of the RTC */ +struct rtcdev { + uint8_t sec; + uint8_t alarm_sec; + uint8_t min; + uint8_t alarm_min; + uint8_t hour; + uint8_t alarm_hour; + uint8_t day_of_week; + uint8_t day_of_month; + uint8_t month; + uint8_t year; + uint8_t reg_a; + uint8_t reg_b; + uint8_t reg_c; + uint8_t reg_d; + uint8_t nvram[36]; + uint8_t century; + uint8_t nvram2[128 - 51]; +} __packed; +CTASSERT(sizeof(struct rtcdev) == 128); +CTASSERT(offsetof(struct rtcdev, century) == RTC_CENTURY); + +struct vrtc { + struct vm *vm; + struct mtx mtx; + struct callout callout; + u_int addr; /* RTC register to read or write */ + sbintime_t base_uptime; + time_t base_rtctime; + struct rtcdev rtcdev; +}; + +#define VRTC_LOCK(vrtc) mtx_lock(&((vrtc)->mtx)) +#define VRTC_UNLOCK(vrtc) mtx_unlock(&((vrtc)->mtx)) +#define VRTC_LOCKED(vrtc) mtx_owned(&((vrtc)->mtx)) + +/* + * RTC time is considered "broken" if: + * - RTC updates are halted by the guest + * - RTC date/time fields have invalid values + */ +#define VRTC_BROKEN_TIME ((time_t)-1) + +#define RTC_IRQ 8 +#define RTCSB_BIN 0x04 +#define RTCSB_ALL_INTRS (RTCSB_UINTR | RTCSB_AINTR | RTCSB_PINTR) +#define rtc_halted(vrtc) ((vrtc->rtcdev.reg_b & RTCSB_HALT) != 0) +#define aintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_AINTR) != 0) +#define pintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_PINTR) != 0) +#define uintr_enabled(vrtc) (((vrtc)->rtcdev.reg_b & RTCSB_UINTR) != 0) + +static void vrtc_callout_handler(void *arg); +static void vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval); + +static MALLOC_DEFINE(M_VRTC, "vrtc", "bhyve virtual rtc"); + +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, vrtc, CTLFLAG_RW, NULL, NULL); + +static int rtc_flag_broken_time = 1; +SYSCTL_INT(_hw_vmm_vrtc, OID_AUTO, flag_broken_time, CTLFLAG_RDTUN, + &rtc_flag_broken_time, 0, "Stop guest when invalid RTC time is detected"); + +static __inline bool +divider_enabled(int reg_a) +{ + /* + * The RTC is counting only when dividers are not held in reset. 
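/*
 * [Editor's note; an illustration, not part of the commit.] Per the
 * MC146818 data sheet, reg_a bits 6:4 (DV2..DV0) select the divider
 * chain: 010 enables normal operation from the 32.768 kHz time base,
 * while 11x holds the divider chain in reset. That is what the test
 * below encodes:
 *
 *	(reg_a & 0x70) == 0x20	=> dividers running, RTC counting
 */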
+ */ + return ((reg_a & 0x70) == 0x20); +} + +static __inline bool +update_enabled(struct vrtc *vrtc) +{ + /* + * RTC date/time can be updated only if: + * - divider is not held in reset + * - guest has not disabled updates + * - the date/time fields have valid contents + */ + if (!divider_enabled(vrtc->rtcdev.reg_a)) + return (false); + + if (rtc_halted(vrtc)) + return (false); + + if (vrtc->base_rtctime == VRTC_BROKEN_TIME) + return (false); + + return (true); +} + +static time_t +vrtc_curtime(struct vrtc *vrtc, sbintime_t *basetime) +{ + sbintime_t now, delta; + time_t t, secs; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + t = vrtc->base_rtctime; + *basetime = vrtc->base_uptime; + if (update_enabled(vrtc)) { + now = sbinuptime(); + delta = now - vrtc->base_uptime; + KASSERT(delta >= 0, ("vrtc_curtime: uptime went backwards: " + "%#lx to %#lx", vrtc->base_uptime, now)); + secs = delta / SBT_1S; + t += secs; + *basetime += secs * SBT_1S; + } + return (t); +} + +static __inline uint8_t +rtcset(struct rtcdev *rtc, int val) +{ + + KASSERT(val >= 0 && val < 100, ("%s: invalid bin2bcd index %d", + __func__, val)); + + return ((rtc->reg_b & RTCSB_BIN) ? val : bin2bcd_data[val]); +} + +static void +secs_to_rtc(time_t rtctime, struct vrtc *vrtc, int force_update) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; + int hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (rtctime < 0) { + KASSERT(rtctime == VRTC_BROKEN_TIME, + ("%s: invalid vrtc time %#lx", __func__, rtctime)); + return; + } + + /* + * If the RTC is halted then the guest has "ownership" of the + * date/time fields. Don't update the RTC date/time fields in + * this case (unless forced). + */ + if (rtc_halted(vrtc) && !force_update) + return; + + ts.tv_sec = rtctime; + ts.tv_nsec = 0; + clock_ts_to_ct(&ts, &ct); + + KASSERT(ct.sec >= 0 && ct.sec <= 59, ("invalid clocktime sec %d", + ct.sec)); + KASSERT(ct.min >= 0 && ct.min <= 59, ("invalid clocktime min %d", + ct.min)); + KASSERT(ct.hour >= 0 && ct.hour <= 23, ("invalid clocktime hour %d", + ct.hour)); + KASSERT(ct.dow >= 0 && ct.dow <= 6, ("invalid clocktime wday %d", + ct.dow)); + KASSERT(ct.day >= 1 && ct.day <= 31, ("invalid clocktime mday %d", + ct.day)); + KASSERT(ct.mon >= 1 && ct.mon <= 12, ("invalid clocktime month %d", + ct.mon)); + KASSERT(ct.year >= POSIX_BASE_YEAR, ("invalid clocktime year %d", + ct.year)); + + rtc = &vrtc->rtcdev; + rtc->sec = rtcset(rtc, ct.sec); + rtc->min = rtcset(rtc, ct.min); + + if (rtc->reg_b & RTCSB_24HR) { + hour = ct.hour; + } else { + /* + * Convert to the 12-hour format. 
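/*
 * [Editor's note; a worked example, not part of the commit.] For
 * ct.hour == 13 the switch below yields hour = 13 % 12 = 1, and the
 * subsequent "rtc->hour |= 0x80" marks it PM ("1 PM"). ct.hour == 0
 * and ct.hour == 12 are the special cases: both store 12, with the
 * 0x80 PM bit distinguishing 12 AM from 12 PM.
 */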
+ */ + switch (ct.hour) { + case 0: /* 12 AM */ + case 12: /* 12 PM */ + hour = 12; + break; + default: + /* + * The remaining 'ct.hour' values are interpreted as: + * [1 - 11] -> 1 - 11 AM + * [13 - 23] -> 1 - 11 PM + */ + hour = ct.hour % 12; + break; + } + } + + rtc->hour = rtcset(rtc, hour); + + if ((rtc->reg_b & RTCSB_24HR) == 0 && ct.hour >= 12) + rtc->hour |= 0x80; /* set MSB to indicate PM */ + + rtc->day_of_week = rtcset(rtc, ct.dow + 1); + rtc->day_of_month = rtcset(rtc, ct.day); + rtc->month = rtcset(rtc, ct.mon); + rtc->year = rtcset(rtc, ct.year % 100); + rtc->century = rtcset(rtc, ct.year / 100); +} + +static int +rtcget(struct rtcdev *rtc, int val, int *retval) +{ + uint8_t upper, lower; + + if (rtc->reg_b & RTCSB_BIN) { + *retval = val; + return (0); + } + + lower = val & 0xf; + upper = (val >> 4) & 0xf; + + if (lower > 9 || upper > 9) + return (-1); + + *retval = upper * 10 + lower; + return (0); +} + +static time_t +rtc_to_secs(struct vrtc *vrtc) +{ + struct clocktime ct; + struct timespec ts; + struct rtcdev *rtc; +#ifdef __FreeBSD__ + struct vm *vm; +#endif + int century, error, hour, pm, year; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + +#ifdef __FreeBSD__ + vm = vrtc->vm; +#endif + rtc = &vrtc->rtcdev; + + bzero(&ct, sizeof(struct clocktime)); + + error = rtcget(rtc, rtc->sec, &ct.sec); + if (error || ct.sec < 0 || ct.sec > 59) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC sec %#x/%d", rtc->sec, ct.sec); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->min, &ct.min); + if (error || ct.min < 0 || ct.min > 59) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC min %#x/%d", rtc->min, ct.min); +#endif + goto fail; + } + + pm = 0; + hour = rtc->hour; + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (hour & 0x80) { + hour &= ~0x80; + pm = 1; + } + } + error = rtcget(rtc, hour, &ct.hour); + if ((rtc->reg_b & RTCSB_24HR) == 0) { + if (ct.hour >= 1 && ct.hour <= 12) { + /* + * Convert from 12-hour format to internal 24-hour + * representation as follows: + * + * 12-hour format ct.hour + * 12 AM 0 + * 1 - 11 AM 1 - 11 + * 12 PM 12 + * 1 - 11 PM 13 - 23 + */ + if (ct.hour == 12) + ct.hour = 0; + if (pm) + ct.hour += 12; + } else { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC 12-hour format %#x/%d", + rtc->hour, ct.hour); +#endif + goto fail; + } + } + + if (error || ct.hour < 0 || ct.hour > 23) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC hour %#x/%d", rtc->hour, ct.hour); +#endif + goto fail; + } + + /* + * Ignore 'rtc->dow' because some guests like Linux don't bother + * setting it at all while others like OpenBSD/i386 set it incorrectly. + * + * clock_ct_to_ts() does not depend on 'ct.dow' anyways so ignore it. 
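/*
 * [Editor's note; a worked example, not part of the commit.] rtcget()
 * above is a guarded BCD decode used by all of the reads that follow:
 * raw 0x59 decodes to 5 * 10 + 9 = 59, while a value such as 0x5A
 * (low nibble > 9) fails and routes the caller to the
 * VRTC_BROKEN_TIME path. With RTCSB_BIN set, the raw binary value is
 * returned unchanged.
 */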
+ */ + ct.dow = -1; + + error = rtcget(rtc, rtc->day_of_month, &ct.day); + if (error || ct.day < 1 || ct.day > 31) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC mday %#x/%d", rtc->day_of_month, + ct.day); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->month, &ct.mon); + if (error || ct.mon < 1 || ct.mon > 12) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC month %#x/%d", rtc->month, ct.mon); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->year, &year); + if (error || year < 0 || year > 99) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC year %#x/%d", rtc->year, year); +#endif + goto fail; + } + + error = rtcget(rtc, rtc->century, &century); + ct.year = century * 100 + year; + if (error || ct.year < POSIX_BASE_YEAR) { +#ifdef __FreeBSD__ + VM_CTR2(vm, "Invalid RTC century %#x/%d", rtc->century, + ct.year); +#endif + goto fail; + } + + error = clock_ct_to_ts(&ct, &ts); + if (error || ts.tv_sec < 0) { +#ifdef __FreeBSD__ + VM_CTR3(vm, "Invalid RTC clocktime.date %04d-%02d-%02d", + ct.year, ct.mon, ct.day); + VM_CTR3(vm, "Invalid RTC clocktime.time %02d:%02d:%02d", + ct.hour, ct.min, ct.sec); +#endif + goto fail; + } + return (ts.tv_sec); /* success */ +fail: + /* + * Stop updating the RTC if the date/time fields programmed by + * the guest are invalid. + */ +#ifdef __FreeBSD__ + VM_CTR0(vrtc->vm, "Invalid RTC date/time programming detected"); +#endif + return (VRTC_BROKEN_TIME); +} + +static int +vrtc_time_update(struct vrtc *vrtc, time_t newtime, sbintime_t newbase) +{ + struct rtcdev *rtc; +#ifdef __FreeBSD__ + sbintime_t oldbase; +#endif + time_t oldtime; + uint8_t alarm_sec, alarm_min, alarm_hour; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + alarm_sec = rtc->alarm_sec; + alarm_min = rtc->alarm_min; + alarm_hour = rtc->alarm_hour; + + oldtime = vrtc->base_rtctime; + VM_CTR2(vrtc->vm, "Updating RTC secs from %#lx to %#lx", + oldtime, newtime); + +#ifdef __FreeBSD__ + oldbase = vrtc->base_uptime; + VM_CTR2(vrtc->vm, "Updating RTC base uptime from %#lx to %#lx", + oldbase, newbase); +#endif + vrtc->base_uptime = newbase; + + if (newtime == oldtime) + return (0); + + /* + * If 'newtime' indicates that RTC updates are disabled then just + * record that and return. There is no need to do alarm interrupt + * processing in this case. + */ + if (newtime == VRTC_BROKEN_TIME) { + vrtc->base_rtctime = VRTC_BROKEN_TIME; + return (0); + } + + /* + * Return an error if RTC updates are halted by the guest. + */ + if (rtc_halted(vrtc)) { + VM_CTR0(vrtc->vm, "RTC update halted by guest"); + return (EBUSY); + } + + do { + /* + * If the alarm interrupt is enabled and 'oldtime' is valid + * then visit all the seconds between 'oldtime' and 'newtime' + * to check for the alarm condition. + * + * Otherwise move the RTC time forward directly to 'newtime'. + */ + if (aintr_enabled(vrtc) && oldtime != VRTC_BROKEN_TIME) + vrtc->base_rtctime++; + else + vrtc->base_rtctime = newtime; + + if (aintr_enabled(vrtc)) { + /* + * Update the RTC date/time fields before checking + * if the alarm conditions are satisfied.
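/*
 * [Editor's note; an illustration, not part of the commit.] Alarm
 * bytes in the 0xC0-0xFF range are "don't care" values per the
 * MC146818, which is why each comparison below first tests
 * alarm_* >= 0xC0. For example alarm_hour = alarm_min = 0xFF with
 * alarm_sec = 0 fires the alarm once per minute, at second zero.
 */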
+ */ + secs_to_rtc(vrtc->base_rtctime, vrtc, 0); + + if ((alarm_sec >= 0xC0 || alarm_sec == rtc->sec) && + (alarm_min >= 0xC0 || alarm_min == rtc->min) && + (alarm_hour >= 0xC0 || alarm_hour == rtc->hour)) { + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_ALARM); + } + } + } while (vrtc->base_rtctime != newtime); + + if (uintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, rtc->reg_c | RTCIR_UPDATE); + + return (0); +} + +static sbintime_t +vrtc_freq(struct vrtc *vrtc) +{ + int ratesel; + + static sbintime_t pf[16] = { + 0, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 8192, + SBT_1S / 4096, + SBT_1S / 2048, + SBT_1S / 1024, + SBT_1S / 512, + SBT_1S / 256, + SBT_1S / 128, + SBT_1S / 64, + SBT_1S / 32, + SBT_1S / 16, + SBT_1S / 8, + SBT_1S / 4, + SBT_1S / 2, + }; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + /* + * If both periodic and alarm interrupts are enabled then use the + * periodic frequency to drive the callout. The minimum periodic + * frequency (2 Hz) is higher than the alarm frequency (1 Hz) so + * piggyback the alarm on top of it. The same argument applies to + * the update interrupt. + */ + if (pintr_enabled(vrtc) && divider_enabled(vrtc->rtcdev.reg_a)) { + ratesel = vrtc->rtcdev.reg_a & 0xf; + return (pf[ratesel]); + } else if (aintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else if (uintr_enabled(vrtc) && update_enabled(vrtc)) { + return (SBT_1S); + } else { + return (0); + } +} + +static void +vrtc_callout_reset(struct vrtc *vrtc, sbintime_t freqsbt) +{ + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + if (freqsbt == 0) { + if (callout_active(&vrtc->callout)) { + VM_CTR0(vrtc->vm, "RTC callout stopped"); + callout_stop(&vrtc->callout); + } + return; + } + VM_CTR1(vrtc->vm, "RTC callout frequency %d hz", SBT_1S / freqsbt); + callout_reset_sbt(&vrtc->callout, freqsbt, 0, vrtc_callout_handler, + vrtc, 0); +} + +static void +vrtc_callout_handler(void *arg) +{ + struct vrtc *vrtc = arg; + sbintime_t freqsbt, basetime; + time_t rtctime; + int error; + + VM_CTR0(vrtc->vm, "vrtc callout fired"); + + VRTC_LOCK(vrtc); + if (callout_pending(&vrtc->callout)) /* callout was reset */ + goto done; + + if (!callout_active(&vrtc->callout)) /* callout was stopped */ + goto done; + + callout_deactivate(&vrtc->callout); + + KASSERT((vrtc->rtcdev.reg_b & RTCSB_ALL_INTRS) != 0, + ("gratuitous vrtc callout")); + + if (pintr_enabled(vrtc)) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c | RTCIR_PERIOD); + + if (aintr_enabled(vrtc) || uintr_enabled(vrtc)) { + rtctime = vrtc_curtime(vrtc, &basetime); + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("%s: vrtc_time_update error %d", + __func__, error)); + } + + freqsbt = vrtc_freq(vrtc); + KASSERT(freqsbt != 0, ("%s: vrtc frequency cannot be zero", __func__)); + vrtc_callout_reset(vrtc, freqsbt); +done: + VRTC_UNLOCK(vrtc); +} + +static __inline void +vrtc_callout_check(struct vrtc *vrtc, sbintime_t freq) +{ + int active; + + active = callout_active(&vrtc->callout) ? 1 : 0; + KASSERT((freq == 0 && !active) || (freq != 0 && active), + ("vrtc callout %s with frequency %#lx", + active ? 
"active" : "inactive", freq)); +} + +static void +vrtc_set_reg_c(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + int oldirqf, newirqf; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + newval &= RTCIR_ALARM | RTCIR_PERIOD | RTCIR_UPDATE; + + oldirqf = rtc->reg_c & RTCIR_INT; + if ((aintr_enabled(vrtc) && (newval & RTCIR_ALARM) != 0) || + (pintr_enabled(vrtc) && (newval & RTCIR_PERIOD) != 0) || + (uintr_enabled(vrtc) && (newval & RTCIR_UPDATE) != 0)) { + newirqf = RTCIR_INT; + } else { + newirqf = 0; + } + + oldval = rtc->reg_c; + rtc->reg_c = newirqf | newval; + changed = oldval ^ rtc->reg_c; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_c changed from %#x to %#x", + oldval, rtc->reg_c); + } + + if (!oldirqf && newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d asserted", RTC_IRQ); + vatpic_pulse_irq(vrtc->vm, RTC_IRQ); + vioapic_pulse_irq(vrtc->vm, RTC_IRQ); + } else if (oldirqf && !newirqf) { + VM_CTR1(vrtc->vm, "RTC irq %d deasserted", RTC_IRQ); + } +} + +static int +vrtc_set_reg_b(struct vrtc *vrtc, uint8_t newval) +{ + struct rtcdev *rtc; + sbintime_t oldfreq, newfreq, basetime; + time_t curtime, rtctime; + int error; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + rtc = &vrtc->rtcdev; + oldval = rtc->reg_b; + oldfreq = vrtc_freq(vrtc); + + rtc->reg_b = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_b changed from %#x to %#x", + oldval, newval); + } + + if (changed & RTCSB_HALT) { + if ((newval & RTCSB_HALT) == 0) { + rtctime = rtc_to_secs(vrtc); + basetime = sbinuptime(); + if (rtctime == VRTC_BROKEN_TIME) { + if (rtc_flag_broken_time) + return (-1); + } + } else { + curtime = vrtc_curtime(vrtc, &basetime); + KASSERT(curtime == vrtc->base_rtctime, ("%s: mismatch " + "between vrtc basetime (%#lx) and curtime (%#lx)", + __func__, vrtc->base_rtctime, curtime)); + + /* + * Force a refresh of the RTC date/time fields so + * they reflect the time right before the guest set + * the HALT bit. + */ + secs_to_rtc(curtime, vrtc, 1); + + /* + * Updates are halted so mark 'base_rtctime' to denote + * that the RTC date/time is in flux. + */ + rtctime = VRTC_BROKEN_TIME; + rtc->reg_b &= ~RTCSB_UINTR; + } + error = vrtc_time_update(vrtc, rtctime, basetime); + KASSERT(error == 0, ("vrtc_time_update error %d", error)); + } + + /* + * Side effect of changes to the interrupt enable bits. + */ + if (changed & RTCSB_ALL_INTRS) + vrtc_set_reg_c(vrtc, vrtc->rtcdev.reg_c); + + /* + * Change the callout frequency if it has changed. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); + + /* + * The side effect of bits that control the RTC date/time format + * is handled lazily when those fields are actually read. + */ + return (0); +} + +static void +vrtc_set_reg_a(struct vrtc *vrtc, uint8_t newval) +{ + sbintime_t oldfreq, newfreq; + uint8_t oldval, changed; + + KASSERT(VRTC_LOCKED(vrtc), ("%s: vrtc not locked", __func__)); + + newval &= ~RTCSA_TUP; + oldval = vrtc->rtcdev.reg_a; + oldfreq = vrtc_freq(vrtc); + + if (divider_enabled(oldval) && !divider_enabled(newval)) { + VM_CTR2(vrtc->vm, "RTC divider held in reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else if (!divider_enabled(oldval) && divider_enabled(newval)) { + /* + * If the dividers are coming out of reset then update + * 'base_uptime' before this happens. 
This is done to + * maintain the illusion that the RTC date/time was frozen + * while the dividers were disabled. + */ + vrtc->base_uptime = sbinuptime(); + VM_CTR2(vrtc->vm, "RTC divider out of reset at %#lx/%#lx", + vrtc->base_rtctime, vrtc->base_uptime); + } else { + /* NOTHING */ + } + + vrtc->rtcdev.reg_a = newval; + changed = oldval ^ newval; + if (changed) { + VM_CTR2(vrtc->vm, "RTC reg_a changed from %#x to %#x", + oldval, newval); + } + + /* + * Side effect of changes to rate select and divider enable bits. + */ + newfreq = vrtc_freq(vrtc); + if (newfreq != oldfreq) + vrtc_callout_reset(vrtc, newfreq); + else + vrtc_callout_check(vrtc, newfreq); +} + +int +vrtc_set_time(struct vm *vm, time_t secs) +{ + struct vrtc *vrtc; + int error; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + error = vrtc_time_update(vrtc, secs, sbinuptime()); + VRTC_UNLOCK(vrtc); + + if (error) { + VM_CTR2(vrtc->vm, "Error %d setting RTC time to %#lx", error, + secs); + } else { + VM_CTR1(vrtc->vm, "RTC time set to %#lx", secs); + } + + return (error); +} + +time_t +vrtc_get_time(struct vm *vm) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t t; + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + t = vrtc_curtime(vrtc, &basetime); + VRTC_UNLOCK(vrtc); + + return (t); +} + +int +vrtc_nvram_write(struct vm *vm, int offset, uint8_t value) +{ + struct vrtc *vrtc; + uint8_t *ptr; + + vrtc = vm_rtc(vm); + + /* + * Don't allow writes to RTC control registers or the date/time fields. + */ + if (offset < offsetof(struct rtcdev, nvram[0]) || + offset == RTC_CENTURY || offset >= sizeof(struct rtcdev)) { + VM_CTR1(vrtc->vm, "RTC nvram write to invalid offset %d", + offset); + return (EINVAL); + } + + VRTC_LOCK(vrtc); + ptr = (uint8_t *)(&vrtc->rtcdev); + ptr[offset] = value; + VM_CTR2(vrtc->vm, "RTC nvram write %#x to offset %#x", value, offset); + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval) +{ + struct vrtc *vrtc; + sbintime_t basetime; + time_t curtime; + uint8_t *ptr; + + /* + * Allow all offsets in the RTC to be read. + */ + if (offset < 0 || offset >= sizeof(struct rtcdev)) + return (EINVAL); + + vrtc = vm_rtc(vm); + VRTC_LOCK(vrtc); + + /* + * Update RTC date/time fields if necessary. + */ + if (offset < 10 || offset == RTC_CENTURY) { + curtime = vrtc_curtime(vrtc, &basetime); + secs_to_rtc(curtime, vrtc, 0); + } + + ptr = (uint8_t *)(&vrtc->rtcdev); + *retval = ptr[offset]; + + VRTC_UNLOCK(vrtc); + return (0); +} + +int +vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vrtc *vrtc; + + vrtc = vm_rtc(vm); + + if (bytes != 1) + return (-1); + + if (in) { + *val = 0xff; + return (0); + } + + VRTC_LOCK(vrtc); + vrtc->addr = *val & 0x7f; + VRTC_UNLOCK(vrtc); + + return (0); +} + +int +vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + sbintime_t basetime; + time_t curtime; + int error, offset; + + vrtc = vm_rtc(vm); + rtc = &vrtc->rtcdev; + + if (bytes != 1) + return (-1); + + VRTC_LOCK(vrtc); + offset = vrtc->addr; + if (offset >= sizeof(struct rtcdev)) { + VRTC_UNLOCK(vrtc); + return (-1); + } + + error = 0; + curtime = vrtc_curtime(vrtc, &basetime); + vrtc_time_update(vrtc, curtime, basetime); + + /* + * Update RTC date/time fields if necessary. + * + * This is not just for reads of the RTC. The side-effect of writing + * the century byte requires other RTC date/time fields (e.g. sec) + * to be updated here. 
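/*
 * [Editor's note; an illustration, not part of the commit.] Offset 12
 * below is reg_c, a read-to-clear register: the read path returns the
 * pending interrupt flags and then calls vrtc_set_reg_c(vrtc, 0),
 * mirroring real hardware where reading reg_c acknowledges the RTC
 * interrupt.
 */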
+ */ + if (offset < 10 || offset == RTC_CENTURY) + secs_to_rtc(curtime, vrtc, 0); + + if (in) { + if (offset == 12) { + /* + * XXX + * reg_c interrupt flags are updated only if the + * corresponding interrupt enable bit in reg_b is set. + */ + *val = vrtc->rtcdev.reg_c; + vrtc_set_reg_c(vrtc, 0); + } else { + *val = *((uint8_t *)rtc + offset); + } + VCPU_CTR2(vm, vcpuid, "Read value %#x from RTC offset %#x", + *val, offset); + } else { + switch (offset) { + case 10: + VCPU_CTR1(vm, vcpuid, "RTC reg_a set to %#x", *val); + vrtc_set_reg_a(vrtc, *val); + break; + case 11: + VCPU_CTR1(vm, vcpuid, "RTC reg_b set to %#x", *val); + error = vrtc_set_reg_b(vrtc, *val); + break; + case 12: + VCPU_CTR1(vm, vcpuid, "RTC reg_c set to %#x (ignored)", + *val); + break; + case 13: + VCPU_CTR1(vm, vcpuid, "RTC reg_d set to %#x (ignored)", + *val); + break; + case 0: + /* + * High order bit of 'seconds' is readonly. + */ + *val &= 0x7f; + /* FALLTHRU */ + default: + VCPU_CTR2(vm, vcpuid, "RTC offset %#x set to %#x", + offset, *val); + *((uint8_t *)rtc + offset) = *val; + break; + } + + /* + * XXX some guests (e.g. OpenBSD) write the century byte + * outside of RTCSB_HALT so re-calculate the RTC date/time. + */ + if (offset == RTC_CENTURY && !rtc_halted(vrtc)) { + curtime = rtc_to_secs(vrtc); + error = vrtc_time_update(vrtc, curtime, sbinuptime()); + KASSERT(!error, ("vrtc_time_update error %d", error)); + if (curtime == VRTC_BROKEN_TIME && rtc_flag_broken_time) + error = -1; + } + } + VRTC_UNLOCK(vrtc); + return (error); +} + +void +vrtc_reset(struct vrtc *vrtc) +{ + struct rtcdev *rtc; + + VRTC_LOCK(vrtc); + + rtc = &vrtc->rtcdev; + vrtc_set_reg_b(vrtc, rtc->reg_b & ~(RTCSB_ALL_INTRS | RTCSB_SQWE)); + vrtc_set_reg_c(vrtc, 0); + KASSERT(!callout_active(&vrtc->callout), ("rtc callout still active")); + + VRTC_UNLOCK(vrtc); +} + +struct vrtc * +vrtc_init(struct vm *vm) +{ + struct vrtc *vrtc; + struct rtcdev *rtc; + time_t curtime; + + vrtc = malloc(sizeof(struct vrtc), M_VRTC, M_WAITOK | M_ZERO); + vrtc->vm = vm; + mtx_init(&vrtc->mtx, "vrtc lock", NULL, MTX_DEF); + callout_init(&vrtc->callout, 1); + + /* Allow dividers to keep time but disable everything else */ + rtc = &vrtc->rtcdev; + rtc->reg_a = 0x20; + rtc->reg_b = RTCSB_24HR; + rtc->reg_c = 0; + rtc->reg_d = RTCSD_PWR; + + /* Reset the index register to a safe value. */ + vrtc->addr = RTC_STATUSD; + + /* + * Initialize RTC time to 00:00:00 Jan 1, 1970. + */ + curtime = 0; + + VRTC_LOCK(vrtc); + vrtc->base_rtctime = VRTC_BROKEN_TIME; + vrtc_time_update(vrtc, curtime, sbinuptime()); + secs_to_rtc(curtime, vrtc, 0); + VRTC_UNLOCK(vrtc); + + return (vrtc); +} + +void +vrtc_cleanup(struct vrtc *vrtc) +{ + + callout_drain(&vrtc->callout); + free(vrtc, M_VRTC); +} + +#ifndef __FreeBSD__ +void +vrtc_localize_resources(struct vrtc *vrtc) +{ + vmm_glue_callout_localize(&vrtc->callout); +} +#endif /* __FreeBSD */ diff --git a/usr/src/uts/i86pc/io/vmm/io/vrtc.h b/usr/src/uts/i86pc/io/vmm/io/vrtc.h new file mode 100644 index 0000000000..13abbedeb9 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/io/vrtc.h @@ -0,0 +1,60 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2014 Neel Natu (neel@freebsd.org) + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Copyright 2018 Joyent, Inc. + */ + +#ifndef _VRTC_H_ +#define _VRTC_H_ + +#include <isa/isareg.h> + +struct vrtc; + +struct vrtc *vrtc_init(struct vm *vm); +void vrtc_cleanup(struct vrtc *vrtc); +void vrtc_reset(struct vrtc *vrtc); + +time_t vrtc_get_time(struct vm *vm); +int vrtc_set_time(struct vm *vm, time_t secs); +int vrtc_nvram_write(struct vm *vm, int offset, uint8_t value); +int vrtc_nvram_read(struct vm *vm, int offset, uint8_t *retval); + +int vrtc_addr_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); +int vrtc_data_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes, + uint32_t *val); + +#ifndef __FreeBSD__ +void vrtc_localize_resources(struct vrtc *); +#endif + +#endif diff --git a/usr/src/uts/i86pc/io/vmm/offsets.in b/usr/src/uts/i86pc/io/vmm/offsets.in deleted file mode 100644 index 4b1fe1d6b6..0000000000 --- a/usr/src/uts/i86pc/io/vmm/offsets.in +++ /dev/null @@ -1,72 +0,0 @@ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - */ - -/* - * Copyright 2014 Pluribus Networks Inc. 
- */ - -#include <sys/types.h> -#include <sys/param.h> -#include <sys/systm.h> -#include <sys/cpuvar.h> - -#include <machine/pmap.h> - -#include <machine/vmm.h> -#include "intel/vmx_cpufunc.h" -#include "intel/vmx.h" - -vmxctx - tmpstktop VMXCTX_TMPSTKTOP - guest_rdi VMXCTX_GUEST_RDI - guest_rsi VMXCTX_GUEST_RSI - guest_rdx VMXCTX_GUEST_RDX - guest_rcx VMXCTX_GUEST_RCX - guest_r8 VMXCTX_GUEST_R8 - guest_r9 VMXCTX_GUEST_R9 - guest_rax VMXCTX_GUEST_RAX - guest_rbx VMXCTX_GUEST_RBX - guest_rbp VMXCTX_GUEST_RBP - guest_r10 VMXCTX_GUEST_R10 - guest_r11 VMXCTX_GUEST_R11 - guest_r12 VMXCTX_GUEST_R12 - guest_r13 VMXCTX_GUEST_R13 - guest_r14 VMXCTX_GUEST_R14 - guest_r15 VMXCTX_GUEST_R15 - guest_cr2 VMXCTX_GUEST_CR2 - host_r15 VMXCTX_HOST_R15 - host_r14 VMXCTX_HOST_R14 - host_r13 VMXCTX_HOST_R13 - host_r12 VMXCTX_HOST_R12 - host_rbp VMXCTX_HOST_RBP - host_rsp VMXCTX_HOST_RSP - host_rbx VMXCTX_HOST_RBX - host_rip VMXCTX_HOST_RIP - launch_error VMXCTX_LAUNCH_ERROR - -vmx VMX_SIZE - -\#define VM_SUCCESS 0 -\#define VM_FAIL_INVALID 1 -\#define VM_FAIL_VALID 2 - -\#define VMX_RETURN_DIRECT 0 -\#define VMX_RETURN_LONGJMP 1 -\#define VMX_RETURN_VMRESUME 2 -\#define VMX_RETURN_VMLAUNCH 3 -\#define VMX_RETURN_AST 4 - -cpu - cpu_thread - -_kthread - t_lwp - _tu._ts._t_astflag T_ASTFLAG diff --git a/usr/src/uts/i86pc/io/vmm/vm/pmap.h b/usr/src/uts/i86pc/io/vmm/vm/pmap.h new file mode 100644 index 0000000000..512fc4acee --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/pmap.h @@ -0,0 +1,27 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _PMAP_VM_ +#define _PMAP_VM_ + +#include <machine/pmap.h> +#include "vm_glue.h" + +void pmap_invalidate_cache(void); +void pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num); +int pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype); +long pmap_wired_count(pmap_t pmap); + +#endif /* _PMAP_VM_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_extern.h b/usr/src/uts/i86pc/io/vmm/vm/vm_extern.h new file mode 100644 index 0000000000..92a959960a --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_extern.h @@ -0,0 +1,35 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. 
+ */ + +#ifndef _VM_EXTERN_H_ +#define _VM_EXTERN_H_ + +#include <sys/types.h> +#include <vm/vm.h> + +struct vmspace; +struct pmap; + +typedef int (*pmap_pinit_t)(struct pmap *pmap); + +struct vmspace *vmspace_alloc(vm_offset_t, vm_offset_t, pmap_pinit_t); +void vmspace_free(struct vmspace *); + +int vm_fault(vm_map_t, vm_offset_t, vm_prot_t, int); +int vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, + vm_prot_t prot, vm_page_t *ma, int max_count); + + +#endif /* _VM_EXTERN_H_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h b/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h new file mode 100644 index 0000000000..600872c321 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_glue.h @@ -0,0 +1,99 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _VM_GLUE_ +#define _VM_GLUE_ + +#include <vm/pmap.h> +#include <vm/vm.h> +#include <sys/cpuvar.h> + +struct vmspace; +struct vm_map; +struct pmap; +struct vm_object; +struct vmm_pt_ops; + +struct vm_map { + struct vmspace *vmm_space; +}; + +struct pmap { + void *pm_pml4; + cpuset_t pm_active; + long pm_eptgen; + + /* Implementation private */ + enum pmap_type pm_type; + struct vmm_pt_ops *pm_ops; + void *pm_impl; +}; + +struct vmspace { + struct vm_map vm_map; + + /* Implementation private */ + kmutex_t vms_lock; + boolean_t vms_map_changing; + struct pmap vms_pmap; + uintptr_t vms_size; /* fixed after creation */ + + list_t vms_maplist; +}; + +typedef pfn_t (*vm_pager_fn_t)(vm_object_t, uintptr_t, pfn_t *, uint_t *); + +struct vm_object { + uint_t vmo_refcnt; /* manipulated with atomic ops */ + + /* This group of fields are fixed at creation time */ + objtype_t vmo_type; + size_t vmo_size; + vm_pager_fn_t vmo_pager; + void *vmo_data; + + kmutex_t vmo_lock; /* protects fields below */ + vm_memattr_t vmo_attr; +}; + +struct vm_page { + kmutex_t vmp_lock; + pfn_t vmp_pfn; + struct vm_object *vmp_obj_held; +}; + +/* Illumos-specific functions for setup and operation */ +int vm_segmap_obj(struct vmspace *, vm_object_t, struct as *, caddr_t *, + uint_t, uint_t, uint_t); +int vm_segmap_space(struct vmspace *, off_t, struct as *, caddr_t *, off_t, + uint_t, uint_t, uint_t); +void *vmspace_find_kva(struct vmspace *, uintptr_t, size_t); +void vmm_arena_init(void); +void vmm_arena_fini(void); + +struct vmm_pt_ops { + void * (*vpo_init)(uint64_t *); + void (*vpo_free)(void *); + uint64_t (*vpo_wired_cnt)(void *); + int (*vpo_is_wired)(void *, uint64_t, uint_t *); + int (*vpo_map)(void *, uint64_t, pfn_t, uint_t, uint_t, uint8_t); + uint64_t (*vpo_unmap)(void *, uint64_t, uint64_t); +}; + +extern struct vmm_pt_ops ept_ops; +extern struct vmm_pt_ops rvi_ops; + + +#endif /* _VM_GLUE_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_map.h b/usr/src/uts/i86pc/io/vmm/vm/vm_map.h new file mode 100644 index 0000000000..70826ac8f1 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_map.h @@ -0,0 +1,63 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VM_MAP_ +#define _VM_MAP_ + +#include "vm_glue.h" + +/* + * vm_map_wire and vm_map_unwire option flags + */ +#define VM_MAP_WIRE_SYSTEM 0 /* wiring in a kernel map */ +#define VM_MAP_WIRE_USER 1 /* wiring in a user map */ + +#define VM_MAP_WIRE_NOHOLES 0 /* region must not have holes */ +#define VM_MAP_WIRE_HOLESOK 2 /* region may have holes */ + +#define VM_MAP_WIRE_WRITE 4 /* Validate writable. */ + +/* + * The following "find_space" options are supported by vm_map_find(). + * + * For VMFS_ALIGNED_SPACE, the desired alignment is specified to + * the macro argument as log base 2 of the desired alignment. + */ +#define VMFS_NO_SPACE 0 /* don't find; use the given range */ +#define VMFS_ANY_SPACE 1 /* find range with any alignment */ +#define VMFS_OPTIMAL_SPACE 2 /* find range with optimal alignment */ +#define VMFS_SUPER_SPACE 3 /* find superpage-aligned range */ +#define VMFS_ALIGNED_SPACE(x) ((x) << 8) /* find range with fixed alignment */ + +/* + * vm_fault option flags + */ +#define VM_FAULT_NORMAL 0 /* Nothing special */ +#define VM_FAULT_WIRE 1 /* Wire the mapped page */ +#define VM_FAULT_DIRTY 2 /* Dirty the page; use w/VM_PROT_COPY */ + + + +pmap_t vmspace_pmap(struct vmspace *); + +int vm_map_find(vm_map_t, vm_object_t, vm_ooffset_t, vm_offset_t *, vm_size_t, + vm_offset_t, int, vm_prot_t, vm_prot_t, int); +int vm_map_remove(vm_map_t, vm_offset_t, vm_offset_t); +int vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags); + +long vmspace_resident_count(struct vmspace *vmspace); + + +#endif /* _VM_MAP_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_object.h b/usr/src/uts/i86pc/io/vmm/vm/vm_object.h new file mode 100644 index 0000000000..1f16fa9b83 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_object.h @@ -0,0 +1,31 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VM_OBJECT_ +#define _VM_OBJECT_ + +#include "vm_glue.h" + +vm_object_t vm_object_allocate(objtype_t, vm_pindex_t); +void vm_object_deallocate(vm_object_t); +void vm_object_reference(vm_object_t); +int vm_object_set_memattr(vm_object_t, vm_memattr_t); +void vm_object_clear(vm_object_t); + + +#define VM_OBJECT_WLOCK(vmo) mutex_enter(&(vmo)->vmo_lock) +#define VM_OBJECT_WUNLOCK(vmo) mutex_exit(&(vmo)->vmo_lock) + +#endif /* _VM_OBJECT_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_page.h b/usr/src/uts/i86pc/io/vmm/vm/vm_page.h new file mode 100644 index 0000000000..4559fe6d4c --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_page.h @@ -0,0 +1,28 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + + +#ifndef _VM_PAGE_ +#define _VM_PAGE_ + +#include "vm_glue.h" + +void vm_page_lock(vm_page_t); +void vm_page_unhold(vm_page_t); +void vm_page_unlock(vm_page_t); + +#define VM_PAGE_TO_PHYS(page) (mmu_ptob((uintptr_t)((page)->vmp_pfn))) + +#endif /* _VM_PAGE_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vm/vm_pager.h b/usr/src/uts/i86pc/io/vmm/vm/vm_pager.h new file mode 100644 index 0000000000..11aa344f61 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vm/vm_pager.h @@ -0,0 +1,23 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _VM_PAGER_ +#define _VM_PAGER_ + +vm_object_t vm_pager_allocate(objtype_t, void *, vm_ooffset_t, vm_prot_t, + vm_ooffset_t, void *); + + +#endif /* _VM_PAGER_ */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c index 7081368f4a..6df094b50e 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm.c +++ b/usr/src/uts/i86pc/io/vmm/vmm.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,10 +38,11 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> @@ -51,16 +54,26 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon #include <sys/lock.h> #include <sys/mutex.h> #include <sys/proc.h> +#include <sys/rwlock.h> #include <sys/sched.h> #include <sys/smp.h> -#include <x86/psl.h> #include <sys/systm.h> #include <vm/vm.h> - -#include <machine/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/pmap.h> +#include <vm/vm_extern.h> +#include <vm/vm_param.h> + +#ifdef __FreeBSD__ +#include <machine/cpu.h> +#endif #include <machine/pcb.h> #include <machine/smp.h> +#include <machine/md_var.h> +#include <x86/psl.h> #include <x86/apicreg.h> #include <machine/vmm.h> @@ -77,83 +90,132 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon #include "vhpet.h" #include "vioapic.h" #include "vlapic.h" -#include "vmm_ipi.h" +#include "vpmtmr.h" +#include "vrtc.h" #include "vmm_stat.h" #include "vmm_lapic.h" -#ifdef __FreeBSD__ #include "io/ppt.h" #include "io/iommu.h" -#endif -struct vhpet; -struct vioapic; struct vlapic; +/* + * Initialization: + * (a) allocated when vcpu is created + * (i) initialized when vcpu is created and when it is reinitialized + * (o) initialized the first time the vcpu is created + * (x) initialized before use + */ struct vcpu { - int flags; - enum vcpu_state state; - struct mtx mtx; - int hostcpu; /* host cpuid this vcpu last ran on */ - struct vlapic *vlapic; - int vcpuid; - struct savefpu *guestfpu; /* guest fpu state */ - void *stats; - struct vm_exit exitinfo; + struct mtx mtx; /* (o) protects 'state' and 'hostcpu' */ + enum vcpu_state state; /* (o) vcpu state */ +#ifndef __FreeBSD__ + kcondvar_t vcpu_cv; /* (o) cpu waiter cv */ + kcondvar_t state_cv; /* (o) IDLE-transition cv */ +#endif /* __FreeBSD__ */ + int hostcpu; /* (o) vcpu's current host cpu */ +#ifndef __FreeBSD__ + int lastloccpu; /* (o) last host cpu localized to */ +#endif + u_int runblock; /* (i) block vcpu from run state */ + int reqidle; /* (i) request vcpu to idle */ + struct vlapic *vlapic; /* (i) APIC device model */ + enum x2apic_state x2apic_state; /* (i) APIC mode */ + uint64_t exitintinfo; /* (i) events pending at VM exit */ + int nmi_pending; /* (i) NMI pending */ + int extint_pending; /* (i) INTR pending */ + int exception_pending; /* (i) exception pending */ + int exc_vector; /* (x) exception collateral */ + int exc_errcode_valid; + uint32_t exc_errcode; + struct savefpu *guestfpu; /* (a,i) guest fpu state */ + uint64_t guest_xcr0; /* (i) guest %xcr0 register */ + void *stats; /* (a,i) statistics */ + struct vm_exit exitinfo; /* (x) exit reason and collateral */ uint64_t nextrip; /* (x) next instruction to execute */ - enum x2apic_state x2apic_state; - uint64_t exitintinfo; - int nmi_pending; - int extint_pending; - struct vm_exception exception; - int exception_pending; +#ifndef __FreeBSD__ + uint64_t tsc_offset; /* (x) offset from host TSC */ +#endif }; +#define vcpu_lock_initialized(v) mtx_initialized(&((v)->mtx)) #define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN) #define vcpu_lock(v) mtx_lock_spin(&((v)->mtx)) #define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx)) #define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED) -#define VM_MAX_MEMORY_SEGMENTS 8 - -struct vm { - void *cookie; /* processor-specific data */ - void *iommu; /* 
iommu-specific data */ - struct vcpu vcpu[VM_MAXCPU]; - struct vhpet *vhpet; - struct vioapic *vioapic; /* virtual ioapic */ - struct vatpic *vatpic; /* virtual atpic */ - struct vatpit *vatpit; /* virtual atpit */ - int num_mem_segs; - struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS]; - char name[VM_MAX_NAMELEN]; +struct mem_seg { + size_t len; + bool sysmem; + struct vm_object *object; +}; +#ifdef __FreeBSD__ +#define VM_MAX_MEMSEGS 3 +#else +#define VM_MAX_MEMSEGS 4 +#endif - /* - * Set of active vcpus. - * An active vcpu is one that has been started implicitly (BSP) or - * explicitly (AP) by sending it a startup ipi. - */ - cpuset_t active_cpus; +struct mem_map { + vm_paddr_t gpa; + size_t len; + vm_ooffset_t segoff; + int segid; + int prot; + int flags; +}; +#define VM_MAX_MEMMAPS 4 - vm_rendezvous_func_t rendezvous_func; +/* + * Initialization: + * (o) initialized the first time the VM is created + * (i) initialized when VM is created and when it is reinitialized + * (x) initialized before use + */ +struct vm { + void *cookie; /* (i) cpu-specific data */ + void *iommu; /* (x) iommu-specific data */ + struct vhpet *vhpet; /* (i) virtual HPET */ + struct vioapic *vioapic; /* (i) virtual ioapic */ + struct vatpic *vatpic; /* (i) virtual atpic */ + struct vatpit *vatpit; /* (i) virtual atpit */ + struct vpmtmr *vpmtmr; /* (i) virtual ACPI PM timer */ + struct vrtc *vrtc; /* (o) virtual RTC */ + volatile cpuset_t active_cpus; /* (i) active vcpus */ + volatile cpuset_t debug_cpus; /* (i) vcpus stopped for debug */ + int suspend; /* (i) stop VM execution */ + volatile cpuset_t suspended_cpus; /* (i) suspended vcpus */ + volatile cpuset_t halted_cpus; /* (x) cpus in a hard halt */ + struct mem_map mem_maps[VM_MAX_MEMMAPS]; /* (i) guest address space */ + struct mem_seg mem_segs[VM_MAX_MEMSEGS]; /* (o) guest memory regions */ + struct vmspace *vmspace; /* (o) guest's address space */ + char name[VM_MAX_NAMELEN]; /* (o) virtual machine name */ + struct vcpu vcpu[VM_MAXCPU]; /* (i) guest vcpus */ + /* The following describe the vm cpu topology */ + uint16_t sockets; /* (o) num of sockets */ + uint16_t cores; /* (o) num of cores/socket */ + uint16_t threads; /* (o) num of threads/core */ + uint16_t maxcpus; /* (o) max pluggable cpus */ +#ifndef __FreeBSD__ + list_t ioport_hooks; +#endif /* __FreeBSD__ */ }; static int vmm_initialized; static struct vmm_ops *ops; -#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0) +#define VMM_INIT(num) (ops != NULL ? (*ops->init)(num) : 0) #define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0) +#define VMM_RESUME() (ops != NULL ? (*ops->resume)() : 0) -#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL) -#define VMRUN(vmi, vcpu, rip) \ - (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO) +#define VMINIT(vm, pmap) (ops != NULL ? (*ops->vminit)(vm, pmap): NULL) +#define VMRUN(vmi, vcpu, rip, pmap, evinfo) \ + (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip, pmap, evinfo) : ENXIO) #define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL) -#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \ - (ops != NULL ? \ - (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \ - ENXIO) -#define VMMMAP_GET(vmi, gpa) \ - (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO) +#define VMSPACE_ALLOC(min, max) \ + (ops != NULL ? (*ops->vmspace_alloc)(min, max) : NULL) +#define VMSPACE_FREE(vmspace) \ + (ops != NULL ? (*ops->vmspace_free)(vmspace) : ENXIO) #define VMGETREG(vmi, vcpu, num, retval) \ (ops != NULL ? 
(*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO) #define VMSETREG(vmi, vcpu, num, val) \ @@ -174,45 +236,134 @@ static struct vmm_ops *ops; #define fpu_start_emulating() load_cr0(rcr0() | CR0_TS) #define fpu_stop_emulating() clts() +SDT_PROVIDER_DEFINE(vmm); + static MALLOC_DEFINE(M_VM, "vm", "vm"); /* statistics */ static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime"); +SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); + +/* + * Halt the guest if all vcpus are executing a HLT instruction with + * interrupts disabled. + */ +static int halt_detection_enabled = 1; +SYSCTL_INT(_hw_vmm, OID_AUTO, halt_detection, CTLFLAG_RDTUN, + &halt_detection_enabled, 0, + "Halt VM if all vcpus execute HLT with interrupts disabled"); + static int vmm_ipinum; SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0, "IPI vector used for vcpu notifications"); +static int trace_guest_exceptions; +SYSCTL_INT(_hw_vmm, OID_AUTO, trace_guest_exceptions, CTLFLAG_RDTUN, + &trace_guest_exceptions, 0, + "Trap into hypervisor on all guest exceptions and reflect them back"); + +static void vm_free_memmap(struct vm *vm, int ident); +static bool sysmem_mapping(struct vm *vm, struct mem_map *mm); +static void vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr); + +#ifndef __FreeBSD__ +static void vm_clear_memseg(struct vm *, int); + +typedef struct vm_ioport_hook { + list_node_t vmih_node; + uint_t vmih_ioport; + void *vmih_arg; + vmm_rmem_cb_t vmih_rmem_cb; + vmm_wmem_cb_t vmih_wmem_cb; +} vm_ioport_hook_t; + +/* Flags for vtc_status */ +#define VTCS_FPU_RESTORED 1 /* guest FPU restored, host FPU saved */ +#define VTCS_FPU_CTX_CRITICAL 2 /* in ctx where FPU restore cannot be lazy */ + +typedef struct vm_thread_ctx { + struct vm *vtc_vm; + int vtc_vcpuid; + uint_t vtc_status; +} vm_thread_ctx_t; +#endif /* __FreeBSD__ */ + +#ifdef KTR +static const char * +vcpu_state2str(enum vcpu_state state) +{ + + switch (state) { + case VCPU_IDLE: + return ("idle"); + case VCPU_FROZEN: + return ("frozen"); + case VCPU_RUNNING: + return ("running"); + case VCPU_SLEEPING: + return ("sleeping"); + default: + return ("unknown"); + } +} +#endif + static void -vcpu_cleanup(struct vm *vm, int i) +vcpu_cleanup(struct vm *vm, int i, bool destroy) { struct vcpu *vcpu = &vm->vcpu[i]; VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic); -#ifdef __FreeBSD__ - vmm_stat_free(vcpu->stats); -#endif - fpu_save_area_free(vcpu->guestfpu); + if (destroy) { + vmm_stat_free(vcpu->stats); + fpu_save_area_free(vcpu->guestfpu); + } } static void -vcpu_init(struct vm *vm, uint32_t vcpu_id) +vcpu_init(struct vm *vm, int vcpu_id, bool create) { struct vcpu *vcpu; - + + KASSERT(vcpu_id >= 0 && vcpu_id < vm->maxcpus, + ("vcpu_init: invalid vcpu %d", vcpu_id)); + vcpu = &vm->vcpu[vcpu_id]; - vcpu_lock_init(vcpu); - vcpu->hostcpu = NOCPU; - vcpu->vcpuid = vcpu_id; + if (create) { +#ifdef __FreeBSD__ + KASSERT(!vcpu_lock_initialized(vcpu), ("vcpu %d already " + "initialized", vcpu_id)); +#endif + vcpu_lock_init(vcpu); + vcpu->state = VCPU_IDLE; + vcpu->hostcpu = NOCPU; +#ifndef __FreeBSD__ + vcpu->lastloccpu = NOCPU; +#endif + vcpu->guestfpu = fpu_save_area_alloc(); + vcpu->stats = vmm_stat_alloc(); + } + vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id); vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED); + vcpu->runblock = 0; + vcpu->reqidle = 0; vcpu->exitintinfo = 0; - vcpu->guestfpu = fpu_save_area_alloc(); + vcpu->nmi_pending = 0; + vcpu->extint_pending = 0; + vcpu->exception_pending = 0; + vcpu->guest_xcr0 = XFEATURE_ENABLED_X87; 
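	/* XCR0 bit 0 (x87 state) is architecturally required to be set */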
fpu_save_area_reset(vcpu->guestfpu); -#ifdef __FreeBSD__ - vcpu->stats = vmm_stat_alloc(); -#endif + vmm_stat_init(vcpu->stats); +} + +int +vcpu_trace_exceptions(struct vm *vm, int vcpuid) +{ + + return (trace_guest_exceptions); } struct vm_exit * @@ -220,7 +371,7 @@ vm_exitinfo(struct vm *vm, int cpuid) { struct vcpu *vcpu; - if (cpuid < 0 || cpuid >= VM_MAXCPU) + if (cpuid < 0 || cpuid >= vm->maxcpus) panic("vm_exitinfo: invalid cpuid %d", cpuid); vcpu = &vm->vcpu[cpuid]; @@ -228,24 +379,35 @@ vm_exitinfo(struct vm *vm, int cpuid) return (&vcpu->exitinfo); } +#ifdef __FreeBSD__ +static void +vmm_resume(void) +{ + VMM_RESUME(); +} +#endif + static int vmm_init(void) { int error; -#ifndef __FreeBSD__ - vmm_sol_glue_init(); -#endif - vmm_host_state_init(); -#ifdef __FreeBSD__ - vmm_ipi_init(); + +#ifdef __FreeBSD__ + vmm_ipinum = lapic_ipi_alloc(pti ? &IDTVEC(justreturn1_pti) : + &IDTVEC(justreturn)); + if (vmm_ipinum < 0) + vmm_ipinum = IPI_AST; +#else + /* We use cpu_poke() for IPIs */ + vmm_ipinum = 0; #endif error = vmm_mem_init(); if (error) return (error); - + if (vmm_is_intel()) ops = &vmm_ops_intel; else if (vmm_is_amd()) @@ -253,10 +415,15 @@ vmm_init(void) else return (ENXIO); - return (VMM_INIT()); +#ifdef __FreeBSD__ + vmm_resume_p = vmm_resume; +#endif + + return (VMM_INIT(vmm_ipinum)); } -#ifdef __FreeBSD__ +#ifdef __FreeBSD__ + static int vmm_handler(module_t mod, int what, void *arg) { @@ -265,8 +432,6 @@ vmm_handler(module_t mod, int what, void *arg) switch (what) { case MOD_LOAD: vmmdev_init(); - if (ppt_num_devices() > 0) - iommu_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; @@ -274,11 +439,12 @@ vmm_handler(module_t mod, int what, void *arg) case MOD_UNLOAD: error = vmmdev_cleanup(); if (error == 0) { -#ifndef __FreeBSD__ - vmm_sol_glue_cleanup(); -#endif + vmm_resume_p = NULL; iommu_cleanup(); - vmm_ipi_cleanup(); +#ifdef __FreeBSD__ + if (vmm_ipinum != IPI_AST) + lapic_ipi_free(vmm_ipinum); +#endif error = VMM_CLEANUP(); /* * Something bad happened - prevent new @@ -304,23 +470,19 @@ static moduledata_t vmm_kmod = { /* * vmm initialization has the following dependencies: * - * - iommu initialization must happen after the pci passthru driver has had - * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE). - * * - VT-x initialization requires smp_rendezvous() and therefore must happen * after SMP is fully functional (after SI_SUB_SMP). 
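 *
 * The SMP ordering constraint is expressed directly through the SYSINIT
 * subsystem id in the DECLARE_MODULE() invocation that follows, i.e.
 *
 *	DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
 *
 * one step after SI_SUB_SMP, by which point smp_rendezvous() is usable.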
*/ DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY); MODULE_VERSION(vmm, 1); -SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL); -#else +#else /* __FreeBSD__ */ + int vmm_mod_load() { int error; - vmmdev_init(); error = vmm_init(); if (error == 0) vmm_initialized = 1; @@ -333,9 +495,6 @@ vmm_mod_unload() { int error; - error = vmmdev_cleanup(); - if (error) - return (error); error = VMM_CLEANUP(); if (error) return (error); @@ -343,16 +502,63 @@ vmm_mod_unload() return (0); } + +#endif /* __FreeBSD__ */ + +static void +vm_init(struct vm *vm, bool create) +{ + int i; +#ifndef __FreeBSD__ + uint64_t tsc_off; #endif + vm->cookie = VMINIT(vm, vmspace_pmap(vm->vmspace)); + vm->iommu = NULL; + vm->vioapic = vioapic_init(vm); + vm->vhpet = vhpet_init(vm); + vm->vatpic = vatpic_init(vm); + vm->vatpit = vatpit_init(vm); + vm->vpmtmr = vpmtmr_init(vm); + if (create) + vm->vrtc = vrtc_init(vm); +#ifndef __FreeBSD__ + if (create) { + list_create(&vm->ioport_hooks, sizeof (vm_ioport_hook_t), + offsetof (vm_ioport_hook_t, vmih_node)); + } else { + VERIFY(list_is_empty(&vm->ioport_hooks)); + } +#endif /* __FreeBSD__ */ + + CPU_ZERO(&vm->active_cpus); + CPU_ZERO(&vm->debug_cpus); + + vm->suspend = 0; + CPU_ZERO(&vm->suspended_cpus); + + for (i = 0; i < vm->maxcpus; i++) + vcpu_init(vm, i, create); + +#ifndef __FreeBSD__ + tsc_off = (uint64_t)(-(int64_t)rdtsc()); + for (i = 0; i < vm->maxcpus; i++) { + vm->vcpu[i].tsc_offset = tsc_off; + } +#endif /* __FreeBSD__ */ +} + +/* + * The default CPU topology is a single thread per package. + */ +u_int cores_per_package = 1; +u_int threads_per_core = 1; + int vm_create(const char *name, struct vm **retvm) { - int i; struct vm *vm; - vm_paddr_t maxaddr; - - const int BSP = 0; + struct vmspace *vmspace; /* * If vmm.ko could not be successfully initialized then don't attempt @@ -364,269 +570,587 @@ vm_create(const char *name, struct vm **retvm) if (name == NULL || strlen(name) >= VM_MAX_NAMELEN) return (EINVAL); + vmspace = VMSPACE_ALLOC(0, VM_MAXUSER_ADDRESS); + if (vmspace == NULL) + return (ENOMEM); + vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO); strcpy(vm->name, name); - vm->cookie = VMINIT(vm); + vm->vmspace = vmspace; - vm->vioapic = vioapic_init(vm); - vm->vhpet = vhpet_init(vm); - vm->vatpic = vatpic_init(vm); - vm->vatpit = vatpit_init(vm); + vm->sockets = 1; + vm->cores = cores_per_package; /* XXX backwards compatibility */ + vm->threads = threads_per_core; /* XXX backwards compatibility */ + vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ - for (i = 0; i < VM_MAXCPU; i++) { - vcpu_init(vm, i); - } - -#ifdef __FreeBSD__ - maxaddr = vmm_mem_maxaddr(); - vm->iommu = iommu_create_domain(maxaddr); -#endif + vm_init(vm, true); *retvm = vm; return (0); } -static void -vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg) +void +vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus) { - size_t len; - vm_paddr_t hpa; - void *host_domain; - -#ifdef __FreeBSD__ - host_domain = iommu_host_domain(); -#endif - - len = 0; - while (len < seg->len) { - hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE); - if (hpa == (vm_paddr_t)-1) { - panic("vm_free_mem_segs: cannot free hpa " - "associated with gpa 0x%016lx", seg->gpa + len); - } - -#ifdef __FreeBSD__ - /* - * Remove the 'gpa' to 'hpa' mapping in VMs domain. - * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'. 
- */ - iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE); - iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE); -#endif - - vmm_mem_free(hpa, PAGE_SIZE); - - len += PAGE_SIZE; - } + *sockets = vm->sockets; + *cores = vm->cores; + *threads = vm->threads; + *maxcpus = vm->maxcpus; +} -#ifdef __FreeBSD__ - /* - * Invalidate cached translations associated with 'vm->iommu' since - * we have now moved some pages from it. - */ - iommu_invalidate_tlb(vm->iommu); -#endif +uint16_t +vm_get_maxcpus(struct vm *vm) +{ + return (vm->maxcpus); +} - bzero(seg, sizeof(struct vm_memory_segment)); +int +vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus) +{ + if (maxcpus != 0) + return (EINVAL); /* XXX remove when supported */ + if ((sockets * cores * threads) > vm->maxcpus) + return (EINVAL); + /* XXX need to check sockets * cores * threads == vCPU, how? */ + vm->sockets = sockets; + vm->cores = cores; + vm->threads = threads; + vm->maxcpus = VM_MAXCPU; /* XXX temp to keep code working */ + return(0); } -void -vm_destroy(struct vm *vm) +static void +vm_cleanup(struct vm *vm, bool destroy) { + struct mem_map *mm; int i; -#ifdef __FreeBSD__ ppt_unassign_all(vm); -#endif - - for (i = 0; i < vm->num_mem_segs; i++) - vm_free_mem_seg(vm, &vm->mem_segs[i]); - vm->num_mem_segs = 0; - - for (i = 0; i < VM_MAXCPU; i++) - vcpu_cleanup(vm, i); + if (vm->iommu != NULL) + iommu_destroy_domain(vm->iommu); + if (destroy) + vrtc_cleanup(vm->vrtc); + else + vrtc_reset(vm->vrtc); + vpmtmr_cleanup(vm->vpmtmr); vatpit_cleanup(vm->vatpit); vhpet_cleanup(vm->vhpet); vatpic_cleanup(vm->vatpic); vioapic_cleanup(vm->vioapic); -#ifdef __FreeBSD__ - iommu_destroy_domain(vm->iommu); -#endif + for (i = 0; i < vm->maxcpus; i++) + vcpu_cleanup(vm, i, destroy); VMCLEANUP(vm->cookie); + /* + * System memory is removed from the guest address space only when + * the VM is destroyed. This is because the mapping remains the same + * across VM reset. + * + * Device memory can be relocated by the guest (e.g. using PCI BARs) + * so those mappings are removed on a VM reset. + */ + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (destroy || !sysmem_mapping(vm, mm)) + vm_free_memmap(vm, i); +#ifndef __FreeBSD__ + else { + /* + * We need to reset the IOMMU flag so this mapping can + * be reused when a VM is rebooted. Since the IOMMU + * domain has already been destroyed we can just reset + * the flag here. + */ + mm->flags &= ~VM_MEMMAP_F_IOMMU; + } +#endif + } + + if (destroy) { + for (i = 0; i < VM_MAX_MEMSEGS; i++) + vm_free_memseg(vm, i); + + VMSPACE_FREE(vm->vmspace); + vm->vmspace = NULL; + } +#ifndef __FreeBSD__ + else { + /* + * Clear the first memory segment (low mem), old memory contents + * could confuse the UEFI firmware. + */ + vm_clear_memseg(vm, 0); + } +#endif +} + +void +vm_destroy(struct vm *vm) +{ + vm_cleanup(vm, true); free(vm, M_VM); } +int +vm_reinit(struct vm *vm) +{ + int error; + + /* + * A virtual machine can be reset only if all vcpus are suspended. 
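 *
 * An illustrative (not verbatim) call sequence for a guest-initiated
 * reboot, assuming the usual suspend-then-reinit flow:
 *
 *	vm_suspend(vm, VM_SUSPEND_RESET);   ... vcpus drain into
 *	                                        suspended_cpus ...
 *	vm_reinit(vm);                      ... device models reset ...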
+ */ + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + vm_cleanup(vm, false); + vm_init(vm, false); + error = 0; + } else { + error = EBUSY; + } + + return (error); +} + const char * vm_name(struct vm *vm) { return (vm->name); } -#ifdef __FreeBSD__ int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa) { - const boolean_t spok = TRUE; /* superpage mappings are ok */ + vm_object_t obj; - return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE, - VM_PROT_RW, spok)); + if ((obj = vmm_mmio_alloc(vm->vmspace, gpa, len, hpa)) == NULL) + return (ENOMEM); + else + return (0); } int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len) { - const boolean_t spok = TRUE; /* superpage mappings are ok */ - return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0, - VM_PROT_NONE, spok)); + vmm_mmio_free(vm->vmspace, gpa, len); + return (0); } -#endif /* - * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise + * Return 'true' if 'gpa' is allocated in the guest address space. + * + * This function is called in the context of a running vcpu which acts as + * an implicit lock on 'vm->mem_maps[]'. */ -static boolean_t -vm_gpa_available(struct vm *vm, vm_paddr_t gpa) +bool +vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa) { + struct mem_map *mm; int i; - vm_paddr_t gpabase, gpalimit; - if (gpa & PAGE_MASK) - panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa); +#ifdef INVARIANTS + int hostcpu, state; + state = vcpu_get_state(vm, vcpuid, &hostcpu); + KASSERT(state == VCPU_RUNNING && hostcpu == curcpu, + ("%s: invalid vcpu state %d/%d", __func__, state, hostcpu)); +#endif - for (i = 0; i < vm->num_mem_segs; i++) { - gpabase = vm->mem_segs[i].gpa; - gpalimit = gpabase + vm->mem_segs[i].len; - if (gpa >= gpabase && gpa < gpalimit) - return (FALSE); + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len != 0 && gpa >= mm->gpa && gpa < mm->gpa + mm->len) + return (true); /* 'gpa' is sysmem or devmem */ } - return (TRUE); + if (ppt_is_mmio(vm, gpa)) + return (true); /* 'gpa' is pci passthru mmio */ + + return (false); } int -vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len) +vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem) { - int error, available, allocated; - struct vm_memory_segment *seg; - vm_paddr_t g, hpa; - void *host_domain; + struct mem_seg *seg; + vm_object_t obj; + +#ifndef __FreeBSD__ + extern pgcnt_t get_max_page_get(void); +#endif - const boolean_t spok = TRUE; /* superpage mappings are ok */ + if (ident < 0 || ident >= VM_MAX_MEMSEGS) + return (EINVAL); - if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0) + if (len == 0 || (len & PAGE_MASK)) return (EINVAL); - - available = allocated = 0; - g = gpa; - while (g < gpa + len) { - if (vm_gpa_available(vm, g)) - available++; - else - allocated++; - g += PAGE_SIZE; +#ifndef __FreeBSD__ + if (len > ptob(get_max_page_get())) + return (EINVAL); +#endif + + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + if (seg->len == len && seg->sysmem == sysmem) + return (EEXIST); + else + return (EINVAL); } - /* - * If there are some allocated and some available pages in the address - * range then it is an error. 
- */ - if (allocated && available) + obj = vm_object_allocate(OBJT_DEFAULT, len >> PAGE_SHIFT); + if (obj == NULL) + return (ENOMEM); + + seg->len = len; + seg->object = obj; + seg->sysmem = sysmem; + return (0); +} + +int +vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + vm_object_t *objptr) +{ + struct mem_seg *seg; + + if (ident < 0 || ident >= VM_MAX_MEMSEGS) return (EINVAL); - /* - * If the entire address range being requested has already been - * allocated then there isn't anything more to do. - */ - if (allocated && available == 0) - return (0); + seg = &vm->mem_segs[ident]; + if (len) + *len = seg->len; + if (sysmem) + *sysmem = seg->sysmem; + if (objptr) + *objptr = seg->object; + return (0); +} + +#ifndef __FreeBSD__ +static void +vm_clear_memseg(struct vm *vm, int ident) +{ + struct mem_seg *seg; - if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS) - return (E2BIG); + KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, + ("%s: invalid memseg ident %d", __func__, ident)); -#ifdef __FreeBSD__ - host_domain = iommu_host_domain(); + seg = &vm->mem_segs[ident]; + + if (seg->object != NULL) + vm_object_clear(seg->object); +} #endif - seg = &vm->mem_segs[vm->num_mem_segs]; +void +vm_free_memseg(struct vm *vm, int ident) +{ + struct mem_seg *seg; + + KASSERT(ident >= 0 && ident < VM_MAX_MEMSEGS, + ("%s: invalid memseg ident %d", __func__, ident)); + + seg = &vm->mem_segs[ident]; + if (seg->object != NULL) { + vm_object_deallocate(seg->object); + bzero(seg, sizeof(struct mem_seg)); + } +} - error = 0; - seg->gpa = gpa; - seg->len = 0; - while (seg->len < len) { - hpa = vmm_mem_alloc(PAGE_SIZE); - if (hpa == 0) { - error = ENOMEM; +int +vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t first, + size_t len, int prot, int flags) +{ + struct mem_seg *seg; + struct mem_map *m, *map; + vm_ooffset_t last; + int i, error; + + if (prot == 0 || (prot & ~(VM_PROT_ALL)) != 0) + return (EINVAL); + + if (flags & ~VM_MEMMAP_F_WIRED) + return (EINVAL); + + if (segid < 0 || segid >= VM_MAX_MEMSEGS) + return (EINVAL); + + seg = &vm->mem_segs[segid]; + if (seg->object == NULL) + return (EINVAL); + + last = first + len; + if (first < 0 || first >= last || last > seg->len) + return (EINVAL); + + if ((gpa | first | last) & PAGE_MASK) + return (EINVAL); + + map = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + m = &vm->mem_maps[i]; + if (m->len == 0) { + map = m; break; } + } - error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE, - VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok); - if (error) - break; + if (map == NULL) + return (ENOSPC); -#ifdef __FreeBSD__ - /* - * Remove the 1:1 mapping for 'hpa' from the 'host_domain'. - * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain. - */ - iommu_remove_mapping(host_domain, hpa, PAGE_SIZE); - iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE); -#endif + error = vm_map_find(&vm->vmspace->vm_map, seg->object, first, &gpa, + len, 0, VMFS_NO_SPACE, prot, prot, 0); + if (error != KERN_SUCCESS) + return (EFAULT); + + vm_object_reference(seg->object); - seg->len += PAGE_SIZE; + if ((flags & VM_MEMMAP_F_WIRED) != 0) { + error = vm_map_wire(&vm->vmspace->vm_map, gpa, gpa + len, + VM_MAP_WIRE_USER | VM_MAP_WIRE_NOHOLES); + if (error != KERN_SUCCESS) { + vm_map_remove(&vm->vmspace->vm_map, gpa, gpa + len); + return (error == KERN_RESOURCE_SHORTAGE ? 
ENOMEM : + EFAULT); + } } - if (error) { - vm_free_mem_seg(vm, seg); - return (error); + map->gpa = gpa; + map->len = len; + map->segoff = first; + map->segid = segid; + map->prot = prot; + map->flags = flags; + return (0); +} + +int +vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags) +{ + struct mem_map *mm, *mmnext; + int i; + + mmnext = NULL; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (mm->len == 0 || mm->gpa < *gpa) + continue; + if (mmnext == NULL || mm->gpa < mmnext->gpa) + mmnext = mm; + } + + if (mmnext != NULL) { + *gpa = mmnext->gpa; + if (segid) + *segid = mmnext->segid; + if (segoff) + *segoff = mmnext->segoff; + if (len) + *len = mmnext->len; + if (prot) + *prot = mmnext->prot; + if (flags) + *flags = mmnext->flags; + return (0); + } else { + return (ENOENT); + } +} + +static void +vm_free_memmap(struct vm *vm, int ident) +{ + struct mem_map *mm; + int error; + + mm = &vm->mem_maps[ident]; + if (mm->len) { + error = vm_map_remove(&vm->vmspace->vm_map, mm->gpa, + mm->gpa + mm->len); + KASSERT(error == KERN_SUCCESS, ("%s: vm_map_remove error %d", + __func__, error)); + bzero(mm, sizeof(struct mem_map)); + } +} + +static __inline bool +sysmem_mapping(struct vm *vm, struct mem_map *mm) +{ + + if (mm->len != 0 && vm->mem_segs[mm->segid].sysmem) + return (true); + else + return (false); +} + +vm_paddr_t +vmm_sysmem_maxaddr(struct vm *vm) +{ + struct mem_map *mm; + vm_paddr_t maxaddr; + int i; + + maxaddr = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm)) { + if (maxaddr < mm->gpa + mm->len) + maxaddr = mm->gpa + mm->len; + } + } + return (maxaddr); +} + +static void +vm_iommu_modify(struct vm *vm, boolean_t map) +{ + int i, sz; + vm_paddr_t gpa, hpa; + struct mem_map *mm; +#ifdef __FreeBSD__ + void *vp, *cookie, *host_domain; +#else + void *vp, *cookie, *host_domain __unused; +#endif + + sz = PAGE_SIZE; + host_domain = iommu_host_domain(); + + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (!sysmem_mapping(vm, mm)) + continue; + + if (map) { + KASSERT((mm->flags & VM_MEMMAP_F_IOMMU) == 0, + ("iommu map found invalid memmap %#lx/%#lx/%#x", + mm->gpa, mm->len, mm->flags)); + if ((mm->flags & VM_MEMMAP_F_WIRED) == 0) + continue; + mm->flags |= VM_MEMMAP_F_IOMMU; + } else { + if ((mm->flags & VM_MEMMAP_F_IOMMU) == 0) + continue; + mm->flags &= ~VM_MEMMAP_F_IOMMU; + KASSERT((mm->flags & VM_MEMMAP_F_WIRED) != 0, + ("iommu unmap found invalid memmap %#lx/%#lx/%#x", + mm->gpa, mm->len, mm->flags)); + } + + gpa = mm->gpa; + while (gpa < mm->gpa + mm->len) { + vp = vm_gpa_hold(vm, -1, gpa, PAGE_SIZE, VM_PROT_WRITE, + &cookie); + KASSERT(vp != NULL, ("vm(%s) could not map gpa %#lx", + vm_name(vm), gpa)); + + vm_gpa_release(cookie); + + hpa = DMAP_TO_PHYS((uintptr_t)vp); + if (map) { + iommu_create_mapping(vm->iommu, gpa, hpa, sz); + iommu_remove_mapping(host_domain, hpa, sz); + } else { + iommu_remove_mapping(vm->iommu, gpa, sz); + iommu_create_mapping(host_domain, hpa, hpa, sz); + } + + gpa += PAGE_SIZE; + } } -#ifdef __FreeBSD__ /* - * Invalidate cached translations associated with 'host_domain' since - * we have now moved some pages from it. + * Invalidate the cached translations associated with the domain + * from which pages were removed. 
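 *
 * In other words, the domain that lost pages is the one whose TLB must
 * be flushed, matching the code below:
 *
 *	map:	pages leave host_domain	-> invalidate host_domain
 *	unmap:	pages leave vm->iommu	-> invalidate vm->iommu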
*/ - iommu_invalidate_tlb(host_domain); -#endif + if (map) + iommu_invalidate_tlb(host_domain); + else + iommu_invalidate_tlb(vm->iommu); +} + +#define vm_iommu_unmap(vm) vm_iommu_modify((vm), FALSE) +#define vm_iommu_map(vm) vm_iommu_modify((vm), TRUE) + +int +vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func) +{ + int error; + + error = ppt_unassign_device(vm, bus, slot, func); + if (error) + return (error); - vm->num_mem_segs++; + if (ppt_assigned_devices(vm) == 0) + vm_iommu_unmap(vm); return (0); } -vm_paddr_t -vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len) +int +vm_assign_pptdev(struct vm *vm, int bus, int slot, int func) { - vm_paddr_t nextpage; + int error; + vm_paddr_t maxaddr; - nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE); - if (len > nextpage - gpa) - panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len); + /* Set up the IOMMU to do the 'gpa' to 'hpa' translation */ + if (ppt_assigned_devices(vm) == 0) { + KASSERT(vm->iommu == NULL, + ("vm_assign_pptdev: iommu must be NULL")); + maxaddr = vmm_sysmem_maxaddr(vm); + vm->iommu = iommu_create_domain(maxaddr); + if (vm->iommu == NULL) + return (ENXIO); + vm_iommu_map(vm); + } - return (VMMMAP_GET(vm->cookie, gpa)); + error = ppt_assign_device(vm, bus, slot, func); + return (error); } void * -vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, +vm_gpa_hold(struct vm *vm, int vcpuid, vm_paddr_t gpa, size_t len, int reqprot, void **cookie) { -#ifdef __FreeBSD__ - int count, pageoff; + int i, count, pageoff; + struct mem_map *mm; vm_page_t m; - +#ifdef INVARIANTS + /* + * All vcpus are frozen by ioctls that modify the memory map + * (e.g. VM_MMAP_MEMSEG). Therefore 'vm->memmap[]' stability is + * guaranteed if at least one vcpu is in the VCPU_FROZEN state. 
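 *
 * A typical caller pattern, sketched from the signatures in this file
 * (the identifiers are illustrative, not from a specific caller):
 *
 *	void *ptr, *cookie;
 *
 *	ptr = vm_gpa_hold(vm, vcpuid, gpa, len, VM_PROT_RW, &cookie);
 *	if (ptr != NULL) {
 *		... touch at most PAGE_SIZE - (gpa & PAGE_MASK) bytes ...
 *		vm_gpa_release(cookie);
 *	}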
+ */ + int state; + KASSERT(vcpuid >= -1 && vcpuid < vm->maxcpus, ("%s: invalid vcpuid %d", + __func__, vcpuid)); + for (i = 0; i < vm->maxcpus; i++) { + if (vcpuid != -1 && vcpuid != i) + continue; + state = vcpu_get_state(vm, i, NULL); + KASSERT(state == VCPU_FROZEN, ("%s: invalid vcpu state %d", + __func__, state)); + } +#endif pageoff = gpa & PAGE_MASK; if (len > PAGE_SIZE - pageoff) panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); - count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, - trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + count = 0; + for (i = 0; i < VM_MAX_MEMMAPS; i++) { + mm = &vm->mem_maps[i]; + if (sysmem_mapping(vm, mm) && gpa >= mm->gpa && + gpa < mm->gpa + mm->len) { + count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map, + trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1); + break; + } + } if (count == 1) { *cookie = m; @@ -635,54 +1159,23 @@ vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot, *cookie = NULL; return (NULL); } -#else - int pageoff; - vm_paddr_t hpa; - - pageoff = gpa & PAGE_MASK; - if (len > PAGE_SIZE - pageoff) - panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len); - - hpa = vm_gpa2hpa(vm, gpa, len); - if (hpa == (vm_paddr_t)-1) - return (NULL); - - return (hat_kpm_pfn2va(btop(hpa)) + pageoff); -#endif } void vm_gpa_release(void *cookie) { -#ifdef __FreeBSD__ vm_page_t m = cookie; vm_page_lock(m); vm_page_unhold(m); vm_page_unlock(m); -#endif -} - -int -vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, - struct vm_memory_segment *seg) -{ - int i; - - for (i = 0; i < vm->num_mem_segs; i++) { - if (gpabase == vm->mem_segs[i].gpa) { - *seg = vm->mem_segs[i]; - return (0); - } - } - return (-1); } int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) @@ -697,7 +1190,7 @@ vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val) struct vcpu *vcpu; int error; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (reg >= VM_REG_LAST) @@ -751,7 +1244,7 @@ vm_get_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) @@ -764,7 +1257,7 @@ int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (!is_segment_register(reg) && !is_descriptor_table(reg)) @@ -784,6 +1277,10 @@ restore_guest_fpustate(struct vcpu *vcpu) fpu_stop_emulating(); fpurestore(vcpu->guestfpu); + /* restore guest XCR0 if XSAVE is enabled in the host */ + if (rcr4() & CR4_XSAVE) + load_xcr(0, vcpu->guest_xcr0); + /* * The FPU is now "dirty" with the guest's state so turn on emulation * to trap any access to the FPU by the host. @@ -798,20 +1295,35 @@ save_guest_fpustate(struct vcpu *vcpu) if ((rcr0() & CR0_TS) == 0) panic("fpu emulation not enabled in host!"); + /* save guest XCR0 and restore host XCR0 */ + if (rcr4() & CR4_XSAVE) { + vcpu->guest_xcr0 = rxcr(0); + load_xcr(0, vmm_get_host_xcr0()); + } + /* save guest FPU state */ fpu_stop_emulating(); fpusave(vcpu->guestfpu); +#ifdef __FreeBSD__ fpu_start_emulating(); +#else + /* + * When the host state has been restored, we should not re-enable + * CR0.TS on illumos for eager FPU. 
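 *
 * On FreeBSD, fpu_start_emulating() sets CR0.TS so that the next host
 * FPU access traps and restores host state lazily.  With eager FPU the
 * host state is reloaded unconditionally on context switch, so setting
 * CR0.TS here would only provoke a spurious #NM trap.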
+ */ +#endif } static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle"); static int -vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, +vcpu_set_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate, bool from_idle) { + struct vcpu *vcpu; int error; + vcpu = &vm->vcpu[vcpuid]; vcpu_assert_locked(vcpu); /* @@ -820,8 +1332,17 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, * ioctl() operating on a vcpu at any point. */ if (from_idle) { - while (vcpu->state != VCPU_IDLE) + while (vcpu->state != VCPU_IDLE) { + vcpu->reqidle = 1; + vcpu_notify_event_locked(vcpu, false); + VCPU_CTR1(vm, vcpuid, "vcpu state change from %s to " + "idle requested", vcpu_state2str(vcpu->state)); +#ifdef __FreeBSD__ msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz); +#else + cv_wait(&vcpu->state_cv, &vcpu->mtx.m); +#endif + } } else { KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from " "vcpu idle state")); @@ -855,17 +1376,36 @@ vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate, break; } + if (newstate == VCPU_RUNNING) { + while (vcpu->runblock != 0) { +#ifdef __FreeBSD__ + msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); +#else + cv_wait(&vcpu->state_cv, &vcpu->mtx.m); +#endif + } + } + if (error) return (EBUSY); + VCPU_CTR2(vm, vcpuid, "vcpu state changed from %s to %s", + vcpu_state2str(vcpu->state), vcpu_state2str(newstate)); + vcpu->state = newstate; if (newstate == VCPU_RUNNING) vcpu->hostcpu = curcpu; else vcpu->hostcpu = NOCPU; - if (newstate == VCPU_IDLE) + if (newstate == VCPU_IDLE || + (newstate == VCPU_FROZEN && vcpu->runblock != 0)) { +#ifdef __FreeBSD__ wakeup(&vcpu->state); +#else + cv_broadcast(&vcpu->state_cv); +#endif + } return (0); } @@ -880,11 +1420,11 @@ vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate) } static void -vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) +vcpu_require_state_locked(struct vm *vm, int vcpuid, enum vcpu_state newstate) { int error; - if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0) + if ((error = vcpu_set_state_locked(vm, vcpuid, newstate, false)) != 0) panic("Error %d setting state to %d", error, newstate); } @@ -894,60 +1434,139 @@ vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate) static int vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu) { - struct vm_exit *vmexit; struct vcpu *vcpu; - int t, timo, spindown; +#ifdef __FreeBSD__ + const char *wmesg; +#else + const char *wmesg __unused; +#endif + int t, vcpu_halted, vm_halted; + + KASSERT(!CPU_ISSET(vcpuid, &vm->halted_cpus), ("vcpu already halted")); vcpu = &vm->vcpu[vcpuid]; - spindown = 0; + vcpu_halted = 0; + vm_halted = 0; vcpu_lock(vcpu); + while (1) { + /* + * Do a final check for pending NMI or interrupts before + * really putting this thread to sleep. Also check for + * software events that would cause this vcpu to wakeup. + * + * These interrupts/events could have happened after the + * vcpu returned from VMRUN() and before it acquired the + * vcpu lock above. + */ + if (vm->suspend || vcpu->reqidle) + break; + if (vm_nmi_pending(vm, vcpuid)) + break; + if (!intr_disabled) { + if (vm_extint_pending(vm, vcpuid) || + vlapic_pending_intr(vcpu->vlapic, NULL)) { + break; + } + } - /* - * Do a final check for pending NMI or interrupts before - * really putting this thread to sleep. - * - * These interrupts could have happened any time after we - * returned from VMRUN() and before we grabbed the vcpu lock. 
- */ - if (vm->rendezvous_func == NULL && - !vm_nmi_pending(vm, vcpuid) && - (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) { - t = ticks; - vcpu_require_state_locked(vcpu, VCPU_SLEEPING); - if (vlapic_enabled(vcpu->vlapic)) { - /* - * XXX msleep_spin() is not interruptible so use the - * 'timo' to put an upper bound on the sleep time. - */ - timo = hz; - msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo); + /* Don't go to sleep if the vcpu thread needs to yield */ + if (vcpu_should_yield(vm, vcpuid)) + break; + + if (vcpu_debugged(vm, vcpuid)) + break; + + /* + * Some Linux guests implement "halt" by having all vcpus + * execute HLT with interrupts disabled. 'halted_cpus' keeps + * track of the vcpus that have entered this state. When all + * vcpus enter the halted state the virtual machine is halted. + */ + if (intr_disabled) { + wmesg = "vmhalt"; + VCPU_CTR0(vm, vcpuid, "Halted"); + if (!vcpu_halted && halt_detection_enabled) { + vcpu_halted = 1; + CPU_SET_ATOMIC(vcpuid, &vm->halted_cpus); + } + if (CPU_CMP(&vm->halted_cpus, &vm->active_cpus) == 0) { + vm_halted = 1; + break; + } } else { - /* - * Spindown the vcpu if the apic is disabled and it - * had entered the halted state. - */ - spindown = 1; + wmesg = "vmidle"; } - vcpu_require_state_locked(vcpu, VCPU_FROZEN); + + t = ticks; + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); +#ifdef __FreeBSD__ + /* + * XXX msleep_spin() cannot be interrupted by signals so + * wake up periodically to check pending signals. + */ + msleep_spin(vcpu, &vcpu->mtx, wmesg, hz); +#else + /* + * Fortunately, cv_wait_sig can be interrupted by signals, so + * there is no need to periodically wake up. + */ + (void) cv_wait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m); +#endif + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t); } + + if (vcpu_halted) + CPU_CLR_ATOMIC(vcpuid, &vm->halted_cpus); + vcpu_unlock(vcpu); -#ifdef __FreeBSD__ - /* - * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it - * outside the confines of the vcpu spinlock. - */ - if (spindown) { - *retu = true; - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU; - vm_deactivate_cpu(vm, vcpuid); - VCPU_CTR0(vm, vcpuid, "spinning down cpu"); + if (vm_halted) + vm_suspend(vm, VM_SUSPEND_HALT); + + return (0); +} + +static int +vm_handle_paging(struct vm *vm, int vcpuid, bool *retu) +{ + int rv, ftype; + struct vm_map *map; + struct vcpu *vcpu; + struct vm_exit *vme; + + vcpu = &vm->vcpu[vcpuid]; + vme = &vcpu->exitinfo; + + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + + ftype = vme->u.paging.fault_type; + KASSERT(ftype == VM_PROT_READ || + ftype == VM_PROT_WRITE || ftype == VM_PROT_EXECUTE, + ("vm_handle_paging: invalid fault_type %d", ftype)); + + if (ftype == VM_PROT_READ || ftype == VM_PROT_WRITE) { + rv = pmap_emulate_accessed_dirty(vmspace_pmap(vm->vmspace), + vme->u.paging.gpa, ftype); + if (rv == 0) { + VCPU_CTR2(vm, vcpuid, "%s bit emulation for gpa %#lx", + ftype == VM_PROT_READ ? 
"accessed" : "dirty", + vme->u.paging.gpa); + goto done; + } } -#endif + map = &vm->vmspace->vm_map; + rv = vm_fault(map, vme->u.paging.gpa, ftype, VM_FAULT_NORMAL); + + VCPU_CTR3(vm, vcpuid, "vm_handle_paging rv = %d, gpa = %#lx, " + "ftype = %d", rv, vme->u.paging.gpa, ftype); + + if (rv != KERN_SUCCESS) + return (EFAULT); +done: return (0); } @@ -962,11 +1581,14 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) mem_region_read_t mread; mem_region_write_t mwrite; enum vm_cpu_mode cpu_mode; - int cs_d, error, length; + int cs_d, error, fault; vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; + KASSERT(vme->inst_length == 0, ("%s: invalid inst_length %d", + __func__, vme->inst_length)); + gla = vme->u.inst_emul.gla; gpa = vme->u.inst_emul.gpa; cs_base = vme->u.inst_emul.cs_base; @@ -979,37 +1601,31 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) /* Fetch, decode and emulate the faulting instruction */ if (vie->num_valid == 0) { - /* - * If the instruction length is not known then assume a - * maximum size instruction. - */ - length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE; error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip + - cs_base, length, vie); + cs_base, VIE_INST_SIZE, vie, &fault); } else { /* * The instruction bytes have already been copied into 'vie' */ - error = 0; + error = fault = 0; } - if (error == 1) - return (0); /* Resume guest to handle page fault */ - else if (error == -1) - return (EFAULT); - else if (error != 0) - panic("%s: vmm_fetch_instruction error %d", __func__, error); + if (error || fault) + return (error); - if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) - return (EFAULT); + if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0) { + VCPU_CTR1(vm, vcpuid, "Error decoding instruction at %#lx", + vme->rip + cs_base); + *retu = true; /* dump instruction bytes in userspace */ + return (0); + } /* - * If the instruction length was not specified then update it now - * along with 'nextrip'. + * Update 'nextrip' based on the length of the emulated instruction. */ - if (vme->inst_length == 0) { - vme->inst_length = vie->num_processed; - vcpu->nextrip += vie->num_processed; - } + vme->inst_length = vie->num_processed; + vcpu->nextrip += vie->num_processed; + VCPU_CTR1(vm, vcpuid, "nextrip updated to %#lx after instruction " + "decoding", vcpu->nextrip); /* return to userland unless this is an in-kernel emulated device */ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) { @@ -1032,47 +1648,394 @@ vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu) return (error); } +static int +vm_handle_suspend(struct vm *vm, int vcpuid, bool *retu) +{ +#ifdef __FreeBSD__ + int i, done; + struct vcpu *vcpu; + + done = 0; +#else + int i; + struct vcpu *vcpu; +#endif + vcpu = &vm->vcpu[vcpuid]; + + CPU_SET_ATOMIC(vcpuid, &vm->suspended_cpus); + + /* + * Wait until all 'active_cpus' have suspended themselves. + */ + vcpu_lock(vcpu); + while (1) { + if (CPU_CMP(&vm->suspended_cpus, &vm->active_cpus) == 0) { + VCPU_CTR0(vm, vcpuid, "All vcpus suspended"); + break; + } + + VCPU_CTR0(vm, vcpuid, "Sleeping during suspend"); + vcpu_require_state_locked(vm, vcpuid, VCPU_SLEEPING); +#ifdef __FreeBSD__ + msleep_spin(vcpu, &vcpu->mtx, "vmsusp", hz); +#else + /* + * To prevent vm_handle_suspend from becoming stuck in the + * kernel if the bhyve process driving its vCPUs is killed, + * offer a bail-out, even though not all the vCPUs have reached + * the suspended state. 
+ */ + if (cv_reltimedwait_sig(&vcpu->vcpu_cv, &vcpu->mtx.m, + hz, TR_CLOCK_TICK) <= 0) { + if ((curproc->p_flag & SEXITING) != 0) { + vcpu_require_state_locked(vm, vcpuid, + VCPU_FROZEN); + break; + } + } +#endif + vcpu_require_state_locked(vm, vcpuid, VCPU_FROZEN); + } + vcpu_unlock(vcpu); + + /* + * Wakeup the other sleeping vcpus and return to userspace. + */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->suspended_cpus)) { + vcpu_notify_event(vm, i, false); + } + } + + *retu = true; + return (0); +} + +static int +vm_handle_reqidle(struct vm *vm, int vcpuid, bool *retu) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + KASSERT(vcpu->reqidle, ("invalid vcpu reqidle %d", vcpu->reqidle)); + vcpu->reqidle = 0; + vcpu_unlock(vcpu); + *retu = true; + return (0); +} + +#ifndef __FreeBSD__ +static int +vm_handle_wrmsr(struct vm *vm, int vcpuid, struct vm_exit *vme) +{ + struct vcpu *cpu = &vm->vcpu[vcpuid]; + const uint32_t code = vme->u.msr.code; + const uint64_t val = vme->u.msr.wval; + + switch (code) { + case MSR_TSC: + cpu->tsc_offset = val - rdtsc(); + return (0); + } + + return (-1); +} +#endif /* __FreeBSD__ */ + +int +vm_suspend(struct vm *vm, enum vm_suspend_how how) +{ + int i; + + if (how <= VM_SUSPEND_NONE || how >= VM_SUSPEND_LAST) + return (EINVAL); + + if (atomic_cmpset_int((uint_t *)&vm->suspend, 0, how) == 0) { + VM_CTR2(vm, "virtual machine already suspended %d/%d", + vm->suspend, how); + return (EALREADY); + } + + VM_CTR1(vm, "virtual machine successfully suspended %d", how); + + /* + * Notify all active vcpus that they are now suspended. + */ + for (i = 0; i < vm->maxcpus; i++) { + if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm, i, false); + } + + return (0); +} + +void +vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + KASSERT(vm->suspend > VM_SUSPEND_NONE && vm->suspend < VM_SUSPEND_LAST, + ("vm_exit_suspended: invalid suspend type %d", vm->suspend)); + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_SUSPENDED; + vmexit->u.suspended.how = vm->suspend; +} + +void +vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_DEBUG; +} + +void +vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_RUNBLOCK; + vmm_stat_incr(vm, vcpuid, VMEXIT_RUNBLOCK, 1); +} + +void +vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_REQIDLE; + vmm_stat_incr(vm, vcpuid, VMEXIT_REQIDLE, 1); +} + +void +vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip) +{ + struct vm_exit *vmexit; + + vmexit = vm_exitinfo(vm, vcpuid); + vmexit->rip = rip; + vmexit->inst_length = 0; + vmexit->exitcode = VM_EXITCODE_BOGUS; + vmm_stat_incr(vm, vcpuid, VMEXIT_ASTPENDING, 1); +} + +#ifndef __FreeBSD__ +/* + * Some vmm resources, such as the lapic, may have CPU-specific resources + * allocated to them which would benefit from migration onto the host CPU which + * is processing the vcpu state. 
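 *
 * This runs from vm_run() after thread_affinity_set() has pinned the
 * thread (so curcpu is stable) and before critical_enter(), since
 * localizing cyclic resources requires cpu_lock.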
+ */ +static void +vm_localize_resources(struct vm *vm, struct vcpu *vcpu) +{ + /* + * Localizing cyclic resources requires acquisition of cpu_lock, and + * doing so with kpreempt disabled is a recipe for deadlock disaster. + */ + VERIFY(curthread->t_preempt == 0); + + /* + * Do not bother with localization if this vCPU is about to return to + * the host CPU it was last localized to. + */ + if (vcpu->lastloccpu == curcpu) + return; + + /* + * Localize system-wide resources to the primary boot vCPU. While any + * of the other vCPUs may access them, it keeps the potential interrupt + * footprint constrained to CPUs involved with this instance. + */ + if (vcpu == &vm->vcpu[0]) { + vhpet_localize_resources(vm->vhpet); + vrtc_localize_resources(vm->vrtc); + vatpit_localize_resources(vm->vatpit); + } + + vlapic_localize_resources(vcpu->vlapic); + + vcpu->lastloccpu = curcpu; +} + +static void +vmm_savectx(void *arg) +{ + vm_thread_ctx_t *vtc = arg; + struct vm *vm = vtc->vtc_vm; + const int vcpuid = vtc->vtc_vcpuid; + + if (ops->vmsavectx != NULL) { + ops->vmsavectx(vm->cookie, vcpuid); + } + + /* + * If the CPU holds the restored guest FPU state, save it and restore + * the host FPU state before this thread goes off-cpu. + */ + if ((vtc->vtc_status & VTCS_FPU_RESTORED) != 0) { + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + save_guest_fpustate(vcpu); + vtc->vtc_status &= ~VTCS_FPU_RESTORED; + } +} + +static void +vmm_restorectx(void *arg) +{ + vm_thread_ctx_t *vtc = arg; + struct vm *vm = vtc->vtc_vm; + const int vcpuid = vtc->vtc_vcpuid; + + /* + * When coming back on-cpu, only restore the guest FPU status if the + * thread is in a context marked as requiring it. This should be rare, + * occurring only when a future logic error results in a voluntary + * sleep during the VMRUN critical section. + * + * The common case will result in elision of the guest FPU state + * restoration, deferring that action until it is clearly necessary + * during vm_run. + */ + VERIFY((vtc->vtc_status & VTCS_FPU_RESTORED) == 0); + if ((vtc->vtc_status & VTCS_FPU_CTX_CRITICAL) != 0) { + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + restore_guest_fpustate(vcpu); + vtc->vtc_status |= VTCS_FPU_RESTORED; + } + + if (ops->vmrestorectx != NULL) { + ops->vmrestorectx(vm->cookie, vcpuid); + } + +} + +/* + * If we're in removectx(), we might still have state to tidy up. 
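 *
 * Reusing the save path is sufficient: vmm_savectx() parks the guest
 * FPU state and invokes the backend vmsavectx hook, which covers the
 * cleanup this departing context requires.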
+ */ +static void +vmm_freectx(void *arg, int isexec) +{ + vmm_savectx(arg); +} + +#endif /* __FreeBSD */ + int vm_run(struct vm *vm, struct vm_run *vmrun) { + struct vm_eventinfo evinfo; int error, vcpuid; struct vcpu *vcpu; +#ifdef __FreeBSD__ struct pcb *pcb; +#endif uint64_t tscval; struct vm_exit *vme; bool retu, intr_disabled; + pmap_t pmap; +#ifndef __FreeBSD__ + vm_thread_ctx_t vtc; + int affinity_type = CPU_CURRENT; +#endif vcpuid = vmrun->cpuid; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + if (CPU_ISSET(vcpuid, &vm->suspended_cpus)) return (EINVAL); + pmap = vmspace_pmap(vm->vmspace); vcpu = &vm->vcpu[vcpuid]; vme = &vcpu->exitinfo; + evinfo.rptr = &vcpu->runblock; + evinfo.sptr = &vm->suspend; + evinfo.iptr = &vcpu->reqidle; + +#ifndef __FreeBSD__ + vtc.vtc_vm = vm; + vtc.vtc_vcpuid = vcpuid; + vtc.vtc_status = 0; + + installctx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL, + NULL, vmm_freectx); +#endif + restart: +#ifndef __FreeBSD__ + thread_affinity_set(curthread, affinity_type); + /* + * Resource localization should happen after the CPU affinity for the + * thread has been set to ensure that access from restricted contexts, + * such as VMX-accelerated APIC operations, can occur without inducing + * cyclic cross-calls. + * + * This must be done prior to disabling kpreempt via critical_enter(). + */ + vm_localize_resources(vm, vcpu); + + affinity_type = CPU_CURRENT; +#endif + critical_enter(); + KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active), + ("vm_run: absurd pm_active")); + tscval = rdtsc(); #ifdef __FreeBSD__ pcb = PCPU_GET(curpcb); set_pcb_flags(pcb, PCB_FULL_IRET); +#else + /* Force a trip through update_sregs to reload %fs/%gs and friends */ + PCB_SET_UPDATE_SEGS(&ttolwp(curthread)->lwp_pcb); #endif -#ifndef __FreeBSD__ - installctx(curthread, vcpu, save_guest_fpustate, - restore_guest_fpustate, NULL, NULL, NULL, NULL); -#endif +#ifdef __FreeBSD__ restore_guest_fpustate(vcpu); +#else + if ((vtc.vtc_status & VTCS_FPU_RESTORED) == 0) { + restore_guest_fpustate(vcpu); + vtc.vtc_status |= VTCS_FPU_RESTORED; + } + vtc.vtc_status |= VTCS_FPU_CTX_CRITICAL; +#endif vcpu_require_state(vm, vcpuid, VCPU_RUNNING); - error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip); + error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip, pmap, &evinfo); vcpu_require_state(vm, vcpuid, VCPU_FROZEN); +#ifdef __FreeBSD__ save_guest_fpustate(vcpu); +#else + vtc.vtc_status &= ~VTCS_FPU_CTX_CRITICAL; +#endif + #ifndef __FreeBSD__ - removectx(curthread, vcpu, save_guest_fpustate, - restore_guest_fpustate, NULL, NULL, NULL, NULL); + /* + * Once clear of the delicate contexts comprising the VM_RUN handler, + * thread CPU affinity can be loosened while other processing occurs. 
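 *
 * A subsequent trip through the restart: label re-pins the thread via
 * thread_affinity_set(); the VM_EXITCODE_HT case below first weakens
 * the request from CPU_CURRENT to CPU_BEST.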
+ */ + thread_affinity_clear(curthread); #endif vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval); @@ -1083,10 +2046,25 @@ restart: retu = false; vcpu->nextrip = vme->rip + vme->inst_length; switch (vme->exitcode) { + case VM_EXITCODE_REQIDLE: + error = vm_handle_reqidle(vm, vcpuid, &retu); + break; + case VM_EXITCODE_SUSPENDED: + error = vm_handle_suspend(vm, vcpuid, &retu); + break; + case VM_EXITCODE_IOAPIC_EOI: + vioapic_process_eoi(vm, vcpuid, + vme->u.ioapic_eoi.vector); + break; + case VM_EXITCODE_RUNBLOCK: + break; case VM_EXITCODE_HLT: intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0); error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu); break; + case VM_EXITCODE_PAGING: + error = vm_handle_paging(vm, vcpuid, &retu); + break; case VM_EXITCODE_INST_EMUL: error = vm_handle_inst_emul(vm, vcpuid, &retu); break; @@ -1094,18 +2072,42 @@ restart: case VM_EXITCODE_INOUT_STR: error = vm_handle_inout(vm, vcpuid, vme, &retu); break; + case VM_EXITCODE_MONITOR: + case VM_EXITCODE_MWAIT: + case VM_EXITCODE_VMINSN: + vm_inject_ud(vm, vcpuid); + break; +#ifndef __FreeBSD__ + case VM_EXITCODE_WRMSR: + if (vm_handle_wrmsr(vm, vcpuid, vme) != 0) { + retu = true; + } + break; + + case VM_EXITCODE_HT: { + affinity_type = CPU_BEST; + break; + } + +#endif default: retu = true; /* handled in userland */ break; } } - if (error == 0 && retu == false) { + if (error == 0 && retu == false) goto restart; - } + +#ifndef __FreeBSD__ + removectx(curthread, &vtc, vmm_savectx, vmm_restorectx, NULL, NULL, + NULL, vmm_freectx); +#endif + + VCPU_CTR2(vm, vcpuid, "retu %d/%d", error, vme->exitcode); /* copy the exit information */ - bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit)); + bcopy(vme, &vmrun->vm_exit, sizeof (struct vm_exit)); return (error); } @@ -1119,7 +2121,7 @@ vm_restart_instruction(void *arg, int vcpuid) int error; vm = arg; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; @@ -1158,7 +2160,7 @@ vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info) struct vcpu *vcpu; int type, vector; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; @@ -1262,9 +2264,7 @@ nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2, if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) { VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)", info1, info2); -#ifdef __FreeBSD__ vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT); -#endif *retinfo = 0; return (0); } @@ -1293,11 +2293,11 @@ vcpu_exception_intinfo(struct vcpu *vcpu) uint64_t info = 0; if (vcpu->exception_pending) { - info = vcpu->exception.vector & 0xff; + info = vcpu->exc_vector & 0xff; info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION; - if (vcpu->exception.error_code_valid) { + if (vcpu->exc_errcode_valid) { info |= VM_INTINFO_DEL_ERRCODE; - info |= (uint64_t)vcpu->exception.error_code << 32; + info |= (uint64_t)vcpu->exc_errcode << 32; } } return (info); @@ -1310,7 +2310,8 @@ vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) uint64_t info1, info2; int valid; - KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + KASSERT(vcpuid >= 0 && + vcpuid < vm->maxcpus, ("invalid vcpu %d", vcpuid)); vcpu = &vm->vcpu[vcpuid]; @@ -1322,7 +2323,7 @@ vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) info2 = vcpu_exception_intinfo(vcpu); vcpu->exception_pending = 0; VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx", - 
vcpu->exception.vector, info2); + vcpu->exc_vector, info2); } if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) { @@ -1346,76 +2347,93 @@ vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo) } int -vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception) +vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); - if (exception->vector < 0 || exception->vector >= 32) + vcpu = &vm->vcpu[vcpuid]; + *info1 = vcpu->exitintinfo; + *info2 = vcpu_exception_intinfo(vcpu); + return (0); +} + +int +vm_inject_exception(struct vm *vm, int vcpuid, int vector, int errcode_valid, + uint32_t errcode, int restart_instruction) +{ + struct vcpu *vcpu; + uint64_t regval; + int error; + + if (vcpuid < 0 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vector < 0 || vector >= 32) + return (EINVAL); + + /* + * A double fault exception should never be injected directly into + * the guest. It is a derived exception that results from specific + * combinations of nested faults. + */ + if (vector == IDT_DF) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; if (vcpu->exception_pending) { VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to " - "pending exception %d", exception->vector, - vcpu->exception.vector); + "pending exception %d", vector, vcpu->exc_vector); return (EBUSY); } - vcpu->exception_pending = 1; - vcpu->exception = *exception; - VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector); - return (0); -} + if (errcode_valid) { + /* + * Exceptions don't deliver an error code in real mode. + */ + error = vm_get_register(vm, vcpuid, VM_REG_GUEST_CR0, ®val); + KASSERT(!error, ("%s: error %d getting CR0", __func__, error)); + if (!(regval & CR0_PE)) + errcode_valid = 0; + } -int -vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception) -{ - struct vcpu *vcpu; - int pending; + /* + * From section 26.6.1 "Interruptibility State" in Intel SDM: + * + * Event blocking by "STI" or "MOV SS" is cleared after guest executes + * one instruction or incurs an exception. 
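Summarizing the guard clauses of the reworked vm_inject_exception(): vectors outside 0-31 and #DF are rejected, the error code is silently dropped when the guest is in real mode (CR0.PE clear), the STI/MOV-SS interrupt shadow is cleared, and the faulting instruction is optionally restarted. A hypothetical caller injecting #GP(0), which is essentially what vm_inject_fault() below boils down to:

        int error;

        /* IDT_GP is vector 13; error code 0; restart the instruction. */
        error = vm_inject_exception(vm, vcpuid, IDT_GP,
            1 /* errcode_valid */, 0 /* errcode */,
            1 /* restart_instruction */);
        KASSERT(error == 0, ("unexpected inject failure %d", error));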
+ */ + error = vm_set_register(vm, vcpuid, VM_REG_GUEST_INTR_SHADOW, 0); + KASSERT(error == 0, ("%s: error %d clearing interrupt shadow", + __func__, error)); - KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid)); + if (restart_instruction) + vm_restart_instruction(vm, vcpuid); - vcpu = &vm->vcpu[vcpuid]; - pending = vcpu->exception_pending; - if (pending) { - vcpu->exception_pending = 0; - *exception = vcpu->exception; - VCPU_CTR1(vm, vcpuid, "Exception %d delivered", - exception->vector); - } - return (pending); + vcpu->exception_pending = 1; + vcpu->exc_vector = vector; + vcpu->exc_errcode = errcode; + vcpu->exc_errcode_valid = errcode_valid; + VCPU_CTR1(vm, vcpuid, "Exception %d pending", vector); + return (0); } void vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid, int errcode) { - struct vm_exception exception; - struct vm_exit *vmexit; struct vm *vm; - int error; + int error, restart_instruction; vm = vmarg; + restart_instruction = 1; - exception.vector = vector; - exception.error_code = errcode; - exception.error_code_valid = errcode_valid; - error = vm_inject_exception(vm, vcpuid, &exception); + error = vm_inject_exception(vm, vcpuid, vector, errcode_valid, + errcode, restart_instruction); KASSERT(error == 0, ("vm_inject_exception error %d", error)); - - /* - * A fault-like exception allows the instruction to be restarted - * after the exception handler returns. - * - * By setting the inst_length to 0 we ensure that the instruction - * pointer remains at the faulting instruction. - */ - vmexit = vm_exitinfo(vm, vcpuid); - vmexit->inst_length = 0; } void @@ -1441,14 +2459,13 @@ vm_inject_nmi(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->nmi_pending = 1; vcpu_notify_event(vm, vcpuid, false); - return (0); } @@ -1457,7 +2474,7 @@ vm_nmi_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1470,7 +2487,7 @@ vm_nmi_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_nmi_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1489,14 +2506,13 @@ vm_inject_extint(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); vcpu = &vm->vcpu[vcpuid]; vcpu->extint_pending = 1; vcpu_notify_event(vm, vcpuid, false); - return (0); } @@ -1505,7 +2521,7 @@ vm_extint_pending(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1518,7 +2534,7 @@ vm_extint_clear(struct vm *vm, int vcpuid) { struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_extint_pending: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1533,7 +2549,7 @@ vm_extint_clear(struct vm *vm, int vcpuid) int vm_get_capability(struct vm *vm, int vcpu, int type, int *retval) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) @@ -1545,7 +2561,7 @@ vm_get_capability(struct vm *vm, int vcpu, int type, 
int *retval) int vm_set_capability(struct vm *vm, int vcpu, int type, int val) { - if (vcpu < 0 || vcpu >= VM_MAXCPU) + if (vcpu < 0 || vcpu >= vm->maxcpus) return (EINVAL); if (type < 0 || type >= VM_CAP_MAX) @@ -1554,22 +2570,24 @@ vm_set_capability(struct vm *vm, int vcpu, int type, int val) return (VMSETCAP(vm->cookie, vcpu, type, val)); } -struct vhpet * -vm_hpet(struct vm *vm) +struct vlapic * +vm_lapic(struct vm *vm, int cpu) { - return (vm->vhpet); + return (vm->vcpu[cpu].vlapic); } struct vioapic * vm_ioapic(struct vm *vm) { + return (vm->vioapic); } -struct vlapic * -vm_lapic(struct vm *vm, int cpu) +struct vhpet * +vm_hpet(struct vm *vm) { - return (vm->vcpu[cpu].vlapic); + + return (vm->vhpet); } #ifdef __FreeBSD__ @@ -1594,7 +2612,7 @@ vmm_is_pptdev(int bus, int slot, int func) /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */ found = 0; for (i = 0; names[i] != NULL && !found; i++) { - cp = val = getenv(names[i]); + cp = val = kern_getenv(names[i]); while (cp != NULL && *cp != '\0') { if ((cp2 = strchr(cp, ' ')) != NULL) *cp2 = '\0'; @@ -1630,13 +2648,13 @@ vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate, int error; struct vcpu *vcpu; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_set_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; vcpu_lock(vcpu); - error = vcpu_set_state_locked(vcpu, newstate, from_idle); + error = vcpu_set_state_locked(vm, vcpuid, newstate, from_idle); vcpu_unlock(vcpu); return (error); @@ -1648,7 +2666,7 @@ vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) struct vcpu *vcpu; enum vcpu_state state; - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) panic("vm_get_run_state: invalid vcpuid %d", vcpuid); vcpu = &vm->vcpu[vcpuid]; @@ -1662,11 +2680,67 @@ vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu) return (state); } +void +vcpu_block_run(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vcpu_block_run: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu->runblock++; + if (vcpu->runblock == 1 && vcpu->state == VCPU_RUNNING) { + vcpu_notify_event_locked(vcpu, false); + } + while (vcpu->state == VCPU_RUNNING) { +#ifdef __FreeBSD__ + msleep_spin(&vcpu->state, &vcpu->mtx, "vcpublk", 0); +#else + cv_wait(&vcpu->state_cv, &vcpu->mtx.m); +#endif + } + vcpu_unlock(vcpu); +} + +void +vcpu_unblock_run(struct vm *vm, int vcpuid) +{ + struct vcpu *vcpu; + + if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + panic("vcpu_block_run: invalid vcpuid %d", vcpuid); + + vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + KASSERT(vcpu->runblock != 0, ("expected non-zero runblock")); + vcpu->runblock--; + if (vcpu->runblock == 0) { +#ifdef __FreeBSD__ + wakeup(&vcpu->state); +#else + cv_broadcast(&vcpu->state_cv); +#endif + } + vcpu_unlock(vcpu); +} + +#ifndef __FreeBSD__ +uint64_t +vcpu_tsc_offset(struct vm *vm, int vcpuid) +{ + return (vm->vcpu[vcpuid].tsc_offset); +} +#endif /* __FreeBSD__ */ + int vm_activate_cpu(struct vm *vm, int vcpuid) { - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (CPU_ISSET(vcpuid, &vm->active_cpus)) @@ -1677,6 +2751,55 @@ vm_activate_cpu(struct vm *vm, int vcpuid) return (0); } +int +vm_suspend_cpu(struct vm *vm, int vcpuid) +{ + int i; + + if (vcpuid < -1 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vcpuid == -1) { + vm->debug_cpus = vm->active_cpus; + for (i = 0; i < vm->maxcpus; i++) { 
+ if (CPU_ISSET(i, &vm->active_cpus)) + vcpu_notify_event(vm, i, false); + } + } else { + if (!CPU_ISSET(vcpuid, &vm->active_cpus)) + return (EINVAL); + + CPU_SET_ATOMIC(vcpuid, &vm->debug_cpus); + vcpu_notify_event(vm, vcpuid, false); + } + return (0); +} + +int +vm_resume_cpu(struct vm *vm, int vcpuid) +{ + + if (vcpuid < -1 || vcpuid >= vm->maxcpus) + return (EINVAL); + + if (vcpuid == -1) { + CPU_ZERO(&vm->debug_cpus); + } else { + if (!CPU_ISSET(vcpuid, &vm->debug_cpus)) + return (EINVAL); + + CPU_CLR_ATOMIC(vcpuid, &vm->debug_cpus); + } + return (0); +} + +int +vcpu_debugged(struct vm *vm, int vcpuid) +{ + + return (CPU_ISSET(vcpuid, &vm->debug_cpus)); +} + cpuset_t vm_active_cpus(struct vm *vm) { @@ -1684,6 +2807,20 @@ vm_active_cpus(struct vm *vm) return (vm->active_cpus); } +cpuset_t +vm_debug_cpus(struct vm *vm) +{ + + return (vm->debug_cpus); +} + +cpuset_t +vm_suspended_cpus(struct vm *vm) +{ + + return (vm->suspended_cpus); +} + void * vcpu_stats(struct vm *vm, int vcpuid) { @@ -1694,7 +2831,7 @@ vcpu_stats(struct vm *vm, int vcpuid) int vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) { - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); *state = vm->vcpu[vcpuid].x2apic_state; @@ -1705,7 +2842,7 @@ vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state) int vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) { - if (vcpuid < 0 || vcpuid >= VM_MAXCPU) + if (vcpuid < 0 || vcpuid >= vm->maxcpus) return (EINVAL); if (state >= X2APIC_STATE_LAST) @@ -1725,15 +2862,11 @@ vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state) * - If the vcpu is running on a different host_cpu then an IPI will be directed * to the host_cpu to cause the vcpu to trap into the hypervisor. 
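Note the broadcast convention: vcpuid == -1 applies vm_suspend_cpu()/vm_resume_cpu() to every active vcpu, which is what a debugger front end wants, and the run loop observes the parked state through vcpu_debugged(). A hypothetical caller (error handling elided):

        /* Park all active vcpus while the debugger inspects guest state. */
        (void) vm_suspend_cpu(vm, -1);

        /*
         * The run loop consults vcpu_debugged() and holds each vcpu out
         * of VMRUN while it remains in the debug set.
         */

        /* Release everything once the debugger detaches. */
        (void) vm_resume_cpu(vm, -1);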
*/ -void -vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) +static void +vcpu_notify_event_locked(struct vcpu *vcpu, bool lapic_intr) { int hostcpu; - struct vcpu *vcpu; - vcpu = &vm->vcpu[vcpuid]; - - vcpu_lock(vcpu); hostcpu = vcpu->hostcpu; if (vcpu->state == VCPU_RUNNING) { KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu")); @@ -1755,12 +2888,33 @@ vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) } else { KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent " "with hostcpu %d", vcpu->state, hostcpu)); - if (vcpu->state == VCPU_SLEEPING) + if (vcpu->state == VCPU_SLEEPING) { +#ifdef __FreeBSD__ wakeup_one(vcpu); +#else + cv_signal(&vcpu->vcpu_cv); +#endif + } } +} + +void +vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr) +{ + struct vcpu *vcpu = &vm->vcpu[vcpuid]; + + vcpu_lock(vcpu); + vcpu_notify_event_locked(vcpu, lapic_intr); vcpu_unlock(vcpu); } +struct vmspace * +vm_get_vmspace(struct vm *vm) +{ + + return (vm->vmspace); +} + int vm_apicid2vcpuid(struct vm *vm, int apicid) { @@ -1782,6 +2936,20 @@ vm_atpit(struct vm *vm) return (vm->vatpit); } +struct vpmtmr * +vm_pmtmr(struct vm *vm) +{ + + return (vm->vpmtmr); +} + +struct vrtc * +vm_rtc(struct vm *vm) +{ + + return (vm->vrtc); +} + enum vm_reg_name vm_segment_name(int seg) { @@ -1805,19 +2973,17 @@ vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, { int idx; -#ifdef __FreeBSD__ for (idx = 0; idx < num_copyinfo; idx++) { if (copyinfo[idx].cookie != NULL) vm_gpa_release(copyinfo[idx].cookie); } -#endif bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo)); } int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, - int num_copyinfo) + int num_copyinfo, int *fault) { int error, idx, nused; size_t n, off, remaining; @@ -1830,8 +2996,8 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, remaining = len; while (remaining > 0) { KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo")); - error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa); - if (error) + error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa, fault); + if (error || *fault) return (error); off = gpa & PAGE_MASK; n = min(remaining, PAGE_SIZE - off); @@ -1843,8 +3009,8 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, } for (idx = 0; idx < nused; idx++) { - hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len, - prot, &cookie); + hva = vm_gpa_hold(vm, vcpuid, copyinfo[idx].gpa, + copyinfo[idx].len, prot, &cookie); if (hva == NULL) break; copyinfo[idx].hva = hva; @@ -1853,8 +3019,9 @@ vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, if (idx != nused) { vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo); - return (-1); + return (EFAULT); } else { + *fault = 0; return (0); } } @@ -1892,3 +3059,125 @@ vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, idx++; } } + +/* + * Return the amount of in-use and wired memory for the VM. 
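Throughout this patch the old tri-state returns (-1/0/1) give way to a two-channel convention: the return value carries a host-side error (0 or EFAULT) while a separate *fault flag means "an exception was injected, resume the guest". The resulting caller pattern, as used by the emulation paths (a sketch; surrounding declarations are assumed):

        struct vm_copyinfo copyinfo[2];
        int error, fault;

        error = vm_copy_setup(vm, vcpuid, paging, gla, len, PROT_READ,
            copyinfo, nitems(copyinfo), &fault);
        if (error != 0)
                return (error);         /* host-side failure (EFAULT) */
        if (fault)
                return (0);             /* exception injected; re-enter guest */

        /* ... operate on the held pages ... */
        vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));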
Since + * these are global stats, only return the values with for vCPU 0 + */ +VMM_STAT_DECLARE(VMM_MEM_RESIDENT); +VMM_STAT_DECLARE(VMM_MEM_WIRED); + +static void +vm_get_rescnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) +{ + + if (vcpu == 0) { + vmm_stat_set(vm, vcpu, VMM_MEM_RESIDENT, + PAGE_SIZE * vmspace_resident_count(vm->vmspace)); + } +} + +static void +vm_get_wiredcnt(struct vm *vm, int vcpu, struct vmm_stat_type *stat) +{ + + if (vcpu == 0) { + vmm_stat_set(vm, vcpu, VMM_MEM_WIRED, + PAGE_SIZE * pmap_wired_count(vmspace_pmap(vm->vmspace))); + } +} + +VMM_STAT_FUNC(VMM_MEM_RESIDENT, "Resident memory", vm_get_rescnt); +VMM_STAT_FUNC(VMM_MEM_WIRED, "Wired memory", vm_get_wiredcnt); + +#ifndef __FreeBSD__ +int +vm_ioport_hook(struct vm *vm, uint_t ioport, vmm_rmem_cb_t rfunc, + vmm_wmem_cb_t wfunc, void *arg, void **cookie) +{ + list_t *ih = &vm->ioport_hooks; + vm_ioport_hook_t *hook, *node; + + if (ioport == 0) { + return (EINVAL); + } + + /* + * Find the node position in the list which this region should be + * inserted behind to maintain sorted order. + */ + for (node = list_tail(ih); node != NULL; node = list_prev(ih, node)) { + if (ioport == node->vmih_ioport) { + /* Reject duplicate port hook */ + return (EEXIST); + } else if (ioport > node->vmih_ioport) { + break; + } + } + + hook = kmem_alloc(sizeof (*hook), KM_SLEEP); + hook->vmih_ioport = ioport; + hook->vmih_arg = arg; + hook->vmih_rmem_cb = rfunc; + hook->vmih_wmem_cb = wfunc; + if (node == NULL) { + list_insert_head(ih, hook); + } else { + list_insert_after(ih, node, hook); + } + + *cookie = (void *)hook; + return (0); +} + +void +vm_ioport_unhook(struct vm *vm, void **cookie) +{ + vm_ioport_hook_t *hook; + list_t *ih = &vm->ioport_hooks; + + hook = *cookie; + list_remove(ih, hook); + kmem_free(hook, sizeof (*hook)); + *cookie = NULL; +} + +int +vm_ioport_handle_hook(struct vm *vm, int cpuid, bool in, int port, int bytes, + uint32_t *val) +{ + vm_ioport_hook_t *hook; + list_t *ih = &vm->ioport_hooks; + int err = 0; + + for (hook = list_head(ih); hook != NULL; hook = list_next(ih, hook)) { + if (hook->vmih_ioport == port) { + break; + } + } + if (hook == NULL) { + return (ENOENT); + } + + if (in) { + uint64_t tval; + + if (hook->vmih_rmem_cb == NULL) { + return (ENOENT); + } + err = hook->vmih_rmem_cb(hook->vmih_arg, (uintptr_t)port, + (uint_t)bytes, &tval); + *val = (uint32_t)tval; + } else { + if (hook->vmih_wmem_cb == NULL) { + return (ENOENT); + } + err = hook->vmih_wmem_cb(hook->vmih_arg, (uintptr_t)port, + (uint_t)bytes, (uint64_t)*val); + } + + return (err); +} + + +#endif /* __FreeBSD__ */ diff --git a/usr/src/uts/i86pc/io/vmm/vmm.mapfile b/usr/src/uts/i86pc/io/vmm/vmm.mapfile new file mode 100644 index 0000000000..83c14de895 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm.mapfile @@ -0,0 +1,62 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2019 Joyent, Inc. +# + +# +# MAPFILE HEADER START +# +# WARNING: STOP NOW. DO NOT MODIFY THIS FILE. 
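The ioport hook list added above lets external kernel consumers claim a single I/O port: registration keeps the list sorted, rejects a duplicate port with EEXIST, and vm_ioport_handle_hook() consults it for each IN/OUT that no built-in handler claims. A hypothetical driver-side user (the callback, port number, and mydev_arg are invented for illustration):

static int
mydev_ioport_read(void *arg, uintptr_t port, uint_t bytes, uint64_t *valp)
{
        /* Hypothetical device: reads float the bus. */
        *valp = ~0ULL;
        return (0);
}

        void *cookie;
        int err;

        /* 0x600 is a made-up port number for illustration. */
        err = vm_ioport_hook(vm, 0x600, mydev_ioport_read,
            NULL /* writes unhandled: guest OUT yields ENOENT */,
            mydev_arg, &cookie);
        if (err == 0) {
                /* ... device lifetime ... */
                vm_ioport_unhook(vm, &cookie);
        }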
+# Object versioning must comply with the rules detailed in +# +# usr/src/lib/README.mapfiles +# +# You should not be making modifications here until you've read the most current +# copy of that file. If you need help, contact a gatekeeper for guidance. +# +# MAPFILE HEADER END +# + +$mapfile_version 2 + +SYMBOL_VERSION ILLUMOSprivate { + global: + # DDI Interfaces + _fini; + _init; + _info; + + # bhyve driver API + vmm_drv_hold; + vmm_drv_rele; + vmm_drv_release_reqd; + vmm_drv_lease_sign; + vmm_drv_lease_break; + vmm_drv_lease_expired; + vmm_drv_gpa2kva; + vmm_drv_ioport_hook; + vmm_drv_ioport_unhook; + vmm_drv_msi; + + # IOMMU API for PCI pass-thru + iommu_add_device; + iommu_host_domain; + iommu_remove_device; + lapic_intr_msi; + vm_iommu_domain; + vm_map_mmio; + vm_unmap_mmio; + + local: + *; +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.c b/usr/src/uts/i86pc/io/vmm/vmm_host.c index b94caf4009..9e390c93dd 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_host.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_host.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -39,7 +41,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/pcpu.h> @@ -50,11 +52,14 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z ne #include "vmm_host.h" -static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4; +static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4, + vmm_host_xcr0; +static struct xsave_limits vmm_xsave_limits; void vmm_host_state_init(void) { + unsigned int regs[4]; vmm_host_efer = rdmsr(MSR_EFER); vmm_host_pat = rdmsr(MSR_PAT); @@ -68,7 +73,36 @@ vmm_host_state_init(void) */ vmm_host_cr0 = rcr0() | CR0_TS; - vmm_host_cr4 = rcr4(); + /* + * On non-PCID or PCID but without INVPCID support machines, + * we flush kernel i.e. global TLB entries, by temporary + * clearing the CR4.PGE bit, see invltlb_glob(). If + * preemption occurs at the wrong time, cached vmm_host_cr4 + * might store the value with CR4.PGE cleared. Since FreeBSD + * requires support for PG_G on amd64, just set it + * unconditionally. + */ + vmm_host_cr4 = rcr4() | CR4_PGE; + + /* + * Only permit a guest to use XSAVE if the host is using + * XSAVE. Only permit a guest to use XSAVE features supported + * by the host. This ensures that the FPU state used by the + * guest is always a subset of the saved guest FPU state. + * + * In addition, only permit known XSAVE features where the + * rules for which features depend on other features is known + * to properly emulate xsetbv. 
+ */ + if (vmm_host_cr4 & CR4_XSAVE) { + vmm_xsave_limits.xsave_enabled = 1; + vmm_host_xcr0 = rxcr(0); + vmm_xsave_limits.xcr0_allowed = vmm_host_xcr0 & + (XFEATURE_AVX | XFEATURE_MPX | XFEATURE_AVX512); + + cpuid_count(0xd, 0x0, regs); + vmm_xsave_limits.xsave_max_size = regs[1]; + } } uint64_t @@ -100,6 +134,13 @@ vmm_get_host_cr4(void) } uint64_t +vmm_get_host_xcr0(void) +{ + + return (vmm_host_xcr0); +} + +uint64_t vmm_get_host_datasel(void) { @@ -122,7 +163,6 @@ vmm_get_host_codesel(void) #endif } - uint64_t vmm_get_host_tsssel(void) { @@ -158,3 +198,10 @@ vmm_get_host_idtrbase(void) return (idtr.dtr_base); #endif } + +const struct xsave_limits * +vmm_get_xsave_limits(void) +{ + + return (&vmm_xsave_limits); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.h b/usr/src/uts/i86pc/io/vmm/vmm_host.h index 5de015a228..f12047819d 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_host.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_host.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_host.h 242275 2012-10-29 01:51:24Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,6 +38,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _VMM_HOST_H_ @@ -46,20 +49,28 @@ #endif #ifndef _KERNEL -#error "no user-servicable parts inside" +#error "no user-serviceable parts inside" #endif +struct xsave_limits { + int xsave_enabled; + uint64_t xcr0_allowed; + uint32_t xsave_max_size; +}; + void vmm_host_state_init(void); uint64_t vmm_get_host_pat(void); uint64_t vmm_get_host_efer(void); uint64_t vmm_get_host_cr0(void); uint64_t vmm_get_host_cr4(void); +uint64_t vmm_get_host_xcr0(void); uint64_t vmm_get_host_datasel(void); uint64_t vmm_get_host_codesel(void); uint64_t vmm_get_host_tsssel(void); uint64_t vmm_get_host_fsbase(void); uint64_t vmm_get_host_idtrbase(void); +const struct xsave_limits *vmm_get_xsave_limits(void); /* * Inline access to host state that is used on every VM entry @@ -89,8 +100,10 @@ vmm_get_host_gdtrbase(void) #endif } +#ifdef __FreeBSD__ struct pcpu; extern struct pcpu __pcpu[]; +#endif static __inline uint64_t vmm_get_host_gsbase(void) diff --git a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c index 72c7056e26..ea96cd8db0 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2012 Sandvine, Inc. * Copyright (c) 2012 NetApp, Inc. * All rights reserved. @@ -24,7 +26,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_instruction_emul.c 281987 2015-04-25 19:02:06Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -37,15 +39,17 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. 
*/ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_instruction_emul.c 281987 2015-04-25 19:02:06Z tychon $"); +__FBSDID("$FreeBSD$"); #ifdef _KERNEL #include <sys/param.h> #include <sys/pcpu.h> #include <sys/systm.h> +#include <sys/proc.h> #include <vm/vm.h> #include <vm/pmap.h> @@ -84,6 +88,9 @@ enum { VIE_OP_TYPE_MOVS, VIE_OP_TYPE_GROUP1, VIE_OP_TYPE_STOS, + VIE_OP_TYPE_BITTEST, + VIE_OP_TYPE_TWOB_GRP15, + VIE_OP_TYPE_ADD, VIE_OP_TYPE_LAST }; @@ -94,7 +101,12 @@ enum { #define VIE_OP_F_NO_MODRM (1 << 3) #define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4) +#ifdef _KERNEL static const struct vie_op two_byte_opcodes[256] = { + [0xAE] = { + .op_byte = 0xAE, + .op_type = VIE_OP_TYPE_TWOB_GRP15, + }, [0xB6] = { .op_byte = 0xB6, .op_type = VIE_OP_TYPE_MOVZX, @@ -103,6 +115,11 @@ static const struct vie_op two_byte_opcodes[256] = { .op_byte = 0xB7, .op_type = VIE_OP_TYPE_MOVZX, }, + [0xBA] = { + .op_byte = 0xBA, + .op_type = VIE_OP_TYPE_BITTEST, + .op_flags = VIE_OP_F_IMM8, + }, [0xBE] = { .op_byte = 0xBE, .op_type = VIE_OP_TYPE_MOVSX, @@ -110,14 +127,26 @@ static const struct vie_op two_byte_opcodes[256] = { }; static const struct vie_op one_byte_opcodes[256] = { + [0x03] = { + .op_byte = 0x03, + .op_type = VIE_OP_TYPE_ADD, + }, [0x0F] = { .op_byte = 0x0F, .op_type = VIE_OP_TYPE_TWO_BYTE }, + [0x0B] = { + .op_byte = 0x0B, + .op_type = VIE_OP_TYPE_OR, + }, [0x2B] = { .op_byte = 0x2B, .op_type = VIE_OP_TYPE_SUB, }, + [0x39] = { + .op_byte = 0x39, + .op_type = VIE_OP_TYPE_CMP, + }, [0x3B] = { .op_byte = 0x3B, .op_type = VIE_OP_TYPE_CMP, @@ -183,14 +212,20 @@ static const struct vie_op one_byte_opcodes[256] = { .op_byte = 0x23, .op_type = VIE_OP_TYPE_AND, }, + [0x80] = { + /* Group 1 extended opcode */ + .op_byte = 0x80, + .op_type = VIE_OP_TYPE_GROUP1, + .op_flags = VIE_OP_F_IMM8, + }, [0x81] = { - /* XXX Group 1 extended opcode */ + /* Group 1 extended opcode */ .op_byte = 0x81, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM, }, [0x83] = { - /* XXX Group 1 extended opcode */ + /* Group 1 extended opcode */ .op_byte = 0x83, .op_type = VIE_OP_TYPE_GROUP1, .op_flags = VIE_OP_F_IMM8, @@ -206,6 +241,7 @@ static const struct vie_op one_byte_opcodes[256] = { .op_type = VIE_OP_TYPE_PUSH, } }; +#endif /* struct vie.mod */ #define VIE_MOD_INDIRECT 0 @@ -394,6 +430,41 @@ getcc(int opsize, uint64_t x, uint64_t y) return (getcc64(x, y)); } +/* + * Macro creation of functions getaddflags{8,16,32,64} + */ +#define GETADDFLAGS(sz) \ +static u_long \ +getaddflags##sz(uint##sz##_t x, uint##sz##_t y) \ +{ \ + u_long rflags; \ + \ + __asm __volatile("add %2,%1; pushfq; popq %0" : \ + "=r" (rflags), "+r" (x) : "m" (y)); \ + return (rflags); \ +} struct __hack + +GETADDFLAGS(8); +GETADDFLAGS(16); +GETADDFLAGS(32); +GETADDFLAGS(64); + +static u_long +getaddflags(int opsize, uint64_t x, uint64_t y) +{ + KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8, + ("getaddflags: invalid operand size %d", opsize)); + + if (opsize == 1) + return (getaddflags8(x, y)); + else if (opsize == 2) + return (getaddflags16(x, y)); + else if (opsize == 4) + return (getaddflags32(x, y)); + else + return (getaddflags64(x, y)); +} + static int emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) @@ -596,13 +667,11 @@ emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, /* * Helper function to calculate and validate a linear address. - * - * Returns 0 on success and 1 if an exception was injected into the guest. 
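GETADDFLAGS is the ADD-flavored sibling of the existing getcc() helpers: it performs the addition natively and captures the resulting %rflags with pushfq/popq so the emulator can hand the guest genuine hardware status bits. A standalone illustration of the 32-bit case, mirroring the macro body (x86-64 host, GNU-style inline asm):

#include <stdio.h>
#include <stdint.h>

static unsigned long
getaddflags32(uint32_t x, uint32_t y)
{
        unsigned long rflags;

        __asm__ __volatile__("add %2,%1; pushfq; popq %0" :
            "=r" (rflags), "+r" (x) : "m" (y));
        return (rflags);
}

int
main(void)
{
        /* 0xffffffff + 1 wraps to 0: expect CF (bit 0) and ZF (bit 6). */
        unsigned long fl = getaddflags32(0xffffffffu, 1);

        printf("CF=%lu ZF=%lu\n", fl & 1, (fl >> 6) & 1);
        return (0);
}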
*/ static int get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, int opsize, int addrsize, int prot, enum vm_reg_name seg, - enum vm_reg_name gpr, uint64_t *gla) + enum vm_reg_name gpr, uint64_t *gla, int *fault) { struct seg_desc desc; uint64_t cr0, val, rflags; @@ -628,7 +697,7 @@ get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); - return (1); + goto guest_fault; } if (vie_canonical_check(paging->cpu_mode, *gla)) { @@ -636,14 +705,19 @@ get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging, vm_inject_ss(vm, vcpuid, 0); else vm_inject_gp(vm, vcpuid); - return (1); + goto guest_fault; } if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) { vm_inject_ac(vm, vcpuid, 0); - return (1); + goto guest_fault; } + *fault = 0; + return (0); + +guest_fault: + *fault = 1; return (0); } @@ -659,7 +733,7 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, #endif uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val; uint64_t rcx, rdi, rsi, rflags; - int error, opsize, seg, repeat; + int error, fault, opsize, seg, repeat; opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize; val = 0; @@ -682,8 +756,10 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * The count register is %rcx, %ecx or %cx depending on the * address size of the instruction. */ - if ((rcx & vie_size2mask(vie->addrsize)) == 0) - return (0); + if ((rcx & vie_size2mask(vie->addrsize)) == 0) { + error = 0; + goto done; + } } /* @@ -704,13 +780,16 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS; error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, - PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr); - if (error) + PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr, &fault); + if (error || fault) goto done; error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ, - copyinfo, nitems(copyinfo)); + copyinfo, nitems(copyinfo), &fault); if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + /* * case (2): read from system memory and write to mmio. */ @@ -719,11 +798,6 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = memwrite(vm, vcpuid, gpa, val, opsize, arg); if (error) goto done; - } else if (error > 0) { - /* - * Resume guest execution to handle fault. - */ - goto done; } else { /* * 'vm_copy_setup()' is expected to fail for cases (3) and (4) @@ -731,13 +805,17 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, */ error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize, - PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr); - if (error) + PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr, + &fault); + if (error || fault) goto done; error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize, - PROT_WRITE, copyinfo, nitems(copyinfo)); + PROT_WRITE, copyinfo, nitems(copyinfo), &fault); if (error == 0) { + if (fault) + goto done; /* Resume guest to handle fault */ + /* * case (3): read from MMIO and write to system memory. * @@ -753,27 +831,29 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, vm_copyout(vm, vcpuid, &val, copyinfo, opsize); vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); - } else if (error > 0) { - /* - * Resume guest execution to handle fault. - */ - goto done; } else { /* * Case (4): read from and write to mmio. 
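One of the checks get_gla() applies is the 64-bit canonical-address rule: bits 63:47 must all be copies of bit 47, otherwise the access raises #SS (for stack references) or #GP. A standalone sketch of the predicate (the in-tree helper is vie_canonical_check(); this version assumes 48-bit virtual addresses and ignores the cpu_mode argument, under which non-64-bit modes skip the check):

#include <stdint.h>

/* Return 1 if 'gla' is non-canonical for a 48-bit implementation. */
static int
noncanonical(uint64_t gla)
{
        uint64_t hi = gla >> 47;        /* bits 63:47, 17 bits wide */

        return (hi != 0 && hi != 0x1ffff);
}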
+ * + * Commit to the MMIO read/write (with potential + * side-effects) only after we are sure that the + * instruction is not going to be restarted due + * to address translation faults. */ error = vm_gla2gpa(vm, vcpuid, paging, srcaddr, - PROT_READ, &srcgpa); - if (error) - goto done; - error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); - if (error) + PROT_READ, &srcgpa, &fault); + if (error || fault) goto done; error = vm_gla2gpa(vm, vcpuid, paging, dstaddr, - PROT_WRITE, &dstgpa); + PROT_WRITE, &dstgpa, &fault); + if (error || fault) + goto done; + + error = memread(vm, vcpuid, srcgpa, &val, opsize, arg); if (error) goto done; + error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg); if (error) goto done; @@ -818,10 +898,9 @@ emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, vm_restart_instruction(vm, vcpuid); } done: - if (error < 0) - return (EFAULT); - else - return (0); + KASSERT(error == 0 || error == EFAULT, ("%s: unexpected error %d", + __func__, error)); + return (error); } static int @@ -979,12 +1058,38 @@ emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; - uint64_t val1, result, rflags, rflags2; + enum vm_reg_name reg; + uint64_t result, rflags, rflags2, val1, val2; size = vie->opsize; error = EINVAL; switch (vie->op.op_byte) { + case 0x0B: + /* + * OR reg (ModRM:reg) and mem (ModRM:r/m) and store the + * result in reg. + * + * 0b/r or r16, r/m16 + * 0b/r or r32, r/m32 + * REX.W + 0b/r or r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + result = val1 | val2; + error = vie_update_register(vm, vcpuid, reg, result, size); + break; case 0x81: case 0x83: /* @@ -1041,39 +1146,55 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { int error, size; - uint64_t op1, op2, rflags, rflags2; + uint64_t regop, memop, op1, op2, rflags, rflags2; enum vm_reg_name reg; size = vie->opsize; switch (vie->op.op_byte) { + case 0x39: case 0x3B: /* + * 39/r CMP r/m16, r16 + * 39/r CMP r/m32, r32 + * REX.W 39/r CMP r/m64, r64 + * * 3B/r CMP r16, r/m16 * 3B/r CMP r32, r/m32 * REX.W + 3B/r CMP r64, r/m64 * - * Compare first operand (reg) with second operand (r/m) and + * Compare the first operand with the second operand and * set status flags in EFLAGS register. The comparison is * performed by subtracting the second operand from the first * operand and then setting the status flags. 
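Opcode direction is the whole point of the new 0x39 case below: CMP sets flags as if computing op1 - op2, so 0x3B (CMP r, r/m) and 0x39 (CMP r/m, r) must feed getcc() in opposite orders. For example, regop=1 and memop=2 produce a borrow (CF=1) under 0x3B but not under 0x39. Distilled into a standalone helper (names invented for illustration):

#include <stdint.h>

struct cmp_ops {
        uint64_t op1;   /* minuend */
        uint64_t op2;   /* subtrahend */
};

static struct cmp_ops
cmp_order(uint8_t op_byte, uint64_t regop, uint64_t memop)
{
        struct cmp_ops o;

        if (op_byte == 0x3B) {          /* CMP r, r/m */
                o.op1 = regop;
                o.op2 = memop;
        } else {                        /* 0x39: CMP r/m, r */
                o.op1 = memop;
                o.op2 = regop;
        }
        return (o);
}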
*/ - /* Get the first operand */ + /* Get the register operand */ reg = gpr_map[vie->reg]; - error = vie_read_register(vm, vcpuid, reg, &op1); + error = vie_read_register(vm, vcpuid, reg, ®op); if (error) return (error); - /* Get the second operand */ - error = memread(vm, vcpuid, gpa, &op2, size, arg); + /* Get the memory operand */ + error = memread(vm, vcpuid, gpa, &memop, size, arg); if (error) return (error); + if (vie->op.op_byte == 0x3B) { + op1 = regop; + op2 = memop; + } else { + op1 = memop; + op2 = regop; + } rflags2 = getcc(size, op1, op2); break; + case 0x80: case 0x81: case 0x83: /* + * 80 /7 cmp r/m8, imm8 + * REX + 80 /7 cmp r/m8, imm8 + * * 81 /7 cmp r/m16, imm16 * 81 /7 cmp r/m32, imm32 * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64 @@ -1089,6 +1210,8 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, * the status flags. * */ + if (vie->op.op_byte == 0x80) + size = 1; /* get the first operand */ error = memread(vm, vcpuid, gpa, &op1, size, arg); @@ -1111,6 +1234,62 @@ emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, } static int +emulate_add(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *arg) +{ + int error, size; + uint64_t nval, rflags, rflags2, val1, val2; + enum vm_reg_name reg; + + size = vie->opsize; + error = EINVAL; + + switch (vie->op.op_byte) { + case 0x03: + /* + * ADD r/m to r and store the result in r + * + * 03/r ADD r16, r/m16 + * 03/r ADD r32, r/m32 + * REX.W + 03/r ADD r64, r/m64 + */ + + /* get the first operand */ + reg = gpr_map[vie->reg]; + error = vie_read_register(vm, vcpuid, reg, &val1); + if (error) + break; + + /* get the second operand */ + error = memread(vm, vcpuid, gpa, &val2, size, arg); + if (error) + break; + + /* perform the operation and write the result */ + nval = val1 + val2; + error = vie_update_register(vm, vcpuid, reg, nval, size); + break; + default: + break; + } + + if (!error) { + rflags2 = getaddflags(size, val1, val2); + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + &rflags); + if (error) + return (error); + + rflags &= ~RFLAGS_STATUS_BITS; + rflags |= rflags2 & RFLAGS_STATUS_BITS; + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, + rflags, 8); + } + + return (error); +} + +static int emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, mem_region_read_t memread, mem_region_write_t memwrite, void *arg) { @@ -1178,7 +1357,7 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, #endif struct seg_desc ss_desc; uint64_t cr0, rflags, rsp, stack_gla, val; - int error, size, stackaddrsize, pushop; + int error, fault, size, stackaddrsize, pushop; val = 0; size = vie->opsize; @@ -1201,7 +1380,7 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, size = vie->opsize_override ? 2 : 8; } else { /* - * In protected or compability mode the 'B' flag in the + * In protected or compatibility mode the 'B' flag in the * stack-segment descriptor determines the size of the * stack pointer. */ @@ -1244,18 +1423,10 @@ emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie, } error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size, - pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo)); - if (error == -1) { - /* - * XXX cannot return a negative error value here because it - * ends up being the return value of the VM_RUN() ioctl and - * is interpreted as a pseudo-error (for e.g. ERESTART). 
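emulate_add() below (like the OR/AND/CMP emulations) updates guest %rflags with a common mask-and-merge: only the six arithmetic status bits come from the freshly computed flags, everything else in the guest's %rflags is preserved. A standalone sketch (SK_* values mirror the x86 PSL_* definitions behind RFLAGS_STATUS_BITS):

#include <stdint.h>

#define SK_PSL_C        0x001   /* carry */
#define SK_PSL_PF       0x004   /* parity */
#define SK_PSL_AF       0x010   /* auxiliary carry */
#define SK_PSL_Z        0x040   /* zero */
#define SK_PSL_N        0x080   /* sign */
#define SK_PSL_V        0x800   /* overflow */
#define SK_STATUS_BITS  \
        (SK_PSL_C | SK_PSL_PF | SK_PSL_AF | SK_PSL_Z | SK_PSL_N | SK_PSL_V)

static uint64_t
merge_status(uint64_t guest_rflags, uint64_t computed)
{
        return ((guest_rflags & ~(uint64_t)SK_STATUS_BITS) |
            (computed & SK_STATUS_BITS));
}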
- */ - return (EFAULT); - } else if (error == 1) { - /* Resume guest execution to handle page fault */ - return (0); - } + pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo), + &fault); + if (error || fault) + return (error); if (pushop) { error = memread(vm, vcpuid, mmio_gpa, &val, size, arg); @@ -1346,6 +1517,79 @@ emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, return (error); } +static int +emulate_bittest(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) +{ + uint64_t val, rflags; + int error, bitmask, bitoff; + + /* + * 0F BA is a Group 8 extended opcode. + * + * Currently we only emulate the 'Bit Test' instruction which is + * identified by a ModR/M:reg encoding of 100b. + */ + if ((vie->reg & 7) != 4) + return (EINVAL); + + error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags); + KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error)); + + error = memread(vm, vcpuid, gpa, &val, vie->opsize, memarg); + if (error) + return (error); + + /* + * Intel SDM, Vol 2, Table 3-2: + * "Range of Bit Positions Specified by Bit Offset Operands" + */ + bitmask = vie->opsize * 8 - 1; + bitoff = vie->immediate & bitmask; + + /* Copy the bit into the Carry flag in %rflags */ + if (val & (1UL << bitoff)) + rflags |= PSL_C; + else + rflags &= ~PSL_C; + + error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8); + KASSERT(error == 0, ("%s: error %d updating rflags", __func__, error)); + + return (0); +} + +static int +emulate_twob_group15(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, + mem_region_read_t memread, mem_region_write_t memwrite, void *memarg) +{ + int error; + uint64_t buf; + + switch (vie->reg & 7) { + case 0x7: /* CLFLUSH, CLFLUSHOPT, and SFENCE */ + if (vie->mod == 0x3) { + /* + * SFENCE. Ignore it, VM exit provides enough + * barriers on its own. + */ + error = 0; + } else { + /* + * CLFLUSH, CLFLUSHOPT. Only check for access + * rights. 
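Per the SDM table cited in emulate_bittest() below, an immediate bit offset is taken modulo the operand size in bits, and only CF is affected by BT. The arithmetic in isolation:

#include <stdint.h>

/* Return the would-be CF value for BT r/m, imm8. */
static int
bt_imm(uint64_t val, int opsize /* bytes */, uint8_t imm)
{
        int bitmask = opsize * 8 - 1;   /* e.g. 15 for 16-bit operands */
        int bitoff = imm & bitmask;

        return ((int)((val >> bitoff) & 1));
}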
+ */ + error = memread(vm, vcpuid, gpa, &buf, 1, memarg); + } + break; + default: + error = EINVAL; + break; + } + + return (error); +} + int vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, struct vm_guest_paging *paging, mem_region_read_t memread, @@ -1402,6 +1646,18 @@ vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie, error = emulate_sub(vm, vcpuid, gpa, vie, memread, memwrite, memarg); break; + case VIE_OP_TYPE_BITTEST: + error = emulate_bittest(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_TWOB_GRP15: + error = emulate_twob_group15(vm, vcpuid, gpa, vie, + memread, memwrite, memarg); + break; + case VIE_OP_TYPE_ADD: + error = emulate_add(vm, vcpuid, gpa, vie, memread, + memwrite, memarg); + break; default: error = EINVAL; break; @@ -1623,27 +1879,31 @@ ptp_release(void **cookie) } static void * -ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie) +ptp_hold(struct vm *vm, int vcpu, vm_paddr_t ptpphys, size_t len, void **cookie) { void *ptr; ptp_release(cookie); - ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie); + ptr = vm_gpa_hold(vm, vcpu, ptpphys, len, VM_PROT_RW, cookie); return (ptr); } -int -vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa) +static int +_vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault, bool check_only) { - int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable; + int nlevels, pfcode, retval, usermode, writable; + int ptpshift = 0, ptpindex = 0; + uint64_t ptpphys; + uint64_t *ptpbase = NULL, pte = 0, pgsize = 0; #ifdef __FreeBSD__ -#endif u_int retries; - uint64_t *ptpbase, ptpphys, pte, pgsize; +#endif uint32_t *ptpbase32, pte32; void *cookie; + *guest_fault = 0; + usermode = (paging->cpl == 3 ? 1 : 0); writable = prot & VM_PROT_WRITE; cookie = NULL; @@ -1664,7 +1924,8 @@ restart: * XXX assuming a non-stack reference otherwise a stack fault * should be generated. */ - vm_inject_gp(vm, vcpuid); + if (!check_only) + vm_inject_gp(vm, vcpuid); goto fault; } @@ -1679,7 +1940,8 @@ restart: /* Zero out the lower 12 bits. */ ptpphys &= ~0xfff; - ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + ptpbase32 = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, + &cookie); if (ptpbase32 == NULL) goto error; @@ -1693,9 +1955,11 @@ restart: if ((pte32 & PG_V) == 0 || (usermode && (pte32 & PG_U) == 0) || (writable && (pte32 & PG_RW) == 0)) { - pfcode = pf_error_code(usermode, prot, 0, - pte32); - vm_inject_pf(vm, vcpuid, pfcode, gla); + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, + pte32); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } goto fault; } @@ -1706,7 +1970,7 @@ restart: * is only set at the last level providing the guest * physical address. 
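The walk code sets accessed/dirty bits with atomic_cmpset and, on failure, jumps back to 'restart' rather than retrying just the one store: another walker (or the guest itself) may have changed the entry, so the whole walk is redone against fresh PTE contents. The shape of the idiom, reduced to one level (a sketch; atomic_cmpset_64 returns non-zero on success):

        uint64_t pte;

restart:
        pte = ptpbase[ptpindex];
        /* ... validity and permission checks on 'pte' ... */
        if ((pte & PG_A) == 0) {
                if (atomic_cmpset_64(&ptpbase[ptpindex], pte,
                    pte | PG_A) == 0)
                        goto restart;   /* entry changed underfoot */
        }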
*/ - if ((pte32 & PG_A) == 0) { + if (!check_only && (pte32 & PG_A) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_A) == 0) { goto restart; @@ -1721,7 +1985,7 @@ restart: } /* Set the dirty bit in the page table entry if necessary */ - if (writable && (pte32 & PG_M) == 0) { + if (!check_only && writable && (pte32 & PG_M) == 0) { if (atomic_cmpset_32(&ptpbase32[ptpindex], pte32, pte32 | PG_M) == 0) { goto restart; @@ -1738,7 +2002,8 @@ restart: /* Zero out the lower 5 bits and the upper 32 bits */ ptpphys &= 0xffffffe0UL; - ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie); + ptpbase = ptp_hold(vm, vcpuid, ptpphys, sizeof(*ptpbase) * 4, + &cookie); if (ptpbase == NULL) goto error; @@ -1747,8 +2012,10 @@ restart: pte = ptpbase[ptpindex]; if ((pte & PG_V) == 0) { - pfcode = pf_error_code(usermode, prot, 0, pte); - vm_inject_pf(vm, vcpuid, pfcode, gla); + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } goto fault; } @@ -1761,7 +2028,7 @@ restart: /* Zero out the lower 12 bits and the upper 12 bits */ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12; - ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie); + ptpbase = ptp_hold(vm, vcpuid, ptpphys, PAGE_SIZE, &cookie); if (ptpbase == NULL) goto error; @@ -1774,13 +2041,15 @@ restart: if ((pte & PG_V) == 0 || (usermode && (pte & PG_U) == 0) || (writable && (pte & PG_RW) == 0)) { - pfcode = pf_error_code(usermode, prot, 0, pte); - vm_inject_pf(vm, vcpuid, pfcode, gla); + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 0, pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } goto fault; } /* Set the accessed bit in the page table entry */ - if ((pte & PG_A) == 0) { + if (!check_only && (pte & PG_A) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_A) == 0) { goto restart; @@ -1789,8 +2058,11 @@ restart: if (nlevels > 0 && (pte & PG_PS) != 0) { if (pgsize > 1 * GB) { - pfcode = pf_error_code(usermode, prot, 1, pte); - vm_inject_pf(vm, vcpuid, pfcode, gla); + if (!check_only) { + pfcode = pf_error_code(usermode, prot, 1, + pte); + vm_inject_pf(vm, vcpuid, pfcode, gla); + } goto fault; } break; @@ -1800,7 +2072,7 @@ restart: } /* Set the dirty bit in the page table entry if necessary */ - if (writable && (pte & PG_M) == 0) { + if (!check_only && writable && (pte & PG_M) == 0) { if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0) goto restart; } @@ -1810,18 +2082,38 @@ restart: *gpa = pte | (gla & (pgsize - 1)); done: ptp_release(&cookie); + KASSERT(retval == 0 || retval == EFAULT, ("%s: unexpected retval %d", + __func__, retval)); return (retval); error: - retval = -1; + retval = EFAULT; goto done; fault: - retval = 1; + *guest_fault = 1; goto done; } int +vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + + return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, + false)); +} + +int +vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *guest_fault) +{ + + return (_vm_gla2gpa(vm, vcpuid, paging, gla, prot, gpa, guest_fault, + true)); +} + +int vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, - uint64_t rip, int inst_length, struct vie *vie) + uint64_t rip, int inst_length, struct vie *vie, int *faultptr) { struct vm_copyinfo copyinfo[2]; int error, prot; @@ -1831,13 +2123,14 @@ vmm_fetch_instruction(struct vm *vm, int vcpuid, 
struct vm_guest_paging *paging, prot = PROT_READ | PROT_EXEC; error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot, - copyinfo, nitems(copyinfo)); - if (error == 0) { - vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); - vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); - vie->num_valid = inst_length; - } - return (error); + copyinfo, nitems(copyinfo), faultptr); + if (error || *faultptr) + return (error); + + vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length); + vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo)); + vie->num_valid = inst_length; + return (0); } static int @@ -2262,27 +2555,17 @@ decode_moffset(struct vie *vie) } /* - * Verify that all the bytes in the instruction buffer were consumed. - */ -static int -verify_inst_length(struct vie *vie) -{ - - if (vie->num_processed) - return (0); - else - return (-1); -} - -/* * Verify that the 'guest linear address' provided as collateral of the nested * page table fault matches with our instruction decoding. */ static int -verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) +verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie, + enum vm_cpu_mode cpu_mode) { int error; - uint64_t base, idx, gla2; + uint64_t base, segbase, idx, gla2; + enum vm_reg_name seg; + struct seg_desc desc; /* Skip 'gla' verification */ if (gla == VIE_INVALID_GLA) @@ -2302,7 +2585,7 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) * instruction */ if (vie->base_register == VM_REG_GUEST_RIP) - base += vie->num_valid; + base += vie->num_processed; } idx = 0; @@ -2315,14 +2598,48 @@ verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie) } } - /* XXX assuming that the base address of the segment is 0 */ - gla2 = base + vie->scale * idx + vie->displacement; + /* + * From "Specifying a Segment Selector", Intel SDM, Vol 1 + * + * In 64-bit mode, segmentation is generally (but not + * completely) disabled. The exceptions are the FS and GS + * segments. + * + * In legacy IA-32 mode, when the ESP or EBP register is used + * as the base, the SS segment is the default segment. For + * other data references, except when relative to stack or + * string destination the DS segment is the default. These + * can be overridden to allow other segments to be accessed. 
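verify_gla() recomputes the effective address from the decoded fields and compares it with the hardware-reported GLA; with the segmentation fix it now folds in the segment base as well, masked to the address size. The formula in isolation, with a worked example (helper name invented):

#include <stdint.h>

static uint64_t
effective_gla(uint64_t segbase, uint64_t base, int scale, uint64_t idx,
    int64_t disp, uint64_t addrmask)
{
        return ((segbase + base + (uint64_t)scale * idx + disp) & addrmask);
}

/* e.g. effective_gla(0x1000, 0x2000, 4, 3, -8, UINT32_MAX) == 0x3004 */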
+ */ + if (vie->segment_override) + seg = vie->segment_register; + else if (vie->base_register == VM_REG_GUEST_RSP || + vie->base_register == VM_REG_GUEST_RBP) + seg = VM_REG_GUEST_SS; + else + seg = VM_REG_GUEST_DS; + if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS && + seg != VM_REG_GUEST_GS) { + segbase = 0; + } else { + error = vm_get_seg_desc(vm, cpuid, seg, &desc); + if (error) { + printf("verify_gla: error %d getting segment" + " descriptor %d", error, + vie->segment_register); + return (-1); + } + segbase = desc.base; + } + + gla2 = segbase + base + vie->scale * idx + vie->displacement; gla2 &= size2mask[vie->addrsize]; if (gla != gla2) { - printf("verify_gla mismatch: " + printf("verify_gla mismatch: segbase(0x%0lx)" "base(0x%0lx), scale(%d), index(0x%0lx), " "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n", - base, vie->scale, idx, vie->displacement, gla, gla2); + segbase, base, vie->scale, idx, vie->displacement, + gla, gla2); return (-1); } @@ -2355,11 +2672,8 @@ vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla, if (decode_moffset(vie)) return (-1); - if (verify_inst_length(vie)) - return (-1); - if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) { - if (verify_gla(vm, cpuid, gla, vie)) + if (verify_gla(vm, cpuid, gla, vie, cpu_mode)) return (-1); } diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c index bea750f162..3d08fd5e85 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * @@ -25,12 +27,9 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_ioport.c 277168 2015-01-14 07:18:51Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> -#include <sys/types.h> -#include <sys/queue.h> -#include <sys/cpuset.h> #include <sys/systm.h> #include <machine/vmm.h> @@ -38,6 +37,8 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_ioport.c 277168 2015-01-14 07:18:51Z #include "vatpic.h" #include "vatpit.h" +#include "vpmtmr.h" +#include "vrtc.h" #include "vmm_ioport.h" #include "vmm_ktr.h" @@ -55,6 +56,9 @@ ioport_handler_func_t ioport_handler[MAX_IOPORTS] = { [IO_ICU2 + ICU_IMR_OFFSET] = vatpic_slave_handler, [IO_ELCR1] = vatpic_elc_handler, [IO_ELCR2] = vatpic_elc_handler, + [IO_PMTMR] = vpmtmr_handler, + [IO_RTC] = vrtc_addr_handler, + [IO_RTC + 1] = vrtc_data_handler, }; #ifdef KTR @@ -103,6 +107,7 @@ emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, uint32_t mask, val; int error; +#ifdef __FreeBSD__ /* * If there is no handler for the I/O port then punt to userspace. 
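The two IO_RTC entries added to the handler table below reflect the classic CMOS index/data pair (ports 0x70/0x71): a write to the address port latches a register index, after which the data port reads or writes that register. A minimal standalone model of the idiom (not the vrtc implementation):

#include <stdint.h>

static uint8_t cmos_index;              /* latched by the address port */
static uint8_t cmos_regs[128];          /* register file */

static void
addr_port_write(uint8_t val)
{
        cmos_index = val & 0x7f;        /* top bit is the NMI mask on PCs */
}

static uint8_t
data_port_read(void)
{
        return (cmos_regs[cmos_index]);
}

static void
data_port_write(uint8_t val)
{
        cmos_regs[cmos_index] = val;
}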
*/ @@ -111,6 +116,28 @@ emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, *retu = true; return (0); } +#else /* __FreeBSD__ */ + handler = NULL; + if (vmexit->u.inout.port < MAX_IOPORTS) { + handler = ioport_handler[vmexit->u.inout.port]; + } + /* Look for hooks, if a standard handler is not present */ + if (handler == NULL) { + mask = vie_size2mask(vmexit->u.inout.bytes); + if (!vmexit->u.inout.in) { + val = vmexit->u.inout.eax & mask; + } + error = vm_ioport_handle_hook(vm, vcpuid, vmexit->u.inout.in, + vmexit->u.inout.port, vmexit->u.inout.bytes, &val); + if (error == 0) { + goto finish; + } + + *retu = true; + return (0); + } + +#endif /* __FreeBSD__ */ mask = vie_size2mask(vmexit->u.inout.bytes); @@ -131,6 +158,9 @@ emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit, return (EIO); } +#ifndef __FreeBSD__ +finish: +#endif /* __FreeBSD__ */ if (vmexit->u.inout.in) { vmexit->u.inout.eax &= ~mask; vmexit->u.inout.eax |= val & mask; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h index 624dd8f1d8..14e315f400 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com> * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_ioport.h 273706 2014-10-26 19:03:06Z neel $ + * $FreeBSD$ */ #ifndef _VMM_IOPORT_H_ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ktr.h b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h index 917c7f83a4..414d0341cc 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_ktr.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_ktr.h 258699 2013-11-27 22:18:08Z neel $ + * $FreeBSD$ */ #ifndef _VMM_KTR_H_ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c index 3215c74a44..43b2bebe97 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * $FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z tychon $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -39,7 +41,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z tychon $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/systm.h> @@ -49,7 +51,6 @@ __FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z t #include <x86/apicreg.h> #include <machine/vmm.h> -#include "vmm_ipi.h" #include "vmm_ktr.h" #include "vmm_lapic.h" #include "vlapic.h" @@ -67,10 +68,14 @@ lapic_set_intr(struct vm *vm, int cpu, int vector, bool level) { struct vlapic *vlapic; - if (cpu < 0 || cpu >= VM_MAXCPU) + if (cpu < 0 || cpu >= vm_get_maxcpus(vm)) return (EINVAL); - if (vector < 32 || vector > 255) + /* + * According to section "Maskable Hardware Interrupts" in Intel SDM + * vectors 16 through 255 can be delivered through the local APIC. + */ + if (vector < 16 || vector > 255) return (EINVAL); vlapic = vm_lapic(vm, cpu); @@ -86,7 +91,7 @@ lapic_set_local_intr(struct vm *vm, int cpu, int vector) cpuset_t dmask; int error; - if (cpu < -1 || cpu >= VM_MAXCPU) + if (cpu < -1 || cpu >= vm_get_maxcpus(vm)) return (EINVAL); if (cpu == -1) diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.h b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h index ee47ee7783..da3b0ff660 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_lapic.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_lapic.h 259863 2013-12-25 06:46:31Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.c b/usr/src/uts/i86pc/io/vmm/vmm_mem.c new file mode 100644 index 0000000000..a736d94bba --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_mem.c @@ -0,0 +1,124 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/sglist.h> +#include <sys/lock.h> +#include <sys/rwlock.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> + +#include <machine/md_var.h> + +#include "vmm_mem.h" + +int +vmm_mem_init(void) +{ + + return (0); +} + +vm_object_t +vmm_mmio_alloc(struct vmspace *vmspace, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa) +{ + int error; + vm_object_t obj; + struct sglist *sg; + + sg = sglist_alloc(1, M_WAITOK); + error = sglist_append_phys(sg, hpa, len); + KASSERT(error == 0, ("error %d appending physaddr to sglist", error)); + + obj = vm_pager_allocate(OBJT_SG, sg, len, VM_PROT_RW, 0, NULL); + if (obj != NULL) { + /* + * VT-x ignores the MTRR settings when figuring out the + * memory type for translations obtained through EPT. + * + * Therefore we explicitly force the pages provided by + * this object to be mapped as uncacheable. + */ + VM_OBJECT_WLOCK(obj); + error = vm_object_set_memattr(obj, VM_MEMATTR_UNCACHEABLE); + VM_OBJECT_WUNLOCK(obj); + if (error != KERN_SUCCESS) { + panic("vmm_mmio_alloc: vm_object_set_memattr error %d", + error); + } + error = vm_map_find(&vmspace->vm_map, obj, 0, &gpa, len, 0, + VMFS_NO_SPACE, VM_PROT_RW, VM_PROT_RW, 0); + if (error != KERN_SUCCESS) { + vm_object_deallocate(obj); + obj = NULL; + } + } + + /* + * Drop the reference on the sglist. + * + * If the scatter/gather object was successfully allocated then it + * has incremented the reference count on the sglist. Dropping the + * initial reference count ensures that the sglist will be freed + * when the object is deallocated. + * + * If the object could not be allocated then we end up freeing the + * sglist. + */ + sglist_free(sg); + + return (obj); +} + +void +vmm_mmio_free(struct vmspace *vmspace, vm_paddr_t gpa, size_t len) +{ + + vm_map_remove(&vmspace->vm_map, gpa, gpa + len); +} + +vm_paddr_t +vmm_mem_maxaddr(void) +{ + + return (ptoa(Maxmem)); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.h b/usr/src/uts/i86pc/io/vmm/vmm_mem.h index 05dc37fb9a..e6f88fb222 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_mem.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_mem.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_mem.h 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -41,9 +43,13 @@ #ifndef _VMM_MEM_H_ #define _VMM_MEM_H_ +struct vmspace; +struct vm_object; + int vmm_mem_init(void); -vm_paddr_t vmm_mem_alloc(size_t size); -void vmm_mem_free(vm_paddr_t start, size_t size); +struct vm_object *vmm_mmio_alloc(struct vmspace *, vm_paddr_t gpa, size_t len, + vm_paddr_t hpa); +void vmm_mmio_free(struct vmspace *, vm_paddr_t gpa, size_t size); vm_paddr_t vmm_mem_maxaddr(void); #endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c index 79e1cb1a44..2b612b20e9 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c @@ -11,6 +11,7 @@ /* * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. 
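vmm_mmio_alloc() above strings together the FreeBSD VM pieces needed for a pass-through BAR: wrap the host physical range in a one-entry sglist, let the SG pager expose it as an object forced to uncacheable (EPT ignores MTRRs), then map the object at the guest physical address. A hypothetical pass-through caller (gpa, bar_len, and bar_hpa are assumed locals):

        vm_object_t obj;

        /* Map a device BAR at 'gpa', backed by host physical 'bar_hpa'. */
        obj = vmm_mmio_alloc(vm_get_vmspace(vm), gpa, bar_len, bar_hpa);
        if (obj == NULL)
                return (ENOMEM);

        /* ... on teardown ... */
        vmm_mmio_free(vm_get_vmspace(vm), gpa, bar_len);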
*/ #include <sys/types.h> @@ -20,246 +21,401 @@ #include <sys/stat.h> #include <sys/vmsystm.h> #include <sys/ddi.h> -/* - * struct modctl in <sys/modctl.h> contains "void *__unused". - * Do this ugly workaround to avoid it. - */ -#undef __unused +#include <sys/mkdev.h> #include <sys/sunddi.h> #include <sys/fs/dv_node.h> +#include <sys/cpuset.h> +#include <sys/id_space.h> +#include <sys/fs/sdev_plugin.h> +#include <sys/smt.h> + +#include <sys/kernel.h> +#include <sys/hma.h> +#include <sys/x86_archext.h> #include <sys/vmm.h> #include <sys/vmm_instruction_emul.h> #include <sys/vmm_dev.h> #include <sys/vmm_impl.h> +#include <sys/vmm_drv.h> #include <vm/vm.h> #include <vm/seg_dev.h> #include "io/vatpic.h" #include "io/vioapic.h" +#include "io/vrtc.h" +#include "io/vhpet.h" #include "vmm_lapic.h" +#include "vmm_stat.h" +#include "vmm_util.h" +#include "vm/vm_glue.h" -static dev_info_t *vmm_dip; -static void *vmm_statep; +/* + * Locking details: + * + * Driver-wide data (vmmdev_*) , including HMA and sdev registration, is + * protected by vmmdev_mtx. The list of vmm_softc_t instances and related data + * (vmm_*) are protected by vmm_mtx. Actions requiring both locks must acquire + * vmmdev_mtx before vmm_mtx. The sdev plugin functions must not attempt to + * acquire vmmdev_mtx, as they could deadlock with plugin unregistration. + */ -static SLIST_HEAD(, vmm_softc) head; +static kmutex_t vmmdev_mtx; +static dev_info_t *vmmdev_dip; +static hma_reg_t *vmmdev_hma_reg; +static sdev_plugin_hdl_t vmmdev_sdev_hdl; -static kmutex_t vmmdev_mtx; +static kmutex_t vmm_mtx; +static list_t vmm_list; +static list_t vmm_destroy_list; +static id_space_t *vmm_minors; +static void *vmm_statep; -/* - * vmm trace ring - */ -int vmm_dmsg_ring_size = VMM_DMSG_RING_SIZE; -static vmm_trace_rbuf_t *vmm_debug_rbuf; -static vmm_trace_dmsg_t *vmm_trace_dmsg_alloc(void); -static void vmm_trace_dmsg_free(void); -static void vmm_trace_rbuf_alloc(void); -static void vmm_trace_rbuf_free(void); +static const char *vmmdev_hvm_name = "bhyve"; -/* - * This routine is used to manage debug messages - * on ring buffer. - */ -static vmm_trace_dmsg_t * -vmm_trace_dmsg_alloc(void) +/* For sdev plugin (/dev) */ +#define VMM_SDEV_ROOT "/dev/vmm" + +/* From uts/i86pc/io/vmm/intel/vmx.c */ +extern int vmx_x86_supported(const char **); + +/* Holds and hooks from drivers external to vmm */ +struct vmm_hold { + list_node_t vmh_node; + vmm_softc_t *vmh_sc; + boolean_t vmh_release_req; + uint_t vmh_ioport_hook_cnt; +}; + +struct vmm_lease { + list_node_t vml_node; + struct vm *vml_vm; + boolean_t vml_expired; + boolean_t (*vml_expire_func)(void *); + void *vml_expire_arg; + list_node_t vml_expire_node; + struct vmm_hold *vml_hold; +}; + +static int vmm_drv_block_hook(vmm_softc_t *, boolean_t); +static void vmm_lease_break_locked(vmm_softc_t *, vmm_lease_t *); + +static int +vmmdev_get_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) { - vmm_trace_dmsg_t *dmsg_alloc, *dmsg = vmm_debug_rbuf->dmsgp; + int error; + bool sysmem; - if (vmm_debug_rbuf->looped == TRUE) { - vmm_debug_rbuf->dmsgp = dmsg->next; - return (vmm_debug_rbuf->dmsgp); - } + error = vm_get_memseg(sc->vmm_vm, mseg->segid, &mseg->len, &sysmem, + NULL); + if (error || mseg->len == 0) + return (error); - /* - * If we're looping for the first time, - * connect the ring. 
- */ - if (((vmm_debug_rbuf->size + (sizeof (vmm_trace_dmsg_t))) > - vmm_debug_rbuf->maxsize) && (vmm_debug_rbuf->dmsgh != NULL)) { - dmsg->next = vmm_debug_rbuf->dmsgh; - vmm_debug_rbuf->dmsgp = vmm_debug_rbuf->dmsgh; - vmm_debug_rbuf->looped = TRUE; - return (vmm_debug_rbuf->dmsgp); - } - - /* If we've gotten this far then memory allocation is needed */ - dmsg_alloc = kmem_zalloc(sizeof (vmm_trace_dmsg_t), KM_NOSLEEP); - if (dmsg_alloc == NULL) { - vmm_debug_rbuf->allocfailed++; - return (dmsg_alloc); - } else { - vmm_debug_rbuf->size += sizeof (vmm_trace_dmsg_t); - } + if (!sysmem) { + vmm_devmem_entry_t *de; + list_t *dl = &sc->vmm_devmem_list; - if (vmm_debug_rbuf->dmsgp != NULL) { - dmsg->next = dmsg_alloc; - vmm_debug_rbuf->dmsgp = dmsg->next; - return (vmm_debug_rbuf->dmsgp); - } else { - /* - * We should only be here if we're initializing - * the ring buffer. - */ - if (vmm_debug_rbuf->dmsgh == NULL) { - vmm_debug_rbuf->dmsgh = dmsg_alloc; - } else { - /* Something is wrong */ - kmem_free(dmsg_alloc, sizeof (vmm_trace_dmsg_t)); - return (NULL); + for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { + if (de->vde_segid == mseg->segid) { + break; + } } - - vmm_debug_rbuf->dmsgp = dmsg_alloc; - return (vmm_debug_rbuf->dmsgp); + if (de != NULL) { + (void) strlcpy(mseg->name, de->vde_name, + sizeof (mseg->name)); + } + } else { + bzero(mseg->name, sizeof (mseg->name)); } + + return (error); } /* - * Free all messages on debug ring buffer. + * The 'devmem' hack: + * + * On native FreeBSD, bhyve consumers are allowed to create 'devmem' segments + * in the vm which appear with their own name related to the vm under /dev. + * Since this would be a hassle from an sdev perspective and would require a + * new cdev interface (or complicate the existing one), we choose to implement + * this in a different manner. When 'devmem' mappings are created, an + * identifying off_t is communicated back out to userspace. That off_t, + * residing above the normal guest memory space, can be used to mmap the + * 'devmem' mapping from the already-open vm device. */ -static void -vmm_trace_dmsg_free(void) -{ - vmm_trace_dmsg_t *dmsg_next, *dmsg = vmm_debug_rbuf->dmsgh; - while (dmsg != NULL) { - dmsg_next = dmsg->next; - kmem_free(dmsg, sizeof (vmm_trace_dmsg_t)); +static int +vmmdev_devmem_create(vmm_softc_t *sc, struct vm_memseg *mseg, const char *name) +{ + off_t map_offset; + vmm_devmem_entry_t *entry; + if (list_is_empty(&sc->vmm_devmem_list)) { + map_offset = VM_DEVMEM_START; + } else { + entry = list_tail(&sc->vmm_devmem_list); + map_offset = entry->vde_off + entry->vde_len; + if (map_offset < entry->vde_off) { + /* Do not tolerate overflow */ + return (ERANGE); + } /* - * If we've looped around the ring than we're done. + * XXXJOY: We could choose to search the list for duplicate + * names and toss an error. Since we're using the offset + * method for now, it does not make much of a difference. 
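+		 *
+		 * For illustration only (the fd and variable names here are
+		 * hypothetical): a userspace consumer would locate and map
+		 * such a segment from the already-open vm device roughly as:
+		 *
+		 *	struct vm_devmem_offset vdo = { .segid = segid };
+		 *
+		 *	(void) ioctl(vmfd, VM_DEVMEM_GETOFFSET, &vdo);
+		 *	addr = mmap(NULL, len, PROT_READ | PROT_WRITE,
+		 *	    MAP_SHARED, vmfd, vdo.offset);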
*/ - if (dmsg_next == vmm_debug_rbuf->dmsgh) { - break; - } else { - dmsg = dmsg_next; - } } + + entry = kmem_zalloc(sizeof (*entry), KM_SLEEP); + entry->vde_segid = mseg->segid; + entry->vde_len = mseg->len; + entry->vde_off = map_offset; + (void) strlcpy(entry->vde_name, name, sizeof (entry->vde_name)); + list_insert_tail(&sc->vmm_devmem_list, entry); + + return (0); } -static void -vmm_trace_rbuf_alloc(void) +static boolean_t +vmmdev_devmem_segid(vmm_softc_t *sc, off_t off, off_t len, int *segidp) { - vmm_debug_rbuf = kmem_zalloc(sizeof (vmm_trace_rbuf_t), KM_SLEEP); + list_t *dl = &sc->vmm_devmem_list; + vmm_devmem_entry_t *de = NULL; - mutex_init(&vmm_debug_rbuf->lock, NULL, MUTEX_DRIVER, NULL); + VERIFY(off >= VM_DEVMEM_START); - if (vmm_dmsg_ring_size > 0) { - vmm_debug_rbuf->maxsize = vmm_dmsg_ring_size; + for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { + /* XXX: Only hit on direct offset/length matches for now */ + if (de->vde_off == off && de->vde_len == len) { + break; + } + } + if (de == NULL) { + return (B_FALSE); } -} + *segidp = de->vde_segid; + return (B_TRUE); +} static void -vmm_trace_rbuf_free(void) +vmmdev_devmem_purge(vmm_softc_t *sc) { - vmm_trace_dmsg_free(); - mutex_destroy(&vmm_debug_rbuf->lock); - kmem_free(vmm_debug_rbuf, sizeof (vmm_trace_rbuf_t)); + vmm_devmem_entry_t *entry; + + while ((entry = list_remove_head(&sc->vmm_devmem_list)) != NULL) { + kmem_free(entry, sizeof (*entry)); + } } -static void -vmm_vtrace_log(const char *fmt, va_list ap) +static int +vmmdev_alloc_memseg(vmm_softc_t *sc, struct vm_memseg *mseg) { - vmm_trace_dmsg_t *dmsg; + int error; + bool sysmem = true; - if (vmm_debug_rbuf == NULL) { - return; + if (VM_MEMSEG_NAME(mseg)) { + sysmem = false; } + error = vm_alloc_memseg(sc->vmm_vm, mseg->segid, mseg->len, sysmem); - /* - * If max size of ring buffer is smaller than size - * required for one debug message then just return - * since we have no room for the debug message. - */ - if (vmm_debug_rbuf->maxsize < (sizeof (vmm_trace_dmsg_t))) { - return; + if (error == 0 && VM_MEMSEG_NAME(mseg)) { + /* + * Rather than create a whole fresh device from which userspace + * can mmap this segment, instead make it available at an + * offset above where the main guest memory resides. + */ + error = vmmdev_devmem_create(sc, mseg, mseg->name); + if (error != 0) { + vm_free_memseg(sc->vmm_vm, mseg->segid); + } } + return (error); +} - mutex_enter(&vmm_debug_rbuf->lock); - - /* alloc or reuse on ring buffer */ - dmsg = vmm_trace_dmsg_alloc(); +/* + * Resource Locking and Exclusion + * + * Much of bhyve depends on key portions of VM state, such as the guest memory + * map, to remain unchanged while the guest is running. As ported from + * FreeBSD, the initial strategy for this resource exclusion hinged on gating + * access to the instance vCPUs. Threads acting on a single vCPU, like those + * performing the work of actually running the guest in VMX/SVM, would lock + * only that vCPU during ioctl() entry. For ioctls which would change VM-wide + * state, all of the vCPUs would be first locked, ensuring that the + * operation(s) could complete without any other threads stumbling into + * intermediate states. + * + * This approach is largely effective for bhyve. Common operations, such as + * running the vCPUs, steer clear of lock contention. The model begins to + * break down for operations which do not occur in the context of a specific + * vCPU. LAPIC MSI delivery, for example, may be initiated from a worker + * thread in the bhyve process. 
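+ * (vmm_drv_msi(), by which a vmm_drv consumer such as viona injects
+ * MSIs, is one example of such a vCPU-less entry point.)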
In order to properly protect those vCPU-less + * operations from encountering invalid states, additional locking is required. + * This was solved by forcing those operations to lock the VM_MAXCPU-1 vCPU. + * It does mean that class of operations will be serialized on locking the + * specific vCPU and that instances sized at VM_MAXCPU will potentially see + * undue contention on the VM_MAXCPU-1 vCPU. + * + * In order to address the shortcomings of this model, the concept of a + * read/write lock has been added to bhyve. Operations which change + * fundamental aspects of a VM (such as the memory map) must acquire the write + * lock, which also implies locking all of the vCPUs and waiting for all read + * lock holders to release. While it increases the cost and waiting time for + * those few operations, it allows most hot-path operations on the VM (which + * depend on its configuration remaining stable) to occur with minimal locking. + * + * Consumers of the Driver API (see below) are a special case when it comes to + * this locking, since they may hold a read lock via the drv_lease mechanism + * for an extended period of time. Rather than forcing those consumers to + * continuously poll for a write lock attempt, the lease system forces them to + * provide a release callback to trigger their clean-up (and potential later + * reacquisition) of the read lock. + */ - if (dmsg == NULL) { - /* resource allocation failed */ - mutex_exit(&vmm_debug_rbuf->lock); - return; - } +static void +vcpu_lock_one(vmm_softc_t *sc, int vcpu) +{ + ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); - gethrestime(&dmsg->timestamp); + /* + * Since this state transition is utilizing from_idle=true, it should + * not fail, but rather block until it can be successful. + */ + VERIFY0(vcpu_set_state(sc->vmm_vm, vcpu, VCPU_FROZEN, true)); +} - (void) vsnprintf(dmsg->buf, sizeof (dmsg->buf), fmt, ap); +static void +vcpu_unlock_one(vmm_softc_t *sc, int vcpu) +{ + ASSERT(vcpu >= 0 && vcpu < VM_MAXCPU); - mutex_exit(&vmm_debug_rbuf->lock); + VERIFY3U(vcpu_get_state(sc->vmm_vm, vcpu, NULL), ==, VCPU_FROZEN); + vcpu_set_state(sc->vmm_vm, vcpu, VCPU_IDLE, false); } -void -vmm_trace_log(const char *fmt, ...) +static void +vmm_read_lock(vmm_softc_t *sc) { - va_list ap; - - va_start(ap, fmt); - vmm_vtrace_log(fmt, ap); - va_end(ap); + rw_enter(&sc->vmm_rwlock, RW_READER); } -void -vmmdev_init(void) +static void +vmm_read_unlock(vmm_softc_t *sc) { - vmm_trace_rbuf_alloc(); + rw_exit(&sc->vmm_rwlock); } -int -vmmdev_cleanup(void) +static void +vmm_write_lock(vmm_softc_t *sc) { - int error; + int maxcpus; - if (SLIST_EMPTY(&head)) - error = 0; - else - error = EBUSY; + /* First lock all the vCPUs */ + maxcpus = vm_get_maxcpus(sc->vmm_vm); + for (int vcpu = 0; vcpu < maxcpus; vcpu++) { + vcpu_lock_one(sc, vcpu); + } - if (error == 0) - vmm_trace_dmsg_free(); + mutex_enter(&sc->vmm_lease_lock); + VERIFY3U(sc->vmm_lease_blocker, !=, UINT_MAX); + sc->vmm_lease_blocker++; + if (sc->vmm_lease_blocker == 1) { + list_t *list = &sc->vmm_lease_list; + vmm_lease_t *lease = list_head(list); - return (error); + while (lease != NULL) { + boolean_t sync_break = B_FALSE; + + if (!lease->vml_expired) { + void *arg = lease->vml_expire_arg; + lease->vml_expired = B_TRUE; + sync_break = lease->vml_expire_func(arg); + } + + if (sync_break) { + vmm_lease_t *next; + + /* + * These leases which are synchronously broken + * result in vmm_read_unlock() calls from a + * different thread than the corresponding + * vmm_read_lock(). 
This is acceptable, given + * that the rwlock underpinning the whole + * mechanism tolerates the behavior. This + * flexibility is _only_ afforded to VM read + * lock (RW_READER) holders. + */ + next = list_next(list, lease); + vmm_lease_break_locked(sc, lease); + lease = next; + } else { + lease = list_next(list, lease); + } + } + } + mutex_exit(&sc->vmm_lease_lock); + + rw_enter(&sc->vmm_rwlock, RW_WRITER); + /* + * For now, the 'maxcpus' value for an instance is fixed at the + * compile-time constant of VM_MAXCPU at creation. If this changes in + * the future, allowing for dynamic vCPU resource sizing, acquisition + * of the write lock will need to be wary of such changes. + */ + VERIFY(maxcpus == vm_get_maxcpus(sc->vmm_vm)); } -int -vmmdev_do_ioctl(struct vmm_softc *sc, int cmd, intptr_t arg, int mode, - cred_t *credp, int *rvalp) +static void +vmm_write_unlock(vmm_softc_t *sc) { - int error, vcpu, state_changed; - struct vm_memory_segment seg; - struct vm_register vmreg; - struct vm_seg_desc vmsegdesc; - struct vm_run vmrun; - struct vm_exception vmexc; - struct vm_lapic_irq vmirq; - struct vm_lapic_msi vmmsi; - struct vm_ioapic_irq ioapic_irq; - struct vm_isa_irq isa_irq; - struct vm_capability vmcap; - struct vm_nmi vmnmi; - struct vm_x2apic x2apic; - struct vm_gla2gpa gg; - struct vm_activate_cpu vac; - int pincount; - int i; - - vcpu = -1; - state_changed = 0; + int maxcpus; + + mutex_enter(&sc->vmm_lease_lock); + VERIFY3U(sc->vmm_lease_blocker, !=, 0); + sc->vmm_lease_blocker--; + if (sc->vmm_lease_blocker == 0) { + cv_broadcast(&sc->vmm_lease_cv); + } + mutex_exit(&sc->vmm_lease_lock); /* - * Some VMM ioctls can operate only on vcpus that are not running. + * The VM write lock _must_ be released from the same thread it was + * acquired in, unlike the read lock. */ + VERIFY(rw_write_held(&sc->vmm_rwlock)); + rw_exit(&sc->vmm_rwlock); + + /* Unlock all the vCPUs */ + maxcpus = vm_get_maxcpus(sc->vmm_vm); + for (int vcpu = 0; vcpu < maxcpus; vcpu++) { + vcpu_unlock_one(sc, vcpu); + } +} + +static int +vmmdev_do_ioctl(vmm_softc_t *sc, int cmd, intptr_t arg, int md, + cred_t *credp, int *rvalp) +{ + int error = 0, vcpu = -1; + void *datap = (void *)arg; + enum vm_lock_type { + LOCK_NONE = 0, + LOCK_VCPU, + LOCK_READ_HOLD, + LOCK_WRITE_HOLD + } lock_type = LOCK_NONE; + + /* Acquire any exclusion resources needed for the operation. */ switch (cmd) { case VM_RUN: case VM_GET_REGISTER: case VM_SET_REGISTER: case VM_GET_SEGMENT_DESCRIPTOR: case VM_SET_SEGMENT_DESCRIPTOR: + case VM_GET_REGISTER_SET: + case VM_SET_REGISTER_SET: case VM_INJECT_EXCEPTION: case VM_GET_CAPABILITY: case VM_SET_CAPABILITY: @@ -267,494 +423,1320 @@ vmmdev_do_ioctl(struct vmm_softc *sc, int cmd, intptr_t arg, int mode, case VM_PPTDEV_MSIX: case VM_SET_X2APIC_STATE: case VM_GLA2GPA: + case VM_GLA2GPA_NOFAULT: case VM_ACTIVATE_CPU: + case VM_SET_INTINFO: + case VM_GET_INTINFO: case VM_RESTART_INSTRUCTION: /* - * XXX fragile, handle with care - * Assumes that the first field of the ioctl data is the vcpu. + * Copy in the ID of the vCPU chosen for this operation. + * Since a nefarious caller could update their struct between + * this locking and when the rest of the ioctl data is copied + * in, it is _critical_ that this local 'vcpu' variable be used + * rather than the in-struct one when performing the ioctl. 
		 */
-		if (ddi_copyin((void *)arg, &vcpu, sizeof (vcpu), mode)) {
+		if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) {
			return (EFAULT);
		}
-		if (vcpu < 0 || vcpu >= VM_MAXCPU) {
-			error = EINVAL;
-			goto done;
+		if (vcpu < 0 || vcpu >= vm_get_maxcpus(sc->vmm_vm)) {
+			return (EINVAL);
		}
+		vcpu_lock_one(sc, vcpu);
+		lock_type = LOCK_VCPU;
+		break;
+
+	case VM_REINIT:
+	case VM_BIND_PPTDEV:
+	case VM_UNBIND_PPTDEV:
+	case VM_MAP_PPTDEV_MMIO:
+	case VM_ALLOC_MEMSEG:
+	case VM_MMAP_MEMSEG:
+	case VM_WRLOCK_CYCLE:
+		vmm_write_lock(sc);
+		lock_type = LOCK_WRITE_HOLD;
+		break;
-		error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
-		if (error)
-			goto done;
+	case VM_GET_GPA_PMAP:
+	case VM_GET_MEMSEG:
+	case VM_MMAP_GETNEXT:
+	case VM_LAPIC_IRQ:
+	case VM_INJECT_NMI:
+	case VM_IOAPIC_ASSERT_IRQ:
+	case VM_IOAPIC_DEASSERT_IRQ:
+	case VM_IOAPIC_PULSE_IRQ:
+	case VM_LAPIC_MSI:
+	case VM_LAPIC_LOCAL_IRQ:
+	case VM_GET_X2APIC_STATE:
+	case VM_RTC_READ:
+	case VM_RTC_WRITE:
+	case VM_RTC_SETTIME:
+	case VM_RTC_GETTIME:
+#ifndef __FreeBSD__
+	case VM_DEVMEM_GETOFFSET:
+#endif
+		vmm_read_lock(sc);
+		lock_type = LOCK_READ_HOLD;
+		break;
-		state_changed = 1;
+	case VM_IOAPIC_PINCOUNT:
+	default:
		break;
-	case VM_MAP_MEMORY:
+	}
+
+	/* Execute the primary logic for the ioctl. */
+	switch (cmd) {
+	case VM_RUN: {
+		struct vm_run vmrun;
+
+		if (ddi_copyin(datap, &vmrun, sizeof (vmrun), md)) {
+			error = EFAULT;
+			break;
+		}
+		vmrun.cpuid = vcpu;
+
+		if (!(curthread->t_schedflag & TS_VCPU))
+			smt_mark_as_vcpu();
+
+		error = vm_run(sc->vmm_vm, &vmrun);
		/*
-		 * ioctls that operate on the entire virtual machine must
-		 * prevent all vcpus from running.
+		 * XXXJOY: I think it's necessary to do copyout, even in the
+		 * face of errors, since the exit state is communicated out.
		 */
-		error = 0;
-		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
-			error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
-			if (error)
-				break;
+		if (ddi_copyout(&vmrun, datap, sizeof (vmrun), md)) {
+			error = EFAULT;
+			break;
		}
+		break;
+	}
+	case VM_SUSPEND: {
+		struct vm_suspend vmsuspend;

-		if (error) {
-			while (--vcpu >= 0)
-				vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
-			goto done;
+		if (ddi_copyin(datap, &vmsuspend, sizeof (vmsuspend), md)) {
+			error = EFAULT;
+			break;
		}
-
-		state_changed = 2;
+		error = vm_suspend(sc->vmm_vm, vmsuspend.how);
		break;
+	}
+	case VM_REINIT:
+		if ((error = vmm_drv_block_hook(sc, B_TRUE)) != 0) {
+			/*
+			 * The VM instance should be free of driver-attached
+			 * hooks during the reinitialization process.
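+			 * vmm_drv_block_hook() refuses the block with EBUSY
+			 * while any hold still has an ioport hook installed,
+			 * so a reinit cannot pull state out from under an
+			 * in-kernel consumer.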
+			 */
+			break;
+		}
+		error = vm_reinit(sc->vmm_vm);
+		(void) vmm_drv_block_hook(sc, B_FALSE);
+		break;
+	case VM_STAT_DESC: {
+		struct vm_stat_desc statdesc;

-	default:
+		if (ddi_copyin(datap, &statdesc, sizeof (statdesc), md)) {
+			error = EFAULT;
+			break;
+		}
+		error = vmm_stat_desc_copy(statdesc.index, statdesc.desc,
+		    sizeof (statdesc.desc));
+		if (error == 0 &&
+		    ddi_copyout(&statdesc, datap, sizeof (statdesc), md)) {
+			error = EFAULT;
+			break;
+		}
		break;
	}
+	case VM_STATS_IOC: {
+		struct vm_stats vmstats;

-	switch(cmd) {
-	case VM_RUN:
-		if (ddi_copyin((void *)arg, &vmrun,
-		    sizeof (struct vm_run), mode)) {
-			return (EFAULT);
+		CTASSERT(MAX_VM_STATS >= MAX_VMM_STAT_ELEMS);
+		if (ddi_copyin(datap, &vmstats, sizeof (vmstats), md)) {
+			error = EFAULT;
+			break;
		}
-		error = vm_run(sc->vm, &vmrun);
-		if (ddi_copyout(&vmrun, (void *)arg,
-		    sizeof (struct vm_run), mode)) {
-			return (EFAULT);
+		hrt2tv(gethrtime(), &vmstats.tv);
+		error = vmm_stat_copy(sc->vmm_vm, vmstats.cpuid,
+		    &vmstats.num_entries, vmstats.statbuf);
+		if (error == 0 &&
+		    ddi_copyout(&vmstats, datap, sizeof (vmstats), md)) {
+			error = EFAULT;
+			break;
		}
		break;
-	case VM_LAPIC_IRQ:
-		if (ddi_copyin((void *)arg, &vmirq,
-		    sizeof (struct vm_lapic_irq), mode)) {
-			return (EFAULT);
+	}
+
+	/* XXXJOY: punt on these for now */
+	case VM_PPTDEV_MSI: {
+		struct vm_pptdev_msi pptmsi;
+
+		if (ddi_copyin(datap, &pptmsi, sizeof (pptmsi), md)) {
+			error = EFAULT;
+			break;
		}
-		error = lapic_intr_edge(sc->vm, vmirq.cpuid, vmirq.vector);
-		if (ddi_copyout(&vmirq, (void *)arg,
-		    sizeof (struct vm_lapic_irq), mode)) {
-			return (EFAULT);
+		error = ENOTTY;
+		break;
+	}
+	case VM_PPTDEV_MSIX: {
+		struct vm_pptdev_msix pptmsix;
+
+		if (ddi_copyin(datap, &pptmsix, sizeof (pptmsix), md)) {
+			error = EFAULT;
+			break;
		}
-		break;
-	case VM_LAPIC_LOCAL_IRQ:
-		if (ddi_copyin((void *)arg, &vmirq,
-		    sizeof (struct vm_lapic_irq), mode)) {
-			return (EFAULT);
+		error = ENOTTY;
+		break;
+	}
+	case VM_MAP_PPTDEV_MMIO: {
+		struct vm_pptdev_mmio pptmmio;
+
+		if (ddi_copyin(datap, &pptmmio, sizeof (pptmmio), md)) {
+			error = EFAULT;
+			break;
		}
-		error = lapic_set_local_intr(sc->vm, vmirq.cpuid,
-		    vmirq.vector);
-		if (ddi_copyout(&vmirq, (void *)arg,
-		    sizeof (struct vm_lapic_irq), mode)) {
-			return (EFAULT);
+		error = ENOTTY;
+		break;
+	}
+	case VM_BIND_PPTDEV:
+	case VM_UNBIND_PPTDEV: {
+		struct vm_pptdev pptdev;
+
+		if (ddi_copyin(datap, &pptdev, sizeof (pptdev), md)) {
+			error = EFAULT;
+			break;
+		}
+		error = ENOTTY;
+		break;
+	}
+
+	case VM_INJECT_EXCEPTION: {
+		struct vm_exception vmexc;
+
+		if (ddi_copyin(datap, &vmexc, sizeof (vmexc), md)) {
+			error = EFAULT;
+			break;
		}
+		error = vm_inject_exception(sc->vmm_vm, vcpu, vmexc.vector,
+		    vmexc.error_code_valid, vmexc.error_code,
+		    vmexc.restart_instruction);
		break;
-	case VM_LAPIC_MSI:
-		if (ddi_copyin((void *)arg, &vmmsi,
-		    sizeof (struct vm_lapic_msi), mode)) {
-			return (EFAULT);
+	}
+	case VM_INJECT_NMI: {
+		struct vm_nmi vmnmi;
+
+		if (ddi_copyin(datap, &vmnmi, sizeof (vmnmi), md)) {
+			error = EFAULT;
+			break;
		}
-		error = lapic_intr_msi(sc->vm, vmmsi.addr, vmmsi.msg);
-		if (ddi_copyout(&vmmsi, (void *)arg,
-		    sizeof (struct vm_lapic_msi), mode)) {
-			return (EFAULT);
+		error = vm_inject_nmi(sc->vmm_vm, vmnmi.cpuid);
+		break;
+	}
+	case VM_LAPIC_IRQ: {
+		struct vm_lapic_irq vmirq;
+
+		if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) {
+			error = EFAULT;
+			break;
		}
-	case VM_IOAPIC_ASSERT_IRQ:
-		if (ddi_copyin((void *)arg, &ioapic_irq,
-		    sizeof (struct vm_ioapic_irq), mode)) {
-			return (EFAULT);
+		error = lapic_intr_edge(sc->vmm_vm,
vmirq.cpuid, vmirq.vector); + break; + } + case VM_LAPIC_LOCAL_IRQ: { + struct vm_lapic_irq vmirq; + + if (ddi_copyin(datap, &vmirq, sizeof (vmirq), md)) { + error = EFAULT; + break; } - error = vioapic_assert_irq(sc->vm, ioapic_irq.irq);; - if (ddi_copyout(&ioapic_irq, (void *)arg, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + error = lapic_set_local_intr(sc->vmm_vm, vmirq.cpuid, + vmirq.vector); + break; + } + case VM_LAPIC_MSI: { + struct vm_lapic_msi vmmsi; + + if (ddi_copyin(datap, &vmmsi, sizeof (vmmsi), md)) { + error = EFAULT; + break; } + error = lapic_intr_msi(sc->vmm_vm, vmmsi.addr, vmmsi.msg); break; - case VM_IOAPIC_DEASSERT_IRQ: - if (ddi_copyin((void *)arg, &ioapic_irq, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + } + + case VM_IOAPIC_ASSERT_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; } - error = vioapic_deassert_irq(sc->vm, ioapic_irq.irq); - if (ddi_copyout(&ioapic_irq, (void *)arg, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + error = vioapic_assert_irq(sc->vmm_vm, ioapic_irq.irq); + break; + } + case VM_IOAPIC_DEASSERT_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; } + error = vioapic_deassert_irq(sc->vmm_vm, ioapic_irq.irq); break; - case VM_IOAPIC_PULSE_IRQ: - if (ddi_copyin((void *)arg, &ioapic_irq, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + } + case VM_IOAPIC_PULSE_IRQ: { + struct vm_ioapic_irq ioapic_irq; + + if (ddi_copyin(datap, &ioapic_irq, sizeof (ioapic_irq), md)) { + error = EFAULT; + break; } - error = vioapic_pulse_irq(sc->vm, ioapic_irq.irq); - if (ddi_copyout(&ioapic_irq, (void *)arg, - sizeof (struct vm_ioapic_irq), mode)) { - return (EFAULT); + error = vioapic_pulse_irq(sc->vmm_vm, ioapic_irq.irq); + break; + } + case VM_IOAPIC_PINCOUNT: { + int pincount; + + pincount = vioapic_pincount(sc->vmm_vm); + if (ddi_copyout(&pincount, datap, sizeof (int), md)) { + error = EFAULT; + break; } break; - case VM_IOAPIC_PINCOUNT: - error = 0; - pincount = vioapic_pincount(sc->vm); - if (ddi_copyout(&pincount, (void *)arg, sizeof (int), mode)) { - return (EFAULT); + } + + case VM_ISA_ASSERT_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; + } + error = vatpic_assert_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_assert_irq(sc->vmm_vm, + isa_irq.ioapic_irq); } break; - case VM_ISA_ASSERT_IRQ: - if (ddi_copyin((void *)arg, &isa_irq, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); + } + case VM_ISA_DEASSERT_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) { + error = EFAULT; + break; } - error = vatpic_assert_irq(sc->vm, isa_irq.atpic_irq); - if (error == 0 && isa_irq.ioapic_irq != -1) - error = vioapic_assert_irq(sc->vm, + error = vatpic_deassert_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_deassert_irq(sc->vmm_vm, isa_irq.ioapic_irq); - if (ddi_copyout(&isa_irq, (void *)arg, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); - } break; - case VM_ISA_DEASSERT_IRQ: - if (ddi_copyin((void *)arg, &isa_irq, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); + } + case VM_ISA_PULSE_IRQ: { + struct vm_isa_irq isa_irq; + + if (ddi_copyin(datap, &isa_irq, sizeof (isa_irq), md)) 
{ + error = EFAULT; + break; } - error = vatpic_deassert_irq(sc->vm, isa_irq.atpic_irq); - if (error == 0 && isa_irq.ioapic_irq != -1) - error = vioapic_deassert_irq(sc->vm, + error = vatpic_pulse_irq(sc->vmm_vm, isa_irq.atpic_irq); + if (error == 0 && isa_irq.ioapic_irq != -1) { + error = vioapic_pulse_irq(sc->vmm_vm, isa_irq.ioapic_irq); - if (ddi_copyout(&isa_irq, (void *)arg, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); - } break; - case VM_ISA_PULSE_IRQ: - if (ddi_copyin((void *)arg, &isa_irq, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); + } + case VM_ISA_SET_IRQ_TRIGGER: { + struct vm_isa_irq_trigger isa_irq_trigger; + + if (ddi_copyin(datap, &isa_irq_trigger, + sizeof (isa_irq_trigger), md)) { + error = EFAULT; + break; } - error = vatpic_pulse_irq(sc->vm, isa_irq.atpic_irq); - if (error == 0 && isa_irq.ioapic_irq != -1) - error = vioapic_pulse_irq(sc->vm, isa_irq.ioapic_irq); - if (ddi_copyout(&isa_irq, (void *)arg, - sizeof (struct vm_isa_irq), mode)) { - return (EFAULT); - + error = vatpic_set_irq_trigger(sc->vmm_vm, + isa_irq_trigger.atpic_irq, isa_irq_trigger.trigger); + break; + } + + case VM_MMAP_GETNEXT: { + struct vm_memmap mm; + + if (ddi_copyin(datap, &mm, sizeof (mm), md)) { + error = EFAULT; + break; + } + error = vm_mmap_getnext(sc->vmm_vm, &mm.gpa, &mm.segid, + &mm.segoff, &mm.len, &mm.prot, &mm.flags); + if (error == 0 && ddi_copyout(&mm, datap, sizeof (mm), md)) { + error = EFAULT; + break; } break; - case VM_MAP_MEMORY: - if (ddi_copyin((void *)arg, &seg, - sizeof (struct vm_memory_segment), mode)) { - return (EFAULT); + } + case VM_MMAP_MEMSEG: { + struct vm_memmap mm; + + if (ddi_copyin(datap, &mm, sizeof (mm), md)) { + error = EFAULT; + break; } - error = vm_malloc(sc->vm, seg.gpa, seg.len); + error = vm_mmap_memseg(sc->vmm_vm, mm.gpa, mm.segid, mm.segoff, + mm.len, mm.prot, mm.flags); break; - case VM_GET_MEMORY_SEG: - if (ddi_copyin((void *)arg, &seg, - sizeof (struct vm_memory_segment), mode)) { - return (EFAULT); + } + case VM_ALLOC_MEMSEG: { + struct vm_memseg vmseg; + + if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { + error = EFAULT; + break; } - seg.len = 0; - (void)vm_gpabase2memseg(sc->vm, seg.gpa, &seg); - if (ddi_copyout(&seg, (void *)arg, - sizeof (struct vm_memory_segment), mode)) { - return (EFAULT); + error = vmmdev_alloc_memseg(sc, &vmseg); + break; + } + case VM_GET_MEMSEG: { + struct vm_memseg vmseg; + + if (ddi_copyin(datap, &vmseg, sizeof (vmseg), md)) { + error = EFAULT; + break; + } + error = vmmdev_get_memseg(sc, &vmseg); + if (error == 0 && + ddi_copyout(&vmseg, datap, sizeof (vmseg), md)) { + error = EFAULT; + break; } - error = 0; break; - case VM_GET_REGISTER: - if (ddi_copyin((void *)arg, &vmreg, - sizeof (struct vm_register), mode)) { - return (EFAULT); + } + case VM_GET_REGISTER: { + struct vm_register vmreg; + + if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { + error = EFAULT; + break; } - error = vm_get_register(sc->vm, vmreg.cpuid, vmreg.regnum, - &vmreg.regval); - if (!error) { - if (ddi_copyout(&vmreg, (void *)arg, - sizeof (struct vm_register), mode)) { - return (EFAULT); - } + error = vm_get_register(sc->vmm_vm, vcpu, vmreg.regnum, + &vmreg.regval); + if (error == 0 && + ddi_copyout(&vmreg, datap, sizeof (vmreg), md)) { + error = EFAULT; + break; } break; - case VM_SET_REGISTER: - if (ddi_copyin((void *)arg, &vmreg, - sizeof (struct vm_register), mode)) { - return (EFAULT); + } + case VM_SET_REGISTER: { + struct vm_register vmreg; + + if (ddi_copyin(datap, &vmreg, sizeof (vmreg), md)) { + 
error = EFAULT; + break; } - error = vm_set_register(sc->vm, vmreg.cpuid, vmreg.regnum, - vmreg.regval); + error = vm_set_register(sc->vmm_vm, vcpu, vmreg.regnum, + vmreg.regval); break; - case VM_SET_SEGMENT_DESCRIPTOR: - if (ddi_copyin((void *)arg, &vmsegdesc, - sizeof (struct vm_seg_desc), mode)) { - return (EFAULT); + } + case VM_SET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc vmsegd; + + if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { + error = EFAULT; + break; } - error = vm_set_seg_desc(sc->vm, vmsegdesc.cpuid, - vmsegdesc.regnum, - &vmsegdesc.desc); + error = vm_set_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, + &vmsegd.desc); break; - case VM_GET_SEGMENT_DESCRIPTOR: - if (ddi_copyin((void *)arg, &vmsegdesc, - sizeof (struct vm_seg_desc), mode)) { - return (EFAULT); + } + case VM_GET_SEGMENT_DESCRIPTOR: { + struct vm_seg_desc vmsegd; + + if (ddi_copyin(datap, &vmsegd, sizeof (vmsegd), md)) { + error = EFAULT; + break; } - error = vm_get_seg_desc(sc->vm, vmsegdesc.cpuid, - vmsegdesc.regnum, - &vmsegdesc.desc); - if (!error) { - if (ddi_copyout(&vmsegdesc, (void *)arg, - sizeof (struct vm_seg_desc), mode)) { - return (EFAULT); + error = vm_get_seg_desc(sc->vmm_vm, vcpu, vmsegd.regnum, + &vmsegd.desc); + if (error == 0 && + ddi_copyout(&vmsegd, datap, sizeof (vmsegd), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_REGISTER_SET: { + struct vm_register_set vrs; + int regnums[VM_REG_LAST]; + uint64_t regvals[VM_REG_LAST]; + + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { + error = EFAULT; + break; + } + if (vrs.count > VM_REG_LAST || vrs.count == 0) { + error = EINVAL; + break; + } + if (ddi_copyin(vrs.regnums, regnums, + sizeof (int) * vrs.count, md)) { + error = EFAULT; + break; + } + + error = 0; + for (uint_t i = 0; i < vrs.count && error == 0; i++) { + if (regnums[i] < 0) { + error = EINVAL; + break; } + error = vm_get_register(sc->vmm_vm, vcpu, regnums[i], + ®vals[i]); + } + if (error == 0 && ddi_copyout(regvals, vrs.regvals, + sizeof (uint64_t) * vrs.count, md)) { + error = EFAULT; } break; - case VM_GET_CAPABILITY: - if (ddi_copyin((void *)arg, &vmcap, - sizeof (struct vm_capability), mode)) { - return (EFAULT); + } + case VM_SET_REGISTER_SET: { + struct vm_register_set vrs; + int regnums[VM_REG_LAST]; + uint64_t regvals[VM_REG_LAST]; + + if (ddi_copyin(datap, &vrs, sizeof (vrs), md)) { + error = EFAULT; + break; } - error = vm_get_capability(sc->vm, vmcap.cpuid, - vmcap.captype, - &vmcap.capval); - if (!error) { - if (ddi_copyout(&vmcap, (void *)arg, - sizeof (struct vm_capability), mode)) { - return (EFAULT); + if (vrs.count > VM_REG_LAST || vrs.count == 0) { + error = EINVAL; + break; + } + if (ddi_copyin(vrs.regnums, regnums, + sizeof (int) * vrs.count, md)) { + error = EFAULT; + break; + } + if (ddi_copyin(vrs.regvals, regvals, + sizeof (uint64_t) * vrs.count, md)) { + error = EFAULT; + break; + } + + error = 0; + for (uint_t i = 0; i < vrs.count && error == 0; i++) { + /* + * Setting registers in a set is not atomic, since a + * failure in the middle of the set will cause a + * bail-out and inconsistent register state. Callers + * should be wary of this. 
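+			 * For example, if the third of five regnums is
+			 * invalid, the first two registers remain modified
+			 * when EINVAL is returned.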
+ */ + if (regnums[i] < 0) { + error = EINVAL; + break; } + error = vm_set_register(sc->vmm_vm, vcpu, regnums[i], + regvals[i]); } break; - case VM_SET_CAPABILITY: - if (ddi_copyin((void *)arg, &vmcap, - sizeof (struct vm_capability), mode)) { - return (EFAULT); + } + + case VM_GET_CAPABILITY: { + struct vm_capability vmcap; + + if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { + error = EFAULT; + break; + } + error = vm_get_capability(sc->vmm_vm, vcpu, vmcap.captype, + &vmcap.capval); + if (error == 0 && + ddi_copyout(&vmcap, datap, sizeof (vmcap), md)) { + error = EFAULT; + break; } - error = vm_set_capability(sc->vm, vmcap.cpuid, - vmcap.captype, - vmcap.capval); break; - case VM_SET_X2APIC_STATE: - if (ddi_copyin((void *)arg, &x2apic, - sizeof (struct vm_x2apic), mode)) { - return (EFAULT); + } + case VM_SET_CAPABILITY: { + struct vm_capability vmcap; + + if (ddi_copyin(datap, &vmcap, sizeof (vmcap), md)) { + error = EFAULT; + break; } - error = vm_set_x2apic_state(sc->vm, - x2apic.cpuid, x2apic.state); + error = vm_set_capability(sc->vmm_vm, vcpu, vmcap.captype, + vmcap.capval); break; - case VM_GET_X2APIC_STATE: - if (ddi_copyin((void *)arg, &x2apic, - sizeof (struct vm_x2apic), mode)) { - return (EFAULT); + } + case VM_SET_X2APIC_STATE: { + struct vm_x2apic x2apic; + + if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { + error = EFAULT; + break; } - error = vm_get_x2apic_state(sc->vm, - x2apic.cpuid, &x2apic.state); - if (!error) { - if (ddi_copyout(&x2apic, (void *)arg, - sizeof (struct vm_x2apic), mode)) { - return (EFAULT); - } + error = vm_set_x2apic_state(sc->vmm_vm, vcpu, x2apic.state); + break; + } + case VM_GET_X2APIC_STATE: { + struct vm_x2apic x2apic; + + if (ddi_copyin(datap, &x2apic, sizeof (x2apic), md)) { + error = EFAULT; + break; + } + error = vm_get_x2apic_state(sc->vmm_vm, x2apic.cpuid, + &x2apic.state); + if (error == 0 && + ddi_copyout(&x2apic, datap, sizeof (x2apic), md)) { + error = EFAULT; + break; + } + break; + } + case VM_GET_GPA_PMAP: { + struct vm_gpa_pte gpapte; + + if (ddi_copyin(datap, &gpapte, sizeof (gpapte), md)) { + error = EFAULT; + break; + } +#ifdef __FreeBSD__ + /* XXXJOY: add function? 
*/ + pmap_get_mapping(vmspace_pmap(vm_get_vmspace(sc->vmm_vm)), + gpapte.gpa, gpapte.pte, &gpapte.ptenum); +#endif + error = 0; + break; + } + case VM_GET_HPET_CAPABILITIES: { + struct vm_hpet_cap hpetcap; + + error = vhpet_getcap(&hpetcap); + if (error == 0 && + ddi_copyout(&hpetcap, datap, sizeof (hpetcap), md)) { + error = EFAULT; + break; } break; + } case VM_GLA2GPA: { + struct vm_gla2gpa gg; + CTASSERT(PROT_READ == VM_PROT_READ); CTASSERT(PROT_WRITE == VM_PROT_WRITE); CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); - if (ddi_copyin((void *)arg, &gg, - sizeof (struct vm_gla2gpa), mode)) { - return (EFAULT); + + if (ddi_copyin(datap, &gg, sizeof (gg), md)) { + error = EFAULT; + break; } - error = vm_gla2gpa(sc->vm, gg.vcpuid, &gg.paging, gg.gla, - gg.prot, &gg.gpa); - KASSERT(error == 0 || error == 1 || error == -1, - ("%s: vm_gla2gpa unknown error %d", __func__, error)); - if (error >= 0) { - /* - * error = 0: the translation was successful - * error = 1: a fault was injected into the guest - */ - gg.fault = error; - error = 0; - if (ddi_copyout(&gg, (void *)arg, - sizeof (struct vm_gla2gpa), mode)) { - return (EFAULT); - } - } else { + gg.vcpuid = vcpu; + error = vm_gla2gpa(sc->vmm_vm, vcpu, &gg.paging, gg.gla, + gg.prot, &gg.gpa, &gg.fault); + if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { error = EFAULT; + break; } break; } + case VM_GLA2GPA_NOFAULT: { + struct vm_gla2gpa gg; + + CTASSERT(PROT_READ == VM_PROT_READ); + CTASSERT(PROT_WRITE == VM_PROT_WRITE); + CTASSERT(PROT_EXEC == VM_PROT_EXECUTE); + + if (ddi_copyin(datap, &gg, sizeof (gg), md)) { + error = EFAULT; + break; + } + gg.vcpuid = vcpu; + error = vm_gla2gpa_nofault(sc->vmm_vm, vcpu, &gg.paging, + gg.gla, gg.prot, &gg.gpa, &gg.fault); + if (error == 0 && ddi_copyout(&gg, datap, sizeof (gg), md)) { + error = EFAULT; + break; + } + break; + } + case VM_ACTIVATE_CPU: - if (ddi_copyin((void *)arg, &vac, - sizeof (struct vm_activate_cpu), mode)) { - return (EFAULT); + error = vm_activate_cpu(sc->vmm_vm, vcpu); + break; + + case VM_SUSPEND_CPU: + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { + error = EFAULT; + } else { + error = vm_suspend_cpu(sc->vmm_vm, vcpu); } - error = vm_activate_cpu(sc->vm, vac.vcpuid); break; + + case VM_RESUME_CPU: + if (ddi_copyin(datap, &vcpu, sizeof (vcpu), md)) { + error = EFAULT; + } else { + error = vm_resume_cpu(sc->vmm_vm, vcpu); + } + break; + + case VM_GET_CPUS: { + struct vm_cpuset vm_cpuset; + cpuset_t tempset; + void *srcp = &tempset; + int size; + + if (ddi_copyin(datap, &vm_cpuset, sizeof (vm_cpuset), md)) { + error = EFAULT; + break; + } + + /* Be more generous about sizing since our cpuset_t is large. */ + size = vm_cpuset.cpusetsize; + if (size <= 0 || size > sizeof (cpuset_t)) { + error = ERANGE; + } + /* + * If they want a ulong_t or less, make sure they receive the + * low bits with all the useful information. 
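+		 * A caller passing cpusetsize == sizeof (ulong_t), for
+		 * example, is served from tempset.cpub[0] rather than from
+		 * the start of the full (much larger) structure.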
+ */ + if (size <= sizeof (tempset.cpub[0])) { + srcp = &tempset.cpub[0]; + } + + if (vm_cpuset.which == VM_ACTIVE_CPUS) { + tempset = vm_active_cpus(sc->vmm_vm); + } else if (vm_cpuset.which == VM_SUSPENDED_CPUS) { + tempset = vm_suspended_cpus(sc->vmm_vm); + } else if (vm_cpuset.which == VM_DEBUG_CPUS) { + tempset = vm_debug_cpus(sc->vmm_vm); + } else { + error = EINVAL; + } + + ASSERT(size > 0 && size <= sizeof (tempset)); + if (error == 0 && + ddi_copyout(srcp, vm_cpuset.cpus, size, md)) { + error = EFAULT; + break; + } + break; + } + case VM_SET_INTINFO: { + struct vm_intinfo vmii; + + if (ddi_copyin(datap, &vmii, sizeof (vmii), md)) { + error = EFAULT; + break; + } + error = vm_exit_intinfo(sc->vmm_vm, vcpu, vmii.info1); + break; + } + case VM_GET_INTINFO: { + struct vm_intinfo vmii; + + vmii.vcpuid = vcpu; + error = vm_get_intinfo(sc->vmm_vm, vcpu, &vmii.info1, + &vmii.info2); + if (error == 0 && + ddi_copyout(&vmii, datap, sizeof (vmii), md)) { + error = EFAULT; + break; + } + break; + } + case VM_RTC_WRITE: { + struct vm_rtc_data rtcdata; + + if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + error = vrtc_nvram_write(sc->vmm_vm, rtcdata.offset, + rtcdata.value); + break; + } + case VM_RTC_READ: { + struct vm_rtc_data rtcdata; + + if (ddi_copyin(datap, &rtcdata, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + error = vrtc_nvram_read(sc->vmm_vm, rtcdata.offset, + &rtcdata.value); + if (error == 0 && + ddi_copyout(&rtcdata, datap, sizeof (rtcdata), md)) { + error = EFAULT; + break; + } + break; + } + case VM_RTC_SETTIME: { + struct vm_rtc_time rtctime; + + if (ddi_copyin(datap, &rtctime, sizeof (rtctime), md)) { + error = EFAULT; + break; + } + error = vrtc_set_time(sc->vmm_vm, rtctime.secs); + break; + } + case VM_RTC_GETTIME: { + struct vm_rtc_time rtctime; + + rtctime.secs = vrtc_get_time(sc->vmm_vm); + if (ddi_copyout(&rtctime, datap, sizeof (rtctime), md)) { + error = EFAULT; + break; + } + break; + } + case VM_RESTART_INSTRUCTION: - error = vm_restart_instruction(sc->vm, vcpu); + error = vm_restart_instruction(sc->vmm_vm, vcpu); break; + + case VM_SET_TOPOLOGY: { + struct vm_cpu_topology topo; + + if (ddi_copyin(datap, &topo, sizeof (topo), md) != 0) { + error = EFAULT; + break; + } + error = vm_set_topology(sc->vmm_vm, topo.sockets, topo.cores, + topo.threads, topo.maxcpus); + break; + } + case VM_GET_TOPOLOGY: { + struct vm_cpu_topology topo; + + vm_get_topology(sc->vmm_vm, &topo.sockets, &topo.cores, + &topo.threads, &topo.maxcpus); + if (ddi_copyout(&topo, datap, sizeof (topo), md) != 0) { + error = EFAULT; + break; + } + break; + } + +#ifndef __FreeBSD__ + case VM_DEVMEM_GETOFFSET: { + struct vm_devmem_offset vdo; + list_t *dl = &sc->vmm_devmem_list; + vmm_devmem_entry_t *de = NULL; + + if (ddi_copyin(datap, &vdo, sizeof (vdo), md) != 0) { + error = EFAULT; + break; + } + + for (de = list_head(dl); de != NULL; de = list_next(dl, de)) { + if (de->vde_segid == vdo.segid) { + break; + } + } + if (de != NULL) { + vdo.offset = de->vde_off; + if (ddi_copyout(&vdo, datap, sizeof (vdo), md) != 0) { + error = EFAULT; + } + } else { + error = ENOENT; + } + break; + } + case VM_WRLOCK_CYCLE: { + /* + * Present a test mechanism to acquire/release the write lock + * on the VM without any other effects. 
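+		 * A test (hypothetical fd name) can simply issue
+		 * ioctl(vmfd, VM_WRLOCK_CYCLE, 0) and expect 0 back once
+		 * the write lock has been acquired and released.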
+		 */
+		break;
+	}
+#endif
	default:
		error = ENOTTY;
		break;
	}

-	if (state_changed == 1) {
-		vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
-	} else if (state_changed == 2) {
-		for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
-			vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
+	/* Release exclusion resources */
+	switch (lock_type) {
+	case LOCK_NONE:
+		break;
+	case LOCK_VCPU:
+		vcpu_unlock_one(sc, vcpu);
+		break;
+	case LOCK_READ_HOLD:
+		vmm_read_unlock(sc);
+		break;
+	case LOCK_WRITE_HOLD:
+		vmm_write_unlock(sc);
+		break;
+	default:
+		panic("unexpected lock type");
+		break;
	}

-done:
-	/* Make sure that no handler returns a bogus value like ERESTART */
-	KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
	return (error);
}

-static
-minor_t vmm_find_free_minor(void)
+static vmm_softc_t *
+vmm_lookup(const char *name)
{
-	minor_t minor;
+	list_t *vml = &vmm_list;
+	vmm_softc_t *sc;
+
+	ASSERT(MUTEX_HELD(&vmm_mtx));

-	for (minor = 1; ; minor++) {
-		if (ddi_get_soft_state(vmm_statep, minor) == NULL)
+	for (sc = list_head(vml); sc != NULL; sc = list_next(vml, sc)) {
+		if (strcmp(sc->vmm_name, name) == 0) {
			break;
+		}
	}

-	return (minor);
+	return (sc);
}

-int
-vmmdev_do_vm_create(dev_info_t *dip, char *name)
+static int
+vmmdev_do_vm_create(char *name, cred_t *cr)
{
-	struct vmm_softc *sc;
-	minor_t minor;
-	int error;
-
-	mutex_enter(&vmmdev_mtx);
+	vmm_softc_t	*sc = NULL;
+	minor_t		minor;
+	int		error = ENOMEM;

-	if (strlen(name) >= VM_MAX_NAMELEN) {
-		mutex_exit(&vmmdev_mtx);
+	if (strnlen(name, VM_MAX_NAMELEN) >= VM_MAX_NAMELEN) {
		return (EINVAL);
	}

-	minor = vmm_find_free_minor();
-	if (ddi_soft_state_zalloc(vmm_statep, minor) == DDI_FAILURE) {
-		mutex_exit(&vmmdev_mtx);
-		return (DDI_FAILURE);
+	mutex_enter(&vmm_mtx);
+
+	/* Look for duplicate names */
+	if (vmm_lookup(name) != NULL) {
+		mutex_exit(&vmm_mtx);
+		return (EEXIST);
	}

-	if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
-		ddi_soft_state_free(vmm_statep, minor);
-		mutex_exit(&vmmdev_mtx);
-		return (DDI_FAILURE);
+	/* Allow only one instance per non-global zone. */
+	if (!INGLOBALZONE(curproc)) {
+		for (sc = list_head(&vmm_list); sc != NULL;
+		    sc = list_next(&vmm_list, sc)) {
+			if (sc->vmm_zone == curzone) {
+				mutex_exit(&vmm_mtx);
+				return (EINVAL);
+			}
+		}
	}

-	strcpy(sc->name, name);
-	sc->minor = minor;
-	if (ddi_create_minor_node(dip, name, S_IFCHR, minor,
-	    DDI_PSEUDO, 0) == DDI_FAILURE) {
+	minor = id_alloc(vmm_minors);
+	if (ddi_soft_state_zalloc(vmm_statep, minor) != DDI_SUCCESS) {
+		goto fail;
+	} else if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
		ddi_soft_state_free(vmm_statep, minor);
-		mutex_exit(&vmmdev_mtx);
-		return (DDI_FAILURE);
+		goto fail;
+	} else if (ddi_create_minor_node(vmmdev_dip, name, S_IFCHR, minor,
+	    DDI_PSEUDO, 0) != DDI_SUCCESS) {
+		goto fail;
	}

-	error = vm_create(name, &sc->vm);
-	if (error != 0) {
+	error = vm_create(name, &sc->vmm_vm);
+	if (error == 0) {
+		/* Complete VM initialization and report success.
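+		 * (The new minor node is exposed to userspace as
+		 * /dev/vmm/<name> by the sdev plugin registered in
+		 * vmm_attach().)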
*/ + (void) strlcpy(sc->vmm_name, name, sizeof (sc->vmm_name)); + sc->vmm_minor = minor; + list_create(&sc->vmm_devmem_list, sizeof (vmm_devmem_entry_t), + offsetof(vmm_devmem_entry_t, vde_node)); + + list_create(&sc->vmm_holds, sizeof (vmm_hold_t), + offsetof(vmm_hold_t, vmh_node)); + cv_init(&sc->vmm_cv, NULL, CV_DEFAULT, NULL); + + mutex_init(&sc->vmm_lease_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&sc->vmm_lease_list, sizeof (vmm_lease_t), + offsetof(vmm_lease_t, vml_node)); + cv_init(&sc->vmm_lease_cv, NULL, CV_DEFAULT, NULL); + rw_init(&sc->vmm_rwlock, NULL, RW_DEFAULT, NULL); + + sc->vmm_zone = crgetzone(cr); + zone_hold(sc->vmm_zone); + vmm_zsd_add_vm(sc); + + list_insert_tail(&vmm_list, sc); + mutex_exit(&vmm_mtx); + return (0); + } + + ddi_remove_minor_node(vmmdev_dip, name); +fail: + id_free(vmm_minors, minor); + if (sc != NULL) { ddi_soft_state_free(vmm_statep, minor); - ddi_remove_minor_node(dip, name); - mutex_exit(&vmmdev_mtx); - return (error); } - SLIST_INSERT_HEAD(&head, sc, link); + mutex_exit(&vmm_mtx); + + return (error); +} + +/* + * Bhyve 'Driver' Interface + * + * While many devices are emulated in the bhyve userspace process, there are + * others with performance constraints which require that they run mostly or + * entirely in-kernel. For those not integrated directly into bhyve, an API is + * needed so they can query/manipulate the portions of VM state needed to + * fulfill their purpose. + * + * This includes: + * - Translating guest-physical addresses to host-virtual pointers + * - Injecting MSIs + * - Hooking IO port addresses + * + * The vmm_drv interface exists to provide that functionality to its consumers. + * (At this time, 'viona' is the only user) + */ +int +vmm_drv_hold(file_t *fp, cred_t *cr, vmm_hold_t **holdp) +{ + vnode_t *vp = fp->f_vnode; + const dev_t dev = vp->v_rdev; + vmm_softc_t *sc; + vmm_hold_t *hold; + int err = 0; + + if (vp->v_type != VCHR) { + return (ENXIO); + } + const major_t major = getmajor(dev); + const minor_t minor = getminor(dev); + mutex_enter(&vmmdev_mtx); + if (vmmdev_dip == NULL || major != ddi_driver_major(vmmdev_dip)) { + mutex_exit(&vmmdev_mtx); + return (ENOENT); + } + mutex_enter(&vmm_mtx); mutex_exit(&vmmdev_mtx); - return (0); + if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) { + err = ENOENT; + goto out; + } + /* XXXJOY: check cred permissions against instance */ + + if ((sc->vmm_flags & (VMM_CLEANUP|VMM_PURGED|VMM_DESTROY)) != 0) { + err = EBUSY; + goto out; + } + + hold = kmem_zalloc(sizeof (*hold), KM_SLEEP); + hold->vmh_sc = sc; + hold->vmh_release_req = B_FALSE; + + list_insert_tail(&sc->vmm_holds, hold); + sc->vmm_flags |= VMM_HELD; + *holdp = hold; + +out: + mutex_exit(&vmm_mtx); + return (err); } -static struct vmm_softc * -vmm_lookup(char *name) +void +vmm_drv_rele(vmm_hold_t *hold) { - struct vmm_softc *sc; - - SLIST_FOREACH(sc, &head, link) { - if (strcmp(sc->name, name) == 0) { - break; - } + vmm_softc_t *sc; + + ASSERT(hold != NULL); + ASSERT(hold->vmh_sc != NULL); + VERIFY(hold->vmh_ioport_hook_cnt == 0); + + mutex_enter(&vmm_mtx); + sc = hold->vmh_sc; + list_remove(&sc->vmm_holds, hold); + if (list_is_empty(&sc->vmm_holds)) { + sc->vmm_flags &= ~VMM_HELD; + cv_broadcast(&sc->vmm_cv); } + mutex_exit(&vmm_mtx); + kmem_free(hold, sizeof (*hold)); +} - return (sc); +boolean_t +vmm_drv_release_reqd(vmm_hold_t *hold) +{ + ASSERT(hold != NULL); + return (hold->vmh_release_req); } -struct vm * -vm_lookup_by_name(char *name) +vmm_lease_t * +vmm_drv_lease_sign(vmm_hold_t *hold, boolean_t 
(*expiref)(void *), void *arg) { - struct vmm_softc *sc; + vmm_softc_t *sc = hold->vmh_sc; + vmm_lease_t *lease; - mutex_enter(&vmmdev_mtx); + ASSERT3P(expiref, !=, NULL); - if ((sc = vmm_lookup(name)) == NULL) { - mutex_exit(&vmmdev_mtx); + if (hold->vmh_release_req) { return (NULL); } - mutex_exit(&vmmdev_mtx); + lease = kmem_alloc(sizeof (*lease), KM_SLEEP); + list_link_init(&lease->vml_node); + lease->vml_expire_func = expiref; + lease->vml_expire_arg = arg; + lease->vml_expired = B_FALSE; + lease->vml_hold = hold; + /* cache the VM pointer for one less pointer chase */ + lease->vml_vm = sc->vmm_vm; + + mutex_enter(&sc->vmm_lease_lock); + while (sc->vmm_lease_blocker != 0) { + cv_wait(&sc->vmm_lease_cv, &sc->vmm_lease_lock); + } + list_insert_tail(&sc->vmm_lease_list, lease); + vmm_read_lock(sc); + mutex_exit(&sc->vmm_lease_lock); - return (sc->vm); + return (lease); +} + +static void +vmm_lease_break_locked(vmm_softc_t *sc, vmm_lease_t *lease) +{ + ASSERT(MUTEX_HELD(&sc->vmm_lease_lock)); + + list_remove(&sc->vmm_lease_list, lease); + vmm_read_unlock(sc); + kmem_free(lease, sizeof (*lease)); +} + +void +vmm_drv_lease_break(vmm_hold_t *hold, vmm_lease_t *lease) +{ + vmm_softc_t *sc = hold->vmh_sc; + + VERIFY3P(hold, ==, lease->vml_hold); + + mutex_enter(&sc->vmm_lease_lock); + vmm_lease_break_locked(sc, lease); + mutex_exit(&sc->vmm_lease_lock); +} + +boolean_t +vmm_drv_lease_expired(vmm_lease_t *lease) +{ + return (lease->vml_expired); +} + +void * +vmm_drv_gpa2kva(vmm_lease_t *lease, uintptr_t gpa, size_t sz) +{ + ASSERT(lease != NULL); + + return (vmspace_find_kva(vm_get_vmspace(lease->vml_vm), gpa, sz)); } int -vmmdev_do_vm_destroy(dev_info_t *dip, char *name) +vmm_drv_msi(vmm_lease_t *lease, uint64_t addr, uint64_t msg) { - struct vmm_softc *sc; - dev_info_t *pdip = ddi_get_parent(dip); + ASSERT(lease != NULL); - mutex_enter(&vmmdev_mtx); + return (lapic_intr_msi(lease->vml_vm, addr, msg)); +} - if ((sc = vmm_lookup(name)) == NULL) { - mutex_exit(&vmmdev_mtx); - return (ENOENT); - } +int +vmm_drv_ioport_hook(vmm_hold_t *hold, uint_t ioport, vmm_drv_rmem_cb_t rfunc, + vmm_drv_wmem_cb_t wfunc, void *arg, void **cookie) +{ + vmm_softc_t *sc; + int err; - if (sc->open) { - mutex_exit(&vmmdev_mtx); + ASSERT(hold != NULL); + ASSERT(cookie != NULL); + + sc = hold->vmh_sc; + mutex_enter(&vmm_mtx); + /* Confirm that hook installation is not blocked */ + if ((sc->vmm_flags & VMM_BLOCK_HOOK) != 0) { + mutex_exit(&vmm_mtx); return (EBUSY); } + /* + * Optimistically record an installed hook which will prevent a block + * from being asserted while the mutex is dropped. 
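+	 * vmm_drv_block_hook() treats a non-zero vmh_ioport_hook_cnt as
+	 * grounds for EBUSY, so the hold cannot be blocked while this
+	 * installation is still in flight.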
+ */ + hold->vmh_ioport_hook_cnt++; + mutex_exit(&vmm_mtx); + + vmm_write_lock(sc); + err = vm_ioport_hook(sc->vmm_vm, ioport, (vmm_rmem_cb_t)rfunc, + (vmm_wmem_cb_t)wfunc, arg, cookie); + vmm_write_unlock(sc); + + if (err != 0) { + mutex_enter(&vmm_mtx); + /* Walk back optimism about the hook installation */ + hold->vmh_ioport_hook_cnt--; + mutex_exit(&vmm_mtx); + } + return (err); +} - vm_destroy(sc->vm); - SLIST_REMOVE(&head, sc, vmm_softc, link); - ddi_remove_minor_node(dip, name); - ddi_soft_state_free(vmm_statep, sc->minor); - (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE); +void +vmm_drv_ioport_unhook(vmm_hold_t *hold, void **cookie) +{ + vmm_softc_t *sc; - mutex_exit(&vmmdev_mtx); + ASSERT(hold != NULL); + ASSERT(cookie != NULL); + ASSERT(hold->vmh_ioport_hook_cnt != 0); + + sc = hold->vmh_sc; + vmm_write_lock(sc); + vm_ioport_unhook(sc->vmm_vm, cookie); + vmm_write_unlock(sc); + + mutex_enter(&vmm_mtx); + hold->vmh_ioport_hook_cnt--; + mutex_exit(&vmm_mtx); +} + +static int +vmm_drv_purge(vmm_softc_t *sc) +{ + ASSERT(MUTEX_HELD(&vmm_mtx)); + + if ((sc->vmm_flags & VMM_HELD) != 0) { + vmm_hold_t *hold; + + sc->vmm_flags |= VMM_CLEANUP; + for (hold = list_head(&sc->vmm_holds); hold != NULL; + hold = list_next(&sc->vmm_holds, hold)) { + hold->vmh_release_req = B_TRUE; + } + while ((sc->vmm_flags & VMM_HELD) != 0) { + if (cv_wait_sig(&sc->vmm_cv, &vmm_mtx) <= 0) { + return (EINTR); + } + } + sc->vmm_flags &= ~VMM_CLEANUP; + } + VERIFY(list_is_empty(&sc->vmm_holds)); + sc->vmm_flags |= VMM_PURGED; return (0); } -int -vmmdev_do_vm_mmap(struct vmm_softc *vmm_sc, off_t off, int nprot) +static int +vmm_drv_block_hook(vmm_softc_t *sc, boolean_t enable_block) { - vm_paddr_t paddr; + int err = 0; - mutex_enter(&vmmdev_mtx); + mutex_enter(&vmm_mtx); + if (!enable_block) { + VERIFY((sc->vmm_flags & VMM_BLOCK_HOOK) != 0); - paddr = vm_gpa2hpa(vmm_sc->vm, (vm_paddr_t)off, PAGE_SIZE); - if (paddr == -1) { - return (-1); + sc->vmm_flags &= ~VMM_BLOCK_HOOK; + goto done; } - mutex_exit(&vmmdev_mtx); + /* If any holds have hooks installed, the block is a failure */ + if (!list_is_empty(&sc->vmm_holds)) { + vmm_hold_t *hold; + + for (hold = list_head(&sc->vmm_holds); hold != NULL; + hold = list_next(&sc->vmm_holds, hold)) { + if (hold->vmh_ioport_hook_cnt != 0) { + err = EBUSY; + goto done; + } + } + } + sc->vmm_flags |= VMM_BLOCK_HOOK; + +done: + mutex_exit(&vmm_mtx); + return (err); +} + +static int +vmm_do_vm_destroy_locked(vmm_softc_t *sc, boolean_t clean_zsd) +{ + dev_info_t *pdip = ddi_get_parent(vmmdev_dip); + minor_t minor; + + ASSERT(MUTEX_HELD(&vmm_mtx)); + + if (clean_zsd) { + vmm_zsd_rem_vm(sc); + } + + if (vmm_drv_purge(sc) != 0) { + return (EINTR); + } + + /* Clean up devmem entries */ + vmmdev_devmem_purge(sc); + + list_remove(&vmm_list, sc); + ddi_remove_minor_node(vmmdev_dip, sc->vmm_name); + minor = sc->vmm_minor; + zone_rele(sc->vmm_zone); + if (sc->vmm_is_open) { + list_insert_tail(&vmm_destroy_list, sc); + sc->vmm_flags |= VMM_DESTROY; + } else { + vm_destroy(sc->vmm_vm); + ddi_soft_state_free(vmm_statep, minor); + id_free(vmm_minors, minor); + } + (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE); + + return (0); +} + +int +vmm_do_vm_destroy(vmm_softc_t *sc, boolean_t clean_zsd) +{ + int err; + + mutex_enter(&vmm_mtx); + err = vmm_do_vm_destroy_locked(sc, clean_zsd); + mutex_exit(&vmm_mtx); + + return (err); +} + +/* ARGSUSED */ +static int +vmmdev_do_vm_destroy(const char *name, cred_t *cr) +{ + vmm_softc_t *sc; + int err; + + if (crgetuid(cr) != 0) + return (EPERM); - return 
(btop(paddr)); + mutex_enter(&vmm_mtx); + + if ((sc = vmm_lookup(name)) == NULL) { + mutex_exit(&vmm_mtx); + return (ENOENT); + } + /* + * We don't check this in vmm_lookup() since that function is also used + * for validation during create and currently vmm names must be unique. + */ + if (!INGLOBALZONE(curproc) && sc->vmm_zone != curzone) { + mutex_exit(&vmm_mtx); + return (EPERM); + } + err = vmm_do_vm_destroy_locked(sc, B_TRUE); + mutex_exit(&vmm_mtx); + + return (err); } static int vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) { - minor_t minor; - struct vmm_softc *sc; + minor_t minor; + vmm_softc_t *sc; minor = getminor(*devp); if (minor == VMM_CTL_MINOR) { @@ -768,19 +1750,15 @@ vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) return (0); } - mutex_enter(&vmmdev_mtx); + mutex_enter(&vmm_mtx); sc = ddi_get_soft_state(vmm_statep, minor); if (sc == NULL) { - mutex_exit(&vmmdev_mtx); + mutex_exit(&vmm_mtx); return (ENXIO); } - if (sc->open) { - mutex_exit(&vmmdev_mtx); - return (EBUSY); - } - sc->open = B_TRUE; - mutex_exit(&vmmdev_mtx); + sc->vmm_is_open = B_TRUE; + mutex_exit(&vmm_mtx); return (0); } @@ -788,170 +1766,360 @@ vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp) static int vmm_close(dev_t dev, int flag, int otyp, cred_t *credp) { - minor_t minor; - struct vmm_softc *sc; + minor_t minor; + vmm_softc_t *sc; minor = getminor(dev); if (minor == VMM_CTL_MINOR) return (0); - mutex_enter(&vmmdev_mtx); + mutex_enter(&vmm_mtx); sc = ddi_get_soft_state(vmm_statep, minor); if (sc == NULL) { - mutex_exit(&vmmdev_mtx); + mutex_exit(&vmm_mtx); return (ENXIO); } - sc->open = B_FALSE; - mutex_exit(&vmmdev_mtx); + VERIFY(sc->vmm_is_open); + sc->vmm_is_open = B_FALSE; + + if (sc->vmm_flags & VMM_DESTROY) { + list_remove(&vmm_destroy_list, sc); + vm_destroy(sc->vmm_vm); + ddi_soft_state_free(vmm_statep, minor); + id_free(vmm_minors, minor); + } + mutex_exit(&vmm_mtx); return (0); } static int +vmm_is_supported(intptr_t arg) +{ + int r; + const char *msg; + + if (vmm_is_intel()) { + r = vmx_x86_supported(&msg); + } else if (vmm_is_amd()) { + /* + * HMA already ensured that the features necessary for SVM + * operation were present and online during vmm_attach(). 
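+		 * (vmm_attach() fails outright if hma_register() does not
+		 * succeed, so merely reaching this code implies that the
+		 * check has already passed.)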
+ */ + r = 0; + } else { + r = ENXIO; + msg = "Unsupported CPU vendor"; + } + + if (r != 0 && arg != (intptr_t)NULL) { + if (copyoutstr(msg, (char *)arg, strlen(msg) + 1, NULL) != 0) + return (EFAULT); + } + return (r); +} + +static int vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp) { - struct vmm_softc *sc; - struct vmm_ioctl kvi; - minor_t minor; + vmm_softc_t *sc; + minor_t minor; minor = getminor(dev); if (minor == VMM_CTL_MINOR) { - if (ddi_copyin((void *)arg, &kvi, sizeof (struct vmm_ioctl), - mode)) { - return (EFAULT); + void *argp = (void *)arg; + char name[VM_MAX_NAMELEN] = { 0 }; + size_t len = 0; + + if ((mode & FKIOCTL) != 0) { + len = strlcpy(name, argp, sizeof (name)); + } else { + if (copyinstr(argp, name, sizeof (name), &len) != 0) { + return (EFAULT); + } + } + if (len >= VM_MAX_NAMELEN) { + return (ENAMETOOLONG); } + switch (cmd) { case VMM_CREATE_VM: if ((mode & FWRITE) == 0) return (EPERM); - return (vmmdev_do_vm_create(vmm_dip, kvi.vmm_name)); + return (vmmdev_do_vm_create(name, credp)); case VMM_DESTROY_VM: if ((mode & FWRITE) == 0) return (EPERM); - return (vmmdev_do_vm_destroy(vmm_dip, kvi.vmm_name)); + return (vmmdev_do_vm_destroy(name, credp)); + case VMM_VM_SUPPORTED: + return (vmm_is_supported(arg)); default: - break; + /* No other actions are legal on ctl device */ + return (ENOTTY); } } sc = ddi_get_soft_state(vmm_statep, minor); ASSERT(sc); + if (sc->vmm_flags & VMM_DESTROY) + return (ENXIO); + return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp)); } static int -vmm_mmap(dev_t dev, off_t off, int prot) +vmm_segmap(dev_t dev, off_t off, struct as *as, caddr_t *addrp, off_t len, + unsigned int prot, unsigned int maxprot, unsigned int flags, cred_t *credp) { - struct vmm_softc *sc; + vmm_softc_t *sc; + const minor_t minor = getminor(dev); + struct vm *vm; + int err; + vm_object_t vmo = NULL; + struct vmspace *vms; - sc = ddi_get_soft_state(vmm_statep, getminor(dev)); + if (minor == VMM_CTL_MINOR) { + return (ENODEV); + } + if (off < 0 || (off + len) <= 0) { + return (EINVAL); + } + if ((prot & PROT_USER) == 0) { + return (EACCES); + } + + sc = ddi_get_soft_state(vmm_statep, minor); ASSERT(sc); - return (vmmdev_do_vm_mmap(sc, off, prot)); -} + if (sc->vmm_flags & VMM_DESTROY) + return (ENXIO); -static int -vmm_segmap(dev_t dev, off_t off, struct as *as, - caddr_t *addrp, off_t len, unsigned int prot, - unsigned int maxprot, unsigned int flags, cred_t *credp) -{ - struct segdev_crargs dev_a; - int error; + /* Grab read lock on the VM to prevent any changes to the memory map */ + vmm_read_lock(sc); - as_rangelock(as); + vm = sc->vmm_vm; + vms = vm_get_vmspace(vm); + if (off >= VM_DEVMEM_START) { + int segid; - error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags); - if (error != 0) { - as_rangeunlock(as); - return (error); + /* Mapping a devmem "device" */ + if (!vmmdev_devmem_segid(sc, off, len, &segid)) { + err = ENODEV; + goto out; + } + err = vm_get_memseg(vm, segid, NULL, NULL, &vmo); + if (err != 0) { + goto out; + } + err = vm_segmap_obj(vms, vmo, as, addrp, prot, maxprot, flags); + } else { + /* Mapping a part of the guest physical space */ + err = vm_segmap_space(vms, off, as, addrp, len, prot, maxprot, + flags); } - dev_a.mapfunc = vmm_mmap; - dev_a.dev = dev; - dev_a.offset = off; - dev_a.type = (flags & MAP_TYPE); - dev_a.prot = (uchar_t)prot; - dev_a.maxprot = (uchar_t)maxprot; - dev_a.hat_attr = 0; - dev_a.hat_flags = HAT_LOAD_NOCONSIST; - dev_a.devmap_data = NULL; - - error = as_map(as, *addrp, len, 
segdev_create, &dev_a); - - as_rangeunlock(as); - return (error); +out: + vmm_read_unlock(sc); + return (err); } -static int -vmm_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) +static sdev_plugin_validate_t +vmm_sdev_validate(sdev_ctx_t ctx) { - return (0); + const char *name = sdev_ctx_name(ctx); + vmm_softc_t *sc; + sdev_plugin_validate_t ret; + minor_t minor; + + if (sdev_ctx_vtype(ctx) != VCHR) + return (SDEV_VTOR_INVALID); + + VERIFY3S(sdev_ctx_minor(ctx, &minor), ==, 0); + + mutex_enter(&vmm_mtx); + if ((sc = vmm_lookup(name)) == NULL) + ret = SDEV_VTOR_INVALID; + else if (sc->vmm_minor != minor) + ret = SDEV_VTOR_STALE; + else + ret = SDEV_VTOR_VALID; + mutex_exit(&vmm_mtx); + + return (ret); } static int -vmm_probe(dev_info_t *dip) +vmm_sdev_filldir(sdev_ctx_t ctx) { - if (driver_installed(ddi_name_to_major("kvm"))) { - cmn_err(CE_WARN, "kvm is installed\n"); - return (DDI_PROBE_FAILURE); + vmm_softc_t *sc; + int ret; + + if (strcmp(sdev_ctx_path(ctx), VMM_SDEV_ROOT) != 0) { + cmn_err(CE_WARN, "%s: bad path '%s' != '%s'\n", __func__, + sdev_ctx_path(ctx), VMM_SDEV_ROOT); + return (EINVAL); } - return (DDI_PROBE_SUCCESS); + mutex_enter(&vmm_mtx); + ASSERT(vmmdev_dip != NULL); + for (sc = list_head(&vmm_list); sc != NULL; + sc = list_next(&vmm_list, sc)) { + if (INGLOBALZONE(curproc) || sc->vmm_zone == curzone) { + ret = sdev_plugin_mknod(ctx, sc->vmm_name, + S_IFCHR | 0600, + makedevice(ddi_driver_major(vmmdev_dip), + sc->vmm_minor)); + } else { + continue; + } + if (ret != 0 && ret != EEXIST) + goto out; + } + + ret = 0; + +out: + mutex_exit(&vmm_mtx); + return (ret); +} + +/* ARGSUSED */ +static void +vmm_sdev_inactive(sdev_ctx_t ctx) +{ } +static sdev_plugin_ops_t vmm_sdev_ops = { + .spo_version = SDEV_PLUGIN_VERSION, + .spo_flags = SDEV_PLUGIN_SUBDIR, + .spo_validate = vmm_sdev_validate, + .spo_filldir = vmm_sdev_filldir, + .spo_inactive = vmm_sdev_inactive +}; + +/* ARGSUSED */ static int -vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +vmm_info(dev_info_t *dip, ddi_info_cmd_t cmd, void *arg, void **result) { + int error; + switch (cmd) { - case DDI_ATTACH: + case DDI_INFO_DEVT2DEVINFO: + *result = (void *)vmmdev_dip; + error = DDI_SUCCESS; + break; + case DDI_INFO_DEVT2INSTANCE: + *result = (void *)0; + error = DDI_SUCCESS; break; default: + error = DDI_FAILURE; + break; + } + return (error); +} + +static int +vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) +{ + sdev_plugin_hdl_t sph; + hma_reg_t *reg = NULL; + boolean_t vmm_loaded = B_FALSE; + + if (cmd != DDI_ATTACH) { return (DDI_FAILURE); } - if (vmm_mod_load()) { + mutex_enter(&vmmdev_mtx); + /* Ensure we are not already attached. */ + if (vmmdev_dip != NULL) { + mutex_exit(&vmmdev_mtx); return (DDI_FAILURE); } - vmm_dip = dip; + vmm_sol_glue_init(); + vmm_arena_init(); - /* - * Create control node. Other nodes will be created on demand. - */ - if (ddi_create_minor_node(dip, VMM_CTL_MINOR_NODE, S_IFCHR, + if ((reg = hma_register(vmmdev_hvm_name)) == NULL) { + goto fail; + } else if (vmm_mod_load() != 0) { + goto fail; + } + vmm_loaded = B_TRUE; + + /* Create control node. Other nodes will be created on demand. 
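The per-VM nodes under /dev/vmm come from the sdev plugin registered below (vmm_sdev_filldir).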
*/ + if (ddi_create_minor_node(dip, "ctl", S_IFCHR, VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) { - return (DDI_FAILURE); + goto fail; } - ddi_report_dev(dip); + if ((sph = sdev_plugin_register("vmm", &vmm_sdev_ops, NULL)) == + (sdev_plugin_hdl_t)NULL) { + ddi_remove_minor_node(dip, NULL); + goto fail; + } + ddi_report_dev(dip); + vmmdev_hma_reg = reg; + vmmdev_sdev_hdl = sph; + vmmdev_dip = dip; + mutex_exit(&vmmdev_mtx); return (DDI_SUCCESS); + +fail: + if (vmm_loaded) { + VERIFY0(vmm_mod_unload()); + } + if (reg != NULL) { + hma_unregister(reg); + } + vmm_arena_fini(); + vmm_sol_glue_cleanup(); + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); } static int vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { - switch (cmd) { - case DDI_DETACH: - break; - default: + if (cmd != DDI_DETACH) { return (DDI_FAILURE); } - if (vmm_mod_unload()) {; + /* Ensure that all resources have been cleaned up */ + mutex_enter(&vmmdev_mtx); + + mutex_enter(&vmm_mtx); + if (!list_is_empty(&vmm_list) || !list_is_empty(&vmm_destroy_list)) { + mutex_exit(&vmm_mtx); + mutex_exit(&vmmdev_mtx); return (DDI_FAILURE); } + mutex_exit(&vmm_mtx); - /* - * Remove the control node. - */ - ddi_remove_minor_node(dip, VMM_CTL_MINOR_NODE); - vmm_dip = NULL; + VERIFY(vmmdev_sdev_hdl != (sdev_plugin_hdl_t)NULL); + if (sdev_plugin_unregister(vmmdev_sdev_hdl) != 0) { + mutex_exit(&vmmdev_mtx); + return (DDI_FAILURE); + } + vmmdev_sdev_hdl = (sdev_plugin_hdl_t)NULL; + + /* Remove the control node. */ + ddi_remove_minor_node(dip, "ctl"); + vmmdev_dip = NULL; + + VERIFY0(vmm_mod_unload()); + hma_unregister(vmmdev_hma_reg); + vmmdev_hma_reg = NULL; + vmm_arena_fini(); + vmm_sol_glue_cleanup(); + + mutex_exit(&vmmdev_mtx); return (DDI_SUCCESS); } @@ -966,7 +2134,7 @@ static struct cb_ops vmm_cb_ops = { nodev, /* write */ vmm_ioctl, nodev, /* devmap */ - vmm_mmap, + nodev, /* mmap */ vmm_segmap, nochpoll, /* poll */ ddi_prop_op, @@ -977,9 +2145,9 @@ static struct cb_ops vmm_cb_ops = { static struct dev_ops vmm_ops = { DEVO_REV, 0, - ddi_no_info, + vmm_info, nulldev, /* identify */ - vmm_probe, + nulldev, /* probe */ vmm_attach, vmm_detach, nodev, /* reset */ @@ -989,7 +2157,7 @@ static struct dev_ops vmm_ops = { static struct modldrv modldrv = { &mod_driverops, - "vmm", + "bhyve vmm", &vmm_ops }; @@ -1004,16 +2172,27 @@ _init(void) { int error; - mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); + sysinit(); - error = ddi_soft_state_init(&vmm_statep, sizeof (struct vmm_softc), 0); + mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL); + mutex_init(&vmm_mtx, NULL, MUTEX_DRIVER, NULL); + list_create(&vmm_list, sizeof (vmm_softc_t), + offsetof(vmm_softc_t, vmm_node)); + list_create(&vmm_destroy_list, sizeof (vmm_softc_t), + offsetof(vmm_softc_t, vmm_node)); + vmm_minors = id_space_create("vmm_minors", VMM_CTL_MINOR + 1, MAXMIN32); + + error = ddi_soft_state_init(&vmm_statep, sizeof (vmm_softc_t), 0); if (error) { return (error); } + vmm_zsd_init(); + error = mod_install(&modlinkage); if (error) { ddi_soft_state_fini(&vmm_statep); + vmm_zsd_fini(); } return (error); @@ -1028,6 +2207,9 @@ _fini(void) if (error) { return (error); } + + vmm_zsd_fini(); + ddi_soft_state_fini(&vmm_statep); return (0); diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c new file mode 100644 index 0000000000..c26e763805 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_ept.c @@ -0,0 +1,268 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 
1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/machsystm.h> + +#include <sys/gipt.h> +#include <vm/vm_glue.h> + + +struct ept_map { + gipt_map_t em_gipt; + uint64_t em_wired_page_count; +}; +typedef struct ept_map ept_map_t; + +#define EPT_LOCK(m) (&(m)->em_gipt.giptm_lock) + +#define EPT_MAX_LEVELS 4 + +CTASSERT(EPT_MAX_LEVELS <= GIPT_MAX_LEVELS); + +#define EPT_R (0x1 << 0) +#define EPT_W (0x1 << 1) +#define EPT_X (0x1 << 2) +#define EPT_RWX (EPT_R | EPT_W | EPT_X) +#define EPT_LGPG (0x1 << 7) + +#define EPT_PA_MASK (0x000ffffffffff000ull) + +CTASSERT(EPT_R == PROT_READ); +CTASSERT(EPT_W == PROT_WRITE); +CTASSERT(EPT_X == PROT_EXEC); + + +#define EPT_PAT(attr) (((attr) & 0x7) << 3) +#define EPT_PADDR(addr) ((addr) & EPT_PA_MASK) + +#define EPT_IS_ABSENT(pte) (((pte) & EPT_RWX) == 0) +#define EPT_PTE_PFN(pte) mmu_btop(EPT_PADDR(pte)) +#define EPT_PTE_PROT(pte) ((pte) & EPT_RWX) +#define EPT_MAPS_PAGE(pte, lvl) \ + (EPT_PTE_PROT(pte) != 0 && (((pte) & EPT_LGPG) != 0 || (lvl) == 0)) + +/* + * Only assign EPT_LGPG for levels higher than 0. Although this bit is defined + * as being ignored at level 0, some versions of VMWare fail to honor this and + * report such a PTE as an EPT mis-configuration. + */ +#define EPT_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr) \ + (EPT_PADDR(pfn_to_pa(pfn)) | \ + (((lvl) != 0) ? EPT_LGPG : 0) | \ + EPT_PAT(attr) | ((prot) & EPT_RWX)) +#define EPT_PTE_ASSIGN_TABLE(pfn) (EPT_PADDR(pfn_to_pa(pfn)) | EPT_RWX) + + +static gipt_pte_type_t +ept_pte_type(uint64_t pte, uint_t level) +{ + if (EPT_IS_ABSENT(pte)) { + return (PTET_EMPTY); + } else if (EPT_MAPS_PAGE(pte, level)) { + return (PTET_PAGE); + } else { + return (PTET_LINK); + } +} + +static uint64_t +ept_pte_map(uint64_t pfn) +{ + return (EPT_PTE_ASSIGN_TABLE(pfn)); +} + +static void * +ept_create(uintptr_t *pml4_kaddr) +{ + ept_map_t *emap; + gipt_map_t *map; + gipt_t *root; + struct gipt_cbs cbs = { + .giptc_pte_type = ept_pte_type, + .giptc_pte_map = ept_pte_map, + }; + + emap = kmem_zalloc(sizeof (*emap), KM_SLEEP); + map = &emap->em_gipt; + root = gipt_alloc(); + root->gipt_level = EPT_MAX_LEVELS - 1; + gipt_map_init(map, EPT_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root); + + *pml4_kaddr = (uintptr_t)root->gipt_kva; + return (emap); +} + +static void +ept_destroy(void *arg) +{ + ept_map_t *emap = arg; + + if (emap != NULL) { + gipt_map_t *map = &emap->em_gipt; + + gipt_map_fini(map); + kmem_free(emap, sizeof (*emap)); + } +} + +static uint64_t +ept_wired_count(void *arg) +{ + ept_map_t *emap = arg; + uint64_t res; + + mutex_enter(EPT_LOCK(emap)); + res = emap->em_wired_page_count; + mutex_exit(EPT_LOCK(emap)); + + return (res); +} + +static int +ept_is_wired(void *arg, uint64_t va, uint_t *protp) +{ + ept_map_t *emap = arg; + gipt_t *pt; + int rv = -1; + + mutex_enter(EPT_LOCK(emap)); + pt = gipt_map_lookup_deepest(&emap->em_gipt, va); + if (pt != NULL) { + const uint64_t pte = GIPT_VA2PTE(pt, va); + + if (EPT_MAPS_PAGE(pte, pt->gipt_level)) { + *protp = EPT_PTE_PROT(pte); + rv = 0; + } + } + mutex_exit(EPT_LOCK(emap)); + + return (rv); +} + +static int +ept_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot, + uint8_t attr) +{ + 
ept_map_t *emap = arg; + gipt_map_t *map = &emap->em_gipt; + gipt_t *pt; + uint64_t *ptep, pte; + + ASSERT((prot & EPT_RWX) != 0 && (prot & ~EPT_RWX) == 0); + ASSERT3U(lvl, <, EPT_MAX_LEVELS); + + mutex_enter(EPT_LOCK(emap)); + pt = gipt_map_lookup(map, va, lvl); + if (pt == NULL) { + /* + * A table at the appropriate VA/level that would house this + * mapping does not currently exist. Try to walk down to that + * point, creating any necessary parent(s). + */ + pt = gipt_map_create_parents(map, va, lvl); + + /* + * There was a large page mapping in the way of creating the + * necessary parent table(s). + */ + if (pt == NULL) { + panic("unexpected large page @ %08lx", va); + } + } + ptep = GIPT_VA2PTEP(pt, va); + + pte = *ptep; + if (!EPT_IS_ABSENT(pte)) { + if (!EPT_MAPS_PAGE(pte, lvl)) { + panic("unexpected PT link @ %08lx in %p", va, pt); + } else { + panic("unexpected page mapped @ %08lx in %p", va, pt); + } + } + + pte = EPT_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr); + *ptep = pte; + pt->gipt_valid_cnt++; + emap->em_wired_page_count += gipt_level_count[lvl]; + + mutex_exit(EPT_LOCK(emap)); + return (0); +} + +static uint64_t +ept_unmap(void *arg, uint64_t va, uint64_t end_va) +{ + ept_map_t *emap = arg; + gipt_map_t *map = &emap->em_gipt; + gipt_t *pt; + uint64_t cur_va = va; + uint64_t unmapped = 0; + + mutex_enter(EPT_LOCK(emap)); + + pt = gipt_map_lookup_deepest(map, cur_va); + if (pt == NULL) { + mutex_exit(EPT_LOCK(emap)); + return (0); + } + if (!EPT_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) { + cur_va = gipt_map_next_page(map, cur_va, end_va, &pt); + if (cur_va == 0) { + mutex_exit(EPT_LOCK(emap)); + return (0); + } + } + + while (cur_va < end_va) { + uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va); + const uint_t lvl = pt->gipt_level; + + ASSERT(EPT_MAPS_PAGE(*ptep, lvl)); + *ptep = 0; + pt->gipt_valid_cnt--; + unmapped += gipt_level_count[pt->gipt_level]; + + gipt_t *next_pt = pt; + uint64_t next_va; + next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt); + + if (pt->gipt_valid_cnt == 0) { + gipt_map_clean_parents(map, pt); + } + if (next_va == 0) { + break; + } + pt = next_pt; + cur_va = next_va; + } + emap->em_wired_page_count -= unmapped; + + mutex_exit(EPT_LOCK(emap)); + + return (unmapped); +} + +struct vmm_pt_ops ept_ops = { + .vpo_init = ept_create, + .vpo_free = ept_destroy, + .vpo_wired_cnt = ept_wired_count, + .vpo_is_wired = ept_is_wired, + .vpo_map = ept_map, + .vpo_unmap = ept_unmap, +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c index 6588f5a46d..a8d94ea024 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c @@ -1,31 +1,4 @@ /* - * Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org> - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: head/sys/kern/subr_sleepqueue.c 261520 2014-02-05 18:13:27Z jhb $ - */ -/*- * Copyright (c) 2004 Poul-Henning Kamp * All rights reserved. * @@ -63,6 +36,7 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #include <sys/types.h> @@ -73,22 +47,62 @@ #include <sys/queue.h> #include <sys/spl.h> #include <sys/systm.h> +#include <sys/ddidmareq.h> +#include <sys/id_space.h> +#include <sys/psm_defs.h> +#include <sys/smp_impldefs.h> +#include <sys/modhash.h> +#include <sys/hma.h> #include <machine/cpufunc.h> #include <machine/fpu.h> #include <machine/md_var.h> +#include <machine/pmap.h> #include <machine/specialreg.h> #include <machine/vmm.h> #include <sys/vmm_impl.h> +#include <sys/kernel.h> #include <vm/as.h> #include <vm/seg_kmem.h> +SET_DECLARE(sysinit_set, struct sysinit); + +void +sysinit(void) +{ + struct sysinit **si; + + SET_FOREACH(si, sysinit_set) + (*si)->func((*si)->data); +} + +u_char const bin2bcd_data[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, + 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99 +}; + vm_paddr_t pmap_kextract(vm_offset_t va) { pfn_t pfn; + /* + * Since hat_getpfnum() may block on an htable mutex, this is not at + * all safe to run from a critical_enter/kpreempt_disable context. + * The FreeBSD analog does not have the same locking constraints, so + * close attention must be paid wherever this is called. + */ + ASSERT(curthread->t_preempt == 0); + pfn = hat_getpfnum(kas.a_hat, (caddr_t)va); ASSERT(pfn != PFN_INVALID); return (pfn << PAGE_SHIFT) | ((uintptr_t)va & PAGE_MASK); @@ -97,45 +111,72 @@ pmap_kextract(vm_offset_t va) int cpusetobj_ffs(const cpuset_t *set) { -#if CPUSET_WORDS > 1 - int i, cbit; + uint_t large, small; - cbit = 0; - for (i = 0; i < CPUSET_WORDS; i++) { - if (set->cpub[i] != 0) { - cbit = ffsl(set->cpub[i]); - cbit += i * sizeof (set->cpub[0]); - break; - } - } - return (cbit); -#else - return(ffsl(*set)); -#endif -} + /* + * Rather than reaching into the cpuset_t ourselves, leave that task to + * cpuset_bounds(). The simplicity is worth the extra wasted work to + * find the upper bound. 
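+	 *
+	 * For example, with only CPU 3 set, cpuset_bounds() yields
+	 * small == large == 3 and the function returns 4, the same 1-based
+	 * index that ffsl() would report.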
+ */ + cpuset_bounds(set, &small, &large); -void -smp_rendezvous(void (* setup_func)(void *), - void (* action_func)(void *), - void (* teardown_func)(void *), - void *arg) -{ - cpuset_t cpuset; + if (small == CPUSET_NOTINSET) { + /* The FreeBSD version returns 0 if it finds nothing */ + return (0); + } - ASSERT(setup_func == NULL); - ASSERT(teardown_func == NULL); + ASSERT3U(small, <=, INT_MAX); - CPUSET_ALL(cpuset); - xc_sync((xc_arg_t)arg, 0, 0, CPUSET2BV(cpuset), (xc_func_t)action_func); + /* Least significant bit index starts at 1 for valid results */ + return (small + 1); } struct kmem_item { void *addr; size_t size; - LIST_ENTRY(kmem_item) next; }; static kmutex_t kmem_items_lock; -static LIST_HEAD(, kmem_item) kmem_items; + +static mod_hash_t *vmm_alloc_hash; +uint_t vmm_alloc_hash_nchains = 16381; +uint_t vmm_alloc_hash_size = PAGESIZE; + +static void +vmm_alloc_hash_valdtor(mod_hash_val_t val) +{ + struct kmem_item *i = (struct kmem_item *)val; + + kmem_free(i->addr, i->size); + kmem_free(i, sizeof (struct kmem_item)); +} + +static void +vmm_alloc_init(void) +{ + vmm_alloc_hash = mod_hash_create_ptrhash("vmm_alloc_hash", + vmm_alloc_hash_nchains, vmm_alloc_hash_valdtor, + vmm_alloc_hash_size); + + VERIFY(vmm_alloc_hash != NULL); +} + +static uint_t +vmm_alloc_check(mod_hash_key_t key, mod_hash_val_t *val, void *unused) +{ + struct kmem_item *i = (struct kmem_item *)val; + + cmn_err(CE_PANIC, "!vmm_alloc_check: hash not empty: %p, %lu", i->addr, + i->size); + + return (MH_WALK_TERMINATE); +} + +static void +vmm_alloc_cleanup(void) +{ + mod_hash_walk(vmm_alloc_hash, vmm_alloc_check, NULL); + mod_hash_destroy_ptrhash(vmm_alloc_hash); +} void * malloc(unsigned long size, struct malloc_type *mtp, int flags) @@ -148,17 +189,28 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags) kmem_flag = KM_NOSLEEP; if (flags & M_ZERO) { - p = kmem_zalloc(size + sizeof(struct kmem_item), kmem_flag); + p = kmem_zalloc(size, kmem_flag); } else { - p = kmem_alloc(size + sizeof(struct kmem_item), kmem_flag); + p = kmem_alloc(size, kmem_flag); + } + + if (p == NULL) + return (NULL); + + i = kmem_zalloc(sizeof (struct kmem_item), kmem_flag); + + if (i == NULL) { + kmem_free(p, size); + return (NULL); } mutex_enter(&kmem_items_lock); - i = p + size; i->addr = p; i->size = size; - LIST_INSERT_HEAD(&kmem_items, i, next); + VERIFY(mod_hash_insert(vmm_alloc_hash, + (mod_hash_key_t)PHYS_TO_DMAP(vtophys(p)), (mod_hash_val_t)i) == 0); + mutex_exit(&kmem_items_lock); return (p); @@ -167,29 +219,66 @@ malloc(unsigned long size, struct malloc_type *mtp, int flags) void free(void *addr, struct malloc_type *mtp) { - struct kmem_item *i; - mutex_enter(&kmem_items_lock); - LIST_FOREACH(i, &kmem_items, next) { - if (i->addr == addr) - break; - } - ASSERT(i != NULL); - LIST_REMOVE(i, next); + VERIFY(mod_hash_destroy(vmm_alloc_hash, + (mod_hash_key_t)PHYS_TO_DMAP(vtophys(addr))) == 0); mutex_exit(&kmem_items_lock); +} + +extern void *contig_alloc(size_t, ddi_dma_attr_t *, uintptr_t, int); +extern void contig_free(void *, size_t); - kmem_free(addr, i->size + sizeof(struct kmem_item)); +void * +contigmalloc(unsigned long size, struct malloc_type *type, int flags, + vm_paddr_t low, vm_paddr_t high, unsigned long alignment, + vm_paddr_t boundary) +{ + ddi_dma_attr_t attr = { + /* Using fastboot_dma_attr as a guide... 
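; the caller's low, high, and alignment arguments feed dma_attr_addr_lo, dma_attr_addr_hi, and dma_attr_align below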
*/ + DMA_ATTR_V0, + low, /* dma_attr_addr_lo */ + high, /* dma_attr_addr_hi */ + 0x00000000FFFFFFFFULL, /* dma_attr_count_max */ + alignment, /* dma_attr_align */ + 1, /* dma_attr_burstsize */ + 1, /* dma_attr_minxfer */ + 0x00000000FFFFFFFFULL, /* dma_attr_maxxfer */ + 0x00000000FFFFFFFFULL, /* dma_attr_seg: any */ + 1, /* dma_attr_sgllen */ + alignment, /* dma_attr_granular */ + 0, /* dma_attr_flags */ + }; + int cansleep = (flags & M_WAITOK); + void *result; + + ASSERT(alignment == PAGESIZE); + + result = contig_alloc((size_t)size, &attr, alignment, cansleep); + + if (result != NULL && (flags & M_ZERO) != 0) { + bzero(result, size); + } + return (result); +} + +void +contigfree(void *addr, unsigned long size, struct malloc_type *type) +{ + contig_free(addr, size); } void mtx_init(struct mtx *mtx, char *name, const char *type_name, int opts) { - if (opts & MTX_SPIN) { - mutex_init(&mtx->m, name, MUTEX_SPIN, - (ddi_iblock_cookie_t)ipltospl(DISP_LEVEL)); - } else { - mutex_init(&mtx->m, name, MUTEX_DRIVER, NULL); - } + /* + * Requests that a mutex be initialized to the MTX_SPIN type are + * ignored. The limitations which may have required spinlocks on + * FreeBSD do not apply to how bhyve has been structured here. + * + * Adaptive mutexes are required to avoid deadlocks when certain + * cyclics behavior interacts with interrupts and contended locks. + */ + mutex_init(&mtx->m, name, MUTEX_ADAPTIVE, NULL); } void @@ -202,130 +291,14 @@ void critical_enter(void) { kpreempt_disable(); - thread_affinity_set(curthread, CPU_CURRENT); } void critical_exit(void) { - thread_affinity_clear(curthread); kpreempt_enable(); } -struct unr { - u_int item; - struct unr *link; -}; - -#define UNR_HASHSIZE 8 - -struct unrhdr { - struct mtx *mtx; - struct unr *hash[UNR_HASHSIZE]; - u_int min; - u_int max; - u_int next; -}; - -#define HASH_UNR(uh, i) ((uh)->hash[(i) & ((UNR_HASHSIZE) - 1)]) - -static struct mtx unr_mtx; - -/* - * Allocate a new unrheader set. - * - * Highest and lowest valid values given as parameters. 
- */ -struct unrhdr * -new_unrhdr(int low, int high, struct mtx *mtx) -{ - struct unrhdr *uh; - - uh = kmem_zalloc(sizeof (struct unrhdr), KM_SLEEP); - if (mtx) { - uh->mtx = mtx; - } else { - uh->mtx = &unr_mtx; - } - uh->min = low; - uh->max = high; - uh->next = uh->min; - - return (uh); -} - -void -delete_unrhdr(struct unrhdr *uh) -{ - kmem_free(uh, sizeof (struct unrhdr)); -} - -static struct unr * -unr_lookup(struct unrhdr *uh, int item) -{ - struct unr *unr; - - ASSERT(MUTEX_HELD(&uh->mtx->m)); - - for (unr = HASH_UNR(uh, item); unr != NULL; unr = unr->link) { - if (unr->item == item) - break; - } - - return (unr); -} - -int -alloc_unr(struct unrhdr *uh) -{ - struct unr *unr; - int item, start; - - mutex_enter(&uh->mtx->m); - start = uh->next; - for (;;) { - item = uh->next; - if (++uh->next == uh->max) { - uh->next = uh->min; - } - - if (unr_lookup(uh, item) == NULL) { - unr = kmem_zalloc(sizeof (struct unr), KM_SLEEP); - unr->item = item; - unr->link = HASH_UNR(uh, item); - HASH_UNR(uh, item) = unr; - break; - } - - if (item == start) { - item = -1; - break; - } - } - mutex_exit(&uh->mtx->m); - - return (item); -} - -void -free_unr(struct unrhdr *uh, u_int item) -{ - struct unr *unr, **unrp; - - mutex_enter(&uh->mtx->m); - unrp = &HASH_UNR(uh, item); - for (;;) { - ASSERT(*unrp != NULL); - if ((*unrp)->item == item) - break; - unrp = &(*unrp)->link; - } - unr = *unrp; - *unrp = unr->link; - mutex_exit(&uh->mtx->m); - kmem_free(unr, sizeof(struct unr)); -} - static void vmm_glue_callout_handler(void *arg) @@ -351,25 +324,43 @@ vmm_glue_callout_init(struct callout *c, int mpsafe) when.cyt_interval = CY_INFINITY; mutex_enter(&cpu_lock); - c->c_cyc_id = cyclic_add(&hdlr, &when); +#if 0 + /* + * XXXJOY: according to the freebsd sources, callouts do not begin + * their life in the ACTIVE state. + */ c->c_flags |= CALLOUT_ACTIVE; +#else + bzero(c, sizeof (*c)); +#endif + c->c_cyc_id = cyclic_add(&hdlr, &when); mutex_exit(&cpu_lock); } +static __inline hrtime_t +sbttohrtime(sbintime_t sbt) +{ + return (((sbt >> 32) * NANOSEC) + + (((uint64_t)NANOSEC * (uint32_t)sbt) >> 32)); +} + int vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, sbintime_t pr, void (*func)(void *), void *arg, int flags) { + hrtime_t target = sbttohrtime(sbt); + ASSERT(c->c_cyc_id != CYCLIC_NONE); c->c_func = func; c->c_arg = arg; c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING); - if (flags & C_ABSOLUTE) - cyclic_reprogram(c->c_cyc_id, sbt); - else - cyclic_reprogram(c->c_cyc_id, sbt + gethrtime()); + if (flags & C_ABSOLUTE) { + cyclic_reprogram(c->c_cyc_id, target); + } else { + cyclic_reprogram(c->c_cyc_id, target + gethrtime()); + } return (0); } @@ -397,201 +388,24 @@ vmm_glue_callout_drain(struct callout *c) return (0); } -static int -ipi_cpu_justreturn(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3) -{ - return (0); -} - -void -ipi_cpu(int cpu, u_int ipi) -{ - cpuset_t set; - - CPUSET_ONLY(set, cpu); - xc_call_nowait(NULL, NULL, NULL, CPUSET2BV(set), - ipi_cpu_justreturn); -} - -#define SC_TABLESIZE 256 /* Must be power of 2. */ -#define SC_MASK (SC_TABLESIZE - 1) -#define SC_SHIFT 8 -#define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \ - SC_MASK) -#define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)] - -struct sleepqueue { - u_int sq_blockedcnt; /* Num. of blocked threads. */ - LIST_ENTRY(sleepqueue) sq_hash; /* Chain. */ - void *sq_wchan; /* Wait channel. */ - kcondvar_t sq_cv; -}; - -struct sleepqueue_chain { - LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. 
*/ - struct mtx sc_lock; /* Spin lock for this chain. */ -}; - -static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE]; - -#define SLEEPQ_CACHE_SZ (64) -static kmem_cache_t *vmm_sleepq_cache; - -static int -vmm_sleepq_cache_init(void *buf, void *user_arg, int kmflags) -{ - struct sleepqueue *sq = (struct sleepqueue *)buf; - - bzero(sq, sizeof (struct sleepqueue)); - cv_init(&sq->sq_cv, NULL, CV_DRIVER, NULL); - - return (0); -} - -static void -vmm_sleepq_cache_fini(void *buf, void *user_arg) -{ - struct sleepqueue *sq = (struct sleepqueue *)buf; - cv_destroy(&sq->sq_cv); -} - -static void -init_sleepqueues(void) -{ - int i; - - for (i = 0; i < SC_TABLESIZE; i++) { - LIST_INIT(&sleepq_chains[i].sc_queues); - mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL, - MTX_SPIN); - } - - vmm_sleepq_cache = kmem_cache_create("vmm_sleepq_cache", - sizeof (struct sleepqueue), SLEEPQ_CACHE_SZ, vmm_sleepq_cache_init, - vmm_sleepq_cache_fini, NULL, NULL, NULL, 0); - -} - -/* - * Lock the sleep queue chain associated with the specified wait channel. - */ -static void -sleepq_lock(void *wchan) -{ - struct sleepqueue_chain *sc; - - sc = SC_LOOKUP(wchan); - mtx_lock_spin(&sc->sc_lock); -} - -/* - * Look up the sleep queue associated with a given wait channel in the hash - * table locking the associated sleep queue chain. If no queue is found in - * the table, NULL is returned. - */ -static struct sleepqueue * -sleepq_lookup(void *wchan) -{ - struct sleepqueue_chain *sc; - struct sleepqueue *sq; - - KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); - sc = SC_LOOKUP(wchan); - mtx_assert(&sc->sc_lock, MA_OWNED); - LIST_FOREACH(sq, &sc->sc_queues, sq_hash) - if (sq->sq_wchan == wchan) - return (sq); - return (NULL); -} - -/* - * Unlock the sleep queue chain associated with a given wait channel. - */ -static void -sleepq_release(void *wchan) -{ - struct sleepqueue_chain *sc; - - sc = SC_LOOKUP(wchan); - mtx_unlock_spin(&sc->sc_lock); -} - -struct sleepqueue * -sleepq_add(void *wchan) -{ - struct sleepqueue_chain *sc; - struct sleepqueue *sq; - - sc = SC_LOOKUP(wchan); - - /* Look up the sleep queue associated with the wait channel 'wchan'. 
*/ - sq = sleepq_lookup(wchan); - - if (sq == NULL) { - sq = kmem_cache_alloc(vmm_sleepq_cache, KM_SLEEP); - LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash); - sq->sq_wchan = wchan; - } - - sq->sq_blockedcnt++; - - return (sq); -} - -void -sleepq_remove(struct sleepqueue *sq) -{ - sq->sq_blockedcnt--; - - if (sq->sq_blockedcnt == 0) { - LIST_REMOVE(sq, sq_hash); - kmem_cache_free(vmm_sleepq_cache, sq); - } -} - -int -msleep_spin(void *chan, struct mtx *mtx, const char *wmesg, int ticks) -{ - struct sleepqueue *sq; - int error; - - sleepq_lock(chan); - sq = sleepq_add(chan); - sleepq_release(chan); - - cv_reltimedwait(&sq->sq_cv, &mtx->m, ticks, TR_CLOCK_TICK); - - sleepq_lock(chan); - sleepq_remove(sq); - sleepq_release(chan); - - return (error); -} - void -wakeup(void *chan) +vmm_glue_callout_localize(struct callout *c) { - struct sleepqueue *sq; - - sleepq_lock(chan); - sq = sleepq_lookup(chan); - if (sq != NULL) { - cv_broadcast(&sq->sq_cv); - } - sleepq_release(chan); + mutex_enter(&cpu_lock); + cyclic_move_here(c->c_cyc_id); + mutex_exit(&cpu_lock); } void -wakeup_one(void *chan) +ipi_cpu(int cpu, u_int ipi) { - struct sleepqueue *sq; - - sleepq_lock(chan); - sq = sleepq_lookup(chan); - if (sq != NULL) { - cv_signal(&sq->sq_cv); - } - sleepq_release(chan); + /* + * This was previously implemented as an invocation of asynchronous + * no-op crosscalls to interrupt the target CPU. Since even nowait + * crosscalls can block in certain circumstances, a direct poke_cpu() + * is safer when called from delicate contexts. + */ + poke_cpu(cpu); } u_int cpu_high; /* Highest arg to CPUID */ @@ -618,162 +432,257 @@ vmm_cpuid_init(void) cpu_exthigh = regs[0]; } -struct savefpu { - fpu_ctx_t fsa_fp_ctx; -}; - -static vmem_t *fpu_save_area_arena; - -static void -fpu_save_area_init(void) -{ - fpu_save_area_arena = vmem_create("fpu_save_area", - NULL, 0, XSAVE_AREA_ALIGN, - segkmem_alloc, segkmem_free, heap_arena, 0, VM_BESTFIT | VM_SLEEP); -} - -static void -fpu_save_area_cleanup(void) -{ - vmem_destroy(fpu_save_area_arena); -} - +/* + * FreeBSD uses the struct savefpu for managing the FPU state. That is mimicked + * by our hypervisor multiplexor framework structure. + */ struct savefpu * fpu_save_area_alloc(void) { - return (vmem_alloc(fpu_save_area_arena, sizeof (struct savefpu), - VM_SLEEP)); + return ((struct savefpu *)hma_fpu_alloc(KM_SLEEP)); } void fpu_save_area_free(struct savefpu *fsa) { - vmem_free(fpu_save_area_arena, fsa, sizeof (struct savefpu)); + hma_fpu_t *fpu = (hma_fpu_t *)fsa; + hma_fpu_free(fpu); } void fpu_save_area_reset(struct savefpu *fsa) { - extern const struct fxsave_state sse_initial; - extern const struct xsave_state avx_initial; - struct fpu_ctx *fp; - struct fxsave_state *fx; - struct xsave_state *xs; - - fp = &fsa->fsa_fp_ctx; - - fp->fpu_regs.kfpu_status = 0; - fp->fpu_regs.kfpu_xstatus = 0; - - switch (fp_save_mech) { - case FP_FXSAVE: - fx = &fp->fpu_regs.kfpu_u.kfpu_fx; - bcopy(&sse_initial, fx, sizeof (*fx)); - break; - case FP_XSAVE: - fp->fpu_xsave_mask = (XFEATURE_ENABLED_X87 | - XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX); - xs = &fp->fpu_regs.kfpu_u.kfpu_xs; - bcopy(&avx_initial, xs, sizeof (*xs)); - break; - default: - panic("Invalid fp_save_mech"); - /*NOTREACHED*/ - } + hma_fpu_t *fpu = (hma_fpu_t *)fsa; + hma_fpu_init(fpu); } +/* + * This glue function is supposed to save the host's FPU state. This is always + * paired in the general bhyve code with a call to fpusave. 
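+ * (Of that pair, only fpusave() is handed the guest FPU context;
+ * fpuexit() receives just the thread.)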
Therefore, we treat + * this as a nop and do all the work in fpusave(), which will have the context + * argument that we want anyways. + */ void fpuexit(kthread_t *td) { - fp_save(&curthread->t_lwp->lwp_pcb.pcb_fpu); } -static __inline void -vmm_fxrstor(struct fxsave_state *addr) +/* + * This glue function is supposed to restore the guest's FPU state from the save + * area back to the host. In FreeBSD, it is assumed that the host state has + * already been saved by a call to fpuexit(); however, we do both here. + */ +void +fpurestore(void *arg) { - __asm __volatile("fxrstor %0" : : "m" (*(addr))); -} + hma_fpu_t *fpu = arg; -static __inline void -vmm_fxsave(struct fxsave_state *addr) -{ - __asm __volatile("fxsave %0" : "=m" (*(addr))); + hma_fpu_start_guest(fpu); } -static __inline void -vmm_xrstor(struct xsave_state *addr, uint64_t mask) +/* + * This glue function is supposed to save the guest's FPU state. The host's FPU + * state is not expected to be restored necessarily due to the use of FPU + * emulation through CR0.TS. However, we can and do restore it here. + */ +void +fpusave(void *arg) { - uint32_t low, hi; + hma_fpu_t *fpu = arg; - low = mask; - hi = mask >> 32; - __asm __volatile("xrstor %0" : : "m" (*addr), "a" (low), "d" (hi)); + hma_fpu_stop_guest(fpu); } -static __inline void -vmm_xsave(struct xsave_state *addr, uint64_t mask) +void +vmm_sol_glue_init(void) { - uint32_t low, hi; - - low = mask; - hi = mask >> 32; - __asm __volatile("xsave %0" : "=m" (*addr) : "a" (low), "d" (hi) : - "memory"); + vmm_alloc_init(); + vmm_cpuid_init(); } void -fpurestore(void *arg) +vmm_sol_glue_cleanup(void) { - struct savefpu *fsa = (struct savefpu *)arg; - struct fpu_ctx *fp; - - fp = &fsa->fsa_fp_ctx; - - switch (fp_save_mech) { - case FP_FXSAVE: - vmm_fxrstor(&fp->fpu_regs.kfpu_u.kfpu_fx); - break; - case FP_XSAVE: - vmm_xrstor(&fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); - break; - default: - panic("Invalid fp_save_mech"); - /*NOTREACHED*/ - } + vmm_alloc_cleanup(); } -void -fpusave(void *arg) + +/* From FreeBSD's sys/kern/subr_clock.c */ + +/*- + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1982, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: Utah $Hdr: clock.c 1.18 91/01/21$ + * from: @(#)clock.c 8.2 (Berkeley) 1/12/94 + * from: NetBSD: clock_subr.c,v 1.6 2001/07/07 17:04:02 thorpej Exp + * and + * from: src/sys/i386/isa/clock.c,v 1.176 2001/09/04 + */ + +#include <sys/clock.h> + +/*--------------------------------------------------------------------* + * Generic routines to convert between a POSIX date + * (seconds since 1/1/1970) and yr/mo/day/hr/min/sec + * Derived from NetBSD arch/hp300/hp300/clock.c + */ + +#define FEBRUARY 2 +#define days_in_year(y) (leapyear(y) ? 366 : 365) +#define days_in_month(y, m) \ + (month_days[(m) - 1] + (m == FEBRUARY ? leapyear(y) : 0)) +/* Day of week. Days are counted from 1/1/1970, which was a Thursday */ +#define day_of_week(days) (((days) + 4) % 7) + +static const int month_days[12] = { + 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31 +}; + + +/* + * This inline avoids some unnecessary modulo operations + * as compared with the usual macro: + * ( ((year % 4) == 0 && + * (year % 100) != 0) || + * ((year % 400) == 0) ) + * It is otherwise equivalent. + */ +static int +leapyear(int year) { - struct savefpu *fsa = (struct savefpu *)arg; - struct fpu_ctx *fp; - - fp = &fsa->fsa_fp_ctx; - - switch (fp_save_mech) { - case FP_FXSAVE: - vmm_fxsave(&fp->fpu_regs.kfpu_u.kfpu_fx); - break; - case FP_XSAVE: - vmm_xsave(&fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask); - break; - default: - panic("Invalid fp_save_mech"); - /*NOTREACHED*/ + int rv = 0; + + if ((year & 3) == 0) { + rv = 1; + if ((year % 100) == 0) { + rv = 0; + if ((year % 400) == 0) + rv = 1; + } } + return (rv); } -void -vmm_sol_glue_init(void) +int +clock_ct_to_ts(struct clocktime *ct, struct timespec *ts) { - vmm_cpuid_init(); - fpu_save_area_init(); - init_sleepqueues(); + int i, year, days; + + year = ct->year; + +#ifdef __FreeBSD__ + if (ct_debug) { + printf("ct_to_ts("); + print_ct(ct); + printf(")"); + } +#endif + + /* Sanity checks. */ + if (ct->mon < 1 || ct->mon > 12 || ct->day < 1 || + ct->day > days_in_month(year, ct->mon) || + ct->hour > 23 || ct->min > 59 || ct->sec > 59 || + (sizeof(time_t) == 4 && year > 2037)) { /* time_t overflow */ +#ifdef __FreeBSD__ + if (ct_debug) + printf(" = EINVAL\n"); +#endif + return (EINVAL); + } + + /* + * Compute days since start of time + * First from years, then from months. 
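+	 *
+	 * E.g. 1972-03-02 yields 365 + 365 days for 1970-1971, plus 31 + 29
+	 * for January and February of leap year 1972, plus (2 - 1): day 791.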
+ */ + days = 0; + for (i = POSIX_BASE_YEAR; i < year; i++) + days += days_in_year(i); + + /* Months */ + for (i = 1; i < ct->mon; i++) + days += days_in_month(year, i); + days += (ct->day - 1); + + ts->tv_sec = (((time_t)days * 24 + ct->hour) * 60 + ct->min) * 60 + + ct->sec; + ts->tv_nsec = ct->nsec; + +#ifdef __FreeBSD__ + if (ct_debug) + printf(" = %ld.%09ld\n", (long)ts->tv_sec, (long)ts->tv_nsec); +#endif + return (0); } void -vmm_sol_glue_cleanup(void) -{ - fpu_save_area_cleanup(); - kmem_cache_destroy(vmm_sleepq_cache); +clock_ts_to_ct(struct timespec *ts, struct clocktime *ct) +{ + int i, year, days; + time_t rsec; /* remainder seconds */ + time_t secs; + + secs = ts->tv_sec; + days = secs / SECDAY; + rsec = secs % SECDAY; + + ct->dow = day_of_week(days); + + /* Subtract out whole years, counting them in i. */ + for (year = POSIX_BASE_YEAR; days >= days_in_year(year); year++) + days -= days_in_year(year); + ct->year = year; + + /* Subtract out whole months, counting them in i. */ + for (i = 1; days >= days_in_month(year, i); i++) + days -= days_in_month(year, i); + ct->mon = i; + + /* Days are what is left over (+1) from all that. */ + ct->day = days + 1; + + /* Hours, minutes, seconds are easy */ + ct->hour = rsec / 3600; + rsec = rsec % 3600; + ct->min = rsec / 60; + rsec = rsec % 60; + ct->sec = rsec; + ct->nsec = ts->tv_nsec; +#ifdef __FreeBSD__ + if (ct_debug) { + printf("ts_to_ct(%ld.%09ld) = ", + (long)ts->tv_sec, (long)ts->tv_nsec); + print_ct(ct); + printf("\n"); + } +#endif } diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c deleted file mode 100644 index 3bb5412d16..0000000000 --- a/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c +++ /dev/null @@ -1,111 +0,0 @@ -/*- - * Copyright (c) 2011 NetApp, Inc. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * - * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * $FreeBSD: head/sys/amd64/vmm/vmm_mem.c 245678 2013-01-20 03:42:49Z neel $ - */ -/* - * This file and its contents are supplied under the terms of the - * Common Development and Distribution License ("CDDL"), version 1.0. - * You may only use this file in accordance with the terms of version - * 1.0 of the CDDL. - * - * A full copy of the text of the CDDL should have accompanied this - * source. 
A copy of the CDDL is also available via the Internet at - * http://www.illumos.org/license/CDDL. - * - * Copyright 2013 Pluribus Networks Inc. - */ - -#include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_mem.c 245678 2013-01-20 03:42:49Z neel $"); - -#include <sys/param.h> -#include <sys/lock.h> -#include <sys/mutex.h> -#include <sys/systm.h> -#include <sys/malloc.h> -#include <sys/kernel.h> - -#include <vm/vm.h> -#include <machine/pmap.h> - -#include <sys/ddi.h> - -#include "vmm_util.h" -#include "vmm_mem.h" - -int -vmm_mem_init(void) -{ - return (0); -} - -vm_paddr_t -vmm_mem_alloc(size_t size) -{ - clock_t usec = 2 * 1000000; - vm_paddr_t pa; - caddr_t addr; - - if (size != PAGE_SIZE) - panic("vmm_mem_alloc: invalid allocation size %lu", size); - - while (usec > 0) { - if ((addr = kmem_zalloc(PAGE_SIZE, KM_NOSLEEP)) != NULL) { - ASSERT(((uintptr_t)addr & PAGE_MASK) == 0); - pa = vtophys((vm_offset_t)addr); - return (pa); - } - delay(drv_usectohz((clock_t)500000)); - usec -= 500000; - } - - return (NULL); -} - -void -vmm_mem_free(vm_paddr_t base, size_t length) -{ - page_t *pp; - - if (base & PAGE_MASK) { - panic("vmm_mem_free: base 0x%0lx must be aligned on a " - "0x%0x boundary\n", base, PAGE_SIZE); - } - - if (length != PAGE_SIZE) { - panic("vmm_mem_free: invalid length %lu", length); - } - - pp = page_numtopp_nolock(btop(base)); - kmem_free((void *)pp->p_offset, PAGE_SIZE); -} - -vm_paddr_t -vmm_mem_maxaddr(void) -{ - - return (ptob(physmax + 1)); -} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c new file mode 100644 index 0000000000..d630d32630 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_rvi.c @@ -0,0 +1,297 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/machsystm.h> +#include <sys/x86_archext.h> + +#include <sys/gipt.h> +#include <vm/vm_glue.h> + + +struct rvi_map { + gipt_map_t rm_gipt; + uint64_t rm_wired_page_count; +}; +typedef struct rvi_map rvi_map_t; + +#define RVI_LOCK(m) (&(m)->rm_gipt.giptm_lock) + +#define RVI_MAX_LEVELS 4 + +CTASSERT(RVI_MAX_LEVELS <= GIPT_MAX_LEVELS); + +#define RVI_PRESENT PT_VALID +#define RVI_WRITABLE PT_WRITABLE +#define RVI_ACCESSED PT_REF +#define RVI_DIRTY PT_MOD +#define RVI_LGPG PT_PAGESIZE +#define RVI_NX PT_NX +#define RVI_USER PT_USER +#define RVI_PWT PT_WRITETHRU +#define RVI_PCD PT_NOCACHE + +#define RVI_PA_MASK PT_PADDR + +#define RVI_PAT(attr) rvi_attr_to_pat(attr) +#define RVI_PADDR(addr) ((addr) & RVI_PA_MASK) +#define RVI_PROT(prot) \ + ((((prot) & PROT_WRITE) != 0 ? RVI_WRITABLE : 0) | \ + (((prot) & PROT_EXEC) == 0 ? RVI_NX : 0)) + +#define RVI_IS_ABSENT(pte) (((pte) & RVI_PRESENT) == 0) +#define RVI_PTE_PFN(pte) mmu_btop(RVI_PADDR(pte)) +#define RVI_MAPS_PAGE(pte, lvl) \ + (!RVI_IS_ABSENT(pte) && (((pte) & RVI_LGPG) != 0 || (lvl) == 0)) +#define RVI_PTE_PROT(pte) \ + (RVI_IS_ABSENT(pte) ? 0 : ( \ + PROT_READ | \ + (((pte) & RVI_NX) == 0 ? PROT_EXEC : 0) | \ + (((pte) & RVI_WRITABLE) != 0 ? 
PROT_WRITE : 0))) + +#define RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr) \ + (RVI_PADDR(pfn_to_pa(pfn)) | \ + (((lvl) != 0) ? RVI_LGPG : 0) | \ + RVI_USER | RVI_ACCESSED | RVI_PRESENT | \ + RVI_PAT(attr) | \ + RVI_PROT(prot)) + +#define RVI_PTE_ASSIGN_TABLE(pfn) \ + (RVI_PADDR(pfn_to_pa(pfn)) | \ + RVI_USER | RVI_ACCESSED | RVI_PRESENT | \ + RVI_PAT(MTRR_TYPE_WB) | \ + RVI_PROT(PROT_READ | PROT_WRITE | PROT_EXEC)) + + +/* Make sure that PAT indexes line up as expected */ +CTASSERT((PAT_DEFAULT_ATTRIBUTE & 0xf) == MTRR_TYPE_WB); +CTASSERT(((PAT_DEFAULT_ATTRIBUTE >> 24) & 0xf) == MTRR_TYPE_UC); + +static inline uint64_t +rvi_attr_to_pat(const uint8_t attr) +{ + if (attr == MTRR_TYPE_UC) { + /* !PAT + PCD + PWT -> PAT3 -> MTRR_TYPE_UC */ + return (RVI_PCD|RVI_PWT); + } else if (attr == MTRR_TYPE_WB) { + /* !PAT + !PCD + !PWT -> PAT0 -> MTRR_TYPE_WB */ + return (0); + } + + panic("unexpected memattr %x", attr); + return (0); +} + +static gipt_pte_type_t +rvi_pte_type(uint64_t pte, uint_t level) +{ + if (RVI_IS_ABSENT(pte)) { + return (PTET_EMPTY); + } else if (RVI_MAPS_PAGE(pte, level)) { + return (PTET_PAGE); + } else { + return (PTET_LINK); + } +} + +static uint64_t +rvi_pte_map(uint64_t pfn) +{ + return (RVI_PTE_ASSIGN_TABLE(pfn)); +} + +static void * +rvi_create(uintptr_t *pml4_kaddr) +{ + rvi_map_t *rmap; + gipt_map_t *map; + gipt_t *root; + struct gipt_cbs cbs = { + .giptc_pte_type = rvi_pte_type, + .giptc_pte_map = rvi_pte_map, + }; + + rmap = kmem_zalloc(sizeof (*rmap), KM_SLEEP); + map = &rmap->rm_gipt; + root = gipt_alloc(); + root->gipt_level = RVI_MAX_LEVELS - 1; + gipt_map_init(map, RVI_MAX_LEVELS, GIPT_HASH_SIZE_DEFAULT, &cbs, root); + + *pml4_kaddr = (uintptr_t)root->gipt_kva; + return (rmap); +} + +static void +rvi_destroy(void *arg) +{ + rvi_map_t *rmap = arg; + + if (rmap != NULL) { + gipt_map_t *map = &rmap->rm_gipt; + + gipt_map_fini(map); + kmem_free(rmap, sizeof (*rmap)); + } +} + +static uint64_t +rvi_wired_count(void *arg) +{ + rvi_map_t *rmap = arg; + uint64_t res; + + mutex_enter(RVI_LOCK(rmap)); + res = rmap->rm_wired_page_count; + mutex_exit(RVI_LOCK(rmap)); + + return (res); +} + +static int +rvi_is_wired(void *arg, uint64_t va, uint_t *protp) +{ + rvi_map_t *rmap = arg; + gipt_t *pt; + int rv = -1; + + mutex_enter(RVI_LOCK(rmap)); + pt = gipt_map_lookup_deepest(&rmap->rm_gipt, va); + if (pt != NULL) { + const uint64_t pte = GIPT_VA2PTE(pt, va); + + if (RVI_MAPS_PAGE(pte, pt->gipt_level)) { + *protp = RVI_PTE_PROT(pte); + rv = 0; + } + } + mutex_exit(RVI_LOCK(rmap)); + + return (rv); +} + +static int +rvi_map(void *arg, uint64_t va, pfn_t pfn, uint_t lvl, uint_t prot, + uint8_t attr) +{ + rvi_map_t *rmap = arg; + gipt_map_t *map = &rmap->rm_gipt; + gipt_t *pt; + uint64_t *ptep, pte; + + ASSERT((prot & PROT_READ) != 0); + ASSERT3U((prot & ~(PROT_READ | PROT_WRITE | PROT_EXEC)), ==, 0); + ASSERT3U(lvl, <, RVI_MAX_LEVELS); + + mutex_enter(RVI_LOCK(rmap)); + pt = gipt_map_lookup(map, va, lvl); + if (pt == NULL) { + /* + * A table at the appropriate VA/level that would house this + * mapping does not currently exist. Try to walk down to that + * point, creating any necessary parent(s). + */ + pt = gipt_map_create_parents(map, va, lvl); + + /* + * There was a large page mapping in the way of creating the + * necessary parent table(s). 
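+		 * Callers are expected to have removed any such mapping
+		 * beforehand, so the condition is treated as fatal below.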
+ */ + if (pt == NULL) { + panic("unexpected large page @ %08lx", va); + } + } + ptep = GIPT_VA2PTEP(pt, va); + + pte = *ptep; + if (!RVI_IS_ABSENT(pte)) { + if (!RVI_MAPS_PAGE(pte, lvl)) { + panic("unexpected PT link @ %08lx in %p", va, pt); + } else { + panic("unexpected page mapped @ %08lx in %p", va, pt); + } + } + + pte = RVI_PTE_ASSIGN_PAGE(lvl, pfn, prot, attr); + *ptep = pte; + pt->gipt_valid_cnt++; + rmap->rm_wired_page_count += gipt_level_count[lvl]; + + mutex_exit(RVI_LOCK(rmap)); + return (0); +} + +static uint64_t +rvi_unmap(void *arg, uint64_t va, uint64_t end_va) +{ + rvi_map_t *rmap = arg; + gipt_map_t *map = &rmap->rm_gipt; + gipt_t *pt; + uint64_t cur_va = va; + uint64_t unmapped = 0; + + mutex_enter(RVI_LOCK(rmap)); + + pt = gipt_map_lookup_deepest(map, cur_va); + if (pt == NULL) { + mutex_exit(RVI_LOCK(rmap)); + return (0); + } + if (!RVI_MAPS_PAGE(GIPT_VA2PTE(pt, cur_va), pt->gipt_level)) { + cur_va = gipt_map_next_page(map, cur_va, end_va, &pt); + if (cur_va == 0) { + mutex_exit(RVI_LOCK(rmap)); + return (0); + } + } + + while (cur_va < end_va) { + uint64_t *ptep = GIPT_VA2PTEP(pt, cur_va); + const uint_t lvl = pt->gipt_level; + + ASSERT(RVI_MAPS_PAGE(*ptep, lvl)); + *ptep = 0; + pt->gipt_valid_cnt--; + unmapped += gipt_level_count[pt->gipt_level]; + + gipt_t *next_pt = pt; + uint64_t next_va; + next_va = gipt_map_next_page(map, cur_va, end_va, &next_pt); + + if (pt->gipt_valid_cnt == 0) { + gipt_map_clean_parents(map, pt); + } + if (next_va == 0) { + break; + } + pt = next_pt; + cur_va = next_va; + } + rmap->rm_wired_page_count -= unmapped; + + mutex_exit(RVI_LOCK(rmap)); + + return (unmapped); +} + +struct vmm_pt_ops rvi_ops = { + .vpo_init = rvi_create, + .vpo_free = rvi_destroy, + .vpo_wired_cnt = rvi_wired_count, + .vpo_is_wired = rvi_is_wired, + .vpo_map = rvi_map, + .vpo_unmap = rvi_unmap, +}; diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c new file mode 100644 index 0000000000..66a67d9529 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_vm.c @@ -0,0 +1,1016 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. 
+ */ + +#include <sys/param.h> +#include <sys/kmem.h> +#include <sys/thread.h> +#include <sys/list.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/ddi.h> +#include <sys/sysmacros.h> +#include <sys/machsystm.h> +#include <sys/vmsystm.h> +#include <sys/malloc.h> +#include <sys/x86_archext.h> +#include <vm/as.h> +#include <vm/seg_vn.h> +#include <vm/seg_kmem.h> +#include <vm/seg_vmm.h> + +#include <vm/vm_extern.h> +#include <vm/vm_map.h> +#include "vm/vm_glue.h" + +#define PMAP_TO_VMMAP(pm) ((vm_map_t) \ + ((caddr_t)(pm) - offsetof(struct vmspace, vms_pmap))) +#define VMMAP_TO_VMSPACE(vmmap) ((struct vmspace *) \ + ((caddr_t)(vmmap) - offsetof(struct vmspace, vm_map))) + + +struct vmspace_mapping { + list_node_t vmsm_node; + vm_object_t vmsm_object; + uintptr_t vmsm_addr; + size_t vmsm_len; + off_t vmsm_offset; + uint_t vmsm_prot; +}; +typedef struct vmspace_mapping vmspace_mapping_t; + +#define VMSM_OFFSET(vmsm, addr) ( \ + (vmsm)->vmsm_offset + \ + ((addr) - (uintptr_t)(vmsm)->vmsm_addr)) + + +/* Private glue interfaces */ +static void pmap_free(pmap_t); +static vmspace_mapping_t *vm_mapping_find(struct vmspace *, uintptr_t, size_t, + boolean_t); +static void vm_mapping_remove(struct vmspace *, vmspace_mapping_t *); + +static vmem_t *vmm_alloc_arena = NULL; + +static void * +vmm_arena_alloc(vmem_t *vmp, size_t size, int vmflag) +{ + return (segkmem_xalloc(vmp, NULL, size, vmflag, 0, + segkmem_page_create, &kvps[KV_VVP])); +} + +static void +vmm_arena_free(vmem_t *vmp, void *inaddr, size_t size) +{ + segkmem_xfree(vmp, inaddr, size, &kvps[KV_VVP], NULL); +} + +void +vmm_arena_init(void) +{ + vmm_alloc_arena = vmem_create("vmm_alloc_arena", NULL, 0, 1024 * 1024, + vmm_arena_alloc, vmm_arena_free, kvmm_arena, 0, VM_SLEEP); + + ASSERT(vmm_alloc_arena != NULL); +} + +void +vmm_arena_fini(void) +{ + VERIFY(vmem_size(vmm_alloc_arena, VMEM_ALLOC) == 0); + vmem_destroy(vmm_alloc_arena); + vmm_alloc_arena = NULL; +} + +struct vmspace * +vmspace_alloc(vm_offset_t start, vm_offset_t end, pmap_pinit_t pinit) +{ + struct vmspace *vms; + const uintptr_t size = end + 1; + + /* + * This whole mess is built on the assumption that a 64-bit address + * space is available to work with for the various pagetable tricks. + */ + VERIFY(ttoproc(curthread)->p_model == DATAMODEL_LP64); + VERIFY(start == 0 && size > 0 && (size & PAGEOFFSET) == 0 && + size <= (uintptr_t)USERLIMIT); + + vms = kmem_zalloc(sizeof (*vms), KM_SLEEP); + vms->vms_size = size; + list_create(&vms->vms_maplist, sizeof (vmspace_mapping_t), + offsetof(vmspace_mapping_t, vmsm_node)); + + if (pinit(&vms->vms_pmap) == 0) { + kmem_free(vms, sizeof (*vms)); + return (NULL); + } + + return (vms); +} + +void +vmspace_free(struct vmspace *vms) +{ + VERIFY(list_is_empty(&vms->vms_maplist)); + + pmap_free(&vms->vms_pmap); + kmem_free(vms, sizeof (*vms)); +} + +pmap_t +vmspace_pmap(struct vmspace *vms) +{ + return (&vms->vms_pmap); +} + +long +vmspace_resident_count(struct vmspace *vms) +{ + /* XXXJOY: finish */ + return (0); +} + +void * +vmspace_find_kva(struct vmspace *vms, uintptr_t addr, size_t size) +{ + vmspace_mapping_t *vmsm; + void *result = NULL; + + /* + * Since vmspace_find_kva is provided so that vmm_drv consumers can do + * GPA2KVA translations, it is expected to be called when there is a + * read lock preventing vmspace alterations. As such, it can do the + * lockless vm_mapping_find() lookup. 
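+	 * (The B_TRUE argument below selects exactly that no-lock path.)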
+ */ + vmsm = vm_mapping_find(vms, addr, size, B_TRUE); + if (vmsm != NULL) { + struct vm_object *vmo = vmsm->vmsm_object; + + switch (vmo->vmo_type) { + case OBJT_DEFAULT: + result = (void *)((uintptr_t)vmo->vmo_data + + VMSM_OFFSET(vmsm, addr)); + break; + default: + break; + } + } + + return (result); +} + +static int +vmspace_pmap_iswired(struct vmspace *vms, uintptr_t addr, uint_t *prot) +{ + pmap_t pmap = &vms->vms_pmap; + int rv; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + + rv = pmap->pm_ops->vpo_is_wired(pmap->pm_impl, addr, prot); + return (rv); +} + +static void +pmap_free(pmap_t pmap) +{ + void *pmi = pmap->pm_impl; + struct vmm_pt_ops *ops = pmap->pm_ops; + + pmap->pm_pml4 = NULL; + pmap->pm_impl = NULL; + pmap->pm_ops = NULL; + + ops->vpo_free(pmi); +} + +int +pmap_pinit_type(pmap_t pmap, enum pmap_type type, int flags) +{ + /* For use in vmm only */ + pmap->pm_type = type; + switch (type) { + case PT_EPT: { + struct vmm_pt_ops *ops = &ept_ops; + void *pml4, *pmi; + + pmi = ops->vpo_init((uintptr_t *)&pml4); + + pmap->pm_ops = ops; + pmap->pm_impl = pmi; + pmap->pm_pml4 = pml4; + return (1); + } + case PT_RVI: { + struct vmm_pt_ops *ops = &rvi_ops; + void *pml4, *pmi; + + pmi = ops->vpo_init((uintptr_t *)&pml4); + + pmap->pm_ops = ops; + pmap->pm_impl = pmi; + pmap->pm_pml4 = pml4; + return (1); + } + default: + panic("unsupported pmap type: %x", type); + break; + } + + return (1); +} + +long +pmap_wired_count(pmap_t pmap) +{ + long val; + + val = pmap->pm_ops->vpo_wired_cnt(pmap->pm_impl); + VERIFY3S(val, >=, 0); + + return (val); +} + +int +pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype) +{ + /* Allow the fallback to vm_fault to handle this */ + return (-1); +} + + + +struct sglist_ent { + vm_paddr_t sge_pa; + size_t sge_len; +}; +struct sglist { + kmutex_t sg_lock; + uint_t sg_refcnt; + uint_t sg_len; + uint_t sg_next; + struct sglist_ent sg_entries[]; +}; + +#define SG_SIZE(cnt) (sizeof (struct sglist) + \ + (sizeof (struct sglist_ent) * (cnt))) + +struct sglist * +sglist_alloc(int nseg, int flags) +{ + const size_t sz = SG_SIZE(nseg); + const int flag = (flags & M_WAITOK) ? 
KM_SLEEP : KM_NOSLEEP; + struct sglist *sg; + + ASSERT(nseg > 0); + + sg = kmem_zalloc(sz, flag); + if (sg != NULL) { + sg->sg_len = nseg; + sg->sg_refcnt = 1; + } + return (sg); +} + +void +sglist_free(struct sglist *sg) +{ + size_t sz; + + mutex_enter(&sg->sg_lock); + if (sg->sg_refcnt > 1) { + sg->sg_refcnt--; + mutex_exit(&sg->sg_lock); + return; + } + + VERIFY(sg->sg_refcnt == 1); + sg->sg_refcnt = 0; + sz = SG_SIZE(sg->sg_len); + mutex_exit(&sg->sg_lock); + kmem_free(sg, sz); +} + +int +sglist_append_phys(struct sglist *sg, vm_paddr_t pa, size_t len) +{ + uint_t idx; + struct sglist_ent *ent; + + /* Restrict to page-aligned entries */ + if ((pa & PAGEOFFSET) != 0 || (len & PAGEOFFSET) != 0 || len == 0) { + return (EINVAL); + } + + mutex_enter(&sg->sg_lock); + idx = sg->sg_next; + if (idx >= sg->sg_len) { + mutex_exit(&sg->sg_lock); + return (ENOSPC); + } + + ent = &sg->sg_entries[idx]; + ASSERT(ent->sge_pa == 0 && ent->sge_len == 0); + ent->sge_pa = pa; + ent->sge_len = len; + sg->sg_next++; + + mutex_exit(&sg->sg_lock); + return (0); +} + + +static pfn_t +vm_object_pager_none(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) +{ + panic("bad vm_object pager"); + return (PFN_INVALID); +} + +static pfn_t +vm_object_pager_heap(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) +{ + const uintptr_t kaddr = ALIGN2PAGE((uintptr_t)vmo->vmo_data + off); + uint_t idx, level; + htable_t *ht; + x86pte_t pte; + pfn_t top_pfn, pfn; + + ASSERT(vmo->vmo_type == OBJT_DEFAULT); + ASSERT(off < vmo->vmo_size); + + ht = htable_getpage(kas.a_hat, kaddr, &idx); + if (ht == NULL) { + return (PFN_INVALID); + } + pte = x86pte_get(ht, idx); + if (!PTE_ISPAGE(pte, ht->ht_level)) { + htable_release(ht); + return (PFN_INVALID); + } + + pfn = top_pfn = PTE2PFN(pte, ht->ht_level); + level = ht->ht_level; + if (ht->ht_level > 0) { + pfn += mmu_btop(kaddr & LEVEL_OFFSET((uint_t)ht->ht_level)); + } + htable_release(ht); + + if (lpfn != NULL) { + *lpfn = top_pfn; + } + if (lvl != NULL) { + *lvl = level; + } + return (pfn); +} + +static pfn_t +vm_object_pager_sg(vm_object_t vmo, uintptr_t off, pfn_t *lpfn, uint_t *lvl) +{ + const uintptr_t aoff = ALIGN2PAGE(off); + uint_t level = 0; + uintptr_t pos = 0; + struct sglist *sg; + struct sglist_ent *ent; + pfn_t pfn = PFN_INVALID; + + ASSERT(vmo->vmo_type == OBJT_SG); + ASSERT(off < vmo->vmo_size); + + sg = vmo->vmo_data; + if (sg == NULL) { + return (PFN_INVALID); + } + + ent = &sg->sg_entries[0]; + for (uint_t i = 0; i < sg->sg_next; i++, ent++) { + if (aoff >= pos && aoff < (pos + ent->sge_len)) { + /* XXXJOY: Punt on large pages for now */ + level = 0; + pfn = mmu_btop(ent->sge_pa + (aoff - pos)); + break; + } + pos += ent->sge_len; + } + + if (lpfn != 0) { + *lpfn = pfn; + } + if (lvl != 0) { + *lvl = level; + } + return (pfn); +} + +static void +vm_reserve_pages(size_t npages) +{ + uint_t retries = 60; + int rc; + + mutex_enter(&freemem_lock); + if (availrmem < npages) { + mutex_exit(&freemem_lock); + + /* + * Set needfree and wait for the ZFS ARC reap thread to free up + * some memory. 
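+		 * The loop below polls availrmem once per second for up to 60
+		 * retries, bailing out early if the caller is signalled
+		 * (delay_sig() returns EINTR).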
+ */ + page_needfree(npages); + + mutex_enter(&freemem_lock); + while ((availrmem < npages) && retries-- > 0) { + mutex_exit(&freemem_lock); + rc = delay_sig(drv_usectohz(1 * MICROSEC)); + mutex_enter(&freemem_lock); + + if (rc == EINTR) + break; + } + mutex_exit(&freemem_lock); + + page_needfree(-npages); + } else { + mutex_exit(&freemem_lock); + } +} + +void +vm_object_clear(vm_object_t vmo) +{ + ASSERT(vmo->vmo_type == OBJT_DEFAULT); + + /* XXXJOY: Better zeroing approach? */ + bzero(vmo->vmo_data, vmo->vmo_size); +} + +vm_object_t +vm_object_allocate(objtype_t type, vm_pindex_t psize) +{ + vm_object_t vmo; + const size_t size = ptob((size_t)psize); + + vmo = kmem_alloc(sizeof (*vmo), KM_SLEEP); + mutex_init(&vmo->vmo_lock, NULL, MUTEX_DEFAULT, NULL); + + /* For now, these are to stay fixed after allocation */ + vmo->vmo_type = type; + vmo->vmo_size = size; + vmo->vmo_attr = VM_MEMATTR_DEFAULT; + + switch (type) { + case OBJT_DEFAULT: { + vm_reserve_pages(psize); + + /* XXXJOY: opt-in to larger pages? */ + vmo->vmo_data = vmem_alloc(vmm_alloc_arena, size, KM_NOSLEEP); + if (vmo->vmo_data == NULL) { + mutex_destroy(&vmo->vmo_lock); + kmem_free(vmo, sizeof (*vmo)); + return (NULL); + } + vm_object_clear(vmo); + vmo->vmo_pager = vm_object_pager_heap; + } + break; + case OBJT_SG: + vmo->vmo_data = NULL; + vmo->vmo_pager = vm_object_pager_sg; + break; + default: + panic("Unsupported vm_object type"); + break; + } + + vmo->vmo_refcnt = 1; + return (vmo); +} + +vm_object_t +vm_pager_allocate(objtype_t type, void *handle, vm_ooffset_t size, + vm_prot_t prot, vm_ooffset_t off, void *cred) +{ + struct vm_object *vmo; + struct sglist *sg = (struct sglist *)handle; + + /* XXXJOY: be very restrictive for now */ + VERIFY(type == OBJT_SG); + VERIFY(off == 0); + + vmo = vm_object_allocate(type, size); + vmo->vmo_data = sg; + + mutex_enter(&sg->sg_lock); + VERIFY(sg->sg_refcnt++ >= 1); + mutex_exit(&sg->sg_lock); + + return (vmo); +} + +void +vm_object_deallocate(vm_object_t vmo) +{ + ASSERT(vmo != NULL); + + uint_t ref = atomic_dec_uint_nv(&vmo->vmo_refcnt); + /* underflow would be a deadly serious mistake */ + VERIFY3U(ref, !=, UINT_MAX); + if (ref != 0) { + return; + } + + switch (vmo->vmo_type) { + case OBJT_DEFAULT: + vmem_free(vmm_alloc_arena, vmo->vmo_data, vmo->vmo_size); + break; + case OBJT_SG: + sglist_free((struct sglist *)vmo->vmo_data); + break; + default: + panic("Unsupported vm_object type"); + break; + } + + vmo->vmo_pager = vm_object_pager_none; + vmo->vmo_data = NULL; + vmo->vmo_size = 0; + mutex_destroy(&vmo->vmo_lock); + kmem_free(vmo, sizeof (*vmo)); +} + +CTASSERT(VM_MEMATTR_UNCACHEABLE == MTRR_TYPE_UC); +CTASSERT(VM_MEMATTR_WRITE_BACK == MTRR_TYPE_WB); +int +vm_object_set_memattr(vm_object_t vmo, vm_memattr_t attr) +{ + ASSERT(MUTEX_HELD(&vmo->vmo_lock)); + + switch (attr) { + case VM_MEMATTR_UNCACHEABLE: + case VM_MEMATTR_WRITE_BACK: + vmo->vmo_attr = attr; + return (0); + default: + break; + } + return (EINVAL); +} + +void +vm_object_reference(vm_object_t vmo) +{ + ASSERT(vmo != NULL); + + uint_t ref = atomic_inc_uint_nv(&vmo->vmo_refcnt); + /* overflow would be a deadly serious mistake */ + VERIFY3U(ref, !=, 0); +} + +static vmspace_mapping_t * +vm_mapping_find(struct vmspace *vms, uintptr_t addr, size_t size, + boolean_t no_lock) +{ + vmspace_mapping_t *vmsm; + list_t *ml = &vms->vms_maplist; + const uintptr_t range_end = addr + size; + + ASSERT(addr <= range_end); + + if (no_lock) { + /* + * This check should be superfluous with the protections + * promised by the bhyve logic
which calls into the VM shim. + * All the same, it is cheap to be paranoid. + */ + VERIFY(!vms->vms_map_changing); + } else { + VERIFY(MUTEX_HELD(&vms->vms_lock)); + } + + if (addr >= vms->vms_size) { + return (NULL); + } + for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { + const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len; + + if (addr >= vmsm->vmsm_addr && addr < seg_end) { + if (range_end <= seg_end) { + return (vmsm); + } else { + return (NULL); + } + } + } + return (NULL); +} + +static boolean_t +vm_mapping_gap(struct vmspace *vms, uintptr_t addr, size_t size) +{ + vmspace_mapping_t *vmsm; + list_t *ml = &vms->vms_maplist; + const uintptr_t range_end = addr + size; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + + for (vmsm = list_head(ml); vmsm != NULL; vmsm = list_next(ml, vmsm)) { + const uintptr_t seg_end = vmsm->vmsm_addr + vmsm->vmsm_len; + + if ((vmsm->vmsm_addr >= addr && vmsm->vmsm_addr < range_end) || + (seg_end > addr && seg_end < range_end)) { + return (B_FALSE); + } + } + return (B_TRUE); +} + +static void +vm_mapping_remove(struct vmspace *vms, vmspace_mapping_t *vmsm) +{ + list_t *ml = &vms->vms_maplist; + + ASSERT(MUTEX_HELD(&vms->vms_lock)); + ASSERT(vms->vms_map_changing); + + list_remove(ml, vmsm); + vm_object_deallocate(vmsm->vmsm_object); + kmem_free(vmsm, sizeof (*vmsm)); +} + +int +vm_fault(vm_map_t map, vm_offset_t off, vm_prot_t type, int flag) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + pmap_t pmap = &vms->vms_pmap; + void *pmi = pmap->pm_impl; + const uintptr_t addr = off; + vmspace_mapping_t *vmsm; + struct vm_object *vmo; + uint_t prot, map_lvl; + pfn_t pfn; + uintptr_t map_addr; + + mutex_enter(&vms->vms_lock); + if (vmspace_pmap_iswired(vms, addr, &prot) == 0) { + int err = 0; + + /* + * It is possible that multiple vCPUs will race to fault-in a + * given address. In such cases, the race loser(s) will + * encounter the already-mapped page, needing to do nothing + * more than consider it a success. + * + * If the fault exceeds protection, it is an obvious error. + */ + if ((prot & type) != type) { + err = FC_PROT; + } + + mutex_exit(&vms->vms_lock); + return (err); + } + + /* Try to wire up the address */ + if ((vmsm = vm_mapping_find(vms, addr, 0, B_FALSE)) == NULL) { + mutex_exit(&vms->vms_lock); + return (FC_NOMAP); + } + vmo = vmsm->vmsm_object; + prot = vmsm->vmsm_prot; + + /* XXXJOY: punt on large pages for now */ + pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, addr), NULL, NULL); + map_lvl = 0; + map_addr = P2ALIGN((uintptr_t)addr, LEVEL_SIZE(map_lvl)); + VERIFY(pfn != PFN_INVALID); + + /* + * If pmap failure is to be handled, the previously acquired page locks + * would need to be released. + */ + VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl, prot, + vmo->vmo_attr)); + pmap->pm_eptgen++; + + mutex_exit(&vms->vms_lock); + return (0); +} + +int +vm_fault_quick_hold_pages(vm_map_t map, vm_offset_t addr, vm_size_t len, + vm_prot_t prot, vm_page_t *ma, int max_count) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + const uintptr_t vaddr = addr; + vmspace_mapping_t *vmsm; + struct vm_object *vmo; + vm_page_t vmp; + + ASSERT0(addr & PAGEOFFSET); + ASSERT(len == PAGESIZE); + ASSERT(max_count == 1); + + /* + * Unlike practically all of the other logic that queries or + * manipulates vmspace objects, vm_fault_quick_hold_pages() does so + * without holding vms_lock. This is safe because bhyve ensures that + * changes to the vmspace map occur only when all other threads have + * been excluded from running. 
+ * + * Since this task can count on vms_maplist remaining static and does + * not need to modify the pmap (like vm_fault might), it can proceed + * without the lock. The vm_object has independent refcount and lock + * protection, while the vmo_pager methods do not rely on vms_lock for + * safety. + * + * Performing this work without locks is critical in cases where + * multiple vCPUs require simultaneous instruction emulation, such as + * for frequent guest APIC accesses on a host that lacks hardware + * acceleration for that behavior. + */ + if ((vmsm = vm_mapping_find(vms, vaddr, PAGESIZE, B_TRUE)) == NULL || + (prot & ~vmsm->vmsm_prot) != 0) { + return (-1); + } + + vmp = kmem_zalloc(sizeof (struct vm_page), KM_SLEEP); + + vmo = vmsm->vmsm_object; + vm_object_reference(vmo); + vmp->vmp_obj_held = vmo; + vmp->vmp_pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, vaddr), NULL, + NULL); + + *ma = vmp; + return (1); +} + +/* + * Find a suitable location for a mapping (and install it). + */ +int +vm_map_find(vm_map_t map, vm_object_t vmo, vm_ooffset_t off, vm_offset_t *addr, + vm_size_t len, vm_offset_t max_addr, int find_flags, vm_prot_t prot, + vm_prot_t prot_max, int cow) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + const size_t size = (size_t)len; + const uintptr_t uoff = (uintptr_t)off; + uintptr_t base = *addr; + vmspace_mapping_t *vmsm; + int res = 0; + + /* For use in vmm only */ + VERIFY(find_flags == VMFS_NO_SPACE); /* essentially MAP_FIXED */ + VERIFY(max_addr == 0); + + if (size == 0 || off < 0 || + uoff >= (uoff + size) || vmo->vmo_size < (uoff + size)) { + return (EINVAL); + } + + if (*addr >= vms->vms_size) { + return (ENOMEM); + } + + vmsm = kmem_alloc(sizeof (*vmsm), KM_SLEEP); + + mutex_enter(&vms->vms_lock); + vms->vms_map_changing = B_TRUE; + if (!vm_mapping_gap(vms, base, size)) { + res = ENOMEM; + goto out; + } + + if (res == 0) { + vmsm->vmsm_object = vmo; + vmsm->vmsm_addr = base; + vmsm->vmsm_len = len; + vmsm->vmsm_offset = (off_t)uoff; + vmsm->vmsm_prot = prot; + list_insert_tail(&vms->vms_maplist, vmsm); + + /* Communicate out the chosen address. 
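vm_map_find() here implements only the fixed-placement case that bhyve needs. A hedged sketch of a caller (the example_ function name, protections, and error handling are illustrative, not from the patch):

	/*
	 * Illustrative only: install 'npages' of anonymous guest memory at
	 * a fixed guest-physical address using the shims in this file.
	 */
	static int
	example_map_segment(vm_map_t map, uintptr_t gpa, size_t npages)
	{
		vm_object_t vmo;
		vm_offset_t addr = gpa;
		int err;

		vmo = vm_object_allocate(OBJT_DEFAULT, npages);
		if (vmo == NULL)
			return (ENOMEM);

		/* VMFS_NO_SPACE acts like MAP_FIXED: place exactly at 'addr'. */
		err = vm_map_find(map, vmo, 0, &addr, ptob(npages), 0,
		    VMFS_NO_SPACE, PROT_READ | PROT_WRITE,
		    PROT_READ | PROT_WRITE, 0);
		if (err != 0) {
			/* On failure the object reference is still ours to drop. */
			vm_object_deallocate(vmo);
		}
		return (err);
	}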
*/ + *addr = (vm_offset_t)base; + } +out: + vms->vms_map_changing = B_FALSE; + mutex_exit(&vms->vms_lock); + if (res != 0) { + kmem_free(vmsm, sizeof (*vmsm)); + } + return (res); +} + +int +vm_map_remove(vm_map_t map, vm_offset_t start, vm_offset_t end) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + pmap_t pmap = &vms->vms_pmap; + void *pmi = pmap->pm_impl; + const uintptr_t addr = start; + const size_t size = (size_t)(end - start); + vmspace_mapping_t *vmsm; + + ASSERT(start < end); + + mutex_enter(&vms->vms_lock); + vms->vms_map_changing = B_TRUE; + /* expect to match existing mapping exactly */ + if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL || + vmsm->vmsm_addr != addr || vmsm->vmsm_len != size) { + vms->vms_map_changing = B_FALSE; + mutex_exit(&vms->vms_lock); + return (ENOENT); + } + + (void) pmap->pm_ops->vpo_unmap(pmi, addr, end); + pmap->pm_eptgen++; + + vm_mapping_remove(vms, vmsm); + vms->vms_map_changing = B_FALSE; + mutex_exit(&vms->vms_lock); + return (0); +} + +int +vm_map_wire(vm_map_t map, vm_offset_t start, vm_offset_t end, int flags) +{ + struct vmspace *vms = VMMAP_TO_VMSPACE(map); + pmap_t pmap = &vms->vms_pmap; + void *pmi = pmap->pm_impl; + const uintptr_t addr = start; + const size_t size = end - start; + vmspace_mapping_t *vmsm; + struct vm_object *vmo; + uint_t prot; + + mutex_enter(&vms->vms_lock); + + /* For the time being, only exact-match mappings are expected */ + if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) { + mutex_exit(&vms->vms_lock); + return (FC_NOMAP); + } + vmo = vmsm->vmsm_object; + prot = vmsm->vmsm_prot; + + for (uintptr_t pos = addr; pos < end; ) { + pfn_t pfn; + uintptr_t pg_size, map_addr; + uint_t map_lvl = 0; + + /* XXXJOY: punt on large pages for now */ + pfn = vmo->vmo_pager(vmo, VMSM_OFFSET(vmsm, pos), NULL, NULL); + pg_size = LEVEL_SIZE(map_lvl); + map_addr = P2ALIGN(pos, pg_size); + VERIFY(pfn != PFN_INVALID); + + VERIFY0(pmap->pm_ops->vpo_map(pmi, map_addr, pfn, map_lvl, + prot, vmo->vmo_attr)); + vms->vms_pmap.pm_eptgen++; + + pos += pg_size; + } + + mutex_exit(&vms->vms_lock); + + return (0); +} + +/* Provided custom for bhyve 'devmem' segment mapping */ +int +vm_segmap_obj(struct vmspace *vms, vm_object_t vmo, struct as *as, + caddr_t *addrp, uint_t prot, uint_t maxprot, uint_t flags) +{ + const size_t size = vmo->vmo_size; + int err; + + if (vmo->vmo_type != OBJT_DEFAULT) { + /* Only support default objects for now */ + return (ENOTSUP); + } + + as_rangelock(as); + + err = choose_addr(as, addrp, size, 0, ADDR_VACALIGN, flags); + if (err == 0) { + segvmm_crargs_t svma; + + svma.kaddr = vmo->vmo_data; + svma.prot = prot; + svma.cookie = vmo; + svma.hold = (segvmm_holdfn_t)vm_object_reference; + svma.rele = (segvmm_relefn_t)vm_object_deallocate; + + err = as_map(as, *addrp, size, segvmm_create, &svma); + } + + as_rangeunlock(as); + return (err); +} + +int +vm_segmap_space(struct vmspace *vms, off_t off, struct as *as, caddr_t *addrp, + off_t len, uint_t prot, uint_t maxprot, uint_t flags) +{ + const uintptr_t addr = (uintptr_t)off; + const size_t size = (uintptr_t)len; + vmspace_mapping_t *vmsm; + vm_object_t vmo; + int err; + + if (off < 0 || len <= 0 || + (addr & PAGEOFFSET) != 0 || (size & PAGEOFFSET) != 0) { + return (EINVAL); + } + + mutex_enter(&vms->vms_lock); + if ((vmsm = vm_mapping_find(vms, addr, size, B_FALSE)) == NULL) { + mutex_exit(&vms->vms_lock); + return (ENXIO); + } + if ((prot & ~(vmsm->vmsm_prot | PROT_USER)) != 0) { + mutex_exit(&vms->vms_lock); + return (EACCES); + } + 
vmo = vmsm->vmsm_object; + if (vmo->vmo_type != OBJT_DEFAULT) { + /* Only support default objects for now */ + mutex_exit(&vms->vms_lock); + return (ENOTSUP); + } + + as_rangelock(as); + + err = choose_addr(as, addrp, size, off, ADDR_VACALIGN, flags); + if (err == 0) { + segvmm_crargs_t svma; + const uintptr_t addroff = addr - vmsm->vmsm_addr; + const uintptr_t mapoff = addroff + vmsm->vmsm_offset; + + VERIFY(addroff < vmsm->vmsm_len); + VERIFY((vmsm->vmsm_len - addroff) >= size); + VERIFY(mapoff < vmo->vmo_size); + VERIFY((mapoff + size) <= vmo->vmo_size); + + svma.kaddr = (void *)((uintptr_t)vmo->vmo_data + mapoff); + svma.prot = prot; + svma.cookie = vmo; + svma.hold = (segvmm_holdfn_t)vm_object_reference; + svma.rele = (segvmm_relefn_t)vm_object_deallocate; + + err = as_map(as, *addrp, len, segvmm_create, &svma); + } + + as_rangeunlock(as); + mutex_exit(&vms->vms_lock); + return (err); +} + +void +vm_page_lock(vm_page_t vmp) +{ + ASSERT(!MUTEX_HELD(&vmp->vmp_lock)); + + mutex_enter(&vmp->vmp_lock); +} + +void +vm_page_unlock(vm_page_t vmp) +{ + boolean_t purge = (vmp->vmp_pfn == PFN_INVALID); + + ASSERT(MUTEX_HELD(&vmp->vmp_lock)); + + mutex_exit(&vmp->vmp_lock); + + if (purge) { + mutex_destroy(&vmp->vmp_lock); + kmem_free(vmp, sizeof (*vmp)); + } +} + +void +vm_page_unhold(vm_page_t vmp) +{ + ASSERT(MUTEX_HELD(&vmp->vmp_lock)); + VERIFY(vmp->vmp_pfn != PFN_INVALID); + + vm_object_deallocate(vmp->vmp_obj_held); + vmp->vmp_obj_held = NULL; + vmp->vmp_pfn = PFN_INVALID; +} diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.c b/usr/src/uts/i86pc/io/vmm/vmm_stat.c new file mode 100644 index 0000000000..2cbcce9590 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.c @@ -0,0 +1,172 @@ +/*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Copyright (c) 2011 NetApp, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> + +#include <machine/vmm.h> +#include "vmm_util.h" +#include "vmm_stat.h" + +/* + * 'vst_num_elems' is the total number of addressable statistic elements + * 'vst_num_types' is the number of unique statistic types + * + * It is always true that 'vst_num_elems' is greater than or equal to + * 'vst_num_types'. This is because a stat type may represent more than + * one element (for e.g. VMM_STAT_ARRAY). + */ +static int vst_num_elems, vst_num_types; +static struct vmm_stat_type *vsttab[MAX_VMM_STAT_ELEMS]; + +static MALLOC_DEFINE(M_VMM_STAT, "vmm stat", "vmm stat"); + +#define vst_size ((size_t)vst_num_elems * sizeof(uint64_t)) + +void +vmm_stat_register(void *arg) +{ + struct vmm_stat_type *vst = arg; + + /* We require all stats to identify themselves with a description */ + if (vst->desc == NULL) + return; + + if (vst->scope == VMM_STAT_SCOPE_INTEL && !vmm_is_intel()) + return; + + if (vst->scope == VMM_STAT_SCOPE_AMD && !vmm_is_amd()) + return; + + if (vst_num_elems + vst->nelems >= MAX_VMM_STAT_ELEMS) { + printf("Cannot accommodate vmm stat type \"%s\"!\n", vst->desc); + return; + } + + vst->index = vst_num_elems; + vst_num_elems += vst->nelems; + + vsttab[vst_num_types++] = vst; +} + +int +vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf) +{ + struct vmm_stat_type *vst; + uint64_t *stats; + int i; + + if (vcpu < 0 || vcpu >= vm_get_maxcpus(vm)) + return (EINVAL); + + /* Let stats functions update their counters */ + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (vst->func != NULL) + (*vst->func)(vm, vcpu, vst); + } + + /* Copy over the stats */ + stats = vcpu_stats(vm, vcpu); + for (i = 0; i < vst_num_elems; i++) + buf[i] = stats[i]; + *num_stats = vst_num_elems; + return (0); +} + +void * +vmm_stat_alloc(void) +{ + + return (malloc(vst_size, M_VMM_STAT, M_WAITOK)); +} + +void +vmm_stat_init(void *vp) +{ + + bzero(vp, vst_size); +} + +void +vmm_stat_free(void *vp) +{ + free(vp, M_VMM_STAT); +} + +int +vmm_stat_desc_copy(int index, char *buf, int bufsize) +{ + int i; + struct vmm_stat_type *vst; + + for (i = 0; i < vst_num_types; i++) { + vst = vsttab[i]; + if (index >= vst->index && index < vst->index + vst->nelems) { + if (vst->nelems > 1) { + snprintf(buf, bufsize, "%s[%d]", + vst->desc, index - vst->index); + } else { + strlcpy(buf, vst->desc, bufsize); + } + return (0); /* found it */ + } + } + + return (EINVAL); +} + +/* global statistics */ +VMM_STAT(VCPU_MIGRATIONS, "vcpu migration across host cpus"); +VMM_STAT(VMEXIT_COUNT, "total number of vm exits"); +VMM_STAT(VMEXIT_EXTINT, "vm exits due to external interrupt"); +VMM_STAT(VMEXIT_HLT, "number of times hlt was intercepted"); +VMM_STAT(VMEXIT_CR_ACCESS, "number of times %cr access was intercepted"); +VMM_STAT(VMEXIT_RDMSR, "number of times rdmsr was intercepted"); +VMM_STAT(VMEXIT_WRMSR, "number of times wrmsr was intercepted"); +VMM_STAT(VMEXIT_MTRAP, "number of monitor trap exits"); +VMM_STAT(VMEXIT_PAUSE, "number of times pause was intercepted"); +VMM_STAT(VMEXIT_INTR_WINDOW, "vm exits due to interrupt window opening"); +VMM_STAT(VMEXIT_NMI_WINDOW, "vm exits due to nmi window opening"); +VMM_STAT(VMEXIT_INOUT, "number of times in/out was intercepted"); +VMM_STAT(VMEXIT_CPUID, "number of times cpuid was intercepted"); +VMM_STAT(VMEXIT_NESTED_FAULT, "vm exits due to nested page fault"); +VMM_STAT(VMEXIT_INST_EMUL, "vm 
exits for instruction emulation"); +VMM_STAT(VMEXIT_UNKNOWN, "number of vm exits for unknown reason"); +VMM_STAT(VMEXIT_ASTPENDING, "number of times astpending at exit"); +VMM_STAT(VMEXIT_REQIDLE, "number of times idle requested at exit"); +VMM_STAT(VMEXIT_USERSPACE, "number of vm exits handled in userspace"); +VMM_STAT(VMEXIT_RUNBLOCK, "number of times runblock at exit"); +VMM_STAT(VMEXIT_EXCEPTION, "number of vm exits due to exceptions"); diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.h b/usr/src/uts/i86pc/io/vmm/vmm_stat.h index 9bf7a60e0b..3232e23888 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_stat.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-3-Clause + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -10,7 +12,7 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 4. Neither the name of the University nor the names of its contributors + * 3. Neither the name of the University nor the names of its contributors * may be used to endorse or promote products derived from this software * without specific prior written permission. * @@ -26,15 +28,24 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_stat.h 250427 2013-05-10 02:59:49Z neel $ + * $FreeBSD$ + */ +/* + * Copyright 2018 Joyent, Inc. */ #ifndef _VMM_STAT_H_ #define _VMM_STAT_H_ +#include <machine/vmm.h> + struct vm; -#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ +#ifdef __FreeBSD__ +#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */ +#else +#define MAX_VMM_STAT_ELEMS (64 + VM_MAXCPU) /* arbitrary */ +#endif enum vmm_stat_scope { VMM_STAT_SCOPE_ANY, @@ -42,20 +53,28 @@ enum vmm_stat_scope { VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */ }; +struct vmm_stat_type; +typedef void (*vmm_stat_func_t)(struct vm *vm, int vcpu, + struct vmm_stat_type *stat); + struct vmm_stat_type { int index; /* position in the stats buffer */ int nelems; /* standalone or array */ const char *desc; /* description of statistic */ + vmm_stat_func_t func; enum vmm_stat_scope scope; }; -void vmm_stat_init(void *arg); +void vmm_stat_register(void *arg); -#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ +#define VMM_STAT_FDEFINE(type, nelems, desc, func, scope) \ struct vmm_stat_type type[1] = { \ - { -1, nelems, desc, scope } \ + { -1, nelems, desc, func, scope } \ }; \ - SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type) + SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_register, type) + +#define VMM_STAT_DEFINE(type, nelems, desc, scope) \ + VMM_STAT_FDEFINE(type, nelems, desc, NULL, scope) #define VMM_STAT_DECLARE(type) \ extern struct vmm_stat_type type[1] @@ -67,10 +86,14 @@ void vmm_stat_init(void *arg); #define VMM_STAT_AMD(type, desc) \ VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD) +#define VMM_STAT_FUNC(type, desc, func) \ + VMM_STAT_FDEFINE(type, 1, desc, func, VMM_STAT_SCOPE_ANY) + #define VMM_STAT_ARRAY(type, nelems, desc) \ VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY) void *vmm_stat_alloc(void); +void vmm_stat_init(void *vp); void vmm_stat_free(void *vp); /* @@ -79,7 +102,7 @@ void vmm_stat_free(void *vp); int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf); int vmm_stat_desc_copy(int index, char *buf, int buflen); -static void __inline +static __inline void 
vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, int statidx, uint64_t x) { @@ -92,9 +115,22 @@ vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, stats[vst->index + statidx] += x; #endif } - -static void __inline +static __inline void +vmm_stat_array_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, + int statidx, uint64_t val) +{ +#ifdef VMM_KEEP_STATS + uint64_t *stats; + + stats = vcpu_stats(vm, vcpu); + + if (vst->index >= 0 && statidx < vst->nelems) + stats[vst->index + statidx] = val; +#endif +} + +static __inline void vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) { @@ -103,6 +139,15 @@ vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x) #endif } +static __inline void +vmm_stat_set(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t val) +{ + +#ifdef VMM_KEEP_STATS + vmm_stat_array_set(vm, vcpu, vst, 0, val); +#endif +} + VMM_STAT_DECLARE(VCPU_MIGRATIONS); VMM_STAT_DECLARE(VMEXIT_COUNT); VMM_STAT_DECLARE(VMEXIT_EXTINT); @@ -121,7 +166,7 @@ VMM_STAT_DECLARE(VMEXIT_INST_EMUL); VMM_STAT_DECLARE(VMEXIT_UNKNOWN); VMM_STAT_DECLARE(VMEXIT_ASTPENDING); VMM_STAT_DECLARE(VMEXIT_USERSPACE); -VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS); -VMM_STAT_DECLARE(VMEXIT_USERSPACE); +VMM_STAT_DECLARE(VMEXIT_RUNBLOCK); VMM_STAT_DECLARE(VMEXIT_EXCEPTION); +VMM_STAT_DECLARE(VMEXIT_REQIDLE); #endif diff --git a/usr/src/uts/i86pc/io/vmm/vmm_support.s b/usr/src/uts/i86pc/io/vmm/vmm_support.s new file mode 100644 index 0000000000..5777d46959 --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_support.s @@ -0,0 +1,54 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/asm_linkage.h> +#include <sys/segments.h> + +/* + * %rdi = trapno + * + * This variant is for any explicit exception injection that we need: in this + * case, we can't just, for example, do a direct "int $2", as that will then + * trash our %cr3 via tr_nmiint due to KPTI, so we have to fake a trap frame. + * Both NMIs and MCEs don't push an 'err' into the frame. + */ +ENTRY_NP(vmm_call_trap) + pushq %rbp + movq %rsp, %rbp + movq %rsp, %r11 + andq $~0xf, %rsp /* align stack */ + pushq $KDS_SEL /* %ss */ + pushq %r11 /* %rsp */ + pushfq /* %rflags */ + pushq $KCS_SEL /* %cs */ + leaq .trap_iret_dest(%rip), %rcx + pushq %rcx /* %rip */ + cli + cmpq $T_NMIFLT, %rdi + je nmiint + cmpq $T_MCE, %rdi + je mcetrap + + pushq %rdi /* save our bad trapno... */ + leaq __vmm_call_bad_trap(%rip), %rdi + xorl %eax, %eax + call panic + /*NOTREACHED*/ + +.trap_iret_dest: + popq %rbp + ret +SET_SIZE(vmm_call_trap) + +__vmm_call_bad_trap: + .string "bad trapno for vmm_call_trap()" diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.c b/usr/src/uts/i86pc/io/vmm/vmm_util.c index fabd42e13c..3eadfe57e5 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_util.c +++ b/usr/src/uts/i86pc/io/vmm/vmm_util.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
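Stepping back to the vmm_stat interfaces above: a counter is declared with one of the VMM_STAT* macros (which arranges registration via SYSINIT) and bumped with the inline helpers, which compile to nothing unless VMM_KEEP_STATS is defined. A hedged sketch using a made-up statistic:

	/* Illustrative only: 'VMEXIT_EXAMPLE' is not a statistic in this patch. */
	VMM_STAT(VMEXIT_EXAMPLE, "example: vm exits counted for demonstration");

	static void
	example_count_exit(struct vm *vm, int vcpuid)
	{
		/* No-op unless the module is built with VMM_KEEP_STATS. */
		vmm_stat_incr(vm, vcpuid, VMEXIT_EXAMPLE, 1);
	}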
* - * $FreeBSD: head/sys/amd64/vmm/vmm_util.c 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -39,7 +41,7 @@ */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_util.c 245678 2013-01-20 03:42:49Z neel $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> #include <sys/libkern.h> diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.h b/usr/src/uts/i86pc/io/vmm/vmm_util.h index fe1c1c9449..fc7e7364c7 100644 --- a/usr/src/uts/i86pc/io/vmm/vmm_util.h +++ b/usr/src/uts/i86pc/io/vmm/vmm_util.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/vmm_util.h 245678 2013-01-20 03:42:49Z neel $ + * $FreeBSD$ */ #ifndef _VMM_UTIL_H_ diff --git a/usr/src/uts/i86pc/io/vmm/vmm_zsd.c b/usr/src/uts/i86pc/io/vmm/vmm_zsd.c new file mode 100644 index 0000000000..0271cc339e --- /dev/null +++ b/usr/src/uts/i86pc/io/vmm/vmm_zsd.c @@ -0,0 +1,218 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018, Joyent, Inc. + */ + +#include <sys/cpuvar.h> +#include <sys/debug.h> +#include <sys/kmem.h> +#include <sys/ksynch.h> +#include <sys/list.h> +#include <sys/types.h> +#include <sys/vmm.h> +#include <sys/vmm_impl.h> +#include <sys/zone.h> + +/* + * zone specific data + * + * Zone specific data is used to keep an association between zones and the vmm + * instances that may be running in them. This is used to ensure that vmm + * instances do not outlive their parent zone. + * + * Locking strategy + * + * The global vmm_zsd_lock is held while modifying vmm_zsd_list. + * + * The per zone vz_lock in vmm_zsd_t is held while reading or writing anything + * within a vmm_zsd_t instance. This is important to ensure that there's not + * an accidental VM creation as a zone is going down. + */ + +/* + * One of these per zone.
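vmm_zsd.c builds on the generic zone-specific-data hooks: vmm_zsd_init() below registers three callbacks with zone_key_create(), and the zone framework invokes them as zones are created, shut down, and destroyed. A stripped-down sketch of that registration pattern (the example_ names are hypothetical):

	static zone_key_t example_key;

	static void *
	example_create(zoneid_t zid)
	{
		/* Per-zone state, handed back to the later callbacks as 'data'. */
		return (kmem_zalloc(sizeof (int), KM_SLEEP));
	}

	static void
	example_shutdown(zoneid_t zid, void *data)
	{
		/* Zone is going down: stop accepting new work on its behalf. */
	}

	static void
	example_destroy(zoneid_t zid, void *data)
	{
		/* Last chance to reclaim anything tied to the zone. */
		kmem_free(data, sizeof (int));
	}

	void
	example_zsd_init(void)
	{
		zone_key_create(&example_key, example_create,
		    example_shutdown, example_destroy);
	}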
+ */ +struct vmm_zsd { + list_t vz_vmms; /* vmm instances in the zone */ + list_node_t vz_linkage; /* link to other zones */ + boolean_t vz_active; /* B_FALSE early in shutdown callback */ + zoneid_t vz_zoneid; + kmutex_t vz_lock; +}; + +static kmutex_t vmm_zsd_lock; /* Protects vmm_zsd_list */ +static list_t vmm_zsd_list; /* Linkage between all zsd instances */ + +static zone_key_t vmm_zsd_key; + +int +vmm_zsd_add_vm(vmm_softc_t *sc) +{ + vmm_zsd_t *zsd; + + ASSERT(sc->vmm_zone != NULL); + + mutex_enter(&vmm_zsd_lock); + + for (zsd = list_head(&vmm_zsd_list); zsd != NULL; + zsd = list_next(&vmm_zsd_list, zsd)) { + if (zsd->vz_zoneid == sc->vmm_zone->zone_id) { + break; + } + } + + VERIFY(zsd != NULL); + mutex_exit(&vmm_zsd_lock); + + mutex_enter(&zsd->vz_lock); + if (!zsd->vz_active) { + mutex_exit(&zsd->vz_lock); + return (ENOSYS); + } + + sc->vmm_zsd = zsd; + list_insert_tail(&zsd->vz_vmms, sc); + + mutex_exit(&zsd->vz_lock); + + return (0); +} + +void +vmm_zsd_rem_vm(vmm_softc_t *sc) +{ + vmm_zsd_t *zsd = sc->vmm_zsd; + + mutex_enter(&zsd->vz_lock); + + list_remove(&zsd->vz_vmms, sc); + sc->vmm_zsd = NULL; + + mutex_exit(&zsd->vz_lock); +} + +static void * +vmm_zsd_create(zoneid_t zid) +{ + vmm_zsd_t *zsd; + zone_t *zone; + + zsd = kmem_zalloc(sizeof (*zsd), KM_SLEEP); + + list_create(&zsd->vz_vmms, sizeof (vmm_softc_t), + offsetof(vmm_softc_t, vmm_zsd_linkage)); + + zsd->vz_zoneid = zid; + + mutex_init(&zsd->vz_lock, NULL, MUTEX_DEFAULT, NULL); + + /* + * If the vmm module is loaded while this zone is in the midst of + * shutting down, vmm_zsd_destroy() may be called without + * vmm_zsd_shutdown() ever being called. If it is shutting down, there + * is no sense in letting any in-flight VM creation succeed so set + * vz_active accordingly. + * + * zone_find_by_id_nolock() is used rather than zone_find_by_id() + * so that the zone is returned regardless of state. + */ + zone = zone_find_by_id_nolock(zid); + VERIFY(zone != NULL); + zsd->vz_active = zone_status_get(zone) < ZONE_IS_SHUTTING_DOWN; + + mutex_enter(&vmm_zsd_lock); + list_insert_tail(&vmm_zsd_list, zsd); + mutex_exit(&vmm_zsd_lock); + + return (zsd); +} + +/* + * Tells all running VMs in the zone to poweroff. This does not reclaim guest + * resources (memory, etc.). + */ +static void +vmm_zsd_shutdown(zoneid_t zid, void *data) +{ + vmm_zsd_t *zsd = data; + vmm_softc_t *sc; + + mutex_enter(&zsd->vz_lock); + + /* + * This may already be B_FALSE. See comment in vmm_zsd_create(). If it + * is already B_FALSE we will take a quick trip through the empty list. + */ + zsd->vz_active = B_FALSE; + + for (sc = list_head(&zsd->vz_vmms); sc != NULL; + sc = list_next(&zsd->vz_vmms, sc)) { + /* Send a poweroff to the VM, whether running or not. */ + (void) vm_suspend(sc->vmm_vm, VM_SUSPEND_POWEROFF); + } + mutex_exit(&zsd->vz_lock); +} + +/* + * Reap all VMs that remain and free up guest resources. + */ +static void +vmm_zsd_destroy(zoneid_t zid, void *data) +{ + vmm_zsd_t *zsd = data; + vmm_softc_t *sc; + + mutex_enter(&vmm_zsd_lock); + list_remove(&vmm_zsd_list, zsd); + mutex_exit(&vmm_zsd_lock); + + mutex_enter(&zsd->vz_lock); + ASSERT(!zsd->vz_active); + + while ((sc = list_remove_head(&zsd->vz_vmms)) != NULL) { + int err; + + /* + * This frees all resources associated with the vm, including + * sc.
+ */ + err = vmm_do_vm_destroy(sc, B_FALSE); + ASSERT3S(err, ==, 0); + } + + mutex_exit(&zsd->vz_lock); + mutex_destroy(&zsd->vz_lock); + + kmem_free(zsd, sizeof (*zsd)); +} + +void +vmm_zsd_init(void) +{ + mutex_init(&vmm_zsd_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&vmm_zsd_list, sizeof (vmm_zsd_t), + offsetof(vmm_zsd_t, vz_linkage)); + zone_key_create(&vmm_zsd_key, vmm_zsd_create, vmm_zsd_shutdown, + vmm_zsd_destroy); +} + +void +vmm_zsd_fini(void) +{ + /* Calls vmm_zsd_destroy() on all zones. */ + zone_key_delete(vmm_zsd_key); + ASSERT(list_is_empty(&vmm_zsd_list)); + + list_destroy(&vmm_zsd_list); + mutex_destroy(&vmm_zsd_lock); +} diff --git a/usr/src/uts/i86pc/io/vmm/vmx_assym.s b/usr/src/uts/i86pc/io/vmm/vmx_assym.s deleted file mode 100644 index d84ca30275..0000000000 --- a/usr/src/uts/i86pc/io/vmm/vmx_assym.s +++ /dev/null @@ -1 +0,0 @@ -#include "vmx_assym.h" diff --git a/usr/src/uts/i86pc/io/vmm/x86.c b/usr/src/uts/i86pc/io/vmm/x86.c index 02222ef5e7..d74f866013 100644 --- a/usr/src/uts/i86pc/io/vmm/x86.c +++ b/usr/src/uts/i86pc/io/vmm/x86.c @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/x86.c 255645 2013-09-17 17:56:53Z grehan $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,38 +38,69 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2014 Pluribus Networks Inc. + * Copyright 2018 Joyent, Inc. */ #include <sys/cdefs.h> -__FBSDID("$FreeBSD: head/sys/amd64/vmm/x86.c 255645 2013-09-17 17:56:53Z grehan $"); +__FBSDID("$FreeBSD$"); #include <sys/param.h> -#include <sys/types.h> +#include <sys/pcpu.h> #include <sys/systm.h> -#include <sys/cpuset.h> +#include <sys/sysctl.h> +#include <sys/x86_archext.h> #include <machine/clock.h> #include <machine/cpufunc.h> #include <machine/md_var.h> +#include <machine/segments.h> #include <machine/specialreg.h> #include <machine/vmm.h> +#include "vmm_host.h" +#include "vmm_ktr.h" +#include "vmm_util.h" #include "x86.h" +SYSCTL_DECL(_hw_vmm); +SYSCTL_NODE(_hw_vmm, OID_AUTO, topology, CTLFLAG_RD, 0, NULL); + #define CPUID_VM_HIGH 0x40000000 static const char bhyve_id[12] = "bhyve bhyve "; static uint64_t bhyve_xcpuids; +SYSCTL_ULONG(_hw_vmm, OID_AUTO, bhyve_xcpuids, CTLFLAG_RW, &bhyve_xcpuids, 0, + "Number of times an unknown cpuid leaf was accessed"); + +static int cpuid_leaf_b = 1; +SYSCTL_INT(_hw_vmm_topology, OID_AUTO, cpuid_leaf_b, CTLFLAG_RDTUN, + &cpuid_leaf_b, 0, NULL); + +/* + * Round up to the next power of two, if necessary, and then take log2. + * Returns -1 if argument is zero. 
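+ * + * For example (a hedged reading of the expression below): when x is a + * power of two, powerof2(x) is 1, x is left unshifted, and fls(x) - 1 is + * the exact log2, so log2(4) == 2. Otherwise x is shifted up one bit so + * that fls() lands on the next power of two, rounding up: log2(3) == 2 + * and log2(6) == 3. For x == 0, fls(0) is 0 and the result is the + * documented -1.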
+ */ +static __inline int +log2(u_int x) +{ + + return (fls(x << (1 - powerof2(x))) - 1); +} int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx) { - int error; - unsigned int func, regs[4]; + const struct xsave_limits *limits; + uint64_t cr4; + int error, enable_invpcid, level, width = 0, x2apic_id = 0; + unsigned int func, regs[4], logical_cpus = 0; enum x2apic_state x2apic_state; + uint16_t cores, maxcpus, sockets, threads; + + VCPU_CTR2(vm, vcpu_id, "cpuid %#x,%#x", *eax, *ecx); /* * Requests for invalid CPUID levels should map to the highest @@ -102,26 +135,108 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, case CPUID_8000_0003: case CPUID_8000_0004: case CPUID_8000_0006: + cpuid_count(*eax, *ecx, regs); + break; case CPUID_8000_0008: cpuid_count(*eax, *ecx, regs); + if (vmm_is_amd()) { + /* + * As on Intel (0000_0007:0, EDX), mask out + * unsupported or unsafe AMD extended features + * (8000_0008 EBX). + */ + regs[1] &= (AMDFEID_CLZERO | AMDFEID_IRPERF | + AMDFEID_XSAVEERPTR); + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + /* + * Here, width is ApicIdCoreIdSize, present on + * at least Family 15h and newer. It + * represents the "number of bits in the + * initial apicid that indicate thread id + * within a package." + * + * Our topo_probe_amd() uses it for + * pkg_id_shift and other OSes may rely on it. + */ + width = MIN(0xF, log2(threads * cores)); + if (width < 0x4) + width = 0; + logical_cpus = MIN(0xFF, threads * cores - 1); + regs[2] = (width << AMDID_COREID_SIZE_SHIFT) | logical_cpus; + } break; case CPUID_8000_0001: + cpuid_count(*eax, *ecx, regs); + + /* + * Hide SVM from guest. + */ + regs[2] &= ~AMDID2_SVM; + + /* + * Don't advertise extended performance counter MSRs + * to the guest. + */ + regs[2] &= ~AMDID2_PCXC; + regs[2] &= ~AMDID2_PNXC; + regs[2] &= ~AMDID2_PTSCEL2I; + + /* + * Don't advertise Instruction Based Sampling feature. + */ + regs[2] &= ~AMDID2_IBS; + + /* NodeID MSR not available */ + regs[2] &= ~AMDID2_NODE_ID; + + /* Don't advertise the OS visible workaround feature */ + regs[2] &= ~AMDID2_OSVW; + + /* Hide mwaitx/monitorx capability from the guest */ + regs[2] &= ~AMDID2_MWAITX; + +#ifndef __FreeBSD__ + /* + * Detection routines for TCE and FFXSR are missing + * from our vm_cpuid_capability() detection logic + * today. Mask them out until that is remedied. + * They do not appear to be in common usage, so their + * absence should not cause undue trouble. + */ + regs[2] &= ~AMDID2_TCE; + regs[3] &= ~AMDID_FFXSR; +#endif + /* * Hide rdtscp/ia32_tsc_aux until we know how * to deal with them. */ - cpuid_count(*eax, *ecx, regs); regs[3] &= ~AMDID_RDTSCP; break; case CPUID_8000_0007: - cpuid_count(*eax, *ecx, regs); -#ifdef __FreeBSD__ /* - * If the host TSCs are not synchronized across - * physical cpus then we cannot advertise an - * invariant tsc to a vcpu. + * AMD uses this leaf to advertise the processor's + * power monitoring and RAS capabilities. These + * features are hardware-specific and exposing + * them to a guest doesn't make a lot of sense. + * + * Intel uses this leaf only to advertise the + * "Invariant TSC" feature with all other bits + * being reserved (set to zero). 
+ */ + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* + * "Invariant TSC" can be advertised to the guest if: + * - host TSC frequency is invariant + * - host TSCs are synchronized across physical cpus * * XXX This still falls short because the vcpu * can observe the TSC moving backwards as it @@ -129,9 +244,73 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, * it should discourage the guest from using the * TSC to keep track of time. */ - if (!smp_tsc) - regs[3] &= ~AMDPM_TSC_INVARIANT; -#endif +#ifdef __FreeBSD__ + /* XXXJOY: Wire up with our own TSC logic */ + if (tsc_is_invariant && smp_tsc) + regs[3] |= AMDPM_TSC_INVARIANT; +#endif /* __FreeBSD__ */ + break; + + case CPUID_8000_001D: + /* AMD Cache topology, like 0000_0004 for Intel. */ + if (!vmm_is_amd()) + goto default_leaf; + + /* + * Similar to Intel, generate a fictitious cache + * topology for the guest with L3 shared by the + * package, and L1 and L2 local to a core. + */ + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + switch (*ecx) { + case 0: + logical_cpus = threads; + level = 1; + func = 1; /* data cache */ + break; + case 1: + logical_cpus = threads; + level = 2; + func = 3; /* unified cache */ + break; + case 2: + logical_cpus = threads * cores; + level = 3; + func = 3; /* unified cache */ + break; + default: + logical_cpus = 0; + level = 0; + func = 0; + break; + } + + logical_cpus = MIN(0xfff, logical_cpus - 1); + regs[0] = (logical_cpus << 14) | (1 << 8) | + (level << 5) | func; + regs[1] = (func > 0) ? (CACHE_LINE_SIZE - 1) : 0; + regs[2] = 0; + regs[3] = 0; + break; + + case CPUID_8000_001E: + /* AMD Family 16h+ additional identifiers */ + if (!vmm_is_amd() || CPUID_TO_FAMILY(cpu_id) < 0x16) + goto default_leaf; + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] = vcpu_id; + threads = MIN(0xFF, threads - 1); + regs[1] = (threads << 8) | + (vcpu_id >> log2(threads + 1)); + /* + * XXX Bhyve topology cannot yet represent >1 node per + * processor. + */ + regs[2] = 0; + regs[3] = 0; break; case CPUID_0000_0001: @@ -150,22 +329,41 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT); /* - * Don't expose VMX, SpeedStep or TME capability. + * Don't expose VMX, SpeedStep, TME or SMX capability. * Advertise x2APIC capability and Hypervisor guest. */ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2); + regs[2] &= ~(CPUID2_SMX); regs[2] |= CPUID2_HV; if (x2apic_state != X2APIC_DISABLED) regs[2] |= CPUID2_X2APIC; + else + regs[2] &= ~CPUID2_X2APIC; /* - * Hide xsave/osxsave/avx until the FPU save/restore - * issues are resolved + * Only advertise CPUID2_XSAVE in the guest if + * the host is using XSAVE. */ - regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE | - CPUID2_AVX); + if (!(regs[2] & CPUID2_OSXSAVE)) + regs[2] &= ~CPUID2_XSAVE; + + /* + * If CPUID2_XSAVE is being advertised and the + * guest has set CR4_XSAVE, set + * CPUID2_OSXSAVE. + */ + regs[2] &= ~CPUID2_OSXSAVE; + if (regs[2] & CPUID2_XSAVE) { + error = vm_get_register(vm, vcpu_id, + VM_REG_GUEST_CR4, &cr4); + if (error) + panic("x86_emulate_cpuid: error %d " + "fetching %%cr4", error); + if (cr4 & CR4_XSAVE) + regs[2] |= CPUID2_OSXSAVE; + } /* * Hide monitor/mwait until we know how to deal with @@ -177,7 +375,7 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, * Hide the performance and debug features.
*/ regs[2] &= ~CPUID2_PDCM; - + /* * No TSC deadline support in the APIC yet */ @@ -187,48 +385,95 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, * Hide thermal monitoring */ regs[3] &= ~(CPUID_ACPI | CPUID_TM); - + /* - * Machine check handling is done in the host. + * Hide the debug store capability. */ - regs[3] &= ~(CPUID_MCA | CPUID_MCE); - - /* - * Hide the debug store capability. - */ regs[3] &= ~CPUID_DS; /* - * Disable multi-core. + * Advertise the Machine Check and MTRR capability. + * + * Some guest OSes (e.g. Windows) will not boot if + * these features are absent. */ + regs[3] |= (CPUID_MCA | CPUID_MCE | CPUID_MTRR); + + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + logical_cpus = threads * cores; regs[1] &= ~CPUID_HTT_CORES; - regs[3] &= ~CPUID_HTT; + regs[1] |= (logical_cpus & 0xff) << 16; + regs[3] |= CPUID_HTT; break; case CPUID_0000_0004: - do_cpuid(4, regs); + cpuid_count(*eax, *ecx, regs); - /* - * Do not expose topology. - */ - regs[0] &= 0xffff8000; - /* - * The maximum number of processor cores in - * this physical processor package and the - * maximum number of threads sharing this - * cache are encoded with "plus 1" encoding. - * Adding one to the value in this register - * field to obtains the actual value. - * - * Therefore 0 for both indicates 1 core - * per package and no cache sharing. - */ + if (regs[0] || regs[1] || regs[2] || regs[3]) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + regs[0] &= 0x3ff; + regs[0] |= (cores - 1) << 26; + /* + * Cache topology: + * - L1 and L2 are shared only by the logical + * processors in a single core. + * - L3 and above are shared by all logical + * processors in the package. + */ + logical_cpus = threads; + level = (regs[0] >> 5) & 0x7; + if (level >= 3) + logical_cpus *= cores; + regs[0] |= (logical_cpus - 1) << 14; + } break; - case CPUID_0000_0006: case CPUID_0000_0007: + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + + /* leaf 0 */ + if (*ecx == 0) { + cpuid_count(*eax, *ecx, regs); + + /* Only leaf 0 is supported */ + regs[0] = 0; + + /* + * Expose known-safe features. + */ + regs[1] &= (CPUID_STDEXT_FSGSBASE | + CPUID_STDEXT_BMI1 | CPUID_STDEXT_HLE | + CPUID_STDEXT_AVX2 | CPUID_STDEXT_BMI2 | + CPUID_STDEXT_ERMS | CPUID_STDEXT_RTM | + CPUID_STDEXT_AVX512F | + CPUID_STDEXT_RDSEED | + CPUID_STDEXT_AVX512PF | + CPUID_STDEXT_AVX512ER | + CPUID_STDEXT_AVX512CD | CPUID_STDEXT_SHA); + regs[2] = 0; + regs[3] &= CPUID_STDEXT3_MD_CLEAR; + + /* Advertise INVPCID if it is enabled. 
*/ + error = vm_get_capability(vm, vcpu_id, + VM_CAP_ENABLE_INVPCID, &enable_invpcid); + if (error == 0 && enable_invpcid) + regs[1] |= CPUID_STDEXT_INVPCID; + } + break; + + case CPUID_0000_0006: + regs[0] = CPUTPM1_ARAT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + case CPUID_0000_000A: - case CPUID_0000_000D: /* * Handle the access, but report 0 for * all options @@ -241,12 +486,93 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, case CPUID_0000_000B: /* - * Processor topology enumeration + * Intel processor topology enumeration */ - regs[0] = 0; - regs[1] = 0; - regs[2] = *ecx & 0xff; - regs[3] = vcpu_id; + if (vmm_is_intel()) { + vm_get_topology(vm, &sockets, &cores, &threads, + &maxcpus); + if (*ecx == 0) { + logical_cpus = threads; + width = log2(logical_cpus); + level = CPUID_TYPE_SMT; + x2apic_id = vcpu_id; + } + + if (*ecx == 1) { + logical_cpus = threads * cores; + width = log2(logical_cpus); + level = CPUID_TYPE_CORE; + x2apic_id = vcpu_id; + } + + if (!cpuid_leaf_b || *ecx >= 2) { + width = 0; + logical_cpus = 0; + level = 0; + x2apic_id = 0; + } + + regs[0] = width & 0x1f; + regs[1] = logical_cpus & 0xffff; + regs[2] = (level << 8) | (*ecx & 0xff); + regs[3] = x2apic_id; + } else { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + + case CPUID_0000_000D: + limits = vmm_get_xsave_limits(); + if (!limits->xsave_enabled) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + } + + cpuid_count(*eax, *ecx, regs); + switch (*ecx) { + case 0: + /* + * Only permit the guest to use bits + * that are active in the host in + * %xcr0. Also, claim that the + * maximum save area size is + * equivalent to the host's current + * save area size. Since this runs + * "inside" of vmrun(), it runs with + * the guest's xcr0, so the current + * save area size is correct as-is. + */ + regs[0] &= limits->xcr0_allowed; + regs[2] = limits->xsave_max_size; + regs[3] &= (limits->xcr0_allowed >> 32); + break; + case 1: + /* Only permit XSAVEOPT. */ + regs[0] &= CPUID_EXTSTATE_XSAVEOPT; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + break; + default: + /* + * If the leaf is for a permitted feature, + * pass through as-is, otherwise return + * all zeroes. + */ + if (!(limits->xcr0_allowed & (1ul << *ecx))) { + regs[0] = 0; + regs[1] = 0; + regs[2] = 0; + regs[3] = 0; + } + break; + } break; case 0x40000000: @@ -257,6 +583,7 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, break; default: +default_leaf: /* * The leaf value has already been clamped so * simply pass this through, keeping count of @@ -274,3 +601,45 @@ x86_emulate_cpuid(struct vm *vm, int vcpu_id, return (1); } + +bool +vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability cap) +{ + bool rv; + + KASSERT(cap > 0 && cap < VCC_LAST, ("%s: invalid vm_cpu_capability %d", + __func__, cap)); + + /* + * Simply passthrough the capabilities of the host cpu for now. 
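+ * + * (A hypothetical caller, not shown in this patch: instruction + * emulation that wants to honor a guest's use of the NX page-table bit + * could gate that behavior on vm_cpuid_capability(vm, vcpuid, + * VCC_NO_EXECUTE).)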
+ */ + rv = false; + switch (cap) { +#ifdef __FreeBSD__ + case VCC_NO_EXECUTE: + if (amd_feature & AMDID_NX) + rv = true; + break; + case VCC_FFXSR: + if (amd_feature & AMDID_FFXSR) + rv = true; + break; + case VCC_TCE: + if (amd_feature2 & AMDID2_TCE) + rv = true; + break; +#else + case VCC_NO_EXECUTE: + if (is_x86_feature(x86_featureset, X86FSET_NX)) + rv = true; + break; + /* XXXJOY: No kernel detection for FFXR or TCE at present, so ignore */ + case VCC_FFXSR: + case VCC_TCE: + break; +#endif + default: + panic("%s: unknown vm_cpu_capability %d", __func__, cap); + } + return (rv); +} diff --git a/usr/src/uts/i86pc/io/vmm/x86.h b/usr/src/uts/i86pc/io/vmm/x86.h index db2340b37b..0d70c04fd8 100644 --- a/usr/src/uts/i86pc/io/vmm/x86.h +++ b/usr/src/uts/i86pc/io/vmm/x86.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/vmm/x86.h 255287 2013-09-06 05:16:10Z grehan $ + * $FreeBSD$ */ #ifndef _X86_H_ @@ -47,6 +49,8 @@ #define CPUID_8000_0006 (0x80000006) #define CPUID_8000_0007 (0x80000007) #define CPUID_8000_0008 (0x80000008) +#define CPUID_8000_001D (0x8000001D) +#define CPUID_8000_001E (0x8000001E) /* * CPUID instruction Fn0000_0001: @@ -62,4 +66,17 @@ int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx); +enum vm_cpuid_capability { + VCC_NONE, + VCC_NO_EXECUTE, + VCC_FFXSR, + VCC_TCE, + VCC_LAST +}; + +/* + * Return 'true' if the capability 'cap' is enabled in this virtual cpu + * and 'false' otherwise. + */ +bool vm_cpuid_capability(struct vm *vm, int vcpuid, enum vm_cpuid_capability); #endif diff --git a/usr/src/uts/i86pc/os/gipt.c b/usr/src/uts/i86pc/os/gipt.c new file mode 100644 index 0000000000..ace7e03438 --- /dev/null +++ b/usr/src/uts/i86pc/os/gipt.c @@ -0,0 +1,566 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2019 Joyent, Inc. + */ + +#include <sys/gipt.h> +#include <sys/malloc.h> +#include <sys/kmem.h> +#include <sys/sysmacros.h> +#include <sys/sunddi.h> +#include <sys/panic.h> +#include <vm/hat.h> +#include <vm/as.h> + +/* + * Generic Indexed Page Table + * + * There are several applications, such as hardware virtualization or IOMMU + * control, which require construction of a page table tree to represent a + * virtual address space. Many features of the existing htable system would be + * convenient for this, but its tight coupling to the VM system makes it + * undesirable for independent consumers. The GIPT interface exists to provide + * page table allocation and indexing on top of which a table hierarchy + * (EPT, VT-d, etc) can be built by upstack logic. + * + * Types: + * + * gipt_t - Represents a single page table with a physical backing page and + * associated metadata. + * gipt_map_t - The workhorse of this facility, it contains a hash table to + * index all of the gipt_t entries which make up the page table tree.
+ * struct gipt_cbs - Callbacks used by the gipt_map_t: + * gipt_pte_type_cb_t - Given a PTE, emit the type (empty/page/table) + * gipt_pte_map_cb_t - Given a PFN, emit a (child) table mapping + */ + +/* + * For now, the level shifts are hard-coded to match with standard 4-level + * 64-bit paging structures. + */ + +#define GIPT_HASH(map, va, lvl) \ + ((((va) >> 12) + ((va) >> 28) + (lvl)) & ((map)->giptm_table_cnt - 1)) + +const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1] = { + 12, /* 4K */ + 21, /* 2M */ + 30, /* 1G */ + 39, /* 512G */ + 48 /* MAX */ +}; +const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1] = { + 0xfffffffffffff000ull, /* 4K */ + 0xffffffffffe00000ull, /* 2M */ + 0xffffffffc0000000ull, /* 1G */ + 0xffffff8000000000ull, /* 512G */ + 0xffff000000000000ull /* MAX */ +}; +const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1] = { + 0x0000000000001000ull, /* 4K */ + 0x0000000000200000ull, /* 2M */ + 0x0000000040000000ull, /* 1G */ + 0x0000008000000000ull, /* 512G */ + 0x0001000000000000ull /* MAX */ +}; +const uint64_t gipt_level_count[GIPT_MAX_LEVELS+1] = { + 0x0000000000000001ull, /* 4K */ + 0x0000000000000200ull, /* 2M */ + 0x0000000000040000ull, /* 1G */ + 0x0000000008000000ull, /* 512G */ + 0x0000001000000000ull /* MAX */ +}; + +/* + * Allocate a gipt_t structure with corresponding page of memory to hold the + * PTEs which it contains. + */ +gipt_t * +gipt_alloc(void) +{ + gipt_t *pt; + void *page; + + pt = kmem_zalloc(sizeof (*pt), KM_SLEEP); + page = kmem_zalloc(PAGESIZE, KM_SLEEP); + pt->gipt_kva = page; + pt->gipt_pfn = hat_getpfnum(kas.a_hat, page); + + return (pt); +} + +/* + * Free a gipt_t structure along with its page of PTE storage. + */ +void +gipt_free(gipt_t *pt) +{ + void *page = pt->gipt_kva; + + ASSERT(pt->gipt_pfn != PFN_INVALID); + ASSERT(pt->gipt_kva != NULL); + + pt->gipt_pfn = PFN_INVALID; + pt->gipt_kva = NULL; + + kmem_free(page, PAGESIZE); + kmem_free(pt, sizeof (*pt)); +} + +/* + * Initialize a gipt_map_t with a max level (must be >= 1) and allocating its + * hash table based on a provided size (must be a power of 2). + */ +void +gipt_map_init(gipt_map_t *map, uint_t levels, uint_t hash_table_size, + const struct gipt_cbs *cbs, gipt_t *root) +{ + VERIFY(map->giptm_root == NULL); + VERIFY(map->giptm_hash == NULL); + VERIFY3U(levels, >, 0); + VERIFY3U(levels, <=, GIPT_MAX_LEVELS); + VERIFY(ISP2(hash_table_size)); + VERIFY(root != NULL); + + mutex_init(&map->giptm_lock, NULL, MUTEX_DEFAULT, NULL); + map->giptm_table_cnt = hash_table_size; + bcopy(cbs, &map->giptm_cbs, sizeof (*cbs)); + map->giptm_hash = kmem_alloc(sizeof (list_t) * map->giptm_table_cnt, + KM_SLEEP); + for (uint_t i = 0; i < hash_table_size; i++) { + list_create(&map->giptm_hash[i], sizeof (gipt_t), + offsetof(gipt_t, gipt_node)); + } + map->giptm_levels = levels; + + /* + * Insert the table root into the hash. It will be held in existence + * with an extra "valid" reference. This will prevent its clean-up + * during gipt_map_clean_parents() calls, even if it has no children. + */ + mutex_enter(&map->giptm_lock); + gipt_map_insert(map, root); + map->giptm_root = root; + root->gipt_valid_cnt++; + mutex_exit(&map->giptm_lock); +} + +/* + * Clean up a gipt_map_t by removing any lingering gipt_t entries referenced by + * it, and freeing its hash table. 
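gipt_map_init() above takes the callback vector through which a backend defines its own PTE encoding. A hedged sketch of a backend wiring (the PTE format here is invented for illustration; the exact callback signatures live in sys/gipt.h and are inferred from their use in this file):

	/*
	 * Hypothetical x86-style PTE encoding: bit 0 is "present", bit 7
	 * marks a large page rather than a link to a child table.
	 */
	static gipt_pte_type_t
	example_pte_type(uint64_t pte, uint_t level)
	{
		if ((pte & 0x1) == 0)
			return (PTET_EMPTY);
		if (level == 0 || (pte & 0x80) != 0)
			return (PTET_PAGE);
		return (PTET_LINK);
	}

	static uint64_t
	example_pte_map(pfn_t pfn)
	{
		/* Link entry: child table PFN plus the "present" bit. */
		return (((uint64_t)pfn << 12) | 0x1);
	}

	static void
	example_map_setup(gipt_map_t *map)
	{
		/* 'map' is assumed zeroed, as gipt_map_init() VERIFYs. */
		struct gipt_cbs cbs = {
			.giptc_pte_type = example_pte_type,
			.giptc_pte_map = example_pte_map,
		};
		gipt_t *root = gipt_alloc();

		root->gipt_level = 3;	/* 4-level paging: root at level 3 */
		gipt_map_init(map, 4, 512, &cbs, root);
	}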
+ */ +void +gipt_map_fini(gipt_map_t *map) +{ + const uint_t cnt = map->giptm_table_cnt; + const size_t sz = sizeof (list_t) * cnt; + + mutex_enter(&map->giptm_lock); + /* Clean up any lingering tables */ + for (uint_t i = 0; i < cnt; i++) { + list_t *list = &map->giptm_hash[i]; + gipt_t *pt; + + while ((pt = list_remove_head(list)) != NULL) { + gipt_free(pt); + } + ASSERT(list_is_empty(list)); + } + + kmem_free(map->giptm_hash, sz); + map->giptm_hash = NULL; + map->giptm_root = NULL; + map->giptm_levels = 0; + mutex_exit(&map->giptm_lock); + mutex_destroy(&map->giptm_lock); +} + +/* + * Look in the map for a gipt_t containing a given VA which is located at a + * specified level. + */ +gipt_t * +gipt_map_lookup(gipt_map_t *map, uint64_t va, uint_t lvl) +{ + gipt_t *pt; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + ASSERT3U(lvl, <=, GIPT_MAX_LEVELS); + + /* + * Lookup gipt_t at the VA aligned to the next level up. For example, + * level 0 corresponds to a page table containing 512 PTEs which cover + * 4k each, spanning a total 2MB. As such, the base VA of that table + * must be aligned to the same 2MB. + */ + const uint64_t masked_va = va & gipt_level_mask[lvl + 1]; + const uint_t hash = GIPT_HASH(map, masked_va, lvl); + + /* Only the root is expected to be at the top level. */ + if (lvl == (map->giptm_levels - 1) && map->giptm_root != NULL) { + pt = map->giptm_root; + + ASSERT3U(pt->gipt_level, ==, lvl); + + /* + * It may be so that the VA in question is not covered by the + * range of the table root. + */ + if (pt->gipt_vaddr != masked_va) { + return (NULL); + } + + return (pt); + } + + list_t *list = &map->giptm_hash[hash]; + for (pt = list_head(list); pt != NULL; pt = list_next(list, pt)) { + if (pt->gipt_vaddr == masked_va && pt->gipt_level == lvl) + break; + } + return (pt); +} + +/* + * Look in the map for the deepest (lowest level) gipt_t which contains a given + * VA. This could still fail if the VA is outside the range of the table root. + */ +gipt_t * +gipt_map_lookup_deepest(gipt_map_t *map, uint64_t va) +{ + gipt_t *pt = NULL; + uint_t lvl; + + ASSERT(MUTEX_HELD(&map->giptm_lock)); + + for (lvl = 0; lvl < map->giptm_levels; lvl++) { + pt = gipt_map_lookup(map, va, lvl); + if (pt != NULL) { + break; + } + } + return (pt); +} + +/* + * Given a VA inside a gipt_t, calculate (based on the level of that PT) the VA + * corresponding to the next entry in the table. It returns 0 if that VA would + * fall beyond the bounds of the table. + */ +static __inline__ uint64_t +gipt_next_va(gipt_t *pt, uint64_t va) +{ + const uint_t lvl = pt->gipt_level; + const uint64_t masked = va & gipt_level_mask[lvl]; + const uint64_t max = pt->gipt_vaddr + gipt_level_size[lvl+1]; + const uint64_t next = masked + gipt_level_size[lvl]; + + ASSERT3U(masked, >=, pt->gipt_vaddr); + ASSERT3U(masked, <, max); + + /* + * If the "next" VA would be outside this table, including cases where + * it overflowed, indicate an error result. + */ + if (next >= max || next <= masked) { + return (0); + } + return (next); +} + +/* + * For a given VA, find the next VA which corresponds to a valid page mapping. + * The gipt_t containing that VA will be indicated via 'ptp'. (The gipt_t of + * the starting VA can be passed in via 'ptp' for a minor optimization). If + * there is no valid mapping higher than 'va' but contained within 'max_va', + * then this will indicate failure with 0 returned. 
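A hedged sketch of the walk loop this function is designed for (the example_ name is illustrative; giptm_lock must be held, per the ASSERTs):

	static void
	example_walk(gipt_map_t *map, uint64_t va, uint64_t max_va)
	{
		gipt_t *pt = NULL;
		uint64_t cur = va;

		mutex_enter(&map->giptm_lock);
		while ((cur = gipt_map_next_page(map, cur, max_va, &pt)) != 0) {
			/* 'pt' now names the table containing the mapping. */
			const uint64_t pte = GIPT_VA2PTE(pt, cur);

			/* ... inspect 'pte' for the page mapped at 'cur' ... */
		}
		mutex_exit(&map->giptm_lock);
	}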
+
+/*
+ * For a given VA, find the next VA which corresponds to a valid page mapping.
+ * The gipt_t containing that VA will be indicated via 'ptp'.  (The gipt_t of
+ * the starting VA can be passed in via 'ptp' for a minor optimization.)  If
+ * there is no valid mapping higher than 'va' but contained within 'max_va',
+ * then this will indicate failure with 0 returned.
+ */
+uint64_t
+gipt_map_next_page(gipt_map_t *map, uint64_t va, uint64_t max_va, gipt_t **ptp)
+{
+	gipt_t *pt = *ptp;
+	uint64_t cur_va = va;
+	gipt_pte_type_cb_t pte_type = map->giptm_cbs.giptc_pte_type;
+
+	ASSERT(MUTEX_HELD(&map->giptm_lock));
+	ASSERT3U(max_va, !=, 0);
+	ASSERT3P(ptp, !=, NULL);
+
+	/*
+	 * If a starting table is not provided, search the map for the deepest
+	 * table which contains the VA.  If for some reason that VA is beyond
+	 * coverage of the map root, indicate failure.
+	 */
+	if (pt == NULL) {
+		pt = gipt_map_lookup_deepest(map, cur_va);
+		if (pt == NULL) {
+			goto fail;
+		}
+	}
+
+	/*
+	 * From the starting table (at whatever level it may reside), walk
+	 * forward through the PTEs looking for a valid page mapping.
+	 */
+	while (cur_va < max_va) {
+		const uint64_t next_va = gipt_next_va(pt, cur_va);
+		if (next_va == 0) {
+			/*
+			 * The end of this table has been reached.  Ascend one
+			 * level to continue the walk if possible.  If already
+			 * at the root, the end of the table means failure.
+			 */
+			if (pt->gipt_level >= map->giptm_levels) {
+				goto fail;
+			}
+			pt = gipt_map_lookup(map, cur_va, pt->gipt_level + 1);
+			if (pt == NULL) {
+				goto fail;
+			}
+			continue;
+		} else if (next_va >= max_va) {
+			/*
+			 * Terminate the walk with a failure if the VA
+			 * corresponding to the next PTE is beyond the max.
+			 */
+			goto fail;
+		}
+		cur_va = next_va;
+
+		const uint64_t pte = GIPT_VA2PTE(pt, cur_va);
+		const gipt_pte_type_t ptet = pte_type(pte, pt->gipt_level);
+		if (ptet == PTET_EMPTY) {
+			continue;
+		} else if (ptet == PTET_PAGE) {
+			/* A valid page mapping: success. */
+			*ptp = pt;
+			return (cur_va);
+		} else if (ptet == PTET_LINK) {
+			/*
+			 * A child page table is present at this PTE.  Look it
+			 * up from the map.
+			 */
+			ASSERT3U(pt->gipt_level, >, 0);
+			pt = gipt_map_lookup(map, cur_va, pt->gipt_level - 1);
+			ASSERT3P(pt, !=, NULL);
+			break;
+		} else {
+			panic("unexpected PTE type %x @ va %p", ptet, cur_va);
+		}
+	}
+
+	/*
+	 * By this point, the above loop has located a table structure to
+	 * descend into in order to find the next page.
+	 */
+	while (cur_va < max_va) {
+		const uint64_t pte = GIPT_VA2PTE(pt, cur_va);
+		const gipt_pte_type_t ptet = pte_type(pte, pt->gipt_level);
+
+		if (ptet == PTET_EMPTY) {
+			const uint64_t next_va = gipt_next_va(pt, cur_va);
+			if (next_va == 0 || next_va >= max_va) {
+				goto fail;
+			}
+			cur_va = next_va;
+			continue;
+		} else if (ptet == PTET_PAGE) {
+			/* A valid page mapping: success. */
+			*ptp = pt;
+			return (cur_va);
+		} else if (ptet == PTET_LINK) {
+			/*
+			 * A child page table is present at this PTE.  Look it
+			 * up from the map.
+			 */
+			ASSERT3U(pt->gipt_level, >, 0);
+			pt = gipt_map_lookup(map, cur_va, pt->gipt_level - 1);
+			ASSERT3P(pt, !=, NULL);
+		} else {
+			panic("unexpected PTE type %x @ va %p", ptet, cur_va);
+		}
+	}
+
+fail:
+	*ptp = NULL;
+	return (0);
+}
+
+/*
+ * Insert a gipt_t into the map based on its VA and level.  It is up to the
+ * caller to ensure that a duplicate entry does not already exist in the map.
+ */
+void
+gipt_map_insert(gipt_map_t *map, gipt_t *pt)
+{
+	const uint_t hash = GIPT_HASH(map, pt->gipt_vaddr, pt->gipt_level);
+
+	ASSERT(MUTEX_HELD(&map->giptm_lock));
+	ASSERT(gipt_map_lookup(map, pt->gipt_vaddr, pt->gipt_level) == NULL);
+	VERIFY3U(pt->gipt_level, <, map->giptm_levels);
+
+	list_insert_head(&map->giptm_hash[hash], pt);
+}
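gipt_map_next_page() is the basis for range walks. Here is a sketch of how a caller might enumerate every valid page mapping in a range, again assuming a map initialized as in the earlier examples; note that each call yields mappings strictly above the VA passed in, with 'pt' carried between calls as the documented optimization.

static void
example_walk_range(gipt_map_t *map, uint64_t start, uint64_t end)
{
	gipt_t *pt = NULL;
	uint64_t va = start;

	mutex_enter(&map->giptm_lock);
	while ((va = gipt_map_next_page(map, va, end, &pt)) != 0) {
		const uint64_t pte = GIPT_VA2PTE(pt, va);

		/* ... act on the page mapped at 'va' ... */
		(void) pte;
	}
	mutex_exit(&map->giptm_lock);
}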
+
+/*
+ * Remove a gipt_t from the map.
+ */
+void
+gipt_map_remove(gipt_map_t *map, gipt_t *pt)
+{
+	const uint_t hash = GIPT_HASH(map, pt->gipt_vaddr, pt->gipt_level);
+
+	ASSERT(MUTEX_HELD(&map->giptm_lock));
+
+	list_remove(&map->giptm_hash[hash], pt);
+}
+
+/*
+ * Given a VA, create any missing gipt_t entries from the specified level all
+ * the way up to (but not including) the root.  This is done from lowest
+ * level to highest, and stops when an existing table covering that VA is
+ * found.  References to any created gipt_t tables, plus the final "found"
+ * gipt_t, are stored in 'pts'.  The number of gipt_t pointers stored to
+ * 'pts' serves as the return value (1 <= val <= root level).  It is up to
+ * the caller to populate linking PTEs to the newly created empty tables.
+ */
+static uint_t
+gipt_map_ensure_chain(gipt_map_t *map, uint64_t va, uint_t lvl, gipt_t **pts)
+{
+	const uint_t root_lvl = map->giptm_root->gipt_level;
+	uint_t clvl = lvl, count = 0;
+	gipt_t *child_pt = NULL;
+
+	ASSERT(MUTEX_HELD(&map->giptm_lock));
+	ASSERT3U(lvl, <, root_lvl);
+	ASSERT3P(map->giptm_root, !=, NULL);
+
+	do {
+		const uint64_t pva = (va & gipt_level_mask[clvl + 1]);
+		gipt_t *pt;
+
+		pt = gipt_map_lookup(map, pva, clvl);
+		if (pt != NULL) {
+			ASSERT3U(pva, ==, pt->gipt_vaddr);
+
+			if (child_pt != NULL) {
+				child_pt->gipt_parent = pt;
+			}
+			pts[count++] = pt;
+			return (count);
+		}
+
+		pt = gipt_alloc();
+		pt->gipt_vaddr = pva;
+		pt->gipt_level = clvl;
+		if (child_pt != NULL) {
+			child_pt->gipt_parent = pt;
+		}
+
+		gipt_map_insert(map, pt);
+		child_pt = pt;
+		pts[count++] = pt;
+		clvl++;
+	} while (clvl <= root_lvl);
+
+	return (count);
+}
+
+/*
+ * Ensure that a page table covering a VA at a specified level exists.  This
+ * will create any necessary tables chaining up to the root as well.
+ */
+gipt_t *
+gipt_map_create_parents(gipt_map_t *map, uint64_t va, uint_t lvl)
+{
+	gipt_t *pt, *pts[GIPT_MAX_LEVELS] = { 0 };
+	gipt_pte_type_cb_t pte_type = map->giptm_cbs.giptc_pte_type;
+	gipt_pte_map_cb_t pte_map = map->giptm_cbs.giptc_pte_map;
+	uint64_t *ptep;
+	uint_t i, count;
+
+	ASSERT(MUTEX_HELD(&map->giptm_lock));
+
+	count = gipt_map_ensure_chain(map, va, lvl, pts);
+	if (count == 1) {
+		/* Table already exists in the hierarchy */
+		return (pts[0]);
+	}
+	ASSERT3U(count, >, 1);
+
+	/* Make sure there is not already a large page mapping at the top */
+	pt = pts[count - 1];
+	if (pte_type(GIPT_VA2PTE(pt, va), pt->gipt_level) == PTET_PAGE) {
+		const uint_t end = count - 1;
+
+		/*
+		 * Nuke those gipt_t entries which were optimistically created
+		 * for what was found to be a conflicting mapping.
+		 */
+		for (i = 0; i < end; i++) {
+			gipt_map_remove(map, pts[i]);
+			gipt_free(pts[i]);
+		}
+		return (NULL);
+	}
+
+	/* Initialize the appropriate tables from bottom to top */
+	for (i = 1; i < count; i++) {
+		pt = pts[i];
+		ptep = GIPT_VA2PTEP(pt, va);
+
+		/*
+		 * Since gipt_map_ensure_chain() creates missing tables until
+		 * it finds a valid one, and that existing table has been
+		 * checked for the existence of a large page, nothing should
+		 * occupy this PTE.
+		 */
+		ASSERT3U(pte_type(*ptep, pt->gipt_level), ==, PTET_EMPTY);
+
+		*ptep = pte_map(pts[i - 1]->gipt_pfn);
+		pt->gipt_valid_cnt++;
+	}
+
+	return (pts[0]);
+}
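Putting the pieces together, a consumer might map and unmap a single 4K page roughly as follows. This sketch reuses the hypothetical example_pte_map() encoding from the first example; the valid-count handling mirrors what gipt_map_create_parents() does for linking PTEs, and gipt_map_clean_parents() (below) trims any tables left empty by the unmap.

static boolean_t
example_map_4k(gipt_map_t *map, uint64_t va, uint64_t pfn)
{
	boolean_t ok = B_FALSE;

	mutex_enter(&map->giptm_lock);
	/* Ensure a level-0 table covering 'va' (and its parents) exists. */
	gipt_t *pt = gipt_map_create_parents(map, va, 0);
	if (pt != NULL) {
		uint64_t *ptep = GIPT_VA2PTEP(pt, va);

		if (map->giptm_cbs.giptc_pte_type(*ptep, 0) == PTET_EMPTY) {
			*ptep = example_pte_map(pfn);
			pt->gipt_valid_cnt++;
			ok = B_TRUE;
		}
	}
	mutex_exit(&map->giptm_lock);

	return (ok);
}

static void
example_unmap_4k(gipt_map_t *map, uint64_t va)
{
	mutex_enter(&map->giptm_lock);
	gipt_t *pt = gipt_map_lookup(map, va, 0);
	if (pt != NULL) {
		uint64_t *ptep = GIPT_VA2PTEP(pt, va);

		if (map->giptm_cbs.giptc_pte_type(*ptep, 0) == PTET_PAGE) {
			*ptep = 0;
			pt->gipt_valid_cnt--;
			gipt_map_clean_parents(map, pt);
		}
	}
	mutex_exit(&map->giptm_lock);
}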
+
+/*
+ * If a page table is empty, free it from the map, as well as any parent
+ * tables that would subsequently become empty as part of the clean-up.  As
+ * noted in gipt_map_init(), the table root is a special case and will remain
+ * in the map, even when empty.
+ */
+void
+gipt_map_clean_parents(gipt_map_t *map, gipt_t *pt)
+{
+	ASSERT(MUTEX_HELD(&map->giptm_lock));
+
+	while (pt->gipt_valid_cnt == 0) {
+		gipt_t *parent = pt->gipt_parent;
+		uint64_t *ptep = GIPT_VA2PTEP(parent, pt->gipt_vaddr);
+
+		ASSERT3S(map->giptm_cbs.giptc_pte_type(*ptep,
+		    parent->gipt_level), ==, PTET_LINK);
+
+		/*
+		 * For now, it is assumed that all gipt consumers consider PTE
+		 * zeroing as an adequate action for table unmap.
+		 */
+		*ptep = 0;
+
+		parent->gipt_valid_cnt--;
+		gipt_map_remove(map, pt);
+		gipt_free(pt);
+		pt = parent;
+	}
+}
diff --git a/usr/src/uts/i86pc/sys/gipt.h b/usr/src/uts/i86pc/sys/gipt.h
new file mode 100644
index 0000000000..4d7d523726
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/gipt.h
@@ -0,0 +1,92 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _GIPT_H_
+#define	_GIPT_H_
+
+#include <sys/types.h>
+#include <sys/mutex.h>
+#include <sys/param.h>
+#include <sys/list.h>
+
+struct gipt {
+	list_node_t	gipt_node;
+	uint64_t	gipt_vaddr;
+	uint64_t	gipt_pfn;
+	uint16_t	gipt_level;
+	uint16_t	gipt_valid_cnt;
+	uint32_t	_gipt_pad;
+	struct gipt	*gipt_parent;
+	uint64_t	*gipt_kva;
+	uint64_t	_gipt_pad2;
+};
+typedef struct gipt gipt_t;
+
+typedef enum {
+	PTET_EMPTY	= 0,
+	PTET_PAGE	= 1,
+	PTET_LINK	= 2,
+} gipt_pte_type_t;
+
+/* Given a PTE and its level, determine the type of that PTE */
+typedef gipt_pte_type_t (*gipt_pte_type_cb_t)(uint64_t, uint_t);
+/* Given the PFN of a child table, emit a PTE that references it */
+typedef uint64_t (*gipt_pte_map_cb_t)(uint64_t);
+
+struct gipt_cbs {
+	gipt_pte_type_cb_t	giptc_pte_type;
+	gipt_pte_map_cb_t	giptc_pte_map;
+};
+
+struct gipt_map {
+	kmutex_t	giptm_lock;
+	gipt_t		*giptm_root;
+	list_t		*giptm_hash;
+	struct gipt_cbs	giptm_cbs;
+	size_t		giptm_table_cnt;
+	uint_t		giptm_levels;
+};
+typedef struct gipt_map gipt_map_t;
+
+#define	GIPT_HASH_SIZE_DEFAULT	0x2000
+#define	GIPT_MAX_LEVELS		4
+
+#define	GIPT_VA2IDX(pt, va)	\
+	(((va) - (pt)->gipt_vaddr) >>	\
+	gipt_level_shift[(pt)->gipt_level])
+
+#define	GIPT_VA2PTE(pt, va)	((pt)->gipt_kva[GIPT_VA2IDX(pt, va)])
+#define	GIPT_VA2PTEP(pt, va)	(&(pt)->gipt_kva[GIPT_VA2IDX(pt, va)])
+
+extern const uint_t gipt_level_shift[GIPT_MAX_LEVELS+1];
+extern const uint64_t gipt_level_mask[GIPT_MAX_LEVELS+1];
+extern const uint64_t gipt_level_size[GIPT_MAX_LEVELS+1];
+extern const uint64_t gipt_level_count[GIPT_MAX_LEVELS+1];
+
+extern gipt_t *gipt_alloc(void);
+extern void gipt_free(gipt_t *);
+extern void gipt_map_init(gipt_map_t *, uint_t, uint_t,
+    const struct gipt_cbs *, gipt_t *);
+extern void gipt_map_fini(gipt_map_t *);
+extern gipt_t *gipt_map_lookup(gipt_map_t *, uint64_t, uint_t);
+extern gipt_t *gipt_map_lookup_deepest(gipt_map_t *, uint64_t);
+extern uint64_t gipt_map_next_page(gipt_map_t *, uint64_t, uint64_t,
+    gipt_t **);
+extern void gipt_map_insert(gipt_map_t *, gipt_t *);
+extern void gipt_map_remove(gipt_map_t *, gipt_t *);
+extern gipt_t *gipt_map_create_parents(gipt_map_t *, uint64_t, uint_t);
+extern void gipt_map_clean_parents(gipt_map_t *, gipt_t *);
+
+#endif /* _GIPT_H_ */
diff --git a/usr/src/uts/i86pc/sys/viona_io.h
b/usr/src/uts/i86pc/sys/viona_io.h index a4fb0f2527..a26cc00a55 100644 --- a/usr/src/uts/i86pc/sys/viona_io.h +++ b/usr/src/uts/i86pc/sys/viona_io.h @@ -11,6 +11,7 @@ /* * Copyright 2013 Pluribus Networks Inc. + * Copyright 2017 Joyent, Inc. */ #ifndef _VIONA_IO_H_ @@ -27,8 +28,8 @@ #define VNA_IOC_TX_RING_KICK (VNA_IOC | 8) #define VNA_IOC_RX_INTR_CLR (VNA_IOC | 9) #define VNA_IOC_TX_INTR_CLR (VNA_IOC | 10) -#define VNA_IOC_SET_FEATURES (VNA_IOC | 11) -#define VNA_IOC_GET_FEATURES (VNA_IOC | 12) +#define VNA_IOC_SET_FEATURES (VNA_IOC | 11) +#define VNA_IOC_GET_FEATURES (VNA_IOC | 12) typedef struct vioc_create { datalink_id_t c_linkid; diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h index e876ce748f..8a35d123c7 100644 --- a/usr/src/uts/i86pc/sys/vmm.h +++ b/usr/src/uts/i86pc/sys/vmm.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/include/vmm.h 273375 2014-10-21 07:10:43Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,18 +38,25 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. */ #ifndef _VMM_H_ #define _VMM_H_ +#include <sys/sdt.h> #include <x86/segments.h> +#ifdef _KERNEL +SDT_PROVIDER_DECLARE(vmm); +#endif + enum vm_suspend_how { VM_SUSPEND_NONE, VM_SUSPEND_RESET, VM_SUSPEND_POWEROFF, VM_SUSPEND_HALT, + VM_SUSPEND_TRIPLEFAULT, VM_SUSPEND_LAST }; @@ -89,6 +98,16 @@ enum vm_reg_name { VM_REG_GUEST_GDTR, VM_REG_GUEST_EFER, VM_REG_GUEST_CR2, + VM_REG_GUEST_PDPTE0, + VM_REG_GUEST_PDPTE1, + VM_REG_GUEST_PDPTE2, + VM_REG_GUEST_PDPTE3, + VM_REG_GUEST_INTR_SHADOW, + VM_REG_GUEST_DR0, + VM_REG_GUEST_DR1, + VM_REG_GUEST_DR2, + VM_REG_GUEST_DR3, + VM_REG_GUEST_DR6, VM_REG_LAST }; @@ -108,31 +127,37 @@ enum x2apic_state { #define VM_INTINFO_HWEXCEPTION (3 << 8) #define VM_INTINFO_SWINTR (4 << 8) + #define VM_MAX_NAMELEN 32 #ifdef _KERNEL struct vm; struct vm_exception; -struct vm_memory_segment; struct seg_desc; struct vm_exit; struct vm_run; struct vhpet; struct vioapic; struct vlapic; +struct vmspace; +struct vm_object; struct vm_guest_paging; +struct pmap; -typedef int (*vmm_init_func_t)(void); +struct vm_eventinfo { + u_int *rptr; /* runblock cookie */ + int *sptr; /* suspend cookie */ + int *iptr; /* reqidle cookie */ +}; + +typedef int (*vmm_init_func_t)(int ipinum); typedef int (*vmm_cleanup_func_t)(void); -typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */ -typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip); +typedef void (*vmm_resume_func_t)(void); +typedef void * (*vmi_init_func_t)(struct vm *vm, struct pmap *pmap); +typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip, + struct pmap *pmap, struct vm_eventinfo *info); typedef void (*vmi_cleanup_func_t)(void *vmi); -typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa, - vm_paddr_t hpa, size_t length, - vm_memattr_t attr, int prot, - boolean_t superpages_ok); -typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa); typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num, uint64_t *retval); typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num, @@ -143,26 +168,38 @@ typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num, struct seg_desc *desc); typedef int (*vmi_get_cap_t)(void *vmi, 
int vcpu, int num, int *retval); typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val); +typedef struct vmspace * (*vmi_vmspace_alloc)(vm_offset_t min, vm_offset_t max); +typedef void (*vmi_vmspace_free)(struct vmspace *vmspace); typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu); typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic); +#ifndef __FreeBSD__ +typedef void (*vmi_savectx)(void *vmi, int vcpu); +typedef void (*vmi_restorectx)(void *vmi, int vcpu); +#endif struct vmm_ops { vmm_init_func_t init; /* module wide initialization */ vmm_cleanup_func_t cleanup; + vmm_resume_func_t resume; vmi_init_func_t vminit; /* vm-specific initialization */ vmi_run_func_t vmrun; vmi_cleanup_func_t vmcleanup; - vmi_mmap_set_func_t vmmmap_set; - vmi_mmap_get_func_t vmmmap_get; vmi_get_register_t vmgetreg; vmi_set_register_t vmsetreg; vmi_get_desc_t vmgetdesc; vmi_set_desc_t vmsetdesc; vmi_get_cap_t vmgetcap; vmi_set_cap_t vmsetcap; + vmi_vmspace_alloc vmspace_alloc; + vmi_vmspace_free vmspace_free; vmi_vlapic_init vlapic_init; vmi_vlapic_cleanup vlapic_cleanup; + +#ifndef __FreeBSD__ + vmi_savectx vmsavectx; + vmi_restorectx vmrestorectx; +#endif }; extern struct vmm_ops vmm_ops_intel; @@ -170,20 +207,41 @@ extern struct vmm_ops vmm_ops_amd; int vm_create(const char *name, struct vm **retvm); void vm_destroy(struct vm *vm); +int vm_reinit(struct vm *vm); const char *vm_name(struct vm *vm); -int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len); -#ifdef __FreeBSD__ +uint16_t vm_get_maxcpus(struct vm *vm); +void vm_get_topology(struct vm *vm, uint16_t *sockets, uint16_t *cores, + uint16_t *threads, uint16_t *maxcpus); +int vm_set_topology(struct vm *vm, uint16_t sockets, uint16_t cores, + uint16_t threads, uint16_t maxcpus); + +/* + * APIs that modify the guest memory map require all vcpus to be frozen. + */ +int vm_mmap_memseg(struct vm *vm, vm_paddr_t gpa, int segid, vm_ooffset_t off, + size_t len, int prot, int flags); +int vm_alloc_memseg(struct vm *vm, int ident, size_t len, bool sysmem); +void vm_free_memseg(struct vm *vm, int ident); int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa); -#endif int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len); -#ifndef __FreeBSD__ -vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size); -#endif -void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot, - void **cookie); +int vm_assign_pptdev(struct vm *vm, int bus, int slot, int func); +int vm_unassign_pptdev(struct vm *vm, int bus, int slot, int func); + +/* + * APIs that inspect the guest memory map require only a *single* vcpu to + * be frozen. This acts like a read lock on the guest memory map since any + * modification requires *all* vcpus to be frozen. 
+ */ +int vm_mmap_getnext(struct vm *vm, vm_paddr_t *gpa, int *segid, + vm_ooffset_t *segoff, size_t *len, int *prot, int *flags); +int vm_get_memseg(struct vm *vm, int ident, size_t *len, bool *sysmem, + struct vm_object **objptr); +vm_paddr_t vmm_sysmem_maxaddr(struct vm *vm); +void *vm_gpa_hold(struct vm *, int vcpuid, vm_paddr_t gpa, size_t len, + int prot, void **cookie); void vm_gpa_release(void *cookie); -int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase, - struct vm_memory_segment *seg); +bool vm_mem_allocated(struct vm *vm, int vcpuid, vm_paddr_t gpa); + int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval); int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val); int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, @@ -191,6 +249,7 @@ int vm_get_seg_desc(struct vm *vm, int vcpu, int reg, int vm_set_seg_desc(struct vm *vm, int vcpu, int reg, struct seg_desc *desc); int vm_run(struct vm *vm, struct vm_run *vmrun); +int vm_suspend(struct vm *vm, enum vm_suspend_how how); int vm_inject_nmi(struct vm *vm, int vcpu); int vm_nmi_pending(struct vm *vm, int vcpuid); void vm_nmi_clear(struct vm *vm, int vcpuid); @@ -206,10 +265,43 @@ int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state); int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state); int vm_apicid2vcpuid(struct vm *vm, int apicid); int vm_activate_cpu(struct vm *vm, int vcpu); -cpuset_t vm_active_cpus(struct vm *vm); +int vm_suspend_cpu(struct vm *vm, int vcpu); +int vm_resume_cpu(struct vm *vm, int vcpu); struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid); +void vm_exit_suspended(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_debug(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_runblock(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_astpending(struct vm *vm, int vcpuid, uint64_t rip); +void vm_exit_reqidle(struct vm *vm, int vcpuid, uint64_t rip); + +#ifdef _SYS__CPUSET_H_ +cpuset_t vm_active_cpus(struct vm *vm); +cpuset_t vm_debug_cpus(struct vm *vm); +cpuset_t vm_suspended_cpus(struct vm *vm); +#endif /* _SYS__CPUSET_H_ */ + +static __inline int +vcpu_runblocked(struct vm_eventinfo *info) +{ -typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg); + return (*info->rptr != 0); +} + +static __inline int +vcpu_suspended(struct vm_eventinfo *info) +{ + + return (*info->sptr); +} + +static __inline int +vcpu_reqidle(struct vm_eventinfo *info) +{ + + return (*info->iptr); +} + +int vcpu_debugged(struct vm *vm, int vcpuid); /* * Return 1 if device indicated by bus/slot/func is supposed to be a @@ -231,21 +323,43 @@ enum vcpu_state { int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state, bool from_idle); enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu); +void vcpu_block_run(struct vm *, int); +void vcpu_unblock_run(struct vm *, int); + +#ifndef __FreeBSD__ +uint64_t vcpu_tsc_offset(struct vm *vm, int vcpuid); +#endif -static int __inline +static __inline int vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu) { return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING); } +#ifdef _SYS_THREAD_H +static __inline int +vcpu_should_yield(struct vm *vm, int vcpu) +{ + + if (curthread->t_astflag) + return (1); + else if (CPU->cpu_runrun) + return (1); + else + return (0); +} +#endif /* _SYS_THREAD_H */ + void *vcpu_stats(struct vm *vm, int vcpu); -void vm_interrupt_hostcpu(struct vm *vm, int vcpu); void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr); +struct vmspace 
*vm_get_vmspace(struct vm *vm); struct vatpic *vm_atpic(struct vm *vm); struct vatpit *vm_atpit(struct vm *vm); +struct vpmtmr *vm_pmtmr(struct vm *vm); +struct vrtc *vm_rtc(struct vm *vm); /* - * Inject exception 'vme' into the guest vcpu. This function returns 0 on + * Inject exception 'vector' into the guest vcpu. This function returns 0 on * success and non-zero on failure. * * Wrapper functions like 'vm_inject_gp()' should be preferred to calling @@ -255,7 +369,8 @@ struct vatpit *vm_atpit(struct vm *vm); * This function should only be called in the context of the thread that is * executing this vcpu. */ -int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme); +int vm_inject_exception(struct vm *vm, int vcpuid, int vector, int err_valid, + uint32_t errcode, int restart_instruction); /* * This function is called after a VM-exit that occurred during exception or @@ -298,9 +413,10 @@ struct vm_copyinfo { * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for * a copyin or PROT_WRITE for a copyout. * - * Returns 0 on success. - * Returns 1 if an exception was injected into the guest. - * Returns -1 otherwise. + * retval is_fault Interpretation + * 0 0 Success + * 0 1 An exception was injected into the guest + * EFAULT N/A Unrecoverable error * * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if * the return value is 0. The 'copyinfo[]' resources should be freed by calling @@ -308,16 +424,18 @@ struct vm_copyinfo { */ int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo, - int num_copyinfo); + int num_copyinfo, int *is_fault); void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, int num_copyinfo); void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr, size_t len); void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr, struct vm_copyinfo *copyinfo, size_t len); + +int vcpu_trace_exceptions(struct vm *vm, int vcpuid); #endif /* KERNEL */ -#define VM_MAXCPU 16 /* maximum virtual cpus */ +#define VM_MAXCPU 32 /* maximum virtual cpus */ /* * Identifiers for optional vmm capabilities @@ -348,7 +466,6 @@ struct seg_desc { uint32_t limit; uint32_t access; }; - #define SEG_DESC_TYPE(access) ((access) & 0x001f) #define SEG_DESC_DPL(access) (((access) >> 5) & 0x3) #define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 
1 : 0) @@ -443,7 +560,20 @@ enum vm_exitcode { VM_EXITCODE_INST_EMUL, VM_EXITCODE_SPINUP_AP, VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */ + VM_EXITCODE_RUNBLOCK, + VM_EXITCODE_IOAPIC_EOI, + VM_EXITCODE_SUSPENDED, VM_EXITCODE_INOUT_STR, + VM_EXITCODE_TASK_SWITCH, + VM_EXITCODE_MONITOR, + VM_EXITCODE_MWAIT, + VM_EXITCODE_SVM, + VM_EXITCODE_REQIDLE, + VM_EXITCODE_DEBUG, + VM_EXITCODE_VMINSN, +#ifndef __FreeBSD__ + VM_EXITCODE_HT, +#endif VM_EXITCODE_MAX }; @@ -468,6 +598,22 @@ struct vm_inout_str { struct seg_desc seg_desc; }; +enum task_switch_reason { + TSR_CALL, + TSR_IRET, + TSR_JMP, + TSR_IDT_GATE, /* task gate in IDT */ +}; + +struct vm_task_switch { + uint16_t tsssel; /* new TSS selector */ + int ext; /* task switch due to external event */ + uint32_t errcode; + int errcode_valid; /* push 'errcode' on the new stack */ + enum task_switch_reason reason; + struct vm_guest_paging paging; +}; + struct vm_exit { enum vm_exitcode exitcode; int inst_length; /* 0 means unknown */ @@ -506,6 +652,14 @@ struct vm_exit { int inst_type; int inst_error; } vmx; + /* + * SVM specific payload. + */ + struct { + uint64_t exitcode; + uint64_t exitinfo1; + uint64_t exitinfo2; + } svm; struct { uint32_t code; /* ecx value */ uint64_t wval; @@ -516,7 +670,15 @@ struct vm_exit { } spinup_ap; struct { uint64_t rflags; + uint64_t intr_status; } hlt; + struct { + int vector; + } ioapic_eoi; + struct { + enum vm_suspend_how how; + } suspended; + struct vm_task_switch task_switch; } u; }; @@ -554,12 +716,28 @@ int vm_restart_instruction(void *vm, int vcpuid); #ifndef __FreeBSD__ #ifdef _KERNEL -extern void vmm_sol_glue_init(void); -extern void vmm_sol_glue_cleanup(void); -extern int vmm_mod_load(void); -extern int vmm_mod_unload(void); -#endif -#endif +void vmm_sol_glue_init(void); +void vmm_sol_glue_cleanup(void); + +int vmm_mod_load(void); +int vmm_mod_unload(void); + +void vmm_call_trap(uint64_t); + +/* + * Because of tangled headers, these are mirrored by vmm_drv.h to present the + * interface to driver consumers. + */ +typedef int (*vmm_rmem_cb_t)(void *, uintptr_t, uint_t, uint64_t *); +typedef int (*vmm_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t); + +int vm_ioport_hook(struct vm *, uint_t, vmm_rmem_cb_t, vmm_wmem_cb_t, void *, + void **); +void vm_ioport_unhook(struct vm *, void **); +int vm_ioport_handle_hook(struct vm *, int, bool, int, int, uint32_t *); + +#endif /* _KERNEL */ +#endif /* __FreeBSD */ #endif /* _VMM_H_ */ diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h index 3e74eb8786..58e581a60d 100644 --- a/usr/src/uts/i86pc/sys/vmm_dev.h +++ b/usr/src/uts/i86pc/sys/vmm_dev.h @@ -1,4 +1,6 @@ /*- + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * * Copyright (c) 2011 NetApp, Inc. * All rights reserved. * @@ -23,7 +25,7 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * $FreeBSD: head/sys/amd64/include/vmm_dev.h 268889 2014-07-19 20:59:08Z neel $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -36,20 +38,30 @@ * http://www.illumos.org/license/CDDL. * * Copyright 2015 Pluribus Networks Inc. + * Copyright 2019 Joyent, Inc. 
*/ #ifndef _VMM_DEV_H_ #define _VMM_DEV_H_ -#ifdef _KERNEL -void vmmdev_init(void); -int vmmdev_cleanup(void); -#endif +#include <machine/vmm.h> + +struct vm_memmap { + vm_paddr_t gpa; + int segid; /* memory segment */ + vm_ooffset_t segoff; /* offset into memory segment */ + size_t len; /* mmap length */ + int prot; /* RWX */ + int flags; +}; +#define VM_MEMMAP_F_WIRED 0x01 +#define VM_MEMMAP_F_IOMMU 0x02 -struct vm_memory_segment { - vm_paddr_t gpa; /* in */ +#define VM_MEMSEG_NAME(m) ((m)->name[0] != '\0' ? (m)->name : NULL) +struct vm_memseg { + int segid; size_t len; - int wired; + char name[SPECNAMELEN + 1]; }; struct vm_register { @@ -64,6 +76,13 @@ struct vm_seg_desc { /* data or code segment */ struct seg_desc desc; }; +struct vm_register_set { + int cpuid; + unsigned int count; + const int *regnums; /* enum vm_reg_name */ + uint64_t *regvals; +}; + struct vm_run { int cpuid; struct vm_exit vm_exit; @@ -129,7 +148,7 @@ struct vm_pptdev_msi { int slot; int func; int numvec; /* 0 means disabled */ - uint32_t msg; + uint64_t msg; uint64_t addr; }; @@ -139,7 +158,7 @@ struct vm_pptdev_msix { int slot; int func; int idx; - uint32_t msg; + uint64_t msg; uint32_t vector_control; uint64_t addr; }; @@ -148,7 +167,12 @@ struct vm_nmi { int cpuid; }; +#ifdef __FreeBSD__ #define MAX_VM_STATS 64 +#else +#define MAX_VM_STATS (64 + VM_MAXCPU) +#endif + struct vm_stats { int cpuid; /* in */ int num_entries; /* out */ @@ -176,8 +200,8 @@ struct vm_hpet_cap { uint32_t capabilities; /* lower 32 bits of HPET capabilities */ }; -struct vm_activate_cpu { - int vcpuid; +struct vm_suspend { + enum vm_suspend_how how; }; struct vm_gla2gpa { @@ -189,13 +213,51 @@ struct vm_gla2gpa { uint64_t gpa; }; +struct vm_activate_cpu { + int vcpuid; +}; + struct vm_cpuset { int which; int cpusetsize; +#ifndef _KERNEL cpuset_t *cpus; +#else + void *cpus; +#endif }; #define VM_ACTIVE_CPUS 0 #define VM_SUSPENDED_CPUS 1 +#define VM_DEBUG_CPUS 2 + +struct vm_intinfo { + int vcpuid; + uint64_t info1; + uint64_t info2; +}; + +struct vm_rtc_time { + time_t secs; +}; + +struct vm_rtc_data { + int offset; + uint8_t value; +}; + +#ifndef __FreeBSD__ +struct vm_devmem_offset { + int segid; + off_t offset; +}; +#endif + +struct vm_cpu_topology { + uint16_t sockets; + uint16_t cores; + uint16_t threads; + uint16_t maxcpus; +}; enum { /* general routines */ @@ -203,20 +265,31 @@ enum { IOCNUM_RUN = 1, IOCNUM_SET_CAPABILITY = 2, IOCNUM_GET_CAPABILITY = 3, + IOCNUM_SUSPEND = 4, + IOCNUM_REINIT = 5, /* memory apis */ - IOCNUM_MAP_MEMORY = 10, - IOCNUM_GET_MEMORY_SEG = 11, + IOCNUM_MAP_MEMORY = 10, /* deprecated */ + IOCNUM_GET_MEMORY_SEG = 11, /* deprecated */ IOCNUM_GET_GPA_PMAP = 12, IOCNUM_GLA2GPA = 13, + IOCNUM_ALLOC_MEMSEG = 14, + IOCNUM_GET_MEMSEG = 15, + IOCNUM_MMAP_MEMSEG = 16, + IOCNUM_MMAP_GETNEXT = 17, + IOCNUM_GLA2GPA_NOFAULT = 18, /* register/state accessors */ IOCNUM_SET_REGISTER = 20, IOCNUM_GET_REGISTER = 21, IOCNUM_SET_SEGMENT_DESCRIPTOR = 22, IOCNUM_GET_SEGMENT_DESCRIPTOR = 23, + IOCNUM_SET_REGISTER_SET = 24, + IOCNUM_GET_REGISTER_SET = 25, /* interrupt injection */ + IOCNUM_GET_INTINFO = 28, + IOCNUM_SET_INTINFO = 29, IOCNUM_INJECT_EXCEPTION = 30, IOCNUM_LAPIC_IRQ = 31, IOCNUM_INJECT_NMI = 32, @@ -244,6 +317,10 @@ enum { IOCNUM_GET_X2APIC_STATE = 61, IOCNUM_GET_HPET_CAPABILITIES = 62, + /* CPU Topology */ + IOCNUM_SET_TOPOLOGY = 63, + IOCNUM_GET_TOPOLOGY = 64, + /* legacy interrupt injection */ IOCNUM_ISA_ASSERT_IRQ = 80, IOCNUM_ISA_DEASSERT_IRQ = 81, @@ -253,14 +330,36 @@ enum { /* vm_cpuset */ IOCNUM_ACTIVATE_CPU 
= 90, IOCNUM_GET_CPUSET = 91, + IOCNUM_SUSPEND_CPU = 92, + IOCNUM_RESUME_CPU = 93, + + /* RTC */ + IOCNUM_RTC_READ = 100, + IOCNUM_RTC_WRITE = 101, + IOCNUM_RTC_SETTIME = 102, + IOCNUM_RTC_GETTIME = 103, + +#ifndef __FreeBSD__ + /* illumos-custom ioctls */ + IOCNUM_DEVMEM_GETOFFSET = 256, + IOCNUM_WRLOCK_CYCLE = 257, +#endif }; #define VM_RUN \ _IOWR('v', IOCNUM_RUN, struct vm_run) -#define VM_MAP_MEMORY \ - _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment) -#define VM_GET_MEMORY_SEG \ - _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment) +#define VM_SUSPEND \ + _IOW('v', IOCNUM_SUSPEND, struct vm_suspend) +#define VM_REINIT \ + _IO('v', IOCNUM_REINIT) +#define VM_ALLOC_MEMSEG \ + _IOW('v', IOCNUM_ALLOC_MEMSEG, struct vm_memseg) +#define VM_GET_MEMSEG \ + _IOWR('v', IOCNUM_GET_MEMSEG, struct vm_memseg) +#define VM_MMAP_MEMSEG \ + _IOW('v', IOCNUM_MMAP_MEMSEG, struct vm_memmap) +#define VM_MMAP_GETNEXT \ + _IOWR('v', IOCNUM_MMAP_GETNEXT, struct vm_memmap) #define VM_SET_REGISTER \ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register) #define VM_GET_REGISTER \ @@ -269,6 +368,10 @@ enum { _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) #define VM_GET_SEGMENT_DESCRIPTOR \ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc) +#define VM_SET_REGISTER_SET \ + _IOW('v', IOCNUM_SET_REGISTER_SET, struct vm_register_set) +#define VM_GET_REGISTER_SET \ + _IOWR('v', IOCNUM_GET_REGISTER_SET, struct vm_register_set) #define VM_INJECT_EXCEPTION \ _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception) #define VM_LAPIC_IRQ \ @@ -309,10 +412,8 @@ enum { _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix) #define VM_INJECT_NMI \ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi) -#ifdef __FreeBSD__ -#define VM_STATS \ +#define VM_STATS_IOC \ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats) -#endif #define VM_STAT_DESC \ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc) #define VM_SET_X2APIC_STATE \ @@ -321,14 +422,52 @@ enum { _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic) #define VM_GET_HPET_CAPABILITIES \ _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap) +#define VM_SET_TOPOLOGY \ + _IOW('v', IOCNUM_SET_TOPOLOGY, struct vm_cpu_topology) +#define VM_GET_TOPOLOGY \ + _IOR('v', IOCNUM_GET_TOPOLOGY, struct vm_cpu_topology) #define VM_GET_GPA_PMAP \ _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte) #define VM_GLA2GPA \ _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa) +#define VM_GLA2GPA_NOFAULT \ + _IOWR('v', IOCNUM_GLA2GPA_NOFAULT, struct vm_gla2gpa) #define VM_ACTIVATE_CPU \ _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu) #define VM_GET_CPUS \ _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset) +#define VM_SUSPEND_CPU \ + _IOW('v', IOCNUM_SUSPEND_CPU, struct vm_activate_cpu) +#define VM_RESUME_CPU \ + _IOW('v', IOCNUM_RESUME_CPU, struct vm_activate_cpu) +#define VM_SET_INTINFO \ + _IOW('v', IOCNUM_SET_INTINFO, struct vm_intinfo) +#define VM_GET_INTINFO \ + _IOWR('v', IOCNUM_GET_INTINFO, struct vm_intinfo) +#define VM_RTC_WRITE \ + _IOW('v', IOCNUM_RTC_WRITE, struct vm_rtc_data) +#define VM_RTC_READ \ + _IOWR('v', IOCNUM_RTC_READ, struct vm_rtc_data) +#define VM_RTC_SETTIME \ + _IOW('v', IOCNUM_RTC_SETTIME, struct vm_rtc_time) +#define VM_RTC_GETTIME \ + _IOR('v', IOCNUM_RTC_GETTIME, struct vm_rtc_time) #define VM_RESTART_INSTRUCTION \ _IOW('v', IOCNUM_RESTART_INSTRUCTION, int) + +#ifndef __FreeBSD__ +#define VM_DEVMEM_GETOFFSET \ + _IOW('v', IOCNUM_DEVMEM_GETOFFSET, struct vm_devmem_offset) +#define VM_WRLOCK_CYCLE _IO('v', IOCNUM_WRLOCK_CYCLE) + +/* ioctls used 
against ctl device for vm create/destroy */
+#define	VMM_IOC_BASE		(('V' << 16) | ('M' << 8))
+#define	VMM_CREATE_VM		(VMM_IOC_BASE | 0x01)
+#define	VMM_DESTROY_VM		(VMM_IOC_BASE | 0x02)
+#define	VMM_VM_SUPPORTED	(VMM_IOC_BASE | 0x03)
+
+#define	VMM_CTL_DEV		"/dev/vmmctl"
+
+#endif
+
 #endif
diff --git a/usr/src/uts/i86pc/sys/vmm_drv.h b/usr/src/uts/i86pc/sys/vmm_drv.h
new file mode 100644
index 0000000000..33fefc10ea
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/vmm_drv.h
@@ -0,0 +1,50 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2019 Joyent, Inc.
+ */
+
+#ifndef _VMM_DRV_H_
+#define	_VMM_DRV_H_
+
+#ifdef _KERNEL
+struct vmm_hold;
+typedef struct vmm_hold vmm_hold_t;
+
+struct vmm_lease;
+typedef struct vmm_lease vmm_lease_t;
+
+/*
+ * Because of tangled headers, these definitions mirror their vmm_[rw]mem_cb_t
+ * counterparts in vmm.h.
+ */
+typedef int (*vmm_drv_rmem_cb_t)(void *, uintptr_t, uint_t, uint64_t *);
+typedef int (*vmm_drv_wmem_cb_t)(void *, uintptr_t, uint_t, uint64_t);
+
+extern int vmm_drv_hold(file_t *, cred_t *, vmm_hold_t **);
+extern void vmm_drv_rele(vmm_hold_t *);
+extern boolean_t vmm_drv_release_reqd(vmm_hold_t *);
+
+extern vmm_lease_t *vmm_drv_lease_sign(vmm_hold_t *, boolean_t (*)(void *),
+    void *);
+extern void vmm_drv_lease_break(vmm_hold_t *, vmm_lease_t *);
+extern boolean_t vmm_drv_lease_expired(vmm_lease_t *);
+
+extern void *vmm_drv_gpa2kva(vmm_lease_t *, uintptr_t, size_t);
+extern int vmm_drv_msi(vmm_lease_t *, uint64_t, uint64_t);
+
+extern int vmm_drv_ioport_hook(vmm_hold_t *, uint_t, vmm_drv_rmem_cb_t,
+    vmm_drv_wmem_cb_t, void *, void **);
+extern void vmm_drv_ioport_unhook(vmm_hold_t *, void **);
+#endif /* _KERNEL */
+
+#endif /* _VMM_DRV_H_ */
diff --git a/usr/src/uts/i86pc/sys/vmm_impl.h b/usr/src/uts/i86pc/sys/vmm_impl.h
index 1602fa286d..cdc56cc464 100644
--- a/usr/src/uts/i86pc/sys/vmm_impl.h
+++ b/usr/src/uts/i86pc/sys/vmm_impl.h
@@ -11,76 +11,79 @@
 
 /*
  * Copyright 2014 Pluribus Networks Inc.
+ * Copyright 2019 Joyent, Inc.
  */
 
 #ifndef _VMM_IMPL_H_
-#define _VMM_IMPL_H_
+#define	_VMM_IMPL_H_
 
 #include <sys/mutex.h>
 #include <sys/queue.h>
 #include <sys/varargs.h>
+#include <sys/zone.h>
+
+#ifdef _KERNEL
+
+#define	VMM_CTL_MINOR	0
 
 /*
- * /dev names:
- *	/dev/vmmctl		- control device
- *	/dev/vmm/<name>		- vm devices
+ * Rather than creating whole character devices for devmem mappings, they are
+ * available by mmap(2)ing the vmm handle at a specific offset.  These offsets
+ * begin just above the maximum allowed guest physical address.
 */
-#define	VMM_DRIVER_NAME	"vmm"
+#include <vm/vm_param.h>
+#define	VM_DEVMEM_START	(VM_MAXUSER_ADDRESS + 1)
 
-#define	VMM_CTL_MINOR_NODE	"ctl"
-#define	VMM_CTL_MINOR_NAME	VMM_DRIVER_NAME VMM_CTL_NODE
-#define	VMM_CTL_MINOR	0
-
-#define	VMM_IOC_BASE	(('V' << 16) | ('M' << 8))
+struct vmm_devmem_entry {
+	list_node_t	vde_node;
+	int		vde_segid;
+	char		vde_name[SPECNAMELEN + 1];
+	size_t		vde_len;
+	off_t		vde_off;
+};
+typedef struct vmm_devmem_entry vmm_devmem_entry_t;
 
-#define	VMM_CREATE_VM	(VMM_IOC_BASE | 0x01)
-#define	VMM_DESTROY_VM	(VMM_IOC_BASE | 0x02)
+typedef struct vmm_zsd vmm_zsd_t;
 
-struct vmm_ioctl {
-	char	vmm_name[VM_MAX_NAMELEN];
+enum vmm_softc_state {
+	VMM_HELD = 1,		/* external driver(s) possess hold on the VM */
+	VMM_CLEANUP = 2,	/* request that holds are released */
+	VMM_PURGED = 4,		/* all holds have been released */
+	VMM_BLOCK_HOOK = 8,	/* mem hook install temporarily blocked */
+	VMM_DESTROY = 16	/* VM is destroyed, softc still around */
 };
 
-#ifdef _KERNEL
 struct vmm_softc {
-	boolean_t	open;
-	minor_t		minor;
-	struct vm	*vm;
-	char		name[VM_MAX_NAMELEN];
-	SLIST_ENTRY(vmm_softc) link;
-};
-#endif
+	list_node_t	vmm_node;
+	struct vm	*vmm_vm;
+	minor_t		vmm_minor;
+	char		vmm_name[VM_MAX_NAMELEN];
+	list_t		vmm_devmem_list;
 
-/*
- * VMM trace ring buffer constants
- */
-#define	VMM_DMSG_RING_SIZE	0x100000	/* 1MB */
-#define	VMM_DMSG_BUF_SIZE	256
+	kcondvar_t	vmm_cv;
+	list_t		vmm_holds;
+	uint_t		vmm_flags;
+	boolean_t	vmm_is_open;
 
-/*
- * VMM trace ring buffer content
- */
-typedef struct vmm_trace_dmsg {
-	timespec_t	timestamp;
-	char		buf[VMM_DMSG_BUF_SIZE];
-	struct vmm_trace_dmsg	*next;
-} vmm_trace_dmsg_t;
+	kmutex_t	vmm_lease_lock;
+	list_t		vmm_lease_list;
+	uint_t		vmm_lease_blocker;
+	kcondvar_t	vmm_lease_cv;
+	krwlock_t	vmm_rwlock;
 
-/*
- * VMM trace ring buffer header
- */
-typedef struct vmm_trace_rbuf {
-	kmutex_t	lock;		/* lock to avoid clutter */
-	int		looped;		/* completed ring */
-	int		allocfailed;	/* dmsg mem alloc failed */
-	size_t		size;		/* current size */
-	size_t		maxsize;	/* max size */
-	vmm_trace_dmsg_t	*dmsgh;	/* messages head */
-	vmm_trace_dmsg_t	*dmsgp;	/* ptr to last message */
-} vmm_trace_rbuf_t;
+	/* For zone specific data */
+	list_node_t	vmm_zsd_linkage;
+	zone_t		*vmm_zone;
+	vmm_zsd_t	*vmm_zsd;
+};
+typedef struct vmm_softc vmm_softc_t;
 
-/*
- * VMM trace ring buffer interfaces
- */
-void	vmm_trace_log(const char *fmt, ...);
+void vmm_zsd_init(void);
+void vmm_zsd_fini(void);
+int vmm_zsd_add_vm(vmm_softc_t *sc);
+void vmm_zsd_rem_vm(vmm_softc_t *sc);
+int vmm_do_vm_destroy(vmm_softc_t *, boolean_t);
+
+#endif /* _KERNEL */
 
 #endif /* _VMM_IMPL_H_ */
diff --git a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h
index 8138890a2c..f10f407164 100644
--- a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h
+++ b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h
@@ -1,4 +1,6 @@
 /*-
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
 * Copyright (c) 2012 NetApp, Inc.
 * All rights reserved.
 *
@@ -23,7 +25,7 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* - * $FreeBSD: head/sys/amd64/include/vmm_instruction_emul.h 276479 2014-12-31 20:31:32Z dim $ + * $FreeBSD$ */ /* * This file and its contents are supplied under the terms of the @@ -93,17 +95,26 @@ int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg, */ int vmm_fetch_instruction(struct vm *vm, int cpuid, struct vm_guest_paging *guest_paging, - uint64_t rip, int inst_length, struct vie *vie); + uint64_t rip, int inst_length, struct vie *vie, + int *is_fault); /* * Translate the guest linear address 'gla' to a guest physical address. * - * Returns 0 on success and '*gpa' contains the result of the translation. - * Returns 1 if an exception was injected into the guest. - * Returns -1 otherwise. + * retval is_fault Interpretation + * 0 0 'gpa' contains result of the translation + * 0 1 An exception was injected into the guest + * EFAULT N/A An unrecoverable hypervisor error occurred */ int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, - uint64_t gla, int prot, uint64_t *gpa); + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); + +/* + * Like vm_gla2gpa, but no exceptions are injected into the guest and + * PTEs are not changed. + */ +int vm_gla2gpa_nofault(struct vm *vm, int vcpuid, struct vm_guest_paging *paging, + uint64_t gla, int prot, uint64_t *gpa, int *is_fault); void vie_init(struct vie *vie, const char *inst_bytes, int inst_length); diff --git a/usr/src/uts/i86pc/viona/Makefile b/usr/src/uts/i86pc/viona/Makefile index c2b8bd8dcf..4ede5bbd84 100644 --- a/usr/src/uts/i86pc/viona/Makefile +++ b/usr/src/uts/i86pc/viona/Makefile @@ -11,6 +11,7 @@ # # Copyright 2013 Pluribus Networks Inc. +# Copyright 2017 Joyent, Inc. # # @@ -22,7 +23,7 @@ UTSBASE = ../.. # Define the module and object file sets. # MODULE = viona -OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%) +OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%) LINTS = $(VIONA_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/i86pc/io/viona @@ -42,6 +43,12 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # # Overrides # +LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV +LINTTAGS += -erroff=E_FUNC_ARG_UNUSED +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2 +LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2 + CFLAGS += $(CCVERBOSE) LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm diff --git a/usr/src/uts/i86pc/vmm/Makefile b/usr/src/uts/i86pc/vmm/Makefile index b3ab735781..5b93db987b 100644 --- a/usr/src/uts/i86pc/vmm/Makefile +++ b/usr/src/uts/i86pc/vmm/Makefile @@ -11,6 +11,7 @@ # # Copyright 2013 Pluribus Networks Inc. +# Copyright 2019 Joyent, Inc. # # @@ -26,6 +27,7 @@ OBJECTS = $(VMM_OBJS:%=$(OBJS_DIR)/%) LINTS = $(VMM_OBJS:%.o=$(LINTS_DIR)/%.ln) ROOTMODULE = $(USR_DRV_DIR)/$(MODULE) CONF_SRCDIR = $(UTSBASE)/i86pc/io/vmm +MAPFILE = $(UTSBASE)/i86pc/io/vmm/vmm.mapfile # # Include common rules. 
@@ -42,11 +44,52 @@ INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE) # # Overrides and additions # +LINTTAGS += -erroff=E_EMPTY_DECLARATION +LINTTAGS += -erroff=E_OPERANDS_INCOMPATIBLE_TYPES +LINTTAGS += -erroff=E_VOID_CANT_RETURN_VALUE +LINTTAGS += -erroff=E_YACC_ERROR +LINTTAGS += -erroff=E_STATIC_UNUSED +LINTTAGS += -erroff=E_FUNC_RET_MAYBE_IGNORED2 +LINTTAGS += -erroff=E_FUNC_RET_ALWAYS_IGNOR2 +LINTTAGS += -erroff=E_BAD_FORMAT_ARG_TYPE2 +LINTTAGS += -erroff=E_FUNC_ARG_UNUSED +LINTTAGS += -erroff=E_FUNC_SET_NOT_USED +LINTTAGS += -erroff=E_ASSIGN_NARROW_CONV +LINTTAGS += -erroff=E_BAD_PTR_CAST_ALIGN +LINTTAGS += -erroff=E_CONSTANT_CONDITION +LINTTAGS += -erroff=E_PTR_TO_VOID_IN_ARITHMETIC +LINTTAGS += -erroff=E_CONST_TRUNCATED_BY_ASSIGN +LINTTAGS += -erroff=E_NOP_ELSE_STMT +LINTTAGS += -erroff=E_FUNC_NO_RET_VAL +LINTTAGS += -erroff=E_OLD_STYLE_DECL_OR_BAD_TYPE +LINTTAGS += -erroff=E_VAR_USED_BEFORE_SET +LINTTAGS += -erroff=E_INTEGER_OVERFLOW_DETECTED +LINTTAGS += -erroff=E_STMT_NOT_REACHED +LINTTAGS += -erroff=E_FUNC_NO_RET_VAL +LINTTAGS += -erroff=E_USELESS_DECLARATION +LINTTAGS += -erroff=E_EXPR_NULL_EFFECT +LINTTAGS += -erroff=E_CASE_FALLTHRU +LINTTAGS += -erroff=E_FUNC_DECL_VAR_ARG2 +LINTTAGS += -erroff=E_ASM_IMPOSSIBLE_CONSTRAINT +LINTTAGS += -erroff=E_ASM_UNUSED_PARAM +LINTTAGS += -erroff=E_NOP_IF_STMT +LINTTAGS += -erroff=E_ZERO_OR_NEGATIVE_SUBSCRIPT + +CERRWARN += -_gcc=-Wno-empty-body + +# 3rd party code +SMOFF += all_func_returns + +# needs work +$(OBJS_DIR)/vmm_sol_dev.o := SMOFF += signed_integer_overflow_check + +# a can't happen: vmx_setcap() warn: variable dereferenced before check 'pptr' +$(OBJS_DIR)/vmx.o := SMOFF += deref_check # These sources only compile with gcc. Workaround a confluence of cruft # regarding dmake and shadow compilation by neutering the sun compiler. 
-amd64_CC = $(ONBLD_TOOLS)/bin/$(MACH)/cw -_gcc
-CFLAGS += -_cc=-xdryrun
+#amd64_CC = $(ONBLD_TOOLS)/bin/$(MACH)/cw -_gcc
+#CFLAGS += -_cc=-xdryrun
 
 ALL_BUILDS = $(ALL_BUILDSONLY64)
 DEF_BUILDS = $(DEF_BUILDSONLY64)
@@ -56,9 +99,23 @@ INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(UTSBASE)/i86pc/io/vmm/io
 AS_INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(OBJS_DIR)
 
 CFLAGS += -_gcc=-Wimplicit-function-declaration
+# The FreeBSD %# notation makes gcc gripe
+CFLAGS += -_gcc=-Wno-format
+# enable collection of VMM statistics
+CFLAGS += -DVMM_KEEP_STATS
 
-OFFSETS_SRC = $(CONF_SRCDIR)/offsets.in
-ASSYM_H = $(OBJS_DIR)/vmx_assym.h
+LDFLAGS += -Nfs/dev
+
+$(OBJS_DIR)/vmm.o := CERRWARN += -_gcc=-Wno-pointer-sign -_gcc=-Wno-type-limits
+$(OBJS_DIR)/svm.o := CERRWARN += -_gcc=-Wno-pointer-sign -_gcc=-Wno-type-limits
+
+LDFLAGS += -z type=kmod -M $(MAPFILE)
+
+OFFSETS_VMX = $(CONF_SRCDIR)/intel/offsets.in
+OFFSETS_SVM = $(CONF_SRCDIR)/amd/offsets.in
+ASSYM_VMX = $(OBJS_DIR)/vmx_assym.h
+ASSYM_SVM = $(OBJS_DIR)/svm_assym.h
+ASSYM_H = $(ASSYM_VMX) $(ASSYM_SVM)
 
 CLEANFILES += $(ASSYM_H)
@@ -88,7 +145,10 @@ install: $(INSTALL_DEPS)
 #
 include $(UTSBASE)/i86pc/Makefile.targ
 
-$(OBJECTS): $(ASSYM_H)
+$(ASSYM_VMX): $(OFFSETS_VMX) $(GENASSYM)
+	$(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_VMX) >$@
+$(ASSYM_SVM): $(OFFSETS_SVM) $(GENASSYM)
+	$(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_SVM) >$@
 
-$(ASSYM_H): $(OFFSETS_SRC) $(GENASSYM)
-	$(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_SRC) >$@
+$(OBJS_DIR)/vmx_support.o: $(ASSYM_VMX)
+$(OBJS_DIR)/svm_support.o: $(ASSYM_SVM)
diff --git a/usr/src/uts/req.flg b/usr/src/uts/req.flg
index ffbaa3f643..15085a486d 100644
--- a/usr/src/uts/req.flg
+++ b/usr/src/uts/req.flg
@@ -22,6 +22,7 @@
 #
 # Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 # Use is subject to license terms.
+# Copyright 2018 Joyent, Inc.
 #
 #
@@ -37,6 +38,7 @@ echo_file usr/src/Makefile
 
 # For full builds (open and closed), we want both etc/certs and
 # etc/keys.  For an open source build, there's no etc/keys directory.
+find_files "s.*" usr/contrib/freebsd
 find_files "s.*" usr/src/cmd/cmd-crypto/etc
 find_files "s.*" usr/src/common/acl
 find_files "s.*" usr/src/common/atomic
@@ -56,4 +58,5 @@ find_files "s.*" usr/src/common/smbios
 find_files "s.*" usr/src/common/tsol
 find_files "s.*" usr/src/common/util
 find_files "s.*" usr/src/common/zfs
+find_files "s.*" usr/src/compat/freebsd
 find_files "s.*" usr/src/psm/promif