summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTycho Nightingale <tycho.nightingale@pluribusnetworks.com>2017-09-26 12:19:41 +0200
committerPatrick Mooney <pmooney@pfmooney.com>2018-02-22 15:57:20 +0000
commit43f85cd4da7e7860e4d240f14e6b5dd45700c7b6 (patch)
treefcf7f982094418a90da65ed16d48689bbc72ca5b
parent44db5f1c904128c3fd7c7ec37e9d894a10e93f8c (diff)
downloadillumos-joyent-43f85cd4da7e7860e4d240f14e6b5dd45700c7b6.tar.gz
OS-6409 import Pluribus bhyve port
Authored by: Krupal Joshi <krupal.joshi@pluribusnetworks.com> Contributed by: Pluribus Networks Inc. Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com> Reviewed by: Dan McDonald <danmcd@joyent.com> Reviewed by: Mike Gerdts <mike.gerdts@joyent.com> Reviewed by: Patrick Mooney <patrick.mooney@joyent.com> Approved by: Mike Gerdts <mike.gerdts@joyent.com>
-rw-r--r--usr/contrib/freebsd/amd64/machine/_types.h6
-rw-r--r--usr/contrib/freebsd/amd64/machine/psl.h6
-rw-r--r--usr/contrib/freebsd/amd64/machine/specialreg.h6
-rw-r--r--usr/contrib/freebsd/amd64/machine/timerreg.h54
-rw-r--r--usr/contrib/freebsd/amd64/machine/vm.h45
-rw-r--r--usr/contrib/freebsd/dev/acpica/acpi_hpet.h67
-rw-r--r--usr/contrib/freebsd/dev/ic/i8253reg.h78
-rw-r--r--usr/contrib/freebsd/dev/ic/i8259.h86
-rw-r--r--usr/contrib/freebsd/dev/ic/ns16550.h240
-rw-r--r--usr/contrib/freebsd/dev/pci/pcireg.h922
-rw-r--r--usr/contrib/freebsd/isa/isareg.h70
-rw-r--r--usr/contrib/freebsd/lib/libutil/expand_number.c93
-rw-r--r--usr/contrib/freebsd/sys/ata.h635
-rw-r--r--usr/contrib/freebsd/sys/linker_set.h119
-rw-r--r--usr/contrib/freebsd/sys/tree.h765
-rw-r--r--usr/contrib/freebsd/x86/apicreg.h455
-rw-r--r--usr/contrib/freebsd/x86/mptable.h204
-rw-r--r--usr/contrib/freebsd/x86/psl.h92
-rw-r--r--usr/contrib/freebsd/x86/specialreg.h839
-rw-r--r--usr/src/cmd/bhyve/Makefile41
-rw-r--r--usr/src/cmd/bhyve/Makefile.com94
-rw-r--r--usr/src/cmd/bhyve/acpi.h54
-rw-r--r--usr/src/cmd/bhyve/ahci.h304
-rw-r--r--usr/src/cmd/bhyve/amd64/Makefile21
-rw-r--r--usr/src/cmd/bhyve/atkbdc.c576
-rw-r--r--usr/src/cmd/bhyve/atkbdc.h38
-rw-r--r--usr/src/cmd/bhyve/bhyve_sol_glue.c86
-rw-r--r--usr/src/cmd/bhyve/bhyvegc.c78
-rw-r--r--usr/src/cmd/bhyve/bhyvegc.h44
-rw-r--r--usr/src/cmd/bhyve/bhyverun.c820
-rw-r--r--usr/src/cmd/bhyve/bhyverun.h73
-rw-r--r--usr/src/cmd/bhyve/block_if.c625
-rw-r--r--usr/src/cmd/bhyve/block_if.h70
-rw-r--r--usr/src/cmd/bhyve/console.c101
-rw-r--r--usr/src/cmd/bhyve/console.h50
-rw-r--r--usr/src/cmd/bhyve/consport.c155
-rw-r--r--usr/src/cmd/bhyve/dbgport.h34
-rw-r--r--usr/src/cmd/bhyve/inout.c297
-rw-r--r--usr/src/cmd/bhyve/inout.h91
-rw-r--r--usr/src/cmd/bhyve/ioapic.c74
-rw-r--r--usr/src/cmd/bhyve/ioapic.h39
-rw-r--r--usr/src/cmd/bhyve/mem.c291
-rw-r--r--usr/src/cmd/bhyve/mem.h61
-rw-r--r--usr/src/cmd/bhyve/mptbl.c377
-rw-r--r--usr/src/cmd/bhyve/mptbl.h35
-rw-r--r--usr/src/cmd/bhyve/pci_ahci.c2009
-rw-r--r--usr/src/cmd/bhyve/pci_emul.c2103
-rw-r--r--usr/src/cmd/bhyve/pci_emul.h283
-rw-r--r--usr/src/cmd/bhyve/pci_hostbridge.c70
-rw-r--r--usr/src/cmd/bhyve/pci_irq.c351
-rw-r--r--usr/src/cmd/bhyve/pci_irq.h45
-rw-r--r--usr/src/cmd/bhyve/pci_lpc.c433
-rw-r--r--usr/src/cmd/bhyve/pci_lpc.h72
-rw-r--r--usr/src/cmd/bhyve/pci_virtio_block.c392
-rw-r--r--usr/src/cmd/bhyve/pci_virtio_net.c870
-rw-r--r--usr/src/cmd/bhyve/pci_virtio_viona.c706
-rw-r--r--usr/src/cmd/bhyve/pm.c333
-rw-r--r--usr/src/cmd/bhyve/pmtmr.c212
-rw-r--r--usr/src/cmd/bhyve/post.c53
-rw-r--r--usr/src/cmd/bhyve/ps2kbd.c418
-rw-r--r--usr/src/cmd/bhyve/ps2kbd.h39
-rw-r--r--usr/src/cmd/bhyve/ps2mouse.c371
-rw-r--r--usr/src/cmd/bhyve/ps2mouse.h39
-rw-r--r--usr/src/cmd/bhyve/rfb.c420
-rw-r--r--usr/src/cmd/bhyve/rfb.h36
-rw-r--r--usr/src/cmd/bhyve/rtc.c380
-rw-r--r--usr/src/cmd/bhyve/rtc.h34
-rw-r--r--usr/src/cmd/bhyve/smbiostbl.c827
-rw-r--r--usr/src/cmd/bhyve/smbiostbl.h36
-rw-r--r--usr/src/cmd/bhyve/spinup_ap.c104
-rw-r--r--usr/src/cmd/bhyve/spinup_ap.h34
-rw-r--r--usr/src/cmd/bhyve/uart_emul.c1042
-rw-r--r--usr/src/cmd/bhyve/uart_emul.h45
-rw-r--r--usr/src/cmd/bhyve/vga.c1289
-rw-r--r--usr/src/cmd/bhyve/vga.h160
-rw-r--r--usr/src/cmd/bhyve/virtio.c755
-rw-r--r--usr/src/cmd/bhyve/virtio.h475
-rw-r--r--usr/src/cmd/bhyve/xmsr.c237
-rw-r--r--usr/src/cmd/bhyve/xmsr.h36
-rw-r--r--usr/src/cmd/bhyveconsole/Makefile41
-rw-r--r--usr/src/cmd/bhyveconsole/bhyveconsole.c360
-rw-r--r--usr/src/cmd/bhyveconsole/i386/Makefile43
-rw-r--r--usr/src/cmd/bhyvectl/Makefile41
-rw-r--r--usr/src/cmd/bhyvectl/Makefile.com48
-rw-r--r--usr/src/cmd/bhyvectl/amd64/Makefile21
-rw-r--r--usr/src/cmd/bhyvectl/bhyvectl.c1523
-rw-r--r--usr/src/cmd/bhyveload-uefi/Makefile41
-rw-r--r--usr/src/cmd/bhyveload-uefi/Makefile.com52
-rw-r--r--usr/src/cmd/bhyveload-uefi/amd64/Makefile21
-rw-r--r--usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c190
-rw-r--r--usr/src/cmd/bhyveload-uefi/i386/Makefile18
-rw-r--r--usr/src/cmd/mdb/intel/amd64/vmm/Makefile20
-rw-r--r--usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile32
-rw-r--r--usr/src/cmd/mdb/intel/amd64/vmm/vmm.c238
-rw-r--r--usr/src/compat/freebsd/amd64/machine/asmacros.h28
-rw-r--r--usr/src/compat/freebsd/amd64/machine/atomic.h244
-rw-r--r--usr/src/compat/freebsd/amd64/machine/clock.h23
-rw-r--r--usr/src/compat/freebsd/amd64/machine/cpufunc.h165
-rw-r--r--usr/src/compat/freebsd/amd64/machine/fpu.h29
-rw-r--r--usr/src/compat/freebsd/amd64/machine/md_var.h24
-rw-r--r--usr/src/compat/freebsd/amd64/machine/param.h39
-rw-r--r--usr/src/compat/freebsd/amd64/machine/pcb.h21
-rw-r--r--usr/src/compat/freebsd/amd64/machine/pmap.h44
-rw-r--r--usr/src/compat/freebsd/amd64/machine/segments.h21
-rw-r--r--usr/src/compat/freebsd/amd64/machine/smp.h19
-rw-r--r--usr/src/compat/freebsd/amd64/machine/vmm.h21
-rw-r--r--usr/src/compat/freebsd/amd64/machine/vmm_dev.h21
-rw-r--r--usr/src/compat/freebsd/amd64/machine/vmm_instruction_emul.h21
-rw-r--r--usr/src/compat/freebsd/amd64/machine/vmparam.h19
-rw-r--r--usr/src/compat/freebsd/libutil.h21
-rw-r--r--usr/src/compat/freebsd/net/ethernet.h21
-rw-r--r--usr/src/compat/freebsd/paths.h21
-rw-r--r--usr/src/compat/freebsd/pthread_np.h28
-rw-r--r--usr/src/compat/freebsd/string.h26
-rw-r--r--usr/src/compat/freebsd/strings.h23
-rw-r--r--usr/src/compat/freebsd/sys/_iovec.h24
-rw-r--r--usr/src/compat/freebsd/sys/_pthreadtypes.h19
-rw-r--r--usr/src/compat/freebsd/sys/_types.h22
-rw-r--r--usr/src/compat/freebsd/sys/callout.h70
-rw-r--r--usr/src/compat/freebsd/sys/cdefs.h58
-rw-r--r--usr/src/compat/freebsd/sys/cpuset.h44
-rw-r--r--usr/src/compat/freebsd/sys/disk.h19
-rw-r--r--usr/src/compat/freebsd/sys/endian.h125
-rw-r--r--usr/src/compat/freebsd/sys/errno.h27
-rw-r--r--usr/src/compat/freebsd/sys/fcntl.h23
-rw-r--r--usr/src/compat/freebsd/sys/ioctl.h22
-rw-r--r--usr/src/compat/freebsd/sys/kernel.h25
-rw-r--r--usr/src/compat/freebsd/sys/ktr.h27
-rw-r--r--usr/src/compat/freebsd/sys/libkern.h25
-rw-r--r--usr/src/compat/freebsd/sys/limits.h19
-rw-r--r--usr/src/compat/freebsd/sys/malloc.h44
-rw-r--r--usr/src/compat/freebsd/sys/module.h19
-rw-r--r--usr/src/compat/freebsd/sys/mutex.h81
-rw-r--r--usr/src/compat/freebsd/sys/param.h48
-rw-r--r--usr/src/compat/freebsd/sys/pcpu.h21
-rw-r--r--usr/src/compat/freebsd/sys/sched.h19
-rw-r--r--usr/src/compat/freebsd/sys/select.h23
-rw-r--r--usr/src/compat/freebsd/sys/smp.h28
-rw-r--r--usr/src/compat/freebsd/sys/sysctl.h27
-rw-r--r--usr/src/compat/freebsd/sys/systm.h53
-rw-r--r--usr/src/compat/freebsd/sys/time.h104
-rw-r--r--usr/src/compat/freebsd/sys/types.h74
-rw-r--r--usr/src/compat/freebsd/sys/uio.h26
-rw-r--r--usr/src/compat/freebsd/termios.h23
-rw-r--r--usr/src/compat/freebsd/uuid.h55
-rw-r--r--usr/src/compat/freebsd/vm/pmap.h21
-rw-r--r--usr/src/compat/freebsd/vm/vm.h39
-rw-r--r--usr/src/compat/freebsd/x86/_types.h49
-rw-r--r--usr/src/compat/freebsd/x86/segments.h28
-rw-r--r--usr/src/head/bhyve.h25
-rw-r--r--usr/src/lib/libvmmapi/Makefile49
-rw-r--r--usr/src/lib/libvmmapi/Makefile.com53
-rw-r--r--usr/src/lib/libvmmapi/amd64/Makefile21
-rw-r--r--usr/src/lib/libvmmapi/common/llib-lvmmapi2
-rw-r--r--usr/src/lib/libvmmapi/common/mapfile-vers77
-rw-r--r--usr/src/lib/libvmmapi/common/vmmapi.c1257
-rw-r--r--usr/src/lib/libvmmapi/common/vmmapi.h159
-rw-r--r--usr/src/tools/scripts/gensetdefs.pl31
-rw-r--r--usr/src/uts/i86pc/io/viona/viona.c1404
-rw-r--r--usr/src/uts/i86pc/io/viona/viona.conf14
-rw-r--r--usr/src/uts/i86pc/io/vmm/amd/amdv.c271
-rw-r--r--usr/src/uts/i86pc/io/vmm/intel/ept.c452
-rw-r--r--usr/src/uts/i86pc/io/vmm/intel/ept.h43
-rw-r--r--usr/src/uts/i86pc/io/vmm/intel/vmcs.c597
-rw-r--r--usr/src/uts/i86pc/io/vmm/intel/vmcs.h410
-rw-r--r--usr/src/uts/i86pc/io/vmm/intel/vmx.c2842
-rw-r--r--usr/src/uts/i86pc/io/vmm/intel/vmx.h156
-rw-r--r--usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h96
-rw-r--r--usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h245
-rw-r--r--usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c445
-rw-r--r--usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h70
-rw-r--r--usr/src/uts/i86pc/io/vmm/intel/vmx_support.s271
-rw-r--r--usr/src/uts/i86pc/io/vmm/io/vatpic.c809
-rw-r--r--usr/src/uts/i86pc/io/vmm/io/vatpic.h57
-rw-r--r--usr/src/uts/i86pc/io/vmm/io/vatpit.c458
-rw-r--r--usr/src/uts/i86pc/io/vmm/io/vatpit.h45
-rw-r--r--usr/src/uts/i86pc/io/vmm/io/vdev.c282
-rw-r--r--usr/src/uts/i86pc/io/vmm/io/vdev.h96
-rw-r--r--usr/src/uts/i86pc/io/vmm/io/vhpet.c821
-rw-r--r--usr/src/uts/i86pc/io/vmm/io/vhpet.h44
-rw-r--r--usr/src/uts/i86pc/io/vmm/io/vioapic.c514
-rw-r--r--usr/src/uts/i86pc/io/vmm/io/vioapic.h66
-rw-r--r--usr/src/uts/i86pc/io/vmm/io/vlapic.c1687
-rw-r--r--usr/src/uts/i86pc/io/vmm/io/vlapic.h109
-rw-r--r--usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h190
-rw-r--r--usr/src/uts/i86pc/io/vmm/offsets.in72
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm.c1894
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm.conf1
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_host.c160
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_host.h119
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c2370
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_ioport.c174
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_ioport.h37
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_ipi.h37
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_ktr.h69
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_lapic.c256
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_lapic.h87
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_mem.h49
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c1040
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c779
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c111
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_stat.h127
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_util.c125
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmm_util.h40
-rw-r--r--usr/src/uts/i86pc/io/vmm/vmx_assym.s1
-rw-r--r--usr/src/uts/i86pc/io/vmm/x86.c276
-rw-r--r--usr/src/uts/i86pc/io/vmm/x86.h65
-rw-r--r--usr/src/uts/i86pc/sys/Makefile14
-rw-r--r--usr/src/uts/i86pc/sys/viona_io.h45
-rw-r--r--usr/src/uts/i86pc/sys/vmm.h565
-rw-r--r--usr/src/uts/i86pc/sys/vmm_dev.h334
-rw-r--r--usr/src/uts/i86pc/sys/vmm_impl.h86
-rw-r--r--usr/src/uts/i86pc/sys/vmm_instruction_emul.h126
-rw-r--r--usr/src/uts/i86pc/viona/Makefile72
-rw-r--r--usr/src/uts/i86pc/vmm/Makefile94
215 files changed, 52310 insertions, 2 deletions
diff --git a/usr/contrib/freebsd/amd64/machine/_types.h b/usr/contrib/freebsd/amd64/machine/_types.h
new file mode 100644
index 0000000000..59994352b5
--- /dev/null
+++ b/usr/contrib/freebsd/amd64/machine/_types.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD: head/sys/amd64/include/_types.h 232261 2012-02-28 18:15:28Z tijl $ */
+
+#include <x86/_types.h>
diff --git a/usr/contrib/freebsd/amd64/machine/psl.h b/usr/contrib/freebsd/amd64/machine/psl.h
new file mode 100644
index 0000000000..c660bfbab0
--- /dev/null
+++ b/usr/contrib/freebsd/amd64/machine/psl.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD: head/sys/amd64/include/psl.h 233204 2012-03-19 21:29:57Z tijl $ */
+
+#include <x86/psl.h>
diff --git a/usr/contrib/freebsd/amd64/machine/specialreg.h b/usr/contrib/freebsd/amd64/machine/specialreg.h
new file mode 100644
index 0000000000..41d4125cb9
--- /dev/null
+++ b/usr/contrib/freebsd/amd64/machine/specialreg.h
@@ -0,0 +1,6 @@
+/*-
+ * This file is in the public domain.
+ */
+/* $FreeBSD: head/sys/amd64/include/specialreg.h 233207 2012-03-19 21:34:11Z tijl $ */
+
+#include <x86/specialreg.h>
diff --git a/usr/contrib/freebsd/amd64/machine/timerreg.h b/usr/contrib/freebsd/amd64/machine/timerreg.h
new file mode 100644
index 0000000000..bca7b4dd19
--- /dev/null
+++ b/usr/contrib/freebsd/amd64/machine/timerreg.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (C) 2005 TAKAHASHI Yoshihiro. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/include/timerreg.h 177642 2008-03-26 20:09:21Z phk $
+ */
+
+/*
+ * The outputs of the three timers are connected as follows:
+ *
+ * timer 0 -> irq 0
+ * timer 1 -> dma chan 0 (for dram refresh)
+ * timer 2 -> speaker (via keyboard controller)
+ *
+ * Timer 0 is used to call hardclock.
+ * Timer 2 is used to generate console beeps.
+ */
+
+#ifndef _MACHINE_TIMERREG_H_
+#define _MACHINE_TIMERREG_H_
+
+#ifdef _KERNEL
+
+#include <dev/ic/i8253reg.h>
+
+#define IO_TIMER1 0x40 /* 8253 Timer #1 */
+#define TIMER_CNTR0 (IO_TIMER1 + TIMER_REG_CNTR0)
+#define TIMER_CNTR1 (IO_TIMER1 + TIMER_REG_CNTR1)
+#define TIMER_CNTR2 (IO_TIMER1 + TIMER_REG_CNTR2)
+#define TIMER_MODE (IO_TIMER1 + TIMER_REG_MODE)
+
+#endif /* _KERNEL */
+
+#endif /* _MACHINE_TIMERREG_H_ */
diff --git a/usr/contrib/freebsd/amd64/machine/vm.h b/usr/contrib/freebsd/amd64/machine/vm.h
new file mode 100644
index 0000000000..885c1607ea
--- /dev/null
+++ b/usr/contrib/freebsd/amd64/machine/vm.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2009 Advanced Computing Technologies LLC
+ * Written by: John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/include/vm.h 233671 2012-03-29 16:51:22Z jhb $
+ */
+
+#ifndef _MACHINE_VM_H_
+#define _MACHINE_VM_H_
+
+#include <machine/specialreg.h>
+
+/* Memory attributes. */
+#define VM_MEMATTR_UNCACHEABLE ((vm_memattr_t)PAT_UNCACHEABLE)
+#define VM_MEMATTR_WRITE_COMBINING ((vm_memattr_t)PAT_WRITE_COMBINING)
+#define VM_MEMATTR_WRITE_THROUGH ((vm_memattr_t)PAT_WRITE_THROUGH)
+#define VM_MEMATTR_WRITE_PROTECTED ((vm_memattr_t)PAT_WRITE_PROTECTED)
+#define VM_MEMATTR_WRITE_BACK ((vm_memattr_t)PAT_WRITE_BACK)
+#define VM_MEMATTR_WEAK_UNCACHEABLE ((vm_memattr_t)PAT_UNCACHED)
+
+#define VM_MEMATTR_DEFAULT VM_MEMATTR_WRITE_BACK
+
+#endif /* !_MACHINE_VM_H_ */
diff --git a/usr/contrib/freebsd/dev/acpica/acpi_hpet.h b/usr/contrib/freebsd/dev/acpica/acpi_hpet.h
new file mode 100644
index 0000000000..df817b7a2b
--- /dev/null
+++ b/usr/contrib/freebsd/dev/acpica/acpi_hpet.h
@@ -0,0 +1,67 @@
+/*-
+ * Copyright (c) 2005 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/dev/acpica/acpi_hpet.h 224919 2011-08-16 21:51:29Z mav $
+ */
+
+#ifndef __ACPI_HPET_H__
+#define __ACPI_HPET_H__
+
+#define HPET_MEM_WIDTH 0x400 /* Expected memory region size */
+
+/* General registers */
+#define HPET_CAPABILITIES 0x0 /* General capabilities and ID */
+#define HPET_CAP_VENDOR_ID 0xffff0000
+#define HPET_CAP_LEG_RT 0x00008000
+#define HPET_CAP_COUNT_SIZE 0x00002000 /* 1 = 64-bit, 0 = 32-bit */
+#define HPET_CAP_NUM_TIM 0x00001f00
+#define HPET_CAP_REV_ID 0x000000ff
+#define HPET_PERIOD 0x4 /* Period (1/hz) of timer */
+#define HPET_CONFIG 0x10 /* General configuration register */
+#define HPET_CNF_LEG_RT 0x00000002
+#define HPET_CNF_ENABLE 0x00000001
+#define HPET_ISR 0x20 /* General interrupt status register */
+#define HPET_MAIN_COUNTER 0xf0 /* Main counter register */
+
+/* Timer registers */
+#define HPET_TIMER_CAP_CNF(x) ((x) * 0x20 + 0x100)
+#define HPET_TCAP_INT_ROUTE 0xffffffff00000000
+#define HPET_TCAP_FSB_INT_DEL 0x00008000
+#define HPET_TCNF_FSB_EN 0x00004000
+#define HPET_TCNF_INT_ROUTE 0x00003e00
+#define HPET_TCNF_32MODE 0x00000100
+#define HPET_TCNF_VAL_SET 0x00000040
+#define HPET_TCAP_SIZE 0x00000020 /* 1 = 64-bit, 0 = 32-bit */
+#define HPET_TCAP_PER_INT 0x00000010 /* Supports periodic interrupts */
+#define HPET_TCNF_TYPE 0x00000008 /* 1 = periodic, 0 = one-shot */
+#define HPET_TCNF_INT_ENB 0x00000004
+#define HPET_TCNF_INT_TYPE 0x00000002 /* 1 = level triggered, 0 = edge */
+#define HPET_TIMER_COMPARATOR(x) ((x) * 0x20 + 0x108)
+#define HPET_TIMER_FSB_VAL(x) ((x) * 0x20 + 0x110)
+#define HPET_TIMER_FSB_ADDR(x) ((x) * 0x20 + 0x114)
+
+#define HPET_MIN_CYCLES 128 /* Period considered reliable. */
+
+#endif /* !__ACPI_HPET_H__ */
diff --git a/usr/contrib/freebsd/dev/ic/i8253reg.h b/usr/contrib/freebsd/dev/ic/i8253reg.h
new file mode 100644
index 0000000000..47568b3436
--- /dev/null
+++ b/usr/contrib/freebsd/dev/ic/i8253reg.h
@@ -0,0 +1,78 @@
+/*-
+ * Copyright (c) 1993 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: Header: timerreg.h,v 1.2 93/02/28 15:08:58 mccanne Exp
+ * $FreeBSD: head/sys/dev/ic/i8253reg.h 146215 2005-05-14 10:26:31Z nyan $
+ */
+
+/*
+ * Register definitions for the Intel 8253 Programmable Interval Timer.
+ *
+ * This chip has three independent 16-bit down counters that can be
+ * read on the fly. There are three mode registers and three countdown
+ * registers. The countdown registers are addressed directly, via the
+ * first three I/O ports. The three mode registers are accessed via
+ * the fourth I/O port, with two bits in the mode byte indicating the
+ * register. (Why are hardware interfaces always so braindead?).
+ *
+ * To write a value into the countdown register, the mode register
+ * is first programmed with a command indicating the which byte of
+ * the two byte register is to be modified. The three possibilities
+ * are load msb (TMR_MR_MSB), load lsb (TMR_MR_LSB), or load lsb then
+ * msb (TMR_MR_BOTH).
+ *
+ * To read the current value ("on the fly") from the countdown register,
+ * you write a "latch" command into the mode register, then read the stable
+ * value from the corresponding I/O port. For example, you write
+ * TMR_MR_LATCH into the corresponding mode register. Presumably,
+ * after doing this, a write operation to the I/O port would result
+ * in undefined behavior (but hopefully not fry the chip).
+ * Reading in this manner has no side effects.
+ */
+
+/*
+ * Macros for specifying values to be written into a mode register.
+ */
+#define TIMER_REG_CNTR0 0 /* timer 0 counter port */
+#define TIMER_REG_CNTR1 1 /* timer 1 counter port */
+#define TIMER_REG_CNTR2 2 /* timer 2 counter port */
+#define TIMER_REG_MODE 3 /* timer mode port */
+#define TIMER_SEL0 0x00 /* select counter 0 */
+#define TIMER_SEL1 0x40 /* select counter 1 */
+#define TIMER_SEL2 0x80 /* select counter 2 */
+#define TIMER_INTTC 0x00 /* mode 0, intr on terminal cnt */
+#define TIMER_ONESHOT 0x02 /* mode 1, one shot */
+#define TIMER_RATEGEN 0x04 /* mode 2, rate generator */
+#define TIMER_SQWAVE 0x06 /* mode 3, square wave */
+#define TIMER_SWSTROBE 0x08 /* mode 4, s/w triggered strobe */
+#define TIMER_HWSTROBE 0x0a /* mode 5, h/w triggered strobe */
+#define TIMER_LATCH 0x00 /* latch counter for reading */
+#define TIMER_LSB 0x10 /* r/w counter LSB */
+#define TIMER_MSB 0x20 /* r/w counter MSB */
+#define TIMER_16BIT 0x30 /* r/w counter 16 bits, LSB first */
+#define TIMER_BCD 0x01 /* count in BCD */
diff --git a/usr/contrib/freebsd/dev/ic/i8259.h b/usr/contrib/freebsd/dev/ic/i8259.h
new file mode 100644
index 0000000000..be523c1df4
--- /dev/null
+++ b/usr/contrib/freebsd/dev/ic/i8259.h
@@ -0,0 +1,86 @@
+/*-
+ * Copyright (c) 2003 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/dev/ic/i8259.h 151580 2005-10-23 09:05:51Z glebius $
+ */
+
+/*
+ * Register defintions for the i8259A programmable interrupt controller.
+ */
+
+#ifndef _DEV_IC_I8259_H_
+#define _DEV_IC_I8259_H_
+
+/* Initialization control word 1. Written to even address. */
+#define ICW1_IC4 0x01 /* ICW4 present */
+#define ICW1_SNGL 0x02 /* 1 = single, 0 = cascaded */
+#define ICW1_ADI 0x04 /* 1 = 4, 0 = 8 byte vectors */
+#define ICW1_LTIM 0x08 /* 1 = level trigger, 0 = edge */
+#define ICW1_RESET 0x10 /* must be 1 */
+/* 0x20 - 0x80 - in 8080/8085 mode only */
+
+/* Initialization control word 2. Written to the odd address. */
+/* No definitions, it is the base vector of the IDT for 8086 mode */
+
+/* Initialization control word 3. Written to the odd address. */
+/* For a master PIC, bitfield indicating a slave 8259 on given input */
+/* For slave, lower 3 bits are the slave's ID binary id on master */
+
+/* Initialization control word 4. Written to the odd address. */
+#define ICW4_8086 0x01 /* 1 = 8086, 0 = 8080 */
+#define ICW4_AEOI 0x02 /* 1 = Auto EOI */
+#define ICW4_MS 0x04 /* 1 = buffered master, 0 = slave */
+#define ICW4_BUF 0x08 /* 1 = enable buffer mode */
+#define ICW4_SFNM 0x10 /* 1 = special fully nested mode */
+
+/* Operation control words. Written after initialization. */
+
+/* Operation control word type 1 */
+/*
+ * No definitions. Written to the odd address. Bitmask for interrupts.
+ * 1 = disabled.
+ */
+
+/* Operation control word type 2. Bit 3 (0x08) must be zero. Even address. */
+#define OCW2_L0 0x01 /* Level */
+#define OCW2_L1 0x02
+#define OCW2_L2 0x04
+/* 0x08 must be 0 to select OCW2 vs OCW3 */
+/* 0x10 must be 0 to select OCW2 vs ICW1 */
+#define OCW2_EOI 0x20 /* 1 = EOI */
+#define OCW2_SL 0x40 /* EOI mode */
+#define OCW2_R 0x80 /* EOI mode */
+
+/* Operation control word type 3. Bit 3 (0x08) must be set. Even address. */
+#define OCW3_RIS 0x01 /* 1 = read IS, 0 = read IR */
+#define OCW3_RR 0x02 /* register read */
+#define OCW3_P 0x04 /* poll mode command */
+/* 0x08 must be 1 to select OCW3 vs OCW2 */
+#define OCW3_SEL 0x08 /* must be 1 */
+/* 0x10 must be 0 to select OCW3 vs ICW1 */
+#define OCW3_SMM 0x20 /* special mode mask */
+#define OCW3_ESMM 0x40 /* enable SMM */
+
+#endif /* !_DEV_IC_I8259_H_ */
diff --git a/usr/contrib/freebsd/dev/ic/ns16550.h b/usr/contrib/freebsd/dev/ic/ns16550.h
new file mode 100644
index 0000000000..5e8f30e3e8
--- /dev/null
+++ b/usr/contrib/freebsd/dev/ic/ns16550.h
@@ -0,0 +1,240 @@
+/*-
+ * Copyright (c) 1991 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)ns16550.h 7.1 (Berkeley) 5/9/91
+ * $FreeBSD: head/sys/dev/ic/ns16550.h 257170 2013-10-26 17:24:59Z zbb $
+ */
+
+/*
+ * NS8250... UART registers.
+ */
+
+/* 8250 registers #[0-6]. */
+
+#define com_data 0 /* data register (R/W) */
+#define REG_DATA com_data
+
+#define com_ier 1 /* interrupt enable register (W) */
+#define REG_IER com_ier
+#define IER_ERXRDY 0x1
+#define IER_ETXRDY 0x2
+#define IER_ERLS 0x4
+#define IER_EMSC 0x8
+
+#define IER_BITS "\20\1ERXRDY\2ETXRDY\3ERLS\4EMSC"
+
+#define com_iir 2 /* interrupt identification register (R) */
+#define REG_IIR com_iir
+#define IIR_IMASK 0xf
+#define IIR_RXTOUT 0xc
+#define IIR_BUSY 0x7
+#define IIR_RLS 0x6
+#define IIR_RXRDY 0x4
+#define IIR_TXRDY 0x2
+#define IIR_NOPEND 0x1
+#define IIR_MLSC 0x0
+#define IIR_FIFO_MASK 0xc0 /* set if FIFOs are enabled */
+
+#define IIR_BITS "\20\1NOPEND\2TXRDY\3RXRDY"
+
+#define com_lcr 3 /* line control register (R/W) */
+#define com_cfcr com_lcr /* character format control register (R/W) */
+#define REG_LCR com_lcr
+#define LCR_DLAB 0x80
+#define CFCR_DLAB LCR_DLAB
+#define LCR_EFR_ENABLE 0xbf /* magic to enable EFR on 16650 up */
+#define CFCR_EFR_ENABLE LCR_EFR_ENABLE
+#define LCR_SBREAK 0x40
+#define CFCR_SBREAK LCR_SBREAK
+#define LCR_PZERO 0x30
+#define CFCR_PZERO LCR_PZERO
+#define LCR_PONE 0x20
+#define CFCR_PONE LCR_PONE
+#define LCR_PEVEN 0x10
+#define CFCR_PEVEN LCR_PEVEN
+#define LCR_PODD 0x00
+#define CFCR_PODD LCR_PODD
+#define LCR_PENAB 0x08
+#define CFCR_PENAB LCR_PENAB
+#define LCR_STOPB 0x04
+#define CFCR_STOPB LCR_STOPB
+#define LCR_8BITS 0x03
+#define CFCR_8BITS LCR_8BITS
+#define LCR_7BITS 0x02
+#define CFCR_7BITS LCR_7BITS
+#define LCR_6BITS 0x01
+#define CFCR_6BITS LCR_6BITS
+#define LCR_5BITS 0x00
+#define CFCR_5BITS LCR_5BITS
+
+#define com_mcr 4 /* modem control register (R/W) */
+#define REG_MCR com_mcr
+#define MCR_PRESCALE 0x80 /* only available on 16650 up */
+#define MCR_LOOPBACK 0x10
+#define MCR_IE 0x08
+#define MCR_IENABLE MCR_IE
+#define MCR_DRS 0x04
+#define MCR_RTS 0x02
+#define MCR_DTR 0x01
+
+#define MCR_BITS "\20\1DTR\2RTS\3DRS\4IE\5LOOPBACK\10PRESCALE"
+
+#define com_lsr 5 /* line status register (R/W) */
+#define REG_LSR com_lsr
+#define LSR_RCV_FIFO 0x80
+#define LSR_TEMT 0x40
+#define LSR_TSRE LSR_TEMT
+#define LSR_THRE 0x20
+#define LSR_TXRDY LSR_THRE
+#define LSR_BI 0x10
+#define LSR_FE 0x08
+#define LSR_PE 0x04
+#define LSR_OE 0x02
+#define LSR_RXRDY 0x01
+#define LSR_RCV_MASK 0x1f
+
+#define LSR_BITS "\20\1RXRDY\2OE\3PE\4FE\5BI\6THRE\7TEMT\10RCV_FIFO"
+
+#define com_msr 6 /* modem status register (R/W) */
+#define REG_MSR com_msr
+#define MSR_DCD 0x80
+#define MSR_RI 0x40
+#define MSR_DSR 0x20
+#define MSR_CTS 0x10
+#define MSR_DDCD 0x08
+#define MSR_TERI 0x04
+#define MSR_DDSR 0x02
+#define MSR_DCTS 0x01
+
+#define MSR_BITS "\20\1DCTS\2DDSR\3TERI\4DDCD\5CTS\6DSR\7RI\10DCD"
+
+/* 8250 multiplexed registers #[0-1]. Access enabled by LCR[7]. */
+#define com_dll 0 /* divisor latch low (R/W) */
+#define com_dlbl com_dll
+#define com_dlm 1 /* divisor latch high (R/W) */
+#define com_dlbh com_dlm
+#define REG_DLL com_dll
+#define REG_DLH com_dlm
+
+/* 16450 register #7. Not multiplexed. */
+#define com_scr 7 /* scratch register (R/W) */
+
+/* 16550 register #2. Not multiplexed. */
+#define com_fcr 2 /* FIFO control register (W) */
+#define com_fifo com_fcr
+#define REG_FCR com_fcr
+#define FCR_ENABLE 0x01
+#define FIFO_ENABLE FCR_ENABLE
+#define FCR_RCV_RST 0x02
+#define FIFO_RCV_RST FCR_RCV_RST
+#define FCR_XMT_RST 0x04
+#define FIFO_XMT_RST FCR_XMT_RST
+#define FCR_DMA 0x08
+#define FIFO_DMA_MODE FCR_DMA
+#define FCR_RX_LOW 0x00
+#define FIFO_RX_LOW FCR_RX_LOW
+#define FCR_RX_MEDL 0x40
+#define FIFO_RX_MEDL FCR_RX_MEDL
+#define FCR_RX_MEDH 0x80
+#define FIFO_RX_MEDH FCR_RX_MEDH
+#define FCR_RX_HIGH 0xc0
+#define FIFO_RX_HIGH FCR_RX_HIGH
+
+#define FCR_BITS "\20\1ENABLE\2RCV_RST\3XMT_RST\4DMA"
+
+/* 16650 registers #2,[4-7]. Access enabled by LCR_EFR_ENABLE. */
+
+#define com_efr 2 /* enhanced features register (R/W) */
+#define REG_EFR com_efr
+#define EFR_CTS 0x80
+#define EFR_AUTOCTS EFR_CTS
+#define EFR_RTS 0x40
+#define EFR_AUTORTS EFR_RTS
+#define EFR_EFE 0x10 /* enhanced functions enable */
+
+#define com_xon1 4 /* XON 1 character (R/W) */
+#define com_xon2 5 /* XON 2 character (R/W) */
+#define com_xoff1 6 /* XOFF 1 character (R/W) */
+#define com_xoff2 7 /* XOFF 2 character (R/W) */
+
+#define DW_REG_USR 31 /* DesignWare derived Uart Status Reg */
+#define com_usr 39 /* Octeon 16750/16550 Uart Status Reg */
+#define REG_USR com_usr
+#define USR_BUSY 1 /* Uart Busy. Serial transfer in progress */
+#define USR_TXFIFO_NOTFULL 2 /* Uart TX FIFO Not full */
+
+/* 16950 register #1. Access enabled by ACR[7]. Also requires !LCR[7]. */
+#define com_asr 1 /* additional status register (R[0-7]/W[0-1]) */
+
+/* 16950 register #3. R/W access enabled by ACR[7]. */
+#define com_rfl 3 /* receiver fifo level (R) */
+
+/*
+ * 16950 register #4. Access enabled by ACR[7]. Also requires
+ * !LCR_EFR_ENABLE.
+ */
+#define com_tfl 4 /* transmitter fifo level (R) */
+
+/*
+ * 16950 register #5. Accessible if !LCR_EFR_ENABLE. Read access also
+ * requires ACR[6].
+ */
+#define com_icr 5 /* index control register (R/W) */
+
+/*
+ * 16950 register #7. It is the same as com_scr except it has a different
+ * abbreviation in the manufacturer's data sheet and it also serves as an
+ * index into the Indexed Control register set.
+ */
+#define com_spr com_scr /* scratch pad (and index) register (R/W) */
+#define REG_SPR com_scr
+
+/*
+ * 16950 indexed control registers #[0-0x13]. Access is via index in SPR,
+ * data in ICR (if ICR is accessible).
+ */
+
+#define com_acr 0 /* additional control register (R/W) */
+#define ACR_ASE 0x80 /* ASR/RFL/TFL enable */
+#define ACR_ICRE 0x40 /* ICR enable */
+#define ACR_TLE 0x20 /* TTL/RTL enable */
+
+#define com_cpr 1 /* clock prescaler register (R/W) */
+#define com_tcr 2 /* times clock register (R/W) */
+#define com_ttl 4 /* transmitter trigger level (R/W) */
+#define com_rtl 5 /* receiver trigger level (R/W) */
+/* ... */
+
+/* Hardware extension mode register for RSB-2000/3000. */
+#define com_emr com_msr
+#define EMR_EXBUFF 0x04
+#define EMR_CTSFLW 0x08
+#define EMR_DSRFLW 0x10
+#define EMR_RTSFLW 0x20
+#define EMR_DTRFLW 0x40
+#define EMR_EFMODE 0x80
diff --git a/usr/contrib/freebsd/dev/pci/pcireg.h b/usr/contrib/freebsd/dev/pci/pcireg.h
new file mode 100644
index 0000000000..32a569dbd4
--- /dev/null
+++ b/usr/contrib/freebsd/dev/pci/pcireg.h
@@ -0,0 +1,922 @@
+/*-
+ * Copyright (c) 1997, Stefan Esser <se@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/dev/pci/pcireg.h 266468 2014-05-20 14:39:22Z mav $
+ *
+ */
+
+/*
+ * PCIM_xxx: mask to locate subfield in register
+ * PCIR_xxx: config register offset
+ * PCIC_xxx: device class
+ * PCIS_xxx: device subclass
+ * PCIP_xxx: device programming interface
+ * PCIV_xxx: PCI vendor ID (only required to fixup ancient devices)
+ * PCID_xxx: device ID
+ * PCIY_xxx: capability identification number
+ * PCIZ_xxx: extended capability identification number
+ */
+
+/* some PCI bus constants */
+#define PCI_DOMAINMAX 65535 /* highest supported domain number */
+#define PCI_BUSMAX 255 /* highest supported bus number */
+#define PCI_SLOTMAX 31 /* highest supported slot number */
+#define PCI_FUNCMAX 7 /* highest supported function number */
+#define PCI_REGMAX 255 /* highest supported config register addr. */
+#define PCIE_REGMAX 4095 /* highest supported config register addr. */
+#define PCI_MAXHDRTYPE 2
+
+#define PCIE_ARI_SLOTMAX 0
+#define PCIE_ARI_FUNCMAX 255
+
+#define PCI_RID_BUS_SHIFT 8
+#define PCI_RID_SLOT_SHIFT 3
+#define PCI_RID_FUNC_SHIFT 0
+
+#define PCI_RID(bus, slot, func) \
+ ((((bus) & PCI_BUSMAX) << PCI_RID_BUS_SHIFT) | \
+ (((slot) & PCI_SLOTMAX) << PCI_RID_SLOT_SHIFT) | \
+ (((func) & PCI_FUNCMAX) << PCI_RID_FUNC_SHIFT))
+
+#define PCI_ARI_RID(bus, func) \
+ ((((bus) & PCI_BUSMAX) << PCI_RID_BUS_SHIFT) | \
+ (((func) & PCIE_ARI_FUNCMAX) << PCI_RID_FUNC_SHIFT))
+
+#define PCI_RID2BUS(rid) (((rid) >> PCI_RID_BUS_SHIFT) & PCI_BUSMAX)
+#define PCI_RID2SLOT(rid) (((rid) >> PCI_RID_SLOT_SHIFT) & PCI_SLOTMAX)
+#define PCI_RID2FUNC(rid) (((rid) >> PCI_RID_FUNC_SHIFT) & PCI_FUNCMAX)
+
+#define PCIE_ARI_SLOT(func) (((func) >> PCI_RID_SLOT_SHIFT) & PCI_SLOTMAX)
+#define PCIE_ARI_FUNC(func) (((func) >> PCI_RID_FUNC_SHIFT) & PCI_FUNCMAX)
+
+/* PCI config header registers for all devices */
+
+#define PCIR_DEVVENDOR 0x00
+#define PCIR_VENDOR 0x00
+#define PCIR_DEVICE 0x02
+#define PCIR_COMMAND 0x04
+#define PCIM_CMD_PORTEN 0x0001
+#define PCIM_CMD_MEMEN 0x0002
+#define PCIM_CMD_BUSMASTEREN 0x0004
+#define PCIM_CMD_SPECIALEN 0x0008
+#define PCIM_CMD_MWRICEN 0x0010
+#define PCIM_CMD_PERRESPEN 0x0040
+#define PCIM_CMD_SERRESPEN 0x0100
+#define PCIM_CMD_BACKTOBACK 0x0200
+#define PCIM_CMD_INTxDIS 0x0400
+#define PCIR_STATUS 0x06
+#define PCIM_STATUS_INTxSTATE 0x0008
+#define PCIM_STATUS_CAPPRESENT 0x0010
+#define PCIM_STATUS_66CAPABLE 0x0020
+#define PCIM_STATUS_BACKTOBACK 0x0080
+#define PCIM_STATUS_MDPERR 0x0100
+#define PCIM_STATUS_SEL_FAST 0x0000
+#define PCIM_STATUS_SEL_MEDIMUM 0x0200
+#define PCIM_STATUS_SEL_SLOW 0x0400
+#define PCIM_STATUS_SEL_MASK 0x0600
+#define PCIM_STATUS_STABORT 0x0800
+#define PCIM_STATUS_RTABORT 0x1000
+#define PCIM_STATUS_RMABORT 0x2000
+#define PCIM_STATUS_SERR 0x4000
+#define PCIM_STATUS_PERR 0x8000
+#define PCIR_REVID 0x08
+#define PCIR_PROGIF 0x09
+#define PCIR_SUBCLASS 0x0a
+#define PCIR_CLASS 0x0b
+#define PCIR_CACHELNSZ 0x0c
+#define PCIR_LATTIMER 0x0d
+#define PCIR_HDRTYPE 0x0e
+#define PCIM_HDRTYPE 0x7f
+#define PCIM_HDRTYPE_NORMAL 0x00
+#define PCIM_HDRTYPE_BRIDGE 0x01
+#define PCIM_HDRTYPE_CARDBUS 0x02
+#define PCIM_MFDEV 0x80
+#define PCIR_BIST 0x0f
+
+/* Capability Register Offsets */
+
+#define PCICAP_ID 0x0
+#define PCICAP_NEXTPTR 0x1
+
+/* Capability Identification Numbers */
+
+#define PCIY_PMG 0x01 /* PCI Power Management */
+#define PCIY_AGP 0x02 /* AGP */
+#define PCIY_VPD 0x03 /* Vital Product Data */
+#define PCIY_SLOTID 0x04 /* Slot Identification */
+#define PCIY_MSI 0x05 /* Message Signaled Interrupts */
+#define PCIY_CHSWP 0x06 /* CompactPCI Hot Swap */
+#define PCIY_PCIX 0x07 /* PCI-X */
+#define PCIY_HT 0x08 /* HyperTransport */
+#define PCIY_VENDOR 0x09 /* Vendor Unique */
+#define PCIY_DEBUG 0x0a /* Debug port */
+#define PCIY_CRES 0x0b /* CompactPCI central resource control */
+#define PCIY_HOTPLUG 0x0c /* PCI Hot-Plug */
+#define PCIY_SUBVENDOR 0x0d /* PCI-PCI bridge subvendor ID */
+#define PCIY_AGP8X 0x0e /* AGP 8x */
+#define PCIY_SECDEV 0x0f /* Secure Device */
+#define PCIY_EXPRESS 0x10 /* PCI Express */
+#define PCIY_MSIX 0x11 /* MSI-X */
+#define PCIY_SATA 0x12 /* SATA */
+#define PCIY_PCIAF 0x13 /* PCI Advanced Features */
+
+/* Extended Capability Register Fields */
+
+#define PCIR_EXTCAP 0x100
+#define PCIM_EXTCAP_ID 0x0000ffff
+#define PCIM_EXTCAP_VER 0x000f0000
+#define PCIM_EXTCAP_NEXTPTR 0xfff00000
+#define PCI_EXTCAP_ID(ecap) ((ecap) & PCIM_EXTCAP_ID)
+#define PCI_EXTCAP_VER(ecap) (((ecap) & PCIM_EXTCAP_VER) >> 16)
+#define PCI_EXTCAP_NEXTPTR(ecap) (((ecap) & PCIM_EXTCAP_NEXTPTR) >> 20)
+
+/* Extended Capability Identification Numbers */
+
+#define PCIZ_AER 0x0001 /* Advanced Error Reporting */
+#define PCIZ_VC 0x0002 /* Virtual Channel if MFVC Ext Cap not set */
+#define PCIZ_SERNUM 0x0003 /* Device Serial Number */
+#define PCIZ_PWRBDGT 0x0004 /* Power Budgeting */
+#define PCIZ_RCLINK_DCL 0x0005 /* Root Complex Link Declaration */
+#define PCIZ_RCLINK_CTL 0x0006 /* Root Complex Internal Link Control */
+#define PCIZ_RCEC_ASSOC 0x0007 /* Root Complex Event Collector Association */
+#define PCIZ_MFVC 0x0008 /* Multi-Function Virtual Channel */
+#define PCIZ_VC2 0x0009 /* Virtual Channel if MFVC Ext Cap set */
+#define PCIZ_RCRB 0x000a /* RCRB Header */
+#define PCIZ_VENDOR 0x000b /* Vendor Unique */
+#define PCIZ_CAC 0x000c /* Configuration Access Correction -- obsolete */
+#define PCIZ_ACS 0x000d /* Access Control Services */
+#define PCIZ_ARI 0x000e /* Alternative Routing-ID Interpretation */
+#define PCIZ_ATS 0x000f /* Address Translation Services */
+#define PCIZ_SRIOV 0x0010 /* Single Root IO Virtualization */
+#define PCIZ_MRIOV 0x0011 /* Multiple Root IO Virtualization */
+#define PCIZ_MULTICAST 0x0012 /* Multicast */
+#define PCIZ_PAGE_REQ 0x0013 /* Page Request */
+#define PCIZ_AMD 0x0014 /* Reserved for AMD */
+#define PCIZ_RESIZE_BAR 0x0015 /* Resizable BAR */
+#define PCIZ_DPA 0x0016 /* Dynamic Power Allocation */
+#define PCIZ_TPH_REQ 0x0017 /* TPH Requester */
+#define PCIZ_LTR 0x0018 /* Latency Tolerance Reporting */
+#define PCIZ_SEC_PCIE 0x0019 /* Secondary PCI Express */
+#define PCIZ_PMUX 0x001a /* Protocol Multiplexing */
+#define PCIZ_PASID 0x001b /* Process Address Space ID */
+#define PCIZ_LN_REQ 0x001c /* LN Requester */
+#define PCIZ_DPC 0x001d /* Downstream Porto Containment */
+#define PCIZ_L1PM 0x001e /* L1 PM Substates */
+
+/* config registers for header type 0 devices */
+
+#define PCIR_BARS 0x10
+#define PCIR_BAR(x) (PCIR_BARS + (x) * 4)
+#define PCIR_MAX_BAR_0 5
+#define PCI_RID2BAR(rid) (((rid) - PCIR_BARS) / 4)
+#define PCI_BAR_IO(x) (((x) & PCIM_BAR_SPACE) == PCIM_BAR_IO_SPACE)
+#define PCI_BAR_MEM(x) (((x) & PCIM_BAR_SPACE) == PCIM_BAR_MEM_SPACE)
+#define PCIM_BAR_SPACE 0x00000001
+#define PCIM_BAR_MEM_SPACE 0
+#define PCIM_BAR_IO_SPACE 1
+#define PCIM_BAR_MEM_TYPE 0x00000006
+#define PCIM_BAR_MEM_32 0
+#define PCIM_BAR_MEM_1MB 2 /* Locate below 1MB in PCI <= 2.1 */
+#define PCIM_BAR_MEM_64 4
+#define PCIM_BAR_MEM_PREFETCH 0x00000008
+#define PCIM_BAR_MEM_BASE 0xfffffffffffffff0ULL
+#define PCIM_BAR_IO_RESERVED 0x00000002
+#define PCIM_BAR_IO_BASE 0xfffffffc
+#define PCIR_CIS 0x28
+#define PCIM_CIS_ASI_MASK 0x00000007
+#define PCIM_CIS_ASI_CONFIG 0
+#define PCIM_CIS_ASI_BAR0 1
+#define PCIM_CIS_ASI_BAR1 2
+#define PCIM_CIS_ASI_BAR2 3
+#define PCIM_CIS_ASI_BAR3 4
+#define PCIM_CIS_ASI_BAR4 5
+#define PCIM_CIS_ASI_BAR5 6
+#define PCIM_CIS_ASI_ROM 7
+#define PCIM_CIS_ADDR_MASK 0x0ffffff8
+#define PCIM_CIS_ROM_MASK 0xf0000000
+#define PCIM_CIS_CONFIG_MASK 0xff
+#define PCIR_SUBVEND_0 0x2c
+#define PCIR_SUBDEV_0 0x2e
+#define PCIR_BIOS 0x30
+#define PCIM_BIOS_ENABLE 0x01
+#define PCIM_BIOS_ADDR_MASK 0xfffff800
+#define PCIR_CAP_PTR 0x34
+#define PCIR_INTLINE 0x3c
+#define PCIR_INTPIN 0x3d
+#define PCIR_MINGNT 0x3e
+#define PCIR_MAXLAT 0x3f
+
+/* config registers for header type 1 (PCI-to-PCI bridge) devices */
+
+#define PCIR_MAX_BAR_1 1
+#define PCIR_SECSTAT_1 0x1e
+
+#define PCIR_PRIBUS_1 0x18
+#define PCIR_SECBUS_1 0x19
+#define PCIR_SUBBUS_1 0x1a
+#define PCIR_SECLAT_1 0x1b
+
+#define PCIR_IOBASEL_1 0x1c
+#define PCIR_IOLIMITL_1 0x1d
+#define PCIR_IOBASEH_1 0x30
+#define PCIR_IOLIMITH_1 0x32
+#define PCIM_BRIO_16 0x0
+#define PCIM_BRIO_32 0x1
+#define PCIM_BRIO_MASK 0xf
+
+#define PCIR_MEMBASE_1 0x20
+#define PCIR_MEMLIMIT_1 0x22
+
+#define PCIR_PMBASEL_1 0x24
+#define PCIR_PMLIMITL_1 0x26
+#define PCIR_PMBASEH_1 0x28
+#define PCIR_PMLIMITH_1 0x2c
+#define PCIM_BRPM_32 0x0
+#define PCIM_BRPM_64 0x1
+#define PCIM_BRPM_MASK 0xf
+
+#define PCIR_BIOS_1 0x38
+#define PCIR_BRIDGECTL_1 0x3e
+
+/* config registers for header type 2 (CardBus) devices */
+
+#define PCIR_MAX_BAR_2 0
+#define PCIR_CAP_PTR_2 0x14
+#define PCIR_SECSTAT_2 0x16
+
+#define PCIR_PRIBUS_2 0x18
+#define PCIR_SECBUS_2 0x19
+#define PCIR_SUBBUS_2 0x1a
+#define PCIR_SECLAT_2 0x1b
+
+#define PCIR_MEMBASE0_2 0x1c
+#define PCIR_MEMLIMIT0_2 0x20
+#define PCIR_MEMBASE1_2 0x24
+#define PCIR_MEMLIMIT1_2 0x28
+#define PCIR_IOBASE0_2 0x2c
+#define PCIR_IOLIMIT0_2 0x30
+#define PCIR_IOBASE1_2 0x34
+#define PCIR_IOLIMIT1_2 0x38
+
+#define PCIR_BRIDGECTL_2 0x3e
+
+#define PCIR_SUBVEND_2 0x40
+#define PCIR_SUBDEV_2 0x42
+
+#define PCIR_PCCARDIF_2 0x44
+
+/* PCI device class, subclass and programming interface definitions */
+
+#define PCIC_OLD 0x00
+#define PCIS_OLD_NONVGA 0x00
+#define PCIS_OLD_VGA 0x01
+
+#define PCIC_STORAGE 0x01
+#define PCIS_STORAGE_SCSI 0x00
+#define PCIS_STORAGE_IDE 0x01
+#define PCIP_STORAGE_IDE_MODEPRIM 0x01
+#define PCIP_STORAGE_IDE_PROGINDPRIM 0x02
+#define PCIP_STORAGE_IDE_MODESEC 0x04
+#define PCIP_STORAGE_IDE_PROGINDSEC 0x08
+#define PCIP_STORAGE_IDE_MASTERDEV 0x80
+#define PCIS_STORAGE_FLOPPY 0x02
+#define PCIS_STORAGE_IPI 0x03
+#define PCIS_STORAGE_RAID 0x04
+#define PCIS_STORAGE_ATA_ADMA 0x05
+#define PCIS_STORAGE_SATA 0x06
+#define PCIP_STORAGE_SATA_AHCI_1_0 0x01
+#define PCIS_STORAGE_SAS 0x07
+#define PCIS_STORAGE_NVM 0x08
+#define PCIP_STORAGE_NVM_NVMHCI_1_0 0x01
+#define PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0 0x02
+#define PCIS_STORAGE_OTHER 0x80
+
+#define PCIC_NETWORK 0x02
+#define PCIS_NETWORK_ETHERNET 0x00
+#define PCIS_NETWORK_TOKENRING 0x01
+#define PCIS_NETWORK_FDDI 0x02
+#define PCIS_NETWORK_ATM 0x03
+#define PCIS_NETWORK_ISDN 0x04
+#define PCIS_NETWORK_WORLDFIP 0x05
+#define PCIS_NETWORK_PICMG 0x06
+#define PCIS_NETWORK_OTHER 0x80
+
+#define PCIC_DISPLAY 0x03
+#define PCIS_DISPLAY_VGA 0x00
+#define PCIS_DISPLAY_XGA 0x01
+#define PCIS_DISPLAY_3D 0x02
+#define PCIS_DISPLAY_OTHER 0x80
+
+#define PCIC_MULTIMEDIA 0x04
+#define PCIS_MULTIMEDIA_VIDEO 0x00
+#define PCIS_MULTIMEDIA_AUDIO 0x01
+#define PCIS_MULTIMEDIA_TELE 0x02
+#define PCIS_MULTIMEDIA_HDA 0x03
+#define PCIS_MULTIMEDIA_OTHER 0x80
+
+#define PCIC_MEMORY 0x05
+#define PCIS_MEMORY_RAM 0x00
+#define PCIS_MEMORY_FLASH 0x01
+#define PCIS_MEMORY_OTHER 0x80
+
+#define PCIC_BRIDGE 0x06
+#define PCIS_BRIDGE_HOST 0x00
+#define PCIS_BRIDGE_ISA 0x01
+#define PCIS_BRIDGE_EISA 0x02
+#define PCIS_BRIDGE_MCA 0x03
+#define PCIS_BRIDGE_PCI 0x04
+#define PCIP_BRIDGE_PCI_SUBTRACTIVE 0x01
+#define PCIS_BRIDGE_PCMCIA 0x05
+#define PCIS_BRIDGE_NUBUS 0x06
+#define PCIS_BRIDGE_CARDBUS 0x07
+#define PCIS_BRIDGE_RACEWAY 0x08
+#define PCIS_BRIDGE_PCI_TRANSPARENT 0x09
+#define PCIS_BRIDGE_INFINIBAND 0x0a
+#define PCIS_BRIDGE_OTHER 0x80
+
+#define PCIC_SIMPLECOMM 0x07
+#define PCIS_SIMPLECOMM_UART 0x00
+#define PCIP_SIMPLECOMM_UART_8250 0x00
+#define PCIP_SIMPLECOMM_UART_16450A 0x01
+#define PCIP_SIMPLECOMM_UART_16550A 0x02
+#define PCIP_SIMPLECOMM_UART_16650A 0x03
+#define PCIP_SIMPLECOMM_UART_16750A 0x04
+#define PCIP_SIMPLECOMM_UART_16850A 0x05
+#define PCIP_SIMPLECOMM_UART_16950A 0x06
+#define PCIS_SIMPLECOMM_PAR 0x01
+#define PCIS_SIMPLECOMM_MULSER 0x02
+#define PCIS_SIMPLECOMM_MODEM 0x03
+#define PCIS_SIMPLECOMM_GPIB 0x04
+#define PCIS_SIMPLECOMM_SMART_CARD 0x05
+#define PCIS_SIMPLECOMM_OTHER 0x80
+
+#define PCIC_BASEPERIPH 0x08
+#define PCIS_BASEPERIPH_PIC 0x00
+#define PCIP_BASEPERIPH_PIC_8259A 0x00
+#define PCIP_BASEPERIPH_PIC_ISA 0x01
+#define PCIP_BASEPERIPH_PIC_EISA 0x02
+#define PCIP_BASEPERIPH_PIC_IO_APIC 0x10
+#define PCIP_BASEPERIPH_PIC_IOX_APIC 0x20
+#define PCIS_BASEPERIPH_DMA 0x01
+#define PCIS_BASEPERIPH_TIMER 0x02
+#define PCIS_BASEPERIPH_RTC 0x03
+#define PCIS_BASEPERIPH_PCIHOT 0x04
+#define PCIS_BASEPERIPH_SDHC 0x05
+#define PCIS_BASEPERIPH_IOMMU 0x06
+#define PCIS_BASEPERIPH_OTHER 0x80
+
+#define PCIC_INPUTDEV 0x09
+#define PCIS_INPUTDEV_KEYBOARD 0x00
+#define PCIS_INPUTDEV_DIGITIZER 0x01
+#define PCIS_INPUTDEV_MOUSE 0x02
+#define PCIS_INPUTDEV_SCANNER 0x03
+#define PCIS_INPUTDEV_GAMEPORT 0x04
+#define PCIS_INPUTDEV_OTHER 0x80
+
+#define PCIC_DOCKING 0x0a
+#define PCIS_DOCKING_GENERIC 0x00
+#define PCIS_DOCKING_OTHER 0x80
+
+#define PCIC_PROCESSOR 0x0b
+#define PCIS_PROCESSOR_386 0x00
+#define PCIS_PROCESSOR_486 0x01
+#define PCIS_PROCESSOR_PENTIUM 0x02
+#define PCIS_PROCESSOR_ALPHA 0x10
+#define PCIS_PROCESSOR_POWERPC 0x20
+#define PCIS_PROCESSOR_MIPS 0x30
+#define PCIS_PROCESSOR_COPROC 0x40
+
+#define PCIC_SERIALBUS 0x0c
+#define PCIS_SERIALBUS_FW 0x00
+#define PCIS_SERIALBUS_ACCESS 0x01
+#define PCIS_SERIALBUS_SSA 0x02
+#define PCIS_SERIALBUS_USB 0x03
+#define PCIP_SERIALBUS_USB_UHCI 0x00
+#define PCIP_SERIALBUS_USB_OHCI 0x10
+#define PCIP_SERIALBUS_USB_EHCI 0x20
+#define PCIP_SERIALBUS_USB_XHCI 0x30
+#define PCIP_SERIALBUS_USB_DEVICE 0xfe
+#define PCIS_SERIALBUS_FC 0x04
+#define PCIS_SERIALBUS_SMBUS 0x05
+#define PCIS_SERIALBUS_INFINIBAND 0x06
+#define PCIS_SERIALBUS_IPMI 0x07
+#define PCIP_SERIALBUS_IPMI_SMIC 0x00
+#define PCIP_SERIALBUS_IPMI_KCS 0x01
+#define PCIP_SERIALBUS_IPMI_BT 0x02
+#define PCIS_SERIALBUS_SERCOS 0x08
+#define PCIS_SERIALBUS_CANBUS 0x09
+
+#define PCIC_WIRELESS 0x0d
+#define PCIS_WIRELESS_IRDA 0x00
+#define PCIS_WIRELESS_IR 0x01
+#define PCIS_WIRELESS_RF 0x10
+#define PCIS_WIRELESS_BLUETOOTH 0x11
+#define PCIS_WIRELESS_BROADBAND 0x12
+#define PCIS_WIRELESS_80211A 0x20
+#define PCIS_WIRELESS_80211B 0x21
+#define PCIS_WIRELESS_OTHER 0x80
+
+#define PCIC_INTELLIIO 0x0e
+#define PCIS_INTELLIIO_I2O 0x00
+
+#define PCIC_SATCOM 0x0f
+#define PCIS_SATCOM_TV 0x01
+#define PCIS_SATCOM_AUDIO 0x02
+#define PCIS_SATCOM_VOICE 0x03
+#define PCIS_SATCOM_DATA 0x04
+
+#define PCIC_CRYPTO 0x10
+#define PCIS_CRYPTO_NETCOMP 0x00
+#define PCIS_CRYPTO_ENTERTAIN 0x10
+#define PCIS_CRYPTO_OTHER 0x80
+
+#define PCIC_DASP 0x11
+#define PCIS_DASP_DPIO 0x00
+#define PCIS_DASP_PERFCNTRS 0x01
+#define PCIS_DASP_COMM_SYNC 0x10
+#define PCIS_DASP_MGMT_CARD 0x20
+#define PCIS_DASP_OTHER 0x80
+
+#define PCIC_OTHER 0xff
+
+/* Bridge Control Values. */
+#define PCIB_BCR_PERR_ENABLE 0x0001
+#define PCIB_BCR_SERR_ENABLE 0x0002
+#define PCIB_BCR_ISA_ENABLE 0x0004
+#define PCIB_BCR_VGA_ENABLE 0x0008
+#define PCIB_BCR_MASTER_ABORT_MODE 0x0020
+#define PCIB_BCR_SECBUS_RESET 0x0040
+#define PCIB_BCR_SECBUS_BACKTOBACK 0x0080
+#define PCIB_BCR_PRI_DISCARD_TIMEOUT 0x0100
+#define PCIB_BCR_SEC_DISCARD_TIMEOUT 0x0200
+#define PCIB_BCR_DISCARD_TIMER_STATUS 0x0400
+#define PCIB_BCR_DISCARD_TIMER_SERREN 0x0800
+
+/* PCI power manangement */
+#define PCIR_POWER_CAP 0x2
+#define PCIM_PCAP_SPEC 0x0007
+#define PCIM_PCAP_PMEREQCLK 0x0008
+#define PCIM_PCAP_DEVSPECINIT 0x0020
+#define PCIM_PCAP_AUXPWR_0 0x0000
+#define PCIM_PCAP_AUXPWR_55 0x0040
+#define PCIM_PCAP_AUXPWR_100 0x0080
+#define PCIM_PCAP_AUXPWR_160 0x00c0
+#define PCIM_PCAP_AUXPWR_220 0x0100
+#define PCIM_PCAP_AUXPWR_270 0x0140
+#define PCIM_PCAP_AUXPWR_320 0x0180
+#define PCIM_PCAP_AUXPWR_375 0x01c0
+#define PCIM_PCAP_AUXPWRMASK 0x01c0
+#define PCIM_PCAP_D1SUPP 0x0200
+#define PCIM_PCAP_D2SUPP 0x0400
+#define PCIM_PCAP_D0PME 0x0800
+#define PCIM_PCAP_D1PME 0x1000
+#define PCIM_PCAP_D2PME 0x2000
+#define PCIM_PCAP_D3PME_HOT 0x4000
+#define PCIM_PCAP_D3PME_COLD 0x8000
+
+#define PCIR_POWER_STATUS 0x4
+#define PCIM_PSTAT_D0 0x0000
+#define PCIM_PSTAT_D1 0x0001
+#define PCIM_PSTAT_D2 0x0002
+#define PCIM_PSTAT_D3 0x0003
+#define PCIM_PSTAT_DMASK 0x0003
+#define PCIM_PSTAT_NOSOFTRESET 0x0008
+#define PCIM_PSTAT_PMEENABLE 0x0100
+#define PCIM_PSTAT_D0POWER 0x0000
+#define PCIM_PSTAT_D1POWER 0x0200
+#define PCIM_PSTAT_D2POWER 0x0400
+#define PCIM_PSTAT_D3POWER 0x0600
+#define PCIM_PSTAT_D0HEAT 0x0800
+#define PCIM_PSTAT_D1HEAT 0x0a00
+#define PCIM_PSTAT_D2HEAT 0x0c00
+#define PCIM_PSTAT_D3HEAT 0x0e00
+#define PCIM_PSTAT_DATASELMASK 0x1e00
+#define PCIM_PSTAT_DATAUNKN 0x0000
+#define PCIM_PSTAT_DATADIV10 0x2000
+#define PCIM_PSTAT_DATADIV100 0x4000
+#define PCIM_PSTAT_DATADIV1000 0x6000
+#define PCIM_PSTAT_DATADIVMASK 0x6000
+#define PCIM_PSTAT_PME 0x8000
+
+#define PCIR_POWER_BSE 0x6
+#define PCIM_PMCSR_BSE_D3B3 0x00
+#define PCIM_PMCSR_BSE_D3B2 0x40
+#define PCIM_PMCSR_BSE_BPCCE 0x80
+
+#define PCIR_POWER_DATA 0x7
+
+/* VPD capability registers */
+#define PCIR_VPD_ADDR 0x2
+#define PCIR_VPD_DATA 0x4
+
+/* PCI Message Signalled Interrupts (MSI) */
+#define PCIR_MSI_CTRL 0x2
+#define PCIM_MSICTRL_VECTOR 0x0100
+#define PCIM_MSICTRL_64BIT 0x0080
+#define PCIM_MSICTRL_MME_MASK 0x0070
+#define PCIM_MSICTRL_MME_1 0x0000
+#define PCIM_MSICTRL_MME_2 0x0010
+#define PCIM_MSICTRL_MME_4 0x0020
+#define PCIM_MSICTRL_MME_8 0x0030
+#define PCIM_MSICTRL_MME_16 0x0040
+#define PCIM_MSICTRL_MME_32 0x0050
+#define PCIM_MSICTRL_MMC_MASK 0x000E
+#define PCIM_MSICTRL_MMC_1 0x0000
+#define PCIM_MSICTRL_MMC_2 0x0002
+#define PCIM_MSICTRL_MMC_4 0x0004
+#define PCIM_MSICTRL_MMC_8 0x0006
+#define PCIM_MSICTRL_MMC_16 0x0008
+#define PCIM_MSICTRL_MMC_32 0x000A
+#define PCIM_MSICTRL_MSI_ENABLE 0x0001
+#define PCIR_MSI_ADDR 0x4
+#define PCIR_MSI_ADDR_HIGH 0x8
+#define PCIR_MSI_DATA 0x8
+#define PCIR_MSI_DATA_64BIT 0xc
+#define PCIR_MSI_MASK 0x10
+#define PCIR_MSI_PENDING 0x14
+
+/* PCI-X definitions */
+
+/* For header type 0 devices */
+#define PCIXR_COMMAND 0x2
+#define PCIXM_COMMAND_DPERR_E 0x0001 /* Data Parity Error Recovery */
+#define PCIXM_COMMAND_ERO 0x0002 /* Enable Relaxed Ordering */
+#define PCIXM_COMMAND_MAX_READ 0x000c /* Maximum Burst Read Count */
+#define PCIXM_COMMAND_MAX_READ_512 0x0000
+#define PCIXM_COMMAND_MAX_READ_1024 0x0004
+#define PCIXM_COMMAND_MAX_READ_2048 0x0008
+#define PCIXM_COMMAND_MAX_READ_4096 0x000c
+#define PCIXM_COMMAND_MAX_SPLITS 0x0070 /* Maximum Split Transactions */
+#define PCIXM_COMMAND_MAX_SPLITS_1 0x0000
+#define PCIXM_COMMAND_MAX_SPLITS_2 0x0010
+#define PCIXM_COMMAND_MAX_SPLITS_3 0x0020
+#define PCIXM_COMMAND_MAX_SPLITS_4 0x0030
+#define PCIXM_COMMAND_MAX_SPLITS_8 0x0040
+#define PCIXM_COMMAND_MAX_SPLITS_12 0x0050
+#define PCIXM_COMMAND_MAX_SPLITS_16 0x0060
+#define PCIXM_COMMAND_MAX_SPLITS_32 0x0070
+#define PCIXM_COMMAND_VERSION 0x3000
+#define PCIXR_STATUS 0x4
+#define PCIXM_STATUS_DEVFN 0x000000FF
+#define PCIXM_STATUS_BUS 0x0000FF00
+#define PCIXM_STATUS_64BIT 0x00010000
+#define PCIXM_STATUS_133CAP 0x00020000
+#define PCIXM_STATUS_SC_DISCARDED 0x00040000
+#define PCIXM_STATUS_UNEXP_SC 0x00080000
+#define PCIXM_STATUS_COMPLEX_DEV 0x00100000
+#define PCIXM_STATUS_MAX_READ 0x00600000
+#define PCIXM_STATUS_MAX_READ_512 0x00000000
+#define PCIXM_STATUS_MAX_READ_1024 0x00200000
+#define PCIXM_STATUS_MAX_READ_2048 0x00400000
+#define PCIXM_STATUS_MAX_READ_4096 0x00600000
+#define PCIXM_STATUS_MAX_SPLITS 0x03800000
+#define PCIXM_STATUS_MAX_SPLITS_1 0x00000000
+#define PCIXM_STATUS_MAX_SPLITS_2 0x00800000
+#define PCIXM_STATUS_MAX_SPLITS_3 0x01000000
+#define PCIXM_STATUS_MAX_SPLITS_4 0x01800000
+#define PCIXM_STATUS_MAX_SPLITS_8 0x02000000
+#define PCIXM_STATUS_MAX_SPLITS_12 0x02800000
+#define PCIXM_STATUS_MAX_SPLITS_16 0x03000000
+#define PCIXM_STATUS_MAX_SPLITS_32 0x03800000
+#define PCIXM_STATUS_MAX_CUM_READ 0x1C000000
+#define PCIXM_STATUS_RCVD_SC_ERR 0x20000000
+#define PCIXM_STATUS_266CAP 0x40000000
+#define PCIXM_STATUS_533CAP 0x80000000
+
+/* For header type 1 devices (PCI-X bridges) */
+#define PCIXR_SEC_STATUS 0x2
+#define PCIXM_SEC_STATUS_64BIT 0x0001
+#define PCIXM_SEC_STATUS_133CAP 0x0002
+#define PCIXM_SEC_STATUS_SC_DISC 0x0004
+#define PCIXM_SEC_STATUS_UNEXP_SC 0x0008
+#define PCIXM_SEC_STATUS_SC_OVERRUN 0x0010
+#define PCIXM_SEC_STATUS_SR_DELAYED 0x0020
+#define PCIXM_SEC_STATUS_BUS_MODE 0x03c0
+#define PCIXM_SEC_STATUS_VERSION 0x3000
+#define PCIXM_SEC_STATUS_266CAP 0x4000
+#define PCIXM_SEC_STATUS_533CAP 0x8000
+#define PCIXR_BRIDGE_STATUS 0x4
+#define PCIXM_BRIDGE_STATUS_DEVFN 0x000000FF
+#define PCIXM_BRIDGE_STATUS_BUS 0x0000FF00
+#define PCIXM_BRIDGE_STATUS_64BIT 0x00010000
+#define PCIXM_BRIDGE_STATUS_133CAP 0x00020000
+#define PCIXM_BRIDGE_STATUS_SC_DISCARDED 0x00040000
+#define PCIXM_BRIDGE_STATUS_UNEXP_SC 0x00080000
+#define PCIXM_BRIDGE_STATUS_SC_OVERRUN 0x00100000
+#define PCIXM_BRIDGE_STATUS_SR_DELAYED 0x00200000
+#define PCIXM_BRIDGE_STATUS_DEVID_MSGCAP 0x20000000
+#define PCIXM_BRIDGE_STATUS_266CAP 0x40000000
+#define PCIXM_BRIDGE_STATUS_533CAP 0x80000000
+
+/* HT (HyperTransport) Capability definitions */
+#define PCIR_HT_COMMAND 0x2
+#define PCIM_HTCMD_CAP_MASK 0xf800 /* Capability type. */
+#define PCIM_HTCAP_SLAVE 0x0000 /* 000xx */
+#define PCIM_HTCAP_HOST 0x2000 /* 001xx */
+#define PCIM_HTCAP_SWITCH 0x4000 /* 01000 */
+#define PCIM_HTCAP_INTERRUPT 0x8000 /* 10000 */
+#define PCIM_HTCAP_REVISION_ID 0x8800 /* 10001 */
+#define PCIM_HTCAP_UNITID_CLUMPING 0x9000 /* 10010 */
+#define PCIM_HTCAP_EXT_CONFIG_SPACE 0x9800 /* 10011 */
+#define PCIM_HTCAP_ADDRESS_MAPPING 0xa000 /* 10100 */
+#define PCIM_HTCAP_MSI_MAPPING 0xa800 /* 10101 */
+#define PCIM_HTCAP_DIRECT_ROUTE 0xb000 /* 10110 */
+#define PCIM_HTCAP_VCSET 0xb800 /* 10111 */
+#define PCIM_HTCAP_RETRY_MODE 0xc000 /* 11000 */
+#define PCIM_HTCAP_X86_ENCODING 0xc800 /* 11001 */
+#define PCIM_HTCAP_GEN3 0xd000 /* 11010 */
+#define PCIM_HTCAP_FLE 0xd800 /* 11011 */
+#define PCIM_HTCAP_PM 0xe000 /* 11100 */
+#define PCIM_HTCAP_HIGH_NODE_COUNT 0xe800 /* 11101 */
+
+/* HT MSI Mapping Capability definitions. */
+#define PCIM_HTCMD_MSI_ENABLE 0x0001
+#define PCIM_HTCMD_MSI_FIXED 0x0002
+#define PCIR_HTMSI_ADDRESS_LO 0x4
+#define PCIR_HTMSI_ADDRESS_HI 0x8
+
+/* PCI Vendor capability definitions */
+#define PCIR_VENDOR_LENGTH 0x2
+#define PCIR_VENDOR_DATA 0x3
+
+/* PCI EHCI Debug Port definitions */
+#define PCIR_DEBUG_PORT 0x2
+#define PCIM_DEBUG_PORT_OFFSET 0x1FFF
+#define PCIM_DEBUG_PORT_BAR 0xe000
+
+/* PCI-PCI Bridge Subvendor definitions */
+#define PCIR_SUBVENDCAP_ID 0x4
+
+/* PCI Express definitions */
+#define PCIER_FLAGS 0x2
+#define PCIEM_FLAGS_VERSION 0x000F
+#define PCIEM_FLAGS_TYPE 0x00F0
+#define PCIEM_TYPE_ENDPOINT 0x0000
+#define PCIEM_TYPE_LEGACY_ENDPOINT 0x0010
+#define PCIEM_TYPE_ROOT_PORT 0x0040
+#define PCIEM_TYPE_UPSTREAM_PORT 0x0050
+#define PCIEM_TYPE_DOWNSTREAM_PORT 0x0060
+#define PCIEM_TYPE_PCI_BRIDGE 0x0070
+#define PCIEM_TYPE_PCIE_BRIDGE 0x0080
+#define PCIEM_TYPE_ROOT_INT_EP 0x0090
+#define PCIEM_TYPE_ROOT_EC 0x00a0
+#define PCIEM_FLAGS_SLOT 0x0100
+#define PCIEM_FLAGS_IRQ 0x3e00
+#define PCIER_DEVICE_CAP 0x4
+#define PCIEM_CAP_MAX_PAYLOAD 0x00000007
+#define PCIEM_CAP_PHANTHOM_FUNCS 0x00000018
+#define PCIEM_CAP_EXT_TAG_FIELD 0x00000020
+#define PCIEM_CAP_L0S_LATENCY 0x000001c0
+#define PCIEM_CAP_L1_LATENCY 0x00000e00
+#define PCIEM_CAP_ROLE_ERR_RPT 0x00008000
+#define PCIEM_CAP_SLOT_PWR_LIM_VAL 0x03fc0000
+#define PCIEM_CAP_SLOT_PWR_LIM_SCALE 0x0c000000
+#define PCIEM_CAP_FLR 0x10000000
+#define PCIER_DEVICE_CTL 0x8
+#define PCIEM_CTL_COR_ENABLE 0x0001
+#define PCIEM_CTL_NFER_ENABLE 0x0002
+#define PCIEM_CTL_FER_ENABLE 0x0004
+#define PCIEM_CTL_URR_ENABLE 0x0008
+#define PCIEM_CTL_RELAXED_ORD_ENABLE 0x0010
+#define PCIEM_CTL_MAX_PAYLOAD 0x00e0
+#define PCIEM_CTL_EXT_TAG_FIELD 0x0100
+#define PCIEM_CTL_PHANTHOM_FUNCS 0x0200
+#define PCIEM_CTL_AUX_POWER_PM 0x0400
+#define PCIEM_CTL_NOSNOOP_ENABLE 0x0800
+#define PCIEM_CTL_MAX_READ_REQUEST 0x7000
+#define PCIEM_CTL_BRDG_CFG_RETRY 0x8000 /* PCI-E - PCI/PCI-X bridges */
+#define PCIEM_CTL_INITIATE_FLR 0x8000 /* FLR capable endpoints */
+#define PCIER_DEVICE_STA 0xa
+#define PCIEM_STA_CORRECTABLE_ERROR 0x0001
+#define PCIEM_STA_NON_FATAL_ERROR 0x0002
+#define PCIEM_STA_FATAL_ERROR 0x0004
+#define PCIEM_STA_UNSUPPORTED_REQ 0x0008
+#define PCIEM_STA_AUX_POWER 0x0010
+#define PCIEM_STA_TRANSACTION_PND 0x0020
+#define PCIER_LINK_CAP 0xc
+#define PCIEM_LINK_CAP_MAX_SPEED 0x0000000f
+#define PCIEM_LINK_CAP_MAX_WIDTH 0x000003f0
+#define PCIEM_LINK_CAP_ASPM 0x00000c00
+#define PCIEM_LINK_CAP_L0S_EXIT 0x00007000
+#define PCIEM_LINK_CAP_L1_EXIT 0x00038000
+#define PCIEM_LINK_CAP_CLOCK_PM 0x00040000
+#define PCIEM_LINK_CAP_SURPRISE_DOWN 0x00080000
+#define PCIEM_LINK_CAP_DL_ACTIVE 0x00100000
+#define PCIEM_LINK_CAP_LINK_BW_NOTIFY 0x00200000
+#define PCIEM_LINK_CAP_ASPM_COMPLIANCE 0x00400000
+#define PCIEM_LINK_CAP_PORT 0xff000000
+#define PCIER_LINK_CTL 0x10
+#define PCIEM_LINK_CTL_ASPMC_DIS 0x0000
+#define PCIEM_LINK_CTL_ASPMC_L0S 0x0001
+#define PCIEM_LINK_CTL_ASPMC_L1 0x0002
+#define PCIEM_LINK_CTL_ASPMC 0x0003
+#define PCIEM_LINK_CTL_RCB 0x0008
+#define PCIEM_LINK_CTL_LINK_DIS 0x0010
+#define PCIEM_LINK_CTL_RETRAIN_LINK 0x0020
+#define PCIEM_LINK_CTL_COMMON_CLOCK 0x0040
+#define PCIEM_LINK_CTL_EXTENDED_SYNC 0x0080
+#define PCIEM_LINK_CTL_ECPM 0x0100
+#define PCIEM_LINK_CTL_HAWD 0x0200
+#define PCIEM_LINK_CTL_LBMIE 0x0400
+#define PCIEM_LINK_CTL_LABIE 0x0800
+#define PCIER_LINK_STA 0x12
+#define PCIEM_LINK_STA_SPEED 0x000f
+#define PCIEM_LINK_STA_WIDTH 0x03f0
+#define PCIEM_LINK_STA_TRAINING_ERROR 0x0400
+#define PCIEM_LINK_STA_TRAINING 0x0800
+#define PCIEM_LINK_STA_SLOT_CLOCK 0x1000
+#define PCIEM_LINK_STA_DL_ACTIVE 0x2000
+#define PCIEM_LINK_STA_LINK_BW_MGMT 0x4000
+#define PCIEM_LINK_STA_LINK_AUTO_BW 0x8000
+#define PCIER_SLOT_CAP 0x14
+#define PCIEM_SLOT_CAP_APB 0x00000001
+#define PCIEM_SLOT_CAP_PCP 0x00000002
+#define PCIEM_SLOT_CAP_MRLSP 0x00000004
+#define PCIEM_SLOT_CAP_AIP 0x00000008
+#define PCIEM_SLOT_CAP_PIP 0x00000010
+#define PCIEM_SLOT_CAP_HPS 0x00000020
+#define PCIEM_SLOT_CAP_HPC 0x00000040
+#define PCIEM_SLOT_CAP_SPLV 0x00007f80
+#define PCIEM_SLOT_CAP_SPLS 0x00018000
+#define PCIEM_SLOT_CAP_EIP 0x00020000
+#define PCIEM_SLOT_CAP_NCCS 0x00040000
+#define PCIEM_SLOT_CAP_PSN 0xfff80000
+#define PCIER_SLOT_CTL 0x18
+#define PCIEM_SLOT_CTL_ABPE 0x0001
+#define PCIEM_SLOT_CTL_PFDE 0x0002
+#define PCIEM_SLOT_CTL_MRLSCE 0x0004
+#define PCIEM_SLOT_CTL_PDCE 0x0008
+#define PCIEM_SLOT_CTL_CCIE 0x0010
+#define PCIEM_SLOT_CTL_HPIE 0x0020
+#define PCIEM_SLOT_CTL_AIC 0x00c0
+#define PCIEM_SLOT_CTL_PIC 0x0300
+#define PCIEM_SLOT_CTL_PCC 0x0400
+#define PCIEM_SLOT_CTL_EIC 0x0800
+#define PCIEM_SLOT_CTL_DLLSCE 0x1000
+#define PCIER_SLOT_STA 0x1a
+#define PCIEM_SLOT_STA_ABP 0x0001
+#define PCIEM_SLOT_STA_PFD 0x0002
+#define PCIEM_SLOT_STA_MRLSC 0x0004
+#define PCIEM_SLOT_STA_PDC 0x0008
+#define PCIEM_SLOT_STA_CC 0x0010
+#define PCIEM_SLOT_STA_MRLSS 0x0020
+#define PCIEM_SLOT_STA_PDS 0x0040
+#define PCIEM_SLOT_STA_EIS 0x0080
+#define PCIEM_SLOT_STA_DLLSC 0x0100
+#define PCIER_ROOT_CTL 0x1c
+#define PCIEM_ROOT_CTL_SERR_CORR 0x0001
+#define PCIEM_ROOT_CTL_SERR_NONFATAL 0x0002
+#define PCIEM_ROOT_CTL_SERR_FATAL 0x0004
+#define PCIEM_ROOT_CTL_PME 0x0008
+#define PCIEM_ROOT_CTL_CRS_VIS 0x0010
+#define PCIER_ROOT_CAP 0x1e
+#define PCIEM_ROOT_CAP_CRS_VIS 0x0001
+#define PCIER_ROOT_STA 0x20
+#define PCIEM_ROOT_STA_PME_REQID_MASK 0x0000ffff
+#define PCIEM_ROOT_STA_PME_STATUS 0x00010000
+#define PCIEM_ROOT_STA_PME_PEND 0x00020000
+#define PCIER_DEVICE_CAP2 0x24
+#define PCIEM_CAP2_ARI 0x20
+#define PCIER_DEVICE_CTL2 0x28
+#define PCIEM_CTL2_COMP_TIMEOUT_VAL 0x000f
+#define PCIEM_CTL2_COMP_TIMEOUT_DIS 0x0010
+#define PCIEM_CTL2_ARI 0x0020
+#define PCIEM_CTL2_ATOMIC_REQ_ENABLE 0x0040
+#define PCIEM_CTL2_ATOMIC_EGR_BLOCK 0x0080
+#define PCIEM_CTL2_ID_ORDERED_REQ_EN 0x0100
+#define PCIEM_CTL2_ID_ORDERED_CMP_EN 0x0200
+#define PCIEM_CTL2_LTR_ENABLE 0x0400
+#define PCIEM_CTL2_OBFF 0x6000
+#define PCIEM_OBFF_DISABLE 0x0000
+#define PCIEM_OBFF_MSGA_ENABLE 0x2000
+#define PCIEM_OBFF_MSGB_ENABLE 0x4000
+#define PCIEM_OBFF_WAKE_ENABLE 0x6000
+#define PCIEM_CTL2_END2END_TLP 0x8000
+#define PCIER_DEVICE_STA2 0x2a
+#define PCIER_LINK_CAP2 0x2c
+#define PCIER_LINK_CTL2 0x30
+#define PCIER_LINK_STA2 0x32
+#define PCIER_SLOT_CAP2 0x34
+#define PCIER_SLOT_CTL2 0x38
+#define PCIER_SLOT_STA2 0x3a
+
+/* MSI-X definitions */
+#define PCIR_MSIX_CTRL 0x2
+#define PCIM_MSIXCTRL_MSIX_ENABLE 0x8000
+#define PCIM_MSIXCTRL_FUNCTION_MASK 0x4000
+#define PCIM_MSIXCTRL_TABLE_SIZE 0x07FF
+#define PCIR_MSIX_TABLE 0x4
+#define PCIR_MSIX_PBA 0x8
+#define PCIM_MSIX_BIR_MASK 0x7
+#define PCIM_MSIX_BIR_BAR_10 0
+#define PCIM_MSIX_BIR_BAR_14 1
+#define PCIM_MSIX_BIR_BAR_18 2
+#define PCIM_MSIX_BIR_BAR_1C 3
+#define PCIM_MSIX_BIR_BAR_20 4
+#define PCIM_MSIX_BIR_BAR_24 5
+#define PCIM_MSIX_VCTRL_MASK 0x1
+
+/* PCI Advanced Features definitions */
+#define PCIR_PCIAF_CAP 0x3
+#define PCIM_PCIAFCAP_TP 0x01
+#define PCIM_PCIAFCAP_FLR 0x02
+#define PCIR_PCIAF_CTRL 0x4
+#define PCIR_PCIAFCTRL_FLR 0x01
+#define PCIR_PCIAF_STATUS 0x5
+#define PCIR_PCIAFSTATUS_TP 0x01
+
+/* Advanced Error Reporting */
+#define PCIR_AER_UC_STATUS 0x04
+#define PCIM_AER_UC_TRAINING_ERROR 0x00000001
+#define PCIM_AER_UC_DL_PROTOCOL_ERROR 0x00000010
+#define PCIM_AER_UC_SURPRISE_LINK_DOWN 0x00000020
+#define PCIM_AER_UC_POISONED_TLP 0x00001000
+#define PCIM_AER_UC_FC_PROTOCOL_ERROR 0x00002000
+#define PCIM_AER_UC_COMPLETION_TIMEOUT 0x00004000
+#define PCIM_AER_UC_COMPLETER_ABORT 0x00008000
+#define PCIM_AER_UC_UNEXPECTED_COMPLETION 0x00010000
+#define PCIM_AER_UC_RECEIVER_OVERFLOW 0x00020000
+#define PCIM_AER_UC_MALFORMED_TLP 0x00040000
+#define PCIM_AER_UC_ECRC_ERROR 0x00080000
+#define PCIM_AER_UC_UNSUPPORTED_REQUEST 0x00100000
+#define PCIM_AER_UC_ACS_VIOLATION 0x00200000
+#define PCIM_AER_UC_INTERNAL_ERROR 0x00400000
+#define PCIM_AER_UC_MC_BLOCKED_TLP 0x00800000
+#define PCIM_AER_UC_ATOMIC_EGRESS_BLK 0x01000000
+#define PCIM_AER_UC_TLP_PREFIX_BLOCKED 0x02000000
+#define PCIR_AER_UC_MASK 0x08 /* Shares bits with UC_STATUS */
+#define PCIR_AER_UC_SEVERITY 0x0c /* Shares bits with UC_STATUS */
+#define PCIR_AER_COR_STATUS 0x10
+#define PCIM_AER_COR_RECEIVER_ERROR 0x00000001
+#define PCIM_AER_COR_BAD_TLP 0x00000040
+#define PCIM_AER_COR_BAD_DLLP 0x00000080
+#define PCIM_AER_COR_REPLAY_ROLLOVER 0x00000100
+#define PCIM_AER_COR_REPLAY_TIMEOUT 0x00001000
+#define PCIM_AER_COR_ADVISORY_NF_ERROR 0x00002000
+#define PCIM_AER_COR_INTERNAL_ERROR 0x00004000
+#define PCIM_AER_COR_HEADER_LOG_OVFLOW 0x00008000
+#define PCIR_AER_COR_MASK 0x14 /* Shares bits with COR_STATUS */
+#define PCIR_AER_CAP_CONTROL 0x18
+#define PCIM_AER_FIRST_ERROR_PTR 0x0000001f
+#define PCIM_AER_ECRC_GEN_CAPABLE 0x00000020
+#define PCIM_AER_ECRC_GEN_ENABLE 0x00000040
+#define PCIM_AER_ECRC_CHECK_CAPABLE 0x00000080
+#define PCIM_AER_ECRC_CHECK_ENABLE 0x00000100
+#define PCIM_AER_MULT_HDR_CAPABLE 0x00000200
+#define PCIM_AER_MULT_HDR_ENABLE 0x00000400
+#define PCIM_AER_TLP_PREFIX_LOG_PRESENT 0x00000800
+#define PCIR_AER_HEADER_LOG 0x1c
+#define PCIR_AER_ROOTERR_CMD 0x2c /* Only for root complex ports */
+#define PCIM_AER_ROOTERR_COR_ENABLE 0x00000001
+#define PCIM_AER_ROOTERR_NF_ENABLE 0x00000002
+#define PCIM_AER_ROOTERR_F_ENABLE 0x00000004
+#define PCIR_AER_ROOTERR_STATUS 0x30 /* Only for root complex ports */
+#define PCIM_AER_ROOTERR_COR_ERR 0x00000001
+#define PCIM_AER_ROOTERR_MULTI_COR_ERR 0x00000002
+#define PCIM_AER_ROOTERR_UC_ERR 0x00000004
+#define PCIM_AER_ROOTERR_MULTI_UC_ERR 0x00000008
+#define PCIM_AER_ROOTERR_FIRST_UC_FATAL 0x00000010
+#define PCIM_AER_ROOTERR_NF_ERR 0x00000020
+#define PCIM_AER_ROOTERR_F_ERR 0x00000040
+#define PCIM_AER_ROOTERR_INT_MESSAGE 0xf8000000
+#define PCIR_AER_COR_SOURCE_ID 0x34 /* Only for root complex ports */
+#define PCIR_AER_ERR_SOURCE_ID 0x36 /* Only for root complex ports */
+#define PCIR_AER_TLP_PREFIX_LOG 0x38 /* Only for TLP prefix functions */
+
+/* Virtual Channel definitions */
+#define PCIR_VC_CAP1 0x04
+#define PCIM_VC_CAP1_EXT_COUNT 0x00000007
+#define PCIM_VC_CAP1_LOWPRI_EXT_COUNT 0x00000070
+#define PCIR_VC_CAP2 0x08
+#define PCIR_VC_CONTROL 0x0C
+#define PCIR_VC_STATUS 0x0E
+#define PCIR_VC_RESOURCE_CAP(n) (0x10 + (n) * 0x0C)
+#define PCIR_VC_RESOURCE_CTL(n) (0x14 + (n) * 0x0C)
+#define PCIR_VC_RESOURCE_STA(n) (0x18 + (n) * 0x0C)
+
+/* Serial Number definitions */
+#define PCIR_SERIAL_LOW 0x04
+#define PCIR_SERIAL_HIGH 0x08
+
diff --git a/usr/contrib/freebsd/isa/isareg.h b/usr/contrib/freebsd/isa/isareg.h
new file mode 100644
index 0000000000..e83e34674f
--- /dev/null
+++ b/usr/contrib/freebsd/isa/isareg.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)isa.h 5.7 (Berkeley) 5/9/91
+ * $FreeBSD: head/sys/isa/isareg.h 263379 2014-03-19 21:03:04Z imp $
+ */
+
+#ifdef PC98
+#error isareg.h is included from PC-9801 source
+#endif
+
+#ifndef _ISA_ISA_H_
+#define _ISA_ISA_H_
+
+/*
+ * ISA Bus conventions
+ */
+
+/*
+ * Input / Output Port Assignments
+ */
+#ifndef IO_ISABEGIN
+#define IO_ISABEGIN 0x000 /* 0x000 - Beginning of I/O Registers */
+#define IO_ICU1 0x020 /* 8259A Interrupt Controller #1 */
+#define IO_KBD 0x060 /* 8042 Keyboard */
+#define IO_RTC 0x070 /* RTC */
+#define IO_ICU2 0x0A0 /* 8259A Interrupt Controller #2 */
+
+#define IO_MDA 0x3B0 /* Monochome Adapter */
+#define IO_VGA 0x3C0 /* E/VGA Ports */
+#define IO_CGA 0x3D0 /* CGA Ports */
+
+#endif /* !IO_ISABEGIN */
+
+/*
+ * Input / Output Port Sizes
+ */
+#define IO_CGASIZE 12 /* CGA controllers */
+#define IO_MDASIZE 12 /* Monochrome display controllers */
+#define IO_VGASIZE 16 /* VGA controllers */
+
+#endif /* !_ISA_ISA_H_ */
diff --git a/usr/contrib/freebsd/lib/libutil/expand_number.c b/usr/contrib/freebsd/lib/libutil/expand_number.c
new file mode 100644
index 0000000000..f3b4da89f9
--- /dev/null
+++ b/usr/contrib/freebsd/lib/libutil/expand_number.c
@@ -0,0 +1,93 @@
+/*-
+ * Copyright (c) 2007 Eric Anderson <anderson@FreeBSD.org>
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/lib/libutil/expand_number.c 255069 2013-08-30 11:21:52Z pluknet $");
+
+#include <sys/types.h>
+#include <ctype.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <libutil.h>
+#include <stdint.h>
+
+int
+expand_number(const char *buf, uint64_t *num)
+{
+ char *endptr;
+ uintmax_t umaxval;
+ uint64_t number;
+ unsigned shift;
+ int serrno;
+
+ serrno = errno;
+ errno = 0;
+ umaxval = strtoumax(buf, &endptr, 0);
+ if (umaxval > UINT64_MAX)
+ errno = ERANGE;
+ if (errno != 0)
+ return (-1);
+ errno = serrno;
+ number = umaxval;
+
+ switch (tolower((unsigned char)*endptr)) {
+ case 'e':
+ shift = 60;
+ break;
+ case 'p':
+ shift = 50;
+ break;
+ case 't':
+ shift = 40;
+ break;
+ case 'g':
+ shift = 30;
+ break;
+ case 'm':
+ shift = 20;
+ break;
+ case 'k':
+ shift = 10;
+ break;
+ case 'b':
+ case '\0': /* No unit. */
+ *num = number;
+ return (0);
+ default:
+ /* Unrecognized unit. */
+ errno = EINVAL;
+ return (-1);
+ }
+
+ if ((number << shift) >> shift != number) {
+ /* Overflow */
+ errno = ERANGE;
+ return (-1);
+ }
+ *num = number << shift;
+ return (0);
+}
diff --git a/usr/contrib/freebsd/sys/ata.h b/usr/contrib/freebsd/sys/ata.h
new file mode 100644
index 0000000000..705460355f
--- /dev/null
+++ b/usr/contrib/freebsd/sys/ata.h
@@ -0,0 +1,635 @@
+/*-
+ * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification, immediately at the beginning of the file.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/sys/ata.h 264853 2014-04-24 01:28:14Z smh $
+ */
+
+#ifndef _SYS_ATA_H_
+#define _SYS_ATA_H_
+
+#include <sys/ioccom.h>
+
+/* ATA/ATAPI device parameters */
+struct ata_params {
+/*000*/ u_int16_t config; /* configuration info */
+#define ATA_PROTO_MASK 0x8003
+#define ATA_PROTO_ATAPI 0x8000
+#define ATA_PROTO_ATAPI_12 0x8000
+#define ATA_PROTO_ATAPI_16 0x8001
+#define ATA_PROTO_CFA 0x848a
+#define ATA_ATAPI_TYPE_MASK 0x1f00
+#define ATA_ATAPI_TYPE_DIRECT 0x0000 /* disk/floppy */
+#define ATA_ATAPI_TYPE_TAPE 0x0100 /* streaming tape */
+#define ATA_ATAPI_TYPE_CDROM 0x0500 /* CD-ROM device */
+#define ATA_ATAPI_TYPE_OPTICAL 0x0700 /* optical disk */
+#define ATA_DRQ_MASK 0x0060
+#define ATA_DRQ_SLOW 0x0000 /* cpu 3 ms delay */
+#define ATA_DRQ_INTR 0x0020 /* interrupt 10 ms delay */
+#define ATA_DRQ_FAST 0x0040 /* accel 50 us delay */
+#define ATA_RESP_INCOMPLETE 0x0004
+
+/*001*/ u_int16_t cylinders; /* # of cylinders */
+/*002*/ u_int16_t specconf; /* specific configuration */
+/*003*/ u_int16_t heads; /* # heads */
+ u_int16_t obsolete4;
+ u_int16_t obsolete5;
+/*006*/ u_int16_t sectors; /* # sectors/track */
+/*007*/ u_int16_t vendor7[3];
+/*010*/ u_int8_t serial[20]; /* serial number */
+/*020*/ u_int16_t retired20;
+ u_int16_t retired21;
+ u_int16_t obsolete22;
+/*023*/ u_int8_t revision[8]; /* firmware revision */
+/*027*/ u_int8_t model[40]; /* model name */
+/*047*/ u_int16_t sectors_intr; /* sectors per interrupt */
+/*048*/ u_int16_t usedmovsd; /* double word read/write? */
+/*049*/ u_int16_t capabilities1;
+#define ATA_SUPPORT_DMA 0x0100
+#define ATA_SUPPORT_LBA 0x0200
+#define ATA_SUPPORT_IORDY 0x0400
+#define ATA_SUPPORT_IORDYDIS 0x0800
+#define ATA_SUPPORT_OVERLAP 0x4000
+
+/*050*/ u_int16_t capabilities2;
+/*051*/ u_int16_t retired_piomode; /* PIO modes 0-2 */
+#define ATA_RETIRED_PIO_MASK 0x0300
+
+/*052*/ u_int16_t retired_dmamode; /* DMA modes */
+#define ATA_RETIRED_DMA_MASK 0x0003
+
+/*053*/ u_int16_t atavalid; /* fields valid */
+#define ATA_FLAG_54_58 0x0001 /* words 54-58 valid */
+#define ATA_FLAG_64_70 0x0002 /* words 64-70 valid */
+#define ATA_FLAG_88 0x0004 /* word 88 valid */
+
+/*054*/ u_int16_t current_cylinders;
+/*055*/ u_int16_t current_heads;
+/*056*/ u_int16_t current_sectors;
+/*057*/ u_int16_t current_size_1;
+/*058*/ u_int16_t current_size_2;
+/*059*/ u_int16_t multi;
+#define ATA_MULTI_VALID 0x0100
+
+/*060*/ u_int16_t lba_size_1;
+ u_int16_t lba_size_2;
+ u_int16_t obsolete62;
+/*063*/ u_int16_t mwdmamodes; /* multiword DMA modes */
+/*064*/ u_int16_t apiomodes; /* advanced PIO modes */
+
+/*065*/ u_int16_t mwdmamin; /* min. M/W DMA time/word ns */
+/*066*/ u_int16_t mwdmarec; /* rec. M/W DMA time ns */
+/*067*/ u_int16_t pioblind; /* min. PIO cycle w/o flow */
+/*068*/ u_int16_t pioiordy; /* min. PIO cycle IORDY flow */
+/*069*/ u_int16_t support3;
+#define ATA_SUPPORT_RZAT 0x0020
+#define ATA_SUPPORT_DRAT 0x4000
+ u_int16_t reserved70;
+/*071*/ u_int16_t rlsovlap; /* rel time (us) for overlap */
+/*072*/ u_int16_t rlsservice; /* rel time (us) for service */
+ u_int16_t reserved73;
+ u_int16_t reserved74;
+/*075*/ u_int16_t queue;
+#define ATA_QUEUE_LEN(x) ((x) & 0x001f)
+
+/*76*/ u_int16_t satacapabilities;
+#define ATA_SATA_GEN1 0x0002
+#define ATA_SATA_GEN2 0x0004
+#define ATA_SATA_GEN3 0x0008
+#define ATA_SUPPORT_NCQ 0x0100
+#define ATA_SUPPORT_IFPWRMNGTRCV 0x0200
+#define ATA_SUPPORT_PHYEVENTCNT 0x0400
+#define ATA_SUPPORT_NCQ_UNLOAD 0x0800
+#define ATA_SUPPORT_NCQ_PRIO 0x1000
+#define ATA_SUPPORT_HAPST 0x2000
+#define ATA_SUPPORT_DAPST 0x4000
+#define ATA_SUPPORT_READLOGDMAEXT 0x8000
+
+/*77*/ u_int16_t satacapabilities2;
+#define ATA_SATA_CURR_GEN_MASK 0x0006
+#define ATA_SUPPORT_NCQ_STREAM 0x0010
+#define ATA_SUPPORT_NCQ_QMANAGEMENT 0x0020
+#define ATA_SUPPORT_RCVSND_FPDMA_QUEUED 0x0040
+/*78*/ u_int16_t satasupport;
+#define ATA_SUPPORT_NONZERO 0x0002
+#define ATA_SUPPORT_AUTOACTIVATE 0x0004
+#define ATA_SUPPORT_IFPWRMNGT 0x0008
+#define ATA_SUPPORT_INORDERDATA 0x0010
+#define ATA_SUPPORT_ASYNCNOTIF 0x0020
+#define ATA_SUPPORT_SOFTSETPRESERVE 0x0040
+/*79*/ u_int16_t sataenabled;
+#define ATA_ENABLED_DAPST 0x0080
+
+/*080*/ u_int16_t version_major;
+/*081*/ u_int16_t version_minor;
+
+ struct {
+/*082/085*/ u_int16_t command1;
+#define ATA_SUPPORT_SMART 0x0001
+#define ATA_SUPPORT_SECURITY 0x0002
+#define ATA_SUPPORT_REMOVABLE 0x0004
+#define ATA_SUPPORT_POWERMGT 0x0008
+#define ATA_SUPPORT_PACKET 0x0010
+#define ATA_SUPPORT_WRITECACHE 0x0020
+#define ATA_SUPPORT_LOOKAHEAD 0x0040
+#define ATA_SUPPORT_RELEASEIRQ 0x0080
+#define ATA_SUPPORT_SERVICEIRQ 0x0100
+#define ATA_SUPPORT_RESET 0x0200
+#define ATA_SUPPORT_PROTECTED 0x0400
+#define ATA_SUPPORT_WRITEBUFFER 0x1000
+#define ATA_SUPPORT_READBUFFER 0x2000
+#define ATA_SUPPORT_NOP 0x4000
+
+/*083/086*/ u_int16_t command2;
+#define ATA_SUPPORT_MICROCODE 0x0001
+#define ATA_SUPPORT_QUEUED 0x0002
+#define ATA_SUPPORT_CFA 0x0004
+#define ATA_SUPPORT_APM 0x0008
+#define ATA_SUPPORT_NOTIFY 0x0010
+#define ATA_SUPPORT_STANDBY 0x0020
+#define ATA_SUPPORT_SPINUP 0x0040
+#define ATA_SUPPORT_MAXSECURITY 0x0100
+#define ATA_SUPPORT_AUTOACOUSTIC 0x0200
+#define ATA_SUPPORT_ADDRESS48 0x0400
+#define ATA_SUPPORT_OVERLAY 0x0800
+#define ATA_SUPPORT_FLUSHCACHE 0x1000
+#define ATA_SUPPORT_FLUSHCACHE48 0x2000
+
+/*084/087*/ u_int16_t extension;
+#define ATA_SUPPORT_SMARTLOG 0x0001
+#define ATA_SUPPORT_SMARTTEST 0x0002
+#define ATA_SUPPORT_MEDIASN 0x0004
+#define ATA_SUPPORT_MEDIAPASS 0x0008
+#define ATA_SUPPORT_STREAMING 0x0010
+#define ATA_SUPPORT_GENLOG 0x0020
+#define ATA_SUPPORT_WRITEDMAFUAEXT 0x0040
+#define ATA_SUPPORT_WRITEDMAQFUAEXT 0x0080
+#define ATA_SUPPORT_64BITWWN 0x0100
+#define ATA_SUPPORT_UNLOAD 0x2000
+ } __packed support, enabled;
+
+/*088*/ u_int16_t udmamodes; /* UltraDMA modes */
+/*089*/ u_int16_t erase_time; /* time req'd in 2min units */
+/*090*/ u_int16_t enhanced_erase_time; /* time req'd in 2min units */
+/*091*/ u_int16_t apm_value;
+/*092*/ u_int16_t master_passwd_revision; /* password revision code */
+/*093*/ u_int16_t hwres;
+#define ATA_CABLE_ID 0x2000
+
+/*094*/ u_int16_t acoustic;
+#define ATA_ACOUSTIC_CURRENT(x) ((x) & 0x00ff)
+#define ATA_ACOUSTIC_VENDOR(x) (((x) & 0xff00) >> 8)
+
+/*095*/ u_int16_t stream_min_req_size;
+/*096*/ u_int16_t stream_transfer_time;
+/*097*/ u_int16_t stream_access_latency;
+/*098*/ u_int32_t stream_granularity;
+/*100*/ u_int16_t lba_size48_1;
+ u_int16_t lba_size48_2;
+ u_int16_t lba_size48_3;
+ u_int16_t lba_size48_4;
+ u_int16_t reserved104;
+/*105*/ u_int16_t max_dsm_blocks;
+/*106*/ u_int16_t pss;
+#define ATA_PSS_LSPPS 0x000F
+#define ATA_PSS_LSSABOVE512 0x1000
+#define ATA_PSS_MULTLS 0x2000
+#define ATA_PSS_VALID_MASK 0xC000
+#define ATA_PSS_VALID_VALUE 0x4000
+/*107*/ u_int16_t isd;
+/*108*/ u_int16_t wwn[4];
+ u_int16_t reserved112[5];
+/*117*/ u_int16_t lss_1;
+/*118*/ u_int16_t lss_2;
+/*119*/ u_int16_t support2;
+#define ATA_SUPPORT_WRITEREADVERIFY 0x0002
+#define ATA_SUPPORT_WRITEUNCORREXT 0x0004
+#define ATA_SUPPORT_RWLOGDMAEXT 0x0008
+#define ATA_SUPPORT_MICROCODE3 0x0010
+#define ATA_SUPPORT_FREEFALL 0x0020
+/*120*/ u_int16_t enabled2;
+ u_int16_t reserved121[6];
+/*127*/ u_int16_t removable_status;
+/*128*/ u_int16_t security_status;
+#define ATA_SECURITY_LEVEL 0x0100 /* 0: high, 1: maximum */
+#define ATA_SECURITY_ENH_SUPP 0x0020 /* enhanced erase supported */
+#define ATA_SECURITY_COUNT_EXP 0x0010 /* count expired */
+#define ATA_SECURITY_FROZEN 0x0008 /* security config is frozen */
+#define ATA_SECURITY_LOCKED 0x0004 /* drive is locked */
+#define ATA_SECURITY_ENABLED 0x0002 /* ATA Security is enabled */
+#define ATA_SECURITY_SUPPORTED 0x0001 /* ATA Security is supported */
+
+ u_int16_t reserved129[31];
+/*160*/ u_int16_t cfa_powermode1;
+ u_int16_t reserved161;
+/*162*/ u_int16_t cfa_kms_support;
+/*163*/ u_int16_t cfa_trueide_modes;
+/*164*/ u_int16_t cfa_memory_modes;
+ u_int16_t reserved165[4];
+/*169*/ u_int16_t support_dsm;
+#define ATA_SUPPORT_DSM_TRIM 0x0001
+ u_int16_t reserved170[6];
+/*176*/ u_int8_t media_serial[60];
+/*206*/ u_int16_t sct;
+ u_int16_t reserved206[2];
+/*209*/ u_int16_t lsalign;
+/*210*/ u_int16_t wrv_sectors_m3_1;
+ u_int16_t wrv_sectors_m3_2;
+/*212*/ u_int16_t wrv_sectors_m2_1;
+ u_int16_t wrv_sectors_m2_2;
+/*214*/ u_int16_t nv_cache_caps;
+/*215*/ u_int16_t nv_cache_size_1;
+ u_int16_t nv_cache_size_2;
+/*217*/ u_int16_t media_rotation_rate;
+#define ATA_RATE_NOT_REPORTED 0x0000
+#define ATA_RATE_NON_ROTATING 0x0001
+ u_int16_t reserved218;
+/*219*/ u_int16_t nv_cache_opt;
+/*220*/ u_int16_t wrv_mode;
+ u_int16_t reserved221;
+/*222*/ u_int16_t transport_major;
+/*223*/ u_int16_t transport_minor;
+ u_int16_t reserved224[31];
+/*255*/ u_int16_t integrity;
+} __packed;
+
+/* ATA Dataset Management */
+#define ATA_DSM_BLK_SIZE 512
+#define ATA_DSM_BLK_RANGES 64
+#define ATA_DSM_RANGE_SIZE 8
+#define ATA_DSM_RANGE_MAX 65535
+
+/*
+ * ATA Device Register
+ *
+ * bit 7 Obsolete (was 1 in early ATA specs)
+ * bit 6 Sets LBA/CHS mode. 1=LBA, 0=CHS
+ * bit 5 Obsolete (was 1 in early ATA specs)
+ * bit 4 1 = Slave Drive, 0 = Master Drive
+ * bit 3-0 In LBA mode, 27-24 of address. In CHS mode, head number
+*/
+
+#define ATA_DEV_MASTER 0x00
+#define ATA_DEV_SLAVE 0x10
+#define ATA_DEV_LBA 0x40
+
+/* ATA limits */
+#define ATA_MAX_28BIT_LBA 268435455UL
+
+/* ATA Status Register */
+#define ATA_STATUS_ERROR 0x01
+#define ATA_STATUS_DEVICE_FAULT 0x20
+
+/* ATA Error Register */
+#define ATA_ERROR_ABORT 0x04
+#define ATA_ERROR_ID_NOT_FOUND 0x10
+
+/* ATA HPA Features */
+#define ATA_HPA_FEAT_MAX_ADDR 0x00
+#define ATA_HPA_FEAT_SET_PWD 0x01
+#define ATA_HPA_FEAT_LOCK 0x02
+#define ATA_HPA_FEAT_UNLOCK 0x03
+#define ATA_HPA_FEAT_FREEZE 0x04
+
+/* ATA transfer modes */
+#define ATA_MODE_MASK 0x0f
+#define ATA_DMA_MASK 0xf0
+#define ATA_PIO 0x00
+#define ATA_PIO0 0x08
+#define ATA_PIO1 0x09
+#define ATA_PIO2 0x0a
+#define ATA_PIO3 0x0b
+#define ATA_PIO4 0x0c
+#define ATA_PIO_MAX 0x0f
+#define ATA_DMA 0x10
+#define ATA_WDMA0 0x20
+#define ATA_WDMA1 0x21
+#define ATA_WDMA2 0x22
+#define ATA_UDMA0 0x40
+#define ATA_UDMA1 0x41
+#define ATA_UDMA2 0x42
+#define ATA_UDMA3 0x43
+#define ATA_UDMA4 0x44
+#define ATA_UDMA5 0x45
+#define ATA_UDMA6 0x46
+#define ATA_SA150 0x47
+#define ATA_SA300 0x48
+#define ATA_DMA_MAX 0x4f
+
+
+/* ATA commands */
+#define ATA_NOP 0x00 /* NOP */
+#define ATA_NF_FLUSHQUEUE 0x00 /* flush queued cmd's */
+#define ATA_NF_AUTOPOLL 0x01 /* start autopoll function */
+#define ATA_DATA_SET_MANAGEMENT 0x06
+#define ATA_DSM_TRIM 0x01
+#define ATA_DEVICE_RESET 0x08 /* reset device */
+#define ATA_READ 0x20 /* read */
+#define ATA_READ48 0x24 /* read 48bit LBA */
+#define ATA_READ_DMA48 0x25 /* read DMA 48bit LBA */
+#define ATA_READ_DMA_QUEUED48 0x26 /* read DMA QUEUED 48bit LBA */
+#define ATA_READ_NATIVE_MAX_ADDRESS48 0x27 /* read native max addr 48bit */
+#define ATA_READ_MUL48 0x29 /* read multi 48bit LBA */
+#define ATA_READ_STREAM_DMA48 0x2a /* read DMA stream 48bit LBA */
+#define ATA_READ_LOG_EXT 0x2f /* read log ext - PIO Data-In */
+#define ATA_READ_STREAM48 0x2b /* read stream 48bit LBA */
+#define ATA_WRITE 0x30 /* write */
+#define ATA_WRITE48 0x34 /* write 48bit LBA */
+#define ATA_WRITE_DMA48 0x35 /* write DMA 48bit LBA */
+#define ATA_WRITE_DMA_QUEUED48 0x36 /* write DMA QUEUED 48bit LBA*/
+#define ATA_SET_MAX_ADDRESS48 0x37 /* set max address 48bit */
+#define ATA_WRITE_MUL48 0x39 /* write multi 48bit LBA */
+#define ATA_WRITE_STREAM_DMA48 0x3a
+#define ATA_WRITE_STREAM48 0x3b
+#define ATA_WRITE_DMA_FUA48 0x3d
+#define ATA_WRITE_DMA_QUEUED_FUA48 0x3e
+#define ATA_WRITE_LOG_EXT 0x3f
+#define ATA_READ_VERIFY 0x40
+#define ATA_READ_VERIFY48 0x42
+#define ATA_READ_LOG_DMA_EXT 0x47 /* read log DMA ext - PIO Data-In */
+#define ATA_READ_FPDMA_QUEUED 0x60 /* read DMA NCQ */
+#define ATA_WRITE_FPDMA_QUEUED 0x61 /* write DMA NCQ */
+#define ATA_SEND_FPDMA_QUEUED 0x64 /* send DMA NCQ */
+#define ATA_RECV_FPDMA_QUEUED 0x65 /* recieve DMA NCQ */
+#define ATA_SEP_ATTN 0x67 /* SEP request */
+#define ATA_SEEK 0x70 /* seek */
+#define ATA_PACKET_CMD 0xa0 /* packet command */
+#define ATA_ATAPI_IDENTIFY 0xa1 /* get ATAPI params*/
+#define ATA_SERVICE 0xa2 /* service command */
+#define ATA_SMART_CMD 0xb0 /* SMART command */
+#define ATA_CFA_ERASE 0xc0 /* CFA erase */
+#define ATA_READ_MUL 0xc4 /* read multi */
+#define ATA_WRITE_MUL 0xc5 /* write multi */
+#define ATA_SET_MULTI 0xc6 /* set multi size */
+#define ATA_READ_DMA_QUEUED 0xc7 /* read DMA QUEUED */
+#define ATA_READ_DMA 0xc8 /* read DMA */
+#define ATA_WRITE_DMA 0xca /* write DMA */
+#define ATA_WRITE_DMA_QUEUED 0xcc /* write DMA QUEUED */
+#define ATA_WRITE_MUL_FUA48 0xce
+#define ATA_STANDBY_IMMEDIATE 0xe0 /* standby immediate */
+#define ATA_IDLE_IMMEDIATE 0xe1 /* idle immediate */
+#define ATA_STANDBY_CMD 0xe2 /* standby */
+#define ATA_IDLE_CMD 0xe3 /* idle */
+#define ATA_READ_BUFFER 0xe4 /* read buffer */
+#define ATA_READ_PM 0xe4 /* read portmultiplier */
+#define ATA_SLEEP 0xe6 /* sleep */
+#define ATA_FLUSHCACHE 0xe7 /* flush cache to disk */
+#define ATA_WRITE_PM 0xe8 /* write portmultiplier */
+#define ATA_FLUSHCACHE48 0xea /* flush cache to disk */
+#define ATA_ATA_IDENTIFY 0xec /* get ATA params */
+#define ATA_SETFEATURES 0xef /* features command */
+#define ATA_SF_SETXFER 0x03 /* set transfer mode */
+#define ATA_SF_ENAB_WCACHE 0x02 /* enable write cache */
+#define ATA_SF_DIS_WCACHE 0x82 /* disable write cache */
+#define ATA_SF_ENAB_PUIS 0x06 /* enable PUIS */
+#define ATA_SF_DIS_PUIS 0x86 /* disable PUIS */
+#define ATA_SF_PUIS_SPINUP 0x07 /* PUIS spin-up */
+#define ATA_SF_ENAB_RCACHE 0xaa /* enable readahead cache */
+#define ATA_SF_DIS_RCACHE 0x55 /* disable readahead cache */
+#define ATA_SF_ENAB_RELIRQ 0x5d /* enable release interrupt */
+#define ATA_SF_DIS_RELIRQ 0xdd /* disable release interrupt */
+#define ATA_SF_ENAB_SRVIRQ 0x5e /* enable service interrupt */
+#define ATA_SF_DIS_SRVIRQ 0xde /* disable service interrupt */
+#define ATA_SECURITY_SET_PASSWORD 0xf1 /* set drive password */
+#define ATA_SECURITY_UNLOCK 0xf2 /* unlock drive using passwd */
+#define ATA_SECURITY_ERASE_PREPARE 0xf3 /* prepare to erase drive */
+#define ATA_SECURITY_ERASE_UNIT 0xf4 /* erase all blocks on drive */
+#define ATA_SECURITY_FREEZE_LOCK 0xf5 /* freeze security config */
+#define ATA_SECURITY_DISABLE_PASSWORD 0xf6 /* disable drive password */
+#define ATA_READ_NATIVE_MAX_ADDRESS 0xf8 /* read native max address */
+#define ATA_SET_MAX_ADDRESS 0xf9 /* set max address */
+
+
+/* ATAPI commands */
+#define ATAPI_TEST_UNIT_READY 0x00 /* check if device is ready */
+#define ATAPI_REZERO 0x01 /* rewind */
+#define ATAPI_REQUEST_SENSE 0x03 /* get sense data */
+#define ATAPI_FORMAT 0x04 /* format unit */
+#define ATAPI_READ 0x08 /* read data */
+#define ATAPI_WRITE 0x0a /* write data */
+#define ATAPI_WEOF 0x10 /* write filemark */
+#define ATAPI_WF_WRITE 0x01
+#define ATAPI_SPACE 0x11 /* space command */
+#define ATAPI_SP_FM 0x01
+#define ATAPI_SP_EOD 0x03
+#define ATAPI_INQUIRY 0x12 /* get inquiry data */
+#define ATAPI_MODE_SELECT 0x15 /* mode select */
+#define ATAPI_ERASE 0x19 /* erase */
+#define ATAPI_MODE_SENSE 0x1a /* mode sense */
+#define ATAPI_START_STOP 0x1b /* start/stop unit */
+#define ATAPI_SS_LOAD 0x01
+#define ATAPI_SS_RETENSION 0x02
+#define ATAPI_SS_EJECT 0x04
+#define ATAPI_PREVENT_ALLOW 0x1e /* media removal */
+#define ATAPI_READ_FORMAT_CAPACITIES 0x23 /* get format capacities */
+#define ATAPI_READ_CAPACITY 0x25 /* get volume capacity */
+#define ATAPI_READ_BIG 0x28 /* read data */
+#define ATAPI_WRITE_BIG 0x2a /* write data */
+#define ATAPI_LOCATE 0x2b /* locate to position */
+#define ATAPI_READ_POSITION 0x34 /* read position */
+#define ATAPI_SYNCHRONIZE_CACHE 0x35 /* flush buf, close channel */
+#define ATAPI_WRITE_BUFFER 0x3b /* write device buffer */
+#define ATAPI_READ_BUFFER 0x3c /* read device buffer */
+#define ATAPI_READ_SUBCHANNEL 0x42 /* get subchannel info */
+#define ATAPI_READ_TOC 0x43 /* get table of contents */
+#define ATAPI_PLAY_10 0x45 /* play by lba */
+#define ATAPI_PLAY_MSF 0x47 /* play by MSF address */
+#define ATAPI_PLAY_TRACK 0x48 /* play by track number */
+#define ATAPI_PAUSE 0x4b /* pause audio operation */
+#define ATAPI_READ_DISK_INFO 0x51 /* get disk info structure */
+#define ATAPI_READ_TRACK_INFO 0x52 /* get track info structure */
+#define ATAPI_RESERVE_TRACK 0x53 /* reserve track */
+#define ATAPI_SEND_OPC_INFO 0x54 /* send OPC structurek */
+#define ATAPI_MODE_SELECT_BIG 0x55 /* set device parameters */
+#define ATAPI_REPAIR_TRACK 0x58 /* repair track */
+#define ATAPI_READ_MASTER_CUE 0x59 /* read master CUE info */
+#define ATAPI_MODE_SENSE_BIG 0x5a /* get device parameters */
+#define ATAPI_CLOSE_TRACK 0x5b /* close track/session */
+#define ATAPI_READ_BUFFER_CAPACITY 0x5c /* get buffer capicity */
+#define ATAPI_SEND_CUE_SHEET 0x5d /* send CUE sheet */
+#define ATAPI_SERVICE_ACTION_IN 0x96 /* get service data */
+#define ATAPI_BLANK 0xa1 /* blank the media */
+#define ATAPI_SEND_KEY 0xa3 /* send DVD key structure */
+#define ATAPI_REPORT_KEY 0xa4 /* get DVD key structure */
+#define ATAPI_PLAY_12 0xa5 /* play by lba */
+#define ATAPI_LOAD_UNLOAD 0xa6 /* changer control command */
+#define ATAPI_READ_STRUCTURE 0xad /* get DVD structure */
+#define ATAPI_PLAY_CD 0xb4 /* universal play command */
+#define ATAPI_SET_SPEED 0xbb /* set drive speed */
+#define ATAPI_MECH_STATUS 0xbd /* get changer status */
+#define ATAPI_READ_CD 0xbe /* read data */
+#define ATAPI_POLL_DSC 0xff /* poll DSC status bit */
+
+
+struct ata_ioc_devices {
+ int channel;
+ char name[2][32];
+ struct ata_params params[2];
+};
+
+/* pr channel ATA ioctl calls */
+#define IOCATAGMAXCHANNEL _IOR('a', 1, int)
+#define IOCATAREINIT _IOW('a', 2, int)
+#define IOCATAATTACH _IOW('a', 3, int)
+#define IOCATADETACH _IOW('a', 4, int)
+#define IOCATADEVICES _IOWR('a', 5, struct ata_ioc_devices)
+
+/* ATAPI request sense structure */
+struct atapi_sense {
+ u_int8_t error; /* current or deferred errors */
+#define ATA_SENSE_VALID 0x80
+
+ u_int8_t segment; /* segment number */
+ u_int8_t key; /* sense key */
+#define ATA_SENSE_KEY_MASK 0x0f /* sense key mask */
+#define ATA_SENSE_NO_SENSE 0x00 /* no specific sense key info */
+#define ATA_SENSE_RECOVERED_ERROR 0x01 /* command OK, data recovered */
+#define ATA_SENSE_NOT_READY 0x02 /* no access to drive */
+#define ATA_SENSE_MEDIUM_ERROR 0x03 /* non-recovered data error */
+#define ATA_SENSE_HARDWARE_ERROR 0x04 /* non-recoverable HW failure */
+#define ATA_SENSE_ILLEGAL_REQUEST 0x05 /* invalid command param(s) */
+#define ATA_SENSE_UNIT_ATTENTION 0x06 /* media changed */
+#define ATA_SENSE_DATA_PROTECT 0x07 /* write protect */
+#define ATA_SENSE_BLANK_CHECK 0x08 /* blank check */
+#define ATA_SENSE_VENDOR_SPECIFIC 0x09 /* vendor specific skey */
+#define ATA_SENSE_COPY_ABORTED 0x0a /* copy aborted */
+#define ATA_SENSE_ABORTED_COMMAND 0x0b /* command aborted, try again */
+#define ATA_SENSE_EQUAL 0x0c /* equal */
+#define ATA_SENSE_VOLUME_OVERFLOW 0x0d /* volume overflow */
+#define ATA_SENSE_MISCOMPARE 0x0e /* data dont match the medium */
+#define ATA_SENSE_RESERVED 0x0f
+#define ATA_SENSE_ILI 0x20;
+#define ATA_SENSE_EOM 0x40;
+#define ATA_SENSE_FILEMARK 0x80;
+
+ u_int32_t cmd_info; /* cmd information */
+ u_int8_t sense_length; /* additional sense len (n-7) */
+ u_int32_t cmd_specific_info; /* additional cmd spec info */
+ u_int8_t asc; /* additional sense code */
+ u_int8_t ascq; /* additional sense code qual */
+ u_int8_t replaceable_unit_code; /* replaceable unit code */
+ u_int8_t specific; /* sense key specific */
+#define ATA_SENSE_SPEC_VALID 0x80
+#define ATA_SENSE_SPEC_MASK 0x7f
+
+ u_int8_t specific1; /* sense key specific */
+ u_int8_t specific2; /* sense key specific */
+} __packed;
+
+struct ata_ioc_request {
+ union {
+ struct {
+ u_int8_t command;
+ u_int8_t feature;
+ u_int64_t lba;
+ u_int16_t count;
+ } ata;
+ struct {
+ char ccb[16];
+ struct atapi_sense sense;
+ } atapi;
+ } u;
+ caddr_t data;
+ int count;
+ int flags;
+#define ATA_CMD_CONTROL 0x01
+#define ATA_CMD_READ 0x02
+#define ATA_CMD_WRITE 0x04
+#define ATA_CMD_ATAPI 0x08
+
+ int timeout;
+ int error;
+};
+
+struct ata_security_password {
+ u_int16_t ctrl;
+#define ATA_SECURITY_PASSWORD_USER 0x0000
+#define ATA_SECURITY_PASSWORD_MASTER 0x0001
+#define ATA_SECURITY_ERASE_NORMAL 0x0000
+#define ATA_SECURITY_ERASE_ENHANCED 0x0002
+#define ATA_SECURITY_LEVEL_HIGH 0x0000
+#define ATA_SECURITY_LEVEL_MAXIMUM 0x0100
+
+ u_int8_t password[32];
+ u_int16_t revision;
+ u_int16_t reserved[238];
+};
+
+/* pr device ATA ioctl calls */
+#define IOCATAREQUEST _IOWR('a', 100, struct ata_ioc_request)
+#define IOCATAGPARM _IOR('a', 101, struct ata_params)
+#define IOCATAGMODE _IOR('a', 102, int)
+#define IOCATASMODE _IOW('a', 103, int)
+
+#define IOCATAGSPINDOWN _IOR('a', 104, int)
+#define IOCATASSPINDOWN _IOW('a', 105, int)
+
+
+struct ata_ioc_raid_config {
+ int lun;
+ int type;
+#define AR_JBOD 0x0001
+#define AR_SPAN 0x0002
+#define AR_RAID0 0x0004
+#define AR_RAID1 0x0008
+#define AR_RAID01 0x0010
+#define AR_RAID3 0x0020
+#define AR_RAID4 0x0040
+#define AR_RAID5 0x0080
+
+ int interleave;
+ int status;
+#define AR_READY 1
+#define AR_DEGRADED 2
+#define AR_REBUILDING 4
+
+ int progress;
+ int total_disks;
+ int disks[16];
+};
+
+struct ata_ioc_raid_status {
+ int lun;
+ int type;
+ int interleave;
+ int status;
+ int progress;
+ int total_disks;
+ struct {
+ int state;
+#define AR_DISK_ONLINE 0x01
+#define AR_DISK_PRESENT 0x02
+#define AR_DISK_SPARE 0x04
+ int lun;
+ } disks[16];
+};
+
+/* ATA RAID ioctl calls */
+#define IOCATARAIDCREATE _IOWR('a', 200, struct ata_ioc_raid_config)
+#define IOCATARAIDDELETE _IOW('a', 201, int)
+#define IOCATARAIDSTATUS _IOWR('a', 202, struct ata_ioc_raid_status)
+#define IOCATARAIDADDSPARE _IOW('a', 203, struct ata_ioc_raid_config)
+#define IOCATARAIDREBUILD _IOW('a', 204, int)
+
+#endif /* _SYS_ATA_H_ */
diff --git a/usr/contrib/freebsd/sys/linker_set.h b/usr/contrib/freebsd/sys/linker_set.h
new file mode 100644
index 0000000000..393dfbc131
--- /dev/null
+++ b/usr/contrib/freebsd/sys/linker_set.h
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 1999 John D. Polstra
+ * Copyright (c) 1999,2001 Peter Wemm <peter@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/sys/linker_set.h 215701 2010-11-22 19:32:54Z dim $
+ */
+
+#ifndef _SYS_LINKER_SET_H_
+#define _SYS_LINKER_SET_H_
+
+#ifdef __FreeBSD__
+#ifndef _SYS_CDEFS_H_
+#error this file needs sys/cdefs.h as a prerequisite
+#endif
+#else
+#ifndef _COMPAT_FREEBSD_SYS_CDEFS_H_
+#error this file needs sys/cdefs.h as a prerequisite
+#endif
+#endif
+
+/*
+ * The following macros are used to declare global sets of objects, which
+ * are collected by the linker into a `linker_set' as defined below.
+ * For ELF, this is done by constructing a separate segment for each set.
+ */
+
+/*
+ * Private macros, not to be used outside this header file.
+ */
+#ifdef __GNUCLIKE___SECTION
+#ifdef __FreeBSD__
+#define __MAKE_SET(set, sym) \
+ __GLOBL(__CONCAT(__start_set_,set)); \
+ __GLOBL(__CONCAT(__stop_set_,set)); \
+ static void const * const __set_##set##_sym_##sym \
+ __section("set_" #set) __used = &sym
+#else
+#define __MAKE_SET(set, sym) \
+ static void const * const __set_##set##_sym_##sym \
+ __section("set_" #set) __used = &sym
+#endif
+#else /* !__GNUCLIKE___SECTION */
+#ifndef lint
+#error this file needs to be ported to your compiler
+#endif /* lint */
+#define __MAKE_SET(set, sym) extern void const * const (__set_##set##_sym_##sym)
+#endif /* __GNUCLIKE___SECTION */
+
+/*
+ * Public macros.
+ */
+#define TEXT_SET(set, sym) __MAKE_SET(set, sym)
+#define DATA_SET(set, sym) __MAKE_SET(set, sym)
+#define BSS_SET(set, sym) __MAKE_SET(set, sym)
+#define ABS_SET(set, sym) __MAKE_SET(set, sym)
+#define SET_ENTRY(set, sym) __MAKE_SET(set, sym)
+
+/*
+ * Initialize before referring to a given linker set.
+ */
+#ifdef __FreeBSD__
+#define SET_DECLARE(set, ptype) \
+ extern ptype *__CONCAT(__start_set_,set); \
+ extern ptype *__CONCAT(__stop_set_,set)
+#else
+#define SET_DECLARE(set, ptype) \
+ _Pragma(__XSTRING(weak __CONCAT(__start_set_,set))) \
+ _Pragma(__XSTRING(weak __CONCAT(__stop_set_,set))) \
+ extern ptype *__CONCAT(__start_set_,set); \
+ extern ptype *__CONCAT(__stop_set_,set)
+#endif
+
+#define SET_BEGIN(set) \
+ (&__CONCAT(__start_set_,set))
+#define SET_LIMIT(set) \
+ (&__CONCAT(__stop_set_,set))
+
+/*
+ * Iterate over all the elements of a set.
+ *
+ * Sets always contain addresses of things, and "pvar" points to words
+ * containing those addresses. Thus is must be declared as "type **pvar",
+ * and the address of each set item is obtained inside the loop by "*pvar".
+ */
+#define SET_FOREACH(pvar, set) \
+ for (pvar = SET_BEGIN(set); pvar < SET_LIMIT(set); pvar++)
+
+#define SET_ITEM(set, i) \
+ ((SET_BEGIN(set))[i])
+
+/*
+ * Provide a count of the items in a set.
+ */
+#define SET_COUNT(set) \
+ (SET_LIMIT(set) - SET_BEGIN(set))
+
+#endif /* _SYS_LINKER_SET_H_ */
diff --git a/usr/contrib/freebsd/sys/tree.h b/usr/contrib/freebsd/sys/tree.h
new file mode 100644
index 0000000000..6b47e247bb
--- /dev/null
+++ b/usr/contrib/freebsd/sys/tree.h
@@ -0,0 +1,765 @@
+/* $NetBSD: tree.h,v 1.8 2004/03/28 19:38:30 provos Exp $ */
+/* $OpenBSD: tree.h,v 1.7 2002/10/17 21:51:54 art Exp $ */
+/* $FreeBSD: head/sys/sys/tree.h 189204 2009-03-01 04:57:23Z bms $ */
+
+/*-
+ * Copyright 2002 Niels Provos <provos@citi.umich.edu>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _SYS_TREE_H_
+#define _SYS_TREE_H_
+
+#include <sys/cdefs.h>
+
+/*
+ * This file defines data structures for different types of trees:
+ * splay trees and red-black trees.
+ *
+ * A splay tree is a self-organizing data structure. Every operation
+ * on the tree causes a splay to happen. The splay moves the requested
+ * node to the root of the tree and partly rebalances it.
+ *
+ * This has the benefit that request locality causes faster lookups as
+ * the requested nodes move to the top of the tree. On the other hand,
+ * every lookup causes memory writes.
+ *
+ * The Balance Theorem bounds the total access time for m operations
+ * and n inserts on an initially empty tree as O((m + n)lg n). The
+ * amortized cost for a sequence of m accesses to a splay tree is O(lg n);
+ *
+ * A red-black tree is a binary search tree with the node color as an
+ * extra attribute. It fulfills a set of conditions:
+ * - every search path from the root to a leaf consists of the
+ * same number of black nodes,
+ * - each red node (except for the root) has a black parent,
+ * - each leaf node is black.
+ *
+ * Every operation on a red-black tree is bounded as O(lg n).
+ * The maximum height of a red-black tree is 2lg (n+1).
+ */
+
+#define SPLAY_HEAD(name, type) \
+struct name { \
+ struct type *sph_root; /* root of the tree */ \
+}
+
+#define SPLAY_INITIALIZER(root) \
+ { NULL }
+
+#define SPLAY_INIT(root) do { \
+ (root)->sph_root = NULL; \
+} while (/*CONSTCOND*/ 0)
+
+#define SPLAY_ENTRY(type) \
+struct { \
+ struct type *spe_left; /* left element */ \
+ struct type *spe_right; /* right element */ \
+}
+
+#define SPLAY_LEFT(elm, field) (elm)->field.spe_left
+#define SPLAY_RIGHT(elm, field) (elm)->field.spe_right
+#define SPLAY_ROOT(head) (head)->sph_root
+#define SPLAY_EMPTY(head) (SPLAY_ROOT(head) == NULL)
+
+/* SPLAY_ROTATE_{LEFT,RIGHT} expect that tmp hold SPLAY_{RIGHT,LEFT} */
+#define SPLAY_ROTATE_RIGHT(head, tmp, field) do { \
+ SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(tmp, field); \
+ SPLAY_RIGHT(tmp, field) = (head)->sph_root; \
+ (head)->sph_root = tmp; \
+} while (/*CONSTCOND*/ 0)
+
+#define SPLAY_ROTATE_LEFT(head, tmp, field) do { \
+ SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(tmp, field); \
+ SPLAY_LEFT(tmp, field) = (head)->sph_root; \
+ (head)->sph_root = tmp; \
+} while (/*CONSTCOND*/ 0)
+
+#define SPLAY_LINKLEFT(head, tmp, field) do { \
+ SPLAY_LEFT(tmp, field) = (head)->sph_root; \
+ tmp = (head)->sph_root; \
+ (head)->sph_root = SPLAY_LEFT((head)->sph_root, field); \
+} while (/*CONSTCOND*/ 0)
+
+#define SPLAY_LINKRIGHT(head, tmp, field) do { \
+ SPLAY_RIGHT(tmp, field) = (head)->sph_root; \
+ tmp = (head)->sph_root; \
+ (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field); \
+} while (/*CONSTCOND*/ 0)
+
+#define SPLAY_ASSEMBLE(head, node, left, right, field) do { \
+ SPLAY_RIGHT(left, field) = SPLAY_LEFT((head)->sph_root, field); \
+ SPLAY_LEFT(right, field) = SPLAY_RIGHT((head)->sph_root, field);\
+ SPLAY_LEFT((head)->sph_root, field) = SPLAY_RIGHT(node, field); \
+ SPLAY_RIGHT((head)->sph_root, field) = SPLAY_LEFT(node, field); \
+} while (/*CONSTCOND*/ 0)
+
+/* Generates prototypes and inline functions */
+
+#define SPLAY_PROTOTYPE(name, type, field, cmp) \
+void name##_SPLAY(struct name *, struct type *); \
+void name##_SPLAY_MINMAX(struct name *, int); \
+struct type *name##_SPLAY_INSERT(struct name *, struct type *); \
+struct type *name##_SPLAY_REMOVE(struct name *, struct type *); \
+ \
+/* Finds the node with the same key as elm */ \
+static __inline struct type * \
+name##_SPLAY_FIND(struct name *head, struct type *elm) \
+{ \
+ if (SPLAY_EMPTY(head)) \
+ return(NULL); \
+ name##_SPLAY(head, elm); \
+ if ((cmp)(elm, (head)->sph_root) == 0) \
+ return (head->sph_root); \
+ return (NULL); \
+} \
+ \
+static __inline struct type * \
+name##_SPLAY_NEXT(struct name *head, struct type *elm) \
+{ \
+ name##_SPLAY(head, elm); \
+ if (SPLAY_RIGHT(elm, field) != NULL) { \
+ elm = SPLAY_RIGHT(elm, field); \
+ while (SPLAY_LEFT(elm, field) != NULL) { \
+ elm = SPLAY_LEFT(elm, field); \
+ } \
+ } else \
+ elm = NULL; \
+ return (elm); \
+} \
+ \
+static __inline struct type * \
+name##_SPLAY_MIN_MAX(struct name *head, int val) \
+{ \
+ name##_SPLAY_MINMAX(head, val); \
+ return (SPLAY_ROOT(head)); \
+}
+
+/* Main splay operation.
+ * Moves node close to the key of elm to top
+ */
+#define SPLAY_GENERATE(name, type, field, cmp) \
+struct type * \
+name##_SPLAY_INSERT(struct name *head, struct type *elm) \
+{ \
+ if (SPLAY_EMPTY(head)) { \
+ SPLAY_LEFT(elm, field) = SPLAY_RIGHT(elm, field) = NULL; \
+ } else { \
+ int __comp; \
+ name##_SPLAY(head, elm); \
+ __comp = (cmp)(elm, (head)->sph_root); \
+ if(__comp < 0) { \
+ SPLAY_LEFT(elm, field) = SPLAY_LEFT((head)->sph_root, field);\
+ SPLAY_RIGHT(elm, field) = (head)->sph_root; \
+ SPLAY_LEFT((head)->sph_root, field) = NULL; \
+ } else if (__comp > 0) { \
+ SPLAY_RIGHT(elm, field) = SPLAY_RIGHT((head)->sph_root, field);\
+ SPLAY_LEFT(elm, field) = (head)->sph_root; \
+ SPLAY_RIGHT((head)->sph_root, field) = NULL; \
+ } else \
+ return ((head)->sph_root); \
+ } \
+ (head)->sph_root = (elm); \
+ return (NULL); \
+} \
+ \
+struct type * \
+name##_SPLAY_REMOVE(struct name *head, struct type *elm) \
+{ \
+ struct type *__tmp; \
+ if (SPLAY_EMPTY(head)) \
+ return (NULL); \
+ name##_SPLAY(head, elm); \
+ if ((cmp)(elm, (head)->sph_root) == 0) { \
+ if (SPLAY_LEFT((head)->sph_root, field) == NULL) { \
+ (head)->sph_root = SPLAY_RIGHT((head)->sph_root, field);\
+ } else { \
+ __tmp = SPLAY_RIGHT((head)->sph_root, field); \
+ (head)->sph_root = SPLAY_LEFT((head)->sph_root, field);\
+ name##_SPLAY(head, elm); \
+ SPLAY_RIGHT((head)->sph_root, field) = __tmp; \
+ } \
+ return (elm); \
+ } \
+ return (NULL); \
+} \
+ \
+void \
+name##_SPLAY(struct name *head, struct type *elm) \
+{ \
+ struct type __node, *__left, *__right, *__tmp; \
+ int __comp; \
+\
+ SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\
+ __left = __right = &__node; \
+\
+ while ((__comp = (cmp)(elm, (head)->sph_root)) != 0) { \
+ if (__comp < 0) { \
+ __tmp = SPLAY_LEFT((head)->sph_root, field); \
+ if (__tmp == NULL) \
+ break; \
+ if ((cmp)(elm, __tmp) < 0){ \
+ SPLAY_ROTATE_RIGHT(head, __tmp, field); \
+ if (SPLAY_LEFT((head)->sph_root, field) == NULL)\
+ break; \
+ } \
+ SPLAY_LINKLEFT(head, __right, field); \
+ } else if (__comp > 0) { \
+ __tmp = SPLAY_RIGHT((head)->sph_root, field); \
+ if (__tmp == NULL) \
+ break; \
+ if ((cmp)(elm, __tmp) > 0){ \
+ SPLAY_ROTATE_LEFT(head, __tmp, field); \
+ if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\
+ break; \
+ } \
+ SPLAY_LINKRIGHT(head, __left, field); \
+ } \
+ } \
+ SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \
+} \
+ \
+/* Splay with either the minimum or the maximum element \
+ * Used to find minimum or maximum element in tree. \
+ */ \
+void name##_SPLAY_MINMAX(struct name *head, int __comp) \
+{ \
+ struct type __node, *__left, *__right, *__tmp; \
+\
+ SPLAY_LEFT(&__node, field) = SPLAY_RIGHT(&__node, field) = NULL;\
+ __left = __right = &__node; \
+\
+ while (1) { \
+ if (__comp < 0) { \
+ __tmp = SPLAY_LEFT((head)->sph_root, field); \
+ if (__tmp == NULL) \
+ break; \
+ if (__comp < 0){ \
+ SPLAY_ROTATE_RIGHT(head, __tmp, field); \
+ if (SPLAY_LEFT((head)->sph_root, field) == NULL)\
+ break; \
+ } \
+ SPLAY_LINKLEFT(head, __right, field); \
+ } else if (__comp > 0) { \
+ __tmp = SPLAY_RIGHT((head)->sph_root, field); \
+ if (__tmp == NULL) \
+ break; \
+ if (__comp > 0) { \
+ SPLAY_ROTATE_LEFT(head, __tmp, field); \
+ if (SPLAY_RIGHT((head)->sph_root, field) == NULL)\
+ break; \
+ } \
+ SPLAY_LINKRIGHT(head, __left, field); \
+ } \
+ } \
+ SPLAY_ASSEMBLE(head, &__node, __left, __right, field); \
+}
+
+#define SPLAY_NEGINF -1
+#define SPLAY_INF 1
+
+#define SPLAY_INSERT(name, x, y) name##_SPLAY_INSERT(x, y)
+#define SPLAY_REMOVE(name, x, y) name##_SPLAY_REMOVE(x, y)
+#define SPLAY_FIND(name, x, y) name##_SPLAY_FIND(x, y)
+#define SPLAY_NEXT(name, x, y) name##_SPLAY_NEXT(x, y)
+#define SPLAY_MIN(name, x) (SPLAY_EMPTY(x) ? NULL \
+ : name##_SPLAY_MIN_MAX(x, SPLAY_NEGINF))
+#define SPLAY_MAX(name, x) (SPLAY_EMPTY(x) ? NULL \
+ : name##_SPLAY_MIN_MAX(x, SPLAY_INF))
+
+#define SPLAY_FOREACH(x, name, head) \
+ for ((x) = SPLAY_MIN(name, head); \
+ (x) != NULL; \
+ (x) = SPLAY_NEXT(name, head, x))
+
+/* Macros that define a red-black tree */
+#define RB_HEAD(name, type) \
+struct name { \
+ struct type *rbh_root; /* root of the tree */ \
+}
+
+#define RB_INITIALIZER(root) \
+ { NULL }
+
+#define RB_INIT(root) do { \
+ (root)->rbh_root = NULL; \
+} while (/*CONSTCOND*/ 0)
+
+#define RB_BLACK 0
+#define RB_RED 1
+#define RB_ENTRY(type) \
+struct { \
+ struct type *rbe_left; /* left element */ \
+ struct type *rbe_right; /* right element */ \
+ struct type *rbe_parent; /* parent element */ \
+ int rbe_color; /* node color */ \
+}
+
+#define RB_LEFT(elm, field) (elm)->field.rbe_left
+#define RB_RIGHT(elm, field) (elm)->field.rbe_right
+#define RB_PARENT(elm, field) (elm)->field.rbe_parent
+#define RB_COLOR(elm, field) (elm)->field.rbe_color
+#define RB_ROOT(head) (head)->rbh_root
+#define RB_EMPTY(head) (RB_ROOT(head) == NULL)
+
+#define RB_SET(elm, parent, field) do { \
+ RB_PARENT(elm, field) = parent; \
+ RB_LEFT(elm, field) = RB_RIGHT(elm, field) = NULL; \
+ RB_COLOR(elm, field) = RB_RED; \
+} while (/*CONSTCOND*/ 0)
+
+#define RB_SET_BLACKRED(black, red, field) do { \
+ RB_COLOR(black, field) = RB_BLACK; \
+ RB_COLOR(red, field) = RB_RED; \
+} while (/*CONSTCOND*/ 0)
+
+#ifndef RB_AUGMENT
+#define RB_AUGMENT(x) do {} while (0)
+#endif
+
+#define RB_ROTATE_LEFT(head, elm, tmp, field) do { \
+ (tmp) = RB_RIGHT(elm, field); \
+ if ((RB_RIGHT(elm, field) = RB_LEFT(tmp, field)) != NULL) { \
+ RB_PARENT(RB_LEFT(tmp, field), field) = (elm); \
+ } \
+ RB_AUGMENT(elm); \
+ if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field)) != NULL) { \
+ if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \
+ RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \
+ else \
+ RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \
+ } else \
+ (head)->rbh_root = (tmp); \
+ RB_LEFT(tmp, field) = (elm); \
+ RB_PARENT(elm, field) = (tmp); \
+ RB_AUGMENT(tmp); \
+ if ((RB_PARENT(tmp, field))) \
+ RB_AUGMENT(RB_PARENT(tmp, field)); \
+} while (/*CONSTCOND*/ 0)
+
+#define RB_ROTATE_RIGHT(head, elm, tmp, field) do { \
+ (tmp) = RB_LEFT(elm, field); \
+ if ((RB_LEFT(elm, field) = RB_RIGHT(tmp, field)) != NULL) { \
+ RB_PARENT(RB_RIGHT(tmp, field), field) = (elm); \
+ } \
+ RB_AUGMENT(elm); \
+ if ((RB_PARENT(tmp, field) = RB_PARENT(elm, field)) != NULL) { \
+ if ((elm) == RB_LEFT(RB_PARENT(elm, field), field)) \
+ RB_LEFT(RB_PARENT(elm, field), field) = (tmp); \
+ else \
+ RB_RIGHT(RB_PARENT(elm, field), field) = (tmp); \
+ } else \
+ (head)->rbh_root = (tmp); \
+ RB_RIGHT(tmp, field) = (elm); \
+ RB_PARENT(elm, field) = (tmp); \
+ RB_AUGMENT(tmp); \
+ if ((RB_PARENT(tmp, field))) \
+ RB_AUGMENT(RB_PARENT(tmp, field)); \
+} while (/*CONSTCOND*/ 0)
+
+/* Generates prototypes and inline functions */
+#define RB_PROTOTYPE(name, type, field, cmp) \
+ RB_PROTOTYPE_INTERNAL(name, type, field, cmp,)
+#define RB_PROTOTYPE_STATIC(name, type, field, cmp) \
+ RB_PROTOTYPE_INTERNAL(name, type, field, cmp, __unused static)
+#define RB_PROTOTYPE_INTERNAL(name, type, field, cmp, attr) \
+attr void name##_RB_INSERT_COLOR(struct name *, struct type *); \
+attr void name##_RB_REMOVE_COLOR(struct name *, struct type *, struct type *);\
+attr struct type *name##_RB_REMOVE(struct name *, struct type *); \
+attr struct type *name##_RB_INSERT(struct name *, struct type *); \
+attr struct type *name##_RB_FIND(struct name *, struct type *); \
+attr struct type *name##_RB_NFIND(struct name *, struct type *); \
+attr struct type *name##_RB_NEXT(struct type *); \
+attr struct type *name##_RB_PREV(struct type *); \
+attr struct type *name##_RB_MINMAX(struct name *, int); \
+ \
+
+/* Main rb operation.
+ * Moves node close to the key of elm to top
+ */
+#define RB_GENERATE(name, type, field, cmp) \
+ RB_GENERATE_INTERNAL(name, type, field, cmp,)
+#define RB_GENERATE_STATIC(name, type, field, cmp) \
+ RB_GENERATE_INTERNAL(name, type, field, cmp, __unused static)
+#define RB_GENERATE_INTERNAL(name, type, field, cmp, attr) \
+attr void \
+name##_RB_INSERT_COLOR(struct name *head, struct type *elm) \
+{ \
+ struct type *parent, *gparent, *tmp; \
+ while ((parent = RB_PARENT(elm, field)) != NULL && \
+ RB_COLOR(parent, field) == RB_RED) { \
+ gparent = RB_PARENT(parent, field); \
+ if (parent == RB_LEFT(gparent, field)) { \
+ tmp = RB_RIGHT(gparent, field); \
+ if (tmp && RB_COLOR(tmp, field) == RB_RED) { \
+ RB_COLOR(tmp, field) = RB_BLACK; \
+ RB_SET_BLACKRED(parent, gparent, field);\
+ elm = gparent; \
+ continue; \
+ } \
+ if (RB_RIGHT(parent, field) == elm) { \
+ RB_ROTATE_LEFT(head, parent, tmp, field);\
+ tmp = parent; \
+ parent = elm; \
+ elm = tmp; \
+ } \
+ RB_SET_BLACKRED(parent, gparent, field); \
+ RB_ROTATE_RIGHT(head, gparent, tmp, field); \
+ } else { \
+ tmp = RB_LEFT(gparent, field); \
+ if (tmp && RB_COLOR(tmp, field) == RB_RED) { \
+ RB_COLOR(tmp, field) = RB_BLACK; \
+ RB_SET_BLACKRED(parent, gparent, field);\
+ elm = gparent; \
+ continue; \
+ } \
+ if (RB_LEFT(parent, field) == elm) { \
+ RB_ROTATE_RIGHT(head, parent, tmp, field);\
+ tmp = parent; \
+ parent = elm; \
+ elm = tmp; \
+ } \
+ RB_SET_BLACKRED(parent, gparent, field); \
+ RB_ROTATE_LEFT(head, gparent, tmp, field); \
+ } \
+ } \
+ RB_COLOR(head->rbh_root, field) = RB_BLACK; \
+} \
+ \
+attr void \
+name##_RB_REMOVE_COLOR(struct name *head, struct type *parent, struct type *elm) \
+{ \
+ struct type *tmp; \
+ while ((elm == NULL || RB_COLOR(elm, field) == RB_BLACK) && \
+ elm != RB_ROOT(head)) { \
+ if (RB_LEFT(parent, field) == elm) { \
+ tmp = RB_RIGHT(parent, field); \
+ if (RB_COLOR(tmp, field) == RB_RED) { \
+ RB_SET_BLACKRED(tmp, parent, field); \
+ RB_ROTATE_LEFT(head, parent, tmp, field);\
+ tmp = RB_RIGHT(parent, field); \
+ } \
+ if ((RB_LEFT(tmp, field) == NULL || \
+ RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\
+ (RB_RIGHT(tmp, field) == NULL || \
+ RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\
+ RB_COLOR(tmp, field) = RB_RED; \
+ elm = parent; \
+ parent = RB_PARENT(elm, field); \
+ } else { \
+ if (RB_RIGHT(tmp, field) == NULL || \
+ RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK) {\
+ struct type *oleft; \
+ if ((oleft = RB_LEFT(tmp, field)) \
+ != NULL) \
+ RB_COLOR(oleft, field) = RB_BLACK;\
+ RB_COLOR(tmp, field) = RB_RED; \
+ RB_ROTATE_RIGHT(head, tmp, oleft, field);\
+ tmp = RB_RIGHT(parent, field); \
+ } \
+ RB_COLOR(tmp, field) = RB_COLOR(parent, field);\
+ RB_COLOR(parent, field) = RB_BLACK; \
+ if (RB_RIGHT(tmp, field)) \
+ RB_COLOR(RB_RIGHT(tmp, field), field) = RB_BLACK;\
+ RB_ROTATE_LEFT(head, parent, tmp, field);\
+ elm = RB_ROOT(head); \
+ break; \
+ } \
+ } else { \
+ tmp = RB_LEFT(parent, field); \
+ if (RB_COLOR(tmp, field) == RB_RED) { \
+ RB_SET_BLACKRED(tmp, parent, field); \
+ RB_ROTATE_RIGHT(head, parent, tmp, field);\
+ tmp = RB_LEFT(parent, field); \
+ } \
+ if ((RB_LEFT(tmp, field) == NULL || \
+ RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) &&\
+ (RB_RIGHT(tmp, field) == NULL || \
+ RB_COLOR(RB_RIGHT(tmp, field), field) == RB_BLACK)) {\
+ RB_COLOR(tmp, field) = RB_RED; \
+ elm = parent; \
+ parent = RB_PARENT(elm, field); \
+ } else { \
+ if (RB_LEFT(tmp, field) == NULL || \
+ RB_COLOR(RB_LEFT(tmp, field), field) == RB_BLACK) {\
+ struct type *oright; \
+ if ((oright = RB_RIGHT(tmp, field)) \
+ != NULL) \
+ RB_COLOR(oright, field) = RB_BLACK;\
+ RB_COLOR(tmp, field) = RB_RED; \
+ RB_ROTATE_LEFT(head, tmp, oright, field);\
+ tmp = RB_LEFT(parent, field); \
+ } \
+ RB_COLOR(tmp, field) = RB_COLOR(parent, field);\
+ RB_COLOR(parent, field) = RB_BLACK; \
+ if (RB_LEFT(tmp, field)) \
+ RB_COLOR(RB_LEFT(tmp, field), field) = RB_BLACK;\
+ RB_ROTATE_RIGHT(head, parent, tmp, field);\
+ elm = RB_ROOT(head); \
+ break; \
+ } \
+ } \
+ } \
+ if (elm) \
+ RB_COLOR(elm, field) = RB_BLACK; \
+} \
+ \
+attr struct type * \
+name##_RB_REMOVE(struct name *head, struct type *elm) \
+{ \
+ struct type *child, *parent, *old = elm; \
+ int color; \
+ if (RB_LEFT(elm, field) == NULL) \
+ child = RB_RIGHT(elm, field); \
+ else if (RB_RIGHT(elm, field) == NULL) \
+ child = RB_LEFT(elm, field); \
+ else { \
+ struct type *left; \
+ elm = RB_RIGHT(elm, field); \
+ while ((left = RB_LEFT(elm, field)) != NULL) \
+ elm = left; \
+ child = RB_RIGHT(elm, field); \
+ parent = RB_PARENT(elm, field); \
+ color = RB_COLOR(elm, field); \
+ if (child) \
+ RB_PARENT(child, field) = parent; \
+ if (parent) { \
+ if (RB_LEFT(parent, field) == elm) \
+ RB_LEFT(parent, field) = child; \
+ else \
+ RB_RIGHT(parent, field) = child; \
+ RB_AUGMENT(parent); \
+ } else \
+ RB_ROOT(head) = child; \
+ if (RB_PARENT(elm, field) == old) \
+ parent = elm; \
+ (elm)->field = (old)->field; \
+ if (RB_PARENT(old, field)) { \
+ if (RB_LEFT(RB_PARENT(old, field), field) == old)\
+ RB_LEFT(RB_PARENT(old, field), field) = elm;\
+ else \
+ RB_RIGHT(RB_PARENT(old, field), field) = elm;\
+ RB_AUGMENT(RB_PARENT(old, field)); \
+ } else \
+ RB_ROOT(head) = elm; \
+ RB_PARENT(RB_LEFT(old, field), field) = elm; \
+ if (RB_RIGHT(old, field)) \
+ RB_PARENT(RB_RIGHT(old, field), field) = elm; \
+ if (parent) { \
+ left = parent; \
+ do { \
+ RB_AUGMENT(left); \
+ } while ((left = RB_PARENT(left, field)) != NULL); \
+ } \
+ goto color; \
+ } \
+ parent = RB_PARENT(elm, field); \
+ color = RB_COLOR(elm, field); \
+ if (child) \
+ RB_PARENT(child, field) = parent; \
+ if (parent) { \
+ if (RB_LEFT(parent, field) == elm) \
+ RB_LEFT(parent, field) = child; \
+ else \
+ RB_RIGHT(parent, field) = child; \
+ RB_AUGMENT(parent); \
+ } else \
+ RB_ROOT(head) = child; \
+color: \
+ if (color == RB_BLACK) \
+ name##_RB_REMOVE_COLOR(head, parent, child); \
+ return (old); \
+} \
+ \
+/* Inserts a node into the RB tree */ \
+attr struct type * \
+name##_RB_INSERT(struct name *head, struct type *elm) \
+{ \
+ struct type *tmp; \
+ struct type *parent = NULL; \
+ int comp = 0; \
+ tmp = RB_ROOT(head); \
+ while (tmp) { \
+ parent = tmp; \
+ comp = (cmp)(elm, parent); \
+ if (comp < 0) \
+ tmp = RB_LEFT(tmp, field); \
+ else if (comp > 0) \
+ tmp = RB_RIGHT(tmp, field); \
+ else \
+ return (tmp); \
+ } \
+ RB_SET(elm, parent, field); \
+ if (parent != NULL) { \
+ if (comp < 0) \
+ RB_LEFT(parent, field) = elm; \
+ else \
+ RB_RIGHT(parent, field) = elm; \
+ RB_AUGMENT(parent); \
+ } else \
+ RB_ROOT(head) = elm; \
+ name##_RB_INSERT_COLOR(head, elm); \
+ return (NULL); \
+} \
+ \
+/* Finds the node with the same key as elm */ \
+attr struct type * \
+name##_RB_FIND(struct name *head, struct type *elm) \
+{ \
+ struct type *tmp = RB_ROOT(head); \
+ int comp; \
+ while (tmp) { \
+ comp = cmp(elm, tmp); \
+ if (comp < 0) \
+ tmp = RB_LEFT(tmp, field); \
+ else if (comp > 0) \
+ tmp = RB_RIGHT(tmp, field); \
+ else \
+ return (tmp); \
+ } \
+ return (NULL); \
+} \
+ \
+/* Finds the first node greater than or equal to the search key */ \
+attr struct type * \
+name##_RB_NFIND(struct name *head, struct type *elm) \
+{ \
+ struct type *tmp = RB_ROOT(head); \
+ struct type *res = NULL; \
+ int comp; \
+ while (tmp) { \
+ comp = cmp(elm, tmp); \
+ if (comp < 0) { \
+ res = tmp; \
+ tmp = RB_LEFT(tmp, field); \
+ } \
+ else if (comp > 0) \
+ tmp = RB_RIGHT(tmp, field); \
+ else \
+ return (tmp); \
+ } \
+ return (res); \
+} \
+ \
+/* ARGSUSED */ \
+attr struct type * \
+name##_RB_NEXT(struct type *elm) \
+{ \
+ if (RB_RIGHT(elm, field)) { \
+ elm = RB_RIGHT(elm, field); \
+ while (RB_LEFT(elm, field)) \
+ elm = RB_LEFT(elm, field); \
+ } else { \
+ if (RB_PARENT(elm, field) && \
+ (elm == RB_LEFT(RB_PARENT(elm, field), field))) \
+ elm = RB_PARENT(elm, field); \
+ else { \
+ while (RB_PARENT(elm, field) && \
+ (elm == RB_RIGHT(RB_PARENT(elm, field), field)))\
+ elm = RB_PARENT(elm, field); \
+ elm = RB_PARENT(elm, field); \
+ } \
+ } \
+ return (elm); \
+} \
+ \
+/* ARGSUSED */ \
+attr struct type * \
+name##_RB_PREV(struct type *elm) \
+{ \
+ if (RB_LEFT(elm, field)) { \
+ elm = RB_LEFT(elm, field); \
+ while (RB_RIGHT(elm, field)) \
+ elm = RB_RIGHT(elm, field); \
+ } else { \
+ if (RB_PARENT(elm, field) && \
+ (elm == RB_RIGHT(RB_PARENT(elm, field), field))) \
+ elm = RB_PARENT(elm, field); \
+ else { \
+ while (RB_PARENT(elm, field) && \
+ (elm == RB_LEFT(RB_PARENT(elm, field), field)))\
+ elm = RB_PARENT(elm, field); \
+ elm = RB_PARENT(elm, field); \
+ } \
+ } \
+ return (elm); \
+} \
+ \
+attr struct type * \
+name##_RB_MINMAX(struct name *head, int val) \
+{ \
+ struct type *tmp = RB_ROOT(head); \
+ struct type *parent = NULL; \
+ while (tmp) { \
+ parent = tmp; \
+ if (val < 0) \
+ tmp = RB_LEFT(tmp, field); \
+ else \
+ tmp = RB_RIGHT(tmp, field); \
+ } \
+ return (parent); \
+}
+
+#define RB_NEGINF -1
+#define RB_INF 1
+
+#define RB_INSERT(name, x, y) name##_RB_INSERT(x, y)
+#define RB_REMOVE(name, x, y) name##_RB_REMOVE(x, y)
+#define RB_FIND(name, x, y) name##_RB_FIND(x, y)
+#define RB_NFIND(name, x, y) name##_RB_NFIND(x, y)
+#define RB_NEXT(name, x, y) name##_RB_NEXT(y)
+#define RB_PREV(name, x, y) name##_RB_PREV(y)
+#define RB_MIN(name, x) name##_RB_MINMAX(x, RB_NEGINF)
+#define RB_MAX(name, x) name##_RB_MINMAX(x, RB_INF)
+
+#define RB_FOREACH(x, name, head) \
+ for ((x) = RB_MIN(name, head); \
+ (x) != NULL; \
+ (x) = name##_RB_NEXT(x))
+
+#define RB_FOREACH_FROM(x, name, y) \
+ for ((x) = (y); \
+ ((x) != NULL) && ((y) = name##_RB_NEXT(x), (x) != NULL); \
+ (x) = (y))
+
+#define RB_FOREACH_SAFE(x, name, head, y) \
+ for ((x) = RB_MIN(name, head); \
+ ((x) != NULL) && ((y) = name##_RB_NEXT(x), (x) != NULL); \
+ (x) = (y))
+
+#define RB_FOREACH_REVERSE(x, name, head) \
+ for ((x) = RB_MAX(name, head); \
+ (x) != NULL; \
+ (x) = name##_RB_PREV(x))
+
+#define RB_FOREACH_REVERSE_FROM(x, name, y) \
+ for ((x) = (y); \
+ ((x) != NULL) && ((y) = name##_RB_PREV(x), (x) != NULL); \
+ (x) = (y))
+
+#define RB_FOREACH_REVERSE_SAFE(x, name, head, y) \
+ for ((x) = RB_MAX(name, head); \
+ ((x) != NULL) && ((y) = name##_RB_PREV(x), (x) != NULL); \
+ (x) = (y))
+
+#endif /* _SYS_TREE_H_ */
diff --git a/usr/contrib/freebsd/x86/apicreg.h b/usr/contrib/freebsd/x86/apicreg.h
new file mode 100644
index 0000000000..24006e2733
--- /dev/null
+++ b/usr/contrib/freebsd/x86/apicreg.h
@@ -0,0 +1,455 @@
+/*-
+ * Copyright (c) 1996, by Peter Wemm and Steve Passe
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. The name of the developer may NOT be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/x86/include/apicreg.h 259140 2013-12-09 21:08:52Z jhb $
+ */
+
+#ifndef _X86_APICREG_H_
+#define _X86_APICREG_H_
+
+/*
+ * Local && I/O APIC definitions.
+ */
+
+/*
+ * Pentium P54C+ Built-in APIC
+ * (Advanced programmable Interrupt Controller)
+ *
+ * Base Address of Built-in APIC in memory location
+ * is 0xfee00000.
+ *
+ * Map of APIC Registers:
+ *
+ * Offset (hex) Description Read/Write state
+ * 000 Reserved
+ * 010 Reserved
+ * 020 ID Local APIC ID R/W
+ * 030 VER Local APIC Version R
+ * 040 Reserved
+ * 050 Reserved
+ * 060 Reserved
+ * 070 Reserved
+ * 080 Task Priority Register R/W
+ * 090 Arbitration Priority Register R
+ * 0A0 Processor Priority Register R
+ * 0B0 EOI Register W
+ * 0C0 RRR Remote read R
+ * 0D0 Logical Destination R/W
+ * 0E0 Destination Format Register 0..27 R; 28..31 R/W
+ * 0F0 SVR Spurious Interrupt Vector Reg. 0..3 R; 4..9 R/W
+ * 100 ISR 000-031 R
+ * 110 ISR 032-063 R
+ * 120 ISR 064-095 R
+ * 130 ISR 095-128 R
+ * 140 ISR 128-159 R
+ * 150 ISR 160-191 R
+ * 160 ISR 192-223 R
+ * 170 ISR 224-255 R
+ * 180 TMR 000-031 R
+ * 190 TMR 032-063 R
+ * 1A0 TMR 064-095 R
+ * 1B0 TMR 095-128 R
+ * 1C0 TMR 128-159 R
+ * 1D0 TMR 160-191 R
+ * 1E0 TMR 192-223 R
+ * 1F0 TMR 224-255 R
+ * 200 IRR 000-031 R
+ * 210 IRR 032-063 R
+ * 220 IRR 064-095 R
+ * 230 IRR 095-128 R
+ * 240 IRR 128-159 R
+ * 250 IRR 160-191 R
+ * 260 IRR 192-223 R
+ * 270 IRR 224-255 R
+ * 280 Error Status Register R
+ * 290 Reserved
+ * 2A0 Reserved
+ * 2B0 Reserved
+ * 2C0 Reserved
+ * 2D0 Reserved
+ * 2E0 Reserved
+ * 2F0 Local Vector Table (CMCI) R/W
+ * 300 ICR_LOW Interrupt Command Reg. (0-31) R/W
+ * 310 ICR_HI Interrupt Command Reg. (32-63) R/W
+ * 320 Local Vector Table (Timer) R/W
+ * 330 Local Vector Table (Thermal) R/W (PIV+)
+ * 340 Local Vector Table (Performance) R/W (P6+)
+ * 350 LVT1 Local Vector Table (LINT0) R/W
+ * 360 LVT2 Local Vector Table (LINT1) R/W
+ * 370 LVT3 Local Vector Table (ERROR) R/W
+ * 380 Initial Count Reg. for Timer R/W
+ * 390 Current Count of Timer R
+ * 3A0 Reserved
+ * 3B0 Reserved
+ * 3C0 Reserved
+ * 3D0 Reserved
+ * 3E0 Timer Divide Configuration Reg. R/W
+ * 3F0 Reserved
+ */
+
+
+/******************************************************************************
+ * global defines, etc.
+ */
+
+
+/******************************************************************************
+ * LOCAL APIC structure
+ */
+
+#ifndef LOCORE
+#include <sys/types.h>
+
+#define PAD3 int : 32; int : 32; int : 32
+#define PAD4 int : 32; int : 32; int : 32; int : 32
+
+struct LAPIC {
+ /* reserved */ PAD4;
+ /* reserved */ PAD4;
+ u_int32_t id; PAD3;
+ u_int32_t version; PAD3;
+ /* reserved */ PAD4;
+ /* reserved */ PAD4;
+ /* reserved */ PAD4;
+ /* reserved */ PAD4;
+ u_int32_t tpr; PAD3;
+ u_int32_t apr; PAD3;
+ u_int32_t ppr; PAD3;
+ u_int32_t eoi; PAD3;
+ /* reserved */ PAD4;
+ u_int32_t ldr; PAD3;
+ u_int32_t dfr; PAD3;
+ u_int32_t svr; PAD3;
+ u_int32_t isr0; PAD3;
+ u_int32_t isr1; PAD3;
+ u_int32_t isr2; PAD3;
+ u_int32_t isr3; PAD3;
+ u_int32_t isr4; PAD3;
+ u_int32_t isr5; PAD3;
+ u_int32_t isr6; PAD3;
+ u_int32_t isr7; PAD3;
+ u_int32_t tmr0; PAD3;
+ u_int32_t tmr1; PAD3;
+ u_int32_t tmr2; PAD3;
+ u_int32_t tmr3; PAD3;
+ u_int32_t tmr4; PAD3;
+ u_int32_t tmr5; PAD3;
+ u_int32_t tmr6; PAD3;
+ u_int32_t tmr7; PAD3;
+ u_int32_t irr0; PAD3;
+ u_int32_t irr1; PAD3;
+ u_int32_t irr2; PAD3;
+ u_int32_t irr3; PAD3;
+ u_int32_t irr4; PAD3;
+ u_int32_t irr5; PAD3;
+ u_int32_t irr6; PAD3;
+ u_int32_t irr7; PAD3;
+ u_int32_t esr; PAD3;
+ /* reserved */ PAD4;
+ /* reserved */ PAD4;
+ /* reserved */ PAD4;
+ /* reserved */ PAD4;
+ /* reserved */ PAD4;
+ /* reserved */ PAD4;
+ u_int32_t lvt_cmci; PAD3;
+ u_int32_t icr_lo; PAD3;
+ u_int32_t icr_hi; PAD3;
+ u_int32_t lvt_timer; PAD3;
+ u_int32_t lvt_thermal; PAD3;
+ u_int32_t lvt_pcint; PAD3;
+ u_int32_t lvt_lint0; PAD3;
+ u_int32_t lvt_lint1; PAD3;
+ u_int32_t lvt_error; PAD3;
+ u_int32_t icr_timer; PAD3;
+ u_int32_t ccr_timer; PAD3;
+ /* reserved */ PAD4;
+ /* reserved */ PAD4;
+ /* reserved */ PAD4;
+ /* reserved */ PAD4;
+ u_int32_t dcr_timer; PAD3;
+ /* reserved */ PAD4;
+};
+
+typedef struct LAPIC lapic_t;
+
+/******************************************************************************
+ * I/O APIC structure
+ */
+
+struct IOAPIC {
+ u_int32_t ioregsel; PAD3;
+ u_int32_t iowin; PAD3;
+};
+
+typedef struct IOAPIC ioapic_t;
+
+#undef PAD4
+#undef PAD3
+
+#endif /* !LOCORE */
+
+
+/******************************************************************************
+ * various code 'logical' values
+ */
+
+/******************************************************************************
+ * LOCAL APIC defines
+ */
+
+/* default physical locations of LOCAL (CPU) APICs */
+#define DEFAULT_APIC_BASE 0xfee00000
+
+/* constants relating to APIC ID registers */
+#define APIC_ID_MASK 0xff000000
+#define APIC_ID_SHIFT 24
+#define APIC_ID_CLUSTER 0xf0
+#define APIC_ID_CLUSTER_ID 0x0f
+#define APIC_MAX_CLUSTER 0xe
+#define APIC_MAX_INTRACLUSTER_ID 3
+#define APIC_ID_CLUSTER_SHIFT 4
+
+/* fields in VER */
+#define APIC_VER_VERSION 0x000000ff
+#define APIC_VER_MAXLVT 0x00ff0000
+#define MAXLVTSHIFT 16
+#define APIC_VER_EOI_SUPPRESSION 0x01000000
+
+/* fields in LDR */
+#define APIC_LDR_RESERVED 0x00ffffff
+
+/* fields in DFR */
+#define APIC_DFR_RESERVED 0x0fffffff
+#define APIC_DFR_MODEL_MASK 0xf0000000
+#define APIC_DFR_MODEL_FLAT 0xf0000000
+#define APIC_DFR_MODEL_CLUSTER 0x00000000
+
+/* fields in SVR */
+#define APIC_SVR_VECTOR 0x000000ff
+#define APIC_SVR_VEC_PROG 0x000000f0
+#define APIC_SVR_VEC_FIX 0x0000000f
+#define APIC_SVR_ENABLE 0x00000100
+# define APIC_SVR_SWDIS 0x00000000
+# define APIC_SVR_SWEN 0x00000100
+#define APIC_SVR_FOCUS 0x00000200
+# define APIC_SVR_FEN 0x00000000
+# define APIC_SVR_FDIS 0x00000200
+#define APIC_SVR_EOI_SUPPRESSION 0x00001000
+
+/* fields in TPR */
+#define APIC_TPR_PRIO 0x000000ff
+# define APIC_TPR_INT 0x000000f0
+# define APIC_TPR_SUB 0x0000000f
+
+/* fields in ESR */
+#define APIC_ESR_SEND_CS_ERROR 0x00000001
+#define APIC_ESR_RECEIVE_CS_ERROR 0x00000002
+#define APIC_ESR_SEND_ACCEPT 0x00000004
+#define APIC_ESR_RECEIVE_ACCEPT 0x00000008
+#define APIC_ESR_SEND_ILLEGAL_VECTOR 0x00000020
+#define APIC_ESR_RECEIVE_ILLEGAL_VECTOR 0x00000040
+#define APIC_ESR_ILLEGAL_REGISTER 0x00000080
+
+/* fields in ICR_LOW */
+#define APIC_VECTOR_MASK 0x000000ff
+
+#define APIC_DELMODE_MASK 0x00000700
+# define APIC_DELMODE_FIXED 0x00000000
+# define APIC_DELMODE_LOWPRIO 0x00000100
+# define APIC_DELMODE_SMI 0x00000200
+# define APIC_DELMODE_RR 0x00000300
+# define APIC_DELMODE_NMI 0x00000400
+# define APIC_DELMODE_INIT 0x00000500
+# define APIC_DELMODE_STARTUP 0x00000600
+# define APIC_DELMODE_RESV 0x00000700
+
+#define APIC_DESTMODE_MASK 0x00000800
+# define APIC_DESTMODE_PHY 0x00000000
+# define APIC_DESTMODE_LOG 0x00000800
+
+#define APIC_DELSTAT_MASK 0x00001000
+# define APIC_DELSTAT_IDLE 0x00000000
+# define APIC_DELSTAT_PEND 0x00001000
+
+#define APIC_RESV1_MASK 0x00002000
+
+#define APIC_LEVEL_MASK 0x00004000
+# define APIC_LEVEL_DEASSERT 0x00000000
+# define APIC_LEVEL_ASSERT 0x00004000
+
+#define APIC_TRIGMOD_MASK 0x00008000
+# define APIC_TRIGMOD_EDGE 0x00000000
+# define APIC_TRIGMOD_LEVEL 0x00008000
+
+#define APIC_RRSTAT_MASK 0x00030000
+# define APIC_RRSTAT_INVALID 0x00000000
+# define APIC_RRSTAT_INPROG 0x00010000
+# define APIC_RRSTAT_VALID 0x00020000
+# define APIC_RRSTAT_RESV 0x00030000
+
+#define APIC_DEST_MASK 0x000c0000
+# define APIC_DEST_DESTFLD 0x00000000
+# define APIC_DEST_SELF 0x00040000
+# define APIC_DEST_ALLISELF 0x00080000
+# define APIC_DEST_ALLESELF 0x000c0000
+
+#define APIC_RESV2_MASK 0xfff00000
+
+#define APIC_ICRLO_RESV_MASK (APIC_RESV1_MASK | APIC_RESV2_MASK)
+
+/* fields in LVT1/2 */
+#define APIC_LVT_VECTOR 0x000000ff
+#define APIC_LVT_DM 0x00000700
+# define APIC_LVT_DM_FIXED 0x00000000
+# define APIC_LVT_DM_SMI 0x00000200
+# define APIC_LVT_DM_NMI 0x00000400
+# define APIC_LVT_DM_INIT 0x00000500
+# define APIC_LVT_DM_EXTINT 0x00000700
+#define APIC_LVT_DS 0x00001000
+#define APIC_LVT_IIPP 0x00002000
+#define APIC_LVT_IIPP_INTALO 0x00002000
+#define APIC_LVT_IIPP_INTAHI 0x00000000
+#define APIC_LVT_RIRR 0x00004000
+#define APIC_LVT_TM 0x00008000
+#define APIC_LVT_M 0x00010000
+
+
+/* fields in LVT Timer */
+#define APIC_LVTT_VECTOR 0x000000ff
+#define APIC_LVTT_DS 0x00001000
+#define APIC_LVTT_M 0x00010000
+#define APIC_LVTT_TM 0x00020000
+# define APIC_LVTT_TM_ONE_SHOT 0x00000000
+# define APIC_LVTT_TM_PERIODIC 0x00020000
+
+
+/* APIC timer current count */
+#define APIC_TIMER_MAX_COUNT 0xffffffff
+
+/* fields in TDCR */
+#define APIC_TDCR_2 0x00
+#define APIC_TDCR_4 0x01
+#define APIC_TDCR_8 0x02
+#define APIC_TDCR_16 0x03
+#define APIC_TDCR_32 0x08
+#define APIC_TDCR_64 0x09
+#define APIC_TDCR_128 0x0a
+#define APIC_TDCR_1 0x0b
+
+/* LVT table indices */
+#define APIC_LVT_LINT0 0
+#define APIC_LVT_LINT1 1
+#define APIC_LVT_TIMER 2
+#define APIC_LVT_ERROR 3
+#define APIC_LVT_PMC 4
+#define APIC_LVT_THERMAL 5
+#define APIC_LVT_CMCI 6
+#define APIC_LVT_MAX APIC_LVT_CMCI
+
+/******************************************************************************
+ * I/O APIC defines
+ */
+
+/* default physical locations of an IO APIC */
+#define DEFAULT_IO_APIC_BASE 0xfec00000
+
+/* window register offset */
+#define IOAPIC_WINDOW 0x10
+#define IOAPIC_EOIR 0x40
+
+/* indexes into IO APIC */
+#define IOAPIC_ID 0x00
+#define IOAPIC_VER 0x01
+#define IOAPIC_ARB 0x02
+#define IOAPIC_REDTBL 0x10
+#define IOAPIC_REDTBL0 IOAPIC_REDTBL
+#define IOAPIC_REDTBL1 (IOAPIC_REDTBL+0x02)
+#define IOAPIC_REDTBL2 (IOAPIC_REDTBL+0x04)
+#define IOAPIC_REDTBL3 (IOAPIC_REDTBL+0x06)
+#define IOAPIC_REDTBL4 (IOAPIC_REDTBL+0x08)
+#define IOAPIC_REDTBL5 (IOAPIC_REDTBL+0x0a)
+#define IOAPIC_REDTBL6 (IOAPIC_REDTBL+0x0c)
+#define IOAPIC_REDTBL7 (IOAPIC_REDTBL+0x0e)
+#define IOAPIC_REDTBL8 (IOAPIC_REDTBL+0x10)
+#define IOAPIC_REDTBL9 (IOAPIC_REDTBL+0x12)
+#define IOAPIC_REDTBL10 (IOAPIC_REDTBL+0x14)
+#define IOAPIC_REDTBL11 (IOAPIC_REDTBL+0x16)
+#define IOAPIC_REDTBL12 (IOAPIC_REDTBL+0x18)
+#define IOAPIC_REDTBL13 (IOAPIC_REDTBL+0x1a)
+#define IOAPIC_REDTBL14 (IOAPIC_REDTBL+0x1c)
+#define IOAPIC_REDTBL15 (IOAPIC_REDTBL+0x1e)
+#define IOAPIC_REDTBL16 (IOAPIC_REDTBL+0x20)
+#define IOAPIC_REDTBL17 (IOAPIC_REDTBL+0x22)
+#define IOAPIC_REDTBL18 (IOAPIC_REDTBL+0x24)
+#define IOAPIC_REDTBL19 (IOAPIC_REDTBL+0x26)
+#define IOAPIC_REDTBL20 (IOAPIC_REDTBL+0x28)
+#define IOAPIC_REDTBL21 (IOAPIC_REDTBL+0x2a)
+#define IOAPIC_REDTBL22 (IOAPIC_REDTBL+0x2c)
+#define IOAPIC_REDTBL23 (IOAPIC_REDTBL+0x2e)
+
+/* fields in VER */
+#define IOART_VER_VERSION 0x000000ff
+#define IOART_VER_MAXREDIR 0x00ff0000
+#define MAXREDIRSHIFT 16
+
+/*
+ * fields in the IO APIC's redirection table entries
+ */
+#define IOART_DEST APIC_ID_MASK /* broadcast addr: all APICs */
+
+#define IOART_RESV 0x00fe0000 /* reserved */
+
+#define IOART_INTMASK 0x00010000 /* R/W: INTerrupt mask */
+# define IOART_INTMCLR 0x00000000 /* clear, allow INTs */
+# define IOART_INTMSET 0x00010000 /* set, inhibit INTs */
+
+#define IOART_TRGRMOD 0x00008000 /* R/W: trigger mode */
+# define IOART_TRGREDG 0x00000000 /* edge */
+# define IOART_TRGRLVL 0x00008000 /* level */
+
+#define IOART_REM_IRR 0x00004000 /* RO: remote IRR */
+
+#define IOART_INTPOL 0x00002000 /* R/W: INT input pin polarity */
+# define IOART_INTAHI 0x00000000 /* active high */
+# define IOART_INTALO 0x00002000 /* active low */
+
+#define IOART_DELIVS 0x00001000 /* RO: delivery status */
+
+#define IOART_DESTMOD 0x00000800 /* R/W: destination mode */
+# define IOART_DESTPHY 0x00000000 /* physical */
+# define IOART_DESTLOG 0x00000800 /* logical */
+
+#define IOART_DELMOD 0x00000700 /* R/W: delivery mode */
+# define IOART_DELFIXED 0x00000000 /* fixed */
+# define IOART_DELLOPRI 0x00000100 /* lowest priority */
+# define IOART_DELSMI 0x00000200 /* System Management INT */
+# define IOART_DELRSV1 0x00000300 /* reserved */
+# define IOART_DELNMI 0x00000400 /* NMI signal */
+# define IOART_DELINIT 0x00000500 /* INIT signal */
+# define IOART_DELRSV2 0x00000600 /* reserved */
+# define IOART_DELEXINT 0x00000700 /* External INTerrupt */
+
+#define IOART_INTVEC 0x000000ff /* R/W: INTerrupt vector field */
+
+#endif /* _X86_APICREG_H_ */
diff --git a/usr/contrib/freebsd/x86/mptable.h b/usr/contrib/freebsd/x86/mptable.h
new file mode 100644
index 0000000000..8f3c62a295
--- /dev/null
+++ b/usr/contrib/freebsd/x86/mptable.h
@@ -0,0 +1,204 @@
+/*-
+ * Copyright (c) 1996, by Steve Passe
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. The name of the developer may NOT be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/x86/include/mptable.h 259228 2013-12-11 21:19:04Z jhb $
+ */
+
+#ifndef __MACHINE_MPTABLE_H__
+#define __MACHINE_MPTABLE_H__
+
+enum busTypes {
+ NOBUS = 0,
+ CBUS = 1,
+ CBUSII = 2,
+ EISA = 3,
+ ISA = 6,
+ MCA = 9,
+ PCI = 13,
+ XPRESS = 18,
+ MAX_BUSTYPE = 18,
+ UNKNOWN_BUSTYPE = 0xff
+};
+
+/* MP Floating Pointer Structure */
+typedef struct MPFPS {
+ uint8_t signature[4];
+ uint32_t pap;
+ uint8_t length;
+ uint8_t spec_rev;
+ uint8_t checksum;
+ uint8_t config_type;
+ uint8_t mpfb2;
+ uint8_t mpfb3;
+ uint8_t mpfb4;
+ uint8_t mpfb5;
+} __packed *mpfps_t;
+
+#define MPFB2_IMCR_PRESENT 0x80
+#define MPFB2_MUL_CLK_SRCS 0x40
+
+/* MP Configuration Table Header */
+typedef struct MPCTH {
+ uint8_t signature[4];
+ uint16_t base_table_length;
+ uint8_t spec_rev;
+ uint8_t checksum;
+ uint8_t oem_id[8];
+ uint8_t product_id[12];
+ uint32_t oem_table_pointer;
+ uint16_t oem_table_size;
+ uint16_t entry_count;
+ uint32_t apic_address;
+ uint16_t extended_table_length;
+ uint8_t extended_table_checksum;
+ uint8_t reserved;
+} __packed *mpcth_t;
+
+/* Base table entries */
+
+#define MPCT_ENTRY_PROCESSOR 0
+#define MPCT_ENTRY_BUS 1
+#define MPCT_ENTRY_IOAPIC 2
+#define MPCT_ENTRY_INT 3
+#define MPCT_ENTRY_LOCAL_INT 4
+
+typedef struct PROCENTRY {
+ uint8_t type;
+ uint8_t apic_id;
+ uint8_t apic_version;
+ uint8_t cpu_flags;
+ uint32_t cpu_signature;
+ uint32_t feature_flags;
+ uint32_t reserved1;
+ uint32_t reserved2;
+} __packed *proc_entry_ptr;
+
+#define PROCENTRY_FLAG_EN 0x01
+#define PROCENTRY_FLAG_BP 0x02
+
+typedef struct BUSENTRY {
+ uint8_t type;
+ uint8_t bus_id;
+ uint8_t bus_type[6];
+} __packed *bus_entry_ptr;
+
+typedef struct IOAPICENTRY {
+ uint8_t type;
+ uint8_t apic_id;
+ uint8_t apic_version;
+ uint8_t apic_flags;
+ uint32_t apic_address;
+} __packed *io_apic_entry_ptr;
+
+#define IOAPICENTRY_FLAG_EN 0x01
+
+typedef struct INTENTRY {
+ uint8_t type;
+ uint8_t int_type;
+ uint16_t int_flags;
+ uint8_t src_bus_id;
+ uint8_t src_bus_irq;
+ uint8_t dst_apic_id;
+ uint8_t dst_apic_int;
+} __packed *int_entry_ptr;
+
+#define INTENTRY_TYPE_INT 0
+#define INTENTRY_TYPE_NMI 1
+#define INTENTRY_TYPE_SMI 2
+#define INTENTRY_TYPE_EXTINT 3
+
+#define INTENTRY_FLAGS_POLARITY 0x3
+#define INTENTRY_FLAGS_POLARITY_CONFORM 0x0
+#define INTENTRY_FLAGS_POLARITY_ACTIVEHI 0x1
+#define INTENTRY_FLAGS_POLARITY_ACTIVELO 0x3
+#define INTENTRY_FLAGS_TRIGGER 0xc
+#define INTENTRY_FLAGS_TRIGGER_CONFORM 0x0
+#define INTENTRY_FLAGS_TRIGGER_EDGE 0x4
+#define INTENTRY_FLAGS_TRIGGER_LEVEL 0xc
+
+/* Extended table entries */
+
+typedef struct EXTENTRY {
+ uint8_t type;
+ uint8_t length;
+} __packed *ext_entry_ptr;
+
+#define MPCT_EXTENTRY_SAS 0x80
+#define MPCT_EXTENTRY_BHD 0x81
+#define MPCT_EXTENTRY_CBASM 0x82
+
+typedef struct SASENTRY {
+ uint8_t type;
+ uint8_t length;
+ uint8_t bus_id;
+ uint8_t address_type;
+ uint64_t address_base;
+ uint64_t address_length;
+} __packed *sas_entry_ptr;
+
+#define SASENTRY_TYPE_IO 0
+#define SASENTRY_TYPE_MEMORY 1
+#define SASENTRY_TYPE_PREFETCH 2
+
+typedef struct BHDENTRY {
+ uint8_t type;
+ uint8_t length;
+ uint8_t bus_id;
+ uint8_t bus_info;
+ uint8_t parent_bus;
+ uint8_t reserved[3];
+} __packed *bhd_entry_ptr;
+
+#define BHDENTRY_INFO_SUBTRACTIVE_DECODE 0x1
+
+typedef struct CBASMENTRY {
+ uint8_t type;
+ uint8_t length;
+ uint8_t bus_id;
+ uint8_t address_mod;
+ uint32_t predefined_range;
+} __packed *cbasm_entry_ptr;
+
+#define CBASMENTRY_ADDRESS_MOD_ADD 0x0
+#define CBASMENTRY_ADDRESS_MOD_SUBTRACT 0x1
+
+#define CBASMENTRY_RANGE_ISA_IO 0
+#define CBASMENTRY_RANGE_VGA_IO 1
+
+#ifdef _KERNEL
+struct mptable_hostb_softc {
+#ifdef NEW_PCIB
+ struct pcib_host_resources sc_host_res;
+ int sc_decodes_vga_io;
+ int sc_decodes_isa_io;
+#endif
+};
+
+#ifdef NEW_PCIB
+void mptable_pci_host_res_init(device_t pcib);
+#endif
+int mptable_pci_probe_table(int bus);
+int mptable_pci_route_interrupt(device_t pcib, device_t dev, int pin);
+#endif
+#endif /* !__MACHINE_MPTABLE_H__ */
diff --git a/usr/contrib/freebsd/x86/psl.h b/usr/contrib/freebsd/x86/psl.h
new file mode 100644
index 0000000000..6934b4feb7
--- /dev/null
+++ b/usr/contrib/freebsd/x86/psl.h
@@ -0,0 +1,92 @@
+/*-
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)psl.h 5.2 (Berkeley) 1/18/91
+ * $FreeBSD: head/sys/x86/include/psl.h 258135 2013-11-14 15:37:20Z emaste $
+ */
+
+#ifndef _MACHINE_PSL_H_
+#define _MACHINE_PSL_H_
+
+/*
+ * 386 processor status longword.
+ */
+#define PSL_C 0x00000001 /* carry bit */
+#define PSL_PF 0x00000004 /* parity bit */
+#define PSL_AF 0x00000010 /* bcd carry bit */
+#define PSL_Z 0x00000040 /* zero bit */
+#define PSL_N 0x00000080 /* negative bit */
+#define PSL_T 0x00000100 /* trace enable bit */
+#define PSL_I 0x00000200 /* interrupt enable bit */
+#define PSL_D 0x00000400 /* string instruction direction bit */
+#define PSL_V 0x00000800 /* overflow bit */
+#define PSL_IOPL 0x00003000 /* i/o privilege level */
+#define PSL_NT 0x00004000 /* nested task bit */
+#define PSL_RF 0x00010000 /* resume flag bit */
+#define PSL_VM 0x00020000 /* virtual 8086 mode bit */
+#define PSL_AC 0x00040000 /* alignment checking */
+#define PSL_VIF 0x00080000 /* virtual interrupt enable */
+#define PSL_VIP 0x00100000 /* virtual interrupt pending */
+#define PSL_ID 0x00200000 /* identification bit */
+
+/*
+ * The i486 manual says that we are not supposed to change reserved flags,
+ * but this is too much trouble since the reserved flags depend on the cpu
+ * and setting them to their historical values works in practice.
+ */
+#define PSL_RESERVED_DEFAULT 0x00000002
+
+/*
+ * Initial flags for kernel and user mode. The kernel later inherits
+ * PSL_I and some other flags from user mode.
+ */
+#define PSL_KERNEL PSL_RESERVED_DEFAULT
+#define PSL_USER (PSL_RESERVED_DEFAULT | PSL_I)
+
+/*
+ * Bits that can be changed in user mode on 486's. We allow these bits
+ * to be changed using ptrace(), sigreturn() and procfs. Setting PS_NT
+ * is undesirable but it may as well be allowed since users can inflict
+ * it on the kernel directly. Changes to PSL_AC are silently ignored on
+ * 386's.
+ *
+ * Users are allowed to change the privileged flag PSL_RF. The cpu sets PSL_RF
+ * in tf_eflags for faults. Debuggers should sometimes set it there too.
+ * tf_eflags is kept in the signal context during signal handling and there is
+ * no other place to remember it, so the PSL_RF bit may be corrupted by the
+ * signal handler without us knowing. Corruption of the PSL_RF bit at worst
+ * causes one more or one less debugger trap, so allowing it is fairly
+ * harmless.
+ */
+#define PSL_USERCHANGE (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_T \
+ | PSL_D | PSL_V | PSL_NT | PSL_RF | PSL_AC | PSL_ID)
+
+#endif /* !_MACHINE_PSL_H_ */
diff --git a/usr/contrib/freebsd/x86/specialreg.h b/usr/contrib/freebsd/x86/specialreg.h
new file mode 100644
index 0000000000..bea3122423
--- /dev/null
+++ b/usr/contrib/freebsd/x86/specialreg.h
@@ -0,0 +1,839 @@
+/*-
+ * Copyright (c) 1991 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)specialreg.h 7.1 (Berkeley) 5/9/91
+ * $FreeBSD: head/sys/x86/include/specialreg.h 273338 2014-10-20 18:09:33Z neel $
+ */
+
+#ifndef _MACHINE_SPECIALREG_H_
+#define _MACHINE_SPECIALREG_H_
+
+/*
+ * Bits in 386 special registers:
+ */
+#define CR0_PE 0x00000001 /* Protected mode Enable */
+#define CR0_MP 0x00000002 /* "Math" (fpu) Present */
+#define CR0_EM 0x00000004 /* EMulate FPU instructions. (trap ESC only) */
+#define CR0_TS 0x00000008 /* Task Switched (if MP, trap ESC and WAIT) */
+#define CR0_PG 0x80000000 /* PaGing enable */
+
+/*
+ * Bits in 486 special registers:
+ */
+#define CR0_NE 0x00000020 /* Numeric Error enable (EX16 vs IRQ13) */
+#define CR0_WP 0x00010000 /* Write Protect (honor page protect in
+ all modes) */
+#define CR0_AM 0x00040000 /* Alignment Mask (set to enable AC flag) */
+#define CR0_NW 0x20000000 /* Not Write-through */
+#define CR0_CD 0x40000000 /* Cache Disable */
+
+#define CR3_PCID_SAVE 0x8000000000000000
+
+/*
+ * Bits in PPro special registers
+ */
+#define CR4_VME 0x00000001 /* Virtual 8086 mode extensions */
+#define CR4_PVI 0x00000002 /* Protected-mode virtual interrupts */
+#define CR4_TSD 0x00000004 /* Time stamp disable */
+#define CR4_DE 0x00000008 /* Debugging extensions */
+#define CR4_PSE 0x00000010 /* Page size extensions */
+#define CR4_PAE 0x00000020 /* Physical address extension */
+#define CR4_MCE 0x00000040 /* Machine check enable */
+#define CR4_PGE 0x00000080 /* Page global enable */
+#define CR4_PCE 0x00000100 /* Performance monitoring counter enable */
+#define CR4_FXSR 0x00000200 /* Fast FPU save/restore used by OS */
+#define CR4_XMM 0x00000400 /* enable SIMD/MMX2 to use except 16 */
+#define CR4_VMXE 0x00002000 /* enable VMX operation (Intel-specific) */
+#define CR4_FSGSBASE 0x00010000 /* Enable FS/GS BASE accessing instructions */
+#define CR4_PCIDE 0x00020000 /* Enable Context ID */
+#define CR4_XSAVE 0x00040000 /* XSETBV/XGETBV */
+#define CR4_SMEP 0x00100000 /* Supervisor-Mode Execution Prevention */
+
+/*
+ * Bits in AMD64 special registers. EFER is 64 bits wide.
+ */
+#define EFER_SCE 0x000000001 /* System Call Extensions (R/W) */
+#define EFER_LME 0x000000100 /* Long mode enable (R/W) */
+#define EFER_LMA 0x000000400 /* Long mode active (R) */
+#define EFER_NXE 0x000000800 /* PTE No-Execute bit enable (R/W) */
+#define EFER_SVM 0x000001000 /* SVM enable bit for AMD, reserved for Intel */
+
+/*
+ * Intel Extended Features registers
+ */
+#define XCR0 0 /* XFEATURE_ENABLED_MASK register */
+
+#define XFEATURE_ENABLED_X87 0x00000001
+#define XFEATURE_ENABLED_SSE 0x00000002
+#define XFEATURE_ENABLED_YMM_HI128 0x00000004
+#define XFEATURE_ENABLED_AVX XFEATURE_ENABLED_YMM_HI128
+#define XFEATURE_ENABLED_BNDREGS 0x00000008
+#define XFEATURE_ENABLED_BNDCSR 0x00000010
+#define XFEATURE_ENABLED_OPMASK 0x00000020
+#define XFEATURE_ENABLED_ZMM_HI256 0x00000040
+#define XFEATURE_ENABLED_HI16_ZMM 0x00000080
+
+#define XFEATURE_AVX \
+ (XFEATURE_ENABLED_X87 | XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX)
+#define XFEATURE_AVX512 \
+ (XFEATURE_ENABLED_OPMASK | XFEATURE_ENABLED_ZMM_HI256 | \
+ XFEATURE_ENABLED_HI16_ZMM)
+#define XFEATURE_MPX \
+ (XFEATURE_ENABLED_BNDREGS | XFEATURE_ENABLED_BNDCSR)
+
+/*
+ * CPUID instruction features register
+ */
+#define CPUID_FPU 0x00000001
+#define CPUID_VME 0x00000002
+#define CPUID_DE 0x00000004
+#define CPUID_PSE 0x00000008
+#define CPUID_TSC 0x00000010
+#define CPUID_MSR 0x00000020
+#define CPUID_PAE 0x00000040
+#define CPUID_MCE 0x00000080
+#define CPUID_CX8 0x00000100
+#define CPUID_APIC 0x00000200
+#define CPUID_B10 0x00000400
+#define CPUID_SEP 0x00000800
+#define CPUID_MTRR 0x00001000
+#define CPUID_PGE 0x00002000
+#define CPUID_MCA 0x00004000
+#define CPUID_CMOV 0x00008000
+#define CPUID_PAT 0x00010000
+#define CPUID_PSE36 0x00020000
+#define CPUID_PSN 0x00040000
+#define CPUID_CLFSH 0x00080000
+#define CPUID_B20 0x00100000
+#define CPUID_DS 0x00200000
+#define CPUID_ACPI 0x00400000
+#define CPUID_MMX 0x00800000
+#define CPUID_FXSR 0x01000000
+#define CPUID_SSE 0x02000000
+#define CPUID_XMM 0x02000000
+#define CPUID_SSE2 0x04000000
+#define CPUID_SS 0x08000000
+#define CPUID_HTT 0x10000000
+#define CPUID_TM 0x20000000
+#define CPUID_IA64 0x40000000
+#define CPUID_PBE 0x80000000
+
+#define CPUID2_SSE3 0x00000001
+#define CPUID2_PCLMULQDQ 0x00000002
+#define CPUID2_DTES64 0x00000004
+#define CPUID2_MON 0x00000008
+#define CPUID2_DS_CPL 0x00000010
+#define CPUID2_VMX 0x00000020
+#define CPUID2_SMX 0x00000040
+#define CPUID2_EST 0x00000080
+#define CPUID2_TM2 0x00000100
+#define CPUID2_SSSE3 0x00000200
+#define CPUID2_CNXTID 0x00000400
+#define CPUID2_FMA 0x00001000
+#define CPUID2_CX16 0x00002000
+#define CPUID2_XTPR 0x00004000
+#define CPUID2_PDCM 0x00008000
+#define CPUID2_PCID 0x00020000
+#define CPUID2_DCA 0x00040000
+#define CPUID2_SSE41 0x00080000
+#define CPUID2_SSE42 0x00100000
+#define CPUID2_X2APIC 0x00200000
+#define CPUID2_MOVBE 0x00400000
+#define CPUID2_POPCNT 0x00800000
+#define CPUID2_TSCDLT 0x01000000
+#define CPUID2_AESNI 0x02000000
+#define CPUID2_XSAVE 0x04000000
+#define CPUID2_OSXSAVE 0x08000000
+#define CPUID2_AVX 0x10000000
+#define CPUID2_F16C 0x20000000
+#define CPUID2_RDRAND 0x40000000
+#define CPUID2_HV 0x80000000
+
+/*
+ * Important bits in the Thermal and Power Management flags
+ * CPUID.6 EAX and ECX.
+ */
+#define CPUTPM1_SENSOR 0x00000001
+#define CPUTPM1_TURBO 0x00000002
+#define CPUTPM1_ARAT 0x00000004
+#define CPUTPM2_EFFREQ 0x00000001
+
+/*
+ * Important bits in the AMD extended cpuid flags
+ */
+#define AMDID_SYSCALL 0x00000800
+#define AMDID_MP 0x00080000
+#define AMDID_NX 0x00100000
+#define AMDID_EXT_MMX 0x00400000
+#define AMDID_FFXSR 0x01000000
+#define AMDID_PAGE1GB 0x04000000
+#define AMDID_RDTSCP 0x08000000
+#define AMDID_LM 0x20000000
+#define AMDID_EXT_3DNOW 0x40000000
+#define AMDID_3DNOW 0x80000000
+
+#define AMDID2_LAHF 0x00000001
+#define AMDID2_CMP 0x00000002
+#define AMDID2_SVM 0x00000004
+#define AMDID2_EXT_APIC 0x00000008
+#define AMDID2_CR8 0x00000010
+#define AMDID2_ABM 0x00000020
+#define AMDID2_SSE4A 0x00000040
+#define AMDID2_MAS 0x00000080
+#define AMDID2_PREFETCH 0x00000100
+#define AMDID2_OSVW 0x00000200
+#define AMDID2_IBS 0x00000400
+#define AMDID2_XOP 0x00000800
+#define AMDID2_SKINIT 0x00001000
+#define AMDID2_WDT 0x00002000
+#define AMDID2_LWP 0x00008000
+#define AMDID2_FMA4 0x00010000
+#define AMDID2_TCE 0x00020000
+#define AMDID2_NODE_ID 0x00080000
+#define AMDID2_TBM 0x00200000
+#define AMDID2_TOPOLOGY 0x00400000
+#define AMDID2_PCXC 0x00800000
+#define AMDID2_PNXC 0x01000000
+#define AMDID2_DBE 0x04000000
+#define AMDID2_PTSC 0x08000000
+#define AMDID2_PTSCEL2I 0x10000000
+
+/*
+ * CPUID instruction 1 eax info
+ */
+#define CPUID_STEPPING 0x0000000f
+#define CPUID_MODEL 0x000000f0
+#define CPUID_FAMILY 0x00000f00
+#define CPUID_EXT_MODEL 0x000f0000
+#define CPUID_EXT_FAMILY 0x0ff00000
+#ifdef __i386__
+#define CPUID_TO_MODEL(id) \
+ ((((id) & CPUID_MODEL) >> 4) | \
+ ((((id) & CPUID_FAMILY) >= 0x600) ? \
+ (((id) & CPUID_EXT_MODEL) >> 12) : 0))
+#define CPUID_TO_FAMILY(id) \
+ ((((id) & CPUID_FAMILY) >> 8) + \
+ ((((id) & CPUID_FAMILY) == 0xf00) ? \
+ (((id) & CPUID_EXT_FAMILY) >> 20) : 0))
+#else
+#define CPUID_TO_MODEL(id) \
+ ((((id) & CPUID_MODEL) >> 4) | \
+ (((id) & CPUID_EXT_MODEL) >> 12))
+#define CPUID_TO_FAMILY(id) \
+ ((((id) & CPUID_FAMILY) >> 8) + \
+ (((id) & CPUID_EXT_FAMILY) >> 20))
+#endif
+
+/*
+ * CPUID instruction 1 ebx info
+ */
+#define CPUID_BRAND_INDEX 0x000000ff
+#define CPUID_CLFUSH_SIZE 0x0000ff00
+#define CPUID_HTT_CORES 0x00ff0000
+#define CPUID_LOCAL_APIC_ID 0xff000000
+
+/*
+ * CPUID instruction 5 info
+ */
+#define CPUID5_MON_MIN_SIZE 0x0000ffff /* eax */
+#define CPUID5_MON_MAX_SIZE 0x0000ffff /* ebx */
+#define CPUID5_MON_MWAIT_EXT 0x00000001 /* ecx */
+#define CPUID5_MWAIT_INTRBREAK 0x00000002 /* ecx */
+
+/*
+ * MWAIT cpu power states. Lower 4 bits are sub-states.
+ */
+#define MWAIT_C0 0xf0
+#define MWAIT_C1 0x00
+#define MWAIT_C2 0x10
+#define MWAIT_C3 0x20
+#define MWAIT_C4 0x30
+
+/*
+ * MWAIT extensions.
+ */
+/* Interrupt breaks MWAIT even when masked. */
+#define MWAIT_INTRBREAK 0x00000001
+
+/*
+ * CPUID instruction 6 ecx info
+ */
+#define CPUID_PERF_STAT 0x00000001
+#define CPUID_PERF_BIAS 0x00000008
+
+/*
+ * CPUID instruction 0xb ebx info.
+ */
+#define CPUID_TYPE_INVAL 0
+#define CPUID_TYPE_SMT 1
+#define CPUID_TYPE_CORE 2
+
+/*
+ * CPUID instruction 0xd Processor Extended State Enumeration Sub-leaf 1
+ */
+#define CPUID_EXTSTATE_XSAVEOPT 0x00000001
+#define CPUID_EXTSTATE_XSAVEC 0x00000002
+#define CPUID_EXTSTATE_XINUSE 0x00000004
+#define CPUID_EXTSTATE_XSAVES 0x00000008
+
+/*
+ * AMD extended function 8000_0007h edx info
+ */
+#define AMDPM_TS 0x00000001
+#define AMDPM_FID 0x00000002
+#define AMDPM_VID 0x00000004
+#define AMDPM_TTP 0x00000008
+#define AMDPM_TM 0x00000010
+#define AMDPM_STC 0x00000020
+#define AMDPM_100MHZ_STEPS 0x00000040
+#define AMDPM_HW_PSTATE 0x00000080
+#define AMDPM_TSC_INVARIANT 0x00000100
+#define AMDPM_CPB 0x00000200
+
+/*
+ * AMD extended function 8000_0008h ecx info
+ */
+#define AMDID_CMP_CORES 0x000000ff
+#define AMDID_COREID_SIZE 0x0000f000
+#define AMDID_COREID_SIZE_SHIFT 12
+
+/*
+ * CPUID instruction 7 Structured Extended Features, leaf 0 ebx info
+ */
+#define CPUID_STDEXT_FSGSBASE 0x00000001
+#define CPUID_STDEXT_TSC_ADJUST 0x00000002
+#define CPUID_STDEXT_BMI1 0x00000008
+#define CPUID_STDEXT_HLE 0x00000010
+#define CPUID_STDEXT_AVX2 0x00000020
+#define CPUID_STDEXT_SMEP 0x00000080
+#define CPUID_STDEXT_BMI2 0x00000100
+#define CPUID_STDEXT_ERMS 0x00000200
+#define CPUID_STDEXT_INVPCID 0x00000400
+#define CPUID_STDEXT_RTM 0x00000800
+#define CPUID_STDEXT_MPX 0x00004000
+#define CPUID_STDEXT_AVX512F 0x00010000
+#define CPUID_STDEXT_RDSEED 0x00040000
+#define CPUID_STDEXT_ADX 0x00080000
+#define CPUID_STDEXT_SMAP 0x00100000
+#define CPUID_STDEXT_CLFLUSHOPT 0x00800000
+#define CPUID_STDEXT_PROCTRACE 0x02000000
+#define CPUID_STDEXT_AVX512PF 0x04000000
+#define CPUID_STDEXT_AVX512ER 0x08000000
+#define CPUID_STDEXT_AVX512CD 0x10000000
+#define CPUID_STDEXT_SHA 0x20000000
+
+/*
+ * CPUID manufacturers identifiers
+ */
+#define AMD_VENDOR_ID "AuthenticAMD"
+#define CENTAUR_VENDOR_ID "CentaurHauls"
+#define CYRIX_VENDOR_ID "CyrixInstead"
+#define INTEL_VENDOR_ID "GenuineIntel"
+#define NEXGEN_VENDOR_ID "NexGenDriven"
+#define NSC_VENDOR_ID "Geode by NSC"
+#define RISE_VENDOR_ID "RiseRiseRise"
+#define SIS_VENDOR_ID "SiS SiS SiS "
+#define TRANSMETA_VENDOR_ID "GenuineTMx86"
+#define UMC_VENDOR_ID "UMC UMC UMC "
+
+/*
+ * Model-specific registers for the i386 family
+ */
+#define MSR_P5_MC_ADDR 0x000
+#define MSR_P5_MC_TYPE 0x001
+#define MSR_TSC 0x010
+#define MSR_P5_CESR 0x011
+#define MSR_P5_CTR0 0x012
+#define MSR_P5_CTR1 0x013
+#define MSR_IA32_PLATFORM_ID 0x017
+#define MSR_APICBASE 0x01b
+#define MSR_EBL_CR_POWERON 0x02a
+#define MSR_TEST_CTL 0x033
+#define MSR_IA32_FEATURE_CONTROL 0x03a
+#define MSR_BIOS_UPDT_TRIG 0x079
+#define MSR_BBL_CR_D0 0x088
+#define MSR_BBL_CR_D1 0x089
+#define MSR_BBL_CR_D2 0x08a
+#define MSR_BIOS_SIGN 0x08b
+#define MSR_PERFCTR0 0x0c1
+#define MSR_PERFCTR1 0x0c2
+#define MSR_PLATFORM_INFO 0x0ce
+#define MSR_MPERF 0x0e7
+#define MSR_APERF 0x0e8
+#define MSR_IA32_EXT_CONFIG 0x0ee /* Undocumented. Core Solo/Duo only */
+#define MSR_MTRRcap 0x0fe
+#define MSR_BBL_CR_ADDR 0x116
+#define MSR_BBL_CR_DECC 0x118
+#define MSR_BBL_CR_CTL 0x119
+#define MSR_BBL_CR_TRIG 0x11a
+#define MSR_BBL_CR_BUSY 0x11b
+#define MSR_BBL_CR_CTL3 0x11e
+#define MSR_SYSENTER_CS_MSR 0x174
+#define MSR_SYSENTER_ESP_MSR 0x175
+#define MSR_SYSENTER_EIP_MSR 0x176
+#define MSR_MCG_CAP 0x179
+#define MSR_MCG_STATUS 0x17a
+#define MSR_MCG_CTL 0x17b
+#define MSR_EVNTSEL0 0x186
+#define MSR_EVNTSEL1 0x187
+#define MSR_THERM_CONTROL 0x19a
+#define MSR_THERM_INTERRUPT 0x19b
+#define MSR_THERM_STATUS 0x19c
+#define MSR_IA32_MISC_ENABLE 0x1a0
+#define MSR_IA32_TEMPERATURE_TARGET 0x1a2
+#define MSR_TURBO_RATIO_LIMIT 0x1ad
+#define MSR_TURBO_RATIO_LIMIT1 0x1ae
+#define MSR_DEBUGCTLMSR 0x1d9
+#define MSR_LASTBRANCHFROMIP 0x1db
+#define MSR_LASTBRANCHTOIP 0x1dc
+#define MSR_LASTINTFROMIP 0x1dd
+#define MSR_LASTINTTOIP 0x1de
+#define MSR_ROB_CR_BKUPTMPDR6 0x1e0
+#define MSR_MTRRVarBase 0x200
+#define MSR_MTRR64kBase 0x250
+#define MSR_MTRR16kBase 0x258
+#define MSR_MTRR4kBase 0x268
+#define MSR_PAT 0x277
+#define MSR_MC0_CTL2 0x280
+#define MSR_MTRRdefType 0x2ff
+#define MSR_MC0_CTL 0x400
+#define MSR_MC0_STATUS 0x401
+#define MSR_MC0_ADDR 0x402
+#define MSR_MC0_MISC 0x403
+#define MSR_MC1_CTL 0x404
+#define MSR_MC1_STATUS 0x405
+#define MSR_MC1_ADDR 0x406
+#define MSR_MC1_MISC 0x407
+#define MSR_MC2_CTL 0x408
+#define MSR_MC2_STATUS 0x409
+#define MSR_MC2_ADDR 0x40a
+#define MSR_MC2_MISC 0x40b
+#define MSR_MC3_CTL 0x40c
+#define MSR_MC3_STATUS 0x40d
+#define MSR_MC3_ADDR 0x40e
+#define MSR_MC3_MISC 0x40f
+#define MSR_MC4_CTL 0x410
+#define MSR_MC4_STATUS 0x411
+#define MSR_MC4_ADDR 0x412
+#define MSR_MC4_MISC 0x413
+#define MSR_RAPL_POWER_UNIT 0x606
+#define MSR_PKG_ENERGY_STATUS 0x611
+#define MSR_DRAM_ENERGY_STATUS 0x619
+#define MSR_PP0_ENERGY_STATUS 0x639
+#define MSR_PP1_ENERGY_STATUS 0x641
+
+/*
+ * VMX MSRs
+ */
+#define MSR_VMX_BASIC 0x480
+#define MSR_VMX_PINBASED_CTLS 0x481
+#define MSR_VMX_PROCBASED_CTLS 0x482
+#define MSR_VMX_EXIT_CTLS 0x483
+#define MSR_VMX_ENTRY_CTLS 0x484
+#define MSR_VMX_CR0_FIXED0 0x486
+#define MSR_VMX_CR0_FIXED1 0x487
+#define MSR_VMX_CR4_FIXED0 0x488
+#define MSR_VMX_CR4_FIXED1 0x489
+#define MSR_VMX_PROCBASED_CTLS2 0x48b
+#define MSR_VMX_EPT_VPID_CAP 0x48c
+#define MSR_VMX_TRUE_PINBASED_CTLS 0x48d
+#define MSR_VMX_TRUE_PROCBASED_CTLS 0x48e
+#define MSR_VMX_TRUE_EXIT_CTLS 0x48f
+#define MSR_VMX_TRUE_ENTRY_CTLS 0x490
+
+/*
+ * X2APIC MSRs
+ */
+#define MSR_APIC_ID 0x802
+#define MSR_APIC_VERSION 0x803
+#define MSR_APIC_TPR 0x808
+#define MSR_APIC_EOI 0x80b
+#define MSR_APIC_LDR 0x80d
+#define MSR_APIC_SVR 0x80f
+#define MSR_APIC_ISR0 0x810
+#define MSR_APIC_ISR1 0x811
+#define MSR_APIC_ISR2 0x812
+#define MSR_APIC_ISR3 0x813
+#define MSR_APIC_ISR4 0x814
+#define MSR_APIC_ISR5 0x815
+#define MSR_APIC_ISR6 0x816
+#define MSR_APIC_ISR7 0x817
+#define MSR_APIC_TMR0 0x818
+#define MSR_APIC_IRR0 0x820
+#define MSR_APIC_ESR 0x828
+#define MSR_APIC_LVT_CMCI 0x82F
+#define MSR_APIC_ICR 0x830
+#define MSR_APIC_LVT_TIMER 0x832
+#define MSR_APIC_LVT_THERMAL 0x833
+#define MSR_APIC_LVT_PCINT 0x834
+#define MSR_APIC_LVT_LINT0 0x835
+#define MSR_APIC_LVT_LINT1 0x836
+#define MSR_APIC_LVT_ERROR 0x837
+#define MSR_APIC_ICR_TIMER 0x838
+#define MSR_APIC_CCR_TIMER 0x839
+#define MSR_APIC_DCR_TIMER 0x83e
+#define MSR_APIC_SELF_IPI 0x83f
+
+#define MSR_IA32_XSS 0xda0
+
+/*
+ * Constants related to MSR's.
+ */
+#define APICBASE_RESERVED 0x000002ff
+#define APICBASE_BSP 0x00000100
+#define APICBASE_X2APIC 0x00000400
+#define APICBASE_ENABLED 0x00000800
+#define APICBASE_ADDRESS 0xfffff000
+
+/* MSR_IA32_FEATURE_CONTROL related */
+#define IA32_FEATURE_CONTROL_LOCK 0x01 /* lock bit */
+#define IA32_FEATURE_CONTROL_SMX_EN 0x02 /* enable VMX inside SMX */
+#define IA32_FEATURE_CONTROL_VMX_EN 0x04 /* enable VMX outside SMX */
+
+/*
+ * PAT modes.
+ */
+#define PAT_UNCACHEABLE 0x00
+#define PAT_WRITE_COMBINING 0x01
+#define PAT_WRITE_THROUGH 0x04
+#define PAT_WRITE_PROTECTED 0x05
+#define PAT_WRITE_BACK 0x06
+#define PAT_UNCACHED 0x07
+#define PAT_VALUE(i, m) ((long long)(m) << (8 * (i)))
+#define PAT_MASK(i) PAT_VALUE(i, 0xff)
+
+/*
+ * Constants related to MTRRs
+ */
+#define MTRR_UNCACHEABLE 0x00
+#define MTRR_WRITE_COMBINING 0x01
+#define MTRR_WRITE_THROUGH 0x04
+#define MTRR_WRITE_PROTECTED 0x05
+#define MTRR_WRITE_BACK 0x06
+#define MTRR_N64K 8 /* numbers of fixed-size entries */
+#define MTRR_N16K 16
+#define MTRR_N4K 64
+#define MTRR_CAP_WC 0x0000000000000400
+#define MTRR_CAP_FIXED 0x0000000000000100
+#define MTRR_CAP_VCNT 0x00000000000000ff
+#define MTRR_DEF_ENABLE 0x0000000000000800
+#define MTRR_DEF_FIXED_ENABLE 0x0000000000000400
+#define MTRR_DEF_TYPE 0x00000000000000ff
+#define MTRR_PHYSBASE_PHYSBASE 0x000ffffffffff000
+#define MTRR_PHYSBASE_TYPE 0x00000000000000ff
+#define MTRR_PHYSMASK_PHYSMASK 0x000ffffffffff000
+#define MTRR_PHYSMASK_VALID 0x0000000000000800
+
+/*
+ * Cyrix configuration registers, accessible as IO ports.
+ */
+#define CCR0 0xc0 /* Configuration control register 0 */
+#define CCR0_NC0 0x01 /* First 64K of each 1M memory region is
+ non-cacheable */
+#define CCR0_NC1 0x02 /* 640K-1M region is non-cacheable */
+#define CCR0_A20M 0x04 /* Enables A20M# input pin */
+#define CCR0_KEN 0x08 /* Enables KEN# input pin */
+#define CCR0_FLUSH 0x10 /* Enables FLUSH# input pin */
+#define CCR0_BARB 0x20 /* Flushes internal cache when entering hold
+ state */
+#define CCR0_CO 0x40 /* Cache org: 1=direct mapped, 0=2x set
+ assoc */
+#define CCR0_SUSPEND 0x80 /* Enables SUSP# and SUSPA# pins */
+
+#define CCR1 0xc1 /* Configuration control register 1 */
+#define CCR1_RPL 0x01 /* Enables RPLSET and RPLVAL# pins */
+#define CCR1_SMI 0x02 /* Enables SMM pins */
+#define CCR1_SMAC 0x04 /* System management memory access */
+#define CCR1_MMAC 0x08 /* Main memory access */
+#define CCR1_NO_LOCK 0x10 /* Negate LOCK# */
+#define CCR1_SM3 0x80 /* SMM address space address region 3 */
+
+#define CCR2 0xc2
+#define CCR2_WB 0x02 /* Enables WB cache interface pins */
+#define CCR2_SADS 0x02 /* Slow ADS */
+#define CCR2_LOCK_NW 0x04 /* LOCK NW Bit */
+#define CCR2_SUSP_HLT 0x08 /* Suspend on HALT */
+#define CCR2_WT1 0x10 /* WT region 1 */
+#define CCR2_WPR1 0x10 /* Write-protect region 1 */
+#define CCR2_BARB 0x20 /* Flushes write-back cache when entering
+ hold state. */
+#define CCR2_BWRT 0x40 /* Enables burst write cycles */
+#define CCR2_USE_SUSP 0x80 /* Enables suspend pins */
+
+#define CCR3 0xc3
+#define CCR3_SMILOCK 0x01 /* SMM register lock */
+#define CCR3_NMI 0x02 /* Enables NMI during SMM */
+#define CCR3_LINBRST 0x04 /* Linear address burst cycles */
+#define CCR3_SMMMODE 0x08 /* SMM Mode */
+#define CCR3_MAPEN0 0x10 /* Enables Map0 */
+#define CCR3_MAPEN1 0x20 /* Enables Map1 */
+#define CCR3_MAPEN2 0x40 /* Enables Map2 */
+#define CCR3_MAPEN3 0x80 /* Enables Map3 */
+
+#define CCR4 0xe8
+#define CCR4_IOMASK 0x07
+#define CCR4_MEM 0x08 /* Enables momory bypassing */
+#define CCR4_DTE 0x10 /* Enables directory table entry cache */
+#define CCR4_FASTFPE 0x20 /* Fast FPU exception */
+#define CCR4_CPUID 0x80 /* Enables CPUID instruction */
+
+#define CCR5 0xe9
+#define CCR5_WT_ALLOC 0x01 /* Write-through allocate */
+#define CCR5_SLOP 0x02 /* LOOP instruction slowed down */
+#define CCR5_LBR1 0x10 /* Local bus region 1 */
+#define CCR5_ARREN 0x20 /* Enables ARR region */
+
+#define CCR6 0xea
+
+#define CCR7 0xeb
+
+/* Performance Control Register (5x86 only). */
+#define PCR0 0x20
+#define PCR0_RSTK 0x01 /* Enables return stack */
+#define PCR0_BTB 0x02 /* Enables branch target buffer */
+#define PCR0_LOOP 0x04 /* Enables loop */
+#define PCR0_AIS 0x08 /* Enables all instrcutions stalled to
+ serialize pipe. */
+#define PCR0_MLR 0x10 /* Enables reordering of misaligned loads */
+#define PCR0_BTBRT 0x40 /* Enables BTB test register. */
+#define PCR0_LSSER 0x80 /* Disable reorder */
+
+/* Device Identification Registers */
+#define DIR0 0xfe
+#define DIR1 0xff
+
+/*
+ * Machine Check register constants.
+ */
+#define MCG_CAP_COUNT 0x000000ff
+#define MCG_CAP_CTL_P 0x00000100
+#define MCG_CAP_EXT_P 0x00000200
+#define MCG_CAP_CMCI_P 0x00000400
+#define MCG_CAP_TES_P 0x00000800
+#define MCG_CAP_EXT_CNT 0x00ff0000
+#define MCG_CAP_SER_P 0x01000000
+#define MCG_STATUS_RIPV 0x00000001
+#define MCG_STATUS_EIPV 0x00000002
+#define MCG_STATUS_MCIP 0x00000004
+#define MCG_CTL_ENABLE 0xffffffffffffffff
+#define MCG_CTL_DISABLE 0x0000000000000000
+#define MSR_MC_CTL(x) (MSR_MC0_CTL + (x) * 4)
+#define MSR_MC_STATUS(x) (MSR_MC0_STATUS + (x) * 4)
+#define MSR_MC_ADDR(x) (MSR_MC0_ADDR + (x) * 4)
+#define MSR_MC_MISC(x) (MSR_MC0_MISC + (x) * 4)
+#define MSR_MC_CTL2(x) (MSR_MC0_CTL2 + (x)) /* If MCG_CAP_CMCI_P */
+#define MC_STATUS_MCA_ERROR 0x000000000000ffff
+#define MC_STATUS_MODEL_ERROR 0x00000000ffff0000
+#define MC_STATUS_OTHER_INFO 0x01ffffff00000000
+#define MC_STATUS_COR_COUNT 0x001fffc000000000 /* If MCG_CAP_CMCI_P */
+#define MC_STATUS_TES_STATUS 0x0060000000000000 /* If MCG_CAP_TES_P */
+#define MC_STATUS_AR 0x0080000000000000 /* If MCG_CAP_TES_P */
+#define MC_STATUS_S 0x0100000000000000 /* If MCG_CAP_TES_P */
+#define MC_STATUS_PCC 0x0200000000000000
+#define MC_STATUS_ADDRV 0x0400000000000000
+#define MC_STATUS_MISCV 0x0800000000000000
+#define MC_STATUS_EN 0x1000000000000000
+#define MC_STATUS_UC 0x2000000000000000
+#define MC_STATUS_OVER 0x4000000000000000
+#define MC_STATUS_VAL 0x8000000000000000
+#define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */
+#define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */
+#define MC_CTL2_THRESHOLD 0x0000000000007fff
+#define MC_CTL2_CMCI_EN 0x0000000040000000
+
+/*
+ * The following four 3-byte registers control the non-cacheable regions.
+ * These registers must be written as three separate bytes.
+ *
+ * NCRx+0: A31-A24 of starting address
+ * NCRx+1: A23-A16 of starting address
+ * NCRx+2: A15-A12 of starting address | NCR_SIZE_xx.
+ *
+ * The non-cacheable region's starting address must be aligned to the
+ * size indicated by the NCR_SIZE_xx field.
+ */
+#define NCR1 0xc4
+#define NCR2 0xc7
+#define NCR3 0xca
+#define NCR4 0xcd
+
+#define NCR_SIZE_0K 0
+#define NCR_SIZE_4K 1
+#define NCR_SIZE_8K 2
+#define NCR_SIZE_16K 3
+#define NCR_SIZE_32K 4
+#define NCR_SIZE_64K 5
+#define NCR_SIZE_128K 6
+#define NCR_SIZE_256K 7
+#define NCR_SIZE_512K 8
+#define NCR_SIZE_1M 9
+#define NCR_SIZE_2M 10
+#define NCR_SIZE_4M 11
+#define NCR_SIZE_8M 12
+#define NCR_SIZE_16M 13
+#define NCR_SIZE_32M 14
+#define NCR_SIZE_4G 15
+
+/*
+ * The address region registers are used to specify the location and
+ * size for the eight address regions.
+ *
+ * ARRx + 0: A31-A24 of start address
+ * ARRx + 1: A23-A16 of start address
+ * ARRx + 2: A15-A12 of start address | ARR_SIZE_xx
+ */
+#define ARR0 0xc4
+#define ARR1 0xc7
+#define ARR2 0xca
+#define ARR3 0xcd
+#define ARR4 0xd0
+#define ARR5 0xd3
+#define ARR6 0xd6
+#define ARR7 0xd9
+
+#define ARR_SIZE_0K 0
+#define ARR_SIZE_4K 1
+#define ARR_SIZE_8K 2
+#define ARR_SIZE_16K 3
+#define ARR_SIZE_32K 4
+#define ARR_SIZE_64K 5
+#define ARR_SIZE_128K 6
+#define ARR_SIZE_256K 7
+#define ARR_SIZE_512K 8
+#define ARR_SIZE_1M 9
+#define ARR_SIZE_2M 10
+#define ARR_SIZE_4M 11
+#define ARR_SIZE_8M 12
+#define ARR_SIZE_16M 13
+#define ARR_SIZE_32M 14
+#define ARR_SIZE_4G 15
+
+/*
+ * The region control registers specify the attributes associated with
+ * the ARRx addres regions.
+ */
+#define RCR0 0xdc
+#define RCR1 0xdd
+#define RCR2 0xde
+#define RCR3 0xdf
+#define RCR4 0xe0
+#define RCR5 0xe1
+#define RCR6 0xe2
+#define RCR7 0xe3
+
+#define RCR_RCD 0x01 /* Disables caching for ARRx (x = 0-6). */
+#define RCR_RCE 0x01 /* Enables caching for ARR7. */
+#define RCR_WWO 0x02 /* Weak write ordering. */
+#define RCR_WL 0x04 /* Weak locking. */
+#define RCR_WG 0x08 /* Write gathering. */
+#define RCR_WT 0x10 /* Write-through. */
+#define RCR_NLB 0x20 /* LBA# pin is not asserted. */
+
+/* AMD Write Allocate Top-Of-Memory and Control Register */
+#define AMD_WT_ALLOC_TME 0x40000 /* top-of-memory enable */
+#define AMD_WT_ALLOC_PRE 0x20000 /* programmable range enable */
+#define AMD_WT_ALLOC_FRE 0x10000 /* fixed (A0000-FFFFF) range enable */
+
+/* AMD64 MSR's */
+#define MSR_EFER 0xc0000080 /* extended features */
+#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target/cs/ss */
+#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target rip */
+#define MSR_CSTAR 0xc0000083 /* compat mode SYSCALL target rip */
+#define MSR_SF_MASK 0xc0000084 /* syscall flags mask */
+#define MSR_FSBASE 0xc0000100 /* base address of the %fs "segment" */
+#define MSR_GSBASE 0xc0000101 /* base address of the %gs "segment" */
+#define MSR_KGSBASE 0xc0000102 /* base address of the kernel %gs */
+#define MSR_PERFEVSEL0 0xc0010000
+#define MSR_PERFEVSEL1 0xc0010001
+#define MSR_PERFEVSEL2 0xc0010002
+#define MSR_PERFEVSEL3 0xc0010003
+#define MSR_K7_PERFCTR0 0xc0010004
+#define MSR_K7_PERFCTR1 0xc0010005
+#define MSR_K7_PERFCTR2 0xc0010006
+#define MSR_K7_PERFCTR3 0xc0010007
+#define MSR_SYSCFG 0xc0010010
+#define MSR_HWCR 0xc0010015
+#define MSR_IORRBASE0 0xc0010016
+#define MSR_IORRMASK0 0xc0010017
+#define MSR_IORRBASE1 0xc0010018
+#define MSR_IORRMASK1 0xc0010019
+#define MSR_TOP_MEM 0xc001001a /* boundary for ram below 4G */
+#define MSR_TOP_MEM2 0xc001001d /* boundary for ram above 4G */
+#define MSR_NB_CFG1 0xc001001f /* NB configuration 1 */
+#define MSR_P_STATE_LIMIT 0xc0010061 /* P-state Current Limit Register */
+#define MSR_P_STATE_CONTROL 0xc0010062 /* P-state Control Register */
+#define MSR_P_STATE_STATUS 0xc0010063 /* P-state Status Register */
+#define MSR_P_STATE_CONFIG(n) (0xc0010064 + (n)) /* P-state Config */
+#define MSR_SMM_ADDR 0xc0010112 /* SMM TSEG base address */
+#define MSR_SMM_MASK 0xc0010113 /* SMM TSEG address mask */
+#define MSR_IC_CFG 0xc0011021 /* Instruction Cache Configuration */
+#define MSR_K8_UCODE_UPDATE 0xc0010020 /* update microcode */
+#define MSR_MC0_CTL_MASK 0xc0010044
+#define MSR_VM_CR 0xc0010114 /* SVM: feature control */
+#define MSR_VM_HSAVE_PA 0xc0010117 /* SVM: host save area address */
+
+/* MSR_VM_CR related */
+#define VM_CR_SVMDIS 0x10 /* SVM: disabled by BIOS */
+
+/* VIA ACE crypto featureset: for via_feature_rng */
+#define VIA_HAS_RNG 1 /* cpu has RNG */
+
+/* VIA ACE crypto featureset: for via_feature_xcrypt */
+#define VIA_HAS_AES 1 /* cpu has AES */
+#define VIA_HAS_SHA 2 /* cpu has SHA1 & SHA256 */
+#define VIA_HAS_MM 4 /* cpu has RSA instructions */
+#define VIA_HAS_AESCTR 8 /* cpu has AES-CTR instructions */
+
+/* Centaur Extended Feature flags */
+#define VIA_CPUID_HAS_RNG 0x000004
+#define VIA_CPUID_DO_RNG 0x000008
+#define VIA_CPUID_HAS_ACE 0x000040
+#define VIA_CPUID_DO_ACE 0x000080
+#define VIA_CPUID_HAS_ACE2 0x000100
+#define VIA_CPUID_DO_ACE2 0x000200
+#define VIA_CPUID_HAS_PHE 0x000400
+#define VIA_CPUID_DO_PHE 0x000800
+#define VIA_CPUID_HAS_PMM 0x001000
+#define VIA_CPUID_DO_PMM 0x002000
+
+/* VIA ACE xcrypt-* instruction context control options */
+#define VIA_CRYPT_CWLO_ROUND_M 0x0000000f
+#define VIA_CRYPT_CWLO_ALG_M 0x00000070
+#define VIA_CRYPT_CWLO_ALG_AES 0x00000000
+#define VIA_CRYPT_CWLO_KEYGEN_M 0x00000080
+#define VIA_CRYPT_CWLO_KEYGEN_HW 0x00000000
+#define VIA_CRYPT_CWLO_KEYGEN_SW 0x00000080
+#define VIA_CRYPT_CWLO_NORMAL 0x00000000
+#define VIA_CRYPT_CWLO_INTERMEDIATE 0x00000100
+#define VIA_CRYPT_CWLO_ENCRYPT 0x00000000
+#define VIA_CRYPT_CWLO_DECRYPT 0x00000200
+#define VIA_CRYPT_CWLO_KEY128 0x0000000a /* 128bit, 10 rds */
+#define VIA_CRYPT_CWLO_KEY192 0x0000040c /* 192bit, 12 rds */
+#define VIA_CRYPT_CWLO_KEY256 0x0000080e /* 256bit, 15 rds */
+
+#endif /* !_MACHINE_SPECIALREG_H_ */
diff --git a/usr/src/cmd/bhyve/Makefile b/usr/src/cmd/bhyve/Makefile
new file mode 100644
index 0000000000..f47daead31
--- /dev/null
+++ b/usr/src/cmd/bhyve/Makefile
@@ -0,0 +1,41 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2014 Pluribus Networks Inc.
+#
+
+PROG = bhyve
+
+include ../Makefile.cmd
+
+$(BUILD64)SUBDIRS += $(MACH64)
+
+all := TARGET = all
+install := TARGET = install
+clean := TARGET = clean
+clobber := TARGET = clobber
+lint := TARGET = lint
+
+.KEEP_STATE:
+
+all clean clobber lint: $(SUBDIRS)
+
+install: $(SUBDIRS)
+ -$(RM) $(ROOTUSRSBINPROG)
+ -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG)
+
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/bhyve/Makefile.com b/usr/src/cmd/bhyve/Makefile.com
new file mode 100644
index 0000000000..4a92b622ab
--- /dev/null
+++ b/usr/src/cmd/bhyve/Makefile.com
@@ -0,0 +1,94 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Pluribus Networks Inc.
+#
+
+PROG= bhyve
+
+SRCS = atkbdc.c \
+ bhyvegc.c \
+ bhyverun.c \
+ block_if.c \
+ console.c \
+ consport.c \
+ inout.c \
+ ioapic.c \
+ mem.c \
+ mptbl.c \
+ pci_ahci.c \
+ pci_emul.c \
+ pci_hostbridge.c \
+ pci_irq.c \
+ pci_lpc.c \
+ pci_virtio_block.c \
+ pci_virtio_net.c \
+ pci_virtio_viona.c \
+ pm.c \
+ pmtmr.c \
+ post.c \
+ ps2kbd.c \
+ ps2mouse.c \
+ rfb.c \
+ rtc.c \
+ smbiostbl.c \
+ uart_emul.c \
+ vga.c \
+ virtio.c \
+ vmm_instruction_emul.c \
+ xmsr.c \
+ spinup_ap.c \
+ bhyve_sol_glue.c
+
+OBJS = $(SRCS:.c=.o)
+
+include ../../Makefile.cmd
+
+.KEEP_STATE:
+
+CFLAGS += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration
+CFLAGS64 += $(CCVERBOSE) -_gcc=-Wimplicit-function-declaration
+CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \
+ -I$(ROOT)/usr/platform/i86pc/include \
+ -I$(SRC)/uts/i86pc/io/vmm \
+ -I$(SRC)/uts/common \
+ -I$(SRC)/uts/i86pc \
+ -I$(SRC)/lib/libdladm/common
+LDLIBS += -lsocket -lnsl -ldlpi -ldladm -lkstat -lmd -luuid -lvmmapi
+
+POST_PROCESS += ; $(GENSETDEFS) $@
+
+all: $(PROG)
+
+$(PROG): $(OBJS)
+ $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS)
+ $(POST_PROCESS)
+
+install: all $(ROOTUSRSBINPROG)
+
+clean:
+ $(RM) $(OBJS)
+
+lint: lint_SRCS
+
+include ../../Makefile.targ
+
+%.o: ../%.c
+ $(COMPILE.c) $<
+ $(POST_PROCESS_O)
+
+%.o: $(SRC)/uts/i86pc/io/vmm/%.c
+ $(COMPILE.c) $<
+ $(POST_PROCESS_O)
+
+%.o: ../%.s
+ $(COMPILE.s) $<
diff --git a/usr/src/cmd/bhyve/acpi.h b/usr/src/cmd/bhyve/acpi.h
new file mode 100644
index 0000000000..477f827286
--- /dev/null
+++ b/usr/src/cmd/bhyve/acpi.h
@@ -0,0 +1,54 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/acpi.h 266125 2014-05-15 14:16:55Z jhb $
+ */
+
+#ifndef _ACPI_H_
+#define _ACPI_H_
+
+#define SCI_INT 9
+
+#define SMI_CMD 0xb2
+#define BHYVE_ACPI_ENABLE 0xa0
+#define BHYVE_ACPI_DISABLE 0xa1
+
+#define PM1A_EVT_ADDR 0x400
+#define PM1A_CNT_ADDR 0x404
+
+#define IO_PMTMR 0x408 /* 4-byte i/o port for the timer */
+
+struct vmctx;
+
+int acpi_build(struct vmctx *ctx, int ncpu);
+void dsdt_line(const char *fmt, ...);
+void dsdt_fixed_ioport(uint16_t iobase, uint16_t length);
+void dsdt_fixed_irq(uint8_t irq);
+void dsdt_fixed_mem32(uint32_t base, uint32_t length);
+void dsdt_indent(int levels);
+void dsdt_unindent(int levels);
+void sci_init(struct vmctx *ctx);
+
+#endif /* _ACPI_H_ */
diff --git a/usr/src/cmd/bhyve/ahci.h b/usr/src/cmd/bhyve/ahci.h
new file mode 100644
index 0000000000..1cf09adcbf
--- /dev/null
+++ b/usr/src/cmd/bhyve/ahci.h
@@ -0,0 +1,304 @@
+/*-
+ * Copyright (c) 1998 - 2008 Søren Schmidt <sos@FreeBSD.org>
+ * Copyright (c) 2009-2012 Alexander Motin <mav@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer,
+ * without modification, immediately at the beginning of the file.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/ahci.h 256056 2013-10-04 18:31:38Z grehan $
+ */
+
+#ifndef _AHCI_H_
+#define _AHCI_H_
+
+/* ATA register defines */
+#define ATA_DATA 0 /* (RW) data */
+
+#define ATA_FEATURE 1 /* (W) feature */
+#define ATA_F_DMA 0x01 /* enable DMA */
+#define ATA_F_OVL 0x02 /* enable overlap */
+
+#define ATA_COUNT 2 /* (W) sector count */
+
+#define ATA_SECTOR 3 /* (RW) sector # */
+#define ATA_CYL_LSB 4 /* (RW) cylinder# LSB */
+#define ATA_CYL_MSB 5 /* (RW) cylinder# MSB */
+#define ATA_DRIVE 6 /* (W) Sector/Drive/Head */
+#define ATA_D_LBA 0x40 /* use LBA addressing */
+#define ATA_D_IBM 0xa0 /* 512 byte sectors, ECC */
+
+#define ATA_COMMAND 7 /* (W) command */
+
+#define ATA_ERROR 8 /* (R) error */
+#define ATA_E_ILI 0x01 /* illegal length */
+#define ATA_E_NM 0x02 /* no media */
+#define ATA_E_ABORT 0x04 /* command aborted */
+#define ATA_E_MCR 0x08 /* media change request */
+#define ATA_E_IDNF 0x10 /* ID not found */
+#define ATA_E_MC 0x20 /* media changed */
+#define ATA_E_UNC 0x40 /* uncorrectable data */
+#define ATA_E_ICRC 0x80 /* UDMA crc error */
+#define ATA_E_ATAPI_SENSE_MASK 0xf0 /* ATAPI sense key mask */
+
+#define ATA_IREASON 9 /* (R) interrupt reason */
+#define ATA_I_CMD 0x01 /* cmd (1) | data (0) */
+#define ATA_I_IN 0x02 /* read (1) | write (0) */
+#define ATA_I_RELEASE 0x04 /* released bus (1) */
+#define ATA_I_TAGMASK 0xf8 /* tag mask */
+
+#define ATA_STATUS 10 /* (R) status */
+#define ATA_ALTSTAT 11 /* (R) alternate status */
+#define ATA_S_ERROR 0x01 /* error */
+#define ATA_S_INDEX 0x02 /* index */
+#define ATA_S_CORR 0x04 /* data corrected */
+#define ATA_S_DRQ 0x08 /* data request */
+#define ATA_S_DSC 0x10 /* drive seek completed */
+#define ATA_S_SERVICE 0x10 /* drive needs service */
+#define ATA_S_DWF 0x20 /* drive write fault */
+#define ATA_S_DMA 0x20 /* DMA ready */
+#define ATA_S_READY 0x40 /* drive ready */
+#define ATA_S_BUSY 0x80 /* busy */
+
+#define ATA_CONTROL 12 /* (W) control */
+#define ATA_A_IDS 0x02 /* disable interrupts */
+#define ATA_A_RESET 0x04 /* RESET controller */
+#define ATA_A_4BIT 0x08 /* 4 head bits */
+#define ATA_A_HOB 0x80 /* High Order Byte enable */
+
+/* SATA register defines */
+#define ATA_SSTATUS 13
+#define ATA_SS_DET_MASK 0x0000000f
+#define ATA_SS_DET_NO_DEVICE 0x00000000
+#define ATA_SS_DET_DEV_PRESENT 0x00000001
+#define ATA_SS_DET_PHY_ONLINE 0x00000003
+#define ATA_SS_DET_PHY_OFFLINE 0x00000004
+
+#define ATA_SS_SPD_MASK 0x000000f0
+#define ATA_SS_SPD_NO_SPEED 0x00000000
+#define ATA_SS_SPD_GEN1 0x00000010
+#define ATA_SS_SPD_GEN2 0x00000020
+#define ATA_SS_SPD_GEN3 0x00000040
+
+#define ATA_SS_IPM_MASK 0x00000f00
+#define ATA_SS_IPM_NO_DEVICE 0x00000000
+#define ATA_SS_IPM_ACTIVE 0x00000100
+#define ATA_SS_IPM_PARTIAL 0x00000200
+#define ATA_SS_IPM_SLUMBER 0x00000600
+
+#define ATA_SERROR 14
+#define ATA_SE_DATA_CORRECTED 0x00000001
+#define ATA_SE_COMM_CORRECTED 0x00000002
+#define ATA_SE_DATA_ERR 0x00000100
+#define ATA_SE_COMM_ERR 0x00000200
+#define ATA_SE_PROT_ERR 0x00000400
+#define ATA_SE_HOST_ERR 0x00000800
+#define ATA_SE_PHY_CHANGED 0x00010000
+#define ATA_SE_PHY_IERROR 0x00020000
+#define ATA_SE_COMM_WAKE 0x00040000
+#define ATA_SE_DECODE_ERR 0x00080000
+#define ATA_SE_PARITY_ERR 0x00100000
+#define ATA_SE_CRC_ERR 0x00200000
+#define ATA_SE_HANDSHAKE_ERR 0x00400000
+#define ATA_SE_LINKSEQ_ERR 0x00800000
+#define ATA_SE_TRANSPORT_ERR 0x01000000
+#define ATA_SE_UNKNOWN_FIS 0x02000000
+#define ATA_SE_EXCHANGED 0x04000000
+
+#define ATA_SCONTROL 15
+#define ATA_SC_DET_MASK 0x0000000f
+#define ATA_SC_DET_IDLE 0x00000000
+#define ATA_SC_DET_RESET 0x00000001
+#define ATA_SC_DET_DISABLE 0x00000004
+
+#define ATA_SC_SPD_MASK 0x000000f0
+#define ATA_SC_SPD_NO_SPEED 0x00000000
+#define ATA_SC_SPD_SPEED_GEN1 0x00000010
+#define ATA_SC_SPD_SPEED_GEN2 0x00000020
+#define ATA_SC_SPD_SPEED_GEN3 0x00000040
+
+#define ATA_SC_IPM_MASK 0x00000f00
+#define ATA_SC_IPM_NONE 0x00000000
+#define ATA_SC_IPM_DIS_PARTIAL 0x00000100
+#define ATA_SC_IPM_DIS_SLUMBER 0x00000200
+
+#define ATA_SACTIVE 16
+
+#define AHCI_MAX_PORTS 32
+#define AHCI_MAX_SLOTS 32
+
+/* SATA AHCI v1.0 register defines */
+#define AHCI_CAP 0x00
+#define AHCI_CAP_NPMASK 0x0000001f
+#define AHCI_CAP_SXS 0x00000020
+#define AHCI_CAP_EMS 0x00000040
+#define AHCI_CAP_CCCS 0x00000080
+#define AHCI_CAP_NCS 0x00001F00
+#define AHCI_CAP_NCS_SHIFT 8
+#define AHCI_CAP_PSC 0x00002000
+#define AHCI_CAP_SSC 0x00004000
+#define AHCI_CAP_PMD 0x00008000
+#define AHCI_CAP_FBSS 0x00010000
+#define AHCI_CAP_SPM 0x00020000
+#define AHCI_CAP_SAM 0x00080000
+#define AHCI_CAP_ISS 0x00F00000
+#define AHCI_CAP_ISS_SHIFT 20
+#define AHCI_CAP_SCLO 0x01000000
+#define AHCI_CAP_SAL 0x02000000
+#define AHCI_CAP_SALP 0x04000000
+#define AHCI_CAP_SSS 0x08000000
+#define AHCI_CAP_SMPS 0x10000000
+#define AHCI_CAP_SSNTF 0x20000000
+#define AHCI_CAP_SNCQ 0x40000000
+#define AHCI_CAP_64BIT 0x80000000
+
+#define AHCI_GHC 0x04
+#define AHCI_GHC_AE 0x80000000
+#define AHCI_GHC_MRSM 0x00000004
+#define AHCI_GHC_IE 0x00000002
+#define AHCI_GHC_HR 0x00000001
+
+#define AHCI_IS 0x08
+#define AHCI_PI 0x0c
+#define AHCI_VS 0x10
+
+#define AHCI_CCCC 0x14
+#define AHCI_CCCC_TV_MASK 0xffff0000
+#define AHCI_CCCC_TV_SHIFT 16
+#define AHCI_CCCC_CC_MASK 0x0000ff00
+#define AHCI_CCCC_CC_SHIFT 8
+#define AHCI_CCCC_INT_MASK 0x000000f8
+#define AHCI_CCCC_INT_SHIFT 3
+#define AHCI_CCCC_EN 0x00000001
+#define AHCI_CCCP 0x18
+
+#define AHCI_EM_LOC 0x1C
+#define AHCI_EM_CTL 0x20
+#define AHCI_EM_MR 0x00000001
+#define AHCI_EM_TM 0x00000100
+#define AHCI_EM_RST 0x00000200
+#define AHCI_EM_LED 0x00010000
+#define AHCI_EM_SAFTE 0x00020000
+#define AHCI_EM_SES2 0x00040000
+#define AHCI_EM_SGPIO 0x00080000
+#define AHCI_EM_SMB 0x01000000
+#define AHCI_EM_XMT 0x02000000
+#define AHCI_EM_ALHD 0x04000000
+#define AHCI_EM_PM 0x08000000
+
+#define AHCI_CAP2 0x24
+#define AHCI_CAP2_BOH 0x00000001
+#define AHCI_CAP2_NVMP 0x00000002
+#define AHCI_CAP2_APST 0x00000004
+
+#define AHCI_OFFSET 0x100
+#define AHCI_STEP 0x80
+
+#define AHCI_P_CLB 0x00
+#define AHCI_P_CLBU 0x04
+#define AHCI_P_FB 0x08
+#define AHCI_P_FBU 0x0c
+#define AHCI_P_IS 0x10
+#define AHCI_P_IE 0x14
+#define AHCI_P_IX_DHR 0x00000001
+#define AHCI_P_IX_PS 0x00000002
+#define AHCI_P_IX_DS 0x00000004
+#define AHCI_P_IX_SDB 0x00000008
+#define AHCI_P_IX_UF 0x00000010
+#define AHCI_P_IX_DP 0x00000020
+#define AHCI_P_IX_PC 0x00000040
+#define AHCI_P_IX_MP 0x00000080
+
+#define AHCI_P_IX_PRC 0x00400000
+#define AHCI_P_IX_IPM 0x00800000
+#define AHCI_P_IX_OF 0x01000000
+#define AHCI_P_IX_INF 0x04000000
+#define AHCI_P_IX_IF 0x08000000
+#define AHCI_P_IX_HBD 0x10000000
+#define AHCI_P_IX_HBF 0x20000000
+#define AHCI_P_IX_TFE 0x40000000
+#define AHCI_P_IX_CPD 0x80000000
+
+#define AHCI_P_CMD 0x18
+#define AHCI_P_CMD_ST 0x00000001
+#define AHCI_P_CMD_SUD 0x00000002
+#define AHCI_P_CMD_POD 0x00000004
+#define AHCI_P_CMD_CLO 0x00000008
+#define AHCI_P_CMD_FRE 0x00000010
+#define AHCI_P_CMD_CCS_MASK 0x00001f00
+#define AHCI_P_CMD_CCS_SHIFT 8
+#define AHCI_P_CMD_ISS 0x00002000
+#define AHCI_P_CMD_FR 0x00004000
+#define AHCI_P_CMD_CR 0x00008000
+#define AHCI_P_CMD_CPS 0x00010000
+#define AHCI_P_CMD_PMA 0x00020000
+#define AHCI_P_CMD_HPCP 0x00040000
+#define AHCI_P_CMD_MPSP 0x00080000
+#define AHCI_P_CMD_CPD 0x00100000
+#define AHCI_P_CMD_ESP 0x00200000
+#define AHCI_P_CMD_FBSCP 0x00400000
+#define AHCI_P_CMD_APSTE 0x00800000
+#define AHCI_P_CMD_ATAPI 0x01000000
+#define AHCI_P_CMD_DLAE 0x02000000
+#define AHCI_P_CMD_ALPE 0x04000000
+#define AHCI_P_CMD_ASP 0x08000000
+#define AHCI_P_CMD_ICC_MASK 0xf0000000
+#define AHCI_P_CMD_NOOP 0x00000000
+#define AHCI_P_CMD_ACTIVE 0x10000000
+#define AHCI_P_CMD_PARTIAL 0x20000000
+#define AHCI_P_CMD_SLUMBER 0x60000000
+
+#define AHCI_P_TFD 0x20
+#define AHCI_P_SIG 0x24
+#define AHCI_P_SSTS 0x28
+#define AHCI_P_SCTL 0x2c
+#define AHCI_P_SERR 0x30
+#define AHCI_P_SACT 0x34
+#define AHCI_P_CI 0x38
+#define AHCI_P_SNTF 0x3C
+#define AHCI_P_FBS 0x40
+#define AHCI_P_FBS_EN 0x00000001
+#define AHCI_P_FBS_DEC 0x00000002
+#define AHCI_P_FBS_SDE 0x00000004
+#define AHCI_P_FBS_DEV 0x00000f00
+#define AHCI_P_FBS_DEV_SHIFT 8
+#define AHCI_P_FBS_ADO 0x0000f000
+#define AHCI_P_FBS_ADO_SHIFT 12
+#define AHCI_P_FBS_DWE 0x000f0000
+#define AHCI_P_FBS_DWE_SHIFT 16
+
+/* Just to be sure, if building as module. */
+#if MAXPHYS < 512 * 1024
+#undef MAXPHYS
+#define MAXPHYS 512 * 1024
+#endif
+/* Pessimistic prognosis on number of required S/G entries */
+#define AHCI_SG_ENTRIES (roundup(btoc(MAXPHYS) + 1, 8))
+/* Command list. 32 commands. First, 1Kbyte aligned. */
+#define AHCI_CL_OFFSET 0
+#define AHCI_CL_SIZE 32
+/* Command tables. Up to 32 commands, Each, 128byte aligned. */
+#define AHCI_CT_OFFSET (AHCI_CL_OFFSET + AHCI_CL_SIZE * AHCI_MAX_SLOTS)
+#define AHCI_CT_SIZE (128 + AHCI_SG_ENTRIES * 16)
+/* Total main work area. */
+#define AHCI_WORK_SIZE (AHCI_CT_OFFSET + AHCI_CT_SIZE * ch->numslots)
+
+#endif /* _AHCI_H_ */
diff --git a/usr/src/cmd/bhyve/amd64/Makefile b/usr/src/cmd/bhyve/amd64/Makefile
new file mode 100644
index 0000000000..13cdae6663
--- /dev/null
+++ b/usr/src/cmd/bhyve/amd64/Makefile
@@ -0,0 +1,21 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2015 Pluribus Networks Inc.
+#
+
+include ../Makefile.com
+include ../../Makefile.cmd.64
+
+CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64
+
+install: all $(ROOTUSRSBINPROG64)
diff --git a/usr/src/cmd/bhyve/atkbdc.c b/usr/src/cmd/bhyve/atkbdc.c
new file mode 100644
index 0000000000..4d09d88266
--- /dev/null
+++ b/usr/src/cmd/bhyve/atkbdc.c
@@ -0,0 +1,576 @@
+/*-
+ * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2015 Nahanni Systems Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/atkbdc.c 267611 2014-06-18 17:20:02Z neel $");
+
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+
+#include <vmmapi.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "acpi.h"
+#include "inout.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+#include "ps2kbd.h"
+#include "ps2mouse.h"
+
+#define KBD_DATA_PORT 0x60
+
+#define KBD_STS_CTL_PORT 0x64
+
+#define KBDC_RESET 0xfe
+
+#define KBD_DEV_IRQ 1
+#define AUX_DEV_IRQ 12
+
+/* controller commands */
+#define KBDC_SET_COMMAND_BYTE 0x60
+#define KBDC_GET_COMMAND_BYTE 0x20
+#define KBDC_DISABLE_AUX_PORT 0xa7
+#define KBDC_ENABLE_AUX_PORT 0xa8
+#define KBDC_TEST_AUX_PORT 0xa9
+#define KBDC_TEST_CTRL 0xaa
+#define KBDC_TEST_KBD_PORT 0xab
+#define KBDC_DISABLE_KBD_PORT 0xad
+#define KBDC_ENABLE_KBD_PORT 0xae
+#define KBDC_READ_INPORT 0xc0
+#define KBDC_READ_OUTPORT 0xd0
+#define KBDC_WRITE_OUTPORT 0xd1
+#define KBDC_WRITE_KBD_OUTBUF 0xd2
+#define KBDC_WRITE_AUX_OUTBUF 0xd3
+#define KBDC_WRITE_TO_AUX 0xd4
+
+/* controller command byte (set by KBDC_SET_COMMAND_BYTE) */
+#define KBD_TRANSLATION 0x40
+#define KBD_SYS_FLAG_BIT 0x04
+#define KBD_DISABLE_KBD_PORT 0x10
+#define KBD_DISABLE_AUX_PORT 0x20
+#define KBD_ENABLE_AUX_INT 0x02
+#define KBD_ENABLE_KBD_INT 0x01
+#define KBD_KBD_CONTROL_BITS (KBD_DISABLE_KBD_PORT | KBD_ENABLE_KBD_INT)
+#define KBD_AUX_CONTROL_BITS (KBD_DISABLE_AUX_PORT | KBD_ENABLE_AUX_INT)
+
+/* controller status bits */
+#define KBDS_KBD_BUFFER_FULL 0x01
+#define KBDS_SYS_FLAG 0x04
+#define KBDS_CTRL_FLAG 0x08
+#define KBDS_AUX_BUFFER_FULL 0x20
+
+/* controller output port */
+#define KBDO_KBD_OUTFULL 0x10
+#define KBDO_AUX_OUTFULL 0x20
+
+#define RAMSZ 32
+
+struct kbd_dev {
+ bool irq_active;
+ int irq;
+
+ uint8_t buffer;
+};
+
+struct aux_dev {
+ bool irq_active;
+ int irq;
+
+ uint8_t buffer;
+};
+
+struct atkbdc_softc {
+ struct vmctx *ctx;
+ pthread_mutex_t mtx;
+
+ struct ps2kbd_softc *ps2kbd_sc;
+ struct ps2mouse_softc *ps2mouse_sc;
+
+ uint8_t status; /* status register */
+ uint8_t outport; /* controller output port */
+ uint8_t ram[RAMSZ]; /* byte0 = controller config */
+
+ uint32_t curcmd; /* current command for next byte */
+
+ struct kbd_dev kbd;
+ struct aux_dev aux;
+};
+
+static void
+atkbdc_assert_kbd_intr(struct atkbdc_softc *sc)
+{
+ if (!sc->kbd.irq_active &&
+ (sc->ram[0] & KBD_ENABLE_KBD_INT) != 0) {
+ sc->kbd.irq_active = true;
+ vm_isa_assert_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq);
+ }
+}
+
+static void
+atkbdc_deassert_kbd_intr(struct atkbdc_softc *sc)
+{
+ if (sc->kbd.irq_active) {
+ vm_isa_deassert_irq(sc->ctx, sc->kbd.irq, sc->kbd.irq);
+ sc->kbd.irq_active = false;
+ }
+}
+
+static void
+atkbdc_assert_aux_intr(struct atkbdc_softc *sc)
+{
+ if (!sc->aux.irq_active &&
+ (sc->ram[0] & KBD_ENABLE_AUX_INT) != 0) {
+ sc->aux.irq_active = true;
+ vm_isa_assert_irq(sc->ctx, sc->aux.irq, sc->aux.irq);
+ }
+}
+
+static void
+atkbdc_deassert_aux_intr(struct atkbdc_softc *sc)
+{
+ if (sc->aux.irq_active) {
+ vm_isa_deassert_irq(sc->ctx, sc->aux.irq, sc->aux.irq);
+ sc->aux.irq_active = false;
+ }
+}
+
+static void
+atkbdc_aux_queue_data(struct atkbdc_softc *sc, uint8_t val)
+{
+ assert(pthread_mutex_isowned_np(&sc->mtx));
+
+ sc->aux.buffer = val;
+ sc->status |= (KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL);
+ sc->outport |= KBDO_AUX_OUTFULL;
+ atkbdc_assert_aux_intr(sc);
+}
+
+static void
+atkbdc_kbd_queue_data(struct atkbdc_softc *sc, uint8_t val)
+{
+ assert(pthread_mutex_isowned_np(&sc->mtx));
+
+ sc->kbd.buffer = val;
+ sc->status |= KBDS_KBD_BUFFER_FULL;
+ sc->outport |= KBDO_KBD_OUTFULL;
+ atkbdc_assert_kbd_intr(sc);
+}
+
+static void
+atkbdc_aux_read(struct atkbdc_softc *sc)
+{
+ uint8_t val;
+
+ assert(pthread_mutex_isowned_np(&sc->mtx));
+
+ if (ps2mouse_read(sc->ps2mouse_sc, &val) != -1)
+ atkbdc_aux_queue_data(sc, val);
+}
+
+static void
+atkbdc_kbd_read(struct atkbdc_softc *sc)
+{
+ const uint8_t translation[256] = {
+ 0xff, 0x43, 0x41, 0x3f, 0x3d, 0x3b, 0x3c, 0x58,
+ 0x64, 0x44, 0x42, 0x40, 0x3e, 0x0f, 0x29, 0x59,
+ 0x65, 0x38, 0x2a, 0x70, 0x1d, 0x10, 0x02, 0x5a,
+ 0x66, 0x71, 0x2c, 0x1f, 0x1e, 0x11, 0x03, 0x5b,
+ 0x67, 0x2e, 0x2d, 0x20, 0x12, 0x05, 0x04, 0x5c,
+ 0x68, 0x39, 0x2f, 0x21, 0x14, 0x13, 0x06, 0x5d,
+ 0x69, 0x31, 0x30, 0x23, 0x22, 0x15, 0x07, 0x5e,
+ 0x6a, 0x72, 0x32, 0x24, 0x16, 0x08, 0x09, 0x5f,
+ 0x6b, 0x33, 0x25, 0x17, 0x18, 0x0b, 0x0a, 0x60,
+ 0x6c, 0x34, 0x35, 0x26, 0x27, 0x19, 0x0c, 0x61,
+ 0x6d, 0x73, 0x28, 0x74, 0x1a, 0x0d, 0x62, 0x6e,
+ 0x3a, 0x36, 0x1c, 0x1b, 0x75, 0x2b, 0x63, 0x76,
+ 0x55, 0x56, 0x77, 0x78, 0x79, 0x7a, 0x0e, 0x7b,
+ 0x7c, 0x4f, 0x7d, 0x4b, 0x47, 0x7e, 0x7f, 0x6f,
+ 0x52, 0x53, 0x50, 0x4c, 0x4d, 0x48, 0x01, 0x45,
+ 0x57, 0x4e, 0x51, 0x4a, 0x37, 0x49, 0x46, 0x54,
+ 0x80, 0x81, 0x82, 0x41, 0x54, 0x85, 0x86, 0x87,
+ 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f,
+ 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
+ 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7,
+ 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf,
+ 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf,
+ 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7,
+ 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7,
+ 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef,
+ 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff
+ };
+ uint8_t val;
+ uint8_t release = 0;
+
+ assert(pthread_mutex_isowned_np(&sc->mtx));
+
+ if (sc->ram[0] & KBD_TRANSLATION) {
+ while (ps2kbd_read(sc->ps2kbd_sc, &val) != -1) {
+ if (val == 0xf0) {
+ release = 0x80;
+ continue;
+ } else {
+ val = translation[val] | release;
+ }
+
+ atkbdc_kbd_queue_data(sc, val);
+ break;
+ }
+ } else {
+ if (ps2kbd_read(sc->ps2kbd_sc, &val) != -1)
+ atkbdc_kbd_queue_data(sc, val);
+ }
+}
+
+static void
+atkbdc_aux_poll(struct atkbdc_softc *sc)
+{
+ if ((sc->outport & KBDO_AUX_OUTFULL) == 0)
+ atkbdc_aux_read(sc);
+}
+
+static void
+atkbdc_kbd_poll(struct atkbdc_softc *sc)
+{
+ assert(pthread_mutex_isowned_np(&sc->mtx));
+
+ if ((sc->outport & KBDO_KBD_OUTFULL) == 0)
+ atkbdc_kbd_read(sc);
+}
+
+static void
+atkbdc_poll(struct atkbdc_softc *sc)
+{
+ atkbdc_aux_poll(sc);
+ atkbdc_kbd_poll(sc);
+}
+
+static void
+atkbdc_dequeue_data(struct atkbdc_softc *sc, uint8_t *buf)
+{
+ assert(pthread_mutex_isowned_np(&sc->mtx));
+
+ if (sc->outport & KBDO_AUX_OUTFULL) {
+ *buf = sc->aux.buffer;
+ sc->status &= ~(KBDS_AUX_BUFFER_FULL | KBDS_KBD_BUFFER_FULL);
+ sc->outport &= ~KBDO_AUX_OUTFULL;
+ atkbdc_deassert_aux_intr(sc);
+
+ atkbdc_poll(sc);
+ return;
+ }
+
+ *buf = sc->kbd.buffer;
+ sc->status &= ~KBDS_KBD_BUFFER_FULL;
+ sc->outport &= ~KBDO_KBD_OUTFULL;
+ atkbdc_deassert_kbd_intr(sc);
+
+ atkbdc_poll(sc);
+}
+
+static int
+atkbdc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ struct atkbdc_softc *sc;
+ uint8_t buf;
+ int retval;
+
+ if (bytes != 1)
+ return (-1);
+
+ sc = arg;
+ retval = 0;
+
+ pthread_mutex_lock(&sc->mtx);
+ if (in) {
+ sc->curcmd = 0;
+ sc->status &= ~KBDS_CTRL_FLAG;
+
+ /* read device buffer; includes kbd cmd responses */
+ atkbdc_dequeue_data(sc, &buf);
+ *eax = buf;
+
+ pthread_mutex_unlock(&sc->mtx);
+ return (retval);
+ }
+
+ if (sc->status & KBDS_CTRL_FLAG) {
+ /*
+ * Command byte for the controller.
+ */
+ switch (sc->curcmd) {
+ case KBDC_SET_COMMAND_BYTE:
+ sc->ram[0] = *eax;
+ if (sc->ram[0] & KBD_SYS_FLAG_BIT)
+ sc->status |= KBDS_SYS_FLAG;
+ else
+ sc->status &= KBDS_SYS_FLAG;
+ if (sc->outport & KBDO_AUX_OUTFULL)
+ atkbdc_assert_aux_intr(sc);
+ else if (sc->outport & KBDO_KBD_OUTFULL)
+ atkbdc_assert_kbd_intr(sc);
+ break;
+ case KBDC_WRITE_OUTPORT:
+ sc->outport = *eax;
+ if (sc->outport & KBDO_AUX_OUTFULL)
+ sc->status |= (KBDS_AUX_BUFFER_FULL |
+ KBDS_KBD_BUFFER_FULL);
+ if (sc->outport & KBDO_KBD_OUTFULL)
+ sc->status |= KBDS_KBD_BUFFER_FULL;
+ break;
+ case KBDC_WRITE_TO_AUX:
+ ps2mouse_write(sc->ps2mouse_sc, *eax);
+ atkbdc_poll(sc);
+ break;
+ case KBDC_WRITE_KBD_OUTBUF:
+ atkbdc_kbd_queue_data(sc, *eax);
+ break;
+ case KBDC_WRITE_AUX_OUTBUF:
+ atkbdc_aux_queue_data(sc, *eax);
+ break;
+ default:
+ /* write to particular RAM byte */
+ if (sc->curcmd >= 0x61 && sc->curcmd <= 0x7f) {
+ int byten;
+
+ byten = (sc->curcmd - 0x60) & 0x1f;
+ sc->ram[byten] = *eax & 0xff;
+ }
+ break;
+ }
+
+ sc->curcmd = 0;
+ sc->status &= ~KBDS_CTRL_FLAG;
+
+ pthread_mutex_unlock(&sc->mtx);
+ return (retval);
+ }
+
+ /*
+ * Data byte for the device.
+ */
+ ps2kbd_write(sc->ps2kbd_sc, *eax);
+ atkbdc_poll(sc);
+
+ pthread_mutex_unlock(&sc->mtx);
+
+ return (retval);
+}
+
+
+static int
+atkbdc_sts_ctl_handler(struct vmctx *ctx, int vcpu, int in, int port,
+ int bytes, uint32_t *eax, void *arg)
+{
+ struct atkbdc_softc *sc;
+ int error, retval;
+
+ if (bytes != 1)
+ return (-1);
+
+ sc = arg;
+ retval = 0;
+
+ pthread_mutex_lock(&sc->mtx);
+
+ if (in) {
+ /* read status register */
+ *eax = sc->status;
+ pthread_mutex_unlock(&sc->mtx);
+ return (retval);
+ }
+
+ sc->curcmd = 0;
+ sc->status |= KBDS_CTRL_FLAG;
+
+ switch (*eax) {
+ case KBDC_GET_COMMAND_BYTE:
+ atkbdc_kbd_queue_data(sc, sc->ram[0]);
+ break;
+ case KBDC_TEST_CTRL:
+ atkbdc_kbd_queue_data(sc, 0x55);
+ break;
+ case KBDC_TEST_AUX_PORT:
+ case KBDC_TEST_KBD_PORT:
+ atkbdc_kbd_queue_data(sc, 0);
+ break;
+ case KBDC_READ_INPORT:
+ atkbdc_kbd_queue_data(sc, 0);
+ break;
+ case KBDC_READ_OUTPORT:
+ atkbdc_kbd_queue_data(sc, sc->outport);
+ break;
+ case KBDC_SET_COMMAND_BYTE:
+ case KBDC_WRITE_OUTPORT:
+ case KBDC_WRITE_KBD_OUTBUF:
+ case KBDC_WRITE_AUX_OUTBUF:
+ sc->curcmd = *eax;
+ break;
+ case KBDC_DISABLE_KBD_PORT:
+ sc->ram[0] |= KBD_DISABLE_KBD_PORT;
+ break;
+ case KBDC_ENABLE_KBD_PORT:
+ sc->ram[0] &= ~KBD_DISABLE_KBD_PORT;
+ atkbdc_poll(sc);
+ break;
+ case KBDC_WRITE_TO_AUX:
+ sc->curcmd = *eax;
+ break;
+ case KBDC_DISABLE_AUX_PORT:
+ sc->ram[0] |= KBD_DISABLE_AUX_PORT;
+ break;
+ case KBDC_ENABLE_AUX_PORT:
+ sc->ram[0] &= ~KBD_DISABLE_AUX_PORT;
+ break;
+ case KBDC_RESET: /* Pulse "reset" line */
+#ifdef __FreeBSD__
+ error = vm_suspend(ctx, VM_SUSPEND_RESET);
+ assert(error == 0 || errno == EALREADY);
+#else
+ exit(0);
+#endif
+ break;
+ default:
+ if (*eax >= 0x21 && *eax <= 0x3f) {
+ /* read "byte N" from RAM */
+ int byten;
+
+ byten = (*eax - 0x20) & 0x1f;
+ atkbdc_kbd_queue_data(sc, sc->ram[byten]);
+ }
+ break;
+ }
+
+ pthread_mutex_unlock(&sc->mtx);
+
+ return (retval);
+}
+
+void
+atkbdc_event(struct atkbdc_softc *sc)
+{
+ pthread_mutex_lock(&sc->mtx);
+ atkbdc_poll(sc);
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+void
+atkbdc_init(struct vmctx *ctx)
+{
+ struct inout_port iop;
+ struct atkbdc_softc *sc;
+ int error;
+
+ sc = calloc(1, sizeof(struct atkbdc_softc));
+ sc->ctx = ctx;
+
+ pthread_mutex_init(&sc->mtx, NULL);
+
+ bzero(&iop, sizeof(struct inout_port));
+ iop.name = "atkdbc";
+ iop.port = KBD_STS_CTL_PORT;
+ iop.size = 1;
+ iop.flags = IOPORT_F_INOUT;
+ iop.handler = atkbdc_sts_ctl_handler;
+ iop.arg = sc;
+
+ error = register_inout(&iop);
+ assert(error == 0);
+
+ bzero(&iop, sizeof(struct inout_port));
+ iop.name = "atkdbc";
+ iop.port = KBD_DATA_PORT;
+ iop.size = 1;
+ iop.flags = IOPORT_F_INOUT;
+ iop.handler = atkbdc_data_handler;
+ iop.arg = sc;
+
+ error = register_inout(&iop);
+ assert(error == 0);
+
+ pci_irq_reserve(KBD_DEV_IRQ);
+ sc->kbd.irq = KBD_DEV_IRQ;
+
+ pci_irq_reserve(AUX_DEV_IRQ);
+ sc->aux.irq = AUX_DEV_IRQ;
+
+ sc->ps2kbd_sc = ps2kbd_init(sc);
+ sc->ps2mouse_sc = ps2mouse_init(sc);
+}
+
+#ifdef __FreeBSD__
+static void
+atkbdc_dsdt(void)
+{
+
+ dsdt_line("");
+ dsdt_line("Device (KBD)");
+ dsdt_line("{");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0303\"))");
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ dsdt_indent(2);
+ dsdt_fixed_ioport(KBD_DATA_PORT, 1);
+ dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1);
+ dsdt_fixed_irq(1);
+ dsdt_unindent(2);
+ dsdt_line(" })");
+ dsdt_line("}");
+
+ dsdt_line("");
+ dsdt_line("Device (MOU)");
+ dsdt_line("{");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0F13\"))");
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ dsdt_indent(2);
+ dsdt_fixed_ioport(KBD_DATA_PORT, 1);
+ dsdt_fixed_ioport(KBD_STS_CTL_PORT, 1);
+ dsdt_fixed_irq(12);
+ dsdt_unindent(2);
+ dsdt_line(" })");
+ dsdt_line("}");
+}
+LPC_DSDT(atkbdc_dsdt);
+#endif
diff --git a/usr/src/cmd/bhyve/atkbdc.h b/usr/src/cmd/bhyve/atkbdc.h
new file mode 100644
index 0000000000..48b3a8b00c
--- /dev/null
+++ b/usr/src/cmd/bhyve/atkbdc.h
@@ -0,0 +1,38 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _ATKBDC_H_
+#define _ATKBDC_H_
+
+struct atkbdc_softc;
+struct vmctx;
+
+void atkbdc_init(struct vmctx *ctx);
+void atkbdc_event(struct atkbdc_softc *sc);
+
+#endif /* _ATKBDC_H_ */
diff --git a/usr/src/cmd/bhyve/bhyve_sol_glue.c b/usr/src/cmd/bhyve/bhyve_sol_glue.c
new file mode 100644
index 0000000000..633faacc5f
--- /dev/null
+++ b/usr/src/cmd/bhyve/bhyve_sol_glue.c
@@ -0,0 +1,86 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#include <sys/uio.h>
+
+#include <termios.h>
+#include <unistd.h>
+
+/*
+ * Make a pre-existing termios structure into "raw" mode: character-at-a-time
+ * mode with no characters interpreted, 8-bit data path.
+ */
+void
+cfmakeraw(struct termios *t)
+{
+ t->c_iflag &= ~(IMAXBEL|IXOFF|INPCK|BRKINT|PARMRK|ISTRIP|INLCR|IGNCR|ICRNL|IXON|IGNPAR);
+ t->c_iflag |= IGNBRK;
+ t->c_oflag &= ~OPOST;
+ t->c_lflag &= ~(ECHO|ECHOE|ECHOK|ECHONL|ICANON|ISIG|IEXTEN|NOFLSH|TOSTOP |PENDIN);
+ t->c_cflag &= ~(CSIZE|PARENB);
+ t->c_cflag |= CS8|CREAD;
+ t->c_cc[VMIN] = 1;
+ t->c_cc[VTIME] = 0;
+}
+
+ssize_t
+preadv(int d, const struct iovec *iov, int iovcnt, off_t offset)
+{
+ off_t old_offset;
+ ssize_t n;
+
+ old_offset = lseek(d, (off_t)0, SEEK_CUR);
+ if (old_offset == -1)
+ return (-1);
+
+ offset = lseek(d, offset, SEEK_SET);
+ if (offset == -1)
+ return (-1);
+
+ n = readv(d, iov, iovcnt);
+ if (n == -1)
+ return (-1);
+
+ offset = lseek(d, old_offset, SEEK_SET);
+ if (offset == -1)
+ return (-1);
+
+ return (n);
+}
+
+ssize_t
+pwritev(int d, const struct iovec *iov, int iovcnt, off_t offset)
+{
+ off_t old_offset;
+ ssize_t n;
+
+ old_offset = lseek(d, (off_t)0, SEEK_CUR);
+ if (old_offset == -1)
+ return (-1);
+
+ offset = lseek(d, offset, SEEK_SET);
+ if (offset == -1)
+ return (-1);
+
+ n = writev(d, iov, iovcnt);
+ if (n == -1)
+ return (-1);
+
+ offset = lseek(d, old_offset, SEEK_SET);
+ if (offset == -1)
+ return (-1);
+
+ return (n);
+}
diff --git a/usr/src/cmd/bhyve/bhyvegc.c b/usr/src/cmd/bhyve/bhyvegc.c
new file mode 100644
index 0000000000..7a13c4c83f
--- /dev/null
+++ b/usr/src/cmd/bhyve/bhyvegc.c
@@ -0,0 +1,78 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include "bhyvegc.h"
+
+struct bhyvegc {
+ struct bhyvegc_image *gc_image;
+};
+
+struct bhyvegc *
+bhyvegc_init(int width, int height)
+{
+ struct bhyvegc *gc;
+ struct bhyvegc_image *gc_image;
+
+ gc = calloc(1, sizeof (struct bhyvegc));
+
+ gc_image = calloc(1, sizeof(struct bhyvegc_image));
+ gc_image->width = width;
+ gc_image->height = height;
+ gc_image->data = calloc(width * height, sizeof (uint32_t));
+
+ gc->gc_image = gc_image;
+
+ return (gc);
+}
+
+void
+bhyvegc_resize(struct bhyvegc *gc, int width, int height)
+{
+ struct bhyvegc_image *gc_image;
+
+ gc_image = gc->gc_image;
+
+ gc_image->width = width;
+ gc_image->height = height;
+ gc_image->data = realloc(gc_image->data,
+ sizeof (uint32_t) * width * height);
+ memset(gc_image->data, 0, width * height * sizeof (uint32_t));
+}
+
+struct bhyvegc_image *
+bhyvegc_get_image(struct bhyvegc *gc)
+{
+ return (gc->gc_image);
+}
diff --git a/usr/src/cmd/bhyve/bhyvegc.h b/usr/src/cmd/bhyve/bhyvegc.h
new file mode 100644
index 0000000000..19648f98af
--- /dev/null
+++ b/usr/src/cmd/bhyve/bhyvegc.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _BHYVEGC_H_
+#define _BHYVEGC_H_
+
+struct bhyvegc;
+
+struct bhyvegc_image {
+ int width;
+ int height;
+ uint32_t *data;
+};
+
+struct bhyvegc *bhyvegc_init(int width, int height);
+void bhyvegc_resize(struct bhyvegc *gc, int width, int height);
+struct bhyvegc_image *bhyvegc_get_image(struct bhyvegc *gc);
+
+#endif /* _BHYVEGC_H_ */
diff --git a/usr/src/cmd/bhyve/bhyverun.c b/usr/src/cmd/bhyve/bhyverun.c
new file mode 100644
index 0000000000..b985a2286e
--- /dev/null
+++ b/usr/src/cmd/bhyve/bhyverun.c
@@ -0,0 +1,820 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/bhyverun.c 281611 2015-04-16 20:11:49Z neel $");
+
+#include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/time.h>
+
+#include <machine/segments.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <err.h>
+#include <libgen.h>
+#include <unistd.h>
+#include <assert.h>
+#include <errno.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <sysexits.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "acpi.h"
+#include "atkbdc.h"
+#include "console.h"
+#include "inout.h"
+#include "dbgport.h"
+#include "ioapic.h"
+#include "mem.h"
+#ifdef __FreeBSD__
+#include "mevent.h"
+#endif
+#include "mptbl.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+#include "smbiostbl.h"
+#include "xmsr.h"
+#include "spinup_ap.h"
+#include "rfb.h"
+#include "rtc.h"
+#include "vga.h"
+
+#define GUEST_NIO_PORT 0x488 /* guest upcalls via i/o port */
+
+#define MB (1024UL * 1024)
+#define GB (1024UL * MB)
+
+typedef int (*vmexit_handler_t)(struct vmctx *, struct vm_exit *, int *vcpu);
+
+char *vmname;
+
+int guest_ncpus;
+char *guest_uuid_str;
+
+static int guest_vmexit_on_hlt, guest_vmexit_on_pause;
+static int virtio_msix = 1;
+static int x2apic_mode = 0; /* default is xAPIC */
+
+static int strictio;
+static int strictmsr = 1;
+
+#ifdef __FreeBSD__
+static int acpi;
+#endif
+
+static char *progname;
+static const int BSP = 0;
+
+#ifndef __FreeBSD__
+int bcons_wait = 0;
+int bcons_connected = 0;
+pthread_mutex_t bcons_wait_lock = PTHREAD_MUTEX_INITIALIZER;
+pthread_cond_t bcons_wait_done = PTHREAD_COND_INITIALIZER;
+#endif
+
+static cpuset_t cpumask;
+
+static void vm_loop(struct vmctx *ctx, int vcpu, uint64_t rip);
+
+static struct vm_exit vmexit[VM_MAXCPU];
+
+struct bhyvestats {
+ uint64_t vmexit_bogus;
+ uint64_t vmexit_bogus_switch;
+ uint64_t vmexit_hlt;
+ uint64_t vmexit_pause;
+ uint64_t vmexit_mtrap;
+ uint64_t vmexit_inst_emul;
+ uint64_t cpu_switch_rotate;
+ uint64_t cpu_switch_direct;
+ int io_reset;
+} stats;
+
+struct mt_vmm_info {
+ pthread_t mt_thr;
+ struct vmctx *mt_ctx;
+ int mt_vcpu;
+} mt_vmm_info[VM_MAXCPU];
+
+static void
+usage(int code)
+{
+
+#ifdef __FreeBSD__
+ fprintf(stderr,
+ "Usage: %s [-aehwAHIPW] [-g <gdb port>] [-s <pci>] [-c vcpus]\n"
+ " %*s [-p vcpu:hostcpu] [-m mem] [-l <lpc>] <vm>\n"
+ " -a: local apic is in xAPIC mode (deprecated)\n"
+ " -A: create an ACPI table\n"
+ " -g: gdb port\n"
+ " -c: # cpus (default 1)\n"
+ " -C: include guest memory in core file\n"
+ " -p: pin 'vcpu' to 'hostcpu'\n"
+ " -H: vmexit from the guest on hlt\n"
+ " -P: vmexit from the guest on pause\n"
+ " -W: force virtio to use single-vector MSI\n"
+ " -e: exit on unhandled I/O access\n"
+ " -h: help\n"
+ " -s: <slot,driver,configinfo> PCI slot config\n"
+ " -l: LPC device configuration\n"
+ " -m: memory size in MB\n"
+ " -w: ignore unimplemented MSRs\n"
+ " -x: local apic is in x2APIC mode\n"
+ " -Y: disable MPtable generation\n"
+ " -U: uuid\n",
+ progname, (int)strlen(progname), "");
+#else
+ fprintf(stderr,
+ "Usage: %s [-ehwHPW] [-s <pci>] [-c vcpus]\n"
+ " %*s [-p vcpu:hostcpu] [-m mem] [-l <lpc>] <vm>\n"
+ " -c: # cpus (default 1)\n"
+ " -H: vmexit from the guest on hlt\n"
+ " -P: vmexit from the guest on pause\n"
+ " -W: force virtio to use single-vector MSI\n"
+ " -e: exit on unhandled I/O access\n"
+ " -h: help\n"
+ " -s: <slot,driver,configinfo> PCI slot config\n"
+ " -l: LPC device configuration\n"
+ " -m: memory size in MB\n"
+ " -w: ignore unimplemented MSRs\n"
+ " -Y: disable MPtable generation\n"
+ " -U: uuid\n",
+ progname, (int)strlen(progname), "");
+#endif
+
+ exit(code);
+}
+
+void
+vm_inject_fault(void *arg, int vcpu, int vector, int errcode_valid,
+ int errcode)
+{
+ struct vmctx *ctx;
+ int error, restart_instruction;
+
+ ctx = arg;
+ restart_instruction = 1;
+
+ error = vm_inject_exception(ctx, vcpu, vector, errcode_valid, errcode,
+ restart_instruction);
+ assert(error == 0);
+}
+
+void *
+paddr_guest2host(struct vmctx *ctx, uintptr_t gaddr, size_t len)
+{
+
+ return (vm_map_gpa(ctx, gaddr, len));
+}
+
+int
+fbsdrun_vmexit_on_pause(void)
+{
+
+ return (guest_vmexit_on_pause);
+}
+
+int
+fbsdrun_vmexit_on_hlt(void)
+{
+
+ return (guest_vmexit_on_hlt);
+}
+
+int
+fbsdrun_virtio_msix(void)
+{
+
+ return (virtio_msix);
+}
+
+static void *
+fbsdrun_start_thread(void *param)
+{
+ char tname[MAXCOMLEN + 1];
+ struct mt_vmm_info *mtp;
+ int vcpu;
+
+ mtp = param;
+ vcpu = mtp->mt_vcpu;
+
+ snprintf(tname, sizeof(tname), "vcpu %d", vcpu);
+ pthread_set_name_np(mtp->mt_thr, tname);
+
+ vm_loop(mtp->mt_ctx, vcpu, vmexit[vcpu].rip);
+
+ /* not reached */
+ exit(1);
+ return (NULL);
+}
+
+void
+fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip)
+{
+ int error;
+
+ assert(fromcpu == BSP);
+
+ /*
+ * The 'newcpu' must be activated in the context of 'fromcpu'. If
+ * vm_activate_cpu() is delayed until newcpu's pthread starts running
+ * then vmm.ko is out-of-sync with bhyve and this can create a race
+ * with vm_suspend().
+ */
+ error = vm_activate_cpu(ctx, newcpu);
+ assert(error == 0);
+
+ CPU_SET_ATOMIC(newcpu, &cpumask);
+
+ /*
+ * Set up the vmexit struct to allow execution to start
+ * at the given RIP
+ */
+ vmexit[newcpu].rip = rip;
+ vmexit[newcpu].inst_length = 0;
+
+ mt_vmm_info[newcpu].mt_ctx = ctx;
+ mt_vmm_info[newcpu].mt_vcpu = newcpu;
+
+ error = pthread_create(&mt_vmm_info[newcpu].mt_thr, NULL,
+ fbsdrun_start_thread, &mt_vmm_info[newcpu]);
+ assert(error == 0);
+}
+
+static int
+vmexit_handle_notify(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu,
+ uint32_t eax)
+{
+#if BHYVE_DEBUG
+ /*
+ * put guest-driven debug here
+ */
+#endif
+ return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_inout(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int error;
+ int bytes, port, in, out, string;
+ int vcpu;
+
+ vcpu = *pvcpu;
+
+ port = vme->u.inout.port;
+ bytes = vme->u.inout.bytes;
+ string = vme->u.inout.string;
+ in = vme->u.inout.in;
+ out = !in;
+
+ /* Extra-special case of host notifications */
+ if (out && port == GUEST_NIO_PORT) {
+ error = vmexit_handle_notify(ctx, vme, pvcpu, vme->u.inout.eax);
+ return (error);
+ }
+
+ error = emulate_inout(ctx, vcpu, vme, strictio);
+ if (error) {
+ fprintf(stderr, "Unhandled %s%c 0x%04x at 0x%lx\n",
+ in ? "in" : "out",
+ bytes == 1 ? 'b' : (bytes == 2 ? 'w' : 'l'),
+ port, vmexit->rip);
+ return (VMEXIT_ABORT);
+ } else {
+ return (VMEXIT_CONTINUE);
+ }
+}
+
+static int
+vmexit_rdmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ uint64_t val;
+ uint32_t eax, edx;
+ int error;
+
+ val = 0;
+ error = emulate_rdmsr(ctx, *pvcpu, vme->u.msr.code, &val);
+ if (error != 0) {
+ fprintf(stderr, "rdmsr to register %#x on vcpu %d\n",
+ vme->u.msr.code, *pvcpu);
+ if (strictmsr) {
+ vm_inject_gp(ctx, *pvcpu);
+ return (VMEXIT_CONTINUE);
+ }
+ }
+
+ eax = val;
+ error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RAX, eax);
+ assert(error == 0);
+
+ edx = val >> 32;
+ error = vm_set_register(ctx, *pvcpu, VM_REG_GUEST_RDX, edx);
+ assert(error == 0);
+
+ return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_wrmsr(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int error;
+
+ error = emulate_wrmsr(ctx, *pvcpu, vme->u.msr.code, vme->u.msr.wval);
+ if (error != 0) {
+ fprintf(stderr, "wrmsr to register %#x(%#lx) on vcpu %d\n",
+ vme->u.msr.code, vme->u.msr.wval, *pvcpu);
+ if (strictmsr) {
+ vm_inject_gp(ctx, *pvcpu);
+ return (VMEXIT_CONTINUE);
+ }
+ }
+ return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_spinup_ap(struct vmctx *ctx, struct vm_exit *vme, int *pvcpu)
+{
+ int newcpu;
+ int retval = VMEXIT_CONTINUE;
+
+ newcpu = spinup_ap(ctx, *pvcpu,
+ vme->u.spinup_ap.vcpu, vme->u.spinup_ap.rip);
+
+ return (retval);
+}
+
+static int
+vmexit_vmx(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+ fprintf(stderr, "vm exit[%d]\n", *pvcpu);
+ fprintf(stderr, "\treason\t\tVMX\n");
+ fprintf(stderr, "\trip\t\t0x%016lx\n", vmexit->rip);
+ fprintf(stderr, "\tinst_length\t%d\n", vmexit->inst_length);
+ fprintf(stderr, "\tstatus\t\t%d\n", vmexit->u.vmx.status);
+ fprintf(stderr, "\texit_reason\t%u\n", vmexit->u.vmx.exit_reason);
+ fprintf(stderr, "\tqualification\t0x%016lx\n",
+ vmexit->u.vmx.exit_qualification);
+ fprintf(stderr, "\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
+ fprintf(stderr, "\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
+
+ return (VMEXIT_ABORT);
+}
+
+static int
+vmexit_bogus(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+ stats.vmexit_bogus++;
+
+ return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_hlt(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+ stats.vmexit_hlt++;
+
+ /*
+ * Just continue execution with the next instruction. We use
+ * the HLT VM exit as a way to be friendly with the host
+ * scheduler.
+ */
+ return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_pause(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+ stats.vmexit_pause++;
+
+ return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_mtrap(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+
+ stats.vmexit_mtrap++;
+
+ return (VMEXIT_CONTINUE);
+}
+
+static int
+vmexit_inst_emul(struct vmctx *ctx, struct vm_exit *vmexit, int *pvcpu)
+{
+ int err, i;
+ struct vie *vie;
+
+ stats.vmexit_inst_emul++;
+
+ vie = &vmexit->u.inst_emul.vie;
+ err = emulate_mem(ctx, *pvcpu, vmexit->u.inst_emul.gpa,
+ vie, &vmexit->u.inst_emul.paging);
+
+ if (err) {
+ if (err == ESRCH) {
+ fprintf(stderr, "Unhandled memory access to 0x%lx\n",
+ vmexit->u.inst_emul.gpa);
+ }
+
+ fprintf(stderr, "Failed to emulate instruction [");
+ for (i = 0; i < vie->num_valid; i++) {
+ fprintf(stderr, "0x%02x%s", vie->inst[i],
+ i != (vie->num_valid - 1) ? " " : "");
+ }
+ fprintf(stderr, "] at 0x%lx\n", vmexit->rip);
+ return (VMEXIT_ABORT);
+ }
+
+ return (VMEXIT_CONTINUE);
+}
+
+static vmexit_handler_t handler[VM_EXITCODE_MAX] = {
+ [VM_EXITCODE_INOUT] = vmexit_inout,
+ [VM_EXITCODE_VMX] = vmexit_vmx,
+ [VM_EXITCODE_BOGUS] = vmexit_bogus,
+ [VM_EXITCODE_RDMSR] = vmexit_rdmsr,
+ [VM_EXITCODE_WRMSR] = vmexit_wrmsr,
+ [VM_EXITCODE_MTRAP] = vmexit_mtrap,
+ [VM_EXITCODE_INST_EMUL] = vmexit_inst_emul,
+ [VM_EXITCODE_SPINUP_AP] = vmexit_spinup_ap,
+};
+
+static void
+vm_loop(struct vmctx *ctx, int vcpu, uint64_t startrip)
+{
+#ifdef __FreeBSD__
+ cpuset_t mask;
+#endif
+ int error, rc, prevcpu;
+ enum vm_exitcode exitcode;
+
+#ifdef __FreeBSD__
+ if (pincpu >= 0) {
+ CPU_ZERO(&mask);
+ CPU_SET(pincpu + vcpu, &mask);
+ error = pthread_setaffinity_np(pthread_self(),
+ sizeof(mask), &mask);
+ assert(error == 0);
+ }
+#endif
+
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, startrip);
+ assert(error == 0);
+
+ while (1) {
+ error = vm_run(ctx, vcpu, &vmexit[vcpu]);
+ if (error != 0)
+ break;
+
+ prevcpu = vcpu;
+
+ exitcode = vmexit[vcpu].exitcode;
+ if (exitcode >= VM_EXITCODE_MAX || handler[exitcode] == NULL) {
+ fprintf(stderr, "vm_loop: unexpected exitcode 0x%x\n",
+ exitcode);
+ exit(1);
+ }
+
+ rc = (*handler[exitcode])(ctx, &vmexit[vcpu], &vcpu);
+
+ switch (rc) {
+ case VMEXIT_CONTINUE:
+ break;
+ case VMEXIT_ABORT:
+ abort();
+ default:
+ exit(1);
+ }
+ }
+ fprintf(stderr, "vm_run error %d, errno %d\n", error, errno);
+}
+
+static int
+num_vcpus_allowed(struct vmctx *ctx)
+{
+ int tmp, error;
+
+ error = vm_get_capability(ctx, BSP, VM_CAP_UNRESTRICTED_GUEST, &tmp);
+
+ /*
+ * The guest is allowed to spinup more than one processor only if the
+ * UNRESTRICTED_GUEST capability is available.
+ */
+ if (error == 0)
+ return (VM_MAXCPU);
+ else
+ return (1);
+}
+
+void
+fbsdrun_set_capabilities(struct vmctx *ctx, int cpu)
+{
+ int err, tmp;
+
+ if (fbsdrun_vmexit_on_hlt()) {
+ err = vm_get_capability(ctx, cpu, VM_CAP_HALT_EXIT, &tmp);
+ if (err < 0) {
+ fprintf(stderr, "VM exit on HLT not supported\n");
+ exit(1);
+ }
+ vm_set_capability(ctx, cpu, VM_CAP_HALT_EXIT, 1);
+ if (cpu == BSP)
+ handler[VM_EXITCODE_HLT] = vmexit_hlt;
+ }
+
+ if (fbsdrun_vmexit_on_pause()) {
+ /*
+ * pause exit support required for this mode
+ */
+ err = vm_get_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, &tmp);
+ if (err < 0) {
+ fprintf(stderr,
+ "SMP mux requested, no pause support\n");
+ exit(1);
+ }
+ vm_set_capability(ctx, cpu, VM_CAP_PAUSE_EXIT, 1);
+ if (cpu == BSP)
+ handler[VM_EXITCODE_PAUSE] = vmexit_pause;
+ }
+
+ if (x2apic_mode)
+ err = vm_set_x2apic_state(ctx, cpu, X2APIC_ENABLED);
+ else
+ err = vm_set_x2apic_state(ctx, cpu, X2APIC_DISABLED);
+
+ if (err) {
+ fprintf(stderr, "Unable to set x2apic state (%d)\n", err);
+ exit(1);
+ }
+
+#ifdef __FreeBSD__
+ vm_set_capability(ctx, cpu, VM_CAP_ENABLE_INVPCID, 1);
+#endif
+}
+
+int
+main(int argc, char *argv[])
+{
+ int c, error, gdb_port, rfb_port, err, bvmcons;
+ int max_vcpus;
+ struct vmctx *ctx;
+ uint64_t rip;
+ size_t memsize;
+
+ bvmcons = 0;
+ progname = basename(argv[0]);
+ gdb_port = 0;
+ rfb_port = -1;
+ guest_ncpus = 1;
+ memsize = 256 * MB;
+
+
+#ifdef __FreeBSD__
+ while ((c = getopt(argc, argv, "abehwxACHIPWYp:r:g:c:s:m:l:U:")) != -1) {
+#else
+ while ((c = getopt(argc, argv, "abehwxHIPWYr:c:s:m:l:U:")) != -1) {
+#endif
+ switch (c) {
+ case 'a':
+ x2apic_mode = 0;
+ break;
+#ifdef __FreeBSD__
+ case 'A':
+ acpi = 1;
+ break;
+#endif
+ case 'b':
+ bvmcons = 1;
+ break;
+#ifdef __FreeBSD__
+ case 'p':
+ pincpu = atoi(optarg);
+ break;
+#endif
+ case 'r':
+ if (optarg[0] == ':')
+ rfb_port = atoi(optarg + 1) + RFB_PORT;
+ else
+ rfb_port = atoi(optarg);
+ break;
+ case 'c':
+ guest_ncpus = atoi(optarg);
+ break;
+#ifdef __FreeBSD__
+ case 'g':
+ gdb_port = atoi(optarg);
+ break;
+#endif
+ case 'l':
+ if (lpc_device_parse(optarg) != 0) {
+ errx(EX_USAGE, "invalid lpc device "
+ "configuration '%s'", optarg);
+ }
+ break;
+ case 's':
+ if (pci_parse_slot(optarg) != 0)
+ exit(1);
+ else
+ break;
+ case 'm':
+ error = vm_parse_memsize(optarg, &memsize);
+ if (error)
+ errx(EX_USAGE, "invalid memsize '%s'", optarg);
+ break;
+ case 'H':
+ guest_vmexit_on_hlt = 1;
+ break;
+ case 'I':
+ /*
+ * The "-I" option was used to add an ioapic to the
+ * virtual machine.
+ *
+ * An ioapic is now provided unconditionally for each
+ * virtual machine and this option is now deprecated.
+ */
+ break;
+ case 'P':
+ guest_vmexit_on_pause = 1;
+ break;
+ case 'e':
+ strictio = 1;
+ break;
+ case 'U':
+ guest_uuid_str = optarg;
+ break;
+ case 'W':
+ virtio_msix = 0;
+ break;
+ case 'x':
+ x2apic_mode = 1;
+ break;
+ case 'h':
+ usage(0);
+ default:
+ usage(1);
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1)
+ usage(1);
+
+ vmname = argv[0];
+
+ ctx = vm_open(vmname);
+ if (ctx == NULL) {
+ perror("vm_open");
+ exit(1);
+ }
+
+ max_vcpus = num_vcpus_allowed(ctx);
+ if (guest_ncpus > max_vcpus) {
+ fprintf(stderr, "%d vCPUs requested but only %d available\n",
+ guest_ncpus, max_vcpus);
+ exit(1);
+ }
+
+ fbsdrun_set_capabilities(ctx, BSP);
+
+ err = vm_setup_memory(ctx, memsize, VM_MMAP_ALL);
+ if (err) {
+ fprintf(stderr, "Unable to setup memory (%d)\n", err);
+ exit(1);
+ }
+
+ error = init_msr();
+ if (error) {
+ fprintf(stderr, "init_msr error %d", error);
+ exit(1);
+ }
+
+ init_mem();
+ init_inout();
+ atkbdc_init(ctx);
+ pci_irq_init(ctx);
+ ioapic_init(ctx);
+
+ rtc_init(ctx);
+
+ /*
+ * Exit if a device emulation finds an error in it's initilization
+ */
+ if (init_pci(ctx) != 0)
+ exit(1);
+
+#ifdef __FreeBSD__
+ if (gdb_port != 0)
+ init_dbgport(gdb_port);
+#endif
+
+ if (bvmcons)
+ init_bvmcons();
+
+ console_init();
+ vga_init();
+ if (rfb_port != -1)
+ rfb_init(rfb_port);
+
+ error = vm_get_register(ctx, BSP, VM_REG_GUEST_RIP, &rip);
+ assert(error == 0);
+
+ /*
+ * build the guest tables, MP etc.
+ */
+ mptable_build(ctx, guest_ncpus);
+
+ error = smbios_build(ctx);
+ assert(error == 0);
+
+#ifdef __FreeBSD__
+ if (acpi) {
+ error = acpi_build(ctx, guest_ncpus);
+ assert(error == 0);
+ }
+
+ /*
+ * Change the proc title to include the VM name.
+ */
+ setproctitle("%s", vmname);
+#else
+ /*
+ * If applicable, wait for bhyveconsole
+ */
+ if (bcons_wait) {
+ printf("Waiting for bhyveconsole connection...\n");
+ (void) pthread_mutex_lock(&bcons_wait_lock);
+ while (!bcons_connected) {
+ (void) pthread_cond_wait(&bcons_wait_done,
+ &bcons_wait_lock);
+ }
+ (void) pthread_mutex_unlock(&bcons_wait_lock);
+ }
+#endif
+
+ /*
+ * Add CPU 0
+ */
+ fbsdrun_addcpu(ctx, BSP, BSP, rip);
+
+ /*
+ * Head off to the main event dispatch loop
+ */
+#ifdef __FreeBSD__
+ mevent_dispatch();
+#else
+ pthread_exit(NULL);
+#endif
+
+ exit(1);
+}
diff --git a/usr/src/cmd/bhyve/bhyverun.h b/usr/src/cmd/bhyve/bhyverun.h
new file mode 100644
index 0000000000..be89314c09
--- /dev/null
+++ b/usr/src/cmd/bhyve/bhyverun.h
@@ -0,0 +1,73 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/bhyverun.h 277310 2015-01-18 03:08:30Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#ifndef _FBSDRUN_H_
+#define _FBSDRUN_H_
+
+#ifndef CTASSERT /* Allow lint to override */
+#define CTASSERT(x) _CTASSERT(x, __LINE__)
+#define _CTASSERT(x, y) __CTASSERT(x, y)
+#define __CTASSERT(x, y) typedef char __assert ## y[(x) ? 1 : -1]
+#endif
+
+#define VMEXIT_CONTINUE (0)
+#define VMEXIT_ABORT (-1)
+
+struct vmctx;
+extern int guest_ncpus;
+extern char *guest_uuid_str;
+extern char *vmname;
+#ifndef __FreeBSD__
+extern int bcons_wait;
+extern int bcons_connected;
+extern pthread_mutex_t bcons_wait_lock;
+extern pthread_cond_t bcons_wait_done;
+#endif
+
+void *paddr_guest2host(struct vmctx *ctx, uintptr_t addr, size_t len);
+
+void fbsdrun_set_capabilities(struct vmctx *ctx, int cpu);
+void fbsdrun_addcpu(struct vmctx *ctx, int fromcpu, int newcpu, uint64_t rip);
+int fbsdrun_muxed(void);
+int fbsdrun_vmexit_on_hlt(void);
+int fbsdrun_vmexit_on_pause(void);
+int fbsdrun_disable_x2apic(void);
+int fbsdrun_virtio_msix(void);
+#endif
diff --git a/usr/src/cmd/bhyve/block_if.c b/usr/src/cmd/bhyve/block_if.c
new file mode 100644
index 0000000000..2da946d420
--- /dev/null
+++ b/usr/src/cmd/bhyve/block_if.c
@@ -0,0 +1,625 @@
+/*-
+ * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z tychon $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/block_if.c 274330 2014-11-09 21:08:52Z tychon $");
+
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/errno.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+
+#include <assert.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <signal.h>
+#include <unistd.h>
+
+#include <machine/atomic.h>
+
+#include "bhyverun.h"
+#ifdef __FreeBSD__
+#include "mevent.h"
+#endif
+#include "block_if.h"
+
+#define BLOCKIF_SIG 0xb109b109
+
+#define BLOCKIF_MAXREQ 33
+
+enum blockop {
+ BOP_READ,
+ BOP_WRITE,
+ BOP_FLUSH
+};
+
+enum blockstat {
+ BST_FREE,
+ BST_PEND,
+ BST_BUSY,
+ BST_DONE
+};
+
+struct blockif_elem {
+ TAILQ_ENTRY(blockif_elem) be_link;
+ struct blockif_req *be_req;
+ enum blockop be_op;
+ enum blockstat be_status;
+ pthread_t be_tid;
+};
+
+struct blockif_ctxt {
+ int bc_magic;
+ int bc_fd;
+ int bc_rdonly;
+ off_t bc_size;
+ int bc_sectsz;
+ pthread_t bc_btid;
+ pthread_mutex_t bc_mtx;
+ pthread_cond_t bc_cond;
+ int bc_closing;
+
+ /* Request elements and free/pending/busy queues */
+ TAILQ_HEAD(, blockif_elem) bc_freeq;
+ TAILQ_HEAD(, blockif_elem) bc_pendq;
+ TAILQ_HEAD(, blockif_elem) bc_busyq;
+ u_int bc_req_count;
+ struct blockif_elem bc_reqs[BLOCKIF_MAXREQ];
+};
+
+static pthread_once_t blockif_once = PTHREAD_ONCE_INIT;
+
+struct blockif_sig_elem {
+ pthread_mutex_t bse_mtx;
+ pthread_cond_t bse_cond;
+ int bse_pending;
+ struct blockif_sig_elem *bse_next;
+};
+
+static struct blockif_sig_elem *blockif_bse_head;
+
+static int
+blockif_enqueue(struct blockif_ctxt *bc, struct blockif_req *breq,
+ enum blockop op)
+{
+ struct blockif_elem *be;
+
+ assert(bc->bc_req_count < BLOCKIF_MAXREQ);
+
+ be = TAILQ_FIRST(&bc->bc_freeq);
+ assert(be != NULL);
+ assert(be->be_status == BST_FREE);
+
+ TAILQ_REMOVE(&bc->bc_freeq, be, be_link);
+ be->be_status = BST_PEND;
+ be->be_req = breq;
+ be->be_op = op;
+ TAILQ_INSERT_TAIL(&bc->bc_pendq, be, be_link);
+
+ bc->bc_req_count++;
+
+ return (0);
+}
+
+static int
+blockif_dequeue(struct blockif_ctxt *bc, struct blockif_elem **bep)
+{
+ struct blockif_elem *be;
+
+ if (bc->bc_req_count == 0)
+ return (ENOENT);
+
+ be = TAILQ_FIRST(&bc->bc_pendq);
+ assert(be != NULL);
+ assert(be->be_status == BST_PEND);
+ TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
+ be->be_status = BST_BUSY;
+ be->be_tid = bc->bc_btid;
+ TAILQ_INSERT_TAIL(&bc->bc_busyq, be, be_link);
+
+ *bep = be;
+
+ return (0);
+}
+
+static void
+blockif_complete(struct blockif_ctxt *bc, struct blockif_elem *be)
+{
+ assert(be->be_status == BST_DONE);
+
+ TAILQ_REMOVE(&bc->bc_busyq, be, be_link);
+ be->be_tid = 0;
+ be->be_status = BST_FREE;
+ be->be_req = NULL;
+ TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
+
+ bc->bc_req_count--;
+}
+
+static void
+blockif_proc(struct blockif_ctxt *bc, struct blockif_elem *be)
+{
+ struct blockif_req *br;
+ int err;
+
+ br = be->be_req;
+ err = 0;
+
+ switch (be->be_op) {
+ case BOP_READ:
+ if (preadv(bc->bc_fd, br->br_iov, br->br_iovcnt,
+ br->br_offset) < 0)
+ err = errno;
+ break;
+ case BOP_WRITE:
+ if (bc->bc_rdonly)
+ err = EROFS;
+ else if (pwritev(bc->bc_fd, br->br_iov, br->br_iovcnt,
+ br->br_offset) < 0)
+ err = errno;
+ break;
+ case BOP_FLUSH:
+ break;
+ default:
+ err = EINVAL;
+ break;
+ }
+
+ be->be_status = BST_DONE;
+
+ (*br->br_callback)(br, err);
+}
+
+static void *
+blockif_thr(void *arg)
+{
+ struct blockif_ctxt *bc;
+ struct blockif_elem *be;
+
+ bc = arg;
+
+ for (;;) {
+ pthread_mutex_lock(&bc->bc_mtx);
+ while (!blockif_dequeue(bc, &be)) {
+ pthread_mutex_unlock(&bc->bc_mtx);
+ blockif_proc(bc, be);
+ pthread_mutex_lock(&bc->bc_mtx);
+ blockif_complete(bc, be);
+ }
+ pthread_cond_wait(&bc->bc_cond, &bc->bc_mtx);
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ /*
+ * Check ctxt status here to see if exit requested
+ */
+ if (bc->bc_closing)
+ pthread_exit(NULL);
+ }
+
+ /* Not reached */
+ return (NULL);
+}
+
+#ifdef __FreeBSD__
+static void
+blockif_sigcont_handler(int signal, enum ev_type type, void *arg)
+#else
+static void
+blockif_sigcont_handler(int signal)
+#endif
+{
+ struct blockif_sig_elem *bse;
+
+ for (;;) {
+ /*
+ * Process the entire list even if not intended for
+ * this thread.
+ */
+ do {
+ bse = blockif_bse_head;
+ if (bse == NULL)
+ return;
+ } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
+ (uintptr_t)bse,
+ (uintptr_t)bse->bse_next));
+
+ pthread_mutex_lock(&bse->bse_mtx);
+ bse->bse_pending = 0;
+ pthread_cond_signal(&bse->bse_cond);
+ pthread_mutex_unlock(&bse->bse_mtx);
+ }
+}
+
+static void
+blockif_init(void)
+{
+#ifdef __FreeBSD__
+ mevent_add(SIGCONT, EVF_SIGNAL, blockif_sigcont_handler, NULL);
+ (void) signal(SIGCONT, SIG_IGN);
+#else
+ (void) sigset(SIGCONT, blockif_sigcont_handler);
+#endif
+}
+
+struct blockif_ctxt *
+blockif_open(const char *optstr, const char *ident)
+{
+ char tname[MAXCOMLEN + 1];
+ char *nopt, *xopts;
+ struct blockif_ctxt *bc;
+ struct stat sbuf;
+ off_t size;
+ int extra, fd, i, sectsz;
+ int nocache, sync, ro;
+
+ pthread_once(&blockif_once, blockif_init);
+
+ nocache = 0;
+ sync = 0;
+ ro = 0;
+
+ /*
+ * The first element in the optstring is always a pathname.
+ * Optional elements follow
+ */
+ nopt = strdup(optstr);
+ for (xopts = strtok(nopt, ",");
+ xopts != NULL;
+ xopts = strtok(NULL, ",")) {
+ if (!strcmp(xopts, "nocache"))
+ nocache = 1;
+ else if (!strcmp(xopts, "sync"))
+ sync = 1;
+ else if (!strcmp(xopts, "ro"))
+ ro = 1;
+ }
+
+ extra = 0;
+ if (nocache)
+ extra |= O_DIRECT;
+ if (sync)
+ extra |= O_SYNC;
+
+ fd = open(nopt, (ro ? O_RDONLY : O_RDWR) | extra);
+ if (fd < 0 && !ro) {
+ /* Attempt a r/w fail with a r/o open */
+ fd = open(nopt, O_RDONLY | extra);
+ ro = 1;
+ }
+
+ if (fd < 0) {
+ perror("Could not open backing file");
+ return (NULL);
+ }
+
+ if (fstat(fd, &sbuf) < 0) {
+ perror("Could not stat backing file");
+ close(fd);
+ return (NULL);
+ }
+
+ /*
+ * Deal with raw devices
+ */
+ size = sbuf.st_size;
+ sectsz = DEV_BSIZE;
+#ifdef __FreeBSD__
+ if (S_ISCHR(sbuf.st_mode)) {
+ if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
+ ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
+ perror("Could not fetch dev blk/sector size");
+ close(fd);
+ return (NULL);
+ }
+ assert(size != 0);
+ assert(sectsz != 0);
+ }
+#endif
+
+ bc = calloc(1, sizeof(struct blockif_ctxt));
+ if (bc == NULL) {
+ close(fd);
+ return (NULL);
+ }
+
+ bc->bc_magic = BLOCKIF_SIG;
+ bc->bc_fd = fd;
+ bc->bc_rdonly = ro;
+ bc->bc_size = size;
+ bc->bc_sectsz = sectsz;
+ pthread_mutex_init(&bc->bc_mtx, NULL);
+ pthread_cond_init(&bc->bc_cond, NULL);
+ TAILQ_INIT(&bc->bc_freeq);
+ TAILQ_INIT(&bc->bc_pendq);
+ TAILQ_INIT(&bc->bc_busyq);
+ bc->bc_req_count = 0;
+ for (i = 0; i < BLOCKIF_MAXREQ; i++) {
+ bc->bc_reqs[i].be_status = BST_FREE;
+ TAILQ_INSERT_HEAD(&bc->bc_freeq, &bc->bc_reqs[i], be_link);
+ }
+
+ pthread_create(&bc->bc_btid, NULL, blockif_thr, bc);
+
+ snprintf(tname, sizeof(tname), "blk-%s", ident);
+ pthread_set_name_np(bc->bc_btid, tname);
+
+ return (bc);
+}
+
+static int
+blockif_request(struct blockif_ctxt *bc, struct blockif_req *breq,
+ enum blockop op)
+{
+ int err;
+
+ err = 0;
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ if (bc->bc_req_count < BLOCKIF_MAXREQ) {
+ /*
+ * Enqueue and inform the block i/o thread
+ * that there is work available
+ */
+ blockif_enqueue(bc, breq, op);
+ pthread_cond_signal(&bc->bc_cond);
+ } else {
+ /*
+ * Callers are not allowed to enqueue more than
+ * the specified blockif queue limit. Return an
+ * error to indicate that the queue length has been
+ * exceeded.
+ */
+ err = E2BIG;
+ }
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ return (err);
+}
+
+int
+blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_READ));
+}
+
+int
+blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_WRITE));
+}
+
+int
+blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (blockif_request(bc, breq, BOP_FLUSH));
+}
+
+int
+blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq)
+{
+ struct blockif_elem *be;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ pthread_mutex_lock(&bc->bc_mtx);
+ /*
+ * Check pending requests.
+ */
+ TAILQ_FOREACH(be, &bc->bc_pendq, be_link) {
+ if (be->be_req == breq)
+ break;
+ }
+ if (be != NULL) {
+ /*
+ * Found it.
+ */
+ TAILQ_REMOVE(&bc->bc_pendq, be, be_link);
+ be->be_status = BST_FREE;
+ be->be_req = NULL;
+ TAILQ_INSERT_TAIL(&bc->bc_freeq, be, be_link);
+ bc->bc_req_count--;
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ return (0);
+ }
+
+ /*
+ * Check in-flight requests.
+ */
+ TAILQ_FOREACH(be, &bc->bc_busyq, be_link) {
+ if (be->be_req == breq)
+ break;
+ }
+ if (be == NULL) {
+ /*
+ * Didn't find it.
+ */
+ pthread_mutex_unlock(&bc->bc_mtx);
+ return (EINVAL);
+ }
+
+ /*
+ * Interrupt the processing thread to force it return
+ * prematurely via it's normal callback path.
+ */
+ while (be->be_status == BST_BUSY) {
+ struct blockif_sig_elem bse, *old_head;
+
+ pthread_mutex_init(&bse.bse_mtx, NULL);
+ pthread_cond_init(&bse.bse_cond, NULL);
+
+ bse.bse_pending = 1;
+
+ do {
+ old_head = blockif_bse_head;
+ bse.bse_next = old_head;
+ } while (!atomic_cmpset_ptr((uintptr_t *)&blockif_bse_head,
+ (uintptr_t)old_head,
+ (uintptr_t)&bse));
+
+ pthread_kill(be->be_tid, SIGCONT);
+
+ pthread_mutex_lock(&bse.bse_mtx);
+ while (bse.bse_pending)
+ pthread_cond_wait(&bse.bse_cond, &bse.bse_mtx);
+ pthread_mutex_unlock(&bse.bse_mtx);
+ }
+
+ pthread_mutex_unlock(&bc->bc_mtx);
+
+ /*
+ * The processing thread has been interrupted. Since it's not
+ * clear if the callback has been invoked yet, return EBUSY.
+ */
+ return (EBUSY);
+}
+
+int
+blockif_close(struct blockif_ctxt *bc)
+{
+ void *jval;
+ int err;
+
+ err = 0;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ /*
+ * Stop the block i/o thread
+ */
+ bc->bc_closing = 1;
+ pthread_cond_signal(&bc->bc_cond);
+ pthread_join(bc->bc_btid, &jval);
+
+ /* XXX Cancel queued i/o's ??? */
+
+ /*
+ * Release resources
+ */
+ bc->bc_magic = 0;
+ close(bc->bc_fd);
+ free(bc);
+
+ return (0);
+}
+
+/*
+ * Return virtual C/H/S values for a given block. Use the algorithm
+ * outlined in the VHD specification to calculate values.
+ */
+void
+blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h, uint8_t *s)
+{
+ off_t sectors; /* total sectors of the block dev */
+ off_t hcyl; /* cylinders times heads */
+ uint16_t secpt; /* sectors per track */
+ uint8_t heads;
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+
+ sectors = bc->bc_size / bc->bc_sectsz;
+
+ /* Clamp the size to the largest possible with CHS */
+ if (sectors > 65535UL*16*255)
+ sectors = 65535UL*16*255;
+
+ if (sectors >= 65536UL*16*63) {
+ secpt = 255;
+ heads = 16;
+ hcyl = sectors / secpt;
+ } else {
+ secpt = 17;
+ hcyl = sectors / secpt;
+ heads = (hcyl + 1023) / 1024;
+
+ if (heads < 4)
+ heads = 4;
+
+ if (hcyl >= (heads * 1024) || heads > 16) {
+ secpt = 31;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ if (hcyl >= (heads * 1024)) {
+ secpt = 63;
+ heads = 16;
+ hcyl = sectors / secpt;
+ }
+ }
+
+ *c = hcyl / heads;
+ *h = heads;
+ *s = secpt;
+}
+
+/*
+ * Accessors
+ */
+off_t
+blockif_size(struct blockif_ctxt *bc)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_size);
+}
+
+int
+blockif_sectsz(struct blockif_ctxt *bc)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_sectsz);
+}
+
+int
+blockif_queuesz(struct blockif_ctxt *bc)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (BLOCKIF_MAXREQ - 1);
+}
+
+int
+blockif_is_ro(struct blockif_ctxt *bc)
+{
+
+ assert(bc->bc_magic == BLOCKIF_SIG);
+ return (bc->bc_rdonly);
+}
diff --git a/usr/src/cmd/bhyve/block_if.h b/usr/src/cmd/bhyve/block_if.h
new file mode 100644
index 0000000000..5ef120933c
--- /dev/null
+++ b/usr/src/cmd/bhyve/block_if.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/block_if.h 268638 2014-07-15 00:25:54Z grehan $
+ */
+
+/*
+ * The block API to be used by bhyve block-device emulations. The routines
+ * are thread safe, with no assumptions about the context of the completion
+ * callback - it may occur in the caller's context, or asynchronously in
+ * another thread.
+ */
+
+#ifndef _BLOCK_IF_H_
+#define _BLOCK_IF_H_
+
+#include <sys/uio.h>
+#include <sys/unistd.h>
+
+#ifdef __FreeBSD__
+#define BLOCKIF_IOV_MAX 32 /* not practical to be IOV_MAX */
+#else
+#define BLOCKIF_IOV_MAX 16 /* not practical to be IOV_MAX */
+#endif
+
+struct blockif_req {
+ struct iovec br_iov[BLOCKIF_IOV_MAX];
+ int br_iovcnt;
+ off_t br_offset;
+ void (*br_callback)(struct blockif_req *req, int err);
+ void *br_param;
+};
+
+struct blockif_ctxt;
+struct blockif_ctxt *blockif_open(const char *optstr, const char *ident);
+off_t blockif_size(struct blockif_ctxt *bc);
+void blockif_chs(struct blockif_ctxt *bc, uint16_t *c, uint8_t *h,
+ uint8_t *s);
+int blockif_sectsz(struct blockif_ctxt *bc);
+int blockif_queuesz(struct blockif_ctxt *bc);
+int blockif_is_ro(struct blockif_ctxt *bc);
+int blockif_read(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_write(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_flush(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_cancel(struct blockif_ctxt *bc, struct blockif_req *breq);
+int blockif_close(struct blockif_ctxt *bc);
+
+#endif /* _BLOCK_IF_H_ */
diff --git a/usr/src/cmd/bhyve/console.c b/usr/src/cmd/bhyve/console.c
new file mode 100644
index 0000000000..a8d07709be
--- /dev/null
+++ b/usr/src/cmd/bhyve/console.c
@@ -0,0 +1,101 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include "bhyvegc.h"
+#include "console.h"
+
+static struct {
+ struct bhyvegc *gc;
+
+ fb_render_func_t fb_render_cb;
+ void *fb_arg;
+
+ kbd_event_func_t kbd_event_cb;
+ void *kbd_arg;
+
+ ptr_event_func_t ptr_event_cb;
+ void *ptr_arg;
+} console;
+
+void
+console_init(void)
+{
+ console.gc = bhyvegc_init(640, 400);
+}
+
+struct bhyvegc_image *
+console_get_image(void)
+{
+ struct bhyvegc_image *bhyvegc_image;
+
+ bhyvegc_image = bhyvegc_get_image(console.gc);
+
+ return (bhyvegc_image);
+}
+
+void
+console_fb_register(fb_render_func_t render_cb, void *arg)
+{
+ console.fb_render_cb = render_cb;
+ console.fb_arg = arg;
+}
+
+void
+console_refresh(void)
+{
+ (*console.fb_render_cb)(console.gc, console.fb_arg);
+}
+
+void
+console_kbd_register(kbd_event_func_t event_cb, void *arg)
+{
+ console.kbd_event_cb = event_cb;
+ console.kbd_arg = arg;
+}
+
+void
+console_ptr_register(ptr_event_func_t event_cb, void *arg)
+{
+ console.ptr_event_cb = event_cb;
+ console.ptr_arg = arg;
+}
+
+void
+console_key_event(int down, uint32_t keysym)
+{
+ (*console.kbd_event_cb)(down, keysym, console.kbd_arg);
+}
+
+void
+console_ptr_event(uint8_t button, int x, int y)
+{
+ (*console.ptr_event_cb)(button, x, y, console.ptr_arg);
+}
diff --git a/usr/src/cmd/bhyve/console.h b/usr/src/cmd/bhyve/console.h
new file mode 100644
index 0000000000..bffb7c2456
--- /dev/null
+++ b/usr/src/cmd/bhyve/console.h
@@ -0,0 +1,50 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _CONSOLE_H_
+#define _CONSOLE_H_
+
+struct bhyvegc;
+
+typedef void (*fb_render_func_t)(struct bhyvegc *gc, void *arg);
+typedef void (*kbd_event_func_t)(int down, uint32_t keysym, void *arg);
+typedef void (*ptr_event_func_t)(uint8_t mask, int x, int y, void *arg);
+
+void console_init(void);
+struct bhyvegc_image *console_get_image(void);
+
+void console_fb_register(fb_render_func_t render_cb, void *arg);
+void console_refresh(void);
+
+void console_kbd_register(kbd_event_func_t event_cb, void *arg);
+void console_key_event(int down, uint32_t keysym);
+
+void console_ptr_register(ptr_event_func_t event_cb, void *arg);
+void console_ptr_event(uint8_t button, int x, int y);
+
+#endif /* _CONSOLE_H_ */
diff --git a/usr/src/cmd/bhyve/consport.c b/usr/src/cmd/bhyve/consport.c
new file mode 100644
index 0000000000..69b6dfddf1
--- /dev/null
+++ b/usr/src/cmd/bhyve/consport.c
@@ -0,0 +1,155 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z jhb $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/consport.c 264277 2014-04-08 21:02:03Z jhb $");
+
+#include <sys/types.h>
+#include <sys/select.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <termios.h>
+#include <unistd.h>
+#include <stdbool.h>
+
+#include "inout.h"
+#include "pci_lpc.h"
+
+#define BVM_CONSOLE_PORT 0x220
+#define BVM_CONS_SIG ('b' << 8 | 'v')
+
+static struct termios tio_orig, tio_new;
+
+static void
+ttyclose(void)
+{
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
+}
+
+static void
+ttyopen(void)
+{
+#ifdef __FreeBSD__
+ tcgetattr(STDIN_FILENO, &tio_orig);
+
+ cfmakeraw(&tio_new);
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
+
+ atexit(ttyclose);
+#endif
+}
+
+static bool
+tty_char_available(void)
+{
+ fd_set rfds;
+ struct timeval tv;
+
+ FD_ZERO(&rfds);
+ FD_SET(STDIN_FILENO, &rfds);
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0) {
+ return (true);
+ } else {
+ return (false);
+ }
+}
+
+static int
+ttyread(void)
+{
+ char rb;
+
+ if (tty_char_available()) {
+ read(STDIN_FILENO, &rb, 1);
+ return (rb & 0xff);
+ } else {
+ return (-1);
+ }
+}
+
+static void
+ttywrite(unsigned char wb)
+{
+ (void) write(STDOUT_FILENO, &wb, 1);
+}
+
+static int
+console_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ static int opened;
+
+ if (bytes == 2 && in) {
+ *eax = BVM_CONS_SIG;
+ return (0);
+ }
+
+ /*
+ * Guests might probe this port to look for old ISA devices
+ * using single-byte reads. Return 0xff for those.
+ */
+ if (bytes == 1 && in) {
+ *eax = 0xff;
+ return (0);
+ }
+
+ if (bytes != 4)
+ return (-1);
+
+ if (!opened) {
+ ttyopen();
+ opened = 1;
+ }
+
+ if (in)
+ *eax = ttyread();
+ else
+ ttywrite(*eax);
+
+ return (0);
+}
+
+SYSRES_IO(BVM_CONSOLE_PORT, 4);
+
+static struct inout_port consport = {
+ "bvmcons",
+ BVM_CONSOLE_PORT,
+ 1,
+ IOPORT_F_INOUT,
+ console_handler
+};
+
+void
+init_bvmcons(void)
+{
+
+ register_inout(&consport);
+}
diff --git a/usr/src/cmd/bhyve/dbgport.h b/usr/src/cmd/bhyve/dbgport.h
new file mode 100644
index 0000000000..b95df0bd31
--- /dev/null
+++ b/usr/src/cmd/bhyve/dbgport.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/dbgport.h 256156 2013-10-08 16:36:17Z neel $
+ */
+
+#ifndef _DBGPORT_H_
+#define _DBGPORT_H_
+
+void init_dbgport(int port);
+
+#endif
diff --git a/usr/src/cmd/bhyve/inout.c b/usr/src/cmd/bhyve/inout.c
new file mode 100644
index 0000000000..510649893a
--- /dev/null
+++ b/usr/src/cmd/bhyve/inout.c
@@ -0,0 +1,297 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/inout.c 277310 2015-01-18 03:08:30Z neel $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/inout.c 277310 2015-01-18 03:08:30Z neel $");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/_iovec.h>
+#include <sys/mman.h>
+
+#include <x86/psl.h>
+#include <x86/segments.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
+#include <vmmapi.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <assert.h>
+
+#include "bhyverun.h"
+#include "inout.h"
+
+SET_DECLARE(inout_port_set, struct inout_port);
+
+#define MAX_IOPORTS (1 << 16)
+
+#define VERIFY_IOPORT(port, size) \
+ assert((port) >= 0 && (size) > 0 && ((port) + (size)) <= MAX_IOPORTS)
+
+static struct {
+ const char *name;
+ int flags;
+ inout_func_t handler;
+ void *arg;
+} inout_handlers[MAX_IOPORTS];
+
+static int
+default_inout(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ if (in) {
+ switch (bytes) {
+ case 4:
+ *eax = 0xffffffff;
+ break;
+ case 2:
+ *eax = 0xffff;
+ break;
+ case 1:
+ *eax = 0xff;
+ break;
+ }
+ }
+
+ return (0);
+}
+
+static void
+register_default_iohandler(int start, int size)
+{
+ struct inout_port iop;
+
+ VERIFY_IOPORT(start, size);
+
+ bzero(&iop, sizeof(iop));
+ iop.name = "default";
+ iop.port = start;
+ iop.size = size;
+ iop.flags = IOPORT_F_INOUT | IOPORT_F_DEFAULT;
+ iop.handler = default_inout;
+
+ register_inout(&iop);
+}
+
+int
+emulate_inout(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit, int strict)
+{
+ int addrsize, bytes, flags, in, port, prot, rep;
+ uint32_t eax, val;
+ inout_func_t handler;
+ void *arg;
+ int error, retval;
+ enum vm_reg_name idxreg;
+ uint64_t gla, index, iterations, count;
+ struct vm_inout_str *vis;
+ struct iovec iov[2];
+
+ bytes = vmexit->u.inout.bytes;
+ in = vmexit->u.inout.in;
+ port = vmexit->u.inout.port;
+
+ assert(port < MAX_IOPORTS);
+ assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+ handler = inout_handlers[port].handler;
+
+ if (strict && handler == default_inout)
+ return (-1);
+
+ flags = inout_handlers[port].flags;
+ arg = inout_handlers[port].arg;
+
+ if (in) {
+ if (!(flags & IOPORT_F_IN))
+ return (-1);
+ } else {
+ if (!(flags & IOPORT_F_OUT))
+ return (-1);
+ }
+
+ retval = 0;
+ if (vmexit->u.inout.string) {
+ vis = &vmexit->u.inout_str;
+ rep = vis->inout.rep;
+ addrsize = vis->addrsize;
+ prot = in ? PROT_WRITE : PROT_READ;
+ assert(addrsize == 2 || addrsize == 4 || addrsize == 8);
+
+ /* Index register */
+ idxreg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
+ index = vis->index & vie_size2mask(addrsize);
+
+ /* Count register */
+ count = vis->count & vie_size2mask(addrsize);
+
+ /* Limit number of back-to-back in/out emulations to 16 */
+ iterations = MIN(count, 16);
+ while (iterations > 0) {
+ assert(retval == 0);
+ if (vie_calculate_gla(vis->paging.cpu_mode,
+ vis->seg_name, &vis->seg_desc, index, bytes,
+ addrsize, prot, &gla)) {
+ vm_inject_gp(ctx, vcpu);
+ break;
+ }
+
+ error = vm_copy_setup(ctx, vcpu, &vis->paging, gla,
+ bytes, prot, iov, nitems(iov));
+ if (error == -1) {
+ retval = -1; /* Unrecoverable error */
+ break;
+ } else if (error == 1) {
+ retval = 0; /* Resume guest to handle fault */
+ break;
+ }
+
+ if (vie_alignment_check(vis->paging.cpl, bytes,
+ vis->cr0, vis->rflags, gla)) {
+ vm_inject_ac(ctx, vcpu, 0);
+ break;
+ }
+
+ val = 0;
+ if (!in)
+ vm_copyin(ctx, vcpu, iov, &val, bytes);
+
+ retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
+ if (retval != 0)
+ break;
+
+ if (in)
+ vm_copyout(ctx, vcpu, &val, iov, bytes);
+
+ /* Update index */
+ if (vis->rflags & PSL_D)
+ index -= bytes;
+ else
+ index += bytes;
+
+ count--;
+ iterations--;
+ }
+
+ /* Update index register */
+ error = vie_update_register(ctx, vcpu, idxreg, index, addrsize);
+ assert(error == 0);
+
+ /*
+ * Update count register only if the instruction had a repeat
+ * prefix.
+ */
+ if (rep) {
+ error = vie_update_register(ctx, vcpu, VM_REG_GUEST_RCX,
+ count, addrsize);
+ assert(error == 0);
+ }
+
+ /* Restart the instruction if more iterations remain */
+ if (retval == 0 && count != 0) {
+ error = vm_restart_instruction(ctx, vcpu);
+ assert(error == 0);
+ }
+ } else {
+ eax = vmexit->u.inout.eax;
+ val = eax & vie_size2mask(bytes);
+ retval = handler(ctx, vcpu, in, port, bytes, &val, arg);
+ if (retval == 0 && in) {
+ eax &= ~vie_size2mask(bytes);
+ eax |= val & vie_size2mask(bytes);
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX,
+ eax);
+ assert(error == 0);
+ }
+ }
+ return (retval);
+}
+
+void
+init_inout(void)
+{
+ struct inout_port **iopp, *iop;
+
+ /*
+ * Set up the default handler for all ports
+ */
+ register_default_iohandler(0, MAX_IOPORTS);
+
+ /*
+ * Overwrite with specified handlers
+ */
+ SET_FOREACH(iopp, inout_port_set) {
+ iop = *iopp;
+ assert(iop->port < MAX_IOPORTS);
+ inout_handlers[iop->port].name = iop->name;
+ inout_handlers[iop->port].flags = iop->flags;
+ inout_handlers[iop->port].handler = iop->handler;
+ inout_handlers[iop->port].arg = NULL;
+ }
+}
+
+int
+register_inout(struct inout_port *iop)
+{
+ int i;
+
+ VERIFY_IOPORT(iop->port, iop->size);
+
+ /*
+ * Verify that the new registration is not overwriting an already
+ * allocated i/o range.
+ */
+ if ((iop->flags & IOPORT_F_DEFAULT) == 0) {
+ for (i = iop->port; i < iop->port + iop->size; i++) {
+ if ((inout_handlers[i].flags & IOPORT_F_DEFAULT) == 0)
+ return (-1);
+ }
+ }
+
+ for (i = iop->port; i < iop->port + iop->size; i++) {
+ inout_handlers[i].name = iop->name;
+ inout_handlers[i].flags = iop->flags;
+ inout_handlers[i].handler = iop->handler;
+ inout_handlers[i].arg = iop->arg;
+ }
+
+ return (0);
+}
+
+int
+unregister_inout(struct inout_port *iop)
+{
+
+ VERIFY_IOPORT(iop->port, iop->size);
+ assert(inout_handlers[iop->port].name == iop->name);
+
+ register_default_iohandler(iop->port, iop->size);
+
+ return (0);
+}
diff --git a/usr/src/cmd/bhyve/inout.h b/usr/src/cmd/bhyve/inout.h
new file mode 100644
index 0000000000..0d4046bd61
--- /dev/null
+++ b/usr/src/cmd/bhyve/inout.h
@@ -0,0 +1,91 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/inout.h 269094 2014-07-25 20:18:35Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _INOUT_H_
+#define _INOUT_H_
+
+#include <sys/linker_set.h>
+
+struct vmctx;
+struct vm_exit;
+
+/*
+ * inout emulation handlers return 0 on success and -1 on failure.
+ */
+typedef int (*inout_func_t)(struct vmctx *ctx, int vcpu, int in, int port,
+ int bytes, uint32_t *eax, void *arg);
+
+struct inout_port {
+ const char *name;
+ int port;
+ int size;
+ int flags;
+ inout_func_t handler;
+ void *arg;
+};
+#define IOPORT_F_IN 0x1
+#define IOPORT_F_OUT 0x2
+#define IOPORT_F_INOUT (IOPORT_F_IN | IOPORT_F_OUT)
+
+/*
+ * The following flags are used internally and must not be used by
+ * device models.
+ */
+#define IOPORT_F_DEFAULT 0x80000000 /* claimed by default handler */
+
+#define INOUT_PORT(name, port, flags, handler) \
+ static struct inout_port __CONCAT(__inout_port, __LINE__) = { \
+ #name, \
+ (port), \
+ 1, \
+ (flags), \
+ (handler), \
+ 0 \
+ }; \
+ DATA_SET(inout_port_set, __CONCAT(__inout_port, __LINE__))
+
+void init_inout(void);
+int emulate_inout(struct vmctx *, int vcpu, struct vm_exit *vmexit,
+ int strict);
+int register_inout(struct inout_port *iop);
+int unregister_inout(struct inout_port *iop);
+void init_bvmcons(void);
+
+#endif /* _INOUT_H_ */
diff --git a/usr/src/cmd/bhyve/ioapic.c b/usr/src/cmd/bhyve/ioapic.c
new file mode 100644
index 0000000000..86ff5c6580
--- /dev/null
+++ b/usr/src/cmd/bhyve/ioapic.c
@@ -0,0 +1,74 @@
+/*-
+ * Copyright (c) 2014 Advanced Computing Technologies LLC
+ * Written by: John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/ioapic.c 261268 2014-01-29 14:56:48Z jhb $");
+
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "ioapic.h"
+
+/*
+ * Assign PCI INTx interrupts to I/O APIC pins in a round-robin
+ * fashion. Note that we have no idea what the HPET is using, but the
+ * HPET is also programmable whereas this is intended for hardwired
+ * PCI interrupts.
+ *
+ * This assumes a single I/O APIC where pins >= 16 are permitted for
+ * PCI devices.
+ */
+static int pci_pins;
+
+void
+ioapic_init(struct vmctx *ctx)
+{
+
+ if (vm_ioapic_pincount(ctx, &pci_pins) < 0) {
+ pci_pins = 0;
+ return;
+ }
+
+ /* Ignore the first 16 pins. */
+ if (pci_pins <= 16) {
+ pci_pins = 0;
+ return;
+ }
+ pci_pins -= 16;
+}
+
+int
+ioapic_pci_alloc_irq(void)
+{
+ static int last_pin;
+
+ if (pci_pins == 0)
+ return (-1);
+ return (16 + (last_pin++ % pci_pins));
+}
diff --git a/usr/src/cmd/bhyve/ioapic.h b/usr/src/cmd/bhyve/ioapic.h
new file mode 100644
index 0000000000..789f90fea9
--- /dev/null
+++ b/usr/src/cmd/bhyve/ioapic.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2014 Advanced Computing Technologies LLC
+ * Written by: John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/ioapic.h 261268 2014-01-29 14:56:48Z jhb $
+ */
+
+#ifndef _IOAPIC_H_
+#define _IOAPIC_H_
+
+/*
+ * Allocate a PCI IRQ from the I/O APIC.
+ */
+void ioapic_init(struct vmctx *ctx);
+int ioapic_pci_alloc_irq(void);
+
+#endif
diff --git a/usr/src/cmd/bhyve/mem.c b/usr/src/cmd/bhyve/mem.c
new file mode 100644
index 0000000000..a153a8e960
--- /dev/null
+++ b/usr/src/cmd/bhyve/mem.c
@@ -0,0 +1,291 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/mem.c 269700 2014-08-08 03:49:01Z neel $
+ */
+
+/*
+ * Memory ranges are represented with an RB tree. On insertion, the range
+ * is checked for overlaps. On lookup, the key has the same base and limit
+ * so it can be searched within the range.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/mem.c 269700 2014-08-08 03:49:01Z neel $");
+
+#include <sys/types.h>
+#include <sys/tree.h>
+#include <sys/errno.h>
+#include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <pthread.h>
+
+#include "mem.h"
+
+struct mmio_rb_range {
+ RB_ENTRY(mmio_rb_range) mr_link; /* RB tree links */
+ struct mem_range mr_param;
+ uint64_t mr_base;
+ uint64_t mr_end;
+};
+
+struct mmio_rb_tree;
+RB_PROTOTYPE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
+
+RB_HEAD(mmio_rb_tree, mmio_rb_range) mmio_rb_root, mmio_rb_fallback;
+
+/*
+ * Per-vCPU cache. Since most accesses from a vCPU will be to
+ * consecutive addresses in a range, it makes sense to cache the
+ * result of a lookup.
+ */
+static struct mmio_rb_range *mmio_hint[VM_MAXCPU];
+
+static pthread_rwlock_t mmio_rwlock;
+
+static int
+mmio_rb_range_compare(struct mmio_rb_range *a, struct mmio_rb_range *b)
+{
+ if (a->mr_end < b->mr_base)
+ return (-1);
+ else if (a->mr_base > b->mr_end)
+ return (1);
+ return (0);
+}
+
+static int
+mmio_rb_lookup(struct mmio_rb_tree *rbt, uint64_t addr,
+ struct mmio_rb_range **entry)
+{
+ struct mmio_rb_range find, *res;
+
+ find.mr_base = find.mr_end = addr;
+
+ res = RB_FIND(mmio_rb_tree, rbt, &find);
+
+ if (res != NULL) {
+ *entry = res;
+ return (0);
+ }
+
+ return (ENOENT);
+}
+
+static int
+mmio_rb_add(struct mmio_rb_tree *rbt, struct mmio_rb_range *new)
+{
+ struct mmio_rb_range *overlap;
+
+ overlap = RB_INSERT(mmio_rb_tree, rbt, new);
+
+ if (overlap != NULL) {
+#ifdef RB_DEBUG
+ printf("overlap detected: new %lx:%lx, tree %lx:%lx\n",
+ new->mr_base, new->mr_end,
+ overlap->mr_base, overlap->mr_end);
+#endif
+
+ return (EEXIST);
+ }
+
+ return (0);
+}
+
+#if 0
+static void
+mmio_rb_dump(struct mmio_rb_tree *rbt)
+{
+ struct mmio_rb_range *np;
+
+ pthread_rwlock_rdlock(&mmio_rwlock);
+ RB_FOREACH(np, mmio_rb_tree, rbt) {
+ printf(" %lx:%lx, %s\n", np->mr_base, np->mr_end,
+ np->mr_param.name);
+ }
+ pthread_rwlock_unlock(&mmio_rwlock);
+}
+#endif
+
+RB_GENERATE(mmio_rb_tree, mmio_rb_range, mr_link, mmio_rb_range_compare);
+
+static int
+mem_read(void *ctx, int vcpu, uint64_t gpa, uint64_t *rval, int size, void *arg)
+{
+ int error;
+ struct mem_range *mr = arg;
+
+ error = (*mr->handler)(ctx, vcpu, MEM_F_READ, gpa, size,
+ rval, mr->arg1, mr->arg2);
+ return (error);
+}
+
+static int
+mem_write(void *ctx, int vcpu, uint64_t gpa, uint64_t wval, int size, void *arg)
+{
+ int error;
+ struct mem_range *mr = arg;
+
+ error = (*mr->handler)(ctx, vcpu, MEM_F_WRITE, gpa, size,
+ &wval, mr->arg1, mr->arg2);
+ return (error);
+}
+
+int
+emulate_mem(struct vmctx *ctx, int vcpu, uint64_t paddr, struct vie *vie,
+ struct vm_guest_paging *paging)
+
+{
+ struct mmio_rb_range *entry;
+ int err, immutable;
+
+ pthread_rwlock_rdlock(&mmio_rwlock);
+ /*
+ * First check the per-vCPU cache
+ */
+ if (mmio_hint[vcpu] &&
+ paddr >= mmio_hint[vcpu]->mr_base &&
+ paddr <= mmio_hint[vcpu]->mr_end) {
+ entry = mmio_hint[vcpu];
+ } else
+ entry = NULL;
+
+ if (entry == NULL) {
+ if (mmio_rb_lookup(&mmio_rb_root, paddr, &entry) == 0) {
+ /* Update the per-vCPU cache */
+ mmio_hint[vcpu] = entry;
+ } else if (mmio_rb_lookup(&mmio_rb_fallback, paddr, &entry)) {
+ pthread_rwlock_unlock(&mmio_rwlock);
+ return (ESRCH);
+ }
+ }
+
+ assert(entry != NULL);
+
+ /*
+ * An 'immutable' memory range is guaranteed to be never removed
+ * so there is no need to hold 'mmio_rwlock' while calling the
+ * handler.
+ *
+ * XXX writes to the PCIR_COMMAND register can cause register_mem()
+ * to be called. If the guest is using PCI extended config space
+ * to modify the PCIR_COMMAND register then register_mem() can
+ * deadlock on 'mmio_rwlock'. However by registering the extended
+ * config space window as 'immutable' the deadlock can be avoided.
+ */
+ immutable = (entry->mr_param.flags & MEM_F_IMMUTABLE);
+ if (immutable)
+ pthread_rwlock_unlock(&mmio_rwlock);
+
+ err = vmm_emulate_instruction(ctx, vcpu, paddr, vie, paging,
+ mem_read, mem_write, &entry->mr_param);
+
+ if (!immutable)
+ pthread_rwlock_unlock(&mmio_rwlock);
+
+ return (err);
+}
+
+static int
+register_mem_int(struct mmio_rb_tree *rbt, struct mem_range *memp)
+{
+ struct mmio_rb_range *entry, *mrp;
+ int err;
+
+ err = 0;
+
+ mrp = malloc(sizeof(struct mmio_rb_range));
+
+ if (mrp != NULL) {
+ mrp->mr_param = *memp;
+ mrp->mr_base = memp->base;
+ mrp->mr_end = memp->base + memp->size - 1;
+ pthread_rwlock_wrlock(&mmio_rwlock);
+ if (mmio_rb_lookup(rbt, memp->base, &entry) != 0)
+ err = mmio_rb_add(rbt, mrp);
+ pthread_rwlock_unlock(&mmio_rwlock);
+ if (err)
+ free(mrp);
+ } else
+ err = ENOMEM;
+
+ return (err);
+}
+
+int
+register_mem(struct mem_range *memp)
+{
+
+ return (register_mem_int(&mmio_rb_root, memp));
+}
+
+int
+register_mem_fallback(struct mem_range *memp)
+{
+
+ return (register_mem_int(&mmio_rb_fallback, memp));
+}
+
+int
+unregister_mem(struct mem_range *memp)
+{
+ struct mem_range *mr;
+ struct mmio_rb_range *entry = NULL;
+ int err, i;
+
+ pthread_rwlock_wrlock(&mmio_rwlock);
+ err = mmio_rb_lookup(&mmio_rb_root, memp->base, &entry);
+ if (err == 0) {
+ mr = &entry->mr_param;
+ assert(mr->name == memp->name);
+ assert(mr->base == memp->base && mr->size == memp->size);
+ assert((mr->flags & MEM_F_IMMUTABLE) == 0);
+ RB_REMOVE(mmio_rb_tree, &mmio_rb_root, entry);
+
+ /* flush Per-vCPU cache */
+ for (i=0; i < VM_MAXCPU; i++) {
+ if (mmio_hint[i] == entry)
+ mmio_hint[i] = NULL;
+ }
+ }
+ pthread_rwlock_unlock(&mmio_rwlock);
+
+ if (entry)
+ free(entry);
+
+ return (err);
+}
+
+void
+init_mem(void)
+{
+
+ RB_INIT(&mmio_rb_root);
+ RB_INIT(&mmio_rb_fallback);
+ pthread_rwlock_init(&mmio_rwlock, NULL);
+}
diff --git a/usr/src/cmd/bhyve/mem.h b/usr/src/cmd/bhyve/mem.h
new file mode 100644
index 0000000000..09cf56b72e
--- /dev/null
+++ b/usr/src/cmd/bhyve/mem.h
@@ -0,0 +1,61 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/mem.h 269700 2014-08-08 03:49:01Z neel $
+ */
+
+#ifndef _MEM_H_
+#define _MEM_H_
+
+#include <sys/linker_set.h>
+
+struct vmctx;
+
+typedef int (*mem_func_t)(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int size, uint64_t *val, void *arg1, long arg2);
+
+struct mem_range {
+ const char *name;
+ int flags;
+ mem_func_t handler;
+ void *arg1;
+ long arg2;
+ uint64_t base;
+ uint64_t size;
+};
+#define MEM_F_READ 0x1
+#define MEM_F_WRITE 0x2
+#define MEM_F_RW 0x3
+#define MEM_F_IMMUTABLE 0x4 /* mem_range cannot be unregistered */
+
+void init_mem(void);
+int emulate_mem(struct vmctx *, int vcpu, uint64_t paddr, struct vie *vie,
+ struct vm_guest_paging *paging);
+
+int register_mem(struct mem_range *memp);
+int register_mem_fallback(struct mem_range *memp);
+int unregister_mem(struct mem_range *memp);
+
+#endif /* _MEM_H_ */
diff --git a/usr/src/cmd/bhyve/mptbl.c b/usr/src/cmd/bhyve/mptbl.c
new file mode 100644
index 0000000000..9d03765c7a
--- /dev/null
+++ b/usr/src/cmd/bhyve/mptbl.c
@@ -0,0 +1,377 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/mptbl.c 266125 2014-05-15 14:16:55Z jhb $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/mptbl.c 266125 2014-05-15 14:16:55Z jhb $");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <x86/mptable.h>
+
+#include <stdio.h>
+#include <string.h>
+
+#include "acpi.h"
+#include "bhyverun.h"
+#include "mptbl.h"
+#include "pci_emul.h"
+
+#define MPTABLE_BASE 0xE0000
+
+/* floating pointer length + maximum length of configuration table */
+#define MPTABLE_MAX_LENGTH (65536 + 16)
+
+#define LAPIC_PADDR 0xFEE00000
+#define LAPIC_VERSION 16
+
+#define IOAPIC_PADDR 0xFEC00000
+#define IOAPIC_VERSION 0x11
+
+#define MP_SPECREV 4
+#define MPFP_SIG "_MP_"
+
+/* Configuration header defines */
+#define MPCH_SIG "PCMP"
+#define MPCH_OEMID "BHyVe "
+#define MPCH_OEMID_LEN 8
+#define MPCH_PRODID "Hypervisor "
+#define MPCH_PRODID_LEN 12
+
+/* Processor entry defines */
+#define MPEP_SIG_FAMILY 6 /* XXX bhyve should supply this */
+#define MPEP_SIG_MODEL 26
+#define MPEP_SIG_STEPPING 5
+#define MPEP_SIG \
+ ((MPEP_SIG_FAMILY << 8) | \
+ (MPEP_SIG_MODEL << 4) | \
+ (MPEP_SIG_STEPPING))
+
+#define MPEP_FEATURES (0xBFEBFBFF) /* XXX Intel i7 */
+
+/* Number of local intr entries */
+#define MPEII_NUM_LOCAL_IRQ 2
+
+/* Bus entry defines */
+#define MPE_NUM_BUSES 2
+#define MPE_BUSNAME_LEN 6
+#define MPE_BUSNAME_ISA "ISA "
+#define MPE_BUSNAME_PCI "PCI "
+
+static void *oem_tbl_start;
+static int oem_tbl_size;
+
+static uint8_t
+mpt_compute_checksum(void *base, size_t len)
+{
+ uint8_t *bytes;
+ uint8_t sum;
+
+ for(bytes = base, sum = 0; len > 0; len--) {
+ sum += *bytes++;
+ }
+
+ return (256 - sum);
+}
+
+static void
+mpt_build_mpfp(mpfps_t mpfp, vm_paddr_t gpa)
+{
+
+ memset(mpfp, 0, sizeof(*mpfp));
+ memcpy(mpfp->signature, MPFP_SIG, 4);
+ mpfp->pap = gpa + sizeof(*mpfp);
+ mpfp->length = 1;
+ mpfp->spec_rev = MP_SPECREV;
+ mpfp->checksum = mpt_compute_checksum(mpfp, sizeof(*mpfp));
+}
+
+static void
+mpt_build_mpch(mpcth_t mpch)
+{
+
+ memset(mpch, 0, sizeof(*mpch));
+ memcpy(mpch->signature, MPCH_SIG, 4);
+ mpch->spec_rev = MP_SPECREV;
+ memcpy(mpch->oem_id, MPCH_OEMID, MPCH_OEMID_LEN);
+ memcpy(mpch->product_id, MPCH_PRODID, MPCH_PRODID_LEN);
+ mpch->apic_address = LAPIC_PADDR;
+}
+
+static void
+mpt_build_proc_entries(proc_entry_ptr mpep, int ncpu)
+{
+ int i;
+
+ for (i = 0; i < ncpu; i++) {
+ memset(mpep, 0, sizeof(*mpep));
+ mpep->type = MPCT_ENTRY_PROCESSOR;
+ mpep->apic_id = i; // XXX
+ mpep->apic_version = LAPIC_VERSION;
+ mpep->cpu_flags = PROCENTRY_FLAG_EN;
+ if (i == 0)
+ mpep->cpu_flags |= PROCENTRY_FLAG_BP;
+ mpep->cpu_signature = MPEP_SIG;
+ mpep->feature_flags = MPEP_FEATURES;
+ mpep++;
+ }
+}
+
+static void
+mpt_build_localint_entries(int_entry_ptr mpie)
+{
+
+ /* Hardcode LINT0 as ExtINT on all CPUs. */
+ memset(mpie, 0, sizeof(*mpie));
+ mpie->type = MPCT_ENTRY_LOCAL_INT;
+ mpie->int_type = INTENTRY_TYPE_EXTINT;
+ mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM |
+ INTENTRY_FLAGS_TRIGGER_CONFORM;
+ mpie->dst_apic_id = 0xff;
+ mpie->dst_apic_int = 0;
+ mpie++;
+
+ /* Hardcode LINT1 as NMI on all CPUs. */
+ memset(mpie, 0, sizeof(*mpie));
+ mpie->type = MPCT_ENTRY_LOCAL_INT;
+ mpie->int_type = INTENTRY_TYPE_NMI;
+ mpie->int_flags = INTENTRY_FLAGS_POLARITY_CONFORM |
+ INTENTRY_FLAGS_TRIGGER_CONFORM;
+ mpie->dst_apic_id = 0xff;
+ mpie->dst_apic_int = 1;
+}
+
+static void
+mpt_build_bus_entries(bus_entry_ptr mpeb)
+{
+
+ memset(mpeb, 0, sizeof(*mpeb));
+ mpeb->type = MPCT_ENTRY_BUS;
+ mpeb->bus_id = 0;
+ memcpy(mpeb->bus_type, MPE_BUSNAME_PCI, MPE_BUSNAME_LEN);
+ mpeb++;
+
+ memset(mpeb, 0, sizeof(*mpeb));
+ mpeb->type = MPCT_ENTRY_BUS;
+ mpeb->bus_id = 1;
+ memcpy(mpeb->bus_type, MPE_BUSNAME_ISA, MPE_BUSNAME_LEN);
+}
+
+static void
+mpt_build_ioapic_entries(io_apic_entry_ptr mpei, int id)
+{
+
+ memset(mpei, 0, sizeof(*mpei));
+ mpei->type = MPCT_ENTRY_IOAPIC;
+ mpei->apic_id = id;
+ mpei->apic_version = IOAPIC_VERSION;
+ mpei->apic_flags = IOAPICENTRY_FLAG_EN;
+ mpei->apic_address = IOAPIC_PADDR;
+}
+
+static int
+mpt_count_ioint_entries(void)
+{
+ int bus, count;
+
+ count = 0;
+ for (bus = 0; bus <= PCI_BUSMAX; bus++)
+ count += pci_count_lintr(bus);
+
+ /*
+ * Always include entries for the first 16 pins along with a entry
+ * for each active PCI INTx pin.
+ */
+ return (16 + count);
+}
+
+static void
+mpt_generate_pci_int(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
+ void *arg)
+{
+ int_entry_ptr *mpiep, mpie;
+
+ mpiep = arg;
+ mpie = *mpiep;
+ memset(mpie, 0, sizeof(*mpie));
+
+ /*
+ * This is always after another I/O interrupt entry, so cheat
+ * and fetch the I/O APIC ID from the prior entry.
+ */
+ mpie->type = MPCT_ENTRY_INT;
+ mpie->int_type = INTENTRY_TYPE_INT;
+ mpie->src_bus_id = bus;
+ mpie->src_bus_irq = slot << 2 | (pin - 1);
+ mpie->dst_apic_id = mpie[-1].dst_apic_id;
+ mpie->dst_apic_int = ioapic_irq;
+
+ *mpiep = mpie + 1;
+}
+
+static void
+mpt_build_ioint_entries(int_entry_ptr mpie, int id)
+{
+ int pin, bus;
+
+ /*
+ * The following config is taken from kernel mptable.c
+ * mptable_parse_default_config_ints(...), for now
+ * just use the default config, tweek later if needed.
+ */
+
+ /* First, generate the first 16 pins. */
+ for (pin = 0; pin < 16; pin++) {
+ memset(mpie, 0, sizeof(*mpie));
+ mpie->type = MPCT_ENTRY_INT;
+ mpie->src_bus_id = 1;
+ mpie->dst_apic_id = id;
+
+ /*
+ * All default configs route IRQs from bus 0 to the first 16
+ * pins of the first I/O APIC with an APIC ID of 2.
+ */
+ mpie->dst_apic_int = pin;
+ switch (pin) {
+ case 0:
+ /* Pin 0 is an ExtINT pin. */
+ mpie->int_type = INTENTRY_TYPE_EXTINT;
+ break;
+ case 2:
+ /* IRQ 0 is routed to pin 2. */
+ mpie->int_type = INTENTRY_TYPE_INT;
+ mpie->src_bus_irq = 0;
+ break;
+ case SCI_INT:
+ /* ACPI SCI is level triggered and active-lo. */
+ mpie->int_flags = INTENTRY_FLAGS_POLARITY_ACTIVELO |
+ INTENTRY_FLAGS_TRIGGER_LEVEL;
+ mpie->int_type = INTENTRY_TYPE_INT;
+ mpie->src_bus_irq = SCI_INT;
+ break;
+ default:
+ /* All other pins are identity mapped. */
+ mpie->int_type = INTENTRY_TYPE_INT;
+ mpie->src_bus_irq = pin;
+ break;
+ }
+ mpie++;
+ }
+
+ /* Next, generate entries for any PCI INTx interrupts. */
+ for (bus = 0; bus <= PCI_BUSMAX; bus++)
+ pci_walk_lintr(bus, mpt_generate_pci_int, &mpie);
+}
+
+void
+mptable_add_oemtbl(void *tbl, int tblsz)
+{
+
+ oem_tbl_start = tbl;
+ oem_tbl_size = tblsz;
+}
+
+int
+mptable_build(struct vmctx *ctx, int ncpu)
+{
+ mpcth_t mpch;
+ bus_entry_ptr mpeb;
+ io_apic_entry_ptr mpei;
+ proc_entry_ptr mpep;
+ mpfps_t mpfp;
+ int_entry_ptr mpie;
+ int ioints, bus;
+ char *curraddr;
+ char *startaddr;
+
+ startaddr = paddr_guest2host(ctx, MPTABLE_BASE, MPTABLE_MAX_LENGTH);
+ if (startaddr == NULL) {
+ fprintf(stderr, "mptable requires mapped mem\n");
+ return (ENOMEM);
+ }
+
+ /*
+ * There is no way to advertise multiple PCI hierarchies via MPtable
+ * so require that there is no PCI hierarchy with a non-zero bus
+ * number.
+ */
+ for (bus = 1; bus <= PCI_BUSMAX; bus++) {
+ if (pci_bus_configured(bus)) {
+ fprintf(stderr, "MPtable is incompatible with "
+ "multiple PCI hierarchies.\r\n");
+ fprintf(stderr, "MPtable generation can be disabled "
+ "by passing the -Y option to bhyve(8).\r\n");
+ return (EINVAL);
+ }
+ }
+
+ curraddr = startaddr;
+ mpfp = (mpfps_t)curraddr;
+ mpt_build_mpfp(mpfp, MPTABLE_BASE);
+ curraddr += sizeof(*mpfp);
+
+ mpch = (mpcth_t)curraddr;
+ mpt_build_mpch(mpch);
+ curraddr += sizeof(*mpch);
+
+ mpep = (proc_entry_ptr)curraddr;
+ mpt_build_proc_entries(mpep, ncpu);
+ curraddr += sizeof(*mpep) * ncpu;
+ mpch->entry_count += ncpu;
+
+ mpeb = (bus_entry_ptr) curraddr;
+ mpt_build_bus_entries(mpeb);
+ curraddr += sizeof(*mpeb) * MPE_NUM_BUSES;
+ mpch->entry_count += MPE_NUM_BUSES;
+
+ mpei = (io_apic_entry_ptr)curraddr;
+ mpt_build_ioapic_entries(mpei, 0);
+ curraddr += sizeof(*mpei);
+ mpch->entry_count++;
+
+ mpie = (int_entry_ptr) curraddr;
+ ioints = mpt_count_ioint_entries();
+ mpt_build_ioint_entries(mpie, 0);
+ curraddr += sizeof(*mpie) * ioints;
+ mpch->entry_count += ioints;
+
+ mpie = (int_entry_ptr)curraddr;
+ mpt_build_localint_entries(mpie);
+ curraddr += sizeof(*mpie) * MPEII_NUM_LOCAL_IRQ;
+ mpch->entry_count += MPEII_NUM_LOCAL_IRQ;
+
+ if (oem_tbl_start) {
+ mpch->oem_table_pointer = curraddr - startaddr + MPTABLE_BASE;
+ mpch->oem_table_size = oem_tbl_size;
+ memcpy(curraddr, oem_tbl_start, oem_tbl_size);
+ }
+
+ mpch->base_table_length = curraddr - (char *)mpch;
+ mpch->checksum = mpt_compute_checksum(mpch, mpch->base_table_length);
+
+ return (0);
+}
diff --git a/usr/src/cmd/bhyve/mptbl.h b/usr/src/cmd/bhyve/mptbl.h
new file mode 100644
index 0000000000..d78ea6da09
--- /dev/null
+++ b/usr/src/cmd/bhyve/mptbl.h
@@ -0,0 +1,35 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/mptbl.h 257423 2013-10-31 05:44:45Z neel $
+ */
+
+#ifndef _MPTBL_H_
+#define _MPTBL_H_
+
+int mptable_build(struct vmctx *ctx, int ncpu);
+void mptable_add_oemtbl(void *tbl, int tblsz);
+
+#endif /* _MPTBL_H_ */
diff --git a/usr/src/cmd/bhyve/pci_ahci.c b/usr/src/cmd/bhyve/pci_ahci.c
new file mode 100644
index 0000000000..b68c977c1f
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_ahci.c
@@ -0,0 +1,2009 @@
+/*-
+ * Copyright (c) 2013 Zhixiang Yu <zcore@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z tychon $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_ahci.c 274045 2014-11-03 12:55:31Z tychon $");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+#include <sys/ata.h>
+#include <sys/endian.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#include <inttypes.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "ahci.h"
+#include "block_if.h"
+
+#define MAX_PORTS 6 /* Intel ICH8 AHCI supports 6 ports */
+
+#define PxSIG_ATA 0x00000101 /* ATA drive */
+#define PxSIG_ATAPI 0xeb140101 /* ATAPI drive */
+
+enum sata_fis_type {
+ FIS_TYPE_REGH2D = 0x27, /* Register FIS - host to device */
+ FIS_TYPE_REGD2H = 0x34, /* Register FIS - device to host */
+ FIS_TYPE_DMAACT = 0x39, /* DMA activate FIS - device to host */
+ FIS_TYPE_DMASETUP = 0x41, /* DMA setup FIS - bidirectional */
+ FIS_TYPE_DATA = 0x46, /* Data FIS - bidirectional */
+ FIS_TYPE_BIST = 0x58, /* BIST activate FIS - bidirectional */
+ FIS_TYPE_PIOSETUP = 0x5F, /* PIO setup FIS - device to host */
+ FIS_TYPE_SETDEVBITS = 0xA1, /* Set dev bits FIS - device to host */
+};
+
+/*
+ * SCSI opcodes
+ */
+#define TEST_UNIT_READY 0x00
+#define REQUEST_SENSE 0x03
+#define INQUIRY 0x12
+#define START_STOP_UNIT 0x1B
+#define PREVENT_ALLOW 0x1E
+#define READ_CAPACITY 0x25
+#define READ_10 0x28
+#define POSITION_TO_ELEMENT 0x2B
+#define READ_TOC 0x43
+#define GET_EVENT_STATUS_NOTIFICATION 0x4A
+#define MODE_SENSE_10 0x5A
+#define READ_12 0xA8
+#define READ_CD 0xBE
+
+/*
+ * SCSI mode page codes
+ */
+#define MODEPAGE_RW_ERROR_RECOVERY 0x01
+#define MODEPAGE_CD_CAPABILITIES 0x2A
+
+/*
+ * ATA commands
+ */
+#define ATA_SF_ENAB_SATA_SF 0x10
+#define ATA_SATA_SF_AN 0x05
+#define ATA_SF_DIS_SATA_SF 0x90
+
+/*
+ * Debug printf
+ */
+#ifdef AHCI_DEBUG
+static FILE *dbg;
+#define DPRINTF(format, arg...) do{fprintf(dbg, format, ##arg);fflush(dbg);}while(0)
+#else
+#define DPRINTF(format, arg...)
+#endif
+#define WPRINTF(format, arg...) printf(format, ##arg)
+
+struct ahci_ioreq {
+ struct blockif_req io_req;
+ struct ahci_port *io_pr;
+ STAILQ_ENTRY(ahci_ioreq) io_flist;
+ TAILQ_ENTRY(ahci_ioreq) io_blist;
+ uint8_t *cfis;
+ uint32_t len;
+ uint32_t done;
+ int slot;
+ int prdtl;
+};
+
+struct ahci_port {
+ struct blockif_ctxt *bctx;
+ struct pci_ahci_softc *pr_sc;
+ uint8_t *cmd_lst;
+ uint8_t *rfis;
+ int atapi;
+ int reset;
+ int mult_sectors;
+ uint8_t xfermode;
+ uint8_t sense_key;
+ uint8_t asc;
+ uint32_t pending;
+
+ uint32_t clb;
+ uint32_t clbu;
+ uint32_t fb;
+ uint32_t fbu;
+ uint32_t is;
+ uint32_t ie;
+ uint32_t cmd;
+ uint32_t unused0;
+ uint32_t tfd;
+ uint32_t sig;
+ uint32_t ssts;
+ uint32_t sctl;
+ uint32_t serr;
+ uint32_t sact;
+ uint32_t ci;
+ uint32_t sntf;
+ uint32_t fbs;
+
+ /*
+ * i/o request info
+ */
+ struct ahci_ioreq *ioreq;
+ int ioqsz;
+ STAILQ_HEAD(ahci_fhead, ahci_ioreq) iofhd;
+ TAILQ_HEAD(ahci_bhead, ahci_ioreq) iobhd;
+};
+
+struct ahci_cmd_hdr {
+ uint16_t flags;
+ uint16_t prdtl;
+ uint32_t prdbc;
+ uint64_t ctba;
+ uint32_t reserved[4];
+};
+
+struct ahci_prdt_entry {
+ uint64_t dba;
+ uint32_t reserved;
+#define DBCMASK 0x3fffff
+ uint32_t dbc;
+};
+
+struct pci_ahci_softc {
+ struct pci_devinst *asc_pi;
+ pthread_mutex_t mtx;
+ int ports;
+ uint32_t cap;
+ uint32_t ghc;
+ uint32_t is;
+ uint32_t pi;
+ uint32_t vs;
+ uint32_t ccc_ctl;
+ uint32_t ccc_pts;
+ uint32_t em_loc;
+ uint32_t em_ctl;
+ uint32_t cap2;
+ uint32_t bohc;
+ uint32_t lintr;
+ struct ahci_port port[MAX_PORTS];
+};
+#define ahci_ctx(sc) ((sc)->asc_pi->pi_vmctx)
+
+static inline void lba_to_msf(uint8_t *buf, int lba)
+{
+ lba += 150;
+ buf[0] = (lba / 75) / 60;
+ buf[1] = (lba / 75) % 60;
+ buf[2] = lba % 75;
+}
+
+/*
+ * generate HBA intr depending on whether or not ports within
+ * the controller have an interrupt pending.
+ */
+static void
+ahci_generate_intr(struct pci_ahci_softc *sc)
+{
+ struct pci_devinst *pi;
+ int i;
+
+ pi = sc->asc_pi;
+
+ for (i = 0; i < sc->ports; i++) {
+ struct ahci_port *pr;
+ pr = &sc->port[i];
+ if (pr->is & pr->ie)
+ sc->is |= (1 << i);
+ }
+
+ DPRINTF("%s %x\n", __func__, sc->is);
+
+ if (sc->is && (sc->ghc & AHCI_GHC_IE)) {
+ if (pci_msi_enabled(pi)) {
+ /*
+ * Generate an MSI interrupt on every edge
+ */
+ pci_generate_msi(pi, 0);
+ } else if (!sc->lintr) {
+ /*
+ * Only generate a pin-based interrupt if one wasn't
+ * in progress
+ */
+ sc->lintr = 1;
+ pci_lintr_assert(pi);
+ }
+ } else if (sc->lintr) {
+ /*
+ * No interrupts: deassert pin-based signal if it had
+ * been asserted
+ */
+ pci_lintr_deassert(pi);
+ sc->lintr = 0;
+ }
+}
+
+static void
+ahci_write_fis(struct ahci_port *p, enum sata_fis_type ft, uint8_t *fis)
+{
+ int offset, len, irq;
+
+ if (p->rfis == NULL || !(p->cmd & AHCI_P_CMD_FRE))
+ return;
+
+ switch (ft) {
+ case FIS_TYPE_REGD2H:
+ offset = 0x40;
+ len = 20;
+ irq = AHCI_P_IX_DHR;
+ break;
+ case FIS_TYPE_SETDEVBITS:
+ offset = 0x58;
+ len = 8;
+ irq = AHCI_P_IX_SDB;
+ break;
+ case FIS_TYPE_PIOSETUP:
+ offset = 0x20;
+ len = 20;
+ irq = 0;
+ break;
+ default:
+ WPRINTF("unsupported fis type %d\n", ft);
+ return;
+ }
+ memcpy(p->rfis + offset, fis, len);
+ if (irq) {
+ p->is |= irq;
+ ahci_generate_intr(p->pr_sc);
+ }
+}
+
+static void
+ahci_write_fis_piosetup(struct ahci_port *p)
+{
+ uint8_t fis[20];
+
+ memset(fis, 0, sizeof(fis));
+ fis[0] = FIS_TYPE_PIOSETUP;
+ ahci_write_fis(p, FIS_TYPE_PIOSETUP, fis);
+}
+
+static void
+ahci_write_fis_sdb(struct ahci_port *p, int slot, uint32_t tfd)
+{
+ uint8_t fis[8];
+ uint8_t error;
+
+ error = (tfd >> 8) & 0xff;
+ memset(fis, 0, sizeof(fis));
+ fis[0] = error;
+ fis[2] = tfd & 0x77;
+ *(uint32_t *)(fis + 4) = (1 << slot);
+ if (fis[2] & ATA_S_ERROR)
+ p->is |= AHCI_P_IX_TFE;
+ p->tfd = tfd;
+ ahci_write_fis(p, FIS_TYPE_SETDEVBITS, fis);
+}
+
+static void
+ahci_write_fis_d2h(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t tfd)
+{
+ uint8_t fis[20];
+ uint8_t error;
+
+ error = (tfd >> 8) & 0xff;
+ memset(fis, 0, sizeof(fis));
+ fis[0] = FIS_TYPE_REGD2H;
+ fis[1] = (1 << 6);
+ fis[2] = tfd & 0xff;
+ fis[3] = error;
+ fis[4] = cfis[4];
+ fis[5] = cfis[5];
+ fis[6] = cfis[6];
+ fis[7] = cfis[7];
+ fis[8] = cfis[8];
+ fis[9] = cfis[9];
+ fis[10] = cfis[10];
+ fis[11] = cfis[11];
+ fis[12] = cfis[12];
+ fis[13] = cfis[13];
+ if (fis[2] & ATA_S_ERROR)
+ p->is |= AHCI_P_IX_TFE;
+ else
+ p->ci &= ~(1 << slot);
+ p->tfd = tfd;
+ ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
+}
+
+static void
+ahci_write_reset_fis_d2h(struct ahci_port *p)
+{
+ uint8_t fis[20];
+
+ memset(fis, 0, sizeof(fis));
+ fis[0] = FIS_TYPE_REGD2H;
+ fis[3] = 1;
+ fis[4] = 1;
+ if (p->atapi) {
+ fis[5] = 0x14;
+ fis[6] = 0xeb;
+ }
+ fis[12] = 1;
+ ahci_write_fis(p, FIS_TYPE_REGD2H, fis);
+}
+
+static void
+ahci_check_stopped(struct ahci_port *p)
+{
+ /*
+ * If we are no longer processing the command list and nothing
+ * is in-flight, clear the running bit, the current command
+ * slot, the command issue and active bits.
+ */
+ if (!(p->cmd & AHCI_P_CMD_ST)) {
+ if (p->pending == 0) {
+ p->cmd &= ~(AHCI_P_CMD_CR | AHCI_P_CMD_CCS_MASK);
+ p->ci = 0;
+ p->sact = 0;
+ }
+ }
+}
+
+static void
+ahci_port_stop(struct ahci_port *p)
+{
+ struct ahci_ioreq *aior;
+ uint8_t *cfis;
+ int slot;
+ int ncq;
+ int error;
+
+ assert(pthread_mutex_isowned_np(&p->pr_sc->mtx));
+
+ TAILQ_FOREACH(aior, &p->iobhd, io_blist) {
+ /*
+ * Try to cancel the outstanding blockif request.
+ */
+ error = blockif_cancel(p->bctx, &aior->io_req);
+ if (error != 0)
+ continue;
+
+ slot = aior->slot;
+ cfis = aior->cfis;
+ if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
+ cfis[2] == ATA_READ_FPDMA_QUEUED)
+ ncq = 1;
+
+ if (ncq)
+ p->sact &= ~(1 << slot);
+ else
+ p->ci &= ~(1 << slot);
+
+ /*
+ * This command is now done.
+ */
+ p->pending &= ~(1 << slot);
+
+ /*
+ * Delete the blockif request from the busy list
+ */
+ TAILQ_REMOVE(&p->iobhd, aior, io_blist);
+
+ /*
+ * Move the blockif request back to the free list
+ */
+ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
+ }
+
+ ahci_check_stopped(p);
+}
+
+static void
+ahci_port_reset(struct ahci_port *pr)
+{
+ pr->sctl = 0;
+ pr->serr = 0;
+ pr->sact = 0;
+ pr->xfermode = ATA_UDMA6;
+ pr->mult_sectors = 128;
+
+ if (!pr->bctx) {
+ pr->ssts = ATA_SS_DET_NO_DEVICE;
+ pr->sig = 0xFFFFFFFF;
+ pr->tfd = 0x7F;
+ return;
+ }
+ pr->ssts = ATA_SS_DET_PHY_ONLINE | ATA_SS_SPD_GEN2 |
+ ATA_SS_IPM_ACTIVE;
+ pr->tfd = (1 << 8) | ATA_S_DSC | ATA_S_DMA;
+ if (!pr->atapi) {
+ pr->sig = PxSIG_ATA;
+ pr->tfd |= ATA_S_READY;
+ } else
+ pr->sig = PxSIG_ATAPI;
+ ahci_write_reset_fis_d2h(pr);
+}
+
+static void
+ahci_reset(struct pci_ahci_softc *sc)
+{
+ int i;
+
+ sc->ghc = AHCI_GHC_AE;
+ sc->is = 0;
+
+ if (sc->lintr) {
+ pci_lintr_deassert(sc->asc_pi);
+ sc->lintr = 0;
+ }
+
+ for (i = 0; i < sc->ports; i++) {
+ sc->port[i].ie = 0;
+ sc->port[i].is = 0;
+ ahci_port_reset(&sc->port[i]);
+ }
+}
+
+static void
+ata_string(uint8_t *dest, const char *src, int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++) {
+ if (*src)
+ dest[i ^ 1] = *src++;
+ else
+ dest[i ^ 1] = ' ';
+ }
+}
+
+static void
+atapi_string(uint8_t *dest, const char *src, int len)
+{
+ int i;
+
+ for (i = 0; i < len; i++) {
+ if (*src)
+ dest[i] = *src++;
+ else
+ dest[i] = ' ';
+ }
+}
+
+static void
+ahci_handle_dma(struct ahci_port *p, int slot, uint8_t *cfis, uint32_t done,
+ int seek)
+{
+ struct ahci_ioreq *aior;
+ struct blockif_req *breq;
+ struct pci_ahci_softc *sc;
+ struct ahci_prdt_entry *prdt;
+ struct ahci_cmd_hdr *hdr;
+ uint64_t lba;
+ uint32_t len;
+ int i, err, iovcnt, ncq, readop;
+
+ sc = p->pr_sc;
+ prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
+ hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+ ncq = 0;
+ readop = 1;
+
+ prdt += seek;
+ if (cfis[2] == ATA_WRITE_DMA || cfis[2] == ATA_WRITE_DMA48 ||
+ cfis[2] == ATA_WRITE_FPDMA_QUEUED)
+ readop = 0;
+
+ if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
+ cfis[2] == ATA_READ_FPDMA_QUEUED) {
+ lba = ((uint64_t)cfis[10] << 40) |
+ ((uint64_t)cfis[9] << 32) |
+ ((uint64_t)cfis[8] << 24) |
+ ((uint64_t)cfis[6] << 16) |
+ ((uint64_t)cfis[5] << 8) |
+ cfis[4];
+ len = cfis[11] << 8 | cfis[3];
+ if (!len)
+ len = 65536;
+ ncq = 1;
+ } else if (cfis[2] == ATA_READ_DMA48 || cfis[2] == ATA_WRITE_DMA48) {
+ lba = ((uint64_t)cfis[10] << 40) |
+ ((uint64_t)cfis[9] << 32) |
+ ((uint64_t)cfis[8] << 24) |
+ ((uint64_t)cfis[6] << 16) |
+ ((uint64_t)cfis[5] << 8) |
+ cfis[4];
+ len = cfis[13] << 8 | cfis[12];
+ if (!len)
+ len = 65536;
+ } else {
+ lba = ((cfis[7] & 0xf) << 24) | (cfis[6] << 16) |
+ (cfis[5] << 8) | cfis[4];
+ len = cfis[12];
+ if (!len)
+ len = 256;
+ }
+ lba *= blockif_sectsz(p->bctx);
+ len *= blockif_sectsz(p->bctx);
+
+ /*
+ * Pull request off free list
+ */
+ aior = STAILQ_FIRST(&p->iofhd);
+ assert(aior != NULL);
+ STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
+ aior->cfis = cfis;
+ aior->slot = slot;
+ aior->len = len;
+ aior->done = done;
+ breq = &aior->io_req;
+ breq->br_offset = lba + done;
+ iovcnt = hdr->prdtl - seek;
+ if (iovcnt > BLOCKIF_IOV_MAX) {
+ aior->prdtl = iovcnt - BLOCKIF_IOV_MAX;
+ iovcnt = BLOCKIF_IOV_MAX;
+ } else
+ aior->prdtl = 0;
+ breq->br_iovcnt = iovcnt;
+
+ /*
+ * Mark this command in-flight.
+ */
+ p->pending |= 1 << slot;
+
+ /*
+ * Stuff request onto busy list
+ */
+ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
+ /*
+ * Build up the iovec based on the prdt
+ */
+ for (i = 0; i < iovcnt; i++) {
+ uint32_t dbcsz;
+
+ dbcsz = (prdt->dbc & DBCMASK) + 1;
+ breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc),
+ prdt->dba, dbcsz);
+ breq->br_iov[i].iov_len = dbcsz;
+ aior->done += dbcsz;
+ prdt++;
+ }
+ if (readop)
+ err = blockif_read(p->bctx, breq);
+ else
+ err = blockif_write(p->bctx, breq);
+ assert(err == 0);
+
+ if (ncq)
+ p->ci &= ~(1 << slot);
+}
+
+static void
+ahci_handle_flush(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+ struct ahci_ioreq *aior;
+ struct blockif_req *breq;
+ int err;
+
+ /*
+ * Pull request off free list
+ */
+ aior = STAILQ_FIRST(&p->iofhd);
+ assert(aior != NULL);
+ STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
+ aior->cfis = cfis;
+ aior->slot = slot;
+ aior->len = 0;
+ aior->done = 0;
+ aior->prdtl = 0;
+ breq = &aior->io_req;
+
+ /*
+ * Mark this command in-flight.
+ */
+ p->pending |= 1 << slot;
+
+ /*
+ * Stuff request onto busy list
+ */
+ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
+ err = blockif_flush(p->bctx, breq);
+ assert(err == 0);
+}
+
+static inline void
+write_prdt(struct ahci_port *p, int slot, uint8_t *cfis,
+ void *buf, int size)
+{
+ struct ahci_cmd_hdr *hdr;
+ struct ahci_prdt_entry *prdt;
+ void *from;
+ int i, len;
+
+ hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+ len = size;
+ from = buf;
+ prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
+ for (i = 0; i < hdr->prdtl && len; i++) {
+ uint8_t *ptr;
+ uint32_t dbcsz;
+ int sublen;
+
+ dbcsz = (prdt->dbc & DBCMASK) + 1;
+ ptr = paddr_guest2host(ahci_ctx(p->pr_sc), prdt->dba, dbcsz);
+ sublen = len < dbcsz ? len : dbcsz;
+ memcpy(ptr, from, sublen);
+ len -= sublen;
+ from += sublen;
+ prdt++;
+ }
+ hdr->prdbc = size - len;
+}
+
+static void
+handle_identify(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+ struct ahci_cmd_hdr *hdr;
+
+ hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+ if (p->atapi || hdr->prdtl == 0) {
+ p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
+ p->is |= AHCI_P_IX_TFE;
+ } else {
+ uint16_t buf[256];
+ uint64_t sectors;
+ uint16_t cyl;
+ uint8_t sech, heads;
+
+ sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
+ blockif_chs(p->bctx, &cyl, &heads, &sech);
+ memset(buf, 0, sizeof(buf));
+ buf[0] = 0x0040;
+ buf[1] = cyl;
+ buf[3] = heads;
+ buf[6] = sech;
+ /* TODO emulate different serial? */
+ ata_string((uint8_t *)(buf+10), "123456", 20);
+ ata_string((uint8_t *)(buf+23), "001", 8);
+ ata_string((uint8_t *)(buf+27), "BHYVE SATA DISK", 40);
+ buf[47] = (0x8000 | 128);
+ buf[48] = 0x1;
+ buf[49] = (1 << 8 | 1 << 9 | 1 << 11);
+ buf[50] = (1 << 14);
+ buf[53] = (1 << 1 | 1 << 2);
+ if (p->mult_sectors)
+ buf[59] = (0x100 | p->mult_sectors);
+ buf[60] = sectors;
+ buf[61] = (sectors >> 16);
+ buf[63] = 0x7;
+ if (p->xfermode & ATA_WDMA0)
+ buf[63] |= (1 << ((p->xfermode & 7) + 8));
+ buf[64] = 0x3;
+ buf[65] = 100;
+ buf[66] = 100;
+ buf[67] = 100;
+ buf[68] = 100;
+ buf[75] = 31;
+ buf[76] = (1 << 8 | 1 << 2);
+ buf[80] = 0x1f0;
+ buf[81] = 0x28;
+ buf[82] = (1 << 5 | 1 << 14);
+ buf[83] = (1 << 10 | 1 << 12 | 1 << 13 | 1 << 14);
+ buf[84] = (1 << 14);
+ buf[85] = (1 << 5 | 1 << 14);
+ buf[86] = (1 << 10 | 1 << 12 | 1 << 13);
+ buf[87] = (1 << 14);
+ buf[88] = 0x7f;
+ if (p->xfermode & ATA_UDMA0)
+ buf[88] |= (1 << ((p->xfermode & 7) + 8));
+ buf[93] = (1 | 1 <<14);
+ buf[100] = sectors;
+ buf[101] = (sectors >> 16);
+ buf[102] = (sectors >> 32);
+ buf[103] = (sectors >> 48);
+ ahci_write_fis_piosetup(p);
+ write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
+ p->tfd = ATA_S_DSC | ATA_S_READY;
+ p->is |= AHCI_P_IX_DP;
+ p->ci &= ~(1 << slot);
+ }
+ ahci_generate_intr(p->pr_sc);
+}
+
+static void
+handle_atapi_identify(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+ if (!p->atapi) {
+ p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
+ p->is |= AHCI_P_IX_TFE;
+ } else {
+ uint16_t buf[256];
+
+ memset(buf, 0, sizeof(buf));
+ buf[0] = (2 << 14 | 5 << 8 | 1 << 7 | 2 << 5);
+ /* TODO emulate different serial? */
+ ata_string((uint8_t *)(buf+10), "123456", 20);
+ ata_string((uint8_t *)(buf+23), "001", 8);
+ ata_string((uint8_t *)(buf+27), "BHYVE SATA DVD ROM", 40);
+ buf[49] = (1 << 9 | 1 << 8);
+ buf[50] = (1 << 14 | 1);
+ buf[53] = (1 << 2 | 1 << 1);
+ buf[62] = 0x3f;
+ buf[63] = 7;
+ buf[64] = 3;
+ buf[65] = 100;
+ buf[66] = 100;
+ buf[67] = 100;
+ buf[68] = 100;
+ buf[76] = (1 << 2 | 1 << 1);
+ buf[78] = (1 << 5);
+ buf[80] = (0x1f << 4);
+ buf[82] = (1 << 4);
+ buf[83] = (1 << 14);
+ buf[84] = (1 << 14);
+ buf[85] = (1 << 4);
+ buf[87] = (1 << 14);
+ buf[88] = (1 << 14 | 0x7f);
+ ahci_write_fis_piosetup(p);
+ write_prdt(p, slot, cfis, (void *)buf, sizeof(buf));
+ p->tfd = ATA_S_DSC | ATA_S_READY;
+ p->is |= AHCI_P_IX_DHR;
+ p->ci &= ~(1 << slot);
+ }
+ ahci_generate_intr(p->pr_sc);
+}
+
+static void
+atapi_inquiry(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+ uint8_t buf[36];
+ uint8_t *acmd;
+ int len;
+
+ acmd = cfis + 0x40;
+
+ buf[0] = 0x05;
+ buf[1] = 0x80;
+ buf[2] = 0x00;
+ buf[3] = 0x21;
+ buf[4] = 31;
+ buf[5] = 0;
+ buf[6] = 0;
+ buf[7] = 0;
+ atapi_string(buf + 8, "BHYVE", 8);
+ atapi_string(buf + 16, "BHYVE DVD-ROM", 16);
+ atapi_string(buf + 32, "001", 4);
+
+ len = sizeof(buf);
+ if (len > acmd[4])
+ len = acmd[4];
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ write_prdt(p, slot, cfis, buf, len);
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+}
+
+static void
+atapi_read_capacity(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+ uint8_t buf[8];
+ uint64_t sectors;
+
+ sectors = blockif_size(p->bctx) / 2048;
+ be32enc(buf, sectors - 1);
+ be32enc(buf + 4, 2048);
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ write_prdt(p, slot, cfis, buf, sizeof(buf));
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+}
+
+static void
+atapi_read_toc(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+ uint8_t *acmd;
+ uint8_t format;
+ int len;
+
+ acmd = cfis + 0x40;
+
+ len = be16dec(acmd + 7);
+ format = acmd[9] >> 6;
+ switch (format) {
+ case 0:
+ {
+ int msf, size;
+ uint64_t sectors;
+ uint8_t start_track, buf[20], *bp;
+
+ msf = (acmd[1] >> 1) & 1;
+ start_track = acmd[6];
+ if (start_track > 1 && start_track != 0xaa) {
+ uint32_t tfd;
+ p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+ p->asc = 0x24;
+ tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, tfd);
+ return;
+ }
+ bp = buf + 2;
+ *bp++ = 1;
+ *bp++ = 1;
+ if (start_track <= 1) {
+ *bp++ = 0;
+ *bp++ = 0x14;
+ *bp++ = 1;
+ *bp++ = 0;
+ if (msf) {
+ *bp++ = 0;
+ lba_to_msf(bp, 0);
+ bp += 3;
+ } else {
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ }
+ }
+ *bp++ = 0;
+ *bp++ = 0x14;
+ *bp++ = 0xaa;
+ *bp++ = 0;
+ sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
+ sectors >>= 2;
+ if (msf) {
+ *bp++ = 0;
+ lba_to_msf(bp, sectors);
+ bp += 3;
+ } else {
+ be32enc(bp, sectors);
+ bp += 4;
+ }
+ size = bp - buf;
+ be16enc(buf, size - 2);
+ if (len > size)
+ len = size;
+ write_prdt(p, slot, cfis, buf, len);
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+ break;
+ }
+ case 1:
+ {
+ uint8_t buf[12];
+
+ memset(buf, 0, sizeof(buf));
+ buf[1] = 0xa;
+ buf[2] = 0x1;
+ buf[3] = 0x1;
+ if (len > sizeof(buf))
+ len = sizeof(buf);
+ write_prdt(p, slot, cfis, buf, len);
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+ break;
+ }
+ case 2:
+ {
+ int msf, size;
+ uint64_t sectors;
+ uint8_t start_track, *bp, buf[50];
+
+ msf = (acmd[1] >> 1) & 1;
+ start_track = acmd[6];
+ bp = buf + 2;
+ *bp++ = 1;
+ *bp++ = 1;
+
+ *bp++ = 1;
+ *bp++ = 0x14;
+ *bp++ = 0;
+ *bp++ = 0xa0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 1;
+ *bp++ = 0;
+ *bp++ = 0;
+
+ *bp++ = 1;
+ *bp++ = 0x14;
+ *bp++ = 0;
+ *bp++ = 0xa1;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 1;
+ *bp++ = 0;
+ *bp++ = 0;
+
+ *bp++ = 1;
+ *bp++ = 0x14;
+ *bp++ = 0;
+ *bp++ = 0xa2;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ sectors = blockif_size(p->bctx) / blockif_sectsz(p->bctx);
+ sectors >>= 2;
+ if (msf) {
+ *bp++ = 0;
+ lba_to_msf(bp, sectors);
+ bp += 3;
+ } else {
+ be32enc(bp, sectors);
+ bp += 4;
+ }
+
+ *bp++ = 1;
+ *bp++ = 0x14;
+ *bp++ = 0;
+ *bp++ = 1;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ if (msf) {
+ *bp++ = 0;
+ lba_to_msf(bp, 0);
+ bp += 3;
+ } else {
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ *bp++ = 0;
+ }
+
+ size = bp - buf;
+ be16enc(buf, size - 2);
+ if (len > size)
+ len = size;
+ write_prdt(p, slot, cfis, buf, len);
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+ break;
+ }
+ default:
+ {
+ uint32_t tfd;
+
+ p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+ p->asc = 0x24;
+ tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, tfd);
+ break;
+ }
+ }
+}
+
+static void
+atapi_read(struct ahci_port *p, int slot, uint8_t *cfis,
+ uint32_t done, int seek)
+{
+ struct ahci_ioreq *aior;
+ struct ahci_cmd_hdr *hdr;
+ struct ahci_prdt_entry *prdt;
+ struct blockif_req *breq;
+ struct pci_ahci_softc *sc;
+ uint8_t *acmd;
+ uint64_t lba;
+ uint32_t len;
+ int i, err, iovcnt;
+
+ sc = p->pr_sc;
+ acmd = cfis + 0x40;
+ hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+ prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
+
+ prdt += seek;
+ lba = be32dec(acmd + 2);
+ if (acmd[0] == READ_10)
+ len = be16dec(acmd + 7);
+ else
+ len = be32dec(acmd + 6);
+ if (len == 0) {
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+ }
+ lba *= 2048;
+ len *= 2048;
+
+ /*
+ * Pull request off free list
+ */
+ aior = STAILQ_FIRST(&p->iofhd);
+ assert(aior != NULL);
+ STAILQ_REMOVE_HEAD(&p->iofhd, io_flist);
+ aior->cfis = cfis;
+ aior->slot = slot;
+ aior->len = len;
+ aior->done = done;
+ breq = &aior->io_req;
+ breq->br_offset = lba + done;
+ iovcnt = hdr->prdtl - seek;
+ if (iovcnt > BLOCKIF_IOV_MAX) {
+ aior->prdtl = iovcnt - BLOCKIF_IOV_MAX;
+ iovcnt = BLOCKIF_IOV_MAX;
+ } else
+ aior->prdtl = 0;
+ breq->br_iovcnt = iovcnt;
+
+ /*
+ * Mark this command in-flight.
+ */
+ p->pending |= 1 << slot;
+
+ /*
+ * Stuff request onto busy list
+ */
+ TAILQ_INSERT_HEAD(&p->iobhd, aior, io_blist);
+
+ /*
+ * Build up the iovec based on the prdt
+ */
+ for (i = 0; i < iovcnt; i++) {
+ uint32_t dbcsz;
+
+ dbcsz = (prdt->dbc & DBCMASK) + 1;
+ breq->br_iov[i].iov_base = paddr_guest2host(ahci_ctx(sc),
+ prdt->dba, dbcsz);
+ breq->br_iov[i].iov_len = dbcsz;
+ aior->done += dbcsz;
+ prdt++;
+ }
+ err = blockif_read(p->bctx, breq);
+ assert(err == 0);
+}
+
+static void
+atapi_request_sense(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+ uint8_t buf[64];
+ uint8_t *acmd;
+ int len;
+
+ acmd = cfis + 0x40;
+ len = acmd[4];
+ if (len > sizeof(buf))
+ len = sizeof(buf);
+ memset(buf, 0, len);
+ buf[0] = 0x70 | (1 << 7);
+ buf[2] = p->sense_key;
+ buf[7] = 10;
+ buf[12] = p->asc;
+ write_prdt(p, slot, cfis, buf, len);
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+}
+
+static void
+atapi_start_stop_unit(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+ uint8_t *acmd = cfis + 0x40;
+ uint32_t tfd;
+
+ switch (acmd[4] & 3) {
+ case 0:
+ case 1:
+ case 3:
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ tfd = ATA_S_READY | ATA_S_DSC;
+ break;
+ case 2:
+ /* TODO eject media */
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+ p->asc = 0x53;
+ tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+ break;
+ }
+ ahci_write_fis_d2h(p, slot, cfis, tfd);
+}
+
+static void
+atapi_mode_sense(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+ uint8_t *acmd;
+ uint32_t tfd;
+ uint8_t pc, code;
+ int len;
+
+ acmd = cfis + 0x40;
+ len = be16dec(acmd + 7);
+ pc = acmd[2] >> 6;
+ code = acmd[2] & 0x3f;
+
+ switch (pc) {
+ case 0:
+ switch (code) {
+ case MODEPAGE_RW_ERROR_RECOVERY:
+ {
+ uint8_t buf[16];
+
+ if (len > sizeof(buf))
+ len = sizeof(buf);
+
+ memset(buf, 0, sizeof(buf));
+ be16enc(buf, 16 - 2);
+ buf[2] = 0x70;
+ buf[8] = 0x01;
+ buf[9] = 16 - 10;
+ buf[11] = 0x05;
+ write_prdt(p, slot, cfis, buf, len);
+ tfd = ATA_S_READY | ATA_S_DSC;
+ break;
+ }
+ case MODEPAGE_CD_CAPABILITIES:
+ {
+ uint8_t buf[30];
+
+ if (len > sizeof(buf))
+ len = sizeof(buf);
+
+ memset(buf, 0, sizeof(buf));
+ be16enc(buf, 30 - 2);
+ buf[2] = 0x70;
+ buf[8] = 0x2A;
+ buf[9] = 30 - 10;
+ buf[10] = 0x08;
+ buf[12] = 0x71;
+ be16enc(&buf[18], 2);
+ be16enc(&buf[20], 512);
+ write_prdt(p, slot, cfis, buf, len);
+ tfd = ATA_S_READY | ATA_S_DSC;
+ break;
+ }
+ default:
+ goto error;
+ break;
+ }
+ break;
+ case 3:
+ p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+ p->asc = 0x39;
+ tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+ break;
+error:
+ case 1:
+ case 2:
+ p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+ p->asc = 0x24;
+ tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+ break;
+ }
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, tfd);
+}
+
+static void
+atapi_get_event_status_notification(struct ahci_port *p, int slot,
+ uint8_t *cfis)
+{
+ uint8_t *acmd;
+ uint32_t tfd;
+
+ acmd = cfis + 0x40;
+
+ /* we don't support asynchronous operation */
+ if (!(acmd[1] & 1)) {
+ p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+ p->asc = 0x24;
+ tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+ } else {
+ uint8_t buf[8];
+ int len;
+
+ len = be16dec(acmd + 7);
+ if (len > sizeof(buf))
+ len = sizeof(buf);
+
+ memset(buf, 0, sizeof(buf));
+ be16enc(buf, 8 - 2);
+ buf[2] = 0x04;
+ buf[3] = 0x10;
+ buf[5] = 0x02;
+ write_prdt(p, slot, cfis, buf, len);
+ tfd = ATA_S_READY | ATA_S_DSC;
+ }
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, tfd);
+}
+
+static void
+handle_packet_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+ uint8_t *acmd;
+
+ acmd = cfis + 0x40;
+
+#ifdef AHCI_DEBUG
+ {
+ int i;
+ DPRINTF("ACMD:");
+ for (i = 0; i < 16; i++)
+ DPRINTF("%02x ", acmd[i]);
+ DPRINTF("\n");
+ }
+#endif
+
+ switch (acmd[0]) {
+ case TEST_UNIT_READY:
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+ break;
+ case INQUIRY:
+ atapi_inquiry(p, slot, cfis);
+ break;
+ case READ_CAPACITY:
+ atapi_read_capacity(p, slot, cfis);
+ break;
+ case PREVENT_ALLOW:
+ /* TODO */
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+ break;
+ case READ_TOC:
+ atapi_read_toc(p, slot, cfis);
+ break;
+ case READ_10:
+ case READ_12:
+ atapi_read(p, slot, cfis, 0, 0);
+ break;
+ case REQUEST_SENSE:
+ atapi_request_sense(p, slot, cfis);
+ break;
+ case START_STOP_UNIT:
+ atapi_start_stop_unit(p, slot, cfis);
+ break;
+ case MODE_SENSE_10:
+ atapi_mode_sense(p, slot, cfis);
+ break;
+ case GET_EVENT_STATUS_NOTIFICATION:
+ atapi_get_event_status_notification(p, slot, cfis);
+ break;
+ default:
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+ p->asc = 0x20;
+ ahci_write_fis_d2h(p, slot, cfis, (p->sense_key << 12) |
+ ATA_S_READY | ATA_S_ERROR);
+ break;
+ }
+}
+
+static void
+ahci_handle_cmd(struct ahci_port *p, int slot, uint8_t *cfis)
+{
+
+ switch (cfis[2]) {
+ case ATA_ATA_IDENTIFY:
+ handle_identify(p, slot, cfis);
+ break;
+ case ATA_SETFEATURES:
+ {
+ switch (cfis[3]) {
+ case ATA_SF_ENAB_SATA_SF:
+ switch (cfis[12]) {
+ case ATA_SATA_SF_AN:
+ p->tfd = ATA_S_DSC | ATA_S_READY;
+ break;
+ default:
+ p->tfd = ATA_S_ERROR | ATA_S_READY;
+ p->tfd |= (ATA_ERROR_ABORT << 8);
+ break;
+ }
+ break;
+ case ATA_SF_ENAB_WCACHE:
+ case ATA_SF_DIS_WCACHE:
+ case ATA_SF_ENAB_RCACHE:
+ case ATA_SF_DIS_RCACHE:
+ p->tfd = ATA_S_DSC | ATA_S_READY;
+ break;
+ case ATA_SF_SETXFER:
+ {
+ switch (cfis[12] & 0xf8) {
+ case ATA_PIO:
+ case ATA_PIO0:
+ break;
+ case ATA_WDMA0:
+ case ATA_UDMA0:
+ p->xfermode = (cfis[12] & 0x7);
+ break;
+ }
+ p->tfd = ATA_S_DSC | ATA_S_READY;
+ break;
+ }
+ default:
+ p->tfd = ATA_S_ERROR | ATA_S_READY;
+ p->tfd |= (ATA_ERROR_ABORT << 8);
+ break;
+ }
+ ahci_write_fis_d2h(p, slot, cfis, p->tfd);
+ break;
+ }
+ case ATA_SET_MULTI:
+ if (cfis[12] != 0 &&
+ (cfis[12] > 128 || (cfis[12] & (cfis[12] - 1)))) {
+ p->tfd = ATA_S_ERROR | ATA_S_READY;
+ p->tfd |= (ATA_ERROR_ABORT << 8);
+ } else {
+ p->mult_sectors = cfis[12];
+ p->tfd = ATA_S_DSC | ATA_S_READY;
+ }
+ p->is |= AHCI_P_IX_DP;
+ p->ci &= ~(1 << slot);
+ ahci_generate_intr(p->pr_sc);
+ break;
+ case ATA_READ_DMA:
+ case ATA_WRITE_DMA:
+ case ATA_READ_DMA48:
+ case ATA_WRITE_DMA48:
+ case ATA_READ_FPDMA_QUEUED:
+ case ATA_WRITE_FPDMA_QUEUED:
+ ahci_handle_dma(p, slot, cfis, 0, 0);
+ break;
+ case ATA_FLUSHCACHE:
+ case ATA_FLUSHCACHE48:
+ ahci_handle_flush(p, slot, cfis);
+ break;
+ case ATA_STANDBY_CMD:
+ break;
+ case ATA_NOP:
+ case ATA_STANDBY_IMMEDIATE:
+ case ATA_IDLE_IMMEDIATE:
+ case ATA_SLEEP:
+ ahci_write_fis_d2h(p, slot, cfis, ATA_S_READY | ATA_S_DSC);
+ break;
+ case ATA_ATAPI_IDENTIFY:
+ handle_atapi_identify(p, slot, cfis);
+ break;
+ case ATA_PACKET_CMD:
+ if (!p->atapi) {
+ p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
+ p->is |= AHCI_P_IX_TFE;
+ ahci_generate_intr(p->pr_sc);
+ } else
+ handle_packet_cmd(p, slot, cfis);
+ break;
+ default:
+ WPRINTF("Unsupported cmd:%02x\n", cfis[2]);
+ p->tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
+ p->is |= AHCI_P_IX_TFE;
+ ahci_generate_intr(p->pr_sc);
+ break;
+ }
+}
+
+static void
+ahci_handle_slot(struct ahci_port *p, int slot)
+{
+ struct ahci_cmd_hdr *hdr;
+ struct ahci_prdt_entry *prdt;
+ struct pci_ahci_softc *sc;
+ uint8_t *cfis;
+ int cfl;
+
+ sc = p->pr_sc;
+ hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+ cfl = (hdr->flags & 0x1f) * 4;
+ cfis = paddr_guest2host(ahci_ctx(sc), hdr->ctba,
+ 0x80 + hdr->prdtl * sizeof(struct ahci_prdt_entry));
+ prdt = (struct ahci_prdt_entry *)(cfis + 0x80);
+
+#ifdef AHCI_DEBUG
+ DPRINTF("\ncfis:");
+ for (i = 0; i < cfl; i++) {
+ if (i % 10 == 0)
+ DPRINTF("\n");
+ DPRINTF("%02x ", cfis[i]);
+ }
+ DPRINTF("\n");
+
+ for (i = 0; i < hdr->prdtl; i++) {
+ DPRINTF("%d@%08"PRIx64"\n", prdt->dbc & 0x3fffff, prdt->dba);
+ prdt++;
+ }
+#endif
+
+ if (cfis[0] != FIS_TYPE_REGH2D) {
+ WPRINTF("Not a H2D FIS:%02x\n", cfis[0]);
+ return;
+ }
+
+ if (cfis[1] & 0x80) {
+ ahci_handle_cmd(p, slot, cfis);
+ } else {
+ if (cfis[15] & (1 << 2))
+ p->reset = 1;
+ else if (p->reset) {
+ p->reset = 0;
+ ahci_port_reset(p);
+ }
+ p->ci &= ~(1 << slot);
+ }
+}
+
+static void
+ahci_handle_port(struct ahci_port *p)
+{
+ int i;
+
+ if (!(p->cmd & AHCI_P_CMD_ST))
+ return;
+
+ /*
+ * Search for any new commands to issue ignoring those that
+ * are already in-flight.
+ */
+ for (i = 0; (i < 32) && p->ci; i++) {
+ if ((p->ci & (1 << i)) && !(p->pending & (1 << i))) {
+ p->cmd &= ~AHCI_P_CMD_CCS_MASK;
+ p->cmd |= i << AHCI_P_CMD_CCS_SHIFT;
+ ahci_handle_slot(p, i);
+ }
+ }
+}
+
+/*
+ * blockif callback routine - this runs in the context of the blockif
+ * i/o thread, so the mutex needs to be acquired.
+ */
+static void
+ata_ioreq_cb(struct blockif_req *br, int err)
+{
+ struct ahci_cmd_hdr *hdr;
+ struct ahci_ioreq *aior;
+ struct ahci_port *p;
+ struct pci_ahci_softc *sc;
+ uint32_t tfd;
+ uint8_t *cfis;
+ int pending, slot, ncq;
+
+ DPRINTF("%s %d\n", __func__, err);
+
+ ncq = 0;
+ aior = br->br_param;
+ p = aior->io_pr;
+ cfis = aior->cfis;
+ slot = aior->slot;
+ pending = aior->prdtl;
+ sc = p->pr_sc;
+ hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + slot * AHCI_CL_SIZE);
+
+ if (cfis[2] == ATA_WRITE_FPDMA_QUEUED ||
+ cfis[2] == ATA_READ_FPDMA_QUEUED)
+ ncq = 1;
+
+ pthread_mutex_lock(&sc->mtx);
+
+ /*
+ * Delete the blockif request from the busy list
+ */
+ TAILQ_REMOVE(&p->iobhd, aior, io_blist);
+
+ /*
+ * Move the blockif request back to the free list
+ */
+ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
+
+ if (pending && !err) {
+ ahci_handle_dma(p, slot, cfis, aior->done,
+ hdr->prdtl - pending);
+ goto out;
+ }
+
+ if (!err && aior->done == aior->len) {
+ tfd = ATA_S_READY | ATA_S_DSC;
+ if (ncq)
+ hdr->prdbc = 0;
+ else
+ hdr->prdbc = aior->len;
+ } else {
+ tfd = (ATA_E_ABORT << 8) | ATA_S_READY | ATA_S_ERROR;
+ hdr->prdbc = 0;
+ if (ncq)
+ p->serr |= (1 << slot);
+ }
+
+ if (ncq) {
+ p->sact &= ~(1 << slot);
+ ahci_write_fis_sdb(p, slot, tfd);
+ } else
+ ahci_write_fis_d2h(p, slot, cfis, tfd);
+
+ /*
+ * This command is now complete.
+ */
+ p->pending &= ~(1 << slot);
+
+ ahci_check_stopped(p);
+out:
+ pthread_mutex_unlock(&sc->mtx);
+ DPRINTF("%s exit\n", __func__);
+}
+
+static void
+atapi_ioreq_cb(struct blockif_req *br, int err)
+{
+ struct ahci_cmd_hdr *hdr;
+ struct ahci_ioreq *aior;
+ struct ahci_port *p;
+ struct pci_ahci_softc *sc;
+ uint8_t *cfis;
+ uint32_t tfd;
+ int pending, slot;
+
+ DPRINTF("%s %d\n", __func__, err);
+
+ aior = br->br_param;
+ p = aior->io_pr;
+ cfis = aior->cfis;
+ slot = aior->slot;
+ pending = aior->prdtl;
+ sc = p->pr_sc;
+ hdr = (struct ahci_cmd_hdr *)(p->cmd_lst + aior->slot * AHCI_CL_SIZE);
+
+ pthread_mutex_lock(&sc->mtx);
+
+ /*
+ * Delete the blockif request from the busy list
+ */
+ TAILQ_REMOVE(&p->iobhd, aior, io_blist);
+
+ /*
+ * Move the blockif request back to the free list
+ */
+ STAILQ_INSERT_TAIL(&p->iofhd, aior, io_flist);
+
+ if (pending && !err) {
+ atapi_read(p, slot, cfis, aior->done, hdr->prdtl - pending);
+ goto out;
+ }
+
+ if (!err && aior->done == aior->len) {
+ tfd = ATA_S_READY | ATA_S_DSC;
+ hdr->prdbc = aior->len;
+ } else {
+ p->sense_key = ATA_SENSE_ILLEGAL_REQUEST;
+ p->asc = 0x21;
+ tfd = (p->sense_key << 12) | ATA_S_READY | ATA_S_ERROR;
+ hdr->prdbc = 0;
+ }
+
+ cfis[4] = (cfis[4] & ~7) | ATA_I_CMD | ATA_I_IN;
+ ahci_write_fis_d2h(p, slot, cfis, tfd);
+
+ /*
+ * This command is now complete.
+ */
+ p->pending &= ~(1 << slot);
+
+ ahci_check_stopped(p);
+out:
+ pthread_mutex_unlock(&sc->mtx);
+ DPRINTF("%s exit\n", __func__);
+}
+
+static void
+pci_ahci_ioreq_init(struct ahci_port *pr)
+{
+ struct ahci_ioreq *vr;
+ int i;
+
+ pr->ioqsz = blockif_queuesz(pr->bctx);
+ pr->ioreq = calloc(pr->ioqsz, sizeof(struct ahci_ioreq));
+ STAILQ_INIT(&pr->iofhd);
+
+ /*
+ * Add all i/o request entries to the free queue
+ */
+ for (i = 0; i < pr->ioqsz; i++) {
+ vr = &pr->ioreq[i];
+ vr->io_pr = pr;
+ if (!pr->atapi)
+ vr->io_req.br_callback = ata_ioreq_cb;
+ else
+ vr->io_req.br_callback = atapi_ioreq_cb;
+ vr->io_req.br_param = vr;
+ STAILQ_INSERT_TAIL(&pr->iofhd, vr, io_flist);
+ }
+
+ TAILQ_INIT(&pr->iobhd);
+}
+
+static void
+pci_ahci_port_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
+{
+ int port = (offset - AHCI_OFFSET) / AHCI_STEP;
+ offset = (offset - AHCI_OFFSET) % AHCI_STEP;
+ struct ahci_port *p = &sc->port[port];
+
+ DPRINTF("pci_ahci_port %d: write offset 0x%"PRIx64" value 0x%"PRIx64"\n",
+ port, offset, value);
+
+ switch (offset) {
+ case AHCI_P_CLB:
+ p->clb = value;
+ break;
+ case AHCI_P_CLBU:
+ p->clbu = value;
+ break;
+ case AHCI_P_FB:
+ p->fb = value;
+ break;
+ case AHCI_P_FBU:
+ p->fbu = value;
+ break;
+ case AHCI_P_IS:
+ p->is &= ~value;
+ break;
+ case AHCI_P_IE:
+ p->ie = value & 0xFDC000FF;
+ ahci_generate_intr(sc);
+ break;
+ case AHCI_P_CMD:
+ {
+ p->cmd = value;
+
+ if (!(value & AHCI_P_CMD_ST)) {
+ ahci_port_stop(p);
+ } else {
+ uint64_t clb;
+
+ p->cmd |= AHCI_P_CMD_CR;
+ clb = (uint64_t)p->clbu << 32 | p->clb;
+ p->cmd_lst = paddr_guest2host(ahci_ctx(sc), clb,
+ AHCI_CL_SIZE * AHCI_MAX_SLOTS);
+ }
+
+ if (value & AHCI_P_CMD_FRE) {
+ uint64_t fb;
+
+ p->cmd |= AHCI_P_CMD_FR;
+ fb = (uint64_t)p->fbu << 32 | p->fb;
+ /* we don't support FBSCP, so rfis size is 256Bytes */
+ p->rfis = paddr_guest2host(ahci_ctx(sc), fb, 256);
+ } else {
+ p->cmd &= ~AHCI_P_CMD_FR;
+ }
+
+ if (value & AHCI_P_CMD_CLO) {
+ p->tfd = 0;
+ p->cmd &= ~AHCI_P_CMD_CLO;
+ }
+
+ ahci_handle_port(p);
+ break;
+ }
+ case AHCI_P_TFD:
+ case AHCI_P_SIG:
+ case AHCI_P_SSTS:
+ WPRINTF("pci_ahci_port: read only registers 0x%"PRIx64"\n", offset);
+ break;
+ case AHCI_P_SCTL:
+ if (!(p->cmd & AHCI_P_CMD_ST)) {
+ if (value & ATA_SC_DET_RESET)
+ ahci_port_reset(p);
+ p->sctl = value;
+ }
+ break;
+ case AHCI_P_SERR:
+ p->serr &= ~value;
+ break;
+ case AHCI_P_SACT:
+ p->sact |= value;
+ break;
+ case AHCI_P_CI:
+ p->ci |= value;
+ ahci_handle_port(p);
+ break;
+ case AHCI_P_SNTF:
+ case AHCI_P_FBS:
+ default:
+ break;
+ }
+}
+
+static void
+pci_ahci_host_write(struct pci_ahci_softc *sc, uint64_t offset, uint64_t value)
+{
+ DPRINTF("pci_ahci_host: write offset 0x%"PRIx64" value 0x%"PRIx64"\n",
+ offset, value);
+
+ switch (offset) {
+ case AHCI_CAP:
+ case AHCI_PI:
+ case AHCI_VS:
+ case AHCI_CAP2:
+ DPRINTF("pci_ahci_host: read only registers 0x%"PRIx64"\n", offset);
+ break;
+ case AHCI_GHC:
+ if (value & AHCI_GHC_HR)
+ ahci_reset(sc);
+ else if (value & AHCI_GHC_IE) {
+ sc->ghc |= AHCI_GHC_IE;
+ ahci_generate_intr(sc);
+ }
+ break;
+ case AHCI_IS:
+ sc->is &= ~value;
+ ahci_generate_intr(sc);
+ break;
+ default:
+ break;
+ }
+}
+
+static void
+pci_ahci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+ struct pci_ahci_softc *sc = pi->pi_arg;
+
+ assert(baridx == 5);
+ assert(size == 4);
+
+ pthread_mutex_lock(&sc->mtx);
+
+ if (offset < AHCI_OFFSET)
+ pci_ahci_host_write(sc, offset, value);
+ else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP)
+ pci_ahci_port_write(sc, offset, value);
+ else
+ WPRINTF("pci_ahci: unknown i/o write offset 0x%"PRIx64"\n", offset);
+
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+static uint64_t
+pci_ahci_host_read(struct pci_ahci_softc *sc, uint64_t offset)
+{
+ uint32_t value;
+
+ switch (offset) {
+ case AHCI_CAP:
+ case AHCI_GHC:
+ case AHCI_IS:
+ case AHCI_PI:
+ case AHCI_VS:
+ case AHCI_CCCC:
+ case AHCI_CCCP:
+ case AHCI_EM_LOC:
+ case AHCI_EM_CTL:
+ case AHCI_CAP2:
+ {
+ uint32_t *p = &sc->cap;
+ p += (offset - AHCI_CAP) / sizeof(uint32_t);
+ value = *p;
+ break;
+ }
+ default:
+ value = 0;
+ break;
+ }
+ DPRINTF("pci_ahci_host: read offset 0x%"PRIx64" value 0x%x\n",
+ offset, value);
+
+ return (value);
+}
+
+static uint64_t
+pci_ahci_port_read(struct pci_ahci_softc *sc, uint64_t offset)
+{
+ uint32_t value;
+ int port = (offset - AHCI_OFFSET) / AHCI_STEP;
+ offset = (offset - AHCI_OFFSET) % AHCI_STEP;
+
+ switch (offset) {
+ case AHCI_P_CLB:
+ case AHCI_P_CLBU:
+ case AHCI_P_FB:
+ case AHCI_P_FBU:
+ case AHCI_P_IS:
+ case AHCI_P_IE:
+ case AHCI_P_CMD:
+ case AHCI_P_TFD:
+ case AHCI_P_SIG:
+ case AHCI_P_SSTS:
+ case AHCI_P_SCTL:
+ case AHCI_P_SERR:
+ case AHCI_P_SACT:
+ case AHCI_P_CI:
+ case AHCI_P_SNTF:
+ case AHCI_P_FBS:
+ {
+ uint32_t *p= &sc->port[port].clb;
+ p += (offset - AHCI_P_CLB) / sizeof(uint32_t);
+ value = *p;
+ break;
+ }
+ default:
+ value = 0;
+ break;
+ }
+
+ DPRINTF("pci_ahci_port %d: read offset 0x%"PRIx64" value 0x%x\n",
+ port, offset, value);
+
+ return value;
+}
+
+static uint64_t
+pci_ahci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size)
+{
+ struct pci_ahci_softc *sc = pi->pi_arg;
+ uint32_t value;
+
+ assert(baridx == 5);
+ assert(size == 4);
+
+ pthread_mutex_lock(&sc->mtx);
+
+ if (offset < AHCI_OFFSET)
+ value = pci_ahci_host_read(sc, offset);
+ else if (offset < AHCI_OFFSET + sc->ports * AHCI_STEP)
+ value = pci_ahci_port_read(sc, offset);
+ else {
+ value = 0;
+ WPRINTF("pci_ahci: unknown i/o read offset 0x%"PRIx64"\n", offset);
+ }
+
+ pthread_mutex_unlock(&sc->mtx);
+
+ return (value);
+}
+
+static int
+pci_ahci_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts, int atapi)
+{
+ char bident[sizeof("XX:X:X")];
+ struct blockif_ctxt *bctxt;
+ struct pci_ahci_softc *sc;
+ int ret, slots;
+
+ ret = 0;
+
+ if (opts == NULL) {
+ fprintf(stderr, "pci_ahci: backing device required\n");
+ return (1);
+ }
+
+#ifdef AHCI_DEBUG
+ dbg = fopen("/tmp/log", "w+");
+#endif
+
+ sc = calloc(1, sizeof(struct pci_ahci_softc));
+ pi->pi_arg = sc;
+ sc->asc_pi = pi;
+ sc->ports = MAX_PORTS;
+
+ /*
+ * Only use port 0 for a backing device. All other ports will be
+ * marked as unused
+ */
+ sc->port[0].atapi = atapi;
+
+ /*
+ * Attempt to open the backing image. Use the PCI
+ * slot/func for the identifier string.
+ */
+ snprintf(bident, sizeof(bident), "%d:%d", pi->pi_slot, pi->pi_func);
+ bctxt = blockif_open(opts, bident);
+ if (bctxt == NULL) {
+ ret = 1;
+ goto open_fail;
+ }
+ sc->port[0].bctx = bctxt;
+ sc->port[0].pr_sc = sc;
+
+ /*
+ * Allocate blockif request structures and add them
+ * to the free list
+ */
+ pci_ahci_ioreq_init(&sc->port[0]);
+
+ pthread_mutex_init(&sc->mtx, NULL);
+
+ /* Intel ICH8 AHCI */
+ slots = sc->port[0].ioqsz;
+ if (slots > 32)
+ slots = 32;
+ --slots;
+ sc->cap = AHCI_CAP_64BIT | AHCI_CAP_SNCQ | AHCI_CAP_SSNTF |
+ AHCI_CAP_SMPS | AHCI_CAP_SSS | AHCI_CAP_SALP |
+ AHCI_CAP_SAL | AHCI_CAP_SCLO | (0x3 << AHCI_CAP_ISS_SHIFT)|
+ AHCI_CAP_PMD | AHCI_CAP_SSC | AHCI_CAP_PSC |
+ (slots << AHCI_CAP_NCS_SHIFT) | AHCI_CAP_SXS | (sc->ports - 1);
+
+ /* Only port 0 implemented */
+ sc->pi = 1;
+ sc->vs = 0x10300;
+ sc->cap2 = AHCI_CAP2_APST;
+ ahci_reset(sc);
+
+ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x2821);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, 0x8086);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+ pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_STORAGE_SATA);
+ pci_set_cfgdata8(pi, PCIR_PROGIF, PCIP_STORAGE_SATA_AHCI_1_0);
+ pci_emul_add_msicap(pi, 1);
+ pci_emul_alloc_bar(pi, 5, PCIBAR_MEM32,
+ AHCI_OFFSET + sc->ports * AHCI_STEP);
+
+ pci_lintr_request(pi);
+
+open_fail:
+ if (ret) {
+ blockif_close(sc->port[0].bctx);
+ free(sc);
+ }
+
+ return (ret);
+}
+
+static int
+pci_ahci_hd_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+
+ return (pci_ahci_init(ctx, pi, opts, 0));
+}
+
+static int
+pci_ahci_atapi_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+
+ return (pci_ahci_init(ctx, pi, opts, 1));
+}
+
+/*
+ * Use separate emulation names to distinguish drive and atapi devices
+ */
+struct pci_devemu pci_de_ahci_hd = {
+ .pe_emu = "ahci-hd",
+ .pe_init = pci_ahci_hd_init,
+ .pe_barwrite = pci_ahci_write,
+ .pe_barread = pci_ahci_read
+};
+PCI_EMUL_SET(pci_de_ahci_hd);
+
+struct pci_devemu pci_de_ahci_cd = {
+ .pe_emu = "ahci-cd",
+ .pe_init = pci_ahci_atapi_init,
+ .pe_barwrite = pci_ahci_write,
+ .pe_barread = pci_ahci_read
+};
+PCI_EMUL_SET(pci_de_ahci_cd);
diff --git a/usr/src/cmd/bhyve/pci_emul.c b/usr/src/cmd/bhyve/pci_emul.c
new file mode 100644
index 0000000000..3b4ca805cc
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_emul.c
@@ -0,0 +1,2103 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_emul.c 269700 2014-08-08 03:49:01Z neel $");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/errno.h>
+
+#include <ctype.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <strings.h>
+#include <assert.h>
+#include <stdbool.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "acpi.h"
+#include "bhyverun.h"
+#include "inout.h"
+#include "ioapic.h"
+#include "mem.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+
+#define CONF1_ADDR_PORT 0x0cf8
+#define CONF1_DATA_PORT 0x0cfc
+
+#define CONF1_ENABLE 0x80000000ul
+
+#define CFGWRITE(pi,off,val,b) \
+do { \
+ if ((b) == 1) { \
+ pci_set_cfgdata8((pi),(off),(val)); \
+ } else if ((b) == 2) { \
+ pci_set_cfgdata16((pi),(off),(val)); \
+ } else { \
+ pci_set_cfgdata32((pi),(off),(val)); \
+ } \
+} while (0)
+
+#define MAXBUSES (PCI_BUSMAX + 1)
+#define MAXSLOTS (PCI_SLOTMAX + 1)
+#define MAXFUNCS (PCI_FUNCMAX + 1)
+
+struct funcinfo {
+ char *fi_name;
+ char *fi_param;
+ struct pci_devinst *fi_devi;
+};
+
+struct intxinfo {
+ int ii_count;
+ int ii_pirq_pin;
+ int ii_ioapic_irq;
+};
+
+struct slotinfo {
+ struct intxinfo si_intpins[4];
+ struct funcinfo si_funcs[MAXFUNCS];
+};
+
+struct businfo {
+ uint16_t iobase, iolimit; /* I/O window */
+ uint32_t membase32, memlimit32; /* mmio window below 4GB */
+ uint64_t membase64, memlimit64; /* mmio window above 4GB */
+ struct slotinfo slotinfo[MAXSLOTS];
+};
+
+static struct businfo *pci_businfo[MAXBUSES];
+
+SET_DECLARE(pci_devemu_set, struct pci_devemu);
+
+static uint64_t pci_emul_iobase;
+static uint64_t pci_emul_membase32;
+static uint64_t pci_emul_membase64;
+
+#define PCI_EMUL_IOBASE 0x2000
+#define PCI_EMUL_IOLIMIT 0x10000
+
+#define PCI_EMUL_ECFG_BASE 0xE0000000 /* 3.5GB */
+#define PCI_EMUL_ECFG_SIZE (MAXBUSES * 1024 * 1024) /* 1MB per bus */
+SYSRES_MEM(PCI_EMUL_ECFG_BASE, PCI_EMUL_ECFG_SIZE);
+
+#define PCI_EMUL_MEMLIMIT32 PCI_EMUL_ECFG_BASE
+
+#define PCI_EMUL_MEMBASE64 0xD000000000UL
+#define PCI_EMUL_MEMLIMIT64 0xFD00000000UL
+
+static struct pci_devemu *pci_emul_finddev(char *name);
+static void pci_lintr_route(struct pci_devinst *pi);
+static void pci_lintr_update(struct pci_devinst *pi);
+static void pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot,
+ int func, int coff, int bytes, uint32_t *val);
+
+/*
+ * I/O access
+ */
+
+/*
+ * Slot options are in the form:
+ *
+ * <bus>:<slot>:<func>,<emul>[,<config>]
+ * <slot>[:<func>],<emul>[,<config>]
+ *
+ * slot is 0..31
+ * func is 0..7
+ * emul is a string describing the type of PCI device e.g. virtio-net
+ * config is an optional string, depending on the device, that can be
+ * used for configuration.
+ * Examples are:
+ * 1,virtio-net,tap0
+ * 3:0,dummy
+ */
+static void
+pci_parse_slot_usage(char *aopt)
+{
+
+ fprintf(stderr, "Invalid PCI slot info field \"%s\"\n", aopt);
+}
+
+int
+pci_parse_slot(char *opt)
+{
+ struct businfo *bi;
+ struct slotinfo *si;
+ char *emul, *config, *str, *cp;
+ int error, bnum, snum, fnum;
+
+ error = -1;
+ str = strdup(opt);
+
+ emul = config = NULL;
+ if ((cp = strchr(str, ',')) != NULL) {
+ *cp = '\0';
+ emul = cp + 1;
+ if ((cp = strchr(emul, ',')) != NULL) {
+ *cp = '\0';
+ config = cp + 1;
+ }
+ } else {
+ pci_parse_slot_usage(opt);
+ goto done;
+ }
+
+ /* <bus>:<slot>:<func> */
+ if (sscanf(str, "%d:%d:%d", &bnum, &snum, &fnum) != 3) {
+ bnum = 0;
+ /* <slot>:<func> */
+ if (sscanf(str, "%d:%d", &snum, &fnum) != 2) {
+ fnum = 0;
+ /* <slot> */
+ if (sscanf(str, "%d", &snum) != 1) {
+ snum = -1;
+ }
+ }
+ }
+
+ if (bnum < 0 || bnum >= MAXBUSES || snum < 0 || snum >= MAXSLOTS ||
+ fnum < 0 || fnum >= MAXFUNCS) {
+ pci_parse_slot_usage(opt);
+ goto done;
+ }
+
+ if (pci_businfo[bnum] == NULL)
+ pci_businfo[bnum] = calloc(1, sizeof(struct businfo));
+
+ bi = pci_businfo[bnum];
+ si = &bi->slotinfo[snum];
+
+ if (si->si_funcs[fnum].fi_name != NULL) {
+ fprintf(stderr, "pci slot %d:%d already occupied!\n",
+ snum, fnum);
+ goto done;
+ }
+
+ if (pci_emul_finddev(emul) == NULL) {
+ fprintf(stderr, "pci slot %d:%d: unknown device \"%s\"\n",
+ snum, fnum, emul);
+ goto done;
+ }
+
+ error = 0;
+ si->si_funcs[fnum].fi_name = emul;
+ si->si_funcs[fnum].fi_param = config;
+
+done:
+ if (error)
+ free(str);
+
+ return (error);
+}
+
+static int
+pci_valid_pba_offset(struct pci_devinst *pi, uint64_t offset)
+{
+
+ if (offset < pi->pi_msix.pba_offset)
+ return (0);
+
+ if (offset >= pi->pi_msix.pba_offset + pi->pi_msix.pba_size) {
+ return (0);
+ }
+
+ return (1);
+}
+
+int
+pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
+ uint64_t value)
+{
+ int msix_entry_offset;
+ int tab_index;
+ char *dest;
+
+ /* support only 4 or 8 byte writes */
+ if (size != 4 && size != 8)
+ return (-1);
+
+ /*
+ * Return if table index is beyond what device supports
+ */
+ tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
+ if (tab_index >= pi->pi_msix.table_count)
+ return (-1);
+
+ msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+
+ /* support only aligned writes */
+ if ((msix_entry_offset % size) != 0)
+ return (-1);
+
+ dest = (char *)(pi->pi_msix.table + tab_index);
+ dest += msix_entry_offset;
+
+ if (size == 4)
+ *((uint32_t *)dest) = value;
+ else
+ *((uint64_t *)dest) = value;
+
+ return (0);
+}
+
+uint64_t
+pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size)
+{
+ char *dest;
+ int msix_entry_offset;
+ int tab_index;
+ uint64_t retval = ~0;
+
+ /*
+ * The PCI standard only allows 4 and 8 byte accesses to the MSI-X
+ * table but we also allow 1 byte access to accomodate reads from
+ * ddb.
+ */
+ if (size != 1 && size != 4 && size != 8)
+ return (retval);
+
+ msix_entry_offset = offset % MSIX_TABLE_ENTRY_SIZE;
+
+ /* support only aligned reads */
+ if ((msix_entry_offset % size) != 0) {
+ return (retval);
+ }
+
+ tab_index = offset / MSIX_TABLE_ENTRY_SIZE;
+
+ if (tab_index < pi->pi_msix.table_count) {
+ /* valid MSI-X Table access */
+ dest = (char *)(pi->pi_msix.table + tab_index);
+ dest += msix_entry_offset;
+
+ if (size == 1)
+ retval = *((uint8_t *)dest);
+ else if (size == 4)
+ retval = *((uint32_t *)dest);
+ else
+ retval = *((uint64_t *)dest);
+ } else if (pci_valid_pba_offset(pi, offset)) {
+ /* return 0 for PBA access */
+ retval = 0;
+ }
+
+ return (retval);
+}
+
+int
+pci_msix_table_bar(struct pci_devinst *pi)
+{
+
+ if (pi->pi_msix.table != NULL)
+ return (pi->pi_msix.table_bar);
+ else
+ return (-1);
+}
+
+int
+pci_msix_pba_bar(struct pci_devinst *pi)
+{
+
+ if (pi->pi_msix.table != NULL)
+ return (pi->pi_msix.pba_bar);
+ else
+ return (-1);
+}
+
+static int
+pci_emul_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ struct pci_devinst *pdi = arg;
+ struct pci_devemu *pe = pdi->pi_d;
+ uint64_t offset;
+ int i;
+
+ for (i = 0; i <= PCI_BARMAX; i++) {
+ if (pdi->pi_bar[i].type == PCIBAR_IO &&
+ port >= pdi->pi_bar[i].addr &&
+ port + bytes <= pdi->pi_bar[i].addr + pdi->pi_bar[i].size) {
+ offset = port - pdi->pi_bar[i].addr;
+ if (in)
+ *eax = (*pe->pe_barread)(ctx, vcpu, pdi, i,
+ offset, bytes);
+ else
+ (*pe->pe_barwrite)(ctx, vcpu, pdi, i, offset,
+ bytes, *eax);
+ return (0);
+ }
+ }
+ return (-1);
+}
+
+static int
+pci_emul_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int size, uint64_t *val, void *arg1, long arg2)
+{
+ struct pci_devinst *pdi = arg1;
+ struct pci_devemu *pe = pdi->pi_d;
+ uint64_t offset;
+ int bidx = (int) arg2;
+
+ assert(bidx <= PCI_BARMAX);
+ assert(pdi->pi_bar[bidx].type == PCIBAR_MEM32 ||
+ pdi->pi_bar[bidx].type == PCIBAR_MEM64);
+ assert(addr >= pdi->pi_bar[bidx].addr &&
+ addr + size <= pdi->pi_bar[bidx].addr + pdi->pi_bar[bidx].size);
+
+ offset = addr - pdi->pi_bar[bidx].addr;
+
+ if (dir == MEM_F_WRITE) {
+ if (size == 8) {
+ (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset,
+ 4, *val & 0xffffffff);
+ (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset + 4,
+ 4, *val >> 32);
+ } else {
+ (*pe->pe_barwrite)(ctx, vcpu, pdi, bidx, offset,
+ size, *val);
+ }
+ } else {
+ if (size == 8) {
+ *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx,
+ offset, 4);
+ *val |= (*pe->pe_barread)(ctx, vcpu, pdi, bidx,
+ offset + 4, 4) << 32;
+ } else {
+ *val = (*pe->pe_barread)(ctx, vcpu, pdi, bidx,
+ offset, size);
+ }
+ }
+
+ return (0);
+}
+
+
+static int
+pci_emul_alloc_resource(uint64_t *baseptr, uint64_t limit, uint64_t size,
+ uint64_t *addr)
+{
+ uint64_t base;
+
+ assert((size & (size - 1)) == 0); /* must be a power of 2 */
+
+ base = roundup2(*baseptr, size);
+
+ if (base + size <= limit) {
+ *addr = base;
+ *baseptr = base + size;
+ return (0);
+ } else
+ return (-1);
+}
+
+int
+pci_emul_alloc_bar(struct pci_devinst *pdi, int idx, enum pcibar_type type,
+ uint64_t size)
+{
+
+ return (pci_emul_alloc_pbar(pdi, idx, 0, type, size));
+}
+
+/*
+ * Register (or unregister) the MMIO or I/O region associated with the BAR
+ * register 'idx' of an emulated pci device.
+ */
+static void
+modify_bar_registration(struct pci_devinst *pi, int idx, int registration)
+{
+ int error;
+ struct inout_port iop;
+ struct mem_range mr;
+
+ switch (pi->pi_bar[idx].type) {
+ case PCIBAR_IO:
+ bzero(&iop, sizeof(struct inout_port));
+ iop.name = pi->pi_name;
+ iop.port = pi->pi_bar[idx].addr;
+ iop.size = pi->pi_bar[idx].size;
+ if (registration) {
+ iop.flags = IOPORT_F_INOUT;
+ iop.handler = pci_emul_io_handler;
+ iop.arg = pi;
+ error = register_inout(&iop);
+ } else
+ error = unregister_inout(&iop);
+ break;
+ case PCIBAR_MEM32:
+ case PCIBAR_MEM64:
+ bzero(&mr, sizeof(struct mem_range));
+ mr.name = pi->pi_name;
+ mr.base = pi->pi_bar[idx].addr;
+ mr.size = pi->pi_bar[idx].size;
+ if (registration) {
+ mr.flags = MEM_F_RW;
+ mr.handler = pci_emul_mem_handler;
+ mr.arg1 = pi;
+ mr.arg2 = idx;
+ error = register_mem(&mr);
+ } else
+ error = unregister_mem(&mr);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ assert(error == 0);
+}
+
+static void
+unregister_bar(struct pci_devinst *pi, int idx)
+{
+
+ modify_bar_registration(pi, idx, 0);
+}
+
+static void
+register_bar(struct pci_devinst *pi, int idx)
+{
+
+ modify_bar_registration(pi, idx, 1);
+}
+
+/* Are we decoding i/o port accesses for the emulated pci device? */
+static int
+porten(struct pci_devinst *pi)
+{
+ uint16_t cmd;
+
+ cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);
+
+ return (cmd & PCIM_CMD_PORTEN);
+}
+
+/* Are we decoding memory accesses for the emulated pci device? */
+static int
+memen(struct pci_devinst *pi)
+{
+ uint16_t cmd;
+
+ cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);
+
+ return (cmd & PCIM_CMD_MEMEN);
+}
+
+/*
+ * Update the MMIO or I/O address that is decoded by the BAR register.
+ *
+ * If the pci device has enabled the address space decoding then intercept
+ * the address range decoded by the BAR register.
+ */
+static void
+update_bar_address(struct pci_devinst *pi, uint64_t addr, int idx, int type)
+{
+ int decode;
+
+ if (pi->pi_bar[idx].type == PCIBAR_IO)
+ decode = porten(pi);
+ else
+ decode = memen(pi);
+
+ if (decode)
+ unregister_bar(pi, idx);
+
+ switch (type) {
+ case PCIBAR_IO:
+ case PCIBAR_MEM32:
+ pi->pi_bar[idx].addr = addr;
+ break;
+ case PCIBAR_MEM64:
+ pi->pi_bar[idx].addr &= ~0xffffffffUL;
+ pi->pi_bar[idx].addr |= addr;
+ break;
+ case PCIBAR_MEMHI64:
+ pi->pi_bar[idx].addr &= 0xffffffff;
+ pi->pi_bar[idx].addr |= addr;
+ break;
+ default:
+ assert(0);
+ }
+
+ if (decode)
+ register_bar(pi, idx);
+}
+
+int
+pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx, uint64_t hostbase,
+ enum pcibar_type type, uint64_t size)
+{
+ int error;
+ uint64_t *baseptr, limit, addr, mask, lobits, bar;
+
+ assert(idx >= 0 && idx <= PCI_BARMAX);
+
+ if ((size & (size - 1)) != 0)
+ size = 1UL << flsl(size); /* round up to a power of 2 */
+
+ /* Enforce minimum BAR sizes required by the PCI standard */
+ if (type == PCIBAR_IO) {
+ if (size < 4)
+ size = 4;
+ } else {
+ if (size < 16)
+ size = 16;
+ }
+
+ switch (type) {
+ case PCIBAR_NONE:
+ baseptr = NULL;
+ addr = mask = lobits = 0;
+ break;
+ case PCIBAR_IO:
+ baseptr = &pci_emul_iobase;
+ limit = PCI_EMUL_IOLIMIT;
+ mask = PCIM_BAR_IO_BASE;
+ lobits = PCIM_BAR_IO_SPACE;
+ break;
+ case PCIBAR_MEM64:
+ /*
+ * XXX
+ * Some drivers do not work well if the 64-bit BAR is allocated
+ * above 4GB. Allow for this by allocating small requests under
+ * 4GB unless then allocation size is larger than some arbitrary
+ * number (32MB currently).
+ */
+ if (size > 32 * 1024 * 1024) {
+ /*
+ * XXX special case for device requiring peer-peer DMA
+ */
+ if (size == 0x100000000UL)
+ baseptr = &hostbase;
+ else
+ baseptr = &pci_emul_membase64;
+ limit = PCI_EMUL_MEMLIMIT64;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+ PCIM_BAR_MEM_PREFETCH;
+ break;
+ } else {
+ baseptr = &pci_emul_membase32;
+ limit = PCI_EMUL_MEMLIMIT32;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64;
+ }
+ break;
+ case PCIBAR_MEM32:
+ baseptr = &pci_emul_membase32;
+ limit = PCI_EMUL_MEMLIMIT32;
+ mask = PCIM_BAR_MEM_BASE;
+ lobits = PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+ break;
+ default:
+ printf("pci_emul_alloc_base: invalid bar type %d\n", type);
+ assert(0);
+ }
+
+ if (baseptr != NULL) {
+ error = pci_emul_alloc_resource(baseptr, limit, size, &addr);
+ if (error != 0)
+ return (error);
+ }
+
+ pdi->pi_bar[idx].type = type;
+ pdi->pi_bar[idx].addr = addr;
+ pdi->pi_bar[idx].size = size;
+
+ /* Initialize the BAR register in config space */
+ bar = (addr & mask) | lobits;
+ pci_set_cfgdata32(pdi, PCIR_BAR(idx), bar);
+
+ if (type == PCIBAR_MEM64) {
+ assert(idx + 1 <= PCI_BARMAX);
+ pdi->pi_bar[idx + 1].type = PCIBAR_MEMHI64;
+ pci_set_cfgdata32(pdi, PCIR_BAR(idx + 1), bar >> 32);
+ }
+
+ register_bar(pdi, idx);
+
+ return (0);
+}
+
+#define CAP_START_OFFSET 0x40
+static int
+pci_emul_add_capability(struct pci_devinst *pi, u_char *capdata, int caplen)
+{
+ int i, capoff, reallen;
+ uint16_t sts;
+
+ assert(caplen > 0);
+
+ reallen = roundup2(caplen, 4); /* dword aligned */
+
+ sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+ if ((sts & PCIM_STATUS_CAPPRESENT) == 0)
+ capoff = CAP_START_OFFSET;
+ else
+ capoff = pi->pi_capend + 1;
+
+ /* Check if we have enough space */
+ if (capoff + reallen > PCI_REGMAX + 1)
+ return (-1);
+
+ /* Set the previous capability pointer */
+ if ((sts & PCIM_STATUS_CAPPRESENT) == 0) {
+ pci_set_cfgdata8(pi, PCIR_CAP_PTR, capoff);
+ pci_set_cfgdata16(pi, PCIR_STATUS, sts|PCIM_STATUS_CAPPRESENT);
+ } else
+ pci_set_cfgdata8(pi, pi->pi_prevcap + 1, capoff);
+
+ /* Copy the capability */
+ for (i = 0; i < caplen; i++)
+ pci_set_cfgdata8(pi, capoff + i, capdata[i]);
+
+ /* Set the next capability pointer */
+ pci_set_cfgdata8(pi, capoff + 1, 0);
+
+ pi->pi_prevcap = capoff;
+ pi->pi_capend = capoff + reallen - 1;
+ return (0);
+}
+
+static struct pci_devemu *
+pci_emul_finddev(char *name)
+{
+ struct pci_devemu **pdpp, *pdp;
+
+ SET_FOREACH(pdpp, pci_devemu_set) {
+ pdp = *pdpp;
+ if (!strcmp(pdp->pe_emu, name)) {
+ return (pdp);
+ }
+ }
+
+ return (NULL);
+}
+
+static int
+pci_emul_init(struct vmctx *ctx, struct pci_devemu *pde, int bus, int slot,
+ int func, struct funcinfo *fi)
+{
+ struct pci_devinst *pdi;
+ int err;
+
+ pdi = calloc(1, sizeof(struct pci_devinst));
+
+ pdi->pi_vmctx = ctx;
+ pdi->pi_bus = bus;
+ pdi->pi_slot = slot;
+ pdi->pi_func = func;
+ pthread_mutex_init(&pdi->pi_lintr.lock, NULL);
+ pdi->pi_lintr.pin = 0;
+ pdi->pi_lintr.state = IDLE;
+ pdi->pi_lintr.pirq_pin = 0;
+ pdi->pi_lintr.ioapic_irq = 0;
+ pdi->pi_d = pde;
+ snprintf(pdi->pi_name, PI_NAMESZ, "%s-pci-%d", pde->pe_emu, slot);
+
+ /* Disable legacy interrupts */
+ pci_set_cfgdata8(pdi, PCIR_INTLINE, 255);
+ pci_set_cfgdata8(pdi, PCIR_INTPIN, 0);
+
+ pci_set_cfgdata8(pdi, PCIR_COMMAND,
+ PCIM_CMD_PORTEN | PCIM_CMD_MEMEN | PCIM_CMD_BUSMASTEREN);
+
+ err = (*pde->pe_init)(ctx, pdi, fi->fi_param);
+ if (err == 0)
+ fi->fi_devi = pdi;
+ else
+ free(pdi);
+
+ return (err);
+}
+
+void
+pci_populate_msicap(struct msicap *msicap, int msgnum, int nextptr)
+{
+ int mmc;
+
+ CTASSERT(sizeof(struct msicap) == 14);
+
+ /* Number of msi messages must be a power of 2 between 1 and 32 */
+ assert((msgnum & (msgnum - 1)) == 0 && msgnum >= 1 && msgnum <= 32);
+ mmc = ffs(msgnum) - 1;
+
+ bzero(msicap, sizeof(struct msicap));
+ msicap->capid = PCIY_MSI;
+ msicap->nextptr = nextptr;
+ msicap->msgctrl = PCIM_MSICTRL_64BIT | (mmc << 1);
+}
+
+int
+pci_emul_add_msicap(struct pci_devinst *pi, int msgnum)
+{
+ struct msicap msicap;
+
+ pci_populate_msicap(&msicap, msgnum, 0);
+
+ return (pci_emul_add_capability(pi, (u_char *)&msicap, sizeof(msicap)));
+}
+
+static void
+pci_populate_msixcap(struct msixcap *msixcap, int msgnum, int barnum,
+ uint32_t msix_tab_size)
+{
+ CTASSERT(sizeof(struct msixcap) == 12);
+
+ assert(msix_tab_size % 4096 == 0);
+
+ bzero(msixcap, sizeof(struct msixcap));
+ msixcap->capid = PCIY_MSIX;
+
+ /*
+ * Message Control Register, all fields set to
+ * zero except for the Table Size.
+ * Note: Table size N is encoded as N-1
+ */
+ msixcap->msgctrl = msgnum - 1;
+
+ /*
+ * MSI-X BAR setup:
+ * - MSI-X table start at offset 0
+ * - PBA table starts at a 4K aligned offset after the MSI-X table
+ */
+ msixcap->table_info = barnum & PCIM_MSIX_BIR_MASK;
+ msixcap->pba_info = msix_tab_size | (barnum & PCIM_MSIX_BIR_MASK);
+}
+
+static void
+pci_msix_table_init(struct pci_devinst *pi, int table_entries)
+{
+ int i, table_size;
+
+ assert(table_entries > 0);
+ assert(table_entries <= MAX_MSIX_TABLE_ENTRIES);
+
+ table_size = table_entries * MSIX_TABLE_ENTRY_SIZE;
+ pi->pi_msix.table = calloc(1, table_size);
+
+ /* set mask bit of vector control register */
+ for (i = 0; i < table_entries; i++)
+ pi->pi_msix.table[i].vector_control |= PCIM_MSIX_VCTRL_MASK;
+}
+
+int
+pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum)
+{
+ uint32_t tab_size;
+ struct msixcap msixcap;
+
+ assert(msgnum >= 1 && msgnum <= MAX_MSIX_TABLE_ENTRIES);
+ assert(barnum >= 0 && barnum <= PCIR_MAX_BAR_0);
+
+ tab_size = msgnum * MSIX_TABLE_ENTRY_SIZE;
+
+ /* Align table size to nearest 4K */
+ tab_size = roundup2(tab_size, 4096);
+
+ pi->pi_msix.table_bar = barnum;
+ pi->pi_msix.pba_bar = barnum;
+ pi->pi_msix.table_offset = 0;
+ pi->pi_msix.table_count = msgnum;
+ pi->pi_msix.pba_offset = tab_size;
+ pi->pi_msix.pba_size = PBA_SIZE(msgnum);
+
+ pci_msix_table_init(pi, msgnum);
+
+ pci_populate_msixcap(&msixcap, msgnum, barnum, tab_size);
+
+ /* allocate memory for MSI-X Table and PBA */
+ pci_emul_alloc_bar(pi, barnum, PCIBAR_MEM32,
+ tab_size + pi->pi_msix.pba_size);
+
+ return (pci_emul_add_capability(pi, (u_char *)&msixcap,
+ sizeof(msixcap)));
+}
+
+void
+msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val)
+{
+ uint16_t msgctrl, rwmask;
+ int off, table_bar;
+
+ off = offset - capoff;
+ table_bar = pi->pi_msix.table_bar;
+ /* Message Control Register */
+ if (off == 2 && bytes == 2) {
+ rwmask = PCIM_MSIXCTRL_MSIX_ENABLE | PCIM_MSIXCTRL_FUNCTION_MASK;
+ msgctrl = pci_get_cfgdata16(pi, offset);
+ msgctrl &= ~rwmask;
+ msgctrl |= val & rwmask;
+ val = msgctrl;
+
+ pi->pi_msix.enabled = val & PCIM_MSIXCTRL_MSIX_ENABLE;
+ pi->pi_msix.function_mask = val & PCIM_MSIXCTRL_FUNCTION_MASK;
+ pci_lintr_update(pi);
+ }
+
+ CFGWRITE(pi, offset, val, bytes);
+}
+
+void
+msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val)
+{
+ uint16_t msgctrl, rwmask, msgdata, mme;
+ uint32_t addrlo;
+
+ /*
+ * If guest is writing to the message control register make sure
+ * we do not overwrite read-only fields.
+ */
+ if ((offset - capoff) == 2 && bytes == 2) {
+ rwmask = PCIM_MSICTRL_MME_MASK | PCIM_MSICTRL_MSI_ENABLE;
+ msgctrl = pci_get_cfgdata16(pi, offset);
+ msgctrl &= ~rwmask;
+ msgctrl |= val & rwmask;
+ val = msgctrl;
+
+ addrlo = pci_get_cfgdata32(pi, capoff + 4);
+ if (msgctrl & PCIM_MSICTRL_64BIT)
+ msgdata = pci_get_cfgdata16(pi, capoff + 12);
+ else
+ msgdata = pci_get_cfgdata16(pi, capoff + 8);
+
+ mme = msgctrl & PCIM_MSICTRL_MME_MASK;
+ pi->pi_msi.enabled = msgctrl & PCIM_MSICTRL_MSI_ENABLE ? 1 : 0;
+ if (pi->pi_msi.enabled) {
+ pi->pi_msi.addr = addrlo;
+ pi->pi_msi.msg_data = msgdata;
+ pi->pi_msi.maxmsgnum = 1 << (mme >> 4);
+ } else {
+ pi->pi_msi.maxmsgnum = 0;
+ }
+ pci_lintr_update(pi);
+ }
+
+ CFGWRITE(pi, offset, val, bytes);
+}
+
+void
+pciecap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val)
+{
+
+ /* XXX don't write to the readonly parts */
+ CFGWRITE(pi, offset, val, bytes);
+}
+
+#define PCIECAP_VERSION 0x2
+int
+pci_emul_add_pciecap(struct pci_devinst *pi, int type)
+{
+ int err;
+ struct pciecap pciecap;
+
+ CTASSERT(sizeof(struct pciecap) == 60);
+
+ if (type != PCIEM_TYPE_ROOT_PORT)
+ return (-1);
+
+ bzero(&pciecap, sizeof(pciecap));
+
+ pciecap.capid = PCIY_EXPRESS;
+ pciecap.pcie_capabilities = PCIECAP_VERSION | PCIEM_TYPE_ROOT_PORT;
+ pciecap.link_capabilities = 0x411; /* gen1, x1 */
+ pciecap.link_status = 0x11; /* gen1, x1 */
+
+ err = pci_emul_add_capability(pi, (u_char *)&pciecap, sizeof(pciecap));
+ return (err);
+}
+
+/*
+ * This function assumes that 'coff' is in the capabilities region of the
+ * config space.
+ */
+static void
+pci_emul_capwrite(struct pci_devinst *pi, int offset, int bytes, uint32_t val)
+{
+ int capid;
+ uint8_t capoff, nextoff;
+
+ /* Do not allow un-aligned writes */
+ if ((offset & (bytes - 1)) != 0)
+ return;
+
+ /* Find the capability that we want to update */
+ capoff = CAP_START_OFFSET;
+ while (1) {
+ nextoff = pci_get_cfgdata8(pi, capoff + 1);
+ if (nextoff == 0)
+ break;
+ if (offset >= capoff && offset < nextoff)
+ break;
+
+ capoff = nextoff;
+ }
+ assert(offset >= capoff);
+
+ /*
+ * Capability ID and Next Capability Pointer are readonly.
+ * However, some o/s's do 4-byte writes that include these.
+ * For this case, trim the write back to 2 bytes and adjust
+ * the data.
+ */
+ if (offset == capoff || offset == capoff + 1) {
+ if (offset == capoff && bytes == 4) {
+ bytes = 2;
+ offset += 2;
+ val >>= 16;
+ } else
+ return;
+ }
+
+ capid = pci_get_cfgdata8(pi, capoff);
+ switch (capid) {
+ case PCIY_MSI:
+ msicap_cfgwrite(pi, capoff, offset, bytes, val);
+ break;
+ case PCIY_MSIX:
+ msixcap_cfgwrite(pi, capoff, offset, bytes, val);
+ break;
+ case PCIY_EXPRESS:
+ pciecap_cfgwrite(pi, capoff, offset, bytes, val);
+ break;
+ default:
+ break;
+ }
+}
+
+static int
+pci_emul_iscap(struct pci_devinst *pi, int offset)
+{
+ uint16_t sts;
+
+ sts = pci_get_cfgdata16(pi, PCIR_STATUS);
+ if ((sts & PCIM_STATUS_CAPPRESENT) != 0) {
+ if (offset >= CAP_START_OFFSET && offset <= pi->pi_capend)
+ return (1);
+ }
+ return (0);
+}
+
+static int
+pci_emul_fallback_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int size, uint64_t *val, void *arg1, long arg2)
+{
+ /*
+ * Ignore writes; return 0xff's for reads. The mem read code
+ * will take care of truncating to the correct size.
+ */
+ if (dir == MEM_F_READ) {
+ *val = 0xffffffffffffffff;
+ }
+
+ return (0);
+}
+
+static int
+pci_emul_ecfg_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int bytes, uint64_t *val, void *arg1, long arg2)
+{
+ int bus, slot, func, coff, in;
+
+ coff = addr & 0xfff;
+ func = (addr >> 12) & 0x7;
+ slot = (addr >> 15) & 0x1f;
+ bus = (addr >> 20) & 0xff;
+ in = (dir == MEM_F_READ);
+ if (in)
+ *val = ~0UL;
+ pci_cfgrw(ctx, vcpu, in, bus, slot, func, coff, bytes, (uint32_t *)val);
+ return (0);
+}
+
+uint64_t
+pci_ecfg_base(void)
+{
+
+ return (PCI_EMUL_ECFG_BASE);
+}
+
+#define BUSIO_ROUNDUP 32
+#define BUSMEM_ROUNDUP (1024 * 1024)
+
+int
+init_pci(struct vmctx *ctx)
+{
+ struct mem_range mr;
+ struct pci_devemu *pde;
+ struct businfo *bi;
+ struct slotinfo *si;
+ struct funcinfo *fi;
+ size_t lowmem;
+ int bus, slot, func;
+ int error;
+
+ pci_emul_iobase = PCI_EMUL_IOBASE;
+ pci_emul_membase32 = vm_get_lowmem_limit(ctx);
+ pci_emul_membase64 = PCI_EMUL_MEMBASE64;
+
+ for (bus = 0; bus < MAXBUSES; bus++) {
+ if ((bi = pci_businfo[bus]) == NULL)
+ continue;
+ /*
+ * Keep track of the i/o and memory resources allocated to
+ * this bus.
+ */
+ bi->iobase = pci_emul_iobase;
+ bi->membase32 = pci_emul_membase32;
+ bi->membase64 = pci_emul_membase64;
+
+ for (slot = 0; slot < MAXSLOTS; slot++) {
+ si = &bi->slotinfo[slot];
+ for (func = 0; func < MAXFUNCS; func++) {
+ fi = &si->si_funcs[func];
+ if (fi->fi_name == NULL)
+ continue;
+ pde = pci_emul_finddev(fi->fi_name);
+ assert(pde != NULL);
+ error = pci_emul_init(ctx, pde, bus, slot,
+ func, fi);
+ if (error)
+ return (error);
+ }
+ }
+
+ /*
+ * Add some slop to the I/O and memory resources decoded by
+ * this bus to give a guest some flexibility if it wants to
+ * reprogram the BARs.
+ */
+ pci_emul_iobase += BUSIO_ROUNDUP;
+ pci_emul_iobase = roundup2(pci_emul_iobase, BUSIO_ROUNDUP);
+ bi->iolimit = pci_emul_iobase;
+
+ pci_emul_membase32 += BUSMEM_ROUNDUP;
+ pci_emul_membase32 = roundup2(pci_emul_membase32,
+ BUSMEM_ROUNDUP);
+ bi->memlimit32 = pci_emul_membase32;
+
+ pci_emul_membase64 += BUSMEM_ROUNDUP;
+ pci_emul_membase64 = roundup2(pci_emul_membase64,
+ BUSMEM_ROUNDUP);
+ bi->memlimit64 = pci_emul_membase64;
+ }
+
+ /*
+ * PCI backends are initialized before routing INTx interrupts
+ * so that LPC devices are able to reserve ISA IRQs before
+ * routing PIRQ pins.
+ */
+ for (bus = 0; bus < MAXBUSES; bus++) {
+ if ((bi = pci_businfo[bus]) == NULL)
+ continue;
+
+ for (slot = 0; slot < MAXSLOTS; slot++) {
+ si = &bi->slotinfo[slot];
+ for (func = 0; func < MAXFUNCS; func++) {
+ fi = &si->si_funcs[func];
+ if (fi->fi_devi == NULL)
+ continue;
+ pci_lintr_route(fi->fi_devi);
+ }
+ }
+ }
+ lpc_pirq_routed();
+
+ /*
+ * The guest physical memory map looks like the following:
+ * [0, lowmem) guest system memory
+ * [lowmem, lowmem_limit) memory hole (may be absent)
+ * [lowmem_limit, 0xE0000000) PCI hole (32-bit BAR allocation)
+ * [0xE0000000, 0xF0000000) PCI extended config window
+ * [0xF0000000, 4GB) LAPIC, IOAPIC, HPET, firmware
+ * [4GB, 4GB + highmem)
+ */
+
+ /*
+ * Accesses to memory addresses that are not allocated to system
+ * memory or PCI devices return 0xff's.
+ */
+ lowmem = vm_get_lowmem_size(ctx);
+ bzero(&mr, sizeof(struct mem_range));
+ mr.name = "PCI hole";
+ mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
+ mr.base = lowmem;
+ mr.size = (4ULL * 1024 * 1024 * 1024) - lowmem;
+ mr.handler = pci_emul_fallback_handler;
+ error = register_mem_fallback(&mr);
+ assert(error == 0);
+
+ /* PCI extended config space */
+ bzero(&mr, sizeof(struct mem_range));
+ mr.name = "PCI ECFG";
+ mr.flags = MEM_F_RW | MEM_F_IMMUTABLE;
+ mr.base = PCI_EMUL_ECFG_BASE;
+ mr.size = PCI_EMUL_ECFG_SIZE;
+ mr.handler = pci_emul_ecfg_handler;
+ error = register_mem(&mr);
+ assert(error == 0);
+
+ return (0);
+}
+
+#ifdef __FreeBSD__
+static void
+pci_apic_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
+ void *arg)
+{
+
+ dsdt_line(" Package ()");
+ dsdt_line(" {");
+ dsdt_line(" 0x%X,", slot << 16 | 0xffff);
+ dsdt_line(" 0x%02X,", pin - 1);
+ dsdt_line(" Zero,");
+ dsdt_line(" 0x%X", ioapic_irq);
+ dsdt_line(" },");
+}
+
+static void
+pci_pirq_prt_entry(int bus, int slot, int pin, int pirq_pin, int ioapic_irq,
+ void *arg)
+{
+ char *name;
+
+ name = lpc_pirq_name(pirq_pin);
+ if (name == NULL)
+ return;
+ dsdt_line(" Package ()");
+ dsdt_line(" {");
+ dsdt_line(" 0x%X,", slot << 16 | 0xffff);
+ dsdt_line(" 0x%02X,", pin - 1);
+ dsdt_line(" %s,", name);
+ dsdt_line(" 0x00");
+ dsdt_line(" },");
+ free(name);
+}
+
+/*
+ * A bhyve virtual machine has a flat PCI hierarchy with a root port
+ * corresponding to each PCI bus.
+ */
+static void
+pci_bus_write_dsdt(int bus)
+{
+ struct businfo *bi;
+ struct slotinfo *si;
+ struct pci_devinst *pi;
+ int count, func, slot;
+
+ /*
+ * If there are no devices on this 'bus' then just return.
+ */
+ if ((bi = pci_businfo[bus]) == NULL) {
+ /*
+ * Bus 0 is special because it decodes the I/O ports used
+ * for PCI config space access even if there are no devices
+ * on it.
+ */
+ if (bus != 0)
+ return;
+ }
+
+ dsdt_line(" Device (PC%02X)", bus);
+ dsdt_line(" {");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0A03\"))");
+ dsdt_line(" Name (_ADR, Zero)");
+
+ dsdt_line(" Method (_BBN, 0, NotSerialized)");
+ dsdt_line(" {");
+ dsdt_line(" Return (0x%08X)", bus);
+ dsdt_line(" }");
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ dsdt_line(" WordBusNumber (ResourceProducer, MinFixed, "
+ "MaxFixed, PosDecode,");
+ dsdt_line(" 0x0000, // Granularity");
+ dsdt_line(" 0x%04X, // Range Minimum", bus);
+ dsdt_line(" 0x%04X, // Range Maximum", bus);
+ dsdt_line(" 0x0000, // Translation Offset");
+ dsdt_line(" 0x0001, // Length");
+ dsdt_line(" ,, )");
+
+ if (bus == 0) {
+ dsdt_indent(3);
+ dsdt_fixed_ioport(0xCF8, 8);
+ dsdt_unindent(3);
+
+ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, "
+ "PosDecode, EntireRange,");
+ dsdt_line(" 0x0000, // Granularity");
+ dsdt_line(" 0x0000, // Range Minimum");
+ dsdt_line(" 0x0CF7, // Range Maximum");
+ dsdt_line(" 0x0000, // Translation Offset");
+ dsdt_line(" 0x0CF8, // Length");
+ dsdt_line(" ,, , TypeStatic)");
+
+ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, "
+ "PosDecode, EntireRange,");
+ dsdt_line(" 0x0000, // Granularity");
+ dsdt_line(" 0x0D00, // Range Minimum");
+ dsdt_line(" 0x%04X, // Range Maximum",
+ PCI_EMUL_IOBASE - 1);
+ dsdt_line(" 0x0000, // Translation Offset");
+ dsdt_line(" 0x%04X, // Length",
+ PCI_EMUL_IOBASE - 0x0D00);
+ dsdt_line(" ,, , TypeStatic)");
+
+ if (bi == NULL) {
+ dsdt_line(" })");
+ goto done;
+ }
+ }
+ assert(bi != NULL);
+
+ /* i/o window */
+ dsdt_line(" WordIO (ResourceProducer, MinFixed, MaxFixed, "
+ "PosDecode, EntireRange,");
+ dsdt_line(" 0x0000, // Granularity");
+ dsdt_line(" 0x%04X, // Range Minimum", bi->iobase);
+ dsdt_line(" 0x%04X, // Range Maximum",
+ bi->iolimit - 1);
+ dsdt_line(" 0x0000, // Translation Offset");
+ dsdt_line(" 0x%04X, // Length",
+ bi->iolimit - bi->iobase);
+ dsdt_line(" ,, , TypeStatic)");
+
+ /* mmio window (32-bit) */
+ dsdt_line(" DWordMemory (ResourceProducer, PosDecode, "
+ "MinFixed, MaxFixed, NonCacheable, ReadWrite,");
+ dsdt_line(" 0x00000000, // Granularity");
+ dsdt_line(" 0x%08X, // Range Minimum\n", bi->membase32);
+ dsdt_line(" 0x%08X, // Range Maximum\n",
+ bi->memlimit32 - 1);
+ dsdt_line(" 0x00000000, // Translation Offset");
+ dsdt_line(" 0x%08X, // Length\n",
+ bi->memlimit32 - bi->membase32);
+ dsdt_line(" ,, , AddressRangeMemory, TypeStatic)");
+
+ /* mmio window (64-bit) */
+ dsdt_line(" QWordMemory (ResourceProducer, PosDecode, "
+ "MinFixed, MaxFixed, NonCacheable, ReadWrite,");
+ dsdt_line(" 0x0000000000000000, // Granularity");
+ dsdt_line(" 0x%016lX, // Range Minimum\n", bi->membase64);
+ dsdt_line(" 0x%016lX, // Range Maximum\n",
+ bi->memlimit64 - 1);
+ dsdt_line(" 0x0000000000000000, // Translation Offset");
+ dsdt_line(" 0x%016lX, // Length\n",
+ bi->memlimit64 - bi->membase64);
+ dsdt_line(" ,, , AddressRangeMemory, TypeStatic)");
+ dsdt_line(" })");
+
+ count = pci_count_lintr(bus);
+ if (count != 0) {
+ dsdt_indent(2);
+ dsdt_line("Name (PPRT, Package ()");
+ dsdt_line("{");
+ pci_walk_lintr(bus, pci_pirq_prt_entry, NULL);
+ dsdt_line("})");
+ dsdt_line("Name (APRT, Package ()");
+ dsdt_line("{");
+ pci_walk_lintr(bus, pci_apic_prt_entry, NULL);
+ dsdt_line("})");
+ dsdt_line("Method (_PRT, 0, NotSerialized)");
+ dsdt_line("{");
+ dsdt_line(" If (PICM)");
+ dsdt_line(" {");
+ dsdt_line(" Return (APRT)");
+ dsdt_line(" }");
+ dsdt_line(" Else");
+ dsdt_line(" {");
+ dsdt_line(" Return (PPRT)");
+ dsdt_line(" }");
+ dsdt_line("}");
+ dsdt_unindent(2);
+ }
+
+ dsdt_indent(2);
+ for (slot = 0; slot < MAXSLOTS; slot++) {
+ si = &bi->slotinfo[slot];
+ for (func = 0; func < MAXFUNCS; func++) {
+ pi = si->si_funcs[func].fi_devi;
+ if (pi != NULL && pi->pi_d->pe_write_dsdt != NULL)
+ pi->pi_d->pe_write_dsdt(pi);
+ }
+ }
+ dsdt_unindent(2);
+done:
+ dsdt_line(" }");
+}
+
+void
+pci_write_dsdt(void)
+{
+ int bus;
+
+ dsdt_indent(1);
+ dsdt_line("Name (PICM, 0x00)");
+ dsdt_line("Method (_PIC, 1, NotSerialized)");
+ dsdt_line("{");
+ dsdt_line(" Store (Arg0, PICM)");
+ dsdt_line("}");
+ dsdt_line("");
+ dsdt_line("Scope (_SB)");
+ dsdt_line("{");
+ for (bus = 0; bus < MAXBUSES; bus++)
+ pci_bus_write_dsdt(bus);
+ dsdt_line("}");
+ dsdt_unindent(1);
+}
+#endif
+
+int
+pci_bus_configured(int bus)
+{
+ assert(bus >= 0 && bus < MAXBUSES);
+ return (pci_businfo[bus] != NULL);
+}
+
+int
+pci_msi_enabled(struct pci_devinst *pi)
+{
+ return (pi->pi_msi.enabled);
+}
+
+int
+pci_msi_maxmsgnum(struct pci_devinst *pi)
+{
+ if (pi->pi_msi.enabled)
+ return (pi->pi_msi.maxmsgnum);
+ else
+ return (0);
+}
+
+int
+pci_msix_enabled(struct pci_devinst *pi)
+{
+
+ return (pi->pi_msix.enabled && !pi->pi_msi.enabled);
+}
+
+void
+pci_generate_msix(struct pci_devinst *pi, int index)
+{
+ struct msix_table_entry *mte;
+
+ if (!pci_msix_enabled(pi))
+ return;
+
+ if (pi->pi_msix.function_mask)
+ return;
+
+ if (index >= pi->pi_msix.table_count)
+ return;
+
+ mte = &pi->pi_msix.table[index];
+ if ((mte->vector_control & PCIM_MSIX_VCTRL_MASK) == 0) {
+ /* XXX Set PBA bit if interrupt is disabled */
+ vm_lapic_msi(pi->pi_vmctx, mte->addr, mte->msg_data);
+ }
+}
+
+void
+pci_generate_msi(struct pci_devinst *pi, int index)
+{
+
+ if (pci_msi_enabled(pi) && index < pci_msi_maxmsgnum(pi)) {
+ vm_lapic_msi(pi->pi_vmctx, pi->pi_msi.addr,
+ pi->pi_msi.msg_data + index);
+ }
+}
+
+static bool
+pci_lintr_permitted(struct pci_devinst *pi)
+{
+ uint16_t cmd;
+
+ cmd = pci_get_cfgdata16(pi, PCIR_COMMAND);
+ return (!(pi->pi_msi.enabled || pi->pi_msix.enabled ||
+ (cmd & PCIM_CMD_INTxDIS)));
+}
+
+void
+pci_lintr_request(struct pci_devinst *pi)
+{
+ struct businfo *bi;
+ struct slotinfo *si;
+ int bestpin, bestcount, pin;
+
+ bi = pci_businfo[pi->pi_bus];
+ assert(bi != NULL);
+
+ /*
+ * Just allocate a pin from our slot. The pin will be
+ * assigned IRQs later when interrupts are routed.
+ */
+ si = &bi->slotinfo[pi->pi_slot];
+ bestpin = 0;
+ bestcount = si->si_intpins[0].ii_count;
+ for (pin = 1; pin < 4; pin++) {
+ if (si->si_intpins[pin].ii_count < bestcount) {
+ bestpin = pin;
+ bestcount = si->si_intpins[pin].ii_count;
+ }
+ }
+
+ si->si_intpins[bestpin].ii_count++;
+ pi->pi_lintr.pin = bestpin + 1;
+ pci_set_cfgdata8(pi, PCIR_INTPIN, bestpin + 1);
+}
+
+static void
+pci_lintr_route(struct pci_devinst *pi)
+{
+ struct businfo *bi;
+ struct intxinfo *ii;
+
+ if (pi->pi_lintr.pin == 0)
+ return;
+
+ bi = pci_businfo[pi->pi_bus];
+ assert(bi != NULL);
+ ii = &bi->slotinfo[pi->pi_slot].si_intpins[pi->pi_lintr.pin - 1];
+
+ /*
+ * Attempt to allocate an I/O APIC pin for this intpin if one
+ * is not yet assigned.
+ */
+ if (ii->ii_ioapic_irq == 0)
+ ii->ii_ioapic_irq = ioapic_pci_alloc_irq();
+ assert(ii->ii_ioapic_irq > 0);
+
+ /*
+ * Attempt to allocate a PIRQ pin for this intpin if one is
+ * not yet assigned.
+ */
+ if (ii->ii_pirq_pin == 0)
+ ii->ii_pirq_pin = pirq_alloc_pin(pi->pi_vmctx);
+ assert(ii->ii_pirq_pin > 0);
+
+ pi->pi_lintr.ioapic_irq = ii->ii_ioapic_irq;
+ pi->pi_lintr.pirq_pin = ii->ii_pirq_pin;
+ pci_set_cfgdata8(pi, PCIR_INTLINE, pirq_irq(ii->ii_pirq_pin));
+}
+
+void
+pci_lintr_assert(struct pci_devinst *pi)
+{
+
+ assert(pi->pi_lintr.pin > 0);
+
+ pthread_mutex_lock(&pi->pi_lintr.lock);
+ if (pi->pi_lintr.state == IDLE) {
+ if (pci_lintr_permitted(pi)) {
+ pi->pi_lintr.state = ASSERTED;
+ pci_irq_assert(pi);
+ } else
+ pi->pi_lintr.state = PENDING;
+ }
+ pthread_mutex_unlock(&pi->pi_lintr.lock);
+}
+
+void
+pci_lintr_deassert(struct pci_devinst *pi)
+{
+
+ assert(pi->pi_lintr.pin > 0);
+
+ pthread_mutex_lock(&pi->pi_lintr.lock);
+ if (pi->pi_lintr.state == ASSERTED) {
+ pi->pi_lintr.state = IDLE;
+ pci_irq_deassert(pi);
+ } else if (pi->pi_lintr.state == PENDING)
+ pi->pi_lintr.state = IDLE;
+ pthread_mutex_unlock(&pi->pi_lintr.lock);
+}
+
+static void
+pci_lintr_update(struct pci_devinst *pi)
+{
+
+ pthread_mutex_lock(&pi->pi_lintr.lock);
+ if (pi->pi_lintr.state == ASSERTED && !pci_lintr_permitted(pi)) {
+ pci_irq_deassert(pi);
+ pi->pi_lintr.state = PENDING;
+ } else if (pi->pi_lintr.state == PENDING && pci_lintr_permitted(pi)) {
+ pi->pi_lintr.state = ASSERTED;
+ pci_irq_assert(pi);
+ }
+ pthread_mutex_unlock(&pi->pi_lintr.lock);
+}
+
+int
+pci_count_lintr(int bus)
+{
+ int count, slot, pin;
+ struct slotinfo *slotinfo;
+
+ count = 0;
+ if (pci_businfo[bus] != NULL) {
+ for (slot = 0; slot < MAXSLOTS; slot++) {
+ slotinfo = &pci_businfo[bus]->slotinfo[slot];
+ for (pin = 0; pin < 4; pin++) {
+ if (slotinfo->si_intpins[pin].ii_count != 0)
+ count++;
+ }
+ }
+ }
+ return (count);
+}
+
+void
+pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg)
+{
+ struct businfo *bi;
+ struct slotinfo *si;
+ struct intxinfo *ii;
+ int slot, pin;
+
+ if ((bi = pci_businfo[bus]) == NULL)
+ return;
+
+ for (slot = 0; slot < MAXSLOTS; slot++) {
+ si = &bi->slotinfo[slot];
+ for (pin = 0; pin < 4; pin++) {
+ ii = &si->si_intpins[pin];
+ if (ii->ii_count != 0)
+ cb(bus, slot, pin + 1, ii->ii_pirq_pin,
+ ii->ii_ioapic_irq, arg);
+ }
+ }
+}
+
+/*
+ * Return 1 if the emulated device in 'slot' is a multi-function device.
+ * Return 0 otherwise.
+ */
+static int
+pci_emul_is_mfdev(int bus, int slot)
+{
+ struct businfo *bi;
+ struct slotinfo *si;
+ int f, numfuncs;
+
+ numfuncs = 0;
+ if ((bi = pci_businfo[bus]) != NULL) {
+ si = &bi->slotinfo[slot];
+ for (f = 0; f < MAXFUNCS; f++) {
+ if (si->si_funcs[f].fi_devi != NULL) {
+ numfuncs++;
+ }
+ }
+ }
+ return (numfuncs > 1);
+}
+
+/*
+ * Ensure that the PCIM_MFDEV bit is properly set (or unset) depending on
+ * whether or not is a multi-function being emulated in the pci 'slot'.
+ */
+static void
+pci_emul_hdrtype_fixup(int bus, int slot, int off, int bytes, uint32_t *rv)
+{
+ int mfdev;
+
+ if (off <= PCIR_HDRTYPE && off + bytes > PCIR_HDRTYPE) {
+ mfdev = pci_emul_is_mfdev(bus, slot);
+ switch (bytes) {
+ case 1:
+ case 2:
+ *rv &= ~PCIM_MFDEV;
+ if (mfdev) {
+ *rv |= PCIM_MFDEV;
+ }
+ break;
+ case 4:
+ *rv &= ~(PCIM_MFDEV << 16);
+ if (mfdev) {
+ *rv |= (PCIM_MFDEV << 16);
+ }
+ break;
+ }
+ }
+}
+
+static uint32_t
+bits_changed(uint32_t old, uint32_t new, uint32_t mask)
+{
+
+ return ((old ^ new) & mask);
+}
+
+static void
+pci_emul_cmdwrite(struct pci_devinst *pi, uint32_t new, int bytes)
+{
+ int i;
+ uint16_t old;
+
+ /*
+ * The command register is at an offset of 4 bytes and thus the
+ * guest could write 1, 2 or 4 bytes starting at this offset.
+ */
+
+ old = pci_get_cfgdata16(pi, PCIR_COMMAND); /* stash old value */
+ CFGWRITE(pi, PCIR_COMMAND, new, bytes); /* update config */
+ new = pci_get_cfgdata16(pi, PCIR_COMMAND); /* get updated value */
+
+ /*
+ * If the MMIO or I/O address space decoding has changed then
+ * register/unregister all BARs that decode that address space.
+ */
+ for (i = 0; i <= PCI_BARMAX; i++) {
+ switch (pi->pi_bar[i].type) {
+ case PCIBAR_NONE:
+ case PCIBAR_MEMHI64:
+ break;
+ case PCIBAR_IO:
+ /* I/O address space decoding changed? */
+ if (bits_changed(old, new, PCIM_CMD_PORTEN)) {
+ if (porten(pi))
+ register_bar(pi, i);
+ else
+ unregister_bar(pi, i);
+ }
+ break;
+ case PCIBAR_MEM32:
+ case PCIBAR_MEM64:
+ /* MMIO address space decoding changed? */
+ if (bits_changed(old, new, PCIM_CMD_MEMEN)) {
+ if (memen(pi))
+ register_bar(pi, i);
+ else
+ unregister_bar(pi, i);
+ }
+ break;
+ default:
+ assert(0);
+ }
+ }
+
+ /*
+ * If INTx has been unmasked and is pending, assert the
+ * interrupt.
+ */
+ pci_lintr_update(pi);
+}
+
+static void
+pci_cfgrw(struct vmctx *ctx, int vcpu, int in, int bus, int slot, int func,
+ int coff, int bytes, uint32_t *eax)
+{
+ struct businfo *bi;
+ struct slotinfo *si;
+ struct pci_devinst *pi;
+ struct pci_devemu *pe;
+ int idx, needcfg;
+ uint64_t addr, bar, mask;
+
+ if ((bi = pci_businfo[bus]) != NULL) {
+ si = &bi->slotinfo[slot];
+ pi = si->si_funcs[func].fi_devi;
+ } else
+ pi = NULL;
+
+ /*
+ * Just return if there is no device at this slot:func or if the
+ * the guest is doing an un-aligned access.
+ */
+ if (pi == NULL || (bytes != 1 && bytes != 2 && bytes != 4) ||
+ (coff & (bytes - 1)) != 0) {
+ if (in)
+ *eax = 0xffffffff;
+ return;
+ }
+
+ /*
+ * Ignore all writes beyond the standard config space and return all
+ * ones on reads.
+ */
+ if (coff >= PCI_REGMAX + 1) {
+ if (in) {
+ *eax = 0xffffffff;
+ /*
+ * Extended capabilities begin at offset 256 in config
+ * space. Absence of extended capabilities is signaled
+ * with all 0s in the extended capability header at
+ * offset 256.
+ */
+ if (coff <= PCI_REGMAX + 4)
+ *eax = 0x00000000;
+ }
+ return;
+ }
+
+ pe = pi->pi_d;
+
+ /*
+ * Config read
+ */
+ if (in) {
+ /* Let the device emulation override the default handler */
+ if (pe->pe_cfgread != NULL) {
+ needcfg = pe->pe_cfgread(ctx, vcpu, pi, coff, bytes,
+ eax);
+ } else {
+ needcfg = 1;
+ }
+
+ if (needcfg) {
+ if (bytes == 1)
+ *eax = pci_get_cfgdata8(pi, coff);
+ else if (bytes == 2)
+ *eax = pci_get_cfgdata16(pi, coff);
+ else
+ *eax = pci_get_cfgdata32(pi, coff);
+ }
+
+ pci_emul_hdrtype_fixup(bus, slot, coff, bytes, eax);
+ } else {
+ /* Let the device emulation override the default handler */
+ if (pe->pe_cfgwrite != NULL &&
+ (*pe->pe_cfgwrite)(ctx, vcpu, pi, coff, bytes, *eax) == 0)
+ return;
+
+ /*
+ * Special handling for write to BAR registers
+ */
+ if (coff >= PCIR_BAR(0) && coff < PCIR_BAR(PCI_BARMAX + 1)) {
+ /*
+ * Ignore writes to BAR registers that are not
+ * 4-byte aligned.
+ */
+ if (bytes != 4 || (coff & 0x3) != 0)
+ return;
+ idx = (coff - PCIR_BAR(0)) / 4;
+ mask = ~(pi->pi_bar[idx].size - 1);
+ switch (pi->pi_bar[idx].type) {
+ case PCIBAR_NONE:
+ pi->pi_bar[idx].addr = bar = 0;
+ break;
+ case PCIBAR_IO:
+ addr = *eax & mask;
+ addr &= 0xffff;
+ bar = addr | PCIM_BAR_IO_SPACE;
+ /*
+ * Register the new BAR value for interception
+ */
+ if (addr != pi->pi_bar[idx].addr) {
+ update_bar_address(pi, addr, idx,
+ PCIBAR_IO);
+ }
+ break;
+ case PCIBAR_MEM32:
+ addr = bar = *eax & mask;
+ bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_32;
+ if (addr != pi->pi_bar[idx].addr) {
+ update_bar_address(pi, addr, idx,
+ PCIBAR_MEM32);
+ }
+ break;
+ case PCIBAR_MEM64:
+ addr = bar = *eax & mask;
+ bar |= PCIM_BAR_MEM_SPACE | PCIM_BAR_MEM_64 |
+ PCIM_BAR_MEM_PREFETCH;
+ if (addr != (uint32_t)pi->pi_bar[idx].addr) {
+ update_bar_address(pi, addr, idx,
+ PCIBAR_MEM64);
+ }
+ break;
+ case PCIBAR_MEMHI64:
+ mask = ~(pi->pi_bar[idx - 1].size - 1);
+ addr = ((uint64_t)*eax << 32) & mask;
+ bar = addr >> 32;
+ if (bar != pi->pi_bar[idx - 1].addr >> 32) {
+ update_bar_address(pi, addr, idx - 1,
+ PCIBAR_MEMHI64);
+ }
+ break;
+ default:
+ assert(0);
+ }
+ pci_set_cfgdata32(pi, coff, bar);
+
+ } else if (pci_emul_iscap(pi, coff)) {
+ pci_emul_capwrite(pi, coff, bytes, *eax);
+ } else if (coff == PCIR_COMMAND) {
+ pci_emul_cmdwrite(pi, *eax, bytes);
+ } else {
+ CFGWRITE(pi, coff, *eax, bytes);
+ }
+ }
+}
+
+static int cfgenable, cfgbus, cfgslot, cfgfunc, cfgoff;
+
+static int
+pci_emul_cfgaddr(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ uint32_t x;
+
+ if (bytes != 4) {
+ if (in)
+ *eax = (bytes == 2) ? 0xffff : 0xff;
+ return (0);
+ }
+
+ if (in) {
+ x = (cfgbus << 16) | (cfgslot << 11) | (cfgfunc << 8) | cfgoff;
+ if (cfgenable)
+ x |= CONF1_ENABLE;
+ *eax = x;
+ } else {
+ x = *eax;
+ cfgenable = (x & CONF1_ENABLE) == CONF1_ENABLE;
+ cfgoff = x & PCI_REGMAX;
+ cfgfunc = (x >> 8) & PCI_FUNCMAX;
+ cfgslot = (x >> 11) & PCI_SLOTMAX;
+ cfgbus = (x >> 16) & PCI_BUSMAX;
+ }
+
+ return (0);
+}
+INOUT_PORT(pci_cfgaddr, CONF1_ADDR_PORT, IOPORT_F_INOUT, pci_emul_cfgaddr);
+
+static int
+pci_emul_cfgdata(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int coff;
+
+ assert(bytes == 1 || bytes == 2 || bytes == 4);
+
+ coff = cfgoff + (port - CONF1_DATA_PORT);
+ if (cfgenable) {
+ pci_cfgrw(ctx, vcpu, in, cfgbus, cfgslot, cfgfunc, coff, bytes,
+ eax);
+ } else {
+ /* Ignore accesses to cfgdata if not enabled by cfgaddr */
+ if (in)
+ *eax = 0xffffffff;
+ }
+ return (0);
+}
+
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+0, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+1, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+2, IOPORT_F_INOUT, pci_emul_cfgdata);
+INOUT_PORT(pci_cfgdata, CONF1_DATA_PORT+3, IOPORT_F_INOUT, pci_emul_cfgdata);
+
+#define PCI_EMUL_TEST
+#ifdef PCI_EMUL_TEST
+/*
+ * Define a dummy test device
+ */
+#define DIOSZ 8
+#define DMEMSZ 4096
+struct pci_emul_dsoftc {
+ uint8_t ioregs[DIOSZ];
+ uint8_t memregs[DMEMSZ];
+};
+
+#define PCI_EMUL_MSI_MSGS 4
+#define PCI_EMUL_MSIX_MSGS 16
+
+static int
+pci_emul_dinit(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ int error;
+ struct pci_emul_dsoftc *sc;
+
+ sc = calloc(1, sizeof(struct pci_emul_dsoftc));
+
+ pi->pi_arg = sc;
+
+ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x0001);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, 0x10DD);
+ pci_set_cfgdata8(pi, PCIR_CLASS, 0x02);
+
+ error = pci_emul_add_msicap(pi, PCI_EMUL_MSI_MSGS);
+ assert(error == 0);
+
+ error = pci_emul_alloc_bar(pi, 0, PCIBAR_IO, DIOSZ);
+ assert(error == 0);
+
+ error = pci_emul_alloc_bar(pi, 1, PCIBAR_MEM32, DMEMSZ);
+ assert(error == 0);
+
+ return (0);
+}
+
+static void
+pci_emul_diow(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size, uint64_t value)
+{
+ int i;
+ struct pci_emul_dsoftc *sc = pi->pi_arg;
+
+ if (baridx == 0) {
+ if (offset + size > DIOSZ) {
+ printf("diow: iow too large, offset %ld size %d\n",
+ offset, size);
+ return;
+ }
+
+ if (size == 1) {
+ sc->ioregs[offset] = value & 0xff;
+ } else if (size == 2) {
+ *(uint16_t *)&sc->ioregs[offset] = value & 0xffff;
+ } else if (size == 4) {
+ *(uint32_t *)&sc->ioregs[offset] = value;
+ } else {
+ printf("diow: iow unknown size %d\n", size);
+ }
+
+ /*
+ * Special magic value to generate an interrupt
+ */
+ if (offset == 4 && size == 4 && pci_msi_enabled(pi))
+ pci_generate_msi(pi, value % pci_msi_maxmsgnum(pi));
+
+ if (value == 0xabcdef) {
+ for (i = 0; i < pci_msi_maxmsgnum(pi); i++)
+ pci_generate_msi(pi, i);
+ }
+ }
+
+ if (baridx == 1) {
+ if (offset + size > DMEMSZ) {
+ printf("diow: memw too large, offset %ld size %d\n",
+ offset, size);
+ return;
+ }
+
+ if (size == 1) {
+ sc->memregs[offset] = value;
+ } else if (size == 2) {
+ *(uint16_t *)&sc->memregs[offset] = value;
+ } else if (size == 4) {
+ *(uint32_t *)&sc->memregs[offset] = value;
+ } else if (size == 8) {
+ *(uint64_t *)&sc->memregs[offset] = value;
+ } else {
+ printf("diow: memw unknown size %d\n", size);
+ }
+
+ /*
+ * magic interrupt ??
+ */
+ }
+
+ if (baridx > 1) {
+ printf("diow: unknown bar idx %d\n", baridx);
+ }
+}
+
+static uint64_t
+pci_emul_dior(struct vmctx *ctx, int vcpu, struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size)
+{
+ struct pci_emul_dsoftc *sc = pi->pi_arg;
+ uint32_t value;
+
+ if (baridx == 0) {
+ if (offset + size > DIOSZ) {
+ printf("dior: ior too large, offset %ld size %d\n",
+ offset, size);
+ return (0);
+ }
+
+ if (size == 1) {
+ value = sc->ioregs[offset];
+ } else if (size == 2) {
+ value = *(uint16_t *) &sc->ioregs[offset];
+ } else if (size == 4) {
+ value = *(uint32_t *) &sc->ioregs[offset];
+ } else {
+ printf("dior: ior unknown size %d\n", size);
+ }
+ }
+
+ if (baridx == 1) {
+ if (offset + size > DMEMSZ) {
+ printf("dior: memr too large, offset %ld size %d\n",
+ offset, size);
+ return (0);
+ }
+
+ if (size == 1) {
+ value = sc->memregs[offset];
+ } else if (size == 2) {
+ value = *(uint16_t *) &sc->memregs[offset];
+ } else if (size == 4) {
+ value = *(uint32_t *) &sc->memregs[offset];
+ } else if (size == 8) {
+ value = *(uint64_t *) &sc->memregs[offset];
+ } else {
+ printf("dior: ior unknown size %d\n", size);
+ }
+ }
+
+
+ if (baridx > 1) {
+ printf("dior: unknown bar idx %d\n", baridx);
+ return (0);
+ }
+
+ return (value);
+}
+
+struct pci_devemu pci_dummy = {
+ .pe_emu = "dummy",
+ .pe_init = pci_emul_dinit,
+ .pe_barwrite = pci_emul_diow,
+ .pe_barread = pci_emul_dior
+};
+PCI_EMUL_SET(pci_dummy);
+
+#endif /* PCI_EMUL_TEST */
diff --git a/usr/src/cmd/bhyve/pci_emul.h b/usr/src/cmd/bhyve/pci_emul.h
new file mode 100644
index 0000000000..6af01c4c3c
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_emul.h
@@ -0,0 +1,283 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/pci_emul.h 269700 2014-08-08 03:49:01Z neel $
+ */
+
+#ifndef _PCI_EMUL_H_
+#define _PCI_EMUL_H_
+
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/kernel.h>
+#include <sys/_pthreadtypes.h>
+
+#include <dev/pci/pcireg.h>
+
+#include <assert.h>
+
+#define PCI_BARMAX PCIR_MAX_BAR_0 /* BAR registers in a Type 0 header */
+
+struct vmctx;
+struct pci_devinst;
+struct memory_region;
+
+struct pci_devemu {
+ char *pe_emu; /* Name of device emulation */
+
+ /* instance creation */
+ int (*pe_init)(struct vmctx *, struct pci_devinst *,
+ char *opts);
+
+ /* ACPI DSDT enumeration */
+ void (*pe_write_dsdt)(struct pci_devinst *);
+
+ /* config space read/write callbacks */
+ int (*pe_cfgwrite)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int offset,
+ int bytes, uint32_t val);
+ int (*pe_cfgread)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int offset,
+ int bytes, uint32_t *retval);
+
+ /* BAR read/write callbacks */
+ void (*pe_barwrite)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size, uint64_t value);
+ uint64_t (*pe_barread)(struct vmctx *ctx, int vcpu,
+ struct pci_devinst *pi, int baridx,
+ uint64_t offset, int size);
+};
+#define PCI_EMUL_SET(x) DATA_SET(pci_devemu_set, x);
+
+enum pcibar_type {
+ PCIBAR_NONE,
+ PCIBAR_IO,
+ PCIBAR_MEM32,
+ PCIBAR_MEM64,
+ PCIBAR_MEMHI64
+};
+
+struct pcibar {
+ enum pcibar_type type; /* io or memory */
+ uint64_t size;
+ uint64_t addr;
+};
+
+#define PI_NAMESZ 40
+
+struct msix_table_entry {
+ uint64_t addr;
+ uint32_t msg_data;
+ uint32_t vector_control;
+} __packed;
+
+/*
+ * In case the structure is modified to hold extra information, use a define
+ * for the size that should be emulated.
+ */
+#define MSIX_TABLE_ENTRY_SIZE 16
+#define MAX_MSIX_TABLE_ENTRIES 2048
+#define PBA_SIZE(msgnum) (roundup2((msgnum), 64) / 8)
+
+enum lintr_stat {
+ IDLE,
+ ASSERTED,
+ PENDING
+};
+
+struct pci_devinst {
+ struct pci_devemu *pi_d;
+ struct vmctx *pi_vmctx;
+ uint8_t pi_bus, pi_slot, pi_func;
+ char pi_name[PI_NAMESZ];
+ int pi_bar_getsize;
+ int pi_prevcap;
+ int pi_capend;
+
+ struct {
+ int8_t pin;
+ enum lintr_stat state;
+ int pirq_pin;
+ int ioapic_irq;
+ pthread_mutex_t lock;
+ } pi_lintr;
+
+ struct {
+ int enabled;
+ uint64_t addr;
+ uint64_t msg_data;
+ int maxmsgnum;
+ } pi_msi;
+
+ struct {
+ int enabled;
+ int table_bar;
+ int pba_bar;
+ uint32_t table_offset;
+ int table_count;
+ uint32_t pba_offset;
+ int pba_size;
+ int function_mask;
+ struct msix_table_entry *table; /* allocated at runtime */
+ } pi_msix;
+
+ void *pi_arg; /* devemu-private data */
+
+ u_char pi_cfgdata[PCI_REGMAX + 1];
+ struct pcibar pi_bar[PCI_BARMAX + 1];
+};
+
+struct msicap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t msgctrl;
+ uint32_t addrlo;
+ uint32_t addrhi;
+ uint16_t msgdata;
+} __packed;
+
+struct msixcap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t msgctrl;
+ uint32_t table_info; /* bar index and offset within it */
+ uint32_t pba_info; /* bar index and offset within it */
+} __packed;
+
+struct pciecap {
+ uint8_t capid;
+ uint8_t nextptr;
+ uint16_t pcie_capabilities;
+
+ uint32_t dev_capabilities; /* all devices */
+ uint16_t dev_control;
+ uint16_t dev_status;
+
+ uint32_t link_capabilities; /* devices with links */
+ uint16_t link_control;
+ uint16_t link_status;
+
+ uint32_t slot_capabilities; /* ports with slots */
+ uint16_t slot_control;
+ uint16_t slot_status;
+
+ uint16_t root_control; /* root ports */
+ uint16_t root_capabilities;
+ uint32_t root_status;
+
+ uint32_t dev_capabilities2; /* all devices */
+ uint16_t dev_control2;
+ uint16_t dev_status2;
+
+ uint32_t link_capabilities2; /* devices with links */
+ uint16_t link_control2;
+ uint16_t link_status2;
+
+ uint32_t slot_capabilities2; /* ports with slots */
+ uint16_t slot_control2;
+ uint16_t slot_status2;
+} __packed;
+
+typedef void (*pci_lintr_cb)(int b, int s, int pin, int pirq_pin,
+ int ioapic_irq, void *arg);
+
+int init_pci(struct vmctx *ctx);
+void msicap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val);
+void msixcap_cfgwrite(struct pci_devinst *pi, int capoff, int offset,
+ int bytes, uint32_t val);
+void pci_callback(void);
+int pci_emul_alloc_bar(struct pci_devinst *pdi, int idx,
+ enum pcibar_type type, uint64_t size);
+int pci_emul_alloc_pbar(struct pci_devinst *pdi, int idx,
+ uint64_t hostbase, enum pcibar_type type, uint64_t size);
+int pci_emul_add_msicap(struct pci_devinst *pi, int msgnum);
+int pci_emul_add_pciecap(struct pci_devinst *pi, int pcie_device_type);
+void pci_generate_msi(struct pci_devinst *pi, int msgnum);
+void pci_generate_msix(struct pci_devinst *pi, int msgnum);
+void pci_lintr_assert(struct pci_devinst *pi);
+void pci_lintr_deassert(struct pci_devinst *pi);
+void pci_lintr_request(struct pci_devinst *pi);
+int pci_msi_enabled(struct pci_devinst *pi);
+int pci_msix_enabled(struct pci_devinst *pi);
+int pci_msix_table_bar(struct pci_devinst *pi);
+int pci_msix_pba_bar(struct pci_devinst *pi);
+int pci_msi_msgnum(struct pci_devinst *pi);
+int pci_parse_slot(char *opt);
+void pci_populate_msicap(struct msicap *cap, int msgs, int nextptr);
+int pci_emul_add_msixcap(struct pci_devinst *pi, int msgnum, int barnum);
+int pci_emul_msix_twrite(struct pci_devinst *pi, uint64_t offset, int size,
+ uint64_t value);
+uint64_t pci_emul_msix_tread(struct pci_devinst *pi, uint64_t offset, int size);
+int pci_count_lintr(int bus);
+void pci_walk_lintr(int bus, pci_lintr_cb cb, void *arg);
+void pci_write_dsdt(void);
+uint64_t pci_ecfg_base(void);
+int pci_bus_configured(int bus);
+
+static __inline void
+pci_set_cfgdata8(struct pci_devinst *pi, int offset, uint8_t val)
+{
+ assert(offset <= PCI_REGMAX);
+ *(uint8_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline void
+pci_set_cfgdata16(struct pci_devinst *pi, int offset, uint16_t val)
+{
+ assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+ *(uint16_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline void
+pci_set_cfgdata32(struct pci_devinst *pi, int offset, uint32_t val)
+{
+ assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+ *(uint32_t *)(pi->pi_cfgdata + offset) = val;
+}
+
+static __inline uint8_t
+pci_get_cfgdata8(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= PCI_REGMAX);
+ return (*(uint8_t *)(pi->pi_cfgdata + offset));
+}
+
+static __inline uint16_t
+pci_get_cfgdata16(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= (PCI_REGMAX - 1) && (offset & 1) == 0);
+ return (*(uint16_t *)(pi->pi_cfgdata + offset));
+}
+
+static __inline uint32_t
+pci_get_cfgdata32(struct pci_devinst *pi, int offset)
+{
+ assert(offset <= (PCI_REGMAX - 3) && (offset & 3) == 0);
+ return (*(uint32_t *)(pi->pi_cfgdata + offset));
+}
+
+#endif /* _PCI_EMUL_H_ */
diff --git a/usr/src/cmd/bhyve/pci_hostbridge.c b/usr/src/cmd/bhyve/pci_hostbridge.c
new file mode 100644
index 0000000000..08956d082e
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_hostbridge.c
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/pci_hostbridge.c 283264 2015-05-21 20:11:52Z tychon $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_hostbridge.c 283264 2015-05-21 20:11:52Z tychon $");
+
+#include "pci_emul.h"
+
+static int
+pci_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+
+ /* config space */
+ pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1275); /* NetApp */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x1275); /* NetApp */
+ pci_set_cfgdata8(pi, PCIR_HDRTYPE, PCIM_HDRTYPE_NORMAL);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
+ pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_HOST);
+
+ pci_emul_add_pciecap(pi, PCIEM_TYPE_ROOT_PORT);
+
+ return (0);
+}
+
+static int
+pci_amd_hostbridge_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ (void) pci_hostbridge_init(ctx, pi, opts);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, 0x1022); /* AMD */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, 0x7432); /* made up */
+
+ return (0);
+}
+
+struct pci_devemu pci_de_amd_hostbridge = {
+ .pe_emu = "amd_hostbridge",
+ .pe_init = pci_amd_hostbridge_init,
+};
+PCI_EMUL_SET(pci_de_amd_hostbridge);
+
+struct pci_devemu pci_de_hostbridge = {
+ .pe_emu = "hostbridge",
+ .pe_init = pci_hostbridge_init,
+};
+PCI_EMUL_SET(pci_de_hostbridge);
diff --git a/usr/src/cmd/bhyve/pci_irq.c b/usr/src/cmd/bhyve/pci_irq.c
new file mode 100644
index 0000000000..97ee330c65
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_irq.c
@@ -0,0 +1,351 @@
+/*-
+ * Copyright (c) 2014 Advanced Computing Technologies LLC
+ * Written by: John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_irq.c 266125 2014-05-15 14:16:55Z jhb $");
+
+#include <sys/param.h>
+#include <machine/vmm.h>
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <vmmapi.h>
+
+#include "acpi.h"
+#include "inout.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+
+/*
+ * Implement an 8 pin PCI interrupt router compatible with the router
+ * present on Intel's ICH10 chip.
+ */
+
+/* Fields in each PIRQ register. */
+#define PIRQ_DIS 0x80
+#define PIRQ_IRQ 0x0f
+
+/* Only IRQs 3-7, 9-12, and 14-15 are permitted. */
+#define PERMITTED_IRQS 0xdef8
+#define IRQ_PERMITTED(irq) (((1U << (irq)) & PERMITTED_IRQS) != 0)
+
+/* IRQ count to disable an IRQ. */
+#define IRQ_DISABLED 0xff
+
+static struct pirq {
+ uint8_t reg;
+ int use_count;
+ int active_count;
+ pthread_mutex_t lock;
+} pirqs[8];
+
+static u_char irq_counts[16];
+static int pirq_cold = 1;
+
+/*
+ * Returns true if this pin is enabled with a valid IRQ. Setting the
+ * register to a reserved IRQ causes interrupts to not be asserted as
+ * if the pin was disabled.
+ */
+static bool
+pirq_valid_irq(int reg)
+{
+
+ if (reg & PIRQ_DIS)
+ return (false);
+ return (IRQ_PERMITTED(reg & PIRQ_IRQ));
+}
+
+uint8_t
+pirq_read(int pin)
+{
+
+ assert(pin > 0 && pin <= nitems(pirqs));
+ return (pirqs[pin - 1].reg);
+}
+
+void
+pirq_write(struct vmctx *ctx, int pin, uint8_t val)
+{
+ struct pirq *pirq;
+
+ assert(pin > 0 && pin <= nitems(pirqs));
+ pirq = &pirqs[pin - 1];
+ pthread_mutex_lock(&pirq->lock);
+ if (pirq->reg != (val & (PIRQ_DIS | PIRQ_IRQ))) {
+ if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg))
+ vm_isa_deassert_irq(ctx, pirq->reg & PIRQ_IRQ, -1);
+ pirq->reg = val & (PIRQ_DIS | PIRQ_IRQ);
+ if (pirq->active_count != 0 && pirq_valid_irq(pirq->reg))
+ vm_isa_assert_irq(ctx, pirq->reg & PIRQ_IRQ, -1);
+ }
+ pthread_mutex_unlock(&pirq->lock);
+}
+
+void
+pci_irq_reserve(int irq)
+{
+
+ assert(irq < nitems(irq_counts));
+ assert(pirq_cold);
+ assert(irq_counts[irq] == 0 || irq_counts[irq] == IRQ_DISABLED);
+ irq_counts[irq] = IRQ_DISABLED;
+}
+
+void
+pci_irq_use(int irq)
+{
+
+ assert(irq < nitems(irq_counts));
+ assert(pirq_cold);
+ if (irq_counts[irq] != IRQ_DISABLED)
+ irq_counts[irq]++;
+}
+
+void
+pci_irq_init(struct vmctx *ctx)
+{
+ int i;
+
+ for (i = 0; i < nitems(pirqs); i++) {
+ pirqs[i].reg = PIRQ_DIS;
+ pirqs[i].use_count = 0;
+ pirqs[i].active_count = 0;
+ pthread_mutex_init(&pirqs[i].lock, NULL);
+ }
+ for (i = 0; i < nitems(irq_counts); i++) {
+ if (IRQ_PERMITTED(i))
+ irq_counts[i] = 0;
+ else
+ irq_counts[i] = IRQ_DISABLED;
+ }
+}
+
+void
+pci_irq_assert(struct pci_devinst *pi)
+{
+ struct pirq *pirq;
+
+ if (pi->pi_lintr.pirq_pin > 0) {
+ assert(pi->pi_lintr.pirq_pin <= nitems(pirqs));
+ pirq = &pirqs[pi->pi_lintr.pirq_pin - 1];
+ pthread_mutex_lock(&pirq->lock);
+ pirq->active_count++;
+ if (pirq->active_count == 1 && pirq_valid_irq(pirq->reg)) {
+ vm_isa_assert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ,
+ pi->pi_lintr.ioapic_irq);
+ pthread_mutex_unlock(&pirq->lock);
+ return;
+ }
+ pthread_mutex_unlock(&pirq->lock);
+ }
+ vm_ioapic_assert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
+}
+
+void
+pci_irq_deassert(struct pci_devinst *pi)
+{
+ struct pirq *pirq;
+
+ if (pi->pi_lintr.pirq_pin > 0) {
+ assert(pi->pi_lintr.pirq_pin <= nitems(pirqs));
+ pirq = &pirqs[pi->pi_lintr.pirq_pin - 1];
+ pthread_mutex_lock(&pirq->lock);
+ pirq->active_count--;
+ if (pirq->active_count == 0 && pirq_valid_irq(pirq->reg)) {
+ vm_isa_deassert_irq(pi->pi_vmctx, pirq->reg & PIRQ_IRQ,
+ pi->pi_lintr.ioapic_irq);
+ pthread_mutex_unlock(&pirq->lock);
+ return;
+ }
+ pthread_mutex_unlock(&pirq->lock);
+ }
+ vm_ioapic_deassert_irq(pi->pi_vmctx, pi->pi_lintr.ioapic_irq);
+}
+
+int
+pirq_alloc_pin(struct vmctx *ctx)
+{
+ int best_count, best_irq, best_pin, irq, pin;
+
+ pirq_cold = 1;
+
+ /* First, find the least-used PIRQ pin. */
+ best_pin = 0;
+ best_count = pirqs[0].use_count;
+ for (pin = 1; pin < nitems(pirqs); pin++) {
+ if (pirqs[pin].use_count < best_count) {
+ best_pin = pin;
+ best_count = pirqs[pin].use_count;
+ }
+ }
+ pirqs[best_pin].use_count++;
+
+ /* Second, route this pin to an IRQ. */
+ if (pirqs[best_pin].reg == PIRQ_DIS) {
+ best_irq = -1;
+ best_count = 0;
+ for (irq = 0; irq < nitems(irq_counts); irq++) {
+ if (irq_counts[irq] == IRQ_DISABLED)
+ continue;
+ if (best_irq == -1 || irq_counts[irq] < best_count) {
+ best_irq = irq;
+ best_count = irq_counts[irq];
+ }
+ }
+ assert(best_irq != 0);
+ irq_counts[best_irq]++;
+ pirqs[best_pin].reg = best_irq;
+ vm_isa_set_irq_trigger(ctx, best_irq, LEVEL_TRIGGER);
+ }
+
+ return (best_pin + 1);
+}
+
+int
+pirq_irq(int pin)
+{
+
+ if (pin == -1)
+ return (255);
+ assert(pin > 0 && pin <= nitems(pirqs));
+ return (pirqs[pin - 1].reg & PIRQ_IRQ);
+}
+
+/* XXX: Generate $PIR table. */
+
+#ifdef __FreeBSD__
+static void
+pirq_dsdt(void)
+{
+ char *irq_prs, *old;
+ int irq, pin;
+
+ irq_prs = NULL;
+ for (irq = 0; irq < nitems(irq_counts); irq++) {
+ if (!IRQ_PERMITTED(irq))
+ continue;
+ if (irq_prs == NULL)
+ asprintf(&irq_prs, "%d", irq);
+ else {
+ old = irq_prs;
+ asprintf(&irq_prs, "%s,%d", old, irq);
+ free(old);
+ }
+ }
+
+ /*
+ * A helper method to validate a link register's value. This
+ * duplicates pirq_valid_irq().
+ */
+ dsdt_line("");
+ dsdt_line("Method (PIRV, 1, NotSerialized)");
+ dsdt_line("{");
+ dsdt_line(" If (And (Arg0, 0x%02X))", PIRQ_DIS);
+ dsdt_line(" {");
+ dsdt_line(" Return (0x00)");
+ dsdt_line(" }");
+ dsdt_line(" And (Arg0, 0x%02X, Local0)", PIRQ_IRQ);
+ dsdt_line(" If (LLess (Local0, 0x03))");
+ dsdt_line(" {");
+ dsdt_line(" Return (0x00)");
+ dsdt_line(" }");
+ dsdt_line(" If (LEqual (Local0, 0x08))");
+ dsdt_line(" {");
+ dsdt_line(" Return (0x00)");
+ dsdt_line(" }");
+ dsdt_line(" If (LEqual (Local0, 0x0D))");
+ dsdt_line(" {");
+ dsdt_line(" Return (0x00)");
+ dsdt_line(" }");
+ dsdt_line(" Return (0x01)");
+ dsdt_line("}");
+
+ for (pin = 0; pin < nitems(pirqs); pin++) {
+ dsdt_line("");
+ dsdt_line("Device (LNK%c)", 'A' + pin);
+ dsdt_line("{");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0C0F\"))");
+ dsdt_line(" Name (_UID, 0x%02X)", pin + 1);
+ dsdt_line(" Method (_STA, 0, NotSerialized)");
+ dsdt_line(" {");
+ dsdt_line(" If (PIRV (PIR%c))", 'A' + pin);
+ dsdt_line(" {");
+ dsdt_line(" Return (0x0B)");
+ dsdt_line(" }");
+ dsdt_line(" Else");
+ dsdt_line(" {");
+ dsdt_line(" Return (0x09)");
+ dsdt_line(" }");
+ dsdt_line(" }");
+ dsdt_line(" Name (_PRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ dsdt_line(" IRQ (Level, ActiveLow, Shared, )");
+ dsdt_line(" {%s}", irq_prs);
+ dsdt_line(" })");
+ dsdt_line(" Name (CB%02X, ResourceTemplate ()", pin + 1);
+ dsdt_line(" {");
+ dsdt_line(" IRQ (Level, ActiveLow, Shared, )");
+ dsdt_line(" {}");
+ dsdt_line(" })");
+ dsdt_line(" CreateWordField (CB%02X, 0x01, CIR%c)",
+ pin + 1, 'A' + pin);
+ dsdt_line(" Method (_CRS, 0, NotSerialized)");
+ dsdt_line(" {");
+ dsdt_line(" And (PIR%c, 0x%02X, Local0)", 'A' + pin,
+ PIRQ_DIS | PIRQ_IRQ);
+ dsdt_line(" If (PIRV (Local0))");
+ dsdt_line(" {");
+ dsdt_line(" ShiftLeft (0x01, Local0, CIR%c)", 'A' + pin);
+ dsdt_line(" }");
+ dsdt_line(" Else");
+ dsdt_line(" {");
+ dsdt_line(" Store (0x00, CIR%c)", 'A' + pin);
+ dsdt_line(" }");
+ dsdt_line(" Return (CB%02X)", pin + 1);
+ dsdt_line(" }");
+ dsdt_line(" Method (_DIS, 0, NotSerialized)");
+ dsdt_line(" {");
+ dsdt_line(" Store (0x80, PIR%c)", 'A' + pin);
+ dsdt_line(" }");
+ dsdt_line(" Method (_SRS, 1, NotSerialized)");
+ dsdt_line(" {");
+ dsdt_line(" CreateWordField (Arg0, 0x01, SIR%c)", 'A' + pin);
+ dsdt_line(" FindSetRightBit (SIR%c, Local0)", 'A' + pin);
+ dsdt_line(" Store (Decrement (Local0), PIR%c)", 'A' + pin);
+ dsdt_line(" }");
+ dsdt_line("}");
+ }
+ free(irq_prs);
+}
+LPC_DSDT(pirq_dsdt);
+#endif
diff --git a/usr/src/cmd/bhyve/pci_irq.h b/usr/src/cmd/bhyve/pci_irq.h
new file mode 100644
index 0000000000..483f12b61e
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_irq.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2014 Advanced Computing Technologies LLC
+ * Written by: John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/pci_irq.h 266125 2014-05-15 14:16:55Z jhb $
+ */
+
+#ifndef __PCI_IRQ_H__
+#define __PCI_IRQ_H__
+
+struct pci_devinst;
+
+void pci_irq_assert(struct pci_devinst *pi);
+void pci_irq_deassert(struct pci_devinst *pi);
+void pci_irq_init(struct vmctx *ctx);
+void pci_irq_reserve(int irq);
+void pci_irq_use(int irq);
+int pirq_alloc_pin(struct vmctx *ctx);
+int pirq_irq(int pin);
+uint8_t pirq_read(int pin);
+void pirq_write(struct vmctx *ctx, int pin, uint8_t val);
+
+#endif
diff --git a/usr/src/cmd/bhyve/pci_lpc.c b/usr/src/cmd/bhyve/pci_lpc.c
new file mode 100644
index 0000000000..8c060150dc
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_lpc.c
@@ -0,0 +1,433 @@
+/*-
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z neel $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_lpc.c 266933 2014-05-31 23:37:34Z neel $");
+
+#include <sys/types.h>
+#include <machine/vmm.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <vmmapi.h>
+
+#include "acpi.h"
+#include "inout.h"
+#include "pci_emul.h"
+#include "pci_irq.h"
+#include "pci_lpc.h"
+#include "uart_emul.h"
+
+#define IO_ICU1 0x20
+#define IO_ICU2 0xA0
+
+SET_DECLARE(lpc_dsdt_set, struct lpc_dsdt);
+SET_DECLARE(lpc_sysres_set, struct lpc_sysres);
+
+#define ELCR_PORT 0x4d0
+SYSRES_IO(ELCR_PORT, 2);
+
+#define IO_TIMER1_PORT 0x40
+
+#define NMISC_PORT 0x61
+SYSRES_IO(NMISC_PORT, 1);
+
+static struct pci_devinst *lpc_bridge;
+
+#define LPC_UART_NUM 2
+static struct lpc_uart_softc {
+ struct uart_softc *uart_softc;
+ const char *opts;
+ int iobase;
+ int irq;
+ int enabled;
+} lpc_uart_softc[LPC_UART_NUM];
+
+static const char *lpc_uart_names[LPC_UART_NUM] = { "COM1", "COM2" };
+
+/*
+ * LPC device configuration is in the following form:
+ * <lpc_device_name>[,<options>]
+ * For e.g. "com1,stdio"
+ */
+int
+lpc_device_parse(const char *opts)
+{
+ int unit, error;
+ char *str, *cpy, *lpcdev;
+
+ error = -1;
+ str = cpy = strdup(opts);
+ lpcdev = strsep(&str, ",");
+ if (lpcdev != NULL) {
+ for (unit = 0; unit < LPC_UART_NUM; unit++) {
+ if (strcasecmp(lpcdev, lpc_uart_names[unit]) == 0) {
+ lpc_uart_softc[unit].opts = str;
+ error = 0;
+ goto done;
+ }
+ }
+ }
+
+done:
+ if (error)
+ free(cpy);
+
+ return (error);
+}
+
+static void
+lpc_uart_intr_assert(void *arg)
+{
+ struct lpc_uart_softc *sc = arg;
+
+ assert(sc->irq >= 0);
+
+ vm_isa_pulse_irq(lpc_bridge->pi_vmctx, sc->irq, sc->irq);
+}
+
+static void
+lpc_uart_intr_deassert(void *arg)
+{
+ /*
+ * The COM devices on the LPC bus generate edge triggered interrupts,
+ * so nothing more to do here.
+ */
+}
+
+static int
+lpc_uart_io_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int offset;
+ struct lpc_uart_softc *sc = arg;
+
+ offset = port - sc->iobase;
+
+ switch (bytes) {
+ case 1:
+ if (in)
+ *eax = uart_read(sc->uart_softc, offset);
+ else
+ uart_write(sc->uart_softc, offset, *eax);
+ break;
+ case 2:
+ if (in) {
+ *eax = uart_read(sc->uart_softc, offset);
+ *eax |= uart_read(sc->uart_softc, offset + 1) << 8;
+ } else {
+ uart_write(sc->uart_softc, offset, *eax);
+ uart_write(sc->uart_softc, offset + 1, *eax >> 8);
+ }
+ break;
+ default:
+ return (-1);
+ }
+
+ return (0);
+}
+
+static int
+lpc_init(void)
+{
+ struct lpc_uart_softc *sc;
+ struct inout_port iop;
+ const char *name;
+ int unit, error;
+
+ /* COM1 and COM2 */
+ for (unit = 0; unit < LPC_UART_NUM; unit++) {
+ sc = &lpc_uart_softc[unit];
+ name = lpc_uart_names[unit];
+
+ if (uart_legacy_alloc(unit, &sc->iobase, &sc->irq) != 0) {
+ fprintf(stderr, "Unable to allocate resources for "
+ "LPC device %s\n", name);
+ return (-1);
+ }
+ pci_irq_reserve(sc->irq);
+
+ sc->uart_softc = uart_init(lpc_uart_intr_assert,
+ lpc_uart_intr_deassert, sc);
+
+ if (uart_set_backend(sc->uart_softc, sc->opts) != 0) {
+ fprintf(stderr, "Unable to initialize backend '%s' "
+ "for LPC device %s\n", sc->opts, name);
+ return (-1);
+ }
+
+ bzero(&iop, sizeof(struct inout_port));
+ iop.name = name;
+ iop.port = sc->iobase;
+ iop.size = UART_IO_BAR_SIZE;
+ iop.flags = IOPORT_F_INOUT;
+ iop.handler = lpc_uart_io_handler;
+ iop.arg = sc;
+
+ error = register_inout(&iop);
+ assert(error == 0);
+ sc->enabled = 1;
+ }
+
+ return (0);
+}
+
+#ifdef __FreeBSD__
+static void
+pci_lpc_write_dsdt(struct pci_devinst *pi)
+{
+ struct lpc_dsdt **ldpp, *ldp;
+
+ dsdt_line("");
+ dsdt_line("Device (ISA)");
+ dsdt_line("{");
+ dsdt_line(" Name (_ADR, 0x%04X%04X)", pi->pi_slot, pi->pi_func);
+ dsdt_line(" OperationRegion (LPCR, PCI_Config, 0x00, 0x100)");
+ dsdt_line(" Field (LPCR, AnyAcc, NoLock, Preserve)");
+ dsdt_line(" {");
+ dsdt_line(" Offset (0x60),");
+ dsdt_line(" PIRA, 8,");
+ dsdt_line(" PIRB, 8,");
+ dsdt_line(" PIRC, 8,");
+ dsdt_line(" PIRD, 8,");
+ dsdt_line(" Offset (0x68),");
+ dsdt_line(" PIRE, 8,");
+ dsdt_line(" PIRF, 8,");
+ dsdt_line(" PIRG, 8,");
+ dsdt_line(" PIRH, 8");
+ dsdt_line(" }");
+ dsdt_line("");
+
+ dsdt_indent(1);
+ SET_FOREACH(ldpp, lpc_dsdt_set) {
+ ldp = *ldpp;
+ ldp->handler();
+ }
+
+ dsdt_line("");
+ dsdt_line("Device (PIC)");
+ dsdt_line("{");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0000\"))");
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ dsdt_indent(2);
+ dsdt_fixed_ioport(IO_ICU1, 2);
+ dsdt_fixed_ioport(IO_ICU2, 2);
+ dsdt_fixed_irq(2);
+ dsdt_unindent(2);
+ dsdt_line(" })");
+ dsdt_line("}");
+
+ dsdt_line("");
+ dsdt_line("Device (TIMR)");
+ dsdt_line("{");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0100\"))");
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ dsdt_indent(2);
+ dsdt_fixed_ioport(IO_TIMER1_PORT, 4);
+ dsdt_fixed_irq(0);
+ dsdt_unindent(2);
+ dsdt_line(" })");
+ dsdt_line("}");
+ dsdt_unindent(1);
+
+ dsdt_line("}");
+}
+
+static void
+pci_lpc_sysres_dsdt(void)
+{
+ struct lpc_sysres **lspp, *lsp;
+
+ dsdt_line("");
+ dsdt_line("Device (SIO)");
+ dsdt_line("{");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0C02\"))");
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+
+ dsdt_indent(2);
+ SET_FOREACH(lspp, lpc_sysres_set) {
+ lsp = *lspp;
+ switch (lsp->type) {
+ case LPC_SYSRES_IO:
+ dsdt_fixed_ioport(lsp->base, lsp->length);
+ break;
+ case LPC_SYSRES_MEM:
+ dsdt_fixed_mem32(lsp->base, lsp->length);
+ break;
+ }
+ }
+ dsdt_unindent(2);
+
+ dsdt_line(" })");
+ dsdt_line("}");
+}
+LPC_DSDT(pci_lpc_sysres_dsdt);
+
+static void
+pci_lpc_uart_dsdt(void)
+{
+ struct lpc_uart_softc *sc;
+ int unit;
+
+ for (unit = 0; unit < LPC_UART_NUM; unit++) {
+ sc = &lpc_uart_softc[unit];
+ if (!sc->enabled)
+ continue;
+ dsdt_line("");
+ dsdt_line("Device (%s)", lpc_uart_names[unit]);
+ dsdt_line("{");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0501\"))");
+ dsdt_line(" Name (_UID, %d)", unit + 1);
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ dsdt_indent(2);
+ dsdt_fixed_ioport(sc->iobase, UART_IO_BAR_SIZE);
+ dsdt_fixed_irq(sc->irq);
+ dsdt_unindent(2);
+ dsdt_line(" })");
+ dsdt_line("}");
+ }
+}
+LPC_DSDT(pci_lpc_uart_dsdt);
+#endif
+
+static int
+pci_lpc_cfgwrite(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int coff, int bytes, uint32_t val)
+{
+ int pirq_pin;
+
+ if (bytes == 1) {
+ pirq_pin = 0;
+ if (coff >= 0x60 && coff <= 0x63)
+ pirq_pin = coff - 0x60 + 1;
+ if (coff >= 0x68 && coff <= 0x6b)
+ pirq_pin = coff - 0x68 + 5;
+ if (pirq_pin != 0) {
+ pirq_write(ctx, pirq_pin, val);
+ pci_set_cfgdata8(pi, coff, pirq_read(pirq_pin));
+ return (0);
+ }
+ }
+ return (-1);
+}
+
+static void
+pci_lpc_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+}
+
+static uint64_t
+pci_lpc_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ return (0);
+}
+
+#define LPC_DEV 0x7000
+#define LPC_VENDOR 0x8086
+
+static int
+pci_lpc_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+
+ /*
+ * Do not allow more than one LPC bridge to be configured.
+ */
+ if (lpc_bridge != NULL) {
+ fprintf(stderr, "Only one LPC bridge is allowed.\n");
+ return (-1);
+ }
+
+ /*
+ * Enforce that the LPC can only be configured on bus 0. This
+ * simplifies the ACPI DSDT because it can provide a decode for
+ * all legacy i/o ports behind bus 0.
+ */
+ if (pi->pi_bus != 0) {
+ fprintf(stderr, "LPC bridge can be present only on bus 0.\n");
+ return (-1);
+ }
+
+ if (lpc_init() != 0)
+ return (-1);
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, LPC_DEV);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, LPC_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_BRIDGE);
+ pci_set_cfgdata8(pi, PCIR_SUBCLASS, PCIS_BRIDGE_ISA);
+
+ lpc_bridge = pi;
+
+ return (0);
+}
+
+char *
+lpc_pirq_name(int pin)
+{
+ char *name;
+
+ if (lpc_bridge == NULL)
+ return (NULL);
+ asprintf(&name, "\\_SB.PC00.ISA.LNK%c,", 'A' + pin - 1);
+ return (name);
+}
+
+void
+lpc_pirq_routed(void)
+{
+ int pin;
+
+ if (lpc_bridge == NULL)
+ return;
+
+ for (pin = 0; pin < 4; pin++)
+ pci_set_cfgdata8(lpc_bridge, 0x60 + pin, pirq_read(pin + 1));
+ for (pin = 0; pin < 4; pin++)
+ pci_set_cfgdata8(lpc_bridge, 0x68 + pin, pirq_read(pin + 5));
+}
+
+struct pci_devemu pci_de_lpc = {
+ .pe_emu = "lpc",
+ .pe_init = pci_lpc_init,
+#ifdef __FreeBSD__
+ .pe_write_dsdt = pci_lpc_write_dsdt,
+#endif
+ .pe_cfgwrite = pci_lpc_cfgwrite,
+ .pe_barwrite = pci_lpc_write,
+ .pe_barread = pci_lpc_read
+};
+PCI_EMUL_SET(pci_de_lpc);
diff --git a/usr/src/cmd/bhyve/pci_lpc.h b/usr/src/cmd/bhyve/pci_lpc.h
new file mode 100644
index 0000000000..4f725b1dd3
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_lpc.h
@@ -0,0 +1,72 @@
+/*-
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/pci_lpc.h 266125 2014-05-15 14:16:55Z jhb $
+ */
+
+#ifndef _LPC_H_
+#define _LPC_H_
+
+#include <sys/linker_set.h>
+
+typedef void (*lpc_write_dsdt_t)(void);
+
+struct lpc_dsdt {
+ lpc_write_dsdt_t handler;
+};
+
+#define LPC_DSDT(handler) \
+ static struct lpc_dsdt __CONCAT(__lpc_dsdt, __LINE__) = { \
+ (handler), \
+ }; \
+ DATA_SET(lpc_dsdt_set, __CONCAT(__lpc_dsdt, __LINE__))
+
+enum lpc_sysres_type {
+ LPC_SYSRES_IO,
+ LPC_SYSRES_MEM
+};
+
+struct lpc_sysres {
+ enum lpc_sysres_type type;
+ uint32_t base;
+ uint32_t length;
+};
+
+#define LPC_SYSRES(type, base, length) \
+ static struct lpc_sysres __CONCAT(__lpc_sysres, __LINE__) = { \
+ (type), \
+ (base), \
+ (length) \
+ }; \
+ DATA_SET(lpc_sysres_set, __CONCAT(__lpc_sysres, __LINE__))
+
+#define SYSRES_IO(base, length) LPC_SYSRES(LPC_SYSRES_IO, base, length)
+#define SYSRES_MEM(base, length) LPC_SYSRES(LPC_SYSRES_MEM, base, length)
+
+int lpc_device_parse(const char *opt);
+char *lpc_pirq_name(int pin);
+void lpc_pirq_routed(void);
+
+#endif
diff --git a/usr/src/cmd/bhyve/pci_virtio_block.c b/usr/src/cmd/bhyve/pci_virtio_block.c
new file mode 100644
index 0000000000..65e2d9c57d
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_virtio_block.c
@@ -0,0 +1,392 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02:47:09Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_block.c 266935 2014-06-01 02:47:09Z neel $");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/stat.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <sys/disk.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <md5.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+
+#define VTBLK_RINGSZ 64
+
+#ifdef __FreeBSD__
+#define VTBLK_MAXSEGS 32
+#else
+#define VTBLK_MAXSEGS 16
+#endif
+
+#define VTBLK_S_OK 0
+#define VTBLK_S_IOERR 1
+#define VTBLK_S_UNSUPP 2
+
+#define VTBLK_BLK_ID_BYTES 20
+
+/* Capability bits */
+#define VTBLK_F_SEG_MAX (1 << 2) /* Maximum request segments */
+#define VTBLK_F_BLK_SIZE (1 << 6) /* cfg block size valid */
+
+/*
+ * Host capabilities
+ */
+#define VTBLK_S_HOSTCAPS \
+ ( VTBLK_F_SEG_MAX | \
+ VTBLK_F_BLK_SIZE | \
+ VIRTIO_RING_F_INDIRECT_DESC ) /* indirect descriptors */
+
+/*
+ * Config space "registers"
+ */
+struct vtblk_config {
+ uint64_t vbc_capacity;
+ uint32_t vbc_size_max;
+ uint32_t vbc_seg_max;
+ uint16_t vbc_geom_c;
+ uint8_t vbc_geom_h;
+ uint8_t vbc_geom_s;
+ uint32_t vbc_blk_size;
+ uint32_t vbc_sectors_max;
+} __packed;
+
+/*
+ * Fixed-size block header
+ */
+struct virtio_blk_hdr {
+#define VBH_OP_READ 0
+#define VBH_OP_WRITE 1
+#define VBH_OP_IDENT 8
+#define VBH_FLAG_BARRIER 0x80000000 /* OR'ed into vbh_type */
+ uint32_t vbh_type;
+ uint32_t vbh_ioprio;
+ uint64_t vbh_sector;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtblk_debug;
+#define DPRINTF(params) if (pci_vtblk_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtblk_softc {
+ struct virtio_softc vbsc_vs;
+ pthread_mutex_t vsc_mtx;
+ struct vqueue_info vbsc_vq;
+ int vbsc_fd;
+ struct vtblk_config vbsc_cfg;
+ char vbsc_ident[VTBLK_BLK_ID_BYTES];
+};
+
+static void pci_vtblk_reset(void *);
+static void pci_vtblk_notify(void *, struct vqueue_info *);
+static int pci_vtblk_cfgread(void *, int, int, uint32_t *);
+static int pci_vtblk_cfgwrite(void *, int, int, uint32_t);
+
+static struct virtio_consts vtblk_vi_consts = {
+ "vtblk", /* our name */
+ 1, /* we support 1 virtqueue */
+ sizeof(struct vtblk_config), /* config reg size */
+ pci_vtblk_reset, /* reset */
+ pci_vtblk_notify, /* device-wide qnotify */
+ pci_vtblk_cfgread, /* read PCI config */
+ pci_vtblk_cfgwrite, /* write PCI config */
+ VTBLK_S_HOSTCAPS, /* our capabilities */
+};
+
+static void
+pci_vtblk_reset(void *vsc)
+{
+ struct pci_vtblk_softc *sc = vsc;
+
+ DPRINTF(("vtblk: device reset requested !\n"));
+ vi_reset_dev(&sc->vbsc_vs);
+}
+
+static void
+pci_vtblk_proc(struct pci_vtblk_softc *sc, struct vqueue_info *vq)
+{
+ struct virtio_blk_hdr *vbh;
+ uint8_t *status;
+ int i, n;
+ int err;
+ int iolen;
+ int writeop, type;
+ off_t offset;
+ struct iovec iov[VTBLK_MAXSEGS + 2];
+ uint16_t flags[VTBLK_MAXSEGS + 2];
+
+ n = vq_getchain(vq, iov, VTBLK_MAXSEGS + 2, flags);
+
+ /*
+ * The first descriptor will be the read-only fixed header,
+ * and the last is for status (hence +2 above and below).
+ * The remaining iov's are the actual data I/O vectors.
+ *
+ * XXX - note - this fails on crash dump, which does a
+ * VIRTIO_BLK_T_FLUSH with a zero transfer length
+ */
+ assert(n >= 2 && n <= VTBLK_MAXSEGS + 2);
+
+ assert((flags[0] & VRING_DESC_F_WRITE) == 0);
+ assert(iov[0].iov_len == sizeof(struct virtio_blk_hdr));
+ vbh = (struct virtio_block_hdr *)iov[0].iov_base;
+
+ status = iov[--n].iov_base;
+ assert(iov[n].iov_len == 1);
+ assert(flags[n] & VRING_DESC_F_WRITE);
+
+ /*
+ * XXX
+ * The guest should not be setting the BARRIER flag because
+ * we don't advertise the capability.
+ */
+ type = vbh->vbh_type & ~VBH_FLAG_BARRIER;
+ writeop = (type == VBH_OP_WRITE);
+
+ offset = vbh->vbh_sector * DEV_BSIZE;
+
+ iolen = 0;
+ for (i = 1; i < n; i++) {
+ /*
+ * - write op implies read-only descriptor,
+ * - read/ident op implies write-only descriptor,
+ * therefore test the inverse of the descriptor bit
+ * to the op.
+ */
+ assert(((flags[i] & VRING_DESC_F_WRITE) == 0) == writeop);
+ iolen += iov[i].iov_len;
+ }
+
+ DPRINTF(("virtio-block: %s op, %d bytes, %d segs, offset %ld\n\r",
+ writeop ? "write" : "read/ident", iolen, i - 1, offset));
+
+ switch (type) {
+ case VBH_OP_WRITE:
+ err = pwritev(sc->vbsc_fd, iov + 1, i - 1, offset);
+ break;
+ case VBH_OP_READ:
+ err = preadv(sc->vbsc_fd, iov + 1, i - 1, offset);
+ break;
+ case VBH_OP_IDENT:
+ /* Assume a single buffer */
+ strlcpy(iov[1].iov_base, sc->vbsc_ident,
+ MIN(iov[1].iov_len, sizeof(sc->vbsc_ident)));
+ err = 0;
+ break;
+ default:
+ err = -ENOSYS;
+ break;
+ }
+
+ /* convert errno into a virtio block error return */
+ if (err < 0) {
+ if (err == -ENOSYS)
+ *status = VTBLK_S_UNSUPP;
+ else
+ *status = VTBLK_S_IOERR;
+ } else
+ *status = VTBLK_S_OK;
+
+ /*
+ * Return the descriptor back to the host.
+ * We wrote 1 byte (our status) to host.
+ */
+ vq_relchain(vq, 1);
+}
+
+static void
+pci_vtblk_notify(void *vsc, struct vqueue_info *vq)
+{
+ struct pci_vtblk_softc *sc = vsc;
+
+ vq_startchains(vq);
+ while (vq_has_descs(vq))
+ pci_vtblk_proc(sc, vq);
+ vq_endchains(vq, 1); /* Generate interrupt if appropriate. */
+}
+
+static int
+pci_vtblk_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ struct stat sbuf;
+ MD5_CTX mdctx;
+ u_char digest[16];
+ struct pci_vtblk_softc *sc;
+ off_t size;
+ int fd;
+ int sectsz;
+
+ if (opts == NULL) {
+ printf("virtio-block: backing device required\n");
+ return (1);
+ }
+
+ /*
+ * The supplied backing file has to exist
+ */
+ fd = open(opts, O_RDWR);
+ if (fd < 0) {
+ perror("Could not open backing file");
+ return (1);
+ }
+
+ if (fstat(fd, &sbuf) < 0) {
+ perror("Could not stat backing file");
+ close(fd);
+ return (1);
+ }
+
+ /*
+ * Deal with raw devices
+ */
+ size = sbuf.st_size;
+ sectsz = DEV_BSIZE;
+#ifdef __FreeBSD__
+ if (S_ISCHR(sbuf.st_mode)) {
+ if (ioctl(fd, DIOCGMEDIASIZE, &size) < 0 ||
+ ioctl(fd, DIOCGSECTORSIZE, &sectsz)) {
+ perror("Could not fetch dev blk/sector size");
+ close(fd);
+ return (1);
+ }
+ assert(size != 0);
+ assert(sectsz != 0);
+ }
+#endif
+
+ sc = calloc(1, sizeof(struct pci_vtblk_softc));
+
+ /* record fd of storage device/file */
+ sc->vbsc_fd = fd;
+
+ pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+ /* init virtio softc and virtqueues */
+ vi_softc_linkup(&sc->vbsc_vs, &vtblk_vi_consts, sc, pi, &sc->vbsc_vq);
+ sc->vbsc_vs.vs_mtx = &sc->vsc_mtx;
+
+ sc->vbsc_vq.vq_qsize = VTBLK_RINGSZ;
+ /* sc->vbsc_vq.vq_notify = we have no per-queue notify */
+
+ /*
+ * Create an identifier for the backing file. Use parts of the
+ * md5 sum of the filename
+ */
+ MD5Init(&mdctx);
+ MD5Update(&mdctx, opts, strlen(opts));
+ MD5Final(digest, &mdctx);
+ sprintf(sc->vbsc_ident, "BHYVE-%02X%02X-%02X%02X-%02X%02X",
+ digest[0], digest[1], digest[2], digest[3], digest[4], digest[5]);
+
+ /* setup virtio block config space */
+ sc->vbsc_cfg.vbc_capacity = size / DEV_BSIZE; /* 512-byte units */
+ sc->vbsc_cfg.vbc_seg_max = VTBLK_MAXSEGS;
+ sc->vbsc_cfg.vbc_blk_size = sectsz;
+ sc->vbsc_cfg.vbc_size_max = 0; /* not negotiated */
+ sc->vbsc_cfg.vbc_geom_c = 0; /* no geometry */
+ sc->vbsc_cfg.vbc_geom_h = 0;
+ sc->vbsc_cfg.vbc_geom_s = 0;
+ sc->vbsc_cfg.vbc_sectors_max = 0;
+
+ /*
+ * Should we move some of this into virtio.c? Could
+ * have the device, class, and subdev_0 as fields in
+ * the virtio constants structure.
+ */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_BLOCK);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_STORAGE);
+ pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_BLOCK);
+
+ if (vi_intr_init(&sc->vbsc_vs, 1, fbsdrun_virtio_msix()))
+ return (1);
+ vi_set_io_bar(&sc->vbsc_vs, 0);
+ return (0);
+}
+
+static int
+pci_vtblk_cfgwrite(void *vsc, int offset, int size, uint32_t value)
+{
+
+ DPRINTF(("vtblk: write to readonly reg %d\n\r", offset));
+ return (1);
+}
+
+static int
+pci_vtblk_cfgread(void *vsc, int offset, int size, uint32_t *retval)
+{
+ struct pci_vtblk_softc *sc = vsc;
+ void *ptr;
+
+ /* our caller has already verified offset and size */
+ ptr = (uint8_t *)&sc->vbsc_cfg + offset;
+ memcpy(retval, ptr, size);
+ return (0);
+}
+
+struct pci_devemu pci_de_vblk = {
+ .pe_emu = "virtio-blk",
+ .pe_init = pci_vtblk_init,
+ .pe_barwrite = vi_pci_write,
+ .pe_barread = vi_pci_read
+};
+PCI_EMUL_SET(pci_de_vblk);
diff --git a/usr/src/cmd/bhyve/pci_virtio_net.c b/usr/src/cmd/bhyve/pci_virtio_net.c
new file mode 100644
index 0000000000..e58bdd0115
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_virtio_net.c
@@ -0,0 +1,870 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37:33Z grehan $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pci_virtio_net.c 253440 2013-07-17 23:37:33Z grehan $");
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/select.h>
+#include <sys/uio.h>
+#include <sys/ioctl.h>
+#include <net/ethernet.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <md5.h>
+#include <pthread.h>
+#include <pthread_np.h>
+#ifndef __FreeBSD__
+#include <poll.h>
+#include <libdlpi.h>
+#endif
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#ifdef __FreeBSD__
+#include "mevent.h"
+#endif
+#include "virtio.h"
+
+#define VTNET_RINGSZ 1024
+
+#define VTNET_MAXSEGS 32
+
+/*
+ * Host capabilities. Note that we only offer a few of these.
+ */
+#define VIRTIO_NET_F_CSUM (1 << 0) /* host handles partial cksum */
+#define VIRTIO_NET_F_GUEST_CSUM (1 << 1) /* guest handles partial cksum */
+#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
+#define VIRTIO_NET_F_GSO_DEPREC (1 << 6) /* deprecated: host handles GSO */
+#define VIRTIO_NET_F_GUEST_TSO4 (1 << 7) /* guest can rcv TSOv4 */
+#define VIRTIO_NET_F_GUEST_TSO6 (1 << 8) /* guest can rcv TSOv6 */
+#define VIRTIO_NET_F_GUEST_ECN (1 << 9) /* guest can rcv TSO with ECN */
+#define VIRTIO_NET_F_GUEST_UFO (1 << 10) /* guest can rcv UFO */
+#define VIRTIO_NET_F_HOST_TSO4 (1 << 11) /* host can rcv TSOv4 */
+#define VIRTIO_NET_F_HOST_TSO6 (1 << 12) /* host can rcv TSOv6 */
+#define VIRTIO_NET_F_HOST_ECN (1 << 13) /* host can rcv TSO with ECN */
+#define VIRTIO_NET_F_HOST_UFO (1 << 14) /* host can rcv UFO */
+#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
+#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
+#define VIRTIO_NET_F_CTRL_VQ (1 << 17) /* control channel available */
+#define VIRTIO_NET_F_CTRL_RX (1 << 18) /* control channel RX mode support */
+#define VIRTIO_NET_F_CTRL_VLAN (1 << 19) /* control channel VLAN filtering */
+#define VIRTIO_NET_F_GUEST_ANNOUNCE \
+ (1 << 21) /* guest can send gratuitous pkts */
+
+#define VTNET_S_HOSTCAPS \
+ ( VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | VIRTIO_NET_F_STATUS | \
+ VIRTIO_F_NOTIFY_ON_EMPTY)
+
+/*
+ * PCI config-space "registers"
+ */
+struct virtio_net_config {
+ uint8_t mac[6];
+ uint16_t status;
+} __packed;
+
+/*
+ * Queue definitions.
+ */
+#define VTNET_RXQ 0
+#define VTNET_TXQ 1
+#define VTNET_CTLQ 2 /* NB: not yet supported */
+
+#define VTNET_MAXQ 3
+
+/*
+ * Fixed network header size
+ */
+struct virtio_net_rxhdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+ uint16_t vrh_bufs;
+} __packed;
+
+/*
+ * Debug printf
+ */
+static int pci_vtnet_debug;
+#define DPRINTF(params) if (pci_vtnet_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_vtnet_softc {
+ struct virtio_softc vsc_vs;
+ struct vqueue_info vsc_queues[VTNET_MAXQ - 1];
+ pthread_mutex_t vsc_mtx;
+ struct mevent *vsc_mevp;
+
+#ifdef __FreeBSD
+ int vsc_tapfd;
+#else
+ dlpi_handle_t vsc_dhp;
+ int vsc_dlpifd;
+#endif
+ int vsc_rx_ready;
+ volatile int resetting; /* set and checked outside lock */
+
+ uint32_t vsc_features;
+ struct virtio_net_config vsc_config;
+
+ pthread_mutex_t rx_mtx;
+ int rx_in_progress;
+
+ pthread_t tx_tid;
+ pthread_mutex_t tx_mtx;
+ pthread_cond_t tx_cond;
+ int tx_in_progress;
+};
+
+static void pci_vtnet_reset(void *);
+/* static void pci_vtnet_notify(void *, struct vqueue_info *); */
+static int pci_vtnet_cfgread(void *, int, int, uint32_t *);
+static int pci_vtnet_cfgwrite(void *, int, int, uint32_t);
+
+static struct virtio_consts vtnet_vi_consts = {
+ "vtnet", /* our name */
+ VTNET_MAXQ - 1, /* we currently support 2 virtqueues */
+ sizeof(struct virtio_net_config), /* config reg size */
+ pci_vtnet_reset, /* reset */
+ NULL, /* device-wide qnotify -- not used */
+ pci_vtnet_cfgread, /* read PCI config */
+ pci_vtnet_cfgwrite, /* write PCI config */
+ VTNET_S_HOSTCAPS, /* our capabilities */
+};
+
+/*
+ * If the transmit thread is active then stall until it is done.
+ */
+static void
+pci_vtnet_txwait(struct pci_vtnet_softc *sc)
+{
+
+ pthread_mutex_lock(&sc->tx_mtx);
+ while (sc->tx_in_progress) {
+ pthread_mutex_unlock(&sc->tx_mtx);
+ usleep(10000);
+ pthread_mutex_lock(&sc->tx_mtx);
+ }
+ pthread_mutex_unlock(&sc->tx_mtx);
+}
+
+/*
+ * If the receive thread is active then stall until it is done.
+ */
+static void
+pci_vtnet_rxwait(struct pci_vtnet_softc *sc)
+{
+
+ pthread_mutex_lock(&sc->rx_mtx);
+ while (sc->rx_in_progress) {
+ pthread_mutex_unlock(&sc->rx_mtx);
+ usleep(10000);
+ pthread_mutex_lock(&sc->rx_mtx);
+ }
+ pthread_mutex_unlock(&sc->rx_mtx);
+}
+
+static void
+pci_vtnet_reset(void *vsc)
+{
+ struct pci_vtnet_softc *sc = vsc;
+
+ DPRINTF(("vtnet: device reset requested !\n"));
+
+ sc->resetting = 1;
+
+ /*
+ * Wait for the transmit and receive threads to finish their
+ * processing.
+ */
+ pci_vtnet_txwait(sc);
+ pci_vtnet_rxwait(sc);
+
+ sc->vsc_rx_ready = 0;
+
+ /* now reset rings, MSI-X vectors, and negotiated capabilities */
+ vi_reset_dev(&sc->vsc_vs);
+
+ sc->resetting = 0;
+}
+
+/*
+ * Called to send a buffer chain out to the tap device
+ */
+#ifdef __FreeBSD__
+static void
+pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
+ int len)
+{
+ static char pad[60]; /* all zero bytes */
+
+ if (sc->vsc_tapfd == -1)
+ return;
+
+ /*
+ * If the length is < 60, pad out to that and add the
+ * extra zero'd segment to the iov. It is guaranteed that
+ * there is always an extra iov available by the caller.
+ */
+ if (len < 60) {
+ iov[iovcnt].iov_base = pad;
+ iov[iovcnt].iov_len = 60 - len;
+ iovcnt++;
+ }
+ (void) writev(sc->vsc_tapfd, iov, iovcnt);
+}
+#else
+static void
+pci_vtnet_tap_tx(struct pci_vtnet_softc *sc, struct iovec *iov, int iovcnt,
+ int len)
+{
+ int i;
+
+ for (i = 0; i < iovcnt; i++) {
+ (void) dlpi_send(sc->vsc_dhp, NULL, NULL,
+ iov[i].iov_base, iov[i].iov_len, NULL);
+ }
+}
+#endif
+
+#ifdef __FreeBSD__
+/*
+ * Called when there is read activity on the tap file descriptor.
+ * Each buffer posted by the guest is assumed to be able to contain
+ * an entire ethernet frame + rx header.
+ * MP note: the dummybuf is only used for discarding frames, so there
+ * is no need for it to be per-vtnet or locked.
+ */
+static uint8_t dummybuf[2048];
+#endif
+
+static void
+pci_vtnet_tap_rx(struct pci_vtnet_softc *sc)
+{
+ struct vqueue_info *vq;
+ struct virtio_net_rxhdr *vrx;
+ uint8_t *buf;
+#ifdef __FreeBSD__
+ int len;
+#endif
+ struct iovec iov[VTNET_MAXSEGS];
+#ifndef __FreeBSD__
+ size_t len;
+ int ret;
+#endif
+ int total_len = 0;
+
+ /*
+ * Should never be called without a valid tap fd
+ */
+#ifdef __FreeBSD__
+ assert(sc->vsc_tapfd != -1);
+#else
+ assert(sc->vsc_dlpifd != -1);
+#endif
+
+ /*
+ * But, will be called when the rx ring hasn't yet
+ * been set up or the guest is resetting the device.
+ */
+ if (!sc->vsc_rx_ready || sc->resetting) {
+#ifdef __FreeBSD__
+ /*
+ * Drop the packet and try later.
+ */
+ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+#endif
+ return;
+ }
+
+ /*
+ * Check for available rx buffers
+ */
+ vq = &sc->vsc_queues[VTNET_RXQ];
+ vq_startchains(vq);
+ if (!vq_has_descs(vq)) {
+ /*
+ * Drop the packet and try later. Interrupt on
+ * empty, if that's negotiated.
+ */
+#ifdef __FreeBSD__
+ (void) read(sc->vsc_tapfd, dummybuf, sizeof(dummybuf));
+#endif
+ vq_endchains(vq, 1);
+ return;
+ }
+
+ do {
+ /*
+ * Get descriptor chain
+ */
+ if (sc->vsc_vs.vs_negotiated_caps & VIRTIO_NET_F_MRG_RXBUF) {
+ assert(vq_getchain(vq, iov, 1, NULL) == 1);
+
+ /*
+ * Get a pointer to the rx header, and use the
+ * data immediately following it for the packet buffer.
+ */
+ vrx = (struct virtio_net_rxhdr *)iov[0].iov_base;
+ buf = (uint8_t *)(vrx + 1);
+ total_len = iov[0].iov_len;
+#ifdef __FreeBSD__
+ len = read(sc->vsc_tapfd, buf,
+ iov[0].iov_len - sizeof(struct virtio_net_rxhdr));
+
+ if (len < 0 && errno == EWOULDBLOCK) {
+ /*
+ * No more packets, but still some avail ring
+ * entries. Interrupt if needed/appropriate.
+ */
+ vq_endchains(vq, 0);
+ return;
+ }
+#else
+ len = iov[0].iov_len - sizeof(struct virtio_net_rxhdr);
+ ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, buf,
+ &len, 0, NULL);
+ if (ret != DLPI_SUCCESS) {
+ /*
+ * No more packets, but still some avail ring
+ * entries. Interrupt if needed/appropriate.
+ */
+ vq_endchains(vq, 0);
+ return;
+ }
+#endif
+ } else {
+ int i;
+ int num_segs;
+ num_segs = vq_getchain(vq, iov,
+ VTNET_MAXSEGS, NULL);
+ vrx = (struct virtio_net_rxhrd *)iov[0].iov_base;
+ total_len = iov[0].iov_len;
+ for (i = 1; i < num_segs; i++) {
+ buf = (uint8_t *)iov[i].iov_base;
+ total_len += iov[i].iov_len;
+#ifdef __FreeBSD__
+ len = read(sc->vsc_tapfd, buf, iov[i].iov_len);
+ if (len < 0 && errno == EWOULDBLOCK) {
+ /*
+ * No more packets,
+ * but still some avail ring entries.
+ * Interrupt if needed/appropriate.
+ */
+ break;
+ }
+#else
+ len = iov[i].iov_len;
+ ret = dlpi_recv(sc->vsc_dhp, NULL, NULL, buf,
+ &len, 0, NULL);
+ if (ret != DLPI_SUCCESS) {
+ /*
+ * No more packets,
+ * but still some avail ring entries.
+ * Interrupt if needed/appropriate.
+ */
+ total_len = 0;
+ break;
+ }
+#endif
+ }
+ if (total_len == 0) {
+ vq_endchains(vq, 0);
+ return;
+ }
+ }
+
+ /*
+ * The only valid field in the rx packet header is the
+ * number of buffers, which is always 1 without TSO
+ * support.
+ */
+ memset(vrx, 0, sizeof(struct virtio_net_rxhdr));
+ vrx->vrh_bufs = 1;
+
+ /*
+ * Release this chain and handle more chains.
+ */
+ vq_relchain(vq, total_len);
+ } while (vq_has_descs(vq));
+
+ /* Interrupt if needed, including for NOTIFY_ON_EMPTY. */
+ vq_endchains(vq, 1);
+}
+
+#ifdef __FreeBSD__
+static void
+pci_vtnet_tap_callback(int fd, enum ev_type type, void *param)
+{
+ struct pci_vtnet_softc *sc = param;
+
+ pthread_mutex_lock(&sc->rx_mtx);
+ sc->rx_in_progress = 1;
+ pci_vtnet_tap_rx(sc);
+ sc->rx_in_progress = 0;
+ pthread_mutex_unlock(&sc->rx_mtx);
+
+}
+#else
+static void *
+pci_vtnet_poll_thread(void *param)
+{
+ struct pci_vtnet_softc *sc = param;
+ pollfd_t pollset;
+
+ pollset.fd = sc->vsc_dlpifd;
+ pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;
+
+ for (;;) {
+ if (poll(&pollset, 1, -1) < 0) {
+ if (errno == EINTR)
+ continue;
+ fprintf(stderr, "pci_vtnet_poll_thread poll() error %d\n", errno);
+ continue;
+ }
+ pthread_mutex_lock(&sc->vsc_mtx);
+ pci_vtnet_tap_rx(sc);
+ pthread_mutex_unlock(&sc->vsc_mtx);
+ }
+}
+#endif
+
+static void
+pci_vtnet_ping_rxq(void *vsc, struct vqueue_info *vq)
+{
+ struct pci_vtnet_softc *sc = vsc;
+
+ /*
+ * A qnotify means that the rx process can now begin
+ */
+ if (sc->vsc_rx_ready == 0) {
+ sc->vsc_rx_ready = 1;
+ }
+}
+
+static void
+pci_vtnet_proctx(struct pci_vtnet_softc *sc, struct vqueue_info *vq)
+{
+ struct iovec iov[VTNET_MAXSEGS + 1];
+ int i, n;
+ int plen, tlen;
+
+ /*
+ * Obtain chain of descriptors. The first one is
+ * really the header descriptor, so we need to sum
+ * up two lengths: packet length and transfer length.
+ */
+ n = vq_getchain(vq, iov, VTNET_MAXSEGS, NULL);
+ assert(n >= 1 && n <= VTNET_MAXSEGS);
+ plen = 0;
+ tlen = iov[0].iov_len;
+ for (i = 1; i < n; i++) {
+ plen += iov[i].iov_len;
+ tlen += iov[i].iov_len;
+ }
+
+ DPRINTF(("virtio: packet send, %d bytes, %d segs\n\r", plen, n));
+ pci_vtnet_tap_tx(sc, &iov[1], n - 1, plen);
+
+ /* chain is processed, release it and set tlen */
+ vq_relchain(vq, tlen);
+}
+
+static void
+pci_vtnet_ping_txq(void *vsc, struct vqueue_info *vq)
+{
+ struct pci_vtnet_softc *sc = vsc;
+
+ /*
+ * Any ring entries to process?
+ */
+ if (!vq_has_descs(vq))
+ return;
+
+ /* Signal the tx thread for processing */
+ pthread_mutex_lock(&sc->tx_mtx);
+ if (sc->tx_in_progress == 0)
+ pthread_cond_signal(&sc->tx_cond);
+ pthread_mutex_unlock(&sc->tx_mtx);
+}
+
+/*
+ * Thread which will handle processing of TX desc
+ */
+static void *
+pci_vtnet_tx_thread(void *param)
+{
+ struct pci_vtnet_softc *sc = param;
+ struct vqueue_info *vq;
+ int have_work, error;
+
+ vq = &sc->vsc_queues[VTNET_TXQ];
+
+ /*
+ * Let us wait till the tx queue pointers get initialised &
+ * first tx signaled
+ */
+ pthread_mutex_lock(&sc->tx_mtx);
+ error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
+ assert(error == 0);
+
+ for (;;) {
+ /* note - tx mutex is locked here */
+ do {
+ if (sc->resetting)
+ have_work = 0;
+ else
+ have_work = vq_has_descs(vq);
+
+ if (!have_work) {
+ sc->tx_in_progress = 0;
+ error = pthread_cond_wait(&sc->tx_cond,
+ &sc->tx_mtx);
+ assert(error == 0);
+ }
+ } while (!have_work);
+ sc->tx_in_progress = 1;
+ pthread_mutex_unlock(&sc->tx_mtx);
+
+ vq_startchains(vq);
+ do {
+ /*
+ * Run through entries, placing them into
+ * iovecs and sending when an end-of-packet
+ * is found
+ */
+ pci_vtnet_proctx(sc, vq);
+ } while (vq_has_descs(vq));
+
+ /*
+ * Generate an interrupt if needed.
+ */
+ vq_endchains(vq, 1);
+
+ pthread_mutex_lock(&sc->tx_mtx);
+ }
+}
+
+#ifdef notyet
+static void
+pci_vtnet_ping_ctlq(void *vsc, struct vqueue_info *vq)
+{
+
+ DPRINTF(("vtnet: control qnotify!\n\r"));
+}
+#endif
+
+#ifdef __FreeBSD__
+static int
+pci_vtnet_parsemac(char *mac_str, uint8_t *mac_addr)
+{
+ struct ether_addr *ea;
+ char *tmpstr;
+ char zero_addr[ETHER_ADDR_LEN] = { 0, 0, 0, 0, 0, 0 };
+
+ tmpstr = strsep(&mac_str,"=");
+
+ if ((mac_str != NULL) && (!strcmp(tmpstr,"mac"))) {
+ ea = ether_aton(mac_str);
+
+ if (ea == NULL || ETHER_IS_MULTICAST(ea->octet) ||
+ memcmp(ea->octet, zero_addr, ETHER_ADDR_LEN) == 0) {
+ fprintf(stderr, "Invalid MAC %s\n", mac_str);
+ return (EINVAL);
+ } else
+ memcpy(mac_addr, ea->octet, ETHER_ADDR_LEN);
+ }
+
+ return (0);
+}
+#endif
+
+
+static int
+pci_vtnet_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+#ifdef __FreeBSD__
+ MD5_CTX mdctx;
+ unsigned char digest[16];
+#else
+ uchar_t physaddr[DLPI_PHYSADDR_MAX];
+ size_t physaddrlen = DLPI_PHYSADDR_MAX;
+ int error;
+#endif
+ char nstr[80];
+ char tname[MAXCOMLEN + 1];
+ struct pci_vtnet_softc *sc;
+ const char *env_msi;
+ char *devname;
+ char *vtopts;
+ int mac_provided;
+ int use_msix;
+
+ sc = malloc(sizeof(struct pci_vtnet_softc));
+ memset(sc, 0, sizeof(struct pci_vtnet_softc));
+
+ pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+ vi_softc_linkup(&sc->vsc_vs, &vtnet_vi_consts, sc, pi, sc->vsc_queues);
+ sc->vsc_queues[VTNET_RXQ].vq_qsize = VTNET_RINGSZ;
+ sc->vsc_queues[VTNET_RXQ].vq_notify = pci_vtnet_ping_rxq;
+ sc->vsc_queues[VTNET_TXQ].vq_qsize = VTNET_RINGSZ;
+ sc->vsc_queues[VTNET_TXQ].vq_notify = pci_vtnet_ping_txq;
+#ifdef notyet
+ sc->vsc_queues[VTNET_CTLQ].vq_qsize = VTNET_RINGSZ;
+ sc->vsc_queues[VTNET_CTLQ].vq_notify = pci_vtnet_ping_ctlq;
+#endif
+
+ /*
+ * Use MSI if set by user
+ */
+ use_msix = 1;
+ if ((env_msi = getenv("BHYVE_USE_MSI")) != NULL) {
+ if (strcasecmp(env_msi, "yes") == 0)
+ use_msix = 0;
+ }
+
+ /*
+ * Attempt to open the tap device and read the MAC address
+ * if specified
+ */
+ mac_provided = 0;
+#ifdef __FreeBSD__
+ sc->vsc_tapfd = -1;
+#endif
+ if (opts != NULL) {
+ char tbuf[80];
+ int err;
+
+ devname = vtopts = strdup(opts);
+ (void) strsep(&vtopts, ",");
+
+#ifdef __FreBSD__
+ if (vtopts != NULL) {
+ err = pci_vtnet_parsemac(vtopts, sc->vsc_config.mac);
+ if (err != 0) {
+ free(devname);
+ return (err);
+ }
+ mac_provided = 1;
+ }
+#endif
+
+ strcpy(tbuf, "/dev/");
+ strlcat(tbuf, devname, sizeof(tbuf));
+
+ free(devname);
+
+#ifdef __FreeBSD__
+ sc->vsc_tapfd = open(tbuf, O_RDWR);
+ if (sc->vsc_tapfd == -1) {
+ WPRINTF(("open of tap device %s failed\n", tbuf));
+ } else {
+ /*
+ * Set non-blocking and register for read
+ * notifications with the event loop
+ */
+ int opt = 1;
+ if (ioctl(sc->vsc_tapfd, FIONBIO, &opt) < 0) {
+ WPRINTF(("tap device O_NONBLOCK failed\n"));
+ close(sc->vsc_tapfd);
+ sc->vsc_tapfd = -1;
+ }
+
+ sc->vsc_mevp = mevent_add(sc->vsc_tapfd,
+ EVF_READ,
+ pci_vtnet_tap_callback,
+ sc);
+ if (sc->vsc_mevp == NULL) {
+ WPRINTF(("Could not register event\n"));
+ close(sc->vsc_tapfd);
+ sc->vsc_tapfd = -1;
+ }
+ }
+#else
+ if (dlpi_open(opts, &sc->vsc_dhp, DLPI_RAW) != DLPI_SUCCESS) {
+ WPRINTF(("open of vnic device %s failed\n", opts));
+ }
+
+ if (dlpi_get_physaddr(sc->vsc_dhp, DL_CURR_PHYS_ADDR, physaddr, &physaddrlen) != DLPI_SUCCESS) {
+ WPRINTF(("read MAC address of vnic device %s failed\n", opts));
+ }
+ if (physaddrlen != ETHERADDRL) {
+ WPRINTF(("bad MAC address len %d on vnic device %s\n", physaddrlen, opts));
+ }
+ memcpy(sc->vsc_config.mac, physaddr, ETHERADDRL);
+
+ if (dlpi_bind(sc->vsc_dhp, DLPI_ANY_SAP, NULL) != DLPI_SUCCESS) {
+ WPRINTF(("bind of vnic device %s failed\n", opts));
+ }
+
+ if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_PHYS) != DLPI_SUCCESS) {
+ WPRINTF(("enable promiscous mode(physical) of vnic device %s failed\n", opts));
+ }
+ if (dlpi_promiscon(sc->vsc_dhp, DL_PROMISC_SAP) != DLPI_SUCCESS) {
+ WPRINTF(("enable promiscous mode(SAP) of vnic device %s failed\n", opts));
+ }
+
+ sc->vsc_dlpifd = dlpi_fd(sc->vsc_dhp);
+
+ if (fcntl(sc->vsc_dlpifd, F_SETFL, O_NONBLOCK) < 0) {
+ WPRINTF(("enable O_NONBLOCK of vnic device %s failed\n", opts));
+ dlpi_close(sc->vsc_dhp);
+ sc->vsc_dlpifd = -1;
+ }
+
+ error = pthread_create(NULL, NULL, pci_vtnet_poll_thread, sc);
+ assert(error == 0);
+#endif
+ }
+
+#ifdef __FreeBSD__
+ /*
+ * The default MAC address is the standard NetApp OUI of 00-a0-98,
+ * followed by an MD5 of the PCI slot/func number and dev name
+ */
+ if (!mac_provided) {
+ snprintf(nstr, sizeof(nstr), "%d-%d-%s", pi->pi_slot,
+ pi->pi_func, vmname);
+
+ MD5Init(&mdctx);
+ MD5Update(&mdctx, nstr, strlen(nstr));
+ MD5Final(digest, &mdctx);
+
+ sc->vsc_config.mac[0] = 0x00;
+ sc->vsc_config.mac[1] = 0xa0;
+ sc->vsc_config.mac[2] = 0x98;
+ sc->vsc_config.mac[3] = digest[0];
+ sc->vsc_config.mac[4] = digest[1];
+ sc->vsc_config.mac[5] = digest[2];
+ }
+#endif
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+ pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+
+ /* link always up */
+ sc->vsc_config.status = 1;
+
+ /* use BAR 1 to map MSI-X table and PBA, if we're using MSI-X */
+ if (vi_intr_init(&sc->vsc_vs, 1, use_msix))
+ return (1);
+
+ /* use BAR 0 to map config regs in IO space */
+ vi_set_io_bar(&sc->vsc_vs, 0);
+
+ sc->resetting = 0;
+
+ sc->rx_in_progress = 0;
+ pthread_mutex_init(&sc->rx_mtx, NULL);
+
+ /*
+ * Initialize tx semaphore & spawn TX processing thread.
+ * As of now, only one thread for TX desc processing is
+ * spawned.
+ */
+ sc->tx_in_progress = 0;
+ pthread_mutex_init(&sc->tx_mtx, NULL);
+ pthread_cond_init(&sc->tx_cond, NULL);
+ pthread_create(&sc->tx_tid, NULL, pci_vtnet_tx_thread, (void *)sc);
+ snprintf(tname, sizeof(tname), "%s vtnet%d tx", vmname, pi->pi_slot);
+ pthread_set_name_np(sc->tx_tid, tname);
+
+ return (0);
+}
+
+static int
+pci_vtnet_cfgwrite(void *vsc, int offset, int size, uint32_t value)
+{
+ struct pci_vtnet_softc *sc = vsc;
+ void *ptr;
+
+ if (offset < 6) {
+ assert(offset + size <= 6);
+ /*
+ * The driver is allowed to change the MAC address
+ */
+ ptr = &sc->vsc_config.mac[offset];
+ memcpy(ptr, &value, size);
+ } else {
+ DPRINTF(("vtnet: write to readonly reg %d\n\r", offset));
+ return (1);
+ }
+ return (0);
+}
+
+static int
+pci_vtnet_cfgread(void *vsc, int offset, int size, uint32_t *retval)
+{
+ struct pci_vtnet_softc *sc = vsc;
+ void *ptr;
+
+ ptr = (uint8_t *)&sc->vsc_config + offset;
+ memcpy(retval, ptr, size);
+ return (0);
+}
+
+struct pci_devemu pci_de_vnet = {
+ .pe_emu = "virtio-net",
+ .pe_init = pci_vtnet_init,
+ .pe_barwrite = vi_pci_write,
+ .pe_barread = vi_pci_read
+};
+PCI_EMUL_SET(pci_de_vnet);
diff --git a/usr/src/cmd/bhyve/pci_virtio_viona.c b/usr/src/cmd/bhyve/pci_virtio_viona.c
new file mode 100644
index 0000000000..f4d5d528be
--- /dev/null
+++ b/usr/src/cmd/bhyve/pci_virtio_viona.c
@@ -0,0 +1,706 @@
+/*
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+
+#include <sys/param.h>
+#include <sys/linker_set.h>
+#include <sys/ioctl.h>
+#include <sys/viona_io.h>
+
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <strings.h>
+#include <unistd.h>
+#include <assert.h>
+#include <pthread.h>
+#include <signal.h>
+#include <poll.h>
+#include <libdladm.h>
+#include <libdllink.h>
+#include <libdlvnic.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+
+#define VIONA_RINGSZ 1024
+
+/*
+ * PCI config-space register offsets
+ */
+#define VIONA_R_CFG0 24
+#define VIONA_R_CFG1 25
+#define VIONA_R_CFG2 26
+#define VIONA_R_CFG3 27
+#define VIONA_R_CFG4 28
+#define VIONA_R_CFG5 29
+#define VIONA_R_CFG6 30
+#define VIONA_R_CFG7 31
+#define VIONA_R_MAX 31
+
+#define VIONA_REGSZ VIONA_R_MAX+1
+
+/*
+ * Host capabilities
+ */
+#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
+#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
+#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
+
+#define VIONA_S_HOSTCAPS \
+ (VIRTIO_NET_F_MAC | \
+ VIRTIO_NET_F_MRG_RXBUF | \
+ VIRTIO_NET_F_STATUS)
+
+/*
+ * Queue definitions.
+ */
+#define VIONA_RXQ 0
+#define VIONA_TXQ 1
+#define VIONA_CTLQ 2
+
+#define VIONA_MAXQ 3
+
+/*
+ * Debug printf
+ */
+static int pci_viona_debug;
+#define DPRINTF(params) if (pci_viona_debug) printf params
+#define WPRINTF(params) printf params
+
+/*
+ * Per-device softc
+ */
+struct pci_viona_softc {
+ struct pci_devinst *vsc_pi;
+ pthread_mutex_t vsc_mtx;
+
+ int vsc_curq;
+ int vsc_status;
+ int vsc_isr;
+
+ datalink_id_t vsc_linkid;
+ char vsc_linkname[MAXLINKNAMELEN];
+ int vsc_vnafd;
+
+ uint32_t vsc_features;
+ uint8_t vsc_macaddr[6];
+
+ uint64_t vsc_pfn[VIONA_MAXQ];
+ uint16_t vsc_msix_table_idx[VIONA_MAXQ];
+ /*
+ * Flag to see if host is already sending data out.
+ * If it is, no need to wait for lock and send interrupt to host
+ * for new data.
+ */
+ boolean_t vsc_tx_kick_lock_held;
+
+ pthread_t tx_tid;
+ pthread_mutex_t tx_mtx;
+ pthread_cond_t tx_cond;
+};
+#define viona_ctx(sc) ((sc)->vsc_pi->pi_vmctx)
+
+/*
+ * Return the size of IO BAR that maps virtio header and device specific
+ * region. The size would vary depending on whether MSI-X is enabled or
+ * not.
+ */
+static uint64_t
+pci_viona_iosize(struct pci_devinst *pi)
+{
+ if (pci_msix_enabled(pi))
+ return (VIONA_REGSZ);
+ else
+ return (VIONA_REGSZ - (VTCFG_R_CFG1 - VTCFG_R_MSIX));
+}
+
+static uint16_t
+pci_viona_qsize(int qnum)
+{
+ /* XXX no ctl queue currently */
+ if (qnum == VIONA_CTLQ) {
+ return (0);
+ }
+
+ /* XXX fixed currently. Maybe different for tx/rx/ctl */
+ return (VIONA_RINGSZ);
+}
+
+static void
+pci_viona_ring_reset(struct pci_viona_softc *sc, int ring)
+{
+ int error;
+
+ assert(ring < VIONA_MAXQ);
+
+ switch (ring) {
+ case VIONA_RXQ:
+ error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_RESET);
+ if (error != 0) {
+ WPRINTF(("ioctl viona rx ring reset failed %d\n",
+ error));
+ } else {
+ sc->vsc_pfn[VIONA_RXQ] = 0;
+ }
+ break;
+ case VIONA_TXQ:
+ error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_RESET);
+ if (error != 0) {
+ WPRINTF(("ioctl viona tx ring reset failed %d\n",
+ error));
+ } else {
+ sc->vsc_pfn[VIONA_TXQ] = 0;
+ }
+ break;
+ case VIONA_CTLQ:
+ default:
+ break;
+ }
+}
+
+static void
+pci_viona_update_status(struct pci_viona_softc *sc, uint32_t value)
+{
+
+ if (value == 0) {
+ DPRINTF(("viona: device reset requested !\n"));
+ pci_viona_ring_reset(sc, VIONA_RXQ);
+ pci_viona_ring_reset(sc, VIONA_TXQ);
+ }
+
+ sc->vsc_status = value;
+}
+
+static void *
+pci_viona_poll_thread(void *param)
+{
+ struct pci_viona_softc *sc = param;
+ pollfd_t pollset;
+ int error;
+
+ pollset.fd = sc->vsc_vnafd;
+ pollset.events = POLLIN | POLLOUT;
+
+ for (;;) {
+ if (poll(&pollset, 1, -1) < 0) {
+ if (errno == EINTR || errno == EAGAIN) {
+ continue;
+ } else {
+ WPRINTF(("pci_viona_poll_thread poll()"
+ "error %d\n", errno));
+ break;
+ }
+ }
+ if (pollset.revents & POLLIN) {
+ pci_generate_msix(sc->vsc_pi,
+ sc->vsc_msix_table_idx[VIONA_RXQ]);
+ error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_INTR_CLR);
+ if (error != 0) {
+ WPRINTF(("ioctl viona rx intr clear failed"
+ " %d\n", error));
+ }
+ }
+
+ if (pollset.revents & POLLOUT) {
+ pci_generate_msix(sc->vsc_pi,
+ sc->vsc_msix_table_idx[VIONA_TXQ]);
+ error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_INTR_CLR);
+ if (error != 0) {
+ WPRINTF(("ioctl viona tx intr clear failed"
+ " %d\n", error));
+ }
+ }
+ }
+
+ pthread_exit(NULL);
+}
+
+static void
+pci_viona_ping_rxq(struct pci_viona_softc *sc)
+{
+ int error;
+
+ error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_KICK);
+ if (error != 0) {
+ WPRINTF(("ioctl viona rx ring kick failed %d\n", error));
+ }
+}
+
+static void *
+pci_viona_tx_thread(void *param)
+{
+ struct pci_viona_softc *sc = (struct pci_viona_softc *)param;
+ int error;
+
+ pthread_mutex_lock(&sc->tx_mtx);
+ for (;;) {
+ error = pthread_cond_wait(&sc->tx_cond, &sc->tx_mtx);
+ assert(error == 0);
+ sc->vsc_tx_kick_lock_held = B_TRUE;
+ error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_KICK);
+ if (error != 0) {
+ WPRINTF(("ioctl viona tx ring kick failed %d\n",
+ error));
+ }
+ sc->vsc_tx_kick_lock_held = B_FALSE;
+ }
+ pthread_mutex_unlock(&sc->tx_mtx);
+}
+
+static void
+pci_viona_ping_txq(struct pci_viona_softc *sc)
+{
+ /* Signal the tx thread for processing */
+ if (sc->vsc_tx_kick_lock_held)
+ return;
+ pthread_mutex_lock(&sc->tx_mtx);
+ pthread_cond_signal(&sc->tx_cond);
+ pthread_mutex_unlock(&sc->tx_mtx);
+}
+
+static void
+pci_viona_ping_ctlq(struct pci_viona_softc *sc)
+{
+ DPRINTF(("viona: control qnotify!\n\r"));
+}
+
+static void
+pci_viona_ring_init(struct pci_viona_softc *sc, uint64_t pfn)
+{
+ int qnum = sc->vsc_curq;
+ vioc_ring_init_t vna_ri;
+ int error;
+
+ assert(qnum < VIONA_MAXQ);
+
+ sc->vsc_pfn[qnum] = (pfn << VRING_PFN);
+
+ vna_ri.ri_qsize = pci_viona_qsize(qnum);
+ vna_ri.ri_qaddr = (pfn << VRING_PFN);
+
+ switch (qnum) {
+ case VIONA_RXQ:
+ error = ioctl(sc->vsc_vnafd, VNA_IOC_RX_RING_INIT, &vna_ri);
+ if (error != 0) {
+ WPRINTF(("ioctl viona rx ring init failed %d\n",
+ error));
+ }
+ break;
+ case VIONA_TXQ:
+ error = ioctl(sc->vsc_vnafd, VNA_IOC_TX_RING_INIT, &vna_ri);
+ if (error != 0) {
+ WPRINTF(("ioctl viona tx ring init failed %d\n",
+ error));
+ }
+ break;
+ case VIONA_CTLQ:
+ default:
+ break;
+ }
+}
+
+static int
+pci_viona_viona_init(struct vmctx *ctx, struct pci_viona_softc *sc)
+{
+ vioc_create_t vna_create;
+ char devname[MAXNAMELEN];
+ int ctlfd;
+ int error;
+
+ sc->vsc_vnafd = open("/devices/pseudo/viona@0:ctl", O_RDWR | O_EXCL);
+ if (sc->vsc_vnafd == -1) {
+ WPRINTF(("open viona ctl failed\n"));
+ return (-1);
+ }
+
+ vna_create.c_linkid = sc->vsc_linkid;
+ strlcpy(vna_create.c_vmname, vmname,
+ sizeof (vna_create.c_vmname));
+ vm_get_memory_seg(ctx, 1 * (1024 * 1024UL), &vna_create.c_lomem_size,
+ NULL);
+ vm_get_memory_seg(ctx, 4 * (1024 * 1024 * 1024UL),
+ &vna_create.c_himem_size, NULL);
+ error = ioctl(sc->vsc_vnafd, VNA_IOC_CREATE, &vna_create);
+ if (error != 0) {
+ WPRINTF(("ioctl viona create failed %d\n", error));
+ return (-1);
+ }
+
+ return (0);
+}
+
+static int
+pci_viona_init(struct vmctx *ctx, struct pci_devinst *pi, char *opts)
+{
+ dladm_handle_t handle;
+ dladm_status_t status;
+ dladm_vnic_attr_t attr;
+ char errmsg[DLADM_STRSIZE];
+ int error;
+ struct pci_viona_softc *sc;
+ int i;
+
+ if (opts == NULL) {
+ printf("virtio-viona: vnic required\n");
+ return (1);
+ }
+
+ sc = malloc(sizeof (struct pci_viona_softc));
+ memset(sc, 0, sizeof (struct pci_viona_softc));
+
+ pi->pi_arg = sc;
+ sc->vsc_pi = pi;
+
+ pthread_mutex_init(&sc->vsc_mtx, NULL);
+
+ strlcpy(sc->vsc_linkname, opts, MAXLINKNAMELEN);
+
+ if ((status = dladm_open(&handle)) != DLADM_STATUS_OK) {
+ WPRINTF(("could not open /dev/dld"));
+ free(sc);
+ return (1);
+ }
+
+ if (dladm_name2info(handle, sc->vsc_linkname, &sc->vsc_linkid,
+ NULL, NULL, NULL) != DLADM_STATUS_OK) {
+ WPRINTF(("dladm_name2info() for %s failed: %s\n", opts,
+ dladm_status2str(status, errmsg)));
+ dladm_close(handle);
+ free(sc);
+ return (1);
+ }
+
+ if (dladm_vnic_info(handle, sc->vsc_linkid, &attr,
+ DLADM_OPT_ACTIVE) != DLADM_STATUS_OK) {
+ WPRINTF(("dladm_vnic_info() for %s failed: %s\n", opts,
+ dladm_status2str(status, errmsg)));
+ dladm_close(handle);
+ free(sc);
+ return (1);
+ }
+
+ sc->vsc_tx_kick_lock_held = B_FALSE;
+ memcpy(sc->vsc_macaddr, attr.va_mac_addr, ETHERADDRL);
+
+ dladm_close(handle);
+
+ error = pci_viona_viona_init(ctx, sc);
+ if (error != 0) {
+ free(sc);
+ return (1);
+ }
+
+ error = pthread_create(NULL, NULL, pci_viona_poll_thread, sc);
+ assert(error == 0);
+
+ /* initialize config space */
+ pci_set_cfgdata16(pi, PCIR_DEVICE, VIRTIO_DEV_NET);
+ pci_set_cfgdata16(pi, PCIR_VENDOR, VIRTIO_VENDOR);
+ pci_set_cfgdata8(pi, PCIR_CLASS, PCIC_NETWORK);
+ pci_set_cfgdata16(pi, PCIR_SUBDEV_0, VIRTIO_TYPE_NET);
+
+ /* MSI-X support */
+ for (i = 0; i < VIONA_MAXQ; i++)
+ sc->vsc_msix_table_idx[i] = VIRTIO_MSI_NO_VECTOR;
+
+ /*
+ * BAR 1 used to map MSI-X table and PBA
+ */
+ if (pci_emul_add_msixcap(pi, VIONA_MAXQ, 1)) {
+ free(sc);
+ return (1);
+ }
+
+ pci_emul_alloc_bar(pi, 0, PCIBAR_IO, VIONA_REGSZ);
+
+ /*
+ * Initialize tx semaphore & spawn TX processing thread
+ * As of now, only one thread for TX desc processing is
+ * spawned.
+ */
+ pthread_mutex_init(&sc->tx_mtx, NULL);
+ pthread_cond_init(&sc->tx_cond, NULL);
+ pthread_create(&sc->tx_tid, NULL, pci_viona_tx_thread, (void *)sc);
+
+ return (0);
+}
+
+/*
+ * Function pointer array to handle queue notifications
+ */
+static void (*pci_viona_qnotify[VIONA_MAXQ])(struct pci_viona_softc *) = {
+ pci_viona_ping_rxq,
+ pci_viona_ping_txq,
+ pci_viona_ping_ctlq
+};
+
+static uint64_t
+viona_adjust_offset(struct pci_devinst *pi, uint64_t offset)
+{
+ /*
+ * Device specific offsets used by guest would change based on
+ * whether MSI-X capability is enabled or not
+ */
+ if (!pci_msix_enabled(pi)) {
+ if (offset >= VTCFG_R_MSIX)
+ return (offset + (VTCFG_R_CFG1 - VTCFG_R_MSIX));
+ }
+
+ return (offset);
+}
+
+static void
+pci_viona_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+ struct pci_viona_softc *sc = pi->pi_arg;
+ void *ptr;
+ int err = 0;
+
+ if (baridx == pci_msix_table_bar(pi) ||
+ baridx == pci_msix_pba_bar(pi)) {
+ pci_emul_msix_twrite(pi, offset, size, value);
+ return;
+ }
+
+ assert(baridx == 0);
+
+ if (offset + size > pci_viona_iosize(pi)) {
+ DPRINTF(("viona_write: 2big, offset %ld size %d\n",
+ offset, size));
+ return;
+ }
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+
+ offset = viona_adjust_offset(pi, offset);
+
+ switch (offset) {
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ err = ioctl(sc->vsc_vnafd, VNA_IOC_SET_FEATURES, &value);
+ if (err != 0)
+ WPRINTF(("ioctl feature negotiation returned"
+ " err = %d\n", err));
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ pci_viona_ring_init(sc, value);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ assert(value < VIONA_MAXQ);
+ sc->vsc_curq = value;
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ assert(value < VIONA_MAXQ);
+ (*pci_viona_qnotify[value])(sc);
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ pci_viona_update_status(sc, value);
+ break;
+ case VTCFG_R_CFGVEC:
+ assert(size == 2);
+ sc->vsc_msix_table_idx[VIONA_CTLQ] = value;
+ break;
+ case VTCFG_R_QVEC:
+ assert(size == 2);
+ assert(sc->vsc_curq != VIONA_CTLQ);
+ sc->vsc_msix_table_idx[sc->vsc_curq] = value;
+ break;
+ case VIONA_R_CFG0:
+ case VIONA_R_CFG1:
+ case VIONA_R_CFG2:
+ case VIONA_R_CFG3:
+ case VIONA_R_CFG4:
+ case VIONA_R_CFG5:
+ assert((size + offset) <= (VIONA_R_CFG5 + 1));
+ ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0];
+ /*
+ * The driver is allowed to change the MAC address
+ */
+ sc->vsc_macaddr[offset - VIONA_R_CFG0] = value;
+ if (size == 1) {
+ *(uint8_t *)ptr = value;
+ } else if (size == 2) {
+ *(uint16_t *)ptr = value;
+ } else {
+ *(uint32_t *)ptr = value;
+ }
+ break;
+ case VTCFG_R_HOSTCAP:
+ case VTCFG_R_QNUM:
+ case VTCFG_R_ISR:
+ case VIONA_R_CFG6:
+ case VIONA_R_CFG7:
+ DPRINTF(("viona: write to readonly reg %ld\n\r", offset));
+ break;
+ default:
+ DPRINTF(("viona: unknown i/o write offset %ld\n\r", offset));
+ value = 0;
+ break;
+ }
+
+ pthread_mutex_unlock(&sc->vsc_mtx);
+}
+
+uint64_t
+pci_viona_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ struct pci_viona_softc *sc = pi->pi_arg;
+ void *ptr;
+ uint64_t value;
+ int err = 0;
+
+ if (baridx == pci_msix_table_bar(pi) ||
+ baridx == pci_msix_pba_bar(pi)) {
+ return (pci_emul_msix_tread(pi, offset, size));
+ }
+
+ assert(baridx == 0);
+
+ if (offset + size > pci_viona_iosize(pi)) {
+ DPRINTF(("viona_read: 2big, offset %ld size %d\n",
+ offset, size));
+ return (0);
+ }
+
+ pthread_mutex_lock(&sc->vsc_mtx);
+
+ offset = viona_adjust_offset(pi, offset);
+
+ switch (offset) {
+ case VTCFG_R_HOSTCAP:
+ assert(size == 4);
+ err = ioctl(sc->vsc_vnafd, VNA_IOC_GET_FEATURES, &value);
+ if (err != 0)
+ WPRINTF(("ioctl get host features returned"
+ " err = %d\n", err));
+ break;
+ case VTCFG_R_GUESTCAP:
+ assert(size == 4);
+ value = sc->vsc_features; /* XXX never read ? */
+ break;
+ case VTCFG_R_PFN:
+ assert(size == 4);
+ value = sc->vsc_pfn[sc->vsc_curq] >> VRING_PFN;
+ break;
+ case VTCFG_R_QNUM:
+ assert(size == 2);
+ value = pci_viona_qsize(sc->vsc_curq);
+ break;
+ case VTCFG_R_QSEL:
+ assert(size == 2);
+ value = sc->vsc_curq; /* XXX never read ? */
+ break;
+ case VTCFG_R_QNOTIFY:
+ assert(size == 2);
+ value = sc->vsc_curq; /* XXX never read ? */
+ break;
+ case VTCFG_R_STATUS:
+ assert(size == 1);
+ value = sc->vsc_status;
+ break;
+ case VTCFG_R_ISR:
+ assert(size == 1);
+ value = sc->vsc_isr;
+ sc->vsc_isr = 0; /* a read clears this flag */
+ break;
+ case VTCFG_R_CFGVEC:
+ assert(size == 2);
+ value = sc->vsc_msix_table_idx[VIONA_CTLQ];
+ break;
+ case VTCFG_R_QVEC:
+ assert(size == 2);
+ assert(sc->vsc_curq != VIONA_CTLQ);
+ value = sc->vsc_msix_table_idx[sc->vsc_curq];
+ break;
+ case VIONA_R_CFG0:
+ case VIONA_R_CFG1:
+ case VIONA_R_CFG2:
+ case VIONA_R_CFG3:
+ case VIONA_R_CFG4:
+ case VIONA_R_CFG5:
+ assert((size + offset) <= (VIONA_R_CFG5 + 1));
+ ptr = &sc->vsc_macaddr[offset - VIONA_R_CFG0];
+ if (size == 1) {
+ value = *(uint8_t *)ptr;
+ } else if (size == 2) {
+ value = *(uint16_t *)ptr;
+ } else {
+ value = *(uint32_t *)ptr;
+ }
+ break;
+ case VIONA_R_CFG6:
+ assert(size != 4);
+ value = 0x01; /* XXX link always up */
+ break;
+ case VIONA_R_CFG7:
+ assert(size == 1);
+ value = 0; /* XXX link status in LSB */
+ break;
+ default:
+ DPRINTF(("viona: unknown i/o read offset %ld\n\r", offset));
+ value = 0;
+ break;
+ }
+
+ pthread_mutex_unlock(&sc->vsc_mtx);
+
+ return (value);
+}
+
+struct pci_devemu pci_de_viona = {
+ .pe_emu = "virtio-net-viona",
+ .pe_init = pci_viona_init,
+ .pe_barwrite = pci_viona_write,
+ .pe_barread = pci_viona_read
+};
+PCI_EMUL_SET(pci_de_viona);
diff --git a/usr/src/cmd/bhyve/pm.c b/usr/src/cmd/bhyve/pm.c
new file mode 100644
index 0000000000..70c4f1fae8
--- /dev/null
+++ b/usr/src/cmd/bhyve/pm.c
@@ -0,0 +1,333 @@
+/*-
+ * Copyright (c) 2013 Advanced Computing Technologies LLC
+ * Written by: John H. Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pm.c 266125 2014-05-15 14:16:55Z jhb $");
+
+#include <sys/types.h>
+#include <machine/vmm.h>
+
+#include <assert.h>
+#include <pthread.h>
+#ifndef __FreeBSD__
+#include <stdlib.h>
+#endif
+#include <signal.h>
+#include <vmmapi.h>
+
+#include "acpi.h"
+#include "inout.h"
+#ifdef __FreeBSD__
+#include "mevent.h"
+#endif
+#include "pci_irq.h"
+#include "pci_lpc.h"
+
+static pthread_mutex_t pm_lock = PTHREAD_MUTEX_INITIALIZER;
+#ifdef __FreeBSD__
+static struct mevent *power_button;
+static sig_t old_power_handler;
+#endif
+
+/*
+ * Reset Control register at I/O port 0xcf9. Bit 2 forces a system
+ * reset when it transitions from 0 to 1. Bit 1 selects the type of
+ * reset to attempt: 0 selects a "soft" reset, and 1 selects a "hard"
+ * reset.
+ */
+static int
+reset_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ static uint8_t reset_control;
+
+ if (bytes != 1)
+ return (-1);
+ if (in)
+ *eax = reset_control;
+ else {
+ reset_control = *eax;
+
+ /* Treat hard and soft resets the same. */
+ if (reset_control & 0x4) {
+#ifdef __FreeBSD__
+ error = vm_suspend(ctx, VM_SUSPEND_RESET);
+ assert(error == 0 || errno == EALREADY);
+#else
+ exit(0);
+#endif
+ }
+ }
+ return (0);
+}
+INOUT_PORT(reset_reg, 0xCF9, IOPORT_F_INOUT, reset_handler);
+
+/*
+ * ACPI's SCI is a level-triggered interrupt.
+ */
+static int sci_active;
+
+static void
+sci_assert(struct vmctx *ctx)
+{
+
+ if (sci_active)
+ return;
+ vm_isa_assert_irq(ctx, SCI_INT, SCI_INT);
+ sci_active = 1;
+}
+
+static void
+sci_deassert(struct vmctx *ctx)
+{
+
+ if (!sci_active)
+ return;
+ vm_isa_deassert_irq(ctx, SCI_INT, SCI_INT);
+ sci_active = 0;
+}
+
+/*
+ * Power Management 1 Event Registers
+ *
+ * The only power management event supported is a power button upon
+ * receiving SIGTERM.
+ */
+static uint16_t pm1_enable, pm1_status;
+
+#define PM1_TMR_STS 0x0001
+#define PM1_BM_STS 0x0010
+#define PM1_GBL_STS 0x0020
+#define PM1_PWRBTN_STS 0x0100
+#define PM1_SLPBTN_STS 0x0200
+#define PM1_RTC_STS 0x0400
+#define PM1_WAK_STS 0x8000
+
+#define PM1_TMR_EN 0x0001
+#define PM1_GBL_EN 0x0020
+#define PM1_PWRBTN_EN 0x0100
+#define PM1_SLPBTN_EN 0x0200
+#define PM1_RTC_EN 0x0400
+
+static void
+sci_update(struct vmctx *ctx)
+{
+ int need_sci;
+
+ /* See if the SCI should be active or not. */
+ need_sci = 0;
+ if ((pm1_enable & PM1_TMR_EN) && (pm1_status & PM1_TMR_STS))
+ need_sci = 1;
+ if ((pm1_enable & PM1_GBL_EN) && (pm1_status & PM1_GBL_STS))
+ need_sci = 1;
+ if ((pm1_enable & PM1_PWRBTN_EN) && (pm1_status & PM1_PWRBTN_STS))
+ need_sci = 1;
+ if ((pm1_enable & PM1_SLPBTN_EN) && (pm1_status & PM1_SLPBTN_STS))
+ need_sci = 1;
+ if ((pm1_enable & PM1_RTC_EN) && (pm1_status & PM1_RTC_STS))
+ need_sci = 1;
+ if (need_sci)
+ sci_assert(ctx);
+ else
+ sci_deassert(ctx);
+}
+
+static int
+pm1_status_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+
+ if (bytes != 2)
+ return (-1);
+
+ pthread_mutex_lock(&pm_lock);
+ if (in)
+ *eax = pm1_status;
+ else {
+ /*
+ * Writes are only permitted to clear certain bits by
+ * writing 1 to those flags.
+ */
+ pm1_status &= ~(*eax & (PM1_WAK_STS | PM1_RTC_STS |
+ PM1_SLPBTN_STS | PM1_PWRBTN_STS | PM1_BM_STS));
+ sci_update(ctx);
+ }
+ pthread_mutex_unlock(&pm_lock);
+ return (0);
+}
+
+static int
+pm1_enable_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+
+ if (bytes != 2)
+ return (-1);
+
+ pthread_mutex_lock(&pm_lock);
+ if (in)
+ *eax = pm1_enable;
+ else {
+ /*
+ * Only permit certain bits to be set. We never use
+ * the global lock, but ACPI-CA whines profusely if it
+ * can't set GBL_EN.
+ */
+ pm1_enable = *eax & (PM1_PWRBTN_EN | PM1_GBL_EN);
+ sci_update(ctx);
+ }
+ pthread_mutex_unlock(&pm_lock);
+ return (0);
+}
+INOUT_PORT(pm1_status, PM1A_EVT_ADDR, IOPORT_F_INOUT, pm1_status_handler);
+INOUT_PORT(pm1_enable, PM1A_EVT_ADDR + 2, IOPORT_F_INOUT, pm1_enable_handler);
+
+#ifdef __FreeBSD__
+static void
+power_button_handler(int signal, enum ev_type type, void *arg)
+{
+ struct vmctx *ctx;
+
+ ctx = arg;
+ pthread_mutex_lock(&pm_lock);
+ if (!(pm1_status & PM1_PWRBTN_STS)) {
+ pm1_status |= PM1_PWRBTN_STS;
+ sci_update(ctx);
+ }
+ pthread_mutex_unlock(&pm_lock);
+}
+#endif
+
+/*
+ * Power Management 1 Control Register
+ *
+ * This is mostly unimplemented except that we wish to handle writes that
+ * set SPL_EN to handle S5 (soft power off).
+ */
+static uint16_t pm1_control;
+
+#define PM1_SCI_EN 0x0001
+#define PM1_SLP_TYP 0x1c00
+#define PM1_SLP_EN 0x2000
+#define PM1_ALWAYS_ZERO 0xc003
+
+static int
+pm1_control_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+
+ if (bytes != 2)
+ return (-1);
+ if (in)
+ *eax = pm1_control;
+ else {
+ /*
+ * Various bits are write-only or reserved, so force them
+ * to zero in pm1_control. Always preserve SCI_EN as OSPM
+ * can never change it.
+ */
+ pm1_control = (pm1_control & PM1_SCI_EN) |
+ (*eax & ~(PM1_SLP_EN | PM1_ALWAYS_ZERO));
+
+ /*
+ * If SLP_EN is set, check for S5. Bhyve's _S5_ method
+ * says that '5' should be stored in SLP_TYP for S5.
+ */
+ if (*eax & PM1_SLP_EN) {
+ if ((pm1_control & PM1_SLP_TYP) >> 10 == 5) {
+#ifdef __FreeBSD__
+ error = vm_suspend(ctx, VM_SUSPEND_POWEROFF);
+ assert(error == 0 || errno == EALREADY);
+#else
+ exit(0);
+#endif
+ }
+ }
+ }
+ return (0);
+}
+INOUT_PORT(pm1_control, PM1A_CNT_ADDR, IOPORT_F_INOUT, pm1_control_handler);
+#ifdef __FreeBSD__
+SYSRES_IO(PM1A_EVT_ADDR, 8);
+#endif
+
+/*
+ * ACPI SMI Command Register
+ *
+ * This write-only register is used to enable and disable ACPI.
+ */
+static int
+smi_cmd_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+
+ assert(!in);
+ if (bytes != 1)
+ return (-1);
+
+ pthread_mutex_lock(&pm_lock);
+ switch (*eax) {
+ case BHYVE_ACPI_ENABLE:
+ pm1_control |= PM1_SCI_EN;
+#ifdef __FreeBSD__
+ if (power_button == NULL) {
+ power_button = mevent_add(SIGTERM, EVF_SIGNAL,
+ power_button_handler, ctx);
+ old_power_handler = signal(SIGTERM, SIG_IGN);
+ }
+#endif
+ break;
+ case BHYVE_ACPI_DISABLE:
+ pm1_control &= ~PM1_SCI_EN;
+#ifdef __FreeBSD__
+ if (power_button != NULL) {
+ mevent_delete(power_button);
+ power_button = NULL;
+ signal(SIGTERM, old_power_handler);
+ }
+#endif
+ break;
+ }
+ pthread_mutex_unlock(&pm_lock);
+ return (0);
+}
+INOUT_PORT(smi_cmd, SMI_CMD, IOPORT_F_OUT, smi_cmd_handler);
+#ifdef __FreeBSD__
+SYSRES_IO(SMI_CMD, 1);
+#endif
+
+void
+sci_init(struct vmctx *ctx)
+{
+
+ /*
+ * Mark ACPI's SCI as level trigger and bump its use count
+ * in the PIRQ router.
+ */
+ pci_irq_use(SCI_INT);
+ vm_isa_set_irq_trigger(ctx, SCI_INT, LEVEL_TRIGGER);
+}
diff --git a/usr/src/cmd/bhyve/pmtmr.c b/usr/src/cmd/bhyve/pmtmr.c
new file mode 100644
index 0000000000..92ab24be57
--- /dev/null
+++ b/usr/src/cmd/bhyve/pmtmr.c
@@ -0,0 +1,212 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/pmtmr.c 259998 2013-12-28 04:01:05Z jhb $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/pmtmr.c 259998 2013-12-28 04:01:05Z jhb $");
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/time.h>
+#include <machine/cpufunc.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#include <assert.h>
+#include <pthread.h>
+#ifndef __FreeBSD__
+#include <kstat.h>
+#endif
+
+#include "acpi.h"
+#include "inout.h"
+
+/*
+ * The ACPI Power Management timer is a free-running 24- or 32-bit
+ * timer with a frequency of 3.579545MHz
+ *
+ * This implementation will be 32-bits
+ */
+
+#define PMTMR_FREQ 3579545 /* 3.579545MHz */
+
+static pthread_mutex_t pmtmr_mtx;
+static pthread_once_t pmtmr_once = PTHREAD_ONCE_INIT;
+
+static uint64_t pmtmr_old;
+
+static uint64_t pmtmr_tscf;
+static uint64_t pmtmr_tsc_old;
+
+#ifdef __FreeBSD__
+static clockid_t clockid = CLOCK_UPTIME_FAST;
+static struct timespec pmtmr_uptime_old;
+
+#define timespecsub(vvp, uvp) \
+ do { \
+ (vvp)->tv_sec -= (uvp)->tv_sec; \
+ (vvp)->tv_nsec -= (uvp)->tv_nsec; \
+ if ((vvp)->tv_nsec < 0) { \
+ (vvp)->tv_sec--; \
+ (vvp)->tv_nsec += 1000000000; \
+ } \
+ } while (0)
+
+static uint64_t
+timespec_to_pmtmr(const struct timespec *tsnew, const struct timespec *tsold)
+{
+ struct timespec tsdiff;
+ int64_t nsecs;
+
+ tsdiff = *tsnew;
+ timespecsub(&tsdiff, tsold);
+ nsecs = tsdiff.tv_sec * 1000000000 + tsdiff.tv_nsec;
+ assert(nsecs >= 0);
+
+ return (nsecs * PMTMR_FREQ / 1000000000 + pmtmr_old);
+}
+#endif
+
+static uint64_t
+tsc_to_pmtmr(uint64_t tsc_new, uint64_t tsc_old)
+{
+
+ return ((tsc_new - tsc_old) * PMTMR_FREQ / pmtmr_tscf + pmtmr_old);
+}
+
+static void
+pmtmr_init(void)
+{
+#ifdef __FreeBSD__
+ size_t len;
+ int smp_tsc, err;
+ struct timespec tsnew, tsold = { 0 };
+
+ len = sizeof(smp_tsc);
+ err = sysctlbyname("kern.timecounter.smp_tsc", &smp_tsc, &len, NULL, 0);
+ assert(err == 0);
+
+ if (smp_tsc) {
+ len = sizeof(pmtmr_tscf);
+ err = sysctlbyname("machdep.tsc_freq", &pmtmr_tscf, &len,
+ NULL, 0);
+ assert(err == 0);
+
+ pmtmr_tsc_old = rdtsc();
+ pmtmr_old = tsc_to_pmtmr(pmtmr_tsc_old, 0);
+ } else {
+ if (getenv("BHYVE_PMTMR_PRECISE") != NULL)
+ clockid = CLOCK_UPTIME;
+
+ err = clock_gettime(clockid, &tsnew);
+ assert(err == 0);
+
+ pmtmr_uptime_old = tsnew;
+ pmtmr_old = timespec_to_pmtmr(&tsnew, &tsold);
+ }
+#else
+ kstat_ctl_t *kstat_ctl;
+ kstat_t *kstat;
+ kstat_named_t *kstat_cpu_freq;
+
+ kstat_ctl = kstat_open();
+ kstat = kstat_lookup(kstat_ctl, "cpu_info", 0, NULL);
+ kstat_read(kstat_ctl, kstat, NULL);
+ kstat_cpu_freq = kstat_data_lookup(kstat, "current_clock_Hz");
+ pmtmr_tscf = kstat_cpu_freq->value.ul;
+ kstat_close(kstat_ctl);
+
+ pmtmr_tsc_old = rdtsc();
+ pmtmr_old = tsc_to_pmtmr(pmtmr_tsc_old, 0);
+#endif
+ pthread_mutex_init(&pmtmr_mtx, NULL);
+}
+
+static uint32_t
+pmtmr_val(void)
+{
+ struct timespec tsnew;
+ uint64_t pmtmr_tsc_new;
+ uint64_t pmtmr_new;
+ int error;
+
+ pthread_once(&pmtmr_once, pmtmr_init);
+
+ pthread_mutex_lock(&pmtmr_mtx);
+
+#ifdef __FreeBSD__
+ if (pmtmr_tscf) {
+ pmtmr_tsc_new = rdtsc();
+ pmtmr_new = tsc_to_pmtmr(pmtmr_tsc_new, pmtmr_tsc_old);
+ pmtmr_tsc_old = pmtmr_tsc_new;
+ } else {
+ error = clock_gettime(clockid, &tsnew);
+ assert(error == 0);
+
+ pmtmr_new = timespec_to_pmtmr(&tsnew, &pmtmr_uptime_old);
+ pmtmr_uptime_old = tsnew;
+ }
+#else
+ pmtmr_tsc_new = rdtsc();
+ pmtmr_new = tsc_to_pmtmr(pmtmr_tsc_new, pmtmr_tsc_old);
+ pmtmr_tsc_old = pmtmr_tsc_new;
+#endif
+ pmtmr_old = pmtmr_new;
+
+ pthread_mutex_unlock(&pmtmr_mtx);
+
+ return (pmtmr_new);
+}
+
+static int
+pmtmr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 1);
+
+ if (bytes != 4)
+ return (-1);
+
+ *eax = pmtmr_val();
+
+ return (0);
+}
+
+INOUT_PORT(pmtmr, IO_PMTMR, IOPORT_F_IN, pmtmr_handler);
diff --git a/usr/src/cmd/bhyve/post.c b/usr/src/cmd/bhyve/post.c
new file mode 100644
index 0000000000..dcb481aac4
--- /dev/null
+++ b/usr/src/cmd/bhyve/post.c
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/post.c 260206 2014-01-02 21:26:59Z jhb $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/post.c 260206 2014-01-02 21:26:59Z jhb $");
+
+#include <sys/types.h>
+
+#include <assert.h>
+
+#include "inout.h"
+#include "pci_lpc.h"
+
+static int
+post_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ assert(in == 1);
+
+ if (bytes != 1)
+ return (-1);
+
+ *eax = 0xff; /* return some garbage */
+ return (0);
+}
+
+INOUT_PORT(post, 0x84, IOPORT_F_IN, post_data_handler);
+SYSRES_IO(0x84, 1);
diff --git a/usr/src/cmd/bhyve/ps2kbd.c b/usr/src/cmd/bhyve/ps2kbd.c
new file mode 100644
index 0000000000..22e566ac21
--- /dev/null
+++ b/usr/src/cmd/bhyve/ps2kbd.c
@@ -0,0 +1,418 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2015 Nahanni Systems Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "atkbdc.h"
+#include "console.h"
+
+/* keyboard device commands */
+#define PS2KC_RESET_DEV 0xff
+#define PS2KC_DISABLE 0xf5
+#define PS2KC_ENABLE 0xf4
+#define PS2KC_SET_TYPEMATIC 0xf3
+#define PS2KC_SEND_DEV_ID 0xf2
+#define PS2KC_SET_SCANCODE_SET 0xf0
+#define PS2KC_ECHO 0xee
+#define PS2KC_SET_LEDS 0xed
+
+#define PS2KC_BAT_SUCCESS 0xaa
+#define PS2KC_ACK 0xfa
+
+#define PS2KBD_FIFOSZ 16
+
+struct fifo {
+ uint8_t buf[PS2KBD_FIFOSZ];
+ int rindex; /* index to read from */
+ int windex; /* index to write to */
+ int num; /* number of bytes in the fifo */
+ int size; /* size of the fifo */
+};
+
+struct ps2kbd_softc {
+ struct atkbdc_softc *atkbdc_sc;
+ pthread_mutex_t mtx;
+
+ bool enabled;
+ struct fifo fifo;
+
+ uint8_t curcmd; /* current command for next byte */
+};
+
+static void
+fifo_init(struct ps2kbd_softc *sc)
+{
+ struct fifo *fifo;
+
+ fifo = &sc->fifo;
+ fifo->size = sizeof(((struct fifo *)0)->buf);
+}
+
+static void
+fifo_reset(struct ps2kbd_softc *sc)
+{
+ struct fifo *fifo;
+
+ fifo = &sc->fifo;
+ bzero(fifo, sizeof(struct fifo));
+ fifo->size = sizeof(((struct fifo *)0)->buf);
+}
+
+static int
+fifo_available(struct ps2kbd_softc *sc)
+{
+ struct fifo *fifo;
+
+ fifo = &sc->fifo;
+ return (fifo->num < fifo->size);
+}
+
+static void
+fifo_put(struct ps2kbd_softc *sc, uint8_t val)
+{
+ struct fifo *fifo;
+
+ fifo = &sc->fifo;
+ if (fifo->num < fifo->size) {
+ fifo->buf[fifo->windex] = val;
+ fifo->windex = (fifo->windex + 1) % fifo->size;
+ fifo->num++;
+ }
+}
+
+static int
+fifo_get(struct ps2kbd_softc *sc, uint8_t *val)
+{
+ struct fifo *fifo;
+
+ fifo = &sc->fifo;
+ if (fifo->num > 0) {
+ *val = fifo->buf[fifo->rindex];
+ fifo->rindex = (fifo->rindex + 1) % fifo->size;
+ fifo->num--;
+ return (0);
+ }
+
+ return (-1);
+}
+
+int
+ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val)
+{
+ int retval;
+
+ pthread_mutex_lock(&sc->mtx);
+ retval = fifo_get(sc, val);
+ pthread_mutex_unlock(&sc->mtx);
+
+ return (retval);
+}
+
+void
+ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val)
+{
+ pthread_mutex_lock(&sc->mtx);
+ if (sc->curcmd) {
+ switch (sc->curcmd) {
+ case PS2KC_SET_TYPEMATIC:
+ fifo_put(sc, PS2KC_ACK);
+ break;
+ case PS2KC_SET_SCANCODE_SET:
+ fifo_put(sc, PS2KC_ACK);
+ break;
+ case PS2KC_SET_LEDS:
+ fifo_put(sc, PS2KC_ACK);
+ break;
+ default:
+ fprintf(stderr, "Unhandled ps2 keyboard current "
+ "command byte 0x%02x\n", val);
+ break;
+ }
+ sc->curcmd = 0;
+ } else {
+ switch (val) {
+ case PS2KC_RESET_DEV:
+ fifo_reset(sc);
+ fifo_put(sc, PS2KC_ACK);
+ fifo_put(sc, PS2KC_BAT_SUCCESS);
+ break;
+ case PS2KC_DISABLE:
+ sc->enabled = false;
+ fifo_put(sc, PS2KC_ACK);
+ break;
+ case PS2KC_ENABLE:
+ sc->enabled = true;
+ fifo_reset(sc);
+ fifo_put(sc, PS2KC_ACK);
+ break;
+ case PS2KC_SET_TYPEMATIC:
+ sc->curcmd = val;
+ fifo_put(sc, PS2KC_ACK);
+ break;
+ case PS2KC_SEND_DEV_ID:
+ fifo_put(sc, PS2KC_ACK);
+ fifo_put(sc, 0xab);
+ fifo_put(sc, 0x83);
+ break;
+ case PS2KC_SET_SCANCODE_SET:
+ sc->curcmd = val;
+ fifo_put(sc, PS2KC_ACK);
+ break;
+ case PS2KC_ECHO:
+ fifo_put(sc, PS2KC_ECHO);
+ break;
+ case PS2KC_SET_LEDS:
+ sc->curcmd = val;
+ fifo_put(sc, PS2KC_ACK);
+ break;
+ default:
+ fprintf(stderr, "Unhandled ps2 keyboard command "
+ "0x%02x\n", val);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+/*
+ * Translate keysym to type 2 scancode and insert into keyboard buffer.
+ */
+static void
+ps2kbd_keysym_queue(struct ps2kbd_softc *sc,
+ int down, uint32_t keysym)
+{
+ /* ASCII to type 2 scancode lookup table */
+ const uint8_t translation[128] = {
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x29, 0x16, 0x52, 0x26, 0x25, 0x2e, 0x3d, 0x52,
+ 0x46, 0x45, 0x3e, 0x55, 0x41, 0x4e, 0x49, 0x4a,
+ 0x45, 0x16, 0x1e, 0x26, 0x25, 0x2e, 0x36, 0x3d,
+ 0x3e, 0x46, 0x4c, 0x4c, 0x41, 0x55, 0x49, 0x4a,
+ 0x1e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34,
+ 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44,
+ 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d,
+ 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x36, 0x4e,
+ 0x0e, 0x1c, 0x32, 0x21, 0x23, 0x24, 0x2b, 0x34,
+ 0x33, 0x43, 0x3b, 0x42, 0x4b, 0x3a, 0x31, 0x44,
+ 0x4d, 0x15, 0x2d, 0x1b, 0x2c, 0x3c, 0x2a, 0x1d,
+ 0x22, 0x35, 0x1a, 0x54, 0x5d, 0x5b, 0x0e, 0x00,
+ };
+
+ assert(pthread_mutex_isowned_np(&sc->mtx));
+
+ switch (keysym) {
+ case 0x0 ... 0x7f:
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, translation[keysym]);
+ break;
+ case 0xff08: /* Back space */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x66);
+ break;
+ case 0xff09: /* Tab */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x0d);
+ break;
+ case 0xff0d: /* Return */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x5a);
+ break;
+ case 0xff1b: /* Escape */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x76);
+ break;
+ case 0xff51: /* Left arrow */
+ fifo_put(sc, 0xe0);
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x6b);
+ break;
+ case 0xff52: /* Up arrow */
+ fifo_put(sc, 0xe0);
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x75);
+ break;
+ case 0xff53: /* Right arrow */
+ fifo_put(sc, 0xe0);
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x74);
+ break;
+ case 0xff54: /* Down arrow */
+ fifo_put(sc, 0xe0);
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x72);
+ break;
+ case 0xffbe: /* F1 */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x05);
+ break;
+ case 0xffbf: /* F2 */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x06);
+ break;
+ case 0xffc0: /* F3 */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x04);
+ break;
+ case 0xffc1: /* F4 */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x0c);
+ break;
+ case 0xffc2: /* F5 */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x03);
+ break;
+ case 0xffc3: /* F6 */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x0b);
+ break;
+ case 0xffc4: /* F7 */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x83);
+ break;
+ case 0xffc5: /* F8 */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x0a);
+ break;
+ case 0xffc6: /* F9 */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x01);
+ break;
+ case 0xffc7: /* F10 */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x09);
+ break;
+ case 0xffc8: /* F11 */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x78);
+ break;
+ case 0xffc9: /* F12 */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x07);
+ break;
+ case 0xffe1: /* Left shift */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x12);
+ break;
+ case 0xffe2: /* Right shift */
+ /* XXX */
+ break;
+ case 0xffe3: /* Left control */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x14);
+ break;
+ case 0xffe4: /* Right control */
+ /* XXX */
+ break;
+ case 0xffe7: /* Left meta */
+ /* XXX */
+ break;
+ case 0xffe8: /* Right meta */
+ /* XXX */
+ break;
+ case 0xffe9: /* Left alt */
+ if (!down)
+ fifo_put(sc, 0xf0);
+ fifo_put(sc, 0x11);
+ break;
+ case 0xffea: /* Right alt */
+ /* XXX */
+ break;
+ default:
+ fprintf(stderr, "Unhandled ps2 keyboard keysym 0x%x\n",
+ keysym);
+ break;
+ }
+}
+
+static void
+ps2kbd_event(int down, uint32_t keysym, void *arg)
+{
+ struct ps2kbd_softc *sc = arg;
+
+ pthread_mutex_lock(&sc->mtx);
+ if (!sc->enabled) {
+ pthread_mutex_unlock(&sc->mtx);
+ return;
+ }
+
+ ps2kbd_keysym_queue(sc, down, keysym);
+ pthread_mutex_unlock(&sc->mtx);
+
+ atkbdc_event(sc->atkbdc_sc);
+}
+
+struct ps2kbd_softc *
+ps2kbd_init(struct atkbdc_softc *atkbdc_sc)
+{
+ struct ps2kbd_softc *sc;
+
+ sc = calloc(1, sizeof (struct ps2kbd_softc));
+ pthread_mutex_init(&sc->mtx, NULL);
+ fifo_init(sc);
+ sc->atkbdc_sc = atkbdc_sc;
+
+ console_kbd_register(ps2kbd_event, sc);
+
+ return (sc);
+}
diff --git a/usr/src/cmd/bhyve/ps2kbd.h b/usr/src/cmd/bhyve/ps2kbd.h
new file mode 100644
index 0000000000..34c31b1ea8
--- /dev/null
+++ b/usr/src/cmd/bhyve/ps2kbd.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PS2KBD_H_
+#define _PS2KBD_H_
+
+struct atkbdc_softc;
+
+struct ps2kbd_softc *ps2kbd_init(struct atkbdc_softc *sc);
+
+int ps2kbd_read(struct ps2kbd_softc *sc, uint8_t *val);
+void ps2kbd_write(struct ps2kbd_softc *sc, uint8_t val);
+
+#endif /* _PS2KBD_H_ */
diff --git a/usr/src/cmd/bhyve/ps2mouse.c b/usr/src/cmd/bhyve/ps2mouse.c
new file mode 100644
index 0000000000..e96fbbf411
--- /dev/null
+++ b/usr/src/cmd/bhyve/ps2mouse.c
@@ -0,0 +1,371 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2015 Nahanni Systems Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/types.h>
+
+#include <assert.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <strings.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "atkbdc.h"
+#include "console.h"
+
+/* mouse device commands */
+#define PS2MC_RESET_DEV 0xff
+#define PS2MC_SET_DEFAULTS 0xf6
+#define PS2MC_DISABLE 0xf5
+#define PS2MC_ENABLE 0xf4
+#define PS2MC_SET_SAMPLING_RATE 0xf3
+#define PS2MC_SEND_DEV_ID 0xf2
+#define PS2MC_SET_REMOTE_MODE 0xf0
+#define PS2MC_SEND_DEV_DATA 0xeb
+#define PS2MC_SET_STREAM_MODE 0xea
+#define PS2MC_SEND_DEV_STATUS 0xe9
+#define PS2MC_SET_RESOLUTION 0xe8
+#define PS2MC_SET_SCALING1 0xe7
+#define PS2MC_SET_SCALING2 0xe6
+
+#define PS2MC_BAT_SUCCESS 0xaa
+#define PS2MC_ACK 0xfa
+
+/* mouse device id */
+#define PS2MOUSE_DEV_ID 0x0
+
+/* mouse status bits */
+#define PS2M_STS_REMOTE_MODE 0x40
+#define PS2M_STS_ENABLE_DEV 0x20
+#define PS2M_STS_SCALING_21 0x10
+#define PS2M_STS_MID_BUTTON 0x04
+#define PS2M_STS_RIGHT_BUTTON 0x02
+#define PS2M_STS_LEFT_BUTTON 0x01
+
+#define PS2MOUSE_FIFOSZ 16
+
+struct fifo {
+ uint8_t buf[PS2MOUSE_FIFOSZ];
+ int rindex; /* index to read from */
+ int windex; /* index to write to */
+ int num; /* number of bytes in the fifo */
+ int size; /* size of the fifo */
+};
+
+struct ps2mouse_softc {
+ struct atkbdc_softc *atkbdc_sc;
+ pthread_mutex_t mtx;
+
+ uint8_t status;
+ uint8_t resolution;
+ uint8_t sampling_rate;
+ struct fifo fifo;
+
+ uint8_t curcmd; /* current command for next byte */
+
+ int cur_x, cur_y;
+ int delta_x, delta_y;
+};
+
+static void
+fifo_init(struct ps2mouse_softc *sc)
+{
+ struct fifo *fifo;
+
+ fifo = &sc->fifo;
+ fifo->size = sizeof(((struct fifo *)0)->buf);
+}
+
+static void
+fifo_reset(struct ps2mouse_softc *sc)
+{
+ struct fifo *fifo;
+
+ fifo = &sc->fifo;
+ bzero(fifo, sizeof(struct fifo));
+ fifo->size = sizeof(((struct fifo *)0)->buf);
+}
+
+static void
+fifo_put(struct ps2mouse_softc *sc, uint8_t val)
+{
+ struct fifo *fifo;
+
+ fifo = &sc->fifo;
+ if (fifo->num < fifo->size) {
+ fifo->buf[fifo->windex] = val;
+ fifo->windex = (fifo->windex + 1) % fifo->size;
+ fifo->num++;
+ }
+}
+
+static int
+fifo_get(struct ps2mouse_softc *sc, uint8_t *val)
+{
+ struct fifo *fifo;
+
+ fifo = &sc->fifo;
+ if (fifo->num > 0) {
+ *val = fifo->buf[fifo->rindex];
+ fifo->rindex = (fifo->rindex + 1) % fifo->size;
+ fifo->num--;
+ return (0);
+ }
+
+ return (-1);
+}
+
+static void
+movement_reset(struct ps2mouse_softc *sc)
+{
+ assert(pthread_mutex_isowned_np(&sc->mtx));
+
+ sc->delta_x = 0;
+ sc->delta_y = 0;
+}
+
+static void
+movement_update(struct ps2mouse_softc *sc, int x, int y)
+{
+ sc->delta_x += x - sc->cur_x;
+ sc->delta_y += sc->cur_y - y;
+ sc->cur_x = x;
+ sc->cur_y = y;
+}
+
+static void
+movement_get(struct ps2mouse_softc *sc)
+{
+ uint8_t val0, val1, val2;
+
+ assert(pthread_mutex_isowned_np(&sc->mtx));
+
+ val0 = sc->status & (PS2M_STS_LEFT_BUTTON |
+ PS2M_STS_RIGHT_BUTTON | PS2M_STS_MID_BUTTON);
+
+ if (sc->delta_x >= 0) {
+ if (sc->delta_x > 255) {
+ val0 |= (1 << 6);
+ val1 = 255;
+ } else
+ val1 = sc->delta_x;
+ } else {
+ val0 |= (1 << 4);
+ if (sc->delta_x < -255) {
+ val0 |= (1 << 6);
+ val1 = 255;
+ } else
+ val1 = sc->delta_x;
+ }
+ sc->delta_x = 0;
+
+ if (sc->delta_y >= 0) {
+ if (sc->delta_y > 255) {
+ val0 |= (1 << 7);
+ val2 = 255;
+ } else
+ val2 = sc->delta_y;
+ } else {
+ val0 |= (1 << 5);
+ if (sc->delta_y < -255) {
+ val0 |= (1 << 7);
+ val2 = 255;
+ } else
+ val2 = sc->delta_y;
+ }
+ sc->delta_y = 0;
+
+ fifo_put(sc, val0);
+ fifo_put(sc, val1);
+ fifo_put(sc, val2);
+}
+
+static void
+ps2mouse_reset(struct ps2mouse_softc *sc)
+{
+ assert(pthread_mutex_isowned_np(&sc->mtx));
+ fifo_reset(sc);
+ movement_reset(sc);
+ sc->status = 0x8;
+ sc->resolution = 4;
+ sc->sampling_rate = 100;
+
+ sc->cur_x = 0;
+ sc->cur_y = 0;
+ sc->delta_x = 0;
+ sc->delta_y = 0;
+}
+
+int
+ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val)
+{
+ int retval;
+
+ pthread_mutex_lock(&sc->mtx);
+ retval = fifo_get(sc, val);
+ pthread_mutex_unlock(&sc->mtx);
+
+ return (retval);
+}
+
+void
+ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val)
+{
+ pthread_mutex_lock(&sc->mtx);
+ if (sc->curcmd) {
+ switch (sc->curcmd) {
+ case PS2MC_SET_SAMPLING_RATE:
+ sc->sampling_rate = val;
+ fifo_put(sc, PS2MC_ACK);
+ break;
+ case PS2MC_SET_RESOLUTION:
+ sc->resolution = val;
+ fifo_put(sc, PS2MC_ACK);
+ break;
+ default:
+ fprintf(stderr, "Unhandled ps2 mouse current "
+ "command byte 0x%02x\n", val);
+ break;
+ }
+ sc->curcmd = 0;
+ } else {
+ switch (val) {
+ case PS2MC_RESET_DEV:
+ ps2mouse_reset(sc);
+ fifo_put(sc, PS2MC_ACK);
+ fifo_put(sc, PS2MC_BAT_SUCCESS);
+ fifo_put(sc, PS2MOUSE_DEV_ID);
+ break;
+ case PS2MC_SET_DEFAULTS:
+ ps2mouse_reset(sc);
+ fifo_put(sc, PS2MC_ACK);
+ break;
+ case PS2MC_DISABLE:
+ fifo_reset(sc);
+ sc->status &= ~PS2M_STS_ENABLE_DEV;
+ fifo_put(sc, PS2MC_ACK);
+ break;
+ case PS2MC_ENABLE:
+ fifo_reset(sc);
+ sc->status |= PS2M_STS_ENABLE_DEV;
+ fifo_put(sc, PS2MC_ACK);
+ break;
+ case PS2MC_SET_SAMPLING_RATE:
+ sc->curcmd = val;
+ fifo_put(sc, PS2MC_ACK);
+ break;
+ case PS2MC_SEND_DEV_ID:
+ fifo_put(sc, PS2MC_ACK);
+ fifo_put(sc, PS2MOUSE_DEV_ID);
+ break;
+ case PS2MC_SET_REMOTE_MODE:
+ sc->status |= PS2M_STS_REMOTE_MODE;
+ fifo_put(sc, PS2MC_ACK);
+ break;
+ case PS2MC_SEND_DEV_DATA:
+ fifo_put(sc, PS2MC_ACK);
+ movement_get(sc);
+ break;
+ case PS2MC_SET_STREAM_MODE:
+ sc->status &= ~PS2M_STS_REMOTE_MODE;
+ fifo_put(sc, PS2MC_ACK);
+ break;
+ case PS2MC_SEND_DEV_STATUS:
+ fifo_put(sc, PS2MC_ACK);
+ fifo_put(sc, sc->status);
+ fifo_put(sc, sc->resolution);
+ fifo_put(sc, sc->sampling_rate);
+ break;
+ case PS2MC_SET_RESOLUTION:
+ sc->curcmd = val;
+ fifo_put(sc, PS2MC_ACK);
+ break;
+ case PS2MC_SET_SCALING1:
+ case PS2MC_SET_SCALING2:
+ fifo_put(sc, PS2MC_ACK);
+ break;
+ default:
+ fprintf(stderr, "Unhandled ps2 mouse command "
+ "0x%02x\n", val);
+ break;
+ }
+ }
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+static void
+ps2mouse_event(uint8_t button, int x, int y, void *arg)
+{
+ struct ps2mouse_softc *sc = arg;
+
+ pthread_mutex_lock(&sc->mtx);
+ movement_update(sc, x, y);
+
+ sc->status &= ~(PS2M_STS_LEFT_BUTTON |
+ PS2M_STS_RIGHT_BUTTON | PS2M_STS_MID_BUTTON);
+ if (button & (1 << 0))
+ sc->status |= PS2M_STS_LEFT_BUTTON;
+ if (button & (1 << 1))
+ sc->status |= PS2M_STS_MID_BUTTON;
+ if (button & (1 << 2))
+ sc->status |= PS2M_STS_RIGHT_BUTTON;
+
+ if ((sc->status & PS2M_STS_ENABLE_DEV) == 0) {
+ /* no data reporting */
+ pthread_mutex_unlock(&sc->mtx);
+ return;
+ }
+
+ movement_get(sc);
+ pthread_mutex_unlock(&sc->mtx);
+
+ atkbdc_event(sc->atkbdc_sc);
+}
+
+struct ps2mouse_softc *
+ps2mouse_init(struct atkbdc_softc *atkbdc_sc)
+{
+ struct ps2mouse_softc *sc;
+
+ sc = calloc(1, sizeof (struct ps2mouse_softc));
+ pthread_mutex_init(&sc->mtx, NULL);
+ fifo_init(sc);
+ sc->atkbdc_sc = atkbdc_sc;
+
+ pthread_mutex_lock(&sc->mtx);
+ ps2mouse_reset(sc);
+ pthread_mutex_unlock(&sc->mtx);
+
+ console_ptr_register(ps2mouse_event, sc);
+
+ return (sc);
+}
+
diff --git a/usr/src/cmd/bhyve/ps2mouse.h b/usr/src/cmd/bhyve/ps2mouse.h
new file mode 100644
index 0000000000..1a78934b98
--- /dev/null
+++ b/usr/src/cmd/bhyve/ps2mouse.h
@@ -0,0 +1,39 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _PS2MOUSE_H_
+#define _PS2MOUSE_H_
+
+struct atkbdc_softc;
+
+struct ps2mouse_softc *ps2mouse_init(struct atkbdc_softc *sc);
+
+int ps2mouse_read(struct ps2mouse_softc *sc, uint8_t *val);
+void ps2mouse_write(struct ps2mouse_softc *sc, uint8_t val);
+
+#endif /* _PS2MOUSE_H_ */
diff --git a/usr/src/cmd/bhyve/rfb.c b/usr/src/cmd/bhyve/rfb.c
new file mode 100644
index 0000000000..0846316378
--- /dev/null
+++ b/usr/src/cmd/bhyve/rfb.c
@@ -0,0 +1,420 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2015 Nahanni Systems Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/socket.h>
+#include <netinet/in.h>
+
+#include <assert.h>
+#include <pthread.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+
+#include "bhyvegc.h"
+#include "console.h"
+#include "rfb.h"
+
+struct rfb_softc {
+ int sfd;
+ pthread_t tid;
+
+ int width, height;
+
+ bool enc_raw_ok;
+ bool enc_resize_ok;
+};
+
+struct rfb_pixfmt {
+ uint8_t bpp;
+ uint8_t depth;
+ uint8_t bigendian;
+ uint8_t truecolor;
+ uint16_t red_max;
+ uint16_t green_max;
+ uint16_t blue_max;
+ uint8_t red_shift;
+ uint8_t green_shift;
+ uint8_t blue_shift;
+ uint8_t pad[3];
+};
+
+struct rfb_srvr_info {
+ uint16_t width;
+ uint16_t height;
+ struct rfb_pixfmt pixfmt;
+ uint32_t namelen;
+};
+
+struct rfb_pixfmt_msg {
+ uint8_t type;
+ uint8_t pad[3];
+ struct rfb_pixfmt pixfmt;
+};
+
+#define RFB_ENCODING_RAW 0
+#define RFB_ENCODING_RESIZE -223
+
+struct rfb_enc_msg {
+ uint8_t type;
+ uint8_t pad;
+ uint16_t numencs;
+};
+
+struct rfb_updt_msg {
+ uint8_t type;
+ uint8_t incremental;
+ uint16_t x;
+ uint16_t y;
+ uint16_t width;
+ uint16_t height;
+};
+
+struct rfb_key_msg {
+ uint8_t type;
+ uint8_t down;
+ uint16_t pad;
+ uint32_t code;
+};
+
+struct rfb_ptr_msg {
+ uint8_t type;
+ uint8_t button;
+ uint16_t x;
+ uint16_t y;
+};
+
+struct rfb_srvr_updt_msg {
+ uint8_t type;
+ uint8_t pad;
+ uint16_t numrects;
+};
+
+struct rfb_srvr_rect_hdr {
+ uint16_t x;
+ uint16_t y;
+ uint16_t width;
+ uint16_t height;
+ uint32_t encoding;
+};
+
+static void
+rfb_send_server_init_msg(int cfd)
+{
+ struct bhyvegc_image *gc_image;
+ struct rfb_srvr_info sinfo;
+ int len;
+
+ gc_image = console_get_image();
+
+ sinfo.width = ntohs(gc_image->width);
+ sinfo.height = ntohs(gc_image->height);
+ sinfo.pixfmt.bpp = 32;
+ sinfo.pixfmt.depth = 32;
+ sinfo.pixfmt.bigendian = 0;
+ sinfo.pixfmt.truecolor = 1;
+ sinfo.pixfmt.red_max = ntohs(255);
+ sinfo.pixfmt.green_max = ntohs(255);
+ sinfo.pixfmt.blue_max = ntohs(255);
+ sinfo.pixfmt.red_shift = 16;
+ sinfo.pixfmt.green_shift = 8;
+ sinfo.pixfmt.blue_shift = 0;
+ sinfo.namelen = ntohl(strlen("bhyve"));
+ len = write(cfd, &sinfo, sizeof(sinfo));
+ len = write(cfd, "bhyve", strlen("bhyve"));
+}
+
+static void
+rfb_send_resize_update_msg(struct rfb_softc *rc, int cfd)
+{
+ struct rfb_srvr_updt_msg supdt_msg;
+ struct rfb_srvr_rect_hdr srect_hdr;
+
+ /* Number of rectangles: 1 */
+ supdt_msg.type = 0;
+ supdt_msg.pad = 0;
+ supdt_msg.numrects = ntohs(1);
+ write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg));
+
+ /* Rectangle header */
+ srect_hdr.x = ntohs(0);
+ srect_hdr.y = ntohs(0);
+ srect_hdr.width = ntohs(rc->width);
+ srect_hdr.height = ntohs(rc->height);
+ srect_hdr.encoding = ntohl(RFB_ENCODING_RESIZE);
+ write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr));
+}
+
+static void
+rfb_recv_set_pixfmt_msg(struct rfb_softc *rc, int cfd)
+{
+ struct rfb_pixfmt_msg pixfmt_msg;
+ int len;
+
+ len = read(cfd, ((void *)&pixfmt_msg) + 1, sizeof(pixfmt_msg) - 1);
+}
+
+
+static void
+rfb_recv_set_encodings_msg(struct rfb_softc *rc, int cfd)
+{
+ struct rfb_enc_msg enc_msg;
+ int len, i;
+ uint32_t encoding;
+
+ assert((sizeof(enc_msg) - 1) == 3);
+ len = read(cfd, ((void *)&enc_msg) + 1, sizeof(enc_msg) - 1);
+
+ for (i = 0; i < ntohs(enc_msg.numencs); i++) {
+ len = read(cfd, &encoding, sizeof(encoding));
+ switch (ntohl(encoding)) {
+ case RFB_ENCODING_RAW:
+ rc->enc_raw_ok = true;
+ break;
+ case RFB_ENCODING_RESIZE:
+ rc->enc_resize_ok = true;
+ break;
+ }
+ }
+}
+
+static void
+rfb_resize_update(struct rfb_softc *rc, int fd)
+{
+ struct rfb_srvr_updt_msg supdt_msg;
+ struct rfb_srvr_rect_hdr srect_hdr;
+
+ /* Number of rectangles: 1 */
+ supdt_msg.type = 0;
+ supdt_msg.pad = 0;
+ supdt_msg.numrects = ntohs(1);
+ write(fd, &supdt_msg, sizeof (struct rfb_srvr_updt_msg));
+
+ /* Rectangle header */
+ srect_hdr.x = ntohs(0);
+ srect_hdr.y = ntohs(0);
+ srect_hdr.width = ntohs(rc->width);
+ srect_hdr.height = ntohs(rc->height);
+ srect_hdr.encoding = ntohl(RFB_ENCODING_RESIZE);
+ write(fd, &srect_hdr, sizeof (struct rfb_srvr_rect_hdr));
+}
+
+static void
+rfb_recv_update_msg(struct rfb_softc *rc, int cfd)
+{
+ struct rfb_updt_msg updt_msg;
+ struct rfb_srvr_updt_msg supdt_msg;
+ struct rfb_srvr_rect_hdr srect_hdr;
+ struct bhyvegc_image *gc_image;
+ int len;
+
+ len = read(cfd, ((void *)&updt_msg) + 1 , sizeof(updt_msg) - 1);
+
+ console_refresh();
+ gc_image = console_get_image();
+
+ if (rc->width != gc_image->width || rc->height != gc_image->height) {
+ rc->width = gc_image->width;
+ rc->height = gc_image->height;
+ rfb_send_resize_update_msg(rc, cfd);
+ }
+
+ /*
+ * Send the whole thing
+ */
+ /* Number of rectangles: 1 */
+ supdt_msg.type = 0;
+ supdt_msg.pad = 0;
+ supdt_msg.numrects = ntohs(1);
+ write(cfd, &supdt_msg, sizeof(struct rfb_srvr_updt_msg));
+
+ /* Rectangle header */
+ srect_hdr.x = ntohs(0);
+ srect_hdr.y = ntohs(0);
+ srect_hdr.width = ntohs(gc_image->width);
+ srect_hdr.height = ntohs(gc_image->height);
+ srect_hdr.encoding = ntohl(0); /* raw */
+ write(cfd, &srect_hdr, sizeof(struct rfb_srvr_rect_hdr));
+
+ write(cfd, gc_image->data, gc_image->width * gc_image->height *
+ sizeof(uint32_t));
+}
+
+static void
+rfb_recv_key_msg(struct rfb_softc *rc, int cfd)
+{
+ struct rfb_key_msg key_msg;
+ int len;
+
+ len = read(cfd, ((void *)&key_msg) + 1, sizeof(key_msg) - 1);
+
+ console_key_event(key_msg.down, ntohl(key_msg.code));
+}
+
+static void
+rfb_recv_ptr_msg(struct rfb_softc *rc, int cfd)
+{
+ struct rfb_ptr_msg ptr_msg;
+ int len;
+
+ len = read(cfd, ((void *)&ptr_msg) + 1, sizeof(ptr_msg) - 1);
+
+ console_ptr_event(ptr_msg.button, ntohs(ptr_msg.x), ntohs(ptr_msg.y));
+}
+
+void
+rfb_handle(struct rfb_softc *rc, int cfd)
+{
+ const char *vbuf = "RFB 003.008\n";
+ unsigned char buf[80];
+ int len;
+ uint32_t sres;
+
+ /* 1a. Send server version */
+ printf("server vers write: (%s), %d bytes\n", vbuf, (int) strlen(vbuf));
+ write(cfd, vbuf, strlen(vbuf));
+
+ /* 1b. Read client version */
+ len = read(cfd, buf, sizeof(buf));
+
+ /* 2a. Send security type 'none' */
+ buf[0] = 1;
+ buf[1] = 1; /* none */
+ write(cfd, buf, 2);
+
+ /* 2b. Read agreed security type */
+ len = read(cfd, buf, 1);
+
+ /* 2c. Write back a status of 0 */
+ sres = 0;
+ write(cfd, &sres, 4);
+
+ /* 3a. Read client shared-flag byte */
+ len = read(cfd, buf, 1);
+
+ /* 4a. Write server-init info */
+ rfb_send_server_init_msg(cfd);
+
+ /* Now read in client requests. 1st byte identifies type */
+ for (;;) {
+ len = read(cfd, buf, 1);
+ if (len <= 0) {
+ printf("exiting\n");
+ break;
+ }
+
+ switch (buf[0]) {
+ case 0:
+ rfb_recv_set_pixfmt_msg(rc, cfd);
+ break;
+ case 2:
+ rfb_recv_set_encodings_msg(rc, cfd);
+ break;
+ case 3:
+ rfb_recv_update_msg(rc, cfd);
+ break;
+ case 4:
+ rfb_recv_key_msg(rc, cfd);
+ break;
+ case 5:
+ rfb_recv_ptr_msg(rc, cfd);
+ break;
+ default:
+ printf("unknown client code!\n");
+ exit(1);
+ }
+ }
+}
+
+static void *
+rfb_thr(void *arg)
+{
+ struct rfb_softc *rc;
+ sigset_t set;
+
+ int cfd;
+
+ rc = arg;
+
+ sigemptyset(&set);
+ sigaddset(&set, SIGPIPE);
+ if (pthread_sigmask(SIG_BLOCK, &set, NULL) != 0) {
+ perror("pthread_sigmask");
+ return (NULL);
+ }
+
+ for (;;) {
+ cfd = accept(rc->sfd, NULL, NULL);
+ rfb_handle(rc, cfd);
+ }
+
+ /* NOTREACHED */
+ return (NULL);
+}
+
+int
+rfb_init(int port)
+{
+ struct rfb_softc *rc;
+ struct sockaddr_in sin;
+ int on = 1;
+
+ rc = calloc(1, sizeof(struct rfb_softc));
+
+ rc->sfd = socket(AF_INET, SOCK_STREAM, 0);
+ if (rc->sfd < 0) {
+ perror("socket");
+ return (-1);
+ }
+
+ setsockopt(rc->sfd, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on));
+
+#ifdef __FreeBSD__
+ sin.sin_len = sizeof(sin);
+#endif
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = htonl(INADDR_ANY);
+ sin.sin_port = htons(port);
+ if (bind(rc->sfd, (struct sockaddr *)&sin, sizeof(sin)) < 0) {
+ perror("bind");
+ return (-1);
+ }
+
+ if (listen(rc->sfd, 1) < 0) {
+ perror("listen");
+ return (-1);
+ }
+
+ pthread_create(&rc->tid, NULL, rfb_thr, rc);
+
+ return (0);
+}
diff --git a/usr/src/cmd/bhyve/rfb.h b/usr/src/cmd/bhyve/rfb.h
new file mode 100644
index 0000000000..5504c333ab
--- /dev/null
+++ b/usr/src/cmd/bhyve/rfb.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _RFB_H_
+#define _RFB_H_
+
+#define RFB_PORT 5900
+
+int rfb_init(int port);
+
+#endif /* _RFB_H_ */
diff --git a/usr/src/cmd/bhyve/rtc.c b/usr/src/cmd/bhyve/rtc.c
new file mode 100644
index 0000000000..5ab78e060f
--- /dev/null
+++ b/usr/src/cmd/bhyve/rtc.c
@@ -0,0 +1,380 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/rtc.c 260206 2014-01-02 21:26:59Z jhb $");
+
+#include <sys/types.h>
+#include <sys/time.h>
+
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "acpi.h"
+#include "inout.h"
+#include "pci_lpc.h"
+#include "rtc.h"
+
+#define IO_RTC 0x70
+
+#define RTC_SEC 0x00 /* seconds */
+#define RTC_SEC_ALARM 0x01
+#define RTC_MIN 0x02
+#define RTC_MIN_ALARM 0x03
+#define RTC_HRS 0x04
+#define RTC_HRS_ALARM 0x05
+#define RTC_WDAY 0x06
+#define RTC_DAY 0x07
+#define RTC_MONTH 0x08
+#define RTC_YEAR 0x09
+#define RTC_CENTURY 0x32 /* current century */
+
+#define RTC_STATUSA 0xA
+#define RTCSA_TUP 0x80 /* time update, don't look now */
+
+#define RTC_STATUSB 0xB
+#define RTCSB_DST 0x01
+#define RTCSB_24HR 0x02
+#define RTCSB_BIN 0x04 /* 0 = BCD, 1 = Binary */
+#define RTCSB_PINTR 0x40 /* 1 = enable periodic clock interrupt */
+#define RTCSB_HALT 0x80 /* stop clock updates */
+
+#define RTC_INTR 0x0c /* status register C (R) interrupt source */
+
+#define RTC_STATUSD 0x0d /* status register D (R) Lost Power */
+#define RTCSD_PWR 0x80 /* clock power OK */
+
+#define RTC_NVRAM_START 0x0e
+#define RTC_NVRAM_END 0x7f
+#define RTC_NVRAM_SZ (128 - RTC_NVRAM_START)
+#define nvoff(x) ((x) - RTC_NVRAM_START)
+
+#define RTC_DIAG 0x0e
+#define RTC_RSTCODE 0x0f
+#define RTC_EQUIPMENT 0x14
+#define RTC_LMEM_LSB 0x34
+#define RTC_LMEM_MSB 0x35
+#define RTC_HMEM_LSB 0x5b
+#define RTC_HMEM_SB 0x5c
+#define RTC_HMEM_MSB 0x5d
+
+#define m_64KB (64*1024)
+#define m_16MB (16*1024*1024)
+#define m_4GB (4ULL*1024*1024*1024)
+
+static int addr;
+
+static uint8_t rtc_nvram[RTC_NVRAM_SZ];
+
+/* XXX initialize these to default values as they would be from BIOS */
+static uint8_t status_a, status_b;
+
+static struct {
+ uint8_t hours;
+ uint8_t mins;
+ uint8_t secs;
+} rtc_alarm;
+
+static u_char const bin2bcd_data[] = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
+ 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19,
+ 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29,
+ 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
+ 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
+ 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
+ 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
+ 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
+ 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99
+};
+#define bin2bcd(bin) (bin2bcd_data[bin])
+
+#define rtcout(val) ((status_b & RTCSB_BIN) ? (val) : bin2bcd((val)))
+
+static void
+timevalfix(struct timeval *t1)
+{
+
+ if (t1->tv_usec < 0) {
+ t1->tv_sec--;
+ t1->tv_usec += 1000000;
+ }
+ if (t1->tv_usec >= 1000000) {
+ t1->tv_sec++;
+ t1->tv_usec -= 1000000;
+ }
+}
+
+static void
+timevalsub(struct timeval *t1, const struct timeval *t2)
+{
+
+ t1->tv_sec -= t2->tv_sec;
+ t1->tv_usec -= t2->tv_usec;
+ timevalfix(t1);
+}
+
+static int
+rtc_addr_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ if (bytes != 1)
+ return (-1);
+
+ if (in) {
+ /* straight read of this register will return 0xFF */
+ *eax = 0xff;
+ return (0);
+ }
+
+ switch (*eax & 0x7f) {
+ case RTC_SEC:
+ case RTC_SEC_ALARM:
+ case RTC_MIN:
+ case RTC_MIN_ALARM:
+ case RTC_HRS:
+ case RTC_HRS_ALARM:
+ case RTC_WDAY:
+ case RTC_DAY:
+ case RTC_MONTH:
+ case RTC_YEAR:
+ case RTC_STATUSA:
+ case RTC_STATUSB:
+ case RTC_INTR:
+ case RTC_STATUSD:
+ case RTC_NVRAM_START ... RTC_NVRAM_END:
+ break;
+ default:
+ return (-1);
+ }
+
+ addr = *eax & 0x7f;
+ return (0);
+}
+
+static int
+rtc_data_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ int hour;
+ time_t t;
+ struct timeval cur, delta;
+
+ static struct timeval last;
+ static struct tm tm;
+
+ if (bytes != 1)
+ return (-1);
+
+ gettimeofday(&cur, NULL);
+
+ /*
+ * Increment the cached time only once per second so we can guarantee
+ * that the guest has at least one second to read the hour:min:sec
+ * separately and still get a coherent view of the time.
+ */
+ delta = cur;
+ timevalsub(&delta, &last);
+ if (delta.tv_sec >= 1 && (status_b & RTCSB_HALT) == 0) {
+ t = cur.tv_sec;
+ localtime_r(&t, &tm);
+ last = cur;
+ }
+
+ if (in) {
+ switch (addr) {
+ case RTC_SEC_ALARM:
+ *eax = rtc_alarm.secs;
+ break;
+ case RTC_MIN_ALARM:
+ *eax = rtc_alarm.mins;
+ break;
+ case RTC_HRS_ALARM:
+ *eax = rtc_alarm.hours;
+ break;
+ case RTC_SEC:
+ *eax = rtcout(tm.tm_sec);
+ return (0);
+ case RTC_MIN:
+ *eax = rtcout(tm.tm_min);
+ return (0);
+ case RTC_HRS:
+ if (status_b & RTCSB_24HR)
+ hour = tm.tm_hour;
+ else
+ hour = (tm.tm_hour % 12) + 1;
+
+ *eax = rtcout(hour);
+
+ /*
+ * If we are representing time in the 12-hour format
+ * then set the MSB to indicate PM.
+ */
+ if ((status_b & RTCSB_24HR) == 0 && tm.tm_hour >= 12)
+ *eax |= 0x80;
+
+ return (0);
+ case RTC_WDAY:
+ *eax = rtcout(tm.tm_wday + 1);
+ return (0);
+ case RTC_DAY:
+ *eax = rtcout(tm.tm_mday);
+ return (0);
+ case RTC_MONTH:
+ *eax = rtcout(tm.tm_mon + 1);
+ return (0);
+ case RTC_YEAR:
+ *eax = rtcout(tm.tm_year % 100);
+ return (0);
+ case RTC_STATUSA:
+ *eax = status_a;
+ return (0);
+ case RTC_STATUSB:
+ *eax = status_b;
+ return (0);
+ case RTC_INTR:
+ *eax = 0;
+ return (0);
+ case RTC_STATUSD:
+ *eax = RTCSD_PWR;
+ return (0);
+ case RTC_NVRAM_START ... RTC_NVRAM_END:
+ *eax = rtc_nvram[addr - RTC_NVRAM_START];
+ return (0);
+ default:
+ return (-1);
+ }
+ }
+
+ switch (addr) {
+ case RTC_STATUSA:
+ status_a = *eax & ~RTCSA_TUP;
+ break;
+ case RTC_STATUSB:
+ /* XXX not implemented yet XXX */
+ if (*eax & RTCSB_PINTR)
+ return (-1);
+ status_b = *eax;
+ break;
+ case RTC_STATUSD:
+ /* ignore write */
+ break;
+ case RTC_SEC_ALARM:
+ rtc_alarm.secs = *eax;
+ break;
+ case RTC_MIN_ALARM:
+ rtc_alarm.mins = *eax;
+ break;
+ case RTC_HRS_ALARM:
+ rtc_alarm.hours = *eax;
+ break;
+ case RTC_SEC:
+ case RTC_MIN:
+ case RTC_HRS:
+ case RTC_WDAY:
+ case RTC_DAY:
+ case RTC_MONTH:
+ case RTC_YEAR:
+ /*
+ * Ignore writes to the time of day registers
+ */
+ break;
+ case RTC_NVRAM_START ... RTC_NVRAM_END:
+ rtc_nvram[addr - RTC_NVRAM_START] = *eax;
+ break;
+ default:
+ return (-1);
+ }
+ return (0);
+}
+
+void
+rtc_init(struct vmctx *ctx)
+{
+ struct timeval cur;
+ struct tm tm;
+ size_t himem;
+ size_t lomem;
+ int err;
+
+ err = gettimeofday(&cur, NULL);
+ assert(err == 0);
+ (void) localtime_r(&cur.tv_sec, &tm);
+
+ memset(rtc_nvram, 0, sizeof(rtc_nvram));
+
+ rtc_nvram[nvoff(RTC_CENTURY)] = bin2bcd((tm.tm_year + 1900) / 100);
+
+ /* XXX init diag/reset code/equipment/checksum ? */
+
+ /*
+ * Report guest memory size in nvram cells as required by UEFI.
+ * Little-endian encoding.
+ * 0x34/0x35 - 64KB chunks above 16MB, below 4GB
+ * 0x5b/0x5c/0x5d - 64KB chunks above 4GB
+ */
+ lomem = (vm_get_lowmem_size(ctx) - m_16MB) / m_64KB;
+ rtc_nvram[nvoff(RTC_LMEM_LSB)] = lomem;
+ rtc_nvram[nvoff(RTC_LMEM_MSB)] = lomem >> 8;
+
+ himem = vm_get_highmem_size(ctx) / m_64KB;
+ rtc_nvram[nvoff(RTC_HMEM_LSB)] = himem;
+ rtc_nvram[nvoff(RTC_HMEM_SB)] = himem >> 8;
+ rtc_nvram[nvoff(RTC_HMEM_MSB)] = himem >> 16;
+}
+
+INOUT_PORT(rtc, IO_RTC, IOPORT_F_INOUT, rtc_addr_handler);
+INOUT_PORT(rtc, IO_RTC + 1, IOPORT_F_INOUT, rtc_data_handler);
+
+#ifdef __FreeBSD__
+static void
+rtc_dsdt(void)
+{
+
+ dsdt_line("");
+ dsdt_line("Device (RTC)");
+ dsdt_line("{");
+ dsdt_line(" Name (_HID, EisaId (\"PNP0B00\"))");
+ dsdt_line(" Name (_CRS, ResourceTemplate ()");
+ dsdt_line(" {");
+ dsdt_indent(2);
+ dsdt_fixed_ioport(IO_RTC, 2);
+ dsdt_fixed_irq(8);
+ dsdt_unindent(2);
+ dsdt_line(" })");
+ dsdt_line("}");
+}
+LPC_DSDT(rtc_dsdt);
+#endif
+
+SYSRES_IO(0x72, 6);
diff --git a/usr/src/cmd/bhyve/rtc.h b/usr/src/cmd/bhyve/rtc.h
new file mode 100644
index 0000000000..6406d24c37
--- /dev/null
+++ b/usr/src/cmd/bhyve/rtc.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2013 Peter Grehan <grehan@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/rtc.h 253181 2013-07-11 03:54:35Z grehan $
+ */
+
+#ifndef _RTC_H_
+#define _RTC_H_
+
+void rtc_init(struct vmctx *ctx);
+
+#endif /* _RTC_H_ */
diff --git a/usr/src/cmd/bhyve/smbiostbl.c b/usr/src/cmd/bhyve/smbiostbl.c
new file mode 100644
index 0000000000..7ba0f0dfa0
--- /dev/null
+++ b/usr/src/cmd/bhyve/smbiostbl.c
@@ -0,0 +1,827 @@
+/*-
+ * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/smbiostbl.c 272007 2014-09-23 01:17:22Z grehan $");
+
+#include <sys/param.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <md5.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <uuid.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "bhyverun.h"
+#include "smbiostbl.h"
+
+#define MB (1024*1024)
+#define GB (1024ULL*1024*1024)
+
+#define SMBIOS_BASE 0xF1000
+
+/* BHYVE_ACPI_BASE - SMBIOS_BASE) */
+#define SMBIOS_MAX_LENGTH (0xF2400 - 0xF1000)
+
+#define SMBIOS_TYPE_BIOS 0
+#define SMBIOS_TYPE_SYSTEM 1
+#define SMBIOS_TYPE_CHASSIS 3
+#define SMBIOS_TYPE_PROCESSOR 4
+#define SMBIOS_TYPE_MEMARRAY 16
+#define SMBIOS_TYPE_MEMDEVICE 17
+#define SMBIOS_TYPE_MEMARRAYMAP 19
+#define SMBIOS_TYPE_BOOT 32
+#define SMBIOS_TYPE_EOT 127
+
+struct smbios_structure {
+ uint8_t type;
+ uint8_t length;
+ uint16_t handle;
+} __packed;
+
+typedef int (*initializer_func_t)(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+struct smbios_template_entry {
+ struct smbios_structure *entry;
+ const char **strings;
+ initializer_func_t initializer;
+};
+
+/*
+ * SMBIOS Structure Table Entry Point
+ */
+#define SMBIOS_ENTRY_EANCHOR "_SM_"
+#define SMBIOS_ENTRY_EANCHORLEN 4
+#define SMBIOS_ENTRY_IANCHOR "_DMI_"
+#define SMBIOS_ENTRY_IANCHORLEN 5
+
+struct smbios_entry_point {
+ char eanchor[4]; /* anchor tag */
+ uint8_t echecksum; /* checksum of entry point structure */
+ uint8_t eplen; /* length in bytes of entry point */
+ uint8_t major; /* major version of the SMBIOS spec */
+ uint8_t minor; /* minor version of the SMBIOS spec */
+ uint16_t maxssize; /* maximum size in bytes of a struct */
+ uint8_t revision; /* entry point structure revision */
+ uint8_t format[5]; /* entry point rev-specific data */
+ char ianchor[5]; /* intermediate anchor tag */
+ uint8_t ichecksum; /* intermediate checksum */
+ uint16_t stlen; /* len in bytes of structure table */
+ uint32_t staddr; /* physical addr of structure table */
+ uint16_t stnum; /* number of structure table entries */
+ uint8_t bcdrev; /* BCD value representing DMI ver */
+} __packed;
+
+/*
+ * BIOS Information
+ */
+#define SMBIOS_FL_ISA 0x00000010 /* ISA is supported */
+#define SMBIOS_FL_PCI 0x00000080 /* PCI is supported */
+#define SMBIOS_FL_SHADOW 0x00001000 /* BIOS shadowing is allowed */
+#define SMBIOS_FL_CDBOOT 0x00008000 /* Boot from CD is supported */
+#define SMBIOS_FL_SELBOOT 0x00010000 /* Selectable Boot supported */
+#define SMBIOS_FL_EDD 0x00080000 /* EDD Spec is supported */
+
+#define SMBIOS_XB1_FL_ACPI 0x00000001 /* ACPI is supported */
+
+#define SMBIOS_XB2_FL_BBS 0x00000001 /* BIOS Boot Specification */
+#define SMBIOS_XB2_FL_VM 0x00000010 /* Virtual Machine */
+
+struct smbios_table_type0 {
+ struct smbios_structure header;
+ uint8_t vendor; /* vendor string */
+ uint8_t version; /* version string */
+ uint16_t segment; /* address segment location */
+ uint8_t rel_date; /* release date */
+ uint8_t size; /* rom size */
+ uint64_t cflags; /* characteristics */
+ uint8_t xc_bytes[2]; /* characteristics ext bytes */
+ uint8_t sb_major_rel; /* system bios version */
+ uint8_t sb_minor_rele;
+ uint8_t ecfw_major_rel; /* embedded ctrl fw version */
+ uint8_t ecfw_minor_rel;
+} __packed;
+
+/*
+ * System Information
+ */
+#define SMBIOS_WAKEUP_SWITCH 0x06 /* power switch */
+
+struct smbios_table_type1 {
+ struct smbios_structure header;
+ uint8_t manufacturer; /* manufacturer string */
+ uint8_t product; /* product name string */
+ uint8_t version; /* version string */
+ uint8_t serial; /* serial number string */
+ uint8_t uuid[16]; /* uuid byte array */
+ uint8_t wakeup; /* wake-up event */
+ uint8_t sku; /* sku number string */
+ uint8_t family; /* family name string */
+} __packed;
+
+/*
+ * System Enclosure or Chassis
+ */
+#define SMBIOS_CHT_UNKNOWN 0x02 /* unknown */
+
+#define SMBIOS_CHST_SAFE 0x03 /* safe */
+
+#define SMBIOS_CHSC_NONE 0x03 /* none */
+
+struct smbios_table_type3 {
+ struct smbios_structure header;
+ uint8_t manufacturer; /* manufacturer string */
+ uint8_t type; /* type */
+ uint8_t version; /* version string */
+ uint8_t serial; /* serial number string */
+ uint8_t asset; /* asset tag string */
+ uint8_t bustate; /* boot-up state */
+ uint8_t psstate; /* power supply state */
+ uint8_t tstate; /* thermal state */
+ uint8_t security; /* security status */
+ uint8_t uheight; /* height in 'u's */
+ uint8_t cords; /* number of power cords */
+ uint8_t elems; /* number of element records */
+ uint8_t elemlen; /* length of records */
+ uint8_t sku; /* sku number string */
+} __packed;
+
+/*
+ * Processor Information
+ */
+#define SMBIOS_PRT_CENTRAL 0x03 /* central processor */
+
+#define SMBIOS_PRF_OTHER 0x01 /* other */
+
+#define SMBIOS_PRS_PRESENT 0x40 /* socket is populated */
+#define SMBIOS_PRS_ENABLED 0x1 /* enabled */
+
+#define SMBIOS_PRU_NONE 0x06 /* none */
+
+#define SMBIOS_PFL_64B 0x04 /* 64-bit capable */
+
+struct smbios_table_type4 {
+ struct smbios_structure header;
+ uint8_t socket; /* socket designation string */
+ uint8_t type; /* processor type */
+ uint8_t family; /* processor family */
+ uint8_t manufacturer; /* manufacturer string */
+ uint64_t cpuid; /* processor cpuid */
+ uint8_t version; /* version string */
+ uint8_t voltage; /* voltage */
+ uint16_t clkspeed; /* ext clock speed in mhz */
+ uint16_t maxspeed; /* maximum speed in mhz */
+ uint16_t curspeed; /* current speed in mhz */
+ uint8_t status; /* status */
+ uint8_t upgrade; /* upgrade */
+ uint16_t l1handle; /* l1 cache handle */
+ uint16_t l2handle; /* l2 cache handle */
+ uint16_t l3handle; /* l3 cache handle */
+ uint8_t serial; /* serial number string */
+ uint8_t asset; /* asset tag string */
+ uint8_t part; /* part number string */
+ uint8_t cores; /* cores per socket */
+ uint8_t ecores; /* enabled cores */
+ uint8_t threads; /* threads per socket */
+ uint16_t cflags; /* processor characteristics */
+ uint16_t family2; /* processor family 2 */
+} __packed;
+
+/*
+ * Physical Memory Array
+ */
+#define SMBIOS_MAL_SYSMB 0x03 /* system board or motherboard */
+
+#define SMBIOS_MAU_SYSTEM 0x03 /* system memory */
+
+#define SMBIOS_MAE_NONE 0x03 /* none */
+
+struct smbios_table_type16 {
+ struct smbios_structure header;
+ uint8_t location; /* physical device location */
+ uint8_t use; /* device functional purpose */
+ uint8_t ecc; /* err detect/correct method */
+ uint32_t size; /* max mem capacity in kb */
+ uint16_t errhand; /* handle of error (if any) */
+ uint16_t ndevs; /* num of slots or sockets */
+ uint64_t xsize; /* max mem capacity in bytes */
+} __packed;
+
+/*
+ * Memory Device
+ */
+#define SMBIOS_MDFF_UNKNOWN 0x02 /* unknown */
+
+#define SMBIOS_MDT_UNKNOWN 0x02 /* unknown */
+
+#define SMBIOS_MDF_UNKNOWN 0x0004 /* unknown */
+
+struct smbios_table_type17 {
+ struct smbios_structure header;
+ uint16_t arrayhand; /* handle of physl mem array */
+ uint16_t errhand; /* handle of mem error data */
+ uint16_t twidth; /* total width in bits */
+ uint16_t dwidth; /* data width in bits */
+ uint16_t size; /* size in bytes */
+ uint8_t form; /* form factor */
+ uint8_t set; /* set */
+ uint8_t dloc; /* device locator string */
+ uint8_t bloc; /* phys bank locator string */
+ uint8_t type; /* memory type */
+ uint16_t flags; /* memory characteristics */
+ uint16_t maxspeed; /* maximum speed in mhz */
+ uint8_t manufacturer; /* manufacturer string */
+ uint8_t serial; /* serial number string */
+ uint8_t asset; /* asset tag string */
+ uint8_t part; /* part number string */
+ uint8_t attributes; /* attributes */
+ uint32_t xsize; /* extended size in mbs */
+ uint16_t curspeed; /* current speed in mhz */
+ uint16_t minvoltage; /* minimum voltage */
+ uint16_t maxvoltage; /* maximum voltage */
+ uint16_t curvoltage; /* configured voltage */
+} __packed;
+
+/*
+ * Memory Array Mapped Address
+ */
+struct smbios_table_type19 {
+ struct smbios_structure header;
+ uint32_t saddr; /* start phys addr in kb */
+ uint32_t eaddr; /* end phys addr in kb */
+ uint16_t arrayhand; /* physical mem array handle */
+ uint8_t width; /* num of dev in row */
+ uint64_t xsaddr; /* start phys addr in bytes */
+ uint64_t xeaddr; /* end phys addr in bytes */
+} __packed;
+
+/*
+ * System Boot Information
+ */
+#define SMBIOS_BOOT_NORMAL 0 /* no errors detected */
+
+struct smbios_table_type32 {
+ struct smbios_structure header;
+ uint8_t reserved[6];
+ uint8_t status; /* boot status */
+} __packed;
+
+/*
+ * End-of-Table
+ */
+struct smbios_table_type127 {
+ struct smbios_structure header;
+} __packed;
+
+struct smbios_table_type0 smbios_type0_template = {
+ { SMBIOS_TYPE_BIOS, sizeof (struct smbios_table_type0), 0 },
+ 1, /* bios vendor string */
+ 2, /* bios version string */
+ 0xF000, /* bios address segment location */
+ 3, /* bios release date */
+ 0x0, /* bios size (64k * (n + 1) is the size in bytes) */
+ SMBIOS_FL_ISA | SMBIOS_FL_PCI | SMBIOS_FL_SHADOW |
+ SMBIOS_FL_CDBOOT | SMBIOS_FL_EDD,
+ { SMBIOS_XB1_FL_ACPI, SMBIOS_XB2_FL_BBS | SMBIOS_XB2_FL_VM },
+ 0x0, /* bios major release */
+ 0x0, /* bios minor release */
+ 0xff, /* embedded controller firmware major release */
+ 0xff /* embedded controller firmware minor release */
+};
+
+const char *smbios_type0_strings[] = {
+ "BHYVE", /* vendor string */
+ __TIME__, /* bios version string */
+ __DATE__, /* bios release date string */
+ NULL
+};
+
+struct smbios_table_type1 smbios_type1_template = {
+ { SMBIOS_TYPE_SYSTEM, sizeof (struct smbios_table_type1), 0 },
+ 1, /* manufacturer string */
+ 2, /* product string */
+ 3, /* version string */
+ 4, /* serial number string */
+ { 0 },
+ SMBIOS_WAKEUP_SWITCH,
+ 5, /* sku string */
+ 6 /* family string */
+};
+
+static int smbios_type1_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+const char *smbios_type1_strings[] = {
+ " ", /* manufacturer string */
+ "BHYVE", /* product name string */
+ "1.0", /* version string */
+ "None", /* serial number string */
+ "None", /* sku string */
+ " ", /* family name string */
+ NULL
+};
+
+struct smbios_table_type3 smbios_type3_template = {
+ { SMBIOS_TYPE_CHASSIS, sizeof (struct smbios_table_type3), 0 },
+ 1, /* manufacturer string */
+ SMBIOS_CHT_UNKNOWN,
+ 2, /* version string */
+ 3, /* serial number string */
+ 4, /* asset tag string */
+ SMBIOS_CHST_SAFE,
+ SMBIOS_CHST_SAFE,
+ SMBIOS_CHST_SAFE,
+ SMBIOS_CHSC_NONE,
+ 0, /* height in 'u's (0=enclosure height unspecified) */
+ 0, /* number of power cords (0=number unspecified) */
+ 0, /* number of contained element records */
+ 0, /* length of records */
+ 5 /* sku number string */
+};
+
+const char *smbios_type3_strings[] = {
+ " ", /* manufacturer string */
+ "1.0", /* version string */
+ "None", /* serial number string */
+ "None", /* asset tag string */
+ "None", /* sku number string */
+ NULL
+};
+
+struct smbios_table_type4 smbios_type4_template = {
+ { SMBIOS_TYPE_PROCESSOR, sizeof (struct smbios_table_type4), 0 },
+ 1, /* socket designation string */
+ SMBIOS_PRT_CENTRAL,
+ SMBIOS_PRF_OTHER,
+ 2, /* manufacturer string */
+ 0, /* cpuid */
+ 3, /* version string */
+ 0, /* voltage */
+ 0, /* external clock frequency in mhz (0=unknown) */
+ 0, /* maximum frequency in mhz (0=unknown) */
+ 0, /* current frequency in mhz (0=unknown) */
+ SMBIOS_PRS_PRESENT | SMBIOS_PRS_ENABLED,
+ SMBIOS_PRU_NONE,
+ -1, /* l1 cache handle */
+ -1, /* l2 cache handle */
+ -1, /* l3 cache handle */
+ 4, /* serial number string */
+ 5, /* asset tag string */
+ 6, /* part number string */
+ 0, /* cores per socket (0=unknown) */
+ 0, /* enabled cores per socket (0=unknown) */
+ 0, /* threads per socket (0=unknown) */
+ SMBIOS_PFL_64B,
+ SMBIOS_PRF_OTHER
+};
+
+const char *smbios_type4_strings[] = {
+ " ", /* socket designation string */
+ " ", /* manufacturer string */
+ " ", /* version string */
+ "None", /* serial number string */
+ "None", /* asset tag string */
+ "None", /* part number string */
+ NULL
+};
+
+static int smbios_type4_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+struct smbios_table_type16 smbios_type16_template = {
+ { SMBIOS_TYPE_MEMARRAY, sizeof (struct smbios_table_type16), 0 },
+ SMBIOS_MAL_SYSMB,
+ SMBIOS_MAU_SYSTEM,
+ SMBIOS_MAE_NONE,
+ 0x80000000, /* max mem capacity in kb (0x80000000=use extended) */
+ -1, /* handle of error (if any) */
+ 0, /* number of slots or sockets (TBD) */
+ 0 /* extended maximum memory capacity in bytes (TBD) */
+};
+
+static int smbios_type16_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+struct smbios_table_type17 smbios_type17_template = {
+ { SMBIOS_TYPE_MEMDEVICE, sizeof (struct smbios_table_type17), 0 },
+ -1, /* handle of physical memory array */
+ -1, /* handle of memory error data */
+ 64, /* total width in bits including ecc */
+ 64, /* data width in bits */
+ 0x7fff, /* size in bytes (0x7fff=use extended)*/
+ SMBIOS_MDFF_UNKNOWN,
+ 0, /* set (0x00=none, 0xff=unknown) */
+ 1, /* device locator string */
+ 2, /* physical bank locator string */
+ SMBIOS_MDT_UNKNOWN,
+ SMBIOS_MDF_UNKNOWN,
+ 0, /* maximum memory speed in mhz (0=unknown) */
+ 3, /* manufacturer string */
+ 4, /* serial number string */
+ 5, /* asset tag string */
+ 6, /* part number string */
+ 0, /* attributes (0=unknown rank information) */
+ 0, /* extended size in mb (TBD) */
+ 0, /* current speed in mhz (0=unknown) */
+ 0, /* minimum voltage in mv (0=unknown) */
+ 0, /* maximum voltage in mv (0=unknown) */
+ 0 /* configured voltage in mv (0=unknown) */
+};
+
+const char *smbios_type17_strings[] = {
+ " ", /* device locator string */
+ " ", /* physical bank locator string */
+ " ", /* manufacturer string */
+ "None", /* serial number string */
+ "None", /* asset tag string */
+ "None", /* part number string */
+ NULL
+};
+
+static int smbios_type17_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+struct smbios_table_type19 smbios_type19_template = {
+ { SMBIOS_TYPE_MEMARRAYMAP, sizeof (struct smbios_table_type19), 0 },
+ 0xffffffff, /* starting phys addr in kb (0xffffffff=use ext) */
+ 0xffffffff, /* ending phys addr in kb (0xffffffff=use ext) */
+ -1, /* physical memory array handle */
+ 1, /* number of devices that form a row */
+ 0, /* extended starting phys addr in bytes (TDB) */
+ 0 /* extended ending phys addr in bytes (TDB) */
+};
+
+static int smbios_type19_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+struct smbios_table_type32 smbios_type32_template = {
+ { SMBIOS_TYPE_BOOT, sizeof (struct smbios_table_type32), 0 },
+ { 0, 0, 0, 0, 0, 0 },
+ SMBIOS_BOOT_NORMAL
+};
+
+struct smbios_table_type127 smbios_type127_template = {
+ { SMBIOS_TYPE_EOT, sizeof (struct smbios_table_type127), 0 }
+};
+
+static int smbios_generic_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size);
+
+static struct smbios_template_entry smbios_template[] = {
+ { (struct smbios_structure *)&smbios_type0_template,
+ smbios_type0_strings,
+ smbios_generic_initializer },
+ { (struct smbios_structure *)&smbios_type1_template,
+ smbios_type1_strings,
+ smbios_type1_initializer },
+ { (struct smbios_structure *)&smbios_type3_template,
+ smbios_type3_strings,
+ smbios_generic_initializer },
+ { (struct smbios_structure *)&smbios_type4_template,
+ smbios_type4_strings,
+ smbios_type4_initializer },
+ { (struct smbios_structure *)&smbios_type16_template,
+ NULL,
+ smbios_type16_initializer },
+ { (struct smbios_structure *)&smbios_type17_template,
+ smbios_type17_strings,
+ smbios_type17_initializer },
+ { (struct smbios_structure *)&smbios_type19_template,
+ NULL,
+ smbios_type19_initializer },
+ { (struct smbios_structure *)&smbios_type32_template,
+ NULL,
+ smbios_generic_initializer },
+ { (struct smbios_structure *)&smbios_type127_template,
+ NULL,
+ smbios_generic_initializer },
+ { NULL,NULL, NULL }
+};
+
+static uint64_t guest_lomem, guest_himem;
+static uint16_t type16_handle;
+
+static int
+smbios_generic_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size)
+{
+ struct smbios_structure *entry;
+
+ memcpy(curaddr, template_entry, template_entry->length);
+ entry = (struct smbios_structure *)curaddr;
+ entry->handle = *n + 1;
+ curaddr += entry->length;
+ if (template_strings != NULL) {
+ int i;
+
+ for (i = 0; template_strings[i] != NULL; i++) {
+ const char *string;
+ int len;
+
+ string = template_strings[i];
+ len = strlen(string) + 1;
+ memcpy(curaddr, string, len);
+ curaddr += len;
+ }
+ *curaddr = '\0';
+ curaddr++;
+ } else {
+ /* Minimum string section is double nul */
+ *curaddr = '\0';
+ curaddr++;
+ *curaddr = '\0';
+ curaddr++;
+ }
+ (*n)++;
+ *endaddr = curaddr;
+
+ return (0);
+}
+
+static int
+smbios_type1_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size)
+{
+ struct smbios_table_type1 *type1;
+
+ smbios_generic_initializer(template_entry, template_strings,
+ curaddr, endaddr, n, size);
+ type1 = (struct smbios_table_type1 *)curaddr;
+
+ if (guest_uuid_str != NULL) {
+ uuid_t uuid;
+ uint32_t status;
+
+ uuid_from_string(guest_uuid_str, &uuid, &status);
+ if (status != uuid_s_ok)
+ return (-1);
+
+ uuid_enc_le(&type1->uuid, &uuid);
+ } else {
+ MD5_CTX mdctx;
+ u_char digest[16];
+ char hostname[MAXHOSTNAMELEN];
+
+ /*
+ * Universally unique and yet reproducible are an
+ * oxymoron, however reproducible is desirable in
+ * this case.
+ */
+ if (gethostname(hostname, sizeof(hostname)))
+ return (-1);
+
+ MD5Init(&mdctx);
+ MD5Update(&mdctx, vmname, strlen(vmname));
+ MD5Update(&mdctx, hostname, sizeof(hostname));
+ MD5Final(digest, &mdctx);
+
+ /*
+ * Set the variant and version number.
+ */
+ digest[6] &= 0x0F;
+ digest[6] |= 0x30; /* version 3 */
+ digest[8] &= 0x3F;
+ digest[8] |= 0x80;
+
+ memcpy(&type1->uuid, digest, sizeof (digest));
+ }
+
+ return (0);
+}
+
+static int
+smbios_type4_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size)
+{
+ int i;
+
+ for (i = 0; i < guest_ncpus; i++) {
+ struct smbios_table_type4 *type4;
+ char *p;
+ int nstrings, len;
+
+ smbios_generic_initializer(template_entry, template_strings,
+ curaddr, endaddr, n, size);
+ type4 = (struct smbios_table_type4 *)curaddr;
+ p = curaddr + sizeof (struct smbios_table_type4);
+ nstrings = 0;
+ while (p < *endaddr - 1) {
+ if (*p++ == '\0')
+ nstrings++;
+ }
+ len = sprintf(*endaddr - 1, "CPU #%d", i) + 1;
+ *endaddr += len - 1;
+ *(*endaddr) = '\0';
+ (*endaddr)++;
+ type4->socket = nstrings + 1;
+ curaddr = *endaddr;
+ }
+
+ return (0);
+}
+
+static int
+smbios_type16_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size)
+{
+ struct smbios_table_type16 *type16;
+
+ type16_handle = *n;
+ smbios_generic_initializer(template_entry, template_strings,
+ curaddr, endaddr, n, size);
+ type16 = (struct smbios_table_type16 *)curaddr;
+ type16->xsize = guest_lomem + guest_himem;
+ type16->ndevs = guest_himem > 0 ? 2 : 1;
+
+ return (0);
+}
+
+static int
+smbios_type17_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size)
+{
+ struct smbios_table_type17 *type17;
+
+ smbios_generic_initializer(template_entry, template_strings,
+ curaddr, endaddr, n, size);
+ type17 = (struct smbios_table_type17 *)curaddr;
+ type17->arrayhand = type16_handle;
+ type17->xsize = guest_lomem;
+
+ if (guest_himem > 0) {
+ curaddr = *endaddr;
+ smbios_generic_initializer(template_entry, template_strings,
+ curaddr, endaddr, n, size);
+ type17 = (struct smbios_table_type17 *)curaddr;
+ type17->arrayhand = type16_handle;
+ type17->xsize = guest_himem;
+ }
+
+ return (0);
+}
+
+static int
+smbios_type19_initializer(struct smbios_structure *template_entry,
+ const char **template_strings, char *curaddr, char **endaddr,
+ uint16_t *n, uint16_t *size)
+{
+ struct smbios_table_type19 *type19;
+
+ smbios_generic_initializer(template_entry, template_strings,
+ curaddr, endaddr, n, size);
+ type19 = (struct smbios_table_type19 *)curaddr;
+ type19->arrayhand = type16_handle;
+ type19->xsaddr = 0;
+ type19->xeaddr = guest_lomem;
+
+ if (guest_himem > 0) {
+ curaddr = *endaddr;
+ smbios_generic_initializer(template_entry, template_strings,
+ curaddr, endaddr, n, size);
+ type19 = (struct smbios_table_type19 *)curaddr;
+ type19->arrayhand = type16_handle;
+ type19->xsaddr = 4*GB;
+ type19->xeaddr = guest_himem;
+ }
+
+ return (0);
+}
+
+static void
+smbios_ep_initializer(struct smbios_entry_point *smbios_ep, uint32_t staddr)
+{
+ memset(smbios_ep, 0, sizeof(*smbios_ep));
+ memcpy(smbios_ep->eanchor, SMBIOS_ENTRY_EANCHOR,
+ SMBIOS_ENTRY_EANCHORLEN);
+ smbios_ep->eplen = 0x1F;
+ assert(sizeof (struct smbios_entry_point) == smbios_ep->eplen);
+ smbios_ep->major = 2;
+ smbios_ep->minor = 6;
+ smbios_ep->revision = 0;
+ memcpy(smbios_ep->ianchor, SMBIOS_ENTRY_IANCHOR,
+ SMBIOS_ENTRY_IANCHORLEN);
+ smbios_ep->staddr = staddr;
+ smbios_ep->bcdrev = 0x24;
+}
+
+static void
+smbios_ep_finalizer(struct smbios_entry_point *smbios_ep, uint16_t len,
+ uint16_t num, uint16_t maxssize)
+{
+ uint8_t checksum;
+ int i;
+
+ smbios_ep->maxssize = maxssize;
+ smbios_ep->stlen = len;
+ smbios_ep->stnum = num;
+
+ checksum = 0;
+ for (i = 0x10; i < 0x1f; i++) {
+ checksum -= ((uint8_t *)smbios_ep)[i];
+ }
+ smbios_ep->ichecksum = checksum;
+
+ checksum = 0;
+ for (i = 0; i < 0x1f; i++) {
+ checksum -= ((uint8_t *)smbios_ep)[i];
+ }
+ smbios_ep->echecksum = checksum;
+}
+
+int
+smbios_build(struct vmctx *ctx)
+{
+ struct smbios_entry_point *smbios_ep;
+ uint16_t n;
+ uint16_t maxssize;
+ char *curaddr, *startaddr, *ststartaddr;
+ int i;
+ int err;
+
+ guest_lomem = vm_get_lowmem_size(ctx);
+ guest_himem = vm_get_highmem_size(ctx);
+
+ startaddr = paddr_guest2host(ctx, SMBIOS_BASE, SMBIOS_MAX_LENGTH);
+ if (startaddr == NULL) {
+ fprintf(stderr, "smbios table requires mapped mem\n");
+ return (ENOMEM);
+ }
+
+ curaddr = startaddr;
+
+ smbios_ep = (struct smbios_entry_point *)curaddr;
+ smbios_ep_initializer(smbios_ep, SMBIOS_BASE +
+ sizeof(struct smbios_entry_point));
+ curaddr += sizeof(struct smbios_entry_point);
+ ststartaddr = curaddr;
+
+ n = 0;
+ maxssize = 0;
+ for (i = 0; smbios_template[i].entry != NULL; i++) {
+ struct smbios_structure *entry;
+ const char **strings;
+ initializer_func_t initializer;
+ char *endaddr;
+ uint16_t size;
+
+ entry = smbios_template[i].entry;
+ strings = smbios_template[i].strings;
+ initializer = smbios_template[i].initializer;
+
+ err = (*initializer)(entry, strings, curaddr, &endaddr,
+ &n, &size);
+ if (err != 0)
+ return (err);
+
+ if (size > maxssize)
+ maxssize = size;
+
+ curaddr = endaddr;
+ }
+
+ assert(curaddr - startaddr < SMBIOS_MAX_LENGTH);
+ smbios_ep_finalizer(smbios_ep, curaddr - ststartaddr, n, maxssize);
+
+ return (0);
+}
diff --git a/usr/src/cmd/bhyve/smbiostbl.h b/usr/src/cmd/bhyve/smbiostbl.h
new file mode 100644
index 0000000000..fd7f86be80
--- /dev/null
+++ b/usr/src/cmd/bhyve/smbiostbl.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/smbiostbl.h 262744 2014-03-04 17:12:06Z tychon $
+ */
+
+#ifndef _SMBIOSTBL_H_
+#define _SMBIOSTBL_H_
+
+struct vmctx;
+
+int smbios_build(struct vmctx *ctx);
+
+#endif /* _SMBIOSTBL_H_ */
diff --git a/usr/src/cmd/bhyve/spinup_ap.c b/usr/src/cmd/bhyve/spinup_ap.c
new file mode 100644
index 0000000000..e1dd562d3f
--- /dev/null
+++ b/usr/src/cmd/bhyve/spinup_ap.c
@@ -0,0 +1,104 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/spinup_ap.c 263432 2014-03-20 18:15:37Z neel $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/spinup_ap.c 263432 2014-03-20 18:15:37Z neel $");
+
+#include <sys/param.h>
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+#include "bhyverun.h"
+#include "spinup_ap.h"
+
+static void
+spinup_ap_realmode(struct vmctx *ctx, int newcpu, uint64_t *rip)
+{
+ int vector, error;
+ uint16_t cs;
+ uint64_t desc_base;
+ uint32_t desc_limit, desc_access;
+
+ vector = *rip >> PAGE_SHIFT;
+ *rip = 0;
+
+ /*
+ * Update the %cs and %rip of the guest so that it starts
+ * executing real mode code at at 'vector << 12'.
+ */
+ error = vm_set_register(ctx, newcpu, VM_REG_GUEST_RIP, *rip);
+ assert(error == 0);
+
+ error = vm_get_desc(ctx, newcpu, VM_REG_GUEST_CS, &desc_base,
+ &desc_limit, &desc_access);
+ assert(error == 0);
+
+ desc_base = vector << PAGE_SHIFT;
+ error = vm_set_desc(ctx, newcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ assert(error == 0);
+
+ cs = (vector << PAGE_SHIFT) >> 4;
+ error = vm_set_register(ctx, newcpu, VM_REG_GUEST_CS, cs);
+ assert(error == 0);
+}
+
+int
+spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip)
+{
+ int error;
+
+ assert(newcpu != 0);
+ assert(newcpu < guest_ncpus);
+
+ error = vcpu_reset(ctx, newcpu);
+ assert(error == 0);
+
+ fbsdrun_set_capabilities(ctx, newcpu);
+
+ /*
+ * Enable the 'unrestricted guest' mode for 'newcpu'.
+ *
+ * Set up the processor state in power-on 16-bit mode, with the CS:IP
+ * init'd to the specified low-mem 4K page.
+ */
+ error = vm_set_capability(ctx, newcpu, VM_CAP_UNRESTRICTED_GUEST, 1);
+ assert(error == 0);
+
+ spinup_ap_realmode(ctx, newcpu, &rip);
+
+ fbsdrun_addcpu(ctx, vcpu, newcpu, rip);
+
+ return (newcpu);
+}
diff --git a/usr/src/cmd/bhyve/spinup_ap.h b/usr/src/cmd/bhyve/spinup_ap.h
new file mode 100644
index 0000000000..090de091ba
--- /dev/null
+++ b/usr/src/cmd/bhyve/spinup_ap.h
@@ -0,0 +1,34 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/spinup_ap.h 240912 2012-09-25 02:33:25Z neel $
+ */
+
+#ifndef _SPINUP_AP_H_
+#define _SPINUP_AP_H_
+
+int spinup_ap(struct vmctx *ctx, int vcpu, int newcpu, uint64_t rip);
+
+#endif
diff --git a/usr/src/cmd/bhyve/uart_emul.c b/usr/src/cmd/bhyve/uart_emul.c
new file mode 100644
index 0000000000..a8b5d40356
--- /dev/null
+++ b/usr/src/cmd/bhyve/uart_emul.c
@@ -0,0 +1,1042 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/uart_emul.c 257293 2013-10-29 00:18:11Z neel $");
+
+#include <sys/types.h>
+#include <dev/ic/ns16550.h>
+
+#ifndef __FreeBSD__
+#include <sys/socket.h>
+#include <sys/stat.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <termios.h>
+#include <unistd.h>
+#include <stdbool.h>
+#include <string.h>
+#include <pthread.h>
+#ifndef __FreeBSD__
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#endif
+
+#ifndef __FreeBSD__
+#include <bhyve.h>
+
+#include "bhyverun.h"
+#endif
+#ifdef __FreeBSD__
+#include "mevent.h"
+#endif
+#include "uart_emul.h"
+
+#define COM1_BASE 0x3F8
+#define COM1_IRQ 4
+#define COM2_BASE 0x2F8
+#define COM2_IRQ 3
+
+#define DEFAULT_RCLK 1843200
+#define DEFAULT_BAUD 9600
+
+#define FCR_RX_MASK 0xC0
+
+#define MCR_OUT1 0x04
+#define MCR_OUT2 0x08
+
+#define MSR_DELTA_MASK 0x0f
+
+#ifndef REG_SCR
+#define REG_SCR com_scr
+#endif
+
+#define FIFOSZ 16
+
+static bool uart_stdio; /* stdio in use for i/o */
+#ifndef __FreeBSD__
+static bool uart_bcons; /* bhyveconsole in use for i/o */
+#endif
+
+static struct {
+ int baseaddr;
+ int irq;
+ bool inuse;
+} uart_lres[] = {
+ { COM1_BASE, COM1_IRQ, false},
+ { COM2_BASE, COM2_IRQ, false},
+};
+
+#define UART_NLDEVS (sizeof(uart_lres) / sizeof(uart_lres[0]))
+
+struct fifo {
+ uint8_t buf[FIFOSZ];
+ int rindex; /* index to read from */
+ int windex; /* index to write to */
+ int num; /* number of characters in the fifo */
+ int size; /* size of the fifo */
+};
+
+struct uart_softc {
+ pthread_mutex_t mtx; /* protects all softc elements */
+ uint8_t data; /* Data register (R/W) */
+ uint8_t ier; /* Interrupt enable register (R/W) */
+ uint8_t lcr; /* Line control register (R/W) */
+ uint8_t mcr; /* Modem control register (R/W) */
+ uint8_t lsr; /* Line status register (R/W) */
+ uint8_t msr; /* Modem status register (R/W) */
+ uint8_t fcr; /* FIFO control register (W) */
+ uint8_t scr; /* Scratch register (R/W) */
+
+ uint8_t dll; /* Baudrate divisor latch LSB */
+ uint8_t dlh; /* Baudrate divisor latch MSB */
+
+ struct fifo rxfifo;
+
+ bool opened;
+ bool stdio;
+#ifndef __FreeBSD__
+ bool bcons;
+ struct {
+ pid_t clipid;
+ int clifd; /* console client unix domain socket */
+ int servfd; /* console server unix domain socket */
+ } usc_bcons;
+#endif
+
+ bool thre_int_pending; /* THRE interrupt pending */
+
+ void *arg;
+ uart_intr_func_t intr_assert;
+ uart_intr_func_t intr_deassert;
+};
+
+#ifdef __FreeBSD__
+static void uart_drain(int fd, enum ev_type ev, void *arg);
+#else
+static void uart_tty_drain(struct uart_softc *sc);
+static int uart_bcons_drain(struct uart_softc *sc);
+#endif
+
+static struct termios tio_orig, tio_new; /* I/O Terminals */
+
+static void
+ttyclose(void)
+{
+
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_orig);
+}
+
+static void
+ttyopen(void)
+{
+
+ tcgetattr(STDIN_FILENO, &tio_orig);
+
+ tio_new = tio_orig;
+ cfmakeraw(&tio_new);
+ tcsetattr(STDIN_FILENO, TCSANOW, &tio_new);
+
+ atexit(ttyclose);
+}
+
+static bool
+tty_char_available(void)
+{
+ fd_set rfds;
+ struct timeval tv;
+
+ FD_ZERO(&rfds);
+ FD_SET(STDIN_FILENO, &rfds);
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ if (select(STDIN_FILENO + 1, &rfds, NULL, NULL, &tv) > 0 ) {
+ return (true);
+ } else {
+ return (false);
+ }
+}
+
+static int
+ttyread(void)
+{
+ char rb;
+
+ if (tty_char_available()) {
+ read(STDIN_FILENO, &rb, 1);
+ return (rb & 0xff);
+ } else {
+ return (-1);
+ }
+}
+
+static void
+ttywrite(unsigned char wb)
+{
+
+ (void)write(STDIN_FILENO, &wb, 1);
+}
+
+#ifndef __FreeBSD__
+static void
+bconswrite(struct uart_softc *sc, unsigned char wb)
+{
+ (void) write(sc->usc_bcons.clifd, &wb, 1);
+}
+#endif
+
+static void
+fifo_reset(struct fifo *fifo, int size)
+{
+ bzero(fifo, sizeof(struct fifo));
+ fifo->size = size;
+}
+
+static int
+fifo_putchar(struct fifo *fifo, uint8_t ch)
+{
+
+ if (fifo->num < fifo->size) {
+ fifo->buf[fifo->windex] = ch;
+ fifo->windex = (fifo->windex + 1) % fifo->size;
+ fifo->num++;
+ return (0);
+ } else
+ return (-1);
+}
+
+static int
+fifo_getchar(struct fifo *fifo)
+{
+ int c;
+
+ if (fifo->num > 0) {
+ c = fifo->buf[fifo->rindex];
+ fifo->rindex = (fifo->rindex + 1) % fifo->size;
+ fifo->num--;
+ return (c);
+ } else
+ return (-1);
+}
+
+static int
+fifo_numchars(struct fifo *fifo)
+{
+
+ return (fifo->num);
+}
+
+static int
+fifo_available(struct fifo *fifo)
+{
+
+ return (fifo->num < fifo->size);
+}
+
+static void
+uart_opentty(struct uart_softc *sc)
+{
+ struct mevent *mev;
+
+ assert(!sc->opened && sc->stdio);
+
+ ttyopen();
+#ifdef __FreeBSD__
+ mev = mevent_add(STDIN_FILENO, EVF_READ, uart_drain, sc);
+#endif
+ assert(mev);
+}
+
+/*
+ * The IIR returns a prioritized interrupt reason:
+ * - receive data available
+ * - transmit holding register empty
+ * - modem status change
+ *
+ * Return an interrupt reason if one is available.
+ */
+static int
+uart_intr_reason(struct uart_softc *sc)
+{
+
+ if ((sc->lsr & LSR_OE) != 0 && (sc->ier & IER_ERLS) != 0)
+ return (IIR_RLS);
+ else if (fifo_numchars(&sc->rxfifo) > 0 && (sc->ier & IER_ERXRDY) != 0)
+ return (IIR_RXTOUT);
+ else if (sc->thre_int_pending && (sc->ier & IER_ETXRDY) != 0)
+ return (IIR_TXRDY);
+ else if ((sc->msr & MSR_DELTA_MASK) != 0 && (sc->ier & IER_EMSC) != 0)
+ return (IIR_MLSC);
+ else
+ return (IIR_NOPEND);
+}
+
+static void
+uart_reset(struct uart_softc *sc)
+{
+ uint16_t divisor;
+
+ divisor = DEFAULT_RCLK / DEFAULT_BAUD / 16;
+ sc->dll = divisor;
+ sc->dlh = divisor >> 16;
+
+ fifo_reset(&sc->rxfifo, 1); /* no fifo until enabled by software */
+}
+
+/*
+ * Toggle the COM port's intr pin depending on whether or not we have an
+ * interrupt condition to report to the processor.
+ */
+static void
+uart_toggle_intr(struct uart_softc *sc)
+{
+ uint8_t intr_reason;
+
+ intr_reason = uart_intr_reason(sc);
+
+ if (intr_reason == IIR_NOPEND)
+ (*sc->intr_deassert)(sc->arg);
+ else
+ (*sc->intr_assert)(sc->arg);
+}
+
+#ifdef __FreeBSD__
+static void
+uart_drain(int fd, enum ev_type ev, void *arg)
+{
+ struct uart_softc *sc;
+ int ch;
+
+ sc = arg;
+
+ assert(fd == STDIN_FILENO);
+ assert(ev == EVF_READ);
+
+ /*
+ * This routine is called in the context of the mevent thread
+ * to take out the softc lock to protect against concurrent
+ * access from a vCPU i/o exit
+ */
+ pthread_mutex_lock(&sc->mtx);
+
+ if ((sc->mcr & MCR_LOOPBACK) != 0) {
+ (void) ttyread();
+ } else {
+ while (fifo_available(&sc->rxfifo) &&
+ ((ch = ttyread()) != -1)) {
+ fifo_putchar(&sc->rxfifo, ch);
+ }
+ uart_toggle_intr(sc);
+ }
+
+ pthread_mutex_unlock(&sc->mtx);
+}
+#else
+static void
+uart_tty_drain(struct uart_softc *sc)
+{
+ int ch;
+
+ /*
+ * Take the softc lock to protect against concurrent
+ * access from a vCPU i/o exit
+ */
+ pthread_mutex_lock(&sc->mtx);
+
+ if ((sc->mcr & MCR_LOOPBACK) != 0) {
+ (void) ttyread();
+ } else {
+ while (fifo_available(&sc->rxfifo) &&
+ ((ch = ttyread()) != -1)) {
+ fifo_putchar(&sc->rxfifo, ch);
+ }
+ uart_toggle_intr(sc);
+ }
+
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+static int
+uart_bcons_drain(struct uart_softc *sc)
+{
+ char ch;
+ int nbytes;
+ int ret = 0;
+
+ /*
+ * Take the softc lock to protect against concurrent
+ * access from a vCPU i/o exit
+ */
+ pthread_mutex_lock(&sc->mtx);
+
+ if ((sc->mcr & MCR_LOOPBACK) != 0) {
+ (void) read(sc->usc_bcons.clifd, &ch, 1);
+ } else {
+ for (;;) {
+ nbytes = read(sc->usc_bcons.clifd, &ch, 1);
+ if (nbytes == 0) {
+ ret = 1;
+ break;
+ }
+ if (nbytes == -1 &&
+ errno != EINTR && errno != EAGAIN) {
+ ret = -1;
+ break;
+ }
+ if (nbytes == -1) {
+ break;
+ }
+
+ if (fifo_available(&sc->rxfifo)) {
+ fifo_putchar(&sc->rxfifo, ch);
+ }
+ }
+ uart_toggle_intr(sc);
+ }
+
+ pthread_mutex_unlock(&sc->mtx);
+
+ return (ret);
+}
+#endif
+
+void
+uart_write(struct uart_softc *sc, int offset, uint8_t value)
+{
+ int fifosz;
+ uint8_t msr;
+
+ pthread_mutex_lock(&sc->mtx);
+
+ /* Open terminal */
+ if (!sc->opened && sc->stdio) {
+ uart_opentty(sc);
+ sc->opened = true;
+ }
+
+ /*
+ * Take care of the special case DLAB accesses first
+ */
+ if ((sc->lcr & LCR_DLAB) != 0) {
+ if (offset == REG_DLL) {
+ sc->dll = value;
+ goto done;
+ }
+
+ if (offset == REG_DLH) {
+ sc->dlh = value;
+ goto done;
+ }
+ }
+
+ switch (offset) {
+ case REG_DATA:
+ if (sc->mcr & MCR_LOOPBACK) {
+ if (fifo_putchar(&sc->rxfifo, value) != 0)
+ sc->lsr |= LSR_OE;
+ } else if (sc->stdio) {
+ ttywrite(value);
+#ifndef __FreeBSD__
+ } else if (sc->bcons) {
+ bconswrite(sc, value);
+#endif
+ } /* else drop on floor */
+ sc->thre_int_pending = true;
+ break;
+ case REG_IER:
+ /*
+ * Apply mask so that bits 4-7 are 0
+ * Also enables bits 0-3 only if they're 1
+ */
+ sc->ier = value & 0x0F;
+ break;
+ case REG_FCR:
+ /*
+ * When moving from FIFO and 16450 mode and vice versa,
+ * the FIFO contents are reset.
+ */
+ if ((sc->fcr & FCR_ENABLE) ^ (value & FCR_ENABLE)) {
+ fifosz = (value & FCR_ENABLE) ? FIFOSZ : 1;
+ fifo_reset(&sc->rxfifo, fifosz);
+ }
+
+ /*
+ * The FCR_ENABLE bit must be '1' for the programming
+ * of other FCR bits to be effective.
+ */
+ if ((value & FCR_ENABLE) == 0) {
+ sc->fcr = 0;
+ } else {
+ if ((value & FCR_RCV_RST) != 0)
+ fifo_reset(&sc->rxfifo, FIFOSZ);
+
+ sc->fcr = value &
+ (FCR_ENABLE | FCR_DMA | FCR_RX_MASK);
+ }
+ break;
+ case REG_LCR:
+ sc->lcr = value;
+ break;
+ case REG_MCR:
+ /* Apply mask so that bits 5-7 are 0 */
+ sc->mcr = value & 0x1F;
+
+ msr = 0;
+ if (sc->mcr & MCR_LOOPBACK) {
+ /*
+ * In the loopback mode certain bits from the
+ * MCR are reflected back into MSR
+ */
+ if (sc->mcr & MCR_RTS)
+ msr |= MSR_CTS;
+ if (sc->mcr & MCR_DTR)
+ msr |= MSR_DSR;
+ if (sc->mcr & MCR_OUT1)
+ msr |= MSR_RI;
+ if (sc->mcr & MCR_OUT2)
+ msr |= MSR_DCD;
+ }
+
+ /*
+ * Detect if there has been any change between the
+ * previous and the new value of MSR. If there is
+ * then assert the appropriate MSR delta bit.
+ */
+ if ((msr & MSR_CTS) ^ (sc->msr & MSR_CTS))
+ sc->msr |= MSR_DCTS;
+ if ((msr & MSR_DSR) ^ (sc->msr & MSR_DSR))
+ sc->msr |= MSR_DDSR;
+ if ((msr & MSR_DCD) ^ (sc->msr & MSR_DCD))
+ sc->msr |= MSR_DDCD;
+ if ((sc->msr & MSR_RI) != 0 && (msr & MSR_RI) == 0)
+ sc->msr |= MSR_TERI;
+
+ /*
+ * Update the value of MSR while retaining the delta
+ * bits.
+ */
+ sc->msr &= MSR_DELTA_MASK;
+ sc->msr |= msr;
+ break;
+ case REG_LSR:
+ /*
+ * Line status register is not meant to be written to
+ * during normal operation.
+ */
+ break;
+ case REG_MSR:
+ /*
+ * As far as I can tell MSR is a read-only register.
+ */
+ break;
+ case REG_SCR:
+ sc->scr = value;
+ break;
+ default:
+ break;
+ }
+
+done:
+ uart_toggle_intr(sc);
+ pthread_mutex_unlock(&sc->mtx);
+}
+
+uint8_t
+uart_read(struct uart_softc *sc, int offset)
+{
+ uint8_t iir, intr_reason, reg;
+
+ pthread_mutex_lock(&sc->mtx);
+
+ /* Open terminal */
+ if (!sc->opened && sc->stdio) {
+ uart_opentty(sc);
+ sc->opened = true;
+ }
+
+ /*
+ * Take care of the special case DLAB accesses first
+ */
+ if ((sc->lcr & LCR_DLAB) != 0) {
+ if (offset == REG_DLL) {
+ reg = sc->dll;
+ goto done;
+ }
+
+ if (offset == REG_DLH) {
+ reg = sc->dlh;
+ goto done;
+ }
+ }
+
+ switch (offset) {
+ case REG_DATA:
+ reg = fifo_getchar(&sc->rxfifo);
+ break;
+ case REG_IER:
+ reg = sc->ier;
+ break;
+ case REG_IIR:
+ iir = (sc->fcr & FCR_ENABLE) ? IIR_FIFO_MASK : 0;
+
+ intr_reason = uart_intr_reason(sc);
+
+ /*
+ * Deal with side effects of reading the IIR register
+ */
+ if (intr_reason == IIR_TXRDY)
+ sc->thre_int_pending = false;
+
+ iir |= intr_reason;
+
+ reg = iir;
+ break;
+ case REG_LCR:
+ reg = sc->lcr;
+ break;
+ case REG_MCR:
+ reg = sc->mcr;
+ break;
+ case REG_LSR:
+ /* Transmitter is always ready for more data */
+ sc->lsr |= LSR_TEMT | LSR_THRE;
+
+ /* Check for new receive data */
+ if (fifo_numchars(&sc->rxfifo) > 0)
+ sc->lsr |= LSR_RXRDY;
+ else
+ sc->lsr &= ~LSR_RXRDY;
+
+ reg = sc->lsr;
+
+ /* The LSR_OE bit is cleared on LSR read */
+ sc->lsr &= ~LSR_OE;
+ break;
+ case REG_MSR:
+ /*
+ * MSR delta bits are cleared on read
+ */
+ reg = sc->msr;
+ sc->msr &= ~MSR_DELTA_MASK;
+ break;
+ case REG_SCR:
+ reg = sc->scr;
+ break;
+ default:
+ reg = 0xFF;
+ break;
+ }
+
+done:
+ uart_toggle_intr(sc);
+ pthread_mutex_unlock(&sc->mtx);
+
+ return (reg);
+}
+
+#ifndef __FreeBSD__
+static void *
+uart_tty_thread(void *param)
+{
+ struct uart_softc *sc = param;
+ pollfd_t pollset;
+
+ pollset.fd = STDIN_FILENO;
+ pollset.events = POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND;
+
+ for (;;) {
+ if (poll(&pollset, 1, -1) < 0) {
+ if (errno != EINTR) {
+ perror("poll failed");
+ break;
+ }
+ continue;
+ }
+ uart_tty_drain(sc);
+ }
+
+ return (NULL);
+}
+
+/*
+ * Read the "ident" string from the client's descriptor; this routine also
+ * tolerates being called with pid=NULL, for times when you want to "eat"
+ * the ident string from a client without saving it.
+ */
+static int
+get_client_ident(int clifd, pid_t *pid)
+{
+ char buf[BUFSIZ], *bufp;
+ size_t buflen = sizeof (buf);
+ char c = '\0';
+ int i = 0, r;
+
+ /* "eat up the ident string" case, for simplicity */
+ if (pid == NULL) {
+ while (read(clifd, &c, 1) == 1) {
+ if (c == '\n')
+ return (0);
+ }
+ }
+
+ bzero(buf, sizeof (buf));
+ while ((buflen > 1) && (r = read(clifd, &c, 1)) == 1) {
+ buflen--;
+ if (c == '\n')
+ break;
+
+ buf[i] = c;
+ i++;
+ }
+ if (r == -1)
+ return (-1);
+
+ /*
+ * We've filled the buffer, but still haven't seen \n. Keep eating
+ * until we find it; we don't expect this to happen, but this is
+ * defensive.
+ */
+ if (c != '\n') {
+ while ((r = read(clifd, &c, sizeof (c))) > 0)
+ if (c == '\n')
+ break;
+ }
+
+ /*
+ * Parse buffer for message of the form: IDENT <pid>
+ */
+ bufp = buf;
+ if (strncmp(bufp, "IDENT ", 6) != 0)
+ return (-1);
+ bufp += 6;
+ errno = 0;
+ *pid = strtoll(bufp, &bufp, 10);
+ if (errno != 0)
+ return (-1);
+
+ return (0);
+}
+
+static int
+uart_bcons_accept_client(struct uart_softc *sc)
+{
+ int connfd;
+ struct sockaddr_un cliaddr;
+ socklen_t clilen;
+ pid_t pid;
+
+ clilen = sizeof (cliaddr);
+ connfd = accept(sc->usc_bcons.servfd,
+ (struct sockaddr *)&cliaddr, &clilen);
+ if (connfd == -1)
+ return (-1);
+ if (get_client_ident(connfd, &pid) == -1) {
+ (void) shutdown(connfd, SHUT_RDWR);
+ (void) close(connfd);
+ return (-1);
+ }
+
+ if (fcntl(connfd, F_SETFL, O_NONBLOCK) < 0) {
+ (void) shutdown(connfd, SHUT_RDWR);
+ (void) close(connfd);
+ return (-1);
+ }
+ (void) write(connfd, "OK\n", 3);
+
+ sc->usc_bcons.clipid = pid;
+ sc->usc_bcons.clifd = connfd;
+
+ printf("Connection from process ID %lu.\n", pid);
+
+ return (0);
+}
+
+static void
+uart_bcons_reject_client(struct uart_softc *sc)
+{
+ int connfd;
+ struct sockaddr_un cliaddr;
+ socklen_t clilen;
+ char nak[MAXPATHLEN];
+
+ clilen = sizeof (cliaddr);
+ connfd = accept(sc->usc_bcons.servfd,
+ (struct sockaddr *)&cliaddr, &clilen);
+
+ /*
+ * After hear its ident string, tell client to get lost.
+ */
+ if (get_client_ident(connfd, NULL) == 0) {
+ (void) snprintf(nak, sizeof (nak), "%lu\n",
+ sc->usc_bcons.clipid);
+ (void) write(connfd, nak, strlen(nak));
+ }
+ (void) shutdown(connfd, SHUT_RDWR);
+ (void) close(connfd);
+}
+
+static int
+uart_bcons_client_event(struct uart_softc *sc)
+{
+ int res;
+
+ res = uart_bcons_drain(sc);
+ if (res < 0)
+ return (-1);
+
+ if (res > 0) {
+ fprintf(stderr, "Closing connection with bhyve console\n");
+ (void) shutdown(sc->usc_bcons.clifd, SHUT_RDWR);
+ (void) close(sc->usc_bcons.clifd);
+ sc->usc_bcons.clifd = -1;
+ }
+
+ return (0);
+}
+
+static void
+uart_bcons_server_event(struct uart_softc *sc)
+{
+ int clifd;
+
+ if (sc->usc_bcons.clifd != -1) {
+ /* we're already handling a client */
+ uart_bcons_reject_client(sc);
+ return;
+ }
+
+ if (uart_bcons_accept_client(sc) == 0) {
+ pthread_mutex_lock(&bcons_wait_lock);
+ bcons_connected = B_TRUE;
+ pthread_cond_signal(&bcons_wait_done);
+ pthread_mutex_unlock(&bcons_wait_lock);
+ }
+}
+
+static void *
+uart_bcons_thread(void *param)
+{
+ struct uart_softc *sc = param;
+ struct pollfd pollfds[2];
+ int res;
+
+ /* read from client and write to vm */
+ pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND |
+ POLLPRI | POLLERR | POLLHUP;
+
+ /* the server socket; watch for events (new connections) */
+ pollfds[1].events = pollfds[0].events;
+
+ for (;;) {
+ pollfds[0].fd = sc->usc_bcons.clifd;
+ pollfds[1].fd = sc->usc_bcons.servfd;
+ pollfds[0].revents = pollfds[1].revents = 0;
+
+ res = poll(pollfds,
+ sizeof (pollfds) / sizeof (struct pollfd), -1);
+
+ if (res == -1 && errno != EINTR) {
+ perror("poll failed");
+ /* we are hosed, close connection */
+ break;
+ }
+
+ /* event from client side */
+ if (pollfds[0].revents) {
+ if (pollfds[0].revents &
+ (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
+ if (uart_bcons_client_event(sc) < 0)
+ break;
+ } else {
+ break;
+ }
+ }
+
+ /* event from server socket */
+ if (pollfds[1].revents) {
+ if (pollfds[1].revents & (POLLIN | POLLRDNORM)) {
+ uart_bcons_server_event(sc);
+ } else {
+ break;
+ }
+ }
+ }
+
+ if (sc->usc_bcons.clifd != -1) {
+ fprintf(stderr, "Closing connection with bhyve console\n");
+ (void) shutdown(sc->usc_bcons.clifd, SHUT_RDWR);
+ (void) close(sc->usc_bcons.clifd);
+ sc->usc_bcons.clifd = -1;
+ }
+
+ return (NULL);
+}
+
+static int
+init_bcons_sock(void)
+{
+ int servfd;
+ struct sockaddr_un servaddr;
+
+ if (mkdir(BHYVE_TMPDIR, S_IRWXU) < 0 && errno != EEXIST) {
+ fprintf(stderr, "bhyve console setup: "
+ "could not mkdir %s", BHYVE_TMPDIR, strerror(errno));
+ return (-1);
+ }
+
+ bzero(&servaddr, sizeof (servaddr));
+ servaddr.sun_family = AF_UNIX;
+ (void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path),
+ BHYVE_CONS_SOCKPATH, vmname);
+
+ if ((servfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
+ fprintf(stderr, "bhyve console setup: "
+ "could not create socket\n");
+ return (-1);
+ }
+ (void) unlink(servaddr.sun_path);
+
+ if (bind(servfd, (struct sockaddr *)&servaddr,
+ sizeof (servaddr)) == -1) {
+ fprintf(stderr, "bhyve console setup: "
+ "could not bind to socket\n");
+ goto out;
+ }
+
+ if (listen(servfd, 4) == -1) {
+ fprintf(stderr, "bhyve console setup: "
+ "could not listen on socket");
+ goto out;
+ }
+ return (servfd);
+
+out:
+ (void) unlink(servaddr.sun_path);
+ (void) close(servfd);
+ return (-1);
+}
+#endif
+
+int
+uart_legacy_alloc(int which, int *baseaddr, int *irq)
+{
+
+ if (which < 0 || which >= UART_NLDEVS || uart_lres[which].inuse)
+ return (-1);
+
+ uart_lres[which].inuse = true;
+ *baseaddr = uart_lres[which].baseaddr;
+ *irq = uart_lres[which].irq;
+
+ return (0);
+}
+
+struct uart_softc *
+uart_init(uart_intr_func_t intr_assert, uart_intr_func_t intr_deassert,
+ void *arg)
+{
+ struct uart_softc *sc;
+
+ sc = malloc(sizeof(struct uart_softc));
+ bzero(sc, sizeof(struct uart_softc));
+
+ sc->arg = arg;
+ sc->intr_assert = intr_assert;
+ sc->intr_deassert = intr_deassert;
+
+ pthread_mutex_init(&sc->mtx, NULL);
+
+ uart_reset(sc);
+
+ return (sc);
+}
+
+int
+uart_set_backend(struct uart_softc *sc, const char *opts)
+{
+#ifndef __FreeBSD__
+ int error;
+#endif
+ /*
+ * XXX one stdio backend supported at this time.
+ */
+ if (opts == NULL)
+ return (0);
+
+#ifdef __FreeBSD__
+ if (strcmp("stdio", opts) == 0 && !uart_stdio) {
+ sc->stdio = true;
+ uart_stdio = true;
+ return (0);
+#else
+ if (strcmp("stdio", opts) == 0 && !uart_stdio && !uart_bcons) {
+ sc->stdio = true;
+ uart_stdio = true;
+
+ error = pthread_create(NULL, NULL, uart_tty_thread, sc);
+ assert(error == 0);
+
+ return (0);
+ } else if (strstr(opts, "bcons") != 0 && !uart_stdio && !uart_bcons) {
+ sc->bcons = true;
+ uart_bcons= true;
+
+ if (strstr(opts, "bcons,wait") != 0) {
+ bcons_wait = true;
+ }
+
+ sc->usc_bcons.clifd = -1;
+ if ((sc->usc_bcons.servfd = init_bcons_sock()) == -1) {
+ fprintf(stderr, "bhyve console setup: "
+ "socket initialization failed\n");
+ return (-1);
+ }
+ error = pthread_create(NULL, NULL, uart_bcons_thread, sc);
+ assert(error == 0);
+
+ return (0);
+#endif
+ } else
+ return (-1);
+}
diff --git a/usr/src/cmd/bhyve/uart_emul.h b/usr/src/cmd/bhyve/uart_emul.h
new file mode 100644
index 0000000000..ecff957991
--- /dev/null
+++ b/usr/src/cmd/bhyve/uart_emul.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/uart_emul.h 257293 2013-10-29 00:18:11Z neel $
+ */
+
+#ifndef _UART_EMUL_H_
+#define _UART_EMUL_H_
+
+
+#define UART_IO_BAR_SIZE 8
+
+struct uart_softc;
+
+typedef void (*uart_intr_func_t)(void *arg);
+struct uart_softc *uart_init(uart_intr_func_t intr_assert,
+ uart_intr_func_t intr_deassert, void *arg);
+
+int uart_legacy_alloc(int unit, int *ioaddr, int *irq);
+uint8_t uart_read(struct uart_softc *sc, int offset);
+void uart_write(struct uart_softc *sc, int offset, uint8_t value);
+int uart_set_backend(struct uart_softc *sc, const char *opt);
+#endif
diff --git a/usr/src/cmd/bhyve/vga.c b/usr/src/cmd/bhyve/vga.c
new file mode 100644
index 0000000000..4330741042
--- /dev/null
+++ b/usr/src/cmd/bhyve/vga.c
@@ -0,0 +1,1289 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/param.h>
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <machine/vmm.h>
+
+#include "bhyvegc.h"
+#include "console.h"
+#include "inout.h"
+#include "mem.h"
+#include "vga.h"
+
+#define KB (1024UL)
+#define MB (1024 * 1024UL)
+
+struct vga_softc {
+ struct mem_range mr;
+
+ struct bhyvegc *gc;
+ int gc_width;
+ int gc_height;
+ struct bhyvegc_image *gc_image;
+
+ uint8_t *vga_ram;
+
+ /*
+ * General registers
+ */
+ uint8_t vga_misc;
+ uint8_t vga_sts1;
+
+ /*
+ * Sequencer
+ */
+ struct {
+ int seq_index;
+ uint8_t seq_reset;
+ uint8_t seq_clock_mode;
+ int seq_cm_dots;
+ uint8_t seq_map_mask;
+ uint8_t seq_cmap_sel;
+ int seq_cmap_pri_off;
+ int seq_cmap_sec_off;
+ uint8_t seq_mm;
+ } vga_seq;
+
+ /*
+ * CRT Controller
+ */
+ struct {
+ int crtc_index;
+ uint8_t crtc_mode_ctrl;
+ uint8_t crtc_horiz_total;
+ uint8_t crtc_horiz_disp_end;
+ uint8_t crtc_start_horiz_blank;
+ uint8_t crtc_end_horiz_blank;
+ uint8_t crtc_start_horiz_retrace;
+ uint8_t crtc_end_horiz_retrace;
+ uint8_t crtc_vert_total;
+ uint8_t crtc_overflow;
+ uint8_t crtc_present_row_scan;
+ uint8_t crtc_max_scan_line;
+ uint8_t crtc_cursor_start;
+ uint8_t crtc_cursor_on;
+ uint8_t crtc_cursor_end;
+ uint8_t crtc_start_addr_high;
+ uint8_t crtc_start_addr_low;
+ uint16_t crtc_start_addr;
+ uint8_t crtc_cursor_loc_low;
+ uint8_t crtc_cursor_loc_high;
+ uint16_t crtc_cursor_loc;
+ uint8_t crtc_vert_retrace_start;
+ uint8_t crtc_vert_retrace_end;
+ uint8_t crtc_vert_disp_end;
+ uint8_t crtc_offset;
+ uint8_t crtc_underline_loc;
+ uint8_t crtc_start_vert_blank;
+ uint8_t crtc_end_vert_blank;
+ uint8_t crtc_line_compare;
+ } vga_crtc;
+
+ /*
+ * Graphics Controller
+ */
+ struct {
+ int gc_index;
+ uint8_t gc_set_reset;
+ uint8_t gc_enb_set_reset;
+ uint8_t gc_color_compare;
+ uint8_t gc_rotate;
+ uint8_t gc_op;
+ uint8_t gc_read_map_sel;
+ uint8_t gc_mode;
+ bool gc_mode_c4; /* chain 4 */
+ bool gc_mode_oe; /* odd/even */
+ uint8_t gc_mode_rm; /* read mode */
+ uint8_t gc_mode_wm; /* write mode */
+ uint8_t gc_misc;
+ uint8_t gc_misc_gm; /* graphics mode */
+ uint8_t gc_misc_mm; /* memory map */
+ uint8_t gc_color_dont_care;
+ uint8_t gc_bit_mask;
+ uint8_t gc_latch0;
+ uint8_t gc_latch1;
+ uint8_t gc_latch2;
+ uint8_t gc_latch3;
+ } vga_gc;
+
+ /*
+ * Attribute Controller
+ */
+ struct {
+ int atc_flipflop;
+ int atc_index;
+ uint8_t atc_palette[16];
+ uint8_t atc_mode;
+ uint8_t atc_overscan_color;
+ uint8_t atc_color_plane_enb;
+ uint8_t atc_horiz_pixel_panning;
+ uint8_t atc_color_select;
+ uint8_t atc_color_select_45;
+ uint8_t atc_color_select_67;
+ } vga_atc;
+
+ /*
+ * DAC
+ */
+ struct {
+ uint8_t dac_state;
+ int dac_rd_index;
+ int dac_rd_subindex;
+ int dac_wr_index;
+ int dac_wr_subindex;
+ uint8_t dac_palette[3 * 256];
+ uint32_t dac_palette_rgb[256];
+ } vga_dac;
+};
+
+static bool
+vga_in_reset(struct vga_softc *sc)
+{
+ return (((sc->vga_seq.seq_clock_mode & SEQ_CM_SO) != 0) ||
+ ((sc->vga_seq.seq_reset & SEQ_RESET_ASYNC) == 0) ||
+ ((sc->vga_seq.seq_reset & SEQ_RESET_SYNC) == 0) ||
+ ((sc->vga_crtc.crtc_mode_ctrl & CRTC_MC_TE) == 0));
+}
+
+static void
+vga_check_size(struct bhyvegc *gc, struct vga_softc *sc)
+{
+ int old_width, old_height;
+
+ if (vga_in_reset(sc))
+ return;
+
+ old_width = sc->gc_width;
+ old_height = sc->gc_height;
+
+ /*
+ * Horizontal Display End: For text modes this is the number
+ * of characters. For graphics modes this is the number of
+ * pixels per scanlines divided by the number of pixels per
+ * character clock.
+ */
+ sc->gc_width = (sc->vga_crtc.crtc_horiz_disp_end + 1) *
+ sc->vga_seq.seq_cm_dots;
+
+ sc->gc_height = (sc->vga_crtc.crtc_vert_disp_end |
+ (((sc->vga_crtc.crtc_overflow & CRTC_OF_VDE8) >> CRTC_OF_VDE8_SHIFT) << 8) |
+ (((sc->vga_crtc.crtc_overflow & CRTC_OF_VDE9) >> CRTC_OF_VDE9_SHIFT) << 9)) + 1;
+
+ if (old_width != sc->gc_width || old_height != sc->gc_height)
+ bhyvegc_resize(gc, sc->gc_width, sc->gc_height);
+}
+
+static uint32_t
+vga_get_pixel(struct vga_softc *sc, int x, int y)
+{
+ int offset;
+ int bit;
+ uint8_t data;
+ uint8_t idx;
+
+ offset = (y * sc->gc_width / 8) + (x / 8);
+ bit = 7 - (x % 8);
+
+ data = (((sc->vga_ram[offset + 0 * 64*KB] >> bit) & 0x1) << 0) |
+ (((sc->vga_ram[offset + 1 * 64*KB] >> bit) & 0x1) << 1) |
+ (((sc->vga_ram[offset + 2 * 64*KB] >> bit) & 0x1) << 2) |
+ (((sc->vga_ram[offset + 3 * 64*KB] >> bit) & 0x1) << 3);
+
+ data &= sc->vga_atc.atc_color_plane_enb;
+
+ if (sc->vga_atc.atc_mode & ATC_MC_IPS) {
+ idx = sc->vga_atc.atc_palette[data] & 0x0f;
+ idx |= sc->vga_atc.atc_color_select_45;
+ } else {
+ idx = sc->vga_atc.atc_palette[data];
+ }
+ idx |= sc->vga_atc.atc_color_select_67;
+
+ return (sc->vga_dac.dac_palette_rgb[idx]);
+}
+
+static void
+vga_render_graphics(struct vga_softc *sc)
+{
+ int x, y;
+
+ for (y = 0; y < sc->gc_height; y++) {
+ for (x = 0; x < sc->gc_width; x++) {
+ int offset;
+
+ offset = y * sc->gc_width + x;
+ sc->gc_image->data[offset] = vga_get_pixel(sc, x, y);
+ }
+ }
+}
+
+static uint32_t
+vga_get_text_pixel(struct vga_softc *sc, int x, int y)
+{
+ int dots, offset, bit, font_offset;
+ uint8_t ch, attr, font;
+ uint8_t idx;
+
+ dots = sc->vga_seq.seq_cm_dots;
+
+ offset = 2 * sc->vga_crtc.crtc_start_addr;
+ offset += (y / 16 * sc->gc_width / dots) * 2 + (x / dots) * 2;
+
+ bit = 7 - (x % dots);
+
+ ch = sc->vga_ram[offset + 0 * 64*KB];
+ attr = sc->vga_ram[offset + 1 * 64*KB];
+
+ if (sc->vga_crtc.crtc_cursor_on &&
+ (offset == (sc->vga_crtc.crtc_cursor_loc * 2)) &&
+ ((y % 16) >= (sc->vga_crtc.crtc_cursor_start & CRTC_CS_CS)) &&
+ ((y % 16) <= (sc->vga_crtc.crtc_cursor_end & CRTC_CE_CE))) {
+ idx = sc->vga_atc.atc_palette[attr & 0xf];
+ return (sc->vga_dac.dac_palette_rgb[idx]);
+ }
+
+ if ((sc->vga_seq.seq_mm & SEQ_MM_EM) &&
+ sc->vga_seq.seq_cmap_pri_off != sc->vga_seq.seq_cmap_sec_off) {
+ if (attr & 0x8)
+ font_offset = sc->vga_seq.seq_cmap_pri_off +
+ (ch << 5) + y % 16;
+ else
+ font_offset = sc->vga_seq.seq_cmap_sec_off +
+ (ch << 5) + y % 16;
+ attr &= ~0x8;
+ } else {
+ font_offset = (ch << 5) + y % 16;
+ }
+
+ font = sc->vga_ram[font_offset + 2 * 64*KB];
+
+ if ((bit > 0) && (font & (1 << bit)))
+ idx = sc->vga_atc.atc_palette[attr & 0xf];
+ else
+ idx = sc->vga_atc.atc_palette[attr >> 4];
+
+ return (sc->vga_dac.dac_palette_rgb[idx]);
+}
+
+static void
+vga_render_text(struct vga_softc *sc)
+{
+ int x, y;
+
+ for (y = 0; y < sc->gc_height; y++) {
+ for (x = 0; x < sc->gc_width; x++) {
+ int offset;
+
+ offset = y * sc->gc_width + x;
+ sc->gc_image->data[offset] = vga_get_text_pixel(sc, x, y);
+ }
+ }
+}
+
+static void
+vga_render(struct bhyvegc *gc, void *arg)
+{
+ struct vga_softc *sc = arg;
+
+ vga_check_size(gc, sc);
+
+ if (vga_in_reset(sc)) {
+ memset(sc->gc_image->data, 0,
+ sc->gc_image->width * sc->gc_image->height *
+ sizeof (uint32_t));
+ return;
+ }
+
+ if (sc->vga_gc.gc_misc_gm && (sc->vga_atc.atc_mode & ATC_MC_GA))
+ vga_render_graphics(sc);
+ else
+ vga_render_text(sc);
+}
+
+static uint64_t
+vga_mem_rd_handler(struct vmctx *ctx, uint64_t addr, void *arg1)
+{
+ struct vga_softc *sc = arg1;
+ uint8_t map_sel;
+ int offset;
+
+ offset = addr;
+ switch (sc->vga_gc.gc_misc_mm) {
+ case 0x0:
+ /*
+ * extended mode: base 0xa0000 size 128k
+ */
+ offset -=0xa0000;
+ offset &= (128 * KB - 1);
+ break;
+ case 0x1:
+ /*
+ * EGA/VGA mode: base 0xa0000 size 64k
+ */
+ offset -=0xa0000;
+ offset &= (64 * KB - 1);
+ break;
+ case 0x2:
+ /*
+ * monochrome text mode: base 0xb0000 size 32kb
+ */
+ assert(0);
+ case 0x3:
+ /*
+ * color text mode and CGA: base 0xb8000 size 32kb
+ */
+ offset -=0xb8000;
+ offset &= (32 * KB - 1);
+ break;
+ }
+
+ /* Fill latches. */
+ sc->vga_gc.gc_latch0 = sc->vga_ram[offset + 0*64*KB];
+ sc->vga_gc.gc_latch1 = sc->vga_ram[offset + 1*64*KB];
+ sc->vga_gc.gc_latch2 = sc->vga_ram[offset + 2*64*KB];
+ sc->vga_gc.gc_latch3 = sc->vga_ram[offset + 3*64*KB];
+
+ if (sc->vga_gc.gc_mode_rm) {
+ /* read mode 1 */
+ assert(0);
+ }
+
+ map_sel = sc->vga_gc.gc_read_map_sel;
+ if (sc->vga_gc.gc_mode_oe) {
+ map_sel |= (offset & 1);
+ offset &= ~1;
+ }
+
+ /* read mode 0: return the byte from the selected plane. */
+ offset += map_sel * 64*KB;
+
+ return (sc->vga_ram[offset]);
+}
+
+static void
+vga_mem_wr_handler(struct vmctx *ctx, uint64_t addr, uint8_t val, void *arg1)
+{
+ struct vga_softc *sc = arg1;
+ uint8_t c0, c1, c2, c3;
+ uint8_t m0, m1, m2, m3;
+ uint8_t set_reset;
+ uint8_t enb_set_reset;
+ uint8_t mask;
+ int offset;
+
+ offset = addr;
+ switch (sc->vga_gc.gc_misc_mm) {
+ case 0x0:
+ /*
+ * extended mode: base 0xa0000 size 128kb
+ */
+ offset -=0xa0000;
+ offset &= (128 * KB - 1);
+ break;
+ case 0x1:
+ /*
+ * EGA/VGA mode: base 0xa0000 size 64kb
+ */
+ offset -=0xa0000;
+ offset &= (64 * KB - 1);
+ break;
+ case 0x2:
+ /*
+ * monochrome text mode: base 0xb0000 size 32kb
+ */
+ assert(0);
+ case 0x3:
+ /*
+ * color text mode and CGA: base 0xb8000 size 32kb
+ */
+ offset -=0xb8000;
+ offset &= (32 * KB - 1);
+ break;
+ }
+
+ set_reset = sc->vga_gc.gc_set_reset;
+ enb_set_reset = sc->vga_gc.gc_enb_set_reset;
+
+ c0 = sc->vga_gc.gc_latch0;
+ c1 = sc->vga_gc.gc_latch1;
+ c2 = sc->vga_gc.gc_latch2;
+ c3 = sc->vga_gc.gc_latch3;
+
+ switch (sc->vga_gc.gc_mode_wm) {
+ case 0:
+ /* write mode 0 */
+ mask = sc->vga_gc.gc_bit_mask;
+
+ val = (val >> sc->vga_gc.gc_rotate) |
+ (val << (8 - sc->vga_gc.gc_rotate));
+
+ switch (sc->vga_gc.gc_op) {
+ case 0x00: /* replace */
+ m0 = (set_reset & 1) ? mask : 0x00;
+ m1 = (set_reset & 2) ? mask : 0x00;
+ m2 = (set_reset & 4) ? mask : 0x00;
+ m3 = (set_reset & 8) ? mask : 0x00;
+
+ c0 = (enb_set_reset & 1) ? (c0 & ~mask) : (val & mask);
+ c1 = (enb_set_reset & 2) ? (c1 & ~mask) : (val & mask);
+ c2 = (enb_set_reset & 4) ? (c2 & ~mask) : (val & mask);
+ c3 = (enb_set_reset & 8) ? (c3 & ~mask) : (val & mask);
+
+ c0 |= m0;
+ c1 |= m1;
+ c2 |= m2;
+ c3 |= m3;
+ break;
+ case 0x08: /* AND */
+ m0 = set_reset & 1 ? 0xff : ~mask;
+ m1 = set_reset & 2 ? 0xff : ~mask;
+ m2 = set_reset & 4 ? 0xff : ~mask;
+ m3 = set_reset & 8 ? 0xff : ~mask;
+
+ c0 = enb_set_reset & 1 ? c0 & m0 : val & m0;
+ c1 = enb_set_reset & 2 ? c1 & m1 : val & m1;
+ c2 = enb_set_reset & 4 ? c2 & m2 : val & m2;
+ c3 = enb_set_reset & 8 ? c3 & m3 : val & m3;
+ break;
+ case 0x10: /* OR */
+ m0 = set_reset & 1 ? mask : 0x00;
+ m1 = set_reset & 2 ? mask : 0x00;
+ m2 = set_reset & 4 ? mask : 0x00;
+ m3 = set_reset & 8 ? mask : 0x00;
+
+ c0 = enb_set_reset & 1 ? c0 | m0 : val | m0;
+ c1 = enb_set_reset & 2 ? c1 | m1 : val | m1;
+ c2 = enb_set_reset & 4 ? c2 | m2 : val | m2;
+ c3 = enb_set_reset & 8 ? c3 | m3 : val | m3;
+ break;
+ case 0x18: /* XOR */
+ m0 = set_reset & 1 ? mask : 0x00;
+ m1 = set_reset & 2 ? mask : 0x00;
+ m2 = set_reset & 4 ? mask : 0x00;
+ m3 = set_reset & 8 ? mask : 0x00;
+
+ c0 = enb_set_reset & 1 ? c0 ^ m0 : val ^ m0;
+ c1 = enb_set_reset & 2 ? c1 ^ m1 : val ^ m1;
+ c2 = enb_set_reset & 4 ? c2 ^ m2 : val ^ m2;
+ c3 = enb_set_reset & 8 ? c3 ^ m3 : val ^ m3;
+ break;
+ }
+ break;
+ case 1:
+ /* write mode 1 */
+ break;
+ case 2:
+ /* write mode 2 */
+ mask = sc->vga_gc.gc_bit_mask;
+
+ switch (sc->vga_gc.gc_op) {
+ case 0x00: /* replace */
+ m0 = (val & 1 ? 0xff : 0x00) & mask;
+ m1 = (val & 2 ? 0xff : 0x00) & mask;
+ m2 = (val & 4 ? 0xff : 0x00) & mask;
+ m3 = (val & 8 ? 0xff : 0x00) & mask;
+
+ c0 &= ~mask;
+ c1 &= ~mask;
+ c2 &= ~mask;
+ c3 &= ~mask;
+
+ c0 |= m0;
+ c1 |= m1;
+ c2 |= m2;
+ c3 |= m3;
+ break;
+ case 0x08: /* AND */
+ m0 = (val & 1 ? 0xff : 0x00) | ~mask;
+ m1 = (val & 2 ? 0xff : 0x00) | ~mask;
+ m2 = (val & 4 ? 0xff : 0x00) | ~mask;
+ m3 = (val & 8 ? 0xff : 0x00) | ~mask;
+
+ c0 &= m0;
+ c1 &= m1;
+ c2 &= m2;
+ c3 &= m3;
+ break;
+ case 0x10: /* OR */
+ m0 = (val & 1 ? 0xff : 0x00) & mask;
+ m1 = (val & 2 ? 0xff : 0x00) & mask;
+ m2 = (val & 4 ? 0xff : 0x00) & mask;
+ m3 = (val & 8 ? 0xff : 0x00) & mask;
+
+ c0 |= m0;
+ c1 |= m1;
+ c2 |= m2;
+ c3 |= m3;
+ break;
+ case 0x18: /* XOR */
+ m0 = (val & 1 ? 0xff : 0x00) & mask;
+ m1 = (val & 2 ? 0xff : 0x00) & mask;
+ m2 = (val & 4 ? 0xff : 0x00) & mask;
+ m3 = (val & 8 ? 0xff : 0x00) & mask;
+
+ c0 ^= m0;
+ c1 ^= m1;
+ c2 ^= m2;
+ c3 ^= m3;
+ break;
+ }
+ break;
+ case 3:
+ /* write mode 3 */
+ mask = sc->vga_gc.gc_bit_mask & val;
+
+ val = (val >> sc->vga_gc.gc_rotate) |
+ (val << (8 - sc->vga_gc.gc_rotate));
+
+ switch (sc->vga_gc.gc_op) {
+ case 0x00: /* replace */
+ m0 = (set_reset & 1 ? 0xff : 0x00) & mask;
+ m1 = (set_reset & 2 ? 0xff : 0x00) & mask;
+ m2 = (set_reset & 4 ? 0xff : 0x00) & mask;
+ m3 = (set_reset & 8 ? 0xff : 0x00) & mask;
+
+ c0 &= ~mask;
+ c1 &= ~mask;
+ c2 &= ~mask;
+ c3 &= ~mask;
+
+ c0 |= m0;
+ c1 |= m1;
+ c2 |= m2;
+ c3 |= m3;
+ break;
+ case 0x08: /* AND */
+ m0 = (set_reset & 1 ? 0xff : 0x00) | ~mask;
+ m1 = (set_reset & 2 ? 0xff : 0x00) | ~mask;
+ m2 = (set_reset & 4 ? 0xff : 0x00) | ~mask;
+ m3 = (set_reset & 8 ? 0xff : 0x00) | ~mask;
+
+ c0 &= m0;
+ c1 &= m1;
+ c2 &= m2;
+ c3 &= m3;
+ break;
+ case 0x10: /* OR */
+ m0 = (set_reset & 1 ? 0xff : 0x00) & mask;
+ m1 = (set_reset & 2 ? 0xff : 0x00) & mask;
+ m2 = (set_reset & 4 ? 0xff : 0x00) & mask;
+ m3 = (set_reset & 8 ? 0xff : 0x00) & mask;
+
+ c0 |= m0;
+ c1 |= m1;
+ c2 |= m2;
+ c3 |= m3;
+ break;
+ case 0x18: /* XOR */
+ m0 = (set_reset & 1 ? 0xff : 0x00) & mask;
+ m1 = (set_reset & 2 ? 0xff : 0x00) & mask;
+ m2 = (set_reset & 4 ? 0xff : 0x00) & mask;
+ m3 = (set_reset & 8 ? 0xff : 0x00) & mask;
+
+ c0 ^= m0;
+ c1 ^= m1;
+ c2 ^= m2;
+ c3 ^= m3;
+ break;
+ }
+ break;
+ }
+
+ if (sc->vga_gc.gc_mode_oe) {
+ if (offset & 1) {
+ offset &= ~1;
+ if (sc->vga_seq.seq_map_mask & 2)
+ sc->vga_ram[offset + 1*64*KB] = c1;
+ if (sc->vga_seq.seq_map_mask & 8)
+ sc->vga_ram[offset + 3*64*KB] = c3;
+ } else {
+ if (sc->vga_seq.seq_map_mask & 1)
+ sc->vga_ram[offset + 0*64*KB] = c0;
+ if (sc->vga_seq.seq_map_mask & 4)
+ sc->vga_ram[offset + 2*64*KB] = c2;
+ }
+ } else {
+ if (sc->vga_seq.seq_map_mask & 1)
+ sc->vga_ram[offset + 0*64*KB] = c0;
+ if (sc->vga_seq.seq_map_mask & 2)
+ sc->vga_ram[offset + 1*64*KB] = c1;
+ if (sc->vga_seq.seq_map_mask & 4)
+ sc->vga_ram[offset + 2*64*KB] = c2;
+ if (sc->vga_seq.seq_map_mask & 8)
+ sc->vga_ram[offset + 3*64*KB] = c3;
+ }
+}
+
+static int
+vga_mem_handler(struct vmctx *ctx, int vcpu, int dir, uint64_t addr,
+ int size, uint64_t *val, void *arg1, long arg2)
+{
+ if (dir == MEM_F_WRITE) {
+ switch (size) {
+ case 1:
+ vga_mem_wr_handler(ctx, addr, *val, arg1);
+ break;
+ case 2:
+ vga_mem_wr_handler(ctx, addr, *val, arg1);
+ vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1);
+ break;
+ case 4:
+ vga_mem_wr_handler(ctx, addr, *val, arg1);
+ vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1);
+ vga_mem_wr_handler(ctx, addr + 2, *val >> 16, arg1);
+ vga_mem_wr_handler(ctx, addr + 3, *val >> 24, arg1);
+ break;
+ case 8:
+ vga_mem_wr_handler(ctx, addr, *val, arg1);
+ vga_mem_wr_handler(ctx, addr + 1, *val >> 8, arg1);
+ vga_mem_wr_handler(ctx, addr + 2, *val >> 16, arg1);
+ vga_mem_wr_handler(ctx, addr + 3, *val >> 24, arg1);
+ vga_mem_wr_handler(ctx, addr + 4, *val >> 32, arg1);
+ vga_mem_wr_handler(ctx, addr + 5, *val >> 40, arg1);
+ vga_mem_wr_handler(ctx, addr + 6, *val >> 48, arg1);
+ vga_mem_wr_handler(ctx, addr + 7, *val >> 56, arg1);
+ break;
+ }
+ } else {
+ switch (size) {
+ case 1:
+ *val = vga_mem_rd_handler(ctx, addr, arg1);
+ break;
+ case 2:
+ *val = vga_mem_rd_handler(ctx, addr, arg1);
+ *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8;
+ break;
+ case 4:
+ *val = vga_mem_rd_handler(ctx, addr, arg1);
+ *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8;
+ *val |= vga_mem_rd_handler(ctx, addr + 2, arg1) << 16;
+ *val |= vga_mem_rd_handler(ctx, addr + 3, arg1) << 24;
+ break;
+ case 8:
+ *val = vga_mem_rd_handler(ctx, addr, arg1);
+ *val |= vga_mem_rd_handler(ctx, addr + 1, arg1) << 8;
+ *val |= vga_mem_rd_handler(ctx, addr + 2, arg1) << 16;
+ *val |= vga_mem_rd_handler(ctx, addr + 3, arg1) << 24;
+ *val |= vga_mem_rd_handler(ctx, addr + 4, arg1) << 32;
+ *val |= vga_mem_rd_handler(ctx, addr + 5, arg1) << 40;
+ *val |= vga_mem_rd_handler(ctx, addr + 6, arg1) << 48;
+ *val |= vga_mem_rd_handler(ctx, addr + 7, arg1) << 56;
+ break;
+ }
+ }
+
+ return (0);
+}
+
+static int
+vga_port_in_handler(struct vmctx *ctx, int in, int port, int bytes,
+ uint8_t *val, void *arg)
+{
+ struct vga_softc *sc = arg;
+
+ switch (port) {
+ case CRTC_IDX_MONO_PORT:
+ case CRTC_IDX_COLOR_PORT:
+ *val = sc->vga_crtc.crtc_index;
+ break;
+ case CRTC_DATA_MONO_PORT:
+ case CRTC_DATA_COLOR_PORT:
+ switch (sc->vga_crtc.crtc_index) {
+ case CRTC_HORIZ_TOTAL:
+ *val = sc->vga_crtc.crtc_horiz_total;
+ break;
+ case CRTC_HORIZ_DISP_END:
+ *val = sc->vga_crtc.crtc_horiz_disp_end;
+ break;
+ case CRTC_START_HORIZ_BLANK:
+ *val = sc->vga_crtc.crtc_start_horiz_blank;
+ break;
+ case CRTC_END_HORIZ_BLANK:
+ *val = sc->vga_crtc.crtc_end_horiz_blank;
+ break;
+ case CRTC_START_HORIZ_RETRACE:
+ *val = sc->vga_crtc.crtc_start_horiz_retrace;
+ break;
+ case CRTC_END_HORIZ_RETRACE:
+ *val = sc->vga_crtc.crtc_end_horiz_retrace;
+ break;
+ case CRTC_VERT_TOTAL:
+ *val = sc->vga_crtc.crtc_vert_total;
+ break;
+ case CRTC_OVERFLOW:
+ *val = sc->vga_crtc.crtc_overflow;
+ break;
+ case CRTC_PRESET_ROW_SCAN:
+ *val = sc->vga_crtc.crtc_present_row_scan;
+ break;
+ case CRTC_MAX_SCAN_LINE:
+ *val = sc->vga_crtc.crtc_max_scan_line;
+ break;
+ case CRTC_CURSOR_START:
+ *val = sc->vga_crtc.crtc_cursor_start;
+ break;
+ case CRTC_CURSOR_END:
+ *val = sc->vga_crtc.crtc_cursor_end;
+ break;
+ case CRTC_START_ADDR_HIGH:
+ *val = sc->vga_crtc.crtc_start_addr_high;
+ break;
+ case CRTC_START_ADDR_LOW:
+ *val = sc->vga_crtc.crtc_start_addr_low;
+ break;
+ case CRTC_CURSOR_LOC_HIGH:
+ *val = sc->vga_crtc.crtc_cursor_loc_high;
+ break;
+ case CRTC_CURSOR_LOC_LOW:
+ *val = sc->vga_crtc.crtc_cursor_loc_low;
+ break;
+ case CRTC_VERT_RETRACE_START:
+ *val = sc->vga_crtc.crtc_vert_retrace_start;
+ break;
+ case CRTC_VERT_RETRACE_END:
+ *val = sc->vga_crtc.crtc_vert_retrace_end;
+ break;
+ case CRTC_VERT_DISP_END:
+ *val = sc->vga_crtc.crtc_vert_disp_end;
+ break;
+ case CRTC_OFFSET:
+ *val = sc->vga_crtc.crtc_offset;
+ break;
+ case CRTC_UNDERLINE_LOC:
+ *val = sc->vga_crtc.crtc_underline_loc;
+ break;
+ case CRTC_START_VERT_BLANK:
+ *val = sc->vga_crtc.crtc_start_vert_blank;
+ break;
+ case CRTC_END_VERT_BLANK:
+ *val = sc->vga_crtc.crtc_end_vert_blank;
+ break;
+ case CRTC_MODE_CONTROL:
+ *val = sc->vga_crtc.crtc_mode_ctrl;
+ break;
+ case CRTC_LINE_COMPARE:
+ *val = sc->vga_crtc.crtc_line_compare;
+ break;
+ default:
+ //printf("XXX VGA CRTC: inb 0x%04x at index %d\n", port, sc->vga_crtc.crtc_index);
+ assert(0);
+ break;
+ }
+ break;
+ case ATC_IDX_PORT:
+ *val = sc->vga_atc.atc_index;
+ break;
+ case ATC_DATA_PORT:
+ switch (sc->vga_atc.atc_index) {
+ case ATC_PALETTE0 ... ATC_PALETTE15:
+ *val = sc->vga_atc.atc_palette[sc->vga_atc.atc_index];
+ break;
+ case ATC_MODE_CONTROL:
+ *val = sc->vga_atc.atc_mode;
+ break;
+ case ATC_OVERSCAN_COLOR:
+ *val = sc->vga_atc.atc_overscan_color;
+ break;
+ case ATC_COLOR_PLANE_ENABLE:
+ *val = sc->vga_atc.atc_color_plane_enb;
+ break;
+ case ATC_HORIZ_PIXEL_PANNING:
+ *val = sc->vga_atc.atc_horiz_pixel_panning;
+ break;
+ case ATC_COLOR_SELECT:
+ *val = sc->vga_atc.atc_color_select;
+ break;
+ default:
+ //printf("XXX VGA ATC inb 0x%04x at index %d\n", port , sc->vga_atc.atc_index);
+ assert(0);
+ break;
+ }
+ break;
+ case SEQ_IDX_PORT:
+ *val = sc->vga_seq.seq_index;
+ break;
+ case SEQ_DATA_PORT:
+ switch (sc->vga_seq.seq_index) {
+ case SEQ_RESET:
+ *val = sc->vga_seq.seq_reset;
+ break;
+ case SEQ_CLOCKING_MODE:
+ *val = sc->vga_seq.seq_clock_mode;
+ break;
+ case SEQ_MAP_MASK:
+ *val = sc->vga_seq.seq_map_mask;
+ break;
+ case SEQ_CHAR_MAP_SELECT:
+ *val = sc->vga_seq.seq_cmap_sel;
+ break;
+ case SEQ_MEMORY_MODE:
+ *val = sc->vga_seq.seq_mm;
+ break;
+ default:
+ //printf("XXX VGA SEQ: inb 0x%04x at index %d\n", port, sc->vga_seq.seq_index);
+ assert(0);
+ break;
+ }
+ case DAC_DATA_PORT:
+ *val = sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_rd_index +
+ sc->vga_dac.dac_rd_subindex];
+ sc->vga_dac.dac_rd_subindex++;
+ if (sc->vga_dac.dac_rd_subindex == 3) {
+ sc->vga_dac.dac_rd_index++;
+ sc->vga_dac.dac_rd_subindex = 0;
+ }
+ break;
+ case GC_IDX_PORT:
+ *val = sc->vga_gc.gc_index;
+ break;
+ case GC_DATA_PORT:
+ switch (sc->vga_gc.gc_index) {
+ case GC_SET_RESET:
+ *val = sc->vga_gc.gc_set_reset;
+ break;
+ case GC_ENABLE_SET_RESET:
+ *val = sc->vga_gc.gc_enb_set_reset;
+ break;
+ case GC_COLOR_COMPARE:
+ *val = sc->vga_gc.gc_color_compare;
+ break;
+ case GC_DATA_ROTATE:
+ *val = sc->vga_gc.gc_rotate;
+ break;
+ case GC_READ_MAP_SELECT:
+ *val = sc->vga_gc.gc_read_map_sel;
+ break;
+ case GC_MODE:
+ *val = sc->vga_gc.gc_mode;
+ break;
+ case GC_MISCELLANEOUS:
+ *val = sc->vga_gc.gc_misc;
+ break;
+ case GC_COLOR_DONT_CARE:
+ *val = sc->vga_gc.gc_color_dont_care;
+ break;
+ case GC_BIT_MASK:
+ *val = sc->vga_gc.gc_bit_mask;
+ break;
+ default:
+ //printf("XXX VGA GC: inb 0x%04x at index %d\n", port, sc->vga_crtc.crtc_index);
+ assert(0);
+ break;
+ }
+ break;
+ case GEN_MISC_OUTPUT_PORT:
+ *val = sc->vga_misc;
+ break;
+ case GEN_INPUT_STS0_PORT:
+ assert(0);
+ break;
+ case GEN_INPUT_STS1_MONO_PORT:
+ case GEN_INPUT_STS1_COLOR_PORT:
+ sc->vga_atc.atc_flipflop = 0;
+ sc->vga_sts1 ^= (GEN_IS1_VR | GEN_IS1_DE);
+ *val = sc->vga_sts1;
+ break;
+ case GEN_FEATURE_CTRL_PORT:
+ assert(0);
+ break;
+ default:
+ printf("XXX vga_port_in_handler() unhandled port 0x%x\n", port);
+ assert(0);
+ return (-1);
+ }
+
+ return (0);
+}
+
+static int
+vga_port_out_handler(struct vmctx *ctx, int in, int port, int bytes,
+ uint8_t val, void *arg)
+{
+ struct vga_softc *sc = arg;
+
+ switch (port) {
+ case CRTC_IDX_MONO_PORT:
+ case CRTC_IDX_COLOR_PORT:
+ sc->vga_crtc.crtc_index = val;
+ break;
+ case CRTC_DATA_MONO_PORT:
+ case CRTC_DATA_COLOR_PORT:
+ switch (sc->vga_crtc.crtc_index) {
+ case CRTC_HORIZ_TOTAL:
+ sc->vga_crtc.crtc_horiz_total = val;
+ break;
+ case CRTC_HORIZ_DISP_END:
+ sc->vga_crtc.crtc_horiz_disp_end = val;
+ break;
+ case CRTC_START_HORIZ_BLANK:
+ sc->vga_crtc.crtc_start_horiz_blank = val;
+ break;
+ case CRTC_END_HORIZ_BLANK:
+ sc->vga_crtc.crtc_end_horiz_blank = val;
+ break;
+ case CRTC_START_HORIZ_RETRACE:
+ sc->vga_crtc.crtc_start_horiz_retrace = val;
+ break;
+ case CRTC_END_HORIZ_RETRACE:
+ sc->vga_crtc.crtc_end_horiz_retrace = val;
+ break;
+ case CRTC_VERT_TOTAL:
+ sc->vga_crtc.crtc_vert_total = val;
+ break;
+ case CRTC_OVERFLOW:
+ sc->vga_crtc.crtc_overflow = val;
+ break;
+ case CRTC_PRESET_ROW_SCAN:
+ sc->vga_crtc.crtc_present_row_scan = val;
+ break;
+ case CRTC_MAX_SCAN_LINE:
+ sc->vga_crtc.crtc_max_scan_line = val;
+ break;
+ case CRTC_CURSOR_START:
+ sc->vga_crtc.crtc_cursor_start = val;
+ sc->vga_crtc.crtc_cursor_on = (val & CRTC_CS_CO) == 0;
+ break;
+ case CRTC_CURSOR_END:
+ sc->vga_crtc.crtc_cursor_end = val;
+ break;
+ case CRTC_START_ADDR_HIGH:
+ sc->vga_crtc.crtc_start_addr_high = val;
+ sc->vga_crtc.crtc_start_addr &= 0x00ff;
+ sc->vga_crtc.crtc_start_addr |= (val << 8);
+ break;
+ case CRTC_START_ADDR_LOW:
+ sc->vga_crtc.crtc_start_addr_low = val;
+ sc->vga_crtc.crtc_start_addr &= 0xff00;
+ sc->vga_crtc.crtc_start_addr |= (val & 0xff);
+ break;
+ case CRTC_CURSOR_LOC_HIGH:
+ sc->vga_crtc.crtc_cursor_loc_high = val;
+ sc->vga_crtc.crtc_cursor_loc &= 0x00ff;
+ sc->vga_crtc.crtc_cursor_loc |= (val << 8);
+ break;
+ case CRTC_CURSOR_LOC_LOW:
+ sc->vga_crtc.crtc_cursor_loc_low = val;
+ sc->vga_crtc.crtc_cursor_loc &= 0xff00;
+ sc->vga_crtc.crtc_cursor_loc |= (val & 0xff);
+ break;
+ case CRTC_VERT_RETRACE_START:
+ sc->vga_crtc.crtc_vert_retrace_start = val;
+ break;
+ case CRTC_VERT_RETRACE_END:
+ sc->vga_crtc.crtc_vert_retrace_end = val;
+ break;
+ case CRTC_VERT_DISP_END:
+ sc->vga_crtc.crtc_vert_disp_end = val;
+ break;
+ case CRTC_OFFSET:
+ sc->vga_crtc.crtc_offset = val;
+ break;
+ case CRTC_UNDERLINE_LOC:
+ sc->vga_crtc.crtc_underline_loc = val;
+ break;
+ case CRTC_START_VERT_BLANK:
+ sc->vga_crtc.crtc_start_vert_blank = val;
+ break;
+ case CRTC_END_VERT_BLANK:
+ sc->vga_crtc.crtc_end_vert_blank = val;
+ break;
+ case CRTC_MODE_CONTROL:
+ sc->vga_crtc.crtc_mode_ctrl = val;
+ break;
+ case CRTC_LINE_COMPARE:
+ sc->vga_crtc.crtc_line_compare = val;
+ break;
+ default:
+ //printf("XXX VGA CRTC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_crtc.crtc_index);
+ assert(0);
+ break;
+ }
+ break;
+ case ATC_IDX_PORT:
+ if (sc->vga_atc.atc_flipflop == 0) {
+ if (sc->vga_atc.atc_index & 0x20)
+ assert(0);
+ sc->vga_atc.atc_index = val & ATC_IDX_MASK;
+ } else {
+ switch (sc->vga_atc.atc_index) {
+ case ATC_PALETTE0 ... ATC_PALETTE15:
+ sc->vga_atc.atc_palette[sc->vga_atc.atc_index] = val & 0x3f;
+ break;
+ case ATC_MODE_CONTROL:
+ sc->vga_atc.atc_mode = val;
+ break;
+ case ATC_OVERSCAN_COLOR:
+ sc->vga_atc.atc_overscan_color = val;
+ break;
+ case ATC_COLOR_PLANE_ENABLE:
+ sc->vga_atc.atc_color_plane_enb = val;
+ break;
+ case ATC_HORIZ_PIXEL_PANNING:
+ sc->vga_atc.atc_horiz_pixel_panning = val;
+ break;
+ case ATC_COLOR_SELECT:
+ sc->vga_atc.atc_color_select = val;
+ sc->vga_atc.atc_color_select_45 =
+ (val & ATC_CS_C45) << 4;
+ sc->vga_atc.atc_color_select_67 =
+ (val & ATC_CS_C67) << 6;
+ break;
+ default:
+ //printf("XXX VGA ATC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_atc.atc_index);
+ assert(0);
+ break;
+ }
+ }
+ sc->vga_atc.atc_flipflop ^= 1;
+ break;
+ case ATC_DATA_PORT:
+ break;
+ case SEQ_IDX_PORT:
+ sc->vga_seq.seq_index = val & 0x1f;
+ break;
+ case SEQ_DATA_PORT:
+ switch (sc->vga_seq.seq_index) {
+ case SEQ_RESET:
+ sc->vga_seq.seq_reset = val;
+ break;
+ case SEQ_CLOCKING_MODE:
+ sc->vga_seq.seq_clock_mode = val;
+ sc->vga_seq.seq_cm_dots = (val & SEQ_CM_89) ? 8 : 9;
+ break;
+ case SEQ_MAP_MASK:
+ sc->vga_seq.seq_map_mask = val;
+ break;
+ case SEQ_CHAR_MAP_SELECT:
+ sc->vga_seq.seq_cmap_sel = val;
+
+ sc->vga_seq.seq_cmap_pri_off = ((((val & SEQ_CMS_SA) >> SEQ_CMS_SA_SHIFT) * 2) + ((val & SEQ_CMS_SAH) >> SEQ_CMS_SAH_SHIFT)) * 8 * KB;
+ sc->vga_seq.seq_cmap_sec_off = ((((val & SEQ_CMS_SB) >> SEQ_CMS_SB_SHIFT) * 2) + ((val & SEQ_CMS_SBH) >> SEQ_CMS_SBH_SHIFT)) * 8 * KB;
+ break;
+ case SEQ_MEMORY_MODE:
+ sc->vga_seq.seq_mm = val;
+ assert((sc->vga_seq.seq_mm & SEQ_MM_C4) == 0);
+ break;
+ default:
+ //printf("XXX VGA SEQ: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_seq.seq_index);
+ assert(0);
+ break;
+ }
+ break;
+ case DAC_MASK:
+ break;
+ case DAC_IDX_RD_PORT:
+ sc->vga_dac.dac_rd_index = val;
+ sc->vga_dac.dac_rd_subindex = 0;
+ break;
+ case DAC_IDX_WR_PORT:
+ sc->vga_dac.dac_wr_index = val;
+ sc->vga_dac.dac_wr_subindex = 0;
+ break;
+ case DAC_DATA_PORT:
+ sc->vga_dac.dac_palette[3 * sc->vga_dac.dac_wr_index +
+ sc->vga_dac.dac_wr_subindex] = val;
+ sc->vga_dac.dac_wr_subindex++;
+ if (sc->vga_dac.dac_wr_subindex == 3) {
+ sc->vga_dac.dac_palette_rgb[sc->vga_dac.dac_wr_index] =
+ ((((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] << 2) |
+ ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] & 0x1) << 1) |
+ (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 0] & 0x1)) << 16) |
+ (((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] << 2) |
+ ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] & 0x1) << 1) |
+ (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 1] & 0x1)) << 8) |
+ (((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] << 2) |
+ ((sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] & 0x1) << 1) |
+ (sc->vga_dac.dac_palette[3*sc->vga_dac.dac_wr_index + 2] & 0x1)) << 0));
+
+ sc->vga_dac.dac_wr_index++;
+ sc->vga_dac.dac_wr_subindex = 0;
+ }
+ break;
+ case GC_IDX_PORT:
+ sc->vga_gc.gc_index = val;
+ break;
+ case GC_DATA_PORT:
+ switch (sc->vga_gc.gc_index) {
+ case GC_SET_RESET:
+ sc->vga_gc.gc_set_reset = val;
+ break;
+ case GC_ENABLE_SET_RESET:
+ sc->vga_gc.gc_enb_set_reset = val;
+ break;
+ case GC_COLOR_COMPARE:
+ sc->vga_gc.gc_color_compare = val;
+ break;
+ case GC_DATA_ROTATE:
+ sc->vga_gc.gc_rotate = val;
+ sc->vga_gc.gc_op = (val >> 3) & 0x3;
+ break;
+ case GC_READ_MAP_SELECT:
+ sc->vga_gc.gc_read_map_sel = val;
+ break;
+ case GC_MODE:
+ sc->vga_gc.gc_mode = val;
+ sc->vga_gc.gc_mode_c4 = (val & GC_MODE_C4) != 0;
+ assert(!sc->vga_gc.gc_mode_c4);
+ sc->vga_gc.gc_mode_oe = (val & GC_MODE_OE) != 0;
+ sc->vga_gc.gc_mode_rm = (val >> 3) & 0x1;
+ sc->vga_gc.gc_mode_wm = val & 0x3;
+ break;
+ case GC_MISCELLANEOUS:
+ sc->vga_gc.gc_misc = val;
+ sc->vga_gc.gc_misc_gm = val & GC_MISC_GM;
+ sc->vga_gc.gc_misc_mm = (val & GC_MISC_MM) >>
+ GC_MISC_MM_SHIFT;
+ break;
+ case GC_COLOR_DONT_CARE:
+ sc->vga_gc.gc_color_dont_care = val;
+ break;
+ case GC_BIT_MASK:
+ sc->vga_gc.gc_bit_mask = val;
+ break;
+ default:
+ //printf("XXX VGA GC: outb 0x%04x, 0x%02x at index %d\n", port, val, sc->vga_gc.gc_index);
+ assert(0);
+ break;
+ }
+ break;
+ case GEN_INPUT_STS0_PORT:
+ /* write to Miscellaneous Output Register */
+ sc->vga_misc = val;
+ break;
+ case GEN_INPUT_STS1_MONO_PORT:
+ case GEN_INPUT_STS1_COLOR_PORT:
+ /* write to Feature Control Register */
+ break;
+ default:
+ printf("XXX vga_port_out_handler() unhandled port 0x%x\n", port);
+ //assert(0);
+ return (-1);
+ }
+ return (0);
+}
+
+static int
+vga_port_handler(struct vmctx *ctx, int vcpu, int in, int port, int bytes,
+ uint32_t *eax, void *arg)
+{
+ uint8_t val;
+ int error;
+
+ switch (bytes) {
+ case 1:
+ if (in) {
+ *eax &= ~0xff;
+ error = vga_port_in_handler(ctx, in, port, 1,
+ &val, arg);
+ if (!error) {
+ *eax |= val & 0xff;
+ }
+ } else {
+ val = *eax & 0xff;
+ error = vga_port_out_handler(ctx, in, port, 1,
+ val, arg);
+ }
+ break;
+ case 2:
+ if (in) {
+ *eax &= ~0xffff;
+ error = vga_port_in_handler(ctx, in, port, 1,
+ &val, arg);
+ if (!error) {
+ *eax |= val & 0xff;
+ }
+ error = vga_port_in_handler(ctx, in, port + 1, 1,
+ &val, arg);
+ if (!error) {
+ *eax |= (val & 0xff) << 8;
+ }
+ } else {
+ val = *eax & 0xff;
+ error = vga_port_out_handler(ctx, in, port, 1,
+ val, arg);
+ val = (*eax >> 8) & 0xff;
+ error =vga_port_out_handler(ctx, in, port + 1, 1,
+ val, arg);
+ }
+ break;
+ default:
+ assert(0);
+ return (-1);
+ }
+
+ return (error);
+}
+
+int
+vga_init(void)
+{
+ struct inout_port iop;
+ struct vga_softc *sc;
+ int port, error;
+
+ sc = calloc(1, sizeof(struct vga_softc));
+
+ bzero(&iop, sizeof(struct inout_port));
+ iop.name = "VGA";
+ for (port = VGA_IOPORT_START; port <= VGA_IOPORT_END; port++) {
+ iop.port = port;
+ iop.size = 1;
+ iop.flags = IOPORT_F_INOUT;
+ iop.handler = vga_port_handler;
+ iop.arg = sc;
+
+ error = register_inout(&iop);
+ assert(error == 0);
+ }
+
+ sc->mr.name = "VGA memory";
+ sc->mr.flags = MEM_F_RW;
+ sc->mr.base = 640 * KB;
+ sc->mr.size = 128 * KB;
+ sc->mr.handler = vga_mem_handler;
+ sc->mr.arg1 = sc;
+ error = register_mem_fallback(&sc->mr);
+ assert(error == 0);
+
+ sc->vga_ram = malloc(256 * KB);
+ memset(sc->vga_ram, 0, 256 * KB);
+
+ sc->gc_image = console_get_image();
+ console_fb_register(vga_render, sc);
+
+ return (0);
+}
diff --git a/usr/src/cmd/bhyve/vga.h b/usr/src/cmd/bhyve/vga.h
new file mode 100644
index 0000000000..14637b12b3
--- /dev/null
+++ b/usr/src/cmd/bhyve/vga.h
@@ -0,0 +1,160 @@
+/*-
+ * Copyright (c) 2015 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _VGA_H_
+#define _VGA_H_
+
+#define VGA_IOPORT_START 0x3c0
+#define VGA_IOPORT_END 0x3df
+
+/* General registers */
+#define GEN_INPUT_STS0_PORT 0x3c2
+#define GEN_FEATURE_CTRL_PORT 0x3ca
+#define GEN_MISC_OUTPUT_PORT 0x3cc
+#define GEN_INPUT_STS1_MONO_PORT 0x3ba
+#define GEN_INPUT_STS1_COLOR_PORT 0x3da
+#define GEN_IS1_VR 0x08 /* Vertical retrace */
+#define GEN_IS1_DE 0x01 /* Display enable not */
+
+/* Attribute controller registers. */
+#define ATC_IDX_PORT 0x3c0
+#define ATC_DATA_PORT 0x3c1
+
+#define ATC_IDX_MASK 0x1f
+#define ATC_PALETTE0 0
+#define ATC_PALETTE15 15
+#define ATC_MODE_CONTROL 16
+#define ATC_MC_IPS 0x80 /* Internal palette size */
+#define ATC_MC_GA 0x01 /* Graphics/alphanumeric */
+#define ATC_OVERSCAN_COLOR 17
+#define ATC_COLOR_PLANE_ENABLE 18
+#define ATC_HORIZ_PIXEL_PANNING 19
+#define ATC_COLOR_SELECT 20
+#define ATC_CS_C67 0x0c /* Color select bits 6+7 */
+#define ATC_CS_C45 0x03 /* Color select bits 4+5 */
+
+/* Sequencer registers. */
+#define SEQ_IDX_PORT 0x3c4
+#define SEQ_DATA_PORT 0x3c5
+
+#define SEQ_RESET 0
+#define SEQ_RESET_ASYNC 0x1
+#define SEQ_RESET_SYNC 0x2
+#define SEQ_CLOCKING_MODE 1
+#define SEQ_CM_SO 0x20 /* Screen off */
+#define SEQ_CM_89 0x01 /* 8/9 dot clock */
+#define SEQ_MAP_MASK 2
+#define SEQ_CHAR_MAP_SELECT 3
+#define SEQ_CMS_SAH 0x20 /* Char map A bit 2 */
+#define SEQ_CMS_SAH_SHIFT 5
+#define SEQ_CMS_SA 0x0c /* Char map A bits 0+1 */
+#define SEQ_CMS_SA_SHIFT 2
+#define SEQ_CMS_SBH 0x10 /* Char map B bit 2 */
+#define SEQ_CMS_SBH_SHIFT 4
+#define SEQ_CMS_SB 0x03 /* Char map B bits 0+1 */
+#define SEQ_CMS_SB_SHIFT 0
+#define SEQ_MEMORY_MODE 4
+#define SEQ_MM_C4 0x08 /* Chain 4 */
+#define SEQ_MM_OE 0x04 /* Odd/even */
+#define SEQ_MM_EM 0x02 /* Extended memory */
+
+/* Graphics controller registers. */
+#define GC_IDX_PORT 0x3ce
+#define GC_DATA_PORT 0x3cf
+
+#define GC_SET_RESET 0
+#define GC_ENABLE_SET_RESET 1
+#define GC_COLOR_COMPARE 2
+#define GC_DATA_ROTATE 3
+#define GC_READ_MAP_SELECT 4
+#define GC_MODE 5
+#define GC_MODE_OE 0x10 /* Odd/even */
+#define GC_MODE_C4 0x04 /* Chain 4 */
+
+#define GC_MISCELLANEOUS 6
+#define GC_MISC_GM 0x01 /* Graphics/alphanumeric */
+#define GC_MISC_MM 0x0c /* memory map */
+#define GC_MISC_MM_SHIFT 2
+#define GC_COLOR_DONT_CARE 7
+#define GC_BIT_MASK 8
+
+/* CRT controller registers. */
+#define CRTC_IDX_MONO_PORT 0x3b4
+#define CRTC_DATA_MONO_PORT 0x3b5
+#define CRTC_IDX_COLOR_PORT 0x3d4
+#define CRTC_DATA_COLOR_PORT 0x3d5
+
+#define CRTC_HORIZ_TOTAL 0
+#define CRTC_HORIZ_DISP_END 1
+#define CRTC_START_HORIZ_BLANK 2
+#define CRTC_END_HORIZ_BLANK 3
+#define CRTC_START_HORIZ_RETRACE 4
+#define CRTC_END_HORIZ_RETRACE 5
+#define CRTC_VERT_TOTAL 6
+#define CRTC_OVERFLOW 7
+#define CRTC_OF_VRS9 0x80 /* VRS bit 9 */
+#define CRTC_OF_VRS9_SHIFT 7
+#define CRTC_OF_VDE9 0x40 /* VDE bit 9 */
+#define CRTC_OF_VDE9_SHIFT 6
+#define CRTC_OF_VRS8 0x04 /* VRS bit 8 */
+#define CRTC_OF_VRS8_SHIFT 2
+#define CRTC_OF_VDE8 0x02 /* VDE bit 8 */
+#define CRTC_OF_VDE8_SHIFT 1
+#define CRTC_PRESET_ROW_SCAN 8
+#define CRTC_MAX_SCAN_LINE 9
+#define CRTC_MSL_MSL 0x1f
+#define CRTC_CURSOR_START 10
+#define CRTC_CS_CO 0x20 /* Cursor off */
+#define CRTC_CS_CS 0x1f /* Cursor start */
+#define CRTC_CURSOR_END 11
+#define CRTC_CE_CE 0x1f /* Cursor end */
+#define CRTC_START_ADDR_HIGH 12
+#define CRTC_START_ADDR_LOW 13
+#define CRTC_CURSOR_LOC_HIGH 14
+#define CRTC_CURSOR_LOC_LOW 15
+#define CRTC_VERT_RETRACE_START 16
+#define CRTC_VERT_RETRACE_END 17
+#define CRTC_VRE_MASK 0xf
+#define CRTC_VERT_DISP_END 18
+#define CRTC_OFFSET 19
+#define CRTC_UNDERLINE_LOC 20
+#define CRTC_START_VERT_BLANK 21
+#define CRTC_END_VERT_BLANK 22
+#define CRTC_MODE_CONTROL 23
+#define CRTC_MC_TE 0x80 /* Timing enable */
+#define CRTC_LINE_COMPARE 24
+
+/* DAC registers */
+#define DAC_MASK 0x3c6
+#define DAC_IDX_RD_PORT 0x3c7
+#define DAC_IDX_WR_PORT 0x3c8
+#define DAC_DATA_PORT 0x3c9
+
+int vga_init(void);
+
+#endif /* _VGA_H_ */
diff --git a/usr/src/cmd/bhyve/virtio.c b/usr/src/cmd/bhyve/virtio.c
new file mode 100644
index 0000000000..c3b11dc439
--- /dev/null
+++ b/usr/src/cmd/bhyve/virtio.c
@@ -0,0 +1,755 @@
+/*-
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/virtio.c 270326 2014-08-22 13:01:22Z tychon $");
+
+#include <sys/param.h>
+#include <sys/uio.h>
+
+#include <stdio.h>
+#include <stdint.h>
+#include <pthread.h>
+#include <pthread_np.h>
+
+#include "bhyverun.h"
+#include "pci_emul.h"
+#include "virtio.h"
+
+/*
+ * Functions for dealing with generalized "virtual devices" as
+ * defined by <https://www.google.com/#output=search&q=virtio+spec>
+ */
+
+/*
+ * In case we decide to relax the "virtio softc comes at the
+ * front of virtio-based device softc" constraint, let's use
+ * this to convert.
+ */
+#define DEV_SOFTC(vs) ((void *)(vs))
+
+/*
+ * Link a virtio_softc to its constants, the device softc, and
+ * the PCI emulation.
+ */
+void
+vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
+ void *dev_softc, struct pci_devinst *pi,
+ struct vqueue_info *queues)
+{
+ int i;
+
+ /* vs and dev_softc addresses must match */
+ assert((void *)vs == dev_softc);
+ vs->vs_vc = vc;
+ vs->vs_pi = pi;
+ pi->pi_arg = vs;
+
+ vs->vs_queues = queues;
+ for (i = 0; i < vc->vc_nvq; i++) {
+ queues[i].vq_vs = vs;
+ queues[i].vq_num = i;
+ }
+}
+
+/*
+ * Reset device (device-wide). This erases all queues, i.e.,
+ * all the queues become invalid (though we don't wipe out the
+ * internal pointers, we just clear the VQ_ALLOC flag).
+ *
+ * It resets negotiated features to "none".
+ *
+ * If MSI-X is enabled, this also resets all the vectors to NO_VECTOR.
+ */
+void
+vi_reset_dev(struct virtio_softc *vs)
+{
+ struct vqueue_info *vq;
+ int i, nvq;
+
+ if (vs->vs_mtx)
+ assert(pthread_mutex_isowned_np(vs->vs_mtx));
+
+ nvq = vs->vs_vc->vc_nvq;
+ for (vq = vs->vs_queues, i = 0; i < nvq; vq++, i++) {
+ vq->vq_flags = 0;
+ vq->vq_last_avail = 0;
+ vq->vq_pfn = 0;
+ vq->vq_msix_idx = VIRTIO_MSI_NO_VECTOR;
+ }
+ vs->vs_negotiated_caps = 0;
+ vs->vs_curq = 0;
+ /* vs->vs_status = 0; -- redundant */
+ if (vs->vs_isr)
+ pci_lintr_deassert(vs->vs_pi);
+ vs->vs_isr = 0;
+ vs->vs_msix_cfg_idx = VIRTIO_MSI_NO_VECTOR;
+}
+
+/*
+ * Set I/O BAR (usually 0) to map PCI config registers.
+ */
+void
+vi_set_io_bar(struct virtio_softc *vs, int barnum)
+{
+ size_t size;
+
+ /*
+ * ??? should we use CFG0 if MSI-X is disabled?
+ * Existing code did not...
+ */
+ size = VTCFG_R_CFG1 + vs->vs_vc->vc_cfgsize;
+ pci_emul_alloc_bar(vs->vs_pi, barnum, PCIBAR_IO, size);
+}
+
+/*
+ * Initialize MSI-X vector capabilities if we're to use MSI-X,
+ * or MSI capabilities if not.
+ *
+ * We assume we want one MSI-X vector per queue, here, plus one
+ * for the config vec.
+ */
+int
+vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix)
+{
+ int nvec;
+
+ if (use_msix) {
+ vs->vs_flags |= VIRTIO_USE_MSIX;
+ VS_LOCK(vs);
+ vi_reset_dev(vs); /* set all vectors to NO_VECTOR */
+ VS_UNLOCK(vs);
+ nvec = vs->vs_vc->vc_nvq + 1;
+ if (pci_emul_add_msixcap(vs->vs_pi, nvec, barnum))
+ return (1);
+ } else
+ vs->vs_flags &= ~VIRTIO_USE_MSIX;
+ /* Only 1 MSI vector for bhyve */
+ pci_emul_add_msicap(vs->vs_pi, 1);
+ return (0);
+}
+
+/*
+ * Initialize the currently-selected virtio queue (vs->vs_curq).
+ * The guest just gave us a page frame number, from which we can
+ * calculate the addresses of the queue.
+ */
+void
+vi_vq_init(struct virtio_softc *vs, uint32_t pfn)
+{
+ struct vqueue_info *vq;
+ uint64_t phys;
+ size_t size;
+ char *base;
+
+ vq = &vs->vs_queues[vs->vs_curq];
+ vq->vq_pfn = pfn;
+ phys = (uint64_t)pfn << VRING_PFN;
+ size = vring_size(vq->vq_qsize);
+ base = paddr_guest2host(vs->vs_pi->pi_vmctx, phys, size);
+
+ /* First page(s) are descriptors... */
+ vq->vq_desc = (struct virtio_desc *)base;
+ base += vq->vq_qsize * sizeof(struct virtio_desc);
+
+ /* ... immediately followed by "avail" ring (entirely uint16_t's) */
+ vq->vq_avail = (struct vring_avail *)base;
+ base += (2 + vq->vq_qsize + 1) * sizeof(uint16_t);
+
+ /* Then it's rounded up to the next page... */
+ base = (char *)roundup2((uintptr_t)base, VRING_ALIGN);
+
+ /* ... and the last page(s) are the used ring. */
+ vq->vq_used = (struct vring_used *)base;
+
+ /* Mark queue as allocated, and start at 0 when we use it. */
+ vq->vq_flags = VQ_ALLOC;
+ vq->vq_last_avail = 0;
+}
+
+/*
+ * Helper inline for vq_getchain(): record the i'th "real"
+ * descriptor.
+ */
+static inline void
+_vq_record(int i, volatile struct virtio_desc *vd, struct vmctx *ctx,
+ struct iovec *iov, int n_iov, uint16_t *flags) {
+
+ if (i >= n_iov)
+ return;
+ iov[i].iov_base = paddr_guest2host(ctx, vd->vd_addr, vd->vd_len);
+ iov[i].iov_len = vd->vd_len;
+ if (flags != NULL)
+ flags[i] = vd->vd_flags;
+}
+#define VQ_MAX_DESCRIPTORS 512 /* see below */
+
+/*
+ * Examine the chain of descriptors starting at the "next one" to
+ * make sure that they describe a sensible request. If so, return
+ * the number of "real" descriptors that would be needed/used in
+ * acting on this request. This may be smaller than the number of
+ * available descriptors, e.g., if there are two available but
+ * they are two separate requests, this just returns 1. Or, it
+ * may be larger: if there are indirect descriptors involved,
+ * there may only be one descriptor available but it may be an
+ * indirect pointing to eight more. We return 8 in this case,
+ * i.e., we do not count the indirect descriptors, only the "real"
+ * ones.
+ *
+ * Basically, this vets the vd_flags and vd_next field of each
+ * descriptor and tells you how many are involved. Since some may
+ * be indirect, this also needs the vmctx (in the pci_devinst
+ * at vs->vs_pi) so that it can find indirect descriptors.
+ *
+ * As we process each descriptor, we copy and adjust it (guest to
+ * host address wise, also using the vmtctx) into the given iov[]
+ * array (of the given size). If the array overflows, we stop
+ * placing values into the array but keep processing descriptors,
+ * up to VQ_MAX_DESCRIPTORS, before giving up and returning -1.
+ * So you, the caller, must not assume that iov[] is as big as the
+ * return value (you can process the same thing twice to allocate
+ * a larger iov array if needed, or supply a zero length to find
+ * out how much space is needed).
+ *
+ * If you want to verify the WRITE flag on each descriptor, pass a
+ * non-NULL "flags" pointer to an array of "uint16_t" of the same size
+ * as n_iov and we'll copy each vd_flags field after unwinding any
+ * indirects.
+ *
+ * If some descriptor(s) are invalid, this prints a diagnostic message
+ * and returns -1. If no descriptors are ready now it simply returns 0.
+ *
+ * You are assumed to have done a vq_ring_ready() if needed (note
+ * that vq_has_descs() does one).
+ */
+int
+vq_getchain(struct vqueue_info *vq,
+ struct iovec *iov, int n_iov, uint16_t *flags)
+{
+ int i;
+ u_int ndesc, n_indir;
+ u_int idx, head, next;
+ volatile struct virtio_desc *vdir, *vindir, *vp;
+ struct vmctx *ctx;
+ struct virtio_softc *vs;
+ const char *name;
+
+ vs = vq->vq_vs;
+ name = vs->vs_vc->vc_name;
+
+ /*
+ * Note: it's the responsibility of the guest not to
+ * update vq->vq_avail->va_idx until all of the descriptors
+ * the guest has written are valid (including all their
+ * vd_next fields and vd_flags).
+ *
+ * Compute (last_avail - va_idx) in integers mod 2**16. This is
+ * the number of descriptors the device has made available
+ * since the last time we updated vq->vq_last_avail.
+ *
+ * We just need to do the subtraction as an unsigned int,
+ * then trim off excess bits.
+ */
+ idx = vq->vq_last_avail;
+ ndesc = (uint16_t)((u_int)vq->vq_avail->va_idx - idx);
+ if (ndesc == 0)
+ return (0);
+ if (ndesc > vq->vq_qsize) {
+ /* XXX need better way to diagnose issues */
+ fprintf(stderr,
+ "%s: ndesc (%u) out of range, driver confused?\r\n",
+ name, (u_int)ndesc);
+ return (-1);
+ }
+
+ /*
+ * Now count/parse "involved" descriptors starting from
+ * the head of the chain.
+ *
+ * To prevent loops, we could be more complicated and
+ * check whether we're re-visiting a previously visited
+ * index, but we just abort if the count gets excessive.
+ */
+ ctx = vs->vs_pi->pi_vmctx;
+ head = vq->vq_avail->va_ring[idx & (vq->vq_qsize - 1)];
+ next = head;
+ for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) {
+ if (next >= vq->vq_qsize) {
+ fprintf(stderr,
+ "%s: descriptor index %u out of range, "
+ "driver confused?\r\n",
+ name, next);
+ return (-1);
+ }
+ vdir = &vq->vq_desc[next];
+ if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) {
+ _vq_record(i, vdir, ctx, iov, n_iov, flags);
+ i++;
+ } else if ((vs->vs_negotiated_caps &
+ VIRTIO_RING_F_INDIRECT_DESC) == 0) {
+ fprintf(stderr,
+ "%s: descriptor has forbidden INDIRECT flag, "
+ "driver confused?\r\n",
+ name);
+ return (-1);
+ } else {
+ n_indir = vdir->vd_len / 16;
+ if ((vdir->vd_len & 0xf) || n_indir == 0) {
+ fprintf(stderr,
+ "%s: invalid indir len 0x%x, "
+ "driver confused?\r\n",
+ name, (u_int)vdir->vd_len);
+ return (-1);
+ }
+ vindir = paddr_guest2host(ctx,
+ vdir->vd_addr, vdir->vd_len);
+ /*
+ * Indirects start at the 0th, then follow
+ * their own embedded "next"s until those run
+ * out. Each one's indirect flag must be off
+ * (we don't really have to check, could just
+ * ignore errors...).
+ */
+ next = 0;
+ for (;;) {
+ vp = &vindir[next];
+ if (vp->vd_flags & VRING_DESC_F_INDIRECT) {
+ fprintf(stderr,
+ "%s: indirect desc has INDIR flag,"
+ " driver confused?\r\n",
+ name);
+ return (-1);
+ }
+ _vq_record(i, vp, ctx, iov, n_iov, flags);
+ if (++i > VQ_MAX_DESCRIPTORS)
+ goto loopy;
+ if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0)
+ break;
+ next = vp->vd_next;
+ if (next >= n_indir) {
+ fprintf(stderr,
+ "%s: invalid next %u > %u, "
+ "driver confused?\r\n",
+ name, (u_int)next, n_indir);
+ return (-1);
+ }
+ }
+ }
+ if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0)
+ return (i);
+ }
+loopy:
+ fprintf(stderr,
+ "%s: descriptor loop? count > %d - driver confused?\r\n",
+ name, i);
+ return (-1);
+}
+
+/*
+ * Return the currently-first request chain to the guest, setting
+ * its I/O length to the provided value.
+ *
+ * (This chain is the one you handled when you called vq_getchain()
+ * and used its positive return value.)
+ */
+void
+vq_relchain(struct vqueue_info *vq, uint32_t iolen)
+{
+ uint16_t head, uidx, mask;
+ volatile struct vring_used *vuh;
+ volatile struct virtio_used *vue;
+
+ /*
+ * Notes:
+ * - mask is N-1 where N is a power of 2 so computes x % N
+ * - vuh points to the "used" data shared with guest
+ * - vue points to the "used" ring entry we want to update
+ * - head is the same value we compute in vq_iovecs().
+ *
+ * (I apologize for the two fields named vu_idx; the
+ * virtio spec calls the one that vue points to, "id"...)
+ */
+ mask = vq->vq_qsize - 1;
+ vuh = vq->vq_used;
+ head = vq->vq_avail->va_ring[vq->vq_last_avail++ & mask];
+
+ uidx = vuh->vu_idx;
+ vue = &vuh->vu_ring[uidx++ & mask];
+ vue->vu_idx = head; /* ie, vue->id = head */
+ vue->vu_tlen = iolen;
+ vuh->vu_idx = uidx;
+}
+
+/*
+ * Driver has finished processing "available" chains and calling
+ * vq_relchain on each one. If driver used all the available
+ * chains, used_all should be set.
+ *
+ * If the "used" index moved we may need to inform the guest, i.e.,
+ * deliver an interrupt. Even if the used index did NOT move we
+ * may need to deliver an interrupt, if the avail ring is empty and
+ * we are supposed to interrupt on empty.
+ *
+ * Note that used_all_avail is provided by the caller because it's
+ * a snapshot of the ring state when he decided to finish interrupt
+ * processing -- it's possible that descriptors became available after
+ * that point. (It's also typically a constant 1/True as well.)
+ */
+void
+vq_endchains(struct vqueue_info *vq, int used_all_avail)
+{
+ struct virtio_softc *vs;
+ uint16_t event_idx, new_idx, old_idx;
+ int intr;
+
+ /*
+ * Interrupt generation: if we're using EVENT_IDX,
+ * interrupt if we've crossed the event threshold.
+ * Otherwise interrupt is generated if we added "used" entries,
+ * but suppressed by VRING_AVAIL_F_NO_INTERRUPT.
+ *
+ * In any case, though, if NOTIFY_ON_EMPTY is set and the
+ * entire avail was processed, we need to interrupt always.
+ */
+ vs = vq->vq_vs;
+ new_idx = vq->vq_used->vu_idx;
+ old_idx = vq->vq_save_used;
+ if (used_all_avail &&
+ (vs->vs_negotiated_caps & VIRTIO_F_NOTIFY_ON_EMPTY))
+ intr = 1;
+ else if (vs->vs_negotiated_caps & VIRTIO_RING_F_EVENT_IDX) {
+ event_idx = VQ_USED_EVENT_IDX(vq);
+ /*
+ * This calculation is per docs and the kernel
+ * (see src/sys/dev/virtio/virtio_ring.h).
+ */
+ intr = (uint16_t)(new_idx - event_idx - 1) <
+ (uint16_t)(new_idx - old_idx);
+ } else {
+ intr = new_idx != old_idx &&
+ !(vq->vq_avail->va_flags & VRING_AVAIL_F_NO_INTERRUPT);
+ }
+ if (intr)
+ vq_interrupt(vs, vq);
+}
+
+/* Note: these are in sorted order to make for a fast search */
+static struct config_reg {
+ uint16_t cr_offset; /* register offset */
+ uint8_t cr_size; /* size (bytes) */
+ uint8_t cr_ro; /* true => reg is read only */
+ const char *cr_name; /* name of reg */
+} config_regs[] = {
+ { VTCFG_R_HOSTCAP, 4, 1, "HOSTCAP" },
+ { VTCFG_R_GUESTCAP, 4, 0, "GUESTCAP" },
+ { VTCFG_R_PFN, 4, 0, "PFN" },
+ { VTCFG_R_QNUM, 2, 1, "QNUM" },
+ { VTCFG_R_QSEL, 2, 0, "QSEL" },
+ { VTCFG_R_QNOTIFY, 2, 0, "QNOTIFY" },
+ { VTCFG_R_STATUS, 1, 0, "STATUS" },
+ { VTCFG_R_ISR, 1, 0, "ISR" },
+ { VTCFG_R_CFGVEC, 2, 0, "CFGVEC" },
+ { VTCFG_R_QVEC, 2, 0, "QVEC" },
+};
+
+static inline struct config_reg *
+vi_find_cr(int offset) {
+ u_int hi, lo, mid;
+ struct config_reg *cr;
+
+ lo = 0;
+ hi = sizeof(config_regs) / sizeof(*config_regs) - 1;
+ while (hi >= lo) {
+ mid = (hi + lo) >> 1;
+ cr = &config_regs[mid];
+ if (cr->cr_offset == offset)
+ return (cr);
+ if (cr->cr_offset < offset)
+ lo = mid + 1;
+ else
+ hi = mid - 1;
+ }
+ return (NULL);
+}
+
+/*
+ * Handle pci config space reads.
+ * If it's to the MSI-X info, do that.
+ * If it's part of the virtio standard stuff, do that.
+ * Otherwise dispatch to the actual driver.
+ */
+uint64_t
+vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size)
+{
+ struct virtio_softc *vs = pi->pi_arg;
+ struct virtio_consts *vc;
+ struct config_reg *cr;
+ uint64_t virtio_config_size, max;
+ const char *name;
+ uint32_t newoff;
+ uint32_t value;
+ int error;
+
+ if (vs->vs_flags & VIRTIO_USE_MSIX) {
+ if (baridx == pci_msix_table_bar(pi) ||
+ baridx == pci_msix_pba_bar(pi)) {
+ return (pci_emul_msix_tread(pi, offset, size));
+ }
+ }
+
+ /* XXX probably should do something better than just assert() */
+ assert(baridx == 0);
+
+ if (vs->vs_mtx)
+ pthread_mutex_lock(vs->vs_mtx);
+
+ vc = vs->vs_vc;
+ name = vc->vc_name;
+ value = size == 1 ? 0xff : size == 2 ? 0xffff : 0xffffffff;
+
+ if (size != 1 && size != 2 && size != 4)
+ goto bad;
+
+ if (pci_msix_enabled(pi))
+ virtio_config_size = VTCFG_R_CFG1;
+ else
+ virtio_config_size = VTCFG_R_CFG0;
+
+ if (offset >= virtio_config_size) {
+ /*
+ * Subtract off the standard size (including MSI-X
+ * registers if enabled) and dispatch to underlying driver.
+ * If that fails, fall into general code.
+ */
+ newoff = offset - virtio_config_size;
+ max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
+ if (newoff + size > max)
+ goto bad;
+ error = (*vc->vc_cfgread)(DEV_SOFTC(vs), newoff, size, &value);
+ if (!error)
+ goto done;
+ }
+
+bad:
+ cr = vi_find_cr(offset);
+ if (cr == NULL || cr->cr_size != size) {
+ if (cr != NULL) {
+ /* offset must be OK, so size must be bad */
+ fprintf(stderr,
+ "%s: read from %s: bad size %d\r\n",
+ name, cr->cr_name, size);
+ } else {
+ fprintf(stderr,
+ "%s: read from bad offset/size %jd/%d\r\n",
+ name, (uintmax_t)offset, size);
+ }
+ goto done;
+ }
+
+ switch (offset) {
+ case VTCFG_R_HOSTCAP:
+ value = vc->vc_hv_caps;
+ break;
+ case VTCFG_R_GUESTCAP:
+ value = vs->vs_negotiated_caps;
+ break;
+ case VTCFG_R_PFN:
+ if (vs->vs_curq < vc->vc_nvq)
+ value = vs->vs_queues[vs->vs_curq].vq_pfn;
+ break;
+ case VTCFG_R_QNUM:
+ value = vs->vs_curq < vc->vc_nvq ?
+ vs->vs_queues[vs->vs_curq].vq_qsize : 0;
+ break;
+ case VTCFG_R_QSEL:
+ value = vs->vs_curq;
+ break;
+ case VTCFG_R_QNOTIFY:
+ value = 0; /* XXX */
+ break;
+ case VTCFG_R_STATUS:
+ value = vs->vs_status;
+ break;
+ case VTCFG_R_ISR:
+ value = vs->vs_isr;
+ vs->vs_isr = 0; /* a read clears this flag */
+ if (value)
+ pci_lintr_deassert(pi);
+ break;
+ case VTCFG_R_CFGVEC:
+ value = vs->vs_msix_cfg_idx;
+ break;
+ case VTCFG_R_QVEC:
+ value = vs->vs_curq < vc->vc_nvq ?
+ vs->vs_queues[vs->vs_curq].vq_msix_idx :
+ VIRTIO_MSI_NO_VECTOR;
+ break;
+ }
+done:
+ if (vs->vs_mtx)
+ pthread_mutex_unlock(vs->vs_mtx);
+ return (value);
+}
+
+/*
+ * Handle pci config space writes.
+ * If it's to the MSI-X info, do that.
+ * If it's part of the virtio standard stuff, do that.
+ * Otherwise dispatch to the actual driver.
+ */
+void
+vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value)
+{
+ struct virtio_softc *vs = pi->pi_arg;
+ struct vqueue_info *vq;
+ struct virtio_consts *vc;
+ struct config_reg *cr;
+ uint64_t virtio_config_size, max;
+ const char *name;
+ uint32_t newoff;
+ int error;
+
+ if (vs->vs_flags & VIRTIO_USE_MSIX) {
+ if (baridx == pci_msix_table_bar(pi) ||
+ baridx == pci_msix_pba_bar(pi)) {
+ pci_emul_msix_twrite(pi, offset, size, value);
+ return;
+ }
+ }
+
+ /* XXX probably should do something better than just assert() */
+ assert(baridx == 0);
+
+ if (vs->vs_mtx)
+ pthread_mutex_lock(vs->vs_mtx);
+
+ vc = vs->vs_vc;
+ name = vc->vc_name;
+
+ if (size != 1 && size != 2 && size != 4)
+ goto bad;
+
+ if (pci_msix_enabled(pi))
+ virtio_config_size = VTCFG_R_CFG1;
+ else
+ virtio_config_size = VTCFG_R_CFG0;
+
+ if (offset >= virtio_config_size) {
+ /*
+ * Subtract off the standard size (including MSI-X
+ * registers if enabled) and dispatch to underlying driver.
+ */
+ newoff = offset - virtio_config_size;
+ max = vc->vc_cfgsize ? vc->vc_cfgsize : 0x100000000;
+ if (newoff + size > max)
+ goto bad;
+ error = (*vc->vc_cfgwrite)(DEV_SOFTC(vs), newoff, size, value);
+ if (!error)
+ goto done;
+ }
+
+bad:
+ cr = vi_find_cr(offset);
+ if (cr == NULL || cr->cr_size != size || cr->cr_ro) {
+ if (cr != NULL) {
+ /* offset must be OK, wrong size and/or reg is R/O */
+ if (cr->cr_size != size)
+ fprintf(stderr,
+ "%s: write to %s: bad size %d\r\n",
+ name, cr->cr_name, size);
+ if (cr->cr_ro)
+ fprintf(stderr,
+ "%s: write to read-only reg %s\r\n",
+ name, cr->cr_name);
+ } else {
+ fprintf(stderr,
+ "%s: write to bad offset/size %jd/%d\r\n",
+ name, (uintmax_t)offset, size);
+ }
+ goto done;
+ }
+
+ switch (offset) {
+ case VTCFG_R_GUESTCAP:
+ vs->vs_negotiated_caps = value & vc->vc_hv_caps;
+ break;
+ case VTCFG_R_PFN:
+ if (vs->vs_curq >= vc->vc_nvq)
+ goto bad_qindex;
+ vi_vq_init(vs, value);
+ break;
+ case VTCFG_R_QSEL:
+ /*
+ * Note that the guest is allowed to select an
+ * invalid queue; we just need to return a QNUM
+ * of 0 while the bad queue is selected.
+ */
+ vs->vs_curq = value;
+ break;
+ case VTCFG_R_QNOTIFY:
+ if (value >= vc->vc_nvq) {
+ fprintf(stderr, "%s: queue %d notify out of range\r\n",
+ name, (int)value);
+ goto done;
+ }
+ vq = &vs->vs_queues[value];
+ if (vq->vq_notify)
+ (*vq->vq_notify)(DEV_SOFTC(vs), vq);
+ else if (vc->vc_qnotify)
+ (*vc->vc_qnotify)(DEV_SOFTC(vs), vq);
+ else
+ fprintf(stderr,
+ "%s: qnotify queue %d: missing vq/vc notify\r\n",
+ name, (int)value);
+ break;
+ case VTCFG_R_STATUS:
+ vs->vs_status = value;
+ if (value == 0)
+ (*vc->vc_reset)(DEV_SOFTC(vs));
+ break;
+ case VTCFG_R_CFGVEC:
+ vs->vs_msix_cfg_idx = value;
+ break;
+ case VTCFG_R_QVEC:
+ if (vs->vs_curq >= vc->vc_nvq)
+ goto bad_qindex;
+ vq = &vs->vs_queues[vs->vs_curq];
+ vq->vq_msix_idx = value;
+ break;
+ }
+ goto done;
+
+bad_qindex:
+ fprintf(stderr,
+ "%s: write config reg %s: curq %d >= max %d\r\n",
+ name, cr->cr_name, vs->vs_curq, vc->vc_nvq);
+done:
+ if (vs->vs_mtx)
+ pthread_mutex_unlock(vs->vs_mtx);
+}
diff --git a/usr/src/cmd/bhyve/virtio.h b/usr/src/cmd/bhyve/virtio.h
new file mode 100644
index 0000000000..1a2ebe8118
--- /dev/null
+++ b/usr/src/cmd/bhyve/virtio.h
@@ -0,0 +1,475 @@
+/*-
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/virtio.h 268276 2014-07-05 02:38:53Z grehan $
+ */
+
+#ifndef _VIRTIO_H_
+#define _VIRTIO_H_
+
+/*
+ * These are derived from several virtio specifications.
+ *
+ * Some useful links:
+ * https://github.com/rustyrussell/virtio-spec
+ * http://people.redhat.com/pbonzini/virtio-spec.pdf
+ */
+
+/*
+ * A virtual device has zero or more "virtual queues" (virtqueue).
+ * Each virtqueue uses at least two 4096-byte pages, laid out thus:
+ *
+ * +-----------------------------------------------+
+ * | "desc": <N> descriptors, 16 bytes each |
+ * | ----------------------------------------- |
+ * | "avail": 2 uint16; <N> uint16; 1 uint16 |
+ * | ----------------------------------------- |
+ * | pad to 4k boundary |
+ * +-----------------------------------------------+
+ * | "used": 2 x uint16; <N> elems; 1 uint16 |
+ * | ----------------------------------------- |
+ * | pad to 4k boundary |
+ * +-----------------------------------------------+
+ *
+ * The number <N> that appears here is always a power of two and is
+ * limited to no more than 32768 (as it must fit in a 16-bit field).
+ * If <N> is sufficiently large, the above will occupy more than
+ * two pages. In any case, all pages must be physically contiguous
+ * within the guest's physical address space.
+ *
+ * The <N> 16-byte "desc" descriptors consist of a 64-bit guest
+ * physical address <addr>, a 32-bit length <len>, a 16-bit
+ * <flags>, and a 16-bit <next> field (all in guest byte order).
+ *
+ * There are three flags that may be set :
+ * NEXT descriptor is chained, so use its "next" field
+ * WRITE descriptor is for host to write into guest RAM
+ * (else host is to read from guest RAM)
+ * INDIRECT descriptor address field is (guest physical)
+ * address of a linear array of descriptors
+ *
+ * Unless INDIRECT is set, <len> is the number of bytes that may
+ * be read/written from guest physical address <addr>. If
+ * INDIRECT is set, WRITE is ignored and <len> provides the length
+ * of the indirect descriptors (and <len> must be a multiple of
+ * 16). Note that NEXT may still be set in the main descriptor
+ * pointing to the indirect, and should be set in each indirect
+ * descriptor that uses the next descriptor (these should generally
+ * be numbered sequentially). However, INDIRECT must not be set
+ * in the indirect descriptors. Upon reaching an indirect descriptor
+ * without a NEXT bit, control returns to the direct descriptors.
+ *
+ * Except inside an indirect, each <next> value must be in the
+ * range [0 .. N) (i.e., the half-open interval). (Inside an
+ * indirect, each <next> must be in the range [0 .. <len>/16).)
+ *
+ * The "avail" data structures reside in the same pages as the
+ * "desc" structures since both together are used by the device to
+ * pass information to the hypervisor's virtual driver. These
+ * begin with a 16-bit <flags> field and 16-bit index <idx>, then
+ * have <N> 16-bit <ring> values, followed by one final 16-bit
+ * field <used_event>. The <N> <ring> entries are simply indices
+ * indices into the descriptor ring (and thus must meet the same
+ * constraints as each <next> value). However, <idx> is counted
+ * up from 0 (initially) and simply wraps around after 65535; it
+ * is taken mod <N> to find the next available entry.
+ *
+ * The "used" ring occupies a separate page or pages, and contains
+ * values written from the virtual driver back to the guest OS.
+ * This begins with a 16-bit <flags> and 16-bit <idx>, then there
+ * are <N> "vring_used" elements, followed by a 16-bit <avail_event>.
+ * The <N> "vring_used" elements consist of a 32-bit <id> and a
+ * 32-bit <len> (vu_tlen below). The <id> is simply the index of
+ * the head of a descriptor chain the guest made available
+ * earlier, and the <len> is the number of bytes actually written,
+ * e.g., in the case of a network driver that provided a large
+ * receive buffer but received only a small amount of data.
+ *
+ * The two event fields, <used_event> and <avail_event>, in the
+ * avail and used rings (respectively -- note the reversal!), are
+ * always provided, but are used only if the virtual device
+ * negotiates the VIRTIO_RING_F_EVENT_IDX feature during feature
+ * negotiation. Similarly, both rings provide a flag --
+ * VRING_AVAIL_F_NO_INTERRUPT and VRING_USED_F_NO_NOTIFY -- in
+ * their <flags> field, indicating that the guest does not need an
+ * interrupt, or that the hypervisor driver does not need a
+ * notify, when descriptors are added to the corresponding ring.
+ * (These are provided only for interrupt optimization and need
+ * not be implemented.)
+ */
+#define VRING_ALIGN 4096
+
+#define VRING_DESC_F_NEXT (1 << 0)
+#define VRING_DESC_F_WRITE (1 << 1)
+#define VRING_DESC_F_INDIRECT (1 << 2)
+
+struct virtio_desc { /* AKA vring_desc */
+ uint64_t vd_addr; /* guest physical address */
+ uint32_t vd_len; /* length of scatter/gather seg */
+ uint16_t vd_flags; /* VRING_F_DESC_* */
+ uint16_t vd_next; /* next desc if F_NEXT */
+} __packed;
+
+struct virtio_used { /* AKA vring_used_elem */
+ uint32_t vu_idx; /* head of used descriptor chain */
+ uint32_t vu_tlen; /* length written-to */
+} __packed;
+
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+struct vring_avail {
+ uint16_t va_flags; /* VRING_AVAIL_F_* */
+ uint16_t va_idx; /* counts to 65535, then cycles */
+ uint16_t va_ring[]; /* size N, reported in QNUM value */
+/* uint16_t va_used_event; -- after N ring entries */
+} __packed;
+
+#define VRING_USED_F_NO_NOTIFY 1
+struct vring_used {
+ uint16_t vu_flags; /* VRING_USED_F_* */
+ uint16_t vu_idx; /* counts to 65535, then cycles */
+ struct virtio_used vu_ring[]; /* size N */
+/* uint16_t vu_avail_event; -- after N ring entries */
+} __packed;
+
+/*
+ * The address of any given virtual queue is determined by a single
+ * Page Frame Number register. The guest writes the PFN into the
+ * PCI config space. However, a device that has two or more
+ * virtqueues can have a different PFN, and size, for each queue.
+ * The number of queues is determinable via the PCI config space
+ * VTCFG_R_QSEL register. Writes to QSEL select the queue: 0 means
+ * queue #0, 1 means queue#1, etc. Once a queue is selected, the
+ * remaining PFN and QNUM registers refer to that queue.
+ *
+ * QNUM is a read-only register containing a nonzero power of two
+ * that indicates the (hypervisor's) queue size. Or, if reading it
+ * produces zero, the hypervisor does not have a corresponding
+ * queue. (The number of possible queues depends on the virtual
+ * device. The block device has just one; the network device
+ * provides either two -- 0 = receive, 1 = transmit -- or three,
+ * with 2 = control.)
+ *
+ * PFN is a read/write register giving the physical page address of
+ * the virtqueue in guest memory (the guest must allocate enough space
+ * based on the hypervisor's provided QNUM).
+ *
+ * QNOTIFY is effectively write-only: when the guest writes a queue
+ * number to the register, the hypervisor should scan the specified
+ * virtqueue. (Reading QNOTIFY currently always gets 0).
+ */
+
+/*
+ * PFN register shift amount
+ */
+#define VRING_PFN 12
+
+/*
+ * Virtio device types
+ *
+ * XXX Should really be merged with <dev/virtio/virtio.h> defines
+ */
+#define VIRTIO_TYPE_NET 1
+#define VIRTIO_TYPE_BLOCK 2
+#define VIRTIO_TYPE_CONSOLE 3
+#define VIRTIO_TYPE_ENTROPY 4
+#define VIRTIO_TYPE_BALLOON 5
+#define VIRTIO_TYPE_IOMEMORY 6
+#define VIRTIO_TYPE_RPMSG 7
+#define VIRTIO_TYPE_SCSI 8
+#define VIRTIO_TYPE_9P 9
+
+/* experimental IDs start at 65535 and work down */
+
+/*
+ * PCI vendor/device IDs
+ */
+#define VIRTIO_VENDOR 0x1AF4
+#define VIRTIO_DEV_NET 0x1000
+#define VIRTIO_DEV_BLOCK 0x1001
+#define VIRTIO_DEV_RANDOM 0x1002
+
+/*
+ * PCI config space constants.
+ *
+ * If MSI-X is enabled, the ISR register is generally not used,
+ * and the configuration vector and queue vector appear at offsets
+ * 20 and 22 with the remaining configuration registers at 24.
+ * If MSI-X is not enabled, those two registers disappear and
+ * the remaining configuration registers start at offset 20.
+ */
+#define VTCFG_R_HOSTCAP 0
+#define VTCFG_R_GUESTCAP 4
+#define VTCFG_R_PFN 8
+#define VTCFG_R_QNUM 12
+#define VTCFG_R_QSEL 14
+#define VTCFG_R_QNOTIFY 16
+#define VTCFG_R_STATUS 18
+#define VTCFG_R_ISR 19
+#define VTCFG_R_CFGVEC 20
+#define VTCFG_R_QVEC 22
+#define VTCFG_R_CFG0 20 /* No MSI-X */
+#define VTCFG_R_CFG1 24 /* With MSI-X */
+#define VTCFG_R_MSIX 20
+
+/*
+ * Bits in VTCFG_R_STATUS. Guests need not actually set any of these,
+ * but a guest writing 0 to this register means "please reset".
+ */
+#define VTCFG_STATUS_ACK 0x01 /* guest OS has acknowledged dev */
+#define VTCFG_STATUS_DRIVER 0x02 /* guest OS driver is loaded */
+#define VTCFG_STATUS_DRIVER_OK 0x04 /* guest OS driver ready */
+#define VTCFG_STATUS_FAILED 0x80 /* guest has given up on this dev */
+
+/*
+ * Bits in VTCFG_R_ISR. These apply only if not using MSI-X.
+ *
+ * (We don't [yet?] ever use CONF_CHANGED.)
+ */
+#define VTCFG_ISR_QUEUES 0x01 /* re-scan queues */
+#define VTCFG_ISR_CONF_CHANGED 0x80 /* configuration changed */
+
+#define VIRTIO_MSI_NO_VECTOR 0xFFFF
+
+/*
+ * Feature flags.
+ * Note: bits 0 through 23 are reserved to each device type.
+ */
+#define VIRTIO_F_NOTIFY_ON_EMPTY (1 << 24)
+#define VIRTIO_RING_F_INDIRECT_DESC (1 << 28)
+#define VIRTIO_RING_F_EVENT_IDX (1 << 29)
+
+/* From section 2.3, "Virtqueue Configuration", of the virtio specification */
+static inline size_t
+vring_size(u_int qsz)
+{
+ size_t size;
+
+ /* constant 3 below = va_flags, va_idx, va_used_event */
+ size = sizeof(struct virtio_desc) * qsz + sizeof(uint16_t) * (3 + qsz);
+ size = roundup2(size, VRING_ALIGN);
+
+ /* constant 3 below = vu_flags, vu_idx, vu_avail_event */
+ size += sizeof(uint16_t) * 3 + sizeof(struct virtio_used) * qsz;
+ size = roundup2(size, VRING_ALIGN);
+
+ return (size);
+}
+
+struct vmctx;
+struct pci_devinst;
+struct vqueue_info;
+
+/*
+ * A virtual device, with some number (possibly 0) of virtual
+ * queues and some size (possibly 0) of configuration-space
+ * registers private to the device. The virtio_softc should come
+ * at the front of each "derived class", so that a pointer to the
+ * virtio_softc is also a pointer to the more specific, derived-
+ * from-virtio driver's softc.
+ *
+ * Note: inside each hypervisor virtio driver, changes to these
+ * data structures must be locked against other threads, if any.
+ * Except for PCI config space register read/write, we assume each
+ * driver does the required locking, but we need a pointer to the
+ * lock (if there is one) for PCI config space read/write ops.
+ *
+ * When the guest reads or writes the device's config space, the
+ * generic layer checks for operations on the special registers
+ * described above. If the offset of the register(s) being read
+ * or written is past the CFG area (CFG0 or CFG1), the request is
+ * passed on to the virtual device, after subtracting off the
+ * generic-layer size. (So, drivers can just use the offset as
+ * an offset into "struct config", for instance.)
+ *
+ * (The virtio layer also makes sure that the read or write is to/
+ * from a "good" config offset, hence vc_cfgsize, and on BAR #0.
+ * However, the driver must verify the read or write size and offset
+ * and that no one is writing a readonly register.)
+ *
+ * The BROKED flag ("this thing done gone and broked") is for future
+ * use.
+ */
+#define VIRTIO_USE_MSIX 0x01
+#define VIRTIO_EVENT_IDX 0x02 /* use the event-index values */
+#define VIRTIO_BROKED 0x08 /* ??? */
+
+struct virtio_softc {
+ struct virtio_consts *vs_vc; /* constants (see below) */
+ int vs_flags; /* VIRTIO_* flags from above */
+ pthread_mutex_t *vs_mtx; /* POSIX mutex, if any */
+ struct pci_devinst *vs_pi; /* PCI device instance */
+ uint32_t vs_negotiated_caps; /* negotiated capabilities */
+ struct vqueue_info *vs_queues; /* one per vc_nvq */
+ int vs_curq; /* current queue */
+ uint8_t vs_status; /* value from last status write */
+ uint8_t vs_isr; /* ISR flags, if not MSI-X */
+ uint16_t vs_msix_cfg_idx; /* MSI-X vector for config event */
+};
+
+#define VS_LOCK(vs) \
+do { \
+ if (vs->vs_mtx) \
+ pthread_mutex_lock(vs->vs_mtx); \
+} while (0)
+
+#define VS_UNLOCK(vs) \
+do { \
+ if (vs->vs_mtx) \
+ pthread_mutex_unlock(vs->vs_mtx); \
+} while (0)
+
+struct virtio_consts {
+ const char *vc_name; /* name of driver (for diagnostics) */
+ int vc_nvq; /* number of virtual queues */
+ size_t vc_cfgsize; /* size of dev-specific config regs */
+ void (*vc_reset)(void *); /* called on virtual device reset */
+ void (*vc_qnotify)(void *, struct vqueue_info *);
+ /* called on QNOTIFY if no VQ notify */
+ int (*vc_cfgread)(void *, int, int, uint32_t *);
+ /* called to read config regs */
+ int (*vc_cfgwrite)(void *, int, int, uint32_t);
+ /* called to write config regs */
+ uint64_t vc_hv_caps; /* hypervisor-provided capabilities */
+};
+
+/*
+ * Data structure allocated (statically) per virtual queue.
+ *
+ * Drivers may change vq_qsize after a reset. When the guest OS
+ * requests a device reset, the hypervisor first calls
+ * vs->vs_vc->vc_reset(); then the data structure below is
+ * reinitialized (for each virtqueue: vs->vs_vc->vc_nvq).
+ *
+ * The remaining fields should only be fussed-with by the generic
+ * code.
+ *
+ * Note: the addresses of vq_desc, vq_avail, and vq_used are all
+ * computable from each other, but it's a lot simpler if we just
+ * keep a pointer to each one. The event indices are similarly
+ * (but more easily) computable, and this time we'll compute them:
+ * they're just XX_ring[N].
+ */
+#define VQ_ALLOC 0x01 /* set once we have a pfn */
+#define VQ_BROKED 0x02 /* ??? */
+struct vqueue_info {
+ uint16_t vq_qsize; /* size of this queue (a power of 2) */
+ void (*vq_notify)(void *, struct vqueue_info *);
+ /* called instead of vc_notify, if not NULL */
+
+ struct virtio_softc *vq_vs; /* backpointer to softc */
+ uint16_t vq_num; /* we're the num'th queue in the softc */
+
+ uint16_t vq_flags; /* flags (see above) */
+ uint16_t vq_last_avail; /* a recent value of vq_avail->va_idx */
+ uint16_t vq_save_used; /* saved vq_used->vu_idx; see vq_endchains */
+ uint16_t vq_msix_idx; /* MSI-X index, or VIRTIO_MSI_NO_VECTOR */
+
+ uint32_t vq_pfn; /* PFN of virt queue (not shifted!) */
+
+ volatile struct virtio_desc *vq_desc; /* descriptor array */
+ volatile struct vring_avail *vq_avail; /* the "avail" ring */
+ volatile struct vring_used *vq_used; /* the "used" ring */
+
+};
+/* as noted above, these are sort of backwards, name-wise */
+#define VQ_AVAIL_EVENT_IDX(vq) \
+ (*(volatile uint16_t *)&(vq)->vq_used->vu_ring[(vq)->vq_qsize])
+#define VQ_USED_EVENT_IDX(vq) \
+ ((vq)->vq_avail->va_ring[(vq)->vq_qsize])
+
+/*
+ * Is this ring ready for I/O?
+ */
+static inline int
+vq_ring_ready(struct vqueue_info *vq)
+{
+
+ return (vq->vq_flags & VQ_ALLOC);
+}
+
+/*
+ * Are there "available" descriptors? (This does not count
+ * how many, just returns True if there are some.)
+ */
+static inline int
+vq_has_descs(struct vqueue_info *vq)
+{
+
+ return (vq_ring_ready(vq) && vq->vq_last_avail !=
+ vq->vq_avail->va_idx);
+}
+
+/*
+ * Called by virtio driver as it starts processing chains. Each
+ * completed chain (obtained from vq_getchain()) is released by
+ * calling vq_relchain(), then when all are done, vq_endchains()
+ * can tell if / how-many chains were processed and know whether
+ * and how to generate an interrupt.
+ */
+static inline void
+vq_startchains(struct vqueue_info *vq)
+{
+
+ vq->vq_save_used = vq->vq_used->vu_idx;
+}
+
+/*
+ * Deliver an interrupt to guest on the given virtual queue
+ * (if possible, or a generic MSI interrupt if not using MSI-X).
+ */
+static inline void
+vq_interrupt(struct virtio_softc *vs, struct vqueue_info *vq)
+{
+
+ if (pci_msix_enabled(vs->vs_pi))
+ pci_generate_msix(vs->vs_pi, vq->vq_msix_idx);
+ else {
+ VS_LOCK(vs);
+ vs->vs_isr |= VTCFG_ISR_QUEUES;
+ pci_generate_msi(vs->vs_pi, 0);
+ pci_lintr_assert(vs->vs_pi);
+ VS_UNLOCK(vs);
+ }
+}
+
+struct iovec;
+void vi_softc_linkup(struct virtio_softc *vs, struct virtio_consts *vc,
+ void *dev_softc, struct pci_devinst *pi,
+ struct vqueue_info *queues);
+int vi_intr_init(struct virtio_softc *vs, int barnum, int use_msix);
+void vi_reset_dev(struct virtio_softc *);
+void vi_set_io_bar(struct virtio_softc *, int);
+
+int vq_getchain(struct vqueue_info *vq,
+ struct iovec *iov, int n_iov, uint16_t *flags);
+void vq_relchain(struct vqueue_info *vq, uint32_t iolen);
+void vq_endchains(struct vqueue_info *vq, int used_all_avail);
+
+uint64_t vi_pci_read(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size);
+void vi_pci_write(struct vmctx *ctx, int vcpu, struct pci_devinst *pi,
+ int baridx, uint64_t offset, int size, uint64_t value);
+#endif /* _VIRTIO_H_ */
diff --git a/usr/src/cmd/bhyve/xmsr.c b/usr/src/cmd/bhyve/xmsr.c
new file mode 100644
index 0000000000..0c097251e0
--- /dev/null
+++ b/usr/src/cmd/bhyve/xmsr.c
@@ -0,0 +1,237 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/xmsr.c 279227 2015-02-24 05:15:40Z neel $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyve/xmsr.c 279227 2015-02-24 05:15:40Z neel $");
+
+#include <sys/types.h>
+
+#include <machine/cpufunc.h>
+#include <machine/vmm.h>
+#include <machine/specialreg.h>
+
+#include <vmmapi.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "xmsr.h"
+
+static int cpu_vendor_intel, cpu_vendor_amd;
+
+int
+emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t val)
+{
+
+ if (cpu_vendor_intel) {
+ switch (num) {
+#ifndef __FreeBSD__
+ case MSR_PERFCTR0:
+ case MSR_PERFCTR1:
+ case MSR_EVNTSEL0:
+ case MSR_EVNTSEL1:
+ return (0);
+#endif
+ case 0xd04: /* Sandy Bridge uncore PMCs */
+ case 0xc24:
+ return (0);
+ case MSR_BIOS_UPDT_TRIG:
+ return (0);
+ case MSR_BIOS_SIGN:
+ return (0);
+ default:
+ break;
+ }
+ } else if (cpu_vendor_amd) {
+ switch (num) {
+ case MSR_HWCR:
+ /*
+ * Ignore writes to hardware configuration MSR.
+ */
+ return (0);
+
+ case MSR_NB_CFG1:
+ case MSR_IC_CFG:
+ return (0); /* Ignore writes */
+
+ case MSR_PERFEVSEL0:
+ case MSR_PERFEVSEL1:
+ case MSR_PERFEVSEL2:
+ case MSR_PERFEVSEL3:
+ /* Ignore writes to the PerfEvtSel MSRs */
+ return (0);
+
+ case MSR_K7_PERFCTR0:
+ case MSR_K7_PERFCTR1:
+ case MSR_K7_PERFCTR2:
+ case MSR_K7_PERFCTR3:
+ /* Ignore writes to the PerfCtr MSRs */
+ return (0);
+
+ case MSR_P_STATE_CONTROL:
+ /* Ignore write to change the P-state */
+ return (0);
+
+ default:
+ break;
+ }
+ }
+ return (-1);
+}
+
+int
+emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t num, uint64_t *val)
+{
+ int error = 0;
+
+ if (cpu_vendor_intel) {
+ switch (num) {
+ case MSR_BIOS_SIGN:
+ case MSR_IA32_PLATFORM_ID:
+ case MSR_PKG_ENERGY_STATUS:
+ case MSR_PP0_ENERGY_STATUS:
+ case MSR_PP1_ENERGY_STATUS:
+ case MSR_DRAM_ENERGY_STATUS:
+ *val = 0;
+ break;
+ case MSR_RAPL_POWER_UNIT:
+ /*
+ * Use the default value documented in section
+ * "RAPL Interfaces" in Intel SDM vol3.
+ */
+ *val = 0x000a1003;
+ break;
+ default:
+ error = -1;
+ break;
+ }
+ } else if (cpu_vendor_amd) {
+ switch (num) {
+ case MSR_BIOS_SIGN:
+ *val = 0;
+ break;
+ case MSR_HWCR:
+ /*
+ * Bios and Kernel Developer's Guides for AMD Families
+ * 12H, 14H, 15H and 16H.
+ */
+ *val = 0x01000010; /* Reset value */
+ *val |= 1 << 9; /* MONITOR/MWAIT disable */
+ break;
+
+ case MSR_NB_CFG1:
+ case MSR_IC_CFG:
+ /*
+ * The reset value is processor family dependent so
+ * just return 0.
+ */
+ *val = 0;
+ break;
+
+ case MSR_PERFEVSEL0:
+ case MSR_PERFEVSEL1:
+ case MSR_PERFEVSEL2:
+ case MSR_PERFEVSEL3:
+ /*
+ * PerfEvtSel MSRs are not properly virtualized so just
+ * return zero.
+ */
+ *val = 0;
+ break;
+
+ case MSR_K7_PERFCTR0:
+ case MSR_K7_PERFCTR1:
+ case MSR_K7_PERFCTR2:
+ case MSR_K7_PERFCTR3:
+ /*
+ * PerfCtr MSRs are not properly virtualized so just
+ * return zero.
+ */
+ *val = 0;
+ break;
+
+ case MSR_SMM_ADDR:
+ case MSR_SMM_MASK:
+ /*
+ * Return the reset value defined in the AMD Bios and
+ * Kernel Developer's Guide.
+ */
+ *val = 0;
+ break;
+
+ case MSR_P_STATE_LIMIT:
+ case MSR_P_STATE_CONTROL:
+ case MSR_P_STATE_STATUS:
+ case MSR_P_STATE_CONFIG(0): /* P0 configuration */
+ *val = 0;
+ break;
+
+ /*
+ * OpenBSD guests test bit 0 of this MSR to detect if the
+ * workaround for erratum 721 is already applied.
+ * http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf
+ */
+ case 0xC0011029:
+ *val = 1;
+ break;
+
+ default:
+ error = -1;
+ break;
+ }
+ } else {
+ error = -1;
+ }
+ return (error);
+}
+
+int
+init_msr(void)
+{
+ int error;
+ u_int regs[4];
+ char cpu_vendor[13];
+
+ do_cpuid(0, regs);
+ ((u_int *)&cpu_vendor)[0] = regs[1];
+ ((u_int *)&cpu_vendor)[1] = regs[3];
+ ((u_int *)&cpu_vendor)[2] = regs[2];
+ cpu_vendor[12] = '\0';
+
+ error = 0;
+ if (strcmp(cpu_vendor, "AuthenticAMD") == 0) {
+ cpu_vendor_amd = 1;
+ } else if (strcmp(cpu_vendor, "GenuineIntel") == 0) {
+ cpu_vendor_intel = 1;
+ } else {
+ fprintf(stderr, "Unknown cpu vendor \"%s\"\n", cpu_vendor);
+ error = -1;
+ }
+ return (error);
+}
diff --git a/usr/src/cmd/bhyve/xmsr.h b/usr/src/cmd/bhyve/xmsr.h
new file mode 100644
index 0000000000..ac3c147442
--- /dev/null
+++ b/usr/src/cmd/bhyve/xmsr.h
@@ -0,0 +1,36 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyve/xmsr.h 271888 2014-09-20 02:35:21Z neel $
+ */
+
+#ifndef _XMSR_H_
+#define _XMSR_H_
+
+int init_msr(void);
+int emulate_wrmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t val);
+int emulate_rdmsr(struct vmctx *ctx, int vcpu, uint32_t code, uint64_t *val);
+
+#endif
diff --git a/usr/src/cmd/bhyveconsole/Makefile b/usr/src/cmd/bhyveconsole/Makefile
new file mode 100644
index 0000000000..11d34e6599
--- /dev/null
+++ b/usr/src/cmd/bhyveconsole/Makefile
@@ -0,0 +1,41 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+include ../Makefile.cmd
+
+SUBDIRS= $(MACH)
+
+all := TARGET = all
+install := TARGET = install
+clean := TARGET = clean
+clobber := TARGET = clobber
+lint := TARGET = lint
+
+.KEEP_STATE:
+
+all: $(SUBDIRS)
+
+clean clobber lint: $(SUBDIRS)
+
+install: $(SUBDIRS)
+ -$(RM) $(ROOTUSRSBINPROG)
+ -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG)
+
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/bhyveconsole/bhyveconsole.c b/usr/src/cmd/bhyveconsole/bhyveconsole.c
new file mode 100644
index 0000000000..7f237a72f6
--- /dev/null
+++ b/usr/src/cmd/bhyveconsole/bhyveconsole.c
@@ -0,0 +1,360 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#include <sys/param.h>
+#include <sys/signal.h>
+#include <sys/socket.h>
+#include <sys/termios.h>
+#include <assert.h>
+#include <errno.h>
+#include <libgen.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include <bhyve.h>
+
+static int masterfd;
+static struct termios save_termios;
+static int save_fd;
+
+static int nocmdchar = 0;
+static char cmdchar = '~';
+
+static const char *pname;
+
+#define BCONS_BUFSIZ 8192
+
+static void
+usage(void)
+{
+ (void) fprintf(stderr, "usage: %s vmname\n", pname);
+ exit(2);
+}
+
+static void
+bcons_error(const char *fmt, ...)
+{
+ va_list alist;
+
+ (void) fprintf(stderr, "%s: ", pname);
+ va_start(alist, fmt);
+ (void) vfprintf(stderr, fmt, alist);
+ va_end(alist);
+ (void) fprintf(stderr, "\n");
+}
+
+static void
+bcons_perror(const char *str)
+{
+ const char *estr;
+
+ if ((estr = strerror(errno)) != NULL)
+ (void) fprintf(stderr, "%s: %s: %s\n", pname, str, estr);
+ else
+ (void) fprintf(stderr, "%s: %s: errno %d\n", pname, str, errno);
+}
+
+/*
+ * Create the unix domain socket and call bhyve; handshake
+ * with it to determine whether it will allow us to connect.
+ */
+static int
+get_console(const char *vmname)
+{
+ int sockfd = -1;
+ struct sockaddr_un servaddr;
+ char clientid[MAXPATHLEN];
+ char handshake[MAXPATHLEN], c;
+ int msglen;
+ int i = 0, err = 0;
+
+ if ((sockfd = socket(AF_UNIX, SOCK_STREAM, 0)) == -1) {
+ bcons_perror("could not create socket");
+ return (-1);
+ }
+
+ bzero(&servaddr, sizeof (servaddr));
+ servaddr.sun_family = AF_UNIX;
+ (void) snprintf(servaddr.sun_path, sizeof (servaddr.sun_path),
+ BHYVE_CONS_SOCKPATH, vmname);
+
+ if (connect(sockfd, (struct sockaddr *)&servaddr,
+ sizeof (servaddr)) == -1) {
+ bcons_perror("Could not connect to console server");
+ goto bad;
+ }
+ masterfd = sockfd;
+
+ msglen = snprintf(clientid, sizeof (clientid), "IDENT %lu\n",
+ getpid());
+ assert(msglen > 0 && msglen < sizeof (clientid));
+
+ if (write(masterfd, clientid, msglen) != msglen) {
+ bcons_error("protocol error");
+ goto bad;
+ }
+
+ /*
+ * Take care not to accumulate more than our fill, and leave room for
+ * the NUL at the end.
+ */
+ while ((err = read(masterfd, &c, 1)) == 1) {
+ if (i >= (sizeof (handshake) - 1))
+ break;
+ if (c == '\n')
+ break;
+ handshake[i] = c;
+ i++;
+ }
+ handshake[i] = '\0';
+
+ /*
+ * If something went wrong during the handshake we bail; perhaps
+ * the server died off.
+ */
+ if (err == -1) {
+ bcons_perror("Could not connect to console server");
+ goto bad;
+ }
+
+ if (strncmp(handshake, "OK", sizeof (handshake)) == 0)
+ return (0);
+
+ bcons_error("Console is already in use by process ID %s.",
+ handshake);
+bad:
+ (void) close(sockfd);
+ masterfd = -1;
+ return (-1);
+}
+
+/*
+ * Place terminal into raw mode.
+ */
+static int
+set_tty_rawmode(int fd)
+{
+ struct termios term;
+ if (tcgetattr(fd, &term) < 0) {
+ bcons_perror("failed to get user terminal settings");
+ return (-1);
+ }
+
+ /* Stash for later, so we can revert back to previous mode */
+ save_termios = term;
+ save_fd = fd;
+
+ /* disable 8->7 bit strip, start/stop, enable any char to restart */
+ term.c_iflag &= ~(ISTRIP|IXON|IXANY);
+ /* disable NL->CR, CR->NL, ignore CR, UPPER->lower */
+ term.c_iflag &= ~(INLCR|ICRNL|IGNCR|IUCLC);
+ /* disable output post-processing */
+ term.c_oflag &= ~OPOST;
+ /* disable canonical mode, signal chars, echo & extended functions */
+ term.c_lflag &= ~(ICANON|ISIG|ECHO|IEXTEN);
+
+ term.c_cc[VMIN] = 1; /* byte-at-a-time */
+ term.c_cc[VTIME] = 0;
+
+ if (tcsetattr(STDIN_FILENO, TCSAFLUSH, &term)) {
+ bcons_perror("failed to set user terminal to raw mode");
+ return (-1);
+ }
+
+ return (0);
+}
+
+/*
+ * reset terminal settings for global environment
+ */
+static void
+reset_tty(void)
+{
+ (void) tcsetattr(save_fd, TCSADRAIN, &save_termios);
+}
+
+/*
+ * process_user_input watches the input stream for the escape sequence for
+ * 'quit' (by default, tilde-period). Because we might be fed just one
+ * keystroke at a time, state associated with the user input (are we at the
+ * beginning of the line? are we locally echoing the next character?) is
+ * maintained by beginning_of_line and local_echo across calls to the routine.
+ *
+ * This routine returns -1 when the 'quit' escape sequence has been issued,
+ * or an error is encountered and 0 otherwise.
+ */
+static int
+process_user_input(int out_fd, int in_fd)
+{
+ static boolean_t beginning_of_line = B_TRUE;
+ static boolean_t local_echo = B_FALSE;
+ char ibuf[BCONS_BUFSIZ];
+ int nbytes;
+ char *buf = ibuf;
+ char c;
+
+ nbytes = read(in_fd, ibuf, sizeof (ibuf));
+ if (nbytes == -1 && errno != EINTR)
+ return (-1);
+
+ if (nbytes == -1) /* The read was interrupted. */
+ return (0);
+
+ for (c = *buf; nbytes > 0; c = *buf, --nbytes) {
+ buf++;
+ if (beginning_of_line && !nocmdchar) {
+ beginning_of_line = B_FALSE;
+ if (c == cmdchar) {
+ local_echo = B_TRUE;
+ continue;
+ }
+ } else if (local_echo) {
+ local_echo = B_FALSE;
+ if (c == '.') {
+ (void) write(STDOUT_FILENO, &cmdchar, 1);
+ (void) write(STDOUT_FILENO, &c, 1);
+ return (-1);
+ }
+ }
+
+ (void) write(out_fd, &c, 1);
+
+ beginning_of_line = (c == '\r' || c == '\n');
+ }
+
+ return (0);
+}
+
+static int
+process_output(int in_fd, int out_fd)
+{
+ int wrote = 0;
+ int cc;
+ char ibuf[BCONS_BUFSIZ];
+
+ cc = read(in_fd, ibuf, sizeof (ibuf));
+ if (cc == -1 && errno != EINTR)
+ return (-1);
+ if (cc == 0) /* EOF */
+ return (-1);
+ if (cc == -1) /* The read was interrupted. */
+ return (0);
+
+ do {
+ int len;
+
+ len = write(out_fd, ibuf + wrote, cc - wrote);
+ if (len == -1 && errno != EINTR)
+ return (-1);
+ if (len != -1)
+ wrote += len;
+ } while (wrote < cc);
+
+ return (0);
+}
+
+/*
+ * This is the main I/O loop.
+ */
+static void
+doio(void)
+{
+ struct pollfd pollfds[2];
+ int res;
+
+ /* read from vm and write to stdout */
+ pollfds[0].fd = masterfd;
+ pollfds[0].events = POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI;
+
+ /* read from stdin and write to vm */
+ pollfds[1].fd = STDIN_FILENO;
+ pollfds[1].events = pollfds[0].events;
+
+ for (;;) {
+ pollfds[0].revents = pollfds[1].revents = 0;
+
+ res = poll(pollfds,
+ sizeof (pollfds) / sizeof (struct pollfd), -1);
+
+ if (res == -1 && errno != EINTR) {
+ bcons_perror("poll failed");
+ /* we are hosed, close connection */
+ break;
+ }
+
+ /* event from master side stdout */
+ if (pollfds[0].revents) {
+ if (pollfds[0].revents &
+ (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
+ if (process_output(masterfd, STDOUT_FILENO)
+ != 0)
+ break;
+ } else {
+ break;
+ }
+ }
+
+ /* event from user stdin side */
+ if (pollfds[1].revents) {
+ if (pollfds[1].revents &
+ (POLLIN | POLLRDNORM | POLLRDBAND | POLLPRI)) {
+ if (process_user_input(masterfd, STDIN_FILENO)
+ != 0)
+ break;
+ } else {
+ break;
+ }
+ }
+ }
+}
+
+int
+main(int argc, char **argv)
+{
+ char *vmname;
+
+ pname = basename(argv[0]);
+
+ if (argc == 2) {
+ vmname = argv[1];
+ } else {
+ usage();
+ }
+
+ /*
+ * Make contact with bhyve
+ */
+ if (get_console(vmname) == -1)
+ return (1);
+
+ (void) printf("[Connected to vm '%s' console]\n", vmname);
+
+ if (set_tty_rawmode(STDIN_FILENO) == -1) {
+ reset_tty();
+ bcons_perror("failed to set stdin pty to raw mode");
+ return (1);
+ }
+
+ /*
+ * Run the I/O loop until we get disconnected.
+ */
+ doio();
+ reset_tty();
+ (void) printf("\n[Connection to vm '%s' console closed]\n", vmname);
+
+ return (0);
+}
diff --git a/usr/src/cmd/bhyveconsole/i386/Makefile b/usr/src/cmd/bhyveconsole/i386/Makefile
new file mode 100644
index 0000000000..c4f317a9fa
--- /dev/null
+++ b/usr/src/cmd/bhyveconsole/i386/Makefile
@@ -0,0 +1,43 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+PROG= bhyveconsole
+
+OBJS= bhyveconsole.o
+
+SRCS= $(OBJS:%.o=../%.c)
+
+include ../../Makefile.cmd
+
+CFLAGS += $(CCVERBOSE)
+LDLIBS += -lsocket
+
+.KEEP_STATE:
+
+%.o: ../%.c
+ $(COMPILE.c) $<
+
+all: $(PROG)
+
+$(PROG): $(OBJS)
+ $(LINK.c) $(OBJS) -o $@ $(LDLIBS)
+ $(POST_PROCESS)
+
+install: all $(ROOTUSRSBINPROG32)
+
+clean:
+ $(RM) $(OBJS)
+
+include ../../Makefile.targ
diff --git a/usr/src/cmd/bhyvectl/Makefile b/usr/src/cmd/bhyvectl/Makefile
new file mode 100644
index 0000000000..fe98204056
--- /dev/null
+++ b/usr/src/cmd/bhyvectl/Makefile
@@ -0,0 +1,41 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+PROG = bhyvectl
+
+include ../Makefile.cmd
+
+$(BUILD64)SUBDIRS += $(MACH64)
+
+all := TARGET = all
+install := TARGET = install
+clean := TARGET = clean
+clobber := TARGET = clobber
+lint := TARGET = lint
+
+.KEEP_STATE:
+
+all clean clobber lint: $(SUBDIRS)
+
+install: $(SUBDIRS)
+ -$(RM) $(ROOTUSRSBINPROG)
+ -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG)
+
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/bhyvectl/Makefile.com b/usr/src/cmd/bhyvectl/Makefile.com
new file mode 100644
index 0000000000..03ca34792c
--- /dev/null
+++ b/usr/src/cmd/bhyvectl/Makefile.com
@@ -0,0 +1,48 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+PROG= bhyvectl
+
+SRCS = bhyvectl.c
+OBJS = $(SRCS:.c=.o)
+
+include ../../Makefile.cmd
+
+.KEEP_STATE:
+
+CFLAGS += $(CCVERBOSE)
+CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \
+ -I$(ROOT)/usr/platform/i86pc/include \
+ -I$(SRC)/uts/i86pc/io/vmm
+LDLIBS += -lvmmapi
+
+all: $(PROG)
+
+$(PROG): $(OBJS)
+ $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS)
+ $(POST_PROCESS)
+
+install: all $(ROOTUSRSBINPROG)
+
+clean:
+ $(RM) $(OBJS)
+
+lint: lint_SRCS
+
+include ../../Makefile.targ
+
+%.o: ../%.c
+ $(COMPILE.c) -I$(SRC)/common $<
+ $(POST_PROCESS_O)
diff --git a/usr/src/cmd/bhyvectl/amd64/Makefile b/usr/src/cmd/bhyvectl/amd64/Makefile
new file mode 100644
index 0000000000..b602c50d05
--- /dev/null
+++ b/usr/src/cmd/bhyvectl/amd64/Makefile
@@ -0,0 +1,21 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+include ../Makefile.com
+include ../../Makefile.cmd.64
+
+CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64
+
+install: all $(ROOTUSRSBINPROG64)
diff --git a/usr/src/cmd/bhyvectl/bhyvectl.c b/usr/src/cmd/bhyvectl/bhyvectl.c
new file mode 100644
index 0000000000..07d0a83df5
--- /dev/null
+++ b/usr/src/cmd/bhyvectl/bhyvectl.c
@@ -0,0 +1,1523 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/usr.sbin/bhyvectl/bhyvectl.c 273375 2014-10-21 07:10:43Z neel $");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <sys/errno.h>
+#include <sys/mman.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <libutil.h>
+#include <fcntl.h>
+#include <string.h>
+#include <getopt.h>
+#include <assert.h>
+
+#include <machine/vmm.h>
+#include <vmmapi.h>
+
+#include "intel/vmcs.h"
+
+#define MB (1UL << 20)
+#define GB (1UL << 30)
+
+#define REQ_ARG required_argument
+#define NO_ARG no_argument
+#define OPT_ARG optional_argument
+
+static const char *progname;
+
+static void
+usage(void)
+{
+
+ (void)fprintf(stderr,
+ "Usage: %s --vm=<vmname>\n"
+ " [--cpu=<vcpu_number>]\n"
+ " [--create]\n"
+ " [--destroy]\n"
+ " [--get-all]\n"
+ " [--get-stats]\n"
+ " [--set-desc-ds]\n"
+ " [--get-desc-ds]\n"
+ " [--set-desc-es]\n"
+ " [--get-desc-es]\n"
+ " [--set-desc-gs]\n"
+ " [--get-desc-gs]\n"
+ " [--set-desc-fs]\n"
+ " [--get-desc-fs]\n"
+ " [--set-desc-cs]\n"
+ " [--get-desc-cs]\n"
+ " [--set-desc-ss]\n"
+ " [--get-desc-ss]\n"
+ " [--set-desc-tr]\n"
+ " [--get-desc-tr]\n"
+ " [--set-desc-ldtr]\n"
+ " [--get-desc-ldtr]\n"
+ " [--set-desc-gdtr]\n"
+ " [--get-desc-gdtr]\n"
+ " [--set-desc-idtr]\n"
+ " [--get-desc-idtr]\n"
+ " [--run]\n"
+ " [--capname=<capname>]\n"
+ " [--getcap]\n"
+ " [--setcap=<0|1>]\n"
+ " [--desc-base=<BASE>]\n"
+ " [--desc-limit=<LIMIT>]\n"
+ " [--desc-access=<ACCESS>]\n"
+ " [--set-cr0=<CR0>]\n"
+ " [--get-cr0]\n"
+ " [--set-cr3=<CR3>]\n"
+ " [--get-cr3]\n"
+ " [--set-cr4=<CR4>]\n"
+ " [--get-cr4]\n"
+ " [--set-dr7=<DR7>]\n"
+ " [--get-dr7]\n"
+ " [--set-rsp=<RSP>]\n"
+ " [--get-rsp]\n"
+ " [--set-rip=<RIP>]\n"
+ " [--get-rip]\n"
+ " [--get-rax]\n"
+ " [--set-rax=<RAX>]\n"
+ " [--get-rbx]\n"
+ " [--get-rcx]\n"
+ " [--get-rdx]\n"
+ " [--get-rsi]\n"
+ " [--get-rdi]\n"
+ " [--get-rbp]\n"
+ " [--get-r8]\n"
+ " [--get-r9]\n"
+ " [--get-r10]\n"
+ " [--get-r11]\n"
+ " [--get-r12]\n"
+ " [--get-r13]\n"
+ " [--get-r14]\n"
+ " [--get-r15]\n"
+ " [--set-rflags=<RFLAGS>]\n"
+ " [--get-rflags]\n"
+ " [--set-cs]\n"
+ " [--get-cs]\n"
+ " [--set-ds]\n"
+ " [--get-ds]\n"
+ " [--set-es]\n"
+ " [--get-es]\n"
+ " [--set-fs]\n"
+ " [--get-fs]\n"
+ " [--set-gs]\n"
+ " [--get-gs]\n"
+ " [--set-ss]\n"
+ " [--get-ss]\n"
+ " [--get-tr]\n"
+ " [--get-ldtr]\n"
+ " [--get-vmcs-pinbased-ctls]\n"
+ " [--get-vmcs-procbased-ctls]\n"
+ " [--get-vmcs-procbased-ctls2]\n"
+ " [--get-vmcs-entry-interruption-info]\n"
+ " [--set-vmcs-entry-interruption-info=<info>]\n"
+ " [--get-vmcs-eptp]\n"
+ " [--get-vmcs-guest-physical-address\n"
+ " [--get-vmcs-guest-linear-address\n"
+ " [--set-vmcs-exception-bitmap]\n"
+ " [--get-vmcs-exception-bitmap]\n"
+ " [--get-vmcs-io-bitmap-address]\n"
+ " [--get-vmcs-tsc-offset]\n"
+ " [--get-vmcs-guest-pat]\n"
+ " [--get-vmcs-host-pat]\n"
+ " [--get-vmcs-host-cr0]\n"
+ " [--get-vmcs-host-cr3]\n"
+ " [--get-vmcs-host-cr4]\n"
+ " [--get-vmcs-host-rip]\n"
+ " [--get-vmcs-host-rsp]\n"
+ " [--get-vmcs-cr0-mask]\n"
+ " [--get-vmcs-cr0-shadow]\n"
+ " [--get-vmcs-cr4-mask]\n"
+ " [--get-vmcs-cr4-shadow]\n"
+ " [--get-vmcs-cr3-targets]\n"
+ " [--get-vmcs-apic-access-address]\n"
+ " [--get-vmcs-virtual-apic-address]\n"
+ " [--get-vmcs-tpr-threshold]\n"
+ " [--get-vmcs-msr-bitmap]\n"
+ " [--get-vmcs-msr-bitmap-address]\n"
+ " [--get-vmcs-vpid]\n"
+ " [--get-vmcs-ple-gap]\n"
+ " [--get-vmcs-ple-window]\n"
+ " [--get-vmcs-instruction-error]\n"
+ " [--get-vmcs-exit-ctls]\n"
+ " [--get-vmcs-entry-ctls]\n"
+ " [--get-vmcs-guest-sysenter]\n"
+ " [--get-vmcs-link]\n"
+ " [--get-vmcs-exit-reason]\n"
+ " [--get-vmcs-exit-qualification]\n"
+ " [--get-vmcs-exit-interruption-info]\n"
+ " [--get-vmcs-exit-interruption-error]\n"
+ " [--get-vmcs-interruptibility]\n"
+ " [--set-x2apic-state=<state>]\n"
+ " [--get-x2apic-state]\n"
+ " [--unassign-pptdev=<bus/slot/func>]\n"
+ " [--set-mem=<memory in units of MB>]\n"
+ " [--get-lowmem]\n"
+ " [--get-highmem]\n",
+ progname);
+ exit(1);
+}
+
+static int get_stats, getcap, setcap, capval;
+static const char *capname;
+static int create, destroy, get_lowmem, get_highmem;
+static uint64_t memsize;
+static int set_cr0, get_cr0, set_cr3, get_cr3, set_cr4, get_cr4;
+static int set_efer, get_efer;
+static int set_dr7, get_dr7;
+static int set_rsp, get_rsp, set_rip, get_rip, set_rflags, get_rflags;
+static int set_rax, get_rax;
+static int get_rbx, get_rcx, get_rdx, get_rsi, get_rdi, get_rbp;
+static int get_r8, get_r9, get_r10, get_r11, get_r12, get_r13, get_r14, get_r15;
+static int set_desc_ds, get_desc_ds;
+static int set_desc_es, get_desc_es;
+static int set_desc_fs, get_desc_fs;
+static int set_desc_gs, get_desc_gs;
+static int set_desc_cs, get_desc_cs;
+static int set_desc_ss, get_desc_ss;
+static int set_desc_gdtr, get_desc_gdtr;
+static int set_desc_idtr, get_desc_idtr;
+static int set_desc_tr, get_desc_tr;
+static int set_desc_ldtr, get_desc_ldtr;
+static int set_cs, set_ds, set_es, set_fs, set_gs, set_ss, set_tr, set_ldtr;
+static int get_cs, get_ds, get_es, get_fs, get_gs, get_ss, get_tr, get_ldtr;
+static int set_x2apic_state, get_x2apic_state;
+enum x2apic_state x2apic_state;
+static int unassign_pptdev, bus, slot, func;
+static int run;
+
+/*
+ * VMCS-specific fields
+ */
+static int get_pinbased_ctls, get_procbased_ctls, get_procbased_ctls2;
+static int get_eptp, get_io_bitmap, get_tsc_offset;
+static int get_vmcs_entry_interruption_info, set_vmcs_entry_interruption_info;
+static int get_vmcs_interruptibility;
+uint32_t vmcs_entry_interruption_info;
+static int get_vmcs_gpa, get_vmcs_gla;
+static int get_exception_bitmap, set_exception_bitmap, exception_bitmap;
+static int get_cr0_mask, get_cr0_shadow;
+static int get_cr4_mask, get_cr4_shadow;
+static int get_cr3_targets;
+static int get_apic_access_addr, get_virtual_apic_addr, get_tpr_threshold;
+static int get_msr_bitmap, get_msr_bitmap_address;
+static int get_vpid, get_ple_gap, get_ple_window;
+static int get_inst_err, get_exit_ctls, get_entry_ctls;
+static int get_host_cr0, get_host_cr3, get_host_cr4;
+static int get_host_rip, get_host_rsp;
+static int get_guest_pat, get_host_pat;
+static int get_guest_sysenter, get_vmcs_link;
+static int get_vmcs_exit_reason, get_vmcs_exit_qualification;
+static int get_vmcs_exit_interruption_info, get_vmcs_exit_interruption_error;
+
+static uint64_t desc_base;
+static uint32_t desc_limit, desc_access;
+
+static int get_all;
+
+static void
+dump_vm_run_exitcode(struct vm_exit *vmexit, int vcpu)
+{
+ printf("vm exit[%d]\n", vcpu);
+ printf("\trip\t\t0x%016lx\n", vmexit->rip);
+ printf("\tinst_length\t%d\n", vmexit->inst_length);
+ switch (vmexit->exitcode) {
+ case VM_EXITCODE_INOUT:
+ printf("\treason\t\tINOUT\n");
+ printf("\tdirection\t%s\n", vmexit->u.inout.in ? "IN" : "OUT");
+ printf("\tbytes\t\t%d\n", vmexit->u.inout.bytes);
+ printf("\tflags\t\t%s%s\n",
+ vmexit->u.inout.string ? "STRING " : "",
+ vmexit->u.inout.rep ? "REP " : "");
+ printf("\tport\t\t0x%04x\n", vmexit->u.inout.port);
+ printf("\teax\t\t0x%08x\n", vmexit->u.inout.eax);
+ break;
+ case VM_EXITCODE_VMX:
+ printf("\treason\t\tVMX\n");
+ printf("\tstatus\t\t%d\n", vmexit->u.vmx.status);
+ printf("\texit_reason\t0x%08x (%u)\n",
+ vmexit->u.vmx.exit_reason, vmexit->u.vmx.exit_reason);
+ printf("\tqualification\t0x%016lx\n",
+ vmexit->u.vmx.exit_qualification);
+ printf("\tinst_type\t\t%d\n", vmexit->u.vmx.inst_type);
+ printf("\tinst_error\t\t%d\n", vmexit->u.vmx.inst_error);
+ break;
+ default:
+ printf("*** unknown vm run exitcode %d\n", vmexit->exitcode);
+ break;
+ }
+}
+
+static int
+dump_vmcs_msr_bitmap(int vcpu, u_long addr)
+{
+ int error, fd, byte, bit, readable, writeable;
+ u_int msr;
+ const char *bitmap;
+
+ error = -1;
+ bitmap = MAP_FAILED;
+
+ fd = open("/dev/mem", O_RDONLY, 0);
+ if (fd < 0)
+ goto done;
+
+ bitmap = mmap(NULL, PAGE_SIZE, PROT_READ, 0, fd, addr);
+ if (bitmap == MAP_FAILED)
+ goto done;
+
+ for (msr = 0; msr < 0x2000; msr++) {
+ byte = msr / 8;
+ bit = msr & 0x7;
+
+ /* Look at MSRs in the range 0x00000000 to 0x00001FFF */
+ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
+ writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
+ if (readable || writeable) {
+ printf("msr 0x%08x[%d]\t\t%c%c\n", msr, vcpu,
+ readable ? 'R' : '-',
+ writeable ? 'W' : '-');
+ }
+
+ /* Look at MSRs in the range 0xC0000000 to 0xC0001FFF */
+ byte += 1024;
+ readable = (bitmap[byte] & (1 << bit)) ? 0 : 1;
+ writeable = (bitmap[2048 + byte] & (1 << bit)) ? 0 : 1;
+ if (readable || writeable) {
+ printf("msr 0x%08x[%d]\t\t%c%c\n",
+ 0xc0000000 + msr, vcpu,
+ readable ? 'R' : '-',
+ writeable ? 'W' : '-');
+ }
+ }
+
+ error = 0;
+done:
+ if (bitmap != MAP_FAILED)
+ munmap((void *)bitmap, PAGE_SIZE);
+ if (fd >= 0)
+ close(fd);
+ return (error);
+}
+
+static int
+vm_get_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t *ret_val)
+{
+
+ return (vm_get_register(ctx, vcpu, VMCS_IDENT(field), ret_val));
+}
+
+static int
+vm_set_vmcs_field(struct vmctx *ctx, int vcpu, int field, uint64_t val)
+{
+
+ return (vm_set_register(ctx, vcpu, VMCS_IDENT(field), val));
+}
+
+enum {
+ VMNAME = 1000, /* avoid collision with return values from getopt */
+ VCPU,
+ SET_MEM,
+ SET_EFER,
+ SET_CR0,
+ SET_CR3,
+ SET_CR4,
+ SET_DR7,
+ SET_RSP,
+ SET_RIP,
+ SET_RAX,
+ SET_RFLAGS,
+ DESC_BASE,
+ DESC_LIMIT,
+ DESC_ACCESS,
+ SET_CS,
+ SET_DS,
+ SET_ES,
+ SET_FS,
+ SET_GS,
+ SET_SS,
+ SET_TR,
+ SET_LDTR,
+ SET_X2APIC_STATE,
+ SET_VMCS_EXCEPTION_BITMAP,
+ SET_VMCS_ENTRY_INTERRUPTION_INFO,
+ SET_CAP,
+ CAPNAME,
+ UNASSIGN_PPTDEV,
+};
+
+int
+main(int argc, char *argv[])
+{
+ char *vmname;
+ int error, ch, vcpu;
+ vm_paddr_t gpa;
+ size_t len;
+ struct vm_exit vmexit;
+ uint64_t ctl, eptp, bm, addr, u64;
+ struct vmctx *ctx;
+ int wired;
+
+ uint64_t cr0, cr3, cr4, dr7, rsp, rip, rflags, efer, pat;
+ uint64_t rax, rbx, rcx, rdx, rsi, rdi, rbp;
+ uint64_t r8, r9, r10, r11, r12, r13, r14, r15;
+ uint64_t cs, ds, es, fs, gs, ss, tr, ldtr;
+
+ struct option opts[] = {
+ { "vm", REQ_ARG, 0, VMNAME },
+ { "cpu", REQ_ARG, 0, VCPU },
+ { "set-mem", REQ_ARG, 0, SET_MEM },
+ { "set-efer", REQ_ARG, 0, SET_EFER },
+ { "set-cr0", REQ_ARG, 0, SET_CR0 },
+ { "set-cr3", REQ_ARG, 0, SET_CR3 },
+ { "set-cr4", REQ_ARG, 0, SET_CR4 },
+ { "set-dr7", REQ_ARG, 0, SET_DR7 },
+ { "set-rsp", REQ_ARG, 0, SET_RSP },
+ { "set-rip", REQ_ARG, 0, SET_RIP },
+ { "set-rax", REQ_ARG, 0, SET_RAX },
+ { "set-rflags", REQ_ARG, 0, SET_RFLAGS },
+ { "desc-base", REQ_ARG, 0, DESC_BASE },
+ { "desc-limit", REQ_ARG, 0, DESC_LIMIT },
+ { "desc-access",REQ_ARG, 0, DESC_ACCESS },
+ { "set-cs", REQ_ARG, 0, SET_CS },
+ { "set-ds", REQ_ARG, 0, SET_DS },
+ { "set-es", REQ_ARG, 0, SET_ES },
+ { "set-fs", REQ_ARG, 0, SET_FS },
+ { "set-gs", REQ_ARG, 0, SET_GS },
+ { "set-ss", REQ_ARG, 0, SET_SS },
+ { "set-tr", REQ_ARG, 0, SET_TR },
+ { "set-ldtr", REQ_ARG, 0, SET_LDTR },
+ { "set-x2apic-state",REQ_ARG, 0, SET_X2APIC_STATE },
+ { "set-vmcs-exception-bitmap",
+ REQ_ARG, 0, SET_VMCS_EXCEPTION_BITMAP },
+ { "set-vmcs-entry-interruption-info",
+ REQ_ARG, 0, SET_VMCS_ENTRY_INTERRUPTION_INFO },
+ { "capname", REQ_ARG, 0, CAPNAME },
+ { "unassign-pptdev", REQ_ARG, 0, UNASSIGN_PPTDEV },
+ { "setcap", REQ_ARG, 0, SET_CAP },
+ { "getcap", NO_ARG, &getcap, 1 },
+ { "get-stats", NO_ARG, &get_stats, 1 },
+ { "get-desc-ds",NO_ARG, &get_desc_ds, 1 },
+ { "set-desc-ds",NO_ARG, &set_desc_ds, 1 },
+ { "get-desc-es",NO_ARG, &get_desc_es, 1 },
+ { "set-desc-es",NO_ARG, &set_desc_es, 1 },
+ { "get-desc-ss",NO_ARG, &get_desc_ss, 1 },
+ { "set-desc-ss",NO_ARG, &set_desc_ss, 1 },
+ { "get-desc-cs",NO_ARG, &get_desc_cs, 1 },
+ { "set-desc-cs",NO_ARG, &set_desc_cs, 1 },
+ { "get-desc-fs",NO_ARG, &get_desc_fs, 1 },
+ { "set-desc-fs",NO_ARG, &set_desc_fs, 1 },
+ { "get-desc-gs",NO_ARG, &get_desc_gs, 1 },
+ { "set-desc-gs",NO_ARG, &set_desc_gs, 1 },
+ { "get-desc-tr",NO_ARG, &get_desc_tr, 1 },
+ { "set-desc-tr",NO_ARG, &set_desc_tr, 1 },
+ { "set-desc-ldtr", NO_ARG, &set_desc_ldtr, 1 },
+ { "get-desc-ldtr", NO_ARG, &get_desc_ldtr, 1 },
+ { "set-desc-gdtr", NO_ARG, &set_desc_gdtr, 1 },
+ { "get-desc-gdtr", NO_ARG, &get_desc_gdtr, 1 },
+ { "set-desc-idtr", NO_ARG, &set_desc_idtr, 1 },
+ { "get-desc-idtr", NO_ARG, &get_desc_idtr, 1 },
+ { "get-lowmem", NO_ARG, &get_lowmem, 1 },
+ { "get-highmem",NO_ARG, &get_highmem, 1 },
+ { "get-efer", NO_ARG, &get_efer, 1 },
+ { "get-cr0", NO_ARG, &get_cr0, 1 },
+ { "get-cr3", NO_ARG, &get_cr3, 1 },
+ { "get-cr4", NO_ARG, &get_cr4, 1 },
+ { "get-dr7", NO_ARG, &get_dr7, 1 },
+ { "get-rsp", NO_ARG, &get_rsp, 1 },
+ { "get-rip", NO_ARG, &get_rip, 1 },
+ { "get-rax", NO_ARG, &get_rax, 1 },
+ { "get-rbx", NO_ARG, &get_rbx, 1 },
+ { "get-rcx", NO_ARG, &get_rcx, 1 },
+ { "get-rdx", NO_ARG, &get_rdx, 1 },
+ { "get-rsi", NO_ARG, &get_rsi, 1 },
+ { "get-rdi", NO_ARG, &get_rdi, 1 },
+ { "get-rbp", NO_ARG, &get_rbp, 1 },
+ { "get-r8", NO_ARG, &get_r8, 1 },
+ { "get-r9", NO_ARG, &get_r9, 1 },
+ { "get-r10", NO_ARG, &get_r10, 1 },
+ { "get-r11", NO_ARG, &get_r11, 1 },
+ { "get-r12", NO_ARG, &get_r12, 1 },
+ { "get-r13", NO_ARG, &get_r13, 1 },
+ { "get-r14", NO_ARG, &get_r14, 1 },
+ { "get-r15", NO_ARG, &get_r15, 1 },
+ { "get-rflags", NO_ARG, &get_rflags, 1 },
+ { "get-cs", NO_ARG, &get_cs, 1 },
+ { "get-ds", NO_ARG, &get_ds, 1 },
+ { "get-es", NO_ARG, &get_es, 1 },
+ { "get-fs", NO_ARG, &get_fs, 1 },
+ { "get-gs", NO_ARG, &get_gs, 1 },
+ { "get-ss", NO_ARG, &get_ss, 1 },
+ { "get-tr", NO_ARG, &get_tr, 1 },
+ { "get-ldtr", NO_ARG, &get_ldtr, 1 },
+ { "get-vmcs-pinbased-ctls",
+ NO_ARG, &get_pinbased_ctls, 1 },
+ { "get-vmcs-procbased-ctls",
+ NO_ARG, &get_procbased_ctls, 1 },
+ { "get-vmcs-procbased-ctls2",
+ NO_ARG, &get_procbased_ctls2, 1 },
+ { "get-vmcs-guest-linear-address",
+ NO_ARG, &get_vmcs_gla, 1 },
+ { "get-vmcs-guest-physical-address",
+ NO_ARG, &get_vmcs_gpa, 1 },
+ { "get-vmcs-entry-interruption-info",
+ NO_ARG, &get_vmcs_entry_interruption_info, 1},
+ { "get-vmcs-eptp", NO_ARG, &get_eptp, 1 },
+ { "get-vmcs-exception-bitmap",
+ NO_ARG, &get_exception_bitmap, 1 },
+ { "get-vmcs-io-bitmap-address",
+ NO_ARG, &get_io_bitmap, 1 },
+ { "get-vmcs-tsc-offset", NO_ARG,&get_tsc_offset, 1 },
+ { "get-vmcs-cr0-mask", NO_ARG, &get_cr0_mask, 1 },
+ { "get-vmcs-cr0-shadow", NO_ARG,&get_cr0_shadow, 1 },
+ { "get-vmcs-cr4-mask", NO_ARG, &get_cr4_mask, 1 },
+ { "get-vmcs-cr4-shadow", NO_ARG,&get_cr4_shadow, 1 },
+ { "get-vmcs-cr3-targets", NO_ARG, &get_cr3_targets, 1},
+ { "get-vmcs-apic-access-address",
+ NO_ARG, &get_apic_access_addr, 1},
+ { "get-vmcs-virtual-apic-address",
+ NO_ARG, &get_virtual_apic_addr, 1},
+ { "get-vmcs-tpr-threshold",
+ NO_ARG, &get_tpr_threshold, 1 },
+ { "get-vmcs-msr-bitmap",
+ NO_ARG, &get_msr_bitmap, 1 },
+ { "get-vmcs-msr-bitmap-address",
+ NO_ARG, &get_msr_bitmap_address, 1 },
+ { "get-vmcs-vpid", NO_ARG, &get_vpid, 1 },
+ { "get-vmcs-ple-gap", NO_ARG, &get_ple_gap, 1 },
+ { "get-vmcs-ple-window", NO_ARG,&get_ple_window,1 },
+ { "get-vmcs-instruction-error",
+ NO_ARG, &get_inst_err, 1 },
+ { "get-vmcs-exit-ctls", NO_ARG, &get_exit_ctls, 1 },
+ { "get-vmcs-entry-ctls",
+ NO_ARG, &get_entry_ctls, 1 },
+ { "get-vmcs-guest-pat", NO_ARG, &get_guest_pat, 1 },
+ { "get-vmcs-host-pat", NO_ARG, &get_host_pat, 1 },
+ { "get-vmcs-host-cr0",
+ NO_ARG, &get_host_cr0, 1 },
+ { "get-vmcs-host-cr3",
+ NO_ARG, &get_host_cr3, 1 },
+ { "get-vmcs-host-cr4",
+ NO_ARG, &get_host_cr4, 1 },
+ { "get-vmcs-host-rip",
+ NO_ARG, &get_host_rip, 1 },
+ { "get-vmcs-host-rsp",
+ NO_ARG, &get_host_rsp, 1 },
+ { "get-vmcs-guest-sysenter",
+ NO_ARG, &get_guest_sysenter, 1 },
+ { "get-vmcs-link", NO_ARG, &get_vmcs_link, 1 },
+ { "get-vmcs-exit-reason",
+ NO_ARG, &get_vmcs_exit_reason, 1 },
+ { "get-vmcs-exit-qualification",
+ NO_ARG, &get_vmcs_exit_qualification, 1 },
+ { "get-vmcs-exit-interruption-info",
+ NO_ARG, &get_vmcs_exit_interruption_info, 1},
+ { "get-vmcs-exit-interruption-error",
+ NO_ARG, &get_vmcs_exit_interruption_error, 1},
+ { "get-vmcs-interruptibility",
+ NO_ARG, &get_vmcs_interruptibility, 1 },
+ { "get-x2apic-state",NO_ARG, &get_x2apic_state, 1 },
+ { "get-all", NO_ARG, &get_all, 1 },
+ { "run", NO_ARG, &run, 1 },
+ { "create", NO_ARG, &create, 1 },
+ { "destroy", NO_ARG, &destroy, 1 },
+ { NULL, 0, NULL, 0 }
+ };
+
+ vcpu = 0;
+ progname = basename(argv[0]);
+
+ while ((ch = getopt_long(argc, argv, "", opts, NULL)) != -1) {
+ switch (ch) {
+ case 0:
+ break;
+ case VMNAME:
+ vmname = optarg;
+ break;
+ case VCPU:
+ vcpu = atoi(optarg);
+ break;
+ case SET_MEM:
+ memsize = atoi(optarg) * MB;
+ memsize = roundup(memsize, 2 * MB);
+ break;
+ case SET_EFER:
+ efer = strtoul(optarg, NULL, 0);
+ set_efer = 1;
+ break;
+ case SET_CR0:
+ cr0 = strtoul(optarg, NULL, 0);
+ set_cr0 = 1;
+ break;
+ case SET_CR3:
+ cr3 = strtoul(optarg, NULL, 0);
+ set_cr3 = 1;
+ break;
+ case SET_CR4:
+ cr4 = strtoul(optarg, NULL, 0);
+ set_cr4 = 1;
+ break;
+ case SET_DR7:
+ dr7 = strtoul(optarg, NULL, 0);
+ set_dr7 = 1;
+ break;
+ case SET_RSP:
+ rsp = strtoul(optarg, NULL, 0);
+ set_rsp = 1;
+ break;
+ case SET_RIP:
+ rip = strtoul(optarg, NULL, 0);
+ set_rip = 1;
+ break;
+ case SET_RAX:
+ rax = strtoul(optarg, NULL, 0);
+ set_rax = 1;
+ break;
+ case SET_RFLAGS:
+ rflags = strtoul(optarg, NULL, 0);
+ set_rflags = 1;
+ break;
+ case DESC_BASE:
+ desc_base = strtoul(optarg, NULL, 0);
+ break;
+ case DESC_LIMIT:
+ desc_limit = strtoul(optarg, NULL, 0);
+ break;
+ case DESC_ACCESS:
+ desc_access = strtoul(optarg, NULL, 0);
+ break;
+ case SET_CS:
+ cs = strtoul(optarg, NULL, 0);
+ set_cs = 1;
+ break;
+ case SET_DS:
+ ds = strtoul(optarg, NULL, 0);
+ set_ds = 1;
+ break;
+ case SET_ES:
+ es = strtoul(optarg, NULL, 0);
+ set_es = 1;
+ break;
+ case SET_FS:
+ fs = strtoul(optarg, NULL, 0);
+ set_fs = 1;
+ break;
+ case SET_GS:
+ gs = strtoul(optarg, NULL, 0);
+ set_gs = 1;
+ break;
+ case SET_SS:
+ ss = strtoul(optarg, NULL, 0);
+ set_ss = 1;
+ break;
+ case SET_TR:
+ tr = strtoul(optarg, NULL, 0);
+ set_tr = 1;
+ break;
+ case SET_LDTR:
+ ldtr = strtoul(optarg, NULL, 0);
+ set_ldtr = 1;
+ break;
+ case SET_X2APIC_STATE:
+ x2apic_state = strtol(optarg, NULL, 0);
+ set_x2apic_state = 1;
+ break;
+ case SET_VMCS_EXCEPTION_BITMAP:
+ exception_bitmap = strtoul(optarg, NULL, 0);
+ set_exception_bitmap = 1;
+ break;
+ case SET_VMCS_ENTRY_INTERRUPTION_INFO:
+ vmcs_entry_interruption_info = strtoul(optarg, NULL, 0);
+ set_vmcs_entry_interruption_info = 1;
+ break;
+ case SET_CAP:
+ capval = strtoul(optarg, NULL, 0);
+ setcap = 1;
+ break;
+ case CAPNAME:
+ capname = optarg;
+ break;
+ case UNASSIGN_PPTDEV:
+ unassign_pptdev = 1;
+ if (sscanf(optarg, "%d/%d/%d", &bus, &slot, &func) != 3)
+ usage();
+ break;
+ default:
+ usage();
+ }
+ }
+ argc -= optind;
+ argv += optind;
+
+ if (vmname == NULL)
+ usage();
+
+ error = 0;
+
+ if (!error && create)
+ error = vm_create(vmname);
+
+ if (!error) {
+ ctx = vm_open(vmname);
+ if (ctx == NULL)
+ error = -1;
+ }
+
+ if (!error && memsize)
+ error = vm_setup_memory(ctx, memsize, VM_MMAP_NONE);
+
+ if (!error && set_efer)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_EFER, efer);
+
+ if (!error && set_cr0)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR0, cr0);
+
+ if (!error && set_cr3)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR3, cr3);
+
+ if (!error && set_cr4)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CR4, cr4);
+
+ if (!error && set_dr7)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DR7, dr7);
+
+ if (!error && set_rsp)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RSP, rsp);
+
+ if (!error && set_rip)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RIP, rip);
+
+ if (!error && set_rax)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RAX, rax);
+
+ if (!error && set_rflags) {
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
+ rflags);
+ }
+
+ if (!error && set_desc_ds) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_DS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_es) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_ES,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_ss) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_SS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_cs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_fs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_FS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_gs) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GS,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_tr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_TR,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_ldtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
+ desc_base, desc_limit, desc_access);
+ }
+
+ if (!error && set_desc_gdtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
+ desc_base, desc_limit, 0);
+ }
+
+ if (!error && set_desc_idtr) {
+ error = vm_set_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
+ desc_base, desc_limit, 0);
+ }
+
+ if (!error && set_cs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_CS, cs);
+
+ if (!error && set_ds)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_DS, ds);
+
+ if (!error && set_es)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_ES, es);
+
+ if (!error && set_fs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_FS, fs);
+
+ if (!error && set_gs)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_GS, gs);
+
+ if (!error && set_ss)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_SS, ss);
+
+ if (!error && set_tr)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_TR, tr);
+
+ if (!error && set_ldtr)
+ error = vm_set_register(ctx, vcpu, VM_REG_GUEST_LDTR, ldtr);
+
+ if (!error && set_x2apic_state)
+ error = vm_set_x2apic_state(ctx, vcpu, x2apic_state);
+
+#ifdef __FreeBSD__
+ if (!error && unassign_pptdev)
+ error = vm_unassign_pptdev(ctx, bus, slot, func);
+#endif
+
+ if (!error && set_exception_bitmap) {
+ error = vm_set_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
+ exception_bitmap);
+ }
+
+ if (!error && set_vmcs_entry_interruption_info) {
+ error = vm_set_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,
+ vmcs_entry_interruption_info);
+ }
+
+ if (!error && (get_lowmem || get_all)) {
+ gpa = 0;
+ error = vm_get_memory_seg(ctx, gpa, &len, &wired);
+ if (error == 0)
+ printf("lowmem\t\t0x%016lx/%ld%s\n", gpa, len,
+ wired ? " wired" : "");
+ }
+
+ if (!error && (get_highmem || get_all)) {
+ gpa = 4 * GB;
+ error = vm_get_memory_seg(ctx, gpa, &len, &wired);
+ if (error == 0)
+ printf("highmem\t\t0x%016lx/%ld%s\n", gpa, len,
+ wired ? " wired" : "");
+ }
+
+ if (!error && (get_efer || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_EFER, &efer);
+ if (error == 0)
+ printf("efer[%d]\t\t0x%016lx\n", vcpu, efer);
+ }
+
+ if (!error && (get_cr0 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR0, &cr0);
+ if (error == 0)
+ printf("cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
+ }
+
+ if (!error && (get_cr3 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR3, &cr3);
+ if (error == 0)
+ printf("cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
+ }
+
+ if (!error && (get_cr4 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CR4, &cr4);
+ if (error == 0)
+ printf("cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
+ }
+
+ if (!error && (get_dr7 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DR7, &dr7);
+ if (error == 0)
+ printf("dr7[%d]\t\t0x%016lx\n", vcpu, dr7);
+ }
+
+ if (!error && (get_rsp || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSP, &rsp);
+ if (error == 0)
+ printf("rsp[%d]\t\t0x%016lx\n", vcpu, rsp);
+ }
+
+ if (!error && (get_rip || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RIP, &rip);
+ if (error == 0)
+ printf("rip[%d]\t\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && (get_rax || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RAX, &rax);
+ if (error == 0)
+ printf("rax[%d]\t\t0x%016lx\n", vcpu, rax);
+ }
+
+ if (!error && (get_rbx || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBX, &rbx);
+ if (error == 0)
+ printf("rbx[%d]\t\t0x%016lx\n", vcpu, rbx);
+ }
+
+ if (!error && (get_rcx || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RCX, &rcx);
+ if (error == 0)
+ printf("rcx[%d]\t\t0x%016lx\n", vcpu, rcx);
+ }
+
+ if (!error && (get_rdx || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDX, &rdx);
+ if (error == 0)
+ printf("rdx[%d]\t\t0x%016lx\n", vcpu, rdx);
+ }
+
+ if (!error && (get_rsi || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RSI, &rsi);
+ if (error == 0)
+ printf("rsi[%d]\t\t0x%016lx\n", vcpu, rsi);
+ }
+
+ if (!error && (get_rdi || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RDI, &rdi);
+ if (error == 0)
+ printf("rdi[%d]\t\t0x%016lx\n", vcpu, rdi);
+ }
+
+ if (!error && (get_rbp || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RBP, &rbp);
+ if (error == 0)
+ printf("rbp[%d]\t\t0x%016lx\n", vcpu, rbp);
+ }
+
+ if (!error && (get_r8 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R8, &r8);
+ if (error == 0)
+ printf("r8[%d]\t\t0x%016lx\n", vcpu, r8);
+ }
+
+ if (!error && (get_r9 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R9, &r9);
+ if (error == 0)
+ printf("r9[%d]\t\t0x%016lx\n", vcpu, r9);
+ }
+
+ if (!error && (get_r10 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R10, &r10);
+ if (error == 0)
+ printf("r10[%d]\t\t0x%016lx\n", vcpu, r10);
+ }
+
+ if (!error && (get_r11 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R11, &r11);
+ if (error == 0)
+ printf("r11[%d]\t\t0x%016lx\n", vcpu, r11);
+ }
+
+ if (!error && (get_r12 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R12, &r12);
+ if (error == 0)
+ printf("r12[%d]\t\t0x%016lx\n", vcpu, r12);
+ }
+
+ if (!error && (get_r13 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R13, &r13);
+ if (error == 0)
+ printf("r13[%d]\t\t0x%016lx\n", vcpu, r13);
+ }
+
+ if (!error && (get_r14 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R14, &r14);
+ if (error == 0)
+ printf("r14[%d]\t\t0x%016lx\n", vcpu, r14);
+ }
+
+ if (!error && (get_r15 || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_R15, &r15);
+ if (error == 0)
+ printf("r15[%d]\t\t0x%016lx\n", vcpu, r15);
+ }
+
+ if (!error && (get_rflags || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_RFLAGS,
+ &rflags);
+ if (error == 0)
+ printf("rflags[%d]\t0x%016lx\n", vcpu, rflags);
+ }
+
+#ifdef __FreeBSD__
+ if (!error && (get_stats || get_all)) {
+ int i, num_stats;
+ uint64_t *stats;
+ struct timeval tv;
+ const char *desc;
+
+ stats = vm_get_stats(ctx, vcpu, &tv, &num_stats);
+ if (stats != NULL) {
+ printf("vcpu%d\n", vcpu);
+ for (i = 0; i < num_stats; i++) {
+ desc = vm_get_stat_desc(ctx, i);
+ printf("%-40s\t%ld\n", desc, stats[i]);
+ }
+ }
+ }
+#endif
+
+ if (!error && (get_desc_ds || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_DS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ds desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_es || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_ES,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("es desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_fs || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_FS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("fs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_gs || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("gs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_ss || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_SS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ss desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_cs || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_CS,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("cs desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_tr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_TR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("tr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_ldtr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_LDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("ldtr desc[%d]\t0x%016lx/0x%08x/0x%08x\n",
+ vcpu, desc_base, desc_limit, desc_access);
+ }
+ }
+
+ if (!error && (get_desc_gdtr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_GDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("gdtr[%d]\t\t0x%016lx/0x%08x\n",
+ vcpu, desc_base, desc_limit);
+ }
+ }
+
+ if (!error && (get_desc_idtr || get_all)) {
+ error = vm_get_desc(ctx, vcpu, VM_REG_GUEST_IDTR,
+ &desc_base, &desc_limit, &desc_access);
+ if (error == 0) {
+ printf("idtr[%d]\t\t0x%016lx/0x%08x\n",
+ vcpu, desc_base, desc_limit);
+ }
+ }
+
+ if (!error && (get_cs || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_CS, &cs);
+ if (error == 0)
+ printf("cs[%d]\t\t0x%04lx\n", vcpu, cs);
+ }
+
+ if (!error && (get_ds || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_DS, &ds);
+ if (error == 0)
+ printf("ds[%d]\t\t0x%04lx\n", vcpu, ds);
+ }
+
+ if (!error && (get_es || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_ES, &es);
+ if (error == 0)
+ printf("es[%d]\t\t0x%04lx\n", vcpu, es);
+ }
+
+ if (!error && (get_fs || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_FS, &fs);
+ if (error == 0)
+ printf("fs[%d]\t\t0x%04lx\n", vcpu, fs);
+ }
+
+ if (!error && (get_gs || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_GS, &gs);
+ if (error == 0)
+ printf("gs[%d]\t\t0x%04lx\n", vcpu, gs);
+ }
+
+ if (!error && (get_ss || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_SS, &ss);
+ if (error == 0)
+ printf("ss[%d]\t\t0x%04lx\n", vcpu, ss);
+ }
+
+ if (!error && (get_tr || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_TR, &tr);
+ if (error == 0)
+ printf("tr[%d]\t\t0x%04lx\n", vcpu, tr);
+ }
+
+ if (!error && (get_ldtr || get_all)) {
+ error = vm_get_register(ctx, vcpu, VM_REG_GUEST_LDTR, &ldtr);
+ if (error == 0)
+ printf("ldtr[%d]\t\t0x%04lx\n", vcpu, ldtr);
+ }
+
+ if (!error && (get_x2apic_state || get_all)) {
+ error = vm_get_x2apic_state(ctx, vcpu, &x2apic_state);
+ if (error == 0)
+ printf("x2apic_state[%d]\t%d\n", vcpu, x2apic_state);
+ }
+
+ if (!error && (get_pinbased_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PIN_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("pinbased_ctls[%d]\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_procbased_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_PRI_PROC_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("procbased_ctls[%d]\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_procbased_ctls2 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_SEC_PROC_BASED_CTLS, &ctl);
+ if (error == 0)
+ printf("procbased_ctls2[%d]\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_vmcs_gla || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_LINEAR_ADDRESS, &u64);
+ if (error == 0)
+ printf("gla[%d]\t\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_gpa || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_PHYSICAL_ADDRESS, &u64);
+ if (error == 0)
+ printf("gpa[%d]\t\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_entry_interruption_info || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_INTR_INFO,&u64);
+ if (error == 0) {
+ printf("entry_interruption_info[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && (get_eptp || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EPTP, &eptp);
+ if (error == 0)
+ printf("eptp[%d]\t\t0x%016lx\n", vcpu, eptp);
+ }
+
+ if (!error && (get_exception_bitmap || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXCEPTION_BITMAP,
+ &bm);
+ if (error == 0)
+ printf("exception_bitmap[%d]\t0x%08lx\n", vcpu, bm);
+ }
+
+ if (!error && (get_io_bitmap || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_A, &bm);
+ if (error == 0)
+ printf("io_bitmap_a[%d]\t0x%08lx\n", vcpu, bm);
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_IO_BITMAP_B, &bm);
+ if (error == 0)
+ printf("io_bitmap_b[%d]\t0x%08lx\n", vcpu, bm);
+ }
+
+ if (!error && (get_tsc_offset || get_all)) {
+ uint64_t tscoff;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_TSC_OFFSET, &tscoff);
+ if (error == 0)
+ printf("tsc_offset[%d]\t0x%016lx\n", vcpu, tscoff);
+ }
+
+ if (!error && (get_cr0_mask || get_all)) {
+ uint64_t cr0mask;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_MASK, &cr0mask);
+ if (error == 0)
+ printf("cr0_mask[%d]\t\t0x%016lx\n", vcpu, cr0mask);
+ }
+
+ if (!error && (get_cr0_shadow || get_all)) {
+ uint64_t cr0shadow;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR0_SHADOW,
+ &cr0shadow);
+ if (error == 0)
+ printf("cr0_shadow[%d]\t\t0x%016lx\n", vcpu, cr0shadow);
+ }
+
+ if (!error && (get_cr4_mask || get_all)) {
+ uint64_t cr4mask;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_MASK, &cr4mask);
+ if (error == 0)
+ printf("cr4_mask[%d]\t\t0x%016lx\n", vcpu, cr4mask);
+ }
+
+ if (!error && (get_cr4_shadow || get_all)) {
+ uint64_t cr4shadow;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR4_SHADOW,
+ &cr4shadow);
+ if (error == 0)
+ printf("cr4_shadow[%d]\t\t0x%016lx\n", vcpu, cr4shadow);
+ }
+
+ if (!error && (get_cr3_targets || get_all)) {
+ uint64_t target_count, target_addr;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET_COUNT,
+ &target_count);
+ if (error == 0) {
+ printf("cr3_target_count[%d]\t0x%08lx\n",
+ vcpu, target_count);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET0,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target0[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET1,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target1[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET2,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target2[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_CR3_TARGET3,
+ &target_addr);
+ if (error == 0) {
+ printf("cr3_target3[%d]\t\t0x%016lx\n",
+ vcpu, target_addr);
+ }
+ }
+
+ if (!error && (get_apic_access_addr || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_APIC_ACCESS, &addr);
+ if (error == 0)
+ printf("apic_access_addr[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_virtual_apic_addr || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_VIRTUAL_APIC, &addr);
+ if (error == 0)
+ printf("virtual_apic_addr[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_tpr_threshold || get_all)) {
+ uint64_t threshold;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_TPR_THRESHOLD,
+ &threshold);
+ if (error == 0)
+ printf("tpr_threshold[%d]\t0x%08lx\n", vcpu, threshold);
+ }
+
+ if (!error && (get_msr_bitmap_address || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+ if (error == 0)
+ printf("msr_bitmap[%d]\t\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_msr_bitmap || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_MSR_BITMAP, &addr);
+ if (error == 0)
+ error = dump_vmcs_msr_bitmap(vcpu, addr);
+ }
+
+ if (!error && (get_vpid || get_all)) {
+ uint64_t vpid;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_VPID, &vpid);
+ if (error == 0)
+ printf("vpid[%d]\t\t0x%04lx\n", vcpu, vpid);
+ }
+
+ if (!error && (get_ple_window || get_all)) {
+ uint64_t window;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_WINDOW, &window);
+ if (error == 0)
+ printf("ple_window[%d]\t\t0x%08lx\n", vcpu, window);
+ }
+
+ if (!error && (get_ple_gap || get_all)) {
+ uint64_t gap;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_PLE_GAP, &gap);
+ if (error == 0)
+ printf("ple_gap[%d]\t\t0x%08lx\n", vcpu, gap);
+ }
+
+ if (!error && (get_inst_err || get_all)) {
+ uint64_t insterr;
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_INSTRUCTION_ERROR,
+ &insterr);
+ if (error == 0) {
+ printf("instruction_error[%d]\t0x%08lx\n",
+ vcpu, insterr);
+ }
+ }
+
+ if (!error && (get_exit_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_CTLS, &ctl);
+ if (error == 0)
+ printf("exit_ctls[%d]\t\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_entry_ctls || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_ENTRY_CTLS, &ctl);
+ if (error == 0)
+ printf("entry_ctls[%d]\t\t0x%08lx\n", vcpu, ctl);
+ }
+
+ if (!error && (get_host_pat || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_IA32_PAT, &pat);
+ if (error == 0)
+ printf("host_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+ }
+
+ if (!error && (get_guest_pat || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_GUEST_IA32_PAT, &pat);
+ if (error == 0)
+ printf("guest_pat[%d]\t\t0x%016lx\n", vcpu, pat);
+ }
+
+ if (!error && (get_host_cr0 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR0, &cr0);
+ if (error == 0)
+ printf("host_cr0[%d]\t\t0x%016lx\n", vcpu, cr0);
+ }
+
+ if (!error && (get_host_cr3 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR3, &cr3);
+ if (error == 0)
+ printf("host_cr3[%d]\t\t0x%016lx\n", vcpu, cr3);
+ }
+
+ if (!error && (get_host_cr4 || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_CR4, &cr4);
+ if (error == 0)
+ printf("host_cr4[%d]\t\t0x%016lx\n", vcpu, cr4);
+ }
+
+ if (!error && (get_host_rip || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RIP, &rip);
+ if (error == 0)
+ printf("host_rip[%d]\t\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && (get_host_rsp || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_HOST_RSP, &rsp);
+ if (error == 0)
+ printf("host_rsp[%d]\t\t0x%016lx\n", vcpu, rsp);
+ }
+
+ if (!error && (get_guest_sysenter || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_CS, &cs);
+ if (error == 0)
+ printf("guest_sysenter_cs[%d]\t0x%08lx\n", vcpu, cs);
+
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_ESP, &rsp);
+ if (error == 0)
+ printf("guest_sysenter_sp[%d]\t0x%016lx\n", vcpu, rsp);
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_IA32_SYSENTER_EIP, &rip);
+ if (error == 0)
+ printf("guest_sysenter_ip[%d]\t0x%016lx\n", vcpu, rip);
+ }
+
+ if (!error && (get_vmcs_link || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_LINK_POINTER, &addr);
+ if (error == 0)
+ printf("vmcs_pointer[%d]\t0x%016lx\n", vcpu, addr);
+ }
+
+ if (!error && (get_vmcs_exit_reason || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_REASON, &u64);
+ if (error == 0)
+ printf("vmcs_exit_reason[%d]\t0x%016lx\n", vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_exit_qualification || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_QUALIFICATION,
+ &u64);
+ if (error == 0)
+ printf("vmcs_exit_qualification[%d]\t0x%016lx\n",
+ vcpu, u64);
+ }
+
+ if (!error && (get_vmcs_exit_interruption_info || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_INFO, &u64);
+ if (error == 0) {
+ printf("vmcs_exit_interruption_info[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && (get_vmcs_exit_interruption_error || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu, VMCS_EXIT_INTR_ERRCODE,
+ &u64);
+ if (error == 0) {
+ printf("vmcs_exit_interruption_error[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && (get_vmcs_interruptibility || get_all)) {
+ error = vm_get_vmcs_field(ctx, vcpu,
+ VMCS_GUEST_INTERRUPTIBILITY, &u64);
+ if (error == 0) {
+ printf("vmcs_guest_interruptibility[%d]\t0x%08lx\n",
+ vcpu, u64);
+ }
+ }
+
+ if (!error && setcap) {
+ int captype;
+ captype = vm_capability_name2type(capname);
+ error = vm_set_capability(ctx, vcpu, captype, capval);
+ if (error != 0 && errno == ENOENT)
+ printf("Capability \"%s\" is not available\n", capname);
+ }
+
+ if (!error && (getcap || get_all)) {
+ int captype, val, getcaptype;
+
+ if (getcap && capname)
+ getcaptype = vm_capability_name2type(capname);
+ else
+ getcaptype = -1;
+
+ for (captype = 0; captype < VM_CAP_MAX; captype++) {
+ if (getcaptype >= 0 && captype != getcaptype)
+ continue;
+ error = vm_get_capability(ctx, vcpu, captype, &val);
+ if (error == 0) {
+ printf("Capability \"%s\" is %s on vcpu %d\n",
+ vm_capability_type2name(captype),
+ val ? "set" : "not set", vcpu);
+ } else if (errno == ENOENT) {
+ error = 0;
+ printf("Capability \"%s\" is not available\n",
+ vm_capability_type2name(captype));
+ } else {
+ break;
+ }
+ }
+ }
+
+ if (!error && run) {
+ error = vm_run(ctx, vcpu, &vmexit);
+ if (error == 0)
+ dump_vm_run_exitcode(&vmexit, vcpu);
+ else
+ printf("vm_run error %d\n", error);
+ }
+
+ if (error)
+ printf("errno = %d\n", errno);
+
+ if (!error && destroy)
+ error = vm_destroy(ctx);
+
+ exit(error);
+}
diff --git a/usr/src/cmd/bhyveload-uefi/Makefile b/usr/src/cmd/bhyveload-uefi/Makefile
new file mode 100644
index 0000000000..bbcbacf32f
--- /dev/null
+++ b/usr/src/cmd/bhyveload-uefi/Makefile
@@ -0,0 +1,41 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+PROG = bhyveload-uefi
+
+include ../Makefile.cmd
+
+$(BUILD64)SUBDIRS += $(MACH64)
+
+all := TARGET = all
+install := TARGET = install
+clean := TARGET = clean
+clobber := TARGET = clobber
+lint := TARGET = lint
+
+.KEEP_STATE:
+
+all clean clobber lint: $(SUBDIRS)
+
+install: $(SUBDIRS)
+ -$(RM) $(ROOTUSRSBINPROG)
+ -$(LN) $(ISAEXEC) $(ROOTUSRSBINPROG)
+
+$(SUBDIRS): FRC
+ @cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
diff --git a/usr/src/cmd/bhyveload-uefi/Makefile.com b/usr/src/cmd/bhyveload-uefi/Makefile.com
new file mode 100644
index 0000000000..7865cca8d8
--- /dev/null
+++ b/usr/src/cmd/bhyveload-uefi/Makefile.com
@@ -0,0 +1,52 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+PROG= bhyveload-uefi
+
+SRCS = ../bhyveload-uefi.c expand_number.c
+OBJS = bhyveload-uefi.o expand_number.o
+
+include ../../Makefile.cmd
+
+.KEEP_STATE:
+
+CFLAGS += $(CCVERBOSE)
+CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd $(CPPFLAGS.master) \
+ -I$(ROOT)/usr/platform/i86pc/include
+LDLIBS += -lvmmapi
+
+all: $(PROG)
+
+$(PROG): $(OBJS)
+ $(LINK.c) -o $@ $(OBJS) $(LDFLAGS) $(LDLIBS)
+ $(POST_PROCESS)
+
+install: all $(ROOTUSRSBINPROG)
+
+clean:
+ $(RM) $(OBJS)
+
+lint: lint_SRCS
+
+include ../../Makefile.targ
+
+%.o: ../%.c
+ $(COMPILE.c) $<
+ $(POST_PROCESS_O)
+
+%.o: $(CONTRIB)/freebsd/lib/libutil/%.c
+ $(COMPILE.c) $<
+ $(POST_PROCESS_O)
+
diff --git a/usr/src/cmd/bhyveload-uefi/amd64/Makefile b/usr/src/cmd/bhyveload-uefi/amd64/Makefile
new file mode 100644
index 0000000000..b602c50d05
--- /dev/null
+++ b/usr/src/cmd/bhyveload-uefi/amd64/Makefile
@@ -0,0 +1,21 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+include ../Makefile.com
+include ../../Makefile.cmd.64
+
+CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64
+
+install: all $(ROOTUSRSBINPROG64)
diff --git a/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c b/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c
new file mode 100644
index 0000000000..62a7ca5d0f
--- /dev/null
+++ b/usr/src/cmd/bhyveload-uefi/bhyveload-uefi.c
@@ -0,0 +1,190 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#include <sys/types.h>
+
+#include <machine/vmm.h>
+
+#include <errno.h>
+#include <err.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sysexits.h>
+#include <unistd.h>
+
+#include <vmmapi.h>
+
+#define KB (1024UL)
+#define MB (1024 * 1024UL)
+#define GB (1024 * 1024 * 1024UL)
+
+#define UEFI_ROM_ADDR 0xFFE00000
+#define UEFI_ROM_SIZE (2 * MB)
+/*
+ * N.B. the UEFI code zeros the first page in memory so use the second.
+ */
+#define BHYVE_HOB_ADDR 0x00002000
+#define BHYVE_BO_HOB_ADDR 0x00002080
+
+#define UEFI_ROM_PATH "/usr/share/bhyve/uefi-rom.bin"
+
+struct platform_info {
+ uint32_t ncpus;
+};
+
+/*
+ * Boot order code:
+ * 0 - EFI_CD_HD
+ * 1 - EFI_CD
+ * 2 - EFI_HD_CD
+ * 3 - EFI_HD
+ * 4 - EFI_NET
+ * 5 - EFI_NET_CD_HD
+ * 6 - EFI_HD_HD_CD
+ * 7 - LEGACY_CD_HD
+ * 8 - LEGACY_CD
+ * 9 - LEGACY_HD_CD
+ * 10 - LEGACY_HD
+ * 11 - EFI_SHELL
+ */
+
+struct bootorder_info {
+ uint32_t guestbootorder;
+};
+
+static char *vmname, *progname;
+static struct vmctx *ctx;
+
+static void
+usage(void)
+{
+ printf("usage: %s "
+ "[-c vcpus] [-m mem-size] [-b bootorder]"
+ "<vmname>\n", progname);
+ exit(1);
+}
+
+int
+main(int argc, char** argv)
+{
+ int opt, error, fd;
+ int guest_ncpus;
+ int guest_bootorder = 0;
+ uint64_t mem_size;
+ char *membase, *rombase;
+ struct platform_info *pi;
+ struct bootorder_info *bi;
+
+ progname = argv[0];
+
+ guest_ncpus = 1;
+ mem_size = 256 * MB;
+
+ while ((opt = getopt(argc, argv, "c:m:b:")) != -1) {
+ switch (opt) {
+ case 'c':
+ guest_ncpus = atoi(optarg);
+ break;
+ case 'm':
+ error = vm_parse_memsize(optarg, &mem_size);
+ if (error != 0 || mem_size == 0)
+ errx(EX_USAGE, "Invalid memsize '%s'", optarg);
+ break;
+ case 'b':
+ guest_bootorder = atoi(optarg);
+ if (guest_bootorder < 0 || guest_bootorder > 11) {
+ errx(EX_USAGE, "Invalid bootoption: %d\n"
+ "\tBoot order code:\n"
+ "\t0 - EFI_CD_HD\n"
+ "\t1 - EFI_CD\n"
+ "\t2 - EFI_HD_CD\n"
+ "\t3 - EFI_HD\n"
+ "\t4 - EFI_NET\n"
+ "\t5 - EFI_NET_CD_HD\n"
+ "\t6 - EFI_HD_HD_CD\n"
+ "\t7 - LEGACY_CD_HD\n"
+ "\t8 - LEGACY_CD\n"
+ "\t9 - LEGACY_HD_CD\n"
+ "\t10 - LEGACY_HD\n"
+ "\t11 - EFI_SHELL\n", guest_bootorder);
+ exit(1);
+ }
+ break;
+ case '?':
+ usage();
+ }
+ }
+
+ argc -= optind;
+ argv += optind;
+
+ if (argc != 1)
+ usage();
+
+ vmname = argv[0];
+ error = vm_create(vmname);
+ if (error != 0 && errno != EEXIST) {
+ perror("vm_create");
+ exit(1);
+
+ }
+
+ ctx = vm_open(vmname);
+ if (ctx == NULL) {
+ perror("vm_open");
+ exit(1);
+ }
+
+ error = vm_set_capability(ctx, 0, VM_CAP_UNRESTRICTED_GUEST, 1);
+ if (error) {
+ perror("vm_set_capability(VM_CAP_UNRESTRICTED_GUEST)");
+ }
+
+ error = vm_setup_memory(ctx, mem_size, VM_MMAP_ALL);
+ if (error) {
+ perror("vm_setup_memory");
+ exit(1);
+ }
+ membase = vm_map_gpa(ctx, 0, 8 * KB);
+
+ error = vm_setup_rom(ctx, UEFI_ROM_ADDR, UEFI_ROM_SIZE);
+ if (error) {
+ perror("vm_setup_rom");
+ exit(1);
+ }
+ rombase = vm_map_gpa(ctx, UEFI_ROM_ADDR, UEFI_ROM_SIZE);
+
+ fd = open(UEFI_ROM_PATH, O_RDONLY);
+ if (fd == -1) {
+ perror("open");
+ exit(1);
+ }
+ read(fd, rombase, UEFI_ROM_SIZE);
+ close(fd);
+
+ pi = (struct platform_info *)(membase + BHYVE_HOB_ADDR);
+ pi->ncpus = guest_ncpus;
+ bi = (struct bootorder_info *)(membase + BHYVE_BO_HOB_ADDR);
+ bi->guestbootorder = guest_bootorder;
+
+ error = vcpu_reset(ctx, 0);
+ if (error) {
+ perror("vcpu_reset");
+ exit(1);
+ }
+
+ return (0);
+}
diff --git a/usr/src/cmd/bhyveload-uefi/i386/Makefile b/usr/src/cmd/bhyveload-uefi/i386/Makefile
new file mode 100644
index 0000000000..f5b7bb6915
--- /dev/null
+++ b/usr/src/cmd/bhyveload-uefi/i386/Makefile
@@ -0,0 +1,18 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+include ../Makefile.com
+
+install: all $(ROOTUSRSBINPROG32)
diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/Makefile b/usr/src/cmd/mdb/intel/amd64/vmm/Makefile
new file mode 100644
index 0000000000..bf9219b435
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/amd64/vmm/Makefile
@@ -0,0 +1,20 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2014 Pluribus Networks Inc.
+#
+
+MAKEVARS = CW_NO_SHADOW=true __GNUC=
+
+include $(SRC)/Makefile.master
+$(BUILD64)SUBDIRS += $(MACH64)
+include ../../../Makefile.subdirs
diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile b/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile
new file mode 100644
index 0000000000..49ca0c5eb3
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/amd64/vmm/amd64/Makefile
@@ -0,0 +1,32 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+MODULE = vmm.so
+MDBTGT = kvm
+
+MODSRCS = vmm.c
+
+include ../../../../../Makefile.cmd
+include ../../../../../Makefile.cmd.64
+include ../../../Makefile.amd64
+include ../../../../Makefile.module
+
+CPPFLAGS = -D_KERNEL -D_MACHDEP
+CPPFLAGS += -I$(COMPAT)/freebsd -I$(COMPAT)/freebsd/amd64
+CPPFLAGS += -I$(CONTRIB)/freebsd -I$(CONTRIB)/freebsd/amd64
+CPPFLAGS += -I$(SRC)/uts/common -I$(SRC)/uts/i86pc
+CPPFLAGS += -I$(SRC)/cmd/mdb/common
+
+CPPFLAGS += -_cc=-xdryrun
diff --git a/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c b/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c
new file mode 100644
index 0000000000..9e29d8662a
--- /dev/null
+++ b/usr/src/cmd/mdb/intel/amd64/vmm/vmm.c
@@ -0,0 +1,238 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#include <sys/param.h>
+
+#include <mdb/mdb_modapi.h>
+#include <sys/cpuvar.h>
+#include <sys/varargs.h>
+#include <sys/vmm.h>
+#include <sys/vmm_impl.h>
+
+/*
+ * VMM trace debug walker/dcmd code
+ */
+
+/*
+ * Initialize the vmm_trace_dmsg_t walker by either using the given starting
+ * address, or reading the value of the kernel's vmm_debug_rbuf pointer.
+ * We also allocate a vmm_trace_dmsg_t for storage, and save this using the
+ * walk_data pointer.
+ */
+static int
+vmm_dmsg_walk_i(mdb_walk_state_t *wsp)
+{
+ uintptr_t rbuf_addr;
+ vmm_trace_rbuf_t rbuf;
+
+ if (wsp->walk_addr == NULL) {
+ if (mdb_readvar(&rbuf_addr, "vmm_debug_rbuf") == -1) {
+ mdb_warn("failed to read 'vmm_debug_rbuf'");
+ return (WALK_ERR);
+ }
+
+ if (mdb_vread(&rbuf, sizeof (vmm_trace_rbuf_t), rbuf_addr)
+ == -1) {
+ mdb_warn("failed to read vmm_trace_rbuf_t at %p",
+ rbuf_addr);
+ return (WALK_ERR);
+ }
+
+ wsp->walk_addr = (uintptr_t)(vmm_trace_dmsg_t *)rbuf.dmsgh;
+ }
+
+ /*
+ * Save ptr to head of ring buffer to prevent looping.
+ */
+ wsp->walk_arg = (void *)wsp->walk_addr;
+ wsp->walk_data = mdb_alloc(sizeof (vmm_trace_dmsg_t), UM_SLEEP);
+ return (WALK_NEXT);
+}
+
+/*
+ * At each step, read a vmm_trace_dmsg_t into our private storage, and then
+ * invoke the callback function. We terminate when we reach a NULL next
+ * pointer.
+ */
+static int
+vmm_dmsg_walk_s(mdb_walk_state_t *wsp)
+{
+ int status;
+
+ if (wsp->walk_addr == NULL)
+ return (WALK_DONE);
+
+ if (mdb_vread(wsp->walk_data, sizeof (vmm_trace_dmsg_t),
+ wsp->walk_addr) == -1) {
+ mdb_warn("failed to read vmm_trace_dmsg_t at %p",
+ wsp->walk_addr);
+ return (WALK_ERR);
+ }
+
+ status = wsp->walk_callback(wsp->walk_addr, wsp->walk_data,
+ wsp->walk_cbdata);
+
+ wsp->walk_addr =
+ (uintptr_t)(((vmm_trace_dmsg_t *)wsp->walk_data)->next);
+
+ /*
+ * If we've looped then we're done.
+ */
+ if (wsp->walk_addr == (uintptr_t)wsp->walk_arg)
+ wsp->walk_addr = NULL;
+
+ return (status);
+}
+
+/*
+ * The walker's fini function is invoked at the end of each walk. Since we
+ * dynamically allocated a vmm_trace_dmsg_t in vmm_dmsg_walk_i, we must
+ * free it now.
+ */
+static void
+vmm_dmsg_walk_f(mdb_walk_state_t *wsp)
+{
+ mdb_free(wsp->walk_data, sizeof (vmm_trace_dmsg_t));
+}
+
+/*
+ * This routine is used by the vmm_dmsg_dump dcmd to dump content of
+ * VMM trace ring buffer.
+ */
+int
+vmm_dmsg_dump(vmm_trace_dmsg_t *addr, int print_pathname, uint_t *printed)
+{
+ vmm_trace_dmsg_t dmsg, *dmsgh = addr;
+ char pathname[MAXPATHLEN];
+ char merge[1024];
+
+ while (addr != NULL) {
+ if (mdb_vread(&dmsg, sizeof (dmsg), (uintptr_t)addr) !=
+ sizeof (dmsg)) {
+ mdb_warn("failed to read message pointer in kernel");
+ return (DCMD_ERR);
+ }
+
+ (void) mdb_snprintf(merge, sizeof (merge),
+ "[%Y:%03d:%03d:%03d] : %s",
+ dmsg.timestamp.tv_sec,
+ (int)dmsg.timestamp.tv_nsec/1000000,
+ (int)(dmsg.timestamp.tv_nsec/1000)%1000,
+ (int)dmsg.timestamp.tv_nsec%1000,
+ dmsg.buf);
+
+ mdb_printf("%s", merge);
+
+ if (printed != NULL) {
+ (*printed)++;
+ }
+
+ if (((addr = dmsg.next) == NULL) || (dmsg.next == dmsgh)) {
+ break;
+ }
+ }
+
+ return (DCMD_OK);
+}
+
+/*
+ * 1. Process flag passed to vmm_dmsg_dump dcmd.
+ * 2. Obtain VMM trace ring buffer pointer.
+ * 3. Pass VMM trace ring buffer pointer to vmm_dmsg_dump()
+ * to dump content of VMM trace ring buffer.
+ */
+int
+vmm_rbuf_dump(uintptr_t addr, uint_t flags, int argc, const mdb_arg_t *argv)
+{
+ vmm_trace_rbuf_t rbuf;
+ uint_t printed = 0; /* have we printed anything? */
+ int print_pathname = FALSE;
+ int rval = DCMD_OK;
+
+ if (argc > 1) {
+ return (DCMD_USAGE);
+ }
+
+ if (mdb_getopts(argc, argv,
+ 'a', MDB_OPT_SETBITS, TRUE, &print_pathname) != argc) {
+ return (DCMD_USAGE);
+ }
+
+ /*
+ * If ring buffer address not provided try to obtain
+ * it using vmm_debug_rbuf global.
+ */
+ if ((addr == NULL) || !(flags & DCMD_ADDRSPEC)) {
+ if (mdb_readvar(&addr, "vmm_debug_rbuf") == -1) {
+ mdb_warn("Failed to read 'vmm_debug_rbuf'.");
+ return (DCMD_ERR);
+ }
+ }
+
+ if (mdb_vread(&rbuf, sizeof (rbuf), addr) != sizeof (rbuf)) {
+ mdb_warn("Failed to read ring buffer in kernel.");
+ return (DCMD_ERR);
+ }
+
+ if (rbuf.dmsgh == NULL) {
+ mdb_printf("The vmm trace ring buffer is empty.\n");
+ return (DCMD_OK);
+ }
+
+ rval = vmm_dmsg_dump((vmm_trace_dmsg_t *)rbuf.dmsgh,
+ print_pathname, &printed);
+
+ if (rval != DCMD_OK) {
+ return (rval);
+ }
+
+ if (printed == 0) {
+ mdb_warn("Failed to read vmm trace ring buffer.");
+ return (DCMD_ERR);
+ }
+
+ return (rval);
+}
+
+/*
+ * MDB module linkage information:
+ *
+ * We declare a list of structures describing our dcmds, a list of structures
+ * describing our walkers, and a function named _mdb_init to return a pointer
+ * to our module information.
+ */
+
+static const mdb_dcmd_t dcmds[] = {
+ { "vmm_dmsg_dump", "[-a]", "Dump vmm trace debug messages",
+ vmm_rbuf_dump },
+ { NULL }
+};
+
+static const mdb_walker_t walkers[] = {
+ { "vmm_dmsg",
+ "walk ring buffer containing vmm trace debug messages",
+ vmm_dmsg_walk_i, vmm_dmsg_walk_s, vmm_dmsg_walk_f },
+ { NULL }
+};
+
+static const mdb_modinfo_t modinfo = {
+ MDB_API_VERSION, dcmds, walkers
+};
+
+const mdb_modinfo_t *
+_mdb_init(void)
+{
+ return (&modinfo);
+}
diff --git a/usr/src/compat/freebsd/amd64/machine/asmacros.h b/usr/src/compat/freebsd/amd64/machine/asmacros.h
new file mode 100644
index 0000000000..fcf35a7b78
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/asmacros.h
@@ -0,0 +1,28 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_ASMACROS_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_ASMACROS_H_
+
+#define ENTRY(x) \
+ .text; .p2align 4,0x90; \
+ .globl x; \
+ .type x, @function; \
+x:
+
+#define END(x) \
+ .size x, [.-x]
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_ASMACROS_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/atomic.h b/usr/src/compat/freebsd/amd64/machine/atomic.h
new file mode 100644
index 0000000000..5b78143d21
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/atomic.h
@@ -0,0 +1,244 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_
+
+static __inline u_char
+atomic_load_acq_char(volatile u_char *p)
+{
+ u_char res;
+
+ __asm volatile("lock ; " "cmpxchgb %b0,%1"
+ : "=a" (res), "=m" (*p)
+ : "m" (*p) : "memory", "cc");
+ return (res);
+}
+
+static __inline u_short
+atomic_load_acq_short(volatile u_short *p)
+{
+ u_short res;
+
+ __asm volatile("lock ; " "cmpxchgw %w0,%1"
+ : "=a" (res), "=m" (*p)
+ : "m" (*p)
+ : "memory", "cc");
+ return (res);
+}
+
+static __inline u_int
+atomic_load_acq_int(volatile u_int *p)
+{
+ u_int res;
+
+ __asm volatile("lock ; " "cmpxchgl %0,%1"
+ : "=a" (res), "=m" (*p)
+ : "m" (*p)
+ : "memory", "cc");
+ return (res);
+}
+
+static __inline u_long
+atomic_load_acq_long(volatile u_long *p)
+{
+ u_long res;
+
+ __asm volatile("lock ; " "cmpxchgq %0,%1"
+ : "=a" (res), "=m" (*p)
+ : "m" (*p)
+ : "memory", "cc");
+ return (res);
+}
+
+static __inline void
+atomic_store_rel_char(volatile u_char *p, u_char v)
+{
+ __asm volatile("" : : : "memory");
+ *p = v;
+}
+
+static __inline void
+atomic_store_rel_short(volatile u_short *p, u_short v)
+{
+ __asm volatile("" : : : "memory");
+ *p = v;
+}
+
+static __inline void
+atomic_store_rel_int(volatile u_int *p, u_int v)
+{
+ __asm volatile("" : : : "memory");
+ *p = v;
+}
+
+static __inline void
+atomic_store_rel_long(volatile u_long *p, u_long v)
+{
+ __asm volatile("" : : : "memory");
+ *p = v;
+}
+
+/*
+ * Atomic compare and set.
+ *
+ * if (*dst == expect) *dst = src (all 32 bit words)
+ *
+ * Returns 0 on failure, non-zero on success
+ */
+static __inline int
+atomic_cmpset_int(volatile u_int *dst, u_int expect, u_int src)
+{
+ u_char res;
+
+ __asm __volatile(
+ " lock ; "
+ " cmpxchgl %3,%1 ; "
+ " sete %0 ; "
+ "# atomic_cmpset_int"
+ : "=q" (res), /* 0 */
+ "+m" (*dst), /* 1 */
+ "+a" (expect) /* 2 */
+ : "r" (src) /* 3 */
+ : "memory", "cc");
+ return (res);
+}
+
+static __inline int
+atomic_cmpset_long(volatile u_long *dst, u_long expect, u_long src)
+{
+ u_char res;
+
+ __asm __volatile(
+ " lock ; "
+ " cmpxchgq %3,%1 ; "
+ " sete %0 ; "
+ "# atomic_cmpset_long"
+ : "=q" (res), /* 0 */
+ "+m" (*dst), /* 1 */
+ "+a" (expect) /* 2 */
+ : "r" (src) /* 3 */
+ : "memory", "cc");
+ return (res);
+}
+
+/*
+ * Atomically add the value of v to the integer pointed to by p and return
+ * the previous value of *p.
+ */
+static __inline u_int
+atomic_fetchadd_int(volatile u_int *p, u_int v)
+{
+
+ __asm __volatile(
+ " lock ; "
+ " xaddl %0, %1 ; "
+ "# atomic_fetchadd_int"
+ : "+r" (v), /* 0 (result) */
+ "=m" (*p) /* 1 */
+ : "m" (*p) /* 2 */
+ : "cc");
+ return (v);
+}
+
+static __inline void
+atomic_set_int(volatile u_int *p, u_int v)
+{
+ __asm volatile(
+ "lock ; " "orl %1,%0"
+ : "=m" (*p)
+ : "ir" (v), "m" (*p)
+ : "cc");
+}
+
+static __inline void
+atomic_clear_int(volatile u_int *p, u_int v)
+{
+ __asm volatile(
+ "lock ; " "andl %1,%0"
+ : "=m" (*p)
+ : "ir" (~v), "m" (*p)
+ : "cc");
+}
+
+static __inline void
+atomic_subtract_int(volatile u_int *p, u_int v)
+{
+ __asm volatile(
+ "lock ; " "subl %1,%0"
+ : "=m" (*p)
+ : "ir" (v), "m" (*p)
+ : "cc");
+}
+
+static __inline void
+atomic_set_long(volatile u_long *p, u_long v)
+{
+ __asm volatile(
+ "lock ; " "orq %1,%0"
+ : "+m" (*p)
+ : "ir" (v)
+ : "cc");
+}
+
+static __inline void
+atomic_clear_long(volatile u_long *p, u_long v)
+{
+ __asm volatile("lock ; " "andq %1,%0"
+ : "+m" (*p)
+ : "ir" (~v)
+ : "cc");
+}
+
+static __inline u_int
+atomic_swap_int(volatile u_int *p, u_int v)
+{
+
+ __asm __volatile(
+ " xchgl %1,%0 ; "
+ "# atomic_swap_int"
+ : "+r" (v), /* 0 */
+ "+m" (*p)); /* 1 */
+ return (v);
+}
+
+static __inline u_long
+atomic_swap_long(volatile u_long *p, u_long v)
+{
+
+ __asm __volatile(
+ " xchgq %1,%0 ; "
+ "# atomic_swap_long"
+ : "+r" (v), /* 0 */
+ "+m" (*p)); /* 1 */
+ return (v);
+}
+
+#define atomic_readandclear_int(p) atomic_swap_int(p, 0)
+#define atomic_readandclear_long(p) atomic_swap_long(p, 0)
+
+/* Operations on 32-bit double words. */
+#define atomic_load_acq_32 atomic_load_acq_int
+#define atomic_store_rel_32 atomic_store_rel_int
+#define atomic_cmpset_32 atomic_cmpset_int
+
+/* Operations on 64-bit quad words. */
+#define atomic_cmpset_64 atomic_cmpset_long
+#define atomic_readandclear_64 atomic_readandclear_long
+
+/* Operations on pointers. */
+#define atomic_cmpset_ptr atomic_cmpset_long
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_ATOMIC_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/clock.h b/usr/src/compat/freebsd/amd64/machine/clock.h
new file mode 100644
index 0000000000..f50b42a126
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/clock.h
@@ -0,0 +1,23 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_CLOCK_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_CLOCK_H_
+
+extern uint64_t cpu_freq_hz;
+
+#define tsc_freq cpu_freq_hz
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_CLOCK_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/cpufunc.h b/usr/src/compat/freebsd/amd64/machine/cpufunc.h
new file mode 100644
index 0000000000..cf485e947c
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/cpufunc.h
@@ -0,0 +1,165 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_
+
+static __inline u_long
+bsfq(u_long mask)
+{
+ u_long result;
+
+ __asm __volatile("bsfq %1,%0" : "=r" (result) : "rm" (mask));
+ return (result);
+}
+
+static __inline u_int
+bsrl(u_int mask)
+{
+ u_int result;
+
+ __asm __volatile("bsrl %1,%0" : "=r" (result) : "rm" (mask));
+ return (result);
+}
+
+static __inline u_long
+bsrq(u_long mask)
+{
+ u_long result;
+
+ __asm __volatile("bsrq %1,%0" : "=r" (result) : "rm" (mask));
+ return (result);
+}
+
+static __inline void
+clts(void)
+{
+ __asm __volatile("clts");
+}
+
+static __inline void
+do_cpuid(u_int ax, u_int *p)
+{
+ __asm __volatile("cpuid"
+ : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
+ : "0" (ax));
+}
+
+static __inline void
+cpuid_count(u_int ax, u_int cx, u_int *p)
+{
+ __asm __volatile("cpuid"
+ : "=a" (p[0]), "=b" (p[1]), "=c" (p[2]), "=d" (p[3])
+ : "0" (ax), "c" (cx));
+}
+
+static __inline void
+enable_intr(void)
+{
+ __asm __volatile("sti");
+}
+
+static __inline int
+ffsl(long mask)
+{
+ return (mask == 0 ? mask : (int)bsfq((u_long)mask) + 1);
+}
+
+static __inline int
+fls(int mask)
+{
+ return (mask == 0 ? mask : (int)bsrl((u_int)mask) + 1);
+}
+
+static __inline int
+flsl(long mask)
+{
+ return (mask == 0 ? mask : (int)bsrq((u_long)mask) + 1);
+}
+
+static __inline int
+flsll(long long mask)
+{
+ return (flsl((long)mask));
+}
+
+static __inline uint64_t
+rdmsr(u_int msr)
+{
+ uint32_t low, high;
+
+ __asm __volatile("rdmsr" : "=a" (low), "=d" (high) : "c" (msr));
+ return (low | ((uint64_t)high << 32));
+}
+
+static __inline uint64_t
+rdtsc(void)
+{
+ uint32_t low, high;
+
+ __asm __volatile("rdtsc" : "=a" (low), "=d" (high));
+ return (low | ((uint64_t)high << 32));
+}
+
+static __inline void
+wrmsr(u_int msr, uint64_t newval)
+{
+ uint32_t low, high;
+
+ low = newval;
+ high = newval >> 32;
+ __asm __volatile("wrmsr" : : "a" (low), "d" (high), "c" (msr));
+}
+
+static __inline void
+load_cr0(u_long data)
+{
+ __asm __volatile("movq %0,%%cr0" : : "r" (data));
+}
+
+static __inline u_long
+rcr0(void)
+{
+ u_long data;
+
+ __asm __volatile("movq %%cr0,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline u_long
+rcr3(void)
+{
+ u_long data;
+
+ __asm __volatile("movq %%cr3,%0" : "=r" (data));
+ return (data);
+}
+
+static __inline void
+load_cr4(u_long data)
+{
+ __asm __volatile("movq %0,%%cr4" : : "r" (data));
+}
+
+static __inline u_long
+rcr4(void)
+{
+ u_long data;
+
+ __asm __volatile("movq %%cr4,%0" : "=r" (data));
+ return (data);
+}
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_CPUFUNC_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/fpu.h b/usr/src/compat/freebsd/amd64/machine/fpu.h
new file mode 100644
index 0000000000..48e686780c
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/fpu.h
@@ -0,0 +1,29 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_
+
+#define XSAVE_AREA_ALIGN 64
+
+void fpuexit(kthread_t *td);
+void fpurestore(void *);
+void fpusave(void *);
+
+struct savefpu *fpu_save_area_alloc(void);
+void fpu_save_area_free(struct savefpu *fsa);
+void fpu_save_area_reset(struct savefpu *fsa);
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_FPU_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/md_var.h b/usr/src/compat/freebsd/amd64/machine/md_var.h
new file mode 100644
index 0000000000..60fdd566e5
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/md_var.h
@@ -0,0 +1,24 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_
+
+extern u_int cpu_high; /* Highest arg to CPUID */
+extern u_int cpu_exthigh; /* Highest arg to extended CPUID */
+extern u_int cpu_id; /* Stepping ID */
+extern char cpu_vendor[]; /* CPU Origin code */
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_MD_VAR_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/param.h b/usr/src/compat/freebsd/amd64/machine/param.h
new file mode 100644
index 0000000000..eaca5ab8d7
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/param.h
@@ -0,0 +1,39 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PARAM_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_PARAM_H_
+
+#ifdef _KERNEL
+#define MAXCPU NCPU
+#endif /* _KERNEL */
+
+#define PAGE_SHIFT 12 /* LOG2(PAGE_SIZE) */
+#define PAGE_SIZE (1<<PAGE_SHIFT) /* bytes/page */
+#define PAGE_MASK (PAGE_SIZE-1)
+
+/* Size of the level 1 page table units */
+#define NPTEPG (PAGE_SIZE/(sizeof (pt_entry_t)))
+
+/* Size of the level 2 page directory units */
+#define NPDEPG (PAGE_SIZE/(sizeof (pd_entry_t)))
+
+/* Size of the level 3 page directory pointer table units */
+#define NPDPEPG (PAGE_SIZE/(sizeof (pdp_entry_t)))
+
+/* Size of the level 4 page-map level-4 table units */
+#define NPML4EPG (PAGE_SIZE/(sizeof (pml4_entry_t)))
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PARAM_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/pcb.h b/usr/src/compat/freebsd/amd64/machine/pcb.h
new file mode 100644
index 0000000000..75b5de640c
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/pcb.h
@@ -0,0 +1,21 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PCB_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_PCB_H_
+
+#include <machine/fpu.h>
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PCB_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/pmap.h b/usr/src/compat/freebsd/amd64/machine/pmap.h
new file mode 100644
index 0000000000..d0303bdd56
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/pmap.h
@@ -0,0 +1,44 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_
+
+ /* ---- Intel Nomenclature ---- */
+#define PG_V 0x001 /* P Valid */
+#define PG_RW 0x002 /* R/W Read/Write */
+#define PG_U 0x004 /* U/S User/Supervisor */
+#define PG_A 0x020 /* A Accessed */
+#define PG_M 0x040 /* D Dirty */
+#define PG_PS 0x080 /* PS Page size (0=4k,1=2M) */
+
+/*
+ * Page Protection Exception bits
+ */
+#define PGEX_P 0x01 /* Protection violation vs. not present */
+#define PGEX_W 0x02 /* during a Write cycle */
+#define PGEX_U 0x04 /* access from User mode (UPL) */
+#define PGEX_RSV 0x08 /* reserved PTE field is non-zero */
+#define PGEX_I 0x10 /* during an instruction fetch */
+
+typedef u_int64_t pd_entry_t;
+typedef u_int64_t pt_entry_t;
+typedef u_int64_t pdp_entry_t;
+typedef u_int64_t pml4_entry_t;
+
+#define vtophys(va) pmap_kextract(((vm_offset_t) (va)))
+vm_paddr_t pmap_kextract(vm_offset_t va);
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_PMAP_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/segments.h b/usr/src/compat/freebsd/amd64/machine/segments.h
new file mode 100644
index 0000000000..d0655f4a0e
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/segments.h
@@ -0,0 +1,21 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_
+
+#include <x86/segments.h>
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/smp.h b/usr/src/compat/freebsd/amd64/machine/smp.h
new file mode 100644
index 0000000000..ef719b9684
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/smp.h
@@ -0,0 +1,19 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SMP_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/vmm.h b/usr/src/compat/freebsd/amd64/machine/vmm.h
new file mode 100644
index 0000000000..79c3ec959e
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/vmm.h
@@ -0,0 +1,21 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_
+
+#include <sys/vmm.h>
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMM_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/vmm_dev.h b/usr/src/compat/freebsd/amd64/machine/vmm_dev.h
new file mode 100644
index 0000000000..fe9cb6c705
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/vmm_dev.h
@@ -0,0 +1,21 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMM_DEV_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_VMM_DEV_H_
+
+#include <sys/vmm_dev.h>
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMM_DEV_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/vmm_instruction_emul.h b/usr/src/compat/freebsd/amd64/machine/vmm_instruction_emul.h
new file mode 100644
index 0000000000..02c3f391c7
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/vmm_instruction_emul.h
@@ -0,0 +1,21 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMM_INSTRUCTION_EMUL_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_VMM_INSTRUCTION_EMUL_H_
+
+#include <sys/vmm_instruction_emul.h>
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMM_INSTRUCTION_EMUL_H_ */
diff --git a/usr/src/compat/freebsd/amd64/machine/vmparam.h b/usr/src/compat/freebsd/amd64/machine/vmparam.h
new file mode 100644
index 0000000000..c80c2af545
--- /dev/null
+++ b/usr/src/compat/freebsd/amd64/machine/vmparam.h
@@ -0,0 +1,19 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_
+#define _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_VMPARAM_H_ */
diff --git a/usr/src/compat/freebsd/libutil.h b/usr/src/compat/freebsd/libutil.h
new file mode 100644
index 0000000000..e22ffc0551
--- /dev/null
+++ b/usr/src/compat/freebsd/libutil.h
@@ -0,0 +1,21 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_LIBUTIL_H_
+#define _COMPAT_FREEBSD_LIBUTIL_H_
+
+int expand_number(const char *_buf, uint64_t *_num);
+
+#endif /* _COMPAT_FREEBSD_LIBUTIL_H_ */
diff --git a/usr/src/compat/freebsd/net/ethernet.h b/usr/src/compat/freebsd/net/ethernet.h
new file mode 100644
index 0000000000..a0d5a828c6
--- /dev/null
+++ b/usr/src/compat/freebsd/net/ethernet.h
@@ -0,0 +1,21 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_
+#define _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_
+
+#include <sys/ethernet.h>
+
+#endif /* _COMPAT_FREEBSD_SYS_NET_ETHERNET_H_ */
diff --git a/usr/src/compat/freebsd/paths.h b/usr/src/compat/freebsd/paths.h
new file mode 100644
index 0000000000..e43c963f93
--- /dev/null
+++ b/usr/src/compat/freebsd/paths.h
@@ -0,0 +1,21 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_PATHS_H_
+#define _COMPAT_FREEBSD_PATHS_H_
+
+#define _PATH_TMP "/tmp/"
+
+#endif /* _COMPAT_FREEBSD_PATHS_H_ */
diff --git a/usr/src/compat/freebsd/pthread_np.h b/usr/src/compat/freebsd/pthread_np.h
new file mode 100644
index 0000000000..641c58f406
--- /dev/null
+++ b/usr/src/compat/freebsd/pthread_np.h
@@ -0,0 +1,28 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_PTHREAD_NP_H_
+#define _COMPAT_FREEBSD_PTHREAD_NP_H_
+
+#include <sys/param.h>
+#include <sys/cpuset.h>
+
+#include <synch.h>
+
+#define pthread_set_name_np(thread, name)
+
+#define pthread_mutex_isowned_np(x) _mutex_held(x)
+
+#endif /* _COMPAT_FREEBSD_PTHREAD_NP_H_ */
diff --git a/usr/src/compat/freebsd/string.h b/usr/src/compat/freebsd/string.h
new file mode 100644
index 0000000000..7e0f5c7ddc
--- /dev/null
+++ b/usr/src/compat/freebsd/string.h
@@ -0,0 +1,26 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_STRING_H_
+#define _COMPAT_FREEBSD_STRING_H_
+
+/*
+ * This is quite a hack; blame bcopy/bcmp/bzero and memcpy/memcmp/memset.
+ */
+#include <strings.h>
+
+#include_next <string.h>
+
+#endif /* _COMPAT_FREEBSD_STRING_H_ */
diff --git a/usr/src/compat/freebsd/strings.h b/usr/src/compat/freebsd/strings.h
new file mode 100644
index 0000000000..fa3539fb96
--- /dev/null
+++ b/usr/src/compat/freebsd/strings.h
@@ -0,0 +1,23 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_STRINGS_H_
+#define _COMPAT_FREEBSD_STRINGS_H_
+
+#include <machine/cpufunc.h>
+
+#include_next <strings.h>
+
+#endif /* _COMPAT_FREEBSD_STRINGS_H_ */
diff --git a/usr/src/compat/freebsd/sys/_iovec.h b/usr/src/compat/freebsd/sys/_iovec.h
new file mode 100644
index 0000000000..b755ae7e21
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/_iovec.h
@@ -0,0 +1,24 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS__IOVEC_H_
+#define _COMPAT_FREEBSD_SYS__IOVEC_H_
+
+struct iovec {
+ void *iov_base; /* Base address. */
+ size_t iov_len; /* Length. */
+};
+
+#endif /* _COMPAT_FREEBSD_SYS__IOVEC_H_ */
diff --git a/usr/src/compat/freebsd/sys/_pthreadtypes.h b/usr/src/compat/freebsd/sys/_pthreadtypes.h
new file mode 100644
index 0000000000..d746da3712
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/_pthreadtypes.h
@@ -0,0 +1,19 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS__PTHREADTYPES_H_
+#define _COMPAT_FREEBSD_SYS__PTHREADTYPES_H_
+
+#endif /* _COMPAT_FREEBSD_SYS__PTHREADTYPES_H_ */
diff --git a/usr/src/compat/freebsd/sys/_types.h b/usr/src/compat/freebsd/sys/_types.h
new file mode 100644
index 0000000000..62c327d216
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/_types.h
@@ -0,0 +1,22 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS__TYPES_H_
+#define _COMPAT_FREEBSD_SYS__TYPES_H_
+
+#include <sys/cdefs.h>
+#include <machine/_types.h>
+
+#endif /* _COMPAT_FREEBSD_SYS__TYPES_H_ */
diff --git a/usr/src/compat/freebsd/sys/callout.h b/usr/src/compat/freebsd/sys/callout.h
new file mode 100644
index 0000000000..17b6e31507
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/callout.h
@@ -0,0 +1,70 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_CALLOUT_H_
+#define _COMPAT_FREEBSD_SYS_CALLOUT_H_
+
+#include <sys/cyclic.h>
+
+struct callout {
+ cyclic_id_t c_cyc_id;
+ int c_flags;
+ void (*c_func)(void *);
+ void *c_arg;
+
+};
+
+#define CALLOUT_ACTIVE 0x0002 /* callout is currently active */
+#define CALLOUT_PENDING 0x0004 /* callout is waiting for timeout */
+
+#define C_ABSOLUTE 0x0200 /* event time is absolute. */
+
+#define callout_active(c) ((c)->c_flags & CALLOUT_ACTIVE)
+#define callout_deactivate(c) ((c)->c_flags &= ~CALLOUT_ACTIVE)
+#define callout_pending(c) ((c)->c_flags & CALLOUT_PENDING)
+
+void vmm_glue_callout_init(struct callout *c, int mpsafe);
+int vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt,
+ sbintime_t pr, void (*func)(void *), void *arg, int flags);
+int vmm_glue_callout_stop(struct callout *c);
+int vmm_glue_callout_drain(struct callout *c);
+
+static __inline void
+callout_init(struct callout *c, int mpsafe)
+{
+ vmm_glue_callout_init(c, mpsafe);
+}
+
+static __inline int
+callout_stop(struct callout *c)
+{
+ return (vmm_glue_callout_stop(c));
+}
+
+static __inline int
+callout_drain(struct callout *c)
+{
+ return (vmm_glue_callout_drain(c));
+}
+
+static __inline int
+callout_reset_sbt(struct callout *c, sbintime_t sbt, sbintime_t pr,
+ void (*func)(void *), void *arg, int flags)
+{
+ return (vmm_glue_callout_reset_sbt(c, sbt, pr, func, arg, flags));
+}
+
+
+#endif /* _COMPAT_FREEBSD_SYS_CALLOUT_H_ */
diff --git a/usr/src/compat/freebsd/sys/cdefs.h b/usr/src/compat/freebsd/sys/cdefs.h
new file mode 100644
index 0000000000..974e323dbe
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/cdefs.h
@@ -0,0 +1,58 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_CDEFS_H_
+#define _COMPAT_FREEBSD_SYS_CDEFS_H_
+
+#define __FBSDID(s)
+
+#ifdef __GNUC__
+#define inline __inline
+
+#define __GNUCLIKE___SECTION 1
+
+#define __dead2 __attribute__((__noreturn__))
+#define __unused __attribute__((__unused__))
+#define __used __attribute__((__used__))
+#define __packed __attribute__((__packed__))
+#define __aligned(x) __attribute__((__aligned__(x)))
+#define __section(x) __attribute__((__section__(x)))
+#endif
+
+/*
+ * The __CONCAT macro is used to concatenate parts of symbol names, e.g.
+ * with "#define OLD(foo) __CONCAT(old,foo)", OLD(foo) produces oldfoo.
+ * The __CONCAT macro is a bit tricky to use if it must work in non-ANSI
+ * mode -- there must be no spaces between its arguments, and for nested
+ * __CONCAT's, all the __CONCAT's must be at the left. __CONCAT can also
+ * concatenate double-quoted strings produced by the __STRING macro, but
+ * this only works with ANSI C.
+ *
+ * __XSTRING is like __STRING, but it expands any macros in its argument
+ * first. It is only available with ANSI C.
+ */
+#if defined(__STDC__) || defined(__cplusplus)
+#define __P(protos) protos /* full-blown ANSI C */
+#define __CONCAT1(x,y) x ## y
+#define __CONCAT(x,y) __CONCAT1(x,y)
+#define __STRING(x) #x /* stringify without expanding x */
+#define __XSTRING(x) __STRING(x) /* expand x, then stringify */
+#else /* !(__STDC__ || __cplusplus) */
+#define __P(protos) () /* traditional C preprocessor */
+#define __CONCAT(x,y) x/**/y
+#define __STRING(x) "x"
+#endif /* !(__STDC__ || __cplusplus) */
+
+#endif /* _COMPAT_FREEBSD_SYS_CDEFS_H_ */
diff --git a/usr/src/compat/freebsd/sys/cpuset.h b/usr/src/compat/freebsd/sys/cpuset.h
new file mode 100644
index 0000000000..8527624b5e
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/cpuset.h
@@ -0,0 +1,44 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_CPUSET_H_
+#define _COMPAT_FREEBSD_SYS_CPUSET_H_
+
+#define NOCPU -1
+
+#ifdef _KERNEL
+#define CPU_SET(cpu, set) CPUSET_ADD(*(set), cpu)
+#define CPU_SETOF(cpu, set) CPUSET_ONLY(*(set), cpu)
+#define CPU_ZERO(set) CPUSET_ZERO(*(set))
+#define CPU_CLR(cpu, set) CPUSET_DEL(*(set), cpu)
+#define CPU_FFS(set) cpusetobj_ffs(set)
+#define CPU_ISSET(cpu, set) CPU_IN_SET(*(set), cpu)
+#define CPU_CMP(set1, set2) CPUSET_ISEQUAL(*(set1), *(set2))
+#define CPU_SET_ATOMIC(cpu, set) CPUSET_ATOMIC_ADD(*(set), cpu)
+
+#include <sys/cpuvar.h>
+
+int cpusetobj_ffs(const cpuset_t *set);
+#else
+#include <machine/atomic.h>
+
+typedef int cpuset_t;
+
+#define CPUSET(cpu) (1UL << (cpu))
+
+#define CPU_SET_ATOMIC(cpu, set) atomic_set_int((set), CPUSET(cpu))
+#endif
+
+#endif /* _COMPAT_FREEBSD_SYS_CPUSET_H_ */
diff --git a/usr/src/compat/freebsd/sys/disk.h b/usr/src/compat/freebsd/sys/disk.h
new file mode 100644
index 0000000000..c9bdc6a2d8
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/disk.h
@@ -0,0 +1,19 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_DISK_H_
+#define _COMPAT_FREEBSD_SYS_DISK_H_
+
+#endif /* _COMPAT_FREEBSD_SYS_DISK_H_ */
diff --git a/usr/src/compat/freebsd/sys/endian.h b/usr/src/compat/freebsd/sys/endian.h
new file mode 100644
index 0000000000..a31bff55d6
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/endian.h
@@ -0,0 +1,125 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_ENDIAN_H_
+#define _COMPAT_FREEBSD_SYS_ENDIAN_H_
+
+static __inline uint16_t
+be16dec(const void *pp)
+{
+ uint8_t const *p = (uint8_t const *)pp;
+
+ return ((p[0] << 8) | p[1]);
+}
+
+static __inline uint32_t
+be32dec(const void *pp)
+{
+ uint8_t const *p = (uint8_t const *)pp;
+
+ return (((unsigned)p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3]);
+}
+
+static __inline uint64_t
+be64dec(const void *pp)
+{
+ uint8_t const *p = (uint8_t const *)pp;
+
+ return (((uint64_t)be32dec(p) << 32) | be32dec(p + 4));
+}
+
+static __inline uint16_t
+le16dec(const void *pp)
+{
+ uint8_t const *p = (uint8_t const *)pp;
+
+ return ((p[1] << 8) | p[0]);
+}
+
+static __inline uint32_t
+le32dec(const void *pp)
+{
+ uint8_t const *p = (uint8_t const *)pp;
+
+ return (((unsigned)p[3] << 24) | (p[2] << 16) | (p[1] << 8) | p[0]);
+}
+
+static __inline uint64_t
+le64dec(const void *pp)
+{
+ uint8_t const *p = (uint8_t const *)pp;
+
+ return (((uint64_t)le32dec(p + 4) << 32) | le32dec(p));
+}
+
+static __inline void
+be16enc(void *pp, uint16_t u)
+{
+ uint8_t *p = (uint8_t *)pp;
+
+ p[0] = (u >> 8) & 0xff;
+ p[1] = u & 0xff;
+}
+
+static __inline void
+be32enc(void *pp, uint32_t u)
+{
+ uint8_t *p = (uint8_t *)pp;
+
+ p[0] = (u >> 24) & 0xff;
+ p[1] = (u >> 16) & 0xff;
+ p[2] = (u >> 8) & 0xff;
+ p[3] = u & 0xff;
+}
+
+static __inline void
+be64enc(void *pp, uint64_t u)
+{
+ uint8_t *p = (uint8_t *)pp;
+
+ be32enc(p, (uint32_t)(u >> 32));
+ be32enc(p + 4, (uint32_t)(u & 0xffffffffU));
+}
+
+static __inline void
+le16enc(void *pp, uint16_t u)
+{
+ uint8_t *p = (uint8_t *)pp;
+
+ p[0] = u & 0xff;
+ p[1] = (u >> 8) & 0xff;
+}
+
+static __inline void
+le32enc(void *pp, uint32_t u)
+{
+ uint8_t *p = (uint8_t *)pp;
+
+ p[0] = u & 0xff;
+ p[1] = (u >> 8) & 0xff;
+ p[2] = (u >> 16) & 0xff;
+ p[3] = (u >> 24) & 0xff;
+}
+
+static __inline void
+le64enc(void *pp, uint64_t u)
+{
+ uint8_t *p = (uint8_t *)pp;
+
+ le32enc(p, (uint32_t)(u & 0xffffffffU));
+ le32enc(p + 4, (uint32_t)(u >> 32));
+}
+
+#endif /* _COMPAT_FREEBSD_SYS_ENDIAN_H_ */
diff --git a/usr/src/compat/freebsd/sys/errno.h b/usr/src/compat/freebsd/sys/errno.h
new file mode 100644
index 0000000000..bd37f43065
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/errno.h
@@ -0,0 +1,27 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_ERRNO_H_
+#define _COMPAT_FREEBSD_SYS_ERRNO_H_
+
+#ifndef _KERNEL
+extern int *___errno();
+
+#define errno (*(___errno()))
+#endif
+
+#include_next <sys/errno.h>
+
+#endif /* _COMPAT_FREEBSD_SYS_ERRNO_H_ */
diff --git a/usr/src/compat/freebsd/sys/fcntl.h b/usr/src/compat/freebsd/sys/fcntl.h
new file mode 100644
index 0000000000..062a3b84ac
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/fcntl.h
@@ -0,0 +1,23 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_FCNTL_H_
+#define _COMPAT_FREEBSD_SYS_FCNTL_H_
+
+#define O_DIRECT 0x0
+
+#include_next <sys/fcntl.h>
+
+#endif /* _COMPAT_FREEBSD_SYS_FCNTL_H_ */
diff --git a/usr/src/compat/freebsd/sys/ioctl.h b/usr/src/compat/freebsd/sys/ioctl.h
new file mode 100644
index 0000000000..e223e1e4c7
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/ioctl.h
@@ -0,0 +1,22 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_IOCTL_H_
+#define _COMPAT_FREEBSD_SYS_IOCTL_H_
+
+#include <sys/ioccom.h>
+#include_next <sys/ioctl.h>
+
+#endif /* _COMPAT_FREEBSD_SYS_IOCTL_H_ */
diff --git a/usr/src/compat/freebsd/sys/kernel.h b/usr/src/compat/freebsd/sys/kernel.h
new file mode 100644
index 0000000000..b1c07674e4
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/kernel.h
@@ -0,0 +1,25 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_KERNEL_H_
+#define _COMPAT_FREEBSD_SYS_KERNEL_H_
+
+#define SYSINIT(uniquifier, subsystem, order, func, ident)
+
+#include <sys/linker_set.h>
+
+#define ticks ddi_get_lbolt()
+
+#endif /* _COMPAT_FREEBSD_SYS_KERNEL_H_ */
diff --git a/usr/src/compat/freebsd/sys/ktr.h b/usr/src/compat/freebsd/sys/ktr.h
new file mode 100644
index 0000000000..96c499ef18
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/ktr.h
@@ -0,0 +1,27 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_KTR_H_
+#define _COMPAT_FREEBSD_SYS_KTR_H_
+
+#define CTR0(m, format)
+#define CTR1(m, format, p1)
+#define CTR2(m, format, p1, p2)
+#define CTR3(m, format, p1, p2, p3)
+#define CTR4(m, format, p1, p2, p3, p4)
+#define CTR5(m, format, p1, p2, p3, p4, p5)
+#define CTR6(m, d, p1, p2, p3, p4, p5, p6)
+
+#endif /* _COMPAT_FREEBSD_SYS_KTR_H_ */
diff --git a/usr/src/compat/freebsd/sys/libkern.h b/usr/src/compat/freebsd/sys/libkern.h
new file mode 100644
index 0000000000..94675a0d66
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/libkern.h
@@ -0,0 +1,25 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_LIBKERN_H_
+#define _COMPAT_FREEBSD_SYS_LIBKERN_H_
+
+#include <sys/systm.h>
+
+#ifndef min
+static __inline u_int min(u_int a, u_int b) { return (a < b ? a : b); }
+#endif
+
+#endif /* _COMPAT_FREEBSD_SYS_LIBKERN_H_ */
diff --git a/usr/src/compat/freebsd/sys/limits.h b/usr/src/compat/freebsd/sys/limits.h
new file mode 100644
index 0000000000..99ae0f4d64
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/limits.h
@@ -0,0 +1,19 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_LIMITS_H_
+#define _COMPAT_FREEBSD_SYS_LIMITS_H_
+
+#endif /* _COMPAT_FREEBSD_SYS_LIMITS_H_ */
diff --git a/usr/src/compat/freebsd/sys/malloc.h b/usr/src/compat/freebsd/sys/malloc.h
new file mode 100644
index 0000000000..579df44533
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/malloc.h
@@ -0,0 +1,44 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_MALLOC_H_
+#define _COMPAT_FREEBSD_SYS_MALLOC_H_
+
+/*
+ * flags to malloc.
+ */
+#define M_NOWAIT 0x0001 /* do not block */
+#define M_WAITOK 0x0002 /* ok to block */
+#define M_ZERO 0x0100 /* bzero the allocation */
+
+struct malloc_type {
+ const char *ks_shortdesc; /* Printable type name. */
+};
+
+#ifdef _KERNEL
+#define MALLOC_DEFINE(type, shortdesc, longdesc) \
+ struct malloc_type type[1] = { \
+ { shortdesc } \
+ }
+
+#define MALLOC_DECLARE(type) \
+ extern struct malloc_type type[1]
+
+void free(void *addr, struct malloc_type *type);
+void *malloc(unsigned long size, struct malloc_type *type, int flags);
+void *old_malloc(unsigned long size, struct malloc_type *type , int flags);
+#endif /* _KERNEL */
+
+#endif /* _COMPAT_FREEBSD_SYS_MALLOC_H_ */
diff --git a/usr/src/compat/freebsd/sys/module.h b/usr/src/compat/freebsd/sys/module.h
new file mode 100644
index 0000000000..87b73e3fa3
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/module.h
@@ -0,0 +1,19 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_MODULE_H_
+#define _COMPAT_FREEBSD_SYS_MODULE_H_
+
+#endif /* _COMPAT_FREEBSD_SYS_MODULE_H_ */
diff --git a/usr/src/compat/freebsd/sys/mutex.h b/usr/src/compat/freebsd/sys/mutex.h
new file mode 100644
index 0000000000..b99884b652
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/mutex.h
@@ -0,0 +1,81 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_MUTEX_H_
+#define _COMPAT_FREEBSD_SYS_MUTEX_H_
+
+#ifdef _KERNEL
+
+#include <sys/debug.h>
+
+#define MTX_DEF 0x00000000
+#define MTX_SPIN 0x00000001
+
+struct mtx;
+
+void mtx_init(struct mtx *, char *name, const char *type_name, int opts);
+void mtx_destroy(struct mtx *);
+
+int mtx_sleep(void *chan, struct mtx *mtx, int priority, const char *wmesg,
+ int timo);
+
+#endif /* KERNEL */
+#include_next <sys/mutex.h>
+#ifdef _KERNEL
+
+struct mtx {
+ kmutex_type_t t;
+ kmutex_t m;
+};
+
+static __inline void mtx_lock(struct mtx *mtx)
+{
+ mutex_enter(&mtx->m);
+}
+
+static __inline void mtx_unlock(struct mtx *mtx)
+{
+ mutex_exit(&mtx->m);
+}
+
+static __inline void mtx_lock_spin(struct mtx *mtx)
+{
+ mutex_enter(&mtx->m);
+}
+
+static __inline void mtx_unlock_spin(struct mtx *mtx)
+{
+ mutex_exit(&mtx->m);
+}
+
+static __inline int mtx_owned(struct mtx *mtx)
+{
+ return (mutex_owned(&mtx->m));
+}
+
+#define MA_OWNED 0
+
+static __inline void mtx_assert(struct mtx *mtx, int what)
+{
+ switch (what) {
+ case MA_OWNED:
+ ASSERT(mutex_owned(&mtx->m));
+ break;
+ }
+}
+
+#endif /* _KERNEL */
+
+#endif /* _COMPAT_FREEBSD_SYS_MUTEX_H_ */
diff --git a/usr/src/compat/freebsd/sys/param.h b/usr/src/compat/freebsd/sys/param.h
new file mode 100644
index 0000000000..f09e9183f6
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/param.h
@@ -0,0 +1,48 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_PARAM_H_
+#define _COMPAT_FREEBSD_SYS_PARAM_H_
+
+#ifndef _KERNEL
+#define MAXCOMLEN 16
+#endif
+#define MAXHOSTNAMELEN 256
+
+#ifdef _KERNEL
+#include <sys/time.h>
+
+#ifndef FALSE
+#define FALSE 0
+#endif
+#ifndef TRUE
+#define TRUE 1
+#endif
+#endif
+
+#include <machine/param.h>
+
+#define nitems(x) (sizeof((x)) / sizeof((x)[0]))
+#define rounddown(x,y) (((x)/(y))*(y))
+#define roundup(x, y) ((((x)+((y)-1))/(y))*(y)) /* to any y */
+#define roundup2(x,y) (((x)+((y)-1))&(~((y)-1))) /* if y is powers of two */
+
+/* Macros for min/max. */
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#define MAX(a,b) (((a)>(b))?(a):(b))
+
+#include_next <sys/param.h>
+
+#endif /* _COMPAT_FREEBSD_SYS_PARAM_H_ */
diff --git a/usr/src/compat/freebsd/sys/pcpu.h b/usr/src/compat/freebsd/sys/pcpu.h
new file mode 100644
index 0000000000..f29c9c5018
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/pcpu.h
@@ -0,0 +1,21 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_PCPU_H_
+#define _COMPAT_FREEBSD_SYS_PCPU_H_
+
+#define curcpu (CPU->cpu_id)
+
+#endif /* _COMPAT_FREEBSD_SYS_PCPU_H_ */
diff --git a/usr/src/compat/freebsd/sys/sched.h b/usr/src/compat/freebsd/sys/sched.h
new file mode 100644
index 0000000000..b426ee757e
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/sched.h
@@ -0,0 +1,19 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_SCHED_H_
+#define _COMPAT_FREEBSD_SYS_SCHED_H_
+
+#endif /* _COMPAT_FREEBSD_SYS_SCHED_H_ */
diff --git a/usr/src/compat/freebsd/sys/select.h b/usr/src/compat/freebsd/sys/select.h
new file mode 100644
index 0000000000..fcb40c23b1
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/select.h
@@ -0,0 +1,23 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_MODULE_H_
+#define _COMPAT_FREEBSD_SYS_MODULE_H_
+
+void *memset(void *s, int c, size_t n);
+
+#include_next <sys/select.h>
+
+#endif /* _COMPAT_FREEBSD_SYS_MODULE_H_ */
diff --git a/usr/src/compat/freebsd/sys/smp.h b/usr/src/compat/freebsd/sys/smp.h
new file mode 100644
index 0000000000..46183e8677
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/smp.h
@@ -0,0 +1,28 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_SMP_H_
+#define _COMPAT_FREEBSD_SYS_SMP_H_
+
+#include <sys/cpuset.h>
+
+void smp_rendezvous(void (*)(void *),
+ void (*)(void *),
+ void (*)(void *),
+ void *arg);
+
+void ipi_cpu(int cpu, u_int ipi);
+
+#endif /* _COMPAT_FREEBSD_SYS_SMP_H_ */
diff --git a/usr/src/compat/freebsd/sys/sysctl.h b/usr/src/compat/freebsd/sys/sysctl.h
new file mode 100644
index 0000000000..9f6a695e34
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/sysctl.h
@@ -0,0 +1,27 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_SYSCTL_H_
+#define _COMPAT_FREEBSD_SYS_SYSCTL_H_
+
+#define SYSCTL_DECL(name)
+
+#define SYSCTL_NODE(parent, nbr, name, access, handler, descr)
+
+#define SYSCTL_INT(parent, nbr, name, access, ptr, val, descr)
+#define SYSCTL_UINT(parent, nbr, name, access, ptr, val, descr)
+#define SYSCTL_ULONG(parent, nbr, name, access, ptr, val, descr)
+
+#endif /* _COMPAT_FREEBSD_SYS_SYSCTL_H_ */
diff --git a/usr/src/compat/freebsd/sys/systm.h b/usr/src/compat/freebsd/sys/systm.h
new file mode 100644
index 0000000000..e25acc0e4a
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/systm.h
@@ -0,0 +1,53 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_SYSTM_H_
+#define _COMPAT_FREEBSD_SYS_SYSTM_H_
+
+#include <machine/atomic.h>
+#include <machine/cpufunc.h>
+#include <sys/callout.h>
+#include <sys/queue.h>
+
+struct mtx;
+
+#define KASSERT(exp,msg) do { \
+ if (!(exp)) \
+ panic msg; \
+} while (0)
+
+#define CTASSERT(x) _CTASSERT(x, __LINE__)
+#define _CTASSERT(x,y) __CTASSERT(x,y)
+#define __CTASSERT(x,y) typedef char __assert ## y[(x) ? 1 : -1]
+
+void critical_enter(void);
+void critical_exit(void);
+
+int msleep_spin(void *chan, struct mtx *mutex, const char *wmesg,
+ int ticks);
+void wakeup(void *chan);
+void wakeup_one(void *chan);
+
+struct unrhdr *new_unrhdr(int low, int high, struct mtx *mutex);
+void delete_unrhdr(struct unrhdr *uh);
+int alloc_unr(struct unrhdr *uh);
+void free_unr(struct unrhdr *uh, u_int item);
+
+#include <sys/libkern.h>
+
+#include_next <sys/systm.h>
+#include <sys/cmn_err.h>
+
+#endif /* _COMPAT_FREEBSD_SYS_SYSTM_H_ */
diff --git a/usr/src/compat/freebsd/sys/time.h b/usr/src/compat/freebsd/sys/time.h
new file mode 100644
index 0000000000..f8f9da5cdf
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/time.h
@@ -0,0 +1,104 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_TIME_H_
+#define _COMPAT_FREEBSD_SYS_TIME_H_
+
+#include_next <sys/time.h>
+
+#define tc_precexp 0
+
+struct bintime {
+ ulong_t sec; /* seconds */
+ uint64_t frac; /* 64 bit fraction of a second */
+};
+
+#define BT2FREQ(bt) \
+ (((uint64_t)0x8000000000000000 + ((bt)->frac >> 2)) / \
+ ((bt)->frac >> 1))
+
+#define FREQ2BT(freq, bt) \
+{ \
+ (bt)->sec = 0; \
+ (bt)->frac = ((uint64_t)0x8000000000000000 / (freq)) << 1; \
+}
+
+static __inline void
+binuptime(struct bintime *bt)
+{
+ hrtime_t now = gethrtime();
+
+ bt->sec = now / 1000000000;
+ /* 18446744073 = int(2^64 / 1000000000) = 1ns in 64-bit fractions */
+ bt->frac = (now % 1000000000) * (uint64_t)18446744073LL;
+}
+
+#define bintime_cmp(a, b, cmp) \
+ (((a)->sec == (b)->sec) ? \
+ ((a)->frac cmp (b)->frac) : \
+ ((a)->sec cmp (b)->sec))
+
+#define SBT_1US (1000)
+
+static __inline void
+bintime_add(struct bintime *bt, const struct bintime *bt2)
+{
+ uint64_t u;
+
+ u = bt->frac;
+ bt->frac += bt2->frac;
+ if (u > bt->frac)
+ bt->sec++;
+ bt->sec += bt2->sec;
+}
+
+static __inline void
+bintime_sub(struct bintime *bt, const struct bintime *bt2)
+{
+ uint64_t u;
+
+ u = bt->frac;
+ bt->frac -= bt2->frac;
+ if (u < bt->frac)
+ bt->sec--;
+ bt->sec -= bt2->sec;
+}
+
+static __inline void
+bintime_mul(struct bintime *bt, u_int x)
+{
+ uint64_t p1, p2;
+
+ p1 = (bt->frac & 0xffffffffull) * x;
+ p2 = (bt->frac >> 32) * x + (p1 >> 32);
+ bt->sec *= x;
+ bt->sec += (p2 >> 32);
+ bt->frac = (p2 << 32) | (p1 & 0xffffffffull);
+}
+
+static __inline sbintime_t
+bttosbt(const struct bintime bt)
+{
+ return ((bt.sec * 1000000000) +
+ (((uint64_t)1000000000 * (uint32_t)(bt.frac >> 32)) >> 32));
+}
+
+static __inline sbintime_t
+sbinuptime(void)
+{
+ return (gethrtime());
+}
+
+#endif /* _COMPAT_FREEBSD_SYS_TIME_H_ */
diff --git a/usr/src/compat/freebsd/sys/types.h b/usr/src/compat/freebsd/sys/types.h
new file mode 100644
index 0000000000..6fc8179f2e
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/types.h
@@ -0,0 +1,74 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_TYPES_H_
+#define _COMPAT_FREEBSD_SYS_TYPES_H_
+
+#include <sys/_types.h>
+
+typedef __uint8_t u_int8_t; /* unsigned integrals (deprecated) */
+typedef __uint16_t u_int16_t;
+typedef __uint32_t u_int32_t;
+typedef __uint64_t u_int64_t;
+
+#ifndef __REGISTER_T_DEFINED
+#define __REGISTER_T_DEFINED
+typedef __register_t register_t;
+#endif
+
+#ifndef __SBINTIME_T_DEFINED
+#define __SBINTIME_T_DEFINED
+typedef __int64_t sbintime_t;
+#endif
+
+#ifndef __VM_MEMATTR_T_DEFINED
+#define __VM_MEMATTR_T_DEFINED
+typedef char vm_memattr_t;
+#endif
+
+#ifndef __VM_OFFSET_T_DEFINED
+#define __VM_OFFSET_T_DEFINED
+typedef __vm_offset_t vm_offset_t;
+#endif
+
+#ifndef __VM_OOFFSET_T_DEFINED
+#define __VM_OOFFSET_T_DEFINED
+typedef __vm_ooffset_t vm_ooffset_t;
+#endif
+
+#ifndef __VM_PADDR_T_DEFINED
+#define __VM_PADDR_T_DEFINED
+typedef __vm_paddr_t vm_paddr_t;
+#endif
+
+#ifndef __VM_MEMATTR_T_DEFINED
+#define __VM_MEMATTR_T_DEFINED
+typedef char vm_memattr_t;
+#endif
+
+#ifndef __bool_true_false_are_defined
+#define __bool_true_false_are_defined 1
+#define false 0
+#define true 1
+typedef _Bool bool;
+#endif
+
+#if defined(_KERNEL) && !defined(offsetof)
+#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
+#endif
+
+#include_next <sys/types.h>
+
+#endif /* _COMPAT_FREEBSD_SYS_TYPES_H_ */
diff --git a/usr/src/compat/freebsd/sys/uio.h b/usr/src/compat/freebsd/sys/uio.h
new file mode 100644
index 0000000000..05c6f2a028
--- /dev/null
+++ b/usr/src/compat/freebsd/sys/uio.h
@@ -0,0 +1,26 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_SYS_UIO_H_
+#define _COMPAT_FREEBSD_SYS_UIO_H_
+
+#include_next <sys/uio.h>
+
+#ifndef _KERNEL
+ssize_t preadv(int, const struct iovec *, int, off_t);
+ssize_t pwritev(int, const struct iovec *, int, off_t);
+#endif
+
+#endif /* _COMPAT_FREEBSD_SYS_UIO_H_ */
diff --git a/usr/src/compat/freebsd/termios.h b/usr/src/compat/freebsd/termios.h
new file mode 100644
index 0000000000..feaa705358
--- /dev/null
+++ b/usr/src/compat/freebsd/termios.h
@@ -0,0 +1,23 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_TERMIOS_H_
+#define _COMPAT_FREEBSD_TERMIOS_H_
+
+#include_next <termios.h>
+
+void cfmakeraw(struct termios *);
+
+#endif /* _COMPAT_FREEBSD_TERMIOS_H_ */
diff --git a/usr/src/compat/freebsd/uuid.h b/usr/src/compat/freebsd/uuid.h
new file mode 100644
index 0000000000..72ef2c7787
--- /dev/null
+++ b/usr/src/compat/freebsd/uuid.h
@@ -0,0 +1,55 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_UUID_H_
+#define _COMPAT_FREEBSD_UUID_H_
+
+#include <sys/endian.h>
+#include <uuid/uuid.h>
+
+/* Status codes returned by the functions. */
+#define uuid_s_ok 0
+#define uuid_s_bad_version 1
+#define uuid_s_invalid_string_uuid 2
+
+static __inline void
+uuid_from_string(char *str, uuid_t *uuidp, uint32_t *status)
+{
+ if (uuid_parse(str, *uuidp) == 0) {
+ *status = uuid_s_ok;
+ } else {
+ *status = uuid_s_invalid_string_uuid;
+ }
+}
+
+static __inline void
+uuid_enc_le(void *buf, uuid_t *uuidp)
+{
+ uchar_t *p;
+ int i;
+
+ p = buf;
+ be32enc(p, ((struct uuid *)uuidp)->time_low);
+ be16enc(p + 4, ((struct uuid *)uuidp)->time_mid);
+ be16enc(p + 6, ((struct uuid *)uuidp)->time_hi_and_version);
+ p[8] = ((struct uuid *)uuidp)->clock_seq_hi_and_reserved;
+ p[9] = ((struct uuid *)uuidp)->clock_seq_low;
+
+ for (i = 0; i < 6; i++)
+ p[10 + i] = ((struct uuid *)uuidp)->node_addr[i];
+
+}
+
+#endif /* _COMPAT_FREEBSD_UUID_H_ */
diff --git a/usr/src/compat/freebsd/vm/pmap.h b/usr/src/compat/freebsd/vm/pmap.h
new file mode 100644
index 0000000000..5958c4b101
--- /dev/null
+++ b/usr/src/compat/freebsd/vm/pmap.h
@@ -0,0 +1,21 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_VM_PMAP_H_
+#define _COMPAT_FREEBSD_VM_PMAP_H_
+
+#include <machine/pmap.h>
+
+#endif /* _COMPAT_FREEBSD_VM_PMAP_H_ */
diff --git a/usr/src/compat/freebsd/vm/vm.h b/usr/src/compat/freebsd/vm/vm.h
new file mode 100644
index 0000000000..7da22099b6
--- /dev/null
+++ b/usr/src/compat/freebsd/vm/vm.h
@@ -0,0 +1,39 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _FREEBSD_VM_VM_H_
+#define _FREEBSD_VM_VM_H_
+
+#include <machine/vm.h>
+
+typedef u_char vm_prot_t;
+
+#define VM_PROT_NONE ((vm_prot_t) 0x00)
+#define VM_PROT_READ ((vm_prot_t) 0x01)
+#define VM_PROT_WRITE ((vm_prot_t) 0x02)
+#define VM_PROT_EXECUTE ((vm_prot_t) 0x04)
+
+#define VM_PROT_ALL (VM_PROT_READ|VM_PROT_WRITE|VM_PROT_EXECUTE)
+#define VM_PROT_RW (VM_PROT_READ|VM_PROT_WRITE)
+
+/*
+ * <sys/promif.h> contains a troublesome preprocessor define for BYTE.
+ * Do this ugly workaround to avoid it.
+ */
+#define _SYS_PROMIF_H
+#include <vm/hat_i86.h>
+#undef _SYS_PROMIF_H
+
+#endif /* _FREEBSD_VM_VM_H_ */
diff --git a/usr/src/compat/freebsd/x86/_types.h b/usr/src/compat/freebsd/x86/_types.h
new file mode 100644
index 0000000000..a07fc017ad
--- /dev/null
+++ b/usr/src/compat/freebsd/x86/_types.h
@@ -0,0 +1,49 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _FREEBSD_X86__TYPES_H_
+#define _FREEBSD_X86__TYPES_H_
+
+/*
+ * Basic types upon which most other types are built.
+ */
+typedef signed char __int8_t;
+typedef unsigned char __uint8_t;
+typedef short __int16_t;
+typedef unsigned short __uint16_t;
+typedef int __int32_t;
+typedef unsigned int __uint32_t;
+#ifdef _LP64
+typedef long __int64_t;
+typedef unsigned long __uint64_t;
+#else
+typedef long long __int64_t;
+typedef unsigned long long __uint64_t;
+#endif
+
+/*
+ * Standard type definitions.
+ */
+#ifdef _LP64
+typedef __int64_t __register_t;
+typedef __uint64_t __vm_offset_t;
+typedef __uint64_t __vm_paddr_t;
+typedef __int64_t __vm_ooffset_t;
+#else
+typedef __int32_t __register_t;
+typedef __uint32_t __vm_paddr_t;
+#endif
+
+#endif /* _FREEBSD_X86__TYPES_H_ */
diff --git a/usr/src/compat/freebsd/x86/segments.h b/usr/src/compat/freebsd/x86/segments.h
new file mode 100644
index 0000000000..bc6ba976b8
--- /dev/null
+++ b/usr/src/compat/freebsd/x86/segments.h
@@ -0,0 +1,28 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#ifndef _COMPAT_FREEBSD_X86_SEGMENTS_H_
+#define _COMPAT_FREEBSD_X86_SEGMENTS_H_
+
+/*
+ * Entries in the Interrupt Descriptor Table (IDT)
+ */
+#define IDT_BP 3 /* #BP: Breakpoint */
+#define IDT_UD 6 /* #UD: Undefined/Invalid Opcode */
+#define IDT_SS 12 /* #SS: Stack Segment Fault */
+#define IDT_GP 13 /* #GP: General Protection Fault */
+#define IDT_AC 17 /* #AC: Alignment Check */
+
+#endif /* _COMPAT_FREEBSD_AMD64_MACHINE_SEGMENTS_H_ */
diff --git a/usr/src/head/bhyve.h b/usr/src/head/bhyve.h
new file mode 100644
index 0000000000..8c79ca1ccc
--- /dev/null
+++ b/usr/src/head/bhyve.h
@@ -0,0 +1,25 @@
+/*
+ * COPYRIGHT 2013 Pluribus Networks Inc.
+ *
+ * All rights reserved. This copyright notice is Copyright Management
+ * Information under 17 USC 1202 and is included to protect this work and
+ * deter copyright infringement. Removal or alteration of this Copyright
+ * Management Information without the express written permission from
+ * Pluribus Networks Inc is prohibited, and any such unauthorized removal
+ * or alteration will be a violation of federal law.
+ */
+#ifndef _BHYVE_H
+#define _BHYVE_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BHYVE_TMPDIR "/var/run/bhyve"
+#define BHYVE_CONS_SOCKPATH BHYVE_TMPDIR "/%s.console_sock"
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _BHYVE_H */
diff --git a/usr/src/lib/libvmmapi/Makefile b/usr/src/lib/libvmmapi/Makefile
new file mode 100644
index 0000000000..60621fcb75
--- /dev/null
+++ b/usr/src/lib/libvmmapi/Makefile
@@ -0,0 +1,49 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+include ../Makefile.lib
+
+HDRS = vmmapi.h
+
+HDRDIR = common
+
+$(BUILD64)SUBDIRS += $(MACH64)
+
+all:= TARGET= all
+install:= TARGET= install
+clean:= TARGET= clean
+clobber:= TARGET= clobber
+lint:= TARGET= lint
+_msg:= TARGET= _msg
+
+.KEEP_STATE:
+
+all install clean clobber lint: $(SUBDIRS)
+
+# install rule for install_h target
+
+install_h: $(ROOTHDRS)
+
+check: $(CHECKHDRS)
+
+_msg: $(MSGSUBDIRS)
+
+$(SUBDIRS): FRC
+ cd $@; pwd; $(MAKE) CW_NO_SHADOW=true __GNUC= $(TARGET)
+
+FRC:
+
+include ../Makefile.targ
+include ../../Makefile.msg.targ
diff --git a/usr/src/lib/libvmmapi/Makefile.com b/usr/src/lib/libvmmapi/Makefile.com
new file mode 100644
index 0000000000..e41a82f9a2
--- /dev/null
+++ b/usr/src/lib/libvmmapi/Makefile.com
@@ -0,0 +1,53 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+LIBRARY = libvmmapi.a
+VERS = .1
+
+OBJECTS = vmmapi.o expand_number.o
+
+# include library definitions
+include ../../Makefile.lib
+
+# install this library in the root filesystem
+include ../../Makefile.rootfs
+
+SRCDIR = ../common
+
+LIBS = $(DYNLIB) $(LINTLIB)
+
+CPPFLAGS = -I$(COMPAT)/freebsd -I$(CONTRIB)/freebsd \
+ $(CPPFLAGS.master) -I$(SRC)/uts/i86pc
+
+$(LINTLIB) := SRCS = $(SRCDIR)/$(LINTSRC)
+
+LDLIBS += -lc
+
+.KEEP_STATE:
+
+all: $(LIBS)
+
+lint: lintcheck
+
+pics/%.o: $(CONTRIB)/freebsd/lib/libutil/%.c
+ $(COMPILE.c) -o $@ $<
+ $(POST_PROCESS_O)
+
+pics/%.o: ../common/%.c
+ $(COMPILE.c) -o $@ $<
+ $(POST_PROCESS_O)
+
+# include library targets
+include ../../Makefile.targ
diff --git a/usr/src/lib/libvmmapi/amd64/Makefile b/usr/src/lib/libvmmapi/amd64/Makefile
new file mode 100644
index 0000000000..b5cac1ffce
--- /dev/null
+++ b/usr/src/lib/libvmmapi/amd64/Makefile
@@ -0,0 +1,21 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+include ../Makefile.com
+include ../../Makefile.lib.64
+
+CPPFLAGS += -I$(COMPAT)/freebsd/amd64 -I$(CONTRIB)/freebsd/amd64
+
+install: all $(ROOTLIBS64) $(ROOTLINKS64) $(ROOTLINT64)
diff --git a/usr/src/lib/libvmmapi/common/llib-lvmmapi b/usr/src/lib/libvmmapi/common/llib-lvmmapi
new file mode 100644
index 0000000000..221ed3a23e
--- /dev/null
+++ b/usr/src/lib/libvmmapi/common/llib-lvmmapi
@@ -0,0 +1,2 @@
+/* LINTLIBRARY */
+/* PROTOLIB1 */
diff --git a/usr/src/lib/libvmmapi/common/mapfile-vers b/usr/src/lib/libvmmapi/common/mapfile-vers
new file mode 100644
index 0000000000..7a8443a2b8
--- /dev/null
+++ b/usr/src/lib/libvmmapi/common/mapfile-vers
@@ -0,0 +1,77 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+#
+# MAPFILE HEADER START
+#
+# WARNING: STOP NOW. DO NOT MODIFY THIS FILE.
+# Object versioning must comply with the rules detailed in
+#
+# usr/src/lib/README.mapfiles
+#
+# You should not be making modifications here until you've read the most current
+# copy of that file. If you need help, contact a gatekeeper for guidance.
+#
+# MAPFILE HEADER END
+#
+
+SUNWprivate_1.0 {
+ global:
+ vcpu_reset;
+ vm_activate_cpu;
+ vm_apicid2vcpu;
+ vm_capability_name2type;
+ vm_capability_type2name;
+ vm_copy_setup;
+ vm_copy_teardown;
+ vm_copyin;
+ vm_copyout;
+ vm_create;
+ vm_destroy;
+ vm_get_capability;
+ vm_get_desc;
+ vm_get_highmem_size;
+ vm_get_lowmem_limit;
+ vm_get_lowmem_size;
+ vm_get_memory_seg;
+ vm_get_register;
+ vm_get_seg_desc;
+ vm_get_x2apic_state;
+ vm_gla2gpa;
+ vm_inject_exception;
+ vm_isa_assert_irq;
+ vm_isa_deassert_irq;
+ vm_isa_pulse_irq;
+ vm_isa_set_irq_trigger;
+ vm_ioapic_assert_irq;
+ vm_ioapic_deassert_irq;
+ vm_ioapic_pincount;
+ vm_ioapic_pulse_irq;
+ vm_lapic_irq;
+ vm_lapic_msi;
+ vm_map_gpa;
+ vm_open;
+ vm_parse_memsize;
+ vm_restart_instruction;
+ vm_run;
+ vm_set_capability;
+ vm_set_desc;
+ vm_set_register;
+ vm_set_x2apic_state;
+ vm_setup_memory;
+ vm_setup_rom;
+ local:
+ *;
+};
diff --git a/usr/src/lib/libvmmapi/common/vmmapi.c b/usr/src/lib/libvmmapi/common/vmmapi.c
new file mode 100644
index 0000000000..bbab3961a9
--- /dev/null
+++ b/usr/src/lib/libvmmapi/common/vmmapi.c
@@ -0,0 +1,1257 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tychon $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/lib/libvmmapi/vmmapi.c 280929 2015-04-01 00:15:31Z tychon $");
+
+#include <sys/param.h>
+#include <sys/sysctl.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <sys/_iovec.h>
+#include <sys/cpuset.h>
+
+#include <machine/specialreg.h>
+
+#ifndef __FreeBSD__
+#include <errno.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <string.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <libutil.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+#ifndef __FreeBSD__
+#include <sys/vmm_impl.h>
+#endif
+
+#include "vmmapi.h"
+
+#define KB (1024UL)
+#define MB (1024 * 1024UL)
+#define GB (1024 * 1024 * 1024UL)
+
+struct vmctx {
+ int fd;
+ uint32_t lowmem_limit;
+ enum vm_mmap_style vms;
+ char *lowermem_addr;
+ char *biosmem_addr;
+ size_t lowmem;
+ char *lowmem_addr;
+ size_t highmem;
+ char *highmem_addr;
+ uint64_t rombase;
+ uint64_t romlimit;
+ char *rom_addr;
+ char *name;
+};
+
+#ifdef __FreeBSD__
+#define CREATE(x) sysctlbyname("hw.vmm.create", NULL, NULL, (x), strlen((x)))
+#define DESTROY(x) sysctlbyname("hw.vmm.destroy", NULL, NULL, (x), strlen((x)))
+#else
+#define CREATE(x) vmm_vm_create(x)
+#define DESTROY(x) vmm_vm_destroy(x)
+#endif
+
+static int
+vm_device_open(const char *name)
+{
+ int fd, len;
+ char *vmfile;
+
+#ifdef __FreeBSD__
+ len = strlen("/dev/vmm/") + strlen(name) + 1;
+#else
+ len = strlen("/devices/pseudo/vmm@0:") + strlen(name) + 1;
+#endif
+ vmfile = malloc(len);
+ assert(vmfile != NULL);
+#ifdef __FreeBSD__
+ snprintf(vmfile, len, "/dev/vmm/%s", name);
+#else
+ snprintf(vmfile, len, "/devices/pseudo/vmm@0:%s", name);
+#endif
+
+ /* Open the device file */
+ fd = open(vmfile, O_RDWR, 0);
+
+ free(vmfile);
+ return (fd);
+}
+
+#ifndef __FreeBSD__
+static int
+vmm_vm_create(const char *name)
+{
+ const char vmm_ctl[] = "/devices/pseudo/vmm@0:ctl";
+ struct vmm_ioctl vi;
+ int err = 0;
+ int ctl_fd;
+
+ (void) strlcpy(vi.vmm_name, name, sizeof (vi.vmm_name) - 1);
+
+ ctl_fd = open(vmm_ctl, O_EXCL | O_RDWR);
+ if (ctl_fd == -1) {
+ err = errno;
+ if ((errno == EPERM) || (errno == EACCES)) {
+ fprintf(stderr, "you do not have permission to "
+ "perform that operation.\n");
+ } else {
+ fprintf(stderr, "open: %s: %s\n", vmm_ctl,
+ strerror(errno));
+ }
+ return (err);
+ }
+ if (ioctl(ctl_fd, VMM_CREATE_VM, &vi) == -1) {
+ err = errno;
+ fprintf(stderr, "couldn't create vm \"%s\"", name);
+ }
+ close (ctl_fd);
+
+ return (err);
+}
+#endif
+
+int
+vm_create(const char *name)
+{
+
+ return (CREATE((char *)name));
+}
+
+struct vmctx *
+vm_open(const char *name)
+{
+ struct vmctx *vm;
+
+ vm = malloc(sizeof(struct vmctx) + strlen(name) + 1);
+ assert(vm != NULL);
+
+ vm->fd = -1;
+ vm->lowmem_limit = 3 * GB;
+ vm->name = (char *)(vm + 1);
+ strcpy(vm->name, name);
+
+ if ((vm->fd = vm_device_open(vm->name)) < 0)
+ goto err;
+
+ return (vm);
+err:
+ (void) vm_destroy(vm);
+ return (NULL);
+}
+
+#ifndef __FreeBSD__
+static int
+vmm_vm_destroy(const char *name)
+{
+ const char vmm_ctl[] = "/devices/pseudo/vmm@0:ctl";
+ struct vmm_ioctl vi;
+ int ctl_fd;
+ int err = 0;
+
+ (void) strlcpy(vi.vmm_name, name, sizeof (vi.vmm_name) - 1);
+
+ ctl_fd = open(vmm_ctl, O_EXCL | O_RDWR);
+ if (ctl_fd == -1) {
+ err = errno;
+ if ((errno == EPERM) || (errno == EACCES)) {
+ fprintf(stderr, "you do not have permission to "
+ "perform that operation.\n");
+ } else {
+ fprintf(stderr, "open: %s: %s\n", vmm_ctl,
+ strerror(errno));
+ }
+ return (err);
+ }
+ if (ioctl(ctl_fd, VMM_DESTROY_VM, &vi) == -1) {
+ err = errno;
+ fprintf(stderr, "couldn't destroy vm \"%s\"", name);
+ }
+ close (ctl_fd);
+ return (err);
+}
+#endif
+
+int
+vm_destroy(struct vmctx *vm)
+{
+ int err;
+ assert(vm != NULL);
+
+ if (vm->fd >= 0)
+ close(vm->fd);
+ err = DESTROY(vm->name);
+
+ free(vm);
+ return (err);
+}
+
+int
+vm_parse_memsize(const char *optarg, size_t *ret_memsize)
+{
+ char *endptr;
+ size_t optval;
+ int error;
+
+ optval = strtoul(optarg, &endptr, 0);
+ if (*optarg != '\0' && *endptr == '\0') {
+ /*
+ * For the sake of backward compatibility if the memory size
+ * specified on the command line is less than a megabyte then
+ * it is interpreted as being in units of MB.
+ */
+ if (optval < MB)
+ optval *= MB;
+ *ret_memsize = optval;
+ error = 0;
+ } else
+ error = expand_number(optarg, ret_memsize);
+
+ return (error);
+}
+
+#ifdef __FreeBSD__
+size_t
+vmm_get_mem_total(void)
+{
+ size_t mem_total = 0;
+ size_t oldlen = sizeof(mem_total);
+ int error;
+ error = sysctlbyname("hw.vmm.mem_total", &mem_total, &oldlen, NULL, 0);
+ if (error)
+ return -1;
+ return mem_total;
+}
+
+size_t
+vmm_get_mem_free(void)
+{
+ size_t mem_free = 0;
+ size_t oldlen = sizeof(mem_free);
+ int error;
+ error = sysctlbyname("hw.vmm.mem_free", &mem_free, &oldlen, NULL, 0);
+ if (error)
+ return -1;
+ return mem_free;
+}
+#endif
+
+int
+vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
+ int *wired)
+{
+ int error;
+ struct vm_memory_segment seg;
+
+ bzero(&seg, sizeof(seg));
+ seg.gpa = gpa;
+ error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
+ *ret_len = seg.len;
+ if (wired != NULL)
+ *wired = seg.wired;
+ return (error);
+}
+
+uint32_t
+vm_get_lowmem_limit(struct vmctx *ctx)
+{
+
+ return (ctx->lowmem_limit);
+}
+
+void
+vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit)
+{
+
+ ctx->lowmem_limit = limit;
+}
+
+static int
+setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr)
+{
+ int error;
+ struct vm_memory_segment seg;
+
+ /*
+ * Create and optionally map 'len' bytes of memory at guest
+ * physical address 'gpa'
+ */
+ bzero(&seg, sizeof(seg));
+ seg.gpa = gpa;
+ seg.len = len;
+ error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg);
+ if (error == 0 && addr != NULL) {
+ *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED,
+ ctx->fd, gpa);
+ }
+ return (error);
+}
+
+int
+vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
+{
+ char **addr;
+ int error;
+
+ /* XXX VM_MMAP_SPARSE not implemented yet */
+ assert(vms == VM_MMAP_NONE || vms == VM_MMAP_ALL);
+ ctx->vms = vms;
+
+ /*
+ * If 'memsize' cannot fit entirely in the 'lowmem' segment then
+ * create another 'highmem' segment above 4GB for the remainder.
+ */
+ if (memsize > ctx->lowmem_limit) {
+ ctx->lowmem = ctx->lowmem_limit;
+ ctx->highmem = memsize - ctx->lowmem;
+ } else {
+ ctx->lowmem = memsize;
+ ctx->highmem = 0;
+ }
+
+ if (ctx->lowmem > 0) {
+ addr = (vms == VM_MMAP_ALL) ? &ctx->lowermem_addr : NULL;
+ error = setup_memory_segment(ctx, 0, 640*KB, addr);
+ if (error)
+ return (error);
+
+ addr = (vms == VM_MMAP_ALL) ? &ctx->biosmem_addr : NULL;
+ error = setup_memory_segment(ctx, 768*KB, 256*KB, addr);
+ if (error)
+ return (error);
+
+ addr = (vms == VM_MMAP_ALL) ? &ctx->lowmem_addr : NULL;
+ error = setup_memory_segment(ctx, 1*MB, ctx->lowmem - 1*MB, addr);
+ if (error)
+ return (error);
+ }
+
+ if (ctx->highmem > 0) {
+ addr = (vms == VM_MMAP_ALL) ? &ctx->highmem_addr : NULL;
+ error = setup_memory_segment(ctx, 4*GB, ctx->highmem, addr);
+ if (error)
+ return (error);
+ }
+
+ return (0);
+}
+
+int
+vm_setup_rom(struct vmctx *ctx, vm_paddr_t gpa, size_t len)
+{
+ ctx->rombase = gpa;
+ ctx->romlimit = gpa + len;
+
+ return (setup_memory_segment(ctx, gpa, len, &ctx->rom_addr));
+}
+
+void *
+vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
+{
+
+ /* XXX VM_MMAP_SPARSE not implemented yet */
+ assert(ctx->vms == VM_MMAP_ALL);
+
+ if (gaddr + len <= 1*MB) {
+ if (gaddr + len <= 640*KB)
+ return ((void *)(ctx->lowermem_addr + gaddr));
+
+ if (768*KB <= gaddr && gaddr + len <= 1*MB) {
+ gaddr -= 768*KB;
+ return ((void *)(ctx->biosmem_addr + gaddr));
+ }
+
+ return (NULL);
+ }
+
+ if (gaddr < ctx->lowmem && gaddr + len <= ctx->lowmem) {
+ gaddr -= 1*MB;
+ return ((void *)(ctx->lowmem_addr + gaddr));
+ }
+
+ if (ctx->rombase <= gaddr && gaddr + len <= ctx->romlimit) {
+ gaddr -= ctx->rombase;
+ return ((void *)(ctx->rom_addr + gaddr));
+ }
+
+ if (gaddr >= 4*GB) {
+ gaddr -= 4*GB;
+ if (gaddr < ctx->highmem && gaddr + len <= ctx->highmem)
+ return ((void *)(ctx->highmem_addr + gaddr));
+ }
+
+ return (NULL);
+}
+
+size_t
+vm_get_lowmem_size(struct vmctx *ctx)
+{
+
+ return (ctx->lowmem);
+}
+
+size_t
+vm_get_highmem_size(struct vmctx *ctx)
+{
+
+ return (ctx->highmem);
+}
+
+int
+vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t base, uint32_t limit, uint32_t access)
+{
+ int error;
+ struct vm_seg_desc vmsegdesc;
+
+ bzero(&vmsegdesc, sizeof(vmsegdesc));
+ vmsegdesc.cpuid = vcpu;
+ vmsegdesc.regnum = reg;
+ vmsegdesc.desc.base = base;
+ vmsegdesc.desc.limit = limit;
+ vmsegdesc.desc.access = access;
+
+ error = ioctl(ctx->fd, VM_SET_SEGMENT_DESCRIPTOR, &vmsegdesc);
+ return (error);
+}
+
+int
+vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t *base, uint32_t *limit, uint32_t *access)
+{
+ int error;
+ struct vm_seg_desc vmsegdesc;
+
+ bzero(&vmsegdesc, sizeof(vmsegdesc));
+ vmsegdesc.cpuid = vcpu;
+ vmsegdesc.regnum = reg;
+
+ error = ioctl(ctx->fd, VM_GET_SEGMENT_DESCRIPTOR, &vmsegdesc);
+ if (error == 0) {
+ *base = vmsegdesc.desc.base;
+ *limit = vmsegdesc.desc.limit;
+ *access = vmsegdesc.desc.access;
+ }
+ return (error);
+}
+
+int
+vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg, struct seg_desc *seg_desc)
+{
+ int error;
+
+ error = vm_get_desc(ctx, vcpu, reg, &seg_desc->base, &seg_desc->limit,
+ &seg_desc->access);
+ return (error);
+}
+
+int
+vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val)
+{
+ int error;
+ struct vm_register vmreg;
+
+ bzero(&vmreg, sizeof(vmreg));
+ vmreg.cpuid = vcpu;
+ vmreg.regnum = reg;
+ vmreg.regval = val;
+
+ error = ioctl(ctx->fd, VM_SET_REGISTER, &vmreg);
+ return (error);
+}
+
+int
+vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *ret_val)
+{
+ int error;
+ struct vm_register vmreg;
+
+ bzero(&vmreg, sizeof(vmreg));
+ vmreg.cpuid = vcpu;
+ vmreg.regnum = reg;
+
+ error = ioctl(ctx->fd, VM_GET_REGISTER, &vmreg);
+ *ret_val = vmreg.regval;
+ return (error);
+}
+
+int
+vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *vmexit)
+{
+ int error;
+ struct vm_run vmrun;
+
+ bzero(&vmrun, sizeof(vmrun));
+ vmrun.cpuid = vcpu;
+
+ error = ioctl(ctx->fd, VM_RUN, &vmrun);
+ bcopy(&vmrun.vm_exit, vmexit, sizeof(struct vm_exit));
+ return (error);
+}
+
+static int
+vm_inject_exception_real(struct vmctx *ctx, int vcpu, int vector,
+ int error_code, int error_code_valid)
+{
+ struct vm_exception exc;
+
+ bzero(&exc, sizeof(exc));
+ exc.cpuid = vcpu;
+ exc.vector = vector;
+ exc.error_code = error_code;
+ exc.error_code_valid = error_code_valid;
+
+ return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc));
+}
+
+int
+vm_inject_exception(struct vmctx *ctx, int vcpu, int vector, int errcode_valid,
+ uint32_t errcode, int restart_instruction)
+{
+ struct vm_exception exc;
+
+ exc.cpuid = vcpu;
+ exc.vector = vector;
+ exc.error_code = errcode;
+ exc.error_code_valid = errcode_valid;
+ exc.restart_instruction = restart_instruction;
+
+ return (ioctl(ctx->fd, VM_INJECT_EXCEPTION, &exc));
+}
+
+int
+vm_apicid2vcpu(struct vmctx *ctx, int apicid)
+{
+ /*
+ * The apic id associated with the 'vcpu' has the same numerical value
+ * as the 'vcpu' itself.
+ */
+ return (apicid);
+}
+
+int
+vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector)
+{
+ struct vm_lapic_irq vmirq;
+
+ bzero(&vmirq, sizeof(vmirq));
+ vmirq.cpuid = vcpu;
+ vmirq.vector = vector;
+
+ return (ioctl(ctx->fd, VM_LAPIC_IRQ, &vmirq));
+}
+
+int
+vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector)
+{
+ struct vm_lapic_irq vmirq;
+
+ bzero(&vmirq, sizeof(vmirq));
+ vmirq.cpuid = vcpu;
+ vmirq.vector = vector;
+
+ return (ioctl(ctx->fd, VM_LAPIC_LOCAL_IRQ, &vmirq));
+}
+
+int
+vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg)
+{
+ struct vm_lapic_msi vmmsi;
+
+ bzero(&vmmsi, sizeof(vmmsi));
+ vmmsi.addr = addr;
+ vmmsi.msg = msg;
+
+ return (ioctl(ctx->fd, VM_LAPIC_MSI, &vmmsi));
+}
+
+int
+vm_ioapic_assert_irq(struct vmctx *ctx, int irq)
+{
+ struct vm_ioapic_irq ioapic_irq;
+
+ bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq));
+ ioapic_irq.irq = irq;
+
+ return (ioctl(ctx->fd, VM_IOAPIC_ASSERT_IRQ, &ioapic_irq));
+}
+
+int
+vm_ioapic_deassert_irq(struct vmctx *ctx, int irq)
+{
+ struct vm_ioapic_irq ioapic_irq;
+
+ bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq));
+ ioapic_irq.irq = irq;
+
+ return (ioctl(ctx->fd, VM_IOAPIC_DEASSERT_IRQ, &ioapic_irq));
+}
+
+int
+vm_ioapic_pulse_irq(struct vmctx *ctx, int irq)
+{
+ struct vm_ioapic_irq ioapic_irq;
+
+ bzero(&ioapic_irq, sizeof(struct vm_ioapic_irq));
+ ioapic_irq.irq = irq;
+
+ return (ioctl(ctx->fd, VM_IOAPIC_PULSE_IRQ, &ioapic_irq));
+}
+
+int
+vm_ioapic_pincount(struct vmctx *ctx, int *pincount)
+{
+
+ return (ioctl(ctx->fd, VM_IOAPIC_PINCOUNT, pincount));
+}
+
+int
+vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
+{
+ struct vm_isa_irq isa_irq;
+
+ bzero(&isa_irq, sizeof(struct vm_isa_irq));
+ isa_irq.atpic_irq = atpic_irq;
+ isa_irq.ioapic_irq = ioapic_irq;
+
+ return (ioctl(ctx->fd, VM_ISA_ASSERT_IRQ, &isa_irq));
+}
+
+int
+vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
+{
+ struct vm_isa_irq isa_irq;
+
+ bzero(&isa_irq, sizeof(struct vm_isa_irq));
+ isa_irq.atpic_irq = atpic_irq;
+ isa_irq.ioapic_irq = ioapic_irq;
+
+ return (ioctl(ctx->fd, VM_ISA_DEASSERT_IRQ, &isa_irq));
+}
+
+int
+vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq)
+{
+ struct vm_isa_irq isa_irq;
+
+ bzero(&isa_irq, sizeof(struct vm_isa_irq));
+ isa_irq.atpic_irq = atpic_irq;
+ isa_irq.ioapic_irq = ioapic_irq;
+
+ return (ioctl(ctx->fd, VM_ISA_PULSE_IRQ, &isa_irq));
+}
+
+int
+vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq,
+ enum vm_intr_trigger trigger)
+{
+ struct vm_isa_irq_trigger isa_irq_trigger;
+
+ bzero(&isa_irq_trigger, sizeof(struct vm_isa_irq_trigger));
+ isa_irq_trigger.atpic_irq = atpic_irq;
+ isa_irq_trigger.trigger = trigger;
+
+ return (ioctl(ctx->fd, VM_ISA_SET_IRQ_TRIGGER, &isa_irq_trigger));
+}
+
+int
+vm_inject_nmi(struct vmctx *ctx, int vcpu)
+{
+ struct vm_nmi vmnmi;
+
+ bzero(&vmnmi, sizeof(vmnmi));
+ vmnmi.cpuid = vcpu;
+
+ return (ioctl(ctx->fd, VM_INJECT_NMI, &vmnmi));
+}
+
+static struct {
+ const char *name;
+ int type;
+} capstrmap[] = {
+ { "hlt_exit", VM_CAP_HALT_EXIT },
+ { "mtrap_exit", VM_CAP_MTRAP_EXIT },
+ { "pause_exit", VM_CAP_PAUSE_EXIT },
+ { "unrestricted_guest", VM_CAP_UNRESTRICTED_GUEST },
+ { "enable_invpcid", VM_CAP_ENABLE_INVPCID },
+ { 0 }
+};
+
+int
+vm_capability_name2type(const char *capname)
+{
+ int i;
+
+ for (i = 0; capstrmap[i].name != NULL && capname != NULL; i++) {
+ if (strcmp(capstrmap[i].name, capname) == 0)
+ return (capstrmap[i].type);
+ }
+
+ return (-1);
+}
+
+const char *
+vm_capability_type2name(int type)
+{
+ int i;
+
+ for (i = 0; capstrmap[i].name != NULL; i++) {
+ if (capstrmap[i].type == type)
+ return (capstrmap[i].name);
+ }
+
+ return (NULL);
+}
+
+int
+vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
+ int *retval)
+{
+ int error;
+ struct vm_capability vmcap;
+
+ bzero(&vmcap, sizeof(vmcap));
+ vmcap.cpuid = vcpu;
+ vmcap.captype = cap;
+
+ error = ioctl(ctx->fd, VM_GET_CAPABILITY, &vmcap);
+ *retval = vmcap.capval;
+ return (error);
+}
+
+int
+vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap, int val)
+{
+ struct vm_capability vmcap;
+
+ bzero(&vmcap, sizeof(vmcap));
+ vmcap.cpuid = vcpu;
+ vmcap.captype = cap;
+ vmcap.capval = val;
+
+ return (ioctl(ctx->fd, VM_SET_CAPABILITY, &vmcap));
+}
+
+int
+vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
+{
+ struct vm_pptdev pptdev;
+
+ bzero(&pptdev, sizeof(pptdev));
+ pptdev.bus = bus;
+ pptdev.slot = slot;
+ pptdev.func = func;
+
+ return (ioctl(ctx->fd, VM_BIND_PPTDEV, &pptdev));
+}
+
+int
+vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func)
+{
+ struct vm_pptdev pptdev;
+
+ bzero(&pptdev, sizeof(pptdev));
+ pptdev.bus = bus;
+ pptdev.slot = slot;
+ pptdev.func = func;
+
+ return (ioctl(ctx->fd, VM_UNBIND_PPTDEV, &pptdev));
+}
+
+int
+vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ struct vm_pptdev_mmio pptmmio;
+
+ bzero(&pptmmio, sizeof(pptmmio));
+ pptmmio.bus = bus;
+ pptmmio.slot = slot;
+ pptmmio.func = func;
+ pptmmio.gpa = gpa;
+ pptmmio.len = len;
+ pptmmio.hpa = hpa;
+
+ return (ioctl(ctx->fd, VM_MAP_PPTDEV_MMIO, &pptmmio));
+}
+
+int
+vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+ uint64_t addr, uint64_t msg, int numvec)
+{
+ struct vm_pptdev_msi pptmsi;
+
+ bzero(&pptmsi, sizeof(pptmsi));
+ pptmsi.vcpu = vcpu;
+ pptmsi.bus = bus;
+ pptmsi.slot = slot;
+ pptmsi.func = func;
+ pptmsi.msg = msg;
+ pptmsi.addr = addr;
+ pptmsi.numvec = numvec;
+
+ return (ioctl(ctx->fd, VM_PPTDEV_MSI, &pptmsi));
+}
+
+int
+vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot, int func,
+ int idx, uint64_t addr, uint64_t msg, uint32_t vector_control)
+{
+ struct vm_pptdev_msix pptmsix;
+
+ bzero(&pptmsix, sizeof(pptmsix));
+ pptmsix.vcpu = vcpu;
+ pptmsix.bus = bus;
+ pptmsix.slot = slot;
+ pptmsix.func = func;
+ pptmsix.idx = idx;
+ pptmsix.msg = msg;
+ pptmsix.addr = addr;
+ pptmsix.vector_control = vector_control;
+
+ return ioctl(ctx->fd, VM_PPTDEV_MSIX, &pptmsix);
+}
+
+#ifdef __FreeBSD__
+uint64_t *
+vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
+ int *ret_entries)
+{
+ int error;
+
+ static struct vm_stats vmstats;
+
+ vmstats.cpuid = vcpu;
+
+ error = ioctl(ctx->fd, VM_STATS, &vmstats);
+ if (error == 0) {
+ if (ret_entries)
+ *ret_entries = vmstats.num_entries;
+ if (ret_tv)
+ *ret_tv = vmstats.tv;
+ return (vmstats.statbuf);
+ } else
+ return (NULL);
+}
+
+const char *
+vm_get_stat_desc(struct vmctx *ctx, int index)
+{
+ static struct vm_stat_desc statdesc;
+
+ statdesc.index = index;
+ if (ioctl(ctx->fd, VM_STAT_DESC, &statdesc) == 0)
+ return (statdesc.desc);
+ else
+ return (NULL);
+}
+#endif
+
+int
+vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *state)
+{
+ int error;
+ struct vm_x2apic x2apic;
+
+ bzero(&x2apic, sizeof(x2apic));
+ x2apic.cpuid = vcpu;
+
+ error = ioctl(ctx->fd, VM_GET_X2APIC_STATE, &x2apic);
+ *state = x2apic.state;
+ return (error);
+}
+
+int
+vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state state)
+{
+ int error;
+ struct vm_x2apic x2apic;
+
+ bzero(&x2apic, sizeof(x2apic));
+ x2apic.cpuid = vcpu;
+ x2apic.state = state;
+
+ error = ioctl(ctx->fd, VM_SET_X2APIC_STATE, &x2apic);
+
+ return (error);
+}
+
+/*
+ * From Intel Vol 3a:
+ * Table 9-1. IA-32 Processor States Following Power-up, Reset or INIT
+ */
+int
+vcpu_reset(struct vmctx *vmctx, int vcpu)
+{
+ int error;
+ uint64_t rflags, rip, cr0, cr4, zero, desc_base, rdx;
+ uint32_t desc_access, desc_limit;
+ uint16_t sel;
+
+ zero = 0;
+
+ rflags = 0x2;
+ error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RFLAGS, rflags);
+ if (error)
+ goto done;
+
+ rip = 0xfff0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RIP, rip)) != 0)
+ goto done;
+
+ cr0 = CR0_NE;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR0, cr0)) != 0)
+ goto done;
+
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR3, zero)) != 0)
+ goto done;
+
+ cr4 = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CR4, cr4)) != 0)
+ goto done;
+
+ /*
+ * CS: present, r/w, accessed, 16-bit, byte granularity, usable
+ */
+ desc_base = 0xffff0000;
+ desc_limit = 0xffff;
+ desc_access = 0x0093;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_CS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0xf000;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_CS, sel)) != 0)
+ goto done;
+
+ /*
+ * SS,DS,ES,FS,GS: present, r/w, accessed, 16-bit, byte granularity
+ */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0x0093;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_SS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_DS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_ES,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_FS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GS,
+ desc_base, desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_SS, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_DS, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_ES, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_FS, sel)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_GS, sel)) != 0)
+ goto done;
+
+ /* General purpose registers */
+ rdx = 0xf00;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RAX, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBX, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RCX, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDX, rdx)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSI, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RDI, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RBP, zero)) != 0)
+ goto done;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_RSP, zero)) != 0)
+ goto done;
+
+ /* GDTR, IDTR */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_GDTR,
+ desc_base, desc_limit, desc_access);
+ if (error != 0)
+ goto done;
+
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_IDTR,
+ desc_base, desc_limit, desc_access);
+ if (error != 0)
+ goto done;
+
+ /* TR */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0x0000008b;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_TR, 0, 0, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_TR, sel)) != 0)
+ goto done;
+
+ /* LDTR */
+ desc_base = 0;
+ desc_limit = 0xffff;
+ desc_access = 0x00000082;
+ error = vm_set_desc(vmctx, vcpu, VM_REG_GUEST_LDTR, desc_base,
+ desc_limit, desc_access);
+ if (error)
+ goto done;
+
+ sel = 0;
+ if ((error = vm_set_register(vmctx, vcpu, VM_REG_GUEST_LDTR, 0)) != 0)
+ goto done;
+
+ /* XXX cr2, debug registers */
+
+ error = 0;
+done:
+ return (error);
+}
+
+int
+vm_get_gpa_pmap(struct vmctx *ctx, uint64_t gpa, uint64_t *pte, int *num)
+{
+ int error, i;
+ struct vm_gpa_pte gpapte;
+
+ bzero(&gpapte, sizeof(gpapte));
+ gpapte.gpa = gpa;
+
+ error = ioctl(ctx->fd, VM_GET_GPA_PMAP, &gpapte);
+
+ if (error == 0) {
+ *num = gpapte.ptenum;
+ for (i = 0; i < gpapte.ptenum; i++)
+ pte[i] = gpapte.pte[i];
+ }
+
+ return (error);
+}
+
+int
+vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities)
+{
+ int error;
+ struct vm_hpet_cap cap;
+
+ bzero(&cap, sizeof(struct vm_hpet_cap));
+ error = ioctl(ctx->fd, VM_GET_HPET_CAPABILITIES, &cap);
+ if (capabilities != NULL)
+ *capabilities = cap.capabilities;
+ return (error);
+}
+
+static int
+gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ uint64_t gla, int prot, int *fault, uint64_t *gpa)
+{
+ struct vm_gla2gpa gg;
+ int error;
+
+ bzero(&gg, sizeof(struct vm_gla2gpa));
+ gg.vcpuid = vcpu;
+ gg.prot = prot;
+ gg.gla = gla;
+ gg.paging = *paging;
+
+ error = ioctl(ctx->fd, VM_GLA2GPA, &gg);
+ if (error == 0) {
+ *fault = gg.fault;
+ *gpa = gg.gpa;
+ }
+ return (error);
+}
+
+int
+vm_gla2gpa(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ uint64_t gla, int prot, uint64_t *gpa)
+{
+ int error, fault;
+
+ error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, gpa);
+ if (fault)
+ error = fault;
+ return (error);
+}
+
+#ifndef min
+#define min(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+int
+vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *paging,
+ uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt)
+{
+ void *va;
+ uint64_t gpa;
+ int error, fault, i, n, off;
+
+ for (i = 0; i < iovcnt; i++) {
+ iov[i].iov_base = 0;
+ iov[i].iov_len = 0;
+ }
+
+ while (len) {
+ assert(iovcnt > 0);
+ error = gla2gpa(ctx, vcpu, paging, gla, prot, &fault, &gpa);
+ if (error)
+ return (-1);
+ if (fault)
+ return (1);
+
+ off = gpa & PAGE_MASK;
+ n = min(len, PAGE_SIZE - off);
+
+ va = vm_map_gpa(ctx, gpa, n);
+ if (va == NULL)
+ return (-1);
+
+ iov->iov_base = va;
+ iov->iov_len = n;
+ iov++;
+ iovcnt--;
+
+ gla += n;
+ len -= n;
+ }
+ return (0);
+}
+
+void
+vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov, int iovcnt)
+{
+
+ return;
+}
+
+void
+vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *iov, void *vp, size_t len)
+{
+ const char *src;
+ char *dst;
+ size_t n;
+
+ dst = vp;
+ while (len) {
+ assert(iov->iov_len);
+ n = min(len, iov->iov_len);
+ src = iov->iov_base;
+ bcopy(src, dst, n);
+
+ iov++;
+ dst += n;
+ len -= n;
+ }
+}
+
+void
+vm_copyout(struct vmctx *ctx, int vcpu, const void *vp, struct iovec *iov,
+ size_t len)
+{
+ const char *src;
+ char *dst;
+ size_t n;
+
+ src = vp;
+ while (len) {
+ assert(iov->iov_len);
+ n = min(len, iov->iov_len);
+ dst = iov->iov_base;
+ bcopy(src, dst, n);
+
+ iov++;
+ src += n;
+ len -= n;
+ }
+}
+
+int
+vm_activate_cpu(struct vmctx *ctx, int vcpu)
+{
+ struct vm_activate_cpu ac;
+ int error;
+
+ bzero(&ac, sizeof(struct vm_activate_cpu));
+ ac.vcpuid = vcpu;
+ error = ioctl(ctx->fd, VM_ACTIVATE_CPU, &ac);
+ return (error);
+}
+
+int
+vm_restart_instruction(void *arg, int vcpu)
+{
+ struct vmctx *ctx = arg;
+
+ return (ioctl(ctx->fd, VM_RESTART_INSTRUCTION, &vcpu));
+}
diff --git a/usr/src/lib/libvmmapi/common/vmmapi.h b/usr/src/lib/libvmmapi/common/vmmapi.h
new file mode 100644
index 0000000000..d7eb67aa58
--- /dev/null
+++ b/usr/src/lib/libvmmapi/common/vmmapi.h
@@ -0,0 +1,159 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/lib/libvmmapi/vmmapi.h 280929 2015-04-01 00:15:31Z tychon $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#ifndef _VMMAPI_H_
+#define _VMMAPI_H_
+
+#include <sys/param.h>
+
+struct iovec;
+struct vmctx;
+enum x2apic_state;
+
+/*
+ * Different styles of mapping the memory assigned to a VM into the address
+ * space of the controlling process.
+ */
+enum vm_mmap_style {
+ VM_MMAP_NONE, /* no mapping */
+ VM_MMAP_ALL, /* fully and statically mapped */
+ VM_MMAP_SPARSE, /* mappings created on-demand */
+};
+
+int vm_create(const char *name);
+struct vmctx *vm_open(const char *name);
+int vm_destroy(struct vmctx *ctx);
+int vm_parse_memsize(const char *optarg, size_t *memsize);
+int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
+ int *wired);
+int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
+int vm_setup_rom(struct vmctx *ctx, vm_paddr_t gpa, size_t len);
+void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
+int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, int prot, uint64_t *gpa);
+uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
+void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
+size_t vm_get_lowmem_size(struct vmctx *ctx);
+size_t vm_get_highmem_size(struct vmctx *ctx);
+int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t base, uint32_t limit, uint32_t access);
+int vm_get_desc(struct vmctx *ctx, int vcpu, int reg,
+ uint64_t *base, uint32_t *limit, uint32_t *access);
+int vm_get_seg_desc(struct vmctx *ctx, int vcpu, int reg,
+ struct seg_desc *seg_desc);
+int vm_set_register(struct vmctx *ctx, int vcpu, int reg, uint64_t val);
+int vm_get_register(struct vmctx *ctx, int vcpu, int reg, uint64_t *retval);
+int vm_run(struct vmctx *ctx, int vcpu, struct vm_exit *ret_vmexit);
+int vm_apicid2vcpu(struct vmctx *ctx, int apicid);
+int vm_inject_exception(struct vmctx *ctx, int vcpu, int vector,
+ int errcode_valid, uint32_t errcode, int restart_instruction);
+int vm_lapic_irq(struct vmctx *ctx, int vcpu, int vector);
+int vm_lapic_local_irq(struct vmctx *ctx, int vcpu, int vector);
+int vm_lapic_msi(struct vmctx *ctx, uint64_t addr, uint64_t msg);
+int vm_ioapic_assert_irq(struct vmctx *ctx, int irq);
+int vm_ioapic_deassert_irq(struct vmctx *ctx, int irq);
+int vm_ioapic_pulse_irq(struct vmctx *ctx, int irq);
+int vm_ioapic_pincount(struct vmctx *ctx, int *pincount);
+int vm_isa_assert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
+int vm_isa_deassert_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
+int vm_isa_pulse_irq(struct vmctx *ctx, int atpic_irq, int ioapic_irq);
+int vm_isa_set_irq_trigger(struct vmctx *ctx, int atpic_irq,
+ enum vm_intr_trigger trigger);
+int vm_inject_nmi(struct vmctx *ctx, int vcpu);
+int vm_capability_name2type(const char *capname);
+const char *vm_capability_type2name(int type);
+int vm_get_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
+ int *retval);
+int vm_set_capability(struct vmctx *ctx, int vcpu, enum vm_cap_type cap,
+ int val);
+int vm_assign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
+int vm_unassign_pptdev(struct vmctx *ctx, int bus, int slot, int func);
+int vm_map_pptdev_mmio(struct vmctx *ctx, int bus, int slot, int func,
+ vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+int vm_setup_pptdev_msi(struct vmctx *ctx, int vcpu, int bus, int slot,
+ int func, uint64_t addr, uint64_t msg, int numvec);
+int vm_setup_pptdev_msix(struct vmctx *ctx, int vcpu, int bus, int slot,
+ int func, int idx, uint64_t addr, uint64_t msg,
+ uint32_t vector_control);
+
+/*
+ * Return a pointer to the statistics buffer. Note that this is not MT-safe.
+ */
+uint64_t *vm_get_stats(struct vmctx *ctx, int vcpu, struct timeval *ret_tv,
+ int *ret_entries);
+const char *vm_get_stat_desc(struct vmctx *ctx, int index);
+
+int vm_get_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state *s);
+int vm_set_x2apic_state(struct vmctx *ctx, int vcpu, enum x2apic_state s);
+
+int vm_get_hpet_capabilities(struct vmctx *ctx, uint32_t *capabilities);
+
+/*
+ * Translate the GLA range [gla,gla+len) into GPA segments in 'iov'.
+ * The 'iovcnt' should be big enough to accomodate all GPA segments.
+ * Returns 0 on success, 1 on a guest fault condition and -1 otherwise.
+ */
+int vm_copy_setup(struct vmctx *ctx, int vcpu, struct vm_guest_paging *pg,
+ uint64_t gla, size_t len, int prot, struct iovec *iov, int iovcnt);
+void vm_copyin(struct vmctx *ctx, int vcpu, struct iovec *guest_iov,
+ void *host_dst, size_t len);
+void vm_copyout(struct vmctx *ctx, int vcpu, const void *host_src,
+ struct iovec *guest_iov, size_t len);
+void vm_copy_teardown(struct vmctx *ctx, int vcpu, struct iovec *iov,
+ int iovcnt);
+
+/* Reset vcpu register state */
+int vcpu_reset(struct vmctx *ctx, int vcpu);
+
+int vm_activate_cpu(struct vmctx *ctx, int vcpu);
+
+#ifdef __FreeBSD__
+/*
+ * FreeBSD specific APIs
+ */
+int vm_setup_freebsd_registers(struct vmctx *ctx, int vcpu,
+ uint64_t rip, uint64_t cr3, uint64_t gdtbase,
+ uint64_t rsp);
+int vm_setup_freebsd_registers_i386(struct vmctx *vmctx, int vcpu,
+ uint32_t eip, uint32_t gdtbase,
+ uint32_t esp);
+void vm_setup_freebsd_gdt(uint64_t *gdtr);
+#endif
+#endif /* _VMMAPI_H_ */
diff --git a/usr/src/tools/scripts/gensetdefs.pl b/usr/src/tools/scripts/gensetdefs.pl
new file mode 100644
index 0000000000..8ca5782feb
--- /dev/null
+++ b/usr/src/tools/scripts/gensetdefs.pl
@@ -0,0 +1,31 @@
+#!/usr/bin/perl -w
+#
+# COPYRIGHT 2013 Pluribus Networks Inc.
+#
+# All rights reserved. This copyright notice is Copyright Management
+# Information under 17 USC 1202 and is included to protect this work and
+# deter copyright infringement. Removal or alteration of this Copyright
+# Management Information without the express written permission from
+# Pluribus Networks Inc is prohibited, and any such unauthorized removal
+# or alteration will be a violation of federal law.
+
+use strict;
+
+my @Sections = split(/\n/, `elfedit -r -e \'shdr:sh_name -osimple\' $ARGV[0] 2>&1`);
+
+foreach my $Section (@Sections) {
+ if ($Section =~ "^set_") {
+ print "\tfixing $Section\n";
+
+ chomp(my $SectionAddr = `elfedit -r -e \'shdr:sh_addr -onum $Section\' $ARGV[0] 2>&1`);
+ chomp(my $SectionSize = `elfedit -r -e \'shdr:sh_size -onum $Section\' $ARGV[0] 2>&1`);
+ my $SectionEnd = hex($SectionAddr) + hex($SectionSize);
+
+ `elfedit -e \'sym:st_bind __start_$Section global\' $ARGV[0] 2>&1`;
+ `elfedit -e \'sym:st_value __start_$Section $SectionAddr\' $ARGV[0] 2>&1`;
+ `elfedit -e \'sym:st_shndx __start_$Section $Section\' $ARGV[0] 2>&1`;
+ `elfedit -e \'sym:st_bind __stop_$Section global\' $ARGV[0] 2>&1`;
+ `elfedit -e \'sym:st_value __stop_$Section $SectionEnd\' $ARGV[0] 2>&1`;
+ `elfedit -e \'sym:st_shndx __stop_$Section $Section\' $ARGV[0] 2>&1`;
+ }
+}
diff --git a/usr/src/uts/i86pc/io/viona/viona.c b/usr/src/uts/i86pc/io/viona/viona.c
new file mode 100644
index 0000000000..40bdd80a6e
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona.c
@@ -0,0 +1,1404 @@
+/*
+ * Copyright (c) 2013 Chris Torek <torek @ torek net>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#include <sys/conf.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/ddi.h>
+#include <sys/sunddi.h>
+#include <sys/sunndi.h>
+#include <sys/sysmacros.h>
+#include <sys/strsubr.h>
+#include <sys/strsun.h>
+#include <vm/seg_kmem.h>
+
+#include <sys/dls.h>
+#include <sys/mac_client.h>
+
+#include <sys/viona_io.h>
+
+#define MB (1024UL * 1024)
+#define GB (1024UL * MB)
+
+/*
+ * Min. octets in an ethernet frame minus FCS
+ */
+#define MIN_BUF_SIZE 60
+
+#define VIONA_NAME "Virtio Network Accelerator"
+
+#define VIONA_CTL_MINOR 0
+#define VIONA_CTL_NODE_NAME "ctl"
+
+#define VIONA_CLI_NAME "viona"
+
+#define VTNET_MAXSEGS 32
+
+#define VRING_ALIGN 4096
+
+#define VRING_DESC_F_NEXT (1 << 0)
+#define VRING_DESC_F_WRITE (1 << 1)
+#define VRING_DESC_F_INDIRECT (1 << 2)
+
+#define VRING_AVAIL_F_NO_INTERRUPT 1
+
+#define VRING_USED_F_NO_NOTIFY 1
+
+#define BCM_NIC_DRIVER "bnxe"
+/*
+ * Host capabilities
+ */
+#define VIRTIO_NET_F_MAC (1 << 5) /* host supplies MAC */
+#define VIRTIO_NET_F_MRG_RXBUF (1 << 15) /* host can merge RX buffers */
+#define VIRTIO_NET_F_STATUS (1 << 16) /* config status field available */
+
+#define VIONA_S_HOSTCAPS \
+ (VIRTIO_NET_F_MAC | VIRTIO_NET_F_MRG_RXBUF | \
+ VIRTIO_NET_F_STATUS)
+
+#pragma pack(1)
+struct virtio_desc {
+ uint64_t vd_addr;
+ uint32_t vd_len;
+ uint16_t vd_flags;
+ uint16_t vd_next;
+};
+#pragma pack()
+
+#pragma pack(1)
+struct virtio_used {
+ uint32_t vu_idx;
+ uint32_t vu_tlen;
+};
+#pragma pack()
+
+#pragma pack(1)
+struct virtio_net_mrgrxhdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+ uint16_t vrh_bufs;
+};
+struct virtio_net_hdr {
+ uint8_t vrh_flags;
+ uint8_t vrh_gso_type;
+ uint16_t vrh_hdr_len;
+ uint16_t vrh_gso_size;
+ uint16_t vrh_csum_start;
+ uint16_t vrh_csum_offset;
+};
+#pragma pack()
+
+typedef struct viona_vring_hqueue {
+ /* Internal state */
+ uint16_t hq_size;
+ kmutex_t hq_a_mutex;
+ kmutex_t hq_u_mutex;
+ uint16_t hq_cur_aidx; /* trails behind 'avail_idx' */
+
+ /* Host-context pointers to the queue */
+ caddr_t hq_baseaddr;
+ uint16_t *hq_avail_flags;
+ uint16_t *hq_avail_idx; /* monotonically increasing */
+ uint16_t *hq_avail_ring;
+
+ uint16_t *hq_used_flags;
+ uint16_t *hq_used_idx; /* monotonically increasing */
+ struct virtio_used *hq_used_ring;
+} viona_vring_hqueue_t;
+
+
+typedef struct viona_link {
+ datalink_id_t l_linkid;
+
+ struct vm *l_vm;
+ size_t l_vm_lomemsize;
+ caddr_t l_vm_lomemaddr;
+ size_t l_vm_himemsize;
+ caddr_t l_vm_himemaddr;
+
+ mac_handle_t l_mh;
+ mac_client_handle_t l_mch;
+
+ kmem_cache_t *l_desb_kmc;
+
+ pollhead_t l_pollhead;
+
+ viona_vring_hqueue_t l_rx_vring;
+ uint_t l_rx_intr;
+
+ viona_vring_hqueue_t l_tx_vring;
+ kcondvar_t l_tx_cv;
+ uint_t l_tx_intr;
+ kmutex_t l_tx_mutex;
+ int l_tx_outstanding;
+ uint32_t l_features;
+} viona_link_t;
+
+typedef struct {
+ frtn_t d_frtn;
+ viona_link_t *d_link;
+ uint_t d_ref;
+ uint16_t d_cookie;
+ int d_len;
+} viona_desb_t;
+
+typedef struct viona_soft_state {
+ viona_link_t *ss_link;
+} viona_soft_state_t;
+
+typedef struct used_elem {
+ uint16_t id;
+ uint32_t len;
+} used_elem_t;
+
+static void *viona_state;
+static dev_info_t *viona_dip;
+static id_space_t *viona_minor_ids;
+/*
+ * copy tx mbufs from virtio ring to avoid necessitating a wait
+ * for packet transmission to free resources.
+ */
+static boolean_t copy_tx_mblks = B_TRUE;
+
+extern struct vm *vm_lookup_by_name(char *name);
+extern uint64_t vm_gpa2hpa(struct vm *vm, uint64_t gpa, size_t len);
+
+static int viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
+static int viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
+static int viona_open(dev_t *devp, int flag, int otype, cred_t *credp);
+static int viona_close(dev_t dev, int flag, int otype, cred_t *credp);
+static int viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
+ cred_t *credp, int *rval);
+static int viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp);
+
+static int viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create);
+static int viona_ioc_delete(viona_soft_state_t *ss);
+
+static int viona_vm_map(viona_link_t *link);
+static caddr_t viona_gpa2kva(viona_link_t *link, uint64_t gpa);
+static void viona_vm_unmap(viona_link_t *link);
+
+static int viona_ioc_rx_ring_init(viona_link_t *link,
+ vioc_ring_init_t *u_ri);
+static int viona_ioc_tx_ring_init(viona_link_t *link,
+ vioc_ring_init_t *u_ri);
+static int viona_ioc_rx_ring_reset(viona_link_t *link);
+static int viona_ioc_tx_ring_reset(viona_link_t *link);
+static void viona_ioc_rx_ring_kick(viona_link_t *link);
+static void viona_ioc_tx_ring_kick(viona_link_t *link);
+static int viona_ioc_rx_intr_clear(viona_link_t *link);
+static int viona_ioc_tx_intr_clear(viona_link_t *link);
+
+static void viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t loopback);
+static void viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq);
+
+static struct cb_ops viona_cb_ops = {
+ viona_open,
+ viona_close,
+ nodev,
+ nodev,
+ nodev,
+ nodev,
+ nodev,
+ viona_ioctl,
+ nodev,
+ nodev,
+ nodev,
+ viona_chpoll,
+ ddi_prop_op,
+ 0,
+ D_MP | D_NEW | D_HOTPLUG,
+ CB_REV,
+ nodev,
+ nodev
+};
+
+static struct dev_ops viona_ops = {
+ DEVO_REV,
+ 0,
+ nodev,
+ nulldev,
+ nulldev,
+ viona_attach,
+ viona_detach,
+ nodev,
+ &viona_cb_ops,
+ NULL,
+ ddi_power,
+ ddi_quiesce_not_needed
+};
+
+static struct modldrv modldrv = {
+ &mod_driverops,
+ VIONA_NAME,
+ &viona_ops,
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1, &modldrv, NULL
+};
+
+int
+_init(void)
+{
+ int ret;
+
+ ret = ddi_soft_state_init(&viona_state,
+ sizeof (viona_soft_state_t), 0);
+ if (ret == 0) {
+ ret = mod_install(&modlinkage);
+ if (ret != 0) {
+ ddi_soft_state_fini(&viona_state);
+ return (ret);
+ }
+ }
+
+ return (ret);
+}
+
+int
+_fini(void)
+{
+ int ret;
+
+ ret = mod_remove(&modlinkage);
+ if (ret == 0) {
+ ddi_soft_state_fini(&viona_state);
+ }
+
+ return (ret);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
+
+static void
+set_viona_tx_mode()
+{
+ major_t bcm_nic_major;
+ if ((bcm_nic_major = ddi_name_to_major(BCM_NIC_DRIVER))
+ != DDI_MAJOR_T_NONE) {
+ if (ddi_hold_installed_driver(bcm_nic_major) != NULL) {
+ copy_tx_mblks = B_FALSE;
+ ddi_rele_driver(bcm_nic_major);
+ }
+ }
+}
+
+static int
+viona_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ if (cmd != DDI_ATTACH) {
+ return (DDI_FAILURE);
+ }
+
+ viona_minor_ids = id_space_create("viona_minor_id",
+ VIONA_CTL_MINOR + 1, UINT16_MAX);
+
+ if (ddi_create_minor_node(dip, VIONA_CTL_NODE_NAME,
+ S_IFCHR, VIONA_CTL_MINOR, DDI_PSEUDO, 0) != DDI_SUCCESS) {
+ return (DDI_FAILURE);
+ }
+
+ viona_dip = dip;
+
+ set_viona_tx_mode();
+ ddi_report_dev(viona_dip);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+viona_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ if (cmd != DDI_DETACH) {
+ return (DDI_FAILURE);
+ }
+
+ id_space_destroy(viona_minor_ids);
+
+ ddi_remove_minor_node(viona_dip, NULL);
+
+ viona_dip = NULL;
+
+ return (DDI_SUCCESS);
+}
+
+static int
+viona_open(dev_t *devp, int flag, int otype, cred_t *credp)
+{
+ int minor;
+
+ if (otype != OTYP_CHR) {
+ return (EINVAL);
+ }
+
+ if (drv_priv(credp) != 0) {
+ return (EPERM);
+ }
+
+ if (getminor(*devp) != VIONA_CTL_MINOR) {
+ return (ENXIO);
+ }
+
+ minor = id_alloc(viona_minor_ids);
+ if (minor == 0) {
+ /* All minors are busy */
+ return (EBUSY);
+ }
+
+ if (ddi_soft_state_zalloc(viona_state, minor) != DDI_SUCCESS) {
+ id_free(viona_minor_ids, minor);
+ }
+
+ *devp = makedevice(getmajor(*devp), minor);
+
+ return (0);
+}
+
+static int
+viona_close(dev_t dev, int flag, int otype, cred_t *credp)
+{
+ int minor;
+ viona_soft_state_t *ss;
+
+ if (otype != OTYP_CHR) {
+ return (EINVAL);
+ }
+
+ if (drv_priv(credp) != 0) {
+ return (EPERM);
+ }
+
+ minor = getminor(dev);
+
+ ss = ddi_get_soft_state(viona_state, minor);
+ if (ss == NULL) {
+ return (ENXIO);
+ }
+
+ viona_ioc_delete(ss);
+
+ ddi_soft_state_free(viona_state, minor);
+
+ id_free(viona_minor_ids, minor);
+
+ return (0);
+}
+
+static int
+viona_ioctl(dev_t dev, int cmd, intptr_t data, int mode,
+ cred_t *credp, int *rval)
+{
+ viona_soft_state_t *ss;
+ int err = 0;
+
+ ss = ddi_get_soft_state(viona_state, getminor(dev));
+ if (ss == NULL) {
+ return (ENXIO);
+ }
+
+ switch (cmd) {
+ case VNA_IOC_CREATE:
+ err = viona_ioc_create(ss, (vioc_create_t *)data);
+ break;
+ case VNA_IOC_DELETE:
+ err = viona_ioc_delete(ss);
+ break;
+ case VNA_IOC_SET_FEATURES:
+ if (ss->ss_link == NULL) {
+ return (ENOSYS);
+ }
+ ss->ss_link->l_features = *(int *)data & VIONA_S_HOSTCAPS;
+ break;
+ case VNA_IOC_GET_FEATURES:
+ if (ss->ss_link == NULL) {
+ return (ENOSYS);
+ }
+ *(int *)data = VIONA_S_HOSTCAPS;
+ break;
+ case VNA_IOC_RX_RING_INIT:
+ if (ss->ss_link == NULL) {
+ return (ENOSYS);
+ }
+ err = viona_ioc_rx_ring_init(ss->ss_link,
+ (vioc_ring_init_t *)data);
+ break;
+ case VNA_IOC_RX_RING_RESET:
+ if (ss->ss_link == NULL) {
+ return (ENOSYS);
+ }
+ err = viona_ioc_rx_ring_reset(ss->ss_link);
+ break;
+ case VNA_IOC_RX_RING_KICK:
+ if (ss->ss_link == NULL) {
+ return (ENOSYS);
+ }
+ viona_ioc_rx_ring_kick(ss->ss_link);
+ err = 0;
+ break;
+ case VNA_IOC_TX_RING_INIT:
+ if (ss->ss_link == NULL) {
+ return (ENOSYS);
+ }
+ err = viona_ioc_tx_ring_init(ss->ss_link,
+ (vioc_ring_init_t *)data);
+ break;
+ case VNA_IOC_TX_RING_RESET:
+ if (ss->ss_link == NULL) {
+ return (ENOSYS);
+ }
+ err = viona_ioc_tx_ring_reset(ss->ss_link);
+ break;
+ case VNA_IOC_TX_RING_KICK:
+ if (ss->ss_link == NULL) {
+ return (ENOSYS);
+ }
+ viona_ioc_tx_ring_kick(ss->ss_link);
+ err = 0;
+ break;
+ case VNA_IOC_RX_INTR_CLR:
+ if (ss->ss_link == NULL) {
+ return (ENOSYS);
+ }
+ err = viona_ioc_rx_intr_clear(ss->ss_link);
+ break;
+ case VNA_IOC_TX_INTR_CLR:
+ if (ss->ss_link == NULL) {
+ return (ENOSYS);
+ }
+ err = viona_ioc_tx_intr_clear(ss->ss_link);
+ break;
+ default:
+ err = ENOTTY;
+ break;
+ }
+
+ return (err);
+}
+
+static int
+viona_chpoll(dev_t dev, short events, int anyyet, short *reventsp,
+ struct pollhead **phpp)
+{
+ viona_soft_state_t *ss;
+
+ ss = ddi_get_soft_state(viona_state, getminor(dev));
+ if (ss == NULL || ss->ss_link == NULL) {
+ return (ENXIO);
+ }
+
+ *reventsp = 0;
+
+ if (ss->ss_link->l_rx_intr && (events & POLLIN)) {
+ *reventsp |= POLLIN;
+ }
+
+ if (ss->ss_link->l_tx_intr && (events & POLLOUT)) {
+ *reventsp |= POLLOUT;
+ }
+
+ if (*reventsp == 0 && !anyyet) {
+ *phpp = &ss->ss_link->l_pollhead;
+ }
+
+ return (0);
+}
+
+static int
+viona_ioc_create(viona_soft_state_t *ss, vioc_create_t *u_create)
+{
+ vioc_create_t k_create;
+ viona_link_t *link;
+ char cli_name[MAXNAMELEN];
+ int err;
+
+ if (ss->ss_link != NULL) {
+ return (ENOSYS);
+ }
+ if (copyin(u_create, &k_create, sizeof (k_create)) != 0) {
+ return (EFAULT);
+ }
+
+ link = kmem_zalloc(sizeof (viona_link_t), KM_SLEEP);
+
+ link->l_linkid = k_create.c_linkid;
+ link->l_vm = vm_lookup_by_name(k_create.c_vmname);
+ if (link->l_vm == NULL) {
+ err = ENXIO;
+ goto bail;
+ }
+
+ link->l_vm_lomemsize = k_create.c_lomem_size;
+ link->l_vm_himemsize = k_create.c_himem_size;
+ err = viona_vm_map(link);
+ if (err != 0) {
+ goto bail;
+ }
+
+ err = mac_open_by_linkid(link->l_linkid, &link->l_mh);
+ if (err != 0) {
+ cmn_err(CE_WARN, "viona create mac_open_by_linkid"
+ " returned %d\n", err);
+ goto bail;
+ }
+
+ snprintf(cli_name, sizeof (cli_name), "%s-%d",
+ VIONA_CLI_NAME, link->l_linkid);
+ err = mac_client_open(link->l_mh, &link->l_mch, cli_name, 0);
+ if (err != 0) {
+ cmn_err(CE_WARN, "viona create mac_client_open"
+ " returned %d\n", err);
+ goto bail;
+ }
+
+ link->l_features = VIONA_S_HOSTCAPS;
+ link->l_desb_kmc = kmem_cache_create(cli_name,
+ sizeof (viona_desb_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+ mutex_init(&link->l_rx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&link->l_rx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&link->l_rx_vring.hq_a_mutex, NULL, MUTEX_DRIVER, NULL);
+ mutex_init(&link->l_tx_vring.hq_u_mutex, NULL, MUTEX_DRIVER, NULL);
+ if (copy_tx_mblks) {
+ mutex_init(&link->l_tx_mutex, NULL, MUTEX_DRIVER, NULL);
+ cv_init(&link->l_tx_cv, NULL, CV_DRIVER, NULL);
+ }
+ ss->ss_link = link;
+
+ return (0);
+
+bail:
+ if (link->l_mch != NULL) {
+ mac_client_close(link->l_mch, 0);
+ }
+ if (link->l_mh != NULL) {
+ mac_close(link->l_mh);
+ }
+
+ kmem_free(link, sizeof (viona_link_t));
+
+ return (err);
+}
+
+static int
+viona_ioc_delete(viona_soft_state_t *ss)
+{
+ viona_link_t *link;
+
+ link = ss->ss_link;
+ if (link == NULL) {
+ return (ENOSYS);
+ }
+ if (copy_tx_mblks) {
+ mutex_enter(&link->l_tx_mutex);
+ while (link->l_tx_outstanding != 0) {
+ cv_wait(&link->l_tx_cv, &link->l_tx_mutex);
+ }
+ mutex_exit(&link->l_tx_mutex);
+ }
+ if (link->l_mch != NULL) {
+ mac_rx_clear(link->l_mch);
+ mac_client_close(link->l_mch, 0);
+ }
+ if (link->l_mh != NULL) {
+ mac_close(link->l_mh);
+ }
+
+ viona_vm_unmap(link);
+ mutex_destroy(&link->l_tx_vring.hq_a_mutex);
+ mutex_destroy(&link->l_tx_vring.hq_u_mutex);
+ mutex_destroy(&link->l_rx_vring.hq_a_mutex);
+ mutex_destroy(&link->l_rx_vring.hq_u_mutex);
+ if (copy_tx_mblks) {
+ mutex_destroy(&link->l_tx_mutex);
+ cv_destroy(&link->l_tx_cv);
+ }
+
+ kmem_cache_destroy(link->l_desb_kmc);
+
+ kmem_free(link, sizeof (viona_link_t));
+
+ ss->ss_link = NULL;
+
+ return (0);
+}
+
+static caddr_t
+viona_mapin_vm_chunk(viona_link_t *link, uint64_t gpa, size_t len)
+{
+ caddr_t addr;
+ size_t offset;
+ pfn_t pfnum;
+
+ if (len == 0)
+ return (NULL);
+
+ addr = vmem_alloc(heap_arena, len, VM_SLEEP);
+ if (addr == NULL)
+ return (NULL);
+
+ for (offset = 0; offset < len; offset += PAGESIZE) {
+ pfnum = btop(vm_gpa2hpa(link->l_vm, gpa + offset, PAGESIZE));
+ ASSERT(pfnum);
+ hat_devload(kas.a_hat, addr + offset, PAGESIZE, pfnum,
+ PROT_READ | PROT_WRITE, HAT_LOAD_LOCK);
+ }
+
+ return (addr);
+}
+
+/*
+ * Map the guest physical address space into the kernel virtual address space.
+ */
+static int
+viona_vm_map(viona_link_t *link)
+{
+ link->l_vm_lomemaddr = viona_mapin_vm_chunk(link,
+ 0, link->l_vm_lomemsize);
+ if (link->l_vm_lomemaddr == NULL)
+ return (-1);
+ link->l_vm_himemaddr = viona_mapin_vm_chunk(link,
+ 4 * (1024 * 1024 * 1024UL), link->l_vm_himemsize);
+ if (link->l_vm_himemsize && link->l_vm_himemaddr == NULL)
+ return (-1);
+
+ return (0);
+}
+
+/*
+ * Translate a guest physical address into a kernel virtual address.
+ */
+static caddr_t
+viona_gpa2kva(viona_link_t *link, uint64_t gpa)
+{
+ if (gpa < link->l_vm_lomemsize)
+ return (link->l_vm_lomemaddr + gpa);
+
+ gpa -= (4 * GB);
+ if (gpa < link->l_vm_himemsize)
+ return (link->l_vm_himemaddr + gpa);
+
+ return (NULL);
+}
+
+static void
+viona_vm_unmap(viona_link_t *link)
+{
+ if (link->l_vm_lomemaddr) {
+ hat_unload(kas.a_hat, link->l_vm_lomemaddr,
+ link->l_vm_lomemsize, HAT_UNLOAD_UNLOCK);
+ vmem_free(heap_arena, link->l_vm_lomemaddr,
+ link->l_vm_lomemsize);
+ }
+ if (link->l_vm_himemaddr) {
+ hat_unload(kas.a_hat, link->l_vm_himemaddr,
+ link->l_vm_himemsize, HAT_UNLOAD_UNLOCK);
+ vmem_free(heap_arena, link->l_vm_himemaddr,
+ link->l_vm_himemsize);
+ }
+}
+
+static int
+viona_ioc_ring_init_common(viona_link_t *link, viona_vring_hqueue_t *hq,
+ vioc_ring_init_t *u_ri)
+{
+ vioc_ring_init_t k_ri;
+
+ if (copyin(u_ri, &k_ri, sizeof (k_ri)) != 0) {
+ return (EFAULT);
+ }
+
+ hq->hq_size = k_ri.ri_qsize;
+ hq->hq_baseaddr = viona_gpa2kva(link, k_ri.ri_qaddr);
+ if (hq->hq_baseaddr == NULL)
+ return (EINVAL);
+
+ hq->hq_avail_flags = (uint16_t *)(viona_gpa2kva(link,
+ k_ri.ri_qaddr + hq->hq_size * sizeof (struct virtio_desc)));
+ if (hq->hq_avail_flags == NULL)
+ return (EINVAL);
+ hq->hq_avail_idx = hq->hq_avail_flags + 1;
+ hq->hq_avail_ring = hq->hq_avail_flags + 2;
+
+ hq->hq_used_flags = (uint16_t *)(viona_gpa2kva(link,
+ P2ROUNDUP(k_ri.ri_qaddr +
+ hq->hq_size * sizeof (struct virtio_desc) + 2, VRING_ALIGN)));
+ if (hq->hq_used_flags == NULL)
+ return (EINVAL);
+ hq->hq_used_idx = hq->hq_used_flags + 1;
+ hq->hq_used_ring = (struct virtio_used *)(hq->hq_used_flags + 2);
+
+ /*
+ * Initialize queue indexes
+ */
+ hq->hq_cur_aidx = 0;
+
+ return (0);
+}
+
+static int
+viona_ioc_rx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri)
+{
+ viona_vring_hqueue_t *hq;
+ int rval;
+
+ hq = &link->l_rx_vring;
+
+ rval = viona_ioc_ring_init_common(link, hq, u_ri);
+ if (rval != 0) {
+ return (rval);
+ }
+
+ return (0);
+}
+
+static int
+viona_ioc_tx_ring_init(viona_link_t *link, vioc_ring_init_t *u_ri)
+{
+ viona_vring_hqueue_t *hq;
+
+ hq = &link->l_tx_vring;
+
+ return (viona_ioc_ring_init_common(link, hq, u_ri));
+}
+
+static int
+viona_ioc_ring_reset_common(viona_vring_hqueue_t *hq)
+{
+ /*
+ * Reset all soft state
+ */
+ hq->hq_cur_aidx = 0;
+
+ return (0);
+}
+
+static int
+viona_ioc_rx_ring_reset(viona_link_t *link)
+{
+ viona_vring_hqueue_t *hq;
+
+ mac_rx_clear(link->l_mch);
+
+ hq = &link->l_rx_vring;
+
+ return (viona_ioc_ring_reset_common(hq));
+}
+
+static int
+viona_ioc_tx_ring_reset(viona_link_t *link)
+{
+ viona_vring_hqueue_t *hq;
+
+ hq = &link->l_tx_vring;
+
+ return (viona_ioc_ring_reset_common(hq));
+}
+
+static void
+viona_ioc_rx_ring_kick(viona_link_t *link)
+{
+ viona_vring_hqueue_t *hq = &link->l_rx_vring;
+
+ atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY);
+
+ mac_rx_set(link->l_mch, viona_rx, link);
+}
+
+/*
+ * Return the number of available descriptors in the vring taking care
+ * of the 16-bit index wraparound.
+ */
+static inline int
+viona_hq_num_avail(viona_vring_hqueue_t *hq)
+{
+ uint16_t ndesc;
+
+ /*
+ * We're just computing (a-b) in GF(216).
+ *
+ * The only glitch here is that in standard C,
+ * uint16_t promotes to (signed) int when int has
+ * more than 16 bits (pretty much always now), so
+ * we have to force it back to unsigned.
+ */
+ ndesc = (unsigned)*hq->hq_avail_idx - (unsigned)hq->hq_cur_aidx;
+
+ ASSERT(ndesc <= hq->hq_size);
+
+ return (ndesc);
+}
+
+static void
+viona_ioc_tx_ring_kick(viona_link_t *link)
+{
+ viona_vring_hqueue_t *hq = &link->l_tx_vring;
+
+ do {
+ atomic_or_16(hq->hq_used_flags, VRING_USED_F_NO_NOTIFY);
+ while (viona_hq_num_avail(hq)) {
+ viona_tx(link, hq);
+ }
+ if (copy_tx_mblks) {
+ mutex_enter(&link->l_tx_mutex);
+ if (link->l_tx_outstanding != 0) {
+ cv_wait_sig(&link->l_tx_cv, &link->l_tx_mutex);
+ }
+ mutex_exit(&link->l_tx_mutex);
+ }
+ atomic_and_16(hq->hq_used_flags, ~VRING_USED_F_NO_NOTIFY);
+ } while (viona_hq_num_avail(hq));
+}
+
+static int
+viona_ioc_rx_intr_clear(viona_link_t *link)
+{
+ link->l_rx_intr = 0;
+
+ return (0);
+}
+
+static int
+viona_ioc_tx_intr_clear(viona_link_t *link)
+{
+ link->l_tx_intr = 0;
+
+ return (0);
+}
+#define VQ_MAX_DESCRIPTORS 512
+
+static int
+vq_popchain(viona_link_t *link, viona_vring_hqueue_t *hq, struct iovec *iov,
+int n_iov, uint16_t *cookie)
+{
+ int i;
+ int ndesc, nindir;
+ int idx, head, next;
+ struct virtio_desc *vdir, *vindir, *vp;
+
+ idx = hq->hq_cur_aidx;
+ ndesc = (uint16_t)((unsigned)*hq->hq_avail_idx - (unsigned)idx);
+
+ if (ndesc == 0)
+ return (0);
+ if (ndesc > hq->hq_size) {
+ cmn_err(CE_NOTE, "ndesc (%d) out of range\n", ndesc);
+ return (-1);
+ }
+
+ head = hq->hq_avail_ring[idx & (hq->hq_size - 1)];
+ next = head;
+
+ for (i = 0; i < VQ_MAX_DESCRIPTORS; next = vdir->vd_next) {
+ if (next >= hq->hq_size) {
+ cmn_err(CE_NOTE, "descriptor index (%d)"
+ "out of range\n", next);
+ return (-1);
+ }
+
+ vdir = (struct virtio_desc *)(hq->hq_baseaddr +
+ next * sizeof (struct virtio_desc));
+ if ((vdir->vd_flags & VRING_DESC_F_INDIRECT) == 0) {
+ if (i > n_iov)
+ return (-1);
+ iov[i].iov_base = viona_gpa2kva(link, vdir->vd_addr);
+ if (iov[i].iov_base == NULL) {
+ cmn_err(CE_NOTE, "invalid guest physical"
+ " address 0x%"PRIx64"\n", vdir->vd_addr);
+ return (-1);
+ }
+ iov[i++].iov_len = vdir->vd_len;
+ } else {
+ nindir = vdir->vd_len / 16;
+ if ((vdir->vd_len & 0xf) || nindir == 0) {
+ cmn_err(CE_NOTE, "invalid indir len 0x%x\n",
+ vdir->vd_len);
+ return (-1);
+ }
+ vindir = (struct virtio_desc *)
+ viona_gpa2kva(link, vdir->vd_addr);
+ if (vindir == NULL) {
+ cmn_err(CE_NOTE, "invalid guest physical"
+ " address 0x%"PRIx64"\n", vdir->vd_addr);
+ return (-1);
+ }
+ next = 0;
+ for (;;) {
+ vp = &vindir[next];
+ if (vp->vd_flags & VRING_DESC_F_INDIRECT) {
+ cmn_err(CE_NOTE, "indirect desc"
+ " has INDIR flag\n");
+ return (-1);
+ }
+ if (i > n_iov)
+ return (-1);
+ iov[i].iov_base =
+ viona_gpa2kva(link, vp->vd_addr);
+ if (iov[i].iov_base == NULL) {
+ cmn_err(CE_NOTE, "invalid guest"
+ " physical address 0x%"PRIx64"\n",
+ vp->vd_addr);
+ return (-1);
+ }
+ iov[i++].iov_len = vp->vd_len;
+
+ if (i > VQ_MAX_DESCRIPTORS)
+ goto loopy;
+ if ((vp->vd_flags & VRING_DESC_F_NEXT) == 0)
+ break;
+
+ next = vp->vd_next;
+ if (next >= nindir) {
+ cmn_err(CE_NOTE, "invalid next"
+ " %d > %d\n", next, nindir);
+ return (-1);
+ }
+ }
+ }
+ if ((vdir->vd_flags & VRING_DESC_F_NEXT) == 0) {
+ *cookie = head;
+ hq->hq_cur_aidx++;
+ return (i);
+ }
+ }
+
+loopy:
+ cmn_err(CE_NOTE, "%d > descriptor loop count\n", i);
+
+ return (-1);
+}
+
+static void
+vq_pushchain(viona_vring_hqueue_t *hq, uint32_t len, uint16_t cookie)
+{
+ struct virtio_used *vu;
+ int uidx;
+
+ uidx = *hq->hq_used_idx;
+ vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)];
+ vu->vu_idx = cookie;
+ vu->vu_tlen = len;
+ membar_producer();
+ *hq->hq_used_idx = uidx;
+}
+
+static void
+vq_pushchain_mrgrx(viona_vring_hqueue_t *hq, int num_bufs, used_elem_t *elem)
+{
+ struct virtio_used *vu;
+ int uidx;
+ int i;
+
+ uidx = *hq->hq_used_idx;
+ if (num_bufs == 1) {
+ vu = &hq->hq_used_ring[uidx++ & (hq->hq_size - 1)];
+ vu->vu_idx = elem[0].id;
+ vu->vu_tlen = elem[0].len;
+ } else {
+ for (i = 0; i < num_bufs; i++) {
+ vu = &hq->hq_used_ring[(uidx + i) & (hq->hq_size - 1)];
+ vu->vu_idx = elem[i].id;
+ vu->vu_tlen = elem[i].len;
+ }
+ uidx = uidx + num_bufs;
+ }
+ membar_producer();
+ *hq->hq_used_idx = uidx;
+}
+
+/*
+ * Copy bytes from mp to iov.
+ * copied_buf: Total num_bytes copied from mblk to iov array.
+ * buf: pointer to iov_base.
+ * i: index of iov array. Mainly used to identify if we are
+ * dealing with first iov array element.
+ * rxhdr_size: Virtio header size. Two possibilities in case
+ * of MRGRX buf, header has 2 additional bytes.
+ * In case of mrgrx, virtio header should be part of iov[0].
+ * In case of non-mrgrx, virtio header may or may not be part
+ * of iov[0].
+ */
+static int
+copy_in_mblk(mblk_t *mp, int copied_buf, caddr_t buf, struct iovec *iov,
+ int i, int rxhdr_size)
+{
+ int copied_chunk = 0;
+ mblk_t *ml;
+ int total_buf_len = iov->iov_len;
+ /*
+ * iov[0] might have header, adjust
+ * total_buf_len accordingly
+ */
+ if (i == 0) {
+ total_buf_len = iov->iov_len - rxhdr_size;
+ }
+ for (ml = mp; ml != NULL; ml = ml->b_cont) {
+ size_t chunk = MBLKL(ml);
+ /*
+ * If chunk is less than
+ * copied_buf we should move
+ * to correct msgblk
+ */
+ if (copied_buf != 0) {
+ if (copied_buf < chunk) {
+ chunk -= copied_buf;
+ } else {
+ copied_buf -= chunk;
+ continue;
+ }
+ }
+ /*
+ * iov[0] already has virtio header.
+ * and if copied chunk is length of iov_len break
+ */
+ if (copied_chunk == total_buf_len) {
+ break;
+ }
+ /*
+ * Sometimes chunk is total mblk len, sometimes mblk is
+ * divided into multiple chunks.
+ */
+ if (chunk > copied_buf) {
+ if (chunk > copied_chunk) {
+ if ((chunk + copied_chunk) > total_buf_len)
+ chunk = (size_t)total_buf_len
+ - copied_chunk;
+ } else {
+ if (chunk > (total_buf_len - copied_chunk))
+ chunk = (size_t)((total_buf_len
+ - copied_chunk) - chunk);
+ }
+ bcopy(ml->b_rptr + copied_buf, buf, chunk);
+ } else {
+ if (chunk > (total_buf_len - copied_chunk)) {
+ chunk = (size_t)(total_buf_len - copied_chunk);
+ }
+ bcopy(ml->b_rptr + copied_buf, buf, chunk);
+ }
+ buf += chunk;
+ copied_chunk += chunk;
+ }
+ return (copied_chunk);
+}
+
+static void
+viona_rx(void *arg, mac_resource_handle_t mrh, mblk_t *mp,
+ boolean_t loopback)
+{
+ viona_link_t *link = arg;
+ viona_vring_hqueue_t *hq = &link->l_rx_vring;
+ mblk_t *mp0 = mp;
+
+ while (viona_hq_num_avail(hq)) {
+ struct iovec iov[VTNET_MAXSEGS];
+ size_t mblklen;
+ int n, i = 0;
+ uint16_t cookie;
+ struct virtio_net_hdr *vrx;
+ struct virtio_net_mrgrxhdr *vmrgrx;
+ mblk_t *ml;
+ caddr_t buf;
+ int total_len = 0;
+ int copied_buf = 0;
+ int num_bufs = 0;
+ int num_pops = 0;
+ used_elem_t uelem[VTNET_MAXSEGS];
+
+ if (mp == NULL) {
+ break;
+ }
+ mblklen = msgsize(mp);
+ if (mblklen == 0) {
+ break;
+ }
+
+ mutex_enter(&hq->hq_a_mutex);
+ n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie);
+ mutex_exit(&hq->hq_a_mutex);
+ if (n <= 0) {
+ break;
+ }
+ num_pops++;
+ if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) {
+ int total_n = n;
+ int mrgrxhdr_size = sizeof (struct virtio_net_mrgrxhdr);
+ /*
+ * Get a pointer to the rx header, and use the
+ * data immediately following it for the packet buffer.
+ */
+ vmrgrx = (struct virtio_net_mrgrxhdr *)iov[0].iov_base;
+ if (n == 1) {
+ buf = iov[0].iov_base + mrgrxhdr_size;
+ }
+ while (mblklen > copied_buf) {
+ if (total_n == i) {
+ mutex_enter(&hq->hq_a_mutex);
+ n = vq_popchain(link, hq, &iov[i],
+ VTNET_MAXSEGS, &cookie);
+ mutex_exit(&hq->hq_a_mutex);
+ if (n <= 0) {
+ freemsgchain(mp0);
+ return;
+ }
+ num_pops++;
+ total_n += n;
+ }
+ if (total_n > i) {
+ int copied_chunk = 0;
+ if (i != 0) {
+ buf = iov[i].iov_base;
+ }
+ copied_chunk = copy_in_mblk(mp,
+ copied_buf, buf, &iov[i], i,
+ mrgrxhdr_size);
+ copied_buf += copied_chunk;
+ uelem[i].id = cookie;
+ uelem[i].len = copied_chunk;
+ if (i == 0) {
+ uelem[i].len += mrgrxhdr_size;
+ }
+ }
+ num_bufs++;
+ i++;
+ }
+ } else {
+ boolean_t virt_hdr_incl_iov = B_FALSE;
+ int rxhdr_size = sizeof (struct virtio_net_hdr);
+ /* First element is header */
+ vrx = (struct virtio_net_hdr *)iov[0].iov_base;
+ if (n == 1 || iov[0].iov_len > rxhdr_size) {
+ buf = iov[0].iov_base + rxhdr_size;
+ virt_hdr_incl_iov = B_TRUE;
+ total_len += rxhdr_size;
+ if (iov[0].iov_len < rxhdr_size) {
+ // Buff too small to fit pkt. Drop it.
+ freemsgchain(mp0);
+ return;
+ }
+ } else {
+ total_len = iov[0].iov_len;
+ }
+ if (iov[0].iov_len == rxhdr_size)
+ i++;
+ while (mblklen > copied_buf) {
+ if (n > i) {
+ int copied_chunk = 0;
+ if (i != 0) {
+ buf = iov[i].iov_base;
+ }
+ /*
+ * In case of non-mrgrx buf, first
+ * descriptor always has header and
+ * rest of the descriptors have data.
+ * But it is not guaranteed that first
+ * descriptor will only have virtio
+ * header. It might also have data.
+ */
+ if (virt_hdr_incl_iov) {
+ copied_chunk = copy_in_mblk(mp,
+ copied_buf, buf, &iov[i],
+ i, rxhdr_size);
+ } else {
+ copied_chunk = copy_in_mblk(mp,
+ copied_buf, buf, &iov[i],
+ i, 0);
+ }
+ copied_buf += copied_chunk;
+ total_len += copied_chunk;
+ } else {
+ /*
+ * Drop packet as it cant fit
+ * in buf provided by guest.
+ */
+ freemsgchain(mp0);
+ return;
+ }
+ i++;
+ }
+ }
+ /*
+ * The only valid field in the rx packet header is the
+ * number of buffers, which is always 1 without TSO
+ * support.
+ */
+ if (link->l_features & VIRTIO_NET_F_MRG_RXBUF) {
+ memset(vmrgrx, 0, sizeof (struct virtio_net_mrgrxhdr));
+ vmrgrx->vrh_bufs = num_bufs;
+ /*
+ * Make sure iov[0].iov_len >= MIN_BUF_SIZE
+ * otherwise guest will consider it as invalid frame.
+ */
+ if (num_bufs == 1 && uelem[0].len < MIN_BUF_SIZE) {
+ uelem[0].len = MIN_BUF_SIZE;
+ }
+ /*
+ * Release this chain and handle more chains.
+ */
+ mutex_enter(&hq->hq_u_mutex);
+ vq_pushchain_mrgrx(hq, num_pops, uelem);
+ mutex_exit(&hq->hq_u_mutex);
+ } else {
+ memset(vrx, 0, sizeof (struct virtio_net_hdr));
+ if (total_len < MIN_BUF_SIZE) {
+ total_len = MIN_BUF_SIZE;
+ }
+ /*
+ * Release this chain and handle more chains.
+ */
+ mutex_enter(&hq->hq_u_mutex);
+ vq_pushchain(hq, total_len, cookie);
+ mutex_exit(&hq->hq_u_mutex);
+ }
+
+ mp = mp->b_next;
+ }
+
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ if (atomic_cas_uint(&link->l_rx_intr, 0, 1) == 0) {
+ pollwakeup(&link->l_pollhead, POLLIN);
+ }
+ }
+
+ freemsgchain(mp0);
+}
+
+static void
+viona_desb_free(viona_desb_t *dp)
+{
+ viona_link_t *link;
+ viona_vring_hqueue_t *hq;
+ struct virtio_used *vu;
+ int uidx;
+ uint_t ref;
+
+ ref = atomic_dec_uint_nv(&dp->d_ref);
+ if (ref != 0)
+ return;
+
+ link = dp->d_link;
+ hq = &link->l_tx_vring;
+
+ mutex_enter(&hq->hq_u_mutex);
+ vq_pushchain(hq, dp->d_len, dp->d_cookie);
+ mutex_exit(&hq->hq_u_mutex);
+
+ kmem_cache_free(link->l_desb_kmc, dp);
+
+ if ((*hq->hq_avail_flags & VRING_AVAIL_F_NO_INTERRUPT) == 0) {
+ if (atomic_cas_uint(&link->l_tx_intr, 0, 1) == 0) {
+ pollwakeup(&link->l_pollhead, POLLOUT);
+ }
+ }
+ if (copy_tx_mblks) {
+ mutex_enter(&link->l_tx_mutex);
+ if (--link->l_tx_outstanding == 0) {
+ cv_broadcast(&link->l_tx_cv);
+ }
+ mutex_exit(&link->l_tx_mutex);
+ }
+}
+
+static void
+viona_tx(viona_link_t *link, viona_vring_hqueue_t *hq)
+{
+ struct iovec iov[VTNET_MAXSEGS];
+ uint16_t cookie;
+ int i, n;
+ mblk_t *mp_head, *mp_tail, *mp;
+ viona_desb_t *dp;
+ mac_client_handle_t link_mch = link->l_mch;
+
+ mp_head = mp_tail = NULL;
+
+ mutex_enter(&hq->hq_a_mutex);
+ n = vq_popchain(link, hq, iov, VTNET_MAXSEGS, &cookie);
+ mutex_exit(&hq->hq_a_mutex);
+ ASSERT(n != 0);
+
+ dp = kmem_cache_alloc(link->l_desb_kmc, KM_SLEEP);
+ dp->d_frtn.free_func = viona_desb_free;
+ dp->d_frtn.free_arg = (void *)dp;
+ dp->d_link = link;
+ dp->d_cookie = cookie;
+
+ dp->d_ref = 0;
+ dp->d_len = iov[0].iov_len;
+
+ for (i = 1; i < n; i++) {
+ dp->d_ref++;
+ dp->d_len += iov[i].iov_len;
+ if (copy_tx_mblks) {
+ mp = desballoc((uchar_t *)iov[i].iov_base,
+ iov[i].iov_len, BPRI_MED, &dp->d_frtn);
+ ASSERT(mp);
+ } else {
+ mp = allocb(iov[i].iov_len, BPRI_MED);
+ ASSERT(mp);
+ bcopy((uchar_t *)iov[i].iov_base, mp->b_wptr,
+ iov[i].iov_len);
+ }
+ mp->b_wptr += iov[i].iov_len;
+ if (mp_head == NULL) {
+ ASSERT(mp_tail == NULL);
+ mp_head = mp;
+ } else {
+ ASSERT(mp_tail != NULL);
+ mp_tail->b_cont = mp;
+ }
+ mp_tail = mp;
+ }
+ if (copy_tx_mblks == B_FALSE) {
+ viona_desb_free(dp);
+ }
+ if (copy_tx_mblks) {
+ mutex_enter(&link->l_tx_mutex);
+ link->l_tx_outstanding++;
+ mutex_exit(&link->l_tx_mutex);
+ }
+ mac_tx(link_mch, mp_head, 0, MAC_DROP_ON_NO_DESC, NULL);
+}
diff --git a/usr/src/uts/i86pc/io/viona/viona.conf b/usr/src/uts/i86pc/io/viona/viona.conf
new file mode 100644
index 0000000000..e66488531a
--- /dev/null
+++ b/usr/src/uts/i86pc/io/viona/viona.conf
@@ -0,0 +1,14 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+name="viona" parent="pseudo";
diff --git a/usr/src/uts/i86pc/io/vmm/amd/amdv.c b/usr/src/uts/i86pc/io/vmm/amd/amdv.c
new file mode 100644
index 0000000000..6b62daae6c
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/amd/amdv.c
@@ -0,0 +1,271 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/amd/amdv.c 245678 2013-01-20 03:42:49Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/amd/amdv.c 245678 2013-01-20 03:42:49Z neel $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/errno.h>
+#include <sys/smp.h>
+
+#include <machine/vmm.h>
+#ifdef __FreeBSD__
+#include "io/iommu.h"
+#endif
+
+static int
+amdv_init(void)
+{
+
+ printf("amdv_init: not implemented\n");
+ return (ENXIO);
+}
+
+static int
+amdv_cleanup(void)
+{
+
+ printf("amdv_cleanup: not implemented\n");
+ return (ENXIO);
+}
+
+static void *
+amdv_vminit(struct vm *vm)
+{
+
+ printf("amdv_vminit: not implemented\n");
+ return (NULL);
+}
+
+static int
+amdv_vmrun(void *arg, int vcpu, register_t rip)
+{
+
+ printf("amdv_vmrun: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amdv_vmcleanup(void *arg)
+{
+
+ printf("amdv_vmcleanup: not implemented\n");
+ return;
+}
+
+static int
+amdv_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+
+ printf("amdv_vmmmap_set: not implemented\n");
+ return (EINVAL);
+}
+
+static vm_paddr_t
+amdv_vmmmap_get(void *arg, vm_paddr_t gpa)
+{
+
+ printf("amdv_vmmmap_get: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getreg(void *arg, int vcpu, int regnum, uint64_t *retval)
+{
+
+ printf("amdv_getreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setreg(void *arg, int vcpu, int regnum, uint64_t val)
+{
+
+ printf("amdv_setreg: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setdesc(void *vmi, int vcpu, int num, struct seg_desc *desc)
+{
+
+ printf("amdv_get_desc: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_getcap(void *arg, int vcpu, int type, int *retval)
+{
+
+ printf("amdv_getcap: not implemented\n");
+ return (EINVAL);
+}
+
+static int
+amdv_setcap(void *arg, int vcpu, int type, int val)
+{
+
+ printf("amdv_setcap: not implemented\n");
+ return (EINVAL);
+}
+
+struct vmm_ops vmm_ops_amd = {
+ amdv_init,
+ amdv_cleanup,
+ amdv_vminit,
+ amdv_vmrun,
+ amdv_vmcleanup,
+ amdv_vmmmap_set,
+ amdv_vmmmap_get,
+ amdv_getreg,
+ amdv_setreg,
+ amdv_getdesc,
+ amdv_setdesc,
+ amdv_getcap,
+ amdv_setcap
+};
+
+static int
+amd_iommu_init(void)
+{
+
+ printf("amd_iommu_init: not implemented\n");
+ return (ENXIO);
+}
+
+static void
+amd_iommu_cleanup(void)
+{
+
+ printf("amd_iommu_cleanup: not implemented\n");
+}
+
+static void
+amd_iommu_enable(void)
+{
+
+ printf("amd_iommu_enable: not implemented\n");
+}
+
+static void
+amd_iommu_disable(void)
+{
+
+ printf("amd_iommu_disable: not implemented\n");
+}
+
+static void *
+amd_iommu_create_domain(vm_paddr_t maxaddr)
+{
+
+ printf("amd_iommu_create_domain: not implemented\n");
+ return (NULL);
+}
+
+static void
+amd_iommu_destroy_domain(void *domain)
+{
+
+ printf("amd_iommu_destroy_domain: not implemented\n");
+}
+
+static uint64_t
+amd_iommu_create_mapping(void *domain, vm_paddr_t gpa, vm_paddr_t hpa,
+ uint64_t len)
+{
+
+ printf("amd_iommu_create_mapping: not implemented\n");
+ return (0);
+}
+
+static uint64_t
+amd_iommu_remove_mapping(void *domain, vm_paddr_t gpa, uint64_t len)
+{
+
+ printf("amd_iommu_remove_mapping: not implemented\n");
+ return (0);
+}
+
+static void
+amd_iommu_add_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_add_device: not implemented\n");
+}
+
+static void
+amd_iommu_remove_device(void *domain, int bus, int slot, int func)
+{
+
+ printf("amd_iommu_remove_device: not implemented\n");
+}
+
+static void
+amd_iommu_invalidate_tlb(void *domain)
+{
+
+ printf("amd_iommu_invalidate_tlb: not implemented\n");
+}
+
+#ifdef __FreeBSD__
+struct iommu_ops iommu_ops_amd = {
+ amd_iommu_init,
+ amd_iommu_cleanup,
+ amd_iommu_enable,
+ amd_iommu_disable,
+ amd_iommu_create_domain,
+ amd_iommu_destroy_domain,
+ amd_iommu_create_mapping,
+ amd_iommu_remove_mapping,
+ amd_iommu_add_device,
+ amd_iommu_remove_device,
+ amd_iommu_invalidate_tlb,
+};
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.c b/usr/src/uts/i86pc/io/vmm/intel/ept.c
new file mode 100644
index 0000000000..5ae9ed2f6a
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/intel/ept.c
@@ -0,0 +1,452 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z grehan $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/ept.c 252475 2013-07-01 20:05:43Z grehan $");
+
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/smp.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/param.h>
+#include <machine/cpufunc.h>
+#include <machine/pmap.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include "vmx_cpufunc.h"
+#include "vmx.h"
+#include "ept.h"
+
+#define EPT_PWL4(cap) ((cap) & (1UL << 6))
+#define EPT_MEMORY_TYPE_WB(cap) ((cap) & (1UL << 14))
+#define EPT_PDE_SUPERPAGE(cap) ((cap) & (1UL << 16)) /* 2MB pages */
+#define EPT_PDPTE_SUPERPAGE(cap) ((cap) & (1UL << 17)) /* 1GB pages */
+#define INVVPID_SUPPORTED(cap) ((cap) & (1UL << 32))
+#define INVEPT_SUPPORTED(cap) ((cap) & (1UL << 20))
+
+#define INVVPID_ALL_TYPES_MASK 0xF0000000000UL
+#define INVVPID_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVVPID_ALL_TYPES_MASK) == INVVPID_ALL_TYPES_MASK)
+
+#define INVEPT_ALL_TYPES_MASK 0x6000000UL
+#define INVEPT_ALL_TYPES_SUPPORTED(cap) \
+ (((cap) & INVEPT_ALL_TYPES_MASK) == INVEPT_ALL_TYPES_MASK)
+
+#define EPT_PG_RD (1 << 0)
+#define EPT_PG_WR (1 << 1)
+#define EPT_PG_EX (1 << 2)
+#define EPT_PG_MEMORY_TYPE(x) ((x) << 3)
+#define EPT_PG_IGNORE_PAT (1 << 6)
+#define EPT_PG_SUPERPAGE (1 << 7)
+
+#define EPT_ADDR_MASK ((uint64_t)-1 << 12)
+
+MALLOC_DECLARE(M_VMX);
+
+static uint64_t page_sizes_mask;
+
+/*
+ * Set this to 1 to have the EPT tables respect the guest PAT settings
+ */
+static int ept_pat_passthru;
+
+int
+ept_init(void)
+{
+ int page_shift;
+ uint64_t cap;
+
+ cap = rdmsr(MSR_VMX_EPT_VPID_CAP);
+
+ /*
+ * Verify that:
+ * - page walk length is 4 steps
+ * - extended page tables can be laid out in write-back memory
+ * - invvpid instruction with all possible types is supported
+ * - invept instruction with all possible types is supported
+ */
+ if (!EPT_PWL4(cap) ||
+ !EPT_MEMORY_TYPE_WB(cap) ||
+ !INVVPID_SUPPORTED(cap) ||
+ !INVVPID_ALL_TYPES_SUPPORTED(cap) ||
+ !INVEPT_SUPPORTED(cap) ||
+ !INVEPT_ALL_TYPES_SUPPORTED(cap))
+ return (EINVAL);
+
+ /* Set bits in 'page_sizes_mask' for each valid page size */
+ page_shift = PAGE_SHIFT;
+ page_sizes_mask = 1UL << page_shift; /* 4KB page */
+
+ page_shift += 9;
+ if (EPT_PDE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 2MB superpage */
+
+ page_shift += 9;
+ if (EPT_PDPTE_SUPERPAGE(cap))
+ page_sizes_mask |= 1UL << page_shift; /* 1GB superpage */
+
+ return (0);
+}
+
+#if 0
+static void
+ept_dump(uint64_t *ptp, int nlevels)
+{
+ int i, t, tabs;
+ uint64_t *ptpnext, ptpval;
+
+ if (--nlevels < 0)
+ return;
+
+ tabs = 3 - nlevels;
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("PTP = %p\n", ptp);
+
+ for (i = 0; i < 512; i++) {
+ ptpval = ptp[i];
+
+ if (ptpval == 0)
+ continue;
+
+ for (t = 0; t < tabs; t++)
+ printf("\t");
+ printf("%3d 0x%016lx\n", i, ptpval);
+
+ if (nlevels != 0 && (ptpval & EPT_PG_SUPERPAGE) == 0) {
+ ptpnext = (uint64_t *)
+ PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+ ept_dump(ptpnext, nlevels);
+ }
+ }
+}
+#endif
+
+static size_t
+ept_create_mapping(uint64_t *ptp, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, vm_prot_t prot, boolean_t spok)
+{
+ int spshift, ptpshift, ptpindex, nlevels;
+
+ /*
+ * Compute the size of the mapping that we can accomodate.
+ *
+ * This is based on three factors:
+ * - super page sizes supported by the processor
+ * - alignment of the region starting at 'gpa' and 'hpa'
+ * - length of the region 'len'
+ */
+ spshift = PAGE_SHIFT;
+ if (spok)
+ spshift += (EPT_PWLEVELS - 1) * 9;
+ while (spshift >= PAGE_SHIFT) {
+ uint64_t spsize = 1UL << spshift;
+ if ((page_sizes_mask & spsize) != 0 &&
+ (gpa & (spsize - 1)) == 0 &&
+ (hpa & (spsize - 1)) == 0 &&
+ length >= spsize) {
+ break;
+ }
+ spshift -= 9;
+ }
+
+ if (spshift < PAGE_SHIFT) {
+ panic("Invalid spshift for gpa 0x%016lx, hpa 0x%016lx, "
+ "length 0x%016lx, page_sizes_mask 0x%016lx",
+ gpa, hpa, length, page_sizes_mask);
+ }
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) {
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ /* We have reached the leaf mapping */
+ if (spshift >= ptpshift)
+ break;
+
+ /*
+ * We are working on a non-leaf page table page.
+ *
+ * Create the next level page table page if necessary and point
+ * to it from the current page table.
+ */
+ if (ptp[ptpindex] == 0) {
+#ifdef __FreeBSD__
+ void *nlp = malloc(PAGE_SIZE, M_VMX, M_WAITOK | M_ZERO);
+#else
+ void *nlp = kmem_zalloc(PAGE_SIZE, KM_SLEEP);
+ ASSERT((((uintptr_t)nlp) & PAGE_MASK) == 0);
+#endif
+ ptp[ptpindex] = vtophys(nlp);
+ ptp[ptpindex] |= EPT_PG_RD | EPT_PG_WR | EPT_PG_EX;
+ }
+
+ /* Work our way down to the next level page table page */
+#ifdef __FreeBSD__
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptp[ptpindex] & EPT_ADDR_MASK);
+#else
+ ptp = (uint64_t *)hat_kpm_pfn2va(btop(ptp[ptpindex] & EPT_ADDR_MASK));
+#endif
+ }
+
+ if ((gpa & ((1UL << ptpshift) - 1)) != 0) {
+ panic("ept_create_mapping: gpa 0x%016lx and ptpshift %d "
+ "mismatch\n", gpa, ptpshift);
+ }
+
+ if (prot != VM_PROT_NONE) {
+ /* Do the mapping */
+ ptp[ptpindex] = hpa;
+
+ /* Apply the access controls */
+ if (prot & VM_PROT_READ)
+ ptp[ptpindex] |= EPT_PG_RD;
+ if (prot & VM_PROT_WRITE)
+ ptp[ptpindex] |= EPT_PG_WR;
+ if (prot & VM_PROT_EXECUTE)
+ ptp[ptpindex] |= EPT_PG_EX;
+
+ /*
+ * By default the PAT type is ignored - this appears to
+ * be how other hypervisors handle EPT. Allow this to be
+ * overridden.
+ */
+ ptp[ptpindex] |= EPT_PG_MEMORY_TYPE(attr);
+ if (!ept_pat_passthru)
+ ptp[ptpindex] |= EPT_PG_IGNORE_PAT;
+
+ if (nlevels > 0)
+ ptp[ptpindex] |= EPT_PG_SUPERPAGE;
+ } else {
+ /* Remove the mapping */
+ ptp[ptpindex] = 0;
+ }
+
+ return (1UL << ptpshift);
+}
+
+static vm_paddr_t
+ept_lookup_mapping(uint64_t *ptp, vm_paddr_t gpa)
+{
+ int nlevels, ptpshift, ptpindex;
+ uint64_t ptpval, hpabase, pgmask;
+
+ nlevels = EPT_PWLEVELS;
+ while (--nlevels >= 0) {
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gpa >> ptpshift) & 0x1FF;
+
+ ptpval = ptp[ptpindex];
+
+ /* Cannot make progress beyond this point */
+ if ((ptpval & (EPT_PG_RD | EPT_PG_WR | EPT_PG_EX)) == 0)
+ break;
+
+ if (nlevels == 0 || (ptpval & EPT_PG_SUPERPAGE)) {
+ pgmask = (1UL << ptpshift) - 1;
+ hpabase = ptpval & ~pgmask;
+ return (hpabase | (gpa & pgmask));
+ }
+
+ /* Work our way down to the next level page table page */
+#ifdef __FreBSD__
+ ptp = (uint64_t *)PHYS_TO_DMAP(ptpval & EPT_ADDR_MASK);
+#else
+ ptp = (uint64_t *)hat_kpm_pfn2va(btop(ptpval & EPT_ADDR_MASK));
+#endif
+ }
+
+ return ((vm_paddr_t)-1);
+}
+
+static void
+ept_free_pt_entry(pt_entry_t pte)
+{
+ if (pte == 0)
+ return;
+
+ /* sanity check */
+ if ((pte & EPT_PG_SUPERPAGE) != 0)
+ panic("ept_free_pt_entry: pte cannot have superpage bit");
+
+ return;
+}
+
+static void
+ept_free_pd_entry(pd_entry_t pde)
+{
+ pt_entry_t *pt;
+ int i;
+
+ if (pde == 0)
+ return;
+
+ if ((pde & EPT_PG_SUPERPAGE) == 0) {
+#ifdef __FreeBSD__
+ pt = (pt_entry_t *)PHYS_TO_DMAP(pde & EPT_ADDR_MASK);
+ for (i = 0; i < NPTEPG; i++)
+ ept_free_pt_entry(pt[i]);
+ free(pt, M_VMX); /* free the page table page */
+#else
+ page_t *pp;
+ pt = (pt_entry_t *)hat_kpm_pfn2va(btop(pde & EPT_ADDR_MASK));
+ for (i = 0; i < NPTEPG; i++)
+ ept_free_pt_entry(pt[i]);
+ pp = page_numtopp_nolock(btop(pde & EPT_ADDR_MASK));
+ kmem_free((void *)pp->p_offset, PAGE_SIZE);
+#endif
+ }
+}
+
+static void
+ept_free_pdp_entry(pdp_entry_t pdpe)
+{
+ pd_entry_t *pd;
+ int i;
+
+ if (pdpe == 0)
+ return;
+
+ if ((pdpe & EPT_PG_SUPERPAGE) == 0) {
+#ifdef __FreeBSD__
+ pd = (pd_entry_t *)PHYS_TO_DMAP(pdpe & EPT_ADDR_MASK);
+ for (i = 0; i < NPDEPG; i++)
+ ept_free_pd_entry(pd[i]);
+ free(pd, M_VMX); /* free the page directory page */
+#else
+ page_t *pp;
+ pd = (pd_entry_t *)hat_kpm_pfn2va(btop(pdpe & EPT_ADDR_MASK));
+ for (i = 0; i < NPDEPG; i++)
+ ept_free_pd_entry(pd[i]);
+ pp = page_numtopp_nolock(btop(pdpe & EPT_ADDR_MASK));
+ kmem_free((void *)pp->p_offset, PAGE_SIZE);
+#endif
+ }
+}
+
+static void
+ept_free_pml4_entry(pml4_entry_t pml4e)
+{
+ pdp_entry_t *pdp;
+ int i;
+
+ if (pml4e == 0)
+ return;
+
+ if ((pml4e & EPT_PG_SUPERPAGE) == 0) {
+#ifdef __FreeBSD__
+ pdp = (pdp_entry_t *)PHYS_TO_DMAP(pml4e & EPT_ADDR_MASK);
+ for (i = 0; i < NPDPEPG; i++)
+ ept_free_pdp_entry(pdp[i]);
+ free(pdp, M_VMX); /* free the page directory ptr page */
+#else
+ page_t *pp;
+ pdp = (pdp_entry_t *)hat_kpm_pfn2va(btop(pml4e
+ & EPT_ADDR_MASK));
+ for (i = 0; i < NPDPEPG; i++)
+ ept_free_pdp_entry(pdp[i]);
+ pp = page_numtopp_nolock(btop(pml4e & EPT_ADDR_MASK));
+ kmem_free((void *)pp->p_offset, PAGE_SIZE);
+#endif
+ }
+}
+
+void
+ept_vmcleanup(struct vmx *vmx)
+{
+ int i;
+
+ for (i = 0; i < NPML4EPG; i++)
+ ept_free_pml4_entry(vmx->pml4ept[i]);
+}
+
+int
+ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t len,
+ vm_memattr_t attr, int prot, boolean_t spok)
+{
+ size_t n;
+ struct vmx *vmx = arg;
+
+ while (len > 0) {
+ n = ept_create_mapping(vmx->pml4ept, gpa, hpa, len, attr,
+ prot, spok);
+ len -= n;
+ gpa += n;
+ hpa += n;
+ }
+
+ return (0);
+}
+
+vm_paddr_t
+ept_vmmmap_get(void *arg, vm_paddr_t gpa)
+{
+ vm_paddr_t hpa;
+ struct vmx *vmx;
+
+ vmx = arg;
+ hpa = ept_lookup_mapping(vmx->pml4ept, gpa);
+ return (hpa);
+}
+
+static void
+invept_single_context(void *arg)
+{
+ struct invept_desc desc = *(struct invept_desc *)arg;
+
+ invept(INVEPT_TYPE_SINGLE_CONTEXT, desc);
+}
+
+void
+ept_invalidate_mappings(u_long pml4ept)
+{
+ struct invept_desc invept_desc = { 0 };
+
+ invept_desc.eptp = EPTP(pml4ept);
+
+ smp_rendezvous(NULL, invept_single_context, NULL, &invept_desc);
+}
diff --git a/usr/src/uts/i86pc/io/vmm/intel/ept.h b/usr/src/uts/i86pc/io/vmm/intel/ept.h
new file mode 100644
index 0000000000..d0bcce7ec3
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/intel/ept.h
@@ -0,0 +1,43 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/intel/ept.h 245678 2013-01-20 03:42:49Z neel $
+ */
+
+#ifndef _EPT_H_
+#define _EPT_H_
+
+struct vmx;
+
+#define EPT_PWLEVELS 4 /* page walk levels */
+#define EPTP(pml4) ((pml4) | (EPT_PWLEVELS - 1) << 3 | PAT_WRITE_BACK)
+
+int ept_init(void);
+int ept_vmmmap_set(void *arg, vm_paddr_t gpa, vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot, boolean_t allow_superpage_mappings);
+vm_paddr_t ept_vmmmap_get(void *arg, vm_paddr_t gpa);
+void ept_invalidate_mappings(u_long ept_pml4);
+void ept_vmcleanup(struct vmx *vmx);
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.c b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c
new file mode 100644
index 0000000000..bbd2da2a34
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.c
@@ -0,0 +1,597 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifdef __FreeBSD__
+#include "opt_ddb.h"
+#endif
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmcs.c 266550 2014-05-22 17:22:37Z neel $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/pcpu.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/segments.h>
+#include <machine/vmm.h>
+#include "vmm_host.h"
+#include "vmx_cpufunc.h"
+#include "vmcs.h"
+#include "ept.h"
+#include "vmx.h"
+
+#ifdef DDB
+#include <ddb/ddb.h>
+#endif
+
+static uint64_t
+vmcs_fix_regval(uint32_t encoding, uint64_t val)
+{
+
+ switch (encoding) {
+ case VMCS_GUEST_CR0:
+ val = vmx_fix_cr0(val);
+ break;
+ case VMCS_GUEST_CR4:
+ val = vmx_fix_cr4(val);
+ break;
+ default:
+ break;
+ }
+ return (val);
+}
+
+static uint32_t
+vmcs_field_encoding(int ident)
+{
+ switch (ident) {
+ case VM_REG_GUEST_CR0:
+ return (VMCS_GUEST_CR0);
+ case VM_REG_GUEST_CR3:
+ return (VMCS_GUEST_CR3);
+ case VM_REG_GUEST_CR4:
+ return (VMCS_GUEST_CR4);
+ case VM_REG_GUEST_DR7:
+ return (VMCS_GUEST_DR7);
+ case VM_REG_GUEST_RSP:
+ return (VMCS_GUEST_RSP);
+ case VM_REG_GUEST_RIP:
+ return (VMCS_GUEST_RIP);
+ case VM_REG_GUEST_RFLAGS:
+ return (VMCS_GUEST_RFLAGS);
+ case VM_REG_GUEST_ES:
+ return (VMCS_GUEST_ES_SELECTOR);
+ case VM_REG_GUEST_CS:
+ return (VMCS_GUEST_CS_SELECTOR);
+ case VM_REG_GUEST_SS:
+ return (VMCS_GUEST_SS_SELECTOR);
+ case VM_REG_GUEST_DS:
+ return (VMCS_GUEST_DS_SELECTOR);
+ case VM_REG_GUEST_FS:
+ return (VMCS_GUEST_FS_SELECTOR);
+ case VM_REG_GUEST_GS:
+ return (VMCS_GUEST_GS_SELECTOR);
+ case VM_REG_GUEST_TR:
+ return (VMCS_GUEST_TR_SELECTOR);
+ case VM_REG_GUEST_LDTR:
+ return (VMCS_GUEST_LDTR_SELECTOR);
+ case VM_REG_GUEST_EFER:
+ return (VMCS_GUEST_IA32_EFER);
+ default:
+ return (-1);
+ }
+
+}
+
+static int
+vmcs_seg_desc_encoding(int seg, uint32_t *base, uint32_t *lim, uint32_t *acc)
+{
+
+ switch (seg) {
+ case VM_REG_GUEST_ES:
+ *base = VMCS_GUEST_ES_BASE;
+ *lim = VMCS_GUEST_ES_LIMIT;
+ *acc = VMCS_GUEST_ES_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_CS:
+ *base = VMCS_GUEST_CS_BASE;
+ *lim = VMCS_GUEST_CS_LIMIT;
+ *acc = VMCS_GUEST_CS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_SS:
+ *base = VMCS_GUEST_SS_BASE;
+ *lim = VMCS_GUEST_SS_LIMIT;
+ *acc = VMCS_GUEST_SS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_DS:
+ *base = VMCS_GUEST_DS_BASE;
+ *lim = VMCS_GUEST_DS_LIMIT;
+ *acc = VMCS_GUEST_DS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_FS:
+ *base = VMCS_GUEST_FS_BASE;
+ *lim = VMCS_GUEST_FS_LIMIT;
+ *acc = VMCS_GUEST_FS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_GS:
+ *base = VMCS_GUEST_GS_BASE;
+ *lim = VMCS_GUEST_GS_LIMIT;
+ *acc = VMCS_GUEST_GS_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_TR:
+ *base = VMCS_GUEST_TR_BASE;
+ *lim = VMCS_GUEST_TR_LIMIT;
+ *acc = VMCS_GUEST_TR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_LDTR:
+ *base = VMCS_GUEST_LDTR_BASE;
+ *lim = VMCS_GUEST_LDTR_LIMIT;
+ *acc = VMCS_GUEST_LDTR_ACCESS_RIGHTS;
+ break;
+ case VM_REG_GUEST_IDTR:
+ *base = VMCS_GUEST_IDTR_BASE;
+ *lim = VMCS_GUEST_IDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ case VM_REG_GUEST_GDTR:
+ *base = VMCS_GUEST_GDTR_BASE;
+ *lim = VMCS_GUEST_GDTR_LIMIT;
+ *acc = VMCS_INVALID_ENCODING;
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+int
+vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *retval)
+{
+ int error;
+ uint32_t encoding;
+
+ /*
+ * If we need to get at vmx-specific state in the VMCS we can bypass
+ * the translation of 'ident' to 'encoding' by simply setting the
+ * sign bit. As it so happens the upper 16 bits are reserved (i.e
+ * set to 0) in the encodings for the VMCS so we are free to use the
+ * sign bit.
+ */
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ if (!running)
+ VMPTRLD(vmcs);
+
+ error = vmread(encoding, retval);
+
+ if (!running)
+ VMCLEAR(vmcs);
+
+ return (error);
+}
+
+int
+vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val)
+{
+ int error;
+ uint32_t encoding;
+
+ if (ident < 0)
+ encoding = ident & 0x7fffffff;
+ else
+ encoding = vmcs_field_encoding(ident);
+
+ if (encoding == (uint32_t)-1)
+ return (EINVAL);
+
+ val = vmcs_fix_regval(encoding, val);
+
+ if (!running)
+ VMPTRLD(vmcs);
+
+ error = vmwrite(encoding, val);
+
+ if (!running)
+ VMCLEAR(vmcs);
+
+ return (error);
+}
+
+int
+vmcs_setdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_setdesc: invalid segment register %d", seg);
+
+ if (!running)
+ VMPTRLD(vmcs);
+ if ((error = vmwrite(base, desc->base)) != 0)
+ goto done;
+
+ if ((error = vmwrite(limit, desc->limit)) != 0)
+ goto done;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmwrite(access, desc->access)) != 0)
+ goto done;
+ }
+done:
+ if (!running)
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_getdesc(struct vmcs *vmcs, int running, int seg, struct seg_desc *desc)
+{
+ int error;
+ uint32_t base, limit, access;
+ uint64_t u64;
+
+ error = vmcs_seg_desc_encoding(seg, &base, &limit, &access);
+ if (error != 0)
+ panic("vmcs_getdesc: invalid segment register %d", seg);
+
+ if (!running)
+ VMPTRLD(vmcs);
+ if ((error = vmread(base, &u64)) != 0)
+ goto done;
+ desc->base = u64;
+
+ if ((error = vmread(limit, &u64)) != 0)
+ goto done;
+ desc->limit = u64;
+
+ if (access != VMCS_INVALID_ENCODING) {
+ if ((error = vmread(access, &u64)) != 0)
+ goto done;
+ desc->access = u64;
+ }
+done:
+ if (!running)
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+int
+vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count)
+{
+ int error;
+
+ VMPTRLD(vmcs);
+
+ /*
+ * Guest MSRs are saved in the VM-exit MSR-store area.
+ * Guest MSRs are loaded from the VM-entry MSR-load area.
+ * Both areas point to the same location in memory.
+ */
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_MSR_STORE_COUNT, g_count)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD, g_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_MSR_LOAD_COUNT, g_count)) != 0)
+ goto done;
+
+ error = 0;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+#ifndef __FreeBSD__
+int
+vmcs_set_host_msr_save(struct vmcs *vmcs, u_long h_area, u_int h_count)
+{
+ int error;
+
+ VMPTRLD(vmcs);
+
+ /*
+ * Host MSRs are loaded from the VM-exit MSR-load area.
+ */
+ if ((error = vmwrite(VMCS_EXIT_MSR_LOAD, h_area)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_MSR_LOAD_COUNT, h_count)) != 0)
+ goto done;
+
+ error = 0;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+#endif
+
+int
+vmcs_set_defaults(struct vmcs *vmcs,
+ u_long host_rip, u_long host_rsp, u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap, uint16_t vpid)
+{
+ int error, codesel, datasel, tsssel;
+ u_long cr0, cr4, efer;
+ uint64_t eptp, pat, fsbase, idtrbase;
+ uint32_t exc_bitmap;
+
+ codesel = vmm_get_host_codesel();
+ datasel = vmm_get_host_datasel();
+ tsssel = vmm_get_host_tsssel();
+
+ /*
+ * Make sure we have a "current" VMCS to work with.
+ */
+ VMPTRLD(vmcs);
+
+ /*
+ * Load the VMX controls
+ */
+ if ((error = vmwrite(VMCS_PIN_BASED_CTLS, pinbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_PRI_PROC_BASED_CTLS, procbased_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_SEC_PROC_BASED_CTLS, procbased_ctls2)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_EXIT_CTLS, exit_ctls)) != 0)
+ goto done;
+ if ((error = vmwrite(VMCS_ENTRY_CTLS, entry_ctls)) != 0)
+ goto done;
+
+ /* Guest state */
+
+ /* Initialize guest IA32_PAT MSR with the default value */
+ pat = PAT_VALUE(0, PAT_WRITE_BACK) |
+ PAT_VALUE(1, PAT_WRITE_THROUGH) |
+ PAT_VALUE(2, PAT_UNCACHED) |
+ PAT_VALUE(3, PAT_UNCACHEABLE) |
+ PAT_VALUE(4, PAT_WRITE_BACK) |
+ PAT_VALUE(5, PAT_WRITE_THROUGH) |
+ PAT_VALUE(6, PAT_UNCACHED) |
+ PAT_VALUE(7, PAT_UNCACHEABLE);
+ if ((error = vmwrite(VMCS_GUEST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Host state */
+
+ /* Initialize host IA32_PAT MSR */
+ pat = vmm_get_host_pat();
+ if ((error = vmwrite(VMCS_HOST_IA32_PAT, pat)) != 0)
+ goto done;
+
+ /* Load the IA32_EFER MSR */
+ efer = vmm_get_host_efer();
+ if ((error = vmwrite(VMCS_HOST_IA32_EFER, efer)) != 0)
+ goto done;
+
+ /* Load the control registers */
+
+ cr0 = vmm_get_host_cr0();
+ if ((error = vmwrite(VMCS_HOST_CR0, cr0)) != 0)
+ goto done;
+
+ cr4 = vmm_get_host_cr4() | CR4_VMXE;
+ if ((error = vmwrite(VMCS_HOST_CR4, cr4)) != 0)
+ goto done;
+
+ /* Load the segment selectors */
+ if ((error = vmwrite(VMCS_HOST_ES_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_CS_SELECTOR, codesel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_SS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_DS_SELECTOR, datasel)) != 0)
+ goto done;
+
+#ifdef __FreeBSD__
+ if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, datasel)) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, datasel)) != 0)
+ goto done;
+#else
+ if ((error = vmwrite(VMCS_HOST_FS_SELECTOR, vmm_get_host_fssel())) != 0)
+ goto done;
+
+ if ((error = vmwrite(VMCS_HOST_GS_SELECTOR, vmm_get_host_gssel())) != 0)
+ goto done;
+#endif
+
+ if ((error = vmwrite(VMCS_HOST_TR_SELECTOR, tsssel)) != 0)
+ goto done;
+
+#ifdef __FreeBSD__
+ /*
+ * Load the Base-Address for %fs and idtr.
+ *
+ * Note that we exclude %gs, tss and gdtr here because their base
+ * address is pcpu specific.
+ */
+ fsbase = vmm_get_host_fsbase();
+ if ((error = vmwrite(VMCS_HOST_FS_BASE, fsbase)) != 0)
+ goto done;
+#endif
+
+ idtrbase = vmm_get_host_idtrbase();
+ if ((error = vmwrite(VMCS_HOST_IDTR_BASE, idtrbase)) != 0)
+ goto done;
+
+ /* instruction pointer */
+ if ((error = vmwrite(VMCS_HOST_RIP, host_rip)) != 0)
+ goto done;
+
+ /* stack pointer */
+ if ((error = vmwrite(VMCS_HOST_RSP, host_rsp)) != 0)
+ goto done;
+
+ /* eptp */
+ eptp = EPTP(ept_pml4);
+ if ((error = vmwrite(VMCS_EPTP, eptp)) != 0)
+ goto done;
+
+ /* vpid */
+ if ((error = vmwrite(VMCS_VPID, vpid)) != 0)
+ goto done;
+
+ /* msr bitmap */
+ if ((error = vmwrite(VMCS_MSR_BITMAP, msr_bitmap)) != 0)
+ goto done;
+
+ /* exception bitmap */
+ exc_bitmap = 1 << IDT_MC;
+ if ((error = vmwrite(VMCS_EXCEPTION_BITMAP, exc_bitmap)) != 0)
+ goto done;
+
+ /* link pointer */
+ if ((error = vmwrite(VMCS_LINK_POINTER, ~0)) != 0)
+ goto done;
+done:
+ VMCLEAR(vmcs);
+ return (error);
+}
+
+#ifdef DDB
+extern int vmxon_enabled[];
+
+DB_SHOW_COMMAND(vmcs, db_show_vmcs)
+{
+ uint64_t cur_vmcs, val;
+ uint32_t exit;
+
+ if (!vmxon_enabled[curcpu]) {
+ db_printf("VMX not enabled\n");
+ return;
+ }
+
+ if (have_addr) {
+ db_printf("Only current VMCS supported\n");
+ return;
+ }
+
+ vmptrst(&cur_vmcs);
+ if (cur_vmcs == VMCS_INITIAL) {
+ db_printf("No current VM context\n");
+ return;
+ }
+ db_printf("VMCS: %jx\n", cur_vmcs);
+ db_printf("VPID: %lu\n", vmcs_read(VMCS_VPID));
+ db_printf("Activity: ");
+ val = vmcs_read(VMCS_GUEST_ACTIVITY);
+ switch (val) {
+ case 0:
+ db_printf("Active");
+ break;
+ case 1:
+ db_printf("HLT");
+ break;
+ case 2:
+ db_printf("Shutdown");
+ break;
+ case 3:
+ db_printf("Wait for SIPI");
+ break;
+ default:
+ db_printf("Unknown: %#lx", val);
+ }
+ db_printf("\n");
+ exit = vmcs_read(VMCS_EXIT_REASON);
+ if (exit & 0x80000000)
+ db_printf("Entry Failure Reason: %u\n", exit & 0xffff);
+ else
+ db_printf("Exit Reason: %u\n", exit & 0xffff);
+ db_printf("Qualification: %#lx\n", vmcs_exit_qualification());
+ db_printf("Guest Linear Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_LINEAR_ADDRESS));
+ switch (exit & 0x8000ffff) {
+ case EXIT_REASON_EXCEPTION:
+ case EXIT_REASON_EXT_INTR:
+ val = vmcs_read(VMCS_EXIT_INTR_INFO);
+ db_printf("Interrupt Type: ");
+ switch (val >> 8 & 0x7) {
+ case 0:
+ db_printf("external");
+ break;
+ case 2:
+ db_printf("NMI");
+ break;
+ case 3:
+ db_printf("HW exception");
+ break;
+ case 4:
+ db_printf("SW exception");
+ break;
+ default:
+ db_printf("?? %lu", val >> 8 & 0x7);
+ break;
+ }
+ db_printf(" Vector: %lu", val & 0xff);
+ if (val & 0x800)
+ db_printf(" Error Code: %lx",
+ vmcs_read(VMCS_EXIT_INTR_ERRCODE));
+ db_printf("\n");
+ break;
+ case EXIT_REASON_EPT_FAULT:
+ case EXIT_REASON_EPT_MISCONFIG:
+ db_printf("Guest Physical Address: %#lx\n",
+ vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS));
+ break;
+ }
+ db_printf("VM-instruction error: %#lx\n", vmcs_instruction_error());
+}
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmcs.h b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h
new file mode 100644
index 0000000000..20e99e8184
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmcs.h
@@ -0,0 +1,410 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/intel/vmcs.h 276098 2014-12-23 02:14:49Z neel $
+ */
+
+#ifndef _VMCS_H_
+#define _VMCS_H_
+
+#ifdef _KERNEL
+struct vmcs {
+ uint32_t identifier;
+ uint32_t abort_code;
+ char _impl_specific[PAGE_SIZE - sizeof(uint32_t) * 2];
+};
+CTASSERT(sizeof(struct vmcs) == PAGE_SIZE);
+
+/* MSR save region is composed of an array of 'struct msr_entry' */
+struct msr_entry {
+ uint32_t index;
+ uint32_t reserved;
+ uint64_t val;
+
+};
+
+int vmcs_set_msr_save(struct vmcs *vmcs, u_long g_area, u_int g_count);
+int vmcs_init(struct vmcs *vmcs);
+#ifndef __FreeBSD__
+int vmcs_set_host_msr_save(struct vmcs *vmcs, u_long h_area, u_int h_count);
+#endif
+int vmcs_set_defaults(struct vmcs *vmcs, u_long host_rip, u_long host_rsp,
+ u_long ept_pml4,
+ uint32_t pinbased_ctls, uint32_t procbased_ctls,
+ uint32_t procbased_ctls2, uint32_t exit_ctls,
+ uint32_t entry_ctls, u_long msr_bitmap,
+ uint16_t vpid);
+int vmcs_getreg(struct vmcs *vmcs, int running, int ident, uint64_t *rv);
+int vmcs_setreg(struct vmcs *vmcs, int running, int ident, uint64_t val);
+int vmcs_getdesc(struct vmcs *vmcs, int running, int ident,
+ struct seg_desc *desc);
+int vmcs_setdesc(struct vmcs *vmcs, int running, int ident,
+ struct seg_desc *desc);
+
+/*
+ * Avoid header pollution caused by inline use of 'vtophys()' in vmx_cpufunc.h
+ */
+#ifdef _VMX_CPUFUNC_H_
+static __inline uint64_t
+vmcs_read(uint32_t encoding)
+{
+ int error;
+ uint64_t val;
+
+ error = vmread(encoding, &val);
+ KASSERT(error == 0, ("vmcs_read(%u) error %d", encoding, error));
+ return (val);
+}
+
+static __inline void
+vmcs_write(uint32_t encoding, uint64_t val)
+{
+ int error;
+
+ error = vmwrite(encoding, val);
+ KASSERT(error == 0, ("vmcs_write(%u) error %d", encoding, error));
+}
+#endif /* _VMX_CPUFUNC_H_ */
+
+#define vmexit_instruction_length() vmcs_read(VMCS_EXIT_INSTRUCTION_LENGTH)
+#define vmcs_guest_rip() vmcs_read(VMCS_GUEST_RIP)
+#define vmcs_instruction_error() vmcs_read(VMCS_INSTRUCTION_ERROR)
+#define vmcs_exit_reason() (vmcs_read(VMCS_EXIT_REASON) & 0xffff)
+#define vmcs_exit_qualification() vmcs_read(VMCS_EXIT_QUALIFICATION)
+#define vmcs_guest_cr3() vmcs_read(VMCS_GUEST_CR3)
+#define vmcs_gpa() vmcs_read(VMCS_GUEST_PHYSICAL_ADDRESS)
+#define vmcs_gla() vmcs_read(VMCS_GUEST_LINEAR_ADDRESS)
+#define vmcs_idt_vectoring_info() vmcs_read(VMCS_IDT_VECTORING_INFO)
+#define vmcs_idt_vectoring_err() vmcs_read(VMCS_IDT_VECTORING_ERROR)
+
+#endif /* _KERNEL */
+
+#define VMCS_INITIAL 0xffffffffffffffff
+
+#define VMCS_IDENT(encoding) ((encoding) | 0x80000000)
+/*
+ * VMCS field encodings from Appendix H, Intel Architecture Manual Vol3B.
+ */
+#define VMCS_INVALID_ENCODING 0xffffffff
+
+/* 16-bit control fields */
+#define VMCS_VPID 0x00000000
+#define VMCS_PIR_VECTOR 0x00000002
+
+/* 16-bit guest-state fields */
+#define VMCS_GUEST_ES_SELECTOR 0x00000800
+#define VMCS_GUEST_CS_SELECTOR 0x00000802
+#define VMCS_GUEST_SS_SELECTOR 0x00000804
+#define VMCS_GUEST_DS_SELECTOR 0x00000806
+#define VMCS_GUEST_FS_SELECTOR 0x00000808
+#define VMCS_GUEST_GS_SELECTOR 0x0000080A
+#define VMCS_GUEST_LDTR_SELECTOR 0x0000080C
+#define VMCS_GUEST_TR_SELECTOR 0x0000080E
+#define VMCS_GUEST_INTR_STATUS 0x00000810
+
+/* 16-bit host-state fields */
+#define VMCS_HOST_ES_SELECTOR 0x00000C00
+#define VMCS_HOST_CS_SELECTOR 0x00000C02
+#define VMCS_HOST_SS_SELECTOR 0x00000C04
+#define VMCS_HOST_DS_SELECTOR 0x00000C06
+#define VMCS_HOST_FS_SELECTOR 0x00000C08
+#define VMCS_HOST_GS_SELECTOR 0x00000C0A
+#define VMCS_HOST_TR_SELECTOR 0x00000C0C
+
+/* 64-bit control fields */
+#define VMCS_IO_BITMAP_A 0x00002000
+#define VMCS_IO_BITMAP_B 0x00002002
+#define VMCS_MSR_BITMAP 0x00002004
+#define VMCS_EXIT_MSR_STORE 0x00002006
+#define VMCS_EXIT_MSR_LOAD 0x00002008
+#define VMCS_ENTRY_MSR_LOAD 0x0000200A
+#define VMCS_EXECUTIVE_VMCS 0x0000200C
+#define VMCS_TSC_OFFSET 0x00002010
+#define VMCS_VIRTUAL_APIC 0x00002012
+#define VMCS_APIC_ACCESS 0x00002014
+#define VMCS_PIR_DESC 0x00002016
+#define VMCS_EPTP 0x0000201A
+#define VMCS_EOI_EXIT0 0x0000201C
+#define VMCS_EOI_EXIT1 0x0000201E
+#define VMCS_EOI_EXIT2 0x00002020
+#define VMCS_EOI_EXIT3 0x00002022
+#define VMCS_EOI_EXIT(vector) (VMCS_EOI_EXIT0 + ((vector) / 64) * 2)
+
+/* 64-bit read-only fields */
+#define VMCS_GUEST_PHYSICAL_ADDRESS 0x00002400
+
+/* 64-bit guest-state fields */
+#define VMCS_LINK_POINTER 0x00002800
+#define VMCS_GUEST_IA32_DEBUGCTL 0x00002802
+#define VMCS_GUEST_IA32_PAT 0x00002804
+#define VMCS_GUEST_IA32_EFER 0x00002806
+#define VMCS_GUEST_IA32_PERF_GLOBAL_CTRL 0x00002808
+#define VMCS_GUEST_PDPTE0 0x0000280A
+#define VMCS_GUEST_PDPTE1 0x0000280C
+#define VMCS_GUEST_PDPTE2 0x0000280E
+#define VMCS_GUEST_PDPTE3 0x00002810
+
+/* 64-bit host-state fields */
+#define VMCS_HOST_IA32_PAT 0x00002C00
+#define VMCS_HOST_IA32_EFER 0x00002C02
+#define VMCS_HOST_IA32_PERF_GLOBAL_CTRL 0x00002C04
+
+/* 32-bit control fields */
+#define VMCS_PIN_BASED_CTLS 0x00004000
+#define VMCS_PRI_PROC_BASED_CTLS 0x00004002
+#define VMCS_EXCEPTION_BITMAP 0x00004004
+#define VMCS_PF_ERROR_MASK 0x00004006
+#define VMCS_PF_ERROR_MATCH 0x00004008
+#define VMCS_CR3_TARGET_COUNT 0x0000400A
+#define VMCS_EXIT_CTLS 0x0000400C
+#define VMCS_EXIT_MSR_STORE_COUNT 0x0000400E
+#define VMCS_EXIT_MSR_LOAD_COUNT 0x00004010
+#define VMCS_ENTRY_CTLS 0x00004012
+#define VMCS_ENTRY_MSR_LOAD_COUNT 0x00004014
+#define VMCS_ENTRY_INTR_INFO 0x00004016
+#define VMCS_ENTRY_EXCEPTION_ERROR 0x00004018
+#define VMCS_ENTRY_INST_LENGTH 0x0000401A
+#define VMCS_TPR_THRESHOLD 0x0000401C
+#define VMCS_SEC_PROC_BASED_CTLS 0x0000401E
+#define VMCS_PLE_GAP 0x00004020
+#define VMCS_PLE_WINDOW 0x00004022
+
+/* 32-bit read-only data fields */
+#define VMCS_INSTRUCTION_ERROR 0x00004400
+#define VMCS_EXIT_REASON 0x00004402
+#define VMCS_EXIT_INTR_INFO 0x00004404
+#define VMCS_EXIT_INTR_ERRCODE 0x00004406
+#define VMCS_IDT_VECTORING_INFO 0x00004408
+#define VMCS_IDT_VECTORING_ERROR 0x0000440A
+#define VMCS_EXIT_INSTRUCTION_LENGTH 0x0000440C
+#define VMCS_EXIT_INSTRUCTION_INFO 0x0000440E
+
+/* 32-bit guest-state fields */
+#define VMCS_GUEST_ES_LIMIT 0x00004800
+#define VMCS_GUEST_CS_LIMIT 0x00004802
+#define VMCS_GUEST_SS_LIMIT 0x00004804
+#define VMCS_GUEST_DS_LIMIT 0x00004806
+#define VMCS_GUEST_FS_LIMIT 0x00004808
+#define VMCS_GUEST_GS_LIMIT 0x0000480A
+#define VMCS_GUEST_LDTR_LIMIT 0x0000480C
+#define VMCS_GUEST_TR_LIMIT 0x0000480E
+#define VMCS_GUEST_GDTR_LIMIT 0x00004810
+#define VMCS_GUEST_IDTR_LIMIT 0x00004812
+#define VMCS_GUEST_ES_ACCESS_RIGHTS 0x00004814
+#define VMCS_GUEST_CS_ACCESS_RIGHTS 0x00004816
+#define VMCS_GUEST_SS_ACCESS_RIGHTS 0x00004818
+#define VMCS_GUEST_DS_ACCESS_RIGHTS 0x0000481A
+#define VMCS_GUEST_FS_ACCESS_RIGHTS 0x0000481C
+#define VMCS_GUEST_GS_ACCESS_RIGHTS 0x0000481E
+#define VMCS_GUEST_LDTR_ACCESS_RIGHTS 0x00004820
+#define VMCS_GUEST_TR_ACCESS_RIGHTS 0x00004822
+#define VMCS_GUEST_INTERRUPTIBILITY 0x00004824
+#define VMCS_GUEST_ACTIVITY 0x00004826
+#define VMCS_GUEST_SMBASE 0x00004828
+#define VMCS_GUEST_IA32_SYSENTER_CS 0x0000482A
+#define VMCS_PREEMPTION_TIMER_VALUE 0x0000482E
+
+/* 32-bit host state fields */
+#define VMCS_HOST_IA32_SYSENTER_CS 0x00004C00
+
+/* Natural Width control fields */
+#define VMCS_CR0_MASK 0x00006000
+#define VMCS_CR4_MASK 0x00006002
+#define VMCS_CR0_SHADOW 0x00006004
+#define VMCS_CR4_SHADOW 0x00006006
+#define VMCS_CR3_TARGET0 0x00006008
+#define VMCS_CR3_TARGET1 0x0000600A
+#define VMCS_CR3_TARGET2 0x0000600C
+#define VMCS_CR3_TARGET3 0x0000600E
+
+/* Natural Width read-only fields */
+#define VMCS_EXIT_QUALIFICATION 0x00006400
+#define VMCS_IO_RCX 0x00006402
+#define VMCS_IO_RSI 0x00006404
+#define VMCS_IO_RDI 0x00006406
+#define VMCS_IO_RIP 0x00006408
+#define VMCS_GUEST_LINEAR_ADDRESS 0x0000640A
+
+/* Natural Width guest-state fields */
+#define VMCS_GUEST_CR0 0x00006800
+#define VMCS_GUEST_CR3 0x00006802
+#define VMCS_GUEST_CR4 0x00006804
+#define VMCS_GUEST_ES_BASE 0x00006806
+#define VMCS_GUEST_CS_BASE 0x00006808
+#define VMCS_GUEST_SS_BASE 0x0000680A
+#define VMCS_GUEST_DS_BASE 0x0000680C
+#define VMCS_GUEST_FS_BASE 0x0000680E
+#define VMCS_GUEST_GS_BASE 0x00006810
+#define VMCS_GUEST_LDTR_BASE 0x00006812
+#define VMCS_GUEST_TR_BASE 0x00006814
+#define VMCS_GUEST_GDTR_BASE 0x00006816
+#define VMCS_GUEST_IDTR_BASE 0x00006818
+#define VMCS_GUEST_DR7 0x0000681A
+#define VMCS_GUEST_RSP 0x0000681C
+#define VMCS_GUEST_RIP 0x0000681E
+#define VMCS_GUEST_RFLAGS 0x00006820
+#define VMCS_GUEST_PENDING_DBG_EXCEPTIONS 0x00006822
+#define VMCS_GUEST_IA32_SYSENTER_ESP 0x00006824
+#define VMCS_GUEST_IA32_SYSENTER_EIP 0x00006826
+
+/* Natural Width host-state fields */
+#define VMCS_HOST_CR0 0x00006C00
+#define VMCS_HOST_CR3 0x00006C02
+#define VMCS_HOST_CR4 0x00006C04
+#define VMCS_HOST_FS_BASE 0x00006C06
+#define VMCS_HOST_GS_BASE 0x00006C08
+#define VMCS_HOST_TR_BASE 0x00006C0A
+#define VMCS_HOST_GDTR_BASE 0x00006C0C
+#define VMCS_HOST_IDTR_BASE 0x00006C0E
+#define VMCS_HOST_IA32_SYSENTER_ESP 0x00006C10
+#define VMCS_HOST_IA32_SYSENTER_EIP 0x00006C12
+#define VMCS_HOST_RSP 0x00006C14
+#define VMCS_HOST_RIP 0x00006c16
+
+/*
+ * VM instruction error numbers
+ */
+#define VMRESUME_WITH_NON_LAUNCHED_VMCS 5
+
+/*
+ * VMCS exit reasons
+ */
+#define EXIT_REASON_EXCEPTION 0
+#define EXIT_REASON_EXT_INTR 1
+#define EXIT_REASON_TRIPLE_FAULT 2
+#define EXIT_REASON_INIT 3
+#define EXIT_REASON_SIPI 4
+#define EXIT_REASON_IO_SMI 5
+#define EXIT_REASON_SMI 6
+#define EXIT_REASON_INTR_WINDOW 7
+#define EXIT_REASON_NMI_WINDOW 8
+#define EXIT_REASON_TASK_SWITCH 9
+#define EXIT_REASON_CPUID 10
+#define EXIT_REASON_GETSEC 11
+#define EXIT_REASON_HLT 12
+#define EXIT_REASON_INVD 13
+#define EXIT_REASON_INVLPG 14
+#define EXIT_REASON_RDPMC 15
+#define EXIT_REASON_RDTSC 16
+#define EXIT_REASON_RSM 17
+#define EXIT_REASON_VMCALL 18
+#define EXIT_REASON_VMCLEAR 19
+#define EXIT_REASON_VMLAUNCH 20
+#define EXIT_REASON_VMPTRLD 21
+#define EXIT_REASON_VMPTRST 22
+#define EXIT_REASON_VMREAD 23
+#define EXIT_REASON_VMRESUME 24
+#define EXIT_REASON_VMWRITE 25
+#define EXIT_REASON_VMXOFF 26
+#define EXIT_REASON_VMXON 27
+#define EXIT_REASON_CR_ACCESS 28
+#define EXIT_REASON_DR_ACCESS 29
+#define EXIT_REASON_INOUT 30
+#define EXIT_REASON_RDMSR 31
+#define EXIT_REASON_WRMSR 32
+#define EXIT_REASON_INVAL_VMCS 33
+#define EXIT_REASON_INVAL_MSR 34
+#define EXIT_REASON_MWAIT 36
+#define EXIT_REASON_MTF 37
+#define EXIT_REASON_MONITOR 39
+#define EXIT_REASON_PAUSE 40
+#define EXIT_REASON_MCE_DURING_ENTRY 41
+#define EXIT_REASON_TPR 43
+#define EXIT_REASON_APIC_ACCESS 44
+#define EXIT_REASON_VIRTUALIZED_EOI 45
+#define EXIT_REASON_GDTR_IDTR 46
+#define EXIT_REASON_LDTR_TR 47
+#define EXIT_REASON_EPT_FAULT 48
+#define EXIT_REASON_EPT_MISCONFIG 49
+#define EXIT_REASON_INVEPT 50
+#define EXIT_REASON_RDTSCP 51
+#define EXIT_REASON_VMX_PREEMPT 52
+#define EXIT_REASON_INVVPID 53
+#define EXIT_REASON_WBINVD 54
+#define EXIT_REASON_XSETBV 55
+#define EXIT_REASON_APIC_WRITE 56
+
+/*
+ * NMI unblocking due to IRET.
+ *
+ * Applies to VM-exits due to hardware exception or EPT fault.
+ */
+#define EXIT_QUAL_NMIUDTI (1 << 12)
+/*
+ * VMCS interrupt information fields
+ */
+#define VMCS_INTR_VALID (1U << 31)
+#define VMCS_INTR_T_MASK 0x700 /* Interruption-info type */
+#define VMCS_INTR_T_HWINTR (0 << 8)
+#define VMCS_INTR_T_NMI (2 << 8)
+#define VMCS_INTR_T_HWEXCEPTION (3 << 8)
+#define VMCS_INTR_T_SWINTR (4 << 8)
+#define VMCS_INTR_T_PRIV_SWEXCEPTION (5 << 8)
+#define VMCS_INTR_T_SWEXCEPTION (6 << 8)
+#define VMCS_INTR_DEL_ERRCODE (1 << 11)
+
+/*
+ * VMCS IDT-Vectoring information fields
+ */
+#define VMCS_IDT_VEC_VALID (1U << 31)
+#define VMCS_IDT_VEC_ERRCODE_VALID (1 << 11)
+
+/*
+ * VMCS Guest interruptibility field
+ */
+#define VMCS_INTERRUPTIBILITY_STI_BLOCKING (1 << 0)
+#define VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING (1 << 1)
+#define VMCS_INTERRUPTIBILITY_SMI_BLOCKING (1 << 2)
+#define VMCS_INTERRUPTIBILITY_NMI_BLOCKING (1 << 3)
+
+/*
+ * Exit qualification for EXIT_REASON_INVAL_VMCS
+ */
+#define EXIT_QUAL_NMI_WHILE_STI_BLOCKING 3
+
+/*
+ * Exit qualification for EPT violation
+ */
+#define EPT_VIOLATION_DATA_READ (1UL << 0)
+#define EPT_VIOLATION_DATA_WRITE (1UL << 1)
+#define EPT_VIOLATION_INST_FETCH (1UL << 2)
+#define EPT_VIOLATION_GPA_READABLE (1UL << 3)
+#define EPT_VIOLATION_GPA_WRITEABLE (1UL << 4)
+#define EPT_VIOLATION_GPA_EXECUTABLE (1UL << 5)
+#define EPT_VIOLATION_GLA_VALID (1UL << 7)
+#define EPT_VIOLATION_XLAT_VALID (1UL << 8)
+
+/*
+ * Exit qualification for APIC-access VM exit
+ */
+#define APIC_ACCESS_OFFSET(qual) ((qual) & 0xFFF)
+#define APIC_ACCESS_TYPE(qual) (((qual) >> 12) & 0xF)
+
+/*
+ * Exit qualification for APIC-write VM exit
+ */
+#define APIC_WRITE_OFFSET(qual) ((qual) & 0xFFF)
+
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.c b/usr/src/uts/i86pc/io/vmm/intel/vmx.c
new file mode 100644
index 0000000000..7ddf4e2a46
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.c
@@ -0,0 +1,2842 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z tychon $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx.c 284174 2015-06-09 00:14:47Z tychon $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/proc.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/psl.h>
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/segments.h>
+#include <machine/smp.h>
+#include <machine/specialreg.h>
+#include <machine/vmparam.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+#include <machine/vmm_instruction_emul.h>
+#include "vmm_lapic.h"
+#include "vmm_host.h"
+#include "vmm_ioport.h"
+#include "vmm_ktr.h"
+#include "vmm_stat.h"
+#include "vatpic.h"
+#include "vlapic.h"
+#include "vlapic_priv.h"
+
+#include "ept.h"
+#include "vmx_cpufunc.h"
+#include "vmx.h"
+#include "vmx_msr.h"
+#include "x86.h"
+#include "vmx_controls.h"
+
+#define PINBASED_CTLS_ONE_SETTING \
+ (PINBASED_EXTINT_EXITING | \
+ PINBASED_NMI_EXITING | \
+ PINBASED_VIRTUAL_NMI)
+#define PINBASED_CTLS_ZERO_SETTING 0
+
+#define PROCBASED_CTLS_WINDOW_SETTING \
+ (PROCBASED_INT_WINDOW_EXITING | \
+ PROCBASED_NMI_WINDOW_EXITING)
+
+#define PROCBASED_CTLS_ONE_SETTING \
+ (PROCBASED_SECONDARY_CONTROLS | \
+ PROCBASED_IO_EXITING | \
+ PROCBASED_MSR_BITMAPS | \
+ PROCBASED_CTLS_WINDOW_SETTING | \
+ PROCBASED_CR8_LOAD_EXITING | \
+ PROCBASED_CR8_STORE_EXITING)
+#define PROCBASED_CTLS_ZERO_SETTING \
+ (PROCBASED_CR3_LOAD_EXITING | \
+ PROCBASED_CR3_STORE_EXITING | \
+ PROCBASED_IO_BITMAPS)
+
+#define PROCBASED_CTLS2_ONE_SETTING PROCBASED2_ENABLE_EPT
+#define PROCBASED_CTLS2_ZERO_SETTING 0
+
+#define VM_EXIT_CTLS_ONE_SETTING \
+ (VM_EXIT_HOST_LMA | \
+ VM_EXIT_SAVE_EFER | \
+ VM_EXIT_LOAD_EFER | \
+ VM_EXIT_LOAD_PAT | \
+ VM_EXIT_SAVE_PAT | \
+ VM_EXIT_LOAD_PAT)
+
+#define VM_EXIT_CTLS_ZERO_SETTING VM_EXIT_SAVE_DEBUG_CONTROLS
+
+#define VM_ENTRY_CTLS_ONE_SETTING (VM_ENTRY_LOAD_EFER | VM_ENTRY_LOAD_PAT)
+
+#define VM_ENTRY_CTLS_ZERO_SETTING \
+ (VM_ENTRY_LOAD_DEBUG_CONTROLS | \
+ VM_ENTRY_INTO_SMM | \
+ VM_ENTRY_DEACTIVATE_DUAL_MONITOR)
+
+#define HANDLED 1
+#define UNHANDLED 0
+
+static MALLOC_DEFINE(M_VMX, "vmx", "vmx");
+static MALLOC_DEFINE(M_VLAPIC, "vlapic", "vlapic");
+
+SYSCTL_DECL(_hw_vmm);
+SYSCTL_NODE(_hw_vmm, OID_AUTO, vmx, CTLFLAG_RW, NULL, NULL);
+
+int vmxon_enabled[MAXCPU];
+static char vmxon_region[MAXCPU][PAGE_SIZE] __aligned(PAGE_SIZE);
+#ifndef __FreeBSD__
+static vm_paddr_t vmxon_region_pa[MAXCPU];
+#endif
+
+static uint32_t pinbased_ctls, procbased_ctls, procbased_ctls2;
+static uint32_t exit_ctls, entry_ctls;
+
+static uint64_t cr0_ones_mask, cr0_zeros_mask;
+SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_ones_mask, CTLFLAG_RD,
+ &cr0_ones_mask, 0, NULL);
+SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr0_zeros_mask, CTLFLAG_RD,
+ &cr0_zeros_mask, 0, NULL);
+
+static uint64_t cr4_ones_mask, cr4_zeros_mask;
+SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_ones_mask, CTLFLAG_RD,
+ &cr4_ones_mask, 0, NULL);
+SYSCTL_ULONG(_hw_vmm_vmx, OID_AUTO, cr4_zeros_mask, CTLFLAG_RD,
+ &cr4_zeros_mask, 0, NULL);
+
+static int vmx_initialized;
+SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, initialized, CTLFLAG_RD,
+ &vmx_initialized, 0, "Intel VMX initialized");
+
+/*
+ * Optional capabilities
+ */
+static int cap_halt_exit;
+static int cap_pause_exit;
+static int cap_unrestricted_guest;
+static int cap_monitor_trap;
+static int cap_invpcid;
+
+static int virtual_interrupt_delivery;
+SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, virtual_interrupt_delivery, CTLFLAG_RD,
+ &virtual_interrupt_delivery, 0, "APICv virtual interrupt delivery support");
+
+static int posted_interrupts;
+SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupts, CTLFLAG_RD,
+ &posted_interrupts, 0, "APICv posted interrupt support");
+
+static int pirvec;
+SYSCTL_INT(_hw_vmm_vmx, OID_AUTO, posted_interrupt_vector, CTLFLAG_RD,
+ &pirvec, 0, "APICv posted interrupt vector");
+
+static struct unrhdr *vpid_unr;
+static u_int vpid_alloc_failed;
+SYSCTL_UINT(_hw_vmm_vmx, OID_AUTO, vpid_alloc_failed, CTLFLAG_RD,
+ &vpid_alloc_failed, 0, NULL);
+
+/*
+ * Use the last page below 4GB as the APIC access address. This address is
+ * occupied by the boot firmware so it is guaranteed that it will not conflict
+ * with a page in system memory.
+ */
+#define APIC_ACCESS_ADDRESS 0xFFFFF000
+
+static int vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc);
+static int vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval);
+static int vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val);
+static void vmx_inject_pir(struct vlapic *vlapic);
+
+#ifdef KTR
+static const char *
+exit_reason_to_str(int reason)
+{
+ static char reasonbuf[32];
+
+ switch (reason) {
+ case EXIT_REASON_EXCEPTION:
+ return "exception";
+ case EXIT_REASON_EXT_INTR:
+ return "extint";
+ case EXIT_REASON_TRIPLE_FAULT:
+ return "triplefault";
+ case EXIT_REASON_INIT:
+ return "init";
+ case EXIT_REASON_SIPI:
+ return "sipi";
+ case EXIT_REASON_IO_SMI:
+ return "iosmi";
+ case EXIT_REASON_SMI:
+ return "smi";
+ case EXIT_REASON_INTR_WINDOW:
+ return "intrwindow";
+ case EXIT_REASON_NMI_WINDOW:
+ return "nmiwindow";
+ case EXIT_REASON_TASK_SWITCH:
+ return "taskswitch";
+ case EXIT_REASON_CPUID:
+ return "cpuid";
+ case EXIT_REASON_GETSEC:
+ return "getsec";
+ case EXIT_REASON_HLT:
+ return "hlt";
+ case EXIT_REASON_INVD:
+ return "invd";
+ case EXIT_REASON_INVLPG:
+ return "invlpg";
+ case EXIT_REASON_RDPMC:
+ return "rdpmc";
+ case EXIT_REASON_RDTSC:
+ return "rdtsc";
+ case EXIT_REASON_RSM:
+ return "rsm";
+ case EXIT_REASON_VMCALL:
+ return "vmcall";
+ case EXIT_REASON_VMCLEAR:
+ return "vmclear";
+ case EXIT_REASON_VMLAUNCH:
+ return "vmlaunch";
+ case EXIT_REASON_VMPTRLD:
+ return "vmptrld";
+ case EXIT_REASON_VMPTRST:
+ return "vmptrst";
+ case EXIT_REASON_VMREAD:
+ return "vmread";
+ case EXIT_REASON_VMRESUME:
+ return "vmresume";
+ case EXIT_REASON_VMWRITE:
+ return "vmwrite";
+ case EXIT_REASON_VMXOFF:
+ return "vmxoff";
+ case EXIT_REASON_VMXON:
+ return "vmxon";
+ case EXIT_REASON_CR_ACCESS:
+ return "craccess";
+ case EXIT_REASON_DR_ACCESS:
+ return "draccess";
+ case EXIT_REASON_INOUT:
+ return "inout";
+ case EXIT_REASON_RDMSR:
+ return "rdmsr";
+ case EXIT_REASON_WRMSR:
+ return "wrmsr";
+ case EXIT_REASON_INVAL_VMCS:
+ return "invalvmcs";
+ case EXIT_REASON_INVAL_MSR:
+ return "invalmsr";
+ case EXIT_REASON_MWAIT:
+ return "mwait";
+ case EXIT_REASON_MTF:
+ return "mtf";
+ case EXIT_REASON_MONITOR:
+ return "monitor";
+ case EXIT_REASON_PAUSE:
+ return "pause";
+ case EXIT_REASON_MCE:
+ return "mce";
+ case EXIT_REASON_TPR:
+ return "tpr";
+ case EXIT_REASON_APIC_ACCESS:
+ return "apic-access";
+ case EXIT_REASON_GDTR_IDTR:
+ return "gdtridtr";
+ case EXIT_REASON_LDTR_TR:
+ return "ldtrtr";
+ case EXIT_REASON_EPT_FAULT:
+ return "eptfault";
+ case EXIT_REASON_EPT_MISCONFIG:
+ return "eptmisconfig";
+ case EXIT_REASON_INVEPT:
+ return "invept";
+ case EXIT_REASON_RDTSCP:
+ return "rdtscp";
+ case EXIT_REASON_VMX_PREEMPT:
+ return "vmxpreempt";
+ case EXIT_REASON_INVVPID:
+ return "invvpid";
+ case EXIT_REASON_WBINVD:
+ return "wbinvd";
+ case EXIT_REASON_XSETBV:
+ return "xsetbv";
+ case EXIT_REASON_APIC_WRITE:
+ return "apic-write";
+ default:
+ snprintf(reasonbuf, sizeof(reasonbuf), "%d", reason);
+ return (reasonbuf);
+ }
+}
+
+#ifdef SETJMP_TRACE
+static const char *
+vmx_setjmp_rc2str(int rc)
+{
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ return "direct";
+ case VMX_RETURN_LONGJMP:
+ return "longjmp";
+ case VMX_RETURN_VMRESUME:
+ return "vmresume";
+ case VMX_RETURN_VMLAUNCH:
+ return "vmlaunch";
+ case VMX_RETURN_AST:
+ return "ast";
+ default:
+ return "unknown";
+ }
+}
+
+#define SETJMP_TRACE(vmx, vcpu, vmxctx, regname) \
+ VMM_CTR1((vmx)->vm, (vcpu), "setjmp trace " #regname " 0x%016lx", \
+ (vmxctx)->regname)
+
+static void
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ uint64_t host_rip, host_rsp;
+
+ if (vmxctx != &vmx->ctx[vcpu])
+ panic("vmx_setjmp_trace: invalid vmxctx %p; should be %p",
+ vmxctx, &vmx->ctx[vcpu]);
+
+ VMM_CTR1((vmx)->vm, (vcpu), "vmxctx = %p", vmxctx);
+ VMM_CTR2((vmx)->vm, (vcpu), "setjmp return code %s(%d)",
+ vmx_setjmp_rc2str(rc), rc);
+
+ host_rsp = host_rip = ~0;
+ vmread(VMCS_HOST_RIP, &host_rip);
+ vmread(VMCS_HOST_RSP, &host_rsp);
+ VMM_CTR2((vmx)->vm, (vcpu), "vmcs host_rip 0x%016lx, host_rsp 0x%016lx",
+ host_rip, host_rsp);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rsp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, host_rip);
+
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rsi);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rdx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rcx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r8);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r9);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rax);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbx);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_rbp);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r10);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r11);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r12);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r13);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r14);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_r15);
+ SETJMP_TRACE(vmx, vcpu, vmxctx, guest_cr2);
+}
+#endif
+#else
+static void __inline
+vmx_setjmp_trace(struct vmx *vmx, int vcpu, struct vmxctx *vmxctx, int rc)
+{
+ return;
+}
+#endif /* KTR */
+
+static int
+vmx_allow_x2apic_msrs(struct vmx *vmx)
+{
+ int i, error;
+
+ error = 0;
+
+ /*
+ * Allow readonly access to the following x2APIC MSRs from the guest.
+ */
+ error += guest_msr_ro(vmx, MSR_APIC_ID);
+ error += guest_msr_ro(vmx, MSR_APIC_VERSION);
+ error += guest_msr_ro(vmx, MSR_APIC_LDR);
+ error += guest_msr_ro(vmx, MSR_APIC_SVR);
+
+ for (i = 0; i < 8; i++)
+ error += guest_msr_ro(vmx, MSR_APIC_ISR0 + i);
+
+ for (i = 0; i < 8; i++)
+ error += guest_msr_ro(vmx, MSR_APIC_TMR0 + i);
+
+ for (i = 0; i < 8; i++)
+ error += guest_msr_ro(vmx, MSR_APIC_IRR0 + i);
+
+ error += guest_msr_ro(vmx, MSR_APIC_ESR);
+ error += guest_msr_ro(vmx, MSR_APIC_LVT_TIMER);
+ error += guest_msr_ro(vmx, MSR_APIC_LVT_THERMAL);
+ error += guest_msr_ro(vmx, MSR_APIC_LVT_PCINT);
+ error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT0);
+ error += guest_msr_ro(vmx, MSR_APIC_LVT_LINT1);
+ error += guest_msr_ro(vmx, MSR_APIC_LVT_ERROR);
+ error += guest_msr_ro(vmx, MSR_APIC_ICR_TIMER);
+ error += guest_msr_ro(vmx, MSR_APIC_DCR_TIMER);
+ error += guest_msr_ro(vmx, MSR_APIC_ICR);
+
+ /*
+ * Allow TPR, EOI and SELF_IPI MSRs to be read and written by the guest.
+ *
+ * These registers get special treatment described in the section
+ * "Virtualizing MSR-Based APIC Accesses".
+ */
+ error += guest_msr_rw(vmx, MSR_APIC_TPR);
+ error += guest_msr_rw(vmx, MSR_APIC_EOI);
+ error += guest_msr_rw(vmx, MSR_APIC_SELF_IPI);
+
+ return (error);
+}
+
+u_long
+vmx_fix_cr0(u_long cr0)
+{
+
+ return ((cr0 | cr0_ones_mask) & ~cr0_zeros_mask);
+}
+
+u_long
+vmx_fix_cr4(u_long cr4)
+{
+
+ return ((cr4 | cr4_ones_mask) & ~cr4_zeros_mask);
+}
+
+static void
+vpid_free(int vpid)
+{
+ if (vpid < 0 || vpid > 0xffff)
+ panic("vpid_free: invalid vpid %d", vpid);
+
+ /*
+ * VPIDs [0,VM_MAXCPU] are special and are not allocated from
+ * the unit number allocator.
+ */
+
+ if (vpid > VM_MAXCPU)
+ free_unr(vpid_unr, vpid);
+}
+
+static void
+vpid_alloc(uint16_t *vpid, int num)
+{
+ int i, x;
+
+ if (num <= 0 || num > VM_MAXCPU)
+ panic("invalid number of vpids requested: %d", num);
+
+ /*
+ * If the "enable vpid" execution control is not enabled then the
+ * VPID is required to be 0 for all vcpus.
+ */
+ if ((procbased_ctls2 & PROCBASED2_ENABLE_VPID) == 0) {
+ for (i = 0; i < num; i++)
+ vpid[i] = 0;
+ return;
+ }
+
+ /*
+ * Allocate a unique VPID for each vcpu from the unit number allocator.
+ */
+ for (i = 0; i < num; i++) {
+ x = alloc_unr(vpid_unr);
+ if (x == -1)
+ break;
+ else
+ vpid[i] = x;
+ }
+
+ if (i < num) {
+ atomic_add_int(&vpid_alloc_failed, 1);
+
+ /*
+ * If the unit number allocator does not have enough unique
+ * VPIDs then we need to allocate from the [1,VM_MAXCPU] range.
+ *
+ * These VPIDs are not be unique across VMs but this does not
+ * affect correctness because the combined mappings are also
+ * tagged with the EP4TA which is unique for each VM.
+ *
+ * It is still sub-optimal because the invvpid will invalidate
+ * combined mappings for a particular VPID across all EP4TAs.
+ */
+ while (i-- > 0)
+ vpid_free(vpid[i]);
+
+ for (i = 0; i < num; i++)
+ vpid[i] = i + 1;
+ }
+}
+
+static void
+vpid_init(void)
+{
+ /*
+ * VPID 0 is required when the "enable VPID" execution control is
+ * disabled.
+ *
+ * VPIDs [1,VM_MAXCPU] are used as the "overflow namespace" when the
+ * unit number allocator does not have sufficient unique VPIDs to
+ * satisfy the allocation.
+ *
+ * The remaining VPIDs are managed by the unit number allocator.
+ */
+ vpid_unr = new_unrhdr(VM_MAXCPU + 1, 0xffff, NULL);
+}
+
+#ifndef __FreeBSD__
+static void
+msr_save_area_init(struct msr_entry *g_area, int *g_count)
+{
+ int cnt;
+
+ static struct msr_entry guest_msrs[] = {
+ { MSR_KGSBASE, 0, 0 },
+ { MSR_LSTAR, 0, 0 },
+ { MSR_CSTAR, 0, 0 },
+ { MSR_STAR, 0, 0 },
+ { MSR_SF_MASK, 0, 0 },
+ };
+
+ cnt = sizeof(guest_msrs) / sizeof(guest_msrs[0]);
+ if (cnt > GUEST_MSR_MAX_ENTRIES)
+ panic("guest msr save area overrun");
+ bcopy(guest_msrs, g_area, sizeof(guest_msrs));
+ *g_count = cnt;
+}
+
+static void
+host_msr_save_area_init(struct msr_entry *h_area, int *h_count)
+{
+ int i, cnt;
+
+ static struct msr_entry host_msrs[] = {
+ { MSR_LSTAR, 0, 0 },
+ { MSR_CSTAR, 0, 0 },
+ { MSR_STAR, 0, 0 },
+ { MSR_SF_MASK, 0, 0 },
+ };
+
+ cnt = sizeof(host_msrs) / sizeof(host_msrs[0]);
+ if (cnt > HOST_MSR_MAX_ENTRIES)
+ panic("host msr save area overrun");
+ for (i = 0; i < cnt; i++) {
+ host_msrs[i].val = rdmsr(host_msrs[i].index);
+ }
+ bcopy(host_msrs, h_area, sizeof(host_msrs));
+ *h_count = cnt;
+}
+#endif
+
+static void
+vmx_disable(void *arg __unused)
+{
+ struct invvpid_desc invvpid_desc = { 0 };
+ struct invept_desc invept_desc = { 0 };
+
+ if (vmxon_enabled[curcpu]) {
+ /*
+ * See sections 25.3.3.3 and 25.3.3.4 in Intel Vol 3b.
+ *
+ * VMXON or VMXOFF are not required to invalidate any TLB
+ * caching structures. This prevents potential retention of
+ * cached information in the TLB between distinct VMX episodes.
+ */
+ invvpid(INVVPID_TYPE_ALL_CONTEXTS, invvpid_desc);
+ invept(INVEPT_TYPE_ALL_CONTEXTS, invept_desc);
+ vmxoff();
+ }
+ load_cr4(rcr4() & ~CR4_VMXE);
+}
+
+static int
+vmx_cleanup(void)
+{
+
+#ifdef __FreeBSD__
+ if (pirvec != 0)
+ vmm_ipi_free(pirvec);
+#endif
+
+ if (vpid_unr != NULL) {
+ delete_unrhdr(vpid_unr);
+ vpid_unr = NULL;
+ }
+
+ smp_rendezvous(NULL, vmx_disable, NULL, NULL);
+
+ return (0);
+}
+
+static void
+vmx_enable(void *arg __unused)
+{
+ int error;
+ uint64_t feature_control;
+
+ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 0 ||
+ (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
+ wrmsr(MSR_IA32_FEATURE_CONTROL,
+ feature_control | IA32_FEATURE_CONTROL_VMX_EN |
+ IA32_FEATURE_CONTROL_LOCK);
+ }
+
+ load_cr4(rcr4() | CR4_VMXE);
+
+ *(uint32_t *)vmxon_region[curcpu] = vmx_revision();
+#ifdef __FreeBSD__
+ error = vmxon(vmxon_region[curcpu]);
+#else
+ error = vmxon_pa(vmxon_region_pa[curcpu]);
+ ASSERT(error == 0);
+#endif
+ if (error == 0)
+ vmxon_enabled[curcpu] = 1;
+}
+
+static int
+vmx_init(void)
+{
+#define X86FSET_VMX 35
+ extern uchar_t x86_featureset[];
+ extern boolean_t is_x86_feature(void *featureset, uint_t feature);
+ int error;
+ uint64_t fixed0, fixed1, feature_control;
+ uint32_t tmp;
+#ifndef __FreeBSD__
+ int i;
+#endif
+
+ /* CPUID.1:ECX[bit 5] must be 1 for processor to support VMX */
+#ifdef __FreeBSD__
+ if (!(cpu_feature2 & CPUID2_VMX)) {
+ printf("vmx_init: processor does not support VMX operation\n");
+ return (ENXIO);
+ }
+#else
+ if (!is_x86_feature(x86_featureset, X86FSET_VMX)) {
+ cmn_err(CE_WARN, "vmx_init: processor does not support VMX operation\n");
+ }
+#endif
+
+ /*
+ * Verify that MSR_IA32_FEATURE_CONTROL lock and VMXON enable bits
+ * are set (bits 0 and 2 respectively).
+ */
+ feature_control = rdmsr(MSR_IA32_FEATURE_CONTROL);
+ if ((feature_control & IA32_FEATURE_CONTROL_LOCK) == 1 &&
+ (feature_control & IA32_FEATURE_CONTROL_VMX_EN) == 0) {
+ printf("vmx_init: VMX operation disabled by BIOS\n");
+ return (ENXIO);
+ }
+
+ /* Check support for primary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_CTLS_ONE_SETTING,
+ PROCBASED_CTLS_ZERO_SETTING, &procbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired primary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Clear the processor-based ctl bits that are set on demand */
+ procbased_ctls &= ~PROCBASED_CTLS_WINDOW_SETTING;
+
+ /* Check support for secondary processor-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED_CTLS2_ONE_SETTING,
+ PROCBASED_CTLS2_ZERO_SETTING, &procbased_ctls2);
+ if (error) {
+ printf("vmx_init: processor does not support desired secondary "
+ "processor-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VPID */
+ error = vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2, MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_ENABLE_VPID, 0, &tmp);
+ if (error == 0)
+ procbased_ctls2 |= PROCBASED2_ENABLE_VPID;
+
+ /* Check support for pin-based VM-execution controls */
+ error = vmx_set_ctlreg(MSR_VMX_PINBASED_CTLS,
+ MSR_VMX_TRUE_PINBASED_CTLS,
+ PINBASED_CTLS_ONE_SETTING,
+ PINBASED_CTLS_ZERO_SETTING, &pinbased_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "pin-based controls\n");
+ return (error);
+ }
+
+ /* Check support for VM-exit controls */
+ error = vmx_set_ctlreg(MSR_VMX_EXIT_CTLS, MSR_VMX_TRUE_EXIT_CTLS,
+ VM_EXIT_CTLS_ONE_SETTING,
+ VM_EXIT_CTLS_ZERO_SETTING,
+ &exit_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "exit controls\n");
+ return (error);
+ }
+
+ /* Check support for VM-entry controls */
+ error = vmx_set_ctlreg(MSR_VMX_ENTRY_CTLS, MSR_VMX_TRUE_ENTRY_CTLS,
+ VM_ENTRY_CTLS_ONE_SETTING, VM_ENTRY_CTLS_ZERO_SETTING,
+ &entry_ctls);
+ if (error) {
+ printf("vmx_init: processor does not support desired "
+ "entry controls\n");
+ return (error);
+ }
+
+ /*
+ * Check support for optional features by testing them
+ * as individual bits
+ */
+ cap_halt_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_HLT_EXITING, 0,
+ &tmp) == 0);
+
+ cap_monitor_trap = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_PROCBASED_CTLS,
+ PROCBASED_MTF, 0,
+ &tmp) == 0);
+
+ cap_pause_exit = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS,
+ MSR_VMX_TRUE_PROCBASED_CTLS,
+ PROCBASED_PAUSE_EXITING, 0,
+ &tmp) == 0);
+
+ cap_unrestricted_guest = (vmx_set_ctlreg(MSR_VMX_PROCBASED_CTLS2,
+ MSR_VMX_PROCBASED_CTLS2,
+ PROCBASED2_UNRESTRICTED_GUEST, 0,
+ &tmp) == 0);
+
+ /* Initialize EPT */
+ error = ept_init();
+ if (error) {
+ printf("vmx_init: ept initialization failed (%d)\n", error);
+ return (error);
+ }
+
+ /*
+ * Stash the cr0 and cr4 bits that must be fixed to 0 or 1
+ */
+ fixed0 = rdmsr(MSR_VMX_CR0_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR0_FIXED1);
+ cr0_ones_mask = fixed0 & fixed1;
+ cr0_zeros_mask = ~fixed0 & ~fixed1;
+
+ /*
+ * CR0_PE and CR0_PG can be set to zero in VMX non-root operation
+ * if unrestricted guest execution is allowed.
+ */
+ if (cap_unrestricted_guest)
+ cr0_ones_mask &= ~(CR0_PG | CR0_PE);
+
+ /*
+ * Do not allow the guest to set CR0_NW or CR0_CD.
+ */
+ cr0_zeros_mask |= (CR0_NW | CR0_CD);
+
+ fixed0 = rdmsr(MSR_VMX_CR4_FIXED0);
+ fixed1 = rdmsr(MSR_VMX_CR4_FIXED1);
+ cr4_ones_mask = fixed0 & fixed1;
+ cr4_zeros_mask = ~fixed0 & ~fixed1;
+
+#ifndef __FreeBSD__
+ for (i = 0; i < MAXCPU; i++) {
+ vmxon_region_pa[i] = vtophys(&vmxon_region[i]);
+ }
+#endif
+
+ vpid_init();
+
+ vmx_msr_init();
+
+ /* enable VMX operation */
+ smp_rendezvous(NULL, vmx_enable, NULL, NULL);
+
+ vmx_initialized = 1;
+
+ return (0);
+}
+
+static int
+vmx_setup_cr_shadow(int which, struct vmcs *vmcs, uint32_t initial)
+{
+ int error, mask_ident, shadow_ident;
+ uint64_t mask_value;
+
+ if (which != 0 && which != 4)
+ panic("vmx_setup_cr_shadow: unknown cr%d", which);
+
+ if (which == 0) {
+ mask_ident = VMCS_CR0_MASK;
+ mask_value = cr0_ones_mask | cr0_zeros_mask;
+ shadow_ident = VMCS_CR0_SHADOW;
+ } else {
+ mask_ident = VMCS_CR4_MASK;
+ mask_value = cr4_ones_mask | cr4_zeros_mask;
+ shadow_ident = VMCS_CR4_SHADOW;
+ }
+
+ error = vmcs_setreg(vmcs, 0, VMCS_IDENT(mask_ident), mask_value);
+ if (error)
+ return (error);
+
+ error = vmcs_setreg(vmcs, 0, VMCS_IDENT(shadow_ident), initial);
+ if (error)
+ return (error);
+
+ return (0);
+}
+#define vmx_setup_cr0_shadow(vmcs,init) vmx_setup_cr_shadow(0, (vmcs), (init))
+#define vmx_setup_cr4_shadow(vmcs,init) vmx_setup_cr_shadow(4, (vmcs), (init))
+
+static void *
+vmx_vminit(struct vm *vm)
+{
+ uint16_t vpid[VM_MAXCPU];
+ int i, error, guest_msr_count;
+#ifndef __FreeBSD__
+ int host_msr_count;
+#endif
+ struct vmx *vmx;
+ struct vmcs *vmcs;
+
+ vmx = malloc(sizeof(struct vmx), M_VMX, M_WAITOK | M_ZERO);
+ if ((uintptr_t)vmx & PAGE_MASK) {
+ panic("malloc of struct vmx not aligned on %d byte boundary",
+ PAGE_SIZE);
+ }
+ vmx->vm = vm;
+
+ /*
+ * Clean up EPTP-tagged guest physical and combined mappings
+ *
+ * VMX transitions are not required to invalidate any guest physical
+ * mappings. So, it may be possible for stale guest physical mappings
+ * to be present in the processor TLBs.
+ *
+ * Combined mappings for this EP4TA are also invalidated for all VPIDs.
+ */
+ ept_invalidate_mappings(vtophys(vmx->pml4ept));
+
+ msr_bitmap_initialize(vmx->msr_bitmap);
+
+ /*
+ * It is safe to allow direct access to MSR_GSBASE and MSR_FSBASE.
+ * The guest FSBASE and GSBASE are saved and restored during
+ * vm-exit and vm-entry respectively. The host FSBASE and GSBASE are
+ * always restored from the vmcs host state area on vm-exit.
+ *
+ * The SYSENTER_CS/ESP/EIP MSRs are identical to FS/GSBASE in
+ * how they are saved/restored so can be directly accessed by the
+ * guest.
+ *
+ * MSR_EFER is saved and restored in the guest VMCS area on a
+ * VM exit and entry respectively. It is also restored from the
+ * host VMCS area on a VM exit.
+ *
+ * MSR_PAT is saved and restored in the guest VMCS are on a VM exit
+ * and entry respectively. It is also restored from the host VMCS
+ * area on a VM exit.
+ *
+ * The TSC MSR is exposed read-only. Writes are disallowed as
+ * that will impact the host TSC. If the guest does a write
+ * the "use TSC offsetting" execution control is enabled and the
+ * difference between the host TSC and the guest TSC is written
+ * into the TSC offset in the VMCS.
+ */
+ if (guest_msr_rw(vmx, MSR_GSBASE) ||
+ guest_msr_rw(vmx, MSR_FSBASE) ||
+ guest_msr_rw(vmx, MSR_SYSENTER_CS_MSR) ||
+ guest_msr_rw(vmx, MSR_SYSENTER_ESP_MSR) ||
+ guest_msr_rw(vmx, MSR_SYSENTER_EIP_MSR) ||
+ guest_msr_rw(vmx, MSR_EFER) ||
+ guest_msr_rw(vmx, MSR_PAT) ||
+ guest_msr_ro(vmx, MSR_TSC))
+ panic("vmx_vminit: error setting guest msr access");
+
+ vpid_alloc(vpid, VM_MAXCPU);
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vmcs = &vmx->vmcs[i];
+ vmcs->identifier = vmx_revision();
+ error = vmclear(vmcs);
+ if (error != 0) {
+ panic("vmx_vminit: vmclear error %d on vcpu %d\n",
+ error, i);
+ }
+
+ vmx_msr_guest_init(vmx, i);
+
+ error = vmcs_set_defaults(vmcs,
+ (u_long)vmx_longjmp,
+ (u_long)&vmx->ctx[i],
+ vtophys(vmx->pml4ept),
+ pinbased_ctls,
+ procbased_ctls,
+ procbased_ctls2,
+ exit_ctls, entry_ctls,
+ vtophys(vmx->msr_bitmap),
+ vpid[i]);
+
+ if (error != 0)
+ panic("vmx_vminit: vmcs_set_defaults error %d", error);
+
+ vmx->cap[i].set = 0;
+ vmx->cap[i].proc_ctls = procbased_ctls;
+
+ vmx->state[i].lastcpu = -1;
+ vmx->state[i].vpid = vpid[i];
+
+#ifndef __FreeBSD__
+ msr_save_area_init(vmx->guest_msrs[i], &guest_msr_count);
+
+ error = vmcs_set_msr_save(vmcs, vtophys(vmx->guest_msrs[i]),
+ guest_msr_count);
+ if (error != 0)
+ panic("vmcs_set_msr_save error %d", error);
+
+ host_msr_save_area_init(vmx->host_msrs[i], &host_msr_count);
+
+ error = vmcs_set_host_msr_save(&vmx->vmcs[i],
+ vtophys(vmx->host_msrs[i]),
+ host_msr_count);
+ if (error != 0)
+ panic("vmcs_set_msr_save error %d", error);
+#endif
+
+ /*
+ * Set up the CR0/4 shadows, and init the read shadow
+ * to the power-on register value from the Intel Sys Arch.
+ * CR0 - 0x60000010
+ * CR4 - 0
+ */
+ error = vmx_setup_cr0_shadow(vmcs, 0x60000010);
+ if (error != 0)
+ panic("vmx_setup_cr0_shadow %d", error);
+
+ error = vmx_setup_cr4_shadow(vmcs, 0);
+ if (error != 0)
+ panic("vmx_setup_cr4_shadow %d", error);
+ }
+
+ return (vmx);
+}
+
+static int
+vmx_handle_cpuid(struct vm *vm, int vcpu, struct vmxctx *vmxctx)
+{
+ int handled, func;
+
+ func = vmxctx->guest_rax;
+
+ handled = x86_emulate_cpuid(vm, vcpu,
+ (uint32_t*)(&vmxctx->guest_rax),
+ (uint32_t*)(&vmxctx->guest_rbx),
+ (uint32_t*)(&vmxctx->guest_rcx),
+ (uint32_t*)(&vmxctx->guest_rdx));
+ return (handled);
+}
+
+static __inline void
+vmx_run_trace(struct vmx *vmx, int vcpu)
+{
+#ifdef KTR
+ VCPU_CTR1(vmx->vm, vcpu, "Resume execution at %#lx", vmcs_guest_rip());
+#endif
+}
+
+static __inline void
+vmx_exit_trace(struct vmx *vmx, int vcpu, uint64_t rip, uint32_t exit_reason,
+ int handled)
+{
+#ifdef KTR
+ VCPU_CTR3(vmx->vm, vcpu, "%s %s vmexit at 0x%0lx",
+ handled ? "handled" : "unhandled",
+ exit_reason_to_str(exit_reason), rip);
+#endif
+}
+
+static __inline void
+vmx_astpending_trace(struct vmx *vmx, int vcpu, uint64_t rip)
+{
+#ifdef KTR
+ VCPU_CTR1(vmx->vm, vcpu, "astpending vmexit at 0x%0lx", rip);
+#endif
+}
+
+static void
+vmx_set_pcpu_defaults(struct vmx *vmx, int vcpu)
+{
+ struct vmxstate *vmxstate;
+ struct invvpid_desc invvpid_desc = { 0 };
+#ifndef __FreeBSD__
+ desctbr_t idtr, gdtr;
+#endif
+
+ vmxstate = &vmx->state[vcpu];
+ vmcs_write(VMCS_HOST_FS_BASE, vmm_get_host_fsbase());
+ if (vmxstate->lastcpu == curcpu)
+ return;
+
+ vmxstate->lastcpu = curcpu;
+
+ vmm_stat_incr(vmx->vm, vcpu, VCPU_MIGRATIONS, 1);
+
+ vmcs_write(VMCS_HOST_TR_BASE, vmm_get_host_trbase());
+ vmcs_write(VMCS_HOST_GDTR_BASE, vmm_get_host_gdtrbase());
+ vmcs_write(VMCS_HOST_GS_BASE, vmm_get_host_gsbase());
+
+#ifndef __FreeBSD__
+ vmcs_write(VMCS_HOST_IA32_SYSENTER_CS, rdmsr(MSR_SYSENTER_CS_MSR));
+ vmcs_write(VMCS_HOST_IA32_SYSENTER_ESP, rdmsr(MSR_SYSENTER_ESP_MSR));
+ vmcs_write(VMCS_HOST_IA32_SYSENTER_EIP, rdmsr(MSR_SYSENTER_EIP_MSR));
+#endif
+
+ /*
+ * If we are using VPIDs then invalidate all mappings tagged with 'vpid'
+ *
+ * We do this because this vcpu was executing on a different host
+ * cpu when it last ran. We do not track whether it invalidated
+ * mappings associated with its 'vpid' during that run. So we must
+ * assume that the mappings associated with 'vpid' on 'curcpu' are
+ * stale and invalidate them.
+ *
+ * Note that we incur this penalty only when the scheduler chooses to
+ * move the thread associated with this vcpu between host cpus.
+ *
+ * Note also that this will invalidate mappings tagged with 'vpid'
+ * for "all" EP4TAs.
+ */
+ if (vmxstate->vpid != 0) {
+ invvpid_desc.vpid = vmxstate->vpid;
+ invvpid(INVVPID_TYPE_SINGLE_CONTEXT, invvpid_desc);
+ }
+}
+
+static void
+vm_exit_update_rip(struct vm_exit *vmexit)
+{
+ int error;
+
+ error = vmwrite(VMCS_GUEST_RIP, vmexit->rip + vmexit->inst_length);
+ if (error)
+ panic("vmx_run: error %d writing to VMCS_GUEST_RIP", error);
+}
+
+/*
+ * We depend on 'procbased_ctls' to have the Interrupt Window Exiting bit set.
+ */
+CTASSERT((PROCBASED_CTLS_ONE_SETTING & PROCBASED_INT_WINDOW_EXITING) != 0);
+
+static void __inline
+vmx_set_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+
+ if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) == 0) {
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_INT_WINDOW_EXITING;
+ vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ VCPU_CTR0(vmx->vm, vcpu, "Enabling interrupt window exiting");
+ }
+}
+
+static void __inline
+vmx_clear_int_window_exiting(struct vmx *vmx, int vcpu)
+{
+
+#ifdef __FreeBSD__
+ KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
+ ("intr_window_exiting not set: %#x", vmx->cap[vcpu].proc_ctls));
+#else
+ KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0,
+ ("intr_window_exiting not set: %x", vmx->cap[vcpu].proc_ctls));
+#endif
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_INT_WINDOW_EXITING;
+ vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ VCPU_CTR0(vmx->vm, vcpu, "Disabling interrupt window exiting");
+}
+
+static void __inline
+vmx_set_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+
+ if ((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) == 0) {
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_NMI_WINDOW_EXITING;
+ vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ VCPU_CTR0(vmx->vm, vcpu, "Enabling NMI window exiting");
+ }
+}
+
+static void __inline
+vmx_clear_nmi_window_exiting(struct vmx *vmx, int vcpu)
+{
+
+#ifdef __FreeBSD__
+ KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
+ ("nmi_window_exiting not set %#x", vmx->cap[vcpu].proc_ctls));
+#else
+ KASSERT((vmx->cap[vcpu].proc_ctls & PROCBASED_NMI_WINDOW_EXITING) != 0,
+ ("nmi_window_exiting not set %x", vmx->cap[vcpu].proc_ctls));
+#endif
+ vmx->cap[vcpu].proc_ctls &= ~PROCBASED_NMI_WINDOW_EXITING;
+ vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ VCPU_CTR0(vmx->vm, vcpu, "Disabling NMI window exiting");
+}
+
+int
+vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset)
+{
+ int error;
+
+ if ((vmx->cap[vcpu].proc_ctls & PROCBASED_TSC_OFFSET) == 0) {
+ vmx->cap[vcpu].proc_ctls |= PROCBASED_TSC_OFFSET;
+ vmcs_write(VMCS_PRI_PROC_BASED_CTLS, vmx->cap[vcpu].proc_ctls);
+ VCPU_CTR0(vmx->vm, vcpu, "Enabling TSC offsetting");
+ }
+
+ error = vmwrite(VMCS_TSC_OFFSET, offset);
+
+ return (error);
+}
+
+#define NMI_BLOCKING (VMCS_INTERRUPTIBILITY_NMI_BLOCKING | \
+ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
+#define HWINTR_BLOCKING (VMCS_INTERRUPTIBILITY_STI_BLOCKING | \
+ VMCS_INTERRUPTIBILITY_MOVSS_BLOCKING)
+
+static void
+vmx_inject_nmi(struct vmx *vmx, int vcpu)
+{
+ uint32_t gi, info;
+
+ gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
+#ifdef __FreeBSD__
+ KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
+ "interruptibility-state %#x", gi));
+#else
+ KASSERT((gi & NMI_BLOCKING) == 0, ("vmx_inject_nmi: invalid guest "
+ "interruptibility-state %x", gi));
+#endif
+
+ info = vmcs_read(VMCS_ENTRY_INTR_INFO);
+#ifdef __FreeBSD__
+ KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
+ "VM-entry interruption information %#x", info));
+#else
+ KASSERT((info & VMCS_INTR_VALID) == 0, ("vmx_inject_nmi: invalid "
+ "VM-entry interruption information %x", info));
+#endif
+
+ /*
+ * Inject the virtual NMI. The vector must be the NMI IDT entry
+ * or the VMCS entry check will fail.
+ */
+ info = IDT_NMI | VMCS_INTR_T_NMI | VMCS_INTR_VALID;
+ vmcs_write(VMCS_ENTRY_INTR_INFO, info);
+
+ VCPU_CTR0(vmx->vm, vcpu, "Injecting vNMI");
+
+ /* Clear the request */
+ vm_nmi_clear(vmx->vm, vcpu);
+}
+
+static void
+vmx_inject_interrupts(struct vmx *vmx, int vcpu, struct vlapic *vlapic)
+{
+ int vector, need_nmi_exiting, extint_pending;
+ uint64_t rflags, entryinfo;
+ uint32_t gi, info;
+
+ if (vm_entry_intinfo(vmx->vm, vcpu, &entryinfo)) {
+#ifdef __FreeBSD__
+ KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
+ "intinfo is not valid: %#lx", __func__, entryinfo));
+#else
+ KASSERT((entryinfo & VMCS_INTR_VALID) != 0, ("%s: entry "
+ "intinfo is not valid: %lx", __func__, entryinfo));
+#endif
+
+ info = vmcs_read(VMCS_ENTRY_INTR_INFO);
+#ifdef __FreeBSD__
+ KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
+ "pending exception: %#lx/%#x", __func__, entryinfo, info));
+#else
+ KASSERT((info & VMCS_INTR_VALID) == 0, ("%s: cannot inject "
+ "pending exception: %lx/%x", __func__, entryinfo, info));
+#endif
+
+ info = entryinfo;
+ vector = info & 0xff;
+ if (vector == IDT_BP || vector == IDT_OF) {
+ /*
+ * VT-x requires #BP and #OF to be injected as software
+ * exceptions.
+ */
+ info &= ~VMCS_INTR_T_MASK;
+ info |= VMCS_INTR_T_SWEXCEPTION;
+ }
+
+ if (info & VMCS_INTR_DEL_ERRCODE)
+ vmcs_write(VMCS_ENTRY_EXCEPTION_ERROR, entryinfo >> 32);
+
+ vmcs_write(VMCS_ENTRY_INTR_INFO, info);
+ }
+
+ if (vm_nmi_pending(vmx->vm, vcpu)) {
+ /*
+ * If there are no conditions blocking NMI injection then
+ * inject it directly here otherwise enable "NMI window
+ * exiting" to inject it as soon as we can.
+ *
+ * We also check for STI_BLOCKING because some implementations
+ * don't allow NMI injection in this case. If we are running
+ * on a processor that doesn't have this restriction it will
+ * immediately exit and the NMI will be injected in the
+ * "NMI window exiting" handler.
+ */
+ need_nmi_exiting = 1;
+ gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
+ if ((gi & (HWINTR_BLOCKING | NMI_BLOCKING)) == 0) {
+ info = vmcs_read(VMCS_ENTRY_INTR_INFO);
+ if ((info & VMCS_INTR_VALID) == 0) {
+ vmx_inject_nmi(vmx, vcpu);
+ need_nmi_exiting = 0;
+ } else {
+ VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI "
+ "due to VM-entry intr info %#x", info);
+ }
+ } else {
+ VCPU_CTR1(vmx->vm, vcpu, "Cannot inject NMI due to "
+ "Guest Interruptibility-state %#x", gi);
+ }
+
+ if (need_nmi_exiting)
+ vmx_set_nmi_window_exiting(vmx, vcpu);
+ }
+
+ extint_pending = vm_extint_pending(vmx->vm, vcpu);
+
+#ifdef __FreeBSD__
+ if (!extint_pending && virtual_interrupt_delivery) {
+ vmx_inject_pir(vlapic);
+ return;
+ }
+#endif
+
+ /*
+ * If interrupt-window exiting is already in effect then don't bother
+ * checking for pending interrupts. This is just an optimization and
+ * not needed for correctness.
+ */
+ if ((vmx->cap[vcpu].proc_ctls & PROCBASED_INT_WINDOW_EXITING) != 0) {
+ VCPU_CTR0(vmx->vm, vcpu, "Skip interrupt injection due to "
+ "pending int_window_exiting");
+ return;
+ }
+
+ if (!extint_pending) {
+ /* Ask the local apic for a vector to inject */
+ if (!vlapic_pending_intr(vlapic, &vector))
+ return;
+
+ /*
+ * From the Intel SDM, Volume 3, Section "Maskable
+ * Hardware Interrupts":
+ * - maskable interrupt vectors [16,255] can be delivered
+ * through the local APIC.
+ */
+ KASSERT(vector >= 16 && vector <= 255,
+ ("invalid vector %d from local APIC", vector));
+ } else {
+ /* Ask the legacy pic for a vector to inject */
+ vatpic_pending_intr(vmx->vm, &vector);
+
+ /*
+ * From the Intel SDM, Volume 3, Section "Maskable
+ * Hardware Interrupts":
+ * - maskable interrupt vectors [0,255] can be delivered
+ * through the INTR pin.
+ */
+ KASSERT(vector >= 0 && vector <= 255,
+ ("invalid vector %d from INTR", vector));
+ }
+
+ /* Check RFLAGS.IF and the interruptibility state of the guest */
+ rflags = vmcs_read(VMCS_GUEST_RFLAGS);
+ if ((rflags & PSL_I) == 0) {
+ VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
+ "rflags %#lx", vector, rflags);
+ goto cantinject;
+ }
+
+ gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
+ if (gi & HWINTR_BLOCKING) {
+ VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
+ "Guest Interruptibility-state %#x", vector, gi);
+ goto cantinject;
+ }
+
+ info = vmcs_read(VMCS_ENTRY_INTR_INFO);
+ if (info & VMCS_INTR_VALID) {
+ /*
+ * This is expected and could happen for multiple reasons:
+ * - A vectoring VM-entry was aborted due to astpending
+ * - A VM-exit happened during event injection.
+ * - An exception was injected above.
+ * - An NMI was injected above or after "NMI window exiting"
+ */
+ VCPU_CTR2(vmx->vm, vcpu, "Cannot inject vector %d due to "
+ "VM-entry intr info %#x", vector, info);
+ goto cantinject;
+ }
+
+ /* Inject the interrupt */
+ info = VMCS_INTR_T_HWINTR | VMCS_INTR_VALID;
+ info |= vector;
+ vmcs_write(VMCS_ENTRY_INTR_INFO, info);
+
+ if (!extint_pending) {
+ /* Update the Local APIC ISR */
+ vlapic_intr_accepted(vlapic, vector);
+ } else {
+ vm_extint_clear(vmx->vm, vcpu);
+ vatpic_intr_accepted(vmx->vm, vector);
+
+ /*
+ * After we accepted the current ExtINT the PIC may
+ * have posted another one. If that is the case, set
+ * the Interrupt Window Exiting execution control so
+ * we can inject that one too.
+ *
+ * Also, interrupt window exiting allows us to inject any
+ * pending APIC vector that was preempted by the ExtINT
+ * as soon as possible. This applies both for the software
+ * emulated vlapic and the hardware assisted virtual APIC.
+ */
+ vmx_set_int_window_exiting(vmx, vcpu);
+ }
+
+ VCPU_CTR1(vmx->vm, vcpu, "Injecting hwintr at vector %d", vector);
+
+ return;
+
+cantinject:
+ /*
+ * Set the Interrupt Window Exiting execution control so we can inject
+ * the interrupt as soon as blocking condition goes away.
+ */
+ vmx_set_int_window_exiting(vmx, vcpu);
+}
+
+/*
+ * If the Virtual NMIs execution control is '1' then the logical processor
+ * tracks virtual-NMI blocking in the Guest Interruptibility-state field of
+ * the VMCS. An IRET instruction in VMX non-root operation will remove any
+ * virtual-NMI blocking.
+ *
+ * This unblocking occurs even if the IRET causes a fault. In this case the
+ * hypervisor needs to restore virtual-NMI blocking before resuming the guest.
+ */
+static void
+vmx_restore_nmi_blocking(struct vmx *vmx, int vcpuid)
+{
+ uint32_t gi;
+
+ VCPU_CTR0(vmx->vm, vcpuid, "Restore Virtual-NMI blocking");
+ gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
+ gi |= VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
+ vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
+}
+
+static void
+vmx_clear_nmi_blocking(struct vmx *vmx, int vcpuid)
+{
+ uint32_t gi;
+
+ VCPU_CTR0(vmx->vm, vcpuid, "Clear Virtual-NMI blocking");
+ gi = vmcs_read(VMCS_GUEST_INTERRUPTIBILITY);
+ gi &= ~VMCS_INTERRUPTIBILITY_NMI_BLOCKING;
+ vmcs_write(VMCS_GUEST_INTERRUPTIBILITY, gi);
+}
+
+static uint64_t
+vmx_get_guest_reg(struct vmx *vmx, int vcpu, int ident)
+{
+ const struct vmxctx *vmxctx;
+
+ vmxctx = &vmx->ctx[vcpu];
+
+ switch (ident) {
+ case 0:
+ return (vmxctx->guest_rax);
+ case 1:
+ return (vmxctx->guest_rcx);
+ case 2:
+ return (vmxctx->guest_rdx);
+ case 3:
+ return (vmxctx->guest_rbx);
+ case 4:
+ return (vmcs_read(VMCS_GUEST_RSP));
+ case 5:
+ return (vmxctx->guest_rbp);
+ case 6:
+ return (vmxctx->guest_rsi);
+ case 7:
+ return (vmxctx->guest_rdi);
+ case 8:
+ return (vmxctx->guest_r8);
+ case 9:
+ return (vmxctx->guest_r9);
+ case 10:
+ return (vmxctx->guest_r10);
+ case 11:
+ return (vmxctx->guest_r11);
+ case 12:
+ return (vmxctx->guest_r12);
+ case 13:
+ return (vmxctx->guest_r13);
+ case 14:
+ return (vmxctx->guest_r14);
+ case 15:
+ return (vmxctx->guest_r15);
+ default:
+ panic("invalid vmx register %d", ident);
+ }
+}
+
+static void
+vmx_set_guest_reg(struct vmx *vmx, int vcpu, int ident, uint64_t regval)
+{
+ struct vmxctx *vmxctx;
+
+ vmxctx = &vmx->ctx[vcpu];
+
+ switch (ident) {
+ case 0:
+ vmxctx->guest_rax = regval;
+ break;
+ case 1:
+ vmxctx->guest_rcx = regval;
+ break;
+ case 2:
+ vmxctx->guest_rdx = regval;
+ break;
+ case 3:
+ vmxctx->guest_rbx = regval;
+ break;
+ case 4:
+ vmcs_write(VMCS_GUEST_RSP, regval);
+ break;
+ case 5:
+ vmxctx->guest_rbp = regval;
+ break;
+ case 6:
+ vmxctx->guest_rsi = regval;
+ break;
+ case 7:
+ vmxctx->guest_rdi = regval;
+ break;
+ case 8:
+ vmxctx->guest_r8 = regval;
+ break;
+ case 9:
+ vmxctx->guest_r9 = regval;
+ break;
+ case 10:
+ vmxctx->guest_r10 = regval;
+ break;
+ case 11:
+ vmxctx->guest_r11 = regval;
+ break;
+ case 12:
+ vmxctx->guest_r12 = regval;
+ break;
+ case 13:
+ vmxctx->guest_r13 = regval;
+ break;
+ case 14:
+ vmxctx->guest_r14 = regval;
+ break;
+ case 15:
+ vmxctx->guest_r15 = regval;
+ break;
+ default:
+ panic("invalid vmx register %d", ident);
+ }
+}
+
+static int
+vmx_emulate_cr0_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+{
+ uint64_t crval, regval;
+
+ /* We only handle mov to %cr0 at this time */
+ if ((exitqual & 0xf0) != 0x00)
+ return (UNHANDLED);
+
+ regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
+
+ vmcs_write(VMCS_CR0_SHADOW, regval);
+
+ crval = regval | cr0_ones_mask;
+ crval &= ~cr0_zeros_mask;
+ vmcs_write(VMCS_GUEST_CR0, crval);
+
+ if (regval & CR0_PG) {
+ uint64_t efer, entry_ctls;
+
+ /*
+ * If CR0.PG is 1 and EFER.LME is 1 then EFER.LMA and
+ * the "IA-32e mode guest" bit in VM-entry control must be
+ * equal.
+ */
+ efer = vmcs_read(VMCS_GUEST_IA32_EFER);
+ if (efer & EFER_LME) {
+ efer |= EFER_LMA;
+ vmcs_write(VMCS_GUEST_IA32_EFER, efer);
+ entry_ctls = vmcs_read(VMCS_ENTRY_CTLS);
+ entry_ctls |= VM_ENTRY_GUEST_LMA;
+ vmcs_write(VMCS_ENTRY_CTLS, entry_ctls);
+ }
+ }
+
+ return (HANDLED);
+}
+
+static int
+vmx_emulate_cr4_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+{
+ uint64_t crval, regval;
+
+ /* We only handle mov to %cr4 at this time */
+ if ((exitqual & 0xf0) != 0x00)
+ return (UNHANDLED);
+
+ regval = vmx_get_guest_reg(vmx, vcpu, (exitqual >> 8) & 0xf);
+
+ vmcs_write(VMCS_CR4_SHADOW, regval);
+
+ crval = regval | cr4_ones_mask;
+ crval &= ~cr4_zeros_mask;
+ vmcs_write(VMCS_GUEST_CR4, crval);
+
+ return (HANDLED);
+}
+
+static int
+vmx_emulate_cr8_access(struct vmx *vmx, int vcpu, uint64_t exitqual)
+{
+ struct vlapic *vlapic;
+ uint64_t cr8;
+ int regnum;
+
+ /* We only handle mov %cr8 to/from a register at this time. */
+ if ((exitqual & 0xe0) != 0x00) {
+ return (UNHANDLED);
+ }
+
+ vlapic = vm_lapic(vmx->vm, vcpu);
+ regnum = (exitqual >> 8) & 0xf;
+ if (exitqual & 0x10) {
+ cr8 = vlapic_get_cr8(vlapic);
+ vmx_set_guest_reg(vmx, vcpu, regnum, cr8);
+ } else {
+ cr8 = vmx_get_guest_reg(vmx, vcpu, regnum);
+ vlapic_set_cr8(vlapic, cr8);
+ }
+
+ return (HANDLED);
+}
+
+/*
+ * From section "Guest Register State" in the Intel SDM: CPL = SS.DPL
+ */
+static int
+vmx_cpl(void)
+{
+ uint32_t ssar;
+
+ ssar = vmcs_read(VMCS_GUEST_SS_ACCESS_RIGHTS);
+ return ((ssar >> 5) & 0x3);
+}
+
+static enum vm_cpu_mode
+vmx_cpu_mode(void)
+{
+ uint32_t csar;
+
+ if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LMA) {
+ csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
+ if (csar & 0x2000)
+ return (CPU_MODE_64BIT); /* CS.L = 1 */
+ else
+ return (CPU_MODE_COMPATIBILITY);
+ } else if (vmcs_read(VMCS_GUEST_CR0) & CR0_PE) {
+ return (CPU_MODE_PROTECTED);
+ } else {
+ return (CPU_MODE_REAL);
+ }
+}
+
+static enum vm_paging_mode
+vmx_paging_mode(void)
+{
+
+ if (!(vmcs_read(VMCS_GUEST_CR0) & CR0_PG))
+ return (PAGING_MODE_FLAT);
+ if (!(vmcs_read(VMCS_GUEST_CR4) & CR4_PAE))
+ return (PAGING_MODE_32);
+ if (vmcs_read(VMCS_GUEST_IA32_EFER) & EFER_LME)
+ return (PAGING_MODE_64);
+ else
+ return (PAGING_MODE_PAE);
+}
+
+static uint64_t
+inout_str_index(struct vmx *vmx, int vcpuid, int in)
+{
+ uint64_t val;
+ int error;
+ enum vm_reg_name reg;
+
+ reg = in ? VM_REG_GUEST_RDI : VM_REG_GUEST_RSI;
+ error = vmx_getreg(vmx, vcpuid, reg, &val);
+ KASSERT(error == 0, ("%s: vmx_getreg error %d", __func__, error));
+ return (val);
+}
+
+static uint64_t
+inout_str_count(struct vmx *vmx, int vcpuid, int rep)
+{
+ uint64_t val;
+ int error;
+
+ if (rep) {
+ error = vmx_getreg(vmx, vcpuid, VM_REG_GUEST_RCX, &val);
+ KASSERT(!error, ("%s: vmx_getreg error %d", __func__, error));
+ } else {
+ val = 1;
+ }
+ return (val);
+}
+
+static int
+inout_str_addrsize(uint32_t inst_info)
+{
+ uint32_t size;
+
+ size = (inst_info >> 7) & 0x7;
+ switch (size) {
+ case 0:
+ return (2); /* 16 bit */
+ case 1:
+ return (4); /* 32 bit */
+ case 2:
+ return (8); /* 64 bit */
+ default:
+ panic("%s: invalid size encoding %d", __func__, size);
+ }
+}
+
+static void
+inout_str_seginfo(struct vmx *vmx, int vcpuid, uint32_t inst_info, int in,
+ struct vm_inout_str *vis)
+{
+ int error, s;
+
+ if (in) {
+ vis->seg_name = VM_REG_GUEST_ES;
+ } else {
+ s = (inst_info >> 15) & 0x7;
+ vis->seg_name = vm_segment_name(s);
+ }
+
+ error = vmx_getdesc(vmx, vcpuid, vis->seg_name, &vis->seg_desc);
+ KASSERT(error == 0, ("%s: vmx_getdesc error %d", __func__, error));
+}
+
+static void
+vmx_paging_info(struct vm_guest_paging *paging)
+{
+ paging->cr3 = vmcs_guest_cr3();
+ paging->cpl = vmx_cpl();
+ paging->cpu_mode = vmx_cpu_mode();
+ paging->paging_mode = vmx_paging_mode();
+}
+
+static void
+vmexit_inst_emul(struct vm_exit *vmexit, uint64_t gpa, uint64_t gla)
+{
+ struct vm_guest_paging *paging;
+ uint32_t csar;
+
+ paging = &vmexit->u.inst_emul.paging;
+
+ vmexit->exitcode = VM_EXITCODE_INST_EMUL;
+ vmexit->u.inst_emul.gpa = gpa;
+ vmexit->u.inst_emul.gla = gla;
+ vmx_paging_info(paging);
+ switch (paging->cpu_mode) {
+ case CPU_MODE_REAL:
+ vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
+ vmexit->u.inst_emul.cs_d = 0;
+ break;
+ case CPU_MODE_PROTECTED:
+ case CPU_MODE_COMPATIBILITY:
+ vmexit->u.inst_emul.cs_base = vmcs_read(VMCS_GUEST_CS_BASE);
+ csar = vmcs_read(VMCS_GUEST_CS_ACCESS_RIGHTS);
+ vmexit->u.inst_emul.cs_d = SEG_DESC_DEF32(csar);
+ break;
+ default:
+ vmexit->u.inst_emul.cs_base = 0;
+ vmexit->u.inst_emul.cs_d = 0;
+ break;
+ }
+ vie_init(&vmexit->u.inst_emul.vie, NULL, 0);
+}
+
+static int
+ept_fault_type(uint64_t ept_qual)
+{
+ int fault_type;
+
+ if (ept_qual & EPT_VIOLATION_DATA_WRITE)
+ fault_type = VM_PROT_WRITE;
+ else if (ept_qual & EPT_VIOLATION_INST_FETCH)
+ fault_type = VM_PROT_EXECUTE;
+ else
+ fault_type= VM_PROT_READ;
+
+ return (fault_type);
+}
+
+static boolean_t
+ept_emulation_fault(uint64_t ept_qual)
+{
+ int read, write;
+
+ /* EPT fault on an instruction fetch doesn't make sense here */
+ if (ept_qual & EPT_VIOLATION_INST_FETCH)
+ return (FALSE);
+
+ /* EPT fault must be a read fault or a write fault */
+ read = ept_qual & EPT_VIOLATION_DATA_READ ? 1 : 0;
+ write = ept_qual & EPT_VIOLATION_DATA_WRITE ? 1 : 0;
+ if ((read | write) == 0)
+ return (FALSE);
+
+ /*
+ * The EPT violation must have been caused by accessing a
+ * guest-physical address that is a translation of a guest-linear
+ * address.
+ */
+ if ((ept_qual & EPT_VIOLATION_GLA_VALID) == 0 ||
+ (ept_qual & EPT_VIOLATION_XLAT_VALID) == 0) {
+ return (FALSE);
+ }
+
+ return (TRUE);
+}
+
+static int
+emulate_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
+{
+ int error;
+
+ if (lapic_msr(num))
+ error = lapic_wrmsr(vmx->vm, vcpuid, num, val, retu);
+ else
+ error = vmx_wrmsr(vmx, vcpuid, num, val, retu);
+
+ return (error);
+}
+
+static int
+emulate_rdmsr(struct vmx *vmx, int vcpuid, u_int num, bool *retu)
+{
+ struct vmxctx *vmxctx;
+ uint64_t result;
+ uint32_t eax, edx;
+ int error;
+
+ if (lapic_msr(num))
+ error = lapic_rdmsr(vmx->vm, vcpuid, num, &result, retu);
+ else
+ error = vmx_rdmsr(vmx, vcpuid, num, &result, retu);
+
+ if (error == 0) {
+ eax = result;
+ vmxctx = &vmx->ctx[vcpuid];
+ error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RAX, eax);
+ KASSERT(error == 0, ("vmxctx_setreg(rax) error %d", error));
+
+ edx = result >> 32;
+ error = vmxctx_setreg(vmxctx, VM_REG_GUEST_RDX, edx);
+ KASSERT(error == 0, ("vmxctx_setreg(rdx) error %d", error));
+ }
+
+ return (error);
+}
+
+static int
+vmx_exit_process(struct vmx *vmx, int vcpu, struct vm_exit *vmexit)
+{
+ int error, handled, in;
+ struct vmcs *vmcs;
+ struct vmxctx *vmxctx;
+ struct vm_inout_str *vis;
+ uint32_t eax, ecx, edx, idtvec_info, intr_info, inst_info;
+ uint64_t qual, gla, gpa, cr3;
+ bool retu;
+
+ CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_VIRTUAL_NMI) != 0);
+ CTASSERT((PINBASED_CTLS_ONE_SETTING & PINBASED_NMI_EXITING) != 0);
+
+ handled = UNHANDLED;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ qual = vmexit->u.vmx.exit_qualification;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_COUNT, 1);
+
+ switch (vmexit->u.vmx.exit_reason) {
+ case EXIT_REASON_CR_ACCESS:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CR_ACCESS, 1);
+ switch (qual & 0xf) {
+ case 0:
+ handled = vmx_emulate_cr0_access(vmx, vcpu, qual);
+ break;
+ case 4:
+ handled = vmx_emulate_cr4_access(vmx, vcpu, qual);
+ break;
+ case 8:
+ handled = vmx_emulate_cr8_access(vmx, vcpu, qual);
+ break;
+ }
+ break;
+ case EXIT_REASON_RDMSR:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_RDMSR, 1);
+ retu = false;
+ ecx = vmxctx->guest_rcx;
+ VCPU_CTR1(vmx->vm, vcpu, "rdmsr 0x%08x", ecx);
+ error = emulate_rdmsr(vmx, vcpu, ecx, &retu);
+ if (error) {
+ vmexit->exitcode = VM_EXITCODE_RDMSR;
+ vmexit->u.msr.code = ecx;
+ } else if (!retu) {
+ handled = HANDLED;
+ } else {
+ /* Return to userspace with a valid exitcode */
+ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
+ ("emulate_rdmsr retu with bogus exitcode"));
+ }
+ break;
+ case EXIT_REASON_WRMSR:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_WRMSR, 1);
+ retu = false;
+ eax = vmxctx->guest_rax;
+ ecx = vmxctx->guest_rcx;
+ edx = vmxctx->guest_rdx;
+ VCPU_CTR2(vmx->vm, vcpu, "wrmsr 0x%08x value 0x%016lx",
+ ecx, (uint64_t)edx << 32 | eax);
+ error = emulate_wrmsr(vmx, vcpu, ecx,
+ (uint64_t)edx << 32 | eax, &retu);
+ if (error) {
+ vmexit->exitcode = VM_EXITCODE_WRMSR;
+ vmexit->u.msr.code = ecx;
+ vmexit->u.msr.wval = (uint64_t)edx << 32 | eax;
+ } else if (!retu) {
+ handled = HANDLED;
+ } else {
+ /* Return to userspace with a valid exitcode */
+ KASSERT(vmexit->exitcode != VM_EXITCODE_BOGUS,
+ ("emulate_wrmsr retu with bogus exitcode"));
+ }
+ break;
+ case EXIT_REASON_HLT:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_HLT, 1);
+ vmexit->exitcode = VM_EXITCODE_HLT;
+ vmexit->u.hlt.rflags = vmcs_read(VMCS_GUEST_RFLAGS);
+ break;
+ case EXIT_REASON_MTF:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_MTRAP, 1);
+ vmexit->exitcode = VM_EXITCODE_MTRAP;
+ break;
+ case EXIT_REASON_PAUSE:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_PAUSE, 1);
+ vmexit->exitcode = VM_EXITCODE_PAUSE;
+ break;
+ case EXIT_REASON_INTR_WINDOW:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INTR_WINDOW, 1);
+ vmx_clear_int_window_exiting(vmx, vcpu);
+ return (1);
+ case EXIT_REASON_EXT_INTR:
+ /*
+ * External interrupts serve only to cause VM exits and allow
+ * the host interrupt handler to run.
+ *
+ * If this external interrupt triggers a virtual interrupt
+ * to a VM, then that state will be recorded by the
+ * host interrupt handler in the VM's softc. We will inject
+ * this virtual interrupt during the subsequent VM enter.
+ */
+ intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
+
+ /*
+ * XXX: Ignore this exit if VMCS_INTR_VALID is not set.
+ * This appears to be a bug in VMware Fusion?
+ */
+ if (!(intr_info & VMCS_INTR_VALID))
+ return (1);
+#ifdef __FreeBSD__
+ KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
+ (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
+ ("VM exit interruption info invalid: %#x", intr_info));
+#else
+ KASSERT((intr_info & VMCS_INTR_VALID) != 0 &&
+ (intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_HWINTR,
+ ("VM exit interruption info invalid: %x", intr_info));
+#endif
+#if 0 /* XXX */
+ vmx_trigger_hostintr(intr_info & 0xff);
+#endif
+
+ /*
+ * This is special. We want to treat this as an 'handled'
+ * VM-exit but not increment the instruction pointer.
+ */
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXTINT, 1);
+ return (1);
+ case EXIT_REASON_NMI_WINDOW:
+ /* Exit to allow the pending virtual NMI to be injected */
+ if (vm_nmi_pending(vmx->vm, vcpu))
+ vmx_inject_nmi(vmx, vcpu);
+ vmx_clear_nmi_window_exiting(vmx, vcpu);
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_NMI_WINDOW, 1);
+ return (1);
+ case EXIT_REASON_INOUT:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INOUT, 1);
+ vmexit->exitcode = VM_EXITCODE_INOUT;
+ vmexit->u.inout.bytes = (qual & 0x7) + 1;
+ vmexit->u.inout.in = in = (qual & 0x8) ? 1 : 0;
+ vmexit->u.inout.string = (qual & 0x10) ? 1 : 0;
+ vmexit->u.inout.rep = (qual & 0x20) ? 1 : 0;
+ vmexit->u.inout.port = (uint16_t)(qual >> 16);
+ vmexit->u.inout.eax = (uint32_t)(vmxctx->guest_rax);
+ if (vmexit->u.inout.string) {
+ inst_info = vmcs_read(VMCS_EXIT_INSTRUCTION_INFO);
+ vmexit->exitcode = VM_EXITCODE_INOUT_STR;
+ vis = &vmexit->u.inout_str;
+ vmx_paging_info(&vis->paging);
+ vis->rflags = vmcs_read(VMCS_GUEST_RFLAGS);
+ vis->cr0 = vmcs_read(VMCS_GUEST_CR0);
+ vis->index = inout_str_index(vmx, vcpu, in);
+ vis->count = inout_str_count(vmx, vcpu, vis->inout.rep);
+ vis->addrsize = inout_str_addrsize(inst_info);
+ inout_str_seginfo(vmx, vcpu, inst_info, in, vis);
+ }
+ break;
+ case EXIT_REASON_CPUID:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_CPUID, 1);
+ handled = vmx_handle_cpuid(vmx->vm, vcpu, vmxctx);
+ break;
+ case EXIT_REASON_EXCEPTION:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_EXCEPTION, 1);
+ intr_info = vmcs_read(VMCS_EXIT_INTR_INFO);
+#ifdef __FreeBSD__
+ KASSERT((intr_info & VMCS_INTR_VALID) != 0,
+ ("VM exit interruption info invalid: %#x", intr_info));
+#else
+ KASSERT((intr_info & VMCS_INTR_VALID) != 0,
+ ("VM exit interruption info invalid: %x", intr_info));
+#endif
+
+ /*
+ * If Virtual NMIs control is 1 and the VM-exit is due to a
+ * fault encountered during the execution of IRET then we must
+ * restore the state of "virtual-NMI blocking" before resuming
+ * the guest.
+ *
+ * See "Resuming Guest Software after Handling an Exception".
+ */
+ if ((idtvec_info & VMCS_IDT_VEC_VALID) == 0 &&
+ (intr_info & 0xff) != IDT_DF &&
+ (intr_info & EXIT_QUAL_NMIUDTI) != 0)
+ vmx_restore_nmi_blocking(vmx, vcpu);
+
+ /*
+ * The NMI has already been handled in vmx_exit_handle_nmi().
+ */
+ if ((intr_info & VMCS_INTR_T_MASK) == VMCS_INTR_T_NMI)
+ return (1);
+ break;
+ case EXIT_REASON_EPT_FAULT:
+ gpa = vmcs_gpa();
+ if (ept_emulation_fault(qual)) {
+ vmexit_inst_emul(vmexit, gpa, vmcs_gla());
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_INST_EMUL, 1);
+ }
+ break;
+ default:
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_UNKNOWN, 1);
+ break;
+ }
+
+ if (handled) {
+ /*
+ * It is possible that control is returned to userland
+ * even though we were able to handle the VM exit in the
+ * kernel.
+ *
+ * In such a case we want to make sure that the userland
+ * restarts guest execution at the instruction *after*
+ * the one we just processed. Therefore we update the
+ * guest rip in the VMCS and in 'vmexit'.
+ */
+ vm_exit_update_rip(vmexit);
+ vmexit->rip += vmexit->inst_length;
+ vmexit->inst_length = 0;
+
+ /*
+ * Special case for spinning up an AP - exit to userspace to
+ * give the controlling process a chance to intercept and
+ * spin up a thread for the AP.
+ */
+ if (vmexit->exitcode == VM_EXITCODE_SPINUP_AP)
+ handled = 0;
+ } else {
+ if (vmexit->exitcode == VM_EXITCODE_BOGUS) {
+ /*
+ * If this VM exit was not claimed by anybody then
+ * treat it as a generic VMX exit.
+ */
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.status = VM_SUCCESS;
+ vmexit->u.vmx.inst_type = 0;
+ vmexit->u.vmx.inst_error = 0;
+ } else {
+ /*
+ * The exitcode and collateral have been populated.
+ * The VM exit will be processed further in userland.
+ */
+ }
+ }
+ return (handled);
+}
+
+static int
+vmx_run(void *arg, int vcpu, register_t rip)
+{
+ int error, vie, rc, handled, astpending;
+ uint32_t exit_reason;
+ struct vmx *vmx;
+ struct vm *vm;
+ struct vmxctx *vmxctx;
+ struct vmcs *vmcs;
+ struct vm_exit *vmexit;
+ struct vlapic *vlapic;
+
+ vmx = arg;
+ vm = vmx->vm;
+ vmcs = &vmx->vmcs[vcpu];
+ vmxctx = &vmx->ctx[vcpu];
+ vlapic = vm_lapic(vm, vcpu);
+ vmxctx->launched = 0;
+
+ astpending = 0;
+ vmexit = vm_exitinfo(vmx->vm, vcpu);
+
+ vmx_msr_guest_enter(vmx, vcpu);
+
+ VMPTRLD(vmcs);
+
+ /*
+ * XXX
+ * We do this every time because we may setup the virtual machine
+ * from a different process than the one that actually runs it.
+ *
+ * If the life of a virtual machine was spent entirely in the context
+ * of a single process we could do this once in vmcs_set_defaults().
+ */
+ vmcs_write(VMCS_HOST_CR3, rcr3());
+
+ vmcs_write(VMCS_GUEST_RIP, rip);
+ vmx_set_pcpu_defaults(vmx, vcpu);
+ do {
+ vmx_inject_interrupts(vmx, vcpu, vlapic);
+ vmx_run_trace(vmx, vcpu);
+ rc = vmx_setjmp(vmxctx);
+#ifdef SETJMP_TRACE
+ vmx_setjmp_trace(vmx, vcpu, vmxctx, rc);
+#endif
+ switch (rc) {
+ case VMX_RETURN_DIRECT:
+ if (vmxctx->launched == 0) {
+ vmxctx->launched = 1;
+ vmx_launch(vmxctx);
+ } else
+ vmx_resume(vmxctx);
+ panic("vmx_launch/resume should not return");
+ break;
+ case VMX_RETURN_LONGJMP:
+ break; /* vm exit */
+ case VMX_RETURN_AST:
+ astpending = 1;
+ break;
+ case VMX_RETURN_VMRESUME:
+ vie = vmcs_instruction_error();
+ if (vmxctx->launch_error == VM_FAIL_INVALID ||
+ vie != VMRESUME_WITH_NON_LAUNCHED_VMCS) {
+ printf("vmresume error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+ goto err_exit;
+ }
+ vmx_launch(vmxctx); /* try to launch the guest */
+ panic("vmx_launch should not return");
+ break;
+ case VMX_RETURN_VMLAUNCH:
+ vie = vmcs_instruction_error();
+#if 1
+ printf("vmlaunch error %d vmcs inst error %d\n",
+ vmxctx->launch_error, vie);
+#endif
+ goto err_exit;
+ default:
+ panic("vmx_setjmp returned %d", rc);
+ }
+
+ /* collect some basic information for VM exit processing */
+ vmexit->rip = rip = vmcs_guest_rip();
+ vmexit->inst_length = vmexit_instruction_length();
+ vmexit->u.vmx.exit_reason = exit_reason = vmcs_exit_reason();
+ vmexit->u.vmx.exit_qualification = vmcs_exit_qualification();
+
+ /* Update 'nextrip' */
+ vmx->state[vcpu].nextrip = rip;
+
+ /* enable interrupts */
+ enable_intr();
+
+ if (astpending) {
+ handled = 1;
+ vmexit->inst_length = 0;
+ vmexit->exitcode = VM_EXITCODE_BOGUS;
+ vmx_astpending_trace(vmx, vcpu, rip);
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_ASTPENDING, 1);
+ break;
+ }
+
+ handled = vmx_exit_process(vmx, vcpu, vmexit);
+ vmx_exit_trace(vmx, vcpu, rip, exit_reason, handled);
+
+ } while (handled);
+
+ /*
+ * If a VM exit has been handled then the exitcode must be BOGUS
+ * If a VM exit is not handled then the exitcode must not be BOGUS
+ */
+ if ((handled && vmexit->exitcode != VM_EXITCODE_BOGUS) ||
+ (!handled && vmexit->exitcode == VM_EXITCODE_BOGUS)) {
+ panic("Mismatch between handled (%d) and exitcode (%d)",
+ handled, vmexit->exitcode);
+ }
+
+ if (!handled)
+ vmm_stat_incr(vmx->vm, vcpu, VMEXIT_USERSPACE, 1);
+
+ VCPU_CTR1(vmx->vm, vcpu, "goto userland: exitcode %d",
+ vmexit->exitcode);
+
+ VMCLEAR(vmcs);
+ vmx_msr_guest_exit(vmx, vcpu);
+
+ return (0);
+
+err_exit:
+ vmexit->exitcode = VM_EXITCODE_VMX;
+ vmexit->u.vmx.exit_reason = (uint32_t)-1;
+ vmexit->u.vmx.exit_qualification = (uint32_t)-1;
+ vmexit->u.vmx.status = ~0;
+ VMCLEAR(vmcs);
+ vmx_msr_guest_exit(vmx, vcpu);
+
+ return (ENOEXEC);
+}
+
+static void
+vmx_vmcleanup(void *arg)
+{
+ int i, error;
+ struct vmx *vmx = arg;
+
+ for (i = 0; i < VM_MAXCPU; i++)
+ vpid_free(vmx->state[i].vpid);
+
+ /*
+ * XXXSMP we also need to clear the VMCS active on the other vcpus.
+ */
+ error = vmclear(&vmx->vmcs[0]);
+ if (error != 0)
+ panic("vmx_vmcleanup: vmclear error %d on vcpu 0", error);
+
+ ept_vmcleanup(vmx);
+ free(vmx, M_VMX);
+
+ return;
+}
+
+static register_t *
+vmxctx_regptr(struct vmxctx *vmxctx, int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_RAX:
+ return (&vmxctx->guest_rax);
+ case VM_REG_GUEST_RBX:
+ return (&vmxctx->guest_rbx);
+ case VM_REG_GUEST_RCX:
+ return (&vmxctx->guest_rcx);
+ case VM_REG_GUEST_RDX:
+ return (&vmxctx->guest_rdx);
+ case VM_REG_GUEST_RSI:
+ return (&vmxctx->guest_rsi);
+ case VM_REG_GUEST_RDI:
+ return (&vmxctx->guest_rdi);
+ case VM_REG_GUEST_RBP:
+ return (&vmxctx->guest_rbp);
+ case VM_REG_GUEST_R8:
+ return (&vmxctx->guest_r8);
+ case VM_REG_GUEST_R9:
+ return (&vmxctx->guest_r9);
+ case VM_REG_GUEST_R10:
+ return (&vmxctx->guest_r10);
+ case VM_REG_GUEST_R11:
+ return (&vmxctx->guest_r11);
+ case VM_REG_GUEST_R12:
+ return (&vmxctx->guest_r12);
+ case VM_REG_GUEST_R13:
+ return (&vmxctx->guest_r13);
+ case VM_REG_GUEST_R14:
+ return (&vmxctx->guest_r14);
+ case VM_REG_GUEST_R15:
+ return (&vmxctx->guest_r15);
+ case VM_REG_GUEST_CR2:
+ return (&vmxctx->guest_cr2);
+ default:
+ break;
+ }
+ return (NULL);
+}
+
+static int
+vmxctx_getreg(struct vmxctx *vmxctx, int reg, uint64_t *retval)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *retval = *regp;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmxctx_setreg(struct vmxctx *vmxctx, int reg, uint64_t val)
+{
+ register_t *regp;
+
+ if ((regp = vmxctx_regptr(vmxctx, reg)) != NULL) {
+ *regp = val;
+ return (0);
+ } else
+ return (EINVAL);
+}
+
+static int
+vmx_shadow_reg(int reg)
+{
+ int shreg;
+
+ shreg = -1;
+
+ switch (reg) {
+ case VM_REG_GUEST_CR0:
+ shreg = VMCS_CR0_SHADOW;
+ break;
+ case VM_REG_GUEST_CR4:
+ shreg = VMCS_CR4_SHADOW;
+ break;
+ default:
+ break;
+ }
+
+ return (shreg);
+}
+
+static int
+vmx_getreg(void *arg, int vcpu, int reg, uint64_t *retval)
+{
+ int running, hostcpu;
+ struct vmx *vmx = arg;
+
+ running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+ if (running && hostcpu != curcpu)
+ panic("vmx_getreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ if (vmxctx_getreg(&vmx->ctx[vcpu], reg, retval) == 0)
+ return (0);
+
+ return (vmcs_getreg(&vmx->vmcs[vcpu], running, reg, retval));
+}
+
+static int
+vmx_setreg(void *arg, int vcpu, int reg, uint64_t val)
+{
+ int error, hostcpu, running, shadow;
+ uint64_t ctls;
+ struct vmx *vmx = arg;
+
+ running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+ if (running && hostcpu != curcpu)
+ panic("vmx_setreg: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ if (vmxctx_setreg(&vmx->ctx[vcpu], reg, val) == 0)
+ return (0);
+
+ error = vmcs_setreg(&vmx->vmcs[vcpu], running, reg, val);
+
+ if (error == 0) {
+ /*
+ * If the "load EFER" VM-entry control is 1 then the
+ * value of EFER.LMA must be identical to "IA-32e mode guest"
+ * bit in the VM-entry control.
+ */
+ if ((entry_ctls & VM_ENTRY_LOAD_EFER) != 0 &&
+ (reg == VM_REG_GUEST_EFER)) {
+ vmcs_getreg(&vmx->vmcs[vcpu], running,
+ VMCS_IDENT(VMCS_ENTRY_CTLS), &ctls);
+ if (val & EFER_LMA)
+ ctls |= VM_ENTRY_GUEST_LMA;
+ else
+ ctls &= ~VM_ENTRY_GUEST_LMA;
+ vmcs_setreg(&vmx->vmcs[vcpu], running,
+ VMCS_IDENT(VMCS_ENTRY_CTLS), ctls);
+ }
+
+ shadow = vmx_shadow_reg(reg);
+ if (shadow > 0) {
+ /*
+ * Store the unmodified value in the shadow
+ */
+ error = vmcs_setreg(&vmx->vmcs[vcpu], running,
+ VMCS_IDENT(shadow), val);
+ }
+ }
+
+ return (error);
+}
+
+static int
+vmx_getdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ int hostcpu, running;
+ struct vmx *vmx = arg;
+
+ running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+ if (running && hostcpu != curcpu)
+ panic("vmx_getdesc: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ return (vmcs_getdesc(&vmx->vmcs[vcpu], running, reg, desc));
+}
+
+static int
+vmx_setdesc(void *arg, int vcpu, int reg, struct seg_desc *desc)
+{
+ int hostcpu, running;
+ struct vmx *vmx = arg;
+
+ running = vcpu_is_running(vmx->vm, vcpu, &hostcpu);
+ if (running && hostcpu != curcpu)
+ panic("vmx_setdesc: %s%d is running", vm_name(vmx->vm), vcpu);
+
+ return (vmcs_setdesc(&vmx->vmcs[vcpu], running, reg, desc));
+}
+
+static int
+vmx_getcap(void *arg, int vcpu, int type, int *retval)
+{
+ struct vmx *vmx = arg;
+ int vcap;
+ int ret;
+
+ ret = ENOENT;
+
+ vcap = vmx->cap[vcpu].set;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit)
+ ret = 0;
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit)
+ ret = 0;
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap)
+ ret = 0;
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest)
+ ret = 0;
+ break;
+ default:
+ break;
+ }
+
+ if (ret == 0)
+ *retval = (vcap & (1 << type)) ? 1 : 0;
+
+ return (ret);
+}
+
+static int
+vmx_setcap(void *arg, int vcpu, int type, int val)
+{
+ struct vmx *vmx = arg;
+ struct vmcs *vmcs = &vmx->vmcs[vcpu];
+ uint32_t baseval;
+ uint32_t *pptr;
+ int error;
+ int flag;
+ int reg;
+ int retval;
+
+ retval = ENOENT;
+ pptr = NULL;
+
+ switch (type) {
+ case VM_CAP_HALT_EXIT:
+ if (cap_halt_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_HLT_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_MTRAP_EXIT:
+ if (cap_monitor_trap) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_MTF;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_PAUSE_EXIT:
+ if (cap_pause_exit) {
+ retval = 0;
+ pptr = &vmx->cap[vcpu].proc_ctls;
+ baseval = *pptr;
+ flag = PROCBASED_PAUSE_EXITING;
+ reg = VMCS_PRI_PROC_BASED_CTLS;
+ }
+ break;
+ case VM_CAP_UNRESTRICTED_GUEST:
+ if (cap_unrestricted_guest) {
+ retval = 0;
+ baseval = procbased_ctls2;
+ flag = PROCBASED2_UNRESTRICTED_GUEST;
+ reg = VMCS_SEC_PROC_BASED_CTLS;
+ }
+ break;
+ default:
+ break;
+ }
+
+ if (retval == 0) {
+ if (val) {
+ baseval |= flag;
+ } else {
+ baseval &= ~flag;
+ }
+ VMPTRLD(vmcs);
+ error = vmwrite(reg, baseval);
+ VMCLEAR(vmcs);
+
+ if (error) {
+ retval = error;
+ } else {
+ /*
+ * Update optional stored flags, and record
+ * setting
+ */
+ if (pptr != NULL) {
+ *pptr = baseval;
+ }
+
+ if (val) {
+ vmx->cap[vcpu].set |= (1 << type);
+ } else {
+ vmx->cap[vcpu].set &= ~(1 << type);
+ }
+ }
+ }
+
+ return (retval);
+}
+
+struct vlapic_vtx {
+ struct vlapic vlapic;
+ struct pir_desc *pir_desc;
+ struct vmx *vmx;
+};
+
+#define VMX_CTR_PIR(vm, vcpuid, pir_desc, notify, vector, level, msg) \
+do { \
+ VCPU_CTR2(vm, vcpuid, msg " assert %s-triggered vector %d", \
+ level ? "level" : "edge", vector); \
+ VCPU_CTR1(vm, vcpuid, msg " pir0 0x%016lx", pir_desc->pir[0]); \
+ VCPU_CTR1(vm, vcpuid, msg " pir1 0x%016lx", pir_desc->pir[1]); \
+ VCPU_CTR1(vm, vcpuid, msg " pir2 0x%016lx", pir_desc->pir[2]); \
+ VCPU_CTR1(vm, vcpuid, msg " pir3 0x%016lx", pir_desc->pir[3]); \
+ VCPU_CTR1(vm, vcpuid, msg " notify: %s", notify ? "yes" : "no");\
+} while (0)
+
+/*
+ * vlapic->ops handlers that utilize the APICv hardware assist described in
+ * Chapter 29 of the Intel SDM.
+ */
+static int
+vmx_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
+{
+ struct vlapic_vtx *vlapic_vtx;
+ struct pir_desc *pir_desc;
+ uint64_t mask;
+ int idx, notify;
+
+ vlapic_vtx = (struct vlapic_vtx *)vlapic;
+ pir_desc = vlapic_vtx->pir_desc;
+
+ /*
+ * Keep track of interrupt requests in the PIR descriptor. This is
+ * because the virtual APIC page pointed to by the VMCS cannot be
+ * modified if the vcpu is running.
+ */
+ idx = vector / 64;
+ mask = 1UL << (vector % 64);
+ atomic_set_long(&pir_desc->pir[idx], mask);
+ notify = atomic_cmpset_long(&pir_desc->pending, 0, 1);
+
+ VMX_CTR_PIR(vlapic->vm, vlapic->vcpuid, pir_desc, notify, vector,
+ level, "vmx_set_intr_ready");
+ return (notify);
+}
+
+static int
+vmx_pending_intr(struct vlapic *vlapic, int *vecptr)
+{
+ struct vlapic_vtx *vlapic_vtx;
+ struct pir_desc *pir_desc;
+ struct LAPIC *lapic;
+ uint64_t pending, pirval;
+ uint32_t ppr, vpr;
+ int i;
+
+ /*
+ * This function is only expected to be called from the 'HLT' exit
+ * handler which does not care about the vector that is pending.
+ */
+ KASSERT(vecptr == NULL, ("vmx_pending_intr: vecptr must be NULL"));
+
+ vlapic_vtx = (struct vlapic_vtx *)vlapic;
+ pir_desc = vlapic_vtx->pir_desc;
+
+ pending = atomic_load_acq_long(&pir_desc->pending);
+ if (!pending)
+ return (0); /* common case */
+
+ /*
+ * If there is an interrupt pending then it will be recognized only
+ * if its priority is greater than the processor priority.
+ *
+ * Special case: if the processor priority is zero then any pending
+ * interrupt will be recognized.
+ */
+ lapic = vlapic->apic_page;
+ ppr = lapic->ppr & 0xf0;
+ if (ppr == 0)
+ return (1);
+
+ VCPU_CTR1(vlapic->vm, vlapic->vcpuid, "HLT with non-zero PPR %d",
+ lapic->ppr);
+
+ for (i = 3; i >= 0; i--) {
+ pirval = pir_desc->pir[i];
+ if (pirval != 0) {
+ vpr = (i * 64 + flsl(pirval) - 1) & 0xf0;
+ return (vpr > ppr);
+ }
+ }
+ return (0);
+}
+
+static void
+vmx_intr_accepted(struct vlapic *vlapic, int vector)
+{
+
+ panic("vmx_intr_accepted: not expected to be called");
+}
+
+static void
+vmx_set_tmr(struct vlapic *vlapic, int vector, bool level)
+{
+ struct vlapic_vtx *vlapic_vtx;
+ struct vmx *vmx;
+ struct vmcs *vmcs;
+ uint64_t mask, val;
+
+ KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
+ KASSERT(!vcpu_is_running(vlapic->vm, vlapic->vcpuid, NULL),
+ ("vmx_set_tmr: vcpu cannot be running"));
+
+ vlapic_vtx = (struct vlapic_vtx *)vlapic;
+ vmx = vlapic_vtx->vmx;
+ vmcs = &vmx->vmcs[vlapic->vcpuid];
+ mask = 1UL << (vector % 64);
+
+ VMPTRLD(vmcs);
+ val = vmcs_read(VMCS_EOI_EXIT(vector));
+ if (level)
+ val |= mask;
+ else
+ val &= ~mask;
+ vmcs_write(VMCS_EOI_EXIT(vector), val);
+ VMCLEAR(vmcs);
+}
+
+static void
+vmx_post_intr(struct vlapic *vlapic, int hostcpu)
+{
+
+ ipi_cpu(hostcpu, pirvec);
+}
+
+/*
+ * Transfer the pending interrupts in the PIR descriptor to the IRR
+ * in the virtual APIC page.
+ */
+static void
+vmx_inject_pir(struct vlapic *vlapic)
+{
+ struct vlapic_vtx *vlapic_vtx;
+ struct pir_desc *pir_desc;
+ struct LAPIC *lapic;
+ uint64_t val, pirval;
+ int rvi, pirbase = -1;
+ uint16_t intr_status_old, intr_status_new;
+
+ vlapic_vtx = (struct vlapic_vtx *)vlapic;
+ pir_desc = vlapic_vtx->pir_desc;
+ if (atomic_cmpset_long(&pir_desc->pending, 1, 0) == 0) {
+ VCPU_CTR0(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
+ "no posted interrupt pending");
+ return;
+ }
+
+ pirval = 0;
+ pirbase = -1;
+ lapic = vlapic->apic_page;
+
+ val = atomic_readandclear_long(&pir_desc->pir[0]);
+ if (val != 0) {
+ lapic->irr0 |= val;
+ lapic->irr1 |= val >> 32;
+ pirbase = 0;
+ pirval = val;
+ }
+
+ val = atomic_readandclear_long(&pir_desc->pir[1]);
+ if (val != 0) {
+ lapic->irr2 |= val;
+ lapic->irr3 |= val >> 32;
+ pirbase = 64;
+ pirval = val;
+ }
+
+ val = atomic_readandclear_long(&pir_desc->pir[2]);
+ if (val != 0) {
+ lapic->irr4 |= val;
+ lapic->irr5 |= val >> 32;
+ pirbase = 128;
+ pirval = val;
+ }
+
+ val = atomic_readandclear_long(&pir_desc->pir[3]);
+ if (val != 0) {
+ lapic->irr6 |= val;
+ lapic->irr7 |= val >> 32;
+ pirbase = 192;
+ pirval = val;
+ }
+
+ VLAPIC_CTR_IRR(vlapic, "vmx_inject_pir");
+
+ /*
+ * Update RVI so the processor can evaluate pending virtual
+ * interrupts on VM-entry.
+ *
+ * It is possible for pirval to be 0 here, even though the
+ * pending bit has been set. The scenario is:
+ * CPU-Y is sending a posted interrupt to CPU-X, which
+ * is running a guest and processing posted interrupts in h/w.
+ * CPU-X will eventually exit and the state seen in s/w is
+ * the pending bit set, but no PIR bits set.
+ *
+ * CPU-X CPU-Y
+ * (vm running) (host running)
+ * rx posted interrupt
+ * CLEAR pending bit
+ * SET PIR bit
+ * READ/CLEAR PIR bits
+ * SET pending bit
+ * (vm exit)
+ * pending bit set, PIR 0
+ */
+ if (pirval != 0) {
+ rvi = pirbase + flsl(pirval) - 1;
+ intr_status_old = vmcs_read(VMCS_GUEST_INTR_STATUS);
+ intr_status_new = (intr_status_old & 0xFF00) | rvi;
+ if (intr_status_new > intr_status_old) {
+ vmcs_write(VMCS_GUEST_INTR_STATUS, intr_status_new);
+ VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vmx_inject_pir: "
+ "guest_intr_status changed from 0x%04x to 0x%04x",
+ intr_status_old, intr_status_new);
+ }
+ }
+}
+
+static struct vlapic *
+vmx_vlapic_init(void *arg, int vcpuid)
+{
+ struct vmx *vmx;
+ struct vlapic *vlapic;
+ struct vlapic_vtx *vlapic_vtx;
+
+ vmx = arg;
+
+ vlapic = malloc(sizeof(struct vlapic_vtx), M_VLAPIC, M_WAITOK | M_ZERO);
+ vlapic->vm = vmx->vm;
+ vlapic->vcpuid = vcpuid;
+ vlapic->apic_page = (struct LAPIC *)&vmx->apic_page[vcpuid];
+
+ vlapic_vtx = (struct vlapic_vtx *)vlapic;
+ vlapic_vtx->pir_desc = &vmx->pir_desc[vcpuid];
+ vlapic_vtx->vmx = vmx;
+
+ if (virtual_interrupt_delivery) {
+ vlapic->ops.set_intr_ready = vmx_set_intr_ready;
+ vlapic->ops.pending_intr = vmx_pending_intr;
+ vlapic->ops.intr_accepted = vmx_intr_accepted;
+ vlapic->ops.set_tmr = vmx_set_tmr;
+#ifdef __FreeBSD__
+ vlapic->ops.enable_x2apic_mode = vmx_enable_x2apic_mode;
+#endif
+ }
+
+ if (posted_interrupts)
+ vlapic->ops.post_intr = vmx_post_intr;
+
+ vlapic_init(vlapic);
+
+ return (vlapic);
+}
+
+static void
+vmx_vlapic_cleanup(void *arg, struct vlapic *vlapic)
+{
+
+ vlapic_cleanup(vlapic);
+ free(vlapic, M_VLAPIC);
+}
+
+struct vmm_ops vmm_ops_intel = {
+ vmx_init,
+ vmx_cleanup,
+ vmx_vminit,
+ vmx_run,
+ vmx_vmcleanup,
+ ept_vmmmap_set,
+ ept_vmmmap_get,
+ vmx_getreg,
+ vmx_setreg,
+ vmx_getdesc,
+ vmx_setdesc,
+ vmx_getcap,
+ vmx_setcap,
+ vmx_vlapic_init,
+ vmx_vlapic_cleanup,
+};
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx.h b/usr/src/uts/i86pc/io/vmm/intel/vmx.h
new file mode 100644
index 0000000000..50ca62b371
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx.h
@@ -0,0 +1,156 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/intel/vmx.h 284174 2015-06-09 00:14:47Z tychon $
+ */
+
+#ifndef _VMX_H_
+#define _VMX_H_
+
+#include "vmcs.h"
+
+#ifndef __FreeBSD__
+#define GUEST_MSR_MAX_ENTRIES 64 /* arbitrary */
+#define HOST_MSR_MAX_ENTRIES 64 /* arbitrary */
+#endif
+
+struct vmxctx {
+ register_t tmpstk[32]; /* vmx_return() stack */
+ register_t tmpstktop;
+
+ register_t guest_rdi; /* Guest state */
+ register_t guest_rsi;
+ register_t guest_rdx;
+ register_t guest_rcx;
+ register_t guest_r8;
+ register_t guest_r9;
+ register_t guest_rax;
+ register_t guest_rbx;
+ register_t guest_rbp;
+ register_t guest_r10;
+ register_t guest_r11;
+ register_t guest_r12;
+ register_t guest_r13;
+ register_t guest_r14;
+ register_t guest_r15;
+ register_t guest_cr2;
+
+ register_t host_r15; /* Host state */
+ register_t host_r14;
+ register_t host_r13;
+ register_t host_r12;
+ register_t host_rbp;
+ register_t host_rsp;
+ register_t host_rbx;
+ register_t host_rip;
+ /*
+ * XXX todo debug registers and fpu state
+ */
+
+ int launched; /* vmcs launch state */
+ int launch_error;
+};
+
+struct vmxcap {
+ int set;
+ uint32_t proc_ctls;
+ uint32_t proc_ctls2;
+};
+
+struct vmxstate {
+ uint64_t nextrip; /* next instruction to be executed by guest */
+ int lastcpu; /* host cpu that this 'vcpu' last ran on */
+ uint16_t vpid;
+};
+
+struct apic_page {
+ uint32_t reg[PAGE_SIZE / 4];
+};
+CTASSERT(sizeof(struct apic_page) == PAGE_SIZE);
+
+/* Posted Interrupt Descriptor (described in section 29.6 of the Intel SDM) */
+struct pir_desc {
+ uint64_t pir[4];
+ uint64_t pending;
+ uint64_t unused[3];
+} __aligned(64);
+CTASSERT(sizeof(struct pir_desc) == 64);
+
+/* Index into the 'guest_msrs[]' array */
+enum {
+ IDX_MSR_LSTAR,
+ IDX_MSR_CSTAR,
+ IDX_MSR_STAR,
+ IDX_MSR_SF_MASK,
+ IDX_MSR_KGSBASE,
+ GUEST_MSR_NUM /* must be the last enumeration */
+};
+
+/* virtual machine softc */
+struct vmx {
+ pml4_entry_t pml4ept[NPML4EPG];
+ struct vmcs vmcs[VM_MAXCPU]; /* one vmcs per virtual cpu */
+ struct apic_page apic_page[VM_MAXCPU]; /* one apic page per vcpu */
+ char msr_bitmap[PAGE_SIZE];
+ struct pir_desc pir_desc[VM_MAXCPU];
+#ifdef __FreeBSD__
+ uint64_t guest_msrs[VM_MAXCPU][GUEST_MSR_NUM];
+#else
+ struct msr_entry guest_msrs[VM_MAXCPU][GUEST_MSR_MAX_ENTRIES];
+ struct msr_entry host_msrs[VM_MAXCPU][HOST_MSR_MAX_ENTRIES];
+#endif
+ struct vmxctx ctx[VM_MAXCPU];
+ struct vmxcap cap[VM_MAXCPU];
+ struct vmxstate state[VM_MAXCPU];
+ struct vm *vm;
+};
+CTASSERT((offsetof(struct vmx, pml4ept) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, vmcs) & PAGE_MASK) == 0);
+CTASSERT((offsetof(struct vmx, msr_bitmap) & PAGE_MASK) == 0);
+
+#define VMX_RETURN_DIRECT 0
+#define VMX_RETURN_LONGJMP 1
+#define VMX_RETURN_VMRESUME 2
+#define VMX_RETURN_VMLAUNCH 3
+#define VMX_RETURN_AST 4
+/*
+ * vmx_setjmp() returns:
+ * - 0 when it returns directly
+ * - 1 when it returns from vmx_longjmp
+ * - 2 when it returns from vmx_resume (which would only be in the error case)
+ * - 3 when it returns from vmx_launch (which would only be in the error case)
+ * - 4 when it returns from vmx_resume or vmx_launch because of AST pending
+ */
+int vmx_setjmp(struct vmxctx *ctx);
+void vmx_longjmp(void); /* returns via vmx_setjmp */
+void vmx_launch(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+void vmx_resume(struct vmxctx *ctx) __dead2; /* may return via vmx_setjmp */
+
+u_long vmx_fix_cr0(u_long cr0);
+u_long vmx_fix_cr4(u_long cr4);
+
+int vmx_set_tsc_offset(struct vmx *vmx, int vcpu, uint64_t offset);
+
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h
new file mode 100644
index 0000000000..08b1469f19
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_controls.h
@@ -0,0 +1,96 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/intel/vmx_controls.h 260410 2014-01-07 21:04:49Z neel $
+ */
+
+#ifndef _VMX_CONTROLS_H_
+#define _VMX_CONTROLS_H_
+
+/* Pin-Based VM-Execution Controls */
+#define PINBASED_EXTINT_EXITING (1 << 0)
+#define PINBASED_NMI_EXITING (1 << 3)
+#define PINBASED_VIRTUAL_NMI (1 << 5)
+#define PINBASED_PREMPTION_TIMER (1 << 6)
+#define PINBASED_POSTED_INTERRUPT (1 << 7)
+
+/* Primary Processor-Based VM-Execution Controls */
+#define PROCBASED_INT_WINDOW_EXITING (1 << 2)
+#define PROCBASED_TSC_OFFSET (1 << 3)
+#define PROCBASED_HLT_EXITING (1 << 7)
+#define PROCBASED_INVLPG_EXITING (1 << 9)
+#define PROCBASED_MWAIT_EXITING (1 << 10)
+#define PROCBASED_RDPMC_EXITING (1 << 11)
+#define PROCBASED_RDTSC_EXITING (1 << 12)
+#define PROCBASED_CR3_LOAD_EXITING (1 << 15)
+#define PROCBASED_CR3_STORE_EXITING (1 << 16)
+#define PROCBASED_CR8_LOAD_EXITING (1 << 19)
+#define PROCBASED_CR8_STORE_EXITING (1 << 20)
+#define PROCBASED_USE_TPR_SHADOW (1 << 21)
+#define PROCBASED_NMI_WINDOW_EXITING (1 << 22)
+#define PROCBASED_MOV_DR_EXITING (1 << 23)
+#define PROCBASED_IO_EXITING (1 << 24)
+#define PROCBASED_IO_BITMAPS (1 << 25)
+#define PROCBASED_MTF (1 << 27)
+#define PROCBASED_MSR_BITMAPS (1 << 28)
+#define PROCBASED_MONITOR_EXITING (1 << 29)
+#define PROCBASED_PAUSE_EXITING (1 << 30)
+#define PROCBASED_SECONDARY_CONTROLS (1U << 31)
+
+/* Secondary Processor-Based VM-Execution Controls */
+#define PROCBASED2_VIRTUALIZE_APIC_ACCESSES (1 << 0)
+#define PROCBASED2_ENABLE_EPT (1 << 1)
+#define PROCBASED2_DESC_TABLE_EXITING (1 << 2)
+#define PROCBASED2_ENABLE_RDTSCP (1 << 3)
+#define PROCBASED2_VIRTUALIZE_X2APIC_MODE (1 << 4)
+#define PROCBASED2_ENABLE_VPID (1 << 5)
+#define PROCBASED2_WBINVD_EXITING (1 << 6)
+#define PROCBASED2_UNRESTRICTED_GUEST (1 << 7)
+#define PROCBASED2_APIC_REGISTER_VIRTUALIZATION (1 << 8)
+#define PROCBASED2_VIRTUAL_INTERRUPT_DELIVERY (1 << 9)
+#define PROCBASED2_PAUSE_LOOP_EXITING (1 << 10)
+#define PROCBASED2_ENABLE_INVPCID (1 << 12)
+
+/* VM Exit Controls */
+#define VM_EXIT_SAVE_DEBUG_CONTROLS (1 << 2)
+#define VM_EXIT_HOST_LMA (1 << 9)
+#define VM_EXIT_LOAD_PERF_GLOBAL_CTRL (1 << 12)
+#define VM_EXIT_ACKNOWLEDGE_INTERRUPT (1 << 15)
+#define VM_EXIT_SAVE_PAT (1 << 18)
+#define VM_EXIT_LOAD_PAT (1 << 19)
+#define VM_EXIT_SAVE_EFER (1 << 20)
+#define VM_EXIT_LOAD_EFER (1 << 21)
+#define VM_EXIT_SAVE_PREEMPTION_TIMER (1 << 22)
+
+/* VM Entry Controls */
+#define VM_ENTRY_LOAD_DEBUG_CONTROLS (1 << 2)
+#define VM_ENTRY_GUEST_LMA (1 << 9)
+#define VM_ENTRY_INTO_SMM (1 << 10)
+#define VM_ENTRY_DEACTIVATE_DUAL_MONITOR (1 << 11)
+#define VM_ENTRY_LOAD_PERF_GLOBAL_CTRL (1 << 13)
+#define VM_ENTRY_LOAD_PAT (1 << 14)
+#define VM_ENTRY_LOAD_EFER (1 << 15)
+
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h
new file mode 100644
index 0000000000..9513f6c70b
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_cpufunc.h
@@ -0,0 +1,245 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/intel/vmx_cpufunc.h 245678 2013-01-20 03:42:49Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _VMX_CPUFUNC_H_
+#define _VMX_CPUFUNC_H_
+
+struct vmcs;
+
+/*
+ * Section 5.2 "Conventions" from Intel Architecture Manual 2B.
+ *
+ * error
+ * VMsucceed 0
+ * VMFailInvalid 1
+ * VMFailValid 2 see also VMCS VM-Instruction Error Field
+ */
+#define VM_SUCCESS 0
+#define VM_FAIL_INVALID 1
+#define VM_FAIL_VALID 2
+#define VMX_SET_ERROR_CODE \
+ " jnc 1f;" \
+ " mov $1, %[error];" /* CF: error = 1 */ \
+ " jmp 3f;" \
+ "1: jnz 2f;" \
+ " mov $2, %[error];" /* ZF: error = 2 */ \
+ " jmp 3f;" \
+ "2: mov $0, %[error];" \
+ "3:"
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmxon(char *region)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(region);
+ __asm __volatile("vmxon %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+
+ return (error);
+}
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmxon_pa(vm_paddr_t addr)
+{
+ int error;
+
+ __asm __volatile("vmxon %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+
+ return (error);
+}
+
+/* returns 0 on success and non-zero on failure */
+static __inline int
+vmclear(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmclear %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+static __inline void
+vmxoff(void)
+{
+
+ __asm __volatile("vmxoff");
+}
+
+static __inline void
+vmptrst(uint64_t *addr)
+{
+
+ __asm __volatile("vmptrst %[addr]" :: [addr]"m" (*addr) : "memory");
+}
+
+static __inline int
+vmptrld(struct vmcs *vmcs)
+{
+ int error;
+ uint64_t addr;
+
+ addr = vtophys(vmcs);
+ __asm __volatile("vmptrld %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [addr] "m" (*(uint64_t *)&addr)
+ : "memory");
+ return (error);
+}
+
+static __inline int
+vmwrite(uint64_t reg, uint64_t val)
+{
+ int error;
+
+ __asm __volatile("vmwrite %[val], %[reg];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [val] "r" (val), [reg] "r" (reg)
+ : "memory");
+
+ return (error);
+}
+
+static __inline int
+vmread(uint64_t r, uint64_t *addr)
+{
+ int error;
+
+ __asm __volatile("vmread %[r], %[addr];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [r] "r" (r), [addr] "m" (*addr)
+ : "memory");
+
+ return (error);
+}
+
+static void __inline
+VMCLEAR(struct vmcs *vmcs)
+{
+ int err;
+
+ err = vmclear(vmcs);
+ if (err != 0)
+ panic("%s: vmclear(%p) error %d", __func__, vmcs, err);
+
+ critical_exit();
+}
+
+static void __inline
+VMPTRLD(struct vmcs *vmcs)
+{
+ int err;
+
+ critical_enter();
+
+ err = vmptrld(vmcs);
+ if (err != 0)
+ panic("%s: vmptrld(%p) error %d", __func__, vmcs, err);
+}
+
+#define INVVPID_TYPE_ADDRESS 0UL
+#define INVVPID_TYPE_SINGLE_CONTEXT 1UL
+#define INVVPID_TYPE_ALL_CONTEXTS 2UL
+
+struct invvpid_desc {
+ uint16_t vpid;
+ uint16_t _res1;
+ uint32_t _res2;
+ uint64_t linear_addr;
+};
+CTASSERT(sizeof(struct invvpid_desc) == 16);
+
+static void __inline
+invvpid(uint64_t type, struct invvpid_desc desc)
+{
+ int error;
+
+ __asm __volatile("invvpid %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invvpid error %d", error);
+}
+
+#define INVEPT_TYPE_SINGLE_CONTEXT 1UL
+#define INVEPT_TYPE_ALL_CONTEXTS 2UL
+struct invept_desc {
+ uint64_t eptp;
+ uint64_t _res;
+};
+CTASSERT(sizeof(struct invept_desc) == 16);
+
+static void __inline
+invept(uint64_t type, struct invept_desc desc)
+{
+ int error;
+
+ __asm __volatile("invept %[desc], %[type];"
+ VMX_SET_ERROR_CODE
+ : [error] "=r" (error)
+ : [desc] "m" (desc), [type] "r" (type)
+ : "memory");
+
+ if (error)
+ panic("invept error %d", error);
+}
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c
new file mode 100644
index 0000000000..1ced311ca8
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.c
@@ -0,0 +1,445 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.c 284174 2015-06-09 00:14:47Z tychon $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.c 284174 2015-06-09 00:14:47Z tychon $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+
+#include <machine/clock.h>
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+#include <machine/vmm.h>
+
+#ifndef __FreeBSD__
+#include <vm/pmap.h>
+#endif
+
+#include "vmx.h"
+#include "vmx_msr.h"
+
+static boolean_t
+vmx_ctl_allows_one_setting(uint64_t msr_val, int bitpos)
+{
+
+ if (msr_val & (1UL << (bitpos + 32)))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static boolean_t
+vmx_ctl_allows_zero_setting(uint64_t msr_val, int bitpos)
+{
+
+ if ((msr_val & (1UL << bitpos)) == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+uint32_t
+vmx_revision(void)
+{
+
+ return (rdmsr(MSR_VMX_BASIC) & 0xffffffff);
+}
+
+/*
+ * Generate a bitmask to be used for the VMCS execution control fields.
+ *
+ * The caller specifies what bits should be set to one in 'ones_mask'
+ * and what bits should be set to zero in 'zeros_mask'. The don't-care
+ * bits are set to the default value. The default values are obtained
+ * based on "Algorithm 3" in Section 27.5.1 "Algorithms for Determining
+ * VMX Capabilities".
+ *
+ * Returns zero on success and non-zero on error.
+ */
+int
+vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval)
+{
+ int i;
+ uint64_t val, trueval;
+ boolean_t true_ctls_avail, one_allowed, zero_allowed;
+
+ /* We cannot ask the same bit to be set to both '1' and '0' */
+ if ((ones_mask ^ zeros_mask) != (ones_mask | zeros_mask))
+ return (EINVAL);
+
+ if (rdmsr(MSR_VMX_BASIC) & (1UL << 55))
+ true_ctls_avail = TRUE;
+ else
+ true_ctls_avail = FALSE;
+
+ val = rdmsr(ctl_reg);
+ if (true_ctls_avail)
+ trueval = rdmsr(true_ctl_reg); /* step c */
+ else
+ trueval = val; /* step a */
+
+ for (i = 0; i < 32; i++) {
+ one_allowed = vmx_ctl_allows_one_setting(trueval, i);
+ zero_allowed = vmx_ctl_allows_zero_setting(trueval, i);
+
+ KASSERT(one_allowed || zero_allowed,
+ ("invalid zero/one setting for bit %d of ctl 0x%0x, "
+ "truectl 0x%0x\n", i, ctl_reg, true_ctl_reg));
+
+ if (zero_allowed && !one_allowed) { /* b(i),c(i) */
+ if (ones_mask & (1 << i))
+ return (EINVAL);
+ *retval &= ~(1 << i);
+ } else if (one_allowed && !zero_allowed) { /* b(i),c(i) */
+ if (zeros_mask & (1 << i))
+ return (EINVAL);
+ *retval |= 1 << i;
+ } else {
+ if (zeros_mask & (1 << i)) /* b(ii),c(ii) */
+ *retval &= ~(1 << i);
+ else if (ones_mask & (1 << i)) /* b(ii), c(ii) */
+ *retval |= 1 << i;
+ else if (!true_ctls_avail)
+ *retval &= ~(1 << i); /* b(iii) */
+ else if (vmx_ctl_allows_zero_setting(val, i))/* c(iii)*/
+ *retval &= ~(1 << i);
+ else if (vmx_ctl_allows_one_setting(val, i)) /* c(iv) */
+ *retval |= 1 << i;
+ else {
+ panic("vmx_set_ctlreg: unable to determine "
+ "correct value of ctl bit %d for msr "
+ "0x%0x and true msr 0x%0x", i, ctl_reg,
+ true_ctl_reg);
+ }
+ }
+ }
+
+ return (0);
+}
+
+void
+msr_bitmap_initialize(char *bitmap)
+{
+
+ memset(bitmap, 0xff, PAGE_SIZE);
+}
+
+int
+msr_bitmap_change_access(char *bitmap, u_int msr, int access)
+{
+ int byte, bit;
+
+ if (msr <= 0x00001FFF)
+ byte = msr / 8;
+ else if (msr >= 0xC0000000 && msr <= 0xC0001FFF)
+ byte = 1024 + (msr - 0xC0000000) / 8;
+ else
+ return (EINVAL);
+
+ bit = msr & 0x7;
+
+ if (access & MSR_BITMAP_ACCESS_READ)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ byte += 2048;
+ if (access & MSR_BITMAP_ACCESS_WRITE)
+ bitmap[byte] &= ~(1 << bit);
+ else
+ bitmap[byte] |= 1 << bit;
+
+ return (0);
+}
+
+static uint64_t misc_enable;
+static uint64_t platform_info;
+static uint64_t turbo_ratio_limit;
+static uint64_t host_msrs[GUEST_MSR_NUM];
+
+static bool
+nehalem_cpu(void)
+{
+ u_int family, model;
+
+ /*
+ * The family:model numbers belonging to the Nehalem microarchitecture
+ * are documented in Section 35.5, Intel SDM dated Feb 2014.
+ */
+ family = CPUID_TO_FAMILY(cpu_id);
+ model = CPUID_TO_MODEL(cpu_id);
+ if (family == 0x6) {
+ switch (model) {
+ case 0x1A:
+ case 0x1E:
+ case 0x1F:
+ case 0x2E:
+ return (true);
+ default:
+ break;
+ }
+ }
+ return (false);
+}
+
+static bool
+westmere_cpu(void)
+{
+ u_int family, model;
+
+ /*
+ * The family:model numbers belonging to the Westmere microarchitecture
+ * are documented in Section 35.6, Intel SDM dated Feb 2014.
+ */
+ family = CPUID_TO_FAMILY(cpu_id);
+ model = CPUID_TO_MODEL(cpu_id);
+ if (family == 0x6) {
+ switch (model) {
+ case 0x25:
+ case 0x2C:
+ return (true);
+ default:
+ break;
+ }
+ }
+ return (false);
+}
+
+void
+vmx_msr_init(void)
+{
+ uint64_t bus_freq, ratio;
+ int i;
+
+#ifdef __FreeBSD__
+ /*
+ * It is safe to cache the values of the following MSRs because
+ * they don't change based on curcpu, curproc or curthread.
+ */
+ host_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
+ host_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
+ host_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
+ host_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
+#endif
+
+ /*
+ * Initialize emulated MSRs
+ */
+ misc_enable = rdmsr(MSR_IA32_MISC_ENABLE);
+ /*
+ * Set mandatory bits
+ * 11: branch trace disabled
+ * 12: PEBS unavailable
+ * Clear unsupported features
+ * 16: SpeedStep enable
+ * 18: enable MONITOR FSM
+ */
+ misc_enable |= (1 << 12) | (1 << 11);
+ misc_enable &= ~((1 << 18) | (1 << 16));
+
+ if (nehalem_cpu() || westmere_cpu())
+ bus_freq = 133330000; /* 133Mhz */
+ else
+ bus_freq = 100000000; /* 100Mhz */
+
+ /*
+ * XXXtime
+ * The ratio should really be based on the virtual TSC frequency as
+ * opposed to the host TSC.
+ */
+ ratio = (tsc_freq / bus_freq) & 0xff;
+
+ /*
+ * The register definition is based on the micro-architecture
+ * but the following bits are always the same:
+ * [15:8] Maximum Non-Turbo Ratio
+ * [28] Programmable Ratio Limit for Turbo Mode
+ * [29] Programmable TDC-TDP Limit for Turbo Mode
+ * [47:40] Maximum Efficiency Ratio
+ *
+ * The other bits can be safely set to 0 on all
+ * micro-architectures up to Haswell.
+ */
+ platform_info = (ratio << 8) | (ratio << 40);
+
+ /*
+ * The number of valid bits in the MSR_TURBO_RATIO_LIMITx register is
+ * dependent on the maximum cores per package supported by the micro-
+ * architecture. For e.g., Westmere supports 6 cores per package and
+ * uses the low 48 bits. Sandybridge support 8 cores per package and
+ * uses up all 64 bits.
+ *
+ * However, the unused bits are reserved so we pretend that all bits
+ * in this MSR are valid.
+ */
+ for (i = 0; i < 8; i++)
+ turbo_ratio_limit = (turbo_ratio_limit << 8) | ratio;
+}
+
+void
+vmx_msr_guest_init(struct vmx *vmx, int vcpuid)
+{
+ /*
+ * The permissions bitmap is shared between all vcpus so initialize it
+ * once when initializing the vBSP.
+ */
+ if (vcpuid == 0) {
+ guest_msr_rw(vmx, MSR_LSTAR);
+ guest_msr_rw(vmx, MSR_CSTAR);
+ guest_msr_rw(vmx, MSR_STAR);
+ guest_msr_rw(vmx, MSR_SF_MASK);
+ guest_msr_rw(vmx, MSR_KGSBASE);
+ }
+ return;
+}
+
+void
+vmx_msr_guest_enter(struct vmx *vmx, int vcpuid)
+{
+#ifdef __FreeBSD__
+ uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
+
+ /* Save host MSRs (if any) and restore guest MSRs */
+ wrmsr(MSR_LSTAR, guest_msrs[IDX_MSR_LSTAR]);
+ wrmsr(MSR_CSTAR, guest_msrs[IDX_MSR_CSTAR]);
+ wrmsr(MSR_STAR, guest_msrs[IDX_MSR_STAR]);
+ wrmsr(MSR_SF_MASK, guest_msrs[IDX_MSR_SF_MASK]);
+ wrmsr(MSR_KGSBASE, guest_msrs[IDX_MSR_KGSBASE]);
+#endif
+}
+
+void
+vmx_msr_guest_exit(struct vmx *vmx, int vcpuid)
+{
+#ifdef __FreeBSD__
+ uint64_t *guest_msrs = vmx->guest_msrs[vcpuid];
+
+ /* Save guest MSRs */
+ guest_msrs[IDX_MSR_LSTAR] = rdmsr(MSR_LSTAR);
+ guest_msrs[IDX_MSR_CSTAR] = rdmsr(MSR_CSTAR);
+ guest_msrs[IDX_MSR_STAR] = rdmsr(MSR_STAR);
+ guest_msrs[IDX_MSR_SF_MASK] = rdmsr(MSR_SF_MASK);
+ guest_msrs[IDX_MSR_KGSBASE] = rdmsr(MSR_KGSBASE);
+
+ /* Restore host MSRs */
+ wrmsr(MSR_LSTAR, host_msrs[IDX_MSR_LSTAR]);
+ wrmsr(MSR_CSTAR, host_msrs[IDX_MSR_CSTAR]);
+ wrmsr(MSR_STAR, host_msrs[IDX_MSR_STAR]);
+ wrmsr(MSR_SF_MASK, host_msrs[IDX_MSR_SF_MASK]);
+
+ /* MSR_KGSBASE will be restored on the way back to userspace */
+#endif
+}
+
+int
+vmx_rdmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t *val, bool *retu)
+{
+ int error = 0;
+
+ switch (num) {
+ case MSR_MCG_CAP:
+ case MSR_MCG_STATUS:
+ *val = 0;
+ break;
+ case MSR_MTRRcap:
+ case MSR_MTRRdefType:
+ case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
+ case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+ case MSR_MTRR64kBase:
+ *val = 0;
+ break;
+ case MSR_IA32_MISC_ENABLE:
+ *val = misc_enable;
+ break;
+ case MSR_PLATFORM_INFO:
+ *val = platform_info;
+ break;
+ case MSR_TURBO_RATIO_LIMIT:
+ case MSR_TURBO_RATIO_LIMIT1:
+ *val = turbo_ratio_limit;
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ return (error);
+}
+
+int
+vmx_wrmsr(struct vmx *vmx, int vcpuid, u_int num, uint64_t val, bool *retu)
+{
+ uint64_t changed;
+ int error;
+
+ error = 0;
+ switch (num) {
+ case MSR_MCG_CAP:
+ case MSR_MCG_STATUS:
+ break; /* ignore writes */
+ case MSR_MTRRcap:
+ vm_inject_gp(vmx->vm, vcpuid);
+ break;
+ case MSR_MTRRdefType:
+ case MSR_MTRR4kBase ... MSR_MTRR4kBase + 8:
+ case MSR_MTRR16kBase ... MSR_MTRR16kBase + 1:
+ case MSR_MTRR64kBase:
+ break; /* Ignore writes */
+ case MSR_IA32_MISC_ENABLE:
+ changed = val ^ misc_enable;
+ /*
+ * If the host has disabled the NX feature then the guest
+ * also cannot use it. However, a Linux guest will try to
+ * enable the NX feature by writing to the MISC_ENABLE MSR.
+ *
+ * This can be safely ignored because the memory management
+ * code looks at CPUID.80000001H:EDX.NX to check if the
+ * functionality is actually enabled.
+ */
+ changed &= ~(1UL << 34);
+
+ /*
+ * Punt to userspace if any other bits are being modified.
+ */
+ if (changed)
+ error = EINVAL;
+
+ break;
+ case MSR_TSC:
+ error = vmx_set_tsc_offset(vmx, vcpuid, val - rdtsc());
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h
new file mode 100644
index 0000000000..5300d14d9b
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_msr.h
@@ -0,0 +1,70 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/intel/vmx_msr.h 271888 2014-09-20 02:35:21Z neel $
+ */
+
+#ifndef _VMX_MSR_H_
+#define _VMX_MSR_H_
+
+struct vmx;
+
+void vmx_msr_init(void);
+void vmx_msr_guest_init(struct vmx *vmx, int vcpuid);
+void vmx_msr_guest_enter(struct vmx *vmx, int vcpuid);
+void vmx_msr_guest_exit(struct vmx *vmx, int vcpuid);
+int vmx_rdmsr(struct vmx *, int vcpuid, u_int num, uint64_t *val, bool *retu);
+int vmx_wrmsr(struct vmx *, int vcpuid, u_int num, uint64_t val, bool *retu);
+
+uint32_t vmx_revision(void);
+
+int vmx_set_ctlreg(int ctl_reg, int true_ctl_reg, uint32_t ones_mask,
+ uint32_t zeros_mask, uint32_t *retval);
+
+/*
+ * According to Section 21.10.4 "Software Access to Related Structures",
+ * changes to data structures pointed to by the VMCS must be made only when
+ * there is no logical processor with a current VMCS that points to the
+ * data structure.
+ *
+ * This pretty much limits us to configuring the MSR bitmap before VMCS
+ * initialization for SMP VMs. Unless of course we do it the hard way - which
+ * would involve some form of synchronization between the vcpus to vmclear
+ * all VMCSs' that point to the bitmap.
+ */
+#define MSR_BITMAP_ACCESS_NONE 0x0
+#define MSR_BITMAP_ACCESS_READ 0x1
+#define MSR_BITMAP_ACCESS_WRITE 0x2
+#define MSR_BITMAP_ACCESS_RW (MSR_BITMAP_ACCESS_READ|MSR_BITMAP_ACCESS_WRITE)
+void msr_bitmap_initialize(char *bitmap);
+int msr_bitmap_change_access(char *bitmap, u_int msr, int access);
+
+#define guest_msr_rw(vmx, msr) \
+ msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_RW)
+
+#define guest_msr_ro(vmx, msr) \
+ msr_bitmap_change_access((vmx)->msr_bitmap, (msr), MSR_BITMAP_ACCESS_READ)
+
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s
new file mode 100644
index 0000000000..d57dde1093
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/intel/vmx_support.s
@@ -0,0 +1,271 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/intel/vmx_support.S 245678 2013-01-20 03:42:49Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#include <machine/asmacros.h>
+
+#include "vmx_assym.s"
+
+/*
+ * Disable interrupts before updating %rsp in VMX_CHECK_AST or
+ * VMX_GUEST_RESTORE.
+ *
+ * The location that %rsp points to is a 'vmxctx' and not a
+ * real stack so we don't want an interrupt handler to trash it
+ */
+#define VMX_DISABLE_INTERRUPTS cli
+
+/*
+ * If the thread hosting the vcpu has an ast pending then take care of it
+ * by returning from vmx_setjmp() with a return value of VMX_RETURN_AST.
+ *
+ * Assumes that %rdi holds a pointer to the 'vmxctx' and that interrupts
+ * are disabled.
+ */
+#ifdef __FreeBSD__
+#define VMX_CHECK_AST \
+ movq PCPU(CURTHREAD),%rax; \
+ testl $TDF_ASTPENDING | TDF_NEEDRESCHED,TD_FLAGS(%rax); \
+ je 9f; \
+ movq $VMX_RETURN_AST,%rsi; \
+ movq %rdi,%rsp; \
+ addq $VMXCTX_TMPSTKTOP,%rsp; \
+ callq vmx_return; \
+9:
+#else
+#define VMX_CHECK_AST \
+ movq %gs:CPU_THREAD,%rax; \
+ movl T_ASTFLAG(%rax),%eax; \
+ test %al,%al; \
+ je 9f; \
+ movq $VMX_RETURN_AST,%rsi; \
+ movq %rdi,%rsp; \
+ addq $VMXCTX_TMPSTKTOP,%rsp; \
+ callq vmx_return; \
+9:
+#endif
+
+/*
+ * Assumes that %rdi holds a pointer to the 'vmxctx'.
+ *
+ * On "return" all registers are updated to reflect guest state. The two
+ * exceptions are %rip and %rsp. These registers are atomically switched
+ * by hardware from the guest area of the vmcs.
+ *
+ * We modify %rsp to point to the 'vmxctx' so we can use it to restore
+ * host context in case of an error with 'vmlaunch' or 'vmresume'.
+ */
+#define VMX_GUEST_RESTORE \
+ movq %rdi,%rsp; \
+ movq VMXCTX_GUEST_CR2(%rdi),%rsi; \
+ movq %rsi,%cr2; \
+ movq VMXCTX_GUEST_RSI(%rdi),%rsi; \
+ movq VMXCTX_GUEST_RDX(%rdi),%rdx; \
+ movq VMXCTX_GUEST_RCX(%rdi),%rcx; \
+ movq VMXCTX_GUEST_R8(%rdi),%r8; \
+ movq VMXCTX_GUEST_R9(%rdi),%r9; \
+ movq VMXCTX_GUEST_RAX(%rdi),%rax; \
+ movq VMXCTX_GUEST_RBX(%rdi),%rbx; \
+ movq VMXCTX_GUEST_RBP(%rdi),%rbp; \
+ movq VMXCTX_GUEST_R10(%rdi),%r10; \
+ movq VMXCTX_GUEST_R11(%rdi),%r11; \
+ movq VMXCTX_GUEST_R12(%rdi),%r12; \
+ movq VMXCTX_GUEST_R13(%rdi),%r13; \
+ movq VMXCTX_GUEST_R14(%rdi),%r14; \
+ movq VMXCTX_GUEST_R15(%rdi),%r15; \
+ movq VMXCTX_GUEST_RDI(%rdi),%rdi; /* restore rdi the last */
+
+#define VM_INSTRUCTION_ERROR(reg) \
+ jnc 1f; \
+ movl $VM_FAIL_INVALID,reg; /* CF is set */ \
+ jmp 3f; \
+1: jnz 2f; \
+ movl $VM_FAIL_VALID,reg; /* ZF is set */ \
+ jmp 3f; \
+2: movl $VM_SUCCESS,reg; \
+3: movl reg,VMXCTX_LAUNCH_ERROR(%rsp)
+
+ .text
+/*
+ * int vmx_setjmp(ctxp)
+ * %rdi = ctxp
+ *
+ * Return value is '0' when it returns directly from here.
+ * Return value is '1' when it returns after a vm exit through vmx_longjmp.
+ */
+ENTRY(vmx_setjmp)
+ movq (%rsp),%rax /* return address */
+ movq %r15,VMXCTX_HOST_R15(%rdi)
+ movq %r14,VMXCTX_HOST_R14(%rdi)
+ movq %r13,VMXCTX_HOST_R13(%rdi)
+ movq %r12,VMXCTX_HOST_R12(%rdi)
+ movq %rbp,VMXCTX_HOST_RBP(%rdi)
+ movq %rsp,VMXCTX_HOST_RSP(%rdi)
+ movq %rbx,VMXCTX_HOST_RBX(%rdi)
+ movq %rax,VMXCTX_HOST_RIP(%rdi)
+
+ /*
+ * XXX save host debug registers
+ */
+ movl $VMX_RETURN_DIRECT,%eax
+ ret
+END(vmx_setjmp)
+
+/*
+ * void vmx_return(struct vmxctx *ctxp, int retval)
+ * %rdi = ctxp
+ * %rsi = retval
+ * Return to vmm context through vmx_setjmp() with a value of 'retval'.
+ */
+ENTRY(vmx_return)
+ /* Restore host context. */
+ movq VMXCTX_HOST_R15(%rdi),%r15
+ movq VMXCTX_HOST_R14(%rdi),%r14
+ movq VMXCTX_HOST_R13(%rdi),%r13
+ movq VMXCTX_HOST_R12(%rdi),%r12
+ movq VMXCTX_HOST_RBP(%rdi),%rbp
+ movq VMXCTX_HOST_RSP(%rdi),%rsp
+ movq VMXCTX_HOST_RBX(%rdi),%rbx
+ movq VMXCTX_HOST_RIP(%rdi),%rax
+ movq %rax,(%rsp) /* return address */
+
+ /*
+ * XXX restore host debug registers
+ */
+ movl %esi,%eax
+ ret
+END(vmx_return)
+
+/*
+ * void vmx_longjmp(void)
+ * %rsp points to the struct vmxctx
+ */
+ENTRY(vmx_longjmp)
+ /*
+ * Save guest state that is not automatically saved in the vmcs.
+ */
+ movq %rdi,VMXCTX_GUEST_RDI(%rsp)
+ movq %rsi,VMXCTX_GUEST_RSI(%rsp)
+ movq %rdx,VMXCTX_GUEST_RDX(%rsp)
+ movq %rcx,VMXCTX_GUEST_RCX(%rsp)
+ movq %r8,VMXCTX_GUEST_R8(%rsp)
+ movq %r9,VMXCTX_GUEST_R9(%rsp)
+ movq %rax,VMXCTX_GUEST_RAX(%rsp)
+ movq %rbx,VMXCTX_GUEST_RBX(%rsp)
+ movq %rbp,VMXCTX_GUEST_RBP(%rsp)
+ movq %r10,VMXCTX_GUEST_R10(%rsp)
+ movq %r11,VMXCTX_GUEST_R11(%rsp)
+ movq %r12,VMXCTX_GUEST_R12(%rsp)
+ movq %r13,VMXCTX_GUEST_R13(%rsp)
+ movq %r14,VMXCTX_GUEST_R14(%rsp)
+ movq %r15,VMXCTX_GUEST_R15(%rsp)
+
+ movq %cr2,%rdi
+ movq %rdi,VMXCTX_GUEST_CR2(%rsp)
+
+ movq %rsp,%rdi
+ movq $VMX_RETURN_LONGJMP,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_longjmp)
+
+/*
+ * void vmx_resume(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is a 'void' this function may return indirectly
+ * through vmx_setjmp() with a return value of 2.
+ */
+ENTRY(vmx_resume)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmresume
+
+ /*
+ * Capture the reason why vmresume failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMRESUME */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMRESUME,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_resume)
+
+/*
+ * void vmx_launch(struct vmxctx *ctxp)
+ * %rdi = ctxp
+ *
+ * Although the return type is a 'void' this function may return indirectly
+ * through vmx_setjmp() with a return value of 3.
+ */
+ENTRY(vmx_launch)
+ VMX_DISABLE_INTERRUPTS
+
+ VMX_CHECK_AST
+
+ /*
+ * Restore guest state that is not automatically loaded from the vmcs.
+ */
+ VMX_GUEST_RESTORE
+
+ vmlaunch
+
+ /*
+ * Capture the reason why vmlaunch failed.
+ */
+ VM_INSTRUCTION_ERROR(%eax)
+
+ /* Return via vmx_setjmp with return value of VMX_RETURN_VMLAUNCH */
+ movq %rsp,%rdi
+ movq $VMX_RETURN_VMLAUNCH,%rsi
+
+ addq $VMXCTX_TMPSTKTOP,%rsp
+ callq vmx_return
+END(vmx_launch)
diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.c b/usr/src/uts/i86pc/io/vmm/io/vatpic.c
new file mode 100644
index 0000000000..a93b252c91
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.c
@@ -0,0 +1,809 @@
+/*-
+ * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vatpic.c 279683 2015-03-06 02:05:45Z tychon $");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/cpuset.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+
+#include <x86/apicreg.h>
+#include <dev/ic/i8259.h>
+
+#include <machine/vmm.h>
+
+#include "vmm_ktr.h"
+#include "vmm_lapic.h"
+#include "vioapic.h"
+#include "vatpic.h"
+
+static MALLOC_DEFINE(M_VATPIC, "atpic", "bhyve virtual atpic (8259)");
+
+#define VATPIC_LOCK(vatpic) mtx_lock_spin(&((vatpic)->mtx))
+#define VATPIC_UNLOCK(vatpic) mtx_unlock_spin(&((vatpic)->mtx))
+#define VATPIC_LOCKED(vatpic) mtx_owned(&((vatpic)->mtx))
+
+enum irqstate {
+ IRQSTATE_ASSERT,
+ IRQSTATE_DEASSERT,
+ IRQSTATE_PULSE
+};
+
+struct atpic {
+ bool ready;
+ int icw_num;
+ int rd_cmd_reg;
+
+ bool aeoi;
+ bool poll;
+ bool rotate;
+ bool sfn; /* special fully-nested mode */
+
+ int irq_base;
+ uint8_t request; /* Interrupt Request Register (IIR) */
+ uint8_t service; /* Interrupt Service (ISR) */
+ uint8_t mask; /* Interrupt Mask Register (IMR) */
+ uint8_t smm; /* special mask mode */
+
+ int acnt[8]; /* sum of pin asserts and deasserts */
+ int lowprio; /* lowest priority irq */
+
+ bool intr_raised;
+};
+
+struct vatpic {
+ struct vm *vm;
+ struct mtx mtx;
+ struct atpic atpic[2];
+ uint8_t elc[2];
+};
+
+#define VATPIC_CTR0(vatpic, fmt) \
+ VM_CTR0((vatpic)->vm, fmt)
+
+#define VATPIC_CTR1(vatpic, fmt, a1) \
+ VM_CTR1((vatpic)->vm, fmt, a1)
+
+#define VATPIC_CTR2(vatpic, fmt, a1, a2) \
+ VM_CTR2((vatpic)->vm, fmt, a1, a2)
+
+#define VATPIC_CTR3(vatpic, fmt, a1, a2, a3) \
+ VM_CTR3((vatpic)->vm, fmt, a1, a2, a3)
+
+#define VATPIC_CTR4(vatpic, fmt, a1, a2, a3, a4) \
+ VM_CTR4((vatpic)->vm, fmt, a1, a2, a3, a4)
+
+/*
+ * Loop over all the pins in priority order from highest to lowest.
+ */
+#define ATPIC_PIN_FOREACH(pinvar, atpic, tmpvar) \
+ for (tmpvar = 0, pinvar = (atpic->lowprio + 1) & 0x7; \
+ tmpvar < 8; \
+ tmpvar++, pinvar = (pinvar + 1) & 0x7)
+
+static void vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate);
+
+static __inline bool
+master_atpic(struct vatpic *vatpic, struct atpic *atpic)
+{
+
+ if (atpic == &vatpic->atpic[0])
+ return (true);
+ else
+ return (false);
+}
+
+static __inline int
+vatpic_get_highest_isrpin(struct atpic *atpic)
+{
+ int bit, pin;
+ int i;
+
+ ATPIC_PIN_FOREACH(pin, atpic, i) {
+ bit = (1 << pin);
+
+ if (atpic->service & bit) {
+ /*
+ * An IS bit that is masked by an IMR bit will not be
+ * cleared by a non-specific EOI in Special Mask Mode.
+ */
+ if (atpic->smm && (atpic->mask & bit) != 0)
+ continue;
+ else
+ return (pin);
+ }
+ }
+
+ return (-1);
+}
+
+static __inline int
+vatpic_get_highest_irrpin(struct atpic *atpic)
+{
+ int serviced;
+ int bit, pin, tmp;
+
+ /*
+ * In 'Special Fully-Nested Mode' when an interrupt request from
+ * a slave is in service, the slave is not locked out from the
+ * master's priority logic.
+ */
+ serviced = atpic->service;
+ if (atpic->sfn)
+ serviced &= ~(1 << 2);
+
+ /*
+ * In 'Special Mask Mode', when a mask bit is set in OCW1 it inhibits
+ * further interrupts at that level and enables interrupts from all
+ * other levels that are not masked. In other words the ISR has no
+ * bearing on the levels that can generate interrupts.
+ */
+ if (atpic->smm)
+ serviced = 0;
+
+ ATPIC_PIN_FOREACH(pin, atpic, tmp) {
+ bit = 1 << pin;
+
+ /*
+ * If there is already an interrupt in service at the same
+ * or higher priority then bail.
+ */
+ if ((serviced & bit) != 0)
+ break;
+
+ /*
+ * If an interrupt is asserted and not masked then return
+ * the corresponding 'pin' to the caller.
+ */
+ if ((atpic->request & bit) != 0 && (atpic->mask & bit) == 0)
+ return (pin);
+ }
+
+ return (-1);
+}
+
+static void
+vatpic_notify_intr(struct vatpic *vatpic)
+{
+ struct atpic *atpic;
+ int pin;
+
+ KASSERT(VATPIC_LOCKED(vatpic), ("vatpic_notify_intr not locked"));
+
+ /*
+ * First check the slave.
+ */
+ atpic = &vatpic->atpic[1];
+ if (!atpic->intr_raised &&
+ (pin = vatpic_get_highest_irrpin(atpic)) != -1) {
+ VATPIC_CTR4(vatpic, "atpic slave notify pin = %d "
+ "(imr 0x%x irr 0x%x isr 0x%x)", pin,
+ atpic->mask, atpic->request, atpic->service);
+
+ /*
+ * Cascade the request from the slave to the master.
+ */
+ atpic->intr_raised = true;
+ vatpic_set_pinstate(vatpic, 2, true);
+ vatpic_set_pinstate(vatpic, 2, false);
+ } else {
+ VATPIC_CTR3(vatpic, "atpic slave no eligible interrupts "
+ "(imr 0x%x irr 0x%x isr 0x%x)",
+ atpic->mask, atpic->request, atpic->service);
+ }
+
+ /*
+ * Then check the master.
+ */
+ atpic = &vatpic->atpic[0];
+ if (!atpic->intr_raised &&
+ (pin = vatpic_get_highest_irrpin(atpic)) != -1) {
+ VATPIC_CTR4(vatpic, "atpic master notify pin = %d "
+ "(imr 0x%x irr 0x%x isr 0x%x)", pin,
+ atpic->mask, atpic->request, atpic->service);
+
+ /*
+ * From Section 3.6.2, "Interrupt Modes", in the
+ * MPtable Specification, Version 1.4
+ *
+ * PIC interrupts are routed to both the Local APIC
+ * and the I/O APIC to support operation in 1 of 3
+ * modes.
+ *
+ * 1. Legacy PIC Mode: the PIC effectively bypasses
+ * all APIC components. In this mode the local APIC is
+ * disabled and LINT0 is reconfigured as INTR to
+ * deliver the PIC interrupt directly to the CPU.
+ *
+ * 2. Virtual Wire Mode: the APIC is treated as a
+ * virtual wire which delivers interrupts from the PIC
+ * to the CPU. In this mode LINT0 is programmed as
+ * ExtINT to indicate that the PIC is the source of
+ * the interrupt.
+ *
+ * 3. Virtual Wire Mode via I/O APIC: PIC interrupts are
+ * fielded by the I/O APIC and delivered to the appropriate
+ * CPU. In this mode the I/O APIC input 0 is programmed
+ * as ExtINT to indicate that the PIC is the source of the
+ * interrupt.
+ */
+ atpic->intr_raised = true;
+ lapic_set_local_intr(vatpic->vm, -1, APIC_LVT_LINT0);
+ vioapic_pulse_irq(vatpic->vm, 0);
+ } else {
+ VATPIC_CTR3(vatpic, "atpic master no eligible interrupts "
+ "(imr 0x%x irr 0x%x isr 0x%x)",
+ atpic->mask, atpic->request, atpic->service);
+ }
+}
+
+static int
+vatpic_icw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
+{
+ VATPIC_CTR1(vatpic, "atpic icw1 0x%x", val);
+
+ atpic->ready = false;
+
+ atpic->icw_num = 1;
+ atpic->request = 0;
+ atpic->mask = 0;
+ atpic->lowprio = 7;
+ atpic->rd_cmd_reg = 0;
+ atpic->poll = 0;
+ atpic->smm = 0;
+
+ if ((val & ICW1_SNGL) != 0) {
+ VATPIC_CTR0(vatpic, "vatpic cascade mode required");
+ return (-1);
+ }
+
+ if ((val & ICW1_IC4) == 0) {
+ VATPIC_CTR0(vatpic, "vatpic icw4 required");
+ return (-1);
+ }
+
+ atpic->icw_num++;
+
+ return (0);
+}
+
+static int
+vatpic_icw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
+{
+ VATPIC_CTR1(vatpic, "atpic icw2 0x%x", val);
+
+ atpic->irq_base = val & 0xf8;
+
+ atpic->icw_num++;
+
+ return (0);
+}
+
+static int
+vatpic_icw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
+{
+ VATPIC_CTR1(vatpic, "atpic icw3 0x%x", val);
+
+ atpic->icw_num++;
+
+ return (0);
+}
+
+static int
+vatpic_icw4(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
+{
+ VATPIC_CTR1(vatpic, "atpic icw4 0x%x", val);
+
+ if ((val & ICW4_8086) == 0) {
+ VATPIC_CTR0(vatpic, "vatpic microprocessor mode required");
+ return (-1);
+ }
+
+ if ((val & ICW4_AEOI) != 0)
+ atpic->aeoi = true;
+
+ if ((val & ICW4_SFNM) != 0) {
+ if (master_atpic(vatpic, atpic)) {
+ atpic->sfn = true;
+ } else {
+ VATPIC_CTR1(vatpic, "Ignoring special fully nested "
+ "mode on slave atpic: %#x", val);
+ }
+ }
+
+ atpic->icw_num = 0;
+ atpic->ready = true;
+
+ return (0);
+}
+
+static int
+vatpic_ocw1(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
+{
+ VATPIC_CTR1(vatpic, "atpic ocw1 0x%x", val);
+
+ atpic->mask = val & 0xff;
+
+ return (0);
+}
+
+static int
+vatpic_ocw2(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
+{
+ VATPIC_CTR1(vatpic, "atpic ocw2 0x%x", val);
+
+ atpic->rotate = ((val & OCW2_R) != 0);
+
+ if ((val & OCW2_EOI) != 0) {
+ int isr_bit;
+
+ if ((val & OCW2_SL) != 0) {
+ /* specific EOI */
+ isr_bit = val & 0x7;
+ } else {
+ /* non-specific EOI */
+ isr_bit = vatpic_get_highest_isrpin(atpic);
+ }
+
+ if (isr_bit != -1) {
+ atpic->service &= ~(1 << isr_bit);
+
+ if (atpic->rotate)
+ atpic->lowprio = isr_bit;
+ }
+ } else if ((val & OCW2_SL) != 0 && atpic->rotate == true) {
+ /* specific priority */
+ atpic->lowprio = val & 0x7;
+ }
+
+ return (0);
+}
+
+static int
+vatpic_ocw3(struct vatpic *vatpic, struct atpic *atpic, uint8_t val)
+{
+ VATPIC_CTR1(vatpic, "atpic ocw3 0x%x", val);
+
+ if (val & OCW3_ESMM) {
+ atpic->smm = val & OCW3_SMM ? 1 : 0;
+ VATPIC_CTR2(vatpic, "%s atpic special mask mode %s",
+ master_atpic(vatpic, atpic) ? "master" : "slave",
+ atpic->smm ? "enabled" : "disabled");
+ }
+
+ if (val & OCW3_RR) {
+ /* read register command */
+ atpic->rd_cmd_reg = val & OCW3_RIS;
+
+ /* Polling mode */
+ atpic->poll = ((val & OCW3_P) != 0);
+ }
+
+ return (0);
+}
+
+static void
+vatpic_set_pinstate(struct vatpic *vatpic, int pin, bool newstate)
+{
+ struct atpic *atpic;
+ int oldcnt, newcnt;
+ bool level;
+
+ KASSERT(pin >= 0 && pin < 16,
+ ("vatpic_set_pinstate: invalid pin number %d", pin));
+ KASSERT(VATPIC_LOCKED(vatpic),
+ ("vatpic_set_pinstate: vatpic is not locked"));
+
+ atpic = &vatpic->atpic[pin >> 3];
+
+ oldcnt = atpic->acnt[pin & 0x7];
+ if (newstate)
+ atpic->acnt[pin & 0x7]++;
+ else
+ atpic->acnt[pin & 0x7]--;
+ newcnt = atpic->acnt[pin & 0x7];
+
+ if (newcnt < 0) {
+ VATPIC_CTR2(vatpic, "atpic pin%d: bad acnt %d", pin, newcnt);
+ }
+
+ level = ((vatpic->elc[pin >> 3] & (1 << (pin & 0x7))) != 0);
+
+ if ((oldcnt == 0 && newcnt == 1) || (newcnt > 0 && level == true)) {
+ /* rising edge or level */
+ VATPIC_CTR1(vatpic, "atpic pin%d: asserted", pin);
+ atpic->request |= (1 << (pin & 0x7));
+ } else if (oldcnt == 1 && newcnt == 0) {
+ /* falling edge */
+ VATPIC_CTR1(vatpic, "atpic pin%d: deasserted", pin);
+ if (level)
+ atpic->request &= ~(1 << (pin & 0x7));
+ } else {
+ VATPIC_CTR3(vatpic, "atpic pin%d: %s, ignored, acnt %d",
+ pin, newstate ? "asserted" : "deasserted", newcnt);
+ }
+
+ vatpic_notify_intr(vatpic);
+}
+
+static int
+vatpic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate)
+{
+ struct vatpic *vatpic;
+ struct atpic *atpic;
+
+ if (irq < 0 || irq > 15)
+ return (EINVAL);
+
+ vatpic = vm_atpic(vm);
+ atpic = &vatpic->atpic[irq >> 3];
+
+ if (atpic->ready == false)
+ return (0);
+
+ VATPIC_LOCK(vatpic);
+ switch (irqstate) {
+ case IRQSTATE_ASSERT:
+ vatpic_set_pinstate(vatpic, irq, true);
+ break;
+ case IRQSTATE_DEASSERT:
+ vatpic_set_pinstate(vatpic, irq, false);
+ break;
+ case IRQSTATE_PULSE:
+ vatpic_set_pinstate(vatpic, irq, true);
+ vatpic_set_pinstate(vatpic, irq, false);
+ break;
+ default:
+ panic("vatpic_set_irqstate: invalid irqstate %d", irqstate);
+ }
+ VATPIC_UNLOCK(vatpic);
+
+ return (0);
+}
+
+int
+vatpic_assert_irq(struct vm *vm, int irq)
+{
+ return (vatpic_set_irqstate(vm, irq, IRQSTATE_ASSERT));
+}
+
+int
+vatpic_deassert_irq(struct vm *vm, int irq)
+{
+ return (vatpic_set_irqstate(vm, irq, IRQSTATE_DEASSERT));
+}
+
+int
+vatpic_pulse_irq(struct vm *vm, int irq)
+{
+ return (vatpic_set_irqstate(vm, irq, IRQSTATE_PULSE));
+}
+
+int
+vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger)
+{
+ struct vatpic *vatpic;
+
+ if (irq < 0 || irq > 15)
+ return (EINVAL);
+
+ /*
+ * See comment in vatpic_elc_handler. These IRQs must be
+ * edge triggered.
+ */
+ if (trigger == LEVEL_TRIGGER) {
+ switch (irq) {
+ case 0:
+ case 1:
+ case 2:
+ case 8:
+ case 13:
+ return (EINVAL);
+ }
+ }
+
+ vatpic = vm_atpic(vm);
+
+ VATPIC_LOCK(vatpic);
+
+ if (trigger == LEVEL_TRIGGER)
+ vatpic->elc[irq >> 3] |= 1 << (irq & 0x7);
+ else
+ vatpic->elc[irq >> 3] &= ~(1 << (irq & 0x7));
+
+ VATPIC_UNLOCK(vatpic);
+
+ return (0);
+}
+
+void
+vatpic_pending_intr(struct vm *vm, int *vecptr)
+{
+ struct vatpic *vatpic;
+ struct atpic *atpic;
+ int pin;
+
+ vatpic = vm_atpic(vm);
+
+ atpic = &vatpic->atpic[0];
+
+ VATPIC_LOCK(vatpic);
+
+ pin = vatpic_get_highest_irrpin(atpic);
+ if (pin == 2) {
+ atpic = &vatpic->atpic[1];
+ pin = vatpic_get_highest_irrpin(atpic);
+ }
+
+ /*
+ * If there are no pins active at this moment then return the spurious
+ * interrupt vector instead.
+ */
+ if (pin == -1)
+ pin = 7;
+
+ KASSERT(pin >= 0 && pin <= 7, ("%s: invalid pin %d", __func__, pin));
+ *vecptr = atpic->irq_base + pin;
+
+ VATPIC_UNLOCK(vatpic);
+}
+
+static void
+vatpic_pin_accepted(struct atpic *atpic, int pin)
+{
+ atpic->intr_raised = false;
+
+ if (atpic->acnt[pin] == 0)
+ atpic->request &= ~(1 << pin);
+
+ if (atpic->aeoi == true) {
+ if (atpic->rotate == true)
+ atpic->lowprio = pin;
+ } else {
+ atpic->service |= (1 << pin);
+ }
+}
+
+void
+vatpic_intr_accepted(struct vm *vm, int vector)
+{
+ struct vatpic *vatpic;
+ int pin;
+
+ vatpic = vm_atpic(vm);
+
+ VATPIC_LOCK(vatpic);
+
+ pin = vector & 0x7;
+
+ if ((vector & ~0x7) == vatpic->atpic[1].irq_base) {
+ vatpic_pin_accepted(&vatpic->atpic[1], pin);
+ /*
+ * If this vector originated from the slave,
+ * accept the cascaded interrupt too.
+ */
+ vatpic_pin_accepted(&vatpic->atpic[0], 2);
+ } else {
+ vatpic_pin_accepted(&vatpic->atpic[0], pin);
+ }
+
+ vatpic_notify_intr(vatpic);
+
+ VATPIC_UNLOCK(vatpic);
+}
+
+static int
+vatpic_read(struct vatpic *vatpic, struct atpic *atpic, bool in, int port,
+ int bytes, uint32_t *eax)
+{
+ int pin;
+
+ VATPIC_LOCK(vatpic);
+
+ if (atpic->poll) {
+ atpic->poll = 0;
+ pin = vatpic_get_highest_irrpin(atpic);
+ if (pin >= 0) {
+ vatpic_pin_accepted(atpic, pin);
+ *eax = 0x80 | pin;
+ } else {
+ *eax = 0;
+ }
+ } else {
+ if (port & ICU_IMR_OFFSET) {
+ /* read interrrupt mask register */
+ *eax = atpic->mask;
+ } else {
+ if (atpic->rd_cmd_reg == OCW3_RIS) {
+ /* read interrupt service register */
+ *eax = atpic->service;
+ } else {
+ /* read interrupt request register */
+ *eax = atpic->request;
+ }
+ }
+ }
+
+ VATPIC_UNLOCK(vatpic);
+
+ return (0);
+
+}
+
+static int
+vatpic_write(struct vatpic *vatpic, struct atpic *atpic, bool in, int port,
+ int bytes, uint32_t *eax)
+{
+ int error;
+ uint8_t val;
+
+ error = 0;
+ val = *eax;
+
+ VATPIC_LOCK(vatpic);
+
+ if (port & ICU_IMR_OFFSET) {
+ switch (atpic->icw_num) {
+ case 2:
+ error = vatpic_icw2(vatpic, atpic, val);
+ break;
+ case 3:
+ error = vatpic_icw3(vatpic, atpic, val);
+ break;
+ case 4:
+ error = vatpic_icw4(vatpic, atpic, val);
+ break;
+ default:
+ error = vatpic_ocw1(vatpic, atpic, val);
+ break;
+ }
+ } else {
+ if (val & (1 << 4))
+ error = vatpic_icw1(vatpic, atpic, val);
+
+ if (atpic->ready) {
+ if (val & (1 << 3))
+ error = vatpic_ocw3(vatpic, atpic, val);
+ else
+ error = vatpic_ocw2(vatpic, atpic, val);
+ }
+ }
+
+ if (atpic->ready)
+ vatpic_notify_intr(vatpic);
+
+ VATPIC_UNLOCK(vatpic);
+
+ return (error);
+}
+
+int
+vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *eax)
+{
+ struct vatpic *vatpic;
+ struct atpic *atpic;
+
+ vatpic = vm_atpic(vm);
+ atpic = &vatpic->atpic[0];
+
+ if (bytes != 1)
+ return (-1);
+
+ if (in) {
+ return (vatpic_read(vatpic, atpic, in, port, bytes, eax));
+ }
+
+ return (vatpic_write(vatpic, atpic, in, port, bytes, eax));
+}
+
+int
+vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *eax)
+{
+ struct vatpic *vatpic;
+ struct atpic *atpic;
+
+ vatpic = vm_atpic(vm);
+ atpic = &vatpic->atpic[1];
+
+ if (bytes != 1)
+ return (-1);
+
+ if (in) {
+ return (vatpic_read(vatpic, atpic, in, port, bytes, eax));
+ }
+
+ return (vatpic_write(vatpic, atpic, in, port, bytes, eax));
+}
+
+int
+vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *eax)
+{
+ struct vatpic *vatpic;
+ bool is_master;
+
+ vatpic = vm_atpic(vm);
+ is_master = (port == IO_ELCR1);
+
+ if (bytes != 1)
+ return (-1);
+
+ VATPIC_LOCK(vatpic);
+
+ if (in) {
+ if (is_master)
+ *eax = vatpic->elc[0];
+ else
+ *eax = vatpic->elc[1];
+ } else {
+ /*
+ * For the master PIC the cascade channel (IRQ2), the
+ * heart beat timer (IRQ0), and the keyboard
+ * controller (IRQ1) cannot be programmed for level
+ * mode.
+ *
+ * For the slave PIC the real time clock (IRQ8) and
+ * the floating point error interrupt (IRQ13) cannot
+ * be programmed for level mode.
+ */
+ if (is_master)
+ vatpic->elc[0] = (*eax & 0xf8);
+ else
+ vatpic->elc[1] = (*eax & 0xde);
+ }
+
+ VATPIC_UNLOCK(vatpic);
+
+ return (0);
+}
+
+struct vatpic *
+vatpic_init(struct vm *vm)
+{
+ struct vatpic *vatpic;
+
+ vatpic = malloc(sizeof(struct vatpic), M_VATPIC, M_WAITOK | M_ZERO);
+ vatpic->vm = vm;
+
+ mtx_init(&vatpic->mtx, "vatpic lock", NULL, MTX_SPIN);
+
+ return (vatpic);
+}
+
+void
+vatpic_cleanup(struct vatpic *vatpic)
+{
+ free(vatpic, M_VATPIC);
+}
diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpic.h b/usr/src/uts/i86pc/io/vmm/io/vatpic.h
new file mode 100644
index 0000000000..ef5e51b158
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/vatpic.h
@@ -0,0 +1,57 @@
+/*-
+ * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/io/vatpic.h 273706 2014-10-26 19:03:06Z neel $
+ */
+
+#ifndef _VATPIC_H_
+#define _VATPIC_H_
+
+#include <isa/isareg.h>
+
+#define ICU_IMR_OFFSET 1
+
+#define IO_ELCR1 0x4d0
+#define IO_ELCR2 0x4d1
+
+struct vatpic *vatpic_init(struct vm *vm);
+void vatpic_cleanup(struct vatpic *vatpic);
+
+int vatpic_master_handler(struct vm *vm, int vcpuid, bool in, int port,
+ int bytes, uint32_t *eax);
+int vatpic_slave_handler(struct vm *vm, int vcpuid, bool in, int port,
+ int bytes, uint32_t *eax);
+int vatpic_elc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *eax);
+
+int vatpic_assert_irq(struct vm *vm, int irq);
+int vatpic_deassert_irq(struct vm *vm, int irq);
+int vatpic_pulse_irq(struct vm *vm, int irq);
+int vatpic_set_irq_trigger(struct vm *vm, int irq, enum vm_intr_trigger trigger);
+
+void vatpic_pending_intr(struct vm *vm, int *vecptr);
+void vatpic_intr_accepted(struct vm *vm, int vector);
+
+#endif /* _VATPIC_H_ */
diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.c b/usr/src/uts/i86pc/io/vmm/io/vatpit.c
new file mode 100644
index 0000000000..ce17bdc92c
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.c
@@ -0,0 +1,458 @@
+/*-
+ * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vatpit.c 273706 2014-10-26 19:03:06Z neel $");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/cpuset.h>
+#include <sys/kernel.h>
+#include <sys/lock.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+
+#include <machine/vmm.h>
+
+#include "vmm_ktr.h"
+#include "vatpic.h"
+#include "vioapic.h"
+#include "vatpit.h"
+
+static MALLOC_DEFINE(M_VATPIT, "atpit", "bhyve virtual atpit (8254)");
+
+#define VATPIT_LOCK(vatpit) mtx_lock_spin(&((vatpit)->mtx))
+#define VATPIT_UNLOCK(vatpit) mtx_unlock_spin(&((vatpit)->mtx))
+#define VATPIT_LOCKED(vatpit) mtx_owned(&((vatpit)->mtx))
+
+#define TIMER_SEL_MASK 0xc0
+#define TIMER_RW_MASK 0x30
+#define TIMER_MODE_MASK 0x0f
+#define TIMER_SEL_READBACK 0xc0
+
+#define TIMER_STS_OUT 0x80
+#define TIMER_STS_NULLCNT 0x40
+
+#define TIMER_RB_LCTR 0x20
+#define TIMER_RB_LSTATUS 0x10
+#define TIMER_RB_CTR_2 0x08
+#define TIMER_RB_CTR_1 0x04
+#define TIMER_RB_CTR_0 0x02
+
+#define TMR2_OUT_STS 0x20
+
+#define PIT_8254_FREQ 1193182
+#define TIMER_DIV(freq, hz) (((freq) + (hz) / 2) / (hz))
+
+struct vatpit_callout_arg {
+ struct vatpit *vatpit;
+ int channel_num;
+};
+
+
+struct channel {
+ int mode;
+ uint16_t initial; /* initial counter value */
+ sbintime_t now_sbt; /* uptime when counter was loaded */
+ uint8_t cr[2];
+ uint8_t ol[2];
+ bool slatched; /* status latched */
+ uint8_t status;
+ int crbyte;
+ int olbyte;
+ int frbyte;
+ struct callout callout;
+ sbintime_t callout_sbt; /* target time */
+ struct vatpit_callout_arg callout_arg;
+};
+
+struct vatpit {
+ struct vm *vm;
+ struct mtx mtx;
+
+ sbintime_t freq_sbt;
+
+ struct channel channel[3];
+};
+
+static void pit_timer_start_cntr0(struct vatpit *vatpit);
+
+static int
+vatpit_get_out(struct vatpit *vatpit, int channel)
+{
+ struct channel *c;
+ sbintime_t delta_ticks;
+ int out;
+
+ c = &vatpit->channel[channel];
+
+ switch (c->mode) {
+ case TIMER_INTTC:
+ delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt;
+ out = ((c->initial - delta_ticks) <= 0);
+ break;
+ default:
+ out = 0;
+ break;
+ }
+
+ return (out);
+}
+
+static void
+vatpit_callout_handler(void *a)
+{
+ struct vatpit_callout_arg *arg = a;
+ struct vatpit *vatpit;
+ struct callout *callout;
+ struct channel *c;
+
+ vatpit = arg->vatpit;
+ c = &vatpit->channel[arg->channel_num];
+ callout = &c->callout;
+
+ VM_CTR1(vatpit->vm, "atpit t%d fired", arg->channel_num);
+
+ VATPIT_LOCK(vatpit);
+
+ if (callout_pending(callout)) /* callout was reset */
+ goto done;
+
+ if (!callout_active(callout)) /* callout was stopped */
+ goto done;
+
+ callout_deactivate(callout);
+
+ if (c->mode == TIMER_RATEGEN) {
+ pit_timer_start_cntr0(vatpit);
+ }
+
+ vatpic_pulse_irq(vatpit->vm, 0);
+ vioapic_pulse_irq(vatpit->vm, 2);
+
+done:
+ VATPIT_UNLOCK(vatpit);
+ return;
+}
+
+static void
+pit_timer_start_cntr0(struct vatpit *vatpit)
+{
+ struct channel *c;
+ sbintime_t now, delta, precision;
+
+ c = &vatpit->channel[0];
+ if (c->initial != 0) {
+ delta = c->initial * vatpit->freq_sbt;
+ precision = delta >> tc_precexp;
+ c->callout_sbt = c->callout_sbt + delta;
+
+ /*
+ * Reset 'callout_sbt' if the time that the callout
+ * was supposed to fire is more than 'c->initial'
+ * ticks in the past.
+ */
+ now = sbinuptime();
+ if (c->callout_sbt < now)
+ c->callout_sbt = now + delta;
+
+ callout_reset_sbt(&c->callout, c->callout_sbt,
+ precision, vatpit_callout_handler, &c->callout_arg,
+ C_ABSOLUTE);
+ }
+}
+
+static uint16_t
+pit_update_counter(struct vatpit *vatpit, struct channel *c, bool latch)
+{
+ uint16_t lval;
+ sbintime_t delta_ticks;
+
+ /* cannot latch a new value until the old one has been consumed */
+ if (latch && c->olbyte != 0)
+ return (0);
+
+ if (c->initial == 0) {
+ /*
+ * This is possibly an o/s bug - reading the value of
+ * the timer without having set up the initial value.
+ *
+ * The original user-space version of this code set
+ * the timer to 100hz in this condition; do the same
+ * here.
+ */
+ c->initial = TIMER_DIV(PIT_8254_FREQ, 100);
+ c->now_sbt = sbinuptime();
+ c->status &= ~TIMER_STS_NULLCNT;
+ }
+
+ delta_ticks = (sbinuptime() - c->now_sbt) / vatpit->freq_sbt;
+
+ lval = c->initial - delta_ticks % c->initial;
+
+ if (latch) {
+ c->olbyte = 2;
+ c->ol[1] = lval; /* LSB */
+ c->ol[0] = lval >> 8; /* MSB */
+ }
+
+ return (lval);
+}
+
+static int
+pit_readback1(struct vatpit *vatpit, int channel, uint8_t cmd)
+{
+ struct channel *c;
+
+ c = &vatpit->channel[channel];
+
+ /*
+ * Latch the count/status of the timer if not already latched.
+ * N.B. that the count/status latch-select bits are active-low.
+ */
+ if (!(cmd & TIMER_RB_LCTR) && !c->olbyte) {
+ (void) pit_update_counter(vatpit, c, true);
+ }
+
+ if (!(cmd & TIMER_RB_LSTATUS) && !c->slatched) {
+ c->slatched = true;
+ /*
+ * For mode 0, see if the elapsed time is greater
+ * than the initial value - this results in the
+ * output pin being set to 1 in the status byte.
+ */
+ if (c->mode == TIMER_INTTC && vatpit_get_out(vatpit, channel))
+ c->status |= TIMER_STS_OUT;
+ else
+ c->status &= ~TIMER_STS_OUT;
+ }
+
+ return (0);
+}
+
+static int
+pit_readback(struct vatpit *vatpit, uint8_t cmd)
+{
+ int error;
+
+ /*
+ * The readback command can apply to all timers.
+ */
+ error = 0;
+ if (cmd & TIMER_RB_CTR_0)
+ error = pit_readback1(vatpit, 0, cmd);
+ if (!error && cmd & TIMER_RB_CTR_1)
+ error = pit_readback1(vatpit, 1, cmd);
+ if (!error && cmd & TIMER_RB_CTR_2)
+ error = pit_readback1(vatpit, 2, cmd);
+
+ return (error);
+}
+
+
+static int
+vatpit_update_mode(struct vatpit *vatpit, uint8_t val)
+{
+ struct channel *c;
+ int sel, rw, mode;
+
+ sel = val & TIMER_SEL_MASK;
+ rw = val & TIMER_RW_MASK;
+ mode = val & TIMER_MODE_MASK;
+
+ if (sel == TIMER_SEL_READBACK)
+ return (pit_readback(vatpit, val));
+
+ if (rw != TIMER_LATCH && rw != TIMER_16BIT)
+ return (-1);
+
+ if (rw != TIMER_LATCH) {
+ /*
+ * Counter mode is not affected when issuing a
+ * latch command.
+ */
+ if (mode != TIMER_INTTC &&
+ mode != TIMER_RATEGEN &&
+ mode != TIMER_SQWAVE &&
+ mode != TIMER_SWSTROBE)
+ return (-1);
+ }
+
+ c = &vatpit->channel[sel >> 6];
+ if (rw == TIMER_LATCH)
+ pit_update_counter(vatpit, c, true);
+ else {
+ c->mode = mode;
+ c->olbyte = 0; /* reset latch after reprogramming */
+ c->status |= TIMER_STS_NULLCNT;
+ }
+
+ return (0);
+}
+
+int
+vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *eax)
+{
+ struct vatpit *vatpit;
+ struct channel *c;
+ uint8_t val;
+ int error;
+
+ vatpit = vm_atpit(vm);
+
+ if (bytes != 1)
+ return (-1);
+
+ val = *eax;
+
+ if (port == TIMER_MODE) {
+ if (in) {
+ VM_CTR0(vatpit->vm, "vatpit attempt to read mode");
+ return (-1);
+ }
+
+ VATPIT_LOCK(vatpit);
+ error = vatpit_update_mode(vatpit, val);
+ VATPIT_UNLOCK(vatpit);
+
+ return (error);
+ }
+
+ /* counter ports */
+ KASSERT(port >= TIMER_CNTR0 && port <= TIMER_CNTR2,
+ ("invalid port 0x%x", port));
+ c = &vatpit->channel[port - TIMER_CNTR0];
+
+ VATPIT_LOCK(vatpit);
+ if (in && c->slatched) {
+ /*
+ * Return the status byte if latched
+ */
+ *eax = c->status;
+ c->slatched = false;
+ c->status = 0;
+ } else if (in) {
+ /*
+ * The spec says that once the output latch is completely
+ * read it should revert to "following" the counter. Use
+ * the free running counter for this case (i.e. Linux
+ * TSC calibration). Assuming the access mode is 16-bit,
+ * toggle the MSB/LSB bit on each read.
+ */
+ if (c->olbyte == 0) {
+ uint16_t tmp;
+
+ tmp = pit_update_counter(vatpit, c, false);
+ if (c->frbyte)
+ tmp >>= 8;
+ tmp &= 0xff;
+ *eax = tmp;
+ c->frbyte ^= 1;
+ } else
+ *eax = c->ol[--c->olbyte];
+ } else {
+ c->cr[c->crbyte++] = *eax;
+ if (c->crbyte == 2) {
+ c->status &= ~TIMER_STS_NULLCNT;
+ c->frbyte = 0;
+ c->crbyte = 0;
+ c->initial = c->cr[0] | (uint16_t)c->cr[1] << 8;
+ c->now_sbt = sbinuptime();
+ /* Start an interval timer for channel 0 */
+ if (port == TIMER_CNTR0) {
+ c->callout_sbt = c->now_sbt;
+ pit_timer_start_cntr0(vatpit);
+ }
+ if (c->initial == 0)
+ c->initial = 0xffff;
+ }
+ }
+ VATPIT_UNLOCK(vatpit);
+
+ return (0);
+}
+
+int
+vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *eax)
+{
+ struct vatpit *vatpit;
+
+ vatpit = vm_atpit(vm);
+
+ if (in) {
+ VATPIT_LOCK(vatpit);
+ if (vatpit_get_out(vatpit, 2))
+ *eax = TMR2_OUT_STS;
+ else
+ *eax = 0;
+
+ VATPIT_UNLOCK(vatpit);
+ }
+
+ return (0);
+}
+
+struct vatpit *
+vatpit_init(struct vm *vm)
+{
+ struct vatpit *vatpit;
+ struct bintime bt;
+ struct vatpit_callout_arg *arg;
+ int i;
+
+ vatpit = malloc(sizeof(struct vatpit), M_VATPIT, M_WAITOK | M_ZERO);
+ vatpit->vm = vm;
+
+ mtx_init(&vatpit->mtx, "vatpit lock", NULL, MTX_SPIN);
+
+ FREQ2BT(PIT_8254_FREQ, &bt);
+ vatpit->freq_sbt = bttosbt(bt);
+
+ for (i = 0; i < 3; i++) {
+ callout_init(&vatpit->channel[i].callout, true);
+ arg = &vatpit->channel[i].callout_arg;
+ arg->vatpit = vatpit;
+ arg->channel_num = i;
+ }
+
+ return (vatpit);
+}
+
+void
+vatpit_cleanup(struct vatpit *vatpit)
+{
+ int i;
+
+ for (i = 0; i < 3; i++)
+ callout_drain(&vatpit->channel[i].callout);
+
+ free(vatpit, M_VATPIT);
+}
diff --git a/usr/src/uts/i86pc/io/vmm/io/vatpit.h b/usr/src/uts/i86pc/io/vmm/io/vatpit.h
new file mode 100644
index 0000000000..f20ad73e47
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/vatpit.h
@@ -0,0 +1,45 @@
+/*-
+ * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/io/vatpit.h 273706 2014-10-26 19:03:06Z neel $
+ */
+
+#ifndef _VATPIT_H_
+#define _VATPIT_H_
+
+#include <machine/timerreg.h>
+
+#define NMISC_PORT 0x61
+
+struct vatpit *vatpit_init(struct vm *vm);
+void vatpit_cleanup(struct vatpit *vatpit);
+
+int vatpit_handler(struct vm *vm, int vcpuid, bool in, int port, int bytes,
+ uint32_t *eax);
+int vatpit_nmisc_handler(struct vm *vm, int vcpuid, bool in, int port,
+ int bytes, uint32_t *eax);
+
+#endif /* _VATPIT_H_ */
diff --git a/usr/src/uts/i86pc/io/vmm/io/vdev.c b/usr/src/uts/i86pc/io/vmm/io/vdev.c
new file mode 100644
index 0000000000..0f835625f3
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/vdev.c
@@ -0,0 +1,282 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/io/vdev.c 245678 2013-01-20 03:42:49Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vdev.c 245678 2013-01-20 03:42:49Z neel $");
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+
+#include "vdev.h"
+
+struct vdev {
+ SLIST_ENTRY(vdev) entry;
+ struct vdev_ops *ops;
+ void *dev;
+};
+static SLIST_HEAD(, vdev) vdev_head;
+static int vdev_count;
+
+struct vdev_region {
+ SLIST_ENTRY(vdev_region) entry;
+ struct vdev_ops *ops;
+ void *dev;
+ struct io_region *io;
+};
+static SLIST_HEAD(, vdev_region) region_head;
+static int region_count;
+
+static MALLOC_DEFINE(M_VDEV, "vdev", "vdev");
+
+#define VDEV_INIT (0)
+#define VDEV_RESET (1)
+#define VDEV_HALT (2)
+
+// static const char* vdev_event_str[] = {"VDEV_INIT", "VDEV_RESET", "VDEV_HALT"};
+
+static int
+vdev_system_event(int event)
+{
+ struct vdev *vd;
+ int rc;
+
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ // printf("%s : %s Device %s\n", __func__, vdev_event_str[event], vd->ops->name);
+ switch (event) {
+ case VDEV_INIT:
+ rc = vd->ops->init(vd->dev);
+ break;
+ case VDEV_RESET:
+ rc = vd->ops->reset(vd->dev);
+ break;
+ case VDEV_HALT:
+ rc = vd->ops->halt(vd->dev);
+ break;
+ default:
+ break;
+ }
+ if (rc) {
+ printf("vdev %s init failed rc=%d\n",
+ vd->ops->name, rc);
+ return rc;
+ }
+ }
+ return 0;
+}
+
+int
+vdev_init(void)
+{
+ return vdev_system_event(VDEV_INIT);
+}
+
+int
+vdev_reset(void)
+{
+ return vdev_system_event(VDEV_RESET);
+}
+
+int
+vdev_halt(void)
+{
+ return vdev_system_event(VDEV_HALT);
+}
+
+void
+vdev_vm_init(void)
+{
+ SLIST_INIT(&vdev_head);
+ vdev_count = 0;
+
+ SLIST_INIT(&region_head);
+ region_count = 0;
+}
+void
+vdev_vm_cleanup(void)
+{
+ struct vdev *vd;
+
+ // TODO: locking
+ while (!SLIST_EMPTY(&vdev_head)) {
+ vd = SLIST_FIRST(&vdev_head);
+ SLIST_REMOVE_HEAD(&vdev_head, entry);
+ free(vd, M_VDEV);
+ vdev_count--;
+ }
+}
+
+int
+vdev_register(struct vdev_ops *ops, void *dev)
+{
+ struct vdev *vd;
+ vd = malloc(sizeof(*vd), M_VDEV, M_WAITOK | M_ZERO);
+ vd->ops = ops;
+ vd->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&vdev_head, vd, entry);
+ vdev_count++;
+ return 0;
+}
+
+void
+vdev_unregister(void *dev)
+{
+ struct vdev *vd, *found;
+
+ found = NULL;
+ // TODO: locking
+ SLIST_FOREACH(vd, &vdev_head, entry) {
+ if (vd->dev == dev) {
+ found = vd;
+ }
+ }
+
+ if (found) {
+ SLIST_REMOVE(&vdev_head, found, vdev, entry);
+ free(found, M_VDEV);
+ }
+}
+
+#define IN_RANGE(val, start, end) \
+ (((val) >= (start)) && ((val) < (end)))
+
+static struct vdev_region*
+vdev_find_region(struct io_region *io, void *dev)
+{
+ struct vdev_region *region, *found;
+ uint64_t region_base;
+ uint64_t region_end;
+
+ found = NULL;
+
+ // TODO: locking
+ // FIXME: we should verify we are in the context the current
+ // vcpu here as well.
+ SLIST_FOREACH(region, &region_head, entry) {
+ region_base = region->io->base;
+ region_end = region_base + region->io->len;
+ if (IN_RANGE(io->base, region_base, region_end) &&
+ IN_RANGE(io->base+io->len, region_base, region_end+1) &&
+ (dev && dev == region->dev)) {
+ found = region;
+ break;
+ }
+ }
+ return found;
+}
+
+int
+vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+ if (region) {
+ return -EEXIST;
+ }
+
+ region = malloc(sizeof(*region), M_VDEV, M_WAITOK | M_ZERO);
+ region->io = io;
+ region->ops = ops;
+ region->dev = dev;
+
+ // TODO: locking
+ SLIST_INSERT_HEAD(&region_head, region, entry);
+ region_count++;
+
+ return 0;
+}
+
+void
+vdev_unregister_region(void *dev, struct io_region *io)
+{
+ struct vdev_region *region;
+
+ region = vdev_find_region(io, dev);
+
+ if (region) {
+ SLIST_REMOVE(&region_head, region, vdev_region, entry);
+ free(region, M_VDEV);
+ region_count--;
+ }
+}
+
+static int
+vdev_memrw(uint64_t gpa, opsize_t size, uint64_t *data, int read)
+{
+ struct vdev_region *region;
+ struct io_region io;
+ region_attr_t attr;
+ int rc;
+
+ io.base = gpa;
+ io.len = size;
+
+ region = vdev_find_region(&io, NULL);
+ if (!region)
+ return -EINVAL;
+
+ attr = (read) ? MMIO_READ : MMIO_WRITE;
+ if (!(region->io->attr & attr))
+ return -EPERM;
+
+ if (read)
+ rc = region->ops->memread(region->dev, gpa, size, data);
+ else
+ rc = region->ops->memwrite(region->dev, gpa, size, *data);
+
+ return rc;
+}
+
+int
+vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data)
+{
+ return vdev_memrw(gpa, size, data, 1);
+}
+
+int
+vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data)
+{
+ return vdev_memrw(gpa, size, &data, 0);
+}
diff --git a/usr/src/uts/i86pc/io/vmm/io/vdev.h b/usr/src/uts/i86pc/io/vmm/io/vdev.h
new file mode 100644
index 0000000000..dd2df75ad8
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/vdev.h
@@ -0,0 +1,96 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/io/vdev.h 245678 2013-01-20 03:42:49Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _VDEV_H_
+#define _VDEV_H_
+
+typedef enum {
+ BYTE = 1,
+ WORD = 2,
+ DWORD = 4,
+ QWORD = 8,
+} opsize_t;
+
+typedef enum {
+ MMIO_READ = 1,
+ MMIO_WRITE = 2,
+} region_attr_t;
+
+struct io_region {
+ uint64_t base;
+ uint64_t len;
+ region_attr_t attr;
+ int vcpu;
+};
+
+typedef int (*vdev_init_t)(void* dev);
+typedef int (*vdev_reset_t)(void* dev);
+typedef int (*vdev_halt_t)(void* dev);
+typedef int (*vdev_memread_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t *data);
+typedef int (*vdev_memwrite_t)(void* dev, uint64_t gpa, opsize_t size, uint64_t data);
+
+
+struct vdev_ops {
+ const char *name;
+ vdev_init_t init;
+ vdev_reset_t reset;
+ vdev_halt_t halt;
+ vdev_memread_t memread;
+ vdev_memwrite_t memwrite;
+};
+
+
+void vdev_vm_init(void);
+void vdev_vm_cleanup(void);
+
+int vdev_register(struct vdev_ops *ops, void *dev);
+void vdev_unregister(void *dev);
+
+int vdev_register_region(struct vdev_ops *ops, void *dev, struct io_region *io);
+void vdev_unregister_region(void *dev, struct io_region *io);
+
+int vdev_init(void);
+int vdev_reset(void);
+int vdev_halt(void);
+int vdev_memread(uint64_t gpa, opsize_t size, uint64_t *data);
+int vdev_memwrite(uint64_t gpa, opsize_t size, uint64_t data);
+
+#endif /* _VDEV_H_ */
+
diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.c b/usr/src/uts/i86pc/io/vmm/io/vhpet.c
new file mode 100644
index 0000000000..25f6013da0
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.c
@@ -0,0 +1,821 @@
+/*-
+ * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z tychon $
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vhpet.c 263035 2014-03-11 16:56:00Z tychon $");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+
+#include <dev/acpica/acpi_hpet.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+
+#include "vmm_lapic.h"
+#include "vatpic.h"
+#include "vioapic.h"
+#include "vhpet.h"
+
+#include "vmm_ktr.h"
+
+static MALLOC_DEFINE(M_VHPET, "vhpet", "bhyve virtual hpet");
+
+#define HPET_FREQ 10000000 /* 10.0 Mhz */
+#define FS_PER_S 1000000000000000ul
+
+/* Timer N Configuration and Capabilities Register */
+#define HPET_TCAP_RO_MASK (HPET_TCAP_INT_ROUTE | \
+ HPET_TCAP_FSB_INT_DEL | \
+ HPET_TCAP_SIZE | \
+ HPET_TCAP_PER_INT)
+/*
+ * HPET requires at least 3 timers and up to 32 timers per block.
+ */
+#define VHPET_NUM_TIMERS 8
+CTASSERT(VHPET_NUM_TIMERS >= 3 && VHPET_NUM_TIMERS <= 32);
+
+struct vhpet_callout_arg {
+ struct vhpet *vhpet;
+ int timer_num;
+};
+
+struct vhpet {
+ struct vm *vm;
+ struct mtx mtx;
+ sbintime_t freq_sbt;
+
+ uint64_t config; /* Configuration */
+ uint64_t isr; /* Interrupt Status */
+ uint32_t countbase; /* HPET counter base value */
+ sbintime_t countbase_sbt; /* uptime corresponding to base value */
+
+ struct {
+ uint64_t cap_config; /* Configuration */
+ uint64_t msireg; /* FSB interrupt routing */
+ uint32_t compval; /* Comparator */
+ uint32_t comprate;
+ struct callout callout;
+ sbintime_t callout_sbt; /* time when counter==compval */
+ struct vhpet_callout_arg arg;
+ } timer[VHPET_NUM_TIMERS];
+};
+
+#define VHPET_LOCK(vhp) mtx_lock(&((vhp)->mtx))
+#define VHPET_UNLOCK(vhp) mtx_unlock(&((vhp)->mtx))
+
+static void vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter,
+ sbintime_t now);
+
+static uint64_t
+vhpet_capabilities(void)
+{
+ uint64_t cap = 0;
+
+ cap |= 0x8086 << 16; /* vendor id */
+ cap |= HPET_CAP_LEG_RT; /* legacy routing capable */
+ cap |= (VHPET_NUM_TIMERS - 1) << 8; /* number of timers */
+ cap |= 1; /* revision */
+ cap &= ~HPET_CAP_COUNT_SIZE; /* 32-bit timer */
+
+ cap &= 0xffffffff;
+ cap |= (FS_PER_S / HPET_FREQ) << 32; /* tick period in fs */
+
+ return (cap);
+}
+
+static __inline bool
+vhpet_counter_enabled(struct vhpet *vhpet)
+{
+
+ return ((vhpet->config & HPET_CNF_ENABLE) ? true : false);
+}
+
+static __inline bool
+vhpet_timer_msi_enabled(struct vhpet *vhpet, int n)
+{
+ const uint64_t msi_enable = HPET_TCAP_FSB_INT_DEL | HPET_TCNF_FSB_EN;
+
+ /*
+ * LegacyReplacement Route configuration takes precedence over MSI
+ * for timers 0 and 1.
+ */
+ if (n == 0 || n == 1) {
+ if (vhpet->config & HPET_CNF_LEG_RT)
+ return (false);
+ }
+
+ if ((vhpet->timer[n].cap_config & msi_enable) == msi_enable)
+ return (true);
+ else
+ return (false);
+}
+
+static __inline int
+vhpet_timer_ioapic_pin(struct vhpet *vhpet, int n)
+{
+ /*
+ * If the timer is configured to use MSI then treat it as if the
+ * timer is not connected to the ioapic.
+ */
+ if (vhpet_timer_msi_enabled(vhpet, n))
+ return (0);
+
+ if (vhpet->config & HPET_CNF_LEG_RT) {
+ /*
+ * In "legacy routing" timers 0 and 1 are connected to
+ * ioapic pins 2 and 8 respectively.
+ */
+ switch (n) {
+ case 0:
+ return (2);
+ case 1:
+ return (8);
+ }
+ }
+
+ return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ROUTE) >> 9);
+}
+
+static __inline int
+vhpet_timer_atpic_pin(struct vhpet *vhpet, int n)
+{
+ if (vhpet->config & HPET_CNF_LEG_RT) {
+ /*
+ * In "legacy routing" timers 0 and 1 are connected to
+ * 8259 master pin 0 and slave pin 0 respectively.
+ */
+ switch (n) {
+ case 0:
+ return (0);
+ case 1:
+ return (8);
+ }
+ }
+
+ return (-1);
+}
+
+static uint32_t
+vhpet_counter(struct vhpet *vhpet, sbintime_t *nowptr)
+{
+ uint32_t val;
+ sbintime_t now, delta;
+
+ val = vhpet->countbase;
+ if (vhpet_counter_enabled(vhpet)) {
+ now = sbinuptime();
+ delta = now - vhpet->countbase_sbt;
+#ifdef __FreeBSD__
+ KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: "
+ "%#lx to %#lx", vhpet->countbase_sbt, now));
+#else
+ KASSERT(delta >= 0, ("vhpet_counter: uptime went backwards: "
+ "%lx to %lx", vhpet->countbase_sbt, now));
+#endif
+ val += delta / vhpet->freq_sbt;
+ if (nowptr != NULL)
+ *nowptr = now;
+ } else {
+ /*
+ * The sbinuptime corresponding to the 'countbase' is
+ * meaningless when the counter is disabled. Make sure
+ * that the the caller doesn't want to use it.
+ */
+ KASSERT(nowptr == NULL, ("vhpet_counter: nowptr must be NULL"));
+ }
+ return (val);
+}
+
+static void
+vhpet_timer_clear_isr(struct vhpet *vhpet, int n)
+{
+ int pin, legacy_pin;
+
+ if (vhpet->isr & (1 << n)) {
+ pin = vhpet_timer_ioapic_pin(vhpet, n);
+ KASSERT(pin != 0, ("vhpet timer %d irq incorrectly routed", n));
+ vioapic_deassert_irq(vhpet->vm, pin);
+
+ legacy_pin = vhpet_timer_atpic_pin(vhpet, n);
+ if (legacy_pin != -1)
+ vatpic_deassert_irq(vhpet->vm, legacy_pin);
+
+ vhpet->isr &= ~(1 << n);
+ }
+}
+
+static __inline bool
+vhpet_periodic_timer(struct vhpet *vhpet, int n)
+{
+
+ return ((vhpet->timer[n].cap_config & HPET_TCNF_TYPE) != 0);
+}
+
+static __inline bool
+vhpet_timer_interrupt_enabled(struct vhpet *vhpet, int n)
+{
+
+ return ((vhpet->timer[n].cap_config & HPET_TCNF_INT_ENB) != 0);
+}
+
+static __inline bool
+vhpet_timer_edge_trig(struct vhpet *vhpet, int n)
+{
+
+ KASSERT(!vhpet_timer_msi_enabled(vhpet, n), ("vhpet_timer_edge_trig: "
+ "timer %d is using MSI", n));
+
+ /* The legacy replacement interrupts are always edge triggered */
+ if (vhpet->config & HPET_CNF_LEG_RT) {
+ if (n == 0 || n == 1)
+ return (true);
+ }
+
+ if ((vhpet->timer[n].cap_config & HPET_TCNF_INT_TYPE) == 0)
+ return (true);
+ else
+ return (false);
+}
+
+static void
+vhpet_timer_interrupt(struct vhpet *vhpet, int n)
+{
+ int pin, legacy_pin;
+
+ /* If interrupts are not enabled for this timer then just return. */
+ if (!vhpet_timer_interrupt_enabled(vhpet, n))
+ return;
+
+ /*
+ * If a level triggered interrupt is already asserted then just return.
+ */
+ if ((vhpet->isr & (1 << n)) != 0) {
+ VM_CTR1(vhpet->vm, "hpet t%d intr is already asserted", n);
+ return;
+ }
+
+ if (vhpet_timer_msi_enabled(vhpet, n)) {
+ lapic_intr_msi(vhpet->vm, vhpet->timer[n].msireg >> 32,
+ vhpet->timer[n].msireg & 0xffffffff);
+ return;
+ }
+
+ pin = vhpet_timer_ioapic_pin(vhpet, n);
+ if (pin == 0) {
+ VM_CTR1(vhpet->vm, "hpet t%d intr is not routed to ioapic", n);
+ return;
+ }
+
+ legacy_pin = vhpet_timer_atpic_pin(vhpet, n);
+
+ if (vhpet_timer_edge_trig(vhpet, n)) {
+ vioapic_pulse_irq(vhpet->vm, pin);
+ if (legacy_pin != -1)
+ vatpic_pulse_irq(vhpet->vm, legacy_pin);
+ } else {
+ vhpet->isr |= 1 << n;
+ vioapic_assert_irq(vhpet->vm, pin);
+ if (legacy_pin != -1)
+ vatpic_assert_irq(vhpet->vm, legacy_pin);
+ }
+}
+
+static void
+vhpet_adjust_compval(struct vhpet *vhpet, int n, uint32_t counter)
+{
+ uint32_t compval, comprate, compnext;
+
+ KASSERT(vhpet->timer[n].comprate != 0, ("hpet t%d is not periodic", n));
+
+ compval = vhpet->timer[n].compval;
+ comprate = vhpet->timer[n].comprate;
+
+ /*
+ * Calculate the comparator value to be used for the next periodic
+ * interrupt.
+ *
+ * This function is commonly called from the callout handler.
+ * In this scenario the 'counter' is ahead of 'compval'. To find
+ * the next value to program into the accumulator we divide the
+ * number space between 'compval' and 'counter' into 'comprate'
+ * sized units. The 'compval' is rounded up such that is "ahead"
+ * of 'counter'.
+ */
+ compnext = compval + ((counter - compval) / comprate + 1) * comprate;
+
+ vhpet->timer[n].compval = compnext;
+}
+
+static void
+vhpet_handler(void *a)
+{
+ int n;
+ uint32_t counter;
+ sbintime_t now;
+ struct vhpet *vhpet;
+ struct callout *callout;
+ struct vhpet_callout_arg *arg;
+
+ arg = a;
+ vhpet = arg->vhpet;
+ n = arg->timer_num;
+ callout = &vhpet->timer[n].callout;
+
+ VM_CTR1(vhpet->vm, "hpet t%d fired", n);
+
+ VHPET_LOCK(vhpet);
+
+ if (callout_pending(callout)) /* callout was reset */
+ goto done;
+
+ if (!callout_active(callout)) /* callout was stopped */
+ goto done;
+
+ callout_deactivate(callout);
+
+ if (!vhpet_counter_enabled(vhpet))
+ panic("vhpet(%p) callout with counter disabled", vhpet);
+
+ counter = vhpet_counter(vhpet, &now);
+ vhpet_start_timer(vhpet, n, counter, now);
+ vhpet_timer_interrupt(vhpet, n);
+done:
+ VHPET_UNLOCK(vhpet);
+ return;
+}
+
+static void
+vhpet_stop_timer(struct vhpet *vhpet, int n, sbintime_t now)
+{
+
+ VM_CTR1(vhpet->vm, "hpet t%d stopped", n);
+ callout_stop(&vhpet->timer[n].callout);
+
+ /*
+ * If the callout was scheduled to expire in the past but hasn't
+ * had a chance to execute yet then trigger the timer interrupt
+ * here. Failing to do so will result in a missed timer interrupt
+ * in the guest. This is especially bad in one-shot mode because
+ * the next interrupt has to wait for the counter to wrap around.
+ */
+ if (vhpet->timer[n].callout_sbt < now) {
+ VM_CTR1(vhpet->vm, "hpet t%d interrupt triggered after "
+ "stopping timer", n);
+ vhpet_timer_interrupt(vhpet, n);
+ }
+}
+
+static void
+vhpet_start_timer(struct vhpet *vhpet, int n, uint32_t counter, sbintime_t now)
+{
+ sbintime_t delta, precision;
+
+ /* If interrupts are not enabled for this timer then just return. */
+ if (!vhpet_timer_interrupt_enabled(vhpet, n))
+ return;
+
+ if (vhpet->timer[n].comprate != 0)
+ vhpet_adjust_compval(vhpet, n, counter);
+ else {
+ /*
+ * In one-shot mode it is the guest's responsibility to make
+ * sure that the comparator value is not in the "past". The
+ * hardware doesn't have any belt-and-suspenders to deal with
+ * this so we don't either.
+ */
+ }
+
+ delta = (vhpet->timer[n].compval - counter) * vhpet->freq_sbt;
+ precision = delta >> tc_precexp;
+ vhpet->timer[n].callout_sbt = now + delta;
+ callout_reset_sbt(&vhpet->timer[n].callout, vhpet->timer[n].callout_sbt,
+ precision, vhpet_handler, &vhpet->timer[n].arg, C_ABSOLUTE);
+}
+
+static void
+vhpet_start_counting(struct vhpet *vhpet)
+{
+ int i;
+
+ vhpet->countbase_sbt = sbinuptime();
+ for (i = 0; i < VHPET_NUM_TIMERS; i++) {
+ /*
+ * Restart the timers based on the value of the main counter
+ * when it stopped counting.
+ */
+ vhpet_start_timer(vhpet, i, vhpet->countbase,
+ vhpet->countbase_sbt);
+ }
+}
+
+static void
+vhpet_stop_counting(struct vhpet *vhpet, uint32_t counter, sbintime_t now)
+{
+ int i;
+
+ vhpet->countbase = counter;
+ for (i = 0; i < VHPET_NUM_TIMERS; i++)
+ vhpet_stop_timer(vhpet, i, now);
+}
+
+static __inline void
+update_register(uint64_t *regptr, uint64_t data, uint64_t mask)
+{
+
+ *regptr &= ~mask;
+ *regptr |= (data & mask);
+}
+
+static void
+vhpet_timer_update_config(struct vhpet *vhpet, int n, uint64_t data,
+ uint64_t mask)
+{
+ bool clear_isr;
+ int old_pin, new_pin;
+ uint32_t allowed_irqs;
+ uint64_t oldval, newval;
+
+ if (vhpet_timer_msi_enabled(vhpet, n) ||
+ vhpet_timer_edge_trig(vhpet, n)) {
+ if (vhpet->isr & (1 << n))
+ panic("vhpet timer %d isr should not be asserted", n);
+ }
+ old_pin = vhpet_timer_ioapic_pin(vhpet, n);
+ oldval = vhpet->timer[n].cap_config;
+
+ newval = oldval;
+ update_register(&newval, data, mask);
+ newval &= ~(HPET_TCAP_RO_MASK | HPET_TCNF_32MODE);
+ newval |= oldval & HPET_TCAP_RO_MASK;
+
+ if (newval == oldval)
+ return;
+
+ vhpet->timer[n].cap_config = newval;
+ VM_CTR2(vhpet->vm, "hpet t%d cap_config set to 0x%016x", n, newval);
+
+ /*
+ * Validate the interrupt routing in the HPET_TCNF_INT_ROUTE field.
+ * If it does not match the bits set in HPET_TCAP_INT_ROUTE then set
+ * it to the default value of 0.
+ */
+ allowed_irqs = vhpet->timer[n].cap_config >> 32;
+ new_pin = vhpet_timer_ioapic_pin(vhpet, n);
+ if (new_pin != 0 && (allowed_irqs & (1 << new_pin)) == 0) {
+ VM_CTR3(vhpet->vm, "hpet t%d configured invalid irq %d, "
+ "allowed_irqs 0x%08x", n, new_pin, allowed_irqs);
+ new_pin = 0;
+ vhpet->timer[n].cap_config &= ~HPET_TCNF_INT_ROUTE;
+ }
+
+ if (!vhpet_periodic_timer(vhpet, n))
+ vhpet->timer[n].comprate = 0;
+
+ /*
+ * If the timer's ISR bit is set then clear it in the following cases:
+ * - interrupt is disabled
+ * - interrupt type is changed from level to edge or fsb.
+ * - interrupt routing is changed
+ *
+ * This is to ensure that this timer's level triggered interrupt does
+ * not remain asserted forever.
+ */
+ if (vhpet->isr & (1 << n)) {
+ KASSERT(old_pin != 0, ("timer %d isr asserted to ioapic pin %d",
+ n, old_pin));
+ if (!vhpet_timer_interrupt_enabled(vhpet, n))
+ clear_isr = true;
+ else if (vhpet_timer_msi_enabled(vhpet, n))
+ clear_isr = true;
+ else if (vhpet_timer_edge_trig(vhpet, n))
+ clear_isr = true;
+ else if (vhpet_timer_ioapic_pin(vhpet, n) != old_pin)
+ clear_isr = true;
+ else
+ clear_isr = false;
+
+ if (clear_isr) {
+ VM_CTR1(vhpet->vm, "hpet t%d isr cleared due to "
+ "configuration change", n);
+ vioapic_deassert_irq(vhpet->vm, old_pin);
+ vhpet->isr &= ~(1 << n);
+ }
+ }
+}
+
+int
+vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val, int size,
+ void *arg)
+{
+ struct vhpet *vhpet;
+ uint64_t data, mask, oldval, val64;
+ uint32_t isr_clear_mask, old_compval, old_comprate, counter;
+ sbintime_t now, *nowptr;
+ int i, offset;
+
+ vhpet = vm_hpet(vm);
+ offset = gpa - VHPET_BASE;
+
+ VHPET_LOCK(vhpet);
+
+ /* Accesses to the HPET should be 4 or 8 bytes wide */
+ switch (size) {
+ case 8:
+ mask = 0xffffffffffffffff;
+ data = val;
+ break;
+ case 4:
+ mask = 0xffffffff;
+ data = val;
+ if ((offset & 0x4) != 0) {
+ mask <<= 32;
+ data <<= 32;
+ }
+ break;
+ default:
+ VM_CTR2(vhpet->vm, "hpet invalid mmio write: "
+ "offset 0x%08x, size %d", offset, size);
+ goto done;
+ }
+
+ /* Access to the HPET should be naturally aligned to its width */
+ if (offset & (size - 1)) {
+ VM_CTR2(vhpet->vm, "hpet invalid mmio write: "
+ "offset 0x%08x, size %d", offset, size);
+ goto done;
+ }
+
+ if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) {
+ /*
+ * Get the most recent value of the counter before updating
+ * the 'config' register. If the HPET is going to be disabled
+ * then we need to update 'countbase' with the value right
+ * before it is disabled.
+ */
+ nowptr = vhpet_counter_enabled(vhpet) ? &now : NULL;
+ counter = vhpet_counter(vhpet, nowptr);
+ oldval = vhpet->config;
+ update_register(&vhpet->config, data, mask);
+ if ((oldval ^ vhpet->config) & HPET_CNF_ENABLE) {
+ if (vhpet_counter_enabled(vhpet)) {
+ vhpet_start_counting(vhpet);
+ VM_CTR0(vhpet->vm, "hpet enabled");
+ } else {
+ vhpet_stop_counting(vhpet, counter, now);
+ VM_CTR0(vhpet->vm, "hpet disabled");
+ }
+ }
+ goto done;
+ }
+
+ if (offset == HPET_ISR || offset == HPET_ISR + 4) {
+ isr_clear_mask = vhpet->isr & data;
+ for (i = 0; i < VHPET_NUM_TIMERS; i++) {
+ if ((isr_clear_mask & (1 << i)) != 0) {
+ VM_CTR1(vhpet->vm, "hpet t%d isr cleared", i);
+ vhpet_timer_clear_isr(vhpet, i);
+ }
+ }
+ goto done;
+ }
+
+ if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) {
+ /* Zero-extend the counter to 64-bits before updating it */
+ val64 = vhpet_counter(vhpet, NULL);
+ update_register(&val64, data, mask);
+ vhpet->countbase = val64;
+ if (vhpet_counter_enabled(vhpet))
+ vhpet_start_counting(vhpet);
+ goto done;
+ }
+
+ for (i = 0; i < VHPET_NUM_TIMERS; i++) {
+ if (offset == HPET_TIMER_CAP_CNF(i) ||
+ offset == HPET_TIMER_CAP_CNF(i) + 4) {
+ vhpet_timer_update_config(vhpet, i, data, mask);
+ break;
+ }
+
+ if (offset == HPET_TIMER_COMPARATOR(i) ||
+ offset == HPET_TIMER_COMPARATOR(i) + 4) {
+ old_compval = vhpet->timer[i].compval;
+ old_comprate = vhpet->timer[i].comprate;
+ if (vhpet_periodic_timer(vhpet, i)) {
+ /*
+ * In periodic mode writes to the comparator
+ * change the 'compval' register only if the
+ * HPET_TCNF_VAL_SET bit is set in the config
+ * register.
+ */
+ val64 = vhpet->timer[i].comprate;
+ update_register(&val64, data, mask);
+ vhpet->timer[i].comprate = val64;
+ if ((vhpet->timer[i].cap_config &
+ HPET_TCNF_VAL_SET) != 0) {
+ vhpet->timer[i].compval = val64;
+ }
+ } else {
+ KASSERT(vhpet->timer[i].comprate == 0,
+ ("vhpet one-shot timer %d has invalid "
+ "rate %u", i, vhpet->timer[i].comprate));
+ val64 = vhpet->timer[i].compval;
+ update_register(&val64, data, mask);
+ vhpet->timer[i].compval = val64;
+ }
+ vhpet->timer[i].cap_config &= ~HPET_TCNF_VAL_SET;
+
+ if (vhpet->timer[i].compval != old_compval ||
+ vhpet->timer[i].comprate != old_comprate) {
+ if (vhpet_counter_enabled(vhpet)) {
+ counter = vhpet_counter(vhpet, &now);
+ vhpet_start_timer(vhpet, i, counter,
+ now);
+ }
+ }
+ break;
+ }
+
+ if (offset == HPET_TIMER_FSB_VAL(i) ||
+ offset == HPET_TIMER_FSB_ADDR(i)) {
+ update_register(&vhpet->timer[i].msireg, data, mask);
+ break;
+ }
+ }
+done:
+ VHPET_UNLOCK(vhpet);
+ return (0);
+}
+
+int
+vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval, int size,
+ void *arg)
+{
+ int i, offset;
+ struct vhpet *vhpet;
+ uint64_t data;
+
+ vhpet = vm_hpet(vm);
+ offset = gpa - VHPET_BASE;
+
+ VHPET_LOCK(vhpet);
+
+ /* Accesses to the HPET should be 4 or 8 bytes wide */
+ if (size != 4 && size != 8) {
+ VM_CTR2(vhpet->vm, "hpet invalid mmio read: "
+ "offset 0x%08x, size %d", offset, size);
+ data = 0;
+ goto done;
+ }
+
+ /* Access to the HPET should be naturally aligned to its width */
+ if (offset & (size - 1)) {
+ VM_CTR2(vhpet->vm, "hpet invalid mmio read: "
+ "offset 0x%08x, size %d", offset, size);
+ data = 0;
+ goto done;
+ }
+
+ if (offset == HPET_CAPABILITIES || offset == HPET_CAPABILITIES + 4) {
+ data = vhpet_capabilities();
+ goto done;
+ }
+
+ if (offset == HPET_CONFIG || offset == HPET_CONFIG + 4) {
+ data = vhpet->config;
+ goto done;
+ }
+
+ if (offset == HPET_ISR || offset == HPET_ISR + 4) {
+ data = vhpet->isr;
+ goto done;
+ }
+
+ if (offset == HPET_MAIN_COUNTER || offset == HPET_MAIN_COUNTER + 4) {
+ data = vhpet_counter(vhpet, NULL);
+ goto done;
+ }
+
+ for (i = 0; i < VHPET_NUM_TIMERS; i++) {
+ if (offset == HPET_TIMER_CAP_CNF(i) ||
+ offset == HPET_TIMER_CAP_CNF(i) + 4) {
+ data = vhpet->timer[i].cap_config;
+ break;
+ }
+
+ if (offset == HPET_TIMER_COMPARATOR(i) ||
+ offset == HPET_TIMER_COMPARATOR(i) + 4) {
+ data = vhpet->timer[i].compval;
+ break;
+ }
+
+ if (offset == HPET_TIMER_FSB_VAL(i) ||
+ offset == HPET_TIMER_FSB_ADDR(i)) {
+ data = vhpet->timer[i].msireg;
+ break;
+ }
+ }
+
+ if (i >= VHPET_NUM_TIMERS)
+ data = 0;
+done:
+ VHPET_UNLOCK(vhpet);
+
+ if (size == 4) {
+ if (offset & 0x4)
+ data >>= 32;
+ }
+ *rval = data;
+ return (0);
+}
+
+struct vhpet *
+vhpet_init(struct vm *vm)
+{
+ int i, pincount;
+ struct vhpet *vhpet;
+ uint64_t allowed_irqs;
+ struct vhpet_callout_arg *arg;
+ struct bintime bt;
+
+ vhpet = malloc(sizeof(struct vhpet), M_VHPET, M_WAITOK | M_ZERO);
+ vhpet->vm = vm;
+ mtx_init(&vhpet->mtx, "vhpet lock", NULL, MTX_DEF);
+
+ FREQ2BT(HPET_FREQ, &bt);
+ vhpet->freq_sbt = bttosbt(bt);
+
+ pincount = vioapic_pincount(vm);
+ if (pincount >= 24)
+ allowed_irqs = 0x00f00000; /* irqs 20, 21, 22 and 23 */
+ else
+ allowed_irqs = 0;
+
+ /*
+ * Initialize HPET timer hardware state.
+ */
+ for (i = 0; i < VHPET_NUM_TIMERS; i++) {
+ vhpet->timer[i].cap_config = allowed_irqs << 32;
+ vhpet->timer[i].cap_config |= HPET_TCAP_PER_INT;
+ vhpet->timer[i].cap_config |= HPET_TCAP_FSB_INT_DEL;
+
+ vhpet->timer[i].compval = 0xffffffff;
+ callout_init(&vhpet->timer[i].callout, 1);
+
+ arg = &vhpet->timer[i].arg;
+ arg->vhpet = vhpet;
+ arg->timer_num = i;
+ }
+
+ return (vhpet);
+}
+
+void
+vhpet_cleanup(struct vhpet *vhpet)
+{
+ int i;
+
+ for (i = 0; i < VHPET_NUM_TIMERS; i++)
+ callout_drain(&vhpet->timer[i].callout);
+
+ free(vhpet, M_VHPET);
+}
+
+int
+vhpet_getcap(struct vm_hpet_cap *cap)
+{
+
+ cap->capabilities = vhpet_capabilities();
+ return (0);
+}
diff --git a/usr/src/uts/i86pc/io/vmm/io/vhpet.h b/usr/src/uts/i86pc/io/vmm/io/vhpet.h
new file mode 100644
index 0000000000..868809d166
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/vhpet.h
@@ -0,0 +1,44 @@
+/*-
+ * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/io/vhpet.h 258579 2013-11-25 19:04:51Z neel $
+ */
+
+#ifndef _VHPET_H_
+#define _VHPET_H_
+
+#define VHPET_BASE 0xfed00000
+#define VHPET_SIZE 1024
+
+struct vhpet *vhpet_init(struct vm *vm);
+void vhpet_cleanup(struct vhpet *vhpet);
+int vhpet_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t val,
+ int size, void *arg);
+int vhpet_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *val,
+ int size, void *arg);
+int vhpet_getcap(struct vm_hpet_cap *cap);
+
+#endif /* _VHPET_H_ */
diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.c b/usr/src/uts/i86pc/io/vmm/io/vioapic.c
new file mode 100644
index 0000000000..5adf5de16d
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.c
@@ -0,0 +1,514 @@
+/*-
+ * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vioapic.c 262139 2014-02-17 22:57:51Z neel $");
+
+#include <sys/param.h>
+#include <sys/queue.h>
+#include <sys/cpuset.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+
+#include <x86/apicreg.h>
+#include <machine/vmm.h>
+
+#include "vmm_ktr.h"
+#include "vmm_lapic.h"
+#include "vlapic.h"
+#include "vioapic.h"
+
+#define IOREGSEL 0x00
+#define IOWIN 0x10
+
+#define REDIR_ENTRIES 24
+#define RTBL_RO_BITS ((uint64_t)(IOART_REM_IRR | IOART_DELIVS))
+
+struct vioapic {
+ struct vm *vm;
+ struct mtx mtx;
+ uint32_t id;
+ uint32_t ioregsel;
+ struct {
+ uint64_t reg;
+ int acnt; /* sum of pin asserts (+1) and deasserts (-1) */
+ } rtbl[REDIR_ENTRIES];
+};
+
+#define VIOAPIC_LOCK(vioapic) mtx_lock_spin(&((vioapic)->mtx))
+#define VIOAPIC_UNLOCK(vioapic) mtx_unlock_spin(&((vioapic)->mtx))
+#define VIOAPIC_LOCKED(vioapic) mtx_owned(&((vioapic)->mtx))
+
+static MALLOC_DEFINE(M_VIOAPIC, "vioapic", "bhyve virtual ioapic");
+
+#define VIOAPIC_CTR1(vioapic, fmt, a1) \
+ VM_CTR1((vioapic)->vm, fmt, a1)
+
+#define VIOAPIC_CTR2(vioapic, fmt, a1, a2) \
+ VM_CTR2((vioapic)->vm, fmt, a1, a2)
+
+#define VIOAPIC_CTR3(vioapic, fmt, a1, a2, a3) \
+ VM_CTR3((vioapic)->vm, fmt, a1, a2, a3)
+
+#define VIOAPIC_CTR4(vioapic, fmt, a1, a2, a3, a4) \
+ VM_CTR4((vioapic)->vm, fmt, a1, a2, a3, a4)
+
+#ifdef KTR
+static const char *
+pinstate_str(bool asserted)
+{
+
+ if (asserted)
+ return ("asserted");
+ else
+ return ("deasserted");
+}
+#endif
+
+static void
+vioapic_send_intr(struct vioapic *vioapic, int pin)
+{
+ int vector, delmode;
+ uint32_t low, high, dest;
+ bool level, phys;
+
+ KASSERT(pin >= 0 && pin < REDIR_ENTRIES,
+ ("vioapic_set_pinstate: invalid pin number %d", pin));
+
+ KASSERT(VIOAPIC_LOCKED(vioapic),
+ ("vioapic_set_pinstate: vioapic is not locked"));
+
+ low = vioapic->rtbl[pin].reg;
+ high = vioapic->rtbl[pin].reg >> 32;
+
+ if ((low & IOART_INTMASK) == IOART_INTMSET) {
+ VIOAPIC_CTR1(vioapic, "ioapic pin%d: masked", pin);
+ return;
+ }
+
+ phys = ((low & IOART_DESTMOD) == IOART_DESTPHY);
+ delmode = low & IOART_DELMOD;
+ level = low & IOART_TRGRLVL ? true : false;
+ if (level)
+ vioapic->rtbl[pin].reg |= IOART_REM_IRR;
+
+ vector = low & IOART_INTVEC;
+ dest = high >> APIC_ID_SHIFT;
+ vlapic_deliver_intr(vioapic->vm, level, dest, phys, delmode, vector);
+}
+
+static void
+vioapic_set_pinstate(struct vioapic *vioapic, int pin, bool newstate)
+{
+ int oldcnt, newcnt;
+ bool needintr;
+
+ KASSERT(pin >= 0 && pin < REDIR_ENTRIES,
+ ("vioapic_set_pinstate: invalid pin number %d", pin));
+
+ KASSERT(VIOAPIC_LOCKED(vioapic),
+ ("vioapic_set_pinstate: vioapic is not locked"));
+
+ oldcnt = vioapic->rtbl[pin].acnt;
+ if (newstate)
+ vioapic->rtbl[pin].acnt++;
+ else
+ vioapic->rtbl[pin].acnt--;
+ newcnt = vioapic->rtbl[pin].acnt;
+
+ if (newcnt < 0) {
+ VIOAPIC_CTR2(vioapic, "ioapic pin%d: bad acnt %d",
+ pin, newcnt);
+ }
+
+ needintr = false;
+ if (oldcnt == 0 && newcnt == 1) {
+ needintr = true;
+ VIOAPIC_CTR1(vioapic, "ioapic pin%d: asserted", pin);
+ } else if (oldcnt == 1 && newcnt == 0) {
+ VIOAPIC_CTR1(vioapic, "ioapic pin%d: deasserted", pin);
+ } else {
+ VIOAPIC_CTR3(vioapic, "ioapic pin%d: %s, ignored, acnt %d",
+ pin, pinstate_str(newstate), newcnt);
+ }
+
+ if (needintr)
+ vioapic_send_intr(vioapic, pin);
+}
+
+enum irqstate {
+ IRQSTATE_ASSERT,
+ IRQSTATE_DEASSERT,
+ IRQSTATE_PULSE
+};
+
+static int
+vioapic_set_irqstate(struct vm *vm, int irq, enum irqstate irqstate)
+{
+ struct vioapic *vioapic;
+
+ if (irq < 0 || irq >= REDIR_ENTRIES)
+ return (EINVAL);
+
+ vioapic = vm_ioapic(vm);
+
+ VIOAPIC_LOCK(vioapic);
+ switch (irqstate) {
+ case IRQSTATE_ASSERT:
+ vioapic_set_pinstate(vioapic, irq, true);
+ break;
+ case IRQSTATE_DEASSERT:
+ vioapic_set_pinstate(vioapic, irq, false);
+ break;
+ case IRQSTATE_PULSE:
+ vioapic_set_pinstate(vioapic, irq, true);
+ vioapic_set_pinstate(vioapic, irq, false);
+ break;
+ default:
+ panic("vioapic_set_irqstate: invalid irqstate %d", irqstate);
+ }
+ VIOAPIC_UNLOCK(vioapic);
+
+ return (0);
+}
+
+int
+vioapic_assert_irq(struct vm *vm, int irq)
+{
+
+ return (vioapic_set_irqstate(vm, irq, IRQSTATE_ASSERT));
+}
+
+int
+vioapic_deassert_irq(struct vm *vm, int irq)
+{
+
+ return (vioapic_set_irqstate(vm, irq, IRQSTATE_DEASSERT));
+}
+
+int
+vioapic_pulse_irq(struct vm *vm, int irq)
+{
+
+ return (vioapic_set_irqstate(vm, irq, IRQSTATE_PULSE));
+}
+
+/*
+ * Reset the vlapic's trigger-mode register to reflect the ioapic pin
+ * configuration.
+ */
+static void
+vioapic_update_tmr(struct vm *vm, int vcpuid, void *arg)
+{
+ struct vioapic *vioapic;
+ struct vlapic *vlapic;
+ uint32_t low, high, dest;
+ int delmode, pin, vector;
+ bool level, phys;
+
+ vlapic = vm_lapic(vm, vcpuid);
+ vioapic = vm_ioapic(vm);
+
+ VIOAPIC_LOCK(vioapic);
+ /*
+ * Reset all vectors to be edge-triggered.
+ */
+ vlapic_reset_tmr(vlapic);
+ for (pin = 0; pin < REDIR_ENTRIES; pin++) {
+ low = vioapic->rtbl[pin].reg;
+ high = vioapic->rtbl[pin].reg >> 32;
+
+ level = low & IOART_TRGRLVL ? true : false;
+ if (!level)
+ continue;
+
+ /*
+ * For a level-triggered 'pin' let the vlapic figure out if
+ * an assertion on this 'pin' would result in an interrupt
+ * being delivered to it. If yes, then it will modify the
+ * TMR bit associated with this vector to level-triggered.
+ */
+ phys = ((low & IOART_DESTMOD) == IOART_DESTPHY);
+ delmode = low & IOART_DELMOD;
+ vector = low & IOART_INTVEC;
+ dest = high >> APIC_ID_SHIFT;
+ vlapic_set_tmr_level(vlapic, dest, phys, delmode, vector);
+ }
+ VIOAPIC_UNLOCK(vioapic);
+}
+
+static uint32_t
+vioapic_read(struct vioapic *vioapic, int vcpuid, uint32_t addr)
+{
+ int regnum, pin, rshift;
+
+ regnum = addr & 0xff;
+ switch (regnum) {
+ case IOAPIC_ID:
+ return (vioapic->id);
+ break;
+ case IOAPIC_VER:
+ return (((REDIR_ENTRIES - 1) << MAXREDIRSHIFT) | 0x11);
+ break;
+ case IOAPIC_ARB:
+ return (vioapic->id);
+ break;
+ default:
+ break;
+ }
+
+ /* redirection table entries */
+ if (regnum >= IOAPIC_REDTBL &&
+ regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) {
+ pin = (regnum - IOAPIC_REDTBL) / 2;
+ if ((regnum - IOAPIC_REDTBL) % 2)
+ rshift = 32;
+ else
+ rshift = 0;
+
+ return (vioapic->rtbl[pin].reg >> rshift);
+ }
+
+ return (0);
+}
+
+static void
+vioapic_write(struct vioapic *vioapic, int vcpuid, uint32_t addr, uint32_t data)
+{
+ uint64_t data64, mask64;
+ uint64_t last, changed;
+ int regnum, pin, lshift;
+ cpuset_t allvcpus;
+
+ regnum = addr & 0xff;
+ switch (regnum) {
+ case IOAPIC_ID:
+ vioapic->id = data & APIC_ID_MASK;
+ break;
+ case IOAPIC_VER:
+ case IOAPIC_ARB:
+ /* readonly */
+ break;
+ default:
+ break;
+ }
+
+ /* redirection table entries */
+ if (regnum >= IOAPIC_REDTBL &&
+ regnum < IOAPIC_REDTBL + REDIR_ENTRIES * 2) {
+ pin = (regnum - IOAPIC_REDTBL) / 2;
+ if ((regnum - IOAPIC_REDTBL) % 2)
+ lshift = 32;
+ else
+ lshift = 0;
+
+ last = vioapic->rtbl[pin].reg;
+
+ data64 = (uint64_t)data << lshift;
+ mask64 = (uint64_t)0xffffffff << lshift;
+ vioapic->rtbl[pin].reg &= ~mask64 | RTBL_RO_BITS;
+ vioapic->rtbl[pin].reg |= data64 & ~RTBL_RO_BITS;
+
+ VIOAPIC_CTR2(vioapic, "ioapic pin%d: redir table entry %#lx",
+ pin, vioapic->rtbl[pin].reg);
+
+ /*
+ * If any fields in the redirection table entry (except mask
+ * or polarity) have changed then rendezvous all the vcpus
+ * to update their vlapic trigger-mode registers.
+ */
+ changed = last ^ vioapic->rtbl[pin].reg;
+ if (changed & ~(IOART_INTMASK | IOART_INTPOL)) {
+ VIOAPIC_CTR1(vioapic, "ioapic pin%d: recalculate "
+ "vlapic trigger-mode register", pin);
+ VIOAPIC_UNLOCK(vioapic);
+#if 0 /* XXX */
+ allvcpus = vm_active_cpus(vioapic->vm);
+ vm_smp_rendezvous(vioapic->vm, vcpuid, allvcpus,
+ vioapic_update_tmr, NULL);
+#endif
+ VIOAPIC_LOCK(vioapic);
+ }
+
+ /*
+ * Generate an interrupt if the following conditions are met:
+ * - pin is not masked
+ * - previous interrupt has been EOIed
+ * - pin level is asserted
+ */
+ if ((vioapic->rtbl[pin].reg & IOART_INTMASK) == IOART_INTMCLR &&
+ (vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0 &&
+ (vioapic->rtbl[pin].acnt > 0)) {
+ VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at rtbl "
+ "write, acnt %d", pin, vioapic->rtbl[pin].acnt);
+ vioapic_send_intr(vioapic, pin);
+ }
+ }
+}
+
+static int
+vioapic_mmio_rw(struct vioapic *vioapic, int vcpuid, uint64_t gpa,
+ uint64_t *data, int size, bool doread)
+{
+ uint64_t offset;
+
+ offset = gpa - VIOAPIC_BASE;
+
+ /*
+ * The IOAPIC specification allows 32-bit wide accesses to the
+ * IOREGSEL (offset 0) and IOWIN (offset 16) registers.
+ */
+ if (size != 4 || (offset != IOREGSEL && offset != IOWIN)) {
+ if (doread)
+ *data = 0;
+ return (0);
+ }
+
+ VIOAPIC_LOCK(vioapic);
+ if (offset == IOREGSEL) {
+ if (doread)
+ *data = vioapic->ioregsel;
+ else
+ vioapic->ioregsel = *data;
+ } else {
+ if (doread) {
+ *data = vioapic_read(vioapic, vcpuid,
+ vioapic->ioregsel);
+ } else {
+ vioapic_write(vioapic, vcpuid, vioapic->ioregsel,
+ *data);
+ }
+ }
+ VIOAPIC_UNLOCK(vioapic);
+
+ return (0);
+}
+
+int
+vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa, uint64_t *rval,
+ int size, void *arg)
+{
+ int error;
+ struct vioapic *vioapic;
+
+ vioapic = vm_ioapic(vm);
+ error = vioapic_mmio_rw(vioapic, vcpuid, gpa, rval, size, true);
+ return (error);
+}
+
+int
+vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa, uint64_t wval,
+ int size, void *arg)
+{
+ int error;
+ struct vioapic *vioapic;
+
+ vioapic = vm_ioapic(vm);
+ error = vioapic_mmio_rw(vioapic, vcpuid, gpa, &wval, size, false);
+ return (error);
+}
+
+void
+vioapic_process_eoi(struct vm *vm, int vcpuid, int vector)
+{
+ struct vioapic *vioapic;
+ int pin;
+
+ KASSERT(vector >= 0 && vector < 256,
+ ("vioapic_process_eoi: invalid vector %d", vector));
+
+ vioapic = vm_ioapic(vm);
+ VIOAPIC_CTR1(vioapic, "ioapic processing eoi for vector %d", vector);
+
+ /*
+ * XXX keep track of the pins associated with this vector instead
+ * of iterating on every single pin each time.
+ */
+ VIOAPIC_LOCK(vioapic);
+ for (pin = 0; pin < REDIR_ENTRIES; pin++) {
+ if ((vioapic->rtbl[pin].reg & IOART_REM_IRR) == 0)
+ continue;
+ if ((vioapic->rtbl[pin].reg & IOART_INTVEC) != vector)
+ continue;
+ vioapic->rtbl[pin].reg &= ~IOART_REM_IRR;
+ if (vioapic->rtbl[pin].acnt > 0) {
+ VIOAPIC_CTR2(vioapic, "ioapic pin%d: asserted at eoi, "
+ "acnt %d", pin, vioapic->rtbl[pin].acnt);
+ vioapic_send_intr(vioapic, pin);
+ }
+ }
+ VIOAPIC_UNLOCK(vioapic);
+}
+
+struct vioapic *
+vioapic_init(struct vm *vm)
+{
+ int i;
+ struct vioapic *vioapic;
+
+ vioapic = malloc(sizeof(struct vioapic), M_VIOAPIC, M_WAITOK | M_ZERO);
+
+ vioapic->vm = vm;
+ mtx_init(&vioapic->mtx, "vioapic lock", NULL, MTX_SPIN);
+
+ /* Initialize all redirection entries to mask all interrupts */
+ for (i = 0; i < REDIR_ENTRIES; i++)
+ vioapic->rtbl[i].reg = 0x0001000000010000UL;
+
+ return (vioapic);
+}
+
+void
+vioapic_cleanup(struct vioapic *vioapic)
+{
+
+ free(vioapic, M_VIOAPIC);
+}
+
+int
+vioapic_pincount(struct vm *vm)
+{
+
+ return (REDIR_ENTRIES);
+}
diff --git a/usr/src/uts/i86pc/io/vmm/io/vioapic.h b/usr/src/uts/i86pc/io/vmm/io/vioapic.h
new file mode 100644
index 0000000000..9479ebb10e
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/vioapic.h
@@ -0,0 +1,66 @@
+/*-
+ * Copyright (c) 2013 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/io/vioapic.h 258699 2013-11-27 22:18:08Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _VIOAPIC_H_
+#define _VIOAPIC_H_
+
+#define VIOAPIC_BASE 0xFEC00000
+#define VIOAPIC_SIZE 4096
+
+#include "vdev.h"
+
+struct vm;
+
+struct vioapic *vioapic_init(struct vm *vm);
+void vioapic_cleanup(struct vioapic *vioapic);
+
+int vioapic_assert_irq(struct vm *vm, int irq);
+int vioapic_deassert_irq(struct vm *vm, int irq);
+int vioapic_pulse_irq(struct vm *vm, int irq);
+
+int vioapic_mmio_write(void *vm, int vcpuid, uint64_t gpa,
+ uint64_t wval, int size, void *arg);
+int vioapic_mmio_read(void *vm, int vcpuid, uint64_t gpa,
+ uint64_t *rval, int size, void *arg);
+
+int vioapic_pincount(struct vm *vm);
+void vioapic_process_eoi(struct vm *vm, int vcpuid, int vector);
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.c b/usr/src/uts/i86pc/io/vmm/io/vlapic.c
new file mode 100644
index 0000000000..9a0a3058ea
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.c
@@ -0,0 +1,1687 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/io/vlapic.c 273375 2014-10-21 07:10:43Z neel $");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <x86/specialreg.h>
+#include <x86/apicreg.h>
+
+#include <machine/clock.h>
+#include <machine/smp.h>
+
+#include <machine/vmm.h>
+
+#include "vmm_ipi.h"
+#include "vmm_lapic.h"
+#include "vmm_ktr.h"
+#include "vmm_stat.h"
+
+#include "vlapic.h"
+#include "vlapic_priv.h"
+#include "vioapic.h"
+
+#define PRIO(x) ((x) >> 4)
+
+#define VLAPIC_VERSION (16)
+
+#define x2apic(vlapic) (((vlapic)->msr_apicbase & APICBASE_X2APIC) ? 1 : 0)
+
+/*
+ * The 'vlapic->timer_mtx' is used to provide mutual exclusion between the
+ * vlapic_callout_handler() and vcpu accesses to:
+ * - timer_freq_bt, timer_period_bt, timer_fire_bt
+ * - timer LVT register
+ */
+#define VLAPIC_TIMER_LOCK(vlapic) mtx_lock_spin(&((vlapic)->timer_mtx))
+#define VLAPIC_TIMER_UNLOCK(vlapic) mtx_unlock_spin(&((vlapic)->timer_mtx))
+#define VLAPIC_TIMER_LOCKED(vlapic) mtx_owned(&((vlapic)->timer_mtx))
+
+#define VLAPIC_BUS_FREQ tsc_freq
+
+static __inline uint32_t
+vlapic_get_id(struct vlapic *vlapic)
+{
+
+ if (x2apic(vlapic))
+ return (vlapic->vcpuid);
+ else
+ return (vlapic->vcpuid << 24);
+}
+
+static uint32_t
+x2apic_ldr(struct vlapic *vlapic)
+{
+ int apicid;
+ uint32_t ldr;
+
+ apicid = vlapic_get_id(vlapic);
+ ldr = 1 << (apicid & 0xf);
+ ldr |= (apicid & 0xffff0) << 12;
+ return (ldr);
+}
+
+void
+vlapic_dfr_write_handler(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic;
+
+ lapic = vlapic->apic_page;
+ if (x2apic(vlapic)) {
+ VM_CTR1(vlapic->vm, "ignoring write to DFR in x2apic mode: %#x",
+ lapic->dfr);
+ lapic->dfr = 0;
+ return;
+ }
+
+ lapic->dfr &= APIC_DFR_MODEL_MASK;
+ lapic->dfr |= APIC_DFR_RESERVED;
+
+ if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_FLAT)
+ VLAPIC_CTR0(vlapic, "vlapic DFR in Flat Model");
+ else if ((lapic->dfr & APIC_DFR_MODEL_MASK) == APIC_DFR_MODEL_CLUSTER)
+ VLAPIC_CTR0(vlapic, "vlapic DFR in Cluster Model");
+ else
+ VLAPIC_CTR1(vlapic, "DFR in Unknown Model %#x", lapic->dfr);
+}
+
+void
+vlapic_ldr_write_handler(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic;
+
+ lapic = vlapic->apic_page;
+
+ /* LDR is read-only in x2apic mode */
+ if (x2apic(vlapic)) {
+ VLAPIC_CTR1(vlapic, "ignoring write to LDR in x2apic mode: %#x",
+ lapic->ldr);
+ lapic->ldr = x2apic_ldr(vlapic);
+ } else {
+ lapic->ldr &= ~APIC_LDR_RESERVED;
+ VLAPIC_CTR1(vlapic, "vlapic LDR set to %#x", lapic->ldr);
+ }
+}
+
+void
+vlapic_id_write_handler(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic;
+
+ /*
+ * We don't allow the ID register to be modified so reset it back to
+ * its default value.
+ */
+ lapic = vlapic->apic_page;
+ lapic->id = vlapic_get_id(vlapic);
+}
+
+static int
+vlapic_timer_divisor(uint32_t dcr)
+{
+ switch (dcr & 0xB) {
+ case APIC_TDCR_1:
+ return (1);
+ case APIC_TDCR_2:
+ return (2);
+ case APIC_TDCR_4:
+ return (4);
+ case APIC_TDCR_8:
+ return (8);
+ case APIC_TDCR_16:
+ return (16);
+ case APIC_TDCR_32:
+ return (32);
+ case APIC_TDCR_64:
+ return (64);
+ case APIC_TDCR_128:
+ return (128);
+ default:
+ panic("vlapic_timer_divisor: invalid dcr 0x%08x", dcr);
+ }
+}
+
+#if 0
+static inline void
+vlapic_dump_lvt(uint32_t offset, uint32_t *lvt)
+{
+ printf("Offset %x: lvt %08x (V:%02x DS:%x M:%x)\n", offset,
+ *lvt, *lvt & APIC_LVTT_VECTOR, *lvt & APIC_LVTT_DS,
+ *lvt & APIC_LVTT_M);
+}
+#endif
+
+static uint32_t
+vlapic_get_ccr(struct vlapic *vlapic)
+{
+ struct bintime bt_now, bt_rem;
+ struct LAPIC *lapic;
+ uint32_t ccr;
+
+ ccr = 0;
+ lapic = vlapic->apic_page;
+
+ VLAPIC_TIMER_LOCK(vlapic);
+ if (callout_active(&vlapic->callout)) {
+ /*
+ * If the timer is scheduled to expire in the future then
+ * compute the value of 'ccr' based on the remaining time.
+ */
+ binuptime(&bt_now);
+ if (bintime_cmp(&vlapic->timer_fire_bt, &bt_now, >)) {
+ bt_rem = vlapic->timer_fire_bt;
+ bintime_sub(&bt_rem, &bt_now);
+ ccr += bt_rem.sec * BT2FREQ(&vlapic->timer_freq_bt);
+ ccr += bt_rem.frac / vlapic->timer_freq_bt.frac;
+ }
+ }
+#ifdef __FreeBSD__
+ KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %#x, "
+ "icr_timer is %#x", ccr, lapic->icr_timer));
+#else
+ KASSERT(ccr <= lapic->icr_timer, ("vlapic_get_ccr: invalid ccr %x, "
+ "icr_timer is %x", ccr, lapic->icr_timer));
+#endif
+ VLAPIC_CTR2(vlapic, "vlapic ccr_timer = %#x, icr_timer = %#x",
+ ccr, lapic->icr_timer);
+ VLAPIC_TIMER_UNLOCK(vlapic);
+ return (ccr);
+}
+
+void
+vlapic_dcr_write_handler(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic;
+ int divisor;
+
+ lapic = vlapic->apic_page;
+ VLAPIC_TIMER_LOCK(vlapic);
+
+ divisor = vlapic_timer_divisor(lapic->dcr_timer);
+ VLAPIC_CTR2(vlapic, "vlapic dcr_timer=%#x, divisor=%d",
+ lapic->dcr_timer, divisor);
+
+ /*
+ * Update the timer frequency and the timer period.
+ *
+ * XXX changes to the frequency divider will not take effect until
+ * the timer is reloaded.
+ */
+ FREQ2BT(VLAPIC_BUS_FREQ / divisor, &vlapic->timer_freq_bt);
+ vlapic->timer_period_bt = vlapic->timer_freq_bt;
+ bintime_mul(&vlapic->timer_period_bt, lapic->icr_timer);
+
+ VLAPIC_TIMER_UNLOCK(vlapic);
+}
+
+
+void
+vlapic_esr_write_handler(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic;
+
+ lapic = vlapic->apic_page;
+ lapic->esr = vlapic->esr_pending;
+ vlapic->esr_pending = 0;
+}
+
+int
+vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level)
+{
+ struct LAPIC *lapic;
+ uint32_t *irrptr, *tmrptr, mask;
+ int idx;
+
+ KASSERT(vector >= 0 && vector < 256, ("invalid vector %d", vector));
+
+ lapic = vlapic->apic_page;
+ if (!(lapic->svr & APIC_SVR_ENABLE)) {
+ VLAPIC_CTR1(vlapic, "vlapic is software disabled, ignoring "
+ "interrupt %d", vector);
+ return (0);
+ }
+
+ if (vector < 16) {
+ vlapic_set_error(vlapic, APIC_ESR_RECEIVE_ILLEGAL_VECTOR);
+ VLAPIC_CTR1(vlapic, "vlapic ignoring interrupt to vector %d",
+ vector);
+ return (1);
+ }
+
+ if (vlapic->ops.set_intr_ready)
+ return ((*vlapic->ops.set_intr_ready)(vlapic, vector, level));
+
+ idx = (vector / 32) * 4;
+ mask = 1 << (vector % 32);
+
+ irrptr = &lapic->irr0;
+ atomic_set_int(&irrptr[idx], mask);
+
+ /*
+ * Verify that the trigger-mode of the interrupt matches with
+ * the vlapic TMR registers.
+ */
+ tmrptr = &lapic->tmr0;
+ if ((tmrptr[idx] & mask) != (level ? mask : 0)) {
+ VLAPIC_CTR3(vlapic, "vlapic TMR[%d] is 0x%08x but "
+ "interrupt is %s-triggered", idx / 4, tmrptr[idx],
+ level ? "level" : "edge");
+ }
+
+ VLAPIC_CTR_IRR(vlapic, "vlapic_set_intr_ready");
+ return (1);
+}
+
+static __inline uint32_t *
+vlapic_get_lvtptr(struct vlapic *vlapic, uint32_t offset)
+{
+ struct LAPIC *lapic = vlapic->apic_page;
+ int i;
+
+ switch (offset) {
+ case APIC_OFFSET_CMCI_LVT:
+ return (&lapic->lvt_cmci);
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ i = (offset - APIC_OFFSET_TIMER_LVT) >> 2;
+ return ((&lapic->lvt_timer) + i);;
+ default:
+ panic("vlapic_get_lvt: invalid LVT\n");
+ }
+}
+
+static __inline int
+lvt_off_to_idx(uint32_t offset)
+{
+ int index;
+
+ switch (offset) {
+ case APIC_OFFSET_CMCI_LVT:
+ index = APIC_LVT_CMCI;
+ break;
+ case APIC_OFFSET_TIMER_LVT:
+ index = APIC_LVT_TIMER;
+ break;
+ case APIC_OFFSET_THERM_LVT:
+ index = APIC_LVT_THERMAL;
+ break;
+ case APIC_OFFSET_PERF_LVT:
+ index = APIC_LVT_PMC;
+ break;
+ case APIC_OFFSET_LINT0_LVT:
+ index = APIC_LVT_LINT0;
+ break;
+ case APIC_OFFSET_LINT1_LVT:
+ index = APIC_LVT_LINT1;
+ break;
+ case APIC_OFFSET_ERROR_LVT:
+ index = APIC_LVT_ERROR;
+ break;
+ default:
+ index = -1;
+ break;
+ }
+#ifdef __FreeBSD__
+ KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: "
+ "invalid lvt index %d for offset %#x", index, offset));
+#else
+ KASSERT(index >= 0 && index <= VLAPIC_MAXLVT_INDEX, ("lvt_off_to_idx: "
+ "invalid lvt index %d for offset %x", index, offset));
+#endif
+
+ return (index);
+}
+
+static __inline uint32_t
+vlapic_get_lvt(struct vlapic *vlapic, uint32_t offset)
+{
+ int idx;
+ uint32_t val;
+
+ idx = lvt_off_to_idx(offset);
+ val = atomic_load_acq_32(&vlapic->lvt_last[idx]);
+ return (val);
+}
+
+void
+vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset)
+{
+ uint32_t *lvtptr, mask, val;
+ struct LAPIC *lapic;
+ int idx;
+
+ lapic = vlapic->apic_page;
+ lvtptr = vlapic_get_lvtptr(vlapic, offset);
+ val = *lvtptr;
+ idx = lvt_off_to_idx(offset);
+
+ if (!(lapic->svr & APIC_SVR_ENABLE))
+ val |= APIC_LVT_M;
+ mask = APIC_LVT_M | APIC_LVT_DS | APIC_LVT_VECTOR;
+ switch (offset) {
+ case APIC_OFFSET_TIMER_LVT:
+ mask |= APIC_LVTT_TM;
+ break;
+ case APIC_OFFSET_ERROR_LVT:
+ break;
+ case APIC_OFFSET_LINT0_LVT:
+ case APIC_OFFSET_LINT1_LVT:
+ mask |= APIC_LVT_TM | APIC_LVT_RIRR | APIC_LVT_IIPP;
+ /* FALLTHROUGH */
+ default:
+ mask |= APIC_LVT_DM;
+ break;
+ }
+ val &= mask;
+ *lvtptr = val;
+ atomic_store_rel_32(&vlapic->lvt_last[idx], val);
+}
+
+static void
+vlapic_mask_lvts(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = vlapic->apic_page;
+
+ lapic->lvt_cmci |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_CMCI_LVT);
+
+ lapic->lvt_timer |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ lapic->lvt_thermal |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_THERM_LVT);
+
+ lapic->lvt_pcint |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_PERF_LVT);
+
+ lapic->lvt_lint0 |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT0_LVT);
+
+ lapic->lvt_lint1 |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_LINT1_LVT);
+
+ lapic->lvt_error |= APIC_LVT_M;
+ vlapic_lvt_write_handler(vlapic, APIC_OFFSET_ERROR_LVT);
+}
+
+static int
+vlapic_fire_lvt(struct vlapic *vlapic, uint32_t lvt)
+{
+ uint32_t vec, mode;
+
+ if (lvt & APIC_LVT_M)
+ return (0);
+
+ vec = lvt & APIC_LVT_VECTOR;
+ mode = lvt & APIC_LVT_DM;
+
+ switch (mode) {
+ case APIC_LVT_DM_FIXED:
+ if (vec < 16) {
+ vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR);
+ return (0);
+ }
+ if (vlapic_set_intr_ready(vlapic, vec, false))
+ vcpu_notify_event(vlapic->vm, vlapic->vcpuid, true);
+ break;
+ case APIC_LVT_DM_NMI:
+ vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
+ break;
+ case APIC_LVT_DM_EXTINT:
+ vm_inject_extint(vlapic->vm, vlapic->vcpuid);
+ break;
+ default:
+ // Other modes ignored
+ return (0);
+ }
+ return (1);
+}
+
+#if 1
+static void
+dump_isrvec_stk(struct vlapic *vlapic)
+{
+ int i;
+ uint32_t *isrptr;
+
+ isrptr = &vlapic->apic_page->isr0;
+ for (i = 0; i < 8; i++)
+ printf("ISR%d 0x%08x\n", i, isrptr[i * 4]);
+
+ for (i = 0; i <= vlapic->isrvec_stk_top; i++)
+ printf("isrvec_stk[%d] = %d\n", i, vlapic->isrvec_stk[i]);
+}
+#endif
+
+/*
+ * Algorithm adopted from section "Interrupt, Task and Processor Priority"
+ * in Intel Architecture Manual Vol 3a.
+ */
+static void
+vlapic_update_ppr(struct vlapic *vlapic)
+{
+ int isrvec, tpr, ppr;
+
+ /*
+ * Note that the value on the stack at index 0 is always 0.
+ *
+ * This is a placeholder for the value of ISRV when none of the
+ * bits is set in the ISRx registers.
+ */
+ isrvec = vlapic->isrvec_stk[vlapic->isrvec_stk_top];
+ tpr = vlapic->apic_page->tpr;
+
+#if 1
+ {
+ int i, lastprio, curprio, vector, idx;
+ uint32_t *isrptr;
+
+ if (vlapic->isrvec_stk_top == 0 && isrvec != 0)
+ panic("isrvec_stk is corrupted: %d", isrvec);
+
+ /*
+ * Make sure that the priority of the nested interrupts is
+ * always increasing.
+ */
+ lastprio = -1;
+ for (i = 1; i <= vlapic->isrvec_stk_top; i++) {
+ curprio = PRIO(vlapic->isrvec_stk[i]);
+ if (curprio <= lastprio) {
+ dump_isrvec_stk(vlapic);
+ panic("isrvec_stk does not satisfy invariant");
+ }
+ lastprio = curprio;
+ }
+
+ /*
+ * Make sure that each bit set in the ISRx registers has a
+ * corresponding entry on the isrvec stack.
+ */
+ i = 1;
+ isrptr = &vlapic->apic_page->isr0;
+ for (vector = 0; vector < 256; vector++) {
+ idx = (vector / 32) * 4;
+ if (isrptr[idx] & (1 << (vector % 32))) {
+ if (i > vlapic->isrvec_stk_top ||
+ vlapic->isrvec_stk[i] != vector) {
+ dump_isrvec_stk(vlapic);
+ panic("ISR and isrvec_stk out of sync");
+ }
+ i++;
+ }
+ }
+ }
+#endif
+
+ if (PRIO(tpr) >= PRIO(isrvec))
+ ppr = tpr;
+ else
+ ppr = isrvec & 0xf0;
+
+ vlapic->apic_page->ppr = ppr;
+ VLAPIC_CTR1(vlapic, "vlapic_update_ppr 0x%02x", ppr);
+}
+
+static void
+vlapic_process_eoi(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = vlapic->apic_page;
+ uint32_t *isrptr, *tmrptr;
+ int i, idx, bitpos, vector;
+
+ isrptr = &lapic->isr0;
+ tmrptr = &lapic->tmr0;
+
+ /*
+ * The x86 architecture reserves the the first 32 vectors for use
+ * by the processor.
+ */
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ bitpos = fls(isrptr[idx]);
+ if (bitpos-- != 0) {
+ if (vlapic->isrvec_stk_top <= 0) {
+ panic("invalid vlapic isrvec_stk_top %d",
+ vlapic->isrvec_stk_top);
+ }
+ isrptr[idx] &= ~(1 << bitpos);
+ VLAPIC_CTR_ISR(vlapic, "vlapic_process_eoi");
+ vlapic->isrvec_stk_top--;
+ vlapic_update_ppr(vlapic);
+ if ((tmrptr[idx] & (1 << bitpos)) != 0) {
+ vector = i * 32 + bitpos;
+ vioapic_process_eoi(vlapic->vm, vlapic->vcpuid,
+ vector);
+ }
+ return;
+ }
+ }
+}
+
+static __inline int
+vlapic_get_lvt_field(uint32_t lvt, uint32_t mask)
+{
+
+ return (lvt & mask);
+}
+
+static __inline int
+vlapic_periodic_timer(struct vlapic *vlapic)
+{
+ uint32_t lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+
+ return (vlapic_get_lvt_field(lvt, APIC_LVTT_TM_PERIODIC));
+}
+
+static VMM_STAT(VLAPIC_INTR_ERROR, "error interrupts generated by vlapic");
+
+void
+vlapic_set_error(struct vlapic *vlapic, uint32_t mask)
+{
+ uint32_t lvt;
+
+ vlapic->esr_pending |= mask;
+ if (vlapic->esr_firing)
+ return;
+ vlapic->esr_firing = 1;
+
+ // The error LVT always uses the fixed delivery mode.
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT);
+ if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) {
+ vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_ERROR, 1);
+ }
+ vlapic->esr_firing = 0;
+}
+
+static VMM_STAT(VLAPIC_INTR_TIMER, "timer interrupts generated by vlapic");
+
+static void
+vlapic_fire_timer(struct vlapic *vlapic)
+{
+ uint32_t lvt;
+
+ KASSERT(VLAPIC_TIMER_LOCKED(vlapic), ("vlapic_fire_timer not locked"));
+
+ // The timer LVT always uses the fixed delivery mode.
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+ if (vlapic_fire_lvt(vlapic, lvt | APIC_LVT_DM_FIXED)) {
+ VLAPIC_CTR0(vlapic, "vlapic timer fired");
+ vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_TIMER, 1);
+ }
+}
+
+static VMM_STAT(VLAPIC_INTR_CMC,
+ "corrected machine check interrupts generated by vlapic");
+
+void
+vlapic_fire_cmci(struct vlapic *vlapic)
+{
+ uint32_t lvt;
+
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT);
+ if (vlapic_fire_lvt(vlapic, lvt)) {
+ vmm_stat_incr(vlapic->vm, vlapic->vcpuid, VLAPIC_INTR_CMC, 1);
+ }
+}
+
+static VMM_STAT_ARRAY(LVTS_TRIGGERRED, VLAPIC_MAXLVT_INDEX + 1,
+ "lvts triggered");
+
+int
+vlapic_trigger_lvt(struct vlapic *vlapic, int vector)
+{
+ uint32_t lvt;
+
+ if (vlapic_enabled(vlapic) == false) {
+ /*
+ * When the local APIC is global/hardware disabled,
+ * LINT[1:0] pins are configured as INTR and NMI pins,
+ * respectively.
+ */
+ switch (vector) {
+ case APIC_LVT_LINT0:
+ vm_inject_extint(vlapic->vm, vlapic->vcpuid);
+ break;
+ case APIC_LVT_LINT1:
+ vm_inject_nmi(vlapic->vm, vlapic->vcpuid);
+ break;
+ default:
+ break;
+ }
+ return (0);
+ }
+
+ switch (vector) {
+ case APIC_LVT_LINT0:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT0_LVT);
+ break;
+ case APIC_LVT_LINT1:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_LINT1_LVT);
+ break;
+ case APIC_LVT_TIMER:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_TIMER_LVT);
+ lvt |= APIC_LVT_DM_FIXED;
+ break;
+ case APIC_LVT_ERROR:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_ERROR_LVT);
+ lvt |= APIC_LVT_DM_FIXED;
+ break;
+ case APIC_LVT_PMC:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_PERF_LVT);
+ break;
+ case APIC_LVT_THERMAL:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_THERM_LVT);
+ break;
+ case APIC_LVT_CMCI:
+ lvt = vlapic_get_lvt(vlapic, APIC_OFFSET_CMCI_LVT);
+ break;
+ default:
+ return (EINVAL);
+ }
+ if (vlapic_fire_lvt(vlapic, lvt)) {
+ vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
+ LVTS_TRIGGERRED, vector, 1);
+ }
+ return (0);
+}
+
+static void
+vlapic_callout_handler(void *arg)
+{
+ struct vlapic *vlapic;
+ struct bintime bt, btnow;
+ sbintime_t rem_sbt;
+
+ vlapic = arg;
+
+ VLAPIC_TIMER_LOCK(vlapic);
+ if (callout_pending(&vlapic->callout)) /* callout was reset */
+ goto done;
+
+ if (!callout_active(&vlapic->callout)) /* callout was stopped */
+ goto done;
+
+ callout_deactivate(&vlapic->callout);
+
+ vlapic_fire_timer(vlapic);
+
+ if (vlapic_periodic_timer(vlapic)) {
+ binuptime(&btnow);
+#ifdef __FreeBSD__
+ KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
+ ("vlapic callout at %#lx.%#lx, expected at %#lx.#%lx",
+ btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
+ vlapic->timer_fire_bt.frac));
+#else
+ KASSERT(bintime_cmp(&btnow, &vlapic->timer_fire_bt, >=),
+ ("vlapic callout at %lx.%lx, expected at %lx.%lx",
+ btnow.sec, btnow.frac, vlapic->timer_fire_bt.sec,
+ vlapic->timer_fire_bt.frac));
+#endif
+
+ /*
+ * Compute the delta between when the timer was supposed to
+ * fire and the present time.
+ */
+ bt = btnow;
+ bintime_sub(&bt, &vlapic->timer_fire_bt);
+
+ rem_sbt = bttosbt(vlapic->timer_period_bt);
+ if (bintime_cmp(&bt, &vlapic->timer_period_bt, <)) {
+ /*
+ * Adjust the time until the next countdown downward
+ * to account for the lost time.
+ */
+ rem_sbt -= bttosbt(bt);
+ } else {
+ /*
+ * If the delta is greater than the timer period then
+ * just reset our time base instead of trying to catch
+ * up.
+ */
+ vlapic->timer_fire_bt = btnow;
+ VLAPIC_CTR2(vlapic, "vlapic timer lagging by %lu "
+ "usecs, period is %lu usecs - resetting time base",
+ bttosbt(bt) / SBT_1US,
+ bttosbt(vlapic->timer_period_bt) / SBT_1US);
+ }
+
+ bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
+ callout_reset_sbt(&vlapic->callout, rem_sbt, 0,
+ vlapic_callout_handler, vlapic, 0);
+ }
+done:
+ VLAPIC_TIMER_UNLOCK(vlapic);
+}
+
+void
+vlapic_icrtmr_write_handler(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic;
+ sbintime_t sbt;
+ uint32_t icr_timer;
+
+ VLAPIC_TIMER_LOCK(vlapic);
+
+ lapic = vlapic->apic_page;
+ icr_timer = lapic->icr_timer;
+
+ vlapic->timer_period_bt = vlapic->timer_freq_bt;
+ bintime_mul(&vlapic->timer_period_bt, icr_timer);
+
+ if (icr_timer != 0) {
+ binuptime(&vlapic->timer_fire_bt);
+ bintime_add(&vlapic->timer_fire_bt, &vlapic->timer_period_bt);
+
+ sbt = bttosbt(vlapic->timer_period_bt);
+ callout_reset_sbt(&vlapic->callout, sbt, 0,
+ vlapic_callout_handler, vlapic, 0);
+ } else
+ callout_stop(&vlapic->callout);
+
+ VLAPIC_TIMER_UNLOCK(vlapic);
+}
+
+/*
+ * This function populates 'dmask' with the set of vcpus that match the
+ * addressing specified by the (dest, phys, lowprio) tuple.
+ *
+ * 'x2apic_dest' specifies whether 'dest' is interpreted as x2APIC (32-bit)
+ * or xAPIC (8-bit) destination field.
+ */
+static void
+vlapic_calcdest(struct vm *vm, cpuset_t *dmask, uint32_t dest, bool phys,
+ bool lowprio, bool x2apic_dest)
+{
+ struct vlapic *vlapic;
+ uint32_t dfr, ldr, ldest, cluster;
+ uint32_t mda_flat_ldest, mda_cluster_ldest, mda_ldest, mda_cluster_id;
+ cpuset_t amask;
+ int vcpuid;
+
+ if ((x2apic_dest && dest == 0xffffffff) ||
+ (!x2apic_dest && dest == 0xff)) {
+ /*
+ * Broadcast in both logical and physical modes.
+ */
+ *dmask = vm_active_cpus(vm);
+ return;
+ }
+
+ if (phys) {
+ /*
+ * Physical mode: destination is APIC ID.
+ */
+ CPU_ZERO(dmask);
+ vcpuid = vm_apicid2vcpuid(vm, dest);
+ if (vcpuid < VM_MAXCPU)
+ CPU_SET(vcpuid, dmask);
+ } else {
+ /*
+ * In the "Flat Model" the MDA is interpreted as an 8-bit wide
+ * bitmask. This model is only avilable in the xAPIC mode.
+ */
+ mda_flat_ldest = dest & 0xff;
+
+ /*
+ * In the "Cluster Model" the MDA is used to identify a
+ * specific cluster and a set of APICs in that cluster.
+ */
+ if (x2apic_dest) {
+ mda_cluster_id = dest >> 16;
+ mda_cluster_ldest = dest & 0xffff;
+ } else {
+ mda_cluster_id = (dest >> 4) & 0xf;
+ mda_cluster_ldest = dest & 0xf;
+ }
+
+ /*
+ * Logical mode: match each APIC that has a bit set
+ * in it's LDR that matches a bit in the ldest.
+ */
+ CPU_ZERO(dmask);
+ amask = vm_active_cpus(vm);
+ while ((vcpuid = CPU_FFS(&amask)) != 0) {
+ vcpuid--;
+ CPU_CLR(vcpuid, &amask);
+
+ vlapic = vm_lapic(vm, vcpuid);
+ dfr = vlapic->apic_page->dfr;
+ ldr = vlapic->apic_page->ldr;
+
+ if ((dfr & APIC_DFR_MODEL_MASK) ==
+ APIC_DFR_MODEL_FLAT) {
+ ldest = ldr >> 24;
+ mda_ldest = mda_flat_ldest;
+ } else if ((dfr & APIC_DFR_MODEL_MASK) ==
+ APIC_DFR_MODEL_CLUSTER) {
+ if (x2apic(vlapic)) {
+ cluster = ldr >> 16;
+ ldest = ldr & 0xffff;
+ } else {
+ cluster = ldr >> 28;
+ ldest = (ldr >> 24) & 0xf;
+ }
+ if (cluster != mda_cluster_id)
+ continue;
+ mda_ldest = mda_cluster_ldest;
+ } else {
+ /*
+ * Guest has configured a bad logical
+ * model for this vcpu - skip it.
+ */
+ VLAPIC_CTR1(vlapic, "vlapic has bad logical "
+ "model %x - cannot deliver interrupt", dfr);
+ continue;
+ }
+
+ if ((mda_ldest & ldest) != 0) {
+ CPU_SET(vcpuid, dmask);
+ if (lowprio)
+ break;
+ }
+ }
+ }
+}
+
+static VMM_STAT_ARRAY(IPIS_SENT, VM_MAXCPU, "ipis sent to vcpu");
+
+static void
+vlapic_set_tpr(struct vlapic *vlapic, uint8_t val)
+{
+ struct LAPIC *lapic = vlapic->apic_page;
+
+ if (lapic->tpr != val) {
+ VCPU_CTR2(vlapic->vm, vlapic->vcpuid, "vlapic TPR changed "
+ "from %#x to %#x", lapic->tpr, val);
+ lapic->tpr = val;
+ vlapic_update_ppr(vlapic);
+ }
+}
+
+static uint8_t
+vlapic_get_tpr(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = vlapic->apic_page;
+
+ return (lapic->tpr);
+}
+
+void
+vlapic_set_cr8(struct vlapic *vlapic, uint64_t val)
+{
+ uint8_t tpr;
+
+ if (val & ~0xf) {
+ vm_inject_gp(vlapic->vm, vlapic->vcpuid);
+ return;
+ }
+
+ tpr = val << 4;
+ vlapic_set_tpr(vlapic, tpr);
+}
+
+uint64_t
+vlapic_get_cr8(struct vlapic *vlapic)
+{
+ uint8_t tpr;
+
+ tpr = vlapic_get_tpr(vlapic);
+ return (tpr >> 4);
+}
+
+int
+vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu)
+{
+ int i;
+ bool phys;
+ cpuset_t dmask;
+ uint64_t icrval;
+ uint32_t dest, vec, mode;
+ struct vlapic *vlapic2;
+ struct vm_exit *vmexit;
+ struct LAPIC *lapic;
+
+ lapic = vlapic->apic_page;
+ lapic->icr_lo &= ~APIC_DELSTAT_PEND;
+ icrval = ((uint64_t)lapic->icr_hi << 32) | lapic->icr_lo;
+
+ if (x2apic(vlapic))
+ dest = icrval >> 32;
+ else
+ dest = icrval >> (32 + 24);
+ vec = icrval & APIC_VECTOR_MASK;
+ mode = icrval & APIC_DELMODE_MASK;
+
+ if (mode == APIC_DELMODE_FIXED && vec < 16) {
+ vlapic_set_error(vlapic, APIC_ESR_SEND_ILLEGAL_VECTOR);
+ VLAPIC_CTR1(vlapic, "Ignoring invalid IPI %d", vec);
+ return (0);
+ }
+
+ VLAPIC_CTR2(vlapic, "icrlo 0x%016lx triggered ipi %d", icrval, vec);
+
+ if (mode == APIC_DELMODE_FIXED || mode == APIC_DELMODE_NMI) {
+ switch (icrval & APIC_DEST_MASK) {
+ case APIC_DEST_DESTFLD:
+ phys = ((icrval & APIC_DESTMODE_LOG) == 0);
+ vlapic_calcdest(vlapic->vm, &dmask, dest, phys, false,
+ x2apic(vlapic));
+ break;
+ case APIC_DEST_SELF:
+ CPU_SETOF(vlapic->vcpuid, &dmask);
+ break;
+ case APIC_DEST_ALLISELF:
+ dmask = vm_active_cpus(vlapic->vm);
+ break;
+ case APIC_DEST_ALLESELF:
+ dmask = vm_active_cpus(vlapic->vm);
+ CPU_CLR(vlapic->vcpuid, &dmask);
+ break;
+ default:
+ CPU_ZERO(&dmask); /* satisfy gcc */
+ break;
+ }
+
+ while ((i = CPU_FFS(&dmask)) != 0) {
+ i--;
+ CPU_CLR(i, &dmask);
+ if (mode == APIC_DELMODE_FIXED) {
+ lapic_intr_edge(vlapic->vm, i, vec);
+ vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid,
+ IPIS_SENT, i, 1);
+ VLAPIC_CTR2(vlapic, "vlapic sending ipi %d "
+ "to vcpuid %d", vec, i);
+ } else {
+ vm_inject_nmi(vlapic->vm, i);
+ VLAPIC_CTR1(vlapic, "vlapic sending ipi nmi "
+ "to vcpuid %d", i);
+ }
+ }
+
+ return (0); /* handled completely in the kernel */
+ }
+
+ if (mode == APIC_DELMODE_INIT) {
+ if ((icrval & APIC_LEVEL_MASK) == APIC_LEVEL_DEASSERT)
+ return (0);
+
+ if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+ vlapic2 = vm_lapic(vlapic->vm, dest);
+
+ /* move from INIT to waiting-for-SIPI state */
+ if (vlapic2->boot_state == BS_INIT) {
+ vlapic2->boot_state = BS_SIPI;
+ }
+
+ return (0);
+ }
+ }
+
+ if (mode == APIC_DELMODE_STARTUP) {
+ if (vlapic->vcpuid == 0 && dest != 0 && dest < VM_MAXCPU) {
+ vlapic2 = vm_lapic(vlapic->vm, dest);
+
+ /*
+ * Ignore SIPIs in any state other than wait-for-SIPI
+ */
+ if (vlapic2->boot_state != BS_SIPI)
+ return (0);
+
+ vlapic2->boot_state = BS_RUNNING;
+
+ *retu = true;
+ vmexit = vm_exitinfo(vlapic->vm, vlapic->vcpuid);
+ vmexit->exitcode = VM_EXITCODE_SPINUP_AP;
+ vmexit->u.spinup_ap.vcpu = dest;
+ vmexit->u.spinup_ap.rip = vec << PAGE_SHIFT;
+
+ return (0);
+ }
+ }
+
+ /*
+ * This will cause a return to userland.
+ */
+ return (1);
+}
+
+void
+vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val)
+{
+ int vec;
+
+ KASSERT(x2apic(vlapic), ("SELF_IPI does not exist in xAPIC mode"));
+
+ vec = val & 0xff;
+ lapic_intr_edge(vlapic->vm, vlapic->vcpuid, vec);
+ vmm_stat_array_incr(vlapic->vm, vlapic->vcpuid, IPIS_SENT,
+ vlapic->vcpuid, 1);
+ VLAPIC_CTR1(vlapic, "vlapic self-ipi %d", vec);
+}
+
+int
+vlapic_pending_intr(struct vlapic *vlapic, int *vecptr)
+{
+ struct LAPIC *lapic = vlapic->apic_page;
+ int idx, i, bitpos, vector;
+ uint32_t *irrptr, val;
+
+ if (vlapic->ops.pending_intr)
+ return ((*vlapic->ops.pending_intr)(vlapic, vecptr));
+
+ irrptr = &lapic->irr0;
+
+ /*
+ * The x86 architecture reserves the the first 32 vectors for use
+ * by the processor.
+ */
+ for (i = 7; i > 0; i--) {
+ idx = i * 4;
+ val = atomic_load_acq_int(&irrptr[idx]);
+ bitpos = fls(val);
+ if (bitpos != 0) {
+ vector = i * 32 + (bitpos - 1);
+ if (PRIO(vector) > PRIO(lapic->ppr)) {
+ VLAPIC_CTR1(vlapic, "pending intr %d", vector);
+ if (vecptr != NULL)
+ *vecptr = vector;
+ return (1);
+ } else
+ break;
+ }
+ }
+ return (0);
+}
+
+void
+vlapic_intr_accepted(struct vlapic *vlapic, int vector)
+{
+ struct LAPIC *lapic = vlapic->apic_page;
+ uint32_t *irrptr, *isrptr;
+ int idx, stk_top;
+
+ if (vlapic->ops.intr_accepted)
+ return ((*vlapic->ops.intr_accepted)(vlapic, vector));
+
+ /*
+ * clear the ready bit for vector being accepted in irr
+ * and set the vector as in service in isr.
+ */
+ idx = (vector / 32) * 4;
+
+ irrptr = &lapic->irr0;
+ atomic_clear_int(&irrptr[idx], 1 << (vector % 32));
+ VLAPIC_CTR_IRR(vlapic, "vlapic_intr_accepted");
+
+ isrptr = &lapic->isr0;
+ isrptr[idx] |= 1 << (vector % 32);
+ VLAPIC_CTR_ISR(vlapic, "vlapic_intr_accepted");
+
+ /*
+ * Update the PPR
+ */
+ vlapic->isrvec_stk_top++;
+
+ stk_top = vlapic->isrvec_stk_top;
+ if (stk_top >= ISRVEC_STK_SIZE)
+ panic("isrvec_stk_top overflow %d", stk_top);
+
+ vlapic->isrvec_stk[stk_top] = vector;
+ vlapic_update_ppr(vlapic);
+}
+
+void
+vlapic_svr_write_handler(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic;
+ uint32_t old, new, changed;
+
+ lapic = vlapic->apic_page;
+
+ new = lapic->svr;
+ old = vlapic->svr_last;
+ vlapic->svr_last = new;
+
+ changed = old ^ new;
+ if ((changed & APIC_SVR_ENABLE) != 0) {
+ if ((new & APIC_SVR_ENABLE) == 0) {
+ /*
+ * The apic is now disabled so stop the apic timer
+ * and mask all the LVT entries.
+ */
+ VLAPIC_CTR0(vlapic, "vlapic is software-disabled");
+ VLAPIC_TIMER_LOCK(vlapic);
+ callout_stop(&vlapic->callout);
+ VLAPIC_TIMER_UNLOCK(vlapic);
+ vlapic_mask_lvts(vlapic);
+ } else {
+ /*
+ * The apic is now enabled so restart the apic timer
+ * if it is configured in periodic mode.
+ */
+ VLAPIC_CTR0(vlapic, "vlapic is software-enabled");
+ if (vlapic_periodic_timer(vlapic))
+ vlapic_icrtmr_write_handler(vlapic);
+ }
+ }
+}
+
+int
+vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset,
+ uint64_t *data, bool *retu)
+{
+ struct LAPIC *lapic = vlapic->apic_page;
+ uint32_t *reg;
+ int i;
+
+ /* Ignore MMIO accesses in x2APIC mode */
+ if (x2apic(vlapic) && mmio_access) {
+ VLAPIC_CTR1(vlapic, "MMIO read from offset %#lx in x2APIC mode",
+ offset);
+ *data = 0;
+ goto done;
+ }
+
+ if (!x2apic(vlapic) && !mmio_access) {
+ /*
+ * XXX Generate GP fault for MSR accesses in xAPIC mode
+ */
+ VLAPIC_CTR1(vlapic, "x2APIC MSR read from offset %#lx in "
+ "xAPIC mode", offset);
+ *data = 0;
+ goto done;
+ }
+
+ if (offset > sizeof(*lapic)) {
+ *data = 0;
+ goto done;
+ }
+
+ offset &= ~3;
+ switch(offset)
+ {
+ case APIC_OFFSET_ID:
+ *data = lapic->id;
+ break;
+ case APIC_OFFSET_VER:
+ *data = lapic->version;
+ break;
+ case APIC_OFFSET_TPR:
+ *data = vlapic_get_tpr(vlapic);
+ break;
+ case APIC_OFFSET_APR:
+ *data = lapic->apr;
+ break;
+ case APIC_OFFSET_PPR:
+ *data = lapic->ppr;
+ break;
+ case APIC_OFFSET_EOI:
+ *data = lapic->eoi;
+ break;
+ case APIC_OFFSET_LDR:
+ *data = lapic->ldr;
+ break;
+ case APIC_OFFSET_DFR:
+ *data = lapic->dfr;
+ break;
+ case APIC_OFFSET_SVR:
+ *data = lapic->svr;
+ break;
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ i = (offset - APIC_OFFSET_ISR0) >> 2;
+ reg = &lapic->isr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ i = (offset - APIC_OFFSET_TMR0) >> 2;
+ reg = &lapic->tmr0;
+ *data = *(reg + i);
+ break;
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ i = (offset - APIC_OFFSET_IRR0) >> 2;
+ reg = &lapic->irr0;
+ *data = atomic_load_acq_int(reg + i);
+ break;
+ case APIC_OFFSET_ESR:
+ *data = lapic->esr;
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ *data = lapic->icr_lo;
+ if (x2apic(vlapic))
+ *data |= (uint64_t)lapic->icr_hi << 32;
+ break;
+ case APIC_OFFSET_ICR_HI:
+ *data = lapic->icr_hi;
+ break;
+ case APIC_OFFSET_CMCI_LVT:
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ *data = vlapic_get_lvt(vlapic, offset);
+#ifdef INVARIANTS
+ reg = vlapic_get_lvtptr(vlapic, offset);
+ KASSERT(*data == *reg, ("inconsistent lvt value at "
+ "offset %#lx: %#lx/%#x", offset, *data, *reg));
+#endif
+ break;
+ case APIC_OFFSET_TIMER_ICR:
+ *data = lapic->icr_timer;
+ break;
+ case APIC_OFFSET_TIMER_CCR:
+ *data = vlapic_get_ccr(vlapic);
+ break;
+ case APIC_OFFSET_TIMER_DCR:
+ *data = lapic->dcr_timer;
+ break;
+ case APIC_OFFSET_SELF_IPI:
+ /*
+ * XXX generate a GP fault if vlapic is in x2apic mode
+ */
+ *data = 0;
+ break;
+ case APIC_OFFSET_RRR:
+ default:
+ *data = 0;
+ break;
+ }
+done:
+ VLAPIC_CTR2(vlapic, "vlapic read offset %#x, data %#lx", offset, *data);
+ return 0;
+}
+
+int
+vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset,
+ uint64_t data, bool *retu)
+{
+ struct LAPIC *lapic = vlapic->apic_page;
+ uint32_t *regptr;
+ int retval;
+
+#ifdef __FreeBSD__
+ KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE,
+ ("vlapic_write: invalid offset %#lx", offset));
+#else
+ KASSERT((offset & 0xf) == 0 && offset < PAGE_SIZE,
+ ("vlapic_write: invalid offset %lx", offset));
+#endif
+
+ VLAPIC_CTR2(vlapic, "vlapic write offset %#lx, data %#lx",
+ offset, data);
+
+ if (offset > sizeof(*lapic))
+ return (0);
+
+ /* Ignore MMIO accesses in x2APIC mode */
+ if (x2apic(vlapic) && mmio_access) {
+ VLAPIC_CTR2(vlapic, "MMIO write of %#lx to offset %#lx "
+ "in x2APIC mode", data, offset);
+ return (0);
+ }
+
+ /*
+ * XXX Generate GP fault for MSR accesses in xAPIC mode
+ */
+ if (!x2apic(vlapic) && !mmio_access) {
+ VLAPIC_CTR2(vlapic, "x2APIC MSR write of %#lx to offset %#lx "
+ "in xAPIC mode", data, offset);
+ return (0);
+ }
+
+ retval = 0;
+ switch(offset)
+ {
+ case APIC_OFFSET_ID:
+ lapic->id = data;
+ vlapic_id_write_handler(vlapic);
+ break;
+ case APIC_OFFSET_TPR:
+ vlapic_set_tpr(vlapic, data & 0xff);
+ break;
+ case APIC_OFFSET_EOI:
+ vlapic_process_eoi(vlapic);
+ break;
+ case APIC_OFFSET_LDR:
+ lapic->ldr = data;
+ vlapic_ldr_write_handler(vlapic);
+ break;
+ case APIC_OFFSET_DFR:
+ lapic->dfr = data;
+ vlapic_dfr_write_handler(vlapic);
+ break;
+ case APIC_OFFSET_SVR:
+ lapic->svr = data;
+ vlapic_svr_write_handler(vlapic);
+ break;
+ case APIC_OFFSET_ICR_LOW:
+ lapic->icr_lo = data;
+ if (x2apic(vlapic))
+ lapic->icr_hi = data >> 32;
+ retval = vlapic_icrlo_write_handler(vlapic, retu);
+ break;
+ case APIC_OFFSET_ICR_HI:
+ lapic->icr_hi = data;
+ break;
+ case APIC_OFFSET_CMCI_LVT:
+ case APIC_OFFSET_TIMER_LVT ... APIC_OFFSET_ERROR_LVT:
+ regptr = vlapic_get_lvtptr(vlapic, offset);
+ *regptr = data;
+ vlapic_lvt_write_handler(vlapic, offset);
+ break;
+ case APIC_OFFSET_TIMER_ICR:
+ lapic->icr_timer = data;
+ vlapic_icrtmr_write_handler(vlapic);
+ break;
+
+ case APIC_OFFSET_TIMER_DCR:
+ lapic->dcr_timer = data;
+ vlapic_dcr_write_handler(vlapic);
+ break;
+
+ case APIC_OFFSET_ESR:
+ vlapic_esr_write_handler(vlapic);
+ break;
+
+ case APIC_OFFSET_SELF_IPI:
+ if (x2apic(vlapic))
+ vlapic_self_ipi_handler(vlapic, data);
+ break;
+
+ case APIC_OFFSET_VER:
+ case APIC_OFFSET_APR:
+ case APIC_OFFSET_PPR:
+ case APIC_OFFSET_RRR:
+ case APIC_OFFSET_ISR0 ... APIC_OFFSET_ISR7:
+ case APIC_OFFSET_TMR0 ... APIC_OFFSET_TMR7:
+ case APIC_OFFSET_IRR0 ... APIC_OFFSET_IRR7:
+ case APIC_OFFSET_TIMER_CCR:
+ default:
+ // Read only.
+ break;
+ }
+
+ return (retval);
+}
+
+static void
+vlapic_reset(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic;
+
+ lapic = vlapic->apic_page;
+ bzero(lapic, sizeof(struct LAPIC));
+
+ lapic->id = vlapic_get_id(vlapic);
+ lapic->version = VLAPIC_VERSION;
+ lapic->version |= (VLAPIC_MAXLVT_INDEX << MAXLVTSHIFT);
+ lapic->dfr = 0xffffffff;
+ lapic->svr = APIC_SVR_VECTOR;
+ vlapic_mask_lvts(vlapic);
+ vlapic_reset_tmr(vlapic);
+
+ lapic->dcr_timer = 0;
+ vlapic_dcr_write_handler(vlapic);
+
+ if (vlapic->vcpuid == 0)
+ vlapic->boot_state = BS_RUNNING; /* BSP */
+ else
+ vlapic->boot_state = BS_INIT; /* AP */
+
+ vlapic->svr_last = lapic->svr;
+}
+
+void
+vlapic_init(struct vlapic *vlapic)
+{
+ KASSERT(vlapic->vm != NULL, ("vlapic_init: vm is not initialized"));
+ KASSERT(vlapic->vcpuid >= 0 && vlapic->vcpuid < VM_MAXCPU,
+ ("vlapic_init: vcpuid is not initialized"));
+ KASSERT(vlapic->apic_page != NULL, ("vlapic_init: apic_page is not "
+ "initialized"));
+
+ /*
+ * If the vlapic is configured in x2apic mode then it will be
+ * accessed in the critical section via the MSR emulation code.
+ *
+ * Therefore the timer mutex must be a spinlock because blockable
+ * mutexes cannot be acquired in a critical section.
+ */
+ mtx_init(&vlapic->timer_mtx, "vlapic timer mtx", NULL, MTX_SPIN);
+ callout_init(&vlapic->callout, 1);
+
+ vlapic->msr_apicbase = DEFAULT_APIC_BASE | APICBASE_ENABLED;
+
+ if (vlapic->vcpuid == 0)
+ vlapic->msr_apicbase |= APICBASE_BSP;
+
+ vlapic_reset(vlapic);
+}
+
+void
+vlapic_cleanup(struct vlapic *vlapic)
+{
+
+ callout_drain(&vlapic->callout);
+}
+
+uint64_t
+vlapic_get_apicbase(struct vlapic *vlapic)
+{
+
+ return (vlapic->msr_apicbase);
+}
+
+int
+vlapic_set_apicbase(struct vlapic *vlapic, uint64_t new)
+{
+
+ if (vlapic->msr_apicbase != new) {
+ VLAPIC_CTR2(vlapic, "Changing APIC_BASE MSR from %#lx to %#lx "
+ "not supported", vlapic->msr_apicbase, new);
+ return (-1);
+ }
+
+ return (0);
+}
+
+void
+vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
+{
+ struct vlapic *vlapic;
+ struct LAPIC *lapic;
+
+ vlapic = vm_lapic(vm, vcpuid);
+
+ if (state == X2APIC_DISABLED)
+ vlapic->msr_apicbase &= ~APICBASE_X2APIC;
+ else
+ vlapic->msr_apicbase |= APICBASE_X2APIC;
+
+ /*
+ * Reset the local APIC registers whose values are mode-dependent.
+ *
+ * XXX this works because the APIC mode can be changed only at vcpu
+ * initialization time.
+ */
+ lapic = vlapic->apic_page;
+ lapic->id = vlapic_get_id(vlapic);
+ if (x2apic(vlapic)) {
+ lapic->ldr = x2apic_ldr(vlapic);
+ lapic->dfr = 0;
+ } else {
+ lapic->ldr = 0;
+ lapic->dfr = 0xffffffff;
+ }
+
+ if (state == X2APIC_ENABLED) {
+ if (vlapic->ops.enable_x2apic_mode)
+ (*vlapic->ops.enable_x2apic_mode)(vlapic);
+ }
+}
+
+void
+vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
+ int delmode, int vec)
+{
+ bool lowprio;
+ int vcpuid;
+ cpuset_t dmask;
+
+ if (delmode != IOART_DELFIXED &&
+ delmode != IOART_DELLOPRI &&
+ delmode != IOART_DELEXINT) {
+ VM_CTR1(vm, "vlapic intr invalid delmode %#x", delmode);
+ return;
+ }
+ lowprio = (delmode == IOART_DELLOPRI);
+
+ /*
+ * We don't provide any virtual interrupt redirection hardware so
+ * all interrupts originating from the ioapic or MSI specify the
+ * 'dest' in the legacy xAPIC format.
+ */
+ vlapic_calcdest(vm, &dmask, dest, phys, lowprio, false);
+
+ while ((vcpuid = CPU_FFS(&dmask)) != 0) {
+ vcpuid--;
+ CPU_CLR(vcpuid, &dmask);
+ if (delmode == IOART_DELEXINT) {
+ vm_inject_extint(vm, vcpuid);
+ } else {
+ lapic_set_intr(vm, vcpuid, vec, level);
+ }
+ }
+}
+
+void
+vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum)
+{
+ /*
+ * Post an interrupt to the vcpu currently running on 'hostcpu'.
+ *
+ * This is done by leveraging features like Posted Interrupts (Intel)
+ * Doorbell MSR (AMD AVIC) that avoid a VM exit.
+ *
+ * If neither of these features are available then fallback to
+ * sending an IPI to 'hostcpu'.
+ */
+ if (vlapic->ops.post_intr)
+ (*vlapic->ops.post_intr)(vlapic, hostcpu);
+ else
+ ipi_cpu(hostcpu, ipinum);
+}
+
+bool
+vlapic_enabled(struct vlapic *vlapic)
+{
+ struct LAPIC *lapic = vlapic->apic_page;
+
+ if ((vlapic->msr_apicbase & APICBASE_ENABLED) != 0 &&
+ (lapic->svr & APIC_SVR_ENABLE) != 0)
+ return (true);
+ else
+ return (false);
+}
+
+static void
+vlapic_set_tmr(struct vlapic *vlapic, int vector, bool level)
+{
+ struct LAPIC *lapic;
+ uint32_t *tmrptr, mask;
+ int idx;
+
+ lapic = vlapic->apic_page;
+ tmrptr = &lapic->tmr0;
+ idx = (vector / 32) * 4;
+ mask = 1 << (vector % 32);
+ if (level)
+ tmrptr[idx] |= mask;
+ else
+ tmrptr[idx] &= ~mask;
+
+ if (vlapic->ops.set_tmr != NULL)
+ (*vlapic->ops.set_tmr)(vlapic, vector, level);
+}
+
+void
+vlapic_reset_tmr(struct vlapic *vlapic)
+{
+ int vector;
+
+ VLAPIC_CTR0(vlapic, "vlapic resetting all vectors to edge-triggered");
+
+ for (vector = 0; vector <= 255; vector++)
+ vlapic_set_tmr(vlapic, vector, false);
+}
+
+void
+vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
+ int delmode, int vector)
+{
+ cpuset_t dmask;
+ bool lowprio;
+
+ KASSERT(vector >= 0 && vector <= 255, ("invalid vector %d", vector));
+
+ /*
+ * A level trigger is valid only for fixed and lowprio delivery modes.
+ */
+ if (delmode != APIC_DELMODE_FIXED && delmode != APIC_DELMODE_LOWPRIO) {
+ VLAPIC_CTR1(vlapic, "Ignoring level trigger-mode for "
+ "delivery-mode %d", delmode);
+ return;
+ }
+
+ lowprio = (delmode == APIC_DELMODE_LOWPRIO);
+ vlapic_calcdest(vlapic->vm, &dmask, dest, phys, lowprio, false);
+
+ if (!CPU_ISSET(vlapic->vcpuid, &dmask))
+ return;
+
+ VLAPIC_CTR1(vlapic, "vector %d set to level-triggered", vector);
+ vlapic_set_tmr(vlapic, vector, true);
+}
diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic.h b/usr/src/uts/i86pc/io/vmm/io/vlapic.h
new file mode 100644
index 0000000000..3fa705d818
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/vlapic.h
@@ -0,0 +1,109 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/io/vlapic.h 262281 2014-02-21 06:03:54Z neel $
+ */
+
+#ifndef _VLAPIC_H_
+#define _VLAPIC_H_
+
+struct vm;
+enum x2apic_state;
+
+int vlapic_write(struct vlapic *vlapic, int mmio_access, uint64_t offset,
+ uint64_t data, bool *retu);
+int vlapic_read(struct vlapic *vlapic, int mmio_access, uint64_t offset,
+ uint64_t *data, bool *retu);
+
+/*
+ * Returns 0 if there is no eligible vector that can be delivered to the
+ * guest at this time and non-zero otherwise.
+ *
+ * If an eligible vector number is found and 'vecptr' is not NULL then it will
+ * be stored in the location pointed to by 'vecptr'.
+ *
+ * Note that the vector does not automatically transition to the ISR as a
+ * result of calling this function.
+ */
+int vlapic_pending_intr(struct vlapic *vlapic, int *vecptr);
+
+/*
+ * Transition 'vector' from IRR to ISR. This function is called with the
+ * vector returned by 'vlapic_pending_intr()' when the guest is able to
+ * accept this interrupt (i.e. RFLAGS.IF = 1 and no conditions exist that
+ * block interrupt delivery).
+ */
+void vlapic_intr_accepted(struct vlapic *vlapic, int vector);
+
+/*
+ * Returns 1 if the vcpu needs to be notified of the interrupt and 0 otherwise.
+ */
+int vlapic_set_intr_ready(struct vlapic *vlapic, int vector, bool level);
+
+/*
+ * Post an interrupt to the vcpu running on 'hostcpu'. This will use a
+ * hardware assist if available (e.g. Posted Interrupt) or fall back to
+ * sending an 'ipinum' to interrupt the 'hostcpu'.
+ */
+void vlapic_post_intr(struct vlapic *vlapic, int hostcpu, int ipinum);
+
+void vlapic_set_error(struct vlapic *vlapic, uint32_t mask);
+void vlapic_fire_cmci(struct vlapic *vlapic);
+int vlapic_trigger_lvt(struct vlapic *vlapic, int vector);
+
+uint64_t vlapic_get_apicbase(struct vlapic *vlapic);
+int vlapic_set_apicbase(struct vlapic *vlapic, uint64_t val);
+void vlapic_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state s);
+bool vlapic_enabled(struct vlapic *vlapic);
+
+void vlapic_deliver_intr(struct vm *vm, bool level, uint32_t dest, bool phys,
+ int delmode, int vec);
+
+/* Reset the trigger-mode bits for all vectors to be edge-triggered */
+void vlapic_reset_tmr(struct vlapic *vlapic);
+
+/*
+ * Set the trigger-mode bit associated with 'vector' to level-triggered if
+ * the (dest,phys,delmode) tuple resolves to an interrupt being delivered to
+ * this 'vlapic'.
+ */
+void vlapic_set_tmr_level(struct vlapic *vlapic, uint32_t dest, bool phys,
+ int delmode, int vector);
+
+void vlapic_set_cr8(struct vlapic *vlapic, uint64_t val);
+uint64_t vlapic_get_cr8(struct vlapic *vlapic);
+
+/* APIC write handlers */
+void vlapic_id_write_handler(struct vlapic *vlapic);
+void vlapic_ldr_write_handler(struct vlapic *vlapic);
+void vlapic_dfr_write_handler(struct vlapic *vlapic);
+void vlapic_svr_write_handler(struct vlapic *vlapic);
+void vlapic_esr_write_handler(struct vlapic *vlapic);
+int vlapic_icrlo_write_handler(struct vlapic *vlapic, bool *retu);
+void vlapic_icrtmr_write_handler(struct vlapic *vlapic);
+void vlapic_dcr_write_handler(struct vlapic *vlapic);
+void vlapic_lvt_write_handler(struct vlapic *vlapic, uint32_t offset);
+void vlapic_self_ipi_handler(struct vlapic *vlapic, uint64_t val);
+#endif /* _VLAPIC_H_ */
diff --git a/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h
new file mode 100644
index 0000000000..f9bd2e0e8b
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/io/vlapic_priv.h
@@ -0,0 +1,190 @@
+/*-
+ * Copyright (c) 2013 Neel Natu <neel@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/io/vlapic_priv.h 263211 2014-03-15 23:09:34Z tychon $
+ */
+
+#ifndef _VLAPIC_PRIV_H_
+#define _VLAPIC_PRIV_H_
+
+#include <x86/apicreg.h>
+
+/*
+ * APIC Register: Offset Description
+ */
+#define APIC_OFFSET_ID 0x20 /* Local APIC ID */
+#define APIC_OFFSET_VER 0x30 /* Local APIC Version */
+#define APIC_OFFSET_TPR 0x80 /* Task Priority Register */
+#define APIC_OFFSET_APR 0x90 /* Arbitration Priority */
+#define APIC_OFFSET_PPR 0xA0 /* Processor Priority Register */
+#define APIC_OFFSET_EOI 0xB0 /* EOI Register */
+#define APIC_OFFSET_RRR 0xC0 /* Remote read */
+#define APIC_OFFSET_LDR 0xD0 /* Logical Destination */
+#define APIC_OFFSET_DFR 0xE0 /* Destination Format Register */
+#define APIC_OFFSET_SVR 0xF0 /* Spurious Vector Register */
+#define APIC_OFFSET_ISR0 0x100 /* In Service Register */
+#define APIC_OFFSET_ISR1 0x110
+#define APIC_OFFSET_ISR2 0x120
+#define APIC_OFFSET_ISR3 0x130
+#define APIC_OFFSET_ISR4 0x140
+#define APIC_OFFSET_ISR5 0x150
+#define APIC_OFFSET_ISR6 0x160
+#define APIC_OFFSET_ISR7 0x170
+#define APIC_OFFSET_TMR0 0x180 /* Trigger Mode Register */
+#define APIC_OFFSET_TMR1 0x190
+#define APIC_OFFSET_TMR2 0x1A0
+#define APIC_OFFSET_TMR3 0x1B0
+#define APIC_OFFSET_TMR4 0x1C0
+#define APIC_OFFSET_TMR5 0x1D0
+#define APIC_OFFSET_TMR6 0x1E0
+#define APIC_OFFSET_TMR7 0x1F0
+#define APIC_OFFSET_IRR0 0x200 /* Interrupt Request Register */
+#define APIC_OFFSET_IRR1 0x210
+#define APIC_OFFSET_IRR2 0x220
+#define APIC_OFFSET_IRR3 0x230
+#define APIC_OFFSET_IRR4 0x240
+#define APIC_OFFSET_IRR5 0x250
+#define APIC_OFFSET_IRR6 0x260
+#define APIC_OFFSET_IRR7 0x270
+#define APIC_OFFSET_ESR 0x280 /* Error Status Register */
+#define APIC_OFFSET_CMCI_LVT 0x2F0 /* Local Vector Table (CMCI) */
+#define APIC_OFFSET_ICR_LOW 0x300 /* Interrupt Command Register */
+#define APIC_OFFSET_ICR_HI 0x310
+#define APIC_OFFSET_TIMER_LVT 0x320 /* Local Vector Table (Timer) */
+#define APIC_OFFSET_THERM_LVT 0x330 /* Local Vector Table (Thermal) */
+#define APIC_OFFSET_PERF_LVT 0x340 /* Local Vector Table (PMC) */
+#define APIC_OFFSET_LINT0_LVT 0x350 /* Local Vector Table (LINT0) */
+#define APIC_OFFSET_LINT1_LVT 0x360 /* Local Vector Table (LINT1) */
+#define APIC_OFFSET_ERROR_LVT 0x370 /* Local Vector Table (ERROR) */
+#define APIC_OFFSET_TIMER_ICR 0x380 /* Timer's Initial Count */
+#define APIC_OFFSET_TIMER_CCR 0x390 /* Timer's Current Count */
+#define APIC_OFFSET_TIMER_DCR 0x3E0 /* Timer's Divide Configuration */
+#define APIC_OFFSET_SELF_IPI 0x3F0 /* Self IPI register */
+
+#define VLAPIC_CTR0(vlapic, format) \
+ VCPU_CTR0((vlapic)->vm, (vlapic)->vcpuid, format)
+
+#define VLAPIC_CTR1(vlapic, format, p1) \
+ VCPU_CTR1((vlapic)->vm, (vlapic)->vcpuid, format, p1)
+
+#define VLAPIC_CTR2(vlapic, format, p1, p2) \
+ VCPU_CTR2((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2)
+
+#define VLAPIC_CTR3(vlapic, format, p1, p2, p3) \
+ VCPU_CTR3((vlapic)->vm, (vlapic)->vcpuid, format, p1, p2, p3)
+
+#define VLAPIC_CTR_IRR(vlapic, msg) \
+do { \
+ uint32_t *irrptr = &(vlapic)->apic_page->irr0; \
+ irrptr[0] = irrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " irr0 0x%08x", irrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr1 0x%08x", irrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr2 0x%08x", irrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr3 0x%08x", irrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr4 0x%08x", irrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr5 0x%08x", irrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr6 0x%08x", irrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " irr7 0x%08x", irrptr[7 << 2]); \
+} while (0)
+
+#define VLAPIC_CTR_ISR(vlapic, msg) \
+do { \
+ uint32_t *isrptr = &(vlapic)->apic_page->isr0; \
+ isrptr[0] = isrptr[0]; /* silence compiler */ \
+ VLAPIC_CTR1((vlapic), msg " isr0 0x%08x", isrptr[0 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr1 0x%08x", isrptr[1 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr2 0x%08x", isrptr[2 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr3 0x%08x", isrptr[3 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr4 0x%08x", isrptr[4 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr5 0x%08x", isrptr[5 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr6 0x%08x", isrptr[6 << 2]); \
+ VLAPIC_CTR1((vlapic), msg " isr7 0x%08x", isrptr[7 << 2]); \
+} while (0)
+
+enum boot_state {
+ BS_INIT,
+ BS_SIPI,
+ BS_RUNNING
+};
+
+/*
+ * 16 priority levels with at most one vector injected per level.
+ */
+#define ISRVEC_STK_SIZE (16 + 1)
+
+#define VLAPIC_MAXLVT_INDEX APIC_LVT_CMCI
+
+struct vlapic;
+
+struct vlapic_ops {
+ int (*set_intr_ready)(struct vlapic *vlapic, int vector, bool level);
+ int (*pending_intr)(struct vlapic *vlapic, int *vecptr);
+ void (*intr_accepted)(struct vlapic *vlapic, int vector);
+ void (*post_intr)(struct vlapic *vlapic, int hostcpu);
+ void (*set_tmr)(struct vlapic *vlapic, int vector, bool level);
+ void (*enable_x2apic_mode)(struct vlapic *vlapic);
+};
+
+struct vlapic {
+ struct vm *vm;
+ int vcpuid;
+ struct LAPIC *apic_page;
+ struct vlapic_ops ops;
+
+ uint32_t esr_pending;
+ int esr_firing;
+
+ struct callout callout; /* vlapic timer */
+ struct bintime timer_fire_bt; /* callout expiry time */
+ struct bintime timer_freq_bt; /* timer frequency */
+ struct bintime timer_period_bt; /* timer period */
+ struct mtx timer_mtx;
+
+ /*
+ * The 'isrvec_stk' is a stack of vectors injected by the local apic.
+ * A vector is popped from the stack when the processor does an EOI.
+ * The vector on the top of the stack is used to compute the
+ * Processor Priority in conjunction with the TPR.
+ */
+ uint8_t isrvec_stk[ISRVEC_STK_SIZE];
+ int isrvec_stk_top;
+
+ uint64_t msr_apicbase;
+ enum boot_state boot_state;
+
+ /*
+ * Copies of some registers in the virtual APIC page. We do this for
+ * a couple of different reasons:
+ * - to be able to detect what changed (e.g. svr_last)
+ * - to maintain a coherent snapshot of the register (e.g. lvt_last)
+ */
+ uint32_t svr_last;
+ uint32_t lvt_last[VLAPIC_MAXLVT_INDEX + 1];
+};
+
+void vlapic_init(struct vlapic *vlapic);
+void vlapic_cleanup(struct vlapic *vlapic);
+
+#endif /* _VLAPIC_PRIV_H_ */
diff --git a/usr/src/uts/i86pc/io/vmm/offsets.in b/usr/src/uts/i86pc/io/vmm/offsets.in
new file mode 100644
index 0000000000..4b1fe1d6b6
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/offsets.in
@@ -0,0 +1,72 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/cpuvar.h>
+
+#include <machine/pmap.h>
+
+#include <machine/vmm.h>
+#include "intel/vmx_cpufunc.h"
+#include "intel/vmx.h"
+
+vmxctx
+ tmpstktop VMXCTX_TMPSTKTOP
+ guest_rdi VMXCTX_GUEST_RDI
+ guest_rsi VMXCTX_GUEST_RSI
+ guest_rdx VMXCTX_GUEST_RDX
+ guest_rcx VMXCTX_GUEST_RCX
+ guest_r8 VMXCTX_GUEST_R8
+ guest_r9 VMXCTX_GUEST_R9
+ guest_rax VMXCTX_GUEST_RAX
+ guest_rbx VMXCTX_GUEST_RBX
+ guest_rbp VMXCTX_GUEST_RBP
+ guest_r10 VMXCTX_GUEST_R10
+ guest_r11 VMXCTX_GUEST_R11
+ guest_r12 VMXCTX_GUEST_R12
+ guest_r13 VMXCTX_GUEST_R13
+ guest_r14 VMXCTX_GUEST_R14
+ guest_r15 VMXCTX_GUEST_R15
+ guest_cr2 VMXCTX_GUEST_CR2
+ host_r15 VMXCTX_HOST_R15
+ host_r14 VMXCTX_HOST_R14
+ host_r13 VMXCTX_HOST_R13
+ host_r12 VMXCTX_HOST_R12
+ host_rbp VMXCTX_HOST_RBP
+ host_rsp VMXCTX_HOST_RSP
+ host_rbx VMXCTX_HOST_RBX
+ host_rip VMXCTX_HOST_RIP
+ launch_error VMXCTX_LAUNCH_ERROR
+
+vmx VMX_SIZE
+
+\#define VM_SUCCESS 0
+\#define VM_FAIL_INVALID 1
+\#define VM_FAIL_VALID 2
+
+\#define VMX_RETURN_DIRECT 0
+\#define VMX_RETURN_LONGJMP 1
+\#define VMX_RETURN_VMRESUME 2
+\#define VMX_RETURN_VMLAUNCH 3
+\#define VMX_RETURN_AST 4
+
+cpu
+ cpu_thread
+
+_kthread
+ t_lwp
+ _tu._ts._t_astflag T_ASTFLAG
diff --git a/usr/src/uts/i86pc/io/vmm/vmm.c b/usr/src/uts/i86pc/io/vmm/vmm.c
new file mode 100644
index 0000000000..7081368f4a
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm.c
@@ -0,0 +1,1894 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm.c 280929 2015-04-01 00:15:31Z tychon $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/module.h>
+#include <sys/sysctl.h>
+#include <sys/malloc.h>
+#include <sys/pcpu.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/proc.h>
+#include <sys/sched.h>
+#include <sys/smp.h>
+#include <x86/psl.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+
+#include <machine/vm.h>
+#include <machine/pcb.h>
+#include <machine/smp.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_dev.h>
+#include <machine/vmm_instruction_emul.h>
+
+#include "vmm_ioport.h"
+#include "vmm_ktr.h"
+#include "vmm_host.h"
+#include "vmm_mem.h"
+#include "vmm_util.h"
+#include "vatpic.h"
+#include "vatpit.h"
+#include "vhpet.h"
+#include "vioapic.h"
+#include "vlapic.h"
+#include "vmm_ipi.h"
+#include "vmm_stat.h"
+#include "vmm_lapic.h"
+
+#ifdef __FreeBSD__
+#include "io/ppt.h"
+#include "io/iommu.h"
+#endif
+
+struct vhpet;
+struct vioapic;
+struct vlapic;
+
+struct vcpu {
+ int flags;
+ enum vcpu_state state;
+ struct mtx mtx;
+ int hostcpu; /* host cpuid this vcpu last ran on */
+ struct vlapic *vlapic;
+ int vcpuid;
+ struct savefpu *guestfpu; /* guest fpu state */
+ void *stats;
+ struct vm_exit exitinfo;
+ uint64_t nextrip; /* (x) next instruction to execute */
+ enum x2apic_state x2apic_state;
+ uint64_t exitintinfo;
+ int nmi_pending;
+ int extint_pending;
+ struct vm_exception exception;
+ int exception_pending;
+};
+
+#define vcpu_lock_init(v) mtx_init(&((v)->mtx), "vcpu lock", 0, MTX_SPIN)
+#define vcpu_lock(v) mtx_lock_spin(&((v)->mtx))
+#define vcpu_unlock(v) mtx_unlock_spin(&((v)->mtx))
+#define vcpu_assert_locked(v) mtx_assert(&((v)->mtx), MA_OWNED)
+
+#define VM_MAX_MEMORY_SEGMENTS 8
+
+struct vm {
+ void *cookie; /* processor-specific data */
+ void *iommu; /* iommu-specific data */
+ struct vcpu vcpu[VM_MAXCPU];
+ struct vhpet *vhpet;
+ struct vioapic *vioapic; /* virtual ioapic */
+ struct vatpic *vatpic; /* virtual atpic */
+ struct vatpit *vatpit; /* virtual atpit */
+ int num_mem_segs;
+ struct vm_memory_segment mem_segs[VM_MAX_MEMORY_SEGMENTS];
+ char name[VM_MAX_NAMELEN];
+
+ /*
+ * Set of active vcpus.
+ * An active vcpu is one that has been started implicitly (BSP) or
+ * explicitly (AP) by sending it a startup ipi.
+ */
+ cpuset_t active_cpus;
+
+ vm_rendezvous_func_t rendezvous_func;
+};
+
+static int vmm_initialized;
+
+static struct vmm_ops *ops;
+#define VMM_INIT() (ops != NULL ? (*ops->init)() : 0)
+#define VMM_CLEANUP() (ops != NULL ? (*ops->cleanup)() : 0)
+
+#define VMINIT(vm) (ops != NULL ? (*ops->vminit)(vm): NULL)
+#define VMRUN(vmi, vcpu, rip) \
+ (ops != NULL ? (*ops->vmrun)(vmi, vcpu, rip) : ENXIO)
+#define VMCLEANUP(vmi) (ops != NULL ? (*ops->vmcleanup)(vmi) : NULL)
+#define VMMMAP_SET(vmi, gpa, hpa, len, attr, prot, spm) \
+ (ops != NULL ? \
+ (*ops->vmmmap_set)(vmi, gpa, hpa, len, attr, prot, spm) : \
+ ENXIO)
+#define VMMMAP_GET(vmi, gpa) \
+ (ops != NULL ? (*ops->vmmmap_get)(vmi, gpa) : ENXIO)
+#define VMGETREG(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetreg)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETREG(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetreg)(vmi, vcpu, num, val) : ENXIO)
+#define VMGETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmgetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMSETDESC(vmi, vcpu, num, desc) \
+ (ops != NULL ? (*ops->vmsetdesc)(vmi, vcpu, num, desc) : ENXIO)
+#define VMGETCAP(vmi, vcpu, num, retval) \
+ (ops != NULL ? (*ops->vmgetcap)(vmi, vcpu, num, retval) : ENXIO)
+#define VMSETCAP(vmi, vcpu, num, val) \
+ (ops != NULL ? (*ops->vmsetcap)(vmi, vcpu, num, val) : ENXIO)
+#define VLAPIC_INIT(vmi, vcpu) \
+ (ops != NULL ? (*ops->vlapic_init)(vmi, vcpu) : NULL)
+#define VLAPIC_CLEANUP(vmi, vlapic) \
+ (ops != NULL ? (*ops->vlapic_cleanup)(vmi, vlapic) : NULL)
+
+#define fpu_start_emulating() load_cr0(rcr0() | CR0_TS)
+#define fpu_stop_emulating() clts()
+
+static MALLOC_DEFINE(M_VM, "vm", "vm");
+
+/* statistics */
+static VMM_STAT(VCPU_TOTAL_RUNTIME, "vcpu total runtime");
+
+static int vmm_ipinum;
+SYSCTL_INT(_hw_vmm, OID_AUTO, ipinum, CTLFLAG_RD, &vmm_ipinum, 0,
+ "IPI vector used for vcpu notifications");
+
+static void
+vcpu_cleanup(struct vm *vm, int i)
+{
+ struct vcpu *vcpu = &vm->vcpu[i];
+
+ VLAPIC_CLEANUP(vm->cookie, vcpu->vlapic);
+#ifdef __FreeBSD__
+ vmm_stat_free(vcpu->stats);
+#endif
+ fpu_save_area_free(vcpu->guestfpu);
+}
+
+static void
+vcpu_init(struct vm *vm, uint32_t vcpu_id)
+{
+ struct vcpu *vcpu;
+
+ vcpu = &vm->vcpu[vcpu_id];
+
+ vcpu_lock_init(vcpu);
+ vcpu->hostcpu = NOCPU;
+ vcpu->vcpuid = vcpu_id;
+ vcpu->vlapic = VLAPIC_INIT(vm->cookie, vcpu_id);
+ vm_set_x2apic_state(vm, vcpu_id, X2APIC_DISABLED);
+ vcpu->exitintinfo = 0;
+ vcpu->guestfpu = fpu_save_area_alloc();
+ fpu_save_area_reset(vcpu->guestfpu);
+#ifdef __FreeBSD__
+ vcpu->stats = vmm_stat_alloc();
+#endif
+}
+
+struct vm_exit *
+vm_exitinfo(struct vm *vm, int cpuid)
+{
+ struct vcpu *vcpu;
+
+ if (cpuid < 0 || cpuid >= VM_MAXCPU)
+ panic("vm_exitinfo: invalid cpuid %d", cpuid);
+
+ vcpu = &vm->vcpu[cpuid];
+
+ return (&vcpu->exitinfo);
+}
+
+static int
+vmm_init(void)
+{
+ int error;
+
+#ifndef __FreeBSD__
+ vmm_sol_glue_init();
+#endif
+
+ vmm_host_state_init();
+#ifdef __FreeBSD__
+ vmm_ipi_init();
+#endif
+
+ error = vmm_mem_init();
+ if (error)
+ return (error);
+
+ if (vmm_is_intel())
+ ops = &vmm_ops_intel;
+ else if (vmm_is_amd())
+ ops = &vmm_ops_amd;
+ else
+ return (ENXIO);
+
+ return (VMM_INIT());
+}
+
+#ifdef __FreeBSD__
+static int
+vmm_handler(module_t mod, int what, void *arg)
+{
+ int error;
+
+ switch (what) {
+ case MOD_LOAD:
+ vmmdev_init();
+ if (ppt_num_devices() > 0)
+ iommu_init();
+ error = vmm_init();
+ if (error == 0)
+ vmm_initialized = 1;
+ break;
+ case MOD_UNLOAD:
+ error = vmmdev_cleanup();
+ if (error == 0) {
+#ifndef __FreeBSD__
+ vmm_sol_glue_cleanup();
+#endif
+ iommu_cleanup();
+ vmm_ipi_cleanup();
+ error = VMM_CLEANUP();
+ /*
+ * Something bad happened - prevent new
+ * VMs from being created
+ */
+ if (error)
+ vmm_initialized = 0;
+ }
+ break;
+ default:
+ error = 0;
+ break;
+ }
+ return (error);
+}
+
+static moduledata_t vmm_kmod = {
+ "vmm",
+ vmm_handler,
+ NULL
+};
+
+/*
+ * vmm initialization has the following dependencies:
+ *
+ * - iommu initialization must happen after the pci passthru driver has had
+ * a chance to attach to any passthru devices (after SI_SUB_CONFIGURE).
+ *
+ * - VT-x initialization requires smp_rendezvous() and therefore must happen
+ * after SMP is fully functional (after SI_SUB_SMP).
+ */
+DECLARE_MODULE(vmm, vmm_kmod, SI_SUB_SMP + 1, SI_ORDER_ANY);
+MODULE_VERSION(vmm, 1);
+
+SYSCTL_NODE(_hw, OID_AUTO, vmm, CTLFLAG_RW, NULL, NULL);
+#else
+int
+vmm_mod_load()
+{
+ int error;
+
+ vmmdev_init();
+ error = vmm_init();
+ if (error == 0)
+ vmm_initialized = 1;
+
+ return (error);
+}
+
+int
+vmm_mod_unload()
+{
+ int error;
+
+ error = vmmdev_cleanup();
+ if (error)
+ return (error);
+ error = VMM_CLEANUP();
+ if (error)
+ return (error);
+ vmm_initialized = 0;
+
+ return (0);
+}
+#endif
+
+int
+vm_create(const char *name, struct vm **retvm)
+{
+ int i;
+ struct vm *vm;
+ vm_paddr_t maxaddr;
+
+ const int BSP = 0;
+
+ /*
+ * If vmm.ko could not be successfully initialized then don't attempt
+ * to create the virtual machine.
+ */
+ if (!vmm_initialized)
+ return (ENXIO);
+
+ if (name == NULL || strlen(name) >= VM_MAX_NAMELEN)
+ return (EINVAL);
+
+ vm = malloc(sizeof(struct vm), M_VM, M_WAITOK | M_ZERO);
+ strcpy(vm->name, name);
+ vm->cookie = VMINIT(vm);
+
+ vm->vioapic = vioapic_init(vm);
+ vm->vhpet = vhpet_init(vm);
+ vm->vatpic = vatpic_init(vm);
+ vm->vatpit = vatpit_init(vm);
+
+ for (i = 0; i < VM_MAXCPU; i++) {
+ vcpu_init(vm, i);
+ }
+
+#ifdef __FreeBSD__
+ maxaddr = vmm_mem_maxaddr();
+ vm->iommu = iommu_create_domain(maxaddr);
+#endif
+
+ *retvm = vm;
+ return (0);
+}
+
+static void
+vm_free_mem_seg(struct vm *vm, struct vm_memory_segment *seg)
+{
+ size_t len;
+ vm_paddr_t hpa;
+ void *host_domain;
+
+#ifdef __FreeBSD__
+ host_domain = iommu_host_domain();
+#endif
+
+ len = 0;
+ while (len < seg->len) {
+ hpa = vm_gpa2hpa(vm, seg->gpa + len, PAGE_SIZE);
+ if (hpa == (vm_paddr_t)-1) {
+ panic("vm_free_mem_segs: cannot free hpa "
+ "associated with gpa 0x%016lx", seg->gpa + len);
+ }
+
+#ifdef __FreeBSD__
+ /*
+ * Remove the 'gpa' to 'hpa' mapping in VMs domain.
+ * And resurrect the 1:1 mapping for 'hpa' in 'host_domain'.
+ */
+ iommu_remove_mapping(vm->iommu, seg->gpa + len, PAGE_SIZE);
+ iommu_create_mapping(host_domain, hpa, hpa, PAGE_SIZE);
+#endif
+
+ vmm_mem_free(hpa, PAGE_SIZE);
+
+ len += PAGE_SIZE;
+ }
+
+#ifdef __FreeBSD__
+ /*
+ * Invalidate cached translations associated with 'vm->iommu' since
+ * we have now moved some pages from it.
+ */
+ iommu_invalidate_tlb(vm->iommu);
+#endif
+
+ bzero(seg, sizeof(struct vm_memory_segment));
+}
+
+void
+vm_destroy(struct vm *vm)
+{
+ int i;
+
+#ifdef __FreeBSD__
+ ppt_unassign_all(vm);
+#endif
+
+ for (i = 0; i < vm->num_mem_segs; i++)
+ vm_free_mem_seg(vm, &vm->mem_segs[i]);
+
+ vm->num_mem_segs = 0;
+
+ for (i = 0; i < VM_MAXCPU; i++)
+ vcpu_cleanup(vm, i);
+
+ vatpit_cleanup(vm->vatpit);
+ vhpet_cleanup(vm->vhpet);
+ vatpic_cleanup(vm->vatpic);
+ vioapic_cleanup(vm->vioapic);
+
+#ifdef __FreeBSD__
+ iommu_destroy_domain(vm->iommu);
+#endif
+
+ VMCLEANUP(vm->cookie);
+
+ free(vm, M_VM);
+}
+
+const char *
+vm_name(struct vm *vm)
+{
+ return (vm->name);
+}
+
+#ifdef __FreeBSD__
+int
+vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP_SET(vm->cookie, gpa, hpa, len, VM_MEMATTR_UNCACHEABLE,
+ VM_PROT_RW, spok));
+}
+
+int
+vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ return (VMMMAP_SET(vm->cookie, gpa, 0, len, 0,
+ VM_PROT_NONE, spok));
+}
+#endif
+
+/*
+ * Returns TRUE if 'gpa' is available for allocation and FALSE otherwise
+ */
+static boolean_t
+vm_gpa_available(struct vm *vm, vm_paddr_t gpa)
+{
+ int i;
+ vm_paddr_t gpabase, gpalimit;
+
+ if (gpa & PAGE_MASK)
+ panic("vm_gpa_available: gpa (0x%016lx) not page aligned", gpa);
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ gpabase = vm->mem_segs[i].gpa;
+ gpalimit = gpabase + vm->mem_segs[i].len;
+ if (gpa >= gpabase && gpa < gpalimit)
+ return (FALSE);
+ }
+
+ return (TRUE);
+}
+
+int
+vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ int error, available, allocated;
+ struct vm_memory_segment *seg;
+ vm_paddr_t g, hpa;
+ void *host_domain;
+
+ const boolean_t spok = TRUE; /* superpage mappings are ok */
+
+ if ((gpa & PAGE_MASK) || (len & PAGE_MASK) || len == 0)
+ return (EINVAL);
+
+ available = allocated = 0;
+ g = gpa;
+ while (g < gpa + len) {
+ if (vm_gpa_available(vm, g))
+ available++;
+ else
+ allocated++;
+
+ g += PAGE_SIZE;
+ }
+
+ /*
+ * If there are some allocated and some available pages in the address
+ * range then it is an error.
+ */
+ if (allocated && available)
+ return (EINVAL);
+
+ /*
+ * If the entire address range being requested has already been
+ * allocated then there isn't anything more to do.
+ */
+ if (allocated && available == 0)
+ return (0);
+
+ if (vm->num_mem_segs >= VM_MAX_MEMORY_SEGMENTS)
+ return (E2BIG);
+
+#ifdef __FreeBSD__
+ host_domain = iommu_host_domain();
+#endif
+
+ seg = &vm->mem_segs[vm->num_mem_segs];
+
+ error = 0;
+ seg->gpa = gpa;
+ seg->len = 0;
+ while (seg->len < len) {
+ hpa = vmm_mem_alloc(PAGE_SIZE);
+ if (hpa == 0) {
+ error = ENOMEM;
+ break;
+ }
+
+ error = VMMMAP_SET(vm->cookie, gpa + seg->len, hpa, PAGE_SIZE,
+ VM_MEMATTR_WRITE_BACK, VM_PROT_ALL, spok);
+ if (error)
+ break;
+
+#ifdef __FreeBSD__
+ /*
+ * Remove the 1:1 mapping for 'hpa' from the 'host_domain'.
+ * Add mapping for 'gpa + seg->len' to 'hpa' in the VMs domain.
+ */
+ iommu_remove_mapping(host_domain, hpa, PAGE_SIZE);
+ iommu_create_mapping(vm->iommu, gpa + seg->len, hpa, PAGE_SIZE);
+#endif
+
+ seg->len += PAGE_SIZE;
+ }
+
+ if (error) {
+ vm_free_mem_seg(vm, seg);
+ return (error);
+ }
+
+#ifdef __FreeBSD__
+ /*
+ * Invalidate cached translations associated with 'host_domain' since
+ * we have now moved some pages from it.
+ */
+ iommu_invalidate_tlb(host_domain);
+#endif
+
+ vm->num_mem_segs++;
+
+ return (0);
+}
+
+vm_paddr_t
+vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t len)
+{
+ vm_paddr_t nextpage;
+
+ nextpage = rounddown(gpa + PAGE_SIZE, PAGE_SIZE);
+ if (len > nextpage - gpa)
+ panic("vm_gpa2hpa: invalid gpa/len: 0x%016lx/%lu", gpa, len);
+
+ return (VMMMAP_GET(vm->cookie, gpa));
+}
+
+void *
+vm_gpa_hold(struct vm *vm, vm_paddr_t gpa, size_t len, int reqprot,
+ void **cookie)
+{
+#ifdef __FreeBSD__
+ int count, pageoff;
+ vm_page_t m;
+
+ pageoff = gpa & PAGE_MASK;
+ if (len > PAGE_SIZE - pageoff)
+ panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
+
+ count = vm_fault_quick_hold_pages(&vm->vmspace->vm_map,
+ trunc_page(gpa), PAGE_SIZE, reqprot, &m, 1);
+
+ if (count == 1) {
+ *cookie = m;
+ return ((void *)(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)) + pageoff));
+ } else {
+ *cookie = NULL;
+ return (NULL);
+ }
+#else
+ int pageoff;
+ vm_paddr_t hpa;
+
+ pageoff = gpa & PAGE_MASK;
+ if (len > PAGE_SIZE - pageoff)
+ panic("vm_gpa_hold: invalid gpa/len: 0x%016lx/%lu", gpa, len);
+
+ hpa = vm_gpa2hpa(vm, gpa, len);
+ if (hpa == (vm_paddr_t)-1)
+ return (NULL);
+
+ return (hat_kpm_pfn2va(btop(hpa)) + pageoff);
+#endif
+}
+
+void
+vm_gpa_release(void *cookie)
+{
+#ifdef __FreeBSD__
+ vm_page_t m = cookie;
+
+ vm_page_lock(m);
+ vm_page_unhold(m);
+ vm_page_unlock(m);
+#endif
+}
+
+int
+vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg)
+{
+ int i;
+
+ for (i = 0; i < vm->num_mem_segs; i++) {
+ if (gpabase == vm->mem_segs[i].gpa) {
+ *seg = vm->mem_segs[i];
+ return (0);
+ }
+ }
+ return (-1);
+}
+
+int
+vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ return (VMGETREG(vm->cookie, vcpu, reg, retval));
+}
+
+int
+vm_set_register(struct vm *vm, int vcpuid, int reg, uint64_t val)
+{
+ struct vcpu *vcpu;
+ int error;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (reg >= VM_REG_LAST)
+ return (EINVAL);
+
+ error = VMSETREG(vm->cookie, vcpuid, reg, val);
+ if (error || reg != VM_REG_GUEST_RIP)
+ return (error);
+
+ /* Set 'nextrip' to match the value of %rip */
+ VCPU_CTR1(vm, vcpuid, "Setting nextrip to %#lx", val);
+ vcpu = &vm->vcpu[vcpuid];
+ vcpu->nextrip = val;
+ return (0);
+}
+
+static boolean_t
+is_descriptor_table(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_IDTR:
+ case VM_REG_GUEST_GDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+static boolean_t
+is_segment_register(int reg)
+{
+
+ switch (reg) {
+ case VM_REG_GUEST_ES:
+ case VM_REG_GUEST_CS:
+ case VM_REG_GUEST_SS:
+ case VM_REG_GUEST_DS:
+ case VM_REG_GUEST_FS:
+ case VM_REG_GUEST_GS:
+ case VM_REG_GUEST_TR:
+ case VM_REG_GUEST_LDTR:
+ return (TRUE);
+ default:
+ return (FALSE);
+ }
+}
+
+int
+vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMGETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+int
+vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (!is_segment_register(reg) && !is_descriptor_table(reg))
+ return (EINVAL);
+
+ return (VMSETDESC(vm->cookie, vcpu, reg, desc));
+}
+
+static void
+restore_guest_fpustate(struct vcpu *vcpu)
+{
+
+ /* flush host state to the pcb */
+ fpuexit(curthread);
+
+ /* restore guest FPU state */
+ fpu_stop_emulating();
+ fpurestore(vcpu->guestfpu);
+
+ /*
+ * The FPU is now "dirty" with the guest's state so turn on emulation
+ * to trap any access to the FPU by the host.
+ */
+ fpu_start_emulating();
+}
+
+static void
+save_guest_fpustate(struct vcpu *vcpu)
+{
+
+ if ((rcr0() & CR0_TS) == 0)
+ panic("fpu emulation not enabled in host!");
+
+ /* save guest FPU state */
+ fpu_stop_emulating();
+ fpusave(vcpu->guestfpu);
+ fpu_start_emulating();
+}
+
+static VMM_STAT(VCPU_IDLE_TICKS, "number of ticks vcpu was idle");
+
+static int
+vcpu_set_state_locked(struct vcpu *vcpu, enum vcpu_state newstate,
+ bool from_idle)
+{
+ int error;
+
+ vcpu_assert_locked(vcpu);
+
+ /*
+ * State transitions from the vmmdev_ioctl() must always begin from
+ * the VCPU_IDLE state. This guarantees that there is only a single
+ * ioctl() operating on a vcpu at any point.
+ */
+ if (from_idle) {
+ while (vcpu->state != VCPU_IDLE)
+ msleep_spin(&vcpu->state, &vcpu->mtx, "vmstat", hz);
+ } else {
+ KASSERT(vcpu->state != VCPU_IDLE, ("invalid transition from "
+ "vcpu idle state"));
+ }
+
+ if (vcpu->state == VCPU_RUNNING) {
+ KASSERT(vcpu->hostcpu == curcpu, ("curcpu %d and hostcpu %d "
+ "mismatch for running vcpu", curcpu, vcpu->hostcpu));
+ } else {
+ KASSERT(vcpu->hostcpu == NOCPU, ("Invalid hostcpu %d for a "
+ "vcpu that is not running", vcpu->hostcpu));
+ }
+
+ /*
+ * The following state transitions are allowed:
+ * IDLE -> FROZEN -> IDLE
+ * FROZEN -> RUNNING -> FROZEN
+ * FROZEN -> SLEEPING -> FROZEN
+ */
+ switch (vcpu->state) {
+ case VCPU_IDLE:
+ case VCPU_RUNNING:
+ case VCPU_SLEEPING:
+ error = (newstate != VCPU_FROZEN);
+ break;
+ case VCPU_FROZEN:
+ error = (newstate == VCPU_FROZEN);
+ break;
+ default:
+ error = 1;
+ break;
+ }
+
+ if (error)
+ return (EBUSY);
+
+ vcpu->state = newstate;
+ if (newstate == VCPU_RUNNING)
+ vcpu->hostcpu = curcpu;
+ else
+ vcpu->hostcpu = NOCPU;
+
+ if (newstate == VCPU_IDLE)
+ wakeup(&vcpu->state);
+
+ return (0);
+}
+
+static void
+vcpu_require_state(struct vm *vm, int vcpuid, enum vcpu_state newstate)
+{
+ int error;
+
+ if ((error = vcpu_set_state(vm, vcpuid, newstate, false)) != 0)
+ panic("Error %d setting state to %d\n", error, newstate);
+}
+
+static void
+vcpu_require_state_locked(struct vcpu *vcpu, enum vcpu_state newstate)
+{
+ int error;
+
+ if ((error = vcpu_set_state_locked(vcpu, newstate, false)) != 0)
+ panic("Error %d setting state to %d", error, newstate);
+}
+
+/*
+ * Emulate a guest 'hlt' by sleeping until the vcpu is ready to run.
+ */
+static int
+vm_handle_hlt(struct vm *vm, int vcpuid, bool intr_disabled, bool *retu)
+{
+ struct vm_exit *vmexit;
+ struct vcpu *vcpu;
+ int t, timo, spindown;
+
+ vcpu = &vm->vcpu[vcpuid];
+ spindown = 0;
+
+ vcpu_lock(vcpu);
+
+ /*
+ * Do a final check for pending NMI or interrupts before
+ * really putting this thread to sleep.
+ *
+ * These interrupts could have happened any time after we
+ * returned from VMRUN() and before we grabbed the vcpu lock.
+ */
+ if (vm->rendezvous_func == NULL &&
+ !vm_nmi_pending(vm, vcpuid) &&
+ (intr_disabled || !vlapic_pending_intr(vcpu->vlapic, NULL))) {
+ t = ticks;
+ vcpu_require_state_locked(vcpu, VCPU_SLEEPING);
+ if (vlapic_enabled(vcpu->vlapic)) {
+ /*
+ * XXX msleep_spin() is not interruptible so use the
+ * 'timo' to put an upper bound on the sleep time.
+ */
+ timo = hz;
+ msleep_spin(vcpu, &vcpu->mtx, "vmidle", timo);
+ } else {
+ /*
+ * Spindown the vcpu if the apic is disabled and it
+ * had entered the halted state.
+ */
+ spindown = 1;
+ }
+ vcpu_require_state_locked(vcpu, VCPU_FROZEN);
+ vmm_stat_incr(vm, vcpuid, VCPU_IDLE_TICKS, ticks - t);
+ }
+ vcpu_unlock(vcpu);
+
+#ifdef __FreeBSD__
+ /*
+ * Since 'vm_deactivate_cpu()' grabs a sleep mutex we must call it
+ * outside the confines of the vcpu spinlock.
+ */
+ if (spindown) {
+ *retu = true;
+ vmexit = vm_exitinfo(vm, vcpuid);
+ vmexit->exitcode = VM_EXITCODE_SPINDOWN_CPU;
+ vm_deactivate_cpu(vm, vcpuid);
+ VCPU_CTR0(vm, vcpuid, "spinning down cpu");
+ }
+#endif
+
+ return (0);
+}
+
+static int
+vm_handle_inst_emul(struct vm *vm, int vcpuid, bool *retu)
+{
+ struct vie *vie;
+ struct vcpu *vcpu;
+ struct vm_exit *vme;
+ uint64_t gla, gpa, cs_base;
+ struct vm_guest_paging *paging;
+ mem_region_read_t mread;
+ mem_region_write_t mwrite;
+ enum vm_cpu_mode cpu_mode;
+ int cs_d, error, length;
+
+ vcpu = &vm->vcpu[vcpuid];
+ vme = &vcpu->exitinfo;
+
+ gla = vme->u.inst_emul.gla;
+ gpa = vme->u.inst_emul.gpa;
+ cs_base = vme->u.inst_emul.cs_base;
+ cs_d = vme->u.inst_emul.cs_d;
+ vie = &vme->u.inst_emul.vie;
+ paging = &vme->u.inst_emul.paging;
+ cpu_mode = paging->cpu_mode;
+
+ VCPU_CTR1(vm, vcpuid, "inst_emul fault accessing gpa %#lx", gpa);
+
+ /* Fetch, decode and emulate the faulting instruction */
+ if (vie->num_valid == 0) {
+ /*
+ * If the instruction length is not known then assume a
+ * maximum size instruction.
+ */
+ length = vme->inst_length ? vme->inst_length : VIE_INST_SIZE;
+ error = vmm_fetch_instruction(vm, vcpuid, paging, vme->rip +
+ cs_base, length, vie);
+ } else {
+ /*
+ * The instruction bytes have already been copied into 'vie'
+ */
+ error = 0;
+ }
+ if (error == 1)
+ return (0); /* Resume guest to handle page fault */
+ else if (error == -1)
+ return (EFAULT);
+ else if (error != 0)
+ panic("%s: vmm_fetch_instruction error %d", __func__, error);
+
+ if (vmm_decode_instruction(vm, vcpuid, gla, cpu_mode, cs_d, vie) != 0)
+ return (EFAULT);
+
+ /*
+ * If the instruction length was not specified then update it now
+ * along with 'nextrip'.
+ */
+ if (vme->inst_length == 0) {
+ vme->inst_length = vie->num_processed;
+ vcpu->nextrip += vie->num_processed;
+ }
+
+ /* return to userland unless this is an in-kernel emulated device */
+ if (gpa >= DEFAULT_APIC_BASE && gpa < DEFAULT_APIC_BASE + PAGE_SIZE) {
+ mread = lapic_mmio_read;
+ mwrite = lapic_mmio_write;
+ } else if (gpa >= VIOAPIC_BASE && gpa < VIOAPIC_BASE + VIOAPIC_SIZE) {
+ mread = vioapic_mmio_read;
+ mwrite = vioapic_mmio_write;
+ } else if (gpa >= VHPET_BASE && gpa < VHPET_BASE + VHPET_SIZE) {
+ mread = vhpet_mmio_read;
+ mwrite = vhpet_mmio_write;
+ } else {
+ *retu = true;
+ return (0);
+ }
+
+ error = vmm_emulate_instruction(vm, vcpuid, gpa, vie, paging,
+ mread, mwrite, retu);
+
+ return (error);
+}
+
+int
+vm_run(struct vm *vm, struct vm_run *vmrun)
+{
+ int error, vcpuid;
+ struct vcpu *vcpu;
+ struct pcb *pcb;
+ uint64_t tscval;
+ struct vm_exit *vme;
+ bool retu, intr_disabled;
+
+ vcpuid = vmrun->cpuid;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ vme = &vcpu->exitinfo;
+restart:
+ critical_enter();
+
+ tscval = rdtsc();
+
+#ifdef __FreeBSD__
+ pcb = PCPU_GET(curpcb);
+ set_pcb_flags(pcb, PCB_FULL_IRET);
+#endif
+
+#ifndef __FreeBSD__
+ installctx(curthread, vcpu, save_guest_fpustate,
+ restore_guest_fpustate, NULL, NULL, NULL, NULL);
+#endif
+ restore_guest_fpustate(vcpu);
+
+ vcpu_require_state(vm, vcpuid, VCPU_RUNNING);
+ error = VMRUN(vm->cookie, vcpuid, vcpu->nextrip);
+ vcpu_require_state(vm, vcpuid, VCPU_FROZEN);
+
+ save_guest_fpustate(vcpu);
+#ifndef __FreeBSD__
+ removectx(curthread, vcpu, save_guest_fpustate,
+ restore_guest_fpustate, NULL, NULL, NULL, NULL);
+#endif
+
+ vmm_stat_incr(vm, vcpuid, VCPU_TOTAL_RUNTIME, rdtsc() - tscval);
+
+ critical_exit();
+
+ if (error == 0) {
+ retu = false;
+ vcpu->nextrip = vme->rip + vme->inst_length;
+ switch (vme->exitcode) {
+ case VM_EXITCODE_HLT:
+ intr_disabled = ((vme->u.hlt.rflags & PSL_I) == 0);
+ error = vm_handle_hlt(vm, vcpuid, intr_disabled, &retu);
+ break;
+ case VM_EXITCODE_INST_EMUL:
+ error = vm_handle_inst_emul(vm, vcpuid, &retu);
+ break;
+ case VM_EXITCODE_INOUT:
+ case VM_EXITCODE_INOUT_STR:
+ error = vm_handle_inout(vm, vcpuid, vme, &retu);
+ break;
+ default:
+ retu = true; /* handled in userland */
+ break;
+ }
+ }
+
+ if (error == 0 && retu == false) {
+ goto restart;
+ }
+
+ /* copy the exit information */
+ bcopy(vme, &vmrun->vm_exit, sizeof(struct vm_exit));
+ return (error);
+}
+
+int
+vm_restart_instruction(void *arg, int vcpuid)
+{
+ struct vm *vm;
+ struct vcpu *vcpu;
+ enum vcpu_state state;
+ uint64_t rip;
+ int error;
+
+ vm = arg;
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+ state = vcpu_get_state(vm, vcpuid, NULL);
+ if (state == VCPU_RUNNING) {
+ /*
+ * When a vcpu is "running" the next instruction is determined
+ * by adding 'rip' and 'inst_length' in the vcpu's 'exitinfo'.
+ * Thus setting 'inst_length' to zero will cause the current
+ * instruction to be restarted.
+ */
+ vcpu->exitinfo.inst_length = 0;
+ VCPU_CTR1(vm, vcpuid, "restarting instruction at %#lx by "
+ "setting inst_length to zero", vcpu->exitinfo.rip);
+ } else if (state == VCPU_FROZEN) {
+ /*
+ * When a vcpu is "frozen" it is outside the critical section
+ * around VMRUN() and 'nextrip' points to the next instruction.
+ * Thus instruction restart is achieved by setting 'nextrip'
+ * to the vcpu's %rip.
+ */
+ error = vm_get_register(vm, vcpuid, VM_REG_GUEST_RIP, &rip);
+ KASSERT(!error, ("%s: error %d getting rip", __func__, error));
+ VCPU_CTR2(vm, vcpuid, "restarting instruction by updating "
+ "nextrip from %#lx to %#lx", vcpu->nextrip, rip);
+ vcpu->nextrip = rip;
+ } else {
+ panic("%s: invalid state %d", __func__, state);
+ }
+ return (0);
+}
+
+int
+vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t info)
+{
+ struct vcpu *vcpu;
+ int type, vector;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (info & VM_INTINFO_VALID) {
+ type = info & VM_INTINFO_TYPE;
+ vector = info & 0xff;
+ if (type == VM_INTINFO_NMI && vector != IDT_NMI)
+ return (EINVAL);
+ if (type == VM_INTINFO_HWEXCEPTION && vector >= 32)
+ return (EINVAL);
+ if (info & VM_INTINFO_RSVD)
+ return (EINVAL);
+ } else {
+ info = 0;
+ }
+ VCPU_CTR2(vm, vcpuid, "%s: info1(%#lx)", __func__, info);
+ vcpu->exitintinfo = info;
+ return (0);
+}
+
+enum exc_class {
+ EXC_BENIGN,
+ EXC_CONTRIBUTORY,
+ EXC_PAGEFAULT
+};
+
+#define IDT_VE 20 /* Virtualization Exception (Intel specific) */
+
+static enum exc_class
+exception_class(uint64_t info)
+{
+ int type, vector;
+
+#ifdef __FreeBSD__
+ KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %#lx", info));
+#else
+ KASSERT(info & VM_INTINFO_VALID, ("intinfo must be valid: %lx", info));
+#endif
+ type = info & VM_INTINFO_TYPE;
+ vector = info & 0xff;
+
+ /* Table 6-4, "Interrupt and Exception Classes", Intel SDM, Vol 3 */
+ switch (type) {
+ case VM_INTINFO_HWINTR:
+ case VM_INTINFO_SWINTR:
+ case VM_INTINFO_NMI:
+ return (EXC_BENIGN);
+ default:
+ /*
+ * Hardware exception.
+ *
+ * SVM and VT-x use identical type values to represent NMI,
+ * hardware interrupt and software interrupt.
+ *
+ * SVM uses type '3' for all exceptions. VT-x uses type '3'
+ * for exceptions except #BP and #OF. #BP and #OF use a type
+ * value of '5' or '6'. Therefore we don't check for explicit
+ * values of 'type' to classify 'intinfo' into a hardware
+ * exception.
+ */
+ break;
+ }
+
+ switch (vector) {
+ case IDT_PF:
+ case IDT_VE:
+ return (EXC_PAGEFAULT);
+ case IDT_DE:
+ case IDT_TS:
+ case IDT_NP:
+ case IDT_SS:
+ case IDT_GP:
+ return (EXC_CONTRIBUTORY);
+ default:
+ return (EXC_BENIGN);
+ }
+}
+
+static int
+nested_fault(struct vm *vm, int vcpuid, uint64_t info1, uint64_t info2,
+ uint64_t *retinfo)
+{
+ enum exc_class exc1, exc2;
+ int type1, vector1;
+
+#ifdef __FreeBSD__
+ KASSERT(info1 & VM_INTINFO_VALID, ("info1 %#lx is not valid", info1));
+ KASSERT(info2 & VM_INTINFO_VALID, ("info2 %#lx is not valid", info2));
+#else
+ KASSERT(info1 & VM_INTINFO_VALID, ("info1 %lx is not valid", info1));
+ KASSERT(info2 & VM_INTINFO_VALID, ("info2 %lx is not valid", info2));
+#endif
+
+ /*
+ * If an exception occurs while attempting to call the double-fault
+ * handler the processor enters shutdown mode (aka triple fault).
+ */
+ type1 = info1 & VM_INTINFO_TYPE;
+ vector1 = info1 & 0xff;
+ if (type1 == VM_INTINFO_HWEXCEPTION && vector1 == IDT_DF) {
+ VCPU_CTR2(vm, vcpuid, "triple fault: info1(%#lx), info2(%#lx)",
+ info1, info2);
+#ifdef __FreeBSD__
+ vm_suspend(vm, VM_SUSPEND_TRIPLEFAULT);
+#endif
+ *retinfo = 0;
+ return (0);
+ }
+
+ /*
+ * Table 6-5 "Conditions for Generating a Double Fault", Intel SDM, Vol3
+ */
+ exc1 = exception_class(info1);
+ exc2 = exception_class(info2);
+ if ((exc1 == EXC_CONTRIBUTORY && exc2 == EXC_CONTRIBUTORY) ||
+ (exc1 == EXC_PAGEFAULT && exc2 != EXC_BENIGN)) {
+ /* Convert nested fault into a double fault. */
+ *retinfo = IDT_DF;
+ *retinfo |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
+ *retinfo |= VM_INTINFO_DEL_ERRCODE;
+ } else {
+ /* Handle exceptions serially */
+ *retinfo = info2;
+ }
+ return (1);
+}
+
+static uint64_t
+vcpu_exception_intinfo(struct vcpu *vcpu)
+{
+ uint64_t info = 0;
+
+ if (vcpu->exception_pending) {
+ info = vcpu->exception.vector & 0xff;
+ info |= VM_INTINFO_VALID | VM_INTINFO_HWEXCEPTION;
+ if (vcpu->exception.error_code_valid) {
+ info |= VM_INTINFO_DEL_ERRCODE;
+ info |= (uint64_t)vcpu->exception.error_code << 32;
+ }
+ }
+ return (info);
+}
+
+int
+vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *retinfo)
+{
+ struct vcpu *vcpu;
+ uint64_t info1, info2;
+ int valid;
+
+ KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ info1 = vcpu->exitintinfo;
+ vcpu->exitintinfo = 0;
+
+ info2 = 0;
+ if (vcpu->exception_pending) {
+ info2 = vcpu_exception_intinfo(vcpu);
+ vcpu->exception_pending = 0;
+ VCPU_CTR2(vm, vcpuid, "Exception %d delivered: %#lx",
+ vcpu->exception.vector, info2);
+ }
+
+ if ((info1 & VM_INTINFO_VALID) && (info2 & VM_INTINFO_VALID)) {
+ valid = nested_fault(vm, vcpuid, info1, info2, retinfo);
+ } else if (info1 & VM_INTINFO_VALID) {
+ *retinfo = info1;
+ valid = 1;
+ } else if (info2 & VM_INTINFO_VALID) {
+ *retinfo = info2;
+ valid = 1;
+ } else {
+ valid = 0;
+ }
+
+ if (valid) {
+ VCPU_CTR4(vm, vcpuid, "%s: info1(%#lx), info2(%#lx), "
+ "retinfo(%#lx)", __func__, info1, info2, *retinfo);
+ }
+
+ return (valid);
+}
+
+int
+vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *exception)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (exception->vector < 0 || exception->vector >= 32)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (vcpu->exception_pending) {
+ VCPU_CTR2(vm, vcpuid, "Unable to inject exception %d due to "
+ "pending exception %d", exception->vector,
+ vcpu->exception.vector);
+ return (EBUSY);
+ }
+
+ vcpu->exception_pending = 1;
+ vcpu->exception = *exception;
+ VCPU_CTR1(vm, vcpuid, "Exception %d pending", exception->vector);
+ return (0);
+}
+
+int
+vm_exception_pending(struct vm *vm, int vcpuid, struct vm_exception *exception)
+{
+ struct vcpu *vcpu;
+ int pending;
+
+ KASSERT(vcpuid >= 0 && vcpuid < VM_MAXCPU, ("invalid vcpu %d", vcpuid));
+
+ vcpu = &vm->vcpu[vcpuid];
+ pending = vcpu->exception_pending;
+ if (pending) {
+ vcpu->exception_pending = 0;
+ *exception = vcpu->exception;
+ VCPU_CTR1(vm, vcpuid, "Exception %d delivered",
+ exception->vector);
+ }
+ return (pending);
+}
+
+void
+vm_inject_fault(void *vmarg, int vcpuid, int vector, int errcode_valid,
+ int errcode)
+{
+ struct vm_exception exception;
+ struct vm_exit *vmexit;
+ struct vm *vm;
+ int error;
+
+ vm = vmarg;
+
+ exception.vector = vector;
+ exception.error_code = errcode;
+ exception.error_code_valid = errcode_valid;
+ error = vm_inject_exception(vm, vcpuid, &exception);
+ KASSERT(error == 0, ("vm_inject_exception error %d", error));
+
+ /*
+ * A fault-like exception allows the instruction to be restarted
+ * after the exception handler returns.
+ *
+ * By setting the inst_length to 0 we ensure that the instruction
+ * pointer remains at the faulting instruction.
+ */
+ vmexit = vm_exitinfo(vm, vcpuid);
+ vmexit->inst_length = 0;
+}
+
+void
+vm_inject_pf(void *vmarg, int vcpuid, int error_code, uint64_t cr2)
+{
+ struct vm *vm;
+ int error;
+
+ vm = vmarg;
+ VCPU_CTR2(vm, vcpuid, "Injecting page fault: error_code %#x, cr2 %#lx",
+ error_code, cr2);
+
+ error = vm_set_register(vm, vcpuid, VM_REG_GUEST_CR2, cr2);
+ KASSERT(error == 0, ("vm_set_register(cr2) error %d", error));
+
+ vm_inject_fault(vm, vcpuid, IDT_PF, 1, error_code);
+}
+
+static VMM_STAT(VCPU_NMI_COUNT, "number of NMIs delivered to vcpu");
+
+int
+vm_inject_nmi(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu->nmi_pending = 1;
+ vcpu_notify_event(vm, vcpuid, false);
+
+ return (0);
+}
+
+int
+vm_nmi_pending(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ return (vcpu->nmi_pending);
+}
+
+void
+vm_nmi_clear(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vm_nmi_pending: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (vcpu->nmi_pending == 0)
+ panic("vm_nmi_clear: inconsistent nmi_pending state");
+
+ vcpu->nmi_pending = 0;
+ vmm_stat_incr(vm, vcpuid, VCPU_NMI_COUNT, 1);
+}
+
+static VMM_STAT(VCPU_EXTINT_COUNT, "number of ExtINTs delivered to vcpu");
+
+int
+vm_inject_extint(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu->extint_pending = 1;
+ vcpu_notify_event(vm, vcpuid, false);
+
+ return (0);
+}
+
+int
+vm_extint_pending(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ return (vcpu->extint_pending);
+}
+
+void
+vm_extint_clear(struct vm *vm, int vcpuid)
+{
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vm_extint_pending: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ if (vcpu->extint_pending == 0)
+ panic("vm_extint_clear: inconsistent extint_pending state");
+
+ vcpu->extint_pending = 0;
+ vmm_stat_incr(vm, vcpuid, VCPU_EXTINT_COUNT, 1);
+}
+
+int
+vm_get_capability(struct vm *vm, int vcpu, int type, int *retval)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMGETCAP(vm->cookie, vcpu, type, retval));
+}
+
+int
+vm_set_capability(struct vm *vm, int vcpu, int type, int val)
+{
+ if (vcpu < 0 || vcpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (type < 0 || type >= VM_CAP_MAX)
+ return (EINVAL);
+
+ return (VMSETCAP(vm->cookie, vcpu, type, val));
+}
+
+struct vhpet *
+vm_hpet(struct vm *vm)
+{
+ return (vm->vhpet);
+}
+
+struct vioapic *
+vm_ioapic(struct vm *vm)
+{
+ return (vm->vioapic);
+}
+
+struct vlapic *
+vm_lapic(struct vm *vm, int cpu)
+{
+ return (vm->vcpu[cpu].vlapic);
+}
+
+#ifdef __FreeBSD__
+boolean_t
+vmm_is_pptdev(int bus, int slot, int func)
+{
+ int found, i, n;
+ int b, s, f;
+ char *val, *cp, *cp2;
+
+ /*
+ * XXX
+ * The length of an environment variable is limited to 128 bytes which
+ * puts an upper limit on the number of passthru devices that may be
+ * specified using a single environment variable.
+ *
+ * Work around this by scanning multiple environment variable
+ * names instead of a single one - yuck!
+ */
+ const char *names[] = { "pptdevs", "pptdevs2", "pptdevs3", NULL };
+
+ /* set pptdevs="1/2/3 4/5/6 7/8/9 10/11/12" */
+ found = 0;
+ for (i = 0; names[i] != NULL && !found; i++) {
+ cp = val = getenv(names[i]);
+ while (cp != NULL && *cp != '\0') {
+ if ((cp2 = strchr(cp, ' ')) != NULL)
+ *cp2 = '\0';
+
+ n = sscanf(cp, "%d/%d/%d", &b, &s, &f);
+ if (n == 3 && bus == b && slot == s && func == f) {
+ found = 1;
+ break;
+ }
+
+ if (cp2 != NULL)
+ *cp2++ = ' ';
+
+ cp = cp2;
+ }
+ freeenv(val);
+ }
+ return (found);
+}
+#endif
+
+void *
+vm_iommu_domain(struct vm *vm)
+{
+
+ return (vm->iommu);
+}
+
+int
+vcpu_set_state(struct vm *vm, int vcpuid, enum vcpu_state newstate,
+ bool from_idle)
+{
+ int error;
+ struct vcpu *vcpu;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vm_set_run_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ error = vcpu_set_state_locked(vcpu, newstate, from_idle);
+ vcpu_unlock(vcpu);
+
+ return (error);
+}
+
+enum vcpu_state
+vcpu_get_state(struct vm *vm, int vcpuid, int *hostcpu)
+{
+ struct vcpu *vcpu;
+ enum vcpu_state state;
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ panic("vm_get_run_state: invalid vcpuid %d", vcpuid);
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ state = vcpu->state;
+ if (hostcpu != NULL)
+ *hostcpu = vcpu->hostcpu;
+ vcpu_unlock(vcpu);
+
+ return (state);
+}
+
+int
+vm_activate_cpu(struct vm *vm, int vcpuid)
+{
+
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (CPU_ISSET(vcpuid, &vm->active_cpus))
+ return (EBUSY);
+
+ VCPU_CTR0(vm, vcpuid, "activated");
+ CPU_SET_ATOMIC(vcpuid, &vm->active_cpus);
+ return (0);
+}
+
+cpuset_t
+vm_active_cpus(struct vm *vm)
+{
+
+ return (vm->active_cpus);
+}
+
+void *
+vcpu_stats(struct vm *vm, int vcpuid)
+{
+
+ return (vm->vcpu[vcpuid].stats);
+}
+
+int
+vm_get_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state *state)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ *state = vm->vcpu[vcpuid].x2apic_state;
+
+ return (0);
+}
+
+int
+vm_set_x2apic_state(struct vm *vm, int vcpuid, enum x2apic_state state)
+{
+ if (vcpuid < 0 || vcpuid >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (state >= X2APIC_STATE_LAST)
+ return (EINVAL);
+
+ vm->vcpu[vcpuid].x2apic_state = state;
+
+ vlapic_set_x2apic_state(vm, vcpuid, state);
+
+ return (0);
+}
+
+/*
+ * This function is called to ensure that a vcpu "sees" a pending event
+ * as soon as possible:
+ * - If the vcpu thread is sleeping then it is woken up.
+ * - If the vcpu is running on a different host_cpu then an IPI will be directed
+ * to the host_cpu to cause the vcpu to trap into the hypervisor.
+ */
+void
+vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr)
+{
+ int hostcpu;
+ struct vcpu *vcpu;
+
+ vcpu = &vm->vcpu[vcpuid];
+
+ vcpu_lock(vcpu);
+ hostcpu = vcpu->hostcpu;
+ if (vcpu->state == VCPU_RUNNING) {
+ KASSERT(hostcpu != NOCPU, ("vcpu running on invalid hostcpu"));
+ if (hostcpu != curcpu) {
+ if (lapic_intr) {
+ vlapic_post_intr(vcpu->vlapic, hostcpu,
+ vmm_ipinum);
+ } else {
+ ipi_cpu(hostcpu, vmm_ipinum);
+ }
+ } else {
+ /*
+ * If the 'vcpu' is running on 'curcpu' then it must
+ * be sending a notification to itself (e.g. SELF_IPI).
+ * The pending event will be picked up when the vcpu
+ * transitions back to guest context.
+ */
+ }
+ } else {
+ KASSERT(hostcpu == NOCPU, ("vcpu state %d not consistent "
+ "with hostcpu %d", vcpu->state, hostcpu));
+ if (vcpu->state == VCPU_SLEEPING)
+ wakeup_one(vcpu);
+ }
+ vcpu_unlock(vcpu);
+}
+
+int
+vm_apicid2vcpuid(struct vm *vm, int apicid)
+{
+ /*
+ * XXX apic id is assumed to be numerically identical to vcpu id
+ */
+ return (apicid);
+}
+
+struct vatpic *
+vm_atpic(struct vm *vm)
+{
+ return (vm->vatpic);
+}
+
+struct vatpit *
+vm_atpit(struct vm *vm)
+{
+ return (vm->vatpit);
+}
+
+enum vm_reg_name
+vm_segment_name(int seg)
+{
+ static enum vm_reg_name seg_names[] = {
+ VM_REG_GUEST_ES,
+ VM_REG_GUEST_CS,
+ VM_REG_GUEST_SS,
+ VM_REG_GUEST_DS,
+ VM_REG_GUEST_FS,
+ VM_REG_GUEST_GS
+ };
+
+ KASSERT(seg >= 0 && seg < nitems(seg_names),
+ ("%s: invalid segment encoding %d", __func__, seg));
+ return (seg_names[seg]);
+}
+
+void
+vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ int num_copyinfo)
+{
+ int idx;
+
+#ifdef __FreeBSD__
+ for (idx = 0; idx < num_copyinfo; idx++) {
+ if (copyinfo[idx].cookie != NULL)
+ vm_gpa_release(copyinfo[idx].cookie);
+ }
+#endif
+ bzero(copyinfo, num_copyinfo * sizeof(struct vm_copyinfo));
+}
+
+int
+vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
+ int num_copyinfo)
+{
+ int error, idx, nused;
+ size_t n, off, remaining;
+ void *hva, *cookie;
+ uint64_t gpa;
+
+ bzero(copyinfo, sizeof(struct vm_copyinfo) * num_copyinfo);
+
+ nused = 0;
+ remaining = len;
+ while (remaining > 0) {
+ KASSERT(nused < num_copyinfo, ("insufficient vm_copyinfo"));
+ error = vm_gla2gpa(vm, vcpuid, paging, gla, prot, &gpa);
+ if (error)
+ return (error);
+ off = gpa & PAGE_MASK;
+ n = min(remaining, PAGE_SIZE - off);
+ copyinfo[nused].gpa = gpa;
+ copyinfo[nused].len = n;
+ remaining -= n;
+ gla += n;
+ nused++;
+ }
+
+ for (idx = 0; idx < nused; idx++) {
+ hva = vm_gpa_hold(vm, copyinfo[idx].gpa, copyinfo[idx].len,
+ prot, &cookie);
+ if (hva == NULL)
+ break;
+ copyinfo[idx].hva = hva;
+ copyinfo[idx].cookie = cookie;
+ }
+
+ if (idx != nused) {
+ vm_copy_teardown(vm, vcpuid, copyinfo, num_copyinfo);
+ return (-1);
+ } else {
+ return (0);
+ }
+}
+
+void
+vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo, void *kaddr,
+ size_t len)
+{
+ char *dst;
+ int idx;
+
+ dst = kaddr;
+ idx = 0;
+ while (len > 0) {
+ bcopy(copyinfo[idx].hva, dst, copyinfo[idx].len);
+ len -= copyinfo[idx].len;
+ dst += copyinfo[idx].len;
+ idx++;
+ }
+}
+
+void
+vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
+ struct vm_copyinfo *copyinfo, size_t len)
+{
+ const char *src;
+ int idx;
+
+ src = kaddr;
+ idx = 0;
+ while (len > 0) {
+ bcopy(src, copyinfo[idx].hva, copyinfo[idx].len);
+ len -= copyinfo[idx].len;
+ src += copyinfo[idx].len;
+ idx++;
+ }
+}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm.conf b/usr/src/uts/i86pc/io/vmm/vmm.conf
new file mode 100644
index 0000000000..8833076014
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm.conf
@@ -0,0 +1 @@
+name="vmm" parent="pseudo";
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.c b/usr/src/uts/i86pc/io/vmm/vmm_host.c
new file mode 100644
index 0000000000..b94caf4009
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_host.c
@@ -0,0 +1,160 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_host.c 242275 2012-10-29 01:51:24Z neel $");
+
+#include <sys/param.h>
+#include <sys/pcpu.h>
+
+#include <machine/cpufunc.h>
+#include <machine/segments.h>
+#include <machine/specialreg.h>
+
+#include "vmm_host.h"
+
+static uint64_t vmm_host_efer, vmm_host_pat, vmm_host_cr0, vmm_host_cr4;
+
+void
+vmm_host_state_init(void)
+{
+
+ vmm_host_efer = rdmsr(MSR_EFER);
+ vmm_host_pat = rdmsr(MSR_PAT);
+
+ /*
+ * We always want CR0.TS to be set when the processor does a VM exit.
+ *
+ * With emulation turned on unconditionally after a VM exit, we are
+ * able to trap inadvertent use of the FPU until the guest FPU state
+ * has been safely squirreled away.
+ */
+ vmm_host_cr0 = rcr0() | CR0_TS;
+
+ vmm_host_cr4 = rcr4();
+}
+
+uint64_t
+vmm_get_host_pat(void)
+{
+
+ return (vmm_host_pat);
+}
+
+uint64_t
+vmm_get_host_efer(void)
+{
+
+ return (vmm_host_efer);
+}
+
+uint64_t
+vmm_get_host_cr0(void)
+{
+
+ return (vmm_host_cr0);
+}
+
+uint64_t
+vmm_get_host_cr4(void)
+{
+
+ return (vmm_host_cr4);
+}
+
+uint64_t
+vmm_get_host_datasel(void)
+{
+
+#ifdef __FreeBSD__
+ return (GSEL(GDATA_SEL, SEL_KPL));
+#else
+ return (SEL_GDT(GDT_KDATA, SEL_KPL));
+#endif
+
+}
+
+uint64_t
+vmm_get_host_codesel(void)
+{
+
+#ifdef __FreeBSD__
+ return (GSEL(GCODE_SEL, SEL_KPL));
+#else
+ return (SEL_GDT(GDT_KCODE, SEL_KPL));
+#endif
+}
+
+
+uint64_t
+vmm_get_host_tsssel(void)
+{
+
+#ifdef __FreeBSD__
+ return (GSEL(GPROC0_SEL, SEL_KPL));
+#else
+ return (SEL_GDT(GDT_KTSS, SEL_KPL));
+#endif
+}
+
+uint64_t
+vmm_get_host_fsbase(void)
+{
+
+#ifdef __FreeBSD__
+ return (0);
+#else
+ return (rdmsr(MSR_FSBASE));
+#endif
+}
+
+uint64_t
+vmm_get_host_idtrbase(void)
+{
+
+#ifdef __FreeBSD__
+ return (r_idt.rd_base);
+#else
+ desctbr_t idtr;
+
+ rd_idtr(&idtr);
+ return (idtr.dtr_base);
+#endif
+}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_host.h b/usr/src/uts/i86pc/io/vmm/vmm_host.h
new file mode 100644
index 0000000000..5de015a228
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_host.h
@@ -0,0 +1,119 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_host.h 242275 2012-10-29 01:51:24Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _VMM_HOST_H_
+#define _VMM_HOST_H_
+
+#ifndef __FreeBSD__
+#include <sys/cpuvar.h>
+#endif
+
+#ifndef _KERNEL
+#error "no user-servicable parts inside"
+#endif
+
+void vmm_host_state_init(void);
+
+uint64_t vmm_get_host_pat(void);
+uint64_t vmm_get_host_efer(void);
+uint64_t vmm_get_host_cr0(void);
+uint64_t vmm_get_host_cr4(void);
+uint64_t vmm_get_host_datasel(void);
+uint64_t vmm_get_host_codesel(void);
+uint64_t vmm_get_host_tsssel(void);
+uint64_t vmm_get_host_fsbase(void);
+uint64_t vmm_get_host_idtrbase(void);
+
+/*
+ * Inline access to host state that is used on every VM entry
+ */
+static __inline uint64_t
+vmm_get_host_trbase(void)
+{
+
+#ifdef __FreeBSD__
+ return ((uint64_t)PCPU_GET(tssp));
+#else
+ return ((u_long)CPU->cpu_tss);
+#endif
+}
+
+static __inline uint64_t
+vmm_get_host_gdtrbase(void)
+{
+
+#ifdef __FreeBSD__
+ return ((uint64_t)&gdt[NGDT * curcpu]);
+#else
+ desctbr_t gdtr;
+
+ rd_gdtr(&gdtr);
+ return (gdtr.dtr_base);
+#endif
+}
+
+struct pcpu;
+extern struct pcpu __pcpu[];
+
+static __inline uint64_t
+vmm_get_host_gsbase(void)
+{
+
+#ifdef __FreeBSD__
+ return ((uint64_t)&__pcpu[curcpu]);
+#else
+ return (rdmsr(MSR_GSBASE));
+#endif
+}
+
+#ifndef __FreeBSD__
+static __inline uint64_t
+vmm_get_host_fssel(void)
+{
+ return (KFS_SEL);
+}
+
+static __inline uint64_t
+vmm_get_host_gssel(void)
+{
+ return (KGS_SEL);
+}
+#endif
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c
new file mode 100644
index 0000000000..72c7056e26
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_instruction_emul.c
@@ -0,0 +1,2370 @@
+/*-
+ * Copyright (c) 2012 Sandvine, Inc.
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_instruction_emul.c 281987 2015-04-25 19:02:06Z tychon $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_instruction_emul.c 281987 2015-04-25 19:02:06Z tychon $");
+
+#ifdef _KERNEL
+#include <sys/param.h>
+#include <sys/pcpu.h>
+#include <sys/systm.h>
+
+#include <vm/vm.h>
+#include <vm/pmap.h>
+
+#include <machine/vmparam.h>
+#include <machine/vmm.h>
+#else /* !_KERNEL */
+#include <sys/types.h>
+#include <sys/errno.h>
+#include <sys/_iovec.h>
+
+#include <machine/vmm.h>
+
+#include <assert.h>
+#include <vmmapi.h>
+#define KASSERT(exp,msg) assert((exp))
+#endif /* _KERNEL */
+
+#include <machine/vmm_instruction_emul.h>
+#include <x86/psl.h>
+#include <x86/specialreg.h>
+
+/* struct vie_op.op_type */
+enum {
+ VIE_OP_TYPE_NONE = 0,
+ VIE_OP_TYPE_MOV,
+ VIE_OP_TYPE_MOVSX,
+ VIE_OP_TYPE_MOVZX,
+ VIE_OP_TYPE_AND,
+ VIE_OP_TYPE_OR,
+ VIE_OP_TYPE_SUB,
+ VIE_OP_TYPE_TWO_BYTE,
+ VIE_OP_TYPE_PUSH,
+ VIE_OP_TYPE_CMP,
+ VIE_OP_TYPE_POP,
+ VIE_OP_TYPE_MOVS,
+ VIE_OP_TYPE_GROUP1,
+ VIE_OP_TYPE_STOS,
+ VIE_OP_TYPE_LAST
+};
+
+/* struct vie_op.op_flags */
+#define VIE_OP_F_IMM (1 << 0) /* 16/32-bit immediate operand */
+#define VIE_OP_F_IMM8 (1 << 1) /* 8-bit immediate operand */
+#define VIE_OP_F_MOFFSET (1 << 2) /* 16/32/64-bit immediate moffset */
+#define VIE_OP_F_NO_MODRM (1 << 3)
+#define VIE_OP_F_NO_GLA_VERIFICATION (1 << 4)
+
+static const struct vie_op two_byte_opcodes[256] = {
+ [0xB6] = {
+ .op_byte = 0xB6,
+ .op_type = VIE_OP_TYPE_MOVZX,
+ },
+ [0xB7] = {
+ .op_byte = 0xB7,
+ .op_type = VIE_OP_TYPE_MOVZX,
+ },
+ [0xBE] = {
+ .op_byte = 0xBE,
+ .op_type = VIE_OP_TYPE_MOVSX,
+ },
+};
+
+static const struct vie_op one_byte_opcodes[256] = {
+ [0x0F] = {
+ .op_byte = 0x0F,
+ .op_type = VIE_OP_TYPE_TWO_BYTE
+ },
+ [0x2B] = {
+ .op_byte = 0x2B,
+ .op_type = VIE_OP_TYPE_SUB,
+ },
+ [0x3B] = {
+ .op_byte = 0x3B,
+ .op_type = VIE_OP_TYPE_CMP,
+ },
+ [0x88] = {
+ .op_byte = 0x88,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0x89] = {
+ .op_byte = 0x89,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0x8A] = {
+ .op_byte = 0x8A,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0x8B] = {
+ .op_byte = 0x8B,
+ .op_type = VIE_OP_TYPE_MOV,
+ },
+ [0xA1] = {
+ .op_byte = 0xA1,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+ },
+ [0xA3] = {
+ .op_byte = 0xA3,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_MOFFSET | VIE_OP_F_NO_MODRM,
+ },
+ [0xA4] = {
+ .op_byte = 0xA4,
+ .op_type = VIE_OP_TYPE_MOVS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+ },
+ [0xA5] = {
+ .op_byte = 0xA5,
+ .op_type = VIE_OP_TYPE_MOVS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+ },
+ [0xAA] = {
+ .op_byte = 0xAA,
+ .op_type = VIE_OP_TYPE_STOS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+ },
+ [0xAB] = {
+ .op_byte = 0xAB,
+ .op_type = VIE_OP_TYPE_STOS,
+ .op_flags = VIE_OP_F_NO_MODRM | VIE_OP_F_NO_GLA_VERIFICATION
+ },
+ [0xC6] = {
+ /* XXX Group 11 extended opcode - not just MOV */
+ .op_byte = 0xC6,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM8,
+ },
+ [0xC7] = {
+ .op_byte = 0xC7,
+ .op_type = VIE_OP_TYPE_MOV,
+ .op_flags = VIE_OP_F_IMM,
+ },
+ [0x23] = {
+ .op_byte = 0x23,
+ .op_type = VIE_OP_TYPE_AND,
+ },
+ [0x81] = {
+ /* XXX Group 1 extended opcode */
+ .op_byte = 0x81,
+ .op_type = VIE_OP_TYPE_GROUP1,
+ .op_flags = VIE_OP_F_IMM,
+ },
+ [0x83] = {
+ /* XXX Group 1 extended opcode */
+ .op_byte = 0x83,
+ .op_type = VIE_OP_TYPE_GROUP1,
+ .op_flags = VIE_OP_F_IMM8,
+ },
+ [0x8F] = {
+ /* XXX Group 1A extended opcode - not just POP */
+ .op_byte = 0x8F,
+ .op_type = VIE_OP_TYPE_POP,
+ },
+ [0xFF] = {
+ /* XXX Group 5 extended opcode - not just PUSH */
+ .op_byte = 0xFF,
+ .op_type = VIE_OP_TYPE_PUSH,
+ }
+};
+
+/* struct vie.mod */
+#define VIE_MOD_INDIRECT 0
+#define VIE_MOD_INDIRECT_DISP8 1
+#define VIE_MOD_INDIRECT_DISP32 2
+#define VIE_MOD_DIRECT 3
+
+/* struct vie.rm */
+#define VIE_RM_SIB 4
+#define VIE_RM_DISP32 5
+
+#define GB (1024 * 1024 * 1024)
+
+static enum vm_reg_name gpr_map[16] = {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15
+};
+
+static uint64_t size2mask[] = {
+ [1] = 0xff,
+ [2] = 0xffff,
+ [4] = 0xffffffff,
+ [8] = 0xffffffffffffffff,
+};
+
+static int
+vie_read_register(void *vm, int vcpuid, enum vm_reg_name reg, uint64_t *rval)
+{
+ int error;
+
+ error = vm_get_register(vm, vcpuid, reg, rval);
+
+ return (error);
+}
+
+static void
+vie_calc_bytereg(struct vie *vie, enum vm_reg_name *reg, int *lhbr)
+{
+ *lhbr = 0;
+ *reg = gpr_map[vie->reg];
+
+ /*
+ * 64-bit mode imposes limitations on accessing legacy high byte
+ * registers (lhbr).
+ *
+ * The legacy high-byte registers cannot be addressed if the REX
+ * prefix is present. In this case the values 4, 5, 6 and 7 of the
+ * 'ModRM:reg' field address %spl, %bpl, %sil and %dil respectively.
+ *
+ * If the REX prefix is not present then the values 4, 5, 6 and 7
+ * of the 'ModRM:reg' field address the legacy high-byte registers,
+ * %ah, %ch, %dh and %bh respectively.
+ */
+ if (!vie->rex_present) {
+ if (vie->reg & 0x4) {
+ *lhbr = 1;
+ *reg = gpr_map[vie->reg & 0x3];
+ }
+ }
+}
+
+static int
+vie_read_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t *rval)
+{
+ uint64_t val;
+ int error, lhbr;
+ enum vm_reg_name reg;
+
+ vie_calc_bytereg(vie, &reg, &lhbr);
+ error = vm_get_register(vm, vcpuid, reg, &val);
+
+ /*
+ * To obtain the value of a legacy high byte register shift the
+ * base register right by 8 bits (%ah = %rax >> 8).
+ */
+ if (lhbr)
+ *rval = val >> 8;
+ else
+ *rval = val;
+ return (error);
+}
+
+static int
+vie_write_bytereg(void *vm, int vcpuid, struct vie *vie, uint8_t byte)
+{
+ uint64_t origval, val, mask;
+ int error, lhbr;
+ enum vm_reg_name reg;
+
+ vie_calc_bytereg(vie, &reg, &lhbr);
+ error = vm_get_register(vm, vcpuid, reg, &origval);
+ if (error == 0) {
+ val = byte;
+ mask = 0xff;
+ if (lhbr) {
+ /*
+ * Shift left by 8 to store 'byte' in a legacy high
+ * byte register.
+ */
+ val <<= 8;
+ mask <<= 8;
+ }
+ val |= origval & ~mask;
+ error = vm_set_register(vm, vcpuid, reg, val);
+ }
+ return (error);
+}
+
+int
+vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+ uint64_t val, int size)
+{
+ int error;
+ uint64_t origval;
+
+ switch (size) {
+ case 1:
+ case 2:
+ error = vie_read_register(vm, vcpuid, reg, &origval);
+ if (error)
+ return (error);
+ val &= size2mask[size];
+ val |= origval & ~size2mask[size];
+ break;
+ case 4:
+ val &= 0xffffffffUL;
+ break;
+ case 8:
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ error = vm_set_register(vm, vcpuid, reg, val);
+ return (error);
+}
+
+#define RFLAGS_STATUS_BITS (PSL_C | PSL_PF | PSL_AF | PSL_Z | PSL_N | PSL_V)
+
+/*
+ * Return the status flags that would result from doing (x - y).
+ */
+#define GETCC(sz) \
+static u_long \
+getcc##sz(uint##sz##_t x, uint##sz##_t y) \
+{ \
+ u_long rflags; \
+ \
+ __asm __volatile("sub %2,%1; pushfq; popq %0" : \
+ "=r" (rflags), "+r" (x) : "m" (y)); \
+ return (rflags); \
+} struct __hack
+
+GETCC(8);
+GETCC(16);
+GETCC(32);
+GETCC(64);
+
+static u_long
+getcc(int opsize, uint64_t x, uint64_t y)
+{
+ KASSERT(opsize == 1 || opsize == 2 || opsize == 4 || opsize == 8,
+ ("getcc: invalid operand size %d", opsize));
+
+ if (opsize == 1)
+ return (getcc8(x, y));
+ else if (opsize == 2)
+ return (getcc16(x, y));
+ else if (opsize == 4)
+ return (getcc32(x, y));
+ else
+ return (getcc64(x, y));
+}
+
+static int
+emulate_mov(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint8_t byte;
+ uint64_t val;
+
+ size = vie->opsize;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x88:
+ /*
+ * MOV byte from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 88/r: mov r/m8, r8
+ * REX + 88/r: mov r/m8, r8 (%ah, %ch, %dh, %bh not available)
+ */
+ size = 1; /* override for byte operation */
+ error = vie_read_bytereg(vm, vcpuid, vie, &byte);
+ if (error == 0)
+ error = memwrite(vm, vcpuid, gpa, byte, size, arg);
+ break;
+ case 0x89:
+ /*
+ * MOV from reg (ModRM:reg) to mem (ModRM:r/m)
+ * 89/r: mov r/m16, r16
+ * 89/r: mov r/m32, r32
+ * REX.W + 89/r mov r/m64, r64
+ */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0x8A:
+ /*
+ * MOV byte from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8A/r: mov r8, r/m8
+ * REX + 8A/r: mov r8, r/m8
+ */
+ size = 1; /* override for byte operation */
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0)
+ error = vie_write_bytereg(vm, vcpuid, vie, val);
+ break;
+ case 0x8B:
+ /*
+ * MOV from mem (ModRM:r/m) to reg (ModRM:reg)
+ * 8B/r: mov r16, r/m16
+ * 8B/r: mov r32, r/m32
+ * REX.W 8B/r: mov r64, r/m64
+ */
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = gpr_map[vie->reg];
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xA1:
+ /*
+ * MOV from seg:moffset to AX/EAX/RAX
+ * A1: mov AX, moffs16
+ * A1: mov EAX, moffs32
+ * REX.W + A1: mov RAX, moffs64
+ */
+ error = memread(vm, vcpuid, gpa, &val, size, arg);
+ if (error == 0) {
+ reg = VM_REG_GUEST_RAX;
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ }
+ break;
+ case 0xA3:
+ /*
+ * MOV from AX/EAX/RAX to seg:moffset
+ * A3: mov moffs16, AX
+ * A3: mov moffs32, EAX
+ * REX.W + A3: mov moffs64, RAX
+ */
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
+ if (error == 0) {
+ val &= size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ }
+ break;
+ case 0xC6:
+ /*
+ * MOV from imm8 to mem (ModRM:r/m)
+ * C6/0 mov r/m8, imm8
+ * REX + C6/0 mov r/m8, imm8
+ */
+ size = 1; /* override for byte operation */
+ error = memwrite(vm, vcpuid, gpa, vie->immediate, size, arg);
+ break;
+ case 0xC7:
+ /*
+ * MOV from imm16/imm32 to mem (ModRM:r/m)
+ * C7/0 mov r/m16, imm16
+ * C7/0 mov r/m32, imm32
+ * REX.W + C7/0 mov r/m64, imm32 (sign-extended to 64-bits)
+ */
+ val = vie->immediate & size2mask[size];
+ error = memwrite(vm, vcpuid, gpa, val, size, arg);
+ break;
+ default:
+ break;
+ }
+
+ return (error);
+}
+
+static int
+emulate_movx(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite,
+ void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t val;
+
+ size = vie->opsize;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0xB6:
+ /*
+ * MOV and zero extend byte from mem (ModRM:r/m) to
+ * reg (ModRM:reg).
+ *
+ * 0F B6/r movzx r16, r/m8
+ * 0F B6/r movzx r32, r/m8
+ * REX.W + 0F B6/r movzx r64, r/m8
+ */
+
+ /* get the first operand */
+ error = memread(vm, vcpuid, gpa, &val, 1, arg);
+ if (error)
+ break;
+
+ /* get the second operand */
+ reg = gpr_map[vie->reg];
+
+ /* zero-extend byte */
+ val = (uint8_t)val;
+
+ /* write the result */
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ break;
+ case 0xB7:
+ /*
+ * MOV and zero extend word from mem (ModRM:r/m) to
+ * reg (ModRM:reg).
+ *
+ * 0F B7/r movzx r32, r/m16
+ * REX.W + 0F B7/r movzx r64, r/m16
+ */
+ error = memread(vm, vcpuid, gpa, &val, 2, arg);
+ if (error)
+ return (error);
+
+ reg = gpr_map[vie->reg];
+
+ /* zero-extend word */
+ val = (uint16_t)val;
+
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ break;
+ case 0xBE:
+ /*
+ * MOV and sign extend byte from mem (ModRM:r/m) to
+ * reg (ModRM:reg).
+ *
+ * 0F BE/r movsx r16, r/m8
+ * 0F BE/r movsx r32, r/m8
+ * REX.W + 0F BE/r movsx r64, r/m8
+ */
+
+ /* get the first operand */
+ error = memread(vm, vcpuid, gpa, &val, 1, arg);
+ if (error)
+ break;
+
+ /* get the second operand */
+ reg = gpr_map[vie->reg];
+
+ /* sign extend byte */
+ val = (int8_t)val;
+
+ /* write the result */
+ error = vie_update_register(vm, vcpuid, reg, val, size);
+ break;
+ default:
+ break;
+ }
+ return (error);
+}
+
+/*
+ * Helper function to calculate and validate a linear address.
+ *
+ * Returns 0 on success and 1 if an exception was injected into the guest.
+ */
+static int
+get_gla(void *vm, int vcpuid, struct vie *vie, struct vm_guest_paging *paging,
+ int opsize, int addrsize, int prot, enum vm_reg_name seg,
+ enum vm_reg_name gpr, uint64_t *gla)
+{
+ struct seg_desc desc;
+ uint64_t cr0, val, rflags;
+ int error;
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
+ KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+ error = vm_get_seg_desc(vm, vcpuid, seg, &desc);
+ KASSERT(error == 0, ("%s: error %d getting segment descriptor %d",
+ __func__, error, seg));
+
+ error = vie_read_register(vm, vcpuid, gpr, &val);
+ KASSERT(error == 0, ("%s: error %d getting register %d", __func__,
+ error, gpr));
+
+ if (vie_calculate_gla(paging->cpu_mode, seg, &desc, val, opsize,
+ addrsize, prot, gla)) {
+ if (seg == VM_REG_GUEST_SS)
+ vm_inject_ss(vm, vcpuid, 0);
+ else
+ vm_inject_gp(vm, vcpuid);
+ return (1);
+ }
+
+ if (vie_canonical_check(paging->cpu_mode, *gla)) {
+ if (seg == VM_REG_GUEST_SS)
+ vm_inject_ss(vm, vcpuid, 0);
+ else
+ vm_inject_gp(vm, vcpuid);
+ return (1);
+ }
+
+ if (vie_alignment_check(paging->cpl, opsize, cr0, rflags, *gla)) {
+ vm_inject_ac(vm, vcpuid, 0);
+ return (1);
+ }
+
+ return (0);
+}
+
+static int
+emulate_movs(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *arg)
+{
+#ifdef _KERNEL
+ struct vm_copyinfo copyinfo[2];
+#else
+ struct iovec copyinfo[2];
+#endif
+ uint64_t dstaddr, srcaddr, dstgpa, srcgpa, val;
+ uint64_t rcx, rdi, rsi, rflags;
+ int error, opsize, seg, repeat;
+
+ opsize = (vie->op.op_byte == 0xA4) ? 1 : vie->opsize;
+ val = 0;
+ error = 0;
+
+ /*
+ * XXX although the MOVS instruction is only supposed to be used with
+ * the "rep" prefix some guests like FreeBSD will use "repnz" instead.
+ *
+ * Empirically the "repnz" prefix has identical behavior to "rep"
+ * and the zero flag does not make a difference.
+ */
+ repeat = vie->repz_present | vie->repnz_present;
+
+ if (repeat) {
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
+ KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
+
+ /*
+ * The count register is %rcx, %ecx or %cx depending on the
+ * address size of the instruction.
+ */
+ if ((rcx & vie_size2mask(vie->addrsize)) == 0)
+ return (0);
+ }
+
+ /*
+ * Source Destination Comments
+ * --------------------------------------------
+ * (1) memory memory n/a
+ * (2) memory mmio emulated
+ * (3) mmio memory emulated
+ * (4) mmio mmio emulated
+ *
+ * At this point we don't have sufficient information to distinguish
+ * between (2), (3) and (4). We use 'vm_copy_setup()' to tease this
+ * out because it will succeed only when operating on regular memory.
+ *
+ * XXX the emulation doesn't properly handle the case where 'gpa'
+ * is straddling the boundary between the normal memory and MMIO.
+ */
+
+ seg = vie->segment_override ? vie->segment_register : VM_REG_GUEST_DS;
+ error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
+ PROT_READ, seg, VM_REG_GUEST_RSI, &srcaddr);
+ if (error)
+ goto done;
+
+ error = vm_copy_setup(vm, vcpuid, paging, srcaddr, opsize, PROT_READ,
+ copyinfo, nitems(copyinfo));
+ if (error == 0) {
+ /*
+ * case (2): read from system memory and write to mmio.
+ */
+ vm_copyin(vm, vcpuid, copyinfo, &val, opsize);
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+ error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
+ if (error)
+ goto done;
+ } else if (error > 0) {
+ /*
+ * Resume guest execution to handle fault.
+ */
+ goto done;
+ } else {
+ /*
+ * 'vm_copy_setup()' is expected to fail for cases (3) and (4)
+ * if 'srcaddr' is in the mmio space.
+ */
+
+ error = get_gla(vm, vcpuid, vie, paging, opsize, vie->addrsize,
+ PROT_WRITE, VM_REG_GUEST_ES, VM_REG_GUEST_RDI, &dstaddr);
+ if (error)
+ goto done;
+
+ error = vm_copy_setup(vm, vcpuid, paging, dstaddr, opsize,
+ PROT_WRITE, copyinfo, nitems(copyinfo));
+ if (error == 0) {
+ /*
+ * case (3): read from MMIO and write to system memory.
+ *
+ * A MMIO read can have side-effects so we
+ * commit to it only after vm_copy_setup() is
+ * successful. If a page-fault needs to be
+ * injected into the guest then it will happen
+ * before the MMIO read is attempted.
+ */
+ error = memread(vm, vcpuid, gpa, &val, opsize, arg);
+ if (error)
+ goto done;
+
+ vm_copyout(vm, vcpuid, &val, copyinfo, opsize);
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+ } else if (error > 0) {
+ /*
+ * Resume guest execution to handle fault.
+ */
+ goto done;
+ } else {
+ /*
+ * Case (4): read from and write to mmio.
+ */
+ error = vm_gla2gpa(vm, vcpuid, paging, srcaddr,
+ PROT_READ, &srcgpa);
+ if (error)
+ goto done;
+ error = memread(vm, vcpuid, srcgpa, &val, opsize, arg);
+ if (error)
+ goto done;
+
+ error = vm_gla2gpa(vm, vcpuid, paging, dstaddr,
+ PROT_WRITE, &dstgpa);
+ if (error)
+ goto done;
+ error = memwrite(vm, vcpuid, dstgpa, val, opsize, arg);
+ if (error)
+ goto done;
+ }
+ }
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSI, &rsi);
+ KASSERT(error == 0, ("%s: error %d getting rsi", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
+ KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+ if (rflags & PSL_D) {
+ rsi -= opsize;
+ rdi -= opsize;
+ } else {
+ rsi += opsize;
+ rdi += opsize;
+ }
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSI, rsi,
+ vie->addrsize);
+ KASSERT(error == 0, ("%s: error %d updating rsi", __func__, error));
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
+ vie->addrsize);
+ KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
+
+ if (repeat) {
+ rcx = rcx - 1;
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
+ rcx, vie->addrsize);
+ KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
+
+ /*
+ * Repeat the instruction if the count register is not zero.
+ */
+ if ((rcx & vie_size2mask(vie->addrsize)) != 0)
+ vm_restart_instruction(vm, vcpuid);
+ }
+done:
+ if (error < 0)
+ return (EFAULT);
+ else
+ return (0);
+}
+
+static int
+emulate_stos(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *arg)
+{
+ int error, opsize, repeat;
+ uint64_t val;
+ uint64_t rcx, rdi, rflags;
+
+ opsize = (vie->op.op_byte == 0xAA) ? 1 : vie->opsize;
+ repeat = vie->repz_present | vie->repnz_present;
+
+ if (repeat) {
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RCX, &rcx);
+ KASSERT(!error, ("%s: error %d getting rcx", __func__, error));
+
+ /*
+ * The count register is %rcx, %ecx or %cx depending on the
+ * address size of the instruction.
+ */
+ if ((rcx & vie_size2mask(vie->addrsize)) == 0)
+ return (0);
+ }
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RAX, &val);
+ KASSERT(!error, ("%s: error %d getting rax", __func__, error));
+
+ error = memwrite(vm, vcpuid, gpa, val, opsize, arg);
+ if (error)
+ return (error);
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RDI, &rdi);
+ KASSERT(error == 0, ("%s: error %d getting rdi", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+ if (rflags & PSL_D)
+ rdi -= opsize;
+ else
+ rdi += opsize;
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RDI, rdi,
+ vie->addrsize);
+ KASSERT(error == 0, ("%s: error %d updating rdi", __func__, error));
+
+ if (repeat) {
+ rcx = rcx - 1;
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RCX,
+ rcx, vie->addrsize);
+ KASSERT(!error, ("%s: error %d updating rcx", __func__, error));
+
+ /*
+ * Repeat the instruction if the count register is not zero.
+ */
+ if ((rcx & vie_size2mask(vie->addrsize)) != 0)
+ vm_restart_instruction(vm, vcpuid);
+ }
+
+ return (0);
+}
+
+static int
+emulate_and(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ enum vm_reg_name reg;
+ uint64_t result, rflags, rflags2, val1, val2;
+
+ size = vie->opsize;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x23:
+ /*
+ * AND reg (ModRM:reg) and mem (ModRM:r/m) and store the
+ * result in reg.
+ *
+ * 23/r and r16, r/m16
+ * 23/r and r32, r/m32
+ * REX.W + 23/r and r64, r/m64
+ */
+
+ /* get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val1);
+ if (error)
+ break;
+
+ /* get the second operand */
+ error = memread(vm, vcpuid, gpa, &val2, size, arg);
+ if (error)
+ break;
+
+ /* perform the operation and write the result */
+ result = val1 & val2;
+ error = vie_update_register(vm, vcpuid, reg, result, size);
+ break;
+ case 0x81:
+ case 0x83:
+ /*
+ * AND mem (ModRM:r/m) with immediate and store the
+ * result in mem.
+ *
+ * 81 /4 and r/m16, imm16
+ * 81 /4 and r/m32, imm32
+ * REX.W + 81 /4 and r/m64, imm32 sign-extended to 64
+ *
+ * 83 /4 and r/m16, imm8 sign-extended to 16
+ * 83 /4 and r/m32, imm8 sign-extended to 32
+ * REX.W + 83/4 and r/m64, imm8 sign-extended to 64
+ */
+
+ /* get the first operand */
+ error = memread(vm, vcpuid, gpa, &val1, size, arg);
+ if (error)
+ break;
+
+ /*
+ * perform the operation with the pre-fetched immediate
+ * operand and write the result
+ */
+ result = val1 & vie->immediate;
+ error = memwrite(vm, vcpuid, gpa, result, size, arg);
+ break;
+ default:
+ break;
+ }
+ if (error)
+ return (error);
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ if (error)
+ return (error);
+
+ /*
+ * OF and CF are cleared; the SF, ZF and PF flags are set according
+ * to the result; AF is undefined.
+ *
+ * The updated status flags are obtained by subtracting 0 from 'result'.
+ */
+ rflags2 = getcc(size, result, 0);
+ rflags &= ~RFLAGS_STATUS_BITS;
+ rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
+ return (error);
+}
+
+static int
+emulate_or(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ uint64_t val1, result, rflags, rflags2;
+
+ size = vie->opsize;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x81:
+ case 0x83:
+ /*
+ * OR mem (ModRM:r/m) with immediate and store the
+ * result in mem.
+ *
+ * 81 /1 or r/m16, imm16
+ * 81 /1 or r/m32, imm32
+ * REX.W + 81 /1 or r/m64, imm32 sign-extended to 64
+ *
+ * 83 /1 or r/m16, imm8 sign-extended to 16
+ * 83 /1 or r/m32, imm8 sign-extended to 32
+ * REX.W + 83/1 or r/m64, imm8 sign-extended to 64
+ */
+
+ /* get the first operand */
+ error = memread(vm, vcpuid, gpa, &val1, size, arg);
+ if (error)
+ break;
+
+ /*
+ * perform the operation with the pre-fetched immediate
+ * operand and write the result
+ */
+ result = val1 | vie->immediate;
+ error = memwrite(vm, vcpuid, gpa, result, size, arg);
+ break;
+ default:
+ break;
+ }
+ if (error)
+ return (error);
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ if (error)
+ return (error);
+
+ /*
+ * OF and CF are cleared; the SF, ZF and PF flags are set according
+ * to the result; AF is undefined.
+ *
+ * The updated status flags are obtained by subtracting 0 from 'result'.
+ */
+ rflags2 = getcc(size, result, 0);
+ rflags &= ~RFLAGS_STATUS_BITS;
+ rflags |= rflags2 & (PSL_PF | PSL_Z | PSL_N);
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
+ return (error);
+}
+
+static int
+emulate_cmp(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ uint64_t op1, op2, rflags, rflags2;
+ enum vm_reg_name reg;
+
+ size = vie->opsize;
+ switch (vie->op.op_byte) {
+ case 0x3B:
+ /*
+ * 3B/r CMP r16, r/m16
+ * 3B/r CMP r32, r/m32
+ * REX.W + 3B/r CMP r64, r/m64
+ *
+ * Compare first operand (reg) with second operand (r/m) and
+ * set status flags in EFLAGS register. The comparison is
+ * performed by subtracting the second operand from the first
+ * operand and then setting the status flags.
+ */
+
+ /* Get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &op1);
+ if (error)
+ return (error);
+
+ /* Get the second operand */
+ error = memread(vm, vcpuid, gpa, &op2, size, arg);
+ if (error)
+ return (error);
+
+ rflags2 = getcc(size, op1, op2);
+ break;
+ case 0x81:
+ case 0x83:
+ /*
+ * 81 /7 cmp r/m16, imm16
+ * 81 /7 cmp r/m32, imm32
+ * REX.W + 81 /7 cmp r/m64, imm32 sign-extended to 64
+ *
+ * 83 /7 cmp r/m16, imm8 sign-extended to 16
+ * 83 /7 cmp r/m32, imm8 sign-extended to 32
+ * REX.W + 83 /7 cmp r/m64, imm8 sign-extended to 64
+ *
+ * Compare mem (ModRM:r/m) with immediate and set
+ * status flags according to the results. The
+ * comparison is performed by subtracting the
+ * immediate from the first operand and then setting
+ * the status flags.
+ *
+ */
+
+ /* get the first operand */
+ error = memread(vm, vcpuid, gpa, &op1, size, arg);
+ if (error)
+ return (error);
+
+ rflags2 = getcc(size, op1, vie->immediate);
+ break;
+ default:
+ return (EINVAL);
+ }
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ if (error)
+ return (error);
+ rflags &= ~RFLAGS_STATUS_BITS;
+ rflags |= rflags2 & RFLAGS_STATUS_BITS;
+
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, rflags, 8);
+ return (error);
+}
+
+static int
+emulate_sub(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ mem_region_read_t memread, mem_region_write_t memwrite, void *arg)
+{
+ int error, size;
+ uint64_t nval, rflags, rflags2, val1, val2;
+ enum vm_reg_name reg;
+
+ size = vie->opsize;
+ error = EINVAL;
+
+ switch (vie->op.op_byte) {
+ case 0x2B:
+ /*
+ * SUB r/m from r and store the result in r
+ *
+ * 2B/r SUB r16, r/m16
+ * 2B/r SUB r32, r/m32
+ * REX.W + 2B/r SUB r64, r/m64
+ */
+
+ /* get the first operand */
+ reg = gpr_map[vie->reg];
+ error = vie_read_register(vm, vcpuid, reg, &val1);
+ if (error)
+ break;
+
+ /* get the second operand */
+ error = memread(vm, vcpuid, gpa, &val2, size, arg);
+ if (error)
+ break;
+
+ /* perform the operation and write the result */
+ nval = val1 - val2;
+ error = vie_update_register(vm, vcpuid, reg, nval, size);
+ break;
+ default:
+ break;
+ }
+
+ if (!error) {
+ rflags2 = getcc(size, val1, val2);
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
+ &rflags);
+ if (error)
+ return (error);
+
+ rflags &= ~RFLAGS_STATUS_BITS;
+ rflags |= rflags2 & RFLAGS_STATUS_BITS;
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RFLAGS,
+ rflags, 8);
+ }
+
+ return (error);
+}
+
+static int
+emulate_stack_op(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *arg)
+{
+#ifdef _KERNEL
+ struct vm_copyinfo copyinfo[2];
+#else
+ struct iovec copyinfo[2];
+#endif
+ struct seg_desc ss_desc;
+ uint64_t cr0, rflags, rsp, stack_gla, val;
+ int error, size, stackaddrsize, pushop;
+
+ val = 0;
+ size = vie->opsize;
+ pushop = (vie->op.op_type == VIE_OP_TYPE_PUSH) ? 1 : 0;
+
+ /*
+ * From "Address-Size Attributes for Stack Accesses", Intel SDL, Vol 1
+ */
+ if (paging->cpu_mode == CPU_MODE_REAL) {
+ stackaddrsize = 2;
+ } else if (paging->cpu_mode == CPU_MODE_64BIT) {
+ /*
+ * "Stack Manipulation Instructions in 64-bit Mode", SDM, Vol 3
+ * - Stack pointer size is always 64-bits.
+ * - PUSH/POP of 32-bit values is not possible in 64-bit mode.
+ * - 16-bit PUSH/POP is supported by using the operand size
+ * override prefix (66H).
+ */
+ stackaddrsize = 8;
+ size = vie->opsize_override ? 2 : 8;
+ } else {
+ /*
+ * In protected or compability mode the 'B' flag in the
+ * stack-segment descriptor determines the size of the
+ * stack pointer.
+ */
+ error = vm_get_seg_desc(vm, vcpuid, VM_REG_GUEST_SS, &ss_desc);
+ KASSERT(error == 0, ("%s: error %d getting SS descriptor",
+ __func__, error));
+ if (SEG_DESC_DEF32(ss_desc.access))
+ stackaddrsize = 4;
+ else
+ stackaddrsize = 2;
+ }
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_CR0, &cr0);
+ KASSERT(error == 0, ("%s: error %d getting cr0", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RFLAGS, &rflags);
+ KASSERT(error == 0, ("%s: error %d getting rflags", __func__, error));
+
+ error = vie_read_register(vm, vcpuid, VM_REG_GUEST_RSP, &rsp);
+ KASSERT(error == 0, ("%s: error %d getting rsp", __func__, error));
+ if (pushop) {
+ rsp -= size;
+ }
+
+ if (vie_calculate_gla(paging->cpu_mode, VM_REG_GUEST_SS, &ss_desc,
+ rsp, size, stackaddrsize, pushop ? PROT_WRITE : PROT_READ,
+ &stack_gla)) {
+ vm_inject_ss(vm, vcpuid, 0);
+ return (0);
+ }
+
+ if (vie_canonical_check(paging->cpu_mode, stack_gla)) {
+ vm_inject_ss(vm, vcpuid, 0);
+ return (0);
+ }
+
+ if (vie_alignment_check(paging->cpl, size, cr0, rflags, stack_gla)) {
+ vm_inject_ac(vm, vcpuid, 0);
+ return (0);
+ }
+
+ error = vm_copy_setup(vm, vcpuid, paging, stack_gla, size,
+ pushop ? PROT_WRITE : PROT_READ, copyinfo, nitems(copyinfo));
+ if (error == -1) {
+ /*
+ * XXX cannot return a negative error value here because it
+ * ends up being the return value of the VM_RUN() ioctl and
+ * is interpreted as a pseudo-error (for e.g. ERESTART).
+ */
+ return (EFAULT);
+ } else if (error == 1) {
+ /* Resume guest execution to handle page fault */
+ return (0);
+ }
+
+ if (pushop) {
+ error = memread(vm, vcpuid, mmio_gpa, &val, size, arg);
+ if (error == 0)
+ vm_copyout(vm, vcpuid, &val, copyinfo, size);
+ } else {
+ vm_copyin(vm, vcpuid, copyinfo, &val, size);
+ error = memwrite(vm, vcpuid, mmio_gpa, val, size, arg);
+ rsp += size;
+ }
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+
+ if (error == 0) {
+ error = vie_update_register(vm, vcpuid, VM_REG_GUEST_RSP, rsp,
+ stackaddrsize);
+ KASSERT(error == 0, ("error %d updating rsp", error));
+ }
+ return (error);
+}
+
+static int
+emulate_push(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *arg)
+{
+ int error;
+
+ /*
+ * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
+ *
+ * PUSH is part of the group 5 extended opcodes and is identified
+ * by ModRM:reg = b110.
+ */
+ if ((vie->reg & 7) != 6)
+ return (EINVAL);
+
+ error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
+ memwrite, arg);
+ return (error);
+}
+
+static int
+emulate_pop(void *vm, int vcpuid, uint64_t mmio_gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *arg)
+{
+ int error;
+
+ /*
+ * Table A-6, "Opcode Extensions", Intel SDM, Vol 2.
+ *
+ * POP is part of the group 1A extended opcodes and is identified
+ * by ModRM:reg = b000.
+ */
+ if ((vie->reg & 7) != 0)
+ return (EINVAL);
+
+ error = emulate_stack_op(vm, vcpuid, mmio_gpa, vie, paging, memread,
+ memwrite, arg);
+ return (error);
+}
+
+static int
+emulate_group1(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *memarg)
+{
+ int error;
+
+ switch (vie->reg & 7) {
+ case 0x1: /* OR */
+ error = emulate_or(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case 0x4: /* AND */
+ error = emulate_and(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case 0x7: /* CMP */
+ error = emulate_cmp(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+int
+vmm_emulate_instruction(void *vm, int vcpuid, uint64_t gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t memread,
+ mem_region_write_t memwrite, void *memarg)
+{
+ int error;
+
+ if (!vie->decoded)
+ return (EINVAL);
+
+ switch (vie->op.op_type) {
+ case VIE_OP_TYPE_GROUP1:
+ error = emulate_group1(vm, vcpuid, gpa, vie, paging, memread,
+ memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_POP:
+ error = emulate_pop(vm, vcpuid, gpa, vie, paging, memread,
+ memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_PUSH:
+ error = emulate_push(vm, vcpuid, gpa, vie, paging, memread,
+ memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_CMP:
+ error = emulate_cmp(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_MOV:
+ error = emulate_mov(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_MOVSX:
+ case VIE_OP_TYPE_MOVZX:
+ error = emulate_movx(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_MOVS:
+ error = emulate_movs(vm, vcpuid, gpa, vie, paging, memread,
+ memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_STOS:
+ error = emulate_stos(vm, vcpuid, gpa, vie, paging, memread,
+ memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_AND:
+ error = emulate_and(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_OR:
+ error = emulate_or(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ case VIE_OP_TYPE_SUB:
+ error = emulate_sub(vm, vcpuid, gpa, vie,
+ memread, memwrite, memarg);
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+
+ return (error);
+}
+
+int
+vie_alignment_check(int cpl, int size, uint64_t cr0, uint64_t rf, uint64_t gla)
+{
+ KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
+ ("%s: invalid size %d", __func__, size));
+ KASSERT(cpl >= 0 && cpl <= 3, ("%s: invalid cpl %d", __func__, cpl));
+
+ if (cpl != 3 || (cr0 & CR0_AM) == 0 || (rf & PSL_AC) == 0)
+ return (0);
+
+ return ((gla & (size - 1)) ? 1 : 0);
+}
+
+int
+vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla)
+{
+ uint64_t mask;
+
+ if (cpu_mode != CPU_MODE_64BIT)
+ return (0);
+
+ /*
+ * The value of the bit 47 in the 'gla' should be replicated in the
+ * most significant 16 bits.
+ */
+ mask = ~((1UL << 48) - 1);
+ if (gla & (1UL << 47))
+ return ((gla & mask) != mask);
+ else
+ return ((gla & mask) != 0);
+}
+
+uint64_t
+vie_size2mask(int size)
+{
+ KASSERT(size == 1 || size == 2 || size == 4 || size == 8,
+ ("vie_size2mask: invalid size %d", size));
+ return (size2mask[size]);
+}
+
+int
+vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
+ struct seg_desc *desc, uint64_t offset, int length, int addrsize,
+ int prot, uint64_t *gla)
+{
+ uint64_t firstoff, low_limit, high_limit, segbase;
+ int glasize, type;
+
+ KASSERT(seg >= VM_REG_GUEST_ES && seg <= VM_REG_GUEST_GS,
+ ("%s: invalid segment %d", __func__, seg));
+ KASSERT(length == 1 || length == 2 || length == 4 || length == 8,
+ ("%s: invalid operand size %d", __func__, length));
+#ifdef __FreeBSD__
+ KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
+ ("%s: invalid prot %#x", __func__, prot));
+#else
+ KASSERT((prot & ~(PROT_READ | PROT_WRITE)) == 0,
+ ("%s: invalid prot %x", __func__, prot));
+#endif
+
+ firstoff = offset;
+ if (cpu_mode == CPU_MODE_64BIT) {
+ KASSERT(addrsize == 4 || addrsize == 8, ("%s: invalid address "
+ "size %d for cpu_mode %d", __func__, addrsize, cpu_mode));
+ glasize = 8;
+ } else {
+ KASSERT(addrsize == 2 || addrsize == 4, ("%s: invalid address "
+ "size %d for cpu mode %d", __func__, addrsize, cpu_mode));
+ glasize = 4;
+ /*
+ * If the segment selector is loaded with a NULL selector
+ * then the descriptor is unusable and attempting to use
+ * it results in a #GP(0).
+ */
+ if (SEG_DESC_UNUSABLE(desc->access))
+ return (-1);
+
+ /*
+ * The processor generates a #NP exception when a segment
+ * register is loaded with a selector that points to a
+ * descriptor that is not present. If this was the case then
+ * it would have been checked before the VM-exit.
+ */
+#ifdef __FreeBSD__
+ KASSERT(SEG_DESC_PRESENT(desc->access),
+ ("segment %d not present: %#x", seg, desc->access));
+#else
+ KASSERT(SEG_DESC_PRESENT(desc->access),
+ ("segment %d not present: %x", seg, desc->access));
+#endif
+
+ /*
+ * The descriptor type must indicate a code/data segment.
+ */
+ type = SEG_DESC_TYPE(desc->access);
+#ifdef __FreeBSD__
+ KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
+ "descriptor type %#x", seg, type));
+#else
+ KASSERT(type >= 16 && type <= 31, ("segment %d has invalid "
+ "descriptor type %x", seg, type));
+#endif
+
+ if (prot & PROT_READ) {
+ /* #GP on a read access to a exec-only code segment */
+ if ((type & 0xA) == 0x8)
+ return (-1);
+ }
+
+ if (prot & PROT_WRITE) {
+ /*
+ * #GP on a write access to a code segment or a
+ * read-only data segment.
+ */
+ if (type & 0x8) /* code segment */
+ return (-1);
+
+ if ((type & 0xA) == 0) /* read-only data seg */
+ return (-1);
+ }
+
+ /*
+ * 'desc->limit' is fully expanded taking granularity into
+ * account.
+ */
+ if ((type & 0xC) == 0x4) {
+ /* expand-down data segment */
+ low_limit = desc->limit + 1;
+ high_limit = SEG_DESC_DEF32(desc->access) ?
+ 0xffffffff : 0xffff;
+ } else {
+ /* code segment or expand-up data segment */
+ low_limit = 0;
+ high_limit = desc->limit;
+ }
+
+ while (length > 0) {
+ offset &= vie_size2mask(addrsize);
+ if (offset < low_limit || offset > high_limit)
+ return (-1);
+ offset++;
+ length--;
+ }
+ }
+
+ /*
+ * In 64-bit mode all segments except %fs and %gs have a segment
+ * base address of 0.
+ */
+ if (cpu_mode == CPU_MODE_64BIT && seg != VM_REG_GUEST_FS &&
+ seg != VM_REG_GUEST_GS) {
+ segbase = 0;
+ } else {
+ segbase = desc->base;
+ }
+
+ /*
+ * Truncate 'firstoff' to the effective address size before adding
+ * it to the segment base.
+ */
+ firstoff &= vie_size2mask(addrsize);
+ *gla = (segbase + firstoff) & vie_size2mask(glasize);
+ return (0);
+}
+
+#ifdef _KERNEL
+void
+vie_init(struct vie *vie, const char *inst_bytes, int inst_length)
+{
+ KASSERT(inst_length >= 0 && inst_length <= VIE_INST_SIZE,
+ ("%s: invalid instruction length (%d)", __func__, inst_length));
+
+ bzero(vie, sizeof(struct vie));
+
+ vie->base_register = VM_REG_LAST;
+ vie->index_register = VM_REG_LAST;
+ vie->segment_register = VM_REG_LAST;
+
+ if (inst_length) {
+ bcopy(inst_bytes, vie->inst, inst_length);
+ vie->num_valid = inst_length;
+ }
+}
+
+static int
+pf_error_code(int usermode, int prot, int rsvd, uint64_t pte)
+{
+ int error_code = 0;
+
+ if (pte & PG_V)
+ error_code |= PGEX_P;
+ if (prot & VM_PROT_WRITE)
+ error_code |= PGEX_W;
+ if (usermode)
+ error_code |= PGEX_U;
+ if (rsvd)
+ error_code |= PGEX_RSV;
+ if (prot & VM_PROT_EXECUTE)
+ error_code |= PGEX_I;
+
+ return (error_code);
+}
+
+static void
+ptp_release(void **cookie)
+{
+ if (*cookie != NULL) {
+ vm_gpa_release(*cookie);
+ *cookie = NULL;
+ }
+}
+
+static void *
+ptp_hold(struct vm *vm, vm_paddr_t ptpphys, size_t len, void **cookie)
+{
+ void *ptr;
+
+ ptp_release(cookie);
+ ptr = vm_gpa_hold(vm, ptpphys, len, VM_PROT_RW, cookie);
+ return (ptr);
+}
+
+int
+vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, int prot, uint64_t *gpa)
+{
+ int nlevels, pfcode, ptpshift, ptpindex, retval, usermode, writable;
+#ifdef __FreeBSD__
+#endif
+ u_int retries;
+ uint64_t *ptpbase, ptpphys, pte, pgsize;
+ uint32_t *ptpbase32, pte32;
+ void *cookie;
+
+ usermode = (paging->cpl == 3 ? 1 : 0);
+ writable = prot & VM_PROT_WRITE;
+ cookie = NULL;
+ retval = 0;
+#ifdef __FreeBSD__
+ retries = 0;
+#endif
+restart:
+ ptpphys = paging->cr3; /* root of the page tables */
+ ptp_release(&cookie);
+#ifdef __FreeBSD__
+ if (retries++ > 0)
+ maybe_yield();
+#endif
+
+ if (vie_canonical_check(paging->cpu_mode, gla)) {
+ /*
+ * XXX assuming a non-stack reference otherwise a stack fault
+ * should be generated.
+ */
+ vm_inject_gp(vm, vcpuid);
+ goto fault;
+ }
+
+ if (paging->paging_mode == PAGING_MODE_FLAT) {
+ *gpa = gla;
+ goto done;
+ }
+
+ if (paging->paging_mode == PAGING_MODE_32) {
+ nlevels = 2;
+ while (--nlevels >= 0) {
+ /* Zero out the lower 12 bits. */
+ ptpphys &= ~0xfff;
+
+ ptpbase32 = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
+
+ if (ptpbase32 == NULL)
+ goto error;
+
+ ptpshift = PAGE_SHIFT + nlevels * 10;
+ ptpindex = (gla >> ptpshift) & 0x3FF;
+ pgsize = 1UL << ptpshift;
+
+ pte32 = ptpbase32[ptpindex];
+
+ if ((pte32 & PG_V) == 0 ||
+ (usermode && (pte32 & PG_U) == 0) ||
+ (writable && (pte32 & PG_RW) == 0)) {
+ pfcode = pf_error_code(usermode, prot, 0,
+ pte32);
+ vm_inject_pf(vm, vcpuid, pfcode, gla);
+ goto fault;
+ }
+
+ /*
+ * Emulate the x86 MMU's management of the accessed
+ * and dirty flags. While the accessed flag is set
+ * at every level of the page table, the dirty flag
+ * is only set at the last level providing the guest
+ * physical address.
+ */
+ if ((pte32 & PG_A) == 0) {
+ if (atomic_cmpset_32(&ptpbase32[ptpindex],
+ pte32, pte32 | PG_A) == 0) {
+ goto restart;
+ }
+ }
+
+ /* XXX must be ignored if CR4.PSE=0 */
+ if (nlevels > 0 && (pte32 & PG_PS) != 0)
+ break;
+
+ ptpphys = pte32;
+ }
+
+ /* Set the dirty bit in the page table entry if necessary */
+ if (writable && (pte32 & PG_M) == 0) {
+ if (atomic_cmpset_32(&ptpbase32[ptpindex],
+ pte32, pte32 | PG_M) == 0) {
+ goto restart;
+ }
+ }
+
+ /* Zero out the lower 'ptpshift' bits */
+ pte32 >>= ptpshift; pte32 <<= ptpshift;
+ *gpa = pte32 | (gla & (pgsize - 1));
+ goto done;
+ }
+
+ if (paging->paging_mode == PAGING_MODE_PAE) {
+ /* Zero out the lower 5 bits and the upper 32 bits */
+ ptpphys &= 0xffffffe0UL;
+
+ ptpbase = ptp_hold(vm, ptpphys, sizeof(*ptpbase) * 4, &cookie);
+ if (ptpbase == NULL)
+ goto error;
+
+ ptpindex = (gla >> 30) & 0x3;
+
+ pte = ptpbase[ptpindex];
+
+ if ((pte & PG_V) == 0) {
+ pfcode = pf_error_code(usermode, prot, 0, pte);
+ vm_inject_pf(vm, vcpuid, pfcode, gla);
+ goto fault;
+ }
+
+ ptpphys = pte;
+
+ nlevels = 2;
+ } else
+ nlevels = 4;
+ while (--nlevels >= 0) {
+ /* Zero out the lower 12 bits and the upper 12 bits */
+ ptpphys >>= 12; ptpphys <<= 24; ptpphys >>= 12;
+
+ ptpbase = ptp_hold(vm, ptpphys, PAGE_SIZE, &cookie);
+ if (ptpbase == NULL)
+ goto error;
+
+ ptpshift = PAGE_SHIFT + nlevels * 9;
+ ptpindex = (gla >> ptpshift) & 0x1FF;
+ pgsize = 1UL << ptpshift;
+
+ pte = ptpbase[ptpindex];
+
+ if ((pte & PG_V) == 0 ||
+ (usermode && (pte & PG_U) == 0) ||
+ (writable && (pte & PG_RW) == 0)) {
+ pfcode = pf_error_code(usermode, prot, 0, pte);
+ vm_inject_pf(vm, vcpuid, pfcode, gla);
+ goto fault;
+ }
+
+ /* Set the accessed bit in the page table entry */
+ if ((pte & PG_A) == 0) {
+ if (atomic_cmpset_64(&ptpbase[ptpindex],
+ pte, pte | PG_A) == 0) {
+ goto restart;
+ }
+ }
+
+ if (nlevels > 0 && (pte & PG_PS) != 0) {
+ if (pgsize > 1 * GB) {
+ pfcode = pf_error_code(usermode, prot, 1, pte);
+ vm_inject_pf(vm, vcpuid, pfcode, gla);
+ goto fault;
+ }
+ break;
+ }
+
+ ptpphys = pte;
+ }
+
+ /* Set the dirty bit in the page table entry if necessary */
+ if (writable && (pte & PG_M) == 0) {
+ if (atomic_cmpset_64(&ptpbase[ptpindex], pte, pte | PG_M) == 0)
+ goto restart;
+ }
+
+ /* Zero out the lower 'ptpshift' bits and the upper 12 bits */
+ pte >>= ptpshift; pte <<= (ptpshift + 12); pte >>= 12;
+ *gpa = pte | (gla & (pgsize - 1));
+done:
+ ptp_release(&cookie);
+ return (retval);
+error:
+ retval = -1;
+ goto done;
+fault:
+ retval = 1;
+ goto done;
+}
+
+int
+vmm_fetch_instruction(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t rip, int inst_length, struct vie *vie)
+{
+ struct vm_copyinfo copyinfo[2];
+ int error, prot;
+
+ if (inst_length > VIE_INST_SIZE)
+ panic("vmm_fetch_instruction: invalid length %d", inst_length);
+
+ prot = PROT_READ | PROT_EXEC;
+ error = vm_copy_setup(vm, vcpuid, paging, rip, inst_length, prot,
+ copyinfo, nitems(copyinfo));
+ if (error == 0) {
+ vm_copyin(vm, vcpuid, copyinfo, vie->inst, inst_length);
+ vm_copy_teardown(vm, vcpuid, copyinfo, nitems(copyinfo));
+ vie->num_valid = inst_length;
+ }
+ return (error);
+}
+
+static int
+vie_peek(struct vie *vie, uint8_t *x)
+{
+
+ if (vie->num_processed < vie->num_valid) {
+ *x = vie->inst[vie->num_processed];
+ return (0);
+ } else
+ return (-1);
+}
+
+static void
+vie_advance(struct vie *vie)
+{
+
+ vie->num_processed++;
+}
+
+static bool
+segment_override(uint8_t x, int *seg)
+{
+
+ switch (x) {
+ case 0x2E:
+ *seg = VM_REG_GUEST_CS;
+ break;
+ case 0x36:
+ *seg = VM_REG_GUEST_SS;
+ break;
+ case 0x3E:
+ *seg = VM_REG_GUEST_DS;
+ break;
+ case 0x26:
+ *seg = VM_REG_GUEST_ES;
+ break;
+ case 0x64:
+ *seg = VM_REG_GUEST_FS;
+ break;
+ case 0x65:
+ *seg = VM_REG_GUEST_GS;
+ break;
+ default:
+ return (false);
+ }
+ return (true);
+}
+
+static int
+decode_prefixes(struct vie *vie, enum vm_cpu_mode cpu_mode, int cs_d)
+{
+ uint8_t x;
+
+ while (1) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ if (x == 0x66)
+ vie->opsize_override = 1;
+ else if (x == 0x67)
+ vie->addrsize_override = 1;
+ else if (x == 0xF3)
+ vie->repz_present = 1;
+ else if (x == 0xF2)
+ vie->repnz_present = 1;
+ else if (segment_override(x, &vie->segment_register))
+ vie->segment_override = 1;
+ else
+ break;
+
+ vie_advance(vie);
+ }
+
+ /*
+ * From section 2.2.1, "REX Prefixes", Intel SDM Vol 2:
+ * - Only one REX prefix is allowed per instruction.
+ * - The REX prefix must immediately precede the opcode byte or the
+ * escape opcode byte.
+ * - If an instruction has a mandatory prefix (0x66, 0xF2 or 0xF3)
+ * the mandatory prefix must come before the REX prefix.
+ */
+ if (cpu_mode == CPU_MODE_64BIT && x >= 0x40 && x <= 0x4F) {
+ vie->rex_present = 1;
+ vie->rex_w = x & 0x8 ? 1 : 0;
+ vie->rex_r = x & 0x4 ? 1 : 0;
+ vie->rex_x = x & 0x2 ? 1 : 0;
+ vie->rex_b = x & 0x1 ? 1 : 0;
+ vie_advance(vie);
+ }
+
+ /*
+ * Section "Operand-Size And Address-Size Attributes", Intel SDM, Vol 1
+ */
+ if (cpu_mode == CPU_MODE_64BIT) {
+ /*
+ * Default address size is 64-bits and default operand size
+ * is 32-bits.
+ */
+ vie->addrsize = vie->addrsize_override ? 4 : 8;
+ if (vie->rex_w)
+ vie->opsize = 8;
+ else if (vie->opsize_override)
+ vie->opsize = 2;
+ else
+ vie->opsize = 4;
+ } else if (cs_d) {
+ /* Default address and operand sizes are 32-bits */
+ vie->addrsize = vie->addrsize_override ? 2 : 4;
+ vie->opsize = vie->opsize_override ? 2 : 4;
+ } else {
+ /* Default address and operand sizes are 16-bits */
+ vie->addrsize = vie->addrsize_override ? 4 : 2;
+ vie->opsize = vie->opsize_override ? 4 : 2;
+ }
+ return (0);
+}
+
+static int
+decode_two_byte_opcode(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ vie->op = two_byte_opcodes[x];
+
+ if (vie->op.op_type == VIE_OP_TYPE_NONE)
+ return (-1);
+
+ vie_advance(vie);
+ return (0);
+}
+
+static int
+decode_opcode(struct vie *vie)
+{
+ uint8_t x;
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ vie->op = one_byte_opcodes[x];
+
+ if (vie->op.op_type == VIE_OP_TYPE_NONE)
+ return (-1);
+
+ vie_advance(vie);
+
+ if (vie->op.op_type == VIE_OP_TYPE_TWO_BYTE)
+ return (decode_two_byte_opcode(vie));
+
+ return (0);
+}
+
+static int
+decode_modrm(struct vie *vie, enum vm_cpu_mode cpu_mode)
+{
+ uint8_t x;
+
+ if (vie->op.op_flags & VIE_OP_F_NO_MODRM)
+ return (0);
+
+ if (cpu_mode == CPU_MODE_REAL)
+ return (-1);
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ vie->mod = (x >> 6) & 0x3;
+ vie->rm = (x >> 0) & 0x7;
+ vie->reg = (x >> 3) & 0x7;
+
+ /*
+ * A direct addressing mode makes no sense in the context of an EPT
+ * fault. There has to be a memory access involved to cause the
+ * EPT fault.
+ */
+ if (vie->mod == VIE_MOD_DIRECT)
+ return (-1);
+
+ if ((vie->mod == VIE_MOD_INDIRECT && vie->rm == VIE_RM_DISP32) ||
+ (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)) {
+ /*
+ * Table 2-5: Special Cases of REX Encodings
+ *
+ * mod=0, r/m=5 is used in the compatibility mode to
+ * indicate a disp32 without a base register.
+ *
+ * mod!=3, r/m=4 is used in the compatibility mode to
+ * indicate that the SIB byte is present.
+ *
+ * The 'b' bit in the REX prefix is don't care in
+ * this case.
+ */
+ } else {
+ vie->rm |= (vie->rex_b << 3);
+ }
+
+ vie->reg |= (vie->rex_r << 3);
+
+ /* SIB */
+ if (vie->mod != VIE_MOD_DIRECT && vie->rm == VIE_RM_SIB)
+ goto done;
+
+ vie->base_register = gpr_map[vie->rm];
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ case VIE_MOD_INDIRECT:
+ if (vie->rm == VIE_RM_DISP32) {
+ vie->disp_bytes = 4;
+ /*
+ * Table 2-7. RIP-Relative Addressing
+ *
+ * In 64-bit mode mod=00 r/m=101 implies [rip] + disp32
+ * whereas in compatibility mode it just implies disp32.
+ */
+
+ if (cpu_mode == CPU_MODE_64BIT)
+ vie->base_register = VM_REG_GUEST_RIP;
+ else
+ vie->base_register = VM_REG_LAST;
+ }
+ break;
+ }
+
+done:
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_sib(struct vie *vie)
+{
+ uint8_t x;
+
+ /* Proceed only if SIB byte is present */
+ if (vie->mod == VIE_MOD_DIRECT || vie->rm != VIE_RM_SIB)
+ return (0);
+
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ /* De-construct the SIB byte */
+ vie->ss = (x >> 6) & 0x3;
+ vie->index = (x >> 3) & 0x7;
+ vie->base = (x >> 0) & 0x7;
+
+ /* Apply the REX prefix modifiers */
+ vie->index |= vie->rex_x << 3;
+ vie->base |= vie->rex_b << 3;
+
+ switch (vie->mod) {
+ case VIE_MOD_INDIRECT_DISP8:
+ vie->disp_bytes = 1;
+ break;
+ case VIE_MOD_INDIRECT_DISP32:
+ vie->disp_bytes = 4;
+ break;
+ }
+
+ if (vie->mod == VIE_MOD_INDIRECT &&
+ (vie->base == 5 || vie->base == 13)) {
+ /*
+ * Special case when base register is unused if mod = 0
+ * and base = %rbp or %r13.
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ vie->disp_bytes = 4;
+ } else {
+ vie->base_register = gpr_map[vie->base];
+ }
+
+ /*
+ * All encodings of 'index' are valid except for %rsp (4).
+ *
+ * Documented in:
+ * Table 2-3: 32-bit Addressing Forms with the SIB Byte
+ * Table 2-5: Special Cases of REX Encodings
+ */
+ if (vie->index != 4)
+ vie->index_register = gpr_map[vie->index];
+
+ /* 'scale' makes sense only in the context of an index register */
+ if (vie->index_register < VM_REG_LAST)
+ vie->scale = 1 << vie->ss;
+
+ vie_advance(vie);
+
+ return (0);
+}
+
+static int
+decode_displacement(struct vie *vie)
+{
+ int n, i;
+ uint8_t x;
+
+ union {
+ char buf[4];
+ int8_t signed8;
+ int32_t signed32;
+ } u;
+
+ if ((n = vie->disp_bytes) == 0)
+ return (0);
+
+ if (n != 1 && n != 4)
+ panic("decode_displacement: invalid disp_bytes %d", n);
+
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+
+ if (n == 1)
+ vie->displacement = u.signed8; /* sign-extended */
+ else
+ vie->displacement = u.signed32; /* sign-extended */
+
+ return (0);
+}
+
+static int
+decode_immediate(struct vie *vie)
+{
+ int i, n;
+ uint8_t x;
+ union {
+ char buf[4];
+ int8_t signed8;
+ int16_t signed16;
+ int32_t signed32;
+ } u;
+
+ /* Figure out immediate operand size (if any) */
+ if (vie->op.op_flags & VIE_OP_F_IMM) {
+ /*
+ * Section 2.2.1.5 "Immediates", Intel SDM:
+ * In 64-bit mode the typical size of immediate operands
+ * remains 32-bits. When the operand size if 64-bits, the
+ * processor sign-extends all immediates to 64-bits prior
+ * to their use.
+ */
+ if (vie->opsize == 4 || vie->opsize == 8)
+ vie->imm_bytes = 4;
+ else
+ vie->imm_bytes = 2;
+ } else if (vie->op.op_flags & VIE_OP_F_IMM8) {
+ vie->imm_bytes = 1;
+ }
+
+ if ((n = vie->imm_bytes) == 0)
+ return (0);
+
+ KASSERT(n == 1 || n == 2 || n == 4,
+ ("%s: invalid number of immediate bytes: %d", __func__, n));
+
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+
+ /* sign-extend the immediate value before use */
+ if (n == 1)
+ vie->immediate = u.signed8;
+ else if (n == 2)
+ vie->immediate = u.signed16;
+ else
+ vie->immediate = u.signed32;
+
+ return (0);
+}
+
+static int
+decode_moffset(struct vie *vie)
+{
+ int i, n;
+ uint8_t x;
+ union {
+ char buf[8];
+ uint64_t u64;
+ } u;
+
+ if ((vie->op.op_flags & VIE_OP_F_MOFFSET) == 0)
+ return (0);
+
+ /*
+ * Section 2.2.1.4, "Direct Memory-Offset MOVs", Intel SDM:
+ * The memory offset size follows the address-size of the instruction.
+ */
+ n = vie->addrsize;
+ KASSERT(n == 2 || n == 4 || n == 8, ("invalid moffset bytes: %d", n));
+
+ u.u64 = 0;
+ for (i = 0; i < n; i++) {
+ if (vie_peek(vie, &x))
+ return (-1);
+
+ u.buf[i] = x;
+ vie_advance(vie);
+ }
+ vie->displacement = u.u64;
+ return (0);
+}
+
+/*
+ * Verify that all the bytes in the instruction buffer were consumed.
+ */
+static int
+verify_inst_length(struct vie *vie)
+{
+
+ if (vie->num_processed)
+ return (0);
+ else
+ return (-1);
+}
+
+/*
+ * Verify that the 'guest linear address' provided as collateral of the nested
+ * page table fault matches with our instruction decoding.
+ */
+static int
+verify_gla(struct vm *vm, int cpuid, uint64_t gla, struct vie *vie)
+{
+ int error;
+ uint64_t base, idx, gla2;
+
+ /* Skip 'gla' verification */
+ if (gla == VIE_INVALID_GLA)
+ return (0);
+
+ base = 0;
+ if (vie->base_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->base_register, &base);
+ if (error) {
+ printf("verify_gla: error %d getting base reg %d\n",
+ error, vie->base_register);
+ return (-1);
+ }
+
+ /*
+ * RIP-relative addressing starts from the following
+ * instruction
+ */
+ if (vie->base_register == VM_REG_GUEST_RIP)
+ base += vie->num_valid;
+ }
+
+ idx = 0;
+ if (vie->index_register != VM_REG_LAST) {
+ error = vm_get_register(vm, cpuid, vie->index_register, &idx);
+ if (error) {
+ printf("verify_gla: error %d getting index reg %d\n",
+ error, vie->index_register);
+ return (-1);
+ }
+ }
+
+ /* XXX assuming that the base address of the segment is 0 */
+ gla2 = base + vie->scale * idx + vie->displacement;
+ gla2 &= size2mask[vie->addrsize];
+ if (gla != gla2) {
+ printf("verify_gla mismatch: "
+ "base(0x%0lx), scale(%d), index(0x%0lx), "
+ "disp(0x%0lx), gla(0x%0lx), gla2(0x%0lx)\n",
+ base, vie->scale, idx, vie->displacement, gla, gla2);
+ return (-1);
+ }
+
+ return (0);
+}
+
+int
+vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
+ enum vm_cpu_mode cpu_mode, int cs_d, struct vie *vie)
+{
+
+ if (decode_prefixes(vie, cpu_mode, cs_d))
+ return (-1);
+
+ if (decode_opcode(vie))
+ return (-1);
+
+ if (decode_modrm(vie, cpu_mode))
+ return (-1);
+
+ if (decode_sib(vie))
+ return (-1);
+
+ if (decode_displacement(vie))
+ return (-1);
+
+ if (decode_immediate(vie))
+ return (-1);
+
+ if (decode_moffset(vie))
+ return (-1);
+
+ if (verify_inst_length(vie))
+ return (-1);
+
+ if ((vie->op.op_flags & VIE_OP_F_NO_GLA_VERIFICATION) == 0) {
+ if (verify_gla(vm, cpuid, gla, vie))
+ return (-1);
+ }
+
+ vie->decoded = 1; /* success */
+
+ return (0);
+}
+#endif /* _KERNEL */
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.c b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c
new file mode 100644
index 0000000000..bea750f162
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.c
@@ -0,0 +1,174 @@
+/*-
+ * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_ioport.c 277168 2015-01-14 07:18:51Z neel $");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/queue.h>
+#include <sys/cpuset.h>
+#include <sys/systm.h>
+
+#include <machine/vmm.h>
+#include <machine/vmm_instruction_emul.h>
+
+#include "vatpic.h"
+#include "vatpit.h"
+#include "vmm_ioport.h"
+#include "vmm_ktr.h"
+
+#define MAX_IOPORTS 1280
+
+ioport_handler_func_t ioport_handler[MAX_IOPORTS] = {
+ [TIMER_MODE] = vatpit_handler,
+ [TIMER_CNTR0] = vatpit_handler,
+ [TIMER_CNTR1] = vatpit_handler,
+ [TIMER_CNTR2] = vatpit_handler,
+ [NMISC_PORT] = vatpit_nmisc_handler,
+ [IO_ICU1] = vatpic_master_handler,
+ [IO_ICU1 + ICU_IMR_OFFSET] = vatpic_master_handler,
+ [IO_ICU2] = vatpic_slave_handler,
+ [IO_ICU2 + ICU_IMR_OFFSET] = vatpic_slave_handler,
+ [IO_ELCR1] = vatpic_elc_handler,
+ [IO_ELCR2] = vatpic_elc_handler,
+};
+
+#ifdef KTR
+static const char *
+inout_instruction(struct vm_exit *vmexit)
+{
+ int index;
+
+ static const char *iodesc[] = {
+ "outb", "outw", "outl",
+ "inb", "inw", "inl",
+ "outsb", "outsw", "outsd",
+ "insb", "insw", "insd",
+ };
+
+ switch (vmexit->u.inout.bytes) {
+ case 1:
+ index = 0;
+ break;
+ case 2:
+ index = 1;
+ break;
+ default:
+ index = 2;
+ break;
+ }
+
+ if (vmexit->u.inout.in)
+ index += 3;
+
+ if (vmexit->u.inout.string)
+ index += 6;
+
+ KASSERT(index < nitems(iodesc), ("%s: invalid index %d",
+ __func__, index));
+
+ return (iodesc[index]);
+}
+#endif /* KTR */
+
+static int
+emulate_inout_port(struct vm *vm, int vcpuid, struct vm_exit *vmexit,
+ bool *retu)
+{
+ ioport_handler_func_t handler;
+ uint32_t mask, val;
+ int error;
+
+ /*
+ * If there is no handler for the I/O port then punt to userspace.
+ */
+ if (vmexit->u.inout.port >= MAX_IOPORTS ||
+ (handler = ioport_handler[vmexit->u.inout.port]) == NULL) {
+ *retu = true;
+ return (0);
+ }
+
+ mask = vie_size2mask(vmexit->u.inout.bytes);
+
+ if (!vmexit->u.inout.in) {
+ val = vmexit->u.inout.eax & mask;
+ }
+
+ error = (*handler)(vm, vcpuid, vmexit->u.inout.in,
+ vmexit->u.inout.port, vmexit->u.inout.bytes, &val);
+ if (error) {
+ /*
+ * The value returned by this function is also the return value
+ * of vm_run(). This needs to be a positive number otherwise it
+ * can be interpreted as a "pseudo-error" like ERESTART.
+ *
+ * Enforce this by mapping all errors to EIO.
+ */
+ return (EIO);
+ }
+
+ if (vmexit->u.inout.in) {
+ vmexit->u.inout.eax &= ~mask;
+ vmexit->u.inout.eax |= val & mask;
+ error = vm_set_register(vm, vcpuid, VM_REG_GUEST_RAX,
+ vmexit->u.inout.eax);
+ KASSERT(error == 0, ("emulate_ioport: error %d setting guest "
+ "rax register", error));
+ }
+ *retu = false;
+ return (0);
+}
+
+static int
+emulate_inout_str(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu)
+{
+ *retu = true;
+ return (0); /* Return to userspace to finish emulation */
+}
+
+int
+vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vmexit, bool *retu)
+{
+ int bytes, error;
+
+ bytes = vmexit->u.inout.bytes;
+ KASSERT(bytes == 1 || bytes == 2 || bytes == 4,
+ ("vm_handle_inout: invalid operand size %d", bytes));
+
+ if (vmexit->u.inout.string)
+ error = emulate_inout_str(vm, vcpuid, vmexit, retu);
+ else
+ error = emulate_inout_port(vm, vcpuid, vmexit, retu);
+
+ VCPU_CTR4(vm, vcpuid, "%s%s 0x%04x: %s",
+ vmexit->u.inout.rep ? "rep " : "",
+ inout_instruction(vmexit),
+ vmexit->u.inout.port,
+ error ? "error" : (*retu ? "userspace" : "handled"));
+
+ return (error);
+}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ioport.h b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h
new file mode 100644
index 0000000000..624dd8f1d8
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_ioport.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2014 Tycho Nightingale <tycho.nightingale@pluribusnetworks.com>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_ioport.h 273706 2014-10-26 19:03:06Z neel $
+ */
+
+#ifndef _VMM_IOPORT_H_
+#define _VMM_IOPORT_H_
+
+typedef int (*ioport_handler_func_t)(struct vm *vm, int vcpuid,
+ bool in, int port, int bytes, uint32_t *val);
+
+int vm_handle_inout(struct vm *vm, int vcpuid, struct vm_exit *vme, bool *retu);
+
+#endif /* _VMM_IOPORT_H_ */
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ipi.h b/usr/src/uts/i86pc/io/vmm/vmm_ipi.h
new file mode 100644
index 0000000000..4dff03ba1f
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_ipi.h
@@ -0,0 +1,37 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_ipi.h 260466 2014-01-09 03:25:54Z neel $
+ */
+
+#ifndef _VMM_IPI_H_
+#define _VMM_IPI_H_
+
+#ifdef __FreeBSD__
+int vmm_ipi_alloc(void);
+void vmm_ipi_free(int num);
+#endif
+
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_ktr.h b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h
new file mode 100644
index 0000000000..917c7f83a4
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_ktr.h
@@ -0,0 +1,69 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_ktr.h 258699 2013-11-27 22:18:08Z neel $
+ */
+
+#ifndef _VMM_KTR_H_
+#define _VMM_KTR_H_
+
+#include <sys/ktr.h>
+#include <sys/pcpu.h>
+
+#ifndef KTR_VMM
+#define KTR_VMM KTR_GEN
+#endif
+
+#define VCPU_CTR0(vm, vcpuid, format) \
+CTR2(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid))
+
+#define VCPU_CTR1(vm, vcpuid, format, p1) \
+CTR3(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1))
+
+#define VCPU_CTR2(vm, vcpuid, format, p1, p2) \
+CTR4(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2))
+
+#define VCPU_CTR3(vm, vcpuid, format, p1, p2, p3) \
+CTR5(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), (p1), (p2), (p3))
+
+#define VCPU_CTR4(vm, vcpuid, format, p1, p2, p3, p4) \
+CTR6(KTR_VMM, "vm %s[%d]: " format, vm_name((vm)), (vcpuid), \
+ (p1), (p2), (p3), (p4))
+
+#define VM_CTR0(vm, format) \
+CTR1(KTR_VMM, "vm %s: " format, vm_name((vm)))
+
+#define VM_CTR1(vm, format, p1) \
+CTR2(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1))
+
+#define VM_CTR2(vm, format, p1, p2) \
+CTR3(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2))
+
+#define VM_CTR3(vm, format, p1, p2, p3) \
+CTR4(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3))
+
+#define VM_CTR4(vm, format, p1, p2, p3, p4) \
+CTR5(KTR_VMM, "vm %s: " format, vm_name((vm)), (p1), (p2), (p3), (p4))
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.c b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c
new file mode 100644
index 0000000000..3215c74a44
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.c
@@ -0,0 +1,256 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z tychon $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_lapic.c 264509 2014-04-15 17:06:26Z tychon $");
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/smp.h>
+
+#include <x86/specialreg.h>
+#include <x86/apicreg.h>
+
+#include <machine/vmm.h>
+#include "vmm_ipi.h"
+#include "vmm_ktr.h"
+#include "vmm_lapic.h"
+#include "vlapic.h"
+
+/*
+ * Some MSI message definitions
+ */
+#define MSI_X86_ADDR_MASK 0xfff00000
+#define MSI_X86_ADDR_BASE 0xfee00000
+#define MSI_X86_ADDR_RH 0x00000008 /* Redirection Hint */
+#define MSI_X86_ADDR_LOG 0x00000004 /* Destination Mode */
+
+int
+lapic_set_intr(struct vm *vm, int cpu, int vector, bool level)
+{
+ struct vlapic *vlapic;
+
+ if (cpu < 0 || cpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (vector < 32 || vector > 255)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ if (vlapic_set_intr_ready(vlapic, vector, level))
+ vcpu_notify_event(vm, cpu, true);
+ return (0);
+}
+
+int
+lapic_set_local_intr(struct vm *vm, int cpu, int vector)
+{
+ struct vlapic *vlapic;
+ cpuset_t dmask;
+ int error;
+
+ if (cpu < -1 || cpu >= VM_MAXCPU)
+ return (EINVAL);
+
+ if (cpu == -1)
+ dmask = vm_active_cpus(vm);
+ else
+ CPU_SETOF(cpu, &dmask);
+ error = 0;
+ while ((cpu = CPU_FFS(&dmask)) != 0) {
+ cpu--;
+ CPU_CLR(cpu, &dmask);
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_trigger_lvt(vlapic, vector);
+ if (error)
+ break;
+ }
+
+ return (error);
+}
+
+int
+lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg)
+{
+ int delmode, vec;
+ uint32_t dest;
+ bool phys;
+
+ VM_CTR2(vm, "lapic MSI addr: %#lx msg: %#lx", addr, msg);
+
+ if ((addr & MSI_X86_ADDR_MASK) != MSI_X86_ADDR_BASE) {
+ VM_CTR1(vm, "lapic MSI invalid addr %#lx", addr);
+ return (-1);
+ }
+
+ /*
+ * Extract the x86-specific fields from the MSI addr/msg
+ * params according to the Intel Arch spec, Vol3 Ch 10.
+ *
+ * The PCI specification does not support level triggered
+ * MSI/MSI-X so ignore trigger level in 'msg'.
+ *
+ * The 'dest' is interpreted as a logical APIC ID if both
+ * the Redirection Hint and Destination Mode are '1' and
+ * physical otherwise.
+ */
+ dest = (addr >> 12) & 0xff;
+ phys = ((addr & (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG)) !=
+ (MSI_X86_ADDR_RH | MSI_X86_ADDR_LOG));
+ delmode = msg & APIC_DELMODE_MASK;
+ vec = msg & 0xff;
+
+ VM_CTR3(vm, "lapic MSI %s dest %#x, vec %d",
+ phys ? "physical" : "logical", dest, vec);
+
+ vlapic_deliver_intr(vm, LAPIC_TRIG_EDGE, dest, phys, delmode, vec);
+ return (0);
+}
+
+static boolean_t
+x2apic_msr(u_int msr)
+{
+ if (msr >= 0x800 && msr <= 0xBFF)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+static u_int
+x2apic_msr_to_regoff(u_int msr)
+{
+
+ return ((msr - 0x800) << 4);
+}
+
+boolean_t
+lapic_msr(u_int msr)
+{
+
+ if (x2apic_msr(msr) || (msr == MSR_APICBASE))
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+int
+lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval, bool *retu)
+{
+ int error;
+ u_int offset;
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (msr == MSR_APICBASE) {
+ *rval = vlapic_get_apicbase(vlapic);
+ error = 0;
+ } else {
+ offset = x2apic_msr_to_regoff(msr);
+ error = vlapic_read(vlapic, 0, offset, rval, retu);
+ }
+
+ return (error);
+}
+
+int
+lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t val, bool *retu)
+{
+ int error;
+ u_int offset;
+ struct vlapic *vlapic;
+
+ vlapic = vm_lapic(vm, cpu);
+
+ if (msr == MSR_APICBASE) {
+ error = vlapic_set_apicbase(vlapic, val);
+ } else {
+ offset = x2apic_msr_to_regoff(msr);
+ error = vlapic_write(vlapic, 0, offset, val, retu);
+ }
+
+ return (error);
+}
+
+int
+lapic_mmio_write(void *vm, int cpu, uint64_t gpa, uint64_t wval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
+
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses must be 4 bytes wide and
+ * aligned on a 16-byte boundary.
+ */
+ if (size != 4 || off & 0xf)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_write(vlapic, 1, off, wval, arg);
+ return (error);
+}
+
+int
+lapic_mmio_read(void *vm, int cpu, uint64_t gpa, uint64_t *rval, int size,
+ void *arg)
+{
+ int error;
+ uint64_t off;
+ struct vlapic *vlapic;
+
+ off = gpa - DEFAULT_APIC_BASE;
+
+ /*
+ * Memory mapped local apic accesses should be aligned on a
+ * 16-byte boundary. They are also suggested to be 4 bytes
+ * wide, alas not all OSes follow suggestions.
+ */
+ off &= ~3;
+ if (off & 0xf)
+ return (EINVAL);
+
+ vlapic = vm_lapic(vm, cpu);
+ error = vlapic_read(vlapic, 1, off, rval, arg);
+ return (error);
+}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_lapic.h b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h
new file mode 100644
index 0000000000..ee47ee7783
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_lapic.h
@@ -0,0 +1,87 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_lapic.h 259863 2013-12-25 06:46:31Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _VMM_LAPIC_H_
+#define _VMM_LAPIC_H_
+
+struct vm;
+
+boolean_t lapic_msr(u_int num);
+int lapic_rdmsr(struct vm *vm, int cpu, u_int msr, uint64_t *rval,
+ bool *retu);
+int lapic_wrmsr(struct vm *vm, int cpu, u_int msr, uint64_t wval,
+ bool *retu);
+
+int lapic_mmio_read(void *vm, int cpu, uint64_t gpa,
+ uint64_t *rval, int size, void *arg);
+int lapic_mmio_write(void *vm, int cpu, uint64_t gpa,
+ uint64_t wval, int size, void *arg);
+
+/*
+ * Signals to the LAPIC that an interrupt at 'vector' needs to be generated
+ * to the 'cpu', the state is recorded in IRR.
+ */
+int lapic_set_intr(struct vm *vm, int cpu, int vector, bool trig);
+
+#define LAPIC_TRIG_LEVEL true
+#define LAPIC_TRIG_EDGE false
+static __inline int
+lapic_intr_level(struct vm *vm, int cpu, int vector)
+{
+
+ return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_LEVEL));
+}
+
+static __inline int
+lapic_intr_edge(struct vm *vm, int cpu, int vector)
+{
+
+ return (lapic_set_intr(vm, cpu, vector, LAPIC_TRIG_EDGE));
+}
+
+/*
+ * Triggers the LAPIC local interrupt (LVT) 'vector' on 'cpu'. 'cpu' can
+ * be set to -1 to trigger the interrupt on all CPUs.
+ */
+int lapic_set_local_intr(struct vm *vm, int cpu, int vector);
+
+int lapic_intr_msi(struct vm *vm, uint64_t addr, uint64_t msg);
+
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_mem.h b/usr/src/uts/i86pc/io/vmm/vmm_mem.h
new file mode 100644
index 0000000000..05dc37fb9a
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_mem.h
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_mem.h 245678 2013-01-20 03:42:49Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _VMM_MEM_H_
+#define _VMM_MEM_H_
+
+int vmm_mem_init(void);
+vm_paddr_t vmm_mem_alloc(size_t size);
+void vmm_mem_free(vm_paddr_t start, size_t size);
+vm_paddr_t vmm_mem_maxaddr(void);
+
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
new file mode 100644
index 0000000000..79e1cb1a44
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_dev.c
@@ -0,0 +1,1040 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/conf.h>
+#include <sys/cpuvar.h>
+#include <sys/ioccom.h>
+#include <sys/stat.h>
+#include <sys/vmsystm.h>
+#include <sys/ddi.h>
+/*
+ * struct modctl in <sys/modctl.h> contains "void *__unused".
+ * Do this ugly workaround to avoid it.
+ */
+#undef __unused
+#include <sys/sunddi.h>
+#include <sys/fs/dv_node.h>
+
+#include <sys/vmm.h>
+#include <sys/vmm_instruction_emul.h>
+#include <sys/vmm_dev.h>
+#include <sys/vmm_impl.h>
+
+#include <vm/vm.h>
+#include <vm/seg_dev.h>
+
+#include "io/vatpic.h"
+#include "io/vioapic.h"
+#include "vmm_lapic.h"
+
+static dev_info_t *vmm_dip;
+static void *vmm_statep;
+
+static SLIST_HEAD(, vmm_softc) head;
+
+static kmutex_t vmmdev_mtx;
+
+/*
+ * vmm trace ring
+ */
+int vmm_dmsg_ring_size = VMM_DMSG_RING_SIZE;
+static vmm_trace_rbuf_t *vmm_debug_rbuf;
+static vmm_trace_dmsg_t *vmm_trace_dmsg_alloc(void);
+static void vmm_trace_dmsg_free(void);
+static void vmm_trace_rbuf_alloc(void);
+static void vmm_trace_rbuf_free(void);
+
+/*
+ * This routine is used to manage debug messages
+ * on ring buffer.
+ */
+static vmm_trace_dmsg_t *
+vmm_trace_dmsg_alloc(void)
+{
+ vmm_trace_dmsg_t *dmsg_alloc, *dmsg = vmm_debug_rbuf->dmsgp;
+
+ if (vmm_debug_rbuf->looped == TRUE) {
+ vmm_debug_rbuf->dmsgp = dmsg->next;
+ return (vmm_debug_rbuf->dmsgp);
+ }
+
+ /*
+ * If we're looping for the first time,
+ * connect the ring.
+ */
+ if (((vmm_debug_rbuf->size + (sizeof (vmm_trace_dmsg_t))) >
+ vmm_debug_rbuf->maxsize) && (vmm_debug_rbuf->dmsgh != NULL)) {
+ dmsg->next = vmm_debug_rbuf->dmsgh;
+ vmm_debug_rbuf->dmsgp = vmm_debug_rbuf->dmsgh;
+ vmm_debug_rbuf->looped = TRUE;
+ return (vmm_debug_rbuf->dmsgp);
+ }
+
+ /* If we've gotten this far then memory allocation is needed */
+ dmsg_alloc = kmem_zalloc(sizeof (vmm_trace_dmsg_t), KM_NOSLEEP);
+ if (dmsg_alloc == NULL) {
+ vmm_debug_rbuf->allocfailed++;
+ return (dmsg_alloc);
+ } else {
+ vmm_debug_rbuf->size += sizeof (vmm_trace_dmsg_t);
+ }
+
+ if (vmm_debug_rbuf->dmsgp != NULL) {
+ dmsg->next = dmsg_alloc;
+ vmm_debug_rbuf->dmsgp = dmsg->next;
+ return (vmm_debug_rbuf->dmsgp);
+ } else {
+ /*
+ * We should only be here if we're initializing
+ * the ring buffer.
+ */
+ if (vmm_debug_rbuf->dmsgh == NULL) {
+ vmm_debug_rbuf->dmsgh = dmsg_alloc;
+ } else {
+ /* Something is wrong */
+ kmem_free(dmsg_alloc, sizeof (vmm_trace_dmsg_t));
+ return (NULL);
+ }
+
+ vmm_debug_rbuf->dmsgp = dmsg_alloc;
+ return (vmm_debug_rbuf->dmsgp);
+ }
+}
+
+/*
+ * Free all messages on debug ring buffer.
+ */
+static void
+vmm_trace_dmsg_free(void)
+{
+ vmm_trace_dmsg_t *dmsg_next, *dmsg = vmm_debug_rbuf->dmsgh;
+
+ while (dmsg != NULL) {
+ dmsg_next = dmsg->next;
+ kmem_free(dmsg, sizeof (vmm_trace_dmsg_t));
+
+ /*
+ * If we've looped around the ring than we're done.
+ */
+ if (dmsg_next == vmm_debug_rbuf->dmsgh) {
+ break;
+ } else {
+ dmsg = dmsg_next;
+ }
+ }
+}
+
+static void
+vmm_trace_rbuf_alloc(void)
+{
+ vmm_debug_rbuf = kmem_zalloc(sizeof (vmm_trace_rbuf_t), KM_SLEEP);
+
+ mutex_init(&vmm_debug_rbuf->lock, NULL, MUTEX_DRIVER, NULL);
+
+ if (vmm_dmsg_ring_size > 0) {
+ vmm_debug_rbuf->maxsize = vmm_dmsg_ring_size;
+ }
+}
+
+
+static void
+vmm_trace_rbuf_free(void)
+{
+ vmm_trace_dmsg_free();
+ mutex_destroy(&vmm_debug_rbuf->lock);
+ kmem_free(vmm_debug_rbuf, sizeof (vmm_trace_rbuf_t));
+}
+
+static void
+vmm_vtrace_log(const char *fmt, va_list ap)
+{
+ vmm_trace_dmsg_t *dmsg;
+
+ if (vmm_debug_rbuf == NULL) {
+ return;
+ }
+
+ /*
+ * If max size of ring buffer is smaller than size
+ * required for one debug message then just return
+ * since we have no room for the debug message.
+ */
+ if (vmm_debug_rbuf->maxsize < (sizeof (vmm_trace_dmsg_t))) {
+ return;
+ }
+
+ mutex_enter(&vmm_debug_rbuf->lock);
+
+ /* alloc or reuse on ring buffer */
+ dmsg = vmm_trace_dmsg_alloc();
+
+ if (dmsg == NULL) {
+ /* resource allocation failed */
+ mutex_exit(&vmm_debug_rbuf->lock);
+ return;
+ }
+
+ gethrestime(&dmsg->timestamp);
+
+ (void) vsnprintf(dmsg->buf, sizeof (dmsg->buf), fmt, ap);
+
+ mutex_exit(&vmm_debug_rbuf->lock);
+}
+
+void
+vmm_trace_log(const char *fmt, ...)
+{
+ va_list ap;
+
+ va_start(ap, fmt);
+ vmm_vtrace_log(fmt, ap);
+ va_end(ap);
+}
+
+void
+vmmdev_init(void)
+{
+ vmm_trace_rbuf_alloc();
+}
+
+int
+vmmdev_cleanup(void)
+{
+ int error;
+
+ if (SLIST_EMPTY(&head))
+ error = 0;
+ else
+ error = EBUSY;
+
+ if (error == 0)
+ vmm_trace_dmsg_free();
+
+ return (error);
+}
+
+int
+vmmdev_do_ioctl(struct vmm_softc *sc, int cmd, intptr_t arg, int mode,
+ cred_t *credp, int *rvalp)
+{
+ int error, vcpu, state_changed;
+ struct vm_memory_segment seg;
+ struct vm_register vmreg;
+ struct vm_seg_desc vmsegdesc;
+ struct vm_run vmrun;
+ struct vm_exception vmexc;
+ struct vm_lapic_irq vmirq;
+ struct vm_lapic_msi vmmsi;
+ struct vm_ioapic_irq ioapic_irq;
+ struct vm_isa_irq isa_irq;
+ struct vm_capability vmcap;
+ struct vm_nmi vmnmi;
+ struct vm_x2apic x2apic;
+ struct vm_gla2gpa gg;
+ struct vm_activate_cpu vac;
+ int pincount;
+ int i;
+
+ vcpu = -1;
+ state_changed = 0;
+
+ /*
+ * Some VMM ioctls can operate only on vcpus that are not running.
+ */
+ switch (cmd) {
+ case VM_RUN:
+ case VM_GET_REGISTER:
+ case VM_SET_REGISTER:
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ case VM_INJECT_EXCEPTION:
+ case VM_GET_CAPABILITY:
+ case VM_SET_CAPABILITY:
+ case VM_PPTDEV_MSI:
+ case VM_PPTDEV_MSIX:
+ case VM_SET_X2APIC_STATE:
+ case VM_GLA2GPA:
+ case VM_ACTIVATE_CPU:
+ case VM_RESTART_INSTRUCTION:
+ /*
+ * XXX fragile, handle with care
+ * Assumes that the first field of the ioctl data is the vcpu.
+ */
+ if (ddi_copyin((void *)arg, &vcpu, sizeof (vcpu), mode)) {
+ return (EFAULT);
+ }
+ if (vcpu < 0 || vcpu >= VM_MAXCPU) {
+ error = EINVAL;
+ goto done;
+ }
+
+ error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
+ if (error)
+ goto done;
+
+ state_changed = 1;
+ break;
+ case VM_MAP_MEMORY:
+ /*
+ * ioctls that operate on the entire virtual machine must
+ * prevent all vcpus from running.
+ */
+ error = 0;
+ for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++) {
+ error = vcpu_set_state(sc->vm, vcpu, VCPU_FROZEN, true);
+ if (error)
+ break;
+ }
+
+ if (error) {
+ while (--vcpu >= 0)
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
+ goto done;
+ }
+
+ state_changed = 2;
+ break;
+
+ default:
+ break;
+ }
+
+ switch(cmd) {
+ case VM_RUN:
+ if (ddi_copyin((void *)arg, &vmrun,
+ sizeof (struct vm_run), mode)) {
+ return (EFAULT);
+ }
+ error = vm_run(sc->vm, &vmrun);
+ if (ddi_copyout(&vmrun, (void *)arg,
+ sizeof (struct vm_run), mode)) {
+ return (EFAULT);
+ }
+ break;
+ case VM_LAPIC_IRQ:
+ if (ddi_copyin((void *)arg, &vmirq,
+ sizeof (struct vm_lapic_irq), mode)) {
+ return (EFAULT);
+ }
+ error = lapic_intr_edge(sc->vm, vmirq.cpuid, vmirq.vector);
+ if (ddi_copyout(&vmirq, (void *)arg,
+ sizeof (struct vm_lapic_irq), mode)) {
+ return (EFAULT);
+ }
+ break;
+ case VM_LAPIC_LOCAL_IRQ:
+ if (ddi_copyin((void *)arg, &vmirq,
+ sizeof (struct vm_lapic_irq), mode)) {
+ return (EFAULT);
+ }
+ error = lapic_set_local_intr(sc->vm, vmirq.cpuid,
+ vmirq.vector);
+ if (ddi_copyout(&vmirq, (void *)arg,
+ sizeof (struct vm_lapic_irq), mode)) {
+ return (EFAULT);
+ }
+ break;
+ case VM_LAPIC_MSI:
+ if (ddi_copyin((void *)arg, &vmmsi,
+ sizeof (struct vm_lapic_msi), mode)) {
+ return (EFAULT);
+ }
+ error = lapic_intr_msi(sc->vm, vmmsi.addr, vmmsi.msg);
+ if (ddi_copyout(&vmmsi, (void *)arg,
+ sizeof (struct vm_lapic_msi), mode)) {
+ return (EFAULT);
+ }
+ case VM_IOAPIC_ASSERT_IRQ:
+ if (ddi_copyin((void *)arg, &ioapic_irq,
+ sizeof (struct vm_ioapic_irq), mode)) {
+ return (EFAULT);
+ }
+ error = vioapic_assert_irq(sc->vm, ioapic_irq.irq);;
+ if (ddi_copyout(&ioapic_irq, (void *)arg,
+ sizeof (struct vm_ioapic_irq), mode)) {
+ return (EFAULT);
+ }
+ break;
+ case VM_IOAPIC_DEASSERT_IRQ:
+ if (ddi_copyin((void *)arg, &ioapic_irq,
+ sizeof (struct vm_ioapic_irq), mode)) {
+ return (EFAULT);
+ }
+ error = vioapic_deassert_irq(sc->vm, ioapic_irq.irq);
+ if (ddi_copyout(&ioapic_irq, (void *)arg,
+ sizeof (struct vm_ioapic_irq), mode)) {
+ return (EFAULT);
+ }
+ break;
+ case VM_IOAPIC_PULSE_IRQ:
+ if (ddi_copyin((void *)arg, &ioapic_irq,
+ sizeof (struct vm_ioapic_irq), mode)) {
+ return (EFAULT);
+ }
+ error = vioapic_pulse_irq(sc->vm, ioapic_irq.irq);
+ if (ddi_copyout(&ioapic_irq, (void *)arg,
+ sizeof (struct vm_ioapic_irq), mode)) {
+ return (EFAULT);
+ }
+ break;
+ case VM_IOAPIC_PINCOUNT:
+ error = 0;
+ pincount = vioapic_pincount(sc->vm);
+ if (ddi_copyout(&pincount, (void *)arg, sizeof (int), mode)) {
+ return (EFAULT);
+ }
+ break;
+ case VM_ISA_ASSERT_IRQ:
+ if (ddi_copyin((void *)arg, &isa_irq,
+ sizeof (struct vm_isa_irq), mode)) {
+ return (EFAULT);
+ }
+ error = vatpic_assert_irq(sc->vm, isa_irq.atpic_irq);
+ if (error == 0 && isa_irq.ioapic_irq != -1)
+ error = vioapic_assert_irq(sc->vm,
+ isa_irq.ioapic_irq);
+ if (ddi_copyout(&isa_irq, (void *)arg,
+ sizeof (struct vm_isa_irq), mode)) {
+ return (EFAULT);
+
+ }
+ break;
+ case VM_ISA_DEASSERT_IRQ:
+ if (ddi_copyin((void *)arg, &isa_irq,
+ sizeof (struct vm_isa_irq), mode)) {
+ return (EFAULT);
+ }
+ error = vatpic_deassert_irq(sc->vm, isa_irq.atpic_irq);
+ if (error == 0 && isa_irq.ioapic_irq != -1)
+ error = vioapic_deassert_irq(sc->vm,
+ isa_irq.ioapic_irq);
+ if (ddi_copyout(&isa_irq, (void *)arg,
+ sizeof (struct vm_isa_irq), mode)) {
+ return (EFAULT);
+
+ }
+ break;
+ case VM_ISA_PULSE_IRQ:
+ if (ddi_copyin((void *)arg, &isa_irq,
+ sizeof (struct vm_isa_irq), mode)) {
+ return (EFAULT);
+ }
+ error = vatpic_pulse_irq(sc->vm, isa_irq.atpic_irq);
+ if (error == 0 && isa_irq.ioapic_irq != -1)
+ error = vioapic_pulse_irq(sc->vm, isa_irq.ioapic_irq);
+ if (ddi_copyout(&isa_irq, (void *)arg,
+ sizeof (struct vm_isa_irq), mode)) {
+ return (EFAULT);
+
+ }
+ break;
+ case VM_MAP_MEMORY:
+ if (ddi_copyin((void *)arg, &seg,
+ sizeof (struct vm_memory_segment), mode)) {
+ return (EFAULT);
+ }
+ error = vm_malloc(sc->vm, seg.gpa, seg.len);
+ break;
+ case VM_GET_MEMORY_SEG:
+ if (ddi_copyin((void *)arg, &seg,
+ sizeof (struct vm_memory_segment), mode)) {
+ return (EFAULT);
+ }
+ seg.len = 0;
+ (void)vm_gpabase2memseg(sc->vm, seg.gpa, &seg);
+ if (ddi_copyout(&seg, (void *)arg,
+ sizeof (struct vm_memory_segment), mode)) {
+ return (EFAULT);
+ }
+ error = 0;
+ break;
+ case VM_GET_REGISTER:
+ if (ddi_copyin((void *)arg, &vmreg,
+ sizeof (struct vm_register), mode)) {
+ return (EFAULT);
+ }
+ error = vm_get_register(sc->vm, vmreg.cpuid, vmreg.regnum,
+ &vmreg.regval);
+ if (!error) {
+ if (ddi_copyout(&vmreg, (void *)arg,
+ sizeof (struct vm_register), mode)) {
+ return (EFAULT);
+ }
+ }
+ break;
+ case VM_SET_REGISTER:
+ if (ddi_copyin((void *)arg, &vmreg,
+ sizeof (struct vm_register), mode)) {
+ return (EFAULT);
+ }
+ error = vm_set_register(sc->vm, vmreg.cpuid, vmreg.regnum,
+ vmreg.regval);
+ break;
+ case VM_SET_SEGMENT_DESCRIPTOR:
+ if (ddi_copyin((void *)arg, &vmsegdesc,
+ sizeof (struct vm_seg_desc), mode)) {
+ return (EFAULT);
+ }
+ error = vm_set_seg_desc(sc->vm, vmsegdesc.cpuid,
+ vmsegdesc.regnum,
+ &vmsegdesc.desc);
+ break;
+ case VM_GET_SEGMENT_DESCRIPTOR:
+ if (ddi_copyin((void *)arg, &vmsegdesc,
+ sizeof (struct vm_seg_desc), mode)) {
+ return (EFAULT);
+ }
+ error = vm_get_seg_desc(sc->vm, vmsegdesc.cpuid,
+ vmsegdesc.regnum,
+ &vmsegdesc.desc);
+ if (!error) {
+ if (ddi_copyout(&vmsegdesc, (void *)arg,
+ sizeof (struct vm_seg_desc), mode)) {
+ return (EFAULT);
+ }
+ }
+ break;
+ case VM_GET_CAPABILITY:
+ if (ddi_copyin((void *)arg, &vmcap,
+ sizeof (struct vm_capability), mode)) {
+ return (EFAULT);
+ }
+ error = vm_get_capability(sc->vm, vmcap.cpuid,
+ vmcap.captype,
+ &vmcap.capval);
+ if (!error) {
+ if (ddi_copyout(&vmcap, (void *)arg,
+ sizeof (struct vm_capability), mode)) {
+ return (EFAULT);
+ }
+ }
+ break;
+ case VM_SET_CAPABILITY:
+ if (ddi_copyin((void *)arg, &vmcap,
+ sizeof (struct vm_capability), mode)) {
+ return (EFAULT);
+ }
+ error = vm_set_capability(sc->vm, vmcap.cpuid,
+ vmcap.captype,
+ vmcap.capval);
+ break;
+ case VM_SET_X2APIC_STATE:
+ if (ddi_copyin((void *)arg, &x2apic,
+ sizeof (struct vm_x2apic), mode)) {
+ return (EFAULT);
+ }
+ error = vm_set_x2apic_state(sc->vm,
+ x2apic.cpuid, x2apic.state);
+ break;
+ case VM_GET_X2APIC_STATE:
+ if (ddi_copyin((void *)arg, &x2apic,
+ sizeof (struct vm_x2apic), mode)) {
+ return (EFAULT);
+ }
+ error = vm_get_x2apic_state(sc->vm,
+ x2apic.cpuid, &x2apic.state);
+ if (!error) {
+ if (ddi_copyout(&x2apic, (void *)arg,
+ sizeof (struct vm_x2apic), mode)) {
+ return (EFAULT);
+ }
+ }
+ break;
+ case VM_GLA2GPA: {
+ CTASSERT(PROT_READ == VM_PROT_READ);
+ CTASSERT(PROT_WRITE == VM_PROT_WRITE);
+ CTASSERT(PROT_EXEC == VM_PROT_EXECUTE);
+ if (ddi_copyin((void *)arg, &gg,
+ sizeof (struct vm_gla2gpa), mode)) {
+ return (EFAULT);
+ }
+ error = vm_gla2gpa(sc->vm, gg.vcpuid, &gg.paging, gg.gla,
+ gg.prot, &gg.gpa);
+ KASSERT(error == 0 || error == 1 || error == -1,
+ ("%s: vm_gla2gpa unknown error %d", __func__, error));
+ if (error >= 0) {
+ /*
+ * error = 0: the translation was successful
+ * error = 1: a fault was injected into the guest
+ */
+ gg.fault = error;
+ error = 0;
+ if (ddi_copyout(&gg, (void *)arg,
+ sizeof (struct vm_gla2gpa), mode)) {
+ return (EFAULT);
+ }
+ } else {
+ error = EFAULT;
+ }
+ break;
+ }
+ case VM_ACTIVATE_CPU:
+ if (ddi_copyin((void *)arg, &vac,
+ sizeof (struct vm_activate_cpu), mode)) {
+ return (EFAULT);
+ }
+ error = vm_activate_cpu(sc->vm, vac.vcpuid);
+ break;
+ case VM_RESTART_INSTRUCTION:
+ error = vm_restart_instruction(sc->vm, vcpu);
+ break;
+ default:
+ error = ENOTTY;
+ break;
+ }
+
+ if (state_changed == 1) {
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
+ } else if (state_changed == 2) {
+ for (vcpu = 0; vcpu < VM_MAXCPU; vcpu++)
+ vcpu_set_state(sc->vm, vcpu, VCPU_IDLE, false);
+ }
+
+done:
+ /* Make sure that no handler returns a bogus value like ERESTART */
+ KASSERT(error >= 0, ("vmmdev_ioctl: invalid error return %d", error));
+ return (error);
+}
+
+static
+minor_t vmm_find_free_minor(void)
+{
+ minor_t minor;
+
+ for (minor = 1; ; minor++) {
+ if (ddi_get_soft_state(vmm_statep, minor) == NULL)
+ break;
+ }
+
+ return (minor);
+}
+
+int
+vmmdev_do_vm_create(dev_info_t *dip, char *name)
+{
+ struct vmm_softc *sc;
+ minor_t minor;
+ int error;
+
+ mutex_enter(&vmmdev_mtx);
+
+ if (strlen(name) >= VM_MAX_NAMELEN) {
+ mutex_exit(&vmmdev_mtx);
+ return (EINVAL);
+ }
+
+ minor = vmm_find_free_minor();
+ if (ddi_soft_state_zalloc(vmm_statep, minor) == DDI_FAILURE) {
+ mutex_exit(&vmmdev_mtx);
+ return (DDI_FAILURE);
+ }
+
+ if ((sc = ddi_get_soft_state(vmm_statep, minor)) == NULL) {
+ ddi_soft_state_free(vmm_statep, minor);
+ mutex_exit(&vmmdev_mtx);
+ return (DDI_FAILURE);
+ }
+ strcpy(sc->name, name);
+ sc->minor = minor;
+
+ if (ddi_create_minor_node(dip, name, S_IFCHR, minor,
+ DDI_PSEUDO, 0) == DDI_FAILURE) {
+ ddi_soft_state_free(vmm_statep, minor);
+ mutex_exit(&vmmdev_mtx);
+ return (DDI_FAILURE);
+ }
+
+ error = vm_create(name, &sc->vm);
+ if (error != 0) {
+ ddi_soft_state_free(vmm_statep, minor);
+ ddi_remove_minor_node(dip, name);
+ mutex_exit(&vmmdev_mtx);
+ return (error);
+ }
+ SLIST_INSERT_HEAD(&head, sc, link);
+
+ mutex_exit(&vmmdev_mtx);
+
+ return (0);
+}
+
+static struct vmm_softc *
+vmm_lookup(char *name)
+{
+ struct vmm_softc *sc;
+
+ SLIST_FOREACH(sc, &head, link) {
+ if (strcmp(sc->name, name) == 0) {
+ break;
+ }
+ }
+
+ return (sc);
+
+}
+
+struct vm *
+vm_lookup_by_name(char *name)
+{
+ struct vmm_softc *sc;
+
+ mutex_enter(&vmmdev_mtx);
+
+ if ((sc = vmm_lookup(name)) == NULL) {
+ mutex_exit(&vmmdev_mtx);
+ return (NULL);
+ }
+
+ mutex_exit(&vmmdev_mtx);
+
+ return (sc->vm);
+}
+
+int
+vmmdev_do_vm_destroy(dev_info_t *dip, char *name)
+{
+ struct vmm_softc *sc;
+ dev_info_t *pdip = ddi_get_parent(dip);
+
+ mutex_enter(&vmmdev_mtx);
+
+ if ((sc = vmm_lookup(name)) == NULL) {
+ mutex_exit(&vmmdev_mtx);
+ return (ENOENT);
+ }
+
+ if (sc->open) {
+ mutex_exit(&vmmdev_mtx);
+ return (EBUSY);
+ }
+
+ vm_destroy(sc->vm);
+ SLIST_REMOVE(&head, sc, vmm_softc, link);
+ ddi_remove_minor_node(dip, name);
+ ddi_soft_state_free(vmm_statep, sc->minor);
+ (void) devfs_clean(pdip, NULL, DV_CLEAN_FORCE);
+
+ mutex_exit(&vmmdev_mtx);
+
+ return (0);
+}
+
+int
+vmmdev_do_vm_mmap(struct vmm_softc *vmm_sc, off_t off, int nprot)
+{
+ vm_paddr_t paddr;
+
+ mutex_enter(&vmmdev_mtx);
+
+ paddr = vm_gpa2hpa(vmm_sc->vm, (vm_paddr_t)off, PAGE_SIZE);
+ if (paddr == -1) {
+ return (-1);
+ }
+
+ mutex_exit(&vmmdev_mtx);
+
+ return (btop(paddr));
+}
+
+
+static int
+vmm_open(dev_t *devp, int flag, int otyp, cred_t *credp)
+{
+ minor_t minor;
+ struct vmm_softc *sc;
+
+ minor = getminor(*devp);
+ if (minor == VMM_CTL_MINOR) {
+ /*
+ * Master control device must be opened exclusively.
+ */
+ if ((flag & FEXCL) != FEXCL || otyp != OTYP_CHR) {
+ return (EINVAL);
+ }
+
+ return (0);
+ }
+
+ mutex_enter(&vmmdev_mtx);
+ sc = ddi_get_soft_state(vmm_statep, minor);
+ if (sc == NULL) {
+ mutex_exit(&vmmdev_mtx);
+ return (ENXIO);
+ }
+
+ if (sc->open) {
+ mutex_exit(&vmmdev_mtx);
+ return (EBUSY);
+ }
+ sc->open = B_TRUE;
+ mutex_exit(&vmmdev_mtx);
+
+ return (0);
+}
+
+static int
+vmm_close(dev_t dev, int flag, int otyp, cred_t *credp)
+{
+ minor_t minor;
+ struct vmm_softc *sc;
+
+ minor = getminor(dev);
+ if (minor == VMM_CTL_MINOR)
+ return (0);
+
+ mutex_enter(&vmmdev_mtx);
+ sc = ddi_get_soft_state(vmm_statep, minor);
+ if (sc == NULL) {
+ mutex_exit(&vmmdev_mtx);
+ return (ENXIO);
+ }
+
+ sc->open = B_FALSE;
+ mutex_exit(&vmmdev_mtx);
+
+ return (0);
+}
+
+static int
+vmm_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
+ int *rvalp)
+{
+ struct vmm_softc *sc;
+ struct vmm_ioctl kvi;
+ minor_t minor;
+
+ minor = getminor(dev);
+
+ if (minor == VMM_CTL_MINOR) {
+ if (ddi_copyin((void *)arg, &kvi, sizeof (struct vmm_ioctl),
+ mode)) {
+ return (EFAULT);
+ }
+ switch (cmd) {
+ case VMM_CREATE_VM:
+ if ((mode & FWRITE) == 0)
+ return (EPERM);
+ return (vmmdev_do_vm_create(vmm_dip, kvi.vmm_name));
+ case VMM_DESTROY_VM:
+ if ((mode & FWRITE) == 0)
+ return (EPERM);
+ return (vmmdev_do_vm_destroy(vmm_dip, kvi.vmm_name));
+ default:
+ break;
+ }
+ }
+
+ sc = ddi_get_soft_state(vmm_statep, minor);
+ ASSERT(sc);
+
+ return (vmmdev_do_ioctl(sc, cmd, arg, mode, credp, rvalp));
+}
+
+static int
+vmm_mmap(dev_t dev, off_t off, int prot)
+{
+ struct vmm_softc *sc;
+
+ sc = ddi_get_soft_state(vmm_statep, getminor(dev));
+ ASSERT(sc);
+
+ return (vmmdev_do_vm_mmap(sc, off, prot));
+}
+
+static int
+vmm_segmap(dev_t dev, off_t off, struct as *as,
+ caddr_t *addrp, off_t len, unsigned int prot,
+ unsigned int maxprot, unsigned int flags, cred_t *credp)
+{
+ struct segdev_crargs dev_a;
+ int error;
+
+ as_rangelock(as);
+
+ error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
+ if (error != 0) {
+ as_rangeunlock(as);
+ return (error);
+ }
+
+ dev_a.mapfunc = vmm_mmap;
+ dev_a.dev = dev;
+ dev_a.offset = off;
+ dev_a.type = (flags & MAP_TYPE);
+ dev_a.prot = (uchar_t)prot;
+ dev_a.maxprot = (uchar_t)maxprot;
+ dev_a.hat_attr = 0;
+ dev_a.hat_flags = HAT_LOAD_NOCONSIST;
+ dev_a.devmap_data = NULL;
+
+ error = as_map(as, *addrp, len, segdev_create, &dev_a);
+
+ as_rangeunlock(as);
+
+ return (error);
+}
+
+static int
+vmm_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
+{
+ return (0);
+}
+
+static int
+vmm_probe(dev_info_t *dip)
+{
+ if (driver_installed(ddi_name_to_major("kvm"))) {
+ cmn_err(CE_WARN, "kvm is installed\n");
+ return (DDI_PROBE_FAILURE);
+ }
+
+ return (DDI_PROBE_SUCCESS);
+}
+
+static int
+vmm_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
+{
+ switch (cmd) {
+ case DDI_ATTACH:
+ break;
+ default:
+ return (DDI_FAILURE);
+ }
+
+ if (vmm_mod_load()) {
+ return (DDI_FAILURE);
+ }
+
+ vmm_dip = dip;
+
+ /*
+ * Create control node. Other nodes will be created on demand.
+ */
+ if (ddi_create_minor_node(dip, VMM_CTL_MINOR_NODE, S_IFCHR,
+ VMM_CTL_MINOR, DDI_PSEUDO, 0) != 0) {
+ return (DDI_FAILURE);
+ }
+
+ ddi_report_dev(dip);
+
+ return (DDI_SUCCESS);
+}
+
+static int
+vmm_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
+{
+ switch (cmd) {
+ case DDI_DETACH:
+ break;
+ default:
+ return (DDI_FAILURE);
+ }
+
+ if (vmm_mod_unload()) {;
+ return (DDI_FAILURE);
+ }
+
+ /*
+ * Remove the control node.
+ */
+ ddi_remove_minor_node(dip, VMM_CTL_MINOR_NODE);
+ vmm_dip = NULL;
+
+ return (DDI_SUCCESS);
+}
+
+static struct cb_ops vmm_cb_ops = {
+ vmm_open,
+ vmm_close,
+ nodev, /* strategy */
+ nodev, /* print */
+ nodev, /* dump */
+ nodev, /* read */
+ nodev, /* write */
+ vmm_ioctl,
+ nodev, /* devmap */
+ vmm_mmap,
+ vmm_segmap,
+ nochpoll, /* poll */
+ ddi_prop_op,
+ NULL,
+ D_NEW | D_MP | D_DEVMAP
+};
+
+static struct dev_ops vmm_ops = {
+ DEVO_REV,
+ 0,
+ ddi_no_info,
+ nulldev, /* identify */
+ vmm_probe,
+ vmm_attach,
+ vmm_detach,
+ nodev, /* reset */
+ &vmm_cb_ops,
+ (struct bus_ops *)NULL
+};
+
+static struct modldrv modldrv = {
+ &mod_driverops,
+ "vmm",
+ &vmm_ops
+};
+
+static struct modlinkage modlinkage = {
+ MODREV_1,
+ &modldrv,
+ NULL
+};
+
+int
+_init(void)
+{
+ int error;
+
+ mutex_init(&vmmdev_mtx, NULL, MUTEX_DRIVER, NULL);
+
+ error = ddi_soft_state_init(&vmm_statep, sizeof (struct vmm_softc), 0);
+ if (error) {
+ return (error);
+ }
+
+ error = mod_install(&modlinkage);
+ if (error) {
+ ddi_soft_state_fini(&vmm_statep);
+ }
+
+ return (error);
+}
+
+int
+_fini(void)
+{
+ int error;
+
+ error = mod_remove(&modlinkage);
+ if (error) {
+ return (error);
+ }
+ ddi_soft_state_fini(&vmm_statep);
+
+ return (0);
+}
+
+int
+_info(struct modinfo *modinfop)
+{
+ return (mod_info(&modlinkage, modinfop));
+}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c
new file mode 100644
index 0000000000..6588f5a46d
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_glue.c
@@ -0,0 +1,779 @@
+/*
+ * Copyright (c) 2004 John Baldwin <jhb@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/kern/subr_sleepqueue.c 261520 2014-02-05 18:13:27Z jhb $
+ */
+/*-
+ * Copyright (c) 2004 Poul-Henning Kamp
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/kern/subr_unit.c 255057 2013-08-30 07:37:45Z kib $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#include <sys/types.h>
+#include <sys/archsystm.h>
+#include <sys/cpuset.h>
+#include <sys/fp.h>
+#include <sys/malloc.h>
+#include <sys/queue.h>
+#include <sys/spl.h>
+#include <sys/systm.h>
+
+#include <machine/cpufunc.h>
+#include <machine/fpu.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+#include <machine/vmm.h>
+#include <sys/vmm_impl.h>
+
+#include <vm/as.h>
+#include <vm/seg_kmem.h>
+
+vm_paddr_t
+pmap_kextract(vm_offset_t va)
+{
+ pfn_t pfn;
+
+ pfn = hat_getpfnum(kas.a_hat, (caddr_t)va);
+ ASSERT(pfn != PFN_INVALID);
+ return (pfn << PAGE_SHIFT) | ((uintptr_t)va & PAGE_MASK);
+}
+
+int
+cpusetobj_ffs(const cpuset_t *set)
+{
+#if CPUSET_WORDS > 1
+ int i, cbit;
+
+ cbit = 0;
+ for (i = 0; i < CPUSET_WORDS; i++) {
+ if (set->cpub[i] != 0) {
+ cbit = ffsl(set->cpub[i]);
+ cbit += i * sizeof (set->cpub[0]);
+ break;
+ }
+ }
+ return (cbit);
+#else
+ return(ffsl(*set));
+#endif
+}
+
+void
+smp_rendezvous(void (* setup_func)(void *),
+ void (* action_func)(void *),
+ void (* teardown_func)(void *),
+ void *arg)
+{
+ cpuset_t cpuset;
+
+ ASSERT(setup_func == NULL);
+ ASSERT(teardown_func == NULL);
+
+ CPUSET_ALL(cpuset);
+ xc_sync((xc_arg_t)arg, 0, 0, CPUSET2BV(cpuset), (xc_func_t)action_func);
+}
+
+struct kmem_item {
+ void *addr;
+ size_t size;
+ LIST_ENTRY(kmem_item) next;
+};
+static kmutex_t kmem_items_lock;
+static LIST_HEAD(, kmem_item) kmem_items;
+
+void *
+malloc(unsigned long size, struct malloc_type *mtp, int flags)
+{
+ void *p;
+ struct kmem_item *i;
+ int kmem_flag = KM_SLEEP;
+
+ if (flags & M_NOWAIT)
+ kmem_flag = KM_NOSLEEP;
+
+ if (flags & M_ZERO) {
+ p = kmem_zalloc(size + sizeof(struct kmem_item), kmem_flag);
+ } else {
+ p = kmem_alloc(size + sizeof(struct kmem_item), kmem_flag);
+ }
+
+ mutex_enter(&kmem_items_lock);
+ i = p + size;
+ i->addr = p;
+ i->size = size;
+
+ LIST_INSERT_HEAD(&kmem_items, i, next);
+ mutex_exit(&kmem_items_lock);
+
+ return (p);
+}
+
+void
+free(void *addr, struct malloc_type *mtp)
+{
+ struct kmem_item *i;
+
+ mutex_enter(&kmem_items_lock);
+ LIST_FOREACH(i, &kmem_items, next) {
+ if (i->addr == addr)
+ break;
+ }
+ ASSERT(i != NULL);
+ LIST_REMOVE(i, next);
+ mutex_exit(&kmem_items_lock);
+
+ kmem_free(addr, i->size + sizeof(struct kmem_item));
+}
+
+void
+mtx_init(struct mtx *mtx, char *name, const char *type_name, int opts)
+{
+ if (opts & MTX_SPIN) {
+ mutex_init(&mtx->m, name, MUTEX_SPIN,
+ (ddi_iblock_cookie_t)ipltospl(DISP_LEVEL));
+ } else {
+ mutex_init(&mtx->m, name, MUTEX_DRIVER, NULL);
+ }
+}
+
+void
+mtx_destroy(struct mtx *mtx)
+{
+ mutex_destroy(&mtx->m);
+}
+
+void
+critical_enter(void)
+{
+ kpreempt_disable();
+ thread_affinity_set(curthread, CPU_CURRENT);
+}
+
+void
+critical_exit(void)
+{
+ thread_affinity_clear(curthread);
+ kpreempt_enable();
+}
+
+struct unr {
+ u_int item;
+ struct unr *link;
+};
+
+#define UNR_HASHSIZE 8
+
+struct unrhdr {
+ struct mtx *mtx;
+ struct unr *hash[UNR_HASHSIZE];
+ u_int min;
+ u_int max;
+ u_int next;
+};
+
+#define HASH_UNR(uh, i) ((uh)->hash[(i) & ((UNR_HASHSIZE) - 1)])
+
+static struct mtx unr_mtx;
+
+/*
+ * Allocate a new unrheader set.
+ *
+ * Highest and lowest valid values given as parameters.
+ */
+struct unrhdr *
+new_unrhdr(int low, int high, struct mtx *mtx)
+{
+ struct unrhdr *uh;
+
+ uh = kmem_zalloc(sizeof (struct unrhdr), KM_SLEEP);
+ if (mtx) {
+ uh->mtx = mtx;
+ } else {
+ uh->mtx = &unr_mtx;
+ }
+ uh->min = low;
+ uh->max = high;
+ uh->next = uh->min;
+
+ return (uh);
+}
+
+void
+delete_unrhdr(struct unrhdr *uh)
+{
+ kmem_free(uh, sizeof (struct unrhdr));
+}
+
+static struct unr *
+unr_lookup(struct unrhdr *uh, int item)
+{
+ struct unr *unr;
+
+ ASSERT(MUTEX_HELD(&uh->mtx->m));
+
+ for (unr = HASH_UNR(uh, item); unr != NULL; unr = unr->link) {
+ if (unr->item == item)
+ break;
+ }
+
+ return (unr);
+}
+
+int
+alloc_unr(struct unrhdr *uh)
+{
+ struct unr *unr;
+ int item, start;
+
+ mutex_enter(&uh->mtx->m);
+ start = uh->next;
+ for (;;) {
+ item = uh->next;
+ if (++uh->next == uh->max) {
+ uh->next = uh->min;
+ }
+
+ if (unr_lookup(uh, item) == NULL) {
+ unr = kmem_zalloc(sizeof (struct unr), KM_SLEEP);
+ unr->item = item;
+ unr->link = HASH_UNR(uh, item);
+ HASH_UNR(uh, item) = unr;
+ break;
+ }
+
+ if (item == start) {
+ item = -1;
+ break;
+ }
+ }
+ mutex_exit(&uh->mtx->m);
+
+ return (item);
+}
+
+void
+free_unr(struct unrhdr *uh, u_int item)
+{
+ struct unr *unr, **unrp;
+
+ mutex_enter(&uh->mtx->m);
+ unrp = &HASH_UNR(uh, item);
+ for (;;) {
+ ASSERT(*unrp != NULL);
+ if ((*unrp)->item == item)
+ break;
+ unrp = &(*unrp)->link;
+ }
+ unr = *unrp;
+ *unrp = unr->link;
+ mutex_exit(&uh->mtx->m);
+ kmem_free(unr, sizeof(struct unr));
+}
+
+
+static void
+vmm_glue_callout_handler(void *arg)
+{
+ struct callout *c = arg;
+
+ c->c_flags &= ~CALLOUT_PENDING;
+ if (c->c_flags & CALLOUT_ACTIVE) {
+ (c->c_func)(c->c_arg);
+ }
+}
+
+void
+vmm_glue_callout_init(struct callout *c, int mpsafe)
+{
+ cyc_handler_t hdlr;
+ cyc_time_t when;
+
+ hdlr.cyh_level = CY_LOW_LEVEL;
+ hdlr.cyh_func = vmm_glue_callout_handler;
+ hdlr.cyh_arg = c;
+ when.cyt_when = CY_INFINITY;
+ when.cyt_interval = CY_INFINITY;
+
+ mutex_enter(&cpu_lock);
+ c->c_cyc_id = cyclic_add(&hdlr, &when);
+ c->c_flags |= CALLOUT_ACTIVE;
+ mutex_exit(&cpu_lock);
+}
+
+int
+vmm_glue_callout_reset_sbt(struct callout *c, sbintime_t sbt, sbintime_t pr,
+ void (*func)(void *), void *arg, int flags)
+{
+ ASSERT(c->c_cyc_id != CYCLIC_NONE);
+
+ c->c_func = func;
+ c->c_arg = arg;
+ c->c_flags |= (CALLOUT_ACTIVE | CALLOUT_PENDING);
+
+ if (flags & C_ABSOLUTE)
+ cyclic_reprogram(c->c_cyc_id, sbt);
+ else
+ cyclic_reprogram(c->c_cyc_id, sbt + gethrtime());
+
+ return (0);
+}
+
+int
+vmm_glue_callout_stop(struct callout *c)
+{
+ ASSERT(c->c_cyc_id != CYCLIC_NONE);
+ cyclic_reprogram(c->c_cyc_id, CY_INFINITY);
+ c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
+
+ return (0);
+}
+
+int
+vmm_glue_callout_drain(struct callout *c)
+{
+ ASSERT(c->c_cyc_id != CYCLIC_NONE);
+ mutex_enter(&cpu_lock);
+ cyclic_remove(c->c_cyc_id);
+ c->c_cyc_id = CYCLIC_NONE;
+ c->c_flags &= ~(CALLOUT_ACTIVE | CALLOUT_PENDING);
+ mutex_exit(&cpu_lock);
+
+ return (0);
+}
+
+static int
+ipi_cpu_justreturn(xc_arg_t a1, xc_arg_t a2, xc_arg_t a3)
+{
+ return (0);
+}
+
+void
+ipi_cpu(int cpu, u_int ipi)
+{
+ cpuset_t set;
+
+ CPUSET_ONLY(set, cpu);
+ xc_call_nowait(NULL, NULL, NULL, CPUSET2BV(set),
+ ipi_cpu_justreturn);
+}
+
+#define SC_TABLESIZE 256 /* Must be power of 2. */
+#define SC_MASK (SC_TABLESIZE - 1)
+#define SC_SHIFT 8
+#define SC_HASH(wc) ((((uintptr_t)(wc) >> SC_SHIFT) ^ (uintptr_t)(wc)) & \
+ SC_MASK)
+#define SC_LOOKUP(wc) &sleepq_chains[SC_HASH(wc)]
+
+struct sleepqueue {
+ u_int sq_blockedcnt; /* Num. of blocked threads. */
+ LIST_ENTRY(sleepqueue) sq_hash; /* Chain. */
+ void *sq_wchan; /* Wait channel. */
+ kcondvar_t sq_cv;
+};
+
+struct sleepqueue_chain {
+ LIST_HEAD(, sleepqueue) sc_queues; /* List of sleep queues. */
+ struct mtx sc_lock; /* Spin lock for this chain. */
+};
+
+static struct sleepqueue_chain sleepq_chains[SC_TABLESIZE];
+
+#define SLEEPQ_CACHE_SZ (64)
+static kmem_cache_t *vmm_sleepq_cache;
+
+static int
+vmm_sleepq_cache_init(void *buf, void *user_arg, int kmflags)
+{
+ struct sleepqueue *sq = (struct sleepqueue *)buf;
+
+ bzero(sq, sizeof (struct sleepqueue));
+ cv_init(&sq->sq_cv, NULL, CV_DRIVER, NULL);
+
+ return (0);
+}
+
+static void
+vmm_sleepq_cache_fini(void *buf, void *user_arg)
+{
+ struct sleepqueue *sq = (struct sleepqueue *)buf;
+ cv_destroy(&sq->sq_cv);
+}
+
+static void
+init_sleepqueues(void)
+{
+ int i;
+
+ for (i = 0; i < SC_TABLESIZE; i++) {
+ LIST_INIT(&sleepq_chains[i].sc_queues);
+ mtx_init(&sleepq_chains[i].sc_lock, "sleepq chain", NULL,
+ MTX_SPIN);
+ }
+
+ vmm_sleepq_cache = kmem_cache_create("vmm_sleepq_cache",
+ sizeof (struct sleepqueue), SLEEPQ_CACHE_SZ, vmm_sleepq_cache_init,
+ vmm_sleepq_cache_fini, NULL, NULL, NULL, 0);
+
+}
+
+/*
+ * Lock the sleep queue chain associated with the specified wait channel.
+ */
+static void
+sleepq_lock(void *wchan)
+{
+ struct sleepqueue_chain *sc;
+
+ sc = SC_LOOKUP(wchan);
+ mtx_lock_spin(&sc->sc_lock);
+}
+
+/*
+ * Look up the sleep queue associated with a given wait channel in the hash
+ * table locking the associated sleep queue chain. If no queue is found in
+ * the table, NULL is returned.
+ */
+static struct sleepqueue *
+sleepq_lookup(void *wchan)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+
+ KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__));
+ sc = SC_LOOKUP(wchan);
+ mtx_assert(&sc->sc_lock, MA_OWNED);
+ LIST_FOREACH(sq, &sc->sc_queues, sq_hash)
+ if (sq->sq_wchan == wchan)
+ return (sq);
+ return (NULL);
+}
+
+/*
+ * Unlock the sleep queue chain associated with a given wait channel.
+ */
+static void
+sleepq_release(void *wchan)
+{
+ struct sleepqueue_chain *sc;
+
+ sc = SC_LOOKUP(wchan);
+ mtx_unlock_spin(&sc->sc_lock);
+}
+
+struct sleepqueue *
+sleepq_add(void *wchan)
+{
+ struct sleepqueue_chain *sc;
+ struct sleepqueue *sq;
+
+ sc = SC_LOOKUP(wchan);
+
+ /* Look up the sleep queue associated with the wait channel 'wchan'. */
+ sq = sleepq_lookup(wchan);
+
+ if (sq == NULL) {
+ sq = kmem_cache_alloc(vmm_sleepq_cache, KM_SLEEP);
+ LIST_INSERT_HEAD(&sc->sc_queues, sq, sq_hash);
+ sq->sq_wchan = wchan;
+ }
+
+ sq->sq_blockedcnt++;
+
+ return (sq);
+}
+
+void
+sleepq_remove(struct sleepqueue *sq)
+{
+ sq->sq_blockedcnt--;
+
+ if (sq->sq_blockedcnt == 0) {
+ LIST_REMOVE(sq, sq_hash);
+ kmem_cache_free(vmm_sleepq_cache, sq);
+ }
+}
+
+int
+msleep_spin(void *chan, struct mtx *mtx, const char *wmesg, int ticks)
+{
+ struct sleepqueue *sq;
+ int error;
+
+ sleepq_lock(chan);
+ sq = sleepq_add(chan);
+ sleepq_release(chan);
+
+ cv_reltimedwait(&sq->sq_cv, &mtx->m, ticks, TR_CLOCK_TICK);
+
+ sleepq_lock(chan);
+ sleepq_remove(sq);
+ sleepq_release(chan);
+
+ return (error);
+}
+
+void
+wakeup(void *chan)
+{
+ struct sleepqueue *sq;
+
+ sleepq_lock(chan);
+ sq = sleepq_lookup(chan);
+ if (sq != NULL) {
+ cv_broadcast(&sq->sq_cv);
+ }
+ sleepq_release(chan);
+}
+
+void
+wakeup_one(void *chan)
+{
+ struct sleepqueue *sq;
+
+ sleepq_lock(chan);
+ sq = sleepq_lookup(chan);
+ if (sq != NULL) {
+ cv_signal(&sq->sq_cv);
+ }
+ sleepq_release(chan);
+}
+
+u_int cpu_high; /* Highest arg to CPUID */
+u_int cpu_exthigh; /* Highest arg to extended CPUID */
+u_int cpu_id; /* Stepping ID */
+char cpu_vendor[20]; /* CPU Origin code */
+
+static void
+vmm_cpuid_init(void)
+{
+ u_int regs[4];
+
+ do_cpuid(0, regs);
+ cpu_high = regs[0];
+ ((u_int *)&cpu_vendor)[0] = regs[1];
+ ((u_int *)&cpu_vendor)[1] = regs[3];
+ ((u_int *)&cpu_vendor)[2] = regs[2];
+ cpu_vendor[12] = '\0';
+
+ do_cpuid(1, regs);
+ cpu_id = regs[0];
+
+ do_cpuid(0x80000000, regs);
+ cpu_exthigh = regs[0];
+}
+
+struct savefpu {
+ fpu_ctx_t fsa_fp_ctx;
+};
+
+static vmem_t *fpu_save_area_arena;
+
+static void
+fpu_save_area_init(void)
+{
+ fpu_save_area_arena = vmem_create("fpu_save_area",
+ NULL, 0, XSAVE_AREA_ALIGN,
+ segkmem_alloc, segkmem_free, heap_arena, 0, VM_BESTFIT | VM_SLEEP);
+}
+
+static void
+fpu_save_area_cleanup(void)
+{
+ vmem_destroy(fpu_save_area_arena);
+}
+
+struct savefpu *
+fpu_save_area_alloc(void)
+{
+ return (vmem_alloc(fpu_save_area_arena, sizeof (struct savefpu),
+ VM_SLEEP));
+}
+
+void
+fpu_save_area_free(struct savefpu *fsa)
+{
+ vmem_free(fpu_save_area_arena, fsa, sizeof (struct savefpu));
+}
+
+void
+fpu_save_area_reset(struct savefpu *fsa)
+{
+ extern const struct fxsave_state sse_initial;
+ extern const struct xsave_state avx_initial;
+ struct fpu_ctx *fp;
+ struct fxsave_state *fx;
+ struct xsave_state *xs;
+
+ fp = &fsa->fsa_fp_ctx;
+
+ fp->fpu_regs.kfpu_status = 0;
+ fp->fpu_regs.kfpu_xstatus = 0;
+
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ fx = &fp->fpu_regs.kfpu_u.kfpu_fx;
+ bcopy(&sse_initial, fx, sizeof (*fx));
+ break;
+ case FP_XSAVE:
+ fp->fpu_xsave_mask = (XFEATURE_ENABLED_X87 |
+ XFEATURE_ENABLED_SSE | XFEATURE_ENABLED_AVX);
+ xs = &fp->fpu_regs.kfpu_u.kfpu_xs;
+ bcopy(&avx_initial, xs, sizeof (*xs));
+ break;
+ default:
+ panic("Invalid fp_save_mech");
+ /*NOTREACHED*/
+ }
+}
+
+void
+fpuexit(kthread_t *td)
+{
+ fp_save(&curthread->t_lwp->lwp_pcb.pcb_fpu);
+}
+
+static __inline void
+vmm_fxrstor(struct fxsave_state *addr)
+{
+ __asm __volatile("fxrstor %0" : : "m" (*(addr)));
+}
+
+static __inline void
+vmm_fxsave(struct fxsave_state *addr)
+{
+ __asm __volatile("fxsave %0" : "=m" (*(addr)));
+}
+
+static __inline void
+vmm_xrstor(struct xsave_state *addr, uint64_t mask)
+{
+ uint32_t low, hi;
+
+ low = mask;
+ hi = mask >> 32;
+ __asm __volatile("xrstor %0" : : "m" (*addr), "a" (low), "d" (hi));
+}
+
+static __inline void
+vmm_xsave(struct xsave_state *addr, uint64_t mask)
+{
+ uint32_t low, hi;
+
+ low = mask;
+ hi = mask >> 32;
+ __asm __volatile("xsave %0" : "=m" (*addr) : "a" (low), "d" (hi) :
+ "memory");
+}
+
+void
+fpurestore(void *arg)
+{
+ struct savefpu *fsa = (struct savefpu *)arg;
+ struct fpu_ctx *fp;
+
+ fp = &fsa->fsa_fp_ctx;
+
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ vmm_fxrstor(&fp->fpu_regs.kfpu_u.kfpu_fx);
+ break;
+ case FP_XSAVE:
+ vmm_xrstor(&fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
+ break;
+ default:
+ panic("Invalid fp_save_mech");
+ /*NOTREACHED*/
+ }
+}
+
+void
+fpusave(void *arg)
+{
+ struct savefpu *fsa = (struct savefpu *)arg;
+ struct fpu_ctx *fp;
+
+ fp = &fsa->fsa_fp_ctx;
+
+ switch (fp_save_mech) {
+ case FP_FXSAVE:
+ vmm_fxsave(&fp->fpu_regs.kfpu_u.kfpu_fx);
+ break;
+ case FP_XSAVE:
+ vmm_xsave(&fp->fpu_regs.kfpu_u.kfpu_xs, fp->fpu_xsave_mask);
+ break;
+ default:
+ panic("Invalid fp_save_mech");
+ /*NOTREACHED*/
+ }
+}
+
+void
+vmm_sol_glue_init(void)
+{
+ vmm_cpuid_init();
+ fpu_save_area_init();
+ init_sleepqueues();
+}
+
+void
+vmm_sol_glue_cleanup(void)
+{
+ fpu_save_area_cleanup();
+ kmem_cache_destroy(vmm_sleepq_cache);
+}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c b/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c
new file mode 100644
index 0000000000..3bb5412d16
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_sol_mem.c
@@ -0,0 +1,111 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_mem.c 245678 2013-01-20 03:42:49Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_mem.c 245678 2013-01-20 03:42:49Z neel $");
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+
+#include <vm/vm.h>
+#include <machine/pmap.h>
+
+#include <sys/ddi.h>
+
+#include "vmm_util.h"
+#include "vmm_mem.h"
+
+int
+vmm_mem_init(void)
+{
+ return (0);
+}
+
+vm_paddr_t
+vmm_mem_alloc(size_t size)
+{
+ clock_t usec = 2 * 1000000;
+ vm_paddr_t pa;
+ caddr_t addr;
+
+ if (size != PAGE_SIZE)
+ panic("vmm_mem_alloc: invalid allocation size %lu", size);
+
+ while (usec > 0) {
+ if ((addr = kmem_zalloc(PAGE_SIZE, KM_NOSLEEP)) != NULL) {
+ ASSERT(((uintptr_t)addr & PAGE_MASK) == 0);
+ pa = vtophys((vm_offset_t)addr);
+ return (pa);
+ }
+ delay(drv_usectohz((clock_t)500000));
+ usec -= 500000;
+ }
+
+ return (NULL);
+}
+
+void
+vmm_mem_free(vm_paddr_t base, size_t length)
+{
+ page_t *pp;
+
+ if (base & PAGE_MASK) {
+ panic("vmm_mem_free: base 0x%0lx must be aligned on a "
+ "0x%0x boundary\n", base, PAGE_SIZE);
+ }
+
+ if (length != PAGE_SIZE) {
+ panic("vmm_mem_free: invalid length %lu", length);
+ }
+
+ pp = page_numtopp_nolock(btop(base));
+ kmem_free((void *)pp->p_offset, PAGE_SIZE);
+}
+
+vm_paddr_t
+vmm_mem_maxaddr(void)
+{
+
+ return (ptob(physmax + 1));
+}
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_stat.h b/usr/src/uts/i86pc/io/vmm/vmm_stat.h
new file mode 100644
index 0000000000..9bf7a60e0b
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_stat.h
@@ -0,0 +1,127 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_stat.h 250427 2013-05-10 02:59:49Z neel $
+ */
+
+#ifndef _VMM_STAT_H_
+#define _VMM_STAT_H_
+
+struct vm;
+
+#define MAX_VMM_STAT_ELEMS 64 /* arbitrary */
+
+enum vmm_stat_scope {
+ VMM_STAT_SCOPE_ANY,
+ VMM_STAT_SCOPE_INTEL, /* Intel VMX specific statistic */
+ VMM_STAT_SCOPE_AMD, /* AMD SVM specific statistic */
+};
+
+struct vmm_stat_type {
+ int index; /* position in the stats buffer */
+ int nelems; /* standalone or array */
+ const char *desc; /* description of statistic */
+ enum vmm_stat_scope scope;
+};
+
+void vmm_stat_init(void *arg);
+
+#define VMM_STAT_DEFINE(type, nelems, desc, scope) \
+ struct vmm_stat_type type[1] = { \
+ { -1, nelems, desc, scope } \
+ }; \
+ SYSINIT(type##_stat, SI_SUB_KLD, SI_ORDER_ANY, vmm_stat_init, type)
+
+#define VMM_STAT_DECLARE(type) \
+ extern struct vmm_stat_type type[1]
+
+#define VMM_STAT(type, desc) \
+ VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_ANY)
+#define VMM_STAT_INTEL(type, desc) \
+ VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_INTEL)
+#define VMM_STAT_AMD(type, desc) \
+ VMM_STAT_DEFINE(type, 1, desc, VMM_STAT_SCOPE_AMD)
+
+#define VMM_STAT_ARRAY(type, nelems, desc) \
+ VMM_STAT_DEFINE(type, nelems, desc, VMM_STAT_SCOPE_ANY)
+
+void *vmm_stat_alloc(void);
+void vmm_stat_free(void *vp);
+
+/*
+ * 'buf' should be at least fit 'MAX_VMM_STAT_TYPES' entries
+ */
+int vmm_stat_copy(struct vm *vm, int vcpu, int *num_stats, uint64_t *buf);
+int vmm_stat_desc_copy(int index, char *buf, int buflen);
+
+static void __inline
+vmm_stat_array_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst,
+ int statidx, uint64_t x)
+{
+#ifdef VMM_KEEP_STATS
+ uint64_t *stats;
+
+ stats = vcpu_stats(vm, vcpu);
+
+ if (vst->index >= 0 && statidx < vst->nelems)
+ stats[vst->index + statidx] += x;
+#endif
+}
+
+
+static void __inline
+vmm_stat_incr(struct vm *vm, int vcpu, struct vmm_stat_type *vst, uint64_t x)
+{
+
+#ifdef VMM_KEEP_STATS
+ vmm_stat_array_incr(vm, vcpu, vst, 0, x);
+#endif
+}
+
+VMM_STAT_DECLARE(VCPU_MIGRATIONS);
+VMM_STAT_DECLARE(VMEXIT_COUNT);
+VMM_STAT_DECLARE(VMEXIT_EXTINT);
+VMM_STAT_DECLARE(VMEXIT_HLT);
+VMM_STAT_DECLARE(VMEXIT_CR_ACCESS);
+VMM_STAT_DECLARE(VMEXIT_RDMSR);
+VMM_STAT_DECLARE(VMEXIT_WRMSR);
+VMM_STAT_DECLARE(VMEXIT_MTRAP);
+VMM_STAT_DECLARE(VMEXIT_PAUSE);
+VMM_STAT_DECLARE(VMEXIT_INTR_WINDOW);
+VMM_STAT_DECLARE(VMEXIT_NMI_WINDOW);
+VMM_STAT_DECLARE(VMEXIT_INOUT);
+VMM_STAT_DECLARE(VMEXIT_CPUID);
+VMM_STAT_DECLARE(VMEXIT_NESTED_FAULT);
+VMM_STAT_DECLARE(VMEXIT_INST_EMUL);
+VMM_STAT_DECLARE(VMEXIT_UNKNOWN);
+VMM_STAT_DECLARE(VMEXIT_ASTPENDING);
+VMM_STAT_DECLARE(VMEXIT_USERSPACE);
+VMM_STAT_DECLARE(VMEXIT_RENDEZVOUS);
+VMM_STAT_DECLARE(VMEXIT_USERSPACE);
+VMM_STAT_DECLARE(VMEXIT_EXCEPTION);
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.c b/usr/src/uts/i86pc/io/vmm/vmm_util.c
new file mode 100644
index 0000000000..fabd42e13c
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_util.c
@@ -0,0 +1,125 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_util.c 245678 2013-01-20 03:42:49Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/vmm_util.c 245678 2013-01-20 03:42:49Z neel $");
+
+#include <sys/param.h>
+#include <sys/libkern.h>
+
+#include <machine/md_var.h>
+
+#include "vmm_util.h"
+
+boolean_t
+vmm_is_intel(void)
+{
+
+ if (strcmp(cpu_vendor, "GenuineIntel") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_is_amd(void)
+{
+ if (strcmp(cpu_vendor, "AuthenticAMD") == 0)
+ return (TRUE);
+ else
+ return (FALSE);
+}
+
+boolean_t
+vmm_supports_1G_pages(void)
+{
+ unsigned int regs[4];
+
+ /*
+ * CPUID.80000001:EDX[bit 26] = 1 indicates support for 1GB pages
+ *
+ * Both Intel and AMD support this bit.
+ */
+ if (cpu_exthigh >= 0x80000001) {
+ do_cpuid(0x80000001, regs);
+ if (regs[3] & (1 << 26))
+ return (TRUE);
+ }
+ return (FALSE);
+}
+
+#ifdef __FreeBSD__
+#include <sys/proc.h>
+#include <machine/frame.h>
+#define DUMP_REG(x) printf(#x "\t\t0x%016lx\n", (long)(tf->tf_ ## x))
+#define DUMP_SEG(x) printf(#x "\t\t0x%04x\n", (unsigned)(tf->tf_ ## x))
+void
+dump_trapframe(struct trapframe *tf)
+{
+ DUMP_REG(rdi);
+ DUMP_REG(rsi);
+ DUMP_REG(rdx);
+ DUMP_REG(rcx);
+ DUMP_REG(r8);
+ DUMP_REG(r9);
+ DUMP_REG(rax);
+ DUMP_REG(rbx);
+ DUMP_REG(rbp);
+ DUMP_REG(r10);
+ DUMP_REG(r11);
+ DUMP_REG(r12);
+ DUMP_REG(r13);
+ DUMP_REG(r14);
+ DUMP_REG(r15);
+ DUMP_REG(trapno);
+ DUMP_REG(addr);
+ DUMP_REG(flags);
+ DUMP_REG(err);
+ DUMP_REG(rip);
+ DUMP_REG(rflags);
+ DUMP_REG(rsp);
+ DUMP_SEG(cs);
+ DUMP_SEG(ss);
+ DUMP_SEG(fs);
+ DUMP_SEG(gs);
+ DUMP_SEG(es);
+ DUMP_SEG(ds);
+}
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/vmm_util.h b/usr/src/uts/i86pc/io/vmm/vmm_util.h
new file mode 100644
index 0000000000..fe1c1c9449
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmm_util.h
@@ -0,0 +1,40 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/vmm_util.h 245678 2013-01-20 03:42:49Z neel $
+ */
+
+#ifndef _VMM_UTIL_H_
+#define _VMM_UTIL_H_
+
+struct trapframe;
+
+boolean_t vmm_is_intel(void);
+boolean_t vmm_is_amd(void);
+boolean_t vmm_supports_1G_pages(void);
+
+void dump_trapframe(struct trapframe *tf);
+
+#endif
diff --git a/usr/src/uts/i86pc/io/vmm/vmx_assym.s b/usr/src/uts/i86pc/io/vmm/vmx_assym.s
new file mode 100644
index 0000000000..d84ca30275
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/vmx_assym.s
@@ -0,0 +1 @@
+#include "vmx_assym.h"
diff --git a/usr/src/uts/i86pc/io/vmm/x86.c b/usr/src/uts/i86pc/io/vmm/x86.c
new file mode 100644
index 0000000000..02222ef5e7
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/x86.c
@@ -0,0 +1,276 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/x86.c 255645 2013-09-17 17:56:53Z grehan $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD: head/sys/amd64/vmm/x86.c 255645 2013-09-17 17:56:53Z grehan $");
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/cpuset.h>
+
+#include <machine/clock.h>
+#include <machine/cpufunc.h>
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+#include <machine/vmm.h>
+
+#include "x86.h"
+
+#define CPUID_VM_HIGH 0x40000000
+
+static const char bhyve_id[12] = "bhyve bhyve ";
+
+static uint64_t bhyve_xcpuids;
+
+int
+x86_emulate_cpuid(struct vm *vm, int vcpu_id,
+ uint32_t *eax, uint32_t *ebx, uint32_t *ecx, uint32_t *edx)
+{
+ int error;
+ unsigned int func, regs[4];
+ enum x2apic_state x2apic_state;
+
+ /*
+ * Requests for invalid CPUID levels should map to the highest
+ * available level instead.
+ */
+ if (cpu_exthigh != 0 && *eax >= 0x80000000) {
+ if (*eax > cpu_exthigh)
+ *eax = cpu_exthigh;
+ } else if (*eax >= 0x40000000) {
+ if (*eax > CPUID_VM_HIGH)
+ *eax = CPUID_VM_HIGH;
+ } else if (*eax > cpu_high) {
+ *eax = cpu_high;
+ }
+
+ func = *eax;
+
+ /*
+ * In general the approach used for CPU topology is to
+ * advertise a flat topology where all CPUs are packages with
+ * no multi-core or SMT.
+ */
+ switch (func) {
+ /*
+ * Pass these through to the guest
+ */
+ case CPUID_0000_0000:
+ case CPUID_0000_0002:
+ case CPUID_0000_0003:
+ case CPUID_8000_0000:
+ case CPUID_8000_0002:
+ case CPUID_8000_0003:
+ case CPUID_8000_0004:
+ case CPUID_8000_0006:
+ case CPUID_8000_0008:
+ cpuid_count(*eax, *ecx, regs);
+ break;
+
+ case CPUID_8000_0001:
+ /*
+ * Hide rdtscp/ia32_tsc_aux until we know how
+ * to deal with them.
+ */
+ cpuid_count(*eax, *ecx, regs);
+ regs[3] &= ~AMDID_RDTSCP;
+ break;
+
+ case CPUID_8000_0007:
+ cpuid_count(*eax, *ecx, regs);
+#ifdef __FreeBSD__
+ /*
+ * If the host TSCs are not synchronized across
+ * physical cpus then we cannot advertise an
+ * invariant tsc to a vcpu.
+ *
+ * XXX This still falls short because the vcpu
+ * can observe the TSC moving backwards as it
+ * migrates across physical cpus. But at least
+ * it should discourage the guest from using the
+ * TSC to keep track of time.
+ */
+ if (!smp_tsc)
+ regs[3] &= ~AMDPM_TSC_INVARIANT;
+#endif
+ break;
+
+ case CPUID_0000_0001:
+ do_cpuid(1, regs);
+
+ error = vm_get_x2apic_state(vm, vcpu_id, &x2apic_state);
+ if (error) {
+ panic("x86_emulate_cpuid: error %d "
+ "fetching x2apic state", error);
+ }
+
+ /*
+ * Override the APIC ID only in ebx
+ */
+ regs[1] &= ~(CPUID_LOCAL_APIC_ID);
+ regs[1] |= (vcpu_id << CPUID_0000_0001_APICID_SHIFT);
+
+ /*
+ * Don't expose VMX, SpeedStep or TME capability.
+ * Advertise x2APIC capability and Hypervisor guest.
+ */
+ regs[2] &= ~(CPUID2_VMX | CPUID2_EST | CPUID2_TM2);
+
+ regs[2] |= CPUID2_HV;
+
+ if (x2apic_state != X2APIC_DISABLED)
+ regs[2] |= CPUID2_X2APIC;
+
+ /*
+ * Hide xsave/osxsave/avx until the FPU save/restore
+ * issues are resolved
+ */
+ regs[2] &= ~(CPUID2_XSAVE | CPUID2_OSXSAVE |
+ CPUID2_AVX);
+
+ /*
+ * Hide monitor/mwait until we know how to deal with
+ * these instructions.
+ */
+ regs[2] &= ~CPUID2_MON;
+
+ /*
+ * Hide the performance and debug features.
+ */
+ regs[2] &= ~CPUID2_PDCM;
+
+ /*
+ * No TSC deadline support in the APIC yet
+ */
+ regs[2] &= ~CPUID2_TSCDLT;
+
+ /*
+ * Hide thermal monitoring
+ */
+ regs[3] &= ~(CPUID_ACPI | CPUID_TM);
+
+ /*
+ * Machine check handling is done in the host.
+ */
+ regs[3] &= ~(CPUID_MCA | CPUID_MCE);
+
+ /*
+ * Hide the debug store capability.
+ */
+ regs[3] &= ~CPUID_DS;
+
+ /*
+ * Disable multi-core.
+ */
+ regs[1] &= ~CPUID_HTT_CORES;
+ regs[3] &= ~CPUID_HTT;
+ break;
+
+ case CPUID_0000_0004:
+ do_cpuid(4, regs);
+
+ /*
+ * Do not expose topology.
+ */
+ regs[0] &= 0xffff8000;
+ /*
+ * The maximum number of processor cores in
+ * this physical processor package and the
+ * maximum number of threads sharing this
+ * cache are encoded with "plus 1" encoding.
+ * Adding one to the value in this register
+ * field to obtains the actual value.
+ *
+ * Therefore 0 for both indicates 1 core
+ * per package and no cache sharing.
+ */
+ break;
+
+ case CPUID_0000_0006:
+ case CPUID_0000_0007:
+ case CPUID_0000_000A:
+ case CPUID_0000_000D:
+ /*
+ * Handle the access, but report 0 for
+ * all options
+ */
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = 0;
+ regs[3] = 0;
+ break;
+
+ case CPUID_0000_000B:
+ /*
+ * Processor topology enumeration
+ */
+ regs[0] = 0;
+ regs[1] = 0;
+ regs[2] = *ecx & 0xff;
+ regs[3] = vcpu_id;
+ break;
+
+ case 0x40000000:
+ regs[0] = CPUID_VM_HIGH;
+ bcopy(bhyve_id, &regs[1], 4);
+ bcopy(bhyve_id + 4, &regs[2], 4);
+ bcopy(bhyve_id + 8, &regs[3], 4);
+ break;
+
+ default:
+ /*
+ * The leaf value has already been clamped so
+ * simply pass this through, keeping count of
+ * how many unhandled leaf values have been seen.
+ */
+ atomic_add_long(&bhyve_xcpuids, 1);
+ cpuid_count(*eax, *ecx, regs);
+ break;
+ }
+
+ *eax = regs[0];
+ *ebx = regs[1];
+ *ecx = regs[2];
+ *edx = regs[3];
+
+ return (1);
+}
diff --git a/usr/src/uts/i86pc/io/vmm/x86.h b/usr/src/uts/i86pc/io/vmm/x86.h
new file mode 100644
index 0000000000..db2340b37b
--- /dev/null
+++ b/usr/src/uts/i86pc/io/vmm/x86.h
@@ -0,0 +1,65 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/vmm/x86.h 255287 2013-09-06 05:16:10Z grehan $
+ */
+
+#ifndef _X86_H_
+#define _X86_H_
+
+#define CPUID_0000_0000 (0x0)
+#define CPUID_0000_0001 (0x1)
+#define CPUID_0000_0002 (0x2)
+#define CPUID_0000_0003 (0x3)
+#define CPUID_0000_0004 (0x4)
+#define CPUID_0000_0006 (0x6)
+#define CPUID_0000_0007 (0x7)
+#define CPUID_0000_000A (0xA)
+#define CPUID_0000_000B (0xB)
+#define CPUID_0000_000D (0xD)
+#define CPUID_8000_0000 (0x80000000)
+#define CPUID_8000_0001 (0x80000001)
+#define CPUID_8000_0002 (0x80000002)
+#define CPUID_8000_0003 (0x80000003)
+#define CPUID_8000_0004 (0x80000004)
+#define CPUID_8000_0006 (0x80000006)
+#define CPUID_8000_0007 (0x80000007)
+#define CPUID_8000_0008 (0x80000008)
+
+/*
+ * CPUID instruction Fn0000_0001:
+ */
+#define CPUID_0000_0001_APICID_MASK (0xff<<24)
+#define CPUID_0000_0001_APICID_SHIFT 24
+
+/*
+ * CPUID instruction Fn0000_0001 ECX
+ */
+#define CPUID_0000_0001_FEAT0_VMX (1<<5)
+
+int x86_emulate_cpuid(struct vm *vm, int vcpu_id, uint32_t *eax, uint32_t *ebx,
+ uint32_t *ecx, uint32_t *edx);
+
+#endif
diff --git a/usr/src/uts/i86pc/sys/Makefile b/usr/src/uts/i86pc/sys/Makefile
index 7fa9f8f6ef..80461a55c1 100644
--- a/usr/src/uts/i86pc/sys/Makefile
+++ b/usr/src/uts/i86pc/sys/Makefile
@@ -21,6 +21,7 @@
#
# Copyright 2009 Sun Microsystems, Inc. All rights reserved.
# Use is subject to license terms.
+# Copyright 2017 Joyent, Inc.
#
# uts/i86pc/sys/Makefile
#
@@ -36,7 +37,7 @@ include ../Makefile.i86pc
#
FILEMODE = 644
-HDRS= \
+CHKHDRS= \
acpidev.h \
amd_iommu.h \
asm_misc.h \
@@ -66,6 +67,15 @@ HDRS= \
xc_levels.h \
xsvc.h
+NOCHKHDRS= \
+ vmm.h \
+ vmm_impl.h \
+ vmm_instruction_emul.h
+
+HDRS= \
+ $(CHKHDRS) \
+ $(NOCHKHDRS)
+
ROOTHDRS= $(HDRS:%=$(USR_PSM_ISYS_DIR)/%)
ROOTDIR= $(ROOT)/usr/share/src
@@ -74,7 +84,7 @@ ROOTDIRS= $(ROOTDIR)/uts $(ROOTDIR)/uts/$(PLATFORM)
ROOTLINK= $(ROOTDIR)/uts/$(PLATFORM)/sys
LINKDEST= ../../../../platform/$(PLATFORM)/include/sys
-CHECKHDRS= $(HDRS:%.h=%.check)
+CHECKHDRS= $(CHKHDRS:%.h=%.check)
.KEEP_STATE:
diff --git a/usr/src/uts/i86pc/sys/viona_io.h b/usr/src/uts/i86pc/sys/viona_io.h
new file mode 100644
index 0000000000..a4fb0f2527
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/viona_io.h
@@ -0,0 +1,45 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2013 Pluribus Networks Inc.
+ */
+
+#ifndef _VIONA_IO_H_
+#define _VIONA_IO_H_
+
+#define VNA_IOC (('V' << 16)|('C' << 8))
+#define VNA_IOC_CREATE (VNA_IOC | 1)
+#define VNA_IOC_DELETE (VNA_IOC | 2)
+#define VNA_IOC_RX_RING_INIT (VNA_IOC | 3)
+#define VNA_IOC_TX_RING_INIT (VNA_IOC | 4)
+#define VNA_IOC_RX_RING_RESET (VNA_IOC | 5)
+#define VNA_IOC_TX_RING_RESET (VNA_IOC | 6)
+#define VNA_IOC_RX_RING_KICK (VNA_IOC | 7)
+#define VNA_IOC_TX_RING_KICK (VNA_IOC | 8)
+#define VNA_IOC_RX_INTR_CLR (VNA_IOC | 9)
+#define VNA_IOC_TX_INTR_CLR (VNA_IOC | 10)
+#define VNA_IOC_SET_FEATURES (VNA_IOC | 11)
+#define VNA_IOC_GET_FEATURES (VNA_IOC | 12)
+
+typedef struct vioc_create {
+ datalink_id_t c_linkid;
+ char c_vmname[64];
+ size_t c_lomem_size;
+ size_t c_himem_size;
+} vioc_create_t;
+
+typedef struct vioc_ring_init {
+ uint16_t ri_qsize;
+ uint64_t ri_qaddr;
+} vioc_ring_init_t;
+
+#endif /* _VIONA_IO_H_ */
diff --git a/usr/src/uts/i86pc/sys/vmm.h b/usr/src/uts/i86pc/sys/vmm.h
new file mode 100644
index 0000000000..e876ce748f
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/vmm.h
@@ -0,0 +1,565 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/include/vmm.h 273375 2014-10-21 07:10:43Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#ifndef _VMM_H_
+#define _VMM_H_
+
+#include <x86/segments.h>
+
+enum vm_suspend_how {
+ VM_SUSPEND_NONE,
+ VM_SUSPEND_RESET,
+ VM_SUSPEND_POWEROFF,
+ VM_SUSPEND_HALT,
+ VM_SUSPEND_LAST
+};
+
+/*
+ * Identifiers for architecturally defined registers.
+ */
+enum vm_reg_name {
+ VM_REG_GUEST_RAX,
+ VM_REG_GUEST_RBX,
+ VM_REG_GUEST_RCX,
+ VM_REG_GUEST_RDX,
+ VM_REG_GUEST_RSI,
+ VM_REG_GUEST_RDI,
+ VM_REG_GUEST_RBP,
+ VM_REG_GUEST_R8,
+ VM_REG_GUEST_R9,
+ VM_REG_GUEST_R10,
+ VM_REG_GUEST_R11,
+ VM_REG_GUEST_R12,
+ VM_REG_GUEST_R13,
+ VM_REG_GUEST_R14,
+ VM_REG_GUEST_R15,
+ VM_REG_GUEST_CR0,
+ VM_REG_GUEST_CR3,
+ VM_REG_GUEST_CR4,
+ VM_REG_GUEST_DR7,
+ VM_REG_GUEST_RSP,
+ VM_REG_GUEST_RIP,
+ VM_REG_GUEST_RFLAGS,
+ VM_REG_GUEST_ES,
+ VM_REG_GUEST_CS,
+ VM_REG_GUEST_SS,
+ VM_REG_GUEST_DS,
+ VM_REG_GUEST_FS,
+ VM_REG_GUEST_GS,
+ VM_REG_GUEST_LDTR,
+ VM_REG_GUEST_TR,
+ VM_REG_GUEST_IDTR,
+ VM_REG_GUEST_GDTR,
+ VM_REG_GUEST_EFER,
+ VM_REG_GUEST_CR2,
+ VM_REG_LAST
+};
+
+enum x2apic_state {
+ X2APIC_DISABLED,
+ X2APIC_ENABLED,
+ X2APIC_STATE_LAST
+};
+
+#define VM_INTINFO_VECTOR(info) ((info) & 0xff)
+#define VM_INTINFO_DEL_ERRCODE 0x800
+#define VM_INTINFO_RSVD 0x7ffff000
+#define VM_INTINFO_VALID 0x80000000
+#define VM_INTINFO_TYPE 0x700
+#define VM_INTINFO_HWINTR (0 << 8)
+#define VM_INTINFO_NMI (2 << 8)
+#define VM_INTINFO_HWEXCEPTION (3 << 8)
+#define VM_INTINFO_SWINTR (4 << 8)
+
+#define VM_MAX_NAMELEN 32
+
+#ifdef _KERNEL
+
+struct vm;
+struct vm_exception;
+struct vm_memory_segment;
+struct seg_desc;
+struct vm_exit;
+struct vm_run;
+struct vhpet;
+struct vioapic;
+struct vlapic;
+struct vm_guest_paging;
+
+typedef int (*vmm_init_func_t)(void);
+typedef int (*vmm_cleanup_func_t)(void);
+typedef void * (*vmi_init_func_t)(struct vm *vm); /* instance specific apis */
+typedef int (*vmi_run_func_t)(void *vmi, int vcpu, register_t rip);
+typedef void (*vmi_cleanup_func_t)(void *vmi);
+typedef int (*vmi_mmap_set_func_t)(void *vmi, vm_paddr_t gpa,
+ vm_paddr_t hpa, size_t length,
+ vm_memattr_t attr, int prot,
+ boolean_t superpages_ok);
+typedef vm_paddr_t (*vmi_mmap_get_func_t)(void *vmi, vm_paddr_t gpa);
+typedef int (*vmi_get_register_t)(void *vmi, int vcpu, int num,
+ uint64_t *retval);
+typedef int (*vmi_set_register_t)(void *vmi, int vcpu, int num,
+ uint64_t val);
+typedef int (*vmi_get_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_set_desc_t)(void *vmi, int vcpu, int num,
+ struct seg_desc *desc);
+typedef int (*vmi_get_cap_t)(void *vmi, int vcpu, int num, int *retval);
+typedef int (*vmi_set_cap_t)(void *vmi, int vcpu, int num, int val);
+typedef struct vlapic * (*vmi_vlapic_init)(void *vmi, int vcpu);
+typedef void (*vmi_vlapic_cleanup)(void *vmi, struct vlapic *vlapic);
+
+struct vmm_ops {
+ vmm_init_func_t init; /* module wide initialization */
+ vmm_cleanup_func_t cleanup;
+
+ vmi_init_func_t vminit; /* vm-specific initialization */
+ vmi_run_func_t vmrun;
+ vmi_cleanup_func_t vmcleanup;
+ vmi_mmap_set_func_t vmmmap_set;
+ vmi_mmap_get_func_t vmmmap_get;
+ vmi_get_register_t vmgetreg;
+ vmi_set_register_t vmsetreg;
+ vmi_get_desc_t vmgetdesc;
+ vmi_set_desc_t vmsetdesc;
+ vmi_get_cap_t vmgetcap;
+ vmi_set_cap_t vmsetcap;
+ vmi_vlapic_init vlapic_init;
+ vmi_vlapic_cleanup vlapic_cleanup;
+};
+
+extern struct vmm_ops vmm_ops_intel;
+extern struct vmm_ops vmm_ops_amd;
+
+int vm_create(const char *name, struct vm **retvm);
+void vm_destroy(struct vm *vm);
+const char *vm_name(struct vm *vm);
+int vm_malloc(struct vm *vm, vm_paddr_t gpa, size_t len);
+#ifdef __FreeBSD__
+int vm_map_mmio(struct vm *vm, vm_paddr_t gpa, size_t len, vm_paddr_t hpa);
+#endif
+int vm_unmap_mmio(struct vm *vm, vm_paddr_t gpa, size_t len);
+#ifndef __FreeBSD__
+vm_paddr_t vm_gpa2hpa(struct vm *vm, vm_paddr_t gpa, size_t size);
+#endif
+void *vm_gpa_hold(struct vm *, vm_paddr_t gpa, size_t len, int prot,
+ void **cookie);
+void vm_gpa_release(void *cookie);
+int vm_gpabase2memseg(struct vm *vm, vm_paddr_t gpabase,
+ struct vm_memory_segment *seg);
+int vm_get_register(struct vm *vm, int vcpu, int reg, uint64_t *retval);
+int vm_set_register(struct vm *vm, int vcpu, int reg, uint64_t val);
+int vm_get_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *ret_desc);
+int vm_set_seg_desc(struct vm *vm, int vcpu, int reg,
+ struct seg_desc *desc);
+int vm_run(struct vm *vm, struct vm_run *vmrun);
+int vm_inject_nmi(struct vm *vm, int vcpu);
+int vm_nmi_pending(struct vm *vm, int vcpuid);
+void vm_nmi_clear(struct vm *vm, int vcpuid);
+int vm_inject_extint(struct vm *vm, int vcpu);
+int vm_extint_pending(struct vm *vm, int vcpuid);
+void vm_extint_clear(struct vm *vm, int vcpuid);
+struct vlapic *vm_lapic(struct vm *vm, int cpu);
+struct vioapic *vm_ioapic(struct vm *vm);
+struct vhpet *vm_hpet(struct vm *vm);
+int vm_get_capability(struct vm *vm, int vcpu, int type, int *val);
+int vm_set_capability(struct vm *vm, int vcpu, int type, int val);
+int vm_get_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state *state);
+int vm_set_x2apic_state(struct vm *vm, int vcpu, enum x2apic_state state);
+int vm_apicid2vcpuid(struct vm *vm, int apicid);
+int vm_activate_cpu(struct vm *vm, int vcpu);
+cpuset_t vm_active_cpus(struct vm *vm);
+struct vm_exit *vm_exitinfo(struct vm *vm, int vcpuid);
+
+typedef void (*vm_rendezvous_func_t)(struct vm *vm, int vcpuid, void *arg);
+
+/*
+ * Return 1 if device indicated by bus/slot/func is supposed to be a
+ * pci passthrough device.
+ *
+ * Return 0 otherwise.
+ */
+int vmm_is_pptdev(int bus, int slot, int func);
+
+void *vm_iommu_domain(struct vm *vm);
+
+enum vcpu_state {
+ VCPU_IDLE,
+ VCPU_FROZEN,
+ VCPU_RUNNING,
+ VCPU_SLEEPING,
+};
+
+int vcpu_set_state(struct vm *vm, int vcpu, enum vcpu_state state,
+ bool from_idle);
+enum vcpu_state vcpu_get_state(struct vm *vm, int vcpu, int *hostcpu);
+
+static int __inline
+vcpu_is_running(struct vm *vm, int vcpu, int *hostcpu)
+{
+ return (vcpu_get_state(vm, vcpu, hostcpu) == VCPU_RUNNING);
+}
+
+void *vcpu_stats(struct vm *vm, int vcpu);
+void vm_interrupt_hostcpu(struct vm *vm, int vcpu);
+void vcpu_notify_event(struct vm *vm, int vcpuid, bool lapic_intr);
+struct vatpic *vm_atpic(struct vm *vm);
+struct vatpit *vm_atpit(struct vm *vm);
+
+/*
+ * Inject exception 'vme' into the guest vcpu. This function returns 0 on
+ * success and non-zero on failure.
+ *
+ * Wrapper functions like 'vm_inject_gp()' should be preferred to calling
+ * this function directly because they enforce the trap-like or fault-like
+ * behavior of an exception.
+ *
+ * This function should only be called in the context of the thread that is
+ * executing this vcpu.
+ */
+int vm_inject_exception(struct vm *vm, int vcpuid, struct vm_exception *vme);
+
+/*
+ * This function is called after a VM-exit that occurred during exception or
+ * interrupt delivery through the IDT. The format of 'intinfo' is described
+ * in Figure 15-1, "EXITINTINFO for All Intercepts", APM, Vol 2.
+ *
+ * If a VM-exit handler completes the event delivery successfully then it
+ * should call vm_exit_intinfo() to extinguish the pending event. For e.g.,
+ * if the task switch emulation is triggered via a task gate then it should
+ * call this function with 'intinfo=0' to indicate that the external event
+ * is not pending anymore.
+ *
+ * Return value is 0 on success and non-zero on failure.
+ */
+int vm_exit_intinfo(struct vm *vm, int vcpuid, uint64_t intinfo);
+
+/*
+ * This function is called before every VM-entry to retrieve a pending
+ * event that should be injected into the guest. This function combines
+ * nested events into a double or triple fault.
+ *
+ * Returns 0 if there are no events that need to be injected into the guest
+ * and non-zero otherwise.
+ */
+int vm_entry_intinfo(struct vm *vm, int vcpuid, uint64_t *info);
+
+int vm_get_intinfo(struct vm *vm, int vcpuid, uint64_t *info1, uint64_t *info2);
+
+enum vm_reg_name vm_segment_name(int seg_encoding);
+
+struct vm_copyinfo {
+ uint64_t gpa;
+ size_t len;
+ void *hva;
+ void *cookie;
+};
+
+/*
+ * Set up 'copyinfo[]' to copy to/from guest linear address space starting
+ * at 'gla' and 'len' bytes long. The 'prot' should be set to PROT_READ for
+ * a copyin or PROT_WRITE for a copyout.
+ *
+ * Returns 0 on success.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ *
+ * The 'copyinfo[]' can be passed to 'vm_copyin()' or 'vm_copyout()' only if
+ * the return value is 0. The 'copyinfo[]' resources should be freed by calling
+ * 'vm_copy_teardown()' after the copy is done.
+ */
+int vm_copy_setup(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, size_t len, int prot, struct vm_copyinfo *copyinfo,
+ int num_copyinfo);
+void vm_copy_teardown(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ int num_copyinfo);
+void vm_copyin(struct vm *vm, int vcpuid, struct vm_copyinfo *copyinfo,
+ void *kaddr, size_t len);
+void vm_copyout(struct vm *vm, int vcpuid, const void *kaddr,
+ struct vm_copyinfo *copyinfo, size_t len);
+#endif /* KERNEL */
+
+#define VM_MAXCPU 16 /* maximum virtual cpus */
+
+/*
+ * Identifiers for optional vmm capabilities
+ */
+enum vm_cap_type {
+ VM_CAP_HALT_EXIT,
+ VM_CAP_MTRAP_EXIT,
+ VM_CAP_PAUSE_EXIT,
+ VM_CAP_UNRESTRICTED_GUEST,
+ VM_CAP_ENABLE_INVPCID,
+ VM_CAP_MAX
+};
+
+enum vm_intr_trigger {
+ EDGE_TRIGGER,
+ LEVEL_TRIGGER
+};
+
+/*
+ * The 'access' field has the format specified in Table 21-2 of the Intel
+ * Architecture Manual vol 3b.
+ *
+ * XXX The contents of the 'access' field are architecturally defined except
+ * bit 16 - Segment Unusable.
+ */
+struct seg_desc {
+ uint64_t base;
+ uint32_t limit;
+ uint32_t access;
+};
+
+#define SEG_DESC_TYPE(access) ((access) & 0x001f)
+#define SEG_DESC_DPL(access) (((access) >> 5) & 0x3)
+#define SEG_DESC_PRESENT(access) (((access) & 0x0080) ? 1 : 0)
+#define SEG_DESC_DEF32(access) (((access) & 0x4000) ? 1 : 0)
+#define SEG_DESC_GRANULARITY(access) (((access) & 0x8000) ? 1 : 0)
+#define SEG_DESC_UNUSABLE(access) (((access) & 0x10000) ? 1 : 0)
+
+enum vm_cpu_mode {
+ CPU_MODE_REAL,
+ CPU_MODE_PROTECTED,
+ CPU_MODE_COMPATIBILITY, /* IA-32E mode (CS.L = 0) */
+ CPU_MODE_64BIT, /* IA-32E mode (CS.L = 1) */
+};
+
+enum vm_paging_mode {
+ PAGING_MODE_FLAT,
+ PAGING_MODE_32,
+ PAGING_MODE_PAE,
+ PAGING_MODE_64,
+};
+
+struct vm_guest_paging {
+ uint64_t cr3;
+ int cpl;
+ enum vm_cpu_mode cpu_mode;
+ enum vm_paging_mode paging_mode;
+};
+
+/*
+ * The data structures 'vie' and 'vie_op' are meant to be opaque to the
+ * consumers of instruction decoding. The only reason why their contents
+ * need to be exposed is because they are part of the 'vm_exit' structure.
+ */
+struct vie_op {
+ uint8_t op_byte; /* actual opcode byte */
+ uint8_t op_type; /* type of operation (e.g. MOV) */
+ uint16_t op_flags;
+};
+
+#define VIE_INST_SIZE 15
+struct vie {
+ uint8_t inst[VIE_INST_SIZE]; /* instruction bytes */
+ uint8_t num_valid; /* size of the instruction */
+ uint8_t num_processed;
+
+ uint8_t addrsize:4, opsize:4; /* address and operand sizes */
+ uint8_t rex_w:1, /* REX prefix */
+ rex_r:1,
+ rex_x:1,
+ rex_b:1,
+ rex_present:1,
+ repz_present:1, /* REP/REPE/REPZ prefix */
+ repnz_present:1, /* REPNE/REPNZ prefix */
+ opsize_override:1, /* Operand size override */
+ addrsize_override:1, /* Address size override */
+ segment_override:1; /* Segment override */
+
+ uint8_t mod:2, /* ModRM byte */
+ reg:4,
+ rm:4;
+
+ uint8_t ss:2, /* SIB byte */
+ index:4,
+ base:4;
+
+ uint8_t disp_bytes;
+ uint8_t imm_bytes;
+
+ uint8_t scale;
+ int base_register; /* VM_REG_GUEST_xyz */
+ int index_register; /* VM_REG_GUEST_xyz */
+ int segment_register; /* VM_REG_GUEST_xyz */
+
+ int64_t displacement; /* optional addr displacement */
+ int64_t immediate; /* optional immediate operand */
+
+ uint8_t decoded; /* set to 1 if successfully decoded */
+
+ struct vie_op op; /* opcode description */
+};
+
+enum vm_exitcode {
+ VM_EXITCODE_INOUT,
+ VM_EXITCODE_VMX,
+ VM_EXITCODE_BOGUS,
+ VM_EXITCODE_RDMSR,
+ VM_EXITCODE_WRMSR,
+ VM_EXITCODE_HLT,
+ VM_EXITCODE_MTRAP,
+ VM_EXITCODE_PAUSE,
+ VM_EXITCODE_PAGING,
+ VM_EXITCODE_INST_EMUL,
+ VM_EXITCODE_SPINUP_AP,
+ VM_EXITCODE_DEPRECATED1, /* used to be SPINDOWN_CPU */
+ VM_EXITCODE_INOUT_STR,
+ VM_EXITCODE_MAX
+};
+
+struct vm_inout {
+ uint16_t bytes:3; /* 1 or 2 or 4 */
+ uint16_t in:1;
+ uint16_t string:1;
+ uint16_t rep:1;
+ uint16_t port;
+ uint32_t eax; /* valid for out */
+};
+
+struct vm_inout_str {
+ struct vm_inout inout; /* must be the first element */
+ struct vm_guest_paging paging;
+ uint64_t rflags;
+ uint64_t cr0;
+ uint64_t index;
+ uint64_t count; /* rep=1 (%rcx), rep=0 (1) */
+ int addrsize;
+ enum vm_reg_name seg_name;
+ struct seg_desc seg_desc;
+};
+
+struct vm_exit {
+ enum vm_exitcode exitcode;
+ int inst_length; /* 0 means unknown */
+ uint64_t rip;
+ union {
+ struct vm_inout inout;
+ struct vm_inout_str inout_str;
+ struct {
+ uint64_t gpa;
+ int fault_type;
+ } paging;
+ struct {
+ uint64_t gpa;
+ uint64_t gla;
+ uint64_t cs_base;
+ int cs_d; /* CS.D */
+ struct vm_guest_paging paging;
+ struct vie vie;
+ } inst_emul;
+ /*
+ * VMX specific payload. Used when there is no "better"
+ * exitcode to represent the VM-exit.
+ */
+ struct {
+ int status; /* vmx inst status */
+ /*
+ * 'exit_reason' and 'exit_qualification' are valid
+ * only if 'status' is zero.
+ */
+ uint32_t exit_reason;
+ uint64_t exit_qualification;
+ /*
+ * 'inst_error' and 'inst_type' are valid
+ * only if 'status' is non-zero.
+ */
+ int inst_type;
+ int inst_error;
+ } vmx;
+ struct {
+ uint32_t code; /* ecx value */
+ uint64_t wval;
+ } msr;
+ struct {
+ int vcpu;
+ uint64_t rip;
+ } spinup_ap;
+ struct {
+ uint64_t rflags;
+ } hlt;
+ } u;
+};
+
+/* APIs to inject faults into the guest */
+void vm_inject_fault(void *vm, int vcpuid, int vector, int errcode_valid,
+ int errcode);
+
+static __inline void
+vm_inject_ud(void *vm, int vcpuid)
+{
+ vm_inject_fault(vm, vcpuid, IDT_UD, 0, 0);
+}
+
+static __inline void
+vm_inject_gp(void *vm, int vcpuid)
+{
+ vm_inject_fault(vm, vcpuid, IDT_GP, 1, 0);
+}
+
+static __inline void
+vm_inject_ac(void *vm, int vcpuid, int errcode)
+{
+ vm_inject_fault(vm, vcpuid, IDT_AC, 1, errcode);
+}
+
+static __inline void
+vm_inject_ss(void *vm, int vcpuid, int errcode)
+{
+ vm_inject_fault(vm, vcpuid, IDT_SS, 1, errcode);
+}
+
+void vm_inject_pf(void *vm, int vcpuid, int error_code, uint64_t cr2);
+
+int vm_restart_instruction(void *vm, int vcpuid);
+
+#ifndef __FreeBSD__
+#ifdef _KERNEL
+extern void vmm_sol_glue_init(void);
+extern void vmm_sol_glue_cleanup(void);
+
+extern int vmm_mod_load(void);
+extern int vmm_mod_unload(void);
+#endif
+#endif
+
+#endif /* _VMM_H_ */
diff --git a/usr/src/uts/i86pc/sys/vmm_dev.h b/usr/src/uts/i86pc/sys/vmm_dev.h
new file mode 100644
index 0000000000..3e74eb8786
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/vmm_dev.h
@@ -0,0 +1,334 @@
+/*-
+ * Copyright (c) 2011 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/include/vmm_dev.h 268889 2014-07-19 20:59:08Z neel $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#ifndef _VMM_DEV_H_
+#define _VMM_DEV_H_
+
+#ifdef _KERNEL
+void vmmdev_init(void);
+int vmmdev_cleanup(void);
+#endif
+
+struct vm_memory_segment {
+ vm_paddr_t gpa; /* in */
+ size_t len;
+ int wired;
+};
+
+struct vm_register {
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ uint64_t regval;
+};
+
+struct vm_seg_desc { /* data or code segment */
+ int cpuid;
+ int regnum; /* enum vm_reg_name */
+ struct seg_desc desc;
+};
+
+struct vm_run {
+ int cpuid;
+ struct vm_exit vm_exit;
+};
+
+struct vm_exception {
+ int cpuid;
+ int vector;
+ uint32_t error_code;
+ int error_code_valid;
+ int restart_instruction;
+};
+
+struct vm_lapic_msi {
+ uint64_t msg;
+ uint64_t addr;
+};
+
+struct vm_lapic_irq {
+ int cpuid;
+ int vector;
+};
+
+struct vm_ioapic_irq {
+ int irq;
+};
+
+struct vm_isa_irq {
+ int atpic_irq;
+ int ioapic_irq;
+};
+
+struct vm_isa_irq_trigger {
+ int atpic_irq;
+ enum vm_intr_trigger trigger;
+};
+
+struct vm_capability {
+ int cpuid;
+ enum vm_cap_type captype;
+ int capval;
+ int allcpus;
+};
+
+struct vm_pptdev {
+ int bus;
+ int slot;
+ int func;
+};
+
+struct vm_pptdev_mmio {
+ int bus;
+ int slot;
+ int func;
+ vm_paddr_t gpa;
+ vm_paddr_t hpa;
+ size_t len;
+};
+
+struct vm_pptdev_msi {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int numvec; /* 0 means disabled */
+ uint32_t msg;
+ uint64_t addr;
+};
+
+struct vm_pptdev_msix {
+ int vcpu;
+ int bus;
+ int slot;
+ int func;
+ int idx;
+ uint32_t msg;
+ uint32_t vector_control;
+ uint64_t addr;
+};
+
+struct vm_nmi {
+ int cpuid;
+};
+
+#define MAX_VM_STATS 64
+struct vm_stats {
+ int cpuid; /* in */
+ int num_entries; /* out */
+ struct timeval tv;
+ uint64_t statbuf[MAX_VM_STATS];
+};
+
+struct vm_stat_desc {
+ int index; /* in */
+ char desc[128]; /* out */
+};
+
+struct vm_x2apic {
+ int cpuid;
+ enum x2apic_state state;
+};
+
+struct vm_gpa_pte {
+ uint64_t gpa; /* in */
+ uint64_t pte[4]; /* out */
+ int ptenum;
+};
+
+struct vm_hpet_cap {
+ uint32_t capabilities; /* lower 32 bits of HPET capabilities */
+};
+
+struct vm_activate_cpu {
+ int vcpuid;
+};
+
+struct vm_gla2gpa {
+ int vcpuid; /* inputs */
+ int prot; /* PROT_READ or PROT_WRITE */
+ uint64_t gla;
+ struct vm_guest_paging paging;
+ int fault; /* outputs */
+ uint64_t gpa;
+};
+
+struct vm_cpuset {
+ int which;
+ int cpusetsize;
+ cpuset_t *cpus;
+};
+#define VM_ACTIVE_CPUS 0
+#define VM_SUSPENDED_CPUS 1
+
+enum {
+ /* general routines */
+ IOCNUM_ABIVERS = 0,
+ IOCNUM_RUN = 1,
+ IOCNUM_SET_CAPABILITY = 2,
+ IOCNUM_GET_CAPABILITY = 3,
+
+ /* memory apis */
+ IOCNUM_MAP_MEMORY = 10,
+ IOCNUM_GET_MEMORY_SEG = 11,
+ IOCNUM_GET_GPA_PMAP = 12,
+ IOCNUM_GLA2GPA = 13,
+
+ /* register/state accessors */
+ IOCNUM_SET_REGISTER = 20,
+ IOCNUM_GET_REGISTER = 21,
+ IOCNUM_SET_SEGMENT_DESCRIPTOR = 22,
+ IOCNUM_GET_SEGMENT_DESCRIPTOR = 23,
+
+ /* interrupt injection */
+ IOCNUM_INJECT_EXCEPTION = 30,
+ IOCNUM_LAPIC_IRQ = 31,
+ IOCNUM_INJECT_NMI = 32,
+ IOCNUM_IOAPIC_ASSERT_IRQ = 33,
+ IOCNUM_IOAPIC_DEASSERT_IRQ = 34,
+ IOCNUM_IOAPIC_PULSE_IRQ = 35,
+ IOCNUM_LAPIC_MSI = 36,
+ IOCNUM_LAPIC_LOCAL_IRQ = 37,
+ IOCNUM_IOAPIC_PINCOUNT = 38,
+ IOCNUM_RESTART_INSTRUCTION = 39,
+
+ /* PCI pass-thru */
+ IOCNUM_BIND_PPTDEV = 40,
+ IOCNUM_UNBIND_PPTDEV = 41,
+ IOCNUM_MAP_PPTDEV_MMIO = 42,
+ IOCNUM_PPTDEV_MSI = 43,
+ IOCNUM_PPTDEV_MSIX = 44,
+
+ /* statistics */
+ IOCNUM_VM_STATS = 50,
+ IOCNUM_VM_STAT_DESC = 51,
+
+ /* kernel device state */
+ IOCNUM_SET_X2APIC_STATE = 60,
+ IOCNUM_GET_X2APIC_STATE = 61,
+ IOCNUM_GET_HPET_CAPABILITIES = 62,
+
+ /* legacy interrupt injection */
+ IOCNUM_ISA_ASSERT_IRQ = 80,
+ IOCNUM_ISA_DEASSERT_IRQ = 81,
+ IOCNUM_ISA_PULSE_IRQ = 82,
+ IOCNUM_ISA_SET_IRQ_TRIGGER = 83,
+
+ /* vm_cpuset */
+ IOCNUM_ACTIVATE_CPU = 90,
+ IOCNUM_GET_CPUSET = 91,
+};
+
+#define VM_RUN \
+ _IOWR('v', IOCNUM_RUN, struct vm_run)
+#define VM_MAP_MEMORY \
+ _IOWR('v', IOCNUM_MAP_MEMORY, struct vm_memory_segment)
+#define VM_GET_MEMORY_SEG \
+ _IOWR('v', IOCNUM_GET_MEMORY_SEG, struct vm_memory_segment)
+#define VM_SET_REGISTER \
+ _IOW('v', IOCNUM_SET_REGISTER, struct vm_register)
+#define VM_GET_REGISTER \
+ _IOWR('v', IOCNUM_GET_REGISTER, struct vm_register)
+#define VM_SET_SEGMENT_DESCRIPTOR \
+ _IOW('v', IOCNUM_SET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_GET_SEGMENT_DESCRIPTOR \
+ _IOWR('v', IOCNUM_GET_SEGMENT_DESCRIPTOR, struct vm_seg_desc)
+#define VM_INJECT_EXCEPTION \
+ _IOW('v', IOCNUM_INJECT_EXCEPTION, struct vm_exception)
+#define VM_LAPIC_IRQ \
+ _IOW('v', IOCNUM_LAPIC_IRQ, struct vm_lapic_irq)
+#define VM_LAPIC_LOCAL_IRQ \
+ _IOW('v', IOCNUM_LAPIC_LOCAL_IRQ, struct vm_lapic_irq)
+#define VM_LAPIC_MSI \
+ _IOW('v', IOCNUM_LAPIC_MSI, struct vm_lapic_msi)
+#define VM_IOAPIC_ASSERT_IRQ \
+ _IOW('v', IOCNUM_IOAPIC_ASSERT_IRQ, struct vm_ioapic_irq)
+#define VM_IOAPIC_DEASSERT_IRQ \
+ _IOW('v', IOCNUM_IOAPIC_DEASSERT_IRQ, struct vm_ioapic_irq)
+#define VM_IOAPIC_PULSE_IRQ \
+ _IOW('v', IOCNUM_IOAPIC_PULSE_IRQ, struct vm_ioapic_irq)
+#define VM_IOAPIC_PINCOUNT \
+ _IOR('v', IOCNUM_IOAPIC_PINCOUNT, int)
+#define VM_ISA_ASSERT_IRQ \
+ _IOW('v', IOCNUM_ISA_ASSERT_IRQ, struct vm_isa_irq)
+#define VM_ISA_DEASSERT_IRQ \
+ _IOW('v', IOCNUM_ISA_DEASSERT_IRQ, struct vm_isa_irq)
+#define VM_ISA_PULSE_IRQ \
+ _IOW('v', IOCNUM_ISA_PULSE_IRQ, struct vm_isa_irq)
+#define VM_ISA_SET_IRQ_TRIGGER \
+ _IOW('v', IOCNUM_ISA_SET_IRQ_TRIGGER, struct vm_isa_irq_trigger)
+#define VM_SET_CAPABILITY \
+ _IOW('v', IOCNUM_SET_CAPABILITY, struct vm_capability)
+#define VM_GET_CAPABILITY \
+ _IOWR('v', IOCNUM_GET_CAPABILITY, struct vm_capability)
+#define VM_BIND_PPTDEV \
+ _IOW('v', IOCNUM_BIND_PPTDEV, struct vm_pptdev)
+#define VM_UNBIND_PPTDEV \
+ _IOW('v', IOCNUM_UNBIND_PPTDEV, struct vm_pptdev)
+#define VM_MAP_PPTDEV_MMIO \
+ _IOW('v', IOCNUM_MAP_PPTDEV_MMIO, struct vm_pptdev_mmio)
+#define VM_PPTDEV_MSI \
+ _IOW('v', IOCNUM_PPTDEV_MSI, struct vm_pptdev_msi)
+#define VM_PPTDEV_MSIX \
+ _IOW('v', IOCNUM_PPTDEV_MSIX, struct vm_pptdev_msix)
+#define VM_INJECT_NMI \
+ _IOW('v', IOCNUM_INJECT_NMI, struct vm_nmi)
+#ifdef __FreeBSD__
+#define VM_STATS \
+ _IOWR('v', IOCNUM_VM_STATS, struct vm_stats)
+#endif
+#define VM_STAT_DESC \
+ _IOWR('v', IOCNUM_VM_STAT_DESC, struct vm_stat_desc)
+#define VM_SET_X2APIC_STATE \
+ _IOW('v', IOCNUM_SET_X2APIC_STATE, struct vm_x2apic)
+#define VM_GET_X2APIC_STATE \
+ _IOWR('v', IOCNUM_GET_X2APIC_STATE, struct vm_x2apic)
+#define VM_GET_HPET_CAPABILITIES \
+ _IOR('v', IOCNUM_GET_HPET_CAPABILITIES, struct vm_hpet_cap)
+#define VM_GET_GPA_PMAP \
+ _IOWR('v', IOCNUM_GET_GPA_PMAP, struct vm_gpa_pte)
+#define VM_GLA2GPA \
+ _IOWR('v', IOCNUM_GLA2GPA, struct vm_gla2gpa)
+#define VM_ACTIVATE_CPU \
+ _IOW('v', IOCNUM_ACTIVATE_CPU, struct vm_activate_cpu)
+#define VM_GET_CPUS \
+ _IOW('v', IOCNUM_GET_CPUSET, struct vm_cpuset)
+#define VM_RESTART_INSTRUCTION \
+ _IOW('v', IOCNUM_RESTART_INSTRUCTION, int)
+#endif
diff --git a/usr/src/uts/i86pc/sys/vmm_impl.h b/usr/src/uts/i86pc/sys/vmm_impl.h
new file mode 100644
index 0000000000..1602fa286d
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/vmm_impl.h
@@ -0,0 +1,86 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright 2014 Pluribus Networks Inc.
+ */
+
+#ifndef _VMM_IMPL_H_
+#define _VMM_IMPL_H_
+
+#include <sys/mutex.h>
+#include <sys/queue.h>
+#include <sys/varargs.h>
+
+/*
+ * /dev names:
+ * /dev/vmmctl - control device
+ * /dev/vmm/<name> - vm devices
+ */
+#define VMM_DRIVER_NAME "vmm"
+
+#define VMM_CTL_MINOR_NODE "ctl"
+#define VMM_CTL_MINOR_NAME VMM_DRIVER_NAME VMM_CTL_NODE
+#define VMM_CTL_MINOR 0
+
+#define VMM_IOC_BASE (('V' << 16) | ('M' << 8))
+
+#define VMM_CREATE_VM (VMM_IOC_BASE | 0x01)
+#define VMM_DESTROY_VM (VMM_IOC_BASE | 0x02)
+
+struct vmm_ioctl {
+ char vmm_name[VM_MAX_NAMELEN];
+};
+
+#ifdef _KERNEL
+struct vmm_softc {
+ boolean_t open;
+ minor_t minor;
+ struct vm *vm;
+ char name[VM_MAX_NAMELEN];
+ SLIST_ENTRY(vmm_softc) link;
+};
+#endif
+
+/*
+ * VMM trace ring buffer constants
+ */
+#define VMM_DMSG_RING_SIZE 0x100000 /* 1MB */
+#define VMM_DMSG_BUF_SIZE 256
+
+/*
+ * VMM trace ring buffer content
+ */
+typedef struct vmm_trace_dmsg {
+ timespec_t timestamp;
+ char buf[VMM_DMSG_BUF_SIZE];
+ struct vmm_trace_dmsg *next;
+} vmm_trace_dmsg_t;
+
+/*
+ * VMM trace ring buffer header
+ */
+typedef struct vmm_trace_rbuf {
+ kmutex_t lock; /* lock to avoid clutter */
+ int looped; /* completed ring */
+ int allocfailed; /* dmsg mem alloc failed */
+ size_t size; /* current size */
+ size_t maxsize; /* max size */
+ vmm_trace_dmsg_t *dmsgh; /* messages head */
+ vmm_trace_dmsg_t *dmsgp; /* ptr to last message */
+} vmm_trace_rbuf_t;
+
+/*
+ * VMM trace ring buffer interfaces
+ */
+void vmm_trace_log(const char *fmt, ...);
+
+#endif /* _VMM_IMPL_H_ */
diff --git a/usr/src/uts/i86pc/sys/vmm_instruction_emul.h b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h
new file mode 100644
index 0000000000..8138890a2c
--- /dev/null
+++ b/usr/src/uts/i86pc/sys/vmm_instruction_emul.h
@@ -0,0 +1,126 @@
+/*-
+ * Copyright (c) 2012 NetApp, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY NETAPP, INC ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL NETAPP, INC OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD: head/sys/amd64/include/vmm_instruction_emul.h 276479 2014-12-31 20:31:32Z dim $
+ */
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * Copyright 2015 Pluribus Networks Inc.
+ */
+
+#ifndef _VMM_INSTRUCTION_EMUL_H_
+#define _VMM_INSTRUCTION_EMUL_H_
+
+#include <sys/mman.h>
+
+/*
+ * Callback functions to read and write memory regions.
+ */
+typedef int (*mem_region_read_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t *rval, int rsize, void *arg);
+
+typedef int (*mem_region_write_t)(void *vm, int cpuid, uint64_t gpa,
+ uint64_t wval, int wsize, void *arg);
+
+/*
+ * Emulate the decoded 'vie' instruction.
+ *
+ * The callbacks 'mrr' and 'mrw' emulate reads and writes to the memory region
+ * containing 'gpa'. 'mrarg' is an opaque argument that is passed into the
+ * callback functions.
+ *
+ * 'void *vm' should be 'struct vm *' when called from kernel context and
+ * 'struct vmctx *' when called from user context.
+ * s
+ */
+int vmm_emulate_instruction(void *vm, int cpuid, uint64_t gpa, struct vie *vie,
+ struct vm_guest_paging *paging, mem_region_read_t mrr,
+ mem_region_write_t mrw, void *mrarg);
+
+int vie_update_register(void *vm, int vcpuid, enum vm_reg_name reg,
+ uint64_t val, int size);
+
+/*
+ * Returns 1 if an alignment check exception should be injected and 0 otherwise.
+ */
+int vie_alignment_check(int cpl, int operand_size, uint64_t cr0,
+ uint64_t rflags, uint64_t gla);
+
+/* Returns 1 if the 'gla' is not canonical and 0 otherwise. */
+int vie_canonical_check(enum vm_cpu_mode cpu_mode, uint64_t gla);
+
+uint64_t vie_size2mask(int size);
+
+int vie_calculate_gla(enum vm_cpu_mode cpu_mode, enum vm_reg_name seg,
+ struct seg_desc *desc, uint64_t off, int length, int addrsize, int prot,
+ uint64_t *gla);
+
+#ifdef _KERNEL
+/*
+ * APIs to fetch and decode the instruction from nested page fault handler.
+ *
+ * 'vie' must be initialized before calling 'vmm_fetch_instruction()'
+ */
+int vmm_fetch_instruction(struct vm *vm, int cpuid,
+ struct vm_guest_paging *guest_paging,
+ uint64_t rip, int inst_length, struct vie *vie);
+
+/*
+ * Translate the guest linear address 'gla' to a guest physical address.
+ *
+ * Returns 0 on success and '*gpa' contains the result of the translation.
+ * Returns 1 if an exception was injected into the guest.
+ * Returns -1 otherwise.
+ */
+int vm_gla2gpa(struct vm *vm, int vcpuid, struct vm_guest_paging *paging,
+ uint64_t gla, int prot, uint64_t *gpa);
+
+void vie_init(struct vie *vie, const char *inst_bytes, int inst_length);
+
+/*
+ * Decode the instruction fetched into 'vie' so it can be emulated.
+ *
+ * 'gla' is the guest linear address provided by the hardware assist
+ * that caused the nested page table fault. It is used to verify that
+ * the software instruction decoding is in agreement with the hardware.
+ *
+ * Some hardware assists do not provide the 'gla' to the hypervisor.
+ * To skip the 'gla' verification for this or any other reason pass
+ * in VIE_INVALID_GLA instead.
+ */
+#define VIE_INVALID_GLA (1UL << 63) /* a non-canonical address */
+int vmm_decode_instruction(struct vm *vm, int cpuid, uint64_t gla,
+ enum vm_cpu_mode cpu_mode, int csd, struct vie *vie);
+#endif /* _KERNEL */
+
+#endif /* _VMM_INSTRUCTION_EMUL_H_ */
diff --git a/usr/src/uts/i86pc/viona/Makefile b/usr/src/uts/i86pc/viona/Makefile
new file mode 100644
index 0000000000..c2b8bd8dcf
--- /dev/null
+++ b/usr/src/uts/i86pc/viona/Makefile
@@ -0,0 +1,72 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = viona
+OBJECTS = $(VIONA_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(VIONA_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(USR_DRV_DIR)/$(MODULE)
+CONF_SRCDIR = $(UTSBASE)/i86pc/io/viona
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/i86pc/Makefile.i86pc
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY) $(SRC_CONFILE)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
+
+#
+# Overrides
+#
+CFLAGS += $(CCVERBOSE)
+LDFLAGS += -dy -Ndrv/dld -Nmisc/mac -Nmisc/dls -Ndrv/vmm
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/i86pc/Makefile.targ
diff --git a/usr/src/uts/i86pc/vmm/Makefile b/usr/src/uts/i86pc/vmm/Makefile
new file mode 100644
index 0000000000..b3ab735781
--- /dev/null
+++ b/usr/src/uts/i86pc/vmm/Makefile
@@ -0,0 +1,94 @@
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright 2013 Pluribus Networks Inc.
+#
+
+#
+# Path to the base of the uts directory tree (usually /usr/src/uts).
+#
+UTSBASE = ../..
+
+#
+# Define the module and object file sets.
+#
+MODULE = vmm
+OBJECTS = $(VMM_OBJS:%=$(OBJS_DIR)/%)
+LINTS = $(VMM_OBJS:%.o=$(LINTS_DIR)/%.ln)
+ROOTMODULE = $(USR_DRV_DIR)/$(MODULE)
+CONF_SRCDIR = $(UTSBASE)/i86pc/io/vmm
+
+#
+# Include common rules.
+#
+include $(UTSBASE)/i86pc/Makefile.i86pc
+
+#
+# Define targets
+#
+ALL_TARGET = $(BINARY)
+LINT_TARGET = $(MODULE).lint
+INSTALL_TARGET = $(BINARY) $(ROOTMODULE) $(ROOT_CONFFILE)
+
+#
+# Overrides and additions
+#
+
+# These sources only compile with gcc. Workaround a confluence of cruft
+# regarding dmake and shadow compilation by neutering the sun compiler.
+amd64_CC = $(ONBLD_TOOLS)/bin/$(MACH)/cw -_gcc
+CFLAGS += -_cc=-xdryrun
+
+ALL_BUILDS = $(ALL_BUILDSONLY64)
+DEF_BUILDS = $(DEF_BUILDSONLY64)
+PRE_INC_PATH = -I$(COMPAT)/freebsd -I$(COMPAT)/freebsd/amd64 \
+ -I$(CONTRIB)/freebsd -I$(CONTRIB)/freebsd/amd64
+INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(UTSBASE)/i86pc/io/vmm/io
+AS_INC_PATH += -I$(UTSBASE)/i86pc/io/vmm -I$(OBJS_DIR)
+
+CFLAGS += -_gcc=-Wimplicit-function-declaration
+
+OFFSETS_SRC = $(CONF_SRCDIR)/offsets.in
+ASSYM_H = $(OBJS_DIR)/vmx_assym.h
+
+CLEANFILES += $(ASSYM_H)
+
+#
+# Default build targets.
+#
+.KEEP_STATE:
+
+def: $(DEF_DEPS)
+
+all: $(ALL_DEPS)
+
+clean: $(CLEAN_DEPS)
+
+clobber: $(CLOBBER_DEPS)
+
+lint: $(LINT_DEPS)
+
+modlintlib: $(MODLINTLIB_DEPS)
+
+clean.lint: $(CLEAN_LINT_DEPS)
+
+install: $(INSTALL_DEPS)
+
+#
+# Include common targets.
+#
+include $(UTSBASE)/i86pc/Makefile.targ
+
+$(OBJECTS): $(ASSYM_H)
+
+$(ASSYM_H): $(OFFSETS_SRC) $(GENASSYM)
+ $(OFFSETS_CREATE) -I../../i86pc/io/vmm < $(OFFSETS_SRC) >$@